Module RDig
In: lib/rdig.rb
lib/rdig/documents.rb
lib/rdig/content_extractors.rb
lib/rdig/search.rb
lib/rdig/highlight.rb
lib/rdig/index.rb
lib/rdig/url_filters.rb
lib/rdig/crawler.rb
lib/rdig/content_extractors/rubyful_soup.rb
lib/rdig/content_extractors/hpricot.rb
lib/rdig/content_extractors/doc.rb
lib/rdig/content_extractors/pdf.rb

See README for basic usage information

Methods

Classes and Modules

Module RDig::ContentExtractors
Module RDig::Index
Module RDig::Search
Module RDig::UrlFilters
Class RDig::Application
Class RDig::Crawler
Class RDig::Document
Class RDig::ETagFilter
Class RDig::FileDocument
Class RDig::HttpDocument

External Aliases

configuration -> config

Public Class methods

[Source]

    # File lib/rdig.rb, line 87
    # Returns the Application instance. It is created with Application.new on
    # the first call and cached in @application, so later calls return the
    # same object.
87:     def application
88:       @application ||= Application.new
89:     end

RDig configuration.

May be called with a block, which is yielded the configuration object:

  RDig.configuration do |config| ...

See doc/examples/config.rb for a commented example configuration.

[Source]

     # File lib/rdig.rb, line 101
     # Accessor for the RDig configuration object.
     #
     # With a block: calls itself again without a block to obtain the
     # configuration object and yields that object to the caller's block.
     #
     # Without a block: returns @config. On the first such call @config is
     # built as a nested OpenStruct holding the defaults below, grouped under
     # :crawler, :content_extraction and :index; later calls reuse it.
101:     def configuration
102:       if block_given?
103:         yield configuration
104:       else
105:         @config ||= OpenStruct.new(
106:           :crawler           => OpenStruct.new(
107:             :start_urls        => [ "http://localhost:3000/" ],
108:             :include_hosts     => [ "localhost" ],
109:             :include_documents => nil,
110:             :exclude_documents => nil,
111:             :index_document    => nil,
112:             :num_threads       => 2,
113:             :max_redirects     => 5,
114:             :wait_before_leave => 10
115:           ),
116:           :content_extraction  => OpenStruct.new(
117:             # settings for html content extraction (hpricot)
118:             :hpricot      => OpenStruct.new(
119:               # css selector for the element containing the page title
120:               :title_tag_selector => 'title', 
121:               # might also be a proc returning either an element or a string:
122:               # :title_tag_selector => lambda { |hpricot_doc| ... }
123:               :content_tag_selector => 'body'
124:               # might also be a proc returning either an element or a string:
125:               # :content_tag_selector => lambda { |hpricot_doc| ... }
126:             )
127:             #,
128:             # # settings for html content extraction (RubyfulSoup)
129:             # :rubyful_soup => OpenStruct.new(
130:             #  # select the html element that contains the content to index
131:             #  # by default, we index all inside the body tag:
132:             #  :content_tag_selector => lambda { |tagsoup|
133:             #    tagsoup.html.body
134:             #  },
135:             #  # select the html element containing the title 
136:             #  :title_tag_selector         => lambda { |tagsoup|
137:             #    tagsoup.html.head.title
138:             #  }
139:             # )
140:           ),
141:           :index                 => OpenStruct.new( 
142:             :path                => "index/", 
143:             :create              => true,
144:             :handle_parse_errors => true,
145:             :analyzer            => Ferret::Analysis::StandardAnalyzer.new,
146:             :occur_default       => :must,
147:             :default_field       => '*'
148:           )
149:         )
150:       end
151:     end

The filter chains limit the set of indexed documents. There are two chain types: one for HTTP crawling and one for file system crawling. A document has to survive all filters in its chain to get indexed.

[Source]

    # File lib/rdig.rb, line 65
    # Returns the hash of filter chains, built on first call and cached in
    # @filter_chain. The hash has two keys, :http and :file, each mapping to
    # an ordered array of filter entries.
    #
    # An entry is a symbol, a filter class, or a single-pair hash whose key
    # is a symbol or filter class and whose value is a symbol.
    # NOTE(review): the hash values (:include_hosts, :include_documents,
    # :exclude_documents) match crawler setting names in the default
    # configuration, so they presumably name the setting passed to the
    # filter - confirm against the code that consumes this chain.
65:     def filter_chain
66:       @filter_chain ||= {
67:         # filter chain for http crawling
68:         :http => [
69:           :scheme_filter_http,
70:           :fix_relative_uri,
71:           :normalize_uri,
72:           { :hostname_filter => :include_hosts },
73:           { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
74:           { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
75:           RDig::UrlFilters::VisitedUrlFilter 
76:         ],
77:         # filter chain for file system crawling
78:         :file => [
79:           :scheme_filter_file,
80:           { RDig::UrlFilters::PathInclusionFilter => :include_documents },
81:           { RDig::UrlFilters::PathExclusionFilter => :exclude_documents }
82:         ]
83:       }
84:          
85:     end

[Source]

    # File lib/rdig.rb, line 91
    # Returns the Search::Searcher instance. It is created on the first call
    # from config.index (config is the alias of configuration) and cached in
    # @searcher for later calls.
91:     def searcher
92:       @searcher ||= Search::Searcher.new(config.index)
93:     end

[Validate]