See README for basic usage information
| configuration | -> | config |
RDig configuration.
May be used with a block:
RDig.configuration do |config| ...
See doc/examples/config.rb for a commented example configuration.
# File lib/rdig.rb, line 101
# Returns the RDig configuration object, building it with the default
# settings below on first access and memoizing it in @config.
#
# When called with a block, the configuration object is yielded to the
# block and the block's result is returned:
#
#   RDig.configuration do |config| ... end
#
# Without a block the configuration object itself is returned.
def configuration
  if block_given?
    # Re-enter without a block to obtain (and lazily create) the config.
    yield configuration
  else
    @config ||= OpenStruct.new(
      :crawler => OpenStruct.new(
        :start_urls        => [ "http://localhost:3000/" ],
        :include_hosts     => [ "localhost" ],
        :include_documents => nil,
        :exclude_documents => nil,
        :index_document    => nil,
        :num_threads       => 2,
        :max_redirects     => 5,
        :wait_before_leave => 10
      ),
      :content_extraction => OpenStruct.new(
        # settings for html content extraction (hpricot)
        :hpricot => OpenStruct.new(
          # css selector for the element containing the page title
          :title_tag_selector => 'title',
          # might also be a proc returning either an element or a string:
          # :title_tag_selector => lambda { |hpricot_doc| ... }
          :content_tag_selector => 'body'
          # might also be a proc returning either an element or a string:
          # :content_tag_selector => lambda { |hpricot_doc| ... }
        )
        #,
        # # settings for html content extraction (RubyfulSoup)
        # :rubyful_soup => OpenStruct.new(
        #   # select the html element that contains the content to index
        #   # by default, we index all inside the body tag:
        #   :content_tag_selector => lambda { |tagsoup|
        #     tagsoup.html.body
        #   },
        #   # select the html element containing the title
        #   :title_tag_selector => lambda { |tagsoup|
        #     tagsoup.html.head.title
        #   }
        # )
      ),
      :index => OpenStruct.new(
        :path                => "index/",
        :create              => true,
        :handle_parse_errors => true,
        :analyzer            => Ferret::Analysis::StandardAnalyzer.new,
        :occur_default       => :must,
        :default_field       => '*'
      )
    )
  end
end
The filter chains are for limiting the set of indexed documents. There are two chain types: one for HTTP crawling and one for file system crawling. A document has to survive all filters in the chain to get indexed.
# File lib/rdig.rb, line 65
# Returns the filter chains, keyed by crawl type (:http and :file).
# The hash is built on first access and memoized in @filter_chain, so
# every call returns the same object.
#
# Each chain is an ordered array whose entries are either a bare filter
# (a symbol or a filter class) or a single-pair hash mapping a filter to
# a symbol.
# NOTE(review): the symbol presumably names the crawler configuration
# option handed to that filter -- confirm against the code that
# consumes the chain.
def filter_chain
  @filter_chain ||= {
    # filter chain for http crawling
    :http => [
      :scheme_filter_http,
      :fix_relative_uri,
      :normalize_uri,
      { :hostname_filter => :include_hosts },
      { RDig::UrlFilters::UrlInclusionFilter => :include_documents },
      { RDig::UrlFilters::UrlExclusionFilter => :exclude_documents },
      RDig::UrlFilters::VisitedUrlFilter
    ],
    # filter chain for file system crawling
    :file => [
      :scheme_filter_file,
      { RDig::UrlFilters::PathInclusionFilter => :include_documents },
      { RDig::UrlFilters::PathExclusionFilter => :exclude_documents }
    ]
  }
end