| Class | RDig::Crawler |
| In: |
lib/rdig/crawler.rb
|
| Parent: | Object |
# File lib/rdig/crawler.rb, line 6
# Creates a crawler with an empty queue of documents awaiting processing and
# the ETag filter that process_document consults before indexing.
def initialize
  @documents, @etag_filter = Queue.new, ETagFilter.new
end
Pipes a new document pointing to url through the filter chain; if it survives that, it gets added to the documents queue for further processing.
# File lib/rdig/crawler.rb, line 65
# Pipes a new document pointing to +url+ through the filter chain; if it
# survives that, it is appended to the documents queue for further processing.
#
# url::                the URL to enqueue; nil or empty values are ignored.
# filterchain::        object whose +apply+ returns the document to keep, or a
#                      falsy value to drop it.
# referring_document:: optional document the URL was found in. Its URI is
#                      handed to Document.create only when its scheme starts
#                      with http/https.
#
# Always returns nil. Errors raised while creating or filtering the document
# are swallowed on purpose so a single bad link cannot abort the caller; they
# are reported on stdout when RDig::config.verbose is set.
def add_url(url, filterchain, referring_document = nil)
  return if url.nil? || url.empty?

  doc = if referring_document && referring_document.uri.scheme =~ /\Ahttps?/i
          Document.create(url, referring_document.uri)
        else
          Document.create(url)
        end

  doc = filterchain.apply(doc)
  return unless doc

  @documents << doc
  puts "added url #{url}" if RDig::config.verbose
  nil
rescue => e
  # Best effort: skip the URL, but leave a trace for anyone debugging a crawl.
  puts "error adding url #{url}: #{e}" if RDig::config.verbose
  nil
end
# File lib/rdig/crawler.rb, line 47
# Fetches +doc+, queues every link found in it through add_url (with +doc+ as
# the referring document), and appends the document to the indexer when the
# ETag filter accepts it and the document reports that it needs indexing.
#
# doc::         document responding to +fetch+, +content+, +uri+ and
#               +needs_indexing?+.
# filterchain:: filter chain passed through to add_url for each link.
#
# Any StandardError raised along the way is reported on stdout (with a
# backtrace when RDig::config.verbose is set) instead of being propagated.
def process_document(doc, filterchain)
  doc.fetch

  # add links from this document to the queue
  links = doc.content[:links]
  unless links.nil?
    links.each { |url| add_url(url, filterchain, doc) }
  end

  # links are queued even if the ETag filter rejects the document itself
  return unless @etag_filter.apply(doc)
  @indexer << doc if doc.needs_indexing?
rescue => e
  puts "error processing document #{doc.uri}: #{e}"
  puts "Trace: #{e.backtrace.join("\n")}" if RDig::config.verbose
end
# File lib/rdig/crawler.rb, line 11
# Runs a complete crawl.
#
# Opens the indexer, seeds the documents queue with the configured start URLs,
# starts RDig.config.crawler.num_threads fetcher threads that pop documents
# off the queue and pass them to process_document, waits until the work is
# done, then tells the threads to exit and waits for them.
#
# Raises RuntimeError when no start URLs are configured. The indexer is closed
# in all cases once it has been created.
def run
  raise 'no start urls given!' if RDig.config.crawler.start_urls.empty?
  @indexer = Index::Indexer.new(RDig.config.index)

  # check whether we are indexing on-disk or via http
  url_type = RDig.config.crawler.start_urls.first =~ /^file:\/\// ? :file : :http
  chain_config = RDig.filter_chain[url_type]

  seed_chain = UrlFilters::FilterChain.new(chain_config)
  RDig.config.crawler.start_urls.each { |url| add_url(url, seed_chain) }

  num_threads = RDig.config.crawler.num_threads
  group = ThreadsWait.new
  workers = []
  num_threads.times do |i|
    worker = Thread.new("fetcher #{i}") do
      # Must be a name that exists only inside this block: assigning to a local
      # of the enclosing method would make all fetchers share (and overwrite)
      # one filter chain variable.
      worker_chain = UrlFilters::FilterChain.new(chain_config)
      while (doc = @documents.pop) != :exit
        process_document doc, worker_chain
      end
    end
    workers << worker
    group.join_nowait worker
  end

  # check for an empty queue every now and then
  sleep_interval = RDig.config.crawler.wait_before_leave
  loop do
    sleep sleep_interval
    # An empty queue alone is not enough: a fetcher that is still busy may be
    # about to enqueue more URLs. Only stop once every live fetcher is blocked
    # in pop, i.e. nobody is left who could add work.
    idle = @documents.num_waiting >= workers.count { |t| t.alive? }
    break if @documents.empty? && idle
  end
  # nothing to do any more, tell the threads to exit
  num_threads.times { @documents << :exit }

  puts "waiting for threads to finish..."
  group.all_waits
ensure
  @indexer.close if @indexer
end