Class RDig::Crawler
In: lib/rdig/crawler.rb
Parent: Object

Methods

add_url   new   process_document   run  

Public Class methods

[Source]

   # File lib/rdig/crawler.rb, line 6
# Creates a crawler with an empty document queue and a fresh ETag filter.
# The queue is what add_url fills and what the fetcher threads started by
# run consume; the ETag filter is consulted by process_document.
def initialize
  @documents, @etag_filter = Queue.new, ETagFilter.new
end

Public Instance methods

Pipes a new document pointing to url through the filter chain. If the document survives the chain, it is added to the documents queue for further processing.

[Source]

    # File lib/rdig/crawler.rb, line 65
# Pipes a new document pointing to +url+ through the filter chain. If the
# document survives the chain, it is appended to the documents queue for
# further processing.
#
# url::                the URL to enqueue; +nil+ and empty values are ignored
# filterchain::        object responding to +apply(doc)+; it returns the
#                      (possibly modified) document, or a falsy value to
#                      reject it
# referring_document:: optional document the URL was found in
#
# Always returns +nil+. Adding a URL is best-effort: any StandardError raised
# while creating or filtering the document drops that URL instead of
# propagating, and is reported only when verbose output is enabled.
def add_url(url, filterchain, referring_document = nil)
  return if url.nil? || url.empty?

  # The referrer's URI is handed to Document.create only when the referring
  # document itself has an http or https URI.
  doc = if referring_document && referring_document.uri.scheme =~ /\Ahttps?\z/i
          Document.create(url, referring_document.uri)
        else
          Document.create(url)
        end

  doc = filterchain.apply(doc)
  return unless doc

  @documents << doc
  puts "added url #{url}" if RDig.config.verbose
  nil
rescue StandardError => e
  # Keep crawling, but do not hide the reason a URL was dropped.
  puts "error adding url #{url}: #{e}" if RDig.config.verbose
  nil
end

[Source]

    # File lib/rdig/crawler.rb, line 47
# Fetches +doc+, queues every link found in it and hands the document to the
# indexer.
#
# doc::         the document to fetch and process
# filterchain:: the URL filter chain passed on to add_url for each link
#
# The links are queued before the ETag filter is consulted. The document is
# pushed to the indexer only if the ETag filter accepts it and the document
# reports that it needs indexing. Any StandardError is printed rather than
# raised; the backtrace is printed as well in verbose mode.
def process_document(doc, filterchain)
  doc.fetch

  # add links from this document to the queue
  unless doc.content[:links].nil?
    doc.content[:links].each { |link| add_url(link, filterchain, doc) }
  end

  @indexer << doc if @etag_filter.apply(doc) && doc.needs_indexing?
rescue => error
  puts "error processing document #{doc.uri.to_s}: #{error}"
  puts "Trace: #{error.backtrace.join("\n")}" if RDig::config.verbose
end

[Source]

    # File lib/rdig/crawler.rb, line 11
# Runs a complete crawl: seeds the queue with the configured start URLs,
# starts the configured number of fetcher threads, waits until the crawl has
# drained and finally closes the indexer.
#
# Raises RuntimeError ('no start urls given!') when no start URLs are
# configured. The indexer is closed in every case, including on errors.
def run
  crawler_config = RDig.config.crawler
  start_urls = crawler_config.start_urls
  raise 'no start urls given!' if start_urls.empty?

  @indexer = Index::Indexer.new(RDig.config.index)

  # check whether we are indexing on-disk or via http
  url_type = start_urls.first =~ %r{\Afile://} ? :file : :http
  chain_config = RDig.filter_chain[url_type]

  seed_chain = UrlFilters::FilterChain.new(chain_config)
  start_urls.each { |url| add_url(url, seed_chain) }

  workers = Array.new(crawler_config.num_threads) do
    Thread.new do
      # A local that exists only inside this block, so every fetcher really
      # gets its own filter chain. Assigning to a local of the enclosing
      # method here would rebind one variable shared by all threads.
      worker_chain = UrlFilters::FilterChain.new(chain_config)
      while (doc = @documents.pop) != :exit
        process_document doc, worker_chain
      end
    end
  end

  # check for a drained crawl every now and then
  sleep_interval = crawler_config.wait_before_leave
  loop do
    sleep sleep_interval
    alive = workers.count(&:alive?)
    # No fetcher left: nothing can consume the queue any more.
    break if alive.zero?
    # An empty queue alone is not enough: a fetcher that is still processing
    # a document may be about to queue more links. Only stop once every live
    # fetcher is blocked waiting for work. A very small window remains in
    # which a fetcher has just been woken but is still counted as waiting.
    break if @documents.empty? && @documents.num_waiting >= alive
  end

  # nothing to do any more, tell the threads to exit
  workers.size.times { @documents << :exit }

  puts "waiting for threads to finish..."
  workers.each do |worker|
    begin
      worker.join
    rescue StandardError => e
      # Thread#join re-raises the error that killed the thread; report it
      # and keep waiting for the remaining fetchers.
      puts "fetcher thread terminated abnormally: #{e}"
    end
  end
ensure
  @indexer.close if @indexer
end

[Validate]