| Class | RDig::ContentExtractors::HpricotContentExtractor |
| In: |
lib/rdig/content_extractors/hpricot.rb
|
| Parent: | ContentExtractor |
Extracts title, content and links from HTML documents using the Hpricot library.
# File lib/rdig/content_extractors/hpricot.rb, line 14
# Set up the extractor from the hpricot section of the given configuration.
#
# config - configuration object responding to +hpricot+; that value is
#          passed on to the parent class initializer.
#
# The content type pattern is only assigned when an hpricot configuration
# is present, so an unconfigured extractor refuses to handle any content.
def initialize(config)
  super(config.hpricot)
  return unless config.hpricot

  # Accepted content types: text/html, text/xml, application/xhtml+xml, application/xml
  @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
end
Retrieve the root element to extract document content from
# File lib/rdig/content_extractors/hpricot.rb, line 75
# Retrieve the root element to extract document content from.
#
# doc - parsed document; must respond to +at+.
#
# Returns whatever the :content_tag_selector configuration entry yields for
# the document, or the element found by doc.at('body') when that lookup
# returns nil or false.
def content_element(doc)
  configured = tag_from_config(doc, :content_tag_selector)
  return configured if configured

  doc.at('body')
end
Extracts textual content from the HTML tree.
First, the root element to use is determined using the content_element method, which itself uses the content_tag_selector from RDig.configuration.
Then, all textual content contained in the root element and all its children is extracted.
# File lib/rdig/content_extractors/hpricot.rb, line 41
# Extract the textual content below the document's content root element.
#
# doc - parsed document handed to content_element.
#
# The root element's inner HTML has comments and tags removed and the
# result is stripped of leading and trailing whitespace. Returns an empty
# string when no root element is found.
def extract_content(doc)
  root = content_element(doc)
  # An earlier per-element variant (walking h1-h6, p, li, dt, dd, td, address
  # and option elements with extract_text) is disabled in favour of stripping
  # the markup from the root element's inner HTML.
  text = root ? strip_tags(strip_comments(root.inner_html)) : ''
  text.strip
end
Extracts the href attributes of all a tags, except internal links like <a href="#top">.
# File lib/rdig/content_extractors/hpricot.rb, line 52
# Collect the link targets of all a elements in the document.
#
# doc - parsed document; doc/'a' must yield the anchor elements, each of
#       which is indexed with 'href'.
#
# Anchors without an href and hrefs starting with '#' are skipped; the
# remaining values are returned with their HTML entities unescaped.
def extract_links(doc)
  targets = (doc/'a').map { |anchor| anchor['href'] }
  wanted = targets.select { |target| target && target !~ /^#/ }
  wanted.map { |target| CGI.unescapeHTML(target) }
end
Extracts the title from the given html tree
# File lib/rdig/content_extractors/hpricot.rb, line 60
# Extract the title from the given HTML tree.
#
# doc - parsed document handed to title_tag.
#
# When the value returned by title_tag responds to +inner_html+, its inner
# HTML is returned with all tags stripped; any other value is returned
# unchanged.
def extract_title(doc)
  candidate = title_tag(doc)
  if candidate.respond_to?(:inner_html)
    strip_tags(candidate.inner_html)
  else
    candidate
  end
end
Returns: { :content => 'extracted clear text',
:title => 'Title', :links => [array of urls] }
# File lib/rdig/content_extractors/hpricot.rb, line 24
# Parse the given markup and extract title, links and textual content.
#
# content - raw document source passed to Hpricot().
#
# Returns a hash with the keys :title, :links and :content. Title and
# content have decode_entities applied to them.
def process(content)
  doc = Hpricot(content)
  title = extract_title(doc).decode_entities
  links = extract_links(doc)
  text = extract_content(doc).decode_entities
  { :title => title, :links => links, :content => text }
end
Return the given string minus all html comments
# File lib/rdig/content_extractors/hpricot.rb, line 85
# Return the given string minus all HTML comments.
#
# string - HTML fragment; it is not modified.
#
# Comments may span several lines. Returns a new string.
def strip_comments(string)
  # A regexp literal replaces Regexp.new(pattern, flags, 'u'): the third
  # (kcode) argument is a Ruby 1.8 leftover that current Rubies no longer accept.
  string.gsub(/<!--.*?-->/m, '')
end
# File lib/rdig/content_extractors/hpricot.rb, line 88
# Return the given string with all markup removed.
#
# string - HTML fragment; it is left unmodified, so frozen strings are fine.
#
# script and style elements are dropped together with their bodies (tag
# names are matched case-insensitively), every remaining tag is removed and
# each run of whitespace is collapsed into a single space. Returns a new
# string.
def strip_tags(string)
  # Regexp literals replace Regexp.new(pattern, flags, 'u'): the third (kcode)
  # argument is a Ruby 1.8 leftover that current Rubies no longer accept.
  without_scripts = string.gsub(/<(script|style).*?>.*?<\/(script|style).*?>/mi, '')
  without_tags = without_scripts.gsub(/<.+?>/m, '')
  without_tags.gsub(/\s+/, ' ')
end
# File lib/rdig/content_extractors/hpricot.rb, line 79
# Look up an element of the document as described by a configuration entry.
#
# doc        - parsed document.
# config_key - name of the @config accessor holding the entry.
#
# A String entry is applied to the document with the / operator; any other
# non-nil entry is invoked through +call+ with the document as its argument.
# Returns nil when the entry is not set.
def tag_from_config(doc, config_key)
  cfg = @config.send(config_key)
  return nil unless cfg
  return (doc/cfg) if cfg.is_a?(String)

  cfg.call(doc)
end