| Class | RDig::ContentExtractors::RubyfulSoupContentExtractor |
| In: |
lib/rdig/content_extractors/rubyful_soup.rb
|
| Parent: | ContentExtractor |
Extracts title, content and links from HTML documents.
# File lib/rdig/content_extractors/rubyful_soup.rb, line 37
# Sets up the extractor from the +rubyful_soup+ section of the given
# configuration; that section is what gets handed to the parent class.
#
# @pattern is only assigned when the section is present. If it is not
# configured, this extractor refuses to handle any content.
def initialize(config)
  super(config.rubyful_soup)
  return unless config.rubyful_soup

  @pattern = /^(text\/(html|xml)|application\/(xhtml\+xml|xml))/
end
Retrieve the root element to extract document content from
# File lib/rdig/content_extractors/rubyful_soup.rb, line 132
# Retrieves the root element that document content is extracted from.
#
# When a content_tag_selector is configured, it is called with the parsed
# document and its result is returned unchanged. Otherwise the body element
# below the document's html element is used.
#
# Returns nil (instead of raising NoMethodError) when the html lookup yields
# nil, e.g. for a fragment without an <html> element; extract_content
# already treats a nil root as "no content".
def content_element(tagsoup)
  selector = @config.content_tag_selector
  return selector.call(tagsoup) if selector

  html = tagsoup.html
  html && html.body
end
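Extracts textual content from the HTML tree.
First, the root element to use is determined using the content_element method, which itself uses the content_tag_selector from RDig.configuration.
Then, the children of this element are processed by extract_text, which gives all textual content contained in the root element and all its children.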
# File lib/rdig/content_extractors/rubyful_soup.rb, line 64
# Gathers the text of the document's content root into a single string.
#
# The root is looked up via content_element; each of its children is handed
# to extract_text, which appends to the shared buffer. A nil root yields an
# empty string. The result has leading and trailing whitespace removed.
def extract_content(tag_soup)
  text = ''
  root = content_element(tag_soup)
  unless root.nil?
    root.children { |node| extract_text(node, text) }
  end
  text.strip
end
Extracts the href attributes of all a tags, except internal links like <a href="#top">.
# File lib/rdig/content_extractors/rubyful_soup.rb, line 75
# Extracts the link targets of all <a> tags in the document.
#
# Tags without an href attribute and page-internal links (href starting
# with '#') are skipped. HTML entities in the remaining values are decoded
# with CGI.unescapeHTML.
#
# Returns an array of strings.
def extract_links(tagsoup)
  hrefs = tagsoup.find_all('a').map do |link|
    href = link['href']
    # \A anchors at the start of the whole value; ^ would also match after
    # an embedded newline and wrongly drop such a link as "internal".
    CGI.unescapeHTML(href) if href && href !~ /\A#/
  end
  hrefs.compact
end
Recursively extracts all text contained in the given element, and appends it to content.
# File lib/rdig/content_extractors/rubyful_soup.rb, line 95
# Recursively appends all text found in +element+ to +content+.
#
# * nil elements are ignored.
# * A NavigableString has its HTML comments removed (strip_comments) and is
#   stripped of surrounding whitespace before being appended.
# * A tag that exposes a content string contributes that string, stripped;
#   script and style tags are skipped so inline code never ends up in the
#   text.
# * Any other tag is descended into, child by child.
#
# Every non-empty piece of text is followed by a single space.
def extract_text(element, content='')
  return nil if element.nil?

  if element.is_a? NavigableString
    text = strip_comments(element)
    text.strip!
    content << text << ' ' unless text.empty?
  elsif element.string # a tag carrying a content string
    # skip inline scripts and styles
    return nil if element.name =~ /^(script|style)$/i

    text = element.string.strip
    content << text << ' ' unless text.empty?
  else
    element.children { |node| extract_text(node, content) }
  end
end
Extracts the title from the given html tree
# File lib/rdig/content_extractors/rubyful_soup.rb, line 82
# Determines the document title.
#
# title_tag may hand back a ready-made String (which is returned as is) or
# an element; in the latter case the element's text is collected with
# extract_text and returned without surrounding whitespace.
def extract_title(tagsoup)
  selected = title_tag(tagsoup)
  return selected if selected.is_a? String

  buffer = ''
  extract_text(selected, buffer)
  buffer.strip
end
returns: { :content => 'extracted clear text',
:title => 'Title',
:links => [array of urls] }
# File lib/rdig/content_extractors/rubyful_soup.rb, line 47
# Parses +content+ with BeautifulSoup and extracts title, links and text.
#
# Returns a hash with the keys :title, :links and :content, filled by
# extract_title, extract_links and extract_content (called in that order).
def process(content)
  tag_soup = BeautifulSoup.new(content)
  {
    :title   => extract_title(tag_soup),
    :links   => extract_links(tag_soup),
    :content => extract_content(tag_soup)
  }
end
Return the given string minus all html comments
# File lib/rdig/content_extractors/rubyful_soup.rb, line 141
# Returns a copy of +string+ with all HTML comments (<!-- ... -->) removed.
#
# The /m flag lets a comment span several lines; the non-greedy .*? stops
# at the first closing --> so text between two comments is kept.
#
# A regexp literal is used instead of Regexp.new(pattern, options, 'u'):
# the third ("kcode") argument is ignored since Ruby 1.9 and rejected with
# an ArgumentError from Ruby 3.3 on.
def strip_comments(string)
  string.gsub(/<!--.*?-->/m, '')
end
Returns the element to extract the title from.
This may return a string, e.g. an attribute value selected from a meta tag, too.
# File lib/rdig/content_extractors/rubyful_soup.rb, line 123
# Returns the element to extract the title from.
#
# When a title_tag_selector is configured, it is called with the parsed
# document and its result is returned unchanged; this may be a String, e.g.
# an attribute value selected from a meta tag. Otherwise the title element
# below html/head is used.
#
# Returns nil (instead of raising NoMethodError) when the html or head
# lookup yields nil; extract_text ignores nil elements, so such documents
# simply end up with an empty title.
def title_tag(tagsoup)
  selector = @config.title_tag_selector
  return selector.call(tagsoup) if selector

  html = tagsoup.html
  head = html && html.head
  head && head.title
end