| Class | RDig::ContentExtractors::WordContentExtractor |
| In: | lib/rdig/content_extractors/doc.rb |
| Parent: | ContentExtractor |
Extracts text from Word documents.
Requires the wvHtml utility (on Debian and friends, run ‘apt-get install wv’).
# File lib/rdig/content_extractors/doc.rb, line 11
# Sets up the extractor for MS Word documents.
#
# config is handed to the parent class initializer unchanged.
def initialize(config)
  super(config)
  @wvhtml  = 'wvHtml'
  @pattern = /^application\/msword/

  # The output of wvHtml is parsed by a nested HTML extractor. Its selectors
  # pick html.body as the content and html.head.title as the title.
  body_selector  = lambda { |soup| soup.html.body }
  title_selector = lambda { |soup| soup.html.head.title }
  soup_options = OpenStruct.new(
    :content_tag_selector => body_selector,
    :title_tag_selector   => title_selector
  )
  @html_extractor = RubyfulSoupContentExtractor.new(
    OpenStruct.new(:rubyful_soup => soup_options)
  )

  # TODO: better: if $?.exitstatus == 127 (not found)
  # Run the tool with -h (stderr folded into stdout) and look for the text
  # 'Dom Lachowicz' in whatever it prints. @available ends up as the match
  # offset (truthy) or nil when the text is absent.
  help_output = %x{#{@wvhtml} -h 2>&1}
  @available = help_output =~ /Dom Lachowicz/
end