Class RDig::ContentExtractors::WordContentExtractor
In: lib/rdig/content_extractors/doc.rb
Parent: ContentExtractor

Extracts text from Microsoft Word documents.

Requires the wvHtml utility (on Debian and its derivatives, run ‘apt-get install wv’).

Methods

new   process  

Included Modules

ExternalAppHelper

Public Class methods

[Source]

    # File lib/rdig/content_extractors/doc.rb, line 11
# Builds a Word-document extractor.
#
# Hands +config+ to the parent initializer, records the name of the external
# converter command and the MIME-type pattern, prepares an HTML extractor for
# the converter's output, and probes whether the converter can be run.
#
# config:: passed through unchanged to the superclass initializer
def initialize(config)
  super(config)
  @wvhtml  = 'wvHtml'
  @pattern = /^application\/msword/

  # wvHtml produces HTML, so its output is parsed by an HTML extractor that
  # takes the content from <body> and the title from <head><title>.
  select_body  = lambda { |tagsoup| tagsoup.html.body }
  select_title = lambda { |tagsoup| tagsoup.html.head.title }
  soup_options = OpenStruct.new(
    :content_tag_selector => select_body,
    :title_tag_selector   => select_title
  )
  @html_extractor = RubyfulSoupContentExtractor.new(
    OpenStruct.new(:rubyful_soup => soup_options)
  )

  # Availability probe: run the converter's help command (stderr merged into
  # stdout) and look for the author's name in the output. The result is the
  # match offset when found, nil otherwise.
  # TODO: better: if $?.exitstatus == 127 (not found)
  help_output = %x{#{@wvhtml} -h 2>&1}
  @available = help_output =~ /Dom Lachowicz/
end

Public Instance methods

[Source]

    # File lib/rdig/content_extractors/doc.rb, line 30
# Converts Word document +content+ to HTML with the external converter and
# extracts text from that HTML.
#
# content:: the raw document data, handed to +as_file+ (defined outside this
#           listing; presumably supplied by ExternalAppHelper -- verify)
#
# Returns whatever the HTML extractor returns, or an empty Hash when that
# result is nil/false or when +as_file+ never yields.
def process(content)
  extracted = {}
  as_file(content) do |doc_file|
    # "-" as the last argument is passed straight to the converter;
    # its output is captured from stdout.
    html = %x{#{@wvhtml} --charset=UTF-8 '#{doc_file.path}' -}
    extracted = @html_extractor.process(html)
  end
  extracted || {}
end

[Validate]