Class RDig::ContentExtractors::PdfContentExtractor
In: lib/rdig/content_extractors/pdf.rb
Parent: ContentExtractor

Extract text from pdf content.

Requires the pdftotext and pdfinfo utilities from the xpdf-utils package (on debian and friends do ‘apt-get install xpdf-utils’)

Methods

get_content   get_title   new   process  

Included Modules

ExternalAppHelper

Public Class methods

[Source]

    # File lib/rdig/content_extractors/pdf.rb, line 12
# Sets up the PDF extractor and probes for the external tools it shells
# out to.
#
# config:: extractor configuration, handed unchanged to the parent class
#          initializer.
#
# @pattern matches strings that begin with "application/pdf" (presumably
# compared against a document's content type by the parent class -- confirm
# there).
#
# @available ends up true only if both +pdftotext+ and +pdfinfo+ answer a
# <tt>-h</tt> invocation with output containing "Copyright 1996".
def initialize(config)
  super(config)
  @pattern   = /^application\/pdf/
  @pdftotext = 'pdftotext'
  @pdfinfo   = 'pdfinfo'
  # all? stops at the first tool whose help output does not match, so the
  # remaining tools are not probed once one is found to be missing.
  @available = [@pdftotext, @pdfinfo].all? do |tool|
    %x{#{tool} -h 2>&1} =~ /Copyright 1996/
  end
end

Public Instance methods

[Source]

    # File lib/rdig/content_extractors/pdf.rb, line 35
# Runs pdftotext on the given file and returns the extracted text.
#
# path_to_tempfile:: path of the PDF file to convert.
#
# Returns whatever the command writes to standard output. The command is
# invoked with "-enc UTF-8" and with "-" as the output file argument.
#
# The path is shell-escaped instead of being wrapped in literal single
# quotes, so a path containing a quote or another shell metacharacter can
# neither break the command line nor smuggle in extra shell commands.
def get_content(path_to_tempfile)
  # Required here because the file's top-level require list is not visible
  # from this method; repeated calls to require are no-ops.
  require 'shellwords'
  %x{#{@pdftotext} -enc UTF-8 #{Shellwords.escape(path_to_tempfile)} -}
end

Extracts the title from the PDF metadata. Requires pdfinfo. Returns the title, or nil if no title was found.

[Source]

    # File lib/rdig/content_extractors/pdf.rb, line 42
# Extracts the document title from the output of pdfinfo.
#
# path_to_tempfile:: path of the PDF file to inspect.
#
# Returns the stripped title string, or nil when the output has no
# "Title:" line, when that line's value is blank, or when anything goes
# wrong while running the command or scanning its output (title extraction
# is best-effort and must not abort processing of the document).
def get_title(path_to_tempfile)
  # Required here because the file's top-level require list is not visible
  # from this method; repeated calls to require are no-ops.
  require 'shellwords'
  # Shell-escape the path so quotes or metacharacters in it cannot break or
  # extend the command line.
  info = %x{#{@pdfinfo} -enc UTF-8 #{Shellwords.escape(path_to_tempfile)}}
  # Anchor at the start of a line so keys that merely end in "title:" (for
  # example "Subtitle:") are ignored, and allow only blanks/tabs after the
  # colon so an empty Title value cannot swallow the newline and capture the
  # following line instead.
  title = info[/^title:[ \t]*(.*)$/i, 1]
  return nil if title.nil?
  title = title.strip
  title.empty? ? nil : title
rescue StandardError
  # Deliberate best-effort: e.g. output that is not valid in the expected
  # encoding makes the regexp match raise; report "no title" instead.
  nil
end

[Source]

    # File lib/rdig/content_extractors/pdf.rb, line 26
# Extracts the text and the title from raw PDF content.
#
# content:: the PDF data, handed to +as_file+ (not defined in this listing;
#           presumably supplied by the included ExternalAppHelper -- confirm
#           there), which yields an object responding to +path+.
#
# Returns a Hash with :content (the stripped result of get_content) and
# :title (the result of get_title). The Hash stays empty if +as_file+ never
# yields.
def process(content)
  extracted = {}
  as_file(content) do |tempfile|
    pdf_path = tempfile.path
    extracted[:content] = get_content(pdf_path).strip
    extracted[:title] = get_title(pdf_path)
  end
  extracted
end

[Validate]