| Class | RDig::ContentExtractors::PdfContentExtractor |
| In: |
lib/rdig/content_extractors/pdf.rb
|
| Parent: | ContentExtractor |
Extracts text from PDF content.
Requires the pdftotext and pdfinfo utilities from the xpdf-utils package (on Debian and friends, run ‘apt-get install xpdf-utils’).
# File lib/rdig/content_extractors/pdf.rb, line 12
# Sets up the PDF extractor.
#
# config:: passed unchanged to the superclass constructor
#
# Records the MIME-type pattern (anything starting with "application/pdf"),
# the names of the two helper programs, and whether both helpers look usable.
def initialize(config)
  super(config)
  @pattern = /^application\/pdf/
  @pdftotext = 'pdftotext'
  @pdfinfo = 'pdfinfo'
  # A helper counts as present when its "-h" output (stdout and stderr
  # combined) contains the expected copyright notice. all? stops at the
  # first helper that fails the probe, so later helpers are not run.
  @available = [@pdftotext, @pdfinfo].all? do |tool|
    banner = %x{#{tool} -h 2>&1}
    !(banner =~ /Copyright 1996/).nil?
  end
end
# File lib/rdig/content_extractors/pdf.rb, line 35
# Extracts the text of a PDF file by running pdftotext on it.
#
# path_to_tempfile:: path of the PDF file to convert
#
# Returns whatever pdftotext wrote to standard output. The text is requested
# as UTF-8 ("-enc UTF-8"); the trailing "-" asks pdftotext to write to stdout
# instead of a file.
def get_content(path_to_tempfile)
  # The path is wrapped in single quotes for the shell, so an embedded single
  # quote must be rewritten as '\'' (close quote, escaped quote, reopen).
  # The block form of gsub is used because "\'" inside a replacement string
  # would be interpreted as a back-reference.
  quoted_path = path_to_tempfile.to_s.gsub("'") { "'\\''" }
  %x{#{@pdftotext} -enc UTF-8 '#{quoted_path}' -}
end
Extracts the title from the PDF metadata (requires pdfinfo). Returns the title, or nil if no title was found.
# File lib/rdig/content_extractors/pdf.rb, line 42
# Extracts the document title from the PDF metadata by running pdfinfo.
#
# path_to_tempfile:: path of the PDF file to inspect
#
# Returns the stripped title, or nil when the output has no "Title:" line,
# when the title is blank, or when anything goes wrong while running or
# parsing pdfinfo (title extraction is best-effort).
def get_title(path_to_tempfile)
  # The path is wrapped in single quotes for the shell, so an embedded single
  # quote must be rewritten as '\''. The block form of gsub avoids "\'" being
  # treated as a back-reference in the replacement.
  quoted_path = path_to_tempfile.to_s.gsub("'") { "'\\''" }
  info = %x{#{@pdfinfo} -enc UTF-8 '#{quoted_path}'}

  # Anchor at the start of a line and allow only blanks/tabs after the colon,
  # so that an empty "Title:" line cannot swallow the following metadata line
  # and "title:" inside another field's value is not picked up.
  title = info[/^title:[ \t]*(.*)$/i, 1]
  return nil if title.nil?

  title = title.strip
  title.empty? ? nil : title
rescue StandardError
  # Deliberate best-effort: a failing pdfinfo must not abort the caller.
  nil
end