Skip to content

Commit

Permalink
PDFBox integration for #30
Browse files Browse the repository at this point in the history
  • Loading branch information
mgorenstein committed May 26, 2014
1 parent 7e11bda commit 97692f1
Show file tree
Hide file tree
Showing 3 changed files with 9,424 additions and 0 deletions.
Binary file not shown.
15 changes: 15 additions & 0 deletions extractor_research/extractors/pdfbox.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/usr/bin/python
import os

class PDFBox:
    """Extract the text of a PDF by running the PDFBox command-line app.

    The extraction is delegated to ``java -jar <jar> ExtractText <pdf> <txt>``.

    Attributes:
        pdf_file: Path of the PDF document to read.
        txt_file: Path of the text file the extracted text is written to.
        jar_file: Path of the PDFBox application jar passed to ``java -jar``.
    """

    # Jar used when the caller does not supply one; a relative name is
    # resolved by ``java`` against the current working directory.
    DEFAULT_JAR = 'pdfbox-app-1.8.5.jar'

    def __init__(self, pdf_file, txt_file, jar_file=DEFAULT_JAR):
        """Remember the input/output paths and which PDFBox jar to run.

        Args:
            pdf_file: Path of the PDF document to read.
            txt_file: Path of the text file to write.
            jar_file: Path of the PDFBox application jar. Defaults to
                ``pdfbox-app-1.8.5.jar``.
        """
        self.pdf_file = pdf_file
        self.txt_file = txt_file
        self.jar_file = jar_file

    def _command(self):
        """Return the argument vector used to launch the extraction."""
        # One list element per argument: paths containing spaces or shell
        # metacharacters reach java unchanged because no shell parses them.
        return [
            'java', '-jar', self.jar_file,
            'ExtractText', self.pdf_file, self.txt_file,
        ]

    def extract(self):
        """Run PDFBox ``ExtractText`` on ``pdf_file``, writing ``txt_file``.

        Returns:
            The exit status of the ``java`` process, or 127 when the process
            could not be started at all (for example, ``java`` is not
            installed).
        """
        # Imported here so the block does not depend on the module's
        # top-level import list.
        import subprocess

        try:
            return subprocess.call(self._command())
        except OSError:
            # Keep the best-effort contract of the previous os.system() call:
            # a missing executable is reported as a status, not raised.
            return 127

if __name__ == '__main__':
    # Ad-hoc manual run: extract one sample PDF into the pdfbox output folder.
    sample_pdf = '../input/pride_and_prej/1.pdf'
    sample_txt = '../output/pride_and_prej/pdfbox/1.txt'
    PDFBox(pdf_file=sample_pdf, txt_file=sample_txt).extract()
Loading

0 comments on commit 97692f1

Please sign in to comment.