Skip to content

Commit

Permalink
PyPDF2 support for #30
Browse files Browse the repository at this point in the history
  • Loading branch information
mgorenstein committed May 26, 2014
1 parent c9c36ce commit 7e11bda
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 0 deletions.
16 changes: 16 additions & 0 deletions extractor_research/extractors/pdf2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/usr/bin/python
from PyPDF2 import PdfFileReader

class PDF2:
    """Extract the text of a PDF into a plain-text file using PyPDF2.

    The constructor opens both the input PDF and the output text file;
    ``extract`` writes the text of every page and then releases both files.
    An instance is therefore single-use: call ``extract`` once.
    """

    def __init__(self, pdf_file, txt_file):
        """Open ``pdf_file`` for reading and ``txt_file`` for writing.

        Args:
            pdf_file: Path of the PDF document to read.
            txt_file: Path of the text file to create (truncated if present).
        """
        # Keep a reference to the input stream so it can be closed later;
        # previously the handle was created inline and leaked.
        self._source = open(pdf_file, 'rb')
        try:
            self.doc = PdfFileReader(self._source)
            self.output = open(txt_file, 'w')
        except Exception:
            # Do not leak the input handle if the PDF cannot be parsed or
            # the output file cannot be created.
            self._source.close()
            raise

    def extract(self):
        """Write the text of each page, in order, to the output file.

        Both the output file and the input PDF stream are closed when this
        method returns, including when a page fails to extract.
        """
        try:
            for page in self.doc.pages:
                self.output.write(page.extractText())
        finally:
            # The input stream is closed only after every page has been
            # processed, since the reader may still need it while iterating.
            self.output.close()
            self._source.close()

if __name__ == '__main__':
    # Demo run: convert the first sample PDF into its text counterpart.
    source_pdf = '../input/pride_and_prej/1.pdf'
    target_txt = '../output/pride_and_prej/pdf2/1.txt'
    PDF2(source_pdf, target_txt).extract()
1 change: 1 addition & 0 deletions extractor_research/output/pride_and_prej/pdf2/1.txt

Large diffs are not rendered by default.

0 comments on commit 7e11bda

Please sign in to comment.