Skip to content

Commit

Permalink
Create PracticePDFExtractor.py
Browse files Browse the repository at this point in the history
Performs simple text extraction from a PDF into HTML or plain text using either the PyPDF2 or the PDFMiner library
  • Loading branch information
grahamsack committed May 22, 2014
1 parent 53f9704 commit aef8398
Showing 1 changed file with 36 additions and 0 deletions.
36 changes: 36 additions & 0 deletions Classifiers/PracticePDFExtractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import PyPDF2
import csv
import pdfminer
import os


def extract_text_using_pypdf(fileName):
    """Extract the text of every page of a PDF using PyPDF2.

    Args:
        fileName: Path of the PDF file to read.

    Returns:
        A list with one entry per page, in page order, each entry being
        the value returned by that page's ``extractText()`` call.
    """
    # Open the file in a ``with`` block so the handle is closed once all
    # pages have been read; previously the handle was never closed.
    with open(fileName, "rb") as pdfFile:
        pdf = PyPDF2.PdfFileReader(pdfFile)
        # Text must be extracted while the underlying file is still open.
        return [page.extractText() for page in pdf.pages]

def extract_text_using_pdf_miner(inputFile, outputFile):
    """Convert a PDF by running PDFMiner's ``pdf2txt.py`` command-line script.

    Indicate txt or html by the file name of the output, e.g. output.txt
    or output.html.
    For more commands, see http://www.unixuser.org/~euske/python/pdfminer/

    Args:
        inputFile: Path of the PDF file to convert.
        outputFile: Path of the file that ``pdf2txt.py`` should write.

    Raises:
        OSError: If the ``pdf2txt.py`` executable cannot be started.
    """
    import subprocess

    # Pass the arguments as a list and bypass the shell, so file names
    # containing spaces or shell metacharacters reach pdf2txt.py intact
    # instead of being split or interpreted by the shell.
    subprocess.call(["pdf2txt.py", "-o", outputFile, inputFile])

def export_to_csv(inputFileName, csvFileName):
    """Write the lines of a text file as a single tab-delimited CSV row.

    Every line of the input file (including its trailing newline) becomes
    one field of the one row that is written to the CSV file.

    Args:
        inputFileName: Path of the text file to read.
        csvFileName: Path of the CSV file to create or overwrite.
    """
    import sys

    # Read everything up front inside a ``with`` block so the input handle
    # is closed; previously it was left open.
    with open(inputFileName, 'r') as inputFile:
        lines = inputFile.readlines()

    # The csv module needs a binary file on Python 2 but a text file opened
    # with newline='' on Python 3 (a 'wb' file raises TypeError there).
    if sys.version_info[0] >= 3:
        csvfile = open(csvFileName, 'w', newline='')
    else:
        csvfile = open(csvFileName, 'wb')

    with csvfile:
        myWriter = csv.writer(csvfile, delimiter='\t')
        myWriter.writerow(lines)

def test():
    """Convert a sample PDF to HTML with PDFMiner, then export it to CSV."""
    # Earlier PyPDF2-based experiment, left disabled:
    #textVector = extract_text_using_pypdf("Lunch-Money.pdf")
    #export_to_csv(textVector[2:], "Lunch-Money.csv")
    pdfPath = "E3562014236085.pdf"
    htmlPath = "E3562014236085.html"
    csvPath = "E3562014236085.csv"

    # The HTML produced by the first step is the input of the second.
    extract_text_using_pdf_miner(pdfPath, htmlPath)
    export_to_csv(htmlPath, csvPath)


# Run the sample conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    test()

0 comments on commit aef8398

Please sign in to comment.