-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathconvert_pdf_to_txt.py
40 lines (37 loc) · 1.3 KB
/
convert_pdf_to_txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#coding=utf-8
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from cStringIO import StringIO
import os
def _extract_pdf_text(path):
    """Return the text pdfminer extracts from the PDF file at *path*.

    Any exception raised while opening or parsing the file propagates to
    the caller; the file handle, the converter device and the in-memory
    buffer are released on both the success and the failure path.
    """
    rsrcmgr = PDFResourceManager()
    buf = StringIO()
    try:
        device = TextConverter(rsrcmgr, buf, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                # Empty page set, maxpages=0 and an empty password are the
                # values the original code passed; in pdfminer these select
                # every page of an unencrypted document (see the
                # PDFPage.get_pages documentation to confirm).
                pages = PDFPage.get_pages(
                    fp,
                    set(),
                    maxpages=0,
                    password="",
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
        finally:
            device.close()
        # Read the buffer only after the device is closed, matching the
        # order of operations in the original implementation.
        return buf.getvalue()
    finally:
        buf.close()


def convert_pdf_to_txt(path, save_name):
    """Extract the text of the PDF at *path* and write it to *save_name*.

    This is a best-effort helper: ordinary errors are not raised to the
    caller but reported on stdout instead.

    Args:
        path: Path of the PDF file to read.
        save_name: Path of the text file to create (opened with mode "w",
            so an existing file is overwritten).

    Returns:
        None. One status line is printed:
        "<save_name> Writing Succeed!" on success,
        "Writing Failed!......" if reading/parsing the PDF failed, or
        "Writing Failed!" if writing the output file failed.

    Note:
        Only ``Exception`` subclasses are swallowed; ``KeyboardInterrupt``
        and ``SystemExit`` propagate so the program can still be stopped.
    """
    try:
        text = _extract_pdf_text(path)
    except Exception:
        print("Writing Failed!......")
        return

    try:
        with open(save_name, "w") as out:
            # One buffered write instead of one write call per character.
            out.write(text)
        # Call form with a single argument prints the same text under the
        # Python 2 print statement this file was written for.
        print("%s Writing Succeed!" % save_name)
    except Exception:
        print("Writing Failed!")
if __name__ == '__main__':
    # Script entry point: convert the sample PDF that sits next to this
    # script into a plain-text file.
    convert_pdf_to_txt(path="pdf_test.pdf", save_name="txt_test.txt")