-
Notifications
You must be signed in to change notification settings - Fork 0
/
Batch_GI_PDF.py
106 lines (76 loc) · 2.8 KB
/
Batch_GI_PDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import pytictoc
t = pytictoc.TicToc()
t.tic()
from PIL import Image
import pytesseract
import sys
from pdf2image import convert_from_path
import os
from pymongo import MongoClient
import pandas as pd
import re
import gridfs
import datetime
import warnings
warnings.filterwarnings('ignore')
# ---------------------------------------------------------------------------
# Configuration and shared state for the batch run.
# ---------------------------------------------------------------------------

# MongoDB: one collection for the parsed records, GridFS on the same database
# for the source PDFs.
client = MongoClient("localhost", 27017)
db = client["Batch_GI"]
coll = "Batch_GI"
db_coll = db[coll]
fs = gridfs.GridFS(db)

# Input directory. The trailing slash matters: file names are appended to
# `path` by plain string concatenation here and further down in the script.
path = "/home/controllingde/G/Batch_GI/"
files = os.listdir(path)
# Select on the file suffix so that names which merely contain ".pdf"
# somewhere in the middle (e.g. "scan.pdf.bak") are not picked up.
pdffiles = [path + x for x in files if x.endswith(".pdf")]

# Column layout of the summary table; one row is appended per processed PDF.
colnames = ['batch', 'item', '_id', 'FS', 'GI', 'MHD', 'filename']
total_df = pd.DataFrame(columns=colnames)

# Date-like token: one arbitrary character, a month-like number (1-12, with
# optional leading zero), "/", a two- or four-digit number, and one more
# arbitrary character. Raw string so the backslashes reach `re` unchanged.
# NOTE(review): "MHD" presumably stands for the best-before date - confirm.
datepattern = r".(1[0-2]|0[1-9]|\d)\/([2-9]\d[1-9]\d|[1-9]\d)."
# Literal marker that precedes the batch number in the OCR text.
batchsign = "Ch.-B.:"

# Progress bookkeeping: 1-based counter and total number of PDFs.
i = 1
tl = len(pdffiles)

# Timestamp of the form YYYY-MM-DD_HH-MM-SS, safe to use in a file name.
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
new_filename = "Matching_Batch_" + timestamp + ".csv"
# Compile the date pattern once instead of on every file.
_DATE_RE = re.compile(datepattern)


def _ocr_last_page(pdf_path):
    """Render every page of *pdf_path* and return the OCR text of the last one.

    Each page is rasterised at 500 dpi, written to ``page_<n>.jpg`` in the
    current working directory and passed through Tesseract. Hyphenated line
    breaks (``-`` followed by a newline) are joined. Returns an empty string
    when the PDF yields no pages.

    NOTE(review): the text of each page overwrites the previous one, so only
    the last page is parsed afterwards. Confirm whether multi-page documents
    should be concatenated instead.
    """
    text = ""
    pages = convert_from_path(pdf_path, 500)
    for page_number, page in enumerate(pages, start=1):
        filename = "page_" + str(page_number) + ".jpg"
        page.save(filename, "JPEG")
        # Close the image file as soon as Tesseract has read it.
        with Image.open(filename) as page_image:
            text = str(pytesseract.image_to_string(page_image))
        text = text.replace('-\n', '')
    return text


def _line_from(haystack, marker):
    """Return the text from *marker* to the end of its line, or '' if absent."""
    start = haystack.find(marker)
    if start == -1:
        return ""
    # partition() keeps the whole remainder when no newline follows.
    return haystack[start:].partition("\n")[0]


def _parse_record(text, pdf_path):
    """Extract the summary fields for one document from its OCR *text*.

    Returns a dict with the keys batch, item, _id, FS, GI, MHD and filename
    (in that order). Raises ValueError, naming *pdf_path*, when the batch
    marker or a date-like token cannot be found.
    """
    marker_at = text.rfind(batchsign)
    if marker_at == -1:
        raise ValueError(
            "batch marker {!r} not found in OCR text of {}".format(
                batchsign, pdf_path))
    # Everything after the last marker, skipping the one character that
    # directly follows it.
    tail = text[marker_at + len(batchsign) + 1:]

    # First line after the marker: "<batch> <item>".
    first_line = tail.partition("\n")[0]
    batch, _, item = first_line.partition(" ")

    mhd_match = _DATE_RE.search(text)
    if mhd_match is None:
        raise ValueError(
            "no date matching {!r} found in OCR text of {}".format(
                datepattern, pdf_path))

    # A "FOL-" line was once extracted the same way as FS/GI; it is
    # currently disabled.
    return {
        "batch": batch,
        "item": item,
        "_id": batch + "_" + item,
        "FS": _line_from(tail, "FS-"),
        "GI": _line_from(tail, "GI-"),
        # Drops the trailing wildcard character of the match but keeps the
        # leading one. NOTE(review): confirm the leading character is wanted.
        "MHD": mhd_match.group()[:-1],
        "filename": pdf_path,
    }


# One single-row frame per PDF; concatenated once after the loop.
frames = []
for file_no, f in enumerate(pdffiles, start=1):
    t2 = pytictoc.TicToc()
    t2.tic()

    text = _ocr_last_page(f)
    impdict = _parse_record(text, f)

    # Store the source PDF in GridFS, closing the file handle afterwards.
    # NOTE(review): a GridFS file stored for an earlier version of the same
    # record is not deleted when that record is replaced below.
    with open(f, 'rb') as pdf_handle:
        pdf_id = fs.put(pdf_handle)

    # The database document carries the full text and the PDF reference in
    # addition to the summary fields; an existing record with the same _id
    # is replaced.
    document = dict(impdict, fulltext=text.split("\n"), pdffile=pdf_id)
    db_coll.replace_one({"_id": document["_id"]}, document, upsert=True)

    frames.append(
        pd.DataFrame.from_dict(impdict, orient="index").transpose())

    print("{} of {} processed".format(file_no, tl))
    t2.toc()

total_df = pd.concat([total_df] + frames)

# NOTE(review): the output name is fixed; the timestamped `new_filename`
# built earlier in the script is not used here - confirm which is intended.
total_df.to_csv(path + "test01.csv")
# Runs after the CSV export, so it does not affect the file; with
# verify_integrity=True it raises if two documents produced the same _id.
total_df.set_index(["_id"], inplace=True, verify_integrity=True)
t.toc()