-
Notifications
You must be signed in to change notification settings - Fork 2
/
findText_usingSums.py
137 lines (113 loc) · 5.75 KB
/
findText_usingSums.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import os
import argparse
from pathlib import Path
import sys
os.environ['OPENCV_IO_ENABLE_JASPER']='True' #has to be set before importing cv2 otherwise it won't read the variable
import cv2
import numpy as np
import subprocess
from multiprocessing import Pool
from scipy.signal import find_peaks
#static variables for clarity
COLUMNS = 0
GREEN = (0, 255, 0)
#parameters that can be tweaked
LINE_THICKNESS = 3 #how thick to make the line around the found contours in the debug output
PADDING = 10 #padding to add around the found possible column to help account for image skew and such
BATCH_LOCATION = 'E:/chronam/dlc_flavory_ver01' #directory to start in to find the .jp2 files to process
TESSERACT = 'C:/Program Files/Tesseract-OCR/tesseract.exe'
CREATE_COLUMN_OUTLINE_IMAGES = True #if we detect that we didn't find all the columns. Create a debug image (tiff) showing the columns that were found
REMOVE_JP2_AFTER_PROCESSING = True #If we want to save space and remove the jp2 file after we are done processing the OCR
def columnIndexes(a):
"""
creates pair of indexes for left and right index of the image column
For example [13, 1257, 2474, 3695, 4907, 6149]
becomes: [[13 1257], [1257 2474], [2474 3695], [3695 4907], [4907 6149]]
"""
nrows = (a.size-2)+1
return a[1*np.arange(nrows)[:,None] + np.arange(2)]
def convertToGrayscale(img):
print("converting to greyscale")
temp_img = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
return temp_img
def invert(img):
"""
Black becomes white in the image
"""
print("invert image")
_,temp_img = cv2.threshold(img, 140, 255, cv2.THRESH_BINARY_INV)
return temp_img
def createColumnImages(img, basename, directory):
"""
we sum each column of the inverted image. The columns should show up as peaks in the sums
uses scipy.signal.find_peaks to find those peaks and use them as column indexes
"""
files = []
temp_img = convertToGrayscale(img)
temp_img = invert(temp_img)
sums = np.sum(temp_img, axis = COLUMNS)
sums[0] = 1000 #some random value so that find_peaks properly detects the peak for the left most column
sums = sums * -4 #invert so that minimums become maximums and exagerate the data so it is more clear what the peaks are
peaks, _ = find_peaks(sums, distance=800) #the column indexs of the img array, spaced at least 800 away from the previous peak
if peaks.size == 0:
with open('troublesomeImages.txt', 'a') as f:
print("ERROR: something went wrong with finding the peaks for image: ", os.path.join(directory, basename))
f.write(os.path.join(directory, basename) + ".jp2 0\n")
return files
peaks[0] = 0 #automatically make the left most column index the start of the image
peaks[-1] =sums.size -1 #automatically make the right most column index the end of the image
boxed = np.copy(img)
if peaks.size < 6:
with open('troublesomeImages.txt', 'a') as f:
print("found image that is causing problems: ", os.path.join(directory, basename))
f.write(os.path.join(directory, basename) + ".jp2 " + str(peaks.size) + "\n")
columnIndexPairs = columnIndexes(peaks)
ystart = 0
yend = img.shape[0]
for columnIndexPair in columnIndexPairs:
xstart = max(columnIndexPair[0]-PADDING, 0)
xend = min(columnIndexPair[1]+PADDING, img.shape[1])
filepath = os.path.join(directory, '%s_xStart%s_xEnd%s.tiff' % (basename, xstart,xend))
files.append(filepath)
crop_img = img[ystart:yend, xstart:xend]
if not os.path.exists(filepath):
print("writing out cropped image: ", filepath)
cv2.imwrite(filepath, crop_img)
if CREATE_COLUMN_OUTLINE_IMAGES:
cv2.rectangle(boxed,(xstart,ystart),(xend,yend), GREEN, LINE_THICKNESS)
if CREATE_COLUMN_OUTLINE_IMAGES:
filepath = os.path.join(directory, '%s-contours.jpeg' % basename)
cv2.imwrite(filepath, boxed, [cv2.IMWRITE_JPEG_QUALITY, 50])
if REMOVE_JP2_AFTER_PROCESSING:
os.remove(os.path.join(directory, basename + ".jp2"))
return files
def createOCRFiles(dirFilePair):
directory = dirFilePair[0]
file = dirFilePair[1]
fullPath = os.path.join(directory, file)
print('processing file: ', fullPath)
basename = Path(file).stem
img = cv2.imread(fullPath)
files = createColumnImages(img, basename, directory)
for inputFile in files:
outputFile = os.path.join(directory, Path(inputFile).stem)
print('creating OCR files from: ', inputFile)
if not os.path.exists(outputFile + '.txt') or not os.path.exists(outputFile + '.hocr'):
subprocess.run([TESSERACT, inputFile, outputFile, 'hocr', 'txt'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
os.remove(inputFile) #we are done with the column image, so we delete it to save space
def createColumnHocr(batch_dir):
with Pool(8) as p:
filesToProcess = []
for root,dirs,files in os.walk(batch_dir):
for file in files:
if file.lower().endswith('.jp2'): #TODO ignore images at the F:\dlc_gritty_ver01\data\sn83030212\00206530157\ directory level as they are just for quality and don't contain newspaper pages
filesToProcess.append([root, file])
p.map(createOCRFiles, filesToProcess)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Create columns of text and run OCR on them.')
parser.add_argument('--chronamdir', help='The directory of the chronicling america batch')
args = parser.parse_args()
if args.chronamdir:
BATCH_LOCATION = args.chronamdir
createColumnHocr(BATCH_LOCATION)
print("finished")