-
Notifications
You must be signed in to change notification settings - Fork 1
/
CaptchaCracker.py
191 lines (157 loc) · 7.67 KB
/
CaptchaCracker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import os
from itertools import chain
import numpy
from PIL import Image
import sys as System
import re as REEEEE
from Letter import Letter
# Script parameters, defined globally so the functions below can read them.
# They are taken positionally from the command line. The assignments run in
# order, so when fewer than three arguments are supplied the names assigned
# before the IndexError stay defined and the remaining ones are left undefined.
# A non-integer width or height raises ValueError, which is not handled here.
try:
    globeImagePath = System.argv[1]  # Path to captcha image
    globeXLength = int(System.argv[2])  # Width of Captcha Image
    globeYLength = int(System.argv[3])  # Height of Captcha Image
except IndexError:
    # One or more command line args were not provided
    pass
# Parses image file and returns array of (X,Y,Z) tuples corresponding to pixel color codes
def parseImageFile():
    """Load the captcha pixels and return them as a flat list of RGB tuples.

    Three input modes are handled:

    * No command line arguments: the RGB codes are read from standard input
      through parseConsoleInputRGBCodes().
    * globeImagePath is an image PIL can open: the image is converted to RGB,
      globeXLength / globeYLength are overwritten with the image size, the
      pixel codes are dumped to a text file next to the image (pixels
      separated by spaces, channels by commas), globeImagePath is re-pointed
      at that text file, and the file is parsed with parseRGBCodeFile().
    * PIL raises OSError for globeImagePath: the path is handed to
      parseRGBCodeFile() unchanged, i.e. it is treated as a text file of RGB
      codes.

    Returns:
        A list of (R, G, B) integer tuples in row-major order.

    Raises:
        OSError: if the RGB text file cannot be written or read.
    """
    global globeXLength, globeYLength, globeImagePath

    # If no input arguments were provided check stdin
    if len(System.argv) == 1:
        return parseConsoleInputRGBCodes()

    # Only the decode step is guarded: a failure here means "not an image",
    # whereas a failure while writing the text dump is a real error that must
    # not be silently swallowed.
    try:
        with Image.open(globeImagePath, 'r') as sourceImage:
            captchaImage = sourceImage.convert('RGB')
    except OSError:
        return parseRGBCodeFile(globeImagePath)

    globeXLength, globeYLength = captchaImage.size
    pixelRows = numpy.array(captchaImage).tolist()
    arrayOfPixels = [tuple(pixel) for pixel in chain.from_iterable(pixelRows)]

    # "r,g,b r,g,b ..." -- the format readLines() understands
    outputString = ' '.join(','.join(map(str, pixel)) for pixel in arrayOfPixels)

    # Same directory and base name as the image, with a .txt extension.
    # Swapping the extension keeps the directory part intact on every platform.
    globeImagePath = os.path.splitext(globeImagePath)[0] + ".txt"
    with open(globeImagePath, 'w') as outputFile:
        outputFile.write(outputString)

    return parseRGBCodeFile(globeImagePath)
# Parses txt file path containing RGB list provided in the argument and returns array of (X,Y,Z) tuples corresponding to pixel
# color codes
def parseRGBCodeFile(fileName=str()):
    """Read a text file of RGB codes and return its pixels.

    Args:
        fileName: path of the text file to read.

    Returns:
        Whatever readLines() produces for the file's lines: a list of
        (R, G, B) integer tuples.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    # The context manager closes the file even if reading fails part-way
    with open(fileName, "r") as rgbFile:
        lines = rgbFile.readlines()
    # Separator handling and integer conversion are done by readLines
    return readLines(lines)
# read input lines and return readLines(lines) list
def parseConsoleInputRGBCodes():
    """Read the captcha from standard input and return its pixels.

    The first input line must hold the image size as "<height> <width>"; it is
    stored in the globals globeYLength and globeXLength. Exactly globeYLength
    further lines are then read and passed to readLines().

    Returns:
        The list of (R, G, B) tuples produced by readLines().

    Raises:
        ValueError: if the first line is not exactly two integers.
        SystemExit: with status 1 when either dimension is not positive.
    """
    global globeXLength, globeYLength

    # First line must be size (Y,X); split() tolerates repeated/leading blanks
    globeYLength, globeXLength = [int(size) for size in input().split()]

    # An image needs both a height and a width, so reject it when either is missing
    if globeYLength <= 0 or globeXLength <= 0:
        print("No input has been provided")
        System.exit(1)

    # Get RGB list input, one line per image row
    lines = [input() for _ in range(globeYLength)]
    return readLines(lines)
# Reads raw input of lines
def readLines(lines):
    """Convert raw text lines of colour codes into a list of RGB tuples.

    Newlines, spaces and commas are all treated as separators. Every
    non-digit character is stripped from each field, and fields that end up
    empty (blank lines, trailing separators, doubled separators) are skipped
    instead of being fed to int(). The input list is not modified.

    Args:
        lines: iterable of strings containing the colour codes.

    Returns:
        A list of (R, G, B) integer tuples built from consecutive groups of
        three values; a trailing incomplete group is dropped.
    """
    channelValues = []
    for line in lines:
        # Normalise both separators on the same string so neither replacement is lost
        normalisedLine = line.replace('\n', ',').replace(' ', ',')
        for entry in normalisedLine.split(','):
            digits = REEEEE.sub("[^0-9]", "", entry)  # Strip non numerics
            if digits:
                channelValues.append(int(digits))
    # Group into 3 sized tuples representing RGB code
    return list(zip(*[channelValues[offset::3] for offset in range(3)]))
# Clean values of all non-letter related pixels
def cleanLetters(arrayOfPixels=list()):
    """Binarise the pixels and return them both flat and as rows.

    Each pixel tuple is compared with (100, 100, 100) using ordinary tuple
    ordering (lexicographic, so the first channel that differs decides):
    greater becomes white (255, 255, 255), anything else becomes black
    (0, 0, 0).

    Args:
        arrayOfPixels: flat sequence of (R, G, B) tuples.

    Returns:
        A pair (flat, rows): the binarised flat list, and that list cut into
        globeYLength consecutive slices of globeXLength pixels each.
    """
    # Clean RGB codes @Update improve this for captchas with wider colour diversity
    threshold = (100, 100, 100)
    flatPixels = []
    for pixel in arrayOfPixels:
        if pixel > threshold:
            flatPixels.append((255, 255, 255))
        else:
            flatPixels.append((0, 0, 0))

    pixelRows = []
    for rowNumber in range(0, globeYLength):
        rowStart = rowNumber * globeXLength
        pixelRows.append(flatPixels[rowStart:rowStart + globeXLength])

    return flatPixels, pixelRows
# Separates the letters and removes the surrounding whitespace
def separateLetters(twoDimensionArrayOfPixels=list(list())):
    """Trim the white border and split the pixel grid into Letter objects.

    The grid is modified in place: all-white rows are deleted from the top and
    bottom, then all-white columns from the left and right. The globals
    globeYLength and globeXLength are decremented for every row / column
    removed, so afterwards they describe the trimmed grid.

    The trimmed grid is then scanned column by column. Each time an all-white
    column (or the last column) is reached, the columns from the current
    letter start up to, but not including, the current column are wrapped in
    a Letter together with a (width, globeYLength) tuple.

    Args:
        twoDimensionArrayOfPixels: list of rows, each a list of (R, G, B)
            tuples; white is (255, 255, 255).

    Returns:
        The list of Letter objects, in left-to-right order.

    NOTE(review): a grid with no non-white row indexes an empty list in the
    first trimming loop (IndexError) -- confirm callers never pass one.
    """
    # Separate the list of pixels into an iterable of letter objects
    listOfLetters = list()
    global globeXLength, globeYLength
    # Remove top and bottom whitespace from pixel array
    while set(twoDimensionArrayOfPixels[0]).__eq__({(255, 255, 255)}):
        del twoDimensionArrayOfPixels[0]
        globeYLength -= 1
    while set(twoDimensionArrayOfPixels[-1]).__eq__({(255, 255, 255)}):
        del twoDimensionArrayOfPixels[-1]
        globeYLength -= 1
    # Remove left and right white space FIX: Find a cleaner way of deleting columns in 2D arrays
    while set([row[0] for row in twoDimensionArrayOfPixels]).__eq__({(255, 255, 255)}):
        for row in twoDimensionArrayOfPixels:
            del row[0]
        globeXLength -= 1  # one column removed from every row
    while set([row[-1] for row in twoDimensionArrayOfPixels]).__eq__({(255, 255, 255)}):
        # print([row[-1] for row in twoDimensionArrayOfPixels])
        for row in twoDimensionArrayOfPixels:
            del row[-1]
        globeXLength -= 1  # one column removed from every row
    # Iterate through all the columns, if a column that is completely whitespace is encountered, create a letter object
    # with the sub array of colour codes and append it to the letter list, remove white space columns until you reach a
    # column with no white space, then repeat, stop iterating once xColumnIterator = globeXLength
    xStartLetterColumn = 0  # first column of the letter currently being collected
    xColumnIterator = 0  # column currently being inspected
    while xColumnIterator < globeXLength:  # Assuming the shape is uniform
        if set([row[xColumnIterator] for row in twoDimensionArrayOfPixels]).__eq__(
                {(255, 255, 255)}) or (
                xColumnIterator == globeXLength - 1):  # Encountered whitespace column or capturing last letter
            # The slice end is exclusive, so the column at xColumnIterator is not part of the letter.
            # NOTE(review): in the last-column case this leaves the final (non-white) column out of the
            # last letter -- confirm whether Letter relies on that.
            listOfLetters.append(Letter([row[xStartLetterColumn:xColumnIterator] for row in twoDimensionArrayOfPixels],
                                        (xColumnIterator - xStartLetterColumn, globeYLength)))
            # Iterate past useless whitespace
            xStartLetterColumn = xColumnIterator
            xColumnIterator += 1 if (xColumnIterator != globeXLength - 1) else 0  # Only increment if there are more columns
            # Skip any further all-white columns; the right edge was trimmed above, so this stops
            # at a non-white column before running off the end of the rows.
            while set([row[xColumnIterator] for row in twoDimensionArrayOfPixels]).__eq__({(255, 255, 255)}):
                xColumnIterator += 1
                xStartLetterColumn += 1
            # Step the start marker from the last white column onto the first column of the next letter.
            # NOTE(review): placement inside the `if` was inferred from tracing the loop -- confirm.
            xStartLetterColumn += 1
        xColumnIterator += 1
    return listOfLetters
def createImage(arrayOfRGBs=list(tuple())):
    """Build a PIL RGB image from a flat sequence of pixel tuples.

    The image is created with size (globeXLength, globeYLength) and filled
    with arrayOfRGBs through putdata().

    Args:
        arrayOfRGBs: flat sequence of (R, G, B) tuples.

    Returns:
        The newly created image.
    """
    canvasSize = (globeXLength, globeYLength)
    canvas = Image.new("RGB", canvasSize)
    canvas.putdata(arrayOfRGBs)
    return canvas
def writeImage(image=Image.Image):
    """Save the image next to the input file as "<base name>-output.png".

    The output path is derived from the global globeImagePath by replacing its
    extension with "-output.png". When that global is undefined (no path was
    given on the command line) or the file cannot be written, a message is
    printed and the function returns normally.

    Args:
        image: the PIL image to save.
    """
    # Write file
    try:
        # Swapping the extension keeps the directory part intact on every platform
        outputPath = os.path.splitext(globeImagePath)[0] + "-output.png"
        image.save(outputPath)
    except (IOError, NameError):
        # A tuple is required here: `IOError or NameError` evaluates to just IOError
        print("Path to output file not specified or does not exist")
def main():
    """Run the whole pipeline and print the recognised captcha text.

    Steps: load the pixels, binarise them, save the cleaned image, split the
    grid into letters and print the concatenation of each letter's
    identify() result.
    """
    arrayOfPixels = parseImageFile()
    oneDimensionArrayOfPixels, twoDimensionArrayOfPixels = cleanLetters(arrayOfPixels)

    # Outputs the cleaned version of the Captcha with only the letters of interest
    # (used for testing and viewing purposes). This must happen before
    # separateLetters(), which shrinks the global image dimensions.
    outputImage = createImage(oneDimensionArrayOfPixels)
    writeImage(outputImage)

    listOfLetters = separateLetters(twoDimensionArrayOfPixels)
    print(''.join(letter.identify() for letter in listOfLetters))


# Only run the pipeline when executed as a script, not when imported
if __name__ == "__main__":
    main()