-
Notifications
You must be signed in to change notification settings - Fork 0
/
emojiget.py
104 lines (86 loc) · 2.7 KB
/
emojiget.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#python
import urllib
import sys
import os
#page = urllib.urlopen('http://unicode.org/emoji/charts/full-emoji-list.html')
#text = page.readlines()
#outputFile = open('dataset.txt', 'w')
#outputFile.writelines(text)
#outputFile.close()
#sys.exit(0)
# Substring identifying the first line of an emoji record in the chart HTML.
_CODE_CELL_MARKER = "<td class='code'>"
# Substring identifying a vendor cell that carries no image for this emoji.
_MISSING_CELL_MARKER = "td class='miss'"
# Header row written at the top of the CSV output.
_CSV_HEADER = 'code,b&w,apple,andr,twit,wind,gmail,name,annotations\n'
# (line offset from the code-cell line, image file suffix), in CSV column order.
_IMAGE_COLUMNS = (
    (2, 'bw'),
    (3, 'apple'),
    (4, 'andr'),
    (5, 'twit'),
    (6, 'wind'),
    (7, 'gmail'),
)
# Line offsets (from the code-cell line) of the description and annotation cells.
_NAME_OFFSET = 11
_ANNOTATION_OFFSET = 14


def _ensure_dir(path):
    """Create directory *path* if needed; an already existing directory is fine."""
    try:
        os.makedirs(path)
    except OSError:
        # Another process (or an earlier run) may have created it already.
        if not os.path.isdir(path):
            raise


def _attr_value(line, attr):
    """Return the single-quoted value that follows ``attr=`` in *line*."""
    # Skip past "attr=" and the opening quote character.
    start = line.find(attr + '=') + len(attr) + 2
    return line[start:line.find("'", start)]


def _cell_text(line):
    """Return the text between the first '>' in *line* and the next '<'."""
    close = line.find('>')
    return line[close + 1:line.find('<', close)]


def _annotations(line):
    """Return every annotation label found in *line*, joined with ';'.

    A label is the text that starts 10 characters after each occurrence of
    ``annotate`` and runs up to the next '<'.  The fixed skip of 10 presumes
    the word is directly followed by a closing quote and '>' -- confirm
    against the actual chart markup if the page layout changes.
    """
    labels = []
    pos = line.find('annotate')
    while pos >= 0:
        labels.append(line[pos + 10:line.find('<', pos)])
        pos = line.find('annotate', pos + 1)
    return ';'.join(labels)


def _default_fetch():
    """Return ``urlretrieve`` for whichever Python major version is running."""
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        urlretrieve = urllib.urlretrieve  # Python 2
    return urlretrieve


def _build_row(lines, index, image_dir, fetch):
    """Build one CSV row (without newline) for the record starting at *index*.

    Every vendor image that is present is downloaded through *fetch* into
    *image_dir*; a vendor cell marked as missing yields the literal
    ``missing`` instead of a file path.
    """
    name = _attr_value(lines[index], 'name')
    fields = [name]
    for offset, suffix in _IMAGE_COLUMNS:
        cell = lines[index + offset]
        if _MISSING_CELL_MARKER in cell:
            fields.append('missing')
            continue
        target = os.path.join(image_dir, name + '_' + suffix + '.png')
        fetch(_attr_value(cell, 'src'), target)
        fields.append(target)
    fields.append(_cell_text(lines[index + _NAME_OFFSET]))
    fields.append(_annotations(lines[index + _ANNOTATION_OFFSET]))
    return ','.join(fields)


def main(html_path='data.html', csv_path='dataset.csv', image_dir='images/',
         fetch=None):
    """Convert the saved emoji chart HTML into a CSV file plus image files.

    Each line of *html_path* containing the code-cell marker starts a record.
    The vendor images, the description and the annotations are read from
    fixed line offsets after it, and one CSV row is written per record.

    Args:
        html_path: Path of the saved chart HTML to read.
        csv_path: Path of the CSV file to (over)write.
        image_dir: Directory that receives the downloaded images; created
            if it does not exist.
        fetch: Callable ``fetch(url, destination_path)`` used to download one
            image.  Defaults to the standard library's ``urlretrieve``.

    Returns:
        The number of CSV data rows written.

    Raises:
        IOError/OSError: If the input cannot be read, or the output file or
            the image directory cannot be created.
    """
    if fetch is None:
        fetch = _default_fetch()
    _ensure_dir(image_dir)

    with open(html_path, 'r') as html_file:
        lines = html_file.readlines()

    written = 0
    # The context manager guarantees rows already produced are flushed and
    # the handle is closed even when a download fails half-way through.
    with open(csv_path, 'w') as csv_file:
        csv_file.write(_CSV_HEADER)
        for index, line in enumerate(lines):
            if _CODE_CELL_MARKER not in line:
                continue
            if index + _ANNOTATION_OFFSET >= len(lines):
                # A record cut off by the end of the file cannot be parsed;
                # any later record would be truncated as well, so stop here.
                sys.stderr.write(
                    'skipping truncated record at line %d\n' % (index + 1))
                break
            row = _build_row(lines, index, image_dir, fetch)
            # Progress/debug output: echo the annotation line just processed.
            print(lines[index + _ANNOTATION_OFFSET])
            csv_file.write(row + '\n')
            written += 1
    return written


if __name__ == '__main__':
    main()