-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEN-Homophones-Filtering.py
64 lines (39 loc) · 1.14 KB
/
EN-Homophones-Filtering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# coding: utf-8
# In[1]:
import re
import sys
import codecs, csv
# In[2]:
# Open the CMU pronouncing dictionary for reading, decoded as UTF-8.
cmuFile = codecs.open("cmudict-0.7b", mode="r", encoding='utf8')
# In[3]:
# Open the ELP lexicon CSV for reading, decoded as UTF-8.
elpFileCSV = codecs.open("ELP-Lexicon.csv", mode="r", encoding='utf8')
# In[4]:
# Row iterator over the ELP CSV (comma-separated, double-quote quoting).
elpFile = csv.reader(elpFileCSV, quotechar='"', delimiter=',')
# In[5]:
# Build lexiconELP: the upper-cased first column of every ELP row whose word
# is written all-lowercase or Title-case. Other spellings are skipped.
lexiconELP = []
with elpFileCSV:  # the reader is fully consumed below, so release the file
    for index, line in enumerate(elpFile):
        if not line:
            # Report the position of empty CSV rows.
            # NOTE(review): the indentation of this script was lost; the
            # original `else:` is assumed to pair with `if line:` rather
            # than with the case test below -- confirm against the notebook.
            print(index)  # call form prints the same on Python 2 and 3
            continue
        word = line[0]
        if word.islower() or word.istitle():
            lexiconELP.append(word.upper())
# In[6]:
# NOTE(review): get_ipython is not defined in this file; presumably it is
# injected by the IPython/Jupyter session this script was exported from.
get_ipython().magic(u'time')

# Build dictionary: CMU headword -> list of pronunciation tokens, restricted
# to headwords that also occur in lexiconELP.
elpWords = frozenset(lexiconELP)  # hashed lookup instead of scanning the list per line
dictionary = {}
with cmuFile:  # close the CMU file once every line has been read
    for line in cmuFile:
        if ";;;" in line:
            # Lines containing ";;;" are header/comment lines; skip them.
            continue
        # NOTE(review): the original comment says every entry contains 2
        # spaces; if word and phonemes are separated by a double space,
        # splitting on a single space leaves an empty string as the first
        # element of the phoneme list -- confirm this is intended.
        entry = line.strip().split(" ")
        word = entry[0]        # headword is the first field
        syllabes = entry[1:]   # everything after the headword
        if word in elpWords:
            dictionary[word] = syllabes
# In[7]:
# Write one line per retained word, in sorted key order: the word, a single
# space, then its pronunciation tokens joined by single spaces.
# The entries are text decoded through codecs, so the output is opened the
# same way with an explicit UTF-8 encoding instead of relying on an implicit
# default encoding at write time.
with codecs.open('test.csv', 'w', 'utf8') as output:
    for key in sorted(dictionary):
        output.write(key + " " + " ".join(dictionary[key]) + "\n")
# In[8]:
# Bare expression from the last notebook cell (presumably used to display the
# list in the notebook); as a plain script statement it has no effect.
lexiconELP