-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathremove_sw_hl.py
50 lines (37 loc) · 1.55 KB
/
remove_sw_hl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from collections import Counter
#path = "mycorpusfile.txt"
def makeCounter(path):
    """Count the whitespace-separated tokens of a UTF-8 text file.

    Args:
        path: Path of the corpus file to read.

    Returns:
        A ``Counter`` mapping each token to its number of occurrences.
    """
    tokens = Counter()
    with open(path, 'r', encoding='utf-8') as corpus:
        # A newline is whitespace as well, so counting line by line sees
        # exactly the tokens that splitting the whole text would produce.
        for line in corpus:
            tokens.update(line.split())
    return tokens
#mycounter = makeCounter(path)
def makeRemoveLists(mycounter, threshold=0.01):
    """Derive the hapax legomena and the stop words from a token counter.

    Args:
        mycounter: Mapping from token to occurrence count, e.g. a ``Counter``.
        threshold: Fraction of the total token count that a token's own count
            must exceed to be treated as a stop word. Defaults to 0.01 (1 %),
            the value that used to be hard-coded.

    Returns:
        A ``(hapax, stopwords)`` tuple of sets: ``hapax`` holds every token
        that occurs exactly once, ``stopwords`` every token whose count is
        strictly greater than ``threshold`` times the total count.
    """
    cutoff = threshold * sum(mycounter.values())
    # NOTES (translated from the original author's to-do list):
    #   - generator instead of list or set comprehension
    #   - Counter as Series
    #   - split in two
    hapax = {token for token, freq in mycounter.items() if freq == 1}
    stopwords = {token for token, freq in mycounter.items() if freq > cutoff}
    return hapax, stopwords
def removestuff(inpath, outpath):
    """Write a copy of a corpus with hapax legomena and stop words removed.

    The token statistics come from ``makeCounter`` and ``makeRemoveLists``.
    Each input line becomes one output line holding the remaining words
    joined by single spaces. Lines are separated by a newline and the output
    has no trailing newline. Progress is printed for each line.

    Args:
        inpath: Path of the UTF-8 corpus file to read.
        outpath: Path of the UTF-8 file to write; it is overwritten.

    Returns:
        None.
    """
    mycounter = makeCounter(inpath)
    hapax, stopwords = makeRemoveLists(mycounter)
    # The former test ``word not in hapax or stopwords`` parsed as
    # ``(word not in hapax) or stopwords`` and so kept every word whenever any
    # stop word existed. Test membership in the union of both sets instead.
    unwanted = hapax | stopwords
    with open(inpath, 'r', encoding='utf-8') as src, \
            open(outpath, 'w', encoding='utf-8') as dst:
        for i, line in enumerate(src):
            print("working on ... ", i)
            if i:
                # Write the separator before every line but the first. This
                # avoids a trailing newline without counting the lines first;
                # the old ``len - 2`` count dropped the final line.
                dst.write("\n")
            dst.write(' '.join(word for word in line.split() if word not in unwanted))
    print("\nFinished\n")
#removestuff(path, "mycorpusremoved.txt")