-
Notifications
You must be signed in to change notification settings - Fork 7
/
5A_count_words.py
69 lines (58 loc) · 1.48 KB
/
5A_count_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""
Create a dictionary of all words occurring in the text samples, including their counts. This will be used to generate a text that includes the relevant kerning pairs.
"""
import os
import glob
import timeit
from collections import defaultdict, OrderedDict
import json
from tqdm import tqdm
# Language codes to process, kept in alphabetical order. Each code is used
# as a sub-directory name under "text/" (input) and "count/by_language/"
# (output).
# NOTE(review): both "se" and "sv" are listed; ISO 639-1 "se" is Northern
# Sami while "sv" is Swedish -- confirm that both are intended.
LANGUAGES = sorted(
    [
        "cs",
        "de",
        "en",
        "es",
        "et",
        "fi",
        "fr",
        "hu",
        "it",
        "nl",
        "no",
        "pl",
        "pt",
        "se",
        "sv",
        "da",
        "hr",
        "sl",
        "lt",
        "tr",
        "lv",
        "ro",
        "sk",
        "sq",
    ]
)
# For every language: count how often each whitespace-separated token occurs
# across text/<language>/*.txt and write the counts, most frequent first, to
# count/by_language/<language>/words.json.
start = timeit.default_timer()

for LANGUAGE in LANGUAGES:
    # Parse text and count words.
    # Unseen words default to 0 so the first occurrence is recorded as 1
    # (a default of 1 would inflate every count by one).
    dictionary = defaultdict(int)
    for path in tqdm(glob.glob("text/" + LANGUAGE + "/*.txt")):
        # Explicit encoding so decoding does not depend on the platform
        # locale. NOTE(review): assumes the corpus is UTF-8 -- confirm.
        with open(path, "r", encoding="utf-8") as text_file:
            # split() without arguments splits on any whitespace run,
            # newlines included, so no separate newline handling is needed.
            words = text_file.read().split()
        for word in words:
            dictionary[word] += 1
    print(LANGUAGE, len(dictionary), 'uncleaned "words" collected')

    # Sort by descending count; the sort is stable, so words with equal
    # counts keep the order in which they were first seen.
    sorted_dictionary = dict(
        sorted(dictionary.items(), key=lambda item: item[1], reverse=True)
    )

    # Write out. Create the target directory on demand so a fresh checkout
    # does not fail with FileNotFoundError.
    directory = "count/by_language/" + LANGUAGE + "/"
    os.makedirs(directory, exist_ok=True)
    # The context manager closes the file even if serialization fails.
    with open(directory + "words.json", "w", encoding="utf-8") as out_file:
        out_file.write(json.dumps(sorted_dictionary, indent=4, sort_keys=False))

stop = timeit.default_timer()
print("Execution Time: ", stop - start)