-
Notifications
You must be signed in to change notification settings - Fork 7
/
3D_cleanup_text.py
73 lines (60 loc) · 1.19 KB
/
3D_cleanup_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
"""
Clean up all texts in place.
Removes short lines (and lines containing "==") from every text/<language>/*.txt file, then deletes the .txt files left empty.
This could be improved to remove more redundant parts.
"""
# https://askubuntu.com/a/726242
import os
import glob
from tqdm import tqdm
# Codes of the corpora to clean. Each entry names a sub-directory of "text/"
# whose *.txt files are processed, in the order listed here.
# NOTE(review): "cz" and "se" look like country codes rather than language
# codes; presumably they mirror existing directory names -- confirm.
LANGUAGES = [
    "cs", "cz", "de", "en", "es", "et", "fi", "fr",
    "hu", "it", "nl", "no", "pl", "pt", "se", "sv",
    "da", "hr", "sl", "lt", "tr", "lv", "ro", "sk", "sq",
]
def filter_text(line, *, min_length=80):
    """Decide whether a line of text should be kept.

    A line is rejected when it contains ``"=="`` or when it has fewer than
    ``min_length`` characters. The length is measured on the string exactly
    as passed in, so a trailing newline counts toward it.

    Args:
        line: The line to test.
        min_length: Minimum number of characters a line needs in order to be
            kept. Defaults to 80.

    Returns:
        True if the line should be kept, False if it should be dropped.
    """
    # NOTE(review): "==" presumably marks wiki-style section headings -- confirm.
    return "==" not in line and len(line) >= min_length
# Counters reported once every language directory has been processed.
checked = 0
removed = 0

# https://codereview.stackexchange.com/a/145128
for LANGUAGE in LANGUAGES:
    for path in tqdm(glob.glob("text/" + LANGUAGE + "/*.txt")):
        # Read the whole file before reopening it for writing: mode "w"
        # truncates the file as soon as it is opened.
        # The encoding is explicit so the result does not depend on the
        # platform's locale default.
        # NOTE(review): assumes the corpus is UTF-8 -- confirm.
        with open(path, "r", encoding="utf-8") as file:
            lines = file.readlines()

        kept = [line for line in lines if filter_text(line)]
        with open(path, "w", encoding="utf-8") as file:
            file.writelines(kept)
        checked += 1

        # With no surviving lines the file is now empty on disk; delete it.
        if not kept:
            try:
                os.remove(path)
            except OSError:
                print("Error", path)
            else:
                removed += 1

print("Checked", checked)
print("Removed Total", removed)