forked from thompsonb/vecalign
-
Notifications
You must be signed in to change notification settings - Fork 0
/
normalization.py
28 lines (24 loc) · 959 Bytes
/
normalization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import re
def norm_lines(output_file, input_file):
    """Copy ``input_file`` to ``output_file``, dropping blank lines.

    A line is dropped when it is empty or consists only of whitespace.
    Every other line is written unchanged and in its original order.

    The input is read completely and closed before the output is opened,
    so both arguments may name the same file.

    Args:
        output_file: Path of the file to write, using the ``utf-8-sig`` codec.
        input_file: Path of the file to read, using the ``utf-8-sig`` codec
            (a leading byte-order mark, if present, is skipped).
    """
    # Raw string: "\s" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    blank_line = re.compile(r"^\s*$")

    with open(input_file, "r", encoding="utf-8-sig") as fin:
        kept = [line for line in fin if not blank_line.match(line)]

    with open(output_file, "w", encoding="utf-8-sig") as fout:
        fout.writelines(kept)
def norm_spaces(output_file, input_file):
    """Normalize spacing in ``input_file`` and write it to ``output_file``.

    Each input line is right-stripped and then split into alternating runs:
    runs made of word characters, whitespace and the characters ``/ ' + $ -``,
    and runs made of any other characters (e.g. punctuation). A single space
    is inserted after every run, and any sequence of whitespace is then
    collapsed to one space. For example ``"Hello,world!"`` becomes
    ``"Hello , world ! "``.

    Every output line except the last is terminated with a newline; the
    last line is right-stripped and written without a trailing newline.
    An empty input file produces an empty output file.

    The input is read completely and closed before the output is opened,
    so both arguments may name the same file.

    Args:
        output_file: Path of the file to write, using the ``utf-8-sig`` codec.
        input_file: Path of the file to read, using the ``utf-8-sig`` codec
            (a leading byte-order mark, if present, is skipped).
    """
    # Compiled once, outside the per-line loop. Raw strings avoid the
    # invalid "\s" escape sequence in a normal string literal.
    token_run = re.compile(r"([\w/'+$\s-]+|[^\w/'+$\s-]+)\s*")
    whitespace_run = re.compile(r"\s+")

    normalized = []
    with open(input_file, "r", encoding="utf-8-sig") as fin:
        for raw_line in fin:
            spaced = token_run.sub(r"\1 ", raw_line.rstrip())
            normalized.append(whitespace_run.sub(" ", spaced) + "\n")

    with open(output_file, "w", encoding="utf-8-sig") as fout:
        # Guard against an empty input: there is no last line to strip.
        if normalized:
            fout.writelines(normalized[:-1])
            fout.write(normalized[-1].rstrip())