forked from thompsonb/vecalign
-
Notifications
You must be signed in to change notification settings - Fork 0
/
normalization_by_lines_spaces.py
35 lines (27 loc) · 1.17 KB
/
normalization_by_lines_spaces.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import re
import argparse
def norm_spaces(output_file, input_files):
    """Normalize spacing in text files and write the result to one file.

    Each input file is read as UTF-8 (a leading byte-order mark is ignored).
    Lines that are empty or contain only whitespace are dropped. Every
    remaining line is processed in two steps:

    1. A single space is inserted after each run of "word-like" characters
       (letters, digits, underscore, ``/``, ``'``, ``+``, ``$``, ``-`` and
       whitespace) and after each run of any other characters (punctuation
       and symbols), so punctuation ends up separated from adjacent words.
    2. Every run of whitespace is collapsed to one space.

    For example ``"Hello,  world!"`` becomes ``"Hello , world ! "``. Note
    that each output line keeps one trailing space before its newline, and
    leading whitespace is collapsed to a single space rather than removed.

    Args:
        output_file: Path of the file to write. It is written as UTF-8 with
            a byte-order mark (``utf-8-sig``).
        input_files: Iterable of paths of the text files to read, processed
            in the given order.
    """
    # Raw strings keep the backslashes for the regex engine and avoid
    # Python's "invalid escape sequence" warnings; compiling once hoists the
    # pattern lookups out of the per-line loop.
    blank_line = re.compile(r"^\s*$")
    token_run = re.compile(r"([\w/'+$\s-]+|[^\w/'+$\s-]+)\s*")
    whitespace_run = re.compile(r"\s+")

    normalized = []
    for path in input_files:
        # The context manager closes each input as soon as it has been read.
        with open(path, 'r', encoding='utf-8-sig') as fin:
            for line in fin:
                # Drop empty lines and whitespace-only lines.
                if blank_line.match(line):
                    continue
                spaced = token_run.sub(r"\1 ", line.rstrip())
                normalized.append(whitespace_run.sub(" ", spaced) + "\n")

    # All inputs are fully read before the output is opened, so the output
    # path may safely be the same as one of the input paths.
    with open(output_file, "w", encoding='utf-8-sig') as fout:
        fout.writelines(normalized)
def _main():
    """Parse the command-line arguments and run ``norm_spaces``.

    Options:
        -i / --inputs: one or more input text files (required).
        -o / --output: path of the normalized output file (required).
    """
    # The text is passed as ``description``: the first positional parameter
    # of ArgumentParser is ``prog``, which would print this sentence as the
    # program name in the usage line.
    parser = argparse.ArgumentParser(
        description='Create normalized text file for further processing.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Both options are required: without them norm_spaces would receive None
    # and fail with a TypeError instead of a clear usage message.
    parser.add_argument('-i', '--inputs', type=str, nargs='+', required=True,
                        help='input text file(s).')
    parser.add_argument('-o', '--output', type=str, required=True,
                        help='output normalized text file')
    args = parser.parse_args()
    norm_spaces(output_file=args.output,
                input_files=args.inputs)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    _main()