-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathremove_xml_chinese.py
executable file
·54 lines (49 loc) · 1.78 KB
/
remove_xml_chinese.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
'''
Author: Yuling Gu
Date: Jul 18, 2018
Description:
Take in a list of filenames for .xml format files as first argument,
and produces a list of output files. The processing involves parsing,
removing xml tags, clearing ampersand characters, and removing
whitespaces. Makes use of termUtilitiesEng.py which is term_utilities.py
in the English Termolator system.
Usage: python name_of_thisFile.py background_filelist.txt outputTag
e.g. python remove_xml_chinese.py backgroundList.txt testing
'''
#!/usr/bin/env python3
import re
import sys
import xml.etree.ElementTree as ET
## from termUtilitiesEng import *
## AM Jul 1, 2020 fix -- to merge systems
from term_utilities import *
# make use of ElementTree module to parse xml
def remove_tags(text):
    """Parse a sequence of XML string fragments into an element tree.

    Despite the name, no tags are stripped here: the fragments in ``text``
    are fed to ``ElementTree`` in order and the parsed root element is
    returned.

    Args:
        text: sequence of strings that together form one XML document.

    Returns:
        The root ``xml.etree.ElementTree.Element`` of the parsed document.
    """
    root = ET.fromstringlist(text, parser=None)
    return root
def main():
    """Strip XML markup from every file named in a file list.

    Reads two command-line arguments: ``sys.argv[1]`` is the path of a text
    file naming one input file per line, and ``sys.argv[2]`` is a tag that
    is inserted into each output file name.

    For each listed file the XML is parsed with ``remove_tags``, every
    direct child of the root element is serialized back to a string, and
    that string is passed through ``remove_xml``,
    ``clean_string_of_ampersand_characters`` and
    ``interior_white_space_trim``.  Non-empty results are written to the
    output file back to back, with no separator between them.

    Raises:
        IndexError: if fewer than two command-line arguments are given.
        xml.etree.ElementTree.ParseError: if an input file is not
            well-formed XML.
    """
    list_path = sys.argv[1]
    # Read the tag up front so a missing argument fails before any output
    # file has been created.
    output_tag = sys.argv[2]

    for entry in get_my_string_list(list_path):
        in_path = entry.replace('\n', '')
        if not in_path.strip():
            # A blank line names no file; skip it instead of creating a
            # stray output file and then failing to open an empty path.
            continue

        # NOTE(review): the output name is built from the raw list entry,
        # i.e. before the newline is removed.  When the entry ends with a
        # newline the [:-4] slice drops "xml" plus the newline and keeps the
        # dot; without a trailing newline it drops ".xml".  Kept unchanged so
        # existing output names stay the same -- confirm which is intended.
        out_path = entry[:-4] + output_tag + ".xml"

        # Parse and clean first, so a malformed input does not leave a
        # truncated or empty output file behind.
        root = remove_tags(get_my_string_list(in_path))
        pieces = []
        # Iterating an Element yields only its direct children.
        for child in root:
            serialized = ET.tostring(child, encoding="UTF-8").decode("UTF-8")
            # remove xml
            cleaned = remove_xml(serialized).strip()
            # clear ampersand
            cleaned = clean_string_of_ampersand_characters(cleaned)
            # clear whitespace
            cleaned = interior_white_space_trim(cleaned)
            if cleaned:
                pieces.append(cleaned)

        # Write explicitly as UTF-8: the text is Chinese, and relying on the
        # platform's default encoding can raise UnicodeEncodeError.
        with open(out_path, "w", encoding="utf-8") as wId:
            wId.writelines(pieces)
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()