-
Notifications
You must be signed in to change notification settings - Fork 0
/
noun_chunker_generator.py
executable file
·173 lines (145 loc) · 6.77 KB
/
noun_chunker_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
'''
Author: Leizhen Shi
Project Name: Noun Chunker Generator
The purpose of this project is to generate .tchunk and .pos files for the distributional ranking of the Termolator
The input should be a cleaned file processed by the Brandeis Chinese Segmenter
'''
import argparse
import os
def load_directory(directory_name):
    """Return the names of the entries found in *directory_name*.

    The names are returned exactly as ``os.listdir`` reports them (bare
    names, not joined with the directory path).
    """
    entries = os.listdir(directory_name)
    return entries
def process_data(filename):
    """Parse a tagged text file into a flat list of ``[word, tag]`` pairs.

    Each line is split on single spaces into tokens of the form
    ``word_TAG`` (for example ``本_DT``).  Tokens that contain no
    underscore are skipped.  When a token contains more than one
    underscore, only the first two pieces are kept.

    Args:
        filename: path of the UTF-8 encoded input file.

    Returns:
        A list of two-element lists ``[word, tag]`` in file order.
    """
    result = []
    # The context manager guarantees the handle is released even if
    # decoding or parsing fails part-way through the file.
    with open(filename, 'r', encoding='UTF-8') as source:
        for line in source:
            for token in line.strip().split(' '):
                pieces = token.split('_')  # "本_DT" -> ["本", "DT"]
                if len(pieces) < 2:
                    # Not a word_TAG token (e.g. empty string from a
                    # doubled space or a blank line).
                    continue
                result.append([pieces[0], pieces[1]])
    return result
#If anyone wants to change the rules for detecting the term, this method is a good way to start
def detect_noun(pair_list):
    """Attach a BIO noun-phrase tag to every ``[word, tag]`` pair.

    A word whose POS tag is a noun or an adjective (as decided by
    ``is_noun`` / ``is_adj``) continues the current chunk with ``I-NP``
    when the word right before it is already inside a chunk; otherwise it
    opens a new chunk with ``B-NP``.  Every other word is tagged ``O``.

    Args:
        pair_list: sequence of ``[word, tag]`` items.

    Returns:
        A new list of ``[word, tag, BIO_tag]`` lists, one per input item.
    """
    tagged = []
    previous_bio = None  # BIO tag given to the preceding word, if any
    for entry in pair_list:
        word, pos = entry[0], entry[1]
        if is_noun(pos) or is_adj(pos):
            if previous_bio is not None and is_inword(previous_bio):
                bio = 'I-NP'
            else:
                bio = 'B-NP'
        else:
            bio = 'O'
        tagged.append([word, pos, bio])
        previous_bio = bio
    return tagged
def print_tchunk(filename, tagged_words):
    """Write the tagged words to *filename* as a ``.tchunk`` file.

    One line is written per word, with four fields separated by two tab
    characters: the word, the word again, its POS tag and its BIO tag.

    Args:
        filename: path of the output file (overwritten, UTF-8).
        tagged_words: iterable of ``[word, tag, BIO_tag]`` items.
    """
    # ``with`` closes the file even if one of the writes raises.
    with open(filename, 'w', encoding="UTF-8") as out:
        for tagged_word in tagged_words:  # follow the CONLL format
            fields = (tagged_word[0], tagged_word[0], tagged_word[1], tagged_word[2])
            out.write("\t\t".join(fields) + "\n")
def print_pos(filename, tagged_words):
    """Write the tagged words to *filename* as a ``.pos`` file.

    Each line holds the word, its start/end offsets and its POS tag,
    separated by ``|||``.  Offsets start at 1 and advance by the length of
    each word, so the end of one word is the start of the next.

    Args:
        filename: path of the output file (overwritten, UTF-8).
        tagged_words: iterable of items whose first element is the word
            and whose second element is the POS tag.
    """
    start_point = 1
    # ``with`` closes the file even if one of the writes raises.
    with open(filename, 'w', encoding="UTF-8") as out:
        for tagged_word in tagged_words:
            end_point = start_point + len(tagged_word[0])
            # NOTE(review): there is no separator between the start offset
            # and "E:"; kept as-is because consumers of this file are not
            # visible here -- confirm the expected format.
            out.write("{}\t|||\tS: {}E: {}\t|||\t{}\n".format(
                tagged_word[0], start_point, end_point, tagged_word[1]))
            start_point = end_point
def is_inword(BIOtag):
    """Return True when *BIOtag* marks a word inside a chunk (B-NP or I-NP)."""
    return BIOtag in {'B-NP', 'I-NP'}
def is_adj(tag):
    """Return True when *tag* is one of the adjective tags JJ, JJS or JJR."""
    return tag in {"JJ", "JJS", "JJR"}
def is_noun(tag):
    """Return True when *tag* is one of the noun tags NN or NP."""
    return tag in {"NN", "NP"}
# optional filter of adding dictionary
def dict_filter(tagged_nouns, dict_path="./chinese1.txt"):
    """Drop chunks whose full text appears in a dictionary of common words.

    For every chunk that starts with ``B-NP``, the words of the chunk are
    concatenated (up to, but not including, the next word tagged ``O``).
    If the concatenation is found in the dictionary, every word of the
    chunk is re-tagged ``O``.  The list is modified in place.

    Args:
        tagged_nouns: list of mutable ``[word, POS_tag, BIO_tag]`` items.
        dict_path: path of the UTF-8 dictionary file, one entry per line.
            Defaults to the file the original code always used.

    Returns:
        None.
    """
    # ``with`` makes sure the dictionary handle is closed after loading.
    with open(dict_path, 'r', encoding='utf-8') as dict_file:
        hownet_dict = {line.strip() for line in dict_file}

    total = len(tagged_nouns)
    for i in range(total):
        # Read the tag live: earlier iterations may already have reset it.
        if tagged_nouns[i][2] != 'B-NP':
            continue
        # Collect the indices of the chunk that begins at position i.
        end = i + 1
        while end < total and tagged_nouns[end][2] != "O":
            end += 1
        locations = range(i, end)
        to_detect = "".join(tagged_nouns[k][0] for k in locations)
        # A chunk found in the dictionary is a common word, not a term.
        if to_detect in hownet_dict:
            for k in locations:
                tagged_nouns[k][2] = "O"
def str_to_bool(s):
    """Convert the exact strings 'True' / 'False' to the matching bool.

    Raises:
        ValueError: if *s* is anything other than 'True' or 'False'.
    """
    if s == 'True':
        return True
    if s != 'False':
        raise ValueError
    return False
def _write_corpus_outputs(input_dir, file_names, output_dir_name, list_name, dict_on):
    """Generate .tchunk/.pos files for one corpus and record the .tchunk paths.

    Args:
        input_dir: directory holding the input files.
        file_names: names of the entries in *input_dir* to process.
        output_dir_name: name of the output directory under the current
            working directory.
        list_name: file that receives one .tchunk path per line.
        dict_on: when true, run ``dict_filter`` on the tagged words.
    """
    # Mode 'w' discards any list left over from a previous run; the list
    # file is created even when there is nothing to process.
    with open(list_name, 'w') as list_file:
        for file_name in file_names:
            out_tchunk_file = "./" + output_dir_name + "/" + file_name + ".tchunk"
            out_pos_file = "./" + output_dir_name + "/" + file_name + ".pos"
            processed_data = process_data(os.path.join(input_dir, file_name))
            tagged_nouns = detect_noun(processed_data)
            if dict_on:
                dict_filter(tagged_nouns)
            print_tchunk(out_tchunk_file, tagged_nouns)
            print_pos(out_pos_file, tagged_nouns)
            list_file.write(out_tchunk_file + '\n')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Outputting .tchunk and .pos files. A list of the tchunk names will be provided as well.")
    parser.add_argument('-f', '--foreground', nargs = 1, help = "Please enter the input foreground directory", required = True)
    parser.add_argument('-b', '--background', nargs = 1, help = "Please enter the input background directory", required = True)
    # With nargs=1 the parsed value is a one-element list, so the default
    # must have the same shape; a bare False made ``args.dict_filter[0]``
    # raise TypeError whenever -d was omitted.
    parser.add_argument('-d', '--dict_filter', nargs=1, default=['False'], help="Please enter True or False for turning dictionary on or off", required=False)
    args = parser.parse_args()

    foreground_dir = args.foreground[0]
    background_dir = args.background[0]
    foreground_files = load_directory(foreground_dir)
    background_files = load_directory(background_dir)
    dict_on = str_to_bool(args.dict_filter[0])

    print("The program is runing and analyzing model.")
    # Output directories live under the current working directory.
    for output_dir_name in ("output_foreground", "output_background"):
        os.makedirs(os.path.join(os.getcwd(), output_dir_name), exist_ok=True)

    # output files from foreground
    print("Writing into foreground.")
    _write_corpus_outputs(foreground_dir, foreground_files,
                          "output_foreground", "foreground_tchunk_list", dict_on)

    # output files from background
    print("Writing into background.")
    _write_corpus_outputs(background_dir, background_files,
                          "output_background", "background_tchunk_list", dict_on)
    print("Finished")