-
Notifications
You must be signed in to change notification settings - Fork 1
/
cwa_processing_people.py
210 lines (170 loc) · 7.65 KB
/
cwa_processing_people.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
import re
import json
def extract_adjectives_in_conditional_sentence(input_string):
    """Split a rule condition into its words, dropping the conjunction "and".

    Args:
        input_string: The condition fragment of a rule, e.g.
            ``"little and short"``.

    Returns:
        The whitespace-separated words of ``input_string`` in their original
        order with every standalone ``"and"`` removed. All other words
        (including ``"not"``) are returned untouched.
    """
    # list.remove() deletes only the first match, so a condition such as
    # "a and b and c" would leak an "and" into the result; filter them all.
    return [word for word in input_string.split() if word != "and"]
def _without_not_unique(words):
    """Return ``words`` minus "not", duplicates dropped, first-seen order kept."""
    # dict.fromkeys de-duplicates while preserving insertion order, unlike a
    # set, whose iteration order for strings changes from run to run.
    return [word for word in dict.fromkeys(words) if word != "not"]


def extract_words_from_sentences(sentences_str):
    """Collect the condition words and conclusion phrases of the rules in a text.

    The text is split on ``"."`` and every piece is checked (by plain
    substring tests) against three rule shapes:

    * ``If ... is <condition> then ... are <conclusion>``
    * ``All <condition> people are <conclusion>``
    * ``<condition> people are <conclusion>`` (condition is lower-cased)

    Pieces that match none of the shapes are ignored.

    Args:
        sentences_str: The full context string containing facts and rules.

    Returns:
        A ``(first_is_list, second_is_list)`` tuple. ``first_is_list`` holds
        the individual condition words (split by
        ``extract_adjectives_in_conditional_sentence``); ``second_is_list``
        holds each conclusion as a single stripped string. If a list contains
        the entry ``"not"``, that entry is removed and the list is
        de-duplicated in first-occurrence order; otherwise the list is
        returned exactly as collected, duplicates included.
    """
    first_is_list = []
    second_is_list = []

    for sentence in sentences_str.split("."):
        sentence = sentence.strip()
        if "If" in sentence and "is" in sentence and "then" in sentence and "are" in sentence:
            # The condition starts right after the first "is" plus one
            # separator character (hence +3) and runs up to "then".
            condition_start = sentence.find("is") + 3
            condition = sentence[condition_start:sentence.find("then")].strip()
            if condition:
                first_is_list.extend(
                    extract_adjectives_in_conditional_sentence(condition)
                )
            # The conclusion follows the first "are" after the condition
            # start; +4 skips "are" and one separator character.
            are_offset = sentence[condition_start:].find("are")
            if are_offset != -1:
                conclusion = sentence[condition_start + are_offset + 4:].strip()
                if conclusion:
                    second_is_list.append(conclusion)
        elif "people are" in sentence:
            split_at = sentence.find("people are")
            if "All" in sentence:
                # Skip the three leading characters (assumed to be "All").
                condition = sentence[3:split_at].strip()
            else:
                condition = sentence[:split_at].strip().lower()
            if condition:
                first_is_list.extend(
                    extract_adjectives_in_conditional_sentence(condition)
                )
            # len("people are") == 10
            conclusion = sentence[split_at + 10:].strip()
            if conclusion:
                second_is_list.append(conclusion)

    # Only lists that actually contain "not" are de-duplicated, so lists
    # without it keep their duplicates as before. The result order is now
    # deterministic instead of depending on set iteration order.
    if "not" in first_is_list:
        first_is_list = _without_not_unique(first_is_list)
    if "not" in second_is_list:
        second_is_list = _without_not_unique(second_is_list)

    return first_is_list, second_is_list
def find_unique_elements(list1, list2):
    """Return the items of ``list1`` that do not occur in ``list2``.

    Order follows ``list1``; an item that appears several times in ``list1``
    (and not in ``list2``) appears just as often in the result.
    """
    return [item for item in list1 if item not in list2]
def extract_adjectives(words, target_name):
    """Collect adjectives that follow ``<target_name> <verb>`` in ``words``.

    The tokens are POS-tagged with ``pos_tag`` and scanned left to right:
    after a token tagged ``NNP`` that equals ``target_name`` and then a token
    whose tag starts with ``VB``, every token whose tag starts with ``JJ`` is
    collected. Any other token resets the scan.

    Args:
        words: List of word tokens of one sentence.
        target_name: The single-token name to look for.

    Returns:
        The list of collected adjective tokens, in sentence order.
    """
    IDLE, SAW_NAME, SAW_VERB = range(3)
    stage = IDLE
    collected = []

    for token, pos in pos_tag(words):
        if pos == 'NNP' and token == target_name:
            # Seeing the name again never moves the scan backwards.
            if stage == IDLE:
                stage = SAW_NAME
        elif stage == SAW_NAME and pos.startswith('VB'):
            stage = SAW_VERB
        elif stage == SAW_VERB and pos.startswith('JJ'):
            collected.append(token)
        else:
            stage = IDLE

    return collected
def extract_names(words):
    """Return the proper-noun names found in ``words``.

    The tokens are POS-tagged with ``pos_tag``; each maximal run of
    consecutive ``NNP`` tokens is joined with single spaces into one name.

    Args:
        words: List of word tokens of one sentence.

    Returns:
        List of name strings in order of appearance (empty if none).
    """
    found = []
    pending = []  # tokens of the NNP run currently being read

    for token, pos in pos_tag(words):
        if pos == 'NNP':
            pending.append(token)
            continue
        if pending:
            # A non-NNP token closes the current run.
            found.append(" ".join(pending))
            pending = []

    # Flush a run that reaches the end of the sentence.
    if pending:
        found.append(" ".join(pending))
    return found
def is_subject_verb_adjective(sentence):
    """Check whether a tagged sentence starts with optional NNP tags and a VBZ.

    Args:
        sentence: Iterable of ``(word, tag)`` pairs.

    Returns:
        The ``re.Match`` object if the space-joined tag sequence begins with
        zero or more ``NNP`` tags followed by a ``VBZ`` tag and a whitespace
        character (so at least one more tag must follow), otherwise ``None``.
        Despite the function name, the pattern does not test for an adjective.
    """
    tags = [pos for _token, pos in sentence]
    matcher = re.compile(r"^(NNP\s)*(VBZ\s)")
    return matcher.match(" ".join(tags))
def extract_names_and_attributes(text):
    """Map each named subject in ``text`` to the adjectives stated about it.

    The text is split into sentences with ``sent_tokenize``. Sentences whose
    POS tags satisfy ``is_subject_verb_adjective`` contribute: the name comes
    from ``extract_names`` and the adjectives from ``extract_adjectives``.

    Args:
        text: The context string (facts and rules).

    Returns:
        A dict mapping name -> list of adjectives, accumulated over all
        matching sentences. A name is present even if no adjective was found
        for it.
    """
    dic = {}
    for sentence in sent_tokenize(text):
        # Tokenize once and reuse the tokens for both the shape test and the
        # extraction (previously every sentence was tokenized twice).
        words = word_tokenize(sentence)
        if not is_subject_verb_adjective(pos_tag(words)):
            continue

        # NOTE(review): when extract_names returns several names they are
        # glued together without a separator; presumably each matching
        # sentence has exactly one name -- confirm against the data.
        name = "".join(extract_names(words))
        if not name:
            continue

        dic.setdefault(name, []).extend(extract_adjectives(words, name))
    return dic
def create_output_string(dct, word_list):
    """Build "<name> is not <word>." sentences for every missing attribute.

    Args:
        dct: Mapping of name -> collection of attributes the name has.
        word_list: Candidate attribute words to test for each name.

    Returns:
        One string containing, for each name (in mapping order) and each word
        of ``word_list`` not among that name's attributes, the sentence
        ``"<name> is not <word>."``; sentences are separated by single spaces
        and the result carries no leading or trailing whitespace.
    """
    negations = [
        f"{name} is not {word}."
        for name, attributes in dct.items()
        for word in word_list
        if word not in attributes
    ]
    return " ".join(negations).strip()
def preprocessing(text):
    """Prepend closed-world "<name> is not <word>." sentences to ``text``.

    The candidate words are the rule-condition words that never appear as a
    rule conclusion (``find_unique_elements`` over the two lists returned by
    ``extract_words_from_sentences``). For every name found by
    ``extract_names_and_attributes`` and every candidate word that is not
    among that name's attributes, a negated sentence is generated.

    Args:
        text: The original context string.

    Returns:
        The generated sentences (each preceded by one space), followed by a
        single space and the unchanged ``text``.
    """
    first_list, second_list = extract_words_from_sentences(text)

    # Both values depend only on ``text``; compute them once instead of
    # re-running the tokenizer and POS tagger for every (name, word) pair.
    attributes_by_name = extract_names_and_attributes(text)
    candidate_words = find_unique_elements(first_list, second_list)

    negations = [
        f" {name} is not {word}."
        for name, attributes in attributes_by_name.items()
        for word in candidate_words
        if word not in attributes
    ]
    return "".join(negations) + " " + text
def main():
    """Rewrite the ``context`` of every entry in the People sample files.

    Each listed JSON file is loaded, every entry's ``context`` is replaced by
    ``preprocessing(context)``, and the file is overwritten in place.

    NOTE(review): because files are overwritten, running the script again
    feeds already-processed contexts through ``preprocessing`` a second time.
    """
    # List of JSON file names to process
    json_files = [
        "PARARULE_plus_step2_People_sample.json",
        "PARARULE_plus_step3_People_sample.json",
        "PARARULE_plus_step4_People_sample.json",
        "PARARULE_plus_step5_People_sample.json"
    ]

    for file_name in json_files:
        # Read as UTF-8 explicitly rather than relying on the platform's
        # default text encoding.
        with open(file_name, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Replace each entry's context with its preprocessed version.
        for entry in data:
            entry['context'] = preprocessing(entry['context'])

        # Write the updated data back to the same JSON file.
        with open(file_name, 'w', encoding='utf-8') as file:
            json.dump(data, file, indent=4)


if __name__ == "__main__":
    main()