mams.py
import os, spacy
from tqdm import tqdm
import xml.etree.ElementTree as ET
from cmn.review import Review

class MAMSReview(Review):
    def __init__(self, id, sentences, time, author, aos):
        super().__init__(id, sentences, time, author, aos)

    @staticmethod
    def load(path):
        return MAMSReview._xmlloader(path)
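
    # Expected input, as inferred from the parsing below: a MAMS ATSA-style XML file whose root
    # contains <sentence> elements, each holding a <text> node and an <aspectTerms> node whose
    # <aspectTerm> children carry term, polarity, from, and to attributes. An illustrative
    # (made-up) snippet:
    #   <sentence>
    #     <text>The waiter was friendly but the pasta was bland.</text>
    #     <aspectTerms>
    #       <aspectTerm term="waiter" polarity="positive" from="4" to="10"/>
    #       <aspectTerm term="pasta" polarity="negative" from="32" to="37"/>
    #     </aspectTerms>
    #   </sentence>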
    @staticmethod
    def _xmlloader(path):
        reviews_list = []
        nlp = spacy.load("en_core_web_sm")
        tree = ET.parse(path)
        sentences = tree.getroot()
        i = -1
        for sentence in sentences:  # each sentence is an individual review, unlike SemEval16
            i += 1
            text = ""
            tokens = []
            aos_list_list = []
            for data in sentence:
                if data.tag == "text":  # separate the aspect tokens from the surrounding punctuation
                    raw_text = data.text
                    current_text = raw_text
                    opinion_text = sentence.findall(".//aspectTerm")
                    # process the aspect terms from right to left so that the spaces inserted around
                    # one term do not shift the character offsets of the terms still to be processed
                    for o in sorted(opinion_text, key=lambda a: int(a.attrib['from']), reverse=True):
                        aspect = o.attrib["term"]
                        aspect_list = aspect.split()
                        if len(aspect_list) == 0:  # contains no aspect (the MAMS dataset has no NULL aspects)
                            continue
                        letter_index_tuple = (int(o.attrib['from']), int(o.attrib['to']))
                        current_text = current_text.replace(u'\xa0', ' ')  # normalize non-breaking spaces
                        # pad the aspect term with spaces so it splits into its own token(s)
                        current_text = current_text[0:letter_index_tuple[0]] + ' ' + aspect + ' ' + current_text[letter_index_tuple[1] + 1:]
                    tokens = current_text.split()
                if data.tag == "aspectTerms":
                    aos_list = []
                    for o in data:  # each o is an aspectTerm
                        sentiment = o.attrib["polarity"].replace('positive', '+1').replace('negative', '-1').replace('neutral', '0')
                        aspect = o.attrib["term"]
                        aspect_list = aspect.split()  # the aspect may consist of more than one word
                        if len(aspect_list) == 0:
                            continue
                        letter_index_tuple = (int(o.attrib['from']), int(o.attrib['to']))
                        # find every whole-word occurrence of the aspect phrase in the raw text;
                        # the boundary checks also keep the indices inside the string
                        text_incidences = [j for j in range(len(raw_text))
                                           if raw_text.startswith(aspect, j)
                                           and (j == 0 or not raw_text[j - 1].isalpha())
                                           and (j + len(aspect) >= len(raw_text) or not raw_text[j + len(aspect)].isalpha())]
                        # which of those occurrences this aspect term refers to, found by its 'from' offset
                        idx_of_from = text_incidences.index(letter_index_tuple[0])
                        # locate the same occurrence at the token level
                        start_token_of_aspect = [j for j in range(len(tokens))
                                                 if j + len(aspect_list) <= len(tokens)
                                                 and tokens[j:j + len(aspect_list)] == aspect_list]
                        idx_start_token_of_aspect = start_token_of_aspect[idx_of_from]
                        idx_aspect_list = list(
                            range(idx_start_token_of_aspect, idx_start_token_of_aspect + len(aspect_list)))
                        # compile the final aos 3-tuple (aspect token indices, opinion terms, sentiment),
                        # e.g. ([2, 3], [], 1) for a two-token positive aspect starting at token 2
                        aos = (idx_aspect_list, [], eval(sentiment))
                        if len(aos) != 0:
                            aos_list.append(aos)
                    if len(aos_list) != 0:
                        aos_list_list.append(aos_list)
            if len(aos_list_list) == 0:  # a sentence with no aspects is not added
                continue
            reviews_list.append(
                Review(id=i, sentences=[[str(t).lower() for t in current_text.split()]], time=None,
                       author=None, aos=aos_list_list, lempos=""))
        return reviews_list
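

# A minimal usage sketch, not part of the original module: the data path below is a
# hypothetical placeholder, and it assumes Review keeps its constructor arguments as
# attributes of the same names (sentences, aos). Run it only if cmn.review and a local
# MAMS ATSA XML file are available.
if __name__ == "__main__":
    sample_path = "path/to/MAMS-ATSA/train.xml"  # hypothetical path; point this at your copy of the dataset
    reviews = MAMSReview.load(sample_path)
    print(f"loaded {len(reviews)} reviews from {sample_path}")
    if reviews:
        first = reviews[0]
        print(first.sentences[0])  # tokenized, lowercased sentence
        print(first.aos)           # per-sentence list of (aspect token indices, opinion terms, sentiment) triples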