-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparsing_entities.py
67 lines (54 loc) · 2.54 KB
/
parsing_entities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# -*- coding: utf-8 -*-
import codecs
import json
# Long class names used in the entity list -> short labels in spaCy's format.
SPACY_LABELS = (
    ('ORGANIZATION', 'ORG'),
    ('LOCATION', 'LOC'),
    ('FACILITY', 'FAC'),
)


def to_spacy_label(label):
    """Return *label* with the long class names replaced by spaCy's labels."""
    for long_name, short_name in SPACY_LABELS:
        label = label.replace(long_name, short_name)
    return label


def parse_entity_lines(lines):
    """Parse entity-list lines into parallel value / class lists.

    Each line is expected to hold an entity value followed by its class,
    separated by whitespace. The class is taken to be the LAST
    whitespace-delimited token, so entity values may themselves contain
    spaces (e.g. "Δήμος Θεσσαλονίκης LOCATION").
    NOTE(review): assumes the class is always a single token -- confirm
    against the real makedonia_list.txt.

    Blank lines and lines without both fields are skipped, instead of
    blindly dropping the final entry of the list.

    Returns a tuple ``(entity_value, entity_class)`` of two lists of equal
    length; classes are already converted with ``to_spacy_label``.
    """
    entity_value = []
    entity_class = []
    for line in lines:
        fields = line.rsplit(None, 1)
        if len(fields) != 2:
            continue  # blank or malformed line
        value = fields[0].strip()
        if not value:
            continue
        entity_value.append(value)
        entity_class.append(to_spacy_label(fields[1]))
    return entity_value, entity_class


def load_entities(path):
    """Read the entity list file at *path* (see ``parse_entity_lines``)."""
    with codecs.open(path, 'r', encoding='utf-8', errors='ignore') as file:
        return parse_entity_lines(file)


def find_entity_spans(sentence, entity_value, entity_class):
    """Locate known entities inside *sentence*.

    Entities are tried in list order and every occurrence is considered.
    An occurrence is kept only if its character range does not overlap a
    range that was already accepted, so a token that belongs to one entity
    is not annotated again as part of another one
    (example: "Αριστοτέλειο Πανεπιστήμιο Θεσσαλονίκης" vs. "Δήμος Θεσσαλονίκης").

    Returns a list of ``(begin, end, class)`` tuples, where ``begin`` and
    ``end`` are character offsets into *sentence* (``end`` exclusive).
    """
    records = []
    for entity, label in zip(entity_value, entity_class):
        if not entity:
            continue  # an empty string would "match" at every position
        begin = sentence.find(entity)
        while begin != -1:
            end = begin + len(entity)
            # Half-open ranges [begin, end) and [b, e) share a position
            # exactly when max(begin, b) < min(end, e).
            overlaps = any(max(begin, b) < min(end, e) for b, e, _ in records)
            if not overlaps:
                records.append((begin, end, label))
            # Step by one character so that a later, non-conflicting
            # occurrence is still found after a rejected one.
            begin = sentence.find(entity, begin + 1)
    return records


def build_record(final_record):
    """Return the dict written to entities.json for one sentence.

    The entity list is stored as its string representation; a sentence
    without entities gets an empty string.
    """
    if not final_record:
        return {"entities": ""}
    return {"entities": str(final_record)}


def main():
    """Annotate every sentence of sentences.json and write entities.json.

    sentences.json is read one JSON object per line (key ``sentence``);
    one JSON object per line is written to entities.json, in the same order.
    """
    entity_value, entity_class = load_entities('makedonia_list.txt')
    with codecs.open('entities.json', 'w', encoding='utf-8', errors='ignore') as entities, \
            codecs.open('sentences.json', 'r', encoding='utf-8', errors='ignore') as file:
        for num, line in enumerate(file):
            print(num)  # progress: index of the sentence being processed
            sentence = json.loads(line)['sentence']
            final_record = find_entity_spans(sentence, entity_value, entity_class)
            json.dump(build_record(final_record), entities, ensure_ascii=False)
            entities.write('\n')


if __name__ == '__main__':
    main()