-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert_to_json_format_and_split_to_train_dev.py
67 lines (54 loc) · 2.05 KB
/
convert_to_json_format_and_split_to_train_dev.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# -*- coding: utf-8 -*-
import spacy
import codecs
import json
import random

# Convert three parallel line-oriented JSON files (one JSON object per line,
# line i of each file describing sentence i) into spaCy's training-JSON layout
# and write a shuffled 70/30 train/dev split, one JSON record per line.
#
# Expected inputs:
#   sentences.json       -> {"sentence": "<raw text>"}
#   tags.json            -> {"tags": "<space-separated POS tags>"}
#   entities_biluo.json  -> {"entities": <per-token BILUO labels, '' if none>}
with codecs.open('sentences.json', 'r', encoding='utf-8', errors='ignore') as sentences, \
        codecs.open('tags.json', 'r', encoding='utf-8', errors='ignore') as tags, \
        codecs.open('entities_biluo.json', 'r', encoding='utf-8', errors='ignore') as entities_biluo, \
        codecs.open('train_data.json', 'w', encoding='utf-8', errors='ignore') as train_file, \
        codecs.open('dev_data.json', 'w', encoding='utf-8', errors='ignore') as dev_file:
    DATA = []
    # Blank pipeline: only the Greek tokenizer is needed, no trained components.
    nlp = spacy.blank('el')
    dict_id = 0
    # Iterate the three files in lockstep; zip stops cleanly at EOF of the
    # shortest file, replacing the original while(1)/readline()/ValueError-at-EOF
    # idiom. The narrow try below still stops on a malformed/blank line, as the
    # original did.
    for sent_line, tag_line, ent_line in zip(sentences, tags, entities_biluo):
        print(dict_id)
        try:
            text = json.loads(sent_line)["sentence"]
            tags_list = json.loads(tag_line)["tags"].split(' ')
            entities = json.loads(ent_line)["entities"]
        except ValueError:
            break  # malformed JSON line: stop processing, matching old behavior
        doc = nlp(text)
        if len(tags_list) != len(doc):
            continue  # skip: tags don't line up with spaCy's tokenization
        if entities == '':
            continue  # skip: no entity annotations for this sentence
        # Guard missing in the original: entities must align with the tokens
        # too, otherwise entities[index] below raises IndexError (or silently
        # mis-aligns when entities is longer than the tokenization).
        if len(entities) != len(doc):
            continue  # skip: entity labels don't line up with the tokens
        tokens_list = [{
            "tag": tags_list[index],
            "ner": entities[index],
            "id": index,
            "orth": doc[index].text,
        } for index in range(len(doc))]
        DATA.append([{
            "id": dict_id,
            "paragraphs": [{
                "raw": text,
                "sentences": [{
                    "tokens": tokens_list
                }]
            }]
        }])
        dict_id += 1
    # Shuffle, then split 70% train / 30% dev (dict_id == len(DATA) here).
    random.shuffle(DATA)
    split = int(dict_id * 0.7)
    TRAIN_DATA = DATA[:split]
    DEV_DATA = DATA[split:]
    for record in TRAIN_DATA:
        train_file.write(json.dumps(record, ensure_ascii=False))
        train_file.write('\n')
    for record in DEV_DATA:
        dev_file.write(json.dumps(record, ensure_ascii=False))
        dev_file.write('\n')