-
Notifications
You must be signed in to change notification settings - Fork 0
/
labels.py
62 lines (58 loc) · 1.43 KB
/
labels.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
from collections import defaultdict
# set up look-up dictionaries for the model
# label2id dictionary from https://huggingface.co/datasets/Babelscape/multinerd
label2id = {
"O": 0,
"B-PER": 1,
"I-PER": 2,
"B-ORG": 3,
"I-ORG": 4,
"B-LOC": 5,
"I-LOC": 6,
"B-ANIM": 7,
"I-ANIM": 8,
"B-BIO": 9,
"I-BIO": 10,
"B-CEL": 11,
"I-CEL": 12,
"B-DIS": 13,
"I-DIS": 14,
"B-EVE": 15,
"I-EVE": 16,
"B-FOOD": 17,
"I-FOOD": 18,
"B-INST": 19,
"I-INST": 20,
"B-MEDIA": 21,
"I-MEDIA": 22,
"B-MYTH": 23,
"I-MYTH": 24,
"B-PLANT": 25,
"I-PLANT": 26,
"B-TIME": 27,
"I-TIME": 28,
"B-VEHI": 29,
"I-VEHI": 30,
}
id2label = {v:k for k,v in label2id.items()}
label_list = list(label2id.keys())
# system B: only 5 categories
label_list_B = ['O',
'B-PER',
'I-PER',
'B-ORG',
'I-ORG',
'B-LOC',
'I-LOC',
'B-ANIM',
'I-ANIM',
'B-DIS',
'I-DIS'
]
# the same label-id correspondence cannot be kept because the training process only allows label ids of range(0:number of classifications)
label2id_B = {l:i for i,l in enumerate(label_list_B)}
id2label_B = {v:k for k,v in label2id_B.items()}
# map the tokenized dataset to the simpler set of labels
# map system A ids to system B ids, set the rest to 0
A2B_mapping = defaultdict(lambda:0, {label2id[label]:label2id_B[label] for label in label_list_B})
A2B_mapping[-100] = -100 # for special tokens and trailing subtokens of NER entities