-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathbuild_graph.py
94 lines (85 loc) · 4.23 KB
/
build_graph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import pandas as pd
from tqdm import tqdm
def map_rel(s):
d_rel_map = {"has_atom": "包含子结构", "is_atom_of": "是其子结构",
"has_part": "包含偏旁", "is_part_of": "是其偏旁",
"has_pronunciation": "拼音", "is_pronunciation_of": "是其拼音",
"has_number": "笔画数", "is_number_of": "是其笔画数",
"has_simple": "简体形式", "is_simple_to": "是其简体形式",
"has_traditional": "繁体形式", "is_traditional_to": "是其繁体形式",
"similar_to": "近似字"}
return d_rel_map.get(s, s)
def generate_entity_and_relation_tables(load_path):
print("Check entities and relations ...")
dict_entity = dict()
lst_relation = list()
df_triple = pd.read_excel(load_path)
for i in tqdm(range(df_triple.shape[0])):
head = df_triple.loc[i, "head"]
relation = df_triple.loc[i, "relation"]
tail = df_triple.loc[i, "tail"]
# check type
if relation == "has_number":
head_type = "charactor"
tail_type = "number"
elif relation == "is_number_of":
head_type = "number"
tail_type = "charactor"
elif relation == "has_pronunciation":
head_type = "charactor"
tail_type = "pronunciation"
elif relation == "is_pronunciation_of":
head_type = "pronunciation"
tail_type = "charactor"
else:
head_type = "charactor"
tail_type = "charactor"
# check id
if head not in dict_entity.keys():
head_id = len(dict_entity) + 1
dict_entity[head] = {":ID(node)": head_id, ":LABEL": head_type, "name": head}
else:
if head_type != dict_entity[head][":LABEL"]:
head_id = len(dict_entity) + 1
dict_entity[head] = {":ID(node)": head_id, ":LABEL": head_type, "name": head}
else:
head_id = dict_entity[head][":ID(node)"]
if tail not in dict_entity.keys():
tail_id = len(dict_entity) + 1
dict_entity[tail] = {":ID(node)": tail_id, ":LABEL": tail_type, "name": tail}
else:
if tail_type != dict_entity[tail][":LABEL"]:
tail_id = len(dict_entity) + 1
dict_entity[tail] = {":ID(node)": tail_id, ":LABEL": tail_type, "name": tail}
else:
tail_id = dict_entity[tail][":ID(node)"]
# add into entity & relation lists
lst_relation.append({":START_ID(node)": head_id, ":END_ID(node)": tail_id,
":TYPE": map_rel(relation), "name": map_rel(relation)})
# create entity table
print("Create Entity Table ...")
lst_e_id, lst_e_name, lst_e_label = list(), list(), list()
for _, e in dict_entity.items():
if e.get(":ID(node)", None) and e.get("name", None) and e.get(":LABEL", None):
lst_e_id.append(e.get(":ID(node)", None))
lst_e_name.append(e.get("name", None))
lst_e_label.append(e.get(":LABEL", None))
df_entity = pd.DataFrame({":ID(node)": lst_e_id, "name": lst_e_name, ":LABEL": lst_e_label})
df_entity = df_entity[[":ID(node)", "name", ":LABEL"]]
df_entity.to_csv("hanzi_entity.csv", index=False, encoding="utf-8-sig")
# create relation table
print("Create Relation Table ...")
lst_r_head, lst_r_tail, lst_r_type, lst_r_name = list(), list(), list(), list()
for r in lst_relation:
if r.get(":START_ID(node)", None) and r.get(":END_ID(node)", None) and r.get(":TYPE", None) and r.get("name", None):
lst_r_head.append(r.get(":START_ID(node)", None))
lst_r_tail.append(r.get(":END_ID(node)", None))
lst_r_type.append(r.get(":TYPE", None))
lst_r_name.append(r.get("name", None))
df_relation = pd.DataFrame({":START_ID(node)": lst_r_head, ":END_ID(node)": lst_r_tail, ":TYPE": lst_r_type, "name":lst_r_name})
df_relation = df_relation[[":START_ID(node)", ":END_ID(node)", ":TYPE", "name"]]
df_relation.to_csv("hanzi_relation.csv", index=False, encoding="utf-8-sig")
print("Finished.")
return None
if __name__ == "__main__":
generate_entity_and_relation_tables("corpus/basic_triple.xlsx")