forked from bfsujason/bertalign
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgraph_merge.py
155 lines (135 loc) · 7.09 KB
/
graph_merge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import networkx as networkx
import string
def desambiguise(object:list, labels:list) -> list:
"""
On ajoute à chaque noeud le document d'origine, pour pouvoir transformer dans un graphe global ensuite:
((0, 0),
(1, 1),
(2, 2),
(3, 3), etc]
->
[(('0_a',), ('0_b',)),
(('1_a',), ('1_b',)),
etc]
"""
as_unique_nodes = []
for alignment_unit in object:
A, B = alignment_unit
if isinstance(A, int): A = (A,)
if isinstance(B, int): B = (B,)
ranges_A = []
ranges_B = []
for element in A:
ranges_A.append(f"{str(element)}_{labels[0]}")
for element in B:
ranges_B.append(f"{str(element)}_{labels[1]}")
as_unique_nodes.append((tuple(ranges_A), tuple(ranges_B)))
return as_unique_nodes
def deconnect(object:list) -> tuple:
"""
On passe de groupes à des paires de positions
[(('5_a', '6_a', '7_a'), ('5_b', '6_b'))]
-> [ ('5_a', '5_b'), ('5_a', '6_b'), ('6_a', '5_b'), ('6_a', '6_b'), ('7_a', '5_b'), ('7_a', '6_b')]
"""
final_list = []
for alignment_unit in object:
A, B = alignment_unit
for item_A in A:
for item_B in B:
final_list.append((item_A, item_B))
return tuple(final_list)
def merge_alignment_table(alignment_dict:dict) -> list:
"""
Cette fonction va permettre de fusionner les items d'alignement à l'aide d'un passage par le graphe.
Il en ressort une liste de dictionnaires, chaque item consistant en une unité d'alignement.
"""
# On désambiguise les noeuds
G = networkx.petersen_graph()
# On modifie la structure pour avoir des noeuds connectés 2 à 2 et des tuples
possible_witnesses = string.ascii_lowercase[:len(alignment_dict) + 1]
for index, value in alignment_dict.items():
structured_a = deconnect(desambiguise(value, (possible_witnesses[0], possible_witnesses[int(index) + 1])))
# Résultat de la forme: (('0_a', '0_b'), ('1_a', '1_b'), ('2_a', '2_b'), ('3_a', '3_b'), etc.)
G.add_edges_from(structured_a)
connected_nodes = []
# On prend chaque noeud et on en ressort les noeuds connectés
print(alignment_dict)
for node in G:
# https://stackoverflow.com/a/33089602
connected_components = list(networkx.node_connected_component(G, node))
connected_components.sort()
connected_nodes.append(tuple(connected_components))
# On supprime les noeuds redondants
connected_nodes = list(set(connected_nodes))
# Liste de la forme: [('27_a', '27_c', '28_a', '28_c', '29_c', '30_c', '33_b', '34_b'), ('19_c', '20_a', '20_c', '22_b'), etc.]
# Ici c'est un bug: il y a des noeuds qui sont des entiers, je ne sais pas pourquoi. On les supprime.
connected_nodes = [group for group in connected_nodes if not isinstance(group[0], int)]
# On trie les noeuds par position
connected_nodes.sort(key=lambda x: int(x[0].split('_')[0]))
nodes_as_dict = []
for connection in connected_nodes:
wit_dictionnary = {}
for document in possible_witnesses[:len(alignment_dict) + 1]:
pos_list = [node.replace(f'_{document}', '') for node in connection if document in node]
pos_list.sort(key=lambda x:int(x))
wit_dictionnary[document] = pos_list
nodes_as_dict.append(wit_dictionnary)
nodes_as_dict.sort(key=lambda x: int(x['a'][0]))
# Now we have to manage the omissions in the main document.
omitted_pos = dict()
# We gather all omissions for each witness in a dict
for wit in possible_witnesses:
all_positions = [align_dict[wit] for align_dict in nodes_as_dict if align_dict[wit] != []]
last_position = int(all_positions[-1][-1])
not_present_positions = list(set(range(last_position + 1)) - set([int(position) for dictionnary in nodes_as_dict for position in dictionnary[wit]]))
not_present_positions.sort()
omitted_pos[wit] = not_present_positions
# print(omitted_pos)
for wit in possible_witnesses:
print(f"\n{wit}")
for omitted in omitted_pos[wit]:
# print(f"Omitted position {omitted} for wit {wit}")
# Let's retrieve the corresponding alignment unit, we keep the index to allow the injection in case B (see below)
if omitted != 0:
corresponding_alignment_unit = [(index, node) for index, node in enumerate(nodes_as_dict) if str(omitted - 1) in node[wit]][0]
else:
corresponding_alignment_unit = (0, nodes_as_dict[0])
# print(corresponding_alignment_unit)
# On a plusieurs cas de figure
# Le premier: un fusion avec un "trou": (39, {'a': ['59', '60'], 'b': ['65', '67'], 'c': ['62', '63'], 'd': ['82'], 'e': ['58']})
# Premier cas: il faut insérer l'omission dans un noeud déjà formé: 66, ['65', '67']
if any(int(item) > omitted for item in corresponding_alignment_unit[1][wit]):
# print("Case A")
copied_node = corresponding_alignment_unit[1]
list_to_amend = copied_node[wit]
list_to_amend.append(str(omitted))
# print(f"Appending {omitted}")
list_to_amend.sort(key=lambda x:int(x))
copied_node[wit] = list_to_amend
nodes_as_dict[corresponding_alignment_unit[0]] = copied_node
# Deuxième cas, il faut créer une nouvelle unité d'alignement: 2005, ['2003', '2004']
else:
# print("Case B")
new_node = {wit:[] for wit in possible_witnesses}
new_node[wit] = [str(omitted)]
nodes_as_dict.insert(corresponding_alignment_unit[0] + 1, new_node)
# print(new_node)
# print("\n")
return nodes_as_dict
if __name__ == '__main__':
result_b = (((0), (0)), ((1, 2, 3), (1, 2)), ((4, 5), (3)), ((6, 7), (4)), ((8), (5)),
((9, 10, 11), (6)), ((12), (7, 8)), ((13), (9, 10)), ((14), (11)),
((15), (12)), ((16, 17), (13, 14, 15)), ((18), (16)), ((19), (17)),
((), (18)), ((20), (19, 20)), ((21), (21, 22)), ((22, 23, 24), (23)),
((25, 26), (24, 25, 26)), ((27), (27)), ((28), (28, 29, 30)), ((29), (31)),
((30, 31, 32), (32)), ((33), (33)), ((34, 35), (34)), ((36, 37), (35)),
((38), (36)), ((39), (37, 38, 39)))
result_a = (((0), (0)), ((1), (1)), ((2), (2)), ((3), (3)), ((4), (4)), ((5, 6, 7), (5, 6)),
((8), (7, 8, 9, 10)), ((9), (11, 12)), ((10, 11), (13)), ((12), ()), ((13), (14)),
((14), (15, 16)), ((15), (17, 18)), ((16), (19)), ((17, 18, 19), (20, 21)),
((20), (22)), ((21), (23, 24)), ((22), (25)), ((23), (26)), ((24), (27)),
((25), (28, 29, 30)), ((26), (31, 32)), ((27, 28), (33, 34)), ((29), (35)),
((30, 31), (36, 37)), ((32), (38)), ((33), (39)), ((34), ()), ((35), ()),
((36), ()), ((37), ()), ((38), ()), ((39), ()))
result = merge_alignment_table(result_a, result_b)
print(result)