@@ -14,13 +14,11 @@ kernelspec:
``` {code-cell} ipython3
from medkit.io.medkit_json import load_text_documents
from medkit.core.text import TextDocument
- from medkit.text.segmentation import SentenceTokenizer
from pathlib import Path
from tqdm import tqdm
from medkit.text.ner import UMLSMatcher
from medkit.text.metrics.ner import SeqEvalEvaluator
import pandas as pd
- from glob import glob
from medkit.text.ner.hf_entity_matcher import HFEntityMatcher
import torch
from medkit.core import Pipeline, DocPipeline, PipelineStep
@@ -30,165 +28,101 @@ import os
from medkit.core.text import Entity, Span
from spacy import displacy
from medkit.text.spacy.displacy_utils import medkit_doc_to_displacy
- import random
from IPython.display import display, HTML
- import pandas as pd
+ from typing import List

DEVICE = 0 if torch.cuda.is_available() else -1
+
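+ # Adapter that exposes a spacy-llm GPT pipeline through the same run()/description interface
+ # as the medkit matchers used in eval() and test() below. The pipeline is built by assemble()
+ # (assumed to be imported from spacy_llm.util earlier in the notebook) from a config file such
+ # as "config_gpt.cfg", which is expected to sit next to the notebook.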
+ class GPT_NER:
+     def __init__(self, config, _open_ai_key, _name):
+         os.environ["OPENAI_API_KEY"] = _open_ai_key
+         self.nlp = assemble(config)
+         self.description = self.Description(_name)
+ 
+     def run(self, _raw_segment: List):
+         doc = self.nlp(_raw_segment[0].text)
+         # Map the French labels produced by the GPT pipeline onto UMLS semantic group codes
+         gpt_remapping = {"Anatomie": "ANAT", "Médicament": "CHEM", "Appareil": "DEVI",
+                          "Pathologie": "DISO", "Région": "GEOG", "Organisme": "LIVB",
+                          "Objet": "OBJC", "Phénomène": "PHEN", "Physiologie": "PHYS",
+                          "Procédure": "PROC"}
+         predicted_entities = [Entity(label=gpt_remapping[ent.label_], text=ent.text,
+                                      spans=[Span(ent.start_char, ent.end_char)])
+                               for ent in doc.ents]
+ 
+         return predicted_entities
+ 
+     class Description:
+         def __init__(self, _name):
+             self.name = _name
+ 
+ # Format the results returned by SeqEvalEvaluator as a DataFrame (one column of scores per tool)
def results_to_df(_results, _title):
    results_list = list(_results.items())
-     arranged_results = {}
-     arranged_results["Entities"] = ['P','R','F1']
-     arranged_results["all"] = [ round(results_list[i ][1],2) for i in [0,1,2]]
-     accuracy = round(results_list[4][1],2)
-     for i in range(5,len(results_list), 4):
+     arranged_results = {"Entities": ['P', 'R', 'F1'],
+                         "all": [round(results_list[i][1], 2) for i in [0, 1, 2]]}
+     accuracy = round(results_list[4][1], 2)
+ 
+     for i in range(5, len(results_list), 4):
        key = results_list[i][0][:-10]
-         arranged_results[key] = [round(results_list[n][1],2) for n in [i,i+1,i+2]]
-     df = pd.DataFrame(arranged_results, index=[f"{_title} ({accuracy})",'','']).T
+         arranged_results[key] = [round(results_list[n][1], 2) for n in [i, i + 1, i + 2]]
+ 
+     df = pd.DataFrame(arranged_results, index=[f"{_title} ({accuracy})", '', '']).T
    return df
- def LLMNER(_text, _nlp):
-     doc = _nlp(_text)
- 
-     predicted_entities = []
- 
-     gpt_remapping = {
-         "Anatomie": "ANAT",
-         "Médicament": "CHEM",
-         "Appareil": "DEVI",
-         "Pathologie": "DISO",
-         "Région": "GEOG",
-         "Organisme": "LIVB",
-         "Objet": "OBJC",
-         "Phénomène": "PHEN",
-         "Physiologie": "PHYS",
-         "Procédure": "PROC",
-     }
- 
-     for ent in doc.ents:
-         predicted_entities.append(Entity(
-             label=gpt_remapping[ent.label_],
-             text=ent.text,
-             spans=[Span(ent.start_char, ent.end_char)]
-         ))
- 
-     return predicted_entities
- 
- def eval(_docs, open_ai_key = "", _labels_remapping = {
-     "ANAT": "ANAT", "CHEM": "CHEM", "DEVI": "DEVI", "DISO": "DISO", "GEOG": "GEOG",
-     "LIVB": "LIVB", "OBJC": "OBJC", "PHEN": "PHEN", "PHYS": "PHYS", "PROC": "PROC",}):
- 
-     use_gpt = True if open_ai_key != "" else False
- 
-     if use_gpt:
-         os.environ["OPENAI_API_KEY"] = open_ai_key
-         nlp = assemble("config_gpt.cfg")
- 
-     #Loading entity matchers
-     umls_matcher = UMLSMatcher(
-         umls_dir=Path.home() / "src/UMLS",
-         cache_dir=".umls_cache",
-         language="FRE",
-         semgroups=['ANAT', 'CHEM', 'DEVI', 'DISO', 'GEOG', 'LIVB', 'OBJC', 'PHEN', 'PHYS', 'PROC'],
-         lowercase=True,
-         normalize_unicode=True,
-         name="umls_matcher",
-         output_labels_by_semgroup=_labels_remapping
-     )
- 
-     drbert_matcher = HFEntityMatcher(model="Thibeb/CamemBert_bio_generalized", name="drbert_matcher", device=DEVICE)
-     camembert_matcher = HFEntityMatcher(model="Thibeb/DrBert_generalized", name="camembert_matcher", device=DEVICE)
- 
-     #Prediction
-     ners = [umls_matcher, drbert_matcher, camembert_matcher]
-     predicted_entities = {}
-     for ner in ners:
-         predicted_entities[ner.description.name] = []
-     if use_gpt : predicted_entities['GPT-3.5-turbo'] = []
- 
-     # Predict entites for each doc for each NER tool
-     for doc in tqdm(_docs):
-         for ner in ners:
-             entities = ner.run([doc.raw_segment])
-             predicted_entities[ner.description.name].append(entities)
-         if use_gpt:
-             entities = LLMNER(doc.text, nlp)
-             predicted_entities['GPT-3.5-turbo'].append(entities)
- 
+ # Evaluate the annotations of several NER tools on the provided documents
+ def eval(_docs, open_ai_key="", _labels_remapping={"ANAT": "ANAT", "CHEM": "CHEM", "DEVI": "DEVI", "DISO": "DISO", "GEOG": "GEOG",
+                                                    "LIVB": "LIVB", "OBJC": "OBJC", "PHEN": "PHEN", "PHYS": "PHYS", "PROC": "PROC"}):
+ 
+     ners = []
+     ners.append(UMLSMatcher(umls_dir=Path.home() / "src/UMLS", cache_dir=".umls_cache", language="FRE",
+                             semgroups=['ANAT', 'CHEM', 'DEVI', 'DISO', 'GEOG', 'LIVB', 'OBJC', 'PHEN', 'PHYS', 'PROC'],
+                             lowercase=True, normalize_unicode=True, name="umls_matcher",
+                             output_labels_by_semgroup=_labels_remapping))
+     ners.append(HFEntityMatcher(model="Thibeb/CamemBert_bio_generalized", name="drbert_matcher", device=DEVICE))
+     ners.append(HFEntityMatcher(model="Thibeb/DrBert_generalized", name="camembert_matcher", device=DEVICE))
+     if open_ai_key != "":
+         ners.append(GPT_NER("config_gpt.cfg", open_ai_key, _name='ChatGPT-3.5-turbo'))
+ 
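+     # SeqEvalEvaluator computes seqeval-style NER metrics (overall precision/recall/F1 and accuracy,
+     # plus per-label values since return_metrics_by_label=True); results_to_df() reshapes this flat
+     # dict into a single column of scores per tool.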
    ner_evaluator = SeqEvalEvaluator(return_metrics_by_label=True, average='weighted', labels_remapping=_labels_remapping)

-     # Compute NER metrics for each NER tool
    dfs = []
+ 
    for ner in ners:
-         results = ner_evaluator.compute(_docs, predicted_entities[ner.description.name])
+         predicted_entities = [ner.run([doc.raw_segment]) for doc in tqdm(_docs)]
+         results = ner_evaluator.compute(_docs, predicted_entities)
        dfs.append(results_to_df(_results=results, _title=ner.description.name))
-     if use_gpt:
-         results = ner_evaluator.compute(_docs, predicted_entities['GPTNER'])
-         dfs.append(results_to_df(_results=results, _title='GPTNER'))
- 
+ 
    return pd.concat(dfs, axis=1)
- def test(_doc, open_ai_key = "", _labels_remapping = {
-     "ANAT": "ANAT", "CHEM": "CHEM", "DEVI": "DEVI", "DISO": "DISO", "GEOG": "GEOG",
-     "LIVB": "LIVB", "OBJC": "OBJC", "PHEN": "PHEN", "PHYS": "PHYS", "PROC": "PROC",}):
+ # Display the annotations of several NER tools on the provided document
+ def test(_doc, open_ai_key="", _labels_remapping={"ANAT": "ANAT", "CHEM": "CHEM", "DEVI": "DEVI", "DISO": "DISO", "GEOG": "GEOG",
+                                                   "LIVB": "LIVB", "OBJC": "OBJC", "PHEN": "PHEN", "PHYS": "PHYS", "PROC": "PROC"}):

-     use_gpt = True if open_ai_key != "" else False
- 
-     if use_gpt:
-         os.environ["OPENAI_API_KEY"] = open_ai_key
-         nlp = assemble("config_gpt.cfg")
- 
-     #Loading entity matchers
-     umls_matcher = UMLSMatcher(
-         umls_dir=Path.home() / "src/UMLS",
-         cache_dir=".umls_cache",
-         language="FRE",
-         semgroups=['ANAT', 'CHEM', 'DEVI', 'DISO', 'GEOG', 'LIVB', 'OBJC', 'PHEN', 'PHYS', 'PROC'],
-         lowercase=True,
-         normalize_unicode=True,
-         name="umls_matcher",
-         output_labels_by_semgroup=_labels_remapping
-     )
- 
-     drbert_matcher = HFEntityMatcher(model="Thibeb/CamemBert_bio_generalized", name="drbert_matcher", device=DEVICE)
-     camembert_matcher = HFEntityMatcher(model="Thibeb/DrBert_generalized", name="camembert_matcher", device=DEVICE)
- 
-     #Prediction
-     ners = [umls_matcher, drbert_matcher, camembert_matcher]
-     annotated_docs = {}
- 
-     # Predict entites for each doc for each NER tool
+     ners = []
+     ners.append(UMLSMatcher(umls_dir=Path.home() / "src/UMLS", cache_dir=".umls_cache", language="FRE",
+                             semgroups=['ANAT', 'CHEM', 'DEVI', 'DISO', 'GEOG', 'LIVB', 'OBJC', 'PHEN', 'PHYS', 'PROC'],
+                             lowercase=True, normalize_unicode=True, name="UMLS matcher",
+                             output_labels_by_semgroup=_labels_remapping))
+     ners.append(HFEntityMatcher(model="Thibeb/CamemBERT_bio_generalized", name="DrBERT matcher", device=DEVICE))
+     ners.append(HFEntityMatcher(model="Thibeb/DrBERT_generalized", name="CamemBERT-bio matcher", device=DEVICE))
+     if open_ai_key != "":
+         ners.append(GPT_NER("config_gpt3.cfg", open_ai_key, _name='ChatGPT-3.5-turbo'))
+         ners.append(GPT_NER("config_gpt4.cfg", open_ai_key, _name='ChatGPT-4'))
+ 
+     annotated_docs = {'Original': _doc}
    for ner in ners:
+         annotated_docs[ner.description.name] = TextDocument(text=_doc.text)
        entities = ner.run([_doc.raw_segment])
-         annotated_doc = TextDocument(text=_doc.text)
-         for ent in entities:
-             annotated_doc.anns.add(ent)
-         annotated_docs[ner.description.name] = annotated_doc
- 
-     if use_gpt:
-         entities = LLMNER(_doc.text, nlp)
-         annotated_doc = TextDocument(text=_doc.text)
        for ent in entities:
-             annotated_doc.anns.add(ent)
-         annotated_docs['GPT-3.5-turbo'] = annotated_doc
+             annotated_docs[ner.description.name].anns.add(ent)
-     html_datas = []
- 
-     for ner, doc in annotated_docs.items():
-         displacy_data = medkit_doc_to_displacy(doc)
-         html_data = displacy.render(displacy_data, manual=True, style="ent", jupyter=False)
-         html_datas.append(html_data)
+     # Render each annotated version of the document with displaCy, one heading per tool
+     html_datas = [f'<h1>{name}</h1>{displacy.render(medkit_doc_to_displacy(doc), manual=True, style="ent", jupyter=False)}'
+                   for name, doc in annotated_docs.items()]
+     display(HTML("".join(html_datas)))
+ ```
-     display(HTML("".join(html_datas)))
+ ``` {code-cell} ipython3
+ # Load the first 100 documents of the already pre-processed test split of the QUAERO corpus
+ docs = list(load_text_documents("datasets/quaero/test.jsonl"))[:100]
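+ # QUAERO is a French biomedical corpus annotated with UMLS semantic groups; the jsonl file is
+ # assumed to have been converted to medkit's JSON format beforehand (hence load_text_documents).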
```
``` {code-cell} ipython3
- docs = list(load_text_documents("datasets/quaero/test.jsonl"))[14:25]
+ # Evaluate several NER tools and return a comparison table
+ eval(docs)
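+ # Passing an OpenAI key, e.g. eval(docs, open_ai_key="<your-key>"), would additionally evaluate
+ # the GPT-based matcher; "<your-key>" is a placeholder, not a real credential.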
+ ```

- eval(docs[:10])
+ ``` {code-cell} ipython3
+ # Display the annotations of the different NER tools on one document of the split
+ test(docs[12], open_ai_key="")
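+ # With a non-empty key, test() would also render ChatGPT-3.5 and ChatGPT-4 annotations, assuming
+ # the spacy-llm configs "config_gpt3.cfg" and "config_gpt4.cfg" are available.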
```

``` {code-cell} ipython3
- def eval(_docs, _labels_remapping, )
+ 
```