Commit 45cdf35

Update ner_evalution.md
First file was not the final notebook
1 parent 2b65d8b commit 45cdf35

1 file changed: +68 / -134 lines

docs/examples/ner_evaluation/ner_evalution.md

@@ -14,13 +14,11 @@ kernelspec:
 ```{code-cell} ipython3
 from medkit.io.medkit_json import load_text_documents
 from medkit.core.text import TextDocument
-from medkit.text.segmentation import SentenceTokenizer
 from pathlib import Path
 from tqdm import tqdm
 from medkit.text.ner import UMLSMatcher
 from medkit.text.metrics.ner import SeqEvalEvaluator
 import pandas as pd
-from glob import glob
 from medkit.text.ner.hf_entity_matcher import HFEntityMatcher
 import torch
 from medkit.core import Pipeline, DocPipeline, PipelineStep
@@ -30,165 +28,101 @@ import os
 from medkit.core.text import Entity, Span
 from spacy import displacy
 from medkit.text.spacy.displacy_utils import medkit_doc_to_displacy
-import random
 from IPython.display import display, HTML
-import pandas as pd
+from typing import List

 DEVICE = 0 if torch.cuda.is_available() else -1

+
+class GPT_NER:
+    def __init__(self, config, _open_ai_key, _name):
+        os.environ["OPENAI_API_KEY"] = _open_ai_key
+        self.nlp = assemble(config)
+        self.description = self.Description(_name)
+
+    def run(self, _raw_segment : []):
+        doc = self.nlp(_raw_segment[0].text)
+        gpt_remapping = { "Anatomie": "ANAT", "Médicament": "CHEM", "Appareil": "DEVI", "Pathologie": "DISO", "Région": "GEOG", "Organisme": "LIVB", "Objet": "OBJC", "Phénomène": "PHEN", "Physiologie": "PHYS", "Procédure": "PROC"}
+        predicted_entities = [Entity(label=gpt_remapping[ent.label_], text=ent.text, spans=[Span(ent.start_char, ent.end_char)]) for ent in doc.ents]
+
+        return predicted_entities
+
+    class Description:
+        def __init__(self, _name):
+            self.name = _name
+
+# Formats the results returned by SeqEvalEvaluator
 def results_to_df(_results, _title):
     results_list = list(_results.items())
-    arranged_results = {}
-    arranged_results["Entities"] = ['P','R','F1']
-    arranged_results["all"] = [round(results_list[i][1],2) for i in [0,1,2]]
-    accuracy = round(results_list[4][1],2)
-    for i in range(5,len(results_list), 4):
+    arranged_results = {"Entities": ['P', 'R', 'F1'],
+                        "all": [round(results_list[i][1], 2) for i in [0, 1, 2]]}
+    accuracy = round(results_list[4][1], 2)
+
+    for i in range(5, len(results_list), 4):
         key = results_list[i][0][:-10]
-        arranged_results[key] = [round(results_list[n][1],2) for n in [i,i+1,i+2]]
-    df = pd.DataFrame(arranged_results, index=[f"{_title} ({accuracy})",'','']).T
+        arranged_results[key] = [round(results_list[n][1], 2) for n in [i, i + 1, i + 2]]
+
+    df = pd.DataFrame(arranged_results, index=[f"{_title} ({accuracy})", '', '']).T
     return df

-def LLMNER(_text, _nlp):
-    doc = _nlp(_text)
-
-    predicted_entities = []
-
-    gpt_remapping = {
-        "Anatomie": "ANAT",
-        "Médicament": "CHEM",
-        "Appareil": "DEVI",
-        "Pathologie": "DISO",
-        "Région": "GEOG",
-        "Organisme": "LIVB",
-        "Objet": "OBJC",
-        "Phénomène": "PHEN",
-        "Physiologie": "PHYS",
-        "Procédure": "PROC",
-    }
-
-    for ent in doc.ents:
-        predicted_entities.append(Entity(
-            label=gpt_remapping[ent.label_],
-            text=ent.text,
-            spans=[Span(ent.start_char, ent.end_char)]
-        ))
-
-    return predicted_entities
-
-def eval(_docs, open_ai_key = "", _labels_remapping = {
-    "ANAT": "ANAT", "CHEM": "CHEM", "DEVI": "DEVI", "DISO": "DISO", "GEOG": "GEOG",
-    "LIVB": "LIVB", "OBJC": "OBJC", "PHEN": "PHEN", "PHYS": "PHYS", "PROC": "PROC",}):
-
-    use_gpt = True if open_ai_key != "" else False
-
-    if use_gpt:
-        os.environ["OPENAI_API_KEY"] = open_ai_key
-        nlp = assemble("config_gpt.cfg")
-
-    #Loading entity matchers
-    umls_matcher = UMLSMatcher(
-        umls_dir=Path.home() / "src/UMLS",
-        cache_dir=".umls_cache",
-        language="FRE",
-        semgroups=['ANAT', 'CHEM', 'DEVI', 'DISO', 'GEOG', 'LIVB', 'OBJC', 'PHEN', 'PHYS', 'PROC'],
-        lowercase=True,
-        normalize_unicode=True,
-        name="umls_matcher",
-        output_labels_by_semgroup=_labels_remapping
-    )
-
-    drbert_matcher = HFEntityMatcher(model="Thibeb/CamemBert_bio_generalized", name="drbert_matcher", device=DEVICE)
-    camembert_matcher = HFEntityMatcher(model="Thibeb/DrBert_generalized", name="camembert_matcher", device=DEVICE)
-
-    #Prediction
-    ners = [umls_matcher, drbert_matcher, camembert_matcher]
-    predicted_entities = {}
-    for ner in ners:
-        predicted_entities[ner.description.name] = []
-    if use_gpt : predicted_entities['GPT-3.5-turbo'] = []
-
-    # Predict entites for each doc for each NER tool
-    for doc in tqdm(_docs):
-        for ner in ners:
-            entities = ner.run([doc.raw_segment])
-            predicted_entities[ner.description.name].append(entities)
-        if use_gpt:
-            entities = LLMNER(doc.text, nlp)
-            predicted_entities['GPT-3.5-turbo'].append(entities)
-
+# Evaluates the annotations of several NER tools on the provided documents
+def eval(_docs, open_ai_key="", _labels_remapping = {"ANAT": "ANAT", "CHEM": "CHEM", "DEVI": "DEVI", "DISO": "DISO", "GEOG": "GEOG",
+                                                     "LIVB": "LIVB", "OBJC": "OBJC", "PHEN": "PHEN", "PHYS": "PHYS", "PROC": "PROC",}):
+
+    ners = []
+    ners.append(UMLSMatcher(umls_dir=Path.home() / "src/UMLS", cache_dir=".umls_cache", language="FRE", semgroups=['ANAT', 'CHEM', 'DEVI', 'DISO', 'GEOG', 'LIVB', 'OBJC', 'PHEN', 'PHYS', 'PROC'], lowercase=True, normalize_unicode=True, name="umls_matcher", output_labels_by_semgroup=_labels_remapping))
+    ners.append(HFEntityMatcher(model="Thibeb/CamemBert_bio_generalized", name="drbert_matcher", device=DEVICE))
+    ners.append(HFEntityMatcher(model="Thibeb/DrBert_generalized", name="camembert_matcher", device=DEVICE))
+    if open_ai_key != "" : ners.append(GPT_NER("config_gpt.cfg", open_ai_key,_name='ChatGPT-3.5-turbo'))
+
     ner_evaluator = SeqEvalEvaluator(return_metrics_by_label=True, average='weighted', labels_remapping=_labels_remapping)

-    # Compute NER metrics for each NER tool
     dfs = []
+
     for ner in ners:
-        results = ner_evaluator.compute(_docs, predicted_entities[ner.description.name])
+        predicted_entities = [ner.run([doc.raw_segment]) for doc in tqdm(_docs)]
+        results = ner_evaluator.compute(_docs, predicted_entities)
         dfs.append(results_to_df(_results=results, _title=ner.description.name))
-    if use_gpt:
-        results = ner_evaluator.compute(_docs, predicted_entities['GPTNER'])
-        dfs.append(results_to_df(_results=results, _title='GPTNER'))
-
+
     return pd.concat(dfs, axis=1)

-def test(_doc, open_ai_key = "", _labels_remapping = {
-    "ANAT": "ANAT", "CHEM": "CHEM", "DEVI": "DEVI", "DISO": "DISO", "GEOG": "GEOG",
-    "LIVB": "LIVB", "OBJC": "OBJC", "PHEN": "PHEN", "PHYS": "PHYS", "PROC": "PROC",}):
+# Displays the annotations of several NER tools on the provided document
+def test(_doc, open_ai_key="", _labels_remapping={"ANAT": "ANAT", "CHEM": "CHEM", "DEVI": "DEVI", "DISO": "DISO", "GEOG": "GEOG",
+                                                  "LIVB": "LIVB", "OBJC": "OBJC", "PHEN": "PHEN", "PHYS": "PHYS", "PROC": "PROC",}):

-    use_gpt = True if open_ai_key != "" else False
-
-    if use_gpt:
-        os.environ["OPENAI_API_KEY"] = open_ai_key
-        nlp = assemble("config_gpt.cfg")
-
-    #Loading entity matchers
-    umls_matcher = UMLSMatcher(
-        umls_dir=Path.home() / "src/UMLS",
-        cache_dir=".umls_cache",
-        language="FRE",
-        semgroups=['ANAT', 'CHEM', 'DEVI', 'DISO', 'GEOG', 'LIVB', 'OBJC', 'PHEN', 'PHYS', 'PROC'],
-        lowercase=True,
-        normalize_unicode=True,
-        name="umls_matcher",
-        output_labels_by_semgroup=_labels_remapping
-    )
-
-    drbert_matcher = HFEntityMatcher(model="Thibeb/CamemBert_bio_generalized", name="drbert_matcher", device=DEVICE)
-    camembert_matcher = HFEntityMatcher(model="Thibeb/DrBert_generalized", name="camembert_matcher", device=DEVICE)
-
-    #Prediction
-    ners = [umls_matcher, drbert_matcher, camembert_matcher]
-    annotated_docs = {}
-
-    # Predict entites for each doc for each NER tool
+    ners = []
+    ners.append(UMLSMatcher(umls_dir=Path.home() / "src/UMLS", cache_dir=".umls_cache", language="FRE", semgroups=['ANAT', 'CHEM', 'DEVI', 'DISO', 'GEOG', 'LIVB', 'OBJC', 'PHEN', 'PHYS', 'PROC'], lowercase=True, normalize_unicode=True, name="UMLS matcher", output_labels_by_semgroup=_labels_remapping))
+    ners.append(HFEntityMatcher(model="Thibeb/CamemBERT_bio_generalized", name="DrBERT matcher", device=DEVICE))
+    ners.append(HFEntityMatcher(model="Thibeb/DrBERT_generalized", name="CamemBERT-bio matcher", device=DEVICE))
+    if open_ai_key != "" : ners.append(GPT_NER("config_gpt3.cfg", open_ai_key,_name='ChatGPT-3.5-turbo'))
+    if open_ai_key != "" : ners.append(GPT_NER("config_gpt4.cfg", open_ai_key,_name='ChatGPT-4'))
+
+    annotated_docs = {'Original':_doc}
     for ner in ners:
+        annotated_docs[ner.description.name] = TextDocument(text=_doc.text)
         entities = ner.run([_doc.raw_segment])
-        annotated_doc = TextDocument(text=_doc.text)
-        for ent in entities:
-            annotated_doc.anns.add(ent)
-        annotated_docs[ner.description.name] = annotated_doc
-
-    if use_gpt:
-        entities = LLMNER(_doc.text, nlp)
-        annotated_doc = TextDocument(text=_doc.text)
         for ent in entities:
-            annotated_doc.anns.add(ent)
-        annotated_docs['GPT-3.5-turbo'] = annotated_doc
+            annotated_docs[ner.description.name].anns.add(ent)

-    html_datas = []
-
-    for ner, doc in annotated_docs.items():
-        displacy_data = medkit_doc_to_displacy(doc)
-        html_data = displacy.render(displacy_data, manual=True, style="ent", jupyter=False)
-        html_datas.append(html_data)
+    html_datas = [f'<h1>{name}</h1>{displacy.render(medkit_doc_to_displacy(doc), manual=True, style="ent", jupyter=False)}' for name, doc in annotated_docs.items()]
+    display(HTML("".join(html_datas)))
+```

-    display(HTML("".join(html_datas)))
+```{code-cell} ipython3
+# Load part of the test split of the already pre-processed QUAERO corpus
+docs = list(load_text_documents("datasets/quaero/test.jsonl"))[:100]
 ```

 ```{code-cell} ipython3
-docs = list(load_text_documents("datasets/quaero/test.jsonl"))[14:25]
+# Evaluate several NER tools and return a comparison table
+eval(docs)
+```

-eval(docs[:10])
+```{code-cell} ipython3
+# Display the annotations of different NER tools on one document of the split
+test(docs[12], open_ai_key="")
 ```

 ```{code-cell} ipython3
-def eval(_docs, _labels_remapping, )
+
 ```
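
The new `GPT_NER` class replaces the old `LLMNER` helper by exposing the same duck-typed interface as medkit's built-in matchers: a `run(segments)` method that returns `Entity` objects and a `description.name` used for table columns and display titles. That is what lets `eval` and `test` simply append it to the `ners` list when a non-empty `open_ai_key` is supplied. The sketch below is a hypothetical, minimal matcher written against that same contract; the `RegexMatcher` class and the sample sentence are illustrative only and are not part of medkit or of this commit.

```python
import re

from medkit.core.text import Entity, Span, TextDocument


class RegexMatcher:
    """Hypothetical matcher following the contract eval()/test() rely on:
    a .run(segments) method returning Entity objects and a .description.name."""

    class Description:
        def __init__(self, name):
            self.name = name

    def __init__(self, pattern, label, name="regex matcher"):
        self.pattern = re.compile(pattern, flags=re.IGNORECASE)
        self.label = label
        self.description = self.Description(name)

    def run(self, segments):
        # Match against the first segment's text, mirroring GPT_NER.run above.
        segment = segments[0]
        return [
            Entity(label=self.label, text=m.group(0), spans=[Span(m.start(), m.end())])
            for m in self.pattern.finditer(segment.text)
        ]


doc = TextDocument(text="Le patient présente une fracture du fémur droit.")
matcher = RegexMatcher(r"fracture|fémur", label="DISO")
for entity in matcher.run([doc.raw_segment]):
    print(entity.label, entity.text)
```

Anything shaped like this could be appended to the `ners` list in `eval` or `test` alongside `UMLSMatcher`, `HFEntityMatcher`, and `GPT_NER`.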
