-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Tokenizer customisé. Prend en compte les mots avec un '-'. Première v…
…ersion. A améliorer. Ref #19 Il faut encore ajouter plus de mots dans la liste de mots composés. Il faut aussi faire marcher le custom training pour le POS tagger, son problème c'est que l'apprentissage se passe avant le tokenizer customisé donc avant que les mots comme 'celui-là' par exemple deviennent un seul token.
- Loading branch information
1 parent
e1c6406
commit 1d08d7c
Showing
22 changed files
with
618 additions
and
88 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,14 @@ | ||
import sys
import os

# Make the repository root importable when this script is run directly.
sys.path.append(".")
from resolution_coreferences_pronominales.coreferences import analyses_texte
if __name__ == '__main__':
    filename = os.path.basename(__file__)
    print('Start ' + filename)
    phrase = "Le chat est tombé dans le puits. Il est profond. Il s'est blessé la patte."
    print(phrase)
    print("\nChargement en cours... \n")
    # Run the pronoun analysis once and reuse the result below
    # (the original recomputed informations_pronoms(phrase) a second time
    # at print-time, doubling the NLP work).
    infos = analyses_texte.informations_pronoms(phrase)
    print(analyses_texte.affichier_antecedents_dans_phrase(phrase, True))
    print('\nRésultat de la fonction analyses_texte.informations_pronoms(phrase) :')
    print(infos)

    print('Completed : ' + filename)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
41 changes: 41 additions & 0 deletions
41
resolution_coreferences_pronominales/coreferences/custom_model_creation.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import os
import fr_core_news_sm
from spacy.matcher import Matcher

# Compound words that must be kept as a single token by the custom tokenizer.
custom_exceptions_list = ['intelligence artificielle', 'pommes de terre']
# Small French spaCy model; the shared matcher below is bound to its vocab.
nlp = fr_core_news_sm.load()
matcher = Matcher(nlp.vocab)
# Absolute path to the repository-level "models/" directory (two levels up from this file).
model_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) + '/models/'
|
||
|
||
def hyphen_tokens_merged():
    """Register a pattern on the shared matcher capturing word-hyphen-word
    sequences (e.g. "celui-là") so they can later be merged into one token."""
    hyphen_pattern = [{'IS_ALPHA': True}, {'TEXT': '-'}, {'IS_ALPHA': True}]
    matcher.add('HYPHENS', None, hyphen_pattern)
|
||
|
||
def custom_tokenizer_exceptions_list(custom_list):
    """Register one matcher pattern per compound word in *custom_list*,
    keyed by the compound word's full text."""
    for compound_word in custom_list:
        pattern = [{'TEXT': part} for part in compound_word.split(" ")]
        matcher.add(compound_word, None, pattern)
|
||
|
||
def custom_tokenizer_merger(doc):
    """Pipeline component: merge hyphenated words and the custom compound
    words into single tokens on *doc* and return it."""
    # Register the hyphen pattern and the custom compound-word patterns
    # on the shared module-level matcher.
    hyphen_tokens_merged()
    custom_tokenizer_exceptions_list(custom_exceptions_list)

    # this will be called on the Doc object in the pipeline
    spans_to_merge = [doc[start:end] for _, start, end in matcher(doc)]
    # merge into one token after collecting all matches
    for span in spans_to_merge:
        span.merge()
    return doc
|
||
|
||
nlp.add_pipe(custom_tokenizer_merger, first=True)  # add it right after the tokenizer
# nlp.factories['custom_tokenizer_merger'] = custom_tokenizer_merger
# NOTE(review): 'costom_model_v1' looks like a typo for 'custom_model_v1' —
# confirm against whatever loads this model before renaming the directory.
nlp.to_disk(model_path + 'costom_model_v1')
Empty file.
57 changes: 57 additions & 0 deletions
57
...n_coreferences_pronominales/custom_model_training/custom_model_params/compound_words.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
{ | ||
"intelligence artificielle": { | ||
"lemma" : "intelligence artificielle", | ||
"pos" : "NOUN" | ||
}, | ||
"intelligences artificielles": { | ||
"lemma" : "intelligence artificielle", | ||
"pos" : "NOUN" | ||
}, | ||
"bande dessinée" : { | ||
"lemma" : "bande dessinée", | ||
"pos" : "NOUN" | ||
}, | ||
"bandes dessinées" : { | ||
"lemma" : "bande dessinée", | ||
"pos" : "NOUN" | ||
}, | ||
"compte rendu" : { | ||
"lemma" : "compte rendu", | ||
"pos" : "NOUN" | ||
}, | ||
"comptes rendus" : { | ||
"lemma" : "compte rendu", | ||
"pos" : "NOUN" | ||
}, | ||
"faux sens" : { | ||
"lemma" : "faux sens", | ||
"pos" : "NOUN" | ||
}, | ||
"hôtel de ville" : { | ||
"lemma" : "hôtel de ville", | ||
"pos" : "NOUN" | ||
}, | ||
"hôtels de ville" : { | ||
"lemma" : "hôtel de ville", | ||
"pos" : "NOUN" | ||
}, | ||
"coup de poing" : { | ||
"lemma" : "coup de poing", | ||
"pos" : "NOUN" | ||
}, | ||
"coups de poing" : { | ||
"lemma" : "coup de poing", | ||
"pos" : "NOUN" | ||
}, | ||
"ce qui" : { | ||
"lemma" : "ce qui", | ||
"pos" : "PRON", | ||
"type_pro" : "relatif" | ||
}, | ||
"ceux qui" : { | ||
"lemma" : "ce qui", | ||
"pos" : "PRON", | ||
"type_pro" : "relatif" | ||
} | ||
|
||
} |
58 changes: 58 additions & 0 deletions
58
...references_pronominales/custom_model_training/custom_model_params/compound_words_old.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
{ | ||
"intelligence artificielle": { | ||
"lemma" : "intelligence artificielle", | ||
"pos" : "NOUN", | ||
"tag" : "NOUN__Gender=Fem|Number=Sing" | ||
}, | ||
"intelligences artificielles": { | ||
"lemma" : "intelligence artificielle", | ||
"pos" : "NOUN", | ||
"tag" : "NOUN__Gender=Fem|Number=Plur" | ||
}, | ||
"bande dessinée" : { | ||
"lemma" : "bande dessinée", | ||
"pos" : "NOUN", | ||
"tag" : "NOUN__Gender=Fem|Number=Sing" | ||
}, | ||
"bandes dessinées" : { | ||
"lemma" : "bande dessinée", | ||
"pos" : "NOUN", | ||
"tag" : "NOUN__Gender=Fem|Number=Plur" | ||
}, | ||
"compte rendu" : { | ||
"lemma" : "compte rendu", | ||
"pos" : "NOUN", | ||
"tag" : "NOUN__Gender=Masc|Number=Sing" | ||
}, | ||
"comptes rendus" : { | ||
"lemma" : "compte rendu", | ||
"pos" : "NOUN", | ||
"tag" : "NOUN__Gender=Masc|Number=Plur" | ||
}, | ||
"faux sens" : { | ||
"lemma" : "faux sens", | ||
"pos" : "NOUN", | ||
"tag" : "NOUN__Gender=Masc" | ||
}, | ||
"hôtel de ville" : { | ||
"lemma" : "hôtel de ville", | ||
"pos" : "NOUN", | ||
"tag" : "NOUN__Gender=Masc|Number=Sing" | ||
}, | ||
"hôtels de ville" : { | ||
"lemma" : "hôtel de ville", | ||
"pos" : "NOUN", | ||
"tag" : "NOUN__Gender=Masc|Number=Plur" | ||
}, | ||
"coup de poing" : { | ||
"lemma" : "coup de poing", | ||
"pos" : "NOUN", | ||
"tag" : "NOUN__Gender=Masc|Number=Sing" | ||
}, | ||
"coups de poing" : { | ||
"lemma" : "coup de poing", | ||
"pos" : "NOUN", | ||
"tag" : "NOUN__Gender=Masc|Number=Plur" | ||
} | ||
|
||
} |
73 changes: 73 additions & 0 deletions
73
resolution_coreferences_pronominales/custom_model_training/custom_tokenizer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import fr_core_news_sm
import os
from spacy.matcher import Matcher
import json
from spacy.language import Language
from spacy.tokens import Doc

# Path to the json file describing the custom compound words (text -> lemma/POS info).
json_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + \
            '/custom_model_training/custom_model_params/compound_words.json'
|
||
|
||
def nlp_loader():
    """
    Temporary function allowing to load the nlp pipeline with the custom tokenizer.

    This will later become a function creating a new model; scripts will then no
    longer load the model through this function but directly from the new
    customized model saved on disk.

    :return: nlp - the French spaCy pipeline with a ``CompoundWordsMerger``
             component inserted right after the tokenizer
    """
    nlp = fr_core_news_sm.load()

    class CompoundWordsMerger:
        """Pipeline component merging hyphenated words and the custom compound
        words described in the json file at ``words_path`` into single tokens,
        carrying their custom POS and lemma."""

        def __init__(self, words_path):
            # Path to the json file mapping compound-word text to lemma/POS info.
            self.words_path = words_path

        def __call__(self, doc: Doc):
            # Adding hyphen compound words (e.g. "celui-là") to the matcher
            matcher = Matcher(nlp.vocab)
            matcher.add('HYPHENS', None, [{'IS_ALPHA': True}, {'TEXT': '-'}, {'IS_ALPHA': True}])

            # Opening the json file containing the information about our custom compound words
            with open(self.words_path) as json_file:
                compound_words = json.load(json_file)

            # Adding the custom compound words from the json file to the matcher.
            # BUG FIX: the original reused the same loop variable for the phrase
            # and its sub-tokens, so each pattern was registered under the *last*
            # word of the phrase (colliding for e.g. "ce qui" / "ceux qui").
            # Each phrase is now keyed by its own full text.
            for compound_word in compound_words:
                pattern = [{'TEXT': part} for part in compound_word.split(' ')]
                matcher.add(compound_word, None, pattern)

            # Collect the matched spans; for known compound words, register a
            # tokenizer special case carrying the custom POS.
            matched_spans = []
            for match_id, start, end in matcher(doc):
                span = doc[start:end]
                matched_spans.append(span)
                if str(span) in compound_words:
                    nlp.tokenizer.add_special_case(
                        str(span),
                        [{'ORTH': str(span), 'POS': compound_words[str(span)]["pos"]}])
            for span in matched_spans:  # merge into one token after collecting all matches
                span.merge()

            # Adding the custom lemmas for the custom compound words
            for token in doc:
                if ' ' in token.text and token.text in compound_words:
                    token.lemma_ = compound_words[token.text]["lemma"]

            return doc

    nlp.add_pipe(CompoundWordsMerger(json_path),
                 first=True)  # first=True: add it right after the tokenizer; default: last

    # Adding the custom pipeline component to the factories so the pipeline
    # can be reconstructed when the model is loaded from disk.
    Language.factories['CompoundWordsMerger'] = lambda _: CompoundWordsMerger(json_path)
    return nlp
Oops, something went wrong.