Custom tokenizer. Takes into account words containing a '-'. First version. To be improved. Ref #19

More words still need to be added to the compound-word list. Custom training for the POS tagger also needs to be made to work; its problem is that training takes place before the custom tokenizer runs, i.e. before words such as 'celui-là' become a single token.
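As a rough illustration of the customisation this commit introduces, the sketch below merges hyphenated words such as 'celui-là' back into a single token with a Matcher-based pipeline component, mirroring the approach added in this diff (assumes spaCy 2.x and the fr_core_news_sm model used by the project; the merge_hyphenated helper and the example sentence are illustrative only):

import fr_core_news_sm
from spacy.matcher import Matcher

nlp = fr_core_news_sm.load()
matcher = Matcher(nlp.vocab)
# one alphabetic token, a hyphen, then another alphabetic token
matcher.add('HYPHENS', None, [{'IS_ALPHA': True}, {'TEXT': '-'}, {'IS_ALPHA': True}])

def merge_hyphenated(doc):
    # collect all matched spans first, then merge, as in the component added by this commit
    spans = [doc[start:end] for _, start, end in matcher(doc)]
    for span in spans:
        span.merge()
    return doc

nlp.add_pipe(merge_hyphenated, first=True)  # run right after the tokenizer
doc = nlp("Celui-là est tombé dans le puits.")
print([token.text for token in doc])
# if the default tokenizer split on the hyphen, 'Celui-là' now comes back as one token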
mariastefan committed Jul 14, 2020
1 parent e1c6406 commit 1d08d7c
Showing 22 changed files with 618 additions and 88 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -9,3 +9,5 @@ resolution_coreferences_pronominales/tests/tests-spacy.py
uninstall.sh

resolution_coreferences_pronominales/data/phrases_tmp

resolution_coreferences_pronominales/data/brouillon
2 changes: 1 addition & 1 deletion .gitlab-ci.yml
@@ -8,7 +8,7 @@ job:mon_test:
- apt-get update -qy
- sh install.sh
- python3 ./resolution_coreferences_pronominales/__main__.py
- for f in resolution_coreferences_pronominales/tests/*.py; do if [ $f != resolution_coreferences_pronominales/tests/test-duree_execution.py ]; then python3 "$f"; fi; done
- for f in resolution_coreferences_pronominales/tests/*.py; do if [ $f != resolution_coreferences_pronominales/tests/test-duree_execution.py ] && [ $f != resolution_coreferences_pronominales/tests/test-spacy.py ]; then python3 "$f"; fi; done
- python3 resolution_coreferences_pronominales/tests-regression/test-duree_execution.py 1 1
- sh uninstall.sh

Empty file added models/__init__.py
Empty file.
10 changes: 5 additions & 5 deletions resolution_coreferences_pronominales/__main__.py
@@ -1,14 +1,14 @@
import sys
import os

sys.path.append(".")
from resolution_coreferences_pronominales.coreferences import analyses_texte

if __name__ == '__main__':
    filename = os.path.basename(__file__)
    print('Start ' + filename)
    phrase = "Le chat est tombé dans le puits. Il est profond. Il s'est blessé la patte."
    print(phrase)
    print("\nChargement en cours... \n")
    infos = analyses_texte.informations_pronoms(phrase)
    print(analyses_texte.affichier_antecedents_dans_phrase(phrase, True))
    print('\nRésultat de la fonction analyses_texte.informations_pronoms(phrase) :')
    print(analyses_texte.informations_pronoms(phrase))


    print('Completed : ' + filename)
@@ -1,35 +1,10 @@
from statistics import mean

import spacy
import fr_core_news_sm
from resolution_coreferences_pronominales.coreferences import relations_entre_mots
from spacy.matcher import Matcher


def nlp_loader():
    nlp = fr_core_news_sm.load()
    matcher = Matcher(nlp.vocab)
    matcher.add('HYPHENS', None, [{'IS_ALPHA': True}, {'ORTH': '-', 'OP': '+'}, {'IS_ALPHA': True}])
    liste = ['intelligence artificielle']
    for e in liste:
        zzz = []
        for i in e.split(" "):
            zzz.append({'ORTH': i})
        matcher.add(e, None, zzz)

    def quote_merger(doc):
        # this will be called on the Doc object in the pipeline
        matched_spans = []
        matches = matcher(doc)
        for match_id, start, end in matches:
            span = doc[start:end]
            matched_spans.append(span)
        for span in matched_spans:  # merge into one token after collecting all matches
            span.merge()
        return doc

    nlp.add_pipe(quote_merger, first=True)  # add it right after the tokenizer
    return nlp
from spacy.tokens import Doc
import sys
sys.path.append(".")
from resolution_coreferences_pronominales.custom_model_training import custom_tokenizer

# Takes a sentence and returns information about its pronouns
# All words are lemmatised
@@ -44,7 +19,7 @@ def quote_merger(doc):
def informations_pronoms(phrase: str or spacy.tokens.doc.Doc):
    # We check whether phrase is of type spacy.tokens.doc.Doc to save time (because spacy.load('fr') is slow)
    if isinstance(phrase, str):
        nlp = nlp_loader()
        nlp = custom_tokenizer.nlp_loader()
        doc = nlp(phrase)
    else:
        doc = phrase
@@ -143,7 +118,7 @@ def informations_pronoms(phrase: str or spacy.tokens.doc.Doc):
def coreferences_phrase(phrase: str or spacy.tokens.doc.Doc, cache: bool):
    # We check whether phrase is of type spacy.tokens.doc.Doc to save time (because spacy.load('fr') is slow)
    if isinstance(phrase, str):
        nlp = nlp_loader()
        nlp = custom_tokenizer.nlp_loader()
        phrase = nlp(phrase)
    infos_pronoms = informations_pronoms(phrase)
    coreferences = []
@@ -221,7 +196,7 @@ def coreferences_phrase(phrase: str or spacy.tokens.doc.Doc, cache: bool):


def affichier_antecedents_dans_phrase(phrase: str, cache: bool):
    nlp = nlp_loader()
    nlp = custom_tokenizer.nlp_loader()
    phrase = nlp(phrase)
    coreferences = coreferences_phrase(phrase, cache)
    phrase_antecedents = ''
@@ -0,0 +1,41 @@
import os
import fr_core_news_sm
from spacy.matcher import Matcher

custom_exceptions_list = ['intelligence artificielle', 'pommes de terre']
nlp = fr_core_news_sm.load()
matcher = Matcher(nlp.vocab)
model_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) + '/models/'


def hyphen_tokens_merged():
    matcher.add('HYPHENS', None, [{'IS_ALPHA': True}, {'TEXT': '-'}, {'IS_ALPHA': True}])


def custom_tokenizer_exceptions_list(custom_list):
    for compound_word in custom_list:
        pattern = []
        for word in compound_word.split(" "):
            pattern.append({'TEXT': word})
        matcher.add(compound_word, None, pattern)


def custom_tokenizer_merger(doc):
    # these calls add the match patterns to the matcher
    hyphen_tokens_merged()
    custom_tokenizer_exceptions_list(custom_exceptions_list)

    # this will be called on the Doc object in the pipeline
    matched_spans = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        matched_spans.append(span)
    for span in matched_spans:  # merge into one token after collecting all matches
        span.merge()
    return doc


nlp.add_pipe(custom_tokenizer_merger, first=True) # add it right after the tokenizer
# nlp.factories['custom_tokenizer_merger'] = custom_tokenizer_merger
nlp.to_disk(model_path + 'costom_model_v1')
Empty file.
@@ -0,0 +1,57 @@
{
"intelligence artificielle": {
"lemma" : "intelligence artificielle",
"pos" : "NOUN"
},
"intelligences artificielles": {
"lemma" : "intelligence artificielle",
"pos" : "NOUN"
},
"bande dessinée" : {
"lemma" : "bande dessinée",
"pos" : "NOUN"
},
"bandes dessinées" : {
"lemma" : "bande dessinée",
"pos" : "NOUN"
},
"compte rendu" : {
"lemma" : "compte rendu",
"pos" : "NOUN"
},
"comptes rendus" : {
"lemma" : "compte rendu",
"pos" : "NOUN"
},
"faux sens" : {
"lemma" : "faux sens",
"pos" : "NOUN"
},
"hôtel de ville" : {
"lemma" : "hôtel de ville",
"pos" : "NOUN"
},
"hôtels de ville" : {
"lemma" : "hôtel de ville",
"pos" : "NOUN"
},
"coup de poing" : {
"lemma" : "coup de poing",
"pos" : "NOUN"
},
"coups de poing" : {
"lemma" : "coup de poing",
"pos" : "NOUN"
},
"ce qui" : {
"lemma" : "ce qui",
"pos" : "PRON",
"type_pro" : "relatif"
},
"ceux qui" : {
"lemma" : "ce qui",
"pos" : "PRON",
"type_pro" : "relatif"
}

}
@@ -0,0 +1,58 @@
{
"intelligence artificielle": {
"lemma" : "intelligence artificielle",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Fem|Number=Sing"
},
"intelligences artificielles": {
"lemma" : "intelligence artificielle",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Fem|Number=Plur"
},
"bande dessinée" : {
"lemma" : "bande dessinée",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Fem|Number=Sing"
},
"bandes dessinées" : {
"lemma" : "bande dessinée",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Fem|Number=Plur"
},
"compte rendu" : {
"lemma" : "compte rendu",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Masc|Number=Sing"
},
"comptes rendus" : {
"lemma" : "compte rendu",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Masc|Number=Plur"
},
"faux sens" : {
"lemma" : "faux sens",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Masc"
},
"hôtel de ville" : {
"lemma" : "hôtel de ville",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Masc|Number=Sing"
},
"hôtels de ville" : {
"lemma" : "hôtel de ville",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Masc|Number=Plur"
},
"coup de poing" : {
"lemma" : "coup de poing",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Masc|Number=Sing"
},
"coups de poing" : {
"lemma" : "coup de poing",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Masc|Number=Plur"
}

}
@@ -0,0 +1,73 @@
import fr_core_news_sm
import os
from spacy.matcher import Matcher
import json
from spacy.language import Language
from spacy.tokens import Doc

json_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + \
            '/custom_model_training/custom_model_params/compound_words.json'


def nlp_loader():
    """
    Temporary function that loads the nlp pipeline with the custom tokenizer.
    This will later become a function that creates a new model, and scripts will no longer load the model with
    this function but directly from the new customized model.
    :return: nlp
    """
    nlp = fr_core_news_sm.load()

    class CompoundWordsMerger:
        def __init__(self, words_path):
            # self.model_size = model_size
            self.words_path = words_path

        def __call__(self, doc: Doc):
            # Adding hyphen compound words to the matcher
            matcher = Matcher(nlp.vocab)
            matcher.add('HYPHENS', None, [{'IS_ALPHA': True}, {'TEXT': '-'}, {'IS_ALPHA': True}])

            # Opening the json file containing the information about our custom compound words
            with open(self.words_path) as json_file:
                compound_words = json.load(json_file)

            # Creating a list which will contain the keys of the dictionary in the words_path json file
            # These keys correspond to the custom compound words text
            custom_exceptions_list = []
            for key in compound_words.keys():
                custom_exceptions_list.append(key)

            # Adding the custom compound words from the json file to the matcher
            for compound_word in custom_exceptions_list:
                pattern = []
                for word in compound_word.split(' '):
                    pattern.append({'TEXT': word})
                matcher.add(compound_word, None, pattern)

            # Adding the matches containing the compound words to the doc
            matched_spans = []
            matches = matcher(doc)
            for match_id, start, end in matches:
                span = doc[start:end]
                matched_spans.append(span)
                if str(span) in compound_words.keys():
                    nlp.tokenizer.add_special_case(str(span),
                                                   [{'ORTH': str(span), 'POS': compound_words[str(span)]["pos"]}])
            for span in matched_spans:  # merge into one token after collecting all matches
                span.merge()

            # Adding the custom lemmas for the custom compound words
            for token in doc:
                if ' ' in token.text:
                    if token.text in compound_words.keys():
                        token.lemma_ = compound_words[token.text]["lemma"]

            return doc

    nlp.add_pipe(CompoundWordsMerger(json_path),
                 first=True)  # first=True: add it right after the tokenizer; default: last

    # Adding the custom pipeline to the factories
    Language.factories['CompoundWordsMerger'] = lambda _: CompoundWordsMerger(json_path)
    return nlp
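A small usage sketch of the loader above (hedged: it assumes this new file is the custom_tokenizer module imported in the coreference code earlier in this diff, and spaCy 2.x):

from resolution_coreferences_pronominales.custom_model_training import custom_tokenizer

nlp = custom_tokenizer.nlp_loader()
doc = nlp("L'hôtel de ville est fermé. Celui-là est ouvert.")
print([(token.text, token.lemma_) for token in doc])
# 'hôtel de ville' and 'Celui-là' are intended to come out as single tokens,
# with the lemma for 'hôtel de ville' taken from compound_words.json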