Custom tokenizer. Takes into account words containing a '-'. First version. To be improved. Ref #19

More words still need to be added to the compound-word list. Custom training for the POS tagger also needs to be made to work; its problem is that training takes place before the custom tokenizer runs, i.e. before words such as 'celui-là' become a single token.
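As a rough illustration of the customisation this commit introduces, the sketch below merges hyphenated words such as 'celui-là' back into a single token with a Matcher-based pipeline component, mirroring the approach added in this diff (assumes spaCy 2.x and the fr_core_news_sm model used by the project; the merge_hyphenated helper and the example sentence are illustrative only):

import fr_core_news_sm
from spacy.matcher import Matcher

nlp = fr_core_news_sm.load()
matcher = Matcher(nlp.vocab)
# one alphabetic token, a hyphen, then another alphabetic token
matcher.add('HYPHENS', None, [{'IS_ALPHA': True}, {'TEXT': '-'}, {'IS_ALPHA': True}])

def merge_hyphenated(doc):
    # collect all matched spans first, then merge, as in the component added by this commit
    spans = [doc[start:end] for _, start, end in matcher(doc)]
    for span in spans:
        span.merge()
    return doc

nlp.add_pipe(merge_hyphenated, first=True)  # run right after the tokenizer
doc = nlp("Celui-là est tombé dans le puits.")
print([token.text for token in doc])
# if the default tokenizer split on the hyphen, 'Celui-là' now comes back as one token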
mariastefan committed Jul 14, 2020
1 parent e1c6406 commit 1d08d7c
Showing 22 changed files with 618 additions and 88 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -9,3 +9,5 @@ resolution_coreferences_pronominales/tests/tests-spacy.py
uninstall.sh

resolution_coreferences_pronominales/data/phrases_tmp

resolution_coreferences_pronominales/data/brouillon
2 changes: 1 addition & 1 deletion .gitlab-ci.yml
@@ -8,7 +8,7 @@ job:mon_test:
- apt-get update -qy
- sh install.sh
- python3 ./resolution_coreferences_pronominales/__main__.py
- for f in resolution_coreferences_pronominales/tests/*.py; do if [ $f != resolution_coreferences_pronominales/tests/test-duree_execution.py ]; then python3 "$f"; fi; done
- for f in resolution_coreferences_pronominales/tests/*.py; do if [ $f != resolution_coreferences_pronominales/tests/test-duree_execution.py ] && [ $f != resolution_coreferences_pronominales/tests/test-spacy.py ]; then python3 "$f"; fi; done
- python3 resolution_coreferences_pronominales/tests-regression/test-duree_execution.py 1 1
- sh uninstall.sh

Empty file added models/__init__.py
Empty file.
10 changes: 5 additions & 5 deletions resolution_coreferences_pronominales/__main__.py
@@ -1,14 +1,14 @@
import sys
import os

sys.path.append(".")
from resolution_coreferences_pronominales.coreferences import analyses_texte

if __name__ == '__main__':
    filename = os.path.basename(__file__)
    print('Start ' + filename)
    phrase = "Le chat est tombé dans le puits. Il est profond. Il s'est blessé la patte."
    print(phrase)
    print("\nChargement en cours... \n")
    infos = analyses_texte.informations_pronoms(phrase)
    print(analyses_texte.affichier_antecedents_dans_phrase(phrase, True))
    print('\nRésultat de la fonction analyses_texte.informations_pronoms(phrase) :')
    print(analyses_texte.informations_pronoms(phrase))


    print('Completed : ' + filename)
@@ -1,35 +1,10 @@
from statistics import mean

import spacy
import fr_core_news_sm
from resolution_coreferences_pronominales.coreferences import relations_entre_mots
from spacy.matcher import Matcher


def nlp_loader():
    nlp = fr_core_news_sm.load()
    matcher = Matcher(nlp.vocab)
    matcher.add('HYPHENS', None, [{'IS_ALPHA': True}, {'ORTH': '-', 'OP': '+'}, {'IS_ALPHA': True}])
    liste = ['intelligence artificielle']
    for e in liste:
        zzz = []
        for i in e.split(" "):
            zzz.append({'ORTH': i})
        matcher.add(e, None, zzz)

    def quote_merger(doc):
        # this will be called on the Doc object in the pipeline
        matched_spans = []
        matches = matcher(doc)
        for match_id, start, end in matches:
            span = doc[start:end]
            matched_spans.append(span)
        for span in matched_spans:  # merge into one token after collecting all matches
            span.merge()
        return doc

    nlp.add_pipe(quote_merger, first=True)  # add it right after the tokenizer
    return nlp
from spacy.tokens import Doc
import sys
sys.path.append(".")
from resolution_coreferences_pronominales.custom_model_training import custom_tokenizer

# Takes a sentence and returns information about its pronouns
# All words are lemmatised
@@ -44,7 +19,7 @@ def quote_merger(doc):
def informations_pronoms(phrase: str or spacy.tokens.doc.Doc):
    # We check whether phrase is of type spacy.tokens.doc.Doc to save time (because spacy.load('fr') is slow)
    if isinstance(phrase, str):
        nlp = nlp_loader()
        nlp = custom_tokenizer.nlp_loader()
        doc = nlp(phrase)
    else:
        doc = phrase
@@ -143,7 +118,7 @@ def informations_pronoms(phrase: str or spacy.tokens.doc.Doc):
def coreferences_phrase(phrase: str or spacy.tokens.doc.Doc, cache: bool):
    # We check whether phrase is of type spacy.tokens.doc.Doc to save time (because spacy.load('fr') is slow)
    if isinstance(phrase, str):
        nlp = nlp_loader()
        nlp = custom_tokenizer.nlp_loader()
        phrase = nlp(phrase)
    infos_pronoms = informations_pronoms(phrase)
    coreferences = []
@@ -221,7 +196,7 @@ def coreferences_phrase(phrase: str or spacy.tokens.doc.Doc, cache: bool):


def affichier_antecedents_dans_phrase(phrase: str, cache: bool):
    nlp = nlp_loader()
    nlp = custom_tokenizer.nlp_loader()
    phrase = nlp(phrase)
    coreferences = coreferences_phrase(phrase, cache)
    phrase_antecedents = ''
@@ -0,0 +1,41 @@
import os
import fr_core_news_sm
from spacy.matcher import Matcher

custom_exceptions_list = ['intelligence artificielle', 'pommes de terre']
nlp = fr_core_news_sm.load()
matcher = Matcher(nlp.vocab)
model_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) + '/models/'


def hyphen_tokens_merged():
    matcher.add('HYPHENS', None, [{'IS_ALPHA': True}, {'TEXT': '-'}, {'IS_ALPHA': True}])


def custom_tokenizer_exceptions_list(custom_list):
    for compound_word in custom_list:
        pattern = []
        for word in compound_word.split(" "):
            pattern.append({'TEXT': word})
        matcher.add(compound_word, None, pattern)


def custom_tokenizer_merger(doc):
    # these calls add the match patterns to the matcher
    hyphen_tokens_merged()
    custom_tokenizer_exceptions_list(custom_exceptions_list)

    # this will be called on the Doc object in the pipeline
    matched_spans = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        matched_spans.append(span)
    for span in matched_spans:  # merge into one token after collecting all matches
        span.merge()
    return doc


nlp.add_pipe(custom_tokenizer_merger, first=True) # add it right after the tokenizer
# nlp.factories['custom_tokenizer_merger'] = custom_tokenizer_merger
nlp.to_disk(model_path + 'costom_model_v1')
Empty file.
@@ -0,0 +1,57 @@
{
"intelligence artificielle": {
"lemma" : "intelligence artificielle",
"pos" : "NOUN"
},
"intelligences artificielles": {
"lemma" : "intelligence artificielle",
"pos" : "NOUN"
},
"bande dessinée" : {
"lemma" : "bande dessinée",
"pos" : "NOUN"
},
"bandes dessinées" : {
"lemma" : "bande dessinée",
"pos" : "NOUN"
},
"compte rendu" : {
"lemma" : "compte rendu",
"pos" : "NOUN"
},
"comptes rendus" : {
"lemma" : "compte rendu",
"pos" : "NOUN"
},
"faux sens" : {
"lemma" : "faux sens",
"pos" : "NOUN"
},
"hôtel de ville" : {
"lemma" : "hôtel de ville",
"pos" : "NOUN"
},
"hôtels de ville" : {
"lemma" : "hôtel de ville",
"pos" : "NOUN"
},
"coup de poing" : {
"lemma" : "coup de poing",
"pos" : "NOUN"
},
"coups de poing" : {
"lemma" : "coup de poing",
"pos" : "NOUN"
},
"ce qui" : {
"lemma" : "ce qui",
"pos" : "PRON",
"type_pro" : "relatif"
},
"ceux qui" : {
"lemma" : "ce qui",
"pos" : "PRON",
"type_pro" : "relatif"
}

}
@@ -0,0 +1,58 @@
{
"intelligence artificielle": {
"lemma" : "intelligence artificielle",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Fem|Number=Sing"
},
"intelligences artificielles": {
"lemma" : "intelligence artificielle",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Fem|Number=Plur"
},
"bande dessinée" : {
"lemma" : "bande dessinée",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Fem|Number=Sing"
},
"bandes dessinées" : {
"lemma" : "bande dessinée",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Fem|Number=Plur"
},
"compte rendu" : {
"lemma" : "compte rendu",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Masc|Number=Sing"
},
"comptes rendus" : {
"lemma" : "compte rendu",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Masc|Number=Plur"
},
"faux sens" : {
"lemma" : "faux sens",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Masc"
},
"hôtel de ville" : {
"lemma" : "hôtel de ville",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Masc|Number=Sing"
},
"hôtels de ville" : {
"lemma" : "hôtel de ville",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Masc|Number=Plur"
},
"coup de poing" : {
"lemma" : "coup de poing",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Masc|Number=Sing"
},
"coups de poing" : {
"lemma" : "coup de poing",
"pos" : "NOUN",
"tag" : "NOUN__Gender=Masc|Number=Plur"
}

}
@@ -0,0 +1,73 @@
import fr_core_news_sm
import os
from spacy.matcher import Matcher
import json
from spacy.language import Language
from spacy.tokens import Doc

json_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + \
            '/custom_model_training/custom_model_params/compound_words.json'


def nlp_loader():
    """
    Temporary function that loads the nlp pipeline with the custom tokenizer.
    This will later become a function that creates a new model, and scripts will no longer load the model with
    this function but directly from the new customized model.
    :return: nlp
    """
    nlp = fr_core_news_sm.load()

    class CompoundWordsMerger:
        def __init__(self, words_path):
            # self.model_size = model_size
            self.words_path = words_path

        def __call__(self, doc: Doc):
            # Adding hyphen compound words to the matcher
            matcher = Matcher(nlp.vocab)
            matcher.add('HYPHENS', None, [{'IS_ALPHA': True}, {'TEXT': '-'}, {'IS_ALPHA': True}])

            # Opening the json file containing the information about our custom compound words
            with open(self.words_path) as json_file:
                compound_words = json.load(json_file)

            # Creating a list which will contain the keys of the dictionary in the words_path json file
            # These keys correspond to the custom compound words text
            custom_exceptions_list = []
            for key in compound_words.keys():
                custom_exceptions_list.append(key)

            # Adding the custom compound words from the json file to the matcher
            for compound_word in custom_exceptions_list:
                pattern = []
                for word in compound_word.split(' '):
                    pattern.append({'TEXT': word})
                matcher.add(compound_word, None, pattern)

            # Adding the matches containing the compound words to the doc
            matched_spans = []
            matches = matcher(doc)
            for match_id, start, end in matches:
                span = doc[start:end]
                matched_spans.append(span)
                if str(span) in compound_words.keys():
                    nlp.tokenizer.add_special_case(str(span),
                                                   [{'ORTH': str(span), 'POS': compound_words[str(span)]["pos"]}])
            for span in matched_spans:  # merge into one token after collecting all matches
                span.merge()

            # Adding the custom lemmas for the custom compound words
            for token in doc:
                if ' ' in token.text:
                    if token.text in compound_words.keys():
                        token.lemma_ = compound_words[token.text]["lemma"]

            return doc

    nlp.add_pipe(CompoundWordsMerger(json_path),
                 first=True)  # first=True: add it right after the tokenizer; default: last

    # Adding the custom pipeline to the factories
    Language.factories['CompoundWordsMerger'] = lambda _: CompoundWordsMerger(json_path)
    return nlp
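A small usage sketch of the loader above (hedged: it assumes this new file is the custom_tokenizer module imported in the coreference code earlier in this diff, and spaCy 2.x):

from resolution_coreferences_pronominales.custom_model_training import custom_tokenizer

nlp = custom_tokenizer.nlp_loader()
doc = nlp("L'hôtel de ville est fermé. Celui-là est ouvert.")
print([(token.text, token.lemma_) for token in doc])
# 'hôtel de ville' and 'Celui-là' are intended to come out as single tokens,
# with the lemma for 'hôtel de ville' taken from compound_words.json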