From 01cfa2a66360afdcab4814aafb040343c6db7778 Mon Sep 17 00:00:00 2001 From: cquest Date: Sun, 18 Feb 2018 22:55:30 +0100 Subject: [PATCH 1/3] glue usual words like 'MONT' 'VAL' 'LE' 'LA' 'L' in an additional token --- addok_france/__init__.py | 1 + addok_france/utils.py | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/addok_france/__init__.py b/addok_france/__init__.py index 2b20ccb..42ca4ab 100644 --- a/addok_france/__init__.py +++ b/addok_france/__init__.py @@ -14,6 +14,7 @@ extract_address = yielder(utils.extract_address) glue_ordinal = utils.glue_ordinal fold_ordinal = yielder(utils.fold_ordinal) +fold_words = utils.fold_words flag_housenumber = utils.flag_housenumber make_labels = utils.make_labels remove_leading_zeros = yielder(utils.remove_leading_zeros) diff --git a/addok_france/utils.py b/addok_france/utils.py index 779ab89..b1f4fba 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -127,6 +127,16 @@ def fold_ordinal(s): return s +FOLD_WORDS = ["mont", "val", "le", "la", "l"] + +def fold_words(tokens): + """ folds 'MONT GRIFFON' into 'MONTGRIFFON' """ + for _, token, next_ in neighborhood(tokens): + yield token + if token in FOLD_WORDS and next_ and next_.isalpha() and len(next_)>2: + yield token.update(token+next_) + + def remove_leading_zeros(s): """0003 => 3.""" # Limit digits from 1 to 3 in order to avoid processing postcodes. 
From f6e2e0b2c6a2f039f0f738f2ff7b61b93fcb44d4 Mon Sep 17 00:00:00 2001 From: cquest Date: Wed, 21 Feb 2018 17:13:52 +0100 Subject: [PATCH 2/3] flod_words > glue_words --- addok_france/__init__.py | 2 +- addok_france/utils.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/addok_france/__init__.py b/addok_france/__init__.py index 42ca4ab..10507f5 100644 --- a/addok_france/__init__.py +++ b/addok_france/__init__.py @@ -14,7 +14,7 @@ extract_address = yielder(utils.extract_address) glue_ordinal = utils.glue_ordinal fold_ordinal = yielder(utils.fold_ordinal) -fold_words = utils.fold_words +glue_words = utils.glue_words flag_housenumber = utils.flag_housenumber make_labels = utils.make_labels remove_leading_zeros = yielder(utils.remove_leading_zeros) diff --git a/addok_france/utils.py b/addok_france/utils.py index b1f4fba..a5174e2 100644 --- a/addok_france/utils.py +++ b/addok_france/utils.py @@ -127,13 +127,13 @@ def fold_ordinal(s): return s -FOLD_WORDS = ["mont", "val", "le", "la", "l"] +GLUE_WORDS = ["mont", "val", "le", "la", "l", "champ"] -def fold_words(tokens): - """ folds 'MONT GRIFFON' into 'MONTGRIFFON' """ +def glue_words(tokens): + """Yield each token; after a GLUE_WORDS token followed by an alphabetic token of 3+ chars, also yield both glued ('mont griffon' -> 'montgriffon').""" for _, token, next_ in neighborhood(tokens): yield token - if token in FOLD_WORDS and next_ and next_.isalpha() and len(next_)>2: + if token in GLUE_WORDS and next_ and next_.isalpha() and len(next_)>2: yield token.update(token+next_) From 221584ac16dcd3b2b63728b3c6dab4092f50fa80 Mon Sep 17 00:00:00 2001 From: cquest Date: Wed, 21 Feb 2018 17:40:20 +0100 Subject: [PATCH 3/3] glue_words test --- tests/test_utils.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 969ee91..efc4dc3 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,7 +8,7 @@ from addok.helpers.text import Token from addok_france.utils import (clean_query, extract_address, flag_housenumber, 
fold_ordinal, glue_ordinal, make_labels, - remove_leading_zeros) + remove_leading_zeros, glue_words) @pytest.mark.parametrize("input,expected", [ @@ -331,3 +331,15 @@ def test_make_municipality_labels(config): '59000 Lille', 'Lille 59000', ] + + +@pytest.mark.parametrize("inputs,expected", [ + (['mont', 'griffon'], ['mont', 'montgriffon', 'griffon']), + (['champ', 'vallon'], ['champ', 'champvallon', 'vallon']), + (['val', 'suzon'], ['val', 'valsuzon', 'suzon']), + (['l', 'a', 'peu', 'pres'], ['l', 'a', 'peu', 'pres']), + (['l', 'un', 'des'], ['l', 'un', 'des']), +]) +def test_glue_words(inputs, expected): + tokens = [Token(input_) for input_ in inputs] + assert list(glue_words(tokens)) == expected