From b64acaddd53cdd8e4818ffb3a933bc36dc63f182 Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Tue, 29 Nov 2022 05:30:36 -0800 Subject: [PATCH 1/3] Support unambiguous detection of language if only prefixes are supplied o ceases search as soon as ambiguity is resolved --- src/mnemonic/mnemonic.py | 7 +++++-- tests/test_mnemonic.py | 4 ++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/mnemonic/mnemonic.py b/src/mnemonic/mnemonic.py index 9457f9d..e64921d 100644 --- a/src/mnemonic/mnemonic.py +++ b/src/mnemonic/mnemonic.py @@ -90,13 +90,16 @@ def normalize_string(txt: AnyStr) -> str: @classmethod def detect_language(cls, code: str) -> str: - """Scan the Mnemonic until the language becomes unambiguous.""" + """Scan the Mnemonic until the language becomes unambiguous, including as abbreviation prefixes.""" code = cls.normalize_string(code) possible = set(cls(lang) for lang in cls.list_languages()) for word in code.split(): - possible = set(p for p in possible if word in p.wordlist) + # possible languages have candidate(s) starting with the word/prefix + possible = set(p for p in possible if any(c.startswith( word ) for c in p.wordlist)) if not possible: raise ConfigurationError(f"Language unrecognized for {word!r}") + if len( possible ) < 2: + break if len(possible) == 1: return possible.pop().language raise ConfigurationError( diff --git a/tests/test_mnemonic.py b/tests/test_mnemonic.py index 40785ff..9cc4eb0 100755 --- a/tests/test_mnemonic.py +++ b/tests/test_mnemonic.py @@ -57,6 +57,10 @@ def test_failed_checksum(self) -> None: def test_detection(self) -> None: self.assertEqual("english", Mnemonic.detect_language("security")) + self.assertEqual( "english", Mnemonic.detect_language( "fruit wave dwarf" )) # ambiguous up to wave + self.assertEqual( "english", Mnemonic.detect_language( "fru wago dw" )) # ambiguous french/english up to dwarf prefix + self.assertEqual( "french", Mnemonic.detect_language( "fru wago dur enje" )) # ambiguous french/english up to enjeu prefix + with self.assertRaises(Exception): Mnemonic.detect_language( "jaguar xxxxxxx" From 07a4be4f0221b379efce28ed60717aed3ebdc62a Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Wed, 14 Dec 2022 08:25:57 -0800 Subject: [PATCH 2/3] Simplify success exit criteria for detecting language --- src/mnemonic/mnemonic.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/mnemonic/mnemonic.py b/src/mnemonic/mnemonic.py index e64921d..05db998 100644 --- a/src/mnemonic/mnemonic.py +++ b/src/mnemonic/mnemonic.py @@ -96,12 +96,10 @@ def detect_language(cls, code: str) -> str: for word in code.split(): # possible languages have candidate(s) starting with the word/prefix possible = set(p for p in possible if any(c.startswith( word ) for c in p.wordlist)) + if len(possible) == 1: + return possible.pop().language if not possible: raise ConfigurationError(f"Language unrecognized for {word!r}") - if len( possible ) < 2: - break - if len(possible) == 1: - return possible.pop().language raise ConfigurationError( f"Language ambiguous between {', '.join( p.language for p in possible)}" ) From cf7df0a658ad42d6483d3d90cea6b540d33b95d2 Mon Sep 17 00:00:00 2001 From: Perry Kundert Date: Sun, 12 Nov 2023 07:37:34 -0700 Subject: [PATCH 3/3] Correct language deduction if prefixes remain ambiguous --- src/mnemonic/mnemonic.py | 26 ++++++++++++++++++++++---- tests/test_mnemonic.py | 12 ++++++++++++ 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/src/mnemonic/mnemonic.py b/src/mnemonic/mnemonic.py index 05db998..487bc22 100644 --- a/src/mnemonic/mnemonic.py +++ b/src/mnemonic/mnemonic.py @@ -90,16 +90,34 @@ def normalize_string(txt: AnyStr) -> str: @classmethod def detect_language(cls, code: str) -> str: - """Scan the Mnemonic until the language becomes unambiguous, including as abbreviation prefixes.""" + """Scan the Mnemonic until the language becomes unambiguous, including as abbreviation prefixes. + + Unfortunately, there are valid words that are ambiguous between languages, which are complete words + in one language and are prefixes in another: + + english: abandon ... about + french: abandon ... aboutir + + If prefixes remain ambiguous, require exactly one language where word(s) match exactly. + """ code = cls.normalize_string(code) possible = set(cls(lang) for lang in cls.list_languages()) - for word in code.split(): + words = set(code.split()) + for word in words: # possible languages have candidate(s) starting with the word/prefix possible = set(p for p in possible if any(c.startswith( word ) for c in p.wordlist)) - if len(possible) == 1: - return possible.pop().language if not possible: raise ConfigurationError(f"Language unrecognized for {word!r}") + if len(possible) == 1: + return possible.pop().language + # Multiple languages match: A prefix in many, but an exact match in one determines language. + complete = set() + for word in words: + exact = set(p for p in possible if word in p.wordlist) + if len(exact) == 1: + complete.update(exact) + if len(complete) == 1: + return complete.pop().language raise ConfigurationError( f"Language ambiguous between {', '.join( p.language for p in possible)}" ) diff --git a/tests/test_mnemonic.py b/tests/test_mnemonic.py index 9cc4eb0..9f93630 100755 --- a/tests/test_mnemonic.py +++ b/tests/test_mnemonic.py @@ -71,8 +71,20 @@ def test_detection(self) -> None: "jaguar jaguar" ) # Ambiguous after examining all words + # Allowing word prefixes in language detection presents ambiguity issues. Require exactly + # one language that matches all prefixes, or one language matching some word(s) exactly. self.assertEqual("english", Mnemonic.detect_language("jaguar security")) self.assertEqual("french", Mnemonic.detect_language("jaguar aboyer")) + self.assertEqual("english", Mnemonic.detect_language("abandon about")) + self.assertEqual("french", Mnemonic.detect_language("abandon aboutir")) + self.assertEqual("french", Mnemonic.detect_language("fav financer")) + self.assertEqual("czech", Mnemonic.detect_language("fav finance")) + with self.assertRaises(Exception): + Mnemonic.detect_language("favor finan") + self.assertEqual("czech", Mnemonic.detect_language("flanel")) + self.assertEqual("portuguese", Mnemonic.detect_language("flanela")) + with self.assertRaises(Exception): + Mnemonic.detect_language("flane") def test_utf8_nfkd(self) -> None: # The same sentence in various UTF-8 forms