BUG: Replace unidecode with anyascii

Closes #197
medkit-lib · Dec 11, 2023 · a0831db · a0831db
1 parent 494e97e
commit a0831db
Show file tree

Hide file tree

Showing 5 changed files with 24 additions and 13 deletions.
diff --git a/medkit/text/ner/_base_simstring_matcher.py b/medkit/text/ner/_base_simstring_matcher.py
@@ -14,9 +14,9 @@
 from pathlib import Path
 from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union
 
+from anyascii import anyascii
 from pysimstring import simstring
 from typing_extensions import Literal
-from unidecode import unidecode
 
 try:
     import spacy
@@ -302,7 +302,7 @@ def _find_matches_in_segment(
             # simstring matching is always performed on lowercased ASCII-only text,
             # then for potential matches we will recompute the similarity
             # taking into account the actual rule parameters
-            candidate_text_processed = unidecode(candidate_text.lower())
+            candidate_text_processed = anyascii(candidate_text.lower())
             matched_terms = self._simstring_db_reader.retrieve(candidate_text_processed)
 
             for matched_term in matched_terms:
@@ -328,8 +328,8 @@ def _find_matches_in_segment(
                         candidate_text = candidate_text.lower()
                         rule_term = rule_term.lower()
                     elif not rule.unicode_sensitive:
-                        candidate_text = unidecode(candidate_text)
-                        rule_term = unidecode(rule_term)
+                        candidate_text = anyascii(candidate_text)
+                        rule_term = anyascii(rule_term)
 
                     # ignore blacklisted terms
                     if rule_term in self.blacklist:
@@ -443,7 +443,7 @@ def build_simstring_matcher_databases(
         term_to_match = rule.term
 
         # apply preprocessing
-        term_to_match = unidecode(term_to_match.lower())
+        term_to_match = anyascii(term_to_match.lower())
 
         # add to simstring db
         simstring_db_writer.insert(term_to_match)

diff --git a/medkit/text/ner/umls_utils.py b/medkit/text/ner/umls_utils.py
@@ -15,7 +15,7 @@
 from pathlib import Path
 from typing import Dict, Iterator, List, Optional, Union
 
-import unidecode
+from anyascii import anyascii
 from tqdm import tqdm
 
 SEMGROUP_LABELS = {
@@ -255,7 +255,7 @@ def preprocess_term_to_match(
     if lowercase:
         term = term.lower()
     if normalize_unicode:
-        term = unidecode.unidecode(term)
+        term = anyascii(term)
 
     term = " " + term + " "
     if clean_nos:
@@ -303,7 +303,7 @@ def preprocess_acronym(term: str) -> Optional[str]:
 
     # try to rebuild acronym from expanded form:
     # replace special characters with ASCII
-    expanded = unidecode.unidecode(expanded)
+    expanded = anyascii(expanded)
     # keep only uppercase chars
     acronym_candidate = "".join(c for c in expanded if c.isupper())
     # if it doesn't match the part before the parenthesis

diff --git a/medkit/text/utils/decoding.py b/medkit/text/utils/decoding.py
@@ -2,7 +2,7 @@
 
 import logging
 
-import unidecode
+from anyascii import anyascii
 
 
 def get_ascii_from_unicode(text: str, keep_length: bool = True, logger=None) -> str:
@@ -25,15 +25,15 @@ def get_ascii_from_unicode(text: str, keep_length: bool = True, logger=None) ->
     """
     if logger is None:
         logger = logging.getLogger(__name__)
-    output = unidecode.unidecode(text)
+    output = anyascii(text)
 
     # Verify that text length is conserved
     if keep_length and len(output) != len(text):
         # if text conversion had changed its length, only change characters with same length
         output = ""
         special_chars = set()
         for c in text:
-            cprim = unidecode.unidecode(c)
+            cprim = anyascii(c)
             if len(cprim) == 1:
                 output += cprim
             else:

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -34,7 +34,7 @@ smart-open = "*"
 soundfile = "*"
 tqdm = "*"
 typing-extensions = "*"
-Unidecode = "*"
+anyascii = "^0.3"
 intervaltree = "*"
 wheel = "*"
 iamsystem = ">=0.3"