From a0831dbe00e2e6309cafed82714b17d4b02038e4 Mon Sep 17 00:00:00 2001
From: Ghislain Vaillant <ghislain.vaillant@inria.fr>
Date: Mon, 11 Dec 2023 11:50:03 +0100
Subject: [PATCH] BUG: Replace unidecode with anyascii

Closes #197
---
 medkit/text/ner/_base_simstring_matcher.py | 10 +++++-----
 medkit/text/ner/umls_utils.py              |  6 +++---
 medkit/text/utils/decoding.py              |  6 +++---
 poetry.lock                                | 13 ++++++++++++-
 pyproject.toml                             |  2 +-
 5 files changed, 24 insertions(+), 13 deletions(-)

diff --git a/medkit/text/ner/_base_simstring_matcher.py b/medkit/text/ner/_base_simstring_matcher.py
index a497a23a..9f30fe93 100644
--- a/medkit/text/ner/_base_simstring_matcher.py
+++ b/medkit/text/ner/_base_simstring_matcher.py
@@ -14,9 +14,9 @@
 from pathlib import Path
 from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union
 
+from anyascii import anyascii
 from pysimstring import simstring
 from typing_extensions import Literal
-from unidecode import unidecode
 
 try:
     import spacy
@@ -302,7 +302,7 @@ def _find_matches_in_segment(
             # simstring matching is always performed on lowercased ASCII-only text,
             # then for potential matches we will recompute the similarity
             # taking into account the actual rule parameters
-            candidate_text_processed = unidecode(candidate_text.lower())
+            candidate_text_processed = anyascii(candidate_text.lower())
             matched_terms = self._simstring_db_reader.retrieve(candidate_text_processed)
 
             for matched_term in matched_terms:
@@ -328,8 +328,8 @@ def _find_matches_in_segment(
                         candidate_text = candidate_text.lower()
                         rule_term = rule_term.lower()
                     elif not rule.unicode_sensitive:
-                        candidate_text = unidecode(candidate_text)
-                        rule_term = unidecode(rule_term)
+                        candidate_text = anyascii(candidate_text)
+                        rule_term = anyascii(rule_term)
 
                     # ignore blacklisted terms
                     if rule_term in self.blacklist:
@@ -443,7 +443,7 @@ def build_simstring_matcher_databases(
         term_to_match = rule.term
 
         # apply preprocessing
-        term_to_match = unidecode(term_to_match.lower())
+        term_to_match = anyascii(term_to_match.lower())
 
         # add to simstring db
         simstring_db_writer.insert(term_to_match)
diff --git a/medkit/text/ner/umls_utils.py b/medkit/text/ner/umls_utils.py
index be4dd43a..902d5324 100644
--- a/medkit/text/ner/umls_utils.py
+++ b/medkit/text/ner/umls_utils.py
@@ -15,7 +15,7 @@
 from pathlib import Path
 from typing import Dict, Iterator, List, Optional, Union
 
-import unidecode
+from anyascii import anyascii
 from tqdm import tqdm
 
 SEMGROUP_LABELS = {
@@ -255,7 +255,7 @@ def preprocess_term_to_match(
     if lowercase:
         term = term.lower()
     if normalize_unicode:
-        term = unidecode.unidecode(term)
+        term = anyascii(term)
 
     term = " " + term + " "
     if clean_nos:
@@ -303,7 +303,7 @@ def preprocess_acronym(term: str) -> Optional[str]:
 
     # try to rebuild acronym from expanded form:
     # replace special characters with ASCII
-    expanded = unidecode.unidecode(expanded)
+    expanded = anyascii(expanded)
     # keep only uppercase chars
     acronym_candidate = "".join(c for c in expanded if c.isupper())
     # if it doesn't match the part before the parenthesis
diff --git a/medkit/text/utils/decoding.py b/medkit/text/utils/decoding.py
index aed17b4b..5b7982ad 100644
--- a/medkit/text/utils/decoding.py
+++ b/medkit/text/utils/decoding.py
@@ -2,7 +2,7 @@
 
 import logging
 
-import unidecode
+from anyascii import anyascii
 
 
 def get_ascii_from_unicode(text: str, keep_length: bool = True, logger=None) -> str:
@@ -25,7 +25,7 @@ def get_ascii_from_unicode(text: str, keep_length: bool = True, logger=None) ->
     """
     if logger is None:
         logger = logging.getLogger(__name__)
-    output = unidecode.unidecode(text)
+    output = anyascii(text)
 
     # Verify that text length is conserved
     if keep_length and len(output) != len(text):
@@ -33,7 +33,7 @@ def get_ascii_from_unicode(text: str, keep_length: bool = True, logger=None) ->
         output = ""
         special_chars = set()
         for c in text:
-            cprim = unidecode.unidecode(c)
+            cprim = anyascii(c)
             if len(cprim) == 1:
                 output += cprim
             else:
diff --git a/poetry.lock b/poetry.lock
index 55451ef3..450c5373 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -152,6 +152,17 @@ files = [
     {file = "antlr4-python3-runtime-4.9.3.tar.gz", hash = "sha256:f224469b4168294902bb1efa80a8bf7855f24c99aef99cbefc1bcd3cce77881b"},
 ]
 
+[[package]]
+name = "anyascii"
+version = "0.3.2"
+description = "Unicode to ASCII transliteration"
+optional = false
+python-versions = ">=3.3"
+files = [
+    {file = "anyascii-0.3.2-py3-none-any.whl", hash = "sha256:3b3beef6fc43d9036d3b0529050b0c48bfad8bc960e9e562d7223cfb94fe45d4"},
+    {file = "anyascii-0.3.2.tar.gz", hash = "sha256:9d5d32ef844fe225b8bc7cba7f950534fae4da27a9bf3a6bea2cb0ea46ce4730"},
+]
+
 [[package]]
 name = "anyio"
 version = "4.0.0"
@@ -7861,4 +7872,4 @@ webrtc-voice-detector = ["webrtcvad"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8, <4.0"
-content-hash = "2dc0f51f17c7dc0348a7d286fb02a0a4f5e6d96aab29f016654ee32a6cddfd2c"
+content-hash = "495fc0565da731cf2ae0e7ee05dd166460d130fdb4f02e5fa5f77377f21ded42"
diff --git a/pyproject.toml b/pyproject.toml
index 9932d9de..9a9f169b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,7 @@ smart-open = "*"
 soundfile = "*"
 tqdm = "*"
 typing-extensions = "*"
-Unidecode = "*"
+anyascii = "^0.3"
 intervaltree = "*"
 wheel = "*"
 iamsystem = ">=0.3"