From a0831dbe00e2e6309cafed82714b17d4b02038e4 Mon Sep 17 00:00:00 2001 From: Ghislain Vaillant Date: Mon, 11 Dec 2023 11:50:03 +0100 Subject: [PATCH] BUG: Replace unidecode by anyascii Closes #197 --- medkit/text/ner/_base_simstring_matcher.py | 10 +++++----- medkit/text/ner/umls_utils.py | 6 +++--- medkit/text/utils/decoding.py | 6 +++--- poetry.lock | 13 ++++++++++++- pyproject.toml | 2 +- 5 files changed, 24 insertions(+), 13 deletions(-) diff --git a/medkit/text/ner/_base_simstring_matcher.py b/medkit/text/ner/_base_simstring_matcher.py index a497a23a..9f30fe93 100644 --- a/medkit/text/ner/_base_simstring_matcher.py +++ b/medkit/text/ner/_base_simstring_matcher.py @@ -14,9 +14,9 @@ from pathlib import Path from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union +from anyascii import anyascii from pysimstring import simstring from typing_extensions import Literal -from unidecode import unidecode try: import spacy @@ -302,7 +302,7 @@ def _find_matches_in_segment( # simstring matching is always performed on lowercased ASCII-only text, # then for potential matches we will recompute the similarity # taking into account the actual rule parameters - candidate_text_processed = unidecode(candidate_text.lower()) + candidate_text_processed = anyascii(candidate_text.lower()) matched_terms = self._simstring_db_reader.retrieve(candidate_text_processed) for matched_term in matched_terms: @@ -328,8 +328,8 @@ def _find_matches_in_segment( candidate_text = candidate_text.lower() rule_term = rule_term.lower() elif not rule.unicode_sensitive: - candidate_text = unidecode(candidate_text) - rule_term = unidecode(rule_term) + candidate_text = anyascii(candidate_text) + rule_term = anyascii(rule_term) # ignore blacklisted terms if rule_term in self.blacklist: @@ -443,7 +443,7 @@ def build_simstring_matcher_databases( term_to_match = rule.term # apply preprocessing - term_to_match = unidecode(term_to_match.lower()) + term_to_match = anyascii(term_to_match.lower()) # add to simstring db simstring_db_writer.insert(term_to_match) diff --git a/medkit/text/ner/umls_utils.py b/medkit/text/ner/umls_utils.py index be4dd43a..902d5324 100644 --- a/medkit/text/ner/umls_utils.py +++ b/medkit/text/ner/umls_utils.py @@ -15,7 +15,7 @@ from pathlib import Path from typing import Dict, Iterator, List, Optional, Union -import unidecode +from anyascii import anyascii from tqdm import tqdm SEMGROUP_LABELS = { @@ -255,7 +255,7 @@ def preprocess_term_to_match( if lowercase: term = term.lower() if normalize_unicode: - term = unidecode.unidecode(term) + term = anyascii(term) term = " " + term + " " if clean_nos: @@ -303,7 +303,7 @@ def preprocess_acronym(term: str) -> Optional[str]: # try to rebuild acronym from expanded form: # replace special characters with ASCII - expanded = unidecode.unidecode(expanded) + expanded = anyascii(expanded) # keep only uppercase chars acronym_candidate = "".join(c for c in expanded if c.isupper()) # if it doesn't match the part before the parenthesis diff --git a/medkit/text/utils/decoding.py b/medkit/text/utils/decoding.py index aed17b4b..5b7982ad 100644 --- a/medkit/text/utils/decoding.py +++ b/medkit/text/utils/decoding.py @@ -2,7 +2,7 @@ import logging -import unidecode +from anyascii import anyascii def get_ascii_from_unicode(text: str, keep_length: bool = True, logger=None) -> str: @@ -25,7 +25,7 @@ def get_ascii_from_unicode(text: str, keep_length: bool = True, logger=None) -> """ if logger is None: logger = logging.getLogger(__name__) - output = unidecode.unidecode(text) + output = anyascii(text) # Verify that text length is conserved if keep_length and len(output) != len(text): @@ -33,7 +33,7 @@ def get_ascii_from_unicode(text: str, keep_length: bool = True, logger=None) -> output = "" special_chars = set() for c in text: - cprim = unidecode.unidecode(c) + cprim = anyascii(c) if len(cprim) == 1: output += cprim else: diff --git a/poetry.lock b/poetry.lock index 55451ef3..450c5373 100644 --- a/poetry.lock +++ b/poetry.lock @@ -152,6 +152,17 @@ files = [ {file = "antlr4-python3-runtime-4.9.3.tar.gz", hash = "sha256:f224469b4168294902bb1efa80a8bf7855f24c99aef99cbefc1bcd3cce77881b"}, ] +[[package]] +name = "anyascii" +version = "0.3.2" +description = "Unicode to ASCII transliteration" +optional = false +python-versions = ">=3.3" +files = [ + {file = "anyascii-0.3.2-py3-none-any.whl", hash = "sha256:3b3beef6fc43d9036d3b0529050b0c48bfad8bc960e9e562d7223cfb94fe45d4"}, + {file = "anyascii-0.3.2.tar.gz", hash = "sha256:9d5d32ef844fe225b8bc7cba7f950534fae4da27a9bf3a6bea2cb0ea46ce4730"}, +] + [[package]] name = "anyio" version = "4.0.0" @@ -7861,4 +7872,4 @@ webrtc-voice-detector = ["webrtcvad"] [metadata] lock-version = "2.0" python-versions = ">=3.8, <4.0" -content-hash = "2dc0f51f17c7dc0348a7d286fb02a0a4f5e6d96aab29f016654ee32a6cddfd2c" +content-hash = "495fc0565da731cf2ae0e7ee05dd166460d130fdb4f02e5fa5f77377f21ded42" diff --git a/pyproject.toml b/pyproject.toml index 9932d9de..9a9f169b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,7 @@ smart-open = "*" soundfile = "*" tqdm = "*" typing-extensions = "*" -Unidecode = "*" +anyascii = "^0.3" intervaltree = "*" wheel = "*" iamsystem = ">=0.3"