Skip to content

Commit

Permalink
Merge branch 'bug/s-unidecode-anyascii' into 'main'
Browse files Browse the repository at this point in the history
BUG: Replace unidecode by anyascii

Closes #197

See merge request heka/medkit!254

changelog: BUG: Replace unidecode by anyascii
  • Loading branch information
ghisvail committed Dec 11, 2023
2 parents 494e97e + a0831db commit eaa6ad0
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 13 deletions.
10 changes: 5 additions & 5 deletions medkit/text/ner/_base_simstring_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@
from pathlib import Path
from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union

from anyascii import anyascii
from pysimstring import simstring
from typing_extensions import Literal
from unidecode import unidecode

try:
import spacy
Expand Down Expand Up @@ -302,7 +302,7 @@ def _find_matches_in_segment(
# simstring matching is always performed on lowercased, ASCII-only text;
# for each potential match we then recompute the similarity,
# taking into account the actual rule parameters
candidate_text_processed = unidecode(candidate_text.lower())
candidate_text_processed = anyascii(candidate_text.lower())
matched_terms = self._simstring_db_reader.retrieve(candidate_text_processed)

for matched_term in matched_terms:
Expand All @@ -328,8 +328,8 @@ def _find_matches_in_segment(
candidate_text = candidate_text.lower()
rule_term = rule_term.lower()
elif not rule.unicode_sensitive:
candidate_text = unidecode(candidate_text)
rule_term = unidecode(rule_term)
candidate_text = anyascii(candidate_text)
rule_term = anyascii(rule_term)

# ignore blacklisted terms
if rule_term in self.blacklist:
Expand Down Expand Up @@ -443,7 +443,7 @@ def build_simstring_matcher_databases(
term_to_match = rule.term

# apply preprocessing
term_to_match = unidecode(term_to_match.lower())
term_to_match = anyascii(term_to_match.lower())

# add to simstring db
simstring_db_writer.insert(term_to_match)
Expand Down
6 changes: 3 additions & 3 deletions medkit/text/ner/umls_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from pathlib import Path
from typing import Dict, Iterator, List, Optional, Union

import unidecode
from anyascii import anyascii
from tqdm import tqdm

SEMGROUP_LABELS = {
Expand Down Expand Up @@ -255,7 +255,7 @@ def preprocess_term_to_match(
if lowercase:
term = term.lower()
if normalize_unicode:
term = unidecode.unidecode(term)
term = anyascii(term)

term = " " + term + " "
if clean_nos:
Expand Down Expand Up @@ -303,7 +303,7 @@ def preprocess_acronym(term: str) -> Optional[str]:

# try to rebuild acronym from expanded form:
# replace special characters with ASCII
expanded = unidecode.unidecode(expanded)
expanded = anyascii(expanded)
# keep only uppercase chars
acronym_candidate = "".join(c for c in expanded if c.isupper())
# if it doesn't match the part before the parenthesis
Expand Down
6 changes: 3 additions & 3 deletions medkit/text/utils/decoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import logging

import unidecode
from anyascii import anyascii


def get_ascii_from_unicode(text: str, keep_length: bool = True, logger=None) -> str:
Expand All @@ -25,15 +25,15 @@ def get_ascii_from_unicode(text: str, keep_length: bool = True, logger=None) ->
"""
if logger is None:
logger = logging.getLogger(__name__)
output = unidecode.unidecode(text)
output = anyascii(text)

# Verify that text length is conserved
if keep_length and len(output) != len(text):
# if the conversion changed the text length, rebuild the output character by character, only replacing characters whose ASCII form is exactly one character long
output = ""
special_chars = set()
for c in text:
cprim = unidecode.unidecode(c)
cprim = anyascii(c)
if len(cprim) == 1:
output += cprim
else:
Expand Down
13 changes: 12 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ smart-open = "*"
soundfile = "*"
tqdm = "*"
typing-extensions = "*"
Unidecode = "*"
anyascii = "^0.3"
intervaltree = "*"
wheel = "*"
iamsystem = ">=0.3"
Expand Down

0 comments on commit eaa6ad0

Please sign in to comment.