Skip to content

Commit

Permalink
BUG: Reduce memory overhead in simstring db
Browse files (browse the repository at this point in the history)
Closes #52
  • Loading branch information
ghisvail committed Jul 11, 2024
1 parent 13a7830 commit c58e37a
Showing 1 changed file with 18 additions and 28 deletions.
46 changes: 18 additions & 28 deletions medkit/text/ner/_base_simstring_matcher.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -7,6 +7,7 @@
"build_simstring_matcher_databases",
]

import collections
import dataclasses
import math
import re
Expand Down Expand Up @@ -385,35 +386,24 @@ def build_simstring_matcher_databases(
rules : iterable of BaseSimstringMatcherRule
Rules to add to databases
"""
# the params passed to simstring.writer are copy/pasted from QuickUMLS
# cf https://github.com/Georgetown-IR-Lab/QuickUMLS/blob/1.4.0/quickumls/toolbox.py#L173
simstring_db_writer = simstring.writer(
str(simstring_db_file),
3, # unit of character n-grams
False, # represent begin and end of strings in n-grams
True, # use unicode mode
)

# writeback=True needed because we are updating the values in the mapping,
# not just writing
rules_db = shelve.open(str(rules_db_file), flag="n", writeback=True) # noqa: S301

# add rules to databases
# Prepare rules mapping for persistence, as:
# term -> list of rules
rules_mapping = collections.defaultdict(list)
for rule in rules:
term_to_match = rule.term

# apply preprocessing
term_to_match = anyascii(term_to_match.lower())

# add to simstring db
simstring_db_writer.insert(term_to_match)
# add to rules db
if term_to_match not in rules_db:
rules_db[term_to_match] = []
rules_db[term_to_match].append(rule)
simstring_db_writer.close()
rules_db.sync()
rules_db.close()
term = anyascii(rule.term.lower())
rules_mapping[term].append(rule)

# Persist rules mapping in new shelf.
with shelve.open(str(rules_db_file), flag="n") as rules_db: # noqa: S301
rules_db.update(rules_mapping)

# Update simstring db with terms in rules mapping.
# The simstring.writer parameters are taken from QuickUMLS,
# see https://github.com/Georgetown-IR-Lab/QuickUMLS/blob/1.4.0/quickumls/toolbox.py#L169.
simstring_db = simstring.writer(str(simstring_db_file), n=3, be=False, unicode=True)
for term in rules_mapping:
simstring_db.insert(term)
simstring_db.close()


_TOKENIZATION_PATTERN = re.compile(r"\w+|[^\w ]")
Expand Down

0 comments on commit c58e37a

Please sign in to comment.