Skip to content

Commit

Permalink
Added qualifier attr to Synonymizer (#317)
Browse files Browse the repository at this point in the history
* Added `qualifier` attr to Synonymizer

* Added docstring and made return types more precise

* added more docstring

* re made python file

* add test and tweak a slight bit of lexmatch code

* linted

* all transforms centrally through apply_transformer

* qualifier outcome determines predicate

* synonymizer returning results properly

* added check for qualifier not being None

* no need for pred condition

* Converter.data = Converter.prefix_map

* undo Converter.data = Converter.prefix_map

* removed `qualifier` from rules model

* poetry update

* removed qualifier

* removed qualifier

* adjust for new `curies`

* reinstated `qualifier` after discussion with Chris

* new rule emerged due to tox update

* added test for `other` in label

* corrected test

* formatted

* update workflow versions
  • Loading branch information
hrshdhgd authored Nov 4, 2022
1 parent a47a28e commit 3f5759e
Show file tree
Hide file tree
Showing 8 changed files with 139 additions and 22 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ jobs:
matrix:
python-version: [ '3.9', '3.10' ]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v4.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand Down
15 changes: 12 additions & 3 deletions src/oaklib/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
from oaklib import datamodels
from oaklib.converters.logical_definition_flattener import LogicalDefinitionFlattener
from oaklib.datamodels.cross_ontology_diff import DiffCategory
from oaklib.datamodels.lexical_index import LexicalTransformation, TransformationType
from oaklib.datamodels.obograph import PrefixDeclaration
from oaklib.datamodels.search import create_search_configuration
from oaklib.datamodels.text_annotator import TextAnnotationConfiguration
Expand Down Expand Up @@ -115,8 +116,9 @@
from oaklib.utilities.iterator_utils import chunk
from oaklib.utilities.kgcl_utilities import generate_change_id
from oaklib.utilities.lexical.lexical_indexer import (
DEFAULT_QUALIFIER,
add_labels_from_uris,
apply_synonymizer,
apply_transformation,
create_lexical_index,
lexical_index_to_sssom,
load_lexical_index,
Expand Down Expand Up @@ -4068,18 +4070,25 @@ def synonymize(terms, rules_file, apply_patch, patch, output):
# matches.extend([x for x in aliases if re.search(eval(rule.match), x) is not None])
for alias in aliases:
if alias:
synonymized, new_alias = apply_synonymizer(alias, syn_rules)
synonymized, new_alias, qualifier = apply_transformation(
alias,
LexicalTransformation(
TransformationType.Synonymization, params=syn_rules
),
)
if synonymized:
matches.append(new_alias)

if len(matches) > 0:
if qualifier is None or qualifier == "":
qualifier = DEFAULT_QUALIFIER
terms_to_synonymize[curie] = matches
change = kgcl.NewSynonym(
id="kgcl_change_id_" + str(len(terms_to_synonymize)),
about_node=curie,
old_value=alias,
new_value=new_alias,
qualifier="exact",
qualifier=qualifier,
)
change_list.append(change)
if patch:
Expand Down
15 changes: 14 additions & 1 deletion src/oaklib/datamodels/mapping_rules_datamodel.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Auto generated from mapping_rules_datamodel.yaml by pythongen.py version: 0.9.0
# Generation date: 2022-09-21T23:32:26
# Generation date: 2022-10-25T17:59:49
# Schema: mapping-rules
#
# id: https://w3id.org/linkml/mapping_rules_datamodel
Expand Down Expand Up @@ -256,6 +256,7 @@ class Synonymizer(YAMLRoot):
match: Optional[str] = None
match_scope: Optional[str] = None
replacement: Optional[str] = None
qualifier: Optional[str] = None

def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
if self.the_rule is not None and not isinstance(self.the_rule, str):
Expand All @@ -270,6 +271,9 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
if self.replacement is not None and not isinstance(self.replacement, str):
self.replacement = str(self.replacement)

if self.qualifier is not None and not isinstance(self.qualifier, str):
self.qualifier = str(self.qualifier)

super().__post_init__(**kwargs)


Expand Down Expand Up @@ -698,6 +702,15 @@ class slots:
range=Optional[str],
)

slots.synonymizer__qualifier = Slot(
uri=MRULES.qualifier,
name="synonymizer__qualifier",
curie=MRULES.curie("qualifier"),
model_uri=MRULES.synonymizer__qualifier,
domain=None,
range=Optional[str],
)

slots.lexicalIndex__groupings = Slot(
uri=MRULES.groupings,
name="lexicalIndex__groupings",
Expand Down
4 changes: 3 additions & 1 deletion src/oaklib/datamodels/mapping_rules_datamodel.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -109,4 +109,6 @@ classes:
replacement:
description: Reg-ex rule to replace substrings in labels
range: string

qualifier:
description: Type of match for the new synonym generated.
range: string
59 changes: 47 additions & 12 deletions src/oaklib/utilities/lexical/lexical_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@
from oaklib.utilities.basic_utils import pairs_as_dict

LEXICAL_INDEX_FORMATS = ["yaml", "json"]
DEFAULT_QUALIFIER = "exact"
QUALIFIER_DICT = {
"exact": "oio:hasExactSynonym",
"broad": "oio:hasBroadSynonym",
"narrow": "oio:hasNarrowSynonym",
"related": "oio:hasRelatedSynonym",
}


def add_labels_from_uris(oi: BasicOntologyInterface):
Expand Down Expand Up @@ -124,7 +131,10 @@ def create_lexical_index(
term2 = term
for tr in pipeline.transformations:
if tr.type.code == TransformationType.Synonymization:
synonymized, term2 = apply_transformation(term2, tr)
synonymized, term2, qualifier = apply_transformation(term2, tr)
if qualifier != DEFAULT_QUALIFIER and qualifier is not None:
pred = QUALIFIER_DICT[qualifier]

else:
term2 = apply_transformation(term2, tr)

Expand Down Expand Up @@ -384,13 +394,15 @@ def precondition_holds(precondition: Precondition, mapping: Mapping) -> bool:
return True


def apply_transformation(term: str, transformation: LexicalTransformation) -> str:
def apply_transformation(
term: str, transformation: LexicalTransformation
) -> Union[str, List[Tuple[bool, str, str]]]:
"""
Apply an individual transformation on a term
:param term:
:param transformation:
:return:
:param term: Original label.
:param transformation: Type of transformation to be performed on the label.
:return: Transformed label.
"""
typ = str(transformation.type)
logging.debug(f"Applying: {transformation}")
Expand All @@ -399,21 +411,44 @@ def apply_transformation(term: str, transformation: LexicalTransformation) -> st
elif typ == TransformationType.WhitespaceNormalization.text:
return re.sub(" {2,}", " ", term.strip())
elif typ == TransformationType.Synonymization.text:
return apply_synonymizer(term, eval(transformation.params))
synonymized_results = apply_synonymizer(term, eval(transformation.params))
true_results = [x for x in list(synonymized_results) if x[0] is True]
if len(true_results) > 0:
return true_results[-1]
else:
return (False, term, DEFAULT_QUALIFIER)
else:
raise NotImplementedError(
f"Transformation Type {typ} {type(typ)} not implemented {TransformationType.CaseNormalization.text}"
)


def apply_synonymizer(term: str, rules: List[Synonymizer]) -> Tuple[bool, str, str]:
    """Apply synonymizer rules declared in the given match-rules.yaml file.

    Each rule's regular expression (``rule.match``) is searched for in the
    label and matching substrings are replaced with ``rule.replacement``.
    The rule's qualifier states what kind of synonym the replacement
    produces — e.g. 'exact', 'broad', 'narrow', or 'related'.

    Note: despite the annotation, this function is a *generator* — it yields
    one intermediate result per rule rather than a single final result.
    Callers that want the fully synonymized label should take the last
    yielded tuple whose first element is True; if no rule matched the term,
    every yielded tuple has False as its first element and the (unchanged)
    term as its second.

    :param term: Original label.
    :param rules: Synonymizer rules from a match-rules.yaml file.
    :yield: A tuple of (whether the label changed, new label, qualifier).
    """
    for rule in rules:
        before = term
        # NOTE(review): rule.match is eval()'d as a Python expression to obtain
        # the regex (patterns are stored as "r'...'" strings in the rules file).
        # This is unsafe on untrusted rule files — confirm rules are always
        # trusted, or parse the pattern without eval.
        term = re.sub(eval(rule.match), rule.replacement, term)
        if before != term:
            yield True, term.strip(), rule.qualifier
        else:
            yield False, term.strip(), rule.qualifier


def save_mapping_rules(mapping_rules: MappingRuleCollection, path: str):
Expand Down
11 changes: 11 additions & 0 deletions tests/input/foo_bar.obo
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
format-version: 1.2
ontology: http://www.semanticweb.org/hhegde/ontologies/2022/9/foo_bar

[Term]
id: ABCD:1
name: other foo bar

[Term]
id: ABCD:2
name: foo bar

52 changes: 49 additions & 3 deletions tests/test_utilities/test_lexical_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
)
from oaklib.datamodels.mapping_rules_datamodel import Synonymizer
from oaklib.implementations.pronto.pronto_implementation import ProntoImplementation
from oaklib.implementations.simpleobo.simple_obo_implementation import (
SimpleOboImplementation,
)
from oaklib.resource import OntologyResource
from oaklib.utilities.lexical.lexical_indexer import (
create_lexical_index,
Expand Down Expand Up @@ -47,6 +50,8 @@ def test_pipelines(self):
builder.add_class("X:3", "foo bar")
builder.add_class("X:4", "foo bar (foo bar)")
builder.add_class("X:5", "foo bar [foo bar]")
builder.add_class("X:6", "Other foo bar")
builder.add_class("X:7", "Other (FOO) [bar] foo bar")
builder.build()
syn_param = [
Synonymizer(
Expand All @@ -56,11 +61,18 @@ def test_pipelines(self):
replacement="",
),
Synonymizer(
the_rule="Remove parentheses bound info from the label.",
the_rule="Remove box brackets bound info from the label.",
match="r'\[[^)]*\]'", # noqa W605
match_scope="*",
replacement="",
),
Synonymizer(
the_rule="Broad match terms with the term 'other' in them.",
match="r'(?i)^Other '", # noqa W605
match_scope="*",
replacement="",
qualifier="broad",
),
]

case_norm = LexicalTransformation(TransformationType.CaseNormalization)
Expand All @@ -74,6 +86,8 @@ def test_pipelines(self):
"foo bar": ["X:1", "X:2", "X:3"],
"foo bar (foo bar)": ["X:4"],
"foo bar [foo bar]": ["X:5"],
"other foo bar": ["X:6"],
"other (foo) [bar] foo bar": ["X:7"],
},
),
(
Expand All @@ -83,6 +97,8 @@ def test_pipelines(self):
"foo bar": ["X:3"],
"foo bar (foo bar)": ["X:4"],
"foo bar [foo bar]": ["X:5"],
"other foo bar": ["X:6"],
"other (foo) [bar] foo bar": ["X:7"],
},
),
(
Expand All @@ -92,15 +108,21 @@ def test_pipelines(self):
"FOO BAR": ["X:2"],
"foo bar (foo bar)": ["X:4"],
"foo bar [foo bar]": ["X:5"],
"Other foo bar": ["X:6"],
"Other (FOO) [bar] foo bar": ["X:7"],
},
),
(
[synonymization],
{"FOO BAR": ["X:2"], "foo bar": ["X:3"], "foo bar": ["X:1", "X:4", "X:5"]},
{
"FOO BAR": ["X:2"],
"foo bar": ["X:3"],
"foo bar": ["X:1", "X:4", "X:5", "X:6", "X:7"],
},
),
(
[case_norm, whitespace_norm, synonymization],
{"foo bar": ["X:1", "X:2", "X:3", "X:4", "X:5"]},
{"foo bar": ["X:1", "X:2", "X:3", "X:4", "X:5", "X:6", "X:7"]},
),
]

Expand Down Expand Up @@ -129,3 +151,27 @@ def test_pipelines(self):

def test_save(self):
save_lexical_index(self.lexical_index, TEST_OUT)

def test_synonymizer_with_other(self):
    """Test that a 'broad'-qualified rule on labels starting with 'other' yields oio:hasBroadSynonym."""
    resource = OntologyResource(slug="foo_bar.obo", directory=INPUT_DIR, local=True)
    oi = SimpleOboImplementation(resource)
    syn_param = [
        Synonymizer(
            the_rule="Broad match terms with the term 'other' in them.",
            match="r'(?i)^Other '",  # noqa W605
            match_scope="*",
            replacement="",
            qualifier="broad",
        ),
    ]
    synonymization = LexicalTransformation(TransformationType.Synonymization, params=syn_param)
    pipelines = [
        LexicalTransformationPipeline(name="test_other", transformations=synonymization)
    ]
    lexical_index = create_lexical_index(oi, pipelines=pipelines, synonym_rules=syn_param)

    for _, v in lexical_index.groupings.items():
        relation = [x for x in v.relationships if x.synonymized is True]
        # Bug fix: the original `self.assertTrue(len(relation), 1)` never
        # asserted the count — assertTrue's second argument is the failure
        # *message*, so any non-empty list passed. Assert the count explicitly.
        self.assertEqual(len(relation), 1)
        self.assertEqual(relation[0].predicate, "oio:hasBroadSynonym")
1 change: 1 addition & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ ignore =
N803 # math-oriented classes can ignore this (e.g. hypergeometric.py)
N806 # math-oriented classes can ignore this (e.g. hypergeometric.py)
B019

max-line-length = 120
max-complexity = 13
import-order-style = pycharm
Expand Down

0 comments on commit 3f5759e

Please sign in to comment.