From 3f5759ea7b9859d789ebad97b63e6a1ef1e40ffc Mon Sep 17 00:00:00 2001 From: Harshad Date: Fri, 4 Nov 2022 14:46:32 -0500 Subject: [PATCH] Added qualifier attr to Synonymizer (#317) * Added `qualifier` attr to Synonymizer * Added docstring and made return types more precise * added more docstring * re made python file * add test and tweak a slight bit of lexmatch code * linted * all transforms centrally through apply_transformer * qualifier outcome determines predicate * synonymizer returning results properly * added check for qualifier not being None * no need for pred condition * Converter.data = Converter.prefix_map * undo Converter.data = Converter.prefix_map * removed `qualifier` from rules model * poetry update * removed qualifier * removed qualifier * adjust for new `curies` * reinstated `qualifier` after discussion with Chris * new rule emerged due to tox update * added test for `other` in label * corrected test * formatted * update workflow versions --- .github/workflows/main.yaml | 4 +- src/oaklib/cli.py | 15 ++++- .../datamodels/mapping_rules_datamodel.py | 15 ++++- .../datamodels/mapping_rules_datamodel.yaml | 4 +- .../utilities/lexical/lexical_indexer.py | 59 +++++++++++++++---- tests/input/foo_bar.obo | 11 ++++ tests/test_utilities/test_lexical_index.py | 52 +++++++++++++++- tox.ini | 1 + 8 files changed, 139 insertions(+), 22 deletions(-) create mode 100644 tests/input/foo_bar.obo diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index cef8cfd48..1c1199cf4 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -17,9 +17,9 @@ jobs: matrix: python-version: [ '3.9', '3.10' ] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4.3.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/src/oaklib/cli.py b/src/oaklib/cli.py index 
39564054a..76339c9f9 100644 --- a/src/oaklib/cli.py +++ b/src/oaklib/cli.py @@ -48,6 +48,7 @@ from oaklib import datamodels from oaklib.converters.logical_definition_flattener import LogicalDefinitionFlattener from oaklib.datamodels.cross_ontology_diff import DiffCategory +from oaklib.datamodels.lexical_index import LexicalTransformation, TransformationType from oaklib.datamodels.obograph import PrefixDeclaration from oaklib.datamodels.search import create_search_configuration from oaklib.datamodels.text_annotator import TextAnnotationConfiguration @@ -115,8 +116,9 @@ from oaklib.utilities.iterator_utils import chunk from oaklib.utilities.kgcl_utilities import generate_change_id from oaklib.utilities.lexical.lexical_indexer import ( + DEFAULT_QUALIFIER, add_labels_from_uris, - apply_synonymizer, + apply_transformation, create_lexical_index, lexical_index_to_sssom, load_lexical_index, @@ -4068,18 +4070,25 @@ def synonymize(terms, rules_file, apply_patch, patch, output): # matches.extend([x for x in aliases if re.search(eval(rule.match), x) is not None]) for alias in aliases: if alias: - synonymized, new_alias = apply_synonymizer(alias, syn_rules) + synonymized, new_alias, qualifier = apply_transformation( + alias, + LexicalTransformation( + TransformationType.Synonymization, params=syn_rules + ), + ) if synonymized: matches.append(new_alias) if len(matches) > 0: + if qualifier is None or qualifier == "": + qualifier = DEFAULT_QUALIFIER terms_to_synonymize[curie] = matches change = kgcl.NewSynonym( id="kgcl_change_id_" + str(len(terms_to_synonymize)), about_node=curie, old_value=alias, new_value=new_alias, - qualifier="exact", + qualifier=qualifier, ) change_list.append(change) if patch: diff --git a/src/oaklib/datamodels/mapping_rules_datamodel.py b/src/oaklib/datamodels/mapping_rules_datamodel.py index 550e1ed45..09f6821b4 100644 --- a/src/oaklib/datamodels/mapping_rules_datamodel.py +++ b/src/oaklib/datamodels/mapping_rules_datamodel.py @@ -1,5 +1,5 @@ # Auto 
generated from mapping_rules_datamodel.yaml by pythongen.py version: 0.9.0 -# Generation date: 2022-09-21T23:32:26 +# Generation date: 2022-10-25T17:59:49 # Schema: mapping-rules # # id: https://w3id.org/linkml/mapping_rules_datamodel @@ -256,6 +256,7 @@ class Synonymizer(YAMLRoot): match: Optional[str] = None match_scope: Optional[str] = None replacement: Optional[str] = None + qualifier: Optional[str] = None def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): if self.the_rule is not None and not isinstance(self.the_rule, str): @@ -270,6 +271,9 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]): if self.replacement is not None and not isinstance(self.replacement, str): self.replacement = str(self.replacement) + if self.qualifier is not None and not isinstance(self.qualifier, str): + self.qualifier = str(self.qualifier) + super().__post_init__(**kwargs) @@ -698,6 +702,15 @@ class slots: range=Optional[str], ) +slots.synonymizer__qualifier = Slot( + uri=MRULES.qualifier, + name="synonymizer__qualifier", + curie=MRULES.curie("qualifier"), + model_uri=MRULES.synonymizer__qualifier, + domain=None, + range=Optional[str], +) + slots.lexicalIndex__groupings = Slot( uri=MRULES.groupings, name="lexicalIndex__groupings", diff --git a/src/oaklib/datamodels/mapping_rules_datamodel.yaml b/src/oaklib/datamodels/mapping_rules_datamodel.yaml index 60546ec8e..854a04ce1 100644 --- a/src/oaklib/datamodels/mapping_rules_datamodel.yaml +++ b/src/oaklib/datamodels/mapping_rules_datamodel.yaml @@ -109,4 +109,6 @@ classes: replacement: description: Reg-ex rule to replace substrings in labels range: string - \ No newline at end of file + qualifier: + description: Type of match for the new synonym generated. 
+ range: string \ No newline at end of file diff --git a/src/oaklib/utilities/lexical/lexical_indexer.py b/src/oaklib/utilities/lexical/lexical_indexer.py index 8edccdfa2..6aa6d8a31 100644 --- a/src/oaklib/utilities/lexical/lexical_indexer.py +++ b/src/oaklib/utilities/lexical/lexical_indexer.py @@ -46,6 +46,13 @@ from oaklib.utilities.basic_utils import pairs_as_dict LEXICAL_INDEX_FORMATS = ["yaml", "json"] +DEFAULT_QUALIFIER = "exact" +QUALIFIER_DICT = { + "exact": "oio:hasExactSynonym", + "broad": "oio:hasBroadSynonym", + "narrow": "oio:hasNarrowSynonym", + "related": "oio:hasRelatedSynonym", +} def add_labels_from_uris(oi: BasicOntologyInterface): @@ -124,7 +131,10 @@ def create_lexical_index( term2 = term for tr in pipeline.transformations: if tr.type.code == TransformationType.Synonymization: - synonymized, term2 = apply_transformation(term2, tr) + synonymized, term2, qualifier = apply_transformation(term2, tr) + if qualifier != DEFAULT_QUALIFIER and qualifier is not None: + pred = QUALIFIER_DICT[qualifier] + else: term2 = apply_transformation(term2, tr) @@ -384,13 +394,15 @@ def precondition_holds(precondition: Precondition, mapping: Mapping) -> bool: return True -def apply_transformation(term: str, transformation: LexicalTransformation) -> str: +def apply_transformation( + term: str, transformation: LexicalTransformation +) -> Union[str, List[Tuple[bool, str, str]]]: """ Apply an individual transformation on a term - :param term: - :param transformation: - :return: + :param term: Original label. + :param transformation: Type of transformation to be performed on the label. + :return: Transformed label. 
""" typ = str(transformation.type) logging.debug(f"Applying: {transformation}") @@ -399,21 +411,44 @@ def apply_transformation(term: str, transformation: LexicalTransformation) -> st elif typ == TransformationType.WhitespaceNormalization.text: return re.sub(" {2,}", " ", term.strip()) elif typ == TransformationType.Synonymization.text: - return apply_synonymizer(term, eval(transformation.params)) + synonymized_results = apply_synonymizer(term, eval(transformation.params)) + true_results = [x for x in list(synonymized_results) if x[0] is True] + if len(true_results) > 0: + return true_results[-1] + else: + return (False, term, DEFAULT_QUALIFIER) else: raise NotImplementedError( f"Transformation Type {typ} {type(typ)} not implemented {TransformationType.CaseNormalization.text}" ) -def apply_synonymizer(term: str, rules: List[Synonymizer]) -> str: - tmp_term = term +def apply_synonymizer(term: str, rules: List[Synonymizer]) -> Tuple[bool, str, str]: + """Apply synonymizer rules declared in the given match-rules.yaml file. + + The basic concept is looking for regex in labels and replacing the ones that match + with the string passed in 'match.replacement'. Also set qualifier ('match.qualifier') + as to whether the replacement is an 'exact', 'broad', 'narrow', or 'related' synonym. + + Note: This function "yields" all intermediate results (for each rule applied) + as opposed to a final result. The reason being we only want to return a "True" + synonymized result. If the term is not synonymized, then the result will be just + the term and a default qualifier. In the case of multiple synonyms, the actual result + will be the latest synonymized result.In other words, all the rules have been + implemented on the term to finally produce the result. + + :param term: Original label. + :param rules: Synonymizer rules from match-rules.yaml file. 
+ :yield: A Tuple stating [if the label changed, new label, qualifier] + """ for rule in rules: + tmp_term_2 = term term = re.sub(eval(rule.match), rule.replacement, term) - if tmp_term == term: - return False, term.rstrip() - else: - return True, term.rstrip() + + if tmp_term_2 != term: + yield True, term.strip(), rule.qualifier + else: + yield False, term.strip(), rule.qualifier def save_mapping_rules(mapping_rules: MappingRuleCollection, path: str): diff --git a/tests/input/foo_bar.obo b/tests/input/foo_bar.obo new file mode 100644 index 000000000..cc21e0075 --- /dev/null +++ b/tests/input/foo_bar.obo @@ -0,0 +1,11 @@ +format-version: 1.2 +ontology: http://www.semanticweb.org/hhegde/ontologies/2022/9/foo_bar + +[Term] +id: ABCD:1 +name: other foo bar + +[Term] +id: ABCD:2 +name: foo bar + diff --git a/tests/test_utilities/test_lexical_index.py b/tests/test_utilities/test_lexical_index.py index d535a3160..afcc950a2 100644 --- a/tests/test_utilities/test_lexical_index.py +++ b/tests/test_utilities/test_lexical_index.py @@ -7,6 +7,9 @@ ) from oaklib.datamodels.mapping_rules_datamodel import Synonymizer from oaklib.implementations.pronto.pronto_implementation import ProntoImplementation +from oaklib.implementations.simpleobo.simple_obo_implementation import ( + SimpleOboImplementation, +) from oaklib.resource import OntologyResource from oaklib.utilities.lexical.lexical_indexer import ( create_lexical_index, @@ -47,6 +50,8 @@ def test_pipelines(self): builder.add_class("X:3", "foo bar") builder.add_class("X:4", "foo bar (foo bar)") builder.add_class("X:5", "foo bar [foo bar]") + builder.add_class("X:6", "Other foo bar") + builder.add_class("X:7", "Other (FOO) [bar] foo bar") builder.build() syn_param = [ Synonymizer( @@ -56,11 +61,18 @@ def test_pipelines(self): replacement="", ), Synonymizer( - the_rule="Remove parentheses bound info from the label.", + the_rule="Remove box brackets bound info from the label.", match="r'\[[^)]*\]'", # noqa W605 match_scope="*", 
replacement="", ), + Synonymizer( + the_rule="Broad match terms with the term 'other' in them.", + match="r'(?i)^Other '", # noqa W605 + match_scope="*", + replacement="", + qualifier="broad", + ), ] case_norm = LexicalTransformation(TransformationType.CaseNormalization) @@ -74,6 +86,8 @@ def test_pipelines(self): "foo bar": ["X:1", "X:2", "X:3"], "foo bar (foo bar)": ["X:4"], "foo bar [foo bar]": ["X:5"], + "other foo bar": ["X:6"], + "other (foo) [bar] foo bar": ["X:7"], }, ), ( @@ -83,6 +97,8 @@ def test_pipelines(self): "foo bar": ["X:3"], "foo bar (foo bar)": ["X:4"], "foo bar [foo bar]": ["X:5"], + "other foo bar": ["X:6"], + "other (foo) [bar] foo bar": ["X:7"], }, ), ( @@ -92,15 +108,21 @@ def test_pipelines(self): "FOO BAR": ["X:2"], "foo bar (foo bar)": ["X:4"], "foo bar [foo bar]": ["X:5"], + "Other foo bar": ["X:6"], + "Other (FOO) [bar] foo bar": ["X:7"], }, ), ( [synonymization], - {"FOO BAR": ["X:2"], "foo bar": ["X:3"], "foo bar": ["X:1", "X:4", "X:5"]}, + { + "FOO BAR": ["X:2"], + "foo bar": ["X:3"], + "foo bar": ["X:1", "X:4", "X:5", "X:6", "X:7"], + }, ), ( [case_norm, whitespace_norm, synonymization], - {"foo bar": ["X:1", "X:2", "X:3", "X:4", "X:5"]}, + {"foo bar": ["X:1", "X:2", "X:3", "X:4", "X:5", "X:6", "X:7"]}, ), ] @@ -129,3 +151,27 @@ def test_pipelines(self): def test_save(self): save_lexical_index(self.lexical_index, TEST_OUT) + + def test_synonymizer_with_other(self): + """Test synonymizer with 'other' in label.""" + resource = OntologyResource(slug="foo_bar.obo", directory=INPUT_DIR, local=True) + oi = SimpleOboImplementation(resource) + syn_param = [ + Synonymizer( + the_rule="Broad match terms with the term 'other' in them.", + match="r'(?i)^Other '", # noqa W605 + match_scope="*", + replacement="", + qualifier="broad", + ), + ] + synonymization = LexicalTransformation(TransformationType.Synonymization, params=syn_param) + pipelines = [ + LexicalTransformationPipeline(name="test_other", transformations=synonymization) + ] + 
lexical_index = create_lexical_index(oi, pipelines=pipelines, synonym_rules=syn_param) + + for _, v in lexical_index.groupings.items(): + relation = [x for x in v.relationships if x.synonymized is True] + self.assertTrue(len(relation), 1) + self.assertEqual(relation[0].predicate, "oio:hasBroadSynonym") diff --git a/tox.ini b/tox.ini index 142bf40c9..78ba660b4 100644 --- a/tox.ini +++ b/tox.ini @@ -69,6 +69,7 @@ ignore = N803 # math-oriented classes can ignore this (e.g. hypergeometric.py) N806 # math-oriented classes can ignore this (e.g. hypergeometric.py) B019 + max-line-length = 120 max-complexity = 13 import-order-style = pycharm