Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

synonymizer refactor2 #728

Merged
merged 3 commits into from
Mar 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ src/oaklib/datamodels/%.py: src/oaklib/datamodels/%.yaml
# $(RUN) gen-pydantic $< > [email protected] && mv [email protected] $@
$(RUN) gen-python $< > [email protected] && mv [email protected] $@
$(RUN) tox -e lint

# Override the generic %.py rule above for the synonymizer datamodel:
# this one is generated with gen-pydantic rather than gen-python.
# NOTE(review): the scraped diff showed a redaction placeholder here; the
# `$@.tmp` temp-file tokens are reconstructed from the sibling rules' pattern.
src/oaklib/datamodels/synonymizer.py: src/oaklib/datamodels/synonymizer.yaml
	$(RUN) gen-pydantic $< > $@.tmp && mv $@.tmp $@


# Generate a JSON Schema rendering for each datamodel YAML.
# NOTE(review): the scraped diff showed a redaction placeholder here; the
# `$@.tmp` temp-file tokens are reconstructed from the sibling rules' pattern.
src/oaklib/datamodels/%.schema.json: src/oaklib/datamodels/%.yaml
	$(RUN) gen-json-schema $< > $@.tmp && mv $@.tmp $@
src/oaklib/datamodels/%.owl.ttl: src/oaklib/datamodels/%.yaml
Expand Down
148 changes: 107 additions & 41 deletions src/oaklib/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@
import oaklib.datamodels.taxon_constraints as tcdm
from oaklib import datamodels
from oaklib.converters.logical_definition_flattener import LogicalDefinitionFlattener
from oaklib.datamodels import synonymizer_datamodel
from oaklib.datamodels.association import RollupGroup
from oaklib.datamodels.cross_ontology_diff import DiffCategory
from oaklib.datamodels.lexical_index import LexicalTransformation, TransformationType
from oaklib.datamodels.obograph import (
BasicPropertyValue,
Edge,
Expand Down Expand Up @@ -166,11 +166,9 @@
parse_kgcl_files,
write_kgcl,
)
from oaklib.utilities.lexical import patternizer
from oaklib.utilities.lexical import patternizer, synonymizer
from oaklib.utilities.lexical.lexical_indexer import (
DEFAULT_QUALIFIER,
add_labels_from_uris,
apply_transformation,
create_lexical_index,
lexical_index_to_sssom,
load_lexical_index,
Expand Down Expand Up @@ -6482,46 +6480,114 @@ def generate_synonyms(terms, rules_file, apply_patch, patch, patch_format, outpu
else:
writer = _get_writer(output_type, impl, StreamingKGCLWriter, kgcl)
writer.output = output
# TODO: Eventually get this from settings as above
ruleset = synonymizer_datamodel.RuleSet(**yaml.safe_load(open(rules_file)))
change_list = []
curie_iter = query_terms_iterator(terms, impl)
for change in synonymizer.apply_synonymizer_to_terms(impl, curie_iter, ruleset):
change_list.append(change)
writer.emit(change)

writer.finish()
if apply_patch and len(change_list) > 0:
if output:
impl.resource.slug = output
_apply_changes(impl, change_list)


@main.command()
@click.argument("terms", nargs=-1)
@click.option(
"--rules-file",
"-R",
help="path to rules file. Conforms to rules_datamodel.\
e.g. https://github.com/INCATools/ontology-access-kit/blob/main/tests/input/matcher_rules.yaml",
)
@click.option(
"--rules-expression",
"-Y",
multiple=True,
help="YAML encoding of a rules expression",
)
@click.option(
"--apply-patch/--no-apply-patch",
default=False,
show_default=True,
help="Apply KGCL syntax generated based on the synonymizer rules file.",
)
@click.option(
"--patch",
type=click.File(mode="w"),
default=sys.stdout,
help="Path to where patch file will be written.",
)
@click.option(
"--patch-format",
help="Output syntax for patches.",
)
@output_option
@output_type_option
def generate_lexical_replacements(
terms, rules_file, rules_expression, apply_patch, patch, patch_format, output, output_type
):
"""
Generate lexical replacements based on a set of synonymizer rules.


If the `--apply-patch` flag is set, the output will be an ontology file with the changes
    applied. Pass the `--patch` argument to also get the patch file in KGCL format.

Example:
-------

runoak -i foo.obo generate-lexical-replacements -R foo_rules.yaml\
--patch patch.kgcl --apply-patch -o foo_syn.obo

If the `apply-patch` flag is NOT set then the main input will be KGCL commands

Example:
-------

runoak -i foo.obo generate-lexical-replacements -R foo_rules.yaml -o changes.kgcl


You can also pass the expressions directly as YAML

Example:
-------

runoak -i foo.obo generate-lexical-replacements \
-Y '{match: "nuclear (\\w+)", replacement: "\\1 nucleus"}' .all

see https://github.com/INCATools/kgcl.

Note: this command is very similar to generate-synonyms, but the main use case here
is replacing terms, and applying rules to other elements such as definitions

"""
impl = settings.impl
if apply_patch:
writer = _get_writer(patch_format, impl, StreamingKGCLWriter, kgcl)
writer.output = patch
else:
writer = _get_writer(output_type, impl, StreamingKGCLWriter, kgcl)
writer.output = output
if rules_file:
ruleset = load_mapping_rules(rules_file)
ruleset = synonymizer_datamodel.RuleSet(**yaml.safe_load(open(rules_file)))
elif rules_expression:
ruleset = synonymizer_datamodel.RuleSet()
for rule_expression in rules_expression:
rule = synonymizer_datamodel.Synonymizer(**yaml.safe_load(rule_expression))
ruleset.rules.append(rule)
else:
ruleset = None
if not isinstance(impl, OboGraphInterface):
raise NotImplementedError
syn_rules = [x.synonymizer for x in ruleset.rules if x.synonymizer]
terms_to_synonymize = {}
raise ValueError("Must specify either --rules-file or --rules-expression")
change_list = []
for curie in query_terms_iterator(terms, impl):
# for rule in syn_rules:
for _, aliases in impl.entity_alias_map(curie).items():
matches = []
if aliases is not None:
# matches.extend([x for x in aliases if re.search(eval(rule.match), x) is not None])
for alias in aliases:
if alias:
synonymized, new_alias, qualifier = apply_transformation(
alias,
LexicalTransformation(
TransformationType.Synonymization, params=syn_rules
),
)
if synonymized:
matches.append(new_alias)

if len(matches) > 0:
if qualifier is None or qualifier == "":
qualifier = DEFAULT_QUALIFIER
terms_to_synonymize[curie] = matches
change = kgcl.NewSynonym(
id="kgcl_change_id_" + str(len(terms_to_synonymize)),
about_node=curie,
old_value=alias,
new_value=new_alias,
qualifier=qualifier,
)
change_list.append(change)
writer.emit(change)
curie_iter = query_terms_iterator(terms, impl)
for change in synonymizer.apply_synonymizer_to_terms(
impl, curie_iter, ruleset, include_all=True
):
change_list.append(change)
writer.emit(change)

writer.finish()
if apply_patch and len(change_list) > 0:
if output:
Expand Down
82 changes: 4 additions & 78 deletions src/oaklib/datamodels/mapping_rules_datamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,39 +7,33 @@
# license: https://creativecommons.org/publicdomain/zero/1.0/

import dataclasses
import re
import sys
from dataclasses import dataclass
from typing import Any, ClassVar, Dict, List, Optional, Union

from jsonasobj2 import JsonObj, as_dict
from jsonasobj2 import as_dict
from linkml_runtime.linkml_model.meta import (
EnumDefinition,
PermissibleValue,
PvFormulaOptions,
)
from linkml_runtime.linkml_model.types import Boolean, Float, String, Uriorcurie
from linkml_runtime.utils.curienamespace import CurieNamespace
from linkml_runtime.utils.dataclass_extensions_376 import (
dataclasses_init_fn_with_kwargs,
)
from linkml_runtime.utils.enumerations import EnumDefinitionImpl
from linkml_runtime.utils.formatutils import camelcase, sfx, underscore
from linkml_runtime.utils.metamodelcore import (
Bool,
URIorCURIE,
bnode,
empty_dict,
empty_list,
)
from linkml_runtime.utils.slot import Slot
from linkml_runtime.utils.yamlutils import (
YAMLRoot,
extended_float,
extended_int,
extended_str,
)
from rdflib import Namespace, URIRef
from rdflib import URIRef

from oaklib.datamodels.synonymizer_datamodel import Synonymizer, Test

metamodel_version = "1.7.0"
version = None
Expand Down Expand Up @@ -253,74 +247,6 @@ def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
super().__post_init__(**kwargs)


@dataclass
class Synonymizer(YAMLRoot):
    """A lexical rule pairing a regex `match` with a `replacement`, plus optional tests."""

    _inherited_slots: ClassVar[List[str]] = []

    class_class_uri: ClassVar[URIRef] = MAPPINGRULES.Synonymizer
    class_class_curie: ClassVar[str] = "mappingrules:Synonymizer"
    class_name: ClassVar[str] = "Synonymizer"
    class_model_uri: ClassVar[URIRef] = MAPPINGRULES.Synonymizer

    the_rule: Optional[str] = None
    match: Optional[str] = None
    match_scope: Optional[str] = None
    replacement: Optional[str] = None
    qualifier: Optional[str] = None
    prefix: Optional[str] = None
    tests: Optional[Union[dict, "Test"]] = None

    def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
        # Coerce every scalar slot to str when a non-str value was supplied,
        # in the same slot order as the generated model.
        for slot_name in (
            "the_rule",
            "match",
            "match_scope",
            "replacement",
            "qualifier",
            "prefix",
        ):
            current = getattr(self, slot_name)
            if current is not None and not isinstance(current, str):
                setattr(self, slot_name, str(current))

        # Promote an inline dict for `tests` to a Test instance.
        if self.tests is not None and not isinstance(self.tests, Test):
            self.tests = Test(**as_dict(self.tests))

        super().__post_init__(**kwargs)


@dataclass
class Test(YAMLRoot):
    """An input/output example (optionally prefix-scoped) used to validate a rule."""

    _inherited_slots: ClassVar[List[str]] = []

    class_class_uri: ClassVar[URIRef] = MAPPINGRULES.Test
    class_class_curie: ClassVar[str] = "mappingrules:Test"
    class_name: ClassVar[str] = "Test"
    class_model_uri: ClassVar[URIRef] = MAPPINGRULES.Test

    input: Optional[str] = None
    output: Optional[str] = None
    prefix: Optional[str] = None

    def __post_init__(self, *_: List[str], **kwargs: Dict[str, Any]):
        # Coerce each slot to str when a non-str value was supplied,
        # in the same slot order as the generated model.
        for slot_name in ("input", "output", "prefix"):
            value = getattr(self, slot_name)
            if value is not None and not isinstance(value, str):
                setattr(self, slot_name, str(value))

        super().__post_init__(**kwargs)


@dataclass
class LexicalIndex(YAMLRoot):
"""
Expand Down
6 changes: 3 additions & 3 deletions src/oaklib/datamodels/mapping_rules_datamodel.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ id: https://w3id.org/oak/mapping-rules-datamodel
title: Mapping Rules Datamodel
name: mapping-rules-datamodel
description: >-
A datamodel for specifying lexical mapping rules.
A datamodel for specifying lexical mapping rules
license: https://creativecommons.org/publicdomain/zero/1.0/

prefixes:
Expand Down Expand Up @@ -30,6 +30,7 @@ emit_prefixes:
imports:
- linkml:types
- lexical_index
- synonymizer_datamodel



Expand Down Expand Up @@ -86,7 +87,6 @@ classes:
predicate_id_one_of:
multivalued: true


Postcondition:
attributes:
predicate_id:
Expand All @@ -101,7 +101,7 @@ classes:

Synonymizer:
attributes:
the_rule:
description:
description: Description of the rule.
range: string
match:
Expand Down
Loading
Loading