Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More efficient simple obo diffs #719

Merged
merged 3 commits into from
Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 29 additions & 3 deletions src/oaklib/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -924,6 +924,12 @@ def chain_results(v):
show_default=True,
help="Merge all inputs specified using --add",
)
@click.option(
"--profile/--no-profile",
default=False,
show_default=True,
help="If set, will profile the command",
)
def main(
verbose: int,
quiet: bool,
Expand All @@ -941,6 +947,7 @@ def main(
metamodel_mappings,
requests_cache_db,
prefix,
profile: bool,
import_depth: Optional[int],
**kwargs,
):
Expand Down Expand Up @@ -968,6 +975,24 @@ def main(
logger.setLevel(logging.WARNING)
if quiet:
logger.setLevel(logging.ERROR)
if profile:
import atexit
import cProfile
import io
import pstats

print("Profiling...")
pr = cProfile.Profile()
pr.enable()

def exit():
pr.disable()
print("Profiling completed")
s = io.StringIO()
pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats()
print(s.getvalue())

atexit.register(exit)
if requests_cache_db:
import requests_cache

Expand Down Expand Up @@ -5839,9 +5864,10 @@ def diff(
writer.emit(summary)
else:
if isinstance(writer, StreamingMarkdownWriter):
config.yield_individual_changes = False
for change in impl.diff(other_impl, configuration=config):
writer.emit(change, other_impl=other_impl)
# config.yield_individual_changes = False
for change_type, changes in impl.grouped_diff(other_impl, configuration=config):
print(change_type, changes)
writer.emit({change_type: changes}, other_impl=other_impl)
else:
for change in impl.diff(other_impl, configuration=config):
writer.emit(change)
Expand Down
170 changes: 167 additions & 3 deletions src/oaklib/implementations/simpleobo/simple_obo_implementation.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
OWL_VERSION_IRI,
RDFS_DOMAIN,
RDFS_RANGE,
SCOPE_TO_SYNONYM_PRED_MAP,
SEMAPV,
SKOS_CLOSE_MATCH,
SUBPROPERTY_OF,
Expand Down Expand Up @@ -105,7 +106,7 @@
RELATIONSHIP,
RELATIONSHIP_MAP,
)
from oaklib.interfaces.differ_interface import DifferInterface
from oaklib.interfaces.differ_interface import DiffConfiguration, DifferInterface
from oaklib.interfaces.dumper_interface import DumperInterface
from oaklib.interfaces.mapping_provider_interface import MappingProviderInterface
from oaklib.interfaces.merge_interface import MergeInterface
Expand All @@ -124,7 +125,7 @@
from oaklib.utilities.axioms.logical_definition_utilities import (
logical_definition_matches,
)
from oaklib.utilities.kgcl_utilities import tidy_change_object
from oaklib.utilities.kgcl_utilities import generate_change_id, tidy_change_object
from oaklib.utilities.mapping.sssom_utils import inject_mapping_sources


Expand Down Expand Up @@ -323,6 +324,13 @@ def subset_members(self, subset: SUBSET_CURIE) -> Iterable[CURIE]:
if subset in s.simple_values(TAG_SUBSET):
yield s.id

def terms_subsets(self, curies: Iterable[CURIE]) -> Iterable[Tuple[CURIE, SUBSET_CURIE]]:
for curie in curies:
s = self._stanza(curie, False)
if s:
for subset in s.simple_values(TAG_SUBSET):
yield curie, subset

def ontologies(self) -> Iterable[CURIE]:
od = self.obo_document
for v in od.header.simple_values(TAG_ONTOLOGY):
Expand Down Expand Up @@ -761,9 +769,161 @@ def logical_definitions(
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Implements: PatcherInterface
# Implements: DifferInterface
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

def diff(
hrshdhgd marked this conversation as resolved.
Show resolved Hide resolved
self,
other_ontology: DifferInterface,
configuration: DiffConfiguration = None,
**kwargs,
) -> Iterator[kgcl.Change]:
if configuration is None:
configuration = DiffConfiguration()
if not isinstance(other_ontology, SimpleOboImplementation):
raise ValueError("Can only diff SimpleOboImplementation")
stanzas1 = self.obo_document.stanzas
stanzas2 = other_ontology.obo_document.stanzas
all_ids = set(stanzas1.keys()).union(stanzas2.keys())
for id in all_ids:
yield from self._diff_stanzas(stanzas1.get(id, None), stanzas2.get(id, None))

def _diff_stanzas(
self, stanza1: Optional[Stanza], stanza2: Optional[Stanza]
) -> Iterator[kgcl.Change]:
def _id():
return generate_change_id()

node_is_deleted = False
if stanza1 is None and stanza2 is None:
raise ValueError("Both stanzas are None")
if stanza1 is None:
stanza1 = Stanza(id=stanza2.id, type=stanza2.type)
if stanza2.type == "Term":
yield kgcl.ClassCreation(id=_id(), about_node=stanza2.id)
elif stanza2.type == "Typedef":
yield kgcl.NodeCreation(id=_id(), about_node=stanza2.id)
else:
raise ValueError(f"Unknown stanza type: {stanza2.type}")
if stanza2 is None:
stanza2 = Stanza(id=stanza1.id, type=stanza1.type)
if stanza1.type == "Term":
yield kgcl.NodeDeletion(id=_id(), about_node=stanza1.id)
else:
yield kgcl.NodeDeletion(id=_id(), about_node=stanza1.id)
node_is_deleted = True
if stanza1 == stanza2:
return
if stanza1.type != stanza2.type:
raise ValueError(f"Stanza types differ: {stanza1.type} vs {stanza2.type}")
t1id = stanza1.id
t2id = stanza2.id
logging.info(f"Diffing: {t1id} vs {t2id}")

def _tv_dict(stanza: Stanza) -> Dict[str, List[str]]:
d = defaultdict(set)
for tv in stanza.tag_values:
d[tv.tag].add(tv.value)
return d

tv_dict1 = _tv_dict(stanza1)
tv_dict2 = _tv_dict(stanza2)
all_tags = set(tv_dict1.keys()).union(tv_dict2.keys())
for tag in all_tags:
vals1 = tv_dict1.get(tag, [])
vals2 = tv_dict2.get(tag, [])
vals1list = list(vals1)
vals2list = list(vals2)
tvs1 = [tv for tv in stanza1.tag_values if tv.tag == tag]
tvs2 = [tv for tv in stanza2.tag_values if tv.tag == tag]
if vals1 == vals2:
continue
logging.info(f"Difference in {tag}: {vals1} vs {vals2}")
if tag == TAG_NAME:
if node_is_deleted:
continue
if vals1 and vals2:
yield kgcl.NodeRename(
id=_id(), about_node=t1id, new_value=vals2list[0], old_value=vals1list[0]
)
elif vals1:
yield kgcl.NodeDeletion(id=_id(), about_node=t1id)
else:
yield kgcl.ClassCreation(id=_id(), about_node=t2id, name=vals2list[0])
elif tag == TAG_DEFINITION:
if node_is_deleted:
continue
# TODO: provenance changes
td1 = stanza1.quoted_value(TAG_DEFINITION)
td2 = stanza2.quoted_value(TAG_DEFINITION)
if vals1 and vals2:
yield kgcl.NodeTextDefinitionChange(
id=_id(), about_node=t1id, new_value=td2, old_value=td1
)
elif vals1:
yield kgcl.RemoveTextDefinition(id=_id(), about_node=t1id)
else:
yield kgcl.NewTextDefinition(id=_id(), about_node=t2id, new_value=td2)
elif tag == TAG_IS_OBSOLETE:
if node_is_deleted:
continue
if vals1 and not vals2:
yield kgcl.NodeUnobsoletion(id=_id(), about_node=t1id)
elif not vals1 and vals2:
replaced_by = stanza2.simple_values(TAG_REPLACED_BY)
if replaced_by:
yield kgcl.NodeObsoletionWithDirectReplacement(
id=_id(), about_node=t2id, has_direct_replacement=replaced_by[0]
)
else:
yield kgcl.NodeObsoletion(id=_id(), about_node=t2id)
elif tag == TAG_SUBSET:
if node_is_deleted:
continue
subsets1 = stanza1.simple_values(TAG_SUBSET)
subsets2 = stanza2.simple_values(TAG_SUBSET)
for subset in subsets1:
if subset not in subsets2:
yield kgcl.RemoveNodeFromSubset(id=_id(), about_node=t1id, in_subset=subset)
for subset in subsets2:
if subset not in subsets1:
yield kgcl.AddNodeToSubset(id=_id(), about_node=t2id, in_subset=subset)
elif tag == TAG_IS_A:
isas1 = stanza1.simple_values(TAG_IS_A)
isas2 = stanza2.simple_values(TAG_IS_A)
for isa in isas1:
if isa not in isas2:
yield kgcl.EdgeDeletion(id=_id(), subject=t1id, predicate=IS_A, object=isa)
for isa in isas2:
if isa not in isas1:
yield kgcl.EdgeCreation(id=_id(), subject=t2id, predicate=IS_A, object=isa)
elif tag == TAG_RELATIONSHIP:
rels1 = stanza1.pair_values(TAG_RELATIONSHIP)
rels2 = stanza2.pair_values(TAG_RELATIONSHIP)
for p, v in rels1:
p_curie = self.map_shorthand_to_curie(p)
if (p, v) not in rels2:
yield kgcl.EdgeDeletion(id=_id(), subject=t1id, predicate=p_curie, object=v)
for p, v in rels2:
p_curie = self.map_shorthand_to_curie(p)
if (p, v) not in rels1:
yield kgcl.EdgeCreation(id=_id(), subject=t2id, predicate=p_curie, object=v)
elif tag == TAG_SYNONYM:
if node_is_deleted:
continue
# TODO: make this sensitive to annotation changes; for now we truncate the tuple
syns1 = [tv.as_synonym()[0:2] for tv in tvs1]
syns2 = [tv.as_synonym()[0:2] for tv in tvs2]
for syn in syns1:
if syn not in syns2:
yield kgcl.RemoveSynonym(id=_id(), about_node=t1id, old_value=syn[0])
for syn in syns2:
if syn not in syns1:
pred = SCOPE_TO_SYNONYM_PRED_MAP[syn[1]]
yield kgcl.NewSynonym(
id=_id(), about_node=t2id, new_value=syn[0], predicate=pred
)

def different_from(self, entity: CURIE, other_ontology: DifferInterface) -> bool:
t1 = self._stanza(entity, strict=False)
if t1:
Expand All @@ -772,6 +932,10 @@ def different_from(self, entity: CURIE, other_ontology: DifferInterface) -> bool
return str(t1) != str(t2)
return True

# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Implements: PatcherInterface
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

def migrate_curies(self, curie_map: Mapping[CURIE, CURIE]) -> None:
od = self.obo_document
for t in od.stanzas.values():
Expand Down
Loading
Loading