diff --git a/src/oaklib/cli.py b/src/oaklib/cli.py index 0cf8b3e0e..e5d120c21 100644 --- a/src/oaklib/cli.py +++ b/src/oaklib/cli.py @@ -91,6 +91,7 @@ from oaklib.implementations.obograph.obograph_implementation import ( OboGraphImplementation, ) +from oaklib.implementations.semsimian.semsimian_implementation import SemSimianImplementation from oaklib.implementations.sqldb.sql_implementation import SqlImplementation from oaklib.interfaces import ( BasicOntologyInterface, @@ -2956,7 +2957,10 @@ def similarity( if not isinstance(impl, SemanticSimilarityInterface): raise NotImplementedError(f"Cannot execute this using {impl} of type {type(impl)}") if information_content_file: - impl.cached_information_content_map = load_information_content_map(information_content_file) + if isinstance(impl, SemSimianImplementation): + impl.custom_ic_map_path = information_content_file + else: + impl.cached_information_content_map = load_information_content_map(information_content_file) set1it = None set2it = None if not (set1_file or set2_file): @@ -3037,8 +3041,16 @@ def termset_similarity( writer.output = output if not isinstance(impl, SemanticSimilarityInterface): raise NotImplementedError(f"Cannot execute this using {impl} of type {type(impl)}") + + # TODO: @cmungall - one possibility in future is to relieve client of the need for + # out of band knowledge about impl details. The generic SemSim interface could have + # a load_ic_map method, with the generic impl being to directly load, and the semsimian + # impl passing the path through. 
if information_content_file: - impl.cached_information_content_map = load_information_content_map(information_content_file) + if isinstance(impl, SemSimianImplementation): + impl.custom_ic_map_path = information_content_file + else: + impl.cached_information_content_map = load_information_content_map(information_content_file) terms = list(terms) ix = terms.index("@") set1 = list(query_terms_iterator(terms[0:ix], impl)) diff --git a/src/oaklib/implementations/semsimian/semsimian_implementation.py b/src/oaklib/implementations/semsimian/semsimian_implementation.py index fde765e38..fa774e24c 100644 --- a/src/oaklib/implementations/semsimian/semsimian_implementation.py +++ b/src/oaklib/implementations/semsimian/semsimian_implementation.py @@ -54,6 +54,8 @@ class SemSimianImplementation( AssociationProviderInterface.add_associations, ] + custom_ic_map_path: str = None + semsimian_object_cache: Dict[Tuple[PRED_CURIE], Optional["Semsimian"]] = field(default_factory=dict) # type: ignore # noqa def __post_init__(self): @@ -84,6 +86,7 @@ def _get_semsimian_object( predicates: List[PRED_CURIE] = None, attributes: List[str] = None, resource_path: str = None, + custom_ic_map_path: str = None, ) -> "Semsimian": # type: ignore # noqa """ Get Semsimian object from "semsimian_object_cache" or add a new one. 
@@ -94,6 +97,10 @@ def _get_semsimian_object( from semsimian import Semsimian predicates = tuple(sorted(predicates)) + + if custom_ic_map_path is not None: + logging.info(f"Using custom IC map with Semsimian: {custom_ic_map_path}") + if predicates not in self.semsimian_object_cache: # spo = [ # r @@ -111,6 +118,7 @@ def _get_semsimian_object( predicates=predicates, pairwise_similarity_attributes=attributes, resource_path=self.resource_path, + custom_ic_map_path=self.custom_ic_map_path, ) return self.semsimian_object_cache[predicates] @@ -139,7 +147,9 @@ def pairwise_similarity( """ logging.debug(f"Calculating pairwise similarity for {subject} x {object} over {predicates}") semsimian = self._get_semsimian_object( - predicates=predicates, attributes=self.term_pairwise_similarity_attributes + predicates=predicates, + attributes=self.term_pairwise_similarity_attributes, + custom_ic_map_path=self.custom_ic_map_path ) jaccard_val = semsimian.jaccard_similarity(subject, object) @@ -194,7 +204,9 @@ def all_by_all_pairwise_similarity( objects = list(objects) logging.info(f"Calculating all-by-all pairwise similarity for {len(objects)} objects") semsimian = self._get_semsimian_object( - predicates=predicates, attributes=self.term_pairwise_similarity_attributes + predicates=predicates, + attributes=self.term_pairwise_similarity_attributes, + custom_ic_map_path=self.custom_ic_map_path ) all_results = semsimian.all_by_all_pairwise_similarity( subject_terms=set(subjects), diff --git a/tests/input/test_ic.tsv b/tests/input/test_ic.tsv new file mode 100644 index 000000000..4ad967948 --- /dev/null +++ b/tests/input/test_ic.tsv @@ -0,0 +1,2 @@ +GO:0005773 5.5 +GO:0012505 6.0 \ No newline at end of file diff --git a/tests/test_implementations/test_semsimian_implementation.py b/tests/test_implementations/test_semsimian_implementation.py index e43ec5191..cb0c531ff 100644 --- a/tests/test_implementations/test_semsimian_implementation.py +++ 
b/tests/test_implementations/test_semsimian_implementation.py @@ -21,6 +21,8 @@ DB = INPUT_DIR / "go-nucleus.db" +TEST_IC_MAP = INPUT_DIR / "test_ic.tsv" + EXPECTED_ICS = { "CARO:0000000": 21.05, "BFO:0000002": 0.7069, @@ -134,6 +136,29 @@ def test_all_by_all_pairwise_similarity(self): sem_similarity_object.phenodigm_score, sql_similarity_object.phenodigm_score, places=2 ) + def test_similarity_with_custom_ic_map(self): + adapter = self.oi + + adapter.custom_ic_map_path = TEST_IC_MAP.as_posix() + + if not isinstance(adapter, SemanticSimilarityInterface): + raise AssertionError("SemanticSimilarityInterface not implemented") + entities = [VACUOLE, ENDOMEMBRANE_SYSTEM] + + for s in entities: + for o in entities: + for preds in [self.predicates]: + sim = adapter.pairwise_similarity(s, o, predicates=preds) + if sim is not None: + if s == VACUOLE and o == VACUOLE: + self.assertEqual(sim.ancestor_information_content, 5.5) + if s == ENDOMEMBRANE_SYSTEM and o == ENDOMEMBRANE_SYSTEM: + self.assertEqual(sim.ancestor_information_content, 6.0) + if s == VACUOLE and o == ENDOMEMBRANE_SYSTEM: + self.assertEqual(sim.ancestor_information_content, 0) + else: + raise ValueError(f"Did not get similarity for {s} and {o}") + def test_semsimian_object_cache(self): start_time = timeit.default_timer() _ = list(