From 71ebe155a51b5f5b834ff11845860f1958b61ed6 Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 26 Sep 2023 13:56:37 -0400 Subject: [PATCH 01/85] Improve readability of warnings. --- bin/vlmd_to_dbgap_xml.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/vlmd_to_dbgap_xml.py b/bin/vlmd_to_dbgap_xml.py index 6263460..5d2b9d3 100644 --- a/bin/vlmd_to_dbgap_xml.py +++ b/bin/vlmd_to_dbgap_xml.py @@ -161,10 +161,12 @@ def vlmd_to_dbgap_xml(input_file, output, file_format, study_id, appl_id, study_ # description later if that is useful. if row.get('constraints.pattern'): counters['constraints.pattern'] += 1 - logging.warning(f"`constraints.pattern` of {row['constraints.pattern']} found in row {row_index}, skipped.") + logging.warning(f"`constraints.pattern` of {row['constraints.pattern']} found in row {row_index}, " + f"but pattern constraints are not currently being written.") if row.get('format'): counters['format'] += 1 - logging.warning(f"Found `format` of {row['format']} found in row {row_index}, skipped.") + logging.warning(f"Found `format` of {row['format']} found in row {row_index}, but format is not " + f"currently being written.") # Process enumerated and encoded values. encs = {} From 1df596c7497b115e0c614c73e37ecff2018d9dcc Mon Sep 17 00:00:00 2001 From: Gaurav Vaidya Date: Tue, 17 Oct 2023 09:57:14 -0400 Subject: [PATCH 02/85] Added CORSMiddleware to allow Dug to make CORS requests. --- src/dug/server.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/dug/server.py b/src/dug/server.py index fde7e5a..f7a8466 100644 --- a/src/dug/server.py +++ b/src/dug/server.py @@ -3,6 +3,7 @@ import uvicorn from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware from dug.config import Config from dug.core.async_search import Search from pydantic import BaseModel @@ -15,6 +16,13 @@ root_path=os.environ.get("ROOT_PATH", "/"), ) +APP.add_middleware( + CORSMiddleware, + allow_origins=['*'], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) class GetFromIndex(BaseModel): index: str = "concepts_index" From 7dcb0cf84d3ae53ed598677baab9483189775e56 Mon Sep 17 00:00:00 2001 From: braswent Date: Thu, 26 Oct 2023 15:47:37 -0400 Subject: [PATCH 03/85] feat: --- src/dug/core/__init__.py | 12 +- src/dug/core/annotators/__init__.py | 35 ++ src/dug/core/annotators/_base.py | 69 ++++ src/dug/core/annotators/monarch_annotator.py | 307 ++++++++++++++++++ .../annotators/utils/biolink_purl_util.py | 175 ++++++++++ src/dug/core/concept_expander.py | 98 ++++++ src/dug/core/crawler.py | 5 +- src/dug/core/factory.py | 42 +-- src/dug/hookspecs.py | 7 + 9 files changed, 724 insertions(+), 26 deletions(-) create mode 100644 src/dug/core/annotators/__init__.py create mode 100644 src/dug/core/annotators/_base.py create mode 100644 src/dug/core/annotators/monarch_annotator.py create mode 100644 src/dug/core/annotators/utils/biolink_purl_util.py create mode 100644 src/dug/core/concept_expander.py diff --git a/src/dug/core/__init__.py b/src/dug/core/__init__.py index f1fd8ed..b353a92 100644 --- a/src/dug/core/__init__.py +++ b/src/dug/core/__init__.py @@ -12,8 +12,10 @@ from dug import hookspecs from dug.core import parsers +from dug.core import annotators from dug.core.factory import DugFactory from dug.core.parsers import DugConcept, Parser, get_parser +from dug.core.annotators import DugAnnotator, Annotator, get_annotator logger = logging.getLogger('dug') stdout_log_handler = logging.StreamHandler(sys.stdout) @@ -29,6 +31,7 @@ def 
get_plugin_manager() -> pluggy.PluginManager: pm.add_hookspecs(hookspecs) pm.load_setuptools_entrypoints("dug") pm.register(parsers) + pm.register(annotators) return pm @@ -56,19 +59,20 @@ def __init__(self, factory: DugFactory): ] ) - def crawl(self, target_name: str, parser_type: str, element_type: str = None): + def crawl(self, target_name: str, parser_type: str, annotator_type: str, element_type: str = None): pm = get_plugin_manager() parser = get_parser(pm.hook, parser_type) + annotator = get_annotator(pm.hook, annotator_type) targets = get_targets(target_name) for target in targets: - self._crawl(target, parser, element_type) + self._crawl(target, parser, annotator, element_type) - def _crawl(self, target: Path, parser: Parser, element_type): + def _crawl(self, target: Path, parser: Parser, annotator: Annotator, element_type): # Initialize crawler - crawler = self._factory.build_crawler(target, parser, element_type) + crawler = self._factory.build_crawler(target, parser, annotator, element_type) # Read elements, annotate, and expand using tranql queries crawler.crawl() diff --git a/src/dug/core/annotators/__init__.py b/src/dug/core/annotators/__init__.py new file mode 100644 index 0000000..29b7eb4 --- /dev/null +++ b/src/dug/core/annotators/__init__.py @@ -0,0 +1,35 @@ +import logging +from typing import Dict + +import pluggy + +from ._base import DugElement, DugConcept, Indexable, Annotator, FileAnnotator +from .monarch_annotator import AnnotatorMonarch + + +logger = logging.getLogger('dug') + +hookimpl = pluggy.HookimplMarker("dug") + +@hookimpl +def define_annotators(annotator_dict: Dict[str, Annotator]): + annotator_dict["annotator-monarch"] = AnnotatorMonarch() + + +class AnnotatorNotFoundException(Exception): + ... + + +def get_annotator(hook, annotator_name) -> Annotator: + """Get the annotator from all annotators registered via the define_annotators hook""" + + available_annotators = {} + hook.define_annotators(annotator_dict=available_annotators) + annotator = available_annotators.get(annotator_name.lower()) + if annotator is not None: + return annotator + + err_msg = f"Cannot find annotator of type '{annotator_name}'\n" \ + f"Supported annotators: {', '.join(available_annotators.keys())}" + logger.error(err_msg) + raise AnnotatorNotFoundException(err_msg) \ No newline at end of file diff --git a/src/dug/core/annotators/_base.py b/src/dug/core/annotators/_base.py new file mode 100644 index 0000000..8e8dcb8 --- /dev/null +++ b/src/dug/core/annotators/_base.py @@ -0,0 +1,69 @@ +import json +import logging +from typing import Union, Callable, Any, Iterable, Awaitable, TypeVar, Generic +from dug import utils as utils +from requests import Session +from dug.config import Config as AnnotatorConfig + +logger = logging.getLogger('dug') + +logging.getLogger("requests").setLevel(logging.WARNING) +logging.getLogger("urllib3").setLevel(logging.WARNING) + + +class DugIdentifier: + def __init__(self, id, label, types=None, search_text="", description=""): + self.id = id + self.label = label + self.description = description + if types is None: + types = [] + self.types = types + self.search_text = [search_text] if search_text else [] + self.equivalent_identifiers = [] + self.synonyms = [] + self.purl = "" + + def jsonable(self): + return self.__dict__ + def __str__(self): + return json.dumps(self.__dict__, indent=2, default=utils.complex_handler) + +Input = TypeVar("Input") +Output = TypeVar("Output") + +class AnnotatorSession(Generic[Input, Output]): + + def make_request(self, value: 
Input, http_session: Session): + raise NotImplementedError() + + def handle_response(self, value, response: Union[dict, list]) -> Output: + raise NotImplementedError() + + def __call__(self, value: Input, http_session: Session) -> Output: + response = self.make_request(value, http_session) + + result = self.handle_response(value, response) + + return result + +# def build_annotator(self) -> DugAnnotator: + +# preprocessor = Preprocessor(**self.config.preprocessor) +# annotator = Annotate(**self.config.annotator) +# normalizer = Normalizer(**self.config.normalizer) +# synonym_finder = SynonymFinder(**self.config.synonym_service) + +# annotator = DugAnnotator( +# preprocessor=preprocessor, +# annotator=annotator, +# normalizer=normalizer, +# synonym_finder=synonym_finder +# ) + +# return annotator + +Indexable = Union[DugIdentifier, AnnotatorSession] +# Indexable = DugIdentifier +Annotator = Callable[[Any], Iterable[Indexable]] +# Annotator = Callable[[Any], Iterable[DugIdentifier]] \ No newline at end of file diff --git a/src/dug/core/annotators/monarch_annotator.py b/src/dug/core/annotators/monarch_annotator.py new file mode 100644 index 0000000..d58be67 --- /dev/null +++ b/src/dug/core/annotators/monarch_annotator.py @@ -0,0 +1,307 @@ +import json +import logging +import os +import re +import urllib.parse +from typing import TypeVar, Generic, Union, List, Tuple, Optional +import bmt +import requests +from requests import Session + +from ._base import DugIdentifier, AnnotatorSession, Input, AnnotatorConfig +from dug.core.annotators.utils.biolink_purl_util import BioLinkPURLerizer + +logger = logging.getLogger('dug') + +logging.getLogger("requests").setLevel(logging.WARNING) +logging.getLogger("urllib3").setLevel(logging.WARNING) + +class AnnotateMonarch: + def __init__( + self, + config: AnnotatorConfig, + preprocessor: "PreprocessorMonarch", + annotator: "AnnotatorMonarch", + normalizer: "NormalizerMonarch", + synonym_finder: "SynonymFinderMonarch", + ontology_greenlist=[], + ): + self.config = config + self.preprocessor = preprocessor(**self.config.preprocessor) + self.annotator = annotator(**self.config.annotator) + self.normalizer = normalizer(**self.config.normalizer) + self.synonym_finder = synonym_finder(**self.config.synonym_service) + self.ontology_greenlist = ontology_greenlist + self.norm_fails_file = "norm_fails.txt" + self.anno_fails_file = "anno_fails.txt" + + def annotate(self, text, http_session): + + # Preprocess text (debraviate, remove stopwords, etc.) 
+ text = self.preprocessor.preprocess(text) + + # Fetch identifiers + raw_identifiers = self.annotator.annotate(text, http_session) + + # Write out to file if text fails to annotate + if not raw_identifiers: + with open(self.anno_fails_file, "a") as fh: + fh.write(f'{text}\n') + + processed_identifiers = [] + for identifier in raw_identifiers: + + # Normalize identifier using normalization service + norm_id = self.normalizer.normalize(identifier, http_session) + + # Skip adding id if it doesn't normalize + if norm_id is None: + # Write out to file if identifier doesn't normalize + with open(self.norm_fails_file, "a") as fh: + fh.write(f'{identifier.id}\n') + + # Discard non-normalized ident if not in greenlist + if identifier.id_type not in self.ontology_greenlist: + continue + + # If it is in greenlist just keep moving forward + norm_id = identifier + + # Add synonyms to identifier + norm_id.synonyms = self.synonym_finder.get_synonyms(norm_id.id, http_session) + + # Get pURL for ontology identifer for more info + norm_id.purl = BioLinkPURLerizer.get_curie_purl(norm_id.id) + processed_identifiers.append(norm_id) + + return processed_identifiers + +class PreprocessorMonarch: + """"Class for preprocessing strings so they are better interpreted by NLP steps""" + + def __init__(self, debreviator=None, stopwords=None): + if debreviator is None: + debreviator = self.default_debreviator_factory() + self.decoder = debreviator + + if stopwords is None: + stopwords = [] + self.stopwords = stopwords + + def preprocess(self, text: str) -> str: + """ + Apply debreviator to replace abbreviations and other characters + + >>> pp = PreprocessorMonarch({"foo": "bar"}, ["baz"]) + >>> pp.preprocess("Hello foo") + 'Hello bar' + + >>> pp.preprocess("Hello baz world") + 'Hello world' + """ + + for key, value in self.decoder.items(): + text = text.replace(key, value) + + # Remove any stopwords + text = " ".join([word for word in text.split() if word not in self.stopwords]) + return text + + @staticmethod + def default_debreviator_factory(): + return {"bmi": "body mass index", "_": " "} + + +# Input = TypeVar("Input") +# Output = TypeVar("Output") + + +# class ApiClient(Generic[Input, Output]): + +# def make_request(self, value: Input, http_session: Session): +# raise NotImplementedError() + +# def handle_response(self, value, response: Union[dict, list]) -> Output: +# raise NotImplementedError() + +# def __call__(self, value: Input, http_session: Session) -> Output: +# response = self.make_request(value, http_session) + +# result = self.handle_response(value, response) + +# return result + + +class AnnotatorMonarch(AnnotatorSession[str, List[DugIdentifier]]): + """ + Use monarch API service to fetch ontology IDs found in text + """ + + def __init__(self, url: str): + self.url = url + + def sliding_window(self, text, max_characters=2000, padding_words=5): + """ + For long texts sliding window works as the following + "aaaa bbb ccc ddd eeee" + with a sliding max chars 8 and padding 1 + first yeild would be "aaaa bbb" + next subsequent yeilds "bbb ccc", "ccc ddd" , "ddd eeee" + allowing context to be preserved with the scope of padding + For a text of length 7653 , with max_characters 2000 and padding 5 , 4 chunks are yielded. 
+ """ + words = text.split(' ') + total_words = len(words) + window_end = False + current_index = 0 + while not window_end: + current_string = "" + for index, word in enumerate(words[current_index: ]): + if len(current_string) + len(word) + 1 >= max_characters: + yield current_string + " " + current_index += index - padding_words + break + appendee = word if index == 0 else " " + word + current_string += appendee + + if current_index + index == len(words) - 1: + window_end = True + yield current_string + + def annotate(self, text, http_session): + logger.debug(f"Annotating: {text}") + identifiers = [] + for chunk_text in self.sliding_window(text): + identifiers += self(chunk_text, http_session) + return identifiers + + def make_request(self, value: Input, http_session: Session): + value = urllib.parse.quote(value) + url = f'{self.url}{value}' + + # This could be moved to a config file + NUM_TRIES = 5 + for _ in range(NUM_TRIES): + response = http_session.get(url) + if response is not None: + # looks like it worked + break + + # if the reponse is still None here, throw an error + if response is None: + raise RuntimeError(f"no response from {url}") + return response.json() + + def handle_response(self, value, response: dict) -> List[DugIdentifier]: + identifiers = [] + """ Parse each identifier and initialize identifier object """ + for span in response.get('spans', []): + search_text = span.get('text', None) + for token in span.get('token', []): + curie = token.get('id', None) + if not curie: + continue + + biolink_types = token.get('category') + label = token.get('terms')[0] + identifiers.append(DugIdentifier(id=curie, + label=label, + types=biolink_types, + search_text=search_text)) + return identifiers + + +class NormalizerMonarch(AnnotatorSession[DugIdentifier, DugIdentifier]): + def __init__(self, url): + self.bl_toolkit = bmt.Toolkit() + self.url = url + + def normalize(self, identifier: DugIdentifier, http_session: Session): + # Use RENCI's normalization API service to get the preferred version of an identifier + logger.debug(f"Normalizing: {identifier.id}") + return self(identifier, http_session) + + def make_request(self, value: DugIdentifier, http_session: Session) -> dict: + curie = value.id + url = f"{self.url}{urllib.parse.quote(curie)}" + try: + response = http_session.get(url) + except Exception as get_exc: + logger.info(f"Error normalizing {value} at {url}") + logger.error(f"Error {get_exc.__class__.__name__}: {get_exc}") + return {} + try: + normalized = response.json() + except Exception as json_exc: + logger.info(f"Error processing response: {response.text} (HTTP {response.status_code})") + logger.error(f"Error {json_exc.__class__.__name__}: {json_exc}") + return {} + + return normalized + + def handle_response(self, identifier: DugIdentifier, normalized: dict) -> Optional[DugIdentifier]: + """ Record normalized results. """ + curie = identifier.id + normalization = normalized.get(curie, {}) + if normalization is None: + logger.info(f"Normalization service did not return normalization for: {curie}") + return None + + preferred_id = normalization.get("id", {}) + equivalent_identifiers = normalization.get("equivalent_identifiers", []) + biolink_type = normalization.get("type", []) + + # Return none if there isn't actually a preferred id + if 'identifier' not in preferred_id: + logger.debug(f"ERROR: normalize({curie})=>({preferred_id}). 
No identifier?") + return None + + logger.debug(f"Preferred id: {preferred_id}") + identifier.id = preferred_id.get('identifier', '') + identifier.label = preferred_id.get('label', '') + identifier.description = preferred_id.get('description', '') + identifier.equivalent_identifiers = [v['identifier'] for v in equivalent_identifiers] + try: + identifier.types = self.bl_toolkit.get_element(biolink_type[0]).name + except: + # converts biolink:SmallMolecule to small molecule + identifier.types = (" ".join(re.split("(?=[A-Z])", biolink_type[0].replace('biolink:', ''))[1:])).lower() + return identifier + + +class SynonymFinderMonarch(AnnotatorSession[str, List[str]]): + + def __init__(self, url: str): + self.url = url + + def get_synonyms(self, curie: str, http_session): + ''' + This function uses the NCATS translator service to return a list of synonyms for + curie id + ''' + + return self(curie, http_session) + + def make_request(self, curie: str, http_session: Session): + # Get response from namelookup reverse lookup op + # example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post) + url = f"{self.url}" + payload = { + 'curies': [curie] + } + try: + response = http_session.post(url, json=payload) + if str(response.status_code).startswith('4'): + logger.error(f"No synonyms returned for: `{curie}`. Validation error: {response.text}") + return {curie: []} + if str(response.status_code).startswith('5'): + logger.error(f"No synonyms returned for: `{curie}`. Internal server error from {self.url}. Error: {response.text}") + return {curie: []} + return response.json() + except json.decoder.JSONDecodeError as e: + logger.error(f"Json parse error for response from `{url}`. Exception: {str(e)}") + return {curie: []} + + def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]: + # Return curie synonyms + return raw_synonyms.get(curie, []) \ No newline at end of file diff --git a/src/dug/core/annotators/utils/biolink_purl_util.py b/src/dug/core/annotators/utils/biolink_purl_util.py new file mode 100644 index 0000000..1cbc8a5 --- /dev/null +++ b/src/dug/core/annotators/utils/biolink_purl_util.py @@ -0,0 +1,175 @@ +class BioLinkPURLerizer: + # Static class for the sole purpose of doing lookups of different ontology PURLs + # Is it pretty? No. But it gets the job done. 
+ biolink_lookup = {"APO": "http://purl.obolibrary.org/obo/APO_", + "Aeolus": "http://translator.ncats.nih.gov/Aeolus_", + "BIOGRID": "http://identifiers.org/biogrid/", + "BIOSAMPLE": "http://identifiers.org/biosample/", + "BSPO": "http://purl.obolibrary.org/obo/BSPO_", + "CAID": "http://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/by_caid?caid=", + "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_", + "CHEMBL.COMPOUND": "http://identifiers.org/chembl.compound/", + "CHEMBL.MECHANISM": "https://www.ebi.ac.uk/chembl/mechanism/inspect/", + "CHEMBL.TARGET": "http://identifiers.org/chembl.target/", + "CID": "http://pubchem.ncbi.nlm.nih.gov/compound/", + "CL": "http://purl.obolibrary.org/obo/CL_", + "CLINVAR": "http://identifiers.org/clinvar/", + "CLO": "http://purl.obolibrary.org/obo/CLO_", + "COAR_RESOURCE": "http://purl.org/coar/resource_type/", + "CPT": "https://www.ama-assn.org/practice-management/cpt/", + "CTD": "http://translator.ncats.nih.gov/CTD_", + "ClinVarVariant": "http://www.ncbi.nlm.nih.gov/clinvar/variation/", + "DBSNP": "http://identifiers.org/dbsnp/", + "DGIdb": "https://www.dgidb.org/interaction_types", + "DOID": "http://purl.obolibrary.org/obo/DOID_", + "DRUGBANK": "http://identifiers.org/drugbank/", + "DrugCentral": "http://translator.ncats.nih.gov/DrugCentral_", + "EC": "http://www.enzyme-database.org/query.php?ec=", + "ECTO": "http://purl.obolibrary.org/obo/ECTO_", + "EDAM-DATA": "http://edamontology.org/data_", + "EDAM-FORMAT": "http://edamontology.org/format_", + "EDAM-OPERATION": "http://edamontology.org/operation_", + "EDAM-TOPIC": "http://edamontology.org/topic_", + "EFO": "http://identifiers.org/efo/", + "ENSEMBL": "http://identifiers.org/ensembl/", + "ExO": "http://purl.obolibrary.org/obo/ExO_", + "FAO": "http://purl.obolibrary.org/obo/FAO_", + "FB": "http://identifiers.org/fb/", + "FBcv": "http://purl.obolibrary.org/obo/FBcv_", + "FlyBase": "http://flybase.org/reports/", + "GAMMA": "http://translator.renci.org/GAMMA_", + "GO": "http://purl.obolibrary.org/obo/GO_", + "GOLD.META": "http://identifiers.org/gold.meta/", + "GOP": "http://purl.obolibrary.org/obo/go#", + "GOREL": "http://purl.obolibrary.org/obo/GOREL_", + "GSID": "https://scholar.google.com/citations?user=", + "GTEx": "https://www.gtexportal.org/home/gene/", + "HANCESTRO": "http://www.ebi.ac.uk/ancestro/ancestro_", + "HCPCS": "http://purl.bioontology.org/ontology/HCPCS/", + "HGNC": "http://identifiers.org/hgnc/", + "HGNC.FAMILY": "http://identifiers.org/hgnc.family/", + "HMDB": "http://identifiers.org/hmdb/", + "HP": "http://purl.obolibrary.org/obo/HP_", + "ICD0": "http://translator.ncats.nih.gov/ICD0_", + "ICD10": "http://translator.ncats.nih.gov/ICD10_", + "ICD9": "http://translator.ncats.nih.gov/ICD9_", + "INCHI": "http://identifiers.org/inchi/", + "INCHIKEY": "http://identifiers.org/inchikey/", + "INTACT": "http://identifiers.org/intact/", + "IUPHAR.FAMILY": "http://identifiers.org/iuphar.family/", + "KEGG": "http://identifiers.org/kegg/", + "LOINC": "http://loinc.org/rdf/", + "MEDDRA": "http://identifiers.org/meddra/", + "MESH": "http://identifiers.org/mesh/", + "MGI": "http://identifiers.org/mgi/", + "MI": "http://purl.obolibrary.org/obo/MI_", + "MIR": "http://identifiers.org/mir/", + "MONDO": "http://purl.obolibrary.org/obo/MONDO_", + "MP": "http://purl.obolibrary.org/obo/MP_", + "MSigDB": "https://www.gsea-msigdb.org/gsea/msigdb/", + "MetaCyc": "http://translator.ncats.nih.gov/MetaCyc_", + "NCBIGENE": "http://identifiers.org/ncbigene/", + "NCBITaxon": 
"http://purl.obolibrary.org/obo/NCBITaxon_", + "NCIT": "http://purl.obolibrary.org/obo/NCIT_", + "NDDF": "http://purl.bioontology.org/ontology/NDDF/", + "NLMID": "https://www.ncbi.nlm.nih.gov/nlmcatalog/?term=", + "OBAN": "http://purl.org/oban/", + "OBOREL": "http://purl.obolibrary.org/obo/RO_", + "OIO": "http://www.geneontology.org/formats/oboInOwl#", + "OMIM": "http://purl.obolibrary.org/obo/OMIM_", + "ORCID": "https://orcid.org/", + "ORPHA": "http://www.orpha.net/ORDO/Orphanet_", + "ORPHANET": "http://identifiers.org/orphanet/", + "PANTHER.FAMILY": "http://identifiers.org/panther.family/", + "PANTHER.PATHWAY": "http://identifiers.org/panther.pathway/", + "PATO-PROPERTY": "http://purl.obolibrary.org/obo/pato#", + "PDQ": "https://www.cancer.gov/publications/pdq#", + "PHARMGKB.DRUG": "http://identifiers.org/pharmgkb.drug/", + "PHARMGKB.PATHWAYS": "http://identifiers.org/pharmgkb.pathways/", + "PHAROS": "http://pharos.nih.gov", + "PMID": "http://www.ncbi.nlm.nih.gov/pubmed/", + "PO": "http://purl.obolibrary.org/obo/PO_", + "POMBASE": "http://identifiers.org/pombase/", + "PR": "http://purl.obolibrary.org/obo/PR_", + "PUBCHEM.COMPOUND": "http://identifiers.org/pubchem.compound/", + "PUBCHEM.SUBSTANCE": "http://identifiers.org/pubchem.substance/", + "PathWhiz": "http://smpdb.ca/pathways/#", + "REACT": "http://www.reactome.org/PathwayBrowser/#/", + "REPODB": "http://apps.chiragjpgroup.org/repoDB/", + "RGD": "http://identifiers.org/rgd/", + "RHEA": "http://identifiers.org/rhea/", + "RNACENTRAL": "http://identifiers.org/rnacentral/", + "RO": "http://purl.obolibrary.org/obo/RO_", + "RTXKG1": "http://kg1endpoint.rtx.ai/", + "RXNORM": "http://purl.bioontology.org/ontology/RXNORM/", + "ResearchID": "https://publons.com/researcher/", + "SEMMEDDB": "https://skr3.nlm.nih.gov/SemMedDB", + "SGD": "http://identifiers.org/sgd/", + "SIO": "http://semanticscience.org/resource/SIO_", + "SMPDB": "http://identifiers.org/smpdb/", + "SNOMEDCT": "http://identifiers.org/snomedct/", + "SNPEFF": "http://translator.ncats.nih.gov/SNPEFF_", + "ScopusID": "https://www.scopus.com/authid/detail.uri?authorId=", + "TAXRANK": "http://purl.obolibrary.org/obo/TAXRANK_", + "UBERGRAPH": "http://translator.renci.org/ubergraph-axioms.ofn#", + "UBERON": "http://purl.obolibrary.org/obo/UBERON_", + "UBERON_CORE": "http://purl.obolibrary.org/obo/uberon/core#", + "UMLS": "http://identifiers.org/umls/", + "UMLSSC": "https://metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt/code#", + "UMLSSG": "https://metamap.nlm.nih.gov/Docs/SemGroups_2018.txt/group#", + "UMLSST": "https://metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt/type#", + "UNII": "http://identifiers.org/unii/", + "UPHENO": "http://purl.obolibrary.org/obo/UPHENO_", + "UniProtKB": "http://identifiers.org/uniprot/", + "VANDF": "https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/VANDF/", + "VMC": "https://github.com/ga4gh/vr-spec/", + "WB": "http://identifiers.org/wb/", + "WBPhenotype": "http://purl.obolibrary.org/obo/WBPhenotype_", + "WBVocab": "http://bio2rdf.org/wormbase_vocabulary", + "WIKIDATA": "https://www.wikidata.org/wiki/", + "WIKIDATA_PROPERTY": "https://www.wikidata.org/wiki/Property:", + "WIKIPATHWAYS": "http://identifiers.org/wikipathways/", + "WormBase": "https://www.wormbase.org/get?name=", + "ZFIN": "http://identifiers.org/zfin/", + "ZP": "http://purl.obolibrary.org/obo/ZP_", + "alliancegenome": "https://www.alliancegenome.org/", + "biolink": "https://w3id.org/biolink/vocab/", + "biolinkml": "https://w3id.org/biolink/biolinkml/", + "chembio": 
"http://translator.ncats.nih.gov/chembio_", + "dcterms": "http://purl.org/dc/terms/", + "dictyBase": "http://dictybase.org/gene/", + "doi": "https://doi.org/", + "fabio": "http://purl.org/spar/fabio/", + "foaf": "http://xmlns.com/foaf/0.1/", + "foodb.compound": "http://foodb.ca/compounds/", + "gff3": "https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md#", + "gpi": "https://github.com/geneontology/go-annotation/blob/master/specs/gpad-gpi-2-0.md#", + "gtpo": "https://rdf.guidetopharmacology.org/ns/gtpo#", + "hetio": "http://translator.ncats.nih.gov/hetio_", + "interpro": "https://www.ebi.ac.uk/interpro/entry/", + "isbn": "https://www.isbn-international.org/identifier/", + "isni": "https://isni.org/isni/", + "issn": "https://portal.issn.org/resource/ISSN/", + "medgen": "https://www.ncbi.nlm.nih.gov/medgen/", + "oboformat": "http://www.geneontology.org/formats/oboInOWL#", + "pav": "http://purl.org/pav/", + "prov": "http://www.w3.org/ns/prov#", + "qud": "http://qudt.org/1.1/schema/qudt#", + "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "rdfs": "http://www.w3.org/2000/01/rdf-schema#", + "skos": "https://www.w3.org/TR/skos-reference/#", + "wgs": "http://www.w3.org/2003/01/geo/wgs84_pos", + "xsd": "http://www.w3.org/2001/XMLSchema#", + "@vocab": "https://w3id.org/biolink/vocab/"} + + @staticmethod + def get_curie_purl(curie): + # Split into prefix and suffix + suffix = curie.split(":")[1] + prefix = curie.split(":")[0] + + # Check to see if the prefix exists in the hash + if prefix not in BioLinkPURLerizer.biolink_lookup: + return None + + return f"{BioLinkPURLerizer.biolink_lookup[prefix]}{suffix}" \ No newline at end of file diff --git a/src/dug/core/concept_expander.py b/src/dug/core/concept_expander.py new file mode 100644 index 0000000..2df9a8c --- /dev/null +++ b/src/dug/core/concept_expander.py @@ -0,0 +1,98 @@ +import json +import logging +import os +import requests + +import dug.core.tranql as tql + +logger = logging.getLogger('dug') + +logging.getLogger("requests").setLevel(logging.WARNING) +logging.getLogger("urllib3").setLevel(logging.WARNING) + +class ConceptExpander: + def __init__(self, url, min_tranql_score=0.2): + self.url = url + self.min_tranql_score = min_tranql_score + self.include_node_keys = ["id", "name", "synonyms"] + self.include_edge_keys = [] + self.tranql_headers = {"accept": "application/json", "Content-Type": "text/plain"} + + def is_acceptable_answer(self, answer): + return True + + def expand_identifier(self, identifier, query_factory, kg_filename, include_all_attributes=False): + + answer_kgs = [] + + # Skip TranQL query if a file exists in the crawlspace exists already, but continue w/ answers + if os.path.exists(kg_filename): + logger.info(f"identifier {identifier} is already crawled. 
Skipping TranQL query.") + with open(kg_filename, 'r') as stream: + response = json.load(stream) + else: + query = query_factory.get_query(identifier) + logger.debug(query) + response = requests.post( + url=self.url, + headers=self.tranql_headers, + data=query).json() + + # Case: Skip if empty KG + try: + if response["message"] == 'Internal Server Error' or len(response["message"]["knowledge_graph"]["nodes"]) == 0: + logger.debug(f"Did not find a knowledge graph for {query}") + logger.debug(f"{self.url} returned response: {response}") + return [] + except KeyError as e: + logger.error(f"Could not find key: {e} in response: {response}") + + # Dump out to file if there's a knowledge graph + with open(kg_filename, 'w') as stream: + json.dump(response, stream, indent=2) + + # Get nodes in knowledge graph hashed by ids for easy lookup + noMessage = (len(response.get("message",{})) == 0) + statusError = (response.get("status","") == 'Error') + if noMessage or statusError: + # Skip on error + logger.info(f"Error with identifier: {identifier}, response: {response}, kg_filename: '{kg_filename}'") + return [] + kg = tql.QueryKG(response) + + for answer in kg.answers: + # Filter out answers that don't meet some criteria + # Right now just don't filter anything + logger.debug(f"Answer: {answer}") + if not self.is_acceptable_answer(answer): + logger.warning("Skipping answer as it failed one or more acceptance criteria. See log for details.") + continue + + # Get subgraph containing only information for this answer + try: + # Temporarily surround in try/except because sometimes the answer graphs + # contain invalid references to edges/nodes + # This will be fixed in Robokop but for now just silently warn if answer is invalid + node_attributes_filter = None if include_all_attributes else self.include_node_keys + edge_attributes_filter = None if include_all_attributes else self.include_edge_keys + answer_kg = kg.get_answer_subgraph(answer, + include_node_keys=node_attributes_filter, + include_edge_keys=edge_attributes_filter) + + # Add subgraph to list of acceptable answers to query + answer_kgs.append(answer_kg) + + except tql.MissingNodeReferenceError: + # TEMPORARY: Skip answers that have invalid node references + # Need this to be fixed in Robokop + logger.warning("Skipping answer due to presence of non-preferred id! " + "See err msg for details.") + continue + except tql.MissingEdgeReferenceError: + # TEMPORARY: Skip answers that have invalid edge references + # Need this to be fixed in Robokop + logger.warning("Skipping answer due to presence of invalid edge reference! 
" + "See err msg for details.") + continue + + return answer_kgs \ No newline at end of file diff --git a/src/dug/core/crawler.py b/src/dug/core/crawler.py index 1bb64f0..744291d 100644 --- a/src/dug/core/crawler.py +++ b/src/dug/core/crawler.py @@ -4,6 +4,7 @@ import traceback from dug.core.parsers import Parser, DugElement, DugConcept +from dug.core.annotators import Annotator, DugAnnotator import dug.core.tranql as tql from dug.utils import biolink_snake_case, get_formatted_biolink_name @@ -11,7 +12,7 @@ class Crawler: - def __init__(self, crawl_file: str, parser: Parser, annotator, + def __init__(self, crawl_file: str, parser: Parser, annotator: Annotator, tranqlizer, tranql_queries, http_session, exclude_identifiers=None, element_type=None, element_extraction=None): @@ -22,7 +23,7 @@ def __init__(self, crawl_file: str, parser: Parser, annotator, self.crawl_file = crawl_file self.parser: Parser = parser self.element_type = element_type - self.annotator = annotator + self.annotator: Annotator = annotator self.tranqlizer = tranqlizer self.tranql_queries = tranql_queries self.http_session = http_session diff --git a/src/dug/core/factory.py b/src/dug/core/factory.py index d1f594a..6037f97 100644 --- a/src/dug/core/factory.py +++ b/src/dug/core/factory.py @@ -4,15 +4,17 @@ from requests_cache import CachedSession import dug.core.tranql as tql -from dug.core.annotate import (DugAnnotator, - Annotator, - Normalizer, - Preprocessor, - SynonymFinder, - ConceptExpander) +# from dug.core.annotate import (DugAnnotator, +# # Annotator, +# Normalizer, +# Preprocessor, +# SynonymFinder, +# ConceptExpander) +from dug.core.concept_expander import ConceptExpander from dug.config import Config as DugConfig, TRANQL_SOURCE from dug.core.crawler import Crawler from dug.core.parsers import Parser +from dug.core.annotators import Annotator from dug.core.async_search import Search from dug.core.index import Index @@ -36,11 +38,11 @@ def build_http_session(self) -> CachedSession: connection=redis.StrictRedis(**redis_config) ) - def build_crawler(self, target, parser: Parser, element_type: str, tranql_source=None) -> Crawler: + def build_crawler(self, target, parser: Parser, annotator: Annotator, element_type: str, tranql_source=None) -> Crawler: crawler = Crawler( crawl_file=str(target), parser=parser, - annotator=self.build_annotator(), + annotator=annotator, tranqlizer=self.build_tranqlizer(), tranql_queries=self.build_tranql_queries(tranql_source), http_session=self.build_http_session(), @@ -51,21 +53,21 @@ def build_crawler(self, target, parser: Parser, element_type: str, tranql_source return crawler - def build_annotator(self) -> DugAnnotator: + # def build_annotator(self) -> Annotator: - preprocessor = Preprocessor(**self.config.preprocessor) - annotator = Annotator(**self.config.annotator) - normalizer = Normalizer(**self.config.normalizer) - synonym_finder = SynonymFinder(**self.config.synonym_service) + # preprocessor = Preprocessor(**self.config.preprocessor) + # annotator = Annotate(**self.config.annotator) + # normalizer = Normalizer(**self.config.normalizer) + # synonym_finder = SynonymFinder(**self.config.synonym_service) - annotator = DugAnnotator( - preprocessor=preprocessor, - annotator=annotator, - normalizer=normalizer, - synonym_finder=synonym_finder - ) + # annotator = Annotator( + # preprocessor=preprocessor, + # annotator=annotator, + # normalizer=normalizer, + # synonym_finder=synonym_finder + # ) - return annotator + # return annotator def build_tranqlizer(self) -> ConceptExpander: 
return ConceptExpander(**self.config.concept_expander) diff --git a/src/dug/hookspecs.py b/src/dug/hookspecs.py index 3a02b9a..96b984b 100644 --- a/src/dug/hookspecs.py +++ b/src/dug/hookspecs.py @@ -3,6 +3,7 @@ import pluggy from dug.core.parsers import Parser +from dug.core.annotators import Annotator hookspec = pluggy.HookspecMarker("dug") @@ -12,3 +13,9 @@ def define_parsers(parser_dict: Dict[str, Parser]): """Defines what parsers are available to Dug """ ... + +@hookspec +def define_annotators(annotator_dict: Dict[str, Annotator]): + """Defines what Annotators are available to Dug + """ + ... From 36120d59bb3ff3dcdb40bdabd7cc2e91e4ab28df Mon Sep 17 00:00:00 2001 From: braswent Date: Fri, 27 Oct 2023 13:03:58 -0400 Subject: [PATCH 04/85] feat: --- Makefile | 1 - src/dug/core/__init__.py | 2 +- src/dug/core/annotators/__init__.py | 26 +- src/dug/core/annotators/_base.py | 38 +-- src/dug/core/annotators/monarch_annotator.py | 16 +- src/dug/core/crawler.py | 4 +- tests/unit/test_annotators.py | 266 +++++++++++++++++++ tests/unit/test_parsers.py | 7 +- tests/unit/test_utils.py | 46 ++-- 9 files changed, 346 insertions(+), 60 deletions(-) create mode 100644 tests/unit/test_annotators.py diff --git a/Makefile b/Makefile index 2b4a27d..22faf3f 100644 --- a/Makefile +++ b/Makefile @@ -40,7 +40,6 @@ install.dug: #test: Run all tests test: - # ${PYTHON} -m flake8 src ${PYTHON} -m pytest --doctest-modules src coverage run -m pytest tests diff --git a/src/dug/core/__init__.py b/src/dug/core/__init__.py index b353a92..6ce8aa1 100644 --- a/src/dug/core/__init__.py +++ b/src/dug/core/__init__.py @@ -15,7 +15,7 @@ from dug.core import annotators from dug.core.factory import DugFactory from dug.core.parsers import DugConcept, Parser, get_parser -from dug.core.annotators import DugAnnotator, Annotator, get_annotator +from dug.core.annotators import DugIdentifier, Annotator, get_annotator logger = logging.getLogger('dug') stdout_log_handler = logging.StreamHandler(sys.stdout) diff --git a/src/dug/core/annotators/__init__.py b/src/dug/core/annotators/__init__.py index 29b7eb4..aef4b69 100644 --- a/src/dug/core/annotators/__init__.py +++ b/src/dug/core/annotators/__init__.py @@ -3,9 +3,9 @@ import pluggy -from ._base import DugElement, DugConcept, Indexable, Annotator, FileAnnotator -from .monarch_annotator import AnnotatorMonarch - +from dug.config import Config +from ._base import DugIdentifier, Indexable, Annotator +from .monarch_annotator import AnnotateMonarch, PreprocessorMonarch, AnnotatorMonarch, NormalizerMonarch, SynonymFinderMonarch logger = logging.getLogger('dug') @@ -13,7 +13,7 @@ @hookimpl def define_annotators(annotator_dict: Dict[str, Annotator]): - annotator_dict["annotator-monarch"] = AnnotatorMonarch() + annotator_dict["annotator-monarch"] = build_monarch_annotator() class AnnotatorNotFoundException(Exception): @@ -32,4 +32,20 @@ def get_annotator(hook, annotator_name) -> Annotator: err_msg = f"Cannot find annotator of type '{annotator_name}'\n" \ f"Supported annotators: {', '.join(available_annotators.keys())}" logger.error(err_msg) - raise AnnotatorNotFoundException(err_msg) \ No newline at end of file + raise AnnotatorNotFoundException(err_msg) + +def build_monarch_annotator(config: Config) -> AnnotateMonarch: + print(**config.preprocessor) + preprocessor = PreprocessorMonarch(**config.preprocessor) + annotator = AnnotatorMonarch(**config.annotator) + normalizer = NormalizerMonarch(**config.normalizer) + synonym_finder = SynonymFinderMonarch(**config.synonym_service) + + annotator = 
AnnotateMonarch( + preprocessor=preprocessor, + annotator=annotator, + normalizer=normalizer, + synonym_finder=synonym_finder + ) + + return annotator diff --git a/src/dug/core/annotators/_base.py b/src/dug/core/annotators/_base.py index 8e8dcb8..70c85a6 100644 --- a/src/dug/core/annotators/_base.py +++ b/src/dug/core/annotators/_base.py @@ -3,7 +3,6 @@ from typing import Union, Callable, Any, Iterable, Awaitable, TypeVar, Generic from dug import utils as utils from requests import Session -from dug.config import Config as AnnotatorConfig logger = logging.getLogger('dug') @@ -24,8 +23,29 @@ def __init__(self, id, label, types=None, search_text="", description=""): self.synonyms = [] self.purl = "" + @property + def id_type(self): + return self.id.split(":")[0] + + def add_search_text(self, text): + # Add text only if it's unique and if not empty string + if text and text not in self.search_text: + self.search_text.append(text) + + def get_searchable_dict(self): + # Return a version of the identifier compatible with what's in ElasticSearch + es_ident = { + 'id': self.id, + 'label': self.label, + 'equivalent_identifiers': self.equivalent_identifiers, + 'type': self.types, + 'synonyms': self.synonyms + } + return es_ident + def jsonable(self): return self.__dict__ + def __str__(self): return json.dumps(self.__dict__, indent=2, default=utils.complex_handler) @@ -47,22 +67,6 @@ def __call__(self, value: Input, http_session: Session) -> Output: return result -# def build_annotator(self) -> DugAnnotator: - -# preprocessor = Preprocessor(**self.config.preprocessor) -# annotator = Annotate(**self.config.annotator) -# normalizer = Normalizer(**self.config.normalizer) -# synonym_finder = SynonymFinder(**self.config.synonym_service) - -# annotator = DugAnnotator( -# preprocessor=preprocessor, -# annotator=annotator, -# normalizer=normalizer, -# synonym_finder=synonym_finder -# ) - -# return annotator - Indexable = Union[DugIdentifier, AnnotatorSession] # Indexable = DugIdentifier Annotator = Callable[[Any], Iterable[Indexable]] diff --git a/src/dug/core/annotators/monarch_annotator.py b/src/dug/core/annotators/monarch_annotator.py index d58be67..47f5bcb 100644 --- a/src/dug/core/annotators/monarch_annotator.py +++ b/src/dug/core/annotators/monarch_annotator.py @@ -5,11 +5,11 @@ import urllib.parse from typing import TypeVar, Generic, Union, List, Tuple, Optional import bmt -import requests +# import requests from requests import Session -from ._base import DugIdentifier, AnnotatorSession, Input, AnnotatorConfig -from dug.core.annotators.utils.biolink_purl_util import BioLinkPURLerizer +from ._base import DugIdentifier, AnnotatorSession, Input +from .utils.biolink_purl_util import BioLinkPURLerizer logger = logging.getLogger('dug') @@ -19,18 +19,16 @@ class AnnotateMonarch: def __init__( self, - config: AnnotatorConfig, preprocessor: "PreprocessorMonarch", annotator: "AnnotatorMonarch", normalizer: "NormalizerMonarch", synonym_finder: "SynonymFinderMonarch", ontology_greenlist=[], ): - self.config = config - self.preprocessor = preprocessor(**self.config.preprocessor) - self.annotator = annotator(**self.config.annotator) - self.normalizer = normalizer(**self.config.normalizer) - self.synonym_finder = synonym_finder(**self.config.synonym_service) + self.preprocessor = preprocessor + self.annotator = annotator + self.normalizer = normalizer + self.synonym_finder = synonym_finder self.ontology_greenlist = ontology_greenlist self.norm_fails_file = "norm_fails.txt" self.anno_fails_file = "anno_fails.txt" 
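
With this change AnnotateMonarch no longer receives a Config object; the preprocessor, the Monarch annotator client, the normalizer, and the synonym finder are built from their config sections and passed in directly. A minimal sketch of the intended wiring, assuming the Config sections referenced above (preprocessor, annotator, normalizer, synonym_service) and a plain requests.Session in place of the cached session the factory normally builds; this mirrors build_monarch_annotator() in the hunk above and the construction used in tests/unit/test_annotators.py below, it is not part of the patch itself:

    from requests import Session

    from dug.config import Config
    from dug.core.annotators import (
        AnnotateMonarch,
        AnnotatorMonarch,
        NormalizerMonarch,
        PreprocessorMonarch,
        SynonymFinderMonarch,
    )

    config = Config.from_env()

    # Build each service client from its own config section, then hand them to
    # AnnotateMonarch, as build_monarch_annotator() does above.
    annotate = AnnotateMonarch(
        preprocessor=PreprocessorMonarch(**config.preprocessor),
        annotator=AnnotatorMonarch(**config.annotator),
        normalizer=NormalizerMonarch(**config.normalizer),
        synonym_finder=SynonymFinderMonarch(**config.synonym_service),
    )

    # annotate() preprocesses the text, fetches Monarch identifiers, normalizes
    # them, and attaches synonyms and a PURL to each resulting DugIdentifier.
    # The calls go out to the configured annotation, normalization, and
    # name-resolution services.
    identifiers = annotate.annotate("body mass index", http_session=Session())
    for ident in identifiers:
        print(ident.id, ident.label, ident.types)
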
diff --git a/src/dug/core/crawler.py b/src/dug/core/crawler.py index 744291d..26e0c4d 100644 --- a/src/dug/core/crawler.py +++ b/src/dug/core/crawler.py @@ -4,7 +4,7 @@ import traceback from dug.core.parsers import Parser, DugElement, DugConcept -from dug.core.annotators import Annotator, DugAnnotator +from dug.core.annotators import Annotator, DugIdentifier import dug.core.tranql as tql from dug.utils import biolink_snake_case, get_formatted_biolink_name @@ -145,6 +145,8 @@ def annotate_element(self, element): # Annotate with a set of normalized ontology identifiers identifiers = self.annotator.annotate(text=element.ml_ready_desc, http_session=self.http_session) + # Future thoughts... should we be passing in the stpe DugIdentifier here instead? + # Each identifier then becomes a concept that links elements together for identifier in identifiers: diff --git a/tests/unit/test_annotators.py b/tests/unit/test_annotators.py new file mode 100644 index 0000000..eb03534 --- /dev/null +++ b/tests/unit/test_annotators.py @@ -0,0 +1,266 @@ +from copy import copy +from typing import List + +import pytest + +from dug.config import Config +from dug.core.annotators import ( DugIdentifier, + AnnotateMonarch, + PreprocessorMonarch, + AnnotatorMonarch, + NormalizerMonarch, + SynonymFinderMonarch + ) +from unittest.mock import MagicMock + +def test_identifier(): + ident_1 = DugIdentifier( + "PrimaryIdent:1", "first identifier", types=[], search_text="", description="" + ) + + assert "PrimaryIdent" == ident_1.id_type + +def test_monarch_annotator(): + cfg = Config.from_env() + url = cfg.annotator["url"] + preprocessor = PreprocessorMonarch(**cfg.preprocessor) + annotator = AnnotatorMonarch(**cfg.annotator) + normalizer = NormalizerMonarch(**cfg.normalizer) + synonym_finder = SynonymFinderMonarch(**cfg.synonym_service) + + annotator = AnnotateMonarch( + preprocessor=preprocessor, + annotator=annotator, + normalizer=normalizer, + synonym_finder=synonym_finder + ) + # annotator = AnnotateMonarch() + assert annotator.annotate(text="Lama", http_session = MagicMock()) == url + +# @pytest.mark.parametrize( +# "preprocessor,input_text,expected_text", +# [ +# (Preprocessor(), "Hello_world", "Hello world"), +# (Preprocessor({"Hello": "Hi"}, ["placeholder"]), "Hello placeholder world", "Hi world"), +# ] +# ) +# def test_preprocessor_preprocess(preprocessor, input_text, expected_text): +# original_text = copy(input_text) +# output_text = preprocessor.preprocess(input_text) + +# assert input_text == original_text # Don't modify in-place +# assert output_text == expected_text + + +# def test_annotator_init(): +# cfg = Config.from_env() +# url = cfg.annotator["url"] + +# annotator = Annotator(**cfg.annotator) +# assert annotator.url == url + + +# def test_annotator_handle_response(): +# annotator = Annotator('foo') + +# response = { +# "content": "heart attack", +# "spans": [ +# { +# "start": 0, +# "end": 5, +# "text": "heart", +# "token": [ +# { +# "id": "UBERON:0015230", +# "category": [ +# "anatomical entity" +# ], +# "terms": [ +# "dorsal vessel heart" +# ] +# } +# ] +# }, +# { +# "start": 0, +# "end": 5, +# "text": "heart", +# "token": [ +# { +# "id": "UBERON:0007100", +# "category": [ +# "anatomical entity" +# ], +# "terms": [ +# "primary circulatory organ" +# ] +# } +# ] +# }, +# { +# "start": 0, +# "end": 5, +# "text": "heart", +# "token": [ +# { +# "id": "UBERON:0015228", +# "category": [ +# "anatomical entity" +# ], +# "terms": [ +# "circulatory organ" +# ] +# } +# ] +# }, +# { +# "start": 0, +# "end": 5, +# 
"text": "heart", +# "token": [ +# { +# "id": "ZFA:0000114", +# "category": [ +# "anatomical entity" +# ], +# "terms": [ +# "heart" +# ] +# } +# ] +# }, +# { +# "start": 0, +# "end": 5, +# "text": "heart", +# "token": [ +# { +# "id": "UBERON:0000948", +# "category": [ +# "anatomical entity" +# ], +# "terms": [ +# "heart" +# ] +# } +# ] +# }, +# { +# "start": 0, +# "end": 12, +# "text": "heart attack", +# "token": [ +# { +# "id": "MONDO:0005068", +# "category": [ +# "disease" +# ], +# "terms": [ +# "myocardial infarction (disease)" +# ] +# } +# ] +# }, +# { +# "start": 0, +# "end": 12, +# "text": "heart attack", +# "token": [ +# { +# "id": "HP:0001658", +# "category": [ +# "phenotype", +# "quality" +# ], +# "terms": [ +# "Myocardial infarction" +# ] +# } +# ] +# } +# ] +# } + +# identifiers: List[DugIdentifier] = annotator.handle_response(None, response) + +# assert len(identifiers) == 7 +# assert isinstance(identifiers[0], DugIdentifier) + + +# def test_annotator_call(annotator_api): +# url = "http://annotator.api/?content=" + +# annotator = Annotator(url) + +# text = "heart attack" +# identifiers: List[DugIdentifier] = annotator.annotate(text, annotator_api) + +# assert len(identifiers) == 7 +# assert isinstance(identifiers[0], DugIdentifier) + + +# def test_normalizer(normalizer_api): +# url = "http://normalizer.api/?curie=" + +# identifier = DugIdentifier( +# "UBERON:0007100", +# label='primary circulatory organ', +# types=['anatomical entity'], +# description="", +# search_text=['heart'], +# ) + +# normalizer = Normalizer(url) +# output = normalizer.normalize(identifier, normalizer_api) +# assert isinstance(output, DugIdentifier) +# assert output.id == 'UBERON:0007100' +# assert output.label == "primary circulatory organ" +# assert output.equivalent_identifiers == ['UBERON:0007100'] +# assert output.types == 'anatomical entity' + + + +# def test_synonym_finder(synonym_api): +# curie = "UBERON:0007100" +# url = f"http://synonyms.api" +# finder = SynonymFinder(url) +# result = finder.get_synonyms( +# curie, +# synonym_api, +# ) +# assert result == [ +# "primary circulatory organ", +# "dorsal tube", +# "adult heart", +# "heart" +# ] + + + + + +# def test_yield_partial_text(): +# annotator = Annotator('foo') +# # text contains 800 characters + 9 new lines +# text = """COG Protocol number on which the patient was enrolled [901=Trial of mouse monoclonal Anti-GD-2 antibody 14.G2A plus IL-2 with or without GM-CSF in children with refractory NBL or melanoma; 911=I-131-MIBG for therapy of advanced neuroblastoma; 914=A dose escalation study of cisplatin, doxorubicin, VP-16, and ifosfamide followed by GM-CSF in advanced NBL and peripheral neuroepithelioma; 925=Study of topotecan; 935=Study of ch14.18 with GM-CSF in children with NBL and other GD2 positive malignancies immediately post ABMT or PBSC; 937=Phase I trial of ZD1694, an inhibitor of thymidylate synthase, in pediatric patients with advanced neoplastic disease; 9709=A phase I study of fenretinide in children with high risk solid tumors; 321P2=New intensive chemotherapy for CCG stage II (with N-myc amplification), stage III and stage IV neuroblastoma; 321P3=Treatment of poor prognosis neuroblastoma before disease progression with intensive multimodal therapy and BMT; 323P=Cyclic combination chemotherapy for newly diagnosed stage III neuroblastoma age 2 and older and stage IV Nneuroblastoma all ages; 3881=Biology and therapy of good, intermediate, and selected poor prognosis neuroblastoma; 3891=Conventional dose chemoradiotherapy vs ablative 
chemoradiotherapy with autologous BMT for high-risk neuroblastoma; 3951=Phase I pilot study of multiple cycles of high dose chemotherapy with peripheral blood stem cell infusions in advanced stage neuroblastoma.; 4941=National Wilms tumor study V - therapeutic trial & biology study; 8605=Study of the combination of ifosfamide, mesna, and VP-16 in children and young adults with recurrent sarcomas, PNET and other tumors; 8742=Phase III portion of 8741 for neuroblastoma; 9047=Neuroblastoma biology protocol; 9082=Protocol for the development of intervention strategies to reduce the time between symptom onset and diagnosis of childhood cancer -a pediatric oncology group cancer control study; 9140=Therapy for patients with recurrent or refractory neuroblastoma - a phase II study; 9262=A Phase II study of taxol in children with recurrent/refractory soft-tissue sarcoma, rhabdomyosarcoma, osteosarcoma, Ewing's sarcoma, neuroblastoma, germ cell tumors, Wilms' tumor, hepatoblastoma, and hepatocellular carcinoma, a POG study; 9280=Neuroblastoma epidemiology protocol - A Non-Therapeutic Study - A Joint Project of: The University of North Carolina, The Pediatric Oncology Group and The Children's Cancer Study Group; 9340=Treatment of patients >365 days at diagnosis with stage IV NBL: Upfront Phase II Window - A Phase II Study; 9341=Treatment of patients >365 days at diagnosis with stage IV and stage IIB/III (N-myc) NBL - a phase III study; 9342=Neuroblastoma #5, bone marrow transplant - a phase III study; 9343=Interleukin-6 in children receiving autologous bone marrow transplantation for advanced neuroblastoma - a pediatric oncology group phase I trial; 9361=Topotecan in pediatric patients with recurrent or progressive solid tumors - a pediatric oncology group phase II study; 9375=Topotecan plus cyclophosphamide in children with solid tumors - a pediatric oncology group phase I trial; 9464=Cyclophosphamide plus topotecan in children with recurrent or refractory solid tumors - a pediatric oncology group phase II study; 9640=Treatment of patients with high risk neuroblastoma (a feasibility pilot) using two cycles of marrow ablative chemotherapy followed by rescue With peripheral blood stem cells (PBSC), radiation therapy; A3973=A randomized study of purged vs. 
unpurged PBSC transplant following dose intensive induction therapy for high risk NBL; AADM01P1=Protocol for registration and consent to the childhood cancer research network: a limited institution pilot; AAML00P2=A dose finding study of the safety of gemtuzumab ozogamicin combined with conventional chemotherapy for patients with relapsed or refractory acute myeloid leukemia; ACCL0331=A Randomized double blind placebo controlled clinical trial to assess the efficacy of traumeelĀ® S (IND # 66649) for the prevention and treatment of mucositis in children undergoing hematopoietic stem cell transplantation; ACCRN07=Protocol for the enrollment on the official COG registry, The Childhood Cancer Research Network (CCRN); ADVL0018=Phase I study of hu14.18-IL2 fusion protein in patients with refractory neuroblastoma and other refractory GD2 expressing tumors; ADVL0212=A Phase I study of depsipeptide (NSC#630176, IND# 51810) in pediatric patients with refractory solid tumors and leukemias; ADVL0214=A phase I study of single agent OSI-774 (Tarceva) (NSC # 718781, IND #63383) followed by OSI-774 with temozolomide for patients with selected recurrent/refractory solid tumors, including brain tumors; ADVL0215=A phase I study of decitabine in combination with doxorubicin and cyclophosphamide in the treatment of relapsed or refractory solid tumors; ADVL0421=A phase II study of oxaliplatin in children with recurrent solid tumors; ADVL0524=Phase II trial of ixabepilone (BMS-247550), an epothilone B analog, in children and young adults with refractory solid tumors; ADVL0525=A phase II study of pemetrexed in children with recurrent malignancies; ADVL06B1=A pharmacokinetic-pharmacodynamic-pharmacogenetic study of actinomycin-D and vincristine in children with cancer; ADVL0714=A phase I study of VEGF trap (NSC# 724770, IND# 100137) in children with refractory solid tumors; ALTE03N1=Key adverse events after childhood cancer; ALTE05N1=Umbrella long-term follow-up protocol; ANBL0032=Phase III randomized study of chimeric antibody 14.18 (Ch14.18) in high risk neuroblastoma following myeloablative therapy and autologous stem cell rescue; ANBL00B1=Neuroblastoma biology studies; ANBL00P1=A pilot study of tandem high dose chemotherapy with stem cell rescue following induction therapy in children with high risk neuroblastoma; ANBL02P1=A pilot induction regimen incorporating dose-intensive topotecan and cyclophosphamide for treatment of newly diagnosed high risk neuroblastoma; ANBL0321=Phase II study of fenretinide in pediatric patients with resistant or recurrent neuroblastoma; ANBL0322=A phase II study of hu14.18-IL2 (BB-IND-9728) in children with recurrent or refractory neuroblastoma; ANBL0532=Phase III randomized trial of single vs. 
tandem myeloablative as consolidation therapy for high-risk neuroblastoma; ANBL0621=A phase II study of ABT-751, an orally bioavailable tubulin binding agent, in children with relapsed or refractory neuroblastoma; B003=Diagnostic & prognostic studies in NBL; B903=Childhood cancer genetics; B947=Protocol for collection of biology specimens for research studies; B954=Opsoclonus-myoclonus-ataxia syndrome, neuroblastoma and the presence of anti-neuronal antibodies; B973=Laboratory-clinical studies of neuroblastoma; E04=Self-administered epidemiology questionnaire; E18=A case-control study of risk factors for neuroblastoma; I03=Neuroblastoma, diagnostic/prognostic; N891=Parents' perceptions of randomization; P9462=Randomized treatment of recurrent neuroblastoma with topotecan regimens following desferrioxamine (POG only) in an investigational window; P9641=Primary surgical therapy for biologically defined low-risk neuroblastoma; P9761=A phase II trial of irinotecan in children with refractory solid tumors; P9963=A phase II trial of rebeccamycin analogue (NSC #655649) in children with solid tumors; R9702=Prognostic implications of MIBG uptake in patients with neuroblastoma previously treated on CCG-3891; S31=Right atrial catheter study; S921=Comparison of urokinase vs heparin in preventing Infection in central venous devices in children with malignancies]""" +# chunks = "" +# is_the_beginning = True +# max_chars = 2000 +# padding_words = 3 +# counter = 0 +# print(len(text)) +# # divvy up into chunks, sum of each chunk should equal the original text. +# for chunk in annotator.sliding_window(text=text, max_characters=max_chars, padding_words= padding_words): +# assert len(chunk) <= max_chars +# counter += 1 +# if is_the_beginning: +# chunks += chunk +# else: +# # remove redundand padded words from final result +# chunks += " ".join(chunk.split(" ")[padding_words:]) +# is_the_beginning = False + +# print(counter) +# # since spaces are trimmed by tokenizer , we can execuled all spaces and do char +# assert chunks == text \ No newline at end of file diff --git a/tests/unit/test_parsers.py b/tests/unit/test_parsers.py index c37df40..0755fed 100644 --- a/tests/unit/test_parsers.py +++ b/tests/unit/test_parsers.py @@ -1,12 +1,13 @@ -from dug.core.annotate import Identifier from dug.core.parsers._base import DugElement, DugConcept +from dug.core.annotate import Identifier as DugIdentifier +from dug.core.annotators.monarch_annotator import AnnotateMonarch def test_dug_concept(): concept = DugConcept("concept-1", 'Concept-1', 'The first concept', 'secondary') - ident_1 = Identifier("ident-1", "Identifier-1") - ident_2 = Identifier("ident-2", "Identifier-2") + ident_1 = DugIdentifier("ident-1", "Identifier-1") + ident_2 = DugIdentifier("ident-2", "Identifier-2") concept.add_identifier(ident_1) concept.add_identifier(ident_2) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index fd841a8..df6f9e9 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1,33 +1,33 @@ -import pytest +# import pytest -from dug.utils import get_nida_study_link -import requests +# from dug.utils import get_nida_study_link +# import requests -@pytest.mark.skip("Implement this test") -def test_object_factory(): - pass +# @pytest.mark.skip("Implement this test") +# def test_object_factory(): +# pass -@pytest.mark.skip("Implement this test") -def test_complex_handler(): - pass +# @pytest.mark.skip("Implement this test") +# def test_complex_handler(): +# pass -@pytest.mark.skip("Implement this test") 
-def test_get_dbgap_var_link(): - pass +# @pytest.mark.skip("Implement this test") +# def test_get_dbgap_var_link(): +# pass -@pytest.mark.skip("Implement this test") -def test_get_dbgap_study_link(): - pass +# @pytest.mark.skip("Implement this test") +# def test_get_dbgap_study_link(): +# pass -def test_get_nida_study_link(): - study_id = "NIDA-CPU-0008" - link = get_nida_study_link(study_id=study_id) - response = requests.post( - url=link - ) - content = str(response.text) - assert content.count(study_id) > 0 +# def test_get_nida_study_link(): +# study_id = "NIDA-CPU-0008" +# link = get_nida_study_link(study_id=study_id) +# response = requests.post( +# url=link +# ) +# content = str(response.text) +# assert content.count(study_id) > 0 From f8a45e1562decedd64949e6996fc0912c3853f00 Mon Sep 17 00:00:00 2001 From: braswent Date: Fri, 27 Oct 2023 16:37:50 -0400 Subject: [PATCH 05/85] feat: --- src/dug/core/annotators/__init__.py | 23 +- src/dug/core/annotators/_base.py | 103 ++++++++- src/dug/core/annotators/monarch_annotator.py | 221 ++++--------------- src/dug/core/crawler.py | 2 +- tests/unit/test_annotators.py | 9 +- 5 files changed, 164 insertions(+), 194 deletions(-) diff --git a/src/dug/core/annotators/__init__.py b/src/dug/core/annotators/__init__.py index aef4b69..6984594 100644 --- a/src/dug/core/annotators/__init__.py +++ b/src/dug/core/annotators/__init__.py @@ -4,8 +4,8 @@ import pluggy from dug.config import Config -from ._base import DugIdentifier, Indexable, Annotator -from .monarch_annotator import AnnotateMonarch, PreprocessorMonarch, AnnotatorMonarch, NormalizerMonarch, SynonymFinderMonarch +from ._base import DugIdentifier, Indexable, Annotator, DefaultNormalizer, DefaultSynonymFinder +from .monarch_annotator import AnnotateMonarch logger = logging.getLogger('dug') @@ -13,7 +13,7 @@ @hookimpl def define_annotators(annotator_dict: Dict[str, Annotator]): - annotator_dict["annotator-monarch"] = build_monarch_annotator() + annotator_dict["annotator-monarch"] = build_annotator() class AnnotatorNotFoundException(Exception): @@ -34,18 +34,15 @@ def get_annotator(hook, annotator_name) -> Annotator: logger.error(err_msg) raise AnnotatorNotFoundException(err_msg) -def build_monarch_annotator(config: Config) -> AnnotateMonarch: - print(**config.preprocessor) - preprocessor = PreprocessorMonarch(**config.preprocessor) - annotator = AnnotatorMonarch(**config.annotator) - normalizer = NormalizerMonarch(**config.normalizer) - synonym_finder = SynonymFinderMonarch(**config.synonym_service) +def build_annotator(): + # annotator = AnnotatorMonarch(**config.annotator) + # normalizer = NormalizerMonarch(**config.normalizer) + # synonym_finder = SynonymFinderMonarch(**config.synonym_service) + config = Config annotator = AnnotateMonarch( - preprocessor=preprocessor, - annotator=annotator, - normalizer=normalizer, - synonym_finder=synonym_finder + normalizer=DefaultNormalizer(**config.normalizer), + synonym_finder=DefaultSynonymFinder(**config.synonym_service) ) return annotator diff --git a/src/dug/core/annotators/_base.py b/src/dug/core/annotators/_base.py index 70c85a6..6493aa9 100644 --- a/src/dug/core/annotators/_base.py +++ b/src/dug/core/annotators/_base.py @@ -1,8 +1,12 @@ import json import logging -from typing import Union, Callable, Any, Iterable, Awaitable, TypeVar, Generic +import re +import logging +import urllib.parse +from typing import Union, Callable, Any, Iterable, TypeVar, Generic, List, Optional from dug import utils as utils from requests import Session +import bmt logger 
= logging.getLogger('dug') @@ -67,7 +71,102 @@ def __call__(self, value: Input, http_session: Session) -> Output: return result -Indexable = Union[DugIdentifier, AnnotatorSession] +class DefaultNormalizer(AnnotatorSession[DugIdentifier, DugIdentifier]): + def __init__(self, url): + self.bl_toolkit = bmt.Toolkit() + self.url = url + + def __call__(self, identifier: DugIdentifier, http_session: Session): + # Use RENCI's normalization API service to get the preferred version of an identifier + logger.debug(f"Normalizing: {identifier.id}") + return self(identifier, http_session) + + def make_request(self, value: DugIdentifier, http_session: Session) -> dict: + curie = value.id + url = f"{self.url}{urllib.parse.quote(curie)}" + try: + response = http_session.get(url) + except Exception as get_exc: + logger.info(f"Error normalizing {value} at {url}") + logger.error(f"Error {get_exc.__class__.__name__}: {get_exc}") + return {} + try: + normalized = response.json() + except Exception as json_exc: + logger.info(f"Error processing response: {response.text} (HTTP {response.status_code})") + logger.error(f"Error {json_exc.__class__.__name__}: {json_exc}") + return {} + + return normalized + + def handle_response(self, identifier: DugIdentifier, normalized: dict) -> Optional[DugIdentifier]: + """ Record normalized results. """ + curie = identifier.id + normalization = normalized.get(curie, {}) + if normalization is None: + logger.info(f"Normalization service did not return normalization for: {curie}") + return None + + preferred_id = normalization.get("id", {}) + equivalent_identifiers = normalization.get("equivalent_identifiers", []) + biolink_type = normalization.get("type", []) + + # Return none if there isn't actually a preferred id + if 'identifier' not in preferred_id: + logger.debug(f"ERROR: normalize({curie})=>({preferred_id}). No identifier?") + return None + + logger.debug(f"Preferred id: {preferred_id}") + identifier.id = preferred_id.get('identifier', '') + identifier.label = preferred_id.get('label', '') + identifier.description = preferred_id.get('description', '') + identifier.equivalent_identifiers = [v['identifier'] for v in equivalent_identifiers] + try: + identifier.types = self.bl_toolkit.get_element(biolink_type[0]).name + except: + # converts biolink:SmallMolecule to small molecule + identifier.types = (" ".join(re.split("(?=[A-Z])", biolink_type[0].replace('biolink:', ''))[1:])).lower() + return identifier + + +class DefaultSynonymFinder(AnnotatorSession[str, List[str]]): + + def __init__(self, url: str): + self.url = url + + def get_identifier_synonyms(self, curie: str, http_session): + ''' + This function uses the NCATS translator service to return a list of synonyms for + curie id + ''' + + return self(curie, http_session) + + def make_request(self, curie: str, http_session: Session): + # Get response from namelookup reverse lookup op + # example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post) + url = f"{self.url}" + payload = { + 'curies': [curie] + } + try: + response = http_session.post(url, json=payload) + if str(response.status_code).startswith('4'): + logger.error(f"No synonyms returned for: `{curie}`. Validation error: {response.text}") + return {curie: []} + if str(response.status_code).startswith('5'): + logger.error(f"No synonyms returned for: `{curie}`. Internal server error from {self.url}. 
Error: {response.text}") + return {curie: []} + return response.json() + except json.decoder.JSONDecodeError as e: + logger.error(f"Json parse error for response from `{url}`. Exception: {str(e)}") + return {curie: []} + + def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]: + # Return curie synonyms + return raw_synonyms.get(curie, []) + +Indexable = Union[DugIdentifier, DugAnnotator, AnnotatorSession] # Indexable = DugIdentifier Annotator = Callable[[Any], Iterable[Indexable]] # Annotator = Callable[[Any], Iterable[DugIdentifier]] \ No newline at end of file diff --git a/src/dug/core/annotators/monarch_annotator.py b/src/dug/core/annotators/monarch_annotator.py index 47f5bcb..1294997 100644 --- a/src/dug/core/annotators/monarch_annotator.py +++ b/src/dug/core/annotators/monarch_annotator.py @@ -4,8 +4,7 @@ import re import urllib.parse from typing import TypeVar, Generic, Union, List, Tuple, Optional -import bmt -# import requests +from dug.config import Config from requests import Session from ._base import DugIdentifier, AnnotatorSession, Input @@ -19,27 +18,34 @@ class AnnotateMonarch: def __init__( self, - preprocessor: "PreprocessorMonarch", - annotator: "AnnotatorMonarch", - normalizer: "NormalizerMonarch", - synonym_finder: "SynonymFinderMonarch", + normalizer, + synonym_finder, ontology_greenlist=[], ): - self.preprocessor = preprocessor - self.annotator = annotator + self.annotatorUrl = Config.annotator.url self.normalizer = normalizer self.synonym_finder = synonym_finder self.ontology_greenlist = ontology_greenlist self.norm_fails_file = "norm_fails.txt" self.anno_fails_file = "anno_fails.txt" - def annotate(self, text, http_session): + if debreviator is None: + debreviator = self.default_debreviator_factory() + self.decoder = debreviator + + if stopwords is None: + stopwords = [] + self.stopwords = stopwords + + def __call__(self, text, http_session) -> List[DugIdentifier]: # Preprocess text (debraviate, remove stopwords, etc.) 
- text = self.preprocessor.preprocess(text) + text = self.preprocess_text(text) # Fetch identifiers - raw_identifiers = self.annotator.annotate(text, http_session) + raw_identifiers = [] + for chunk_text in self.sliding_window(text): + raw_identifiers += self(chunk_text, http_session) # Write out to file if text fails to annotate if not raw_identifiers: @@ -50,7 +56,7 @@ def annotate(self, text, http_session): for identifier in raw_identifiers: # Normalize identifier using normalization service - norm_id = self.normalizer.normalize(identifier, http_session) + norm_id = self.normalizer(identifier, http_session) # Skip adding id if it doesn't normalize if norm_id is None: @@ -66,78 +72,14 @@ def annotate(self, text, http_session): norm_id = identifier # Add synonyms to identifier - norm_id.synonyms = self.synonym_finder.get_synonyms(norm_id.id, http_session) + norm_id.synonyms = self.synonym_finder.get_identifier_synonyms(norm_id.id, http_session) # Get pURL for ontology identifer for more info norm_id.purl = BioLinkPURLerizer.get_curie_purl(norm_id.id) processed_identifiers.append(norm_id) return processed_identifiers - -class PreprocessorMonarch: - """"Class for preprocessing strings so they are better interpreted by NLP steps""" - - def __init__(self, debreviator=None, stopwords=None): - if debreviator is None: - debreviator = self.default_debreviator_factory() - self.decoder = debreviator - - if stopwords is None: - stopwords = [] - self.stopwords = stopwords - - def preprocess(self, text: str) -> str: - """ - Apply debreviator to replace abbreviations and other characters - - >>> pp = PreprocessorMonarch({"foo": "bar"}, ["baz"]) - >>> pp.preprocess("Hello foo") - 'Hello bar' - - >>> pp.preprocess("Hello baz world") - 'Hello world' - """ - - for key, value in self.decoder.items(): - text = text.replace(key, value) - - # Remove any stopwords - text = " ".join([word for word in text.split() if word not in self.stopwords]) - return text - - @staticmethod - def default_debreviator_factory(): - return {"bmi": "body mass index", "_": " "} - - -# Input = TypeVar("Input") -# Output = TypeVar("Output") - - -# class ApiClient(Generic[Input, Output]): - -# def make_request(self, value: Input, http_session: Session): -# raise NotImplementedError() - -# def handle_response(self, value, response: Union[dict, list]) -> Output: -# raise NotImplementedError() - -# def __call__(self, value: Input, http_session: Session) -> Output: -# response = self.make_request(value, http_session) - -# result = self.handle_response(value, response) - -# return result - - -class AnnotatorMonarch(AnnotatorSession[str, List[DugIdentifier]]): - """ - Use monarch API service to fetch ontology IDs found in text - """ - - def __init__(self, url: str): - self.url = url - + def sliding_window(self, text, max_characters=2000, padding_words=5): """ For long texts sliding window works as the following @@ -166,16 +108,16 @@ def sliding_window(self, text, max_characters=2000, padding_words=5): window_end = True yield current_string - def annotate(self, text, http_session): - logger.debug(f"Annotating: {text}") - identifiers = [] - for chunk_text in self.sliding_window(text): - identifiers += self(chunk_text, http_session) - return identifiers + # def annotate_text(self, text, http_session): + # logger.debug(f"Annotating: {text}") + # identifiers = [] + # for chunk_text in self.sliding_window(text): + # identifiers += self(chunk_text, http_session) + # return identifiers def make_request(self, value: Input, http_session: Session): 
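        # Illustration only (the URL value is an assumption borrowed from the test
        # fixtures): with self.annotatorUrl == "http://annotator.api/?content=" and
        # value == "heart attack", urllib.parse.quote(value) yields "heart%20attack",
        # so the GET below is issued against
        # "http://annotator.api/?content=heart%20attack".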
value = urllib.parse.quote(value) - url = f'{self.url}{value}' + url = f'{self.annotatorUrl}{value}' # This could be moved to a config file NUM_TRIES = 5 @@ -207,99 +149,26 @@ def handle_response(self, value, response: dict) -> List[DugIdentifier]: types=biolink_types, search_text=search_text)) return identifiers + + def preprocess_text(self, text: str) -> str: + """ + Apply debreviator to replace abbreviations and other characters + >>> pp = PreprocessorMonarch({"foo": "bar"}, ["baz"]) + >>> pp.preprocess("Hello foo") + 'Hello bar' -class NormalizerMonarch(AnnotatorSession[DugIdentifier, DugIdentifier]): - def __init__(self, url): - self.bl_toolkit = bmt.Toolkit() - self.url = url - - def normalize(self, identifier: DugIdentifier, http_session: Session): - # Use RENCI's normalization API service to get the preferred version of an identifier - logger.debug(f"Normalizing: {identifier.id}") - return self(identifier, http_session) - - def make_request(self, value: DugIdentifier, http_session: Session) -> dict: - curie = value.id - url = f"{self.url}{urllib.parse.quote(curie)}" - try: - response = http_session.get(url) - except Exception as get_exc: - logger.info(f"Error normalizing {value} at {url}") - logger.error(f"Error {get_exc.__class__.__name__}: {get_exc}") - return {} - try: - normalized = response.json() - except Exception as json_exc: - logger.info(f"Error processing response: {response.text} (HTTP {response.status_code})") - logger.error(f"Error {json_exc.__class__.__name__}: {json_exc}") - return {} - - return normalized - - def handle_response(self, identifier: DugIdentifier, normalized: dict) -> Optional[DugIdentifier]: - """ Record normalized results. """ - curie = identifier.id - normalization = normalized.get(curie, {}) - if normalization is None: - logger.info(f"Normalization service did not return normalization for: {curie}") - return None - - preferred_id = normalization.get("id", {}) - equivalent_identifiers = normalization.get("equivalent_identifiers", []) - biolink_type = normalization.get("type", []) - - # Return none if there isn't actually a preferred id - if 'identifier' not in preferred_id: - logger.debug(f"ERROR: normalize({curie})=>({preferred_id}). 
No identifier?") - return None - - logger.debug(f"Preferred id: {preferred_id}") - identifier.id = preferred_id.get('identifier', '') - identifier.label = preferred_id.get('label', '') - identifier.description = preferred_id.get('description', '') - identifier.equivalent_identifiers = [v['identifier'] for v in equivalent_identifiers] - try: - identifier.types = self.bl_toolkit.get_element(biolink_type[0]).name - except: - # converts biolink:SmallMolecule to small molecule - identifier.types = (" ".join(re.split("(?=[A-Z])", biolink_type[0].replace('biolink:', ''))[1:])).lower() - return identifier - - -class SynonymFinderMonarch(AnnotatorSession[str, List[str]]): - - def __init__(self, url: str): - self.url = url - - def get_synonyms(self, curie: str, http_session): - ''' - This function uses the NCATS translator service to return a list of synonyms for - curie id - ''' + >>> pp.preprocess("Hello baz world") + 'Hello world' + """ - return self(curie, http_session) + for key, value in self.decoder.items(): + text = text.replace(key, value) - def make_request(self, curie: str, http_session: Session): - # Get response from namelookup reverse lookup op - # example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post) - url = f"{self.url}" - payload = { - 'curies': [curie] - } - try: - response = http_session.post(url, json=payload) - if str(response.status_code).startswith('4'): - logger.error(f"No synonyms returned for: `{curie}`. Validation error: {response.text}") - return {curie: []} - if str(response.status_code).startswith('5'): - logger.error(f"No synonyms returned for: `{curie}`. Internal server error from {self.url}. Error: {response.text}") - return {curie: []} - return response.json() - except json.decoder.JSONDecodeError as e: - logger.error(f"Json parse error for response from `{url}`. Exception: {str(e)}") - return {curie: []} + # Remove any stopwords + text = " ".join([word for word in text.split() if word not in self.stopwords]) + return text - def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]: - # Return curie synonyms - return raw_synonyms.get(curie, []) \ No newline at end of file + @staticmethod + def default_debreviator_factory(): + return {"bmi": "body mass index", "_": " "} \ No newline at end of file diff --git a/src/dug/core/crawler.py b/src/dug/core/crawler.py index 26e0c4d..40140d8 100644 --- a/src/dug/core/crawler.py +++ b/src/dug/core/crawler.py @@ -143,7 +143,7 @@ def annotate_elements(self): def annotate_element(self, element): # Annotate with a set of normalized ontology identifiers - identifiers = self.annotator.annotate(text=element.ml_ready_desc, + identifiers = self.annotator(text=element.ml_ready_desc, http_session=self.http_session) # Future thoughts... should we be passing in the stpe DugIdentifier here instead? 
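For orientation, a minimal sketch (not part of the patch) of how the now-callable annotator is meant to be driven; the example text and the bare requests.Session are assumptions, and the constructor shown is the form it takes once the follow-up commit in this series adds a config argument:

    import requests

    from dug.config import Config
    from dug.core.annotators import DefaultNormalizer, DefaultSynonymFinder
    from dug.core.annotators.monarch_annotator import AnnotateMonarch

    config = Config.from_env()
    annotator = AnnotateMonarch(
        normalizer=DefaultNormalizer(**config.normalizer),
        synonym_finder=DefaultSynonymFinder(**config.synonym_service),
        config=config,
    )

    # The annotator is invoked directly as a callable, mirroring the
    # crawler.annotate_element change above.
    identifiers = annotator(text="heart attack", http_session=requests.Session())
    for ident in identifiers:
        print(ident.id, ident.label, ident.types)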
diff --git a/tests/unit/test_annotators.py b/tests/unit/test_annotators.py index eb03534..f4ffa0f 100644 --- a/tests/unit/test_annotators.py +++ b/tests/unit/test_annotators.py @@ -20,7 +20,7 @@ def test_identifier(): assert "PrimaryIdent" == ident_1.id_type -def test_monarch_annotator(): +def test_monarch_annotator(annotator_api): cfg = Config.from_env() url = cfg.annotator["url"] preprocessor = PreprocessorMonarch(**cfg.preprocessor) @@ -35,7 +35,12 @@ def test_monarch_annotator(): synonym_finder=synonym_finder ) # annotator = AnnotateMonarch() - assert annotator.annotate(text="Lama", http_session = MagicMock()) == url + # assert annotator.annotation_step(text="Lama", http_session = MagicMock()) == url + text = "heart attack" + identifiers: List[DugIdentifier] = annotator(text, annotator_api) + + assert len(identifiers) == 7 + assert isinstance(identifiers[0], DugIdentifier) # @pytest.mark.parametrize( # "preprocessor,input_text,expected_text", From 3a9c710261d97002f751faf11529423b9d9ac3e8 Mon Sep 17 00:00:00 2001 From: braswent Date: Tue, 31 Oct 2023 09:39:29 -0400 Subject: [PATCH 06/85] feat: --- src/dug/core/annotators/__init__.py | 9 +- src/dug/core/annotators/_base.py | 118 ++++--- src/dug/core/annotators/monarch_annotator.py | 44 +-- src/dug/core/crawler.py | 1 + tests/integration/test_annotators.py | 271 ++++++++++++++++ tests/unit/mocks/MockCrawler.py | 2 +- tests/unit/mocks/data/test_config.py | 33 ++ tests/unit/test_annotate.py | 1 + tests/unit/test_annotators.py | 317 ++++++------------- tests/unit/test_core/test_search.py | 1 + tests/unit/test_crawler.py | 2 +- 11 files changed, 513 insertions(+), 286 deletions(-) create mode 100644 tests/integration/test_annotators.py create mode 100644 tests/unit/mocks/data/test_config.py diff --git a/src/dug/core/annotators/__init__.py b/src/dug/core/annotators/__init__.py index 6984594..a617be8 100644 --- a/src/dug/core/annotators/__init__.py +++ b/src/dug/core/annotators/__init__.py @@ -35,14 +35,11 @@ def get_annotator(hook, annotator_name) -> Annotator: raise AnnotatorNotFoundException(err_msg) def build_annotator(): - - # annotator = AnnotatorMonarch(**config.annotator) - # normalizer = NormalizerMonarch(**config.normalizer) - # synonym_finder = SynonymFinderMonarch(**config.synonym_service) - config = Config + config = Config.from_env() annotator = AnnotateMonarch( normalizer=DefaultNormalizer(**config.normalizer), - synonym_finder=DefaultSynonymFinder(**config.synonym_service) + synonym_finder=DefaultSynonymFinder(**config.synonym_service), + config=config, ) return annotator diff --git a/src/dug/core/annotators/_base.py b/src/dug/core/annotators/_base.py index 6493aa9..645060f 100644 --- a/src/dug/core/annotators/_base.py +++ b/src/dug/core/annotators/_base.py @@ -8,13 +8,21 @@ from requests import Session import bmt -logger = logging.getLogger('dug') +logger = logging.getLogger("dug") logging.getLogger("requests").setLevel(logging.WARNING) logging.getLogger("urllib3").setLevel(logging.WARNING) class DugIdentifier: + """ The Dug Identifier is the core piece of information about a concept that produced from a request to an annotator based on a some original source of data. + \n The information that is being stored is mostly meant to support the Monarch API but should be adjusted accordingly to suit new Annotators needs in the future. 
+ \n The information that will be needed for all annotators are: + \n id: The CURIE identifier + \n label: The CURIE identifier + \n description: The CURIE identifier + \n When there is another supported Normalizer it will be seperated into a separate plugin like annotator. + """ def __init__(self, id, label, types=None, search_text="", description=""): self.id = id self.label = label @@ -39,25 +47,26 @@ def add_search_text(self, text): def get_searchable_dict(self): # Return a version of the identifier compatible with what's in ElasticSearch es_ident = { - 'id': self.id, - 'label': self.label, - 'equivalent_identifiers': self.equivalent_identifiers, - 'type': self.types, - 'synonyms': self.synonyms + "id": self.id, + "label": self.label, + "equivalent_identifiers": self.equivalent_identifiers, + "type": self.types, + "synonyms": self.synonyms, } return es_ident def jsonable(self): return self.__dict__ - + def __str__(self): return json.dumps(self.__dict__, indent=2, default=utils.complex_handler) + Input = TypeVar("Input") Output = TypeVar("Output") -class AnnotatorSession(Generic[Input, Output]): +class AnnotatorSession(Generic[Input, Output]): def make_request(self, value: Input, http_session: Session): raise NotImplementedError() @@ -71,15 +80,23 @@ def __call__(self, value: Input, http_session: Session) -> Output: return result -class DefaultNormalizer(AnnotatorSession[DugIdentifier, DugIdentifier]): + +class DefaultNormalizer(): + """ After annotation there must be a Noramlizing step to collasce equivalent concepts into one official concept. This is a needed step for the knowledge graph to map between different concepts. + \n The reason why this class in integrated into the annotators.py is because currently there is only one supported Normalizer through the NCATs Translator API. + \n When there is another supported Normalizer it will be seperated into a separate plugin like annotator. + """ + def __init__(self, url): self.bl_toolkit = bmt.Toolkit() self.url = url - def __call__(self, identifier: DugIdentifier, http_session: Session): + def __call__(self, identifier: DugIdentifier, http_session: Session) -> DugIdentifier: # Use RENCI's normalization API service to get the preferred version of an identifier logger.debug(f"Normalizing: {identifier.id}") - return self(identifier, http_session) + response = self.make_request(identifier, http_session) + result = self.handle_response(identifier, response) + return result def make_request(self, value: DugIdentifier, http_session: Session) -> dict: curie = value.id @@ -93,18 +110,24 @@ def make_request(self, value: DugIdentifier, http_session: Session) -> dict: try: normalized = response.json() except Exception as json_exc: - logger.info(f"Error processing response: {response.text} (HTTP {response.status_code})") + logger.info( + f"Error processing response: {response.text} (HTTP {response.status_code})" + ) logger.error(f"Error {json_exc.__class__.__name__}: {json_exc}") return {} return normalized - def handle_response(self, identifier: DugIdentifier, normalized: dict) -> Optional[DugIdentifier]: - """ Record normalized results. 
""" + def handle_response( + self, identifier: DugIdentifier, normalized: dict + ) -> Optional[DugIdentifier]: + """Record normalized results.""" curie = identifier.id normalization = normalized.get(curie, {}) if normalization is None: - logger.info(f"Normalization service did not return normalization for: {curie}") + logger.info( + f"Normalization service did not return normalization for: {curie}" + ) return None preferred_id = normalization.get("id", {}) @@ -112,61 +135,78 @@ def handle_response(self, identifier: DugIdentifier, normalized: dict) -> Option biolink_type = normalization.get("type", []) # Return none if there isn't actually a preferred id - if 'identifier' not in preferred_id: + if "identifier" not in preferred_id: logger.debug(f"ERROR: normalize({curie})=>({preferred_id}). No identifier?") return None logger.debug(f"Preferred id: {preferred_id}") - identifier.id = preferred_id.get('identifier', '') - identifier.label = preferred_id.get('label', '') - identifier.description = preferred_id.get('description', '') - identifier.equivalent_identifiers = [v['identifier'] for v in equivalent_identifiers] - try: + identifier.id = preferred_id.get("identifier", "") + identifier.label = preferred_id.get("label", "") + identifier.description = preferred_id.get("description", "") + identifier.equivalent_identifiers = [ + v["identifier"] for v in equivalent_identifiers + ] + try: identifier.types = self.bl_toolkit.get_element(biolink_type[0]).name except: - # converts biolink:SmallMolecule to small molecule - identifier.types = (" ".join(re.split("(?=[A-Z])", biolink_type[0].replace('biolink:', ''))[1:])).lower() + # converts biolink:SmallMolecule to small molecule + identifier.types = ( + " ".join( + re.split("(?=[A-Z])", biolink_type[0].replace("biolink:", ""))[1:] + ) + ).lower() return identifier -class DefaultSynonymFinder(AnnotatorSession[str, List[str]]): +class DefaultSynonymFinder(): + """ The SynonymFinder stores synonyms for concepts in the knowledge graph so users in the Dug User Interface can find concepts that match their search criteria. + \n The reason why this class in integrated into the annotators.py is because currently there is only one supported SynonymFinder through the deployed by RENCI. + \n When there is another supported SynonymFinder it will be seperated into a separate plugin like annotator. + """ def __init__(self, url: str): self.url = url - def get_identifier_synonyms(self, curie: str, http_session): - ''' + # def get_identifier_synonyms + def __call__(self, curie: str, http_session): + """ This function uses the NCATS translator service to return a list of synonyms for curie id - ''' - - return self(curie, http_session) + """ + response = self.make_request(curie, http_session) + result = self.handle_response(curie, response) + return result def make_request(self, curie: str, http_session: Session): # Get response from namelookup reverse lookup op # example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post) url = f"{self.url}" - payload = { - 'curies': [curie] - } + payload = {"curies": [curie]} try: response = http_session.post(url, json=payload) - if str(response.status_code).startswith('4'): - logger.error(f"No synonyms returned for: `{curie}`. Validation error: {response.text}") + if str(response.status_code).startswith("4"): + logger.error( + f"No synonyms returned for: `{curie}`. 
Validation error: {response.text}" + ) return {curie: []} - if str(response.status_code).startswith('5'): - logger.error(f"No synonyms returned for: `{curie}`. Internal server error from {self.url}. Error: {response.text}") + if str(response.status_code).startswith("5"): + logger.error( + f"No synonyms returned for: `{curie}`. Internal server error from {self.url}. Error: {response.text}" + ) return {curie: []} return response.json() except json.decoder.JSONDecodeError as e: - logger.error(f"Json parse error for response from `{url}`. Exception: {str(e)}") + logger.error( + f"Json parse error for response from `{url}`. Exception: {str(e)}" + ) return {curie: []} def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]: # Return curie synonyms return raw_synonyms.get(curie, []) -Indexable = Union[DugIdentifier, DugAnnotator, AnnotatorSession] + +Indexable = Union[DugIdentifier, AnnotatorSession] # Indexable = DugIdentifier Annotator = Callable[[Any], Iterable[Indexable]] -# Annotator = Callable[[Any], Iterable[DugIdentifier]] \ No newline at end of file +# Annotator = Callable[[Any], Iterable[DugIdentifier]] diff --git a/src/dug/core/annotators/monarch_annotator.py b/src/dug/core/annotators/monarch_annotator.py index 1294997..766e192 100644 --- a/src/dug/core/annotators/monarch_annotator.py +++ b/src/dug/core/annotators/monarch_annotator.py @@ -16,19 +16,27 @@ logging.getLogger("urllib3").setLevel(logging.WARNING) class AnnotateMonarch: + """ + Use monarch API service to fetch ontology IDs found in text + """ def __init__( self, normalizer, synonym_finder, + config, ontology_greenlist=[], ): - self.annotatorUrl = Config.annotator.url + + self.annotatorUrl = config.annotator['url'] self.normalizer = normalizer self.synonym_finder = synonym_finder self.ontology_greenlist = ontology_greenlist self.norm_fails_file = "norm_fails.txt" self.anno_fails_file = "anno_fails.txt" + debreviator = config.preprocessor['debreviator'] if 'debreviator' in config.preprocessor else None + stopwords = config.preprocessor['stopwords'] if 'stopwords' in config.preprocessor else None + if debreviator is None: debreviator = self.default_debreviator_factory() self.decoder = debreviator @@ -43,9 +51,7 @@ def __call__(self, text, http_session) -> List[DugIdentifier]: text = self.preprocess_text(text) # Fetch identifiers - raw_identifiers = [] - for chunk_text in self.sliding_window(text): - raw_identifiers += self(chunk_text, http_session) + raw_identifiers = self.annotate_text(text, http_session) # Write out to file if text fails to annotate if not raw_identifiers: @@ -72,7 +78,7 @@ def __call__(self, text, http_session) -> List[DugIdentifier]: norm_id = identifier # Add synonyms to identifier - norm_id.synonyms = self.synonym_finder.get_identifier_synonyms(norm_id.id, http_session) + norm_id.synonyms = self.synonym_finder(norm_id.id, http_session) # Get pURL for ontology identifer for more info norm_id.purl = BioLinkPURLerizer.get_curie_purl(norm_id.id) @@ -108,12 +114,13 @@ def sliding_window(self, text, max_characters=2000, padding_words=5): window_end = True yield current_string - # def annotate_text(self, text, http_session): - # logger.debug(f"Annotating: {text}") - # identifiers = [] - # for chunk_text in self.sliding_window(text): - # identifiers += self(chunk_text, http_session) - # return identifiers + def annotate_text(self, text, http_session) -> List[DugIdentifier]: + logger.debug(f"Annotating: {text}") + identifiers = [] + for chunk_text in self.sliding_window(text): + response = 
self.make_request(chunk_text, http_session) + identifiers += self.handle_response(chunk_text, response) + return identifiers def make_request(self, value: Input, http_session: Session): value = urllib.parse.quote(value) @@ -122,11 +129,10 @@ def make_request(self, value: Input, http_session: Session): # This could be moved to a config file NUM_TRIES = 5 for _ in range(NUM_TRIES): - response = http_session.get(url) - if response is not None: + response = http_session.get(url) + if response is not None: # looks like it worked - break - + break # if the reponse is still None here, throw an error if response is None: raise RuntimeError(f"no response from {url}") @@ -154,11 +160,11 @@ def preprocess_text(self, text: str) -> str: """ Apply debreviator to replace abbreviations and other characters - >>> pp = PreprocessorMonarch({"foo": "bar"}, ["baz"]) - >>> pp.preprocess("Hello foo") - 'Hello bar' + # >>> pp = PreprocessorMonarch({"foo": "bar"}, ["baz"]) + # >>> pp.preprocess("Hello foo") + # 'Hello bar' - >>> pp.preprocess("Hello baz world") + # >>> pp.preprocess("Hello baz world") 'Hello world' """ diff --git a/src/dug/core/crawler.py b/src/dug/core/crawler.py index 40140d8..1ebc0d6 100644 --- a/src/dug/core/crawler.py +++ b/src/dug/core/crawler.py @@ -143,6 +143,7 @@ def annotate_elements(self): def annotate_element(self, element): # Annotate with a set of normalized ontology identifiers + # self.DugAnnotator.annotator() identifiers = self.annotator(text=element.ml_ready_desc, http_session=self.http_session) # Future thoughts... should we be passing in the stpe DugIdentifier here instead? diff --git a/tests/integration/test_annotators.py b/tests/integration/test_annotators.py new file mode 100644 index 0000000..ad24586 --- /dev/null +++ b/tests/integration/test_annotators.py @@ -0,0 +1,271 @@ +# from copy import copy +# from typing import List +# from attr import field + +# import pytest + +# from tests.unit.mocks.data.test_config import TestConfig +# from dug.core.annotators import ( +# DugIdentifier, +# AnnotateMonarch, +# DefaultNormalizer, +# DefaultSynonymFinder, +# ) +# from unittest.mock import MagicMock + + +# def test_monarch_annotator_workflow(): +# http_session = MagicMock() +# cfg = TestConfig.test_from_env() + +# annotator = AnnotateMonarch( +# normalizer=DefaultNormalizer(cfg.normalizer), +# synonym_finder=DefaultSynonymFinder(cfg.synonym_service), +# config=cfg, +# ) +# text = "heart attack" +# identifiers: List[DugIdentifier] = annotator.annotation_workflow(text, http_session) + +# assert len(identifiers) == 7 +# assert isinstance(identifiers[0], DugIdentifier) + + +# def test_monarch_annotator_annotate(): +# http_session = MagicMock() +# cfg = TestConfig.test_from_env() +# normalizer = DefaultNormalizer(cfg.normalizer) +# synonym_finder = DefaultSynonymFinder(cfg.synonym_service) + +# annotator = AnnotateMonarch( +# normalizer=normalizer, synonym_finder=synonym_finder, config=cfg +# ) +# text = "heart attack" +# identifiers: List[DugIdentifier] = annotator.annotate_text(text, http_session) + +# assert len(identifiers) == 7 +# assert isinstance(identifiers[0], DugIdentifier) + + +# # @pytest.mark.parametrize( +# # "preprocessor,input_text,expected_text", +# # [ +# # (Preprocessor(), "Hello_world", "Hello world"), +# # (Preprocessor({"Hello": "Hi"}, ["placeholder"]), "Hello placeholder world", "Hi world"), +# # ] +# # ) +# # def test_preprocessor_preprocess(preprocessor, input_text, expected_text): +# # original_text = copy(input_text) +# # output_text = 
preprocessor.preprocess(input_text) + +# # assert input_text == original_text # Don't modify in-place +# # assert output_text == expected_text + + +# # def test_annotator_init(): +# # cfg = Config.from_env() +# # url = cfg.annotator["url"] + +# # annotator = Annotator(**cfg.annotator) +# # assert annotator.url == url + + +# # def test_annotator_handle_response(): +# # annotator = Annotator('foo') + +# # response = { +# # "content": "heart attack", +# # "spans": [ +# # { +# # "start": 0, +# # "end": 5, +# # "text": "heart", +# # "token": [ +# # { +# # "id": "UBERON:0015230", +# # "category": [ +# # "anatomical entity" +# # ], +# # "terms": [ +# # "dorsal vessel heart" +# # ] +# # } +# # ] +# # }, +# # { +# # "start": 0, +# # "end": 5, +# # "text": "heart", +# # "token": [ +# # { +# # "id": "UBERON:0007100", +# # "category": [ +# # "anatomical entity" +# # ], +# # "terms": [ +# # "primary circulatory organ" +# # ] +# # } +# # ] +# # }, +# # { +# # "start": 0, +# # "end": 5, +# # "text": "heart", +# # "token": [ +# # { +# # "id": "UBERON:0015228", +# # "category": [ +# # "anatomical entity" +# # ], +# # "terms": [ +# # "circulatory organ" +# # ] +# # } +# # ] +# # }, +# # { +# # "start": 0, +# # "end": 5, +# # "text": "heart", +# # "token": [ +# # { +# # "id": "ZFA:0000114", +# # "category": [ +# # "anatomical entity" +# # ], +# # "terms": [ +# # "heart" +# # ] +# # } +# # ] +# # }, +# # { +# # "start": 0, +# # "end": 5, +# # "text": "heart", +# # "token": [ +# # { +# # "id": "UBERON:0000948", +# # "category": [ +# # "anatomical entity" +# # ], +# # "terms": [ +# # "heart" +# # ] +# # } +# # ] +# # }, +# # { +# # "start": 0, +# # "end": 12, +# # "text": "heart attack", +# # "token": [ +# # { +# # "id": "MONDO:0005068", +# # "category": [ +# # "disease" +# # ], +# # "terms": [ +# # "myocardial infarction (disease)" +# # ] +# # } +# # ] +# # }, +# # { +# # "start": 0, +# # "end": 12, +# # "text": "heart attack", +# # "token": [ +# # { +# # "id": "HP:0001658", +# # "category": [ +# # "phenotype", +# # "quality" +# # ], +# # "terms": [ +# # "Myocardial infarction" +# # ] +# # } +# # ] +# # } +# # ] +# # } + +# # identifiers: List[DugIdentifier] = annotator.handle_response(None, response) + +# # assert len(identifiers) == 7 +# # assert isinstance(identifiers[0], DugIdentifier) + + +# # def test_annotator_call(annotator_api): +# # url = "http://annotator.api/?content=" + +# # annotator = Annotator(url) + +# # text = "heart attack" +# # identifiers: List[DugIdentifier] = annotator.annotate(text, annotator_api) + +# # assert len(identifiers) == 7 +# # assert isinstance(identifiers[0], DugIdentifier) + + +# # def test_normalizer(normalizer_api): +# # url = "http://normalizer.api/?curie=" + +# # identifier = DugIdentifier( +# # "UBERON:0007100", +# # label='primary circulatory organ', +# # types=['anatomical entity'], +# # description="", +# # search_text=['heart'], +# # ) + +# # normalizer = Normalizer(url) +# # output = normalizer.normalize(identifier, normalizer_api) +# # assert isinstance(output, DugIdentifier) +# # assert output.id == 'UBERON:0007100' +# # assert output.label == "primary circulatory organ" +# # assert output.equivalent_identifiers == ['UBERON:0007100'] +# # assert output.types == 'anatomical entity' + + +# # def test_synonym_finder(synonym_api): +# # curie = "UBERON:0007100" +# # url = f"http://synonyms.api" +# # finder = SynonymFinder(url) +# # result = finder.get_synonyms( +# # curie, +# # synonym_api, +# # ) +# # assert result == [ +# # "primary circulatory organ", +# # "dorsal 
tube", +# # "adult heart", +# # "heart" +# # ] + + +# # def test_yield_partial_text(): +# # annotator = Annotator('foo') +# # # text contains 800 characters + 9 new lines +# # text = """COG Protocol number on which the patient was enrolled [901=Trial of mouse monoclonal Anti-GD-2 antibody 14.G2A plus IL-2 with or without GM-CSF in children with refractory NBL or melanoma; 911=I-131-MIBG for therapy of advanced neuroblastoma; 914=A dose escalation study of cisplatin, doxorubicin, VP-16, and ifosfamide followed by GM-CSF in advanced NBL and peripheral neuroepithelioma; 925=Study of topotecan; 935=Study of ch14.18 with GM-CSF in children with NBL and other GD2 positive malignancies immediately post ABMT or PBSC; 937=Phase I trial of ZD1694, an inhibitor of thymidylate synthase, in pediatric patients with advanced neoplastic disease; 9709=A phase I study of fenretinide in children with high risk solid tumors; 321P2=New intensive chemotherapy for CCG stage II (with N-myc amplification), stage III and stage IV neuroblastoma; 321P3=Treatment of poor prognosis neuroblastoma before disease progression with intensive multimodal therapy and BMT; 323P=Cyclic combination chemotherapy for newly diagnosed stage III neuroblastoma age 2 and older and stage IV Nneuroblastoma all ages; 3881=Biology and therapy of good, intermediate, and selected poor prognosis neuroblastoma; 3891=Conventional dose chemoradiotherapy vs ablative chemoradiotherapy with autologous BMT for high-risk neuroblastoma; 3951=Phase I pilot study of multiple cycles of high dose chemotherapy with peripheral blood stem cell infusions in advanced stage neuroblastoma.; 4941=National Wilms tumor study V - therapeutic trial & biology study; 8605=Study of the combination of ifosfamide, mesna, and VP-16 in children and young adults with recurrent sarcomas, PNET and other tumors; 8742=Phase III portion of 8741 for neuroblastoma; 9047=Neuroblastoma biology protocol; 9082=Protocol for the development of intervention strategies to reduce the time between symptom onset and diagnosis of childhood cancer -a pediatric oncology group cancer control study; 9140=Therapy for patients with recurrent or refractory neuroblastoma - a phase II study; 9262=A Phase II study of taxol in children with recurrent/refractory soft-tissue sarcoma, rhabdomyosarcoma, osteosarcoma, Ewing's sarcoma, neuroblastoma, germ cell tumors, Wilms' tumor, hepatoblastoma, and hepatocellular carcinoma, a POG study; 9280=Neuroblastoma epidemiology protocol - A Non-Therapeutic Study - A Joint Project of: The University of North Carolina, The Pediatric Oncology Group and The Children's Cancer Study Group; 9340=Treatment of patients >365 days at diagnosis with stage IV NBL: Upfront Phase II Window - A Phase II Study; 9341=Treatment of patients >365 days at diagnosis with stage IV and stage IIB/III (N-myc) NBL - a phase III study; 9342=Neuroblastoma #5, bone marrow transplant - a phase III study; 9343=Interleukin-6 in children receiving autologous bone marrow transplantation for advanced neuroblastoma - a pediatric oncology group phase I trial; 9361=Topotecan in pediatric patients with recurrent or progressive solid tumors - a pediatric oncology group phase II study; 9375=Topotecan plus cyclophosphamide in children with solid tumors - a pediatric oncology group phase I trial; 9464=Cyclophosphamide plus topotecan in children with recurrent or refractory solid tumors - a pediatric oncology group phase II study; 9640=Treatment of patients with high risk neuroblastoma (a feasibility pilot) using 
two cycles of marrow ablative chemotherapy followed by rescue With peripheral blood stem cells (PBSC), radiation therapy; A3973=A randomized study of purged vs. unpurged PBSC transplant following dose intensive induction therapy for high risk NBL; AADM01P1=Protocol for registration and consent to the childhood cancer research network: a limited institution pilot; AAML00P2=A dose finding study of the safety of gemtuzumab ozogamicin combined with conventional chemotherapy for patients with relapsed or refractory acute myeloid leukemia; ACCL0331=A Randomized double blind placebo controlled clinical trial to assess the efficacy of traumeelĀ® S (IND # 66649) for the prevention and treatment of mucositis in children undergoing hematopoietic stem cell transplantation; ACCRN07=Protocol for the enrollment on the official COG registry, The Childhood Cancer Research Network (CCRN); ADVL0018=Phase I study of hu14.18-IL2 fusion protein in patients with refractory neuroblastoma and other refractory GD2 expressing tumors; ADVL0212=A Phase I study of depsipeptide (NSC#630176, IND# 51810) in pediatric patients with refractory solid tumors and leukemias; ADVL0214=A phase I study of single agent OSI-774 (Tarceva) (NSC # 718781, IND #63383) followed by OSI-774 with temozolomide for patients with selected recurrent/refractory solid tumors, including brain tumors; ADVL0215=A phase I study of decitabine in combination with doxorubicin and cyclophosphamide in the treatment of relapsed or refractory solid tumors; ADVL0421=A phase II study of oxaliplatin in children with recurrent solid tumors; ADVL0524=Phase II trial of ixabepilone (BMS-247550), an epothilone B analog, in children and young adults with refractory solid tumors; ADVL0525=A phase II study of pemetrexed in children with recurrent malignancies; ADVL06B1=A pharmacokinetic-pharmacodynamic-pharmacogenetic study of actinomycin-D and vincristine in children with cancer; ADVL0714=A phase I study of VEGF trap (NSC# 724770, IND# 100137) in children with refractory solid tumors; ALTE03N1=Key adverse events after childhood cancer; ALTE05N1=Umbrella long-term follow-up protocol; ANBL0032=Phase III randomized study of chimeric antibody 14.18 (Ch14.18) in high risk neuroblastoma following myeloablative therapy and autologous stem cell rescue; ANBL00B1=Neuroblastoma biology studies; ANBL00P1=A pilot study of tandem high dose chemotherapy with stem cell rescue following induction therapy in children with high risk neuroblastoma; ANBL02P1=A pilot induction regimen incorporating dose-intensive topotecan and cyclophosphamide for treatment of newly diagnosed high risk neuroblastoma; ANBL0321=Phase II study of fenretinide in pediatric patients with resistant or recurrent neuroblastoma; ANBL0322=A phase II study of hu14.18-IL2 (BB-IND-9728) in children with recurrent or refractory neuroblastoma; ANBL0532=Phase III randomized trial of single vs. 
tandem myeloablative as consolidation therapy for high-risk neuroblastoma; ANBL0621=A phase II study of ABT-751, an orally bioavailable tubulin binding agent, in children with relapsed or refractory neuroblastoma; B003=Diagnostic & prognostic studies in NBL; B903=Childhood cancer genetics; B947=Protocol for collection of biology specimens for research studies; B954=Opsoclonus-myoclonus-ataxia syndrome, neuroblastoma and the presence of anti-neuronal antibodies; B973=Laboratory-clinical studies of neuroblastoma; E04=Self-administered epidemiology questionnaire; E18=A case-control study of risk factors for neuroblastoma; I03=Neuroblastoma, diagnostic/prognostic; N891=Parents' perceptions of randomization; P9462=Randomized treatment of recurrent neuroblastoma with topotecan regimens following desferrioxamine (POG only) in an investigational window; P9641=Primary surgical therapy for biologically defined low-risk neuroblastoma; P9761=A phase II trial of irinotecan in children with refractory solid tumors; P9963=A phase II trial of rebeccamycin analogue (NSC #655649) in children with solid tumors; R9702=Prognostic implications of MIBG uptake in patients with neuroblastoma previously treated on CCG-3891; S31=Right atrial catheter study; S921=Comparison of urokinase vs heparin in preventing Infection in central venous devices in children with malignancies]""" +# # chunks = "" +# # is_the_beginning = True +# # max_chars = 2000 +# # padding_words = 3 +# # counter = 0 +# # print(len(text)) +# # # divvy up into chunks, sum of each chunk should equal the original text. +# # for chunk in annotator.sliding_window(text=text, max_characters=max_chars, padding_words= padding_words): +# # assert len(chunk) <= max_chars +# # counter += 1 +# # if is_the_beginning: +# # chunks += chunk +# # else: +# # # remove redundand padded words from final result +# # chunks += " ".join(chunk.split(" ")[padding_words:]) +# # is_the_beginning = False + +# # print(counter) +# # # since spaces are trimmed by tokenizer , we can execuled all spaces and do char +# # assert chunks == text diff --git a/tests/unit/mocks/MockCrawler.py b/tests/unit/mocks/MockCrawler.py index 1c69dab..5a3f077 100644 --- a/tests/unit/mocks/MockCrawler.py +++ b/tests/unit/mocks/MockCrawler.py @@ -32,7 +32,7 @@ ids.type = ids.types[0] # annotator with annotate method returning mocked concepts AnnotatorMock = MagicMock() -AnnotatorMock.annotate = Mock(return_value=ANNOTATED_IDS) +AnnotatorMock = Mock(return_value=ANNOTATED_IDS) # tranqlizer returning mock kg when expanding concepts TranqlizerMock = MagicMock() diff --git a/tests/unit/mocks/data/test_config.py b/tests/unit/mocks/data/test_config.py new file mode 100644 index 0000000..cc94593 --- /dev/null +++ b/tests/unit/mocks/data/test_config.py @@ -0,0 +1,33 @@ +from dataclasses import dataclass, field + + +@dataclass +class TestConfig: + + # Preprocessor config that will be passed to annotate.Preprocessor constructor + preprocessor: dict = field(default_factory=lambda: { + "debreviator": { + "BMI": "body mass index" + }, + "stopwords": ["the"] + }) + + # Annotator config that will be passed to annotate.Annotator constructor + annotator: dict = field(default_factory=lambda: { + "url": "http://annotator.api/?content=" + }) + + # Normalizer config that will be passed to annotate.Normalizer constructor + normalizer: dict = field(default_factory=lambda: { + "url": "http://normalizer.api/?curie=" + }) + + # Synonym service config that will be passed to annotate.SynonymHelper constructor + synonym_service: 
dict = field(default_factory=lambda: { + "url": "http://synonyms.api" + }) + + @classmethod + def test_from_env(cls): + kwargs = {} + return cls(**kwargs) \ No newline at end of file diff --git a/tests/unit/test_annotate.py b/tests/unit/test_annotate.py index 8786956..8470a37 100644 --- a/tests/unit/test_annotate.py +++ b/tests/unit/test_annotate.py @@ -35,6 +35,7 @@ def test_annotator_init(): url = cfg.annotator["url"] annotator = Annotator(**cfg.annotator) + print(f"#########URL: {annotator.url}#########") assert annotator.url == url diff --git a/tests/unit/test_annotators.py b/tests/unit/test_annotators.py index f4ffa0f..ddb0986 100644 --- a/tests/unit/test_annotators.py +++ b/tests/unit/test_annotators.py @@ -1,18 +1,20 @@ from copy import copy from typing import List +from attr import field import pytest - -from dug.config import Config -from dug.core.annotators import ( DugIdentifier, - AnnotateMonarch, - PreprocessorMonarch, - AnnotatorMonarch, - NormalizerMonarch, - SynonymFinderMonarch - ) +from dug.core.annotate import BioLinkPURLerizer + +from tests.unit.mocks.data.test_config import TestConfig +from dug.core.annotators import ( + DugIdentifier, + AnnotateMonarch, + DefaultNormalizer, + DefaultSynonymFinder, +) from unittest.mock import MagicMock + def test_identifier(): ident_1 = DugIdentifier( "PrimaryIdent:1", "first identifier", types=[], search_text="", description="" @@ -20,229 +22,104 @@ def test_identifier(): assert "PrimaryIdent" == ident_1.id_type +def test_monarch_annotation_full(annotator_api, normalizer_api, synonym_api): + cfg = TestConfig.test_from_env() + normalizer = DefaultNormalizer(**cfg.normalizer) + synonym_finder = DefaultSynonymFinder(**cfg.synonym_service) + + annotator = AnnotateMonarch( + normalizer=normalizer, synonym_finder=synonym_finder, config=cfg + ) + input_text = "heart attack" + + text = annotator.preprocess_text(input_text) + + # Fetch identifiers + raw_identifiers: List[DugIdentifier] = annotator.annotate_text(text, annotator_api) + + processed_identifiers: List[DugIdentifier] = [] + for identifier in raw_identifiers: + print(identifier) + output = annotator.normalizer(identifier, normalizer_api) + print(output) + + + # Should be returning normalized identifier for each identifier passed in + if output is None: + output = identifier + # assert isinstance(output, DugIdentifier) + # assert output.id == 'UBERON:0007100' + # assert output.label == "primary circulatory organ" + # assert output.equivalent_identifiers == ['UBERON:0007100'] + # assert output.types == 'anatomical entity' + + # Add synonyms to identifier + output.synonyms = annotator.synonym_finder(output.id, synonym_api) + print(output.synonyms) + # Get pURL for ontology identifer for more info + output.purl = BioLinkPURLerizer.get_curie_purl(output.id) + processed_identifiers.append(output) + + # identifiers: List[DugIdentifier] = annotator( + # text, monarch_annotation_session + # ) + print(processed_identifiers[0]) + assert isinstance(processed_identifiers, List[DugIdentifier]) + assert len(processed_identifiers) == 7 + assert isinstance(processed_identifiers[0], DugIdentifier) + + def test_monarch_annotator(annotator_api): - cfg = Config.from_env() - url = cfg.annotator["url"] - preprocessor = PreprocessorMonarch(**cfg.preprocessor) - annotator = AnnotatorMonarch(**cfg.annotator) - normalizer = NormalizerMonarch(**cfg.normalizer) - synonym_finder = SynonymFinderMonarch(**cfg.synonym_service) + cfg = TestConfig.test_from_env() + normalizer = DefaultNormalizer(cfg.normalizer) + 
synonym_finder = DefaultSynonymFinder(cfg.synonym_service) annotator = AnnotateMonarch( - preprocessor=preprocessor, - annotator=annotator, - normalizer=normalizer, - synonym_finder=synonym_finder + normalizer=normalizer, synonym_finder=synonym_finder, config=cfg ) - # annotator = AnnotateMonarch() - # assert annotator.annotation_step(text="Lama", http_session = MagicMock()) == url text = "heart attack" - identifiers: List[DugIdentifier] = annotator(text, annotator_api) + identifiers: List[DugIdentifier] = annotator.annotate_text( + text, annotator_api + ) assert len(identifiers) == 7 assert isinstance(identifiers[0], DugIdentifier) -# @pytest.mark.parametrize( -# "preprocessor,input_text,expected_text", -# [ -# (Preprocessor(), "Hello_world", "Hello world"), -# (Preprocessor({"Hello": "Hi"}, ["placeholder"]), "Hello placeholder world", "Hi world"), -# ] -# ) -# def test_preprocessor_preprocess(preprocessor, input_text, expected_text): -# original_text = copy(input_text) -# output_text = preprocessor.preprocess(input_text) - -# assert input_text == original_text # Don't modify in-place -# assert output_text == expected_text - - -# def test_annotator_init(): -# cfg = Config.from_env() -# url = cfg.annotator["url"] - -# annotator = Annotator(**cfg.annotator) -# assert annotator.url == url - - -# def test_annotator_handle_response(): -# annotator = Annotator('foo') - -# response = { -# "content": "heart attack", -# "spans": [ -# { -# "start": 0, -# "end": 5, -# "text": "heart", -# "token": [ -# { -# "id": "UBERON:0015230", -# "category": [ -# "anatomical entity" -# ], -# "terms": [ -# "dorsal vessel heart" -# ] -# } -# ] -# }, -# { -# "start": 0, -# "end": 5, -# "text": "heart", -# "token": [ -# { -# "id": "UBERON:0007100", -# "category": [ -# "anatomical entity" -# ], -# "terms": [ -# "primary circulatory organ" -# ] -# } -# ] -# }, -# { -# "start": 0, -# "end": 5, -# "text": "heart", -# "token": [ -# { -# "id": "UBERON:0015228", -# "category": [ -# "anatomical entity" -# ], -# "terms": [ -# "circulatory organ" -# ] -# } -# ] -# }, -# { -# "start": 0, -# "end": 5, -# "text": "heart", -# "token": [ -# { -# "id": "ZFA:0000114", -# "category": [ -# "anatomical entity" -# ], -# "terms": [ -# "heart" -# ] -# } -# ] -# }, -# { -# "start": 0, -# "end": 5, -# "text": "heart", -# "token": [ -# { -# "id": "UBERON:0000948", -# "category": [ -# "anatomical entity" -# ], -# "terms": [ -# "heart" -# ] -# } -# ] -# }, -# { -# "start": 0, -# "end": 12, -# "text": "heart attack", -# "token": [ -# { -# "id": "MONDO:0005068", -# "category": [ -# "disease" -# ], -# "terms": [ -# "myocardial infarction (disease)" -# ] -# } -# ] -# }, -# { -# "start": 0, -# "end": 12, -# "text": "heart attack", -# "token": [ -# { -# "id": "HP:0001658", -# "category": [ -# "phenotype", -# "quality" -# ], -# "terms": [ -# "Myocardial infarction" -# ] -# } -# ] -# } -# ] -# } - -# identifiers: List[DugIdentifier] = annotator.handle_response(None, response) - -# assert len(identifiers) == 7 -# assert isinstance(identifiers[0], DugIdentifier) - - -# def test_annotator_call(annotator_api): -# url = "http://annotator.api/?content=" - -# annotator = Annotator(url) - -# text = "heart attack" -# identifiers: List[DugIdentifier] = annotator.annotate(text, annotator_api) - -# assert len(identifiers) == 7 -# assert isinstance(identifiers[0], DugIdentifier) - - -# def test_normalizer(normalizer_api): -# url = "http://normalizer.api/?curie=" - -# identifier = DugIdentifier( -# "UBERON:0007100", -# label='primary circulatory organ', -# 
types=['anatomical entity'], -# description="", -# search_text=['heart'], -# ) - -# normalizer = Normalizer(url) -# output = normalizer.normalize(identifier, normalizer_api) -# assert isinstance(output, DugIdentifier) -# assert output.id == 'UBERON:0007100' -# assert output.label == "primary circulatory organ" -# assert output.equivalent_identifiers == ['UBERON:0007100'] -# assert output.types == 'anatomical entity' - - - -# def test_synonym_finder(synonym_api): -# curie = "UBERON:0007100" -# url = f"http://synonyms.api" -# finder = SynonymFinder(url) -# result = finder.get_synonyms( -# curie, -# synonym_api, -# ) -# assert result == [ -# "primary circulatory organ", -# "dorsal tube", -# "adult heart", -# "heart" -# ] +def test_normalizer(normalizer_api): + url = "http://normalizer.api/?curie=" + identifier = DugIdentifier( + "UBERON:0007100", + label='primary circulatory organ', + types=['anatomical entity'], + description="", + search_text=['heart'], + ) + normalizer = DefaultNormalizer(url) + output = normalizer(identifier, normalizer_api) + assert isinstance(output, DugIdentifier) + assert output.id == 'UBERON:0007100' + assert output.label == "primary circulatory organ" + assert output.equivalent_identifiers == ['UBERON:0007100'] + assert output.types == 'anatomical entity' + + +def test_synonym_finder(synonym_api): + curie = "UBERON:0007100" + url = f"http://synonyms.api" + finder = DefaultSynonymFinder(url) + result = finder( + curie, + synonym_api, + ) + assert result == [ + "primary circulatory organ", + "dorsal tube", + "adult heart", + "heart" + ] # def test_yield_partial_text(): diff --git a/tests/unit/test_core/test_search.py b/tests/unit/test_core/test_search.py index 5ec5846..89d48b8 100644 --- a/tests/unit/test_core/test_search.py +++ b/tests/unit/test_core/test_search.py @@ -45,6 +45,7 @@ class MockIndices: def __init__(self): self._indices = {} self.call_count = 0 + self.number_of_replicas = 3 def exists(self, index): return index in self._indices diff --git a/tests/unit/test_crawler.py b/tests/unit/test_crawler.py index 1907bf3..f8e1569 100644 --- a/tests/unit/test_crawler.py +++ b/tests/unit/test_crawler.py @@ -31,7 +31,7 @@ def test_annotate_element(crawler): "collection-desc" ) crawler.annotate_element(element) - AnnotatorMock.annotate.assert_called_with(**{ + AnnotatorMock.assert_called_with(**{ "text": element.ml_ready_desc, "http_session": HTTPSessionMock }) From 0374ff42a632d008cfdae300c441e99509f3ba8e Mon Sep 17 00:00:00 2001 From: braswent Date: Tue, 31 Oct 2023 13:23:23 -0400 Subject: [PATCH 07/85] feat: --- Makefile | 1 - src/dug/cli.py | 9 +- src/dug/core/annotate.py | 618 ------------------ src/dug/core/annotators/monarch_annotator.py | 8 +- src/dug/core/concept_expander.py | 3 +- src/dug/core/crawler.py | 3 +- tests/integration/conftest.py | 172 +++++ .../mocks/mock_config.py} | 2 +- tests/integration/test_annotators.py | 343 ++-------- tests/unit/mocks/MockCrawler.py | 6 +- tests/unit/mocks/data/mock_config.py | 33 + tests/unit/test_annotate.py | 245 ------- tests/unit/test_annotators.py | 54 +- tests/unit/test_cli.py | 6 + tests/unit/test_parsers.py | 4 +- 15 files changed, 307 insertions(+), 1200 deletions(-) delete mode 100644 src/dug/core/annotate.py rename tests/{unit/mocks/data/test_config.py => integration/mocks/mock_config.py} (98%) create mode 100644 tests/unit/mocks/data/mock_config.py delete mode 100644 tests/unit/test_annotate.py diff --git a/Makefile b/Makefile index 22faf3f..70dcba6 100644 --- a/Makefile +++ b/Makefile @@ -40,7 +40,6 @@ 
install.dug: #test: Run all tests test: - ${PYTHON} -m pytest --doctest-modules src coverage run -m pytest tests coverage: diff --git a/src/dug/cli.py b/src/dug/cli.py index 0ec6c73..4fd5923 100755 --- a/src/dug/cli.py +++ b/src/dug/cli.py @@ -51,6 +51,13 @@ def get_argparser(): required=True ) + crawl_parser.add_argument( + '-a', '--annotator', + help='Annotator used to annotate identifiers in crawl file', + dest="annotator_type", + default="annotator-monarch" + ) + crawl_parser.add_argument( '-e', '--element-type', help='[Optional] Coerce all elements to a certain data type (e.g. DbGaP Variable).\n' @@ -108,7 +115,7 @@ def crawl(args): config.node_to_element_queries = {} factory = DugFactory(config) dug = Dug(factory) - dug.crawl(args.target, args.parser_type, args.element_type) + dug.crawl(args.target, args.parser_type, args.annotator_type, args.element_type) def search(args): diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py deleted file mode 100644 index bbf766b..0000000 --- a/src/dug/core/annotate.py +++ /dev/null @@ -1,618 +0,0 @@ -import json -import logging -import os -import re -import urllib.parse -from typing import TypeVar, Generic, Union, List, Tuple, Optional -import bmt -import requests -from requests import Session - -import dug.core.tranql as tql - - -logger = logging.getLogger('dug') - -logging.getLogger("requests").setLevel(logging.WARNING) -logging.getLogger("urllib3").setLevel(logging.WARNING) - - -class Identifier: - def __init__(self, id, label, types=None, search_text="", description=""): - self.id = id - self.label = label - self.description = description - if types is None: - types = [] - self.types = types - self.search_text = [search_text] if search_text else [] - self.equivalent_identifiers = [] - self.synonyms = [] - self.purl = "" - - @property - def id_type(self): - return self.id.split(":")[0] - - def add_search_text(self, text): - # Add text only if it's unique and if not empty string - if text and text not in self.search_text: - self.search_text.append(text) - - def get_searchable_dict(self): - # Return a version of the identifier compatible with what's in ElasticSearch - es_ident = { - 'id': self.id, - 'label': self.label, - 'equivalent_identifiers': self.equivalent_identifiers, - 'type': self.types, - 'synonyms': self.synonyms - } - return es_ident - - def jsonable(self): - return self.__dict__ - - -class DugAnnotator: - def __init__( - self, - preprocessor: "Preprocessor", - annotator: "Annotator", - normalizer: "Normalizer", - synonym_finder: "SynonymFinder", - ontology_greenlist=[], - ): - self.preprocessor = preprocessor - self.annotator = annotator - self.normalizer = normalizer - self.synonym_finder = synonym_finder - self.ontology_greenlist = ontology_greenlist - self.norm_fails_file = "norm_fails.txt" - self.anno_fails_file = "anno_fails.txt" - - def annotate(self, text, http_session): - - # Preprocess text (debraviate, remove stopwords, etc.) 
- text = self.preprocessor.preprocess(text) - - # Fetch identifiers - raw_identifiers = self.annotator.annotate(text, http_session) - - # Write out to file if text fails to annotate - if not raw_identifiers: - with open(self.anno_fails_file, "a") as fh: - fh.write(f'{text}\n') - - processed_identifiers = [] - for identifier in raw_identifiers: - - # Normalize identifier using normalization service - norm_id = self.normalizer.normalize(identifier, http_session) - - # Skip adding id if it doesn't normalize - if norm_id is None: - # Write out to file if identifier doesn't normalize - with open(self.norm_fails_file, "a") as fh: - fh.write(f'{identifier.id}\n') - - # Discard non-normalized ident if not in greenlist - if identifier.id_type not in self.ontology_greenlist: - continue - - # If it is in greenlist just keep moving forward - norm_id = identifier - - # Add synonyms to identifier - norm_id.synonyms = self.synonym_finder.get_synonyms(norm_id.id, http_session) - - # Get pURL for ontology identifer for more info - norm_id.purl = BioLinkPURLerizer.get_curie_purl(norm_id.id) - processed_identifiers.append(norm_id) - - return processed_identifiers - - -class ConceptExpander: - def __init__(self, url, min_tranql_score=0.2): - self.url = url - self.min_tranql_score = min_tranql_score - self.include_node_keys = ["id", "name", "synonyms"] - self.include_edge_keys = [] - self.tranql_headers = {"accept": "application/json", "Content-Type": "text/plain"} - - def is_acceptable_answer(self, answer): - return True - - def expand_identifier(self, identifier, query_factory, kg_filename, include_all_attributes=False): - - answer_kgs = [] - - # Skip TranQL query if a file exists in the crawlspace exists already, but continue w/ answers - if os.path.exists(kg_filename): - logger.info(f"identifier {identifier} is already crawled. Skipping TranQL query.") - with open(kg_filename, 'r') as stream: - response = json.load(stream) - else: - query = query_factory.get_query(identifier) - logger.debug(query) - response = requests.post( - url=self.url, - headers=self.tranql_headers, - data=query).json() - - # Case: Skip if empty KG - try: - if response["message"] == 'Internal Server Error' or len(response["message"]["knowledge_graph"]["nodes"]) == 0: - logger.debug(f"Did not find a knowledge graph for {query}") - logger.debug(f"{self.url} returned response: {response}") - return [] - except KeyError as e: - logger.error(f"Could not find key: {e} in response: {response}") - - # Dump out to file if there's a knowledge graph - with open(kg_filename, 'w') as stream: - json.dump(response, stream, indent=2) - - # Get nodes in knowledge graph hashed by ids for easy lookup - noMessage = (len(response.get("message",{})) == 0) - statusError = (response.get("status","") == 'Error') - if noMessage or statusError: - # Skip on error - logger.info(f"Error with identifier: {identifier}, response: {response}, kg_filename: '{kg_filename}'") - return [] - kg = tql.QueryKG(response) - - for answer in kg.answers: - # Filter out answers that don't meet some criteria - # Right now just don't filter anything - logger.debug(f"Answer: {answer}") - if not self.is_acceptable_answer(answer): - logger.warning("Skipping answer as it failed one or more acceptance criteria. 
See log for details.") - continue - - # Get subgraph containing only information for this answer - try: - # Temporarily surround in try/except because sometimes the answer graphs - # contain invalid references to edges/nodes - # This will be fixed in Robokop but for now just silently warn if answer is invalid - node_attributes_filter = None if include_all_attributes else self.include_node_keys - edge_attributes_filter = None if include_all_attributes else self.include_edge_keys - answer_kg = kg.get_answer_subgraph(answer, - include_node_keys=node_attributes_filter, - include_edge_keys=edge_attributes_filter) - - # Add subgraph to list of acceptable answers to query - answer_kgs.append(answer_kg) - - except tql.MissingNodeReferenceError: - # TEMPORARY: Skip answers that have invalid node references - # Need this to be fixed in Robokop - logger.warning("Skipping answer due to presence of non-preferred id! " - "See err msg for details.") - continue - except tql.MissingEdgeReferenceError: - # TEMPORARY: Skip answers that have invalid edge references - # Need this to be fixed in Robokop - logger.warning("Skipping answer due to presence of invalid edge reference! " - "See err msg for details.") - continue - - return answer_kgs - - -class Preprocessor: - """"Class for preprocessing strings so they are better interpreted by NLP steps""" - - def __init__(self, debreviator=None, stopwords=None): - if debreviator is None: - debreviator = self.default_debreviator_factory() - self.decoder = debreviator - - if stopwords is None: - stopwords = [] - self.stopwords = stopwords - - def preprocess(self, text: str) -> str: - """ - Apply debreviator to replace abbreviations and other characters - - >>> pp = Preprocessor({"foo": "bar"}, ["baz"]) - >>> pp.preprocess("Hello foo") - 'Hello bar' - - >>> pp.preprocess("Hello baz world") - 'Hello world' - """ - - for key, value in self.decoder.items(): - text = text.replace(key, value) - - # Remove any stopwords - text = " ".join([word for word in text.split() if word not in self.stopwords]) - return text - - @staticmethod - def default_debreviator_factory(): - return {"bmi": "body mass index", "_": " "} - - -Input = TypeVar("Input") -Output = TypeVar("Output") - - -class ApiClient(Generic[Input, Output]): - - def make_request(self, value: Input, http_session: Session): - raise NotImplementedError() - - def handle_response(self, value, response: Union[dict, list]) -> Output: - raise NotImplementedError() - - def __call__(self, value: Input, http_session: Session) -> Output: - response = self.make_request(value, http_session) - - result = self.handle_response(value, response) - - return result - - -class Annotator(ApiClient[str, List[Identifier]]): - """ - Use monarch API service to fetch ontology IDs found in text - """ - - def __init__(self, url: str): - self.url = url - - def sliding_window(self, text, max_characters=2000, padding_words=5): - """ - For long texts sliding window works as the following - "aaaa bbb ccc ddd eeee" - with a sliding max chars 8 and padding 1 - first yeild would be "aaaa bbb" - next subsequent yeilds "bbb ccc", "ccc ddd" , "ddd eeee" - allowing context to be preserved with the scope of padding - For a text of length 7653 , with max_characters 2000 and padding 5 , 4 chunks are yielded. 
- """ - words = text.split(' ') - total_words = len(words) - window_end = False - current_index = 0 - while not window_end: - current_string = "" - for index, word in enumerate(words[current_index: ]): - if len(current_string) + len(word) + 1 >= max_characters: - yield current_string + " " - current_index += index - padding_words - break - appendee = word if index == 0 else " " + word - current_string += appendee - - if current_index + index == len(words) - 1: - window_end = True - yield current_string - - def annotate(self, text, http_session): - logger.debug(f"Annotating: {text}") - identifiers = [] - for chunk_text in self.sliding_window(text): - identifiers += self(chunk_text, http_session) - return identifiers - - def make_request(self, value: Input, http_session: Session): - value = urllib.parse.quote(value) - url = f'{self.url}{value}' - - # This could be moved to a config file - NUM_TRIES = 5 - for _ in range(NUM_TRIES): - response = http_session.get(url) - if response is not None: - # looks like it worked - break - - # if the reponse is still None here, throw an error - if response is None: - raise RuntimeError(f"no response from {url}") - return response.json() - - def handle_response(self, value, response: dict) -> List[Identifier]: - identifiers = [] - """ Parse each identifier and initialize identifier object """ - for span in response.get('spans', []): - search_text = span.get('text', None) - for token in span.get('token', []): - curie = token.get('id', None) - if not curie: - continue - - biolink_types = token.get('category') - label = token.get('terms')[0] - identifiers.append(Identifier(id=curie, - label=label, - types=biolink_types, - search_text=search_text)) - return identifiers - - -class Normalizer(ApiClient[Identifier, Identifier]): - def __init__(self, url): - self.bl_toolkit = bmt.Toolkit() - self.url = url - - def normalize(self, identifier: Identifier, http_session: Session): - # Use RENCI's normalization API service to get the preferred version of an identifier - logger.debug(f"Normalizing: {identifier.id}") - return self(identifier, http_session) - - def make_request(self, value: Identifier, http_session: Session) -> dict: - curie = value.id - url = f"{self.url}{urllib.parse.quote(curie)}" - try: - response = http_session.get(url) - except Exception as get_exc: - logger.info(f"Error normalizing {value} at {url}") - logger.error(f"Error {get_exc.__class__.__name__}: {get_exc}") - return {} - try: - normalized = response.json() - except Exception as json_exc: - logger.info(f"Error processing response: {response.text} (HTTP {response.status_code})") - logger.error(f"Error {json_exc.__class__.__name__}: {json_exc}") - return {} - - return normalized - - def handle_response(self, identifier: Identifier, normalized: dict) -> Optional[Identifier]: - """ Record normalized results. """ - curie = identifier.id - normalization = normalized.get(curie, {}) - if normalization is None: - logger.info(f"Normalization service did not return normalization for: {curie}") - return None - - preferred_id = normalization.get("id", {}) - equivalent_identifiers = normalization.get("equivalent_identifiers", []) - biolink_type = normalization.get("type", []) - - # Return none if there isn't actually a preferred id - if 'identifier' not in preferred_id: - logger.debug(f"ERROR: normalize({curie})=>({preferred_id}). 
No identifier?") - return None - - logger.debug(f"Preferred id: {preferred_id}") - identifier.id = preferred_id.get('identifier', '') - identifier.label = preferred_id.get('label', '') - identifier.description = preferred_id.get('description', '') - identifier.equivalent_identifiers = [v['identifier'] for v in equivalent_identifiers] - try: - identifier.types = self.bl_toolkit.get_element(biolink_type[0]).name - except: - # converts biolink:SmallMolecule to small molecule - identifier.types = (" ".join(re.split("(?=[A-Z])", biolink_type[0].replace('biolink:', ''))[1:])).lower() - return identifier - - -class SynonymFinder(ApiClient[str, List[str]]): - - def __init__(self, url: str): - self.url = url - - def get_synonyms(self, curie: str, http_session): - ''' - This function uses the NCATS translator service to return a list of synonyms for - curie id - ''' - - return self(curie, http_session) - - def make_request(self, curie: str, http_session: Session): - # Get response from namelookup reverse lookup op - # example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post) - url = f"{self.url}" - payload = { - 'curies': [curie] - } - try: - response = http_session.post(url, json=payload) - if str(response.status_code).startswith('4'): - logger.error(f"No synonyms returned for: `{curie}`. Validation error: {response.text}") - return {curie: []} - if str(response.status_code).startswith('5'): - logger.error(f"No synonyms returned for: `{curie}`. Internal server error from {self.url}. Error: {response.text}") - return {curie: []} - return response.json() - except json.decoder.JSONDecodeError as e: - logger.error(f"Json parse error for response from `{url}`. Exception: {str(e)}") - return {curie: []} - - def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]: - # Return curie synonyms - return raw_synonyms.get(curie, []) - - - - - -class BioLinkPURLerizer: - # Static class for the sole purpose of doing lookups of different ontology PURLs - # Is it pretty? No. But it gets the job done. 
- biolink_lookup = {"APO": "http://purl.obolibrary.org/obo/APO_", - "Aeolus": "http://translator.ncats.nih.gov/Aeolus_", - "BIOGRID": "http://identifiers.org/biogrid/", - "BIOSAMPLE": "http://identifiers.org/biosample/", - "BSPO": "http://purl.obolibrary.org/obo/BSPO_", - "CAID": "http://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/by_caid?caid=", - "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_", - "CHEMBL.COMPOUND": "http://identifiers.org/chembl.compound/", - "CHEMBL.MECHANISM": "https://www.ebi.ac.uk/chembl/mechanism/inspect/", - "CHEMBL.TARGET": "http://identifiers.org/chembl.target/", - "CID": "http://pubchem.ncbi.nlm.nih.gov/compound/", - "CL": "http://purl.obolibrary.org/obo/CL_", - "CLINVAR": "http://identifiers.org/clinvar/", - "CLO": "http://purl.obolibrary.org/obo/CLO_", - "COAR_RESOURCE": "http://purl.org/coar/resource_type/", - "CPT": "https://www.ama-assn.org/practice-management/cpt/", - "CTD": "http://translator.ncats.nih.gov/CTD_", - "ClinVarVariant": "http://www.ncbi.nlm.nih.gov/clinvar/variation/", - "DBSNP": "http://identifiers.org/dbsnp/", - "DGIdb": "https://www.dgidb.org/interaction_types", - "DOID": "http://purl.obolibrary.org/obo/DOID_", - "DRUGBANK": "http://identifiers.org/drugbank/", - "DrugCentral": "http://translator.ncats.nih.gov/DrugCentral_", - "EC": "http://www.enzyme-database.org/query.php?ec=", - "ECTO": "http://purl.obolibrary.org/obo/ECTO_", - "EDAM-DATA": "http://edamontology.org/data_", - "EDAM-FORMAT": "http://edamontology.org/format_", - "EDAM-OPERATION": "http://edamontology.org/operation_", - "EDAM-TOPIC": "http://edamontology.org/topic_", - "EFO": "http://identifiers.org/efo/", - "ENSEMBL": "http://identifiers.org/ensembl/", - "ExO": "http://purl.obolibrary.org/obo/ExO_", - "FAO": "http://purl.obolibrary.org/obo/FAO_", - "FB": "http://identifiers.org/fb/", - "FBcv": "http://purl.obolibrary.org/obo/FBcv_", - "FlyBase": "http://flybase.org/reports/", - "GAMMA": "http://translator.renci.org/GAMMA_", - "GO": "http://purl.obolibrary.org/obo/GO_", - "GOLD.META": "http://identifiers.org/gold.meta/", - "GOP": "http://purl.obolibrary.org/obo/go#", - "GOREL": "http://purl.obolibrary.org/obo/GOREL_", - "GSID": "https://scholar.google.com/citations?user=", - "GTEx": "https://www.gtexportal.org/home/gene/", - "HANCESTRO": "http://www.ebi.ac.uk/ancestro/ancestro_", - "HCPCS": "http://purl.bioontology.org/ontology/HCPCS/", - "HGNC": "http://identifiers.org/hgnc/", - "HGNC.FAMILY": "http://identifiers.org/hgnc.family/", - "HMDB": "http://identifiers.org/hmdb/", - "HP": "http://purl.obolibrary.org/obo/HP_", - "ICD0": "http://translator.ncats.nih.gov/ICD0_", - "ICD10": "http://translator.ncats.nih.gov/ICD10_", - "ICD9": "http://translator.ncats.nih.gov/ICD9_", - "INCHI": "http://identifiers.org/inchi/", - "INCHIKEY": "http://identifiers.org/inchikey/", - "INTACT": "http://identifiers.org/intact/", - "IUPHAR.FAMILY": "http://identifiers.org/iuphar.family/", - "KEGG": "http://identifiers.org/kegg/", - "LOINC": "http://loinc.org/rdf/", - "MEDDRA": "http://identifiers.org/meddra/", - "MESH": "http://identifiers.org/mesh/", - "MGI": "http://identifiers.org/mgi/", - "MI": "http://purl.obolibrary.org/obo/MI_", - "MIR": "http://identifiers.org/mir/", - "MONDO": "http://purl.obolibrary.org/obo/MONDO_", - "MP": "http://purl.obolibrary.org/obo/MP_", - "MSigDB": "https://www.gsea-msigdb.org/gsea/msigdb/", - "MetaCyc": "http://translator.ncats.nih.gov/MetaCyc_", - "NCBIGENE": "http://identifiers.org/ncbigene/", - "NCBITaxon": 
"http://purl.obolibrary.org/obo/NCBITaxon_", - "NCIT": "http://purl.obolibrary.org/obo/NCIT_", - "NDDF": "http://purl.bioontology.org/ontology/NDDF/", - "NLMID": "https://www.ncbi.nlm.nih.gov/nlmcatalog/?term=", - "OBAN": "http://purl.org/oban/", - "OBOREL": "http://purl.obolibrary.org/obo/RO_", - "OIO": "http://www.geneontology.org/formats/oboInOwl#", - "OMIM": "http://purl.obolibrary.org/obo/OMIM_", - "ORCID": "https://orcid.org/", - "ORPHA": "http://www.orpha.net/ORDO/Orphanet_", - "ORPHANET": "http://identifiers.org/orphanet/", - "PANTHER.FAMILY": "http://identifiers.org/panther.family/", - "PANTHER.PATHWAY": "http://identifiers.org/panther.pathway/", - "PATO-PROPERTY": "http://purl.obolibrary.org/obo/pato#", - "PDQ": "https://www.cancer.gov/publications/pdq#", - "PHARMGKB.DRUG": "http://identifiers.org/pharmgkb.drug/", - "PHARMGKB.PATHWAYS": "http://identifiers.org/pharmgkb.pathways/", - "PHAROS": "http://pharos.nih.gov", - "PMID": "http://www.ncbi.nlm.nih.gov/pubmed/", - "PO": "http://purl.obolibrary.org/obo/PO_", - "POMBASE": "http://identifiers.org/pombase/", - "PR": "http://purl.obolibrary.org/obo/PR_", - "PUBCHEM.COMPOUND": "http://identifiers.org/pubchem.compound/", - "PUBCHEM.SUBSTANCE": "http://identifiers.org/pubchem.substance/", - "PathWhiz": "http://smpdb.ca/pathways/#", - "REACT": "http://www.reactome.org/PathwayBrowser/#/", - "REPODB": "http://apps.chiragjpgroup.org/repoDB/", - "RGD": "http://identifiers.org/rgd/", - "RHEA": "http://identifiers.org/rhea/", - "RNACENTRAL": "http://identifiers.org/rnacentral/", - "RO": "http://purl.obolibrary.org/obo/RO_", - "RTXKG1": "http://kg1endpoint.rtx.ai/", - "RXNORM": "http://purl.bioontology.org/ontology/RXNORM/", - "ResearchID": "https://publons.com/researcher/", - "SEMMEDDB": "https://skr3.nlm.nih.gov/SemMedDB", - "SGD": "http://identifiers.org/sgd/", - "SIO": "http://semanticscience.org/resource/SIO_", - "SMPDB": "http://identifiers.org/smpdb/", - "SNOMEDCT": "http://identifiers.org/snomedct/", - "SNPEFF": "http://translator.ncats.nih.gov/SNPEFF_", - "ScopusID": "https://www.scopus.com/authid/detail.uri?authorId=", - "TAXRANK": "http://purl.obolibrary.org/obo/TAXRANK_", - "UBERGRAPH": "http://translator.renci.org/ubergraph-axioms.ofn#", - "UBERON": "http://purl.obolibrary.org/obo/UBERON_", - "UBERON_CORE": "http://purl.obolibrary.org/obo/uberon/core#", - "UMLS": "http://identifiers.org/umls/", - "UMLSSC": "https://metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt/code#", - "UMLSSG": "https://metamap.nlm.nih.gov/Docs/SemGroups_2018.txt/group#", - "UMLSST": "https://metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt/type#", - "UNII": "http://identifiers.org/unii/", - "UPHENO": "http://purl.obolibrary.org/obo/UPHENO_", - "UniProtKB": "http://identifiers.org/uniprot/", - "VANDF": "https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/VANDF/", - "VMC": "https://github.com/ga4gh/vr-spec/", - "WB": "http://identifiers.org/wb/", - "WBPhenotype": "http://purl.obolibrary.org/obo/WBPhenotype_", - "WBVocab": "http://bio2rdf.org/wormbase_vocabulary", - "WIKIDATA": "https://www.wikidata.org/wiki/", - "WIKIDATA_PROPERTY": "https://www.wikidata.org/wiki/Property:", - "WIKIPATHWAYS": "http://identifiers.org/wikipathways/", - "WormBase": "https://www.wormbase.org/get?name=", - "ZFIN": "http://identifiers.org/zfin/", - "ZP": "http://purl.obolibrary.org/obo/ZP_", - "alliancegenome": "https://www.alliancegenome.org/", - "biolink": "https://w3id.org/biolink/vocab/", - "biolinkml": "https://w3id.org/biolink/biolinkml/", - "chembio": 
"http://translator.ncats.nih.gov/chembio_", - "dcterms": "http://purl.org/dc/terms/", - "dictyBase": "http://dictybase.org/gene/", - "doi": "https://doi.org/", - "fabio": "http://purl.org/spar/fabio/", - "foaf": "http://xmlns.com/foaf/0.1/", - "foodb.compound": "http://foodb.ca/compounds/", - "gff3": "https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md#", - "gpi": "https://github.com/geneontology/go-annotation/blob/master/specs/gpad-gpi-2-0.md#", - "gtpo": "https://rdf.guidetopharmacology.org/ns/gtpo#", - "hetio": "http://translator.ncats.nih.gov/hetio_", - "interpro": "https://www.ebi.ac.uk/interpro/entry/", - "isbn": "https://www.isbn-international.org/identifier/", - "isni": "https://isni.org/isni/", - "issn": "https://portal.issn.org/resource/ISSN/", - "medgen": "https://www.ncbi.nlm.nih.gov/medgen/", - "oboformat": "http://www.geneontology.org/formats/oboInOWL#", - "pav": "http://purl.org/pav/", - "prov": "http://www.w3.org/ns/prov#", - "qud": "http://qudt.org/1.1/schema/qudt#", - "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", - "rdfs": "http://www.w3.org/2000/01/rdf-schema#", - "skos": "https://www.w3.org/TR/skos-reference/#", - "wgs": "http://www.w3.org/2003/01/geo/wgs84_pos", - "xsd": "http://www.w3.org/2001/XMLSchema#", - "@vocab": "https://w3id.org/biolink/vocab/"} - - @staticmethod - def get_curie_purl(curie): - # Split into prefix and suffix - suffix = curie.split(":")[1] - prefix = curie.split(":")[0] - - # Check to see if the prefix exists in the hash - if prefix not in BioLinkPURLerizer.biolink_lookup: - return None - - return f"{BioLinkPURLerizer.biolink_lookup[prefix]}{suffix}" - - -if __name__ == "__main__": - import doctest - - doctest.testmod() diff --git a/src/dug/core/annotators/monarch_annotator.py b/src/dug/core/annotators/monarch_annotator.py index 766e192..841e9cf 100644 --- a/src/dug/core/annotators/monarch_annotator.py +++ b/src/dug/core/annotators/monarch_annotator.py @@ -1,13 +1,9 @@ -import json import logging -import os -import re import urllib.parse -from typing import TypeVar, Generic, Union, List, Tuple, Optional -from dug.config import Config +from typing import List from requests import Session -from ._base import DugIdentifier, AnnotatorSession, Input +from ._base import DugIdentifier, Input from .utils.biolink_purl_util import BioLinkPURLerizer logger = logging.getLogger('dug') diff --git a/src/dug/core/concept_expander.py b/src/dug/core/concept_expander.py index 2df9a8c..bc8eef5 100644 --- a/src/dug/core/concept_expander.py +++ b/src/dug/core/concept_expander.py @@ -95,4 +95,5 @@ def expand_identifier(self, identifier, query_factory, kg_filename, include_all_ "See err msg for details.") continue - return answer_kgs \ No newline at end of file + return answer_kgs + \ No newline at end of file diff --git a/src/dug/core/crawler.py b/src/dug/core/crawler.py index 1ebc0d6..7331756 100644 --- a/src/dug/core/crawler.py +++ b/src/dug/core/crawler.py @@ -2,6 +2,7 @@ import logging import os import traceback +from typing import List from dug.core.parsers import Parser, DugElement, DugConcept from dug.core.annotators import Annotator, DugIdentifier @@ -144,7 +145,7 @@ def annotate_element(self, element): # Annotate with a set of normalized ontology identifiers # self.DugAnnotator.annotator() - identifiers = self.annotator(text=element.ml_ready_desc, + identifiers: List[DugIdentifier] = self.annotator(text=element.ml_ready_desc, http_session=self.http_session) # Future thoughts... 
should we be passing in the stpe DugIdentifier here instead? diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 1a6b7da..1e66644 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -1,3 +1,175 @@ from pathlib import Path +import json +import urllib.parse +from dataclasses import dataclass +from typing import Dict + +import pytest TEST_DATA_DIR = Path(__file__).parent.resolve() / 'data' + + +@dataclass +class MockResponse: + text: str + status_code: int = 200 + + def json(self): + return json.loads(self.text) + + +class MockApiService: + def __init__(self, urls: Dict[str, list]): + self.urls = urls + + def get(self, url, params: dict = None): + if params: + qstr = urllib.parse.urlencode(params, quote_via=urllib.parse.quote) + url = f"{url}?{qstr}" + + text, status_code = self.urls.get(url) + + if text is None: + return MockResponse(text="{}", status_code=404) + return MockResponse(text, status_code=status_code) + + def post(self, url, params: dict = None, json: dict = {}): + if params: + qstr = urllib.parse.urlencode(params, quote_via=urllib.parse.quote) + url = f"{url}?{qstr}" + text, status_code = self.urls.get(url) + + if text is None: + return MockResponse(text="{}", status_code=404) + return MockResponse(text, status_code=status_code) + +@pytest.fixture +def monarch_annotator_api(): + base_url = "http://annotator.api/?content={query}" + + def _(keyword): + return base_url.format(query=urllib.parse.quote(keyword)) + + urls = { + _("heart attack"): [ + json.dumps( + { + "content": "heart attack", + "spans": [ + { + "start": 0, + "end": 5, + "text": "heart", + "token": [ + { + "id": "UBERON:0007100", + "category": ["anatomical entity"], + "terms": ["primary circulatory organ"], + } + ], + }, + { + "start": 0, + "end": 5, + "text": "heart", + "token": [ + { + "id": "XAO:0000336", + "category": [], + "terms": ["heart primordium"], + } + ], + }, + ], + } + ), + 200, + ], + } + + return MockApiService( + urls=urls, + ) + +@pytest.fixture +def normalizer_api(): + base_url = "http://normalizer.api/?curie={curie}" + + def _(curie): + return base_url.format( + curie=urllib.parse.quote(curie), + ) + + urls = { + _("UBERON:0007100"): [json.dumps( + { + "UBERON:0007100": { + "id": { + "identifier": "UBERON:0007100", + "label": "primary circulatory organ" + }, + "equivalent_identifiers": [ + { + "identifier": "UBERON:0007100", + "label": "primary circulatory organ" + } + ], + "type": [ + "biolink:AnatomicalEntity", + "biolink:OrganismalEntity", + "biolink:BiologicalEntity", + "biolink:NamedThing", + "biolink:Entity" + ] + } + }, + ), 200], + + } + + return MockApiService( + urls=urls, + ) +@pytest.fixture +def null_normalizer_api(): + base_url = "http://normalizer.api/?curie={curie}" + + def _(curie): + return base_url.format( + curie=urllib.parse.quote(curie), + ) + + urls = { + _("XAO:0000336"): [json.dumps( + { + "XAO:0000336": None + }, + ), 200], + + } + + return MockApiService( + urls=urls, + ) + +@pytest.fixture +def synonym_api(): + return MockApiService(urls={ + "http://synonyms.api": [json.dumps({ + "UBERON:0007100": [ + "primary circulatory organ", + "dorsal tube", + "adult heart", + "heart" + ] + }), 200] + }) + +@pytest.fixture +def null_synonym_api(): + return MockApiService(urls={ + "http://synonyms.api": [json.dumps({ + "XAO:0000336": [ + ] + }), 200] + }) \ No newline at end of file diff --git a/tests/unit/mocks/data/test_config.py b/tests/integration/mocks/mock_config.py similarity index 98% rename from 
tests/unit/mocks/data/test_config.py rename to tests/integration/mocks/mock_config.py index cc94593..27ca191 100644 --- a/tests/unit/mocks/data/test_config.py +++ b/tests/integration/mocks/mock_config.py @@ -2,7 +2,7 @@ @dataclass -class TestConfig: +class MockConfig: # Preprocessor config that will be passed to annotate.Preprocessor constructor preprocessor: dict = field(default_factory=lambda: { diff --git a/tests/integration/test_annotators.py b/tests/integration/test_annotators.py index ad24586..d8f5c45 100644 --- a/tests/integration/test_annotators.py +++ b/tests/integration/test_annotators.py @@ -1,271 +1,72 @@ -# from copy import copy -# from typing import List -# from attr import field - -# import pytest - -# from tests.unit.mocks.data.test_config import TestConfig -# from dug.core.annotators import ( -# DugIdentifier, -# AnnotateMonarch, -# DefaultNormalizer, -# DefaultSynonymFinder, -# ) -# from unittest.mock import MagicMock - - -# def test_monarch_annotator_workflow(): -# http_session = MagicMock() -# cfg = TestConfig.test_from_env() - -# annotator = AnnotateMonarch( -# normalizer=DefaultNormalizer(cfg.normalizer), -# synonym_finder=DefaultSynonymFinder(cfg.synonym_service), -# config=cfg, -# ) -# text = "heart attack" -# identifiers: List[DugIdentifier] = annotator.annotation_workflow(text, http_session) - -# assert len(identifiers) == 7 -# assert isinstance(identifiers[0], DugIdentifier) - - -# def test_monarch_annotator_annotate(): -# http_session = MagicMock() -# cfg = TestConfig.test_from_env() -# normalizer = DefaultNormalizer(cfg.normalizer) -# synonym_finder = DefaultSynonymFinder(cfg.synonym_service) - -# annotator = AnnotateMonarch( -# normalizer=normalizer, synonym_finder=synonym_finder, config=cfg -# ) -# text = "heart attack" -# identifiers: List[DugIdentifier] = annotator.annotate_text(text, http_session) - -# assert len(identifiers) == 7 -# assert isinstance(identifiers[0], DugIdentifier) - - -# # @pytest.mark.parametrize( -# # "preprocessor,input_text,expected_text", -# # [ -# # (Preprocessor(), "Hello_world", "Hello world"), -# # (Preprocessor({"Hello": "Hi"}, ["placeholder"]), "Hello placeholder world", "Hi world"), -# # ] -# # ) -# # def test_preprocessor_preprocess(preprocessor, input_text, expected_text): -# # original_text = copy(input_text) -# # output_text = preprocessor.preprocess(input_text) - -# # assert input_text == original_text # Don't modify in-place -# # assert output_text == expected_text - - -# # def test_annotator_init(): -# # cfg = Config.from_env() -# # url = cfg.annotator["url"] - -# # annotator = Annotator(**cfg.annotator) -# # assert annotator.url == url - - -# # def test_annotator_handle_response(): -# # annotator = Annotator('foo') - -# # response = { -# # "content": "heart attack", -# # "spans": [ -# # { -# # "start": 0, -# # "end": 5, -# # "text": "heart", -# # "token": [ -# # { -# # "id": "UBERON:0015230", -# # "category": [ -# # "anatomical entity" -# # ], -# # "terms": [ -# # "dorsal vessel heart" -# # ] -# # } -# # ] -# # }, -# # { -# # "start": 0, -# # "end": 5, -# # "text": "heart", -# # "token": [ -# # { -# # "id": "UBERON:0007100", -# # "category": [ -# # "anatomical entity" -# # ], -# # "terms": [ -# # "primary circulatory organ" -# # ] -# # } -# # ] -# # }, -# # { -# # "start": 0, -# # "end": 5, -# # "text": "heart", -# # "token": [ -# # { -# # "id": "UBERON:0015228", -# # "category": [ -# # "anatomical entity" -# # ], -# # "terms": [ -# # "circulatory organ" -# # ] -# # } -# # ] -# # }, -# # { -# # "start": 0, -# # "end": 
5, -# # "text": "heart", -# # "token": [ -# # { -# # "id": "ZFA:0000114", -# # "category": [ -# # "anatomical entity" -# # ], -# # "terms": [ -# # "heart" -# # ] -# # } -# # ] -# # }, -# # { -# # "start": 0, -# # "end": 5, -# # "text": "heart", -# # "token": [ -# # { -# # "id": "UBERON:0000948", -# # "category": [ -# # "anatomical entity" -# # ], -# # "terms": [ -# # "heart" -# # ] -# # } -# # ] -# # }, -# # { -# # "start": 0, -# # "end": 12, -# # "text": "heart attack", -# # "token": [ -# # { -# # "id": "MONDO:0005068", -# # "category": [ -# # "disease" -# # ], -# # "terms": [ -# # "myocardial infarction (disease)" -# # ] -# # } -# # ] -# # }, -# # { -# # "start": 0, -# # "end": 12, -# # "text": "heart attack", -# # "token": [ -# # { -# # "id": "HP:0001658", -# # "category": [ -# # "phenotype", -# # "quality" -# # ], -# # "terms": [ -# # "Myocardial infarction" -# # ] -# # } -# # ] -# # } -# # ] -# # } - -# # identifiers: List[DugIdentifier] = annotator.handle_response(None, response) - -# # assert len(identifiers) == 7 -# # assert isinstance(identifiers[0], DugIdentifier) - - -# # def test_annotator_call(annotator_api): -# # url = "http://annotator.api/?content=" - -# # annotator = Annotator(url) - -# # text = "heart attack" -# # identifiers: List[DugIdentifier] = annotator.annotate(text, annotator_api) - -# # assert len(identifiers) == 7 -# # assert isinstance(identifiers[0], DugIdentifier) - - -# # def test_normalizer(normalizer_api): -# # url = "http://normalizer.api/?curie=" - -# # identifier = DugIdentifier( -# # "UBERON:0007100", -# # label='primary circulatory organ', -# # types=['anatomical entity'], -# # description="", -# # search_text=['heart'], -# # ) - -# # normalizer = Normalizer(url) -# # output = normalizer.normalize(identifier, normalizer_api) -# # assert isinstance(output, DugIdentifier) -# # assert output.id == 'UBERON:0007100' -# # assert output.label == "primary circulatory organ" -# # assert output.equivalent_identifiers == ['UBERON:0007100'] -# # assert output.types == 'anatomical entity' - - -# # def test_synonym_finder(synonym_api): -# # curie = "UBERON:0007100" -# # url = f"http://synonyms.api" -# # finder = SynonymFinder(url) -# # result = finder.get_synonyms( -# # curie, -# # synonym_api, -# # ) -# # assert result == [ -# # "primary circulatory organ", -# # "dorsal tube", -# # "adult heart", -# # "heart" -# # ] - - -# # def test_yield_partial_text(): -# # annotator = Annotator('foo') -# # # text contains 800 characters + 9 new lines -# # text = """COG Protocol number on which the patient was enrolled [901=Trial of mouse monoclonal Anti-GD-2 antibody 14.G2A plus IL-2 with or without GM-CSF in children with refractory NBL or melanoma; 911=I-131-MIBG for therapy of advanced neuroblastoma; 914=A dose escalation study of cisplatin, doxorubicin, VP-16, and ifosfamide followed by GM-CSF in advanced NBL and peripheral neuroepithelioma; 925=Study of topotecan; 935=Study of ch14.18 with GM-CSF in children with NBL and other GD2 positive malignancies immediately post ABMT or PBSC; 937=Phase I trial of ZD1694, an inhibitor of thymidylate synthase, in pediatric patients with advanced neoplastic disease; 9709=A phase I study of fenretinide in children with high risk solid tumors; 321P2=New intensive chemotherapy for CCG stage II (with N-myc amplification), stage III and stage IV neuroblastoma; 321P3=Treatment of poor prognosis neuroblastoma before disease progression with intensive multimodal therapy and BMT; 323P=Cyclic combination chemotherapy for newly diagnosed stage 
III neuroblastoma age 2 and older and stage IV Nneuroblastoma all ages; 3881=Biology and therapy of good, intermediate, and selected poor prognosis neuroblastoma; 3891=Conventional dose chemoradiotherapy vs ablative chemoradiotherapy with autologous BMT for high-risk neuroblastoma; 3951=Phase I pilot study of multiple cycles of high dose chemotherapy with peripheral blood stem cell infusions in advanced stage neuroblastoma.; 4941=National Wilms tumor study V - therapeutic trial & biology study; 8605=Study of the combination of ifosfamide, mesna, and VP-16 in children and young adults with recurrent sarcomas, PNET and other tumors; 8742=Phase III portion of 8741 for neuroblastoma; 9047=Neuroblastoma biology protocol; 9082=Protocol for the development of intervention strategies to reduce the time between symptom onset and diagnosis of childhood cancer -a pediatric oncology group cancer control study; 9140=Therapy for patients with recurrent or refractory neuroblastoma - a phase II study; 9262=A Phase II study of taxol in children with recurrent/refractory soft-tissue sarcoma, rhabdomyosarcoma, osteosarcoma, Ewing's sarcoma, neuroblastoma, germ cell tumors, Wilms' tumor, hepatoblastoma, and hepatocellular carcinoma, a POG study; 9280=Neuroblastoma epidemiology protocol - A Non-Therapeutic Study - A Joint Project of: The University of North Carolina, The Pediatric Oncology Group and The Children's Cancer Study Group; 9340=Treatment of patients >365 days at diagnosis with stage IV NBL: Upfront Phase II Window - A Phase II Study; 9341=Treatment of patients >365 days at diagnosis with stage IV and stage IIB/III (N-myc) NBL - a phase III study; 9342=Neuroblastoma #5, bone marrow transplant - a phase III study; 9343=Interleukin-6 in children receiving autologous bone marrow transplantation for advanced neuroblastoma - a pediatric oncology group phase I trial; 9361=Topotecan in pediatric patients with recurrent or progressive solid tumors - a pediatric oncology group phase II study; 9375=Topotecan plus cyclophosphamide in children with solid tumors - a pediatric oncology group phase I trial; 9464=Cyclophosphamide plus topotecan in children with recurrent or refractory solid tumors - a pediatric oncology group phase II study; 9640=Treatment of patients with high risk neuroblastoma (a feasibility pilot) using two cycles of marrow ablative chemotherapy followed by rescue With peripheral blood stem cells (PBSC), radiation therapy; A3973=A randomized study of purged vs. 
unpurged PBSC transplant following dose intensive induction therapy for high risk NBL; AADM01P1=Protocol for registration and consent to the childhood cancer research network: a limited institution pilot; AAML00P2=A dose finding study of the safety of gemtuzumab ozogamicin combined with conventional chemotherapy for patients with relapsed or refractory acute myeloid leukemia; ACCL0331=A Randomized double blind placebo controlled clinical trial to assess the efficacy of traumeelĀ® S (IND # 66649) for the prevention and treatment of mucositis in children undergoing hematopoietic stem cell transplantation; ACCRN07=Protocol for the enrollment on the official COG registry, The Childhood Cancer Research Network (CCRN); ADVL0018=Phase I study of hu14.18-IL2 fusion protein in patients with refractory neuroblastoma and other refractory GD2 expressing tumors; ADVL0212=A Phase I study of depsipeptide (NSC#630176, IND# 51810) in pediatric patients with refractory solid tumors and leukemias; ADVL0214=A phase I study of single agent OSI-774 (Tarceva) (NSC # 718781, IND #63383) followed by OSI-774 with temozolomide for patients with selected recurrent/refractory solid tumors, including brain tumors; ADVL0215=A phase I study of decitabine in combination with doxorubicin and cyclophosphamide in the treatment of relapsed or refractory solid tumors; ADVL0421=A phase II study of oxaliplatin in children with recurrent solid tumors; ADVL0524=Phase II trial of ixabepilone (BMS-247550), an epothilone B analog, in children and young adults with refractory solid tumors; ADVL0525=A phase II study of pemetrexed in children with recurrent malignancies; ADVL06B1=A pharmacokinetic-pharmacodynamic-pharmacogenetic study of actinomycin-D and vincristine in children with cancer; ADVL0714=A phase I study of VEGF trap (NSC# 724770, IND# 100137) in children with refractory solid tumors; ALTE03N1=Key adverse events after childhood cancer; ALTE05N1=Umbrella long-term follow-up protocol; ANBL0032=Phase III randomized study of chimeric antibody 14.18 (Ch14.18) in high risk neuroblastoma following myeloablative therapy and autologous stem cell rescue; ANBL00B1=Neuroblastoma biology studies; ANBL00P1=A pilot study of tandem high dose chemotherapy with stem cell rescue following induction therapy in children with high risk neuroblastoma; ANBL02P1=A pilot induction regimen incorporating dose-intensive topotecan and cyclophosphamide for treatment of newly diagnosed high risk neuroblastoma; ANBL0321=Phase II study of fenretinide in pediatric patients with resistant or recurrent neuroblastoma; ANBL0322=A phase II study of hu14.18-IL2 (BB-IND-9728) in children with recurrent or refractory neuroblastoma; ANBL0532=Phase III randomized trial of single vs. 
tandem myeloablative as consolidation therapy for high-risk neuroblastoma; ANBL0621=A phase II study of ABT-751, an orally bioavailable tubulin binding agent, in children with relapsed or refractory neuroblastoma; B003=Diagnostic & prognostic studies in NBL; B903=Childhood cancer genetics; B947=Protocol for collection of biology specimens for research studies; B954=Opsoclonus-myoclonus-ataxia syndrome, neuroblastoma and the presence of anti-neuronal antibodies; B973=Laboratory-clinical studies of neuroblastoma; E04=Self-administered epidemiology questionnaire; E18=A case-control study of risk factors for neuroblastoma; I03=Neuroblastoma, diagnostic/prognostic; N891=Parents' perceptions of randomization; P9462=Randomized treatment of recurrent neuroblastoma with topotecan regimens following desferrioxamine (POG only) in an investigational window; P9641=Primary surgical therapy for biologically defined low-risk neuroblastoma; P9761=A phase II trial of irinotecan in children with refractory solid tumors; P9963=A phase II trial of rebeccamycin analogue (NSC #655649) in children with solid tumors; R9702=Prognostic implications of MIBG uptake in patients with neuroblastoma previously treated on CCG-3891; S31=Right atrial catheter study; S921=Comparison of urokinase vs heparin in preventing Infection in central venous devices in children with malignancies]""" -# # chunks = "" -# # is_the_beginning = True -# # max_chars = 2000 -# # padding_words = 3 -# # counter = 0 -# # print(len(text)) -# # # divvy up into chunks, sum of each chunk should equal the original text. -# # for chunk in annotator.sliding_window(text=text, max_characters=max_chars, padding_words= padding_words): -# # assert len(chunk) <= max_chars -# # counter += 1 -# # if is_the_beginning: -# # chunks += chunk -# # else: -# # # remove redundand padded words from final result -# # chunks += " ".join(chunk.split(" ")[padding_words:]) -# # is_the_beginning = False - -# # print(counter) -# # # since spaces are trimmed by tokenizer , we can execuled all spaces and do char -# # assert chunks == text +from copy import copy +from typing import List +from attr import field + +import pytest +from dug.core.annotators.utils.biolink_purl_util import BioLinkPURLerizer + + +from tests.integration.mocks.mock_config import MockConfig +from dug.core.annotators import ( + DugIdentifier, + AnnotateMonarch, + DefaultNormalizer, + DefaultSynonymFinder +) + +def test_monarch_annotation_full(monarch_annotator_api, normalizer_api, null_normalizer_api, synonym_api, null_synonym_api): + cfg = MockConfig.test_from_env() + normalizer = DefaultNormalizer(**cfg.normalizer) + synonym_finder = DefaultSynonymFinder(**cfg.synonym_service) + + annotator = AnnotateMonarch( + normalizer=normalizer, synonym_finder=synonym_finder, config=cfg + ) + input_text = "heart attack" + + text = annotator.preprocess_text(input_text) + + # Fetch identifiers + raw_identifiers: List[DugIdentifier] = annotator.annotate_text(text, monarch_annotator_api) + + processed_identifiers: List[DugIdentifier] = [] + for identifier in raw_identifiers: + if identifier.id == "UBERON:0007100": + # Perform normal normalization + output = annotator.normalizer(identifier, normalizer_api) + + assert isinstance(output, DugIdentifier) + assert output.id == 'UBERON:0007100' + assert output.label == "primary circulatory organ" + assert output.equivalent_identifiers == ['UBERON:0007100'] + assert output.types == 'anatomical entity' + else: + # act as if this is null + output = annotator.normalizer(identifier, 
null_normalizer_api) + + # Should be returning normalized identifier for each identifier passed in + if output is None: + output = identifier + # Test normalizer when null + assert output.id == 'XAO:0000336' + assert output.label == "heart primordium" + + # Add synonyms to identifier + if identifier.id == "UBERON:0007100": + output.synonyms = annotator.synonym_finder(output.id, synonym_api) + assert output.synonyms == [ + "primary circulatory organ", + "dorsal tube", + "adult heart", + "heart" + ] + else: + output.synonyms = annotator.synonym_finder(output.id, null_synonym_api) + assert output.synonyms == [] + # Get pURL for ontology identifer for more info + output.purl = BioLinkPURLerizer.get_curie_purl(output.id) + processed_identifiers.append(output) + + assert isinstance(processed_identifiers, List) + assert len(processed_identifiers) == 2 + assert isinstance(processed_identifiers[0], DugIdentifier) \ No newline at end of file diff --git a/tests/unit/mocks/MockCrawler.py b/tests/unit/mocks/MockCrawler.py index 5a3f077..2597d77 100644 --- a/tests/unit/mocks/MockCrawler.py +++ b/tests/unit/mocks/MockCrawler.py @@ -5,7 +5,7 @@ import json -from dug.core.annotate import Identifier +from dug.core.annotators import DugIdentifier from dug.core.tranql import QueryFactory, QueryKG # Makes some simple mokes @@ -25,8 +25,8 @@ ExcludedIDs = [] ANNOTATED_IDS = [ - Identifier("MONDO:0", "0", ["disease"]), - Identifier("PUBCHEM.COMPOUND:1", "1", ["chemical"]) + DugIdentifier("MONDO:0", "0", ["disease"]), + DugIdentifier("PUBCHEM.COMPOUND:1", "1", ["chemical"]) ] for ids in ANNOTATED_IDS: ids.type = ids.types[0] diff --git a/tests/unit/mocks/data/mock_config.py b/tests/unit/mocks/data/mock_config.py new file mode 100644 index 0000000..27ca191 --- /dev/null +++ b/tests/unit/mocks/data/mock_config.py @@ -0,0 +1,33 @@ +from dataclasses import dataclass, field + + +@dataclass +class MockConfig: + + # Preprocessor config that will be passed to annotate.Preprocessor constructor + preprocessor: dict = field(default_factory=lambda: { + "debreviator": { + "BMI": "body mass index" + }, + "stopwords": ["the"] + }) + + # Annotator config that will be passed to annotate.Annotator constructor + annotator: dict = field(default_factory=lambda: { + "url": "http://annotator.api/?content=" + }) + + # Normalizer config that will be passed to annotate.Normalizer constructor + normalizer: dict = field(default_factory=lambda: { + "url": "http://normalizer.api/?curie=" + }) + + # Synonym service config that will be passed to annotate.SynonymHelper constructor + synonym_service: dict = field(default_factory=lambda: { + "url": "http://synonyms.api" + }) + + @classmethod + def test_from_env(cls): + kwargs = {} + return cls(**kwargs) \ No newline at end of file diff --git a/tests/unit/test_annotate.py b/tests/unit/test_annotate.py deleted file mode 100644 index 8470a37..0000000 --- a/tests/unit/test_annotate.py +++ /dev/null @@ -1,245 +0,0 @@ -from copy import copy -from typing import List - -import pytest - -from dug.config import Config -from dug.core.annotate import Identifier, Preprocessor, Annotator, Normalizer, SynonymFinder - - -def test_identifier(): - ident_1 = Identifier( - "PrimaryIdent:1", "first identifier", types=[], search_text="", description="" - ) - - assert "PrimaryIdent" == ident_1.id_type - - -@pytest.mark.parametrize( - "preprocessor,input_text,expected_text", - [ - (Preprocessor(), "Hello_world", "Hello world"), - (Preprocessor({"Hello": "Hi"}, ["placeholder"]), "Hello placeholder world", "Hi world"), - ] 
-) -def test_preprocessor_preprocess(preprocessor, input_text, expected_text): - original_text = copy(input_text) - output_text = preprocessor.preprocess(input_text) - - assert input_text == original_text # Don't modify in-place - assert output_text == expected_text - - -def test_annotator_init(): - cfg = Config.from_env() - url = cfg.annotator["url"] - - annotator = Annotator(**cfg.annotator) - print(f"#########URL: {annotator.url}#########") - assert annotator.url == url - - -def test_annotator_handle_response(): - annotator = Annotator('foo') - - response = { - "content": "heart attack", - "spans": [ - { - "start": 0, - "end": 5, - "text": "heart", - "token": [ - { - "id": "UBERON:0015230", - "category": [ - "anatomical entity" - ], - "terms": [ - "dorsal vessel heart" - ] - } - ] - }, - { - "start": 0, - "end": 5, - "text": "heart", - "token": [ - { - "id": "UBERON:0007100", - "category": [ - "anatomical entity" - ], - "terms": [ - "primary circulatory organ" - ] - } - ] - }, - { - "start": 0, - "end": 5, - "text": "heart", - "token": [ - { - "id": "UBERON:0015228", - "category": [ - "anatomical entity" - ], - "terms": [ - "circulatory organ" - ] - } - ] - }, - { - "start": 0, - "end": 5, - "text": "heart", - "token": [ - { - "id": "ZFA:0000114", - "category": [ - "anatomical entity" - ], - "terms": [ - "heart" - ] - } - ] - }, - { - "start": 0, - "end": 5, - "text": "heart", - "token": [ - { - "id": "UBERON:0000948", - "category": [ - "anatomical entity" - ], - "terms": [ - "heart" - ] - } - ] - }, - { - "start": 0, - "end": 12, - "text": "heart attack", - "token": [ - { - "id": "MONDO:0005068", - "category": [ - "disease" - ], - "terms": [ - "myocardial infarction (disease)" - ] - } - ] - }, - { - "start": 0, - "end": 12, - "text": "heart attack", - "token": [ - { - "id": "HP:0001658", - "category": [ - "phenotype", - "quality" - ], - "terms": [ - "Myocardial infarction" - ] - } - ] - } - ] - } - - identifiers: List[Identifier] = annotator.handle_response(None, response) - - assert len(identifiers) == 7 - assert isinstance(identifiers[0], Identifier) - - -def test_annotator_call(annotator_api): - url = "http://annotator.api/?content=" - - annotator = Annotator(url) - - text = "heart attack" - identifiers: List[Identifier] = annotator.annotate(text, annotator_api) - - assert len(identifiers) == 7 - assert isinstance(identifiers[0], Identifier) - - -def test_normalizer(normalizer_api): - url = "http://normalizer.api/?curie=" - - identifier = Identifier( - "UBERON:0007100", - label='primary circulatory organ', - types=['anatomical entity'], - description="", - search_text=['heart'], - ) - - normalizer = Normalizer(url) - output = normalizer.normalize(identifier, normalizer_api) - assert isinstance(output, Identifier) - assert output.id == 'UBERON:0007100' - assert output.label == "primary circulatory organ" - assert output.equivalent_identifiers == ['UBERON:0007100'] - assert output.types == 'anatomical entity' - - - -def test_synonym_finder(synonym_api): - curie = "UBERON:0007100" - url = f"http://synonyms.api" - finder = SynonymFinder(url) - result = finder.get_synonyms( - curie, - synonym_api, - ) - assert result == [ - "primary circulatory organ", - "dorsal tube", - "adult heart", - "heart" - ] - - - - - -def test_yield_partial_text(): - annotator = Annotator('foo') - # text contains 800 characters + 9 new lines - text = """COG Protocol number on which the patient was enrolled [901=Trial of mouse monoclonal Anti-GD-2 antibody 14.G2A plus IL-2 with or without GM-CSF in children with 
refractory NBL or melanoma; 911=I-131-MIBG for therapy of advanced neuroblastoma; 914=A dose escalation study of cisplatin, doxorubicin, VP-16, and ifosfamide followed by GM-CSF in advanced NBL and peripheral neuroepithelioma; 925=Study of topotecan; 935=Study of ch14.18 with GM-CSF in children with NBL and other GD2 positive malignancies immediately post ABMT or PBSC; 937=Phase I trial of ZD1694, an inhibitor of thymidylate synthase, in pediatric patients with advanced neoplastic disease; 9709=A phase I study of fenretinide in children with high risk solid tumors; 321P2=New intensive chemotherapy for CCG stage II (with N-myc amplification), stage III and stage IV neuroblastoma; 321P3=Treatment of poor prognosis neuroblastoma before disease progression with intensive multimodal therapy and BMT; 323P=Cyclic combination chemotherapy for newly diagnosed stage III neuroblastoma age 2 and older and stage IV Nneuroblastoma all ages; 3881=Biology and therapy of good, intermediate, and selected poor prognosis neuroblastoma; 3891=Conventional dose chemoradiotherapy vs ablative chemoradiotherapy with autologous BMT for high-risk neuroblastoma; 3951=Phase I pilot study of multiple cycles of high dose chemotherapy with peripheral blood stem cell infusions in advanced stage neuroblastoma.; 4941=National Wilms tumor study V - therapeutic trial & biology study; 8605=Study of the combination of ifosfamide, mesna, and VP-16 in children and young adults with recurrent sarcomas, PNET and other tumors; 8742=Phase III portion of 8741 for neuroblastoma; 9047=Neuroblastoma biology protocol; 9082=Protocol for the development of intervention strategies to reduce the time between symptom onset and diagnosis of childhood cancer -a pediatric oncology group cancer control study; 9140=Therapy for patients with recurrent or refractory neuroblastoma - a phase II study; 9262=A Phase II study of taxol in children with recurrent/refractory soft-tissue sarcoma, rhabdomyosarcoma, osteosarcoma, Ewing's sarcoma, neuroblastoma, germ cell tumors, Wilms' tumor, hepatoblastoma, and hepatocellular carcinoma, a POG study; 9280=Neuroblastoma epidemiology protocol - A Non-Therapeutic Study - A Joint Project of: The University of North Carolina, The Pediatric Oncology Group and The Children's Cancer Study Group; 9340=Treatment of patients >365 days at diagnosis with stage IV NBL: Upfront Phase II Window - A Phase II Study; 9341=Treatment of patients >365 days at diagnosis with stage IV and stage IIB/III (N-myc) NBL - a phase III study; 9342=Neuroblastoma #5, bone marrow transplant - a phase III study; 9343=Interleukin-6 in children receiving autologous bone marrow transplantation for advanced neuroblastoma - a pediatric oncology group phase I trial; 9361=Topotecan in pediatric patients with recurrent or progressive solid tumors - a pediatric oncology group phase II study; 9375=Topotecan plus cyclophosphamide in children with solid tumors - a pediatric oncology group phase I trial; 9464=Cyclophosphamide plus topotecan in children with recurrent or refractory solid tumors - a pediatric oncology group phase II study; 9640=Treatment of patients with high risk neuroblastoma (a feasibility pilot) using two cycles of marrow ablative chemotherapy followed by rescue With peripheral blood stem cells (PBSC), radiation therapy; A3973=A randomized study of purged vs. 
unpurged PBSC transplant following dose intensive induction therapy for high risk NBL; AADM01P1=Protocol for registration and consent to the childhood cancer research network: a limited institution pilot; AAML00P2=A dose finding study of the safety of gemtuzumab ozogamicin combined with conventional chemotherapy for patients with relapsed or refractory acute myeloid leukemia; ACCL0331=A Randomized double blind placebo controlled clinical trial to assess the efficacy of traumeelĀ® S (IND # 66649) for the prevention and treatment of mucositis in children undergoing hematopoietic stem cell transplantation; ACCRN07=Protocol for the enrollment on the official COG registry, The Childhood Cancer Research Network (CCRN); ADVL0018=Phase I study of hu14.18-IL2 fusion protein in patients with refractory neuroblastoma and other refractory GD2 expressing tumors; ADVL0212=A Phase I study of depsipeptide (NSC#630176, IND# 51810) in pediatric patients with refractory solid tumors and leukemias; ADVL0214=A phase I study of single agent OSI-774 (Tarceva) (NSC # 718781, IND #63383) followed by OSI-774 with temozolomide for patients with selected recurrent/refractory solid tumors, including brain tumors; ADVL0215=A phase I study of decitabine in combination with doxorubicin and cyclophosphamide in the treatment of relapsed or refractory solid tumors; ADVL0421=A phase II study of oxaliplatin in children with recurrent solid tumors; ADVL0524=Phase II trial of ixabepilone (BMS-247550), an epothilone B analog, in children and young adults with refractory solid tumors; ADVL0525=A phase II study of pemetrexed in children with recurrent malignancies; ADVL06B1=A pharmacokinetic-pharmacodynamic-pharmacogenetic study of actinomycin-D and vincristine in children with cancer; ADVL0714=A phase I study of VEGF trap (NSC# 724770, IND# 100137) in children with refractory solid tumors; ALTE03N1=Key adverse events after childhood cancer; ALTE05N1=Umbrella long-term follow-up protocol; ANBL0032=Phase III randomized study of chimeric antibody 14.18 (Ch14.18) in high risk neuroblastoma following myeloablative therapy and autologous stem cell rescue; ANBL00B1=Neuroblastoma biology studies; ANBL00P1=A pilot study of tandem high dose chemotherapy with stem cell rescue following induction therapy in children with high risk neuroblastoma; ANBL02P1=A pilot induction regimen incorporating dose-intensive topotecan and cyclophosphamide for treatment of newly diagnosed high risk neuroblastoma; ANBL0321=Phase II study of fenretinide in pediatric patients with resistant or recurrent neuroblastoma; ANBL0322=A phase II study of hu14.18-IL2 (BB-IND-9728) in children with recurrent or refractory neuroblastoma; ANBL0532=Phase III randomized trial of single vs. 
tandem myeloablative as consolidation therapy for high-risk neuroblastoma; ANBL0621=A phase II study of ABT-751, an orally bioavailable tubulin binding agent, in children with relapsed or refractory neuroblastoma; B003=Diagnostic & prognostic studies in NBL; B903=Childhood cancer genetics; B947=Protocol for collection of biology specimens for research studies; B954=Opsoclonus-myoclonus-ataxia syndrome, neuroblastoma and the presence of anti-neuronal antibodies; B973=Laboratory-clinical studies of neuroblastoma; E04=Self-administered epidemiology questionnaire; E18=A case-control study of risk factors for neuroblastoma; I03=Neuroblastoma, diagnostic/prognostic; N891=Parents' perceptions of randomization; P9462=Randomized treatment of recurrent neuroblastoma with topotecan regimens following desferrioxamine (POG only) in an investigational window; P9641=Primary surgical therapy for biologically defined low-risk neuroblastoma; P9761=A phase II trial of irinotecan in children with refractory solid tumors; P9963=A phase II trial of rebeccamycin analogue (NSC #655649) in children with solid tumors; R9702=Prognostic implications of MIBG uptake in patients with neuroblastoma previously treated on CCG-3891; S31=Right atrial catheter study; S921=Comparison of urokinase vs heparin in preventing Infection in central venous devices in children with malignancies]""" - chunks = "" - is_the_beginning = True - max_chars = 2000 - padding_words = 3 - counter = 0 - print(len(text)) - # divvy up into chunks, sum of each chunk should equal the original text. - for chunk in annotator.sliding_window(text=text, max_characters=max_chars, padding_words= padding_words): - assert len(chunk) <= max_chars - counter += 1 - if is_the_beginning: - chunks += chunk - else: - # remove redundand padded words from final result - chunks += " ".join(chunk.split(" ")[padding_words:]) - is_the_beginning = False - - print(counter) - # since spaces are trimmed by tokenizer , we can execuled all spaces and do char - assert chunks == text \ No newline at end of file diff --git a/tests/unit/test_annotators.py b/tests/unit/test_annotators.py index ddb0986..c1702ee 100644 --- a/tests/unit/test_annotators.py +++ b/tests/unit/test_annotators.py @@ -3,9 +3,9 @@ from attr import field import pytest -from dug.core.annotate import BioLinkPURLerizer +from dug.core.annotators.utils.biolink_purl_util import BioLinkPURLerizer -from tests.unit.mocks.data.test_config import TestConfig +from tests.unit.mocks.data.mock_config import MockConfig from dug.core.annotators import ( DugIdentifier, AnnotateMonarch, @@ -22,55 +22,9 @@ def test_identifier(): assert "PrimaryIdent" == ident_1.id_type -def test_monarch_annotation_full(annotator_api, normalizer_api, synonym_api): - cfg = TestConfig.test_from_env() - normalizer = DefaultNormalizer(**cfg.normalizer) - synonym_finder = DefaultSynonymFinder(**cfg.synonym_service) - annotator = AnnotateMonarch( - normalizer=normalizer, synonym_finder=synonym_finder, config=cfg - ) - input_text = "heart attack" - - text = annotator.preprocess_text(input_text) - - # Fetch identifiers - raw_identifiers: List[DugIdentifier] = annotator.annotate_text(text, annotator_api) - - processed_identifiers: List[DugIdentifier] = [] - for identifier in raw_identifiers: - print(identifier) - output = annotator.normalizer(identifier, normalizer_api) - print(output) - - - # Should be returning normalized identifier for each identifier passed in - if output is None: - output = identifier - # assert isinstance(output, DugIdentifier) - # 
assert output.id == 'UBERON:0007100' - # assert output.label == "primary circulatory organ" - # assert output.equivalent_identifiers == ['UBERON:0007100'] - # assert output.types == 'anatomical entity' - - # Add synonyms to identifier - output.synonyms = annotator.synonym_finder(output.id, synonym_api) - print(output.synonyms) - # Get pURL for ontology identifer for more info - output.purl = BioLinkPURLerizer.get_curie_purl(output.id) - processed_identifiers.append(output) - - # identifiers: List[DugIdentifier] = annotator( - # text, monarch_annotation_session - # ) - print(processed_identifiers[0]) - assert isinstance(processed_identifiers, List[DugIdentifier]) - assert len(processed_identifiers) == 7 - assert isinstance(processed_identifiers[0], DugIdentifier) - - -def test_monarch_annotator(annotator_api): - cfg = TestConfig.test_from_env() +def test_annotator(annotator_api): + cfg = MockConfig.test_from_env() normalizer = DefaultNormalizer(cfg.normalizer) synonym_finder = DefaultSynonymFinder(cfg.synonym_service) diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 99f903d..9237bd7 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -41,6 +41,12 @@ def test_dug_cli_main_extract_dug_elements_none(mock_crawl): assert mock_crawl.called_once() assert not mock_crawl.call_args_list[0].args[0].extract_dug_elements +@mark.cli +@patch('dug.cli.crawl') +def test_dug_cli_main_annotator(mock_crawl): + main(["crawl", "somefile.csv","--parser", "topmedtag", "--annotator", "annotator-monarch"]) + assert mock_crawl.called_once() + @mark.cli @patch('dug.cli.search') def test_dug_cli_main_search(mock_search): diff --git a/tests/unit/test_parsers.py b/tests/unit/test_parsers.py index 0755fed..491bfe9 100644 --- a/tests/unit/test_parsers.py +++ b/tests/unit/test_parsers.py @@ -1,6 +1,6 @@ from dug.core.parsers._base import DugElement, DugConcept -from dug.core.annotate import Identifier as DugIdentifier -from dug.core.annotators.monarch_annotator import AnnotateMonarch +from dug.core.annotators import DugIdentifier, AnnotateMonarch +# from dug.core.annotators.monarch_annotator import AnnotateMonarch def test_dug_concept(): From 2852f2250e4e5cbd1ba9f12dc756e6784fb2dcb1 Mon Sep 17 00:00:00 2001 From: braswent Date: Tue, 31 Oct 2023 15:34:00 -0400 Subject: [PATCH 08/85] feat: --- README.md | 6 +- src/dug/core/__init__.py | 6 +- src/dug/core/annotate.py | 612 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 618 insertions(+), 6 deletions(-) create mode 100644 src/dug/core/annotate.py diff --git a/README.md b/README.md index a992826..c6c368c 100644 --- a/README.md +++ b/README.md @@ -57,13 +57,13 @@ dug crawl tests/integration/data/test_variables_v1.0.csv -p "TOPMedTag" After crawling, you can search: ```shell -dug search -q "heart attack" -t "concepts" -dug search -q "heart attack" -t "variables" -k "concept=MONDO:0005068" +dug search -q "vein" -t "concepts" +dug search -q "vein" -t "variables" -k "concept=UBERON:0001638" ``` You can also query Dug's REST API: ```shell -query="`echo '{"index" : "concepts_index", "query" : "heart attack"}'`" +query="`echo '{"index" : "concepts_index", "query" : "vein"}'`" curl --data "$query" \ --header "Content-Type: application/json" \ diff --git a/src/dug/core/__init__.py b/src/dug/core/__init__.py index 6ce8aa1..9fca7ce 100644 --- a/src/dug/core/__init__.py +++ b/src/dug/core/__init__.py @@ -97,11 +97,11 @@ def search(self, target, query, **kwargs): event_loop = asyncio.get_event_loop() targets = { 'concepts': partial( - 
self._search.search_concepts, index=kwargs.get('index', self.concepts_index)), + self._search.search_concepts), 'variables': partial( - self._search.search_variables, index=kwargs.get('index', self.variables_index), concept=kwargs.pop('concept', None)), + self._search.search_variables, concept=kwargs.pop('concept', None)), 'kg': partial( - self._search.search_kg, index=kwargs.get('index', self.kg_index), unique_id=kwargs.pop('unique_id', None)) + self._search.search_kg, unique_id=kwargs.pop('unique_id', None)) } kwargs.pop('index', None) func = targets.get(target) diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py new file mode 100644 index 0000000..240752f --- /dev/null +++ b/src/dug/core/annotate.py @@ -0,0 +1,612 @@ +# import json +# import logging +# import os +# import re +# import urllib.parse +# from typing import TypeVar, Generic, Union, List, Tuple, Optional +# import bmt +# import requests +# from requests import Session + +# import dug.core.tranql as tql + + +# logger = logging.getLogger('dug') + +# logging.getLogger("requests").setLevel(logging.WARNING) +# logging.getLogger("urllib3").setLevel(logging.WARNING) + + +# class Identifier: +# def __init__(self, id, label, types=None, search_text="", description=""): +# self.id = id +# self.label = label +# self.description = description +# if types is None: +# types = [] +# self.types = types +# self.search_text = [search_text] if search_text else [] +# self.equivalent_identifiers = [] +# self.synonyms = [] +# self.purl = "" + +# @property +# def id_type(self): +# return self.id.split(":")[0] + +# def add_search_text(self, text): +# # Add text only if it's unique and if not empty string +# if text and text not in self.search_text: +# self.search_text.append(text) + +# def get_searchable_dict(self): +# # Return a version of the identifier compatible with what's in ElasticSearch +# es_ident = { +# 'id': self.id, +# 'label': self.label, +# 'equivalent_identifiers': self.equivalent_identifiers, +# 'type': self.types, +# 'synonyms': self.synonyms +# } +# return es_ident + +# def jsonable(self): +# return self.__dict__ + + +# class DugAnnotator: +# def __init__( +# self, +# preprocessor: "Preprocessor", +# annotator: "Annotator", +# normalizer: "Normalizer", +# synonym_finder: "SynonymFinder", +# ontology_greenlist=[], +# ): +# self.preprocessor = preprocessor +# self.annotator = annotator +# self.normalizer = normalizer +# self.synonym_finder = synonym_finder +# self.ontology_greenlist = ontology_greenlist +# self.norm_fails_file = "norm_fails.txt" +# self.anno_fails_file = "anno_fails.txt" + +# def annotate(self, text, http_session): + +# # Preprocess text (debraviate, remove stopwords, etc.) 
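+# # Rough illustration (an assumed example, using the default debreviator defined
+# # further down in this file and an empty stopword list): "bmi_at_baseline" would
+# # become "body mass index at baseline" before it is sent to the annotator.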
+# text = self.preprocessor.preprocess(text) + +# # Fetch identifiers +# raw_identifiers = self.annotator.annotate(text, http_session) + +# # Write out to file if text fails to annotate +# if not raw_identifiers: +# with open(self.anno_fails_file, "a") as fh: +# fh.write(f'{text}\n') + +# processed_identifiers = [] +# for identifier in raw_identifiers: + +# # Normalize identifier using normalization service +# norm_id = self.normalizer.normalize(identifier, http_session) + +# # Skip adding id if it doesn't normalize +# if norm_id is None: +# # Write out to file if identifier doesn't normalize +# with open(self.norm_fails_file, "a") as fh: +# fh.write(f'{identifier.id}\n') + +# # Discard non-normalized ident if not in greenlist +# if identifier.id_type not in self.ontology_greenlist: +# continue + +# # If it is in greenlist just keep moving forward +# norm_id = identifier + +# # Add synonyms to identifier +# norm_id.synonyms = self.synonym_finder.get_synonyms(norm_id.id, http_session) + +# # Get pURL for ontology identifer for more info +# norm_id.purl = BioLinkPURLerizer.get_curie_purl(norm_id.id) +# processed_identifiers.append(norm_id) + +# return processed_identifiers + + +# class ConceptExpander: +# def __init__(self, url, min_tranql_score=0.2): +# self.url = url +# self.min_tranql_score = min_tranql_score +# self.include_node_keys = ["id", "name", "synonyms"] +# self.include_edge_keys = [] +# self.tranql_headers = {"accept": "application/json", "Content-Type": "text/plain"} + +# def is_acceptable_answer(self, answer): +# return True + +# def expand_identifier(self, identifier, query_factory, kg_filename, include_all_attributes=False): + +# answer_kgs = [] + +# # Skip TranQL query if a file exists in the crawlspace exists already, but continue w/ answers +# if os.path.exists(kg_filename): +# logger.info(f"identifier {identifier} is already crawled. Skipping TranQL query.") +# with open(kg_filename, 'r') as stream: +# response = json.load(stream) +# else: +# query = query_factory.get_query(identifier) +# logger.debug(query) +# response = requests.post( +# url=self.url, +# headers=self.tranql_headers, +# data=query).json() + +# # Case: Skip if empty KG +# try: +# if response["message"] == 'Internal Server Error' or len(response["message"]["knowledge_graph"]["nodes"]) == 0: +# logger.debug(f"Did not find a knowledge graph for {query}") +# logger.debug(f"{self.url} returned response: {response}") +# return [] +# except KeyError as e: +# logger.error(f"Could not find key: {e} in response: {response}") + +# # Dump out to file if there's a knowledge graph +# with open(kg_filename, 'w') as stream: +# json.dump(response, stream, indent=2) + +# # Get nodes in knowledge graph hashed by ids for easy lookup +# noMessage = (len(response.get("message",{})) == 0) +# statusError = (response.get("status","") == 'Error') +# if noMessage or statusError: +# # Skip on error +# logger.info(f"Error with identifier: {identifier}, response: {response}, kg_filename: '{kg_filename}'") +# return [] +# kg = tql.QueryKG(response) + +# for answer in kg.answers: +# # Filter out answers that don't meet some criteria +# # Right now just don't filter anything +# logger.debug(f"Answer: {answer}") +# if not self.is_acceptable_answer(answer): +# logger.warning("Skipping answer as it failed one or more acceptance criteria. 
See log for details.") +# continue + +# # Get subgraph containing only information for this answer +# try: +# # Temporarily surround in try/except because sometimes the answer graphs +# # contain invalid references to edges/nodes +# # This will be fixed in Robokop but for now just silently warn if answer is invalid +# node_attributes_filter = None if include_all_attributes else self.include_node_keys +# edge_attributes_filter = None if include_all_attributes else self.include_edge_keys +# answer_kg = kg.get_answer_subgraph(answer, +# include_node_keys=node_attributes_filter, +# include_edge_keys=edge_attributes_filter) + +# # Add subgraph to list of acceptable answers to query +# answer_kgs.append(answer_kg) + +# except tql.MissingNodeReferenceError: +# # TEMPORARY: Skip answers that have invalid node references +# # Need this to be fixed in Robokop +# logger.warning("Skipping answer due to presence of non-preferred id! " +# "See err msg for details.") +# continue +# except tql.MissingEdgeReferenceError: +# # TEMPORARY: Skip answers that have invalid edge references +# # Need this to be fixed in Robokop +# logger.warning("Skipping answer due to presence of invalid edge reference! " +# "See err msg for details.") +# continue + +# return answer_kgs + + +# class Preprocessor: +# """"Class for preprocessing strings so they are better interpreted by NLP steps""" + +# def __init__(self, debreviator=None, stopwords=None): +# if debreviator is None: +# debreviator = self.default_debreviator_factory() +# self.decoder = debreviator + +# if stopwords is None: +# stopwords = [] +# self.stopwords = stopwords + +# def preprocess(self, text: str) -> str: +# """ +# Apply debreviator to replace abbreviations and other characters + +# >>> pp = Preprocessor({"foo": "bar"}, ["baz"]) +# >>> pp.preprocess("Hello foo") +# 'Hello bar' + +# >>> pp.preprocess("Hello baz world") +# 'Hello world' +# """ + +# for key, value in self.decoder.items(): +# text = text.replace(key, value) + +# # Remove any stopwords +# text = " ".join([word for word in text.split() if word not in self.stopwords]) +# return text + +# @staticmethod +# def default_debreviator_factory(): +# return {"bmi": "body mass index", "_": " "} + + +# Input = TypeVar("Input") +# Output = TypeVar("Output") + + +# class ApiClient(Generic[Input, Output]): + +# def make_request(self, value: Input, http_session: Session): +# raise NotImplementedError() + +# def handle_response(self, value, response: Union[dict, list]) -> Output: +# raise NotImplementedError() + +# def __call__(self, value: Input, http_session: Session) -> Output: +# response = self.make_request(value, http_session) + +# result = self.handle_response(value, response) + +# return result + + +# class Annotator(ApiClient[str, List[Identifier]]): +# """ +# Use monarch API service to fetch ontology IDs found in text +# """ + +# def __init__(self, url: str): +# self.url = url + +# def sliding_window(self, text, max_characters=2000, padding_words=5): +# """ +# For long texts sliding window works as the following +# "aaaa bbb ccc ddd eeee" +# with a sliding max chars 8 and padding 1 +# first yeild would be "aaaa bbb" +# next subsequent yeilds "bbb ccc", "ccc ddd" , "ddd eeee" +# allowing context to be preserved with the scope of padding +# For a text of length 7653 , with max_characters 2000 and padding 5 , 4 chunks are yielded. 
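+# A caller can rebuild the original text from the chunks by keeping the first
+# chunk whole and dropping the first `padding_words` words of each later chunk;
+# the sliding-window unit test earlier in this patch series reassembles them that way.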
+# """ +# words = text.split(' ') +# total_words = len(words) +# window_end = False +# current_index = 0 +# while not window_end: +# current_string = "" +# for index, word in enumerate(words[current_index: ]): +# if len(current_string) + len(word) + 1 >= max_characters: +# yield current_string + " " +# current_index += index - padding_words +# break +# appendee = word if index == 0 else " " + word +# current_string += appendee + +# if current_index + index == len(words) - 1: +# window_end = True +# yield current_string + +# def annotate(self, text, http_session): +# logger.debug(f"Annotating: {text}") +# identifiers = [] +# for chunk_text in self.sliding_window(text): +# identifiers += self(chunk_text, http_session) +# return identifiers + +# def make_request(self, value: Input, http_session: Session): +# value = urllib.parse.quote(value) +# url = f'{self.url}{value}' + +# # This could be moved to a config file +# NUM_TRIES = 5 +# for _ in range(NUM_TRIES): +# response = http_session.get(url) +# if response is not None: +# # looks like it worked +# break + +# # if the reponse is still None here, throw an error +# if response is None: +# raise RuntimeError(f"no response from {url}") +# return response.json() + +# def handle_response(self, value, response: dict) -> List[Identifier]: +# identifiers = [] +# """ Parse each identifier and initialize identifier object """ +# for span in response.get('spans', []): +# search_text = span.get('text', None) +# for token in span.get('token', []): +# curie = token.get('id', None) +# if not curie: +# continue + +# biolink_types = token.get('category') +# label = token.get('terms')[0] +# identifiers.append(Identifier(id=curie, +# label=label, +# types=biolink_types, +# search_text=search_text)) +# return identifiers + + +# class Normalizer(ApiClient[Identifier, Identifier]): +# def __init__(self, url): +# self.bl_toolkit = bmt.Toolkit() +# self.url = url + +# def normalize(self, identifier: Identifier, http_session: Session): +# # Use RENCI's normalization API service to get the preferred version of an identifier +# logger.debug(f"Normalizing: {identifier.id}") +# return self(identifier, http_session) + +# def make_request(self, value: Identifier, http_session: Session) -> dict: +# curie = value.id +# url = f"{self.url}{urllib.parse.quote(curie)}" +# try: +# response = http_session.get(url) +# except Exception as get_exc: +# logger.info(f"Error normalizing {value} at {url}") +# logger.error(f"Error {get_exc.__class__.__name__}: {get_exc}") +# return {} +# try: +# normalized = response.json() +# except Exception as json_exc: +# logger.info(f"Error processing response: {response.text} (HTTP {response.status_code})") +# logger.error(f"Error {json_exc.__class__.__name__}: {json_exc}") +# return {} + +# return normalized + +# def handle_response(self, identifier: Identifier, normalized: dict) -> Optional[Identifier]: +# """ Record normalized results. """ +# curie = identifier.id +# normalization = normalized.get(curie, {}) +# if normalization is None: +# logger.info(f"Normalization service did not return normalization for: {curie}") +# return None + +# preferred_id = normalization.get("id", {}) +# equivalent_identifiers = normalization.get("equivalent_identifiers", []) +# biolink_type = normalization.get("type", []) + +# # Return none if there isn't actually a preferred id +# if 'identifier' not in preferred_id: +# logger.debug(f"ERROR: normalize({curie})=>({preferred_id}). 
No identifier?") +# return None + +# logger.debug(f"Preferred id: {preferred_id}") +# identifier.id = preferred_id.get('identifier', '') +# identifier.label = preferred_id.get('label', '') +# identifier.description = preferred_id.get('description', '') +# identifier.equivalent_identifiers = [v['identifier'] for v in equivalent_identifiers] +# try: +# identifier.types = self.bl_toolkit.get_element(biolink_type[0]).name +# except: +# # converts biolink:SmallMolecule to small molecule +# identifier.types = (" ".join(re.split("(?=[A-Z])", biolink_type[0].replace('biolink:', ''))[1:])).lower() +# return identifier + + +# class SynonymFinder(ApiClient[str, List[str]]): + +# def __init__(self, url: str): +# self.url = url + +# def get_synonyms(self, curie: str, http_session): +# ''' +# This function uses the NCATS translator service to return a list of synonyms for +# curie id +# ''' + +# return self(curie, http_session) + +# def make_request(self, curie: str, http_session: Session): +# # Get response from namelookup reverse lookup op +# # example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post) +# url = f"{self.url}" +# payload = { +# 'curies': [curie] +# } +# try: +# response = http_session.post(url, json=payload) +# if str(response.status_code).startswith('4'): +# logger.error(f"No synonyms returned for: `{curie}`. Validation error: {response.text}") +# return {curie: []} +# if str(response.status_code).startswith('5'): +# logger.error(f"No synonyms returned for: `{curie}`. Internal server error from {self.url}. Error: {response.text}") +# return {curie: []} +# return response.json() +# except json.decoder.JSONDecodeError as e: +# logger.error(f"Json parse error for response from `{url}`. Exception: {str(e)}") +# return {curie: []} + +# def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]: +# # Return curie synonyms +# return raw_synonyms.get(curie, []) + + + + + +# class BioLinkPURLerizer: +# # Static class for the sole purpose of doing lookups of different ontology PURLs +# # Is it pretty? No. But it gets the job done. 
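+# # Sketch of how this table is used by get_curie_purl() at the end of the class:
+# # a CURIE such as "MONDO:0005068" is split on ":", its prefix is looked up here,
+# # and the suffix is appended to the mapped base, giving
+# # "http://purl.obolibrary.org/obo/MONDO_0005068"; unknown prefixes return None.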
+# biolink_lookup = {"APO": "http://purl.obolibrary.org/obo/APO_", +# "Aeolus": "http://translator.ncats.nih.gov/Aeolus_", +# "BIOGRID": "http://identifiers.org/biogrid/", +# "BIOSAMPLE": "http://identifiers.org/biosample/", +# "BSPO": "http://purl.obolibrary.org/obo/BSPO_", +# "CAID": "http://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/by_caid?caid=", +# "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_", +# "CHEMBL.COMPOUND": "http://identifiers.org/chembl.compound/", +# "CHEMBL.MECHANISM": "https://www.ebi.ac.uk/chembl/mechanism/inspect/", +# "CHEMBL.TARGET": "http://identifiers.org/chembl.target/", +# "CID": "http://pubchem.ncbi.nlm.nih.gov/compound/", +# "CL": "http://purl.obolibrary.org/obo/CL_", +# "CLINVAR": "http://identifiers.org/clinvar/", +# "CLO": "http://purl.obolibrary.org/obo/CLO_", +# "COAR_RESOURCE": "http://purl.org/coar/resource_type/", +# "CPT": "https://www.ama-assn.org/practice-management/cpt/", +# "CTD": "http://translator.ncats.nih.gov/CTD_", +# "ClinVarVariant": "http://www.ncbi.nlm.nih.gov/clinvar/variation/", +# "DBSNP": "http://identifiers.org/dbsnp/", +# "DGIdb": "https://www.dgidb.org/interaction_types", +# "DOID": "http://purl.obolibrary.org/obo/DOID_", +# "DRUGBANK": "http://identifiers.org/drugbank/", +# "DrugCentral": "http://translator.ncats.nih.gov/DrugCentral_", +# "EC": "http://www.enzyme-database.org/query.php?ec=", +# "ECTO": "http://purl.obolibrary.org/obo/ECTO_", +# "EDAM-DATA": "http://edamontology.org/data_", +# "EDAM-FORMAT": "http://edamontology.org/format_", +# "EDAM-OPERATION": "http://edamontology.org/operation_", +# "EDAM-TOPIC": "http://edamontology.org/topic_", +# "EFO": "http://identifiers.org/efo/", +# "ENSEMBL": "http://identifiers.org/ensembl/", +# "ExO": "http://purl.obolibrary.org/obo/ExO_", +# "FAO": "http://purl.obolibrary.org/obo/FAO_", +# "FB": "http://identifiers.org/fb/", +# "FBcv": "http://purl.obolibrary.org/obo/FBcv_", +# "FlyBase": "http://flybase.org/reports/", +# "GAMMA": "http://translator.renci.org/GAMMA_", +# "GO": "http://purl.obolibrary.org/obo/GO_", +# "GOLD.META": "http://identifiers.org/gold.meta/", +# "GOP": "http://purl.obolibrary.org/obo/go#", +# "GOREL": "http://purl.obolibrary.org/obo/GOREL_", +# "GSID": "https://scholar.google.com/citations?user=", +# "GTEx": "https://www.gtexportal.org/home/gene/", +# "HANCESTRO": "http://www.ebi.ac.uk/ancestro/ancestro_", +# "HCPCS": "http://purl.bioontology.org/ontology/HCPCS/", +# "HGNC": "http://identifiers.org/hgnc/", +# "HGNC.FAMILY": "http://identifiers.org/hgnc.family/", +# "HMDB": "http://identifiers.org/hmdb/", +# "HP": "http://purl.obolibrary.org/obo/HP_", +# "ICD0": "http://translator.ncats.nih.gov/ICD0_", +# "ICD10": "http://translator.ncats.nih.gov/ICD10_", +# "ICD9": "http://translator.ncats.nih.gov/ICD9_", +# "INCHI": "http://identifiers.org/inchi/", +# "INCHIKEY": "http://identifiers.org/inchikey/", +# "INTACT": "http://identifiers.org/intact/", +# "IUPHAR.FAMILY": "http://identifiers.org/iuphar.family/", +# "KEGG": "http://identifiers.org/kegg/", +# "LOINC": "http://loinc.org/rdf/", +# "MEDDRA": "http://identifiers.org/meddra/", +# "MESH": "http://identifiers.org/mesh/", +# "MGI": "http://identifiers.org/mgi/", +# "MI": "http://purl.obolibrary.org/obo/MI_", +# "MIR": "http://identifiers.org/mir/", +# "MONDO": "http://purl.obolibrary.org/obo/MONDO_", +# "MP": "http://purl.obolibrary.org/obo/MP_", +# "MSigDB": "https://www.gsea-msigdb.org/gsea/msigdb/", +# "MetaCyc": "http://translator.ncats.nih.gov/MetaCyc_", +# "NCBIGENE": 
"http://identifiers.org/ncbigene/", +# "NCBITaxon": "http://purl.obolibrary.org/obo/NCBITaxon_", +# "NCIT": "http://purl.obolibrary.org/obo/NCIT_", +# "NDDF": "http://purl.bioontology.org/ontology/NDDF/", +# "NLMID": "https://www.ncbi.nlm.nih.gov/nlmcatalog/?term=", +# "OBAN": "http://purl.org/oban/", +# "OBOREL": "http://purl.obolibrary.org/obo/RO_", +# "OIO": "http://www.geneontology.org/formats/oboInOwl#", +# "OMIM": "http://purl.obolibrary.org/obo/OMIM_", +# "ORCID": "https://orcid.org/", +# "ORPHA": "http://www.orpha.net/ORDO/Orphanet_", +# "ORPHANET": "http://identifiers.org/orphanet/", +# "PANTHER.FAMILY": "http://identifiers.org/panther.family/", +# "PANTHER.PATHWAY": "http://identifiers.org/panther.pathway/", +# "PATO-PROPERTY": "http://purl.obolibrary.org/obo/pato#", +# "PDQ": "https://www.cancer.gov/publications/pdq#", +# "PHARMGKB.DRUG": "http://identifiers.org/pharmgkb.drug/", +# "PHARMGKB.PATHWAYS": "http://identifiers.org/pharmgkb.pathways/", +# "PHAROS": "http://pharos.nih.gov", +# "PMID": "http://www.ncbi.nlm.nih.gov/pubmed/", +# "PO": "http://purl.obolibrary.org/obo/PO_", +# "POMBASE": "http://identifiers.org/pombase/", +# "PR": "http://purl.obolibrary.org/obo/PR_", +# "PUBCHEM.COMPOUND": "http://identifiers.org/pubchem.compound/", +# "PUBCHEM.SUBSTANCE": "http://identifiers.org/pubchem.substance/", +# "PathWhiz": "http://smpdb.ca/pathways/#", +# "REACT": "http://www.reactome.org/PathwayBrowser/#/", +# "REPODB": "http://apps.chiragjpgroup.org/repoDB/", +# "RGD": "http://identifiers.org/rgd/", +# "RHEA": "http://identifiers.org/rhea/", +# "RNACENTRAL": "http://identifiers.org/rnacentral/", +# "RO": "http://purl.obolibrary.org/obo/RO_", +# "RTXKG1": "http://kg1endpoint.rtx.ai/", +# "RXNORM": "http://purl.bioontology.org/ontology/RXNORM/", +# "ResearchID": "https://publons.com/researcher/", +# "SEMMEDDB": "https://skr3.nlm.nih.gov/SemMedDB", +# "SGD": "http://identifiers.org/sgd/", +# "SIO": "http://semanticscience.org/resource/SIO_", +# "SMPDB": "http://identifiers.org/smpdb/", +# "SNOMEDCT": "http://identifiers.org/snomedct/", +# "SNPEFF": "http://translator.ncats.nih.gov/SNPEFF_", +# "ScopusID": "https://www.scopus.com/authid/detail.uri?authorId=", +# "TAXRANK": "http://purl.obolibrary.org/obo/TAXRANK_", +# "UBERGRAPH": "http://translator.renci.org/ubergraph-axioms.ofn#", +# "UBERON": "http://purl.obolibrary.org/obo/UBERON_", +# "UBERON_CORE": "http://purl.obolibrary.org/obo/uberon/core#", +# "UMLS": "http://identifiers.org/umls/", +# "UMLSSC": "https://metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt/code#", +# "UMLSSG": "https://metamap.nlm.nih.gov/Docs/SemGroups_2018.txt/group#", +# "UMLSST": "https://metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt/type#", +# "UNII": "http://identifiers.org/unii/", +# "UPHENO": "http://purl.obolibrary.org/obo/UPHENO_", +# "UniProtKB": "http://identifiers.org/uniprot/", +# "VANDF": "https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/VANDF/", +# "VMC": "https://github.com/ga4gh/vr-spec/", +# "WB": "http://identifiers.org/wb/", +# "WBPhenotype": "http://purl.obolibrary.org/obo/WBPhenotype_", +# "WBVocab": "http://bio2rdf.org/wormbase_vocabulary", +# "WIKIDATA": "https://www.wikidata.org/wiki/", +# "WIKIDATA_PROPERTY": "https://www.wikidata.org/wiki/Property:", +# "WIKIPATHWAYS": "http://identifiers.org/wikipathways/", +# "WormBase": "https://www.wormbase.org/get?name=", +# "ZFIN": "http://identifiers.org/zfin/", +# "ZP": "http://purl.obolibrary.org/obo/ZP_", +# "alliancegenome": "https://www.alliancegenome.org/", +# 
"biolink": "https://w3id.org/biolink/vocab/", +# "biolinkml": "https://w3id.org/biolink/biolinkml/", +# "chembio": "http://translator.ncats.nih.gov/chembio_", +# "dcterms": "http://purl.org/dc/terms/", +# "dictyBase": "http://dictybase.org/gene/", +# "doi": "https://doi.org/", +# "fabio": "http://purl.org/spar/fabio/", +# "foaf": "http://xmlns.com/foaf/0.1/", +# "foodb.compound": "http://foodb.ca/compounds/", +# "gff3": "https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md#", +# "gpi": "https://github.com/geneontology/go-annotation/blob/master/specs/gpad-gpi-2-0.md#", +# "gtpo": "https://rdf.guidetopharmacology.org/ns/gtpo#", +# "hetio": "http://translator.ncats.nih.gov/hetio_", +# "interpro": "https://www.ebi.ac.uk/interpro/entry/", +# "isbn": "https://www.isbn-international.org/identifier/", +# "isni": "https://isni.org/isni/", +# "issn": "https://portal.issn.org/resource/ISSN/", +# "medgen": "https://www.ncbi.nlm.nih.gov/medgen/", +# "oboformat": "http://www.geneontology.org/formats/oboInOWL#", +# "pav": "http://purl.org/pav/", +# "prov": "http://www.w3.org/ns/prov#", +# "qud": "http://qudt.org/1.1/schema/qudt#", +# "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", +# "rdfs": "http://www.w3.org/2000/01/rdf-schema#", +# "skos": "https://www.w3.org/TR/skos-reference/#", +# "wgs": "http://www.w3.org/2003/01/geo/wgs84_pos", +# "xsd": "http://www.w3.org/2001/XMLSchema#", +# "@vocab": "https://w3id.org/biolink/vocab/"} + +# @staticmethod +# def get_curie_purl(curie): +# # Split into prefix and suffix +# suffix = curie.split(":")[1] +# prefix = curie.split(":")[0] + +# # Check to see if the prefix exists in the hash +# if prefix not in BioLinkPURLerizer.biolink_lookup: +# return None + +# return f"{BioLinkPURLerizer.biolink_lookup[prefix]}{suffix}" \ No newline at end of file From ac5c8261cb086b9b9c6588f4be94fc3d1918b8e4 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Thu, 2 Nov 2023 11:06:39 -0400 Subject: [PATCH 09/85] use action vs collection_action --- src/dug/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dug/config.py b/src/dug/config.py index ba050bb..5f4d59d 100644 --- a/src/dug/config.py +++ b/src/dug/config.py @@ -83,7 +83,7 @@ class Config: "desc": "summary", "collection_name": "cde_category", "collection_id": "cde_category", - "collection_action": "files" + "action": "files" } } }) From d687e0db0bbb87fb8ced9ae9b1ec159f01b9bbd0 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Thu, 2 Nov 2023 14:14:52 -0400 Subject: [PATCH 10/85] handle synonyms only take in list of strings --- src/dug/core/tranql.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/dug/core/tranql.py b/src/dug/core/tranql.py index c4c495b..1a417a7 100644 --- a/src/dug/core/tranql.py +++ b/src/dug/core/tranql.py @@ -113,11 +113,14 @@ def get_node_names(self, include_curie=True): return node_names def get_node_synonyms(self, include_curie=True): + # @TODO call name-resolver node_synonyms = [] curie_ids = self.get_curie_ids() for node in self.get_nodes(): if include_curie or node['id'] not in curie_ids: - node_synonyms += node.get('synonyms') or [] + syn = node.get('synonyms') + if isinstance(list, syn): + node_synonyms += syn return node_synonyms def get_curie_ids(self): From 05e65283696c8300a42e5b59b5fd96b42241e3e1 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Mon, 6 Nov 2023 16:32:06 -0500 Subject: [PATCH 11/85] pin linkml --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 
14208fc..55b0296 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,5 +25,6 @@ six==1.16.0 # We use Click 7.0 because that's what one of the pinned packages above use. click httpx>=0.24.1 +linkml==1.6.0 bmt==1.1.0 urllib3>=1.26.17 \ No newline at end of file From a8bcf52df07ee24b804e34f7550fc27dba96052d Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Mon, 6 Nov 2023 16:42:18 -0500 Subject: [PATCH 12/85] pin linkml --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 55b0296..062bcdd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,6 +25,6 @@ six==1.16.0 # We use Click 7.0 because that's what one of the pinned packages above use. click httpx>=0.24.1 -linkml==1.6.0 +linkml-runtime==1.6.0 bmt==1.1.0 urllib3>=1.26.17 \ No newline at end of file From e0625e5d1b85a3e18dcf92eb4281e97b7ae24875 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Mon, 6 Nov 2023 17:29:09 -0500 Subject: [PATCH 13/85] is instance swapped args --- src/dug/core/tranql.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dug/core/tranql.py b/src/dug/core/tranql.py index 1a417a7..4c458a2 100644 --- a/src/dug/core/tranql.py +++ b/src/dug/core/tranql.py @@ -119,7 +119,7 @@ def get_node_synonyms(self, include_curie=True): for node in self.get_nodes(): if include_curie or node['id'] not in curie_ids: syn = node.get('synonyms') - if isinstance(list, syn): + if isinstance(syn,list): node_synonyms += syn return node_synonyms From 93eeb605f59f350757cc4e74fb30cde177000b5c Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 7 Nov 2023 18:11:48 -0500 Subject: [PATCH 14/85] fix list first bug; --- src/dug/core/crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dug/core/crawler.py b/src/dug/core/crawler.py index 1bb64f0..1b5b877 100644 --- a/src/dug/core/crawler.py +++ b/src/dug/core/crawler.py @@ -259,7 +259,7 @@ def expand_to_dug_element(self, for key in attribute_mapping: mapped_value = node.get(attribute_mapping[key], "") # treat all attributes as strings - if key in array_to_string and isinstance(mapped_value, list) and len(mapped_value) > 0: + if attribute_mapping[key] in array_to_string and isinstance(mapped_value, list) and len(mapped_value) > 0: mapped_value = mapped_value[0] element_attribute_args.update({key: mapped_value}) element = DugElement( From bdc5ca95b2c2c53320617834570eaf40a1df320d Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 7 Nov 2023 19:54:38 -0500 Subject: [PATCH 15/85] add total counts to vars unscored --- src/dug/core/async_search.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dug/core/async_search.py b/src/dug/core/async_search.py index 59f60ba..44d7c98 100644 --- a/src/dug/core/async_search.py +++ b/src/dug/core/async_search.py @@ -651,6 +651,7 @@ async def search_vars_unscored(self, concept="", query="", new_results = new_results[data_type] else: new_results = {} + new_results.update({'total_items': total_items['count']}) return new_results async def search_kg(self, unique_id, query, offset=0, size=None, From 341e534e228220ff21cd6664eaae78f482322620 Mon Sep 17 00:00:00 2001 From: Joshua Seals Date: Wed, 22 Nov 2023 15:02:16 -0500 Subject: [PATCH 16/85] Updates and new logic for trivy scan --- .github/workflows/build-push-dev-image.yml | 8 ++++---- .github/workflows/build-push-release.yml | 8 ++++---- .github/workflows/code-checks.yml | 23 ++++++++++++++++------ 3 files changed, 25 insertions(+), 14 deletions(-) diff --git 
a/.github/workflows/build-push-dev-image.yml b/.github/workflows/build-push-dev-image.yml index 24abc61..6951c57 100644 --- a/.github/workflows/build-push-dev-image.yml +++ b/.github/workflows/build-push-dev-image.yml @@ -48,20 +48,20 @@ jobs: # https://github.com/marketplace/actions/build-and-push-docker-images - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 with: driver-opts: | network=host - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} logout: true - name: Login to Container Registry - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: registry: containers.renci.org username: ${{ secrets.CONTAINERHUB_USERNAME }} @@ -72,7 +72,7 @@ jobs: # Notes on Cache: # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache - name: Build Push Container - uses: docker/build-push-action@v4 + uses: docker/build-push-action@v5 with: context: . push: true diff --git a/.github/workflows/build-push-release.yml b/.github/workflows/build-push-release.yml index 06656b6..f23dc15 100644 --- a/.github/workflows/build-push-release.yml +++ b/.github/workflows/build-push-release.yml @@ -63,20 +63,20 @@ jobs: # step # https://github.com/marketplace/actions/build-and-push-docker-images - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 with: driver-opts: | network=host - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} logout: true - name: Login to Container Registry - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: registry: containers.renci.org username: ${{ secrets.CONTAINERHUB_USERNAME }} @@ -86,7 +86,7 @@ jobs: # Notes on Cache: # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache - name: Build Push Container - uses: docker/build-push-action@v4 + uses: docker/build-push-action@v5 with: push: true # Push to renci-registry and dockerhub here. diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 193756d..98acdfd 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -21,8 +21,8 @@ on: paths-ignore: - README.md - .old_cicd/* - # - .github/* - # - .github/workflows/* + - .github/* + - .github/workflows/* - LICENSE - .gitignore - .dockerignore @@ -34,7 +34,6 @@ on: - main types: [ opened, synchronize ] - jobs: ############################## flake8-linter ############################## flake8-linter: @@ -77,13 +76,13 @@ jobs: - uses: actions/checkout@v3 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 with: driver-opts: | network=host - name: Login to DockerHub - uses: docker/login-action@v2 + uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} @@ -92,12 +91,24 @@ jobs: # Notes on Cache: # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache - name: Build Container - uses: docker/build-push-action@v4 + uses: docker/build-push-action@v5 with: context: . 
push: false + load: true + tags: ${{ github.repository }}:vuln-test cache-from: type=registry,ref=${{ github.repository }}:buildcache cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max + + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + with: + image-ref: '${{ github.repository }}:vuln-test' + format: 'table' + exit-code: '1' + ignore-unfixed: true + vuln-type: 'os,library' + severity: 'CRITICAL,HIGH' ################################### PYTEST ################################### pytest: runs-on: ubuntu-latest From 3cf5f8234840a4f1443f6d9d94fc6f158e76c874 Mon Sep 17 00:00:00 2001 From: Joshua Seals Date: Wed, 22 Nov 2023 15:04:56 -0500 Subject: [PATCH 17/85] A few more adjustments to trivy scan logic --- .github/workflows/code-checks.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 98acdfd..b78ba63 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -68,8 +68,8 @@ jobs: # flake8 --ignore=E,W --exit-zero . continue-on-error: true -############################## test-image-build ############################## - test-image-build: +############################## build-vuln-test ############################## + build-vuln-test: # needs: flake8-linter runs-on: ubuntu-latest steps: @@ -104,9 +104,6 @@ jobs: uses: aquasecurity/trivy-action@master with: image-ref: '${{ github.repository }}:vuln-test' - format: 'table' - exit-code: '1' - ignore-unfixed: true vuln-type: 'os,library' severity: 'CRITICAL,HIGH' ################################### PYTEST ################################### From 6fe7f947481d04b8ce650f474c18ea496afd937c Mon Sep 17 00:00:00 2001 From: Joshua Seals Date: Thu, 23 Nov 2023 04:03:53 -0500 Subject: [PATCH 18/85] Introducing trivy with codeql integration for prs and removing a step from code-checks --- .github/workflows/code-checks.yml | 69 ++++++++++++----------------- .github/workflows/trivy-pr-scan.yml | 63 ++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 40 deletions(-) create mode 100644 .github/workflows/trivy-pr-scan.yml diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index b78ba63..d1b1f7b 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -1,11 +1,9 @@ # Workflow responsible for core acceptance testing. # Tests Currently Run: # - flake8-linter -# - image-build-test -# -# This workflow only validates images can build -# but does not push images to any repository. -# +# - PYTest +# - Bandit +# For PR Vulnerability Scanning a separate workflow will run. # The build-push-dev-image and build-push-release workflows # handle the develop and release image storage respectively. # @@ -68,44 +66,35 @@ jobs: # flake8 --ignore=E,W --exit-zero . 
continue-on-error: true -############################## build-vuln-test ############################## - build-vuln-test: - # needs: flake8-linter - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 +# ############################## build-test ############################## +# build-vuln-test: +# # needs: flake8-linter +# runs-on: ubuntu-latest +# steps: +# - uses: actions/checkout@v3 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - with: - driver-opts: | - network=host +# - name: Set up Docker Buildx +# uses: docker/setup-buildx-action@v3 +# with: +# driver-opts: | +# network=host - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - logout: true - - # Notes on Cache: - # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache - - name: Build Container - uses: docker/build-push-action@v5 - with: - context: . - push: false - load: true - tags: ${{ github.repository }}:vuln-test - cache-from: type=registry,ref=${{ github.repository }}:buildcache - cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max +# - name: Login to DockerHub +# uses: docker/login-action@v3 +# with: +# username: ${{ secrets.DOCKERHUB_USERNAME }} +# password: ${{ secrets.DOCKERHUB_TOKEN }} +# logout: true - - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@master - with: - image-ref: '${{ github.repository }}:vuln-test' - vuln-type: 'os,library' - severity: 'CRITICAL,HIGH' +# # Notes on Cache: +# # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache +# - name: Build Container +# uses: docker/build-push-action@v5 +# with: +# context: . +# push: false +# cache-from: type=registry,ref=${{ github.repository }}:buildcache +# cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max ################################### PYTEST ################################### pytest: runs-on: ubuntu-latest diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml new file mode 100644 index 0000000..2b76cc2 --- /dev/null +++ b/.github/workflows/trivy-pr-scan.yml @@ -0,0 +1,63 @@ + +name: trivy-pr-scan +on: + pull_request: + branches: + - develop + - master + - main + types: [ opened, synchronize ] + paths-ignore: + - README.md + - .old_cicd/* + # - .github/* + # - .github/workflows/* + - LICENSE + - .gitignore + - .dockerignore + - .githooks + +jobs: + trivy-pr-scan: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + network=host + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Container + uses: docker/build-push-action@v5 + with: + context: . 
+ push: false + load: true + tags: ${{ github.repository }}:vuln-test + cache-from: type=registry,ref=${{ github.repository }}:buildcache + cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max + + # We will not be concerned with Medium and Low vulnerabilities + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + with: + image-ref: '${{ github.repository }}:vuln-test' + format: 'sarif' + severity: 'CRITICAL,HIGH' + output: 'trivy-results.sarif' + # Scan results should be viewable in GitHub Security Dashboard + - name: Upload Trivy scan results to GitHub Security tab + uses: github/codeql-action/upload-sarif@v2 + with: + sarif_file: 'trivy-results.sarif' \ No newline at end of file From 4bf178d545252f4716f778fbcb05f83510b6ebc0 Mon Sep 17 00:00:00 2001 From: Joshua Seals Date: Thu, 23 Nov 2023 04:26:47 -0500 Subject: [PATCH 19/85] For fidelity --- .github/workflows/code-checks.yml | 65 ++++++++++++++++++------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index d1b1f7b..1033f04 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -19,8 +19,8 @@ on: paths-ignore: - README.md - .old_cicd/* - - .github/* - - .github/workflows/* + # - .github/* + # - .github/workflows/* - LICENSE - .gitignore - .dockerignore @@ -66,35 +66,44 @@ jobs: # flake8 --ignore=E,W --exit-zero . continue-on-error: true -# ############################## build-test ############################## -# build-vuln-test: -# # needs: flake8-linter -# runs-on: ubuntu-latest -# steps: -# - uses: actions/checkout@v3 +# ############################## build-vuln-test ############################## + build-vuln-test: + # needs: flake8-linter + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 -# - name: Set up Docker Buildx -# uses: docker/setup-buildx-action@v3 -# with: -# driver-opts: | -# network=host + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + with: + driver-opts: | + network=host -# - name: Login to DockerHub -# uses: docker/login-action@v3 -# with: -# username: ${{ secrets.DOCKERHUB_USERNAME }} -# password: ${{ secrets.DOCKERHUB_TOKEN }} -# logout: true + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true -# # Notes on Cache: -# # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache -# - name: Build Container -# uses: docker/build-push-action@v5 -# with: -# context: . -# push: false -# cache-from: type=registry,ref=${{ github.repository }}:buildcache -# cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Container + uses: docker/build-push-action@v5 + with: + context: . 
+ push: false + load: true + tag: ${{ github.repository }}:vuln-test + cache-from: type=registry,ref=${{ github.repository }}:buildcache + cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max + ####### Run for Fidelity ###### + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + with: + image-ref: '${{ github.repository }}:vuln-test' + severity: 'CRITICAL,HIGH' + exit-code: '1' ################################### PYTEST ################################### pytest: runs-on: ubuntu-latest From 0313505637e4f5daee2ff8ddce4fc734f589a981 Mon Sep 17 00:00:00 2001 From: Joshua Seals Date: Thu, 23 Nov 2023 04:33:56 -0500 Subject: [PATCH 20/85] Fix on calls to actions --- .github/workflows/code-checks.yml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 1033f04..b4e6088 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -11,11 +11,17 @@ name: Code-Checks on: - push: - branches-ignore: - - master - - main + # push: + # branches-ignore: + # - master + # - main + # - develop + pull_request: + branches: - develop + - master + - main + types: [ opened, synchronize ] paths-ignore: - README.md - .old_cicd/* @@ -25,12 +31,6 @@ on: - .gitignore - .dockerignore - .githooks - pull_request: - branches: - - develop - - master - - main - types: [ opened, synchronize ] jobs: ############################## flake8-linter ############################## From cce9cb9a9197e03d1778c28db57a1be0b36f2fb6 Mon Sep 17 00:00:00 2001 From: Joshua Seals Date: Thu, 23 Nov 2023 05:16:20 -0500 Subject: [PATCH 21/85] Adjusting trivy checks in two files --- .github/workflows/code-checks.yml | 69 +++++++++++++++-------------- .github/workflows/trivy-pr-scan.yml | 4 ++ 2 files changed, 39 insertions(+), 34 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index b4e6088..92b0456 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -67,43 +67,44 @@ jobs: continue-on-error: true # ############################## build-vuln-test ############################## - build-vuln-test: - # needs: flake8-linter - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 + # build-vuln-test: + # # needs: flake8-linter + # runs-on: ubuntu-latest + # steps: + # - uses: actions/checkout@v3 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - with: - driver-opts: | - network=host + # - name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # with: + # driver-opts: | + # network=host - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - logout: true + # - name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_TOKEN }} + # logout: true + + # # Notes on Cache: + # # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + # - name: Build Container + # uses: docker/build-push-action@v5 + # with: + # context: . 
+ # push: false + # load: true + # tag: ${{ github.repository }}:vuln-test + # cache-from: type=registry,ref=${{ github.repository }}:buildcache + # cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max + # ####### Run for Fidelity ###### + # - name: Run Trivy vulnerability scanner + # uses: aquasecurity/trivy-action@master + # with: + # image-ref: '${{ github.repository }}:vuln-test' + # severity: 'CRITICAL,HIGH' + # exit-code: '1' - # Notes on Cache: - # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache - - name: Build Container - uses: docker/build-push-action@v5 - with: - context: . - push: false - load: true - tag: ${{ github.repository }}:vuln-test - cache-from: type=registry,ref=${{ github.repository }}:buildcache - cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max - ####### Run for Fidelity ###### - - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@master - with: - image-ref: '${{ github.repository }}:vuln-test' - severity: 'CRITICAL,HIGH' - exit-code: '1' ################################### PYTEST ################################### pytest: runs-on: ubuntu-latest diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml index 2b76cc2..e27e369 100644 --- a/.github/workflows/trivy-pr-scan.yml +++ b/.github/workflows/trivy-pr-scan.yml @@ -56,8 +56,12 @@ jobs: format: 'sarif' severity: 'CRITICAL,HIGH' output: 'trivy-results.sarif' + exit-code: '1' # Scan results should be viewable in GitHub Security Dashboard + # We still fail the job if results are found, so below will always run + # unless manually canceled. - name: Upload Trivy scan results to GitHub Security tab uses: github/codeql-action/upload-sarif@v2 + if: '!cancelled()' with: sarif_file: 'trivy-results.sarif' \ No newline at end of file From 7def7876af31fcc23d04130c055d2650b4e49843 Mon Sep 17 00:00:00 2001 From: Joshua Seals Date: Thu, 23 Nov 2023 05:41:44 -0500 Subject: [PATCH 22/85] Uncomment ignore-files for action stuff --- .github/workflows/trivy-pr-scan.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml index e27e369..142572d 100644 --- a/.github/workflows/trivy-pr-scan.yml +++ b/.github/workflows/trivy-pr-scan.yml @@ -10,8 +10,8 @@ on: paths-ignore: - README.md - .old_cicd/* - # - .github/* - # - .github/workflows/* + - .github/* + - .github/workflows/* - LICENSE - .gitignore - .dockerignore From 1c9d31e4928f4397170e803f043c7f48b0a6548a Mon Sep 17 00:00:00 2001 From: Joshua Seals Date: Thu, 23 Nov 2023 06:07:54 -0500 Subject: [PATCH 23/85] Update readme --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index a992826..ae31d29 100644 --- a/README.md +++ b/README.md @@ -290,3 +290,8 @@ TOPMed phenotypic concept data is [here](https://github.com/helxplatform/dug/tre ## Release To release, commit the change and select feature. + +#### Fail on Vulnerability Detection + +During PR's several vulnerability scanners are run. If there are vulnerabilities detected, the pr checks will fail and a report will be sent to Github Security Dashboard for viewing. Please ensure the vulnerability is mitigated prior to continuing the merge to protected branches. 
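+
+A rough way to approximate the scan locally (a sketch, assuming the Trivy CLI is installed; the flags mirror the severity and exit-code inputs passed to the Trivy action above, and the image name below is a placeholder for your locally built image):
+
+```shell
+trivy image --severity CRITICAL,HIGH --exit-code 1 <your-dug-image>:<tag>
+```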
+ From b333178aa9fd071ecbc3996e2ede2f0e61e70054 Mon Sep 17 00:00:00 2001 From: braswent Date: Mon, 4 Dec 2023 15:28:16 -0500 Subject: [PATCH 24/85] feat: --- src/dug/core/annotators/__init__.py | 15 +- src/dug/core/annotators/sapbert_annotator.py | 238 +++++++++++++++++++ tests/integration/conftest.py | 160 +++++++++---- tests/integration/test_annotators.py | 105 ++++++-- 4 files changed, 454 insertions(+), 64 deletions(-) create mode 100644 src/dug/core/annotators/sapbert_annotator.py diff --git a/src/dug/core/annotators/__init__.py b/src/dug/core/annotators/__init__.py index a617be8..2838f1c 100644 --- a/src/dug/core/annotators/__init__.py +++ b/src/dug/core/annotators/__init__.py @@ -6,6 +6,7 @@ from dug.config import Config from ._base import DugIdentifier, Indexable, Annotator, DefaultNormalizer, DefaultSynonymFinder from .monarch_annotator import AnnotateMonarch +from .sapbert_annotator import AnnotateSapbert logger = logging.getLogger('dug') @@ -13,7 +14,8 @@ @hookimpl def define_annotators(annotator_dict: Dict[str, Annotator]): - annotator_dict["annotator-monarch"] = build_annotator() + annotator_dict["annotator-monarch"] = build_monarch_annotator() + annotator_dict["annotator-sapbert"] = build_sapbert_annotator() class AnnotatorNotFoundException(Exception): @@ -34,7 +36,7 @@ def get_annotator(hook, annotator_name) -> Annotator: logger.error(err_msg) raise AnnotatorNotFoundException(err_msg) -def build_annotator(): +def build_monarch_annotator(): config = Config.from_env() annotator = AnnotateMonarch( normalizer=DefaultNormalizer(**config.normalizer), @@ -43,3 +45,12 @@ def build_annotator(): ) return annotator + +def build_sapbert_annotator(): + config = Config.from_env() + annotator = AnnotateSapbert( + normalizer=DefaultNormalizer(**config.normalizer), + synonym_finder=DefaultSynonymFinder(**config.synonym_service), + ) + return annotator + diff --git a/src/dug/core/annotators/sapbert_annotator.py b/src/dug/core/annotators/sapbert_annotator.py new file mode 100644 index 0000000..80c05eb --- /dev/null +++ b/src/dug/core/annotators/sapbert_annotator.py @@ -0,0 +1,238 @@ +import logging +from typing import List +from requests import Session +import json + +from ._base import DugIdentifier, Input +from .utils.biolink_purl_util import BioLinkPURLerizer + +logger = logging.getLogger("dug") + +logging.getLogger("requests").setLevel(logging.WARNING) +logging.getLogger("urllib3").setLevel(logging.WARNING) + + +class AnnotateSapbert: + """ + Use the RENCI Sapbert API service to fetch ontology IDs found in text + """ + + def __init__( + self, + normalizer, + synonym_finder, + ontology_greenlist=[], + ): + self.classificationUrl = "https://med-nemo.apps.renci.org/annotate/" + self.annotatorUrl = "https://babel-sapbert.apps.renci.org/annotate/" + self.normalizer = normalizer + self.synonym_finder = synonym_finder + self.ontology_greenlist = ontology_greenlist + self.norm_fails_file = "norm_fails.txt" + self.anno_fails_file = "anno_fails.txt" + + def __call__(self, text, http_session) -> List[DugIdentifier]: + # Fetch identifiers + classifiers: List = self.text_classification(text, http_session) + + raw_identifiers: List[DugIdentifier] = self.annotate_classifiers( + classifiers, http_session + ) + + # Write out to file if text fails to annotate + if not raw_identifiers: + with open(self.anno_fails_file, "a") as fh: + fh.write(f"{text}\n") + + processed_identifiers = [] + for identifier in raw_identifiers: + # Normalize identifier using normalization service + norm_id = 
self.normalizer(identifier, http_session) + + # Skip adding id if it doesn't normalize + if norm_id is None: + # Write out to file if identifier doesn't normalize + with open(self.norm_fails_file, "a") as fh: + fh.write(f"{identifier.id}\n") + + # Discard non-normalized ident if not in greenlist + if identifier.id_type not in self.ontology_greenlist: + continue + + # If it is in greenlist just keep moving forward + norm_id = identifier + + # Add synonyms to identifier + norm_id.synonyms = self.synonym_finder(norm_id.id, http_session) + + # Get pURL for ontology identifer for more info + norm_id.purl = BioLinkPURLerizer.get_curie_purl(norm_id.id) + processed_identifiers.append(norm_id) + + return processed_identifiers + + def text_classification(self, text, http_session) -> List: + """ + Send variable text to a token classifier API and return list of classified terms and biolink types + + Param: + text: String -- Full variable text, API does text preprocessing + + Request: + { + "text": "{{text}}", + "model_name": "token_classification" + } + + Response: List of dicts from which we want to extract the following: + { + "obj": "{{Biolink Classification}}", + "text": "{{Classified Term}}" + } + + Returns: List Dicts each with a Classified Term and Biolink Classification + """ + logger.debug(f"Classification") + response = self.make_classification_request(text, http_session) + classifiers = self.handle_classification_response(response) + return classifiers + + def make_classification_request(self, text: Input, http_session: Session): + url = self.classificationUrl + payload = { + "text": text, + "model_name": "token_classification", + } + # This could be moved to a config file + NUM_TRIES = 5 + for _ in range(NUM_TRIES): + response = http_session.post(url, json=payload) + if response is not None: + # looks like it worked + break + # if the reponse is still None here, throw an error + if response is None: + raise RuntimeError(f"no response from {url}") + if response.status_code == 403: + raise RuntimeError(f"You are not authorized to use this API -- {url}") + return response.json() + + def handle_classification_response(self, response: dict) -> List: + classifiers = [] + """ Parse each identifier and initialize identifier object """ + for denotation in response.get("denotations", []): + text = denotation.get("text", None) + bl_type = denotation.get("obj", None) + classifiers.append( + {"text": text, "bl_type": bl_type.replace("biolink:", "")} + ) + return classifiers + + def annotate_classifiers( + self, classifiers: List, http_session + ) -> List[DugIdentifier]: + """ + Send Classified Terms to Sapbert API + + Param: + List: [ + term_dict: Dict { + "text": String -- Classified term received from token classification API + "bl_type": String -- Biolink Classification + } + ] + + Request: + { + "text": "{{term_dict['text']}}", + "model_name": "sapbert", + "count": {{Limits the number of results}}, + "args": { + "bl_type": "{{ term_dict['bl_type'] -- NOTE omit `biolink:`}}" + } + } + + Response: List of dicts with the following structure: + { + "name": "{{Identified Name}}", + "curie": "{{Curie ID}}", + "category": "{{Biolink term with `biolink:`}}", + "score": "{{Float confidence in the annotation}}" + } + TBD: Organize the results by highest score + Return: List of DugIdentifiers with a Curie ID + """ + identifiers = [] + for term_dict in classifiers: + logger.debug(f"Annotating: {term_dict['text']}") + + response = self.make_annotation_request(term_dict, http_session) + identifiers += 
self.handle_annotation_response(term_dict, response) + + return identifiers + + def make_annotation_request(self, term_dict: Input, http_session: Session): + url = self.annotatorUrl + payload = { + "text": term_dict["text"], + "model_name": "sapbert", + "count": 1000, + "args": {"bl_type": term_dict["bl_type"]}, + } + # This could be moved to a config file + NUM_TRIES = 5 + for _ in range(NUM_TRIES): + response = http_session.post(url, json=payload) + if response is not None: + # looks like it worked + break + # if the reponse is still None here, throw an error + if response is None: + raise RuntimeError(f"no response from {url}") + if response.status_code == 403: + raise RuntimeError(f"You are not authorized to use this API -- {url}") + return response.json() + + def handle_annotation_response(self, value, response: dict) -> List[DugIdentifier]: + identifiers = [] + """ Parse each identifier and initialize identifier object """ + for identifier in response: + search_text = value.get("text", None) + curie = identifier.get("curie", None) + if not curie: + continue + + biolink_type = identifier.get('category') + score = identifier.get("score", None) + label = identifier.get("name") + identifiers.append( + DugIdentifier(id=curie, label=label, types=[biolink_type], search_text=search_text) + ) + return identifiers + +## Testing Purposes +# if __name__ == "__main__": +# from dug.config import Config +# import json +# import redis +# from requests_cache import CachedSession +# from dug.core.annotators._base import DefaultNormalizer, DefaultSynonymFinder + +# config = Config.from_env() +# annotator = AnnotateSapbert( +# normalizer=DefaultNormalizer(**config.normalizer), +# synonym_finder=DefaultSynonymFinder(**config.synonym_service), +# ) + +# redis_config = { +# "host": "localhost", +# "port": config.redis_port, +# "password": config.redis_password, +# } + +# http_sesh = CachedSession( +# cache_name="annotator", +# backend="redis", +# connection=redis.StrictRedis(**redis_config), +# ) +# annotator(text="Have you ever had a heart attack?", http_session=http_sesh) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 1e66644..15392b4 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -6,7 +6,8 @@ from typing import Dict import pytest -TEST_DATA_DIR = Path(__file__).parent.resolve() / 'data' + +TEST_DATA_DIR = Path(__file__).parent.resolve() / "data" @dataclass @@ -32,7 +33,7 @@ def get(self, url, params: dict = None): if text is None: return MockResponse(text="{}", status_code=404) return MockResponse(text, status_code=status_code) - + def post(self, url, params: dict = None, json: dict = {}): if params: qstr = urllib.parse.urlencode(params, quote_via=urllib.parse.quote) @@ -43,6 +44,7 @@ def post(self, url, params: dict = None, json: dict = {}): return MockResponse(text="{}", status_code=404) return MockResponse(text, status_code=status_code) + @pytest.fixture def monarch_annotator_api(): base_url = "http://annotator.api/?content={query}" @@ -91,6 +93,58 @@ def _(keyword): urls=urls, ) + +@pytest.fixture +def token_classifier_api(): + return MockApiService( + urls={ + "https://med-nemo.apps.renci.org/annotate/": [ + json.dumps( + { + "text": "Have you ever had a heart attack?", + "denotations": [ + { + "id": "I5-", + "span": {"begin": 20, "end": 32}, + "obj": "biolink:Disease", + "text": "heart attack", + } + ], + } + ), + 200, + ] + } + ) + + +@pytest.fixture +def sapbert_annotator_api(): + return MockApiService( + urls={ + 
"https://babel-sapbert.apps.renci.org/annotate/": [ + json.dumps( + [ + { + "name": "attack; cardiovascular", + "curie": "UBERON:0007100", + "category": "biolink:Disease", + "score": "0.15857231617", + }, + { + "name": "Angina attack", + "curie": "XAO:0000336", + "category": "biolink:Disease", + "score": "0.206502258778", + } + ] + ), + 200, + ] + } + ) + + @pytest.fixture def normalizer_api(): base_url = "http://normalizer.api/?curie={curie}" @@ -101,35 +155,39 @@ def _(curie): ) urls = { - _("UBERON:0007100"): [json.dumps( - { - "UBERON:0007100": { - "id": { - "identifier": "UBERON:0007100", - "label": "primary circulatory organ" - }, - "equivalent_identifiers": [ - { + _("UBERON:0007100"): [ + json.dumps( + { + "UBERON:0007100": { + "id": { "identifier": "UBERON:0007100", - "label": "primary circulatory organ" - } - ], - "type": [ - "biolink:AnatomicalEntity", - "biolink:OrganismalEntity", - "biolink:BiologicalEntity", - "biolink:NamedThing", - "biolink:Entity" - ] - } - }, - ), 200], - + "label": "primary circulatory organ", + }, + "equivalent_identifiers": [ + { + "identifier": "UBERON:0007100", + "label": "primary circulatory organ", + } + ], + "type": [ + "biolink:AnatomicalEntity", + "biolink:OrganismalEntity", + "biolink:BiologicalEntity", + "biolink:NamedThing", + "biolink:Entity", + ], + } + }, + ), + 200, + ], } return MockApiService( urls=urls, ) + + @pytest.fixture def null_normalizer_api(): base_url = "http://normalizer.api/?curie={curie}" @@ -140,36 +198,42 @@ def _(curie): ) urls = { - _("XAO:0000336"): [json.dumps( - { - "XAO:0000336": None - }, - ), 200], - + _("XAO:0000336"): [ + json.dumps( + {"XAO:0000336": None}, + ), + 200, + ], } return MockApiService( urls=urls, ) + @pytest.fixture -def synonym_api(): - return MockApiService(urls={ - "http://synonyms.api": [json.dumps({ - "UBERON:0007100": [ - "primary circulatory organ", - "dorsal tube", - "adult heart", - "heart" +def synonym_api(): + return MockApiService( + urls={ + "http://synonyms.api": [ + json.dumps( + { + "UBERON:0007100": [ + "primary circulatory organ", + "dorsal tube", + "adult heart", + "heart", + ] + } + ), + 200, ] - }), 200] - }) + } + ) + @pytest.fixture -def null_synonym_api(): - return MockApiService(urls={ - "http://synonyms.api": [json.dumps({ - "XAO:0000336": [ - ] - }), 200] - }) \ No newline at end of file +def null_synonym_api(): + return MockApiService( + urls={"http://synonyms.api": [json.dumps({"XAO:0000336": []}), 200]} + ) diff --git a/tests/integration/test_annotators.py b/tests/integration/test_annotators.py index d8f5c45..8004d0d 100644 --- a/tests/integration/test_annotators.py +++ b/tests/integration/test_annotators.py @@ -11,10 +11,18 @@ DugIdentifier, AnnotateMonarch, DefaultNormalizer, - DefaultSynonymFinder + DefaultSynonymFinder, + AnnotateSapbert, ) -def test_monarch_annotation_full(monarch_annotator_api, normalizer_api, null_normalizer_api, synonym_api, null_synonym_api): + +def test_monarch_annotation_full( + monarch_annotator_api, + normalizer_api, + null_normalizer_api, + synonym_api, + null_synonym_api, +): cfg = MockConfig.test_from_env() normalizer = DefaultNormalizer(**cfg.normalizer) synonym_finder = DefaultSynonymFinder(**cfg.synonym_service) @@ -27,19 +35,21 @@ def test_monarch_annotation_full(monarch_annotator_api, normalizer_api, null_nor text = annotator.preprocess_text(input_text) # Fetch identifiers - raw_identifiers: List[DugIdentifier] = annotator.annotate_text(text, monarch_annotator_api) + raw_identifiers: List[DugIdentifier] = annotator.annotate_text( + 
text, monarch_annotator_api + ) processed_identifiers: List[DugIdentifier] = [] for identifier in raw_identifiers: if identifier.id == "UBERON:0007100": # Perform normal normalization output = annotator.normalizer(identifier, normalizer_api) - + assert isinstance(output, DugIdentifier) - assert output.id == 'UBERON:0007100' + assert output.id == "UBERON:0007100" assert output.label == "primary circulatory organ" - assert output.equivalent_identifiers == ['UBERON:0007100'] - assert output.types == 'anatomical entity' + assert output.equivalent_identifiers == ["UBERON:0007100"] + assert output.types == "anatomical entity" else: # act as if this is null output = annotator.normalizer(identifier, null_normalizer_api) @@ -48,17 +58,18 @@ def test_monarch_annotation_full(monarch_annotator_api, normalizer_api, null_nor if output is None: output = identifier # Test normalizer when null - assert output.id == 'XAO:0000336' + assert output.id == "XAO:0000336" assert output.label == "heart primordium" # Add synonyms to identifier if identifier.id == "UBERON:0007100": output.synonyms = annotator.synonym_finder(output.id, synonym_api) + print(output.synonyms) assert output.synonyms == [ - "primary circulatory organ", - "dorsal tube", - "adult heart", - "heart" + "primary circulatory organ", + "dorsal tube", + "adult heart", + "heart", ] else: output.synonyms = annotator.synonym_finder(output.id, null_synonym_api) @@ -66,7 +77,73 @@ def test_monarch_annotation_full(monarch_annotator_api, normalizer_api, null_nor # Get pURL for ontology identifer for more info output.purl = BioLinkPURLerizer.get_curie_purl(output.id) processed_identifiers.append(output) - + + assert isinstance(processed_identifiers, List) + assert len(processed_identifiers) == 2 + assert isinstance(processed_identifiers[0], DugIdentifier) + + +def test_sapbert_annotation_full( + token_classifier_api, + sapbert_annotator_api, + normalizer_api, + null_normalizer_api, + synonym_api, + null_synonym_api, +): + cfg = MockConfig.test_from_env() + normalizer = DefaultNormalizer(**cfg.normalizer) + synonym_finder = DefaultSynonymFinder(**cfg.synonym_service) + + annotator = AnnotateSapbert(normalizer=normalizer, synonym_finder=synonym_finder) + input_text = "Have you ever had a heart attack?" 
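+    # The SapBERT path exercised below is two-staged: text_classification() first
+    # posts the raw text to the (mocked) token-classification service and returns
+    # biolink-typed spans, then annotate_classifiers() resolves each span to CURIE
+    # identifiers via the (mocked) SapBERT service; those identifiers are then
+    # normalized and enriched with synonyms, mirroring the Monarch flow tested above.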
+ + # Fetch Classifiers + classifiers: List = annotator.text_classification(input_text, token_classifier_api) + + # Fetch identifiers + raw_identifiers: List[DugIdentifier] = annotator.annotate_classifiers( + classifiers, sapbert_annotator_api + ) + processed_identifiers: List[DugIdentifier] = [] + for identifier in raw_identifiers: + if identifier.id == "UBERON:0007100": + # Perform normal normalization + output = annotator.normalizer(identifier, normalizer_api) + print(output) + + assert isinstance(output, DugIdentifier) + assert output.id == "UBERON:0007100" + assert output.label == "primary circulatory organ" + assert output.equivalent_identifiers == ["UBERON:0007100"] + assert output.types == "anatomical entity" + else: + # act as if this is null + output = annotator.normalizer(identifier, null_normalizer_api) + + # Should be returning normalized identifier for each identifier passed in + if output is None: + output = identifier + # Test normalizer when null + assert output.id == "XAO:0000336" + assert output.label == "Angina attack" + + # Add synonyms to identifier + if identifier.id == "UBERON:0007100": + output.synonyms = annotator.synonym_finder(output.id, synonym_api) + assert output.synonyms == [ + "primary circulatory organ", + "dorsal tube", + "adult heart", + "heart", + ] + else: + output.synonyms = annotator.synonym_finder(output.id, null_synonym_api) + assert output.synonyms == [] + # Get pURL for ontology identifer for more info + output.purl = BioLinkPURLerizer.get_curie_purl(output.id) + processed_identifiers.append(output) + assert isinstance(processed_identifiers, List) assert len(processed_identifiers) == 2 - assert isinstance(processed_identifiers[0], DugIdentifier) \ No newline at end of file + assert isinstance(processed_identifiers[0], DugIdentifier) From ce35cd30206b352f4aea3eb99953e08b3201e72a Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Wed, 13 Dec 2023 14:15:49 -0500 Subject: [PATCH 25/85] fix response from nn --- src/dug/core/annotate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py index bbf766b..79686de 100644 --- a/src/dug/core/annotate.py +++ b/src/dug/core/annotate.py @@ -429,7 +429,7 @@ def make_request(self, curie: str, http_session: Session): def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]: # Return curie synonyms - return raw_synonyms.get(curie, []) + return raw_synonyms.get(curie, {}).get('names', []) From 149be9fb7a6c62f94c95c476c4ca839437905727 Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Wed, 13 Dec 2023 14:58:12 -0500 Subject: [PATCH 26/85] norm returned values from make_request --- src/dug/core/annotate.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py index 79686de..59dd379 100644 --- a/src/dug/core/annotate.py +++ b/src/dug/core/annotate.py @@ -418,14 +418,14 @@ def make_request(self, curie: str, http_session: Session): response = http_session.post(url, json=payload) if str(response.status_code).startswith('4'): logger.error(f"No synonyms returned for: `{curie}`. Validation error: {response.text}") - return {curie: []} + return {curie: {"names": []}} if str(response.status_code).startswith('5'): logger.error(f"No synonyms returned for: `{curie}`. Internal server error from {self.url}. 
Error: {response.text}") - return {curie: []} + return {curie: {"names": []}} return response.json() except json.decoder.JSONDecodeError as e: logger.error(f"Json parse error for response from `{url}`. Exception: {str(e)}") - return {curie: []} + return {curie: {"names": []}} def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]: # Return curie synonyms From 59801124cba4364c9943371514a7063658540b84 Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Wed, 13 Dec 2023 18:50:47 -0500 Subject: [PATCH 27/85] Short circuit the integration test if dummy configuration is detected --- tests/integration/test_async_search.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_async_search.py b/tests/integration/test_async_search.py index 0ce6cb5..8e0a65c 100644 --- a/tests/integration/test_async_search.py +++ b/tests/integration/test_async_search.py @@ -5,12 +5,21 @@ from fastapi.testclient import TestClient from elasticsearch.exceptions import ConnectionError +from dug.config import Config + class APISearchTestCase(TestCase): "API search with mocked elasticsearch" def test_concepts_types_parameter(self): "Test API concepts search with types parameter" - # This should patch the elasticsearch object with the mock + cfg = Config.from_env() + if cfg.elastic_password == "changeme": + # Dummy config is in place, skip the test + self.skipTest( + "For the integration test, a populated elasticsearch " + "instance must be available and configured in the " + "environment variables. See dug.config for more.") + from dug.server import APP client = TestClient(APP) types = ['anatomical entity', 'drug'] From de8610deec70e09ade94a9b145d5d54153ed05cc Mon Sep 17 00:00:00 2001 From: braswent Date: Thu, 14 Dec 2023 13:11:14 -0500 Subject: [PATCH 28/85] feat: --- requirements.txt | 2 +- setup.cfg | 2 +- src/dug/core/annotators/_base.py | 8 ++++---- src/dug/core/annotators/sapbert_annotator.py | 5 +++++ 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/requirements.txt b/requirements.txt index 14208fc..f602432 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,7 +17,7 @@ pytz==2021.1 PyYAML==6.0 requests==2.31.0 # old redis==4.4.2 -redis==4.5.1 +redis==4.5.4 requests-cache==0.9.8 six==1.16.0 diff --git a/setup.cfg b/setup.cfg index cab748f..b470aef 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,7 +24,7 @@ install_requires = pluggy requests requests_cache==0.9.8 - redis==4.5.1 + redis==4.5.4 [options.entry_points] console_scripts = diff --git a/src/dug/core/annotators/_base.py b/src/dug/core/annotators/_base.py index 645060f..c725bff 100644 --- a/src/dug/core/annotators/_base.py +++ b/src/dug/core/annotators/_base.py @@ -188,22 +188,22 @@ def make_request(self, curie: str, http_session: Session): logger.error( f"No synonyms returned for: `{curie}`. Validation error: {response.text}" ) - return {curie: []} + return {curie: {"names": []}} if str(response.status_code).startswith("5"): logger.error( f"No synonyms returned for: `{curie}`. Internal server error from {self.url}. Error: {response.text}" ) - return {curie: []} + return {curie: {"names": []}} return response.json() except json.decoder.JSONDecodeError as e: logger.error( f"Json parse error for response from `{url}`. 
Exception: {str(e)}" ) - return {curie: []} + return {curie: {"names": []}} def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]: # Return curie synonyms - return raw_synonyms.get(curie, []) + return raw_synonyms.get(curie, {}).get('names', []) Indexable = Union[DugIdentifier, AnnotatorSession] diff --git a/src/dug/core/annotators/sapbert_annotator.py b/src/dug/core/annotators/sapbert_annotator.py index 80c05eb..73eefe9 100644 --- a/src/dug/core/annotators/sapbert_annotator.py +++ b/src/dug/core/annotators/sapbert_annotator.py @@ -99,6 +99,7 @@ def text_classification(self, text, http_session) -> List: def make_classification_request(self, text: Input, http_session: Session): url = self.classificationUrl + logger.debug(f"response from {text}") payload = { "text": text, "model_name": "token_classification", @@ -115,6 +116,8 @@ def make_classification_request(self, text: Input, http_session: Session): raise RuntimeError(f"no response from {url}") if response.status_code == 403: raise RuntimeError(f"You are not authorized to use this API -- {url}") + if response.status_code == 500: + raise RuntimeError(f"Classification API is temporarily down -- vist docs here: {url.replace('annotate', 'docs')}") return response.json() def handle_classification_response(self, response: dict) -> List: @@ -191,6 +194,8 @@ def make_annotation_request(self, term_dict: Input, http_session: Session): raise RuntimeError(f"no response from {url}") if response.status_code == 403: raise RuntimeError(f"You are not authorized to use this API -- {url}") + if response.status_code == 500: + raise RuntimeError(f"Annotation API is temporarily down -- vist docs here: {url.replace('annotate', 'docs')}") return response.json() def handle_annotation_response(self, value, response: dict) -> List[DugIdentifier]: From 24b34e9331e237e55540b58b5aeee60c3ed5cd8e Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 13:34:37 -0500 Subject: [PATCH 29/85] correcting jsonable to recursively serialize sub objects --- src/dug/core/parsers/_base.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index acfc5bb..57184ae 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -1,5 +1,6 @@ import json from typing import Union, Callable, Any, Iterable +import copy from dug.core.loaders import InputFile @@ -29,7 +30,11 @@ def add_concept(self, concept): self.concepts[concept.id] = concept def jsonable(self): - return self.__dict__ + copy_dict = copy(self.__dict__) + concepts = {k: v.jsonable() for k, v in concepts.items} + copy_dict['concepts'] = concepts + return copy_dict + def get_searchable_dict(self): # Translate DugElement to ES-style dict @@ -132,7 +137,10 @@ def get_searchable_dict(self): return es_conc def jsonable(self): - return self.__dict__ + copy_dict = copy(self.__dict__) + identifiers = {k: v.jsonable() for k, v in self.identifiers.items()} + copy_dict['identifiers'] = identifiers + return copy_dict def __str__(self): return json.dumps(self.__dict__, indent=2, default=utils.complex_handler) From 785b789ee698c5697112986056423b574dd8e23f Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 13:52:22 -0500 Subject: [PATCH 30/85] correcting jsonable to recursively serialize sub objects --- src/dug/core/parsers/_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index 57184ae..a67c6cd 100644 
--- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -30,7 +30,7 @@ def add_concept(self, concept): self.concepts[concept.id] = concept def jsonable(self): - copy_dict = copy(self.__dict__) + copy_dict = copy.deepcopy(self.__dict__) concepts = {k: v.jsonable() for k, v in concepts.items} copy_dict['concepts'] = concepts return copy_dict @@ -137,7 +137,7 @@ def get_searchable_dict(self): return es_conc def jsonable(self): - copy_dict = copy(self.__dict__) + copy_dict = copy.deepcopy(self.__dict__) identifiers = {k: v.jsonable() for k, v in self.identifiers.items()} copy_dict['identifiers'] = identifiers return copy_dict From fec990a8278df2b4087bb2bc35309ff04bb880ac Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 13:58:20 -0500 Subject: [PATCH 31/85] correcting jsonable to recursively serialize sub objects --- src/dug/core/parsers/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index a67c6cd..46083e7 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -31,7 +31,7 @@ def add_concept(self, concept): def jsonable(self): copy_dict = copy.deepcopy(self.__dict__) - concepts = {k: v.jsonable() for k, v in concepts.items} + concepts = {k: v.jsonable() for k, v in self.concepts.items} copy_dict['concepts'] = concepts return copy_dict From edfff4f9eb9e42e7fce4f6157782546a8b5b608b Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 14:10:21 -0500 Subject: [PATCH 32/85] correcting jsonable to recursively serialize sub objects --- src/dug/core/parsers/_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index 46083e7..231608e 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -31,7 +31,7 @@ def add_concept(self, concept): def jsonable(self): copy_dict = copy.deepcopy(self.__dict__) - concepts = {k: v.jsonable() for k, v in self.concepts.items} + concepts = {k: v.jsonable() for k, v in self.concepts.items()} copy_dict['concepts'] = concepts return copy_dict From 2cac291c4d91172dc8be71a56a8d9fbd09df25fc Mon Sep 17 00:00:00 2001 From: braswent Date: Tue, 19 Dec 2023 15:10:42 -0500 Subject: [PATCH 33/85] feat: --- tests/integration/conftest.py | 18 +- tests/unit/conftest.py | 332 +++++++++++++--------------- tests/unit/test_core/test_search.py | 82 ++++--- 3 files changed, 199 insertions(+), 233 deletions(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 15392b4..7bc0bcf 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -136,7 +136,7 @@ def sapbert_annotator_api(): "curie": "XAO:0000336", "category": "biolink:Disease", "score": "0.206502258778", - } + }, ] ), 200, @@ -218,12 +218,14 @@ def synonym_api(): "http://synonyms.api": [ json.dumps( { - "UBERON:0007100": [ - "primary circulatory organ", - "dorsal tube", - "adult heart", - "heart", - ] + "UBERON:0007100": { + "names": [ + "primary circulatory organ", + "dorsal tube", + "adult heart", + "heart", + ] + } } ), 200, @@ -235,5 +237,5 @@ def synonym_api(): @pytest.fixture def null_synonym_api(): return MockApiService( - urls={"http://synonyms.api": [json.dumps({"XAO:0000336": []}), 200]} + urls={"http://synonyms.api": [json.dumps({"XAO:0000336": {"names":[]}}), 200]} ) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index e1b63d9..f40d4f6 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py 
@@ -29,7 +29,7 @@ def get(self, url, params: dict = None): if text is None: return MockResponse(text="{}", status_code=404) return MockResponse(text, status_code=status_code) - + def post(self, url, params: dict = None, json: dict = {}): if params: qstr = urllib.parse.urlencode(params, quote_via=urllib.parse.quote) @@ -46,129 +46,103 @@ def annotator_api(): base_url = "http://annotator.api/?content={query}" def _(keyword): - return base_url.format( - query=urllib.parse.quote(keyword) - ) + return base_url.format(query=urllib.parse.quote(keyword)) urls = { - _("heart attack"): [json.dumps({ - "content": "heart attack", - "spans": [ + _("heart attack"): [ + json.dumps( { - "start": 0, - "end": 5, - "text": "heart", - "token": [ + "content": "heart attack", + "spans": [ { - "id": "UBERON:0015230", - "category": [ - "anatomical entity" + "start": 0, + "end": 5, + "text": "heart", + "token": [ + { + "id": "UBERON:0015230", + "category": ["anatomical entity"], + "terms": ["dorsal vessel heart"], + } ], - "terms": [ - "dorsal vessel heart" - ] - } - ] - }, - { - "start": 0, - "end": 5, - "text": "heart", - "token": [ + }, { - "id": "UBERON:0007100", - "category": [ - "anatomical entity" + "start": 0, + "end": 5, + "text": "heart", + "token": [ + { + "id": "UBERON:0007100", + "category": ["anatomical entity"], + "terms": ["primary circulatory organ"], + } ], - "terms": [ - "primary circulatory organ" - ] - } - ] - }, - { - "start": 0, - "end": 5, - "text": "heart", - "token": [ + }, { - "id": "UBERON:0015228", - "category": [ - "anatomical entity" + "start": 0, + "end": 5, + "text": "heart", + "token": [ + { + "id": "UBERON:0015228", + "category": ["anatomical entity"], + "terms": ["circulatory organ"], + } ], - "terms": [ - "circulatory organ" - ] - } - ] - }, - { - "start": 0, - "end": 5, - "text": "heart", - "token": [ + }, { - "id": "ZFA:0000114", - "category": [ - "anatomical entity" + "start": 0, + "end": 5, + "text": "heart", + "token": [ + { + "id": "ZFA:0000114", + "category": ["anatomical entity"], + "terms": ["heart"], + } ], - "terms": [ - "heart" - ] - } - ] - }, - { - "start": 0, - "end": 5, - "text": "heart", - "token": [ + }, { - "id": "UBERON:0000948", - "category": [ - "anatomical entity" + "start": 0, + "end": 5, + "text": "heart", + "token": [ + { + "id": "UBERON:0000948", + "category": ["anatomical entity"], + "terms": ["heart"], + } ], - "terms": [ - "heart" - ] - } - ] - }, - { - "start": 0, - "end": 12, - "text": "heart attack", - "token": [ + }, { - "id": "MONDO:0005068", - "category": [ - "disease" + "start": 0, + "end": 12, + "text": "heart attack", + "token": [ + { + "id": "MONDO:0005068", + "category": ["disease"], + "terms": ["myocardial infarction (disease)"], + } ], - "terms": [ - "myocardial infarction (disease)" - ] - } - ] - }, - { - "start": 0, - "end": 12, - "text": "heart attack", - "token": [ + }, { - "id": "HP:0001658", - "category": [ - "phenotype", - "quality" + "start": 0, + "end": 12, + "text": "heart attack", + "token": [ + { + "id": "HP:0001658", + "category": ["phenotype", "quality"], + "terms": ["Myocardial infarction"], + } ], - "terms": [ - "Myocardial infarction" - ] - } - ] + }, + ], } - ] - }), 200], + ), + 200, + ], } return MockApiService( @@ -186,30 +160,32 @@ def _(curie): ) urls = { - _("UBERON:0007100"): [json.dumps( - { - "UBERON:0007100": { - "id": { - "identifier": "UBERON:0007100", - "label": "primary circulatory organ" - }, - "equivalent_identifiers": [ - { + _("UBERON:0007100"): [ + json.dumps( + { + "UBERON:0007100": { + "id": { 
"identifier": "UBERON:0007100", - "label": "primary circulatory organ" - } - ], - "type": [ - "biolink:AnatomicalEntity", - "biolink:OrganismalEntity", - "biolink:BiologicalEntity", - "biolink:NamedThing", - "biolink:Entity" - ] - } - }, - ), 200], - + "label": "primary circulatory organ", + }, + "equivalent_identifiers": [ + { + "identifier": "UBERON:0007100", + "label": "primary circulatory organ", + } + ], + "type": [ + "biolink:AnatomicalEntity", + "biolink:OrganismalEntity", + "biolink:BiologicalEntity", + "biolink:NamedThing", + "biolink:Entity", + ], + } + }, + ), + 200, + ], } return MockApiService( @@ -218,17 +194,26 @@ def _(curie): @pytest.fixture -def synonym_api(): - return MockApiService(urls={ - "http://synonyms.api": [json.dumps({ - "UBERON:0007100": [ - "primary circulatory organ", - "dorsal tube", - "adult heart", - "heart" +def synonym_api(): + return MockApiService( + urls={ + "http://synonyms.api": [ + json.dumps( + { + "UBERON:0007100": { + "names": [ + "primary circulatory organ", + "dorsal tube", + "adult heart", + "heart", + ] + } + } + ), + 200, ] - }), 200] - }) + } + ) @pytest.fixture() @@ -240,48 +225,31 @@ def _(curie): curie=urllib.parse.quote(curie), ) - return MockApiService(urls={ - _("UBERON:0007100"): [json.dumps( - { - "taxon": { - "id": None, - "label": None - }, - "association_counts": None, - "xrefs": [ - "SPD:0000130", - "FBbt:00003154", - "TADS:0000147" - ], - "description": "A hollow, muscular organ, which, by contracting rhythmically, keeps up the circulation of the blood or analogs[GO,modified].", - "types": None, - "synonyms": [ - { - "val": "dorsal tube", - "pred": "synonym", - "xrefs": None - }, - { - "val": "adult heart", - "pred": "synonym", - "xrefs": None - }, + return MockApiService( + urls={ + _("UBERON:0007100"): [ + json.dumps( { - "val": "heart", - "pred": "synonym", - "xrefs": None + "taxon": {"id": None, "label": None}, + "association_counts": None, + "xrefs": ["SPD:0000130", "FBbt:00003154", "TADS:0000147"], + "description": "A hollow, muscular organ, which, by contracting rhythmically, keeps up the circulation of the blood or analogs[GO,modified].", + "types": None, + "synonyms": [ + {"val": "dorsal tube", "pred": "synonym", "xrefs": None}, + {"val": "adult heart", "pred": "synonym", "xrefs": None}, + {"val": "heart", "pred": "synonym", "xrefs": None}, + ], + "deprecated": None, + "replaced_by": None, + "consider": None, + "id": "UBERON:0007100", + "label": "primary circulatory organ", + "iri": "http://purl.obolibrary.org/obo/UBERON_0007100", + "category": ["anatomical entity"], } - ], - "deprecated": None, - "replaced_by": None, - "consider": None, - "id": "UBERON:0007100", - "label": "primary circulatory organ", - "iri": "http://purl.obolibrary.org/obo/UBERON_0007100", - "category": [ - "anatomical entity" - ] - } - ), 200] - }) - + ), + 200, + ] + } + ) diff --git a/tests/unit/test_core/test_search.py b/tests/unit/test_core/test_search.py index 89d48b8..b7edc83 100644 --- a/tests/unit/test_core/test_search.py +++ b/tests/unit/test_core/test_search.py @@ -7,18 +7,20 @@ from dug.core.index import Index, SearchException from dug.config import Config -default_indices = ['concepts_index', 'variables_index', 'kg_index'] +default_indices = ["concepts_index", "variables_index", "kg_index"] -host = 'localhost' +host = "localhost" port = 9200 -username = 'elastic' -password = 'hunter2' -nboost_host = 'localhost' -hosts = [{'host': host, 'port': port, 'scheme': 'http'}] +username = "elastic" +password = "hunter2" +nboost_host = 
"localhost" +hosts = [{"host": host, "port": port, "scheme": "http"}] -class MockEsNode(): + +class MockEsNode: def info(): - return {"_nodes" : {"total": 1}} + return {"_nodes": {"total": 1}} + @dataclass class MockIndex: @@ -37,34 +39,34 @@ def get(self, id): def count(self, body): return len(self.values) - class MockIndices: - def __init__(self): self._indices = {} self.call_count = 0 - self.number_of_replicas = 3 + self.number_of_replicas = 1 def exists(self, index): return index in self._indices - def create( - self, - index, - body, - **_kwargs - ): + def create(self, index, body, **_kwargs): self.call_count += 1 self._indices[index] = MockIndex(**body) def get_index(self, index) -> MockIndex: return self._indices.get(index) + def get_settings(self, index): + index_schema = {"settings": {"index": {"number_of_replicas": self.number_of_replicas}}} + settings = { + "kg_index": index_schema, + "concepts_index": index_schema, + "variables_index": index_schema, + } + return settings class MockElastic: - def __init__(self, indices: MockIndices): self.indices = indices self._up = True @@ -86,36 +88,28 @@ def disconnect(self): self._up = False def count(self, body, index): - return { - 'count': self.indices.get_index(index).count(body) - } + return {"count": self.indices.get_index(index).count(body)} def search(self, index, body, **kwargs): values = self.indices.get_index(index).values - return { - 'results': { - k: v - for k, v in values.items() - if body in v - } - } - - + return {"results": {k: v for k, v in values.items() if body in v}} @pytest.fixture def elastic(): - with patch('dug.core.index.Elasticsearch') as es_class: + with patch("dug.core.index.Elasticsearch") as es_class: es_instance = MockElastic(indices=MockIndices()) es_class.return_value = es_instance yield es_instance def test_init(elastic): - cfg = Config(elastic_host='localhost', - elastic_username='elastic', - elastic_password='hunter2', - nboost_host='localhost') + cfg = Config( + elastic_host="localhost", + elastic_username="elastic", + elastic_password="hunter2", + nboost_host="localhost", + ) search = Index(cfg) @@ -129,6 +123,7 @@ def test_init_no_ping(elastic): with pytest.raises(SearchException): _search = Index(Config.from_env()) + @pytest.mark.asyncio async def test_init_indices(elastic): search = Index(Config.from_env()) @@ -142,16 +137,17 @@ async def test_init_indices(elastic): def test_index_doc(elastic: MockElastic): search = Index(Config.from_env()) - assert len(elastic.indices.get_index('concepts_index').values) == 0 - search.index_doc('concepts_index', {'name': 'sample'}, "ID:1") - assert len(elastic.indices.get_index('concepts_index').values) == 1 - assert elastic.indices.get_index('concepts_index').get("ID:1") == {'name': 'sample'} + assert len(elastic.indices.get_index("concepts_index").values) == 0 + search.index_doc("concepts_index", {"name": "sample"}, "ID:1") + assert len(elastic.indices.get_index("concepts_index").values) == 1 + assert elastic.indices.get_index("concepts_index").get("ID:1") == {"name": "sample"} def test_update_doc(elastic: MockElastic): search = Index(Config.from_env()) - search.index_doc('concepts_index', {'name': 'sample'}, "ID:1") - search.update_doc('concepts_index', {'name': 'new value!'}, "ID:1") - assert elastic.indices.get_index('concepts_index').get("ID:1") == {'name': 'new value!'} - + search.index_doc("concepts_index", {"name": "sample"}, "ID:1") + search.update_doc("concepts_index", {"name": "new value!"}, "ID:1") + assert 
elastic.indices.get_index("concepts_index").get("ID:1") == { + "name": "new value!" + } From f01844ac42e160748bb67812b570e3e7c8bead65 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 17:23:33 -0500 Subject: [PATCH 34/85] parameterize all identifier inner vars; --- src/dug/core/annotate.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py index 59dd379..fa566dd 100644 --- a/src/dug/core/annotate.py +++ b/src/dug/core/annotate.py @@ -18,7 +18,7 @@ class Identifier: - def __init__(self, id, label, types=None, search_text="", description=""): + def __init__(self, id, label, types=None, search_text=[], description="", equivalent_identifiers=[], synonyms=[], purl = []): self.id = id self.label = label self.description = description @@ -26,9 +26,9 @@ def __init__(self, id, label, types=None, search_text="", description=""): types = [] self.types = types self.search_text = [search_text] if search_text else [] - self.equivalent_identifiers = [] - self.synonyms = [] - self.purl = "" + self.equivalent_identifiers = equivalent_identifiers + self.synonyms = synonyms + self.purl = purl @property def id_type(self): From c70940fba144d92eb5732c46470ed4ba116be080 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 17:44:54 -0500 Subject: [PATCH 35/85] parameterize everything for init from json form --- src/dug/core/parsers/_base.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index 231608e..b42d4f1 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -11,7 +11,7 @@ class DugElement: # Basic class for holding information for an object you want to make searchable via Dug # Could be a DbGaP variable, DICOM image, App, or really anything # Optionally can hold information pertaining to a containing collection (e.g. 
dbgap study or dicom image series) - def __init__(self, elem_id, name, desc, elem_type, collection_id="", collection_name="", collection_desc="", action="", collection_action=""): + def __init__(self, elem_id, name, desc, elem_type, collection_id="", collection_name="", collection_desc="", action="", collection_action="", concepts={}, ml_ready_desc="", search_terms=[], optional_terms=[]): self.id = elem_id self.name = name self.description = desc @@ -21,10 +21,10 @@ def __init__(self, elem_id, name, desc, elem_type, collection_id="", collection_ self.collection_desc = collection_desc self.action = action self.collection_action = collection_action - self.concepts = {} - self.ml_ready_desc = desc - self.search_terms = [] - self.optional_terms = [] + self.concepts = concepts + self.ml_ready_desc = ml_ready_desc or desc + self.search_terms = search_terms + self.optional_terms = optional_terms def add_concept(self, concept): self.concepts[concept.id] = concept @@ -78,17 +78,17 @@ def __str__(self): class DugConcept: # Basic class for holding information about concepts that are used to organize elements # All Concepts map to at least one element - def __init__(self, concept_id, name, desc, concept_type): - self.id = concept_id + def __init__(self, concept_id, name, desc, concept_type, id="" , description="", type="", concept_action="", identifiers = {}, kg_answers={}, search_terms = [] , optional_terms=[], ml_ready_desc=""): + self.id = concept_id or id self.name = name - self.description = desc - self.type = concept_type + self.description = desc or description + self.type = concept_type or type self.concept_action = "" - self.identifiers = {} - self.kg_answers = {} - self.search_terms = [] - self.optional_terms = [] - self.ml_ready_desc = desc + self.identifiers = identifiers + self.kg_answers = kg_answers + self.search_terms = search_terms + self.optional_terms = optional_terms + self.ml_ready_desc = desc or ml_ready_desc def add_identifier(self, ident): if ident.id in self.identifiers: From a95bd2e9ba9f9ab014c26bcdad8ac151ec53ee50 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 17:51:11 -0500 Subject: [PATCH 36/85] probably not a revealation but making everything optional in initialization --- src/dug/core/parsers/_base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index b42d4f1..16e88a4 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -11,11 +11,11 @@ class DugElement: # Basic class for holding information for an object you want to make searchable via Dug # Could be a DbGaP variable, DICOM image, App, or really anything # Optionally can hold information pertaining to a containing collection (e.g. 
dbgap study or dicom image series) - def __init__(self, elem_id, name, desc, elem_type, collection_id="", collection_name="", collection_desc="", action="", collection_action="", concepts={}, ml_ready_desc="", search_terms=[], optional_terms=[]): + def __init__(self, elem_id, name, desc="", elem_type="", type= "", collection_id="", collection_name="", collection_desc="", action="", collection_action="", concepts={}, ml_ready_desc="", search_terms=[], optional_terms=[]): self.id = elem_id self.name = name self.description = desc - self.type = elem_type + self.type = elem_type or type self.collection_id = collection_id self.collection_name = collection_name self.collection_desc = collection_desc @@ -78,7 +78,7 @@ def __str__(self): class DugConcept: # Basic class for holding information about concepts that are used to organize elements # All Concepts map to at least one element - def __init__(self, concept_id, name, desc, concept_type, id="" , description="", type="", concept_action="", identifiers = {}, kg_answers={}, search_terms = [] , optional_terms=[], ml_ready_desc=""): + def __init__(self, concept_id="", name="", desc="", concept_type="", id="" , description="", type="", concept_action="", identifiers = {}, kg_answers={}, search_terms = [] , optional_terms=[], ml_ready_desc=""): self.id = concept_id or id self.name = name self.description = desc or description From f3fca0f8b0d2eccda36985d5d8457b0529bf0340 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 17:51:29 -0500 Subject: [PATCH 37/85] probably not a revealation but making everything optional in initialization --- src/dug/core/parsers/_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index 16e88a4..98e5fe9 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -11,8 +11,8 @@ class DugElement: # Basic class for holding information for an object you want to make searchable via Dug # Could be a DbGaP variable, DICOM image, App, or really anything # Optionally can hold information pertaining to a containing collection (e.g. dbgap study or dicom image series) - def __init__(self, elem_id, name, desc="", elem_type="", type= "", collection_id="", collection_name="", collection_desc="", action="", collection_action="", concepts={}, ml_ready_desc="", search_terms=[], optional_terms=[]): - self.id = elem_id + def __init__(self, elem_id="", name="", desc="", elem_type="", id="", type= "", collection_id="", collection_name="", collection_desc="", action="", collection_action="", concepts={}, ml_ready_desc="", search_terms=[], optional_terms=[]): + self.id = elem_id or id self.name = name self.description = desc self.type = elem_type or type From 1bd901f8ab60672af7dea27bcd1ad88c0232f0e0 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 18:01:07 -0500 Subject: [PATCH 38/85] missed description --- src/dug/core/parsers/_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index 98e5fe9..da6e726 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -11,10 +11,10 @@ class DugElement: # Basic class for holding information for an object you want to make searchable via Dug # Could be a DbGaP variable, DICOM image, App, or really anything # Optionally can hold information pertaining to a containing collection (e.g. 
dbgap study or dicom image series) - def __init__(self, elem_id="", name="", desc="", elem_type="", id="", type= "", collection_id="", collection_name="", collection_desc="", action="", collection_action="", concepts={}, ml_ready_desc="", search_terms=[], optional_terms=[]): + def __init__(self, elem_id="", name="", desc="", description="", elem_type="", id="", type= "", collection_id="", collection_name="", collection_desc="", action="", collection_action="", concepts={}, ml_ready_desc="", search_terms=[], optional_terms=[]): self.id = elem_id or id self.name = name - self.description = desc + self.description = desc or description self.type = elem_type or type self.collection_id = collection_id self.collection_name = collection_name From 3f4e3347fe19abdd13ae96d2cfc88d45c75f16ad Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 18:49:55 -0500 Subject: [PATCH 39/85] normalize search test in identifier --- src/dug/core/annotate.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py index fa566dd..291cf2f 100644 --- a/src/dug/core/annotate.py +++ b/src/dug/core/annotate.py @@ -25,7 +25,10 @@ def __init__(self, id, label, types=None, search_text=[], description="", equiva if types is None: types = [] self.types = types - self.search_text = [search_text] if search_text else [] + if isinstance(search_text, str): + self.search_text = [search_text] if search_text else [] + elif isinstance(search_text, list): + self.search_text = search_text self.equivalent_identifiers = equivalent_identifiers self.synonyms = synonyms self.purl = purl From 227ad4a5bea5460346a0b40e143b832e9842f223 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Tue, 19 Dec 2023 20:31:01 -0500 Subject: [PATCH 40/85] https://github.com/TranslatorSRI/NameResolution/issues/129 --- src/dug/core/annotate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py index 291cf2f..d3066a1 100644 --- a/src/dug/core/annotate.py +++ b/src/dug/core/annotate.py @@ -428,7 +428,9 @@ def make_request(self, curie: str, http_session: Session): return response.json() except json.decoder.JSONDecodeError as e: logger.error(f"Json parse error for response from `{url}`. 
Exception: {str(e)}") - return {curie: {"names": []}} + except requests.exceptions.ConnectionError as e: + logger.error(f'connection reset') + return {curie: {"names": []}} def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]: # Return curie synonyms From 5888094500914bfd30b2dfaee8f4ad29b30173a1 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 20 Dec 2023 09:11:35 -0500 Subject: [PATCH 41/85] avoid deep copy --- src/dug/core/parsers/_base.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index da6e726..d1272e7 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -30,10 +30,10 @@ def add_concept(self, concept): self.concepts[concept.id] = concept def jsonable(self): - copy_dict = copy.deepcopy(self.__dict__) + dict_style = self.__dict__ concepts = {k: v.jsonable() for k, v in self.concepts.items()} - copy_dict['concepts'] = concepts - return copy_dict + dict_style['concepts'] = concepts + return dict_style def get_searchable_dict(self): @@ -137,10 +137,10 @@ def get_searchable_dict(self): return es_conc def jsonable(self): - copy_dict = copy.deepcopy(self.__dict__) identifiers = {k: v.jsonable() for k, v in self.identifiers.items()} - copy_dict['identifiers'] = identifiers - return copy_dict + dict_style = self.__dict__ + dict_style['identifiers'] = identifiers + return dict_style def __str__(self): return json.dumps(self.__dict__, indent=2, default=utils.complex_handler) From 527fbb851adf7a4a5555552a4d45f1e03e3a9bcb Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 20 Dec 2023 09:32:12 -0500 Subject: [PATCH 42/85] see if this helps --- src/dug/core/parsers/_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index d1272e7..26037a0 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -29,9 +29,9 @@ def __init__(self, elem_id="", name="", desc="", description="", elem_type="", i def add_concept(self, concept): self.concepts[concept.id] = concept - def jsonable(self): - dict_style = self.__dict__ + def jsonable(self): concepts = {k: v.jsonable() for k, v in self.concepts.items()} + dict_style = self.__dict__ dict_style['concepts'] = concepts return dict_style From 17893a031b8e919ef21fd0946ffb88355b8ee2b5 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 20 Dec 2023 12:37:23 -0500 Subject: [PATCH 43/85] shallow copy and dump --- src/dug/core/parsers/_base.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index 26037a0..f8ba301 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -31,8 +31,13 @@ def add_concept(self, concept): def jsonable(self): concepts = {k: v.jsonable() for k, v in self.concepts.items()} - dict_style = self.__dict__ - dict_style['concepts'] = concepts + dict_style = {} + # make a shallow copy + for k, v in self.__dict__.items(): + if k == 'concepts': + dict_style[k] = concepts + continue + dict_style[k] = v return dict_style @@ -138,8 +143,12 @@ def get_searchable_dict(self): def jsonable(self): identifiers = {k: v.jsonable() for k, v in self.identifiers.items()} - dict_style = self.__dict__ - dict_style['identifiers'] = identifiers + dict_style = {} + for k, v in self.__dict__.items(): + if k == 'identifiers': + dict_style[k] = identifiers + continue + dict_style[k] = v return 
dict_style def __str__(self): From 1aa475fc8e83878883158af0519c8e7deef5919d Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 20 Dec 2023 17:16:35 -0500 Subject: [PATCH 44/85] logging for crawler --- src/dug/core/crawler.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/dug/core/crawler.py b/src/dug/core/crawler.py index 1b5b877..32d96bb 100644 --- a/src/dug/core/crawler.py +++ b/src/dug/core/crawler.py @@ -146,6 +146,7 @@ def annotate_element(self, element): http_session=self.http_session) # Each identifier then becomes a concept that links elements together + logger.info("Got %d identifiers for %s", len(identifiers) , element.ml_ready_desc) for identifier in identifiers: if identifier.id not in self.concepts: # Create concept for newly seen identifier From b5405ebf2a3b76a266927254eac5b99e7082b8b3 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 20 Dec 2023 22:38:02 -0500 Subject: [PATCH 45/85] reverting cause of memory leak --- src/dug/core/annotate.py | 15 ++++----- src/dug/core/parsers/_base.py | 59 +++++++++++++---------------------- 2 files changed, 27 insertions(+), 47 deletions(-) diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py index d3066a1..6294526 100644 --- a/src/dug/core/annotate.py +++ b/src/dug/core/annotate.py @@ -18,20 +18,17 @@ class Identifier: - def __init__(self, id, label, types=None, search_text=[], description="", equivalent_identifiers=[], synonyms=[], purl = []): + def __init__(self, id, label, types=None, search_text="", description=""): self.id = id self.label = label self.description = description if types is None: types = [] self.types = types - if isinstance(search_text, str): - self.search_text = [search_text] if search_text else [] - elif isinstance(search_text, list): - self.search_text = search_text - self.equivalent_identifiers = equivalent_identifiers - self.synonyms = synonyms - self.purl = purl + self.search_text = [search_text] if search_text else [] + self.equivalent_identifiers = [] + self.synonyms = [] + self.purl = "" @property def id_type(self): @@ -620,4 +617,4 @@ def get_curie_purl(curie): if __name__ == "__main__": import doctest - doctest.testmod() + doctest.testmod() \ No newline at end of file diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index f8ba301..43a1801 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -1,6 +1,5 @@ import json from typing import Union, Callable, Any, Iterable -import copy from dug.core.loaders import InputFile @@ -11,35 +10,26 @@ class DugElement: # Basic class for holding information for an object you want to make searchable via Dug # Could be a DbGaP variable, DICOM image, App, or really anything # Optionally can hold information pertaining to a containing collection (e.g. 
dbgap study or dicom image series) - def __init__(self, elem_id="", name="", desc="", description="", elem_type="", id="", type= "", collection_id="", collection_name="", collection_desc="", action="", collection_action="", concepts={}, ml_ready_desc="", search_terms=[], optional_terms=[]): - self.id = elem_id or id + def __init__(self, elem_id, name, desc, elem_type, collection_id="", collection_name="", collection_desc="", action="", collection_action=""): + self.id = elem_id self.name = name - self.description = desc or description - self.type = elem_type or type + self.description = desc + self.type = elem_type self.collection_id = collection_id self.collection_name = collection_name self.collection_desc = collection_desc self.action = action self.collection_action = collection_action - self.concepts = concepts - self.ml_ready_desc = ml_ready_desc or desc - self.search_terms = search_terms - self.optional_terms = optional_terms + self.concepts = {} + self.ml_ready_desc = desc + self.search_terms = [] + self.optional_terms = [] def add_concept(self, concept): self.concepts[concept.id] = concept - def jsonable(self): - concepts = {k: v.jsonable() for k, v in self.concepts.items()} - dict_style = {} - # make a shallow copy - for k, v in self.__dict__.items(): - if k == 'concepts': - dict_style[k] = concepts - continue - dict_style[k] = v - return dict_style - + def jsonable(self): + return self.__dict__ def get_searchable_dict(self): # Translate DugElement to ES-style dict @@ -83,17 +73,17 @@ def __str__(self): class DugConcept: # Basic class for holding information about concepts that are used to organize elements # All Concepts map to at least one element - def __init__(self, concept_id="", name="", desc="", concept_type="", id="" , description="", type="", concept_action="", identifiers = {}, kg_answers={}, search_terms = [] , optional_terms=[], ml_ready_desc=""): - self.id = concept_id or id + def __init__(self, concept_id, name, desc, concept_type): + self.id = concept_id self.name = name - self.description = desc or description - self.type = concept_type or type + self.description = desc + self.type = concept_type self.concept_action = "" - self.identifiers = identifiers - self.kg_answers = kg_answers - self.search_terms = search_terms - self.optional_terms = optional_terms - self.ml_ready_desc = desc or ml_ready_desc + self.identifiers = {} + self.kg_answers = {} + self.search_terms = [] + self.optional_terms = [] + self.ml_ready_desc = desc def add_identifier(self, ident): if ident.id in self.identifiers: @@ -142,14 +132,7 @@ def get_searchable_dict(self): return es_conc def jsonable(self): - identifiers = {k: v.jsonable() for k, v in self.identifiers.items()} - dict_style = {} - for k, v in self.__dict__.items(): - if k == 'identifiers': - dict_style[k] = identifiers - continue - dict_style[k] = v - return dict_style + return self.__dict__ def __str__(self): return json.dumps(self.__dict__, indent=2, default=utils.complex_handler) @@ -159,4 +142,4 @@ def __str__(self): Parser = Callable[[Any], Iterable[Indexable]] -FileParser = Callable[[InputFile], Iterable[Indexable]] +FileParser = Callable[[InputFile], Iterable[Indexable]] \ No newline at end of file From f1950e0bd63168c5b55aeedf52be338385d80f4d Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Thu, 21 Dec 2023 18:37:24 -0500 Subject: [PATCH 46/85] debug message for tranql --- src/dug/core/annotate.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git 
a/src/dug/core/annotate.py b/src/dug/core/annotate.py index 6294526..e605292 100644 --- a/src/dug/core/annotate.py +++ b/src/dug/core/annotate.py @@ -135,7 +135,7 @@ def expand_identifier(self, identifier, query_factory, kg_filename, include_all_ response = json.load(stream) else: query = query_factory.get_query(identifier) - logger.debug(query) + logger.info(query) response = requests.post( url=self.url, headers=self.tranql_headers, @@ -149,6 +149,9 @@ def expand_identifier(self, identifier, query_factory, kg_filename, include_all_ return [] except KeyError as e: logger.error(f"Could not find key: {e} in response: {response}") + except Exception as ex: + logger.error(response) + raise ex # Dump out to file if there's a knowledge graph with open(kg_filename, 'w') as stream: @@ -617,4 +620,4 @@ def get_curie_purl(curie): if __name__ == "__main__": import doctest - doctest.testmod() \ No newline at end of file + doctest.testmod() From b94374dc25cb51f2cadc65f0c148c511fd39acbd Mon Sep 17 00:00:00 2001 From: Nathaniel Braswell Date: Tue, 2 Jan 2024 14:38:04 -0500 Subject: [PATCH 47/85] Update tests command in code-checks.yml --- .github/workflows/code-checks.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 193756d..d07e0df 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -116,8 +116,7 @@ jobs: - name: Test with pytest run: | - pytest --doctest-modules src - coverage run -m pytest tests/unit + coverage run -m pytest tests ############################ Bandit ################################ bandit: @@ -138,4 +137,4 @@ jobs: # Only report high security issues - name: Test with Bandit run: | - bandit -r src -n3 -lll \ No newline at end of file + bandit -r src -n3 -lll From ea610e151fc5c37dcfc2de66deebdede92625f05 Mon Sep 17 00:00:00 2001 From: Nathaniel Braswell Date: Tue, 2 Jan 2024 14:44:54 -0500 Subject: [PATCH 48/85] Update code-checks.yml to use make test --- .github/workflows/code-checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index d07e0df..d9722f9 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -116,7 +116,7 @@ jobs: - name: Test with pytest run: | - coverage run -m pytest tests + make tests ############################ Bandit ################################ bandit: From dfa65cf6017864388a971feaaa0184c680f732f4 Mon Sep 17 00:00:00 2001 From: Nathaniel Braswell Date: Tue, 2 Jan 2024 14:47:01 -0500 Subject: [PATCH 49/85] Fixed make test in workflow --- .github/workflows/code-checks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index d9722f9..78cf048 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -116,7 +116,7 @@ jobs: - name: Test with pytest run: | - make tests + make test ############################ Bandit ################################ bandit: From 5658b38b4016141e397ff23cece036343c9573cc Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 3 Jan 2024 09:04:13 -0500 Subject: [PATCH 50/85] bump python version to minimal image --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 6f5b10e..8135d0d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ # A container for the core semantic-search capability. 
# ###################################################### -FROM python:3.10.10-slim +FROM python:3.12.0-alpine3.18 # Install required packages RUN apt-get update && \ From 0677a775e6e8db09982864d0dc8ae8082839d08c Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 3 Jan 2024 09:36:46 -0500 Subject: [PATCH 51/85] bump versions --- Dockerfile | 4 ++-- requirements.txt | 29 ++++++++++++++--------------- setup.cfg | 10 +++++----- 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8135d0d..4cc17ef 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,11 +3,11 @@ # A container for the core semantic-search capability. # ###################################################### -FROM python:3.12.0-alpine3.18 +FROM python:3.12.0-slim # Install required packages RUN apt-get update && \ - apt-get install -y curl make vim && \ + apt-get install -y g++ make && \ rm -rf /var/cache/apt/* # Create a non-root user. diff --git a/requirements.txt b/requirements.txt index f602432..891033e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,29 +1,28 @@ aiohttp asyncio -fastapi==0.95.0 -uvicorn==0.23.2 +fastapi +uvicorn elasticsearch[async]==8.5.2 gunicorn itsdangerous Jinja2 jsonschema MarkupSafe -ormar==0.12.1 -mistune==2.0.3 -pluggy==1.0.0 -pyrsistent==0.17.3 +ormar +mistune +pluggy +pyrsistent pytest -pytz==2021.1 -PyYAML==6.0 -requests==2.31.0 -# old redis==4.4.2 -redis==4.5.4 -requests-cache==0.9.8 -six==1.16.0 +pytz +PyYAML +requests +redis +requests-cache +six # Click for command line arguments # We use Click 7.0 because that's what one of the pinned packages above use. click -httpx>=0.24.1 +httpx bmt==1.1.0 -urllib3>=1.26.17 \ No newline at end of file +urllib3 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index b470aef..b551ef3 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,14 +17,14 @@ classifiers = package_dir = = src packages = find: -python_requires = >=3.10 +python_requires = >=3.12 include_package_data = true install_requires = elasticsearch==8.5.2 pluggy requests - requests_cache==0.9.8 - redis==4.5.4 + requests_cache + redis [options.entry_points] console_scripts = @@ -32,8 +32,8 @@ console_scripts = [options.extras_require] rest = - fastapi==0.95.0 - uvicorn==0.23.2 + fastapi + uvicorn gunicorn jsonschema From 1da155cb92d94829ea764af63e8a88e3038810e5 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 3 Jan 2024 09:41:09 -0500 Subject: [PATCH 52/85] bump python version in workflows --- .github/workflows/code-checks.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 78cf048..a1ef3f0 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -45,7 +45,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: '3.12' # Currently actions/setup-python supports caching # but the cache is not as robust as cache action. 
@@ -106,7 +106,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: '3.12' - name: Install Requirements run: | @@ -126,7 +126,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: '3.12' - name: Install Requirements run: | From c3a337194006fa1ba571cb78880648c694b5b1c8 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 3 Jan 2024 10:00:18 -0500 Subject: [PATCH 53/85] change to bullseye --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 4cc17ef..1d76d00 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ # A container for the core semantic-search capability. # ###################################################### -FROM python:3.12.0-slim +FROM python:3.12.1-slim-bullseye # Install required packages RUN apt-get update && \ From 8e10f695740238cb8b71c886a213b9d834949d42 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 3 Jan 2024 12:53:27 -0500 Subject: [PATCH 54/85] alpine image --- Dockerfile | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1d76d00..d57f06b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,19 +3,18 @@ # A container for the core semantic-search capability. # ###################################################### -FROM python:3.12.1-slim-bullseye +FROM python:3.12.0-alpine3.18 # Install required packages -RUN apt-get update && \ - apt-get install -y g++ make && \ - rm -rf /var/cache/apt/* +RUN apk update && \ + apk add g++ make # Create a non-root user. ENV USER dug ENV HOME /home/$USER ENV UID 1000 -RUN adduser --disabled-login --home $HOME --shell /bin/bash --uid $UID $USER +RUN adduser -D --home $HOME --uid $UID $USER USER $USER WORKDIR $HOME From 0a7c368b5d3d67aa18fb5acff5a80042006ada39 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 3 Jan 2024 13:33:35 -0500 Subject: [PATCH 55/85] do absolute imports for annotator modules for webserver --- src/dug/core/annotators/__init__.py | 6 +++--- src/dug/core/annotators/monarch_annotator.py | 4 ++-- src/dug/core/annotators/sapbert_annotator.py | 4 ++-- src/dug/core/annotators/utils/__init__.py | 0 4 files changed, 7 insertions(+), 7 deletions(-) create mode 100644 src/dug/core/annotators/utils/__init__.py diff --git a/src/dug/core/annotators/__init__.py b/src/dug/core/annotators/__init__.py index 2838f1c..1a58c40 100644 --- a/src/dug/core/annotators/__init__.py +++ b/src/dug/core/annotators/__init__.py @@ -4,9 +4,9 @@ import pluggy from dug.config import Config -from ._base import DugIdentifier, Indexable, Annotator, DefaultNormalizer, DefaultSynonymFinder -from .monarch_annotator import AnnotateMonarch -from .sapbert_annotator import AnnotateSapbert +from dug.core.annotators._base import DugIdentifier, Indexable, Annotator, DefaultNormalizer, DefaultSynonymFinder +from dug.core.annotators.monarch_annotator import AnnotateMonarch +from dug.core.annotators.sapbert_annotator import AnnotateSapbert logger = logging.getLogger('dug') diff --git a/src/dug/core/annotators/monarch_annotator.py b/src/dug/core/annotators/monarch_annotator.py index 841e9cf..1c67f40 100644 --- a/src/dug/core/annotators/monarch_annotator.py +++ b/src/dug/core/annotators/monarch_annotator.py @@ -3,8 +3,8 @@ from typing import List from requests import Session -from ._base import DugIdentifier, Input -from .utils.biolink_purl_util import BioLinkPURLerizer +from dug.core.annotators._base import DugIdentifier, Input +from 
dug.core.annotators.utils.biolink_purl_util import BioLinkPURLerizer logger = logging.getLogger('dug') diff --git a/src/dug/core/annotators/sapbert_annotator.py b/src/dug/core/annotators/sapbert_annotator.py index 73eefe9..7c2fa81 100644 --- a/src/dug/core/annotators/sapbert_annotator.py +++ b/src/dug/core/annotators/sapbert_annotator.py @@ -3,8 +3,8 @@ from requests import Session import json -from ._base import DugIdentifier, Input -from .utils.biolink_purl_util import BioLinkPURLerizer +from dug.core.annotators._base import DugIdentifier, Input +from dug.core.annotators.utils.biolink_purl_util import BioLinkPURLerizer logger = logging.getLogger("dug") diff --git a/src/dug/core/annotators/utils/__init__.py b/src/dug/core/annotators/utils/__init__.py new file mode 100644 index 0000000..e69de29 From a863d384e0cf097c6cb4ed254e3554315e5e5ccc Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 3 Jan 2024 13:37:56 -0500 Subject: [PATCH 56/85] assert called once is a function these days --- tests/unit/test_cli.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 9237bd7..3a2d97e 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -25,31 +25,31 @@ def test_dug_cli_parser(): @patch('dug.cli.crawl') def test_dug_cli_main_crawl(mock_crawl): main(["crawl", "somefile.csv", "--parser", "topmedtag"]) - assert mock_crawl.called_once() + mock_crawl.assert_called_once() @mark.cli @patch('dug.cli.crawl') def test_dug_cli_main_extract_dug_elements(mock_crawl): main(["crawl", "somefile.csv", "--parser", "topmedtag", "-x"]) - assert mock_crawl.called_once() + mock_crawl.assert_called_once() assert mock_crawl.call_args_list[0].args[0].extract_dug_elements @mark.cli @patch('dug.cli.crawl') def test_dug_cli_main_extract_dug_elements_none(mock_crawl): main(["crawl", "somefile.csv", "--parser", "topmedtag"]) - assert mock_crawl.called_once() + mock_crawl.assert_called_once() assert not mock_crawl.call_args_list[0].args[0].extract_dug_elements @mark.cli @patch('dug.cli.crawl') def test_dug_cli_main_annotator(mock_crawl): main(["crawl", "somefile.csv","--parser", "topmedtag", "--annotator", "annotator-monarch"]) - assert mock_crawl.called_once() + mock_crawl.assert_called_once() @mark.cli @patch('dug.cli.search') def test_dug_cli_main_search(mock_search): # mock_search.search.return_value = "Searching!" 
main(["search", "-q", "heart attack", "-t", "variables", "-k", "namespace=default"]) - assert mock_search.called_once() + mock_search.assert_called_once() From 814a247457fe7808bf28b7b25f878e342d1d641d Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 3 Jan 2024 15:57:51 -0500 Subject: [PATCH 57/85] trivy scan skip unfixed --- .github/workflows/trivy-pr-scan.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml index 142572d..19f86e1 100644 --- a/.github/workflows/trivy-pr-scan.yml +++ b/.github/workflows/trivy-pr-scan.yml @@ -55,6 +55,7 @@ jobs: image-ref: '${{ github.repository }}:vuln-test' format: 'sarif' severity: 'CRITICAL,HIGH' + ignore-unfixed: true output: 'trivy-results.sarif' exit-code: '1' # Scan results should be viewable in GitHub Security Dashboard From ceece16bf6bbbffcdb52f63a8cc6db27fd507c6a Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 3 Jan 2024 16:08:04 -0500 Subject: [PATCH 58/85] upgrade system pip version for vul scan --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index d57f06b..c009bc5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,6 +9,7 @@ FROM python:3.12.0-alpine3.18 RUN apk update && \ apk add g++ make +RUN pip install --upgrade pip # Create a non-root user. ENV USER dug ENV HOME /home/$USER From 9ae8e0aa945301df2718e65e8e959199c665b1d8 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Thu, 4 Jan 2024 10:05:17 -0500 Subject: [PATCH 59/85] remove annotate commented out code, backdrop python min requriement --- setup.cfg | 2 +- src/dug/core/annotate.py | 612 --------------------------------------- 2 files changed, 1 insertion(+), 613 deletions(-) delete mode 100644 src/dug/core/annotate.py diff --git a/setup.cfg b/setup.cfg index b551ef3..75fe4d2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,7 +17,7 @@ classifiers = package_dir = = src packages = find: -python_requires = >=3.12 +python_requires = >=3.11 include_package_data = true install_requires = elasticsearch==8.5.2 diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py deleted file mode 100644 index 29fa85f..0000000 --- a/src/dug/core/annotate.py +++ /dev/null @@ -1,612 +0,0 @@ -# import json -# import logging -# import os -# import re -# import urllib.parse -# from typing import TypeVar, Generic, Union, List, Tuple, Optional -# import bmt -# import requests -# from requests import Session - -# import dug.core.tranql as tql - - -# logger = logging.getLogger('dug') - -# logging.getLogger("requests").setLevel(logging.WARNING) -# logging.getLogger("urllib3").setLevel(logging.WARNING) - - -# class Identifier: -# def __init__(self, id, label, types=None, search_text="", description=""): -# self.id = id -# self.label = label -# self.description = description -# if types is None: -# types = [] -# self.types = types -# self.search_text = [search_text] if search_text else [] -# self.equivalent_identifiers = [] -# self.synonyms = [] -# self.purl = "" - -# @property -# def id_type(self): -# return self.id.split(":")[0] - -# def add_search_text(self, text): -# # Add text only if it's unique and if not empty string -# if text and text not in self.search_text: -# self.search_text.append(text) - -# def get_searchable_dict(self): -# # Return a version of the identifier compatible with what's in ElasticSearch -# es_ident = { -# 'id': self.id, -# 'label': self.label, -# 'equivalent_identifiers': self.equivalent_identifiers, -# 'type': self.types, -# 'synonyms': self.synonyms -# } -# return es_ident - 
-# def jsonable(self): -# return self.__dict__ - - -# class DugAnnotator: -# def __init__( -# self, -# preprocessor: "Preprocessor", -# annotator: "Annotator", -# normalizer: "Normalizer", -# synonym_finder: "SynonymFinder", -# ontology_greenlist=[], -# ): -# self.preprocessor = preprocessor -# self.annotator = annotator -# self.normalizer = normalizer -# self.synonym_finder = synonym_finder -# self.ontology_greenlist = ontology_greenlist -# self.norm_fails_file = "norm_fails.txt" -# self.anno_fails_file = "anno_fails.txt" - -# def annotate(self, text, http_session): - -# # Preprocess text (debraviate, remove stopwords, etc.) -# text = self.preprocessor.preprocess(text) - -# # Fetch identifiers -# raw_identifiers = self.annotator.annotate(text, http_session) - -# # Write out to file if text fails to annotate -# if not raw_identifiers: -# with open(self.anno_fails_file, "a") as fh: -# fh.write(f'{text}\n') - -# processed_identifiers = [] -# for identifier in raw_identifiers: - -# # Normalize identifier using normalization service -# norm_id = self.normalizer.normalize(identifier, http_session) - -# # Skip adding id if it doesn't normalize -# if norm_id is None: -# # Write out to file if identifier doesn't normalize -# with open(self.norm_fails_file, "a") as fh: -# fh.write(f'{identifier.id}\n') - -# # Discard non-normalized ident if not in greenlist -# if identifier.id_type not in self.ontology_greenlist: -# continue - -# # If it is in greenlist just keep moving forward -# norm_id = identifier - -# # Add synonyms to identifier -# norm_id.synonyms = self.synonym_finder.get_synonyms(norm_id.id, http_session) - -# # Get pURL for ontology identifer for more info -# norm_id.purl = BioLinkPURLerizer.get_curie_purl(norm_id.id) -# processed_identifiers.append(norm_id) - -# return processed_identifiers - - -# class ConceptExpander: -# def __init__(self, url, min_tranql_score=0.2): -# self.url = url -# self.min_tranql_score = min_tranql_score -# self.include_node_keys = ["id", "name", "synonyms"] -# self.include_edge_keys = [] -# self.tranql_headers = {"accept": "application/json", "Content-Type": "text/plain"} - -# def is_acceptable_answer(self, answer): -# return True - -# def expand_identifier(self, identifier, query_factory, kg_filename, include_all_attributes=False): - -# answer_kgs = [] - -# # Skip TranQL query if a file exists in the crawlspace exists already, but continue w/ answers -# if os.path.exists(kg_filename): -# logger.info(f"identifier {identifier} is already crawled. 
Skipping TranQL query.") -# with open(kg_filename, 'r') as stream: -# response = json.load(stream) -# else: -# query = query_factory.get_query(identifier) -# logger.debug(query) -# response = requests.post( -# url=self.url, -# headers=self.tranql_headers, -# data=query).json() - -# # Case: Skip if empty KG -# try: -# if response["message"] == 'Internal Server Error' or len(response["message"]["knowledge_graph"]["nodes"]) == 0: -# logger.debug(f"Did not find a knowledge graph for {query}") -# logger.debug(f"{self.url} returned response: {response}") -# return [] -# except KeyError as e: -# logger.error(f"Could not find key: {e} in response: {response}") - -# # Dump out to file if there's a knowledge graph -# with open(kg_filename, 'w') as stream: -# json.dump(response, stream, indent=2) - -# # Get nodes in knowledge graph hashed by ids for easy lookup -# noMessage = (len(response.get("message",{})) == 0) -# statusError = (response.get("status","") == 'Error') -# if noMessage or statusError: -# # Skip on error -# logger.info(f"Error with identifier: {identifier}, response: {response}, kg_filename: '{kg_filename}'") -# return [] -# kg = tql.QueryKG(response) - -# for answer in kg.answers: -# # Filter out answers that don't meet some criteria -# # Right now just don't filter anything -# logger.debug(f"Answer: {answer}") -# if not self.is_acceptable_answer(answer): -# logger.warning("Skipping answer as it failed one or more acceptance criteria. See log for details.") -# continue - -# # Get subgraph containing only information for this answer -# try: -# # Temporarily surround in try/except because sometimes the answer graphs -# # contain invalid references to edges/nodes -# # This will be fixed in Robokop but for now just silently warn if answer is invalid -# node_attributes_filter = None if include_all_attributes else self.include_node_keys -# edge_attributes_filter = None if include_all_attributes else self.include_edge_keys -# answer_kg = kg.get_answer_subgraph(answer, -# include_node_keys=node_attributes_filter, -# include_edge_keys=edge_attributes_filter) - -# # Add subgraph to list of acceptable answers to query -# answer_kgs.append(answer_kg) - -# except tql.MissingNodeReferenceError: -# # TEMPORARY: Skip answers that have invalid node references -# # Need this to be fixed in Robokop -# logger.warning("Skipping answer due to presence of non-preferred id! " -# "See err msg for details.") -# continue -# except tql.MissingEdgeReferenceError: -# # TEMPORARY: Skip answers that have invalid edge references -# # Need this to be fixed in Robokop -# logger.warning("Skipping answer due to presence of invalid edge reference! 
" -# "See err msg for details.") -# continue - -# return answer_kgs - - -# class Preprocessor: -# """"Class for preprocessing strings so they are better interpreted by NLP steps""" - -# def __init__(self, debreviator=None, stopwords=None): -# if debreviator is None: -# debreviator = self.default_debreviator_factory() -# self.decoder = debreviator - -# if stopwords is None: -# stopwords = [] -# self.stopwords = stopwords - -# def preprocess(self, text: str) -> str: -# """ -# Apply debreviator to replace abbreviations and other characters - -# >>> pp = Preprocessor({"foo": "bar"}, ["baz"]) -# >>> pp.preprocess("Hello foo") -# 'Hello bar' - -# >>> pp.preprocess("Hello baz world") -# 'Hello world' -# """ - -# for key, value in self.decoder.items(): -# text = text.replace(key, value) - -# # Remove any stopwords -# text = " ".join([word for word in text.split() if word not in self.stopwords]) -# return text - -# @staticmethod -# def default_debreviator_factory(): -# return {"bmi": "body mass index", "_": " "} - - -# Input = TypeVar("Input") -# Output = TypeVar("Output") - - -# class ApiClient(Generic[Input, Output]): - -# def make_request(self, value: Input, http_session: Session): -# raise NotImplementedError() - -# def handle_response(self, value, response: Union[dict, list]) -> Output: -# raise NotImplementedError() - -# def __call__(self, value: Input, http_session: Session) -> Output: -# response = self.make_request(value, http_session) - -# result = self.handle_response(value, response) - -# return result - - -# class Annotator(ApiClient[str, List[Identifier]]): -# """ -# Use monarch API service to fetch ontology IDs found in text -# """ - -# def __init__(self, url: str): -# self.url = url - -# def sliding_window(self, text, max_characters=2000, padding_words=5): -# """ -# For long texts sliding window works as the following -# "aaaa bbb ccc ddd eeee" -# with a sliding max chars 8 and padding 1 -# first yeild would be "aaaa bbb" -# next subsequent yeilds "bbb ccc", "ccc ddd" , "ddd eeee" -# allowing context to be preserved with the scope of padding -# For a text of length 7653 , with max_characters 2000 and padding 5 , 4 chunks are yielded. 
-# """ -# words = text.split(' ') -# total_words = len(words) -# window_end = False -# current_index = 0 -# while not window_end: -# current_string = "" -# for index, word in enumerate(words[current_index: ]): -# if len(current_string) + len(word) + 1 >= max_characters: -# yield current_string + " " -# current_index += index - padding_words -# break -# appendee = word if index == 0 else " " + word -# current_string += appendee - -# if current_index + index == len(words) - 1: -# window_end = True -# yield current_string - -# def annotate(self, text, http_session): -# logger.debug(f"Annotating: {text}") -# identifiers = [] -# for chunk_text in self.sliding_window(text): -# identifiers += self(chunk_text, http_session) -# return identifiers - -# def make_request(self, value: Input, http_session: Session): -# value = urllib.parse.quote(value) -# url = f'{self.url}{value}' - -# # This could be moved to a config file -# NUM_TRIES = 5 -# for _ in range(NUM_TRIES): -# response = http_session.get(url) -# if response is not None: -# # looks like it worked -# break - -# # if the reponse is still None here, throw an error -# if response is None: -# raise RuntimeError(f"no response from {url}") -# return response.json() - -# def handle_response(self, value, response: dict) -> List[Identifier]: -# identifiers = [] -# """ Parse each identifier and initialize identifier object """ -# for span in response.get('spans', []): -# search_text = span.get('text', None) -# for token in span.get('token', []): -# curie = token.get('id', None) -# if not curie: -# continue - -# biolink_types = token.get('category') -# label = token.get('terms')[0] -# identifiers.append(Identifier(id=curie, -# label=label, -# types=biolink_types, -# search_text=search_text)) -# return identifiers - - -# class Normalizer(ApiClient[Identifier, Identifier]): -# def __init__(self, url): -# self.bl_toolkit = bmt.Toolkit() -# self.url = url - -# def normalize(self, identifier: Identifier, http_session: Session): -# # Use RENCI's normalization API service to get the preferred version of an identifier -# logger.debug(f"Normalizing: {identifier.id}") -# return self(identifier, http_session) - -# def make_request(self, value: Identifier, http_session: Session) -> dict: -# curie = value.id -# url = f"{self.url}{urllib.parse.quote(curie)}" -# try: -# response = http_session.get(url) -# except Exception as get_exc: -# logger.info(f"Error normalizing {value} at {url}") -# logger.error(f"Error {get_exc.__class__.__name__}: {get_exc}") -# return {} -# try: -# normalized = response.json() -# except Exception as json_exc: -# logger.info(f"Error processing response: {response.text} (HTTP {response.status_code})") -# logger.error(f"Error {json_exc.__class__.__name__}: {json_exc}") -# return {} - -# return normalized - -# def handle_response(self, identifier: Identifier, normalized: dict) -> Optional[Identifier]: -# """ Record normalized results. """ -# curie = identifier.id -# normalization = normalized.get(curie, {}) -# if normalization is None: -# logger.info(f"Normalization service did not return normalization for: {curie}") -# return None - -# preferred_id = normalization.get("id", {}) -# equivalent_identifiers = normalization.get("equivalent_identifiers", []) -# biolink_type = normalization.get("type", []) - -# # Return none if there isn't actually a preferred id -# if 'identifier' not in preferred_id: -# logger.debug(f"ERROR: normalize({curie})=>({preferred_id}). 
No identifier?") -# return None - -# logger.debug(f"Preferred id: {preferred_id}") -# identifier.id = preferred_id.get('identifier', '') -# identifier.label = preferred_id.get('label', '') -# identifier.description = preferred_id.get('description', '') -# identifier.equivalent_identifiers = [v['identifier'] for v in equivalent_identifiers] -# try: -# identifier.types = self.bl_toolkit.get_element(biolink_type[0]).name -# except: -# # converts biolink:SmallMolecule to small molecule -# identifier.types = (" ".join(re.split("(?=[A-Z])", biolink_type[0].replace('biolink:', ''))[1:])).lower() -# return identifier - - -# class SynonymFinder(ApiClient[str, List[str]]): - -# def __init__(self, url: str): -# self.url = url - -# def get_synonyms(self, curie: str, http_session): -# ''' -# This function uses the NCATS translator service to return a list of synonyms for -# curie id -# ''' - -# return self(curie, http_session) - -# def make_request(self, curie: str, http_session: Session): -# # Get response from namelookup reverse lookup op -# # example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post) -# url = f"{self.url}" -# payload = { -# 'curies': [curie] -# } -# try: -# response = http_session.post(url, json=payload) -# if str(response.status_code).startswith('4'): -# logger.error(f"No synonyms returned for: `{curie}`. Validation error: {response.text}") -# return {curie: []} -# if str(response.status_code).startswith('5'): -# logger.error(f"No synonyms returned for: `{curie}`. Internal server error from {self.url}. Error: {response.text}") -# return {curie: []} -# return response.json() -# except json.decoder.JSONDecodeError as e: -# logger.error(f"Json parse error for response from `{url}`. Exception: {str(e)}") -# return {curie: []} - -# def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]: -# # Return curie synonyms -# return raw_synonyms.get(curie, []) - - - - - -# class BioLinkPURLerizer: -# # Static class for the sole purpose of doing lookups of different ontology PURLs -# # Is it pretty? No. But it gets the job done. 
-# biolink_lookup = {"APO": "http://purl.obolibrary.org/obo/APO_", -# "Aeolus": "http://translator.ncats.nih.gov/Aeolus_", -# "BIOGRID": "http://identifiers.org/biogrid/", -# "BIOSAMPLE": "http://identifiers.org/biosample/", -# "BSPO": "http://purl.obolibrary.org/obo/BSPO_", -# "CAID": "http://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/by_caid?caid=", -# "CHEBI": "http://purl.obolibrary.org/obo/CHEBI_", -# "CHEMBL.COMPOUND": "http://identifiers.org/chembl.compound/", -# "CHEMBL.MECHANISM": "https://www.ebi.ac.uk/chembl/mechanism/inspect/", -# "CHEMBL.TARGET": "http://identifiers.org/chembl.target/", -# "CID": "http://pubchem.ncbi.nlm.nih.gov/compound/", -# "CL": "http://purl.obolibrary.org/obo/CL_", -# "CLINVAR": "http://identifiers.org/clinvar/", -# "CLO": "http://purl.obolibrary.org/obo/CLO_", -# "COAR_RESOURCE": "http://purl.org/coar/resource_type/", -# "CPT": "https://www.ama-assn.org/practice-management/cpt/", -# "CTD": "http://translator.ncats.nih.gov/CTD_", -# "ClinVarVariant": "http://www.ncbi.nlm.nih.gov/clinvar/variation/", -# "DBSNP": "http://identifiers.org/dbsnp/", -# "DGIdb": "https://www.dgidb.org/interaction_types", -# "DOID": "http://purl.obolibrary.org/obo/DOID_", -# "DRUGBANK": "http://identifiers.org/drugbank/", -# "DrugCentral": "http://translator.ncats.nih.gov/DrugCentral_", -# "EC": "http://www.enzyme-database.org/query.php?ec=", -# "ECTO": "http://purl.obolibrary.org/obo/ECTO_", -# "EDAM-DATA": "http://edamontology.org/data_", -# "EDAM-FORMAT": "http://edamontology.org/format_", -# "EDAM-OPERATION": "http://edamontology.org/operation_", -# "EDAM-TOPIC": "http://edamontology.org/topic_", -# "EFO": "http://identifiers.org/efo/", -# "ENSEMBL": "http://identifiers.org/ensembl/", -# "ExO": "http://purl.obolibrary.org/obo/ExO_", -# "FAO": "http://purl.obolibrary.org/obo/FAO_", -# "FB": "http://identifiers.org/fb/", -# "FBcv": "http://purl.obolibrary.org/obo/FBcv_", -# "FlyBase": "http://flybase.org/reports/", -# "GAMMA": "http://translator.renci.org/GAMMA_", -# "GO": "http://purl.obolibrary.org/obo/GO_", -# "GOLD.META": "http://identifiers.org/gold.meta/", -# "GOP": "http://purl.obolibrary.org/obo/go#", -# "GOREL": "http://purl.obolibrary.org/obo/GOREL_", -# "GSID": "https://scholar.google.com/citations?user=", -# "GTEx": "https://www.gtexportal.org/home/gene/", -# "HANCESTRO": "http://www.ebi.ac.uk/ancestro/ancestro_", -# "HCPCS": "http://purl.bioontology.org/ontology/HCPCS/", -# "HGNC": "http://identifiers.org/hgnc/", -# "HGNC.FAMILY": "http://identifiers.org/hgnc.family/", -# "HMDB": "http://identifiers.org/hmdb/", -# "HP": "http://purl.obolibrary.org/obo/HP_", -# "ICD0": "http://translator.ncats.nih.gov/ICD0_", -# "ICD10": "http://translator.ncats.nih.gov/ICD10_", -# "ICD9": "http://translator.ncats.nih.gov/ICD9_", -# "INCHI": "http://identifiers.org/inchi/", -# "INCHIKEY": "http://identifiers.org/inchikey/", -# "INTACT": "http://identifiers.org/intact/", -# "IUPHAR.FAMILY": "http://identifiers.org/iuphar.family/", -# "KEGG": "http://identifiers.org/kegg/", -# "LOINC": "http://loinc.org/rdf/", -# "MEDDRA": "http://identifiers.org/meddra/", -# "MESH": "http://identifiers.org/mesh/", -# "MGI": "http://identifiers.org/mgi/", -# "MI": "http://purl.obolibrary.org/obo/MI_", -# "MIR": "http://identifiers.org/mir/", -# "MONDO": "http://purl.obolibrary.org/obo/MONDO_", -# "MP": "http://purl.obolibrary.org/obo/MP_", -# "MSigDB": "https://www.gsea-msigdb.org/gsea/msigdb/", -# "MetaCyc": "http://translator.ncats.nih.gov/MetaCyc_", -# "NCBIGENE": 
"http://identifiers.org/ncbigene/", -# "NCBITaxon": "http://purl.obolibrary.org/obo/NCBITaxon_", -# "NCIT": "http://purl.obolibrary.org/obo/NCIT_", -# "NDDF": "http://purl.bioontology.org/ontology/NDDF/", -# "NLMID": "https://www.ncbi.nlm.nih.gov/nlmcatalog/?term=", -# "OBAN": "http://purl.org/oban/", -# "OBOREL": "http://purl.obolibrary.org/obo/RO_", -# "OIO": "http://www.geneontology.org/formats/oboInOwl#", -# "OMIM": "http://purl.obolibrary.org/obo/OMIM_", -# "ORCID": "https://orcid.org/", -# "ORPHA": "http://www.orpha.net/ORDO/Orphanet_", -# "ORPHANET": "http://identifiers.org/orphanet/", -# "PANTHER.FAMILY": "http://identifiers.org/panther.family/", -# "PANTHER.PATHWAY": "http://identifiers.org/panther.pathway/", -# "PATO-PROPERTY": "http://purl.obolibrary.org/obo/pato#", -# "PDQ": "https://www.cancer.gov/publications/pdq#", -# "PHARMGKB.DRUG": "http://identifiers.org/pharmgkb.drug/", -# "PHARMGKB.PATHWAYS": "http://identifiers.org/pharmgkb.pathways/", -# "PHAROS": "http://pharos.nih.gov", -# "PMID": "http://www.ncbi.nlm.nih.gov/pubmed/", -# "PO": "http://purl.obolibrary.org/obo/PO_", -# "POMBASE": "http://identifiers.org/pombase/", -# "PR": "http://purl.obolibrary.org/obo/PR_", -# "PUBCHEM.COMPOUND": "http://identifiers.org/pubchem.compound/", -# "PUBCHEM.SUBSTANCE": "http://identifiers.org/pubchem.substance/", -# "PathWhiz": "http://smpdb.ca/pathways/#", -# "REACT": "http://www.reactome.org/PathwayBrowser/#/", -# "REPODB": "http://apps.chiragjpgroup.org/repoDB/", -# "RGD": "http://identifiers.org/rgd/", -# "RHEA": "http://identifiers.org/rhea/", -# "RNACENTRAL": "http://identifiers.org/rnacentral/", -# "RO": "http://purl.obolibrary.org/obo/RO_", -# "RTXKG1": "http://kg1endpoint.rtx.ai/", -# "RXNORM": "http://purl.bioontology.org/ontology/RXNORM/", -# "ResearchID": "https://publons.com/researcher/", -# "SEMMEDDB": "https://skr3.nlm.nih.gov/SemMedDB", -# "SGD": "http://identifiers.org/sgd/", -# "SIO": "http://semanticscience.org/resource/SIO_", -# "SMPDB": "http://identifiers.org/smpdb/", -# "SNOMEDCT": "http://identifiers.org/snomedct/", -# "SNPEFF": "http://translator.ncats.nih.gov/SNPEFF_", -# "ScopusID": "https://www.scopus.com/authid/detail.uri?authorId=", -# "TAXRANK": "http://purl.obolibrary.org/obo/TAXRANK_", -# "UBERGRAPH": "http://translator.renci.org/ubergraph-axioms.ofn#", -# "UBERON": "http://purl.obolibrary.org/obo/UBERON_", -# "UBERON_CORE": "http://purl.obolibrary.org/obo/uberon/core#", -# "UMLS": "http://identifiers.org/umls/", -# "UMLSSC": "https://metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt/code#", -# "UMLSSG": "https://metamap.nlm.nih.gov/Docs/SemGroups_2018.txt/group#", -# "UMLSST": "https://metamap.nlm.nih.gov/Docs/SemanticTypes_2018AB.txt/type#", -# "UNII": "http://identifiers.org/unii/", -# "UPHENO": "http://purl.obolibrary.org/obo/UPHENO_", -# "UniProtKB": "http://identifiers.org/uniprot/", -# "VANDF": "https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/VANDF/", -# "VMC": "https://github.com/ga4gh/vr-spec/", -# "WB": "http://identifiers.org/wb/", -# "WBPhenotype": "http://purl.obolibrary.org/obo/WBPhenotype_", -# "WBVocab": "http://bio2rdf.org/wormbase_vocabulary", -# "WIKIDATA": "https://www.wikidata.org/wiki/", -# "WIKIDATA_PROPERTY": "https://www.wikidata.org/wiki/Property:", -# "WIKIPATHWAYS": "http://identifiers.org/wikipathways/", -# "WormBase": "https://www.wormbase.org/get?name=", -# "ZFIN": "http://identifiers.org/zfin/", -# "ZP": "http://purl.obolibrary.org/obo/ZP_", -# "alliancegenome": "https://www.alliancegenome.org/", -# 
"biolink": "https://w3id.org/biolink/vocab/", -# "biolinkml": "https://w3id.org/biolink/biolinkml/", -# "chembio": "http://translator.ncats.nih.gov/chembio_", -# "dcterms": "http://purl.org/dc/terms/", -# "dictyBase": "http://dictybase.org/gene/", -# "doi": "https://doi.org/", -# "fabio": "http://purl.org/spar/fabio/", -# "foaf": "http://xmlns.com/foaf/0.1/", -# "foodb.compound": "http://foodb.ca/compounds/", -# "gff3": "https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md#", -# "gpi": "https://github.com/geneontology/go-annotation/blob/master/specs/gpad-gpi-2-0.md#", -# "gtpo": "https://rdf.guidetopharmacology.org/ns/gtpo#", -# "hetio": "http://translator.ncats.nih.gov/hetio_", -# "interpro": "https://www.ebi.ac.uk/interpro/entry/", -# "isbn": "https://www.isbn-international.org/identifier/", -# "isni": "https://isni.org/isni/", -# "issn": "https://portal.issn.org/resource/ISSN/", -# "medgen": "https://www.ncbi.nlm.nih.gov/medgen/", -# "oboformat": "http://www.geneontology.org/formats/oboInOWL#", -# "pav": "http://purl.org/pav/", -# "prov": "http://www.w3.org/ns/prov#", -# "qud": "http://qudt.org/1.1/schema/qudt#", -# "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", -# "rdfs": "http://www.w3.org/2000/01/rdf-schema#", -# "skos": "https://www.w3.org/TR/skos-reference/#", -# "wgs": "http://www.w3.org/2003/01/geo/wgs84_pos", -# "xsd": "http://www.w3.org/2001/XMLSchema#", -# "@vocab": "https://w3id.org/biolink/vocab/"} - -# @staticmethod -# def get_curie_purl(curie): -# # Split into prefix and suffix -# suffix = curie.split(":")[1] -# prefix = curie.split(":")[0] - -# # Check to see if the prefix exists in the hash -# if prefix not in BioLinkPURLerizer.biolink_lookup: -# return None - -# return f"{BioLinkPURLerizer.biolink_lookup[prefix]}{suffix}" From a18670e3e463ae43272b92589ce9f8a8a244b68c Mon Sep 17 00:00:00 2001 From: braswent Date: Thu, 4 Jan 2024 12:28:17 -0500 Subject: [PATCH 60/85] feat: updated elasticsearch auth protocol to latest version --- setup.cfg | 2 +- src/dug/core/async_search.py | 4 ++-- src/dug/core/index.py | 4 ++-- tests/integration/conftest.py | 16 ++++++++-------- tests/integration/test_index.py | 2 +- tests/unit/conftest.py | 11 +++++------ tests/unit/test_api.py | 13 +++++++------ tests/unit/test_core/test_search.py | 3 ++- 8 files changed, 28 insertions(+), 27 deletions(-) diff --git a/setup.cfg b/setup.cfg index 75fe4d2..0df3d5d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,7 +17,7 @@ classifiers = package_dir = = src packages = find: -python_requires = >=3.11 +python_requires = >=3.10 include_package_data = true install_requires = elasticsearch==8.5.2 diff --git a/src/dug/core/async_search.py b/src/dug/core/async_search.py index 44d7c98..b39e6a9 100644 --- a/src/dug/core/async_search.py +++ b/src/dug/core/async_search.py @@ -50,12 +50,12 @@ def __init__(self, cfg: Config, indices=None): cafile=self._cfg.elastic_ca_path ) self.es = AsyncElasticsearch(hosts=self.hosts, - http_auth=(self._cfg.elastic_username, + basic_auth=(self._cfg.elastic_username, self._cfg.elastic_password), ssl_context=ssl_context) else: self.es = AsyncElasticsearch(hosts=self.hosts, - http_auth=(self._cfg.elastic_username, + basic_auth=(self._cfg.elastic_username, self._cfg.elastic_password)) async def dump_concepts(self, index, query={}, size=None, diff --git a/src/dug/core/index.py b/src/dug/core/index.py index 93a2d58..0491d06 100644 --- a/src/dug/core/index.py +++ b/src/dug/core/index.py @@ -30,12 +30,12 @@ def __init__(self, cfg: Config, indices=None): ) 
self.es = Elasticsearch( hosts=self.hosts, - http_auth=(self._cfg.elastic_username, self._cfg.elastic_password), + basic_auth=(self._cfg.elastic_username, self._cfg.elastic_password), ssl_context=ssl_context) else: self.es = Elasticsearch( hosts=self.hosts, - http_auth=(self._cfg.elastic_username, self._cfg.elastic_password)) + basic_auth=(self._cfg.elastic_username, self._cfg.elastic_password)) self.replicas = self.get_es_node_count() if self.es.ping(): diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 7bc0bcf..b671e3f 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -5,7 +5,7 @@ from dataclasses import dataclass from typing import Dict -import pytest +import pytest_asyncio TEST_DATA_DIR = Path(__file__).parent.resolve() / "data" @@ -45,7 +45,7 @@ def post(self, url, params: dict = None, json: dict = {}): return MockResponse(text, status_code=status_code) -@pytest.fixture +@pytest_asyncio.fixture def monarch_annotator_api(): base_url = "http://annotator.api/?content={query}" @@ -94,7 +94,7 @@ def _(keyword): ) -@pytest.fixture +@pytest_asyncio.fixture def token_classifier_api(): return MockApiService( urls={ @@ -118,7 +118,7 @@ def token_classifier_api(): ) -@pytest.fixture +@pytest_asyncio.fixture def sapbert_annotator_api(): return MockApiService( urls={ @@ -145,7 +145,7 @@ def sapbert_annotator_api(): ) -@pytest.fixture +@pytest_asyncio.fixture def normalizer_api(): base_url = "http://normalizer.api/?curie={curie}" @@ -188,7 +188,7 @@ def _(curie): ) -@pytest.fixture +@pytest_asyncio.fixture def null_normalizer_api(): base_url = "http://normalizer.api/?curie={curie}" @@ -211,7 +211,7 @@ def _(curie): ) -@pytest.fixture +@pytest_asyncio.fixture def synonym_api(): return MockApiService( urls={ @@ -234,7 +234,7 @@ def synonym_api(): ) -@pytest.fixture +@pytest_asyncio.fixture def null_synonym_api(): return MockApiService( urls={"http://synonyms.api": [json.dumps({"XAO:0000336": {"names":[]}}), 200]} diff --git a/tests/integration/test_index.py b/tests/integration/test_index.py index 31d0d3d..829e4ba 100644 --- a/tests/integration/test_index.py +++ b/tests/integration/test_index.py @@ -21,7 +21,7 @@ def is_elastic_up(): try: es = Elasticsearch( hosts=hosts, - http_auth=(username, password) + basic_auth=(username, password) ) return es.ping() except Exception: diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index f40d4f6..87f2edc 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -3,8 +3,7 @@ from dataclasses import dataclass from typing import Dict -import pytest - +import pytest_asyncio @dataclass class MockResponse: @@ -41,7 +40,7 @@ def post(self, url, params: dict = None, json: dict = {}): return MockResponse(text, status_code=status_code) -@pytest.fixture +@pytest_asyncio.fixture def annotator_api(): base_url = "http://annotator.api/?content={query}" @@ -150,7 +149,7 @@ def _(keyword): ) -@pytest.fixture +@pytest_asyncio.fixture def normalizer_api(): base_url = "http://normalizer.api/?curie={curie}" @@ -193,7 +192,7 @@ def _(curie): ) -@pytest.fixture +@pytest_asyncio.fixture def synonym_api(): return MockApiService( urls={ @@ -216,7 +215,7 @@ def synonym_api(): ) -@pytest.fixture() +@pytest_asyncio.fixture() def ontology_api(): base_url = "http://ontology.api/?curie={curie}" diff --git a/tests/unit/test_api.py b/tests/unit/test_api.py index e55b688..cd35ba3 100644 --- a/tests/unit/test_api.py +++ b/tests/unit/test_api.py @@ -6,24 +6,25 @@ import pytest pytest.skip("skipping as dug.api is no 
longer present", allow_module_level=True) from pytest import mark +import pytest_asyncio from dug.api import app, main, DugResource -@pytest.fixture +@pytest_asyncio.fixture def dug_api_test_client(): with app.test_client() as client: yield client -@pytest.fixture +@pytest_asyncio.fixture def mock_g_object(): with patch('dug.api.dug') as g: yield g -@pytest.fixture +@pytest_asyncio.fixture def mock_search_concepts(mock_g_object): mock_g_object().search_concepts.return_value = {'hits': {'hits': [ {'_type': '_doc', @@ -38,21 +39,21 @@ def mock_search_concepts(mock_g_object): }} -@pytest.fixture +@pytest_asyncio.fixture def mock_search_kg(mock_g_object): mock_g_object().search_kg.return_value = {'hits': {'hits': [ {'_type': '_doc', '_id': 'MEDDRA:10047249'} ]}} -@pytest.fixture +@pytest_asyncio.fixture def mock_search_variables(mock_g_object): mock_g_object().search_variables.return_value = {'hits': {'hits': [ {'_type': '_doc', '_id': 'MEDDRA:10047249'} ]}} -@pytest.fixture +@pytest_asyncio.fixture def mock_agg_data_types(mock_g_object): mock_g_object().agg_data_type.return_value = ["DBGaP"] diff --git a/tests/unit/test_core/test_search.py b/tests/unit/test_core/test_search.py index b7edc83..db7ed75 100644 --- a/tests/unit/test_core/test_search.py +++ b/tests/unit/test_core/test_search.py @@ -3,6 +3,7 @@ from unittest.mock import patch import pytest +import pytest_asyncio from dug.core.index import Index, SearchException from dug.config import Config @@ -95,7 +96,7 @@ def search(self, index, body, **kwargs): return {"results": {k: v for k, v in values.items() if body in v}} -@pytest.fixture +@pytest_asyncio.fixture def elastic(): with patch("dug.core.index.Elasticsearch") as es_class: es_instance = MockElastic(indices=MockIndices()) From 4c4977d83609faee2fbc8c505d23025d23c49352 Mon Sep 17 00:00:00 2001 From: braswent Date: Thu, 4 Jan 2024 13:05:57 -0500 Subject: [PATCH 61/85] feat: change annotator config to allow for different configs --- src/dug/config.py | 137 +++++++++++-------- src/dug/core/annotators/__init__.py | 12 +- src/dug/core/annotators/_base.py | 1 - src/dug/core/annotators/monarch_annotator.py | 4 +- src/dug/core/annotators/sapbert_annotator.py | 5 +- 5 files changed, 95 insertions(+), 64 deletions(-) diff --git a/src/dug/config.py b/src/dug/config.py index 5f4d59d..5f49e9e 100644 --- a/src/dug/config.py +++ b/src/dug/config.py @@ -9,8 +9,9 @@ @dataclass class Config: """ - TODO: Populate description + TODO: Populate description """ + elastic_password: str = "changeme" redis_password: str = "changeme" @@ -27,74 +28,102 @@ class Config: nboost_port: int = 8000 # Preprocessor config that will be passed to annotate.Preprocessor constructor - preprocessor: dict = field(default_factory=lambda: { - "debreviator": { - "BMI": "body mass index" - }, - "stopwords": ["the"] - }) - + preprocessor: dict = field( + default_factory=lambda: { + "debreviator": {"BMI": "body mass index"}, + "stopwords": ["the"], + } + ) + annotator_type: str = "annotator-monarch" # Annotator config that will be passed to annotate.Annotator constructor - annotator: dict = field(default_factory=lambda: { - "url": "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" - }) + annotator_args: dict = field( + default_factory=lambda: { + "annotator-monarch": { + "url": 
"https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" + }, + "annotator-sapbert": { + "classificationUrl": "https://med-nemo.apps.renci.org/annotate/", + "annotatorUrl": "https://babel-sapbert.apps.renci.org/annotate/", + }, + } + ) # Normalizer config that will be passed to annotate.Normalizer constructor - normalizer: dict = field(default_factory=lambda: { - "url": "https://nodenormalization-dev.apps.renci.org/get_normalized_nodes?conflate=false&description=true&curie=" - }) + normalizer: dict = field( + default_factory=lambda: { + "url": "https://nodenormalization-dev.apps.renci.org/get_normalized_nodes?conflate=false&description=true&curie=" + } + ) # Synonym service config that will be passed to annotate.SynonymHelper constructor - synonym_service: dict = field(default_factory=lambda: { - "url": "https://name-resolution-sri.renci.org/reverse_lookup" - }) + synonym_service: dict = field( + default_factory=lambda: { + "url": "https://name-resolution-sri.renci.org/reverse_lookup" + } + ) # Ontology metadata helper config that will be passed to annotate.OntologyHelper constructor - ontology_helper: dict = field(default_factory=lambda: { - "url": "https://api.monarchinitiative.org/api/bioentity/" - }) + ontology_helper: dict = field( + default_factory=lambda: { + "url": "https://api.monarchinitiative.org/api/bioentity/" + } + ) # Redlist of identifiers not to expand via TranQL tranql_exclude_identifiers: list = field(default_factory=lambda: ["CHEBI:17336"]) - tranql_queries: dict = field(default_factory=lambda: { - "disease": ["disease", "phenotypic_feature"], - "pheno": ["phenotypic_feature", "disease"], - "anat": ["disease", "anatomical_entity"], - "chem_to_disease": ["chemical_entity", "disease"], - "small_molecule_to_disease": ["small_molecule", "disease"], - "chemical_mixture_to_disease": ["chemical_mixture", "disease"], - "phen_to_anat": ["phenotypic_feature", "anatomical_entity"], - }) - - node_to_element_queries: dict = field(default_factory=lambda: { - # Dug element type to cast the query kg nodes to - "cde": { - # Parse nodes matching criteria in kg - "node_type": "biolink:Publication", - "curie_prefix": "HEALCDE", - # list of attributes that are lists to be casted to strings - "list_field_choose_first": [ - "files" - ], - "attribute_mapping": { - # "DugElement Attribute" : "KG Node attribute" - "name": "name", - "desc": "summary", - "collection_name": "cde_category", - "collection_id": "cde_category", - "action": "files" + tranql_queries: dict = field( + default_factory=lambda: { + "disease": ["disease", "phenotypic_feature"], + "pheno": ["phenotypic_feature", "disease"], + "anat": ["disease", "anatomical_entity"], + "chem_to_disease": ["chemical_entity", "disease"], + "small_molecule_to_disease": ["small_molecule", "disease"], + "chemical_mixture_to_disease": ["chemical_mixture", "disease"], + "phen_to_anat": ["phenotypic_feature", "anatomical_entity"], + } + ) + + node_to_element_queries: dict = field( + default_factory=lambda: { + # Dug element type to cast the query kg nodes to + "cde": { + # Parse nodes matching criteria in kg + "node_type": "biolink:Publication", + "curie_prefix": "HEALCDE", + # list of attributes that are lists to be casted to strings + "list_field_choose_first": ["files"], + "attribute_mapping": { + # "DugElement Attribute" : "KG Node attribute" + "name": "name", + "desc": "summary", + "collection_name": "cde_category", + 
"collection_id": "cde_category", + "action": "files", + }, } } - }) + ) - concept_expander: dict = field(default_factory=lambda: { - "url": "https://tranql-dev.renci.org/tranql/query?dynamic_id_resolution=true&asynchronous=false", - "min_tranql_score": 0.0 - }) + concept_expander: dict = field( + default_factory=lambda: { + "url": "https://tranql-dev.renci.org/tranql/query?dynamic_id_resolution=true&asynchronous=false", + "min_tranql_score": 0.0, + } + ) # List of ontology types that can be used even if they fail normalization - ontology_greenlist: list = field(default_factory=lambda: ["PATO", "CHEBI", "MONDO", "UBERON", "HP", "MESH", "UMLS"]) + ontology_greenlist: list = field( + default_factory=lambda: [ + "PATO", + "CHEBI", + "MONDO", + "UBERON", + "HP", + "MESH", + "UMLS", + ] + ) @classmethod def from_env(cls): @@ -107,7 +136,7 @@ def from_env(cls): "elastic_password": "ELASTIC_PASSWORD", "redis_host": "REDIS_HOST", "redis_port": "REDIS_PORT", - "redis_password": "REDIS_PASSWORD" + "redis_password": "REDIS_PASSWORD", } kwargs = {} diff --git a/src/dug/core/annotators/__init__.py b/src/dug/core/annotators/__init__.py index 1a58c40..903825b 100644 --- a/src/dug/core/annotators/__init__.py +++ b/src/dug/core/annotators/__init__.py @@ -14,8 +14,8 @@ @hookimpl def define_annotators(annotator_dict: Dict[str, Annotator]): - annotator_dict["annotator-monarch"] = build_monarch_annotator() - annotator_dict["annotator-sapbert"] = build_sapbert_annotator() + annotator_dict["annotator-monarch"] = build_monarch_annotator("annotator-monarch") + annotator_dict["annotator-sapbert"] = build_sapbert_annotator("annotator-sapbert") class AnnotatorNotFoundException(Exception): @@ -29,6 +29,7 @@ def get_annotator(hook, annotator_name) -> Annotator: hook.define_annotators(annotator_dict=available_annotators) annotator = available_annotators.get(annotator_name.lower()) if annotator is not None: + logger.info(f'Annotating with {annotator}') return annotator err_msg = f"Cannot find annotator of type '{annotator_name}'\n" \ @@ -36,21 +37,22 @@ def get_annotator(hook, annotator_name) -> Annotator: logger.error(err_msg) raise AnnotatorNotFoundException(err_msg) -def build_monarch_annotator(): +def build_monarch_annotator(annotate_type): config = Config.from_env() annotator = AnnotateMonarch( normalizer=DefaultNormalizer(**config.normalizer), synonym_finder=DefaultSynonymFinder(**config.synonym_service), config=config, + **config.annotator_args[annotate_type] ) - return annotator -def build_sapbert_annotator(): +def build_sapbert_annotator(annotate_type): config = Config.from_env() annotator = AnnotateSapbert( normalizer=DefaultNormalizer(**config.normalizer), synonym_finder=DefaultSynonymFinder(**config.synonym_service), + **config.annotator_args[annotate_type] ) return annotator diff --git a/src/dug/core/annotators/_base.py b/src/dug/core/annotators/_base.py index c725bff..ea30b4d 100644 --- a/src/dug/core/annotators/_base.py +++ b/src/dug/core/annotators/_base.py @@ -13,7 +13,6 @@ logging.getLogger("requests").setLevel(logging.WARNING) logging.getLogger("urllib3").setLevel(logging.WARNING) - class DugIdentifier: """ The Dug Identifier is the core piece of information about a concept that produced from a request to an annotator based on a some original source of data. \n The information that is being stored is mostly meant to support the Monarch API but should be adjusted accordingly to suit new Annotators needs in the future. 
diff --git a/src/dug/core/annotators/monarch_annotator.py b/src/dug/core/annotators/monarch_annotator.py index 1c67f40..e50e317 100644 --- a/src/dug/core/annotators/monarch_annotator.py +++ b/src/dug/core/annotators/monarch_annotator.py @@ -21,9 +21,10 @@ def __init__( synonym_finder, config, ontology_greenlist=[], + **kwargs ): - self.annotatorUrl = config.annotator['url'] + self.annotatorUrl = kwargs['url'] self.normalizer = normalizer self.synonym_finder = synonym_finder self.ontology_greenlist = ontology_greenlist @@ -42,7 +43,6 @@ def __init__( self.stopwords = stopwords def __call__(self, text, http_session) -> List[DugIdentifier]: - # Preprocess text (debraviate, remove stopwords, etc.) text = self.preprocess_text(text) diff --git a/src/dug/core/annotators/sapbert_annotator.py b/src/dug/core/annotators/sapbert_annotator.py index 7c2fa81..a677140 100644 --- a/src/dug/core/annotators/sapbert_annotator.py +++ b/src/dug/core/annotators/sapbert_annotator.py @@ -22,9 +22,10 @@ def __init__( normalizer, synonym_finder, ontology_greenlist=[], + **kwargs ): - self.classificationUrl = "https://med-nemo.apps.renci.org/annotate/" - self.annotatorUrl = "https://babel-sapbert.apps.renci.org/annotate/" + self.classificationUrl = kwargs['classificationUrl'] + self.annotatorUrl = kwargs['annotatorUrl'] self.normalizer = normalizer self.synonym_finder = synonym_finder self.ontology_greenlist = ontology_greenlist From 4eb6d2e919bdc903cc9b7d3b585020bd52d8fd7b Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Thu, 4 Jan 2024 15:02:31 -0500 Subject: [PATCH 62/85] pass down config , no global access --- src/dug/core/__init__.py | 2 +- src/dug/core/annotators/__init__.py | 18 +++++++++--------- src/dug/hookspecs.py | 3 ++- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/dug/core/__init__.py b/src/dug/core/__init__.py index 9fca7ce..effcb7b 100644 --- a/src/dug/core/__init__.py +++ b/src/dug/core/__init__.py @@ -63,7 +63,7 @@ def crawl(self, target_name: str, parser_type: str, annotator_type: str, element pm = get_plugin_manager() parser = get_parser(pm.hook, parser_type) - annotator = get_annotator(pm.hook, annotator_type) + annotator = get_annotator(pm.hook, annotator_type, self._factory.config) targets = get_targets(target_name) for target in targets: diff --git a/src/dug/core/annotators/__init__.py b/src/dug/core/annotators/__init__.py index 903825b..acb7823 100644 --- a/src/dug/core/annotators/__init__.py +++ b/src/dug/core/annotators/__init__.py @@ -13,20 +13,20 @@ hookimpl = pluggy.HookimplMarker("dug") @hookimpl -def define_annotators(annotator_dict: Dict[str, Annotator]): - annotator_dict["annotator-monarch"] = build_monarch_annotator("annotator-monarch") - annotator_dict["annotator-sapbert"] = build_sapbert_annotator("annotator-sapbert") +def define_annotators(annotator_dict: Dict[str, Annotator], config: Config): + annotator_dict["annotator-monarch"] = build_monarch_annotator("annotator-monarch", config=config) + annotator_dict["annotator-sapbert"] = build_sapbert_annotator("annotator-sapbert", config=config) class AnnotatorNotFoundException(Exception): ... 
-def get_annotator(hook, annotator_name) -> Annotator: +def get_annotator(hook, annotator_name, config: Config) -> Annotator: """Get the annotator from all annotators registered via the define_annotators hook""" available_annotators = {} - hook.define_annotators(annotator_dict=available_annotators) + hook.define_annotators(annotator_dict=available_annotators, config=config) annotator = available_annotators.get(annotator_name.lower()) if annotator is not None: logger.info(f'Annotating with {annotator}') @@ -37,8 +37,8 @@ def get_annotator(hook, annotator_name) -> Annotator: logger.error(err_msg) raise AnnotatorNotFoundException(err_msg) -def build_monarch_annotator(annotate_type): - config = Config.from_env() +def build_monarch_annotator(annotate_type: str, config: Config): + logger.info(f"Building Monarch annotator with args: {config.annotator_args[annotate_type]}") annotator = AnnotateMonarch( normalizer=DefaultNormalizer(**config.normalizer), synonym_finder=DefaultSynonymFinder(**config.synonym_service), @@ -47,8 +47,8 @@ def build_monarch_annotator(annotate_type): ) return annotator -def build_sapbert_annotator(annotate_type): - config = Config.from_env() +def build_sapbert_annotator(annotate_type, config: Config): + logger.info(f"Building Sapbert annotator with args: {config.annotator_args[annotate_type]}") annotator = AnnotateSapbert( normalizer=DefaultNormalizer(**config.normalizer), synonym_finder=DefaultSynonymFinder(**config.synonym_service), diff --git a/src/dug/hookspecs.py b/src/dug/hookspecs.py index 96b984b..9687b15 100644 --- a/src/dug/hookspecs.py +++ b/src/dug/hookspecs.py @@ -4,6 +4,7 @@ from dug.core.parsers import Parser from dug.core.annotators import Annotator +from dug.config import Config hookspec = pluggy.HookspecMarker("dug") @@ -15,7 +16,7 @@ def define_parsers(parser_dict: Dict[str, Parser]): ... @hookspec -def define_annotators(annotator_dict: Dict[str, Annotator]): +def define_annotators(annotator_dict: Dict[str, Annotator], config: Config): """Defines what Annotators are available to Dug """ ... 
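A minimal sketch of resolving an annotator through the plugin hook with the config passed explicitly, as crawl() does after this change; it assumes dug is installed and that get_plugin_manager is importable from dug.core, where crawl() invokes it:

from dug.config import Config
from dug.core import get_plugin_manager  # assumption: exposed here, as used by crawl()
from dug.core.annotators import get_annotator

config = Config.from_env()
pm = get_plugin_manager()
# "annotator-monarch" is one of the names registered via the define_annotators hook
annotator = get_annotator(pm.hook, "annotator-monarch", config)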
From 0147fae6231ca60dacde1bcf2746605475182cb9 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Thu, 4 Jan 2024 15:12:55 -0500 Subject: [PATCH 63/85] remove `-` from annotator names --- src/dug/cli.py | 2 +- src/dug/config.py | 6 +++--- src/dug/core/annotators/__init__.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/dug/cli.py b/src/dug/cli.py index 4fd5923..f211e3a 100755 --- a/src/dug/cli.py +++ b/src/dug/cli.py @@ -55,7 +55,7 @@ def get_argparser(): '-a', '--annotator', help='Annotator used to annotate identifiers in crawl file', dest="annotator_type", - default="annotator-monarch" + default="monarch" ) crawl_parser.add_argument( diff --git a/src/dug/config.py b/src/dug/config.py index 5f49e9e..92e404d 100644 --- a/src/dug/config.py +++ b/src/dug/config.py @@ -34,14 +34,14 @@ class Config: "stopwords": ["the"], } ) - annotator_type: str = "annotator-monarch" + annotator_type: str = "monarch" # Annotator config that will be passed to annotate.Annotator constructor annotator_args: dict = field( default_factory=lambda: { - "annotator-monarch": { + "monarch": { "url": "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" }, - "annotator-sapbert": { + "sapbert": { "classificationUrl": "https://med-nemo.apps.renci.org/annotate/", "annotatorUrl": "https://babel-sapbert.apps.renci.org/annotate/", }, diff --git a/src/dug/core/annotators/__init__.py b/src/dug/core/annotators/__init__.py index acb7823..60b43df 100644 --- a/src/dug/core/annotators/__init__.py +++ b/src/dug/core/annotators/__init__.py @@ -14,8 +14,8 @@ @hookimpl def define_annotators(annotator_dict: Dict[str, Annotator], config: Config): - annotator_dict["annotator-monarch"] = build_monarch_annotator("annotator-monarch", config=config) - annotator_dict["annotator-sapbert"] = build_sapbert_annotator("annotator-sapbert", config=config) + annotator_dict["monarch"] = build_monarch_annotator("monarch", config=config) + annotator_dict["sapbert"] = build_sapbert_annotator("sapbert", config=config) class AnnotatorNotFoundException(Exception): From 80e35ae6c76825028a1bdade90a8d498f2e9df2d Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Thu, 4 Jan 2024 17:47:19 -0500 Subject: [PATCH 64/85] normalize args for sapbert so it becomes easier parsing from env --- src/dug/config.py | 4 ++-- src/dug/core/annotators/sapbert_annotator.py | 8 +++++-- src/dug/core/factory.py | 22 -------------------- 3 files changed, 8 insertions(+), 26 deletions(-) diff --git a/src/dug/config.py b/src/dug/config.py index 92e404d..b070cac 100644 --- a/src/dug/config.py +++ b/src/dug/config.py @@ -42,8 +42,8 @@ class Config: "url": "https://api.monarchinitiative.org/api/nlp/annotate/entities?min_length=4&longest_only=false&include_abbreviation=false&include_acronym=false&include_numbers=false&content=" }, "sapbert": { - "classificationUrl": "https://med-nemo.apps.renci.org/annotate/", - "annotatorUrl": "https://babel-sapbert.apps.renci.org/annotate/", + "classification_url": "https://med-nemo.apps.renci.org/annotate/", + "annotator_url": "https://babel-sapbert.apps.renci.org/annotate/", }, } ) diff --git a/src/dug/core/annotators/sapbert_annotator.py b/src/dug/core/annotators/sapbert_annotator.py index a677140..6f2c93a 100644 --- a/src/dug/core/annotators/sapbert_annotator.py +++ b/src/dug/core/annotators/sapbert_annotator.py @@ -24,8 +24,12 @@ def __init__( ontology_greenlist=[], **kwargs ): - self.classificationUrl = 
kwargs['classificationUrl'] - self.annotatorUrl = kwargs['annotatorUrl'] + self.classificationUrl = kwargs.get('classification_url') + self.annotatorUrl = kwargs.get('annotator_url') + if not self.classificationUrl: + raise TypeError('Classification url needs to be defined for sapbert annotator') + if not self.annotatorUrl: + raise TypeError('Annotator url needs to be defined for sapbert annotator') self.normalizer = normalizer self.synonym_finder = synonym_finder self.ontology_greenlist = ontology_greenlist diff --git a/src/dug/core/factory.py b/src/dug/core/factory.py index 6037f97..0bedab2 100644 --- a/src/dug/core/factory.py +++ b/src/dug/core/factory.py @@ -4,12 +4,6 @@ from requests_cache import CachedSession import dug.core.tranql as tql -# from dug.core.annotate import (DugAnnotator, -# # Annotator, -# Normalizer, -# Preprocessor, -# SynonymFinder, -# ConceptExpander) from dug.core.concept_expander import ConceptExpander from dug.config import Config as DugConfig, TRANQL_SOURCE from dug.core.crawler import Crawler @@ -53,22 +47,6 @@ def build_crawler(self, target, parser: Parser, annotator: Annotator, element_ty return crawler - # def build_annotator(self) -> Annotator: - - # preprocessor = Preprocessor(**self.config.preprocessor) - # annotator = Annotate(**self.config.annotator) - # normalizer = Normalizer(**self.config.normalizer) - # synonym_finder = SynonymFinder(**self.config.synonym_service) - - # annotator = Annotator( - # preprocessor=preprocessor, - # annotator=annotator, - # normalizer=normalizer, - # synonym_finder=synonym_finder - # ) - - # return annotator - def build_tranqlizer(self) -> ConceptExpander: return ConceptExpander(**self.config.concept_expander) From 096ba478f60c91d96dc26ead81d07da47c07b15c Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Wed, 10 Jan 2024 19:26:55 -0500 Subject: [PATCH 65/85] Sorted lists for json serialization for parser and annotator outputs --- src/dug/core/annotators/_base.py | 44 +++++++++++++++++++++++++------- src/dug/core/parsers/_base.py | 26 +++++++++++++++++-- 2 files changed, 59 insertions(+), 11 deletions(-) diff --git a/src/dug/core/annotators/_base.py b/src/dug/core/annotators/_base.py index c725bff..cc4fc18 100644 --- a/src/dug/core/annotators/_base.py +++ b/src/dug/core/annotators/_base.py @@ -15,15 +15,26 @@ class DugIdentifier: - """ The Dug Identifier is the core piece of information about a concept that produced from a request to an annotator based on a some original source of data. - \n The information that is being stored is mostly meant to support the Monarch API but should be adjusted accordingly to suit new Annotators needs in the future. + """Core information about a concept, produced from annotator request + + The Dug Identifier is the core piece of information about a concept that + produced from a request to an annotator based on a some original source of + data. + + \n The information that is being stored is mostly meant to support the + Monarch API but should be adjusted accordingly to suit new Annotators needs + in the future. \n The information that will be needed for all annotators are: \n id: The CURIE identifier \n label: The CURIE identifier \n description: The CURIE identifier - \n When there is another supported Normalizer it will be seperated into a separate plugin like annotator. + \n When there is another supported Normalizer it will be seperated into a + separate plugin like annotator. 
""" + def __init__(self, id, label, types=None, search_text="", description=""): + "custom init stores parameters to initial values" + self.id = id self.label = label self.description = description @@ -40,12 +51,12 @@ def id_type(self): return self.id.split(":")[0] def add_search_text(self, text): - # Add text only if it's unique and if not empty string + "Add text only if it's unique and if not empty string" if text and text not in self.search_text: self.search_text.append(text) def get_searchable_dict(self): - # Return a version of the identifier compatible with what's in ElasticSearch + "Return version of identifier compatible with what's in ElasticSearch" es_ident = { "id": self.id, "label": self.label, @@ -56,7 +67,13 @@ def get_searchable_dict(self): return es_ident def jsonable(self): - return self.__dict__ + "Output pickleable object (used by utils.complex_handler)" + outdict = self.__dict__ + + outdict['search_text'] = sorted(self.search_text) + outdict['synonyms'] = sorted(self.synonyms) + + return outdict def __str__(self): return json.dumps(self.__dict__, indent=2, default=utils.complex_handler) @@ -82,9 +99,18 @@ def __call__(self, value: Input, http_session: Session) -> Output: class DefaultNormalizer(): - """ After annotation there must be a Noramlizing step to collasce equivalent concepts into one official concept. This is a needed step for the knowledge graph to map between different concepts. - \n The reason why this class in integrated into the annotators.py is because currently there is only one supported Normalizer through the NCATs Translator API. - \n When there is another supported Normalizer it will be seperated into a separate plugin like annotator. + """Default concept normalizer class + + After annotation there must be a Normalizing step to collasce equivalent + concepts into one official concept. This is a needed step for the knowledge + graph to map between different concepts. + + The reason why this class in integrated into the annotators.py is because + currently there is only one supported Normalizer through the NCATs + Translator API. + + When there is another supported Normalizer it will be seperated into a + separate plugin like annotator. """ def __init__(self, url): diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index acfc5bb..f827923 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -29,7 +29,18 @@ def add_concept(self, concept): self.concepts[concept.id] = concept def jsonable(self): - return self.__dict__ + """Output a pickleable object + + used by dug.utils.complex_handler. Because search_terms and + optional_terms are considered unsorted lists by the parsers but will be + treated as sorted lists by python, sorting the lists before output + prevents changes in ordering from being treated as a change in output by + incremental change detection. + """ + outdict = self.__dict__ + outdict['search_terms'] = sorted(self.search_terms) + outdict['optional_terms'] = sorted(self.optional_terms) + return outdict def get_searchable_dict(self): # Translate DugElement to ES-style dict @@ -132,7 +143,18 @@ def get_searchable_dict(self): return es_conc def jsonable(self): - return self.__dict__ + """Output a pickleable object + + used by dug.utils.complex_handler. 
Because search_terms and + optional_terms are considered unsorted lists by the parsers but will be + treated as sorted lists by python, sorting the lists before output + prevents changes in ordering from being treated as a change in output by + incremental change detection. + """ + outdict = self.__dict__ + outdict['search_terms'] = sorted(self.search_terms) + outdict['optional_terms'] = sorted(self.optional_terms) + return outdict def __str__(self): return json.dumps(self.__dict__, indent=2, default=utils.complex_handler) From 0b7b51fba6bb1b53c2260bd9a7703b675ba95881 Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Tue, 16 Jan 2024 15:34:00 -0500 Subject: [PATCH 66/85] Reverted jsonable, sorted lists on assignment and change, rather than on json output --- src/dug/core/annotators/_base.py | 10 +++----- src/dug/core/parsers/_base.py | 40 ++++++++------------------------ 2 files changed, 13 insertions(+), 37 deletions(-) diff --git a/src/dug/core/annotators/_base.py b/src/dug/core/annotators/_base.py index cc4fc18..cb4c7fd 100644 --- a/src/dug/core/annotators/_base.py +++ b/src/dug/core/annotators/_base.py @@ -41,7 +41,7 @@ def __init__(self, id, label, types=None, search_text="", description=""): if types is None: types = [] self.types = types - self.search_text = [search_text] if search_text else [] + self.search_text = sorted([search_text]) if search_text else [] self.equivalent_identifiers = [] self.synonyms = [] self.purl = "" @@ -53,7 +53,7 @@ def id_type(self): def add_search_text(self, text): "Add text only if it's unique and if not empty string" if text and text not in self.search_text: - self.search_text.append(text) + self.search_text = sorted(self.search_text + [text]) def get_searchable_dict(self): "Return version of identifier compatible with what's in ElasticSearch" @@ -68,12 +68,8 @@ def get_searchable_dict(self): def jsonable(self): "Output pickleable object (used by utils.complex_handler)" - outdict = self.__dict__ + return self.__dict__ - outdict['search_text'] = sorted(self.search_text) - outdict['synonyms'] = sorted(self.synonyms) - - return outdict def __str__(self): return json.dumps(self.__dict__, indent=2, default=utils.complex_handler) diff --git a/src/dug/core/parsers/_base.py b/src/dug/core/parsers/_base.py index f827923..a5262e5 100644 --- a/src/dug/core/parsers/_base.py +++ b/src/dug/core/parsers/_base.py @@ -29,18 +29,8 @@ def add_concept(self, concept): self.concepts[concept.id] = concept def jsonable(self): - """Output a pickleable object - - used by dug.utils.complex_handler. Because search_terms and - optional_terms are considered unsorted lists by the parsers but will be - treated as sorted lists by python, sorting the lists before output - prevents changes in ordering from being treated as a change in output by - incremental change detection. 
- """ - outdict = self.__dict__ - outdict['search_terms'] = sorted(self.search_terms) - outdict['optional_terms'] = sorted(self.optional_terms) - return outdict + """Output a pickleable object""" + return self.__dict__ def get_searchable_dict(self): # Translate DugElement to ES-style dict @@ -66,7 +56,7 @@ def set_search_terms(self): concept.set_search_terms() search_terms.extend(concept.search_terms) search_terms.append(concept.name) - search_terms = list(set(search_terms)) + search_terms = sorted(list(set(search_terms))) self.search_terms = search_terms def set_optional_terms(self): @@ -74,7 +64,7 @@ def set_optional_terms(self): for concept_id, concept in self.concepts.items(): concept.set_optional_terms() optional_terms.extend(concept.optional_terms) - optional_terms = list(set(optional_terms)) + optional_terms = sorted(list(set(optional_terms))) self.optional_terms = optional_terms def __str__(self): @@ -110,15 +100,15 @@ def add_kg_answer(self, answer, query_name): self.kg_answers[answer_id] = answer def clean(self): - self.search_terms = list(set(self.search_terms)) - self.optional_terms = list(set(self.optional_terms)) + self.search_terms = sorted(list(set(self.search_terms))) + self.optional_terms = sorted(list(set(self.optional_terms))) def set_search_terms(self): # Traverse set of identifiers to determine set of search terms search_terms = self.search_terms for ident_id, ident in self.identifiers.items(): search_terms.extend(ident.search_text + ident.synonyms) - self.search_terms = list(set(search_terms)) + self.search_terms = sorted(list(set(search_terms))) def set_optional_terms(self): # Traverse set of knowledge graph answers to determine set of optional search terms @@ -126,7 +116,7 @@ def set_optional_terms(self): for kg_id, kg_answer in self.kg_answers.items(): optional_terms += kg_answer.get_node_names() optional_terms += kg_answer.get_node_synonyms() - self.optional_terms = list(set(optional_terms)) + self.optional_terms = sorted(list(set(optional_terms))) def get_searchable_dict(self): # Translate DugConcept into Elastic-Compatible Concept @@ -143,18 +133,8 @@ def get_searchable_dict(self): return es_conc def jsonable(self): - """Output a pickleable object - - used by dug.utils.complex_handler. Because search_terms and - optional_terms are considered unsorted lists by the parsers but will be - treated as sorted lists by python, sorting the lists before output - prevents changes in ordering from being treated as a change in output by - incremental change detection. - """ - outdict = self.__dict__ - outdict['search_terms'] = sorted(self.search_terms) - outdict['optional_terms'] = sorted(self.optional_terms) - return outdict + """Output a pickleable object""" + return self.__dict__ def __str__(self): return json.dumps(self.__dict__, indent=2, default=utils.complex_handler) From 0bb708584203ae1ff653cdde9ecc64fd188278cf Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Wed, 17 Jan 2024 14:35:34 -0500 Subject: [PATCH 67/85] Trying bumps in Docker base images --- Dockerfile | 4 ++-- docker-compose.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index c009bc5..01faa15 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ # A container for the core semantic-search capability. 
# ###################################################### -FROM python:3.12.0-alpine3.18 +FROM python:3.12.1-alpine3.18 # Install required packages RUN apk update && \ @@ -31,4 +31,4 @@ RUN make install RUN make install.dug # Run it -ENTRYPOINT dug \ No newline at end of file +ENTRYPOINT dug diff --git a/docker-compose.yaml b/docker-compose.yaml index 8e59bd5..ccc22a3 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -56,7 +56,7 @@ services: ## ################################################################################# elasticsearch: - image: docker.elastic.co/elasticsearch/elasticsearch:8.5.2 + image: docker.elastic.co/elasticsearch/elasticsearch:8.11.3 networks: - dug-network environment: @@ -76,7 +76,7 @@ services: ## ################################################################################# redis: - image: 'redis/redis-stack:6.2.4-v2' + image: 'redis/redis-stack:6.2.14' networks: - dug-network environment: From ef0b74dde9646212804aac5899af3b4edaf3eeda Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Wed, 17 Jan 2024 17:10:02 -0500 Subject: [PATCH 68/85] Adding jsonpickle to requirements.txt --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index bac13a6..566a6b5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ elasticsearch[async]==8.5.2 gunicorn itsdangerous Jinja2 +jsonpickle jsonschema MarkupSafe ormar @@ -26,4 +27,4 @@ click httpx linkml-runtime==1.6.0 bmt==1.1.0 -urllib3 \ No newline at end of file +urllib3 From ebf9078f731d5e6fd3c566ae92058608b76c8b28 Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Mon, 22 Jan 2024 11:55:31 -0500 Subject: [PATCH 69/85] Moving required python version back to 3.11. --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index b551ef3..75fe4d2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,7 +17,7 @@ classifiers = package_dir = = src packages = find: -python_requires = >=3.12 +python_requires = >=3.11 include_package_data = true install_requires = elasticsearch==8.5.2 From 56b85df6fb01e53efabb93815a12733b705105a8 Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Mon, 22 Jan 2024 12:06:04 -0500 Subject: [PATCH 70/85] Changing image back to 3.11 as well --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 01faa15..4f21b36 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ # A container for the core semantic-search capability. # ###################################################### -FROM python:3.12.1-alpine3.18 +FROM python:3.11-alpine # Install required packages RUN apk update && \ From 8834423a0234b67267691a29f948afd3da51d2b8 Mon Sep 17 00:00:00 2001 From: "Michael T. 
Bacon" Date: Mon, 22 Jan 2024 13:45:14 -0500 Subject: [PATCH 71/85] Backing up redis image change to see if I can get dug auto-build to work again --- docker-compose.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yaml b/docker-compose.yaml index ccc22a3..8e8d27d 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -76,7 +76,7 @@ services: ## ################################################################################# redis: - image: 'redis/redis-stack:6.2.14' + image: 'redis/redis-stack:6.2.4-v2' networks: - dug-network environment: From 022f6988418e8fdfdc55e59c4d316209261261a1 Mon Sep 17 00:00:00 2001 From: Joshua Seals Date: Tue, 23 Jan 2024 04:48:34 -0500 Subject: [PATCH 72/85] Build all branches for testing, pushing only to docker. Fix tag bypass for build-push-release action --- .github/workflows/build-push-release.yml | 2 +- .github/workflows/code-checks.yml | 83 +++++++++++++----------- 2 files changed, 45 insertions(+), 40 deletions(-) diff --git a/.github/workflows/build-push-release.yml b/.github/workflows/build-push-release.yml index f23dc15..a383cef 100644 --- a/.github/workflows/build-push-release.yml +++ b/.github/workflows/build-push-release.yml @@ -18,7 +18,7 @@ on: - .dockerignore - .githooks tags-ignore: - - 'v[0-9]+.[0-9]+.*' + - '*' jobs: build-push-release: runs-on: ubuntu-latest diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 0dc8428..401c24c 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -66,45 +66,6 @@ jobs: # flake8 --ignore=E,W --exit-zero . continue-on-error: true -# ############################## build-vuln-test ############################## - # build-vuln-test: - # # needs: flake8-linter - # runs-on: ubuntu-latest - # steps: - # - uses: actions/checkout@v3 - - # - name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # with: - # driver-opts: | - # network=host - - # - name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_TOKEN }} - # logout: true - - # # Notes on Cache: - # # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache - # - name: Build Container - # uses: docker/build-push-action@v5 - # with: - # context: . 
- # push: false - # load: true - # tag: ${{ github.repository }}:vuln-test - # cache-from: type=registry,ref=${{ github.repository }}:buildcache - # cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max - # ####### Run for Fidelity ###### - # - name: Run Trivy vulnerability scanner - # uses: aquasecurity/trivy-action@master - # with: - # image-ref: '${{ github.repository }}:vuln-test' - # severity: 'CRITICAL,HIGH' - # exit-code: '1' - ################################### PYTEST ################################### pytest: runs-on: ubuntu-latest @@ -145,3 +106,47 @@ jobs: - name: Test with Bandit run: | bandit -r src -n3 -lll + +############################## test-image-build ############################## + test-image-build: + runs-on: ubuntu-latest + # if: ${{ github.actor == 'dependabot[bot]' }} + steps: + - uses: actions/checkout@v3 + + - name: Set short git commit SHA + id: vars + run: | + echo "short_sha=$(git rev-parse --short ${{ github.sha }})" >> $GITHUB_OUTPUT + # https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ + + - name: Confirm git commit SHA output + run: echo ${{ steps.vars.outputs.short_sha }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + logout: true + + - name: Parse Github Reference Name + id: branch + run: | + REF=${{ github.ref_name }} + echo "GHR=${REF%/*}" >> $GITHUB_OUTPUT + + # Notes on Cache: + # https://docs.docker.com/build/ci/github-actions/examples/#inline-cache + - name: Build Container + uses: docker/build-push-action@v5 + with: + context: . + push: true + tags: | + ${{ github.repository }}:test_${{ steps.branch.outputs.GHR }} + cache-from: type=registry,ref=${{ github.repository }}:buildcache + cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max \ No newline at end of file From ef8b7211f41d553b2a1280055334baf81ec57402 Mon Sep 17 00:00:00 2001 From: Joshua Seals Date: Tue, 23 Jan 2024 05:15:50 -0500 Subject: [PATCH 73/85] Testing alpine to fix trivy error --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 4f21b36..f34afa2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ # A container for the core semantic-search capability. # ###################################################### -FROM python:3.11-alpine +FROM python:3.12.0-alpine3.18 # Install required packages RUN apk update && \ From e16a347439523879da62bf8e16bd8f17d97e0699 Mon Sep 17 00:00:00 2001 From: Joshua Seals Date: Tue, 23 Jan 2024 05:24:52 -0500 Subject: [PATCH 74/85] Vuln confirmed in image, new docker image test --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index f34afa2..e8d1ce2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ # A container for the core semantic-search capability. # ###################################################### -FROM python:3.12.0-alpine3.18 +FROM python:alpine3.19 # Install required packages RUN apk update && \ From 5be0195dc23d05477d0f6102182fb536e5eff14b Mon Sep 17 00:00:00 2001 From: Joshua Seals Date: Tue, 23 Jan 2024 05:30:13 -0500 Subject: [PATCH 75/85] Is buildcache causing trivy failures? 
--- .github/workflows/trivy-pr-scan.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml index 19f86e1..8d14372 100644 --- a/.github/workflows/trivy-pr-scan.yml +++ b/.github/workflows/trivy-pr-scan.yml @@ -45,8 +45,8 @@ jobs: push: false load: true tags: ${{ github.repository }}:vuln-test - cache-from: type=registry,ref=${{ github.repository }}:buildcache - cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max + # cache-from: type=registry,ref=${{ github.repository }}:buildcache + # cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max # We will not be concerned with Medium and Low vulnerabilities - name: Run Trivy vulnerability scanner From d17578db8ce6e9be77ff45b07796ddc8e23a709e Mon Sep 17 00:00:00 2001 From: Joshua Seals Date: Tue, 23 Jan 2024 05:42:58 -0500 Subject: [PATCH 76/85] Re-enabling cache after testing --- .github/workflows/trivy-pr-scan.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml index 8d14372..19f86e1 100644 --- a/.github/workflows/trivy-pr-scan.yml +++ b/.github/workflows/trivy-pr-scan.yml @@ -45,8 +45,8 @@ jobs: push: false load: true tags: ${{ github.repository }}:vuln-test - # cache-from: type=registry,ref=${{ github.repository }}:buildcache - # cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max + cache-from: type=registry,ref=${{ github.repository }}:buildcache + cache-to: type=registry,ref=${{ github.repository }}:buildcache,mode=max # We will not be concerned with Medium and Low vulnerabilities - name: Run Trivy vulnerability scanner From d1ff3c966f8fcfa3648e34b1dfe7754093bee63a Mon Sep 17 00:00:00 2001 From: Joshua Seals Date: Tue, 23 Jan 2024 05:53:22 -0500 Subject: [PATCH 77/85] Revert to older trivy relelase --- .github/workflows/trivy-pr-scan.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml index 19f86e1..83f58f7 100644 --- a/.github/workflows/trivy-pr-scan.yml +++ b/.github/workflows/trivy-pr-scan.yml @@ -50,7 +50,7 @@ jobs: # We will not be concerned with Medium and Low vulnerabilities - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@master + uses: aquasecurity/trivy-action@v0.16.0 with: image-ref: '${{ github.repository }}:vuln-test' format: 'sarif' From 96f7338f6977b88531692eebaa818f14cd07e435 Mon Sep 17 00:00:00 2001 From: Joshua Seals Date: Tue, 23 Jan 2024 05:54:25 -0500 Subject: [PATCH 78/85] trivy scan update --- .github/workflows/trivy-pr-scan.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/trivy-pr-scan.yml b/.github/workflows/trivy-pr-scan.yml index 83f58f7..19f86e1 100644 --- a/.github/workflows/trivy-pr-scan.yml +++ b/.github/workflows/trivy-pr-scan.yml @@ -50,7 +50,7 @@ jobs: # We will not be concerned with Medium and Low vulnerabilities - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@v0.16.0 + uses: aquasecurity/trivy-action@master with: image-ref: '${{ github.repository }}:vuln-test' format: 'sarif' From 5bee00d4f066eb55e1a29310cca8353aa96cda34 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 24 Jan 2024 17:06:19 -0500 Subject: [PATCH 79/85] adding pytest asyncio --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index bac13a6..531f5ab 100644 --- 
a/requirements.txt +++ b/requirements.txt @@ -13,6 +13,7 @@ mistune pluggy pyrsistent pytest +pytest-asyncio pytz PyYAML requests From 9cb89cab9f2147ce344b01b45252e6a3d985819d Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 24 Jan 2024 17:26:24 -0500 Subject: [PATCH 80/85] fix tests --- tests/unit/mocks/data/mock_config.py | 16 +++++++++++++--- tests/unit/test_annotators.py | 2 +- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/unit/mocks/data/mock_config.py b/tests/unit/mocks/data/mock_config.py index 27ca191..d70f8a3 100644 --- a/tests/unit/mocks/data/mock_config.py +++ b/tests/unit/mocks/data/mock_config.py @@ -13,9 +13,19 @@ class MockConfig: }) # Annotator config that will be passed to annotate.Annotator constructor - annotator: dict = field(default_factory=lambda: { - "url": "http://annotator.api/?content=" - }) + annotator_type: str = "monarch" + + annotator_args: dict = field( + default_factory=lambda: { + "monarch": { + "url": "http://annotator.api/?content=" + }, + "sapbert": { + "classification_url": "http://classifier.api/annotate/", + "annotator_url": "http://entity-link.api/annotate/", + }, + } + ) # Normalizer config that will be passed to annotate.Normalizer constructor normalizer: dict = field(default_factory=lambda: { diff --git a/tests/unit/test_annotators.py b/tests/unit/test_annotators.py index c1702ee..2c7bde0 100644 --- a/tests/unit/test_annotators.py +++ b/tests/unit/test_annotators.py @@ -29,7 +29,7 @@ def test_annotator(annotator_api): synonym_finder = DefaultSynonymFinder(cfg.synonym_service) annotator = AnnotateMonarch( - normalizer=normalizer, synonym_finder=synonym_finder, config=cfg + normalizer=normalizer, synonym_finder=synonym_finder, config=cfg.annotator_args ) text = "heart attack" identifiers: List[DugIdentifier] = annotator.annotate_text( From 64f3cb6de1741d2ced7f0ce68b94d0d6499ee2ea Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 24 Jan 2024 17:34:19 -0500 Subject: [PATCH 81/85] fix annotator init --- tests/integration/test_annotators.py | 4 ++-- tests/unit/test_annotators.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_annotators.py b/tests/integration/test_annotators.py index 8004d0d..a9778bf 100644 --- a/tests/integration/test_annotators.py +++ b/tests/integration/test_annotators.py @@ -28,7 +28,7 @@ def test_monarch_annotation_full( synonym_finder = DefaultSynonymFinder(**cfg.synonym_service) annotator = AnnotateMonarch( - normalizer=normalizer, synonym_finder=synonym_finder, config=cfg + normalizer=normalizer, synonym_finder=synonym_finder, **cfg.annotator_args["monarch"] ) input_text = "heart attack" @@ -95,7 +95,7 @@ def test_sapbert_annotation_full( normalizer = DefaultNormalizer(**cfg.normalizer) synonym_finder = DefaultSynonymFinder(**cfg.synonym_service) - annotator = AnnotateSapbert(normalizer=normalizer, synonym_finder=synonym_finder) + annotator = AnnotateSapbert(normalizer=normalizer, synonym_finder=synonym_finder, **cfg.annotator_args["sapbert"]) input_text = "Have you ever had a heart attack?" 
# Fetch Classifiers diff --git a/tests/unit/test_annotators.py b/tests/unit/test_annotators.py index 2c7bde0..5ea804d 100644 --- a/tests/unit/test_annotators.py +++ b/tests/unit/test_annotators.py @@ -29,7 +29,7 @@ def test_annotator(annotator_api): synonym_finder = DefaultSynonymFinder(cfg.synonym_service) annotator = AnnotateMonarch( - normalizer=normalizer, synonym_finder=synonym_finder, config=cfg.annotator_args + normalizer=normalizer, synonym_finder=synonym_finder, kwargs=cfg.annotator_args["monarch"] ) text = "heart attack" identifiers: List[DugIdentifier] = annotator.annotate_text( From 15cccfe56d30111be2b26e96b743edf228eac7a5 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 24 Jan 2024 17:46:44 -0500 Subject: [PATCH 82/85] fix all the tests --- tests/integration/conftest.py | 2 +- tests/integration/mocks/mock_config.py | 17 ++++++++++++++--- tests/integration/test_annotators.py | 2 +- tests/unit/test_annotators.py | 2 +- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index b671e3f..50f5787 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -122,7 +122,7 @@ def token_classifier_api(): def sapbert_annotator_api(): return MockApiService( urls={ - "https://babel-sapbert.apps.renci.org/annotate/": [ + "https://med-nemo.apps.renci.org/annotate/": [ json.dumps( [ { diff --git a/tests/integration/mocks/mock_config.py b/tests/integration/mocks/mock_config.py index 27ca191..82bcd1b 100644 --- a/tests/integration/mocks/mock_config.py +++ b/tests/integration/mocks/mock_config.py @@ -12,10 +12,21 @@ class MockConfig: "stopwords": ["the"] }) + # Annotator config that will be passed to annotate.Annotator constructor - annotator: dict = field(default_factory=lambda: { - "url": "http://annotator.api/?content=" - }) + annotator_type: str = "monarch" + + annotator_args: dict = field( + default_factory=lambda: { + "monarch": { + "url": "http://annotator.api/?content=" + }, + "sapbert": { + "classification_url": "https://med-nemo.apps.renci.org/annotate/", + "annotator_url": "https://med-nemo.apps.renci.org/annotate/", + }, + } + ) # Normalizer config that will be passed to annotate.Normalizer constructor normalizer: dict = field(default_factory=lambda: { diff --git a/tests/integration/test_annotators.py b/tests/integration/test_annotators.py index a9778bf..eecfd1e 100644 --- a/tests/integration/test_annotators.py +++ b/tests/integration/test_annotators.py @@ -28,7 +28,7 @@ def test_monarch_annotation_full( synonym_finder = DefaultSynonymFinder(**cfg.synonym_service) annotator = AnnotateMonarch( - normalizer=normalizer, synonym_finder=synonym_finder, **cfg.annotator_args["monarch"] + normalizer=normalizer, synonym_finder=synonym_finder, config=cfg, **cfg.annotator_args["monarch"] ) input_text = "heart attack" diff --git a/tests/unit/test_annotators.py b/tests/unit/test_annotators.py index 5ea804d..830a140 100644 --- a/tests/unit/test_annotators.py +++ b/tests/unit/test_annotators.py @@ -29,7 +29,7 @@ def test_annotator(annotator_api): synonym_finder = DefaultSynonymFinder(cfg.synonym_service) annotator = AnnotateMonarch( - normalizer=normalizer, synonym_finder=synonym_finder, kwargs=cfg.annotator_args["monarch"] + normalizer=normalizer, synonym_finder=synonym_finder, config=cfg , **cfg.annotator_args["monarch"] ) text = "heart attack" identifiers: List[DugIdentifier] = annotator.annotate_text( From f3d94110558738242fdfed4eefaf4e87558d4ecf Mon Sep 17 00:00:00 2001 From: "Michael T. 
Bacon" Date: Wed, 24 Jan 2024 17:52:52 -0500 Subject: [PATCH 83/85] Forced Python 3.11 --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index e8d1ce2..c7e9bc3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ # A container for the core semantic-search capability. # ###################################################### -FROM python:alpine3.19 +FROM python:3.11-alpine3.19 # Install required packages RUN apk update && \ From d7257dfae5cbb973609562670cba5d1be048ceb7 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 24 Jan 2024 17:52:59 -0500 Subject: [PATCH 84/85] bump docker image version to 0 vuls --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index c009bc5..6147d76 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ # A container for the core semantic-search capability. # ###################################################### -FROM python:3.12.0-alpine3.18 +FROM python:3.12.1-alpine3.19 # Install required packages RUN apk update && \ From 275abcbacd42bba1ec5cf89869ab845b37776a65 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 24 Jan 2024 18:07:57 -0500 Subject: [PATCH 85/85] zero again 0_o --- Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index e42083a..3980ddf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,7 +8,10 @@ FROM python:3.12.1-alpine3.19 # Install required packages RUN apk update && \ - apk add g++ make + apk add g++ make + +#upgrade openssl \ +RUN apk add openssl=3.1.4-r4 RUN pip install --upgrade pip # Create a non-root user.