diff --git a/data/bdc_dbgap_data_dicts.tar.gz b/data/bdc_dbgap_data_dicts.tar.gz
index adf46cb3..8671c4e2 100644
Binary files a/data/bdc_dbgap_data_dicts.tar.gz and b/data/bdc_dbgap_data_dicts.tar.gz differ
diff --git a/data/redis/appendonly.aof b/data/redis/appendonly.aof
deleted file mode 100644
index 386fca61..00000000
Binary files a/data/redis/appendonly.aof and /dev/null differ
diff --git a/requirements.txt b/requirements.txt
index e80c1f1f..d2ea252d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 elasticsearch[async]==7.16.3
 fastapi==0.95.0
-uvicorn
+uvicorn==0.23.2
 gunicorn
 itsdangerous
 Jinja2
@@ -19,5 +19,6 @@ six==1.16.0

 # Click for command line arguments
 # We use Click 7.0 because that's what one of the pinned packages above use.
-click~=7.0
+click
 httpx>=0.24.1
+bmt==1.1.0
\ No newline at end of file
diff --git a/src/dug/config.py b/src/dug/config.py
index 6c40e8b2..180544a5 100644
--- a/src/dug/config.py
+++ b/src/dug/config.py
@@ -39,12 +39,12 @@ class Config:

     # Normalizer config that will be passed to annotate.Normalizer constructor
     normalizer: dict = field(default_factory=lambda: {
-        "url": "https://nodenormalization-sri.renci.org/get_normalized_nodes?conflate=false&curie="
+        "url": "https://nodenormalization-dev.apps.renci.org/get_normalized_nodes?conflate=false&description=true&curie="
    })

     # Synonym service config that will be passed to annotate.SynonymHelper constructor
     synonym_service: dict = field(default_factory=lambda: {
-        "url": "https://onto.renci.org/synonyms/"
+        "url": "https://name-resolution-sri.renci.org/reverse_lookup"
     })

     # Ontology metadata helper config that will be passed to annotate.OntologyHelper constructor
@@ -59,7 +59,9 @@ class Config:
         "disease": ["disease", "phenotypic_feature"],
         "pheno": ["phenotypic_feature", "disease"],
         "anat": ["disease", "anatomical_entity"],
-        "chem_to_disease": ["chemical_substance", "disease"],
+        "chem_to_disease": ["chemical_entity", "disease"],
+        "small_molecule_to_disease": ["small_molecule", "disease"],
+        "chemical_mixture_to_disease": ["chemical_mixture", "disease"],
         "phen_to_anat": ["phenotypic_feature", "anatomical_entity"],
     })
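The config change above points the normalizer at the dev node-normalization deployment and adds description=true, so node descriptions come back alongside labels; the new Normalizer.handle_response below consumes them. A hedged sketch of a call against that endpoint — the fields read here are exactly the ones the new code accesses, but the precise response shape is an assumption inferred from that code:

    import requests

    # Hypothetical round trip against the URL from Config.normalizer.
    url = ("https://nodenormalization-dev.apps.renci.org/get_normalized_nodes"
           "?conflate=false&description=true&curie=UBERON:0007100")
    resp = requests.get(url).json()
    node = resp["UBERON:0007100"]          # response keyed by the queried CURIE (assumed)
    preferred = node["id"]                 # {"identifier": ..., "label": ..., "description": ...}
    equivalents = [v["identifier"] for v in node["equivalent_identifiers"]]
    biolink_types = node["type"]           # e.g. ["biolink:AnatomicalEntity", ...]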
diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py
index 91976f75..5f7977c3 100644
--- a/src/dug/core/annotate.py
+++ b/src/dug/core/annotate.py
@@ -1,8 +1,10 @@
 import json
 import logging
 import os
+import re
 import urllib.parse
 from typing import TypeVar, Generic, Union, List, Tuple, Optional
+import bmt

 import requests
 from requests import Session
@@ -59,14 +61,12 @@ def __init__(
             annotator: "Annotator",
             normalizer: "Normalizer",
             synonym_finder: "SynonymFinder",
-            ontology_helper: "OntologyHelper",
             ontology_greenlist=[],
     ):
         self.preprocessor = preprocessor
         self.annotator = annotator
         self.normalizer = normalizer
         self.synonym_finder = synonym_finder
-        self.ontology_helper = ontology_helper
         self.ontology_greenlist = ontology_greenlist
         self.norm_fails_file = "norm_fails.txt"
         self.anno_fails_file = "anno_fails.txt"
@@ -106,12 +106,6 @@ def annotate(self, text, http_session):
                 # Add synonyms to identifier
                 norm_id.synonyms = self.synonym_finder.get_synonyms(norm_id.id, http_session)

-                # Get canonical label, name, and description from ontology metadata service
-                name, desc, ontology_type = self.ontology_helper.get_ontology_info(norm_id.id, http_session)
-                norm_id.label = name
-                norm_id.description = desc
-                norm_id.type = ontology_type
-
                 # Get pURL for ontology identifer for more info
                 norm_id.purl = BioLinkPURLerizer.get_curie_purl(norm_id.id)
                 processed_identifiers.append(norm_id)
@@ -335,6 +329,7 @@ def handle_response(self, value, response: dict) -> List[Identifier]:

 class Normalizer(ApiClient[Identifier, Identifier]):
     def __init__(self, url):
+        self.bl_toolkit = bmt.Toolkit()
         self.url = url

     def normalize(self, identifier: Identifier, http_session: Session):
@@ -380,9 +375,13 @@ def handle_response(self, identifier: Identifier, normalized: dict) -> Optional[
             logger.debug(f"Preferred id: {preferred_id}")
             identifier.id = preferred_id.get('identifier', '')
             identifier.label = preferred_id.get('label', '')
-            identifier.equivalent_identifiers = [v['identifier'] for v in equivalent_identifiers]
-            identifier.types = biolink_type
-
+            identifier.description = preferred_id.get('description', '')
+            identifier.equivalent_identifiers = [v['identifier'] for v in equivalent_identifiers]
+            try:
+                identifier.types = self.bl_toolkit.get_element(biolink_type[0]).name
+            except:
+                # converts biolink:SmallMolecule to small molecule
+                identifier.types = (" ".join(re.split("(?=[A-Z])", biolink_type[0].replace('biolink:', ''))[1:])).lower()
             return identifier


@@ -400,51 +399,31 @@ def get_synonyms(self, curie: str, http_session):
         return self(curie, http_session)

     def make_request(self, curie: str, http_session: Session):
-
-        # Get response from synonym service
-        url = f"{self.url}{urllib.parse.quote(curie)}"
-
+        # Get response from namelookup reverse lookup op
+        # example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post)
+        url = f"{self.url}"
+        payload = {
+            'curies': [curie]
+        }
         try:
-            response = http_session.get(url)
-            if response.status_code == 400:
-                logger.error(f"No synonyms returned for: `{curie}`. Validation error.")
-                return []
-            if response.status_code == 500:
-                logger.error(f"No synonyms returned for: `{curie}`. Internal server error from {self.url}.")
-                return []
+            response = http_session.post(url, json=payload)
+            if str(response.status_code).startswith('4'):
+                logger.error(f"No synonyms returned for: `{curie}`. Validation error: {response.text}")
+                return {curie: []}
+            if str(response.status_code).startswith('5'):
+                logger.error(f"No synonyms returned for: `{curie}`. Internal server error from {self.url}. Error: {response.text}")
+                return {curie: []}
             return response.json()
         except json.decoder.JSONDecodeError as e:
             logger.error(f"Json parse error for response from `{url}`. Exception: {str(e)}")
-            return []
+            return {curie: []}

     def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]:
-        # List comprehension unpack all synonyms into a list
-        return [synonym['desc'] for synonym in raw_synonyms]
+        # Return curie synonyms
+        return raw_synonyms.get(curie, [])


-class OntologyHelper(ApiClient[str, Tuple[str, str, str]]):
-    def __init__(self, url):
-        self.url = url
-
-    def make_request(self, curie: str, http_session: Session):
-        url = f"{self.url}{urllib.parse.quote(curie)}"
-        try:
-            response = http_session.get(url).json()
-            return response
-        except json.decoder.JSONDecodeError as e:
-            logger.error(f"No labels returned for: {curie}")
-            return {}
-
-    def handle_response(self, curie: str, response: dict) -> Tuple[str,str,str]:
-        # List comprehension for synonyms
-        name = response.get('label', '')
-        description = '' if not response.get('description', None) else response.get('description', '')
-        ontology_type = '' if not response.get('category', None) else response.get('category', '')[0]
-
-        return name, description, ontology_type
-
-    def get_ontology_info(self, curie, http_session):
-        return self(curie, http_session)


 class BioLinkPURLerizer:
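With this change SynonymFinder stops GETting onto.renci.org and instead POSTs a batch of CURIEs to the name-resolution reverse_lookup operation, whose response is a dict keyed by CURIE rather than a list of {"desc": ...} records — which is why handle_response collapses to a dict.get. A hedged sketch of the new round trip (payload and response shape taken from make_request/handle_response above):

    import requests

    url = "https://name-resolution-sri.renci.org/reverse_lookup"
    payload = {"curies": ["UBERON:0007100"]}
    raw = requests.post(url, json=payload).json()
    # raw looks like {"UBERON:0007100": ["primary circulatory organ", "adult heart", ...]}
    synonyms = raw.get("UBERON:0007100", [])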
Exception: {str(e)}") - return [] + return {curie: []} def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]: - # List comprehension unpack all synonyms into a list - return [synonym['desc'] for synonym in raw_synonyms] + # Return curie synonyms + return raw_synonyms.get(curie, []) -class OntologyHelper(ApiClient[str, Tuple[str, str, str]]): - def __init__(self, url): - self.url = url - def make_request(self, curie: str, http_session: Session): - url = f"{self.url}{urllib.parse.quote(curie)}" - try: - response = http_session.get(url).json() - return response - except json.decoder.JSONDecodeError as e: - logger.error(f"No labels returned for: {curie}") - return {} - - def handle_response(self, curie: str, response: dict) -> Tuple[str,str,str]: - # List comprehension for synonyms - name = response.get('label', '') - description = '' if not response.get('description', None) else response.get('description', '') - ontology_type = '' if not response.get('category', None) else response.get('category', '')[0] - - return name, description, ontology_type - - def get_ontology_info(self, curie, http_session): - return self(curie, http_session) class BioLinkPURLerizer: diff --git a/src/dug/core/async_search.py b/src/dug/core/async_search.py index 1f3d5872..6946fb74 100644 --- a/src/dug/core/async_search.py +++ b/src/dug/core/async_search.py @@ -212,15 +212,10 @@ async def search_concepts(self, query, offset=0, size=None, types=None, Changed to a long boolean match query to optimize search results """ query_dict = self._build_concepts_query(query, **kwargs) - total_items = await self.es.count( - body={"query": query_dict}, - index="concepts_index") # Get aggregated counts of biolink types search_body = {"query": query_dict} search_body['aggs'] = {'type-count': {'terms': {'field': 'type'}}} - # Add post_filter on types - if types: - assert isinstance(types, list) + if isinstance(types, list): search_body['post_filter'] = { "bool": { "should": [ @@ -239,6 +234,18 @@ async def search_concepts(self, query, offset=0, size=None, types=None, size=size, explain=True ) + # Aggs/post_filter aren't supported by count + del search_body["aggs"] + if "post_filter" in search_body: + # We'll move the post_filter into the actual filter + search_body["query"]["bool"]["filter"]["bool"].update( + search_body["post_filter"]["bool"] + ) + del search_body["post_filter"] + total_items = await self.es.count( + body=search_body, + index="concepts_index" + ) # Simplify the data structure we get from aggregations to put into the # return value. 
diff --git a/src/dug/core/factory.py b/src/dug/core/factory.py
index 48097f19..9fdba4dc 100644
--- a/src/dug/core/factory.py
+++ b/src/dug/core/factory.py
@@ -4,8 +4,12 @@
 from requests_cache import CachedSession

 import dug.core.tranql as tql
-from dug.core.annotate import DugAnnotator, Annotator, Normalizer, OntologyHelper, Preprocessor, SynonymFinder, \
-    ConceptExpander
+from dug.core.annotate import (DugAnnotator,
+                               Annotator,
+                               Normalizer,
+                               Preprocessor,
+                               SynonymFinder,
+                               ConceptExpander)
 from dug.config import Config as DugConfig, TRANQL_SOURCE
 from dug.core.crawler import Crawler
 from dug.core.parsers import Parser
@@ -53,14 +57,12 @@ def build_annotator(self) -> DugAnnotator:
         annotator = Annotator(**self.config.annotator)
         normalizer = Normalizer(**self.config.normalizer)
         synonym_finder = SynonymFinder(**self.config.synonym_service)
-        ontology_helper = OntologyHelper(**self.config.ontology_helper)

         annotator = DugAnnotator(
             preprocessor=preprocessor,
             annotator=annotator,
             normalizer=normalizer,
-            synonym_finder=synonym_finder,
-            ontology_helper=ontology_helper
+            synonym_finder=synonym_finder
         )

         return annotator
diff --git a/src/dug/core/index.py b/src/dug/core/index.py
index a6711a15..78a723b6 100644
--- a/src/dug/core/index.py
+++ b/src/dug/core/index.py
@@ -26,6 +26,7 @@ def __init__(self, cfg: Config, indices=None):

         self.es = Elasticsearch(hosts=self.hosts,
                                 http_auth=(self._cfg.elastic_username, self._cfg.elastic_password))
+        self.replicas = self.get_es_node_count()

         if self.es.ping():
             logger.info('connected to elasticsearch')
@@ -36,6 +37,10 @@ def __init__(self, cfg: Config, indices=None):
             raise SearchException(
                 message='failed to connect to elasticsearch',
                 details=f"connecting to host {self._cfg.elastic_host} and port {self._cfg.elastic_port}")
+
+    def get_es_node_count(self):
+        return self.es.nodes.info()["_nodes"]["total"]
+

     def init_indices(self):
         # The concepts and variable indices include an analyzer that utilizes the english
@@ -49,7 +54,7 @@ def init_indices(self):
         kg_index = {
             "settings": {
                 "number_of_shards": 1,
-                "number_of_replicas": 0
+                "number_of_replicas": self.replicas
             },
             "mappings": {
                 "properties": {
@@ -66,7 +71,7 @@
             "settings": {
                 "index.mapping.coerce": "false",
                 "number_of_shards": 1,
-                "number_of_replicas": 0,
+                "number_of_replicas": self.replicas,
                 "analysis": {
                     "analyzer": {
                         "std_with_stopwords": {
@@ -104,7 +109,7 @@
             "settings": {
                 "index.mapping.coerce": "false",
                 "number_of_shards": 1,
-                "number_of_replicas": 0,
+                "number_of_replicas": self.replicas,
                 "analysis": {
                     "analyzer": {
                         "std_with_stopwords": {
@@ -148,6 +153,11 @@ def init_indices(self):
         for index in self.indices:
             try:
                 if self.es.indices.exists(index=index):
+                    # if index exists check if replication is good
+                    index_replicas = self.es.indices.get_settings(index=index)[index]["settings"]["index"]["number_of_replicas"]
+                    if index_replicas != self.replicas:
+                        self.es.indices.put_settings(index=index, body={"number_of_replicas": (self.replicas - 1) or 1 })
+                    self.es.indices.refresh(index=index)
                     logger.info(f"Ignoring index {index} which already exists.")
                 else:
                     result = self.es.indices.create(
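init_indices now derives number_of_replicas from the cluster's node count instead of hard-coding 0, and reconciles existing indices whose replica setting drifted. A minimal sketch of the client calls involved (elasticsearch-py 7.x, per requirements.txt); the host and index name are placeholders:

    from elasticsearch import Elasticsearch

    es = Elasticsearch(hosts=[{"host": "localhost", "port": 9200}])

    # Total node count drives the replica setting (see get_es_node_count above).
    replicas = es.nodes.info()["_nodes"]["total"]

    # For an existing index, read back the live setting and patch it if it drifted.
    current = es.indices.get_settings(index="concepts_index")[
        "concepts_index"]["settings"]["index"]["number_of_replicas"]
    if current != replicas:
        # caveat: get_settings typically returns this value as a string, so the
        # != comparison against an int in the hunk above may always be true
        es.indices.put_settings(index="concepts_index",
                                body={"number_of_replicas": (replicas - 1) or 1})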
diff --git a/src/dug/core/parsers/dbgap_parser.py b/src/dug/core/parsers/dbgap_parser.py
index b01432c4..2926b1f1 100644
--- a/src/dug/core/parsers/dbgap_parser.py
+++ b/src/dug/core/parsers/dbgap_parser.py
@@ -1,9 +1,10 @@
 import logging
-import re
+import re, os
 from typing import List
 from xml.etree import ElementTree as ET

 from dug import utils as utils
+from pathlib import Path
 from ._base import DugElement, FileParser, Indexable, InputFile

 logger = logging.getLogger('dug')
@@ -13,27 +14,49 @@ class DbGaPParser(FileParser):
     # Class for parsers DBGaP Data dictionary into a set of Dug Elements

     @staticmethod
-    def parse_study_name_from_filename(filename: str):
+    def parse_study_name_from_filename(filename: str) -> str:
         # Parse the study name from the xml filename if it exists. Return None if filename isn't right format to get id from
         dbgap_file_pattern = re.compile(r'.*/*phs[0-9]+\.v[0-9]+\.pht[0-9]+\.v[0-9]+\.(.+)\.data_dict.*')
         match = re.match(dbgap_file_pattern, filename)
         if match is not None:
             return match.group(1)
         return None
+
+    @staticmethod
+    def parse_study_name_from_gap_exchange_file(filepath: Path) -> str:
+        # Parse the study name from the GapExchange file adjacent to the file passed in
+        parent_dir = filepath.parent.absolute()
+        gap_exchange_filename_str = "GapExchange_" + parent_dir.name
+        gap_exchange_filepath = None
+        for item in os.scandir(parent_dir):
+            if item.is_file and gap_exchange_filename_str in item.name:
+                gap_exchange_filepath = item.path
+        if gap_exchange_filepath is None:
+            return None
+        tree = ET.parse(gap_exchange_filepath, ET.XMLParser(encoding='iso-8859-5'))
+        tree_root = tree.getroot()
+        return tree_root.find("./Studies/Study/Configuration/StudyNameEntrez").text

     def _get_element_type(self):
         return "DbGaP"

     def __call__(self, input_file: InputFile) -> List[Indexable]:
         logger.debug(input_file)
+        if "GapExchange" in str(input_file).split("/")[-1]:
+            msg = f"Skipping parsing for GapExchange file: {input_file}!"
+            logger.info(msg)
+            return []
         tree = ET.parse(input_file, ET.XMLParser(encoding='iso-8859-5'))
         root = tree.getroot()
         study_id = root.attrib['study_id']
         participant_set = root.get('participant_set','0')

-        # Parse study name from file handle
-        study_name = self.parse_study_name_from_filename(str(input_file))
-
+        # Parse study name from GapExchange file, and if that fails try from file handle
+        # If still None, raise an error message
+        study_name = self.parse_study_name_from_gap_exchange_file(Path(input_file))
+        if study_name is None:
+            study_name = self.parse_study_name_from_filename(str(input_file))
         if study_name is None:
             err_msg = f"Unable to parse DbGaP study name from data dictionary: {input_file}!"
             logger.error(err_msg)
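The new fallback reads the study name from the GapExchange_* file sitting next to the data dictionary, via the ./Studies/Study/Configuration/StudyNameEntrez path. A self-contained sketch using an inline skeleton of that XML — only the element path and the expected study name come from the parser and the new test; everything else in a real GapExchange file is omitted, so this skeleton is an assumption:

    from xml.etree import ElementTree as ET

    # Minimal GapExchange skeleton: just the path the parser traverses.
    xml = ("<GapExchange><Studies><Study><Configuration>"
           "<StudyNameEntrez>Evaluation of COPD Longitudinally to Identify "
           "Predictive Surrogate Endpoints (ECLIPSE)</StudyNameEntrez>"
           "</Configuration></Study></Studies></GapExchange>")
    root = ET.fromstring(xml)
    print(root.find("./Studies/Study/Configuration/StudyNameEntrez").text)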
diff --git a/tests/integration/data/phs001252.v1.p1/GapExchange_phs001252.v1.p1.xml b/tests/integration/data/phs001252.v1.p1/GapExchange_phs001252.v1.p1.xml
new file mode 100644
index 00000000..33f615cb
--- /dev/null
+++ b/tests/integration/data/phs001252.v1.p1/GapExchange_phs001252.v1.p1.xml
@@ -0,0 +1,390 @@
+<!-- 390-line dbGaP GapExchange fixture for phs001252.v1.p1 (ECLIPSE). The XML
+     markup did not survive extraction in this excerpt; the recoverable content:
+     study name (StudyNameEntrez) "Evaluation of COPD Longitudinally to Identify
+     Predictive Surrogate Endpoints (ECLIPSE)"; study types Case-Control,
+     Longitudinal Cohort, Cohort; the study description and inclusion/exclusion
+     criteria for the ECLIPSE cohort (2164 COPD subjects, 337 smoking controls,
+     245 nonsmoking controls); an attribution table of principal investigators
+     (Jorgen Vestbo, Edwin K. Silverman) and the full ECLIPSE investigator and
+     steering/scientific committee lists; study history (protocol finalized in
+     2005, recruitment began in 2006); and an NHLBI DAC policy block with
+     consent group DS-COPD-RD (Disease-Specific, Chronic Obstructive Pulmonary
+     Disease, RD). -->
diff --git a/tests/integration/data/phs001252.v1.p1/phs001252.v1.pht006366.v1.ECLIPSE_Subject.data_dict.xml b/tests/integration/data/phs001252.v1.p1/phs001252.v1.pht006366.v1.ECLIPSE_Subject.data_dict.xml
new file mode 100644
index 00000000..c9c083e9
--- /dev/null
+++ b/tests/integration/data/phs001252.v1.p1/phs001252.v1.pht006366.v1.ECLIPSE_Subject.data_dict.xml
@@ -0,0 +1,2 @@
+<!-- two-line dbGaP data dictionary fixture (markup lost in extraction): an XML
+     prolog plus a data_table defining SUBJECT_ID ("Subject ID", string) and
+     CONSENT ("Consent group as determined by DAC", encoded value:
+     Disease-Specific (Chronic Obstructive Pulmonary Disease, RD) (DS-COPD-RD)). -->
diff --git a/tests/integration/test_loaders.py b/tests/integration/test_loaders.py
index 9287da44..799fd9dc 100644
--- a/tests/integration/test_loaders.py
+++ b/tests/integration/test_loaders.py
@@ -18,7 +18,7 @@ def test_filesystem_loader():
         filepath=TEST_DATA_DIR,
     )
     files = list(targets)
-    assert len(files) == 10
+    assert len(files) == 15

     with pytest.raises(ValueError):
         targets = load_from_filesystem(
diff --git a/tests/integration/test_parsers.py b/tests/integration/test_parsers.py
index a22d00c7..fa5ea2de 100644
--- a/tests/integration/test_parsers.py
+++ b/tests/integration/test_parsers.py
@@ -1,6 +1,7 @@
 from dug.core.parsers import DbGaPParser, NIDAParser, TOPMedTagParser, SciCrunchParser, AnvilDbGaPParser,\
     CRDCDbGaPParser, KFDRCDbGaPParser, SPRINTParser, BACPACParser
 from tests.integration.conftest import TEST_DATA_DIR
+from pathlib import Path

 def test_dbgap_parse_study_name_from_filename():
     parser = DbGaPParser()
@@ -21,6 +22,12 @@ def test_nida_parse_study_name_from_filename():
     studyname = parser.parse_study_name_from_filename(filename)
     assert studyname == "NIDA-CSP1019"

+def test_dbgap_parse_study_name_from_gap_exchange_file():
+    parser = DbGaPParser()
+    parse_filepath = Path(TEST_DATA_DIR / "phs001252.v1.p1" / "phs001252.v1.pht006366.v1.ECLIPSE_Subject.data_dict.xml")
+    studyname = parser.parse_study_name_from_gap_exchange_file(parse_filepath)
+    assert studyname == "Evaluation of COPD Longitudinally to Identify Predictive Surrogate Endpoints (ECLIPSE)"
+
 def test_dbgap_parser():
     parser = DbGaPParser()
     parse_file = str(TEST_DATA_DIR / "phs000166.v2.pht000700.v1.CAMP_CData.data_dict_2009_09_03.xml")
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index 85f4adcb..e1b63d9a 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -29,6 +29,16 @@ def get(self, url, params: dict = None):
         if text is None:
             return MockResponse(text="{}", status_code=404)
         return MockResponse(text, status_code=status_code)
+
+    def post(self, url, params: dict = None, json: dict = {}):
+        if params:
+            qstr = urllib.parse.urlencode(params, quote_via=urllib.parse.quote)
+            url = f"{url}?{qstr}"
+        text, status_code = self.urls.get(url)
+
+        if text is None:
+            return MockResponse(text="{}", status_code=404)
+        return MockResponse(text, status_code=status_code)


 @pytest.fixture
@@ -208,29 +218,16 @@ def _(curie):


 @pytest.fixture
 def synonym_api():
-    base_url = "http://synonyms.api/?curie={curie}"
-
-    def _(curie):
-        return base_url.format(
-            curie=urllib.parse.quote(curie),
-        )
     return MockApiService(urls={
-        _("UBERON:0007100"): [json.dumps([
-            {
-                "desc": "adult heart",
-                "scope": "RELATED",
-                "syn_type": None,
-                "xref": ""
-            }
-        ]), 200],
-        _("MONDO"): [json.dumps({
-            "validation error": "format should be :"
-        }), 400],
-        _("UNSUPPORTED_PREFIX:XXX"): [json.dumps({
-            "validation error": "UNSUPPORTED_PREFIX is not supported"
-        }), 400],
-
+        "http://synonyms.api": [json.dumps({
+            "UBERON:0007100": [
+                "primary circulatory organ",
+                "dorsal tube",
+                "adult heart",
+                "heart"
+            ]
+        }), 200]
     })
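The reworked synonym_api fixture registers the entire reverse-lookup payload under the bare service URL because the new SynonymFinder POSTs to one URL for every CURIE instead of encoding the CURIE into the query string. A hedged sketch of how the mock resolves a request — it assumes MockApiService and MockResponse from this conftest are in scope and that MockResponse exposes a text attribute:

    import json

    # The mock's post() keys only on the URL; the JSON body is ignored, so every
    # POST to "http://synonyms.api" returns the same canned payload.
    service = MockApiService(urls={
        "http://synonyms.api": [json.dumps({"UBERON:0007100": ["adult heart"]}), 200],
    })
    resp = service.post("http://synonyms.api", json={"curies": ["UBERON:0007100"]})
    assert resp.status_code == 200
    assert "UBERON:0007100" in json.loads(resp.text)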
tube", + "adult heart", + "heart" + ] + }), 200] }) diff --git a/tests/unit/test_annotate.py b/tests/unit/test_annotate.py index deb479ab..87869566 100644 --- a/tests/unit/test_annotate.py +++ b/tests/unit/test_annotate.py @@ -4,7 +4,7 @@ import pytest from dug.config import Config -from dug.core.annotate import Identifier, Preprocessor, Annotator, Normalizer, SynonymFinder, OntologyHelper +from dug.core.annotate import Identifier, Preprocessor, Annotator, Normalizer, SynonymFinder def test_identifier(): @@ -195,45 +195,27 @@ def test_normalizer(normalizer_api): assert output.id == 'UBERON:0007100' assert output.label == "primary circulatory organ" assert output.equivalent_identifiers == ['UBERON:0007100'] - assert output.types == [ - 'biolink:AnatomicalEntity', 'biolink:OrganismalEntity', 'biolink:BiologicalEntity', - 'biolink:NamedThing', 'biolink:Entity' - ] + assert output.types == 'anatomical entity' + def test_synonym_finder(synonym_api): curie = "UBERON:0007100" - url = f"http://synonyms.api/?curie=" - + url = f"http://synonyms.api" finder = SynonymFinder(url) result = finder.get_synonyms( curie, synonym_api, ) - assert result == ["adult heart"] - curie = "MONDO" - result = finder.get_synonyms( - curie, - synonym_api, - ) - assert result == [] - curie = "UNSUPPORTED_PREFIX:XXX" - result = finder.get_synonyms( - curie, - synonym_api, - ) - assert result == [] + assert result == [ + "primary circulatory organ", + "dorsal tube", + "adult heart", + "heart" + ] -def test_ontology_helper(ontology_api): - curie = "UBERON:0007100" - url = "http://ontology.api/?curie=" - helper = OntologyHelper(url) - name, description, ontology_type = helper.get_ontology_info(curie, ontology_api) - assert name == 'primary circulatory organ' - assert description == 'A hollow, muscular organ, which, by contracting rhythmically, keeps up the circulation of the blood or analogs[GO,modified].' - assert ontology_type == 'anatomical entity' def test_yield_partial_text(): diff --git a/tests/unit/test_core/test_search.py b/tests/unit/test_core/test_search.py index bf511e2c..2aa49ef0 100644 --- a/tests/unit/test_core/test_search.py +++ b/tests/unit/test_core/test_search.py @@ -16,6 +16,9 @@ nboost_host = 'localhost' hosts = [{'host': host, 'port': port}] +class MockEsNode(): + def info(): + return {"_nodes" : {"total": 1}} @dataclass class MockIndex: @@ -34,6 +37,7 @@ def get(self, id): def count(self, body): return len(self.values) + class MockIndices: @@ -63,6 +67,7 @@ class MockElastic: def __init__(self, indices: MockIndices): self.indices = indices self._up = True + self.nodes = MockEsNode def index(self, index, id=None, body=None): self.indices.get_index(index).index(id, body) @@ -94,6 +99,8 @@ def search(self, index, body, **kwargs): } } + + @pytest.fixture def elastic():