From 36f3626ca3ea45b7b9200c2d6dd023021f46ca70 Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Tue, 18 Jul 2023 16:14:27 -0400 Subject: [PATCH 1/6] Server-side concept type filtering concept_types dict added to results with types and number of results. types parameter added to API query dict, restricting search results to set of types. --- .env | 4 +- docker-compose.yaml | 2 + requirements.txt | 3 +- src/dug/core/async_search.py | 47 +- src/dug/server.py | 5 +- tests/integration/test_async_search.py | 34 ++ tests/unit/test_async_search.py | 757 +++++++++++++++++++++++++ 7 files changed, 837 insertions(+), 15 deletions(-) create mode 100644 tests/integration/test_async_search.py create mode 100644 tests/unit/test_async_search.py diff --git a/.env b/.env index d2b79878..1ec41380 100644 --- a/.env +++ b/.env @@ -1,13 +1,13 @@ DATA_DIR=./local_storage ELASTIC_PASSWORD=15707 -ELASTIC_API_HOST=elasticsearch +ELASTIC_API_HOST=localhost ELASTIC_USERNAME=elastic NBOOST_API_HOST=nboost REDIS_PASSWORD=15707 -REDIS_HOST=redis +REDIS_HOST=localhost REDIS_PORT=6379 API_WORKERS=4 diff --git a/docker-compose.yaml b/docker-compose.yaml index 1399e0af..4b7f5335 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -57,6 +57,7 @@ services: ################################################################################# elasticsearch: image: docker.elastic.co/elasticsearch/elasticsearch:7.6.1 + platform: "linux/amd64" networks: - dug-network environment: @@ -75,6 +76,7 @@ services: ## ################################################################################# redis: + platform: "linux/amd64" image: 'bitnami/redis:5.0.8' networks: - dug-network diff --git a/requirements.txt b/requirements.txt index ffd4461e..e80c1f1f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,4 +19,5 @@ six==1.16.0 # Click for command line arguments # We use Click 7.0 because that's what one of the pinned packages above use. 
-click~=7.0 \ No newline at end of file +click~=7.0 +httpx>=0.24.1 diff --git a/src/dug/core/async_search.py b/src/dug/core/async_search.py index bcca8797..0e90e044 100644 --- a/src/dug/core/async_search.py +++ b/src/dug/core/async_search.py @@ -91,11 +91,10 @@ async def agg_data_type(self): results.update({'data type list': data_type_list}) return data_type_list - async def search_concepts(self, query, offset=0, size=None, fuzziness=1, prefix_length=3): - """ - Changed to a long boolean match query to optimize search results - """ - query = { + @staticmethod + def _build_concepts_query(query, fuzziness=1, prefix_length=3): + "Static data structure populator, pulled for easier testing" + query_object = { "bool": { "filter": { "bool": { @@ -196,16 +195,46 @@ async def search_concepts(self, query, offset=0, size=None, fuzziness=1, prefix_ "minimum_should_match": 1, } } - body = json.dumps({'query': query}) - total_items = await self.es.count(body=body, index="concepts_index") + return query_object + + async def search_concepts(self, query, offset=0, size=None, types=None, + **kwargs): + """ + Changed to a long boolean match query to optimize search results + """ + query_object = {'query': self._build_concepts_query(query, **kwargs)} + total_items = await self.es.count( + body=json.dumps(query_object), + index="concepts_index") + # Get aggregated counts of biolink types + query_object['aggs'] = {'type-count': {'terms': {'field': 'type'}}} + # Add post_filter on types + if types: + assert type(types) == list + query_object['post_filter'] = { + "bool": { + "should": [ + {'term': {'type': {'value': t}}} for t in types + ], + "minimum_should_match": 1 + } + } search_results = await self.es.search( index="concepts_index", - body=body, - filter_path=['hits.hits._id', 'hits.hits._type', 'hits.hits._source', 'hits.hits._score'], + body=json.dumps(query_object), + filter_path=['hits.hits._id', 'hits.hits._type', + 'hits.hits._source', 'hits.hits._score', + 'aggregations'], 
from_=offset, size=size ) + aggregations = search_results.pop('aggregations') + concept_types = { + bucket['key']: bucket['doc_count'] for bucket in + aggregations['type-count']['buckets'] + } search_results.update({'total_items': total_items['count']}) + search_results['concept_types'] = concept_types return search_results async def search_variables(self, concept="", query="", size=None, data_type=None, offset=0, fuzziness=1, diff --git a/src/dug/server.py b/src/dug/server.py index 00a477d9..fde7e5a0 100644 --- a/src/dug/server.py +++ b/src/dug/server.py @@ -26,6 +26,7 @@ class SearchConceptQuery(BaseModel): index: str = "concepts_index" offset: int = 0 size: int = 20 + types: list = None class SearchVariablesQuery(BaseModel): query: str @@ -40,10 +41,8 @@ class SearchKgQuery(BaseModel): index: str = "kg_index" size:int = 100 - search = Search(Config.from_env()) - @APP.on_event("shutdown") def shutdown_event(): asyncio.run(search.es.close()) @@ -101,4 +100,4 @@ async def search_var(search_query: SearchVariablesQuery): if __name__ == '__main__': - uvicorn.run(APP) \ No newline at end of file + uvicorn.run(APP) diff --git a/tests/integration/test_async_search.py b/tests/integration/test_async_search.py new file mode 100644 index 00000000..0ce6cb5c --- /dev/null +++ b/tests/integration/test_async_search.py @@ -0,0 +1,34 @@ +"Integration tests for the async_search module" + +import asyncio +from unittest import TestCase + +from fastapi.testclient import TestClient +from elasticsearch.exceptions import ConnectionError +class APISearchTestCase(TestCase): + "API search with mocked elasticsearch" + + def test_concepts_types_parameter(self): + "Test API concepts search with types parameter" + # This should patch the elasticsearch object with the mock + from dug.server import APP + client = TestClient(APP) + types = ['anatomical entity', 'drug'] + body = { + "index": "concepts_index", + "query": "brain", + "offset": 0, + "size":20, + "types": types + } + try: + response = 
client.post("/search", json=body) + except ConnectionError: + self.fail("For the integration test, a populated elasticsearch " + "instance must be available and configured in the " + "environment variables. See dug.config for more.") + self.assertEqual(response.status_code, 200) + response_obj = response.json() + response_types = set(hit['_source']['type'] for hit in + response_obj['result']['hits']['hits']) + self.assertEqual(response_types, set(types)) diff --git a/tests/unit/test_async_search.py b/tests/unit/test_async_search.py new file mode 100644 index 00000000..c121ce7c --- /dev/null +++ b/tests/unit/test_async_search.py @@ -0,0 +1,757 @@ +"Unit tests for the async_search module" + +import asyncio +import json +from importlib import reload +from unittest import TestCase, mock +from fastapi.testclient import TestClient + +from dug.core import async_search +from dug.config import Config + +async def _mock_search(*args, **kwargs): + "Mock of elasticsearch search function. Ignores argument" + return _brain_search_result() + +async def _mock_count(*args, **kwargs): + "Mock of elasticsearch count function. 
Ignores argument" + return {'count': 90, '_shards': {'total': 1, 'successful': 1, + 'skipped': 0, 'failed': 0}} +es_mock = mock.AsyncMock() +es_mock.search = _mock_search +es_mock.count = _mock_count + +class SearchTestCase(TestCase): + "Mocked unit tests for async_search" + + def setUp(self): + "Build mock elasticsearch responses" + search_result = _brain_search_result() + self.search = async_search.Search(Config.from_env()) + self.query_body = self.search._build_concepts_query("brain") + self.search.es = es_mock + + def test_concepts_search(self): + "Test async_search concepts search" + result = asyncio.run( + self.search.search_concepts("brain")) + self.assertIn('total_items', result) + self.assertEqual(result['total_items'], 90) + self.assertIn('concept_types', result) + self.assertIsInstance(result['concept_types'], dict) + self.assertEqual(len(result['concept_types']), 9) + self.assertEqual(result['concept_types']['anatomical entity'], 10) + + +brain_result_json = """{ + "hits": { + "hits": [ + { + "_type": "_doc", + "_id": "MONDO:0005560", + "_score": 274.8391, + "_source": { + "id": "MONDO:0005560", + "name": "brain disease", + "description": "A disease affecting the brain or part of the brain.", + "type": "disease", + "search_terms": [ + "brain disease", + "disorder of brain", + "disease of brain", + "disease or disorder of brain", + "brain disease or disorder" + ], + "optional_terms": [ + "alcohol use disorder measurement", + "GBA carrier status", + "systemising measurement", + "Abnormal nervous system physiology", + "Hypoglycemic encephalopathy", + "Nervous System Part", + "linguistic error measurement", + "Brain abscess", + "anatomical entity", + "Phenotypic abnormality", + "time to first cigarette measurement", + "Progressive encephalopathy", + "Epileptic encephalopathy", + "Necrotizing encephalopathy", + "Recurrent encephalopathy", + "alcohol dependence measurement", + "brain disease", + "cognitive inhibition measurement", + "Mitochondrial 
encephalopathy", + "Chronic hepatic encephalopathy", + "cocaine use measurement", + "Nonprogressive encephalopathy", + "Profound static encephalopathy", + "Brain", + "Acute encephalopathy", + "ADHD symptom measurement", + "cannabis dependence measurement", + "Infantile encephalopathy", + "opioid overdose severity measurement", + "delayed reward discounting measurement", + "attention function measurement", + "Herpes simplex encephalitis", + "Abnormality of neuronal migration", + "Acute necrotizing encephalopathy", + "Congenital encephalopathy", + "vascular brain injury measurement", + "Primary microcephaly", + "Central Nervous System Part", + "executive function measurement", + "syntactic complexity measurement" + ], + "concept_action": "", + "identifiers": [ + { + "id": "MONDO:0005560", + "label": "brain disease", + "equivalent_identifiers": [ + "MONDO:0005560", + "DOID:936", + "UMLS:C0006111", + "UMLS:C0085584", + "MESH:D001927", + "MEDDRA:10006120", + "MEDDRA:10014623", + "MEDDRA:10014625", + "MEDDRA:10014636", + "MEDDRA:10014641", + "NCIT:C26920", + "NCIT:C96413", + "SNOMEDCT:81308009", + "ICD10:G93.40", + "ICD10:G93.9", + "ICD9:348.30", + "ICD9:348.9", + "HP:0001298" + ], + "type": [ + "biolink:Disease", + "biolink:DiseaseOrPhenotypicFeature", + "biolink:BiologicalEntity", + "biolink:NamedThing", + "biolink:Entity", + "biolink:ThingWithTaxon" + ], + "synonyms": [ + "brain disease", + "brain disease or disorder", + "disease of brain", + "disease or disorder of brain", + "disorder of brain" + ] + } + ] + } + }, + { + "_type": "_doc", + "_id": "MONDO:0005394", + "_score": 253.45584, + "_source": { + "id": "MONDO:0005394", + "name": "brain infarction", + "description": "Tissue necrosis in any area of the brain, including the cerebral hemispheres, the cerebellum, and the brain stem. Brain infarction is the result of a cascade of events initiated by inadequate blood flow through the brain that is followed by hypoxia and hypoglycemia in brain tissue. 
Damage may be temporary, permanent, selective or pan-necrosis.", + "type": "disease", + "search_terms": [ + "BRAIN INFARCTION" + ], + "optional_terms": [ + "blood vasculature", + "brain infarction" + ], + "concept_action": "", + "identifiers": [ + { + "id": "MONDO:0005394", + "label": "brain infarction", + "equivalent_identifiers": [ + "MONDO:0005394", + "DOID:3454", + "UMLS:C0751955", + "MESH:D020520", + "MEDDRA:10072154" + ], + "type": [ + "biolink:Disease", + "biolink:DiseaseOrPhenotypicFeature", + "biolink:BiologicalEntity", + "biolink:NamedThing", + "biolink:Entity", + "biolink:ThingWithTaxon" + ], + "synonyms": [] + } + ] + } + }, + { + "_type": "_doc", + "_id": "UBERON:0000955", + "_score": 252.57217, + "_source": { + "id": "UBERON:0000955", + "name": "brain", + "description": "The brain is the center of the nervous system in all vertebrate, and most invertebrate, animals. Some primitive animals such as jellyfish and starfish have a decentralized nervous system without a brain, while sponges lack any nervous system at all. 
In vertebrates, the brain is located in the head, protected by the skull and close to the primary sensory apparatus of vision, hearing, balance, taste, and smell[WP].", + "type": "anatomical entity", + "search_terms": [ + "the brain", + "suprasegmental levels of nervous system", + "brain", + "suprasegmental structures", + "synganglion", + "encephalon" + ], + "optional_terms": [], + "concept_action": "", + "identifiers": [ + { + "id": "UBERON:0000955", + "label": "brain", + "equivalent_identifiers": [ + "UBERON:0000955" + ], + "type": [ + "biolink:GrossAnatomicalStructure", + "biolink:AnatomicalEntity", + "biolink:OrganismalEntity", + "biolink:BiologicalEntity", + "biolink:NamedThing", + "biolink:Entity", + "biolink:ThingWithTaxon", + "biolink:PhysicalEssence", + "biolink:PhysicalEssenceOrOccurrent" + ], + "synonyms": [ + "encephalon", + "suprasegmental levels of nervous system", + "suprasegmental structures", + "synganglion", + "the brain" + ] + } + ] + } + }, + { + "_type": "_doc", + "_id": "MONDO:0017998", + "_score": 136.03186, + "_source": { + "id": "MONDO:0017998", + "name": "PLA2G6-associated neurodegeneration", + "description": "Any neurodegeneration with brain iron accumulation in which the cause of the disease is a mutation in the PLA2G6 gene.", + "type": "disease", + "search_terms": [ + "plans", + "neurodegeneration with brain iron accumulation caused by mutation in PLA2G6", + "PLA2G6 neurodegeneration with brain iron accumulation", + "PLAN" + ], + "optional_terms": [], + "concept_action": "", + "identifiers": [ + { + "id": "MONDO:0017998", + "label": "PLA2G6-associated neurodegeneration", + "equivalent_identifiers": [ + "MONDO:0017998", + "ORPHANET:329303" + ], + "type": [ + "biolink:Disease", + "biolink:DiseaseOrPhenotypicFeature", + "biolink:BiologicalEntity", + "biolink:NamedThing", + "biolink:Entity", + "biolink:ThingWithTaxon" + ], + "synonyms": [ + "neurodegeneration with brain iron accumulation caused by mutation in PLA2G6", + "PLA2G6 
neurodegeneration with brain iron accumulation", + "PLAN" + ] + } + ] + } + }, + { + "_type": "_doc", + "_id": "MONDO:0002679", + "_score": 128.80138, + "_source": { + "id": "MONDO:0002679", + "name": "cerebral infarction", + "description": "An ischemic condition of the brain, producing a persistent focal neurological deficit in the area of distribution of the cerebral arteries.", + "type": "disease", + "search_terms": [ + "cerebral infarct", + "infarction, cerebral", + "cerebral infarction", + "CVA - cerebral infarction", + "cerebral ischemia", + "brain infarction of telencephalon", + "telencephalon brain infarction", + "cerebral, infarction" + ], + "optional_terms": [ + "Abnormal nervous system morphology", + "structure with developmental contribution from neural crest", + "stroke outcome severity measurement", + "brain infarction of telencephalon", + "Phenotypic abnormality", + "cerebral, infarction", + "cerebral infarct", + "Abnormal arterial physiology", + "CVA - cerebral infarction", + "telencephalon brain infarction", + "Abnormality of brain morphology", + "Tissue ischemia", + "cerebral infarction", + "Pontine ischemic lacunes", + "cerebral ischemia", + "Abnormal vascular morphology", + "Lacunar stroke", + "Abnormality of the vasculature", + "Abnormal cerebral vascular morphology", + "Abnormal vascular physiology", + "infarction, cerebral", + "Abnormal cardiovascular system physiology", + "Abnormality of cardiovascular system morphology" + ], + "concept_action": "", + "identifiers": [ + { + "id": "MONDO:0002679", + "label": "cerebral infarction", + "equivalent_identifiers": [ + "MONDO:0002679", + "DOID:3526", + "OMIM:601367", + "UMLS:C0007785", + "UMLS:C0948008", + "MESH:D000083242", + "MESH:D002544", + "MEDDRA:10008117", + "MEDDRA:10008118", + "MEDDRA:10021755", + "MEDDRA:10023027", + "MEDDRA:10055221", + "MEDDRA:10061256", + "NCIT:C50486", + "NCIT:C95802", + "SNOMEDCT:422504002", + "SNOMEDCT:432504007", + "ICD10:I63", + "HP:0002140" + ], + "type": [ + 
"biolink:Disease", + "biolink:DiseaseOrPhenotypicFeature", + "biolink:BiologicalEntity", + "biolink:NamedThing", + "biolink:Entity", + "biolink:ThingWithTaxon" + ], + "synonyms": [ + "brain infarction of telencephalon", + "cerebral infarct", + "cerebral infarction", + "cerebral ischemia", + "cerebral, infarction", + "CVA - cerebral infarction", + "infarction, cerebral", + "telencephalon brain infarction" + ] + } + ] + } + }, + { + "_type": "_doc", + "_id": "UBERON:6110636", + "_score": 120.47298, + "_source": { + "id": "UBERON:6110636", + "name": "insect adult cerebral ganglion", + "description": "The pre-oral neuropils of the adult brain located above, around and partially below the esophagus, including the optic lobes. It excludes the gnathal ganglion. Developmentally, it comprises three fused neuromeres: protocerebrum, deutocerebrum, and tritocerebrum.", + "type": "anatomical entity", + "search_terms": [ + "supraesophageal ganglion", + "SPG", + "cerebrum", + "brain", + "CRG" + ], + "optional_terms": [], + "concept_action": "", + "identifiers": [ + { + "id": "UBERON:6110636", + "label": "insect adult cerebral ganglion", + "equivalent_identifiers": [ + "UBERON:6110636" + ], + "type": [ + "biolink:GrossAnatomicalStructure", + "biolink:AnatomicalEntity", + "biolink:OrganismalEntity", + "biolink:BiologicalEntity", + "biolink:NamedThing", + "biolink:Entity", + "biolink:ThingWithTaxon", + "biolink:PhysicalEssence", + "biolink:PhysicalEssenceOrOccurrent" + ], + "synonyms": [ + "CRG", + "SPG", + "brain", + "cerebrum", + "supraesophageal ganglion" + ] + } + ] + } + }, + { + "_type": "_doc", + "_id": "MONDO:0045057", + "_score": 115.8625, + "_source": { + "id": "MONDO:0045057", + "name": "delirium", + "description": "A disorder characterized by confusion; inattentiveness; disorientation; illusions; hallucinations; agitation; and in some instances autonomic nervous system overactivity. It may result from toxic/metabolic conditions or structural brain lesions. 
(From Adams et al., Principles of Neurology, 6th ed, pp411-2)", + "type": "disease", + "search_terms": [ + "delirium", + "OBS syndrome", + "organic brain syndrome" + ], + "optional_terms": [ + "Confusion", + "Abnormality of higher mental function", + "Abnormal nervous system physiology", + "delirium", + "Reduced consciousness/confusion", + "Phenotypic abnormality" + ], + "concept_action": "", + "identifiers": [ + { + "id": "MONDO:0045057", + "label": "delirium", + "equivalent_identifiers": [ + "MONDO:0045057", + "UMLS:C0011206", + "UMLS:C0029221", + "UMLS:C1285577", + "UMLS:C1306588", + "MESH:D003693", + "MEDDRA:10000685", + "MEDDRA:10000693", + "MEDDRA:10000694", + "MEDDRA:10000702", + "MEDDRA:10006150", + "MEDDRA:10012217", + "MEDDRA:10012218", + "MEDDRA:10012219", + "MEDDRA:10031077", + "MEDDRA:10042790", + "NCIT:C2981", + "NCIT:C34868", + "SNOMEDCT:130987000", + "SNOMEDCT:2776000", + "SNOMEDCT:419567006", + "HP:0031258" + ], + "type": [ + "biolink:Disease", + "biolink:DiseaseOrPhenotypicFeature", + "biolink:BiologicalEntity", + "biolink:NamedThing", + "biolink:Entity", + "biolink:ThingWithTaxon" + ], + "synonyms": [ + "OBS syndrome", + "organic brain syndrome" + ] + } + ] + } + }, + { + "_type": "_doc", + "_id": "UBERON:0002298", + "_score": 90.43253, + "_source": { + "id": "UBERON:0002298", + "name": "brainstem", + "description": "Stalk-like part of the brain that includes amongst its parts the medulla oblongata of the hindbrain and the tegmentum of the midbrain[ZFA,MP,generalized].", + "type": "anatomical entity", + "search_terms": [ + "truncus encephalicus", + "truncus encephali", + "lamella pallidi incompleta", + "accessory medullary lamina of pallidum", + "lamina pallidi incompleta", + "lamina medullaris incompleta pallidi", + "brain stem", + "brainstem", + "lamina medullaris accessoria" + ], + "optional_terms": [], + "concept_action": "", + "identifiers": [ + { + "id": "UBERON:0002298", + "label": "brainstem", + "equivalent_identifiers": [ + 
"UBERON:0002298", + "UMLS:C0006121", + "NCIT:C12441" + ], + "type": [ + "biolink:GrossAnatomicalStructure", + "biolink:AnatomicalEntity", + "biolink:OrganismalEntity", + "biolink:BiologicalEntity", + "biolink:NamedThing", + "biolink:Entity", + "biolink:ThingWithTaxon", + "biolink:PhysicalEssence", + "biolink:PhysicalEssenceOrOccurrent" + ], + "synonyms": [ + "brain stem", + "truncus encephali", + "accessory medullary lamina of pallidum", + "lamella pallidi incompleta", + "lamina medullaris accessoria", + "lamina medullaris incompleta pallidi", + "lamina pallidi incompleta", + "truncus encephalicus" + ] + } + ] + } + }, + { + "_type": "_doc", + "_id": "UBERON:0001894", + "_score": 73.00175, + "_source": { + "id": "UBERON:0001894", + "name": "diencephalon", + "description": "The division of the forebrain that develops from the foremost primary cerebral vesicle.", + "type": "anatomical entity", + "search_terms": [ + "mature diencephalon", + "thalamencephalon", + "between brain", + "interbrain", + "betweenbrain", + "diencephalon", + "died." + ], + "optional_terms": [], + "concept_action": "", + "identifiers": [ + { + "id": "UBERON:0001894", + "label": "diencephalon", + "equivalent_identifiers": [ + "UBERON:0001894", + "UMLS:C0012144" + ], + "type": [ + "biolink:GrossAnatomicalStructure", + "biolink:AnatomicalEntity", + "biolink:OrganismalEntity", + "biolink:BiologicalEntity", + "biolink:NamedThing", + "biolink:Entity", + "biolink:ThingWithTaxon", + "biolink:PhysicalEssence", + "biolink:PhysicalEssenceOrOccurrent" + ], + "synonyms": [ + "between brain", + "interbrain", + "mature diencephalon", + "thalamencephalon", + "diencephalon", + "betweenbrain" + ] + } + ] + } + }, + { + "_type": "_doc", + "_id": "MONDO:0013792", + "_score": 69.71182, + "_source": { + "id": "MONDO:0013792", + "name": "intracerebral hemorrhage", + "description": "Bleeding into one or both cerebral hemispheres including the basal ganglia and the cerebral cortex. 
It is often associated with hypertension and craniocerebral trauma.", + "type": "disease", + "search_terms": [ + "stroke, hemorrhagic", + "'bleeding in brain'", + "hemorrhage, intracerebral, susceptibility to", + "ich", + "stroke, hemorrhagic, susceptibility to" + ], + "optional_terms": [ + "Abnormality of the musculoskeletal system", + "Abnormal nervous system morphology", + "Abnormality of head or neck", + "Recurrent cerebral hemorrhage", + "Intraventricular hemorrhage", + "Phenotypic abnormality", + "Grade I preterm intraventricular hemorrhage", + "stroke, hemorrhagic, susceptibility to", + "Hemorrhage", + "Intraventricular Hemorrhage Related to Birth", + "intracerebral hemorrhage", + "Antenatal intracerebral hemorrhage", + "Periventricular Hemorrhage of the Newborn", + "stroke, hemorrhagic", + "ich", + "Abnormal bleeding", + "Abnormality of brain morphology", + "Internal hemorrhage", + "Cerebral Hemorrhage Related to Birth", + "Finding", + "Abnormality of the skeletal system", + "Intraparenchymal Hemorrhage of the Newborn", + "Abnormal vascular morphology", + "Abnormality of the vasculature", + "Abnormal cerebral vascular morphology", + "Finding by Cause", + "Abnormality of the head", + "Abnormality of cardiovascular system morphology", + "Intraventricular Hemorrhage with Parenchymal Hemorrhage of the Newborn", + "Abnormal cardiovascular system physiology", + "Abnormality of blood circulation", + "hemorrhage, intracerebral, susceptibility to" + ], + "concept_action": "", + "identifiers": [ + { + "id": "MONDO:0013792", + "label": "intracerebral hemorrhage", + "equivalent_identifiers": [ + "MONDO:0013792", + "OMIM:614519", + "UMLS:C0472369", + "UMLS:C0553692", + "UMLS:C1862876", + "UMLS:C2937358", + "UMLS:C3281105", + "UMLS:C5234922", + "MESH:D000083302", + "MESH:D002543", + "MEDDRA:10008111", + "MEDDRA:10008114", + "MEDDRA:10018972", + "MEDDRA:10019005", + "MEDDRA:10019016", + "MEDDRA:10019529", + "MEDDRA:10019531", + "MEDDRA:10019551", + "MEDDRA:10022737", + 
"MEDDRA:10022751", + "MEDDRA:10022753", + "MEDDRA:10022754", + "MEDDRA:10048863", + "MEDDRA:10055278", + "MEDDRA:10055293", + "MEDDRA:10055800", + "MEDDRA:10055815", + "MEDDRA:10071793", + "MEDDRA:10077620", + "MEDDRA:10077622", + "NCIT:C50485", + "NCIT:C95803", + "SNOMEDCT:230706003", + "SNOMEDCT:274100004", + "HP:0001342" + ], + "type": [ + "biolink:Disease", + "biolink:DiseaseOrPhenotypicFeature", + "biolink:BiologicalEntity", + "biolink:NamedThing", + "biolink:Entity", + "biolink:ThingWithTaxon" + ], + "synonyms": [ + "stroke, hemorrhagic", + "hemorrhage, intracerebral, susceptibility to", + "ich", + "stroke, hemorrhagic, susceptibility to" + ] + } + ] + } + } + ] + }, + "aggregations": { + "type-count": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "phenotype", + "doc_count": 36 + }, + { + "key": "disease", + "doc_count": 28 + }, + { + "key": "anatomical entity", + "doc_count": 10 + }, + { + "key": "TOPMed Phenotype Concept", + "doc_count": 8 + }, + { + "key": "drug", + "doc_count": 3 + }, + { + "key": "", + "doc_count": 2 + }, + { + "key": "biological process", + "doc_count": 1 + }, + { + "key": "clinical_course", + "doc_count": 1 + }, + { + "key": "molecular entity", + "doc_count": 1 + } + ] + } + } +} +""" + +def _brain_search_result(): + """Stuck in a function just so I can shove it down here at the end + of the test module""" + return json.loads(brain_result_json) From 4fc13f4e57287b1cf64ebc2c97921f7dc0950fd4 Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 26 Jul 2023 12:13:02 -0400 Subject: [PATCH 2/6] adds reconnect to timed out ftp session --- bin/get_dbgap_data_dicts.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/bin/get_dbgap_data_dicts.py b/bin/get_dbgap_data_dicts.py index 00b42cb5..fd3d3247 100644 --- a/bin/get_dbgap_data_dicts.py +++ b/bin/get_dbgap_data_dicts.py @@ -30,6 +30,7 @@ def download_dbgap_study(dbgap_accession_id, dbgap_output_dir): ftp = 
FTP('ftp.ncbi.nlm.nih.gov') ftp.login() + ftp.sendcmd('PASV') study_variable = dbgap_accession_id.split('.')[0] # The output directory already includes the study accession number. @@ -74,7 +75,19 @@ def download_dbgap_study(dbgap_accession_id, dbgap_output_dir): # Step 2: Check to see if there's a GapExchange file in the parent folder # and if there is, get it. - ftp.cwd(study_id_path) + + import ftplib + try: + ftp.cwd(study_id_path) + except ftplib.error_temp as e: + logging.error("Ftp session timed out... Reconnecting") + ftp.login() + resp = ftp.cwd(study_id_path) + if resp[:1] == '2': + logging.info("command success") + + + ftp_filelist = ftp.nlst(".") for ftp_filename in ftp_filelist: if 'GapExchange' in ftp_filename: @@ -87,7 +100,7 @@ def download_dbgap_study(dbgap_accession_id, dbgap_output_dir): @click.command() @click.argument('input_file', type=click.File('r')) @click.option('--format', help='The format of the input file.', type=click.Choice(['CSV', 'TSV']), default='TSV') -@click.option('--field', help='The field name containing dbGaP study IDs or accession IDs.', default='dbgap_study_accession', type=str, multiple=True) +@click.option('--field', help='The field name containing dbGaP study IDs or accession IDs.', default=['dbgap_study_accession'], type=str, multiple=True) @click.option('--outdir', help='The output directory to create and write dbGaP files to.', type=click.Path(file_okay=False, dir_okay=True, exists=False), default='data/dbgap') @click.option('--group-by', help='Create subdirectories for the specified fields.', type=str, multiple=True) @click.option('--skip', help='dbGaP identifier to skip when downloading.', type=str, multiple=True) From ee80fff783c504eb645395cdcb48e1a6c7ee21ae Mon Sep 17 00:00:00 2001 From: YaphetKG Date: Wed, 26 Jul 2023 12:15:53 -0400 Subject: [PATCH 3/6] adds reconnect to timed out ftp session --- bin/get_dbgap_data_dicts.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git 
a/bin/get_dbgap_data_dicts.py b/bin/get_dbgap_data_dicts.py index fd3d3247..a21d7f72 100644 --- a/bin/get_dbgap_data_dicts.py +++ b/bin/get_dbgap_data_dicts.py @@ -7,7 +7,7 @@ import logging import os import shutil -from ftplib import FTP, error_perm +from ftplib import FTP, error_perm, error_temp import csv import click import requests @@ -76,18 +76,15 @@ def download_dbgap_study(dbgap_accession_id, dbgap_output_dir): # Step 2: Check to see if there's a GapExchange file in the parent folder # and if there is, get it. - import ftplib try: ftp.cwd(study_id_path) - except ftplib.error_temp as e: + except error_temp as e: logging.error("Ftp session timed out... Reconnecting") ftp.login() resp = ftp.cwd(study_id_path) if resp[:1] == '2': logging.info("command success") - - ftp_filelist = ftp.nlst(".") for ftp_filename in ftp_filelist: if 'GapExchange' in ftp_filename: From b7a08e1c6a67b7ec7fd529ce682ddbd1f606a69a Mon Sep 17 00:00:00 2001 From: YaphetKG <45075777+YaphetKG@users.noreply.github.com> Date: Wed, 26 Jul 2023 13:30:56 -0400 Subject: [PATCH 4/6] Ftp fixes (#303) * adds reconnect to timed out ftp session * adds reconnect to timed out ftp session * remove PASV command From a25489699c8351567abf6e35d10cbca942be29cc Mon Sep 17 00:00:00 2001 From: Griffin Roupe <31631417+frostyfan109@users.noreply.github.com> Date: Mon, 31 Jul 2023 09:09:05 -0400 Subject: [PATCH 5/6] Add explain to concept search results (#301) --- src/dug/core/async_search.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/dug/core/async_search.py b/src/dug/core/async_search.py index bcca8797..85c66997 100644 --- a/src/dug/core/async_search.py +++ b/src/dug/core/async_search.py @@ -201,9 +201,10 @@ async def search_concepts(self, query, offset=0, size=None, fuzziness=1, prefix_ search_results = await self.es.search( index="concepts_index", body=body, - filter_path=['hits.hits._id', 'hits.hits._type', 'hits.hits._source', 'hits.hits._score'], + 
filter_path=['hits.hits._id', 'hits.hits._type', 'hits.hits._source', 'hits.hits._score', 'hits.hits._explanation'], from_=offset, - size=size + size=size, + explain=True ) search_results.update({'total_items': total_items['count']}) return search_results From d860f2b6a3154b37ffd2a0cedc2a81cb68471959 Mon Sep 17 00:00:00 2001 From: "Michael T. Bacon" Date: Mon, 31 Jul 2023 15:02:37 -0400 Subject: [PATCH 6/6] Reverted unintentional commit to .env, changed parameter submitted to ES from json str to dict --- .env | 4 ++-- src/dug/core/async_search.py | 15 ++++++++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/.env b/.env index a6ee42cb..d313eafa 100644 --- a/.env +++ b/.env @@ -1,14 +1,14 @@ DATA_DIR=./local_storage ELASTIC_PASSWORD=15707 -ELASTIC_API_HOST=localhost +ELASTIC_API_HOST=elasticsearch ELASTIC_USERNAME=elastic NBOOST_API_HOST=nboost REDIS_PASSWORD=15707 -REDIS_HOST=localhost +REDIS_HOST=redis REDIS_PORT=6379 API_WORKERS=4 diff --git a/src/dug/core/async_search.py b/src/dug/core/async_search.py index 0e90e044..f6272d24 100644 --- a/src/dug/core/async_search.py +++ b/src/dug/core/async_search.py @@ -202,16 +202,17 @@ async def search_concepts(self, query, offset=0, size=None, types=None, """ Changed to a long boolean match query to optimize search results """ - query_object = {'query': self._build_concepts_query(query, **kwargs)} + query_dict = self._build_concepts_query(query, **kwargs) total_items = await self.es.count( - body=json.dumps(query_object), + body={"query": query_dict}, index="concepts_index") # Get aggregated counts of biolink types - query_object['aggs'] = {'type-count': {'terms': {'field': 'type'}}} + search_body = {"query": query_dict} + search_body['aggs'] = {'type-count': {'terms': {'field': 'type'}}} # Add post_filter on types if types: assert type(types) == list - query_object['post_filter'] = { + search_body['post_filter'] = { "bool": { "should": [ {'term': {'type': {'value': t}}} for t in types @@ -221,13
+222,17 @@ async def search_concepts(self, query, offset=0, size=None, types=None, } search_results = await self.es.search( index="concepts_index", - body=json.dumps(query_object), + body=search_body, filter_path=['hits.hits._id', 'hits.hits._type', 'hits.hits._source', 'hits.hits._score', 'aggregations'], from_=offset, size=size ) + + # Simplify the data structure we get from aggregations to put into the + # return value. This should be a count of documents hit for every type + # in the search results. aggregations = search_results.pop('aggregations') concept_types = { bucket['key']: bucket['doc_count'] for bucket in