diff --git a/data/bdc_dbgap_data_dicts.tar.gz b/data/bdc_dbgap_data_dicts.tar.gz
index adf46cb3..8671c4e2 100644
Binary files a/data/bdc_dbgap_data_dicts.tar.gz and b/data/bdc_dbgap_data_dicts.tar.gz differ
diff --git a/data/redis/appendonly.aof b/data/redis/appendonly.aof
deleted file mode 100644
index 386fca61..00000000
Binary files a/data/redis/appendonly.aof and /dev/null differ
diff --git a/requirements.txt b/requirements.txt
index e80c1f1f..d2ea252d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
elasticsearch[async]==7.16.3
fastapi==0.95.0
-uvicorn
+uvicorn==0.23.2
gunicorn
itsdangerous
Jinja2
@@ -19,5 +19,6 @@ six==1.16.0
# Click for command line arguments
# We use Click 7.0 because that's what one of the pinned packages above use.
-click~=7.0
+click
httpx>=0.24.1
+bmt==1.1.0
\ No newline at end of file
diff --git a/src/dug/config.py b/src/dug/config.py
index 6c40e8b2..180544a5 100644
--- a/src/dug/config.py
+++ b/src/dug/config.py
@@ -39,12 +39,12 @@ class Config:
# Normalizer config that will be passed to annotate.Normalizer constructor
normalizer: dict = field(default_factory=lambda: {
- "url": "https://nodenormalization-sri.renci.org/get_normalized_nodes?conflate=false&curie="
+ "url": "https://nodenormalization-dev.apps.renci.org/get_normalized_nodes?conflate=false&description=true&curie="
})
# Synonym service config that will be passed to annotate.SynonymHelper constructor
synonym_service: dict = field(default_factory=lambda: {
- "url": "https://onto.renci.org/synonyms/"
+ "url": "https://name-resolution-sri.renci.org/reverse_lookup"
})
# Ontology metadata helper config that will be passed to annotate.OntologyHelper constructor
@@ -59,7 +59,9 @@ class Config:
"disease": ["disease", "phenotypic_feature"],
"pheno": ["phenotypic_feature", "disease"],
"anat": ["disease", "anatomical_entity"],
- "chem_to_disease": ["chemical_substance", "disease"],
+ "chem_to_disease": ["chemical_entity", "disease"],
+ "small_molecule_to_disease": ["small_molecule", "disease"],
+ "chemical_mixture_to_disease": ["chemical_mixture", "disease"],
"phen_to_anat": ["phenotypic_feature", "anatomical_entity"],
})
diff --git a/src/dug/core/annotate.py b/src/dug/core/annotate.py
index 91976f75..5f7977c3 100644
--- a/src/dug/core/annotate.py
+++ b/src/dug/core/annotate.py
@@ -1,8 +1,10 @@
import json
import logging
import os
+import re
import urllib.parse
from typing import TypeVar, Generic, Union, List, Tuple, Optional
+import bmt
import requests
from requests import Session
@@ -59,14 +61,12 @@ def __init__(
annotator: "Annotator",
normalizer: "Normalizer",
synonym_finder: "SynonymFinder",
- ontology_helper: "OntologyHelper",
ontology_greenlist=[],
):
self.preprocessor = preprocessor
self.annotator = annotator
self.normalizer = normalizer
self.synonym_finder = synonym_finder
- self.ontology_helper = ontology_helper
self.ontology_greenlist = ontology_greenlist
self.norm_fails_file = "norm_fails.txt"
self.anno_fails_file = "anno_fails.txt"
@@ -106,12 +106,6 @@ def annotate(self, text, http_session):
# Add synonyms to identifier
norm_id.synonyms = self.synonym_finder.get_synonyms(norm_id.id, http_session)
- # Get canonical label, name, and description from ontology metadata service
- name, desc, ontology_type = self.ontology_helper.get_ontology_info(norm_id.id, http_session)
- norm_id.label = name
- norm_id.description = desc
- norm_id.type = ontology_type
-
# Get pURL for ontology identifer for more info
norm_id.purl = BioLinkPURLerizer.get_curie_purl(norm_id.id)
processed_identifiers.append(norm_id)
@@ -335,6 +329,7 @@ def handle_response(self, value, response: dict) -> List[Identifier]:
class Normalizer(ApiClient[Identifier, Identifier]):
def __init__(self, url):
+ self.bl_toolkit = bmt.Toolkit()
self.url = url
def normalize(self, identifier: Identifier, http_session: Session):
@@ -380,9 +375,13 @@ def handle_response(self, identifier: Identifier, normalized: dict) -> Optional[
logger.debug(f"Preferred id: {preferred_id}")
identifier.id = preferred_id.get('identifier', '')
identifier.label = preferred_id.get('label', '')
- identifier.equivalent_identifiers = [v['identifier'] for v in equivalent_identifiers]
- identifier.types = biolink_type
-
+ identifier.description = preferred_id.get('description', '')
+ identifier.equivalent_identifiers = [v['identifier'] for v in equivalent_identifiers]
+ try:
+ identifier.types = self.bl_toolkit.get_element(biolink_type[0]).name
+ except:
+ # converts biolink:SmallMolecule to small molecule
+ identifier.types = (" ".join(re.split("(?=[A-Z])", biolink_type[0].replace('biolink:', ''))[1:])).lower()
return identifier
@@ -400,51 +399,31 @@ def get_synonyms(self, curie: str, http_session):
return self(curie, http_session)
def make_request(self, curie: str, http_session: Session):
-
- # Get response from synonym service
- url = f"{self.url}{urllib.parse.quote(curie)}"
-
+ # Get response from namelookup reverse lookup op
+ # example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post)
+ url = f"{self.url}"
+ payload = {
+ 'curies': [curie]
+ }
try:
- response = http_session.get(url)
- if response.status_code == 400:
- logger.error(f"No synonyms returned for: `{curie}`. Validation error.")
- return []
- if response.status_code == 500:
- logger.error(f"No synonyms returned for: `{curie}`. Internal server error from {self.url}.")
- return []
+ response = http_session.post(url, json=payload)
+ if str(response.status_code).startswith('4'):
+ logger.error(f"No synonyms returned for: `{curie}`. Validation error: {response.text}")
+ return {curie: []}
+ if str(response.status_code).startswith('5'):
+ logger.error(f"No synonyms returned for: `{curie}`. Internal server error from {self.url}. Error: {response.text}")
+ return {curie: []}
return response.json()
except json.decoder.JSONDecodeError as e:
logger.error(f"Json parse error for response from `{url}`. Exception: {str(e)}")
- return []
+ return {curie: []}
def handle_response(self, curie: str, raw_synonyms: List[dict]) -> List[str]:
- # List comprehension unpack all synonyms into a list
- return [synonym['desc'] for synonym in raw_synonyms]
+ # Return curie synonyms
+ return raw_synonyms.get(curie, [])
-class OntologyHelper(ApiClient[str, Tuple[str, str, str]]):
- def __init__(self, url):
- self.url = url
- def make_request(self, curie: str, http_session: Session):
- url = f"{self.url}{urllib.parse.quote(curie)}"
- try:
- response = http_session.get(url).json()
- return response
- except json.decoder.JSONDecodeError as e:
- logger.error(f"No labels returned for: {curie}")
- return {}
-
- def handle_response(self, curie: str, response: dict) -> Tuple[str,str,str]:
- # List comprehension for synonyms
- name = response.get('label', '')
- description = '' if not response.get('description', None) else response.get('description', '')
- ontology_type = '' if not response.get('category', None) else response.get('category', '')[0]
-
- return name, description, ontology_type
-
- def get_ontology_info(self, curie, http_session):
- return self(curie, http_session)
class BioLinkPURLerizer:
diff --git a/src/dug/core/async_search.py b/src/dug/core/async_search.py
index 1f3d5872..6946fb74 100644
--- a/src/dug/core/async_search.py
+++ b/src/dug/core/async_search.py
@@ -212,15 +212,10 @@ async def search_concepts(self, query, offset=0, size=None, types=None,
Changed to a long boolean match query to optimize search results
"""
query_dict = self._build_concepts_query(query, **kwargs)
- total_items = await self.es.count(
- body={"query": query_dict},
- index="concepts_index")
# Get aggregated counts of biolink types
search_body = {"query": query_dict}
search_body['aggs'] = {'type-count': {'terms': {'field': 'type'}}}
- # Add post_filter on types
- if types:
- assert isinstance(types, list)
+ if isinstance(types, list):
search_body['post_filter'] = {
"bool": {
"should": [
@@ -239,6 +234,18 @@ async def search_concepts(self, query, offset=0, size=None, types=None,
size=size,
explain=True
)
+ # Aggs/post_filter aren't supported by count
+ del search_body["aggs"]
+ if "post_filter" in search_body:
+ # We'll move the post_filter into the actual filter
+ search_body["query"]["bool"]["filter"]["bool"].update(
+ search_body["post_filter"]["bool"]
+ )
+ del search_body["post_filter"]
+ total_items = await self.es.count(
+ body=search_body,
+ index="concepts_index"
+ )
# Simplify the data structure we get from aggregations to put into the
# return value. This should be a count of documents hit for every type
diff --git a/src/dug/core/factory.py b/src/dug/core/factory.py
index 48097f19..9fdba4dc 100644
--- a/src/dug/core/factory.py
+++ b/src/dug/core/factory.py
@@ -4,8 +4,12 @@
from requests_cache import CachedSession
import dug.core.tranql as tql
-from dug.core.annotate import DugAnnotator, Annotator, Normalizer, OntologyHelper, Preprocessor, SynonymFinder, \
- ConceptExpander
+from dug.core.annotate import (DugAnnotator,
+ Annotator,
+ Normalizer,
+ Preprocessor,
+ SynonymFinder,
+ ConceptExpander)
from dug.config import Config as DugConfig, TRANQL_SOURCE
from dug.core.crawler import Crawler
from dug.core.parsers import Parser
@@ -53,14 +57,12 @@ def build_annotator(self) -> DugAnnotator:
annotator = Annotator(**self.config.annotator)
normalizer = Normalizer(**self.config.normalizer)
synonym_finder = SynonymFinder(**self.config.synonym_service)
- ontology_helper = OntologyHelper(**self.config.ontology_helper)
annotator = DugAnnotator(
preprocessor=preprocessor,
annotator=annotator,
normalizer=normalizer,
- synonym_finder=synonym_finder,
- ontology_helper=ontology_helper
+ synonym_finder=synonym_finder
)
return annotator
diff --git a/src/dug/core/index.py b/src/dug/core/index.py
index a6711a15..78a723b6 100644
--- a/src/dug/core/index.py
+++ b/src/dug/core/index.py
@@ -26,6 +26,7 @@ def __init__(self, cfg: Config, indices=None):
self.es = Elasticsearch(hosts=self.hosts,
http_auth=(self._cfg.elastic_username, self._cfg.elastic_password))
+ self.replicas = self.get_es_node_count()
if self.es.ping():
logger.info('connected to elasticsearch')
@@ -36,6 +37,10 @@ def __init__(self, cfg: Config, indices=None):
raise SearchException(
message='failed to connect to elasticsearch',
details=f"connecting to host {self._cfg.elastic_host} and port {self._cfg.elastic_port}")
+
+ def get_es_node_count(self):
+ return self.es.nodes.info()["_nodes"]["total"]
+
def init_indices(self):
# The concepts and variable indices include an analyzer that utilizes the english
@@ -49,7 +54,7 @@ def init_indices(self):
kg_index = {
"settings": {
"number_of_shards": 1,
- "number_of_replicas": 0
+ "number_of_replicas": self.replicas
},
"mappings": {
"properties": {
@@ -66,7 +71,7 @@ def init_indices(self):
"settings": {
"index.mapping.coerce": "false",
"number_of_shards": 1,
- "number_of_replicas": 0,
+ "number_of_replicas": self.replicas,
"analysis": {
"analyzer": {
"std_with_stopwords": {
@@ -104,7 +109,7 @@ def init_indices(self):
"settings": {
"index.mapping.coerce": "false",
"number_of_shards": 1,
- "number_of_replicas": 0,
+ "number_of_replicas": self.replicas,
"analysis": {
"analyzer": {
"std_with_stopwords": {
@@ -148,6 +153,11 @@ def init_indices(self):
for index in self.indices:
try:
if self.es.indices.exists(index=index):
+ # if index exists check if replication is good
+ index_replicas = self.es.indices.get_settings(index=index)[index]["settings"]["index"]["number_of_replicas"]
+ if index_replicas != self.replicas:
+ self.es.indices.put_settings(index=index, body={"number_of_replicas": (self.replicas - 1) or 1 })
+ self.es.indices.refresh(index=index)
logger.info(f"Ignoring index {index} which already exists.")
else:
result = self.es.indices.create(
diff --git a/src/dug/core/parsers/dbgap_parser.py b/src/dug/core/parsers/dbgap_parser.py
index b01432c4..2926b1f1 100644
--- a/src/dug/core/parsers/dbgap_parser.py
+++ b/src/dug/core/parsers/dbgap_parser.py
@@ -1,9 +1,10 @@
import logging
-import re
+import re, os
from typing import List
from xml.etree import ElementTree as ET
from dug import utils as utils
+from pathlib import Path
from ._base import DugElement, FileParser, Indexable, InputFile
logger = logging.getLogger('dug')
@@ -13,27 +14,49 @@ class DbGaPParser(FileParser):
# Class for parsers DBGaP Data dictionary into a set of Dug Elements
@staticmethod
- def parse_study_name_from_filename(filename: str):
+ def parse_study_name_from_filename(filename: str) -> str:
# Parse the study name from the xml filename if it exists. Return None if filename isn't right format to get id from
dbgap_file_pattern = re.compile(r'.*/*phs[0-9]+\.v[0-9]+\.pht[0-9]+\.v[0-9]+\.(.+)\.data_dict.*')
match = re.match(dbgap_file_pattern, filename)
if match is not None:
return match.group(1)
return None
+
+ @staticmethod
+ def parse_study_name_from_gap_exchange_file(filepath: Path) -> str:
+ # Parse the study name from the GapExchange file adjacent to the file passed in
+ parent_dir = filepath.parent.absolute()
+ gap_exchange_filename_str = "GapExchange_" + parent_dir.name
+ gap_exchange_filepath = None
+ for item in os.scandir(parent_dir):
+ if item.is_file and gap_exchange_filename_str in item.name:
+ gap_exchange_filepath = item.path
+ if gap_exchange_filepath is None:
+ return None
+ tree = ET.parse(gap_exchange_filepath, ET.XMLParser(encoding='iso-8859-5'))
+ tree_root = tree.getroot()
+ return tree_root.find("./Studies/Study/Configuration/StudyNameEntrez").text
+
def _get_element_type(self):
return "DbGaP"
def __call__(self, input_file: InputFile) -> List[Indexable]:
logger.debug(input_file)
+ if "GapExchange" in str(input_file).split("/")[-1]:
+ msg = f"Skipping parsing for GapExchange file: {input_file}!"
+ logger.info(msg)
+ return []
tree = ET.parse(input_file, ET.XMLParser(encoding='iso-8859-5'))
root = tree.getroot()
study_id = root.attrib['study_id']
participant_set = root.get('participant_set','0')
- # Parse study name from file handle
- study_name = self.parse_study_name_from_filename(str(input_file))
-
+ # Parse study name from GapExchange file, and if that fails try from file handle
+ # If still None, raise an error message
+ study_name = self.parse_study_name_from_gap_exchange_file(Path(input_file))
+ if study_name is None:
+ study_name = self.parse_study_name_from_filename(str(input_file))
if study_name is None:
err_msg = f"Unable to parse DbGaP study name from data dictionary: {input_file}!"
logger.error(err_msg)
diff --git a/tests/integration/data/phs001252.v1.p1/GapExchange_phs001252.v1.p1.xml b/tests/integration/data/phs001252.v1.p1/GapExchange_phs001252.v1.p1.xml
new file mode 100644
index 00000000..33f615cb
--- /dev/null
+++ b/tests/integration/data/phs001252.v1.p1/GapExchange_phs001252.v1.p1.xml
@@ -0,0 +1,390 @@
+
+
+ Title Name Institute
+ Principal Investigator Jorgen Vestbo, Professor the University of Manchester, Manchester, UK
+ Principal Investigator Edwin K. Silverman, MD, PhD Brigham and Women 's Hospital, Boston, MA
+ ECLIPSE Investigators Y. Ivanov Pleven, Bulgaria
+ ECLIPSE Investigators K. Kostov Sofia, Bulgaria
+
+ ]]>ECLIPSE Investigators J. Bourbeau Montreal, Canada