Merge pull request #368 from helxplatform/develop
Next release
YaphetKG authored Aug 14, 2024
2 parents 1db2225 + 3ea7813 commit 5f7553d
Showing 19 changed files with 1,007 additions and 674 deletions.
6 changes: 2 additions & 4 deletions Dockerfile
@@ -3,15 +3,13 @@
# A container for the core semantic-search capability.
#
######################################################
FROM python:3.12.1-alpine3.19
FROM python:alpine3.20


# Install required packages
RUN apk update && \
apk add g++ make
apk add g++ make

#upgrade openssl \
#RUN apk add openssl=3.1.4-r4

RUN pip install --upgrade pip
# Create a non-root user.
25 changes: 24 additions & 1 deletion bin/get_bdc_studies_from_gen3.py
@@ -105,11 +105,12 @@ def get_bdc_studies_from_gen3(output, bdc_gen3_base_url):
sorted_study_ids = sorted(discovery_list)

# Step 2. For every study ID, write out an entry into the CSV output file.
csv_writer = csv.DictWriter(output, fieldnames=['Accession', 'Consent', 'Study Name', 'Last modified', 'Notes', 'Description'])
csv_writer = csv.DictWriter(output, fieldnames=['Accession', 'Consent', 'Study Name', 'Program', 'Last modified', 'Notes', 'Description'])
csv_writer.writeheader()
for study_id in sorted_study_ids:
# Reset the variables we need.
study_name = ''
program_names = []
description = ''
notes = ''

@@ -139,6 +140,24 @@ def get_bdc_studies_from_gen3(output, bdc_gen3_base_url):
else:
study_name = '(no name)'

# Program name.
if 'authz' in gen3_discovery:
# authz is in the format /programs/topmed/projects/ECLIPSE_DS-COPD-MDS-RD
match = re.fullmatch(r'^/programs/(.*)/projects/(.*)$', gen3_discovery['authz'])
if match:
program_names.append(match.group(1))
# study_short_name = match.group(2)

# Tags don't seem as fine-grained as authz and are often slightly different from the authz values
# (e.g. `COVID 19` instead of `COVID-19`, `Parent` instead of `parent`), so for now we only use the authz
# values.
#
# if 'tags' in gen3_discovery:
# for tag in gen3_discovery['tags']:
# category = tag.get('category', '')
# if category.lower() == 'program':
# program_names.append(tag.get('name', '').strip())

# Description.
description = gen3_discovery.get('study_description', '')

@@ -156,11 +175,15 @@ def get_bdc_studies_from_gen3(output, bdc_gen3_base_url):
accession = study_id
consent = ''

# Remove any blank program names.
program_names = filter(lambda n: n != '', program_names)

csv_writer.writerow({
'Accession': accession,
'Consent': consent,
'Study Name': study_name,
'Description': description,
'Program': '|'.join(sorted(set(program_names))),
'Last modified': last_modified,
'Notes': notes.strip()
})
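For context on the new Program column: the added code derives program names from the Gen3 `authz` value with a regular expression. A minimal standalone sketch of that extraction, assuming `authz` is a single string in the format shown in the diff comment (the example value comes from that comment):

    import re

    # Example authz value, taken from the comment in the diff above.
    authz = '/programs/topmed/projects/ECLIPSE_DS-COPD-MDS-RD'

    program_names = []
    match = re.fullmatch(r'^/programs/(.*)/projects/(.*)$', authz)
    if match:
        # group(1) is the program; group(2) would be the project/study short name.
        program_names.append(match.group(1))

    # Mirror the row-writing step: blank names removed, de-duplicated, pipe-joined.
    program_names = filter(lambda n: n != '', program_names)
    print('|'.join(sorted(set(program_names))))  # topmed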
36 changes: 30 additions & 6 deletions bin/get_dbgap_data_dicts.py
@@ -16,6 +16,10 @@
# Default to logging at the INFO level.
logging.basicConfig(level=logging.INFO)

# FTP timeout in seconds
FTP_TIMEOUT = 100


# Helper function
def download_dbgap_study(dbgap_accession_id, dbgap_output_dir):
"""
@@ -28,7 +32,7 @@ def download_dbgap_study(dbgap_accession_id, dbgap_output_dir):

count_downloaded_vars = 0

ftp = FTP('ftp.ncbi.nlm.nih.gov')
ftp = FTP('ftp.ncbi.nlm.nih.gov', timeout=FTP_TIMEOUT)
ftp.login()
ftp.sendcmd('PASV')
study_variable = dbgap_accession_id.split('.')[0]
@@ -56,6 +60,8 @@ def download_dbgap_study(dbgap_accession_id, dbgap_output_dir):
return 0

ftp_filelist = ftp.nlst(".")
ftp.quit()

for ftp_filename in ftp_filelist:
if 'data_dict' in ftp_filename:
with open(f"{local_path}/{ftp_filename}", "wb") as data_dict_file:
@@ -73,13 +79,20 @@ def download_dbgap_study(dbgap_accession_id, dbgap_output_dir):
logging.info(f"Downloaded {ftp_filename} to {local_path}/{ftp_filename} in {response.elapsed.microseconds} microseconds.")
count_downloaded_vars += 1

# Sometimes we've timed out on the FTP server by this point. So let's disconnect and reconnect.
ftp = FTP('ftp.ncbi.nlm.nih.gov', timeout=FTP_TIMEOUT)
ftp.login()
ftp.sendcmd('PASV')

# Step 2: Check to see if there's a GapExchange file in the parent folder
# and if there is, get it.
try:
ftp.cwd(study_id_path)
except error_temp as e:
logging.error("Ftp session timed out... Reconnecting")
logging.error("FTP session timed out. Reconnecting.")
ftp = FTP('ftp.ncbi.nlm.nih.gov', timeout=FTP_TIMEOUT)
ftp.login()
ftp.sendcmd('PASV')
resp = ftp.cwd(study_id_path)
if resp[:1] == '2':
logging.info("command success")
@@ -160,7 +173,12 @@ def get_dbgap_data_dicts(input_file, format, field, outdir, group_by, skip):
# If multiple group-by fields are specified, we use them in order.
output_dir_for_row = output_dir
for group_name in list(group_by):
if group_name in row:
if group_name in row and row[group_name].strip() != '':
if '|' in row[group_name]:
raise RuntimeError(
f"Pipe-separated multiple values in group-by field {group_name} not supported:" +
f"{row[group_name]} (line {line_num})"
)
output_dir_for_row = os.path.join(output_dir_for_row, row[group_name])
else:
output_dir_for_row = os.path.join(output_dir_for_row, '__missing__')
@@ -179,9 +197,15 @@ def get_dbgap_data_dicts(input_file, format, field, outdir, group_by, skip):
# Try to download to output folder if the study hasn't already been downloaded
if not os.path.exists(dbgap_dir):
logging.info(f"Downloading {dbgap_id} to {dbgap_dir}")
count_downloaded += download_dbgap_study(dbgap_id, dbgap_dir)

logging.info(f"Downloaded {count_downloaded} studies from {count_rows} in input files.")
try:
count_downloaded += download_dbgap_study(dbgap_id, dbgap_dir)
except Exception as ex:
logging.error(f"Exception occurred while downloading {dbgap_id} to {dbgap_dir}: {ex}")
shutil.rmtree(dbgap_dir, ignore_errors=True)
logging.error(f"Deleted {dbgap_dir} as it is probably incomplete.")
logging.error("Re-run this script to ensure that all variables are downloaded.")

logging.info(f"Downloaded {count_downloaded} data dictionaries from {count_rows} rows in input files.")


if __name__ == "__main__":
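For reference, the timeout and reconnection changes above all repeat the same connect/login/PASV sequence against the dbGaP FTP server. A minimal sketch of that pattern in isolation (the helper function name is illustrative, not part of the script):

    from ftplib import FTP

    FTP_TIMEOUT = 100  # seconds, matching the new module-level constant


    def fresh_ftp_connection(host='ftp.ncbi.nlm.nih.gov', timeout=FTP_TIMEOUT):
        # Open a new control connection, log in anonymously, and request passive
        # mode -- the same three steps the script now repeats whenever a long
        # download may have let the previous connection time out.
        ftp = FTP(host, timeout=timeout)
        ftp.login()
        ftp.sendcmd('PASV')
        return ftp


    ftp = fresh_ftp_connection()
    print(ftp.nlst('.'))  # list the FTP root to confirm the connection works
    ftp.quit()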
334 changes: 176 additions & 158 deletions data/bdc_dbgap_ids.csv

Large diffs are not rendered by default.

8 changes: 3 additions & 5 deletions docker-compose.yaml
@@ -40,11 +40,9 @@ services:
REDIS_PASSWORD: "$REDIS_PASSWORD"
FLASK_ENV: "development"
PYTHONUNBUFFERED: "TRUE"
entrypoint: [ "gunicorn",
"--workers=$API_WORKERS", "--name=dug",
"--bind=0.0.0.0:$API_PORT", "--timeout=$API_TIMEOUT",
"--log-level=DEBUG", "--enable-stdio-inheritance",
"-k", "uvicorn.workers.UvicornWorker", "--reload", "dug.server:APP" ]
entrypoint: [ "uvicorn",
"--host", "0.0.0.0" , "--port" , "$API_PORT",
"--log-level=debug", "--reload-dir", "/home/dug/dug/", "--reload", "dug.server:APP" ]
volumes:
- ./src:/home/dug/dug/
ports:
2 changes: 1 addition & 1 deletion requirements.txt
@@ -21,7 +21,7 @@ requests
redis
requests-cache
six

retrying
# Click for command line arguments
# We use Click 7.0 because that's what one of the pinned packages above use.
click
30 changes: 28 additions & 2 deletions src/dug/config.py
@@ -27,6 +27,13 @@ class Config:
nboost_host: str = "nboost"
nboost_port: int = 8000

program_sort_list: str = ""
program_description: dict=field(default_factory=lambda:{})
consent_id_path: str= ""
missing_studies_path: str=""
missing_program_path: str=""


# Preprocessor config that will be passed to annotate.Preprocessor constructor
preprocessor: dict = field(
default_factory=lambda: {
@@ -43,8 +50,23 @@ class Config:
},
"sapbert": {
"classification_url": "https://med-nemo.apps.renci.org/annotate/",
"annotator_url": "https://babel-sapbert.apps.renci.org/annotate/",
},
"annotator_url": "https://sap-qdrant.apps.renci.org/annotate/",
"score_threshold": 0.5,
"bagel": {
"enabled": True,
"url": "https://bagel.apps.renci.org/group_synonyms_openai",
"prompt": "bagel/ask_classes",
"llm_args": {
"llm_model_name": "gpt-4o-2024-05-13",
"organization": "",
"access_key": "",
"llm_model_args": {
"top_p": 0,
"temperature": 0.1
}
}
}
}
}
)

@@ -137,6 +159,10 @@ def from_env(cls):
"redis_host": "REDIS_HOST",
"redis_port": "REDIS_PORT",
"redis_password": "REDIS_PASSWORD",
"program_description": "PROGRAM_DESCRIPTION",
"consent_id_path": "CONSENT_ID_PATH",
"missing_studies_path": "MISSING_STUDIES_PATH",
"missing_program_path": "MISSING_PROGRAM_PATH"
}

kwargs = {}
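A hedged usage sketch of the new configuration fields: the env-var mapping above implies `from_env()` reads these variables, but the diff does not show how each value is parsed (for example, whether PROGRAM_DESCRIPTION is expected to be JSON), so the values below are placeholders only:

    import os

    from dug.config import Config

    # Hypothetical paths; only the variable names come from the mapping in the diff.
    os.environ['CONSENT_ID_PATH'] = '/data/bdc/consent_ids.csv'
    os.environ['MISSING_STUDIES_PATH'] = '/data/bdc/missing_studies.csv'
    os.environ['MISSING_PROGRAM_PATH'] = '/data/bdc/missing_programs.csv'

    config = Config.from_env()
    print(config.consent_id_path, config.missing_studies_path, config.missing_program_path)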
2 changes: 2 additions & 0 deletions src/dug/core/annotators/_base.py
@@ -7,6 +7,7 @@
from dug import utils as utils
from requests import Session
import bmt
from retrying import retry

logger = logging.getLogger("dug")

@@ -198,6 +199,7 @@ def __call__(self, curie: str, http_session):
result = self.handle_response(curie, response)
return result

@retry(stop_max_attempt_number=3)
def make_request(self, curie: str, http_session: Session):
# Get response from namelookup reverse lookup op
# example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post)
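The new `retrying` dependency in requirements.txt backs the @retry decorator added to make_request above. A small self-contained sketch of the same pattern (the flaky function and CURIE below are illustrative, not Dug code):

    import random

    from retrying import retry


    @retry(stop_max_attempt_number=3)
    def lookup(curie):
        # Simulate a transient failure; retrying re-invokes the function up to
        # three times before letting the final exception propagate.
        if random.random() < 0.5:
            raise ConnectionError('transient name-lookup failure')
        return f'names for {curie}'


    print(lookup('MONDO:0004979'))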