Merge pull request #368 from helxplatform/develop
Next release
YaphetKG authored Aug 14, 2024
2 parents 1db2225 + 3ea7813 commit 5f7553d
Showing 19 changed files with 1,007 additions and 674 deletions.
6 changes: 2 additions & 4 deletions Dockerfile
@@ -3,15 +3,13 @@
# A container for the core semantic-search capability.
#
######################################################
FROM python:3.12.1-alpine3.19
FROM python:alpine3.20


# Install required packages
RUN apk update && \
apk add g++ make
apk add g++ make

#upgrade openssl \
#RUN apk add openssl=3.1.4-r4

RUN pip install --upgrade pip
# Create a non-root user.
25 changes: 24 additions & 1 deletion bin/get_bdc_studies_from_gen3.py
@@ -105,11 +105,12 @@ def get_bdc_studies_from_gen3(output, bdc_gen3_base_url):
sorted_study_ids = sorted(discovery_list)

# Step 2. For every study ID, write out an entry into the CSV output file.
csv_writer = csv.DictWriter(output, fieldnames=['Accession', 'Consent', 'Study Name', 'Last modified', 'Notes', 'Description'])
csv_writer = csv.DictWriter(output, fieldnames=['Accession', 'Consent', 'Study Name', 'Program', 'Last modified', 'Notes', 'Description'])
csv_writer.writeheader()
for study_id in sorted_study_ids:
# Reset the variables we need.
study_name = ''
program_names = []
description = ''
notes = ''

@@ -139,6 +140,24 @@ def get_bdc_studies_from_gen3(output, bdc_gen3_base_url):
else:
study_name = '(no name)'

# Program name.
if 'authz' in gen3_discovery:
# authz is in the format /programs/topmed/projects/ECLIPSE_DS-COPD-MDS-RD
match = re.fullmatch(r'^/programs/(.*)/projects/(.*)$', gen3_discovery['authz'])
if match:
program_names.append(match.group(1))
# study_short_name = match.group(2)

# Tags don't seem as fine-grained as authz and are often slightly different from the authz values
# (e.g. `COVID 19` instead of `COVID-19`, `Parent` instead of `parent`), so for now we only use the authz
# values.
#
# if 'tags' in gen3_discovery:
# for tag in gen3_discovery['tags']:
# category = tag.get('category', '')
# if category.lower() == 'program':
# program_names.append(tag.get('name', '').strip())

# Description.
description = gen3_discovery.get('study_description', '')

@@ -156,11 +175,15 @@ def get_bdc_studies_from_gen3(output, bdc_gen3_base_url):
accession = study_id
consent = ''

# Remove any blank program names.
program_names = filter(lambda n: n != '', program_names)

csv_writer.writerow({
'Accession': accession,
'Consent': consent,
'Study Name': study_name,
'Description': description,
'Program': '|'.join(sorted(set(program_names))),
'Last modified': last_modified,
'Notes': notes.strip()
})
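For context on the new Program column: the added code derives program names from the Gen3 `authz` value with a regular expression. A minimal standalone sketch of that extraction, assuming `authz` is a single string in the format shown in the diff comment (the example value comes from that comment):

    import re

    # Example authz value, taken from the comment in the diff above.
    authz = '/programs/topmed/projects/ECLIPSE_DS-COPD-MDS-RD'

    program_names = []
    match = re.fullmatch(r'^/programs/(.*)/projects/(.*)$', authz)
    if match:
        # group(1) is the program; group(2) would be the project/study short name.
        program_names.append(match.group(1))

    # Mirror the row-writing step: blank names removed, de-duplicated, pipe-joined.
    program_names = filter(lambda n: n != '', program_names)
    print('|'.join(sorted(set(program_names))))  # topmed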
36 changes: 30 additions & 6 deletions bin/get_dbgap_data_dicts.py
@@ -16,6 +16,10 @@
# Default to logging at the INFO level.
logging.basicConfig(level=logging.INFO)

# FTP timeout in seconds
FTP_TIMEOUT = 100


# Helper function
def download_dbgap_study(dbgap_accession_id, dbgap_output_dir):
"""
@@ -28,7 +32,7 @@ def download_dbgap_study(dbgap_accession_id, dbgap_output_dir):

count_downloaded_vars = 0

ftp = FTP('ftp.ncbi.nlm.nih.gov')
ftp = FTP('ftp.ncbi.nlm.nih.gov', timeout=FTP_TIMEOUT)
ftp.login()
ftp.sendcmd('PASV')
study_variable = dbgap_accession_id.split('.')[0]
@@ -56,6 +60,8 @@ def download_dbgap_study(dbgap_accession_id, dbgap_output_dir):
return 0

ftp_filelist = ftp.nlst(".")
ftp.quit()

for ftp_filename in ftp_filelist:
if 'data_dict' in ftp_filename:
with open(f"{local_path}/{ftp_filename}", "wb") as data_dict_file:
@@ -73,13 +79,20 @@ def download_dbgap_study(dbgap_accession_id, dbgap_output_dir):
logging.info(f"Downloaded {ftp_filename} to {local_path}/{ftp_filename} in {response.elapsed.microseconds} microseconds.")
count_downloaded_vars += 1

# Sometimes we've timed out on the FTP server by this point. So let's disconnect and reconnect.
ftp = FTP('ftp.ncbi.nlm.nih.gov', timeout=FTP_TIMEOUT)
ftp.login()
ftp.sendcmd('PASV')

# Step 2: Check to see if there's a GapExchange file in the parent folder
# and if there is, get it.
try:
ftp.cwd(study_id_path)
except error_temp as e:
logging.error("Ftp session timed out... Reconnecting")
logging.error("FTP session timed out. Reconnecting.")
ftp = FTP('ftp.ncbi.nlm.nih.gov', timeout=FTP_TIMEOUT)
ftp.login()
ftp.sendcmd('PASV')
resp = ftp.cwd(study_id_path)
if resp[:1] == '2':
logging.info("command success")
@@ -160,7 +173,12 @@ def get_dbgap_data_dicts(input_file, format, field, outdir, group_by, skip):
# If multiple group-by fields are specified, we use them in order.
output_dir_for_row = output_dir
for group_name in list(group_by):
if group_name in row:
if group_name in row and row[group_name].strip() != '':
if '|' in row[group_name]:
raise RuntimeError(
f"Pipe-separated multiple values in group-by field {group_name} not supported:" +
f"{row[group_name]} (line {line_num})"
)
output_dir_for_row = os.path.join(output_dir_for_row, row[group_name])
else:
output_dir_for_row = os.path.join(output_dir_for_row, '__missing__')
@@ -179,9 +197,15 @@ def get_dbgap_data_dicts(input_file, format, field, outdir, group_by, skip):
# Try to download to output folder if the study hasn't already been downloaded
if not os.path.exists(dbgap_dir):
logging.info(f"Downloading {dbgap_id} to {dbgap_dir}")
count_downloaded += download_dbgap_study(dbgap_id, dbgap_dir)

logging.info(f"Downloaded {count_downloaded} studies from {count_rows} in input files.")
try:
count_downloaded += download_dbgap_study(dbgap_id, dbgap_dir)
except Exception as ex:
logging.error(f"Exception occurred while downloading {dbgap_id} to {dbgap_dir}: {ex}")
shutil.rmtree(dbgap_dir, ignore_errors=True)
logging.error(f"Deleted {dbgap_dir} as it is probably incomplete.")
logging.error("Re-run this script to ensure that all variables are downloaded.")

logging.info(f"Downloaded {count_downloaded} data dictionaries from {count_rows} rows in input files.")


if __name__ == "__main__":
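For reference, the timeout and reconnection changes above all repeat the same connect/login/PASV sequence against the dbGaP FTP server. A minimal sketch of that pattern in isolation (the helper function name is illustrative, not part of the script):

    from ftplib import FTP

    FTP_TIMEOUT = 100  # seconds, matching the new module-level constant


    def fresh_ftp_connection(host='ftp.ncbi.nlm.nih.gov', timeout=FTP_TIMEOUT):
        # Open a new control connection, log in anonymously, and request passive
        # mode -- the same three steps the script now repeats whenever a long
        # download may have let the previous connection time out.
        ftp = FTP(host, timeout=timeout)
        ftp.login()
        ftp.sendcmd('PASV')
        return ftp


    ftp = fresh_ftp_connection()
    print(ftp.nlst('.'))  # list the FTP root to confirm the connection works
    ftp.quit()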
334 changes: 176 additions & 158 deletions data/bdc_dbgap_ids.csv

Large diffs are not rendered by default.

8 changes: 3 additions & 5 deletions docker-compose.yaml
@@ -40,11 +40,9 @@ services:
REDIS_PASSWORD: "$REDIS_PASSWORD"
FLASK_ENV: "development"
PYTHONUNBUFFERED: "TRUE"
entrypoint: [ "gunicorn",
"--workers=$API_WORKERS", "--name=dug",
"--bind=0.0.0.0:$API_PORT", "--timeout=$API_TIMEOUT",
"--log-level=DEBUG", "--enable-stdio-inheritance",
"-k", "uvicorn.workers.UvicornWorker", "--reload", "dug.server:APP" ]
entrypoint: [ "uvicorn",
"--host", "0.0.0.0" , "--port" , "$API_PORT",
"--log-level=debug", "--reload-dir", "/home/dug/dug/", "--reload", "dug.server:APP" ]
volumes:
- ./src:/home/dug/dug/
ports:
2 changes: 1 addition & 1 deletion requirements.txt
@@ -21,7 +21,7 @@ requests
redis
requests-cache
six

retrying
# Click for command line arguments
# We use Click 7.0 because that's what one of the pinned packages above use.
click
30 changes: 28 additions & 2 deletions src/dug/config.py
@@ -27,6 +27,13 @@ class Config:
nboost_host: str = "nboost"
nboost_port: int = 8000

program_sort_list: str = ""
program_description: dict=field(default_factory=lambda:{})
consent_id_path: str= ""
missing_studies_path: str=""
missing_program_path: str=""


# Preprocessor config that will be passed to annotate.Preprocessor constructor
preprocessor: dict = field(
default_factory=lambda: {
@@ -43,8 +50,23 @@ class Config:
},
"sapbert": {
"classification_url": "https://med-nemo.apps.renci.org/annotate/",
"annotator_url": "https://babel-sapbert.apps.renci.org/annotate/",
},
"annotator_url": "https://sap-qdrant.apps.renci.org/annotate/",
"score_threshold": 0.5,
"bagel": {
"enabled": True,
"url": "https://bagel.apps.renci.org/group_synonyms_openai",
"prompt": "bagel/ask_classes",
"llm_args": {
"llm_model_name": "gpt-4o-2024-05-13",
"organization": "",
"access_key": "",
"llm_model_args": {
"top_p": 0,
"temperature": 0.1
}
}
}
}
}
)

@@ -137,6 +159,10 @@ def from_env(cls):
"redis_host": "REDIS_HOST",
"redis_port": "REDIS_PORT",
"redis_password": "REDIS_PASSWORD",
"program_description": "PROGRAM_DESCRIPTION",
"consent_id_path": "CONSENT_ID_PATH",
"missing_studies_path": "MISSING_STUDIES_PATH",
"missing_program_path": "MISSING_PROGRAM_PATH"
}

kwargs = {}
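A hedged usage sketch of the new configuration fields: the env-var mapping above implies `from_env()` reads these variables, but the diff does not show how each value is parsed (for example, whether PROGRAM_DESCRIPTION is expected to be JSON), so the values below are placeholders only:

    import os

    from dug.config import Config

    # Hypothetical paths; only the variable names come from the mapping in the diff.
    os.environ['CONSENT_ID_PATH'] = '/data/bdc/consent_ids.csv'
    os.environ['MISSING_STUDIES_PATH'] = '/data/bdc/missing_studies.csv'
    os.environ['MISSING_PROGRAM_PATH'] = '/data/bdc/missing_programs.csv'

    config = Config.from_env()
    print(config.consent_id_path, config.missing_studies_path, config.missing_program_path)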
2 changes: 2 additions & 0 deletions src/dug/core/annotators/_base.py
@@ -7,6 +7,7 @@
from dug import utils as utils
from requests import Session
import bmt
from retrying import retry

logger = logging.getLogger("dug")

@@ -198,6 +199,7 @@ def __call__(self, curie: str, http_session):
result = self.handle_response(curie, response)
return result

@retry(stop_max_attempt_number=3)
def make_request(self, curie: str, http_session: Session):
# Get response from namelookup reverse lookup op
# example (https://name-resolution-sri.renci.org/docs#/lookup/lookup_names_reverse_lookup_post)
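The new `retrying` dependency in requirements.txt backs the @retry decorator added to make_request above. A small self-contained sketch of the same pattern (the flaky function and CURIE below are illustrative, not Dug code):

    import random

    from retrying import retry


    @retry(stop_max_attempt_number=3)
    def lookup(curie):
        # Simulate a transient failure; retrying re-invokes the function up to
        # three times before letting the final exception propagate.
        if random.random() < 0.5:
            raise ConnectionError('transient name-lookup failure')
        return f'names for {curie}'


    print(lookup('MONDO:0004979'))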