Commit a93a19a

updated cbioportal scripts to conform to NF ETL pipeline
jessicaw9910 committed Apr 4, 2024
1 parent 25845a4 commit a93a19a
Showing 9 changed files with 95 additions and 77 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -43,7 +43,7 @@ pytest-runner = "^6.0.1"
pythonpath = ["src"]

[tool.poetry.scripts]
process_cbioportal = "missense_kinase_toolkit.cli.process_cbioportal:main"
extract_cbioportal = "missense_kinase_toolkit.cli.extract_cbioportal:main"

[tool.poetry-dynamic-versioning]
enable = true
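The [tool.poetry.scripts] entry now exposes the console script extract_cbioportal, pointing at missense_kinase_toolkit.cli.extract_cbioportal:main. A minimal sketch of driving that entry point from Python instead of a shell; the cohort name and output directory are placeholders, and the --cohort/--outDir flag names are assumed from the argument parser and Nextflow process further down in this diff:

import sys

from missense_kinase_toolkit.cli import extract_cbioportal

# Roughly equivalent to `extract_cbioportal --cohort msk_impact_2017 --outDir ./outputs`;
# --instance falls back to its default of www.cbioportal.org.
sys.argv = [
    "extract_cbioportal",
    "--cohort", "msk_impact_2017",
    "--outDir", "./outputs",
]
extract_cbioportal.main()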
77 changes: 51 additions & 26 deletions src/missense_kinase_toolkit/cbioportal.py
@@ -1,14 +1,19 @@
#!/usr/bin/env python3

from __future__ import annotations

import os
import pandas as pd

from bravado.client import SwaggerClient
from bravado.requests_client import RequestsClient

from missense_kinase_toolkit import config
from missense_kinase_toolkit import config, io_utils


# OUTPUT_DIR_VAR = "OUTPUT_DIR"
# CBIOPORTAL_INSTANCE_VAR = "CBIOPORTAL_INSTANCE"
# CBIOPORTAL_TOKEN_VAR = "CBIOPORTAL_TOKEN"
# REQUEST_CACHE_VAR = "REQUESTS_CACHE"
# CBIOPORTAL_COHORT_VAR = "CBIOPORTAL_COHORT"


def get_all_mutations_by_study(
@@ -21,9 +26,16 @@ def get_all_mutations_by_study(
list | None
cBioPortal data of Abstract Base Classes objects if successful, otherwise None
"""
# instance = os.environ[CBIOPORTAL_INSTANCE_VAR]
instance = config.get_cbioportal_instance()
url = f"https://{instance}/api/v2/api-docs"
# token = os.environ[CBIOPORTAL_TOKEN_VAR]
token = config.maybe_get_cbioportal_token()
# study_id = os.environ[CBIOPORTAL_COHORT_VAR]

# print(token)
# print(url)
# print(study_id)

if token is not None:
http_client = RequestsClient()
@@ -100,38 +112,51 @@ def parse_iterabc2dataframe(
return df


def save_cbioportal_data_to_csv(
df: pd.DataFrame,
) -> None:
"""Save cBioPortal data to a CSV file
Parameters
----------
df : pd.DataFrame
Dataframe of cBioPortal data
Returns
-------
None
"""
try:
path_data = config.get_output_dir()
if not os.path.exists(path_data):
os.makedirs(path_data)
study_id = config.get_cbioportal_cohort()
df.to_csv(os.path.join(path_data, f"{study_id}_mutations.csv"), index=False)
except KeyError:
print("OUTPUT_DIR not found in environment variables...")
# def save_cbioportal_data_to_csv(
# df: pd.DataFrame,
# study_id: str,
# ) -> None:
# """Save cBioPortal data to a CSV file

# Parameters
# ----------
# df : pd.DataFrame
# Dataframe of cBioPortal data
# study_id : str
# cBioPortal study ID

# Returns
# -------
# None
# """
# try:
# # path_data = os.environ[OUTPUT_DIR_VAR]
# path_data = config.get_output_dir()
# if not os.path.exists(path_data):
# os.makedirs(path_data)
# # study_id = os.environ[CBIOPORTAL_COHORT_VAR]
# # study_id = config.get_cbioportal_cohort()
# df.to_csv(os.path.join(path_data, f"{study_id}_mutations.csv"), index=False)
# except KeyError:
# print("OUTPUT_DIR not found in environment variables...")


def get_and_save_cbioportal_cohort(
# def main(
study_id: str,
) -> None:
# muts = get_all_mutations_by_study()
muts = get_all_mutations_by_study(study_id)

df_muts = parse_iterabc2dataframe(muts)
df_genes = parse_iterabc2dataframe(df_muts["gene"])
df_combo = pd.concat([df_muts, df_genes], axis=1)
df_combo = df_combo.drop(['gene'], axis=1)

save_cbioportal_data_to_csv(df_combo)
filename = f"{study_id}_mutations.csv"
io_utils.save_dataframe_to_csv(df_combo, filename)
# save_cbioportal_data_to_csv(df_combo, study_id)


# if __name__ == "__main__":
# main()
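Taken together, the revised module delegates saving to the shared io_utils helper: get_and_save_cbioportal_cohort pulls all mutations for a study, flattens the nested gene objects into columns, and writes <study_id>_mutations.csv into the configured output directory. A rough usage sketch with placeholder study ID, path, and instance; the config setter names come from the CLI diff below:

from missense_kinase_toolkit import cbioportal, config

# Placeholder configuration; in the pipeline these come from the extract_cbioportal CLI arguments.
config.set_output_dir("./outputs")
config.set_cbioportal_instance("www.cbioportal.org")

# Fetches all mutations for the study, flattens the nested gene objects,
# and saves ./outputs/msk_impact_2017_mutations.csv via io_utils.save_dataframe_to_csv.
cbioportal.get_and_save_cbioportal_cohort("msk_impact_2017")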
src/missense_kinase_toolkit/cli/extract_cbioportal.py
@@ -25,8 +25,8 @@ def parsearg_utils():
parser.add_argument(
"--instance",
type=str,
help="Optional: cBioPortal instance (e.g., `cbioportal.mskcc.org`). Default: `cbioportal.org` (str)",
default="cbioportal.org",
help="Optional: cBioPortal instance (e.g., `cbioportal.mskcc.org`). Default: `www.cbioportal.org` (str)",
default="www.cbioportal.org",
)

parser.add_argument(
@@ -36,12 +36,12 @@
help="Optional: cBioPortal API token (str)",
)

parser.add_argument(
"--requestsCache",
type=str,
default="",
help="Optional: Requests cache (str)",
)
# parser.add_argument(
# "--requestsCache",
# type=str,
# default="",
# help="Optional: Requests cache (str)",
# )

# TODO: add logging functionality
return parser
@@ -54,22 +54,23 @@ def main():
list_studies = str_studies.split(",")
list_studies = [study.strip() for study in list_studies]

# required arguments
# required argument
config.set_output_dir(args.outDir)
config.set_cbioportal_instance(args.instance)

# optional arguments
config.set_cbioportal_instance(args.instance)

try:
if args.token != "":
config.set_cbioportal_instance(args.token)
config.set_cbioportal_token(args.token)
except AttributeError:
pass

try:
if args.requestsCache != "":
config.set_cbioportal_instance(args.requestsCache)
except AttributeError:
pass
# try:
# if args.requestsCache != "":
# config.set_request_cache(args.requestsCache)
# except AttributeError:
# pass

for study in list_studies:
cbioportal.get_and_save_cbioportal_cohort(study)
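For reference, main() accepts a comma-separated cohort string and normalizes it before looping over studies. A small standalone illustration of that normalization, mirroring the split/strip logic above (the study IDs are arbitrary):

# Mirrors the splitting and stripping in main(); whitespace around commas is discarded.
str_studies = "msk_impact_2017, brca_tcga"
list_studies = [study.strip() for study in str_studies.split(",")]
print(list_studies)  # ['msk_impact_2017', 'brca_tcga']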
2 changes: 0 additions & 2 deletions src/missense_kinase_toolkit/hgnc.py
@@ -1,5 +1,3 @@
from __future__ import annotations

import requests

from missense_kinase_toolkit import requests_wrapper, utils_requests
13 changes: 7 additions & 6 deletions src/missense_kinase_toolkit/io_utils.py
@@ -2,7 +2,7 @@
import pandas as pd


DATA_CACHE_DIR = "DATA_CACHE"
OUTPUT_DIR_VAR = "OUTPUT_DIR"


def save_dataframe_to_csv(
@@ -15,8 +15,9 @@ def save_dataframe_to_csv(
----------
df : pd.DataFrame
Dataframe to save
output_path : str
Path to save the CSV file
filename : str
Filename to save (either with or without "csv" suffix)
Returns
-------
@@ -25,9 +26,9 @@
filename = filename.replace(".csv", "") + ".csv"

try:
path_data = os.environ[DATA_CACHE_DIR]
path_data = os.environ[OUTPUT_DIR_VAR]
if not os.path.exists(path_data):
os.makedirs(path_data)
df.to_csv(os.path.join(path_data, f"{filename}_mutations.csv"), index=False)
df.to_csv(os.path.join(path_data, filename), index=False)
except KeyError:
print("DATA_CACHE not found in environment variables...")
print("OUTPUT_DIR not found in environment variables...")
2 changes: 0 additions & 2 deletions src/missense_kinase_toolkit/pfam.py
@@ -1,5 +1,3 @@
from __future__ import annotations

import json

import pandas as pd
3 changes: 3 additions & 0 deletions src/missense_kinase_toolkit/requests_wrapper.py
@@ -31,6 +31,9 @@ def get_cached_session():
if REQUEST_CACHE_VAR in os.environ:
cache_location = os.environ[REQUEST_CACHE_VAR]

if not os.path.exists(cache_location):
os.makedirs(cache_location)

session = CachedSession(
cache_location, allowable_codes=(200, 404, 400), backend="sqlite"
)
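With this change, get_cached_session() creates the cache directory named by REQUESTS_CACHE if it does not yet exist before building the sqlite-backed CachedSession. A minimal sketch, with a placeholder cache path and an arbitrary example URL:

import os

from missense_kinase_toolkit import requests_wrapper

os.environ["REQUESTS_CACHE"] = "./requests_cache"  # placeholder cache location

session = requests_wrapper.get_cached_session()
response = session.get("https://www.cbioportal.org/api/info")
print(response.status_code, getattr(response, "from_cache", False))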
16 changes: 16 additions & 0 deletions src/nextflow/extract_cbioportal.nf
@@ -0,0 +1,16 @@
process PROCESS_CBIOPORTAL {
input:
tuple val(cbio_cohort), path(out_dir), val(cbio_inst), val(cbio_token), path(request_cache)

output:
path("${out_dir}/cbioportal")
"""
export PYTHONHASHSEED=0
extract_cbioportal \
--cohort ${cbio_cohort} \
--outDir ${out_dir} \
--instance ${cbio_inst} \
--token ${cbio_token} \
--requestsCache ${request_cache}
"""
}
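The PROCESS_CBIOPORTAL script block above is essentially a shell wrapper around the renamed CLI. A rough Python equivalent of what the process runs per cohort, with every value a placeholder standing in for the corresponding tuple element:

import os
import subprocess

# Placeholder stand-ins for cbio_cohort, out_dir, and cbio_inst.
subprocess.run(
    [
        "extract_cbioportal",
        "--cohort", "msk_impact_2017",
        "--outDir", "outputs",
        "--instance", "www.cbioportal.org",
    ],
    check=True,
    env={**os.environ, "PYTHONHASHSEED": "0"},
)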
24 changes: 0 additions & 24 deletions src/nextflow/process_cbioportal.nf

This file was deleted.
