diff --git a/pyproject.toml b/pyproject.toml index de6bd5f..86ddedd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ pytest-runner = "^6.0.1" pythonpath = ["src"] [tool.poetry.scripts] -process_cbioportal = "missense_kinase_toolkit.cli.process_cbioportal:main" +extract_cbioportal = "missense_kinase_toolkit.cli.extract_cbioportal:main" [tool.poetry-dynamic-versioning] enable = true diff --git a/src/missense_kinase_toolkit/cbioportal.py b/src/missense_kinase_toolkit/cbioportal.py index b3a9723..3a9743a 100644 --- a/src/missense_kinase_toolkit/cbioportal.py +++ b/src/missense_kinase_toolkit/cbioportal.py @@ -1,14 +1,19 @@ #!/usr/bin/env python3 -from __future__ import annotations - import os import pandas as pd from bravado.client import SwaggerClient from bravado.requests_client import RequestsClient -from missense_kinase_toolkit import config +from missense_kinase_toolkit import config, io_utils + + +# OUTPUT_DIR_VAR = "OUTPUT_DIR" +# CBIOPORTAL_INSTANCE_VAR = "CBIOPORTAL_INSTANCE" +# CBIOPORTAL_TOKEN_VAR = "CBIOPORTAL_TOKEN" +# REQUEST_CACHE_VAR = "REQUESTS_CACHE" +# CBIOPORTAL_COHORT_VAR = "CBIOPORTAL_COHORT" def get_all_mutations_by_study( @@ -21,9 +26,16 @@ def get_all_mutations_by_study( list | None cBioPortal data of Abstract Base Classes objects if successful, otherwise None """ + # instance = os.environ[CBIOPORTAL_INSTANCE_VAR] instance = config.get_cbioportal_instance() url = f"https://{instance}/api/v2/api-docs" + # token = os.environ[CBIOPORTAL_TOKEN_VAR] token = config.maybe_get_cbioportal_token() + # study_id = os.environ[CBIOPORTAL_COHORT_VAR] + + # print(token) + # print(url) + # print(study_id) if token is not None: http_client = RequestsClient() @@ -100,33 +112,40 @@ def parse_iterabc2dataframe( return df -def save_cbioportal_data_to_csv( - df: pd.DataFrame, -) -> None: - """Save cBioPortal data to a CSV file - - Parameters - ---------- - df : pd.DataFrame - Dataframe of cBioPortal data - - Returns - ------- - None - """ - try: - 
path_data = config.get_output_dir() - if not os.path.exists(path_data): - os.makedirs(path_data) - study_id = config.get_cbioportal_cohort() - df.to_csv(os.path.join(path_data, f"{study_id}_mutations.csv"), index=False) - except KeyError: - print("OUTPUT_DIR not found in environment variables...") +# def save_cbioportal_data_to_csv( +# df: pd.DataFrame, +# study_id: str, +# ) -> None: +# """Save cBioPortal data to a CSV file + +# Parameters +# ---------- +# df : pd.DataFrame +# Dataframe of cBioPortal data +# study_id : str +# cBioPortal study ID + +# Returns +# ------- +# None +# """ +# try: +# # path_data = os.environ[OUTPUT_DIR_VAR] +# path_data = config.get_output_dir() +# if not os.path.exists(path_data): +# os.makedirs(path_data) +# # study_id = os.environ[CBIOPORTAL_COHORT_VAR] +# # study_id = config.get_cbioportal_cohort() +# df.to_csv(os.path.join(path_data, f"{study_id}_mutations.csv"), index=False) +# except KeyError: +# print("OUTPUT_DIR not found in environment variables...") def get_and_save_cbioportal_cohort( +# def main( study_id: str, ) -> None: + # muts = get_all_mutations_by_study() muts = get_all_mutations_by_study(study_id) df_muts = parse_iterabc2dataframe(muts) @@ -134,4 +153,10 @@ def get_and_save_cbioportal_cohort( df_combo = pd.concat([df_muts, df_genes], axis=1) df_combo = df_combo.drop(['gene'], axis=1) - save_cbioportal_data_to_csv(df_combo) + filename = f"{study_id}_mutations.csv" + io_utils.save_dataframe_to_csv(df_combo, filename) + # save_cbioportal_data_to_csv(df_combo, study_id) + + +# if __name__ == "__main__": +# main() \ No newline at end of file diff --git a/src/missense_kinase_toolkit/cli/process_cbioportal.py b/src/missense_kinase_toolkit/cli/extract_cbioportal.py similarity index 75% rename from src/missense_kinase_toolkit/cli/process_cbioportal.py rename to src/missense_kinase_toolkit/cli/extract_cbioportal.py index c97832e..845d36f 100755 --- a/src/missense_kinase_toolkit/cli/process_cbioportal.py +++ 
b/src/missense_kinase_toolkit/cli/extract_cbioportal.py @@ -25,8 +25,8 @@ def parsearg_utils(): parser.add_argument( "--instance", type=str, - help="Optional: cBioPortal instance (e.g., `cbioportal.mskcc.org`). Default: `cbioportal.org` (str)", - default="cbioportal.org", + help="Optional: cBioPortal instance (e.g., `cbioportal.mskcc.org`). Default: `www.cbioportal.org` (str)", + default="www.cbioportal.org", ) parser.add_argument( @@ -36,12 +36,12 @@ def parsearg_utils(): help="Optional: cBioPortal API token (str)", ) - parser.add_argument( - "--requestsCache", - type=str, - default="", - help="Optional: Requests cache (str)", - ) + # parser.add_argument( + # "--requestsCache", + # type=str, + # default="", + # help="Optional: Requests cache (str)", + # ) # TODO: add logging functionality return parser @@ -54,22 +54,23 @@ def main(): list_studies = str_studies.split(",") list_studies = [study.strip() for study in list_studies] - # required arguments + # required argument config.set_output_dir(args.outDir) - config.set_cbioportal_instance(args.instance) # optional arguments + config.set_cbioportal_instance(args.instance) + try: if args.token != "": - config.set_cbioportal_instance(args.token) + config.set_cbioportal_token(args.token) except AttributeError: pass - try: - if args.requestsCache != "": - config.set_cbioportal_instance(args.requestsCache) - except AttributeError: - pass + # try: + # if args.requestsCache != "": + # config.set_request_cache(args.requestsCache) + # except AttributeError: + # pass for study in list_studies: cbioportal.get_and_save_cbioportal_cohort(study) diff --git a/src/missense_kinase_toolkit/hgnc.py b/src/missense_kinase_toolkit/hgnc.py index c5e616a..bc65a0e 100644 --- a/src/missense_kinase_toolkit/hgnc.py +++ b/src/missense_kinase_toolkit/hgnc.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import requests from missense_kinase_toolkit import requests_wrapper, utils_requests diff --git 
a/src/missense_kinase_toolkit/io_utils.py b/src/missense_kinase_toolkit/io_utils.py index 2bc0285..e8ac0a7 100644 --- a/src/missense_kinase_toolkit/io_utils.py +++ b/src/missense_kinase_toolkit/io_utils.py @@ -2,7 +2,7 @@ import pandas as pd -DATA_CACHE_DIR = "DATA_CACHE" +OUTPUT_DIR_VAR = "OUTPUT_DIR" def save_dataframe_to_csv( @@ -15,8 +15,9 @@ def save_dataframe_to_csv( ---------- df : pd.DataFrame Dataframe to save - output_path : str - Path to save the CSV file + filename : str + Filename to save (either with or without "csv" suffix) + Returns ------- @@ -25,9 +26,9 @@ def save_dataframe_to_csv( filename = filename.replace(".csv", "") + ".csv" try: - path_data = os.environ[DATA_CACHE_DIR] + path_data = os.environ[OUTPUT_DIR_VAR] if not os.path.exists(path_data): os.makedirs(path_data) - df.to_csv(os.path.join(path_data, f"{filename}_mutations.csv"), index=False) + df.to_csv(os.path.join(path_data, filename), index=False) except KeyError: - print("DATA_CACHE not found in environment variables...") + print("OUTPUT_DIR not found in environment variables...") diff --git a/src/missense_kinase_toolkit/pfam.py b/src/missense_kinase_toolkit/pfam.py index f623e7f..d398326 100644 --- a/src/missense_kinase_toolkit/pfam.py +++ b/src/missense_kinase_toolkit/pfam.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import json import pandas as pd diff --git a/src/missense_kinase_toolkit/requests_wrapper.py b/src/missense_kinase_toolkit/requests_wrapper.py index ae73e0f..edbb511 100644 --- a/src/missense_kinase_toolkit/requests_wrapper.py +++ b/src/missense_kinase_toolkit/requests_wrapper.py @@ -31,6 +31,9 @@ def get_cached_session(): if REQUEST_CACHE_VAR in os.environ: cache_location = os.environ[REQUEST_CACHE_VAR] + if not os.path.exists(cache_location): + os.makedirs(cache_location) + session = CachedSession( cache_location, allowable_codes=(200, 404, 400), backend="sqlite" ) diff --git a/src/nextflow/extract_cbioportal.nf b/src/nextflow/extract_cbioportal.nf new file 
mode 100644 index 0000000..b9462ca --- /dev/null +++ b/src/nextflow/extract_cbioportal.nf @@ -0,0 +1,16 @@ +process PROCESS_CBIOPORTAL { + input: + tuple val(cbio_cohort), path(out_dir), val(cbio_inst), val(cbio_token), path(request_cache) + + output: + path("${out_dir}/cbioportal") + // request_cache is staged but not forwarded: the extract_cbioportal CLI no longer accepts --requestsCache + """ + export PYTHONHASHSEED=0 + extract_cbioportal \ + --cohort ${cbio_cohort} \ + --outDir ${out_dir} \ + --instance ${cbio_inst} \ + --token ${cbio_token} + """ +} diff --git a/src/nextflow/process_cbioportal.nf b/src/nextflow/process_cbioportal.nf deleted file mode 100644 index 5625530..0000000 --- a/src/nextflow/process_cbioportal.nf +++ /dev/null @@ -1,24 +0,0 @@ -process PROCESS_CBIOPORTAL { - // tag "$meta.id" - // label 'process_medium' - // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - // 'docker://jeffquinnmsk/pan_preclinical_etl:latest' : - // 'docker.io/jeffquinnmsk/pan_preclinical_etl:latest' }" - - input: - tuple val(meta), path(raw_data), path(studies), path(source_files), path(request_cache), val(study_name) - - output: - tuple val(meta), path("${prefix}/per_study_results/${study_name}"), emit: etl_results - - """ - export PYTHONHASHSEED=0 - mkdir -p "${prefix}/per_study_results/${study_name}" - process_cbioportal \ - --data-dir ${raw_data} \ - --output-dir ${prefix}/per_study_results/${study_name} \ - --studies ${studies} \ - --source-files ${source_files} \ - --study-id ${study_name} - """ -}