Commit a93a19a

updated cbioportal scripts to conform to NF ETL pipeline
jessicaw9910 committed Apr 4, 2024
1 parent 25845a4 commit a93a19a
Showing 9 changed files with 95 additions and 77 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -43,7 +43,7 @@ pytest-runner = "^6.0.1"
pythonpath = ["src"]

[tool.poetry.scripts]
process_cbioportal = "missense_kinase_toolkit.cli.process_cbioportal:main"
extract_cbioportal = "missense_kinase_toolkit.cli.extract_cbioportal:main"

[tool.poetry-dynamic-versioning]
enable = true
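The [tool.poetry.scripts] entry now exposes the console script extract_cbioportal, pointing at missense_kinase_toolkit.cli.extract_cbioportal:main. A minimal sketch of driving that entry point from Python instead of a shell; the cohort name and output directory are placeholders, and the --cohort/--outDir flag names are assumed from the argument parser and Nextflow process further down in this diff:

import sys

from missense_kinase_toolkit.cli import extract_cbioportal

# Roughly equivalent to `extract_cbioportal --cohort msk_impact_2017 --outDir ./outputs`;
# --instance falls back to its default of www.cbioportal.org.
sys.argv = [
    "extract_cbioportal",
    "--cohort", "msk_impact_2017",
    "--outDir", "./outputs",
]
extract_cbioportal.main()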
77 changes: 51 additions & 26 deletions src/missense_kinase_toolkit/cbioportal.py
@@ -1,14 +1,19 @@
#!/usr/bin/env python3

from __future__ import annotations

import os
import pandas as pd

from bravado.client import SwaggerClient
from bravado.requests_client import RequestsClient

from missense_kinase_toolkit import config
from missense_kinase_toolkit import config, io_utils


# OUTPUT_DIR_VAR = "OUTPUT_DIR"
# CBIOPORTAL_INSTANCE_VAR = "CBIOPORTAL_INSTANCE"
# CBIOPORTAL_TOKEN_VAR = "CBIOPORTAL_TOKEN"
# REQUEST_CACHE_VAR = "REQUESTS_CACHE"
# CBIOPORTAL_COHORT_VAR = "CBIOPORTAL_COHORT"


def get_all_mutations_by_study(
@@ -21,9 +26,16 @@ def get_all_mutations_by_study(
list | None
cBioPortal data of Abstract Base Classes objects if successful, otherwise None
"""
# instance = os.environ[CBIOPORTAL_INSTANCE_VAR]
instance = config.get_cbioportal_instance()
url = f"https://{instance}/api/v2/api-docs"
# token = os.environ[CBIOPORTAL_TOKEN_VAR]
token = config.maybe_get_cbioportal_token()
# study_id = os.environ[CBIOPORTAL_COHORT_VAR]

# print(token)
# print(url)
# print(study_id)

if token is not None:
http_client = RequestsClient()
@@ -100,38 +112,51 @@ def parse_iterabc2dataframe(
return df


def save_cbioportal_data_to_csv(
df: pd.DataFrame,
) -> None:
"""Save cBioPortal data to a CSV file
Parameters
----------
df : pd.DataFrame
Dataframe of cBioPortal data
Returns
-------
None
"""
try:
path_data = config.get_output_dir()
if not os.path.exists(path_data):
os.makedirs(path_data)
study_id = config.get_cbioportal_cohort()
df.to_csv(os.path.join(path_data, f"{study_id}_mutations.csv"), index=False)
except KeyError:
print("OUTPUT_DIR not found in environment variables...")
# def save_cbioportal_data_to_csv(
# df: pd.DataFrame,
# study_id: str,
# ) -> None:
# """Save cBioPortal data to a CSV file

# Parameters
# ----------
# df : pd.DataFrame
# Dataframe of cBioPortal data
# study_id : str
# cBioPortal study ID

# Returns
# -------
# None
# """
# try:
# # path_data = os.environ[OUTPUT_DIR_VAR]
# path_data = config.get_output_dir()
# if not os.path.exists(path_data):
# os.makedirs(path_data)
# # study_id = os.environ[CBIOPORTAL_COHORT_VAR]
# # study_id = config.get_cbioportal_cohort()
# df.to_csv(os.path.join(path_data, f"{study_id}_mutations.csv"), index=False)
# except KeyError:
# print("OUTPUT_DIR not found in environment variables...")


def get_and_save_cbioportal_cohort(
# def main(
study_id: str,
) -> None:
# muts = get_all_mutations_by_study()
muts = get_all_mutations_by_study(study_id)

df_muts = parse_iterabc2dataframe(muts)
df_genes = parse_iterabc2dataframe(df_muts["gene"])
df_combo = pd.concat([df_muts, df_genes], axis=1)
df_combo = df_combo.drop(['gene'], axis=1)

save_cbioportal_data_to_csv(df_combo)
filename = f"{study_id}_mutations.csv"
io_utils.save_dataframe_to_csv(df_combo, filename)
# save_cbioportal_data_to_csv(df_combo, study_id)


# if __name__ == "__main__":
# main()
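Taken together, the revised module delegates saving to the shared io_utils helper: get_and_save_cbioportal_cohort pulls all mutations for a study, flattens the nested gene objects into columns, and writes <study_id>_mutations.csv into the configured output directory. A rough usage sketch with placeholder study ID, path, and instance; the config setter names come from the CLI diff below:

from missense_kinase_toolkit import cbioportal, config

# Placeholder configuration; in the pipeline these come from the extract_cbioportal CLI arguments.
config.set_output_dir("./outputs")
config.set_cbioportal_instance("www.cbioportal.org")

# Fetches all mutations for the study, flattens the nested gene objects,
# and saves ./outputs/msk_impact_2017_mutations.csv via io_utils.save_dataframe_to_csv.
cbioportal.get_and_save_cbioportal_cohort("msk_impact_2017")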
src/missense_kinase_toolkit/cli/extract_cbioportal.py
@@ -25,8 +25,8 @@ def parsearg_utils():
parser.add_argument(
"--instance",
type=str,
help="Optional: cBioPortal instance (e.g., `cbioportal.mskcc.org`). Default: `cbioportal.org` (str)",
default="cbioportal.org",
help="Optional: cBioPortal instance (e.g., `cbioportal.mskcc.org`). Default: `www.cbioportal.org` (str)",
default="www.cbioportal.org",
)

parser.add_argument(
@@ -36,12 +36,12 @@
help="Optional: cBioPortal API token (str)",
)

parser.add_argument(
"--requestsCache",
type=str,
default="",
help="Optional: Requests cache (str)",
)
# parser.add_argument(
# "--requestsCache",
# type=str,
# default="",
# help="Optional: Requests cache (str)",
# )

# TODO: add logging functionality
return parser
@@ -54,22 +54,23 @@ def main():
list_studies = str_studies.split(",")
list_studies = [study.strip() for study in list_studies]

# required arguments
# required argument
config.set_output_dir(args.outDir)
config.set_cbioportal_instance(args.instance)

# optional arguments
config.set_cbioportal_instance(args.instance)

try:
if args.token != "":
config.set_cbioportal_instance(args.token)
config.set_cbioportal_token(args.token)
except AttributeError:
pass

try:
if args.requestsCache != "":
config.set_cbioportal_instance(args.requestsCache)
except AttributeError:
pass
# try:
# if args.requestsCache != "":
# config.set_request_cache(args.requestsCache)
# except AttributeError:
# pass

for study in list_studies:
cbioportal.get_and_save_cbioportal_cohort(study)
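For reference, main() accepts a comma-separated cohort string and normalizes it before looping over studies. A small standalone illustration of that normalization, mirroring the split/strip logic above (the study IDs are arbitrary):

# Mirrors the splitting and stripping in main(); whitespace around commas is discarded.
str_studies = "msk_impact_2017, brca_tcga"
list_studies = [study.strip() for study in str_studies.split(",")]
print(list_studies)  # ['msk_impact_2017', 'brca_tcga']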
2 changes: 0 additions & 2 deletions src/missense_kinase_toolkit/hgnc.py
@@ -1,5 +1,3 @@
from __future__ import annotations

import requests

from missense_kinase_toolkit import requests_wrapper, utils_requests
13 changes: 7 additions & 6 deletions src/missense_kinase_toolkit/io_utils.py
@@ -2,7 +2,7 @@
import pandas as pd


DATA_CACHE_DIR = "DATA_CACHE"
OUTPUT_DIR_VAR = "OUTPUT_DIR"


def save_dataframe_to_csv(
@@ -15,8 +15,9 @@ def save_dataframe_to_csv(
----------
df : pd.DataFrame
Dataframe to save
output_path : str
Path to save the CSV file
filename : str
Filename to save (either with or without "csv" suffix)
Returns
-------
@@ -25,9 +26,9 @@
filename = filename.replace(".csv", "") + ".csv"

try:
path_data = os.environ[DATA_CACHE_DIR]
path_data = os.environ[OUTPUT_DIR_VAR]
if not os.path.exists(path_data):
os.makedirs(path_data)
df.to_csv(os.path.join(path_data, f"{filename}_mutations.csv"), index=False)
df.to_csv(os.path.join(path_data, filename), index=False)
except KeyError:
print("DATA_CACHE not found in environment variables...")
print("OUTPUT_DIR not found in environment variables...")
2 changes: 0 additions & 2 deletions src/missense_kinase_toolkit/pfam.py
@@ -1,5 +1,3 @@
from __future__ import annotations

import json

import pandas as pd
3 changes: 3 additions & 0 deletions src/missense_kinase_toolkit/requests_wrapper.py
@@ -31,6 +31,9 @@ def get_cached_session():
if REQUEST_CACHE_VAR in os.environ:
cache_location = os.environ[REQUEST_CACHE_VAR]

if not os.path.exists(cache_location):
os.makedirs(cache_location)

session = CachedSession(
cache_location, allowable_codes=(200, 404, 400), backend="sqlite"
)
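With this change, get_cached_session() creates the cache directory named by REQUESTS_CACHE if it does not yet exist before building the sqlite-backed CachedSession. A minimal sketch, with a placeholder cache path and an arbitrary example URL:

import os

from missense_kinase_toolkit import requests_wrapper

os.environ["REQUESTS_CACHE"] = "./requests_cache"  # placeholder cache location

session = requests_wrapper.get_cached_session()
response = session.get("https://www.cbioportal.org/api/info")
print(response.status_code, getattr(response, "from_cache", False))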
16 changes: 16 additions & 0 deletions src/nextflow/extract_cbioportal.nf
@@ -0,0 +1,16 @@
process PROCESS_CBIOPORTAL {
input:
tuple val(cbio_cohort), path(out_dir), val(cbio_inst), val(cbio_token), path(request_cache)

output:
path("${out_dir}/cbioportal")
"""
export PYTHONHASHSEED=0
extract_cbioportal \
--cohort ${cbio_cohort} \
--outDir ${out_dir} \
--instance ${cbio_inst} \
--token ${cbio_token} \
--requestsCache ${request_cache}
"""
}
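The PROCESS_CBIOPORTAL script block above is essentially a shell wrapper around the renamed CLI. A rough Python equivalent of what the process runs per cohort, with every value a placeholder standing in for the corresponding tuple element:

import os
import subprocess

# Placeholder stand-ins for cbio_cohort, out_dir, and cbio_inst.
subprocess.run(
    [
        "extract_cbioportal",
        "--cohort", "msk_impact_2017",
        "--outDir", "outputs",
        "--instance", "www.cbioportal.org",
    ],
    check=True,
    env={**os.environ, "PYTHONHASHSEED": "0"},
)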
24 changes: 0 additions & 24 deletions src/nextflow/process_cbioportal.nf

This file was deleted.
