Skip to content

Commit

Permalink
reformatted cbioportal pipeline for nextflow compatibility
Browse files Browse the repository at this point in the history
  • Loading branch information
jessicaw9910 committed Apr 3, 2024
1 parent 1f1e326 commit 2062134
Show file tree
Hide file tree
Showing 9 changed files with 279 additions and 80 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -187,3 +187,4 @@ $RECYCLE.BIN/
# Requests cache directory
requests_cache/
data_cache/
*params.json
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ pytest-runner = "^6.0.1"
[tool.pytest.ini_options]
pythonpath = ["src"]

[tool.poetry.scripts]
process_cbioportal = "missense_kinase_toolkit.cli.process_cbioportal:main"

[tool.poetry-dynamic-versioning]
enable = true
vcs = "git"
Expand Down
94 changes: 15 additions & 79 deletions src/missense_kinase_toolkit/cbioportal.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,71 +4,15 @@

import os
import pandas as pd
import sys

from bravado.client import SwaggerClient
from bravado.requests_client import RequestsClient


# Names of the environment variables this module reads via os.environ.
# Environment variable holding the cBioPortal API token (optional).
CBIOPORTAL_TOKEN_VAR = "CBIOPORTAL_TOKEN"
# Environment variable holding the cBioPortal instance (optional).
CBIOPORTAL_INSTANCE_VAR = "CBIOPORTAL_INSTANCE"
# Environment variable holding the directory where output CSV files are written.
DATA_CACHE_DIR = "DATA_CACHE"
# Environment variable holding the cohort (study ID) to query; the lookup exits if unset.
CBIOPORTAL_COHORT_VAR = "CBIOPORTAL_COHORT"


def maybe_get_cbioportal_token_from_env() -> str | None:
    """Look up the cBioPortal token in the process environment.

    Returns
    -------
    str | None
        Value of the environment variable named by ``CBIOPORTAL_TOKEN_VAR``,
        or None when that variable is not set
    """
    # Mapping.get returns None for a missing key, same as catching KeyError.
    return os.environ.get(CBIOPORTAL_TOKEN_VAR)


def maybe_get_cbioportal_instance_from_env() -> str | None:
    """Look up the cBioPortal instance in the process environment.

    Returns
    -------
    str | None
        Value of the environment variable named by ``CBIOPORTAL_INSTANCE_VAR``,
        or None when that variable is not set
    """
    # Mapping.get returns None for a missing key, same as catching KeyError.
    return os.environ.get(CBIOPORTAL_INSTANCE_VAR)


def maybe_get_cbioportal_cohort_from_env() -> str:
    """Get the cBioPortal cohort (study ID) from the environment.

    Unlike the token and instance lookups, the cohort is mandatory: when the
    variable is missing the process is terminated instead of returning None.

    Returns
    -------
    str
        Value of the environment variable named by ``CBIOPORTAL_COHORT_VAR``

    Raises
    ------
    SystemExit
        With exit code 1 if the environment variable is not set
    """
    try:
        cohort = os.environ[CBIOPORTAL_COHORT_VAR]
    except KeyError:
        # Fatal diagnostics belong on stderr so they do not mix with regular output.
        print(
            "Cohort not found in environment variables. This is necessary to run analysis. Exiting...",
            file=sys.stderr,
        )
        sys.exit(1)

    return cohort
from missense_kinase_toolkit import config


def get_all_mutations_by_study(
study_id: str,
) -> list | None:
"""Get mutations cBioPortal data
Expand All @@ -77,19 +21,11 @@ def get_all_mutations_by_study(
list | None
cBioPortal data of Abstract Base Classes objects if successful, otherwise None
"""
token = maybe_get_cbioportal_token_from_env()

instance = maybe_get_cbioportal_instance_from_env()
if instance is not None:
url = f"https://{instance}/api/v2/api-docs"
else:
url = "https://cbioportal.org/api/v2/api-docs"
instance = config.get_cbioportal_instance()
url = f"https://{instance}/api/v2/api-docs"
token = config.maybe_get_cbioportal_token()

# Zehir, 2017 MSKCC sequencing cohort is "msk_impact_2017"
# MSKCC clinical sequencing cohort is "mskimpact"
study_id = maybe_get_cbioportal_cohort_from_env()

if all(v is not None for v in (token, instance)):
if token is not None:
http_client = RequestsClient()
http_client.set_api_key(
instance,
Expand Down Expand Up @@ -179,23 +115,23 @@ def save_cbioportal_data_to_csv(
None
"""
try:
path_data = os.environ[DATA_CACHE_DIR]
path_data = config.get_output_dir()
if not os.path.exists(path_data):
os.makedirs(path_data)
study_id = maybe_get_cbioportal_cohort_from_env()
study_id = config.get_cbioportal_cohort()
df.to_csv(os.path.join(path_data, f"{study_id}_mutations.csv"), index=False)
except KeyError:
print("DATA_CACHE not found in environment variables...")
print("OUTPUT_DIR not found in environment variables...")


def get_and_save_cbioportal_cohort(
study_id: str,
) -> None:
muts = get_all_mutations_by_study(study_id)

def main():
muts = get_all_mutations_by_study()
df_muts = parse_iterabc2dataframe(muts)
df_genes = parse_iterabc2dataframe(df_muts["gene"])
df_combo = pd.concat([df_muts, df_genes], axis=1)
df_combo = df_combo.drop(['gene'], axis=1)
save_cbioportal_data_to_csv(df_combo)


if __name__ == "__main__":
main()
save_cbioportal_data_to_csv(df_combo)
Empty file.
73 changes: 73 additions & 0 deletions src/missense_kinase_toolkit/cli/process_cbioportal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import argparse

from missense_kinase_toolkit import config, cbioportal

def parsearg_utils():
    """Build the command-line parser for the cBioPortal processing script.

    Returns
    -------
    argparse.ArgumentParser
        Parser exposing ``--cohort``, ``--outDir``, ``--instance``,
        ``--token`` and ``--requestsCache``
    """
    parser = argparse.ArgumentParser(
        description="Get mutations from cBioPortal cohort and instance"
    )

    parser.add_argument(
        "--cohort",
        type=str,
        help="Optional: cBioPortal cohort IDs separated by commas (e.g., `msk_impact_2017` for Zehir, 2017 and `mskimpact` for MSKCC clinical sequencing cohort)",
        default="msk_impact_2017",
    )

    # The help text declares this argument required, so let argparse enforce it;
    # otherwise a missing value surfaces later as None being stored as the output dir.
    parser.add_argument(
        "--outDir",
        type=str,
        required=True,
        help="Required: Output directory path (str)",
    )

    parser.add_argument(
        "--instance",
        type=str,
        help="Optional: cBioPortal instance (e.g., `cbioportal.mskcc.org`). Default: `cbioportal.org` (str)",
        default="cbioportal.org",
    )

    parser.add_argument(
        "--token",
        type=str,
        default="",
        help="Optional: cBioPortal API token (str)",
    )

    parser.add_argument(
        "--requestsCache",
        type=str,
        default="",
        help="Optional: Requests cache (str)",
    )

    # TODO: add logging functionality
    return parser


def main():
    """Entry point: parse CLI arguments, store them in config, and process each cohort.

    The comma-separated ``--cohort`` value is split into individual study IDs.
    Output directory and instance are always stored; token and requests cache
    are stored only when a non-empty value was supplied. Each study is then
    passed to ``cbioportal.get_and_save_cbioportal_cohort``.

    Returns
    -------
    None
    """
    args = parsearg_utils().parse_args()

    # Split on commas, trim whitespace, and drop empty entries (e.g. trailing comma).
    list_studies = [
        study.strip() for study in args.cohort.split(",") if study.strip()
    ]

    # required arguments
    config.set_output_dir(args.outDir)
    config.set_cbioportal_instance(args.instance)

    # optional arguments: each goes to its own setter; previously both were
    # written to the instance variable, clobbering it and never setting the
    # token or the requests cache.
    if args.token:
        config.set_cbioportal_token(args.token)

    if args.requestsCache:
        config.set_request_cache(args.requestsCache)

    # NOTE(review): cbioportal.save_cbioportal_data_to_csv appears to call
    # config.get_cbioportal_cohort(), which config.py does not seem to define;
    # confirm before relying on the saved file name.
    for study in list_studies:
        cbioportal.get_and_save_cbioportal_cohort(study)
138 changes: 138 additions & 0 deletions src/missense_kinase_toolkit/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import os
import sys


# Names of the environment variables used by the setters/getters in this module.
# Directory where results are written; the getter exits if it is unset.
OUTPUT_DIR_VAR = "OUTPUT_DIR"
# cBioPortal instance; the getter exits if it is unset.
CBIOPORTAL_INSTANCE_VAR = "CBIOPORTAL_INSTANCE"
# cBioPortal API token; optional, the getter returns None if it is unset.
CBIOPORTAL_TOKEN_VAR = "CBIOPORTAL_TOKEN"
# Requests cache path; optional, the getter returns None if it is unset.
# Note the constant is singular (REQUEST) while the variable name is plural (REQUESTS).
REQUEST_CACHE_VAR = "REQUESTS_CACHE"


def set_output_dir(val: str) -> None:
    """Store the output directory path in the process environment.

    Parameters
    ----------
    val : str
        Output directory path, written under ``OUTPUT_DIR_VAR``

    Returns
    -------
    None
    """
    os.environ.update({OUTPUT_DIR_VAR: val})


def get_output_dir() -> str:
    """Get the output directory from the environment.

    The output directory is mandatory: when the variable is missing the
    process is terminated instead of returning None.

    Returns
    -------
    str
        Output directory path stored under ``OUTPUT_DIR_VAR``

    Raises
    ------
    SystemExit
        With exit code 1 if the environment variable is not set
    """
    try:
        return os.environ[OUTPUT_DIR_VAR]
    except KeyError:
        # Fatal diagnostics belong on stderr so they do not mix with regular output.
        print(
            "Output directory not found in environment variables. This is necessary to run analysis. Exiting...",
            file=sys.stderr,
        )
        sys.exit(1)


def set_cbioportal_instance(val: str) -> None:
    """Store the cBioPortal instance in the process environment.

    Parameters
    ----------
    val : str
        cBioPortal instance, e.g., "cbioportal.mskcc.org"; written under
        ``CBIOPORTAL_INSTANCE_VAR``

    Returns
    -------
    None
    """
    os.environ.update({CBIOPORTAL_INSTANCE_VAR: val})


def get_cbioportal_instance() -> str:
    """Get the cBioPortal instance from the environment.

    The instance is mandatory: when the variable is missing the process is
    terminated instead of returning None.

    Returns
    -------
    str
        cBioPortal instance stored under ``CBIOPORTAL_INSTANCE_VAR``

    Raises
    ------
    SystemExit
        With exit code 1 if the environment variable is not set
    """
    try:
        return os.environ[CBIOPORTAL_INSTANCE_VAR]
    except KeyError:
        # Fatal diagnostics belong on stderr so they do not mix with regular output.
        print(
            "cBioPortal instance not found in environment variables. This is necessary to run analysis. Exiting...",
            file=sys.stderr,
        )
        sys.exit(1)


def set_cbioportal_token(val: str) -> None:
    """Store the cBioPortal token in the process environment.

    Parameters
    ----------
    val : str
        cBioPortal token, written under ``CBIOPORTAL_TOKEN_VAR``

    Returns
    -------
    None
    """
    os.environ.update({CBIOPORTAL_TOKEN_VAR: val})


def maybe_get_cbioportal_token() -> str | None:
    """Look up the cBioPortal token in the process environment.

    Returns
    -------
    str | None
        Value stored under ``CBIOPORTAL_TOKEN_VAR``, or None when that
        variable is not set
    """
    # Mapping.get returns None for a missing key, same as catching KeyError.
    return os.environ.get(CBIOPORTAL_TOKEN_VAR)


def set_request_cache(val: str) -> None:
    """Store the request cache path in the process environment.

    Parameters
    ----------
    val : str
        Request cache path, written under ``REQUEST_CACHE_VAR``

    Returns
    -------
    None
    """
    os.environ.update({REQUEST_CACHE_VAR: val})


def maybe_get_request_cache() -> str | None:
    """Look up the request cache path in the process environment.

    Returns
    -------
    str | None
        Value stored under ``REQUEST_CACHE_VAR``, or None when that
        variable is not set
    """
    # Mapping.get returns None for a missing key, same as catching KeyError.
    return os.environ.get(REQUEST_CACHE_VAR)
2 changes: 1 addition & 1 deletion src/missense_kinase_toolkit/scrapers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd

def scrape_kinhub(
url: str ='http://www.kinhub.org/kinases.html'
url: str = "http://www.kinhub.org/kinases.html",
) -> pd.DataFrame:
"""Scrape the KinHub database for kinase information
Expand Down
24 changes: 24 additions & 0 deletions src/nextflow/README.MD
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Nextflow workflow to run the pipeline

To run: `nextflow run main.nf -params-file params.json`

Generate your own `params.json` file using the following parameters:
```
{
"CBIOPORTAL_COHORT" : "TODO",
"OUTPUT_DIR" : "TODO",
"CBIOPORTAL_INSTANCE" : "TODO",
"CBIOPORTAL_TOKEN" : "TODO",
"REQUESTS_CACHE" : "TODO"
}
```

Below is a description of what each variable should contain. If a variable is optional and not in use, omit its entry from the `json` file.

| Variable | Optional | Description |
| :--------------------| :------: | :---------- |
| 'CBIOPORTAL_COHORT' | No | cBioPortal cohort to analyze |
| 'OUTPUT_DIR' | No | Path to outdir to save data |
| 'CBIOPORTAL_INSTANCE'| Yes | `cbioportal.org` if none provided |
| 'CBIOPORTAL_TOKEN' | Yes | Data Access Token if using private instance|
| 'REQUESTS_CACHE' | Yes | Path to dir to cache requests data |
Loading

0 comments on commit 2062134

Please sign in to comment.