Nextflow (#15)
* reformatted cbioportal pipeline for nextflow compatibility

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* minor in progress changes in process_cbioportal.nf

* added shebang to process_cbioportal.py

* updated cbioportal scripts to conform to NF ETL pipeline

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* interim commit for transform_cbioportal
resolved conflicts in cbioportal.py

* added transform_cbioportal CLI code

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* .gitignore and pyproject.toml changes for transform_cbioportal CLI changes

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* updated NF README

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* updated cli and pfam scripts

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* refactored cbioportal as class instead of functions

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* separated the extraction of the kinase annotations from the cbioportal transform CLI

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* added poetry to CI.yaml

* tried poetry CI.yaml config alteration from python-poetry/poetry#2629

* python-poetry/poetry#5490

* unmarked process_cbioportal.nf as an executable

* reverting by removing SETUPTOOLS_USE_DISTUTILS=stdlib in CI.yaml

* commenting out poetry config virtualenvs.create false

* https://github.com/orgs/python-poetry/discussions/4943

* removed poetry pytest and just running pytest; last fix resolved installation errors

* trying poetry run pytest

* reverting back to using conda instead of poetry for CI testing - will likely need to update test_env.yaml in devtools

* added pandas and bravado to test_env.yaml, commented out install poetry in CI.yaml

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* rebased test_env.yaml

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* added beautifulsoup4 and numpy to test_env.yaml

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
jessicaw9910 and pre-commit-ci[bot] authored Apr 30, 2024
1 parent a300939 commit f00c721
Showing 15 changed files with 542 additions and 124 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/CI.yaml
@@ -27,6 +27,9 @@ jobs:
steps:
- uses: actions/checkout@v3

# - name: Install poetry
# run: pipx install poetry

- name: Additional info about the build
shell: bash
run: |
@@ -42,6 +45,7 @@
environment-name: test
# conda-forge is the default channel now and does not need to be specified
channels: conda-forge,defaults
# cache: 'poetry'
extra-specs: |
python=${{ matrix.python-version }}
@@ -50,13 +54,18 @@
shell: bash -l {0}
run: |
python -m pip install . --no-deps
# python -m pip install poetry
# python -m venv .venv --copies
# poetry config virtualenvs.create false
# poetry install
micromamba list
- name: Run tests
# conda setup requires this special shell
shell: bash -l {0}
run: |
pytest -v --cov=missense_kinase_toolkit --cov-report=xml --color=yes tests/
# poetry run pytest -v --cov=missense_kinase_toolkit --cov-report=xml --color=yes tests/
- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@…
7 changes: 7 additions & 0 deletions devtools/conda-envs/test_env.yaml
@@ -17,3 +17,10 @@ dependencies:
# Pip-only installs
#- pip:
# - codecov

# Other
- pandas
- bravado
- requests-cache
- beautifulsoup4
- numpy
36 changes: 27 additions & 9 deletions readthedocs.yml
@@ -1,15 +1,33 @@
# readthedocs.yml
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# taken from https://education.molssi.org/python-package-best-practices/10-documentation.html#read-the-docs

# Required
version: 2

# Set the OS, Python version and other tools you might need
build:
image: latest
os: ubuntu-22.04
tools:
python: "3.11"
# You can also specify other tool versions:
# nodejs: "19"
# rust: "1.64"
# golang: "1.19"

python:
version: 3.8
install:
- method: pip
path: .
# Build documentation in the "docs/" directory with Sphinx
sphinx:
configuration: docs/conf.py

# Optionally build your docs in additional formats such as PDF and ePub
# formats:
# - pdf
# - epub

conda:
environment: docs/requirements.yaml
# Optional but recommended, declare the Python requirements required
# to build your documentation
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
install:
- requirements: docs/requirements.yaml
3 changes: 3 additions & 0 deletions src/__init__.py
@@ -0,0 +1,3 @@
"""An ETL pipeline package to facilitate structure-based ML for missense human kinase property prediction"""

from missense_kinase_toolkit import *
181 changes: 108 additions & 73 deletions src/missense_kinase_toolkit/cbioportal.py
@@ -1,74 +1,19 @@
#!/usr/bin/env python3

import os
import logging
import pandas as pd

from bravado.client import SwaggerClient
from bravado.requests_client import RequestsClient
# from pydantic import BaseModel
# from typing import ClassVar

from missense_kinase_toolkit import config, io_utils
from missense_kinase_toolkit import config, io_utils, utils_requests


def get_all_mutations_by_study(
study_id: str,
) -> list | None:
"""Get mutations cBioPortal data
Returns
-------
list | None
cBioPortal data of Abstract Base Classes objects if successful, otherwise None
"""
instance = config.get_cbioportal_instance()
url = f"https://{instance}/api/v2/api-docs"
token = config.maybe_get_cbioportal_token()

if token is not None:
http_client = RequestsClient()
http_client.set_api_key(
instance,
f"Bearer {token}",
param_name='Authorization',
param_in='header'
)
cbioportal = SwaggerClient.from_url(
url,
http_client=http_client,
config={
"validate_requests":False,
"validate_responses":False,
"validate_swagger_spec": False
}
)
else:
cbioportal = SwaggerClient.from_url(
url,
config={
"validate_requests":False,
"validate_responses":False,
"validate_swagger_spec": False
}
)

studies = cbioportal.Studies.getAllStudiesUsingGET().result()
study_ids = [study.studyId for study in studies]

if study_id in study_ids:
#TODO: add error handling
#TODO: extract multiple studies
muts = cbioportal.Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(
molecularProfileId=f"{study_id}_mutations",
sampleListId=f"{study_id}_all",
projection="DETAILED"
).result()
else:
raise ValueError(f"Study {study_id} not found in cBioPortal instance {instance}")

return muts
logger = logging.getLogger(__name__)


def parse_iterabc2dataframe(
list_input: iter,
input_object: iter,
) -> pd.DataFrame:
"""Parse an iterable containing Abstract Base Classes into a dataframe
@@ -82,11 +27,11 @@ def parse_iterabc2dataframe(
pd.DataFrame
Dataframe for the input list of Abstract Base Classes objects
"""
list_dir = [dir(entry) for entry in list_input]
list_dir = [dir(entry) for entry in input_object]
set_dir = {item for sublist in list_dir for item in sublist}

dict_dir = {attr: [] for attr in set_dir}
for entry in list_input:
for entry in input_object:
for attr in dict_dir.keys():
try:
dict_dir[attr].append(getattr(entry, attr))
@@ -99,16 +44,106 @@ def parse_iterabc2dataframe(
return df
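The flattening step above (take the union of attribute names across all entries, then collect each attribute per entry with a None fallback) can be sketched without any cBioPortal dependency. This is a simplified reconstruction, not the packaged function: it uses `vars()` on plain namespace objects rather than `dir()` on bravado Abstract Base Classes entries, and it stops short of the final `pd.DataFrame` conversion.

```python
from types import SimpleNamespace

def flatten_objects(input_object):
    # union of attribute names across all entries (sorted for stable order)
    list_dir = [list(vars(entry)) for entry in input_object]
    set_dir = {item for sublist in list_dir for item in sublist}

    dict_dir = {attr: [] for attr in sorted(set_dir)}
    for entry in input_object:
        for attr in dict_dir:
            # entries missing an attribute get None, mirroring the
            # try/except around getattr in the original function
            dict_dir[attr].append(getattr(entry, attr, None))
    return dict_dir

# toy stand-ins for the mutation records returned by the cBioPortal API
muts = [
    SimpleNamespace(gene="BRAF", proteinChange="V600E"),
    SimpleNamespace(gene="EGFR"),  # no proteinChange attribute
]
flat = flatten_objects(muts)
print(flat)
# {'gene': ['BRAF', 'EGFR'], 'proteinChange': ['V600E', None]}
```

Passing `dict_dir` to `pd.DataFrame(...)` then yields one column per attribute, which is what the function returns.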


def get_and_save_cbioportal_cohort(
study_id: str,
) -> None:
muts = get_all_mutations_by_study(study_id)
class cBioPortal():
# instance: ClassVar[str] = f"{config.get_cbioportal_instance()}"
# url: ClassVar[str] = f"https://{instance}/api/v2/api-docs"
# cbioportal: ClassVar[SwaggerClient | None] = None

def __init__(self):
self.instance = config.get_cbioportal_instance()
self.url = f"https://{self.instance}/api/v2/api-docs"
self._cbioportal = self.get_cbioportal_api()

def _set_api_key(self):
token = config.maybe_get_cbioportal_token()
http_client = RequestsClient()
if token is not None:
http_client.set_api_key(
self.instance,
f"Bearer {token}",
param_name="Authorization",
param_in="header"
)
else:
logger.warning("No API token provided")
return http_client

def get_cbioportal_api(self):
http_client = self._set_api_key()

cbioportal_api = SwaggerClient.from_url(
self.url,
http_client=http_client,
config={
"validate_requests": False,
"validate_responses": False,
"validate_swagger_spec": False
}
)

# response = cbioportal_api.Studies.getAllStudiesUsingGET().response().incoming_response
# logger.error(utils_requests.print_status_code(response.status_code))

return cbioportal_api

def get_instance(self):
return self.instance

def get_url(self):
return self.url

def get_cbioportal(self):
return self._cbioportal


class Mutations(cBioPortal):
def __init__(
self,
study_id: str,
) -> None:
super().__init__()
self.study_id = study_id
self._mutations = self.get_all_mutations_by_study()

def get_all_mutations_by_study(
self,
) -> list | None:
"""Get mutations data from cBioPortal
Returns
-------
list | None
cBioPortal data of Abstract Base Classes objects if successful, otherwise None
"""
studies = self._cbioportal.Studies.getAllStudiesUsingGET().result()
study_ids = [study.studyId for study in studies]

if self.study_id in study_ids:
#TODO: add incremental error handling beyond missing study
muts = self._cbioportal.Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(
molecularProfileId=f"{self.study_id}_mutations",
sampleListId=f"{self.study_id}_all",
projection="DETAILED"
).result()
else:
muts = None
logger.error(f"Study {self.study_id} not found in cBioPortal instance {self.instance}")

return muts

def get_and_save_cbioportal_cohort_mutations(
self,
) -> None:
df_muts = parse_iterabc2dataframe(self._mutations)
df_genes = parse_iterabc2dataframe(df_muts["gene"])
df_combo = pd.concat([df_muts, df_genes], axis=1)
df_combo = df_combo.drop(["gene"], axis=1)

filename = f"{self.study_id}_mutations.csv"

df_muts = parse_iterabc2dataframe(muts)
df_genes = parse_iterabc2dataframe(df_muts["gene"])
df_combo = pd.concat([df_muts, df_genes], axis=1)
df_combo = df_combo.drop(["gene"], axis=1)
io_utils.save_dataframe_to_csv(df_combo, filename)

filename = f"{study_id}_mutations.csv"
def get_study_id(self):
return self.study_id

io_utils.save_dataframe_to_csv(df_combo, filename)
def get_mutations(self):
return self._mutations
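The concat/drop step in `get_and_save_cbioportal_cohort_mutations` splits the nested `gene` objects into their own columns before writing the CSV. A minimal sketch with toy data (the attribute names `hugoGeneSymbol` and `entrezGeneId` are illustrative stand-ins, not a guaranteed cBioPortal schema, and plain `vars()` substitutes for `parse_iterabc2dataframe`):

```python
import pandas as pd
from types import SimpleNamespace

# toy mutation records whose "gene" column holds nested objects
df_muts = pd.DataFrame({
    "proteinChange": ["V600E", "L858R"],
    "gene": [SimpleNamespace(hugoGeneSymbol="BRAF", entrezGeneId=673),
             SimpleNamespace(hugoGeneSymbol="EGFR", entrezGeneId=1956)],
})

# flatten each nested gene object into its own columns
df_genes = pd.DataFrame([vars(g) for g in df_muts["gene"]])

# join the flattened columns, then drop the nested originals
df_combo = pd.concat([df_muts, df_genes], axis=1).drop(["gene"], axis=1)
print(list(df_combo.columns))
# ['proteinChange', 'hugoGeneSymbol', 'entrezGeneId']
```

`pd.concat(..., axis=1)` aligns on the shared default index, so each mutation row keeps its own gene attributes.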
29 changes: 15 additions & 14 deletions src/missense_kinase_toolkit/cli/extract_cbioportal.py
@@ -2,24 +2,31 @@

import argparse

from missense_kinase_toolkit import config, cbioportal
from missense_kinase_toolkit import config, io_utils, cbioportal

def parsearg_utils():
parser = argparse.ArgumentParser(
description="Get mutations from cBioPortal cohort and instance"
description="Get mutations from cBioPortal instance for all specified studies."
)

parser.add_argument(
"--cohort",
"--outDir",
type=str,
help="Optional: cBioPortal cohort IDs separated by commas (e.g., `msk_impact_2017` for Zehir, 2017 and `mskimpact` for MSKCC clinical sequencing cohort) (str)",
default="msk_impact_2017",
help="Required: Output directory path (str)",
)

parser.add_argument(
"--outDir",
"--requestsCache",
type=str,
help="Required: Output directory path (str)",
default="requests_cache",
help="Optional: Requests cache; default: `requests_cache` (str)",
)

parser.add_argument(
"--cohort",
type=str,
help="Optional: cBioPortal cohort IDs separated by commas (e.g., `msk_impact_2017` for Zehir, 2017 and `mskimpact` for MSKCC clinical sequencing cohort) (str)",
default="msk_impact_2017",
)

parser.add_argument(
@@ -37,21 +44,15 @@ def parsearg_utils()
)

# TODO: add logging functionality
# TODO: cache requests for cBioPortal API
return parser


def main():
args = parsearg_utils().parse_args()

str_studies = args.cohort
list_studies = str_studies.split(",")
list_studies = [study.strip() for study in list_studies]
list_studies = io_utils.convert_str2list(args.cohort)

# required argument
config.set_output_dir(args.outDir)

# optional arguments
config.set_cbioportal_instance(args.instance)

try:
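The new `main()` delegates the comma-splitting of `--cohort` to `io_utils.convert_str2list`, which is not shown in this diff. Judging from the three lines it replaces (split on commas, strip whitespace from each entry), a plausible reconstruction is:

```python
def convert_str2list(input_str: str) -> list[str]:
    # hypothetical reconstruction of io_utils.convert_str2list: split the
    # comma-separated string and strip surrounding whitespace per entry
    return [entry.strip() for entry in input_str.split(",")]

print(convert_str2list("msk_impact_2017, mskimpact"))
# ['msk_impact_2017', 'mskimpact']
```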