diff --git a/.github/workflows/CI.yaml b/.github/workflows/CI.yaml
index 0b9fa90..9a9c7dd 100644
--- a/.github/workflows/CI.yaml
+++ b/.github/workflows/CI.yaml
@@ -27,6 +27,9 @@ jobs:
     steps:
       - uses: actions/checkout@v3
 
+      # - name: Install poetry
+      #   run: pipx install poetry
+
       - name: Additional info about the build
         shell: bash
         run: |
@@ -42,6 +45,7 @@ jobs:
           environment-name: test
          # conda-forge is the default channel now and does not need to be specified
           channels: conda-forge,defaults
+          # cache: 'poetry'
           extra-specs: |
             python=${{ matrix.python-version }}
@@ -50,6 +54,10 @@ jobs:
         shell: bash -l {0}
         run: |
           python -m pip install . --no-deps
+          # python -m pip install poetry
+          # python -m venv .venv --copies
+          # poetry config virtualenvs.create false
+          # poetry install
           micromamba list
 
       - name: Run tests
@@ -57,6 +65,7 @@ jobs:
         shell: bash -l {0}
         run: |
           pytest -v --cov=missense_kinase_toolkit --cov-report=xml --color=yes tests/
+          # poetry run pytest -v --cov=missense_kinase_toolkit --cov-report=xml --color=yes tests/
 
       - name: Upload coverage reports to Codecov
         uses: codecov/codecov-action@v4.0.1
diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml
index 7efc6de..7210450 100644
--- a/devtools/conda-envs/test_env.yaml
+++ b/devtools/conda-envs/test_env.yaml
@@ -17,3 +17,10 @@ dependencies:
   # Pip-only installs
   #- pip:
   #  - codecov
+
+  # Other
+  - pandas
+  - bravado
+  - requests-cache
+  - beautifulsoup4
+  - numpy
diff --git a/readthedocs.yml b/readthedocs.yml
index 95b50ae..319a3be 100644
--- a/readthedocs.yml
+++ b/readthedocs.yml
@@ -1,15 +1,33 @@
-# readthedocs.yml
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+# taken from https://education.molssi.org/python-package-best-practices/10-documentation.html#read-the-docs
+
+# Required
 version: 2
 
+# Set the OS, Python version and other tools you might need
 build:
-  image: latest
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"
+    # You can also specify other tool versions:
+    # nodejs: "19"
+    # rust: "1.64"
+    # golang: "1.19"
 
-python:
-  version: 3.8
-  install:
-    - method: pip
-      path: .
+# Build documentation in the "docs/" directory with Sphinx
+sphinx:
+  configuration: docs/conf.py
+
+# Optionally build your docs in additional formats such as PDF and ePub
+# formats:
+#   - pdf
+#   - epub
 
-conda:
-  environment: docs/requirements.yaml
+# Optional but recommended, declare the Python requirements required
+# to build your documentation
+# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
+python:
+  install:
+    - requirements: docs/requirements.yaml
diff --git a/src/__init__.py b/src/__init__.py
index e69de29..091ac24 100644
--- a/src/__init__.py
+++ b/src/__init__.py
@@ -0,0 +1,3 @@
+"""An ETL pipeline package to facilitate structure-based ML for human kinase missense-variant property prediction."""
+
+from missense_kinase_toolkit import *
diff --git a/src/missense_kinase_toolkit/cbioportal.py b/src/missense_kinase_toolkit/cbioportal.py
index 67403ef..4cbe0cc 100644
--- a/src/missense_kinase_toolkit/cbioportal.py
+++ b/src/missense_kinase_toolkit/cbioportal.py
@@ -1,74 +1,19 @@
-#!/usr/bin/env python3
-
-import os
+import logging
+from typing import Iterable
 
 import pandas as pd
 from bravado.client import SwaggerClient
 from bravado.requests_client import RequestsClient
+# from pydantic import BaseModel
+# from typing import ClassVar
 
-from missense_kinase_toolkit import config, io_utils
+from missense_kinase_toolkit import config, io_utils, utils_requests
 
-def get_all_mutations_by_study(
-    study_id: str,
-) -> list | None:
-    """Get mutations cBioPortal data
-
-    Returns
-    -------
-    list | None
-        cBioPortal data of Abstract Base Classes objects if successful, otherwise None
-    """
-    instance = config.get_cbioportal_instance()
-    url = f"https://{instance}/api/v2/api-docs"
-    token = config.maybe_get_cbioportal_token()
-
-    if token is not None:
-        http_client = RequestsClient()
-        http_client.set_api_key(
-            instance,
-            f"Bearer {token}",
-            param_name='Authorization',
-            param_in='header'
-        )
-        cbioportal = SwaggerClient.from_url(
-            url,
-            http_client=http_client,
-            config={
-                "validate_requests": False,
-                "validate_responses": False,
-                "validate_swagger_spec": False
-            }
-        )
-    else:
-        cbioportal = SwaggerClient.from_url(
-            url,
-            config={
-                "validate_requests": False,
-                "validate_responses": False,
-                "validate_swagger_spec": False
-            }
-        )
-
-    studies = cbioportal.Studies.getAllStudiesUsingGET().result()
-    study_ids = [study.studyId for study in studies]
-
-    if study_id in study_ids:
-        #TODO: add error handling
-        #TODO: extract multiple studies
-        muts = cbioportal.Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(
-            molecularProfileId=f"{study_id}_mutations",
-            sampleListId=f"{study_id}_all",
-            projection="DETAILED"
-        ).result()
-    else:
-        raise ValueError(f"Study {study_id} not found in cBioPortal instance {instance}")
-
-    return muts
+logger = logging.getLogger(__name__)
 
 def parse_iterabc2dataframe(
-    list_input: iter,
+    input_object: Iterable,
 ) -> pd.DataFrame:
     """Parse an iterable containing Abstract Base Classes into a dataframe
@@ -82,11 +27,11 @@ def parse_iterabc2dataframe(
     pd.DataFrame
         Dataframe for the input list of Abstract Base Classes objects
     """
-    list_dir = [dir(entry) for entry in list_input]
+    list_dir = [dir(entry) for entry in input_object]
     set_dir = {item for sublist in list_dir for item in sublist}
     dict_dir = {attr: [] for attr in set_dir}
 
-    for entry in list_input:
+    for entry in input_object:
         for attr in dict_dir.keys():
             try:
                 dict_dir[attr].append(getattr(entry, attr))
@@ -99,16 +44,106 @@ def parse_iterabc2dataframe(
     return df
 
-def get_and_save_cbioportal_cohort(
-    study_id: str,
-) -> None:
-    muts = get_all_mutations_by_study(study_id)
+class cBioPortal():
+    # instance: ClassVar[str] = f"{config.get_cbioportal_instance()}"
+    # url: ClassVar[str] = f"https://{instance}/api/v2/api-docs"
+    # cbioportal: ClassVar[SwaggerClient | None] = None
+
+    def __init__(self):
+        self.instance = config.get_cbioportal_instance()
+        self.url = f"https://{self.instance}/api/v2/api-docs"
+        self._cbioportal = self.get_cbioportal_api()
+
+    def _set_api_key(self):
+        token = config.maybe_get_cbioportal_token()
+        http_client = RequestsClient()
+        if token is not None:
+            http_client.set_api_key(
+                self.instance,
+                f"Bearer {token}",
+                param_name="Authorization",
+                param_in="header"
+            )
+        else:
+            logger.warning("No API token provided")
+        return http_client
+
+    def get_cbioportal_api(self):
+        http_client = self._set_api_key()
+
+        cbioportal_api = SwaggerClient.from_url(
+            self.url,
+            http_client=http_client,
+            config={
+                "validate_requests": False,
+                "validate_responses": False,
+                "validate_swagger_spec": False
+            }
+        )
+
+        # response = cbioportal_api.Studies.getAllStudiesUsingGET().response().incoming_response
+        # logger.error(utils_requests.print_status_code(response.status_code))
+
+        return cbioportal_api
+
+    def get_instance(self):
+        return self.instance
+
+    def get_url(self):
+        return self.url
+
+    def get_cbioportal(self):
+        return self._cbioportal
+
+
+class Mutations(cBioPortal):
+    def __init__(
+        self,
+        study_id: str,
+    ) -> None:
+        super().__init__()
+        self.study_id = study_id
+        self._mutations = self.get_all_mutations_by_study()
+
+    def get_all_mutations_by_study(
+        self,
+    ) -> list | None:
+        """Get mutations cBioPortal data
+
+        Returns
+        -------
+        list | None
+            cBioPortal data of Abstract Base Classes objects if successful, otherwise None
+        """
+        # initialize so a missing study returns None instead of raising UnboundLocalError
+        muts = None
+        studies = self._cbioportal.Studies.getAllStudiesUsingGET().result()
+        study_ids = [study.studyId for study in studies]
+
+        if self.study_id in study_ids:
+            # TODO: add incremental error handling beyond missing study
+            muts = self._cbioportal.Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(
+                molecularProfileId=f"{self.study_id}_mutations",
+                sampleListId=f"{self.study_id}_all",
+                projection="DETAILED"
+            ).result()
+        else:
+            logger.error(f"Study {self.study_id} not found in cBioPortal instance {self.instance}")
+
+        return muts
+
+    def get_and_save_cbioportal_cohort_mutations(
+        self,
+    ) -> None:
+        df_muts = parse_iterabc2dataframe(self._mutations)
+        df_genes = parse_iterabc2dataframe(df_muts["gene"])
+        df_combo = pd.concat([df_muts, df_genes], axis=1)
+        df_combo = df_combo.drop(["gene"], axis=1)
+
+        filename = f"{self.study_id}_mutations.csv"
 
-    df_muts = parse_iterabc2dataframe(muts)
-    df_genes = parse_iterabc2dataframe(df_muts["gene"])
-    df_combo = pd.concat([df_muts, df_genes], axis=1)
-    df_combo = df_combo.drop(["gene"], axis=1)
+        io_utils.save_dataframe_to_csv(df_combo, filename)
 
-    filename = f"{study_id}_mutations.csv"
+    def get_study_id(self):
+        return self.study_id
 
-    io_utils.save_dataframe_to_csv(df_combo, filename)
+    def get_mutations(self):
+        return self._mutations
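Note on the `cBioPortal`/`Mutations` refactor above: the free functions are now methods, and the API is queried when `Mutations` is constructed. A minimal usage sketch, assuming the `config` setters used by the CLI scripts below are how the instance is pointed at the public `www.cbioportal.org` server (which needs no token):

```python
# Sketch only, not part of the diff; config setter names come from the CLI scripts below.
from missense_kinase_toolkit import config
from missense_kinase_toolkit.cbioportal import Mutations, parse_iterabc2dataframe

config.set_output_dir(".")                            # where the CSV will land
config.set_cbioportal_instance("www.cbioportal.org")  # public instance, no token

muts = Mutations("msk_impact_2017")                   # queries the API in __init__
df = parse_iterabc2dataframe(muts.get_mutations())    # ABC objects -> DataFrame
muts.get_and_save_cbioportal_cohort_mutations()       # writes msk_impact_2017_mutations.csv
```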
diff --git a/src/missense_kinase_toolkit/cli/extract_cbioportal.py b/src/missense_kinase_toolkit/cli/extract_cbioportal.py
index c5df5ea..24485ab 100755
--- a/src/missense_kinase_toolkit/cli/extract_cbioportal.py
+++ b/src/missense_kinase_toolkit/cli/extract_cbioportal.py
@@ -2,24 +2,31 @@
 
 import argparse
 
-from missense_kinase_toolkit import config, cbioportal
+from missense_kinase_toolkit import config, io_utils, cbioportal
 
 def parsearg_utils():
     parser = argparse.ArgumentParser(
-        description="Get mutations from cBioPortal cohort and instance"
+        description="Get mutations from cBioPortal instance for all specified studies."
     )
 
     parser.add_argument(
-        "--cohort",
+        "--outDir",
         type=str,
-        help="Optional: cBioPortal cohort IDs separated by commas (e.g., `msk_impact_2017` for Zehir, 2017 and `mskimpact` for MSKCC clinical sequencing cohort) (str)",
-        default="msk_impact_2017",
+        help="Required: Output directory path (str)",
     )
 
     parser.add_argument(
-        "--outDir",
+        "--requestsCache",
         type=str,
-        help="Required: Output directory path (str)",
+        default="requests_cache",
+        help="Optional: Requests cache; default: `requests_cache` (str)",
+    )
+
+    parser.add_argument(
+        "--cohort",
+        type=str,
+        help="Optional: cBioPortal cohort IDs separated by commas (e.g., `msk_impact_2017` for Zehir, 2017 and `mskimpact` for MSKCC clinical sequencing cohort) (str)",
+        default="msk_impact_2017",
     )
 
     parser.add_argument(
@@ -37,21 +44,15 @@
     )
 
     # TODO: add logging functionality
-    # TODO: cache requests for cBioPortal API
 
     return parser
 
 def main():
     args = parsearg_utils().parse_args()
 
-    str_studies = args.cohort
-    list_studies = str_studies.split(",")
-    list_studies = [study.strip() for study in list_studies]
+    list_studies = io_utils.convert_str2list(args.cohort)
 
-    # required argument
     config.set_output_dir(args.outDir)
-
-    # optional arguments
     config.set_cbioportal_instance(args.instance)
 
     try:
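The reordered flags above can be sanity-checked without touching the network, since `parse_args` accepts an explicit argv list (paths here are placeholders):

```python
# Hypothetical parse check for the reordered CLI flags.
from missense_kinase_toolkit.cli.extract_cbioportal import parsearg_utils
from missense_kinase_toolkit import io_utils

args = parsearg_utils().parse_args(
    ["--outDir", "/tmp/out", "--cohort", "msk_impact_2017, mskimpact"]
)
assert args.requestsCache == "requests_cache"  # default of the new flag
assert io_utils.convert_str2list(args.cohort) == ["msk_impact_2017", "mskimpact"]
```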
diff --git a/src/missense_kinase_toolkit/cli/extract_kinase_annotations.py b/src/missense_kinase_toolkit/cli/extract_kinase_annotations.py
new file mode 100755
index 0000000..8434dce
--- /dev/null
+++ b/src/missense_kinase_toolkit/cli/extract_kinase_annotations.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python
+
+import argparse
+
+import pandas as pd
+
+from missense_kinase_toolkit import config, io_utils, scrapers, klifs
+
+
+def parsearg_utils():
+    parser = argparse.ArgumentParser(
+        description="Get kinase annotations from KinHub and KLIFS databases."
+    )
+
+    parser.add_argument(
+        "--outDir",
+        type=str,
+        help="Required: Output directory path (str)",
+    )
+
+    parser.add_argument(
+        "--requestsCache",
+        type=str,
+        default="requests_cache",
+        help="Optional: Requests cache; default: `requests_cache` (str)",
+    )
+
+    parser.add_argument(
+        "--csvKinhub",
+        type=str,
+        default="kinhub.csv",
+        help="Optional: CSV file in outDir that contains KinHub kinase info; default: `kinhub.csv` (str)",
+    )
+
+    parser.add_argument(
+        "--csvKLIFS",
+        type=str,
+        default="klifs.csv",
+        help="Optional: CSV file in outDir that contains KLIFS kinase info; default: `klifs.csv` (str)",
+    )
+
+    # TODO: add logging functionality
+    return parser
+
+
+def main():
+    args = parsearg_utils().parse_args()
+
+    config.set_output_dir(args.outDir)
+    config.set_request_cache(args.requestsCache)
+
+    # get KinHub list of kinases
+    df_kinhub = scrapers.kinhub()
+    io_utils.save_dataframe_to_csv(df_kinhub, args.csvKinhub)
+
+    # get KLIFS annotations
+    list_kinase_hgnc = df_kinhub["HGNC Name"].to_list()
+    dict_kinase_info = {}
+    for kinase in list_kinase_hgnc:
+        dict_kinase_info[kinase] = klifs.HumanKinaseInfo(kinase)._kinase_info
+    df_klifs = pd.DataFrame(dict_kinase_info).T
+    df_klifs = df_klifs.rename_axis("HGNC Name").reset_index()
+    io_utils.save_dataframe_to_csv(df_klifs, args.csvKLIFS)
diff --git a/src/missense_kinase_toolkit/cli/transform_cbioportal.py b/src/missense_kinase_toolkit/cli/transform_cbioportal.py
index d8e51be..de524a1 100755
--- a/src/missense_kinase_toolkit/cli/transform_cbioportal.py
+++ b/src/missense_kinase_toolkit/cli/transform_cbioportal.py
@@ -2,18 +2,18 @@
 
 import argparse
 
-from missense_kinase_toolkit import config, scrapers, io_utils
+from missense_kinase_toolkit import config, io_utils
 
 def parsearg_utils():
     parser = argparse.ArgumentParser(
-        description="Concatenate, remove duplicates, and extract genes and mutation types of interest"
+        description="Concatenate, remove duplicates, and extract genes and mutation types of interest from cBioPortal data."
     )
 
     parser.add_argument(
-        "--mutations",
+        "--mutationTypes",
         type=str,
-        help="Optional: Mutation type(s) to extract, separated by commas (e.g., `Missense_Mutation`) (str)",
+        help="Optional: Mutation type(s) to extract, separated by commas; default: `Missense_Mutation` (str)",
         default="Missense_Mutation",
     )
 
@@ -25,9 +25,23 @@ def parsearg_utils():
     parser.add_argument(
         "--requestsCache",
-        type=bool,
-        default=False,
-        help="Optional: Requests cache; default False (bool)",
+        type=str,
+        default="requests_cache",
+        help="Optional: Requests cache; default: `requests_cache` (str)",
+    )
+
+    parser.add_argument(
+        "--listCols",
+        type=str,
+        default="HGNC Name, UniprotID",
+        help="Optional: Comma-separated list of columns from csvRef to merge into the mutation data; the first entry must be the column of HGNC gene names used as the merge key; default: `HGNC Name, UniprotID` (str)",
+    )
+
+    parser.add_argument(
+        "--csvRef",
+        type=str,
+        default="kinhub.csv",
+        help="Optional: CSV file in outDir that contains the kinase reference info; default: `kinhub.csv` (str)",
     )
 
     # TODO: add logging functionality
@@ -37,31 +51,29 @@
 
 def main():
     args = parsearg_utils().parse_args()
 
-    str_mutations = args.mutations
-    list_mutations = str_mutations.split(",")
-    list_mutations = [mutation.strip() for mutation in list_mutations]
-
-    # required argument
     config.set_output_dir(args.outDir)
-
-    # optional argument
     config.set_request_cache(args.requestsCache)
 
+    list_mutations = io_utils.convert_str2list(args.mutationTypes)
+    list_cols = io_utils.convert_str2list(args.listCols)
+
     df_cbioportal = io_utils.concatenate_csv_files_with_glob("*_mutations.csv")
 
-    df_kinhub = scrapers.kinhub()
-    io_utils.save_dataframe_to_csv(df_kinhub, "kinhub.csv")
+    df_kinhub = io_utils.load_csv_to_dataframe(args.csvRef)
 
-    list_kinase_hgnc = df_kinhub["HGNC Name"].to_list()
+    list_kinase_hgnc = df_kinhub[list_cols[0]].to_list()
 
     df_subset = df_cbioportal.loc[df_cbioportal["mutationType"].isin(list_mutations), ].reset_index(drop=True)
     df_subset = df_subset.loc[df_subset["hugoGeneSymbol"].isin(list_kinase_hgnc), ].reset_index(drop=True)
 
-    list_cols = ["HGNC Name", "UniprotID"]
     df_subset_merge = df_subset.merge(df_kinhub[list_cols],
                                       how="left",
                                       left_on="hugoGeneSymbol",
-                                      right_on="HGNC Name")
-    df_subset_merge = df_subset_merge.drop(["HGNC Name"], axis=1)
+                                      right_on=list_cols[0])
+    df_subset_merge = df_subset_merge.drop([list_cols[0]], axis=1)
 
     io_utils.save_dataframe_to_csv(df_subset_merge, "transformed_mutations.csv")
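The merge above now keys on the first entry of `--listCols` rather than a hard-coded `"HGNC Name"`. The same pattern on toy data (hypothetical rows, not from the toolkit):

```python
import pandas as pd

# stand-ins for the concatenated cBioPortal mutations and the KinHub reference
df_subset = pd.DataFrame({"hugoGeneSymbol": ["EGFR", "BRAF"],
                          "proteinChange": ["L858R", "V600E"]})
df_kinhub = pd.DataFrame({"HGNC Name": ["EGFR", "BRAF"],
                          "UniprotID": ["P00533", "P15056"]})

list_cols = ["HGNC Name", "UniprotID"]  # first entry is the merge key
df_merge = df_subset.merge(df_kinhub[list_cols],
                           how="left",
                           left_on="hugoGeneSymbol",
                           right_on=list_cols[0])
df_merge = df_merge.drop([list_cols[0]], axis=1)  # key column is now redundant
# remaining columns: hugoGeneSymbol, proteinChange, UniprotID
```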
diff --git a/src/missense_kinase_toolkit/io_utils.py b/src/missense_kinase_toolkit/io_utils.py
index c005461..a93b9f0 100644
--- a/src/missense_kinase_toolkit/io_utils.py
+++ b/src/missense_kinase_toolkit/io_utils.py
@@ -23,6 +23,50 @@ def check_outdir_exists(
     return path_data
 
+def convert_str2list(
+    input_str: str
+) -> list[str]:
+    """Convert a comma-separated string to a list
+
+    Parameters
+    ----------
+    input_str : str
+        String to convert to list
+
+    Returns
+    -------
+    list[str]
+        List of strings
+    """
+    list_str = input_str.split(",")
+    list_str = [str_in.strip() for str_in in list_str]
+    return list_str
+
+
+def load_csv_to_dataframe(
+    filename: str,
+) -> pd.DataFrame | None:
+    """Load a CSV file as a dataframe
+
+    Parameters
+    ----------
+    filename : str
+        Filename to load (either with or without "csv" suffix)
+
+    Returns
+    -------
+    pd.DataFrame | None
+        Dataframe loaded from the CSV file, or None if the file was not found
+    """
+    filename = filename.replace(".csv", "") + ".csv"
+    path_data = check_outdir_exists()
+    df = None
+    try:
+        df = pd.read_csv(os.path.join(path_data, filename))
+    except FileNotFoundError:
+        print(f"File {filename} not found in {path_data}...")
+    return df
+
+
 def save_dataframe_to_csv(
     df: pd.DataFrame,
     filename: str,
@@ -36,7 +80,6 @@ def save_dataframe_to_csv(
     filename : str
         Filename to save (either with or without "csv" suffix)
-
     Returns
     -------
     None
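A quick round trip through the new `io_utils` helpers, assuming the output directory is resolved from the `OUTPUT_DIR` environment variable the same way the tests at the bottom of this diff do:

```python
import os
import pandas as pd
from missense_kinase_toolkit import io_utils

os.environ["OUTPUT_DIR"] = "."  # same convention as test_io_utils below

assert io_utils.convert_str2list("a, b , c") == ["a", "b", "c"]

df = pd.DataFrame({"A": [1, 2]})
io_utils.save_dataframe_to_csv(df, "demo")            # ".csv" suffix optional on save...
df_back = io_utils.load_csv_to_dataframe("demo.csv")  # ...and normalized again on load
assert df.equals(df_back)
os.remove("demo.csv")
```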
diff --git a/src/missense_kinase_toolkit/klifs.py b/src/missense_kinase_toolkit/klifs.py
new file mode 100644
index 0000000..468c642
--- /dev/null
+++ b/src/missense_kinase_toolkit/klifs.py
@@ -0,0 +1,113 @@
+# from bravado.requests_client import RequestsClient
+from bravado.client import SwaggerClient
+
+
+class KLIFS():
+    def __init__(self):
+        self.url = "https://dev.klifs.net/swagger_v2/swagger.json"
+        self._klifs = self.get_klifs_api()
+
+    def get_klifs_api(self):
+        klifs_api = SwaggerClient.from_url(
+            self.url,
+            config={
+                "validate_requests": False,
+                "validate_responses": False,
+                "validate_swagger_spec": False
+            }
+        )
+        return klifs_api
+
+    def get_url(self):
+        return self.url
+
+    def get_klifs(self):
+        return self._klifs
+
+
+class HumanKinaseInfo(KLIFS):
+    species: str = "Human"
+
+    def __init__(
+        self,
+        kinase_name: str,
+    ) -> None:
+        super().__init__()
+        self.kinase_name = kinase_name
+        self._kinase_info = self.get_kinase_info()
+
+    def get_kinase_info(
+        self,
+    ) -> dict[str, str | int | None]:
+        try:
+            kinase_info = (
+                self._klifs.Information.get_kinase_ID(
+                    kinase_name=[self.kinase_name],
+                    species=self.species)
+                .response()
+                .result[0]
+            )
+
+            list_key = dir(kinase_info)
+            list_val = [getattr(kinase_info, key) for key in list_key]
+
+            dict_kinase_info = dict(zip(list_key, list_val))
+
+        except Exception as e:
+            # fall back to a dict of None values if the kinase is not found
+            print(e)
+            list_key = [
+                'family',
+                'full_name',
+                'gene_name',
+                'group',
+                'iuphar',
+                'kinase_ID',
+                'name',
+                'pocket',
+                'species',
+                'subfamily',
+                'uniprot'
+            ]
+            dict_kinase_info = dict(zip(list_key, [None]*len(list_key)))
+
+        return dict_kinase_info
+
+    def get_kinase_name(self):
+        return self.kinase_name
+
+    def get_species(self):
+        return self.species
+
+
+# def load_af2active(url, path_save):
+#     import os
+#     import wget
+#     import tarfile
+
+#     if not os.path.exists(path_save):
+#         os.makedirs(path_save)
+#     else:
+#         if os.path.exists(os.path.join(path_save, "Kincore_AF2_HumanCatalyticKinases")):
+#             print("File already exists...")
+#             return
+
+#     wget.download(url, path_save)
+
+# def get_tdc_dti(source_name="DAVIS"):
+#     from tdc.multi_pred import DTI
+
+#     data = DTI(name=source_name)
+#     data_davis = DTI(name='DAVIS')
+#     data_davis.get_data()
+#     data_davis.entity1_idx.unique().tolist()
+
+#     data_kiba = DTI(name='KIBA')
+#     data_kiba.get_data()
+
+#     print(data.label_distribution())
+#     data.print_stats()
+#     data.entity2_name
+#     len(data.entity1_idx.unique())
+#     data.entity2_idx.unique()
+#     split = data.get_split()
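The KLIFS wrapper can be exercised directly against the live `dev.klifs.net` API; the expected EGFR values here mirror `test_klifs_HumanKinaseInfo` at the bottom of this diff:

```python
from missense_kinase_toolkit import klifs

info = klifs.HumanKinaseInfo("EGFR")._kinase_info  # dict of KLIFS fields
print(info["group"], info["uniprot"])              # "TK", "P00533" per the tests
# Unknown names fall back to a dict of None values instead of raising:
assert klifs.HumanKinaseInfo("NOT_A_KINASE")._kinase_info["group"] is None
```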
diff --git a/src/missense_kinase_toolkit/scrapers.py b/src/missense_kinase_toolkit/scrapers.py
index 21ca8c0..4bbfea2 100644
--- a/src/missense_kinase_toolkit/scrapers.py
+++ b/src/missense_kinase_toolkit/scrapers.py
@@ -18,7 +18,6 @@ def kinhub(
     pd.DataFrame
         DataFrame of kinase information
     """
-    import requests
     from bs4 import BeautifulSoup
     import numpy as np  # TODO: to fix ImportError
 
@@ -52,10 +51,9 @@ def kinhub(
     df_kinhub = pd.DataFrame.from_dict(dict_kinhub)
     # df_kinhub = clean_names(df_kinhub)
 
-    # for kinases with 2 kinase domains, entries are duplicated despite same UniProt ID
-    # drop these
-    df_kinhub_drop = df_kinhub.loc[~df_kinhub["Manning Name"].apply(lambda x: "Domain2_" in str(x)), ]
-    # list_uniprot = df_kinhub["UniprotID"][df_kinhub["Manning Name"].apply(lambda x: "Domain2_" in str(x))].to_list()
-    # assert df_kinhub.shape[0] - df_kinhub_drop.shape[0] == df_kinhub_drop["UniprotID"].isin(list_uniprot).sum()
+    # kinases with two kinase domains appear twice; aggregate to one row per HGNC name
+    list_cols = df_kinhub.columns.to_list()
+    list_cols.remove("HGNC Name")
+    df_kinhub_agg = df_kinhub.groupby(["HGNC Name"], as_index=False, sort=False).agg(set)
+    df_kinhub_agg[list_cols] = df_kinhub_agg[list_cols].map(lambda x: ', '.join(str(s) for s in x))
 
-    return df_kinhub_drop
+    return df_kinhub_agg
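The new KinHub aggregation keeps multi-domain kinases as a single row per HGNC name instead of dropping the `Domain2_` entries. The same pattern on toy data (hypothetical rows):

```python
import pandas as pd

df = pd.DataFrame({
    "HGNC Name":    ["JAK1", "JAK1"],          # one gene, two kinase domains
    "Manning Name": ["JAK1", "Domain2_JAK1"],
    "UniprotID":    ["P23458", "P23458"],
})
list_cols = [c for c in df.columns if c != "HGNC Name"]
df_agg = df.groupby(["HGNC Name"], as_index=False, sort=False).agg(set)
# DataFrame.map requires pandas >= 2.1 (it replaced applymap)
df_agg[list_cols] = df_agg[list_cols].map(lambda x: ", ".join(str(s) for s in x))
# one row remains: UniprotID collapses to "P23458"; Manning Name keeps both
# domain entries (set order is not guaranteed)
```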
diff --git a/src/nextflow/modules/extract_cbioportal.nf b/src/nextflow/modules/extract_cbioportal.nf
new file mode 100644
index 0000000..b9462ca
--- /dev/null
+++ b/src/nextflow/modules/extract_cbioportal.nf
@@ -0,0 +1,16 @@
+process PROCESS_CBIOPORTAL {
+    input:
+    tuple val(cbio_cohort), path(out_dir), val(cbio_inst), val(cbio_token), path(request_cache)
+
+    output:
+    path("${out_dir}/cbioportal")
+
+    script:
+    """
+    export PYTHONHASHSEED=0
+    process_cbioportal \
+        --cohort ${cbio_cohort} \
+        --outDir ${out_dir} \
+        --instance ${cbio_inst} \
+        --token ${cbio_token} \
+        --requestsCache ${request_cache}
+    """
+}
diff --git a/src/nextflow/nextflow.config b/src/nextflow/nextflow.config
new file mode 100644
index 0000000..78f477f
--- /dev/null
+++ b/src/nextflow/nextflow.config
@@ -0,0 +1,27 @@
+profiles {
+    conda {
+        conda.enabled = true
+    }
+    mamba {
+        conda.enabled = true
+        conda.useMamba = true
+    }
+    micromamba {
+        conda.enabled = true
+        conda.useMicromamba = true
+    }
+    juno {
+        // singularity {
+        //     enabled = true
+        //     autoMounts = true
+        // }
+        process {
+            executor = 'lsf'
+        }
+        executor {
+            name = 'lsf'
+            perJobMemLimit = true
+            queueSize = 25
+        }
+    }
+}
diff --git a/src/nextflow/process_cbioportal.nf b/src/nextflow/process_cbioportal.nf
new file mode 100644
index 0000000..5625530
--- /dev/null
+++ b/src/nextflow/process_cbioportal.nf
@@ -0,0 +1,24 @@
+process PROCESS_CBIOPORTAL {
+    // tag "$meta.id"
+    // label 'process_medium'
+    // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+    //     'docker://jeffquinnmsk/pan_preclinical_etl:latest' :
+    //     'docker.io/jeffquinnmsk/pan_preclinical_etl:latest' }"
+
+    input:
+    tuple val(meta), path(raw_data), path(studies), path(source_files), path(request_cache), val(study_name)
+
+    output:
+    tuple val(meta), path("${prefix}/per_study_results/${study_name}"), emit: etl_results
+
+    script:
+    // prefix was referenced but never defined; default it to meta.id per the usual nf-core pattern
+    prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    export PYTHONHASHSEED=0
+    mkdir -p "${prefix}/per_study_results/${study_name}"
+    process_cbioportal \
+        --data-dir ${raw_data} \
+        --output-dir ${prefix}/per_study_results/${study_name} \
+        --studies ${studies} \
+        --source-files ${source_files} \
+        --study-id ${study_name}
+    """
+}
diff --git a/tests/test_missense_kinase_toolkit.py b/tests/test_missense_kinase_toolkit.py
index c2e7887..13ff2c4 100644
--- a/tests/test_missense_kinase_toolkit.py
+++ b/tests/test_missense_kinase_toolkit.py
@@ -13,3 +13,53 @@ def test_missense_kinase_toolkit_imported():
     """Sample test, will always pass so long as import statement worked."""
     assert "missense_kinase_toolkit" in sys.modules
+
+
+def test_kinhub_scraper():
+    from missense_kinase_toolkit import scrapers
+
+    df_kinhub = scrapers.kinhub()
+
+    assert df_kinhub.shape[0] == 517
+    assert df_kinhub.shape[1] == 8
+    assert "HGNC Name" in df_kinhub.columns
+    assert "UniprotID" in df_kinhub.columns
+
+
+def test_klifs_HumanKinaseInfo():
+    from missense_kinase_toolkit import klifs
+
+    dict_egfr = klifs.HumanKinaseInfo("EGFR")._kinase_info
+
+    assert dict_egfr["family"] == "EGFR"
+    assert dict_egfr["full_name"] == "epidermal growth factor receptor"
+    assert dict_egfr["gene_name"] == "EGFR"
+    assert dict_egfr["group"] == "TK"
+    assert dict_egfr["iuphar"] == 1797
+    assert dict_egfr["kinase_ID"] == 406
+    assert dict_egfr["name"] == "EGFR"
+    assert dict_egfr["pocket"] == "KVLGSGAFGTVYKVAIKELEILDEAYVMASVDPHVCRLLGIQLITQLMPFGCLLDYVREYLEDRRLVHRDLAARNVLVITDFGLA"
+    assert dict_egfr["species"] == "Human"
+    assert dict_egfr["uniprot"] == "P00533"
+
+
+def test_io_utils():
+    from missense_kinase_toolkit import io_utils
+    import pandas as pd
+    import os
+
+    os.environ["OUTPUT_DIR"] = "."
+    df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
+    io_utils.save_dataframe_to_csv(df, "test1.csv")
+    df_read = io_utils.load_csv_to_dataframe("test1.csv")
+    assert df.equals(df_read)
+
+    io_utils.save_dataframe_to_csv(df, "test2.csv")
+    df_concat = io_utils.concatenate_csv_files_with_glob("*test*.csv")
+    assert df_concat.equals(pd.concat([df, df]))
+
+    os.remove("test1.csv")
+    os.remove("test2.csv")
+
+    assert io_utils.convert_str2list("a,b,c") == ["a", "b", "c"]
+    assert io_utils.convert_str2list("a, b, c") == ["a", "b", "c"]