diff --git a/.codecov.yml b/.codecov.yml index a3ed7f4..03d2268 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -11,4 +11,4 @@ comment: branches: null behavior: default flags: null - paths: null \ No newline at end of file + paths: null diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 785c57e..8b6effa 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,7 +1,7 @@ # How to contribute We welcome contributions from external contributors, and this document -describes how to merge code changes into this missense_kinase_toolkit. +describes how to merge code changes into this missense_kinase_toolkit. ## Getting Started diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index c772b96..26da5a2 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -9,4 +9,4 @@ Notable points that this PR has either accomplished or will accomplish. - [ ] Question1 ## Status -- [ ] Ready to go \ No newline at end of file +- [ ] Ready to go diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a9b6e42..8d4b276 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -32,4 +32,4 @@ repos: hooks: - id: pyupgrade args: - - --py39-plus \ No newline at end of file + - --py39-plus diff --git a/README.md b/README.md index eff677f..dfc92d7 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,6 @@ Copyright (c) 2024, Jess White #### Acknowledgements - -Project based on the + +Project based on the [Computational Molecular Science Python Cookiecutter](https://github.com/molssi/cookiecutter-cms) version 1.1. diff --git a/data/README.md b/data/README.md index 5301dae..32c125a 100644 --- a/data/README.md +++ b/data/README.md @@ -1,7 +1,7 @@ # Sample Package Data This directory contains sample additional data you may want to include with your package. -This is a place where non-code related additional information (such as data files, molecular structures, etc.) 
can +This is a place where non-code related additional information (such as data files, molecular structures, etc.) can go that you want to ship alongside your code. Please note that it is not recommended to place large files in your git directory. If your project requires files larger diff --git a/devtools/README.md b/devtools/README.md index 06cb626..ac99815 100644 --- a/devtools/README.md +++ b/devtools/README.md @@ -1,6 +1,6 @@ # Development, testing, and deployment tools -This directory contains a collection of tools for running Continuous Integration (CI) tests, +This directory contains a collection of tools for running Continuous Integration (CI) tests, conda installation, and other development tools not directly related to the coding process. @@ -8,7 +8,7 @@ conda installation, and other development tools not directly related to the codi ### Continuous Integration -You should test your code, but do not feel compelled to use these specific programs. You also may not need Unix and +You should test your code, but do not feel compelled to use these specific programs. You also may not need Unix and Windows testing if you only plan to deploy on specific platforms. These are just to help you get started. ### Conda Environment: @@ -17,7 +17,7 @@ This directory contains the files to setup the Conda environment for testing pur * `conda-envs`: directory containing the YAML file(s) which fully describe Conda Environments, their dependencies, and those dependency provenance's * `test_env.yaml`: Simple test environment file with base dependencies. 
Channels are not specified here and therefore respect global Conda configuration - + ### Additional Scripts: This directory contains OS agnostic helper scripts which don't fall in any of the previous categories @@ -40,17 +40,17 @@ This directory contains OS agnostic helper scripts which don't fall in any of th - [ ] Make sure there is an/are issue(s) opened for your specific update - [ ] Create the PR, referencing the issue - [ ] Debug the PR as needed until tests pass -- [ ] Tag the final, debugged version +- [ ] Tag the final, debugged version * `git tag -a X.Y.Z [latest pushed commit] && git push --follow-tags` - [ ] Get the PR merged in ## Versioneer Auto-version -[Versioneer](https://github.com/warner/python-versioneer) will automatically infer what version -is installed by looking at the `git` tags and how many commits ahead this version is. The format follows +[Versioneer](https://github.com/warner/python-versioneer) will automatically infer what version +is installed by looking at the `git` tags and how many commits ahead this version is. The format follows [PEP 440](https://www.python.org/dev/peps/pep-0440/) and has the regular expression of: ```regexp \d+.\d+.\d+(?\+\d+-[a-z0-9]+) ``` -If the version of this commit is the same as a `git` tag, the installed version is the same as the tag, -e.g. `missense_kinase_toolkit-0.1.2`, otherwise it will be appended with `+X` where `X` is the number of commits +If the version of this commit is the same as a `git` tag, the installed version is the same as the tag, +e.g. `missense_kinase_toolkit-0.1.2`, otherwise it will be appended with `+X` where `X` is the number of commits ahead from the last tag, and then `-YYYYYY` where the `Y`'s are replaced with the `git` commit hash. 
diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml index c79a37b..7efc6de 100644 --- a/devtools/conda-envs/test_env.yaml +++ b/devtools/conda-envs/test_env.yaml @@ -17,4 +17,3 @@ dependencies: # Pip-only installs #- pip: # - codecov - diff --git a/devtools/scripts/create_conda_env.py b/devtools/scripts/create_conda_env.py index 9ece84a..6a87c66 100644 --- a/devtools/scripts/create_conda_env.py +++ b/devtools/scripts/create_conda_env.py @@ -28,7 +28,7 @@ except (KeyError, ImportError, IndexError): raise ImportError("No YAML parser could be found in this or the conda environment. " "Could not find PyYAML or Ruamel YAML in the current environment, " - "AND could not find Ruamel YAML in the base conda environment through CONDA_EXE path. " + "AND could not find Ruamel YAML in the base conda environment through CONDA_EXE path. " "Environment not created!") loader = yaml.YAML(typ="safe").load # typ="safe" avoids odd typing on output @@ -57,10 +57,10 @@ def temp_cd(): args = parser.parse_args() # Open the base file -with open(args.conda_file, "r") as handle: +with open(args.conda_file) as handle: yaml_script = loader(handle.read()) -python_replacement_string = "python {}*".format(args.python) +python_replacement_string = f"python {args.python}*" try: for dep_index, dep_value in enumerate(yaml_script['dependencies']): @@ -82,14 +82,14 @@ def temp_cd(): if conda_path is None: raise RuntimeError("Could not find a conda binary in CONDA_EXE variable or in executable search path") -print("CONDA ENV NAME {}".format(args.name)) -print("PYTHON VERSION {}".format(args.python)) -print("CONDA FILE NAME {}".format(args.conda_file)) -print("CONDA PATH {}".format(conda_path)) +print(f"CONDA ENV NAME {args.name}") +print(f"PYTHON VERSION {args.python}") +print(f"CONDA FILE NAME {args.conda_file}") +print(f"CONDA PATH {conda_path}") # Write to a temp directory which will always be cleaned up with temp_cd(): temp_file_name = "temp_script.yaml" with 
open(temp_file_name, 'w') as f: f.write(yaml.dump(yaml_script)) - sp.call("{} env create -n {} -f {}".format(conda_path, args.name, temp_file_name), shell=True) + sp.call(f"{conda_path} env create -n {args.name} -f {temp_file_name}", shell=True) diff --git a/docs/Makefile b/docs/Makefile index 87cc753..e351c19 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -17,4 +17,4 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md index 36c48c7..e11cd33 100644 --- a/docs/README.md +++ b/docs/README.md @@ -5,7 +5,7 @@ To compile the docs, first ensure that Sphinx and the ReadTheDocs theme are inst ```bash -conda install sphinx sphinx_rtd_theme +conda install sphinx sphinx_rtd_theme ``` @@ -14,11 +14,10 @@ Once installed, you can use the `Makefile` in this directory to compile static H make html ``` -The compiled docs will be in the `_build` directory and can be viewed by opening `index.html` (which may itself +The compiled docs will be in the `_build` directory and can be viewed by opening `index.html` (which may itself be inside a directory called `html/` depending on what version of Sphinx is installed). A configuration file for [Read The Docs](https://readthedocs.org/) (readthedocs.yaml) is included in the top level of the repository. To use Read the Docs to host your documentation, go to https://readthedocs.org/ and connect this repository. You may need to change your default branch to `main` under Advanced Settings for the project. If you would like to use Read The Docs with `autodoc` (included automatically) and your package has dependencies, you will need to include those dependencies in your documentation yaml file (`docs/requirements.yaml`). 
- diff --git a/docs/_static/README.md b/docs/_static/README.md index 2f0cf84..122b610 100644 --- a/docs/_static/README.md +++ b/docs/_static/README.md @@ -1,11 +1,11 @@ # Static Doc Directory Add any paths that contain custom static files (such as style sheets) here, -relative to the `conf.py` file's directory. +relative to the `conf.py` file's directory. They are copied after the builtin static files, so a file named "default.css" will overwrite the builtin "default.css". -The path to this folder is set in the Sphinx `conf.py` file in the line: +The path to this folder is set in the Sphinx `conf.py` file in the line: ```python templates_path = ['_static'] ``` diff --git a/docs/_templates/README.md b/docs/_templates/README.md index 3f4f804..485f82a 100644 --- a/docs/_templates/README.md +++ b/docs/_templates/README.md @@ -1,11 +1,11 @@ # Templates Doc Directory -Add any paths that contain templates here, relative to +Add any paths that contain templates here, relative to the `conf.py` file's directory. They are copied after the builtin template files, so a file named "page.html" will overwrite the builtin "page.html". -The path to this folder is set in the Sphinx `conf.py` file in the line: +The path to this folder is set in the Sphinx `conf.py` file in the line: ```python html_static_path = ['_templates'] ``` diff --git a/docs/conf.py b/docs/conf.py index 81fe65c..aecaf27 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Configuration file for the Sphinx documentation builder. # diff --git a/docs/getting_started.rst b/docs/getting_started.rst index c92797a..078de06 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -1,4 +1,4 @@ Getting Started =============== -This page details how to get started with missense-kinase-toolkit. +This page details how to get started with missense-kinase-toolkit. 
def create_setlist(
    input_object: requests.models.Response,
    attr: str,
) -> tuple[list, set]:
    """Create a list and set of unique values from a response object

    Parameters
    ----------
    input_object : requests.models.Response
        Iterable response object from a request; each entry must support
        ``entry[attr]`` item access
    attr : str
        Attribute to extract from the response object

    Returns
    -------
    tuple[list, set]
        List of the attribute values in order, and the set of unique values
    """
    list_output = [entry[attr] for entry in input_object]
    # build the set from the already-extracted list so each entry is
    # only indexed once
    set_output = set(list_output)

    return list_output, set_output


def print_counts(
    list_input: list,
) -> None:
    """Print the counts of unique values in a list

    Parameters
    ----------
    list_input : list
        List of (hashable) values to count

    Returns
    -------
    None
    """
    # Counter is a single O(n) pass; calling list.count per unique value
    # is O(n^2) on large mutation lists
    for value, n in Counter(list_input).items():
        print(f"{value:<15} \t {n:>10}")


def parse_obj2dict(
    input_object: requests.models.Response,
) -> dict:
    """Parse a response object into a column-oriented dictionary

    Parameters
    ----------
    input_object : requests.models.Response
        Iterable response object from a request; attribute names are
        discovered via ``dir()`` on the first entry

    Returns
    -------
    dict
        Mapping of attribute name to the list of values across all entries;
        values are cast to int where possible, otherwise stringified
    """
    dict_output = {}

    list_dir = dir(input_object[0])

    for attr in list_dir:
        list_attr = []
        for entry in input_object:
            try:
                add = int(entry[attr])
            # TypeError covers values like None, which int() rejects with
            # TypeError rather than ValueError
            except (ValueError, TypeError):
                add = str(entry[attr])
            list_attr.append(add)
        dict_output[attr] = list_attr

    return dict_output


def parse_series2dict(
    series: pd.Series,
    strwrap: None | str = None,
    delim1: None | str = None,
    delim2: None | str = None,
) -> dict:
    """Parse a series of wrapped key=value strings into a dictionary

    Parameters
    ----------
    series : pd.Series
        Series to parse; each element is expected to look like
        ``Gene(key1=val1, key2=val2, ...)`` under the default patterns
    strwrap : None | str
        Regular expression whose first group captures the payload;
        defaults to ``Gene\\((.*)\\)``
    delim1 : None | str
        Delimiter between key=value pairs; defaults to ``", "``
    delim2 : None | str
        Delimiter between key and value; defaults to ``"="``

    Returns
    -------
    dict
        Mapping of key to the list of its values across the series
    """
    if strwrap is None:
        strwrap = r"Gene\((.*)\)"
    if delim1 is None:
        delim1 = ", "
    if delim2 is None:
        delim2 = "="

    list_temp = series.apply(
        lambda x: re.search(strwrap, str(x)).group(1).split(delim1)
    )
    # NOTE(review): keys are taken from the row labeled 0 — assumes a
    # default RangeIndex and that every row shares the same key order;
    # confirm against callers
    list_keys = [pair.split(delim2)[0] for pair in list_temp[0]]
    dict_out = {key: [] for key in list_keys}

    for row in list_temp:
        for key, value in zip(list_keys, (col.split(delim2)[1] for col in row)):
            dict_out[key].append(value)

    return dict_out


def calc_vaf(
    dataframe: pd.DataFrame,
    alt: None | str = None,
    ref: None | str = None,
) -> pd.Series:
    """Calculate variant allele frequency as alt / (alt + ref)

    Parameters
    ----------
    dataframe : pd.DataFrame
        DataFrame containing the alt and ref count columns
    alt : None | str
        Column name of the alternate-allele count; defaults to
        "tumorAltCount"
    ref : None | str
        Column name of the reference-allele count; defaults to
        "tumorRefCount"

    Returns
    -------
    pd.Series
        Per-row variant allele frequency (NaN where both counts are 0)
    """
    if alt is None:
        alt = "tumorAltCount"
    if ref is None:
        ref = "tumorRefCount"

    return dataframe[alt] / (dataframe[alt] + dataframe[ref])
missense_kinase_toolkit import requests_wrapper, utils_requests def maybe_get_symbol_from_hgnc_search( @@ -36,7 +36,7 @@ def maybe_get_symbol_from_hgnc_search( list_hgnc_gene_name = extract_list_from_hgnc_response_docs(res, "symbol") else: list_hgnc_gene_name = None - print_status_code_if_res_not_ok(res) + utils_requests.print_status_code_if_res_not_ok(res) return list_hgnc_gene_name @@ -78,7 +78,7 @@ def maybe_get_info_from_hgnc_fetch( list_out.append(list_entry) else: list_out = [None for _ in list_to_extract] - print_status_code_if_res_not_ok(res) + utils_requests.print_status_code_if_res_not_ok(res) dict_out = dict(zip(list_to_extract, list_out)) @@ -130,37 +130,3 @@ def generate_key_set_hgnc_response_docs( list_keys = [set(doc.keys()) for doc in res_input.json()["response"]["docs"]] set_keys = set.union(*list_keys) return set_keys - - -def print_status_code_if_res_not_ok( - res_input: requests.models.Response, - dict_status_code: dict[int, str] | None = None, -) -> None: - """Print the status code and status message if the response is not OK - - Parameters - ---------- - res_input : requests.models.Response - Response object from an HGNC REST API request - dict_status_code : dict[int, str] | None - Dictionary of status codes and status messages; if None, defaults to a standard set of status codes - - Returns - ------- - None - """ - if dict_status_code is None: - dict_status_code = { - 400: "Bad request", - 404: "Not found", - 415: "Unsupported media type", - 500: "Server error", - 503: "Service unavailable", - } - - try: - print( - f"Error code: {res_input.status_code} ({dict_status_code[res_input.status_code]})" - ) - except KeyError: - print(f"Error code: {res_input.status_code}") \ No newline at end of file diff --git a/src/missense_kinase_toolkit/pfam.py b/src/missense_kinase_toolkit/pfam.py new file mode 100644 index 0000000..fd2213f --- /dev/null +++ b/src/missense_kinase_toolkit/pfam.py @@ -0,0 +1,160 @@ +from __future__ import annotations + +import 
def retrieve_pfam(uniprot_id: str) -> pd.DataFrame | str | None:
    """Retrieve Pfam domain information for a given UniProt ID using InterPro REST API

    Parameters
    ----------
    uniprot_id : str
        UniProt ID

    Returns
    -------
    pd.DataFrame | str | None
        DataFrame with Pfam domain information if request is successful,
        UniProt ID if request fails; None if response cannot be parsed
    """
    url = "https://www.ebi.ac.uk/interpro/api/entry/pfam/protein/UniProt/" + uniprot_id

    res = requests_wrapper.get_cached_session().get(
        url, headers={"Accept": "application/json"}
    )

    # a failed request hands back the UniProt ID so the caller can log it
    if not res.ok:
        return uniprot_id

    dict_json = json.loads(res.text)["results"]
    try:
        df_meta = pd.DataFrame()
        df_prot = pd.DataFrame()

        for entry in dict_json:
            df_meta = pd.concat(
                [df_meta,
                 pd.DataFrame.from_dict(entry["metadata"], orient="index").transpose()]
            ).reset_index(drop=True)
            df_prot = pd.concat(
                [df_prot,
                 pd.DataFrame.from_dict(entry["proteins"][0], orient="index").transpose()]
            ).reset_index(drop=True)

        df_meta = df_meta.rename(columns={"accession": "pfam_accession"})
        df_prot = df_prot.rename(
            columns={
                "accession": "uniprot_accession",
                "source_database": "review_status",
            }
        )

        df_out = pd.concat([df_meta, df_prot], axis=1)
        # one row per domain location; then unpack the nested location dicts
        df_out = df_out.explode("entry_protein_locations").reset_index(drop=True)

        for key in ["model", "score"]:
            df_out[key] = df_out["entry_protein_locations"].apply(
                lambda x, k=key: x[k]
            )

        # only the first fragment of each location is used — TODO confirm
        # multi-fragment locations do not occur for these entries
        for key in ["start", "end", "dc-status"]:
            df_out[key] = df_out["entry_protein_locations"].apply(
                lambda x, k=key: x["fragments"][0][k]
            )

        del df_out["entry_protein_locations"]

        return df_out
    except KeyError:
        print("Error:")
        print(dict_json)
        print()
        return None


def concat_pfam(
    iter_uniprot: Iterable[str],
    iter_hgnc: Iterable[str],
) -> tuple[pd.DataFrame, dict[str, str], dict[str, str]]:
    """Concatenate Pfam domain information for a list of UniProt IDs

    Parameters
    ----------
    iter_uniprot : Iterable[str]
        Iterable of UniProt IDs
    iter_hgnc : Iterable[str]
        Iterable of HGNC symbols

    Returns
    -------
    pd.DataFrame
        DataFrame with Pfam domain information
    dict[str, str]
        Dictionary of HGNC symbols and UniProt IDs with errors
    dict[str, str]
        Dictionary of HGNC symbols and UniProt IDs with missing information
    """
    dict_error = {}
    dict_missing = {}
    df = pd.DataFrame()

    for uniprot, hgnc in zip(iter_uniprot, iter_hgnc):
        temp = retrieve_pfam(uniprot)

        # the three outcomes are mutually exclusive; the original chain of
        # independent `if`s fell through to `None.insert(...)` on errors
        if temp is None:
            dict_error[hgnc] = uniprot
        elif isinstance(temp, str):
            dict_missing[hgnc] = uniprot
        else:
            temp.insert(0, "hgnc", hgnc)
            df = pd.concat([df, temp]).reset_index(drop=True)

    return df, dict_error, dict_missing


def extract_numeric(input_string: str) -> str:
    """Return the digit characters of input_string concatenated in order

    Parameters
    ----------
    input_string : str
        String to scan (e.g. a mutation label like "p.V600E")

    Returns
    -------
    str
        All digits found, in order; empty string if none
    """
    return "".join(char for char in input_string if char.isdigit())


def find_pfam(
    input_hgnc: str,
    input_position: int,
    df_ref: pd.DataFrame,
) -> str | None:
    """Find Pfam domain for a given HGNC symbol and position

    Parameters
    ----------
    input_hgnc : str
        HGNC symbol
    input_position : int
        Codon position
    df_ref : pd.DataFrame
        DataFrame with Pfam domain information ("hgnc", "start", "end",
        "name" columns)

    Returns
    -------
    str | None
        Pfam domain name if the position falls inside a domain, None otherwise
    """
    df_temp = df_ref.loc[df_ref["hgnc"] == input_hgnc].reset_index()
    try:
        domain = df_temp.loc[
            ((input_position >= df_temp["start"]) & (input_position <= df_temp["end"])),
            "name",
        ].values[0]
        return domain
    # IndexError: no row of this symbol contains the position
    except IndexError:
        return None
def print_status_code_if_res_not_ok(
    res_input: requests.models.Response,
    dict_status_code: dict[int, str] | None = None,
) -> None:
    """Print the status code and status message if the response is not OK

    Note: the function itself does not check ``res_input.ok``; callers are
    expected to invoke it only on failed responses. Only ``status_code`` is
    read from the response object.

    Parameters
    ----------
    res_input : requests.models.Response
        Response object from a REST API request
    dict_status_code : dict[int, str] | None
        Dictionary of status codes and status messages; if None, defaults
        to a standard set of status codes

    Returns
    -------
    None
    """
    if dict_status_code is None:
        dict_status_code = {
            400: "Bad request",
            404: "Not found",
            415: "Unsupported media type",
            500: "Server error",
            503: "Service unavailable",
        }

    code = res_input.status_code
    # dict.get avoids the original try/except KeyError with two near-duplicate
    # print calls; unknown codes fall back to the bare-code message
    label = dict_status_code.get(code)
    if label is None:
        print(f"Error code: {code}")
    else:
        print(f"Error code: {code} ({label})")