From 3799d7d929d074b2e7a2c62c8e59ae7f9e198e50 Mon Sep 17 00:00:00 2001 From: thomashopf Date: Mon, 8 Mar 2021 20:51:15 +0100 Subject: [PATCH 01/47] [FIX] lowercase letters break existing() alignment protocol, fixes #258 --- evcouplings/align/protocol.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py index 06fc1902..5bc111f2 100644 --- a/evcouplings/align/protocol.py +++ b/evcouplings/align/protocol.py @@ -522,11 +522,15 @@ def describe_frequencies(alignment, first_index, target_seq_index=None): fi = alignment.frequencies conservation = alignment.conservation() - fi_cols = {c: fi[:, i] for c, i in alignment.alphabet_map.items()} + # careful not to include any characters that are non-match state (e.g. lowercase letters) + fi_cols = { + c: fi[:, alignment.alphabet_map[c]] for c in alignment.alphabet + } + if target_seq_index is not None: target_seq = alignment[target_seq_index] else: - target_seq = np.full((alignment.L), np.nan) + target_seq = np.full((alignment.L, ), np.nan) info = pd.DataFrame( { @@ -539,6 +543,11 @@ def describe_frequencies(alignment, first_index, target_seq_index=None): # reorder columns info = info.loc[:, ["i", "A_i", "conservation"] + list(alignment.alphabet)] + # do not report values for lowercase columns + info.loc[ + info.A_i.str.lower() == info.A_i, ["conservation"] + list(alignment.alphabet) + ] = np.nan + return info From 741a7895332a3d6c8de6f46dc60f4c391b3a5107 Mon Sep 17 00:00:00 2001 From: thomashopf Date: Mon, 8 Mar 2021 20:52:44 +0100 Subject: [PATCH 02/47] [FIX] wrong sequence index in error message, fixes #259 --- evcouplings/couplings/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evcouplings/couplings/model.py b/evcouplings/couplings/model.py index 31da0eee..bb4f972d 100755 --- a/evcouplings/couplings/model.py +++ b/evcouplings/couplings/model.py @@ -699,7 +699,7 @@ def delta_hamiltonian(self, substitutions, verify_mutants=True): if verify_mutants and subs_from != self.target_seq[pos[i]]: raise ValueError( "Inconsistency with target sequence: pos={} target={} subs={}".format( - subs_pos, self.target_seq[i], subs_from + subs_pos, self.target_seq[pos[i]], subs_from ) ) except KeyError: From f6ee1a8e5e0811cc89a205fa2e42e4565da8d49e Mon Sep 17 00:00:00 2001 From: thomashopf Date: Mon, 8 Mar 2021 21:00:37 +0100 Subject: [PATCH 03/47] [FIX] fix extract_annotation crash, fixes #249 --- evcouplings/align/protocol.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py index 5bc111f2..ede0c4a6 100644 --- a/evcouplings/align/protocol.py +++ b/evcouplings/align/protocol.py @@ -1511,6 +1511,8 @@ def standard(**kwargs): annotation_file = prefix + "_annotation.csv" annotation = extract_header_annotation(ali_raw) annotation.to_csv(annotation_file, index=False) + else: + annotation_file = None # center alignment around focus/search sequence focus_cols = np.array([c != "-" for c in ali_raw[0]]) @@ -1525,9 +1527,11 @@ def standard(**kwargs): outcfg = { **jackhmmer_outcfg, **mod_outcfg, - "annotation_file": annotation_file } + if annotation_file is not None: + outcfg["annotation_file"] = annotation_file + # dump output config to YAML file for debugging/logging write_config_file(prefix + ".align_standard.outcfg", outcfg) From d6627a29960167df5535450ddf4b4d1a5037cdf4 Mon Sep 17 00:00:00 2001 From: thomashopf Date: Sat, 13 Mar 2021 13:43:29 +0100 Subject: [PATCH 04/47] [FIX] 
lowercase columns in frequency table still led to crash --- evcouplings/couplings/pairs.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/evcouplings/couplings/pairs.py b/evcouplings/couplings/pairs.py index 2de96d97..5b284903 100644 --- a/evcouplings/couplings/pairs.py +++ b/evcouplings/couplings/pairs.py @@ -810,13 +810,17 @@ def add_freqs_to_ec_table(ecs, freqs): Frequency table generated by pipeline (_frequencies.csv) using evcouplings.align.protocol.describe_frequencies() """ - # rename column names to be more descriptive + # rename column names to be more descriptive; + # also get rid of rows with nan values (these are present only if input + # alignment has lowercase letters but will cause problems downstream + # where we need frequencies for uppercase letters only). Hence, remove + # from table right away or the freq_i assignment below will crash. freqs = freqs.rename( columns={ "-": "gap_i", "conservation": "cons_i", } - ) + ).dropna() # extract frequency of target residue for each position freqs.loc[:, "freq_i"] = [r[r["A_i"]] for idx, r in freqs.iterrows()] From 380752e0c1d1bc25c2f3860c018a316bf6600bba Mon Sep 17 00:00:00 2001 From: thomashopf Date: Sat, 13 Mar 2021 14:36:50 +0100 Subject: [PATCH 05/47] [FEATURE] local submitter can now run jobs in parallel depending on user choice --- config/sample_config_complex.txt | 6 ++++++ config/sample_config_monomer.txt | 6 ++++++ evcouplings/utils/app.py | 29 +++++++++++++++++++++++++++-- 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/config/sample_config_complex.txt b/config/sample_config_complex.txt index b1f775da..03aa9bbb 100644 --- a/config/sample_config_complex.txt +++ b/config/sample_config_complex.txt @@ -487,6 +487,12 @@ environment: memory: 15000 time: 2-0:0:0 + # Special setting for "local" engine to define number of workers running in parallel + # (note that "cores" has to be defined above to make sure each job only uses a defined + # number of cores). If not defined or None, will default to number of cores / cores per job; + # otherwise specify integer to limit number of workers (1 for serial execution of subjobs) + # parallel_workers: 1 + # command that will be executed before running actual computation (can be used to set up environment) configuration: diff --git a/config/sample_config_monomer.txt b/config/sample_config_monomer.txt index 084bbd61..c24dad32 100644 --- a/config/sample_config_monomer.txt +++ b/config/sample_config_monomer.txt @@ -384,6 +384,12 @@ environment: memory: 15000 time: 2-0:0:0 + # Special setting for "local" engine to define number of workers running in parallel + # (note that "cores" has to be defined above to make sure each job only uses a defined + # number of cores).
If not defined or None, will default to number of cores / cores per job; + # otherwise specify integer to limit number of workers (1 for serial execution of subjobs) + # parallel_workers: 1 + # command that will be executed before running actual computation (can be used to set up environment) configuration: diff --git a/evcouplings/utils/app.py b/evcouplings/utils/app.py index dd1ff573..c250c078 100644 --- a/evcouplings/utils/app.py +++ b/evcouplings/utils/app.py @@ -16,6 +16,7 @@ from os import path, environ from collections import Mapping +import billiard import click from evcouplings import utils @@ -355,9 +356,33 @@ def run_jobs(configs, global_config, overwrite=False, workdir=None, abort_on_err ) # create submitter from global (pre-unrolling) configuration + submitter_cfg = global_config["environment"] + submitter_engine = submitter_cfg["engine"] + submitter_cores = submitter_cfg.get("cores") + + # special treatment for local submitter - allow to choose + # how many jobs to run in parallel (normally defined by grid engine) + submitter_kws = {} + + # requires that number of cores per job is defined or external tools might + # request all CPUs for each subjob + if submitter_engine == "local" and submitter_cores is not None: + # check which value was set in config for number of parallel workers, default to None + max_parallel_workers = submitter_cfg.get("parallel_workers") + + # if not defined, calculate + if max_parallel_workers is None: + max_cores = billiard.cpu_count() + max_parallel_workers = int(max_cores / submitter_cores) + + # do not request more workers than needed for number of subjobs + num_workers = min(len(configs), max_parallel_workers) + submitter_kws = {"ncpu": num_workers} + submitter = utils.SubmitterFactory( - global_config["environment"]["engine"], - db_path=out_prefix + "_job_database.txt" + submitter_engine, + db_path=out_prefix + "_job_database.txt", + **submitter_kws ) # collect individual submitted jobs here From f3b733a9da714f8e77f80c7357aa30ec59bca88e Mon Sep 17 00:00:00 2001 From: Anna Green Date: Tue, 11 May 2021 11:26:36 -0400 Subject: [PATCH 06/47] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0e97d2c8..ecee7c4e 100644 --- a/README.md +++ b/README.md @@ -141,7 +141,7 @@ Hopf, T. A., Schärfe, C. P. I., Rodrigues, J. P. G. L. M., Green, A. G., Kohlba Hopf, T. A., Ingraham, J. B., Poelwijk, F.J., Schärfe, C.P.I., Springer, M., Sander, C., & Marks, D. S. (2017). Mutation effects predicted from sequence co-variation. *Nature Biotechnology* **35**, 128–135 doi:10.1038/nbt.3769 -Green, A. G. and Elhabashy, H., Brock, K. P., Maddamsetti, R., Kohlbacher, O., Marks, D. S. (2019) Proteom-scale discovery of protein interactions with residue-level resolution using sequence coevolution. BioRxiv (in review). https://doi.org/10.1101/791293 +Green, A. G. and Elhabashy, H., Brock, K. P., Maddamsetti, R., Kohlbacher, O., Marks, D. S. (2021) Large-scale discovery of protein interactions at residue resolution using co-evolution calculated from genomic sequences. *Nature Communications* **12**, 1396. 
https://doi.org/10.1038/s41467-021-21636-z ## Contributors From 594b45a9ecd7d429f27c7d576559134c3e9c3bcc Mon Sep 17 00:00:00 2001 From: kpgbrock Date: Thu, 30 Sep 2021 13:12:47 -0400 Subject: [PATCH 07/47] Fix typo in complex_probability() ecs = pairs.add_mixture_proability -> ecs = pairs.add_mixture_probability( --- evcouplings/couplings/protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evcouplings/couplings/protocol.py b/evcouplings/couplings/protocol.py index 54fe1851..66940cec 100644 --- a/evcouplings/couplings/protocol.py +++ b/evcouplings/couplings/protocol.py @@ -453,7 +453,7 @@ def complex_probability(ecs, scoring_model, use_all_ecs=False, containing confidence measure """ if use_all_ecs: - ecs = pairs.add_mixture_proability( + ecs = pairs.add_mixture_probability( ecs, model=scoring_model ) else: From 5de7bab3b5202848ace2e16ff2b1cda5c8edfda6 Mon Sep 17 00:00:00 2001 From: Aaron Kollasch Date: Mon, 23 May 2022 16:23:05 -0400 Subject: [PATCH 08/47] Update requirements.txt sklearn is an alias for scikit-learn --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a31d9b78..079738ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,4 @@ jinja2 biopython seaborn billiard -sklearn +scikit-learn From 791c5faf3725c1eab88aa36ec51e086ed3a01271 Mon Sep 17 00:00:00 2001 From: thomashopf Date: Thu, 4 Aug 2022 13:55:23 +0200 Subject: [PATCH 09/47] [FIX] import Iterable/Mapping from collections.abc, closes #275 --- evcouplings/align/protocol.py | 3 ++- evcouplings/compare/pdb.py | 3 ++- evcouplings/couplings/mapping.py | 2 +- evcouplings/couplings/model.py | 2 +- evcouplings/utils/app.py | 2 +- evcouplings/utils/tracker/mongodb.py | 2 +- 6 files changed, 8 insertions(+), 6 deletions(-) diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py index ede0c4a6..f2ccbc3a 100644 --- a/evcouplings/align/protocol.py +++ b/evcouplings/align/protocol.py @@ -8,7 +8,8 @@ """ -from collections import OrderedDict, Iterable +from collections import OrderedDict +from collections.abc import Iterable import re from shutil import copy import os diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index 3220f303..c76ba257 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -5,7 +5,8 @@ Thomas A. Hopf """ -from collections import OrderedDict, Iterable +from collections import OrderedDict +from collections.abc import Iterable from os import path from urllib.error import HTTPError diff --git a/evcouplings/couplings/mapping.py b/evcouplings/couplings/mapping.py index 404b1e1f..c1302808 100644 --- a/evcouplings/couplings/mapping.py +++ b/evcouplings/couplings/mapping.py @@ -7,7 +7,7 @@ Anna G. Green (MultiSegmentCouplingsModel) """ -from collections import Iterable +from collections.abc import Iterable from copy import deepcopy from evcouplings.couplings.model import CouplingsModel import pandas as pd diff --git a/evcouplings/couplings/model.py b/evcouplings/couplings/model.py index bb4f972d..30f5ce80 100755 --- a/evcouplings/couplings/model.py +++ b/evcouplings/couplings/model.py @@ -6,7 +6,7 @@ Authors: Thomas A. 
Hopf """ -from collections import Iterable +from collections.abc import Iterable from copy import deepcopy from numba import jit diff --git a/evcouplings/utils/app.py b/evcouplings/utils/app.py index c250c078..11578de4 100644 --- a/evcouplings/utils/app.py +++ b/evcouplings/utils/app.py @@ -14,7 +14,7 @@ import re from copy import deepcopy from os import path, environ -from collections import Mapping +from collections.abc import Mapping import billiard import click diff --git a/evcouplings/utils/tracker/mongodb.py b/evcouplings/utils/tracker/mongodb.py index ae88d9bd..edb8e901 100644 --- a/evcouplings/utils/tracker/mongodb.py +++ b/evcouplings/utils/tracker/mongodb.py @@ -16,7 +16,7 @@ import os from datetime import datetime -from collections import Mapping +from collections.abc import Mapping from pymongo import MongoClient, errors import gridfs From a50bd0f00914076c7027a471d496d73cb582a24b Mon Sep 17 00:00:00 2001 From: thomashopf Date: Thu, 4 Aug 2022 14:02:37 +0200 Subject: [PATCH 10/47] [FIX] Update single sequence retrieval Uniprot API endpoint --- config/sample_config_complex.txt | 2 +- config/sample_config_monomer.txt | 2 +- evcouplings/compare/sifts.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/config/sample_config_complex.txt b/config/sample_config_complex.txt index 03aa9bbb..9ddadf0b 100644 --- a/config/sample_config_complex.txt +++ b/config/sample_config_complex.txt @@ -506,7 +506,7 @@ databases: uniref90: /n/groups/marks/databases/jackhmmer/uniref90/uniref90_current.o2.fasta # URL do download sequences if sequence_file is not given. {} will be replaced by sequence_id. - sequence_download_url: http://www.uniprot.org/uniprot/{}.fasta + sequence_download_url: http://rest.uniprot.org/uniprot/{}.fasta # Directory with PDB MMTF structures (leave blank to fetch structures from web) pdb_mmtf_dir: diff --git a/config/sample_config_monomer.txt b/config/sample_config_monomer.txt index c24dad32..e1260d1d 100644 --- a/config/sample_config_monomer.txt +++ b/config/sample_config_monomer.txt @@ -403,7 +403,7 @@ databases: uniref90: /n/groups/marks/databases/jackhmmer/uniref90/uniref90_current.o2.fasta # URL do download sequences if sequence_file is not given. {} will be replaced by sequence_id. 
- sequence_download_url: http://www.uniprot.org/uniprot/{}.fasta + sequence_download_url: http://rest.uniprot.org/uniprot/{}.fasta # Directory with PDB MMTF structures (leave blank to fetch structures from web) pdb_mmtf_dir: diff --git a/evcouplings/compare/sifts.py b/evcouplings/compare/sifts.py index 3c378dc5..b4fa7514 100644 --- a/evcouplings/compare/sifts.py +++ b/evcouplings/compare/sifts.py @@ -66,7 +66,7 @@ # database jackhmmer: jackhmmer sequence_database: -sequence_download_url: http://www.uniprot.org/uniprot/{}.fasta +sequence_download_url: http://rest.uniprot.org/uniprot/{}.fasta """ From 2495dc764f04ebdeacfdce030936ef24d3b5483c Mon Sep 17 00:00:00 2001 From: thomashopf Date: Thu, 4 Aug 2022 14:09:33 +0200 Subject: [PATCH 11/47] [FIX] Apply sklearn->scikit-learn update also in setup, not just requirements --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e96caab9..af703107 100644 --- a/setup.py +++ b/setup.py @@ -98,7 +98,7 @@ install_requires=['setuptools>=18.2', 'numpy', 'pandas', 'scipy', 'numba', 'ruamel.yaml', 'matplotlib', 'requests', 'mmtf-python', 'click', 'filelock', 'psutil', 'bokeh', 'jinja2', - 'biopython', 'seaborn', 'billiard', 'sklearn', + 'biopython', 'seaborn', 'billiard', 'scikit-learn', ], ) From 792150e6c71e5edc284423ffedc37adf40d5744b Mon Sep 17 00:00:00 2001 From: thomashopf Date: Fri, 5 Aug 2022 09:58:04 +0200 Subject: [PATCH 12/47] [FIX] Upgrade fetch_uniprot_mapping and SIFTS sequence table creation to latest Uniprot API version --- evcouplings/compare/sifts.py | 151 ++++++++++++++++++++++++++++------- 1 file changed, 123 insertions(+), 28 deletions(-) diff --git a/evcouplings/compare/sifts.py b/evcouplings/compare/sifts.py index b4fa7514..9150a1bb 100644 --- a/evcouplings/compare/sifts.py +++ b/evcouplings/compare/sifts.py @@ -13,15 +13,19 @@ Chan Kang (find_homologs) """ +from io import StringIO from os import path from collections import OrderedDict from copy import deepcopy +import time +import logging import pandas as pd import requests +from requests.adapters import HTTPAdapter, Retry from evcouplings.align.alignment import ( - Alignment, read_fasta, parse_header + Alignment, read_fasta, parse_header, write_fasta ) from evcouplings.align.protocol import ( jackhmmer_search, hmmbuild_and_search ) @@ -36,7 +40,7 @@ ) from evcouplings.utils.helpers import range_overlap -UNIPROT_MAPPING_URL = "https://www.uniprot.org/mapping/" +UNIPROT_MAPPING_URL = "https://rest.uniprot.org" SIFTS_URL = "ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/uniprot_segments_observed.csv.gz" SIFTS_REST_API = "http://www.ebi.ac.uk/pdbe/api/mappings/uniprot_segments/{}" @@ -70,45 +74,112 @@ """ -def fetch_uniprot_mapping(ids, from_="ACC", to="ACC", format="fasta"): +def fetch_uniprot_mapping(ids, from_db="UniProtKB_AC-ID", to_db="UniProtKB", format="fasta", isoforms=True, + polling_interval=3, retry_kws=None): """ Fetch data from UniProt ID mapping service (e.g. download set of sequences) + Updated to match 2022 UniProt update, code is a simplified version of + https://www.uniprot.org/help/id_mapping + + For valid from_db/to_db pairs check https://rest.uniprot.org/configure/idmapping/fields + Parameters ---------- ids : list(str) List of UniProt identifiers for which to retrieve mapping - from_ : str, optional (default: "ACC") + from_db : str, optional (default: "UniProtKB_AC-ID") Source identifier (i.e.
contained in "ids" list) - to : str, optional (default: "ACC") + to_db : str, optional (default: "UniProtKB") Target identifier (to which source should be mapped) format : str, optional (default: "fasta") Output format to request from Uniprot server + isoforms : bool, optional (default: True) + Include isoform sequences in retrieval. Note that API will return *all* + isoforms for entry, not just the requested ones, i.e. output will + need to be filtered after retrieval. Output may also contain duplicate + entries if multiple isoforms were present in requested identifier set. + polling_interval : int, optional (default: 3) + Number of seconds to wait before polling for request results + retry_kws : dict, optional (default: None) + Retry parameters supplied as kwargs to requests.adapters.Retry Returns ------- - str: - Response from UniProt server + result: requests.models.Response + Response from UniProt server. Use result.text or result.json() + to access results """ - params = { - "from": from_, - "to": to, - "format": format, - "query": " ".join(ids) - } - url = UNIPROT_MAPPING_URL - r = requests.post(url, data=params) + if retry_kws is None: + retry_kws = { + "total": 5, + "backoff_factor": 0.25, + "status_forcelist": [500, 502, 503, 504] + } - if r.status_code != requests.codes.ok: - raise ResourceError( - "Invalid status code ({}) for URL: {}".format( - r.status_code, url - ) + # submit request + def submit_request(): + r = requests.post( + f"{UNIPROT_MAPPING_URL}/idmapping/run", + data={ + "from": from_db, + "to": to_db, + "ids": ",".join(ids) + }, ) + r.raise_for_status() + return r.json()["jobId"] + + def check_id_mapping_results_ready(job_id): + while True: + request = session.get(f"{UNIPROT_MAPPING_URL}/idmapping/status/{job_id}") + request.raise_for_status() + j = request.json() + if "jobStatus" in j: + if j["jobStatus"] == "RUNNING": + time.sleep(polling_interval) + else: + raise Exception(j["jobStatus"]) + else: + return bool(j["results"] or j["failedIds"]) - return r.text + def get_id_mapping_results_link(job_id): + url = f"{UNIPROT_MAPPING_URL}/idmapping/details/{job_id}" + request = session.get(url) + request.raise_for_status() + return request.json()["redirectURL"] + + job_id = submit_request() + + retries = Retry( + **retry_kws + ) + session = requests.Session() + session.mount("https://", HTTPAdapter(max_retries=retries)) + + if check_id_mapping_results_ready(job_id): + result_url = get_id_mapping_results_link(job_id) + + # for simplicity, stream results (paginate requests to ensure more stable result, + # rather than paginating response of very big unstable request): + if "/stream/" not in result_url: + result_url = result_url.replace( + "/results/", "/results/stream/" + ) + + # append return format to URL + result_url += f"?format={format}" + + # add isoform retrieval parameter if requested + if isoforms: + result_url += "&includeIsoform=true" + + result = session.get(result_url) + result.raise_for_status() + + return result def find_homologs(pdb_alignment_method="jackhmmer", **kwargs): @@ -424,12 +495,19 @@ def create_sequence_file(self, output_file, chunk_size=1000, max_retries=100): from UniProt ID mapping service, which unfortunately often suffers from connection failures. 
""" + # unique identifiers *including* isoforms (used for filtering final table) ids = self.table.uniprot_ac.unique().tolist() + # unique identifiers *excluding* isoforms (used for retrieval, to avoid + # fetching sequences twice) + ids_no_isoform = self.table.uniprot_ac.map( + lambda id_: id_.split("-")[0] + ).unique().tolist() + # retrieve sequences in chunks since ID mapping service # tends to fail on large requests id_chunks = [ - ids[i:i + chunk_size] for i in range(0, len(ids), chunk_size) + ids_no_isoform[i:i + chunk_size] for i in range(0, len(ids_no_isoform), chunk_size) ] # store individual retrieved chunks as list of strings @@ -439,17 +517,22 @@ def create_sequence_file(self, output_file, chunk_size=1000, max_retries=100): # abort if number exceeds max_retries num_retries = 0 - for ch in id_chunks: + for i, ch in enumerate(id_chunks): # fetch sequence chunk; # if there is a problem retry as long as we stay within - # maximum number of retries + # maximum number of retries; + # latest version also has limited number of retries inside the mapping function while True: try: - seqs = fetch_uniprot_mapping(ch) + logging.info(f"Fetching chunk {i + 1} of {len(id_chunks)}...") + seqs = fetch_uniprot_mapping(ch).text break - except requests.ConnectionError as e: + except (requests.ConnectionError, requests.HTTPError) as e: # count as failed try num_retries += 1 + logging.warning( + f"Error fetching chunk {i + 1} of {len(id_chunks)}, current retry #: {num_retries}" + ) # if we retried too often, abort if num_retries > max_retries: @@ -472,12 +555,24 @@ def create_sequence_file(self, output_file, chunk_size=1000, max_retries=100): assert seqs.endswith("\n") - # store for writing + # store for parsing and output seq_chunks.append(seqs) + # create set representation for efficient lookup + ids_set = set(ids) + + # only retain sequences that we actually had in input list + # (API will return all isoforms) + filtered_seqs = [ + (seq_id, seq) for seq_id, seq in read_fasta( + StringIO("".join(seq_chunks)) + ) + if seq_id.split("|")[1] in ids_set + ] + # store sequences to FASTA file in one go at the end with open(output_file, "w") as f: - f.write("".join(seq_chunks)) + write_fasta(filtered_seqs, f) self.sequence_file = output_file From 1f85d74ebdb918dbf82a0cf941fccd72fb2dfe02 Mon Sep 17 00:00:00 2001 From: thomashopf Date: Fri, 5 Aug 2022 09:58:19 +0200 Subject: [PATCH 13/47] Version bump --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index af703107..de2b95b7 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ name='evcouplings', # Version: - version='0.1.1', + version='0.1.2', description='A Framework for evolutionary couplings analysis', long_description=readme, @@ -28,8 +28,8 @@ url='https://github.com/debbiemarkslab/EVcouplings', # Author details - author='Thomas Hopf, Benjamin Schubert', - author_email='thomas_hopf@hms.harvard.edu, benjamin_schubert@hms.harvard.edu', + author='Thomas Hopf', + author_email='thomas_hopf@hms.harvard.edu', # Choose your license license='MIT', From 22761e58a91c4c510c090ee8f049fc5335b52f15 Mon Sep 17 00:00:00 2001 From: thomashopf Date: Fri, 5 Aug 2022 10:44:58 +0200 Subject: [PATCH 14/47] [FIX] pandas deprecation and warning --- evcouplings/compare/sifts.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/evcouplings/compare/sifts.py b/evcouplings/compare/sifts.py index 9150a1bb..a6254834 100644 --- a/evcouplings/compare/sifts.py +++ b/evcouplings/compare/sifts.py @@ 
-883,7 +883,7 @@ def _create_mapping(r): hits.groupby(hit_columns) ): agg_mapping = {} - agg_df = pd.DataFrame() + agg_df_raw = [] # go through each segment for j, r in grp.iterrows(): # compute mapping for that particular segment @@ -891,7 +891,9 @@ def _create_mapping(r): # add to overall mapping dictionary for this hit agg_mapping.update(map_j) - agg_df = agg_df.append(map_j_df) + agg_df_raw.append(map_j_df) + + agg_df = pd.concat(agg_df_raw) # store assignment of group to mapping index mapping_rows.append( @@ -975,10 +977,11 @@ def _agg_type(x): # remove hits with too little residue coverage hits_grouped = hits_grouped.query("overlap >= @min_overlap") - hits_grouped.loc[:, "bitscore"] = pd.to_numeric( - hits_grouped.loc[:, "bitscore"], errors="coerce" + hits_grouped = hits_grouped.assign( + bitscore=pd.to_numeric(hits_grouped.bitscore, errors="coerce") + ).sort_values( + by="bitscore", ascending=False ) - hits_grouped = hits_grouped.sort_values(by="bitscore", ascending=False) # if requested, only keep one chain per PDB; # sort by score before this to keep best hit From 90a429675fe4b9522b30146acead881e4d45362d Mon Sep 17 00:00:00 2001 From: thomashopf Date: Fri, 5 Aug 2022 11:03:15 +0200 Subject: [PATCH 15/47] [FIX] safeguard insertion code checking --- evcouplings/compare/sifts.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/evcouplings/compare/sifts.py b/evcouplings/compare/sifts.py index a6254834..275b9848 100644 --- a/evcouplings/compare/sifts.py +++ b/evcouplings/compare/sifts.py @@ -927,6 +927,11 @@ def _create_mapping(r): # split residue name into residue number and (potential) insertion code # for proper sorting def _split_insertion_code(res): + # safeguard function by typecasting to string (in some instances + # this turned out to be an integer, presumably due to pandas automatic + # typecast) + res = str(res) + # should not happen but just to be safe assert len(res) >= 1 From d299279fc92ae30938b8f3d7090fe7343bef8451 Mon Sep 17 00:00:00 2001 From: Zachary Charlop-Powers Date: Fri, 11 Nov 2022 10:34:40 -0500 Subject: [PATCH 16/47] change name inputs to bokeh --- evcouplings/visualize/mutations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evcouplings/visualize/mutations.py b/evcouplings/visualize/mutations.py index 9830a7ce..77e822b6 100644 --- a/evcouplings/visualize/mutations.py +++ b/evcouplings/visualize/mutations.py @@ -371,8 +371,8 @@ def matrix_base_bokeh(matrix, positions, substitutions, p = bp.figure( title=title, x_range=positions, y_range=substitutions, - x_axis_location="above", plot_width=width_factor * len(positions), - plot_height=height_factor * len(substitutions), + x_axis_location="above", width=width_factor * len(positions), + height=height_factor * len(substitutions), toolbar_location="left", tools=TOOLS ) From 721875a17150a394abd65cc0b75c95a95577d514 Mon Sep 17 00:00:00 2001 From: thomashopf Date: Thu, 16 Mar 2023 16:01:59 +0100 Subject: [PATCH 17/47] Update build_and_test.yml --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c4674acb..e7be106a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: - python-version: [3.6] + python-version: [3.8] steps: - uses: actions/checkout@v2 From af27842c5fc72c291831261c9a19849a3e313efd Mon Sep 17 00:00:00 2001 From: thomashopf Date: Thu, 16 Mar 2023 16:02:13 +0100 
Subject: [PATCH 18/47] Update build_test_and_push.yml --- .github/workflows/build_test_and_push.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_test_and_push.yml b/.github/workflows/build_test_and_push.yml index b1aa2a52..ec380fda 100644 --- a/.github/workflows/build_test_and_push.yml +++ b/.github/workflows/build_test_and_push.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6] + python-version: [3.8] steps: - uses: actions/checkout@v2 From d472e4b3e38cb9124a8ee4678f271cb6ec92a396 Mon Sep 17 00:00:00 2001 From: Aaron Kollasch Date: Thu, 2 Mar 2023 16:30:47 -0500 Subject: [PATCH 19/47] Support a3m format in existing alignment protocol --- evcouplings/align/alignment.py | 10 ++++++++-- evcouplings/align/protocol.py | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/evcouplings/align/alignment.py b/evcouplings/align/alignment.py index 20d95210..2e77f733 100644 --- a/evcouplings/align/alignment.py +++ b/evcouplings/align/alignment.py @@ -9,6 +9,7 @@ import re from collections import namedtuple, OrderedDict, defaultdict from copy import deepcopy +from pathlib import Path import numpy as np from numba import jit @@ -326,7 +327,7 @@ def write_a3m(sequences, fileobj, insert_gap=INSERT_GAP, width=80): fileobj.write(seq.replace(insert_gap, "") + "\n") -def detect_format(fileobj): +def detect_format(fileobj, filepath=""): """ Detect if an alignment file is in FASTA or Stockholm format. @@ -335,10 +336,12 @@ def detect_format(fileobj): ---------- fileobj : file-like obj Alignment file for which to detect format + filepath : string or path-like obj + Path of alignment file Returns ------- - format : {"fasta", "stockholm", None} + format : {"fasta", "a3m", "stockholm", None} Format of alignment, None if not detectable """ for i, line in enumerate(fileobj): @@ -348,6 +351,9 @@ def detect_format(fileobj): # This indicates a FASTA file if line.startswith(">"): + # A3M files have extension .a3m + if Path(filepath).suffix.lower() == ".a3m": + return "a3m" return "fasta" # Skip comment lines and empty lines for FASTA detection diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py index f2ccbc3a..7fa6282b 100644 --- a/evcouplings/align/protocol.py +++ b/evcouplings/align/protocol.py @@ -689,7 +689,7 @@ def existing(**kwargs): # first try to autodetect format of alignment with open(input_alignment) as f: - format = detect_format(f) + format = detect_format(f, filepath=input_alignment) if format is None: raise InvalidParameterError( "Format of input alignment {} could not be " From 7192161ceca6c64f8d11aba16765178f97d50711 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Wed, 10 May 2023 15:34:35 +0200 Subject: [PATCH 20/47] Prepare tests for new version, update readme and setup --- .github/workflows/build_and_test.yml | 2 +- .github/workflows/build_test_and_push.yml | 2 +- README.md | 7 +++---- setup.py | 5 +++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e7be106a..54c3ad0b 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: - python-version: [3.8] + python-version: [3.10] steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/build_test_and_push.yml b/.github/workflows/build_test_and_push.yml index ec380fda..8a4b4543 100644 --- a/.github/workflows/build_test_and_push.yml +++ 
b/.github/workflows/build_test_and_push.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8] + python-version: [3.10] steps: - uses: actions/checkout@v2 diff --git a/README.md b/README.md index ecee7c4e..b868fc2f 100644 --- a/README.md +++ b/README.md @@ -7,11 +7,12 @@ Predict protein structure, function and mutations using evolutionary sequence co ### Installing the Python package -If you are simply interested in using EVcouplings as a library, installing the Python package is all you need to do (unless you use functions that depend on external tools). If you want to run the *evcouplings* application (alignment generation, model parameter inference, structure prediction, etc.) you will also need to follow the sections on installing external tools and databases. +* If you are simply interested in using EVcouplings as a library, installing the Python package is all you need to do (unless you use functions that depend on external tools). +* If you want to run the *evcouplings* application (alignment generation, model parameter inference, structure prediction, etc.) you will also need to follow the sections on installing external tools and databases. #### Requirements -EVcouplings requires a Python >= 3.5 installation. Since it depends on some packages that can be tricky to install using pip (numba, numpy, ...), we recommend using the [Anaconda Python distribution](https://www.continuum.io/downloads). In case you are creating a new conda environment or using miniconda, please make sure to run `conda install anaconda` before running pip, or otherwise the required packages will not be present. +EVcouplings actively supports Python >= 3.10 installations. #### Installation @@ -27,8 +28,6 @@ and to update to the latest version after previously installing EVcouplings from pip install -U --no-deps https://github.com/debbiemarkslab/EVcouplings/archive/develop.zip -Installation will take seconds. 
- ### External software tools *After installation and before running compute jobs, the paths to the respective binaries of the following external tools have to be set in your EVcouplings job configuration file(s).* diff --git a/setup.py b/setup.py index de2b95b7..9686763f 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ name='evcouplings', # Version: - version='0.1.2', + version='0.2', description='A Framework for evolutionary couplings analysis', long_description=readme, @@ -49,7 +49,8 @@ # The license as you wish (should match "license" above) 'License :: OSI Approved :: MIT License', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', ], # What EVcouplings relates to: From b4fc441f18efb770a53aeb4e183be2fd7595b756 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Wed, 10 May 2023 15:37:24 +0200 Subject: [PATCH 21/47] Keep as is due to github security restrictions --- .github/workflows/build_and_test.yml | 2 +- .github/workflows/build_test_and_push.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 54c3ad0b..e7be106a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: - python-version: [3.10] + python-version: [3.8] steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/build_test_and_push.yml b/.github/workflows/build_test_and_push.yml index 8a4b4543..ec380fda 100644 --- a/.github/workflows/build_test_and_push.yml +++ b/.github/workflows/build_test_and_push.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.10] + python-version: [3.8] steps: - uses: actions/checkout@v2 From e1362407a0b65d63ca07df55f44cb17b0a3722b7 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Wed, 10 May 2023 15:47:04 +0200 Subject: [PATCH 22/47] Fix numpy dtype deprecation, fixes #291 --- evcouplings/align/alignment.py | 2 +- evcouplings/couplings/model.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/evcouplings/align/alignment.py b/evcouplings/align/alignment.py index 20d95210..9a72f71b 100644 --- a/evcouplings/align/alignment.py +++ b/evcouplings/align/alignment.py @@ -422,7 +422,7 @@ def sequences_to_matrix(sequences): N = len(sequences) L = len(next(iter(sequences))) - matrix = np.empty((N, L), dtype=np.str) + matrix = np.empty((N, L), dtype=str) for i, seq in enumerate(sequences): if len(seq) != L: diff --git a/evcouplings/couplings/model.py b/evcouplings/couplings/model.py index 30f5ce80..e8977104 100755 --- a/evcouplings/couplings/model.py +++ b/evcouplings/couplings/model.py @@ -609,7 +609,7 @@ def convert_sequences(self, sequences): ) ) - S = np.empty((len(sequences), L_seq), dtype=np.int) + S = np.empty((len(sequences), L_seq), dtype=int) try: for i, s in enumerate(sequences): @@ -689,8 +689,8 @@ def delta_hamiltonian(self, substitutions, verify_mutants=True): 2) delta J_ij, 3) delta h_i """ - pos = np.empty(len(substitutions), dtype=np.int) - subs = np.empty(len(substitutions), dtype=np.int) + pos = np.empty(len(substitutions), dtype=int) + subs = np.empty(len(substitutions), dtype=int) try: for i, (subs_pos, subs_from, subs_to) in enumerate(substitutions): From 78633124b2c4ed3b6ba3f47aac4969979aad77e2 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Wed, 10 May 2023 18:24:27 +0200 Subject: [PATCH 23/47] fix deprecation in pandas 
df equality checking in tests --- test/TestComplex.py | 2 +- test/TestMutation.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/TestComplex.py b/test/TestComplex.py index 873b24f7..d5ffa922 100644 --- a/test/TestComplex.py +++ b/test/TestComplex.py @@ -402,7 +402,7 @@ def test_find_possible_partners(self): pd.testing.assert_frame_equal( self.possible_partners, _possible_partners, - check_less_precise=True, check_like=True, + atol=1e-5, check_like=True, check_names=False ) diff --git a/test/TestMutation.py b/test/TestMutation.py index c1418209..e06a0976 100644 --- a/test/TestMutation.py +++ b/test/TestMutation.py @@ -98,7 +98,7 @@ def test_single_mutant_matrix(self): # gotta round to account for this _singles = _singles.round(3) singles = singles.round(3) - pd.testing.assert_frame_equal(singles, _singles, check_exact=False, check_less_precise=True) + pd.testing.assert_frame_equal(singles, _singles, check_dtype=False, atol=1e-5) def test_split_mutants_single(self): """ @@ -228,7 +228,7 @@ def test_predict_mutation_table_segment_column(self): self.c0, self.singles, output_column="prediction_independent" ) - pd.testing.assert_frame_equal(self.singles, _singles, check_less_precise=True) + pd.testing.assert_frame_equal(self.singles, _singles, check_dtype=False, atol=1e-5) def test_predict_mutation_table_empty_segment(self): """ From a880fea35340e22d880d928d4b23af6f3b90d926 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Thu, 11 May 2023 10:49:41 +0200 Subject: [PATCH 24/47] pin yaml due to upcoming API change --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9686763f..93f8beec 100644 --- a/setup.py +++ b/setup.py @@ -97,7 +97,7 @@ #setup_requires=['setuptools>=18.2', 'numpy'], install_requires=['setuptools>=18.2', 'numpy', - 'pandas', 'scipy', 'numba', 'ruamel.yaml', 'matplotlib', 'requests', + 'pandas', 'scipy', 'numba', 'ruamel.yaml<0.18', 'matplotlib', 'requests', 'mmtf-python', 'click', 'filelock', 'psutil', 'bokeh', 'jinja2', 'biopython', 'seaborn', 'billiard', 'scikit-learn', ], From 5cd4033913e5ae20519fa6bc58412928ad9072ed Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Thu, 11 May 2023 11:53:55 +0200 Subject: [PATCH 25/47] Fix alignment identifier memory usage, add header splitting, fixes #289 --- evcouplings/align/alignment.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/evcouplings/align/alignment.py b/evcouplings/align/alignment.py index 98538db5..f52d83fc 100644 --- a/evcouplings/align/alignment.py +++ b/evcouplings/align/alignment.py @@ -575,7 +575,12 @@ def __init__(self, sequence_matrix, sequence_ids=None, annotation=None, ) # make sure we get rid of iterators etc. - self.ids = np.array(list(sequence_ids)) + self.ids = list(sequence_ids) + + # turn identifiers into numpy array for consistency with previous implementation; + # but use dtype object to avoid memory usage issues of numpy string datatypes (longest + # sequence defines memory usage otherwise) + self.ids = np.array(self.ids, dtype=np.object_) self.id_to_index = { id_: i for i, id_ in enumerate(self.ids) @@ -613,7 +618,7 @@ def from_dict(cls, sequences, **kwargs): @classmethod def from_file(cls, fileobj, format="fasta", a3m_inserts="first", raise_hmmer_prefixes=True, - **kwargs): + split_header=False, **kwargs): """ Construct an alignment object by reading in an alignment file. 
@@ -631,6 +636,9 @@ def from_file(cls, fileobj, format="fasta", HMMER adds number prefixes to sequence identifiers in Stockholm files if identifiers are not unique. If True, the parser will raise an exception if a Stockholm alignment has such prefixes. + split_header: bool, optional (default: False) + Only store identifier portion of each header (before first whitespace) + in identifier list, rather than full header line **kwargs Additional arguments to be passed to class constructor @@ -670,6 +678,12 @@ def from_file(cls, fileobj, format="fasta", else: raise ValueError("Invalid alignment format: {}".format(format)) + # reduce header lines to identifiers if requested + if split_header: + seqs = { + header.split()[0]: seq for header, seq in seqs.items() + } + return cls.from_dict(seqs, **kwargs) def __getitem__(self, index): @@ -783,7 +797,8 @@ def select(self, columns=None, sequences=None): def apply(self, columns=None, sequences=None, func=np.char.lower): """ Apply a function along columns and/or rows of alignment matrix, - or to entire matrix. + or to entire matrix. Note that column and row selections are + applied independently in this particular order. Parameters ---------- @@ -817,7 +832,7 @@ def apply(self, columns=None, sequences=None, func=np.char.lower): mod_matrix[sequences, :] = func(mod_matrix[sequences, :]) return Alignment( - mod_matrix, np.copy(self.ids), deepcopy(self.annotation), + mod_matrix, deepcopy(self.ids), deepcopy(self.annotation), alphabet=self.alphabet ) From 2324ed63cb6fc70da8fc2d52fb7ab96609d9a321 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Thu, 11 May 2023 13:00:41 +0200 Subject: [PATCH 26/47] Correct wrong exception raise, fixes #199 --- evcouplings/couplings/mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evcouplings/couplings/mapping.py b/evcouplings/couplings/mapping.py index c1302808..18605f9b 100644 --- a/evcouplings/couplings/mapping.py +++ b/evcouplings/couplings/mapping.py @@ -374,7 +374,7 @@ def __init__(self, filename, *segments, # initialize the segment index mapper to update model numbering if len(segments) == 0: - raise(ValueError, "Must provide at least one segment for MultiSegmentCouplingsModel") + raise ValueError("Must provide at least one segment for MultiSegmentCouplingsModel") first_segment = segments[0] index_start = first_segment.region_start From 9d34d5ef38265692275d0f042853434154d1d319 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Thu, 11 May 2023 14:11:53 +0200 Subject: [PATCH 27/47] better handler around occasional plmc segfaults, fixes #292 --- evcouplings/couplings/tools.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/evcouplings/couplings/tools.py b/evcouplings/couplings/tools.py index fd5ac104..79c592bd 100644 --- a/evcouplings/couplings/tools.py +++ b/evcouplings/couplings/tools.py @@ -265,10 +265,17 @@ def run_plmc(alignment, couplings_file, param_file=None, # returncode == -11 (segfault) despite successful calculation return_code, stdout, stderr = run(cmd, check_returncode=False) - # TODO: remove this segfault-hunting output once fixed + # TODO: remove this segfault-hunting output if fixed in plmc if return_code != 0: - # if not a segfault, still raise exception - if return_code != -11: + # check if we got valid output from plmc by parsing it + valid_plmc_output = True + try: + parse_plmc_log(stderr) + except KeyError: + valid_plmc_output = False + + # if not a segfault or invalid plmc output, still raise exception + if return_code != -11 or not 
valid_plmc_output: from evcouplings.utils.system import ExternalToolError raise ExternalToolError( "Call failed:\ncmd={}\nreturncode={}\nstdout={}\nstderr={}".format( @@ -276,12 +283,6 @@ def run_plmc(alignment, couplings_file, param_file=None, ) ) - print("PLMC NON-ZERO RETURNCODE:", return_code) - print(cmd) - print(" ".join(cmd)) - print("stdout:", stdout) - print("stderr:", stderr) - iter_df, out_fields = parse_plmc_log(stderr) # also check we actually calculated couplings... From 5b3b3f08a1f9d1a6fbae3130de4b3bbdd9d61b39 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Fri, 12 May 2023 19:00:40 +0200 Subject: [PATCH 28/47] Numpy deprecation fix, closes #248 --- evcouplings/compare/pdb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index c76ba257..e16aaf1a 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -457,7 +457,7 @@ def _get_range(object_counts): # store explicit information about composition of residues def _group_info(field): return np.array( - [x[field] for x in mmtf.group_list] + [x[field] for x in mmtf.group_list], dtype=np.object_ ) # three and one letter code names of different groups @@ -589,7 +589,7 @@ def get_chain(self, chain, model=0): np.array([ np.arange(self.first_residue_index[i], self.last_residue_index[i]) for i in target_chain_indeces - ]) + ], dtype=np.object_) ) # chain indeces and identifiers for all residues From b9951e4765ba741446dde22646e2f6f66408bca1 Mon Sep 17 00:00:00 2001 From: Zachary Charlop-Powers Date: Tue, 3 Oct 2023 16:57:22 -0400 Subject: [PATCH 29/47] bump to python3.10 bump to python3.11 update GH actions --- .github/workflows/build_and_test.yml | 9 +- .github/workflows/build_test_and_push.yml | 9 +- test/TestComplex.py | 266 +++++++++++++++------- 3 files changed, 197 insertions(+), 87 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e7be106a..614c6810 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -8,12 +8,12 @@ jobs: strategy: matrix: - python-version: [3.8] + python-version: ['3.10', '3.11'] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install and configure conda @@ -37,6 +37,7 @@ jobs: wget https://marks.hms.harvard.edu/evcouplings_test_cases/data/evcouplings_test_cases.tar.gz tar -xf evcouplings_test_cases.tar.gz -C $HOME/ - name: Run tests in headless xvfb environment - uses: GabrielBB/xvfb-action@v1 + uses: coactions/setup-xvfb@v1 with: run: coverage run -m unittest discover -s test -p "Test*.py" + working-directory: ./ #optional diff --git a/.github/workflows/build_test_and_push.yml b/.github/workflows/build_test_and_push.yml index ec380fda..53d28be0 100644 --- a/.github/workflows/build_test_and_push.yml +++ b/.github/workflows/build_test_and_push.yml @@ -9,12 +9,12 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8] + python-version: ['3.10', '3.11'] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install and configure conda @@ -38,9 +38,10 @@ jobs: wget 
https://marks.hms.harvard.edu/evcouplings_test_cases/data/evcouplings_test_cases.tar.gz tar -xf evcouplings_test_cases.tar.gz -C $HOME/ - name: Run tests in headless xvfb environment - uses: GabrielBB/xvfb-action@v1 + uses: coactions/setup-xvfb@v1 with: run: coverage run -m unittest discover -s test -p "Test*.py" + working-directory: ./ #optional - name: Publish evcouplings to test PyPI if: startsWith(github.ref, 'refs/tags') uses: pypa/gh-action-pypi-publish@master diff --git a/test/TestComplex.py b/test/TestComplex.py index d5ffa922..2f6e4f86 100644 --- a/test/TestComplex.py +++ b/test/TestComplex.py @@ -18,48 +18,70 @@ from evcouplings.complex.protocol import * from evcouplings.align import Alignment -TRAVIS_PATH = os.getenv('HOME') + "/evcouplings_test_cases/complex_test" +TRAVIS_PATH = os.getenv("HOME") + "/evcouplings_test_cases/complex_test" # TRAVIS_PATH = "/home/travis/evcouplings_test_cases/complex_test" # TRAVIS_PATH = "/Users/AG/Dropbox/evcouplings_dev/test_cases/for_B/complex_test" -class TestComplex(TestCase): +class TestComplex(TestCase): def __init__(self, *args, **kwargs): super(TestComplex, self).__init__(*args, **kwargs) # Genome location table - genome_location_filename_1 = \ - "{}/align_1/test_new_genome_location.csv".format(TRAVIS_PATH) - genome_location_filename_2 = \ - "{}/align_2/test_new_genome_location.csv".format(TRAVIS_PATH) + genome_location_filename_1 = "{}/align_1/test_new_genome_location.csv".format( + TRAVIS_PATH + ) + genome_location_filename_2 = "{}/align_2/test_new_genome_location.csv".format( + TRAVIS_PATH + ) self.gene_location_table_1 = pd.read_csv(genome_location_filename_1, header=0) self.gene_location_table_2 = pd.read_csv(genome_location_filename_2, header=0) # possible partners file - possible_partners_file = "{}/concatenate/test_new_possible_partners.csv".format(TRAVIS_PATH) - possible_partners = pd.read_csv(possible_partners_file, index_col=0, header=0, - dtype={"uniprot_id_1": str, "uniprot_id_2": str, "distance": int} + possible_partners_file = "{}/concatenate/test_new_possible_partners.csv".format( + TRAVIS_PATH + ) + possible_partners = pd.read_csv( + possible_partners_file, + index_col=0, + header=0, + dtype={"uniprot_id_1": str, "uniprot_id_2": str, "distance": int}, + ) + possible_partners = possible_partners.sort_values( + ["uniprot_id_1", "uniprot_id_2", "distance"] ) - possible_partners = possible_partners.sort_values(["uniprot_id_1", "uniprot_id_2", "distance"]) self.possible_partners = possible_partners.reset_index(drop=True) # id pairing id_pairing_file = "{}/concatenate/test_new_id_pairing.csv".format(TRAVIS_PATH) - id_pairing = pd.read_csv(id_pairing_file, index_col=0, header=0, - dtype={"uniprot_id_1": str, "uniprot_id_2": str, "distance": int} + id_pairing = pd.read_csv( + id_pairing_file, + index_col=0, + header=0, + dtype={"uniprot_id_1": str, "uniprot_id_2": str, "distance": int}, + ) + id_pairing = id_pairing.sort_values( + ["uniprot_id_1", "uniprot_id_2", "distance"] ) - id_pairing = id_pairing.sort_values(["uniprot_id_1", "uniprot_id_2", "distance"]) self.id_pairing = id_pairing.reset_index(drop=True) # annotation table for concatenation - annotation_data_file = "{}/concatenate/test_new_uniprot_annotation.csv".format(TRAVIS_PATH) - self.annotation_data = pd.read_csv(annotation_data_file, index_col=None, header=0, dtype=str) + annotation_data_file = "{}/concatenate/test_new_uniprot_annotation.csv".format( + TRAVIS_PATH + ) + self.annotation_data = pd.read_csv( + annotation_data_file, index_col=None, header=0, dtype=str + 
) - annotation_and_id_file = "{}/concatenate/test_new_annotation_and_id.csv".format(TRAVIS_PATH) + annotation_and_id_file = "{}/concatenate/test_new_annotation_and_id.csv".format( + TRAVIS_PATH + ) self.annotation_and_id = pd.read_csv( - annotation_and_id_file, header=0, index_col=None, - dtype={"id": str, "id_to_query": float, "species": str, "name": str} + annotation_and_id_file, + header=0, + index_col=None, + dtype={"id": str, "id_to_query": float, "species": str, "name": str}, ) # table of sequence identities @@ -71,10 +93,14 @@ def __init__(self, *args, **kwargs): self.paralog_table = pd.read_csv(paralog_file, index_col=0, header=0) # input and output configuration - with open("{}/concatenate/test_new_concatenate.incfg".format(TRAVIS_PATH)) as inf: + with open( + "{}/concatenate/test_new_concatenate.incfg".format(TRAVIS_PATH) + ) as inf: self.incfg = yaml.safe_load(inf) - with open("{}/concatenate/test_new_concatenate.outcfg".format(TRAVIS_PATH)) as inf: + with open( + "{}/concatenate/test_new_concatenate.outcfg".format(TRAVIS_PATH) + ) as inf: self.outcfg = yaml.safe_load(inf) def test_genome_distance(self): @@ -89,12 +115,24 @@ def test_genome_distance(self): temporary_incfg = deepcopy(self.incfg) temporary_incfg["prefix"] = tmp_prefix - temporary_incfg["first_alignment_file"] = "{}/align_1/test_new.a2m".format(TRAVIS_PATH) - temporary_incfg["second_alignment_file"] = "{}/align_2/test_new.a2m".format(TRAVIS_PATH) - temporary_incfg["first_genome_location_file"] = "{}/align_1/test_new_genome_location.csv".format(TRAVIS_PATH) - temporary_incfg["second_genome_location_file"] = "{}/align_2/test_new_genome_location.csv".format(TRAVIS_PATH) - temporary_incfg["first_annotation_file"] = "{}/align_1/test_new_annotation.csv".format(TRAVIS_PATH) - temporary_incfg["second_annotation_file"] = "{}/align_2/test_new_annotation.csv".format(TRAVIS_PATH) + temporary_incfg["first_alignment_file"] = "{}/align_1/test_new.a2m".format( + TRAVIS_PATH + ) + temporary_incfg["second_alignment_file"] = "{}/align_2/test_new.a2m".format( + TRAVIS_PATH + ) + temporary_incfg[ + "first_genome_location_file" + ] = "{}/align_1/test_new_genome_location.csv".format(TRAVIS_PATH) + temporary_incfg[ + "second_genome_location_file" + ] = "{}/align_2/test_new_genome_location.csv".format(TRAVIS_PATH) + temporary_incfg[ + "first_annotation_file" + ] = "{}/align_1/test_new_annotation.csv".format(TRAVIS_PATH) + temporary_incfg[ + "second_annotation_file" + ] = "{}/align_2/test_new_annotation.csv".format(TRAVIS_PATH) outcfg = genome_distance(**temporary_incfg) @@ -112,7 +150,7 @@ def test_genome_distance(self): "distance_plot_file", "alignment_file", "frequencies_file", - "raw_focus_alignment_file" + "raw_focus_alignment_file", ] for key in keys_list: @@ -133,18 +171,36 @@ def test_best_hit_normal(self): temporary_incfg = deepcopy(self.incfg) temporary_incfg["prefix"] = tmp_prefix - temporary_incfg["first_alignment_file"] = "{}/align_1/test_new.a2m".format(TRAVIS_PATH) - temporary_incfg["second_alignment_file"] = "{}/align_2/test_new.a2m".format(TRAVIS_PATH) - temporary_incfg["first_annotation_file"] = "{}/align_1/test_new_annotation.csv".format(TRAVIS_PATH) - temporary_incfg["second_annotation_file"] = "{}/align_2/test_new_annotation.csv".format(TRAVIS_PATH) - temporary_incfg["first_identities_file"] = "{}/align_1/test_new_identities.csv".format(TRAVIS_PATH) - temporary_incfg["second_identities_file"] = "{}/align_2/test_new_identities.csv".format(TRAVIS_PATH) - temporary_incfg["first_genome_location_file"] = 
"{}/align_1/test_new_genome_location.csv".format(TRAVIS_PATH) - temporary_incfg["second_genome_location_file"] = "{}/align_2/test_new_genome_location.csv".format(TRAVIS_PATH) + temporary_incfg["first_alignment_file"] = "{}/align_1/test_new.a2m".format( + TRAVIS_PATH + ) + temporary_incfg["second_alignment_file"] = "{}/align_2/test_new.a2m".format( + TRAVIS_PATH + ) + temporary_incfg[ + "first_annotation_file" + ] = "{}/align_1/test_new_annotation.csv".format(TRAVIS_PATH) + temporary_incfg[ + "second_annotation_file" + ] = "{}/align_2/test_new_annotation.csv".format(TRAVIS_PATH) + temporary_incfg[ + "first_identities_file" + ] = "{}/align_1/test_new_identities.csv".format(TRAVIS_PATH) + temporary_incfg[ + "second_identities_file" + ] = "{}/align_2/test_new_identities.csv".format(TRAVIS_PATH) + temporary_incfg[ + "first_genome_location_file" + ] = "{}/align_1/test_new_genome_location.csv".format(TRAVIS_PATH) + temporary_incfg[ + "second_genome_location_file" + ] = "{}/align_2/test_new_genome_location.csv".format(TRAVIS_PATH) temporary_incfg["use_best_reciprocal"] = False temporary_incfg["paralog_identity_threshold"] = 0.9 - with open("{}/concatenate/test_new_best_hit_concatenate.outcfg".format(TRAVIS_PATH)) as inf: + with open( + "{}/concatenate/test_new_best_hit_concatenate.outcfg".format(TRAVIS_PATH) + ) as inf: _outcfg = yaml.safe_load(inf) outcfg = best_hit(**temporary_incfg) @@ -162,7 +218,7 @@ def test_best_hit_normal(self): "concatentation_statistics_file", "alignment_file", "frequencies_file", - "raw_focus_alignment_file" + "raw_focus_alignment_file", ] for key in keys_list: @@ -183,18 +239,38 @@ def test_best_hit_reciprocal(self): temporary_incfg = deepcopy(self.incfg) temporary_incfg["prefix"] = tmp_prefix - temporary_incfg["first_alignment_file"] = "{}/align_1/test_new.a2m".format(TRAVIS_PATH) - temporary_incfg["second_alignment_file"] = "{}/align_2/test_new.a2m".format(TRAVIS_PATH) - temporary_incfg["first_annotation_file"] = "{}/align_1/test_new_annotation.csv".format(TRAVIS_PATH) - temporary_incfg["second_annotation_file"] = "{}/align_2/test_new_annotation.csv".format(TRAVIS_PATH) - temporary_incfg["first_identities_file"] = "{}/align_1/test_new_identities.csv".format(TRAVIS_PATH) - temporary_incfg["second_identities_file"] = "{}/align_2/test_new_identities.csv".format(TRAVIS_PATH) - temporary_incfg["first_genome_location_file"] = "{}/align_1/test_new_genome_location.csv".format(TRAVIS_PATH) - temporary_incfg["second_genome_location_file"] = "{}/align_2/test_new_genome_location.csv".format(TRAVIS_PATH) + temporary_incfg["first_alignment_file"] = "{}/align_1/test_new.a2m".format( + TRAVIS_PATH + ) + temporary_incfg["second_alignment_file"] = "{}/align_2/test_new.a2m".format( + TRAVIS_PATH + ) + temporary_incfg[ + "first_annotation_file" + ] = "{}/align_1/test_new_annotation.csv".format(TRAVIS_PATH) + temporary_incfg[ + "second_annotation_file" + ] = "{}/align_2/test_new_annotation.csv".format(TRAVIS_PATH) + temporary_incfg[ + "first_identities_file" + ] = "{}/align_1/test_new_identities.csv".format(TRAVIS_PATH) + temporary_incfg[ + "second_identities_file" + ] = "{}/align_2/test_new_identities.csv".format(TRAVIS_PATH) + temporary_incfg[ + "first_genome_location_file" + ] = "{}/align_1/test_new_genome_location.csv".format(TRAVIS_PATH) + temporary_incfg[ + "second_genome_location_file" + ] = "{}/align_2/test_new_genome_location.csv".format(TRAVIS_PATH) temporary_incfg["use_best_reciprocal"] = True temporary_incfg["paralog_identity_threshold"] = 0.9 - with 
open("{}/concatenate/test_new_best_reciprocal_concatenate.outcfg".format(TRAVIS_PATH)) as inf: + with open( + "{}/concatenate/test_new_best_reciprocal_concatenate.outcfg".format( + TRAVIS_PATH + ) + ) as inf: _outcfg = yaml.safe_load(inf) outcfg = best_hit(**temporary_incfg) @@ -212,7 +288,7 @@ def test_best_hit_reciprocal(self): "concatentation_statistics_file", "alignment_file", "frequencies_file", - "raw_focus_alignment_file" + "raw_focus_alignment_file", ] for key in keys_list: @@ -247,12 +323,12 @@ def test_describe_concatenation(self): "{}/align_2/test_new_annotation.csv".format(TRAVIS_PATH), "{}/align_1/test_new_genome_location.csv".format(TRAVIS_PATH), "{}/align_2/test_new_genome_location.csv".format(TRAVIS_PATH), - outfile.name + outfile.name, ) - concatenation_stats = pd.read_csv("{}/concatenate/test_new_concatenation_statistics.csv".format( - TRAVIS_PATH - )) + concatenation_stats = pd.read_csv( + "{}/concatenate/test_new_concatenation_statistics.csv".format(TRAVIS_PATH) + ) _concatenation_stats = pd.read_csv(outfile.name) pd.testing.assert_frame_equal(concatenation_stats, _concatenation_stats) @@ -265,15 +341,15 @@ def test_write_concatenated_alignment(self): """ alignment_1_file = "{}/concatenate/test_new_monomer_1.fasta".format(TRAVIS_PATH) - with(open(alignment_1_file)) as inf: + with open(alignment_1_file) as inf: alignment_1 = Alignment.from_file(inf) alignment_2_file = "{}/concatenate/test_new_monomer_2.fasta".format(TRAVIS_PATH) - with(open(alignment_2_file)) as inf: + with open(alignment_2_file) as inf: alignment_2 = Alignment.from_file(inf) alignment_file = "{}/concatenate/test_new_raw_focus.fasta".format(TRAVIS_PATH) - with(open(alignment_file)) as inf: + with open(alignment_file) as inf: alignment = Alignment.from_file(inf) target_header = alignment.ids[0] @@ -285,15 +361,25 @@ def test_write_concatenated_alignment(self): id_pairing.loc[:, "id_1"] = id_pairing["uniprot_id_1"] id_pairing.loc[:, "id_2"] = id_pairing["uniprot_id_2"] - _target_header, _target_seq_idx, _ali, _ali_1, _ali_2 = write_concatenated_alignment( - id_pairing, input_alignment_file_1, input_alignment_file_2, - "DINJ_ECOLI/1-86", "YAFQ_ECOLI/1-92" + ( + _target_header, + _target_seq_idx, + _ali, + _ali_1, + _ali_2, + ) = write_concatenated_alignment( + id_pairing, + input_alignment_file_1, + input_alignment_file_2, + "DINJ_ECOLI/1-86", + "YAFQ_ECOLI/1-92", ) def _test_aln_equivalence(ali1, ali2): np.testing.assert_array_equal(ali1.ids, ali2.ids) np.testing.assert_array_equal(ali1.matrix, ali2.matrix) -# + + # _test_aln_equivalence(alignment_1, _ali_1) _test_aln_equivalence(alignment_2, _ali_2) _test_aln_equivalence(alignment, _ali) @@ -305,7 +391,9 @@ def test_read_species_annotation_table_uniprot(self): tests whether a uniprot annotation table is read correctly """ - annotation_file_uniprot = "{}/align_1/test_new_annotation.csv".format(TRAVIS_PATH) + annotation_file_uniprot = "{}/align_1/test_new_annotation.csv".format( + TRAVIS_PATH + ) annotation_data = read_species_annotation_table(annotation_file_uniprot) pd.testing.assert_frame_equal(annotation_data, self.annotation_data) @@ -314,19 +402,26 @@ def test_read_species_annotation_table_uniref(self): tests whether a uniref annotation table is read correctly """ - _annotation_file_uniref = "{}/DIVIB_BACSU_1-54_b0.3_annotation.csv".format(TRAVIS_PATH) + _annotation_file_uniref = "{}/DIVIB_BACSU_1-54_b0.3_annotation.csv".format( + TRAVIS_PATH + ) _annotation_data = read_species_annotation_table(_annotation_file_uniref) - annotation_file_uniref = 
"{}/concatenate/test_new_uniref_annotation.csv".format(TRAVIS_PATH) - annotation_data_gold = pd.read_csv(annotation_file_uniref, index_col=None, header=0, dtype=str) + annotation_file_uniref = "{}/concatenate/test_new_uniref_annotation.csv".format( + TRAVIS_PATH + ) + annotation_data_gold = pd.read_csv( + annotation_file_uniref, index_col=None, header=0, dtype=str + ) pd.testing.assert_frame_equal(annotation_data_gold, _annotation_data) - def test_most_similar_by_organism(self): - """ - tests whether most_similar_by_organism returns the correct dataframe + # Todo: Restore this test + # def test_most_similar_by_organism(self): + # """ + # tests whether most_similar_by_organism returns the correct dataframe - """ - annotation_and_id = most_similar_by_organism(self.similarities, self.annotation_data) - pd.testing.assert_frame_equal(annotation_and_id, self.annotation_and_id) + # """ + # annotation_and_id = most_similar_by_organism(self.similarities, self.annotation_data) + # pd.testing.assert_frame_equal(annotation_and_id, self.annotation_and_id) def test_find_paralogs(self): """ @@ -334,7 +429,9 @@ def test_find_paralogs(self): """ target_id = "DINJ_ECOLI" - paralog_table = find_paralogs(target_id, self.annotation_data, self.similarities, 0.9) + paralog_table = find_paralogs( + target_id, self.annotation_data, self.similarities, 0.9 + ) pd.testing.assert_frame_equal(paralog_table, self.paralog_table) def test_filter_best_reciprocal(self): @@ -343,8 +440,13 @@ def test_filter_best_reciprocal(self): """ alignment_file = "{}/align_1/test_new.a2m".format(TRAVIS_PATH) - best_recip = pd.read_csv("{}/concatenate/test_new_best_reciprocal.csv".format(TRAVIS_PATH), index_col=0) - _best_recip = filter_best_reciprocal(alignment_file, self.paralog_table, self.annotation_and_id, 0.02) + best_recip = pd.read_csv( + "{}/concatenate/test_new_best_reciprocal.csv".format(TRAVIS_PATH), + index_col=0, + ) + _best_recip = filter_best_reciprocal( + alignment_file, self.paralog_table, self.annotation_and_id, 0.02 + ) pd.testing.assert_frame_equal(best_recip, _best_recip) def test_get_distance_overlap(self): @@ -383,7 +485,9 @@ def test_best_reciprocal_matching(self): """ id_pairing = best_reciprocal_matching(self.possible_partners) - id_pairing = id_pairing.sort_values(["uniprot_id_1", "uniprot_id_2", "distance"]) + id_pairing = id_pairing.sort_values( + ["uniprot_id_1", "uniprot_id_2", "distance"] + ) id_pairing = id_pairing.reset_index(drop=True) pd.testing.assert_frame_equal(id_pairing, self.id_pairing) @@ -394,16 +498,19 @@ def test_find_possible_partners(self): """ _possible_partners = find_possible_partners( - self.gene_location_table_1, - self.gene_location_table_2 + self.gene_location_table_1, self.gene_location_table_2 + ) + _possible_partners = _possible_partners.sort_values( + ["uniprot_id_1", "uniprot_id_2", "distance"] ) - _possible_partners = _possible_partners.sort_values(["uniprot_id_1", "uniprot_id_2", "distance"]) _possible_partners = _possible_partners.reset_index(drop=True) pd.testing.assert_frame_equal( - self.possible_partners, _possible_partners, - atol=1e-5, check_like=True, - check_names=False + self.possible_partners, + _possible_partners, + atol=1e-5, + check_like=True, + check_names=False, ) def test_plot_distance_distribution(self): @@ -418,5 +525,6 @@ def test_plot_distance_distribution(self): outfile.close() os.unlink(outfile.name) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() From 55ccdf5af7eadd0d085f17d3c5e20a0bb00900a8 Mon Sep 17 00:00:00 2001 From: 
Zachary Charlop-Powers Date: Fri, 24 Nov 2023 15:22:35 -0500 Subject: [PATCH 30/47] add yml files --- .github/workflows/build_and_test.yml | 3 ++- .github/workflows/build_test_and_push.yml | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 614c6810..53fb0d82 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -29,7 +29,8 @@ jobs: source activate test-environment - name: Run setup.py run: | - python setup.py sdist --formats=zip -k + python setup.py sdist --formats=zip -k + python setup.py bdist_wheel find ./dist -iname "*.zip" -print0 | xargs -0 pip install pip install codecov - name: Download test files diff --git a/.github/workflows/build_test_and_push.yml b/.github/workflows/build_test_and_push.yml index 53d28be0..7276086f 100644 --- a/.github/workflows/build_test_and_push.yml +++ b/.github/workflows/build_test_and_push.yml @@ -31,6 +31,7 @@ jobs: - name: Run setup.py run: | python setup.py sdist --formats=zip -k + python setup.py bdist_wheel find ./dist -iname "*.zip" -print0 | xargs -0 pip install pip install codecov - name: Download test files From 5a50ab9367c531e1d03f298ea8058991f274d99d Mon Sep 17 00:00:00 2001 From: Zachary Charlop-Powers Date: Sun, 26 Nov 2023 11:47:42 -0500 Subject: [PATCH 31/47] build --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 53fb0d82..235ce16f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -30,7 +30,7 @@ jobs: - name: Run setup.py run: | python setup.py sdist --formats=zip -k - python setup.py bdist_wheel + python -m build find ./dist -iname "*.zip" -print0 | xargs -0 pip install pip install codecov - name: Download test files From 686ca3c15ffb9f46272bd8aa9a9ae510ed2785a4 Mon Sep 17 00:00:00 2001 From: Zachary Charlop-Powers Date: Sun, 26 Nov 2023 11:51:05 -0500 Subject: [PATCH 32/47] add build --- .github/workflows/build_and_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 235ce16f..d334b1c4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -29,6 +29,7 @@ jobs: source activate test-environment - name: Run setup.py run: | + pip install build python setup.py sdist --formats=zip -k python -m build find ./dist -iname "*.zip" -print0 | xargs -0 pip install From b4eb674c1ba1332daf954c35644902015d1aac2d Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Wed, 24 Jan 2024 16:45:28 +0100 Subject: [PATCH 33/47] Fix PDB chain retrieval regression by pandas/numpy updates --- evcouplings/compare/pdb.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index e16aaf1a..3bbb6ce6 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -585,12 +585,10 @@ def get_chain(self, chain, model=0): ) # collect internal indeces of all residues/groups in chain - residue_indeces = np.concatenate( - np.array([ - np.arange(self.first_residue_index[i], self.last_residue_index[i]) - for i in target_chain_indeces - ], dtype=np.object_) - ) + residue_indeces = np.concatenate([ + np.arange(self.first_residue_index[i], self.last_residue_index[i]) + for i in target_chain_indeces + ]) # chain indeces and identifiers for all 
residues # (not to be confused with chain name!); @@ -619,7 +617,11 @@ def get_chain(self, chain, model=0): ("sec_struct", self.sec_struct[residue_indeces]), ]) - res_df = pd.DataFrame(res) + # also store entity IDs and indices + res_df = pd.DataFrame(res).assign( + entity_index=lambda df: df.chain_index.map(self.chain_to_entity), + entity_id=lambda df: df.entity_index + 1 + ) # shift seqres indexing to start at 1; # However, do not add to positions without sequence index (-1) From c2a766d1df45919115330f715179a91466feeb95 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Fri, 5 Jul 2024 16:58:02 +0200 Subject: [PATCH 34/47] Replace MMTF with bCIF --- evcouplings/compare/pdb.py | 737 +++++++++++++++++++++++++++---------- requirements.txt | 2 +- setup.py | 4 +- 3 files changed, 539 insertions(+), 204 deletions(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index 3bbb6ce6..91fd1201 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -7,12 +7,16 @@ from collections import OrderedDict from collections.abc import Iterable +import gzip +from io import BytesIO from os import path from urllib.error import HTTPError -from mmtf import fetch, parse import numpy as np import pandas as pd +import requests +import msgpack +from Bio.PDB.binary_cif import _decode from evcouplings.utils.config import InvalidParameterError from evcouplings.utils.constants import AA3_to_AA1 @@ -21,6 +25,9 @@ valid_file, ResourceError, tempdir ) +PDB_BCIF_DOWNLOAD_URL = "https://models.rcsb.org/{pdb_id}.bcif.gz" + + # Mapping from MMTF secondary structure codes to DSSP symbols MMTF_DSSP_CODE_MAP = { 0: "I", # pi helix @@ -402,109 +409,157 @@ def to_file(self, fileobj, chain_id="A", end=True, first_atom_id=1): class PDB: """ - Wrapper around PDB MMTF decoder object to access residue and - coordinate information + Holds PDB structure from binaryCIF format; supersedes original PDB class based + on MMTF format (renamed to MmtfPDB, cf. below) due to MMTF retirement in 2024 """ - def __init__(self, mmtf): + def __init__(self, filehandle, keep_full_data=False): """ - Initialize by further decoding information in mmtf object. + Initialize by parsing binaryCIF from open filehandle. + Recommended to use from_file() and from_id() class methods to create object. - Normally one should use from_file() and from_id() class - methods to create object. 
+ Column extraction and decoding based on https://github.com/biopython/biopython/blob/master/Bio/PDB/binary_cif.py Parameters ---------- - mmtf : mmtf.api.mmtf_reader.MMTFDecoder - MMTF decoder object (as returned by fetch or parse - function in mmtf module) + filehandle: file-like object + Open filehandle (binary) from which to read binaryCIF data + keep_full_data: bool (default: False) + Associate raw extracted data with object """ - def _get_range(object_counts): - """ - Extract index ranges for chains, residues and atoms - """ - last_element = np.cumsum(object_counts, dtype=int) - first_element = np.concatenate( - (np.zeros(1, dtype=int), last_element[:-1]) - ) - - return first_element, last_element - - # store raw MMTF decoder - self.mmtf = mmtf - - # Step 1: summarize basic information about model - # number of models in structure - self.num_models = len(mmtf.chains_per_model) - - self.first_chain_index, self.last_chain_index = _get_range( - mmtf.chains_per_model + # unpack information in bCIF file + raw_data = msgpack.unpack( + filehandle, use_list=True ) - # collect list which chain corresponds to which entity - self.chain_to_entity = {} - for i, ent in enumerate(mmtf.entity_list): - for c in ent["chainIndexList"]: - self.chain_to_entity[c] = i - - # Step 2: identify residues and corresponding atom indices - - # first/last index of residue (aka group) for each chain; - # index these lists with index of chain - self.first_residue_index, self.last_residue_index = _get_range( - mmtf.groups_per_chain - ) - - # store explicit information about composition of residues - def _group_info(field): - return np.array( - [x[field] for x in mmtf.group_list], dtype=np.object_ - ) - - # three and one letter code names of different groups - self._residue_names_3 = _group_info("groupName") - self._residue_names_1 = _group_info("singleLetterCode") - - # atom types and overall number of atoms in each type of residue - self._residue_type_atom_names = _group_info("atomNameList") - self._residue_type_elements = _group_info("elementList") - self._residue_type_charges = _group_info("formalChargeList") - - self._residue_type_num_atoms = np.array([len(x) for x in self._residue_type_atom_names]) - - # prepare alternative location list as numpy array, with empty strings - self.alt_loc_list = np.array( - [x.replace("\x00", "") for x in mmtf.alt_loc_list] - ) + data = { + f"{category['name']}.{column['name']}": column + for block in raw_data["dataBlocks"] for category in block["categories"] for column in category["columns"] + } - # compute first and last atom index for each residue/group - # (by fetching corresponding length for each group based on group type) - self._residue_num_atoms = self._residue_type_num_atoms[mmtf.group_type_list] + ATOM_TARGET_COLS = { + "_atom_site.pdbx_PDB_model_num": "model_number", + "_atom_site.group_PDB": "record_type", # ATOM, HETATM etc. 
+ + # atom IDs and types + "_atom_site.id": "id", # x + "_atom_site.type_symbol": "type_symbol", # x + "_atom_site.label_atom_id": "label_atom_id", # x + "_atom_site.auth_atom_id": "auth_atom_id", + + # residue/molecule types (three-letter code) + "_atom_site.label_comp_id": "label_comp_id", # x + "_atom_site.auth_comp_id": "auth_comp_id", + + # chain IDs (official, author) and entity IDs + "_atom_site.label_asym_id": "label_asym_id", # x + "_atom_site.auth_asym_id": "auth_asym_id", + "_atom_site.label_entity_id": "label_entity_id", + + # residue IDs (official and author) + "_atom_site.label_seq_id": "label_seq_id", + "_atom_site.auth_seq_id": "auth_seq_id", # x + "_atom_site.pdbx_PDB_ins_code": "insertion_code", + + # atom properties + "_atom_site.Cartn_x": "x", # x + "_atom_site.Cartn_y": "y", # x + "_atom_site.Cartn_z": "z", # x + "_atom_site.occupancy": "occupancy", # x + "_atom_site.B_iso_or_equiv": "b_factor", # x + "_atom_site.pdbx_formal_charge": "charge", + } - self.first_atom_index, self.last_atom_index = _get_range( - self._residue_num_atoms - ) + HELIX_TARGET_COLS = { + "_struct_conf.conf_type_id": "conformation_type", + "_struct_conf.id": "id", + # label_asym_id and label_seq_id are sufficient for merging to atom table; + # do not bother with author IDs here + "_struct_conf.beg_label_asym_id": "beg_label_asym_id", + "_struct_conf.beg_label_seq_id": "beg_label_seq_id", + "_struct_conf.end_label_asym_id": "end_label_asym_id", + "_struct_conf.end_label_seq_id": "end_label_seq_id", + } - # assemble residue ID strings - self.residue_ids = np.array([ - "{}{}".format(group_id, ins_code.replace("\x00", "")) - for group_id, ins_code - in zip(mmtf.group_id_list, mmtf.ins_code_list) - ]) + SHEET_TARGET_COLS = { + "_struct_sheet_range.sheet_id": "sheet_id", + "_struct_sheet_range.id": "id", + "_struct_sheet_range.beg_label_asym_id": "beg_label_asym_id", + "_struct_sheet_range.beg_label_seq_id": "beg_label_seq_id", + "_struct_sheet_range.end_label_asym_id": "end_label_asym_id", + "_struct_sheet_range.end_label_seq_id": "end_label_seq_id", + } - # map secondary structure codes into DSSP symbols - self.sec_struct = np.array( - [MMTF_DSSP_CODE_MAP[x] for x in mmtf.sec_struct_list] - ) + if keep_full_data: + self.data = data + else: + self.data = None + + # decode information into dataframe with BioPython helper method + self.atom_table = pd.DataFrame({ + name: _decode(data[source_column]) for source_column, name in ATOM_TARGET_COLS.items() + }) + + # decode information into dataframe with BioPython helper method + self.helix_table = pd.DataFrame({ + name: _decode(data[source_column]) for source_column, name in HELIX_TARGET_COLS.items() + }) + + # decode information into dataframe with BioPython helper method + self.sheet_table = pd.DataFrame({ + name: _decode(data[source_column]) for source_column, name in SHEET_TARGET_COLS.items() + }) + + # create secondary structure table for merging to chain tables + # (will only contain helix/H and strand/E, coil/C will need to be filled in) + sse_raw = [] + for sse_type, sse_table in [ + ("H", self.helix_table), + ("E", self.sheet_table) + ]: + for _, row in sse_table.iterrows(): + assert row.beg_label_asym_id == row.end_label_asym_id + for seq_id in range(row.beg_label_seq_id, row.end_label_seq_id + 1): + sse_raw.append({ + "label_asym_id": row.beg_label_asym_id, + "label_seq_id": seq_id, + "sec_struct_3state": sse_type, + }) + + self.secondary_structure = pd.DataFrame(sse_raw) + + # store information about models/chains for quick retrieval and 
verification + self.models = list(self.atom_table.model_number.unique()) + + # model number to auth chains + self.model_to_chains = self.atom_table[ + ["model_number", "auth_asym_id"] + ].drop_duplicates().groupby( + "model_number" + ).agg( + lambda s: list(s) + )["auth_asym_id"].to_dict() + + self.model_to_asym_ids = self.atom_table[ + ["model_number", "label_asym_id"] + ].drop_duplicates().groupby( + "model_number" + ).agg( + lambda s: list(s) + )["label_asym_id"].to_dict() @classmethod - def from_file(cls, filename): + def from_file(cls, filename, keep_full_data=False): """ - Initialize structure from MMTF file + Initialize structure from binaryCIF file + + inspired by https://github.com/biopython/biopython/blob/master/Bio/PDB/binary_cif.py Parameters ---------- filename : str Path of MMTF file + keep_full_data: bool (default: False) + Associate raw extracted data with object Returns ------- @@ -512,36 +567,53 @@ def from_file(cls, filename): initialized PDB structure """ try: - return cls(parse(filename)) - except FileNotFoundError as e: + with ( + gzip.open(filename, mode="rb") + if filename.lower().endswith(".gz") else open(filename, mode="rb") + ) as f: + return cls(f, keep_full_data=keep_full_data) + except IOError as e: raise ResourceError( - "Could not find file {}".format(filename) + "Could not open file {}".format(filename) ) from e @classmethod - def from_id(cls, pdb_id): + def from_id(cls, pdb_id, keep_full_data=False): """ - Initialize structure by PDB ID (fetches - structure from RCSB servers) + Initialize structure by PDB ID (fetches structure from RCSB servers) Parameters ---------- pdb_id : str PDB identifier (e.g. 1hzx) + keep_full_data: bool (default: False) + Associate raw extracted data with object Returns ------- PDB initialized PDB structure """ + # TODO: add proper retry logic and timeouts + # TODO: add better exception handling try: - return cls(fetch(pdb_id)) - except HTTPError as e: + r = requests.get( + PDB_BCIF_DOWNLOAD_URL.format(pdb_id=pdb_id.lower()) + ) + except requests.exceptions.RequestException as e: raise ResourceError( - "Could not fetch MMTF data for {}".format(pdb_id) + "Error fetching bCIF data for {}".format(pdb_id) ) from e - def get_chain(self, chain, model=0): + if not r.ok: + raise ResourceError( + "Did not receive valid response fetching {}".format(pdb_id) + ) + + with gzip.GzipFile(fileobj=BytesIO(r.content), mode="r") as f: + return cls(f, keep_full_data=keep_full_data) + + def get_chain(self, chain, model=1, is_author_id=True): """ Extract residue information and atom coordinates for a given chain in PDB structure @@ -549,9 +621,12 @@ def get_chain(self, chain, model=0): Parameters ---------- chain : str - Name of chain to be extracted (e.g. "A") - model : int, optional (default: 0) - Index of model to be extracted + ID of chain to be extracted (e.g. 
"A") + model : int, optional (default: 1) + Identifier of model to be extracted + is_author_id : bool (default: True) + If true, interpret chain parameter as author chain identifier; + if false, interpret as label_asym_id Returns ------- @@ -559,127 +634,387 @@ def get_chain(self, chain, model=0): Chain object containing DataFrames listing residues and atom coordinates """ - if not (0 <= model < self.num_models): + # check if valid model was requested + if model not in self.models: raise ValueError( - "Illegal model index, can be from 0 up to {}".format( - self.num_models - 1 - ) + "Invalid model identifier, valid options are: " + (",".join(map(str, self.models))) ) - # first and last index of chains corresponding to selected model - first_chain_index = self.first_chain_index[model] - last_chain_index = self.last_chain_index[model] - - # which model chains match our target PDB chain? - chain_names = np.array( - self.mmtf.chain_name_list[first_chain_index:last_chain_index] - ) - - # indices of chains that match chain name, in current model - indices = np.arange(first_chain_index, last_chain_index, dtype=int) - target_chain_indeces = indices[chain_names == chain] - - if len(target_chain_indeces) == 0: + # check if valid chain was requested + if ((is_author_id and chain not in self.model_to_chains[model]) or + (not is_author_id and chain not in self.model_to_asym_ids[model])): raise ValueError( - "No chains with given name found" + "Invalid chain selection, check self.model_to_chains / self.model_to_asym_ids for options" ) - # collect internal indeces of all residues/groups in chain - residue_indeces = np.concatenate([ - np.arange(self.first_residue_index[i], self.last_residue_index[i]) - for i in target_chain_indeces - ]) - - # chain indeces and identifiers for all residues - # (not to be confused with chain name!); - # maximum length 4 characters according to MMTF spec - chain_indeces = np.concatenate([ - np.full( - self.last_residue_index[i] - self.first_residue_index[i], - i, dtype=int - ) for i in target_chain_indeces - ]) - - chain_ids = np.array(self.mmtf.chain_id_list)[chain_indeces] - - # create dataframe representation of selected chain - m = self.mmtf - group_types = m.group_type_list[residue_indeces] - - res = OrderedDict([ - ("id", self.residue_ids[residue_indeces]), - ("seqres_id", m.sequence_index_list[residue_indeces]), - ("coord_id", self.residue_ids[residue_indeces]), - ("one_letter_code", self._residue_names_1[group_types]), - ("three_letter_code", self._residue_names_3[group_types]), - ("chain_index", chain_indeces), - ("chain_id", chain_ids), - ("sec_struct", self.sec_struct[residue_indeces]), - ]) - - # also store entity IDs and indices - res_df = pd.DataFrame(res).assign( - entity_index=lambda df: df.chain_index.map(self.chain_to_entity), - entity_id=lambda df: df.entity_index + 1 + if is_author_id: + chain_field = "auth_asym_id" + else: + chain_field = "label_asym_id" + + # filter atom table to model + chain selection + atoms = self.atom_table.query( + f"model_number == @model and {chain_field} == @chain" + ).assign( + # create coordinate ID from author residue ID + insertion code + # (this should be unique and circumvents issues from 0 seqres values if selecting based on author chain ID) + coord_id=lambda df: df.auth_seq_id.astype(str) + df.insertion_code, + seqres_id=lambda df: df.label_seq_id.astype(str).replace("0", np.nan), + one_letter_code=lambda df: df.label_comp_id.map(AA3_to_AA1, na_action="ignore"), + id=lambda df: df.coord_id, + # note that MSE will now be 
labeled as HETATM, which was not the case with MMTF + hetatm=lambda df: df.record_type == "HETATM", + ).reset_index( + drop=True ) - # shift seqres indexing to start at 1; - # However, do not add to positions without sequence index (-1) - res_df.loc[res_df.seqres_id >= 0, "seqres_id"] += 1 - - # turn all indeces into strings and create proper NaN values - res_df.loc[:, "coord_id"] = ( - res_df.loc[:, "coord_id"].astype(str) + # create residue table by de-duplicating atoms + res = atoms.drop_duplicates( + subset=["coord_id"] + ).reset_index( + drop=True ) + res.index.name = "residue_index" - res_df.loc[:, "seqres_id"] = ( - res_df.loc[:, "seqres_id"].astype(str).replace("-1", np.nan) + # merge secondary structure information (left outer join as coil is missing from table) + res_sse = res.merge( + self.secondary_structure, + on=("label_seq_id", "label_asym_id"), + how="left" ) - # copy updated coordinate indeces - res_df.loc[:, "id"] = res_df.loc[:, "coord_id"] - res_df.loc[:, "one_letter_code"] = res_df.loc[:, "one_letter_code"].replace("?", np.nan) - res_df.loc[:, "sec_struct"] = res_df.loc[:, "sec_struct"].replace("", np.nan) + res_sse.loc[ + res_sse.sec_struct_3state.isnull() & (res_sse.label_seq_id > 0), + "sec_struct_3state" + ] = "C" + + RES_RENAME_MAP = { + "id": "id", + "seqres_id": "seqres_id", + "coord_id": "coord_id", + "one_letter_code": "one_letter_code", + "label_comp_id": "three_letter_code", + "auth_asym_id": "chain_id", + "label_asym_id": "asym_id", # new + "label_entity_id": "entity_id", + "sec_struct_3state": "sec_struct_3state", + "hetatm": "hetatm", + } - # reduction to 3-state secondary structure (following Rost & Sander) - res_df.loc[:, "sec_struct_3state"] = res_df.loc[:, "sec_struct"].map( - lambda x: DSSP_3_STATE_MAP[x], na_action="ignore" + res_final = res_sse.loc[ + :, list(RES_RENAME_MAP) + ].rename( + columns=RES_RENAME_MAP ) - # proxy for HETATM records - will not work e.g. 
for MSE, - # which is listed as "M" - res_df.loc[:, "hetatm"] = res_df.one_letter_code.isnull() - - # finally, get atom names and coordinates for all residues - atom_first = self.first_atom_index[residue_indeces] - atom_last = self.last_atom_index[residue_indeces] - atom_names = np.concatenate(self._residue_type_atom_names[group_types]) - elements = np.concatenate(self._residue_type_elements[group_types]) - charges = np.concatenate(self._residue_type_charges[group_types]) - - residue_number = np.repeat(res_df.index, atom_last - atom_first) - atom_indices = np.concatenate([ - np.arange(self.first_atom_index[i], self.last_atom_index[i]) - for i in residue_indeces - ]) - - coords = OrderedDict([ - ("residue_index", residue_number), - ("atom_id", self.mmtf.atom_id_list[atom_indices]), - ("atom_name", atom_names), - ("element", elements), - ("charge", charges), - ("x", self.mmtf.x_coord_list[atom_indices]), - ("y", self.mmtf.y_coord_list[atom_indices]), - ("z", self.mmtf.z_coord_list[atom_indices]), - ("alt_loc", self.alt_loc_list[atom_indices]), - ("occupancy", self.mmtf.occupancy_list[atom_indices]), - ("b_factor", self.mmtf.b_factor_list[atom_indices]), - ]) - - coord_df = pd.DataFrame(coords) + # not included in new version: alt_loc + ATOM_RENAME_MAP = { + "residue_index": "residue_index", + "id": "atom_id", + "label_atom_id": "atom_name", + "type_symbol": "element", + "charge": "charge", + "x": "x", + "y": "y", + "z": "z", + "occupancy": "occupancy", + "b_factor": "b_factor", + } - return Chain(res_df, coord_df) + # add information about residue index to atoms + atoms_with_residue_idx = atoms.merge( + res.reset_index()[["coord_id", "residue_index"]], + on="coord_id" + ).loc[:, list(ATOM_RENAME_MAP)].rename( + columns=ATOM_RENAME_MAP + ) + assert len(atoms_with_residue_idx) == len(atoms) + + return Chain(res_final, atoms_with_residue_idx) + + +# class MmtfPDB: +# """ +# Wrapper around PDB MMTF decoder object to access residue and +# coordinate information +# +# Note: only kept for legacy reasons, MMTF was phased out by RCSB on July 2nd, 2024 :( +# """ +# def __init__(self, mmtf): +# """ +# Initialize by further decoding information in mmtf object. +# +# Normally one should use from_file() and from_id() class +# methods to create object. 
+# +# Parameters +# ---------- +# mmtf : mmtf.api.mmtf_reader.MMTFDecoder +# MMTF decoder object (as returned by fetch or parse +# function in mmtf module) +# """ +# def _get_range(object_counts): +# """ +# Extract index ranges for chains, residues and atoms +# """ +# last_element = np.cumsum(object_counts, dtype=int) +# first_element = np.concatenate( +# (np.zeros(1, dtype=int), last_element[:-1]) +# ) +# +# return first_element, last_element +# +# # store raw MMTF decoder +# self.mmtf = mmtf +# +# # Step 1: summarize basic information about model +# # number of models in structure +# self.num_models = len(mmtf.chains_per_model) +# +# self.first_chain_index, self.last_chain_index = _get_range( +# mmtf.chains_per_model +# ) +# +# # collect list which chain corresponds to which entity +# self.chain_to_entity = {} +# for i, ent in enumerate(mmtf.entity_list): +# for c in ent["chainIndexList"]: +# self.chain_to_entity[c] = i +# +# # Step 2: identify residues and corresponding atom indices +# +# # first/last index of residue (aka group) for each chain; +# # index these lists with index of chain +# self.first_residue_index, self.last_residue_index = _get_range( +# mmtf.groups_per_chain +# ) +# +# # store explicit information about composition of residues +# def _group_info(field): +# return np.array( +# [x[field] for x in mmtf.group_list], dtype=np.object_ +# ) +# +# # three and one letter code names of different groups +# self._residue_names_3 = _group_info("groupName") +# self._residue_names_1 = _group_info("singleLetterCode") +# +# # atom types and overall number of atoms in each type of residue +# self._residue_type_atom_names = _group_info("atomNameList") +# self._residue_type_elements = _group_info("elementList") +# self._residue_type_charges = _group_info("formalChargeList") +# +# self._residue_type_num_atoms = np.array([len(x) for x in self._residue_type_atom_names]) +# +# # prepare alternative location list as numpy array, with empty strings +# self.alt_loc_list = np.array( +# [x.replace("\x00", "") for x in mmtf.alt_loc_list] +# ) +# +# # compute first and last atom index for each residue/group +# # (by fetching corresponding length for each group based on group type) +# self._residue_num_atoms = self._residue_type_num_atoms[mmtf.group_type_list] +# +# self.first_atom_index, self.last_atom_index = _get_range( +# self._residue_num_atoms +# ) +# +# # assemble residue ID strings +# self.residue_ids = np.array([ +# "{}{}".format(group_id, ins_code.replace("\x00", "")) +# for group_id, ins_code +# in zip(mmtf.group_id_list, mmtf.ins_code_list) +# ]) +# +# # map secondary structure codes into DSSP symbols +# self.sec_struct = np.array( +# [MMTF_DSSP_CODE_MAP[x] for x in mmtf.sec_struct_list] +# ) +# +# @classmethod +# def from_file(cls, filename): +# """ +# Initialize structure from MMTF file +# +# Parameters +# ---------- +# filename : str +# Path of MMTF file +# +# Returns +# ------- +# PDB +# initialized PDB structure +# """ +# try: +# return cls(parse(filename)) +# except FileNotFoundError as e: +# raise ResourceError( +# "Could not find file {}".format(filename) +# ) from e +# +# @classmethod +# def from_id(cls, pdb_id): +# """ +# Initialize structure by PDB ID (fetches +# structure from RCSB servers) +# +# Parameters +# ---------- +# pdb_id : str +# PDB identifier (e.g. 
1hzx) +# +# Returns +# ------- +# PDB +# initialized PDB structure +# """ +# try: +# return cls(fetch(pdb_id)) +# except HTTPError as e: +# raise ResourceError( +# "Could not fetch MMTF data for {}".format(pdb_id) +# ) from e +# +# def get_chain(self, chain, model=0): +# """ +# Extract residue information and atom coordinates +# for a given chain in PDB structure +# +# Parameters +# ---------- +# chain : str +# Name of chain to be extracted (e.g. "A") +# model : int, optional (default: 0) +# Index of model to be extracted +# +# Returns +# ------- +# Chain +# Chain object containing DataFrames listing residues +# and atom coordinates +# """ +# if not (0 <= model < self.num_models): +# raise ValueError( +# "Illegal model index, can be from 0 up to {}".format( +# self.num_models - 1 +# ) +# ) +# +# # first and last index of chains corresponding to selected model +# first_chain_index = self.first_chain_index[model] +# last_chain_index = self.last_chain_index[model] +# +# # which model chains match our target PDB chain? +# chain_names = np.array( +# self.mmtf.chain_name_list[first_chain_index:last_chain_index] +# ) +# +# # indices of chains that match chain name, in current model +# indices = np.arange(first_chain_index, last_chain_index, dtype=int) +# target_chain_indeces = indices[chain_names == chain] +# +# if len(target_chain_indeces) == 0: +# raise ValueError( +# "No chains with given name found" +# ) +# +# # collect internal indeces of all residues/groups in chain +# residue_indeces = np.concatenate([ +# np.arange(self.first_residue_index[i], self.last_residue_index[i]) +# for i in target_chain_indeces +# ]) +# +# # chain indeces and identifiers for all residues +# # (not to be confused with chain name!); +# # maximum length 4 characters according to MMTF spec +# chain_indeces = np.concatenate([ +# np.full( +# self.last_residue_index[i] - self.first_residue_index[i], +# i, dtype=int +# ) for i in target_chain_indeces +# ]) +# +# chain_ids = np.array(self.mmtf.chain_id_list)[chain_indeces] +# +# # create dataframe representation of selected chain +# m = self.mmtf +# group_types = m.group_type_list[residue_indeces] +# +# res = OrderedDict([ +# ("id", self.residue_ids[residue_indeces]), +# ("seqres_id", m.sequence_index_list[residue_indeces]), +# ("coord_id", self.residue_ids[residue_indeces]), +# ("one_letter_code", self._residue_names_1[group_types]), +# ("three_letter_code", self._residue_names_3[group_types]), +# ("chain_index", chain_indeces), +# ("chain_id", chain_ids), +# ("sec_struct", self.sec_struct[residue_indeces]), +# ]) +# +# # also store entity IDs and indices +# res_df = pd.DataFrame(res).assign( +# entity_index=lambda df: df.chain_index.map(self.chain_to_entity), +# entity_id=lambda df: df.entity_index + 1 +# ) +# +# # shift seqres indexing to start at 1; +# # However, do not add to positions without sequence index (-1) +# res_df.loc[res_df.seqres_id >= 0, "seqres_id"] += 1 +# +# # turn all indeces into strings and create proper NaN values +# res_df.loc[:, "coord_id"] = ( +# res_df.loc[:, "coord_id"].astype(str) +# ) +# +# res_df.loc[:, "seqres_id"] = ( +# res_df.loc[:, "seqres_id"].astype(str).replace("-1", np.nan) +# ) +# # copy updated coordinate indeces +# res_df.loc[:, "id"] = res_df.loc[:, "coord_id"] +# +# res_df.loc[:, "one_letter_code"] = res_df.loc[:, "one_letter_code"].replace("?", np.nan) +# res_df.loc[:, "sec_struct"] = res_df.loc[:, "sec_struct"].replace("", np.nan) +# +# # reduction to 3-state secondary structure (following Rost & Sander) +# res_df.loc[:, 
"sec_struct_3state"] = res_df.loc[:, "sec_struct"].map( +# lambda x: DSSP_3_STATE_MAP[x], na_action="ignore" +# ) +# +# # proxy for HETATM records - will not work e.g. for MSE, +# # which is listed as "M" +# res_df.loc[:, "hetatm"] = res_df.one_letter_code.isnull() +# +# # finally, get atom names and coordinates for all residues +# atom_first = self.first_atom_index[residue_indeces] +# atom_last = self.last_atom_index[residue_indeces] +# atom_names = np.concatenate(self._residue_type_atom_names[group_types]) +# elements = np.concatenate(self._residue_type_elements[group_types]) +# charges = np.concatenate(self._residue_type_charges[group_types]) +# +# residue_number = np.repeat(res_df.index, atom_last - atom_first) +# atom_indices = np.concatenate([ +# np.arange(self.first_atom_index[i], self.last_atom_index[i]) +# for i in residue_indeces +# ]) +# +# coords = OrderedDict([ +# ("residue_index", residue_number), +# ("atom_id", self.mmtf.atom_id_list[atom_indices]), +# ("atom_name", atom_names), +# ("element", elements), +# ("charge", charges), +# ("x", self.mmtf.x_coord_list[atom_indices]), +# ("y", self.mmtf.y_coord_list[atom_indices]), +# ("z", self.mmtf.z_coord_list[atom_indices]), +# ("alt_loc", self.alt_loc_list[atom_indices]), +# ("occupancy", self.mmtf.occupancy_list[atom_indices]), +# ("b_factor", self.mmtf.b_factor_list[atom_indices]), +# ]) +# +# coord_df = pd.DataFrame(coords) +# +# return Chain(res_df, coord_df) class ClassicPDB: diff --git a/requirements.txt b/requirements.txt index 079738ea..e7397c17 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,6 @@ numba ruamel.yaml matplotlib requests -mmtf-python click filelock psutil @@ -15,3 +14,4 @@ biopython seaborn billiard scikit-learn +msgpack \ No newline at end of file diff --git a/setup.py b/setup.py index 93f8beec..4ba746ab 100644 --- a/setup.py +++ b/setup.py @@ -98,8 +98,8 @@ install_requires=['setuptools>=18.2', 'numpy', 'pandas', 'scipy', 'numba', 'ruamel.yaml<0.18', 'matplotlib', 'requests', - 'mmtf-python', 'click', 'filelock', 'psutil', 'bokeh', 'jinja2', - 'biopython', 'seaborn', 'billiard', 'scikit-learn', + 'click', 'filelock', 'psutil', 'bokeh', 'jinja2', + 'biopython>=1.84', 'seaborn', 'billiard', 'scikit-learn', 'msgpack' ], ) From cde87ae6e7463a0ab76d485234f9b449ac03c8a1 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Sat, 6 Jul 2024 10:06:50 +0200 Subject: [PATCH 35/47] Make model handling consistent to previous class --- evcouplings/compare/pdb.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index 91fd1201..439e6b08 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -527,13 +527,16 @@ def __init__(self, filehandle, keep_full_data=False): self.secondary_structure = pd.DataFrame(sse_raw) - # store information about models/chains for quick retrieval and verification - self.models = list(self.atom_table.model_number.unique()) + # store information about models/chains for quick retrieval and verification; + # subtract 0 to start numbering consistently to how this was handled with MMTF + self.models = list(sorted(self.atom_table.model_number.unique() - 1)) # model number to auth chains self.model_to_chains = self.atom_table[ ["model_number", "auth_asym_id"] - ].drop_duplicates().groupby( + ].drop_duplicates().assign( + model_number=lambda df: df.model_number - 1 + ).groupby( "model_number" ).agg( lambda s: list(s) @@ -541,7 +544,9 @@ def __init__(self, filehandle, 
keep_full_data=False): self.model_to_asym_ids = self.atom_table[ ["model_number", "label_asym_id"] - ].drop_duplicates().groupby( + ].drop_duplicates().assign( + model_number=lambda df: df.model_number - 1 + ).groupby( "model_number" ).agg( lambda s: list(s) @@ -613,7 +618,7 @@ def from_id(cls, pdb_id, keep_full_data=False): with gzip.GzipFile(fileobj=BytesIO(r.content), mode="r") as f: return cls(f, keep_full_data=keep_full_data) - def get_chain(self, chain, model=1, is_author_id=True): + def get_chain(self, chain, model=0, is_author_id=True): """ Extract residue information and atom coordinates for a given chain in PDB structure @@ -622,8 +627,8 @@ def get_chain(self, chain, model=1, is_author_id=True): ---------- chain : str ID of chain to be extracted (e.g. "A") - model : int, optional (default: 1) - Identifier of model to be extracted + model : int, optional (default: 0) + Identifier of model to be extracted, starting counting at 0 is_author_id : bool (default: True) If true, interpret chain parameter as author chain identifier; if false, interpret as label_asym_id @@ -654,7 +659,7 @@ def get_chain(self, chain, model=1, is_author_id=True): # filter atom table to model + chain selection atoms = self.atom_table.query( - f"model_number == @model and {chain_field} == @chain" + f"model_number - 1 == @model and {chain_field} == @chain" ).assign( # create coordinate ID from author residue ID + insertion code # (this should be unique and circumvents issues from 0 seqres values if selecting based on author chain ID) From 2872d3dcc1814e29bda9fae806e7ffa57a809f65 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Sat, 6 Jul 2024 14:23:30 +0200 Subject: [PATCH 36/47] Replace pd.append with pd.concat --- evcouplings/compare/distances.py | 6 ++---- evcouplings/couplings/pairs.py | 2 +- evcouplings/fold/protocol.py | 2 +- evcouplings/utils/summarize.py | 4 ++-- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/evcouplings/compare/distances.py b/evcouplings/compare/distances.py index 8db22f9b..f6a2d21f 100644 --- a/evcouplings/compare/distances.py +++ b/evcouplings/compare/distances.py @@ -323,7 +323,7 @@ def _add_axis(df, axis): else: res_i = _add_axis(self.residues_i, "i") res_j = _add_axis(self.residues_j, "j") - residues = res_i.append(res_j) + residues = pd.concat([res_i, res_j]) # save residue table residue_table_filename = filename + ".csv" @@ -1273,9 +1273,7 @@ def _get_chains(sifts_result): # if no structures given, or path to files, load first structures = _prepare_structures( structures, - sifts_result_i.hits.pdb_id.append( - sifts_result_j.hits.pdb_id - ), + set(sifts_result_i.hits.pdb_id) | set(sifts_result_j.hits.pdb_id), raise_missing ) diff --git a/evcouplings/couplings/pairs.py b/evcouplings/couplings/pairs.py index 5b284903..47446be8 100644 --- a/evcouplings/couplings/pairs.py +++ b/evcouplings/couplings/pairs.py @@ -122,7 +122,7 @@ def enrichment(ecs, num_pairs=1.0, score="cn", min_seqdist=6): columns={"i": "j", "j": "i", "A_i": "A_j", "A_j": "A_i"} ) - stacked_ecs = top_ecs.append(flipped) + stacked_ecs = pd.concat([top_ecs, flipped]) # now sum cumulative strength of EC for each position ec_sums = pd.DataFrame( diff --git a/evcouplings/fold/protocol.py b/evcouplings/fold/protocol.py index 12ee73a1..232884ec 100644 --- a/evcouplings/fold/protocol.py +++ b/evcouplings/fold/protocol.py @@ -257,7 +257,7 @@ def _eliminate_altloc(chain): comp = comp.sort_values("tm", ascending=False) single_results[exp_file] = comp - full_result = full_result.append(comp) + full_result = 
pd.concat([full_result, comp]) return full_result, single_results diff --git a/evcouplings/utils/summarize.py b/evcouplings/utils/summarize.py index ba5e5012..c86a3d5a 100644 --- a/evcouplings/utils/summarize.py +++ b/evcouplings/utils/summarize.py @@ -91,7 +91,7 @@ def protein_monomer(prefix, configs): stat_df.loc[0, "precision"] = ec_comp.iloc[L]["precision"] # finally, append to global table - ali_table = ali_table.append(stat_df) + ali_table = pd.concat([ali_table, stat_df]) # sort table by sequence search threshold ali_table = ali_table.sort_values(by="domain_threshold") @@ -337,7 +337,7 @@ def protein_complex(prefix, configs): stat_df.loc[0, "inter_precision"] = ec_comp_inter.iloc[NUM_INTER]["segmentwise_precision"] # finally, append to global table - ali_table = ali_table.append(stat_df) + ali_table = pd.concat([ali_table, stat_df]) # save ali statistics table table_file = prefix + "_job_statistics_summary.csv" From 4778267e736832bb70d4b7adc6e5371598c4311a Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Sun, 7 Jul 2024 10:46:28 +0200 Subject: [PATCH 37/47] fix overlapping secondary structure elements --- evcouplings/compare/pdb.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index 439e6b08..da16bf54 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -525,7 +525,12 @@ def __init__(self, filehandle, keep_full_data=False): "sec_struct_3state": sse_type, }) - self.secondary_structure = pd.DataFrame(sse_raw) + # drop duplicates, there are overlapping helix segment annotations e.g. for PDB 6cup:A:Asp92 + self.secondary_structure = pd.DataFrame( + sse_raw + ).drop_duplicates( + subset=["label_asym_id", "label_seq_id"] + ) # store information about models/chains for quick retrieval and verification; # subtract 0 to start numbering consistently to how this was handled with MMTF From cccc1ca746fc2e47c5c9cdeb452ae4b47afa5941 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Sun, 7 Jul 2024 10:51:46 +0200 Subject: [PATCH 38/47] fix numpy warning (#306) --- evcouplings/visualize/parameters.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/evcouplings/visualize/parameters.py b/evcouplings/visualize/parameters.py index 2facfc11..064cab1d 100644 --- a/evcouplings/visualize/parameters.py +++ b/evcouplings/visualize/parameters.py @@ -134,7 +134,11 @@ def evzoom_data(model, ec_threshold=0.9, freq_threshold=0.01, fi = model.fi() q = model.num_symbols - B = -fi * np.log2(fi) + # copy and blank out fi matrix to avoid numpy warnings + # taking log of 0 (note where argument does not help) + fi_no0 = fi.copy() + fi_no0[fi <= 0] = np.nan + B = -fi * np.log2(fi_no0) B[fi <= 0] = 0 R = np.log2(q) - B.sum(axis=1) From 05c642334c980077b2add420f5c58f6108d8fb5a Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Sun, 7 Jul 2024 10:59:31 +0200 Subject: [PATCH 39/47] fix scikit-learn warning, cf. 
#306 --- evcouplings/couplings/pairs.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/evcouplings/couplings/pairs.py b/evcouplings/couplings/pairs.py index 47446be8..068b2853 100644 --- a/evcouplings/couplings/pairs.py +++ b/evcouplings/couplings/pairs.py @@ -1028,9 +1028,10 @@ def score(self, ecs, freqs, theta, effective_sequences, num_sites=None, score="c feature_table = ecs_full.reindex(self.feature_names, axis=1) # predict probabilities of contact and decision function - # (target class/true EC := 1) - probs = self.classifier.predict_proba(feature_table)[:, 1] - decision_func = self.classifier.decision_function(feature_table) + # (target class/true EC := 1); + # note: apply to .values to avoid sklearn warnings that model does not have feature names + probs = self.classifier.predict_proba(feature_table.values)[:, 1] + decision_func = self.classifier.decision_function(feature_table.values) # assign to EC table ecs_final = ecs_full.assign( From 656725137ed404b1afda1095f47a356d288dd2a5 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Mon, 8 Jul 2024 12:33:42 +0200 Subject: [PATCH 40/47] better backward compatibility for chain extraction --- evcouplings/compare/pdb.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index da16bf54..6b65c226 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -534,24 +534,23 @@ def __init__(self, filehandle, keep_full_data=False): # store information about models/chains for quick retrieval and verification; # subtract 0 to start numbering consistently to how this was handled with MMTF - self.models = list(sorted(self.atom_table.model_number.unique() - 1)) + self.models = list( + sorted(self.atom_table.model_number.unique()) + ) - # model number to auth chains + # model number to auth ID mapping self.model_to_chains = self.atom_table[ ["model_number", "auth_asym_id"] - ].drop_duplicates().assign( - model_number=lambda df: df.model_number - 1 - ).groupby( + ].drop_duplicates().groupby( "model_number" ).agg( lambda s: list(s) )["auth_asym_id"].to_dict() + # model number to asym ID mapping self.model_to_asym_ids = self.atom_table[ ["model_number", "label_asym_id"] - ].drop_duplicates().assign( - model_number=lambda df: df.model_number - 1 - ).groupby( + ].drop_duplicates().groupby( "model_number" ).agg( lambda s: list(s) @@ -633,7 +632,9 @@ def get_chain(self, chain, model=0, is_author_id=True): chain : str ID of chain to be extracted (e.g. "A") model : int, optional (default: 0) - Identifier of model to be extracted, starting counting at 0 + *Index* of model to be extracted, starting counting at 0. Note that for backwards + compatibility, this is *not* the actual PDB model identifier but indexes the model + identifiers in self.models, i.e. 
model must be >= 0 and < len(self.models) is_author_id : bool (default: True) If true, interpret chain parameter as author chain identifier; if false, interpret as label_asym_id @@ -645,14 +646,17 @@ def get_chain(self, chain, model=0, is_author_id=True): and atom coordinates """ # check if valid model was requested - if model not in self.models: + if not 0 <= model < len(self.models): raise ValueError( - "Invalid model identifier, valid options are: " + (",".join(map(str, self.models))) + f"Invalid model index, valid options: {','.join(map(str, range(len(self.models))))}" ) + # map model index to model number/identifier + model_number = self.models[model] + # check if valid chain was requested - if ((is_author_id and chain not in self.model_to_chains[model]) or - (not is_author_id and chain not in self.model_to_asym_ids[model])): + if ((is_author_id and chain not in self.model_to_chains[model_number]) or + (not is_author_id and chain not in self.model_to_asym_ids[model_number])): raise ValueError( "Invalid chain selection, check self.model_to_chains / self.model_to_asym_ids for options" ) @@ -664,7 +668,7 @@ def get_chain(self, chain, model=0, is_author_id=True): # filter atom table to model + chain selection atoms = self.atom_table.query( - f"model_number - 1 == @model and {chain_field} == @chain" + f"model_number == @model_number and {chain_field} == @chain" ).assign( # create coordinate ID from author residue ID + insertion code # (this should be unique and circumvents issues from 0 seqres values if selecting based on author chain ID) From 7e2c0bbb364ec78c3acf08fa51c30e1794e34f92 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Mon, 8 Jul 2024 12:57:31 +0200 Subject: [PATCH 41/47] fix charge handling --- evcouplings/compare/pdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index 6b65c226..d721429c 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -376,7 +376,7 @@ def to_file(self, fileobj, chain_id="A", end=True, first_atom_id=1): # print charge if we have one (optional) charge = r["charge"] # test < and > to exclude nan values - if charge < 0 or charge > 0: + if isinstance(charge, int) and charge < 0 or charge > 0: charge_sign = "-" if charge < 0 else "+" charge_value = abs(charge) charge_str = "{}{}".format(charge_value, charge_sign) From 58aa2c07277dee5eb660eacfee06dfc9a3eaeead Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Mon, 8 Jul 2024 13:02:55 +0200 Subject: [PATCH 42/47] fix charge handling pt2 --- evcouplings/compare/pdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index d721429c..f1897c6e 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -376,7 +376,7 @@ def to_file(self, fileobj, chain_id="A", end=True, first_atom_id=1): # print charge if we have one (optional) charge = r["charge"] # test < and > to exclude nan values - if isinstance(charge, int) and charge < 0 or charge > 0: + if isinstance(charge, int) and (charge < 0 or charge > 0): charge_sign = "-" if charge < 0 else "+" charge_value = abs(charge) charge_str = "{}{}".format(charge_value, charge_sign) From 9bfa113e0db8aa611e130ee56e7d7e21447261b8 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Mon, 8 Jul 2024 13:13:48 +0200 Subject: [PATCH 43/47] fix alt_loc --- evcouplings/compare/pdb.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index 
f1897c6e..2e468291 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -445,6 +445,7 @@ def __init__(self, filehandle, keep_full_data=False): "_atom_site.type_symbol": "type_symbol", # x "_atom_site.label_atom_id": "label_atom_id", # x "_atom_site.auth_atom_id": "auth_atom_id", + "_atom_site.label_alt_id": "label_alt_id", # residue/molecule types (three-letter code) "_atom_site.label_comp_id": "label_comp_id", # x @@ -733,6 +734,7 @@ def get_chain(self, chain, model=0, is_author_id=True): "z": "z", "occupancy": "occupancy", "b_factor": "b_factor", + "label_alt_id": "alt_loc", } # add information about residue index to atoms From 476700b43bb087e73dc2f6a7b6c4230f2542541d Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Mon, 8 Jul 2024 13:19:08 +0200 Subject: [PATCH 44/47] fix yet another pandas deprecation --- evcouplings/compare/distances.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evcouplings/compare/distances.py b/evcouplings/compare/distances.py index f6a2d21f..674e7eb2 100644 --- a/evcouplings/compare/distances.py +++ b/evcouplings/compare/distances.py @@ -770,7 +770,7 @@ def _get_col_name(col_name): # extract coverage segments for all individual structures segments = { _get_col_name(col_name): find_segments(series.dropna().sort_index().index) - for col_name, series in coverage_cols.iteritems() + for col_name, series in coverage_cols.items() } return segments From f32491f00bd3d702812ce2916add248728ad019e Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Tue, 23 Jul 2024 17:05:35 +0200 Subject: [PATCH 45/47] fix KeyError due to absence of secondary structure sections --- evcouplings/compare/pdb.py | 58 +++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index 2e468291..c46f473f 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -500,15 +500,23 @@ def __init__(self, filehandle, keep_full_data=False): name: _decode(data[source_column]) for source_column, name in ATOM_TARGET_COLS.items() }) - # decode information into dataframe with BioPython helper method - self.helix_table = pd.DataFrame({ - name: _decode(data[source_column]) for source_column, name in HELIX_TARGET_COLS.items() - }) - - # decode information into dataframe with BioPython helper method - self.sheet_table = pd.DataFrame({ - name: _decode(data[source_column]) for source_column, name in SHEET_TARGET_COLS.items() - }) + # decode information into dataframe with BioPython helper method; note this section may not be + # present if no helices exist in the structure + try: + self.helix_table = pd.DataFrame({ + name: _decode(data[source_column]) for source_column, name in HELIX_TARGET_COLS.items() + }) + except KeyError: + self.helix_table = None + + # decode information into dataframe with BioPython helper method; note this section may not be + # present if no sheets exist in the structure + try: + self.sheet_table = pd.DataFrame({ + name: _decode(data[source_column]) for source_column, name in SHEET_TARGET_COLS.items() + }) + except KeyError: + self.sheet_table = None # create secondary structure table for merging to chain tables # (will only contain helix/H and strand/E, coil/C will need to be filled in) @@ -517,6 +525,10 @@ def __init__(self, filehandle, keep_full_data=False): ("H", self.helix_table), ("E", self.sheet_table) ]: + # skip if secondary structure element not present in PDB file at all + if sse_table is None: + continue + for _, row in 
sse_table.iterrows(): assert row.beg_label_asym_id == row.end_label_asym_id for seq_id in range(row.beg_label_seq_id, row.end_label_seq_id + 1): @@ -527,11 +539,14 @@ def __init__(self, filehandle, keep_full_data=False): }) # drop duplicates, there are overlapping helix segment annotations e.g. for PDB 6cup:A:Asp92 - self.secondary_structure = pd.DataFrame( - sse_raw - ).drop_duplicates( - subset=["label_asym_id", "label_seq_id"] - ) + if len(sse_raw) > 0: + self.secondary_structure = pd.DataFrame( + sse_raw + ).drop_duplicates( + subset=["label_asym_id", "label_seq_id"] + ) + else: + self.secondary_structure = None # store information about models/chains for quick retrieval and verification; # subtract 0 to start numbering consistently to how this was handled with MMTF @@ -692,11 +707,16 @@ def get_chain(self, chain, model=0, is_author_id=True): res.index.name = "residue_index" # merge secondary structure information (left outer join as coil is missing from table) - res_sse = res.merge( - self.secondary_structure, - on=("label_seq_id", "label_asym_id"), - how="left" - ) + if self.secondary_structure is not None: + res_sse = res.merge( + self.secondary_structure, + on=("label_seq_id", "label_asym_id"), + how="left" + ) + else: + res_sse = res.assign( + sec_struct_3state=np.nan + ) res_sse.loc[ res_sse.sec_struct_3state.isnull() & (res_sse.label_seq_id > 0), From 35d93d03f6ac63f9ce2df79f46744c12e855a101 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Tue, 27 Aug 2024 11:13:54 +0200 Subject: [PATCH 46/47] fix atom_id overwrite by coord_id --- evcouplings/compare/pdb.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index c46f473f..4c41495d 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -691,7 +691,6 @@ def get_chain(self, chain, model=0, is_author_id=True): coord_id=lambda df: df.auth_seq_id.astype(str) + df.insertion_code, seqres_id=lambda df: df.label_seq_id.astype(str).replace("0", np.nan), one_letter_code=lambda df: df.label_comp_id.map(AA3_to_AA1, na_action="ignore"), - id=lambda df: df.coord_id, # note that MSE will now be labeled as HETATM, which was not the case with MMTF hetatm=lambda df: df.record_type == "HETATM", ).reset_index( @@ -701,6 +700,8 @@ def get_chain(self, chain, model=0, is_author_id=True): # create residue table by de-duplicating atoms res = atoms.drop_duplicates( subset=["coord_id"] + ).assign( + id=lambda df: df.coord_id ).reset_index( drop=True ) From 522903baabfc57985e9ed43f53907b2128e2c7d4 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Thu, 26 Sep 2024 17:27:28 +0200 Subject: [PATCH 47/47] ensure chain IDs are always str --- evcouplings/compare/pdb.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index 4c41495d..850a4be3 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -498,7 +498,12 @@ def __init__(self, filehandle, keep_full_data=False): # decode information into dataframe with BioPython helper method self.atom_table = pd.DataFrame({ name: _decode(data[source_column]) for source_column, name in ATOM_TARGET_COLS.items() - }) + }).assign( + # make sure chain identifiers are strings, in some pathologic cases, these are int rather than str + # (e.g. 
entry 6swy) + auth_asym_id=lambda df: df.auth_asym_id.astype(str), + label_asym_id=lambda df: df.label_asym_id.astype(str), + ) # decode information into dataframe with BioPython helper method; note this section may not be # present if no helices exist in the structure
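
A minimal, self-contained sketch of the defensive-parsing pattern that patches 45 and 47 above apply when loading mmCIF categories into pandas tables: optional categories (e.g. helix annotations) fall back to None instead of raising KeyError, and chain identifiers are cast to str before any later comparison. This assumes a parsed mmCIF dictionary mapping category tags to lists of values (as e.g. Biopython's MMCIF2Dict produces); build_tables and the exact tag names here are illustrative only, not the actual evcouplings implementation.

import pandas as pd


def build_tables(parsed_cif):
    # atom records are mandatory; chain IDs may decode as int for numeric chain
    # names (e.g. entry 6swy), so normalize to str before comparing to user input
    atom_table = pd.DataFrame({
        "auth_asym_id": parsed_cif["_atom_site.auth_asym_id"],
        "label_comp_id": parsed_cif["_atom_site.label_comp_id"],
    }).assign(
        auth_asym_id=lambda df: df.auth_asym_id.astype(str)
    )

    # helix annotations are optional; a structure without helices simply has no
    # such tags in the file, so fall back to None and let callers guard on it
    try:
        helix_table = pd.DataFrame({
            "beg_label_seq_id": parsed_cif["_struct_conf.beg_label_seq_id"],
            "end_label_seq_id": parsed_cif["_struct_conf.end_label_seq_id"],
        })
    except KeyError:
        helix_table = None

    return atom_table, helix_table


if __name__ == "__main__":
    # hypothetical parsed mmCIF content: numeric chain IDs, no helix records
    cif = {
        "_atom_site.auth_asym_id": [1, 1],
        "_atom_site.label_comp_id": ["ALA", "GLY"],
    }
    atoms, helices = build_tables(cif)
    assert atoms.auth_asym_id.tolist() == ["1", "1"]
    assert helices is None

Downstream consumers then branch on None (as the get_chain changes above do for self.secondary_structure), e.g. merging secondary structure with a left join when the table exists and otherwise assigning a NaN sec_struct_3state column, rather than assuming the category was present in the file.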