From d472e4b3e38cb9124a8ee4678f271cb6ec92a396 Mon Sep 17 00:00:00 2001 From: Aaron Kollasch Date: Thu, 2 Mar 2023 16:30:47 -0500 Subject: [PATCH 01/10] Support a3m format in existing alignment protocol --- evcouplings/align/alignment.py | 10 ++++++++-- evcouplings/align/protocol.py | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/evcouplings/align/alignment.py b/evcouplings/align/alignment.py index 20d95210..2e77f733 100644 --- a/evcouplings/align/alignment.py +++ b/evcouplings/align/alignment.py @@ -9,6 +9,7 @@ import re from collections import namedtuple, OrderedDict, defaultdict from copy import deepcopy +from pathlib import Path import numpy as np from numba import jit @@ -326,7 +327,7 @@ def write_a3m(sequences, fileobj, insert_gap=INSERT_GAP, width=80): fileobj.write(seq.replace(insert_gap, "") + "\n") -def detect_format(fileobj): +def detect_format(fileobj, filepath=""): """ Detect if an alignment file is in FASTA or Stockholm format. @@ -335,10 +336,12 @@ def detect_format(fileobj): ---------- fileobj : file-like obj Alignment file for which to detect format + filepath : string or path-like obj + Path of alignment file Returns ------- - format : {"fasta", "stockholm", None} + format : {"fasta", "a3m", "stockholm", None} Format of alignment, None if not detectable """ for i, line in enumerate(fileobj): @@ -348,6 +351,9 @@ def detect_format(fileobj): # This indicates a FASTA file if line.startswith(">"): + # A3M files have extension .a3m + if Path(filepath).suffix.lower() == ".a3m": + return "a3m" return "fasta" # Skip comment lines and empty lines for FASTA detection diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py index f2ccbc3a..7fa6282b 100644 --- a/evcouplings/align/protocol.py +++ b/evcouplings/align/protocol.py @@ -689,7 +689,7 @@ def existing(**kwargs): # first try to autodetect format of alignment with open(input_alignment) as f: - format = detect_format(f) + format = detect_format(f, filepath=input_alignment) if format is None: raise InvalidParameterError( "Format of input alignment {} could not be " From 7192161ceca6c64f8d11aba16765178f97d50711 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Wed, 10 May 2023 15:34:35 +0200 Subject: [PATCH 02/10] Prepare tests for new version, update readme and setup --- .github/workflows/build_and_test.yml | 2 +- .github/workflows/build_test_and_push.yml | 2 +- README.md | 7 +++---- setup.py | 5 +++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e7be106a..54c3ad0b 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: - python-version: [3.8] + python-version: [3.10] steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/build_test_and_push.yml b/.github/workflows/build_test_and_push.yml index ec380fda..8a4b4543 100644 --- a/.github/workflows/build_test_and_push.yml +++ b/.github/workflows/build_test_and_push.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8] + python-version: [3.10] steps: - uses: actions/checkout@v2 diff --git a/README.md b/README.md index ecee7c4e..b868fc2f 100644 --- a/README.md +++ b/README.md @@ -7,11 +7,12 @@ Predict protein structure, function and mutations using evolutionary sequence co ### Installing the Python package -If you are simply interested in using EVcouplings as a library, installing the Python package is all you need to do (unless you use functions that depend on external tools). If you want to run the *evcouplings* application (alignment generation, model parameter inference, structure prediction, etc.) you will also need to follow the sections on installing external tools and databases. +* If you are simply interested in using EVcouplings as a library, installing the Python package is all you need to do (unless you use functions that depend on external tools). +* If you want to run the *evcouplings* application (alignment generation, model parameter inference, structure prediction, etc.) you will also need to follow the sections on installing external tools and databases. #### Requirements -EVcouplings requires a Python >= 3.5 installation. Since it depends on some packages that can be tricky to install using pip (numba, numpy, ...), we recommend using the [Anaconda Python distribution](https://www.continuum.io/downloads). In case you are creating a new conda environment or using miniconda, please make sure to run `conda install anaconda` before running pip, or otherwise the required packages will not be present. +EVcouplings actively supports Python >= 3.10 installations. #### Installation @@ -27,8 +28,6 @@ and to update to the latest version after previously installing EVcouplings from pip install -U --no-deps https://github.com/debbiemarkslab/EVcouplings/archive/develop.zip -Installation will take seconds. - ### External software tools *After installation and before running compute jobs, the paths to the respective binaries of the following external tools have to be set in your EVcouplings job configuration file(s).* diff --git a/setup.py b/setup.py index de2b95b7..9686763f 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ name='evcouplings', # Version: - version='0.1.2', + version='0.2', description='A Framework for evolutionary couplings analysis', long_description=readme, @@ -49,7 +49,8 @@ # The license as you wish (should match "license" above) 'License :: OSI Approved :: MIT License', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', ], # What EVcouplings relates to: From b4fc441f18efb770a53aeb4e183be2fd7595b756 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Wed, 10 May 2023 15:37:24 +0200 Subject: [PATCH 03/10] Keep as is due to github security restrictions --- .github/workflows/build_and_test.yml | 2 +- .github/workflows/build_test_and_push.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 54c3ad0b..e7be106a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: - python-version: [3.10] + python-version: [3.8] steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/build_test_and_push.yml b/.github/workflows/build_test_and_push.yml index 8a4b4543..ec380fda 100644 --- a/.github/workflows/build_test_and_push.yml +++ b/.github/workflows/build_test_and_push.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.10] + python-version: [3.8] steps: - uses: actions/checkout@v2 From e1362407a0b65d63ca07df55f44cb17b0a3722b7 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Wed, 10 May 2023 15:47:04 +0200 Subject: [PATCH 04/10] Fix numpy dtype deprecation, fixes #291 --- evcouplings/align/alignment.py | 2 +- evcouplings/couplings/model.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/evcouplings/align/alignment.py b/evcouplings/align/alignment.py index 20d95210..9a72f71b 100644 --- a/evcouplings/align/alignment.py +++ b/evcouplings/align/alignment.py @@ -422,7 +422,7 @@ def sequences_to_matrix(sequences): N = len(sequences) L = len(next(iter(sequences))) - matrix = np.empty((N, L), dtype=np.str) + matrix = np.empty((N, L), dtype=str) for i, seq in enumerate(sequences): if len(seq) != L: diff --git a/evcouplings/couplings/model.py b/evcouplings/couplings/model.py index 30f5ce80..e8977104 100755 --- a/evcouplings/couplings/model.py +++ b/evcouplings/couplings/model.py @@ -609,7 +609,7 @@ def convert_sequences(self, sequences): ) ) - S = np.empty((len(sequences), L_seq), dtype=np.int) + S = np.empty((len(sequences), L_seq), dtype=int) try: for i, s in enumerate(sequences): @@ -689,8 +689,8 @@ def delta_hamiltonian(self, substitutions, verify_mutants=True): 2) delta J_ij, 3) delta h_i """ - pos = np.empty(len(substitutions), dtype=np.int) - subs = np.empty(len(substitutions), dtype=np.int) + pos = np.empty(len(substitutions), dtype=int) + subs = np.empty(len(substitutions), dtype=int) try: for i, (subs_pos, subs_from, subs_to) in enumerate(substitutions): From 78633124b2c4ed3b6ba3f47aac4969979aad77e2 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Wed, 10 May 2023 18:24:27 +0200 Subject: [PATCH 05/10] fix deprecation in pandas df equality checking in tests --- test/TestComplex.py | 2 +- test/TestMutation.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/TestComplex.py b/test/TestComplex.py index 873b24f7..d5ffa922 100644 --- a/test/TestComplex.py +++ b/test/TestComplex.py @@ -402,7 +402,7 @@ def test_find_possible_partners(self): pd.testing.assert_frame_equal( self.possible_partners, _possible_partners, - check_less_precise=True, check_like=True, + atol=1e-5, check_like=True, check_names=False ) diff --git a/test/TestMutation.py b/test/TestMutation.py index c1418209..e06a0976 100644 --- a/test/TestMutation.py +++ b/test/TestMutation.py @@ -98,7 +98,7 @@ def test_single_mutant_matrix(self): # gotta round to account for this _singles = _singles.round(3) singles = singles.round(3) - pd.testing.assert_frame_equal(singles, _singles, check_exact=False, check_less_precise=True) + pd.testing.assert_frame_equal(singles, _singles, check_dtype=False, atol=1e-5) def test_split_mutants_single(self): """ @@ -228,7 +228,7 @@ def test_predict_mutation_table_segment_column(self): self.c0, self.singles, output_column="prediction_independent" ) - pd.testing.assert_frame_equal(self.singles, _singles, check_less_precise=True) + pd.testing.assert_frame_equal(self.singles, _singles, check_dtype=False, atol=1e-5) def test_predict_mutation_table_empty_segment(self): """ From a880fea35340e22d880d928d4b23af6f3b90d926 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Thu, 11 May 2023 10:49:41 +0200 Subject: [PATCH 06/10] pin yaml due to upcoming API change --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9686763f..93f8beec 100644 --- a/setup.py +++ b/setup.py @@ -97,7 +97,7 @@ #setup_requires=['setuptools>=18.2', 'numpy'], install_requires=['setuptools>=18.2', 'numpy', - 'pandas', 'scipy', 'numba', 'ruamel.yaml', 'matplotlib', 'requests', + 'pandas', 'scipy', 'numba', 'ruamel.yaml<0.18', 'matplotlib', 'requests', 'mmtf-python', 'click', 'filelock', 'psutil', 'bokeh', 'jinja2', 'biopython', 'seaborn', 'billiard', 'scikit-learn', ], From 5cd4033913e5ae20519fa6bc58412928ad9072ed Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Thu, 11 May 2023 11:53:55 +0200 Subject: [PATCH 07/10] Fix alignment identifier memory usage, add header splitting, fixes #289 --- evcouplings/align/alignment.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/evcouplings/align/alignment.py b/evcouplings/align/alignment.py index 98538db5..f52d83fc 100644 --- a/evcouplings/align/alignment.py +++ b/evcouplings/align/alignment.py @@ -575,7 +575,12 @@ def __init__(self, sequence_matrix, sequence_ids=None, annotation=None, ) # make sure we get rid of iterators etc. - self.ids = np.array(list(sequence_ids)) + self.ids = list(sequence_ids) + + # turn identifiers into numpy array for consistency with previous implementation; + # but use dtype object to avoid memory usage issues of numpy string datatypes (longest + # sequence defines memory usage otherwise) + self.ids = np.array(self.ids, dtype=np.object_) self.id_to_index = { id_: i for i, id_ in enumerate(self.ids) @@ -613,7 +618,7 @@ def from_dict(cls, sequences, **kwargs): @classmethod def from_file(cls, fileobj, format="fasta", a3m_inserts="first", raise_hmmer_prefixes=True, - **kwargs): + split_header=False, **kwargs): """ Construct an alignment object by reading in an alignment file. @@ -631,6 +636,9 @@ def from_file(cls, fileobj, format="fasta", HMMER adds number prefixes to sequence identifiers in Stockholm files if identifiers are not unique. If True, the parser will raise an exception if a Stockholm alignment has such prefixes. + split_header: bool, optional (default: False) + Only store identifier portion of each header (before first whitespace) + in identifier list, rather than full header line **kwargs Additional arguments to be passed to class constructor @@ -670,6 +678,12 @@ def from_file(cls, fileobj, format="fasta", else: raise ValueError("Invalid alignment format: {}".format(format)) + # reduce header lines to identifiers if requested + if split_header: + seqs = { + header.split()[0]: seq for header, seq in seqs.items() + } + return cls.from_dict(seqs, **kwargs) def __getitem__(self, index): @@ -783,7 +797,8 @@ def select(self, columns=None, sequences=None): def apply(self, columns=None, sequences=None, func=np.char.lower): """ Apply a function along columns and/or rows of alignment matrix, - or to entire matrix. + or to entire matrix. Note that column and row selections are + applied independently in this particular order. Parameters ---------- @@ -817,7 +832,7 @@ def apply(self, columns=None, sequences=None, func=np.char.lower): mod_matrix[sequences, :] = func(mod_matrix[sequences, :]) return Alignment( - mod_matrix, np.copy(self.ids), deepcopy(self.annotation), + mod_matrix, deepcopy(self.ids), deepcopy(self.annotation), alphabet=self.alphabet ) From 2324ed63cb6fc70da8fc2d52fb7ab96609d9a321 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Thu, 11 May 2023 13:00:41 +0200 Subject: [PATCH 08/10] Correct wrong exception raise, fixes #199 --- evcouplings/couplings/mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evcouplings/couplings/mapping.py b/evcouplings/couplings/mapping.py index c1302808..18605f9b 100644 --- a/evcouplings/couplings/mapping.py +++ b/evcouplings/couplings/mapping.py @@ -374,7 +374,7 @@ def __init__(self, filename, *segments, # initialize the segment index mapper to update model numbering if len(segments) == 0: - raise(ValueError, "Must provide at least one segment for MultiSegmentCouplingsModel") + raise ValueError("Must provide at least one segment for MultiSegmentCouplingsModel") first_segment = segments[0] index_start = first_segment.region_start From 9d34d5ef38265692275d0f042853434154d1d319 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Thu, 11 May 2023 14:11:53 +0200 Subject: [PATCH 09/10] better handler around occasional plmc segfaults, fixes #292 --- evcouplings/couplings/tools.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/evcouplings/couplings/tools.py b/evcouplings/couplings/tools.py index fd5ac104..79c592bd 100644 --- a/evcouplings/couplings/tools.py +++ b/evcouplings/couplings/tools.py @@ -265,10 +265,17 @@ def run_plmc(alignment, couplings_file, param_file=None, # returncode == -11 (segfault) despite successful calculation return_code, stdout, stderr = run(cmd, check_returncode=False) - # TODO: remove this segfault-hunting output once fixed + # TODO: remove this segfault-hunting output if fixed in plmc if return_code != 0: - # if not a segfault, still raise exception - if return_code != -11: + # check if we got valid output from plmc by parsing it + valid_plmc_output = True + try: + parse_plmc_log(stderr) + except KeyError: + valid_plmc_output = False + + # if not a segfault or invalid plmc output, still raise exception + if return_code != -11 or not valid_plmc_output: from evcouplings.utils.system import ExternalToolError raise ExternalToolError( "Call failed:\ncmd={}\nreturncode={}\nstdout={}\nstderr={}".format( @@ -276,12 +283,6 @@ def run_plmc(alignment, couplings_file, param_file=None, ) ) - print("PLMC NON-ZERO RETURNCODE:", return_code) - print(cmd) - print(" ".join(cmd)) - print("stdout:", stdout) - print("stderr:", stderr) - iter_df, out_fields = parse_plmc_log(stderr) # also check we actually calculated couplings... From 5b3b3f08a1f9d1a6fbae3130de4b3bbdd9d61b39 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Fri, 12 May 2023 19:00:40 +0200 Subject: [PATCH 10/10] Numpy deprecation fix, closes #248 --- evcouplings/compare/pdb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index c76ba257..e16aaf1a 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -457,7 +457,7 @@ def _get_range(object_counts): # store explicit information about composition of residues def _group_info(field): return np.array( - [x[field] for x in mmtf.group_list] + [x[field] for x in mmtf.group_list], dtype=np.object_ ) # three and one letter code names of different groups @@ -589,7 +589,7 @@ def get_chain(self, chain, model=0): np.array([ np.arange(self.first_residue_index[i], self.last_residue_index[i]) for i in target_chain_indeces - ]) + ], dtype=np.object_) ) # chain indeces and identifiers for all residues