Merge remote-tracking branch 'origin/v0.2' into lood/permissions_error

# Conflicts:
#   evcouplings/couplings/model.py
#   test/TestComplex.py
#   test/TestMutation.py
debbiemarkslab · Aug 1, 2023 · 6463ad9 · 6463ad9
2 parents a9181df + 5b3b3f0
commit 6463ad9
Show file tree

Hide file tree

Showing 10 changed files with 54 additions and 34 deletions.
diff --git a/README.md b/README.md
@@ -7,11 +7,12 @@ Predict protein structure, function and mutations using evolutionary sequence co
 
 ### Installing the Python package
 
-If you are simply interested in using EVcouplings as a library, installing the Python package is all you need to do (unless you use functions that depend on external tools). If you want to run the *evcouplings* application (alignment generation, model parameter inference, structure prediction, etc.) you will also need to follow the sections on installing external tools and databases.
+* If you are simply interested in using EVcouplings as a library, installing the Python package is all you need to do (unless you use functions that depend on external tools). 
+* If you want to run the *evcouplings* application (alignment generation, model parameter inference, structure prediction, etc.) you will also need to follow the sections on installing external tools and databases.
 
 #### Requirements
 
-EVcouplings requires a Python >= 3.5 installation. Since it depends on some packages that can be tricky to install using pip (numba, numpy, ...), we recommend using the [Anaconda Python distribution](https://www.continuum.io/downloads). In case you are creating a new conda environment or using miniconda, please make sure to run `conda install anaconda` before running pip, or otherwise the required packages will not be present.  
+EVcouplings actively supports Python >= 3.10 installations.  
 
 #### Installation
 
@@ -27,8 +28,6 @@ and to update to the latest version after previously installing EVcouplings from
 
     pip install -U --no-deps https://github.com/debbiemarkslab/EVcouplings/archive/develop.zip
 
-Installation will take seconds.
-
 ### External software tools
 
 *After installation and before running compute jobs, the paths to the respective binaries of the following external tools have to be set in your EVcouplings job configuration file(s).*

diff --git a/evcouplings/align/alignment.py b/evcouplings/align/alignment.py
@@ -9,6 +9,7 @@
 import re
 from collections import namedtuple, OrderedDict, defaultdict
 from copy import deepcopy
+from pathlib import Path
 
 import numpy as np
 from numba import jit
@@ -326,7 +327,7 @@ def write_a3m(sequences, fileobj, insert_gap=INSERT_GAP, width=80):
         fileobj.write(seq.replace(insert_gap, "") + "\n")
 
 
-def detect_format(fileobj):
+def detect_format(fileobj, filepath=""):
     """
     Detect if an alignment file is in FASTA or
     Stockholm format.
@@ -335,10 +336,12 @@ def detect_format(fileobj):
     ----------
     fileobj : file-like obj
         Alignment file for which to detect format
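+    filepath : string or path-like obj, optional (default: "")
+        Path of alignment file; only its extension is inspected, to tell
+        A3M files (".a3m") apart from plain FASTA files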
 
     Returns
     -------
-    format : {"fasta", "stockholm", None}
+    format : {"fasta", "a3m", "stockholm", None}
         Format of alignment, None if not detectable
     """
     for i, line in enumerate(fileobj):
@@ -348,6 +351,9 @@ def detect_format(fileobj):
 
         # This indicates a FASTA file
         if line.startswith(">"):
+            # A3M files also start with ">" header lines, so they can only be
+            # told apart from FASTA by their .a3m file extension
+            if Path(filepath).suffix.lower() == ".a3m":
+                return "a3m"
             return "fasta"
 
         # Skip comment lines and empty lines for FASTA detection
@@ -569,7 +575,12 @@ def __init__(self, sequence_matrix, sequence_ids=None, annotation=None,
                 )
 
             # make sure we get rid of iterators etc.
-            self.ids = np.array(list(sequence_ids))
+            self.ids = list(sequence_ids)
+
+        # turn identifiers into numpy array for consistency with previous implementation;
+        # but use dtype object to avoid memory usage issues of numpy string datatypes
+        # (otherwise the longest identifier determines the memory used by every entry)
+        self.ids = np.array(self.ids, dtype=np.object_)
 
         self.id_to_index = {
             id_: i for i, id_ in enumerate(self.ids)
@@ -607,7 +618,7 @@ def from_dict(cls, sequences, **kwargs):
     @classmethod
     def from_file(cls, fileobj, format="fasta",
                   a3m_inserts="first", raise_hmmer_prefixes=True,
-                  **kwargs):
+                  split_header=False, **kwargs):
         """
         Construct an alignment object by reading in an
         alignment file.
@@ -625,6 +636,9 @@ def from_file(cls, fileobj, format="fasta",
             HMMER adds number prefixes to sequence identifiers in Stockholm
             files if identifiers are not unique. If True, the parser will
             raise an exception if a Stockholm alignment has such prefixes.
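+        split_header : bool, optional (default: False)
+            Only store identifier portion of each header (before first whitespace)
+            in identifier list, rather than full header line. Note that headers
+            sharing the same identifier are merged into a single entry (the last
+            sequence with that identifier is kept).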
         **kwargs
             Additional arguments to be passed to class constructor
 
@@ -664,6 +678,12 @@ def from_file(cls, fileobj, format="fasta",
         else:
             raise ValueError("Invalid alignment format: {}".format(format))
 
+        # reduce header lines to identifiers if requested
+        if split_header:
+            seqs = {
+                header.split()[0]: seq for header, seq in seqs.items()
+            }
+
         return cls.from_dict(seqs, **kwargs)
 
     def __getitem__(self, index):
@@ -777,7 +797,8 @@ def select(self, columns=None, sequences=None):
     def apply(self, columns=None, sequences=None, func=np.char.lower):
         """
         Apply a function along columns and/or rows of alignment matrix,
-        or to entire matrix.
+        or to entire matrix. Note that the column and row selections are
+        applied independently of each other, first columns and then rows.
 
         Parameters
         ----------
@@ -811,7 +832,7 @@ def apply(self, columns=None, sequences=None, func=np.char.lower):
                 mod_matrix[sequences, :] = func(mod_matrix[sequences, :])
 
         return Alignment(
-            mod_matrix, np.copy(self.ids), deepcopy(self.annotation),
+            mod_matrix, deepcopy(self.ids), deepcopy(self.annotation),
             alphabet=self.alphabet
         )
 

diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py
@@ -689,7 +689,7 @@ def existing(**kwargs):
 
     # first try to autodetect format of alignment
     with open(input_alignment) as f:
-        format = detect_format(f)
+        format = detect_format(f, filepath=input_alignment)
         if format is None:
             raise InvalidParameterError(
                 "Format of input alignment {} could not be "

diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py
@@ -457,7 +457,7 @@ def _get_range(object_counts):
         # store explicit information about composition of residues
         def _group_info(field):
             return np.array(
-                [x[field] for x in mmtf.group_list]
+                [x[field] for x in mmtf.group_list], dtype=np.object_
             )
 
         # three and one letter code names of different groups
@@ -589,7 +589,7 @@ def get_chain(self, chain, model=0):
             np.array([
                 np.arange(self.first_residue_index[i], self.last_residue_index[i])
                 for i in target_chain_indeces
-            ])
+            ], dtype=np.object_)
         )
 
         # chain indices and identifiers for all residues

diff --git a/evcouplings/couplings/mapping.py b/evcouplings/couplings/mapping.py
@@ -374,7 +374,7 @@ def __init__(self, filename, *segments,
 
         # initialize the segment index mapper to update model numbering
         if len(segments) == 0:
-            raise(ValueError, "Must provide at least one segment for MultiSegmentCouplingsModel")
+            raise ValueError("Must provide at least one segment for MultiSegmentCouplingsModel")
 
         first_segment = segments[0]
         index_start = first_segment.region_start

diff --git a/evcouplings/couplings/model.py b/evcouplings/couplings/model.py
@@ -609,7 +609,7 @@ def convert_sequences(self, sequences):
                 )
             )
 
-        S = np.empty((len(sequences), L_seq), dtype=np.int64)
+        S = np.empty((len(sequences), L_seq), dtype=int)
 
         try:
             for i, s in enumerate(sequences):
@@ -689,8 +689,8 @@ def delta_hamiltonian(self, substitutions, verify_mutants=True):
             2) delta J_ij, 3) delta h_i
 
         """
-        pos = np.empty(len(substitutions), dtype=np.int64)
-        subs = np.empty(len(substitutions), dtype=np.int64)
+        pos = np.empty(len(substitutions), dtype=int)
+        subs = np.empty(len(substitutions), dtype=int)
 
         try:
             for i, (subs_pos, subs_from, subs_to) in enumerate(substitutions):

diff --git a/evcouplings/couplings/tools.py b/evcouplings/couplings/tools.py
@@ -265,23 +265,24 @@ def run_plmc(alignment, couplings_file, param_file=None,
     # returncode == -11 (segfault) despite successful calculation
     return_code, stdout, stderr = run(cmd, check_returncode=False)
 
-    # TODO: remove this segfault-hunting output once fixed
+    # TODO: remove this segfault workaround once fixed in plmc
     if return_code != 0:
-        # if not a segfault, still raise exception
-        if return_code != -11:
+        # check if we got valid output from plmc by parsing it
+        valid_plmc_output = True
+        try:
+            parse_plmc_log(stderr)
+        except KeyError:
+            valid_plmc_output = False
+
+        # only tolerate the known segfault case (return code -11) if plmc still
+        # produced valid output; raise an exception in all other cases
+        if return_code != -11 or not valid_plmc_output:
             from evcouplings.utils.system import ExternalToolError
             raise ExternalToolError(
                 "Call failed:\ncmd={}\nreturncode={}\nstdout={}\nstderr={}".format(
                     cmd, return_code, stdout, stderr
                 )
             )
 
-        print("PLMC NON-ZERO RETURNCODE:", return_code)
-        print(cmd)
-        print(" ".join(cmd))
-        print("stdout:", stdout)
-        print("stderr:", stderr)
-
     iter_df, out_fields = parse_plmc_log(stderr)
 
     # also check we actually calculated couplings...

diff --git a/setup.py b/setup.py
@@ -18,7 +18,7 @@
     name='evcouplings',
 
     # Version:
-    version='0.1.2',
+    version='0.2',
 
     description='A Framework for evolutionary couplings analysis',
     long_description=readme,
@@ -49,7 +49,8 @@
         # The license as you wish (should match "license" above)
         'License :: OSI Approved :: MIT License',
         'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.10',
+        'Programming Language :: Python :: 3.11',
     ],
 
     # What EVcouplings relates to:
@@ -96,7 +97,7 @@
     #setup_requires=['setuptools>=18.2', 'numpy'],
 
     install_requires=['setuptools>=18.2', 'numpy',
-        'pandas', 'scipy', 'numba', 'ruamel.yaml', 'matplotlib', 'requests',
+        'pandas', 'scipy', 'numba', 'ruamel.yaml<0.18', 'matplotlib', 'requests',
         'mmtf-python', 'click', 'filelock', 'psutil', 'bokeh', 'jinja2',
         'biopython', 'seaborn', 'billiard', 'scikit-learn',
     ],

diff --git a/test/TestComplex.py b/test/TestComplex.py
@@ -402,9 +402,7 @@ def test_find_possible_partners(self):
 
         pd.testing.assert_frame_equal(
             self.possible_partners, _possible_partners,
-            check_exact=False,
-            rtol=1e-3,
-            check_like=True,
+            atol=1e-5, check_like=True,
             check_names=False
         )
 

diff --git a/test/TestMutation.py b/test/TestMutation.py
@@ -98,7 +98,7 @@ def test_single_mutant_matrix(self):
         # gotta round to account for this
         _singles = _singles.round(3)
         singles = singles.round(3)
-        pd.testing.assert_frame_equal(singles, _singles, check_exact=False, rtol=1e-3)
+        pd.testing.assert_frame_equal(singles, _singles, check_dtype=False, atol=1e-5)
 
     def test_split_mutants_single(self):
         """
@@ -228,7 +228,7 @@ def test_predict_mutation_table_segment_column(self):
             self.c0, self.singles, output_column="prediction_independent"
         )
 
-        pd.testing.assert_frame_equal(self.singles, _singles, check_exact=False, rtol=1e-3)
+        pd.testing.assert_frame_equal(self.singles, _singles, check_dtype=False, atol=1e-5)
 
     def test_predict_mutation_table_empty_segment(self):
         """