From d472e4b3e38cb9124a8ee4678f271cb6ec92a396 Mon Sep 17 00:00:00 2001
From: Aaron Kollasch <aaron@kollasch.dev>
Date: Thu, 2 Mar 2023 16:30:47 -0500
Subject: [PATCH 01/10] Support a3m format in existing alignment protocol

---
 evcouplings/align/alignment.py | 10 ++++++++--
 evcouplings/align/protocol.py  |  2 +-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/evcouplings/align/alignment.py b/evcouplings/align/alignment.py
index 20d95210..2e77f733 100644
--- a/evcouplings/align/alignment.py
+++ b/evcouplings/align/alignment.py
@@ -9,6 +9,7 @@
 import re
 from collections import namedtuple, OrderedDict, defaultdict
 from copy import deepcopy
+from pathlib import Path
 
 import numpy as np
 from numba import jit
@@ -326,7 +327,7 @@ def write_a3m(sequences, fileobj, insert_gap=INSERT_GAP, width=80):
         fileobj.write(seq.replace(insert_gap, "") + "\n")
 
 
-def detect_format(fileobj):
+def detect_format(fileobj, filepath=""):
     """
     Detect if an alignment file is in FASTA or
     Stockholm format.
@@ -335,10 +336,12 @@ def detect_format(fileobj):
     ----------
     fileobj : file-like obj
         Alignment file for which to detect format
+    filepath : string or path-like obj
+        Path of alignment file
 
     Returns
     -------
-    format : {"fasta", "stockholm", None}
+    format : {"fasta", "a3m", "stockholm", None}
         Format of alignment, None if not detectable
     """
     for i, line in enumerate(fileobj):
@@ -348,6 +351,9 @@ def detect_format(fileobj):
 
         # This indicates a FASTA file
         if line.startswith(">"):
+            # A3M headers also start with ">", so tell A3M apart from FASTA by the .a3m extension
+            if Path(filepath).suffix.lower() == ".a3m":
+                return "a3m"
             return "fasta"
 
         # Skip comment lines and empty lines for FASTA detection
diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py
index f2ccbc3a..7fa6282b 100644
--- a/evcouplings/align/protocol.py
+++ b/evcouplings/align/protocol.py
@@ -689,7 +689,7 @@ def existing(**kwargs):
 
     # first try to autodetect format of alignment
     with open(input_alignment) as f:
-        format = detect_format(f)
+        format = detect_format(f, filepath=input_alignment)
         if format is None:
             raise InvalidParameterError(
                 "Format of input alignment {} could not be "

From 7192161ceca6c64f8d11aba16765178f97d50711 Mon Sep 17 00:00:00 2001
From: Thomas Hopf <thomas.hopf@gmail.com>
Date: Wed, 10 May 2023 15:34:35 +0200
Subject: [PATCH 02/10] Prepare tests for new version, update readme and setup

---
 .github/workflows/build_and_test.yml      | 2 +-
 .github/workflows/build_test_and_push.yml | 2 +-
 README.md                                 | 7 +++----
 setup.py                                  | 5 +++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index e7be106a..54c3ad0b 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -8,7 +8,7 @@ jobs:
 
     strategy:
       matrix:
-        python-version: [3.8]
+        python-version: [3.10]
 
     steps:
     - uses: actions/checkout@v2
diff --git a/.github/workflows/build_test_and_push.yml b/.github/workflows/build_test_and_push.yml
index ec380fda..8a4b4543 100644
--- a/.github/workflows/build_test_and_push.yml
+++ b/.github/workflows/build_test_and_push.yml
@@ -9,7 +9,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.8]
+        python-version: [3.10]
 
     steps:
     - uses: actions/checkout@v2
diff --git a/README.md b/README.md
index ecee7c4e..b868fc2f 100644
--- a/README.md
+++ b/README.md
@@ -7,11 +7,12 @@ Predict protein structure, function and mutations using evolutionary sequence co
 
 ### Installing the Python package
 
-If you are simply interested in using EVcouplings as a library, installing the Python package is all you need to do (unless you use functions that depend on external tools). If you want to run the *evcouplings* application (alignment generation, model parameter inference, structure prediction, etc.) you will also need to follow the sections on installing external tools and databases.
+* If you are simply interested in using EVcouplings as a library, installing the Python package is all you need to do (unless you use functions that depend on external tools). 
+* If you want to run the *evcouplings* application (alignment generation, model parameter inference, structure prediction, etc.) you will also need to follow the sections on installing external tools and databases.
 
 #### Requirements
 
-EVcouplings requires a Python >= 3.5 installation. Since it depends on some packages that can be tricky to install using pip (numba, numpy, ...), we recommend using the [Anaconda Python distribution](https://www.continuum.io/downloads). In case you are creating a new conda environment or using miniconda, please make sure to run `conda install anaconda` before running pip, or otherwise the required packages will not be present.  
+EVcouplings actively supports Python >= 3.10 installations.  
 
 #### Installation
 
@@ -27,8 +28,6 @@ and to update to the latest version after previously installing EVcouplings from
 
     pip install -U --no-deps https://github.com/debbiemarkslab/EVcouplings/archive/develop.zip
 
-Installation will take seconds.
-
 ### External software tools
 
 *After installation and before running compute jobs, the paths to the respective binaries of the following external tools have to be set in your EVcouplings job configuration file(s).*
diff --git a/setup.py b/setup.py
index de2b95b7..9686763f 100644
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,7 @@
     name='evcouplings',
 
     # Version:
-    version='0.1.2',
+    version='0.2',
 
     description='A Framework for evolutionary couplings analysis',
     long_description=readme,
@@ -49,7 +49,8 @@
         # The license as you wish (should match "license" above)
         'License :: OSI Approved :: MIT License',
         'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.10',
+        'Programming Language :: Python :: 3.11',
     ],
 
     # What EVcouplings relates to:

From b4fc441f18efb770a53aeb4e183be2fd7595b756 Mon Sep 17 00:00:00 2001
From: Thomas Hopf <thomas.hopf@gmail.com>
Date: Wed, 10 May 2023 15:37:24 +0200
Subject: [PATCH 03/10] Keep as is due to github security restrictions

---
 .github/workflows/build_and_test.yml      | 2 +-
 .github/workflows/build_test_and_push.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 54c3ad0b..e7be106a 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -8,7 +8,7 @@ jobs:
 
     strategy:
       matrix:
-        python-version: [3.10]
+        python-version: [3.8]
 
     steps:
     - uses: actions/checkout@v2
diff --git a/.github/workflows/build_test_and_push.yml b/.github/workflows/build_test_and_push.yml
index 8a4b4543..ec380fda 100644
--- a/.github/workflows/build_test_and_push.yml
+++ b/.github/workflows/build_test_and_push.yml
@@ -9,7 +9,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.10]
+        python-version: [3.8]
 
     steps:
     - uses: actions/checkout@v2

From e1362407a0b65d63ca07df55f44cb17b0a3722b7 Mon Sep 17 00:00:00 2001
From: Thomas Hopf <thomas.hopf@gmail.com>
Date: Wed, 10 May 2023 15:47:04 +0200
Subject: [PATCH 04/10] Fix numpy dtype deprecation, fixes #291

---
 evcouplings/align/alignment.py | 2 +-
 evcouplings/couplings/model.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/evcouplings/align/alignment.py b/evcouplings/align/alignment.py
index 20d95210..9a72f71b 100644
--- a/evcouplings/align/alignment.py
+++ b/evcouplings/align/alignment.py
@@ -422,7 +422,7 @@ def sequences_to_matrix(sequences):
 
     N = len(sequences)
     L = len(next(iter(sequences)))
-    matrix = np.empty((N, L), dtype=np.str)
+    matrix = np.empty((N, L), dtype=str)
 
     for i, seq in enumerate(sequences):
         if len(seq) != L:
diff --git a/evcouplings/couplings/model.py b/evcouplings/couplings/model.py
index 30f5ce80..e8977104 100755
--- a/evcouplings/couplings/model.py
+++ b/evcouplings/couplings/model.py
@@ -609,7 +609,7 @@ def convert_sequences(self, sequences):
                 )
             )
 
-        S = np.empty((len(sequences), L_seq), dtype=np.int)
+        S = np.empty((len(sequences), L_seq), dtype=int)
 
         try:
             for i, s in enumerate(sequences):
@@ -689,8 +689,8 @@ def delta_hamiltonian(self, substitutions, verify_mutants=True):
             2) delta J_ij, 3) delta h_i
 
         """
-        pos = np.empty(len(substitutions), dtype=np.int)
-        subs = np.empty(len(substitutions), dtype=np.int)
+        pos = np.empty(len(substitutions), dtype=int)
+        subs = np.empty(len(substitutions), dtype=int)
 
         try:
             for i, (subs_pos, subs_from, subs_to) in enumerate(substitutions):

From 78633124b2c4ed3b6ba3f47aac4969979aad77e2 Mon Sep 17 00:00:00 2001
From: Thomas Hopf <thomas.hopf@gmail.com>
Date: Wed, 10 May 2023 18:24:27 +0200
Subject: [PATCH 05/10] fix deprecation in pandas df equality checking in tests

---
 test/TestComplex.py  | 2 +-
 test/TestMutation.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/TestComplex.py b/test/TestComplex.py
index 873b24f7..d5ffa922 100644
--- a/test/TestComplex.py
+++ b/test/TestComplex.py
@@ -402,7 +402,7 @@ def test_find_possible_partners(self):
 
         pd.testing.assert_frame_equal(
             self.possible_partners, _possible_partners,
-            check_less_precise=True, check_like=True,
+            atol=1e-5, check_like=True,
             check_names=False
         )
 
diff --git a/test/TestMutation.py b/test/TestMutation.py
index c1418209..e06a0976 100644
--- a/test/TestMutation.py
+++ b/test/TestMutation.py
@@ -98,7 +98,7 @@ def test_single_mutant_matrix(self):
         # gotta round to account for this
         _singles = _singles.round(3)
         singles = singles.round(3)
-        pd.testing.assert_frame_equal(singles, _singles, check_exact=False, check_less_precise=True)
+        pd.testing.assert_frame_equal(singles, _singles, check_dtype=False, atol=1e-5)
 
     def test_split_mutants_single(self):
         """
@@ -228,7 +228,7 @@ def test_predict_mutation_table_segment_column(self):
             self.c0, self.singles, output_column="prediction_independent"
         )
 
-        pd.testing.assert_frame_equal(self.singles, _singles, check_less_precise=True)
+        pd.testing.assert_frame_equal(self.singles, _singles, check_dtype=False, atol=1e-5)
 
     def test_predict_mutation_table_empty_segment(self):
         """

From a880fea35340e22d880d928d4b23af6f3b90d926 Mon Sep 17 00:00:00 2001
From: Thomas Hopf <thomas.hopf@gmail.com>
Date: Thu, 11 May 2023 10:49:41 +0200
Subject: [PATCH 06/10] pin yaml due to upcoming API change

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 9686763f..93f8beec 100644
--- a/setup.py
+++ b/setup.py
@@ -97,7 +97,7 @@
     #setup_requires=['setuptools>=18.2', 'numpy'],
 
     install_requires=['setuptools>=18.2', 'numpy',
-        'pandas', 'scipy', 'numba', 'ruamel.yaml', 'matplotlib', 'requests',
+        'pandas', 'scipy', 'numba', 'ruamel.yaml<0.18', 'matplotlib', 'requests',
         'mmtf-python', 'click', 'filelock', 'psutil', 'bokeh', 'jinja2',
         'biopython', 'seaborn', 'billiard', 'scikit-learn',
     ],

From 5cd4033913e5ae20519fa6bc58412928ad9072ed Mon Sep 17 00:00:00 2001
From: Thomas Hopf <thomas.hopf@gmail.com>
Date: Thu, 11 May 2023 11:53:55 +0200
Subject: [PATCH 07/10] Fix alignment identifier memory usage, add header
 splitting, fixes #289

---
 evcouplings/align/alignment.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/evcouplings/align/alignment.py b/evcouplings/align/alignment.py
index 98538db5..f52d83fc 100644
--- a/evcouplings/align/alignment.py
+++ b/evcouplings/align/alignment.py
@@ -575,7 +575,12 @@ def __init__(self, sequence_matrix, sequence_ids=None, annotation=None,
                 )
 
             # make sure we get rid of iterators etc.
-            self.ids = np.array(list(sequence_ids))
+            self.ids = list(sequence_ids)
+
+        # turn identifiers into numpy array for consistency with previous implementation,
+        # but use dtype object to avoid memory usage issues of numpy string datatypes (the
+        # longest identifier would otherwise define the memory used by every element)
+        self.ids = np.array(self.ids, dtype=np.object_)
 
         self.id_to_index = {
             id_: i for i, id_ in enumerate(self.ids)
@@ -613,7 +618,7 @@ def from_dict(cls, sequences, **kwargs):
     @classmethod
     def from_file(cls, fileobj, format="fasta",
                   a3m_inserts="first", raise_hmmer_prefixes=True,
-                  **kwargs):
+                  split_header=False, **kwargs):
         """
         Construct an alignment object by reading in an
         alignment file.
@@ -631,6 +636,9 @@ def from_file(cls, fileobj, format="fasta",
             HMMER adds number prefixes to sequence identifiers in Stockholm
             files if identifiers are not unique. If True, the parser will
             raise an exception if a Stockholm alignment has such prefixes.
+        split_header : bool, optional (default: False)
+            Only store identifier portion of each header (before first whitespace)
+            in identifier list, rather than full header line
         **kwargs
             Additional arguments to be passed to class constructor
 
@@ -670,6 +678,12 @@ def from_file(cls, fileobj, format="fasta",
         else:
             raise ValueError("Invalid alignment format: {}".format(format))
 
+        # reduce header lines to identifiers if requested
+        if split_header:
+            seqs = {
+                header.split()[0]: seq for header, seq in seqs.items()
+            }
+
         return cls.from_dict(seqs, **kwargs)
 
     def __getitem__(self, index):
@@ -783,7 +797,8 @@ def select(self, columns=None, sequences=None):
     def apply(self, columns=None, sequences=None, func=np.char.lower):
         """
         Apply a function along columns and/or rows of alignment matrix,
-        or to entire matrix.
+        or to entire matrix. Note that column and row selections are
+        applied independently in this particular order.
 
         Parameters
         ----------
@@ -817,7 +832,7 @@ def apply(self, columns=None, sequences=None, func=np.char.lower):
                 mod_matrix[sequences, :] = func(mod_matrix[sequences, :])
 
         return Alignment(
-            mod_matrix, np.copy(self.ids), deepcopy(self.annotation),
+            mod_matrix, deepcopy(self.ids), deepcopy(self.annotation),
             alphabet=self.alphabet
         )
 

From 2324ed63cb6fc70da8fc2d52fb7ab96609d9a321 Mon Sep 17 00:00:00 2001
From: Thomas Hopf <thomas.hopf@gmail.com>
Date: Thu, 11 May 2023 13:00:41 +0200
Subject: [PATCH 08/10] Correct wrong exception raise, fixes #199

---
 evcouplings/couplings/mapping.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evcouplings/couplings/mapping.py b/evcouplings/couplings/mapping.py
index c1302808..18605f9b 100644
--- a/evcouplings/couplings/mapping.py
+++ b/evcouplings/couplings/mapping.py
@@ -374,7 +374,7 @@ def __init__(self, filename, *segments,
 
         # initialize the segment index mapper to update model numbering
         if len(segments) == 0:
-            raise(ValueError, "Must provide at least one segment for MultiSegmentCouplingsModel")
+            raise ValueError("Must provide at least one segment for MultiSegmentCouplingsModel")
 
         first_segment = segments[0]
         index_start = first_segment.region_start

From 9d34d5ef38265692275d0f042853434154d1d319 Mon Sep 17 00:00:00 2001
From: Thomas Hopf <thomas.hopf@gmail.com>
Date: Thu, 11 May 2023 14:11:53 +0200
Subject: [PATCH 09/10] better handler around occasional plmc segfaults, fixes
 #292

---
 evcouplings/couplings/tools.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/evcouplings/couplings/tools.py b/evcouplings/couplings/tools.py
index fd5ac104..79c592bd 100644
--- a/evcouplings/couplings/tools.py
+++ b/evcouplings/couplings/tools.py
@@ -265,10 +265,17 @@ def run_plmc(alignment, couplings_file, param_file=None,
     # returncode == -11 (segfault) despite successful calculation
     return_code, stdout, stderr = run(cmd, check_returncode=False)
 
-    # TODO: remove this segfault-hunting output once fixed
+    # TODO: remove this segfault-hunting output if fixed in plmc
     if return_code != 0:
-        # if not a segfault, still raise exception
-        if return_code != -11:
+        # check if we got valid output from plmc by parsing it
+        valid_plmc_output = True
+        try:
+            parse_plmc_log(stderr)
+        except KeyError:
+            valid_plmc_output = False
+
+        # raise unless this is a segfault (-11) that still produced valid plmc output
+        if return_code != -11 or not valid_plmc_output:
             from evcouplings.utils.system import ExternalToolError
             raise ExternalToolError(
                 "Call failed:\ncmd={}\nreturncode={}\nstdout={}\nstderr={}".format(
@@ -276,12 +283,6 @@ def run_plmc(alignment, couplings_file, param_file=None,
                 )
             )
 
-        print("PLMC NON-ZERO RETURNCODE:", return_code)
-        print(cmd)
-        print(" ".join(cmd))
-        print("stdout:", stdout)
-        print("stderr:", stderr)
-
     iter_df, out_fields = parse_plmc_log(stderr)
 
     # also check we actually calculated couplings...

From 5b3b3f08a1f9d1a6fbae3130de4b3bbdd9d61b39 Mon Sep 17 00:00:00 2001
From: Thomas Hopf <thomas.hopf@gmail.com>
Date: Fri, 12 May 2023 19:00:40 +0200
Subject: [PATCH 10/10] Numpy deprecation fix, closes #248

---
 evcouplings/compare/pdb.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py
index c76ba257..e16aaf1a 100644
--- a/evcouplings/compare/pdb.py
+++ b/evcouplings/compare/pdb.py
@@ -457,7 +457,7 @@ def _get_range(object_counts):
         # store explicit information about composition of residues
         def _group_info(field):
             return np.array(
-                [x[field] for x in mmtf.group_list]
+                [x[field] for x in mmtf.group_list], dtype=np.object_
             )
 
         # three and one letter code names of different groups
@@ -589,7 +589,7 @@ def get_chain(self, chain, model=0):
             np.array([
                 np.arange(self.first_residue_index[i], self.last_residue_index[i])
                 for i in target_chain_indeces
-            ])
+            ], dtype=np.object_)
         )
 
         # chain indeces and identifiers for all residues