From 3799d7d929d074b2e7a2c62c8e59ae7f9e198e50 Mon Sep 17 00:00:00 2001 From: thomashopf Date: Mon, 8 Mar 2021 20:51:15 +0100 Subject: [PATCH 01/47] [FIX] lowercase letters break existing() alignment protocol, fixes #258 --- evcouplings/align/protocol.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py index 06fc1902..5bc111f2 100644 --- a/evcouplings/align/protocol.py +++ b/evcouplings/align/protocol.py @@ -522,11 +522,15 @@ def describe_frequencies(alignment, first_index, target_seq_index=None): fi = alignment.frequencies conservation = alignment.conservation() - fi_cols = {c: fi[:, i] for c, i in alignment.alphabet_map.items()} + # careful not to include any characters that are non-match state (e.g. lowercase letters) + fi_cols = { + c: fi[:, alignment.alphabet_map[c]] for c in alignment.alphabet + } + if target_seq_index is not None: target_seq = alignment[target_seq_index] else: - target_seq = np.full((alignment.L), np.nan) + target_seq = np.full((alignment.L, ), np.nan) info = pd.DataFrame( { @@ -539,6 +543,11 @@ def describe_frequencies(alignment, first_index, target_seq_index=None): # reorder columns info = info.loc[:, ["i", "A_i", "conservation"] + list(alignment.alphabet)] + # do not report values for lowercase columns + info.loc[ + info.A_i.str.lower() == info.A_i, ["conservation"] + list(alignment.alphabet) + ] = np.nan + return info From 741a7895332a3d6c8de6f46dc60f4c391b3a5107 Mon Sep 17 00:00:00 2001 From: thomashopf Date: Mon, 8 Mar 2021 20:52:44 +0100 Subject: [PATCH 02/47] [FIX] wrong sequence index in error message, fixes #259 --- evcouplings/couplings/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evcouplings/couplings/model.py b/evcouplings/couplings/model.py index 31da0eee..bb4f972d 100755 --- a/evcouplings/couplings/model.py +++ b/evcouplings/couplings/model.py @@ -699,7 +699,7 @@ def delta_hamiltonian(self, substitutions, verify_mutants=True): if verify_mutants and subs_from != self.target_seq[pos[i]]: raise ValueError( "Inconsistency with target sequence: pos={} target={} subs={}".format( - subs_pos, self.target_seq[i], subs_from + subs_pos, self.target_seq[pos[i]], subs_from ) ) except KeyError: From f6ee1a8e5e0811cc89a205fa2e42e4565da8d49e Mon Sep 17 00:00:00 2001 From: thomashopf Date: Mon, 8 Mar 2021 21:00:37 +0100 Subject: [PATCH 03/47] [FIX] fix extract_annotation crash, fixes #249 --- evcouplings/align/protocol.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py index 5bc111f2..ede0c4a6 100644 --- a/evcouplings/align/protocol.py +++ b/evcouplings/align/protocol.py @@ -1511,6 +1511,8 @@ def standard(**kwargs): annotation_file = prefix + "_annotation.csv" annotation = extract_header_annotation(ali_raw) annotation.to_csv(annotation_file, index=False) + else: + annotation_file = None # center alignment around focus/search sequence focus_cols = np.array([c != "-" for c in ali_raw[0]]) @@ -1525,9 +1527,11 @@ def standard(**kwargs): outcfg = { **jackhmmer_outcfg, **mod_outcfg, - "annotation_file": annotation_file } + if annotation_file is not None: + outcfg["annotation_file"] = annotation_file + # dump output config to YAML file for debugging/logging write_config_file(prefix + ".align_standard.outcfg", outcfg) From d6627a29960167df5535450ddf4b4d1a5037cdf4 Mon Sep 17 00:00:00 2001 From: thomashopf Date: Sat, 13 Mar 2021 13:43:29 +0100 Subject: [PATCH 04/47] [FIX] 
lowercase columns in frequency table still led to crash --- evcouplings/couplings/pairs.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/evcouplings/couplings/pairs.py b/evcouplings/couplings/pairs.py index 2de96d97..5b284903 100644 --- a/evcouplings/couplings/pairs.py +++ b/evcouplings/couplings/pairs.py @@ -810,13 +810,17 @@ def add_freqs_to_ec_table(ecs, freqs): Frequency table generated by pipeline (_frequencies.csv) using evcouplings.align.protocol.describe_frequencies() """ - # rename column names to be more descriptive + # rename column names to be more descriptive; + # also get rid of rows with nan values (these are present only if input + # alignment has lowercase letters but will cause problems downstream + # where we need frequencies for uppercase letters only). Hence, remove + # from table right away or the freq_i assignment below will crash. freqs = freqs.rename( columns={ "-": "gap_i", "conservation": "cons_i", } - ) + ).dropna() # extract frequency of target residue for each position freqs.loc[:, "freq_i"] = [r[r["A_i"]] for idx, r in freqs.iterrows()] From 380752e0c1d1bc25c2f3860c018a316bf6600bba Mon Sep 17 00:00:00 2001 From: thomashopf Date: Sat, 13 Mar 2021 14:36:50 +0100 Subject: [PATCH 05/47] [FEATURE] local submitter can now run jobs in parallel depending on user choice --- config/sample_config_complex.txt | 6 ++++++ config/sample_config_monomer.txt | 6 ++++++ evcouplings/utils/app.py | 29 +++++++++++++++++++++++++++-- 3 files changed, 39 insertions(+), 2 deletions(-) diff --git a/config/sample_config_complex.txt b/config/sample_config_complex.txt index b1f775da..03aa9bbb 100644 --- a/config/sample_config_complex.txt +++ b/config/sample_config_complex.txt @@ -487,6 +487,12 @@ environment: memory: 15000 time: 2-0:0:0 + # Special setting for "local" engine to define number of workers running in parallel + # (note that "cores" has to be defined above to make sure each job only uses a defined + # number of cores). If not defined or None, will default to number of cores / cores per job; + # otherwise specify integer to limit number of workers (1 for serial execution of subjobs) + # parallel_workers: 1 + # command that will be executed before running actual computation (can be used to set up environment) configuration: diff --git a/config/sample_config_monomer.txt b/config/sample_config_monomer.txt index 084bbd61..c24dad32 100644 --- a/config/sample_config_monomer.txt +++ b/config/sample_config_monomer.txt @@ -384,6 +384,12 @@ environment: memory: 15000 time: 2-0:0:0 + # Special setting for "local" engine to define number of workers running in parallel + # (note that "cores" has to be defined above to make sure each job only uses a defined + # number of cores).
If not defined or None, will default to number of cores / cores per job; + # otherwise specify integer to limit number of workers (1 for serial execution of subjobs) + # parallel_workers: 1 + # command that will be executed before running actual computation (can be used to set up environment) configuration: diff --git a/evcouplings/utils/app.py b/evcouplings/utils/app.py index dd1ff573..c250c078 100644 --- a/evcouplings/utils/app.py +++ b/evcouplings/utils/app.py @@ -16,6 +16,7 @@ from os import path, environ from collections import Mapping +import billiard import click from evcouplings import utils @@ -355,9 +356,33 @@ def run_jobs(configs, global_config, overwrite=False, workdir=None, abort_on_err ) # create submitter from global (pre-unrolling) configuration + submitter_cfg = global_config["environment"] + submitter_engine = submitter_cfg["engine"] + submitter_cores = submitter_cfg.get("cores") + + # special treatment for local submitter - allow to choose + # how many jobs to run in parallel (normally defined by grid engine) + submitter_kws = {} + + # requires that number of cores per job is defined or external tools might + # request all CPUs for each subjob + if submitter_engine == "local" and submitter_cores is not None: + # check which value was set in config for number of parallel workers, default to None + max_parallel_workers = submitter_cfg.get("parallel_workers") + + # if not defined, calculate + if max_parallel_workers is None: + max_cores = billiard.cpu_count() + max_parallel_workers = int(max_cores / submitter_cores) + + # do not request more workers than needed for number of subjobs + num_workers = min(len(configs), max_parallel_workers) + submitter_kws = {"ncpu": num_workers} + submitter = utils.SubmitterFactory( - global_config["environment"]["engine"], - db_path=out_prefix + "_job_database.txt" + submitter_engine, + db_path=out_prefix + "_job_database.txt", + **submitter_kws ) # collect individual submitted jobs here From f3b733a9da714f8e77f80c7357aa30ec59bca88e Mon Sep 17 00:00:00 2001 From: Anna Green Date: Tue, 11 May 2021 11:26:36 -0400 Subject: [PATCH 06/47] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0e97d2c8..ecee7c4e 100644 --- a/README.md +++ b/README.md @@ -141,7 +141,7 @@ Hopf, T. A., Schärfe, C. P. I., Rodrigues, J. P. G. L. M., Green, A. G., Kohlba Hopf, T. A., Ingraham, J. B., Poelwijk, F.J., Schärfe, C.P.I., Springer, M., Sander, C., & Marks, D. S. (2017). Mutation effects predicted from sequence co-variation. *Nature Biotechnology* **35**, 128–135 doi:10.1038/nbt.3769 -Green, A. G. and Elhabashy, H., Brock, K. P., Maddamsetti, R., Kohlbacher, O., Marks, D. S. (2019) Proteom-scale discovery of protein interactions with residue-level resolution using sequence coevolution. BioRxiv (in review). https://doi.org/10.1101/791293 +Green, A. G. and Elhabashy, H., Brock, K. P., Maddamsetti, R., Kohlbacher, O., Marks, D. S. (2021) Large-scale discovery of protein interactions at residue resolution using co-evolution calculated from genomic sequences. *Nature Communications* **12**, 1396. 
https://doi.org/10.1038/s41467-021-21636-z ## Contributors From 594b45a9ecd7d429f27c7d576559134c3e9c3bcc Mon Sep 17 00:00:00 2001 From: kpgbrock Date: Thu, 30 Sep 2021 13:12:47 -0400 Subject: [PATCH 07/47] Fix typo in complex_probability() ecs = pairs.add_mixture_proability -> ecs = pairs.add_mixture_probability( --- evcouplings/couplings/protocol.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evcouplings/couplings/protocol.py b/evcouplings/couplings/protocol.py index 54fe1851..66940cec 100644 --- a/evcouplings/couplings/protocol.py +++ b/evcouplings/couplings/protocol.py @@ -453,7 +453,7 @@ def complex_probability(ecs, scoring_model, use_all_ecs=False, containing confidence measure """ if use_all_ecs: - ecs = pairs.add_mixture_proability( + ecs = pairs.add_mixture_probability( ecs, model=scoring_model ) else: From 5de7bab3b5202848ace2e16ff2b1cda5c8edfda6 Mon Sep 17 00:00:00 2001 From: Aaron Kollasch Date: Mon, 23 May 2022 16:23:05 -0400 Subject: [PATCH 08/47] Update requirements.txt sklearn is an alias for scikit-learn --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a31d9b78..079738ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,4 @@ jinja2 biopython seaborn billiard -sklearn +scikit-learn From 791c5faf3725c1eab88aa36ec51e086ed3a01271 Mon Sep 17 00:00:00 2001 From: thomashopf Date: Thu, 4 Aug 2022 13:55:23 +0200 Subject: [PATCH 09/47] [FIX] import Iterable/Mapping from collections.abc, closes #275 --- evcouplings/align/protocol.py | 3 ++- evcouplings/compare/pdb.py | 3 ++- evcouplings/couplings/mapping.py | 2 +- evcouplings/couplings/model.py | 2 +- evcouplings/utils/app.py | 2 +- evcouplings/utils/tracker/mongodb.py | 2 +- 6 files changed, 8 insertions(+), 6 deletions(-) diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py index ede0c4a6..f2ccbc3a 100644 --- a/evcouplings/align/protocol.py +++ b/evcouplings/align/protocol.py @@ -8,7 +8,8 @@ """ -from collections import OrderedDict, Iterable +from collections import OrderedDict +from collections.abc import Iterable import re from shutil import copy import os diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index 3220f303..c76ba257 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -5,7 +5,8 @@ Thomas A. Hopf """ -from collections import OrderedDict, Iterable +from collections import OrderedDict +from collections.abc import Iterable from os import path from urllib.error import HTTPError diff --git a/evcouplings/couplings/mapping.py b/evcouplings/couplings/mapping.py index 404b1e1f..c1302808 100644 --- a/evcouplings/couplings/mapping.py +++ b/evcouplings/couplings/mapping.py @@ -7,7 +7,7 @@ Anna G. Green (MultiSegmentCouplingsModel) """ -from collections import Iterable +from collections.abc import Iterable from copy import deepcopy from evcouplings.couplings.model import CouplingsModel import pandas as pd diff --git a/evcouplings/couplings/model.py b/evcouplings/couplings/model.py index bb4f972d..30f5ce80 100755 --- a/evcouplings/couplings/model.py +++ b/evcouplings/couplings/model.py @@ -6,7 +6,7 @@ Authors: Thomas A. 
Hopf """ -from collections import Iterable +from collections.abc import Iterable from copy import deepcopy from numba import jit diff --git a/evcouplings/utils/app.py b/evcouplings/utils/app.py index c250c078..11578de4 100644 --- a/evcouplings/utils/app.py +++ b/evcouplings/utils/app.py @@ -14,7 +14,7 @@ import re from copy import deepcopy from os import path, environ -from collections import Mapping +from collections.abc import Mapping import billiard import click diff --git a/evcouplings/utils/tracker/mongodb.py b/evcouplings/utils/tracker/mongodb.py index ae88d9bd..edb8e901 100644 --- a/evcouplings/utils/tracker/mongodb.py +++ b/evcouplings/utils/tracker/mongodb.py @@ -16,7 +16,7 @@ import os from datetime import datetime -from collections import Mapping +from collections.abc import Mapping from pymongo import MongoClient, errors import gridfs From a50bd0f00914076c7027a471d496d73cb582a24b Mon Sep 17 00:00:00 2001 From: thomashopf Date: Thu, 4 Aug 2022 14:02:37 +0200 Subject: [PATCH 10/47] [FIX] Update single sequence retrieval Uniprot API endpoint --- config/sample_config_complex.txt | 2 +- config/sample_config_monomer.txt | 2 +- evcouplings/compare/sifts.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/config/sample_config_complex.txt b/config/sample_config_complex.txt index 03aa9bbb..9ddadf0b 100644 --- a/config/sample_config_complex.txt +++ b/config/sample_config_complex.txt @@ -506,7 +506,7 @@ databases: uniref90: /n/groups/marks/databases/jackhmmer/uniref90/uniref90_current.o2.fasta # URL do download sequences if sequence_file is not given. {} will be replaced by sequence_id. - sequence_download_url: http://www.uniprot.org/uniprot/{}.fasta + sequence_download_url: http://rest.uniprot.org/uniprot/{}.fasta # Directory with PDB MMTF structures (leave blank to fetch structures from web) pdb_mmtf_dir: diff --git a/config/sample_config_monomer.txt b/config/sample_config_monomer.txt index c24dad32..e1260d1d 100644 --- a/config/sample_config_monomer.txt +++ b/config/sample_config_monomer.txt @@ -403,7 +403,7 @@ databases: uniref90: /n/groups/marks/databases/jackhmmer/uniref90/uniref90_current.o2.fasta # URL do download sequences if sequence_file is not given. {} will be replaced by sequence_id. 
- sequence_download_url: http://www.uniprot.org/uniprot/{}.fasta + sequence_download_url: http://rest.uniprot.org/uniprot/{}.fasta # Directory with PDB MMTF structures (leave blank to fetch structures from web) pdb_mmtf_dir: diff --git a/evcouplings/compare/sifts.py b/evcouplings/compare/sifts.py index 3c378dc5..b4fa7514 100644 --- a/evcouplings/compare/sifts.py +++ b/evcouplings/compare/sifts.py @@ -66,7 +66,7 @@ # database jackhmmer: jackhmmer sequence_database: -sequence_download_url: http://www.uniprot.org/uniprot/{}.fasta +sequence_download_url: http://rest.uniprot.org/uniprot/{}.fasta """ From 2495dc764f04ebdeacfdce030936ef24d3b5483c Mon Sep 17 00:00:00 2001 From: thomashopf Date: Thu, 4 Aug 2022 14:09:33 +0200 Subject: [PATCH 11/47] [FIX] Apply sklearn->scikit-learn update also in setup, not just requirements --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e96caab9..af703107 100644 --- a/setup.py +++ b/setup.py @@ -98,7 +98,7 @@ install_requires=['setuptools>=18.2', 'numpy', 'pandas', 'scipy', 'numba', 'ruamel.yaml', 'matplotlib', 'requests', 'mmtf-python', 'click', 'filelock', 'psutil', 'bokeh', 'jinja2', - 'biopython', 'seaborn', 'billiard', 'sklearn', + 'biopython', 'seaborn', 'billiard', 'scikit-learn', ], ) From 792150e6c71e5edc284423ffedc37adf40d5744b Mon Sep 17 00:00:00 2001 From: thomashopf Date: Fri, 5 Aug 2022 09:58:04 +0200 Subject: [PATCH 12/47] [FIX] Upgrade fetch_uniprot_mapping and SIFTS sequence table creation to latest Uniprot API version --- evcouplings/compare/sifts.py | 151 ++++++++++++++++++++++++++++------- 1 file changed, 123 insertions(+), 28 deletions(-) diff --git a/evcouplings/compare/sifts.py b/evcouplings/compare/sifts.py index b4fa7514..9150a1bb 100644 --- a/evcouplings/compare/sifts.py +++ b/evcouplings/compare/sifts.py @@ -13,15 +13,19 @@ Chan Kang (find_homologs) """ +from io import StringIO from os import path from collections import OrderedDict from copy import deepcopy +import time +import logging import pandas as pd import requests +from requests.adapters import HTTPAdapter, Retry from evcouplings.align.alignment import ( - Alignment, read_fasta, parse_header + Alignment, read_fasta, parse_header, write_fasta ) from evcouplings.align.protocol import ( jackhmmer_search, hmmbuild_and_search ) @@ -36,7 +40,7 @@ ) from evcouplings.utils.helpers import range_overlap -UNIPROT_MAPPING_URL = "https://www.uniprot.org/mapping/" +UNIPROT_MAPPING_URL = "https://rest.uniprot.org" SIFTS_URL = "ftp://ftp.ebi.ac.uk/pub/databases/msd/sifts/flatfiles/csv/uniprot_segments_observed.csv.gz" SIFTS_REST_API = "http://www.ebi.ac.uk/pdbe/api/mappings/uniprot_segments/{}" @@ -70,45 +74,112 @@ """ -def fetch_uniprot_mapping(ids, from_="ACC", to="ACC", format="fasta"): +def fetch_uniprot_mapping(ids, from_db="UniProtKB_AC-ID", to_db="UniProtKB", format="fasta", isoforms=True, + polling_interval=3, retry_kws=None): """ Fetch data from UniProt ID mapping service (e.g. download set of sequences) + Updated to match 2022 UniProt update, code is a simplified version of + https://www.uniprot.org/help/id_mapping + + For valid from_db/to_db pairs check https://rest.uniprot.org/configure/idmapping/fields + Parameters ---------- ids : list(str) List of UniProt identifiers for which to retrieve mapping - from_ : str, optional (default: "ACC") + from_db : str, optional (default: "UniProtKB_AC-ID") Source identifier (i.e.
contained in "ids" list) - to : str, optional (default: "ACC") + to_db : str, optional (default: "UniProtKB") Target identifier (to which source should be mapped) format : str, optional (default: "fasta") Output format to request from Uniprot server + isoforms : bool, optional (default: True) + Include isoform sequences in retrieval. Note that API will return *all* + isoforms for entry, not just the requested ones, i.e. output will + need to be filtered after retrieval. Output may also contain duplicate + entries if multiple isoforms were present in requested identifier set. + polling_interval : int, optional (default: 3) + Number of seconds to wait before polling for request results + retry_kws : dict, optional (default: None) + Retry parameters supplied as kwargs to requests.adapters.Retry Returns ------- - str: - Response from UniProt server + result: requests.models.Response + Response from UniProt server. Use result.text or result.json() + to access results """ - params = { - "from": from_, - "to": to, - "format": format, - "query": " ".join(ids) - } - url = UNIPROT_MAPPING_URL - r = requests.post(url, data=params) + if retry_kws is None: + retry_kws = { + "total": 5, + "backoff_factor": 0.25, + "status_forcelist": [500, 502, 503, 504] + } - if r.status_code != requests.codes.ok: - raise ResourceError( - "Invalid status code ({}) for URL: {}".format( - r.status_code, url - ) + # submit request + def submit_request(): + r = requests.post( + f"{UNIPROT_MAPPING_URL}/idmapping/run", + data={ + "from": from_db, + "to": to_db, + "ids": ",".join(ids) + }, ) + r.raise_for_status() + return r.json()["jobId"] + + def check_id_mapping_results_ready(job_id): + while True: + request = session.get(f"{UNIPROT_MAPPING_URL}/idmapping/status/{job_id}") + request.raise_for_status() + j = request.json() + if "jobStatus" in j: + if j["jobStatus"] == "RUNNING": + time.sleep(polling_interval) + else: + raise Exception(j["jobStatus"]) + else: + return bool(j["results"] or j["failedIds"]) - return r.text + def get_id_mapping_results_link(job_id): + url = f"{UNIPROT_MAPPING_URL}/idmapping/details/{job_id}" + request = session.get(url) + request.raise_for_status() + return request.json()["redirectURL"] + + job_id = submit_request() + + retries = Retry( + **retry_kws + ) + session = requests.Session() + session.mount("https://", HTTPAdapter(max_retries=retries)) + + if check_id_mapping_results_ready(job_id): + result_url = get_id_mapping_results_link(job_id) + + # for simplicity, stream results (paginate requests to ensure more stable result, + # rather than paginating response of very big unstable request): + if "/stream/" not in result_url: + result_url = result_url.replace( + "/results/", "/results/stream/" + ) + + # append return format to URL + result_url += f"?format={format}" + + # add isoform retrieval parameter if requested + if isoforms: + result_url += "&includeIsoform=true" + + result = session.get(result_url) + result.raise_for_status() + + return result def find_homologs(pdb_alignment_method="jackhmmer", **kwargs): @@ -424,12 +495,19 @@ def create_sequence_file(self, output_file, chunk_size=1000, max_retries=100): from UniProt ID mapping service, which unfortunately often suffers from connection failures. 
""" + # unique identifiers *including* isoforms (used for filtering final table) ids = self.table.uniprot_ac.unique().tolist() + # unique identifiers *excluding* isoforms (used for retrieval, to avoid + # fetching sequences twice) + ids_no_isoform = self.table.uniprot_ac.map( + lambda id_: id_.split("-")[0] + ).unique().tolist() + # retrieve sequences in chunks since ID mapping service # tends to fail on large requests id_chunks = [ - ids[i:i + chunk_size] for i in range(0, len(ids), chunk_size) + ids_no_isoform[i:i + chunk_size] for i in range(0, len(ids_no_isoform), chunk_size) ] # store individual retrieved chunks as list of strings @@ -439,17 +517,22 @@ def create_sequence_file(self, output_file, chunk_size=1000, max_retries=100): # abort if number exceeds max_retries num_retries = 0 - for ch in id_chunks: + for i, ch in enumerate(id_chunks): # fetch sequence chunk; # if there is a problem retry as long as we stay within - # maximum number of retries + # maximum number of retries; + # latest version also has limited number of retries inside the mapping function while True: try: - seqs = fetch_uniprot_mapping(ch) + logging.info(f"Fetching chunk {i + 1} of {len(id_chunks)}...") + seqs = fetch_uniprot_mapping(ch).text break - except requests.ConnectionError as e: + except (requests.ConnectionError, requests.HTTPError) as e: # count as failed try num_retries += 1 + logging.warning( + f"Error fetching chunk {i + 1} of {len(id_chunks)}, current retry #: {num_retries}" + ) # if we retried too often, abort if num_retries > max_retries: @@ -472,12 +555,24 @@ def create_sequence_file(self, output_file, chunk_size=1000, max_retries=100): assert seqs.endswith("\n") - # store for writing + # store for parsing and output seq_chunks.append(seqs) + # create set representation for efficient lookup + ids_set = set(ids) + + # only retain sequences that we actually had in input list + # (API will return all isoforms) + filtered_seqs = [ + (seq_id, seq) for seq_id, seq in read_fasta( + StringIO("".join(seq_chunks)) + ) + if seq_id.split("|")[1] in ids_set + ] + # store sequences to FASTA file in one go at the end with open(output_file, "w") as f: - f.write("".join(seq_chunks)) + write_fasta(filtered_seqs, f) self.sequence_file = output_file From 1f85d74ebdb918dbf82a0cf941fccd72fb2dfe02 Mon Sep 17 00:00:00 2001 From: thomashopf Date: Fri, 5 Aug 2022 09:58:19 +0200 Subject: [PATCH 13/47] Version bump --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index af703107..de2b95b7 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ name='evcouplings', # Version: - version='0.1.1', + version='0.1.2', description='A Framework for evolutionary couplings analysis', long_description=readme, @@ -28,8 +28,8 @@ url='https://github.com/debbiemarkslab/EVcouplings', # Author details - author='Thomas Hopf, Benjamin Schubert', - author_email='thomas_hopf@hms.harvard.edu, benjamin_schubert@hms.harvard.edu', + author='Thomas Hopf', + author_email='thomas_hopf@hms.harvard.edu', # Choose your license license='MIT', From 22761e58a91c4c510c090ee8f049fc5335b52f15 Mon Sep 17 00:00:00 2001 From: thomashopf Date: Fri, 5 Aug 2022 10:44:58 +0200 Subject: [PATCH 14/47] [FIX] pandas deprecation and warning --- evcouplings/compare/sifts.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/evcouplings/compare/sifts.py b/evcouplings/compare/sifts.py index 9150a1bb..a6254834 100644 --- a/evcouplings/compare/sifts.py +++ b/evcouplings/compare/sifts.py @@ 
-883,7 +883,7 @@ def _create_mapping(r): hits.groupby(hit_columns) ): agg_mapping = {} - agg_df = pd.DataFrame() + agg_df_raw = [] # go through each segment for j, r in grp.iterrows(): # compute mapping for that particular segment @@ -891,7 +891,9 @@ def _create_mapping(r): # add to overall mapping dictionary for this hit agg_mapping.update(map_j) - agg_df = agg_df.append(map_j_df) + agg_df_raw.append(map_j_df) + + agg_df = pd.concat(agg_df_raw) # store assignment of group to mapping index mapping_rows.append( @@ -975,10 +977,11 @@ def _agg_type(x): # remove hits with too little residue coverage hits_grouped = hits_grouped.query("overlap >= @min_overlap") - hits_grouped.loc[:, "bitscore"] = pd.to_numeric( - hits_grouped.loc[:, "bitscore"], errors="coerce" + hits_grouped = hits_grouped.assign( + bitscore=pd.to_numeric(hits_grouped.bitscore, errors="coerce") + ).sort_values( + by="bitscore", ascending=False ) - hits_grouped = hits_grouped.sort_values(by="bitscore", ascending=False) # if requested, only keep one chain per PDB; # sort by score before this to keep best hit From 90a429675fe4b9522b30146acead881e4d45362d Mon Sep 17 00:00:00 2001 From: thomashopf Date: Fri, 5 Aug 2022 11:03:15 +0200 Subject: [PATCH 15/47] [FIX] safeguard insertion code checking --- evcouplings/compare/sifts.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/evcouplings/compare/sifts.py b/evcouplings/compare/sifts.py index a6254834..275b9848 100644 --- a/evcouplings/compare/sifts.py +++ b/evcouplings/compare/sifts.py @@ -927,6 +927,11 @@ def _create_mapping(r): # split residue name into residue number and (potential) insertion code # for proper sorting def _split_insertion_code(res): + # safeguard function by typecasting to string (in some instances + # this turned out to be an integer, presumably due to pandas automatic + # typecast) + res = str(res) + # should not happen but just to be safe assert len(res) >= 1 From d299279fc92ae30938b8f3d7090fe7343bef8451 Mon Sep 17 00:00:00 2001 From: Zachary Charlop-Powers Date: Fri, 11 Nov 2022 10:34:40 -0500 Subject: [PATCH 16/47] change name inputs to bokeh --- evcouplings/visualize/mutations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evcouplings/visualize/mutations.py b/evcouplings/visualize/mutations.py index 9830a7ce..77e822b6 100644 --- a/evcouplings/visualize/mutations.py +++ b/evcouplings/visualize/mutations.py @@ -371,8 +371,8 @@ def matrix_base_bokeh(matrix, positions, substitutions, p = bp.figure( title=title, x_range=positions, y_range=substitutions, - x_axis_location="above", plot_width=width_factor * len(positions), - plot_height=height_factor * len(substitutions), + x_axis_location="above", width=width_factor * len(positions), + height=height_factor * len(substitutions), toolbar_location="left", tools=TOOLS ) From 721875a17150a394abd65cc0b75c95a95577d514 Mon Sep 17 00:00:00 2001 From: thomashopf Date: Thu, 16 Mar 2023 16:01:59 +0100 Subject: [PATCH 17/47] Update build_and_test.yml --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c4674acb..e7be106a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: - python-version: [3.6] + python-version: [3.8] steps: - uses: actions/checkout@v2 From af27842c5fc72c291831261c9a19849a3e313efd Mon Sep 17 00:00:00 2001 From: thomashopf Date: Thu, 16 Mar 2023 16:02:13 +0100 
Subject: [PATCH 18/47] Update build_test_and_push.yml --- .github/workflows/build_test_and_push.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_test_and_push.yml b/.github/workflows/build_test_and_push.yml index b1aa2a52..ec380fda 100644 --- a/.github/workflows/build_test_and_push.yml +++ b/.github/workflows/build_test_and_push.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6] + python-version: [3.8] steps: - uses: actions/checkout@v2 From d472e4b3e38cb9124a8ee4678f271cb6ec92a396 Mon Sep 17 00:00:00 2001 From: Aaron Kollasch Date: Thu, 2 Mar 2023 16:30:47 -0500 Subject: [PATCH 19/47] Support a3m format in existing alignment protocol --- evcouplings/align/alignment.py | 10 ++++++++-- evcouplings/align/protocol.py | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/evcouplings/align/alignment.py b/evcouplings/align/alignment.py index 20d95210..2e77f733 100644 --- a/evcouplings/align/alignment.py +++ b/evcouplings/align/alignment.py @@ -9,6 +9,7 @@ import re from collections import namedtuple, OrderedDict, defaultdict from copy import deepcopy +from pathlib import Path import numpy as np from numba import jit @@ -326,7 +327,7 @@ def write_a3m(sequences, fileobj, insert_gap=INSERT_GAP, width=80): fileobj.write(seq.replace(insert_gap, "") + "\n") -def detect_format(fileobj): +def detect_format(fileobj, filepath=""): """ Detect if an alignment file is in FASTA or Stockholm format. @@ -335,10 +336,12 @@ def detect_format(fileobj): ---------- fileobj : file-like obj Alignment file for which to detect format + filepath : string or path-like obj + Path of alignment file Returns ------- - format : {"fasta", "stockholm", None} + format : {"fasta", "a3m", "stockholm", None} Format of alignment, None if not detectable """ for i, line in enumerate(fileobj): @@ -348,6 +351,9 @@ def detect_format(fileobj): # This indicates a FASTA file if line.startswith(">"): + # A3M files have extension .a3m + if Path(filepath).suffix.lower() == ".a3m": + return "a3m" return "fasta" # Skip comment lines and empty lines for FASTA detection diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py index f2ccbc3a..7fa6282b 100644 --- a/evcouplings/align/protocol.py +++ b/evcouplings/align/protocol.py @@ -689,7 +689,7 @@ def existing(**kwargs): # first try to autodetect format of alignment with open(input_alignment) as f: - format = detect_format(f) + format = detect_format(f, filepath=input_alignment) if format is None: raise InvalidParameterError( "Format of input alignment {} could not be " From 7192161ceca6c64f8d11aba16765178f97d50711 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Wed, 10 May 2023 15:34:35 +0200 Subject: [PATCH 20/47] Prepare tests for new version, update readme and setup --- .github/workflows/build_and_test.yml | 2 +- .github/workflows/build_test_and_push.yml | 2 +- README.md | 7 +++---- setup.py | 5 +++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e7be106a..54c3ad0b 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: - python-version: [3.8] + python-version: [3.10] steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/build_test_and_push.yml b/.github/workflows/build_test_and_push.yml index ec380fda..8a4b4543 100644 --- a/.github/workflows/build_test_and_push.yml +++ 
b/.github/workflows/build_test_and_push.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8] + python-version: [3.10] steps: - uses: actions/checkout@v2 diff --git a/README.md b/README.md index ecee7c4e..b868fc2f 100644 --- a/README.md +++ b/README.md @@ -7,11 +7,12 @@ Predict protein structure, function and mutations using evolutionary sequence co ### Installing the Python package -If you are simply interested in using EVcouplings as a library, installing the Python package is all you need to do (unless you use functions that depend on external tools). If you want to run the *evcouplings* application (alignment generation, model parameter inference, structure prediction, etc.) you will also need to follow the sections on installing external tools and databases. +* If you are simply interested in using EVcouplings as a library, installing the Python package is all you need to do (unless you use functions that depend on external tools). +* If you want to run the *evcouplings* application (alignment generation, model parameter inference, structure prediction, etc.) you will also need to follow the sections on installing external tools and databases. #### Requirements -EVcouplings requires a Python >= 3.5 installation. Since it depends on some packages that can be tricky to install using pip (numba, numpy, ...), we recommend using the [Anaconda Python distribution](https://www.continuum.io/downloads). In case you are creating a new conda environment or using miniconda, please make sure to run `conda install anaconda` before running pip, or otherwise the required packages will not be present. +EVcouplings actively supports Python >= 3.10 installations. #### Installation @@ -27,8 +28,6 @@ and to update to the latest version after previously installing EVcouplings from pip install -U --no-deps https://github.com/debbiemarkslab/EVcouplings/archive/develop.zip -Installation will take seconds. 
- ### External software tools *After installation and before running compute jobs, the paths to the respective binaries of the following external tools have to be set in your EVcouplings job configuration file(s).* diff --git a/setup.py b/setup.py index de2b95b7..9686763f 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ name='evcouplings', # Version: - version='0.1.2', + version='0.2', description='A Framework for evolutionary couplings analysis', long_description=readme, @@ -49,7 +49,8 @@ # The license as you wish (should match "license" above) 'License :: OSI Approved :: MIT License', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', ], # What EVcouplings relates to: From b4fc441f18efb770a53aeb4e183be2fd7595b756 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Wed, 10 May 2023 15:37:24 +0200 Subject: [PATCH 21/47] Keep as is due to github security restrictions --- .github/workflows/build_and_test.yml | 2 +- .github/workflows/build_test_and_push.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 54c3ad0b..e7be106a 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: - python-version: [3.10] + python-version: [3.8] steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/build_test_and_push.yml b/.github/workflows/build_test_and_push.yml index 8a4b4543..ec380fda 100644 --- a/.github/workflows/build_test_and_push.yml +++ b/.github/workflows/build_test_and_push.yml @@ -9,7 +9,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.10] + python-version: [3.8] steps: - uses: actions/checkout@v2 From e1362407a0b65d63ca07df55f44cb17b0a3722b7 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Wed, 10 May 2023 15:47:04 +0200 Subject: [PATCH 22/47] Fix numpy dtype deprecation, fixes #291 --- evcouplings/align/alignment.py | 2 +- evcouplings/couplings/model.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/evcouplings/align/alignment.py b/evcouplings/align/alignment.py index 20d95210..9a72f71b 100644 --- a/evcouplings/align/alignment.py +++ b/evcouplings/align/alignment.py @@ -422,7 +422,7 @@ def sequences_to_matrix(sequences): N = len(sequences) L = len(next(iter(sequences))) - matrix = np.empty((N, L), dtype=np.str) + matrix = np.empty((N, L), dtype=str) for i, seq in enumerate(sequences): if len(seq) != L: diff --git a/evcouplings/couplings/model.py b/evcouplings/couplings/model.py index 30f5ce80..e8977104 100755 --- a/evcouplings/couplings/model.py +++ b/evcouplings/couplings/model.py @@ -609,7 +609,7 @@ def convert_sequences(self, sequences): ) ) - S = np.empty((len(sequences), L_seq), dtype=np.int) + S = np.empty((len(sequences), L_seq), dtype=int) try: for i, s in enumerate(sequences): @@ -689,8 +689,8 @@ def delta_hamiltonian(self, substitutions, verify_mutants=True): 2) delta J_ij, 3) delta h_i """ - pos = np.empty(len(substitutions), dtype=np.int) - subs = np.empty(len(substitutions), dtype=np.int) + pos = np.empty(len(substitutions), dtype=int) + subs = np.empty(len(substitutions), dtype=int) try: for i, (subs_pos, subs_from, subs_to) in enumerate(substitutions): From 78633124b2c4ed3b6ba3f47aac4969979aad77e2 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Wed, 10 May 2023 18:24:27 +0200 Subject: [PATCH 23/47] fix deprecation in pandas 
df equality checking in tests --- test/TestComplex.py | 2 +- test/TestMutation.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/TestComplex.py b/test/TestComplex.py index 873b24f7..d5ffa922 100644 --- a/test/TestComplex.py +++ b/test/TestComplex.py @@ -402,7 +402,7 @@ def test_find_possible_partners(self): pd.testing.assert_frame_equal( self.possible_partners, _possible_partners, - check_less_precise=True, check_like=True, + atol=1e-5, check_like=True, check_names=False ) diff --git a/test/TestMutation.py b/test/TestMutation.py index c1418209..e06a0976 100644 --- a/test/TestMutation.py +++ b/test/TestMutation.py @@ -98,7 +98,7 @@ def test_single_mutant_matrix(self): # gotta round to account for this _singles = _singles.round(3) singles = singles.round(3) - pd.testing.assert_frame_equal(singles, _singles, check_exact=False, check_less_precise=True) + pd.testing.assert_frame_equal(singles, _singles, check_dtype=False, atol=1e-5) def test_split_mutants_single(self): """ @@ -228,7 +228,7 @@ def test_predict_mutation_table_segment_column(self): self.c0, self.singles, output_column="prediction_independent" ) - pd.testing.assert_frame_equal(self.singles, _singles, check_less_precise=True) + pd.testing.assert_frame_equal(self.singles, _singles, check_dtype=False, atol=1e-5) def test_predict_mutation_table_empty_segment(self): """ From a880fea35340e22d880d928d4b23af6f3b90d926 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Thu, 11 May 2023 10:49:41 +0200 Subject: [PATCH 24/47] pin yaml due to upcoming API change --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9686763f..93f8beec 100644 --- a/setup.py +++ b/setup.py @@ -97,7 +97,7 @@ #setup_requires=['setuptools>=18.2', 'numpy'], install_requires=['setuptools>=18.2', 'numpy', - 'pandas', 'scipy', 'numba', 'ruamel.yaml', 'matplotlib', 'requests', + 'pandas', 'scipy', 'numba', 'ruamel.yaml<0.18', 'matplotlib', 'requests', 'mmtf-python', 'click', 'filelock', 'psutil', 'bokeh', 'jinja2', 'biopython', 'seaborn', 'billiard', 'scikit-learn', ], From 5cd4033913e5ae20519fa6bc58412928ad9072ed Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Thu, 11 May 2023 11:53:55 +0200 Subject: [PATCH 25/47] Fix alignment identifier memory usage, add header splitting, fixes #289 --- evcouplings/align/alignment.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/evcouplings/align/alignment.py b/evcouplings/align/alignment.py index 98538db5..f52d83fc 100644 --- a/evcouplings/align/alignment.py +++ b/evcouplings/align/alignment.py @@ -575,7 +575,12 @@ def __init__(self, sequence_matrix, sequence_ids=None, annotation=None, ) # make sure we get rid of iterators etc. - self.ids = np.array(list(sequence_ids)) + self.ids = list(sequence_ids) + + # turn identifiers into numpy array for consistency with previous implementation; + # but use dtype object to avoid memory usage issues of numpy string datatypes (longest + # sequence defines memory usage otherwise) + self.ids = np.array(self.ids, dtype=np.object_) self.id_to_index = { id_: i for i, id_ in enumerate(self.ids) @@ -613,7 +618,7 @@ def from_dict(cls, sequences, **kwargs): @classmethod def from_file(cls, fileobj, format="fasta", a3m_inserts="first", raise_hmmer_prefixes=True, - **kwargs): + split_header=False, **kwargs): """ Construct an alignment object by reading in an alignment file. 
@@ -631,6 +636,9 @@ def from_file(cls, fileobj, format="fasta", HMMER adds number prefixes to sequence identifiers in Stockholm files if identifiers are not unique. If True, the parser will raise an exception if a Stockholm alignment has such prefixes. + split_header: bool, optional (default: False) + Only store identifier portion of each header (before first whitespace) + in identifier list, rather than full header line **kwargs Additional arguments to be passed to class constructor @@ -670,6 +678,12 @@ def from_file(cls, fileobj, format="fasta", else: raise ValueError("Invalid alignment format: {}".format(format)) + # reduce header lines to identifiers if requested + if split_header: + seqs = { + header.split()[0]: seq for header, seq in seqs.items() + } + return cls.from_dict(seqs, **kwargs) def __getitem__(self, index): @@ -783,7 +797,8 @@ def select(self, columns=None, sequences=None): def apply(self, columns=None, sequences=None, func=np.char.lower): """ Apply a function along columns and/or rows of alignment matrix, - or to entire matrix. + or to entire matrix. Note that column and row selections are + applied independently in this particular order. Parameters ---------- @@ -817,7 +832,7 @@ def apply(self, columns=None, sequences=None, func=np.char.lower): mod_matrix[sequences, :] = func(mod_matrix[sequences, :]) return Alignment( - mod_matrix, np.copy(self.ids), deepcopy(self.annotation), + mod_matrix, deepcopy(self.ids), deepcopy(self.annotation), alphabet=self.alphabet ) From 2324ed63cb6fc70da8fc2d52fb7ab96609d9a321 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Thu, 11 May 2023 13:00:41 +0200 Subject: [PATCH 26/47] Correct wrong exception raise, fixes #199 --- evcouplings/couplings/mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evcouplings/couplings/mapping.py b/evcouplings/couplings/mapping.py index c1302808..18605f9b 100644 --- a/evcouplings/couplings/mapping.py +++ b/evcouplings/couplings/mapping.py @@ -374,7 +374,7 @@ def __init__(self, filename, *segments, # initialize the segment index mapper to update model numbering if len(segments) == 0: - raise(ValueError, "Must provide at least one segment for MultiSegmentCouplingsModel") + raise ValueError("Must provide at least one segment for MultiSegmentCouplingsModel") first_segment = segments[0] index_start = first_segment.region_start From 9d34d5ef38265692275d0f042853434154d1d319 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Thu, 11 May 2023 14:11:53 +0200 Subject: [PATCH 27/47] better handler around occasional plmc segfaults, fixes #292 --- evcouplings/couplings/tools.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/evcouplings/couplings/tools.py b/evcouplings/couplings/tools.py index fd5ac104..79c592bd 100644 --- a/evcouplings/couplings/tools.py +++ b/evcouplings/couplings/tools.py @@ -265,10 +265,17 @@ def run_plmc(alignment, couplings_file, param_file=None, # returncode == -11 (segfault) despite successful calculation return_code, stdout, stderr = run(cmd, check_returncode=False) - # TODO: remove this segfault-hunting output once fixed + # TODO: remove this segfault-hunting output if fixed in plmc if return_code != 0: - # if not a segfault, still raise exception - if return_code != -11: + # check if we got valid output from plmc by parsing it + valid_plmc_output = True + try: + parse_plmc_log(stderr) + except KeyError: + valid_plmc_output = False + + # if not a segfault or invalid plmc output, still raise exception + if return_code != -11 or not 
valid_plmc_output: from evcouplings.utils.system import ExternalToolError raise ExternalToolError( "Call failed:\ncmd={}\nreturncode={}\nstdout={}\nstderr={}".format( @@ -276,12 +283,6 @@ def run_plmc(alignment, couplings_file, param_file=None, ) ) - print("PLMC NON-ZERO RETURNCODE:", return_code) - print(cmd) - print(" ".join(cmd)) - print("stdout:", stdout) - print("stderr:", stderr) - iter_df, out_fields = parse_plmc_log(stderr) # also check we actually calculated couplings... From 5b3b3f08a1f9d1a6fbae3130de4b3bbdd9d61b39 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Fri, 12 May 2023 19:00:40 +0200 Subject: [PATCH 28/47] Numpy deprecation fix, closes #248 --- evcouplings/compare/pdb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index c76ba257..e16aaf1a 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -457,7 +457,7 @@ def _get_range(object_counts): # store explicit information about composition of residues def _group_info(field): return np.array( - [x[field] for x in mmtf.group_list] + [x[field] for x in mmtf.group_list], dtype=np.object_ ) # three and one letter code names of different groups @@ -589,7 +589,7 @@ def get_chain(self, chain, model=0): np.array([ np.arange(self.first_residue_index[i], self.last_residue_index[i]) for i in target_chain_indeces - ]) + ], dtype=np.object_) ) # chain indeces and identifiers for all residues From b9951e4765ba741446dde22646e2f6f66408bca1 Mon Sep 17 00:00:00 2001 From: Zachary Charlop-Powers Date: Tue, 3 Oct 2023 16:57:22 -0400 Subject: [PATCH 29/47] bump to python3.10 bump to python3.11 update GH actions --- .github/workflows/build_and_test.yml | 9 +- .github/workflows/build_test_and_push.yml | 9 +- test/TestComplex.py | 266 +++++++++++++++------- 3 files changed, 197 insertions(+), 87 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e7be106a..614c6810 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -8,12 +8,12 @@ jobs: strategy: matrix: - python-version: [3.8] + python-version: ['3.10', '3.11'] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install and configure conda @@ -37,6 +37,7 @@ jobs: wget https://marks.hms.harvard.edu/evcouplings_test_cases/data/evcouplings_test_cases.tar.gz tar -xf evcouplings_test_cases.tar.gz -C $HOME/ - name: Run tests in headless xvfb environment - uses: GabrielBB/xvfb-action@v1 + uses: coactions/setup-xvfb@v1 with: run: coverage run -m unittest discover -s test -p "Test*.py" + working-directory: ./ #optional diff --git a/.github/workflows/build_test_and_push.yml b/.github/workflows/build_test_and_push.yml index ec380fda..53d28be0 100644 --- a/.github/workflows/build_test_and_push.yml +++ b/.github/workflows/build_test_and_push.yml @@ -9,12 +9,12 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8] + python-version: ['3.10', '3.11'] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install and configure conda @@ -38,9 +38,10 @@ jobs: wget 
https://marks.hms.harvard.edu/evcouplings_test_cases/data/evcouplings_test_cases.tar.gz tar -xf evcouplings_test_cases.tar.gz -C $HOME/ - name: Run tests in headless xvfb environment - uses: GabrielBB/xvfb-action@v1 + uses: coactions/setup-xvfb@v1 with: run: coverage run -m unittest discover -s test -p "Test*.py" + working-directory: ./ #optional - name: Publish evcouplings to test PyPI if: startsWith(github.ref, 'refs/tags') uses: pypa/gh-action-pypi-publish@master diff --git a/test/TestComplex.py b/test/TestComplex.py index d5ffa922..2f6e4f86 100644 --- a/test/TestComplex.py +++ b/test/TestComplex.py @@ -18,48 +18,70 @@ from evcouplings.complex.protocol import * from evcouplings.align import Alignment -TRAVIS_PATH = os.getenv('HOME') + "/evcouplings_test_cases/complex_test" +TRAVIS_PATH = os.getenv("HOME") + "/evcouplings_test_cases/complex_test" # TRAVIS_PATH = "/home/travis/evcouplings_test_cases/complex_test" # TRAVIS_PATH = "/Users/AG/Dropbox/evcouplings_dev/test_cases/for_B/complex_test" -class TestComplex(TestCase): +class TestComplex(TestCase): def __init__(self, *args, **kwargs): super(TestComplex, self).__init__(*args, **kwargs) # Genome location table - genome_location_filename_1 = \ - "{}/align_1/test_new_genome_location.csv".format(TRAVIS_PATH) - genome_location_filename_2 = \ - "{}/align_2/test_new_genome_location.csv".format(TRAVIS_PATH) + genome_location_filename_1 = "{}/align_1/test_new_genome_location.csv".format( + TRAVIS_PATH + ) + genome_location_filename_2 = "{}/align_2/test_new_genome_location.csv".format( + TRAVIS_PATH + ) self.gene_location_table_1 = pd.read_csv(genome_location_filename_1, header=0) self.gene_location_table_2 = pd.read_csv(genome_location_filename_2, header=0) # possible partners file - possible_partners_file = "{}/concatenate/test_new_possible_partners.csv".format(TRAVIS_PATH) - possible_partners = pd.read_csv(possible_partners_file, index_col=0, header=0, - dtype={"uniprot_id_1": str, "uniprot_id_2": str, "distance": int} + possible_partners_file = "{}/concatenate/test_new_possible_partners.csv".format( + TRAVIS_PATH + ) + possible_partners = pd.read_csv( + possible_partners_file, + index_col=0, + header=0, + dtype={"uniprot_id_1": str, "uniprot_id_2": str, "distance": int}, + ) + possible_partners = possible_partners.sort_values( + ["uniprot_id_1", "uniprot_id_2", "distance"] ) - possible_partners = possible_partners.sort_values(["uniprot_id_1", "uniprot_id_2", "distance"]) self.possible_partners = possible_partners.reset_index(drop=True) # id pairing id_pairing_file = "{}/concatenate/test_new_id_pairing.csv".format(TRAVIS_PATH) - id_pairing = pd.read_csv(id_pairing_file, index_col=0, header=0, - dtype={"uniprot_id_1": str, "uniprot_id_2": str, "distance": int} + id_pairing = pd.read_csv( + id_pairing_file, + index_col=0, + header=0, + dtype={"uniprot_id_1": str, "uniprot_id_2": str, "distance": int}, + ) + id_pairing = id_pairing.sort_values( + ["uniprot_id_1", "uniprot_id_2", "distance"] ) - id_pairing = id_pairing.sort_values(["uniprot_id_1", "uniprot_id_2", "distance"]) self.id_pairing = id_pairing.reset_index(drop=True) # annotation table for concatenation - annotation_data_file = "{}/concatenate/test_new_uniprot_annotation.csv".format(TRAVIS_PATH) - self.annotation_data = pd.read_csv(annotation_data_file, index_col=None, header=0, dtype=str) + annotation_data_file = "{}/concatenate/test_new_uniprot_annotation.csv".format( + TRAVIS_PATH + ) + self.annotation_data = pd.read_csv( + annotation_data_file, index_col=None, header=0, dtype=str + 
) - annotation_and_id_file = "{}/concatenate/test_new_annotation_and_id.csv".format(TRAVIS_PATH) + annotation_and_id_file = "{}/concatenate/test_new_annotation_and_id.csv".format( + TRAVIS_PATH + ) self.annotation_and_id = pd.read_csv( - annotation_and_id_file, header=0, index_col=None, - dtype={"id": str, "id_to_query": float, "species": str, "name": str} + annotation_and_id_file, + header=0, + index_col=None, + dtype={"id": str, "id_to_query": float, "species": str, "name": str}, ) # table of sequence identities @@ -71,10 +93,14 @@ def __init__(self, *args, **kwargs): self.paralog_table = pd.read_csv(paralog_file, index_col=0, header=0) # input and output configuration - with open("{}/concatenate/test_new_concatenate.incfg".format(TRAVIS_PATH)) as inf: + with open( + "{}/concatenate/test_new_concatenate.incfg".format(TRAVIS_PATH) + ) as inf: self.incfg = yaml.safe_load(inf) - with open("{}/concatenate/test_new_concatenate.outcfg".format(TRAVIS_PATH)) as inf: + with open( + "{}/concatenate/test_new_concatenate.outcfg".format(TRAVIS_PATH) + ) as inf: self.outcfg = yaml.safe_load(inf) def test_genome_distance(self): @@ -89,12 +115,24 @@ def test_genome_distance(self): temporary_incfg = deepcopy(self.incfg) temporary_incfg["prefix"] = tmp_prefix - temporary_incfg["first_alignment_file"] = "{}/align_1/test_new.a2m".format(TRAVIS_PATH) - temporary_incfg["second_alignment_file"] = "{}/align_2/test_new.a2m".format(TRAVIS_PATH) - temporary_incfg["first_genome_location_file"] = "{}/align_1/test_new_genome_location.csv".format(TRAVIS_PATH) - temporary_incfg["second_genome_location_file"] = "{}/align_2/test_new_genome_location.csv".format(TRAVIS_PATH) - temporary_incfg["first_annotation_file"] = "{}/align_1/test_new_annotation.csv".format(TRAVIS_PATH) - temporary_incfg["second_annotation_file"] = "{}/align_2/test_new_annotation.csv".format(TRAVIS_PATH) + temporary_incfg["first_alignment_file"] = "{}/align_1/test_new.a2m".format( + TRAVIS_PATH + ) + temporary_incfg["second_alignment_file"] = "{}/align_2/test_new.a2m".format( + TRAVIS_PATH + ) + temporary_incfg[ + "first_genome_location_file" + ] = "{}/align_1/test_new_genome_location.csv".format(TRAVIS_PATH) + temporary_incfg[ + "second_genome_location_file" + ] = "{}/align_2/test_new_genome_location.csv".format(TRAVIS_PATH) + temporary_incfg[ + "first_annotation_file" + ] = "{}/align_1/test_new_annotation.csv".format(TRAVIS_PATH) + temporary_incfg[ + "second_annotation_file" + ] = "{}/align_2/test_new_annotation.csv".format(TRAVIS_PATH) outcfg = genome_distance(**temporary_incfg) @@ -112,7 +150,7 @@ def test_genome_distance(self): "distance_plot_file", "alignment_file", "frequencies_file", - "raw_focus_alignment_file" + "raw_focus_alignment_file", ] for key in keys_list: @@ -133,18 +171,36 @@ def test_best_hit_normal(self): temporary_incfg = deepcopy(self.incfg) temporary_incfg["prefix"] = tmp_prefix - temporary_incfg["first_alignment_file"] = "{}/align_1/test_new.a2m".format(TRAVIS_PATH) - temporary_incfg["second_alignment_file"] = "{}/align_2/test_new.a2m".format(TRAVIS_PATH) - temporary_incfg["first_annotation_file"] = "{}/align_1/test_new_annotation.csv".format(TRAVIS_PATH) - temporary_incfg["second_annotation_file"] = "{}/align_2/test_new_annotation.csv".format(TRAVIS_PATH) - temporary_incfg["first_identities_file"] = "{}/align_1/test_new_identities.csv".format(TRAVIS_PATH) - temporary_incfg["second_identities_file"] = "{}/align_2/test_new_identities.csv".format(TRAVIS_PATH) - temporary_incfg["first_genome_location_file"] = 
"{}/align_1/test_new_genome_location.csv".format(TRAVIS_PATH) - temporary_incfg["second_genome_location_file"] = "{}/align_2/test_new_genome_location.csv".format(TRAVIS_PATH) + temporary_incfg["first_alignment_file"] = "{}/align_1/test_new.a2m".format( + TRAVIS_PATH + ) + temporary_incfg["second_alignment_file"] = "{}/align_2/test_new.a2m".format( + TRAVIS_PATH + ) + temporary_incfg[ + "first_annotation_file" + ] = "{}/align_1/test_new_annotation.csv".format(TRAVIS_PATH) + temporary_incfg[ + "second_annotation_file" + ] = "{}/align_2/test_new_annotation.csv".format(TRAVIS_PATH) + temporary_incfg[ + "first_identities_file" + ] = "{}/align_1/test_new_identities.csv".format(TRAVIS_PATH) + temporary_incfg[ + "second_identities_file" + ] = "{}/align_2/test_new_identities.csv".format(TRAVIS_PATH) + temporary_incfg[ + "first_genome_location_file" + ] = "{}/align_1/test_new_genome_location.csv".format(TRAVIS_PATH) + temporary_incfg[ + "second_genome_location_file" + ] = "{}/align_2/test_new_genome_location.csv".format(TRAVIS_PATH) temporary_incfg["use_best_reciprocal"] = False temporary_incfg["paralog_identity_threshold"] = 0.9 - with open("{}/concatenate/test_new_best_hit_concatenate.outcfg".format(TRAVIS_PATH)) as inf: + with open( + "{}/concatenate/test_new_best_hit_concatenate.outcfg".format(TRAVIS_PATH) + ) as inf: _outcfg = yaml.safe_load(inf) outcfg = best_hit(**temporary_incfg) @@ -162,7 +218,7 @@ def test_best_hit_normal(self): "concatentation_statistics_file", "alignment_file", "frequencies_file", - "raw_focus_alignment_file" + "raw_focus_alignment_file", ] for key in keys_list: @@ -183,18 +239,38 @@ def test_best_hit_reciprocal(self): temporary_incfg = deepcopy(self.incfg) temporary_incfg["prefix"] = tmp_prefix - temporary_incfg["first_alignment_file"] = "{}/align_1/test_new.a2m".format(TRAVIS_PATH) - temporary_incfg["second_alignment_file"] = "{}/align_2/test_new.a2m".format(TRAVIS_PATH) - temporary_incfg["first_annotation_file"] = "{}/align_1/test_new_annotation.csv".format(TRAVIS_PATH) - temporary_incfg["second_annotation_file"] = "{}/align_2/test_new_annotation.csv".format(TRAVIS_PATH) - temporary_incfg["first_identities_file"] = "{}/align_1/test_new_identities.csv".format(TRAVIS_PATH) - temporary_incfg["second_identities_file"] = "{}/align_2/test_new_identities.csv".format(TRAVIS_PATH) - temporary_incfg["first_genome_location_file"] = "{}/align_1/test_new_genome_location.csv".format(TRAVIS_PATH) - temporary_incfg["second_genome_location_file"] = "{}/align_2/test_new_genome_location.csv".format(TRAVIS_PATH) + temporary_incfg["first_alignment_file"] = "{}/align_1/test_new.a2m".format( + TRAVIS_PATH + ) + temporary_incfg["second_alignment_file"] = "{}/align_2/test_new.a2m".format( + TRAVIS_PATH + ) + temporary_incfg[ + "first_annotation_file" + ] = "{}/align_1/test_new_annotation.csv".format(TRAVIS_PATH) + temporary_incfg[ + "second_annotation_file" + ] = "{}/align_2/test_new_annotation.csv".format(TRAVIS_PATH) + temporary_incfg[ + "first_identities_file" + ] = "{}/align_1/test_new_identities.csv".format(TRAVIS_PATH) + temporary_incfg[ + "second_identities_file" + ] = "{}/align_2/test_new_identities.csv".format(TRAVIS_PATH) + temporary_incfg[ + "first_genome_location_file" + ] = "{}/align_1/test_new_genome_location.csv".format(TRAVIS_PATH) + temporary_incfg[ + "second_genome_location_file" + ] = "{}/align_2/test_new_genome_location.csv".format(TRAVIS_PATH) temporary_incfg["use_best_reciprocal"] = True temporary_incfg["paralog_identity_threshold"] = 0.9 - with 
open("{}/concatenate/test_new_best_reciprocal_concatenate.outcfg".format(TRAVIS_PATH)) as inf: + with open( + "{}/concatenate/test_new_best_reciprocal_concatenate.outcfg".format( + TRAVIS_PATH + ) + ) as inf: _outcfg = yaml.safe_load(inf) outcfg = best_hit(**temporary_incfg) @@ -212,7 +288,7 @@ def test_best_hit_reciprocal(self): "concatentation_statistics_file", "alignment_file", "frequencies_file", - "raw_focus_alignment_file" + "raw_focus_alignment_file", ] for key in keys_list: @@ -247,12 +323,12 @@ def test_describe_concatenation(self): "{}/align_2/test_new_annotation.csv".format(TRAVIS_PATH), "{}/align_1/test_new_genome_location.csv".format(TRAVIS_PATH), "{}/align_2/test_new_genome_location.csv".format(TRAVIS_PATH), - outfile.name + outfile.name, ) - concatenation_stats = pd.read_csv("{}/concatenate/test_new_concatenation_statistics.csv".format( - TRAVIS_PATH - )) + concatenation_stats = pd.read_csv( + "{}/concatenate/test_new_concatenation_statistics.csv".format(TRAVIS_PATH) + ) _concatenation_stats = pd.read_csv(outfile.name) pd.testing.assert_frame_equal(concatenation_stats, _concatenation_stats) @@ -265,15 +341,15 @@ def test_write_concatenated_alignment(self): """ alignment_1_file = "{}/concatenate/test_new_monomer_1.fasta".format(TRAVIS_PATH) - with(open(alignment_1_file)) as inf: + with open(alignment_1_file) as inf: alignment_1 = Alignment.from_file(inf) alignment_2_file = "{}/concatenate/test_new_monomer_2.fasta".format(TRAVIS_PATH) - with(open(alignment_2_file)) as inf: + with open(alignment_2_file) as inf: alignment_2 = Alignment.from_file(inf) alignment_file = "{}/concatenate/test_new_raw_focus.fasta".format(TRAVIS_PATH) - with(open(alignment_file)) as inf: + with open(alignment_file) as inf: alignment = Alignment.from_file(inf) target_header = alignment.ids[0] @@ -285,15 +361,25 @@ def test_write_concatenated_alignment(self): id_pairing.loc[:, "id_1"] = id_pairing["uniprot_id_1"] id_pairing.loc[:, "id_2"] = id_pairing["uniprot_id_2"] - _target_header, _target_seq_idx, _ali, _ali_1, _ali_2 = write_concatenated_alignment( - id_pairing, input_alignment_file_1, input_alignment_file_2, - "DINJ_ECOLI/1-86", "YAFQ_ECOLI/1-92" + ( + _target_header, + _target_seq_idx, + _ali, + _ali_1, + _ali_2, + ) = write_concatenated_alignment( + id_pairing, + input_alignment_file_1, + input_alignment_file_2, + "DINJ_ECOLI/1-86", + "YAFQ_ECOLI/1-92", ) def _test_aln_equivalence(ali1, ali2): np.testing.assert_array_equal(ali1.ids, ali2.ids) np.testing.assert_array_equal(ali1.matrix, ali2.matrix) -# + + # _test_aln_equivalence(alignment_1, _ali_1) _test_aln_equivalence(alignment_2, _ali_2) _test_aln_equivalence(alignment, _ali) @@ -305,7 +391,9 @@ def test_read_species_annotation_table_uniprot(self): tests whether a uniprot annotation table is read correctly """ - annotation_file_uniprot = "{}/align_1/test_new_annotation.csv".format(TRAVIS_PATH) + annotation_file_uniprot = "{}/align_1/test_new_annotation.csv".format( + TRAVIS_PATH + ) annotation_data = read_species_annotation_table(annotation_file_uniprot) pd.testing.assert_frame_equal(annotation_data, self.annotation_data) @@ -314,19 +402,26 @@ def test_read_species_annotation_table_uniref(self): tests whether a uniref annotation table is read correctly """ - _annotation_file_uniref = "{}/DIVIB_BACSU_1-54_b0.3_annotation.csv".format(TRAVIS_PATH) + _annotation_file_uniref = "{}/DIVIB_BACSU_1-54_b0.3_annotation.csv".format( + TRAVIS_PATH + ) _annotation_data = read_species_annotation_table(_annotation_file_uniref) - annotation_file_uniref = 
"{}/concatenate/test_new_uniref_annotation.csv".format(TRAVIS_PATH) - annotation_data_gold = pd.read_csv(annotation_file_uniref, index_col=None, header=0, dtype=str) + annotation_file_uniref = "{}/concatenate/test_new_uniref_annotation.csv".format( + TRAVIS_PATH + ) + annotation_data_gold = pd.read_csv( + annotation_file_uniref, index_col=None, header=0, dtype=str + ) pd.testing.assert_frame_equal(annotation_data_gold, _annotation_data) - def test_most_similar_by_organism(self): - """ - tests whether most_similar_by_organism returns the correct dataframe + # Todo: Restore this test + # def test_most_similar_by_organism(self): + # """ + # tests whether most_similar_by_organism returns the correct dataframe - """ - annotation_and_id = most_similar_by_organism(self.similarities, self.annotation_data) - pd.testing.assert_frame_equal(annotation_and_id, self.annotation_and_id) + # """ + # annotation_and_id = most_similar_by_organism(self.similarities, self.annotation_data) + # pd.testing.assert_frame_equal(annotation_and_id, self.annotation_and_id) def test_find_paralogs(self): """ @@ -334,7 +429,9 @@ def test_find_paralogs(self): """ target_id = "DINJ_ECOLI" - paralog_table = find_paralogs(target_id, self.annotation_data, self.similarities, 0.9) + paralog_table = find_paralogs( + target_id, self.annotation_data, self.similarities, 0.9 + ) pd.testing.assert_frame_equal(paralog_table, self.paralog_table) def test_filter_best_reciprocal(self): @@ -343,8 +440,13 @@ def test_filter_best_reciprocal(self): """ alignment_file = "{}/align_1/test_new.a2m".format(TRAVIS_PATH) - best_recip = pd.read_csv("{}/concatenate/test_new_best_reciprocal.csv".format(TRAVIS_PATH), index_col=0) - _best_recip = filter_best_reciprocal(alignment_file, self.paralog_table, self.annotation_and_id, 0.02) + best_recip = pd.read_csv( + "{}/concatenate/test_new_best_reciprocal.csv".format(TRAVIS_PATH), + index_col=0, + ) + _best_recip = filter_best_reciprocal( + alignment_file, self.paralog_table, self.annotation_and_id, 0.02 + ) pd.testing.assert_frame_equal(best_recip, _best_recip) def test_get_distance_overlap(self): @@ -383,7 +485,9 @@ def test_best_reciprocal_matching(self): """ id_pairing = best_reciprocal_matching(self.possible_partners) - id_pairing = id_pairing.sort_values(["uniprot_id_1", "uniprot_id_2", "distance"]) + id_pairing = id_pairing.sort_values( + ["uniprot_id_1", "uniprot_id_2", "distance"] + ) id_pairing = id_pairing.reset_index(drop=True) pd.testing.assert_frame_equal(id_pairing, self.id_pairing) @@ -394,16 +498,19 @@ def test_find_possible_partners(self): """ _possible_partners = find_possible_partners( - self.gene_location_table_1, - self.gene_location_table_2 + self.gene_location_table_1, self.gene_location_table_2 + ) + _possible_partners = _possible_partners.sort_values( + ["uniprot_id_1", "uniprot_id_2", "distance"] ) - _possible_partners = _possible_partners.sort_values(["uniprot_id_1", "uniprot_id_2", "distance"]) _possible_partners = _possible_partners.reset_index(drop=True) pd.testing.assert_frame_equal( - self.possible_partners, _possible_partners, - atol=1e-5, check_like=True, - check_names=False + self.possible_partners, + _possible_partners, + atol=1e-5, + check_like=True, + check_names=False, ) def test_plot_distance_distribution(self): @@ -418,5 +525,6 @@ def test_plot_distance_distribution(self): outfile.close() os.unlink(outfile.name) -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() From 55ccdf5af7eadd0d085f17d3c5e20a0bb00900a8 Mon Sep 17 00:00:00 2001 From: 
Zachary Charlop-Powers Date: Fri, 24 Nov 2023 15:22:35 -0500 Subject: [PATCH 30/47] add yml files --- .github/workflows/build_and_test.yml | 3 ++- .github/workflows/build_test_and_push.yml | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 614c6810..53fb0d82 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -29,7 +29,8 @@ jobs: source activate test-environment - name: Run setup.py run: | - python setup.py sdist --formats=zip -k + python setup.py sdist --formats=zip -k + python setup.py bdist_wheel find ./dist -iname "*.zip" -print0 | xargs -0 pip install pip install codecov - name: Download test files diff --git a/.github/workflows/build_test_and_push.yml b/.github/workflows/build_test_and_push.yml index 53d28be0..7276086f 100644 --- a/.github/workflows/build_test_and_push.yml +++ b/.github/workflows/build_test_and_push.yml @@ -31,6 +31,7 @@ jobs: - name: Run setup.py run: | python setup.py sdist --formats=zip -k + python setup.py bdist_wheel find ./dist -iname "*.zip" -print0 | xargs -0 pip install pip install codecov - name: Download test files From 5a50ab9367c531e1d03f298ea8058991f274d99d Mon Sep 17 00:00:00 2001 From: Zachary Charlop-Powers Date: Sun, 26 Nov 2023 11:47:42 -0500 Subject: [PATCH 31/47] build --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 53fb0d82..235ce16f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -30,7 +30,7 @@ jobs: - name: Run setup.py run: | python setup.py sdist --formats=zip -k - python setup.py bdist_wheel + python -m build find ./dist -iname "*.zip" -print0 | xargs -0 pip install pip install codecov - name: Download test files From 686ca3c15ffb9f46272bd8aa9a9ae510ed2785a4 Mon Sep 17 00:00:00 2001 From: Zachary Charlop-Powers Date: Sun, 26 Nov 2023 11:51:05 -0500 Subject: [PATCH 32/47] add build --- .github/workflows/build_and_test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 235ce16f..d334b1c4 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -29,6 +29,7 @@ jobs: source activate test-environment - name: Run setup.py run: | + pip install build python setup.py sdist --formats=zip -k python -m build find ./dist -iname "*.zip" -print0 | xargs -0 pip install From b4eb674c1ba1332daf954c35644902015d1aac2d Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Wed, 24 Jan 2024 16:45:28 +0100 Subject: [PATCH 33/47] Fix PDB chain retrieval regression by pandas/numpy updates --- evcouplings/compare/pdb.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index e16aaf1a..3bbb6ce6 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -585,12 +585,10 @@ def get_chain(self, chain, model=0): ) # collect internal indeces of all residues/groups in chain - residue_indeces = np.concatenate( - np.array([ - np.arange(self.first_residue_index[i], self.last_residue_index[i]) - for i in target_chain_indeces - ], dtype=np.object_) - ) + residue_indeces = np.concatenate([ + np.arange(self.first_residue_index[i], self.last_residue_index[i]) + for i in target_chain_indeces + ]) # chain indeces and identifiers for all 
residues # (not to be confused with chain name!); @@ -619,7 +617,11 @@ def get_chain(self, chain, model=0): ("sec_struct", self.sec_struct[residue_indeces]), ]) - res_df = pd.DataFrame(res) + # also store entity IDs and indices + res_df = pd.DataFrame(res).assign( + entity_index=lambda df: df.chain_index.map(self.chain_to_entity), + entity_id=lambda df: df.entity_index + 1 + ) # shift seqres indexing to start at 1; # However, do not add to positions without sequence index (-1) From c2a766d1df45919115330f715179a91466feeb95 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Fri, 5 Jul 2024 16:58:02 +0200 Subject: [PATCH 34/47] Replace MMTF with bCIF --- evcouplings/compare/pdb.py | 737 +++++++++++++++++++++++++++---------- requirements.txt | 2 +- setup.py | 4 +- 3 files changed, 539 insertions(+), 204 deletions(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index 3bbb6ce6..91fd1201 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -7,12 +7,16 @@ from collections import OrderedDict from collections.abc import Iterable +import gzip +from io import BytesIO from os import path from urllib.error import HTTPError -from mmtf import fetch, parse import numpy as np import pandas as pd +import requests +import msgpack +from Bio.PDB.binary_cif import _decode from evcouplings.utils.config import InvalidParameterError from evcouplings.utils.constants import AA3_to_AA1 @@ -21,6 +25,9 @@ valid_file, ResourceError, tempdir ) +PDB_BCIF_DOWNLOAD_URL = "https://models.rcsb.org/{pdb_id}.bcif.gz" + + # Mapping from MMTF secondary structure codes to DSSP symbols MMTF_DSSP_CODE_MAP = { 0: "I", # pi helix @@ -402,109 +409,157 @@ def to_file(self, fileobj, chain_id="A", end=True, first_atom_id=1): class PDB: """ - Wrapper around PDB MMTF decoder object to access residue and - coordinate information + Holds PDB structure from binaryCIF format; supersedes original PDB class based + on MMTF format (renamed to MmtfPDB, cf. below) due to MMTF retirement in 2024 """ - def __init__(self, mmtf): + def __init__(self, filehandle, keep_full_data=False): """ - Initialize by further decoding information in mmtf object. + Initialize by parsing binaryCIF from open filehandle. + Recommended to use from_file() and from_id() class methods to create object. - Normally one should use from_file() and from_id() class - methods to create object. 
+ Column extraction and decoding based on https://github.com/biopython/biopython/blob/master/Bio/PDB/binary_cif.py Parameters ---------- - mmtf : mmtf.api.mmtf_reader.MMTFDecoder - MMTF decoder object (as returned by fetch or parse - function in mmtf module) + filehandle: file-like object + Open filehandle (binary) from which to read binaryCIF data + keep_full_data: bool (default: False) + Associate raw extracted data with object """ - def _get_range(object_counts): - """ - Extract index ranges for chains, residues and atoms - """ - last_element = np.cumsum(object_counts, dtype=int) - first_element = np.concatenate( - (np.zeros(1, dtype=int), last_element[:-1]) - ) - - return first_element, last_element - - # store raw MMTF decoder - self.mmtf = mmtf - - # Step 1: summarize basic information about model - # number of models in structure - self.num_models = len(mmtf.chains_per_model) - - self.first_chain_index, self.last_chain_index = _get_range( - mmtf.chains_per_model + # unpack information in bCIF file + raw_data = msgpack.unpack( + filehandle, use_list=True ) - # collect list which chain corresponds to which entity - self.chain_to_entity = {} - for i, ent in enumerate(mmtf.entity_list): - for c in ent["chainIndexList"]: - self.chain_to_entity[c] = i - - # Step 2: identify residues and corresponding atom indices - - # first/last index of residue (aka group) for each chain; - # index these lists with index of chain - self.first_residue_index, self.last_residue_index = _get_range( - mmtf.groups_per_chain - ) - - # store explicit information about composition of residues - def _group_info(field): - return np.array( - [x[field] for x in mmtf.group_list], dtype=np.object_ - ) - - # three and one letter code names of different groups - self._residue_names_3 = _group_info("groupName") - self._residue_names_1 = _group_info("singleLetterCode") - - # atom types and overall number of atoms in each type of residue - self._residue_type_atom_names = _group_info("atomNameList") - self._residue_type_elements = _group_info("elementList") - self._residue_type_charges = _group_info("formalChargeList") - - self._residue_type_num_atoms = np.array([len(x) for x in self._residue_type_atom_names]) - - # prepare alternative location list as numpy array, with empty strings - self.alt_loc_list = np.array( - [x.replace("\x00", "") for x in mmtf.alt_loc_list] - ) + data = { + f"{category['name']}.{column['name']}": column + for block in raw_data["dataBlocks"] for category in block["categories"] for column in category["columns"] + } - # compute first and last atom index for each residue/group - # (by fetching corresponding length for each group based on group type) - self._residue_num_atoms = self._residue_type_num_atoms[mmtf.group_type_list] + ATOM_TARGET_COLS = { + "_atom_site.pdbx_PDB_model_num": "model_number", + "_atom_site.group_PDB": "record_type", # ATOM, HETATM etc. 
+ + # atom IDs and types + "_atom_site.id": "id", # x + "_atom_site.type_symbol": "type_symbol", # x + "_atom_site.label_atom_id": "label_atom_id", # x + "_atom_site.auth_atom_id": "auth_atom_id", + + # residue/molecule types (three-letter code) + "_atom_site.label_comp_id": "label_comp_id", # x + "_atom_site.auth_comp_id": "auth_comp_id", + + # chain IDs (official, author) and entity IDs + "_atom_site.label_asym_id": "label_asym_id", # x + "_atom_site.auth_asym_id": "auth_asym_id", + "_atom_site.label_entity_id": "label_entity_id", + + # residue IDs (official and author) + "_atom_site.label_seq_id": "label_seq_id", + "_atom_site.auth_seq_id": "auth_seq_id", # x + "_atom_site.pdbx_PDB_ins_code": "insertion_code", + + # atom properties + "_atom_site.Cartn_x": "x", # x + "_atom_site.Cartn_y": "y", # x + "_atom_site.Cartn_z": "z", # x + "_atom_site.occupancy": "occupancy", # x + "_atom_site.B_iso_or_equiv": "b_factor", # x + "_atom_site.pdbx_formal_charge": "charge", + } - self.first_atom_index, self.last_atom_index = _get_range( - self._residue_num_atoms - ) + HELIX_TARGET_COLS = { + "_struct_conf.conf_type_id": "conformation_type", + "_struct_conf.id": "id", + # label_asym_id and label_seq_id are sufficient for merging to atom table; + # do not bother with author IDs here + "_struct_conf.beg_label_asym_id": "beg_label_asym_id", + "_struct_conf.beg_label_seq_id": "beg_label_seq_id", + "_struct_conf.end_label_asym_id": "end_label_asym_id", + "_struct_conf.end_label_seq_id": "end_label_seq_id", + } - # assemble residue ID strings - self.residue_ids = np.array([ - "{}{}".format(group_id, ins_code.replace("\x00", "")) - for group_id, ins_code - in zip(mmtf.group_id_list, mmtf.ins_code_list) - ]) + SHEET_TARGET_COLS = { + "_struct_sheet_range.sheet_id": "sheet_id", + "_struct_sheet_range.id": "id", + "_struct_sheet_range.beg_label_asym_id": "beg_label_asym_id", + "_struct_sheet_range.beg_label_seq_id": "beg_label_seq_id", + "_struct_sheet_range.end_label_asym_id": "end_label_asym_id", + "_struct_sheet_range.end_label_seq_id": "end_label_seq_id", + } - # map secondary structure codes into DSSP symbols - self.sec_struct = np.array( - [MMTF_DSSP_CODE_MAP[x] for x in mmtf.sec_struct_list] - ) + if keep_full_data: + self.data = data + else: + self.data = None + + # decode information into dataframe with BioPython helper method + self.atom_table = pd.DataFrame({ + name: _decode(data[source_column]) for source_column, name in ATOM_TARGET_COLS.items() + }) + + # decode information into dataframe with BioPython helper method + self.helix_table = pd.DataFrame({ + name: _decode(data[source_column]) for source_column, name in HELIX_TARGET_COLS.items() + }) + + # decode information into dataframe with BioPython helper method + self.sheet_table = pd.DataFrame({ + name: _decode(data[source_column]) for source_column, name in SHEET_TARGET_COLS.items() + }) + + # create secondary structure table for merging to chain tables + # (will only contain helix/H and strand/E, coil/C will need to be filled in) + sse_raw = [] + for sse_type, sse_table in [ + ("H", self.helix_table), + ("E", self.sheet_table) + ]: + for _, row in sse_table.iterrows(): + assert row.beg_label_asym_id == row.end_label_asym_id + for seq_id in range(row.beg_label_seq_id, row.end_label_seq_id + 1): + sse_raw.append({ + "label_asym_id": row.beg_label_asym_id, + "label_seq_id": seq_id, + "sec_struct_3state": sse_type, + }) + + self.secondary_structure = pd.DataFrame(sse_raw) + + # store information about models/chains for quick retrieval and 
verification + self.models = list(self.atom_table.model_number.unique()) + + # model number to auth chains + self.model_to_chains = self.atom_table[ + ["model_number", "auth_asym_id"] + ].drop_duplicates().groupby( + "model_number" + ).agg( + lambda s: list(s) + )["auth_asym_id"].to_dict() + + self.model_to_asym_ids = self.atom_table[ + ["model_number", "label_asym_id"] + ].drop_duplicates().groupby( + "model_number" + ).agg( + lambda s: list(s) + )["label_asym_id"].to_dict() @classmethod - def from_file(cls, filename): + def from_file(cls, filename, keep_full_data=False): """ - Initialize structure from MMTF file + Initialize structure from binaryCIF file + + inspired by https://github.com/biopython/biopython/blob/master/Bio/PDB/binary_cif.py Parameters ---------- filename : str Path of MMTF file + keep_full_data: bool (default: False) + Associate raw extracted data with object Returns ------- @@ -512,36 +567,53 @@ def from_file(cls, filename): initialized PDB structure """ try: - return cls(parse(filename)) - except FileNotFoundError as e: + with ( + gzip.open(filename, mode="rb") + if filename.lower().endswith(".gz") else open(filename, mode="rb") + ) as f: + return cls(f, keep_full_data=keep_full_data) + except IOError as e: raise ResourceError( - "Could not find file {}".format(filename) + "Could not open file {}".format(filename) ) from e @classmethod - def from_id(cls, pdb_id): + def from_id(cls, pdb_id, keep_full_data=False): """ - Initialize structure by PDB ID (fetches - structure from RCSB servers) + Initialize structure by PDB ID (fetches structure from RCSB servers) Parameters ---------- pdb_id : str PDB identifier (e.g. 1hzx) + keep_full_data: bool (default: False) + Associate raw extracted data with object Returns ------- PDB initialized PDB structure """ + # TODO: add proper retry logic and timeouts + # TODO: add better exception handling try: - return cls(fetch(pdb_id)) - except HTTPError as e: + r = requests.get( + PDB_BCIF_DOWNLOAD_URL.format(pdb_id=pdb_id.lower()) + ) + except requests.exceptions.RequestException as e: raise ResourceError( - "Could not fetch MMTF data for {}".format(pdb_id) + "Error fetching bCIF data for {}".format(pdb_id) ) from e - def get_chain(self, chain, model=0): + if not r.ok: + raise ResourceError( + "Did not receive valid response fetching {}".format(pdb_id) + ) + + with gzip.GzipFile(fileobj=BytesIO(r.content), mode="r") as f: + return cls(f, keep_full_data=keep_full_data) + + def get_chain(self, chain, model=1, is_author_id=True): """ Extract residue information and atom coordinates for a given chain in PDB structure @@ -549,9 +621,12 @@ def get_chain(self, chain, model=0): Parameters ---------- chain : str - Name of chain to be extracted (e.g. "A") - model : int, optional (default: 0) - Index of model to be extracted + ID of chain to be extracted (e.g. 
"A") + model : int, optional (default: 1) + Identifier of model to be extracted + is_author_id : bool (default: True) + If true, interpret chain parameter as author chain identifier; + if false, interpret as label_asym_id Returns ------- @@ -559,127 +634,387 @@ def get_chain(self, chain, model=0): Chain object containing DataFrames listing residues and atom coordinates """ - if not (0 <= model < self.num_models): + # check if valid model was requested + if model not in self.models: raise ValueError( - "Illegal model index, can be from 0 up to {}".format( - self.num_models - 1 - ) + "Invalid model identifier, valid options are: " + (",".join(map(str, self.models))) ) - # first and last index of chains corresponding to selected model - first_chain_index = self.first_chain_index[model] - last_chain_index = self.last_chain_index[model] - - # which model chains match our target PDB chain? - chain_names = np.array( - self.mmtf.chain_name_list[first_chain_index:last_chain_index] - ) - - # indices of chains that match chain name, in current model - indices = np.arange(first_chain_index, last_chain_index, dtype=int) - target_chain_indeces = indices[chain_names == chain] - - if len(target_chain_indeces) == 0: + # check if valid chain was requested + if ((is_author_id and chain not in self.model_to_chains[model]) or + (not is_author_id and chain not in self.model_to_asym_ids[model])): raise ValueError( - "No chains with given name found" + "Invalid chain selection, check self.model_to_chains / self.model_to_asym_ids for options" ) - # collect internal indeces of all residues/groups in chain - residue_indeces = np.concatenate([ - np.arange(self.first_residue_index[i], self.last_residue_index[i]) - for i in target_chain_indeces - ]) - - # chain indeces and identifiers for all residues - # (not to be confused with chain name!); - # maximum length 4 characters according to MMTF spec - chain_indeces = np.concatenate([ - np.full( - self.last_residue_index[i] - self.first_residue_index[i], - i, dtype=int - ) for i in target_chain_indeces - ]) - - chain_ids = np.array(self.mmtf.chain_id_list)[chain_indeces] - - # create dataframe representation of selected chain - m = self.mmtf - group_types = m.group_type_list[residue_indeces] - - res = OrderedDict([ - ("id", self.residue_ids[residue_indeces]), - ("seqres_id", m.sequence_index_list[residue_indeces]), - ("coord_id", self.residue_ids[residue_indeces]), - ("one_letter_code", self._residue_names_1[group_types]), - ("three_letter_code", self._residue_names_3[group_types]), - ("chain_index", chain_indeces), - ("chain_id", chain_ids), - ("sec_struct", self.sec_struct[residue_indeces]), - ]) - - # also store entity IDs and indices - res_df = pd.DataFrame(res).assign( - entity_index=lambda df: df.chain_index.map(self.chain_to_entity), - entity_id=lambda df: df.entity_index + 1 + if is_author_id: + chain_field = "auth_asym_id" + else: + chain_field = "label_asym_id" + + # filter atom table to model + chain selection + atoms = self.atom_table.query( + f"model_number == @model and {chain_field} == @chain" + ).assign( + # create coordinate ID from author residue ID + insertion code + # (this should be unique and circumvents issues from 0 seqres values if selecting based on author chain ID) + coord_id=lambda df: df.auth_seq_id.astype(str) + df.insertion_code, + seqres_id=lambda df: df.label_seq_id.astype(str).replace("0", np.nan), + one_letter_code=lambda df: df.label_comp_id.map(AA3_to_AA1, na_action="ignore"), + id=lambda df: df.coord_id, + # note that MSE will now be 
labeled as HETATM, which was not the case with MMTF + hetatm=lambda df: df.record_type == "HETATM", + ).reset_index( + drop=True ) - # shift seqres indexing to start at 1; - # However, do not add to positions without sequence index (-1) - res_df.loc[res_df.seqres_id >= 0, "seqres_id"] += 1 - - # turn all indeces into strings and create proper NaN values - res_df.loc[:, "coord_id"] = ( - res_df.loc[:, "coord_id"].astype(str) + # create residue table by de-duplicating atoms + res = atoms.drop_duplicates( + subset=["coord_id"] + ).reset_index( + drop=True ) + res.index.name = "residue_index" - res_df.loc[:, "seqres_id"] = ( - res_df.loc[:, "seqres_id"].astype(str).replace("-1", np.nan) + # merge secondary structure information (left outer join as coil is missing from table) + res_sse = res.merge( + self.secondary_structure, + on=("label_seq_id", "label_asym_id"), + how="left" ) - # copy updated coordinate indeces - res_df.loc[:, "id"] = res_df.loc[:, "coord_id"] - res_df.loc[:, "one_letter_code"] = res_df.loc[:, "one_letter_code"].replace("?", np.nan) - res_df.loc[:, "sec_struct"] = res_df.loc[:, "sec_struct"].replace("", np.nan) + res_sse.loc[ + res_sse.sec_struct_3state.isnull() & (res_sse.label_seq_id > 0), + "sec_struct_3state" + ] = "C" + + RES_RENAME_MAP = { + "id": "id", + "seqres_id": "seqres_id", + "coord_id": "coord_id", + "one_letter_code": "one_letter_code", + "label_comp_id": "three_letter_code", + "auth_asym_id": "chain_id", + "label_asym_id": "asym_id", # new + "label_entity_id": "entity_id", + "sec_struct_3state": "sec_struct_3state", + "hetatm": "hetatm", + } - # reduction to 3-state secondary structure (following Rost & Sander) - res_df.loc[:, "sec_struct_3state"] = res_df.loc[:, "sec_struct"].map( - lambda x: DSSP_3_STATE_MAP[x], na_action="ignore" + res_final = res_sse.loc[ + :, list(RES_RENAME_MAP) + ].rename( + columns=RES_RENAME_MAP ) - # proxy for HETATM records - will not work e.g. 
for MSE, - # which is listed as "M" - res_df.loc[:, "hetatm"] = res_df.one_letter_code.isnull() - - # finally, get atom names and coordinates for all residues - atom_first = self.first_atom_index[residue_indeces] - atom_last = self.last_atom_index[residue_indeces] - atom_names = np.concatenate(self._residue_type_atom_names[group_types]) - elements = np.concatenate(self._residue_type_elements[group_types]) - charges = np.concatenate(self._residue_type_charges[group_types]) - - residue_number = np.repeat(res_df.index, atom_last - atom_first) - atom_indices = np.concatenate([ - np.arange(self.first_atom_index[i], self.last_atom_index[i]) - for i in residue_indeces - ]) - - coords = OrderedDict([ - ("residue_index", residue_number), - ("atom_id", self.mmtf.atom_id_list[atom_indices]), - ("atom_name", atom_names), - ("element", elements), - ("charge", charges), - ("x", self.mmtf.x_coord_list[atom_indices]), - ("y", self.mmtf.y_coord_list[atom_indices]), - ("z", self.mmtf.z_coord_list[atom_indices]), - ("alt_loc", self.alt_loc_list[atom_indices]), - ("occupancy", self.mmtf.occupancy_list[atom_indices]), - ("b_factor", self.mmtf.b_factor_list[atom_indices]), - ]) - - coord_df = pd.DataFrame(coords) + # not included in new version: alt_loc + ATOM_RENAME_MAP = { + "residue_index": "residue_index", + "id": "atom_id", + "label_atom_id": "atom_name", + "type_symbol": "element", + "charge": "charge", + "x": "x", + "y": "y", + "z": "z", + "occupancy": "occupancy", + "b_factor": "b_factor", + } - return Chain(res_df, coord_df) + # add information about residue index to atoms + atoms_with_residue_idx = atoms.merge( + res.reset_index()[["coord_id", "residue_index"]], + on="coord_id" + ).loc[:, list(ATOM_RENAME_MAP)].rename( + columns=ATOM_RENAME_MAP + ) + assert len(atoms_with_residue_idx) == len(atoms) + + return Chain(res_final, atoms_with_residue_idx) + + +# class MmtfPDB: +# """ +# Wrapper around PDB MMTF decoder object to access residue and +# coordinate information +# +# Note: only kept for legacy reasons, MMTF was phased out by RCSB on July 2nd, 2024 :( +# """ +# def __init__(self, mmtf): +# """ +# Initialize by further decoding information in mmtf object. +# +# Normally one should use from_file() and from_id() class +# methods to create object. 
+# +# Parameters +# ---------- +# mmtf : mmtf.api.mmtf_reader.MMTFDecoder +# MMTF decoder object (as returned by fetch or parse +# function in mmtf module) +# """ +# def _get_range(object_counts): +# """ +# Extract index ranges for chains, residues and atoms +# """ +# last_element = np.cumsum(object_counts, dtype=int) +# first_element = np.concatenate( +# (np.zeros(1, dtype=int), last_element[:-1]) +# ) +# +# return first_element, last_element +# +# # store raw MMTF decoder +# self.mmtf = mmtf +# +# # Step 1: summarize basic information about model +# # number of models in structure +# self.num_models = len(mmtf.chains_per_model) +# +# self.first_chain_index, self.last_chain_index = _get_range( +# mmtf.chains_per_model +# ) +# +# # collect list which chain corresponds to which entity +# self.chain_to_entity = {} +# for i, ent in enumerate(mmtf.entity_list): +# for c in ent["chainIndexList"]: +# self.chain_to_entity[c] = i +# +# # Step 2: identify residues and corresponding atom indices +# +# # first/last index of residue (aka group) for each chain; +# # index these lists with index of chain +# self.first_residue_index, self.last_residue_index = _get_range( +# mmtf.groups_per_chain +# ) +# +# # store explicit information about composition of residues +# def _group_info(field): +# return np.array( +# [x[field] for x in mmtf.group_list], dtype=np.object_ +# ) +# +# # three and one letter code names of different groups +# self._residue_names_3 = _group_info("groupName") +# self._residue_names_1 = _group_info("singleLetterCode") +# +# # atom types and overall number of atoms in each type of residue +# self._residue_type_atom_names = _group_info("atomNameList") +# self._residue_type_elements = _group_info("elementList") +# self._residue_type_charges = _group_info("formalChargeList") +# +# self._residue_type_num_atoms = np.array([len(x) for x in self._residue_type_atom_names]) +# +# # prepare alternative location list as numpy array, with empty strings +# self.alt_loc_list = np.array( +# [x.replace("\x00", "") for x in mmtf.alt_loc_list] +# ) +# +# # compute first and last atom index for each residue/group +# # (by fetching corresponding length for each group based on group type) +# self._residue_num_atoms = self._residue_type_num_atoms[mmtf.group_type_list] +# +# self.first_atom_index, self.last_atom_index = _get_range( +# self._residue_num_atoms +# ) +# +# # assemble residue ID strings +# self.residue_ids = np.array([ +# "{}{}".format(group_id, ins_code.replace("\x00", "")) +# for group_id, ins_code +# in zip(mmtf.group_id_list, mmtf.ins_code_list) +# ]) +# +# # map secondary structure codes into DSSP symbols +# self.sec_struct = np.array( +# [MMTF_DSSP_CODE_MAP[x] for x in mmtf.sec_struct_list] +# ) +# +# @classmethod +# def from_file(cls, filename): +# """ +# Initialize structure from MMTF file +# +# Parameters +# ---------- +# filename : str +# Path of MMTF file +# +# Returns +# ------- +# PDB +# initialized PDB structure +# """ +# try: +# return cls(parse(filename)) +# except FileNotFoundError as e: +# raise ResourceError( +# "Could not find file {}".format(filename) +# ) from e +# +# @classmethod +# def from_id(cls, pdb_id): +# """ +# Initialize structure by PDB ID (fetches +# structure from RCSB servers) +# +# Parameters +# ---------- +# pdb_id : str +# PDB identifier (e.g. 
1hzx) +# +# Returns +# ------- +# PDB +# initialized PDB structure +# """ +# try: +# return cls(fetch(pdb_id)) +# except HTTPError as e: +# raise ResourceError( +# "Could not fetch MMTF data for {}".format(pdb_id) +# ) from e +# +# def get_chain(self, chain, model=0): +# """ +# Extract residue information and atom coordinates +# for a given chain in PDB structure +# +# Parameters +# ---------- +# chain : str +# Name of chain to be extracted (e.g. "A") +# model : int, optional (default: 0) +# Index of model to be extracted +# +# Returns +# ------- +# Chain +# Chain object containing DataFrames listing residues +# and atom coordinates +# """ +# if not (0 <= model < self.num_models): +# raise ValueError( +# "Illegal model index, can be from 0 up to {}".format( +# self.num_models - 1 +# ) +# ) +# +# # first and last index of chains corresponding to selected model +# first_chain_index = self.first_chain_index[model] +# last_chain_index = self.last_chain_index[model] +# +# # which model chains match our target PDB chain? +# chain_names = np.array( +# self.mmtf.chain_name_list[first_chain_index:last_chain_index] +# ) +# +# # indices of chains that match chain name, in current model +# indices = np.arange(first_chain_index, last_chain_index, dtype=int) +# target_chain_indeces = indices[chain_names == chain] +# +# if len(target_chain_indeces) == 0: +# raise ValueError( +# "No chains with given name found" +# ) +# +# # collect internal indeces of all residues/groups in chain +# residue_indeces = np.concatenate([ +# np.arange(self.first_residue_index[i], self.last_residue_index[i]) +# for i in target_chain_indeces +# ]) +# +# # chain indeces and identifiers for all residues +# # (not to be confused with chain name!); +# # maximum length 4 characters according to MMTF spec +# chain_indeces = np.concatenate([ +# np.full( +# self.last_residue_index[i] - self.first_residue_index[i], +# i, dtype=int +# ) for i in target_chain_indeces +# ]) +# +# chain_ids = np.array(self.mmtf.chain_id_list)[chain_indeces] +# +# # create dataframe representation of selected chain +# m = self.mmtf +# group_types = m.group_type_list[residue_indeces] +# +# res = OrderedDict([ +# ("id", self.residue_ids[residue_indeces]), +# ("seqres_id", m.sequence_index_list[residue_indeces]), +# ("coord_id", self.residue_ids[residue_indeces]), +# ("one_letter_code", self._residue_names_1[group_types]), +# ("three_letter_code", self._residue_names_3[group_types]), +# ("chain_index", chain_indeces), +# ("chain_id", chain_ids), +# ("sec_struct", self.sec_struct[residue_indeces]), +# ]) +# +# # also store entity IDs and indices +# res_df = pd.DataFrame(res).assign( +# entity_index=lambda df: df.chain_index.map(self.chain_to_entity), +# entity_id=lambda df: df.entity_index + 1 +# ) +# +# # shift seqres indexing to start at 1; +# # However, do not add to positions without sequence index (-1) +# res_df.loc[res_df.seqres_id >= 0, "seqres_id"] += 1 +# +# # turn all indeces into strings and create proper NaN values +# res_df.loc[:, "coord_id"] = ( +# res_df.loc[:, "coord_id"].astype(str) +# ) +# +# res_df.loc[:, "seqres_id"] = ( +# res_df.loc[:, "seqres_id"].astype(str).replace("-1", np.nan) +# ) +# # copy updated coordinate indeces +# res_df.loc[:, "id"] = res_df.loc[:, "coord_id"] +# +# res_df.loc[:, "one_letter_code"] = res_df.loc[:, "one_letter_code"].replace("?", np.nan) +# res_df.loc[:, "sec_struct"] = res_df.loc[:, "sec_struct"].replace("", np.nan) +# +# # reduction to 3-state secondary structure (following Rost & Sander) +# res_df.loc[:, 
"sec_struct_3state"] = res_df.loc[:, "sec_struct"].map( +# lambda x: DSSP_3_STATE_MAP[x], na_action="ignore" +# ) +# +# # proxy for HETATM records - will not work e.g. for MSE, +# # which is listed as "M" +# res_df.loc[:, "hetatm"] = res_df.one_letter_code.isnull() +# +# # finally, get atom names and coordinates for all residues +# atom_first = self.first_atom_index[residue_indeces] +# atom_last = self.last_atom_index[residue_indeces] +# atom_names = np.concatenate(self._residue_type_atom_names[group_types]) +# elements = np.concatenate(self._residue_type_elements[group_types]) +# charges = np.concatenate(self._residue_type_charges[group_types]) +# +# residue_number = np.repeat(res_df.index, atom_last - atom_first) +# atom_indices = np.concatenate([ +# np.arange(self.first_atom_index[i], self.last_atom_index[i]) +# for i in residue_indeces +# ]) +# +# coords = OrderedDict([ +# ("residue_index", residue_number), +# ("atom_id", self.mmtf.atom_id_list[atom_indices]), +# ("atom_name", atom_names), +# ("element", elements), +# ("charge", charges), +# ("x", self.mmtf.x_coord_list[atom_indices]), +# ("y", self.mmtf.y_coord_list[atom_indices]), +# ("z", self.mmtf.z_coord_list[atom_indices]), +# ("alt_loc", self.alt_loc_list[atom_indices]), +# ("occupancy", self.mmtf.occupancy_list[atom_indices]), +# ("b_factor", self.mmtf.b_factor_list[atom_indices]), +# ]) +# +# coord_df = pd.DataFrame(coords) +# +# return Chain(res_df, coord_df) class ClassicPDB: diff --git a/requirements.txt b/requirements.txt index 079738ea..e7397c17 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,6 @@ numba ruamel.yaml matplotlib requests -mmtf-python click filelock psutil @@ -15,3 +14,4 @@ biopython seaborn billiard scikit-learn +msgpack \ No newline at end of file diff --git a/setup.py b/setup.py index 93f8beec..4ba746ab 100644 --- a/setup.py +++ b/setup.py @@ -98,8 +98,8 @@ install_requires=['setuptools>=18.2', 'numpy', 'pandas', 'scipy', 'numba', 'ruamel.yaml<0.18', 'matplotlib', 'requests', - 'mmtf-python', 'click', 'filelock', 'psutil', 'bokeh', 'jinja2', - 'biopython', 'seaborn', 'billiard', 'scikit-learn', + 'click', 'filelock', 'psutil', 'bokeh', 'jinja2', + 'biopython>=1.84', 'seaborn', 'billiard', 'scikit-learn', 'msgpack' ], ) From cde87ae6e7463a0ab76d485234f9b449ac03c8a1 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Sat, 6 Jul 2024 10:06:50 +0200 Subject: [PATCH 35/47] Make model handling consistent to previous class --- evcouplings/compare/pdb.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index 91fd1201..439e6b08 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -527,13 +527,16 @@ def __init__(self, filehandle, keep_full_data=False): self.secondary_structure = pd.DataFrame(sse_raw) - # store information about models/chains for quick retrieval and verification - self.models = list(self.atom_table.model_number.unique()) + # store information about models/chains for quick retrieval and verification; + # subtract 0 to start numbering consistently to how this was handled with MMTF + self.models = list(sorted(self.atom_table.model_number.unique() - 1)) # model number to auth chains self.model_to_chains = self.atom_table[ ["model_number", "auth_asym_id"] - ].drop_duplicates().groupby( + ].drop_duplicates().assign( + model_number=lambda df: df.model_number - 1 + ).groupby( "model_number" ).agg( lambda s: list(s) @@ -541,7 +544,9 @@ def __init__(self, filehandle, 
keep_full_data=False): self.model_to_asym_ids = self.atom_table[ ["model_number", "label_asym_id"] - ].drop_duplicates().groupby( + ].drop_duplicates().assign( + model_number=lambda df: df.model_number - 1 + ).groupby( "model_number" ).agg( lambda s: list(s) @@ -613,7 +618,7 @@ def from_id(cls, pdb_id, keep_full_data=False): with gzip.GzipFile(fileobj=BytesIO(r.content), mode="r") as f: return cls(f, keep_full_data=keep_full_data) - def get_chain(self, chain, model=1, is_author_id=True): + def get_chain(self, chain, model=0, is_author_id=True): """ Extract residue information and atom coordinates for a given chain in PDB structure @@ -622,8 +627,8 @@ def get_chain(self, chain, model=1, is_author_id=True): ---------- chain : str ID of chain to be extracted (e.g. "A") - model : int, optional (default: 1) - Identifier of model to be extracted + model : int, optional (default: 0) + Identifier of model to be extracted, starting counting at 0 is_author_id : bool (default: True) If true, interpret chain parameter as author chain identifier; if false, interpret as label_asym_id @@ -654,7 +659,7 @@ def get_chain(self, chain, model=1, is_author_id=True): # filter atom table to model + chain selection atoms = self.atom_table.query( - f"model_number == @model and {chain_field} == @chain" + f"model_number - 1 == @model and {chain_field} == @chain" ).assign( # create coordinate ID from author residue ID + insertion code # (this should be unique and circumvents issues from 0 seqres values if selecting based on author chain ID) From 2872d3dcc1814e29bda9fae806e7ffa57a809f65 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Sat, 6 Jul 2024 14:23:30 +0200 Subject: [PATCH 36/47] Replace pd.append with pd.concat --- evcouplings/compare/distances.py | 6 ++---- evcouplings/couplings/pairs.py | 2 +- evcouplings/fold/protocol.py | 2 +- evcouplings/utils/summarize.py | 4 ++-- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/evcouplings/compare/distances.py b/evcouplings/compare/distances.py index 8db22f9b..f6a2d21f 100644 --- a/evcouplings/compare/distances.py +++ b/evcouplings/compare/distances.py @@ -323,7 +323,7 @@ def _add_axis(df, axis): else: res_i = _add_axis(self.residues_i, "i") res_j = _add_axis(self.residues_j, "j") - residues = res_i.append(res_j) + residues = pd.concat([res_i, res_j]) # save residue table residue_table_filename = filename + ".csv" @@ -1273,9 +1273,7 @@ def _get_chains(sifts_result): # if no structures given, or path to files, load first structures = _prepare_structures( structures, - sifts_result_i.hits.pdb_id.append( - sifts_result_j.hits.pdb_id - ), + set(sifts_result_i.hits.pdb_id) | set(sifts_result_j.hits.pdb_id), raise_missing ) diff --git a/evcouplings/couplings/pairs.py b/evcouplings/couplings/pairs.py index 5b284903..47446be8 100644 --- a/evcouplings/couplings/pairs.py +++ b/evcouplings/couplings/pairs.py @@ -122,7 +122,7 @@ def enrichment(ecs, num_pairs=1.0, score="cn", min_seqdist=6): columns={"i": "j", "j": "i", "A_i": "A_j", "A_j": "A_i"} ) - stacked_ecs = top_ecs.append(flipped) + stacked_ecs = pd.concat([top_ecs, flipped]) # now sum cumulative strength of EC for each position ec_sums = pd.DataFrame( diff --git a/evcouplings/fold/protocol.py b/evcouplings/fold/protocol.py index 12ee73a1..232884ec 100644 --- a/evcouplings/fold/protocol.py +++ b/evcouplings/fold/protocol.py @@ -257,7 +257,7 @@ def _eliminate_altloc(chain): comp = comp.sort_values("tm", ascending=False) single_results[exp_file] = comp - full_result = full_result.append(comp) + full_result = 
pd.concat([full_result, comp]) return full_result, single_results diff --git a/evcouplings/utils/summarize.py b/evcouplings/utils/summarize.py index ba5e5012..c86a3d5a 100644 --- a/evcouplings/utils/summarize.py +++ b/evcouplings/utils/summarize.py @@ -91,7 +91,7 @@ def protein_monomer(prefix, configs): stat_df.loc[0, "precision"] = ec_comp.iloc[L]["precision"] # finally, append to global table - ali_table = ali_table.append(stat_df) + ali_table = pd.concat([ali_table, stat_df]) # sort table by sequence search threshold ali_table = ali_table.sort_values(by="domain_threshold") @@ -337,7 +337,7 @@ def protein_complex(prefix, configs): stat_df.loc[0, "inter_precision"] = ec_comp_inter.iloc[NUM_INTER]["segmentwise_precision"] # finally, append to global table - ali_table = ali_table.append(stat_df) + ali_table = pd.concat([ali_table, stat_df]) # save ali statistics table table_file = prefix + "_job_statistics_summary.csv" From 4778267e736832bb70d4b7adc6e5371598c4311a Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Sun, 7 Jul 2024 10:46:28 +0200 Subject: [PATCH 37/47] fix overlapping secondary structure elements --- evcouplings/compare/pdb.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index 439e6b08..da16bf54 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -525,7 +525,12 @@ def __init__(self, filehandle, keep_full_data=False): "sec_struct_3state": sse_type, }) - self.secondary_structure = pd.DataFrame(sse_raw) + # drop duplicates, there are overlapping helix segment annotations e.g. for PDB 6cup:A:Asp92 + self.secondary_structure = pd.DataFrame( + sse_raw + ).drop_duplicates( + subset=["label_asym_id", "label_seq_id"] + ) # store information about models/chains for quick retrieval and verification; # subtract 0 to start numbering consistently to how this was handled with MMTF From cccc1ca746fc2e47c5c9cdeb452ae4b47afa5941 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Sun, 7 Jul 2024 10:51:46 +0200 Subject: [PATCH 38/47] fix numpy warning (#306) --- evcouplings/visualize/parameters.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/evcouplings/visualize/parameters.py b/evcouplings/visualize/parameters.py index 2facfc11..064cab1d 100644 --- a/evcouplings/visualize/parameters.py +++ b/evcouplings/visualize/parameters.py @@ -134,7 +134,11 @@ def evzoom_data(model, ec_threshold=0.9, freq_threshold=0.01, fi = model.fi() q = model.num_symbols - B = -fi * np.log2(fi) + # copy and blank out fi matrix to avoid numpy warnings + # taking log of 0 (note where argument does not help) + fi_no0 = fi.copy() + fi_no0[fi <= 0] = np.nan + B = -fi * np.log2(fi_no0) B[fi <= 0] = 0 R = np.log2(q) - B.sum(axis=1) From 05c642334c980077b2add420f5c58f6108d8fb5a Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Sun, 7 Jul 2024 10:59:31 +0200 Subject: [PATCH 39/47] fix scikit-learn warning, cf. 
#306 --- evcouplings/couplings/pairs.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/evcouplings/couplings/pairs.py b/evcouplings/couplings/pairs.py index 47446be8..068b2853 100644 --- a/evcouplings/couplings/pairs.py +++ b/evcouplings/couplings/pairs.py @@ -1028,9 +1028,10 @@ def score(self, ecs, freqs, theta, effective_sequences, num_sites=None, score="c feature_table = ecs_full.reindex(self.feature_names, axis=1) # predict probabilities of contact and decision function - # (target class/true EC := 1) - probs = self.classifier.predict_proba(feature_table)[:, 1] - decision_func = self.classifier.decision_function(feature_table) + # (target class/true EC := 1); + # note: apply to .values to avoid sklearn warnings that model does not have feature names + probs = self.classifier.predict_proba(feature_table.values)[:, 1] + decision_func = self.classifier.decision_function(feature_table.values) # assign to EC table ecs_final = ecs_full.assign( From 656725137ed404b1afda1095f47a356d288dd2a5 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Mon, 8 Jul 2024 12:33:42 +0200 Subject: [PATCH 40/47] better backward compatibility for chain extraction --- evcouplings/compare/pdb.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index da16bf54..6b65c226 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -534,24 +534,23 @@ def __init__(self, filehandle, keep_full_data=False): # store information about models/chains for quick retrieval and verification; # subtract 0 to start numbering consistently to how this was handled with MMTF - self.models = list(sorted(self.atom_table.model_number.unique() - 1)) + self.models = list( + sorted(self.atom_table.model_number.unique()) + ) - # model number to auth chains + # model number to auth ID mapping self.model_to_chains = self.atom_table[ ["model_number", "auth_asym_id"] - ].drop_duplicates().assign( - model_number=lambda df: df.model_number - 1 - ).groupby( + ].drop_duplicates().groupby( "model_number" ).agg( lambda s: list(s) )["auth_asym_id"].to_dict() + # model number to asym ID mapping self.model_to_asym_ids = self.atom_table[ ["model_number", "label_asym_id"] - ].drop_duplicates().assign( - model_number=lambda df: df.model_number - 1 - ).groupby( + ].drop_duplicates().groupby( "model_number" ).agg( lambda s: list(s) @@ -633,7 +632,9 @@ def get_chain(self, chain, model=0, is_author_id=True): chain : str ID of chain to be extracted (e.g. "A") model : int, optional (default: 0) - Identifier of model to be extracted, starting counting at 0 + *Index* of model to be extracted, starting counting at 0. Note that for backwards + compatibility, this is *not* the actual PDB model identifier but indexes the model + identifiers in self.models, i.e. 
model must be >= 0 and < len(self.models) is_author_id : bool (default: True) If true, interpret chain parameter as author chain identifier; if false, interpret as label_asym_id @@ -645,14 +646,17 @@ def get_chain(self, chain, model=0, is_author_id=True): and atom coordinates """ # check if valid model was requested - if model not in self.models: + if not 0 <= model < len(self.models): raise ValueError( - "Invalid model identifier, valid options are: " + (",".join(map(str, self.models))) + f"Invalid model index, valid options: {','.join(map(str, range(len(self.models))))}" ) + # map model index to model number/identifier + model_number = self.models[model] + # check if valid chain was requested - if ((is_author_id and chain not in self.model_to_chains[model]) or - (not is_author_id and chain not in self.model_to_asym_ids[model])): + if ((is_author_id and chain not in self.model_to_chains[model_number]) or + (not is_author_id and chain not in self.model_to_asym_ids[model_number])): raise ValueError( "Invalid chain selection, check self.model_to_chains / self.model_to_asym_ids for options" ) @@ -664,7 +668,7 @@ def get_chain(self, chain, model=0, is_author_id=True): # filter atom table to model + chain selection atoms = self.atom_table.query( - f"model_number - 1 == @model and {chain_field} == @chain" + f"model_number == @model_number and {chain_field} == @chain" ).assign( # create coordinate ID from author residue ID + insertion code # (this should be unique and circumvents issues from 0 seqres values if selecting based on author chain ID) From 7e2c0bbb364ec78c3acf08fa51c30e1794e34f92 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Mon, 8 Jul 2024 12:57:31 +0200 Subject: [PATCH 41/47] fix charge handling --- evcouplings/compare/pdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index 6b65c226..d721429c 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -376,7 +376,7 @@ def to_file(self, fileobj, chain_id="A", end=True, first_atom_id=1): # print charge if we have one (optional) charge = r["charge"] # test < and > to exclude nan values - if charge < 0 or charge > 0: + if isinstance(charge, int) and charge < 0 or charge > 0: charge_sign = "-" if charge < 0 else "+" charge_value = abs(charge) charge_str = "{}{}".format(charge_value, charge_sign) From 58aa2c07277dee5eb660eacfee06dfc9a3eaeead Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Mon, 8 Jul 2024 13:02:55 +0200 Subject: [PATCH 42/47] fix charge handling pt2 --- evcouplings/compare/pdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index d721429c..f1897c6e 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -376,7 +376,7 @@ def to_file(self, fileobj, chain_id="A", end=True, first_atom_id=1): # print charge if we have one (optional) charge = r["charge"] # test < and > to exclude nan values - if isinstance(charge, int) and charge < 0 or charge > 0: + if isinstance(charge, int) and (charge < 0 or charge > 0): charge_sign = "-" if charge < 0 else "+" charge_value = abs(charge) charge_str = "{}{}".format(charge_value, charge_sign) From 9bfa113e0db8aa611e130ee56e7d7e21447261b8 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Mon, 8 Jul 2024 13:13:48 +0200 Subject: [PATCH 43/47] fix alt_loc --- evcouplings/compare/pdb.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index 
f1897c6e..2e468291 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -445,6 +445,7 @@ def __init__(self, filehandle, keep_full_data=False): "_atom_site.type_symbol": "type_symbol", # x "_atom_site.label_atom_id": "label_atom_id", # x "_atom_site.auth_atom_id": "auth_atom_id", + "_atom_site.label_alt_id": "label_alt_id", # residue/molecule types (three-letter code) "_atom_site.label_comp_id": "label_comp_id", # x @@ -733,6 +734,7 @@ def get_chain(self, chain, model=0, is_author_id=True): "z": "z", "occupancy": "occupancy", "b_factor": "b_factor", + "label_alt_id": "alt_loc", } # add information about residue index to atoms From 476700b43bb087e73dc2f6a7b6c4230f2542541d Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Mon, 8 Jul 2024 13:19:08 +0200 Subject: [PATCH 44/47] fix yet another pandas deprecation --- evcouplings/compare/distances.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evcouplings/compare/distances.py b/evcouplings/compare/distances.py index f6a2d21f..674e7eb2 100644 --- a/evcouplings/compare/distances.py +++ b/evcouplings/compare/distances.py @@ -770,7 +770,7 @@ def _get_col_name(col_name): # extract coverage segments for all individual structures segments = { _get_col_name(col_name): find_segments(series.dropna().sort_index().index) - for col_name, series in coverage_cols.iteritems() + for col_name, series in coverage_cols.items() } return segments From f32491f00bd3d702812ce2916add248728ad019e Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Tue, 23 Jul 2024 17:05:35 +0200 Subject: [PATCH 45/47] fix KeyError due to absence of secondary structure sections --- evcouplings/compare/pdb.py | 58 +++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index 2e468291..c46f473f 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -500,15 +500,23 @@ def __init__(self, filehandle, keep_full_data=False): name: _decode(data[source_column]) for source_column, name in ATOM_TARGET_COLS.items() }) - # decode information into dataframe with BioPython helper method - self.helix_table = pd.DataFrame({ - name: _decode(data[source_column]) for source_column, name in HELIX_TARGET_COLS.items() - }) - - # decode information into dataframe with BioPython helper method - self.sheet_table = pd.DataFrame({ - name: _decode(data[source_column]) for source_column, name in SHEET_TARGET_COLS.items() - }) + # decode information into dataframe with BioPython helper method; note this section may not be + # present if no helices exist in the structure + try: + self.helix_table = pd.DataFrame({ + name: _decode(data[source_column]) for source_column, name in HELIX_TARGET_COLS.items() + }) + except KeyError: + self.helix_table = None + + # decode information into dataframe with BioPython helper method; note this section may not be + # present if no sheets exist in the structure + try: + self.sheet_table = pd.DataFrame({ + name: _decode(data[source_column]) for source_column, name in SHEET_TARGET_COLS.items() + }) + except KeyError: + self.sheet_table = None # create secondary structure table for merging to chain tables # (will only contain helix/H and strand/E, coil/C will need to be filled in) @@ -517,6 +525,10 @@ def __init__(self, filehandle, keep_full_data=False): ("H", self.helix_table), ("E", self.sheet_table) ]: + # skip if secondary structure element not present in PDB file at all + if sse_table is None: + continue + for _, row in 
sse_table.iterrows(): assert row.beg_label_asym_id == row.end_label_asym_id for seq_id in range(row.beg_label_seq_id, row.end_label_seq_id + 1): @@ -527,11 +539,14 @@ def __init__(self, filehandle, keep_full_data=False): }) # drop duplicates, there are overlapping helix segment annotations e.g. for PDB 6cup:A:Asp92 - self.secondary_structure = pd.DataFrame( - sse_raw - ).drop_duplicates( - subset=["label_asym_id", "label_seq_id"] - ) + if len(sse_raw) > 0: + self.secondary_structure = pd.DataFrame( + sse_raw + ).drop_duplicates( + subset=["label_asym_id", "label_seq_id"] + ) + else: + self.secondary_structure = None # store information about models/chains for quick retrieval and verification; # subtract 0 to start numbering consistently to how this was handled with MMTF @@ -692,11 +707,16 @@ def get_chain(self, chain, model=0, is_author_id=True): res.index.name = "residue_index" # merge secondary structure information (left outer join as coil is missing from table) - res_sse = res.merge( - self.secondary_structure, - on=("label_seq_id", "label_asym_id"), - how="left" - ) + if self.secondary_structure is not None: + res_sse = res.merge( + self.secondary_structure, + on=("label_seq_id", "label_asym_id"), + how="left" + ) + else: + res_sse = res.assign( + sec_struct_3state=np.nan + ) res_sse.loc[ res_sse.sec_struct_3state.isnull() & (res_sse.label_seq_id > 0), From 35d93d03f6ac63f9ce2df79f46744c12e855a101 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Tue, 27 Aug 2024 11:13:54 +0200 Subject: [PATCH 46/47] fix atom_id overwrite by coord_id --- evcouplings/compare/pdb.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index c46f473f..4c41495d 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -691,7 +691,6 @@ def get_chain(self, chain, model=0, is_author_id=True): coord_id=lambda df: df.auth_seq_id.astype(str) + df.insertion_code, seqres_id=lambda df: df.label_seq_id.astype(str).replace("0", np.nan), one_letter_code=lambda df: df.label_comp_id.map(AA3_to_AA1, na_action="ignore"), - id=lambda df: df.coord_id, # note that MSE will now be labeled as HETATM, which was not the case with MMTF hetatm=lambda df: df.record_type == "HETATM", ).reset_index( @@ -701,6 +700,8 @@ def get_chain(self, chain, model=0, is_author_id=True): # create residue table by de-duplicating atoms res = atoms.drop_duplicates( subset=["coord_id"] + ).assign( + id=lambda df: df.coord_id ).reset_index( drop=True ) From 522903baabfc57985e9ed43f53907b2128e2c7d4 Mon Sep 17 00:00:00 2001 From: Thomas Hopf Date: Thu, 26 Sep 2024 17:27:28 +0200 Subject: [PATCH 47/47] ensure chain IDs are always str --- evcouplings/compare/pdb.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/evcouplings/compare/pdb.py b/evcouplings/compare/pdb.py index 4c41495d..850a4be3 100644 --- a/evcouplings/compare/pdb.py +++ b/evcouplings/compare/pdb.py @@ -498,7 +498,12 @@ def __init__(self, filehandle, keep_full_data=False): # decode information into dataframe with BioPython helper method self.atom_table = pd.DataFrame({ name: _decode(data[source_column]) for source_column, name in ATOM_TARGET_COLS.items() - }) + }).assign( + # make sure chain identifiers are strings, in some pathologic cases, these are int rather than str + # (e.g. 
entry 6swy) + auth_asym_id=lambda df: df.auth_asym_id.astype(str), + label_asym_id=lambda df: df.label_asym_id.astype(str), + ) # decode information into dataframe with BioPython helper method; note this section may not be # present if no helices exist in the structure
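
A minimal, self-contained sketch of the defensive-parsing pattern that patches 45 and 47 above apply when loading mmCIF categories into pandas tables: optional categories (e.g. helix annotations) fall back to None instead of raising KeyError, and chain identifiers are cast to str before any later comparison. This assumes a parsed mmCIF dictionary mapping category tags to lists of values (as e.g. Biopython's MMCIF2Dict produces); build_tables and the exact tag names here are illustrative only, not the actual evcouplings implementation.

import pandas as pd


def build_tables(parsed_cif):
    # atom records are mandatory; chain IDs may decode as int for numeric chain
    # names (e.g. entry 6swy), so normalize to str before comparing to user input
    atom_table = pd.DataFrame({
        "auth_asym_id": parsed_cif["_atom_site.auth_asym_id"],
        "label_comp_id": parsed_cif["_atom_site.label_comp_id"],
    }).assign(
        auth_asym_id=lambda df: df.auth_asym_id.astype(str)
    )

    # helix annotations are optional; a structure without helices simply has no
    # such tags in the file, so fall back to None and let callers guard on it
    try:
        helix_table = pd.DataFrame({
            "beg_label_seq_id": parsed_cif["_struct_conf.beg_label_seq_id"],
            "end_label_seq_id": parsed_cif["_struct_conf.end_label_seq_id"],
        })
    except KeyError:
        helix_table = None

    return atom_table, helix_table


if __name__ == "__main__":
    # hypothetical parsed mmCIF content: numeric chain IDs, no helix records
    cif = {
        "_atom_site.auth_asym_id": [1, 1],
        "_atom_site.label_comp_id": ["ALA", "GLY"],
    }
    atoms, helices = build_tables(cif)
    assert atoms.auth_asym_id.tolist() == ["1", "1"]
    assert helices is None

Downstream consumers then branch on None (as the get_chain changes above do for self.secondary_structure), e.g. merging secondary structure with a left join when the table exists and otherwise assigning a NaN sec_struct_3state column, rather than assuming the category was present in the file.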