Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/v0.2' into lood/permissions_error
Browse files Browse the repository at this point in the history
# Conflicts:
#	evcouplings/couplings/model.py
#	test/TestComplex.py
#	test/TestMutation.py
  • Loading branch information
loodvn committed Aug 1, 2023
2 parents a9181df + 5b3b3f0 commit 6463ad9
Show file tree
Hide file tree
Showing 10 changed files with 54 additions and 34 deletions.
7 changes: 3 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,12 @@ Predict protein structure, function and mutations using evolutionary sequence co

### Installing the Python package

If you are simply interested in using EVcouplings as a library, installing the Python package is all you need to do (unless you use functions that depend on external tools). If you want to run the *evcouplings* application (alignment generation, model parameter inference, structure prediction, etc.) you will also need to follow the sections on installing external tools and databases.
* If you are simply interested in using EVcouplings as a library, installing the Python package is all you need to do (unless you use functions that depend on external tools).
* If you want to run the *evcouplings* application (alignment generation, model parameter inference, structure prediction, etc.) you will also need to follow the sections on installing external tools and databases.

#### Requirements

EVcouplings requires a Python >= 3.5 installation. Since it depends on some packages that can be tricky to install using pip (numba, numpy, ...), we recommend using the [Anaconda Python distribution](https://www.continuum.io/downloads). In case you are creating a new conda environment or using miniconda, please make sure to run `conda install anaconda` before running pip, or otherwise the required packages will not be present.
EVcouplings actively supports Python >= 3.10 installations.

#### Installation

Expand All @@ -27,8 +28,6 @@ and to update to the latest version after previously installing EVcouplings from

pip install -U --no-deps https://github.com/debbiemarkslab/EVcouplings/archive/develop.zip

Installation will take seconds.

### External software tools

*After installation and before running compute jobs, the paths to the respective binaries of the following external tools have to be set in your EVcouplings job configuration file(s).*
Expand Down
33 changes: 27 additions & 6 deletions evcouplings/align/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import re
from collections import namedtuple, OrderedDict, defaultdict
from copy import deepcopy
from pathlib import Path

import numpy as np
from numba import jit
Expand Down Expand Up @@ -326,7 +327,7 @@ def write_a3m(sequences, fileobj, insert_gap=INSERT_GAP, width=80):
fileobj.write(seq.replace(insert_gap, "") + "\n")


def detect_format(fileobj):
def detect_format(fileobj, filepath=""):
"""
Detect if an alignment file is in FASTA, A3M or
Stockholm format.
Expand All @@ -335,10 +336,12 @@ def detect_format(fileobj):
----------
fileobj : file-like obj
Alignment file for which to detect format
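filepath : string or path-like obj
Path of alignment file; only its extension is inspected, to report
FASTA-like content as "a3m" when the file name ends in ".a3m"
(case-insensitive)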
Returns
-------
format : {"fasta", "stockholm", None}
format : {"fasta", "a3m", "stockholm", None}
Format of alignment, None if not detectable
"""
for i, line in enumerate(fileobj):
Expand All @@ -348,6 +351,9 @@ def detect_format(fileobj):

# This indicates a FASTA file
if line.startswith(">"):
# A3M files have extension .a3m
if Path(filepath).suffix.lower() == ".a3m":
return "a3m"
return "fasta"

# Skip comment lines and empty lines for FASTA detection
Expand Down Expand Up @@ -569,7 +575,12 @@ def __init__(self, sequence_matrix, sequence_ids=None, annotation=None,
)

# make sure we get rid of iterators etc.
self.ids = np.array(list(sequence_ids))
self.ids = list(sequence_ids)

# turn identifiers into numpy array for consistency with previous implementation;
# but use dtype object to avoid memory usage issues of numpy string datatypes
# (otherwise the longest identifier defines the memory used by every entry)
self.ids = np.array(self.ids, dtype=np.object_)

self.id_to_index = {
id_: i for i, id_ in enumerate(self.ids)
Expand Down Expand Up @@ -607,7 +618,7 @@ def from_dict(cls, sequences, **kwargs):
@classmethod
def from_file(cls, fileobj, format="fasta",
a3m_inserts="first", raise_hmmer_prefixes=True,
**kwargs):
split_header=False, **kwargs):
"""
Construct an alignment object by reading in an
alignment file.
Expand All @@ -625,6 +636,9 @@ def from_file(cls, fileobj, format="fasta",
HMMER adds number prefixes to sequence identifiers in Stockholm
files if identifiers are not unique. If True, the parser will
raise an exception if a Stockholm alignment has such prefixes.
split_header : bool, optional (default: False)
Only store identifier portion of each header (before first whitespace)
in identifier list, rather than full header line
**kwargs
Additional arguments to be passed to class constructor
Expand Down Expand Up @@ -664,6 +678,12 @@ def from_file(cls, fileobj, format="fasta",
else:
raise ValueError("Invalid alignment format: {}".format(format))

# reduce header lines to identifiers if requested
if split_header:
seqs = {
header.split()[0]: seq for header, seq in seqs.items()
}

return cls.from_dict(seqs, **kwargs)

def __getitem__(self, index):
Expand Down Expand Up @@ -777,7 +797,8 @@ def select(self, columns=None, sequences=None):
def apply(self, columns=None, sequences=None, func=np.char.lower):
"""
Apply a function along columns and/or rows of alignment matrix,
or to entire matrix.
or to entire matrix. Note that column and row selections are
applied independently in this particular order.
Parameters
----------
Expand Down Expand Up @@ -811,7 +832,7 @@ def apply(self, columns=None, sequences=None, func=np.char.lower):
mod_matrix[sequences, :] = func(mod_matrix[sequences, :])

return Alignment(
mod_matrix, np.copy(self.ids), deepcopy(self.annotation),
mod_matrix, deepcopy(self.ids), deepcopy(self.annotation),
alphabet=self.alphabet
)

Expand Down
2 changes: 1 addition & 1 deletion evcouplings/align/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -689,7 +689,7 @@ def existing(**kwargs):

# first try to autodetect format of alignment
with open(input_alignment) as f:
format = detect_format(f)
format = detect_format(f, filepath=input_alignment)
if format is None:
raise InvalidParameterError(
"Format of input alignment {} could not be "
Expand Down
4 changes: 2 additions & 2 deletions evcouplings/compare/pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,7 +457,7 @@ def _get_range(object_counts):
# store explicit information about composition of residues
def _group_info(field):
return np.array(
[x[field] for x in mmtf.group_list]
[x[field] for x in mmtf.group_list], dtype=np.object_
)

# three and one letter code names of different groups
Expand Down Expand Up @@ -589,7 +589,7 @@ def get_chain(self, chain, model=0):
np.array([
np.arange(self.first_residue_index[i], self.last_residue_index[i])
for i in target_chain_indeces
])
], dtype=np.object_)
)

# chain indices and identifiers for all residues
Expand Down
2 changes: 1 addition & 1 deletion evcouplings/couplings/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,7 @@ def __init__(self, filename, *segments,

# initialize the segment index mapper to update model numbering
if len(segments) == 0:
raise(ValueError, "Must provide at least one segment for MultiSegmentCouplingsModel")
raise ValueError("Must provide at least one segment for MultiSegmentCouplingsModel")

first_segment = segments[0]
index_start = first_segment.region_start
Expand Down
6 changes: 3 additions & 3 deletions evcouplings/couplings/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -609,7 +609,7 @@ def convert_sequences(self, sequences):
)
)

S = np.empty((len(sequences), L_seq), dtype=np.int64)
S = np.empty((len(sequences), L_seq), dtype=int)

try:
for i, s in enumerate(sequences):
Expand Down Expand Up @@ -689,8 +689,8 @@ def delta_hamiltonian(self, substitutions, verify_mutants=True):
2) delta J_ij, 3) delta h_i
"""
pos = np.empty(len(substitutions), dtype=np.int64)
subs = np.empty(len(substitutions), dtype=np.int64)
pos = np.empty(len(substitutions), dtype=int)
subs = np.empty(len(substitutions), dtype=int)

try:
for i, (subs_pos, subs_from, subs_to) in enumerate(substitutions):
Expand Down
19 changes: 10 additions & 9 deletions evcouplings/couplings/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,23 +265,24 @@ def run_plmc(alignment, couplings_file, param_file=None,
# returncode == -11 (segfault) despite successful calculation
return_code, stdout, stderr = run(cmd, check_returncode=False)

# TODO: remove this segfault-hunting output once fixed
# TODO: remove this segfault-hunting output if fixed in plmc
if return_code != 0:
# if not a segfault, still raise exception
if return_code != -11:
# check if we got valid output from plmc by parsing it
valid_plmc_output = True
try:
parse_plmc_log(stderr)
except KeyError:
valid_plmc_output = False

# raise an exception unless this is a segfault (-11) that still produced valid plmc output
if return_code != -11 or not valid_plmc_output:
from evcouplings.utils.system import ExternalToolError
raise ExternalToolError(
"Call failed:\ncmd={}\nreturncode={}\nstdout={}\nstderr={}".format(
cmd, return_code, stdout, stderr
)
)

print("PLMC NON-ZERO RETURNCODE:", return_code)
print(cmd)
print(" ".join(cmd))
print("stdout:", stdout)
print("stderr:", stderr)

iter_df, out_fields = parse_plmc_log(stderr)

# also check we actually calculated couplings...
Expand Down
7 changes: 4 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
name='evcouplings',

# Version:
version='0.1.2',
version='0.2',

description='A Framework for evolutionary couplings analysis',
long_description=readme,
Expand Down Expand Up @@ -49,7 +49,8 @@
# Pick the license as you wish (should match "license" above)
'License :: OSI Approved :: MIT License',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
],

# What EVcouplings relates to:
Expand Down Expand Up @@ -96,7 +97,7 @@
#setup_requires=['setuptools>=18.2', 'numpy'],

install_requires=['setuptools>=18.2', 'numpy',
'pandas', 'scipy', 'numba', 'ruamel.yaml', 'matplotlib', 'requests',
'pandas', 'scipy', 'numba', 'ruamel.yaml<0.18', 'matplotlib', 'requests',
'mmtf-python', 'click', 'filelock', 'psutil', 'bokeh', 'jinja2',
'biopython', 'seaborn', 'billiard', 'scikit-learn',
],
Expand Down
4 changes: 1 addition & 3 deletions test/TestComplex.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,9 +402,7 @@ def test_find_possible_partners(self):

pd.testing.assert_frame_equal(
self.possible_partners, _possible_partners,
check_exact=False,
rtol=1e-3,
check_like=True,
atol=1e-5, check_like=True,
check_names=False
)

Expand Down
4 changes: 2 additions & 2 deletions test/TestMutation.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def test_single_mutant_matrix(self):
# gotta round to account for this
_singles = _singles.round(3)
singles = singles.round(3)
pd.testing.assert_frame_equal(singles, _singles, check_exact=False, rtol=1e-3)
pd.testing.assert_frame_equal(singles, _singles, check_dtype=False, atol=1e-5)

def test_split_mutants_single(self):
"""
Expand Down Expand Up @@ -228,7 +228,7 @@ def test_predict_mutation_table_segment_column(self):
self.c0, self.singles, output_column="prediction_independent"
)

pd.testing.assert_frame_equal(self.singles, _singles, check_exact=False, rtol=1e-3)
pd.testing.assert_frame_equal(self.singles, _singles, check_dtype=False, atol=1e-5)

def test_predict_mutation_table_empty_segment(self):
"""
Expand Down

0 comments on commit 6463ad9

Please sign in to comment.