Skip to content

Commit

Permalink
reverse complement utility func & docs
Browse files Browse the repository at this point in the history
  • Loading branch information
LukasMahieu committed Nov 15, 2024
1 parent 6c76dde commit 3ba8e85
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/api/utils.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ CREsted provides a few utility function to help with sequence encoding, function
hot_encoding_to_sequence
one_hot_encode_sequence
fetch_sequences
reverse_complement
permute_model
setup_logging
```
1 change: 1 addition & 0 deletions src/crested/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@
hot_encoding_to_sequence,
one_hot_encode_sequence,
read_bigwig_region,
reverse_complement,
)
49 changes: 49 additions & 0 deletions src/crested/utils/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,3 +426,52 @@ def read_bigwig_region(
).squeeze()

return values, positions


def reverse_complement(sequence: str | list[str] | np.ndarray) -> str | np.ndarray:
"""
Perform reverse complement on either a one-hot encoded array or a (list of) DNA sequence string(s).
Parameters
----------
sequence
The DNA sequence string(s) or one-hot encoded array to reverse complement.
Returns
-------
The reverse complemented DNA sequence string or one-hot encoded array.
"""

def complement_str(seq: str) -> str:
complement = str.maketrans("ACGTacgt", "TGCAtgca")
return seq.translate(complement)[::-1]

if isinstance(sequence, str):
return complement_str(sequence)
elif isinstance(sequence, list):
return [complement_str(seq) for seq in sequence]
elif isinstance(sequence, np.ndarray):
if sequence.ndim == 2:
if sequence.shape[1] == 4:
return sequence[::-1, ::-1]
elif sequence.shape[0] == 4:
return sequence[:, ::-1][:, ::-1]
else:
raise ValueError(
"One-hot encoded array must have shape (W, 4) or (4, W)"
)
elif sequence.ndim == 3:
if sequence.shape[1] == 4:
return sequence[:, ::-1, ::-1]
elif sequence.shape[2] == 4:
return sequence[:, ::-1, ::-1]
else:
raise ValueError(
"One-hot encoded array must have shape (B, 4, W) or (B, W, 4)"
)
else:
raise ValueError("One-hot encoded array must have 2 or 3 dimensions")
else:
raise TypeError(
"Input must be either a DNA sequence string or a one-hot encoded array"
)
35 changes: 35 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import numpy as np

from crested.utils import (
hot_encoding_to_sequence,
one_hot_encode_sequence,
reverse_complement,
)


def test_reverse_complement_string():
assert reverse_complement("ACGT") == "ACGT"
assert reverse_complement("TGCA") == "TGCA"
assert reverse_complement("AAGGTTCC") == "GGAACCTT"


def test_reverse_complement_list_of_strings():
assert reverse_complement(["ACGT", "TGCA"]) == ["ACGT", "TGCA"]
assert reverse_complement(["AAGGTTCC", "CCGGAATT"]) == ["GGAACCTT", "AATTCCGG"]


def test_reverse_complement_one_hot_encoded_array():
seq_1 = "ACGTA"
seq_1_one_hot = one_hot_encode_sequence(seq_1)
seq_1_rev = reverse_complement(seq_1)
seq_1_rev_one_hot = one_hot_encode_sequence(seq_1_rev)

np.testing.assert_array_equal(reverse_complement(seq_1_one_hot), seq_1_rev_one_hot)

seq_2 = "AAGGTTCC"
seq_2_rev = "GGAACCTT"
seq_2_one_hot = one_hot_encode_sequence(seq_2, expand_dim=False)
seq_2_rev_one_hot = reverse_complement(seq_2_one_hot)
seq_2_seq_reversed = hot_encoding_to_sequence(seq_2_rev_one_hot)

assert seq_2_seq_reversed == seq_2_rev

0 comments on commit 3ba8e85

Please sign in to comment.