diff --git a/docs/api/utils.md b/docs/api/utils.md index 886757a..551df3d 100644 --- a/docs/api/utils.md +++ b/docs/api/utils.md @@ -15,6 +15,7 @@ CREsted provides a few utility function to help with sequence encoding, function hot_encoding_to_sequence one_hot_encode_sequence fetch_sequences + reverse_complement permute_model setup_logging ``` diff --git a/src/crested/utils/__init__.py b/src/crested/utils/__init__.py index 4440a9f..ed7a1f8 100644 --- a/src/crested/utils/__init__.py +++ b/src/crested/utils/__init__.py @@ -9,4 +9,5 @@ hot_encoding_to_sequence, one_hot_encode_sequence, read_bigwig_region, + reverse_complement, ) diff --git a/src/crested/utils/_utils.py b/src/crested/utils/_utils.py index 6a368ae..5fde197 100644 --- a/src/crested/utils/_utils.py +++ b/src/crested/utils/_utils.py @@ -426,3 +426,52 @@ def read_bigwig_region( ).squeeze() return values, positions + + +def reverse_complement(sequence: str | list[str] | np.ndarray) -> str | np.ndarray: + """ + Perform reverse complement on either a one-hot encoded array or a (list of) DNA sequence string(s). + + Parameters + ---------- + sequence + The DNA sequence string(s) or one-hot encoded array to reverse complement. + + Returns + ------- + The reverse complemented DNA sequence string or one-hot encoded array. + """ + + def complement_str(seq: str) -> str: + complement = str.maketrans("ACGTacgt", "TGCAtgca") + return seq.translate(complement)[::-1] + + if isinstance(sequence, str): + return complement_str(sequence) + elif isinstance(sequence, list): + return [complement_str(seq) for seq in sequence] + elif isinstance(sequence, np.ndarray): + if sequence.ndim == 2: + if sequence.shape[1] == 4: + return sequence[::-1, ::-1] + elif sequence.shape[0] == 4: + return sequence[:, ::-1][:, ::-1] + else: + raise ValueError( + "One-hot encoded array must have shape (W, 4) or (4, W)" + ) + elif sequence.ndim == 3: + if sequence.shape[1] == 4: + return sequence[:, ::-1, ::-1] + elif sequence.shape[2] == 4: + return sequence[:, ::-1, ::-1] + else: + raise ValueError( + "One-hot encoded array must have shape (B, 4, W) or (B, W, 4)" + ) + else: + raise ValueError("One-hot encoded array must have 2 or 3 dimensions") + else: + raise TypeError( + "Input must be either a DNA sequence string or a one-hot encoded array" + ) diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..675befa --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,35 @@ +import numpy as np + +from crested.utils import ( + hot_encoding_to_sequence, + one_hot_encode_sequence, + reverse_complement, +) + + +def test_reverse_complement_string(): + assert reverse_complement("ACGT") == "ACGT" + assert reverse_complement("TGCA") == "TGCA" + assert reverse_complement("AAGGTTCC") == "GGAACCTT" + + +def test_reverse_complement_list_of_strings(): + assert reverse_complement(["ACGT", "TGCA"]) == ["ACGT", "TGCA"] + assert reverse_complement(["AAGGTTCC", "CCGGAATT"]) == ["GGAACCTT", "AATTCCGG"] + + +def test_reverse_complement_one_hot_encoded_array(): + seq_1 = "ACGTA" + seq_1_one_hot = one_hot_encode_sequence(seq_1) + seq_1_rev = reverse_complement(seq_1) + seq_1_rev_one_hot = one_hot_encode_sequence(seq_1_rev) + + np.testing.assert_array_equal(reverse_complement(seq_1_one_hot), seq_1_rev_one_hot) + + seq_2 = "AAGGTTCC" + seq_2_rev = "GGAACCTT" + seq_2_one_hot = one_hot_encode_sequence(seq_2, expand_dim=False) + seq_2_rev_one_hot = reverse_complement(seq_2_one_hot) + seq_2_seq_reversed = hot_encoding_to_sequence(seq_2_rev_one_hot) + + assert seq_2_seq_reversed == seq_2_rev