Skip to content

Commit

Permalink
feat: add cigar utility method get_alignment_offsets (#188)
Browse files Browse the repository at this point in the history
- add cigar utility method get_alignment_offsets to find the offsets of the aligned segment in the read
- tests included

---------

Co-authored-by: Matt Stone <[email protected]>
  • Loading branch information
yfarjoun and msto authored Oct 24, 2024
1 parent 318ff98 commit 12c2d31
Show file tree
Hide file tree
Showing 2 changed files with 116 additions and 0 deletions.
39 changes: 39 additions & 0 deletions fgpyo/sam/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,45 @@ def length_on_target(self) -> int:
"""Returns the length of the alignment on the target sequence."""
return sum([elem.length_on_target for elem in self.elements])

def query_alignment_offsets(self) -> Tuple[int, int]:
"""
Gets the 0-based, end-exclusive positions of the first and last aligned base in the query.
The resulting range will contain the range of positions in the SEQ string for
the bases that are aligned.
If counting from the end of the query is desired, use
`cigar.reversed().query_alignment_offsets()`
Returns:
A tuple (start, stop) containing the start and stop positions
of the aligned part of the query. These offsets are 0-based and open-ended, with
respect to the beginning of the query.
Raises:
ValueError: If according to the cigar, there are no aligned query bases.
"""
start_offset: int = 0
end_offset: int = 0
element: CigarElement
alignment_began = False
for element in self.elements:
if element.operator.is_clipping and not alignment_began:
# We are in the clipping operators preceding the alignment
# Note: hardclips have length-on-query=0
start_offset += element.length_on_query
end_offset += element.length_on_query
elif not element.operator.is_clipping:
# We are within the alignment
alignment_began = True
end_offset += element.length_on_query
else:
# We have exited the alignment and are in the clipping operators after the alignment
break

if start_offset == end_offset:
raise ValueError(f"Cigar {self} has no aligned bases")
return start_offset, end_offset

def __getitem__(
self, index: Union[int, slice]
) -> Union[CigarElement, Tuple[CigarElement, ...]]:
Expand Down
77 changes: 77 additions & 0 deletions tests/fgpyo/sam/test_cigar.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Tuple

import pytest

from fgpyo.sam import Cigar
Expand Down Expand Up @@ -31,3 +33,78 @@ def test_bad_index_raises_index_error(index: int) -> None:
def test_bad_index_raises_type_error(index: int) -> None:
with pytest.raises(TypeError):
cigar[index]


@pytest.mark.parametrize(
("cigar_string", "expected_range"),
[
("10M", (0, 10)),
("10M10I", (0, 20)),
("10X10I", (0, 20)),
("10X10D", (0, 10)),
("10=10D", (0, 10)),
("10S10M", (10, 20)),
("10H10M", (0, 10)),
("10H10S10M", (10, 20)),
("10H10S10M5S", (10, 20)),
("10H10S10M5S10H", (10, 20)),
("76I", (0, 76)),
],
)
def test_query_alignment_offsets(cigar_string: str, expected_range: Tuple[int, int]) -> None:
"""
cig.query_alignment_offsets() should return the expected results.
"""
cig = Cigar.from_cigarstring(cigar_string)
ret = cig.query_alignment_offsets()
assert ret == expected_range


@pytest.mark.parametrize(
("cigar_string"),
[
("10H"),
("10S"),
("10S10H"),
("5H10S10H"),
("76D"),
("10P76S"),
("50S1000N50S"),
],
)
def test_query_alignment_offsets_failures(cigar_string: str) -> None:
"""query_alignment_offsets() should raise a ValueError if the CIGAR has no aligned positions."""
cig = Cigar.from_cigarstring(cigar_string)
with pytest.raises(ValueError):
cig.query_alignment_offsets()

with pytest.raises(ValueError):
cig.reversed().query_alignment_offsets()


@pytest.mark.parametrize(
("cigar_string", "expected_range"),
[
("10M", (0, 10)),
("10M10I", (0, 20)),
("10X10I", (0, 20)),
("10X10D", (0, 10)),
("10=10D", (0, 10)),
("10S10M", (0, 10)),
("10H10M", (0, 10)),
("10H10S10M", (0, 10)),
("10H10S10M5S", (5, 15)),
("10H10S10M5S10H", (5, 15)),
],
)
def test_query_alignment_offsets_reversed(
cigar_string: str, expected_range: Tuple[int, int]
) -> None:
"""
cig.revered().query_alignment_offsets() should return the expected results.
"""

cig = Cigar.from_cigarstring(cigar_string)

ret = cig.reversed().query_alignment_offsets()
assert ret == expected_range

0 comments on commit 12c2d31

Please sign in to comment.