Skip to content

Commit

Permalink
Allow insert size calculation to work on 1 read only (#205)
Browse files Browse the repository at this point in the history
  • Loading branch information
clintval authored Dec 28, 2024
1 parent 3c6a94e commit bd38898
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 14 deletions.
59 changes: 46 additions & 13 deletions fgpyo/sam/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -677,6 +677,49 @@ def from_recs( # noqa: C901 # `from_recs` is too complex (11 > 10)
return PairOrientation.RF


def isize(rec1: AlignedSegment, rec2: Optional[AlignedSegment] = None) -> int:
"""Computes the insert size ("template length" or "TLEN") for a pair of records.
Args:
rec1: The first record in the pair.
rec2: The second record in the pair. If None, then mate info on `rec1` will be used.
"""
if rec2 is None:
rec2_is_unmapped = rec1.mate_is_unmapped
rec2_reference_id = rec1.next_reference_id
else:
rec2_is_unmapped = rec2.is_unmapped
rec2_reference_id = rec2.reference_id

if rec1.is_unmapped or rec2_is_unmapped or rec1.reference_id != rec2_reference_id:
return 0

if rec2 is None:
rec2_is_forward = rec1.mate_is_forward
rec2_reference_start = rec1.next_reference_start
else:
rec2_is_forward = rec2.is_forward
rec2_reference_start = rec2.reference_start

if rec1.is_forward and rec2_is_forward:
return rec2_reference_start - rec1.reference_start
if rec1.is_reverse and rec2_is_forward:
return rec2_reference_start - rec1.reference_end

if rec2 is None:
if not rec1.has_tag("MC"):
raise ValueError('Cannot determine proper pair status without a mate cigar ("MC")!')
rec2_cigar = Cigar.from_cigarstring(str(rec1.get_tag("MC")))
rec2_reference_end = rec1.next_reference_start + rec2_cigar.length_on_target()
else:
rec2_reference_end = rec2.reference_end

if rec1.is_forward:
return rec2_reference_end - rec1.reference_start
else:
return rec2_reference_end - rec1.reference_end


DefaultProperlyPairedOrientations: set[PairOrientation] = {PairOrientation.FR}
"""The default orientations for properly paired reads."""

Expand All @@ -686,6 +729,7 @@ def is_proper_pair(
rec2: Optional[AlignedSegment] = None,
max_insert_size: int = 1000,
orientations: Collection[PairOrientation] = DefaultProperlyPairedOrientations,
isize: Callable[[AlignedSegment, AlignedSegment], int] = isize,
) -> bool:
"""Determines if a pair of records are properly paired or not.
Expand All @@ -700,6 +744,7 @@ def is_proper_pair(
rec2: The second record in the pair. If None, then mate info on `rec1` will be used.
max_insert_size: The maximum insert size to consider a pair "proper".
orientations: The valid set of orientations to consider a pair "proper".
isize: A function that takes the two alignments and calculates their isize.
See:
[`htsjdk.samtools.SamPairUtil.isProperPair()`](https://github.com/samtools/htsjdk/blob/c31bc92c24bc4e9552b2a913e52286edf8f8ab96/src/main/java/htsjdk/samtools/SamPairUtil.java#L106-L125)
Expand All @@ -716,9 +761,7 @@ def is_proper_pair(
and rec2_is_mapped
and rec1.reference_id == rec2_reference_id
and PairOrientation.from_recs(rec1=rec1, rec2=rec2) in orientations
# TODO: consider replacing with `abs(isize(r1, r2)) <= max_insert_size`
# which can only be done if isize() is modified to allow for an optional R2.
and 0 < abs(rec1.template_length) <= max_insert_size
and 0 < abs(isize(rec1, rec2)) <= max_insert_size
)


Expand Down Expand Up @@ -803,16 +846,6 @@ def from_read(cls, read: pysam.AlignedSegment) -> List["SupplementaryAlignment"]
return []


def isize(r1: AlignedSegment, r2: AlignedSegment) -> int:
"""Computes the insert size for a pair of records."""
if r1.is_unmapped or r2.is_unmapped or r1.reference_id != r2.reference_id:
return 0
else:
r1_pos = r1.reference_end if r1.is_reverse else r1.reference_start
r2_pos = r2.reference_end if r2.is_reverse else r2.reference_start
return r2_pos - r1_pos


def sum_of_base_qualities(rec: AlignedSegment, min_quality_score: int = 15) -> int:
"""Calculate the sum of base qualities score for an alignment record.
Expand Down
51 changes: 50 additions & 1 deletion tests/fgpyo/sam/test_sam.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,7 +545,16 @@ def test_not_is_proper_pair_if_too_far_apart() -> None:
assert not is_proper_pair(r1, r2)


def test_isize() -> None:
def test_is_not_proper_pair_with_custom_isize_func() -> None:
"""Test that reads are not properly paired because of a custom isize function."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, start2=100)
assert is_proper_pair(r1, r2)
assert not is_proper_pair(r1, r2, isize=lambda a, b: False)


def test_isize_when_r2_defined() -> None:
"""Tests that an insert size can be calculated when both input records are defined."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
assert sam.isize(r1, r2) == 190
Expand All @@ -555,6 +564,46 @@ def test_isize() -> None:
assert sam.isize(r1, r2) == 0


def test_isize_when_r2_undefined() -> None:
"""Tests that an insert size can be calculated when R1 is provided only."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")
assert sam.isize(r1) == 190
assert sam.isize(r2) == -190

r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M")
assert sam.isize(r1) == 0
assert sam.isize(r2) == 0


def test_isize_when_r2_undefined_indels_in_r2_cigar() -> None:
"""Tests that an insert size can be derived without R2 by using R2's cigar."""
builder = SamBuilder()
r1, _ = builder.add_pair(
chrom="chr1",
start1=100,
cigar1="115M",
start2=250,
cigar2="10S5M1D1M1D2I2D30M", # only 40bp reference-consuming operators
)
assert sam.isize(r1) == 190


def test_isize_raises_when_r2_not_provided_and_mate_cigar_tag_unset_r1() -> None:
"""Tests an exception is raised when the mate cigar tag is not on rec1 and rec2 is missing."""
builder = SamBuilder()
r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M")

r1.set_tag("MC", None)

assert sam.isize(r2) == -190

with pytest.raises(
ValueError, match="Cannot determine proper pair status without a mate cigar"
):
sam.isize(r1)


def test_sum_of_base_qualities() -> None:
builder = SamBuilder(r1_len=5, r2_len=5)
single = builder.add_single(quals=[1, 2, 3, 4, 5])
Expand Down

0 comments on commit bd38898

Please sign in to comment.