Skip to content

Commit

Permalink
Deprecate set_pair_info and _set_mate_info for set_mate_info (#202)
Browse files Browse the repository at this point in the history
I notice we have two implementations for setting mate info on a pair of
alignments.

I've refactored for the following reasons:

-
[`set_pair_info()`](https://github.com/fulcrumgenomics/fgpyo/blob/3680b4fe1fe3d8686172e05dbf9c5043fa51ae1a/fgpyo/sam/__init__.py#L700-L731):
    - Unnecessarily disallows unmapped reads
- Uses `assert` instead of `raise`, which can be turned off at runtime
- Does not set mate quality tag
([`MQ`](https://samtools.github.io/hts-specs/SAMtags.pdf))
- Does not set mate score tag
([`ms`](https://www.htslib.org/doc/samtools-fixmate.html))
- Defaults proper pair status to `True` (it's easy to forget to change
this)
- Forces you to override the "properly paired" status with True/False
-
[`_set_mate_info()`](https://github.com/fulcrumgenomics/fgpyo/blob/3680b4fe1fe3d8686172e05dbf9c5043fa51ae1a/fgpyo/sam/builder.py#L267-L329):
- Looks like a direct translation of [HTSJDK's
implementation](https://github.com/samtools/htsjdk/blob/c31bc92c24bc4e9552b2a913e52286edf8f8ab96/src/main/java/htsjdk/samtools/SamPairUtil.java#L206-L287)
(it can be simpler with pysam)
- Does not always cleanup mate cigar tag
([`MC`](https://samtools.github.io/hts-specs/SAMtags.pdf))
- Does not set mate quality tag
([`MQ`](https://samtools.github.io/hts-specs/SAMtags.pdf))
- Does not set mate score tag
([`ms`](https://www.htslib.org/doc/samtools-fixmate.html))
    - Forces a reset of properly paired status

Now we have:

-
[`set_pair_info()`](https://github.com/fulcrumgenomics/fgpyo/blob/1e066132a0fdc9af8e3b4c5dbd1633881e9f4c08/fgpyo/sam/__init__.py#L872-L884):
annotated as _**deprecated**_ to avoid an API break
-
[`set_mate_info()`](https://github.com/fulcrumgenomics/fgpyo/blob/1e066132a0fdc9af8e3b4c5dbd1633881e9f4c08/fgpyo/sam/__init__.py#L807-L837):
a more useful function for setting mate info on alignments
-
~[`set_as_pairs()`](https://github.com/fulcrumgenomics/fgpyo/blob/1e066132a0fdc9af8e3b4c5dbd1633881e9f4c08/fgpyo/sam/__init__.py#L840-L869):
this behavior was in `set_pair_info()` so it is now a sole function~

Questions?

- ~Is it worth keeping `set_as_pairs()`? What was the motivation for
this functionality?~
- Since we are still pre-v1, can we remove `set_pair_info()` without
deprecating first?

Along the way I found a bug in `SamBuilder.add_pair()` and fixed that
too.
  • Loading branch information
clintval authored Dec 28, 2024
1 parent f9aa849 commit 3c6a94e
Show file tree
Hide file tree
Showing 6 changed files with 506 additions and 348 deletions.
81 changes: 64 additions & 17 deletions fgpyo/sam/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@
from pathlib import Path
from typing import IO
from typing import Any
from typing import Callable
from typing import Dict
from typing import Iterable
from typing import Iterator
Expand All @@ -177,6 +178,7 @@
from pysam import AlignedSegment
from pysam import AlignmentFile as SamFile
from pysam import AlignmentHeader as SamHeader
from typing_extensions import deprecated

import fgpyo.io
from fgpyo.collections import PeekableIterator
Expand Down Expand Up @@ -811,38 +813,83 @@ def isize(r1: AlignedSegment, r2: AlignedSegment) -> int:
return r2_pos - r1_pos


def sum_of_base_qualities(rec: AlignedSegment, min_quality_score: int = 15) -> int:
"""Calculate the sum of base qualities score for an alignment record.
This function is useful for calculating the "mate score" as implemented in samtools fixmate.
Args:
rec: The alignment record to calculate the sum of base qualities from.
min_quality_score: The minimum base quality score to use for summation.
See:
[`calc_sum_of_base_qualities()`](https://github.com/samtools/samtools/blob/4f3a7397a1f841020074c0048c503a01a52d5fa2/bam_mate.c#L227-L238)
[`MD_MIN_QUALITY`](https://github.com/samtools/samtools/blob/4f3a7397a1f841020074c0048c503a01a52d5fa2/bam_mate.c#L42)
"""
score: int = sum(qual for qual in rec.query_qualities if qual >= min_quality_score)
return score


def set_mate_info(
r1: AlignedSegment,
r2: AlignedSegment,
is_proper_pair: Callable[[AlignedSegment, AlignedSegment], bool] = is_proper_pair,
) -> None:
"""Resets mate pair information between reads in a pair.
Args:
r1: Read 1 (first read in the template).
r2: Read 2 with the same query name as r1 (second read in the template).
is_proper_pair: A function that takes the two alignments and determines proper pair status.
"""
if r1.query_name != r2.query_name:
raise ValueError("Cannot set mate info on alignments with different query names!")

for src, dest in [(r1, r2), (r2, r1)]:
dest.next_reference_id = src.reference_id
dest.next_reference_name = src.reference_name
dest.next_reference_start = src.reference_start
dest.mate_is_forward = src.is_forward
dest.mate_is_mapped = src.is_mapped
dest.set_tag("MC", src.cigarstring)
dest.set_tag("MQ", src.mapping_quality)

r1.set_tag("ms", sum_of_base_qualities(r2))
r2.set_tag("ms", sum_of_base_qualities(r1))

template_length = isize(r1, r2)
r1.template_length = template_length
r2.template_length = -template_length

proper_pair = is_proper_pair(r1, r2)
r1.is_proper_pair = proper_pair
r2.is_proper_pair = proper_pair


@deprecated("Use `set_mate_info()` instead. Deprecated after fgpyo 0.8.0.")
def set_pair_info(r1: AlignedSegment, r2: AlignedSegment, proper_pair: bool = True) -> None:
"""Resets mate pair information between reads in a pair. Requires that both r1
and r2 are mapped. Can be handed reads that already have pairing flags setup or
independent R1 and R2 records that are currently flagged as SE reads.
"""Resets mate pair information between reads in a pair.
Can be handed reads that already have pairing flags setup or independent R1 and R2 records that
are currently flagged as SE reads.
Args:
r1: read 1
r2: read 2 with the same queryname as r1
r1: Read 1 (first read in the template).
r2: Read 2 with the same query name as r1 (second read in the template).
proper_pair: whether the pair is proper or not.
"""
if r1.query_name != r2.query_name:
raise ValueError("Cannot set pair info on reads with different query names!")

for r in [r1, r2]:
r.is_paired = True
r.is_proper_pair = proper_pair

r1.is_read1 = True
r1.is_read2 = False
r2.is_read2 = True
r2.is_read1 = False

for src, dest in [(r1, r2), (r2, r1)]:
dest.next_reference_id = src.reference_id
dest.next_reference_start = src.reference_start
dest.mate_is_reverse = src.is_reverse
dest.mate_is_unmapped = src.mate_is_unmapped
dest.set_tag("MC", src.cigarstring)
dest.set_tag("MQ", src.mapping_quality)

insert_size = isize(r1, r2)
r1.template_length = insert_size
r2.template_length = -insert_size
set_mate_info(r1=r1, r2=r2, is_proper_pair=lambda a, b: proper_pair)


@attr.s(frozen=True, auto_attribs=True)
Expand Down
78 changes: 11 additions & 67 deletions fgpyo/sam/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,70 +264,6 @@ def _set_length_dependent_fields(
if not rec.is_unmapped:
rec.cigarstring = cigar if cigar else f"{length}M"

def _set_mate_info(self, r1: pysam.AlignedSegment, r2: pysam.AlignedSegment) -> None:
"""Sets the mate information on a pair of sam records.
Handles cases where both reads are mapped, one of the two reads is unmapped or both reads
are unmapped.
Args:
r1: the first read in the pair
r2: the sceond read in the pair
"""
for rec in r1, r2:
rec.template_length = 0
rec.is_proper_pair = False

if r1.is_unmapped and r2.is_unmapped:
# If they're both unmapped just clean the records up
for rec, other in [(r1, r2), (r2, r1)]:
rec.reference_id = sam.NO_REF_INDEX
rec.next_reference_id = sam.NO_REF_INDEX
rec.reference_start = sam.NO_REF_POS
rec.next_reference_start = sam.NO_REF_POS
rec.is_unmapped = True
rec.mate_is_unmapped = True
rec.is_proper_pair = False
rec.mate_is_reverse = other.is_reverse

elif r1.is_unmapped or r2.is_unmapped:
# If only one is mapped/unmapped copy over the relevant stuff
(m, u) = (r1, r2) if r2.is_unmapped else (r2, r1)
u.reference_id = m.reference_id
u.reference_start = m.reference_start
u.next_reference_id = m.reference_id
u.next_reference_start = m.reference_start
u.mate_is_reverse = m.is_reverse
u.mate_is_unmapped = False
u.set_tag("MC", m.cigarstring)

m.next_reference_id = u.reference_id
m.next_reference_start = u.reference_start
m.mate_is_reverse = u.is_reverse
m.mate_is_unmapped = True

else:
# Else they are both mapped
for rec, other in [(r1, r2), (r2, r1)]:
rec.next_reference_id = other.reference_id
rec.next_reference_start = other.reference_start
rec.mate_is_reverse = other.is_reverse
rec.mate_is_unmapped = False
rec.set_tag("MC", other.cigarstring)

if r1.reference_id == r2.reference_id:
r1p = r1.reference_end if r1.is_reverse else r1.reference_start
r2p = r2.reference_end if r2.is_reverse else r2.reference_start
r1.template_length = r2p - r1p
r2.template_length = r1p - r2p

# Arbitrarily set proper pair if the we have an FR pair with isize <= 1000
if r1.is_reverse != r2.is_reverse and abs(r1.template_length) <= 1000:
fpos, rpos = (r2p, r1p) if r1.is_reverse else (r1p, r2p)
if fpos < rpos:
r1.is_proper_pair = True
r2.is_proper_pair = True

def rg(self) -> Dict[str, Any]:
"""Returns the single read group that is defined in the header."""
# The `RG` field contains a list of read group mappings
Expand Down Expand Up @@ -444,8 +380,16 @@ def add_pair(
raise ValueError("Cannot use chrom in combination with chrom1 or chrom2")

chrom = sam.NO_REF_NAME if chrom is None else chrom
chrom1 = next(c for c in (chrom1, chrom) if c is not None)
chrom2 = next(c for c in (chrom2, chrom) if c is not None)

if start1 != sam.NO_REF_POS:
chrom1 = next(c for c in (chrom1, chrom) if c is not None)
else:
chrom1 = sam.NO_REF_NAME

if start2 != sam.NO_REF_POS:
chrom2 = next(c for c in (chrom2, chrom) if c is not None)
else:
chrom2 = sam.NO_REF_NAME

if chrom1 == sam.NO_REF_NAME and start1 != sam.NO_REF_POS:
raise ValueError("start1 cannot be used on its own - specify chrom or chrom1")
Expand All @@ -468,7 +412,7 @@ def add_pair(
)

# Sync up mate info and we're done!
self._set_mate_info(r1, r2)
sam.set_mate_info(r1, r2)
self._records.append(r1)
self._records.append(r2)
return r1, r2
Expand Down
Loading

0 comments on commit 3c6a94e

Please sign in to comment.