From c0f71f53c47d590c77c92d2e23ff0c4d91e6e7fb Mon Sep 17 00:00:00 2001 From: Tim Dunn Date: Tue, 3 Sep 2024 12:27:21 -0400 Subject: [PATCH] feat: added template.fixmate() [issue #83] - added fixmate() method - added set_mate_info_on_supplementary() helper function - moved builder._set_mate_info() to helper function set_mate_info() - moved test_template_iterator.py to test_template.py (contained more than just iterator) - added template_test_fixmate() tests --- fgpyo/sam/__init__.py | 106 +++++++++++++++++- fgpyo/sam/builder.py | 72 +----------- ..._template_iterator.py => test_template.py} | 62 +++++++++- 3 files changed, 169 insertions(+), 71 deletions(-) rename tests/fgpyo/sam/{test_template_iterator.py => test_template.py} (62%) diff --git a/fgpyo/sam/__init__.py b/fgpyo/sam/__init__.py index afc6d81e..ec1f2450 100644 --- a/fgpyo/sam/__init__.py +++ b/fgpyo/sam/__init__.py @@ -4,7 +4,7 @@ This module contains utility classes for working with SAM/BAM files and the data contained within them. This includes i) utilities for opening SAM/BAM files for reading and writing, ii) functions for manipulating supplementary alignments, iii) classes and functions for -maniuplating CIGAR strings, and iv) a class for building sam records and files for testing. +manipulating CIGAR strings, and iv) a class for building sam records and files for testing. ## Motivation for Reader and Writer methods @@ -691,6 +691,100 @@ def set_pair_info(r1: AlignedSegment, r2: AlignedSegment, proper_pair: bool = Tr r2.template_length = -insert_size +def set_mate_info(r1: pysam.AlignedSegment, r2: pysam.AlignedSegment) -> None: + """Sets the mate information on a pair of sam records. + + Handles cases where both reads are mapped, one of the two reads is unmapped or both reads + are unmapped. + + Args: + r1: the first read in the pair + r2: the second read in the pair + """ + assert ( + r1.query_name == r2.query_name + ), "Attempting to set mate info on reads with different qnames." + + for rec in r1, r2: + rec.template_length = 0 + rec.is_proper_pair = False + + if r1.is_unmapped and r2.is_unmapped: + # If they're both unmapped just clean the records up + for rec, other in [(r1, r2), (r2, r1)]: + rec.reference_id = NO_REF_INDEX + rec.next_reference_id = NO_REF_INDEX + rec.reference_start = NO_REF_POS + rec.next_reference_start = NO_REF_POS + rec.is_unmapped = True + rec.is_proper_pair = False + rec.mate_is_reverse = other.is_reverse + rec.mate_is_unmapped = True + rec.set_tag("MC", None) + + elif r1.is_unmapped or r2.is_unmapped: + # If only one is mapped/unmapped copy over the relevant stuff + (m, u) = (r1, r2) if r2.is_unmapped else (r2, r1) + u.reference_id = m.reference_id + u.reference_start = m.reference_start + u.next_reference_id = m.reference_id + u.next_reference_start = m.reference_start + u.mate_is_reverse = m.is_reverse + u.mate_is_unmapped = False + u.set_tag("MC", m.cigarstring) + + m.next_reference_id = u.reference_id + m.next_reference_start = u.reference_start + m.mate_is_reverse = u.is_reverse + m.mate_is_unmapped = True + m.set_tag("MC", None) + + else: + # Else they are both mapped + for rec, other in [(r1, r2), (r2, r1)]: + rec.next_reference_id = other.reference_id + rec.next_reference_start = other.reference_start + rec.mate_is_reverse = other.is_reverse + rec.mate_is_unmapped = False + rec.set_tag("MC", other.cigarstring) + + if r1.reference_id == r2.reference_id: + r1p = r1.reference_end if r1.is_reverse else r1.reference_start + r2p = r2.reference_end if r2.is_reverse else r2.reference_start + r1.template_length = r2p - r1p + r2.template_length = r1p - r2p + + # Arbitrarily set proper pair if the we have an FR pair with isize <= 1000 + if r1.is_reverse != r2.is_reverse and abs(r1.template_length) <= 1000: + fpos, rpos = (r2p, r1p) if r1.is_reverse else (r1p, r2p) + if fpos < rpos: + r1.is_proper_pair = True + r2.is_proper_pair = True + + +def set_mate_info_on_supplementary( + primary: AlignedSegment, supp: AlignedSegment, set_cigar: bool = False +) -> None: + """Sets mate pair information on a supplemental record (e.g. from a split alignment), using + the primary alignment of the read's mate. + + Args: + primary: the primary alignment of the mate pair of the supplemental + supp: a supplemental alignment for the mate pair of the primary supplied + set_cigar: true if we should update/create the mate CIGAR (MC) optional tag, + false if we should remove any mate CIGAR tag that is present + """ + supp.next_reference_id = primary.reference_id + supp.next_reference_start = primary.reference_start + supp.mate_is_reverse = primary.is_reverse + supp.mate_is_unmapped = primary.is_unmapped + supp.template_length = -primary.template_length + if set_cigar and not primary.is_unmapped: + supp.set_tag("MC", primary.cigarstring) + else: + supp.set_tag("MC", None) + + @attr.s(frozen=True, auto_attribs=True) class ReadEditInfo: """ @@ -954,6 +1048,16 @@ def set_tag( for rec in self.all_recs(): rec.set_tag(tag, value) + def fixmate(self) -> None: + """Fixes mate information and sets mate CIGAR on all primary and supplementary + (but not secondary) records. + """ + for r2_supplemental in self.r2_supplementals: + set_mate_info_on_supplementary(self.r1, r2_supplemental, True) + for r1_supplemental in self.r1_supplementals: + set_mate_info_on_supplementary(self.r2, r1_supplemental, True) + set_mate_info(self.r1, self.r2) + class TemplateIterator(Iterator[Template]): """ diff --git a/fgpyo/sam/builder.py b/fgpyo/sam/builder.py index 8ca0921d..9d22b98f 100755 --- a/fgpyo/sam/builder.py +++ b/fgpyo/sam/builder.py @@ -230,7 +230,7 @@ def _set_length_dependent_fields( If any of bases, quals or cigar are defined, they must all have the same length/query length. If none are defined then the length parameter is used. Undefined values are - synthesize at the inferred length. + synthesized at the inferred length. Args: rec: a SAM record @@ -264,70 +264,6 @@ def _set_length_dependent_fields( if not rec.is_unmapped: rec.cigarstring = cigar if cigar else f"{length}M" - def _set_mate_info(self, r1: pysam.AlignedSegment, r2: pysam.AlignedSegment) -> None: - """Sets the mate information on a pair of sam records. - - Handles cases where both reads are mapped, one of the two reads is unmapped or both reads - are unmapped. - - Args: - r1: the first read in the pair - r2: the sceond read in the pair - """ - for rec in r1, r2: - rec.template_length = 0 - rec.is_proper_pair = False - - if r1.is_unmapped and r2.is_unmapped: - # If they're both unmapped just clean the records up - for rec, other in [(r1, r2), (r2, r1)]: - rec.reference_id = sam.NO_REF_INDEX - rec.next_reference_id = sam.NO_REF_INDEX - rec.reference_start = sam.NO_REF_POS - rec.next_reference_start = sam.NO_REF_POS - rec.is_unmapped = True - rec.mate_is_unmapped = True - rec.is_proper_pair = False - rec.mate_is_reverse = other.is_reverse - - elif r1.is_unmapped or r2.is_unmapped: - # If only one is mapped/unmapped copy over the relevant stuff - (m, u) = (r1, r2) if r2.is_unmapped else (r2, r1) - u.reference_id = m.reference_id - u.reference_start = m.reference_start - u.next_reference_id = m.reference_id - u.next_reference_start = m.reference_start - u.mate_is_reverse = m.is_reverse - u.mate_is_unmapped = False - u.set_tag("MC", m.cigarstring) - - m.next_reference_id = u.reference_id - m.next_reference_start = u.reference_start - m.mate_is_reverse = u.is_reverse - m.mate_is_unmapped = True - - else: - # Else they are both mapped - for rec, other in [(r1, r2), (r2, r1)]: - rec.next_reference_id = other.reference_id - rec.next_reference_start = other.reference_start - rec.mate_is_reverse = other.is_reverse - rec.mate_is_unmapped = False - rec.set_tag("MC", other.cigarstring) - - if r1.reference_id == r2.reference_id: - r1p = r1.reference_end if r1.is_reverse else r1.reference_start - r2p = r2.reference_end if r2.is_reverse else r2.reference_start - r1.template_length = r2p - r1p - r2.template_length = r1p - r2p - - # Arbitrarily set proper pair if the we have an FR pair with isize <= 1000 - if r1.is_reverse != r2.is_reverse and abs(r1.template_length) <= 1000: - fpos, rpos = (r2p, r1p) if r1.is_reverse else (r1p, r2p) - if fpos < rpos: - r1.is_proper_pair = True - r2.is_proper_pair = True - def rg(self) -> Dict[str, Any]: """Returns the single read group that is defined in the header.""" # The `RG` field contains a list of read group mappings @@ -468,7 +404,7 @@ def add_pair( ) # Sync up mate info and we're done! - self._set_mate_info(r1, r2) + sam.set_mate_info(r1, r2) self._records.append(r1) self._records.append(r2) return r1, r2 @@ -489,12 +425,12 @@ def add_single( supplementary: bool = False, attrs: Optional[Dict[str, Any]] = None, ) -> AlignedSegment: - """Generates a new single reads, adds them to the internal collection, and returns it. + """Generates a new single read, adds it to the internal collection, and returns it. Most fields are optional. If `read_num` is None (the default) an unpaired read will be created. If `read_num` is - set to 1 or 2, the read will have it's paired flag set and read number flags set. + set to 1 or 2, the read will have its paired flag set and read number flags set. An unmapped read can be created by calling the method with no parameters (specifically, not setting chrom, start1 or start2). If cigar is provided, it will be ignored. diff --git a/tests/fgpyo/sam/test_template_iterator.py b/tests/fgpyo/sam/test_template.py similarity index 62% rename from tests/fgpyo/sam/test_template_iterator.py rename to tests/fgpyo/sam/test_template.py index 79ab9283..d30e0687 100644 --- a/tests/fgpyo/sam/test_template_iterator.py +++ b/tests/fgpyo/sam/test_template.py @@ -2,6 +2,7 @@ import pytest +from fgpyo import sam from fgpyo.sam import Template from fgpyo.sam import TemplateIterator from fgpyo.sam import reader @@ -29,7 +30,7 @@ def test_template_init_function() -> None: assert len([t for t in template.r1_secondaries]) == 0 -def test_to_templates() -> None: +def test_template_iterator() -> None: builder = SamBuilder() # Series of alignments for one template @@ -103,7 +104,7 @@ def test_write_template( assert len([r for r in template.all_recs()]) == 2 -def test_set_tag() -> None: +def test_template_set_tag() -> None: builder = SamBuilder() template = Template.build(builder.add_pair(chrom="chr1", start1=100, start2=200)) @@ -129,3 +130,60 @@ def test_set_tag() -> None: for bad_tag in ["", "A", "ABC", "ABCD"]: with pytest.raises(AssertionError, match="Tags must be 2 characters"): template.set_tag(bad_tag, VALUE) + + +def test_template_fixmate() -> None: + builder = SamBuilder(r1_len=100, r2_len=100, base_quality=30) + + # both reads are mapped + r1, r2 = builder.add_pair(name="q1", chrom="chr1", start1=100, start2=500) + r1.cigarstring = "80M20S" + r1.reference_start = 107 + template = Template.build([r1, r2]) + template.fixmate() + assert r1.get_tag("MC") == "100M" + assert r2.get_tag("MC") == "80M20S" + assert r2.next_reference_start == 107 + + # only read 1 is mapped + r1, r2 = builder.add_pair(name="q1", chrom="chr1", start1=100) + r1.cigarstring = "80M20S" + r1.reference_start = 107 + template = Template.build([r1, r2]) + template.fixmate() + with pytest.raises(KeyError): r1.get_tag("MC") + assert r2.get_tag("MC") == "80M20S" + assert r2.next_reference_start == 107 + + # neither reads are mapped + r1, r2 = builder.add_pair(chrom=sam.NO_REF_NAME) + r1.cigarstring = "80M20S" + template = Template.build([r1, r2]) + template.fixmate() + with pytest.raises(KeyError): r1.get_tag("MC") + with pytest.raises(KeyError): r2.get_tag("MC") + + # all supplementary (and not secondard) records should be updated + r1, r2 = builder.add_pair(name="q1", chrom="chr1", start1=100, start2=500) + supp1a = builder.add_single(name="q1", read_num=1, chrom="chr1", start=101, supplementary=True) + supp1b = builder.add_single(name="q1", read_num=1, chrom="chr1", start=102, supplementary=True) + supp2a = builder.add_single(name="q1", read_num=2, chrom="chr1", start=501, supplementary=True) + supp2b = builder.add_single(name="q1", read_num=2, chrom="chr1", start=502, supplementary=True) + sec1 = builder.add_single(name="q1", read_num=1, chrom="chr1", start=1001, secondary=True) + sec2 = builder.add_single(name="q1", read_num=2, chrom="chr1", start=1002, secondary=True) + r1.cigarstring = "80M20S" + r1.reference_start = 107 + template = Template.build([r1, r2, supp1a, supp1b, supp2a, supp2b, sec1, sec2]) + template.fixmate() + assert r1.get_tag("MC") == "100M" + assert supp1a.get_tag("MC") == "100M" + assert supp1b.get_tag("MC") == "100M" + with pytest.raises(KeyError): sec1.get_tag("MC") + assert r2.get_tag("MC") == "80M20S" + assert supp2a.get_tag("MC") == "80M20S" + assert supp2b.get_tag("MC") == "80M20S" + with pytest.raises(KeyError): sec2.get_tag("MC") + assert r2.next_reference_start == 107 + assert supp2a.next_reference_start == 107 + assert supp2b.next_reference_start == 107 + assert sec2.next_reference_start == -1