From e5bf00f741748a6c08447af97f66038602d2999a Mon Sep 17 00:00:00 2001 From: Matt Stone Date: Tue, 17 Oct 2023 10:01:23 -0400 Subject: [PATCH] feat: add chrom1/chrom2 arguments to add_pair --- fgpyo/sam/builder.py | 11 ++-- fgpyo/sam/tests/test_builder.py | 69 ++++++++++++++--------- fgpyo/sam/tests/test_clipping.py | 4 +- fgpyo/sam/tests/test_sam.py | 2 +- fgpyo/sam/tests/test_template_iterator.py | 4 +- fgpyo/util/tests/test_logging.py | 4 +- 6 files changed, 56 insertions(+), 38 deletions(-) diff --git a/fgpyo/sam/builder.py b/fgpyo/sam/builder.py index faf5aa72..e1c719d7 100755 --- a/fgpyo/sam/builder.py +++ b/fgpyo/sam/builder.py @@ -340,7 +340,8 @@ def add_pair( bases2: Optional[str] = None, quals1: Optional[List[int]] = None, quals2: Optional[List[int]] = None, - chrom: str = sam.NO_REF_NAME, + chrom1: str = sam.NO_REF_NAME, + chrom2: Optional[str] = None, start1: int = sam.NO_REF_POS, start2: int = sam.NO_REF_POS, cigar1: Optional[str] = None, @@ -379,7 +380,8 @@ def add_pair( bases2: The bases for R2. If None is given a random sequence is generated. quals1: The list of int qualities for R1. If None, the default base quality is used. quals2: The list of int qualities for R2. If None, the default base quality is used. - chrom: The chromosome to which both reads are mapped. Defaults to the unmapped value. + chrom1: The chromosome to which R1 is mapped. Defaults to the unmapped value. + chrom2: The chromosome to which R2 is mapped. If None, `chrom1` is used. start1: The start position of R1. Defaults to the unmapped value. start2: The start position of R2. Defaults to the unmapped value. cigar1: The cigar string for R1. Defaults to None for unmapped reads, otherwise all M. @@ -404,16 +406,17 @@ def add_pair( raise ValueError(f"Invalid value for strand2: {strand2}") name = name if name is not None else self._next_name() + chrom2 = chrom2 if chrom2 is not None else chrom1 # Setup R1 - r1 = self._new_rec(name=name, chrom=chrom, start=start1, mapq=mapq1, attrs=attrs) + r1 = self._new_rec(name=name, chrom=chrom1, start=start1, mapq=mapq1, attrs=attrs) self._set_flags(r1, read_num=1, strand=strand1) self._set_length_dependent_fields( rec=r1, length=self.r1_len, bases=bases1, quals=quals1, cigar=cigar1 ) # Setup R2 - r2 = self._new_rec(name=name, chrom=chrom, start=start2, mapq=mapq2, attrs=attrs) + r2 = self._new_rec(name=name, chrom=chrom2, start=start2, mapq=mapq2, attrs=attrs) self._set_flags(r2, read_num=2, strand=strand2) self._set_length_dependent_fields( rec=r2, length=self.r2_len, bases=bases2, quals=quals2, cigar=cigar2 diff --git a/fgpyo/sam/tests/test_builder.py b/fgpyo/sam/tests/test_builder.py index 28bfa49b..7945fbf9 100755 --- a/fgpyo/sam/tests/test_builder.py +++ b/fgpyo/sam/tests/test_builder.py @@ -10,7 +10,7 @@ def test_add_pair_all_fields() -> None: builder = SamBuilder() builder.add_pair( name="q1", - chrom="chr1", + chrom1="chr1", bases1="ACGTG", quals1=[20, 21, 22, 23, 24], start1=10000, @@ -52,7 +52,7 @@ def test_add_pair_all_fields() -> None: def test_add_pair_minimal() -> None: builder = SamBuilder(r1_len=10, r2_len=5, base_quality=25, mapping_quality=20) - r1, r2 = builder.add_pair(chrom="chr1", start1=1000, start2=1200) + r1, r2 = builder.add_pair(chrom1="chr1", start1=1000, start2=1200) assert r1.query_name == r2.query_name assert r1.reference_name == r2.reference_name == "chr1" assert r1.reference_start == 1000 @@ -71,12 +71,14 @@ def test_add_pair_minimal() -> None: def test_add_pair_mix_and_match() -> None: builder = SamBuilder(r1_len=100, r2_len=100, base_quality=30) - r1, r2 = builder.add_pair(chrom="chr1", start1=500, start2=700, cigar1="75M", cigar2="9M1I30M") + r1, r2 = builder.add_pair( + chrom1="chr1", start1=500, start2=700, cigar1="75M", cigar2="9M1I30M" + ) assert len(r1.query_sequence) == len(r1.query_qualities) == 75 assert len(r2.query_sequence) == len(r2.query_qualities) == 40 r1, r2 = builder.add_pair( - chrom="chr1", start1=500, start2=700, bases1="ACGTGCATGC", bases2="ACGAC" + chrom1="chr1", start1=500, start2=700, bases1="ACGTGCATGC", bases2="ACGAC" ) assert len(r1.query_sequence) == len(r1.query_qualities) == 10 assert len(r2.query_sequence) == len(r2.query_qualities) == 5 @@ -84,7 +86,7 @@ def test_add_pair_mix_and_match() -> None: assert r2.cigarstring == "5M" r1, r2 = builder.add_pair( - chrom="chr1", start1=500, start2=700, quals1=[30] * 20, quals2=[20] * 10 + chrom1="chr1", start1=500, start2=700, quals1=[30] * 20, quals2=[20] * 10 ) assert len(r1.query_sequence) == len(r1.query_qualities) == 20 assert len(r2.query_sequence) == len(r2.query_qualities) == 10 @@ -93,18 +95,18 @@ def test_add_pair_mix_and_match() -> None: # Now what if we provide multiple values that are inconsistent with pytest.raises(ValueError, match="not length compatible"): - builder.add_pair(chrom="chr1", start1=10, start2=99, bases1="ACGTG", cigar1="10M") + builder.add_pair(chrom1="chr1", start1=10, start2=99, bases1="ACGTG", cigar1="10M") with pytest.raises(ValueError, match="not length compatible"): - builder.add_pair(chrom="chr1", start1=10, start2=99, bases1="ACGTG", quals1=[2, 2]) + builder.add_pair(chrom1="chr1", start1=10, start2=99, bases1="ACGTG", quals1=[2, 2]) with pytest.raises(ValueError, match="not length compatible"): - builder.add_pair(chrom="chr1", start1=10, start2=99, quals1=[2, 2], cigar1="5M") + builder.add_pair(chrom1="chr1", start1=10, start2=99, quals1=[2, 2], cigar1="5M") def test_unmapped_reads() -> None: builder = SamBuilder() - r1, r2 = builder.add_pair(chrom="chr1", start1=1000) + r1, r2 = builder.add_pair(chrom1="chr1", start1=1000) assert not r1.is_unmapped assert r1.mate_is_unmapped assert r2.is_unmapped @@ -115,7 +117,7 @@ def test_unmapped_reads() -> None: assert rec.next_reference_name == "chr1" assert rec.next_reference_start == 1000 - r1, r2 = builder.add_pair(chrom="chr1", start2=2000) + r1, r2 = builder.add_pair(chrom1="chr1", start2=2000) assert r1.is_unmapped assert not r1.mate_is_unmapped assert not r2.is_unmapped @@ -126,7 +128,7 @@ def test_unmapped_reads() -> None: assert rec.next_reference_name == "chr1" assert rec.next_reference_start == 2000 - r1, r2 = builder.add_pair(chrom=sam.NO_REF_NAME) + r1, r2 = builder.add_pair(chrom1=sam.NO_REF_NAME) assert r1.is_unmapped assert r1.mate_is_unmapped assert r2.is_unmapped @@ -140,33 +142,46 @@ def test_unmapped_reads() -> None: def test_invalid_strand() -> None: with pytest.raises(ValueError, match="strand"): - SamBuilder().add_pair(chrom="chr1", start1=100, start2=200, strand1="F", strand2="R") + SamBuilder().add_pair(chrom1="chr1", start1=100, start2=200, strand1="F", strand2="R") def test_proper_pair() -> None: builder = SamBuilder() # Regular innies - for rec in builder.add_pair(chrom="chr1", start1=5000, start2=5200, strand1="+", strand2="-"): + for rec in builder.add_pair(chrom1="chr1", start1=5000, start2=5200, strand1="+", strand2="-"): assert rec.is_proper_pair - for rec in builder.add_pair(chrom="chr1", start1=5200, start2=5000, strand1="-", strand2="+"): + for rec in builder.add_pair(chrom1="chr1", start1=5200, start2=5000, strand1="-", strand2="+"): assert rec.is_proper_pair # Outies - for rec in builder.add_pair(chrom="chr1", start1=5000, start2=5200, strand1="-", strand2="+"): + for rec in builder.add_pair(chrom1="chr1", start1=5000, start2=5200, strand1="-", strand2="+"): assert not rec.is_proper_pair - for rec in builder.add_pair(chrom="chr1", start1=5200, start2=5000, strand1="+", strand2="-"): + for rec in builder.add_pair(chrom1="chr1", start1=5200, start2=5000, strand1="+", strand2="-"): assert not rec.is_proper_pair # Unmapped - for rec in builder.add_pair(chrom="chr1", start1=5000, strand1="+"): + for rec in builder.add_pair(chrom1="chr1", start1=5000, strand1="+"): assert not rec.is_proper_pair - for rec in builder.add_pair(chrom="chr1", start2=5000, strand2="+"): + for rec in builder.add_pair(chrom1="chr1", start2=5000, strand2="+"): assert not rec.is_proper_pair for rec in builder.add_pair(): assert not rec.is_proper_pair +def test_discordant_pair() -> None: + builder = SamBuilder() + r1, r2 = builder.add_pair(chrom1="chr1", start1=1000, chrom2="chr2", start2=1000) + + assert r1.query_name == r2.query_name + assert r1.reference_name == "chr1" + assert r2.reference_name == "chr2" + assert r1.reference_start == 1000 + assert r2.reference_start == 1000 + assert not r1.is_reverse + assert r2.is_reverse + + def test_add_single() -> None: builder = SamBuilder(r1_len=25, r2_len=50) @@ -199,10 +214,10 @@ def test_add_single() -> None: def test_sorting() -> None: builder = SamBuilder() - builder.add_pair(chrom="chr1", start1=5000, start2=4700, strand1="-", strand2="+") - builder.add_pair(chrom="chr1", start1=4000, start2=4300) - builder.add_pair(chrom="chr5", start1=4000, start2=4300) - builder.add_pair(chrom="chr2", start1=4000, start2=4300) + builder.add_pair(chrom1="chr1", start1=5000, start2=4700, strand1="-", strand2="+") + builder.add_pair(chrom1="chr1", start1=4000, start2=4300) + builder.add_pair(chrom1="chr5", start1=4000, start2=4300) + builder.add_pair(chrom1="chr2", start1=4000, start2=4300) last_ref_id = -1 last_start = -1 @@ -217,17 +232,17 @@ def test_sorting() -> None: def test_custom_sd() -> None: builder1 = SamBuilder() builder2 = SamBuilder(sd=[{"SN": "hi", "LN": 999}, {"SN": "bye", "LN": 888}]) - builder1.add_pair(chrom="chr1", start1=200, start2=400) - builder2.add_pair(chrom="hi", start1=200, start2=400) + builder1.add_pair(chrom1="chr1", start1=200, start2=400) + builder2.add_pair(chrom1="hi", start1=200, start2=400) with pytest.raises(ValueError, match="not a valid chromosome name"): - builder1.add_pair(chrom="hi", start1=200, start2=400) + builder1.add_pair(chrom1="hi", start1=200, start2=400) with pytest.raises(ValueError, match="not a valid chromosome name"): - builder2.add_pair(chrom="chr1", start1=200, start2=400) + builder2.add_pair(chrom1="chr1", start1=200, start2=400) def test_custom_rg() -> None: builder = SamBuilder(rg={"ID": "novel", "SM": "custom_rg", "LB": "foo", "PL": "ILLUMINA"}) - for rec in builder.add_pair(chrom="chr1", start1=100, start2=200): + for rec in builder.add_pair(chrom1="chr1", start1=100, start2=200): assert rec.get_tag("RG") == "novel" diff --git a/fgpyo/sam/tests/test_clipping.py b/fgpyo/sam/tests/test_clipping.py index 1960a5a8..6e6b44b9 100755 --- a/fgpyo/sam/tests/test_clipping.py +++ b/fgpyo/sam/tests/test_clipping.py @@ -14,7 +14,7 @@ def r(start: Optional[int], cigar: Optional[str], strand: Optional[str] = "+") - """Constructs a read for testing.""" builder = SamBuilder() if start: - r1, r2 = builder.add_pair(chrom="chr1", start1=start, cigar1=cigar, strand1=strand) + r1, r2 = builder.add_pair(chrom1="chr1", start1=start, cigar1=cigar, strand1=strand) else: r1, r2 = builder.add_pair() return r1 @@ -22,7 +22,7 @@ def r(start: Optional[int], cigar: Optional[str], strand: Optional[str] = "+") - def test_make_read_unmapped() -> None: builder = SamBuilder() - r1, r2 = builder.add_pair(chrom="chr1", start1=100, start2=250) + r1, r2 = builder.add_pair(chrom1="chr1", start1=100, start2=250) clipping._make_read_unmapped(r1) assert r1.is_unmapped diff --git a/fgpyo/sam/tests/test_sam.py b/fgpyo/sam/tests/test_sam.py index 14dc36cd..7e395ccf 100755 --- a/fgpyo/sam/tests/test_sam.py +++ b/fgpyo/sam/tests/test_sam.py @@ -288,7 +288,7 @@ def test_is_clipping() -> None: def test_isize() -> None: builder = SamBuilder() - r1, r2 = builder.add_pair(chrom="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") + r1, r2 = builder.add_pair(chrom1="chr1", start1=100, cigar1="115M", start2=250, cigar2="40M") assert sam.isize(r1, r2) == 190 assert sam.isize(r2, r1) == -190 diff --git a/fgpyo/sam/tests/test_template_iterator.py b/fgpyo/sam/tests/test_template_iterator.py index cdefc35a..cfeb0904 100644 --- a/fgpyo/sam/tests/test_template_iterator.py +++ b/fgpyo/sam/tests/test_template_iterator.py @@ -4,7 +4,7 @@ def test_template_init_function() -> None: builder: SamBuilder = SamBuilder() - builder.add_pair(name="x", chrom="chr1", start1=1, start2=2) + builder.add_pair(name="x", chrom1="chr1", start1=1, start2=2) r1 = builder.to_sorted_list()[0] r2 = builder.to_sorted_list()[1] template = Template( @@ -26,7 +26,7 @@ def test_to_templates() -> None: builder = SamBuilder() # Series of alignments for one template - builder.add_pair(name="q1", chrom="chr1", start1=1, start2=2) + builder.add_pair(name="q1", chrom1="chr1", start1=1, start2=2) builder.add_single(name="q1", read_num=1, chrom="chr1", start=1, supplementary=True) builder.add_single(name="q1", read_num=1, chrom="chr1", start=11, supplementary=True) builder.add_single(name="q1", read_num=2, chrom="chr1", start=2, supplementary=True) diff --git a/fgpyo/util/tests/test_logging.py b/fgpyo/util/tests/test_logging.py index 09f21235..66c06181 100644 --- a/fgpyo/util/tests/test_logging.py +++ b/fgpyo/util/tests/test_logging.py @@ -38,8 +38,8 @@ def test_progress_logger_as_context_manager() -> None: builder = SamBuilder() -r1_mapped_named, r2_unmapped_named = builder.add_pair(chrom="chr1", start1=1000) -r1_unmapped_un_named, r2_unmapped_un_named = builder.add_pair(chrom=sam.NO_REF_NAME) +r1_mapped_named, r2_unmapped_named = builder.add_pair(chrom1="chr1", start1=1000) +r1_unmapped_un_named, r2_unmapped_un_named = builder.add_pair(chrom1=sam.NO_REF_NAME) @pytest.mark.parametrize(