feature(isoslam): Adds extract_segment_pair() function

This function reads through a `.bam` file, which should already have been sorted and assigned, and produces pairs of reads.
sudlab · Dec 10, 2024 · d1efad4 · d1efad4
1 parent b8fec60
commit d1efad4
Show file tree

Hide file tree

Showing 4 changed files with 223 additions and 59 deletions.
diff --git a/docs/source/workflow.md b/docs/source/workflow.md
@@ -56,6 +56,16 @@ graph TD;
 
 ```
 
+## IsoSLAM
+
+A number of pre-processing steps are undertaken prior to IsoSLAM work being done. The following is work in progress as
+the code is refactored.
+
+1. Iterate over `.bam` file and pair segments. If two or more `AlignedSegments` with the same `query_name` are found
+   then `n > 1` segments are dropped.
+2. Pairs of segments (individual `AlignedSegments`) are then assessed and if they are `Assigned` the `start`, `end`,
+   `length`, `status` (i.e. `Assigned`), `transcript_id`, `block_start` and `block_end` are extracted.
+
 ## Updating
 
 The above diagram is written in [Mermaid][mermaid]. You can view the source code in the IsoSLAM repository and

diff --git a/isoslam/isoslam.py b/isoslam/isoslam.py
@@ -1,6 +1,7 @@
 """IsoSLAM module."""
 
 from collections import defaultdict
+from collections.abc import Generator
 from pathlib import Path
 from typing import Any
 
@@ -68,6 +69,37 @@ def extract_strand_transcript(gtf_file: str | Path) -> tuple[defaultdict[Any, An
     return (strand, transcript)
 
 
+def extract_segment_pairs(bam_file: str | Path) -> Generator[AlignedSegment]:
+    """
+    Extract pairs of AlignedSegments from a ``.bam`` file.
+
+    When there are two adjacent ``AlignedSegments`` with the same ``query_name`` only the first is paired, subsequent
+    segments are dropped.
+
+    Parameters
+    ----------
+    bam_file : str | Path
+        Path to a ``.bam`` file.
+
+    Yields
+    ------
+    Generator
+        Itterable of paired segments.
+    """
+    previous_read: str | None = None
+    pair: list[AlignedSegment] = []
+    for read in io.load_file(bam_file):
+        # Return pairs of reads, i.e. not on first pass, nor if query_name matches the previous read
+        if previous_read is not None and previous_read != read.query_name:
+            yield pair
+            pair = []
+            previous_read = read.query_name
+        previous_read = read.query_name
+        pair.append(read)
+    # Don't forget to return the last pair!
+    yield pair
+
+
 def extract_features_from_read(read: AlignedSegment) -> dict[str, int | str | None | tuple[int, int]]:
     """
     Extract start, end and length from an aligned segment read.

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -3,7 +3,7 @@
 from pathlib import Path
 
 import pytest
-from pysam import AlignedSegment, AlignmentFile
+from pysam import AlignmentFile
 
 from isoslam import io
 
@@ -14,15 +14,15 @@
 BED_DIR = RESOURCES / "bed"
 VCF_DIR = RESOURCES / "vcf"
 BAM_DIR = RESOURCES / "bam"
-
+BAM_SORTED_ASSIGNED_DIR = BAM_DIR / "sorted_assigned"
 
 # pylint: disable=redefined-outer-name
 
 
 @pytest.fixture()
-def bam_file() -> AlignmentFile:
+def bam_file1() -> AlignmentFile:
     """Load a bam file."""
-    return io.load_file(BAM_DIR / "d0_no4sU_filtered_remapped_sorted.bam")
+    return io.load_file(BAM_SORTED_ASSIGNED_DIR / "d0_no4sU_filtered_remapped_sorted.sorted.assigned.bam")
 
 
 @pytest.fixture()

diff --git a/tests/test_isoslam.py b/tests/test_isoslam.py
@@ -1,6 +1,7 @@
 """Tests for the isoslam module."""
 
 from pathlib import Path
+from types import GeneratorType
 from typing import Any
 
 import pytest  # type: ignore[import-not-found]
@@ -9,6 +10,11 @@
 
 BASE_DIR = Path.cwd()
 RESOURCES = BASE_DIR / "tests" / "resources"
+BAM_DIR = RESOURCES / "bam"
+BED_DIR = RESOURCES / "bed"
+GTF_DIR = RESOURCES / "gtf"
+VCF_DIR = RESOURCES / "vcf"
+BAM_SORTED_ASSIGNED_DIR = BAM_DIR / "sorted_assigned"
 
 # pylint: disable=too-many-arguments
 # pylint: disable=too-many-positional-arguments
@@ -18,7 +24,7 @@
     ("bed_file", "expected_transcript"),
     [
         pytest.param(  # type: ignore[misc]
-            RESOURCES / "bed" / "test_coding_introns.bed",
+            BED_DIR / "test_coding_introns.bed",
             {
                 "ENST00000442898": [
                     ("9", 14940, 15080, "ENST00000442898", "-"),
@@ -49,7 +55,7 @@ def test_isoslam_extract_transcripts(
     ("gtf_file", "expected_strand", "expected_transcript"),
     [
         pytest.param(  # type: ignore[misc]
-            RESOURCES / "gtf" / "test_wash1.gtf",
+            GTF_DIR / "test_wash1.gtf",
             {"MSTRG.63147": "-"},
             {"MSTRG.63147": ["ENST00000442898"]},
             id="gtf file as Path",
@@ -66,68 +72,28 @@ def test_extract_strand_transcript(
 
 
 @pytest.mark.parametrize(
-    ("aligned_segment", "start", "end", "length", "status", "transcript", "block_start", "block_end"),
+    ("bam_file", "expected_length"),
     [
         pytest.param(  # type: ignore[misc]
-            "aligned_segment_28584",
-            28584,
-            28733,
-            149,
-            None,
-            None,
-            (28584, 28704),
-            (28704, 28733),
-            id="28584 - Assignment and Transcript are None",
+            BAM_SORTED_ASSIGNED_DIR / "d0_no4sU_filtered_remapped_sorted.sorted.assigned.bam",
+            710,
+            id="file 1",
         ),
         pytest.param(
-            "aligned_segment_17416",
-            17416,
-            17805,
-            389,
-            "Assigned",
-            "Something",
-            (17416, 17718),
-            (17479, 17805),
-            id="17416 - Assignment and Transcript are None",
-            marks=pytest.mark.xfail(reasion="Not getting Assigned or Transcript tags for some reason?"),
-        ),
-        pytest.param(
-            "aligned_segment_18029",
-            18029,
-            18385,
-            356,
-            "Assigned",
-            "something",
-            (18029, 18380),
-            (18174, 18385),
-            id="17416 - Assignment and Transcript are None",
-            marks=pytest.mark.xfail(reasion="Not getting Assigned tag for some reason?"),
+            BAM_SORTED_ASSIGNED_DIR / "d0_0hr1_filtered_remapped_sorted.sorted.assigned.bam",
+            1845,
+            id="file 2",
         ),
     ],
 )
-def test_extract_features_from_read_param(
-    aligned_segment: str,
-    start: int,
-    end: int,
-    length: int,
-    status: str,
-    transcript: str,
-    block_start: tuple[int, int],
-    block_end: tuple[int, int],
-    request: pytest.FixtureRequest,
-) -> None:
-    """Test extract of features from an aligned segment read."""
-    segment = isoslam.extract_features_from_read(request.getfixturevalue(aligned_segment))
-    assert isinstance(segment, dict)
-    assert segment["start"] == start
-    assert segment["end"] == end
-    assert segment["length"] == length
-    assert segment["status"] == status
-    assert segment["transcript"] == transcript
-    assert segment["block_start"] == block_start
-    assert segment["block_end"] == block_end
+def test_extract_segment_pairs(bam_file: str | Path, expected_length: int) -> None:
+    """Test extraction of paired segments from a sorted and assigned ``.bam`` file."""
+    alignment_file = isoslam.extract_segment_pairs(bam_file)
+    assert isinstance(alignment_file, GeneratorType)
+    assert len(list(alignment_file)) == expected_length
 
 
+<<<<<<< HEAD
 <<<<<<< HEAD
 @pytest.mark.parametrize(
     ("aligned_segment1", "aligned_segment2", "expected"),
@@ -424,3 +390,159 @@ def test_extract_features_from_pair(
     assert isinstance(read_pair, dict)
     assert read_pair == expected
 >>>>>>> 3878e7a (fixup! feature(isoslam): Adds functions to extract features from reads)
+||||||| parent of 3ee6aa7 (feature(isoslam): Adds extract_segment_pair() function)
+@pytest.mark.parametrize(
+    ("aligned_segment1", "aligned_segment2", "expected"),
+    [
+        pytest.param(  # type: ignore[misc]
+            "aligned_segment_28584",
+            "aligned_segment_17416",
+            {
+                "read1": {
+                    "start": 28584,
+                    "end": 28733,
+                    "length": 149,
+                    "status": None,
+                    "transcript": None,
+                    "block_start": (28584, 28704),
+                    "block_end": (28704, 28733),
+                },
+                "read2": {
+                    "start": 17416,
+                    "end": 17805,
+                    "length": 389,
+                    "status": None,
+                    "transcript": None,
+                    "block_start": (17416, 17718),
+                    "block_end": (17479, 17805),
+                },
+            },
+            id="28584 and 17416 - Assignment and Transcript are None",
+        ),
+    ],
+)
+def test_extract_features_from_pair(
+    aligned_segment1: str,
+    aligned_segment2: str,
+    expected: dict[str, dict[str, int | None | str | tuple[int, int]]],
+    request: pytest.FixtureRequest,
+) -> None:
+    """Test extract of features from a list of pairs."""
+    read_pair = isoslam.extract_features_from_pair(
+        [
+            request.getfixturevalue(aligned_segment1),
+            request.getfixturevalue(aligned_segment2),
+        ]
+    )
+    assert isinstance(read_pair, dict)
+    assert read_pair == expected
+=======
+# @pytest.mark.parametrize(
+#     ("aligned_segment", "start", "end", "length", "status", "transcript", "block_start", "block_end"),
+#     [
+#         pytest.param(  # type: ignore[misc]
+#             "aligned_segment_28584",
+#             28584,
+#             28733,
+#             149,
+#             None,
+#             None,
+#             (28584, 28704),
+#             (28704, 28733),
+#             id="28584 - Assignment and Transcript are None",
+#         ),
+#         pytest.param(
+#             "aligned_segment_17416",
+#             17416,
+#             17805,
+#             389,
+#             "Assigned",
+#             "Something",
+#             (17416, 17718),
+#             (17479, 17805),
+#             id="17416 - Assignment and Transcript are None",
+#             marks=pytest.mark.xfail(reasion="Not getting Assigned or Transcript tags for some reason?"),
+#         ),
+#         pytest.param(
+#             "aligned_segment_18029",
+#             18029,
+#             18385,
+#             356,
+#             "Assigned",
+#             "something",
+#             (18029, 18380),
+#             (18174, 18385),
+#             id="17416 - Assignment and Transcript are None",
+#             marks=pytest.mark.xfail(reasion="Not getting Assigned tag for some reason?"),
+#         ),
+#     ],
+# )
+# def test_extract_features_from_read_param(
+#     aligned_segment: str,
+#     start: int,
+#     end: int,
+#     length: int,
+#     status: str,
+#     transcript: str,
+#     block_start: tuple[int, int],
+#     block_end: tuple[int, int],
+#     request: pytest.FixtureRequest,
+# ) -> None:
+#     """Test extract of features from an aligned segment read."""
+#     segment = isoslam.extract_features_from_read(request.getfixturevalue(aligned_segment))
+#     assert isinstance(segment, dict)
+#     assert segment["start"] == start
+#     assert segment["end"] == end
+#     assert segment["length"] == length
+#     assert segment["status"] == status
+#     assert segment["transcript"] == transcript
+#     assert segment["block_start"] == block_start
+#     assert segment["block_end"] == block_end
+
+
+# @pytest.mark.parametrize(
+#     ("aligned_segment1", "aligned_segment2", "expected"),
+#     [
+#         pytest.param(  # type: ignore[misc]
+#             "aligned_segment_28584",
+#             "aligned_segment_17416",
+#             {
+#                 "read1": {
+#                     "start": 28584,
+#                     "end": 28733,
+#                     "length": 149,
+#                     "status": None,
+#                     "transcript": None,
+#                     "block_start": (28584, 28704),
+#                     "block_end": (28704, 28733),
+#                 },
+#                 "read2": {
+#                     "start": 17416,
+#                     "end": 17805,
+#                     "length": 389,
+#                     "status": None,
+#                     "transcript": None,
+#                     jls_extract_var: (17416, 17718),
+#                     "block_end": (17479, 17805),
+#                 },
+#             },
+#             id="28584 and 17416 - Assignment and Transcript are None",
+#         ),
+#     ],
+# )
+# def test_extract_features_from_pair(
+#     aligned_segment1: str,
+#     aligned_segment2: str,
+#     expected: dict[str, dict[str, int | None | str | tuple[int, int]]],
+#     request: pytest.FixtureRequest,
+# ) -> None:
+#     """Test extract of features from a list of pairs."""
+#     read_pair = isoslam.extract_features_from_pair(
+#         [
+#             request.getfixturevalue(aligned_segment1),
+#             request.getfixturevalue(aligned_segment2),
+#         ]
+#     )
+#     assert isinstance(read_pair, dict)
+#     assert read_pair == expected
+>>>>>>> 3ee6aa7 (feature(isoslam): Adds extract_segment_pair() function)