feature(isoslam): Adds functions to extract features from reads

Closes #88 Adds functions to extract features from a read both individual `pysam.AlignedSegment` and pairs of reads returned from the `pysam.fetch()` iterator. - Includes tests. - Extracts - `start` - `end` - `length` - tag for `XS` if present, otherwise `None` - tag for `XT` if present, otherwise `None` - zips up the blocks and returns them as `block_start` and `block_end` Tests use fixtures defined in `tests/conftest.py` and I had trouble trying to moock `pysam.AlignedSegment` as a number of the attributes I wanted to mock were immutable. I therefore opted to load a `.bam` file and extract some individual `AlignedSegments` as features and use these in the parameterised tests. However, confusingly the value I used for `start` in the `fetch()` call did not match the first item returned by the generator (this can be seen by the disconnect between the `start` parameters in the `fetch()` calls and the names of the fixtures which reflect the values that are tested and used in the parameter names). I'm not sure why this is, but one consequence of it is that I'm yet to have extracted any `AlignedSegment` where the `get_tag("XS") == Aligned` and so in turn these are all `None`, similarly there are no `get_tag("XT")` (the transcript id?). Guidance on how to identify and extract these would be very welcome (I just stuck some `print()` statements into the section of `all_introns_counts_and_info.py` where the `get_tag("XT") calls were made to show the `read1_start` at that point, very crude as I'm still fumbling my way around these files and their specification!).
sudlab · Dec 6, 2024 · d518e53 · d518e53
1 parent 34892d8
commit d518e53
Show file tree

Hide file tree

Showing 2 changed files with 196 additions and 2 deletions.
diff --git a/isoslam/isoslam.py b/isoslam/isoslam.py
@@ -5,6 +5,7 @@
 from typing import Any
 
 from loguru import logger
+from pysam import AlignedSegment
 
 from isoslam import io
 
@@ -53,8 +54,8 @@ def extract_strand_transcript(gtf_file: str | Path) -> tuple[defaultdict[Any, An
     Returns
     -------
     tuple[dict[str, tuple[str]], dict[str, tuple[str]]]
-        Two dictionaries are returned, one of the ``strand`` the other of the ``transcript_id`` both using the ``gene_id`` as
-        the key.
+        Two dictionaries are returned, one of the ``strand`` the other of the ``transcript_id`` both using the
+        ``gene_id`` as the key.
     """
     strand = defaultdict(str)
     transcript = defaultdict(list)
@@ -65,3 +66,83 @@ def extract_strand_transcript(gtf_file: str | Path) -> tuple[defaultdict[Any, An
         transcript[entry.gene_id].append(entry.transcript_id)
     logger.info(f"Extracted features from : {gtf_file}")
     return (strand, transcript)
+
+
+def extract_features_from_read(read: AlignedSegment) -> dict[str, int | str | None | tuple[int, int]]:
+    """
+    Extract start, end and length from an aligned segment read.
+
+    Parameters
+    ----------
+    read : AlignedSegment
+        An aligned segment read from ``pysam``.
+
+    Returns
+    -------
+    dict[str, Any]
+        Dictionary of ``start``, ``end`` and ``length`` of the segment.
+    """
+    block_start, block_end = zip(*read.get_blocks())
+    try:
+        status = read.get_tag("XS")
+    except KeyError:
+        status = None
+    try:
+        transcript = read.get_tag("XT")
+    except KeyError:
+        transcript = None
+    return {
+        "start": read.reference_start,
+        "end": read.reference_end,
+        "length": read.reference_length,
+        "status": status,
+        "transcript": transcript,
+        "block_start": block_start,
+        "block_end": block_end,
+    }
+
+
+def extract_features_from_pair(pair: list[AlignedSegment]) -> dict[str, dict[str, Any]]:
+    """
+    Extract features from a pair of reads.
+
+    Parameters
+    ----------
+    pair : list[AlignedSegment]
+        A list of two aligned segments from ``pysam``.
+
+    Returns
+    -------
+    dic[str, dict[str, Any]]
+        Returns a nested dictionaries of the ``start``, ``end`` and ``length`` of each read.
+    """
+    return {
+        "read1": extract_features_from_read(pair[0]),
+        "read2": extract_features_from_read(pair[1]),
+    }
+
+
+# def extract_utron(read: AlignedSegment, transcript_to_gene: Any, tag: str = "XS") -> list | None:
+#     """
+#     Extract and sum the utrons based on tag.
+
+#     ACTION : This function needs better documentation, my guess is that its extracting the transcripts to genes and
+#     then getting some related information (what I'm not sure) from the .bed file and adding these up.
+
+#     Parameters
+#     ----------
+#     read : AlignedSegement
+#         An aligned segment read.
+#     transcript_to_gene : TextIO
+#         Transcript to gene from a ``.bed`` file.
+#     tag : str
+#         Type of tag to extract.
+
+#     Returns
+#     -------
+#     list | None
+#         List of the length of assigned regions.
+#     """
+#     if read.get_tag(tag) == "Assigned":
+#         return sum(bed_file[transcript] for transcript in tx2gene[read.get_tag("XT")])
+#     return None
diff --git a/tests/test_isoslam.py b/tests/test_isoslam.py
@@ -10,6 +10,9 @@
 BASE_DIR = Path.cwd()
 RESOURCES = BASE_DIR / "tests" / "resources"
 
+# pylint: disable=too-many-arguments
+# pylint: disable=too-many-positional-arguments
+
 
 @pytest.mark.parametrize(
     ("bed_file", "expected_transcript"),
@@ -60,3 +63,113 @@ def test_extract_strand_transcript(
     strand, transcript = isoslam.extract_strand_transcript(gtf_file)
     assert strand == expected_strand
     assert transcript == expected_transcript
+
+
+@pytest.mark.parametrize(
+    ("aligned_segment", "start", "end", "length", "status", "transcript", "block_start", "block_end"),
+    [
+        pytest.param(  # type: ignore[misc]
+            "aligned_segment_28584",
+            28584,
+            28733,
+            149,
+            None,
+            None,
+            (28584, 28704),
+            (28704, 28733),
+            id="28584 - Assignment and Transcript are None",
+        ),
+        pytest.param(
+            "aligned_segment_17416",
+            17416,
+            17805,
+            389,
+            "Assigned",
+            "Something",
+            (17416, 17718),
+            (17479, 17805),
+            id="17416 - Assignment and Transcript are None",
+            marks=pytest.mark.xfail(reasion="Not getting Assigned or Transcript tags for some reason?"),
+        ),
+        pytest.param(
+            "aligned_segment_18029",
+            18029,
+            18385,
+            356,
+            "Assigned",
+            "something",
+            (18029, 18380),
+            (18174, 18385),
+            id="17416 - Assignment and Transcript are None",
+            marks=pytest.mark.xfail(reasion="Not getting Assigned tag for some reason?"),
+        ),
+    ],
+)
+def test_extract_features_from_read_param(
+    aligned_segment: str,
+    start: int,
+    end: int,
+    length: int,
+    status: str,
+    transcript: str,
+    block_start: tuple[int, int],
+    block_end: tuple[int, int],
+    request: pytest.FixtureRequest,
+) -> None:
+    """Test extract of features from an aligned segment read."""
+    segment = isoslam.extract_features_from_read(request.getfixturevalue(aligned_segment))
+    assert isinstance(segment, dict)
+    assert segment["start"] == start
+    assert segment["end"] == end
+    assert segment["length"] == length
+    assert segment["status"] == status
+    assert segment["transcript"] == transcript
+    assert segment["block_start"] == block_start
+    assert segment["block_end"] == block_end
+
+
+@pytest.mark.parametrize(
+    ("aligned_segment1", "aligned_segment2", "expected"),
+    [
+        pytest.param(  # type: ignore[misc]
+            "aligned_segment_28584",
+            "aligned_segment_17416",
+            {
+                "read1": {
+                    "start": 28584,
+                    "end": 28733,
+                    "length": 149,
+                    "status": None,
+                    "transcript": None,
+                    "block_start": (28584, 28704),
+                    "block_end": (28704, 28733),
+                },
+                "read2": {
+                    "start": 17416,
+                    "end": 17805,
+                    "length": 389,
+                    "status": None,
+                    "transcript": None,
+                    "block_start": (17416, 17718),
+                    "block_end": (17479, 17805),
+                },
+            },
+            id="28584 and 17416 - Assignment and Transcript are None",
+        ),
+    ],
+)
+def test_extract_features_from_pair(
+    aligned_segment1: str,
+    aligned_segment2: str,
+    expected: dict[str, dict[str, int | None | str | tuple[int, int]]],
+    request: pytest.FixtureRequest,
+) -> None:
+    """Test extract of features from a list of pairs."""
+    read_pair = isoslam.extract_features_from_pair(
+        [
+            request.getfixturevalue(aligned_segment1),
+            request.getfixturevalue(aligned_segment2),
+        ]
+    )
+    assert isinstance(read_pair, dict)
+    assert read_pair == expected