sudlab · ns-rse · Dec 12, 2024 · Dec 4, 2024 · Dec 6, 2024 · Dec 9, 2024
diff --git a/docs/source/workflow.md b/docs/source/workflow.md
@@ -56,6 +56,16 @@ graph TD;
 
 ```
 
+## IsoSLAM
+
+A number of pre-processing steps are undertaken prior to IsoSLAM work being done. The following is work in progress as
+the code is refactored.
+
+1. Iterate over `.bam` file and pair segments. If two or more `AlignedSegments` with the same `query_name` are found
+   then `n > 1` segments are dropped.
+2. Pairs of segments (individual `AlignedSegments`) are then assessed and if they are `Assigned` the `start`, `end`,
+   `length`, `status` (i.e. `Assigned`), `transcript_id`, `block_start` and `block_end` are extracted.
+
 ## Updating
 
 The above diagram is written in [Mermaid][mermaid]. You can view the source code in the IsoSLAM repository and

diff --git a/isoslam/isoslam.py b/isoslam/isoslam.py
@@ -1,10 +1,12 @@
 """IsoSLAM module."""
 
 from collections import defaultdict
+from collections.abc import Generator
 from pathlib import Path
 from typing import Any
 
 from loguru import logger
+from pysam import AlignedSegment
 
 from isoslam import io
 
@@ -53,8 +55,8 @@ def extract_strand_transcript(gtf_file: str | Path) -> tuple[defaultdict[Any, An
     Returns
     -------
     tuple[dict[str, tuple[str]], dict[str, tuple[str]]]
-        Two dictionaries are returned, one of the ``strand`` the other of the ``transcript_id`` both using the ``gene_id`` as
-        the key.
+        Two dictionaries are returned, one of the ``strand`` the other of the ``transcript_id`` both using the
+        ``gene_id`` as the key.
     """
     strand = defaultdict(str)
     transcript = defaultdict(list)
@@ -65,3 +67,114 @@ def extract_strand_transcript(gtf_file: str | Path) -> tuple[defaultdict[Any, An
         transcript[entry.gene_id].append(entry.transcript_id)
     logger.info(f"Extracted features from : {gtf_file}")
     return (strand, transcript)
+
+
+def extract_segment_pairs(bam_file: str | Path) -> Generator[AlignedSegment]:
+    """
+    Extract pairs of AlignedSegments from a ``.bam`` file.
+
+    When there are two adjacent ``AlignedSegments`` with the same ``query_name`` only the first is paired, subsequent
+    segments are dropped.
+
+    Parameters
+    ----------
+    bam_file : str | Path
+        Path to a ``.bam`` file.
+
+    Yields
+    ------
+    Generator
+        Itterable of paired segments.
+    """
+    previous_read: str | None = None
+    pair: list[AlignedSegment] = []
+    for read in io.load_file(bam_file):
+        # Return pairs of reads, i.e. not on first pass, nor if query_name matches the previous read
+        if previous_read is not None and previous_read != read.query_name:
+            yield pair
+            pair = []
+            previous_read = read.query_name
+        previous_read = read.query_name
+        pair.append(read)
+    # Don't forget to return the last pair!
+    yield pair
+
+
+def extract_features_from_read(read: AlignedSegment) -> dict[str, int | str | None | tuple[int, int]]:
+    """
+    Extract start, end and length from an aligned segment read.
+
+    Parameters
+    ----------
+    read : AlignedSegment
+        An aligned segment read from ``pysam``.
+
+    Returns
+    -------
+    dict[str, Any]
+        Dictionary of ``start``, ``end`` and ``length`` of the segment.
+    """
+    block_start, block_end = zip(*read.get_blocks())
+    try:
+        status = read.get_tag("XS")
+    except KeyError:
+        status = None
+    try:
+        transcript = read.get_tag("XT")
+    except KeyError:
+        transcript = None
+    return {
+        "start": read.reference_start,
+        "end": read.reference_end,
+        "length": read.reference_length,
+        "status": status,
+        "transcript": transcript,
+        "block_start": block_start,
+        "block_end": block_end,
+    }
+
+
+def extract_features_from_pair(pair: list[AlignedSegment]) -> dict[str, dict[str, Any]]:
+    """
+    Extract features from a pair of reads.
+
+    Parameters
+    ----------
+    pair : list[AlignedSegment]
+        A list of two aligned segments from ``pysam``.
+
+    Returns
+    -------
+    dic[str, dict[str, Any]]
+        Returns a nested dictionaries of the ``start``, ``end`` and ``length`` of each read.
+    """
+    return {
+        "read1": extract_features_from_read(pair[0]),
+        "read2": extract_features_from_read(pair[1]),
+    }
+
+
+# def extract_utron(read: AlignedSegment, transcript_to_gene: Any, tag: str = "XS") -> list | None:
+#     """
+#     Extract and sum the utrons based on tag.
+
+#     ACTION : This function needs better documentation, my guess is that its extracting the transcripts to genes and
+#     then getting some related information (what I'm not sure) from the .bed file and adding these up.
+
+#     Parameters
+#     ----------
+#     read : AlignedSegement
+#         An aligned segment read.
+#     transcript_to_gene : TextIO
+#         Transcript to gene from a ``.bed`` file.
+#     tag : str
+#         Type of tag to extract.
+
+#     Returns
+#     -------
+#     list | None
+#         List of the length of assigned regions.
+#     """
+#     if read.get_tag(tag) == "Assigned":
+#         return sum(bed_file[transcript] for transcript in tx2gene[read.get_tag("XT")])
+#     return None
diff --git a/isoslam/pipeline_slam_3UIs.py b/isoslam/pipeline_slam_3UIs.py
@@ -55,12 +55,12 @@ def feature_count_read_assignments(infile, outfile):
     outdir = os.path.dirname(outfile)
     rename_outfile = outfile.replace(".assigned.bam", "")
     statement = (
-        "featureCounts"
+        "featureCounts "
         "-p "
         f"-a {annotation} "
         f"-o {outdir}/counts.txt "
         "-T 2 "
-        "-R BAM"
+        "-R BAM "
         f"{infile} &&"
         "mv %(rename_outfile)s.bam.featureCounts.bam %(outfile)s"
     )