Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature(isoslam): Adds functions to extract features from reads #106

Merged
merged 5 commits into from
Dec 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions docs/source/workflow.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,16 @@ graph TD;

```

## IsoSLAM

A number of pre-processing steps are undertaken prior to IsoSLAM work being done. The following is work in progress as
the code is refactored.

1. Iterate over `.bam` file and pair segments. If two or more `AlignedSegments` with the same `query_name` are found
then `n > 1` segments are dropped.
2. Pairs of segments (individual `AlignedSegments`) are then assessed and if they are `Assigned` the `start`, `end`,
`length`, `status` (i.e. `Assigned`), `transcript_id`, `block_start` and `block_end` are extracted.

## Updating

The above diagram is written in [Mermaid][mermaid]. You can view the source code in the IsoSLAM repository and
Expand Down
117 changes: 115 additions & 2 deletions isoslam/isoslam.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
"""IsoSLAM module."""

from collections import defaultdict
from collections.abc import Generator
from pathlib import Path
from typing import Any

from loguru import logger
from pysam import AlignedSegment

from isoslam import io

Expand Down Expand Up @@ -53,8 +55,8 @@ def extract_strand_transcript(gtf_file: str | Path) -> tuple[defaultdict[Any, An
Returns
-------
tuple[dict[str, tuple[str]], dict[str, tuple[str]]]
Two dictionaries are returned, one of the ``strand`` the other of the ``transcript_id`` both using the ``gene_id`` as
the key.
Two dictionaries are returned, one of the ``strand`` the other of the ``transcript_id`` both using the
``gene_id`` as the key.
"""
strand = defaultdict(str)
transcript = defaultdict(list)
Expand All @@ -65,3 +67,114 @@ def extract_strand_transcript(gtf_file: str | Path) -> tuple[defaultdict[Any, An
transcript[entry.gene_id].append(entry.transcript_id)
logger.info(f"Extracted features from : {gtf_file}")
return (strand, transcript)


def extract_segment_pairs(bam_file: str | Path) -> Generator[AlignedSegment]:
"""
Extract pairs of AlignedSegments from a ``.bam`` file.

When there are two adjacent ``AlignedSegments`` with the same ``query_name`` only the first is paired, subsequent
segments are dropped.

Parameters
----------
bam_file : str | Path
Path to a ``.bam`` file.

Yields
------
Generator
Itterable of paired segments.
"""
previous_read: str | None = None
pair: list[AlignedSegment] = []
for read in io.load_file(bam_file):
# Return pairs of reads, i.e. not on first pass, nor if query_name matches the previous read
if previous_read is not None and previous_read != read.query_name:
yield pair
pair = []
previous_read = read.query_name
previous_read = read.query_name
pair.append(read)
# Don't forget to return the last pair!
yield pair


def extract_features_from_read(read: AlignedSegment) -> dict[str, int | str | None | tuple[int, int]]:
"""
Extract start, end and length from an aligned segment read.

Parameters
----------
read : AlignedSegment
An aligned segment read from ``pysam``.

Returns
-------
dict[str, Any]
Dictionary of ``start``, ``end`` and ``length`` of the segment.
"""
block_start, block_end = zip(*read.get_blocks())
try:
status = read.get_tag("XS")
except KeyError:
status = None
try:
transcript = read.get_tag("XT")
except KeyError:
transcript = None
return {
"start": read.reference_start,
"end": read.reference_end,
"length": read.reference_length,
"status": status,
"transcript": transcript,
"block_start": block_start,
"block_end": block_end,
}


def extract_features_from_pair(pair: list[AlignedSegment]) -> dict[str, dict[str, Any]]:
"""
Extract features from a pair of reads.

Parameters
----------
pair : list[AlignedSegment]
A list of two aligned segments from ``pysam``.

Returns
-------
dic[str, dict[str, Any]]
Returns a nested dictionaries of the ``start``, ``end`` and ``length`` of each read.
"""
return {
"read1": extract_features_from_read(pair[0]),
"read2": extract_features_from_read(pair[1]),
}


# def extract_utron(read: AlignedSegment, transcript_to_gene: Any, tag: str = "XS") -> list | None:
# """
# Extract and sum the utrons based on tag.

# ACTION : This function needs better documentation, my guess is that its extracting the transcripts to genes and
# then getting some related information (what I'm not sure) from the .bed file and adding these up.

# Parameters
# ----------
# read : AlignedSegement
# An aligned segment read.
# transcript_to_gene : TextIO
# Transcript to gene from a ``.bed`` file.
# tag : str
# Type of tag to extract.

# Returns
# -------
# list | None
# List of the length of assigned regions.
# """
# if read.get_tag(tag) == "Assigned":
# return sum(bed_file[transcript] for transcript in tx2gene[read.get_tag("XT")])
# return None
4 changes: 2 additions & 2 deletions isoslam/pipeline_slam_3UIs.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,12 @@ def feature_count_read_assignments(infile, outfile):
outdir = os.path.dirname(outfile)
rename_outfile = outfile.replace(".assigned.bam", "")
statement = (
"featureCounts"
"featureCounts "
"-p "
f"-a {annotation} "
f"-o {outdir}/counts.txt "
"-T 2 "
"-R BAM"
"-R BAM "
f"{infile} &&"
"mv %(rename_outfile)s.bam.featureCounts.bam %(outfile)s"
)
Expand Down
Loading
Loading