Skip to content

Commit

Permalink
feature(isoslam): Adds functions to extract features from reads
Browse files Browse the repository at this point in the history
Closes #88

Adds functions to extract features from a read both individual `pysam.AlignedSegment` and pairs of reads returned from
the `pysam.fetch()` iterator.

- Includes tests.
- Extracts
  - `start`
  - `end`
  - `length`
  - tag for `XS` if present, otherwise `None`
  - tag for `XT` if present, otherwise `None`
  - zips up the blocks and returns them as `block_start` and `block_end`

Tests use fixtures defined in `tests/conftest.py` and I had trouble trying to moock `pysam.AlignedSegment` as a number
of the attributes I wanted to mock were immutable. I therefore opted to load a `.bam` file and extract some individual
`AlignedSegments` as features and use these in the parameterised tests.

However, confusingly the value I used for `start` in the `fetch()` call did not match the first item returned by the
generator (this can be seen by the disconnect between the `start` parameters in the `fetch()` calls and the names of the
fixtures which reflect the values that are tested and used in the parameter names).

I'm not sure why this is, but one consequence of it is that I'm yet to have extracted any `AlignedSegment` where the
`get_tag("XS") == Aligned` and so in turn these are all `None`, similarly there are no `get_tag("XT")` (the transcript
id?).

Guidance on how to identify and extract these would be very welcome (I just stuck some `print()` statements into the
section of `all_introns_counts_and_info.py` where the `get_tag("XT") calls were made to show the `read1_start` at that
point, very crude as I'm still fumbling my way around these files and their specification!).
  • Loading branch information
ns-rse committed Dec 6, 2024
1 parent 34892d8 commit b97ba3c
Show file tree
Hide file tree
Showing 3 changed files with 241 additions and 2 deletions.
85 changes: 83 additions & 2 deletions isoslam/isoslam.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import Any

from loguru import logger
from pysam import AlignedSegment

from isoslam import io

Expand Down Expand Up @@ -53,8 +54,8 @@ def extract_strand_transcript(gtf_file: str | Path) -> tuple[defaultdict[Any, An
Returns
-------
tuple[dict[str, tuple[str]], dict[str, tuple[str]]]
Two dictionaries are returned, one of the ``strand`` the other of the ``transcript_id`` both using the ``gene_id`` as
the key.
Two dictionaries are returned, one of the ``strand`` the other of the ``transcript_id`` both using the
``gene_id`` as the key.
"""
strand = defaultdict(str)
transcript = defaultdict(list)
Expand All @@ -65,3 +66,83 @@ def extract_strand_transcript(gtf_file: str | Path) -> tuple[defaultdict[Any, An
transcript[entry.gene_id].append(entry.transcript_id)
logger.info(f"Extracted features from : {gtf_file}")
return (strand, transcript)


def extract_features_from_read(read: AlignedSegment) -> dict[str, int | str | None | tuple[int, int]]:
"""
Extract start, end and length from an aligned segment read.
Parameters
----------
read : AlignedSegment
An aligned segment read from ``pysam``.
Returns
-------
dict[str, Any]
Dictionary of ``start``, ``end`` and ``length`` of the segment.
"""
block_start, block_end = zip(*read.get_blocks())
try:
status = read.get_tag("XS")
except KeyError:
status = None
try:
transcript = read.get_tag("XT")
except KeyError:
transcript = None
return {
"start": read.reference_start,
"end": read.reference_end,
"length": read.reference_length,
"status": status,
"transcript": transcript,
"block_start": block_start,
"block_end": block_end,
}


def extract_features_from_pair(pair: list[AlignedSegment]) -> dict[str, dict[str, Any]]:
"""
Extract features from a pair of reads.
Parameters
----------
pair : list[AlignedSegment]
A list of two aligned segments from ``pysam``.
Returns
-------
dic[str, dict[str, Any]]
Returns a nested dictionaries of the ``start``, ``end`` and ``length`` of each read.
"""
return {
"read1": extract_features_from_read(pair[0]),
"read2": extract_features_from_read(pair[1]),
}


# def extract_utron(read: AlignedSegment, transcript_to_gene: Any, tag: str = "XS") -> list | None:
# """
# Extract and sum the utrons based on tag.

# ACTION : This function needs better documentation, my guess is that its extracting the transcripts to genes and
# then getting some related information (what I'm not sure) from the .bed file and adding these up.

# Parameters
# ----------
# read : AlignedSegement
# An aligned segment read.
# transcript_to_gene : TextIO
# Transcript to gene from a ``.bed`` file.
# tag : str
# Type of tag to extract.

# Returns
# -------
# list | None
# List of the length of assigned regions.
# """
# if read.get_tag(tag) == "Assigned":
# return sum(bed_file[transcript] for transcript in tx2gene[read.get_tag("XT")])
# return None
45 changes: 45 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""Fixtures for pytest."""

from pathlib import Path

import pytest
from pysam import AlignedSegment, AlignmentFile

from isoslam import io

BASE_DIR = Path.cwd()
TEST_DIR = BASE_DIR / "tests"
RESOURCES = TEST_DIR / "resources"
GTF_DIR = RESOURCES / "gtf"
BED_DIR = RESOURCES / "bed"
VCF_DIR = RESOURCES / "vcf"
BAM_DIR = RESOURCES / "bam"


# pylint: disable=redefined-outer-name


@pytest.fixture()
def bam_file() -> AlignmentFile:
"""Load a bam file."""
return io.load_file(BAM_DIR / "d0_no4sU_filtered_remapped_sorted.bam")


@pytest.fixture()
def aligned_segment_28584(bam_file: AlignmentFile) -> AlignedSegment:
"""Extract an individual AlignedSegment from a ``.bam`` file."""
return next(bam_file.fetch(contig="chr9", start=28592, end=28593))


@pytest.fixture()
def aligned_segment_17416(bam_file: AlignmentFile) -> AlignedSegment:
"""Extract an individual AlignedSegment from a ``.bam`` file."""
# @ns-rse : I have no idea why the generator doesn't return AlignedSegments that match the start/end here
return next(bam_file.fetch(contig="chr9", start=17804, end=18126))


@pytest.fixture()
def aligned_segment_18029(bam_file: AlignmentFile) -> AlignedSegment:
"""Extract an individual AlignedSegment from a ``.bam`` file."""
# @ns-rse : I have no idea why the generator doesn't return AlignedSegments that match the start/end here
return next(bam_file.fetch(contig="chr9", start=18156, end=24870))
113 changes: 113 additions & 0 deletions tests/test_isoslam.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
BASE_DIR = Path.cwd()
RESOURCES = BASE_DIR / "tests" / "resources"

# pylint: disable=too-many-arguments
# pylint: disable=too-many-positional-arguments


@pytest.mark.parametrize(
("bed_file", "expected_transcript"),
Expand Down Expand Up @@ -60,3 +63,113 @@ def test_extract_strand_transcript(
strand, transcript = isoslam.extract_strand_transcript(gtf_file)
assert strand == expected_strand
assert transcript == expected_transcript


@pytest.mark.parametrize(
("aligned_segment", "start", "end", "length", "status", "transcript", "block_start", "block_end"),
[
pytest.param( # type: ignore[misc]
"aligned_segment_28584",
28584,
28733,
149,
None,
None,
(28584, 28704),
(28704, 28733),
id="28584 - Assignment and Transcript are None",
),
pytest.param(
"aligned_segment_17416",
17416,
17805,
389,
"Assigned",
"Something",
(17416, 17718),
(17479, 17805),
id="17416 - Assignment and Transcript are None",
marks=pytest.mark.xfail(reasion="Not getting Assigned or Transcript tags for some reason?"),
),
pytest.param(
"aligned_segment_18029",
18029,
18385,
356,
"Assigned",
"something",
(18029, 18380),
(18174, 18385),
id="17416 - Assignment and Transcript are None",
marks=pytest.mark.xfail(reasion="Not getting Assigned tag for some reason?"),
),
],
)
def test_extract_features_from_read_param(
aligned_segment: str,
start: int,
end: int,
length: int,
status: str,
transcript: str,
block_start: tuple[int, int],
block_end: tuple[int, int],
request: pytest.FixtureRequest,
) -> None:
"""Test extract of features from an aligned segment read."""
segment = isoslam.extract_features_from_read(request.getfixturevalue(aligned_segment))

Check failure on line 120 in tests/test_isoslam.py

View workflow job for this annotation

GitHub Actions / Ex1 (3.10, ubuntu-latest)

test_extract_features_from_read_param[28584 - Assignment and Transcript are None] ValueError: fetch called on bamfile without index

Check failure on line 120 in tests/test_isoslam.py

View workflow job for this annotation

GitHub Actions / Ex1 (3.11, ubuntu-latest)

test_extract_features_from_read_param[28584 - Assignment and Transcript are None] ValueError: fetch called on bamfile without index

Check failure on line 120 in tests/test_isoslam.py

View workflow job for this annotation

GitHub Actions / Ex1 (3.12, ubuntu-latest)

test_extract_features_from_read_param[28584 - Assignment and Transcript are None] ValueError: fetch called on bamfile without index

Check failure on line 120 in tests/test_isoslam.py

View workflow job for this annotation

GitHub Actions / Ex1 (3.10, macos-latest)

test_extract_features_from_read_param[28584 - Assignment and Transcript are None] ValueError: fetch called on bamfile without index

Check failure on line 120 in tests/test_isoslam.py

View workflow job for this annotation

GitHub Actions / Ex1 (3.11, macos-latest)

test_extract_features_from_read_param[28584 - Assignment and Transcript are None] ValueError: fetch called on bamfile without index

Check failure on line 120 in tests/test_isoslam.py

View workflow job for this annotation

GitHub Actions / Ex1 (3.12, macos-latest)

test_extract_features_from_read_param[28584 - Assignment and Transcript are None] ValueError: fetch called on bamfile without index
assert isinstance(segment, dict)
assert segment["start"] == start
assert segment["end"] == end
assert segment["length"] == length
assert segment["status"] == status
assert segment["transcript"] == transcript
assert segment["block_start"] == block_start
assert segment["block_end"] == block_end


@pytest.mark.parametrize(
("aligned_segment1", "aligned_segment2", "expected"),
[
pytest.param( # type: ignore[misc]
"aligned_segment_28584",
"aligned_segment_17416",
{
"read1": {
"start": 28584,
"end": 28733,
"length": 149,
"status": None,
"transcript": None,
"block_start": (28584, 28704),
"block_end": (28704, 28733),
},
"read2": {
"start": 17416,
"end": 17805,
"length": 389,
"status": None,
"transcript": None,
"block_start": (17416, 17718),
"block_end": (17479, 17805),
},
},
id="28584 and 17416 - Assignment and Transcript are None",
),
],
)
def test_extract_features_from_pair(
aligned_segment1: str,
aligned_segment2: str,
expected: dict[str, dict[str, int | None | str | tuple[int, int]]],
request: pytest.FixtureRequest,
) -> None:
"""Test extract of features from a list of pairs."""
read_pair = isoslam.extract_features_from_pair(
[
request.getfixturevalue(aligned_segment1),
request.getfixturevalue(aligned_segment2),
]
)
assert isinstance(read_pair, dict)
assert read_pair == expected

0 comments on commit b97ba3c

Please sign in to comment.