Skip to content

Commit

Permalink
feature(isoslam): Adds functions to extract features from reads
Browse files Browse the repository at this point in the history
Closes #88

Adds functions to extract features from a read both individual `pysam.AlignedSegment` and pairs of reads returned from
the `pysam.fetch()` iterator.

- Includes tests.
- Extracts
  - `start`
  - `end`
  - `length`
  - tag for `XS` if present, otherwise `None`
  - tag for `XT` if present, otherwise `None`
  - zips up the blocks and returns them as `block_start` and `block_end`

Tests use fixtures defined in `tests/conftest.py` and I had trouble trying to moock `pysam.AlignedSegment` as a number
of the attributes I wanted to mock were immutable. I therefore opted to load a `.bam` file and extract some individual
`AlignedSegments` as features and use these in the parameterised tests.

However, confusingly the value I used for `start` in the `fetch()` call did not match the first item returned by the
generator (this can be seen by the disconnect between the `start` parameters in the `fetch()` calls and the names of the
fixtures which reflect the values that are tested and used in the parameter names).

I'm not sure why this is, but one consequence of it is that I'm yet to have extracted any `AlignedSegment` where the
`get_tag("XS") == Aligned` and so in turn these are all `None`, similarly there are no `get_tag("XT")` (the transcript
id?).

Guidance on how to identify and extract these would be very welcome (I just stuck some `print()` statements into the
section of `all_introns_counts_and_info.py` where the `get_tag("XT") calls were made to show the `read1_start` at that
point, very crude as I'm still fumbling my way around these files and their specification!).
  • Loading branch information
ns-rse committed Dec 6, 2024
1 parent 34892d8 commit d518e53
Show file tree
Hide file tree
Showing 2 changed files with 196 additions and 2 deletions.
85 changes: 83 additions & 2 deletions isoslam/isoslam.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import Any

from loguru import logger
from pysam import AlignedSegment

from isoslam import io

Expand Down Expand Up @@ -53,8 +54,8 @@ def extract_strand_transcript(gtf_file: str | Path) -> tuple[defaultdict[Any, An
Returns
-------
tuple[dict[str, tuple[str]], dict[str, tuple[str]]]
Two dictionaries are returned, one of the ``strand`` the other of the ``transcript_id`` both using the ``gene_id`` as
the key.
Two dictionaries are returned, one of the ``strand`` the other of the ``transcript_id`` both using the
``gene_id`` as the key.
"""
strand = defaultdict(str)
transcript = defaultdict(list)
Expand All @@ -65,3 +66,83 @@ def extract_strand_transcript(gtf_file: str | Path) -> tuple[defaultdict[Any, An
transcript[entry.gene_id].append(entry.transcript_id)
logger.info(f"Extracted features from : {gtf_file}")
return (strand, transcript)


def extract_features_from_read(read: AlignedSegment) -> dict[str, int | str | None | tuple[int, int]]:
"""
Extract start, end and length from an aligned segment read.
Parameters
----------
read : AlignedSegment
An aligned segment read from ``pysam``.
Returns
-------
dict[str, Any]
Dictionary of ``start``, ``end`` and ``length`` of the segment.
"""
block_start, block_end = zip(*read.get_blocks())
try:
status = read.get_tag("XS")
except KeyError:
status = None
try:
transcript = read.get_tag("XT")
except KeyError:
transcript = None
return {
"start": read.reference_start,
"end": read.reference_end,
"length": read.reference_length,
"status": status,
"transcript": transcript,
"block_start": block_start,
"block_end": block_end,
}


def extract_features_from_pair(pair: list[AlignedSegment]) -> dict[str, dict[str, Any]]:
"""
Extract features from a pair of reads.
Parameters
----------
pair : list[AlignedSegment]
A list of two aligned segments from ``pysam``.
Returns
-------
dic[str, dict[str, Any]]
Returns a nested dictionaries of the ``start``, ``end`` and ``length`` of each read.
"""
return {
"read1": extract_features_from_read(pair[0]),
"read2": extract_features_from_read(pair[1]),
}


# def extract_utron(read: AlignedSegment, transcript_to_gene: Any, tag: str = "XS") -> list | None:
# """
# Extract and sum the utrons based on tag.

# ACTION : This function needs better documentation, my guess is that its extracting the transcripts to genes and
# then getting some related information (what I'm not sure) from the .bed file and adding these up.

# Parameters
# ----------
# read : AlignedSegement
# An aligned segment read.
# transcript_to_gene : TextIO
# Transcript to gene from a ``.bed`` file.
# tag : str
# Type of tag to extract.

# Returns
# -------
# list | None
# List of the length of assigned regions.
# """
# if read.get_tag(tag) == "Assigned":
# return sum(bed_file[transcript] for transcript in tx2gene[read.get_tag("XT")])
# return None
113 changes: 113 additions & 0 deletions tests/test_isoslam.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
BASE_DIR = Path.cwd()
RESOURCES = BASE_DIR / "tests" / "resources"

# pylint: disable=too-many-arguments
# pylint: disable=too-many-positional-arguments


@pytest.mark.parametrize(
("bed_file", "expected_transcript"),
Expand Down Expand Up @@ -60,3 +63,113 @@ def test_extract_strand_transcript(
strand, transcript = isoslam.extract_strand_transcript(gtf_file)
assert strand == expected_strand
assert transcript == expected_transcript


@pytest.mark.parametrize(

Check failure on line 68 in tests/test_isoslam.py

View workflow job for this annotation

GitHub Actions / Ex1 (3.10, ubuntu-latest)

test_extract_features_from_read_param[28584 - Assignment and Transcript are None]

Check failure on line 68 in tests/test_isoslam.py

View workflow job for this annotation

GitHub Actions / Ex1 (3.11, ubuntu-latest)

test_extract_features_from_read_param[28584 - Assignment and Transcript are None]

Check failure on line 68 in tests/test_isoslam.py

View workflow job for this annotation

GitHub Actions / Ex1 (3.12, ubuntu-latest)

test_extract_features_from_read_param[28584 - Assignment and Transcript are None]

Check failure on line 68 in tests/test_isoslam.py

View workflow job for this annotation

GitHub Actions / Ex1 (3.10, macos-latest)

test_extract_features_from_read_param[28584 - Assignment and Transcript are None]

Check failure on line 68 in tests/test_isoslam.py

View workflow job for this annotation

GitHub Actions / Ex1 (3.11, macos-latest)

test_extract_features_from_read_param[28584 - Assignment and Transcript are None]

Check failure on line 68 in tests/test_isoslam.py

View workflow job for this annotation

GitHub Actions / Ex1 (3.12, macos-latest)

test_extract_features_from_read_param[28584 - Assignment and Transcript are None]
("aligned_segment", "start", "end", "length", "status", "transcript", "block_start", "block_end"),
[
pytest.param( # type: ignore[misc]
"aligned_segment_28584",
28584,
28733,
149,
None,
None,
(28584, 28704),
(28704, 28733),
id="28584 - Assignment and Transcript are None",
),
pytest.param(
"aligned_segment_17416",
17416,
17805,
389,
"Assigned",
"Something",
(17416, 17718),
(17479, 17805),
id="17416 - Assignment and Transcript are None",
marks=pytest.mark.xfail(reasion="Not getting Assigned or Transcript tags for some reason?"),
),
pytest.param(
"aligned_segment_18029",
18029,
18385,
356,
"Assigned",
"something",
(18029, 18380),
(18174, 18385),
id="17416 - Assignment and Transcript are None",
marks=pytest.mark.xfail(reasion="Not getting Assigned tag for some reason?"),
),
],
)
def test_extract_features_from_read_param(
aligned_segment: str,
start: int,
end: int,
length: int,
status: str,
transcript: str,
block_start: tuple[int, int],
block_end: tuple[int, int],
request: pytest.FixtureRequest,
) -> None:
"""Test extract of features from an aligned segment read."""
segment = isoslam.extract_features_from_read(request.getfixturevalue(aligned_segment))
assert isinstance(segment, dict)
assert segment["start"] == start
assert segment["end"] == end
assert segment["length"] == length
assert segment["status"] == status
assert segment["transcript"] == transcript
assert segment["block_start"] == block_start
assert segment["block_end"] == block_end


@pytest.mark.parametrize(
("aligned_segment1", "aligned_segment2", "expected"),
[
pytest.param( # type: ignore[misc]
"aligned_segment_28584",
"aligned_segment_17416",
{
"read1": {
"start": 28584,
"end": 28733,
"length": 149,
"status": None,
"transcript": None,
"block_start": (28584, 28704),
"block_end": (28704, 28733),
},
"read2": {
"start": 17416,
"end": 17805,
"length": 389,
"status": None,
"transcript": None,
"block_start": (17416, 17718),
"block_end": (17479, 17805),
},
},
id="28584 and 17416 - Assignment and Transcript are None",
),
],
)
def test_extract_features_from_pair(
aligned_segment1: str,
aligned_segment2: str,
expected: dict[str, dict[str, int | None | str | tuple[int, int]]],
request: pytest.FixtureRequest,
) -> None:
"""Test extract of features from a list of pairs."""
read_pair = isoslam.extract_features_from_pair(
[
request.getfixturevalue(aligned_segment1),
request.getfixturevalue(aligned_segment2),
]
)
assert isinstance(read_pair, dict)
assert read_pair == expected

0 comments on commit d518e53

Please sign in to comment.