Skip to content

Commit

Permalink
feature(isoslam): Adds extract_segment_pair() function
Browse files Browse the repository at this point in the history
This function reads through a `.bam` file, which should already have been sorted and assigned, and produces pairs of
reads.
  • Loading branch information
ns-rse committed Dec 10, 2024
1 parent b8fec60 commit d1efad4
Show file tree
Hide file tree
Showing 4 changed files with 223 additions and 59 deletions.
10 changes: 10 additions & 0 deletions docs/source/workflow.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,16 @@ graph TD;
```

## IsoSLAM

A number of pre-processing steps are undertaken prior to IsoSLAM work being done. The following is work in progress as
the code is refactored.

1. Iterate over `.bam` file and pair segments. If two or more `AlignedSegments` with the same `query_name` are found
then `n > 1` segments are dropped.
2. Pairs of segments (individual `AlignedSegments`) are then assessed and if they are `Assigned` the `start`, `end`,
`length`, `status` (i.e. `Assigned`), `transcript_id`, `block_start` and `block_end` are extracted.

## Updating

The above diagram is written in [Mermaid][mermaid]. You can view the source code in the IsoSLAM repository and
Expand Down
32 changes: 32 additions & 0 deletions isoslam/isoslam.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""IsoSLAM module."""

from collections import defaultdict
from collections.abc import Generator
from pathlib import Path
from typing import Any

Expand Down Expand Up @@ -68,6 +69,37 @@ def extract_strand_transcript(gtf_file: str | Path) -> tuple[defaultdict[Any, An
return (strand, transcript)


def extract_segment_pairs(bam_file: str | Path) -> Generator[AlignedSegment]:
"""
Extract pairs of AlignedSegments from a ``.bam`` file.
When there are two adjacent ``AlignedSegments`` with the same ``query_name`` only the first is paired, subsequent
segments are dropped.
Parameters
----------
bam_file : str | Path
Path to a ``.bam`` file.
Yields
------
Generator
Itterable of paired segments.
"""
previous_read: str | None = None
pair: list[AlignedSegment] = []
for read in io.load_file(bam_file):
# Return pairs of reads, i.e. not on first pass, nor if query_name matches the previous read
if previous_read is not None and previous_read != read.query_name:
yield pair
pair = []
previous_read = read.query_name
previous_read = read.query_name
pair.append(read)
# Don't forget to return the last pair!
yield pair


def extract_features_from_read(read: AlignedSegment) -> dict[str, int | str | None | tuple[int, int]]:
"""
Extract start, end and length from an aligned segment read.
Expand Down
8 changes: 4 additions & 4 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path

import pytest
from pysam import AlignedSegment, AlignmentFile
from pysam import AlignmentFile

from isoslam import io

Expand All @@ -14,15 +14,15 @@
BED_DIR = RESOURCES / "bed"
VCF_DIR = RESOURCES / "vcf"
BAM_DIR = RESOURCES / "bam"

BAM_SORTED_ASSIGNED_DIR = BAM_DIR / "sorted_assigned"

# pylint: disable=redefined-outer-name


@pytest.fixture()
def bam_file() -> AlignmentFile:
def bam_file1() -> AlignmentFile:
"""Load a bam file."""
return io.load_file(BAM_DIR / "d0_no4sU_filtered_remapped_sorted.bam")
return io.load_file(BAM_SORTED_ASSIGNED_DIR / "d0_no4sU_filtered_remapped_sorted.sorted.assigned.bam")


@pytest.fixture()
Expand Down
232 changes: 177 additions & 55 deletions tests/test_isoslam.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Tests for the isoslam module."""

from pathlib import Path
from types import GeneratorType
from typing import Any

import pytest # type: ignore[import-not-found]
Expand All @@ -9,6 +10,11 @@

BASE_DIR = Path.cwd()
RESOURCES = BASE_DIR / "tests" / "resources"
BAM_DIR = RESOURCES / "bam"
BED_DIR = RESOURCES / "bed"
GTF_DIR = RESOURCES / "gtf"
VCF_DIR = RESOURCES / "vcf"
BAM_SORTED_ASSIGNED_DIR = BAM_DIR / "sorted_assigned"

# pylint: disable=too-many-arguments
# pylint: disable=too-many-positional-arguments
Expand All @@ -18,7 +24,7 @@
("bed_file", "expected_transcript"),
[
pytest.param( # type: ignore[misc]
RESOURCES / "bed" / "test_coding_introns.bed",
BED_DIR / "test_coding_introns.bed",
{
"ENST00000442898": [
("9", 14940, 15080, "ENST00000442898", "-"),
Expand Down Expand Up @@ -49,7 +55,7 @@ def test_isoslam_extract_transcripts(
("gtf_file", "expected_strand", "expected_transcript"),
[
pytest.param( # type: ignore[misc]
RESOURCES / "gtf" / "test_wash1.gtf",
GTF_DIR / "test_wash1.gtf",
{"MSTRG.63147": "-"},
{"MSTRG.63147": ["ENST00000442898"]},
id="gtf file as Path",
Expand All @@ -66,68 +72,28 @@ def test_extract_strand_transcript(


@pytest.mark.parametrize(
("aligned_segment", "start", "end", "length", "status", "transcript", "block_start", "block_end"),
("bam_file", "expected_length"),
[
pytest.param( # type: ignore[misc]
"aligned_segment_28584",
28584,
28733,
149,
None,
None,
(28584, 28704),
(28704, 28733),
id="28584 - Assignment and Transcript are None",
BAM_SORTED_ASSIGNED_DIR / "d0_no4sU_filtered_remapped_sorted.sorted.assigned.bam",
710,
id="file 1",
),
pytest.param(
"aligned_segment_17416",
17416,
17805,
389,
"Assigned",
"Something",
(17416, 17718),
(17479, 17805),
id="17416 - Assignment and Transcript are None",
marks=pytest.mark.xfail(reasion="Not getting Assigned or Transcript tags for some reason?"),
),
pytest.param(
"aligned_segment_18029",
18029,
18385,
356,
"Assigned",
"something",
(18029, 18380),
(18174, 18385),
id="17416 - Assignment and Transcript are None",
marks=pytest.mark.xfail(reasion="Not getting Assigned tag for some reason?"),
BAM_SORTED_ASSIGNED_DIR / "d0_0hr1_filtered_remapped_sorted.sorted.assigned.bam",
1845,
id="file 2",
),
],
)
def test_extract_features_from_read_param(
aligned_segment: str,
start: int,
end: int,
length: int,
status: str,
transcript: str,
block_start: tuple[int, int],
block_end: tuple[int, int],
request: pytest.FixtureRequest,
) -> None:
"""Test extract of features from an aligned segment read."""
segment = isoslam.extract_features_from_read(request.getfixturevalue(aligned_segment))
assert isinstance(segment, dict)
assert segment["start"] == start
assert segment["end"] == end
assert segment["length"] == length
assert segment["status"] == status
assert segment["transcript"] == transcript
assert segment["block_start"] == block_start
assert segment["block_end"] == block_end
def test_extract_segment_pairs(bam_file: str | Path, expected_length: int) -> None:
"""Test extraction of paired segments from a sorted and assigned ``.bam`` file."""
alignment_file = isoslam.extract_segment_pairs(bam_file)
assert isinstance(alignment_file, GeneratorType)
assert len(list(alignment_file)) == expected_length


<<<<<<< HEAD
<<<<<<< HEAD
@pytest.mark.parametrize(
("aligned_segment1", "aligned_segment2", "expected"),
Expand Down Expand Up @@ -424,3 +390,159 @@ def test_extract_features_from_pair(
assert isinstance(read_pair, dict)
assert read_pair == expected
>>>>>>> 3878e7a (fixup! feature(isoslam): Adds functions to extract features from reads)
||||||| parent of 3ee6aa7 (feature(isoslam): Adds extract_segment_pair() function)
@pytest.mark.parametrize(
("aligned_segment1", "aligned_segment2", "expected"),
[
pytest.param( # type: ignore[misc]
"aligned_segment_28584",
"aligned_segment_17416",
{
"read1": {
"start": 28584,
"end": 28733,
"length": 149,
"status": None,
"transcript": None,
"block_start": (28584, 28704),
"block_end": (28704, 28733),
},
"read2": {
"start": 17416,
"end": 17805,
"length": 389,
"status": None,
"transcript": None,
"block_start": (17416, 17718),
"block_end": (17479, 17805),
},
},
id="28584 and 17416 - Assignment and Transcript are None",
),
],
)
def test_extract_features_from_pair(
aligned_segment1: str,
aligned_segment2: str,
expected: dict[str, dict[str, int | None | str | tuple[int, int]]],
request: pytest.FixtureRequest,
) -> None:
"""Test extract of features from a list of pairs."""
read_pair = isoslam.extract_features_from_pair(
[
request.getfixturevalue(aligned_segment1),
request.getfixturevalue(aligned_segment2),
]
)
assert isinstance(read_pair, dict)
assert read_pair == expected
=======
# @pytest.mark.parametrize(
# ("aligned_segment", "start", "end", "length", "status", "transcript", "block_start", "block_end"),
# [
# pytest.param( # type: ignore[misc]
# "aligned_segment_28584",
# 28584,
# 28733,
# 149,
# None,
# None,
# (28584, 28704),
# (28704, 28733),
# id="28584 - Assignment and Transcript are None",
# ),
# pytest.param(
# "aligned_segment_17416",
# 17416,
# 17805,
# 389,
# "Assigned",
# "Something",
# (17416, 17718),
# (17479, 17805),
# id="17416 - Assignment and Transcript are None",
# marks=pytest.mark.xfail(reasion="Not getting Assigned or Transcript tags for some reason?"),
# ),
# pytest.param(
# "aligned_segment_18029",
# 18029,
# 18385,
# 356,
# "Assigned",
# "something",
# (18029, 18380),
# (18174, 18385),
# id="17416 - Assignment and Transcript are None",
# marks=pytest.mark.xfail(reasion="Not getting Assigned tag for some reason?"),
# ),
# ],
# )
# def test_extract_features_from_read_param(
# aligned_segment: str,
# start: int,
# end: int,
# length: int,
# status: str,
# transcript: str,
# block_start: tuple[int, int],
# block_end: tuple[int, int],
# request: pytest.FixtureRequest,
# ) -> None:
# """Test extract of features from an aligned segment read."""
# segment = isoslam.extract_features_from_read(request.getfixturevalue(aligned_segment))
# assert isinstance(segment, dict)
# assert segment["start"] == start
# assert segment["end"] == end
# assert segment["length"] == length
# assert segment["status"] == status
# assert segment["transcript"] == transcript
# assert segment["block_start"] == block_start
# assert segment["block_end"] == block_end


# @pytest.mark.parametrize(
# ("aligned_segment1", "aligned_segment2", "expected"),
# [
# pytest.param( # type: ignore[misc]
# "aligned_segment_28584",
# "aligned_segment_17416",
# {
# "read1": {
# "start": 28584,
# "end": 28733,
# "length": 149,
# "status": None,
# "transcript": None,
# "block_start": (28584, 28704),
# "block_end": (28704, 28733),
# },
# "read2": {
# "start": 17416,
# "end": 17805,
# "length": 389,
# "status": None,
# "transcript": None,
# jls_extract_var: (17416, 17718),
# "block_end": (17479, 17805),
# },
# },
# id="28584 and 17416 - Assignment and Transcript are None",
# ),
# ],
# )
# def test_extract_features_from_pair(
# aligned_segment1: str,
# aligned_segment2: str,
# expected: dict[str, dict[str, int | None | str | tuple[int, int]]],
# request: pytest.FixtureRequest,
# ) -> None:
# """Test extract of features from a list of pairs."""
# read_pair = isoslam.extract_features_from_pair(
# [
# request.getfixturevalue(aligned_segment1),
# request.getfixturevalue(aligned_segment2),
# ]
# )
# assert isinstance(read_pair, dict)
# assert read_pair == expected
>>>>>>> 3ee6aa7 (feature(isoslam): Adds extract_segment_pair() function)

0 comments on commit d1efad4

Please sign in to comment.