Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feature: extract transcripts #97

Merged
merged 1 commit into from
Nov 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions isoslam/isoslam.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""IsoSLAM module."""

from collections import defaultdict
from pathlib import Path
from typing import Any

from isoslam import io


def extract_transcripts(bed_file: str | Path) -> dict[Any, list[tuple[Any, int, int, Any, Any]]]:
    """
    Extract features from `.bed` file and return as a dictionary indexed by transcript_id.

    Blank lines and BED header lines (those starting with ``#``, or whose first word is ``track`` or ``browser``) are
    skipped. Every other line is split on tabs and must provide at least six columns.

    Parameters
    ----------
    bed_file : str | Path
        Path, as string or pathlib Path, to a `.bed` file.

    Returns
    -------
    dict[Any, list[tuple[Any, int, int, Any, Any]]]
        Dictionary indexed by transcript_id (the fourth column with any ``_intron`` text removed). Each value is a
        list holding one ``(chromosome, start, end, transcript_id, strand)`` tuple per feature line, in the order the
        lines were read. The object returned is a ``defaultdict(list)``.

    Raises
    ------
    IndexError
        If a feature line has fewer than six tab-separated columns.
    ValueError
        If the start or end column of a feature line is not an integer.
    """
    coordinates: defaultdict[Any, list[tuple[Any, int, int, Any, Any]]] = defaultdict(list)
    for line in io.load_file(bed_file):
        record = line.strip()
        # BED files may contain blank lines (commonly a trailing one) and header lines; neither describes a feature
        # so they are ignored rather than being allowed to fail the column lookups below.
        if not record or record.startswith("#") or record.split(maxsplit=1)[0] in ("track", "browser"):
            continue
        contents = record.split("\t")
        transcript_id = contents[3].replace("_intron", "")
        coordinates[transcript_id].append(
            (
                contents[0],
                int(contents[1]),
                int(contents[2]),
                transcript_id,
                contents[5],
            )
        )
    return coordinates
42 changes: 42 additions & 0 deletions tests/test_isoslam.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""Tests for the isoslam module."""

from pathlib import Path
from typing import Any

import pytest # type: ignore[import-not-found]

from isoslam import isoslam

# Anchor on this file's location rather than the current working directory so the tests find their resources
# regardless of where pytest is invoked from. This file lives in ``tests/``, so its grandparent is the repository root.
BASE_DIR = Path(__file__).resolve().parent.parent
RESOURCES = BASE_DIR / "tests" / "resources"


@pytest.mark.parametrize(
    ("bed_file", "expected_transcript"),
    [
        pytest.param(  # type: ignore[misc]
            RESOURCES / "bed" / "test_coding_introns.bed",
            {
                # Every expected feature shares chromosome, transcript and strand; only the coordinates differ.
                "ENST00000442898": [
                    ("9", start, end, "ENST00000442898", "-")
                    for start, end in (
                        (14940, 15080),
                        (15149, 15908),
                        (16061, 16717),
                        (16876, 16964),
                        (17166, 17343),
                        (17479, 17718),
                        (17855, 18027),
                        (18174, 18380),
                        (18492, 24850),
                        (25004, 29601),
                    )
                ]
            },
            id="bed coding introons",
        ),
    ],
)
def test_isoslam_extract_transcripts(
    bed_file: str | Path,
    expected_transcript: dict[Any, list[tuple[Any, int, int, Any, Any]]],
) -> None:
    """Check that extract_transcripts() returns the expected transcript data for a bed file."""
    extracted = isoslam.extract_transcripts(bed_file)
    assert extracted == expected_transcript
Loading