def extract_transcripts(bed_file: str | Path) -> dict[Any, list[tuple[Any, int, int, Any, Any]]]:
    """
    Extract features from `.bed` file and return as a dictionary indexed by transcript_id.

    Every feature line is split on tabs. The fourth column, with any ``_intron`` text removed, is used as the
    transcript identifier. Blank lines and ``#``, ``track`` or ``browser`` header lines are skipped because they do
    not describe a feature.

    Parameters
    ----------
    bed_file : str | Path
        Path, as string or pathlib Path, to a `.bed` file.

    Returns
    -------
    dict[Any, list[tuple[Any, int, int, Any, Any]]]
        Dictionary indexed by transcript_id. Each value is a list, in file order, of
        ``(chromosome, start, end, transcript_id, strand)`` tuples. The returned object is a
        ``collections.defaultdict``, so indexing it with an unknown key inserts an empty list.

    Raises
    ------
    IndexError
        If a feature line has fewer than six tab-separated columns.
    ValueError
        If the start or end column of a feature line is not an integer.
    """
    coordinates: defaultdict[Any, list[tuple[Any, int, int, Any, Any]]] = defaultdict(list)
    # NOTE(review): whether io.load_file() hands back an open handle that should be closed is not visible
    # from this function - confirm against isoslam.io.
    for line in io.load_file(bed_file):
        record = line.strip()
        # A blank line (for example a trailing newline at the end of the file) carries no feature.
        if not record:
            continue
        # Comment lines and "track"/"browser" header lines are metadata, not features.
        if record.startswith("#") or record.split(maxsplit=1)[0] in ("track", "browser"):
            continue
        contents = record.split("\t")
        transcript_id = contents[3].replace("_intron", "")
        coordinates[transcript_id].append(
            (
                contents[0],
                int(contents[1]),
                int(contents[2]),
                transcript_id,
                contents[5],
            )
        )
    return coordinates
@pytest.mark.parametrize(
    ("bed_file", "expected_transcript"),
    [
        pytest.param(  # type: ignore[misc]
            RESOURCES / "bed" / "test_coding_introns.bed",
            {
                # Every expected feature shares chromosome, transcript id and strand; only the
                # (start, end) coordinates differ between them.
                "ENST00000442898": [
                    ("9", start, end, "ENST00000442898", "-")
                    for start, end in (
                        (14940, 15080),
                        (15149, 15908),
                        (16061, 16717),
                        (16876, 16964),
                        (17166, 17343),
                        (17479, 17718),
                        (17855, 18027),
                        (18174, 18380),
                        (18492, 24850),
                        (25004, 29601),
                    )
                ]
            },
            id="bed coding introons",
        ),
    ],
)
def test_isoslam_extract_transcripts(
    bed_file: str | Path,
    expected_transcript: dict[Any, list[tuple[Any, int, int, Any, Any]]],
) -> None:
    """Check that extract_transcripts() returns the expected transcript features for a bed file."""
    extracted = isoslam.extract_transcripts(bed_file)
    assert extracted == expected_transcript