Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Genome class to handle fasta files and chromsizes throughout package #76

Merged
merged 24 commits into from
Dec 10, 2024
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
2b5de26
genome object and registration for easier working with genomes
LukasMahieu Nov 30, 2024
1534b66
genome object and registration for easier working with genomes
LukasMahieu Nov 30, 2024
579e918
check for genome object if chromsizes not provided in inputs
LukasMahieu Nov 30, 2024
d3229fe
unit tests and handling of path chromsizes
LukasMahieu Dec 1, 2024
7f3a5b8
more explicit docstring
LukasMahieu Dec 2, 2024
be2cd39
remove old debug print
LukasMahieu Dec 2, 2024
2d8edf2
check if fasta files exist on init of Genome
LukasMahieu Dec 2, 2024
b96da5d
remove incorrect info log about dir creating
LukasMahieu Dec 2, 2024
f5ce2d5
examples in docstrings and _resolve_genome for backward compatibility
LukasMahieu Dec 2, 2024
0e29e3a
update funcs to accept genome object
LukasMahieu Dec 2, 2024
3862d16
genome and register_genome doc
LukasMahieu Dec 2, 2024
83a6bf1
bugfix - unused dense bias and output activation
LukasMahieu Dec 2, 2024
30fe270
update pipeline test to use genome obj
LukasMahieu Dec 2, 2024
edab6dd
make fasta attribute into FastaFile instead of simply the path and ad…
LukasMahieu Dec 2, 2024
61c080a
correctly init the keras backend in the unit tests
LukasMahieu Dec 2, 2024
192c882
Add fetch() to genome object
casblaauw Dec 3, 2024
2a22aea
Resolve circular dependencies, add missing arg check
Dec 3, 2024
307282d
Add more tests
Dec 3, 2024
4ddeeb9
Fix test bug and some ruff stuff
Dec 3, 2024
8be54c5
Fix more test bugs
Dec 3, 2024
37fd939
Add automatic chromsizes to change_regions_width
Dec 6, 2024
164cca4
Update tutorial with Genome class
Dec 6, 2024
065a984
Fix anndata reference in text
Dec 6, 2024
01a46e7
explicit genome in all docs
LukasMahieu Dec 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/api/datasets.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,6 @@ Downloading of use case datasets which ar explored in the example analyses.

get_dataset
get_motif_db
Genome
register_genome
```
22 changes: 13 additions & 9 deletions docs/references.bib
Original file line number Diff line number Diff line change
@@ -1,12 +1,3 @@
@article{hu2023single,
title={Single-cell multi-scale footprinting reveals the modular organization of DNA regulatory elements},
author={Hu, Yan and Ma, Sai and Kartha, Vinay K and Duarte, Fabiana M and Horlbeck, Max and Zhang, Ruochi and Shrestha, Rojesh and Labade, Ajay and Kletzien, Heidi and Meliki, Alia and others},
journal={bioRxiv},
pages={2023--03},
year={2023},
publisher={Cold Spring Harbor Laboratory}
}

@misc{shrikumar2021tfmodisco,
author = {Av Shrikumar and Katherine Tian and annashcherbina and Žiga Avsec and Amr and Charles McAnany and pgreenside and Surag Nair and mhfzsharmin and Stefan Holderbach and Rosa Ma},
title = {{kundajelab/tfmodisco: Nicer API for density-adaptive hit scoring (v0.5.14.1)}},
Expand All @@ -26,3 +17,16 @@ @article{Virshup_2023
title = {The scverse project provides a computational ecosystem for single-cell omics data analysis},
journal = {Nature Biotechnology}
}

@article{Zhang_2024,
doi = {10.1038/s41592-023-02139-9},
url = {https://doi.org/10.1038/s41592-023-02139-9},
year = 2024,
month = {mar},
publisher = {Springer Nature},
author = {Kai Zhang and Nicholas R. Zemke and Evan J. Armand and others},
title = {A fast, scalable and versatile tool for analysis of single-cell omics data},
journal = {Nature Methods},
volume = {21},
pages = {217--227}
}
1,235 changes: 529 additions & 706 deletions docs/tutorials/model_training_and_eval.ipynb

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions src/crested/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

from . import pl, pp, tl, utils
from ._datasets import get_dataset, get_motif_db
from ._genome import Genome, register_genome
from ._io import import_beds, import_bigwigs

__all__ = [
Expand All @@ -35,6 +36,8 @@
"import_bigwigs",
"get_dataset",
"get_motif_db",
"Genome",
"register_genome",
]

__version__ = version("crested")
Expand Down
3 changes: 3 additions & 0 deletions src/crested/_conf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""Persistent configuration for the crested package."""

genome = None # persistent genome object
238 changes: 238 additions & 0 deletions src/crested/_genome.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
"""Genome object for storing information about a genome and genome registry."""

from __future__ import annotations

import errno
import os
from pathlib import Path

from loguru import logger
from pysam import FastaFile

import crested._conf as conf
from crested.utils._seq_utils import reverse_complement


class Genome:
"""
A class that encapsulates information about a genome, including its FASTA sequence, its annotation, and chromosome sizes.

Adapted from https://github.com/kaizhang/SnapATAC2/blob/main/snapatac2-python/python/snapatac2/genome.py.

Parameters
----------
fasta
The path to the FASTA file.
chrom_sizes
A path to a tab delimited chromsizes file or a dictionary containing chromosome names and sizes.
If not provided, the chromosome sizes will be inferred from the FASTA file.
annotation
The path to the annotation file.
name
Optional name of the genome.

Examples
--------
>>> genome = Genome(
... fasta="tests/data/test.fa",
... chrom_sizes="tests/data/test.chrom.sizes",
... )
>>> print(genome.fasta)
<pysam.libcfaidx.FastaFile at 0x7f4d8b4a8f40>
>>> print(genome.chrom_sizes)
{'chr1': 1000, 'chr2': 2000}
>>> print(genome.name)
test
"""

def __init__(
self,
fasta: Path,
chrom_sizes: dict[str, int] | Path | None = None,
annotation: Path | None = None,
name: str | None = None,
):
"""Initialize the Genome object."""
if isinstance(fasta, (Path, str)):
fasta = Path(fasta)
if not os.path.exists(fasta):
raise FileNotFoundError(
errno.ENOENT, os.strerror(errno.ENOENT), str(fasta)
)
self._fasta = fasta
else:
raise ValueError("fasta must be a Path.")

if isinstance(chrom_sizes, (Path, str)):
chrom_sizes = Path(chrom_sizes)
if not os.path.exists(chrom_sizes):
raise FileNotFoundError(
errno.ENOENT, os.strerror(errno.ENOENT), str(chrom_sizes)
)
self._chrom_sizes = chrom_sizes
else:
self._chrom_sizes = chrom_sizes

if isinstance(annotation, (Path, str)):
annotation = Path(annotation)
if not os.path.exists(annotation):
raise FileNotFoundError(
errno.ENOENT, os.strerror(errno.ENOENT), str(annotation)
)
self._annotation = annotation
else:
self._annotation = None

self._name = name

@property
def fasta(self) -> FastaFile:
"""
The pysam FastaFile object for the FASTA file.

Returns
-------
The pysam FastaFile object.
"""
return FastaFile(self._fasta)

@property
def annotation(self) -> Path | None:
"""
The Path to the annotation file.

Currently not used in the package.

Returns
-------
The path to the annotation file.
"""
return self._annotation

@property
def chrom_sizes(self) -> dict[str, int]:
"""
A dictionary with chromosome names as keys and their lengths as values.

Returns
-------
A dictionary of chromosome sizes.
"""
if self._chrom_sizes is None:
self._chrom_sizes = dict(zip(self.fasta.references, self.fasta.lengths))
elif isinstance(self._chrom_sizes, Path):
from crested._io import _read_chromsizes

self._chrom_sizes = _read_chromsizes(self._chrom_sizes)
elif not isinstance(self._chrom_sizes, dict):
raise ValueError("chrom_sizes must be a dictionary or a Path.")
return self._chrom_sizes

@property
def name(self) -> str:
"""
The name of the genome.

Returns
-------
The name of the genome.
"""
if self._name is None:
filename = self.fasta.filename.decode("utf-8")
basename = os.path.basename(filename)

if basename.endswith(".fa") or basename.endswith(".fasta"):
return os.path.splitext(basename)[0] # Remove the extension and return
else:
return basename
return self._name

def fetch(self, chrom=None, start=None, end=None, strand = "+", region = None) -> str:
"""
Fetch a sequence from a genomic region.

Start and end denote 0-based, half-open intervals, following the bed convention.

Parameters
----------
chrom
The chromosome of the region to extract.
start
The start of the region to extract. Assumes 0-indexed positions.
end
The end of the region to extract, exclusive.
strand
The strand of the region. If '-', the sequence is reverse-complemented. Default is "+".
region
Alternatively, a region string to parse. If supplied together with chrom/start/end, explicit coordinates take priority.

Returns
-------
The requested sequence, as a string.
"""
if region and (chrom or start or end):
logger.warning("Both region and chrom/start/end supplied. Using chrom/start/end...")
elif region:
if region[-2] == ":":
chrom, start_end, strand = region.split(":")
else:
chrom, start_end = region.split(":")
start, end = map(int, start_end.split("-"))

if not (chrom and start and end):
raise ValueError("chrom/start/end must all be supplied to extract a sequence.")

seq = self.fasta.fetch(reference=chrom, start=start, end=end)
if strand == "-":
return reverse_complement(seq)
else:
return seq

def register_genome(genome: Genome):
"""
Register a genome to be used throughout a session.

Once a genome is registered, all the functions in the package that require a genome will use it if not explicitly provided.

Parameters
----------
genome
The Genome object to register.

Examples
--------
>>> genome = Genome(
... fasta="tests/data/hg38.fa",
... chrom_sizes="tests/data/test.chrom.sizes",
... )
>>> register_genome(genome)
INFO Genome hg38 registered.
"""
if not isinstance(genome, Genome):
raise TypeError("genome must be an instance of crested.Genome")
conf.genome = genome
logger.info(f"Genome {genome.name} registered.")


def _resolve_genome(
genome: os.PathLike | Genome | None,
chromsizes_file: os.PathLike | None = None,
annotation: os.PathLike | None = None,
) -> Genome:
"""Resolve the input to a Genome object. Required to keep backwards compatibility with fasta and chromsizes paths as inputs."""
if isinstance(genome, Genome):
return genome
if genome is not None:
genome_path = Path(genome)
if not genome_path.exists():
raise FileNotFoundError(
errno.ENOENT, os.strerror(errno.ENOENT), str(genome_path)
)
return Genome(
fasta=genome_path, chrom_sizes=chromsizes_file, annotation=annotation
)
else:
if conf.genome is not None:
return conf.genome
else:
raise ValueError("No genome provided or registered.")
Loading
Loading