aertslab · LukasMahieu · Dec 10, 2024 · Nov 30, 2024 · Nov 30, 2024 · Nov 30, 2024
diff --git a/docs/api/datasets.md b/docs/api/datasets.md
@@ -12,4 +12,6 @@ Downloading of use case datasets which ar explored in the example analyses.
 
     get_dataset
     get_motif_db
+    Genome
+    register_genome
 ```
diff --git a/docs/references.bib b/docs/references.bib
@@ -1,12 +1,3 @@
-@article{hu2023single,
-	title={Single-cell multi-scale footprinting reveals the modular organization of DNA regulatory elements},
-	author={Hu, Yan and Ma, Sai and Kartha, Vinay K and Duarte, Fabiana M and Horlbeck, Max and Zhang, Ruochi and Shrestha, Rojesh and Labade, Ajay and Kletzien, Heidi and Meliki, Alia and others},
-	journal={bioRxiv},
-	pages={2023--03},
-	year={2023},
-	publisher={Cold Spring Harbor Laboratory}
-}
-
 @misc{shrikumar2021tfmodisco,
   author       = {Av Shrikumar and Katherine Tian and annashcherbina and Žiga Avsec and Amr and Charles McAnany and pgreenside and Surag Nair and mhfzsharmin and Stefan Holderbach and Rosa Ma},
   title        = {{kundajelab/tfmodisco: Nicer API for density-adaptive hit scoring (v0.5.14.1)}},
@@ -26,3 +17,16 @@ @article{Virshup_2023
 	title = {The scverse project provides a computational ecosystem for single-cell omics data analysis},
 	journal = {Nature Biotechnology}
 }
+
+@article{Zhang_2024,
+    doi = {10.1038/s41592-023-02139-9},
+    url = {https://doi.org/10.1038/s41592-023-02139-9},
+    year = 2024,
+    month = {mar},
+    publisher = {Springer Nature},
+    author = {Kai Zhang and Nicholas R. Zemke and Evan J. Armand and others},
+    title = {A fast, scalable and versatile tool for analysis of single-cell omics data},
+    journal = {Nature Methods},
+    volume = {21},
+    pages = {217--227}
+}
diff --git a/docs/tutorials/model_training_and_eval.ipynb b/docs/tutorials/model_training_and_eval.ipynb
diff --git a/src/crested/__init__.py b/src/crested/__init__.py
@@ -24,6 +24,7 @@
 
 from . import pl, pp, tl, utils
 from ._datasets import get_dataset, get_motif_db
+from ._genome import Genome, register_genome
 from ._io import import_beds, import_bigwigs
 
 __all__ = [
@@ -35,6 +36,8 @@
     "import_bigwigs",
     "get_dataset",
     "get_motif_db",
+    "Genome",
+    "register_genome",
 ]
 
 __version__ = version("crested")

diff --git a/src/crested/_conf.py b/src/crested/_conf.py
@@ -0,0 +1,3 @@
+"""Persistent configuration for the crested package."""
+
+genome = None  # persistent genome object
diff --git a/src/crested/_genome.py b/src/crested/_genome.py
@@ -0,0 +1,238 @@
+"""Genome object for storing information about a genome and genome registry."""
+
+from __future__ import annotations
+
+import errno
+import os
+from pathlib import Path
+
+from loguru import logger
+from pysam import FastaFile
+
+import crested._conf as conf
+from crested.utils._seq_utils import reverse_complement
+
+
+class Genome:
+    """
+    A class that encapsulates information about a genome, including its FASTA sequence, its annotation, and chromosome sizes.
+
+    Adapted from https://github.com/kaizhang/SnapATAC2/blob/main/snapatac2-python/python/snapatac2/genome.py.
+
+    Parameters
+    ----------
+    fasta
+        The path to the FASTA file.
+    chrom_sizes
+        A path to a tab delimited chromsizes file or a dictionary containing chromosome names and sizes.
+        If not provided, the chromosome sizes will be inferred from the FASTA file.
+    annotation
+        The path to the annotation file.
+    name
+        Optional name of the genome.
+
+    Examples
+    --------
+    >>> genome = Genome(
+    ...     fasta="tests/data/test.fa",
+    ...     chrom_sizes="tests/data/test.chrom.sizes",
+    ... )
+    >>> print(genome.fasta)
+    <pysam.libcfaidx.FastaFile at 0x7f4d8b4a8f40>
+    >>> print(genome.chrom_sizes)
+    {'chr1': 1000, 'chr2': 2000}
+    >>> print(genome.name)
+    test
+    """
+
+    def __init__(
+        self,
+        fasta: Path,
+        chrom_sizes: dict[str, int] | Path | None = None,
+        annotation: Path | None = None,
+        name: str | None = None,
+    ):
+        """Initialize the Genome object."""
+        if isinstance(fasta, (Path, str)):
+            fasta = Path(fasta)
+            if not os.path.exists(fasta):
+                raise FileNotFoundError(
+                    errno.ENOENT, os.strerror(errno.ENOENT), str(fasta)
+                )
+            self._fasta = fasta
+        else:
+            raise ValueError("fasta must be a Path.")
+
+        if isinstance(chrom_sizes, (Path, str)):
+            chrom_sizes = Path(chrom_sizes)
+            if not os.path.exists(chrom_sizes):
+                raise FileNotFoundError(
+                    errno.ENOENT, os.strerror(errno.ENOENT), str(chrom_sizes)
+                )
+            self._chrom_sizes = chrom_sizes
+        else:
+            self._chrom_sizes = chrom_sizes
+
+        if isinstance(annotation, (Path, str)):
+            annotation = Path(annotation)
+            if not os.path.exists(annotation):
+                raise FileNotFoundError(
+                    errno.ENOENT, os.strerror(errno.ENOENT), str(annotation)
+                )
+            self._annotation = annotation
+        else:
+            self._annotation = None
+
+        self._name = name
+
+    @property
+    def fasta(self) -> FastaFile:
+        """
+        The pysam FastaFile object for the FASTA file.
+
+        Returns
+        -------
+        The pysam FastaFile object.
+        """
+        return FastaFile(self._fasta)
+
+    @property
+    def annotation(self) -> Path | None:
+        """
+        The Path to the annotation file.
+
+        Currently not used in the package.
+
+        Returns
+        -------
+        The path to the annotation file.
+        """
+        return self._annotation
+
+    @property
+    def chrom_sizes(self) -> dict[str, int]:
+        """
+        A dictionary with chromosome names as keys and their lengths as values.
+
+        Returns
+        -------
+        A dictionary of chromosome sizes.
+        """
+        if self._chrom_sizes is None:
+            self._chrom_sizes = dict(zip(self.fasta.references, self.fasta.lengths))
+        elif isinstance(self._chrom_sizes, Path):
+            from crested._io import _read_chromsizes
+
+            self._chrom_sizes = _read_chromsizes(self._chrom_sizes)
+        elif not isinstance(self._chrom_sizes, dict):
+            raise ValueError("chrom_sizes must be a dictionary or a Path.")
+        return self._chrom_sizes
+
+    @property
+    def name(self) -> str:
+        """
+        The name of the genome.
+
+        Returns
+        -------
+        The name of the genome.
+        """
+        if self._name is None:
+            filename = self.fasta.filename.decode("utf-8")
+            basename = os.path.basename(filename)
+
+            if basename.endswith(".fa") or basename.endswith(".fasta"):
+                return os.path.splitext(basename)[0]  # Remove the extension and return
+            else:
+                return basename
+        return self._name
+
+    def fetch(self, chrom=None, start=None, end=None, strand = "+", region = None) -> str:
+        """
+        Fetch a sequence from a genomic region.
+
+        Start and end denote 0-based, half-open intervals, following the bed convention.
+
+        Parameters
+        ----------
+        chrom
+            The chromosome of the region to extract.
+        start
+            The start of the region to extract. Assumes 0-indexed positions.
+        end
+            The end of the region to extract, exclusive.
+        strand
+            The strand of the region. If '-', the sequence is reverse-complemented. Default is "+".
+        region
+            Alternatively, a region string to parse. If supplied together with chrom/start/end, explicit coordinates take priority.
+
+        Returns
+        -------
+        The requested sequence, as a string.
+        """
+        if region and (chrom or start or end):
+            logger.warning("Both region and chrom/start/end supplied. Using chrom/start/end...")
+        elif region:
+            if region[-2] == ":":
+                chrom, start_end, strand = region.split(":")
+            else:
+                chrom, start_end = region.split(":")
+            start, end = map(int, start_end.split("-"))
+
+        if not (chrom and start and end):
+            raise ValueError("chrom/start/end must all be supplied to extract a sequence.")
+
+        seq = self.fasta.fetch(reference=chrom, start=start, end=end)
+        if strand == "-":
+            return reverse_complement(seq)
+        else:
+            return seq
+
+def register_genome(genome: Genome):
+    """
+    Register a genome to be used throughout a session.
+
+    Once a genome is registered, all the functions in the package that require a genome will use it if not explicitly provided.
+
+    Parameters
+    ----------
+    genome
+        The Genome object to register.
+
+    Examples
+    --------
+    >>> genome = Genome(
+    ...     fasta="tests/data/hg38.fa",
+    ...     chrom_sizes="tests/data/test.chrom.sizes",
+    ... )
+    >>> register_genome(genome)
+    INFO Genome hg38 registered.
+    """
+    if not isinstance(genome, Genome):
+        raise TypeError("genome must be an instance of crested.Genome")
+    conf.genome = genome
+    logger.info(f"Genome {genome.name} registered.")
+
+
+def _resolve_genome(
+    genome: os.PathLike | Genome | None,
+    chromsizes_file: os.PathLike | None = None,
+    annotation: os.PathLike | None = None,
+) -> Genome:
+    """Resolve the input to a Genome object. Required to keep backwards compatibility with fasta and chromsizes paths as inputs."""
+    if isinstance(genome, Genome):
+        return genome
+    if genome is not None:
+        genome_path = Path(genome)
+        if not genome_path.exists():
+            raise FileNotFoundError(
+                errno.ENOENT, os.strerror(errno.ENOENT), str(genome_path)
+            )
+        return Genome(
+            fasta=genome_path, chrom_sizes=chromsizes_file, annotation=annotation
+        )
+    else:
+        if conf.genome is not None:
+            return conf.genome
+        else:
+            raise ValueError("No genome provided or registered.")
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		"""Persistent configuration for the crested package."""

		genome = None # persistent genome object