diff --git a/docs/api/utils.md b/docs/api/utils.md index 79d3d1ec..886757ac 100644 --- a/docs/api/utils.md +++ b/docs/api/utils.md @@ -14,6 +14,7 @@ CREsted provides a few utility function to help with sequence encoding, function read_bigwig_region hot_encoding_to_sequence one_hot_encode_sequence + fetch_sequences permute_model setup_logging ``` diff --git a/docs/conf.py b/docs/conf.py index c9cfc210..64f3bdbe 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,4 +1,4 @@ -# Configuration file for the Sphinx documentation builder. +"""Configuration file for the Sphinx documentation builder.""" # # This file only contains a selection of the most common options. For a full # list see the documentation: @@ -64,6 +64,7 @@ autosummary_generate = True autodoc_member_order = "groupwise" +bibtex_reference_style = "author_year" default_role = "literal" napoleon_google_docstring = False napoleon_numpy_docstring = True diff --git a/docs/references.bib b/docs/references.bib index added383..96a303a7 100644 --- a/docs/references.bib +++ b/docs/references.bib @@ -1,12 +1,10 @@ -@article{Virshup_2023, - doi = {10.1038/s41587-023-01733-8}, - url = {https://doi.org/10.1038%2Fs41587-023-01733-8}, - year = 2023, - month = {apr}, - publisher = {Springer Science and Business Media {LLC}}, - author = {Isaac Virshup and Danila Bredikhin and Lukas Heumos and Giovanni Palla and Gregor Sturm and Adam Gayoso and Ilia Kats and Mikaela Koutrouli and Philipp Angerer and Volker Bergen and Pierre Boyeau and Maren Büttner and Gokcen Eraslan and David Fischer and Max Frank and Justin Hong and Michal Klein and Marius Lange and Romain Lopez and Mohammad Lotfollahi and Malte D. Luecken and Fidel Ramirez and Jeffrey Regier and Sergei Rybakov and Anna C. Schaar and Valeh Valiollah Pour Amiri and Philipp Weiler and Galen Xing and Bonnie Berger and Dana Pe'er and Aviv Regev and Sarah A. Teichmann and Francesca Finotello and F. Alexander Wolf and Nir Yosef and Oliver Stegle and Fabian J. 
Theis}, - title = {The scverse project provides a computational ecosystem for single-cell omics data analysis}, - journal = {Nature Biotechnology} +@article{hu2023single, + title={Single-cell multi-scale footprinting reveals the modular organization of DNA regulatory elements}, + author={Hu, Yan and Ma, Sai and Kartha, Vinay K and Duarte, Fabiana M and Horlbeck, Max and Zhang, Ruochi and Shrestha, Rojesh and Labade, Ajay and Kletzien, Heidi and Meliki, Alia and others}, + journal={bioRxiv}, + pages={2023--03}, + year={2023}, + publisher={Cold Spring Harbor Laboratory} } @misc{shrikumar2021tfmodisco, @@ -17,3 +15,14 @@ @misc{shrikumar2021tfmodisco doi = {10.5281/zenodo.4728132}, url = {https://doi.org/10.5281/zenodo.4728132} } + +@article{Virshup_2023, + doi = {10.1038/s41587-023-01733-8}, + url = {https://doi.org/10.1038%2Fs41587-023-01733-8}, + year = 2023, + month = {apr}, + publisher = {Springer Science and Business Media {LLC}}, + author = {Isaac Virshup and Danila Bredikhin and Lukas Heumos and Giovanni Palla and Gregor Sturm and Adam Gayoso and Ilia Kats and Mikaela Koutrouli and Philipp Angerer and Volker Bergen and Pierre Boyeau and Maren Büttner and Gokcen Eraslan and David Fischer and Max Frank and Justin Hong and Michal Klein and Marius Lange and Romain Lopez and Mohammad Lotfollahi and Malte D. Luecken and Fidel Ramirez and Jeffrey Regier and Sergei Rybakov and Anna C. Schaar and Valeh Valiollah Pour Amiri and Philipp Weiler and Galen Xing and Bonnie Berger and Dana Pe'er and Aviv Regev and Sarah A. Teichmann and Francesca Finotello and F. Alexander Wolf and Nir Yosef and Oliver Stegle and Fabian J. 
Theis}, + title = {The scverse project provides a computational ecosystem for single-cell omics data analysis}, + journal = {Nature Biotechnology} +} diff --git a/docs/references.md b/docs/references.md index 76f59b35..57036eaf 100644 --- a/docs/references.md +++ b/docs/references.md @@ -1,5 +1,5 @@ # References ```{bibliography} - +:all: ``` diff --git a/src/crested/tl/_crested.py b/src/crested/tl/_crested.py index 3015bec6..b8d7ccd3 100644 --- a/src/crested/tl/_crested.py +++ b/src/crested/tl/_crested.py @@ -53,7 +53,7 @@ class Crested: Name of the run. Used for wandb logging and creating output directories. If not provided, the current date and time will be used. logger - Logger to use for logging. Can be "wandb" or "tensorboard" (tensorboard not implemented yet) + Logger to use for logging. Can be "wandb", "tensorboard", or "dvc" (tensorboard not implemented yet) If not provided, no additional logging will be done. seed Seed to use for reproducibility. @@ -204,6 +204,16 @@ def _initialize_logger(logger_type: str | None, project_name: str, run_name: str ) callbacks.append(tensorboard_callback) run = None + elif logger_type == "dvc": + if os.environ["KERAS_BACKEND"] != "tensorflow": + raise ValueError("DVC Keras logging requires a tensorflow backend") + logger.warning("Using DVC logger. 
Make sure to have dvclive installed.") + from dvclive.keras import DVCLiveCallback + + log_dir = os.path.join("logs", project_name, run_name) + dvc_callback = DVCLiveCallback() + callbacks.append(dvc_callback) + run = None else: run = None diff --git a/src/crested/utils/__init__.py b/src/crested/utils/__init__.py index 27b5f75e..4440a9f0 100644 --- a/src/crested/utils/__init__.py +++ b/src/crested/utils/__init__.py @@ -5,6 +5,7 @@ from ._utils import ( EnhancerOptimizer, extract_bigwig_values_per_bp, + fetch_sequences, hot_encoding_to_sequence, one_hot_encode_sequence, read_bigwig_region, diff --git a/src/crested/utils/_utils.py b/src/crested/utils/_utils.py index d52c3e39..6a368aee 100644 --- a/src/crested/utils/_utils.py +++ b/src/crested/utils/_utils.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd +import pysam from loguru import logger from crested._io import _extract_tracks_from_bigwig @@ -315,6 +316,46 @@ def extract_bigwig_values_per_bp( return bw_values, all_midpoints +def fetch_sequences( + regions: str | list[str], genome: os.PathLike, uppercase: bool = True +) -> list[str]: + """ + Fetch sequences from a genome file for one or more regions using pysam. + + Regions should be formatted as "chr:start-end". + + Parameters + ---------- + regions + A single region string or a list of region strings to fetch sequences for. + genome + Path to the genome FASTA file. + uppercase + If True, return sequences in uppercase. + + Returns + ------- + List of sequence strings, one per region, in input order. 
+ + Examples + -------- + >>> regions = ["chr1:1000000-1000100", "chr1:1000100-1000200"] + >>> region_seqs = crested.utils.fetch_sequences(regions, genome_path) + """ + if isinstance(regions, str): + regions = [regions] + fasta = pysam.FastaFile(genome) + seqs = [] + for region in regions: + chrom, start_end = region.split(":") + start, end = start_end.split("-") + seq = fasta.fetch(chrom, int(start), int(end)) + if uppercase: + seq = seq.upper() + seqs.append(seq) + return seqs + + def read_bigwig_region( bigwig_file: os.PathLike, coordinates: tuple[str, int, int],