Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Seq utilities and logging #50

Merged
merged 4 commits into from
Nov 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/api/utils.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ CREsted provides a few utility functions to help with sequence encoding, function
read_bigwig_region
hot_encoding_to_sequence
one_hot_encode_sequence
fetch_sequences
permute_model
setup_logging
```
3 changes: 2 additions & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Configuration file for the Sphinx documentation builder.
"""Configuration file for the Sphinx documentation builder."""
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
Expand Down Expand Up @@ -64,6 +64,7 @@

autosummary_generate = True
autodoc_member_order = "groupwise"
bibtex_reference_style = "author_year"
default_role = "literal"
napoleon_google_docstring = False
napoleon_numpy_docstring = True
Expand Down
27 changes: 18 additions & 9 deletions docs/references.bib
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
@article{Virshup_2023,
doi = {10.1038/s41587-023-01733-8},
url = {https://doi.org/10.1038%2Fs41587-023-01733-8},
year = 2023,
month = {apr},
publisher = {Springer Science and Business Media {LLC}},
author = {Isaac Virshup and Danila Bredikhin and Lukas Heumos and Giovanni Palla and Gregor Sturm and Adam Gayoso and Ilia Kats and Mikaela Koutrouli and Philipp Angerer and Volker Bergen and Pierre Boyeau and Maren Büttner and Gokcen Eraslan and David Fischer and Max Frank and Justin Hong and Michal Klein and Marius Lange and Romain Lopez and Mohammad Lotfollahi and Malte D. Luecken and Fidel Ramirez and Jeffrey Regier and Sergei Rybakov and Anna C. Schaar and Valeh Valiollah Pour Amiri and Philipp Weiler and Galen Xing and Bonnie Berger and Dana Pe'er and Aviv Regev and Sarah A. Teichmann and Francesca Finotello and F. Alexander Wolf and Nir Yosef and Oliver Stegle and Fabian J. Theis},
title = {The scverse project provides a computational ecosystem for single-cell omics data analysis},
journal = {Nature Biotechnology}
@article{hu2023single,
title={Single-cell multi-scale footprinting reveals the modular organization of DNA regulatory elements},
author={Hu, Yan and Ma, Sai and Kartha, Vinay K and Duarte, Fabiana M and Horlbeck, Max and Zhang, Ruochi and Shrestha, Rojesh and Labade, Ajay and Kletzien, Heidi and Meliki, Alia and others},
journal={bioRxiv},
pages={2023--03},
year={2023},
publisher={Cold Spring Harbor Laboratory}
}

@misc{shrikumar2021tfmodisco,
Expand All @@ -17,3 +15,14 @@ @misc{shrikumar2021tfmodisco
doi = {10.5281/zenodo.4728132},
url = {https://doi.org/10.5281/zenodo.4728132}
}

@article{Virshup_2023,
doi = {10.1038/s41587-023-01733-8},
url = {https://doi.org/10.1038%2Fs41587-023-01733-8},
year = 2023,
month = {apr},
publisher = {Springer Science and Business Media {LLC}},
author = {Isaac Virshup and Danila Bredikhin and Lukas Heumos and Giovanni Palla and Gregor Sturm and Adam Gayoso and Ilia Kats and Mikaela Koutrouli and Philipp Angerer and Volker Bergen and Pierre Boyeau and Maren Büttner and Gokcen Eraslan and David Fischer and Max Frank and Justin Hong and Michal Klein and Marius Lange and Romain Lopez and Mohammad Lotfollahi and Malte D. Luecken and Fidel Ramirez and Jeffrey Regier and Sergei Rybakov and Anna C. Schaar and Valeh Valiollah Pour Amiri and Philipp Weiler and Galen Xing and Bonnie Berger and Dana Pe'er and Aviv Regev and Sarah A. Teichmann and Francesca Finotello and F. Alexander Wolf and Nir Yosef and Oliver Stegle and Fabian J. Theis},
title = {The scverse project provides a computational ecosystem for single-cell omics data analysis},
journal = {Nature Biotechnology}
}
2 changes: 1 addition & 1 deletion docs/references.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# References

```{bibliography}

:all:
```
12 changes: 11 additions & 1 deletion src/crested/tl/_crested.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class Crested:
Name of the run. Used for wandb logging and creating output directories.
If not provided, the current date and time will be used.
logger
Logger to use for logging. Can be "wandb" or "tensorboard" (tensorboard not implemented yet)
Logger to use for logging. Can be "wandb", "tensorboard", or "dvc" (tensorboard not implemented yet)
If not provided, no additional logging will be done.
seed
Seed to use for reproducibility.
Expand Down Expand Up @@ -204,6 +204,16 @@ def _initialize_logger(logger_type: str | None, project_name: str, run_name: str
)
callbacks.append(tensorboard_callback)
run = None
elif logger_type == "dvc":
if os.environ["KERAS_BACKEND"] != "tensorflow":
raise ValueError("DVC Keras logging requires a tensorflow backend")
logger.warning("Using DVC logger. Make sure to have dvclive installed.")
from dvclive.keras import DVCLiveCallback

log_dir = os.path.join("logs", project_name, run_name)
dvc_callback = DVCLiveCallback()
callbacks.append(dvc_callback)
run = None
else:
run = None

Expand Down
1 change: 1 addition & 0 deletions src/crested/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from ._utils import (
EnhancerOptimizer,
extract_bigwig_values_per_bp,
fetch_sequences,
hot_encoding_to_sequence,
one_hot_encode_sequence,
read_bigwig_region,
Expand Down
41 changes: 41 additions & 0 deletions src/crested/utils/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import numpy as np
import pandas as pd
import pysam
from loguru import logger

from crested._io import _extract_tracks_from_bigwig
Expand Down Expand Up @@ -315,6 +316,46 @@ def extract_bigwig_values_per_bp(
return bw_values, all_midpoints


def fetch_sequences(
    regions: str | list[str], genome: os.PathLike, uppercase: bool = True
) -> list[str]:
    """
    Fetch sequences from a genome file for one or more regions using pysam.

    Regions should be formatted as "chr:start-end". The start and end values
    are passed unchanged to :meth:`pysam.FastaFile.fetch`.

    Parameters
    ----------
    regions
        A single region string or a list of region strings to fetch sequences for.
    genome
        Path to the genome file.
    uppercase
        If True, return sequences in uppercase.

    Returns
    -------
    List of sequence strings for each region, in the same order as `regions`.

    Raises
    ------
    ValueError
        If a region is not formatted as "chr:start-end" with integer coordinates.

    Examples
    --------
    >>> regions = ["chr1:1000000-1000100", "chr1:1000100-1000200"]
    >>> region_seqs = crested.utils.fetch_sequences(regions, genome_path)
    """
    # A bare string is treated as a single region, not as an iterable of characters.
    if isinstance(regions, str):
        regions = [regions]

    seqs = []
    # The context manager closes the FASTA handle even if a fetch fails.
    with pysam.FastaFile(genome) as fasta:
        for region in regions:
            try:
                # Split on the last ":" so contig names containing ":" still parse.
                chrom, start_end = region.rsplit(":", 1)
                start, end = start_end.split("-")
                start, end = int(start), int(end)
            except ValueError as err:
                raise ValueError(
                    f"Region '{region}' is not formatted as 'chr:start-end'."
                ) from err
            seq = fasta.fetch(chrom, start, end)
            if uppercase:
                seq = seq.upper()
            seqs.append(seq)
    return seqs


def read_bigwig_region(
bigwig_file: os.PathLike,
coordinates: tuple[str, int, int],
Expand Down
Loading