diff --git a/docs/tutorials/enhancer_code_analysis.ipynb b/docs/tutorials/enhancer_code_analysis.ipynb index c5471e1..c784fcb 100644 --- a/docs/tutorials/enhancer_code_analysis.ipynb +++ b/docs/tutorials/enhancer_code_analysis.ipynb @@ -33,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -54,9 +54,12 @@ "adata = anndata.read_h5ad(\"mouse_biccn_data_filtered.h5ad\")\n", "\n", "genome_file = \"/home/VIB.LOCAL/niklas.kempynck/nkemp/software/dev_DeepPeak/DeepPeak/data/raw_mm/genome.fa\"\n", + "\n", + "genome = crested.Genome(genome_file)\n", + "\n", "datamodule = crested.tl.data.AnnDataModule(\n", " adata,\n", - " genome_file=genome_file,\n", + " genome,\n", ")" ] }, @@ -315,7 +318,7 @@ ], "source": [ "%matplotlib inline\n", - "top_k=1000\n", + "top_k = 1000\n", "crested.pl.patterns.modisco_results(\n", " classes=[\"Astro\", \"L5ET\", \"Vip\", \"Oligo\"],\n", " contribution=\"positive\",\n", @@ -691,7 +694,10 @@ " verbose=True, # Useful for doing sanity checks on matching patterns\n", ")\n", "pattern_matrix = crested.tl.modisco.create_pattern_matrix(\n", - " classes=list(adata.obs_names), all_patterns=all_patterns, normalize=False, pattern_parameter='seqlet_count_log'\n", + " classes=list(adata.obs_names),\n", + " all_patterns=all_patterns,\n", + " normalize=False,\n", + " pattern_parameter=\"seqlet_count_log\",\n", ")\n", "pattern_matrix.shape" ] @@ -722,13 +728,21 @@ ], "source": [ "import matplotlib\n", + "\n", "%matplotlib inline\n", - "matplotlib.rcParams['pdf.fonttype'] = 42\n", - "matplotlib.rcParams['ps.fonttype'] = 42\n", + "matplotlib.rcParams[\"pdf.fonttype\"] = 42\n", + "matplotlib.rcParams[\"ps.fonttype\"] = 42\n", "\n", "pat_seqs = crested.tl.modisco.generate_nucleotide_sequences(all_patterns)\n", "crested.pl.patterns.clustermap(\n", - " pattern_matrix, list(adata.obs_names), figsize=(16, 4.2), pat_seqs=pat_seqs, grid=True, fig_path='paperfigs/motif_clustering.pdf', 
dendrogram_ratio=(0.03,0.15), importance_threshold=4.5\n", + " pattern_matrix,\n", + " list(adata.obs_names),\n", + " figsize=(16, 4.2),\n", + " pat_seqs=pat_seqs,\n", + " grid=True,\n", + " fig_path=\"paperfigs/motif_clustering.pdf\",\n", + " dendrogram_ratio=(0.03, 0.15),\n", + " importance_threshold=4.5,\n", ")" ] }, @@ -791,12 +805,12 @@ "crested.pl.patterns.clustermap(\n", " pattern_matrix,\n", " classes=list(adata.obs_names),\n", - " subset=['L2_3IT', 'L5ET', 'L5IT', 'L5_6NP', 'L6CT', 'L6IT','L6b'],\n", + " subset=[\"L2_3IT\", \"L5ET\", \"L5IT\", \"L5_6NP\", \"L6CT\", \"L6IT\", \"L6b\"],\n", " figsize=(10, 2),\n", " pat_seqs=pat_seqs,\n", " grid=True,\n", " dy=0.0025,\n", - " importance_threshold=4.5\n", + " importance_threshold=4.5,\n", ")" ] }, @@ -1022,6 +1036,7 @@ ], "source": [ "import crested\n", + "\n", "file_path = \"/home/VIB.LOCAL/niklas.kempynck/nkemp/mouse/biccn/Mouse_rna.h5ad\" # Locate h5 file containing scRNAseq data\n", "cell_type_column = \"subclass_Bakken_2022\"\n", "mean_expression_df = crested.tl.modisco.calculate_mean_expression_per_cell_type(\n", @@ -1046,7 +1061,9 @@ } ], "source": [ - "crested.pl.patterns.tf_expression_per_cell_type(mean_expression_df, ['Nfia', 'Spi1', 'Mef2c'])" + "crested.pl.patterns.tf_expression_per_cell_type(\n", + " mean_expression_df, [\"Nfia\", \"Spi1\", \"Mef2c\"]\n", + ")" ] }, { @@ -1486,7 +1503,7 @@ " normalize_gex=True,\n", " min_tf_gex=0.95,\n", " importance_threshold=5,\n", - " pattern_parameter='seqlet_count_log',\n", + " pattern_parameter=\"seqlet_count_log\",\n", " filter_correlation=True,\n", " verbose=True,\n", " zscore_threshold=1,\n", @@ -1528,13 +1545,13 @@ "source": [ "crested.pl.patterns.clustermap_tf_motif(\n", " tf_ct_matrix,\n", - " heatmap_dim='contrib',\n", - " dot_dim='gex',\n", + " heatmap_dim=\"contrib\",\n", + " dot_dim=\"gex\",\n", " class_labels=classes,\n", " pattern_labels=tf_pattern_annots,\n", - " fig_size=(35,6),\n", + " fig_size=(35, 6),\n", " cluster_rows=True,\n", - " 
cluster_columns=False\n", + " cluster_columns=False,\n", ")" ] }, diff --git a/docs/tutorials/model_training_and_eval.ipynb b/docs/tutorials/model_training_and_eval.ipynb index f00fa45..70984c0 100644 --- a/docs/tutorials/model_training_and_eval.ipynb +++ b/docs/tutorials/model_training_and_eval.ipynb @@ -64,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -76,7 +76,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "By loading our genome in the {class}`crested.Genome` class and setting it with {func}`~crested.register_genome`, the genome is automatically used in all functions throughout CREsted." + "By loading our genome in the {class}`crested.Genome` class and setting it with {func}`~crested.register_genome`, the genome is automatically used in all functions throughout CREsted. If you don't provide the chromosome sizes, they will be automatically calculated from the fasta.\n", + "```{note}\n", + "Any function or class that expects a genome object can still accept a genome object as explicit input even if one was already registered. In that case, the input will be used instead of the registered genome. 
\n", + "```" ] }, { @@ -647,9 +650,7 @@ ], "source": [ "# Load chrombpnet architecture for a dataset with 2114bp regions and 19 cell types\n", - "model_architecture = crested.tl.zoo.chrombpnet(\n", - " seq_len=2114, num_classes=adata.n_obs\n", - ")" + "model_architecture = crested.tl.zoo.chrombpnet(seq_len=2114, num_classes=adata.n_obs)" ] }, { @@ -2125,7 +2126,9 @@ "outputs": [], "source": [ "bw_file = Path(bigwigs_folder) / \"Sst.bw\"\n", - "bw_values, midpoints = crested.utils.read_bigwig_region(bw_file, (chrom, min_loc, max_loc))" + "bw_values, midpoints = crested.utils.read_bigwig_region(\n", + " bw_file, (chrom, min_loc, max_loc)\n", + ")" ] }, { diff --git a/docs/tutorials/multi_gpu.ipynb b/docs/tutorials/multi_gpu.ipynb index d84283e..788408a 100644 --- a/docs/tutorials/multi_gpu.ipynb +++ b/docs/tutorials/multi_gpu.ipynb @@ -30,7 +30,7 @@ "\n", "datamodule = crested.tl.data.AnnDataModule(\n", " my_adata,\n", - " genome_file=my_genome,\n", + " genome=my_genome,\n", " chromsizes_file=my_chromsizes_file,\n", " batch_size=128,\n", " max_stochastic_shift=5,\n", diff --git a/docs/tutorials/topic_classification.ipynb b/docs/tutorials/topic_classification.ipynb index eeddcb7..ac3bf44 100644 --- a/docs/tutorials/topic_classification.ipynb +++ b/docs/tutorials/topic_classification.ipynb @@ -155,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -171,7 +171,7 @@ "# Datamodule\n", "datamodule = crested.tl.data.AnnDataModule(\n", " adata,\n", - " genome_file=\"../../../Crested_testing/data/tmp/mm10.fa\",\n", + " genome=\"../../../Crested_testing/data/tmp/mm10.fa\",\n", " batch_size=128, # lower this if you encounter OOM errors\n", " max_stochastic_shift=3, # optional augmentation\n", " always_reverse_complement=True, # default True. 
Will double the effective size of the training dataset.\n", diff --git a/src/crested/_genome.py b/src/crested/_genome.py index e3e9fce..8f4281f 100644 --- a/src/crested/_genome.py +++ b/src/crested/_genome.py @@ -43,6 +43,10 @@ class Genome: {'chr1': 1000, 'chr2': 2000} >>> print(genome.name) test + + See Also + -------- + crested.register_genome """ def __init__( @@ -147,7 +151,7 @@ def name(self) -> str: return basename return self._name - def fetch(self, chrom=None, start=None, end=None, strand = "+", region = None) -> str: + def fetch(self, chrom=None, start=None, end=None, strand="+", region=None) -> str: """ Fetch a sequence from a genomic region. @@ -171,7 +175,9 @@ def fetch(self, chrom=None, start=None, end=None, strand = "+", region = None) - The requested sequence, as a string. """ if region and (chrom or start or end): - logger.warning("Both region and chrom/start/end supplied. Using chrom/start/end...") + logger.warning( + "Both region and chrom/start/end supplied. Using chrom/start/end..." + ) elif region: if region[-2] == ":": chrom, start_end, strand = region.split(":") @@ -180,7 +186,9 @@ def fetch(self, chrom=None, start=None, end=None, strand = "+", region = None) - start, end = map(int, start_end.split("-")) if not (chrom and start and end): - raise ValueError("chrom/start/end must all be supplied to extract a sequence.") + raise ValueError( + "chrom/start/end must all be supplied to extract a sequence." + ) seq = self.fasta.fetch(reference=chrom, start=start, end=end) if strand == "-": @@ -188,6 +196,7 @@ def fetch(self, chrom=None, start=None, end=None, strand = "+", region = None) - else: return seq + def register_genome(genome: Genome): """ Register a genome to be used throughout a session.