explicit genome in all docs

aertslab · Dec 10, 2024 · 01a46e7 · 01a46e7
1 parent 065a984
commit 01a46e7
Show file tree

Hide file tree

Showing 5 changed files with 56 additions and 27 deletions.
diff --git a/docs/tutorials/enhancer_code_analysis.ipynb b/docs/tutorials/enhancer_code_analysis.ipynb
@@ -33,7 +33,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -54,9 +54,12 @@
     "adata = anndata.read_h5ad(\"mouse_biccn_data_filtered.h5ad\")\n",
     "\n",
     "genome_file = \"/home/VIB.LOCAL/niklas.kempynck/nkemp/software/dev_DeepPeak/DeepPeak/data/raw_mm/genome.fa\"\n",
+    "\n",
+    "genome = crested.Genome(genome_file)\n",
+    "\n",
     "datamodule = crested.tl.data.AnnDataModule(\n",
     "    adata,\n",
-    "    genome_file=genome_file,\n",
+    "    genome,\n",
     ")"
    ]
   },
@@ -315,7 +318,7 @@
    ],
    "source": [
     "%matplotlib inline\n",
-    "top_k=1000\n",
+    "top_k = 1000\n",
     "crested.pl.patterns.modisco_results(\n",
     "    classes=[\"Astro\", \"L5ET\", \"Vip\", \"Oligo\"],\n",
     "    contribution=\"positive\",\n",
@@ -691,7 +694,10 @@
     "    verbose=True,  # Useful for doing sanity checks on matching patterns\n",
     ")\n",
     "pattern_matrix = crested.tl.modisco.create_pattern_matrix(\n",
-    "    classes=list(adata.obs_names), all_patterns=all_patterns, normalize=False, pattern_parameter='seqlet_count_log'\n",
+    "    classes=list(adata.obs_names),\n",
+    "    all_patterns=all_patterns,\n",
+    "    normalize=False,\n",
+    "    pattern_parameter=\"seqlet_count_log\",\n",
     ")\n",
     "pattern_matrix.shape"
    ]
@@ -722,13 +728,21 @@
    ],
    "source": [
     "import matplotlib\n",
+    "\n",
     "%matplotlib inline\n",
-    "matplotlib.rcParams['pdf.fonttype'] = 42\n",
-    "matplotlib.rcParams['ps.fonttype'] = 42\n",
+    "matplotlib.rcParams[\"pdf.fonttype\"] = 42\n",
+    "matplotlib.rcParams[\"ps.fonttype\"] = 42\n",
     "\n",
     "pat_seqs = crested.tl.modisco.generate_nucleotide_sequences(all_patterns)\n",
     "crested.pl.patterns.clustermap(\n",
-    "    pattern_matrix, list(adata.obs_names), figsize=(16, 4.2), pat_seqs=pat_seqs, grid=True, fig_path='paperfigs/motif_clustering.pdf', dendrogram_ratio=(0.03,0.15), importance_threshold=4.5\n",
+    "    pattern_matrix,\n",
+    "    list(adata.obs_names),\n",
+    "    figsize=(16, 4.2),\n",
+    "    pat_seqs=pat_seqs,\n",
+    "    grid=True,\n",
+    "    fig_path=\"paperfigs/motif_clustering.pdf\",\n",
+    "    dendrogram_ratio=(0.03, 0.15),\n",
+    "    importance_threshold=4.5,\n",
     ")"
    ]
   },
@@ -791,12 +805,12 @@
     "crested.pl.patterns.clustermap(\n",
     "    pattern_matrix,\n",
     "    classes=list(adata.obs_names),\n",
-    "    subset=['L2_3IT', 'L5ET', 'L5IT', 'L5_6NP', 'L6CT', 'L6IT','L6b'],\n",
+    "    subset=[\"L2_3IT\", \"L5ET\", \"L5IT\", \"L5_6NP\", \"L6CT\", \"L6IT\", \"L6b\"],\n",
     "    figsize=(10, 2),\n",
     "    pat_seqs=pat_seqs,\n",
     "    grid=True,\n",
     "    dy=0.0025,\n",
-    "    importance_threshold=4.5\n",
+    "    importance_threshold=4.5,\n",
     ")"
    ]
   },
@@ -1022,6 +1036,7 @@
    ],
    "source": [
     "import crested\n",
+    "\n",
     "file_path = \"/home/VIB.LOCAL/niklas.kempynck/nkemp/mouse/biccn/Mouse_rna.h5ad\"  # Locate h5 file containing scRNAseq data\n",
     "cell_type_column = \"subclass_Bakken_2022\"\n",
     "mean_expression_df = crested.tl.modisco.calculate_mean_expression_per_cell_type(\n",
@@ -1046,7 +1061,9 @@
     }
    ],
    "source": [
-    "crested.pl.patterns.tf_expression_per_cell_type(mean_expression_df, ['Nfia', 'Spi1', 'Mef2c'])"
+    "crested.pl.patterns.tf_expression_per_cell_type(\n",
+    "    mean_expression_df, [\"Nfia\", \"Spi1\", \"Mef2c\"]\n",
+    ")"
    ]
   },
   {
@@ -1486,7 +1503,7 @@
     "    normalize_gex=True,\n",
     "    min_tf_gex=0.95,\n",
     "    importance_threshold=5,\n",
-    "    pattern_parameter='seqlet_count_log',\n",
+    "    pattern_parameter=\"seqlet_count_log\",\n",
     "    filter_correlation=True,\n",
     "    verbose=True,\n",
     "    zscore_threshold=1,\n",
@@ -1528,13 +1545,13 @@
    "source": [
     "crested.pl.patterns.clustermap_tf_motif(\n",
     "    tf_ct_matrix,\n",
-    "    heatmap_dim='contrib',\n",
-    "    dot_dim='gex',\n",
+    "    heatmap_dim=\"contrib\",\n",
+    "    dot_dim=\"gex\",\n",
     "    class_labels=classes,\n",
     "    pattern_labels=tf_pattern_annots,\n",
-    "    fig_size=(35,6),\n",
+    "    fig_size=(35, 6),\n",
     "    cluster_rows=True,\n",
-    "    cluster_columns=False\n",
+    "    cluster_columns=False,\n",
     ")"
    ]
   },

diff --git a/docs/tutorials/model_training_and_eval.ipynb b/docs/tutorials/model_training_and_eval.ipynb
@@ -64,7 +64,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -76,7 +76,10 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "By loading our genome in the {class}`crested.Genome` class and setting it with {func}`~crested.register_genome`, the genome is automatically used in all functions throughout CREsted."
+    "By loading our genome in the {class}`crested.Genome` class and setting it with {func}`~crested.register_genome`, the genome is automatically used in all functions throughout CREsted. If you don't provide the chromosome sizes, they will be automatically calculated from the FASTA file.\n",
+    "```{note}\n",
+    "Any function or class that expects a genome object still accepts one as an explicit argument, even if a genome was already registered. In that case, the explicit argument is used instead of the registered genome.\n",
+    "```"
    ]
   },
   {
@@ -647,9 +650,7 @@
    ],
    "source": [
     "# Load chrombpnet architecture for a dataset with 2114bp regions and 19 cell types\n",
-    "model_architecture = crested.tl.zoo.chrombpnet(\n",
-    "    seq_len=2114, num_classes=adata.n_obs\n",
-    ")"
+    "model_architecture = crested.tl.zoo.chrombpnet(seq_len=2114, num_classes=adata.n_obs)"
    ]
   },
   {
@@ -2125,7 +2126,9 @@
    "outputs": [],
    "source": [
     "bw_file = Path(bigwigs_folder) / \"Sst.bw\"\n",
-    "bw_values, midpoints = crested.utils.read_bigwig_region(bw_file, (chrom, min_loc, max_loc))"
+    "bw_values, midpoints = crested.utils.read_bigwig_region(\n",
+    "    bw_file, (chrom, min_loc, max_loc)\n",
+    ")"
    ]
   },
   {

diff --git a/docs/tutorials/multi_gpu.ipynb b/docs/tutorials/multi_gpu.ipynb
@@ -30,7 +30,7 @@
     "\n",
     "datamodule = crested.tl.data.AnnDataModule(\n",
     "        my_adata,\n",
-    "        genome_file=my_genome,\n",
+    "        genome=my_genome,\n",
     "        chromsizes_file=my_chromsizes_file,\n",
     "        batch_size=128,\n",
     "        max_stochastic_shift=5,\n",

diff --git a/docs/tutorials/topic_classification.ipynb b/docs/tutorials/topic_classification.ipynb
@@ -155,7 +155,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -171,7 +171,7 @@
     "# Datamodule\n",
     "datamodule = crested.tl.data.AnnDataModule(\n",
     "    adata,\n",
-    "    genome_file=\"../../../Crested_testing/data/tmp/mm10.fa\",\n",
+    "    genome=\"../../../Crested_testing/data/tmp/mm10.fa\",\n",
     "    batch_size=128,  # lower this if you encounter OOM errors\n",
     "    max_stochastic_shift=3,  # optional augmentation\n",
     "    always_reverse_complement=True,  # default True. Will double the effective size of the training dataset.\n",

diff --git a/src/crested/_genome.py b/src/crested/_genome.py
@@ -43,6 +43,10 @@ class Genome:
     {'chr1': 1000, 'chr2': 2000}
     >>> print(genome.name)
     test
+
+    See Also
+    --------
+    crested.register_genome
     """
 
     def __init__(
@@ -147,7 +151,7 @@ def name(self) -> str:
                 return basename
         return self._name
 
-    def fetch(self, chrom=None, start=None, end=None, strand = "+", region = None) -> str:
+    def fetch(self, chrom=None, start=None, end=None, strand="+", region=None) -> str:
         """
         Fetch a sequence from a genomic region.
 
@@ -171,7 +175,9 @@ def fetch(self, chrom=None, start=None, end=None, strand = "+", region = None) -
         The requested sequence, as a string.
         """
         if region and (chrom or start or end):
-            logger.warning("Both region and chrom/start/end supplied. Using chrom/start/end...")
+            logger.warning(
+                "Both region and chrom/start/end supplied. Using chrom/start/end..."
+            )
         elif region:
             if region[-2] == ":":
                 chrom, start_end, strand = region.split(":")
@@ -180,14 +186,17 @@ def fetch(self, chrom=None, start=None, end=None, strand = "+", region = None) -
             start, end = map(int, start_end.split("-"))
 
         if not (chrom and start and end):
-            raise ValueError("chrom/start/end must all be supplied to extract a sequence.")
+            raise ValueError(
+                "chrom/start/end must all be supplied to extract a sequence."
+            )
 
         seq = self.fasta.fetch(reference=chrom, start=start, end=end)
         if strand == "-":
             return reverse_complement(seq)
         else:
             return seq
 
+
 def register_genome(genome: Genome):
     """
     Register a genome to be used throughout a session.