Skip to content

Commit

Permalink
explicit genome in all docs
Browse files Browse the repository at this point in the history
  • Loading branch information
LukasMahieu committed Dec 10, 2024
1 parent 065a984 commit 01a46e7
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 27 deletions.
47 changes: 32 additions & 15 deletions docs/tutorials/enhancer_code_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -54,9 +54,12 @@
"adata = anndata.read_h5ad(\"mouse_biccn_data_filtered.h5ad\")\n",
"\n",
"genome_file = \"/home/VIB.LOCAL/niklas.kempynck/nkemp/software/dev_DeepPeak/DeepPeak/data/raw_mm/genome.fa\"\n",
"\n",
"genome = crested.Genome(genome_file)\n",
"\n",
"datamodule = crested.tl.data.AnnDataModule(\n",
" adata,\n",
" genome_file=genome_file,\n",
" genome,\n",
")"
]
},
Expand Down Expand Up @@ -315,7 +318,7 @@
],
"source": [
"%matplotlib inline\n",
"top_k=1000\n",
"top_k = 1000\n",
"crested.pl.patterns.modisco_results(\n",
" classes=[\"Astro\", \"L5ET\", \"Vip\", \"Oligo\"],\n",
" contribution=\"positive\",\n",
Expand Down Expand Up @@ -691,7 +694,10 @@
" verbose=True, # Useful for doing sanity checks on matching patterns\n",
")\n",
"pattern_matrix = crested.tl.modisco.create_pattern_matrix(\n",
" classes=list(adata.obs_names), all_patterns=all_patterns, normalize=False, pattern_parameter='seqlet_count_log'\n",
" classes=list(adata.obs_names),\n",
" all_patterns=all_patterns,\n",
" normalize=False,\n",
" pattern_parameter=\"seqlet_count_log\",\n",
")\n",
"pattern_matrix.shape"
]
Expand Down Expand Up @@ -722,13 +728,21 @@
],
"source": [
"import matplotlib\n",
"\n",
"%matplotlib inline\n",
"matplotlib.rcParams['pdf.fonttype'] = 42\n",
"matplotlib.rcParams['ps.fonttype'] = 42\n",
"matplotlib.rcParams[\"pdf.fonttype\"] = 42\n",
"matplotlib.rcParams[\"ps.fonttype\"] = 42\n",
"\n",
"pat_seqs = crested.tl.modisco.generate_nucleotide_sequences(all_patterns)\n",
"crested.pl.patterns.clustermap(\n",
" pattern_matrix, list(adata.obs_names), figsize=(16, 4.2), pat_seqs=pat_seqs, grid=True, fig_path='paperfigs/motif_clustering.pdf', dendrogram_ratio=(0.03,0.15), importance_threshold=4.5\n",
" pattern_matrix,\n",
" list(adata.obs_names),\n",
" figsize=(16, 4.2),\n",
" pat_seqs=pat_seqs,\n",
" grid=True,\n",
" fig_path=\"paperfigs/motif_clustering.pdf\",\n",
" dendrogram_ratio=(0.03, 0.15),\n",
" importance_threshold=4.5,\n",
")"
]
},
Expand Down Expand Up @@ -791,12 +805,12 @@
"crested.pl.patterns.clustermap(\n",
" pattern_matrix,\n",
" classes=list(adata.obs_names),\n",
" subset=['L2_3IT', 'L5ET', 'L5IT', 'L5_6NP', 'L6CT', 'L6IT','L6b'],\n",
" subset=[\"L2_3IT\", \"L5ET\", \"L5IT\", \"L5_6NP\", \"L6CT\", \"L6IT\", \"L6b\"],\n",
" figsize=(10, 2),\n",
" pat_seqs=pat_seqs,\n",
" grid=True,\n",
" dy=0.0025,\n",
" importance_threshold=4.5\n",
" importance_threshold=4.5,\n",
")"
]
},
Expand Down Expand Up @@ -1022,6 +1036,7 @@
],
"source": [
"import crested\n",
"\n",
"file_path = \"/home/VIB.LOCAL/niklas.kempynck/nkemp/mouse/biccn/Mouse_rna.h5ad\" # Locate h5 file containing scRNAseq data\n",
"cell_type_column = \"subclass_Bakken_2022\"\n",
"mean_expression_df = crested.tl.modisco.calculate_mean_expression_per_cell_type(\n",
Expand All @@ -1046,7 +1061,9 @@
}
],
"source": [
"crested.pl.patterns.tf_expression_per_cell_type(mean_expression_df, ['Nfia', 'Spi1', 'Mef2c'])"
"crested.pl.patterns.tf_expression_per_cell_type(\n",
" mean_expression_df, [\"Nfia\", \"Spi1\", \"Mef2c\"]\n",
")"
]
},
{
Expand Down Expand Up @@ -1486,7 +1503,7 @@
" normalize_gex=True,\n",
" min_tf_gex=0.95,\n",
" importance_threshold=5,\n",
" pattern_parameter='seqlet_count_log',\n",
" pattern_parameter=\"seqlet_count_log\",\n",
" filter_correlation=True,\n",
" verbose=True,\n",
" zscore_threshold=1,\n",
Expand Down Expand Up @@ -1528,13 +1545,13 @@
"source": [
"crested.pl.patterns.clustermap_tf_motif(\n",
" tf_ct_matrix,\n",
" heatmap_dim='contrib',\n",
" dot_dim='gex',\n",
" heatmap_dim=\"contrib\",\n",
" dot_dim=\"gex\",\n",
" class_labels=classes,\n",
" pattern_labels=tf_pattern_annots,\n",
" fig_size=(35,6),\n",
" fig_size=(35, 6),\n",
" cluster_rows=True,\n",
" cluster_columns=False\n",
" cluster_columns=False,\n",
")"
]
},
Expand Down
15 changes: 9 additions & 6 deletions docs/tutorials/model_training_and_eval.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -76,7 +76,10 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"By loading our genome in the {class}`crested.Genome` class and setting it with {func}`~crested.register_genome`, the genome is automatically used in all functions throughout CREsted."
"By loading our genome in the {class}`crested.Genome` class and setting it with {func}`~crested.register_genome`, the genome is automatically used in all functions throughout CREsted. If you don't provide the chromosome sizes, they will be automatically calculated from the fasta.\n",
"```{note}\n",
"Any function or class that expects a genome object still accepts one as explicit input, even if a genome was already registered. In that case, the explicitly provided genome is used instead of the registered one.\n",
"```"
]
},
{
Expand Down Expand Up @@ -647,9 +650,7 @@
],
"source": [
"# Load chrombpnet architecture for a dataset with 2114bp regions and 19 cell types\n",
"model_architecture = crested.tl.zoo.chrombpnet(\n",
" seq_len=2114, num_classes=adata.n_obs\n",
")"
"model_architecture = crested.tl.zoo.chrombpnet(seq_len=2114, num_classes=adata.n_obs)"
]
},
{
Expand Down Expand Up @@ -2125,7 +2126,9 @@
"outputs": [],
"source": [
"bw_file = Path(bigwigs_folder) / \"Sst.bw\"\n",
"bw_values, midpoints = crested.utils.read_bigwig_region(bw_file, (chrom, min_loc, max_loc))"
"bw_values, midpoints = crested.utils.read_bigwig_region(\n",
" bw_file, (chrom, min_loc, max_loc)\n",
")"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion docs/tutorials/multi_gpu.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
"\n",
"datamodule = crested.tl.data.AnnDataModule(\n",
" my_adata,\n",
" genome_file=my_genome,\n",
" genome=my_genome,\n",
" chromsizes_file=my_chromsizes_file,\n",
" batch_size=128,\n",
" max_stochastic_shift=5,\n",
Expand Down
4 changes: 2 additions & 2 deletions docs/tutorials/topic_classification.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
Expand All @@ -171,7 +171,7 @@
"# Datamodule\n",
"datamodule = crested.tl.data.AnnDataModule(\n",
" adata,\n",
" genome_file=\"../../../Crested_testing/data/tmp/mm10.fa\",\n",
" genome=\"../../../Crested_testing/data/tmp/mm10.fa\",\n",
" batch_size=128, # lower this if you encounter OOM errors\n",
" max_stochastic_shift=3, # optional augmentation\n",
" always_reverse_complement=True, # default True. Will double the effective size of the training dataset.\n",
Expand Down
15 changes: 12 additions & 3 deletions src/crested/_genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ class Genome:
{'chr1': 1000, 'chr2': 2000}
>>> print(genome.name)
test
See Also
--------
crested.register_genome
"""

def __init__(
Expand Down Expand Up @@ -147,7 +151,7 @@ def name(self) -> str:
return basename
return self._name

def fetch(self, chrom=None, start=None, end=None, strand = "+", region = None) -> str:
def fetch(self, chrom=None, start=None, end=None, strand="+", region=None) -> str:
"""
Fetch a sequence from a genomic region.
Expand All @@ -171,7 +175,9 @@ def fetch(self, chrom=None, start=None, end=None, strand = "+", region = None) -
The requested sequence, as a string.
"""
if region and (chrom or start or end):
logger.warning("Both region and chrom/start/end supplied. Using chrom/start/end...")
logger.warning(
"Both region and chrom/start/end supplied. Using chrom/start/end..."
)
elif region:
if region[-2] == ":":
chrom, start_end, strand = region.split(":")
Expand All @@ -180,14 +186,17 @@ def fetch(self, chrom=None, start=None, end=None, strand = "+", region = None) -
start, end = map(int, start_end.split("-"))

if not (chrom and start and end):
raise ValueError("chrom/start/end must all be supplied to extract a sequence.")
raise ValueError(
"chrom/start/end must all be supplied to extract a sequence."
)

seq = self.fasta.fetch(reference=chrom, start=start, end=end)
if strand == "-":
return reverse_complement(seq)
else:
return seq


def register_genome(genome: Genome):
"""
Register a genome to be used throughout a session.
Expand Down

0 comments on commit 01a46e7

Please sign in to comment.