From 50235446a657fbb04872ff33589919b837c73e11 Mon Sep 17 00:00:00 2001
From: nkempynck <niklas.ke@hotmail.com>
Date: Fri, 13 Dec 2024 10:26:14 +0100
Subject: [PATCH] plotting updates: locus_scoring/class_density options; sum n_seqlets on pattern merge

---
 docs/tutorials/model_training_and_eval.ipynb |  8 +++++
 src/crested/pl/hist/_locus_scoring.py        | 22 ++++++++++----
 src/crested/pl/scatter/_class_density.py     | 31 +++++++++++++-------
 src/crested/tl/modisco/_tfmodisco.py         |  2 +-
 4 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/docs/tutorials/model_training_and_eval.ipynb b/docs/tutorials/model_training_and_eval.ipynb
index fc09a0a..14ebc35 100644
--- a/docs/tutorials/model_training_and_eval.ipynb
+++ b/docs/tutorials/model_training_and_eval.ipynb
@@ -2866,6 +2866,14 @@
      },
      "metadata": {},
      "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[1;34mwandb\u001b[0m: 🚀 View run \u001b[33mfinetuned_model\u001b[0m at: \u001b[34mhttps://wandb.ai/kemp/mouse_biccn/runs/it1js3u7\u001b[0m\n",
+      "\u001b[1;34mwandb\u001b[0m: Find logs at: \u001b[1;35mwandb/run-20241212_101919-it1js3u7/logs\u001b[0m\n"
+     ]
     }
    ],
    "source": [
diff --git a/src/crested/pl/hist/_locus_scoring.py b/src/crested/pl/hist/_locus_scoring.py
index e7275eb..997d367 100644
--- a/src/crested/pl/hist/_locus_scoring.py
+++ b/src/crested/pl/hist/_locus_scoring.py
@@ -16,6 +16,9 @@ def locus_scoring(
     bigwig_values: np.ndarray | None = None,
     bigwig_midpoints: list[int] | None = None,
     filename: str | None = None,
+    grid: bool = True,
+    figsize: tuple[float, float] = (30,5),
+    highlight_positions: list[tuple[int, int]] | None = None,
 ):
     """
     Plot the predictions as a line chart over the entire genomic input and optionally indicate the gene locus.
@@ -44,6 +47,12 @@ def locus_scoring(
         A list of base pair positions corresponding to the bigwig_values.
     filename
         The filename to save the plot to.
+    grid
+        Whether to draw grid lines on the plot(s). Default is True.
+    figsize
+        Size of the figure as (width, height) in inches. Default is (30, 5).
+    highlight_positions
+        A list of (start, end) tuples specifying position ranges to highlight (shaded green) on the prediction plot.
 
     See Also
     --------
@@ -65,7 +74,7 @@ def locus_scoring(
     .. image:: ../../../../docs/_static/img/examples/hist_locus_scoring.png
     """
     # Plotting predictions
-    plt.figure(figsize=(30, 10))
+    plt.figure(figsize=figsize)
 
     # Top plot: Model predictions
     plt.subplot(2, 1, 1)
@@ -78,13 +87,16 @@ def locus_scoring(
         label="Prediction Score",
     )
     if gene_start is not None and gene_end is not None:
-        plt.axvspan(gene_start, gene_end, color="red", alpha=0.3, label="Gene Locus")
+        plt.axvspan(gene_start, gene_end, color="red", alpha=0.2, label="Gene Locus")
+    if highlight_positions:
+        for start, end in highlight_positions:
+            plt.axvspan(start, end, color="green", alpha=0.3)
     plt.title(title)
     plt.xlabel("Genomic Position")
     plt.ylabel("Prediction Score")
     plt.ylim(bottom=0)
     plt.xticks(rotation=90)
-    plt.grid(True)
+    plt.grid(grid)
     plt.legend()
     if ylim:
         plt.ylim(ylim)
@@ -101,13 +113,13 @@ def locus_scoring(
         )
         if gene_start is not None and gene_end is not None:
             plt.axvspan(
-                gene_start, gene_end, color="red", alpha=0.3, label="Gene Locus"
+                gene_start, gene_end, color="red", alpha=0.2, label="Gene Locus"
             )
         plt.xlabel("Genomic Position")
         plt.ylabel("bigWig Values")
         plt.xticks(rotation=90)
         plt.ylim(bottom=0)
-        plt.grid(True)
+        plt.grid(grid)
         plt.legend()
 
     plt.tight_layout()
diff --git a/src/crested/pl/scatter/_class_density.py b/src/crested/pl/scatter/_class_density.py
index c17ad53..b6f1623 100644
--- a/src/crested/pl/scatter/_class_density.py
+++ b/src/crested/pl/scatter/_class_density.py
@@ -14,12 +14,13 @@
 
 def class_density(
     adata: AnnData,
-    class_name: str,
+    class_name: str | None = None,
     model_names: list[str] | None = None,
     split: str | None = "test",
     log_transform: bool = False,
     exclude_zeros: bool = True,
     density_indication: bool = False,
+    alpha: float = 0.25,
     **kwargs,
 ) -> plt.Figure:
     """
@@ -30,7 +31,7 @@ def class_density(
     adata
         AnnData object containing the data in `X` and predictions in `layers`.
     class_name
-        Name of the class in `adata.obs_names`.
+        Name of the class in `adata.obs_names`. If None, plot is made for all the classes.
     model_names
         List of model names in `adata.layers`. If None, will create a plot per model in `adata.layers`.
     split
@@ -38,9 +39,11 @@ def class_density(
     log_transform
         Whether to log-transform the data before plotting. Default is False.
     exclude_zeros
-        Whether to exclude zero values from the plot. Default is True.
+        Whether to exclude zero ground truth values from the plot. Default is True.
     density_indication
         Whether to indicate density in the scatter plot. Default is False.
+    alpha
+        Transparency of points in scatter plot. From 0 (transparent) to 1 (opaque).
     kwargs
         Additional arguments passed to :func:`~crested.pl.render_plot` to
         control the final plot output. Please see :func:`~crested.pl.render_plot`
@@ -75,7 +78,7 @@ def _check_input_params():
                 "No split column found in anndata.var. Run `pp.train_val_test_split` first if 'split' is not None."
             )
 
-        if class_name not in adata.obs_names:
+        if (class_name) and (class_name not in adata.obs_names):
             raise ValueError(f"Class {class_name} not found in adata.obs_names.")
         if split not in ["train", "val", "test", None]:
             raise ValueError("Split must be 'train', 'val', 'test', or None.")
@@ -83,7 +86,7 @@ def _check_input_params():
     _check_input_params()
 
     classes = list(adata.obs_names)
-    column_index = classes.index(class_name)
+    column_index = classes.index(class_name) if class_name else np.arange(0, len(classes))
     if model_names is None:
         model_names = list(adata.layers.keys())
 
@@ -115,9 +118,14 @@ def _check_input_params():
 
     n_models = len(predicted_values)
 
-    logger.info(
-        f"Plotting density scatter for class: {class_name}, models: {model_names}, split: {split}"
-    )
+    if class_name:
+        logger.info(
+            f"Plotting density scatter for class: {class_name}, models: {model_names}, split: {split}"
+        )
+    else:
+        logger.info(
+            f"Plotting density scatter for all targets and predictions, models: {model_names}, split: {split}"
+        )
 
     fig, axes = plt.subplots(1, n_models, figsize=(8 * n_models, 8), sharey=True)
     if n_models == 1:
@@ -130,10 +138,11 @@ def _check_input_params():
         if density_indication:
             xy = np.vstack([x, y])
             z = gaussian_kde(xy)(xy)
-            scatter = ax.scatter(x, y, c=z, s=50, edgecolor="k", alpha=0.25)
+            scatter = ax.scatter(x, y, c=z, s=50, edgecolor="k", alpha=alpha)
+            scatter.set_rasterized(True)  # Rasterize only the scatter points
             plt.colorbar(scatter, ax=ax, label="Density")
         else:
-            scatter = ax.scatter(x, y, edgecolor="k", alpha=0.25)
+            scatter = ax.scatter(x, y, edgecolor="k", alpha=alpha)
 
         ax.annotate(
             f"Pearson: {pearson_corr:.2f}",
@@ -164,6 +173,6 @@ def _check_input_params():
     if "ylabel" not in kwargs:
         kwargs["ylabel"] = "Predictions"
     if "title" not in kwargs:
-        kwargs["title"] = f"{class_name}"
+        kwargs["title"] = f"{class_name}" if class_name else "Targets vs Predictions"
 
     return render_plot(fig, **kwargs)
diff --git a/src/crested/tl/modisco/_tfmodisco.py b/src/crested/tl/modisco/_tfmodisco.py
index cc57124..4952b18 100644
--- a/src/crested/tl/modisco/_tfmodisco.py
+++ b/src/crested/tl/modisco/_tfmodisco.py
@@ -545,7 +545,7 @@ def merge_patterns(pattern1: dict, pattern2: dict) -> dict:
                 if ic_a > ic_b
                 else pattern2["classes"][cell_type]
             )
-            merged_classes[cell_type]['n_seqlets'] = max(n_seqlets_a, n_seqlets_b) # if patterns from the same class get merged, we keep the max seqlet count between the two of them since they are the same pattern
+            merged_classes[cell_type]['n_seqlets'] = n_seqlets_a + n_seqlets_b
         else:
             merged_classes[cell_type] = pattern1["classes"][cell_type]