epigen · bednarsky · Nov 19, 2024 · Nov 19, 2024 · Nov 19, 2024 · Nov 19, 2024
diff --git a/config/README.md b/config/README.md
@@ -6,7 +6,7 @@ You need one configuration file to configure the analyses and one annotation fil
 - sample annotation (annotation): CSV file consisting of four mandatory columns.
     -  name: A unique name for the dataset (tip: keep it short but descriptive).
     -  data: Path to the tabular data as a comma-separated table (CSV).
-    -  metadata: Path to the metadata as a comma-separated table (CSV) with the first column being the index/identifier of each observation/sample and every other column metadata for the respective observation (either numeric or categorical, not mixed). **No NaN or empty values allowed.**
+    -  metadata: Path to the metadata as a comma-separated table (CSV) with the first column being the index/identifier of each observation/sample and every other column metadata for the respective observation (either numeric or categorical, not mixed). **No NaN or empty values allowed, and no special characters in the index (only a-z, 0-9 and `_` are permitted).**
     -  samples_by_features: Boolean indicator if the data matrix is observations/samples (rows) x features (columns): 0==no, 1==yes.
 
 Set workflow-specific `resources` or command line arguments (CLI) in the workflow profile `workflow/profiles/default.config.yaml`, which supersedes global Snakemake profiles.
diff --git a/config/config.yaml b/config/config.yaml
@@ -41,7 +41,7 @@ umap:
 heatmap:
     metrics: ['correlation','cosine']
     hclust_methods: ['complete']
-    n_observations: 1000 # random sampled proportion float (0-1] or absolute number as integer
+    n_observations: 1 # randomly sampled proportion as float (0-1] or absolute number as integer; the value 1 is interpreted as the proportion 1.0, i.e., all observations
     n_features: 0.5 # highly variable features proportion float (0-1] or absolute number as integer
 
 ##### LEIDEN #####
@@ -92,6 +92,7 @@ metadata_of_interest: ["target"]
 coord_fixed: 0
 
 # 2D/3D visualization with ggplot2 and plotly
+# for more than 200 datapoints it is recommended to reduce size to 0.2
 scatterplot2d:
     size: 1
     alpha: 1

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -7,6 +7,7 @@ import os
 import sys
 import pandas as pd
 import yaml
+import re
 from snakemake.utils import min_version
 
 ##### set minimum Snakemake version #####
@@ -23,6 +24,9 @@ configfile: os.path.join("config", "config.yaml")
 
 # load annotations
 annot = pd.read_csv(config['annotation'], index_col='name')
+# make sure the index does not contain any special characters, otherwise weird downstream errors
+pattern = re.compile(r'[^a-zA-Z0-9_]')
+assert all([pattern.match(name) is None for name in annot.index]), "Sample names should not contain special characters"
 
 result_path = os.path.join(config["result_path"],module_name)
 

diff --git a/workflow/envs/ggplot.yaml b/workflow/envs/ggplot.yaml
@@ -8,4 +8,5 @@ dependencies:
   - r-ggally=2.1.2
   - r-ggrepel=0.9.1
   - r-reshape2=1.4.4
+  - r-stringi=1.8.4
   - r-data.table=1.15.4
diff --git a/workflow/scripts/distance_matrix.py b/workflow/scripts/distance_matrix.py
@@ -41,7 +41,8 @@
 if data_or_feature == "observations":
     if isinstance(n_observations, float) or n_observations==1:
         n_observations = int(math.floor(n_observations * data.shape[0]))
-    data = data.sample(n=n_observations, random_state=42)
+    if n_observations < data.shape[0]:
+        data = data.sample(n=n_observations, random_state=42)
 
 # Convert DataFrame to NumPy array
 # data_np = data.to_numpy()

diff --git a/workflow/scripts/plot_heatmap.R b/workflow/scripts/plot_heatmap.R
@@ -116,6 +116,10 @@ feat_dend <- as.dendrogram(feat_hc)
 limit <- ceiling(max(abs(quantile(data, c(0.01, 0.99)))))
 col_fun <- colorRamp2(c(-limit, 0, limit), c("blue", "white", "red"))
 
+randomColor <- function() { # returns one random hex color string "#xxxxxx": six hex digits (0-9, a-f) sampled with replacement
+  paste0("#", paste0(sample(c(0:9, letters[1:6]), size = 6, replace = TRUE), collapse = ""))
+}
+
 # make color mapping
 if (is.numeric(metadata[[metadata_col]])){
     #check if divergent or sequential?
@@ -124,7 +128,14 @@ if (is.numeric(metadata[[metadata_col]])){
 }else{
     n_cat <- length(unique(metadata[[metadata_col]]))
     qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
-    colors <- sample(unique(unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))),n_cat)
+    all_brewer_colors <- unique(unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals))))
+    if (n_cat <= length(all_brewer_colors)){
+        # get nice colors
+        colors <- sample(all_brewer_colors, n_cat)
+    }else{
+        # too many groups, get random colors
+        colors <- replicate(n_cat, randomColor())
+    }
     names(colors) <- unique(metadata[[metadata_col]])
     colors_list <- list()
     colors_list[[metadata_col]] <- colors# put here the mapped colors