From 138853e4e0a8be4dd8604a89d6b63015e580ce1c Mon Sep 17 00:00:00 2001
From: SarahOuologuem <sarah.ouologuem@gmx.de>
Date: Tue, 9 Apr 2024 10:28:23 +0000
Subject: [PATCH] add logging info

---
 panpipes/R_scripts/plotQC.R                   | 55 +++++++++++--------
 .../python_scripts/run_preprocess_atac.py     | 45 +++++++++++----
 .../python_scripts/run_preprocess_prot.py     | 49 ++++++++++-------
 panpipes/python_scripts/run_preprocess_rna.py |  8 +--
 4 files changed, 98 insertions(+), 59 deletions(-)

diff --git a/panpipes/R_scripts/plotQC.R b/panpipes/R_scripts/plotQC.R
index c8b1e7b0..ecff7b14 100644
--- a/panpipes/R_scripts/plotQC.R
+++ b/panpipes/R_scripts/plotQC.R
@@ -33,7 +33,7 @@ do_violin_plot <- function(df, qc, group){
     theme(axis.text.x=element_text(size=8, angle=90),
           axis.text.y=element_text(size=8))
   if(length(unique(df[[group]])) > 10){
-    message(paste0(sc, "has too many categories, removing legend"))
+    print(paste(sc, "has too many categories, removing legend"))
     g <- g + theme(legend.position="none")
   }
   
@@ -82,15 +82,12 @@ option_list <- list(
 )
 
 
-message("Plot QC data")
 opt <- parse_args(OptionParser(option_list=option_list))
 
 if(is.null(opt$outdir)) { opt$outdir <- paste0(getwd(),"/")}
 if(!grepl("\\/$", opt$outdir)){opt$outdir <- paste(opt$outdir, "/", sep = "")}
 if(!file.exists(opt$outdir)){dir.create(opt$outdir)}
 
-print("Running with options:")
-print(opt)
 
 run<- opt$outdir
 opt[which(opt=="NULL")] <- NULL
@@ -118,16 +115,16 @@ if (!is.null(opt$groupingvar)){
   source_facet <- source_facet[source_facet %in% colnames(data_plot)]
   source_facet <- unique(c(source_facet, keep_source) )
   if(length(source_facet)>0){
-    print("Facet plotting with")
     # add sample_id as a minimum requirement if it's not there already
     source_facet = unique(c("sample_id", source_facet))
-    print(source_facet)
+    print(paste("Facet plotting with",source_facet))
   }
 }else{
-  stop("i don't have the minimum variable _sampleid_ to facet on, will stop here")
+  stop("Need the minimum variable _sampleid_ to facet on, will stop here")
 }
 # RNA plots --------------------------------------------------------------------
 
+print("RNA plots")
 
 if(opt$scanpy_or_muon=="scanpy"){
   rna_data_plot <- data_plot
@@ -162,9 +159,8 @@ rna_source_facet <- gsub("^rna\\.", "",grep("^rna.", source_facet, value = TRUE)
 rna_source_facet <- unique(c(rna_source_facet, source_facet[!grepl("^rna.", source_facet)]))
 rna_source_facet <- rna_source_facet[rna_source_facet %in% colnames(rna_data_plot)]
 for (qc in qcmetrics){
-  print(qc)
+  print(paste("Plotting violin plots of", qc))
   for (sc in rna_source_facet){ #add gsub temp here
-    print(sc)
     g <- do_violin_plot(rna_data_plot, qc, sc)
     if (uniq_sample_id  > 50){width=12}else{width=6}
     ggsave(g, filename=file.path(outpath, paste0("violin_", sc, "_rna-", qc,".png")), type="cairo", width= width, height=6)
@@ -183,23 +179,26 @@ for (sc in rna_source_facet){
   }
   # plot once per source facet
   if (all(c("total_counts","n_genes_by_counts")%in% colnames(rna_data_plot))){
+    print("Plotting scatter plot of total_counts and n_genes_by_counts")
     g <- do_scatter_plot(rna_data_plot,x="total_counts",y="n_genes_by_counts", facet=sc)
     ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_rna-nUMI_v_rna-genes.png")), type="cairo",
            width= 3*ncols, height=3*nrows, dpi=200)
   }
   
   if (all(c("log1p_total_counts","log1p_n_genes_by_counts")%in% colnames(rna_data_plot))){
-    
+    print("Plotting scatter plot of log1p_total_counts and n_genes_by_counts")
     g <- do_scatter_plot(rna_data_plot,x="log1p_total_counts",y="log1p_n_genes_by_counts", facet=sc)
     ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_rna-log1p_nUMI_v_rna-log1p_genes.png")), type="cairo",
            width= 3*ncols, height=3*nrows, dpi=200)
   }
   if (all(c("total_counts","pct_counts_mt")%in% colnames(rna_data_plot))){
+    print("Plotting scatter plot of total_counts and pct_counts_mt")
     g <- do_scatter_plot(rna_data_plot,x="total_counts",y="pct_counts_mt", facet=sc)
     ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_rna-nUMI_v_rna-pct_mt.png")), type="cairo",
            width= 3*ncols, height=3*nrows, dpi=200)
   }
   if (all(c("n_genes_by_counts","doublet_scores","total_counts")%in% colnames(rna_data_plot))){
+    print("Plotting scatter plot of n_genes_by_counts and doublet_scores")
     g <- do_scatter_plot(rna_data_plot,x="n_genes_by_counts",y="doublet_scores",
                          hue="total_counts", facet=sc)
     ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_rna-genes_rna-doublet_scores_rna-numi.png")),    
@@ -213,7 +212,10 @@ for (sc in rna_source_facet){
 # Protein plots ----------------------------------------------------------------
 
 
+
 if(!is.null(opt$prot_qc_metrics)){
+  print("Protein plots")
+
   qcmetrics <- strsplit(opt$prot_qc_metrics,",")[[1]]
   prot_data_plot <- data_plot[,grep("^prot\\.",colnames(data_plot))]
   colnames(prot_data_plot) <- gsub("^prot\\.", "", colnames(prot_data_plot))
@@ -230,7 +232,7 @@ if(!is.null(opt$prot_qc_metrics)){
   # check these qc metrics are in the file
   qcmetrics <- qcmetrics[qcmetrics %in% colnames(prot_data_plot)]
   for (qc in qcmetrics){
-    print(qc)
+    print(paste("Plotting violin plots of", qc))
     for (sc in prot_source_facet){
       g <- do_violin_plot(prot_data_plot, qc, sc)
       if (uniq_sample_id  > 50){width=12}else{width=6}
@@ -255,28 +257,33 @@ if(!is.null(opt$prot_qc_metrics)){
       nrows=1
     }
     # plot once per source facet
-    message("protein scatter plots")  
+
     if (all(c("total_counts","n_prot_by_counts")%in% colnames(prot_data_plot))){
+      print("Plotting scatter plot of total_counts and n_prot_by_counts")
       g <- do_scatter_plot(prot_data_plot,x="total_counts",y="n_prot_by_counts", facet=sc)
       ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-nUMI_v_prot-prot.png")), type="cairo",
              width= 3*ncols, height=3*nrows, dpi=200)
     }
     if (all(c("log1p_total_counts","log1p_n_prot_by_counts")%in% colnames(prot_data_plot))){
+      print("Plotting scatter plot of log1p_total_counts and log1p_n_prot_by_counts")
       g <- do_scatter_plot(prot_data_plot,x="log1p_total_counts",y="log1p_n_prot_by_counts", facet=sc)
       ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-log1p_nUMI_v_prot-log1p_prot.png")), type="cairo",
              width= 3*ncols, height=3*nrows, dpi=200)
     }
     if (all(c("total_counts","pct_counts_isotype")%in% colnames(prot_data_plot))){
+      print("Plotting scatter plot of total_counts and pct_counts_isotype")
       g <- do_scatter_plot(prot_data_plot,x="total_counts",y="pct_counts_isotype", facet=sc)
       ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-nUMI_v_prot-pct_isotype.png")), type="cairo",
              width= 3*ncols, height=3*nrows, dpi=200)
     }
     if (all(c("total_counts","total_counts_isotype")%in% colnames(prot_data_plot))){
+      print("Plotting scatter plot of total_counts and total_counts_isotype")
       g <- do_scatter_plot(prot_data_plot,x="total_counts",y="total_counts_isotype", facet=sc)
       ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-nUMI_v_prot-total_counts_isotype.png")), type="cairo",
              width= 3*ncols, height=3*nrows, dpi=200)
     }
     if (all(c("log1p_total_counts","log1p_total_counts_isotype", "isotype_exclude")%in% colnames(prot_data_plot))){
+      print("Plotting scatter plot of log1p_total_counts and log1p_total_counts_isotype")
       g <- do_scatter_plot(prot_data_plot,x="log1p_total_counts",y="log1p_total_counts_isotype", hue="isotype_exclude", facet=sc)
       ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-log1p_nUMI_v_prot-log1p_total_counts_isotype.png")), type="cairo",
              width= 3*ncols, height=3*nrows, dpi=200)
@@ -288,7 +295,7 @@ if(!is.null(opt$prot_qc_metrics)){
 
 
 if(!is.null(opt$atac_qc_metrics)){
-  message("Atac plots")
+  print("ATAC plots")
   atac_data_plot <- data_plot[,grep("^atac\\.",colnames(data_plot))]
   colnames(atac_data_plot) <- gsub("^atac\\.", "", colnames(atac_data_plot))
   atac_source_facet <- gsub("^atac\\.", "",grep("^atac.", source_facet, value = TRUE))
@@ -304,7 +311,7 @@ if(!is.null(opt$atac_qc_metrics)){
   # check these qc metrics are in the file
   qcmetrics <- qcmetrics[qcmetrics %in% colnames(atac_data_plot)]
   for (qc in qcmetrics){
-    print(qc)
+    print(paste("Plotting violin plots of", qc))
     for (sc in atac_source_facet){
       g <- do_violin_plot(atac_data_plot, qc, sc)
       if (uniq_sample_id  > 50){width=12}else{width=6}
@@ -317,7 +324,7 @@ if(!is.null(opt$atac_qc_metrics)){
 
 
 if (!is.null(opt$rep_qc_metrics)) {
-  message("Repertoire plots")
+  print("Repertoire plots")
   qcmetrics <- strsplit(opt$rep_qc_metrics,",")[[1]]
   qcmetrics <- gsub("rep:", "", qcmetrics)
   rep_data_plot <- data_plot[,grep("^rep\\.",colnames(data_plot))]
@@ -338,6 +345,7 @@ if (!is.null(opt$rep_qc_metrics)) {
   # check these qc metrics are in the file
   qcmetrics <- qcmetrics[qcmetrics %in% colnames(rep_data_plot)]
   for (qc in qcmetrics){
+    print(paste("Plotting bar plots of", qc))
     for (sc in rep_source_facet){
       g <- do_bar_plot(rep_data_plot, qc, sc)
       if (uniq_sample_id  > 50){width=12}else{width=6}
@@ -354,6 +362,7 @@ if (!is.null(opt$rep_qc_metrics)) {
 
 
 if(!is.null(opt$prot_qc_metrics)){
+print("RNA vs. Protein plots")
 
 # rna vs prot plots ----------------------------------------------------------------
   
@@ -363,8 +372,6 @@ if(!is.null(opt$prot_qc_metrics)){
   
 
   for (sc in source_facet){
-    
-    message(sc)
     uniq_source <- nrow(unique(data_plot[sc]))
     if(uniq_source >6){
       ncols=6
@@ -373,28 +380,32 @@ if(!is.null(opt$prot_qc_metrics)){
       ncols=uniq_source
       nrows=1
     }
-    message("rna v protein scatter plots")  
     if (all(c("rna.total_counts","prot.total_counts")%in% colnames(data_plot))){
+      print("Plotting scatter plot of rna.total_counts and prot.total_counts")
       g <- do_scatter_plot(data_plot,x="rna.total_counts",y="prot.total_counts", facet=sc)
       ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-nUMI_v_rna-nUMI.png")), type="cairo",
               width= 3*ncols, height=3*nrows, dpi=200)
     }
     if (all(c("rna.log1p_total_counts","prot.log1p_total_counts")%in% colnames(data_plot))){
+      print("Plotting scatter plot of rna.log1p_total_counts and prot.log1p_total_counts")
       g <- do_scatter_plot(data_plot,x="rna.log1p_total_counts",y="prot.log1p_total_counts", facet=sc)
       ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-log1p_nUMI_v_rna-log1p_nUMI.png")), type="cairo",
               width= 3*ncols, height=3*nrows, dpi=200)
     }
     if (all(c("rna.total_counts","prot.total_counts_isotype")%in% colnames(data_plot))){
+      print("Plotting scatter plot of rna.total_counts and prot.total_counts_isotype")
       g <-  do_scatter_plot(data_plot,x="rna.total_counts",y="prot.total_counts_isotype", facet=sc)
       ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-nUMI_v_prot-counts_isotype.png")), type="cairo",
               width= 3*ncols, height=3*nrows, dpi=200)
     }    
     if (all(c("rna.log1p_total_counts","prot.log1p_total_counts_isotype") %in% colnames(data_plot))){
+      print("Plotting scatter plot of rna.log1p_total_counts and prot.log1p_total_counts_isotype")
       g <-  do_scatter_plot(data_plot,x="rna.log1p_total_counts",y="prot.log1p_total_counts_isotype", facet=sc)
       ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-log1p_nUMI_v_prot-log1p_counts_isotype.png")), type="cairo",
               width= 3*ncols, height=3*nrows, dpi=200)
     }
     if (all(c("rna.doublet_scores","prot.log1p_total_counts") %in% colnames(data_plot))){
+      print("Plotting scatter plot of rna.doublet_scores and prot.log1p_total_counts")
       g <-  do_scatter_plot(data_plot,x="rna.doublet_scores",y="prot.log1p_total_counts", facet=sc)
       ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-doublet_scores_v_prot-log1p_nUMI.png")), type="cairo",
               width= 3*ncols, height=3*nrows, dpi=200)
@@ -408,11 +419,9 @@ if(!is.null(opt$prot_qc_metrics)){
 
 
 sprefix <- opt$sampleprefix
-print(sprefix)
-message ("saving some counts tables for references")
-
 
 if(opt$prefilter){
+  print("Saving counts tables for references")
   if(all(c("pct_counts_mt", "pct_counts_hb", "n_genes_by_counts", "doublet_scores") %in% colnames(rna_data_plot))){
     f1 <- rna_data_plot %>% 
       dplyr::filter(pct_counts_mt<=20 & pct_counts_hb<=70 &n_genes_by_counts>=100 & doublet_scores<=0.25) %>%
@@ -480,7 +489,7 @@ if(opt$prefilter){
   }
   
 }else{
-  message("producing files with final counts for cells after filtering")
+  print("Producing files with final counts for cells after filtering")
   
   baseline <- rna_data_plot %>% 
     group_by_at(.vars=c(rna_source_facet)) %>% 
@@ -500,4 +509,4 @@ if(opt$prefilter){
 }
 
 
-message("done")
+print("Done")
diff --git a/panpipes/python_scripts/run_preprocess_atac.py b/panpipes/python_scripts/run_preprocess_atac.py
index 06645905..5ccae406 100644
--- a/panpipes/python_scripts/run_preprocess_atac.py
+++ b/panpipes/python_scripts/run_preprocess_atac.py
@@ -88,7 +88,6 @@
 
 args, opt = parser.parse_known_args()
 
-L.info("running with args:")
 L.info(args)
 
 figdir = args.figdir
@@ -99,14 +98,18 @@
 sc.settings.figdir = figdir
 sc.set_figure_params(scanpy=True, fontsize=14, dpi=300, facecolor='white', figsize=(5,5))
 
+L.info("Reading in MuData from '%s'" % args.input_mudata)
 mdata = mu.read(args.input_mudata)
 atac = mdata.mod['atac']
 
 # save raw counts
+L.info("Checking if raw data is available")
 if X_is_raw(atac):
+    L.info("Saving raw counts from .X to .layers['raw_counts']")
     atac.layers["raw_counts"] = atac.X.copy()
-elif "raw_counts" in atac.layers :
-    L.info("raw_counts layer already exists")
+elif "raw_counts" in atac.layers:
+    L.info(".layers['raw_counts'] already exists and copying it to .X")
+    atac.X = atac.layers['raw_counts'].copy()
 else:
     L.error("X is not raw data and raw_counts layer not found")
     sys.exit("X is not raw data and raw_counts layer not found")
@@ -115,43 +118,49 @@
 # NORMALIZE
 if X_is_raw(atac):
     if args.binarize is True:
-        L.info("binarizing peak count matrix")
+        L.info("Binarizing peak count matrix")
         ac.pp.binarize(atac)    
 
     if args.normalize is not None:
+        L.info("Normalizing data")
         if args.normalize == "log1p":
             if args.binarize:
                 L.warning("Careful, you have decided to binarize data but also to normalize per cell and log1p. Not sure this is meaningful")
             sc.pp.normalize_per_cell(atac,counts_per_cell_after=1e4)
             sc.pp.log1p(atac)
+            L.info("Saving normalized counts to layer 'logged_counts'")
             atac.layers["logged_counts"] = atac.X.copy()
         elif args.normalize == "TFIDF":
             if args.TFIDF_flavour=="signac":
                 ac.pp.tfidf(atac, scale_factor=1e4, log_tfidf=True, log_tf=False, log_idf=False)
+                L.info("Saving normalized counts to layer 'signac_norm'")
                 atac.layers["signac_norm"] = atac.X.copy()
             elif args.TFIDF_flavour=="logTF":
-                ac.pp.tfidf(atac, scale_factor=1e4, log_tfidf=False, log_tf=True, log_idf=False)    
+                ac.pp.tfidf(atac, scale_factor=1e4, log_tfidf=False, log_tf=True, log_idf=False)  
+                L.info("Saving normalized counts to layer 'logTF_norm'")  
                 atac.layers["logTF_norm"] = atac.X.copy()
             elif args.TFIDF_flavour=="logIDF":
                 ac.pp.tfidf(atac, scale_factor=1e4, log_tfidf=False, log_tf=False, log_idf=True)
+                L.info("Saving normalized counts to layer 'logIDF_norm'")
                 atac.layers["logIDF_norm"] = atac.X.copy()
     else: 
-        L.error("We Require a normalization strategy for ATAC")
-        sys.exit("Exiting because no normalization was specified. If None was intended, check your pipeline.yml file")
+        L.error("A normalization strategy is needed for ATAC. For that, please specify the 'normalize' parameter in the pipeline.yml")
+        sys.exit("A normalization strategy is needed for ATAC. For that, please specify the 'normalize' parameter in the pipeline.yml")
 
 
 #highly variable feature selection
 if "highly_variable" in atac.var: 
     L.warning( "You have %s Highly Variable Features", np.sum(atac.var.highly_variable))
 else:
-
     if args.feature_selection_flavour == "scanpy":
+        L.info("Running HVF selection")
         if args.n_top_features is None:
             sc.pp.highly_variable_genes(atac, min_mean=float(args.min_mean), 
             max_mean=float(args.max_mean), min_disp=float(args.min_disp))
         else:
             sc.pp.highly_variable_genes(atac, n_top_genes=int(args.n_top_features))
     elif args.feature_selection_flavour == "signac":
+        L.info("Running HVF selection")
         findTopFeatures_pseudo_signac(atac, args.min_cutoff)
     else:
         L.warning("No highly variable feature selection was performed!")
@@ -160,10 +169,15 @@
     L.warning( "You have %s Highly Variable Features", np.sum(atac.var.highly_variable))
 
 # filter by hvgs
-filter_by_hvgs = args.filter_by_hvf
+if args.filter_by_hvf == "False": 
+    filter_by_hvgs = False
+elif args.filter_by_hvf == "True":
+    filter_by_hvgs = True
+else: 
+    filter_by_hvgs = args.filter_by_hvf
 
 if filter_by_hvgs:
-    L.info("filtering object to only include highly variable features")
+    L.info("Subsetting object to only include HVGs")
     mdata.mod["atac"] = atac[:, atac.var.highly_variable]
     if isinstance(mdata, mu.MuData):
         mdata.update()
@@ -176,13 +190,17 @@
 # and were first introduced for the analysis of scATAC-seq data by Cusanovich et al. 2015.
 
 if args.dimred == "LSI" and args.normalize == "TFIDF":
+    L.info("Running LSI")
     lsi(adata=atac, num_components=int(args.n_comps))
-else:
+elif args.dimred == "LSI" and args.normalize == "log1p":
     L.info("Applying LSI on logged_counts is not recommended. Changing dimred to PCA")
     args.dimred = "PCA"
 if args.dimred == "PCA":
+    L.info("Scaling data")
     sc.pp.scale(atac)
+    L.info("Saving scaled counts to layer 'scaled_counts'")
     atac.layers["scaled_counts"] = atac.X.copy()
+    L.info("Running PCA")
     sc.tl.pca(atac, n_comps=int(args.n_comps), svd_solver=args.solver, 
     random_state=0)
 
@@ -193,11 +211,13 @@
 #some plotting before removing the components
 
 if args.dimred =='PCA':
+    L.info("Plotting PCA")
     sc.pl.pca_variance_ratio(atac, log=True, n_pcs=int(args.n_comps), save=".png")
     sc.pl.pca(atac, color=col_use, save = "_vars.png")
     sc.pl.pca_loadings(atac, components="1,2,3,4,5,6", save = ".png")
     sc.pl.pca_overview(atac, save = ".png")
 if args.dimred =='LSI':
+    L.info("Plotting LSI")
     sc.pl.embedding(atac, color=col_use,basis="X_lsi", save = ".png")
     correlation_df = calc_tech_corr(extract_lsi(atac), 
         tech_covariates = ['n_genes_by_counts', 'total_counts'])
@@ -209,7 +229,7 @@
 
 
 if args.dim_remove is not None:
-    L.info("removing component from dimred")
+    L.info("Removing component %s from %s" % (args.dim_remove, args.dimred))
     dimrem=int(args.dim_remove)
     if args.dimred == "LSI":
         atac.obsm['X_lsi'] = atac.obsm['X_lsi'][:,dimrem:]
@@ -220,6 +240,7 @@
         atac.varm["PCs"] = atac.varm["PCs"][:,dimrem:]
         
 mdata.update()
+L.info("Saving updated MuData to '%s'" % args.output_mudata)
 mdata.write(args.output_mudata)
 
 L.info("Done")
diff --git a/panpipes/python_scripts/run_preprocess_prot.py b/panpipes/python_scripts/run_preprocess_prot.py
index 047a2468..8454ef7c 100644
--- a/panpipes/python_scripts/run_preprocess_prot.py
+++ b/panpipes/python_scripts/run_preprocess_prot.py
@@ -83,48 +83,57 @@
 
 
 # load filtered data - if use_muon is True this will return a mudata object, else returns an mudata object
-L.info("reading filtered mudata object")
+L.info("Reading in MuData from '%s'" % args.filtered_mudata)
 try:
     all_mdata = mu.read(args.filtered_mudata)
 except FileNotFoundError:
+    L.error("filtered_mudata file not found")
     sys.exit("filtered_mudata file not found")
 
 # load bg data
 if 'dsb' in norm_methods:
     if args.bg_mudata is not None:
-        L.info("reading bg file object")
+        L.info("Reading in background MuData from " + args.bg_mudata)
         try:
             all_mdata_bg = mu.read(args.bg_mudata)
             # subset to just the channel 
         except FileNotFoundError:
-            sys.exit("bg_mudata object not found")
+            L.error("Background MuData object not found in " + args.bg_mudata)
+            sys.exit("Background MuData object not found in " + args.bg_mudata)
     else:
-        sys.exit("must specify a bg mudata to run dsb, containing both rna and prot")
+        L.error("You must specify a background MuData to run dsb, containing both rna and prot")
+        sys.exit("You must specify a background MuData to run dsb, containing both rna and prot")
     
-    # checking that the samne proteins are in foreground and background (since foreground might have been filtered)
+    # checking that the same proteins are in foreground and background (since foreground might have been filtered)
+    L.info("Checking that the same proteins are in foreground and background")
     if len(all_mdata['prot'].var_names) != len(all_mdata_bg['prot'].var_names):
         mu.pp.filter_var(all_mdata_bg['prot'], all_mdata['prot'].var_names)
         all_mdata_bg.update()
 
 
 # find isotypes columns 
+L.info("Looking for isotypes column in .var['isotype']")
 if "isotype" in all_mdata["prot"].var.columns:
     isotypes = list(all_mdata["prot"].var_names[all_mdata["prot"].var.isotype])
     # exclude isotypes from downstream analysis by setting them as the only not highly variable genes.
+    L.info("Excluding isotypes from downstream analysis by setting them as the only not HVGs")
     all_mdata['prot'].var['highly_variable'] = ~all_mdata['prot'].var['isotype'] 
 else:
+    L.info("No isotype column found in .var")
     isotypes = None
     
-L.info("isotypes found:")
+L.info("The following isotypes were found:")
 L.info(isotypes)
 
 
 # save raw counts
+L.info("Checking if raw data is available")
 if pnp.scmethods.X_is_raw(all_mdata["prot"]):
+    L.info("Saving raw counts from .X to .layers['raw_counts']")
     all_mdata["prot"].layers["raw_counts"] = all_mdata["prot"].X.copy()
 elif "raw_counts" in all_mdata["prot"].layers :
-    L.info("raw_counts layer already exists")
-    all_mdata.X = all_mdata["prot"].layers['raw_counts'].copy()
+    L.info(".layers['raw_counts'] already exists and copying it to .X")
+    all_mdata["prot"].X = all_mdata["prot"].layers['raw_counts'].copy()
 else:
     L.error("X is not raw data and raw_counts layer not found")
     sys.exit("X is not raw data and raw_counts layer not found")
@@ -202,10 +211,12 @@
         all_mdata["prot"].X = all_mdata["prot"].layers["raw_counts"].copy()
         # run normalise
         # this stores a layer named after the method as well as overwriting X
+        L.info("Normalizing data with CLR")
         pnp.scmethods.run_prot_normalise(mdata=all_mdata, 
                 mdata_bg=None,
                 method="clr",
                 clr_margin=int(args.clr_margin))
+        L.info("Plotting Ridgeplot")
         pnp.plotting.ridgeplot(all_mdata["prot"], features=plot_features, layer="clr",  splitplot=6)
         plt.savefig(os.path.join(args.figpath, "clr_ridgeplot.png"))
         if isotypes is not None:
@@ -223,15 +234,18 @@
         mu.pl.histogram(all_mdata_bg["rna"], ["log10umi"], bins=50)
         plt.savefig(os.path.join(args.figpath, "all_log10umi.png"))
         # this stores a layer named after the method as well as overwriting X
+        L.info("Normalizing data with dsb")
         pnp.scmethods.run_prot_normalise(mdata=all_mdata, 
                 mdata_bg=all_mdata_bg, 
                 method="dsb",
                 isotypes=isotypes) 
         # apply quantile clipping # discussed in FAQs https://cran.r-project.org/web/packages/dsb/vignettes/dsb_normalizing_CITEseq_data.html
         if args.quantile_clipping:
+            L.info("Quantile clipping")
             all_mdata['prot'].layers['dsb'] = pnp.scmethods.quantile_clipping(all_mdata['prot'], layer="dsb", inplace=False)
             all_mdata['prot'].X = all_mdata['prot'].layers['dsb'].copy()
         # plot ridgeplot
+        L.info("Plotting Ridgeplot")
         pnp.plotting.ridgeplot(all_mdata["prot"], features=plot_features, layer="dsb",  splitplot=6)
         plt.savefig(os.path.join(args.figpath, "dsb_ridgeplot.png"))
         if isotypes is not None:
@@ -247,12 +261,12 @@
     # run pca on X 
     # this basically makes no sense if you have a small panel of antibodies.
     if args.run_pca:
-        L.info("running pca")
+        L.info("Running PCA")
 
         if all_mdata['prot'].var.shape[0] < int(args.n_pcs):
-            L.info("You have less features than number of PCs you intend to calculate")
+            L.warning("You have less features (%s) than number of PCs (%t) you intend to calculate." % (all_mdata['prot'].var.shape[0], args.n_pcs))
             n_pcs = all_mdata['prot'].var.shape[0] - 1
-            L.info("Setting n PCS to %i" % int(n_pcs))
+            L.info("Setting number of PCS to %i" % int(n_pcs))
         else:
             n_pcs = int(args.n_pcs)
         sc.tl.pca(all_mdata['prot'], n_comps=n_pcs, 
@@ -260,29 +274,24 @@
                         random_state=0) 
 
         # do some plots!
+        L.info("Plotting PCA")
         sc.pl.pca_variance_ratio(all_mdata['prot'], log=True, n_pcs=n_pcs, save=".png")
         
-        L.info(args.color_by)
         
         col_variables = args.color_by.split(",")
-        L.info("col_variables")
-        L.info(col_variables)
         
         col_variables = [a.strip() for a in col_variables]
-
-        L.info("col_variables now")
-        L.info(col_variables)
         
         col_use = [var for var in col_variables if var in all_mdata['prot'].obs.columns]
-        L.info("col_use")
-        L.info(col_use)
+
         sc.pl.pca(all_mdata['prot'], color=col_use, save = "_vars.png")
         sc.pl.pca_loadings(all_mdata['prot'], components="1,2,3,4,5,6", save = ".png")
         sc.pl.pca_overview(all_mdata['prot'], save = ".png")
 
     if args.save_mudata_path is not None:
         all_mdata.update()
+        L.info("Saving updated MuData to '%s'" % args.save_mudata_path)
         all_mdata.write(args.save_mudata_path)
 
 
-L.info("done")
+L.info("Done")
diff --git a/panpipes/python_scripts/run_preprocess_rna.py b/panpipes/python_scripts/run_preprocess_rna.py
index ccb17d7a..ad2eba55 100644
--- a/panpipes/python_scripts/run_preprocess_rna.py
+++ b/panpipes/python_scripts/run_preprocess_rna.py
@@ -92,7 +92,7 @@
 if args.hvg_batch_key is not None:
     columns = [x.strip() for x in args.hvg_batch_key.split(",")]
     if len(columns) > 1: 
-        L.info("combining batch columns into one column 'hvg_batch_key'")
+        L.info("Combining batch columns into one column 'hvg_batch_key'")
         adata.obs["hvg_batch_key"] = adata.obs[columns].apply(lambda x: '|'.join(x), axis=1)
         # make sure that batch is a categorical
         adata.obs["hvg_batch_key"] = adata.obs["hvg_batch_key"].astype("category")
@@ -119,10 +119,10 @@
 # except when flavor='seurat_v3' in which count data is expected.
 # change the order accordingly
 if X_is_raw(adata):
-    L.info("Normalise, log and calculate HVGs")
+    L.info("Normalize, log and calculate HVGs")
     if args.flavor == "seurat_v3":
         if args.n_top_genes is None:
-            raise ValueError("if seurat_v3 is used you must give a n_top_genes value")
+            raise ValueError("If seurat_v3 is used, you must give a n_top_genes value")
         else:
             sc.pp.highly_variable_genes(adata, flavor="seurat_v3", 
                                         n_top_genes=int(args.n_top_genes),
@@ -213,7 +213,7 @@
 L.info("Running PCA")
 
 if adata.var.shape[0] < int(args.n_pcs):
-    L.warning("You have less features (%s) than number of PCs (%t) you intend to calculate." % (adata.var.shape[0], args.n_pcs))
+    L.warning("You have less features (%s) than number of PCs (%s) you intend to calculate." % (adata.var.shape[0], args.n_pcs))
     n_pcs = adata.var.shape[0] - 1
     L.info("Setting number of PCS to %i" % int(n_pcs))    
 else: