From 138853e4e0a8be4dd8604a89d6b63015e580ce1c Mon Sep 17 00:00:00 2001 From: SarahOuologuem Date: Tue, 9 Apr 2024 10:28:23 +0000 Subject: [PATCH] add logging info --- panpipes/R_scripts/plotQC.R | 55 +++++++++++-------- .../python_scripts/run_preprocess_atac.py | 45 +++++++++++---- .../python_scripts/run_preprocess_prot.py | 49 ++++++++++------- panpipes/python_scripts/run_preprocess_rna.py | 8 +-- 4 files changed, 98 insertions(+), 59 deletions(-) diff --git a/panpipes/R_scripts/plotQC.R b/panpipes/R_scripts/plotQC.R index c8b1e7b0..ecff7b14 100644 --- a/panpipes/R_scripts/plotQC.R +++ b/panpipes/R_scripts/plotQC.R @@ -33,7 +33,7 @@ do_violin_plot <- function(df, qc, group){ theme(axis.text.x=element_text(size=8, angle=90), axis.text.y=element_text(size=8)) if(length(unique(df[[group]])) > 10){ - message(paste0(sc, "has too many categories, removing legend")) + print(paste(sc, "has too many categories, removing legend")) g <- g + theme(legend.position="none") } @@ -82,15 +82,12 @@ option_list <- list( ) -message("Plot QC data") opt <- parse_args(OptionParser(option_list=option_list)) if(is.null(opt$outdir)) { opt$outdir <- paste0(getwd(),"/")} if(!grepl("\\/$", opt$outdir)){opt$outdir <- paste(opt$outdir, "/", sep = "")} if(!file.exists(opt$outdir)){dir.create(opt$outdir)} -print("Running with options:") -print(opt) run<- opt$outdir opt[which(opt=="NULL")] <- NULL @@ -118,16 +115,16 @@ if (!is.null(opt$groupingvar)){ source_facet <- source_facet[source_facet %in% colnames(data_plot)] source_facet <- unique(c(source_facet, keep_source) ) if(length(source_facet)>0){ - print("Facet plotting with") # add sample_id as a minimum requirement if it's not there already source_facet = unique(c("sample_id", source_facet)) - print(source_facet) + print(paste("Facet plotting with",source_facet)) } }else{ - stop("i don't have the minimum variable _sampleid_ to facet on, will stop here") + stop("Need the minimum variable _sampleid_ to facet on, will stop here") } # RNA plots -------------------------------------------------------------------- +print("RNA plots") if(opt$scanpy_or_muon=="scanpy"){ rna_data_plot <- data_plot @@ -162,9 +159,8 @@ rna_source_facet <- gsub("^rna\\.", "",grep("^rna.", source_facet, value = TRUE) rna_source_facet <- unique(c(rna_source_facet, source_facet[!grepl("^rna.", source_facet)])) rna_source_facet <- rna_source_facet[rna_source_facet %in% colnames(rna_data_plot)] for (qc in qcmetrics){ - print(qc) + print(paste("Plotting violin plots of", qc)) for (sc in rna_source_facet){ #add gsub temp here - print(sc) g <- do_violin_plot(rna_data_plot, qc, sc) if (uniq_sample_id > 50){width=12}else{width=6} ggsave(g, filename=file.path(outpath, paste0("violin_", sc, "_rna-", qc,".png")), type="cairo", width= width, height=6) @@ -183,23 +179,26 @@ for (sc in rna_source_facet){ } # plot once per source facet if (all(c("total_counts","n_genes_by_counts")%in% colnames(rna_data_plot))){ + print("Plotting scatter plot of total_counts and n_genes_by_counts") g <- do_scatter_plot(rna_data_plot,x="total_counts",y="n_genes_by_counts", facet=sc) ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_rna-nUMI_v_rna-genes.png")), type="cairo", width= 3*ncols, height=3*nrows, dpi=200) } if (all(c("log1p_total_counts","log1p_n_genes_by_counts")%in% colnames(rna_data_plot))){ - + print("Plotting scatter plot of log1p_total_counts and n_genes_by_counts") g <- do_scatter_plot(rna_data_plot,x="log1p_total_counts",y="log1p_n_genes_by_counts", facet=sc) ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_rna-log1p_nUMI_v_rna-log1p_genes.png")), type="cairo", width= 3*ncols, height=3*nrows, dpi=200) } if (all(c("total_counts","pct_counts_mt")%in% colnames(rna_data_plot))){ + print("Plotting scatter plot of total_counts and pct_counts_mt") g <- do_scatter_plot(rna_data_plot,x="total_counts",y="pct_counts_mt", facet=sc) ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_rna-nUMI_v_rna-pct_mt.png")), type="cairo", width= 3*ncols, height=3*nrows, dpi=200) } if (all(c("n_genes_by_counts","doublet_scores","total_counts")%in% colnames(rna_data_plot))){ + print("Plotting scatter plot of n_genes_by_counts and doublet_scores") g <- do_scatter_plot(rna_data_plot,x="n_genes_by_counts",y="doublet_scores", hue="total_counts", facet=sc) ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_rna-genes_rna-doublet_scores_rna-numi.png")), @@ -213,7 +212,10 @@ for (sc in rna_source_facet){ # Protein plots ---------------------------------------------------------------- + if(!is.null(opt$prot_qc_metrics)){ + print("Protein plots") + qcmetrics <- strsplit(opt$prot_qc_metrics,",")[[1]] prot_data_plot <- data_plot[,grep("^prot\\.",colnames(data_plot))] colnames(prot_data_plot) <- gsub("^prot\\.", "", colnames(prot_data_plot)) @@ -230,7 +232,7 @@ if(!is.null(opt$prot_qc_metrics)){ # check these qc metrics are in the file qcmetrics <- qcmetrics[qcmetrics %in% colnames(prot_data_plot)] for (qc in qcmetrics){ - print(qc) + print(paste("Plotting violin plots of", qc)) for (sc in prot_source_facet){ g <- do_violin_plot(prot_data_plot, qc, sc) if (uniq_sample_id > 50){width=12}else{width=6} @@ -255,28 +257,33 @@ if(!is.null(opt$prot_qc_metrics)){ nrows=1 } # plot once per source facet - message("protein scatter plots") + if (all(c("total_counts","n_prot_by_counts")%in% colnames(prot_data_plot))){ + print("Plotting scatter plot of total_counts and n_prot_by_counts") g <- do_scatter_plot(prot_data_plot,x="total_counts",y="n_prot_by_counts", facet=sc) ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-nUMI_v_prot-prot.png")), type="cairo", width= 3*ncols, height=3*nrows, dpi=200) } if (all(c("log1p_total_counts","log1p_n_prot_by_counts")%in% colnames(prot_data_plot))){ + print("Plotting scatter plot of log1p_total_counts and log1p_n_prot_by_counts") g <- do_scatter_plot(prot_data_plot,x="log1p_total_counts",y="log1p_n_prot_by_counts", facet=sc) ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-log1p_nUMI_v_prot-log1p_prot.png")), type="cairo", width= 3*ncols, height=3*nrows, dpi=200) } if (all(c("total_counts","pct_counts_isotype")%in% colnames(prot_data_plot))){ + print("Plotting scatter plot of total_counts and pct_counts_isotype") g <- do_scatter_plot(prot_data_plot,x="total_counts",y="pct_counts_isotype", facet=sc) ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-nUMI_v_prot-pct_isotype.png")), type="cairo", width= 3*ncols, height=3*nrows, dpi=200) } if (all(c("total_counts","total_counts_isotype")%in% colnames(prot_data_plot))){ + print("Plotting scatter plot of total_counts and total_counts_isotype") g <- do_scatter_plot(prot_data_plot,x="total_counts",y="total_counts_isotype", facet=sc) ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-nUMI_v_prot-total_counts_isotype.png")), type="cairo", width= 3*ncols, height=3*nrows, dpi=200) } if (all(c("log1p_total_counts","log1p_total_counts_isotype", "isotype_exclude")%in% colnames(prot_data_plot))){ + print("Plotting scatter plot of log1p_total_counts and log1p_total_counts_isotype") g <- do_scatter_plot(prot_data_plot,x="log1p_total_counts",y="log1p_total_counts_isotype", hue="isotype_exclude", facet=sc) ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "_prot-log1p_nUMI_v_prot-log1p_total_counts_isotype.png")), type="cairo", width= 3*ncols, height=3*nrows, dpi=200) @@ -288,7 +295,7 @@ if(!is.null(opt$prot_qc_metrics)){ if(!is.null(opt$atac_qc_metrics)){ - message("Atac plots") + print("ATAC plots") atac_data_plot <- data_plot[,grep("^atac\\.",colnames(data_plot))] colnames(atac_data_plot) <- gsub("^atac\\.", "", colnames(atac_data_plot)) atac_source_facet <- gsub("^atac\\.", "",grep("^atac.", source_facet, value = TRUE)) @@ -304,7 +311,7 @@ if(!is.null(opt$atac_qc_metrics)){ # check these qc metrics are in the file qcmetrics <- qcmetrics[qcmetrics %in% colnames(atac_data_plot)] for (qc in qcmetrics){ - print(qc) + print(paste("Plotting violin plots of", qc)) for (sc in atac_source_facet){ g <- do_violin_plot(atac_data_plot, qc, sc) if (uniq_sample_id > 50){width=12}else{width=6} @@ -317,7 +324,7 @@ if(!is.null(opt$atac_qc_metrics)){ if (!is.null(opt$rep_qc_metrics)) { - message("Repertoire plots") + print("Repertoire plots") qcmetrics <- strsplit(opt$rep_qc_metrics,",")[[1]] qcmetrics <- gsub("rep:", "", qcmetrics) rep_data_plot <- data_plot[,grep("^rep\\.",colnames(data_plot))] @@ -338,6 +345,7 @@ if (!is.null(opt$rep_qc_metrics)) { # check these qc metrics are in the file qcmetrics <- qcmetrics[qcmetrics %in% colnames(rep_data_plot)] for (qc in qcmetrics){ + print(paste("Plotting bar plots of", qc)) for (sc in rep_source_facet){ g <- do_bar_plot(rep_data_plot, qc, sc) if (uniq_sample_id > 50){width=12}else{width=6} @@ -354,6 +362,7 @@ if (!is.null(opt$rep_qc_metrics)) { if(!is.null(opt$prot_qc_metrics)){ +print("RNA vs. Protein plots") # rna vs prot plots ---------------------------------------------------------------- @@ -363,8 +372,6 @@ if(!is.null(opt$prot_qc_metrics)){ for (sc in source_facet){ - - message(sc) uniq_source <- nrow(unique(data_plot[sc])) if(uniq_source >6){ ncols=6 @@ -373,28 +380,32 @@ if(!is.null(opt$prot_qc_metrics)){ ncols=uniq_source nrows=1 } - message("rna v protein scatter plots") if (all(c("rna.total_counts","prot.total_counts")%in% colnames(data_plot))){ + print("Plotting scatter plot of rna.total_counts and prot.total_counts") g <- do_scatter_plot(data_plot,x="rna.total_counts",y="prot.total_counts", facet=sc) ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-nUMI_v_rna-nUMI.png")), type="cairo", width= 3*ncols, height=3*nrows, dpi=200) } if (all(c("rna.log1p_total_counts","prot.log1p_total_counts")%in% colnames(data_plot))){ + print("Plotting scatter plot of rna.log1p_total_counts and prot.log1p_total_counts") g <- do_scatter_plot(data_plot,x="rna.log1p_total_counts",y="prot.log1p_total_counts", facet=sc) ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-log1p_nUMI_v_rna-log1p_nUMI.png")), type="cairo", width= 3*ncols, height=3*nrows, dpi=200) } if (all(c("rna.total_counts","prot.total_counts_isotype")%in% colnames(data_plot))){ + print("Plotting scatter plot of rna.total_counts and prot.total_counts_isotype") g <- do_scatter_plot(data_plot,x="rna.total_counts",y="prot.total_counts_isotype", facet=sc) ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-nUMI_v_prot-counts_isotype.png")), type="cairo", width= 3*ncols, height=3*nrows, dpi=200) } if (all(c("rna.log1p_total_counts","prot.log1p_total_counts_isotype") %in% colnames(data_plot))){ + print("Plotting scatter plot of rna.log1p_total_counts and prot.log1p_total_counts_isotype") g <- do_scatter_plot(data_plot,x="rna.log1p_total_counts",y="prot.log1p_total_counts_isotype", facet=sc) ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-log1p_nUMI_v_prot-log1p_counts_isotype.png")), type="cairo", width= 3*ncols, height=3*nrows, dpi=200) } if (all(c("rna.doublet_scores","prot.log1p_total_counts") %in% colnames(data_plot))){ + print("Plotting scatter plot of rna.doublet_scores and prot.log1p_total_counts") g <- do_scatter_plot(data_plot,x="rna.doublet_scores",y="prot.log1p_total_counts", facet=sc) ggsave(g, filename=file.path(outpath, paste0("scatter_", sc, "-doublet_scores_v_prot-log1p_nUMI.png")), type="cairo", width= 3*ncols, height=3*nrows, dpi=200) @@ -408,11 +419,9 @@ if(!is.null(opt$prot_qc_metrics)){ sprefix <- opt$sampleprefix -print(sprefix) -message ("saving some counts tables for references") - if(opt$prefilter){ + print("Saving counts tables for references") if(all(c("pct_counts_mt", "pct_counts_hb", "n_genes_by_counts", "doublet_scores") %in% colnames(rna_data_plot))){ f1 <- rna_data_plot %>% dplyr::filter(pct_counts_mt<=20 & pct_counts_hb<=70 &n_genes_by_counts>=100 & doublet_scores<=0.25) %>% @@ -480,7 +489,7 @@ if(opt$prefilter){ } }else{ - message("producing files with final counts for cells after filtering") + print("Producing files with final counts for cells after filtering") baseline <- rna_data_plot %>% group_by_at(.vars=c(rna_source_facet)) %>% @@ -500,4 +509,4 @@ if(opt$prefilter){ } -message("done") +print("Done") diff --git a/panpipes/python_scripts/run_preprocess_atac.py b/panpipes/python_scripts/run_preprocess_atac.py index 06645905..5ccae406 100644 --- a/panpipes/python_scripts/run_preprocess_atac.py +++ b/panpipes/python_scripts/run_preprocess_atac.py @@ -88,7 +88,6 @@ args, opt = parser.parse_known_args() -L.info("running with args:") L.info(args) figdir = args.figdir @@ -99,14 +98,18 @@ sc.settings.figdir = figdir sc.set_figure_params(scanpy=True, fontsize=14, dpi=300, facecolor='white', figsize=(5,5)) +L.info("Reading in MuData from '%s'" % args.input_mudata) mdata = mu.read(args.input_mudata) atac = mdata.mod['atac'] # save raw counts +L.info("Checking if raw data is available") if X_is_raw(atac): + L.info("Saving raw counts from .X to .layers['raw_counts']") atac.layers["raw_counts"] = atac.X.copy() -elif "raw_counts" in atac.layers : - L.info("raw_counts layer already exists") +elif "raw_counts" in atac.layers: + L.info(".layers['raw_counts'] already exists and copying it to .X") + atac.X = atac.layers['raw_counts'].copy() else: L.error("X is not raw data and raw_counts layer not found") sys.exit("X is not raw data and raw_counts layer not found") @@ -115,43 +118,49 @@ # NORMALIZE if X_is_raw(atac): if args.binarize is True: - L.info("binarizing peak count matrix") + L.info("Binarizing peak count matrix") ac.pp.binarize(atac) if args.normalize is not None: + L.info("Normalizing data") if args.normalize == "log1p": if args.binarize: L.warning("Careful, you have decided to binarize data but also to normalize per cell and log1p. Not sure this is meaningful") sc.pp.normalize_per_cell(atac,counts_per_cell_after=1e4) sc.pp.log1p(atac) + L.info("Saving normalized counts to layer 'logged_counts'") atac.layers["logged_counts"] = atac.X.copy() elif args.normalize == "TFIDF": if args.TFIDF_flavour=="signac": ac.pp.tfidf(atac, scale_factor=1e4, log_tfidf=True, log_tf=False, log_idf=False) + L.info("Saving normalized counts to layer 'signac_norm'") atac.layers["signac_norm"] = atac.X.copy() elif args.TFIDF_flavour=="logTF": - ac.pp.tfidf(atac, scale_factor=1e4, log_tfidf=False, log_tf=True, log_idf=False) + ac.pp.tfidf(atac, scale_factor=1e4, log_tfidf=False, log_tf=True, log_idf=False) + L.info("Saving normalized counts to layer 'logTF_norm'") atac.layers["logTF_norm"] = atac.X.copy() elif args.TFIDF_flavour=="logIDF": ac.pp.tfidf(atac, scale_factor=1e4, log_tfidf=False, log_tf=False, log_idf=True) + L.info("Saving normalized counts to layer 'logIDF_norm'") atac.layers["logIDF_norm"] = atac.X.copy() else: - L.error("We Require a normalization strategy for ATAC") - sys.exit("Exiting because no normalization was specified. If None was intended, check your pipeline.yml file") + L.error("A normalization strategy is needed for ATAC. For that, please specify the 'normalize' parameter in the pipeline.yml") + sys.exit("A normalization strategy is needed for ATAC. For that, please specify the 'normalize' parameter in the pipeline.yml") #highly variable feature selection if "highly_variable" in atac.var: L.warning( "You have %s Highly Variable Features", np.sum(atac.var.highly_variable)) else: - if args.feature_selection_flavour == "scanpy": + L.info("Running HVF selection") if args.n_top_features is None: sc.pp.highly_variable_genes(atac, min_mean=float(args.min_mean), max_mean=float(args.max_mean), min_disp=float(args.min_disp)) else: sc.pp.highly_variable_genes(atac, n_top_genes=int(args.n_top_features)) elif args.feature_selection_flavour == "signac": + L.info("Running HVF selection") findTopFeatures_pseudo_signac(atac, args.min_cutoff) else: L.warning("No highly variable feature selection was performed!") @@ -160,10 +169,15 @@ L.warning( "You have %s Highly Variable Features", np.sum(atac.var.highly_variable)) # filter by hvgs -filter_by_hvgs = args.filter_by_hvf +if args.filter_by_hvf == "False": + filter_by_hvgs = False +elif args.filter_by_hvf == "True": + filter_by_hvgs = True +else: + filter_by_hvgs = args.filter_by_hvf if filter_by_hvgs: - L.info("filtering object to only include highly variable features") + L.info("Subsetting object to only include HVGs") mdata.mod["atac"] = atac[:, atac.var.highly_variable] if isinstance(mdata, mu.MuData): mdata.update() @@ -176,13 +190,17 @@ # and were first introduced for the analysis of scATAC-seq data by Cusanovich et al. 2015. if args.dimred == "LSI" and args.normalize == "TFIDF": + L.info("Running LSI") lsi(adata=atac, num_components=int(args.n_comps)) -else: +elif args.dimred == "LSI" and args.normalize == "log1p": L.info("Applying LSI on logged_counts is not recommended. Changing dimred to PCA") args.dimred = "PCA" if args.dimred == "PCA": + L.info("Scaling data") sc.pp.scale(atac) + L.info("Saving scaled counts to layer 'scaled_counts'") atac.layers["scaled_counts"] = atac.X.copy() + L.info("Running PCA") sc.tl.pca(atac, n_comps=int(args.n_comps), svd_solver=args.solver, random_state=0) @@ -193,11 +211,13 @@ #some plotting before removing the components if args.dimred =='PCA': + L.info("Plotting PCA") sc.pl.pca_variance_ratio(atac, log=True, n_pcs=int(args.n_comps), save=".png") sc.pl.pca(atac, color=col_use, save = "_vars.png") sc.pl.pca_loadings(atac, components="1,2,3,4,5,6", save = ".png") sc.pl.pca_overview(atac, save = ".png") if args.dimred =='LSI': + L.info("Plotting LSI") sc.pl.embedding(atac, color=col_use,basis="X_lsi", save = ".png") correlation_df = calc_tech_corr(extract_lsi(atac), tech_covariates = ['n_genes_by_counts', 'total_counts']) @@ -209,7 +229,7 @@ if args.dim_remove is not None: - L.info("removing component from dimred") + L.info("Removing component %s from %s" % (args.dim_remove, args.dimred)) dimrem=int(args.dim_remove) if args.dimred == "LSI": atac.obsm['X_lsi'] = atac.obsm['X_lsi'][:,dimrem:] @@ -220,6 +240,7 @@ atac.varm["PCs"] = atac.varm["PCs"][:,dimrem:] mdata.update() +L.info("Saving updated MuData to '%s'" % args.output_mudata) mdata.write(args.output_mudata) L.info("Done") diff --git a/panpipes/python_scripts/run_preprocess_prot.py b/panpipes/python_scripts/run_preprocess_prot.py index 047a2468..8454ef7c 100644 --- a/panpipes/python_scripts/run_preprocess_prot.py +++ b/panpipes/python_scripts/run_preprocess_prot.py @@ -83,48 +83,57 @@ # load filtered data - if use_muon is True this will return a mudata object, else returns an mudata object -L.info("reading filtered mudata object") +L.info("Reading in MuData from '%s'" % args.filtered_mudata) try: all_mdata = mu.read(args.filtered_mudata) except FileNotFoundError: + L.error("filtered_mudata file not found") sys.exit("filtered_mudata file not found") # load bg data if 'dsb' in norm_methods: if args.bg_mudata is not None: - L.info("reading bg file object") + L.info("Reading in background MuData from " + args.bg_mudata) try: all_mdata_bg = mu.read(args.bg_mudata) # subset to just the channel except FileNotFoundError: - sys.exit("bg_mudata object not found") + L.error("Background MuData object not found in " + args.bg_mudata) + sys.exit("Background MuData object not found in " + args.bg_mudata) else: - sys.exit("must specify a bg mudata to run dsb, containing both rna and prot") + L.error("You must specify a background MuData to run dsb, containing both rna and prot") + sys.exit("You must specify a background MuData to run dsb, containing both rna and prot") - # checking that the samne proteins are in foreground and background (since foreground might have been filtered) + # checking that the same proteins are in foreground and background (since foreground might have been filtered) + L.info("Checking that the same proteins are in foreground and background") if len(all_mdata['prot'].var_names) != len(all_mdata_bg['prot'].var_names): mu.pp.filter_var(all_mdata_bg['prot'], all_mdata['prot'].var_names) all_mdata_bg.update() # find isotypes columns +L.info("Looking for isotypes column in .var['isotype']") if "isotype" in all_mdata["prot"].var.columns: isotypes = list(all_mdata["prot"].var_names[all_mdata["prot"].var.isotype]) # exclude isotypes from downstream analysis by setting them as the only not highly variable genes. + L.info("Excluding isotypes from downstream analysis by setting them as the only not HVGs") all_mdata['prot'].var['highly_variable'] = ~all_mdata['prot'].var['isotype'] else: + L.info("No isotype column found in .var") isotypes = None -L.info("isotypes found:") +L.info("The following isotypes were found:") L.info(isotypes) # save raw counts +L.info("Checking if raw data is available") if pnp.scmethods.X_is_raw(all_mdata["prot"]): + L.info("Saving raw counts from .X to .layers['raw_counts']") all_mdata["prot"].layers["raw_counts"] = all_mdata["prot"].X.copy() elif "raw_counts" in all_mdata["prot"].layers : - L.info("raw_counts layer already exists") - all_mdata.X = all_mdata["prot"].layers['raw_counts'].copy() + L.info(".layers['raw_counts'] already exists and copying it to .X") + all_mdata["prot"].X = all_mdata["prot"].layers['raw_counts'].copy() else: L.error("X is not raw data and raw_counts layer not found") sys.exit("X is not raw data and raw_counts layer not found") @@ -202,10 +211,12 @@ all_mdata["prot"].X = all_mdata["prot"].layers["raw_counts"].copy() # run normalise # this stores a layer named after the method as well as overwriting X + L.info("Normalizing data with CLR") pnp.scmethods.run_prot_normalise(mdata=all_mdata, mdata_bg=None, method="clr", clr_margin=int(args.clr_margin)) + L.info("Plotting Ridgeplot") pnp.plotting.ridgeplot(all_mdata["prot"], features=plot_features, layer="clr", splitplot=6) plt.savefig(os.path.join(args.figpath, "clr_ridgeplot.png")) if isotypes is not None: @@ -223,15 +234,18 @@ mu.pl.histogram(all_mdata_bg["rna"], ["log10umi"], bins=50) plt.savefig(os.path.join(args.figpath, "all_log10umi.png")) # this stores a layer named after the method as well as overwriting X + L.info("Normalizing data with dsb") pnp.scmethods.run_prot_normalise(mdata=all_mdata, mdata_bg=all_mdata_bg, method="dsb", isotypes=isotypes) # apply quantile clipping # discussed in FAQs https://cran.r-project.org/web/packages/dsb/vignettes/dsb_normalizing_CITEseq_data.html if args.quantile_clipping: + L.info("Quantile clipping") all_mdata['prot'].layers['dsb'] = pnp.scmethods.quantile_clipping(all_mdata['prot'], layer="dsb", inplace=False) all_mdata['prot'].X = all_mdata['prot'].layers['dsb'].copy() # plot ridgeplot + L.info("Plotting Ridgeplot") pnp.plotting.ridgeplot(all_mdata["prot"], features=plot_features, layer="dsb", splitplot=6) plt.savefig(os.path.join(args.figpath, "dsb_ridgeplot.png")) if isotypes is not None: @@ -247,12 +261,12 @@ # run pca on X # this basically makes no sense if you have a small panel of antibodies. if args.run_pca: - L.info("running pca") + L.info("Running PCA") if all_mdata['prot'].var.shape[0] < int(args.n_pcs): - L.info("You have less features than number of PCs you intend to calculate") + L.warning("You have less features (%s) than number of PCs (%t) you intend to calculate." % (all_mdata['prot'].var.shape[0], args.n_pcs)) n_pcs = all_mdata['prot'].var.shape[0] - 1 - L.info("Setting n PCS to %i" % int(n_pcs)) + L.info("Setting number of PCS to %i" % int(n_pcs)) else: n_pcs = int(args.n_pcs) sc.tl.pca(all_mdata['prot'], n_comps=n_pcs, @@ -260,29 +274,24 @@ random_state=0) # do some plots! + L.info("Plotting PCA") sc.pl.pca_variance_ratio(all_mdata['prot'], log=True, n_pcs=n_pcs, save=".png") - L.info(args.color_by) col_variables = args.color_by.split(",") - L.info("col_variables") - L.info(col_variables) col_variables = [a.strip() for a in col_variables] - - L.info("col_variables now") - L.info(col_variables) col_use = [var for var in col_variables if var in all_mdata['prot'].obs.columns] - L.info("col_use") - L.info(col_use) + sc.pl.pca(all_mdata['prot'], color=col_use, save = "_vars.png") sc.pl.pca_loadings(all_mdata['prot'], components="1,2,3,4,5,6", save = ".png") sc.pl.pca_overview(all_mdata['prot'], save = ".png") if args.save_mudata_path is not None: all_mdata.update() + L.info("Saving updated MuData to '%s'" % args.save_mudata_path) all_mdata.write(args.save_mudata_path) -L.info("done") +L.info("Done") diff --git a/panpipes/python_scripts/run_preprocess_rna.py b/panpipes/python_scripts/run_preprocess_rna.py index ccb17d7a..ad2eba55 100644 --- a/panpipes/python_scripts/run_preprocess_rna.py +++ b/panpipes/python_scripts/run_preprocess_rna.py @@ -92,7 +92,7 @@ if args.hvg_batch_key is not None: columns = [x.strip() for x in args.hvg_batch_key.split(",")] if len(columns) > 1: - L.info("combining batch columns into one column 'hvg_batch_key'") + L.info("Combining batch columns into one column 'hvg_batch_key'") adata.obs["hvg_batch_key"] = adata.obs[columns].apply(lambda x: '|'.join(x), axis=1) # make sure that batch is a categorical adata.obs["hvg_batch_key"] = adata.obs["hvg_batch_key"].astype("category") @@ -119,10 +119,10 @@ # except when flavor='seurat_v3' in which count data is expected. # change the order accordingly if X_is_raw(adata): - L.info("Normalise, log and calculate HVGs") + L.info("Normalize, log and calculate HVGs") if args.flavor == "seurat_v3": if args.n_top_genes is None: - raise ValueError("if seurat_v3 is used you must give a n_top_genes value") + raise ValueError("If seurat_v3 is used, you must give a n_top_genes value") else: sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=int(args.n_top_genes), @@ -213,7 +213,7 @@ L.info("Running PCA") if adata.var.shape[0] < int(args.n_pcs): - L.warning("You have less features (%s) than number of PCs (%t) you intend to calculate." % (adata.var.shape[0], args.n_pcs)) + L.warning("You have less features (%s) than number of PCs (%s) you intend to calculate." % (adata.var.shape[0], args.n_pcs)) n_pcs = adata.var.shape[0] - 1 L.info("Setting number of PCS to %i" % int(n_pcs)) else: