
Commit

add log infos
SarahOuologuem committed Apr 9, 2024
1 parent 88ecfb7 commit e7ec9f7
Showing 6 changed files with 42 additions and 45 deletions.
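The added lines below all write to a logger named L using old-style %-formatting; a minimal sketch of the setup these scripts appear to assume (handler and format are illustrative, not taken from the repository):

import logging
import sys

# logger used as L.info(...) / L.warning(...) / L.error(...) in the diffs below
L = logging.getLogger()
L.setLevel(logging.INFO)
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter("%(asctime)s: %(levelname)s - %(message)s"))
L.addHandler(handler)

# old-style %-interpolation, matching the added log calls
L.info("Reading in MuData from '%s'" % "input.h5mu")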
21 changes: 10 additions & 11 deletions panpipes/python_scripts/downsample.py
@@ -42,12 +42,12 @@
help='comma separated string of modalities we want to intersect_obs on')


L.warning("Running filter_mudata")

parser.set_defaults(verbose=True)
args, opt = parser.parse_known_args()
L.info(args)

# load data
L.info("Reading in MuData from '%s'" % args.input_mudata)
mdata = mu.read(args.input_mudata)

if isinstance(mdata, AnnData):
@@ -56,7 +56,7 @@

orig_obs = mdata.obs

L.info("Number of cells per sample \n%s" % mdata.obs.sample_id.value_counts())
L.info("Before downsampling: number of cells per sample \n%s" % mdata.obs.sample_id.value_counts())


if args.downsample_col == "None":
@@ -74,7 +74,7 @@
mdata.obs[args.downsample_col] = mdata.obs[args.downsample_col].cat.remove_unused_categories()

# do the downsample.

L.info("Downsampling modalities %s" % mods)
downsample_mudata(mdata, nn=int(args.downsample_value),
cat_col=args.downsample_col,
mods = mods,
@@ -83,15 +83,14 @@


mdata.update()

L.info("Number of cells per sample")
L.info(mdata.obs.sample_id.value_counts())
L.info("After downsampling: number of cells per sample \n%s" % mdata.obs.sample_id.value_counts())

# write out the observations
write_obs(mdata, output_prefix=re.sub("\\.(.*)", "", args.output_mudata),
output_suffix="_downsampled_cell_metadata.tsv")

prefix = re.sub("\\.(.*)", "", args.output_mudata)
L.info("Saving updated obs in a metadata tsv file to " + prefix + "_downsampled_cell_metadata.tsv")
write_obs(mdata, output_prefix=prefix, output_suffix="_downsampled_cell_metadata.tsv")

L.info("Saving updated MuData to '%s'" % args.output_mudata)
mdata.write(args.output_mudata)
# This stage (i.e. pre-normalisation) is the point where the mdata file can be written out so that the raw matrix can be extracted for cellphonedb.

L.info("Done")
2 changes: 1 addition & 1 deletion panpipes/python_scripts/run_filter.py
@@ -174,7 +174,7 @@ def test_matching_df_ignore_cat(new_df, old_df):
# write out obs
output_prefix = re.sub(".h5mu", "", args.output_mudata)

L.info("Saving updated MuData.obs in a metadata tsv file to '" + output_prefix + "_filtered_cell_metadata.tsv'")
L.info("Saving updated obs in a metadata tsv file to '" + output_prefix + "_filtered_cell_metadata.tsv'")
write_obs(mdata, output_prefix=output_prefix, output_suffix="_filtered_cell_metadata.tsv")

# write out the per sample_id cell numbers
2 changes: 1 addition & 1 deletion panpipes/python_scripts/run_filter_spatial.py
@@ -151,7 +151,7 @@ def test_matching_df_ignore_cat(new_df, old_df):
# write out obs
output_prefix = re.sub(".h5mu", "", os.path.basename(args.output_mudata))

L.info("Saving updated MuData.obs in a metadata tsv file to './tables/" + output_prefix + "_filtered_cell_metadata.tsv'")
L.info("Saving updated obs in a metadata tsv file to './tables/" + output_prefix + "_filtered_cell_metadata.tsv'")
write_obs(mdata, output_prefix=os.path.join("tables/",output_prefix), output_suffix="_filtered_cell_metadata.tsv")

# write out the per sample_id cell numbers
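The spatial variant differs from run_filter.py in that it takes the basename and writes under a tables/ subfolder, which is why the log message points at './tables/'. A short illustration under an assumed path:

import os
import re

output_mudata = "results/sample1.h5mu"  # hypothetical value of args.output_mudata
output_prefix = re.sub(".h5mu", "", os.path.basename(output_mudata))
print(output_prefix)  # sample1
print(os.path.join("tables/", output_prefix) + "_filtered_cell_metadata.tsv")
# tables/sample1_filtered_cell_metadata.tsv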
59 changes: 29 additions & 30 deletions panpipes/python_scripts/run_preprocess_rna.py
@@ -80,6 +80,7 @@
sc.settings.figdir = figdir
sc.set_figure_params(scanpy=True, fontsize=14, dpi=300, facecolor='white', figsize=(5,5))

L.info("Reading in MuData from '%s'" % args.input_mudata)
mdata = mu.read(args.input_mudata)

# if we have actually loaded an anndata object, make into a mudata
@@ -102,10 +102,12 @@
hvg_batch_key=None

# save raw counts as a layer
L.info("Checking if raw data is available")
if X_is_raw(adata):
L.info("Saving raw counts from .X to .layers['raw_counts']")
adata.layers['raw_counts'] = adata.X.copy()
elif "raw_counts" in adata.layers :
L.info("raw_counts layer already exists")
L.info(".layers['raw_counts'] already exists and copying it to .X")
adata.X = adata.layers['raw_counts'].copy()
else:
L.error("X is not raw data and raw_counts layer not found")
@@ -116,7 +119,7 @@
# except when flavor='seurat_v3' in which count data is expected.
# change the order accordingly
if X_is_raw(adata):
L.info("normalise, log and calculate highly variable genes")
L.info("Normalise, log and calculate HVGs")
if args.flavor == "seurat_v3":
if args.n_top_genes is None:
raise ValueError("if seurat_v3 is used you must give a n_top_genes value")
@@ -136,51 +139,48 @@
min_disp=float(args.min_disp),
batch_key=hvg_batch_key)
L.debug(adata.uns['log1p'])

if "highly_variable" in adata.var:
L.warning( "You have %s Highly Variable Features", np.sum(adata.var.highly_variable))
L.info( "You have %s HVGs", np.sum(adata.var.highly_variable))
sc.pl.highly_variable_genes(adata,show=False, save ="_genes_highlyvar.png")
else:
sys.exit("I cannot find Highly Variable Features in your RNA")

if isinstance(mdata, mu.MuData):
mdata.update()

# exclude hvgs if there is an exclude file
if args.exclude_file is not None:
if os.path.exists(args.exclude_file):
L.info("exclude genes from hvg")
L.info("Reading in exclude_file from '%s'" % args.exclude_file)
customgenes = pd.read_csv(args.exclude_file)
custom_cat = list(set(customgenes['group'].tolist()))
cat_dic = {}
for cc in custom_cat:
cat_dic[cc] = customgenes.loc[customgenes["group"] == cc,"feature"].tolist()
exclude_action = str(args.exclude)
excl = cat_dic[exclude_action]
L.info(len(excl))
L.info("number of hvgs prior to filtering")
L.info(adata.var.highly_variable.sum())
L.info("Number of HVGs prior to filtering: %s" % adata.var.highly_variable.sum())
adata.var['hvg_exclude'] = [True if gg in excl else False for gg in adata.var.index]
L.debug("num to exclude %s" % (adata.var['hvg_exclude'] & adata.var['highly_variable']).sum() )
L.debug(adata)
L.info("Number of genes to exclude %s" % (adata.var['hvg_exclude'] & adata.var['highly_variable']).sum() )
L.info("Excluding genes from HVG")
if any(adata.var['hvg_exclude'] & adata.var['highly_variable']):
adata.var['highly_variable'].loc[adata.var.hvg_exclude] = False
adata.var.loc[adata.var.hvg_exclude, 'highly_variable_rank'] = np.nan

L.info("number of hvgs after filtering")
L.info(adata.var.highly_variable.sum())
L.info("Number of HVGs after filtering: %s" % adata.var.highly_variable.sum())
sc.pl.highly_variable_genes(adata,show=False, save ="_exclude_genes_highlyvar.png")
else:
sys.exit("exclusion file %s not found, check the path and try again" % args.exclude_file)
L.error("Exclusion file %s not found" % args.exclude_file)
sys.exit("Exclusion file %s not found" % args.exclude_file)

if isinstance(mdata, mu.MuData):
mdata.update()

L.debug(adata.uns['log1p'])

# filter by hvgs
filter_by_hvgs = args.filter_by_hvg

if filter_by_hvgs is True:
L.info("filtering object to only include highly variable genes")
L.info("Subsetting object to only include HVGs")
mdata.mod["rna"] = adata[:, adata.var.highly_variable]
if isinstance(mdata, mu.MuData):
mdata.update()
@@ -191,39 +191,39 @@


adata.layers['logged_counts'] = adata.X.copy()
L.debug(adata.uns['log1p'])
# regress out
if args.regress_out is not None:
L.info("Regressing out %s" % args.regress_out)
regress_opts = args.regress_out.split(",")
L.info("regressing out %s" % regress_opts)
sc.pp.regress_out(adata, regress_opts)


if args.scale is True and args.scale_max_value is None:
L.info("scaling data with default parameters")
L.info("Scaling data with default parameters")
sc.pp.scale(adata)
elif args.scale is True:
L.info("scaling data to max value %i" % int(args.scale_max_value))
L.info("Scaling data to max value %i" % int(args.scale_max_value))
sc.pp.scale(adata, max_value=int(args.scale_max_value))
else:
L.info("not scaling data")
pass
L.info("Not scaling the data")
#pass


L.debug(adata.uns['log1p'])
# run pca
L.info("running pca")
L.info("Running PCA")

if adata.var.shape[0] < int(args.n_pcs):
L.info("You have less features than number of PCs you intend to calculate")
L.warning("You have less features (%s) than number of PCs (%t) you intend to calculate." % (adata.var.shape[0], args.n_pcs))
n_pcs = adata.var.shape[0] - 1
L.info("Setting n PCS to %i" % int(n_pcs))
L.info("Setting number of PCS to %i" % int(n_pcs))
else:
n_pcs = int(args.n_pcs)
sc.tl.pca(adata, n_comps=n_pcs,
svd_solver=args.pca_solver,
random_state=0)

# do some plots!
# pca plots
L.info("Plotting PCA")
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=n_pcs, save=".png")

col_variables = args.color_by.split(",")
@@ -237,8 +237,7 @@

mdata.update()
# save the scaled adata object

L.info("Saving updated MuData to '%s'" % args.output_scaled_mudata)
mdata.write(args.output_scaled_mudata)
L.debug(adata.uns['log1p'])

L.info("done")
L.info("Done")
1 change: 0 additions & 1 deletion panpipes/python_scripts/run_preprocess_spatial.py
@@ -111,7 +111,6 @@
L.error("X is not raw data and 'raw_counts' layer not found")
sys.exit("X is not raw data and 'raw_counts' layer not found")

L.info("Using raw counts for HVG selection & normalization")

# Normalization + HVG selection based on flavour
if args.norm_hvg_flavour == "squidpy":
2 changes: 1 addition & 1 deletion panpipes/python_scripts/run_scanpyQC_spatial.py
@@ -158,7 +158,7 @@
single_id = os.path.basename(str(args.input_anndata))
single_id = single_id.replace("_raw.h5mu","")

L.info("Saving updated MuData.obs in a metadata tsv file to ./" + single_id + "_cell_metadata.tsv")
L.info("Saving updated obs in a metadata tsv file to ./" + single_id + "_cell_metadata.tsv")
write_obs(mdata, output_prefix=single_id, output_suffix="_cell_metadata.tsv")
L.info("Saving updated MuData to '%s'" % args.outfile)
mdata.write(args.outfile)
