diff --git a/README.md b/README.md
index 50142746..b9267187 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ Available pipelines:
 See [installation instructions here](https://github.com/DendrouLab/panpipes/blob/main/docs/install.md)
 Review this issue before installation: https://github.com/DendrouLab/panpipes/issues/11
-
+Oxford BMRC Rescomp users can find additional advice in [docs/installation_rescomp](https://github.com/DendrouLab/panpipes/blob/main/docs/installation_rescomp.md)
 
 # General principles for running pipelines
diff --git a/docs/install.md b/docs/install.md
index 0fb1c1ce..1ba617a3 100644
--- a/docs/install.md
+++ b/docs/install.md
@@ -33,7 +33,7 @@ conda activate pipeline_env
 we include an environment.yml for a conda environment tested on all the pipelines packaged in this version of Panpipes.
 ##### Step 2 Download and install this repo
-
+If you have not already set up SSH keys for GitHub, first follow these [instructions](https://github.com/DendrouLab/panpipes/blob/main/docs/set_up_ssh_keys_for_github.md):
 ```
 git clone https://github.com/DendrouLab/panpipes
diff --git a/docs/set_up_ssh_keys_for_github.md b/docs/set_up_ssh_keys_for_github.md
new file mode 100644
index 00000000..2161c71c
--- /dev/null
+++ b/docs/set_up_ssh_keys_for_github.md
@@ -0,0 +1,25 @@
+
+## Set up SSH key for GitHub
+For more advice: https://docs.github.com/en/authentication/connecting-to-github-with-ssh/about-ssh
+
+After checking for existing keys, if you receive an error that ~/.ssh doesn't exist, then you don't have one. If one already exists (i.e. id_rsa.pub or id_ed25519.pub), you can either connect it to GitHub or generate a new one.
+```
+ls -al ~/.ssh #check for existing keys
+ssh-keygen -t ed25519 -C "your_email@example.com" #use your GitHub email address
+#Enter a file in which to save the key (/c/Users/you/.ssh/id_algorithm):[Press enter]
+#Enter passphrase (empty for no passphrase): [Type a passphrase]
+eval "$(ssh-agent -s)" #start ssh-agent
+ssh-add ~/.ssh/id_ed25519 #add your SSH private key to ssh-agent
+clip < ~/.ssh/id_ed25519.pub #copy SSH public key (clip is Windows-only; use pbcopy on macOS or xclip on Linux)
+```
+After copying your SSH public key, go to GitHub --> Settings --> SSH and GPG keys (under Access) --> Add new public SSH key
+
+To test the connection:
+```
+ssh -T git@github.com
+```
+A successful connection should result in
+> Hi username! You've successfully authenticated, but GitHub does not provide shell access.
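+
+Once the test succeeds, you can clone repositories over SSH. For example (the URL below is simply the standard SSH form of this repo's address):
+```
+git clone git@github.com:DendrouLab/panpipes.git
+```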
+
+Activate the environment
+```
\ No newline at end of file
diff --git a/panpipes/funcs/scmethods.py b/panpipes/funcs/scmethods.py
index 41f84ab9..8f025173 100644
--- a/panpipes/funcs/scmethods.py
+++ b/panpipes/funcs/scmethods.py
@@ -4,6 +4,7 @@
 from scipy.sparse import issparse
 from scanpy.get import obs_df as get_obs_df
 from scanpy.pp import normalize_total
+import scanpy as sc
 import warnings
 import logging
 from typing import Optional, Literal
@@ -38,9 +39,53 @@ def exp_mean_dense(x):
     # convert out of compressed sparse matrix
     return np.log((np.sum(np.exp(x)-1)/x.shape[1]) + 1)
 
-
-
-def pseudo_seurat(adata, arg_minpct=0.1, arg_mindiffpct=-float("inf"), arg_logfcdiff=0.25, use_dense=False):
+def find_all_markers_pseudo_seurat(
+        adata,
+        groups,
+        groupby,
+        layer=None,
+        method=None,
+        n_genes=float("inf"),
+        corr_method="bonferroni",
+        arg_minpct=0.1,
+        arg_mindiffpct=-float("inf"),
+        arg_logfcdiff=0.25):
+    # replace X with the requested layer, if one is given
+    if layer is not None:
+        adata.X = adata.layers[layer]
+    # need to check whether the assay layer is sparse or dense
+    assay_is_sparse = issparse(adata.X)
+    use_dense = not assay_is_sparse
+    if groups == 'all':
+        groups = adata.obs[groupby].unique().tolist()
+    markers_dict = {}
+    filter_dict = {}
+    for cv in groups:
+        # set up idents so that cv == 1 and everything else == 0
+        adata.obs['idents'] = ['1' if x == cv else '0' for x in adata.obs[groupby]]
+        filter_dict[cv] = pseudo_seurat(adata, use_dense=use_dense, arg_minpct=arg_minpct,
+                                        arg_mindiffpct=arg_mindiffpct,
+                                        arg_logfcdiff=arg_logfcdiff)
+        logging.info("number of genes remaining after filtering: %i\n" % filter_dict[cv]['background'].sum())
+        adata_rg = adata[:, filter_dict[cv]['background'].tolist()].copy()
+        sc.tl.rank_genes_groups(adata_rg, layer=layer,
+                                groupby="idents", groups=["1"],
+                                reference="0",
+                                method=method,
+                                n_genes=n_genes,
+                                corr_method=corr_method)
+        markers_dict[cv] = sc.get.rank_genes_groups_df(adata_rg, group="1")
+        # remove adata from mem
+        adata_rg = None
+    markers = pd.concat(markers_dict.values(), keys=markers_dict.keys())
+    filter_stats = pd.concat(filter_dict.values(), keys=filter_dict.keys())
+    return markers, filter_stats
+
+def pseudo_seurat(adata,
+                  arg_minpct=0.1,
+                  arg_mindiffpct=-float("inf"),
+                  arg_logfcdiff=0.25,
+                  use_dense=False):
     """
     alternative method that"s more like seurat (pseudo seurat if you will)
     In that you filter genes before running rank genes
@@ -79,7 +124,6 @@ def pseudo_seurat(adata, arg_minpct=0.1, arg_mindiffpct=-float("inf"), arg_logfc
     min_pct = pcts.min(axis=1)
     diff_pct = max_pct - min_pct
     take_diff_pct = diff_pct > arg_mindiffpct
-
     # remove genes that are not expressed higher than 0.1 in one of the groups
     take_min_pct = max_pct > arg_minpct
 
@@ -88,7 +132,7 @@ def pseudo_seurat(adata, arg_minpct=0.1, arg_mindiffpct=-float("inf"), arg_logfc
     # this has the potential to be very slow. Transposeing it speeds it up a bit.
     # I need to undertand sparse matrices better to make it work
     if use_dense:
-        print("using dense matrix")
+        logging.info("using dense matrix")
         # extract the counts for cluster cells and calculate exp means on each row
         nct = adata.X.T[:, cluster_cells_ind]
         cluster_mean = np.apply_along_axis(exp_mean_dense, 1, nct.todense())
@@ -98,7 +142,7 @@ def pseudo_seurat(adata, arg_minpct=0.1, arg_mindiffpct=-float("inf"), arg_logfc
         other_mean = np.apply_along_axis(exp_mean_dense, 1, nct.todense())
         diff_mean = abs(cluster_mean - other_mean)
     else:
-        print("using sparse matrix")
+        logging.info("using sparse matrix")
         cluster_mean = exp_mean_sparse(adata.X.T[:, cluster_cells_ind])
         other_mean = exp_mean_sparse(adata.X.T[:, other_cells_ind])
         diff_mean = abs(cluster_mean - other_mean).A1
@@ -122,7 +166,7 @@ def run_neighbors_method_choice(adata, method, n_neighbors, n_pcs, metric, use_r
     # useful if we are dealing with a MuData object but we want to use single rep, e.g.
     # calculating neighbors on a totalVI latent rep
     if method == "scanpy":
-        print("Computing neighbors using scanpy")
+        logging.info("Computing neighbors using scanpy")
         from scanpy.pp import neighbors
         neighbors(adata,
             n_pcs=n_pcs,
@@ -131,7 +175,7 @@ def run_neighbors_method_choice(adata, method, n_neighbors, n_pcs, metric, use_r
             use_rep=use_rep)
     elif method == "hnsw":
         from scvelo.pp import neighbors
-        print("Computing neighbors using hnswlib (with scvelo a la pegasus!)")
+        logging.info("Computing neighbors using hnswlib (with scvelo a la pegasus!)")
         # we use the neighbors function from scvelo (thanks!)
         # with parameters from pegasus (for a more exact result).
         # code snippet from Steve Sansom, via COMBAT project
diff --git a/panpipes/pipeline_clustering.py b/panpipes/pipeline_clustering.py
index 3cf8c7c0..c4733e4e 100644
--- a/panpipes/pipeline_clustering.py
+++ b/panpipes/pipeline_clustering.py
@@ -19,7 +19,7 @@ from itertools import chain, product
 import glob
 
-from panpipes.funcs.processing import is_float_try, splitall, extract_parameter_from_fname
+from panpipes.funcs.processing import extract_parameter_from_fname
 
 PARAMS = P.get_parameters(
     ["%s/pipeline.yml" % os.path.splitext(__file__)[0],
     "../pipeline.yml",
@@ -27,7 +27,7 @@
 PARAMS['py_path'] = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'python')
 PARAMS['r_path'] = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'R')
-
+PARAMS['mudata_with_knn'] = 'mudata_w_neighbors.h5mu'
 job_kwargs={}
 
 if PARAMS['condaenv'] is not None:
     job_kwargs["job_condaenv"] =PARAMS['condaenv']
@@ -36,7 +36,6 @@
 @originate("logs/setup_dirs.sentinel")
 def set_up_dirs(log_file):
     os.mkdir("logs")
-
     mods = [key for key, value in PARAMS['modalities'].items() if value is True]
     if PARAMS["multimodal"]["run_clustering"] is True :
         mods.append("multimodal")
@@ -48,16 +47,44 @@ def set_up_dirs(log_file):
 ## Single modality scripts
 ## ------------------------------------
 
+# --------------------------------------
+# neighbors
+# --------------------------------------
+@follows(set_up_dirs)
+@originate(PARAMS['mudata_with_knn'])
+def run_neighbors(outfile):
+    mods = [key for key, value in PARAMS['modalities'].items() if value is True]
+    if any([PARAMS['neighbors'][mod]['use_existing'] is False for mod in mods]):
+        # this means we want to rerun neighbors for at least one assay;
+        # we want to replace the scaled obj with the new neighbors
+        log_file="logs/run_single_mod_neighbors.log"
+        cmd="""
+        python %(py_path)s/rerun_find_neighbors_for_clustering.py \
+        --infile %(scaled_obj)s \
+        --outfile %(outfile)s \
+        --neighbor_dict '%(neighbors)s' \
+        --n_threads %(resources_threads_high)s
+        """
+        cmd += " > %(log_file)s"
+        job_kwargs["job_threads"] = PARAMS['resources_threads_high']
+        P.run(cmd, **job_kwargs)
+    else:
+        P.run('ln -s %(scaled_obj)s %(outfile)s', without_cluster=True)
+
+
+
+
 # ------------------------------------
 # UMAP
 # ------------------------------------
+
 def gen_umap_jobs():
     """
     Generate find neighbor jobs with all parameter combinations.
     """
     # same infile for all jobs
     # define files based on jobs
-    infile = PARAMS['scaled_obj']
+    infile = PARAMS['mudata_with_knn']
     mods = [key for key, value in PARAMS['modalities'].items() if value is True]
     if PARAMS["multimodal"]["run_clustering"] is True :
         mods.append("multimodal")
@@ -68,14 +95,14 @@ def gen_umap_jobs():
             log_file = os.path.join("logs","_".join([mod, 'md' + str(md) +"_umap.log"]))
             yield [infile, output_file, mod, md, log_file]
 
-
+@follows(run_neighbors)
 @follows(set_up_dirs)
 @files(gen_umap_jobs)
 def calc_sm_umaps(infile, outfile, mod, mindist, log_file):
     prefix = os.path.split(infile)[0]
     cmd = """
         python %(py_path)s/run_umap.py \
-        --infile %(scaled_obj)s \
+        --infile %(infile)s \
         --outfile %(outfile)s \
         --min_dist %(mindist)s
         """
@@ -100,7 +127,7 @@ def gen_cluster_jobs():
     """
     # same infile for all jobs
     # define files based on jobs
-    infile = PARAMS['scaled_obj']
+    infile = PARAMS['mudata_with_knn']
     mods = [key for key, value in PARAMS['modalities'].items() if value is True]
     if PARAMS['multimodal']['run_clustering'] is True:
         mods.append("multimodal")
@@ -114,6 +141,7 @@
 
 @follows(set_up_dirs)
 @files(gen_cluster_jobs)
+@follows(run_neighbors)
 def calc_cluster(infile, outfile, mod, res, alg, log_file):
     cmd = """python %(py_path)s/run_clustering.py
     --infile %(infile)s
@@ -168,10 +196,11 @@ def collate_mdata(infiles,outfile):
     --umap_files_csv umap_file_paths.csv
     --output_mudata %(outfile)s
     """
-    # if PARAMS['full_obj'] is None:
-    cmd += "--input_mudata %(scaled_obj)s"
-    # else:
-    #     cmd += "--input_mudata %(scaled_obj)s %(full_obj)s"
+    if PARAMS['full_obj'] is None:
+        mdata_in = PARAMS['mudata_with_knn']
+        cmd += "--input_mudata %(mdata_in)s"
+    else:
+        cmd += "--input_mudata %(full_obj)s"
     cmd += " > logs/collate_data.log"
     job_kwargs["job_threads"] = PARAMS['resources_threads_medium']
     P.run(cmd, **job_kwargs)
@@ -182,13 +211,19 @@
     'logs/plot_clusters_umaps.log')
 def plot_cluster_umaps(infile, log_file,):
     # get associated umap
+    mods = [key for key, value in PARAMS['modalities'].items() if value is True]
+    if PARAMS['multimodal']['run_clustering']:
+        mods.append("multimodal")
+    mods = ','.join(mods)
     cmd = """python %(py_path)s/plot_cluster_umaps.py \
     --infile %(infile)s
+    --modalities '%(mods)s'
     """
     cmd += " >> %(log_file)s"
     job_kwargs["job_threads"] = PARAMS['resources_threads_medium']
     P.run(cmd, jobs_limit=1, **job_kwargs)
 
+
 @transform(aggregate_clusters, regex("(.*)/all_res_clusters_list.txt.gz"),
     r'logs/\1_clustree.log',
     r'\1/figures/clustree.png', )
@@ -218,38 +253,50 @@ def cluster_analysis(fname):
 # ------------------------------------
 # Markers
 # ------------------------------------
-@transform(calc_cluster,
-    regex("(.*)/(.*)/clusters.txt.gz"),
-    r"\1/\2/markers.txt",
-    r"\1/\2/markers",
+
+@subdivide(calc_cluster, regex("(.*)/(.*)/clusters.txt.gz"),
+    r"\1/\2/*_markers.txt",
+    r"\1/\2")
+def gen_marker_jobs(infile, outfile, base_dir):
+    mods = [key for key, value in PARAMS['modalities'].items() if value is True]
+    for md in mods:
+        IOTools.touch_file(os.path.join(base_dir, md + "_markers.txt"))
+
+@follows(collate_mdata)
+@transform(gen_marker_jobs,
+    regex("(.*)/(.*)/(.*)_markers.txt"),
+    r"logs/\1_\2_\3_markers.log",
+    r"\1/\2/\3_markers",
     r"\1",
-    r"logs/\1_\2_markers.log",
+    r"\1/\2",
+    r"\3"
     )
-def find_markers(infile, outfile, outfile_prefix, mod, log_file ):
+def find_markers(infile, log_file, outfile_prefix, base_mod, cluster_dir, data_mod):
     """
     Runs scanpy.tl.rank_gene_groups in parallel for each cluster
     """
-    min_cells = PARAMS["markerspecs"][mod]["mincells"]
+    anndata_file=PARAMS['sample_prefix'] + "_clustered.h5mu" + "/" + data_mod
+    cluster_file = os.path.join(cluster_dir, "clusters.txt.gz")
     cmd = """
         python %(py_path)s/run_find_markers_multi.py
-        --infile %(scaled_obj)s
+        --infile %(anndata_file)s
+        --cluster_file %(cluster_file)s
         --output_file_prefix %(outfile_prefix)s
-        --cluster_file %(infile)s
-        --mincells %(min_cells)s
+        --mod %(data_mod)s
         """
-    if mod is not None and mod != "multimodal":
-        cmd += " --modality %(mod)s"
-        layers = {mod: PARAMS["markerspecs"][mod]["layer"] for mod in ['rna', 'prot', 'atac']}
-        cmd += " --layer '%(layers)s'"
-    if PARAMS['markerspecs_pseudo_seurat'] is True:
-        min_pct = PARAMS["markerspecs"][mod]["minpct"]
-        threshuse = PARAMS["markerspecs"][mod]["threshuse"]
+    min_cells = PARAMS["markerspecs"][data_mod]["mincells"]
+    cmd += " --mincells %(min_cells)s"
+    layer_choice = PARAMS["markerspecs"][data_mod]["layer"]
+    cmd += " --layer '%(layer_choice)s'"
+    testuse = PARAMS["markerspecs"][data_mod]["method"]
+    cmd += " --testuse '%(testuse)s'"
+    if PARAMS['markerspecs'][data_mod]['pseudo_seurat'] is True:
+        min_pct = PARAMS["markerspecs"][data_mod]["minpct"]
+        threshuse = PARAMS["markerspecs"][data_mod]["threshuse"]
         cmd += """ --pseudo_seurat True
-        --minpct %(min_pct)s
-        --threshuse %(threshuse)s
+            --minpct %(min_pct)s
+            --threshuse %(threshuse)s
             """
-    else:
-        cmd += " --pseudo_seurat False"
     cmd += " > %(log_file)s "
     job_kwargs["job_threads"] = PARAMS['resources_threads_high']
     P.run(cmd, **job_kwargs)
@@ -276,19 +323,23 @@ def find_markers(infile, outfile, outfile_prefix, mod, log_file ):
 #     --subgroup %(plotspecs_discrete_variables)s > %(log_file)s"""
 #     P.run(cmd, job_threads=PARAMS['resources_threads_high'])
 
-
-# limit jobs because reading the h5 file simultaneouly is problematic
-# @follows(heatmap_markers)
+# this transforms gen_marker_jobs instead of find_markers because gen_marker_jobs creates empty marker files
+# which are then filled by find_markers.
 @follows(collate_mdata)
-@transform(find_markers,
-    regex(r"(.*)/alg(.*)/markers.txt"),
-    r"logs/\1_alg\2_dotplots.log",
-    r"\1/alg\2/figures/dotplot_top_markers.png",
+@follows(find_markers)
+@transform(gen_marker_jobs,
+    regex(r"(.*)/alg(.*)/(.*)_markers.txt"),
+    r"logs/cluster\1_alg\2_exprs_\3_dotplots.log",
+    r"\1/alg\2/figures/dotplot_top_markers_\3.png",
     r"\1/alg\2/figures",
-    r"\1", r"\2")
-def plot_marker_dotplots(marker_file, log_file, outfile, fig_path,mod, cluster_col):
+    r"\1", r"\2", r"\3")
+def plot_marker_dotplots(marker_file, log_file, outfile,
+                         fig_path, cluster_mod, cluster_col, expr_mod):
     """
     Plots some additional marker plots
+    Read in the h5mu file, pull the cluster col from the h5mu obs,
+    then pull the expression values from the expr_mod.
+ """ data_obj=PARAMS['sample_prefix'] + "_clustered.h5mu" # check there is a figures directory @@ -297,17 +348,17 @@ def plot_marker_dotplots(marker_file, log_file, outfile, fig_path,mod, cluster_c cmd = """ python %(py_path)s/plot_scanpy_markers.py \ --infile %(data_obj)s \ + --modality %(expr_mod)s --marker_file %(marker_file)s \ - --figure_prefix %(fig_path)s - --group_col %(cluster_col)s + --figure_prefix %(fig_path)s \ --n %(plotspecs_top_n_markers)s """ - if mod is not None: - cmd += " --modality %(mod)s" - cmd += " --layer '%(plotspecs_layers)s'" - else: - cmd += " --layer logged_counts" + if cluster_mod != "multimodal": + cluster_col = cluster_mod + ":" + cluster_col + cmd += " --group_col %(cluster_col)s" + layer_choice = PARAMS["markerspecs"][expr_mod]["layer"] + cmd += " --layer %(layer_choice)s" cmd += " > %(log_file)s " job_kwargs["job_threads"] = PARAMS['resources_threads_medium'] P.run(cmd, **job_kwargs) @@ -374,10 +425,7 @@ def marker_analysis(fname): # #-------- - - - -@follows(cluster_analysis, marker_analysis, ) +@follows(cluster_analysis, marker_analysis ) def full(): """ All cgat pipelines should end with a full() function which updates, diff --git a/panpipes/pipeline_clustering/pipeline.yml b/panpipes/pipeline_clustering/pipeline.yml index fbcef0f3..28716a91 100644 --- a/panpipes/pipeline_clustering/pipeline.yml +++ b/panpipes/pipeline_clustering/pipeline.yml @@ -126,40 +126,56 @@ clusterspecs: # --------------------------------------- # parameters for finding marker genes # --------------------------------------- +# where pseudo_suerat is set to False +# we run https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.rank_genes_groups.html +# where pseudo_seurat is set to True, we run an a python implementation of Seurat::FindMarkers (written by CRG) is used, + markerspecs: rna: run: True layer: logged_counts + # method options: [‘logreg’, ‘t-test’, ‘wilcoxon’, ‘t-test_overestim_var’]] + method: t-test_overestim_var mincells: 10 # if a cluster contains less than n cells then do not bother doing marker analysis - # Setting pseudo_seurat to True means that rather than use scanpy.rank_gene_groups to identify gene groups, - # a python implementation of Seurat::FindMarkers (written by CRG) is used, - # We recommend that pseudo_seurat is set to False + # where pseudo_suerat is set to False + # we run https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.rank_genes_groups.html + # where pseudo_seurat is set to True, we run an a python implementation of Seurat::FindMarkers (written by CRG) is used, pseudo_seurat: False # these next two settings do not matter unless pseudo_seurat is set to True, # If applicable look at Seurat documentation for FindMarkers for details minpct: 0.1 threshuse: 0.25 + prot: run: layer: clr - mincells: 10 + mincells: 10 # if a cluster contains less than n cells then do not bother doing marker analysis + # method options: [‘logreg’, ‘t-test’, ‘wilcoxon’, ‘t-test_overestim_var’]] + method: wilcoxon pseudo_seurat: False minpct: 0.1 threshuse: 0.25 + atac: run: layer: mincells: 10 + # method options: [‘logreg’, ‘t-test’, ‘wilcoxon’, ‘t-test_overestim_var’]] + method: wilcoxon pseudo_seurat: False minpct: 0.1 threshuse: 0.25 + multimodal: mincells: 10 + # method options: [‘logreg’, ‘t-test’, ‘wilcoxon’, ‘t-test_overestim_var’]] + method: wilcoxon pseudo_seurat: False minpct: 0.1 threshuse: 0.25 + # --------------------------------------- # plot specs are used to define which metadata columns are used in the visualisations # 
diff --git a/panpipes/pipeline_preprocess.py b/panpipes/pipeline_preprocess.py
index d85d33fb..d8c91c83 100644
--- a/panpipes/pipeline_preprocess.py
+++ b/panpipes/pipeline_preprocess.py
@@ -49,7 +49,7 @@ def filter_mudata(outfile):
         python %(py_path)s/run_filter.py
         --input_mudata %(unfiltered_obj)s
         --output_mudata %(outfile)s
-        --filter_dict "%(filtering)s"
+        --filter_dict "%(filter_dict)s"
         """
     if PARAMS['filtering_keep_barcodes'] is not None:
         cmd += " --keep_barcodes %(filtering_keep_barcodes)s"
diff --git a/python/batch_correct_merge.py b/python/batch_correct_merge.py
index a73a6bba..b003a2a9 100644
--- a/python/batch_correct_merge.py
+++ b/python/batch_correct_merge.py
@@ -58,7 +58,6 @@
 if multi_mod=="totalvi":
     totalvi_extra_obsm = {'prot': ['totalvi_protein_foreground_prob', 'totalvi_denoised_protein'],
                           'rna': [ 'totalvi_denoised_rna']}
-
     # we will want to keep these so extract them first
     totalvi_extra_obsm_keep = {}
     for k, v in totalvi_extra_obsm.items():
@@ -68,7 +67,21 @@
 
 # this overwrites the old modalities
 for k,v in uni_mod_paths.items():
-    base_obj.mod[k] = mu.read(v)
+    mod_replace = mu.read(v)
+    if mod_replace.shape[1] == base_obj.shape[1]:
+        L.info('replacing the whole anndata object with the batch corrected one')
+        base_obj.mod[k] = mod_replace
+    else:
+        # in this situation the vars in the mod have been subset but the
+        # original object contains all the genes.
+        L.info("number of genes does not match base object, only integrating obsp and obsm into full mdata[%s] object" % k)
+        base_obj.mod[k].obsm = mod_replace.obsm
+        base_obj.mod[k].obsp = mod_replace.obsp
+        base_obj.mod[k].uns = mod_replace.uns
+        base_obj.mod[k].uns['scVI_gene_subset'] = mod_replace.var_names.tolist()
+        # merge any extra obs columns
+        new_obs = mod_replace.obs.iloc[:, ~ mod_replace.obs.columns.isin( base_obj.mod[k].obs.columns)]
+        base_obj.mod[k].obs = base_obj.mod[k].obs.merge(new_obs, left_index=True, right_index=True)
 
 if multi_mod=="totalvi":
     # restore the totalvi extras
diff --git a/python/batch_correct_none.py b/python/batch_correct_none.py
index 6acd134d..21c7b7eb 100644
--- a/python/batch_correct_none.py
+++ b/python/batch_correct_none.py
@@ -71,13 +71,20 @@
 
 # run neighbours and umap without batch correction
+pc_kwargs = {}
+if int(args.neighbors_n_pcs) > 0:
+    pc_kwargs['use_rep'] = "X_pca"
+    pc_kwargs['n_pcs'] = int(args.neighbors_n_pcs)
+else:
+    # If n_pcs==0, scanpy falls back to .X when use_rep is None.
+    pc_kwargs['use_rep'] = None
+    pc_kwargs['n_pcs'] = int(args.neighbors_n_pcs)
+
 run_neighbors_method_choice(adata,
     method=args.neighbors_method,
     n_neighbors=int(args.neighbors_k),
-    n_pcs=int(args.neighbors_n_pcs),
     metric=args.neighbors_metric,
-    use_rep='X_pca',
-    nthreads=max([threads_available, 6]))
+    nthreads=max([threads_available, 6]), **pc_kwargs)
 
 L.info("done n_neighbours, saving stuff")
diff --git a/python/batch_correct_scvi.py b/python/batch_correct_scvi.py
index f0807209..c88fde8f 100644
--- a/python/batch_correct_scvi.py
+++ b/python/batch_correct_scvi.py
@@ -10,8 +10,8 @@
 import muon as mu
 import panpipes.funcs as pp
-from panpipes.funcs.io import read_anndata
-from panpipes.funcs.scmethods import run_neighbors_method_choice
+from panpipes.funcs.io import read_anndata, read_yaml
+from panpipes.funcs.scmethods import run_neighbors_method_choice, X_is_raw
 
 import sys
 import logging
@@ -28,7 +28,7 @@
     default='adata_scaled.h5ad',
     help='')
 parser.add_argument('--raw_anndata',
-    default='adata_raw.h5ad',
+    default=None,
     help='')
 parser.add_argument('--output_csv', default='batch_correction/umap_bc_scvi.csv',
     help='')
@@ -47,12 +47,12 @@
 
 args, opt = parser.parse_known_args()
 
-
+L.info(args)
 
 threads_available = multiprocessing.cpu_count()
 
 # load parameters
-params = pp.io.read_yaml("pipeline.yml")
+params = read_yaml("pipeline.yml")
 
 L.info("Running scvi script")
 L.info("threads available: %s", threads_available)
@@ -77,7 +77,7 @@
 mdata = mu.read(args.scaled_anndata)
 if type(mdata) is mu.MuData:
-    rna = mdata['rna'].copy()
+    rna = mdata['rna']
     L.info(rna)
 elif type(mdata) is sc.AnnData:
     rna = mdata
@@ -106,6 +106,8 @@
 # add in test to see if raw layer exists alread
 if "raw_counts" in rna.layers.keys():
     L.info("raw counts found in layer")
+elif X_is_raw(rna):
+    rna.layers["raw_counts"] = rna.X.copy()
 else:
     L.info("merge in raw counts")
     sc_raw = read_anndata(args.raw_anndata, use_muon=True, modality="rna")
diff --git a/python/batch_correct_totalvi.py b/python/batch_correct_totalvi.py
index df612e48..a132b0c2 100644
--- a/python/batch_correct_totalvi.py
+++ b/python/batch_correct_totalvi.py
@@ -8,17 +8,15 @@
 import os
 import gc
 import muon as mu
-from cgatcore import pipeline as P
-import panpipes.funcs as pp
-from panpipes.funcs.processing import check_for_bool, intersect_obs_by_mod
-from panpipes.funcs.io import read_anndata, write_anndata, read_yaml
+from panpipes.funcs.io import read_anndata, read_yaml
 from panpipes.funcs.scmethods import run_neighbors_method_choice, X_is_raw
+from panpipes.funcs.processing import intersect_obs_by_mod
 
 import sys
 import logging
 L = logging.getLogger()
-L.setLevel(logging.DEBUG)
+L.setLevel(logging.INFO)
 log_handler = logging.StreamHandler(sys.stdout)
 formatter = logging.Formatter('%(asctime)s: %(levelname)s - %(message)s')
 log_handler.setFormatter(formatter)
@@ -30,7 +28,7 @@
     default='adata_scaled.h5ad',
     help='')
 parser.add_argument('--raw_anndata',
-    default='adata_raw.h5ad',
+    default=None,
     help='')
 parser.add_argument('--output_csv', default='batch_correction/umap_bc_totalvi.csv',
     help='')
@@ -60,7 +58,10 @@
 
 # load parameters
 threads_available = multiprocessing.cpu_count()
-params = pp.io.read_yaml("pipeline.yml")
+
+params = read_yaml("pipeline.yml")
+params['sample_prefix']
+
 
 test_script=False
 
@@ -78,6 +79,7 @@
 
 # this is a copy so we can subset vars and obs without changing the original object
 rna = mdata['rna'].copy()
+prot = mdata['prot'].copy()
 
 kwargs={}
 # in case of more than 1 variable, create a fake column with combined information
@@ -129,6 +131,9 @@
 else:
     raise ValueError("adt_outliers column not found in mdata['prot'].obs")
 
+# excluding isotypes
+if 'isotype' in prot.var.columns:
+    prot = prot[:, ~prot.var.isotype]
 
 # filter out mitochondria
 if params['multimodal']['totalvi']['exclude_mt_genes']:
@@ -142,22 +147,25 @@
     rna = rna[:, rna.var.highly_variable]
     mdata.update()
 
+
 #make protein obsm pandas
-# need to find the raw counts in mdata['prot']
-if X_is_raw(mdata['prot']):
-    X_array = mdata['prot'].X.copy()
-elif "raw_counts" in mdata['prot'].layers.keys():
-    X_array = mdata['prot'].layers['raw_counts'].copy()
+# need to find the raw counts in prot
+if X_is_raw(prot):
+    X_array = prot.X.copy()
+elif "raw_counts" in prot.layers.keys():
+    X_array = prot.layers['raw_counts'].copy()
 else:
-    raise AttributeError("raw counts not found in mdata['prot'], \
+    raise AttributeError("raw counts not found in prot, \
         store in either X or in 'raw_counts' layer")
-X_df = pd.DataFrame(X_array.todense(), index=mdata['prot'].obs_names, columns=mdata['prot'].var_names)
+
+X_df = pd.DataFrame(X_array.todense(), index=prot.obs_names, columns=prot.var_names)
 
 if X_df.shape[0] == rna.X.shape[0]:
     L.info("adding protein_expression to obsm")
     #check the obs are in the correct order
     X_df = X_df.loc[rna.obs_names,:]
+    X_df = X_df.astype('int')
     rna.obsm['protein_expression'] = X_df
 else:
     L.error("dimensions do not match, cannot integrate protein")
@@ -229,10 +237,12 @@
 
 if batch_categories is not None:
     L.debug(batch_categories)
+    if type(batch_categories) is not list:
+        batch_categories = [batch_categories]
     normX, protein = vae.get_normalized_expression(
         n_samples=25,
         return_mean=True,
-        transform_batch=[batch_categories]
+        transform_batch=batch_categories
     )
     # this is stored in obsm, because if we have filtered by hvg, it no longer fits specifications for a layer
@@ -243,7 +253,7 @@
     df = vae.get_protein_foreground_probability(
         n_samples=25,
         return_mean=True,
-        transform_batch=[batch_categories]
+        transform_batch=batch_categories
     )
     mdata['prot'].obsm["totalvi_protein_foreground_prob"] = df.loc[mdata['prot'].obs_names,:]
 
diff --git a/python/batch_correct_wnn.py b/python/batch_correct_wnn.py
index 2578f8a2..9dc373c6 100644
--- a/python/batch_correct_wnn.py
+++ b/python/batch_correct_wnn.py
@@ -92,6 +92,7 @@
 
 for kmod in dict_graph.keys():
     L.info(kmod)
+    pkmod=params['multimodal']['WNN']['knn'][kmod]
     if dict_graph[kmod]["obsm"] is not None:
         if dict_graph[kmod]["obsm"] not in tmp.mod[kmod].obsm.keys():
             if dict_graph[kmod]["anndata"] is not None:
@@ -114,8 +115,11 @@
         if repuse not in tmp.mod[kmod].obsm.keys():
             if "X_LSI" in tmp.mod[kmod].obsm.keys():
                 repuse = "X_LSI"
-                L.info("falling back on %s" %(repuse) )
-    pkmod=params['multimodal']['WNN']['knn'][kmod]
+            else:
+                L.info("falling back on %s" %(repuse) )
+    if int(pkmod['npcs']) == 0:
+        repuse = None
+
     L.info("calculating neighbours")
     if repuse != "X_bbknn":
         run_neighbors_method_choice(tmp.mod[kmod],
diff --git a/python/collate_mdata.py b/python/collate_mdata.py
index 5b5306db..23ae8d79 100644
--- a/python/collate_mdata.py
+++ b/python/collate_mdata.py
@@ -34,6 +34,7 @@
 L.info(args)
 
 mdata = mu.read(args.input_mudata)
+L.info("loading clusters")
 cf = pd.read_csv(args.clusters_files_csv)
 
 if isinstance(mdata, MuData):
@@ -86,4 +87,4 @@
 mdata.write(args.output_mudata)
 mdata.obs.to_csv(re.sub(".h5mu", "_cell_metdata.tsv", args.output_mudata), sep='\t')
 
-L.info("done")
\ No newline at end of file
+L.info("done")
diff --git a/python/downsample.py b/python/downsample.py
index a1e094bb..138d8849 100644
--- a/python/downsample.py
+++ b/python/downsample.py
@@ -7,6 +7,7 @@
 from muon import MuData
 from anndata import AnnData
 import muon as mu
+import re
 
 from panpipes.funcs.io import read_anndata, write_anndata, write_obs
 from panpipes.funcs.processing import downsample_mudata, intersect_obs_by_mod, check_for_bool
@@ -87,10 +88,10 @@
 L.info(mdata.obs.sample_id.value_counts())
 
 # write out the observations
-write_obs(mdata, output_prefix=args.sampleprefix,
+write_obs(mdata, output_prefix=re.sub("\\.(.*)", "", args.output_mudata),
     output_suffix="_downsampled_cell_metadata.tsv")
 
-mdata.write(args.input_mudata)
+mdata.write(args.output_mudata)
 
 #This stage is the point (i.e. pre-normalisation) where the mdata file can be outputted so that we can extract raw matrix for the cellphonedb.
 L.info("Completed")
\ No newline at end of file
diff --git a/python/plot_cluster_umaps.py b/python/plot_cluster_umaps.py
index 37748a15..d1a0ea8c 100644
--- a/python/plot_cluster_umaps.py
+++ b/python/plot_cluster_umaps.py
@@ -28,9 +28,9 @@
 parser.add_argument("--infile",
     default="mdata.h5mu",
     help="file name, format: .h5mu")
-parser.add_argument("--figdir",
-    default=None,
-    help="figures directory")
+parser.add_argument("--modalities",
+    default="rna",
+    help="list of modalities to search for UMAPs in")
 
 args, opt = parser.parse_known_args()
 L.info("running with:")
@@ -44,7 +44,7 @@ def main(adata,figdir):
     obsm_keys = [x for x in adata.obsm.keys() if re.search(pattern, x)]
     L.info("Umap keys founds %s" % obsm_keys)
     # get all possible clustersclusters
-    pattern=re.compile(r'^leiden|^louvain_res')
+    pattern=re.compile(r'^leiden|^louvain')
     cluster_keys = [x for x in adata.obs.columns if re.search(pattern, x)]
     L.info("Cluster keys founds %s" % cluster_keys)
     if len(obsm_keys) == 0 or len(cluster_keys) == 0:
@@ -65,23 +65,24 @@ def main(adata,figdir):
 
 L.debug("load data")
 mdata = read(args.infile)
+mods = args.modalities.split(',')
 
 # detemin initial figure directory based on object type
-if args.figdir is None:
-    if type(mdata) is MuData :
-        figdir = "multimodal/figures"
-        L.info("plotting multimodal umaps")
-    else:
-        figdir = "figures"
-        L.info("plotting umaps from anndata")
-else:
-    figdir = args.figdir
+
 
 # do plotting
-main(mdata, figdir)
+if 'multimodal' in mods:
+    if os.path.exists("multimodal/figures") is False:
+        os.makedirs("multimodal/figures")
+    main(mdata, figdir="multimodal/figures")
 
 # we also need to plot per modality
 if type(mdata) is MuData:
     for mod in mdata.mod.keys():
-        L.info("plotting for modality: %s" % mod)
-        figdir = os.path.join(mod, "figures")
-        main(mdata[mod], figdir)
+        if mod in mods:
+            L.info("plotting for modality: %s" % mod)
+            figdir = os.path.join(mod, "figures")
+            if os.path.exists(figdir) is False:
+                os.makedirs(figdir)
+            main(mdata[mod], figdir)
+
+L.info('done')
\ No newline at end of file
diff --git a/python/plot_custom_markers.py b/python/plot_custom_markers.py
index 19174bce..0f6c24a9 100644
--- a/python/plot_custom_markers.py
+++ b/python/plot_custom_markers.py
@@ -130,21 +130,11 @@ def main(adata, mod, df, grouping_var, pfx, layer_choice=None):
 for mod in modalities:
     print(mod)
     df_sub = df[df['mod'] == mod]
-    # make sure every grouping var is a category
-    if mod is not "multimodal":
-        for gv in group_vars:
-            mdata[mod].obs[gv] = mdata.obs.loc[mdata[mod].obs_names,gv].astype('category')
-            mdata.update_obs()
-    else:
-        for gv in group_vars:
-            mdata.obs[gv] = mdata.obs.loc[mdata.obs_names,gv].astype('category')
-            mdata.update_obs()
-    # pull out the layers and set ll to None if no layer sepcificed (so the plotting funcs will use X)
-    try:
-        ll = layers[mod]
-    except KeyError:
-        ll = [None]
-    if len(group_vars) > 0 and ll is not None:
+    for gv in group_vars:
+        mdata[mod].obs[gv] = mdata.obs.loc[mdata[mod].obs_names,gv].astype('category')
+        mdata.update_obs()
+    ll = layers[mod]
+    if len(ll) > 0 and len(group_vars) > 0:
         for gv, layer in product(group_vars, ll):
             print(gv, layer)
             main(adata=mdata[mod],
diff --git a/python/plot_custom_markers_umap.py b/python/plot_custom_markers_umap.py
index e3029728..9d268f2d 100644
--- a/python/plot_custom_markers_umap.py
+++ b/python/plot_custom_markers_umap.py
@@ -105,19 +105,12 @@ def main(adata, mod, layer_choice, df, basis):
     print(mod)
     df_sub = df[df['mod'] == mod]
     mdata.update_obs()
-    # pull out the layers and set ll to None
-    # if no layer sepcificed
-    # (so the plotting funcs will use X)
-    try:
-        ll = layers[mod]
-    except KeyError:
-        ll = [None]
-    # pull out the basis e.g. X_umap
+    ll = layers[mod]
     if mod in basis_dict.keys():
         bb= basis_dict[mod]
     else:
         bb = []
-    if len(bb) > 0:
+    if len(ll) > 0 and len(bb) > 0:
         for basis, layer in product(bb, ll):
             print(basis,layer)
             main(adata=mdata[mod],
diff --git a/python/plot_scanpy_markers.py b/python/plot_scanpy_markers.py
index 74b88969..e6f167f2 100644
--- a/python/plot_scanpy_markers.py
+++ b/python/plot_scanpy_markers.py
@@ -59,81 +59,78 @@
 
 # script
 
+def calc_dendrogram(adata, group_col):
+    if len(adata.obs[group_col].cat.categories) > 5:
+        incl_dendrogram = True
+        if "X_pca" not in adata.obsm.keys():
+            sc.pp.pca(adata)
+        L.info("calculating dendrogram")
+        sc.tl.dendrogram(adata, groupby=group_col, use_rep="X_pca")
+    else:
+        incl_dendrogram = False
+    return incl_dendrogram
+
 
-def main(adata, mod, layer, group_col, mf, n):
+def do_plots(adata, mod, group_col, mf, n=10, layer=None):
     L.debug("check layers")
-    if layer != 'X':
-        adata.X = adata.layers[layer]
-    # run pca if not done already.( for dendorgram calc)
-    if("X_pca" in adata.obsm.keys()):
-        sc.pp.pca(adata)
-    # read file
+    # get markers for plotting
+    mf = mf[mf['avg_logFC'] > 0]
     df = mf.groupby(group_col).apply(lambda x: x.nlargest(n, ['scores'])).reset_index(drop=True)
     marker_list={str(k): list(v) for k,v in df.groupby(group_col)["gene"]}
     # add cluseter col to obs
-    adata.obs[group_col] = adata.obs[group_col].astype('str').astype('category')
-    L.debug("do plots")
+    # check whether a dendrogram should be included
+    incl_dendrogram = calc_dendrogram(adata, group_col)
+    L.info("start plotting")
     sc.pl.stacked_violin(adata,
                 marker_list,
                 groupby=group_col,
-                save= '_top_markers'+ mod +'.png',
-                figsize=(24, 5))
+                layer=layer,
+                save = '_top_markers'+ mod +'.png',
+                dendrogram=incl_dendrogram,
+                # figsize=(24, 5)
+                )
     sc.pl.matrixplot(adata,
                 marker_list,
                 groupby=group_col,
                 save= '_top_markers'+ mod +'.png',
+                dendrogram=incl_dendrogram,
                 figsize=(24, 5))
     sc.pl.dotplot(adata,
                 marker_list,
                 groupby=group_col,
                 save= '_top_markers'+ mod +'.png',
+                dendrogram=incl_dendrogram,
                 figsize=(24, 5))
     sc.pl.heatmap(adata,
                 marker_list,
                 groupby=group_col,
                 save= '_top_markers'+ mod +'.png',
-                figsize=(24, 5))
+                dendrogram=incl_dendrogram,
+                # figsize=(24, 5)
+                )
 
-L.debug("load data")
+# read data
 mdata = mu.read(args.infile)
-
-L.debug("load marker file")
-mf = pd.read_csv(args.marker_file, sep='\t' )
-mf[args.group_col] = mf['cluster'].astype('str').astype('category')
-# get layer
-try:
-    layers = read_yaml(args.layer)
-except AttributeError:
-    layers = args.layer
-
-if type(mdata) is mu.MuData and args.modality is None and type(layers) is not dict:
-    sys.exit('if inputting a mudata object, layers must be in a dictionary')
 
 if type(mdata) is AnnData:
     adata = mdata
-    main(adata,mod=args.modality, layer=args.layer, n=args.n)
-elif args.modality is not None:
-    adata = mdata[args.modality]
-    if type(layers) is dict:
-        ll = layers[args.modality]
-    else:
-        ll = layers
-    main(adata,
-        mod=args.modality,
-        layer = ll,
-        mf=mf,
-        group_col = args.group_col,
-        n=int(args.n))
+    # the main function only does rank_genes_groups on X
+elif type(mdata) is mu.MuData and args.modality is not None:
+    adata = mdata[args.modality]
 else:
-    # we have multimodal object, and some kind of multimodal clustering output
-    for mod in mf['mod'].unique():
-        print(mod)
-        mf_sub = mf[mf['mod'] == mod]
-        mdata[mod].obs[args.group_col] = mdata.obs.loc[mdata[mod].obs_names,args.group_col].astype('category')
-        mdata.update_obs()
-        main(mdata[mod], mod=mod, layer = layers[mod], group_col = args.group_col, mf=mf_sub, n=int(args.n))
+    sys.exit('if inputting a mudata object, you need to specify a modality')
+
+L.info("load marker file")
+mf = pd.read_csv(args.marker_file, sep='\t' )
+mf[args.group_col] = mf['cluster'].astype('category')
+# get layer
+adata.obs[args.group_col] = mdata.obs[args.group_col]
+do_plots(adata,
+    mod=args.modality,
+    group_col=args.group_col,
+    mf=mf,
+    layer=args.layer,
+    n=int(args.n))
diff --git a/python/plot_umaps_batch_correct.py b/python/plot_umaps_batch_correct.py
index 88dae768..5c2aca59 100644
--- a/python/plot_umaps_batch_correct.py
+++ b/python/plot_umaps_batch_correct.py
@@ -178,3 +178,4 @@
             g.savefig(os.path.join(args.fig_dir, mod, "umap_method_" + qc + ".png"), dpi = 300)
             plt.clf()
 
+L.info('done')
\ No newline at end of file
diff --git a/python/plot_variables_umaps.py b/python/plot_variables_umaps.py
index 8c4cef73..ec44c0f0 100644
--- a/python/plot_variables_umaps.py
+++ b/python/plot_variables_umaps.py
@@ -8,7 +8,7 @@
 from anndata import AnnData
 import pandas as pd
 import argparse
-from panpipes.funcs.io import read_yaml
+from panpipes.funcs.io import read_yaml, dictionary_stripper
 from itertools import product, chain
 
 sc.settings.autoshow = False
@@ -69,10 +69,11 @@ def main(adata, mod, plot_features, basis, fig_suffix):
 
 # get bases
-basis_dict = read_yaml(args.basis_dict)
+basis_dict = dictionary_stripper(read_yaml(args.basis_dict))
 
 if args.categorical_variables is not None:
-    cat_vars = read_yaml(args.categorical_variables)
+    cat_vars = dictionary_stripper(read_yaml(args.categorical_variables))
+
     uniq_discrete = list(set(chain(*cat_vars.values())))
     # make sure they are categories
     mdata.obs[uniq_discrete] = mdata.obs[uniq_discrete].apply(lambda x: x. astype('category'))
@@ -82,7 +83,7 @@ def main(adata, mod, plot_features, basis, fig_suffix):
 
 if args.continuous_variables is not None:
     try:
-        cont_vars = read_yaml(args.continuous_variables)
+        cont_vars = dictionary_stripper(read_yaml(args.continuous_variables))
     except AttributeError:
         # this assumes that we have tried to parse a dict and nstead found a string
         # there is probably a better solution
diff --git a/python/refmap_scvitools.py b/python/refmap_scvitools.py
index 8c4dc476..d05ed303 100644
--- a/python/refmap_scvitools.py
+++ b/python/refmap_scvitools.py
@@ -1,7 +1,7 @@
 import multiprocessing
 from matplotlib import transforms
 
-
+import anndata as ad
 import scanpy as sc
 import pandas as pd
 import matplotlib.pyplot as plt
@@ -30,6 +30,8 @@
 parser.add_argument('--query_data',
                     default='adata_scaled.h5ad',
                     help='path to query data. can be a raw 10x dataset or a preprocessed anndata/mudata')
+parser.add_argument('--query_celltype', default='celltype',
+                    help='name of the obs column with cell labels, if the query already has one. Defaults to "celltype"')
 parser.add_argument('--adata_reference', default='adata_.h5ad',
                     help='path to reference ann/mudata. necessary only if you want to produce ref/ query map')
@@ -40,6 +42,8 @@
                     help='scvi, scanvi, totalvi supported')
 # parser.add_argument('--generate_scanvi', default=False,
 #                     help='create scanvi reference from scvi to allow for label transfer')
+parser.add_argument('--predict_rf', default=False,
+                    help='for totalvi and scvi, if the reference model has a random forest classifier, classify cells in the query')
 parser.add_argument('--impute_proteins', default=False,
                     help='if totalvi reference is used, allow to impute protein levels')
 parser.add_argument('--transform_batch', default=None,
@@ -56,7 +60,10 @@
 args, opt = parser.parse_known_args()
 
 sc.settings.figdir = "figures/"
+sc.set_figure_params(figsize=(8, 6), dpi=300)
 L.info("running with args:")
+args.predict_rf = check_for_bool(args.predict_rf)
+args.impute_proteins = check_for_bool(args.impute_proteins)
 L.info(args)
 threads_available = multiprocessing.cpu_count()
 
@@ -65,6 +72,9 @@
 params = read_yaml("pipeline.yml")
 
 query_data = os.path.basename(args.query_data)
+reference_architecture = str(args.reference_architecture)
+
+
 mdata = mu.read(args.query_data)
 
 if type(mdata) is mu.MuData:
@@ -72,6 +82,20 @@
         sys.exit("we only support querying using RNA but your mdata doesn't contain rna")
     else:
         adata_query = mdata["rna"].copy()
+        if "prot" in mdata.mod.keys() and reference_architecture=="totalvi":
+            if X_is_raw(mdata['prot']):
+                X_array = mdata['prot'].X.copy()
+            elif "raw_counts" in mdata['prot'].layers.keys():
+                X_array = mdata['prot'].layers['raw_counts'].copy()
+
+            X_df = pd.DataFrame(X_array.todense(), index=mdata['prot'].obs_names, columns=mdata['prot'].var_names)
+            if X_df.shape[0] == adata_query.X.shape[0]:
+                L.info("adding protein_expression to obsm")
+                #check the obs are in the correct order
+                X_df = X_df.loc[adata_query.obs_names,:]
+                adata_query.obsm['protein_expression'] = X_df
+            else:
+                L.error("dimensions do not match, cannot create the raw obsm counts in query")
 else:
     adata_query = mdata.copy()
 
@@ -87,7 +111,7 @@
     raise AttributeError("raw counts not found in query, \
         store in either X or in 'counts/raw_counts' layer")
 
-reference_architecture = str(args.reference_architecture)
+
 if reference_architecture in ['totalvi','scvi','scanvi']:
     L.info("running using %s" % reference_architecture)
 else:
@@ -136,30 +160,132 @@
     vae_q.train(max_epochs= max_epochs , plan_kwargs=train_kwargs)
     adata_query.obsm["X_scANVI"] = vae_q.get_latent_representation()
     adata_query.obs["predictions"] = vae_q.predict()
-    df = adata_query.obs.groupby(["celltype", "predictions"]).size().unstack(fill_value=0)
-    norm_df = df / df.sum(axis=0)
-
-    plt.figure(figsize=(8, 8))
-    _ = plt.pcolor(norm_df)
-    _ = plt.xticks(np.arange(0.5, len(df.columns), 1), df.columns, rotation=90)
-    _ = plt.yticks(np.arange(0.5, len(df.index), 1), df.index)
-    plt.xlabel("Predicted")
-    plt.ylabel("Observed")
-    file_name = "SCANVI_predicted_vs_observed_labels_query_data"
-    plt.savefig(os.path.join("figures/", file_name + ".png"))
-
+    if args.query_celltype is not None:
+        L.info("Query has celltypes in column %s, plotting predictions from the scanvi model against them" % args.query_celltype)
+        df = adata_query.obs.groupby([str(args.query_celltype), "predictions"]).size().unstack(fill_value=0)
+        norm_df = df / df.sum(axis=0)
+
+        plt.figure(figsize=(8, 8))
+        _ = plt.pcolor(norm_df)
+        _ = plt.xticks(np.arange(0.5, len(df.columns), 1), df.columns, rotation=90)
+        _ = plt.yticks(np.arange(0.5, len(df.index), 1), df.index)
+        plt.xlabel("Predicted")
+        plt.ylabel("Observed")
+        file_name = "SCANVI_predicted_vs_observed_labels_query_data"
+        plt.savefig(os.path.join("figures/", file_name + ".png"))
+
 if reference_architecture=="totalvi":
+    # temporary fix, disabled for now; needs modifying so the query can be fixed to match the reference
+    fix_query = False
+    if fix_query:
+        L.info("will do some manipulation of query data to make it match the reference structure")
+        if args.adata_reference is not None:
+            reference_data = os.path.basename(args.adata_reference)
+            mdata=mu.read(args.adata_reference)
+            if type(mdata) is MuData:
+                if "rna" not in mdata.mod.keys():
+                    sys.exit("we only support querying using RNA but your mdata doesn't contain rna")
+                else:
+                    adata_ref = mdata["rna"].copy()
+                    adata_ref.obsm = mdata.obsm.copy()
+                    adata_ref.obsp = mdata.obsp.copy()
+            else:
+                adata_ref = mdata.copy()
+            del mdata
+            # match query and reference vars
+
+            #adata_query.layers["counts"] = adata_query.X.copy() #already taken care of
+            # are the following necessary?
+            sc.pp.normalize_total(adata_query, target_sum=1e4)
+            sc.pp.log1p(adata_query)
+            adata_query.raw = adata_query
+            # subset to reference vars
+            adata_query = adata_query[:, adata_ref.var_names].copy()
+
+            adata_query.obs["celltype.l2"] = "Unknown"
+            adata_query.obs["orig.ident"] = adata_query.obs["set"]
+            #query.obsm["X_umap"] = query.obs[["UMAP1", "UMAP2"]]
+
+            # the obsm slot for protein counts can be called either protein_counts or protein_expression
+            # in the reference anndata and model, so allow either
+            if "protein_expression" in adata_ref.obsm.keys():
+                pname = "protein_expression"
+            elif "protein_counts" in adata_ref.obsm.keys():
+                pname = "protein_counts"
+
+            if pname not in adata_query.obsm.keys():
+                X_df = pd.DataFrame(0, index=adata_ref.obs_names, columns=adata_ref.obsm[pname].columns)
+
+            adata_query.obsm["protein_counts"] = query.obsm["pro_exp"].copy()
+
+            for p in adata_ref.obsm[pname].columns:
+                if p not in adata_query.obsm[pname].columns:
+                    adata_query.obsm[pname][p] = 0.0
+            # ensure columns are in same order
+            adata_query.obsm[pname] = adata_query.obsm[pname].loc[:, adata_ref.obsm[pname].columns]
+
+        else:
+            # For now make this only work when reference anndata is available
+            # if args.reference_var is not None:
+            #     reference_var = pd.read(args.reference_var, sep="\t")
+            # else:
+            #     sys.exit("I need var names from reference to make sure query has overlap")
+            # if args.reference_prot is not None:
+            #     reference_prot = pd.read(args.reference_prot, sep="\t")
+            # else:
+            #     sys.exit("I need var names from reference to make sure query has overlap")
+            # if args.reference_prot_assay is not None:
+            #     L.info("name of obsm slot for reference is: %i" % args.reference_prot_assay)
+            #     pname = args.reference_prot_assay
+            sys.exit("need a reference dataset to check for matching query entries")
+
     scvi.model.TOTALVI.prepare_query_anndata(adata_query, reference_path)
     vae_q = scvi.model.TOTALVI.load_query_data(
         adata_query,
         reference_path,
         freeze_expression=True
     )
-    latent_choice= "X_totalVI"
+    latent_choice= "X_totalvi"
     vae_q.train(max_epochs= max_epochs , plan_kwargs=train_kwargs)
-    adata_query.obsm["X_totalVI"] = vae_q.get_latent_representation()
+    adata_query.obsm["X_totalvi"] = vae_q.get_latent_representation()
+    #remove this after finishing
+    adata_query.write(os.path.join( "query_temp_check_totvi.h5ad"))
+
+    if args.predict_rf :
+        L.info("predicting celltypes")
+        predictions = (
+            vae_q.latent_space_classifer_.predict(
+                adata_query.obsm["X_totalvi"]
+            )
+        )
+        adata_query.obs["predictions"] = predictions
+        #remove this after finishing
+        adata_query.write(os.path.join( "query_temp_check_predictions_totvi.h5ad"))
+
+    if args.query_celltype is not None:
+        L.info("Query has celltypes in column %s, plotting predictions from the totalvi model against them" % args.query_celltype)
+        df = adata_query.obs.groupby([str(args.query_celltype), "predictions"]).size().unstack(fill_value=0)
+        norm_df = df / df.sum(axis=0)
+
+        plt.figure(figsize=(8, 8))
+        _ = plt.pcolor(norm_df)
+        _ = plt.xticks(np.arange(0.5, len(df.columns), 1), df.columns, rotation=90)
+        _ = plt.yticks(np.arange(0.5, len(df.index), 1), df.index)
+        plt.xlabel("Predicted")
+        plt.ylabel("Observed")
+        file_name = "totalvi_predicted_vs_observed_labels_query_data"
+        plt.savefig(os.path.join("figures/", file_name + ".png"))
 
+L.info("Mapped Query to Reference, now some utilities")
+
 if args.adata_reference is not None:
     reference_data = os.path.basename(args.adata_reference)
     mdata=mu.read(args.adata_reference)
@@ -175,8 +301,10 @@
 
     adata_ref.obs.loc[:, 'is_reference'] = 'Reference'
     adata_query.obs.loc[:, 'is_reference'] = 'Query'
-
-    adata_full = adata_query.concatenate(adata_ref, batch_key="batch_key")
+    #expect the batch to be always encoded as `batch` in both Q and R
+    adata_full = ad.concat( [adata_ref,adata_query])
+    # add a param to decide whether to reuse the reference embedding or recompute it
+    #if "X_totalvi" not in adata_ref.obsm.keys():
     adata_full.obsm[latent_choice] = vae_q.get_latent_representation(adata_full)
 
     if args.impute_proteins:
@@ -200,10 +328,10 @@
     adata_full = adata_query.copy()
 
 if int(args.neighbors_n_pcs) > adata_full.obsm[latent_choice].shape[1]:
-    L.warn("N PCs is larger than %i dimensions, reducing n PCs to" % adata_full.obsm[latent_choice].shape[1] -1)
+    L.warn("N PCs is larger than the latent dimensions (%i), reducing n_pcs to match" % adata_full.obsm[latent_choice].shape[1])
 
-
-n_pcs= min(int(args.neighbors_n_pcs), adata_full.obsm[latent_choice].shape[1]-1)
+n_pcs= min(int(args.neighbors_n_pcs), adata_full.obsm[latent_choice].shape[1])
 
 run_neighbors_method_choice(adata_full,
@@ -220,24 +348,23 @@
 model_name = ''.join(e for e in model_name if e.isalnum())
 
-file_name= "umap" + model_name + "_" + latent_choice
+file_name= "umap_" + model_name + "_" + latent_choice
 fig = sc.pl.embedding(adata_full, basis = "umap",color=["is_reference"],
                       show=False, return_fig=True)
 fig.savefig(os.path.join("figures/", file_name + ".png"))
 
-umap = pd.DataFrame(mdata.obsm['X_umap'], mdata.obs.index)
+umap = pd.DataFrame(adata_full.obsm['X_umap'], adata_full.obs.index)
 umap.to_csv(os.path.join("refmap/", file_name + ".csv") )
 
 file_name= "query_to_reference_" + model_name + "_" + latent_choice + ".h5mu"
 #change this to mudata
 mdata_save = MuData({"rna":adata_full})
-write_anndata(mdata_save, use_muon=True, modality="rna", fname=os.path.join( "refmap/" , file_name))
-
-
+mdata_save.write(os.path.join( "refmap/" , file_name))
+L.info('done')
diff --git a/python/run_find_markers_multi.py b/python/run_find_markers_multi.py
index 66225d17..387e40ab 100644
--- a/python/run_find_markers_multi.py
+++ b/python/run_find_markers_multi.py
@@ -3,14 +3,11 @@
 from anndata import AnnData
 import argparse
 import pandas as pd
-import re
 import numpy as np
 from scipy.sparse import issparse
-from itertools import compress
-
 from panpipes.funcs.processing import check_for_bool
-from panpipes.funcs.scmethods import pseudo_seurat
-from panpipes.funcs.io import read_yaml
+from panpipes.funcs.scmethods import find_all_markers_pseudo_seurat
+
 import sys
 import logging
@@ -47,20 +44,14 @@
     help="testing limited to genes with this (log scale) difference in mean expression level.")
 parser.add_argument("--mincells", type=str, default='3',
     help="minimum number of cells required (applies to cluster and to the other cells)")
-parser.add_argument("--use_dense", type=str, default="False")
+
 
 args, opt = parser.parse_known_args()
 
-# args = argparse.Namespace(infile='prot/ncomps3_nneigh30_meteuclidean_dir/neighbors.h5ad', modality='prot', layer='rna', output_file_prefix='prot/ncomps3_nneigh30_meteuclidean_dir/algleiden_res1/markers', cluster_file='prot/ncomps3_nneigh30_meteuclidean_dir/algleiden_res1/clusters.txt.gz', testuse='wilcoxon', pseudo_seurat='False', minpct='0.1', mindiffpct='-inf', threshuse='0.25', mincells='10', use_dense='False')
 
 # script -------------------------------
 
-def main(adata, args, mod="", marker_columns=[]):
-    # check the X slot actually contains data
-    if adata.X.shape[1] == 0:
-        L.info("skipping %s because adata.X contains no values" % mod)
-        return None
-
+def check_log1p_dict(adata):
     # temp code to get around this bug
     # https://github.com/scverse/muon/issues/40
     if 'log1p' in adata.uns.keys():
@@ -68,97 +59,135 @@
             L.debug( "log1p dict is Empty, refilling dict")
             adata.uns['log1p'] = {'base': None}
 
+
+def load_and_merge_clusters(adata, cluster_file):
     # load clusterss
-    df = pd.read_csv(args.cluster_file, sep="\t", index_col=0)
+    df = pd.read_csv(cluster_file, sep="\t", index_col=0)
     adata_shape = adata.shape
-
     # add clusters to adata.obs
     adata.obs = pd.merge(adata.obs, df, how="left", left_index=True, right_index=True)
     # check check we have not lost cells in merge
     if adata.shape != adata_shape:
         L.info("some cells lost in merge, not all cells have a cluster?")
     L.debug(adata.shape)
-    if issparse(adata.X):
-        bool_list = (adata.X.sum(axis=0) > 0).tolist()[0]
-    else:
-        bool_list = (adata.X.sum(axis=0) > 0).tolist()
+
+
+def get_header():
+    # write out the correct header before we start; it needs writing up front
+    # because we use append mode for the multimodal find markers
+    # and we don't want to end up with a new header halfway through the file
+    if sc.__version__ < "1.7.0":
+        markers_columns = ['cluster', 'scores', 'gene', 'avg_logFC', 'pvals', 'p.adj.bonferroni']
+    elif sc.__version__ >= "1.7.0":
+        markers_columns = ['cluster', 'gene', 'scores', 'avg_logFC', 'pvals', 'p.adj.bonferroni']
+    return markers_columns
+
+
+def filter_zero_count_genes(adata, layer=None):
+    if layer is None:
+        if issparse(adata.X):
+            bool_list = (adata.X.sum(axis=0) > 0).tolist()[0]
+        else:
+            bool_list = (adata.X.sum(axis=0) > 0).tolist()
+    else:
+        if issparse(adata.layers[layer]):
+            bool_list = (adata.layers[layer].sum(axis=0) > 0).tolist()[0]
+        else:
+            bool_list = (adata.layers[layer].sum(axis=0) > 0).tolist()
     adata = adata[:, bool_list]
     L.debug(adata.shape)
-    run_pseudo_seurat = check_for_bool(args.pseudo_seurat)
-    use_dense = check_for_bool(args.use_dense)
-
-    clust_vals = set(adata.obs["clusters"])
-    markers_dict = {}
-    filter_stats_dict = {}
-
-    for cv in clust_vals:
-        L.info(cv)
-        # 1. set idents
-        adata.obs["idents"] = ["1" if cc == cv else "0" for cc in adata.obs["clusters"]]
-        adata.obs["idents"] = adata.obs["idents"].astype("category")
-        # check we have enough cells
-        if args.mincells == "None":
-            min_cells = 3
-        else:
-            min_cells = int(args.mincells)
-        n_cluster = sum([cell == '1' for cell in adata.obs['idents']])
-        n_other = sum([cell == '0' for cell in adata.obs['idents']])
-        if n_cluster <= int(args.mincells) | n_other <= int(args.mincells):
-            sys.exit("not enough cells in cluster, min cells: %i" % int(args.mincells))
-        L.info("Cluster %s number of cells: %i" % (cv, n_cluster))
-        L.info("Other number of cells: %i\n" % n_other)
-        # adata = adata.raw.to_adata().copy()
-        if run_pseudo_seurat is True:
-            filter_stats = pseudo_seurat(adata, use_dense=use_dense)
-            L.info("number of genes remaining after filtering: %i\n" % filter_stats['background'].sum())
-            adata_rg = adata[:, filter_stats['background'].tolist()].copy()
-
-            sc.tl.rank_genes_groups(adata_rg, layer=None,
-                groups=["1"], groupby="idents", reference="0",
-                method="wilcoxon", n_genes=float("inf"), corr_method="bonferroni")
-
-            markers = sc.get.rank_genes_groups_df(adata_rg, group="1",)
-            # remove adata from mem
-            adata_rg = None
-        else:
-            sc.tl.rank_genes_groups(adata, groups=["1"], groupby="idents", reference="0",
-                method="t-test_overestim_var",
-                n_genes=float("inf"),
-                corr_method="bonferroni", layer=None)
-            markers = sc.get.rank_genes_groups_df(adata, group="1")
-        # filter positive only and adjusted pval < 0.05
-        # markers = markers[(markers.logfoldchanges > 0) & (markers.pvals_adj < 0.05)]
-
-        markers.head()
+
+
+def run_clustering(adata,
+        cluster_col=None,
+        cluster_groups='all',
+        layer=None,
+        methoduse=None,
+        pseudo_seurat=False):
+    if type(cluster_groups) is list:
+        # make sure the clusters are strings else rank_gene_groups crashes
+        cluster_groups = [str(x) for x in cluster_groups]
+    adata.obs[cluster_col] = adata.obs[cluster_col].astype('str').astype('category')
+    if pseudo_seurat:
+        L.info('running rank gene groups with pseudo-seurat method')
+        markers, filter_stats = find_all_markers_pseudo_seurat(adata,
+            groups=cluster_groups,
+            groupby=cluster_col,
+            method=methoduse,
+            n_genes=float("inf"),
+            corr_method="bonferroni",
+            layer=layer,
+            arg_minpct=0.1,
+            arg_mindiffpct=-float("inf"),
+            arg_logfcdiff=0.25)
+        markers = markers.reset_index().drop(columns='level_1').rename(columns={'level_0':'gene'})
+    else:
+        # find markers
+        L.info('running rank gene groups with standard scanpy implementation')
+        sc.tl.rank_genes_groups(adata,
+            groups=cluster_groups,
+            groupby=cluster_col,
+            method=methoduse,
+            n_genes=float("inf"),
+            corr_method="bonferroni",
+            layer=layer)
+        markers = sc.get.rank_genes_groups_df(adata, group=None)
         L.info("Rank gene groups complete")
-        markers_dict[cv] = markers
-        if pseudo_seurat is True:
-            filter_stats_dict[cv] = filter_stats
-
-
-    all_markers = pd.concat(markers_dict, axis=0, keys=markers_dict.keys())
-    all_markers.columns = marker_columns[1:6]
-    all_markers = all_markers.reset_index().rename(columns={'level_0':'cluster'}).drop(columns="level_1")
+        filter_stats = None
+    return markers, filter_stats
+
+
+def main(adata,
+        cluster_file,
+        mod,
+        mincells=None,
+        layer=None,
+        testuse=None,
+        pseudo_seurat=False,
+        output_file_prefix="markers") :
+    # check the X slot actually contains data
+    if adata.X.shape[1] == 0:
+        L.info("exiting because adata.X contains no values")
+        sys.exit("exiting because adata.X contains no values")
+    # prevent log1p base error.
+    check_log1p_dict(adata)
+    # load clusters
+    load_and_merge_clusters(adata, cluster_file)
+    # filter out genes with zero counts
+    filter_zero_count_genes(adata, layer=layer)
+    # filter out clusters with less than min cells
+    if mincells is not None:
+        min_cell_df = adata.obs['clusters'].value_counts()
+        if any(min_cell_df < mincells):
+            exclude = min_cell_df[min_cell_df < mincells].index
+            L.info('excluding clusters for having too few cells: %s' % str(list(exclude)))
+        clust_vals = list(set(min_cell_df[min_cell_df >= mincells].index))
+    else:
+        clust_vals = list(set(adata.obs["clusters"]))
+    all_markers, all_filter_stats = run_clustering(adata,
+        cluster_groups=clust_vals,
+        cluster_col="clusters",
+        layer=layer,
+        methoduse=testuse,
+        pseudo_seurat=pseudo_seurat)
+    # make sure all files have consistent headers
+    all_markers.columns = get_header()
     all_markers['mod'] = mod
-    all_markers.to_csv(args.output_file_prefix + ".txt", index=False, sep='\t', header=None, mode="a")
-
+    all_markers.to_csv(output_file_prefix + ".txt", index=False, sep='\t')
+    # subset to significant markers
     signif_markers = all_markers[(all_markers["p.adj.bonferroni"] < 0.05) &( all_markers.avg_logFC > 0)]
     if signif_markers.shape[0] !=0:
-        signif_markers.to_csv(args.output_file_prefix + mod + "_signif.txt", index=False, sep='\t')
-        L.debug("test_debug")
-        excel_file_top = args.output_file_prefix + mod + "_signif.xlsx"
+        signif_markers.to_csv(output_file_prefix + mod + "_signif.txt", index=False, sep='\t')
+        # write out to excel
+        excel_file_top = output_file_prefix + mod + "_signif.xlsx"
         with pd.ExcelWriter(excel_file_top) as writer:
             for xx in signif_markers.cluster.unique():
-                L.debug(xx)
                 df_sub = signif_markers[signif_markers.cluster == xx]
                 if df_sub.shape[0] !=0:
                     df_sub.to_excel(writer, sheet_name="cluster" + str(xx), index=False)
-
+    # when relevant write out filter stats
     if pseudo_seurat is True:
-        all_filter_stats = pd.concat( filter_stats_dict, axis=0, keys= filter_stats_dict.keys(), ignore_index=True)
         all_filter_stats = all_filter_stats.reset_index().rename(columns={'index':'cluster'})
-        all_filter_stats.to_csv(args.output_file_prefix + "_filter_stats_" + mod + ".txt", index=False, sep="\t")
+        all_filter_stats.to_csv(output_file_prefix + "_filter_stats_" + mod + ".txt", index=False, sep="\t")
 
@@ -166,48 +195,28 @@ def main(adata, args, mod="", marker_columns=[]):
 L.info(args)
 L.info('\n')
 
-# get layer
-try:
-    layers = read_yaml(args.layer)
-except AttributeError:
-    layers = args.layer
-
 # read data
 mdata = read(args.infile)
 
-if type(mdata) is MuData and args.modality is None and type(layers) is not dict:
-    sys.exit('if inputting a mudata object, layers must be in a dictionary')
-
-# write out the correct header before we start, need to werite it out before, because we use append mode for the multimodal find markers
-# and we don't wantto end up with a new header half
-if sc.__version__ < "1.7.0":
-    markers_columns = ['cluster', 'scores', 'gene', 'avg_logFC', 'pvals', 'p.adj.bonferroni', 'mod']
-elif sc.__version__ >= "1.7.0":
-    markers_columns = ['cluster', 'gene', 'scores', 'avg_logFC', 'pvals', 'p.adj.bonferroni', 'mod']
-pd.DataFrame(columns=markers_columns).to_csv(args.output_file_prefix + ".txt", index=False, sep='\t', mode="w")
-
 if type(mdata) is AnnData:
     adata = mdata
-    if layers != "X" :
-        adata.X = adata.layers[layers[args.modality]]
-    main(adata, args,marker_columns=markers_columns, mod=args.modality)
-elif args.modality is not None:
-    adata = mdata[args.modality]
-    if type(layers) is dict:
-        ll = layers[args.modality]
-    else:
-        ll = layers
-    adata.X = adata.layers[ll]
-    main(adata, args, marker_columns=markers_columns, mod=args.modality)
+    # the main function only does rank_genes_groups on X
+elif type(mdata) is MuData and args.modality is not None:
+    adata = mdata[args.modality]
 else:
-    # we have multimodal object, and some kind of multimodal clustering output
-    for mod in mdata.mod.keys():
-        if type(layers) is dict:
-            ll = layers[args.modality]
-        else:
-            ll = layers
-        mdata[mod].X = mdata[mod].layers[ll]
-        main(mdata[mod], args, mod=mod, marker_columns=markers_columns)
+    sys.exit('if inputting a mudata object, you need to specify a modality')
+
+
+
+main(adata,
+    mod=args.modality,
+    cluster_file=args.cluster_file,
+    mincells=int(args.mincells),
+    layer=args.layer,
+    testuse=args.testuse,
+    pseudo_seurat=check_for_bool(args.pseudo_seurat),
+    output_file_prefix=args.output_file_prefix)
+L.info("done")
diff --git a/python/run_preprocess_rna.py b/python/run_preprocess_rna.py
index 1df9811b..84283e3d 100644
--- a/python/run_preprocess_rna.py
+++ b/python/run_preprocess_rna.py
@@ -234,4 +234,6 @@
 
 # save the scaled adata object
 mdata.write(args.output_scaled_mudata)
-L.debug(adata.uns['log1p'])
\ No newline at end of file
+L.debug(adata.uns['log1p'])
+
+L.info("done")
\ No newline at end of file
diff --git a/requirements_minimal.txt b/requirements_minimal.txt
index 47337598..40a54d2b 100644
--- a/requirements_minimal.txt
+++ b/requirements_minimal.txt
@@ -26,6 +26,6 @@
 scrublet
 scikit-misc
 mofapy2
 pysam
-#mudata @ git+https://github.com/scverse/mudata
-#muon @ git+https://github.com/scverse/muon
-#scirpy @ git+https://github.com/scverse/scirpy
+mudata @ git+https://github.com/scverse/mudata
+muon @ git+https://github.com/scverse/muon
+scirpy @ git+https://github.com/scverse/scirpy
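
For anyone wanting to sanity-check the new `find_all_markers_pseudo_seurat` helper outside the pipeline, here is a minimal sketch on synthetic data (the toy AnnData, the made-up `leiden` column and the parameter choices are illustrative assumptions, not part of this PR):

```python
import numpy as np
import anndata as ad
import scanpy as sc
from scipy.sparse import csr_matrix
from panpipes.funcs.scmethods import find_all_markers_pseudo_seurat

# build a small log-normalised, sparse AnnData with a made-up cluster column
X = csr_matrix(np.random.poisson(1.0, size=(200, 50)).astype(np.float32))
adata = ad.AnnData(X)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.obs["leiden"] = np.random.choice(["0", "1", "2"], size=adata.n_obs)

# returns one markers table concatenated over all clusters, plus the
# pseudo-seurat pre-filtering stats for each cluster-vs-rest comparison
markers, filter_stats = find_all_markers_pseudo_seurat(
    adata, groups="all", groupby="leiden", method="wilcoxon")
print(markers.head())
```

Because `groups='all'` loops one-vs-rest over every cluster, runtime grows with the number of clusters; the pseudo-seurat pre-filter exists precisely to keep each `rank_genes_groups` call small.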