diff --git a/README.md b/README.md
index 50142746..b9267187 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ Available pipelines:
 See [installation instructions here](https://github.com/DendrouLab/panpipes/blob/main/docs/install.md)
 Review this issue before installation: https://github.com/DendrouLab/panpipes/issues/11
-
+Oxford BMRC Rescomp users can find additional advice in [docs/installation_rescomp](https://github.com/DendrouLab/panpipes/blob/main/docs/installation_rescomp.md)
 
 # General principles for running pipelines
diff --git a/docs/install.md b/docs/install.md
index 0fb1c1ce..1ba617a3 100644
--- a/docs/install.md
+++ b/docs/install.md
@@ -33,7 +33,7 @@ conda activate pipeline_env
 we include an environment.yml for a conda environment tested on all the pipelines packaged in this version of Panpipes.
 ##### Step 2 Download and install this repo
-
+If you have not already set up SSH keys for GitHub, first follow these [instructions](https://github.com/DendrouLab/panpipes/blob/main/docs/set_up_ssh_keys_for_github.md):
 ```
 git clone https://github.com/DendrouLab/panpipes
diff --git a/docs/set_up_ssh_keys_for_github.md b/docs/set_up_ssh_keys_for_github.md
new file mode 100644
index 00000000..2161c71c
--- /dev/null
+++ b/docs/set_up_ssh_keys_for_github.md
@@ -0,0 +1,25 @@
+
+## Set up SSH key for GitHub
+For more advice: https://docs.github.com/en/authentication/connecting-to-github-with-ssh/about-ssh
+
+After checking for existing keys, if you receive an error that ~/.ssh doesn't exist, then you don't have one. If one already exists (i.e. id_rsa.pub or id_ed25519.pub), you can either connect it to GitHub or generate a new one.
+```
+ls -al ~/.ssh #check for existing keys
+ssh-keygen -t ed25519 -C "your_email@example.com" #use your GitHub email address
+#Enter a file in which to save the key (/c/Users/you/.ssh/id_algorithm):[Press enter]
+#Enter passphrase (empty for no passphrase): [Type a passphrase]
+eval "$(ssh-agent -s)" #start ssh-agent
+ssh-add ~/.ssh/id_ed25519 #add your SSH private key to ssh-agent
+clip < ~/.ssh/id_ed25519.pub #copy SSH public key (clip is Windows-only; use pbcopy on macOS or xclip on Linux)
+```
+After copying your SSH public key, go to GitHub --> Settings --> SSH and GPG keys (under Access) --> Add new public SSH key
+
+To test the connection:
+```
+ssh -T git@github.com
+```
+A successful connection should result in
+> Hi username! You've successfully authenticated, but GitHub does not provide shell access.
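+
+Once the test succeeds, you can clone repositories over SSH. For example (the URL below is simply the standard SSH form of this repo's address):
+```
+git clone git@github.com:DendrouLab/panpipes.git
+```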
+
+Activate the environment
+```
\ No newline at end of file
diff --git a/panpipes/funcs/scmethods.py b/panpipes/funcs/scmethods.py
index 41f84ab9..8f025173 100644
--- a/panpipes/funcs/scmethods.py
+++ b/panpipes/funcs/scmethods.py
@@ -4,6 +4,7 @@
 from scipy.sparse import issparse
 from scanpy.get import obs_df as get_obs_df
 from scanpy.pp import normalize_total
+import scanpy as sc
 import warnings
 import logging
 from typing import Optional, Literal
@@ -38,9 +39,53 @@ def exp_mean_dense(x):
     # convert out of compressed sparse matrix
     return np.log((np.sum(np.exp(x)-1)/x.shape[1]) + 1)
 
-
-
-def pseudo_seurat(adata, arg_minpct=0.1, arg_mindiffpct=-float("inf"), arg_logfcdiff=0.25, use_dense=False):
+def find_all_markers_pseudo_seurat(
+        adata,
+        groups,
+        groupby,
+        layer=None,
+        method=None,
+        n_genes=float("inf"),
+        corr_method="bonferroni",
+        arg_minpct=0.1,
+        arg_mindiffpct=-float("inf"),
+        arg_logfcdiff=0.25):
+    # replace X with the requested layer, if one is given
+    if layer is not None:
+        adata.X = adata.layers[layer]
+    # need to check whether the assay layer is sparse or dense
+    assay_is_sparse = issparse(adata.X)
+    use_dense = not assay_is_sparse
+    if groups == 'all':
+        groups = adata.obs[groupby].unique().tolist()
+    markers_dict = {}
+    filter_dict = {}
+    for cv in groups:
+        # set up idents so that cv == 1 and everything else == 0
+        adata.obs['idents'] = ['1' if x == cv else '0' for x in adata.obs[groupby]]
+        filter_dict[cv] = pseudo_seurat(adata, use_dense=use_dense, arg_minpct=arg_minpct,
+                                        arg_mindiffpct=arg_mindiffpct,
+                                        arg_logfcdiff=arg_logfcdiff)
+        logging.info("number of genes remaining after filtering: %i\n" % filter_dict[cv]['background'].sum())
+        adata_rg = adata[:, filter_dict[cv]['background'].tolist()].copy()
+        sc.tl.rank_genes_groups(adata_rg, layer=layer,
+                                groupby="idents", groups=["1"],
+                                reference="0",
+                                method=method,
+                                n_genes=n_genes,
+                                corr_method=corr_method)
+        markers_dict[cv] = sc.get.rank_genes_groups_df(adata_rg, group="1")
+        # remove adata from mem
+        adata_rg = None
+    markers = pd.concat(markers_dict.values(), keys=markers_dict.keys())
+    filter_stats = pd.concat(filter_dict.values(), keys=filter_dict.keys())
+    return markers, filter_stats
+
+def pseudo_seurat(adata,
+                  arg_minpct=0.1,
+                  arg_mindiffpct=-float("inf"),
+                  arg_logfcdiff=0.25,
+                  use_dense=False):
     """
     alternative method that"s more like seurat (pseudo seurat if you will)
     In that you filter genes before running rank genes
@@ -79,7 +124,6 @@ def pseudo_seurat(adata, arg_minpct=0.1, arg_mindiffpct=-float("inf"), arg_logfc
     min_pct = pcts.min(axis=1)
     diff_pct = max_pct - min_pct
     take_diff_pct = diff_pct > arg_mindiffpct
-
     # remove genes that are not expressed higher than 0.1 in one of the groups
     take_min_pct = max_pct > arg_minpct
 
@@ -88,7 +132,7 @@ def pseudo_seurat(adata, arg_minpct=0.1, arg_mindiffpct=-float("inf"), arg_logfc
     # this has the potential to be very slow. Transposeing it speeds it up a bit.
     # I need to undertand sparse matrices better to make it work
     if use_dense:
-        print("using dense matrix")
+        logging.info("using dense matrix")
         # extract the counts for cluster cells and calculate exp means on each row
         nct = adata.X.T[:, cluster_cells_ind]
         cluster_mean = np.apply_along_axis(exp_mean_dense, 1, nct.todense())
@@ -98,7 +142,7 @@ def pseudo_seurat(adata, arg_minpct=0.1, arg_mindiffpct=-float("inf"), arg_logfc
         other_mean = np.apply_along_axis(exp_mean_dense, 1, nct.todense())
         diff_mean = abs(cluster_mean - other_mean)
     else:
-        print("using sparse matrix")
+        logging.info("using sparse matrix")
         cluster_mean = exp_mean_sparse(adata.X.T[:, cluster_cells_ind])
         other_mean = exp_mean_sparse(adata.X.T[:, other_cells_ind])
         diff_mean = abs(cluster_mean - other_mean).A1
@@ -122,7 +166,7 @@ def run_neighbors_method_choice(adata, method, n_neighbors, n_pcs, metric, use_r
     # useful if we are dealing with a MuData object but we want to use single rep, e.g.
     # calculating neighbors on a totalVI latent rep
     if method == "scanpy":
-        print("Computing neighbors using scanpy")
+        logging.info("Computing neighbors using scanpy")
         from scanpy.pp import neighbors
         neighbors(adata,
             n_pcs=n_pcs,
@@ -131,7 +175,7 @@ def run_neighbors_method_choice(adata, method, n_neighbors, n_pcs, metric, use_r
             use_rep=use_rep)
     elif method == "hnsw":
         from scvelo.pp import neighbors
-        print("Computing neighbors using hnswlib (with scvelo a la pegasus!)")
+        logging.info("Computing neighbors using hnswlib (with scvelo a la pegasus!)")
         # we use the neighbors function from scvelo (thanks!)
         # with parameters from pegasus (for a more exact result).
         # code snippet from Steve Sansom, via COMBAT project
diff --git a/panpipes/pipeline_clustering.py b/panpipes/pipeline_clustering.py
index 3cf8c7c0..c4733e4e 100644
--- a/panpipes/pipeline_clustering.py
+++ b/panpipes/pipeline_clustering.py
@@ -19,7 +19,7 @@ from itertools import chain, product
 import glob
 
-from panpipes.funcs.processing import is_float_try, splitall, extract_parameter_from_fname
+from panpipes.funcs.processing import extract_parameter_from_fname
 
 PARAMS = P.get_parameters(
     ["%s/pipeline.yml" % os.path.splitext(__file__)[0],
     "../pipeline.yml",
@@ -27,7 +27,7 @@
 PARAMS['py_path'] = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'python')
 PARAMS['r_path'] = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'R')
-
+PARAMS['mudata_with_knn'] = 'mudata_w_neighbors.h5mu'
 job_kwargs={}
 
 if PARAMS['condaenv'] is not None:
     job_kwargs["job_condaenv"] =PARAMS['condaenv']
@@ -36,7 +36,6 @@
 @originate("logs/setup_dirs.sentinel")
 def set_up_dirs(log_file):
     os.mkdir("logs")
-
     mods = [key for key, value in PARAMS['modalities'].items() if value is True]
     if PARAMS["multimodal"]["run_clustering"] is True :
         mods.append("multimodal")
@@ -48,16 +47,44 @@ def set_up_dirs(log_file):
 ## Single modality scripts
 ## ------------------------------------
 
+# --------------------------------------
+# neighbors
+# --------------------------------------
+@follows(set_up_dirs)
+@originate(PARAMS['mudata_with_knn'])
+def run_neighbors(outfile):
+    mods = [key for key, value in PARAMS['modalities'].items() if value is True]
+    if any([PARAMS['neighbors'][mod]['use_existing'] is False for mod in mods]):
+        # this means we want to rerun neighbors for at least one assay;
+        # we want to replace the scaled obj with the new neighbors
+        log_file="logs/run_single_mod_neighbors.log"
+        cmd="""
+        python %(py_path)s/rerun_find_neighbors_for_clustering.py \
+        --infile %(scaled_obj)s \
+        --outfile %(outfile)s \
+        --neighbor_dict '%(neighbors)s' \
+        --n_threads %(resources_threads_high)s
+        """
+        cmd += " > %(log_file)s"
+        job_kwargs["job_threads"] = PARAMS['resources_threads_high']
+        P.run(cmd, **job_kwargs)
+    else:
+        P.run('ln -s %(scaled_obj)s %(outfile)s', without_cluster=True)
+
+
+
+
 # ------------------------------------
 # UMAP
 # ------------------------------------
+
 def gen_umap_jobs():
     """
     Generate find neighbor jobs with all parameter combinations.
     """
     # same infile for all jobs
     # define files based on jobs
-    infile = PARAMS['scaled_obj']
+    infile = PARAMS['mudata_with_knn']
     mods = [key for key, value in PARAMS['modalities'].items() if value is True]
     if PARAMS["multimodal"]["run_clustering"] is True :
         mods.append("multimodal")
@@ -68,14 +95,14 @@ def gen_umap_jobs():
             log_file = os.path.join("logs","_".join([mod, 'md' + str(md) +"_umap.log"]))
             yield [infile, output_file, mod, md, log_file]
 
-
+@follows(run_neighbors)
 @follows(set_up_dirs)
 @files(gen_umap_jobs)
 def calc_sm_umaps(infile, outfile, mod, mindist, log_file):
     prefix = os.path.split(infile)[0]
     cmd = """
         python %(py_path)s/run_umap.py \
-        --infile %(scaled_obj)s \
+        --infile %(infile)s \
         --outfile %(outfile)s \
         --min_dist %(mindist)s
         """
@@ -100,7 +127,7 @@ def gen_cluster_jobs():
     """
     # same infile for all jobs
     # define files based on jobs
-    infile = PARAMS['scaled_obj']
+    infile = PARAMS['mudata_with_knn']
     mods = [key for key, value in PARAMS['modalities'].items() if value is True]
     if PARAMS['multimodal']['run_clustering'] is True:
         mods.append("multimodal")
@@ -114,6 +141,7 @@
 
 @follows(set_up_dirs)
 @files(gen_cluster_jobs)
+@follows(run_neighbors)
 def calc_cluster(infile, outfile, mod, res, alg, log_file):
     cmd = """python %(py_path)s/run_clustering.py
     --infile %(infile)s
@@ -168,10 +196,11 @@ def collate_mdata(infiles,outfile):
     --umap_files_csv umap_file_paths.csv
     --output_mudata %(outfile)s
     """
-    # if PARAMS['full_obj'] is None:
-    cmd += "--input_mudata %(scaled_obj)s"
-    # else:
-    #     cmd += "--input_mudata %(scaled_obj)s %(full_obj)s"
+    if PARAMS['full_obj'] is None:
+        mdata_in = PARAMS['mudata_with_knn']
+        cmd += "--input_mudata %(mdata_in)s"
+    else:
+        cmd += "--input_mudata %(full_obj)s"
     cmd += " > logs/collate_data.log"
     job_kwargs["job_threads"] = PARAMS['resources_threads_medium']
     P.run(cmd, **job_kwargs)
@@ -182,13 +211,19 @@
     'logs/plot_clusters_umaps.log')
 def plot_cluster_umaps(infile, log_file,):
     # get associated umap
+    mods = [key for key, value in PARAMS['modalities'].items() if value is True]
+    if PARAMS['multimodal']['run_clustering']:
+        mods.append("multimodal")
+    mods = ','.join(mods)
     cmd = """python %(py_path)s/plot_cluster_umaps.py \
     --infile %(infile)s
+    --modalities '%(mods)s'
     """
     cmd += " >> %(log_file)s"
     job_kwargs["job_threads"] = PARAMS['resources_threads_medium']
     P.run(cmd, jobs_limit=1, **job_kwargs)
 
+
 @transform(aggregate_clusters, regex("(.*)/all_res_clusters_list.txt.gz"),
     r'logs/\1_clustree.log',
     r'\1/figures/clustree.png', )
@@ -218,38 +253,50 @@ def cluster_analysis(fname):
 # ------------------------------------
 # Markers
 # ------------------------------------
-@transform(calc_cluster,
-    regex("(.*)/(.*)/clusters.txt.gz"),
-    r"\1/\2/markers.txt",
-    r"\1/\2/markers",
+
+@subdivide(calc_cluster, regex("(.*)/(.*)/clusters.txt.gz"),
+    r"\1/\2/*_markers.txt",
+    r"\1/\2")
+def gen_marker_jobs(infile, outfile, base_dir):
+    mods = [key for key, value in PARAMS['modalities'].items() if value is True]
+    for md in mods:
+        IOTools.touch_file(os.path.join(base_dir, md + "_markers.txt"))
+
+@follows(collate_mdata)
+@transform(gen_marker_jobs,
+    regex("(.*)/(.*)/(.*)_markers.txt"),
+    r"logs/\1_\2_\3_markers.log",
+    r"\1/\2/\3_markers",
     r"\1",
-    r"logs/\1_\2_markers.log",
+    r"\1/\2",
+    r"\3"
     )
-def find_markers(infile, outfile, outfile_prefix, mod, log_file ):
+def find_markers(infile, log_file, outfile_prefix, base_mod, cluster_dir, data_mod):
     """
     Runs scanpy.tl.rank_gene_groups in parallel for each cluster
     """
-    min_cells = PARAMS["markerspecs"][mod]["mincells"]
+    anndata_file=PARAMS['sample_prefix'] + "_clustered.h5mu" + "/" + data_mod
+    cluster_file = os.path.join(cluster_dir, "clusters.txt.gz")
     cmd = """
         python %(py_path)s/run_find_markers_multi.py
-        --infile %(scaled_obj)s
+        --infile %(anndata_file)s
+        --cluster_file %(cluster_file)s
         --output_file_prefix %(outfile_prefix)s
-        --cluster_file %(infile)s
-        --mincells %(min_cells)s
+        --mod %(data_mod)s
         """
-    if mod is not None and mod != "multimodal":
-        cmd += " --modality %(mod)s"
-        layers = {mod: PARAMS["markerspecs"][mod]["layer"] for mod in ['rna', 'prot', 'atac']}
-        cmd += " --layer '%(layers)s'"
-    if PARAMS['markerspecs_pseudo_seurat'] is True:
-        min_pct = PARAMS["markerspecs"][mod]["minpct"]
-        threshuse = PARAMS["markerspecs"][mod]["threshuse"]
+    min_cells = PARAMS["markerspecs"][data_mod]["mincells"]
+    cmd += " --mincells %(min_cells)s"
+    layer_choice = PARAMS["markerspecs"][data_mod]["layer"]
+    cmd += " --layer '%(layer_choice)s'"
+    testuse = PARAMS["markerspecs"][data_mod]["method"]
+    cmd += " --testuse '%(testuse)s'"
+    if PARAMS['markerspecs'][data_mod]['pseudo_seurat'] is True:
+        min_pct = PARAMS["markerspecs"][data_mod]["minpct"]
+        threshuse = PARAMS["markerspecs"][data_mod]["threshuse"]
         cmd += """ --pseudo_seurat True
-        --minpct %(min_pct)s
-        --threshuse %(threshuse)s
+            --minpct %(min_pct)s
+            --threshuse %(threshuse)s
             """
-    else:
-        cmd += " --pseudo_seurat False"
     cmd += " > %(log_file)s "
     job_kwargs["job_threads"] = PARAMS['resources_threads_high']
     P.run(cmd, **job_kwargs)
@@ -276,19 +323,23 @@ def find_markers(infile, outfile, outfile_prefix, mod, log_file ):
 #     --subgroup %(plotspecs_discrete_variables)s > %(log_file)s"""
 #     P.run(cmd, job_threads=PARAMS['resources_threads_high'])
 
-
-# limit jobs because reading the h5 file simultaneouly is problematic
-# @follows(heatmap_markers)
+# this transforms gen_marker_jobs instead of find_markers because gen_marker_jobs creates empty marker files
+# which are then filled by find_markers.
 @follows(collate_mdata)
-@transform(find_markers,
-    regex(r"(.*)/alg(.*)/markers.txt"),
-    r"logs/\1_alg\2_dotplots.log",
-    r"\1/alg\2/figures/dotplot_top_markers.png",
+@follows(find_markers)
+@transform(gen_marker_jobs,
+    regex(r"(.*)/alg(.*)/(.*)_markers.txt"),
+    r"logs/cluster\1_alg\2_exprs_\3_dotplots.log",
+    r"\1/alg\2/figures/dotplot_top_markers_\3.png",
     r"\1/alg\2/figures",
-    r"\1", r"\2")
-def plot_marker_dotplots(marker_file, log_file, outfile, fig_path,mod, cluster_col):
+    r"\1", r"\2", r"\3")
+def plot_marker_dotplots(marker_file, log_file, outfile,
+                         fig_path, cluster_mod, cluster_col, expr_mod):
     """
     Plots some additional marker plots
+    Read in the h5mu file, pull the cluster col from the h5mu obs,
+    then pull the expression values from the expr_mod.
+ """ data_obj=PARAMS['sample_prefix'] + "_clustered.h5mu" # check there is a figures directory @@ -297,17 +348,17 @@ def plot_marker_dotplots(marker_file, log_file, outfile, fig_path,mod, cluster_c cmd = """ python %(py_path)s/plot_scanpy_markers.py \ --infile %(data_obj)s \ + --modality %(expr_mod)s --marker_file %(marker_file)s \ - --figure_prefix %(fig_path)s - --group_col %(cluster_col)s + --figure_prefix %(fig_path)s \ --n %(plotspecs_top_n_markers)s """ - if mod is not None: - cmd += " --modality %(mod)s" - cmd += " --layer '%(plotspecs_layers)s'" - else: - cmd += " --layer logged_counts" + if cluster_mod != "multimodal": + cluster_col = cluster_mod + ":" + cluster_col + cmd += " --group_col %(cluster_col)s" + layer_choice = PARAMS["markerspecs"][expr_mod]["layer"] + cmd += " --layer %(layer_choice)s" cmd += " > %(log_file)s " job_kwargs["job_threads"] = PARAMS['resources_threads_medium'] P.run(cmd, **job_kwargs) @@ -374,10 +425,7 @@ def marker_analysis(fname): # #-------- - - - -@follows(cluster_analysis, marker_analysis, ) +@follows(cluster_analysis, marker_analysis ) def full(): """ All cgat pipelines should end with a full() function which updates, diff --git a/panpipes/pipeline_clustering/pipeline.yml b/panpipes/pipeline_clustering/pipeline.yml index fbcef0f3..28716a91 100644 --- a/panpipes/pipeline_clustering/pipeline.yml +++ b/panpipes/pipeline_clustering/pipeline.yml @@ -126,40 +126,56 @@ clusterspecs: # --------------------------------------- # parameters for finding marker genes # --------------------------------------- +# where pseudo_suerat is set to False +# we run https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.rank_genes_groups.html +# where pseudo_seurat is set to True, we run an a python implementation of Seurat::FindMarkers (written by CRG) is used, + markerspecs: rna: run: True layer: logged_counts + # method options: [‘logreg’, ‘t-test’, ‘wilcoxon’, ‘t-test_overestim_var’]] + method: t-test_overestim_var mincells: 10 # if a cluster contains less than n cells then do not bother doing marker analysis - # Setting pseudo_seurat to True means that rather than use scanpy.rank_gene_groups to identify gene groups, - # a python implementation of Seurat::FindMarkers (written by CRG) is used, - # We recommend that pseudo_seurat is set to False + # where pseudo_suerat is set to False + # we run https://scanpy.readthedocs.io/en/stable/generated/scanpy.tl.rank_genes_groups.html + # where pseudo_seurat is set to True, we run an a python implementation of Seurat::FindMarkers (written by CRG) is used, pseudo_seurat: False # these next two settings do not matter unless pseudo_seurat is set to True, # If applicable look at Seurat documentation for FindMarkers for details minpct: 0.1 threshuse: 0.25 + prot: run: layer: clr - mincells: 10 + mincells: 10 # if a cluster contains less than n cells then do not bother doing marker analysis + # method options: [‘logreg’, ‘t-test’, ‘wilcoxon’, ‘t-test_overestim_var’]] + method: wilcoxon pseudo_seurat: False minpct: 0.1 threshuse: 0.25 + atac: run: layer: mincells: 10 + # method options: [‘logreg’, ‘t-test’, ‘wilcoxon’, ‘t-test_overestim_var’]] + method: wilcoxon pseudo_seurat: False minpct: 0.1 threshuse: 0.25 + multimodal: mincells: 10 + # method options: [‘logreg’, ‘t-test’, ‘wilcoxon’, ‘t-test_overestim_var’]] + method: wilcoxon pseudo_seurat: False minpct: 0.1 threshuse: 0.25 + # --------------------------------------- # plot specs are used to define which metadata columns are used in the visualisations # 
diff --git a/panpipes/pipeline_preprocess.py b/panpipes/pipeline_preprocess.py
index d85d33fb..d8c91c83 100644
--- a/panpipes/pipeline_preprocess.py
+++ b/panpipes/pipeline_preprocess.py
@@ -49,7 +49,7 @@ def filter_mudata(outfile):
         python %(py_path)s/run_filter.py
         --input_mudata %(unfiltered_obj)s
         --output_mudata %(outfile)s
-        --filter_dict "%(filtering)s"
+        --filter_dict "%(filter_dict)s"
         """
     if PARAMS['filtering_keep_barcodes'] is not None:
         cmd += " --keep_barcodes %(filtering_keep_barcodes)s"
diff --git a/python/batch_correct_merge.py b/python/batch_correct_merge.py
index a73a6bba..b003a2a9 100644
--- a/python/batch_correct_merge.py
+++ b/python/batch_correct_merge.py
@@ -58,7 +58,6 @@
 if multi_mod=="totalvi":
     totalvi_extra_obsm = {'prot': ['totalvi_protein_foreground_prob', 'totalvi_denoised_protein'],
                           'rna': [ 'totalvi_denoised_rna']}
-
     # we will want to keep these so extract them first
     totalvi_extra_obsm_keep = {}
     for k, v in totalvi_extra_obsm.items():
@@ -68,7 +67,21 @@
 
 # this overwrites the old modalities
 for k,v in uni_mod_paths.items():
-    base_obj.mod[k] = mu.read(v)
+    mod_replace = mu.read(v)
+    if mod_replace.shape[1] == base_obj.shape[1]:
+        L.info('replacing the whole anndata object with the batch corrected one')
+        base_obj.mod[k] = mod_replace
+    else:
+        # in this situation the vars in the mod have been subset but the
+        # original object contains all the genes.
+        L.info("number of genes does not match base object, only integrating obsp and obsm into full mdata[%s] object" % k)
+        base_obj.mod[k].obsm = mod_replace.obsm
+        base_obj.mod[k].obsp = mod_replace.obsp
+        base_obj.mod[k].uns = mod_replace.uns
+        base_obj.mod[k].uns['scVI_gene_subset'] = mod_replace.var_names.tolist()
+        # merge any extra obs columns
+        new_obs = mod_replace.obs.iloc[:, ~ mod_replace.obs.columns.isin( base_obj.mod[k].obs.columns)]
+        base_obj.mod[k].obs = base_obj.mod[k].obs.merge(new_obs, left_index=True, right_index=True)
 
 if multi_mod=="totalvi":
     # restore the totalvi extras
diff --git a/python/batch_correct_none.py b/python/batch_correct_none.py
index 6acd134d..21c7b7eb 100644
--- a/python/batch_correct_none.py
+++ b/python/batch_correct_none.py
@@ -71,13 +71,20 @@
 
 # run neighbours and umap without batch correction
+pc_kwargs = {}
+if int(args.neighbors_n_pcs) > 0:
+    pc_kwargs['use_rep'] = "X_pca"
+    pc_kwargs['n_pcs'] = int(args.neighbors_n_pcs)
+else:
+    # If n_pcs==0, scanpy falls back to .X when use_rep is None.
+    pc_kwargs['use_rep'] = None
+    pc_kwargs['n_pcs'] = int(args.neighbors_n_pcs)
+
 run_neighbors_method_choice(adata,
     method=args.neighbors_method,
     n_neighbors=int(args.neighbors_k),
-    n_pcs=int(args.neighbors_n_pcs),
     metric=args.neighbors_metric,
-    use_rep='X_pca',
-    nthreads=max([threads_available, 6]))
+    nthreads=max([threads_available, 6]), **pc_kwargs)
 
 L.info("done n_neighbours, saving stuff")
diff --git a/python/batch_correct_scvi.py b/python/batch_correct_scvi.py
index f0807209..c88fde8f 100644
--- a/python/batch_correct_scvi.py
+++ b/python/batch_correct_scvi.py
@@ -10,8 +10,8 @@
 import muon as mu
 import panpipes.funcs as pp
-from panpipes.funcs.io import read_anndata
-from panpipes.funcs.scmethods import run_neighbors_method_choice
+from panpipes.funcs.io import read_anndata, read_yaml
+from panpipes.funcs.scmethods import run_neighbors_method_choice, X_is_raw
 
 import sys
 import logging
@@ -28,7 +28,7 @@
     default='adata_scaled.h5ad',
     help='')
 parser.add_argument('--raw_anndata',
-    default='adata_raw.h5ad',
+    default=None,
     help='')
 parser.add_argument('--output_csv', default='batch_correction/umap_bc_scvi.csv',
     help='')
@@ -47,12 +47,12 @@
 
 args, opt = parser.parse_known_args()
 
-
+L.info(args)
 
 threads_available = multiprocessing.cpu_count()
 
 # load parameters
-params = pp.io.read_yaml("pipeline.yml")
+params = read_yaml("pipeline.yml")
 
 L.info("Running scvi script")
 L.info("threads available: %s", threads_available)
@@ -77,7 +77,7 @@
 mdata = mu.read(args.scaled_anndata)
 if type(mdata) is mu.MuData:
-    rna = mdata['rna'].copy()
+    rna = mdata['rna']
     L.info(rna)
 elif type(mdata) is sc.AnnData:
     rna = mdata
@@ -106,6 +106,8 @@
 # add in test to see if raw layer exists alread
 if "raw_counts" in rna.layers.keys():
     L.info("raw counts found in layer")
+elif X_is_raw(rna):
+    rna.layers["raw_counts"] = rna.X.copy()
 else:
     L.info("merge in raw counts")
     sc_raw = read_anndata(args.raw_anndata, use_muon=True, modality="rna")
diff --git a/python/batch_correct_totalvi.py b/python/batch_correct_totalvi.py
index df612e48..a132b0c2 100644
--- a/python/batch_correct_totalvi.py
+++ b/python/batch_correct_totalvi.py
@@ -8,17 +8,15 @@
 import os
 import gc
 import muon as mu
-from cgatcore import pipeline as P
-import panpipes.funcs as pp
-from panpipes.funcs.processing import check_for_bool, intersect_obs_by_mod
-from panpipes.funcs.io import read_anndata, write_anndata, read_yaml
+from panpipes.funcs.io import read_anndata, read_yaml
 from panpipes.funcs.scmethods import run_neighbors_method_choice, X_is_raw
+from panpipes.funcs.processing import intersect_obs_by_mod
 
 import sys
 import logging
 L = logging.getLogger()
-L.setLevel(logging.DEBUG)
+L.setLevel(logging.INFO)
 log_handler = logging.StreamHandler(sys.stdout)
 formatter = logging.Formatter('%(asctime)s: %(levelname)s - %(message)s')
 log_handler.setFormatter(formatter)
@@ -30,7 +28,7 @@
     default='adata_scaled.h5ad',
     help='')
 parser.add_argument('--raw_anndata',
-    default='adata_raw.h5ad',
+    default=None,
     help='')
 parser.add_argument('--output_csv', default='batch_correction/umap_bc_totalvi.csv',
     help='')
@@ -60,7 +58,10 @@
 
 # load parameters
 threads_available = multiprocessing.cpu_count()
-params = pp.io.read_yaml("pipeline.yml")
+
+params = read_yaml("pipeline.yml")
+params['sample_prefix']
+
 
 test_script=False
 
@@ -78,6 +79,7 @@
 
 # this is a copy so we can subset vars and obs without changing the original object
 rna = mdata['rna'].copy()
+prot = mdata['prot'].copy()
 
 kwargs={}
 # in case of more than 1 variable, create a fake column with combined information
@@ -129,6 +131,9 @@
 else:
     raise ValueError("adt_outliers column not found in mdata['prot'].obs")
 
+# excluding isotypes
+if 'isotype' in prot.var.columns:
+    prot = prot[:, ~prot.var.isotype]
 
 # filter out mitochondria
 if params['multimodal']['totalvi']['exclude_mt_genes']:
@@ -142,22 +147,25 @@
     rna = rna[:, rna.var.highly_variable]
     mdata.update()
 
+
 #make protein obsm pandas
-# need to find the raw counts in mdata['prot']
-if X_is_raw(mdata['prot']):
-    X_array = mdata['prot'].X.copy()
-elif "raw_counts" in mdata['prot'].layers.keys():
-    X_array = mdata['prot'].layers['raw_counts'].copy()
+# need to find the raw counts in prot
+if X_is_raw(prot):
+    X_array = prot.X.copy()
+elif "raw_counts" in prot.layers.keys():
+    X_array = prot.layers['raw_counts'].copy()
 else:
-    raise AttributeError("raw counts not found in mdata['prot'], \
+    raise AttributeError("raw counts not found in prot, \
         store in either X or in 'raw_counts' layer")
-X_df = pd.DataFrame(X_array.todense(), index=mdata['prot'].obs_names, columns=mdata['prot'].var_names)
+
+X_df = pd.DataFrame(X_array.todense(), index=prot.obs_names, columns=prot.var_names)
 
 if X_df.shape[0] == rna.X.shape[0]:
     L.info("adding protein_expression to obsm")
     #check the obs are in the correct order
     X_df = X_df.loc[rna.obs_names,:]
+    X_df = X_df.astype('int')
     rna.obsm['protein_expression'] = X_df
 else:
     L.error("dimensions do not match, cannot integrate protein")
@@ -229,10 +237,12 @@
 
 if batch_categories is not None:
     L.debug(batch_categories)
+    if type(batch_categories) is not list:
+        batch_categories = [batch_categories]
     normX, protein = vae.get_normalized_expression(
         n_samples=25,
         return_mean=True,
-        transform_batch=[batch_categories]
+        transform_batch=batch_categories
     )
     # this is stored in obsm, because if we have filtered by hvg, it no longer fits specifications for a layer
@@ -243,7 +253,7 @@
     df = vae.get_protein_foreground_probability(
         n_samples=25,
         return_mean=True,
-        transform_batch=[batch_categories]
+        transform_batch=batch_categories
     )
     mdata['prot'].obsm["totalvi_protein_foreground_prob"] = df.loc[mdata['prot'].obs_names,:]
 
diff --git a/python/batch_correct_wnn.py b/python/batch_correct_wnn.py
index 2578f8a2..9dc373c6 100644
--- a/python/batch_correct_wnn.py
+++ b/python/batch_correct_wnn.py
@@ -92,6 +92,7 @@
 
 for kmod in dict_graph.keys():
     L.info(kmod)
+    pkmod=params['multimodal']['WNN']['knn'][kmod]
     if dict_graph[kmod]["obsm"] is not None:
         if dict_graph[kmod]["obsm"] not in tmp.mod[kmod].obsm.keys():
             if dict_graph[kmod]["anndata"] is not None:
@@ -114,8 +115,11 @@
         if repuse not in tmp.mod[kmod].obsm.keys():
             if "X_LSI" in tmp.mod[kmod].obsm.keys():
                 repuse = "X_LSI"
-                L.info("falling back on %s" %(repuse) )
-    pkmod=params['multimodal']['WNN']['knn'][kmod]
+            else:
+                L.info("falling back on %s" %(repuse) )
+    if int(pkmod['npcs']) == 0:
+        repuse = None
+
     L.info("calculating neighbours")
     if repuse != "X_bbknn":
         run_neighbors_method_choice(tmp.mod[kmod],
diff --git a/python/collate_mdata.py b/python/collate_mdata.py
index 5b5306db..23ae8d79 100644
--- a/python/collate_mdata.py
+++ b/python/collate_mdata.py
@@ -34,6 +34,7 @@
 L.info(args)
 
 mdata = mu.read(args.input_mudata)
+L.info("loading clusters")
 cf = pd.read_csv(args.clusters_files_csv)
 
 if isinstance(mdata, MuData):
@@ -86,4 +87,4 @@
 mdata.write(args.output_mudata)
 mdata.obs.to_csv(re.sub(".h5mu", "_cell_metdata.tsv", args.output_mudata), sep='\t')
 
-L.info("done")
\ No newline at end of file
+L.info("done")
diff --git a/python/downsample.py b/python/downsample.py
index a1e094bb..138d8849 100644
--- a/python/downsample.py
+++ b/python/downsample.py
@@ -7,6 +7,7 @@
 from muon import MuData
 from anndata import AnnData
 import muon as mu
+import re
 
 from panpipes.funcs.io import read_anndata, write_anndata, write_obs
 from panpipes.funcs.processing import downsample_mudata, intersect_obs_by_mod, check_for_bool
@@ -87,10 +88,10 @@
 L.info(mdata.obs.sample_id.value_counts())
 
 # write out the observations
-write_obs(mdata, output_prefix=args.sampleprefix,
+write_obs(mdata, output_prefix=re.sub("\\.(.*)", "", args.output_mudata),
     output_suffix="_downsampled_cell_metadata.tsv")
 
-mdata.write(args.input_mudata)
+mdata.write(args.output_mudata)
 
 #This stage is the point (i.e. pre-normalisation) where the mdata file can be outputted so that we can extract raw matrix for the cellphonedb.
 L.info("Completed")
\ No newline at end of file
diff --git a/python/plot_cluster_umaps.py b/python/plot_cluster_umaps.py
index 37748a15..d1a0ea8c 100644
--- a/python/plot_cluster_umaps.py
+++ b/python/plot_cluster_umaps.py
@@ -28,9 +28,9 @@
 parser.add_argument("--infile",
     default="mdata.h5mu",
     help="file name, format: .h5mu")
-parser.add_argument("--figdir",
-    default=None,
-    help="figures directory")
+parser.add_argument("--modalities",
+    default="rna",
+    help="list of modalities to search for UMAPs in")
 
 args, opt = parser.parse_known_args()
 L.info("running with:")
@@ -44,7 +44,7 @@ def main(adata,figdir):
     obsm_keys = [x for x in adata.obsm.keys() if re.search(pattern, x)]
     L.info("Umap keys founds %s" % obsm_keys)
     # get all possible clustersclusters
-    pattern=re.compile(r'^leiden|^louvain_res')
+    pattern=re.compile(r'^leiden|^louvain')
     cluster_keys = [x for x in adata.obs.columns if re.search(pattern, x)]
     L.info("Cluster keys founds %s" % cluster_keys)
     if len(obsm_keys) == 0 or len(cluster_keys) == 0:
@@ -65,23 +65,24 @@ def main(adata,figdir):
 
 L.debug("load data")
 mdata = read(args.infile)
+mods = args.modalities.split(',')
 
 # detemin initial figure directory based on object type
-if args.figdir is None:
-    if type(mdata) is MuData :
-        figdir = "multimodal/figures"
-        L.info("plotting multimodal umaps")
-    else:
-        figdir = "figures"
-        L.info("plotting umaps from anndata")
-else:
-    figdir = args.figdir
+
 
 # do plotting
-main(mdata, figdir)
+if 'multimodal' in mods:
+    if os.path.exists("multimodal/figures") is False:
+        os.makedirs("multimodal/figures")
+    main(mdata, figdir="multimodal/figures")
 
 # we also need to plot per modality
 if type(mdata) is MuData:
     for mod in mdata.mod.keys():
-        L.info("plotting for modality: %s" % mod)
-        figdir = os.path.join(mod, "figures")
-        main(mdata[mod], figdir)
+        if mod in mods:
+            L.info("plotting for modality: %s" % mod)
+            figdir = os.path.join(mod, "figures")
+            if os.path.exists(figdir) is False:
+                os.makedirs(figdir)
+            main(mdata[mod], figdir)
+
+L.info('done')
\ No newline at end of file
diff --git a/python/plot_custom_markers.py b/python/plot_custom_markers.py
index 19174bce..0f6c24a9 100644
--- a/python/plot_custom_markers.py
+++ b/python/plot_custom_markers.py
@@ -130,21 +130,11 @@ def main(adata, mod, df, grouping_var, pfx, layer_choice=None):
 for mod in modalities:
     print(mod)
     df_sub = df[df['mod'] == mod]
-    # make sure every grouping var is a category
-    if mod is not "multimodal":
-        for gv in group_vars:
-            mdata[mod].obs[gv] = mdata.obs.loc[mdata[mod].obs_names,gv].astype('category')
-            mdata.update_obs()
-    else:
-        for gv in group_vars:
-            mdata.obs[gv] = mdata.obs.loc[mdata.obs_names,gv].astype('category')
-            mdata.update_obs()
-    # pull out the layers and set ll to None if no layer sepcificed (so the plotting funcs will use X)
-    try:
-        ll = layers[mod]
-    except KeyError:
-        ll = [None]
-    if len(group_vars) > 0 and ll is not None:
+    for gv in group_vars:
+        mdata[mod].obs[gv] = mdata.obs.loc[mdata[mod].obs_names,gv].astype('category')
+        mdata.update_obs()
+    ll = layers[mod]
+    if len(ll) > 0 and len(group_vars) > 0:
         for gv, layer in product(group_vars, ll):
             print(gv, layer)
             main(adata=mdata[mod],
diff --git a/python/plot_custom_markers_umap.py b/python/plot_custom_markers_umap.py
index e3029728..9d268f2d 100644
--- a/python/plot_custom_markers_umap.py
+++ b/python/plot_custom_markers_umap.py
@@ -105,19 +105,12 @@ def main(adata, mod, layer_choice, df, basis):
     print(mod)
     df_sub = df[df['mod'] == mod]
     mdata.update_obs()
-    # pull out the layers and set ll to None
-    # if no layer sepcificed
-    # (so the plotting funcs will use X)
-    try:
-        ll = layers[mod]
-    except KeyError:
-        ll = [None]
-    # pull out the basis e.g. X_umap
+    ll = layers[mod]
     if mod in basis_dict.keys():
         bb= basis_dict[mod]
     else:
         bb = []
-    if len(bb) > 0:
+    if len(ll) > 0 and len(bb) > 0:
         for basis, layer in product(bb, ll):
             print(basis,layer)
             main(adata=mdata[mod],
diff --git a/python/plot_scanpy_markers.py b/python/plot_scanpy_markers.py
index 74b88969..e6f167f2 100644
--- a/python/plot_scanpy_markers.py
+++ b/python/plot_scanpy_markers.py
@@ -59,81 +59,78 @@
 
 # script
 
+def calc_dendrogram(adata, group_col):
+    if len(adata.obs[group_col].cat.categories) > 5:
+        incl_dendrogram = True
+        if "X_pca" not in adata.obsm.keys():
+            sc.pp.pca(adata)
+        L.info("calculating dendrogram")
+        sc.tl.dendrogram(adata, groupby=group_col, use_rep="X_pca")
+    else:
+        incl_dendrogram = False
+    return incl_dendrogram
+
 
-def main(adata, mod, layer, group_col, mf, n):
+def do_plots(adata, mod, group_col, mf, n=10, layer=None):
     L.debug("check layers")
-    if layer != 'X':
-        adata.X = adata.layers[layer]
-    # run pca if not done already.( for dendorgram calc)
-    if("X_pca" in adata.obsm.keys()):
-        sc.pp.pca(adata)
-    # read file
+    # get markers for plotting
+    mf = mf[mf['avg_logFC'] > 0]
     df = mf.groupby(group_col).apply(lambda x: x.nlargest(n, ['scores'])).reset_index(drop=True)
     marker_list={str(k): list(v) for k,v in df.groupby(group_col)["gene"]}
     # add cluseter col to obs
-    adata.obs[group_col] = adata.obs[group_col].astype('str').astype('category')
-    L.debug("do plots")
+    # check whether a dendrogram should be included
+    incl_dendrogram = calc_dendrogram(adata, group_col)
+    L.info("start plotting")
     sc.pl.stacked_violin(adata,
                 marker_list,
                 groupby=group_col,
-                save= '_top_markers'+ mod +'.png',
-                figsize=(24, 5))
+                layer=layer,
+                save = '_top_markers'+ mod +'.png',
+                dendrogram=incl_dendrogram,
+                # figsize=(24, 5)
+                )
     sc.pl.matrixplot(adata,
                 marker_list,
                 groupby=group_col,
                 save= '_top_markers'+ mod +'.png',
+                dendrogram=incl_dendrogram,
                 figsize=(24, 5))
     sc.pl.dotplot(adata,
                 marker_list,
                 groupby=group_col,
                 save= '_top_markers'+ mod +'.png',
+                dendrogram=incl_dendrogram,
                 figsize=(24, 5))
     sc.pl.heatmap(adata,
                 marker_list,
                 groupby=group_col,
                 save= '_top_markers'+ mod +'.png',
-                figsize=(24, 5))
+                dendrogram=incl_dendrogram,
+                # figsize=(24, 5)
+                )
 
-L.debug("load data")
+# read data
 mdata = mu.read(args.infile)
-
-L.debug("load marker file")
-mf = pd.read_csv(args.marker_file, sep='\t' )
-mf[args.group_col] = mf['cluster'].astype('str').astype('category')
-# get layer
-try:
-    layers = read_yaml(args.layer)
-except AttributeError:
-    layers = args.layer
-
-if type(mdata) is mu.MuData and args.modality is None and type(layers) is not dict:
-    sys.exit('if inputting a mudata object, layers must be in a dictionary')
 
 if type(mdata) is AnnData:
     adata = mdata
-    main(adata,mod=args.modality, layer=args.layer, n=args.n)
-elif args.modality is not None:
-    adata = mdata[args.modality]
-    if type(layers) is dict:
-        ll = layers[args.modality]
-    else:
-        ll = layers
-    main(adata,
-        mod=args.modality,
-        layer = ll,
-        mf=mf,
-        group_col = args.group_col,
-        n=int(args.n))
+    # the main function only does rank_genes_groups on X
+elif type(mdata) is mu.MuData and args.modality is not None:
+    adata = mdata[args.modality]
 else:
-    # we have multimodal object, and some kind of multimodal clustering output
-    for mod in mf['mod'].unique():
-        print(mod)
-        mf_sub = mf[mf['mod'] == mod]
-        mdata[mod].obs[args.group_col] = mdata.obs.loc[mdata[mod].obs_names,args.group_col].astype('category')
-        mdata.update_obs()
-        main(mdata[mod], mod=mod, layer = layers[mod], group_col = args.group_col, mf=mf_sub, n=int(args.n))
+    sys.exit('if inputting a mudata object, you need to specify a modality')
+
+L.info("load marker file")
+mf = pd.read_csv(args.marker_file, sep='\t' )
+mf[args.group_col] = mf['cluster'].astype('category')
+# get layer
+adata.obs[args.group_col] = mdata.obs[args.group_col]
+do_plots(adata,
+    mod=args.modality,
+    group_col=args.group_col,
+    mf=mf,
+    layer=args.layer,
+    n=int(args.n))
diff --git a/python/plot_umaps_batch_correct.py b/python/plot_umaps_batch_correct.py
index 88dae768..5c2aca59 100644
--- a/python/plot_umaps_batch_correct.py
+++ b/python/plot_umaps_batch_correct.py
@@ -178,3 +178,4 @@
             g.savefig(os.path.join(args.fig_dir, mod, "umap_method_" + qc + ".png"), dpi = 300)
             plt.clf()
 
+L.info('done')
\ No newline at end of file
diff --git a/python/plot_variables_umaps.py b/python/plot_variables_umaps.py
index 8c4cef73..ec44c0f0 100644
--- a/python/plot_variables_umaps.py
+++ b/python/plot_variables_umaps.py
@@ -8,7 +8,7 @@
 from anndata import AnnData
 import pandas as pd
 import argparse
-from panpipes.funcs.io import read_yaml
+from panpipes.funcs.io import read_yaml, dictionary_stripper
 from itertools import product, chain
 
 sc.settings.autoshow = False
@@ -69,10 +69,11 @@ def main(adata, mod, plot_features, basis, fig_suffix):
 
 # get bases
-basis_dict = read_yaml(args.basis_dict)
+basis_dict = dictionary_stripper(read_yaml(args.basis_dict))
 
 if args.categorical_variables is not None:
-    cat_vars = read_yaml(args.categorical_variables)
+    cat_vars = dictionary_stripper(read_yaml(args.categorical_variables))
+
     uniq_discrete = list(set(chain(*cat_vars.values())))
     # make sure they are categories
     mdata.obs[uniq_discrete] = mdata.obs[uniq_discrete].apply(lambda x: x. astype('category'))
@@ -82,7 +83,7 @@ def main(adata, mod, plot_features, basis, fig_suffix):
 
 if args.continuous_variables is not None:
     try:
-        cont_vars = read_yaml(args.continuous_variables)
+        cont_vars = dictionary_stripper(read_yaml(args.continuous_variables))
     except AttributeError:
         # this assumes that we have tried to parse a dict and nstead found a string
         # there is probably a better solution
diff --git a/python/refmap_scvitools.py b/python/refmap_scvitools.py
index 8c4dc476..d05ed303 100644
--- a/python/refmap_scvitools.py
+++ b/python/refmap_scvitools.py
@@ -1,7 +1,7 @@
 import multiprocessing
 from matplotlib import transforms
 
-
+import anndata as ad
 import scanpy as sc
 import pandas as pd
 import matplotlib.pyplot as plt
@@ -30,6 +30,8 @@
 parser.add_argument('--query_data',
                     default='adata_scaled.h5ad',
                     help='path to query data. can be a raw 10x dataset or a preprocessed anndata/mudata')
+parser.add_argument('--query_celltype', default='celltype',
+                    help='name of the obs column with cell labels, if the query already has one. Defaults to "celltype"')
 parser.add_argument('--adata_reference', default='adata_.h5ad',
                     help='path to reference ann/mudata. necessary only if you want to produce ref/ query map')
@@ -40,6 +42,8 @@
                     help='scvi, scanvi, totalvi supported')
 # parser.add_argument('--generate_scanvi', default=False,
 #                     help='create scanvi reference from scvi to allow for label transfer')
+parser.add_argument('--predict_rf', default=False,
+                    help='for totalvi and scvi, if the reference model has a random forest classifier, classify cells in the query')
 parser.add_argument('--impute_proteins', default=False,
                     help='if totalvi reference is used, allow to impute protein levels')
 parser.add_argument('--transform_batch', default=None,
@@ -56,7 +60,10 @@
 args, opt = parser.parse_known_args()
 
 sc.settings.figdir = "figures/"
+sc.set_figure_params(figsize=(8, 6), dpi=300)
 L.info("running with args:")
+args.predict_rf = check_for_bool(args.predict_rf)
+args.impute_proteins = check_for_bool(args.impute_proteins)
 L.info(args)
 threads_available = multiprocessing.cpu_count()
 
@@ -65,6 +72,9 @@
 params = read_yaml("pipeline.yml")
 
 query_data = os.path.basename(args.query_data)
+reference_architecture = str(args.reference_architecture)
+
+
 mdata = mu.read(args.query_data)
 
 if type(mdata) is mu.MuData:
@@ -72,6 +82,20 @@
         sys.exit("we only support querying using RNA but your mdata doesn't contain rna")
     else:
         adata_query = mdata["rna"].copy()
+        if "prot" in mdata.mod.keys() and reference_architecture=="totalvi":
+            if X_is_raw(mdata['prot']):
+                X_array = mdata['prot'].X.copy()
+            elif "raw_counts" in mdata['prot'].layers.keys():
+                X_array = mdata['prot'].layers['raw_counts'].copy()
+
+            X_df = pd.DataFrame(X_array.todense(), index=mdata['prot'].obs_names, columns=mdata['prot'].var_names)
+            if X_df.shape[0] == adata_query.X.shape[0]:
+                L.info("adding protein_expression to obsm")
+                #check the obs are in the correct order
+                X_df = X_df.loc[adata_query.obs_names,:]
+                adata_query.obsm['protein_expression'] = X_df
+            else:
+                L.error("dimensions do not match, cannot create the raw obsm counts in query")
 else:
     adata_query = mdata.copy()
 
@@ -87,7 +111,7 @@
     raise AttributeError("raw counts not found in query, \
         store in either X or in 'counts/raw_counts' layer")
 
-reference_architecture = str(args.reference_architecture)
+
 if reference_architecture in ['totalvi','scvi','scanvi']:
     L.info("running using %s" % reference_architecture)
 else:
@@ -136,30 +160,132 @@
     vae_q.train(max_epochs= max_epochs , plan_kwargs=train_kwargs)
     adata_query.obsm["X_scANVI"] = vae_q.get_latent_representation()
     adata_query.obs["predictions"] = vae_q.predict()
-    df = adata_query.obs.groupby(["celltype", "predictions"]).size().unstack(fill_value=0)
-    norm_df = df / df.sum(axis=0)
-
-    plt.figure(figsize=(8, 8))
-    _ = plt.pcolor(norm_df)
-    _ = plt.xticks(np.arange(0.5, len(df.columns), 1), df.columns, rotation=90)
-    _ = plt.yticks(np.arange(0.5, len(df.index), 1), df.index)
-    plt.xlabel("Predicted")
-    plt.ylabel("Observed")
-    file_name = "SCANVI_predicted_vs_observed_labels_query_data"
-    plt.savefig(os.path.join("figures/", file_name + ".png"))
-
+    if args.query_celltype is not None:
+        L.info("Query has celltypes in column %s, plotting predictions from the scanvi model against them" % args.query_celltype)
+        df = adata_query.obs.groupby([str(args.query_celltype), "predictions"]).size().unstack(fill_value=0)
+        norm_df = df / df.sum(axis=0)
+
+        plt.figure(figsize=(8, 8))
+        _ = plt.pcolor(norm_df)
+        _ = plt.xticks(np.arange(0.5, len(df.columns), 1), df.columns, rotation=90)
+        _ = plt.yticks(np.arange(0.5, len(df.index), 1), df.index)
+        plt.xlabel("Predicted")
+        plt.ylabel("Observed")
+        file_name = "SCANVI_predicted_vs_observed_labels_query_data"
+        plt.savefig(os.path.join("figures/", file_name + ".png"))
+
 if reference_architecture=="totalvi":
+    # temporary fix, disabled for now; needs modifying so the query can be fixed to match the reference
+    fix_query = False
+    if fix_query:
+        L.info("will do some manipulation of query data to make it match the reference structure")
+        if args.adata_reference is not None:
+            reference_data = os.path.basename(args.adata_reference)
+            mdata=mu.read(args.adata_reference)
+            if type(mdata) is MuData:
+                if "rna" not in mdata.mod.keys():
+                    sys.exit("we only support querying using RNA but your mdata doesn't contain rna")
+                else:
+                    adata_ref = mdata["rna"].copy()
+                    adata_ref.obsm = mdata.obsm.copy()
+                    adata_ref.obsp = mdata.obsp.copy()
+            else:
+                adata_ref = mdata.copy()
+            del mdata
+            # match query and reference vars
+
+            #adata_query.layers["counts"] = adata_query.X.copy() #already taken care of
+            # are the following necessary?
+            sc.pp.normalize_total(adata_query, target_sum=1e4)
+            sc.pp.log1p(adata_query)
+            adata_query.raw = adata_query
+            # subset to reference vars
+            adata_query = adata_query[:, adata_ref.var_names].copy()
+
+            adata_query.obs["celltype.l2"] = "Unknown"
+            adata_query.obs["orig.ident"] = adata_query.obs["set"]
+            #query.obsm["X_umap"] = query.obs[["UMAP1", "UMAP2"]]
+
+            # the obsm slot for protein counts can be called either protein_counts or protein_expression
+            # in the reference anndata and model, so allow either
+            if "protein_expression" in adata_ref.obsm.keys():
+                pname = "protein_expression"
+            elif "protein_counts" in adata_ref.obsm.keys():
+                pname = "protein_counts"
+
+            if pname not in adata_query.obsm.keys():
+                X_df = pd.DataFrame(0, index=adata_ref.obs_names, columns=adata_ref.obsm[pname].columns)
+
+            adata_query.obsm["protein_counts"] = query.obsm["pro_exp"].copy()
+
+            for p in adata_ref.obsm[pname].columns:
+                if p not in adata_query.obsm[pname].columns:
+                    adata_query.obsm[pname][p] = 0.0
+            # ensure columns are in same order
+            adata_query.obsm[pname] = adata_query.obsm[pname].loc[:, adata_ref.obsm[pname].columns]
+
+        else:
+            # For now make this only work when reference anndata is available
+            # if args.reference_var is not None:
+            #     reference_var = pd.read(args.reference_var, sep="\t")
+            # else:
+            #     sys.exit("I need var names from reference to make sure query has overlap")
+            # if args.reference_prot is not None:
+            #     reference_prot = pd.read(args.reference_prot, sep="\t")
+            # else:
+            #     sys.exit("I need var names from reference to make sure query has overlap")
+            # if args.reference_prot_assay is not None:
+            #     L.info("name of obsm slot for reference is: %i" % args.reference_prot_assay)
+            #     pname = args.reference_prot_assay
+            sys.exit("need a reference dataset to check for matching query entries")
+
     scvi.model.TOTALVI.prepare_query_anndata(adata_query, reference_path)
     vae_q = scvi.model.TOTALVI.load_query_data(
         adata_query,
         reference_path,
         freeze_expression=True
     )
-    latent_choice= "X_totalVI"
+    latent_choice= "X_totalvi"
     vae_q.train(max_epochs= max_epochs , plan_kwargs=train_kwargs)
-    adata_query.obsm["X_totalVI"] = vae_q.get_latent_representation()
+    adata_query.obsm["X_totalvi"] = vae_q.get_latent_representation()
+    #remove this after finishing
+    adata_query.write(os.path.join( "query_temp_check_totvi.h5ad"))
+
+    if args.predict_rf :
+        L.info("predicting celltypes")
+        predictions = (
+            vae_q.latent_space_classifer_.predict(
+                adata_query.obsm["X_totalvi"]
+            )
+        )
+        adata_query.obs["predictions"] = predictions
+        #remove this after finishing
+        adata_query.write(os.path.join( "query_temp_check_predictions_totvi.h5ad"))
+
+    if args.query_celltype is not None:
+        L.info("Query has celltypes in column %s, plotting predictions from the totalvi model against them" % args.query_celltype)
+        df = adata_query.obs.groupby([str(args.query_celltype), "predictions"]).size().unstack(fill_value=0)
+        norm_df = df / df.sum(axis=0)
+
+        plt.figure(figsize=(8, 8))
+        _ = plt.pcolor(norm_df)
+        _ = plt.xticks(np.arange(0.5, len(df.columns), 1), df.columns, rotation=90)
+        _ = plt.yticks(np.arange(0.5, len(df.index), 1), df.index)
+        plt.xlabel("Predicted")
+        plt.ylabel("Observed")
+        file_name = "totalvi_predicted_vs_observed_labels_query_data"
+        plt.savefig(os.path.join("figures/", file_name + ".png"))
 
+L.info("Mapped Query to Reference, now some utilities")
+
 if args.adata_reference is not None:
     reference_data = os.path.basename(args.adata_reference)
     mdata=mu.read(args.adata_reference)
@@ -175,8 +301,10 @@
 
     adata_ref.obs.loc[:, 'is_reference'] = 'Reference'
     adata_query.obs.loc[:, 'is_reference'] = 'Query'
-
-    adata_full = adata_query.concatenate(adata_ref, batch_key="batch_key")
+    #expect the batch to be always encoded as `batch` in both Q and R
+    adata_full = ad.concat( [adata_ref,adata_query])
+    # add a param to decide whether to reuse the reference embedding or recompute it
+    #if "X_totalvi" not in adata_ref.obsm.keys():
     adata_full.obsm[latent_choice] = vae_q.get_latent_representation(adata_full)
 
     if args.impute_proteins:
@@ -200,10 +328,10 @@
     adata_full = adata_query.copy()
 
 if int(args.neighbors_n_pcs) > adata_full.obsm[latent_choice].shape[1]:
-    L.warn("N PCs is larger than %i dimensions, reducing n PCs to" % adata_full.obsm[latent_choice].shape[1] -1)
+    L.warn("N PCs is larger than the latent dimensions (%i), reducing n_pcs to match" % adata_full.obsm[latent_choice].shape[1])
 
-
-n_pcs= min(int(args.neighbors_n_pcs), adata_full.obsm[latent_choice].shape[1]-1)
+n_pcs= min(int(args.neighbors_n_pcs), adata_full.obsm[latent_choice].shape[1])
 
 run_neighbors_method_choice(adata_full,
@@ -220,24 +348,23 @@
 model_name = ''.join(e for e in model_name if e.isalnum())
 
-file_name= "umap" + model_name + "_" + latent_choice
+file_name= "umap_" + model_name + "_" + latent_choice
 fig = sc.pl.embedding(adata_full, basis = "umap",color=["is_reference"],
                       show=False, return_fig=True)
 fig.savefig(os.path.join("figures/", file_name + ".png"))
 
-umap = pd.DataFrame(mdata.obsm['X_umap'], mdata.obs.index)
+umap = pd.DataFrame(adata_full.obsm['X_umap'], adata_full.obs.index)
 umap.to_csv(os.path.join("refmap/", file_name + ".csv") )
 
 file_name= "query_to_reference_" + model_name + "_" + latent_choice + ".h5mu"
 #change this to mudata
 mdata_save = MuData({"rna":adata_full})
-write_anndata(mdata_save, use_muon=True, modality="rna", fname=os.path.join( "refmap/" , file_name))
-
-
+mdata_save.write(os.path.join( "refmap/" , file_name))
+L.info('done')
diff --git a/python/run_find_markers_multi.py b/python/run_find_markers_multi.py
index 66225d17..387e40ab 100644
--- a/python/run_find_markers_multi.py
+++ b/python/run_find_markers_multi.py
@@ -3,14 +3,11 @@
 from anndata import AnnData
 import argparse
 import pandas as pd
-import re
 import numpy as np
 from scipy.sparse import issparse
-from itertools import compress
-
 from panpipes.funcs.processing import check_for_bool
-from panpipes.funcs.scmethods import pseudo_seurat
-from panpipes.funcs.io import read_yaml
+from panpipes.funcs.scmethods import find_all_markers_pseudo_seurat
+
 import sys
 import logging
@@ -47,20 +44,14 @@
     help="testing limited to genes with this (log scale) difference in mean expression level.")
 parser.add_argument("--mincells", type=str, default='3',
     help="minimum number of cells required (applies to cluster and to the other cells)")
-parser.add_argument("--use_dense", type=str, default="False")
+
 
 args, opt = parser.parse_known_args()
 
-# args = argparse.Namespace(infile='prot/ncomps3_nneigh30_meteuclidean_dir/neighbors.h5ad', modality='prot', layer='rna', output_file_prefix='prot/ncomps3_nneigh30_meteuclidean_dir/algleiden_res1/markers', cluster_file='prot/ncomps3_nneigh30_meteuclidean_dir/algleiden_res1/clusters.txt.gz', testuse='wilcoxon', pseudo_seurat='False', minpct='0.1', mindiffpct='-inf', threshuse='0.25', mincells='10', use_dense='False')
 
 # script -------------------------------
 
-def main(adata, args, mod="", marker_columns=[]):
-    # check the X slot actually contains data
-    if adata.X.shape[1] == 0:
-        L.info("skipping %s because adata.X contains no values" % mod)
-        return None
-
+def check_log1p_dict(adata):
     # temp code to get around this bug
     # https://github.com/scverse/muon/issues/40
     if 'log1p' in adata.uns.keys():
@@ -68,97 +59,135 @@
             L.debug( "log1p dict is Empty, refilling dict")
             adata.uns['log1p'] = {'base': None}
 
+
+def load_and_merge_clusters(adata, cluster_file):
     # load clusterss
-    df = pd.read_csv(args.cluster_file, sep="\t", index_col=0)
+    df = pd.read_csv(cluster_file, sep="\t", index_col=0)
     adata_shape = adata.shape
-
     # add clusters to adata.obs
     adata.obs = pd.merge(adata.obs, df, how="left", left_index=True, right_index=True)
     # check check we have not lost cells in merge
     if adata.shape != adata_shape:
         L.info("some cells lost in merge, not all cells have a cluster?")
     L.debug(adata.shape)
-    if issparse(adata.X):
-        bool_list = (adata.X.sum(axis=0) > 0).tolist()[0]
-    else:
-        bool_list = (adata.X.sum(axis=0) > 0).tolist()
+
+
+def get_header():
+    # write out the correct header before we start; it needs writing up front
+    # because we use append mode for the multimodal find markers
+    # and we don't want to end up with a new header halfway through the file
+    if sc.__version__ < "1.7.0":
+        markers_columns = ['cluster', 'scores', 'gene', 'avg_logFC', 'pvals', 'p.adj.bonferroni']
+    elif sc.__version__ >= "1.7.0":
+        markers_columns = ['cluster', 'gene', 'scores', 'avg_logFC', 'pvals', 'p.adj.bonferroni']
+    return markers_columns
+
+
+def filter_zero_count_genes(adata, layer=None):
+    if layer is None:
+        if issparse(adata.X):
+            bool_list = (adata.X.sum(axis=0) > 0).tolist()[0]
+        else:
+            bool_list = (adata.X.sum(axis=0) > 0).tolist()
+    else:
+        if issparse(adata.layers[layer]):
+            bool_list = (adata.layers[layer].sum(axis=0) > 0).tolist()[0]
+        else:
+            bool_list = (adata.layers[layer].sum(axis=0) > 0).tolist()
     adata = adata[:, bool_list]
     L.debug(adata.shape)
-    run_pseudo_seurat = check_for_bool(args.pseudo_seurat)
-    use_dense = check_for_bool(args.use_dense)
-
-    clust_vals = set(adata.obs["clusters"])
-    markers_dict = {}
-    filter_stats_dict = {}
-
-    for cv in clust_vals:
-        L.info(cv)
-        # 1. set idents
-        adata.obs["idents"] = ["1" if cc == cv else "0" for cc in adata.obs["clusters"]]
-        adata.obs["idents"] = adata.obs["idents"].astype("category")
-        # check we have enough cells
-        if args.mincells == "None":
-            min_cells = 3
-        else:
-            min_cells = int(args.mincells)
-        n_cluster = sum([cell == '1' for cell in adata.obs['idents']])
-        n_other = sum([cell == '0' for cell in adata.obs['idents']])
-        if n_cluster <= int(args.mincells) | n_other <= int(args.mincells):
-            sys.exit("not enough cells in cluster, min cells: %i" % int(args.mincells))
-        L.info("Cluster %s number of cells: %i" % (cv, n_cluster))
-        L.info("Other number of cells: %i\n" % n_other)
-        # adata = adata.raw.to_adata().copy()
-        if run_pseudo_seurat is True:
-            filter_stats = pseudo_seurat(adata, use_dense=use_dense)
-            L.info("number of genes remaining after filtering: %i\n" % filter_stats['background'].sum())
-            adata_rg = adata[:, filter_stats['background'].tolist()].copy()
-
-            sc.tl.rank_genes_groups(adata_rg, layer=None,
-                groups=["1"], groupby="idents", reference="0",
-                method="wilcoxon", n_genes=float("inf"), corr_method="bonferroni")
-
-            markers = sc.get.rank_genes_groups_df(adata_rg, group="1",)
-            # remove adata from mem
-            adata_rg = None
-        else:
-            sc.tl.rank_genes_groups(adata, groups=["1"], groupby="idents", reference="0",
-                method="t-test_overestim_var",
-                n_genes=float("inf"),
-                corr_method="bonferroni", layer=None)
-            markers = sc.get.rank_genes_groups_df(adata, group="1")
-        # filter positive only and adjusted pval < 0.05
-        # markers = markers[(markers.logfoldchanges > 0) & (markers.pvals_adj < 0.05)]
-
-        markers.head()
+
+
+def run_clustering(adata,
+        cluster_col=None,
+        cluster_groups='all',
+        layer=None,
+        methoduse=None,
+        pseudo_seurat=False):
+    if type(cluster_groups) is list:
+        # make sure the clusters are strings else rank_gene_groups crashes
+        cluster_groups = [str(x) for x in cluster_groups]
+    adata.obs[cluster_col] = adata.obs[cluster_col].astype('str').astype('category')
+    if pseudo_seurat:
+        L.info('running rank gene groups with pseudo-seurat method')
+        markers, filter_stats = find_all_markers_pseudo_seurat(adata,
+            groups=cluster_groups,
+            groupby=cluster_col,
+            method=methoduse,
+            n_genes=float("inf"),
+            corr_method="bonferroni",
+            layer=layer,
+            arg_minpct=0.1,
+            arg_mindiffpct=-float("inf"),
+            arg_logfcdiff=0.25)
+        markers = markers.reset_index().drop(columns='level_1').rename(columns={'level_0':'gene'})
+    else:
+        # find markers
+        L.info('running rank gene groups with standard scanpy implementation')
+        sc.tl.rank_genes_groups(adata,
+            groups=cluster_groups,
+            groupby=cluster_col,
+            method=methoduse,
+            n_genes=float("inf"),
+            corr_method="bonferroni",
+            layer=layer)
+        markers = sc.get.rank_genes_groups_df(adata, group=None)
         L.info("Rank gene groups complete")
-        markers_dict[cv] = markers
-        if pseudo_seurat is True:
-            filter_stats_dict[cv] = filter_stats
-
-
-    all_markers = pd.concat(markers_dict, axis=0, keys=markers_dict.keys())
-    all_markers.columns = marker_columns[1:6]
-    all_markers = all_markers.reset_index().rename(columns={'level_0':'cluster'}).drop(columns="level_1")
+        filter_stats = None
+    return markers, filter_stats
+
+
+def main(adata,
+        cluster_file,
+        mod,
+        mincells=None,
+        layer=None,
+        testuse=None,
+        pseudo_seurat=False,
+        output_file_prefix="markers") :
+    # check the X slot actually contains data
+    if adata.X.shape[1] == 0:
+        L.info("exiting because adata.X contains no values")
+        sys.exit("exiting because adata.X contains no values")
+    # prevent log1p base error.
+    check_log1p_dict(adata)
+    # load clusters
+    load_and_merge_clusters(adata, cluster_file)
+    # filter out genes with zero counts
+    filter_zero_count_genes(adata, layer=layer)
+    # filter out clusters with less than min cells
+    if mincells is not None:
+        min_cell_df = adata.obs['clusters'].value_counts()
+        if any(min_cell_df < mincells):
+            exclude = min_cell_df[min_cell_df < mincells].index
+            L.info('excluding clusters for having too few cells: %s' % str(list(exclude)))
+        clust_vals = list(set(min_cell_df[min_cell_df >= mincells].index))
+    else:
+        clust_vals = list(set(adata.obs["clusters"]))
+    all_markers, all_filter_stats = run_clustering(adata,
+        cluster_groups=clust_vals,
+        cluster_col="clusters",
+        layer=layer,
+        methoduse=testuse,
+        pseudo_seurat=pseudo_seurat)
+    # make sure all files have consistent headers
+    all_markers.columns = get_header()
     all_markers['mod'] = mod
-    all_markers.to_csv(args.output_file_prefix + ".txt", index=False, sep='\t', header=None, mode="a")
-
+    all_markers.to_csv(output_file_prefix + ".txt", index=False, sep='\t')
+    # subset to significant markers
     signif_markers = all_markers[(all_markers["p.adj.bonferroni"] < 0.05) &( all_markers.avg_logFC > 0)]
     if signif_markers.shape[0] !=0:
-        signif_markers.to_csv(args.output_file_prefix + mod + "_signif.txt", index=False, sep='\t')
-        L.debug("test_debug")
-        excel_file_top = args.output_file_prefix + mod + "_signif.xlsx"
+        signif_markers.to_csv(output_file_prefix + mod + "_signif.txt", index=False, sep='\t')
+        # write out to excel
+        excel_file_top = output_file_prefix + mod + "_signif.xlsx"
         with pd.ExcelWriter(excel_file_top) as writer:
             for xx in signif_markers.cluster.unique():
-                L.debug(xx)
                 df_sub = signif_markers[signif_markers.cluster == xx]
                 if df_sub.shape[0] !=0:
                     df_sub.to_excel(writer, sheet_name="cluster" + str(xx), index=False)
-
+    # when relevant write out filter stats
     if pseudo_seurat is True:
-        all_filter_stats = pd.concat( filter_stats_dict, axis=0, keys= filter_stats_dict.keys(), ignore_index=True)
         all_filter_stats = all_filter_stats.reset_index().rename(columns={'index':'cluster'})
-        all_filter_stats.to_csv(args.output_file_prefix + "_filter_stats_" + mod + ".txt", index=False, sep="\t")
+        all_filter_stats.to_csv(output_file_prefix + "_filter_stats_" + mod + ".txt", index=False, sep="\t")
 
@@ -166,48 +195,28 @@ def main(adata, args, mod="", marker_columns=[]):
 L.info(args)
 L.info('\n')
 
-# get layer
-try:
-    layers = read_yaml(args.layer)
-except AttributeError:
-    layers = args.layer
-
 # read data
 mdata = read(args.infile)
 
-if type(mdata) is MuData and args.modality is None and type(layers) is not dict:
-    sys.exit('if inputting a mudata object, layers must be in a dictionary')
-
-# write out the correct header before we start, need to werite it out before, because we use append mode for the multimodal find markers
-# and we don't wantto end up with a new header half
-if sc.__version__ < "1.7.0":
-    markers_columns = ['cluster', 'scores', 'gene', 'avg_logFC', 'pvals', 'p.adj.bonferroni', 'mod']
-elif sc.__version__ >= "1.7.0":
-    markers_columns = ['cluster', 'gene', 'scores', 'avg_logFC', 'pvals', 'p.adj.bonferroni', 'mod']
-pd.DataFrame(columns=markers_columns).to_csv(args.output_file_prefix + ".txt", index=False, sep='\t', mode="w")
-
 if type(mdata) is AnnData:
     adata = mdata
-    if layers != "X" :
-        adata.X = adata.layers[layers[args.modality]]
-    main(adata, args,marker_columns=markers_columns, mod=args.modality)
-elif args.modality is not None:
-    adata = mdata[args.modality]
-    if type(layers) is dict:
-        ll = layers[args.modality]
-    else:
-        ll = layers
-    adata.X = adata.layers[ll]
-    main(adata, args, marker_columns=markers_columns, mod=args.modality)
+    # the main function only does rank_genes_groups on X
+elif type(mdata) is MuData and args.modality is not None:
+    adata = mdata[args.modality]
 else:
-    # we have multimodal object, and some kind of multimodal clustering output
-    for mod in mdata.mod.keys():
-        if type(layers) is dict:
-            ll = layers[args.modality]
-        else:
-            ll = layers
-        mdata[mod].X = mdata[mod].layers[ll]
-        main(mdata[mod], args, mod=mod, marker_columns=markers_columns)
+    sys.exit('if inputting a mudata object, you need to specify a modality')
+
+
+
+main(adata,
+    mod=args.modality,
+    cluster_file=args.cluster_file,
+    mincells=int(args.mincells),
+    layer=args.layer,
+    testuse=args.testuse,
+    pseudo_seurat=check_for_bool(args.pseudo_seurat),
+    output_file_prefix=args.output_file_prefix)
+L.info("done")
diff --git a/python/run_preprocess_rna.py b/python/run_preprocess_rna.py
index 1df9811b..84283e3d 100644
--- a/python/run_preprocess_rna.py
+++ b/python/run_preprocess_rna.py
@@ -234,4 +234,6 @@
 
 # save the scaled adata object
 mdata.write(args.output_scaled_mudata)
-L.debug(adata.uns['log1p'])
\ No newline at end of file
+L.debug(adata.uns['log1p'])
+
+L.info("done")
\ No newline at end of file
diff --git a/requirements_minimal.txt b/requirements_minimal.txt
index 47337598..40a54d2b 100644
--- a/requirements_minimal.txt
+++ b/requirements_minimal.txt
@@ -26,6 +26,6 @@
 scrublet
 scikit-misc
 mofapy2
 pysam
-#mudata @ git+https://github.com/scverse/mudata
-#muon @ git+https://github.com/scverse/muon
-#scirpy @ git+https://github.com/scverse/scirpy
+mudata @ git+https://github.com/scverse/mudata
+muon @ git+https://github.com/scverse/muon
+scirpy @ git+https://github.com/scverse/scirpy
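
For anyone wanting to sanity-check the new `find_all_markers_pseudo_seurat` helper outside the pipeline, here is a minimal sketch on synthetic data (the toy AnnData, the made-up `leiden` column and the parameter choices are illustrative assumptions, not part of this PR):

```python
import numpy as np
import anndata as ad
import scanpy as sc
from scipy.sparse import csr_matrix
from panpipes.funcs.scmethods import find_all_markers_pseudo_seurat

# build a small log-normalised, sparse AnnData with a made-up cluster column
X = csr_matrix(np.random.poisson(1.0, size=(200, 50)).astype(np.float32))
adata = ad.AnnData(X)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.obs["leiden"] = np.random.choice(["0", "1", "2"], size=adata.n_obs)

# returns one markers table concatenated over all clusters, plus the
# pseudo-seurat pre-filtering stats for each cluster-vs-rest comparison
markers, filter_stats = find_all_markers_pseudo_seurat(
    adata, groups="all", groupby="leiden", method="wilcoxon")
print(markers.head())
```

Because `groups='all'` loops one-vs-rest over every cluster, runtime grows with the number of clusters; the pseudo-seurat pre-filter exists precisely to keep each `rank_genes_groups` call small.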