Commit 3f0c255

update
Guangyuan Li authored and Guangyuan Li committed May 3, 2021
1 parent b738c05 commit 3f0c255
Showing 5 changed files with 152 additions and 75 deletions.
program/inspection.py (4 changes: 2 additions & 2 deletions)
@@ -6,10 +6,10 @@
def plot_DE_umap_save(adata,reference):
with open('./scTriangulate_inspection/log.txt','w') as f:
for cluster in adata.obs[reference].astype('category').cat.categories:
adata_s = adata[adata.obs[reference]==cluster,:]
adata_s = adata[adata.obs[reference]==cluster,:].copy()

# first, save adata_s.h5ad (cellxgene)
adata_s.raw.to_adata().write('./scTriangulate_inspection/to_cellxgene_{}.h5ad'.format(cluster))
adata_s.write('./scTriangulate_inspection/to_cellxgene_{}.h5ad'.format(cluster))

# second, umap
sc.pl.umap(adata_s,color=['reassign_prefix'])
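A minimal sketch (not part of this commit) of the view-versus-copy behavior the added .copy() guards against: slicing an AnnData yields a lightweight view, and mutating or writing a view can fail or trigger implicit copies. The dataset and path here are illustrative only.

import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()  # small demo dataset bundled with scanpy
subset = adata[adata.obs['bulk_labels'] == 'CD14+ Monocyte', :]
print(subset.is_view)    # True: slicing returns a view, not a copy
subset = subset.copy()   # materialize an independent AnnData object
print(subset.is_view)    # False: now safe to modify and write
subset.write('./to_cellxgene_demo.h5ad')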
program/metrics.py (21 changes: 10 additions & 11 deletions)
@@ -86,8 +86,8 @@ def marker_gene(adata, key):
if adata.uns.get('rank_genes_groups') is not None:
del adata.uns['rank_genes_groups']
# perform t-test
sc.tl.rank_genes_groups(adata, key, method='t-test',n_genes=adata.raw.shape[1])
all_genes = adata.raw.var_names.values # ndarray, all the genes
sc.tl.rank_genes_groups(adata, key, method='t-test',n_genes=adata.shape[1])
all_genes = adata.var_names.values # ndarray, all the genes
all_clusters = adata.obs[key].cat.categories # pd.Index, all the clusters
cluster2gene = dict() # {'cluster1':[gene1,gene2..]}
rank_uns = adata.uns['rank_genes_groups']
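For context, the change above ranks genes against adata itself rather than adata.raw, so n_genes must match adata.shape[1]. A hedged sketch of the same call (dataset and grouping key are illustrative only):

import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
# t-test ranking over every gene: n_genes equals the full width of adata
sc.tl.rank_genes_groups(adata, 'bulk_labels', method='t-test', n_genes=adata.shape[1])
rank_uns = adata.uns['rank_genes_groups']
print(rank_uns['names'][:5])  # top-ranked gene names, one field per cluster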
@@ -111,11 +111,11 @@ def marker_gene(adata, key):
assign = all_clusters[np.argmin(np.array(index_store))] # get argmin, take the corresponding cluster
cluster2gene[assign].append((gene,np.min(index_store)))
# sort the cluster2gene
for key,value in cluster2gene.items():
for key_,value in cluster2gene.items():
gene = [item[0] for item in value]
rank = [item[1] for item in value]
temp = sorted(zip(gene,rank),key=lambda x:x[1])
cluster2gene[key] = [item[0] for item in temp]
cluster2gene[key_] = [item[0] for item in temp]
result = pd.Series(cluster2gene).to_frame()
result.columns = ['whole_marker_genes']

@@ -137,7 +137,7 @@ def marker_gene(adata, key):

result['enrichr'] = col_enrichr
result['purify'] = col_purify

result.to_csv('./scTriangulate_result/marker_{0}.txt'.format(key),sep='\t')
return result


@@ -150,13 +150,12 @@ def reassign_score(adata,key,marker):
pick = marker_genes[:num] # if the list has fewer than 30 markers, that's OK; Python slicing just takes all of them
pool.extend(pick)
pool = list(set(pool))
adata_now = adata.raw.to_adata()
adata_now = adata_now[:,pool]
adata_now = adata[:,pool]

# reducing dimension
from sklearn.decomposition import PCA
reducer = PCA(n_components=30)
scoring = reducer.fit_transform(X=adata_now.X.toarray()) #become dense matrix
scoring = reducer.fit_transform(X=adata_now.X)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
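Dropping .toarray() assumes adata_now.X is already dense; if X can still be sparse, an explicit check is safer. A sketch with stand-in data (not the commit's code):

import numpy as np
from scipy import sparse
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

X = np.random.rand(100, 50)  # stand-in for adata_now.X (cells x pooled marker genes)
if sparse.issparse(X):       # scikit-learn's PCA needs a dense array here
    X = X.toarray()
scoring = PCA(n_components=30).fit_transform(X)  # (100, 30) low-dimensional scores

labels = ['T', 'B', 'NK', 'Mono'] * 25
y = LabelEncoder().fit_transform(labels)  # string labels -> integers 0..k-1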
@@ -211,13 +210,13 @@ def tf_idf_bare_compute(df,cluster):
return tf_idf_ori

def tf_idf_for_cluster(adata,key):
df = pd.DataFrame(data=adata.raw.X.toarray(), index=adata.obs_names, columns=adata.raw.var_names) #become dense matrix
df = pd.DataFrame(data=adata.X, index=adata.obs_names, columns=adata.var_names)
df['cluster'] = adata.obs[key].astype('str').values
cluster_to_tfidf = {} # store tfidf score
cluster_to_exclusive = {} # store exclusively expressed genes
for item in adata.obs[key].cat.categories:
a = tf_idf_bare_compute(df,item)
a_names = adata.raw.var_names
a_names = adata.var_names
test = pd.Series(data=a, index=a_names)
test.sort_values(ascending=False, inplace=True)
# remove artifact genes
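tf_idf_bare_compute itself is not shown in this diff, so the following is only an illustrative TF-IDF-style scheme for cluster markers, not the commit's formula: tf is the fraction of cluster cells expressing a gene, and idf down-weights genes expressed everywhere.

import numpy as np
import pandas as pd

def tf_idf_sketch(df, cluster):
    expr = df.drop(columns='cluster') > 0             # binarize expression
    tf = expr[df['cluster'] == cluster].mean(axis=0)  # fraction of cluster cells expressing each gene
    idf = np.log(len(df) / (expr.sum(axis=0) + 1))    # rarer genes score higher; +1 avoids division by zero
    return tf * idf                                   # pd.Series indexed by gene name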
@@ -240,7 +239,7 @@ def SCCAF_score(adata, key):
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
# define X and Y and exclude cells whose cluster only have 1 cell
X = adata.raw.X.toarray()
X = adata.X
Y = adata.obs[key].values

# label encoding Y to numerical values
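A hedged sketch of the SCCAF-style self-projection this function builds on: fit a logistic regression on half the cells, then read the confusion matrix to see which clusters the classifier can re-identify. Random stand-in data replaces adata.X:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

X = np.random.rand(200, 30)                    # stand-in for adata.X
Y = np.random.choice(['c1', 'c2', 'c3'], 200)  # stand-in cluster labels

X_tr, X_te, y_tr, y_te = train_test_split(X, Y, test_size=0.5, stratify=Y)
clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
print(confusion_matrix(y_te, clf.predict(X_te)))  # a strong diagonal means separable clusters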
program/prune.py (43 changes: 30 additions & 13 deletions)
@@ -11,6 +11,7 @@
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import rankdata
import multiprocessing as mp

import scanpy as sc
import anndata as ad
@@ -71,45 +72,61 @@ def inclusiveness(obs,r,c):
return fraction_r,fraction_c,result,nearly



def reference_pruning(adata,reference,size_dict):
obs = adata.obs
obs['ori'] = np.arange(obs.shape[0]) # keep original index order in one column
pruned_chunks = [] # store pruned chunks; one chunk means one reference cluster
for chunk in obs.groupby(by=reference):
def run_reference_pruning(chunk,reference,size_dict,obs):
with open('./scTriangulate_present/log_prune_step_{}.txt'.format(chunk[0]),'a') as log:
log.write('reference cluster: {}\n'.format(chunk[0]))
log.flush()
subset = chunk[1]
vc = subset['engraft'].value_counts()
overlap_clusters = vc.index
mapping = {}
for cluster in overlap_clusters:
print('--- query cluster: {}'.format(cluster),file=log,flush=True)
r = {reference:chunk[0]}
c = {cluster.split('_')[0]:cluster.split('_')[1]}
c = {cluster.split('$')[0]:cluster.split('$')[1]}
fraction_r,fraction_c,result,nearly = inclusiveness(obs,r,c)
if not result: # not inclusive, fall back to the reference annotation
mapping[cluster] = reference + '_' + chunk[0]
mapping[cluster] = reference + '$' + chunk[0]
else:
proportion_to_ref = vc.loc[cluster] / vc.sum()
proportion_to_self = vc.loc[cluster] / size_dict[cluster.split('_')[0]][cluster.split('_')[1]]
proportion_to_self = vc.loc[cluster] / size_dict[cluster.split('$')[0]][cluster.split('$')[1]]
if proportion_to_ref < 0.1 and not nearly: # covers < 10% of the reference cluster and is not nearly included
mapping[cluster] = reference + '_' + chunk[0]
mapping[cluster] = reference + '$' + chunk[0]
elif nearly and proportion_to_ref < 0.1 and proportion_to_self < 0.1: # nearly included, so it evades the first check, but the to_self proportion is low
mapping[cluster] = reference + '_' + chunk[0]
mapping[cluster] = reference + '$' + chunk[0]
else:
mapping[cluster] = cluster

subset['reassign'] = subset['engraft'].map(mapping).values

print('finished first part',file=log,flush=True)

# reassign to the most abundant type if an engraft cluster has only 1 cell
vc2 = subset['reassign'].value_counts()
most_abundant_cluster = vc2.loc[vc2==vc2.max()].index[0] # if multiple, just pick the first one
exclude_clusters = vc2.loc[vc2==1].index
for i in range(subset.shape[0]):
if subset.iloc[i]['reassign'] in exclude_clusters:
subset.loc[:,'reassign'].iloc[i] = most_abundant_cluster # caution: this can trigger pandas' SettingWithCopyWarning
pruned_chunks.append(subset)

print('finished second part',file=log,flush=True)
print(subset.shape,file=log,flush=True)
return subset
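The switch from '_' to '$' as the annotation/cluster separator is presumably because annotation names can themselves contain underscores, which makes split('_') ambiguous (an inference; the commit message does not say). A hypothetical label illustrates the problem:

name = 'leiden_r1_3'              # hypothetical: annotation 'leiden_r1', cluster '3'
print(name.split('_'))            # ['leiden', 'r1', '3']  -- ambiguous
print('leiden_r1$3'.split('$'))   # ['leiden_r1', '3']     -- unambiguous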


def reference_pruning(obs,reference,size_dict):
obs['ori'] = np.arange(obs.shape[0]) # keep original index order in one column
pruned_chunks = [] # store pruned chunks; one chunk means one reference cluster
chunk_list = list(obs.groupby(by=reference))
cores = len(chunk_list)
pool = mp.Pool(processes=cores)
r = [pool.apply_async(run_reference_pruning,args=(chunk,reference,size_dict,obs)) for chunk in chunk_list]
pool.close()
pool.join()
pruned_chunks = [collect.get() for collect in r]
modified_obs = pd.concat(pruned_chunks)
modified_obs.sort_values(by='ori',inplace=True)
adata.obs = modified_obs
return modified_obs
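The refactor fans one worker per reference cluster out over a multiprocessing pool via apply_async. A self-contained sketch of the same pattern, with a trivial stand-in for the pruning logic:

import multiprocessing as mp
import pandas as pd

def worker(chunk):
    name, subset = chunk            # groupby yields (group_name, sub-DataFrame) tuples
    subset = subset.copy()
    subset['processed'] = True      # stand-in for the per-cluster pruning
    return subset

if __name__ == '__main__':          # guard is required on spawn-based platforms
    df = pd.DataFrame({'ref': ['a', 'a', 'b'], 'ori': [0, 1, 2]})
    chunks = list(df.groupby('ref'))
    pool = mp.Pool(processes=len(chunks))  # one process per reference cluster
    r = [pool.apply_async(worker, args=(c,)) for c in chunks]
    pool.close()
    pool.join()
    print(pd.concat([res.get() for res in r]).sort_values(by='ori'))

One process per cluster mirrors the code above but can oversubscribe cores when there are many clusters; capping the pool size at mp.cpu_count() is a common alternative.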


