Commit 3f0c255

update
Guangyuan Li authored and Guangyuan Li committed May 3, 2021
1 parent b738c05 commit 3f0c255
Showing 5 changed files with 152 additions and 75 deletions.
program/inspection.py (4 changes: 2 additions & 2 deletions)
@@ -6,10 +6,10 @@
def plot_DE_umap_save(adata,reference):
with open('./scTriangulate_inspection/log.txt','w') as f:
for cluster in adata.obs[reference].astype('category').cat.categories:
adata_s = adata[adata.obs[reference]==cluster,:]
adata_s = adata[adata.obs[reference]==cluster,:].copy()

# first, save adata_s.h5ad (cellxgene)
adata_s.raw.to_adata().write('./scTriangulate_inspection/to_cellxgene_{}.h5ad'.format(cluster))
adata_s.write('./scTriangulate_inspection/to_cellxgene_{}.h5ad'.format(cluster))

# second, umap
sc.pl.umap(adata_s,color=['reassign_prefix'])
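A minimal sketch (not part of this commit) of the view-versus-copy behavior the added .copy() guards against: slicing an AnnData yields a lightweight view, and mutating or writing a view can fail or trigger implicit copies. The dataset and path here are illustrative only.

import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()  # small demo dataset bundled with scanpy
subset = adata[adata.obs['bulk_labels'] == 'CD14+ Monocyte', :]
print(subset.is_view)    # True: slicing returns a view, not a copy
subset = subset.copy()   # materialize an independent AnnData object
print(subset.is_view)    # False: now safe to modify and write
subset.write('./to_cellxgene_demo.h5ad')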
program/metrics.py (21 changes: 10 additions & 11 deletions)
@@ -86,8 +86,8 @@ def marker_gene(adata, key):
if adata.uns.get('rank_genes_groups') is not None:
del adata.uns['rank_genes_groups']
# perform t-test
sc.tl.rank_genes_groups(adata, key, method='t-test',n_genes=adata.raw.shape[1])
all_genes = adata.raw.var_names.values # ndarray, all the genes
sc.tl.rank_genes_groups(adata, key, method='t-test',n_genes=adata.shape[1])
all_genes = adata.var_names.values # ndarray, all the genes
all_clusters = adata.obs[key].cat.categories # pd.Index, all the clusters
cluster2gene = dict() # {'cluster1':[gene1,gene2..]}
rank_uns = adata.uns['rank_genes_groups']
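For context, the change above ranks genes against adata itself rather than adata.raw, so n_genes must match adata.shape[1]. A hedged sketch of the same call (dataset and grouping key are illustrative only):

import scanpy as sc

adata = sc.datasets.pbmc68k_reduced()
# t-test ranking over every gene: n_genes equals the full width of adata
sc.tl.rank_genes_groups(adata, 'bulk_labels', method='t-test', n_genes=adata.shape[1])
rank_uns = adata.uns['rank_genes_groups']
print(rank_uns['names'][:5])  # top-ranked gene names, one field per cluster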
@@ -111,11 +111,11 @@ def marker_gene(adata, key):
assign = all_clusters[np.argmin(np.array(index_store))] # get argmin, take the corresponding cluster
cluster2gene[assign].append((gene,np.min(index_store)))
# sort the cluster2gene
for key,value in cluster2gene.items():
for key_,value in cluster2gene.items():
gene = [item[0] for item in value]
rank = [item[1] for item in value]
temp = sorted(zip(gene,rank),key=lambda x:x[1])
cluster2gene[key] = [item[0] for item in temp]
cluster2gene[key_] = [item[0] for item in temp]
result = pd.Series(cluster2gene).to_frame()
result.columns = ['whole_marker_genes']

@@ -137,7 +137,7 @@ def marker_gene(adata, key):

result['enrichr'] = col_enrichr
result['purify'] = col_purify

result.to_csv('./scTriangulate_result/marker_{0}.txt'.format(key),sep='\t')
return result


@@ -150,13 +150,12 @@ def reassign_score(adata,key,marker):
pick = marker_genes[:num] # if the list has fewer than 30 markers, that's OK; Python slicing just takes all of them
pool.extend(pick)
pool = list(set(pool))
adata_now = adata.raw.to_adata()
adata_now = adata_now[:,pool]
adata_now = adata[:,pool]

# reducing dimension
from sklearn.decomposition import PCA
reducer = PCA(n_components=30)
scoring = reducer.fit_transform(X=adata_now.X.toarray()) #become dense matrix
scoring = reducer.fit_transform(X=adata_now.X)

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
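Dropping .toarray() assumes adata_now.X is already dense; if X can still be sparse, an explicit check is safer. A sketch with stand-in data (not the commit's code):

import numpy as np
from scipy import sparse
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

X = np.random.rand(100, 50)  # stand-in for adata_now.X (cells x pooled marker genes)
if sparse.issparse(X):       # scikit-learn's PCA needs a dense array here
    X = X.toarray()
scoring = PCA(n_components=30).fit_transform(X)  # (100, 30) low-dimensional scores

labels = ['T', 'B', 'NK', 'Mono'] * 25
y = LabelEncoder().fit_transform(labels)  # string labels -> integers 0..k-1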
@@ -211,13 +210,13 @@ def tf_idf_bare_compute(df,cluster):
return tf_idf_ori

def tf_idf_for_cluster(adata,key):
df = pd.DataFrame(data=adata.raw.X.toarray(), index=adata.obs_names, columns=adata.raw.var_names) #become dense matrix
df = pd.DataFrame(data=adata.X, index=adata.obs_names, columns=adata.var_names)
df['cluster'] = adata.obs[key].astype('str').values
cluster_to_tfidf = {} # store tfidf score
cluster_to_exclusive = {} # store exclusively expressed genes
for item in adata.obs[key].cat.categories:
a = tf_idf_bare_compute(df,item)
a_names = adata.raw.var_names
a_names = adata.var_names
test = pd.Series(data=a, index=a_names)
test.sort_values(ascending=False, inplace=True)
# remove artifact genes
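tf_idf_bare_compute itself is not shown in this diff, so the following is only an illustrative TF-IDF-style scheme for cluster markers, not the commit's formula: tf is the fraction of cluster cells expressing a gene, and idf down-weights genes expressed everywhere.

import numpy as np
import pandas as pd

def tf_idf_sketch(df, cluster):
    expr = df.drop(columns='cluster') > 0             # binarize expression
    tf = expr[df['cluster'] == cluster].mean(axis=0)  # fraction of cluster cells expressing each gene
    idf = np.log(len(df) / (expr.sum(axis=0) + 1))    # rarer genes score higher; +1 avoids division by zero
    return tf * idf                                   # pd.Series indexed by gene name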
@@ -240,7 +239,7 @@ def SCCAF_score(adata, key):
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
# define X and Y and exclude cells whose cluster only have 1 cell
X = adata.raw.X.toarray()
X = adata.X
Y = adata.obs[key].values

# label encoding Y to numerical values
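A hedged sketch of the SCCAF-style self-projection this function builds on: fit a logistic regression on half the cells, then read the confusion matrix to see which clusters the classifier can re-identify. Random stand-in data replaces adata.X:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

X = np.random.rand(200, 30)                    # stand-in for adata.X
Y = np.random.choice(['c1', 'c2', 'c3'], 200)  # stand-in cluster labels

X_tr, X_te, y_tr, y_te = train_test_split(X, Y, test_size=0.5, stratify=Y)
clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
print(confusion_matrix(y_te, clf.predict(X_te)))  # a strong diagonal means separable clusters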
program/prune.py (43 changes: 30 additions & 13 deletions)
@@ -11,6 +11,7 @@
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import rankdata
import multiprocessing as mp

import scanpy as sc
import anndata as ad
@@ -71,45 +72,61 @@ def inclusiveness(obs,r,c):
return fraction_r,fraction_c,result,nearly



def reference_pruning(adata,reference,size_dict):
obs = adata.obs
obs['ori'] = np.arange(obs.shape[0]) # keep original index order in one column
pruned_chunks = [] # store pruned chunks; one chunk means one reference cluster
for chunk in obs.groupby(by=reference):
def run_reference_pruning(chunk,reference,size_dict,obs):
with open('./scTriangulate_present/log_prune_step_{}.txt'.format(chunk[0]),'a') as log:
log.write('reference cluster: {}\n'.format(chunk[0]))
log.flush()
subset = chunk[1]
vc = subset['engraft'].value_counts()
overlap_clusters = vc.index
mapping = {}
for cluster in overlap_clusters:
print('--- query cluster: {}'.format(cluster),file=log,flush=True)
r = {reference:chunk[0]}
c = {cluster.split('_')[0]:cluster.split('_')[1]}
c = {cluster.split('$')[0]:cluster.split('$')[1]}
fraction_r,fraction_c,result,nearly = inclusiveness(obs,r,c)
if not result: # not inclusive, fall back to the reference annotation
mapping[cluster] = reference + '_' + chunk[0]
mapping[cluster] = reference + '$' + chunk[0]
else:
proportion_to_ref = vc.loc[cluster] / vc.sum()
proportion_to_self = vc.loc[cluster] / size_dict[cluster.split('_')[0]][cluster.split('_')[1]]
proportion_to_self = vc.loc[cluster] / size_dict[cluster.split('$')[0]][cluster.split('$')[1]]
if proportion_to_ref < 0.1 and not nearly: # covers < 10% of the reference cluster and is not nearly included
mapping[cluster] = reference + '_' + chunk[0]
mapping[cluster] = reference + '$' + chunk[0]
elif nearly and proportion_to_ref < 0.1 and proportion_to_self < 0.1: # nearly included, so it evades the first check, but the to_self proportion is low
mapping[cluster] = reference + '_' + chunk[0]
mapping[cluster] = reference + '$' + chunk[0]
else:
mapping[cluster] = cluster

subset['reassign'] = subset['engraft'].map(mapping).values

print('finished first part',file=log,flush=True)

# reassign to the most abundant type if an engraft cluster has only 1 cell
vc2 = subset['reassign'].value_counts()
most_abundant_cluster = vc2.loc[vc2==vc2.max()].index[0] # if multiple, just pick the first one
exclude_clusters = vc2.loc[vc2==1].index
for i in range(subset.shape[0]):
if subset.iloc[i]['reassign'] in exclude_clusters:
subset.loc[:,'reassign'].iloc[i] = most_abundant_cluster # caution: this can trigger pandas' SettingWithCopyWarning
pruned_chunks.append(subset)

print('finished second part',file=log,flush=True)
print(subset.shape,file=log,flush=True)
return subset
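The switch from '_' to '$' as the annotation/cluster separator is presumably because annotation names can themselves contain underscores, which makes split('_') ambiguous (an inference; the commit message does not say). A hypothetical label illustrates the problem:

name = 'leiden_r1_3'              # hypothetical: annotation 'leiden_r1', cluster '3'
print(name.split('_'))            # ['leiden', 'r1', '3']  -- ambiguous
print('leiden_r1$3'.split('$'))   # ['leiden_r1', '3']     -- unambiguous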


def reference_pruning(obs,reference,size_dict):
obs['ori'] = np.arange(obs.shape[0]) # keep original index order in one column
pruned_chunks = [] # store pruned chunks; one chunk means one reference cluster
chunk_list = list(obs.groupby(by=reference))
cores = len(chunk_list)
pool = mp.Pool(processes=cores)
r = [pool.apply_async(run_reference_pruning,args=(chunk,reference,size_dict,obs)) for chunk in chunk_list]
pool.close()
pool.join()
pruned_chunks = [collect.get() for collect in r]
modified_obs = pd.concat(pruned_chunks)
modified_obs.sort_values(by='ori',inplace=True)
adata.obs = modified_obs
return modified_obs
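The refactor fans one worker per reference cluster out over a multiprocessing pool via apply_async. A self-contained sketch of the same pattern, with a trivial stand-in for the pruning logic:

import multiprocessing as mp
import pandas as pd

def worker(chunk):
    name, subset = chunk            # groupby yields (group_name, sub-DataFrame) tuples
    subset = subset.copy()
    subset['processed'] = True      # stand-in for the per-cluster pruning
    return subset

if __name__ == '__main__':          # guard is required on spawn-based platforms
    df = pd.DataFrame({'ref': ['a', 'a', 'b'], 'ori': [0, 1, 2]})
    chunks = list(df.groupby('ref'))
    pool = mp.Pool(processes=len(chunks))  # one process per reference cluster
    r = [pool.apply_async(worker, args=(c,)) for c in chunks]
    pool.close()
    pool.join()
    print(pd.concat([res.get() for res in r]).sort_values(by='ori'))

One process per cluster mirrors the code above but can oversubscribe cores when there are many clusters; capping the pool size at mp.cpu_count() is a common alternative.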


