Skip to content

Commit

Permalink
SDePER v1.5.0 release
Browse files Browse the repository at this point in the history
  • Loading branch information
az7jh2 committed Jul 12, 2024
1 parent d4bbbbd commit 505d74f
Show file tree
Hide file tree
Showing 9 changed files with 136 additions and 89 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Incorporate a time delay in Docker Automated Builds to prevent the installation of outdated releases
FROM python:3.9.12-slim-bullseye
RUN sleep 600
RUN sleep 1200
RUN mkdir /data && chmod -R 755 /data && pip install --no-cache-dir sdeper
12 changes: 12 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,18 @@
Changelog
=========

Version 1.5.0 (2024-07-12)
--------------------------

**Updates**:

* The optimization of cell type proportion :math:`\theta` is skipped if the initial value of :math:`\theta` indicates the presence of only one cell type in the spot.

* When predicting cell type proportions utilizing the CVAE latent space, the values of the CVAE latent space are now directly used instead of PCA embeddings for proportion transferring.

* The default number of hidden layers has been changed from 2 to 1.


Version 1.4.0 (2024-06-26)
--------------------------

Expand Down
6 changes: 5 additions & 1 deletion docs/cli_options.rst
Original file line number Diff line number Diff line change
Expand Up @@ -431,10 +431,14 @@ CVAE-related options

:Type: integer

:Default: ``2``
:Default: ``1``

.. versionadded:: 1.2.0

.. versionchanged:: 1.5.0

Default value changed from 2 to 1.


.. option:: --use_batch_norm

Expand Down
32 changes: 26 additions & 6 deletions src/cvae.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def celltype2props(celltype_anno, celltype_order):



def transferProps(query, ref, ref_props, n_neighbors=10, sigma=0.3, use_embedding='PCA', pca_dimension=None):
def transferProps(query, ref, ref_props, n_neighbors=10, sigma=1, use_embedding='PCA', pca_dimension=None):
'''
transfer cell-type proportions by select K Nearest Neighbors in ref and take Gaussian weighted average of ref proportions
Expand All @@ -83,7 +83,7 @@ def transferProps(query, ref, ref_props, n_neighbors=10, sigma=0.3, use_embeddin
n_neighbors : int, optional
Number of neighbors to use. The default is 10.
sigma : float, optional
Standard deviation for the Gaussian weighting function. The default is 0.3.
Standard deviation for the Gaussian weighting function. The default is 1.
use_embedding : str, optional
which embedding to use, either PCA, UMAP or none. The default is PCA.
pca_dimension : int, optinal
Expand All @@ -96,6 +96,7 @@ def transferProps(query, ref, ref_props, n_neighbors=10, sigma=0.3, use_embeddin
'''

assert query.shape[1] == ref.shape[1]
n_celltype = ref_props.shape[1]

if (query.shape[1]<=2) and (use_embedding!='none'):
print(f'WARNING: original latent space dimension {query.shape[1]} <= 2, no need to use {use_embedding} embedding!')
Expand Down Expand Up @@ -128,6 +129,8 @@ def transferProps(query, ref, ref_props, n_neighbors=10, sigma=0.3, use_embeddin
else:
raise Exception(f'unknow embedding {use_embedding}')

print(f'embedding dimension: {query_pc.shape[1]}')

# perform KNN on query data on reduced dimension
nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto').fit(ref_pc)
# find nearest neighbors
Expand All @@ -148,6 +151,23 @@ def transferProps(query, ref, ref_props, n_neighbors=10, sigma=0.3, use_embeddin
# normalize the proportions to sum to 1
query_props[i] = avg_props / np.sum(avg_props)

# NOTE if all proportions are 0 due to very small weights, the initial guess will be all NaN
if pd.isnull(query_props[i]).any():
# replace it as a vector with all elements identical
query_props[i, :] = np.full((n_celltype,), 1.0/n_celltype)
continue

# post-process theta to set theta<0.01 as 0 then re-normalize remaining theta to sum to 1
tmp_ind = query_props[i, :] < 0.01

if tmp_ind.all():
# all elements < threashold, just leave it unchanged
continue

if tmp_ind.any():
query_props[i, tmp_ind] = 0
query_props[i, :] = query_props[i, :] / np.sum(query_props[i, :])

return query_props


Expand Down Expand Up @@ -896,10 +916,10 @@ def build_CVAE(spatial_df, scRNA_df, scRNA_celltype, n_marker_per_cmp, n_pseudo_
cvae, new_decoder = CVAE_keras_model(p, p_cond, latent_dim, hidden_dim[::-1], hidden_dim, use_batch_norm=use_batch_norm, cvae_init_lr=cvae_init_lr)

# learning rate decay
lrate = ReduceLROnPlateau(monitor='val_loss', factor=0.9, patience=10, min_lr=5e-4, cooldown=5, verbose=False)
lrate = ReduceLROnPlateau(monitor='val_loss', factor=0.9, patience=20, min_lr=5e-4, cooldown=10, verbose=False)

# early stopping based on validation loss
early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=30, restore_best_weights=True, verbose=False)
early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=40, restore_best_weights=True, verbose=False)


# change tensorflow seed value, set the same seed value for sampling samples from latent space to decoder before training
Expand Down Expand Up @@ -1025,15 +1045,15 @@ def build_CVAE(spatial_df, scRNA_df, scRNA_celltype, n_marker_per_cmp, n_pseudo_
pseudo_spot_embed = encoder.predict([scRNA_min_max_scaler.transform(pseudo_spots_df), np.full((pseudo_spots_df.shape[0],1), 0)])[0]

if do_initial_guess:
use_embedding = 'PCA'
use_embedding = 'none'
if use_embedding == 'none':
print('HIGHLIGHT: got initial guess of cell type proportions based on original CVAE latent embedding')
else:
print(f'\nHIGHLIGHT: got initial guess of cell type proportions based on {use_embedding} embedding of CVAE latent space')
tmp_pred = transferProps(spatial_embed,
np.vstack((scRNA_embed, pseudo_spot_embed)),
pd.concat([scrna_cell_celltype_prop, pseudo_spots_celltype_prop]).values,
n_neighbors=10, sigma=0.3, use_embedding=use_embedding)
n_neighbors=10, sigma=1, use_embedding=use_embedding)
cvae_pred = pd.DataFrame(tmp_pred, index=spatial_df.index, columns=celltype_order)

if diagnosis:
Expand Down
156 changes: 78 additions & 78 deletions src/diagnosis_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,65 +378,7 @@ def diagnosisCVAE(cvae, encoder, decoder, spatial_embed, spatial_transformed_df,
plot_df.loc[plot_df['datatype']=='scrna-cell', ['UMAP1', 'UMAP2']].to_csv(os.path.join(diagnosis_path, 'CVAE_latent_space', 'UMAP_coordinates_latent_mu_embedding_scRNA-seq_cells.csv.gz'), compression='gzip')


# plot PCA of latent space of spatial spots and scRNA-seq cells plus pseudo spots
# the order will affect the point overlay, first row draw first
all_pca = PCA(n_components=2).fit_transform(np.concatenate((pseudo_spot_embed, pseudo_spatial_embed, scRNA_embed, spatial_embed), axis=0))

# add cell/spot count in the annotation
plot_df = pd.DataFrame({'PC1': all_pca[:, 0],
'PC2': all_pca[:, 1],
'celltype': ['pseudo']*pseudo_spot_embed.shape[0] +
['pseudo']*pseudo_spatial_embed.shape[0] +
[f'{x} ({celltype_count_dict[x]})' for x in scRNA_celltype.celltype.to_list()] +
[f'spatial ({spatial_embed.shape[0]})']*spatial_embed.shape[0],
'dataset': ['scRNA-seq']*pseudo_spot_embed.shape[0] +
['spatial']*pseudo_spatial_embed.shape[0] +
['scRNA-seq']*scRNA_embed.shape[0] +
['spatial']*spatial_embed.shape[0],
'datatype': ['scrna-pseudo']*pseudo_spot_embed.shape[0] +
['spatial-pseudo']*pseudo_spatial_embed.shape[0] +
['scrna-cell']*scRNA_embed.shape[0] +
['spatial-spot']*spatial_embed.shape[0],
},
index = [f'scrna_pseudo{x}' for x in range(pseudo_spot_embed.shape[0])] +
[f'spatial_pseudo{x}' for x in range(pseudo_spatial_embed.shape[0])] +
scRNA_decode_df.index.to_list() +
spatial_transformed_df.index.to_list())

# plot colors already defined
sns.set_style("darkgrid")

# relplot return a FacetGrid object
# specify figure size by Height (in inches) of each facet, and Aspect ratio of each facet
fgrid = sns.relplot(data=plot_df, x='PC1', y='PC2', hue='celltype', s=20, marker='o', palette=plot_colors, kind='scatter', col='dataset', col_order=['scRNA-seq', 'spatial'], height=4.8*2, aspect=6.4/4.8)
fgrid.set(xlabel='PC 1', ylabel='PC 2')
# Put the legend out of the figure
#ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

plt.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_latent_mu_embedding_color_by_celltype.png'))
plt.close()


# save PCA coordinates of latent space
plot_df.loc[plot_df['datatype']=='spatial-spot', ['PC1', 'PC2']].to_csv(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_coordinates_latent_mu_embedding_spatial_spots.csv.gz'), compression='gzip')
# save scRNA-seq cells PCA coordinates
plot_df.loc[plot_df['datatype']=='scrna-cell', ['PC1', 'PC2']].to_csv(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_coordinates_latent_mu_embedding_scRNA-seq_cells.csv.gz'), compression='gzip')


# Plot PCA density
# for safe to avoid program exit when density estimation failed
try:
plt.figure(figsize=(6.4*2, 4.8*2))
sns.set_style('whitegrid')
ax = sns.kdeplot(data=plot_df, x='PC1', y='PC2', fill=True)
ax.set(xlabel='PC 1', ylabel='PC 2')
plt.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_latent_mu_embedding_color_by_density.png'))
plt.close()
except:
pass


# UPDATE: change to draw on PCA instead of UMAP
# NOTE: based on UMAP
# plot distribution of number of cells in pseudo-spots
# add number of cells in pseudo-spot to dataframe
plot_df['n_cell_in_spot'] = np.nan
Expand All @@ -451,31 +393,31 @@ def diagnosisCVAE(cvae, encoder, decoder, spatial_embed, spatial_transformed_df,
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, sharex=True, figsize=(6.4*4, 4.8*2))
# left panel: scatter plot of pseudo-spots + scRNA-seq cells
tmp_df = plot_df.loc[plot_df['dataset']=='scRNA-seq']
sc = ax1.scatter(tmp_df['PC1'], tmp_df['PC2'], c=tmp_df['n_cell_in_spot'], cmap='cubehelix', s=10, marker='o')
sc = ax1.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], c=tmp_df['n_cell_in_spot'], cmap='cubehelix', s=10, marker='o')
ax1.set_title('dataset = scRNA-seq')
ax1.set_xlabel('PC 1')
ax1.set_ylabel('PC 2')
ax1.set_xlabel('UMAP 1')
ax1.set_ylabel('UMAP 2')

# right panel: scatter plot of spatial spots + spatial pseudo-spots
# first draw spatial pseudo-spots, which lay at the bottom
tmp_df = plot_df.loc[plot_df['datatype']=='spatial-pseudo']
ax2.scatter(tmp_df['PC1'], tmp_df['PC2'], color=plot_colors['pseudo'], s=10, marker='o')
ax2.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], color=plot_colors['pseudo'], s=10, marker='o')
tmp_df = plot_df.loc[plot_df['datatype']=='spatial-spot']
ax2.scatter(tmp_df['PC1'], tmp_df['PC2'], color=plot_colors[f'spatial ({spatial_embed.shape[0]})'], s=10, marker='o')
ax2.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], color=plot_colors[f'spatial ({spatial_embed.shape[0]})'], s=10, marker='o')
ax2.set_title('dataset = spatial')
ax2.set_xlabel('PC 1')
ax2.set_ylabel('PC 2')
ax2.set_xlabel('UMAP 1')
ax2.set_ylabel('UMAP 2')

# add colorbar with title to the most right (https://stackoverflow.com/questions/13784201/how-to-have-one-colorbar-for-all-subplots, conflict with tight_layout)
cbar = fig.colorbar(sc, ax=ax2)
cbar.ax.set_title('#cell in spot')

fig.tight_layout()
fig.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_latent_mu_embedding_color_by_num_cell_in_spot.png'))
fig.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'UMAP_latent_mu_embedding_color_by_num_cell_in_spot.png'))
plt.close()


# UPDATE: change to draw on PCA instead of UMAP
# NOTE: based on UMAP
# plot distribution of cell-type proportions of each cell-type
# create a proportion dataframe with the same row order
prop_df = pd.concat([pseudo_spots_celltype_prop,
Expand All @@ -487,7 +429,7 @@ def diagnosisCVAE(cvae, encoder, decoder, spatial_embed, spatial_transformed_df,
assert (prop_df.index == plot_df.index).all()

# save multiple figures into one PDF
with PdfPages(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_latent_mu_embedding_color_by_celltype_proportion.pdf')) as pdf:
with PdfPages(os.path.join(diagnosis_path, 'CVAE_latent_space', 'UMAP_latent_mu_embedding_color_by_celltype_proportion.pdf')) as pdf:
for this_celltype in celltype_order:

# start to plot
Expand All @@ -498,16 +440,16 @@ def diagnosisCVAE(cvae, encoder, decoder, spatial_embed, spatial_transformed_df,
# don't forget the .values
tmp_df = tmp_df.assign(proportion = prop_df.loc[plot_df['dataset']=='scRNA-seq', this_celltype].values)

sc = ax1.scatter(tmp_df['PC1'], tmp_df['PC2'], c=tmp_df['proportion'], cmap='cubehelix', s=10, marker='o', norm=Normalize(vmin=0, vmax=1))
sc = ax1.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], c=tmp_df['proportion'], cmap='cubehelix', s=10, marker='o', norm=Normalize(vmin=0, vmax=1))

ax1.set_title('dataset = scRNA-seq')
ax1.set_xlabel('PC 1')
ax1.set_ylabel('PC 2')
ax1.set_xlabel('UMAP 1')
ax1.set_ylabel('UMAP 2')

# right panel: scatter plot of spatial spots + spatial pseudo-spots
# we also interpolate the grid and draw a contour plot of cell type proportions in right panel
grid_x, grid_y = np.mgrid[tmp_df['PC1'].min():tmp_df['PC1'].max():0.025, tmp_df['PC2'].min():tmp_df['PC2'].max():0.025]
grid_z = griddata(tmp_df.loc[:, ['PC1', 'PC2']].values, tmp_df['proportion'].values, (grid_x, grid_y), method='linear', fill_value=np.nan)
grid_x, grid_y = np.mgrid[tmp_df['UMAP1'].min():tmp_df['UMAP1'].max():0.025, tmp_df['UMAP2'].min():tmp_df['UMAP2'].max():0.025]
grid_z = griddata(tmp_df.loc[:, ['UMAP1', 'UMAP2']].values, tmp_df['proportion'].values, (grid_x, grid_y), method='linear', fill_value=np.nan)

try:
ax2.contourf(grid_x, grid_y, grid_z, cmap='cubehelix', norm=Normalize(vmin=0, vmax=1), alpha=0.3)
Expand All @@ -516,13 +458,13 @@ def diagnosisCVAE(cvae, encoder, decoder, spatial_embed, spatial_transformed_df,

# first draw spatial pseudo-spots, which lay at the bottom
tmp_df = plot_df.loc[plot_df['datatype']=='spatial-pseudo']
ax2.scatter(tmp_df['PC1'], tmp_df['PC2'], color=plot_colors['pseudo'], s=10, marker='o')
ax2.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], color=plot_colors['pseudo'], s=10, marker='o')
tmp_df = plot_df.loc[plot_df['datatype']=='spatial-spot']
ax2.scatter(tmp_df['PC1'], tmp_df['PC2'], color=plot_colors[f'spatial ({spatial_embed.shape[0]})'], s=10, marker='o')
ax2.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], color=plot_colors[f'spatial ({spatial_embed.shape[0]})'], s=10, marker='o')

ax2.set_title('dataset = spatial')
ax2.set_xlabel('PC 1')
ax2.set_ylabel('PC 2')
ax2.set_xlabel('UMAP 1')
ax2.set_ylabel('UMAP 2')

# add colorbar with title
cbar = fig.colorbar(sc, ax=ax2)
Expand All @@ -536,6 +478,64 @@ def diagnosisCVAE(cvae, encoder, decoder, spatial_embed, spatial_transformed_df,
plt.close(fig) # Close the figure to free memory


# plot PCA of latent space of spatial spots and scRNA-seq cells plus pseudo spots
# the order will affect the point overlay, first row draw first
all_pca = PCA(n_components=2).fit_transform(np.concatenate((pseudo_spot_embed, pseudo_spatial_embed, scRNA_embed, spatial_embed), axis=0))

# add cell/spot count in the annotation
plot_df = pd.DataFrame({'PC1': all_pca[:, 0],
'PC2': all_pca[:, 1],
'celltype': ['pseudo']*pseudo_spot_embed.shape[0] +
['pseudo']*pseudo_spatial_embed.shape[0] +
[f'{x} ({celltype_count_dict[x]})' for x in scRNA_celltype.celltype.to_list()] +
[f'spatial ({spatial_embed.shape[0]})']*spatial_embed.shape[0],
'dataset': ['scRNA-seq']*pseudo_spot_embed.shape[0] +
['spatial']*pseudo_spatial_embed.shape[0] +
['scRNA-seq']*scRNA_embed.shape[0] +
['spatial']*spatial_embed.shape[0],
'datatype': ['scrna-pseudo']*pseudo_spot_embed.shape[0] +
['spatial-pseudo']*pseudo_spatial_embed.shape[0] +
['scrna-cell']*scRNA_embed.shape[0] +
['spatial-spot']*spatial_embed.shape[0],
},
index = [f'scrna_pseudo{x}' for x in range(pseudo_spot_embed.shape[0])] +
[f'spatial_pseudo{x}' for x in range(pseudo_spatial_embed.shape[0])] +
scRNA_decode_df.index.to_list() +
spatial_transformed_df.index.to_list())

# plot colors already defined
sns.set_style("darkgrid")

# relplot return a FacetGrid object
# specify figure size by Height (in inches) of each facet, and Aspect ratio of each facet
fgrid = sns.relplot(data=plot_df, x='PC1', y='PC2', hue='celltype', s=20, marker='o', palette=plot_colors, kind='scatter', col='dataset', col_order=['scRNA-seq', 'spatial'], height=4.8*2, aspect=6.4/4.8)
fgrid.set(xlabel='PC 1', ylabel='PC 2')
# Put the legend out of the figure
#ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

plt.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_latent_mu_embedding_color_by_celltype.png'))
plt.close()


# save PCA coordinates of latent space
plot_df.loc[plot_df['datatype']=='spatial-spot', ['PC1', 'PC2']].to_csv(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_coordinates_latent_mu_embedding_spatial_spots.csv.gz'), compression='gzip')
# save scRNA-seq cells PCA coordinates
plot_df.loc[plot_df['datatype']=='scrna-cell', ['PC1', 'PC2']].to_csv(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_coordinates_latent_mu_embedding_scRNA-seq_cells.csv.gz'), compression='gzip')


# Plot PCA density
# for safe to avoid program exit when density estimation failed
try:
plt.figure(figsize=(6.4*2, 4.8*2))
sns.set_style('whitegrid')
ax = sns.kdeplot(data=plot_df, x='PC1', y='PC2', fill=True)
ax.set(xlabel='PC 1', ylabel='PC 2')
plt.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_latent_mu_embedding_color_by_density.png'))
plt.close()
except:
pass


# UMAP of decoded spatial and scRNA-seq cell gene expression
# now we can add marker gene expression profile here
# the order will affect the point overlay, first row draw first
Expand Down
Loading

0 comments on commit 505d74f

Please sign in to comment.