SDePER v1.5.0 release

az7jh2 · Jul 12, 2024 · 505d74f · 505d74f
1 parent d4bbbbd
commit 505d74f
Show file tree

Hide file tree

Showing 9 changed files with 136 additions and 89 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
 # Incorporate a time delay in Docker Automated Builds to prevent the installation of outdated releases
 FROM python:3.9.12-slim-bullseye
-RUN sleep 600
+RUN sleep 1200
 RUN mkdir /data && chmod -R 755 /data && pip install --no-cache-dir sdeper
diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -1,6 +1,18 @@
 Changelog
 =========
 
+Version 1.5.0 (2024-07-12)
+--------------------------
+
+**Updates**:
+
+* The optimization of cell type proportion :math:`\theta` is skipped if the initial value of :math:`\theta` indicates the presence of only one cell type in the spot.
+
+* When predicting cell type proportions utilizing the CVAE latent space, the values of the CVAE latent space are now directly used instead of PCA embeddings for proportion transferring.
+
+* The default number of hidden layers has been changed from 2 to 1.
+
+
 Version 1.4.0 (2024-06-26)
 --------------------------
 

diff --git a/docs/cli_options.rst b/docs/cli_options.rst
@@ -431,10 +431,14 @@ CVAE-related options
 
    :Type: integer
 
-   :Default: ``2``
+   :Default: ``1``
 
    .. versionadded:: 1.2.0
 
+   .. versionchanged:: 1.5.0
+
+      Default value changed from 2 to 1.
+
 
 .. option:: --use_batch_norm
 

diff --git a/src/cvae.py b/src/cvae.py
@@ -68,7 +68,7 @@ def celltype2props(celltype_anno, celltype_order):
 
 
 
-def transferProps(query, ref, ref_props, n_neighbors=10, sigma=0.3, use_embedding='PCA', pca_dimension=None):
+def transferProps(query, ref, ref_props, n_neighbors=10, sigma=1, use_embedding='PCA', pca_dimension=None):
     '''
     transfer cell-type proportions by select K Nearest Neighbors in ref and take Gaussian weighted average of ref proportions
 
@@ -83,7 +83,7 @@ def transferProps(query, ref, ref_props, n_neighbors=10, sigma=0.3, use_embeddin
     n_neighbors : int, optional
         Number of neighbors to use. The default is 10.
     sigma : float, optional
-        Standard deviation for the Gaussian weighting function. The default is 0.3.
+        Standard deviation for the Gaussian weighting function. The default is 1.
     use_embedding : str, optional
         which embedding to use, either PCA, UMAP or none. The default is PCA.
     pca_dimension : int, optinal
@@ -96,6 +96,7 @@ def transferProps(query, ref, ref_props, n_neighbors=10, sigma=0.3, use_embeddin
     '''
 
     assert query.shape[1] == ref.shape[1]
+    n_celltype = ref_props.shape[1]
 
     if (query.shape[1]<=2) and (use_embedding!='none'):
         print(f'WARNING: original latent space dimension {query.shape[1]} <= 2, no need to use {use_embedding} embedding!')
@@ -128,6 +129,8 @@ def transferProps(query, ref, ref_props, n_neighbors=10, sigma=0.3, use_embeddin
     else:
         raise Exception(f'unknow embedding {use_embedding}')
 
+    print(f'embedding dimension: {query_pc.shape[1]}')
+
     # perform KNN on query data on reduced dimension
     nbrs = NearestNeighbors(n_neighbors=n_neighbors, algorithm='auto').fit(ref_pc)
     # find nearest neighbors
@@ -148,6 +151,23 @@ def transferProps(query, ref, ref_props, n_neighbors=10, sigma=0.3, use_embeddin
         # normalize the proportions to sum to 1
         query_props[i] = avg_props / np.sum(avg_props)
 
+        # NOTE if all proportions are 0 due to very small weights, the initial guess will be all NaN
+        if pd.isnull(query_props[i]).any():
+            # replace it as a vector with all elements identical
+            query_props[i, :] = np.full((n_celltype,), 1.0/n_celltype)
+            continue
+
+        # post-process theta to set theta<0.01 as 0 then re-normalize remaining theta to sum to 1
+        tmp_ind = query_props[i, :] < 0.01
+
+        if tmp_ind.all():
+            # all elements < threashold, just leave it unchanged
+            continue
+
+        if tmp_ind.any():
+            query_props[i, tmp_ind] = 0
+            query_props[i, :] = query_props[i, :] / np.sum(query_props[i, :])
+
     return query_props
 
 
@@ -896,10 +916,10 @@ def build_CVAE(spatial_df, scRNA_df, scRNA_celltype, n_marker_per_cmp, n_pseudo_
     cvae, new_decoder = CVAE_keras_model(p, p_cond, latent_dim, hidden_dim[::-1], hidden_dim, use_batch_norm=use_batch_norm, cvae_init_lr=cvae_init_lr)
 
     # learning rate decay
-    lrate = ReduceLROnPlateau(monitor='val_loss', factor=0.9, patience=10, min_lr=5e-4, cooldown=5, verbose=False)
+    lrate = ReduceLROnPlateau(monitor='val_loss', factor=0.9, patience=20, min_lr=5e-4, cooldown=10, verbose=False)
 
     # early stopping based on validation loss
-    early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=30, restore_best_weights=True, verbose=False)
+    early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=40, restore_best_weights=True, verbose=False)
 
 
     # change tensorflow seed value, set the same seed value for sampling samples from latent space to decoder before training
@@ -1025,15 +1045,15 @@ def build_CVAE(spatial_df, scRNA_df, scRNA_celltype, n_marker_per_cmp, n_pseudo_
         pseudo_spot_embed = encoder.predict([scRNA_min_max_scaler.transform(pseudo_spots_df), np.full((pseudo_spots_df.shape[0],1), 0)])[0]
 
     if do_initial_guess:
-        use_embedding = 'PCA'
+        use_embedding = 'none'
         if use_embedding == 'none':
             print('HIGHLIGHT: got initial guess of cell type proportions based on original CVAE latent embedding')
         else:
             print(f'\nHIGHLIGHT: got initial guess of cell type proportions based on {use_embedding} embedding of CVAE latent space')
         tmp_pred = transferProps(spatial_embed,
                                  np.vstack((scRNA_embed, pseudo_spot_embed)),
                                  pd.concat([scrna_cell_celltype_prop, pseudo_spots_celltype_prop]).values,
-                                 n_neighbors=10, sigma=0.3, use_embedding=use_embedding)
+                                 n_neighbors=10, sigma=1, use_embedding=use_embedding)
         cvae_pred = pd.DataFrame(tmp_pred, index=spatial_df.index, columns=celltype_order)
 
         if diagnosis:

diff --git a/src/diagnosis_plots.py b/src/diagnosis_plots.py
@@ -378,65 +378,7 @@ def diagnosisCVAE(cvae, encoder, decoder, spatial_embed, spatial_transformed_df,
     plot_df.loc[plot_df['datatype']=='scrna-cell', ['UMAP1', 'UMAP2']].to_csv(os.path.join(diagnosis_path, 'CVAE_latent_space', 'UMAP_coordinates_latent_mu_embedding_scRNA-seq_cells.csv.gz'), compression='gzip')
 
 
-    # plot PCA of latent space of spatial spots and scRNA-seq cells plus pseudo spots
-    # the order will affect the point overlay, first row draw first
-    all_pca = PCA(n_components=2).fit_transform(np.concatenate((pseudo_spot_embed, pseudo_spatial_embed, scRNA_embed, spatial_embed), axis=0))
-
-    # add cell/spot count in the annotation
-    plot_df = pd.DataFrame({'PC1': all_pca[:, 0],
-                            'PC2': all_pca[:, 1],
-                            'celltype': ['pseudo']*pseudo_spot_embed.shape[0] +
-                                        ['pseudo']*pseudo_spatial_embed.shape[0] + 
-                                        [f'{x} ({celltype_count_dict[x]})' for x in scRNA_celltype.celltype.to_list()] +
-                                        [f'spatial ({spatial_embed.shape[0]})']*spatial_embed.shape[0],
-                            'dataset': ['scRNA-seq']*pseudo_spot_embed.shape[0] +
-                                       ['spatial']*pseudo_spatial_embed.shape[0] +
-                                       ['scRNA-seq']*scRNA_embed.shape[0] +
-                                       ['spatial']*spatial_embed.shape[0],
-                            'datatype': ['scrna-pseudo']*pseudo_spot_embed.shape[0] +
-                                        ['spatial-pseudo']*pseudo_spatial_embed.shape[0] +
-                                        ['scrna-cell']*scRNA_embed.shape[0] +
-                                        ['spatial-spot']*spatial_embed.shape[0],
-                            },
-                           index = [f'scrna_pseudo{x}' for x in range(pseudo_spot_embed.shape[0])] +
-                                   [f'spatial_pseudo{x}' for x in range(pseudo_spatial_embed.shape[0])] +
-                                   scRNA_decode_df.index.to_list() +
-                                   spatial_transformed_df.index.to_list())
-
-    # plot colors already defined
-    sns.set_style("darkgrid")
-
-    # relplot return a FacetGrid object
-    # specify figure size by Height (in inches) of each facet, and Aspect ratio of each facet
-    fgrid = sns.relplot(data=plot_df, x='PC1', y='PC2', hue='celltype', s=20, marker='o', palette=plot_colors, kind='scatter', col='dataset', col_order=['scRNA-seq', 'spatial'], height=4.8*2, aspect=6.4/4.8)
-    fgrid.set(xlabel='PC 1', ylabel='PC 2')
-    # Put the legend out of the figure
-    #ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
-
-    plt.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_latent_mu_embedding_color_by_celltype.png'))
-    plt.close()
-
-
-    # save PCA coordinates of latent space
-    plot_df.loc[plot_df['datatype']=='spatial-spot', ['PC1', 'PC2']].to_csv(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_coordinates_latent_mu_embedding_spatial_spots.csv.gz'), compression='gzip')
-    # save scRNA-seq cells PCA coordinates
-    plot_df.loc[plot_df['datatype']=='scrna-cell', ['PC1', 'PC2']].to_csv(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_coordinates_latent_mu_embedding_scRNA-seq_cells.csv.gz'), compression='gzip')
-
-
-    # Plot PCA density
-    # for safe to avoid program exit when density estimation failed
-    try:
-        plt.figure(figsize=(6.4*2, 4.8*2))
-        sns.set_style('whitegrid')
-        ax = sns.kdeplot(data=plot_df, x='PC1', y='PC2', fill=True)
-        ax.set(xlabel='PC 1', ylabel='PC 2')
-        plt.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_latent_mu_embedding_color_by_density.png'))
-        plt.close()
-    except:
-        pass
-
-
-    # UPDATE: change to draw on PCA instead of UMAP
+    # NOTE: based on UMAP
     # plot distribution of number of cells in pseudo-spots
     # add number of cells in pseudo-spot to dataframe
     plot_df['n_cell_in_spot'] = np.nan
@@ -451,31 +393,31 @@ def diagnosisCVAE(cvae, encoder, decoder, spatial_embed, spatial_transformed_df,
     fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, sharex=True, figsize=(6.4*4, 4.8*2))
     # left panel: scatter plot of pseudo-spots + scRNA-seq cells
     tmp_df = plot_df.loc[plot_df['dataset']=='scRNA-seq']
-    sc = ax1.scatter(tmp_df['PC1'], tmp_df['PC2'], c=tmp_df['n_cell_in_spot'], cmap='cubehelix', s=10, marker='o')
+    sc = ax1.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], c=tmp_df['n_cell_in_spot'], cmap='cubehelix', s=10, marker='o')
     ax1.set_title('dataset = scRNA-seq')
-    ax1.set_xlabel('PC 1')
-    ax1.set_ylabel('PC 2')
+    ax1.set_xlabel('UMAP 1')
+    ax1.set_ylabel('UMAP 2')
 
     # right panel: scatter plot of spatial spots + spatial pseudo-spots
     # first draw spatial pseudo-spots, which lay at the bottom
     tmp_df = plot_df.loc[plot_df['datatype']=='spatial-pseudo']
-    ax2.scatter(tmp_df['PC1'], tmp_df['PC2'], color=plot_colors['pseudo'], s=10, marker='o')
+    ax2.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], color=plot_colors['pseudo'], s=10, marker='o')
     tmp_df = plot_df.loc[plot_df['datatype']=='spatial-spot']
-    ax2.scatter(tmp_df['PC1'], tmp_df['PC2'], color=plot_colors[f'spatial ({spatial_embed.shape[0]})'], s=10, marker='o')
+    ax2.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], color=plot_colors[f'spatial ({spatial_embed.shape[0]})'], s=10, marker='o')
     ax2.set_title('dataset = spatial')
-    ax2.set_xlabel('PC 1')
-    ax2.set_ylabel('PC 2')
+    ax2.set_xlabel('UMAP 1')
+    ax2.set_ylabel('UMAP 2')
 
     # add colorbar with title to the most right (https://stackoverflow.com/questions/13784201/how-to-have-one-colorbar-for-all-subplots, conflict with tight_layout)
     cbar = fig.colorbar(sc, ax=ax2)
     cbar.ax.set_title('#cell in spot')
 
     fig.tight_layout()
-    fig.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_latent_mu_embedding_color_by_num_cell_in_spot.png'))
+    fig.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'UMAP_latent_mu_embedding_color_by_num_cell_in_spot.png'))
     plt.close()
 
 
-    # UPDATE: change to draw on PCA instead of UMAP
+    # NOTE: based on UMAP
     # plot distribution of cell-type proportions of each cell-type
     # create a proportion dataframe with the same row order
     prop_df = pd.concat([pseudo_spots_celltype_prop,
@@ -487,7 +429,7 @@ def diagnosisCVAE(cvae, encoder, decoder, spatial_embed, spatial_transformed_df,
     assert (prop_df.index == plot_df.index).all()
 
     # save multiple figures into one PDF
-    with PdfPages(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_latent_mu_embedding_color_by_celltype_proportion.pdf')) as pdf:
+    with PdfPages(os.path.join(diagnosis_path, 'CVAE_latent_space', 'UMAP_latent_mu_embedding_color_by_celltype_proportion.pdf')) as pdf:
         for this_celltype in celltype_order:
 
             # start to plot
@@ -498,16 +440,16 @@ def diagnosisCVAE(cvae, encoder, decoder, spatial_embed, spatial_transformed_df,
             # don't forget the .values
             tmp_df = tmp_df.assign(proportion = prop_df.loc[plot_df['dataset']=='scRNA-seq', this_celltype].values)
 
-            sc = ax1.scatter(tmp_df['PC1'], tmp_df['PC2'], c=tmp_df['proportion'], cmap='cubehelix', s=10, marker='o', norm=Normalize(vmin=0, vmax=1))
+            sc = ax1.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], c=tmp_df['proportion'], cmap='cubehelix', s=10, marker='o', norm=Normalize(vmin=0, vmax=1))
 
             ax1.set_title('dataset = scRNA-seq')
-            ax1.set_xlabel('PC 1')
-            ax1.set_ylabel('PC 2')
+            ax1.set_xlabel('UMAP 1')
+            ax1.set_ylabel('UMAP 2')
 
             # right panel: scatter plot of spatial spots + spatial pseudo-spots
             # we also interpolate the grid and draw a contour plot of cell type proportions in right panel
-            grid_x, grid_y = np.mgrid[tmp_df['PC1'].min():tmp_df['PC1'].max():0.025, tmp_df['PC2'].min():tmp_df['PC2'].max():0.025]
-            grid_z = griddata(tmp_df.loc[:, ['PC1', 'PC2']].values, tmp_df['proportion'].values, (grid_x, grid_y), method='linear',  fill_value=np.nan)
+            grid_x, grid_y = np.mgrid[tmp_df['UMAP1'].min():tmp_df['UMAP1'].max():0.025, tmp_df['UMAP2'].min():tmp_df['UMAP2'].max():0.025]
+            grid_z = griddata(tmp_df.loc[:, ['UMAP1', 'UMAP2']].values, tmp_df['proportion'].values, (grid_x, grid_y), method='linear',  fill_value=np.nan)
 
             try:
                 ax2.contourf(grid_x, grid_y, grid_z, cmap='cubehelix', norm=Normalize(vmin=0, vmax=1), alpha=0.3)
@@ -516,13 +458,13 @@ def diagnosisCVAE(cvae, encoder, decoder, spatial_embed, spatial_transformed_df,
 
             # first draw spatial pseudo-spots, which lay at the bottom
             tmp_df = plot_df.loc[plot_df['datatype']=='spatial-pseudo']
-            ax2.scatter(tmp_df['PC1'], tmp_df['PC2'], color=plot_colors['pseudo'], s=10, marker='o')
+            ax2.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], color=plot_colors['pseudo'], s=10, marker='o')
             tmp_df = plot_df.loc[plot_df['datatype']=='spatial-spot']
-            ax2.scatter(tmp_df['PC1'], tmp_df['PC2'], color=plot_colors[f'spatial ({spatial_embed.shape[0]})'], s=10, marker='o')
+            ax2.scatter(tmp_df['UMAP1'], tmp_df['UMAP2'], color=plot_colors[f'spatial ({spatial_embed.shape[0]})'], s=10, marker='o')
 
             ax2.set_title('dataset = spatial')
-            ax2.set_xlabel('PC 1')
-            ax2.set_ylabel('PC 2')
+            ax2.set_xlabel('UMAP 1')
+            ax2.set_ylabel('UMAP 2')
 
             # add colorbar with title
             cbar = fig.colorbar(sc, ax=ax2)
@@ -536,6 +478,64 @@ def diagnosisCVAE(cvae, encoder, decoder, spatial_embed, spatial_transformed_df,
             plt.close(fig)  # Close the figure to free memory
 
 
+    # plot PCA of latent space of spatial spots and scRNA-seq cells plus pseudo spots
+    # the order will affect the point overlay, first row draw first
+    all_pca = PCA(n_components=2).fit_transform(np.concatenate((pseudo_spot_embed, pseudo_spatial_embed, scRNA_embed, spatial_embed), axis=0))
+
+    # add cell/spot count in the annotation
+    plot_df = pd.DataFrame({'PC1': all_pca[:, 0],
+                            'PC2': all_pca[:, 1],
+                            'celltype': ['pseudo']*pseudo_spot_embed.shape[0] +
+                                        ['pseudo']*pseudo_spatial_embed.shape[0] + 
+                                        [f'{x} ({celltype_count_dict[x]})' for x in scRNA_celltype.celltype.to_list()] +
+                                        [f'spatial ({spatial_embed.shape[0]})']*spatial_embed.shape[0],
+                            'dataset': ['scRNA-seq']*pseudo_spot_embed.shape[0] +
+                                       ['spatial']*pseudo_spatial_embed.shape[0] +
+                                       ['scRNA-seq']*scRNA_embed.shape[0] +
+                                       ['spatial']*spatial_embed.shape[0],
+                            'datatype': ['scrna-pseudo']*pseudo_spot_embed.shape[0] +
+                                        ['spatial-pseudo']*pseudo_spatial_embed.shape[0] +
+                                        ['scrna-cell']*scRNA_embed.shape[0] +
+                                        ['spatial-spot']*spatial_embed.shape[0],
+                            },
+                           index = [f'scrna_pseudo{x}' for x in range(pseudo_spot_embed.shape[0])] +
+                                   [f'spatial_pseudo{x}' for x in range(pseudo_spatial_embed.shape[0])] +
+                                   scRNA_decode_df.index.to_list() +
+                                   spatial_transformed_df.index.to_list())
+
+    # plot colors already defined
+    sns.set_style("darkgrid")
+
+    # relplot return a FacetGrid object
+    # specify figure size by Height (in inches) of each facet, and Aspect ratio of each facet
+    fgrid = sns.relplot(data=plot_df, x='PC1', y='PC2', hue='celltype', s=20, marker='o', palette=plot_colors, kind='scatter', col='dataset', col_order=['scRNA-seq', 'spatial'], height=4.8*2, aspect=6.4/4.8)
+    fgrid.set(xlabel='PC 1', ylabel='PC 2')
+    # Put the legend out of the figure
+    #ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
+
+    plt.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_latent_mu_embedding_color_by_celltype.png'))
+    plt.close()
+
+
+    # save PCA coordinates of latent space
+    plot_df.loc[plot_df['datatype']=='spatial-spot', ['PC1', 'PC2']].to_csv(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_coordinates_latent_mu_embedding_spatial_spots.csv.gz'), compression='gzip')
+    # save scRNA-seq cells PCA coordinates
+    plot_df.loc[plot_df['datatype']=='scrna-cell', ['PC1', 'PC2']].to_csv(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_coordinates_latent_mu_embedding_scRNA-seq_cells.csv.gz'), compression='gzip')
+
+
+    # Plot PCA density
+    # for safe to avoid program exit when density estimation failed
+    try:
+        plt.figure(figsize=(6.4*2, 4.8*2))
+        sns.set_style('whitegrid')
+        ax = sns.kdeplot(data=plot_df, x='PC1', y='PC2', fill=True)
+        ax.set(xlabel='PC 1', ylabel='PC 2')
+        plt.savefig(os.path.join(diagnosis_path, 'CVAE_latent_space', 'PCA_latent_mu_embedding_color_by_density.png'))
+        plt.close()
+    except:
+        pass
+
+
     # UMAP of decoded spatial and scRNA-seq cell gene expression
     # now we can add marker gene expression profile here
     # the order will affect the point overlay, first row draw first