-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathtutorial.Rmd
202 lines (163 loc) · 7.14 KB
/
tutorial.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
---
title: "scMM: Mixture-of-experts multimodal deep generative model for single-cell multiomics analysis"
author: "Kodai Minoura"
date: "1/25/2021"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## Analyzing scMM output
This vignette provides tutorial for exploring scMM output. You can find how to run scMM using Colab notebook at **colab_tutorial.ipynb**.
First, set directory and install required packages:
```{r}
source("R/fun.R")
library(data.table)
library(ggplot2)
library(Matrix)
library(umap)
library(Rphenograph)
library(pheatmap)
set.seed(123)
data_path <- "data/BMNC" #Path to data
base_path <- "experiments/rna_protein/"
runPath <- "2021-01-25T05_21_13.505149xsav_onr"
runPath <- gsub("/",":",runPath)
runPath <- paste0(base_path,runPath)
runPath #Path to scMM outputs
```
##Clustering on multimodal latent variables
scMM outputs unimodal and multimodal latent variables in csv format:
```{r}
#Unimodal latent variables for training data.
train_r <- as.matrix(fread(file=paste0(runPath,"/lat_train_rna.csv"),
sep=",",header=TRUE,drop="V1"))
train_p <- as.matrix(fread(file=paste0(runPath,"/lat_train_protein.csv"),
sep=",",header=TRUE,drop="V1"))
#Unimodal latent varibles for test data.
test_r <- as.matrix(fread(file=paste0(runPath,"/lat_test_rna.csv"),
sep=",",header=TRUE,drop="V1"))
test_p <- as.matrix(fread(file=paste0(runPath,"/lat_test_protein.csv"),
sep=",",header=TRUE,drop="V1"))
#Multimodal latent variables for training data.
train_m <- as.matrix(fread(file=paste0(runPath,"/lat_train_mean.csv"),
sep=",",header=TRUE,drop="V1"))
#Multimodal latent variables for test data.
test_m <- as.matrix(fread(file=paste0(runPath,"/lat_test_mean.csv"),
sep=",",header=TRUE,drop="V1"))
```
We perform clustering on multimodal latent variables by using Rphenograph package.:
```{r}
#Clustring by Rphenograph
phenog <- Rphenograph(rbind(train_m, test_m), k=20)
cluster <- phenog[[2]]$membership
#UMAP visualization of multimodal latent variables
umap <- umap(rbind(train_m, test_m), method="naive")
#Plot UMAP enbedding
color <- gg_color_hue(length(unique(cluster)))
gg_embcluster <- PlotEmbCluster(umap$layout,cluster)
#Visualize latent value for the dimension of interest.
gg_selectdim <- PlotEmbSelectDim(umap$layout,rbind(train_m,test_m), dim=1)
#Plot
gridExtra::grid.arrange(gg_embcluster, gg_selectdim, nrow=1)
```
Visualization of latent values show cluster 22 and 23 is enriched with high values for dimension 1. Genes and proteins associated with latent dimension of interest by calculating Spearman correlation between traversal of latent dimension and multimodal features.
```{r}
#Traverse latent dimensions
#RNA
gene <- as.matrix(fread(file=paste0(data_path, "/RNA-seq/gene.tsv"),
header=FALSE,sep="\t"))[,1]
traverse <- as.matrix(fread(file=paste0(runPath,"/traverse/traverse_dim",1,".csv"),
sep=",",header=TRUE,drop="V1"))
cor_up_g <- cor_down_g <- gene_up <- gene_down <- list()
latent_dim <- 10 #Number of latent dimensions
for(i in 1:latent_dim){
traverse <- as.matrix(fread(file=paste0(runPath,"/traverse/traverse_dim",i,".csv"),
sep=",",header=TRUE,drop="V1"))
traverse_r <- as.matrix(fread(file=paste0(runPath,"/traverse/rna_traverse_dim",i,".csv"),
sep=",",header=TRUE,drop="V1"))
cor <- apply(traverse_r,2,function(x) CorTest(traverse[,1], x, alpha=1e-12)) #Threshold set by alpha
#UP
order <- order(cor,decreasing=TRUE)
tmp_cor <- sort(cor,decreasing=TRUE)
cor_up_g[[i]] <- tmp_cor
tmp_gene <- gene[order]
gene_up[[i]] <- tmp_gene
#DOWN
order <- order(cor,decreasing=FALSE)
tmp_cor <- sort(cor,decreasing=FALSE)
cor_down_g[[i]] <- tmp_cor
tmp_gene <- gene[order]
gene_down[[i]] <- tmp_gene
}
#Protein
protein <- as.matrix(fread(file=paste0(data_path, "/CITE-seq/protein.tsv"),
header=FALSE,sep="\t"))[,1]
cor_up_p <- cor_down_p <- protein_up <- protein_down <- list()
for(i in 1:latent_dim){
traverse <- as.matrix(fread(file=paste0(runPath,"/traverse/traverse_dim",i,".csv"),
sep=",",header=TRUE,drop="V1"))
traverse_p <- as.matrix(fread(file=paste0(runPath,"/traverse/protein_traverse_dim",i,".csv"),
sep=",",header=TRUE,drop="V1"))
cor <- apply(traverse_p,2,function(x) CorTest(traverse[,1], x, alpha=1e-2))
#UP
order <- order(cor,decreasing=TRUE)
tmp_cor <- sort(cor,decreasing=TRUE)
cor_up_p[[i]] <- tmp_cor
tmp_protein <- protein[order]
protein_up[[i]] <- tmp_protein
#Down
order <- order(cor,decreasing=FALSE)
tmp_cor <- sort(cor,decreasing=FALSE)
cor_down_p[[i]] <- tmp_cor
tmp_protein <- protein[order]
protein_down[[i]] <- tmp_protein
}
```
```{r}
gg_assocgene <- AssociatedGene(dimension=1)
gg_assocprotein <- AssociatedProtein(dimension=1)
#Plot genes/proteins associated with latent dimension 1.
gridExtra::grid.arrange(gg_assocgene, gg_assocprotein, nrow=1)
```
##Cross-modal generation
scMM predicts surface protein measurements by sampling from estimated negative binomial distributions.
```{r}
data <- Matrix::readMM(paste0(data_path, "/CITE-seq/Protein_count.mtx"))
t_id <- as.vector(as.matrix(fread(file=paste0(runPath,"/t_id.csv"),
sep=",",header=TRUE,drop="V1"))) + 1 #train data index
s_id <- as.vector(as.matrix(fread(file=paste0(runPath,"/s_id.csv"),
sep=",",header=TRUE,drop="V1"))) + 1 #test data index
protein <- as.matrix(fread(file=paste0(data_path, "/CITE-seq/protein.tsv"),
header=FALSE,sep="\t"))[,1]
barcode <- as.matrix(fread(file=paste0(data_path, "/CITE-seq/barcode.tsv"),
header=FALSE,sep="\t"))[,1]
rownames(data) <- protein
colnames(data) <- barcode
data_s <- data[,s_id]
#Get cluster assignment for test data.
cluster_s <- cluster[(length(t_id)+1):length(cluster)]
```
```{r}
#Obtain pseudobulk data by aggregating per cluster.
agg <- CreateAgg(t(data_s), cluster_s)
rownames(agg) <- as.character(c(1:max(cluster)))
colnames(agg) <- protein
#Obtain pseudobulk data from cross-modal generation
recon <- Matrix::readMM(paste0(runPath,"/pred_test_r_p.mtx"))
recon_agg <- CreateAgg(recon, cluster_s)
rownames(recon_agg) <- as.character(c(1:max(cluster)))
colnames(recon_agg) <- protein
#Apply hclust on protein
scale <- apply(agg,2,scale)
protein_dist <- dist(t(scale))
h_protein <- hclust(protein_dist)
breaks <- seq(-3,3,by=0.01)
color <- colorRampPalette(rev(brewer.pal(n=7 , name="RdYlBu")))(length(breaks))
p1 <- pheatmap(t(agg), breaks=breaks, color=color, scale="row",
legend_breaks=c(-3,0,3), legend_labels=c("-3","0","3"),
cluster_rows=h_protein, cluster_cols=FALSE, main="Original")
p2 <- pheatmap(t(recon_agg), breaks=breaks, color=color, scale="row",
legend_breaks=c(-3,0,3), legend_labels=c("-3","0","3"),
cluster_rows=h_protein, cluster_cols=FALSE, main="Cross-modal generation")
```