-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathidentities.R
145 lines (114 loc) · 5.49 KB
/
identities.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
## This script assigns identities to individual cells
## using two alternative approaches with similar results
## Load data and apply transformations if not loaded yet
data.ok <- exists('FGF.all')
if (data.ok == FALSE) {
source('Transformations.R')
}
rm(data.ok)
# ------------------------------------------------------------------------------
# Asign identities to embryos cultured and treated with FGF4/MEKi/FGFRi (+ Littermates)
## Assign identities using a thresholding method (threshold = 50)
## for GATA6 (Channel 4) and NANOG (Channel 5) levels
## Identities thus assigned will appear in Identity.lin column
## Only reliable for embryos where Treatment == 'Littermate'
FGF.all <- id.linear(FGF.all)
## Assign identity to ICM cells
## using a k-means clustering approach based on GATA6 and NANOG expression
## Using only embryos stained for CDX2, GATA6 and NANOG
## Scan experimental dates
FGF.all$Embryo_ID <- as.character(FGF.all$Embryo_ID)
edates <- sapply(FGF.all$Embryo_ID, function(x) {strsplit(x, "_")[[1]][1]})
## Keep embryos where Markers == C2G6NG (CDX2, GATA6, NANOG)
## and remove experiments with edate 082015 (see Figure S2 and legend)
jj <- FGF.all$Markers == "C2G6NG" & edates != "082015"
## Subset FGF.all to jj=TRUE
xx0 <- FGF.all[jj,]
## k-means of these embryos using Channel4 (GATA6), and Channel5 (NANOG) data
## for Littermate embryos only (fixed upon collection, Figure 1 & S1-2)
xx2 <- subset(xx0, Treatment == "Littermate" &
((Cellcount >= 32 & Cellcount < 64) |
(Cellcount >= 120 & Cellcount < 150)) &
(Identity != "TE"),
select = c(CH4.ebLogCor, CH5.ebLogCor, Cellcount))
## Perform k-means clustering with 3 means
set.seed(20160606)
oo <- kmeans(xx2[,1:2], 3)
## oo$centers should look like this:
# CH4.ebLogCor CH5.ebLogCor
#1 4.809746 5.693890
#2 5.075773 2.096386
#3 2.302035 5.415014
## Assign identity to all cells based on the centers obtained
centers <- rbind(oo$centers, c(2.302035, 2.096386))
ii <- xx0$Identity != "TE"
dkm <- matrix(0, sum(ii), 4)
for(i in 1:4) dkm[,i] <- (xx0$CH4.ebLogCor[ii] - centers[i,1])^2 +
(xx0$CH5.ebLogCor[ii] - centers[i,2])^2
idkm <- apply(dkm, 1, which.min)
xx0$Identity.km <- rep(NA, nrow(xx0))
xx0$Identity.km[!ii] <- "TE"
xx0$Identity.km[ii] <- c("DP", "PRE", "EPI", "DN")[idkm]
## Littermates (Figure 1) and embryos cultured for 48h (Figures 2, 5)
## have different scales because of different acquisition parameters
## Thus, assign identities to cultured embryos using the same k-means approach
## but where cluster centers are calculated based
## on the data distribution of GATA6 and NANOG levels of cells in Control embryos
xx2 <- subset(xx0, Treatment == "Control" &
(Identity != "TE"),
select = c(CH4.ebLogCor, CH5.ebLogCor))
set.seed(20160609)
oo <- kmeans(xx2[,1:2], 3)
#> oo$centers
# CH4.ebLogCor CH5.ebLogCor
#1 3.716422 3.455725
#2 4.719526 5.860470
#3 5.718721 3.355891
centers <- rbind(c(5.718721, 5.860470), oo$centers)
ii <- xx0$Identity != "TE" & xx0$Treatment != "Littermate" & xx0$Xpoint != "xp"
dkm <- matrix(0, sum(ii), 4)
for(i in 1:4) dkm[,i] <- (xx0$CH4.ebLogCor[ii] - centers[i,1])^2 +
(xx0$CH5.ebLogCor[ii] - centers[i,2])^2
idkm <- apply(dkm, 1, which.min)
xx0$Identity.km[ii] <- c("DP", "DN", "EPI", "PRE")[idkm]
## Integrate the CDX2, GATA6, NANOG subset (xx0) with rest of data
FGF.all <- rbind.fill(xx0, subset(FGF.all, Markers != 'C2G6NG' |
Exp_date == '20150820'))
# ------------------------------------------------------------------------------
# Asign identities to half and double embryos (scaling data) using same approach
## Control embryos belong to the FGF.all dataset
## however, for Double, Half and Single embryos
## we detected a shift in expression levels
## possibly due to these experiments being done months later
## (although acquired in the same system, with same parameters)
## Correct this shift by median centering the controls and the rest by imaging date
## Extract imaging dates (Img_date) from scaling data
uidates <- unique(scaling$Img_date)
## Calculate median of controls for channels 4 & 5 (GATA6 and NANOG) in ICM cells
ii <- scaling$TE_ICM == "ICM"
cmed4 <- quantile(scaling$CH4.ebLogCor[ii & scaling$Treatment == "Control"], 3/4)
cmed5 <- quantile(scaling$CH5.ebLogCor[ii & scaling$Treatment == "Control"], 3/4)
for(i in uidates) {
scaling$CH4.ebLogCor[scaling$Img_date == i] <-
scaling$CH4.ebLogCor[scaling$Img_date == i] +
cmed4 - quantile(scaling$CH4.ebLogCor[ii & scaling$Img_date == i],3/4)
scaling$CH5.ebLogCor[scaling$Img_date == i] <-
scaling$CH5.ebLogCor[scaling$Img_date == i] +
cmed5 - quantile(scaling$CH5.ebLogCor[ii & scaling$Img_date == i],3/4)
}
# Set two centers
set.seed(20160610)
oo <- kmeans(scaling[ii, 32:33], 2)
oo$centers
# CH4.ebLogCor CH5.ebLogCor
#1 5.764162 3.608910
#2 4.060173 5.884474
ii <- scaling$TE_ICM == "ICM"
centers <- rbind(c(5.764162, 5.884474), oo$centers[2:1,], c(4.060173,3.608910))
#centers <- matrix(c(6,6,3.5,6,6,3.5,3.5,3.5), ncol=2, byrow=TRUE)
dkm <- matrix(0, sum(ii), 4)
for(i in 1:4) dkm[,i] <- (scaling$CH4.ebLogCor[ii] - centers[i,1])^2 +
(scaling$CH5.ebLogCor[ii] - centers[i,2])^2
idkm <- apply(dkm, 1, which.min)
scaling$Identity.km <- scaling$Identity
scaling$Identity.km[ii] <- c("DP", "PRE", "EPI", "DN")[idkm]