-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathall_commands_99_analysis_09.R
249 lines (204 loc) · 8.67 KB
/
all_commands_99_analysis_09.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
###This script contains commands run for the whole-genome analysis paper for whitefish De-Kayne et al.
#for all bash analyses please look at all_commands_99_analysis.txt
#then the following analyses
# 9. KEGG overlap
# - 9.1 overlap of genes in outlier windows between all four lakes
# - 9.2 overlap of fst outlier windows with genes and the KEGG pathways of those genes
# - 9.3 table output for paper
#set directory and load background file
setwd("/Users/rishidek/Dropbox/RishiMAC/99_reanalysis/")
background <- read.csv("background_2021_99.csv", header = T)
library(tidyverse)
################# 9. 1 gene overlap from outlier windows ###############################
#load genes that fall within outliers in each lake fst comparison
brienz_outlier_genes <- read.csv("KEGG/Brienz_extractedannotation_unique_gene_names.txt", header = FALSE)
neuchatel_outlier_genes <- read.csv("KEGG/Neuchatel_extractedannotation_unique_gene_names.txt", header = FALSE)
walen_outlier_genes <- read.csv("KEGG/Walen_extractedannotation_unique_gene_names.txt", header = FALSE)
luzern_outlier_genes <- read.csv("KEGG/Lucerne_extractedannotation_unique_gene_names.txt", header = FALSE)
brienz_outlier_genes$V1 <- as.character(brienz_outlier_genes$V1)
brienz_outlier_genes$lakeB <- "YES"
neuchatel_outlier_genes$V1 <- as.character(neuchatel_outlier_genes$V1)
neuchatel_outlier_genes$lakeN <- "YES"
walen_outlier_genes$V1 <- as.character(walen_outlier_genes$V1)
walen_outlier_genes$lakeW <- "YES"
luzern_outlier_genes$V1 <- as.character(luzern_outlier_genes$V1)
luzern_outlier_genes$lakeL <- "YES"
full_gene_df <- as.data.frame(full_join(brienz_outlier_genes, neuchatel_outlier_genes, by = "V1"))
full_gene_df <- as.data.frame(full_join(full_gene_df, walen_outlier_genes, by = "V1"))
full_gene_df <- as.data.frame(full_join(full_gene_df, luzern_outlier_genes, by = "V1"))
full_gene_df$count_empty <- rowSums(apply(is.na(full_gene_df), 2, as.numeric))
full_gene_df_shared <- subset(full_gene_df, full_gene_df$count_empty < 1)
################# 9. 2 KEGG overlap between comparisons ###############################
#and load gene-kegg file
gene_to_KEGG <- read.csv("KEGG/all.maker.proteins.user_ko.txt", sep = "\t", header = FALSE)
#remove genes with no kegg annotation
gene_to_KEGG$empty <- "yes"
for (i in 1:nrow(gene_to_KEGG)){
if(nchar(as.character(gene_to_KEGG$V2[i]))>0){
gene_to_KEGG$empty[i] <- "no"
}
}
#and make new df with only kegg annotated genes
gene_to_KEGG_clean <- subset(gene_to_KEGG, gene_to_KEGG$empty == "no")
gene_to_KEGG_clean$V1 <- as.character(gene_to_KEGG_clean$V1)
gene_to_KEGG_clean$V2 <- as.character(gene_to_KEGG_clean$V2)
gene_to_KEGG_clean$V1 <- gsub("-mRNA-1", "", gene_to_KEGG_clean$V1)
############First match the gene list we have with the gene lists for outliers in each lake###########
#make new empty vectors
brienz_vec <- c()
neuchatel_vec <- c()
walen_vec <- c()
luzern_vec <- c()
#and run loop so that if gene is in outlier window from that lake then it adds the kegg id of that gene into the column
for(i in 1:nrow(gene_to_KEGG_clean)){
KEGG_ID_to_check <- as.character(gene_to_KEGG_clean$V1)[i]
ID <- as.character(gene_to_KEGG_clean$V2)[i]
a <- which(as.character(KEGG_ID_to_check) == as.vector(brienz_outlier_genes$V1))
b <- which(as.character(KEGG_ID_to_check) == as.vector(neuchatel_outlier_genes$V1))
c <- which(as.character(KEGG_ID_to_check) == as.vector(walen_outlier_genes$V1))
d <- which(as.character(KEGG_ID_to_check) == as.vector(luzern_outlier_genes$V1))
if(length(a)>0){
brienz_vec[i] <- ID
}
if(length(a)<1){
brienz_vec[i] <- "no"
}
if(length(b)>0){
neuchatel_vec[i] <- ID
}
if(length(b)<1){
neuchatel_vec[i] <- "no"
}
if(length(c)>0){
walen_vec[i] <- ID
}
if(length(c)<1){
walen_vec[i] <- "no"
}
if(length(d)>0){
luzern_vec[i] <- ID
}
if(length(d)<1){
luzern_vec[i] <- "no"
}
}
gene_to_KEGG_clean$brienz <- brienz_vec
gene_to_KEGG_clean$neuchatel <- neuchatel_vec
gene_to_KEGG_clean$walen <- walen_vec
gene_to_KEGG_clean$luzern <- luzern_vec
#now sum number of missing values across rows
gene_to_KEGG_clean$count <- rowSums(gene_to_KEGG_clean[-1] == "no")
total_genes_present_at_least_once_across_4_lakesDF <- subset(gene_to_KEGG_clean, gene_to_KEGG_clean$count <5)
total_genes_present_at_least_once_across_4_lakes <- nrow(total_genes_present_at_least_once_across_4_lakesDF)
total_genes_present_all_4_lakesDF <- subset(gene_to_KEGG_clean, gene_to_KEGG_clean$count <2)
total_genes_present_all_4_lakes <- nrow(total_genes_present_all_4_lakesDF)
#so now we have four vectors with the KEGG IDs associated with outliers in each lake (some may occur more than once)
###########now use the list of KEGG IDs in each lake outlier to see if some are shared across all four###########
#now get unique kegg list from original df
unique_KEGGs <- unique(gene_to_KEGG_clean[c("V2")])
brienz_PRESENT <- c()
neuchatel_PRESENT <- c()
walen_PRESENT <- c()
luzern_PRESENT <- c()
for(i in 1:nrow(unique_KEGGs)){
K_ID <- as.character(unique_KEGGs$V2)[i]
aa <- which(as.character(K_ID) == brienz_vec)
bb <- which(as.character(K_ID) == neuchatel_vec)
cc <- which(as.character(K_ID) == walen_vec)
dd <- which(as.character(K_ID) == luzern_vec)
if(length(aa)>0){
brienz_PRESENT[i] <- "yes"
}
if(length(aa)<1){
brienz_PRESENT[i] <- "no"
}
if(length(bb)>0){
neuchatel_PRESENT[i] <- "yes"
}
if(length(bb)<1){
neuchatel_PRESENT[i] <- "no"
}
if(length(cc)>0){
walen_PRESENT[i] <- "yes"
}
if(length(cc)<1){
walen_PRESENT[i] <- "no"
}
if(length(dd)>0){
luzern_PRESENT[i] <- "yes"
}
if(length(dd)<1){
luzern_PRESENT[i] <- "no"
}
}
FULL_KEGG_ID <- as.data.frame(unique_KEGGs)
FULL_KEGG_ID$brienz_ID <- brienz_PRESENT
FULL_KEGG_ID$neuchatel_ID <- neuchatel_PRESENT
FULL_KEGG_ID$walen_ID <- walen_PRESENT
FULL_KEGG_ID$luzern_ID <- luzern_PRESENT
#and count yes across columns
FULL_KEGG_ID$count <- rowSums(FULL_KEGG_ID[-1] == "yes")
colnames(FULL_KEGG_ID) <- c("KEGG_ID","brienz_ID","neuchatel_ID","walen_ID","luzern_ID","count")
write.csv(FULL_KEGG_ID, "KEGG/FULL_KEGG_ID.csv", row.names=FALSE)
#calculate and write all kegg ids present at least once
morethan1 <- subset(FULL_KEGG_ID, FULL_KEGG_ID$count > 0)
write.csv(morethan1, "KEGG/morethan1.csv", row.names=FALSE)
KEGGID_SHARED_ALL4 <- subset(morethan1, morethan1$count > 3)
#write all lake-specific kegg id lists
brienz_KEGGs <- subset(morethan1, morethan1$brienz_ID == "yes")
neuchatel_KEGGs <- subset(morethan1, morethan1$neuchatel_ID == "yes")
walen_KEGGs <- subset(morethan1, morethan1$walen_ID == "yes")
luzern_KEGGs <- subset(morethan1, morethan1$luzern_ID == "yes")
write.csv(brienz_KEGGs, "KEGG/brienz_KEGGs.csv", row.names=FALSE)
write.csv(neuchatel_KEGGs, "KEGG/neuchatel_KEGGs.csv", row.names=FALSE)
write.csv(walen_KEGGs, "KEGG/walen_KEGGs.csv", row.names=FALSE)
write.csv(luzern_KEGGs, "KEGG/luzern_KEGGs.csv", row.names=FALSE)
#now the KEGG IDs are pasted into: https://www.kegg.jp/kegg/ko.html and the output copied back into an excel spreadsheet
#and read back in here:
B_KO <- read.csv("KEGG/brienz_KEGG_ID_OUTPUT.csv", header = FALSE, na.strings=c(""))
B_KO <- na.omit(B_KO)
colnames(B_KO) <- c("V1", "B_path", "B_count")
N_KO <- read.csv("KEGG/neuchatel_KEGG_ID_OUTPUT.csv", header = FALSE, na.strings=c(""))
N_KO <- N_KO[,1:3]
N_KO <- na.omit(N_KO)
colnames(N_KO) <- c("V1", "N_path", "N_count")
W_KO <- read.csv("KEGG/walen_KEGG_ID_OUTPUT.csv", header = FALSE, na.strings=c(""))
W_KO <- W_KO[,1:3]
W_KO <- na.omit(W_KO)
colnames(W_KO) <- c("V1", "W_path", "W_count")
L_KO <- read.csv("KEGG/luzern_KEGG_ID_OUTPUT.csv", header = FALSE, na.strings=c(""))
L_KO <- L_KO[,1:3]
L_KO <- na.omit(L_KO)
colnames(L_KO) <- c("V1", "L_path", "L_count")
nrow(B_KO)
nrow(N_KO)
nrow(W_KO)
nrow(L_KO)
#and join them all together to get a list of shared pathways
shared_KEGG_pathwaysDF <- full_join(B_KO, N_KO, by = "V1")
shared_KEGG_pathwaysDF <- full_join(shared_KEGG_pathwaysDF, W_KO, by = "V1")
shared_KEGG_pathwaysDF <- full_join(shared_KEGG_pathwaysDF, L_KO, by = "V1")
shared_KEGG_pathwaysDF$count_empty <- rowSums(apply(is.na(shared_KEGG_pathwaysDF), 2, as.numeric))
shared_KEGG_pathways <- subset(shared_KEGG_pathwaysDF, shared_KEGG_pathwaysDF$count_empty < 1)
################# 9. 3 table output for paper ###############################
#table output:
##Genes overlapping with FST outlier windows
#total:
nrow(full_gene_df)
#shared:
nrow(full_gene_df_shared)
##Genes with associated KEGG IDs
#total =
total_genes_present_at_least_once_across_4_lakes
#shared =
total_genes_present_all_4_lakes
##Unique KEGG IDs from these genes
#total =
nrow(morethan1)
#shared =
nrow(KEGGID_SHARED_ALL4)
#KEGG pathways from these KEGG IDs
#total =
nrow(shared_KEGG_pathwaysDF)
#shared =
nrow(shared_KEGG_pathways)