-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathapp.R
406 lines (301 loc) · 17.3 KB
/
app.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
# Goal: Fetch info from Norwegian submissions in PRIDE
# Plot them in ggplot
# Create Shiny-applications
# June 2023, Illimar Rekand ([email protected], [email protected])
# ELIXIR Bergen, Department of Informatics, University of Bergen
library(jsonlite)
library(dplyr)
library(ggplot2)
library(shiny)
library(tidyverse)
library(ggwordcloud)
################################################################################################################
################################################ Functions #####################################################
################################################################################################################
unpack.df <- function(df, df.col){ # This function unpacks the lists inside a given column, and creates a single new row for each unpacked element, maintaining the id-field
unpackt <- (df %>%
rowwise() |>
mutate(
field =
list(as.character(!!sym(df.col)))) |>
unnest_longer(!!sym(df.col)))
unpackt.w.count <- unpackt %>% group_by(!!sym(df.col)) %>% add_count() #adds counts for the relevant column #!!sym() converts character to variable
return(unpackt.w.count)
}
################################################################################################################
################################################ Extracting data ###############################################
################################################################################################################
#size limits the search to the first n hits. Otherwise, only 15 first are included
# URL for the fields available in PRIDE: https://www.ebi.ac.uk/ebisearch/metadata.ebi?db=pride
query = "https://www.ebi.ac.uk/ebisearch/ws/rest/pride?query=submitter_country:Norway&size=1000&fields=submission_date,publication_date,labhead_affiliation,submitter,submitter_country,submitter_affiliation,labhead_mail,labhead,submitter_keywords,search_count,view_count,download_count,citation_count,technology_type,instrument_platform,tissue,species,species_suggester,omics_type,disease,disease_suggester,modification&format=json"
#SimplifyDataFrame = T enables data retrieved into a dataframe format
# Sometimes the command below crashes. giving a CacheKey error. If this happens, restart R session (ctrl+shift+F10)
datasets.raw <- fromJSON(query, flatten = TRUE, simplifyDataFrame = TRUE)
datasets.df <- datasets.raw$entries
datasets.df
################################################################################################################
################################################ Cleanup #######################################################
################################################################################################################
#we extract only the years from the dates below, to make plotting later on easier.
#convert to character first, because conversion directly to date does not work....
datasets.df$fields.submission_date <- as.character(datasets.df$fields.submission_date)
datasets.df$fields.submission_date <- format(as.Date(datasets.df$fields.submission_date, format = "%Y%m%d"), "%Y")
datasets.df$fields.publication_date <- as.character(datasets.df$fields.publication_date)
datasets.df$fields.publication_date <- format(as.Date(datasets.df$fields.publication_date, format = "%Y%m%d"), "%Y")
datasets.df$fields.affiliation <- "other" # All institutiions which do not below to the lines below will be binned under the same label
# Institution names are written in free-form, so the below text mining is necessary to group the affiliations properly
# Below we are mining both the affiliation and the e-mail fields
datasets.df$fields.affiliation <- ifelse(grepl("Science and Technology", as.character(datasets.df$fields.labhead_affiliation), ignore.case = T), "NTNU", datasets.df$fields.affiliation)
datasets.df$fields.affiliation <- ifelse(grepl("Bergen", datasets.df$fields.labhead_affiliation, ignore.case = T)|grepl("PROBE", datasets.df$fields.labhead_affiliation)|grepl("UiB", datasets.df$fields.labhead_affiliation), "UiB", datasets.df$fields.affiliation)
datasets.df$fields.affiliation <- ifelse(grepl("uib", datasets.df$fields.labhead_mail, ignore.case = T), "UiB", datasets.df$fields.affiliation)
datasets.df$fields.affiliation <- ifelse(grepl("Oslo", datasets.df$fields.labhead_affiliation), "UiO", datasets.df$fields.affiliation)
datasets.df$fields.affiliation <- ifelse(grepl("uio", datasets.df$fields.labhead_mail), "UiO", datasets.df$fields.affiliation)
datasets.df$fields.affiliation <- ifelse(grepl("Tuula", datasets.df$fields.labhead_mail, ignore.case = T), "UiO", datasets.df$fields.affiliation)
datasets.df$fields.affiliation <- ifelse(grepl("UiT", datasets.df$fields.labhead_affiliation, ignore.case = T)|grepl("Troms", datasets.df$fields.labhead_affiliation, ignore.case = T), "UiT", datasets.df$fields.affiliation)
datasets.df$fields.affiliation <- ifelse(grepl("uit", datasets.df$fields.labhead_mail), "UiT", datasets.df$fields.affiliation)
datasets.df$fields.affiliation <- ifelse(grepl("Life Sciences", datasets.df$fields.labhead_affiliation, ignore.case = T)|grepl("NMBU", datasets.df$fields.labhead_affiliation, ignore.case = T), "NMBU", datasets.df$fields.affiliation)
datasets.df$fields.affiliation <- ifelse(grepl("Nord university", datasets.df$fields.labhead_affiliation, ignore.case = T), "NORD", datasets.df$fields.affiliation)
datasets.df$fields.affiliation <- ifelse(grepl("UiS", datasets.df$fields.labhead_affiliation, ignore.case = T)|grepl("Stavanger", datasets.df$fields.labhead_affiliation, ignore.case = T), "UiS", datasets.df$fields.affiliation)
################################################################################################################
################################################ Wordcloud #####################################################
################################################################################################################
unpack.df.keywords <- unpack.df(datasets.df, "fields.submitter_keywords") #unpacks all the nested keywords
unpack.df.keywords.unique <- unpack.df.keywords %>% distinct(fields.submitter_keywords) #removes redudancy
wordcloud <- ggplot(unpack.df.keywords.unique, aes(label = fields.submitter_keywords)) +
geom_text_wordcloud() +
theme_minimal()
wordcloud
################################################################################################################
################################################ Lollipop-plots ################################################
################################################################################################################
ui <- fluidPage(
# App title ----
titlePanel("Choose input below"),
# Sidebar layout with input and output definitions ----
sidebarLayout(
# Sidebar panel for inputs ----
sidebarPanel(
# Input: Selector for variable to plot against count ----
# The below variables will be output as character strings
selectInput("variable", "Variable:",
c("View count" = "fields.view_count" #currently only one input selectable, other countables have missing data
)),
downloadButton('downloadPlot')
),
# Main panel for displaying outputs ----
mainPanel(
# Output: Formatted text for caption ----
h3(textOutput("caption")),
# Output: Plot of the requested variable against count ----
plotOutput("ebi.plot")
)
)
)
server <- function(input, output) { #shiny passes selectInput as a string. To use these variables for subsetting dataframes, use e.g. df$!!sym(input$variable) or df[[input$variable]]
formulaText <- reactive({
paste("Entries to the PRIDE database from Norwegian institutions")
})
# Return the formula text for printing as a caption ----
output$caption <- renderText({
formulaText()
})
# Render slider input for certain plots
# Generate a plot of the requested variable against count ----
bar_plot.reactive <- reactive({
df.unpackt <- unpack.df(datasets.df, input$variable) #unpacks the lists
df.unpackt[[input$variable]] <- as.numeric(df.unpackt[[input$variable]]) #convert from char to numeric
plot.df <- df.unpackt
ggplot(plot.df, #sort bars after count, lowest to highest
aes(reorder(x = id, !!sym(input$variable)), y = !!sym(input$variable)))+ #input$variable is a string, !!sym() converts them into symbols
geom_point() + #fill-component in aes needs to be declared here, because it is not compatible with aes_string ##this is probably not necessary after all with the impl of !!sym(), but we will keep it to make it easier to read
geom_segment( aes(x=id, xend=id, y=0, yend=!!sym(input$variable))) +
ylab(paste(str_to_title( #Capitalize first words
sub("_", " ", #replace underscores with spaces
substring(input$variable, 8, nchar(input$variable)))))) +
xlab("PRIDE entry ID") +
theme_classic() + #remove gridline
theme(axis.text.x = element_text(angle = -45)) +
scale_fill_brewer(palette = "Paired")
#theme(legend.position = "none") # No legend
})
output$ebi.plot <- renderPlot(
{ #The fields below are sorted chronologically, not after count5
bar_plot.reactive()
}
)
output$downloadPlot <- downloadHandler(
filename <- function()
{paste0("PRIDE-plot-",input$variable,".png")},
content <- function(file){
png(file=file)
plot(bar_plot.reactive())
dev.off()
}
)
}
shinyApp(ui, server)
################################################################################################################
################################################ Barplots ######################################################
################################################################################################################
#below, most variables will be sorted after frequency. The exceptions (dates) are added to a list
exceptions <- c("fields.publication_date", "fields.submission_date") #these bar_plots will be ordered in chronological order, not after count
## Barplots
# Define UI for EBI-db-app app ----
ui <- fluidPage(
# App title ----
titlePanel("Choose input below"),
# Sidebar layout with input and output definitions ----
sidebarLayout(
# Sidebar panel for inputs ----
sidebarPanel(
# Input: Selector for variable to plot against count ----
# The below variables will be output as character strings
selectInput("variable", "Variable:",
c("Submission date" = "fields.submission_date",
"Publication date" = "fields.publication_date",
"Affiliation" = "fields.affiliation",
"Species" = "fields.species",
"Disease" = "fields.disease",
"Tissue" = "fields.tissue",
"Instrument" = "fields.instrument_platform",
"Modifications" = "fields.modification",
"Keywords" = "fields.submitter_keywords",
"Omics" = "fields.omics_type"
)),
uiOutput("numeric"), #This renders the slider input for some plots,
downloadButton('downloadPlot')
),
# Main panel for displaying outputs ----
mainPanel(
# Output: Formatted text for caption ----
h3(textOutput("caption")),
# Output: Plot of the requested variable against count ----
plotOutput("ebi.plot")
)
)
)
# Define server logic to plot various variables against count ----
server <- function(input, output) { #shiny passes selectInput as a string. To use these variables for subsetting dataframes, use e.g. df$!!sym(input$variable) or df[[input$variable]]
# Compute the formula text ----
# This is in a reactive expression since it is shared by the
# output$caption and output$ebi.plot functions
formulaText <- reactive({
paste("Entries to the PRIDE database from Norwegian institutions")
})
# Return the formula text for printing as a caption ----
output$caption <- renderText({
formulaText()
})
# Render slider input for certain plots
output$numeric <- renderUI({
if(!(input$variable %in% exceptions)) { #if the input is NOT among the exceptions, render the min+max input options
#find.count(datasets.df)
df.unpackt <- unpack.df(datasets.df, input$variable)
max.count <- max(df.unpackt$n)
list( # for the inputs below to be succesfully rendered inside an if-statement, we need to wrap them inside a list
numericInput(inputId = "min","Exclude entries with counts >=", 0),
numericInput(inputId = "max","Exclude entries with counts <=", max.count)
)
}
}
)
# Generate a plot of the requested variable against count ----
bar_plot.reactive <- reactive({
if(input$variable %in% exceptions
){order_bars <- FALSE
}
else{
order_bars <- TRUE
min_value <- input$min
max_value <- input$max
}
if(!(input$variable %in% exceptions)){
df.unpackt <- unpack.df(datasets.df, input$variable)
plot.df <- df.unpackt %>% group_by(!!sym(input$variable)) %>% filter(n() >= min_value) %>% filter(n() <= max_value)
}
else{
plot.df <- datasets.df
}
ggplot(plot.df,
aes( fill = fields.affiliation,
if(order_bars == TRUE) {x = fct_rev(fct_infreq(!!sym(input$variable)))} #sort bars after count, lowest to highest
else {x = !!sym(input$variable)} #sort chronologically
))+ #input$variable is a string, !!sym() converts them into symbols
geom_bar(position = "stack") + #fill-component in aes needs to be declared here, because it is not compatible with aes_string ##this is probably not necessary after all with the impl of !!sym(), but we will keep it to make it easier to read
xlab(paste(str_to_title( #Capitalize first words
sub("_", " ", #replace underscores with spaces
substring(input$variable, 8, nchar(input$variable)))))) +
theme_classic() + #remove gridline
theme(axis.text.x = element_text(angle = -45)) +
scale_fill_brewer(palette = "Paired")
#theme(legend.position = "none") # No legend
})
output$ebi.plot <- renderPlot(
{ #The fields below are sorted chronologically, not after count5
bar_plot.reactive()
}
)
output$downloadPlot <- downloadHandler(
filename <- function()
{paste0("PRIDE-plot-",input$variable,".png")},
content <- function(file){
png(file=file)
plot(bar_plot.reactive())
dev.off()
}
)
}
shinyApp(ui, server)
################################################################################################################
################################################ Heatmaps ######################################################
################################################################################################################
## heatmaps
field.list <- c("Submission date" = "fields.submission_date",
"Publication date" = "fields.publication_date",
"Affiliation" = "fields.affiliation",
"Species" = "fields.species",
"Disease" = "fields.disease",
"Tissue" = "fields.tissue",
"Instrument" = "fields.instrument_platform",
"Modifications" = "fields.modification",
"Keywords" = "fields.submitter_keywords",
"Omics" = "fields.omics_type"
)
ui <- fluidPage(
# App title ----
titlePanel("Number of datasets per year"),
# Sidebar layout with input and output definitions ----
sidebarLayout(
# Sidebar panel for inputs ----
sidebarPanel(
# Input: Selector for variable to plot against count ----
# The below variables will be output as character strings
selectInput("xvar", "X Variable:",
field.list),
selectInput("yvar", "Y Variable:",
field.list),
),
# Main panel for displaying outputs ----
mainPanel(
# Output: Formatted text for caption ----
h3(textOutput("caption")),
# Output: Plot of the requested variable against count ----
plotOutput("ebi.plot"),
)
)
)
# Create Shiny app ----
server <- function(input, output) {
# Generate a plot of the requested variable against count ----
output$ebi.plot <- renderPlot({
datasets.df
heatmap <- ggplot(datasets.df,
aes(y = as.character(!!sym(input$yvar)), x = as.character(!!sym(input$xvar)), fill = as.factor(after_stat(count)))
) +
geom_bin2d() +
theme_classic()
heatmap
})
}
# Create Shiny app ----
shinyApp(ui, server)