-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSource Code.Rmd
620 lines (490 loc) · 20.4 KB
/
Source Code.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
---
title: "Orebro RNA-seq analysis"
author: "Rodrigo Arcoverde Cerveira & Gustav Joas"
date: '`r format(Sys.Date(), "%Y-%m-%d")`'
output: html_document
abstract: Örebro study RNA-seq data analysis.
knit: (function(inputFile, encoding) {
rmarkdown::render(inputFile,
encoding = encoding,
output_file = paste0(
xfun::sans_ext(inputFile), '_', Sys.Date(), '.html'),
output_dir = "../results/lab_book/")})
editor_options:
chunk_output_type: inline
---
```{r global.options, include=FALSE}
#setting up figures and chunks messages
knitr::opts_knit$set(echo = TRUE,
root.dir = getwd(),
fig.width = 6, fig.height = 5,
fig.align = "center",
out.width = 768,
fig.pos = "H",
warning = FALSE,
message = FALSE)
knitr::opts_chunk$set(warning = FALSE,
message = FALSE,
fig.width = 6, fig.height = 5,
fig.align = "center",
out.width = 768,
fig.pos = "H")
result.dir <- paste0("results/",Sys.Date(),"/")
figures.dir <- paste0("results/",Sys.Date(),"/", "figures/")
## creates result.dir with date in if not existent
ifelse(isFALSE(dir.exists(paste0("../",result.dir))), dir.create(paste0("../",result.dir),recursive = TRUE),"Result directory for today exists already!")
ifelse(isFALSE(dir.exists(paste0("../",figures.dir))), dir.create(paste0("../",figures.dir),recursive = TRUE),"Result directory for today exists already!")
options(stringsAsFactors = FALSE)
```
## Loading libraries
```{r, message=FALSE}
#if you do not have libraries, they are located in either CRAN or Bioconductor
library(data.table)
library(kableExtra)
library(forcats)
library(caret)
library(C50)
library(mgsub)
library(DESeq2)
library(pheatmap)
library(ggplot2)
library(ggrepel)
library(RColorBrewer)
library(limma)
library(edgeR)
library(enrichR)
library(gridExtra)
library(stringr)
library(ggVennDiagram)
library(dplyr)
library(ensembldb)
library(EnsDb.Hsapiens.v86)
library(factoextra)
library(rafalib)
library(plotly)
library(tibble)
library(devtools)
library(clusterProfiler)
library(enrichplot)
library(pathview)
```
## Download and load data
```{r}
# Reading data
counts_raw <- data.table::fread("../data/proces-data_core-facility/subreadCounts_hg38ens_minus_frag.txt", header = TRUE, sep = "\t") %>% as.data.frame()
sampleTable <- read.table("../data/metadata/sample_metadata.csv", sep = ",", header = TRUE) %>%
mutate(dose = as.factor(plyr::mapvalues(visit, from = c(1,2,4,5), to = c("0_dose1","24_dose1","0_dose2","24_dose2"))),
group = as.factor(group))
# Renaming genes based on emsembl annotation
edb <- EnsDb.Hsapiens.v86
edb_genes <- genes(edb) %>% as.data.frame()
counts_raw <- counts_raw %>%
dplyr::select(-c(Chr, Start, End, Strand, Length)) %>%
dplyr::mutate(Geneid = plyr::mapvalues(x = counts_raw$Geneid, from = edb_genes$gene_id, to = edb_genes$symbol, warn_missing = FALSE))
# Aggregate and sum up genes with same gene symbol, which were basically non-coding RNAs
counts_raw <- aggregate(counts_raw[,-1], list(Geneid=counts_raw[,1]), FUN = sum)
rownames(counts_raw) <- counts_raw$Geneid
counts_raw <- dplyr::select(counts_raw, -c(Geneid))
```
## Basic Quality Control
### Inspect raw data
Matching metadata to countTable and plotting the raw counts for first visual inspection.
```{r}
# Synchronize count data with sample table
counts_raw <- counts_raw[, pmatch(sampleTable$sample_ID, colnames(counts_raw))]
colnames(counts_raw) <- sampleTable$sample_ID
all(rownames(sampleTable$sample_ID) == colnames(counts_raw))
# Visualize distribution of raw counts w/ boxplot and density plot
{
pdf(paste0("../", figures.dir,"raw_counts_QC.pdf"), width = 10, height = 8, compress = TRUE )
rafalib::mypar(1,2,mar=c(10,3,3,2))
boxplot(log2(as.matrix(counts_raw)+1),ylab=expression('Log'[2]~'Read counts'),las=2,main="Raw data")
hist(log2(as.matrix(counts_raw)+1),ylab="",las=2,main="Raw data")
par(mfrow=c(1,1))
dev.off()
}
```
### Filter data
Plot detection of genes across samples. All samples are more or less close to average so we do not need to discard any samples.
```{r}
{
pdf(paste0("../", figures.dir,"number_detected_genes.pdf"), width = 10, height = 8, compress = TRUE )
par(mar=c(10,4,3,4))
barplot(colSums(counts_raw>3),ylab="Number of detected genes",las=2)
abline(h=median(colSums(counts_raw>3)))
dev.off()
}
```
Removing reads with the log2 of the counts per million (cpm) lower than 1.
```{r}
#Filter low counts
keep_genes <- rowSums( counts_raw > 5 ) >= 8
counts_filtered<- counts_raw[keep_genes,]
sum(keep_genes)
meanLog2CPM <- rowMeans(log2(cpm(counts_raw) + 1))
counts_filtered <- counts_raw[meanLog2CPM > 1, ]
```
Plot detection rate across genes for raw and filtered counts
```{r}
{
pdf(paste0("../", figures.dir,"detection_rate_raw_filtered.pdf"), width = 10, height = 8, compress = TRUE )
par(mar=c(10,4,3,4))
hist(rowSums(counts_raw>3))
hist(rowSums(counts_filtered>3))
par(mfrow=c(1,1))
dev.off()
}
```
Plot distribution of the filtered counts
```{r}
{
pdf(paste0("../", figures.dir,"filtered_counts_distribution.pdf"), width = 10, height = 8, compress = TRUE )
rafalib::mypar(1,2,mar=c(10,3,3,2))
boxplot(log2(as.matrix(counts_filtered)+1),ylab=expression('Log'[2]~'Read counts'),las=2,main="Filtered data")
hist(log2(as.matrix(counts_filtered)+1),ylab="",las=2,main="Filtered data")
par(mfrow=c(1,1))
dev.off()
}
```
### DESeq object creation, Normalization and data Quality Control
Generating the DESeq dataset.
```{r}
# prepare for DESeq
sampleTable$conditions <- (str_c(sampleTable$group, "_", sampleTable$dose ))
sampleTable$conditions <- factor(sampleTable$conditions)
sampleTable$sex <- factor(sampleTable$sex)
sampleTable$dose <- factor(sampleTable$dose)
sampleTable$group <- factor(sampleTable$group)
sampleTable$time <- sampleTable$conditions
sampleTable$time <- gsub('.{6}$', '', sampleTable$time)
sampleTable$time <- factor(sampleTable$time)
# create 3 DESeq object
dds1 <- DESeqDataSetFromMatrix(countData = counts_filtered,
colData = as.data.frame(sampleTable),
design = ~ sex + age + conditions)
dds2 <- DESeqDataSetFromMatrix(countData = counts_filtered,
colData = as.data.frame(sampleTable),
design = ~ sex + age + time)
dds3 <- DESeqDataSetFromMatrix(countData = counts_filtered,
colData = as.data.frame(sampleTable),
design = ~ sex + age + dose)
# Normalize with rlog
counts_rlog_normalized <- rlog(dds1, blind = TRUE)
# Normalize with variance stabilizing transformation for later PCA and heatmap
counts_vst_normalized <- vst(dds1, blind = TRUE)
```
Plot distribution of data after normalization
```{r}
vst_matrix <- assay(counts_vst_normalized)
rlog_matrix <- assay(counts_rlog_normalized)
hist(vst_matrix)
hist(rlog_matrix)
boxplot(vst_matrix ,ylab=expression('Log'[2]~'Read counts'),las=2,main="VST")
boxplot(rlog_matrix ,ylab=expression('Log'[2]~'Read counts'),las=2,main="rlog")
```
### Heatmap
```{r}
# Sample heatmap
sampleDist <- cor(vst_matrix, method = "spearman")
Metadata <- data.frame(sampleTable$group, sampleTable$dose)
names(Metadata) <- c("Group","Dose")
rownames(Metadata) <- sampleTable$sample_ID
# Plot heatmap
colors<-colorRampPalette(rev(brewer.pal(n=7,name="RdBu")))(255)
{
pdf(paste0("../", figures.dir,"heatmap.pdf"), width = 10, height = 8, compress = TRUE )
Heatmap <- pheatmap(sampleDist,
color = colors,
clustering_distance_rows = as.dist(1 - sampleDist),
clustering_distance_cols = as.dist(1 - sampleDist),
show_rownames = F,
show_colnames = F,
clustering_method = "ward.D2",
annotation_col = Metadata)
par(mfrow=c(1,1))
dev.off()
}
print(Heatmap)
```
### Principal Component Analyais (PCA)
Dimensionality reduction for evaluating outliers and global sample clusters for both quality control but also for primary exploratory analysis.
```{r}
#Sample PCA
pcaRes <- prcomp(t(assay(counts_vst_normalized)))
varExp <- round(summary(pcaRes)[[1]],digits = 2)
pcaDF <- data.frame(
PC1 = pcaRes$x[, 1],
PC2 = pcaRes$x[, 2],
Group = sampleTable$group,
Sample = sampleTable$sample_ID,
Dose = sampleTable$dose)
{
pdf(paste0("../", figures.dir,"PCA.pdf"), width = 10, height = 8, compress = TRUE )
pcaPlot <- ggplot( pcaDF, mapping = aes(x = PC1, y = PC2, color = Group, label = Sample)) +
geom_point(aes(shape = Dose), size = 3) +
labs(x = paste0("PC1 (", varExp[1], " %)"), y = paste0("PC2 (", varExp[2], " %)")) +
theme(axis.text = element_text(size = 12), legend.text = element_text(size = 10)) +
scale_color_manual(values = brewer.pal(3, "Accent")) +
cowplot::theme_cowplot()
par(mfrow=c(1,1))
dev.off()
}
print(pcaPlot)
## Scree plot to compute PC variance
pca.var <- pcaRes$sdev^2
pca.var.per <- round(pca.var/sum(pca.var)*100, 1)
{
pdf(paste0("../", figures.dir,"Scree_plot.pdf"), width = 10, height = 8, compress = TRUE )
barplot(pca.var.per, main="Scree Plot", xlab="Principal Component", ylab="Percent Variation")
par(mfrow=c(1,1))
dev.off()
}
## exploring which variables contribute the most for each PC
var <- get_pca_var(pcaRes)
var$contrib %>%
as.data.frame() %>%
arrange(desc(Dim.1)) %>%
head()
```
## Differential Gene Expression Analysis
```{r}
# Run the DESeq2 analysis
dds1 <- DESeq(dds1)
dds2 <- DESeq(dds2)
dds3 <- DESeq(dds3)
# contrast
resultsNames(dds1)
resultsNames(dds2)
resultsNames(dds3)
# Results dds1
# Baselines DEGs
res_baseline_pos_neg_dose1 <- as.data.frame(results(dds1, contrast = c("conditions", "Pos_0_dose1", "Neg_0_dose1")))
res_baseline_pos_neg_dose2 <- as.data.frame(results(dds1, contrast = c("conditions", "Pos_0_dose2", "Neg_0_dose2")))
res_baseline_pos_doses <- as.data.frame(results(dds1, contrast = c("conditions", "Pos_0_dose2", "Pos_0_dose1")))
res_baseline_neg_doses <- as.data.frame(results(dds1, contrast = c("conditions", "Neg_0_dose2", "Neg_0_dose1")))
#add vaseline for Pos vs neg regardless of dose (theres is a difference here), and then regardles of statu (not here)
# 0-24h Degs
res_neg_dose1 <- as.data.frame(results(dds1, contrast = c("conditions", "Neg_24_dose1", "Neg_0_dose1")))
res_pos_dose1 <- as.data.frame(results(dds1, contrast = c("conditions", "Pos_24_dose1", "Pos_0_dose1")))
res_neg_dose2 <- as.data.frame(results(dds1, contrast = c("conditions", "Neg_24_dose2", "Neg_0_dose2")))
res_pos_dose2 <- as.data.frame(results(dds1, contrast = c("conditions", "Pos_24_dose2", "Pos_0_dose2")))
# Results dds2
res_pos <- as.data.frame(results(dds2, contrast = c("time", "Pos_24", "Pos_0")))
res_neg <- as.data.frame(results(dds2, contrast = c("time", "Neg_24", "Neg_0")))
# Results dds3
res_dose1 <- as.data.frame(results(dds3, contrast = c("dose", "24_dose1", "0_dose1")))
res_dose2 <- as.data.frame(results(dds3, contrast = c("dose", "24_dose2", "0_dose2")))
```
### Volcano plots
Volcano plot for data exploratory analysis, filtering for significant values with False Discovery Rate \< 0.05 and log fold change greater than 1.
```{r}
results_list <- list(res_baseline_pos_neg_dose1, res_baseline_pos_neg_dose2, res_baseline_pos_doses, res_baseline_neg_doses, res_neg_dose1, res_pos_dose1, res_neg_dose2, res_pos_dose2, res_pos, res_neg, res_dose1, res_dose2)
names(results_list) <- c("Baseline Dose 1", "Baseline Dose 2", "Baseline Conv", "Baseline Naïve", "Naïve Dose 1", "Conv Dose 1", "Naïve Dose 2", "Conv Dose 2", "Conv","Naïve", "Dose 1", "Dose 2")
results_list_plot <- lapply(results_list, function(x) {
x <- x %>% mutate(test_padj = case_when(pvalue < 0.05 & log2FoldChange >= 1 ~ "Up regulated",
pvalue < 0.05 & log2FoldChange <= -1 ~ "Down regulated",
TRUE ~ "Not significant"),
test_pvalue = case_when(pvalue < 0.05 & log2FoldChange >= 1 ~ "Up regulated",
pvalue < 0.05 & log2FoldChange <= -1 ~ "Down regulated",
TRUE ~ "Not significant"),
)
})
# function to plot and save volcano plot
.volcano_plot <- function(x){
subsetted <- subset(results_list_plot[[x]] %>% tibble::rownames_to_column("gene_symbol"), abs(log2FoldChange) >= 2 & pvalue < 0.05)
# count up regulated DEGs
deg_number_up <- subset(results_list_plot[[x]] %>% tibble::rownames_to_column("gene_symbol"), log2FoldChange >= 1 & pvalue < 0.05) %>%
nrow()
# count down regulated DEGs
deg_number_down <- subset(results_list_plot[[x]] %>% tibble::rownames_to_column("gene_symbol"), log2FoldChange <= -1 & pvalue < 0.05) %>%
nrow()
results_list_plot[[x]] %>%
ggplot(aes(x=log2FoldChange,y=-log10(pvalue))) +
geom_point(aes(colour=test_pvalue), size=3, alpha=0.3) +
scale_color_manual(values = c("Down regulated"="blue", "Not significant" ="grey80", "Up regulated"="red")) +
xlab(expression("Fold Change (Log"[2]*")")) +
ylab(expression("-Log"[10]*"(p-value)")) +
geom_vline(xintercept = c(-1, 1), linetype = "dotted", size = 1) +
geom_hline(yintercept = -log10(0.05), linetype = "dotted", size = 1) +
geom_label_repel(data = subsetted, aes(log2FoldChange,-log10(pvalue),label= gene_symbol), max.overlaps = 10) +
xlim(-2,6) +
ylim(0,10) +
# ggtitle(x) +
annotate(
geom='text',
colour = "red",
size = 10,
x= 4, y=10,
hjust=0,
label=paste0(deg_number_up)
) +
annotate(
geom='text',
colour = "blue",
size = 10,
x= -2, y= 10,
hjust=0,
label=paste0(deg_number_down)
) +
cowplot::theme_cowplot(font_size = 20, line_size = 1.5, rel_small = 1.6, rel_tiny = 1,font_family = "Arial",rel_large = 4) +
theme(legend.position = 'none',
axis.title = element_text(size = 35))
}
volcano_plots <- lapply(names(results_list_plot), .volcano_plot)
names(volcano_plots) <- names(results_list_plot)
volcano_plots
lapply(names(volcano_plots), function(x) ggsave(filename = paste0("../", figures.dir, x,"_volcano_plot.png"), width = 7, height = 7,
plot = volcano_plots[[x]]))
```
```{r}
#MA plot
#Counts plot
# plotCounts(d,gene=rownames(res)[1],intgroup="group",normalized=F)
# plotCounts(d,gene=rownames(res)[1],intgroup="group",normalized=T)
```
### Enrichment analysis
Selecting two different databases for search for enrichment analysis. The databases selected were "GO_Biological_Process_2021" and "KEGG_2021_Human". Testing also the Blood Transcriptome modules presented in [Li et al. 2014](10.1038/ni.2789) using some GSEA package.
```{r}
#Enrichment analysis
listEnrichrDbs()
enrichment_list_up <- lapply(results_list_plot, function(x) {
enrichr(
genes = rownames(x[x$test_padj == "Up regulated", ]),
databases = c("GO_Biological_Process_2021", "KEGG_2021_Human","GO_Molecular_Function_2021"))})
enrichment_list_down <- lapply(results_list_plot, function(x) {
enrichr(
genes = rownames(x[x$test_padj == "Down regulated", ]),
databases = c("GO_Biological_Process_2021", "KEGG_2021_Human","GO_Molecular_Function_2021"))})
gmt <- read.gmt("../data/blood_transcription_module/BTM_for_GSEA_20131008.gmt")
```
#### Visualize enriched pathways
```{r echo=TRUE, fig.width=15, fig.height=8}
#Removing items without significant results from list
enrichment_list_up[c("neg_dose1")] <- NULL
enrichment_list_down[c("baseline", "neg_dose1", "pos_dose1", "neg", "dose1")] <- NULL
#Visualize significant terms
enrichment_list <- enrichment_list_up
plots_enrich <- list()
for(pathway in 1:length(enrichment_list_up[[1]])){
rbinded_enrich_list <- lapply(enrichment_list, function(x)x[[pathway]])
print(names(enrichment_list[[1]])[[pathway]])
rbinded_enrich_df <- data.table::rbindlist(l = rbinded_enrich_list, idcol = TRUE, fill = TRUE)
plots_enrich[[pathway]] <- rbinded_enrich_df %>%
group_by(.id) %>%
arrange(Adjusted.P.value) %>%
slice(1:10)
}
lapply(plots_enrich, function(x){
x %>%
ggplot(aes(x = Term, y = -log10(Adjusted.P.value))) +
geom_bar(stat = "identity", width = 0.05) + geom_point(size = 3) +
theme_minimal() +
theme(text = element_text(size = 10),
plot.title = element_text(hjust = (5 / nchar(results_list)) * 2),
plot.margin = margin(t = 5, r = 50, b = 5, unit = "pt"),
axis.text.y = element_text(size = 8)) +
coord_flip() +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "gray") +
theme_bw() +
labs(y = expression("Adjusted P value, Log"[10]*"")) +
facet_wrap(~.id)
})
```
### Gene Set Enrichment Analysis with ClusterProfiler
#### Annotaions
```{r, message=F, warning=F}
# SET THE DESIRED ORGANISM HERE
organism <- "org.Hs.eg.db"
# BiocManager::install(organism, character.only = TRUE)
library(organism, character.only = TRUE)
```
#### Prepare Input
```{r}
process_to_gsea <- function(x){
original_gene_list <- x$log2FoldChange
names(original_gene_list) <- rownames(x)
gene_list<-na.omit(original_gene_list)
gene_list <- sort(gene_list, decreasing = TRUE)
}
ls_processed <- lapply(results_list, process_to_gsea)
```
#### Gene Set Enrichment
```{r}
list_gse <- lapply(ls_processed, function(x) GSEA(geneList=x,
TERM2GENE = gmt,
minGSSize = 3,
maxGSSize = 800,
nPermSimple = 10000,
pvalueCutoff = 0.05,
verbose = TRUE,
pAdjustMethod = "fdr"))
```
#### Ridgeplot
Helpful to interpret up/down-regulated pathways.
```{r fig.width=18, fig.height=12}
rigdgeplots_list <- lapply(list_gse, ridgeplot)
lapply(names(rigdgeplots_list), function(x) ggsave(filename = paste0("../", figures.dir, x,"_BTM_plot.pdf"),
plot = rigdgeplots_list[[x]], width = 15, height = 10))
rigdgeplots_list
```
#### Dotplot
```{r echo=TRUE, fig.width=15, fig.height=8}
#require(DOSE)
#dotplot(gse, showCategory=10, split=".sign") + facet_grid(.~.sign)
```
#### GSEA Plot
Traditional method for visualizing GSEA result.
```{r fig.height=6, fig.width=8}
# Use the `Gene Set` param for the index in the title, and as the value for geneSetId
#gseaplot(gse, by = "all", title = gse$Description[55], geneSetID = 55)
#head(gse, 100)
```
### KEGG Gene Set Enrichment Analysis
#### Prepare Input
```{r}
# Convert gene IDs for gseKEGG function
# We will lose some genes here because not all IDs will be converted
ids<-bitr(names(ls_processed[[5]]), fromType = "SYMBOL", toType = "ENTREZID", OrgDb=organism)
# remove duplicate IDS (here I use "ENSEMBL", but it should be whatever was selected as keyType)
dedup_ids = ids[!duplicated(ids[c("SYMBOL")]),]
# Create a new dataframe df2 which has only the genes which were successfully mapped using the bitr function above
df2 = res_pos[rownames(res_pos) %in% dedup_ids$SYMBOL,]
# Create a new column in df2 with the corresponding ENTREZ IDs
df2$Y = dedup_ids$ENTREZID
# Create a vector of the gene unuiverse
kegg_gene_list <- df2$log2FoldChange
# Name vector with ENTREZ ids
names(kegg_gene_list) <- df2$Y
# omit any NA values
kegg_gene_list<-na.omit(kegg_gene_list)
# sort the list in decreasing order (required for clusterProfiler)
kegg_gene_list = sort(kegg_gene_list, decreasing = TRUE)
```
#### Create gseKEGG object
```{r}
#kegg_organism = "hsa"
#kk2 <- gseKEGG(geneList = kegg_gene_list,
organism = kegg_organism,
nPerm = 10000,
minGSSize = 3,
maxGSSize = 800,
pvalueCutoff = 0.05,
pAdjustMethod = "fdr",
keyType = "ncbi-geneid")
```
#### Ridgeplot
Helpful to interpret up/down-regulated pathways.
```{r fig.width=18, fig.height=12}
#gseKEGG(kk2) + labs(x = "enrichment distribution")
```
### Pathview
This will create a PNG and *different* PDF of the enriched KEGG pathway.
```{r, message=F, warning=F, echo = TRUE}
# Produce the native KEGG plot (PNG)
dme <- pathview(gene.data=kegg_gene_list, pathway.id="hsa04620", species = kegg_organism)
# Produce a different plot (PDF) (not displayed here)
dme <- pathview(gene.data=kegg_gene_list, pathway.id="map04620", species = kegg_organism, kegg.native = F)
```
```{r pressure, echo=TRUE, fig.cap="KEGG Native Enriched Pathway Plot", out.width = '100%'}
knitr::include_graphics("hsa03015.pathview.png")
```