-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
207 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,207 @@ | ||
--- | ||
title: "Gene Expression Analysis and Visualisation of Glioblastoma Data" | ||
author: "Authors: Chioma Onyido, Chairunnisa Amanda, Hayford Osei, Bassam Elhamsa, Chukwuemeka Nwachuya, Emmanuel Afolayemi, Ibrahim Fangary" | ||
date: "`r Sys.Date()`" | ||
output: html_document | ||
--- | ||
|
||
```{r setup, include=TRUE} | ||
knitr::opts_chunk$set(echo = TRUE) | ||
``` | ||
|
||
#### In this document, we'll walk through our gene expression analysis process. Beginning from the selection of dataset of interest to the visualization of our results. The codes and outputs will be shown. | ||
|
||
```{r heatmap visualisation, echo=TRUE, results='asis', message=FALSE, warning=FALSE} | ||
# STEP 1: Read in data and assign to variable | ||
gbm_data <- read.csv("glioblastoma.csv", row.names=1) | ||
head(gbm_data) | ||
# STEP 2: Generate heatmaps and visualize clusters | ||
library(gplots) | ||
library(RColorBrewer) | ||
# a- Define a color palette for heatmap | ||
# Diverging color palettes | ||
colors_diverging <- colorRampPalette(c("Blue", "White", "Brown"))(100) | ||
heatmap.2(as.matrix(gbm_data), trace = 'none', | ||
scale='row', dendogram = 'col', | ||
Colv = TRUE, Rowv = FALSE, | ||
cexCol = 0.6, | ||
col = colors_diverging, | ||
main = "Diverging color scale") | ||
# Sequential color palettes | ||
heatmap.2(as.matrix(gbm_data), trace = 'none', | ||
scale='row', dendogram = 'col', | ||
Colv = TRUE, Rowv = FALSE, | ||
cexCol = 0.6, | ||
col = colorRampPalette(brewer.pal(11, "Blues"))(100), | ||
main = "Sequential color scale") | ||
# b- Cluster by genes, samples and both | ||
# Cluster by both genes and samples | ||
heatmap.2(as.matrix(gbm_data), trace = 'none', | ||
scale='row', dendrogram = 'both', | ||
Colv = TRUE, Rowv = TRUE, | ||
cexCol = 0.6, | ||
col = colorRampPalette(brewer.pal(9, "YlGnBu"))(100), | ||
main = "Clustering by genes and samples") | ||
# Cluster by rows (genes) | ||
heatmap.2(as.matrix(gbm_data), trace = 'none', | ||
scale='row', dendrogram = 'row', | ||
Colv = FALSE, Rowv = TRUE, | ||
cexCol = 0.6, | ||
col = colorRampPalette(brewer.pal(9, "YlGnBu"))(100), | ||
main = "Clustering by genes") | ||
# Cluster by columns (samples) | ||
heatmap.2(as.matrix(gbm_data), trace = 'none', | ||
scale='row', dendrogram = 'col', | ||
Colv = TRUE, Rowv = FALSE, | ||
cexCol = 0.6, | ||
col = colorRampPalette(brewer.pal(9, "YlGnBu"))(100), | ||
main = "Clustering by samples") | ||
``` | ||
|
||
```{r diff exp genes, echo=TRUE, results='asis', message=FALSE, warning=FALSE} | ||
# STEP 3: Subset genes that are significantly upregulated and downregulated | ||
# a- First, calculate fold change and pvalue | ||
# Divide the groups based on the clusters from the heatmap | ||
# Get column names | ||
colnames(gbm_data) | ||
# Select groups by index positions, group A and group B | ||
groupA <- c(1,2,3,4,5) | ||
groupB <- c(6,7,8,9,10) | ||
groupA_data <- gbm_data[, groupA] | ||
groupB_data <- gbm_data[, groupB] | ||
# View data | ||
View(groupA_data) | ||
View(groupB_data) | ||
# Get means of both groups to calculate fold change | ||
groupA_mean <- rowMeans(groupA_data) | ||
groupB_mean <- rowMeans(groupB_data) | ||
# Calculate log fold change | ||
logFC <- log2(groupB_mean+0.5) - log2(groupA_mean+0.5) | ||
# Calculate p values using Wilcoxon test | ||
pvalues <- apply(gbm_data, 1, function(row) { | ||
wilcox.test(row[1:5], row[6:10])$p.value | ||
}) | ||
plot(logFC, -log10(pvalues)) | ||
# Create a dataframe that has the logFC and pvalue for each gene | ||
library(dplyr) | ||
DEG <- tibble(logFC, pvalues) | ||
View(DEG) | ||
rownames(DEG) <- rownames(gbm_data) | ||
# b- Subset upregulated and downregulated genes | ||
upreg_genes <- DEG %>% filter(logFC >= 1.5 & pvalues<= 0.05) | ||
View(upreg_genes) | ||
downreg_genes <- DEG %>% filter(logFC <= -1.5 & pvalues<= 0.05) | ||
View(downreg_genes) | ||
# writing the up and downregulated genes | ||
write.csv(upreg_genes,"upregul_genes.csv",row.names=TRUE) | ||
write.csv(downreg_genes,"downregul_genes.csv",row.names=TRUE) | ||
# c- Get gene symbol from Ensembl | ||
# Connect to Ensembl | ||
library(biomaRt) | ||
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl") | ||
ensembl_ids <- rownames(DEG) | ||
gene_annotations <- getBM( | ||
attributes = c("ensembl_gene_id", "hgnc_symbol"), | ||
filters = "ensembl_gene_id", | ||
values = ensembl_ids, | ||
mart = ensembl | ||
) | ||
head(gene_annotations) | ||
# Create a list of gene annotations | ||
keys <- gene_annotations$ensembl_gene_id | ||
values <- gene_annotations$hgnc_symbol | ||
l <- list() | ||
for (i in 1:length(keys)){ | ||
l[keys[i]] <- values[i] | ||
} | ||
# Read in upregulated genes file and assign to variable | ||
upreg_genes_with_symbol <- read.csv("upregul_genes.csv", row.names = 1) | ||
# add new column "symbol" to upreg_genes_with_symbol | ||
upreg_genes_with_symbol$symbol <- unlist(l[rownames(upreg_genes_with_symbol)], use.names = FALSE) | ||
# Read in downregulated genes file and assign to variable | ||
downreg_genes_with_symbol <- read.csv("downregul_genes.csv", row.names = 1) | ||
# add new column "symbol" to downreg_genes_with_symbol | ||
downreg_genes_with_symbol$symbol <- unlist(l[rownames(downreg_genes_with_symbol)], use.names = FALSE) | ||
write.csv(upreg_genes_with_symbol, "upregul_genes_with_symb.csv", row.names=FALSE) | ||
write.csv(downreg_genes_with_symbol, "downregul_genes_with_symb.csv", row.names=FALSE) | ||
View(downreg_genes_with_symbol) | ||
View(upreg_genes_with_symbol) | ||
``` | ||
|
||
|
||
```{r lollipop plot, echo=TRUE, results='asis', message=FALSE, warning=FALSE} | ||
# STEP 4/5- Functional enrichment and Visualisation of top pathways | ||
# Import upregulated enriched pathways (ShinyGO) | ||
library(tibble) | ||
upreg_pathways <- read.csv("upreg_top_pathways.csv") | ||
# Getting the name of the pathways only in new column | ||
upreg_pathways <- upreg_pathways %>% | ||
mutate(Pathways_names = trimws(sub("GO:........","",upreg_pathways$Pathway))) | ||
# Lollipop plot of the top 5 upregulated enriched pathways color scaling the | ||
# points according to - log 10 of the p_val | ||
library(ggplot2) | ||
ggplot(upreg_pathways[c(1:5),], aes(x = Pathways_names, y = nGenes)) + # defining the data and x,y axes | ||
geom_segment(aes(x = Pathways_names, xend = Pathways_names, | ||
y = 0, yend = nGenes), | ||
color="black") + # drawing segments of the lollipop in black color | ||
geom_point(aes(color=-log10(Enrichment.FDR)), | ||
size=8) + # Drawing the points of the lollipop and color scale it by - log 10 of the p_val | ||
scale_color_gradient(low = "blue",high = "red")+ # Specify the color scaling from blue to red | ||
coord_flip()+ # Flipping the axes for better visualization | ||
theme_grey()+ # Using grey theme | ||
labs(title = "Top 5 upregulated pathways: Gene Number and Significance Levels", | ||
x = "pathway", | ||
y = "gene_num", | ||
color="-log10 p_val")+ # specifying the labels for each element in the plot | ||
theme(plot.title = element_text(size = 20), | ||
axis.title = element_text(size = 16), | ||
axis.text = element_text(size = 14)) # Adjusting text sizes in the plot | ||
``` | ||
|
||
##### For more details on the tools and libraries used for each step in this analysis, | ||
##### kindly check the README file in the Stage 2 folder in this repo. |