Skip to content

Commit

Permalink
Add new file
Browse files Browse the repository at this point in the history
  • Loading branch information
Omabekee authored Sep 15, 2024
1 parent db52fba commit 526f6c7
Showing 1 changed file with 207 additions and 0 deletions.
207 changes: 207 additions & 0 deletions gene-expression-analysis.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
---
title: "Gene Expression Analysis and Visualisation of Glioblastoma Data"
author: "Authors: Chioma Onyido, Chairunnisa Amanda, Hayford Osei, Bassam Elhamsa, Chukwuemeka Nwachuya, Emmanuel Afolayemi, Ibrahim Fangary"
date: "`r Sys.Date()`"
output: html_document
---

```{r setup, include=TRUE}
knitr::opts_chunk$set(echo = TRUE)
```

#### In this document, we'll walk through our gene expression analysis process. Beginning from the selection of dataset of interest to the visualization of our results. The codes and outputs will be shown.

```{r heatmap visualisation, echo=TRUE, results='asis', message=FALSE, warning=FALSE}
# STEP 1: Read in data and assign to variable
gbm_data <- read.csv("glioblastoma.csv", row.names=1)
head(gbm_data)
# STEP 2: Generate heatmaps and visualize clusters
library(gplots)
library(RColorBrewer)
# a- Define a color palette for heatmap
# Diverging color palettes
colors_diverging <- colorRampPalette(c("Blue", "White", "Brown"))(100)
heatmap.2(as.matrix(gbm_data), trace = 'none',
scale='row', dendogram = 'col',
Colv = TRUE, Rowv = FALSE,
cexCol = 0.6,
col = colors_diverging,
main = "Diverging color scale")
# Sequential color palettes
heatmap.2(as.matrix(gbm_data), trace = 'none',
scale='row', dendogram = 'col',
Colv = TRUE, Rowv = FALSE,
cexCol = 0.6,
col = colorRampPalette(brewer.pal(11, "Blues"))(100),
main = "Sequential color scale")
# b- Cluster by genes, samples and both
# Cluster by both genes and samples
heatmap.2(as.matrix(gbm_data), trace = 'none',
scale='row', dendrogram = 'both',
Colv = TRUE, Rowv = TRUE,
cexCol = 0.6,
col = colorRampPalette(brewer.pal(9, "YlGnBu"))(100),
main = "Clustering by genes and samples")
# Cluster by rows (genes)
heatmap.2(as.matrix(gbm_data), trace = 'none',
scale='row', dendrogram = 'row',
Colv = FALSE, Rowv = TRUE,
cexCol = 0.6,
col = colorRampPalette(brewer.pal(9, "YlGnBu"))(100),
main = "Clustering by genes")
# Cluster by columns (samples)
heatmap.2(as.matrix(gbm_data), trace = 'none',
scale='row', dendrogram = 'col',
Colv = TRUE, Rowv = FALSE,
cexCol = 0.6,
col = colorRampPalette(brewer.pal(9, "YlGnBu"))(100),
main = "Clustering by samples")
```

```{r diff exp genes, echo=TRUE, results='asis', message=FALSE, warning=FALSE}
# STEP 3: Subset genes that are significantly upregulated and downregulated
# a- First, calculate fold change and pvalue
# Divide the groups based on the clusters from the heatmap
# Get column names
colnames(gbm_data)
# Select groups by index positions, group A and group B
groupA <- c(1,2,3,4,5)
groupB <- c(6,7,8,9,10)
groupA_data <- gbm_data[, groupA]
groupB_data <- gbm_data[, groupB]
# View data
View(groupA_data)
View(groupB_data)
# Get means of both groups to calculate fold change
groupA_mean <- rowMeans(groupA_data)
groupB_mean <- rowMeans(groupB_data)
# Calculate log fold change
logFC <- log2(groupB_mean+0.5) - log2(groupA_mean+0.5)
# Calculate p values using Wilcoxon test
pvalues <- apply(gbm_data, 1, function(row) {
wilcox.test(row[1:5], row[6:10])$p.value
})
plot(logFC, -log10(pvalues))
# Create a dataframe that has the logFC and pvalue for each gene
library(dplyr)
DEG <- tibble(logFC, pvalues)
View(DEG)
rownames(DEG) <- rownames(gbm_data)
# b- Subset upregulated and downregulated genes
upreg_genes <- DEG %>% filter(logFC >= 1.5 & pvalues<= 0.05)
View(upreg_genes)
downreg_genes <- DEG %>% filter(logFC <= -1.5 & pvalues<= 0.05)
View(downreg_genes)
# writing the up and downregulated genes
write.csv(upreg_genes,"upregul_genes.csv",row.names=TRUE)
write.csv(downreg_genes,"downregul_genes.csv",row.names=TRUE)
# c- Get gene symbol from Ensembl
# Connect to Ensembl
library(biomaRt)
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
ensembl_ids <- rownames(DEG)
gene_annotations <- getBM(
attributes = c("ensembl_gene_id", "hgnc_symbol"),
filters = "ensembl_gene_id",
values = ensembl_ids,
mart = ensembl
)
head(gene_annotations)
# Create a list of gene annotations
keys <- gene_annotations$ensembl_gene_id
values <- gene_annotations$hgnc_symbol
l <- list()
for (i in 1:length(keys)){
l[keys[i]] <- values[i]
}
# Read in upregulated genes file and assign to variable
upreg_genes_with_symbol <- read.csv("upregul_genes.csv", row.names = 1)
# add new column "symbol" to upreg_genes_with_symbol
upreg_genes_with_symbol$symbol <- unlist(l[rownames(upreg_genes_with_symbol)], use.names = FALSE)
# Read in downregulated genes file and assign to variable
downreg_genes_with_symbol <- read.csv("downregul_genes.csv", row.names = 1)
# add new column "symbol" to downreg_genes_with_symbol
downreg_genes_with_symbol$symbol <- unlist(l[rownames(downreg_genes_with_symbol)], use.names = FALSE)
write.csv(upreg_genes_with_symbol, "upregul_genes_with_symb.csv", row.names=FALSE)
write.csv(downreg_genes_with_symbol, "downregul_genes_with_symb.csv", row.names=FALSE)
View(downreg_genes_with_symbol)
View(upreg_genes_with_symbol)
```


```{r lollipop plot, echo=TRUE, results='asis', message=FALSE, warning=FALSE}
# STEP 4/5- Functional enrichment and Visualisation of top pathways
# Import upregulated enriched pathways (ShinyGO)
library(tibble)
upreg_pathways <- read.csv("upreg_top_pathways.csv")
# Getting the name of the pathways only in new column
upreg_pathways <- upreg_pathways %>%
mutate(Pathways_names = trimws(sub("GO:........","",upreg_pathways$Pathway)))
# Lollipop plot of the top 5 upregulated enriched pathways color scaling the
# points according to - log 10 of the p_val
library(ggplot2)
ggplot(upreg_pathways[c(1:5),], aes(x = Pathways_names, y = nGenes)) + # defining the data and x,y axes
geom_segment(aes(x = Pathways_names, xend = Pathways_names,
y = 0, yend = nGenes),
color="black") + # drawing segments of the lollipop in black color
geom_point(aes(color=-log10(Enrichment.FDR)),
size=8) + # Drawing the points of the lollipop and color scale it by - log 10 of the p_val
scale_color_gradient(low = "blue",high = "red")+ # Specify the color scaling from blue to red
coord_flip()+ # Flipping the axes for better visualization
theme_grey()+ # Using grey theme
labs(title = "Top 5 upregulated pathways: Gene Number and Significance Levels",
x = "pathway",
y = "gene_num",
color="-log10 p_val")+ # specifying the labels for each element in the plot
theme(plot.title = element_text(size = 20),
axis.title = element_text(size = 16),
axis.text = element_text(size = 14)) # Adjusting text sizes in the plot
```

##### For more details on the tools and libraries used for each step in this analysis,
##### kindly check the README file in the Stage 2 folder in this repo.

0 comments on commit 526f6c7

Please sign in to comment.