Add new file

Omabekee · Sep 15, 2024 · 526f6c7 · 526f6c7
1 parent db52fba
commit 526f6c7
Showing 1 changed file with 207 additions and 0 deletions.
diff --git a/gene-expression-analysis.Rmd b/gene-expression-analysis.Rmd
@@ -0,0 +1,207 @@
+---
+title: "Gene Expression Analysis and Visualisation of Glioblastoma Data"
+author: "Authors: Chioma Onyido, Chairunnisa Amanda, Hayford Osei, Bassam Elhamsa, Chukwuemeka Nwachuya, Emmanuel Afolayemi, Ibrahim Fangary"
+date: "`r Sys.Date()`"
+output: html_document
+---
+
+```{r setup, include=TRUE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+#### In this document, we'll walk through our gene expression analysis process. Beginning from the selection of dataset of interest to the visualization of our results. The codes and outputs will be shown.
+
+```{r heatmap visualisation, echo=TRUE, results='asis', message=FALSE, warning=FALSE}
+
+# STEP 1: Read in data and assign to variable
+
+gbm_data <- read.csv("glioblastoma.csv", row.names=1)
+
+head(gbm_data)
+
+# STEP 2: Generate heatmaps and visualize clusters
+library(gplots)
+library(RColorBrewer)
+
+# a- Define a color palette for heatmap
+# Diverging color palettes
+
+colors_diverging <- colorRampPalette(c("Blue", "White", "Brown"))(100)
+heatmap.2(as.matrix(gbm_data), trace = 'none',
+          scale='row', dendogram = 'col',
+          Colv = TRUE, Rowv = FALSE,
+          cexCol = 0.6,
+          col = colors_diverging,
+          main = "Diverging color scale")
+
+# Sequential color palettes
+heatmap.2(as.matrix(gbm_data), trace = 'none',
+          scale='row', dendogram = 'col',
+          Colv = TRUE, Rowv = FALSE,
+          cexCol = 0.6,
+          col = colorRampPalette(brewer.pal(11, "Blues"))(100),
+          main = "Sequential color scale")
+
+# b- Cluster by genes, samples and both
+# Cluster by both genes and samples
+heatmap.2(as.matrix(gbm_data), trace = 'none',
+          scale='row', dendrogram = 'both',
+          Colv = TRUE, Rowv = TRUE,
+          cexCol = 0.6,
+          col = colorRampPalette(brewer.pal(9, "YlGnBu"))(100),
+          main = "Clustering by genes and samples")
+
+# Cluster by rows (genes)
+heatmap.2(as.matrix(gbm_data), trace = 'none',
+          scale='row', dendrogram = 'row',
+          Colv = FALSE, Rowv = TRUE,
+          cexCol = 0.6,
+          col = colorRampPalette(brewer.pal(9, "YlGnBu"))(100),
+          main = "Clustering by genes")
+
+
+# Cluster by columns (samples)
+heatmap.2(as.matrix(gbm_data), trace = 'none',
+          scale='row', dendrogram = 'col',
+          Colv = TRUE, Rowv = FALSE,
+          cexCol = 0.6,
+          col = colorRampPalette(brewer.pal(9, "YlGnBu"))(100),
+          main = "Clustering by samples")
+```
+
+```{r diff exp genes, echo=TRUE, results='asis', message=FALSE, warning=FALSE}
+# STEP 3: Subset genes that are significantly upregulated and downregulated
+
+# a- First, calculate fold change and pvalue
+
+# Divide the groups based on the clusters from the heatmap
+
+# Get column names
+colnames(gbm_data)
+
+# Select groups by index positions, group A and group B
+groupA <- c(1,2,3,4,5)
+groupB <- c(6,7,8,9,10)
+
+groupA_data <- gbm_data[, groupA]
+groupB_data <- gbm_data[, groupB]
+
+# View data
+View(groupA_data)
+View(groupB_data)
+
+# Get means of both groups to calculate fold change
+groupA_mean <- rowMeans(groupA_data)
+groupB_mean <- rowMeans(groupB_data)
+
+# Calculate log fold change
+logFC <- log2(groupB_mean+0.5) - log2(groupA_mean+0.5)
+
+
+# Calculate p values using Wilcoxon test
+pvalues <- apply(gbm_data, 1, function(row) {
+  wilcox.test(row[1:5], row[6:10])$p.value
+})
+
+plot(logFC, -log10(pvalues))
+
+
+# Create a dataframe that has the logFC and pvalue for each gene
+library(dplyr)
+DEG <- tibble(logFC, pvalues)
+View(DEG)
+rownames(DEG) <- rownames(gbm_data)
+
+# b- Subset upregulated and downregulated genes
+upreg_genes <- DEG %>% filter(logFC >= 1.5 & pvalues<= 0.05)
+View(upreg_genes)
+
+downreg_genes <- DEG %>% filter(logFC <= -1.5 & pvalues<= 0.05)
+View(downreg_genes)                              
+
+# writing the up and downregulated genes 
+write.csv(upreg_genes,"upregul_genes.csv",row.names=TRUE)  
+write.csv(downreg_genes,"downregul_genes.csv",row.names=TRUE)
+
+# c- Get gene symbol from Ensembl
+# Connect to Ensembl
+library(biomaRt)
+ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
+ensembl_ids <- rownames(DEG)
+
+gene_annotations <- getBM(
+  attributes = c("ensembl_gene_id", "hgnc_symbol"),
+  filters = "ensembl_gene_id",                     
+  values = ensembl_ids,                           
+  mart = ensembl                                    
+)
+
+head(gene_annotations)
+
+# Create a list of gene annotations
+keys <- gene_annotations$ensembl_gene_id
+values <- gene_annotations$hgnc_symbol
+
+l <- list()
+for (i in 1:length(keys)){
+  l[keys[i]] <- values[i]
+}
+
+# Read in upregulated genes file and assign to variable
+upreg_genes_with_symbol <- read.csv("upregul_genes.csv", row.names = 1)
+
+
+# add new column "symbol" to upreg_genes_with_symbol
+upreg_genes_with_symbol$symbol <- unlist(l[rownames(upreg_genes_with_symbol)], use.names = FALSE)
+
+# Read in downregulated genes file and assign to variable
+downreg_genes_with_symbol <- read.csv("downregul_genes.csv", row.names = 1)
+
+
+# add new column "symbol" to downreg_genes_with_symbol
+downreg_genes_with_symbol$symbol <- unlist(l[rownames(downreg_genes_with_symbol)], use.names = FALSE)
+
+write.csv(upreg_genes_with_symbol, "upregul_genes_with_symb.csv", row.names=FALSE)
+write.csv(downreg_genes_with_symbol, "downregul_genes_with_symb.csv", row.names=FALSE)
+
+View(downreg_genes_with_symbol)
+View(upreg_genes_with_symbol)
+```
+
+
+```{r lollipop plot, echo=TRUE, results='asis', message=FALSE, warning=FALSE}
+# STEP 4/5- Functional enrichment and Visualisation of top pathways
+
+# Import upregulated enriched pathways (ShinyGO)
+library(tibble)
+upreg_pathways <- read.csv("upreg_top_pathways.csv")
+
+# Getting the name of the pathways only in new column
+upreg_pathways <- upreg_pathways %>%
+  mutate(Pathways_names = trimws(sub("GO:........","",upreg_pathways$Pathway)))
+
+
+# Lollipop plot of the top 5 upregulated enriched pathways color scaling the
+# points according to - log 10 of the p_val
+library(ggplot2)
+
+ggplot(upreg_pathways[c(1:5),], aes(x = Pathways_names, y = nGenes)) + # defining the data and x,y axes
+  geom_segment(aes(x = Pathways_names, xend = Pathways_names,
+                   y = 0, yend = nGenes),
+               color="black") +  # drawing segments of the lollipop in black color
+  geom_point(aes(color=-log10(Enrichment.FDR)),
+             size=8) + # Drawing the points of the lollipop and color scale it by - log 10 of the p_val
+  scale_color_gradient(low = "blue",high = "red")+ # Specify the color scaling from blue to red
+  coord_flip()+ # Flipping the axes for better visualization 
+  theme_grey()+ # Using grey theme 
+  labs(title = "Top 5 upregulated pathways: Gene Number and Significance Levels",
+       x = "pathway",
+       y = "gene_num",
+       color="-log10 p_val")+    # specifying the labels for each element in the plot
+  theme(plot.title = element_text(size = 20),
+        axis.title = element_text(size = 16),
+        axis.text = element_text(size = 14)) # Adjusting text sizes in the plot
+```
+
+##### For more details on the tools and libraries used for each step in this analysis,
+##### kindly check the README file in the Stage 2 folder in this repo.