# Exploration of participant's chem data.R

##Title: Exploration of participant's chem data
##Author: Emily Oosterhout
## setwd('C:/DATA FOOD COMPONENT ANALYSIS/RP2_ChemIBDFood')
##

# NOTE(review): hard-coded, machine-specific working directory. The relative
# file names passed to read_xlsx() further down are resolved against it.
setwd('C:/DATA FOOD COMPONENT ANALYSIS/RP2_ChemIBDFood')
##========================= LOAD DATA AND IMPORT LIBRARIES =============================##

#Import libraries
library(readxl)
library(dplyr)
library(tidyverse)
library(data.table)
library(reshape2)
library(ggplot2)
library(ggbreak) 
library(patchwork)
library(rlang)
library(writexl)
#packages for testing/visualizing data distribution
library(ggpubr)
library(FSA)
library(ggsignif)
library(ggrepel)
#specific packages for PCA
library(ggfortify)
library(corrr)
library(corrplot)
library(factoextra)
library(MASS)
library(Hotelling)


# Load the three Excel workbooks (paths are relative to the working directory)
chem_part <- read_xlsx("chem_raw_participant.xlsx")
chem <- read_xlsx("transposed_ffqchem_V2.xlsx")
diet_raw <- read_xlsx("diet_raw_v2.xlsx")

##=============================== CLEANING AND SUBSETTING ==============================##

# Swap every ":" in the column names for "_"
names(chem_part) <- gsub(":", "_", names(chem_part))

# Split the full participant table into the two cohorts by ID pattern
IBD <- chem_part[like(chem_part$UMCGIBDResearchIDorLLDeepID, "UMCGIBD"), ]
LLDEEP <- chem_part[like(chem_part$UMCGIBDResearchIDorLLDeepID, "LLDeep"), ]

# Per cohort, keep only columns 7 to 1114 (the chem intake columns)
IBD_chem <- IBD[, 7:1114]
LLDEEP_chem <- LLDEEP[, 7:1114]

# IDs of the IBD participants, as character
IBD_ID <- as.character(IBD$UMCGIBDResearchIDorLLDeepID)
# Label every row of the full table: "IBD" when its ID is among the IBD IDs,
# "LLDEEP" otherwise
chem_part$group <- ifelse(
  is.element(chem_part$UMCGIBDResearchIDorLLDeepID, IBD_ID),
  "IBD",
  "LLDEEP"
)

# Same column range for the full table, plus the names of those columns
num_dat <- chem_part[, 7:1114]
chem_names <- names(num_dat)

##============================ SCALING OF RAW CHEM_PART ==============================##

## Common scaling methods are 
# Autoscaling (also known as standardization), where the responses for each variable are centered by subtracting its mean value and then divided by its standard deviation 
# Pareto scaling, where the mean-centered values are divided by the square root of the standard deviation.

# Fraction of rows with a non-zero value, per variable
non_zero_pct <- colMeans(num_dat != 0)
# Variables that are non-zero in at least 10% of the rows
filter_10pct <- num_dat[, non_zero_pct >= 0.1]
# Variables that are non-zero in at least 20% of the rows
filter_20pct <- num_dat[, non_zero_pct >= 0.2]

# Shift the 20%-filtered data by a pseudocount of 1, then take the natural log
pseudo_chem <- filter_20pct + 1
log_chem <- as.data.frame(log(pseudo_chem))

##=================================== PCA ANALYSIS FOOD METABOLITES: AUTO SCALING ==============================##

# PCA on the log-transformed data, with every variable centered and scaled
logchem_pca_auto <- prcomp(log_chem, center = TRUE, scale. = TRUE)

# The factoextra plots below are only a quick check; the underlying data can be
# extracted and the graphs rebuilt with ggplot.
# Scree plot: percentage of variance explained by each component
fviz_eig(logchem_pca_auto, addlabels = TRUE)

# Individuals plot, coloured by quality of representation (cos2)
fviz_pca_ind(
  logchem_pca_auto,
  geom = "point",
  col.ind = "cos2",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE # avoid text overlapping
)

# Biplot; `contrib = 20` restricts the drawn variables to the 20 with the
# highest contribution
fviz_pca_biplot(
  logchem_pca_auto,
  geom = "point",
  repel = TRUE,
  col.var = "#2E9FDF", # variables colour
  col.ind = "#696969", # individuals colour
  select.var = list(name = NULL, cos2 = NULL, contrib = 20)
)

# Variables plot, coloured by cos2 (quality on the factor map)
fviz_pca_var(
  logchem_pca_auto,
  col.var = "cos2",
  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
  repel = TRUE # avoid text overlapping
)

# Top 20 variable contributions to PC1 and to PC2
fviz_contrib(logchem_pca_auto, choice = "var", axes = 1, top = 20)
fviz_contrib(logchem_pca_auto, choice = "var", axes = 2, top = 20)



# Store the PCA results in data frames: importance table, scores and loadings
pca_res <- summary(logchem_pca_auto)
scree.data <- as.data.frame(pca_res$importance)
score.data <- as.data.frame(pca_res$x)
loadings.data <- as.data.frame(pca_res$rotation)

# Per-individual PCA results, merged with the participant table by row name so
# that participant columns can be used as plot aesthetics.
# The fitted PCA object in this script is `logchem_pca_auto`; `logchem_pca`
# does not exist and made this call fail.
chem.pca.var <- get_pca_ind(logchem_pca_auto)
chem.pca.coord <- as.data.frame(chem.pca.var$coord)
chem.pca.coord.diet <- merge(chem_part, chem.pca.coord, by = "row.names")
chem.pca.contrib <- as.data.frame(chem.pca.var$contrib)
chem.pca.contrib.diet <- merge(chem_part, chem.pca.contrib, by = "row.names")

# NOTE(review): both plots draw the *contributions* table; for a PCA score
# plot `chem.pca.coord.diet` (coordinates) is probably the intended data.
# Confirm before interpreting these figures.
# NOTE(review): `Diet` and `type_of_diet` are assumed to be columns of
# chem_part carried through the merge; verify against the workbook.
ggplot(chem.pca.contrib.diet, aes(Dim.1, Dim.2, fill = Diet)) +
  geom_point(size = 3, shape = 21) +
  theme_bw() +
  theme(legend.position = 'bottom',
        axis.text.x = element_text(vjust = 0.5, size = 10))
ggplot(chem.pca.contrib.diet, aes(Dim.1, Dim.2, fill = type_of_diet)) +
  geom_point(size = 3, shape = 21) +
  theme_bw() +
  theme(legend.position = 'bottom',
        axis.text.x = element_text(vjust = 0.5, size = 10))

##=================================== DESCRIPTIVE STATISTICS INTAKE METABOLITE DATA =================================##

# Column-wise descriptive statistics.
#
# z: data frame or matrix; each statistic is computed per column via apply().
# Returns a data frame with one row per column of z and the columns
# Mean, Median, SD, SE (sd / sqrt(n)) and CV (sd / mean).
sumstats <- function(z) {
  stat_funs <- list(
    Mean = mean,
    Median = median,
    SD = sd,
    SE = function(v) sd(v) / sqrt(length(v)),
    CV = function(v) sd(v) / mean(v)
  )
  # One named vector per statistic, assembled column by column
  per_statistic <- lapply(stat_funs, function(f) apply(z, 2, f))
  do.call(data.frame, per_statistic)
}

# Summarise each cohort separately, tag every row with its cohort label and
# stack the two tables into one
stat_IBD <- sumstats(IBD_chem) # IBD patients
stat_LLDEEP <- sumstats(LLDEEP_chem) # LLDEEP participants
stat_IBD[["group"]] <- "IBD"
stat_LLDEEP[["group"]] <- "LLDEEP"
stat_chem <- rbind(stat_IBD, stat_LLDEEP) # full chem data


##===================================== DIFFERENTIAL ABUNDANCE ANALYSIS ON LOG TRANSFORMED AND FILTERED CHEM DATA ====================================================##

# Calculate the fraction of non-zero values for each variable
non_zero_pct <- apply(num_dat != 0, 2, mean)
# Filter variables with at least a non-zero value in 10% of the data
filter_10pct <- num_dat[, non_zero_pct >= 0.1]
# Filter variables with at least a non-zero value in 20% of the data
filter_20pct <- num_dat[, non_zero_pct >= 0.2]

# Add a pseudocount of 1 to all variables
pseudo_chem <- filter_20pct + 1

# Metabolite column names, captured BEFORE the grouping column is added.
# The per-metabolite test loops iterate over `names`; when it also contained
# "group", they tried to test the grouping column against itself and aborted.
# (The variable keeps its original name because later code refers to it.)
names <- names(pseudo_chem)

# Create a new column specifying the group for each sample
pseudo_chem$group <- ifelse(chem_part$UMCGIBDResearchIDorLLDeepID %in% IBD_ID, "IBD", "LLDEEP")

# Remove outliers per group with the 1.5 * IQR rule, then drop incomplete rows.
#
# data:      data frame holding the measurements and the grouping column.
# group_var: name (single string) of the column of `data` used for splitting.
#
# Within each group, every numeric column has the values outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] set to NA. The groups are then stacked with
# rbind() (rows end up ordered by group) and every row containing any NA,
# whether from an outlier or already present in the input, is dropped by
# na.omit().
# Returns the filtered data frame; an empty input is returned unchanged.
remove_outliers <- function(data, group_var) {
  stopifnot(
    is.data.frame(data),
    is.character(group_var),
    length(group_var) == 1L,
    group_var %in% names(data)
  )

  # Nothing to split: hand back the empty frame instead of NULL
  if (nrow(data) == 0L) {
    return(data)
  }

  groups <- lapply(split(data, data[[group_var]]), function(group) {
    numeric_cols <- which(vapply(group, is.numeric, logical(1)))
    for (j in numeric_cols) {
      column <- group[[j]]
      # na.rm = TRUE: pre-existing missing values must not abort quantile()
      quartiles <- quantile(column, c(0.25, 0.75), na.rm = TRUE, names = FALSE)
      fence <- 1.5 * (quartiles[2] - quartiles[1])
      is_outlier <- column > quartiles[2] + fence | column < quartiles[1] - fence
      # which() skips NA comparisons, so only true outliers are blanked
      column[which(is_outlier)] <- NA
      group[[j]] <- column
    }
    group
  })

  # Combine the groups back into a single data frame and drop incomplete rows
  result <- do.call(rbind, groups)
  na.omit(result)
}


pseudo_chem_clean <- remove_outliers(pseudo_chem, 'group')

##============================= T-TEST ==================================##
# Two-sample t-test (t.test() with its defaults) of every metabolite between
# the two groups. "group" is excluded explicitly: `names` may contain it, and
# testing the grouping column against itself makes t.test() fail.
metabolite_cols <- setdiff(names, "group")

# One p-value per metabolite; vapply() keeps the metabolite names and
# guarantees a numeric vector
ttest_raw <- vapply(
  metabolite_cols,
  function(col) {
    t.test(pseudo_chem_clean[[col]] ~ group, data = pseudo_chem_clean)$p.value
  },
  numeric(1)
)

# Store metabolites with raw p-value in a data frame (row names are the
# metabolite names, taken from the named p-value vector)
ttest_p <- data.frame(metabolites = names(ttest_raw),
                      p_raw = ttest_raw)
# Add FDR-adjusted p-values to account for multiple testing
ttest_p$p_adjusted <- p.adjust(ttest_p$p_raw, method = "fdr")

# Long-format data frame for plotting raw and adjusted p-values together
df <- data.frame(x = c(ttest_p$p_raw, ttest_p$p_adjusted),
                 type = rep(c("raw", "fdr"),
                            c(length(ttest_p$p_raw),
                              length(ttest_p$p_adjusted))))

# Histogram of raw and adjusted p-values
ttest_plot <- ggplot(df) +
  geom_histogram(aes(x = x, fill = type)) +
  labs(x = "p-value", y = "Frequency")

ttest_plot

# Metabolites with the three HIGHEST adjusted p-values. order() + head() picks
# exactly three (fewer only if fewer were tested), also when adjusted p-values
# are tied; matching on the p-values themselves returned every tied metabolite.
highest3 <- ttest_p$metabolites[head(order(ttest_p$p_adjusted, decreasing = TRUE), 3)]
# From the outlier-filtered pseudocount table, keep only those metabolites
# (drop = FALSE keeps a data frame even if a single column is selected)
highest3_chem <- pseudo_chem_clean[, colnames(pseudo_chem_clean) %in% highest3, drop = FALSE]
# Put the group label in front as the first column
highest3_full <- cbind(pseudo_chem_clean$group, highest3_chem)

# Metabolites with the three LOWEST adjusted p-values, selected the same way
lowest3 <- ttest_p$metabolites[head(order(ttest_p$p_adjusted, decreasing = FALSE), 3)]
# From the outlier-filtered pseudocount table, keep only those metabolites
lowest3_chem <- pseudo_chem_clean[, colnames(pseudo_chem_clean) %in% lowest3, drop = FALSE]
# Put the group label in front as the first column
lowest3_full <- cbind(pseudo_chem_clean$group, lowest3_chem)

# Puts plots in the same picture: one box plot per selected metabolite, the
# three "highest" panels first, then the three "lowest" ones. Each panel takes
# one (data frame, column index) pair; columns 2 to 4 are the metabolite
# columns, column 1 is the group label.
gridExtra::grid.arrange(
  grobs = Map(
    function(panel_data, col_idx) {
      ggplot(
        panel_data,
        aes(
          x = pseudo_chem_clean$group,
          y = panel_data[, col_idx],
          fill = pseudo_chem_clean$group
        )
      ) +
        geom_boxplot() +
        geom_signif(
          comparisons = list(c("IBD", "LLDEEP")),
          map_signif_level = TRUE
        ) +
        ylab("Metabolite intake") + # y axis title
        ggtitle(names(panel_data)[col_idx]) + # main title
        theme_minimal() +
        # makes titles smaller, removes legend and x axis title
        theme(
          title = element_text(size = 7),
          legend.position = 'none',
          axis.text = element_text(size = 7),
          axis.title.x = element_blank()
        )
    },
    list(
      highest3_full, highest3_full, highest3_full,
      lowest3_full, lowest3_full, lowest3_full
    ),
    c(2, 3, 4, 2, 3, 4)
  ),
  # 3 columns and 2 rows
  ncol = 3,
  nrow = 2
)

##======================= WILCOXON TEST ======================##
# Wilcoxon rank-sum test (wilcox.test() with its defaults) of every metabolite
# between the two groups. "group" is excluded explicitly: `names` may contain
# it, and wilcox.test() rejects the non-numeric grouping column as a response.
metabolite_cols <- setdiff(names, "group")

# One p-value per metabolite; vapply() keeps the metabolite names and
# guarantees a numeric vector
wilcoxon_raw <- vapply(
  metabolite_cols,
  function(col) {
    wilcox.test(pseudo_chem_clean[[col]] ~ group, data = pseudo_chem_clean)$p.value
  },
  numeric(1)
)

# Store metabolites with raw p-value in a data frame (row names are the
# metabolite names, taken from the named p-value vector)
wilcoxon_p <- data.frame(metabolites = names(wilcoxon_raw),
                         p_raw = wilcoxon_raw)
# Add FDR-adjusted p-values to account for multiple testing
wilcoxon_p$p_adjusted <- p.adjust(wilcoxon_p$p_raw, method = "fdr")

# Long-format data frame for plotting raw and adjusted p-values together
df <- data.frame(x = c(wilcoxon_p$p_raw, wilcoxon_p$p_adjusted),
                 type = rep(c("raw", "fdr"),
                            c(length(wilcoxon_p$p_raw),
                              length(wilcoxon_p$p_adjusted))))

# Histogram of raw and adjusted p-values
wilcoxon_plot <- ggplot(df) +
  geom_histogram(aes(x = x, fill = type)) +
  labs(x = "p-value", y = "Frequency")

wilcoxon_plot


# Metabolites with the three HIGHEST adjusted p-values. order() + head() picks
# exactly three (fewer only if fewer were tested), also when adjusted p-values
# are tied; matching on the p-values themselves returned every tied metabolite.
highest3 <- wilcoxon_p$metabolites[head(order(wilcoxon_p$p_adjusted, decreasing = TRUE), 3)]
# From the outlier-filtered pseudocount table, keep only those metabolites
# (drop = FALSE keeps a data frame even if a single column is selected)
highest3_chem <- pseudo_chem_clean[, colnames(pseudo_chem_clean) %in% highest3, drop = FALSE]
# Put the group label in front as the first column
highest3_full <- cbind(pseudo_chem_clean$group, highest3_chem)

# Metabolites with the three LOWEST adjusted p-values, selected the same way
lowest3 <- wilcoxon_p$metabolites[head(order(wilcoxon_p$p_adjusted, decreasing = FALSE), 3)]
# From the outlier-filtered pseudocount table, keep only those metabolites
lowest3_chem <- pseudo_chem_clean[, colnames(pseudo_chem_clean) %in% lowest3, drop = FALSE]
# Put the group label in front as the first column
lowest3_full <- cbind(pseudo_chem_clean$group, lowest3_chem)

# Puts plots in the same picture: one box plot per selected metabolite, the
# three "highest" panels first, then the three "lowest" ones. Each panel takes
# one (data frame, column index) pair; columns 2 to 4 are the metabolite
# columns, column 1 is the group label.
gridExtra::grid.arrange(
  grobs = Map(
    function(panel_data, col_idx) {
      ggplot(
        panel_data,
        aes(
          x = pseudo_chem_clean$group,
          y = panel_data[, col_idx],
          fill = pseudo_chem_clean$group
        )
      ) +
        geom_boxplot() +
        geom_signif(
          comparisons = list(c("IBD", "LLDEEP")),
          map_signif_level = TRUE
        ) +
        ylab("Metabolite intake") + # y axis title
        ggtitle(names(panel_data)[col_idx]) + # main title
        theme_minimal() +
        # makes titles smaller, removes legend and x axis title
        theme(
          title = element_text(size = 7),
          legend.position = 'none',
          axis.text = element_text(size = 7),
          axis.title.x = element_blank()
        )
    },
    list(
      highest3_full, highest3_full, highest3_full,
      lowest3_full, lowest3_full, lowest3_full
    ),
    c(2, 3, 4, 2, 3, 4)
  ),
  # 3 columns and 2 rows
  ncol = 3,
  nrow = 2
)


##============================== VOLCANO PLOT OF DIFFERENTIAL ABUNDANCE ANALYSIS ===============================##
# log2 transformation of every metabolite column. Columns are selected by name
# (everything except "group") instead of the fixed positions 1:1009, so the
# step keeps working when the number of filtered metabolites changes.
metabolite_cols <- setdiff(names(pseudo_chem_clean), "group")
pseudo_chem_clean_log2 <- log2(pseudo_chem_clean[, metabolite_cols, drop = FALSE])
# Reuse the existing group labels rather than re-deriving them from row names
pseudo_chem_clean_log2$group <- pseudo_chem_clean$group

# Mean log2 value of each metabolite in the IBD group.
# NOTE(review): `IBD` and `LLDEEP` overwrite the cohort data frames created at
# the top of the script; the names are kept because later code may rely on them.
pseudo_chem_clean_ibd <- pseudo_chem_clean_log2[pseudo_chem_clean_log2$group == "IBD", , drop = FALSE]
IBD <- vapply(pseudo_chem_clean_ibd[metabolite_cols], mean, numeric(1))

# Mean log2 value of each metabolite in the LLDEEP group
pseudo_chem_clean_lldeep <- pseudo_chem_clean_log2[pseudo_chem_clean_log2$group == "LLDEEP", , drop = FALSE]
LLDEEP <- vapply(pseudo_chem_clean_lldeep[metabolite_cols], mean, numeric(1))

# The data is already log2 transformed, so the log2 fold change is the
# difference between the group means.
# NOTE(review): the difference is LLDEEP minus IBD (positive = higher in
# LLDEEP) while the axis label reads "IBD vs LLDEEP"; confirm the direction.
foldchange <- LLDEEP - IBD
hist(foldchange, xlab = "log2 Fold Change (IBD vs LLDEEP)")

# Add the fold change to the p-value table, matched by metabolite name rather
# than by position
wilcoxon_p$foldchange <- foldchange[as.character(wilcoxon_p$metabolites)]

# Cut-offs used for labelling and for the guide lines in the plot
fc_cutoff <- 0.6
p_cutoff <- 0.05

# Default every metabolite to "NO" (no intake difference)
wilcoxon_p$intakedifference <- "NO"
# log2 fold change above the cut-off and raw p-value below 0.05: "UP"
is_up <- which(wilcoxon_p$foldchange > fc_cutoff & wilcoxon_p$p_raw < p_cutoff)
wilcoxon_p$intakedifference[is_up] <- "UP"
# log2 fold change below minus the cut-off and raw p-value below 0.05: "DOWN"
is_down <- which(wilcoxon_p$foldchange < -fc_cutoff & wilcoxon_p$p_raw < p_cutoff)
wilcoxon_p$intakedifference[is_down] <- "DOWN"

# "label" holds the metabolite name where intake differs between the groups
# and NA everywhere else, so only those points get a text label
wilcoxon_p$label <- NA
is_different <- wilcoxon_p$intakedifference != "NO"
wilcoxon_p$label[is_different] <- wilcoxon_p$metabolites[is_different]

# Volcano plot: log2 fold change against -log10 of the raw p-value
ggplot(wilcoxon_p, aes(x = foldchange, y = -1 * log10(p_raw), col = intakedifference, label = label)) +
  geom_point() +
  theme_minimal() +
  geom_vline(xintercept = c(-fc_cutoff, fc_cutoff), col = "red") +
  geom_hline(yintercept = -log10(p_cutoff), col = "red") +
  geom_text_repel(size = 1.5)