example_solutions.Rmd

---
title: "Oulu 2023 course -- example solutions"
output: html_document
date: '2023-06-27'
---

```{r setup, include=FALSE}
# Default chunk option for the whole document: echo = TRUE, i.e. the source
# code of each chunk is shown in the rendered output
knitr::opts_chunk$set(echo = TRUE)
```

## Load packages

```{r}
# Packages needed in this document, grouped by the repository they were
# listed under (CRAN / Bioconductor)
cran_pkg <- c(
  "BiocManager", "bookdown", "dplyr", "ecodist", "ggplot2", "gridExtra",
  "kableExtra", "knitr", "scales", "vegan", "matrixStats", "stringr",
  "ComplexHeatmap"
)
bioc_pkg <- c(
  "yulab.utils", "ggtree", "ANCOMBC", "ape", "DESeq2", "DirichletMultinomial",
  "mia", "miaViz", "scater", "sechm"
)

# Names of the packages that are already installed. installed.packages()
# returns a character matrix with one row per package; `%in%` against the
# whole matrix would also match version numbers, dependency strings and
# library paths, so only the row names (the package names) are compared.
installed_pkg <- rownames(installed.packages())

# Get those packages that are already installed
cran_pkg_already_installed <- cran_pkg[cran_pkg %in% installed_pkg]
bioc_pkg_already_installed <- bioc_pkg[bioc_pkg %in% installed_pkg]

# Get those packages that need to be installed
cran_pkg_to_be_installed <- setdiff(cran_pkg, cran_pkg_already_installed)
bioc_pkg_to_be_installed <- setdiff(bioc_pkg, bioc_pkg_already_installed)

# Reorder the Bioconductor packages so that mia and miaViz come first
is_mia <- bioc_pkg %in% c("mia", "miaViz")
bioc_pkg <- c(bioc_pkg[is_mia], bioc_pkg[!is_mia])

# Combine to one vector: `packages` is everything that gets loaded,
# `packages_to_install` is the subset that is still missing
packages <- c(bioc_pkg, cran_pkg)
packages_to_install <- c(bioc_pkg_to_be_installed, cran_pkg_to_be_installed)
```

```{r}
# BiocManager is itself one of the packages that may be missing, and
# BiocManager::install() cannot be called before it exists, so install it
# directly first when needed
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# If there are (other) packages that need to be installed, install them
pending <- setdiff(packages_to_install, "BiocManager")
if (length(pending) > 0) {
  BiocManager::install(pending)
}

# Install the latest version of mia from GitHub. devtools is not part of the
# package list above, so make sure it is available before attaching it.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
library(devtools)
devtools::install_github("microbiome/mia")
```

```{r message=FALSE}
# Load all packages into the session. require() returns TRUE when a package
# was loaded and FALSE otherwise, so `loaded` is a logical vector named by
# package. vapply() pins that result type, whereas sapply() would return an
# empty list for an empty `packages` vector.
loaded <- vapply(packages, require, logical(1), character.only = TRUE)
as.data.frame(loaded)
```

## Import files

```{r}
# All input tables live in the same directory, relative to this document
data_dir <- file.path("..", "shared", "data")

# Microbiome: row data, column data and abundance table
rd_taxa_path <- file.path(data_dir, "hintikka_microbiome_rowdata.csv")
cd_taxa_path <- file.path(data_dir, "hintikka_microbiome_coldata.csv")
assay_taxa_path <- file.path(data_dir, "hintikka_microbiome_assay.csv")

# Metabolites: column data and abundance table
cd_metab_path <- file.path(data_dir, "hintikka_metabolites_coldata.csv")
assay_metab_path <- file.path(data_dir, "hintikka_metabolites_assay.csv")

# Biomarkers: column data and abundance table
cd_biom_path <- file.path(data_dir, "hintikka_biomarkers_coldata.csv")
assay_biom_path <- file.path(data_dir, "hintikka_biomarkers_assay.csv")
```

```{r}
# Each CSV file has a column named "X" that holds the row identifiers. For
# every table that column is moved to the row names and then removed.

# Row data of the microbiome experiment, as a DataFrame
rd_taxa <- read.csv(rd_taxa_path)
rownames(rd_taxa) <- rd_taxa[["X"]]
rd_taxa <- DataFrame(rd_taxa[, colnames(rd_taxa) != "X", drop = FALSE])

# Column data of the microbiome experiment, as a DataFrame
cd_taxa <- read.csv(cd_taxa_path)
rownames(cd_taxa) <- cd_taxa[["X"]]
cd_taxa <- DataFrame(cd_taxa[, colnames(cd_taxa) != "X", drop = FALSE])

# Abundance table of the microbiome experiment, as a matrix
assay_taxa <- read.csv(assay_taxa_path)
rownames(assay_taxa) <- assay_taxa[["X"]]
assay_taxa <- as.matrix(
  assay_taxa[, colnames(assay_taxa) != "X", drop = FALSE]
)

# The matrix is stored as the "counts" assay
assays <- SimpleList(counts = assay_taxa)

# Create a TreeSE from the assay, row data and column data, and print it
tse_taxa <- TreeSummarizedExperiment(
  assays = assays,
  rowData = rd_taxa,
  colData = cd_taxa
)
tse_taxa
```

```{r}
# Column data of the metabolite experiment: the "X" column holds the row
# identifiers, so it becomes the row names and is dropped as a column
cd_metab <- read.csv(cd_metab_path)
rownames(cd_metab) <- cd_metab[["X"]]
cd_metab <- DataFrame(cd_metab[, colnames(cd_metab) != "X", drop = FALSE])

# Metabolite table, handled the same way and converted into a matrix
assay_metab <- read.csv(assay_metab_path)
rownames(assay_metab) <- assay_metab[["X"]]
assay_metab <- as.matrix(
  assay_metab[, colnames(assay_metab) != "X", drop = FALSE]
)

# The matrix is stored under the assay name "counts", which later chunks
# refer to
assays <- SimpleList(counts = assay_metab)

# Create a TreeSE (no row data for this experiment) and print it
tse_metab <- TreeSummarizedExperiment(
  assays = assays,
  colData = cd_metab
)
tse_metab
```

```{r}
# Column data of the biomarker experiment: the "X" column holds the row
# identifiers, so it becomes the row names and is dropped as a column
cd_biom <- read.csv(cd_biom_path)
rownames(cd_biom) <- cd_biom[["X"]]
cd_biom <- DataFrame(cd_biom[, colnames(cd_biom) != "X", drop = FALSE])

# Biomarker table, handled the same way and converted into a matrix
assay_biom <- read.csv(assay_biom_path)
rownames(assay_biom) <- assay_biom[["X"]]
assay_biom <- as.matrix(
  assay_biom[, colnames(assay_biom) != "X", drop = FALSE]
)

# The matrix is stored under the assay name "counts"
assays <- SimpleList(counts = assay_biom)

# Create a TreeSE (no row data for this experiment) and print it
tse_biom <- TreeSummarizedExperiment(
  assays = assays,
  colData = cd_biom
)
tse_biom
```

```{r}
# Collect the three experiments under their names. The order matters: later
# chunks access them by position (1 = microbiota, 2 = metabolites).
experiments <- ExperimentList(
  list(
    microbiota = tse_taxa,
    metabolites = tse_metab,
    biomarkers = tse_biom
  )
)

# Wrap the experiments into a MultiAssayExperiment and print it
mae <- MultiAssayExperiment(experiments = experiments)
mae
```

## Exploration
```{r}
# Agglomerate microbiota data: the output of splitByRanks() is stored as the
# alternative experiments of the first (microbiota) experiment; later chunks
# read the "Genus" element from there
altExps( mae[[1]] ) <- splitByRanks( mae[[1]] ) 

# Print the dimensions of every alternative experiment
lapply( altExps(mae[[1]]), dim)

```

```{r}
# Library size: run addPerCellQC() on the microbiota experiment; the `total`
# column of its column data is plotted below
mae[[1]] <- addPerCellQC(mae[[1]])

# ggplot2 works on a plain data.frame, so convert the column data
df <- as.data.frame(colData(mae[[1]]))

# Histogram of the per-sample totals
ggplot(data = df, mapping = aes(x = total)) +
  geom_histogram() +
  theme_bw()
```
```{r}
# Log10-transform the "counts" assay of the metabolite experiment; the result
# is read back below under the assay name "log10"
mae[[2]] <- transformSamples(
  mae[[2]],
  abund_values = "counts",
  method = "log10"
)

# Per-sample totals computed from the log10 assay
mae[[2]] <- addPerCellQC(mae[[2]], exprs_values = "log10")

# ggplot2 works on a plain data.frame, so convert the column data
df <- as.data.frame(colData(mae[[2]]))

# Density of the per-sample totals, titled with the experiment name
ggplot(data = df, mapping = aes(x = total)) +
  geom_density() +
  theme_bw() +
  labs(title = names(mae)[2])
```

## Alpha diversity

```{r}
# Shannon diversity index for the microbiota experiment; it is read below
# from the `shannon` column of the column data
mae[[1]] <- estimateDiversity(mae[[1]], index = "shannon")

# ggplot2 works on a plain data.frame, so convert the column data
df <- as.data.frame(colData(mae[[1]]))

# Shannon index per Fat group: boxplot with the individual samples on top.
# Box-plot outliers are hidden because every sample is drawn by the jitter
# layer anyway.
ggplot(data = df, mapping = aes(x = Fat, y = shannon, color = Fat)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(width = 0.2)

# Compare the Fat groups with a Wilcoxon test
wilcox.test(shannon ~ Fat, data = df)
```

## Beta diversity

PCoA for microbiota

```{r}
# Relative transform; the result is read below as the "relabundance" assay
mae[[1]] <- transformSamples(mae[[1]], method = "relabundance")

# Run PCoA: multidimensional scaling where vegan::vegdist with
# method = "bray" is applied to the relative abundances
mae[[1]] <- runMDS(mae[[1]], FUN = vegan::vegdist, method = "bray", 
                   exprs_values = "relabundance")

# Visualize PCoA: plot the "MDS" reduced dimension, samples coloured by Fat
plotReducedDim(mae[[1]], "MDS", colour_by = "Fat")
```

PCA for metabolites

```{r}
# Run PCA (PCoA with euclidean distances reduces to PCA) on the "log10" assay
# of the metabolite experiment
mae[[2]] <- runMDS(mae[[2]], method = "euclidean", 
                   exprs_values = "log10")

# Visualize PCA: plot the "MDS" reduced dimension, samples coloured by Fat
plotReducedDim(mae[[2]], "MDS", colour_by = "Fat")
```

Redundancy analysis (RDA) for microbiota data

```{r}
# Explanatory variables of the redundancy analysis
variable_names <- c("Fat", "XOS")

# Number of permutations for every significance test below, so that all
# reported p-values agree with the column label of the final table
permutations <- 99

# Variables should be factors: convert exactly the columns that enter the
# model
for (variable_name in variable_names) {
  colData(mae[[1]])[[variable_name]] <-
    as.factor(colData(mae[[1]])[[variable_name]])
}

# Summarise one fitted model: constrained inertia, unconstrained inertia, the
# proportion of the total inertia that is constrained, and the permutation
# p-value of the whole model. Every value is taken from the model passed in,
# so rows of the result table never mix numbers from different fits.
# The name "unconstrainded" is kept as is, since it ends up in the column
# names of `rda_info`.
summarise_rda <- function(model, permutations) {
  model_test <- anova.cca(model, permutations = permutations)
  c(
    constrained = model$CCA$tot.chi,
    unconstrainded = model$CA$tot.chi,
    proportion = model$CCA$tot.chi / model$tot.chi,
    p_value = model_test["Model", "Pr(>F)"]
  )
}

# Create a formula with all variables
formula <- as.formula(
  paste0("data ~ ", str_c(variable_names, collapse = " + "))
)

# Perform RDA with all variables
mae[[1]] <- runRDA(mae[[1]], formula = formula,
                   abund_values = "relabundance", method = "bray",
                   scale = TRUE, na.action = na.exclude)

# Get the fitted rda object, which is stored as an attribute of the
# reduced dimension
rda <- reducedDim(mae[[1]], "RDA")
rda <- attr(rda, "rda")

# Initialize the result list and store the full model under the name "all"
rda_info <- list()
rda_info[["all"]] <- summarise_rda(rda, permutations)

# Fit one model per variable and store its summary under the variable name
for (variable_name in variable_names) {
  # Create a formula
  formula <- as.formula(paste0("data ~ ", variable_name))
  # Perform RDA
  rda_temp <- calculateRDA(mae[[1]], abund_values = "relabundance",
                           method = "bray", formula = formula,
                           scale = TRUE, na.action = na.exclude)
  rda_temp <- attr(rda_temp, "rda")
  # Add info to list
  rda_info[[variable_name]] <- summarise_rda(rda_temp, permutations)
}

# Convert into a table with one row per model
rda_info <- t(as.data.frame(rda_info))
rda_info_clean <- rda_info
# Adjust names
colnames(rda_info_clean) <- c(
  "Explained by variables",
  "Unexplained by variables",
  "Proportion expl by vars",
  paste0("P-value (PERMANOVA ", permutations, " permutations)")
)
# Print info
kable(rda_info_clean)
```

```{r}
# Load ggord for plotting. When it is missing it is installed from GitHub,
# which in turn needs devtools. requireNamespace() only checks availability;
# library() then attaches ggord and fails loudly if the installation did not
# succeed.
if (!requireNamespace("ggord", quietly = TRUE)) {
  if (!requireNamespace("devtools", quietly = TRUE)) {
    install.packages("devtools")
  }
  devtools::install_github("https://github.com/fawda123/ggord/")
}
library("ggord")

# Since na.exclude was used, if there were rows missing information, they were
# dropped off. Subset coldata so that it matches with rda.
coldata <- colData(mae[[1]])[rownames(rda$CCA$wa), ]

# Create a plot, with samples grouped by Diet. The call has no trailing
# comma, so no empty argument is passed on to ggord().
plot <- ggord(rda,
              grp_in = coldata[["Diet"]],
              alpha = 0.5,
              size = 4, addsize = -4,
              txt = 3.5, repel = TRUE)
plot
```

```{r}
# Relative transform of the genus-level data
altExp(mae[[1]], "Genus") <- transformSamples(altExp(mae[[1]], "Genus"),
                                              method = "relabundance")

# Perform RDA on the genus-level relative abundances, constrained by Diet
rda <- calculateRDA(altExp(mae[[1]], "Genus"),
                    abund_values = "relabundance", method = "bray",
                    formula = data ~ Diet, scale = TRUE,
                    na.action = na.exclude)
rda <- attr(rda, "rda")

# Add taxa info (samples in rows, genera in columns)
sppscores(rda) <- t(assay(altExp(mae[[1]], "Genus"), "relabundance"))

# Get coefficients: one row per taxon, one column per constrained axis
coef <- rda$CCA$v

# Weights on the first axis as a named vector. Taking the column up front
# keeps the code working when the model has a single constrained axis, where
# row-subsetting the matrix would drop it to a vector and a later `[, 1]`
# would fail.
axis1 <- setNames(coef[, 1], rownames(coef))

# Get the taxa with the biggest absolute weights
top.coef <- head(axis1[rev(order(abs(axis1)))], 10)
# Sort weights in increasing order
top.coef <- top.coef[order(top.coef)]
# Get top names, largest absolute weight first
top_names <- names(top.coef)[order(abs(top.coef), decreasing = TRUE)]
```

```{r}
# Horizontal bar chart of the top weights. The taxa become a factor whose
# levels follow the order of `top.coef`, so the bars keep that order instead
# of being sorted alphabetically.
taxa <- names(top.coef)
top_df <- data.frame(
  x = top.coef,
  y = factor(taxa, levels = unique(taxa))
)

ggplot(data = top_df, mapping = aes(x = x, y = y)) +
  geom_col() +
  labs(x = "", y = "", title = "Top Taxa") +
  theme_bw()
```

The largest differences between Diet groups can be attributed to `r top_names[1]`, the genus with the largest absolute weight on the first RDA axis.


## Differential abundance analysis

```{r fig.height=5, fig.width=10}
# Keep the prevalent genera of the microbiota experiment
tse <- subsetByPrevalentTaxa(
  mae[[1]],
  rank = "Genus",
  detection = 0.01,
  prevalence = 0.2
)

# ancombc is given a phyloseq object here, so convert the TreeSE first
pseq <- makePhyloseqFromTreeSummarizedExperiment(tse)

# Run ANCOM-BC with Diet as both the model formula and the grouping variable;
# p-values are adjusted with the "fdr" method
out <- ancombc(
  phyloseq = pseq,
  formula = "Diet",
  p_adj_method = "fdr",
  group = "Diet",
  global = TRUE
)
# Keep only the result tables
res <- out$res

# Significant taxa: rows of `diff_abn` with at least one positive entry
is_signif <- rowSums(res$diff_abn) > 0
signif_taxa <- rownames(res$diff_abn)[is_signif]

# Drop uncultured bacteria
signif_taxa <- grep("uncultured", signif_taxa, invert = TRUE, value = TRUE)

# Subset tse to the significant taxa
tse_temp <- tse[signif_taxa, ]

# Z transform of the relative abundances; the heatmap reads the "z" assay
tse_temp <- ZTransform(tse_temp, abund_values = "relabundance")

# Heatmap of the significant taxa, annotated and split by Diet
sechm(
  tse_temp,
  rownames(tse_temp),
  assayName = "z",
  do.scale = TRUE,
  top_annotation = "Diet",
  gaps_at = "Diet"
)
```

## Cross-correlation analysis

```{r}
# CLR transform of the genus-level relative abundances. The smallest positive
# relative abundance is used as the pseudocount.
mat <- assay(altExp(mae[[1]], "Genus"), "relabundance")
pseudocount <- min(mat[ mat >0])
altExp(mae[[1]], "Genus") <- transformSamples(altExp(mae[[1]], "Genus"), 
                                              abund_values = "relabundance", 
                                              method = "clr", pseudocount = pseudocount)

# Take prevalent genus
tse <- subsetByPrevalentTaxa(altExp(mae[[1]], "Genus"), detection = 0.01, prevalence = 0.2)

# Associate the "clr" assay of the genus data with the "log10" assay of the
# second (metabolite) experiment. With mode = "matrix" the result is used
# below through its `cor` and `p_adj` elements.
correlations <- testExperimentCrossAssociation(tse, mae[[2]], "clr", "log10", 
                                               show_warnings = FALSE, mode= "matrix")
```

```{r fig.height=10, fig.width=10}
# Create a heatmap of the correlations and store it
plot <- Heatmap(
  correlations$cor,
  # Mark cells whose adjusted p-value is below the threshold. cell_fun draws
  # one cell per call, so the condition is a scalar and uses `&&`, which also
  # skips the comparison when the p-value is NA.
  cell_fun = function(j, i, x, y, width, height, fill) {
    p_adj <- correlations$p_adj[i, j]
    if (!is.na(p_adj) && p_adj < 0.05) {
      # Print "X"
      grid.text("X", x, y, gp = gpar(fontsize = 8, col = "black"))
    }
  },
  heatmap_legend_param = list(title = "", legend_height = unit(5, "cm"))
)
plot
```