-
Notifications
You must be signed in to change notification settings - Fork 0
/
Dada_pipeline_in_R.qmd
294 lines (233 loc) · 9.1 KB
/
Dada_pipeline_in_R.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
---
title: "Dada2 pipeline in R"
author: "Marko Suokas"
format: pdf
pdf-engine: lualatex
editor: visual
mainfont: Aptos
monofont: PT Mono
always_allow_html: yes
header-includes:
\usepackage[dvipsnames]{xcolor}
\definecolor{darkblue}{rgb}{0.0, 0.0, 0.55}
\definecolor{maroon}{rgb}{0.5, 0.0, 0.0}
\definecolor{ivory}{rgb}{1.0, 1.0, 0.94}
\definecolor{indigo}{rgb}{0.29, 0.0, 0.51}
---
```{r, include = FALSE}
# Chunk hook that wraps chunk output in a LaTeX size command (e.g. \tiny)
# when a chunk sets a non-default `size` option; output stays \normalsize
# otherwise.
def.chunk.hook <- knitr::knit_hooks$get("chunk")
knitr::knit_hooks$set(chunk = function(x, options) {
  x <- def.chunk.hook(x, options)
  # Plain if/else instead of scalar ifelse(): clearer, and it cannot
  # return logical(0) if `options$size` is ever unset (NULL)
  if (!is.null(options$size) && options$size != "normalsize") {
    paste0("\n \\", options$size, "\n\n", x, "\n\n \\normalsize")
  } else {
    x
  }
})
```
#### Load libraries
```{r libraries, warning=FALSE, message=FALSE, size = "tiny"}
# Attach each package and report the version used for this analysis
library(dada2)
packageVersion("dada2")
library(knitr)
packageVersion("knitr")
library(Biostrings)
packageVersion("Biostrings")
library(DECIPHER)
packageVersion("DECIPHER")
library(phyloseq)
packageVersion("phyloseq")
library(tidyverse)
packageVersion("tidyverse")
library(kableExtra)
packageVersion("kableExtra")
library(patchwork)
packageVersion("patchwork")
```
\newpage
#### Parameters
```{r parameters, warning = FALSE, message = FALSE, size = "tiny"}
# Path variables
path <- "data/reads"
training <- "~/feature_classifiers/SILVA_SSU_r138_2019.RData"
meta_file <- "data/metadata.tsv"
exportloc <- "result_tables"
# Truncation length and phiX removal toggle (Illumina)
truncation <- 245
phi <- FALSE
# Name of first column in metadata file
meta_1stcol <- "Sampleid"
# Create output directories. showWarnings = FALSE keeps re-renders silent
# when the directories already exist; "rds" is required by the saveRDS()
# calls in later chunks but was never created anywhere.
dir.create(exportloc, showWarnings = FALSE)
dir.create("rds", showWarnings = FALSE)
```
#### Sample lists
```{r lists, warning=FALSE, message=FALSE, size="tiny"}
#List files in path
list.files(path)
#Filenames have format: SAMPLENAME_R1_001.fastq
fnFs <- sort(list.files(path, pattern = "_R1_001.fastq", full.names = TRUE))
# Extract sample names (text before the first underscore). vapply() is
# type-safe where sapply() could silently return a list on ragged input.
sample.names <- vapply(strsplit(basename(fnFs), "_"), `[[`, character(1), 1)
#Filtered files will be placed in filtered/ subdirectory
filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
names(filtFs) <- sample.names
```
**Tip:** If you have numbered samples, use a zero-padded format (e.g. 01, 02, …, 10). Otherwise the samples will not sort in numeric order.
\newpage
#### Quality profile
```{r qplot, warning=FALSE, message=FALSE, size = "tiny", fig.dim=c(7,7)}
# Base quality plot of up to the first 6 samples. head() avoids the NA
# entries that fnFs[1:6] would produce when fewer than 6 files exist.
plotQualityProfile(head(fnFs, 6))
```
\newpage
### Filtering and trimming reads
```{r filtering, warning=FALSE, message=FALSE, eval = FALSE, size = "tiny"}
# Filtered files will be placed in filtered/ subdirectory.
# truncLen uses the `truncation` parameter declared in the parameters chunk
# (it was hard-coded to 245 here, duplicating that setting).
# NOTE(review): `phi` was declared but never used; wiring it to rm.phix
# honors the declared parameter — confirm the intended value (dada2's
# default is rm.phix = TRUE).
out <- filterAndTrim(fnFs, filtFs, truncLen = truncation,
                     maxN = 0, maxEE = 2, truncQ = 2, rm.phix = phi,
                     compress = TRUE, multithread = FALSE)
# Output is saved to an rds file so we don't have to recalculate.
# If you are making changes to this chunk, set eval = TRUE.
if (!dir.exists("rds")) dir.create("rds")
saveRDS(out, "rds/out.rds")
```
```{r rds1, warning = FALSE, message = FALSE, size = "tiny"}
# Restore the cached filtering summary produced by the chunk above
out <- readRDS(file = "rds/out.rds")
```
**Considerations:** The standard parameters are starting points. If you want to speed up downstream computation, consider tightening `maxEE`. If too few reads are passing the filter, consider relaxing `maxEE`.
For ITS sequencing, it is usually undesirable to truncate reads to a fixed length due to the large length variation at that locus. In that case, you can omit the `truncLen` parameter.
#### Learn error rates
This step determines the error rate of the dataset using the *learnErrors* function.
```{r learnerrors, warning=FALSE, message=FALSE, size = "tiny", eval = FALSE}
# Learn the error model from the filtered forward reads
errF <- learnErrors(filtFs, multithread = TRUE)
# Cache the model so later renders can skip this expensive step
saveRDS(object = errF, file = "rds/errF.rds")
```
```{r rds2, warning = FALSE, message = FALSE, size = "tiny"}
# Reload the cached forward-read error model
errF <- readRDS(file = "rds/errF.rds")
```
\newpage
#### Plot error profiles
```{r errorplot, warning=FALSE, message=FALSE, size = "tiny", fig.dim = c(6.5,6)}
# Observed vs. fitted error rates for the forward reads;
# nominalQ = TRUE overlays the rates expected from the nominal Q scores
plotErrors(errF, nominalQ = TRUE)
```
\newpage
#### Denoise data
```{r denoise, warning=FALSE, message=FALSE, eval = FALSE, size = "tiny"}
# Denoise the filtered reads using the learned error model
dadaFs <- dada(filtFs, err = errF, multithread = TRUE)
# Cache the denoised result
saveRDS(object = dadaFs, file = "rds/dadaFs.rds")
```
```{r rds3, warning = FALSE, message = FALSE, size = "tiny"}
# Reload the cached denoising result
dadaFs <- readRDS(file = "rds/dadaFs.rds")
```
#### Create ASV table
```{r asvtable, warning=FALSE, message=FALSE, size = "tiny"}
# Build the sample-by-variant (ASV) count table
seqtab <- makeSequenceTable(dadaFs)
# Report its dimensions: samples x variants
dim(seqtab)
```
#### Remove chimeric variants
```{r chimeras, warning=FALSE, message=FALSE, size = "tiny"}
# Drop chimeric variants identified by per-sample consensus
seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus",
                                    multithread = TRUE, verbose = TRUE)
# Dimensions after chimera removal
dim(seqtab.nochim)
```
\newpage
#### Summary
The summary can help to pinpoint whether an abnormal amount of data is lost at some stage.
```{r summary, warning=FALSE, message=FALSE, size = "small"}
# Total read count in a dada2 object
getN <- function(x) sum(getUniques(x))
# Track read counts through each pipeline stage. vapply() is type-safe
# where sapply() could silently change its return type.
# If processing a single sample, replace vapply(dadaFs, getN, numeric(1))
# with getN(dadaFs).
track <- cbind(out, vapply(dadaFs, getN, numeric(1)),
               rowSums(seqtab.nochim), rowSums(seqtab.nochim != 0))
colnames(track) <- c("Input", "Filtered", "Denoised", "Nonchimeric", "Variants")
rownames(track) <- sample.names
kable(track, caption="Denoising summary") %>%
  kable_styling(latex_options = c("HOLD_position","striped")) %>%
  row_spec(0, background="indigo", color="ivory")
```
\newpage
#### Assign taxonomy
We use idTaxa from DECIPHER and Silva database to assign taxonomic information.
```{r taxonomy, warning = FALSE, message = FALSE, size = "tiny", eval = FALSE}
#Create a DNAStringSet from the ASVs
sequences <- DNAStringSet(getSequences(seqtab.nochim))
# Load the IDTAXA training set from the path declared in the parameters
# chunk (it was hard-coded here, duplicating that setting)
load(training)
# Classify each ASV; strand = "top" assumes reads share the reference
# orientation
ids <- IdTaxa(sequences, trainingSet, strand="top", processors = 3, verbose = FALSE)
ranks <- c("domain", "phylum", "class", "order", "family", "genus", "species")
# Convert the "Taxa" output to a matrix analogous to assignTaxonomy();
# vapply() guarantees a character matrix even on edge-case input
taxid <- t(vapply(ids, function(x) {
  m <- match(ranks, x$rank)
  taxa <- x$taxon[m]
  # Ranks the classifier could not resolve become NA
  taxa[startsWith(taxa, "unclassified_")] <- NA
  taxa
}, character(length(ranks))))
colnames(taxid) <- ranks; rownames(taxid) <- getSequences(seqtab.nochim)
#save end result to rds
saveRDS(taxid, "rds/taxid.rds")
```
```{r, warning = FALSE, message = FALSE, size = "tiny"}
# Reload the cached taxonomy matrix
taxid <- readRDS(file = "rds/taxid.rds")
```
#### Create phyloseq object
```{r phyloseq, warning=FALSE, message=FALSE, size = "tiny"}
# Read the sample metadata, move the id column to rownames, and assemble
# the phyloseq object. The file path and id-column name come from the
# parameters chunk instead of being hard-coded here.
samples_meta <- read_tsv(meta_file, show_col_types = FALSE)
samples_meta <- samples_meta %>% tibble::column_to_rownames(meta_1stcol)
sampletable <- sample_data(samples_meta)
pseq <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows = FALSE),
                 tax_table(taxid),
                 sampletable)
#Viewing basic information of pseq object
pseq
```
\newpage
Our variant sequences are currently stored as taxa names. They will be moved to the refseq slot, and the taxa names will be replaced with a more convenient format.
```{r, warning = FALSE, message = FALSE, size = "tiny"}
# Move the variant sequences into the refseq slot and rename taxa to
# ASV1, ASV2, ...
repseq <- Biostrings::DNAStringSet(taxa_names(pseq))
# Names must match the current taxa names for merge_phyloseq() to attach
names(repseq) <- taxa_names(pseq)
pseq <- merge_phyloseq(pseq, repseq)
# seq_len() is safe even in the degenerate zero-taxa case (seq(0) is not)
taxa_names(pseq) <- paste0("ASV", seq_len(ntaxa(pseq)))
pseq
```
Finally, minor modifications to the dataset. The number of taxa lost is checked at each step.
```{r, warning = FALSE, message = FALSE, size = "tiny"}
#We capitalise taxonomic rank names
colnames(tax_table(pseq)) <- c("Kingdom", "Phylum", "Class",
                               "Order", "Family", "Genus", "Species")
# Sample18 negative control is unnecessary as there is nothing to investigate
pseq <- subset_samples(pseq, Name != "control")
pseq
# Keep taxa classified at Kingdom rank. Unclassified entries are real NA
# values (set in the IdTaxa chunk), so they must be tested with is.na();
# the string comparison against "NA" alone would evaluate to NA for them.
pseq <- subset_taxa(pseq, !is.na(Kingdom) & Kingdom != "NA")
pseq
# Keeping all that are not Chloroplastic at Order rank
pseq <- subset_taxa(pseq, Order != "Chloroplast" | is.na(Order))
pseq
# Keeping all that are not Mitochondrial at Family rank
pseq <- subset_taxa(pseq, Family != "Mitochondria" | is.na(Family))
pseq
```
In the end we have 17 samples and 1417 taxa
\newpage
#### Writing data
Last step is to save data to suitable file formats.
All variant sequences are saved to a fasta file.
```{r, warning = FALSE, message = FALSE, size = "tiny"}
# Write every variant sequence to a fasta file in the results directory
writeXStringSet(refseq(pseq), paste0(exportloc,"/repseq.fasta"), append=FALSE,
                compress=FALSE, format="fasta")
```
Taxonomy table is converted to dataframe and written as tsv
```{r, warning = FALSE, message = FALSE, size = "tiny"}
# Convert the taxonomy table to a data frame and keep the ASV identifiers
# as an explicit column: write_tsv() silently drops rownames, so without
# this the exported rows could not be matched back to their ASVs.
taxonomy <- as.data.frame(tax_table(pseq)) %>%
  tibble::rownames_to_column("ASV")
write_tsv(taxonomy, paste0(exportloc, "/taxonomy.tsv"))
```
For metadata we add a sampleid column and write it as tsv.
```{r, warning = FALSE, message = FALSE, size = "tiny"}
# Export sample metadata with an explicit sample id column, because
# write_tsv() silently drops rownames.
sampleid <- sample_names(pseq)
metafile <- sample_data(pseq)
# data.frame() captures the variable name "sampleid" as the column name
# and coerces the sample_data object into plain columns — renaming either
# variable would change the output file.
metadf <- data.frame(sampleid,metafile)
write_tsv(metadf, paste0(exportloc, "/metadata.tsv"))
```
ASV count data need to be transposed prior writing
```{r, warning = FALSE, message = FALSE, size = "tiny"}
# Export ASV counts. The otu_table was built with taxa_are_rows = FALSE
# (samples x taxa), so transpose to get one row per ASV.
ASV_names <- taxa_names(pseq)
ASV_counts <- t(otu_table(pseq))
# data.frame() captures "ASV_names" as the first column name; since
# write_tsv() drops rownames, this column is what identifies each ASV row.
ASVdf <- (data.frame(ASV_names,ASV_counts))
write_tsv(ASVdf, paste0(exportloc, "/fishery_asvs.tsv"))
```