-
Notifications
You must be signed in to change notification settings - Fork 0
/
Dada_pipeline_in_R.qmd
294 lines (233 loc) · 9.1 KB
/
Dada_pipeline_in_R.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
---
title: "Dada2 pipeline in R"
author: "Marko Suokas"
format: pdf
pdf-engine: lualatex
editor: visual
mainfont: Aptos
monofont: PT Mono
always_allow_html: yes
header-includes:
\usepackage[dvipsnames]{xcolor}
\definecolor{darkblue}{rgb}{0.0, 0.0, 0.55}
\definecolor{maroon}{rgb}{0.5, 0.0, 0.0}
\definecolor{ivory}{rgb}{1.0, 1.0, 0.94}
\definecolor{indigo}{rgb}{0.29, 0.0, 0.51}
---
```{r, include = FALSE}
# Chunk hook that wraps chunk output in a LaTeX size command (e.g. \tiny)
# when a chunk sets a non-default `size` option; output stays \normalsize
# otherwise.
def.chunk.hook <- knitr::knit_hooks$get("chunk")
knitr::knit_hooks$set(chunk = function(x, options) {
  x <- def.chunk.hook(x, options)
  # Plain if/else instead of scalar ifelse(): clearer, and it cannot
  # return logical(0) if `options$size` is ever unset (NULL)
  if (!is.null(options$size) && options$size != "normalsize") {
    paste0("\n \\", options$size, "\n\n", x, "\n\n \\normalsize")
  } else {
    x
  }
})
```
#### Load libraries
```{r libraries, warning=FALSE, message=FALSE, size = "tiny"}
# Attach each package and report the version used for this analysis
library(dada2)
packageVersion("dada2")
library(knitr)
packageVersion("knitr")
library(Biostrings)
packageVersion("Biostrings")
library(DECIPHER)
packageVersion("DECIPHER")
library(phyloseq)
packageVersion("phyloseq")
library(tidyverse)
packageVersion("tidyverse")
library(kableExtra)
packageVersion("kableExtra")
library(patchwork)
packageVersion("patchwork")
```
\newpage
#### Parameters
```{r parameters, warning = FALSE, message = FALSE, size = "tiny"}
# Path variables
path <- "data/reads"
training <- "~/feature_classifiers/SILVA_SSU_r138_2019.RData"
meta_file <- "data/metadata.tsv"
exportloc <- "result_tables"
# Truncation length and phiX removal toggle (Illumina)
truncation <- 245
phi <- FALSE
# Name of first column in metadata file
meta_1stcol <- "Sampleid"
# Create output directories. showWarnings = FALSE keeps re-renders silent
# when the directories already exist; "rds" is required by the saveRDS()
# calls in later chunks but was never created anywhere.
dir.create(exportloc, showWarnings = FALSE)
dir.create("rds", showWarnings = FALSE)
```
#### Sample lists
```{r lists, warning=FALSE, message=FALSE, size="tiny"}
#List files in path
list.files(path)
#Filenames have format: SAMPLENAME_R1_001.fastq
fnFs <- sort(list.files(path, pattern = "_R1_001.fastq", full.names = TRUE))
# Extract sample names (text before the first underscore). vapply() is
# type-safe where sapply() could silently return a list on ragged input.
sample.names <- vapply(strsplit(basename(fnFs), "_"), `[[`, character(1), 1)
#Filtered files will be placed in filtered/ subdirectory
filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
names(filtFs) <- sample.names
```
**Tip:** If you have numbered samples, use a zero-padded format (e.g. 01, 02, …, 10). Otherwise the samples will not sort in numeric order.
\newpage
#### Quality profile
```{r qplot, warning=FALSE, message=FALSE, size = "tiny", fig.dim=c(7,7)}
# Base quality plot of up to the first 6 samples. head() avoids the NA
# entries that fnFs[1:6] would produce when fewer than 6 files exist.
plotQualityProfile(head(fnFs, 6))
```
\newpage
### Filtering and trimming reads
```{r filtering, warning=FALSE, message=FALSE, eval = FALSE, size = "tiny"}
# Filtered files will be placed in filtered/ subdirectory.
# truncLen uses the `truncation` parameter declared in the parameters chunk
# (it was hard-coded to 245 here, duplicating that setting).
# NOTE(review): `phi` was declared but never used; wiring it to rm.phix
# honors the declared parameter — confirm the intended value (dada2's
# default is rm.phix = TRUE).
out <- filterAndTrim(fnFs, filtFs, truncLen = truncation,
                     maxN = 0, maxEE = 2, truncQ = 2, rm.phix = phi,
                     compress = TRUE, multithread = FALSE)
# Output is saved to an rds file so we don't have to recalculate.
# If you are making changes to this chunk, set eval = TRUE.
if (!dir.exists("rds")) dir.create("rds")
saveRDS(out, "rds/out.rds")
```
```{r rds1, warning = FALSE, message = FALSE, size = "tiny"}
# Restore the cached filtering summary produced by the chunk above
out <- readRDS(file = "rds/out.rds")
```
**Considerations:** The standard parameters are starting points. If you want to speed up downstream computation, consider tightening `maxEE`. If too few reads are passing the filter, consider relaxing `maxEE`.
For ITS sequencing, it is usually undesirable to truncate reads to a fixed length due to the large length variation at that locus. In that case, you can omit the `truncLen` parameter.
#### Learn error rates
This step determines the error rate of the dataset using the *learnErrors* function.
```{r learnerrors, warning=FALSE, message=FALSE, size = "tiny", eval = FALSE}
# Learn the error model from the filtered forward reads
errF <- learnErrors(filtFs, multithread = TRUE)
# Cache the model so later renders can skip this expensive step
saveRDS(object = errF, file = "rds/errF.rds")
```
```{r rds2, warning = FALSE, message = FALSE, size = "tiny"}
# Reload the cached forward-read error model
errF <- readRDS(file = "rds/errF.rds")
```
\newpage
#### Plot error profiles
```{r errorplot, warning=FALSE, message=FALSE, size = "tiny", fig.dim = c(6.5,6)}
# Observed vs. fitted error rates for the forward reads;
# nominalQ = TRUE overlays the rates expected from the nominal Q scores
plotErrors(errF, nominalQ = TRUE)
```
\newpage
#### Denoise data
```{r denoise, warning=FALSE, message=FALSE, eval = FALSE, size = "tiny"}
# Denoise the filtered reads using the learned error model
dadaFs <- dada(filtFs, err = errF, multithread = TRUE)
# Cache the denoised result
saveRDS(object = dadaFs, file = "rds/dadaFs.rds")
```
```{r rds3, warning = FALSE, message = FALSE, size = "tiny"}
# Reload the cached denoising result
dadaFs <- readRDS(file = "rds/dadaFs.rds")
```
#### Create ASV table
```{r asvtable, warning=FALSE, message=FALSE, size = "tiny"}
# Build the sample-by-variant (ASV) count table
seqtab <- makeSequenceTable(dadaFs)
# Report its dimensions: samples x variants
dim(seqtab)
```
#### Remove chimeric variants
```{r chimeras, warning=FALSE, message=FALSE, size = "tiny"}
# Drop chimeric variants identified by per-sample consensus
seqtab.nochim <- removeBimeraDenovo(seqtab, method = "consensus",
                                    multithread = TRUE, verbose = TRUE)
# Dimensions after chimera removal
dim(seqtab.nochim)
```
\newpage
#### Summary
The summary can help to pinpoint whether an abnormal amount of data is lost at some stage.
```{r summary, warning=FALSE, message=FALSE, size = "small"}
# Total read count in a dada2 object
getN <- function(x) sum(getUniques(x))
# Track read counts through each pipeline stage. vapply() is type-safe
# where sapply() could silently change its return type.
# If processing a single sample, replace vapply(dadaFs, getN, numeric(1))
# with getN(dadaFs).
track <- cbind(out, vapply(dadaFs, getN, numeric(1)),
               rowSums(seqtab.nochim), rowSums(seqtab.nochim != 0))
colnames(track) <- c("Input", "Filtered", "Denoised", "Nonchimeric", "Variants")
rownames(track) <- sample.names
kable(track, caption="Denoising summary") %>%
  kable_styling(latex_options = c("HOLD_position","striped")) %>%
  row_spec(0, background="indigo", color="ivory")
```
\newpage
#### Assign taxonomy
We use idTaxa from DECIPHER and Silva database to assign taxonomic information.
```{r taxonomy, warning = FALSE, message = FALSE, size = "tiny", eval = FALSE}
#Create a DNAStringSet from the ASVs
sequences <- DNAStringSet(getSequences(seqtab.nochim))
# Load the IDTAXA training set from the path declared in the parameters
# chunk (it was hard-coded here, duplicating that setting)
load(training)
# Classify each ASV; strand = "top" assumes reads share the reference
# orientation
ids <- IdTaxa(sequences, trainingSet, strand="top", processors = 3, verbose = FALSE)
ranks <- c("domain", "phylum", "class", "order", "family", "genus", "species")
# Convert the "Taxa" output to a matrix analogous to assignTaxonomy();
# vapply() guarantees a character matrix even on edge-case input
taxid <- t(vapply(ids, function(x) {
  m <- match(ranks, x$rank)
  taxa <- x$taxon[m]
  # Ranks the classifier could not resolve become NA
  taxa[startsWith(taxa, "unclassified_")] <- NA
  taxa
}, character(length(ranks))))
colnames(taxid) <- ranks; rownames(taxid) <- getSequences(seqtab.nochim)
#save end result to rds
saveRDS(taxid, "rds/taxid.rds")
```
```{r, warning = FALSE, message = FALSE, size = "tiny"}
# Reload the cached taxonomy matrix
taxid <- readRDS(file = "rds/taxid.rds")
```
#### Create phyloseq object
```{r phyloseq, warning=FALSE, message=FALSE, size = "tiny"}
# Read the sample metadata, move the id column to rownames, and assemble
# the phyloseq object. The file path and id-column name come from the
# parameters chunk instead of being hard-coded here.
samples_meta <- read_tsv(meta_file, show_col_types = FALSE)
samples_meta <- samples_meta %>% tibble::column_to_rownames(meta_1stcol)
sampletable <- sample_data(samples_meta)
pseq <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows = FALSE),
                 tax_table(taxid),
                 sampletable)
#Viewing basic information of pseq object
pseq
```
\newpage
Our variant sequences are currently stored as taxa names. They will be moved to the refseq slot, and the taxa names will be replaced with a more convenient format.
```{r, warning = FALSE, message = FALSE, size = "tiny"}
# Move the variant sequences into the refseq slot and rename taxa to
# ASV1, ASV2, ...
repseq <- Biostrings::DNAStringSet(taxa_names(pseq))
# Names must match the current taxa names for merge_phyloseq() to attach
names(repseq) <- taxa_names(pseq)
pseq <- merge_phyloseq(pseq, repseq)
# seq_len() is safe even in the degenerate zero-taxa case (seq(0) is not)
taxa_names(pseq) <- paste0("ASV", seq_len(ntaxa(pseq)))
pseq
```
Finally, minor modifications to the dataset. The number of taxa lost is checked at each step.
```{r, warning = FALSE, message = FALSE, size = "tiny"}
#We capitalise taxonomic rank names
colnames(tax_table(pseq)) <- c("Kingdom", "Phylum", "Class",
                               "Order", "Family", "Genus", "Species")
# Sample18 negative control is unnecessary as there is nothing to investigate
pseq <- subset_samples(pseq, Name != "control")
pseq
# Keep taxa classified at Kingdom rank. Unclassified entries are real NA
# values (set in the IdTaxa chunk), so they must be tested with is.na();
# the string comparison against "NA" alone would evaluate to NA for them.
pseq <- subset_taxa(pseq, !is.na(Kingdom) & Kingdom != "NA")
pseq
# Keeping all that are not Chloroplastic at Order rank
pseq <- subset_taxa(pseq, Order != "Chloroplast" | is.na(Order))
pseq
# Keeping all that are not Mitochondrial at Family rank
pseq <- subset_taxa(pseq, Family != "Mitochondria" | is.na(Family))
pseq
```
In the end we have 17 samples and 1417 taxa
\newpage
#### Writing data
Last step is to save data to suitable file formats.
All variant sequences are saved to a fasta file.
```{r, warning = FALSE, message = FALSE, size = "tiny"}
# Write every variant sequence to a fasta file in the results directory
writeXStringSet(refseq(pseq), paste0(exportloc,"/repseq.fasta"), append=FALSE,
                compress=FALSE, format="fasta")
```
Taxonomy table is converted to dataframe and written as tsv
```{r, warning = FALSE, message = FALSE, size = "tiny"}
# Convert the taxonomy table to a data frame and keep the ASV identifiers
# as an explicit column: write_tsv() silently drops rownames, so without
# this the exported rows could not be matched back to their ASVs.
taxonomy <- as.data.frame(tax_table(pseq)) %>%
  tibble::rownames_to_column("ASV")
write_tsv(taxonomy, paste0(exportloc, "/taxonomy.tsv"))
```
For metadata we add a sampleid column and write it as tsv.
```{r, warning = FALSE, message = FALSE, size = "tiny"}
# Export sample metadata with an explicit sample id column, because
# write_tsv() silently drops rownames.
sampleid <- sample_names(pseq)
metafile <- sample_data(pseq)
# data.frame() captures the variable name "sampleid" as the column name
# and coerces the sample_data object into plain columns — renaming either
# variable would change the output file.
metadf <- data.frame(sampleid,metafile)
write_tsv(metadf, paste0(exportloc, "/metadata.tsv"))
```
ASV count data need to be transposed prior writing
```{r, warning = FALSE, message = FALSE, size = "tiny"}
# Export ASV counts. The otu_table was built with taxa_are_rows = FALSE
# (samples x taxa), so transpose to get one row per ASV.
ASV_names <- taxa_names(pseq)
ASV_counts <- t(otu_table(pseq))
# data.frame() captures "ASV_names" as the first column name; since
# write_tsv() drops rownames, this column is what identifies each ASV row.
ASVdf <- (data.frame(ASV_names,ASV_counts))
write_tsv(ASVdf, paste0(exportloc, "/fishery_asvs.tsv"))
```