-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWLE_sxt_notebook4_MAGPresenceAbsence.Rmd
810 lines (665 loc) · 39.5 KB
/
WLE_sxt_notebook4_MAGPresenceAbsence.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
---
title: "Examination of MAG presence in samples"
author: "Paul Den Uyl"
date: "2023-11-12"
output: html_document
---
```{r setup, include=FALSE}
#rm(list=ls());if(is.null(dev.list()["RStudioGD"])){} else {dev.off(dev.list()["RStudioGD"])};cat("\014")
library(tidyverse)
library(dplyr)
library(here)
library(vroom)
library(ggmap)
library(googledrive)
library(readxl)
library(Biostrings)
library(lubridate)
library(ggplot2)
library(ggpmisc)
library(Rsamtools)
library(ggnewscale)
library(cowplot)
library(furrr)
knitr::opts_knit$set(root.dir = here::here(""))
#Load postgres server running on Alpena
pg <- DBI::dbConnect(RPostgres::Postgres(),dbname = "XXX", host = "XXX", port = "XXX", user = "XXX", password = "XXX")
#Load GLAMR GTDBTK data
gtdb <- tbl(pg, "GTDB") %>%
collect() %>%
mutate(sample = str_extract(user_genome, "samp_\\d+")) %>%
relocate(user_genome, sample)
```
---Outline---
1.) symlink samples for mapping MAG coverage check
2.) Perform read mapping to this study's reference MAGs
3.) Import reference MAG IDs and sequences for steps 4 and 5
4.) Analyze rep mags across GLAMR - RPKM
5.) Analyze rep mags across GLAMR - Coverage and %ID
6.) Plot GLAMR bam rpkm data from ref mags - heat map
7.) Perform coverage check on specificity of mapping --- rep MAGs LE20-WE8 & LE16-WE8-Aug
8.) Analysis for manuscript statements
-------------
1.) symlink samples for mapping MAG coverage check
```{r}
#Identify samples by set (i.e. project) of interest in GLAMR - Western Lake Erie only
StudyID_to_use_WLE <- c("set_17", "set_18", "set_35", "set_36", "set_38", "set_40", "set_41", "set_42", "set_46", "set_51", "set_56")
GLAMR_table_import <- read_xlsx("data/Great_Lakes_Omics_Datasets_Oct27_23.xlsx", sheet = "samples") #Import GLAMR metadata
##
#IMPORTANT NOTE
##
#Dataset includes all relevant samples. In future code, updated sample tables may be used to add relevant information (i.e. collection date, station)
#Done in heat map chunk
GLAMR_sample_table <- GLAMR_table_import %>%
mutate(geo_loc_name = if_else(SampleID == "samp_4333",
"Lake Erie", geo_loc_name)) %>% #change geo_loc_name from NA to Lake Erie for samp_4333 (KEEP!)
filter(StudyID %in% StudyID_to_use_WLE, #include only StudyID of interest (WLE)
sample_type == "metagenome", #include only data type of interest (metagenome)
!(is.na(NOAA_Site) & is.na(geo_loc_name)), #remove samples without NOAA_Site and geo_loc_name
!(NOAA_Site %in% c("NA", "NF") & geo_loc_name %in% c("NA", "NF")), #remove samples without NOAA_Site and geo_loc_name
!grepl("SB.*", NOAA_Site), #remove samples from Sag. Bay
SampleID != "samp_4304") #remove sample from Thames River
GLAMR_sample_table %>% distinct(SampleID) %>% count #570 WLE metagenome samples in GLAMR - October 27, 2023
#Gather all WLE GLAMR sample raw read paths, sym link to new path
read_paths_raw_WLE <- system("ls /geomicro/data2/kiledal/GLAMR/data/omics/metagenomes/*/reads/decon_*_reads_fastp.fastq.gz",intern = TRUE) %>%
tibble(read_path = .) %>% mutate(sample = read_path %>% str_remove(".*metagenomes/") %>% str_remove("/reads.*"),
dir = read_path %>% str_remove(".*/decon_") %>% str_remove("_reads.*"),
new_path = str_glue("/geomicro/data2/pdenuyl2/neurotoxin_thesis/FINAL2/mapping/reads/GLAMR_WLERef_mm2/reads/{sample}__{dir}.fastq.gz")) %>%
filter(sample %in% GLAMR_sample_table$SampleID)
#symbolic link read_paths
file.symlink(read_paths_raw_WLE$read_path, read_paths_raw_WLE$new_path)
```
2.) Perform read mapping to this study's reference MAGs
```{bash}
#Database WLERefCat3.fa - concatenated bins:
conda config --set channel_priority strict
snakemake run_read_mapping_GLAMRraw_WLERef_mm2 --profile config/snakemake_profiles/alpena/
```
3.) Import reference MAG IDs and sequences for steps 4 and 5
```{r}
# MAG ID label import
ref_mag_id <- c("samp_468", "samp_471", "samp_477", "samp_481", "samp_507", "samp_2059", "samp_2073", "samp_3937", "samp_4305", "samp_4344", "samp_4380") # identify MAG names
pattern <- paste(ref_mag_id, collapse = "|") # allow them to be extracted individually through pattern matching
# import reference MAGs and calculate length
ADA_MAG_ref <- Biostrings::readDNAStringSet("mapping/reads/GLAMR_WLERef_mm2/database/WLERefCat3.fa") %>% # import reference fasta
as.character() %>%
data.frame(ref_base = ., seqnames = names(.)) %>%
mutate(mag = str_extract(seqnames, pattern), # label MAG names
seq_length = nchar(ref_base)) # get length of each contig
```
4.) Analyze rep mags across GLAMR - RPKM
```{r}
ref_mag_length <- ADA_MAG_ref %>%
group_by(mag) %>%
mutate(mag_length = sum(seq_length)) %>% # get length of each MAG
reframe(mag = unique(mag),
mag_length = unique(mag_length))
# function to generate mag rpkm for each sample
ref_mag_bam_rpkm <- function(bam_path){
# identify sample_id from bam file path
sample_id <- bam_path %>% str_remove(".*GLAMR_WLERef_mm2/output/bam/") %>% str_remove("_WLERefCat.*")
# get total sequencing read counts for sample from GLAMR
read_count_fastp <- read_tsv(paste0("/geomicro/data2/kiledal/GLAMR/data/omics/metagenomes/",sample_id,"/reads/",sample_id,"_read_count_fastp.tsv"))
read_count_decon_fwd <- read_count_fastp[4, 2]
read_count_decon_rev <- read_count_fastp[4, 3]
total_decon_sample_reads <- sum(read_count_decon_fwd, read_count_decon_rev) # get total read count from GLAMR (all fwd and rev reads)
# read in bam file
bam <- Rsamtools::BamFile(file = bam_path,
index = paste0(bam_path,".bai"))
# read the BAM file and count aligned reads
bam_file_df <- as.data.frame(scanBam(bam))
aligned_read_counts <- as.data.frame(table(bam_file_df$rname))
mapped_reads_df <- aligned_read_counts %>%
mutate(mag = str_extract(Var1, pattern)) %>% # label MAG names
group_by(mag) %>%
mutate(mapped_reads = sum(Freq)) %>% # calculate number of mapped reads per MAG
distinct(mag,.keep_all = TRUE) %>%
select(-Freq) # remove 'Freq' after distinct(), because not relevant for entire MAG
rpkm <- left_join(mapped_reads_df, ref_mag_length, by = "mag") %>%
mutate(total_decon_sample_reads, # add total_decon_sample_reads determined earlier in function
sample_id, # add sample_id determined earlier in function
rpkm = (mapped_reads / (mag_length * total_decon_sample_reads)) * 1000000) %>% # Calculate RPKM
select(-Var1) # remove 'Freq' after distinct(), because not relevant for entire MAG
}
# create list of bam paths to process
bam_paths_rpkm <- system("ls mapping/reads/GLAMR_WLERef_mm2/output/bam/*.bam", intern = TRUE) %>%
tibble(bam_path = .) #%>%
#filter(!(bam_path %in% c("mapping/reads/GLAMR_WLERef_mm2/output/bam/samp_40_WLERefCat3_mapped.bam")))
# Error in next step caused by: '/geomicro/data2/kiledal/GLAMR/data/omics/metagenomes/samp_4306/reads/samp_4306_read_count_fastp.tsv' does not exist. Also samp_4333.
##Actually, samp_40 doesn't have any read counts, so I'll add them manually!
#40 -
# function to generate mag rpkm for samp_40
bam_path <- "mapping/reads/GLAMR_WLERef_mm2/output/bam/samp_40_WLERefCat3_mapped.bam"
# identify sample_id from bam file path
sample_id <- bam_path %>% str_remove(".*GLAMR_WLERef_mm2/output/bam/") %>% str_remove("_WLERefCat.*")
# get total sequencing read counts for sample from GLAMR
#command: printf "decon_reads\t$(($(zcat decon_fwd_reads_fastp.fastq.gz | wc -l) / 4 ))\t$(($(zcat decon_rev_reads_fastp.fastq.gz | wc -l) / 4 ))\n"
read_count_decon_fwd <- 69425017
read_count_decon_rev <- 69425017
total_decon_sample_reads <- sum(read_count_decon_fwd, read_count_decon_rev) # get total read count from GLAMR (all fwd and rev reads)
# read in bam file
bam <- Rsamtools::BamFile(file = bam_path,
index = paste0(bam_path,".bai"))
# read the BAM file and count aligned reads
bam_file_df <- as.data.frame(scanBam(bam))
aligned_read_counts <- as.data.frame(table(bam_file_df$rname))
mapped_reads_df <- aligned_read_counts %>%
mutate(mag = str_extract(Var1, pattern)) %>% # label MAG names
group_by(mag) %>%
mutate(mapped_reads = sum(Freq)) %>% # calculate number of mapped reads per MAG
distinct(mag,.keep_all = TRUE) %>%
dplyr::select(-Freq) # remove 'Freq' after distinct(), because not relevant for entire MAG
rpkm <- left_join(mapped_reads_df, ref_mag_length, by = "mag") %>%
mutate(total_decon_sample_reads, # add total_decon_sample_reads determined earlier in function
sample_id, # add sample_id determined earlier in function
rpkm = (mapped_reads / (mag_length * total_decon_sample_reads)) * 1000000) %>% # Calculate RPKM
dplyr::select(-Var1) # remove 'Freq' after distinct(), because not relevant for entire MAG
Samp_40_bam_rpkm_mm2 <- rpkm
#39 -
# function to generate mag rpkm for samp_39
bam_path <- "mapping/reads/GLAMR_WLERef_mm2/output/bam/samp_39_WLERefCat3_mapped.bam"
# identify sample_id from bam file path
sample_id <- bam_path %>% str_remove(".*GLAMR_WLERef_mm2/output/bam/") %>% str_remove("_WLERefCat.*")
# get total sequencing read counts for sample from GLAMR
#command: printf "decon_reads\t$(($(zcat decon_fwd_reads_fastp.fastq.gz | wc -l) / 4 ))\t$(($(zcat decon_rev_reads_fastp.fastq.gz | wc -l) / 4 ))\n"
read_count_decon_fwd <- 110543202
read_count_decon_rev <- 110543202
total_decon_sample_reads <- sum(read_count_decon_fwd, read_count_decon_rev) # get total read count from GLAMR (all fwd and rev reads)
# read in bam file
bam <- Rsamtools::BamFile(file = bam_path,
index = paste0(bam_path,".bai"))
# read the BAM file and count aligned reads
bam_file_df <- as.data.frame(scanBam(bam))
aligned_read_counts <- as.data.frame(table(bam_file_df$rname))
mapped_reads_df <- aligned_read_counts %>%
mutate(mag = str_extract(Var1, pattern)) %>% # label MAG names
group_by(mag) %>%
mutate(mapped_reads = sum(Freq)) %>% # calculate number of mapped reads per MAG
distinct(mag,.keep_all = TRUE) %>%
dplyr::select(-Freq) # remove 'Freq' after distinct(), because not relevant for entire MAG
rpkm <- left_join(mapped_reads_df, ref_mag_length, by = "mag") %>%
mutate(total_decon_sample_reads, # add total_decon_sample_reads determined earlier in function
sample_id, # add sample_id determined earlier in function
rpkm = (mapped_reads / (mag_length * total_decon_sample_reads)) * 1000000) %>% # Calculate RPKM
dplyr::select(-Var1) # remove 'Freq' after distinct(), because not relevant for entire MAG
Samp_39_bam_rpkm_mm2 <- rpkm
#2141 -
# function to generate mag rpkm for samp_2141
bam_path <- "mapping/reads/GLAMR_WLERef_mm2/output/bam/samp_2141_WLERefCat3_mapped.bam"
# identify sample_id from bam file path
sample_id <- bam_path %>% str_remove(".*GLAMR_WLERef_mm2/output/bam/") %>% str_remove("_WLERefCat.*")
# get total sequencing read counts for sample from GLAMR
#command: printf "decon_reads\t$(($(zcat decon_fwd_reads_fastp.fastq.gz | wc -l) / 4 ))\t$(($(zcat decon_rev_reads_fastp.fastq.gz | wc -l) / 4 ))\n"
read_count_decon_fwd <- 76247941
read_count_decon_rev <- 76247941
total_decon_sample_reads <- sum(read_count_decon_fwd, read_count_decon_rev) # get total read count from GLAMR (all fwd and rev reads)
# read in bam file
bam <- Rsamtools::BamFile(file = bam_path,
index = paste0(bam_path,".bai"))
# read the BAM file and count aligned reads
bam_file_df <- as.data.frame(scanBam(bam))
aligned_read_counts <- as.data.frame(table(bam_file_df$rname))
mapped_reads_df <- aligned_read_counts %>%
mutate(mag = str_extract(Var1, pattern)) %>% # label MAG names
group_by(mag) %>%
mutate(mapped_reads = sum(Freq)) %>% # calculate number of mapped reads per MAG
distinct(mag,.keep_all = TRUE) %>%
dplyr::select(-Freq) # remove 'Freq' after distinct(), because not relevant for entire MAG
rpkm <- left_join(mapped_reads_df, ref_mag_length, by = "mag") %>%
mutate(total_decon_sample_reads, # add total_decon_sample_reads determined earlier in function
sample_id, # add sample_id determined earlier in function
rpkm = (mapped_reads / (mag_length * total_decon_sample_reads)) * 1000000) %>% # Calculate RPKM
dplyr::select(-Var1) # remove 'Freq' after distinct(), because not relevant for entire MAG
Samp_2141_bam_rpkm_mm2 <- rpkm
#2057 -
# function to generate mag rpkm for samp_2057
bam_path <- "mapping/reads/GLAMR_WLERef_mm2/output/bam/samp_2057_WLERefCat3_mapped.bam"
# identify sample_id from bam file path
sample_id <- bam_path %>% str_remove(".*GLAMR_WLERef_mm2/output/bam/") %>% str_remove("_WLERefCat.*")
# get total sequencing read counts for sample from GLAMR
#command: printf "decon_reads\t$(($(zcat decon_fwd_reads_fastp.fastq.gz | wc -l) / 4 ))\t$(($(zcat decon_rev_reads_fastp.fastq.gz | wc -l) / 4 ))\n"
read_count_decon_fwd <- 13409039
read_count_decon_rev <- 13409039
total_decon_sample_reads <- sum(read_count_decon_fwd, read_count_decon_rev) # get total read count from GLAMR (all fwd and rev reads)
# read in bam file
bam <- Rsamtools::BamFile(file = bam_path,
index = paste0(bam_path,".bai"))
# read the BAM file and count aligned reads
bam_file_df <- as.data.frame(scanBam(bam))
aligned_read_counts <- as.data.frame(table(bam_file_df$rname))
mapped_reads_df <- aligned_read_counts %>%
mutate(mag = str_extract(Var1, pattern)) %>% # label MAG names
group_by(mag) %>%
mutate(mapped_reads = sum(Freq)) %>% # calculate number of mapped reads per MAG
distinct(mag,.keep_all = TRUE) %>%
dplyr::select(-Freq) # remove 'Freq' after distinct(), because not relevant for entire MAG
rpkm <- left_join(mapped_reads_df, ref_mag_length, by = "mag") %>%
mutate(total_decon_sample_reads, # add total_decon_sample_reads determined earlier in function
sample_id, # add sample_id determined earlier in function
rpkm = (mapped_reads / (mag_length * total_decon_sample_reads)) * 1000000) %>% # Calculate RPKM
dplyr::select(-Var1) # remove 'Freq' after distinct(), because not relevant for entire MAG
Samp_2057_bam_rpkm_mm2 <- rpkm
#IMPORTANT: Because there were no read counts for samp_39, samp_40, and samp_2141, I'm going to manually calculate them and make new data.tables here. Then add them to GLAMR_bam_rpkm_mm2.
#plan(multisession, workers = 8)
##GLAMR_bam_rpkm_mm2 <- future_map_dfr(bam_paths_rpkm$bam_path, ~ ref_mag_bam_rpkm(.x), .progress = TRUE) # last run: Dec 13, 2023
#plan(sequential) #stop multisession
#write_csv(GLAMR_bam_rpkm_mm2_comb, file = "mapping/reads/GLAMR_WLERef_mm2/output/GLAMR_bam_rpkm_mm2_Dec14_23.csv")
GLAMR_bam_rpkm_mm2 <- read_csv(file = "mapping/reads/GLAMR_WLERef_mm2/output/GLAMR_bam_rpkm_mm2_Dec14_23.csv")
GLAMR_bam_rpkm_mm2_comb <- bind_rows(GLAMR_bam_rpkm_mm2, Samp_39_bam_rpkm_mm2, Samp_40_bam_rpkm_mm2, Samp_2141_bam_rpkm_mm2, Samp_2057_bam_rpkm_mm2) %>%
filter(!(sample_id %in% c("samp_39", "samp_40", "samp_2141", "samp_2057") & is.na(rpkm))) %>%
distinct(sample_id, mag, .keep_all = TRUE)
GLAMR_bam_rpkm_mm2 <- GLAMR_bam_rpkm_mm2_comb
```
5.) Analyze rep mags across GLAMR - Coverage and %ID
```{r}
# format reference sequences to address coverage and %ID
ref_mag_cov <- ADA_MAG_ref %>%
separate_longer_position(ref_base, width = 1) %>%
group_by(seqnames) %>%
mutate(pos = row_number(),
mag = str_extract(seqnames, pattern)) %>%
group_by(mag) %>%
mutate(mag_length = n()) %>%
ungroup()
ref_mag_bam_cov <- function(bam_path){
# identify sample_id from bam file path
sample_id <- bam_path %>% str_remove(".*GLAMR_WLERef_mm2/output/bam/") %>% str_remove("_WLERefCat.*")
# read in bam file
bam <- Rsamtools::BamFile(file = bam_path,
index = paste0(bam_path,".bai"))
# read in pileup file
pileup_ref_join <- Rsamtools::pileup(bam,pileupParam = PileupParam(distinguish_strands = FALSE)) %>%
full_join(ref_mag_cov, ., by = c("seqnames", "pos")) # join pileup file and reference sequences
out_mag_depth <- pileup_ref_join %>%
group_by(mag) %>%
mutate(mag_depth = sum(count, na.rm = TRUE)) %>% # alignment depth at each position
distinct(mag, mag_depth) # keep unique MAG depth data
out_percent_id <- pileup_ref_join %>%
group_by(mag) %>%
filter(nucleotide == ref_base) %>%
left_join(., out_mag_depth, by = c("mag")) %>%
mutate(id_ref_mag_percent = (sum(count, na.rm = TRUE) / mag_depth) * 100) # percent identity of MAG compared to reference
out_cover <- pileup_ref_join %>%
filter(count >= 1, na.rm = TRUE) %>% # keep only positions with at least 1 count
distinct(seqnames, pos, .keep_all = TRUE) %>% # remove duplicate positions (doesn't matter if matching ref)
group_by(mag) %>%
mutate(mag_cover_length = sum(count >= 1), # count number of unique bases in MAG with at least 1 count
mag_cover_percent = mag_cover_length / mag_length * 100, # calculate % cover
sample_id)
out <- left_join(out_percent_id, out_cover, by = c("mag", "seqnames", "pos")) %>% # merge %id and cover data tables
distinct(mag, mag_cover_length, mag_cover_percent, id_ref_mag_percent, sample_id) # keep unique %id and cover MAG data
}
# create list of bam paths to process
bam_paths_ref_cov <- system("ls mapping/reads/GLAMR_WLERef_mm2/output/bam/*.bam", intern = TRUE) %>%
tibble(bam_path = .)
#plan(multisession, workers = 24)
#options(future.globals.maxSize = 10*1024^3)
##GLAMR_bam_MAGcover_mm2 <- future_map_dfr(bam_paths_ref_cov$bam_path, ~ ref_mag_bam_cov(.x), .progress = TRUE) # last run: Nov 30, 2023
#plan(sequential) #stop multisession
#write_csv(GLAMR_bam_MAGcover_mm2, file = "mapping/reads/GLAMR_WLERef_mm2/output/GLAMR_bam_MAGcover_mm2_Dec1_23.csv")
GLAMR_bam_MAGcover_mm2 <- read_csv(file = "mapping/reads/GLAMR_WLERef_mm2/output/GLAMR_bam_MAGcover_mm2_Dec1_23.csv")
```
6.) Plot GLAMR bam rpkm data from ref mags - dot plot
```{r}
#Join rpkm and query cover results
GLAMR_bam_stats <- left_join(GLAMR_bam_rpkm_mm2, GLAMR_bam_MAGcover_mm2, by = c("sample_id", "mag"))
#Join rpkm and query cover results with GLAMR sample table
GLAMR_sample_table_bam_stats <- full_join(GLAMR_sample_table, GLAMR_bam_stats, by = c("SampleID" = "sample_id"))
##**********
#COMBINE WITH DATA FROM ABOVE -
#Metadata addition (Dec. 1, 2023)
GLAMR_Seagull_Site_import <- read_csv("data/Great_Lakes_Omics_Datasets_SeagullSiteAssignments_Nov25_23.csv") #Import Seagull Site Assignments
GLAMR_sample_table_bam_stats <- left_join(GLAMR_sample_table_bam_stats, GLAMR_Seagull_Site_import, by = c("SampleID", "StudyID"))
##**********
##Here we are with a complete dataframe! - just call df
df <- GLAMR_sample_table_bam_stats
View(df)
#Format date - subset data by query cover - select columns of interest for plotting
df$collection_date <- as.Date(df$collection_date, format = "%Y-%m-%d")
#Format and manipulate data frame for use in figure
df_sub_data <- df %>%
mutate(rpkm = ifelse(is.na(rpkm), 0, rpkm),
collection_date = collection_date %>% str_remove("T.*"),
year = year(collection_date),
month = month(collection_date, label=TRUE),
day = day(collection_date),
yday = yday(collection_date),
week = week(collection_date),
group = ifelse(mag %in% c("samp_471", "samp_2073"), "D_lem",
ifelse(mag %in% c("samp_3937", "samp_4344", "samp_4380", "samp_507"), "D_circ",
ifelse(mag %in% c("samp_4305", "samp_2059", "samp_468"), "D_flos",
ifelse(mag %in% c("samp_477", "samp_481"), "C_issa", NA))))) %>%
group_by(SampleID, group) %>%
mutate(group_rpkm = sum(rpkm)) %>%
ungroup() %>%
mutate(clustered_site = if_else(clustered_site == "WE12-S", "WE12", clustered_site), #Note in manuscript
clustered_site = if_else(clustered_site == "WE16-WSW", "WE16", clustered_site), #Note in manuscript
clustered_site = if_else(is.na(clustered_site), NOAA_Site, clustered_site)) %>%
dplyr::select(SampleName, SampleID, mag, group, group_rpkm, mag_length, id_ref_mag_percent, mag_cover_percent, rpkm, lat, lon, NOAA_Site, collection_date, clustered_site, StudyID, year, month, day, yday, week) %>% #select columns
dplyr::filter(!clustered_site %in% c("glamr_13", "glamr_11", "glamr_9", "glamr_8", "glamr_4", "glamr_3", "glamr_2")) %>%
filter(!month %in% c("Mar", "Apr", "May", "Jun", "Dec"))
#Extract samples with critical missing data
missing_data_df <- df_sub_data %>%
filter(id_ref_mag_percent > 95 & mag_cover_percent > 50) %>%
mutate_if(is.character, ~ na_if(., "NA")) %>%
filter(is.na(id_ref_mag_percent) | is.na(mag_cover_percent) | is.na(clustered_site) | is.na(year) | is.na(week) | is.na(group_rpkm))
##LOOKS LIKE ONLY samp_4333 is missing data!
################################################
###PART TIME FIX!!! CHANGE LATER - Not 100% sure on samp_4333 station, but pretty sure it's close to WE6
df_sub_data <- df_sub_data %>%
mutate(clustered_site = if_else(SampleID == "samp_4333", "WE6", clustered_site),
week = if_else(SampleID == "samp_4333", 44, week),
year = if_else(SampleID == "samp_4333", 2022, year))
################################################
###Just a quick look at filtered data - for browsing
df_sub_data_filt <- df_sub_data %>%
filter(id_ref_mag_percent > 95 & mag_cover_percent > 50)
#DONT USE
###df_sub_data_filt_lem <- df_sub_data %>%
### filter(id_ref_mag_percent > 95 & mag_cover_percent > 50 & group == "D_lem" & SampleID != "samp_40")
#unique dates where D_lem appears
#Combined D_lem: there should be highlights around weeks when sxt present. Sxt detection determined by all three contigs >60% cover (notebook 3b - determined from coverage depth plot)
samp_w_sxt <- c("samp_2042", "samp_2047", "samp_2088", "samp_2092", "samp_2096", "samp_2141", "samp_2142", "samp_2153", "samp_2155", "samp_2158", "samp_4338", "samp_4341", "samp_451", "samp_470", "samp_471", "samp_472", "samp_477", "samp_481", "samp_484", "samp_499", "samp_509", "samp_511")
samp_w_sxtX <- c("samp_2151", "samp_2153", "samp_2158", "samp_4338", "samp_4341", "samp_481", "samp_484", "samp_797")
plotted_samp_w_sxt <- df_sub_data_filt %>%
filter(SampleID %in% samp_w_sxt) %>%
distinct(SampleID, .keep_all = TRUE)
plotted_samp_w_sxtX <- df_sub_data_filt %>%
filter(SampleID %in% samp_w_sxtX) %>%
distinct(SampleID, .keep_all = TRUE)
groups_to_filter <- c("D_lem", "D_circ", "D_flos", "C_issa")
for (group_name in groups_to_filter) {
filtered_data <- df_sub_data %>%
filter(group == group_name & id_ref_mag_percent > 95 & mag_cover_percent > 50) %>%
arrange(desc(group_rpkm)) %>% # sort the data by 'group_rpkm'
group_by(year, week, clustered_site) %>%
filter(row_number() == 1) # keep only one row for each group_rpkm (highest value)
assign(paste0("GLAMR_sample_table_bam_stats_plotdf_", group_name), filtered_data)
}
# Accessing the created data frames:
# GLAMR_sample_table_bam_stats_plotdf_D_lem
# GLAMR_sample_table_bam_stats_plotdf_D_circ
# GLAMR_sample_table_bam_stats_plotdf_D_flos
# GLAMR_sample_table_bam_stats_plotdf_C_issa
# Reordering x-axis variables
df_sub_data$clustered_site <- factor(df_sub_data$clustered_site, levels = c("glamr_13", "glamr_11", "glamr_9", "glamr_8", "glamr_4", "glamr_3", "glamr_2", "WE16", "WE15", "WE14", "WE13", "WE12", "WE9", "WE8", "WE6", "WE4", "WE2"))
MAG_map_plot1a <-
ggplot(mapping = aes(week, clustered_site)) +
geom_tile(data = df_sub_data, aes(fill = "Sequenced Sample")) +
scale_fill_identity() +
#geom_point(data = GLAMR_sample_table_bam_stats_plotdf_D_lem_sxtPlus, aes(x = week + 0, size = log(rpkm), fill = "D_lem_sxtPlus"), shape = 21) +
geom_point(data = GLAMR_sample_table_bam_stats_plotdf_D_lem, aes(x = week - 0.13, size = log(group_rpkm), fill = "D_lem"), shape = 21) +
geom_point(data = GLAMR_sample_table_bam_stats_plotdf_D_circ, aes(x = week + 0.13, size = log(rpkm), fill = "D_circ"), shape = 21) +
geom_point(data = GLAMR_sample_table_bam_stats_plotdf_D_flos, aes(x = week - 0.4, size = log(rpkm), fill = "D_flos"), shape = 21) +
geom_point(data = GLAMR_sample_table_bam_stats_plotdf_C_issa, aes(x = week + 0.4, size = log(rpkm), fill = "C_issa"), shape = 21) +
scale_size_continuous(range = c(0.00000000001, 7)) +
facet_grid(year ~ week,
scales = "free",
space = "free_y",
switch = "x") +
scale_x_continuous(breaks = NULL) +
labs(x = "Week Number", y = "NOAA Site") +
scale_fill_manual(name = "MAGs",
values = c("Sequenced Sample" = "lightgray", "D_circ" = "blue", "D_lem" = "orange", "D_flos" = "#BF40BF", "C_issa" = "#4DDBD4")) +
scale_color_manual("black") +
theme_classic() +
theme(axis.title.x = element_text(size = 28), # Set the size of the x-axis label text
axis.title.y = element_text(size = 28),
strip.text = element_text(size = 22),
axis.text.y = element_text(size = 15)) # Set the size of the y-axis label text
plot(MAG_map_plot1a)
# Use ggsave to save the ggplot as a PDF
# ggsave(filename = "/geomicro/data2/pdenuyl2/neurotoxin_thesis/FINAL2/notebook4_pdf/rpkm_fig_WEStations_Dec12_23.pdf", plot = MAG_map_plot1a, device = "pdf", width = 26, height = 12)
MAG_map_plot2a <-
ggplot(mapping = aes(week, clustered_site)) +
geom_tile(data = df_sub_data, aes(fill = "Sequenced Sample")) +
scale_fill_identity() +
geom_tile(data = subset(plotted_samp_w_sxt, !is.na(SampleID)), color = "red", fill = NA, size = 1.5) + # Outline tiles from plotted_samp_w_sxt
geom_tile(data = subset(plotted_samp_w_sxtX, !is.na(SampleID)), color = "black", fill = NA, size = 1.5, linetype = "dashed") + # Outline tiles from plotted_samp_w_sxtX
#geom_point(data = GLAMR_sample_table_bam_stats_plotdf_D_lem_sxtPlus, aes(x = week + 0, size = log(rpkm), fill = "D_lem_sxtPlus"), shape = 21) +
geom_point(data = GLAMR_sample_table_bam_stats_plotdf_D_lem, aes(x = week - 0.13, size = log(group_rpkm), fill = "D_lem"), shape = 21) +
geom_point(data = GLAMR_sample_table_bam_stats_plotdf_D_circ, aes(x = week + 0.13, size = log(rpkm), fill = "D_circ"), shape = 21) +
geom_point(data = GLAMR_sample_table_bam_stats_plotdf_D_flos, aes(x = week - 0.4, size = log(rpkm), fill = "D_flos"), shape = 21) +
geom_point(data = GLAMR_sample_table_bam_stats_plotdf_C_issa, aes(x = week + 0.4, size = log(rpkm), fill = "C_issa"), shape = 21) +
scale_size_continuous(range = c(0.00000000001, 7)) +
facet_grid(year ~ week,
scales = "free",
space = "free_y",
switch = "x") +
scale_x_continuous(breaks = NULL) +
labs(x = "Week Number", y = "NOAA Site") +
scale_fill_manual(name = "MAGs",
values = c("Sequenced Sample" = "lightgray", "D_circ" = "blue", "D_lem" = "orange", "D_flos" = "#BF40BF", "C_issa" = "#4DDBD4")) +
scale_color_manual("black") +
theme_classic() +
theme(axis.title.x = element_text(size = 28), # Set the size of the x-axis label text
axis.title.y = element_text(size = 28),
strip.text = element_text(size = 22),
axis.text.y = element_text(size = 15)) # Set the size of the y-axis label text
plot(MAG_map_plot2a)
# Use ggsave to save the ggplot as a PDF
#ggsave(plot = MAG_map_plot2a, filename = #"/geomicro/data2/pdenuyl2/neurotoxin_thesis/FINAL2/notebook_pdf/BLACKDASH_REDrpkm_fig_WEStations_Dec15_23_FINAL.pdf", #device = "pdf", width = 26, height = 12)
```
7.) Analysis for manuscript statements
```{r}
# Accessing the plotted dataframes:
df_sub_data_filt
GLAMR_sample_table_bam_stats_plotdf_D_lem
GLAMR_sample_table_bam_stats_plotdf_D_circ
GLAMR_sample_table_bam_stats_plotdf_D_flos
GLAMR_sample_table_bam_stats_plotdf_C_issa
samp_w_sxt #red squares data source
df84 <- df %>%
filter(SampleID %in% samp_w_sxt) %>%
table("collection_date")
#Statement: "Their abundance and distribution exhibited significant variability between years with some years such as 2014, 2017, and 2018 having just 4 weeks where ADA MAGs were significantly detected"
df_sub_data_filt %>%
distinct(week, year, .keep_all = TRUE) %>%
count(year)
#Statement: "Multiple species often co-occured in the same sample (37% of samples) but when just one species was detected it was most often D. circinale (42% of samples), followed by D. lemmamannii and D. flosaquae (5% and 15% of samples, respectively)."
#Number of samples with more than one, two, or three species detected
result <- df_sub_data_filt %>%
group_by(SampleID) %>%
summarize(distinct_group_count = n_distinct(group)) %>%
group_by(distinct_group_count) %>%
summarize(sample_count = n())
#108 samples with 1 species, 55 samples with 2 species, 8 samples with 3 species
View(result)
#number of samples to divide by (check)
samp_count <- df_sub_data_filt %>%
summarize(distinct_sample = n_distinct(SampleID)) # 171
#Count number of samples with 1 species
#108
#Count number of samples with >1 species
#63
#When just one species detected:
result2 <- df_sub_data_filt %>%
group_by(SampleID) %>%
mutate(distinct_group_count = n_distinct(group)) %>%
filter(distinct_group_count == 1)
result2_Dcirc <- result2 %>%
filter(group == "D_circ") %>%
ungroup() %>%
summarize(distinct_sample = n_distinct(SampleID)) # 71 D_circ
result2_Dlem <- result2 %>%
filter(group == "D_lem") %>%
ungroup() %>%
summarize(distinct_sample = n_distinct(SampleID)) # 9 D_lem
result2_Dflos <- result2 %>%
filter(group == "D_flos") %>%
ungroup() %>%
summarize(distinct_sample = n_distinct(SampleID)) # 25 D_flos
#max rpkm for D. lem
df_sub_data_D_lem_analysis <- df_sub_data %>% filter(group == "D_lem")
df_sub_data_D_lem_analysis[df_sub_data_D_lem_analysis$group_rpkm == max(df_sub_data_D_lem_analysis$group_rpkm), ] #0.006929648
###
#occurances of D. lem at WE12
GLAMR_sample_table_bam_stats_plotdf_D_lem %>%
ungroup() %>%
filter(clustered_site == "WE12") %>%
distinct(collection_date, .keep_all = TRUE) %>%
count(collection_date) #8
GLAMR_sample_table_bam_stats_plotdf_D_lem %>%
ungroup() %>%
distinct(collection_date, .keep_all = TRUE) %>%
count(collection_date) #15
#Overlap D_lem 2014, 2016, 2018 (14, 16, 18)
#Make df of samples where D_lem appears in 14, 16, 18
D_lem_samps_14_16_18 <- df_sub_data_filt %>%
filter(group == "D_lem" & year %in% c(2014, 2016, 2018))
#Make df of all group data sample list from above
D_lem_otherGroups_14_16_18 <- df_sub_data_filt %>%
filter(SampleID %in% D_lem_samps_14_16_18$SampleID)
#Overlap D_lem 2017, 2019 - 2022
#Make df of samples where D_lem appears in 14, 16, 18
D_lem_samps_17_19_20_21_22 <- df_sub_data_filt %>%
filter(group == "D_lem" & year %in% c(2017, 2019, 2020, 2021, 2022))
#Make df of all group data sample list from above (17, 19-22)
D_lem_otherGroups_17_19_20_21_22 <- df_sub_data_filt %>%
filter(SampleID %in% D_lem_samps_17_19_20_21_22$SampleID)
# Count months where D_lem appears
df_sub_data_filt %>%
filter(group == "D_lem") %>%
distinct(collection_date, .keep_all = TRUE) %>%
group_by(month) %>%
count(month)
# Count number of D_lem occurrences that DO match up with samp_w_sxt
df_sub_data_filt %>%
ungroup() %>%
filter(group == "D_lem",
SampleID %in% samp_w_sxt) %>%
distinct(SampleID, .keep_all = TRUE) %>%
count()
results_DOmatch_sxt <- df_sub_data_filt %>%
ungroup() %>%
filter(group == "D_lem",
SampleID %in% samp_w_sxt) %>%
distinct(SampleID, .keep_all = TRUE)
# Count number of D_lem occurrences that DO NOT match up with samp_w_sxt
df_sub_data_filt %>%
ungroup() %>%
filter(group == "D_lem",
!(SampleID %in% samp_w_sxt)) %>%
distinct(SampleID, .keep_all = TRUE) %>%
count()
# Count number of sxt genes occurrences that DO NOT match up with D_lem
df_sub_data %>%
ungroup() %>%
filter(group == "D_lem",
SampleID %in% samp_w_sxt) %>%
distinct(SampleID, .keep_all = TRUE)
samples_NOTmatching_sxt <- samp_w_sxt[!(samp_w_sxt %in% results_DOmatch_sxt$SampleID)]
#Details about second sxt operon statement:
#"For the 7 of 22 samples where D. sp000312705 was not present by metagenomic read mapping when STX biosynthesis genes were present"
#So 6 of 7 samples where D. sp000312705 was not present has sxtX gene. "samp_2153" "samp_2155" "samp_2158" "samp_4338" "samp_4341" "samp_481" "samp_484" #SAMPLE samp_2155 does not have sxtX gene mapped, but full operon
df_sub_data %>%
ungroup() %>%
filter(SampleID %in% samples_NOTmatching_sxt) %>%
distinct(SampleID, .keep_all = TRUE)
# Max rpkm for all samples
df_sub_data_filt[df_sub_data_filt$group_rpkm == max(df_sub_data_filt$group_rpkm), ]
view(df_sub_data_filt) #for lower ranking group_rpkm and rpkm
# C_issa year count
df_sub_data_filt %>%
filter(group == "C_issa") %>%
distinct(year)
# C_issa frequency count (unique dates)
df_sub_data_filt %>%
group_by(year, group, collection_date) %>%
summarise(count_dates = n_distinct(collection_date)) %>%
group_by(year, group) %>%
summarise(sum(count_dates)) %>%
filter(group == "C_issa")
# Unique sampling dates in 2020
df_sub_data %>%
filter(year == 2020) %>%
summarise(count_dates = n_distinct(collection_date))
# C_isssa frequency count (instances)
df_sub_data_filt %>%
group_by(year, group, SampleID) %>%
summarise(count_instances = n_distinct(SampleID)) %>%
group_by(year, group) %>%
summarise(sum(count_instances)) %>%
arrange(-`sum(count_instances)`) %>%
filter(group == "C_issa")
# Unique samples in 2020
df_sub_data %>%
filter(year == 2020) %>%
summarise(count_samps = n_distinct(SampleID))
# D_flos frequency count (unique dates)
df_sub_data_filt %>%
group_by(year, group, collection_date) %>%
summarise(count_dates = n_distinct(collection_date)) %>%
group_by(year, group) %>%
summarise(sum(count_dates)) %>%
filter(group == "D_flos")
# Unique dates in 2019
df_sub_data %>%
filter(year == 2019) %>%
summarise(count_dates = n_distinct(collection_date))
# D_flos frequency count (instances)
df_sub_data_filt %>%
group_by(year, group, SampleID) %>%
summarise(count_instances = n_distinct(SampleID)) %>%
group_by(year, group) %>%
summarise(sum(count_instances)) %>%
filter(group == "D_flos")
# Unique samples in 2019
df_sub_data %>%
filter(year == 2019) %>%
summarise(count_samps = n_distinct(SampleID))
# All frequency count
df_sub_data_filt %>%
group_by(year, group, collection_date) %>%
summarise(count_dates = n_distinct(collection_date)) %>%
group_by(year, group) %>%
summarise(sum_count_dates = sum(count_dates)) %>%
arrange(-sum_count_dates)
# Presence by date - D_circ, 2014 - 2019
pres_D_Circ_14_15_16_17_18_19 <- df_sub_data_filt %>%
filter(week >= 37) %>%
group_by(year, group, collection_date) %>%
summarise(count_dates = n_distinct(collection_date)) %>%
group_by(year, group) %>%
summarise(sum(count_dates)) %>%
filter(group == "D_circ",
year %in% 2014:2019)
# Total sampling dates - 2014 - 2019
samp_date_14_15_16_17_18_19 <- df_sub_data %>%
filter(week >= 37) %>%
group_by(year, collection_date) %>%
summarise(count_dates = n_distinct(collection_date)) %>%
group_by(year) %>%
summarise(sum(count_dates)) %>%
filter(year %in% 2014:2019)
# Presence by date - D_circ, 2020 - 2022
pres_D_Circ_20_21_22 <- df_sub_data_filt %>%
filter(week >= 37) %>%
group_by(year, group, collection_date) %>%
summarise(count_dates = n_distinct(collection_date)) %>%
group_by(year, group) %>%
summarise(sum(count_dates)) %>%
filter(group == "D_circ",
year %in% 2020:2022)
# Total sampling dates - 2020 - 2022
samp_date_20_21_22 <- df_sub_data %>%
filter(week >= 37) %>%
group_by(year, collection_date) %>%
summarise(count_dates = n_distinct(collection_date)) %>%
group_by(year) %>%
summarise(sum(count_dates)) %>%
filter(year %in% 2020:2022)
#GENERAL QUESTIONS
#unique date counts for each group / year
df_sub_data_filt %>%
group_by(year, group, collection_date) %>%
summarise(count_dates = n_distinct(collection_date)) %>%
group_by(year, group) %>%
summarise(sum(count_dates))
#Export supplemental table 2, RPKM values
View(df_sub_data_filt)
df_sub_data_filt_TableS2 <- df_sub_data_filt %>%
select(SampleName, mag, group, group_rpkm, mag_length, id_ref_mag_percent, mag_cover_percent, rpkm, lat, lon, NOAA_Site, collection_date, clustered_site, StudyID, year, month, day, yday, week, SampleID) %>% #rearrange columns
rename("SampleID" = "SampleID_internal")
write.csv(df_sub_data_filt_TableS2, file = "figs/TableS2.csv")
```