-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbarcode_analysis_V2.R
113 lines (81 loc) · 4.04 KB
/
barcode_analysis_V2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
library(tidyverse)
library(QSutils)
library(vegan)
library(fs)
#assign filepath to "data_path":
data_path <- "/Users/emilyfitzmeyer/Desktop/BC_WNV/mgi_bcWNV/mgi_bcWNV_trueBC/"
#txt_files <- dir(data_path, pattern = "*.txt")
txt_files <- fs::dir_ls(data_path)
#assign column classes to "column_classes" (use shortcut values):
column_classes <- c("c", "n")
# read all files in using map_dfr
# specify column types and column names ('counts' column tends to read in as both 'chr' and 'dbl'when col types aren't specified)
# mutate source column to cut down file path length:
# change '65' to whatever number of characters in path precedes your filename +1
# use str_replace to remove redundant parts of filename "_barcode_counts.txt"
df <- txt_files %>%
map_dfr(read_delim, col_types = column_classes, col_names = c("barcodes", "counts"), .id = "source") %>%
mutate(source = str_sub(source, 65, -1)) %>%
mutate(source = str_replace(source, "_true_barcodes.txt", ""))
# nest by "source" (the filenames)
# pull the 'counts' columns from the nested dataframe and use Shannon() to determine sample complexity
nested_df <- df %>%
group_by(source) %>%
nest() %>%
mutate(complexity =
map(data, ~ .x %>%
pull(counts) %>%
QSutils::Shannon()))
# add a column for normalized Shannon values using QSutils::NormShannon
# we don't trust NormShannon yet but everyone likes to be included
nested_df <- nested_df %>%
mutate(norm_complexity =
map(data, ~ .x %>%
pull(counts) %>%
QSutils::NormShannon()))
# repeat with Length() to get # of unique barcodes (note, nrow() and n() do not work here because
# in list format 'counts' is a vector not a column)
nested_df <- nested_df %>%
mutate(unique_barcodes =
map(data, ~ .x %>%
pull(counts) %>%
length()))
# sum abundance vectors and make a column called "total_barcodes" with these values.
nested_df <- nested_df %>%
mutate(total_barcodes =
map(data, ~ .x %>%
pull(counts) %>%
sum()))
# select columns of interest
# not selecting for 'Data' column because upon unnesting this will generate an unweildy and confusing dataframe
# if you don't specify 'cols' it works but throws an error about it
exp_df <- nested_df %>%
select(1, 3, 4, 5, 6) %>%
unnest(cols = c(complexity, norm_complexity, unique_barcodes, total_barcodes))
# export new df as a csv file (specify filename after '\')
# weird thing: using '\' before filename cuts of the first letter of the filename
# using '\\' before filename includes a '\' in the filename
# NOTE: if you are already in the directory where you want the file to go you do not have to specify a path
# if you specify a path anyway your file will not show up, set appropriate working directory before writing out file
setwd("/Users/emilyfitzmeyer/Desktop")
write_csv(exp_df, "mgi_sample_diversity.csv")
#EXTRAS
#scaling attempt 2:
#exp_df %>%
# mutate(scaled_complexity = rescale(complexity)) %>%
# mutate(scaled_complexity = sprintf(scaled_complexity, fmt = '%#.5f'))
#even when using sprintf to add decimal points to the scaled values the scaling simplifies
#a bunch of unique complexity values 0.5.
#I want to retain the differences in these values!!!!
#setwd("/Users/emilyfitzmeyer/Desktop/BC_WNV/Sequencing/ffbcWNV_NextSeq/barcode_counts/stocks")
#test_file <- read_delim("3_P1_200_S349_barcode_counts.txt", col_types = c("c", "n"), col_names = c("barcodes", "counts"))
#iNEXT(test_file$counts, q = c(0,1,2), datatype = "abundance")
#Shannon(test_file$counts)
#scaling attempt 1:
#this is not a great way to do what I was trying to do, it will force high complexity samples in small datasets
#to be presented as "low" complexity.
#create function that will rescale the diversity index vector from 0 to 1:
#range01 <- function(x){(x-min(x))/(max(x)-min(x))}
# generate a new column 'complexity_01' containing complexity values normalized to a 0 to 1 scale
#exp_df <- exp_df %>%
# mutate(complexity_01 = range01(complexity))