Updated code and dataset

digitalwright · May 7, 2020 · 369ce86 · 369ce86
1 parent a697ca3
commit 369ce86
Show file tree

Hide file tree

Showing 6 changed files with 18,315 additions and 3,344 deletions.
diff --git a/AnalyzeSequences_v5.R → AnalyzeSequences_v6.R b/AnalyzeSequences_v5.R → AnalyzeSequences_v6.R
@@ -1,46 +1,75 @@
-library(DECIPHER) # MUST INSTALL DECIPHER FIRST
+## Load library
+
+library(DECIPHER)
+
+sessionInfo() # NEED DECIPHER >= v2.16.1 !!!!!!!!
 
 # need to set the working directory
 setwd("<<path to ncov directory>>/") # needs trailing slash
 
-###
-# Load files
-###
+## Load files
 
-source('./RobustRegression_v1.R')
 source('./map_v1.R')
 source('./movavg_v1.R')
 source('./coordinates_v1.R')
+meta <- read.table("./metadata_v1.tsv",
+	sep="\t",
+	header=TRUE,
+	quote="")
 ref <- readDNAStringSet("./NC_045512.2.fas")
-dna <- readDNAStringSet("./gisaid_cov2020_sequences-Apr14.fasta.gz") # expect warning for ignored spaces
+dna <- readDNAStringSet("./gisaid_cov2020_sequences-May2.fasta.gz") # expect warning for ignored spaces
 
-cols <- c(A="#C53932", C="#56BBCC", G="#529D3F", T="#D57FBF", U="#F08536")
+cols <- c(A="#C53932", C="#56BBCC", G="#529D3F", T="#D57FBF", U="#F08536") # color vector
 
-# prepare genomes and record date of collection
+# prepare genomes
 dna <- RemoveGaps(dna)
+a <- alphabetFrequency(dna, baseOnly=TRUE)
+w <- which(a[, "other"] <= 500) # too many degeneracies
+dna <- dna[w]
+print(length(dna)) # total number of genomes
+
+# record date of collection
 ns <- strsplit(names(dna), "|", fixed=TRUE)
 date <- sapply(ns, tail, n=1)
 date <- as.Date(date, "%Y-%m-%d")
 min_date <- min(date, na.rm=TRUE)
 day <- date - min_date
 sum(!is.na(day)) # number with collection date
 
-###
-# Create Figure 1
-###
+# record lab of origin
+ns <- strsplit(names(dna), "/", fixed=TRUE)
+laboratory <- sapply(ns, `[`, 2L)
+s <- strsplit(names(dna), "|", fixed=TRUE)
+s <- sapply(s, `[`, 1L)
+s <- substring(s, nchar("hCoV-19/") + 1L)
+s <- gsub(" ", "", s)
+m <- match(s, meta$strain)
+w <- !is.na(m)
+laboratory[w] <- meta$originating_lab[m[w]]
+
+## Create Figure 1
 
 muts <- rep(NA_real_, length(dna))
 pBar <- txtProgressBar(style=3)
+DNA2 <- DNAStringSet(character(length(dna)))
 for (i in seq_along(dna)) {
 	DNA <- AlignProfiles(dna[i], ref)
+	#DNA <- subseq(DNA, 61, width(DNA) - 60)
 	d <- DistanceMatrix(DNA,
 		type="dist",
 		verbose=FALSE,
 		penalizeGapLetterMatches=FALSE)
 	t <- TerminalChar(DNA)
 	muts[i] <- d[1]*(width(DNA)[1] - max(t[, 1]) - max(t[, 2]))
+
+	s <- strsplit(as.character(DNA), "")
+	w <- which(s[[2]] != "-")
+	DNA2[i] <- paste(s[[1]][w], collapse="")
 	setTxtProgressBar(pBar, i/length(dna))
 }
+DNA <- DNA2
+names(DNA) <- names(dna)
+DNA2 <- c(ref, DNA2)
 
 dev.new(height=3.5, width=4)
 p <- par(mar=c(4, 4, 2, 1))
@@ -68,13 +97,17 @@ text(0,
 	pos=4)
 par(p)
 
-###
-# Map independent substitutions (Fig. 2b)
-###
+## Map independent substitutions
 
-DNA <- AlignSeqs(dna,
-	processors=NULL)
-DNA2 <- AlignProfiles(ref, DNA)
+# CODE BLOCK TO CREATE A MULTIPLE ALIGNMENT
+# INSTEAD OF USING THE ALIGNMENT FROM ABOVE
+#u_dna <- unique(dna) # the unique input sequences
+#index <- match(dna, u_dna) # de-replication index
+#U_DNA <- AlignSeqs(u_dna, processors=NULL)
+#DNA <- U_DNA[index]
+#names(DNA) <- names(dna) # the re-replicated alignment
+#DNA2 <- AlignProfiles(ref, DNA)
+# END CODE BLOCK
 
 d <- DistanceMatrix(DNA,
 	type="dist",
@@ -84,31 +117,32 @@ c <- IdClusters(d,
 	method="NJ",
 	processors=NULL,
 	type="dendrogram",
-	collapse=-1)
+	collapse=-1,
+	root=which(dna==ref)[1])
 c <- reorder(c, rev(unlist(c)))
 o <- unlist(c)
 
-dev.new(width=8.5, height=7)
-p <- par(mar=c(0.1, 0.1, 10, 30))
+dev.new(width=8.5, height=7.7)
+p <- par(mar=c(0.1, 0.1, 12.5, 30))
 plot(c, leaflab="none", yaxt="n", horiz=TRUE)
 
 dx <- par("xaxp")
 dx <- dx[1] - dx[2]
 dy <- par("yaxp")
 dy <- dy[2] - dy[1]
 x <- par("pin")
-segments(0.001,
-	0.98*dy,
-	0.001,
-	0.98*dy + 2e-4/dx*dy*x[1]/x[2])
-text(0.001,
-	1.093*dy,
-	"0.0002",
+segments(0.00194,
+	0.99*dy - 4e-4/dx*dy*x[1]/x[2]/2,
+	0.00194,
+	0.99*dy + 4e-4/dx*dy*x[1]/x[2]/2)
+text(0.00194,
+	1.047*dy,
+	"0.0004",
 	pos=2,
 	srt=90)
 
-delta <- -6e-5
-space <- -2e-5
+delta <- -10e-5
+space <- -4e-5
 min_width <- length(o)*0.001 # set min_width = 1 to max all rectanges exact below leaves, but might be invisible because too small to render
 sep <- 1 # minimum horizontal separation on the tree
 count <- 0L # number of parallel changes
@@ -122,6 +156,9 @@ results <- data.frame(ref_pos=integer(),
 	syn=integer(),
 	gene=character(),
 	aa=character(),
+	size=integer(),
+	tot=integer(),
+	labs=integer(),
 	stringsAsFactors=FALSE)
 
 subs <- matrix(0L,
@@ -141,16 +178,36 @@ for (i in seq_len(l)) {
 	# number of cases of parallel mutation
 	w1 <- which(s[-1][o] %in% DNA_BASES)
 	w2 <- which(s[-1][o][w1] != s[1])
+	#BrowseSeqs(subseq(DNA2[c(1, o[w1[w2]] + 1)], ref_pos - 5, ref_pos + 5), hi=1, htmlFile=paste0(tempdir(), "/", ref_pos, ".html"))
 	if (length(w2)==0) {
 		reps <- 0
 	} else {
 		# need to resolve substitutions on the tree
-		reps <- map(tree=c,
+		reps <- map(tree=unclass(c),
 			bases=as.character(s[-1]),
 			ancestor=as.character(s[1]))
 	}
 
 	if (reps > 0) {
+		# determine largest consecutive group
+		runs <- rle(diff(w1[w2]))
+		w <- which(runs$values==1)
+		if (length(w)==0) {
+			runs <- 1L # no groups
+		} else {
+			# print sequences by clade
+			#cs <- cumsum(runs$lengths) + 1
+			#for (j in seq_along(w)) {
+			#	print(names(dna)[o][w1[w2[seq(to=cs[w[j]],
+			#		length.out=runs$lengths[w[j]] + 1)]]])
+			#}
+			runs <- max(runs$lengths[w]) + 1L
+		}
+
+		# determine number of originating laboratories
+		labs <- laboratory[o[w1[w2]]]
+		labs <- length(unique(labs))
+
 		w <- which(s[-1][o] != s[1] &
 			s[-1][o] %in% DNA_BASES)
 		start <- tail(cds[cds <= ref_pos], n=1)
@@ -195,6 +252,9 @@ for (i in seq_len(l)) {
 						ceiling((ref_pos - gene + 1)/3),
 						paste(nsyn,
 							collapse="/")),
+					size=runs,
+					tot=length(w2),
+					labs=labs,
 					stringsAsFactors=FALSE))
 			nsyn <- length(nsyn)
 		} else { # non-coding region
@@ -207,16 +267,21 @@ for (i in seq_len(l)) {
 					syn=NA_integer_,
 					gene=NA_character_,
 					aa=NA_character_,
+					size=runs,
+					tot=length(w2),
+					labs=labs,
 					stringsAsFactors=FALSE))
 		}
 
 		fromto <- cbind(as.character(s[1]),
 			as.character(unique(s[-1][o][w1][w2])))
 		subs[fromto] <- subs[fromto] + 1L
 
-		if (reps >= 7 && # evidence of parallelism is >= N
-			ref_pos > 33 && # 5'UTR > 265
-			ref_pos < 29675) { # 3'UTR < 29675
+		# plot evidence for the substitution
+		if (reps >= 10 && # evidence of parallelism is >= N
+			ref_pos >= 50 && # 5'UTR > 265
+			ref_pos <= 29853 && # 3'UTR < 29675
+			labs >= 4) { # at least N originating labs
 			count <- count + 1L
 			rect(delta*(count - 1) + space*count,
 				seq_along(DNA)[w] - min_width/2,
@@ -232,7 +297,7 @@ for (i in seq_len(l)) {
 					ref_pos,
 					paste(ifelse(vals=="T", "U", vals),
 						collapse="/"),
-					"(",
+					" (",
 					reps,
 					")")
 			if (!is.null(codons)) {
@@ -254,6 +319,8 @@ for (i in seq_len(l)) {
 					tail(results$aa, n=1),
 					")")
 			}
+			lab <- paste(lab, " {", runs, "}", sep="")
+
 			text(x=delta*(count - 1.2) + space*count,
 				y=length(o)*1.01,
 				labels=lab,
@@ -266,12 +333,10 @@ for (i in seq_len(l)) {
 }
 par(p)
 
-#write.csv(results,
-#	file="./results_v3.csv")
+write.csv(results,
+	file="~/Desktop/Virus/ncov/data/genomes/results_v5.csv")
 
-###
-# Compute statistics including dN/dS
-###
+## Compute statistics including dN/dS
 
 # total number of mutations
 nrow(results)
@@ -285,6 +350,8 @@ sum(results$reps > 1)
 # number independent in non-coding regions
 sum(!is.na(results$syn[results$reps > 1]))
 
+# number synonymous
+
 # N and S overall
 sum(results[, "nsyn"], na.rm=TRUE) # N
 sum(results[, "syn"], na.rm=TRUE) # S
@@ -293,13 +360,16 @@ sum(results[, "syn"], na.rm=TRUE) # S
 sum(results[results$reps > 1, "nsyn"], na.rm=TRUE) # N
 sum(results[results$reps > 1, "syn"], na.rm=TRUE) # S
 
-###
-# Determine expected frequency of N and S
-###
+## Determine background frequency of mutations
 
 a <- alphabetFrequency(ref, as.prob=TRUE)[1:4] # base frequency
 subs # substitutions [from, to]
 
+pval <- 0.001 # p-value to reach statistical significance
+qpois(pval/width(ref), # multiple testing correction
+	subs/(a*width(ref)), # probability of substitution
+	lower=FALSE) # upper tail of the distribution
+
 t <- subseq(rep(ref, length(protein_s)),
 	protein_s,
 	protein_e)
@@ -330,9 +400,7 @@ S
 N
 N/(S + N) # fraction N expected
 
-###
-# Draw Figure 2a
-###
+## Draw Figure 2a
 
 arrow <- 300 # maximum arrow length
 h <- 0.1 # height of gene bars
@@ -351,9 +419,9 @@ plot(NA,
 
 # add conserved regions
 zeros <- rep(0, width(ref))
-zeros[results$ref_pos] <- 1 # mutations
+zeros[results$ref_pos] <- results$reps # mutations
 zeros <- movavg(zeros)
-zeros <- rle(zeros < 0.06) # absence of mutations
+zeros <- rle(zeros < 0.2) # absence of mutations
 ranges <- cumsum(zeros$lengths)
 zeros <- which(zeros$values &
 	zeros$lengths >= 100)
@@ -478,7 +546,7 @@ segments(protein_e[w],
 	h)
 xs <- (protein_s + protein_e)/2
 xs <- xs + 540 # center the labels
-w <- which(names(xs) %in% c("nsp7", "nsp9", "nsp11", "ORF6", "ORF7b"))
+w <- which(names(xs) %in% c("nsp7", "nsp9", "nsp11", "ORF6", "ORF7b")) # omit because too small
 text(x=xs[-w],
 	y=-h/10,
 	labels=names(protein_s)[-w],

diff --git a/README.txt b/README.txt
@@ -1,26 +1,34 @@
 README for ncov analysis presented in:
 "SARS-CoV-2 genome evolution exposes early human adaptations"
 
+# AUTHOR
+Erik S. Wright <[email protected]>
+
 # DEPENDENCIES
-R >= 3.15
-DECIPHER >= 2.14
+R >= 4.0
+DECIPHER >= 2.16.1
 
 # TO RUN
-Open AnalyzeSequences_v5.R
+Install R then DECIPHER
+Open AnalyzeSequences_v6.R
 Set working directory to path to ncov
 Source the code in R
 
 # FILES
-AnalyzeSequences_v5.R = MAIN SCRIPT to perform all analyses
+AnalyzeSequences_v6.R = MAIN SCRIPT to perform all analyses
 coordinates_v1.R = Positions of features in reference genome
-gisaid_cov2020_sequences-Apr14.fasta.gz = FASTA file with all genomes except reference
+gisaid_cov2020_sequences-May2.fasta.gz = FASTA file with all genomes
 map_v1.R = Function for mapping substitutions on the phylogenetic tree
+movavg_v1.R = Function for performing center-point exponential moving averaging
 NC_045512.2.fas = FASTA file with reference genome
-results_v3.csv = Results of the analysis
+results_v5.csv = Results of the analysis
+metadata_v1.tsv = Tab delimited matrix of metadata and acknowledgements
 
 # CHANGING THE DATASET
 To change the dataset, simply change the file name of:
-gisaid_cov2020_sequences-Apr14.fasta.gz
+gisaid_cov2020_sequences-May2.fasta.gz
+and:
+metadata_v1.tsv
 in:
-AnalyzeSequences_v5.R
-All analyses should (hopefully) work, but figures might need some adjustment.
+AnalyzeSequences_v6.R
+All analyses should (hopefully) still work, but figures will need some adjustment.
diff --git a/gisaid_cov2020_sequences-Apr14.fasta.gz → gisaid_cov2020_sequences-May2.fasta.gz b/gisaid_cov2020_sequences-Apr14.fasta.gz → gisaid_cov2020_sequences-May2.fasta.gz