Skip to content

Commit

Permalink
Updated code and dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
Erik Wright authored and Erik Wright committed May 7, 2020
1 parent a697ca3 commit 369ce86
Show file tree
Hide file tree
Showing 6 changed files with 18,315 additions and 3,344 deletions.
164 changes: 116 additions & 48 deletions AnalyzeSequences_v5.R → AnalyzeSequences_v6.R
Original file line number Diff line number Diff line change
@@ -1,46 +1,75 @@
library(DECIPHER) # MUST INSTALL DECIPHER FIRST
## Load library

library(DECIPHER)

sessionInfo() # NEED DECIPHER >= v2.16.1 !!!!!!!!

# need to set the working directory
setwd("<<path to ncov directory>>/") # needs trailing slash

###
# Load files
###
## Load files

source('./RobustRegression_v1.R')
source('./map_v1.R')
source('./movavg_v1.R')
source('./coordinates_v1.R')
meta <- read.table("./metadata_v1.tsv",
sep="\t",
header=TRUE,
quote="")
ref <- readDNAStringSet("./NC_045512.2.fas")
dna <- readDNAStringSet("./gisaid_cov2020_sequences-Apr14.fasta.gz") # expect warning for ignored spaces
dna <- readDNAStringSet("./gisaid_cov2020_sequences-May2.fasta.gz") # expect warning for ignored spaces

cols <- c(A="#C53932", C="#56BBCC", G="#529D3F", T="#D57FBF", U="#F08536")
cols <- c(A="#C53932", C="#56BBCC", G="#529D3F", T="#D57FBF", U="#F08536") # color vector

# prepare genomes and record date of collection
# prepare genomes
dna <- RemoveGaps(dna)
a <- alphabetFrequency(dna, baseOnly=TRUE)
w <- which(a[, "other"] <= 500) # too many degeneracies
dna <- dna[w]
print(length(dna)) # total number of genomes

# record date of collection
ns <- strsplit(names(dna), "|", fixed=TRUE)
date <- sapply(ns, tail, n=1)
date <- as.Date(date, "%Y-%m-%d")
min_date <- min(date, na.rm=TRUE)
day <- date - min_date
sum(!is.na(day)) # number with collection date

###
# Create Figure 1
###
# record lab of origin
ns <- strsplit(names(dna), "/", fixed=TRUE)
laboratory <- sapply(ns, `[`, 2L)
s <- strsplit(names(dna), "|", fixed=TRUE)
s <- sapply(s, `[`, 1L)
s <- substring(s, nchar("hCoV-19/") + 1L)
s <- gsub(" ", "", s)
m <- match(s, meta$strain)
w <- !is.na(m)
laboratory[w] <- meta$originating_lab[m[w]]

## Create Figure 1

muts <- rep(NA_real_, length(dna))
pBar <- txtProgressBar(style=3)
DNA2 <- DNAStringSet(character(length(dna)))
for (i in seq_along(dna)) {
DNA <- AlignProfiles(dna[i], ref)
#DNA <- subseq(DNA, 61, width(DNA) - 60)
d <- DistanceMatrix(DNA,
type="dist",
verbose=FALSE,
penalizeGapLetterMatches=FALSE)
t <- TerminalChar(DNA)
muts[i] <- d[1]*(width(DNA)[1] - max(t[, 1]) - max(t[, 2]))

s <- strsplit(as.character(DNA), "")
w <- which(s[[2]] != "-")
DNA2[i] <- paste(s[[1]][w], collapse="")
setTxtProgressBar(pBar, i/length(dna))
}
DNA <- DNA2
names(DNA) <- names(dna)
DNA2 <- c(ref, DNA2)

dev.new(height=3.5, width=4)
p <- par(mar=c(4, 4, 2, 1))
Expand Down Expand Up @@ -68,13 +97,17 @@ text(0,
pos=4)
par(p)

###
# Map independent substitutions (Fig. 2b)
###
## Map independent substitutions

DNA <- AlignSeqs(dna,
processors=NULL)
DNA2 <- AlignProfiles(ref, DNA)
# CODE BLOCK TO CREATE A MULTIPLE ALIGNMENT
# INSTEAD OF USING THE ALIGNMENT FROM ABOVE
#u_dna <- unique(dna) # the unique input sequences
#index <- match(dna, u_dna) # de-replication index
#U_DNA <- AlignSeqs(u_dna, processors=NULL)
#DNA <- U_DNA[index]
#names(DNA) <- names(dna) # the re-replicated alignment
#DNA2 <- AlignProfiles(ref, DNA)
# END CODE BLOCK

d <- DistanceMatrix(DNA,
type="dist",
Expand All @@ -84,31 +117,32 @@ c <- IdClusters(d,
method="NJ",
processors=NULL,
type="dendrogram",
collapse=-1)
collapse=-1,
root=which(dna==ref)[1])
c <- reorder(c, rev(unlist(c)))
o <- unlist(c)

dev.new(width=8.5, height=7)
p <- par(mar=c(0.1, 0.1, 10, 30))
dev.new(width=8.5, height=7.7)
p <- par(mar=c(0.1, 0.1, 12.5, 30))
plot(c, leaflab="none", yaxt="n", horiz=TRUE)

dx <- par("xaxp")
dx <- dx[1] - dx[2]
dy <- par("yaxp")
dy <- dy[2] - dy[1]
x <- par("pin")
segments(0.001,
0.98*dy,
0.001,
0.98*dy + 2e-4/dx*dy*x[1]/x[2])
text(0.001,
1.093*dy,
"0.0002",
segments(0.00194,
0.99*dy - 4e-4/dx*dy*x[1]/x[2]/2,
0.00194,
0.99*dy + 4e-4/dx*dy*x[1]/x[2]/2)
text(0.00194,
1.047*dy,
"0.0004",
pos=2,
srt=90)

delta <- -6e-5
space <- -2e-5
delta <- -10e-5
space <- -4e-5
min_width <- length(o)*0.001 # set min_width = 1 to max all rectanges exact below leaves, but might be invisible because too small to render
sep <- 1 # minimum horizontal separation on the tree
count <- 0L # number of parallel changes
Expand All @@ -122,6 +156,9 @@ results <- data.frame(ref_pos=integer(),
syn=integer(),
gene=character(),
aa=character(),
size=integer(),
tot=integer(),
labs=integer(),
stringsAsFactors=FALSE)

subs <- matrix(0L,
Expand All @@ -141,16 +178,36 @@ for (i in seq_len(l)) {
# number of cases of parallel mutation
w1 <- which(s[-1][o] %in% DNA_BASES)
w2 <- which(s[-1][o][w1] != s[1])
#BrowseSeqs(subseq(DNA2[c(1, o[w1[w2]] + 1)], ref_pos - 5, ref_pos + 5), hi=1, htmlFile=paste0(tempdir(), "/", ref_pos, ".html"))
if (length(w2)==0) {
reps <- 0
} else {
# need to resolve substitutions on the tree
reps <- map(tree=c,
reps <- map(tree=unclass(c),
bases=as.character(s[-1]),
ancestor=as.character(s[1]))
}

if (reps > 0) {
# determine largest consecutive group
runs <- rle(diff(w1[w2]))
w <- which(runs$values==1)
if (length(w)==0) {
runs <- 1L # no groups
} else {
# print sequences by clade
#cs <- cumsum(runs$lengths) + 1
#for (j in seq_along(w)) {
# print(names(dna)[o][w1[w2[seq(to=cs[w[j]],
# length.out=runs$lengths[w[j]] + 1)]]])
#}
runs <- max(runs$lengths[w]) + 1L
}

# determine number of originating laboratories
labs <- laboratory[o[w1[w2]]]
labs <- length(unique(labs))

w <- which(s[-1][o] != s[1] &
s[-1][o] %in% DNA_BASES)
start <- tail(cds[cds <= ref_pos], n=1)
Expand Down Expand Up @@ -195,6 +252,9 @@ for (i in seq_len(l)) {
ceiling((ref_pos - gene + 1)/3),
paste(nsyn,
collapse="/")),
size=runs,
tot=length(w2),
labs=labs,
stringsAsFactors=FALSE))
nsyn <- length(nsyn)
} else { # non-coding region
Expand All @@ -207,16 +267,21 @@ for (i in seq_len(l)) {
syn=NA_integer_,
gene=NA_character_,
aa=NA_character_,
size=runs,
tot=length(w2),
labs=labs,
stringsAsFactors=FALSE))
}

fromto <- cbind(as.character(s[1]),
as.character(unique(s[-1][o][w1][w2])))
subs[fromto] <- subs[fromto] + 1L

if (reps >= 7 && # evidence of parallelism is >= N
ref_pos > 33 && # 5'UTR > 265
ref_pos < 29675) { # 3'UTR < 29675
# plot evidence for the substitution
if (reps >= 10 && # evidence of parallelism is >= N
ref_pos >= 50 && # 5'UTR > 265
ref_pos <= 29853 && # 3'UTR < 29675
labs >= 4) { # at least N originating labs
count <- count + 1L
rect(delta*(count - 1) + space*count,
seq_along(DNA)[w] - min_width/2,
Expand All @@ -232,7 +297,7 @@ for (i in seq_len(l)) {
ref_pos,
paste(ifelse(vals=="T", "U", vals),
collapse="/"),
"(",
" (",
reps,
")")
if (!is.null(codons)) {
Expand All @@ -254,6 +319,8 @@ for (i in seq_len(l)) {
tail(results$aa, n=1),
")")
}
lab <- paste(lab, " {", runs, "}", sep="")

text(x=delta*(count - 1.2) + space*count,
y=length(o)*1.01,
labels=lab,
Expand All @@ -266,12 +333,10 @@ for (i in seq_len(l)) {
}
par(p)

#write.csv(results,
# file="./results_v3.csv")
write.csv(results,
file="~/Desktop/Virus/ncov/data/genomes/results_v5.csv")

###
# Compute statistics including dN/dS
###
## Compute statistics including dN/dS

# total number of mutations
nrow(results)
Expand All @@ -285,6 +350,8 @@ sum(results$reps > 1)
# number independent in non-coding regions
sum(!is.na(results$syn[results$reps > 1]))

# number synonymous

# N and S overall
sum(results[, "nsyn"], na.rm=TRUE) # N
sum(results[, "syn"], na.rm=TRUE) # S
Expand All @@ -293,13 +360,16 @@ sum(results[, "syn"], na.rm=TRUE) # S
sum(results[results$reps > 1, "nsyn"], na.rm=TRUE) # N
sum(results[results$reps > 1, "syn"], na.rm=TRUE) # S

###
# Determine expected frequency of N and S
###
## Determine background frequency of mutations

a <- alphabetFrequency(ref, as.prob=TRUE)[1:4] # base frequency
subs # substitutions [from, to]

pval <- 0.001 # p-value to reach statistical significance
qpois(pval/width(ref), # multiple testing correction
subs/(a*width(ref)), # probability of substitution
lower=FALSE) # upper tail of the distribution

t <- subseq(rep(ref, length(protein_s)),
protein_s,
protein_e)
Expand Down Expand Up @@ -330,9 +400,7 @@ S
N
N/(S + N) # fraction N expected

###
# Draw Figure 2a
###
## Draw Figure 2a

arrow <- 300 # maximum arrow length
h <- 0.1 # height of gene bars
Expand All @@ -351,9 +419,9 @@ plot(NA,

# add conserved regions
zeros <- rep(0, width(ref))
zeros[results$ref_pos] <- 1 # mutations
zeros[results$ref_pos] <- results$reps # mutations
zeros <- movavg(zeros)
zeros <- rle(zeros < 0.06) # absence of mutations
zeros <- rle(zeros < 0.2) # absence of mutations
ranges <- cumsum(zeros$lengths)
zeros <- which(zeros$values &
zeros$lengths >= 100)
Expand Down Expand Up @@ -478,7 +546,7 @@ segments(protein_e[w],
h)
xs <- (protein_s + protein_e)/2
xs <- xs + 540 # center the labels
w <- which(names(xs) %in% c("nsp7", "nsp9", "nsp11", "ORF6", "ORF7b"))
w <- which(names(xs) %in% c("nsp7", "nsp9", "nsp11", "ORF6", "ORF7b")) # omit because too small
text(x=xs[-w],
y=-h/10,
labels=names(protein_s)[-w],
Expand Down
26 changes: 17 additions & 9 deletions README.txt
Original file line number Diff line number Diff line change
@@ -1,26 +1,34 @@
README for ncov analysis presented in:
"SARS-CoV-2 genome evolution exposes early human adaptations"

# AUTHOR
Erik S. Wright <[email protected]>

# DEPENDENCIES
R >= 3.15
DECIPHER >= 2.14
R >= 4.0
DECIPHER >= 2.16.1

# TO RUN
Open AnalyzeSequences_v5.R
Install R then DECIPHER
Open AnalyzeSequences_v6.R
Set working directory to path to ncov
Source the code in R

# FILES
AnalyzeSequences_v5.R = MAIN SCRIPT to perform all analyses
AnalyzeSequences_v6.R = MAIN SCRIPT to perform all analyses
coordinates_v1.R = Positions of features in reference genome
gisaid_cov2020_sequences-Apr14.fasta.gz = FASTA file with all genomes except reference
gisaid_cov2020_sequences-May2.fasta.gz = FASTA file with all genomes
map_v1.R = Function for mapping substitutions on the phylogenetic tree
movavg_v1.R = Function for performing center-point exponential moving averaging
NC_045512.2.fas = FASTA file with reference genome
results_v3.csv = Results of the analysis
results_v5.csv = Results of the analysis
metadata_v1.tsv = Tab delimited matrix of metadata and acknowledgements

# CHANGING THE DATASET
To change the dataset, simply change the file name of:
gisaid_cov2020_sequences-Apr14.fasta.gz
gisaid_cov2020_sequences-May2.fasta.gz
and:
metadata_v1.tsv
in:
AnalyzeSequences_v5.R
All analyses should (hopefully) work, but figures might need some adjustment.
AnalyzeSequences_v6.R
All analyses should (hopefully) still work, but figures will need some adjustment.
Binary file not shown.
Loading

0 comments on commit 369ce86

Please sign in to comment.