forked from tbenavi1/genomescope2.0
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenomescope.R
executable file
·238 lines (197 loc) · 10.7 KB
/
genomescope.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
#!/usr/bin/env Rscript
## GenomeScope: Fast Genome Analysis from Unassembled Short Reads
## This is the automated script for computing genome characteristics
## from a kmer histogram file, k-mer size, and ploidy
## Load libraries for non-linear least squares and argument parser
library(minpack.lm)
library(argparse)
library(jsonlite)
## Load the genomescope library
library(genomescope)
## Number of rounds before giving up
NUM_ROUNDS=4
## Coverage steps to trim off between rounds
START_SHIFT=5
## Typical cutoff for sequencing error
TYPICAL_ERROR = 15
## Max rounds on NLS
MAX_ITERATIONS=200
## Overrule if two scores are within this percent (0.05 = 5%) but larger difference in het
SCORE_CLOSE = 0.20
## Overrule heterozygosity if there is a large difference in het rate
SCORE_HET_FOLD_DIFFERENCE = 10
## Suppress the warnings if the modeling goes crazy, those are in try/catch blocks anyways
options(warn=-1)
## Colors for plots
COLOR_BGCOLOR = "light grey"
COLOR_HIST = "#56B4E9"
COLOR_2pPEAK = "black"
COLOR_pPEAK = "#F0E442"
COLOR_ERRORS = "#D55E00"
COLOR_KMERPEAK = "black"
COLOR_RESIDUAL = "purple"
COLOR_COVTHRES = "red"
## Given mean +/- stderr, report min and max value within 2 SE
###############################################################################
min_max <- function(table) {
return (c(max(0,table[1] - 2*table[2]), table[1]+ 2*table[2]))
}
min_max1 <- function(table) {
return (c(max(0,table[1] - 2*table[2]), min(1, table[1]+ 2*table[2])))
}
## Main program starts here
###############################################################################
parser <- ArgumentParser()
parser$add_argument("-v", "--version", action="store_true", default=FALSE, help="print the version and exit")
parser$add_argument("-i", "--input", help = "input histogram file")
parser$add_argument("-o", "--output", help = "output directory name")
parser$add_argument("-p", "--ploidy", type = "integer", default = 2, help = "ploidy (1, 2, 3, 4, 5, or 6) for model to use [default 2]")
parser$add_argument("-k", "--kmer_length", type = "integer", default = 21, help = "kmer length used to calculate kmer spectra [default 21]")
parser$add_argument("-n", "--name_prefix", default = "", help = "optional name_prefix for output files")
parser$add_argument("-l", "--lambda", "--kcov", "--kmercov", type = "integer", default=-1, help = "optional initial kmercov estimate for model to use")
parser$add_argument("-m", "--max_kmercov", type = "integer", default=-1, help = "optional maximum kmer coverage threshold (kmers with coverage greater than max_kmercov are ignored by the model)")
parser$add_argument("--verbose", action="store_true", default=FALSE, help = "optional flag to print messages during execution")
parser$add_argument("--no_unique_sequence", action="store_true", default=FALSE, help = "optional flag to turn off yellow unique sequence line in plots")
parser$add_argument("--json_report", action="store_true", default=FALSE, help = "write a JSON format report file in addition to the text summary")
parser$add_argument("-t", "--topology", type = "integer", default = 0, help = "ADVANCED: flag for topology for model to use")
parser$add_argument("--initial_repetitiveness", type="character", default = -1, help = "ADVANCED: flag to set initial value for repetitiveness")
parser$add_argument("--initial_heterozygosities", type="character", default = -1, help = "ADVANCED: flag to set initial values for nucleotide heterozygosity rates")
parser$add_argument("--transform_exp", type="integer", default=1, help = "ADVANCED: parameter for the exponent when fitting a transformed (x**transform_exp*y vs. x) kmer histogram [default 1]")
parser$add_argument("--testing", action="store_true", default=FALSE, help = "ADVANCED: flag to create testing.tsv file with model parameters")
parser$add_argument("--true_params", type="character", default = -1, help = "ADVANCED: flag to state true simulated parameters for testing mode")
parser$add_argument("--trace_flag", action="store_true", default=FALSE, help = "ADVANCED: flag to turn on printing of iteration progress of nlsLM function")
parser$add_argument("--num_rounds", type = "integer", default = NUM_ROUNDS, help = "ADVANCED: parameter for the number of optimization rounds")
parser$add_argument("--fitted_hist", action="store_true", default=FALSE, help = "ADVANCED: generates a fitted histogram for kmer multiplicity 0-4 and a lookup table of probabilities")
parser$add_argument("--start_shift", type = "integer", default=START_SHIFT, help = "ADVANCED: coverage shifts to exclude between fitting rounds")
parser$add_argument("--typical_error", type = "integer", default=TYPICAL_ERROR, help = "ADVANCED: typical level of sequencing error")
arguments <- parser$parse_args()
version_message <- "GenomeScope 2.0\n"
if (arguments$version) {
cat(version_message)
quit()
}
if (is.null(arguments$input) | is.null(arguments$output)) {
cat("USAGE: genomescope.R -i input_histogram_file -o output_dir -p ploidy -k kmer_length\n")
cat("OPTIONAL PARAMETERS: -n 'name_prefix' -l lambda -m max_kmercov --verbose --no_unique_sequence\n")
cat("ADVANCED PARAMETERS: -t topology --initial_repetitiveness init_d --initial_heterozygosities init_r1,init_r2,...,init_rx --transform_exp t_exp --testing --true_params --trace_flag --num_rounds --fitted_hist\n")
cat("HELP: genomescope.R --help\n")
} else {
## Load the arguments from the user
histfile <- arguments$input
foldername <- arguments$output
p <- arguments$ploidy
k <- arguments$kmer_length
if (arguments$name_prefix != "") {
arguments$name_prefix = paste0(arguments$name_prefix,"_")
}
estKmercov <- arguments$lambda
max_kmercov <- arguments$max_kmercov
VERBOSE <- arguments$verbose
NO_UNIQUE_SEQUENCE <- arguments$no_unique_sequence
topology <- arguments$topology
d_init <- arguments$initial_repetitiveness
r_inits <- arguments$initial_heterozygosities
transform_exp <- arguments$transform_exp
TESTING <- arguments$testing
TRUE_PARAMS <- arguments$true_params
TRACE_FLAG <- arguments$trace_flag
NUM_ROUNDS <- arguments$num_rounds
FITTED_HIST <- arguments$fitted_hist
START_SHIFT <- arguments$start_shift
TYPICAL_ERROR <- arguments$typical_error
cat(paste("GenomeScope analyzing ", histfile, " p=", p, " k=", k, " outdir=", foldername, "\n", sep=""))
dir.create(foldername, showWarnings=FALSE)
## Initialize the status
progressFilename <- paste(foldername,"/", arguments$name_prefix, "progress.txt",sep="")
cat("starting", file=progressFilename, sep="\n")
kmer_prof <- read.csv(file=histfile,sep="", header=FALSE,colClasses=c("numeric","numeric"))
minkmerx = 1;
if (kmer_prof[1,1] == 0) {
if (VERBOSE) {cat("Histogram starts with zero, reseting minkmerx\n")}
minkmerx = 2;
}
kmer_prof_orig <- kmer_prof # kmer_prof_orig will now store all including the last position
kmer_prof <- kmer_prof[c(minkmerx:(nrow(kmer_prof)-1)),] #get rid of the last position
## try to find the local minimum between errors and the first (heterozygous) peak
kmer_trans = as.numeric(kmer_prof[,1])**transform_exp*as.numeric(kmer_prof[,2])
start <- tail(which(kmer_trans[1:TYPICAL_ERROR]==min(kmer_trans[1:TYPICAL_ERROR])),n=1)
start_max <- start + which(kmer_trans[start:length(kmer_trans)]==max(kmer_trans[start:length(kmer_trans)])) - 1
maxCovIndex = -1
## Figure out which kmers to exclude, if any
if(max_kmercov == -1) {
maxCovIndex <- length(kmer_prof[,1])
max_kmercov <- kmer_prof[maxCovIndex,1]
}
else {
## Figure out the index we should use for this coverage length
x <- kmer_prof[,1]
maxCovIndex <- length(x[x<=max_kmercov])
}
if (VERBOSE) {cat(paste("using max_kmercov:", max_kmercov, " with index:", maxCovIndex, "\n"))}
# terminate after NUM_ROUND iterations, store best result so far in container
round <- 0
best_container <- list(NULL,0)
while(round < NUM_ROUNDS) {
cat(paste("round", round, "trimming to", start, "trying 2p peak model... "), file=progressFilename, sep="", append=TRUE)
if (VERBOSE) {cat(paste("round", round, "trimming to", start, "trying 2p peak model... \n"))}
## Reset the input trimming off low frequency error kmers
kmer_prof=kmer_prof_orig[1:maxCovIndex,]
x <- kmer_prof[start:maxCovIndex,1]
y <- kmer_prof[start:maxCovIndex,2]
model_peaks <- estimate_Genome_peakp(kmer_prof, x, y, k, p, topology, estKmercov, round, foldername, arguments)
if (!is.null(model_peaks[[1]])) {
cat(paste("converged. score: ", model_peaks[[2]]$all[[1]]), file=progressFilename, sep="\n", append=TRUE)
if (VERBOSE) {
mdir = paste(foldername, "/round", round, sep="")
dir.create(mdir, showWarnings=FALSE)
report_results(kmer_prof,kmer_prof_orig, k, p, model_peaks, mdir, arguments, TRUE)
}
}
else {
cat(paste("unconverged"), file=progressFilename, sep="\n", append=TRUE)
}
#check if this result is better than previous
if (!is.null(model_peaks[[1]])) {
if (is.null(best_container[[1]])) {
if (VERBOSE) {cat("no previous best, updating best\n")}
best_container = model_peaks
}
else {
best_container_score = best_container[[1]]$m$deviance()
model_peaks_score = model_peaks[[1]]$m$deviance()
pdiff = abs(model_peaks_score - best_container_score) / max(model_peaks_score, best_container_score)
if (pdiff < SCORE_CLOSE) {
hetm = model_peaks[[1]]$ahet
hetb = best_container[[1]]$ahet
#if (hetb * SCORE_HET_FOLD_DIFFERENCE < hetm) {
if (hetb + 0.01 < hetm) {
if (VERBOSE) {cat("model has significantly higher heterozygosity but similar score, overruling\n")}
}
#else if (hetm * SCORE_HET_FOLD_DIFFERENCE < hetb) {
else if (hetm + 0.01 < hetb) {
if (VERBOSE) {cat("previous best has significantly higher heterozygosity and similar score, keeping\n")}
best_container = model_peaks
}
else if (model_peaks_score < best_container_score) {
if (VERBOSE) {cat("score is marginally better but het rate is not extremely different, updating\n")}
best_container = model_peaks
}
}
else if (model_peaks_score < best_container_score) {
if (VERBOSE) {cat("score is significantly better, updating\n")}
best_container = model_peaks
}
}
}
## Ignore a larger number of kmers as errors
start <- start + START_SHIFT
round <- round + 1
}
## Report the results, note using the original full profile
report_results(kmer_prof,kmer_prof_orig, k, p, best_container, foldername, arguments, FALSE)
# if (!is.null(best_container[[1]])) {
# print('model score')
# print(best_container[[2]]$all[[1]])
# print(best_container[[1]]$m$deviance())
# }
}