-
Notifications
You must be signed in to change notification settings - Fork 0
/
trim-batch.rb
executable file
·192 lines (167 loc) · 6.76 KB
/
trim-batch.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
#!/usr/bin/env ruby
#
# trim-batch
#
# Take in a set of fastq files and run khmer trimmomatic on each
#
# You MUST specify the location of the trimmomatic JAR
#
# Richard Smith ([email protected])
#
require 'rubygems'
require 'trollop'
require 'csv'
TRIMPREFIX = 't.'
UNPAIREDPREFIX = 'u.'
opts = Trollop::options do
version "v0.0.1a"
banner <<-EOS
trim-batch: run trimmomatic on multiple fastq files.
Single-end and/or paired-end can be included.
Trimmed files are outputted as '#{TRIMPREFIX}<input filename>'.
Paired reads whose pair is discarded are outputted as '#{TRIMPREFIX}#{UNPAIREDPREFIX}<input filename>'.
Log will be printed to <data><time>.trim.csv
Make sure Trimmomatic is installed
EOS
opt :pairedfile, "A file of paired-end FASTQ file paths, 1 per line, paired files in F, R order", :type => String
opt :paired, "A list of colon separated paired input FASTQ file paths, paired files in F, R order", :type => String
opt :singlefile, "A file of single-end FASTQ file paths, 1 per line", :type => String
opt :single, "A list of colon separated single-end input FASTQ file paths", :type => String
opt :jar, "Location of the trimmomatic jar file", :required => true, :type => String
opt :adapters, "Path to adapter FASTA file. If provided, adapters will be trimmed", :type => String
opt :phred, "Quality encoding. Either 33 or 64", :type => :int, :required => true
opt :leading, "Minimum quality required to keep a leading base", :default => 15, :type => Integer
opt :trailing, "Minimum quality required to keep a trailing base", :default => 15, :type => Integer
opt :windowsize, "Size of sliding window across which to average quality", :default => 4, :type => Integer
opt :quality, "Quality cutoff to use in sliding window trimming", :default => 15, :type => Integer
opt :minlen, "Minimum length of reads (any shorter than this after trimming are discarded)", :default => 60, :type => Integer
opt :threads, "How many threads to use for trimmomatic", :default => 1, :type => :int
opt :cleanup, "Remove input files after they are processed"
opt :output, "Directory the output files go to", :type => String
opt :test, "Don't actually run anything"
opt :verbose, "Be verbose"
end
Trollop::die :phred, "must be 33 or 64" if (opts[:phred]!=33 and opts[:phred]!=64) if opts[:phred]
t0 = Time.now
pairedlist, singlelist = [], []
# check inputs
if (opts.pairedfile && opts.paired) || (opts.singlefile && opts.single)
abort "Choose either file or list input but not both"
end
# check list file and load if OK
def check_list_file(file, outlist)
unless File.exists?(file)
raise "Can't find file \"#{opts.input}\""
end
File.open(file, "r").each_line do |line|
unless line.nil?
outlist << line.chomp
end
end
end
check_list_file(opts.pairedfile, pairedlist) if opts.pairedfile
check_list_file(opts.singlefile, singlelist) if opts.singlefile
# check list and load if OK
def check_list(inlist, outlist)
p inlist, outlist
outlist += inlist.split(":")
p outlist
outlist.map! { |file| File.expand_path(file) }
outlist.each do |file|
unless File.exists?(file)
raise "Can't find file \"#{file}\""
end
end
end
check_list(opts.paired, pairedlist) if opts.paired
check_list(opts.single, singlelist) if opts.single
phred = "-phred#{opts.phred}"
# build command(s)
pairedcmd, singlecmd = nil, nil
if opts.paired || opts.pairedfile
pairedcmd = "java -jar #{opts.jar} PE #{phred} -threads #{opts.threads} INFILEF INFILER OUTFILEF OUTFILEFU OUTFILER OUTFILERU"
pairedcmd += " ILLUMINACLIP:#{opts.adapters}:2:40:15" if opts.adapters
pairedcmd += " LEADING:#{opts.leading} TRAILING:#{opts.trailing} SLIDINGWINDOW:#{opts.windowsize}:#{opts.quality} MINLEN:#{opts.minlen}"
end
if opts.single || opts.singlefile
singlecmd = "java -jar #{opts.jar} SE #{phred} -threads #{opts.threads} INFILE OUTFILE"
singlecmd += " ILLUMINACLIP:#{opts.adapters}:2:40:15" if opts.adapters
singlecmd += " LEADING:#{opts.leading} TRAILING:#{opts.trailing} SLIDINGWINDOW:#{opts.windowsize}:#{opts.quality} MINLEN:#{opts.minlen}"
end
paired_trimlog = []
unpaired_trimlog = []
inpathf=""
# trim
pairedlist.each_slice(2) do |infilef, infiler|
cmd = pairedcmd
cmd = cmd.gsub(/INFILEF/, infilef)
cmd = cmd.gsub(/INFILER/, infiler)
if opts.output
inpathf = opts.output
inpathr = opts.output
infilef = File.basename(infilef)
infiler = File.basename(infiler)
else
inpathf = File.dirname(infilef)
infilef = File.basename(infilef)
inpathr = File.dirname(infiler)
infiler = File.basename(infiler)
end
cmd = cmd.gsub(/OUTFILEF/, "#{inpathf}/#{TRIMPREFIX}#{infilef}")
cmd = cmd.gsub(/OUTFILEFU/, "#{inpathf}/#{TRIMPREFIX}#{UNPAIREDPREFIX}#{infilef}")
cmd = cmd.gsub(/OUTFILER/, "#{inpathr}/#{TRIMPREFIX}#{infiler}")
cmd = cmd.gsub(/OUTFILERU/, "#{inpathr}/#{TRIMPREFIX}#{UNPAIREDPREFIX}#{infiler}")
puts "trimming #{infilef} and #{infiler}"
ret = `#{cmd} 2>&1` if !opts.test
puts cmd if opts.verbose
if !opts.test
ret.split('\n').each do |line|
next unless line =~ /^Input/
data = /Input Read Pairs: (?<input_reads>\d+) Both Surviving: (?<both_kept>\d+) \((?<both_kept_pc>[^\)]+)\) Forward Only Surviving: (?<fwd_kept>\d+) \((?<fwd_kept_pc>[^\)]+)\) Reverse Only Surviving: (?<rev_kept>\d+) \((?<rev_kept_pc>[^\)]+)\) Dropped: (?<dropped>\d+) \((?<dropped_pc>[^\)]+)\)/.match(line)
logline = Hash[data.names.zip(data.captures)]
logline['file'] = infilef
paired_trimlog << logline
end
end
if opts.cleanup
File.delete infilef
File.delete infiler
end
end
singlelist.each do |infile|
cmd = singlecmd
cmd = cmd.gsub(/INFILE/, infile)
inpath = File.dirname(infile)
infile = File.basename(infile)
cmd = cmd.gsub(/OUTFILE/, "#{inpath}/#{TRIMPREFIX}#{infile}")
puts "trimming #{infile}"
ret = `#{cmd} 2>&1`
p ret
ret.split(/\n/).each do |line|
p line
next unless line =~ /^Input/
data = /Input Reads: (?<input_reads>\d+) Surviving: (?<kept>\d+) \((?<kept_pc>[^\)]+)\) Dropped: (?<dropped>\d+) \((?<dropped_pc>[^\)]+)\)/.match(line)
logline = Hash[data.names.zip(data.captures)]
logline['file'] = infile
unpaired_trimlog << logline
end
# File.delete infile if opts.cleanup
end
datestr = Time.now.strftime('%d_%m_%Y_%H_%M_%S')
protfile = "trim.#{datestr}.protocol"
puts "Saving protocol to #{protfile}"
File.open(protfile, 'w') {|io| io.puts opts }
logsuffix = "#{datestr}.trim.csv"
def writelog(logarr, logfile)
return if logarr.length == 0
header = logarr.first.keys
CSV.open(logfile, 'w') do |log|
log << header
logarr.each do |line|
log << header.map { |h| line[h] }
end
end
end
writelog(paired_trimlog, "paired.#{logsuffix}")
writelog(unpaired_trimlog, "unpaired.#{logsuffix}")
puts "Done! Trimmed #{pairedlist.length + singlelist.length} files in #{Time.now - t0} seconds"