-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpipeline_main.pl
287 lines (243 loc) · 11.1 KB
/
pipeline_main.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
#!/usr/bin/perl
use strict;
use Getopt::Long;
use Cwd 'abs_path';
use FindBin;
# Record the version number.
$main::version="2.069";
$main::versionDate="April 2012";
# Define some global variables.
$main::cwd = abs_path(".");
# Define required files and packages.
$main::codeDir= $FindBin::Bin;
require "$main::codeDir/bamtools.pm";
require "$main::codeDir/command_line.pm";
require "$main::codeDir/create_scripts.pm";
require "$main::codeDir/freebayes.pm";
require "$main::codeDir/general_tools.pm";
require "$main::codeDir/merge_tools.pm";
require "$main::codeDir/modules.pm";
require "$main::codeDir/mosaik.pm";
require "$main::codeDir/pair_statistics.pm";
require "$main::codeDir/reference.pm";
require "$main::codeDir/script_tools.pm";
require "$main::codeDir/search.pl";
require "$main::codeDir/sequence_index.pm";
require "$main::codeDir/software.pm";
require "$main::codeDir/status.pm";
require "$main::codeDir/target_regions.pm";
require "$main::codeDir/tools.pm";
# Set buffer to flush after every print statement.
$| = 1;
# Find the current time. This will be used in generating
# file names as well as for printout to output files.
($main::time,$main::day,$main::month,$main::year)=general_tools::findTime();
# Check command line arguments and throw an exception if
# information is missing or incorrect.
GetOptions('aligner=s' => \$main::aligner,
'bam-list=s' => \$main::bamList,
'date=s' => \$main::date,
'dir=s' => \$main::outputDirectory,
'divide:i' => \$main::targetRegionSize,
'divide-genome:s' => \$main::divideGenome,
'exome' => \$main::exome,
'fastq=s' => \$main::fastqDirectory,
'fastq-list=s' => \$main::fastqList,
'include-improper' => \$main::includeImproper,
'index=s' => \$main::indexFile,
'job-id=s' => \$main::jobID,
'keep-orig-quals' => \$main::keepOriginalQualities,
'local' => \$main::local,
'low-memory' => \$main::lowMemory,
'meta=s' => \$main::metaData,
'previous-date=s' => \$main::previousDate,
'previous-index=s' => \$main::previousIndex,
'snp=s' => \$main::snpCaller,
'map-Q-zero' => \$main::mapQ0,
'memory:s' => \$main::nodeMemory,
'mosaikv2' => \$main::mosaikVersion2,
'node:s' => \$main::nodeName,
'no-baq' => \$main::noBaq,
'no-bin-priors' => \$main::noBinPriors,
'no-bq-recal' => \$main::noBQRecal,
'no-complex' => \$main::noComplex,
'no-fastq-val' => \$main::noFastqValidation,
'no-indels' => \$main::noIndels,
'no-mnps' => \$main::noMnps,
'no-ogap' => \$main::noOgap,
'no-proper-pair' => \$main::noProperPair,
'threads:i' => \$main::threads,
'queue:s' => \$main::queue,
'reference=s' => \$main::reference,
'ref-seq=s' => \$main::referenceSequence,
'scratch=s' => \$main::scratch,
'snp-delimiter=s' => \$main::snpDelimiter,
'software=s' => \$main::softwareList,
'status:s' => \$main::scriptStatus,
'user=s' => \$main::userID,
'wall-time:s' => \$main::wallTime,
'h|help|?' => \$main::help)
|| command_line::pipelineHelp();
# Print the version number to screen.
print("\n=======================================\n");
print("Boston College variant calling pipeline\n");
print("Version $main::version - $main::versionDate\n\n$main::time\n");
print("=======================================\n\n");
# Provide usage information if the help option was specified.
if (defined $main::help) {command_line::pipelineHelp();}
# Initialise variables.
general_tools::initialise();
# Determine the user ID if not specified.
if (! defined $main::userID) {$main::userID = getlogin();}
# Define the name of the directory to keep all of the files generated by
# the pipeline.
general_tools::generateDirectory();
# If the status of scripts is requested, no script files will be
# generated, just the number of completed, queued, running and
# failed jobs will be displayed.
if (defined $main::scriptStatus) {status::determineScriptStatus();}
# Check that the specified aligner and associated options are valid.
command_line::checkAligner();
# Check that the specified SNP caller and associated options are valid.
command_line::checkSnpCaller();
# If SNP calling, determine reference sequences to call on and what
# size chunks to call on.
command_line::checkTargets();
# If a wall time is provided for the process, check that it is in
# the correct format.
command_line::checkWallTime();
# If a memory requirement for the node is requested, check it has
# the correct form.
command_line::checkMemory();
# If a bam list is defined, check that it exists, that all of the
# contents are bam files. If so, populate @main::bamList with the
# list of bam files.
if (defined $main::bamList) {command_line::checkBamList();}
# Define the software tools that can be run in the pipeline.
# For each tools, a list of parameters is also defined and the
# files that in creates are listed. If new tools are added to
# the pipeline, a new routine to run the package is required as
# well as an entry in this routine. Everything else will be
# automatically dealt with.
modules::defineModules();
# If a text file containing the paths for the software components
# is provided, read in the files and store.
modules::readSoftware();
# If a reference is specified, check that it exists as well as the jump
# database if Mosaik is being used. If not, set the default.
reference::reference();
software::aligners();
software::mergePipeline();
software::snpCallers();
target_regions::snpDelimiter();
$main::refSequences = bamtools::getReferenceSequences(\@main::bamList);
target_regions::defineRegions();
# Check if a date has been supplied. If not, the current date will be
# used in the generated filenames.
command_line::checkDate();
# Write to screen the options used. This is intended to make sure
# that the user is aware of everything being used so that errors
# can be identified and rectified.
command_line::displayOptions();
# If an older sequence.index file is provided, a search for release
# bams from the older sequence.index file will be performed. This
# search requires knowledge of the date of the previous index file.
# Thus, the old index file and the corresponding date must BOTH be
# provided. This is only required if alignments are being performed.
#if ($main::aligner ne "none") {CommandLine::IncrementalIndex();}
# If a previous sequence index is provided along with a current index file,
# read through the 'old' sequence.index file and build a hash table of the
# md5 check sums for the fastq files. Skip this step if running from a
# supplied bam list.
if (! defined $main::bamList) {
if (defined $main::previousIndex) {
$main::previousIndex = abs_path($main::previousIndex);
sequence_index::parsePreviousIndexFile();
}
# Read metadata to determine the parameters and fastq files to
# input into the pipeline. If a sequence.index file exists, this
# will be read. The entire file is parsed prior to creating any scripts.
# This is to ensure that all files associated with each run are
# accounted for and the relevant single-end, paired-end or both
# pipeline scripts can be created.
if (defined $main::indexFile) {
$main::indexFile = abs_path($main::indexFile);
sequence_index::parseIndexFile();
}
elsif (defined $main::metaData) {
$main::metaData = abs_path($main::metaData);
sequence_index::metaData();
}
else {
print("\n***SCRIPT TERMINATED***\n");
print("A sequence index or meta data file must be provided.\n");
die("Error in pipeline_main.pl.\n");
}
}
# Search for files already existing within the directory structure. Only search
# for files created by the defined aligner. Do not perform this search if a
# list of bam files has been provided.
if (! defined $main::bamList) {
print("Searching for existing files...");
find(\&fileSearch, $main::cwd); # Search.pl
print("done.\n");
# If a separate fastq directory is defined, also search within this directory.
if (defined $main::fastqDirectory && !defined $main::fastqList) {
print("Searching for fastq files...");
$main::fastqDirectory = abs_path($main::fastqDirectory);
find(\&findFastq, $main::fastqDirectory); # Search.pl
print("done.\n");
} elsif (defined $main::fastqList) {
command_line::checkFastqList();
} elsif (defined $main::fastqDirectory && $main::fastqList) {
print STDERR ("\n***SCRIPT TERMINATED***\n\n");
print STDERR ("Either define a fastq directory or a fastq list, but not both.");
print STDERR ("Error in pipeline_main.pm.\n");
}
# Now search for previous bam files in the specified bam directory.
# This step is only performef if the directory is specified.
if (defined $main::bamDirectory) {
$main::bamDirectory = abs_path($main::bamDirectory);
print("Searching for previous release bam files...");
find(\&fileSearch,$main::bamDirectory); # Search.pl
print("done.\n");
}
} else {
foreach my $bam (@main::bamList) {push(@main::completedBamFiles, $bam);}
}
# Ceunt the number of samples and sample x technology pairs that will be
# analysed by the pipeline.
merge_tools::countSamples();
# Perform a Check for each of the merged bam files. Specifically, if multiple
# bam files exist (i.e. different dates), choose in the following order:
# 1: Current date
# 2: Supplied previous date (if exists).
# 3: Whichever bam file is left (i.e. neither of these dates).
#
# If no bam file exists, mark the MergeInfo info string with the tag,
# exists:no. Otherwise include the tags: exists:yes, path:<path> and
# file:<file>.
#
# If there is an extant file, check the following aspects:
# 1: Are all runs in MergeInfo included in the file?
# 2: Are any other runs found in the file (e.g. withdrawn runs)?
# 3: Have the md5 tags of the runs changed?
merge_tools::selectMergedBam();
merge_tools::interrogateMergedBamFile();
# Check the arrays of failed jobs and include an additional status tag.
create_scripts::checkFailed();
# Now work through each sample x technology pair in turn, creating all of the
# necessary script files.
if ($main::aligner ne "none") {
create_scripts::createScripts();
create_scripts::printScriptStatistics();
} else {
foreach my $bam (@main::currentIncrementBams) {push(@main::completedBamFiles, $bam);}
foreach my $bam (@main::previousIncrementBams) {push(@main::completedBamFiles, $bam);}
foreach my $bam (@main::previousBams) {push(@main::completedBamFiles, $bam);}
foreach my $file (@main::existingFiles) {if ($file =~ /\.bam$/) {push(@main::completedBamFiles, $file);}}
}
# Now create SNP calling scripts if required.
if ($main::snpCaller ne "none") {
create_scripts::snpScripts();
}