-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_fastq.pl
executable file
·217 lines (172 loc) · 7.25 KB
/
preprocess_fastq.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#!/usr/bin/perl -w
# Usage: see usage message
use Getopt::Long;
use tools;
use strict;
my $version=getversion($0);
warn "Running $0, version $version\n";
warn "Arguments: @ARGV\n";
my($fastq, $umi_len, $cbc_len, $polytrim, $CBCbeforeUMI, $read1,
$five, $three, $minlength, $help);
$minlength=20; # Below this the ratio multimappers/uniquehits
# starts to increase sharply
$CBCbeforeUMI=0; # CELSeq2
## -xytrim worked, but didn't help so was removed at after commit 26547a0374 (2016-11-02 16:44:17)
my $usage = "
Usage: $0 --fastq s_R1.fastq.gz,s_R2.fastq.gz --umi_len 6 --cbc_len 8 [ OPTIONS ] | gzip -n > s_cbc.fastq.gz
In CELSeq2, read1 contains (in that order) CBC, UMI, polyT, whereas read2
contains the mRNA. This script takes the CBC and UMI from read1, and
appends e.g.':cbc=TACTGCTG:umi=GTCTTT' onto the read-id. The resulting
FASTQ file is written to stdout (where it usually will be gzipped before
being written to disk)
The current protocols have an artefact that tends to produces long
stretches of polyA (and to a lesser extent polyT). Specifying
e.g. --polytrim=A=12,T=18 will delete any occurrence of AAAAAAAAAAAA.* and
TTTTTTTTTTTTTTTTTT.* from read2. The quality lines are trimmed in the same
way. (These numbers correspond to roughly 0.1% of the actual occurrences
in the human transcriptome). In addition, reads can be clipped using the
--five and --three options; this is done after the polytrimming.
If reads have become too short (see the --minlength option; default $minlength) as a result of
all the trimming steps, the read is discarded.
By default CELSeq2 is used, i.e. UMI precedes the cell bar code. Use -protocol=1
to swap them.
If also read1 should get its read-id changed, use the -read1 option; this will
write the amended reads to the (gzipped) fastq file (they will *not* be trimmed
if the -trim option is specified, because the meaningful information,
if any, is beyond the polyT stretch). Note that the transcript is part of read2,
so this option is prolly only meaningful when debugging script, lab protocol or both.
Arguments:
--fastq s_R1.fastq.gz,s_R2.fastq.gz # input files
--umi_len=6 # length of the UMI
--cbc_len=8 # length of the cell barcode
Options:
--read1 s_R1_cbc.fastqc.gz # also save the read1's with the new id
--polytrim=G12,A14 # trim read2 of any stretches of 12 G's and 14 A's (in that order) and beyond
--CBCbeforeUMI # CELseq2 has first the UMI, then the CBC, this option inverts that
--five=6 # Trim 6 nt from the 5'-side of read2
--three=8 # Trim 8 nt from the 3'-side of read2 (only for reads that were not polytrimmed)
--minlength=20 # Discard reads that have become too short (default: $minlength) after all the trimming
Heavily adapted by <plijnzaad\@gmail.com> from the original written by Lennart Kester.
";
die $usage unless GetOptions('fastq=s'=> \$fastq,
'umi_len=i'=> \$umi_len,
'cbc_len=i'=> \$cbc_len,
'read1=s'=> \$read1,
'polytrim=s' => \$polytrim,
'CBCbeforeUMI' => \$CBCbeforeUMI,
'five=i' => \$five,
'three=i' => \$three,
'minlength=i' => \$minlength,
'help|h' => \$help);
die $usage if $help;
die $usage unless $fastq && defined($umi_len) && defined($cbc_len);
my $regexps ={};
my @regexpids = (); # to maintain the order
if (defined($polytrim) && $polytrim && $polytrim !~ /^n(o(ne)?)?/i && $polytrim !~ /^f(alse)?/i ) {
my @oligos=split(',', $polytrim);
for my $oli (@oligos) {
my($nuc, $num)= ($oli =~ /^([ACGT])(\d+)/);
die "$0: expected string like --trim=A18,T18" unless $nuc && $num;
my $re = '(' . $nuc x $num . ".*)";
$regexps->{$nuc}= qr/$re/;
push(@regexpids, $nuc);
}
}
my $ntrimmed={};
my $ntrimmedtotal={};
for my $rid (@regexpids) { # rid=regexp-id
$ntrimmed->{$rid}=0;
$ntrimmedtotal->{$rid}=0;
}
die "$0: no -umi_len specified" unless $umi_len > 0; # length of the UMI
die "$0: no -cbc_len specified" unless $cbc_len > 0; # length of the cell bar code
my $prefix_len = $cbc_len + $umi_len;
die "$fastq: input files must be gzipped " unless $fastq =~ /\.gz,.*\.gz$/;
my @fastq = split(/\,/,$fastq);
my($IN1, $IN2);
open($IN1, "zcat $fastq[0] |") || die "$0: $fastq[0]: $!";
open($IN2, "zcat $fastq[1] |") || die "$0: $fastq[1]: $!";
if($read1) {
$read1 =~ s/\.fastq.*$//i;
open(READ1, " | gzip -n > $read1.fastq.gz ") || die "read1: $!";
}
my ($line1, $line2, $bar);
my $polytrimmedlen={};
my (@lines1, @lines2);
my $nreads=0;
my $ntooshort=0;
READ:
while( not eof $IN1 and not eof $IN2) {
$polytrimmedlen={};
for(my $i=0; $i<4;$i++) { # 4 lines at a time
$lines1[$i] = <$IN1>;
$lines2[$i] = <$IN2>;
}
die "expected '+' lines in files @fastq, line $."
unless $lines1[2] eq "+\n" && $lines2[2] eq "+\n";
### id line:
chomp($lines2[0]);
my($id, $rest)=split(' ',$lines2[0]);
my(undef, $rest1)=split(' ',$lines1[0]) if $read1;
### sequence line:
$bar = substr($lines1[1], 0, $prefix_len);
my $umi=substr($bar,0, $umi_len);
my $cbc=substr($bar, $umi_len, $cbc_len);
if ($CBCbeforeUMI) {
$cbc=substr($bar,0, $cbc_len);
$umi=substr($bar, $cbc_len, $umi_len);
}
$lines1[0] = "$id:cbc=$cbc:umi=$umi $rest1\n" if $read1;
$lines2[0] = "$id:cbc=$cbc:umi=$umi $rest\n";
## do polytrimming, if any:
my $seq2=$lines2[1];
chomp($seq2);
for my $rid (@regexpids) {
if( $seq2 =~ $regexps->{$rid} ) {
my $trimmed=length($1);
my $newlen=length($seq2) - $trimmed;
$seq2= substr($seq2,0, $newlen);
$polytrimmedlen->{$rid}=$newlen; # remember for the qual line
$ntrimmed->{$rid}++;
$ntrimmedtotal->{$rid} += $trimmed;
}
}
my $qual2=$lines2[3];
chomp($qual2);
# apply same trimming to phred qualities
for my $rid (@regexpids) {
if(exists($polytrimmedlen->{$rid})) {
$qual2= substr($qual2,0, $polytrimmedlen->{$rid});
}
}
## ordinary trimming:
if($five) {
$seq2 = substr($seq2, $five);
$qual2 = substr($qual2, $five);
}
if ($three && ! int(keys %$polytrimmedlen)) {
$seq2 = substr($seq2, 0, -$three);
$qual2 = substr($qual2, 0, -$three);
}
if ( length($seq2) < $minlength ) {
$ntooshort ++;
next READ;
}
$lines2[1]=$seq2 ."\n";
$lines2[3]=$qual2."\n";
print join("", @lines2);
print READ1 join("", @lines1) if $read1;
$nreads++;
} # READ
close $IN1 || die "$0: $fastq[0]: $!";
close $IN2 || die "$0: $fastq[1]: $!";
if ($read1) {
close (READ1) || die "$0: could not close (or open1) $read1: $!";
}
warn "preprocessed $nreads reads\n";
for my $rid (@regexpids) {
warn "trimmed $ntrimmed->{$rid} poly${rid}'s from the reads (totalling $ntrimmedtotal->{$rid} nucleotides)\n"
if exists($ntrimmed->{$rid}) && $ntrimmed->{$rid} > 0;
}
warn(sprintf("%d (%.1f%%) of the trimmed reads were trimmed to length smaller than %d and therefore discarded\n",
$ntooshort, $ntooshort/$nreads, $minlength));