-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathMakefile
407 lines (361 loc) · 15.6 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
SHELL := /bin/bash -euo pipefail
.DELETE_ON_ERROR:
RUNS := $(notdir $(shell find pipeline/ -mindepth 1 -maxdepth 1 -type d))
.PHONY: denovo stats all clean
denovo: $(addprefix pipeline/, $(addsuffix /spades-contigs.fasta, $(RUNS)))
stats: $(addprefix pipeline/, $(addsuffix /aggregated-stats.tsv, $(RUNS)))
all: stats
# cleanup
clean:
rm -rf pipeline/*
.SECONDARY:
#===============================================================================
# DeNovo seqWell ExpressPlex Pipeline
# ------------------------
# 1) Organize well samples by plate index
# 2) Preprocess the reads (trim, filter, error-correct, ...) according to JGI
# 3) De novo assemble the results
# 4) Align contigs to library to get well identity
#===============================================================================
# ---------------
# 1) Organize:
# ---------------
# create a new directory data to house sequencing data
# move all inputs to the new directory data
# create separate directories for each plate based on Sample_Plate column in SampleSheet.csv
pipeline/%/organize: pipeline/%/SampleSheet.csv
@echo "Organizing $(@D)"
@mkdir -p $(@D)/data
@awk '/\[Data\]/{f=1;next}f' $< \
| dos2unix \
| mlr --csvlite --headerless-csv-output cut -f Sample_Plate \
| sort -u \
| while IFS= read -r plate; do \
mkdir -p $(@D)/data/"$$plate"; \
mv $(@D)/"$$plate"*fastq* $(@D)/data/"$$plate"/; \
for i in $(@D)/data/"$$plate"/*fastq*; do \
mv "$$i" "$${i//$$plate-/}"; \
done; \
done
@mv -t $(@D)/data \
$(@D)/SampleSheet.csv \
$(@D)/Undetermined_R*.fastq.gz \
&& touch $@
# store original input references (if provided) in the newly created data directory
pipeline/%/data/input.fasta: pipeline/%/input.fasta pipeline/%/organize
@echo "Moving $< to $@"
@mv $< $@
# get list of plates to process
# -print0 ensures nasty filenames are handled with grace
# anything that parses plates.txt must handle null characters
pipeline/%/plates.txt: pipeline/%/organize
@find -L -path "./$(@D)/data/*" -type d ! -name 'SampleSheet.csv' ! -regex '.*Undetermined.*' -print0 \
| sed --null-data -e 's/_R.*//' \
| sort --zero-terminated \
| uniq --zero-terminated \
| sed --null-data -e 's/.*\///'\
> $@
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ---------------
# 2) Preprocess:
# ---------------
# 0) see src/jgi-preproc.sh for implementation
# 1) trim adapters, filter contams (ecoli gDNA background, phiX, etc)
# 2) error correct reads
# 3) get stats on well contaminants
# Grab DH5a Genome:
# -----------------
src/background.fasta:
@echo "Grabbing e. coli genome"
@curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=CP017100&rettype=fasta&retmode=text" > $@
# Preprocessing:
# --------------
# if you want a different genome, change out here
# ensure everything ends in null character to handle filenames with nasty characters
# xargs -I {} converts {} into substitution (much like parallel)
# sed can change out delimiter s|foo|bar| == s/foo/bar/
# -n2 puts two lines of the input at {}
pipeline/%/preproc: pipeline/%/plates.txt pipeline/%/organize src/background.fasta
@echo "Preprocessing plates in $(<D)"
@sed --null-data -e 's|^|./$(<D)/data/|' -e 's|$$|/*.fastq*|' $< \
| xargs --null -n1 -I {} find -L -path {} -print0 \
| sort --zero-terminated \
| parallel --null -n2 src/jgi-preproc.sh {} $(lastword $^) \
&& touch $@
# Contamination Stats:
# --------------------
# jgi's pipeline provides per-well contam stats
# recall parallel requires escaping '
pipeline/%/read-stats.tsv: pipeline/%/plates.txt pipeline/%/preproc
@echo "Calculating well statistics for $(<D)"
@sed --null-data -e 's|^|./$(<D)/data/|' $< \
| xargs --null -n1 -I {} find -L -path {} -print0 \
| parallel --null \
grep -F -B4 \'Unique 31\' {}/*.pre-proc \
\| sed -e \''/Total /d; /Input/d; /Unique/d; /--/d'\' \
-e \''s/reads/\t/g; s/bases/\t/g; s/\.pre-proc-/\t/g'\' \
-e \''s/\.\/pipeline\///g; s/data//g; s/[:()]//g'\' \
-e \''s/\//\t/g; s/ \+//g'\' \
| awk '{NF = NF -2; print}' OFS="\t" \
| mlr --tsvlite --implicit-csv-header label Run,Plate,Well,Metric,Reads,Percent \
> $@
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# --------------------
# 3) De novo Assembly:
# --------------------
# 0) see src/jgi-denovo.sh for implementation
# 1) merge reads
# 2) use SPAdes to assemble
pipeline/%/de-novo: pipeline/%/plates.txt pipeline/%/preproc
@echo "De novo assembling all plates in $(<D)"
@sed --null-data -e 's|^|./$(<D)/data/|' -e 's|$$|/*.ecc.fq.gz|' $< \
| xargs --null -n1 -I {} find -L -path {} -print0 \
| parallel --null src/jgi-denovo.sh {} \
&& touch $@
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# -----------------
# 4) Align Contigs:
# -----------------
# 1) flatten input plasmids to allow for alignments to span the junction
# 2) aggregate all assemblies into one fasta (and reorient)
# 3) align to input library
# 4) parse alignment for reference info
# Flatten input library:
# -----------------------
# cat input references together to enable mapping through plasmid junction
pipeline/%/input-flatten.fasta: pipeline/%/data/input.fasta
@echo "Flattening $<"
@python src/flatten-fasta.py $< > $@
# Collapse Contigs:
# -----------------
# again harness parallel's string processing to output as contigs/PLATE_WELL.fasta
# {/} = basename {//} = dirname
# only take the first contig
# 1) dump all fastas into spades-contigs/Plate_Well.spades-contigs.fasta
# 2) take only the first record
pipeline/%/spades-contigs.fasta: pipeline/%/plates.txt pipeline/%/de-novo
@echo "Aggregating all contigs in $(<D)"
@mkdir -p $(@:.fasta=) \
&& sed --null-data -e 's|^|./$(<D)/data/|' -e 's|$$|/*.spades-contig.fasta|' $< \
| xargs --null -n1 -I {} find -L -path {} -print0 \
| parallel --null \
'well="$$(basename -s .spades-contig.fasta {/})"; \
plate="$$(basename {//})"; \
cp {} $(@:.fasta=)/"$$plate"_"$$well".fasta' \
&& python src/flatten-fasta.py --no-flat --records 1 $(@:.fasta=)/*.fasta \
> $@
# Align Contigs:
# --------------
# --eqx: =/X in cigar string
# -N: no secondary alignments
# -a output sam
# take only good alignments (0 for forward, 16 for rev-comp)
pipeline/%/spades-contigs.sam: pipeline/%/input-flatten.fasta pipeline/%/spades-contigs.fasta
@echo "Mapping contigs from $(<D)"
@minimap2 --eqx --secondary=no -x asm20 -a -t $$(nproc --all) $^ \
2> $(@:.sam=.map.err) \
| awk '$$2 == 0 || $$2 == 16' \
> $@
# Parse Alignment:
# ----------------
pipeline/%/de-novo-ref-stats.tsv: pipeline/%/spades-contigs.sam
@awk -v plate=$(<D) \
'$$1 !~ /@/ \
{len=split(plate,p,"/"); split($$1,a,"_"); print p[len],a[1],a[2],$$3,$$6,$$10}' \
OFS="\t" $< \
| mlr --tsvlite --implicit-csv-header label Run,Plate,Well,DeNovo_Ref,CIGAR,Contig \
> $@
#===============================================================================
# Variant Calling
# ---------------
# 0) Determine well identity with the de novo assembly contig
# 1) Prep input reference for alignment
# 2) Variant calling
# 3) Variant parsing
#===============================================================================
# -----------------------
# 1) Prep Input Reference
# -----------------------
# flatten-fasta --no-flat cleans input
pipeline/%/input-refs.fasta: pipeline/%/data/input.fasta
@echo "Sanitizing input fastas"
@python src/flatten-fasta.py --no-flat $< > $@
# split input into multiple files for downstream alignment and variant calls
pipeline/%/lib/split: pipeline/%/input-refs.fasta
@echo "Splitting $< into individual fastas"
@mkdir -p $(@D) \
&& paste - - < $< \
| awk '{print $$1"\n"$$2 > "$(@D)/"substr($$1,2)".fasta"}' \
&& parallel samtools faidx {} ::: $(@D)/*.fasta \
&& touch $@
# split input into tsv for easy joining
pipeline/%/input-refs.tsv: pipeline/%/input-refs.fasta
@echo "Splitting input library into tsv"
@sed 's/>//g' $< \
| paste - - \
| mlr --tsvlite --implicit-csv-header label Ref,Ref_Seq \
> $@
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ------------------
# 2) Variant Calling
# ------------------
# 0) see /src/variant-calling.sh for details
# 1) get well identity
# 2) re-map preprocessed reads to that specific reference
# 3) call variants with Q>20, and >=1 read that makes up 20% of reads at that position
pipeline/%/guided: pipeline/%/de-novo-ref-stats.tsv pipeline/%/preproc pipeline/%/lib/split
@echo "Calling variants for all plates in $(<D)"
@awk 'NR > 1{print "pipeline",$$1,"lib",$$4".fasta pipeline",$$1,"data",$$2,$$3".ecc.fq.gz"}' \
OFS=/ \
$< \
| parallel --col-sep ' ' src/variant-calling.sh {1} {2} \
&& touch $@
# embed variant calls into the original reference sequence, generating the consensus sequence
# consensus can be aligned to reference afterward to visualize variant calls
# parallel requires escaping |
pipeline/%/consensus-seqs.tsv: pipeline/%/plates.txt pipeline/%/guided
@echo "Building consensus sequences for $(<D)"
@sed --null-data -e 's|^|./$(<D)/data/|' -e 's|$$|/*.consensus.fasta|' $< \
| xargs --null -n1 -I {} find -L -path {} -print0 \
| parallel --null \
python src/flatten-fasta.py --no-flat {} \
\| awk -v wells='{= s:\.consensus\.fasta:: =}' \
\''{len=split(wells,a,"/"); if ($$0 !~ /^>/) print a[len-3],a[len-1],a[len],$$0}'\' OFS=\'\\t\' \
| mlr --tsvlite --implicit-csv-header \
label Run,Plate,Well,Consensus \
then unsparsify --fill-with "NA" \
> $@
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ------------------
# 3) Variant Parsing
# ------------------
# 0) see /src/vcf-parse.py for details
# 1) parse input fasta for barcodes
# 2) compare barcodes found in VCF file to expected number from input fasta
# 3) un-tidy for nice excel output
# parallel requires escaping |
pipeline/%/freebayes-tidy.tsv: pipeline/%/plates.txt pipeline/%/input-refs.fasta pipeline/%/guided
@echo "Parsing variants from FreeBayes for $(<D)"
@sed --null-data -e 's|^|./$(<D)/data/|' -e 's|$$|/*.freebayes.vcf.gz|' $< \
| xargs --null -n1 -I {} find -L -path {} -print0 \
| parallel --null \
bcftools view {} \
\| python src/vcf-parse.py $(word 2, $^) - \
\| awk -v wells='{= s:\.freebayes\.vcf.gz:: =}' \
\''{len=split(wells,a,"/"); print a[len-3],a[len-1],a[len],$$0}'\' OFS=\'\\t\' \
> $@
pipeline/%/freebayes.tsv: pipeline/%/freebayes-tidy.tsv
@mlr --tsvlite --implicit-csv-header \
label Run,Plate,Well,Ref,idx,bc,bc_revcomp,pos,n_vars,n_barcodes,expected_bcs \
then reshape -i bc,bc_revcomp,pos -o foo,bar \
then put '$$baz = $$foo . $$idx' \
then cut -x -f foo,idx \
then reshape -s baz,bar \
then unsparsify --fill-with "NA" \
$< \
> $@
#===============================================================================
# Ancillary Analysis
# ------------------
# 1) Coverage
# 2) Barcode Filter
# 3) Aggregation
#===============================================================================
# -----------------
# 1) Coverage Info:
# -----------------
# 1) calculate coverage from variant calling
# 2) report percentages of bases with <10x, <3x coverage
# -aa -> absolutely all positions; -d0 no max depth
pipeline/%/lt-X.tsv: pipeline/%/plates.txt pipeline/%/guided
@echo "Calculating percent bases <10x and <3x coverage for $(@D)"
@sed --null-data -e 's|^|./$(<D)/data/|' -e 's|$$|/*.map.bam|' $< \
| xargs --null -n1 -I {} find -L -path {} -print0 \
| parallel --null \
samtools depth -aa -d0 {} \
\| awk -v path='{= s:\.map\.bam:: =}' \
\''{if($$3 < 3) lt3 += 1; else if($$3 < 10) lt10 += 1} END {len=split(path,a,"/"); print a[len-3],a[len-1],a[len],lt10/NR,lt3/NR}'\' \
OFS=\"\\t\" \
| mlr --tsvlite --implicit-csv-header label Run,Plate,Well,LT_10,LT_3 \
> $@
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# -----------------
# 2) Barcode Filter
# -----------------
# 0) see src/bc-contam.py for details
# 1) parse bcf file for barcode locations
# 2) output pileup at barcode
# 3) collapse barcodes at lev dist 1
# 4) count their frequency
# 5) <--- good ---[ ratio check ]--- contam --->
# (0 0.04)( 2/3 > 2 )(0.1 oo)
# *) catch edge case with two barcodes but 0.04 < bc2 < 0.1 -> CONTAM
# the -e trick in mlr allows us to break up the ifs while keeping make happy
# finally clean up the pipeline/run/plate/well.map.bam into columns
pipeline/%/barcode-filter.tsv: pipeline/%/freebayes-tidy.tsv
@echo "Checking for contaminated barcodes"
@awk 'NR > 1{print "pipeline",$$1,"data",$$2,$$3".map.bam "$$4" "$$6" "$$8}' \
OFS='/' $< \
| python src/bc-contam.py - \
| mlr --tsvlite --implicit-csv-header \
label Well,Barcode,Reads \
then fraction -g Well -f Reads \
then top -n 3 -g Well -f Reads_fraction \
then reshape -s top_idx,Reads_fraction_top \
then put \
-e 'if ($$2 >= 0.1){$$BC_Contam = "TRUE"}' \
-e 'elif ($$2 < 0.1 && $$2 > 0.04 && $$3 == ""){$$BC_Contam = "TRUE"}' \
-e 'elif ($$2 < 0.1 && $$2 > 0.04 && $$2 / $$3 > 2){$$BC_Contam = "TRUE"}' \
-e 'else{$$BC_Contam = "FALSE"}' \
then cut -f Well,BC_Contam \
then nest --explode --values --across-fields -f Well --nested-fs / \
then label foo,Run,bar,Plate,Well \
then cut -x -f foo,bar \
| sed 's/\.map\.bam//g' \
> $@
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----------------------------
# 3) Aggregate All the Things:
# ----------------------------
# unsparsify may be unnecessary depending on how clean your data is
# replicates right_join
# http://johnkerl.org/miller-releases/miller-head/doc/faq.html#How_to_rectangularize_after_joins_with_unpaired?
pipeline/%/aggregated-stats.tsv: \
pipeline/%/read-stats.tsv \
pipeline/%/freebayes.tsv \
pipeline/%/de-novo-ref-stats.tsv \
pipeline/%/lt-X.tsv \
pipeline/%/barcode-filter.tsv \
pipeline/%/input-refs.tsv \
pipeline/%/consensus-seqs.tsv
@echo "Aggregating everying together into $@"
@mlr --tsvlite \
cut -x -f Percent \
then reshape -s Metric,Reads \
then rename Result,Leftover \
then join -j Run,Plate,Well --ur -f $(word 2, $^) \
then unsparsify --fill-with "NA" \
then join -j Run,Plate,Well --ur -f $(word 3, $^) \
then unsparsify --fill-with "NA" \
then join -j Run,Plate,Well --ur -f $(word 4, $^) \
then unsparsify --fill-with "NA" \
then join -j Run,Plate,Well --ur -f $(word 5, $^) \
then unsparsify --fill-with "NA" \
then join -j Ref --ur -f $(word 6, $^) \
then unsparsify --fill-with "NA" \
then join -j Run,Plate,Well --ur -f $(word 7, $^) \
then unsparsify --fill-with "NA" \
then cut -x -f 1,2 \
then put '$$Plate_Well = $$Plate."_".$$Well' \
then reorder -f Run,Plate,Well,Plate_Well,Ref,DeNovo_Ref,CIGAR,LT_10,LT_3,BC_Contam,BC_Clash,n_vars \
then reorder -e -f Consensus,Contig,Ref_Seq \
< $< \
> $@
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ----------------------------
# N) Any additional processing
# ----------------------------
# this is space for additional processing of the raw output for your workflow!
# e.g. generating separate QC plots or adding in additional columns
#pipeline/%/final: pipeline/%/aggregated-stats.tsv
# @echo "Finalizing analysis for $(@D)"