forked from octantbio/SwabSeq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMakefile
61 lines (50 loc) · 2.17 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Recipes are run with bash instead of make's default /bin/sh.
SHELL := /bin/bash
# Delete a target whose recipe exits non-zero, so a half-written file is never
# mistaken for an up-to-date one.
.DELETE_ON_ERROR:
#===============================================================================
# VARIABLES
# Sequencing run ids: the name of every directory sitting directly under
# data/seq-runs. find lists the directories, $(notdir) drops the leading path.
RUNS := $(notdir $(shell find data/seq-runs -mindepth 1 -maxdepth 1 -type d))
#===============================================================================
# RECIPES
# Top-level entry points. None of them names a real file, so they are declared
# phony: a stray file called "all", "conds" or "star" can then never make them
# look up to date.
.PHONY: all conds star
# Build both outputs for every run.
all: conds star
# One pipeline/<run>/conditions.csv per run id in $(RUNS).
conds: $(RUNS:%=pipeline/%/conditions.csv)
# One pipeline/<run>/starcode.csv per run id in $(RUNS).
star: $(RUNS:%=pipeline/%/starcode.csv)
# cleanup: remove everything that was generated under pipeline/.
# Declared phony so that a file named "clean" cannot suppress the recipe.
.PHONY: clean
clean:
	rm -rf pipeline/*
# Keep every file make builds as a chain step: .SECONDARY with no prerequisites
# treats all targets as secondary, so none is removed automatically as an
# intermediate.
# The former ".PRECIOUS: pipeline/%/conditions.csv pipeline/%/starcode.csv" line
# is intentionally gone: precious targets are also left in place when their
# recipe fails or is interrupted, which cancels .DELETE_ON_ERROR for exactly the
# two outputs it should protect, while .SECONDARY already covers the
# "do not delete intermediates" part.
.SECONDARY:
#===============================================================================
# PIPELINE
# Copy the shared barcode map from the data/ folder into the run's pipeline
# directory, creating that directory first (the conditions.csv rule writes into
# it without a mkdir of its own).
# data/barcode-map.csv is listed as a prerequisite so that an edited map is
# re-copied, and everything depending on the copy is refreshed, instead of a
# stale copy being kept forever.
pipeline/%/bc-map.csv: data/barcode-map.csv
	@echo "Grabbing $@"
	@mkdir -p $(@D)
	@cp $< $@
# Build conditions.csv for a run from its SampleSheet.csv:
#   1. src/strip-windows.py  - first pass over the sheet (make sure to catch
#                              Windows carriage returns)
#   2. awk                   - keep the relevant section: everything from the
#                              first line containing Sample_ID onwards
#   3. collapseSampleSheet.R - collapse the sheet so there is only one line per
#                              well (there are multiple lines per well when a
#                              well has multiple i5/i7 pairs)
#   4. bc-lengths.R          - receives the barcode map and the target path;
#                              presumably writes the target itself, since the
#                              recipe has no stdout redirect - TODO confirm
# stderr of both R steps is collected in conditions.err next to the target.
# pipefail makes the recipe fail when ANY stage of the pipeline fails rather
# than only the last one, so .DELETE_ON_ERROR can discard a conditions.csv that
# was produced from a broken upstream step.
pipeline/%/conditions.csv: data/seq-runs/%/SampleSheet.csv pipeline/%/bc-map.csv
	@echo "Parsing $<"
	@set -o pipefail; \
	python src/strip-windows.py $< \
	    | awk '/Sample_ID/{seen=1} seen{print}' \
	    | Rscript src/collapseSampleSheet.R 2> $(@:.csv=.err) \
	    | Rscript src/bc-lengths.R $(word 2,$^) $@ 2>> $(@:.csv=.err)
# Run starcode on each fastq, using GNU parallel to parse the conditions for
# barcode length.
# parallel reads conditions.csv (--header : --colsep ",") and starts one job per
# row, filling in that row's {Sample_ID} and {bc_len}. Each job is the pipeline
#   zcat <run dir>/<Sample_ID>*_S*_R1_001.fastq.gz | src/count-bcs.awk
#     | starcode | src/tidy-star.py | awk (prefix every row with the Sample_ID)
# Everything that has to be interpreted by the per-job shell rather than by the
# recipe shell is escaped: the pipes (\|), the quotes around the awk program
# (\'...\') and starcode's stderr redirect (2\>). That redirect used to be
# unescaped, so the recipe shell applied it to parallel itself and the later
# "2> starcode.err" overrode it; starcode's stderr was never actually silenced.
# Job output is collected in starcode.tmp, the CSV header is put in front of it
# to form the target, and the temp file is then removed. parallel's own stderr
# goes to starcode.err.
pipeline/%/starcode.csv: data/seq-runs/% pipeline/%/conditions.csv
	@echo "Counting BCs for all fastq's in $<"
	@parallel --header : --colsep "," \
	    zcat $</"{Sample_ID}"*_S*_R1_001.fastq.gz \
	    \| awk -v bc_len="{bc_len}" -f src/count-bcs.awk \
	    \| starcode -d2 -t1 --sphere --print-clusters 2\> /dev/null \
	    \| python src/tidy-star.py \
	    \| awk -v name="{Sample_ID}" \''{print name, $$1, $$2, $$3}'\' OFS="," \
	    :::: $(word 2,$^) > $(basename $@).tmp 2> $(basename $@).err \
	&& { echo "Sample_ID,Centroid,Count,barcode"; cat $(basename $@).tmp; } > $@ \
	&& rm $(basename $@).tmp
# Stand-in for the starcode rule so the example fastqs don't have to be stored:
# the example run's starcode.csv is simply unpacked from a gzipped copy kept
# under data/. As an explicit rule it is used for this target in place of the
# pipeline/%/starcode.csv pattern rule.
pipeline/example/starcode.csv: data/seq-runs/example/example-starcode.csv.gz \
                               pipeline/example/conditions.csv
	zcat $< > $@