A toolkit for preprocessing single-cell sequencing data.
scarecrow is undergoing substantial editing and may not behave as intended.
-
Run through ruff to check and format files
-
Error handling to capture missing or incorrect parameters, and unexpected file content
-
Peaks in between barcodes may need further investigation
-
Jitter does not currently apply to UMI or insert sequence
-
- if the UMI is on the same read and downstream, its position needs updating before extraction
-
Plot generated by harvest
-
- currently does not handle more than one barcode peak per whitelist (this does not affect the CSV output)
-
Benchmark different assays (SPLiT-seq, Parse, 10X) and methods (split-pipe, scarecrow, UMI-tools)
-
- barcode recovery
-
- alignment (STAR and kallisto)
-
Added functionality to reap.py _get_sequence_with_jitter to tackle potentially clipped barcodes starting at position 1
-
- this is not applied to the trie method currently
# WTv2 example: paired FASTQ inputs.
R1=./WTv2/100K_R1.fastq
R2=./WTv2/100K_R2.fastq
# Each entry has three colon-separated fields; the first is used below as the
# barcode ID and the last as the whitelist path.
BARCODES=(
  BC1:n99_v5:./WTv2/bc_data_n99_v5.txt
  BC2:v1:./WTv2/bc_data_v1.txt
  BC3:v1:./WTv2/bc_data_v1.txt
)
# Seed: one scarecrow seed run per barcode entry.
# Expansions are quoted so each value is passed as exactly one argument.
for BARCODE in "${BARCODES[@]}"; do
  ID=${BARCODE%:*:*}        # first field of a three-field entry, e.g. BC1
  WHITELIST=${BARCODE#*:*:} # third field of a three-field entry (whitelist path)
  echo "${ID}"
  # Set-based variant, kept for reference:
  # time scarecrow seed --fastqs "${R1}" "${R2}" \
  #   -o "./WTv2/barcodes_${ID}_set.csv" --barcodes "${BARCODE}" -n 0 -u 0
  time scarecrow seed --fastqs "${R1}" "${R2}" \
    -o "./WTv2/barcodes_${ID}_trie.csv" --barcodes "${BARCODE}" -n 0 -u 0 \
    --trie "${WHITELIST}.${ID}.trie.gz" -k 2
done
# Harvest (set-based approach)
# The glob collects the per-barcode seed CSVs; the quoted array expansion
# passes each matched path to harvest as a single argument.
FILES=(./WTv2/barcodes_BC*_set.csv)
scarecrow harvest "${FILES[@]}" --barcode_count 1 --min_distance 10 \
  --conserved ./WTv2/barcodes_BC1_conserved.tsv \
  --out ./WTv2/barcode_positions_set.csv
# Harvest (trie and kmer index approach)
FILES=(./WTv2/barcodes_BC*_trie.csv)
scarecrow harvest "${FILES[@]}" --barcode_count 1 --min_distance 10 \
  --conserved ./WTv2/barcodes_BC1_conserved.tsv \
  --out ./WTv2/barcode_positions_trie.csv
# Reap (set-based approach)
# Barcode entries point at the plain-text whitelists; stdout and stderr of the
# run are written to debug_set.log.
BARCODES=(
  BC1:n99_v5:./WTv2/bc_data_n99_v5.txt
  BC2:v1:./WTv2/bc_data_v1.txt
  BC3:v1:./WTv2/bc_data_v1.txt
)
time scarecrow reap --fastqs "${R1}" "${R2}" -j 1 -m 2 -q 10 \
  -p ./WTv2/barcode_positions_set.csv \
  --barcodes "${BARCODES[@]}" --extract 1:1-74 --umi 2:1-10 \
  --out ./WTv2/cDNA_set --threads 1 --verbose &> debug_set.log
# Reap (trie and kmer index approach)
# Same run, but the barcode entries point at the *.trie.gz files and the log
# goes to debug_trie.log.
BARCODES=(
  BC1:n99_v5:./WTv2/bc_data_n99_v5.txt.BC1.trie.gz
  BC2:v1:./WTv2/bc_data_v1.txt.BC2.trie.gz
  BC3:v1:./WTv2/bc_data_v1.txt.BC3.trie.gz
)
time scarecrow reap --fastqs "${R1}" "${R2}" -j 1 -m 2 -q 10 \
  -p ./WTv2/barcode_positions_trie.csv \
  --barcodes "${BARCODES[@]}" --extract 1:1-74 --umi 2:1-10 \
  --out ./WTv2/cDNA_trie --threads 1 --verbose &> debug_trie.log
# split-seq example. All paths below are relative to this directory.
# NOTE(review): the cd is not checked; if it fails, the commands below run in
# the current directory instead.
cd ~/Documents/split-seq
R1=./r1.fastq
R2=./r2.fastq
# Each entry has three colon-separated fields; the first is used below as the
# barcode ID and the last as the whitelist path.
BARCODES=(
  BC1:3lvl:./BC1.txt
  BC2:3lvl_lig:./BC2.txt
  BC3:P7:./BC3.txt
)
# Seed: one scarecrow seed run per barcode entry.
# Expansions are quoted so each value is passed as exactly one argument.
for BARCODE in "${BARCODES[@]}"; do
  ID=${BARCODE%:*:*}        # first field of a three-field entry, e.g. BC1
  WHITELIST=${BARCODE#*:*:} # third field of a three-field entry (whitelist path)
  echo "${ID}"
  time scarecrow seed --fastqs "${R1}" "${R2}" \
    -o "./barcodes_${ID}.csv" --barcodes "${BARCODE}" -n 0 -u 0
done
# Harvest the per-barcode seed CSVs matched by the glob.
FILES=(./barcodes_BC*.csv)
scarecrow harvest "${FILES[@]}" --barcode_count 1 --min_distance 10 \
  --conserved ./barcodes_BC1_conserved.tsv \
  --out ./barcode_positions.csv
# Reap; stdout and stderr of the run are written to debug.log.
scarecrow reap --fastqs "${R1}" "${R2}" -j 2 -m 3 -q 10 \
  -p ./barcode_positions.csv \
  --barcodes "${BARCODES[@]}" --extract 2:11-150 --umi 2:1-10 --base_quality 10 \
  --out ./cDNA_v2 --threads 1 --verbose &> debug.log
# Run scarecrow weed with the BC3 barcode entry, reading cDNA_v2.sam and
# writing cDNA_v2_fix.sam. stdout and stderr are redirected to debug.log,
# which is truncated first, so any earlier debug.log content is replaced.
scarecrow weed --fastq P443A_index_10nt_1005_EKDL250000649-1A_22LJ3MLT4_L3_1.fq.gz \
--sam cDNA_v2.sam \
-i 1 \
--out cDNA_v2_fix.sam \
-m 1 \
--barcodes BC3:P7:./BC3.txt &> debug.log
# 10X 3' example: paired FASTQ inputs.
R1=./10X3p/SRR28867562_3.1M.fastq.gz
R2=./10X3p/SRR28867562_4.1M.fastq.gz
# Single-entry array; it is always read through an explicit [0] index below
# rather than relying on the implicit first-element expansion.
BARCODE=(BC1:3M-Feb2018:./10X3p/3M-february-2018.txt)
# Generate custom trie for use with scarecrow seed (3m30s)
time scarecrow encode --force_overwrite --barcodes "${BARCODE[0]}" --trie -k 8
# Seed using trie
# The output name uses the first field of the barcode entry (BC1).
time scarecrow seed --fastqs "${R1}" "${R2}" \
  -o "./10X3p/barcodes_${BARCODE[0]%:*:*}.csv" \
  --barcodes "${BARCODE[0]}" -n 0 -u 0 -k 8 \
  --trie ./10X3p/3M-february-2018.txt.k8.trie.gz
# Harvest (trie and kmer index approach)
FILES=(./10X3p/barcodes_BC1.csv)
scarecrow harvest "${FILES[@]}" --barcode_count 1 --min_distance 10 \
  --conserved ./10X3p/barcodes_BC1_conserved.tsv \
  --out ./10X3p/barcode_positions.csv
# Reap (trie and kmer index approach)
# The barcode entry now points at the k8 trie file; stdout and stderr of the
# run are written to debug.log.
BARCODE=(BC1:3M-Feb2018:./10X3p/3M-february-2018.txt.k8.trie.gz)
time scarecrow reap --fastqs "${R1}" "${R2}" -j 0 -m 1 -q 10 \
  -p ./10X3p/barcode_positions.csv \
  --barcodes "${BARCODE[0]}" --extract 2:1-90 --umi 1:17-28 \
  --out ./10X3p/cDNA_k8 --threads 1 &> debug.log
# Run samstat on the SAM from the 10X reap run (--out ./10X3p/cDNA_k8).
# Assumes reap writes <out>.sam, as in the split-seq example where
# --out ./cDNA_v2 is followed by --sam cDNA_v2.sam.
scarecrow samstat --sam ./10X3p/cDNA_k8.sam
# Look up a single read ID in the WTv2 SAM files, in the R2 FASTQ, and in the
# two reap debug logs. Expansions are quoted so each is passed to grep as one
# argument; the WTv2/*sam glob is intentionally left unquoted.
READ=SRR28867558.397
grep -m1 "${READ}" WTv2/*sam
grep -m1 -A1 "${READ}" "${R2}"
grep -m1 -A300 "${READ}" debug_set.log | less
grep -m1 -A300 "${READ}" debug_trie.log | less