Skip to content

MorganResearchLab/scarecrow

Folders and files

Name
Last commit message
Last commit date

Latest commit

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

scarecrow

scarecrow

A toolkit for preprocessing single cell sequencing data.

Documentation

scarecrow is undergoing substantial editing and may not behave as intended.

Todo

  • Run through ruff to check and format files

  • Add error handling to catch missing or incorrect parameters and unexpected file content

  • Peaks in between barcodes may need further investigation

  • Jitter does not currently apply to UMI or insert sequence

    • if the UMI is on the same read and downstream, its position needs updating before extraction
  • Plot generated by harvest

    • currently does not handle more than one barcode peak per whitelist (this does not affect the CSV output)
  • Benchmark different assays (SPLiT-seq, Parse, 10X) and methods (split-pipe, scarecrow, UMI-tools)

    • barcode recovery
    • alignment (STAR and kallisto)
  • Added functionality to reap.py _get_sequence_with_jitter to tackle potentially clipped barcodes starting at position 1

    • this is not applied to the trie method currently

Testing on laptop (WTv2)

# WTv2 test recipe: seed -> harvest -> reap, for both the set-based and the
# trie/kmer-index approaches. Intended to be pasted into an interactive bash
# session, so no `set -e` is used.
#
# Each BARCODES entry has three colon-separated fields; the seed loop takes
# the first field as ID and the third as the whitelist path.
R1=./WTv2/100K_R1.fastq
R2=./WTv2/100K_R2.fastq
BARCODES=(
    BC1:n99_v5:./WTv2/bc_data_n99_v5.txt
    BC2:v1:./WTv2/bc_data_v1.txt
    BC3:v1:./WTv2/bc_data_v1.txt
)

# Seed
# Expansions are quoted so every entry/path is passed as exactly one argument.
for BARCODE in "${BARCODES[@]}"
do
    ID=${BARCODE%:*:*}          # strip the last two fields -> barcode ID
    WHITELIST=${BARCODE#*:*:}   # strip the first two fields -> whitelist path
    echo "${ID}"
    # Set-based seed is currently disabled; note that the set-based harvest
    # below globs for the *_set.csv files this command would produce.
#    time scarecrow seed --fastqs ${R1} ${R2} \
#        -o ./WTv2/barcodes_${ID}_set.csv --barcodes ${BARCODE} -n 0 -u 0
    time scarecrow seed --fastqs "${R1}" "${R2}" \
        -o "./WTv2/barcodes_${ID}_trie.csv" --barcodes "${BARCODE}" -n 0 -u 0 \
        --trie "${WHITELIST}.${ID}.trie.gz" -k 2
done

# Harvest (set-based approach)
# If the glob matches nothing, bash leaves the literal pattern in FILES.
FILES=(./WTv2/barcodes_BC*_set.csv)
scarecrow harvest "${FILES[@]}" --barcode_count 1 --min_distance 10 \
    --conserved ./WTv2/barcodes_BC1_conserved.tsv \
    --out ./WTv2/barcode_positions_set.csv

# Harvest (trie and kmer index approach)
FILES=(./WTv2/barcodes_BC*_trie.csv)
scarecrow harvest "${FILES[@]}" --barcode_count 1 --min_distance 10 \
    --conserved ./WTv2/barcodes_BC1_conserved.tsv \
    --out ./WTv2/barcode_positions_trie.csv

# Reap (set-based approach)
# BARCODES is restated so this step can be pasted on its own.
BARCODES=(
    BC1:n99_v5:./WTv2/bc_data_n99_v5.txt
    BC2:v1:./WTv2/bc_data_v1.txt
    BC3:v1:./WTv2/bc_data_v1.txt
)
time scarecrow reap --fastqs "${R1}" "${R2}" -j 1 -m 2 -q 10 \
    -p ./WTv2/barcode_positions_set.csv \
    --barcodes "${BARCODES[@]}" --extract 1:1-74 --umi 2:1-10 \
    --out ./WTv2/cDNA_set --threads 1 --verbose &> debug_set.log

# Reap (trie and kmer index approach)
# Third field now points at the per-barcode trie files named in the seed step.
BARCODES=(
    BC1:n99_v5:./WTv2/bc_data_n99_v5.txt.BC1.trie.gz
    BC2:v1:./WTv2/bc_data_v1.txt.BC2.trie.gz
    BC3:v1:./WTv2/bc_data_v1.txt.BC3.trie.gz
)
time scarecrow reap --fastqs "${R1}" "${R2}" -j 1 -m 2 -q 10 \
    -p ./WTv2/barcode_positions_trie.csv \
    --barcodes "${BARCODES[@]}" --extract 1:1-74 --umi 2:1-10 \
    --out ./WTv2/cDNA_trie --threads 1 --verbose &> debug_trie.log

Testing on laptop (split-seq)

# split-seq test recipe: seed -> harvest -> reap -> weed.
# All relative paths below resolve against this directory; if the cd fails,
# they resolve against wherever the shell currently is.
cd ~/Documents/split-seq
R1=./r1.fastq
R2=./r2.fastq
# Each entry has three colon-separated fields; the seed loop takes the first
# field as ID and the third as the whitelist path.
BARCODES=(
    BC1:3lvl:./BC1.txt
    BC2:3lvl_lig:./BC2.txt
    BC3:P7:./BC3.txt
)

# Seed
# Expansions are quoted so every entry/path is passed as exactly one argument.
for BARCODE in "${BARCODES[@]}"
do
    ID=${BARCODE%:*:*}          # strip the last two fields -> barcode ID
    WHITELIST=${BARCODE#*:*:}   # strip the first two fields -> whitelist path
    echo "${ID}"
    time scarecrow seed --fastqs "${R1}" "${R2}" \
        -o "./barcodes_${ID}.csv" --barcodes "${BARCODE}" -n 0 -u 0
done

# Harvest
# If the glob matches nothing, bash leaves the literal pattern in FILES.
FILES=(./barcodes_BC*.csv)
scarecrow harvest "${FILES[@]}" --barcode_count 1 --min_distance 10 \
    --conserved ./barcodes_BC1_conserved.tsv \
    --out ./barcode_positions.csv

# Reap
scarecrow reap --fastqs "${R1}" "${R2}" -j 2 -m 3 -q 10 \
    -p ./barcode_positions.csv \
    --barcodes "${BARCODES[@]}" --extract 2:11-150 --umi 2:1-10 --base_quality 10 \
    --out ./cDNA_v2 --threads 1 --verbose &> debug.log

# Weed
# Logged to its own file: writing to debug.log here would overwrite the reap
# log produced by the previous command.
scarecrow weed --fastq P443A_index_10nt_1005_EKDL250000649-1A_22LJ3MLT4_L3_1.fq.gz \
    --sam cDNA_v2.sam \
    -i 1 \
    --out cDNA_v2_fix.sam \
    -m 1 \
    --barcodes BC3:P7:./BC3.txt &> debug_weed.log

Testing on laptop (10X3p)

# 10X3p test recipe: encode -> seed -> harvest -> reap -> samstat.
R1=./10X3p/SRR28867562_3.1M.fastq.gz
R2=./10X3p/SRR28867562_4.1M.fastq.gz
# Single-entry array; element 0 is referenced explicitly below instead of
# relying on bash expanding a bare array name to its first element.
BARCODE=(BC1:3M-Feb2018:./10X3p/3M-february-2018.txt)

# Generate custom trie for use with scarecrow seed (3m30s)
time scarecrow encode --force_overwrite --barcodes "${BARCODE[0]}" --trie -k 8

# Seed using trie
# ${BARCODE[0]%:*:*} strips the last two colon-separated fields (-> BC1).
time scarecrow seed --fastqs "${R1}" "${R2}" \
    -o "./10X3p/barcodes_${BARCODE[0]%:*:*}.csv" \
    --barcodes "${BARCODE[0]}" -n 0 -u 0 -k 8 \
    --trie ./10X3p/3M-february-2018.txt.k8.trie.gz

# Harvest (trie and kmer index approach)
FILES=(./10X3p/barcodes_BC1.csv)
scarecrow harvest "${FILES[@]}" --barcode_count 1 --min_distance 10 \
    --conserved ./10X3p/barcodes_BC1_conserved.tsv \
    --out ./10X3p/barcode_positions.csv

# Reap (trie and kmer index approach)
BARCODE=(BC1:3M-Feb2018:./10X3p/3M-february-2018.txt.k8.trie.gz)
time scarecrow reap --fastqs "${R1}" "${R2}" -j 0 -m 1 -q 10 \
    -p ./10X3p/barcode_positions.csv \
    --barcodes "${BARCODE[0]}" --extract 2:1-90 --umi 1:17-28 \
    --out ./10X3p/cDNA_k8 --threads 1 &> debug.log

# Summarise the reap output.
# NOTE(review): the reap above writes to --out ./10X3p/cDNA_k8; this assumes
# reap appends ".sam" to that prefix - confirm. Nothing in this recipe
# produces ./10X3p/cDNA_trie.sam, which was referenced here previously.
scarecrow samstat --sam ./10X3p/cDNA_k8.sam

Debugging notes

# Trace one read through the SAM output, the R2 FASTQ and the reap debug logs.
# R2 must still be set from the testing recipe being debugged.
READ=SRR28867558.397
# -F matches the ID as a fixed string: the '.' in the read ID would otherwise
# be a regex wildcard. Quoting "${R2}" stops grep from silently reading stdin
# when R2 is unset.
grep -F -m1 -- "${READ}" WTv2/*sam
grep -F -m1 -A1 -- "${READ}" "${R2}"
grep -F -m1 -A300 -- "${READ}" debug_set.log | less
grep -F -m1 -A300 -- "${READ}" debug_trie.log | less

About

No description, website, or topics provided.

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published