diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 000000000..b9f6a85f3 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1,3 @@ +10k_PBMC_Multiome_nextgem_Chromium_X_atac_possorted_bam.bam +10k_PBMC_Multiome_nextgem_Chromium_X_atac_possorted_bam.bam.bai +.README.md.swp diff --git a/data/ChrM.fa.gz b/data/ChrM.fa.gz new file mode 100644 index 000000000..7dbe88404 Binary files /dev/null and b/data/ChrM.fa.gz differ diff --git a/data/ChrM_subset.bam b/data/ChrM_subset.bam new file mode 100644 index 000000000..2de3bc93d Binary files /dev/null and b/data/ChrM_subset.bam differ diff --git a/data/ChrM_subset.bam.bai b/data/ChrM_subset.bam.bai new file mode 100644 index 000000000..c18668d51 Binary files /dev/null and b/data/ChrM_subset.bam.bai differ diff --git a/data/ChrM_testData_R1_001.fastq.gz b/data/ChrM_testData_R1_001.fastq.gz new file mode 100644 index 000000000..ed40196ef Binary files /dev/null and b/data/ChrM_testData_R1_001.fastq.gz differ diff --git a/data/ChrM_testData_R2_001.fastq.gz b/data/ChrM_testData_R2_001.fastq.gz new file mode 100644 index 000000000..db5200daf Binary files /dev/null and b/data/ChrM_testData_R2_001.fastq.gz differ diff --git a/data/README.md b/data/README.md new file mode 100644 index 000000000..9940be394 --- /dev/null +++ b/data/README.md @@ -0,0 +1,72 @@ +# nf-core/mitotags test files + +These are the minimal test files needed to test the nf-core/mitotags package. 
+ +## input + +ChrM_testData_R1_001.fastq.gz and ChrM_testData_R2_001.fastq.gz + +``` +bedtools bamtofastq -i ChrM_subset.bam -fq ChrM_testData_R1_001.fastq -fq2 ChrM_testData_R2_001.fastq +gzip *.fastq +``` + +## bam + +ChrM_subset.bam + + +``` +wget https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-arc/2.0.0/10k_PBMC_Multiome_nextgem_Chromium_X/10k_PBMC_Multiome_nextgem_Chromium_X_atac_possorted_bam.bam +wget https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-arc/2.0.0/10k_PBMC_Multiome_nextgem_Chromium_X/10k_PBMC_Multiome_nextgem_Chromium_X_atac_possorted_bam.bam.bai +samtools view 10k_PBMC_Multiome_nextgem_Chromium_X_atac_possorted_bam.bam chrM:15000-22000 -b > Chrm_subset.bam +## keep only the reads from the 300 barcodes with the highest chrM read counts (see the "barcodes" section below for how barcodes.tsv is created). +gunzip barcodes.tsv +samtools view -h Chrm_subset.bam | head -n 3000 | grep "^@" > ChrM_subset.sam +samtools view Chrm_subset.bam | grep -f barcodes.tsv >> ChrM_subset.sam +samtools view ChrM_subset.sam -b > ChrM_subset.bam +gzip barcodes.tsv +rm Chrm_subset.bam +rm ChrM_subset.sam + +``` + +## bai + +ChrM_subset.bam.bai + +``` +samtools index ChrM_subset.bam +``` + +## barcodes + +barcodes.tsv.gz + +``` +samtools view Chrm_subset.bam | grep -o "CB:Z:[AGCT]*-1" > barcodes_total.txt +``` + +R script to get the top 300 barcodes: + +``` +dat = scan( 'barcodes_total.txt', what=character()) +write ( names( sort(table(dat), decreasing=TRUE)[1:300]), file="barcodes.tsv" ) +``` + +Or on the command line (slow!): + +``` +sort barcodes_total.txt | uniq -c > barcodes_unique.txt +sort -k 2n barcodes_unique.txt| head -n 300 | cut -f1 > barcodes.tsv +``` + +Clean up: + +``` +gzip barcodes.tsv +rm barcodes_total.txt +``` + +## genome +ChrM.fa.gz diff --git a/data/barcodes.tsv b/data/barcodes.tsv new file mode 100644 index 000000000..a57715b77 --- /dev/null +++ b/data/barcodes.tsv @@ -0,0 +1,300 @@ +CB:Z:TGTGTTAAGTAGCGGG-1 +CB:Z:TAGTTGTCAAACCCTA-1 +CB:Z:CGGCAATGTCAAAGGG-1 +CB:Z:GTCGAGGAGTGAACGA-1 +CB:Z:CGAGCGAAGTCATTAG-1 +CB:Z:GAAGGCTAGGCAAGTA-1 
+CB:Z:ATTCAACCAATCGCAC-1 +CB:Z:TGTTGTTTCACGAATC-1 +CB:Z:GAACCAGCACCTCAGG-1 +CB:Z:GATTTGCAGTAGCGCC-1 +CB:Z:CCTAAATCATAAGGAC-1 +CB:Z:TCTGGCTTCTCGACCT-1 +CB:Z:TTAAGGACACCAGGTT-1 +CB:Z:TGGTTAATCAGCTAAC-1 +CB:Z:GATTGATGTGAGCACT-1 +CB:Z:GACGCAACACGGTACT-1 +CB:Z:GTTTGCTGTGCTCCGT-1 +CB:Z:GTAGTTTCATTGGGAG-1 +CB:Z:ACCTGGTCAATTAAGG-1 +CB:Z:CCAGTTTGTAACGAGG-1 +CB:Z:CGAGGAAGTGAGCACT-1 +CB:Z:CCATAAATCCGCAACA-1 +CB:Z:ATGTCAATCGCTAAAC-1 +CB:Z:CAGGTCCAGCTAATTG-1 +CB:Z:GTGCTCCGTACCAGGT-1 +CB:Z:TAAGTAGCAATTGAGA-1 +CB:Z:ACCCAGGGTGACATGC-1 +CB:Z:GCTTAACAGCTTATGA-1 +CB:Z:GGCCTAATCGCAGGCT-1 +CB:Z:CCGCAAATCACGAATC-1 +CB:Z:AGGTTAACAGCAATAA-1 +CB:Z:TCCGGAATCAGGTTTA-1 +CB:Z:CAGATTCAGCACAGCC-1 +CB:Z:ACAGCCGGTTAGGTTG-1 +CB:Z:CGGGCTTAGAGAGGAG-1 +CB:Z:TCAAGCTAGAAGGTGC-1 +CB:Z:GTGCAAGCAATTGACT-1 +CB:Z:CGGTAACGTGTGTGGT-1 +CB:Z:CAATATGTCCTGGCTT-1 +CB:Z:GTTCTTGTCGGTTCCT-1 +CB:Z:GGATACTTCTCCTCTT-1 +CB:Z:GCTTTGTGTTTGTCTA-1 +CB:Z:AACAAGCCATTCAGCA-1 +CB:Z:TGGATTCAGCCTAACG-1 +CB:Z:CTTTAGTTCCTGGTCT-1 +CB:Z:CCTATATTCGCAATCG-1 +CB:Z:GATTCGCCACTAAGCC-1 +CB:Z:GAGAGGCGTTGTCATC-1 +CB:Z:GCTGTACCAATCATGT-1 +CB:Z:TCACATAAGCCTTAAA-1 +CB:Z:ACGGTACGTCTCACTG-1 +CB:Z:ACAACAGAGGCCTTAG-1 +CB:Z:CTTAAGATCCGCATGA-1 +CB:Z:TAATTGCCAATATGGA-1 +CB:Z:ATATGCTCAGCAACAG-1 +CB:Z:CGCACCACACAGAACG-1 +CB:Z:TCGCTAACAAATTCGT-1 +CB:Z:GATTCATCATGTCAGC-1 +CB:Z:TTGGCTGAGATAACCC-1 +CB:Z:CTCGCTCCAGGCGATA-1 +CB:Z:ATTGTGATCTTGGATA-1 +CB:Z:ACACGGACAGGAACAT-1 +CB:Z:GATTGCGTCGCATTAA-1 +CB:Z:ATGATGGAGTTAACCA-1 +CB:Z:AAGTCTATCAGCTAAC-1 +CB:Z:CAATCGCCACACAATT-1 +CB:Z:GCATTGCCAGTTATCG-1 +CB:Z:CAAGACAAGGACACTT-1 +CB:Z:CGAGCGAAGCTCAATA-1 +CB:Z:TACGCACCATTAGGTT-1 +CB:Z:AACGCCCAGCAAACCT-1 +CB:Z:TTAGAAGCAATTTAGC-1 +CB:Z:TGTGTGAGTTAGGTTG-1 +CB:Z:TACCTGCTCATGCTAA-1 +CB:Z:ATGAAGCCAGGAACTG-1 +CB:Z:GTCATGAGTCACGGAT-1 +CB:Z:TAGTTTGAGCGCAATT-1 +CB:Z:GGTACCGGTGTCCAGG-1 +CB:Z:TCGTTTCCACTATGGC-1 +CB:Z:CCTAGTTGTACTTCAC-1 +CB:Z:GGATGTAAGGCGCACT-1 +CB:Z:GCTTAACAGTTATCTC-1 +CB:Z:GGTAAACCAGCGCTTG-1 +CB:Z:CTAATAGTCAGAAACG-1 +CB:Z:TCTAATCTCTAAGTGC-1 +CB:Z:TTCAGCACAAGCCACT-1 
+CB:Z:TAGGTTATCCGGTTGA-1 +CB:Z:TAAGCCTAGGCGCTTA-1 +CB:Z:TACGGATTCCTTCTAG-1 +CB:Z:CAAGGCTGTTGAGGTC-1 +CB:Z:TAGTGAGAGTTGGGCC-1 +CB:Z:CGGTGAACATGTGGGA-1 +CB:Z:GGTAGGAGTGGAAACG-1 +CB:Z:GAACGAATCGCTATAA-1 +CB:Z:GGTGAGCCAATCCCTT-1 +CB:Z:AAGCCTGTCCTAATAG-1 +CB:Z:ATATGCTCAAGGTCCT-1 +CB:Z:AGCTTTAAGCTTAGTA-1 +CB:Z:TAACAAGCACTCAACA-1 +CB:Z:TCTTAGCGTGTGTCCC-1 +CB:Z:AACTGTTCAACCTGGT-1 +CB:Z:GTTCTCATCCAGGTTG-1 +CB:Z:GGCAGGATCCGCATGA-1 +CB:Z:AGTAATGCACGTAATT-1 +CB:Z:CGTCAATAGCGATAGA-1 +CB:Z:CGTCCTAGTAGGATTT-1 +CB:Z:TTATCCGTCTTAATGG-1 +CB:Z:AATTTGCCATGAATCT-1 +CB:Z:TCTCAAGCAATAGCAA-1 +CB:Z:AGGTGAGGTTACTTCA-1 +CB:Z:GCTCCTTAGATGGAGC-1 +CB:Z:AAAGGAGCATAATTGC-1 +CB:Z:GCAAGTCGTCAAGTAT-1 +CB:Z:TTAAGGACAGCATGTC-1 +CB:Z:TTAGGCGTCGGGACCT-1 +CB:Z:ATTCCGGAGCAAGATG-1 +CB:Z:ATGAAGTAGTGAGGTA-1 +CB:Z:TTTCGTCCAGCAAGAT-1 +CB:Z:GAGTGTTTCCCTCGCA-1 +CB:Z:CGAAGCGAGCTAAAGG-1 +CB:Z:TCGTGCTTCGGTTTGG-1 +CB:Z:ATGTTTGAGCGCCTTT-1 +CB:Z:GCTTTATTCCTAAGAC-1 +CB:Z:AGGCAATCAGGCGAGT-1 +CB:Z:CCTATTTAGCAGGTTT-1 +CB:Z:GTGCCTTTCTTGCAGG-1 +CB:Z:TTAGGAACAGAAATGC-1 +CB:Z:AGCACTTAGTAAGAAC-1 +CB:Z:GGCAATCGTCATAACG-1 +CB:Z:CCTAAGTAGGCGCTAC-1 +CB:Z:GCAAGTCGTTGGTGAC-1 +CB:Z:GTTGCCCGTTAGTTGG-1 +CB:Z:AACCCGCAGTCACCAG-1 +CB:Z:GGTCGGTTCCGCCAAA-1 +CB:Z:CTCCAAACAAGTAAGC-1 +CB:Z:AGGGTTGCAAAGCTCC-1 +CB:Z:AGGCAGGTCAGGATGA-1 +CB:Z:TTTGCGACAGTTTCTC-1 +CB:Z:GCTCTGTTCCTAGTTT-1 +CB:Z:TGAGCACGTGGTGAGA-1 +CB:Z:ATGAAGCCAGTAAAGC-1 +CB:Z:AATCATGTCTTAGCCC-1 +CB:Z:TGGTCAAGTTTGGCTT-1 +CB:Z:CTAGGACGTGCCGCAA-1 +CB:Z:GGCGGTAAGTTGGCCA-1 +CB:Z:CCAATATCACAAAGCG-1 +CB:Z:GGATTCAGTGATCAGC-1 +CB:Z:AACTAGTGTAATCCCT-1 +CB:Z:GCTGCACAGTCACCAG-1 +CB:Z:GCTTTCATCCGGTTAG-1 +CB:Z:TTTGTGAAGGCTAAGA-1 +CB:Z:TCAAGACTCTGCAAAC-1 +CB:Z:AGTTTGCAGGCTACAT-1 +CB:Z:TCTAGCCTCGTGCTTC-1 +CB:Z:CTAGGACGTTAGCTGA-1 +CB:Z:CACAAGCGTTCGGTAA-1 +CB:Z:CGTGCTGCAAGGTATA-1 +CB:Z:CCTTCGTAGCATGTTA-1 +CB:Z:CGTTATCGTAAGCTCA-1 +CB:Z:GAGTATCTCCAGCACA-1 +CB:Z:TTCCTTCTCTCACACC-1 +CB:Z:AGTAACGAGTTGGCCA-1 +CB:Z:GAAACTGAGAAGGTGC-1 +CB:Z:GCATATATCCTCGATC-1 +CB:Z:AGTTTGATCAGGGCCT-1 +CB:Z:GAGGCTACAGTTATCG-1 
+CB:Z:AATTGGACACCGGCTA-1 +CB:Z:GAGCTAGCAGCATGGA-1 +CB:Z:ACATCAATCCAGGGAG-1 +CB:Z:TTGTTGTTCTAGCTTT-1 +CB:Z:GGCATGGAGAATGACG-1 +CB:Z:CCGTTATGTAGGTGTC-1 +CB:Z:CTTAGTTTCCTAATTC-1 +CB:Z:GACTTGGTCGTAATCA-1 +CB:Z:ATGTCATCACGAATTT-1 +CB:Z:TGCTTAAAGGGCTTAT-1 +CB:Z:ACTGAAACAGCACCAT-1 +CB:Z:GTCCTAGAGGCATTGT-1 +CB:Z:ATCAGGTTCTGTGAGT-1 +CB:Z:TATAGGTGTAATCGCA-1 +CB:Z:GGGCATTGTCCTTCAG-1 +CB:Z:TTTGTGTTCTAGCGTG-1 +CB:Z:TTGAGCTAGACAACAG-1 +CB:Z:GAGAGGCGTGTTTGTC-1 +CB:Z:TCGACAAGTTCCGGCT-1 +CB:Z:TTGTCCGGTTGGTGAC-1 +CB:Z:CTCAATAAGGAACCAA-1 +CB:Z:GTAAGCGCAGTTATCG-1 +CB:Z:CATCCTGGTGGAGCAA-1 +CB:Z:ATTAGTCCAGTTATGT-1 +CB:Z:TGTGGAGCAAACCTTG-1 +CB:Z:AGTGTGGCAGTTGCGT-1 +CB:Z:CGTTTGGAGTTCCTCA-1 +CB:Z:GTCCAAGTCCTTCGTA-1 +CB:Z:GGAACTAAGGCAATAG-1 +CB:Z:GCGCGATTCAATCTCT-1 +CB:Z:CGTTATCGTCATCAAG-1 +CB:Z:GCTAAGTTCGCCTGTT-1 +CB:Z:TCTCGCCCACAACCTA-1 +CB:Z:GGTAACCGTGTGTGGT-1 +CB:Z:CGTTGCGCACGGTTTA-1 +CB:Z:ACAAAGGTCGGCCAGT-1 +CB:Z:AAAGCACCAAGATTCT-1 +CB:Z:GGTACCGGTCATAGAT-1 +CB:Z:CTAACCTGTAATAGCT-1 +CB:Z:TTGGGCGGTACGGGTT-1 +CB:Z:GCGCCTTGTGCTGGTG-1 +CB:Z:AATCATGTCGGTACGC-1 +CB:Z:GAGTGATCAGTTTGGC-1 +CB:Z:GCGCAAACATCTAGCA-1 +CB:Z:AATCGCCCAGGCGAGT-1 +CB:Z:GGTGAGGTCCGGCTAA-1 +CB:Z:CCAGCCTGTTAGAGCC-1 +CB:Z:ACACTAGGTCACAAAT-1 +CB:Z:TTTGTCTAGAACAAGT-1 +CB:Z:GCAAGTCGTTTCCGGC-1 +CB:Z:GCTAACCCATGGCCCA-1 +CB:Z:CCCGTAAAGGTCCGTA-1 +CB:Z:TTGGAGGCATATAACC-1 +CB:Z:CCTGATGAGAAGCTAG-1 +CB:Z:CGATTTGCACCGGTAT-1 +CB:Z:CTCACACTCGACAAAG-1 +CB:Z:TGGCCTGCATAAGTCT-1 +CB:Z:TGTTATGAGCACAGCC-1 +CB:Z:GATAACGAGCGTGCGT-1 +CB:Z:TCTAGCCTCTTGCAGG-1 +CB:Z:TTTAAGGTCGATAACC-1 +CB:Z:CAGCTCAAGTACCGCA-1 +CB:Z:CCAGGAACAGGCTAGA-1 +CB:Z:TTTAGGATCATAGCCG-1 +CB:Z:GCGGTTATCGGTTTCC-1 +CB:Z:TGGTGCATCCGCAAGC-1 +CB:Z:GACGCCTAGCTCCTTA-1 +CB:Z:CGCATGATCCTGGTCT-1 +CB:Z:GAGTGAGGTACGCGCA-1 +CB:Z:GTAGGATCAAGGACCA-1 +CB:Z:TCGACAAGTGGATTCA-1 +CB:Z:GTGCTGGGTCGACTAA-1 +CB:Z:ATCAATCCAGCTTAAT-1 +CB:Z:AATTGGGAGTCCTGAG-1 +CB:Z:GGACTAAAGGCTATGT-1 +CB:Z:GGCTTCAAGACAACAG-1 +CB:Z:TGCTCTCAGCTTTGTT-1 +CB:Z:ATCCTGACAGTAGCCT-1 +CB:Z:AGTAACCTCCGCATGA-1 +CB:Z:GCTGTGCAGGATGATG-1 
+CB:Z:CAAGTATGTTGAGCCG-1 +CB:Z:ATCCATAAGATAGACG-1 +CB:Z:GAGTATCTCATTGCAA-1 +CB:Z:CCTCAATGTGGACCTG-1 +CB:Z:ACAGTATGTTGTTGGA-1 +CB:Z:CTTTGGTGTTGTTGGA-1 +CB:Z:CTCTAAGCAAGGACCA-1 +CB:Z:CATTGTGCACTTACAG-1 +CB:Z:ACTGAAACAACAGCCT-1 +CB:Z:CCAGTTTGTAACCTAG-1 +CB:Z:GTATTGATCCTAGTAA-1 +CB:Z:ATGGACAAGGGACCTC-1 +CB:Z:TGCCGGTAGGACCGCT-1 +CB:Z:TGAGGGAGTGACCTGG-1 +CB:Z:TTTAAGGTCGCTCCAT-1 +CB:Z:CCGTTATGTAATCGTG-1 +CB:Z:ACTTACTTCCAAGTGT-1 +CB:Z:AAAGGAGCATAAACCT-1 +CB:Z:GTACACCCACACAATT-1 +CB:Z:CGCTAACCACTTACAG-1 +CB:Z:TCTAGCACAACAGCCT-1 +CB:Z:GGGCGAATCATGCATA-1 +CB:Z:ACGTCCAAGTACTGGT-1 +CB:Z:TCTTCAAGTACGTTTC-1 +CB:Z:GCACCTAAGCACAGGG-1 +CB:Z:CCATAGCCAGCCTGCA-1 +CB:Z:ATTATCCTCATAATCG-1 +CB:Z:GTTGCCCGTCCTTTAA-1 +CB:Z:CGTTAACAGCCTGATG-1 +CB:Z:TTGCACACAACTGGGA-1 +CB:Z:TCGACAAGTTAACGGC-1 +CB:Z:TCTTCAAGTGTTCCCA-1 +CB:Z:AAGCTCCCAGGACCAA-1 +CB:Z:CCGTTGCGTAAGCTTG-1 +CB:Z:CGTGTGTCATTAGCGC-1 +CB:Z:GCACTAAGTACTGAAT-1 +CB:Z:GTCCGTAAGCTACTGG-1 +CB:Z:GGGCATTGTGACATAT-1 +CB:Z:CTCTTGATCGAGCAAA-1 +CB:Z:CTGTTCATCGGGACTC-1 +CB:Z:GGTCAATTCCGTTATT-1 +CB:Z:GGTCTTTGTGTGTGGT-1 +CB:Z:ATCCCGCGTTTAGTCC-1 +CB:Z:TTAGGCGTCGATTTAG-1 +CB:Z:ATCCTTAGTGACCTGG-1 +CB:Z:ATCTATGAGTTATTCC-1 +CB:Z:TTTCTTGCAGACAAAC-1 +CB:Z:CAAACACTCCTACCTA-1 +CB:Z:TGCTCTCAGGCTACAT-1 +CB:Z:AGCTACTAGGTGCGGA-1 +CB:Z:GCTGACCAGGCGCTAC-1 +CB:Z:CGAAGTAAGCAGGTGG-1 +CB:Z:TGATCGAGTCCGCTGT-1 +CB:Z:ACGTCCTTCGTTACTT-1 diff --git a/data/var.flt.vcf.gz b/data/var.flt.vcf.gz new file mode 100644 index 000000000..f691f261d Binary files /dev/null and b/data/var.flt.vcf.gz differ