-
Notifications
You must be signed in to change notification settings - Fork 78
Commit
…y by haplotypes. Added function to calculate reference length by haplotypes.
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
from quast_libs.fastaparser import read_fasta | ||
|
||
class DipQuastAnalyzer:
    """Group the records of a diploid reference FASTA file by haplotype.

    Every record name is expected to have the form
    ``<chromosome>_<haplotype>`` (for example ``chr1_haplotype1``). The
    haplotype is the part after the LAST underscore, so chromosome names may
    themselves contain underscores.
    """

    def __init__(self):
        # haplotype -> {chromosome name -> sequence}
        self.dip_genome_by_chr = {}
        # haplotype -> {chromosome name -> sequence length}
        self.dip_genome_by_chr_len = {}
        # haplotype -> summed length of all chromosomes of that haplotype
        self.genome_size_by_haplotypes = {}
        # haplotypes in the order in which they were first encountered
        self.__remember_haplotypes = []

    @staticmethod
    def _split_record_name(name):
        """Split a record name into ``(chromosome, haplotype)``.

        The split happens at the last underscore only, so a name such as
        ``chr_1_haplotype1`` yields ``('chr_1', 'haplotype1')`` instead of
        failing while unpacking.

        Raises:
            ValueError: if the name contains no underscore at all.
        """
        chr_name, separator, haplotype = name.strip('\n').rpartition('_')
        if not separator:
            raise ValueError(
                "Sequence name %r does not match the expected "
                "'<chromosome>_<haplotype>' pattern" % (name,))
        return chr_name, haplotype

    def _fill_from_records(self, records):
        """Register every ``(name, seq)`` pair from the iterable *records*.

        Updates the per-haplotype dictionaries and then recomputes the total
        size of every haplotype seen so far. Returns the same tuple as
        ``fill_dip_dict_by_chromosomes``.
        """
        for name, seq in records:
            chr_name, haplotype = self._split_record_name(name)
            if haplotype not in self.dip_genome_by_chr_len:
                # First record of this haplotype: create its containers.
                self.dip_genome_by_chr_len[haplotype] = {}
                self.dip_genome_by_chr[haplotype] = {}
                self.__remember_haplotypes.append(haplotype)
            self.dip_genome_by_chr_len[haplotype][chr_name] = len(seq)
            self.dip_genome_by_chr[haplotype][chr_name] = seq

        for haplotype in self.__remember_haplotypes:
            self.genome_size_by_haplotypes[haplotype] = sum(
                self.dip_genome_by_chr_len[haplotype].values())

        return self.dip_genome_by_chr, self.genome_size_by_haplotypes

    def fill_dip_dict_by_chromosomes(self, fasta_fpath):
        """Read *fasta_fpath* and group its sequences by haplotype.

        Args:
            fasta_fpath: path handed to ``read_fasta``, which is iterated as
                ``(name, seq)`` pairs.

        Returns:
            A tuple ``(dip_genome_by_chr, genome_size_by_haplotypes)``: the
            mapping haplotype -> {chromosome name -> sequence} and the
            mapping haplotype -> total sequence length.

        Raises:
            ValueError: if a record name contains no underscore.
        """
        return self._fill_from_records(read_fasta(fasta_fpath))
|
||
|
||
|
||
|
||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -172,6 +172,8 @@ class Fields: | |
|
||
# Reference statistics | ||
REFLEN = 'Reference length' | ||
REFLEN_HAPLOTYPE1 = 'Reference length of haplotype 1' | ||
REFLEN_HAPLOTYPE2 = 'Reference length of haplotype 2' | ||
This comment has been minimized.
Sorry, something went wrong.
alexeigurevich
Contributor
|
||
ESTREFLEN = 'Estimated reference length' | ||
REF_FRAGMENTS = 'Reference fragments' | ||
REFGC = 'Reference GC (%)' | ||
|
@@ -200,7 +202,7 @@ class Fields: | |
SIMILAR_MIS_BLOCKS = '# similar misassembled blocks' | ||
|
||
### content and order of metrics in MAIN REPORT (<quast_output_dir>/report.txt, .tex, .tsv): | ||
order = [NAME, CONTIGS__FOR_THRESHOLDS, TOTALLENS__FOR_THRESHOLDS, CONTIGS, LARGCONTIG, TOTALLEN, REFLEN, ESTREFLEN, GC, REFGC, | ||
order = [NAME, CONTIGS__FOR_THRESHOLDS, TOTALLENS__FOR_THRESHOLDS, CONTIGS, LARGCONTIG, TOTALLEN, REFLEN, REFLEN_HAPLOTYPE1, REFLEN_HAPLOTYPE2, ESTREFLEN, GC, REFGC, | ||
N50, NG50, Nx, NGx, auN, auNG, L50, LG50, Lx, LGx, | ||
TOTAL_READS, LEFT_READS, RIGHT_READS, | ||
MAPPED_READS_PCNT, REF_MAPPED_READS_PCNT, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
>chr1_haplotype1 | ||
ATGATTATTCGTTCGCCGGAACCAGAAGTCAAAATTTTGGTAGATAGGGATCCCATAAAAACTTCTTTCG | ||
AGGAATGGGCTAAACCCGGTCATTTCTCAAGAACAATAGCTAAGGGACCTGATACTACCACTTGGATCTG | ||
GAACCTACATGCTGATGCTCACGATTTTGATAGTCATACCAGTGATTTGGAGGAAATCTCTCGAAAAGTA | ||
TTTAGTGCCCATTTCGGCCAACTCTCTATCATCTTTCTTTGGCTGAGTGGCATGTATTTCCATGGTGCTC | ||
GTTTTTCCAATTATGAAGCATGGCTGAGTGATCCTACTCACATTGGACCTAGTGCTCAGGTGGTTTGGCC | ||
AATAGTGGGCCAAGAAATCCTGAATGGAGATGTGGGCGGAGGCTTCCGAGGAATACAAATAACCTCAGGC | ||
TTTTTTCAGATTTGGCGAGCATCCGGAATAACTAGTGAATTACAACTTTATTGTACCGCAATTGGCGCAT | ||
TGGTCTTCGCAGCCTTAATGCTTTTTGCTGGTTGGTTCCATTATCACAAAGCAGCTCCAAAATTGGCTTG | ||
GTTCCAAGATGTAGAATCTATGTTGAATCACCATTTAGCAGGGCTACTAGGACTTGGGTCCCTTTCTTGG | ||
GCAGGACATCAAGTACATGTATCTTTACCGATTAACCAATTTCTAAACGCTGGAGTAGATCCTAAAGAAA | ||
TACCGCTTCCTCATGAATTTATCTTGAATCGGGATCTTTTGGCTCAACTTTATCCAAGTTTTGCTGAAGG | ||
AGCAACTCCCTTTTTTACCTTAAATTGGTCAAAATACTCGGAATTTCTTACTTTTCGTGGCGGATTAGAT | ||
CCAGTGACTGGGGGTCTATGGTTAACCGATATAGCACATCATCATTTAGCTATCGCAATTCTTTTTCTAA | ||
TCGCGGGTCATATGTATAGGACC | ||
>chr2_haplotype1 | ||
AGGTCCATTTACAGGCCAAGGCCATAAAGGCCTATATGAAATTCTAACAACATCATGGCATGCTCAATTA | ||
TCTCTTAACCTAGCTATGTTAGGCTCTTTAACCATTATTGTAGCTCACCATATGTATTCCATGCCCCCTT | ||
ATCCATATCTAGCTACTGACTATGCTACACAACTGTCATTGTTCACACATCACATGTGGATTGGTGGATT | ||
TCTCATAGTTGGTGCTGCTGCGCATGCAGCCATTTTTATGGTAAGAGACTATGATCCAACTAATCGATAT | ||
AACGATTTATTAGATCGTGTCCTGAGGCATCGCGATGCAATCATATCACATCTCAACTGGGTATGTATAT | ||
TTCTAGGCTTCCACAGTTTTGGTTTGTATATTCATAATGATACCATGAGTGCTTTAGGGCGTCCACAAGA | ||
TATGTTTTCAGATACTGCTATACAATTACAACCAGTCTTTGCTCAATGGATACAAAATACCCATGCTTTA | ||
GCACCTGGTGTAACAGCCCCTGGTGAAACAGCGAGCACCAGTTTGACTTGGGGGGGCGGTGAGTTAGTAG | ||
CAGTGGGTGGCAAAGTAGCTTT | ||
>chr3_haplotype1 | ||
TGCATTTACAATTCATGTGACGGTATTGATACTGTTGAAAGGTGTTCTATTTGCTCGTAGCTCGCGTTTA | ||
ATACCAGATAAAGCAAATCTTGGTTTTCGTTTCCCTTGTGATGGGCCGGGAAGAGGAGGAACATGTCAAG | ||
TATCTGCTTGGGATCATGTCTTCTTAGGACTATTCTGGATGTACAATGCTATTTCCGTAGTAATATTCCA | ||
TTTCAGTTGGAAAATGCAGTCAGATGTTTGGGGTAGTATAAGTGATCAAGGGGTGGTAACTCATATTACT | ||
GGAGGAAACTTTGCACAGAGTTCCATTACTATTAATGGGTGGCTCCGCGATTTCTTATGGGCACAAGCAT | ||
CTCAGGTAATTCAGTCTTATGGTTCTTCGTTATCTGCATATGGTCTTTTTTTCCTAGGTGCTCATTTTGT | ||
ATGGGCTTTCAGTTTAATGTTTCTATTCAGCGGGCGTGGTTATTGGCAAGAACTTATTGAATCCATTGTT | ||
TGGGCTCATAATAAATTAAAAGTTGCTCCTGCTACTCAGCCTAGAGCCTTGAGCATTATACAAGGACGTG | ||
CTGTAGGAGTAACCCATTACCTTCTGGGTGGAATTGCCACAACATGGGCGTTCTTCTTAGCAAGAATTAT | ||
TGCAGTAGGATAAAACTGGGGTATTGGTCATGGTATAAAAGATATTTTAGAGGCTCATAAGTTACCTATT | ||
CCATTAGGAACGGCAGATTTTTTGGTACATCATATTCA | ||
>chr1_haplotype2 | ||
ATGGAATTAAGATTTCCCAGGTTTAGCCAAGGCTTAGCTCAGGACCCCACTACTCGTCGTATTTGGTTTG | ||
GTATTGCTACCGCACATGATTTCGAAAGTCATGATGATATTACTGAGGAACGTCTTTATCAGAACATTTT | ||
TGCTTCTCACTTTGGGCAGTTAGCAATAATCTTTCTATGGACGTCCGGAAATCTGTTTCATGTAGCTTGG | ||
CAAGGAAATTTTGAATCATGGATACAGGATCCTTTACACGTAAGACCTATTGCTCATGCCATTTGGGATC | ||
CTCATTTTGGGCAACCCGCTGTGGAAGCCTTTACTCGAGGAGGTGCTGCCGGTCCAGTGAATATCGCTTA | ||
TTCTGGGGTTTATCAGTGGTGGTATACAATTGGATTGCGCACCAATGAAGATCTTTATACTGGAGCTCTT | ||
TTTCTATTATTTCTTTCTACGCTATCCTTAATAGGGGGTTGGTTACATCTACAACCCAAATGGAAGCCAA | ||
GCCTTTCGTGGTTCAAAAACGCCGAATCTCGTCTGAATCATCATTTGTCAGGACTTTTCGGAGTAAGTTC | ||
TTTGGCTTGGACAGGACATTTAGTTCATGTTGCTATTCCCGGATCCTCTAGGGGGGAGTACGTTCGATGG | ||
AATAATTTCTTAGATGTATTACCCTATCCCCAGGGGTTGGGTCCCCTTCTGACGGGTCAGTGGAATCTTT | ||
ATGCCCAAAATCCTGATTCGAGTAATCATTTATTTGGTACCACTCAAGGAGCGGGAACTGCCATTCTGAC | ||
CCTTCTTGGGGGATTCC | ||
>chr2_haplotype2 | ||
ATTGCATTTATTTTTCTCATTGCCGGTCATATGTATCGAACTAACTTCGGAATTGGGCACAGTATCAAAG | ||
ATCTTTTAGAAGCACATACTCCTCCGGGGGGTCGATTAGGACGTGGGCATAAAGGCCTTTATGATACAAT | ||
CAATAATTCGATTCATTTTCAATTAGGCCTTGCTCTAGCTTCCTTAGGGGTTATTACTTCCTTAGTAGCT | ||
CAACATATGTACTCTTTACCTGCTTATGCATTCATAGCACAAGACTTTACTACTCAAGCTGCTTTATATA | ||
CTCATCACCAATACATTGCAGGGTTCATCATGACAGGGGCTTTTGCTCATGGAGCTATTTTTTTCATTAG | ||
GGATTACAATCCGGAACAGAATGAAGATAATGTATTGGCAAGAATGTTAGACCATAAGGAAGCTATCATA | ||
TCTCATTTAAGTTGGGCTAGCCTCTTCCTAGGATTCCATACCTTGGGCCCTTATGTTCATAACGACGTTA | ||
TGCTTGCTTTTGGTACTCCAGAAAAGCAAATCTTGATTGAACCTATATTTGCCCAATGGATACAATCTGC | ||
TCATGGTAAGACGACATATGGGTTCGATATACTCTTATCTTCAACGAATGGCCCCACTTTCAATGCAGGT | ||
CGAAACATATGGTTGCCCGGATGGTTGAATGCTGTTAATGAGAATAGTAATTCGCTTTTCTTAACAATAG | ||
GACCTGGGGATTTCTTGGTTCATCATGCTATTGCTCTAGGTTTGCATACAACTACATTGATTTTAGTAAA | ||
GGGTGCTTTAGATGCACGCGGTTCCAAATTAATGCCGGATAAAAAGGATTTCGGGTATAGTTTT | ||
>chr3_haplotype2 | ||
GACGGCCCAGGGCGCGGCGGTACTTGTGATATTTCTGCTTGGGACGCGTTTTATTTGGCAGTTTTCTGGA | ||
TGTTAAATACCATTGGATGGGTTACTTTTTATTGGCATTGGAAACACATTACATTATGGCAGGGCAACGT | ||
TTCACAATTTAATGAATCCTCCACTTATTTGATGGGATGGTTAAGAGATTACCTATGGTTAAACTCTTCA | ||
CAACTTATTAATGGATATAATCCTTTTGGGATGAATAGTTTATCAGTATGGGCTTGGATGTTCTTATTTG | ||
GACATCTTGTTTGGGCTACAGGATTTATGTTCTTAATTTCCTGGCGTGGATATTGGCAGGAATTAATTGA | ||
GACTTTAGCATGGGCTCATGAACGGACACCTTTGGCTAATTTAATTCGCTGGAGAGATAAGCCCGTGGCT | ||
CTTTCCGTTGTGCAAGCAAGATTGGTCGGATTAGCCCACTTTTCCGTGGGTTATATATTCACTTATGCAG | ||
CTTTCTTGATTGCCTCAACATCAGGCAAGTTCGGTTAAATCCACAAACACAAAGTTTGTGGCTGACCGAT | ||
ATTGCTCATCATCATTTAGCTCCTTGC |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
>cont_1 | ||
ATGATTATTCGTTCGCCGGAACCAGAAGTCAAAATTTTGGTAGATAGGGATCCCATAAAAACTTCTTTCG | ||
AGGAATGGGCTAAACCCGGTCATTTCTCAAGAACAATAGCTAAGGGACCTGATACTACCACTTGGATCTG | ||
GAACCTACATGCTGATGCTCACGATTTTGATAGTCATACCAGTGATTTGGAGGAAATCTCTCGAAAAGTA | ||
TTTAGTGCCCATTTCGGCCAACTCTCTATCATCTTTCTTTGGCTGAGTGGCATGTATTTCCATGGTGCTC | ||
GTTTTTCCAATTATGAAGCATGGCTGAGTGATCCTACTCACATTGGACCTAGTGCTCAGGTGGTTTGGCC | ||
AATAGTGGGCCAAGAAATCCTGAATGGAGATGTGGGCGGAGGCTTCCGAGGAATACAAATAACCTCAGGC | ||
>cont_2 | ||
TTTTTTCAGATTTGGCGAGCATCCGGAATAACTAGTGAATTACAACTTTATTGTACCGCAATTGGCGCAT | ||
TGGTCTTCGCAGCCTTAATGCTTTTTGCTGGTTGGTTCCATTATCACAAAGCAGCTCCAAAATTGGCTTG | ||
GTTCCAAGATGTAGAATCTATGTTGAATCACCATTTAGCAGGGCTACTAGGACTTGGGTCCCTTTCTTGG | ||
GCAGGACATCAAGTACATGTATCTTTACCGATTAACCAATTTCTAAACGCTGGAGTAGATCCTAAAGAAA | ||
TACCGCTTCCTCATGAATTTATCTTGAATCGGGATCTTTTGGCTCAACTTTATCCAAGTTTTGCTGAAGG | ||
AGCAACTCCCTTTTTTACCTTAAATTGGTCAAAATACTCGGAATTTCTTACTTTTCGTGGCGGATTAGAT | ||
CCAGTGACTGGGGGTCTATGGTTAACCGATATAGCACATCATCATTTAGCTATCGCAATTCTTTTTCTAA | ||
TCGCGGGTCATATGTATAGGACC | ||
>cont_3 | ||
AGGTCCATTTACAGGCCAAGGCCATAAAGGCCTATATGAAATTCTAACAACATCATGGCATGCTCAATTA | ||
TCTCTTAACCTAGCTATGTTAGGCTCTTTAACCATTATTGTAGCTCACCATATGTATTCCATGCCCCCTT | ||
ATCCATATCTAGCTACTGACTATGCTACACAACTGTCATTGTTCACACATCACATGTGGATTGGTGGATT | ||
TCTCATAGTTGGTGCTGCTGCGCATGCAGCCATTTTTATGGTAAGAGACTATGATCCAACTAATCGATAT | ||
AACGATTTATTAGATCGTGTCCTGAGGCATCGCGATGCAATCATATCACATCTCAACTGGGTATGTATAT | ||
TTC | ||
>cont_4 | ||
ATGGAATTAAGATTTCCCAGGTTTAGCCAAGGCTTAGCTCAGGACCCCACTACTCGTCGTATTTGGTTTG | ||
GTATTGCTACCGCACATGATTTCGAAAGTCATGATGATATTACTGAGGAACGTCTTTATCAGAACATTTT | ||
TGCTTCTCACTTTGGGCAGTTAGCAATAATCTTTCTATGGACGTCCGGAAATCTGTTTCATGTAGCTTGG | ||
CAAGGAAATTTTGAATCATGGATACAGGATCCTTTACACGTAAGACCTATTGCTCATGCCATTTGGGATC | ||
CTCATTCATAAAGGCCTTTATGATACAAT | ||
>cont_5 | ||
CAATAATTCGATTCATTTTCAATTAGGCCTTGCTCTAGCTTCCTTAGGGGTTATTACTTCCTTAGTAGCT | ||
CAACATATGTACTCTTTACCTGCTTATGCATTCATAGCACAAGACTTTACTACTCAAGCTGCTTTATATA | ||
CTCATCACCAATACATTGCAGGGTTCATCATGACAGGGGCTTTTGCTCATGGAGCTATTTTTTTCATTAG | ||
GGATTACAATCCGGAACAGAATGAAGATAATGTATTGGCAAGAATGTTAGACCATAAGGAAGCTATCATA | ||
TCTCATTTAAGTTGGGCTAGCCTCTTCCTAGGATTCCATACCTTGGGCCCTTATGTTCATAACGACGTTA | ||
TGCTTGCTTTTGGTACTCCAGAAAAGCAAATCTTGATTGAACCTATATTTGCCCAATGGATACAATCTGC | ||
TCATGGTAAGACGACAT |
Let's not store chromosome lengths here; we only need a mapping between haplotypes and their corresponding chromosome names to answer the following questions:
(1) Which haplotype does this chromosome belong to?
(2) Which chromosomes belong to this haplotype?
(3) Which haplotypes are there?
We can compute the length per haplotype when we calculate the reference length in
basic_stats.py
(we calculate all chromosome lengths there anyway).