From 8cb01e6764dbdb7821b31f525a27f886f74c6dbe Mon Sep 17 00:00:00 2001 From: Alexandra Kasianova Date: Mon, 13 Feb 2023 16:51:15 +0300 Subject: [PATCH] Added diputils.py. Added the ability to create a dictionary separately by haplotypes. Added function to calculate reference length by haplotypes. --- quast_libs/basic_stats.py | 7 +++ quast_libs/diputils.py | 30 +++++++++++++ quast_libs/reporting.py | 4 +- test_data/dip_reference.fasta | 73 ++++++++++++++++++++++++++++++++ test_data/dip_test_contigs.fasta | 37 ++++++++++++++++ 5 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 quast_libs/diputils.py create mode 100644 test_data/dip_reference.fasta create mode 100644 test_data/dip_test_contigs.fasta diff --git a/quast_libs/basic_stats.py b/quast_libs/basic_stats.py index 1c470a6216..e9f0f9a242 100644 --- a/quast_libs/basic_stats.py +++ b/quast_libs/basic_stats.py @@ -14,6 +14,8 @@ from quast_libs import fastaparser, qconfig, qutils, reporting, plotter from quast_libs.circos import set_window_size from quast_libs.log import get_logger +from quast_libs.diputils import DipQuastAnalyzer + logger = get_logger(qconfig.LOGGER_DEFAULT_NAME) MIN_HISTOGRAM_POINTS = 5 MIN_GC_WINDOW_SIZE = qconfig.GC_window_size // 2 @@ -323,6 +325,11 @@ def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir): report.add_field(reporting.Fields.UNCALLED_PERCENT, ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length)))) if ref_fpath: report.add_field(reporting.Fields.REFLEN, int(reference_length)) + + dipquast = DipQuastAnalyzer() + _, genome_size_by_haplotypes = dipquast.fill_dip_dict_by_chromosomes(ref_fpath) + report.add_field(reporting.Fields.REFLEN_HAPLOTYPE1, int(genome_size_by_haplotypes['haplotype1'])) + report.add_field(reporting.Fields.REFLEN_HAPLOTYPE2, int(genome_size_by_haplotypes['haplotype2'])) report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments) if not qconfig.is_combined_ref: report.add_field(reporting.Fields.REFGC, ('%.2f' % 
reference_GC if reference_GC is not None else None))
diff --git a/quast_libs/diputils.py b/quast_libs/diputils.py
new file mode 100644
index 0000000000..1add258f41
--- /dev/null
+++ b/quast_libs/diputils.py
@@ -0,0 +1,59 @@
+from quast_libs.fastaparser import read_fasta
+
+
+class DipQuastAnalyzer:
+    """Group the records of a diploid reference FASTA by haplotype.
+
+    Record names are expected to have the form ``<chromosome>_<haplotype>``,
+    e.g. ``chr1_haplotype1``.
+    """
+
+    def __init__(self):
+        # haplotype -> {chromosome name -> sequence}
+        self.dip_genome_by_chr = {}
+        # haplotype -> {chromosome name -> sequence length}
+        self.dip_genome_by_chr_len = {}
+        # haplotype -> summed length of all its chromosomes
+        self.genome_size_by_haplotypes = {}
+        # haplotypes in order of first appearance in the FASTA file
+        self.__remember_haplotypes = []
+
+    @staticmethod
+    def _split_record_name(name):
+        """Split a FASTA record name into (chromosome name, haplotype).
+
+        Only the last underscore is treated as the separator, so chromosome
+        names that contain underscores themselves are kept intact.
+        Raises ValueError if the name contains no underscore at all.
+        """
+        chr_name, sep, haplotype = name.strip('\n').rpartition('_')
+        if not sep:
+            raise ValueError(
+                'FASTA record name %r does not look like '
+                '<chromosome>_<haplotype>' % (name,))
+        return chr_name, haplotype
+
+    def fill_dip_dict_by_chromosomes(self, fasta_fpath):
+        """Read fasta_fpath and index its records by haplotype and chromosome.
+
+        Records are added to the dictionaries kept on this instance, so a
+        second call on the same instance accumulates on top of the first.
+
+        Returns the tuple (dip_genome_by_chr, genome_size_by_haplotypes).
+        Raises ValueError if a record name contains no underscore.
+        """
+        for name, seq in read_fasta(fasta_fpath):
+            chr_name, haplotype = self._split_record_name(name)
+            if haplotype not in self.dip_genome_by_chr_len:
+                self.dip_genome_by_chr_len[haplotype] = {}
+                self.dip_genome_by_chr[haplotype] = {}
+                self.__remember_haplotypes.append(haplotype)
+            self.dip_genome_by_chr_len[haplotype][chr_name] = len(seq)
+            self.dip_genome_by_chr[haplotype][chr_name] = seq
+
+        # Recompute the totals from the per-chromosome lengths.
+        for haplotype in self.__remember_haplotypes:
+            self.genome_size_by_haplotypes[haplotype] = sum(
+                self.dip_genome_by_chr_len[haplotype].values())
+
+        return self.dip_genome_by_chr, self.genome_size_by_haplotypes
diff --git a/quast_libs/reporting.py b/quast_libs/reporting.py index d1f764f8c5..c7262d3d24 100644 --- a/quast_libs/reporting.py +++ b/quast_libs/reporting.py @@ -172,6 +172,8 @@ class Fields: # Reference statistics REFLEN = 'Reference length' + REFLEN_HAPLOTYPE1 = 'Reference length of haplotype 1' + REFLEN_HAPLOTYPE2 = 'Reference length of haplotype 2' ESTREFLEN = 'Estimated reference length' REF_FRAGMENTS = 'Reference fragments' REFGC = 'Reference GC (%)' @@ -200,7 +202,7 @@ class Fields: SIMILAR_MIS_BLOCKS = '# similar misassembled blocks' ### content and order of metrics in MAIN REPORT (/report.txt, .tex, .tsv): - order = [NAME, CONTIGS__FOR_THRESHOLDS, TOTALLENS__FOR_THRESHOLDS, CONTIGS, LARGCONTIG, TOTALLEN, REFLEN, ESTREFLEN, GC, REFGC, + order = [NAME, CONTIGS__FOR_THRESHOLDS, TOTALLENS__FOR_THRESHOLDS, CONTIGS, 
LARGCONTIG, TOTALLEN, REFLEN, REFLEN_HAPLOTYPE1, REFLEN_HAPLOTYPE2, ESTREFLEN, GC, REFGC, N50, NG50, Nx, NGx, auN, auNG, L50, LG50, Lx, LGx, TOTAL_READS, LEFT_READS, RIGHT_READS, MAPPED_READS_PCNT, REF_MAPPED_READS_PCNT, diff --git a/test_data/dip_reference.fasta b/test_data/dip_reference.fasta new file mode 100644 index 0000000000..76c7639a67 --- /dev/null +++ b/test_data/dip_reference.fasta @@ -0,0 +1,73 @@ +>chr1_haplotype1 +ATGATTATTCGTTCGCCGGAACCAGAAGTCAAAATTTTGGTAGATAGGGATCCCATAAAAACTTCTTTCG +AGGAATGGGCTAAACCCGGTCATTTCTCAAGAACAATAGCTAAGGGACCTGATACTACCACTTGGATCTG +GAACCTACATGCTGATGCTCACGATTTTGATAGTCATACCAGTGATTTGGAGGAAATCTCTCGAAAAGTA +TTTAGTGCCCATTTCGGCCAACTCTCTATCATCTTTCTTTGGCTGAGTGGCATGTATTTCCATGGTGCTC +GTTTTTCCAATTATGAAGCATGGCTGAGTGATCCTACTCACATTGGACCTAGTGCTCAGGTGGTTTGGCC +AATAGTGGGCCAAGAAATCCTGAATGGAGATGTGGGCGGAGGCTTCCGAGGAATACAAATAACCTCAGGC +TTTTTTCAGATTTGGCGAGCATCCGGAATAACTAGTGAATTACAACTTTATTGTACCGCAATTGGCGCAT +TGGTCTTCGCAGCCTTAATGCTTTTTGCTGGTTGGTTCCATTATCACAAAGCAGCTCCAAAATTGGCTTG +GTTCCAAGATGTAGAATCTATGTTGAATCACCATTTAGCAGGGCTACTAGGACTTGGGTCCCTTTCTTGG +GCAGGACATCAAGTACATGTATCTTTACCGATTAACCAATTTCTAAACGCTGGAGTAGATCCTAAAGAAA +TACCGCTTCCTCATGAATTTATCTTGAATCGGGATCTTTTGGCTCAACTTTATCCAAGTTTTGCTGAAGG +AGCAACTCCCTTTTTTACCTTAAATTGGTCAAAATACTCGGAATTTCTTACTTTTCGTGGCGGATTAGAT +CCAGTGACTGGGGGTCTATGGTTAACCGATATAGCACATCATCATTTAGCTATCGCAATTCTTTTTCTAA +TCGCGGGTCATATGTATAGGACC +>chr2_haplotype1 +AGGTCCATTTACAGGCCAAGGCCATAAAGGCCTATATGAAATTCTAACAACATCATGGCATGCTCAATTA +TCTCTTAACCTAGCTATGTTAGGCTCTTTAACCATTATTGTAGCTCACCATATGTATTCCATGCCCCCTT +ATCCATATCTAGCTACTGACTATGCTACACAACTGTCATTGTTCACACATCACATGTGGATTGGTGGATT +TCTCATAGTTGGTGCTGCTGCGCATGCAGCCATTTTTATGGTAAGAGACTATGATCCAACTAATCGATAT +AACGATTTATTAGATCGTGTCCTGAGGCATCGCGATGCAATCATATCACATCTCAACTGGGTATGTATAT +TTCTAGGCTTCCACAGTTTTGGTTTGTATATTCATAATGATACCATGAGTGCTTTAGGGCGTCCACAAGA +TATGTTTTCAGATACTGCTATACAATTACAACCAGTCTTTGCTCAATGGATACAAAATACCCATGCTTTA +GCACCTGGTGTAACAGCCCCTGGTGAAACAGCGAGCACCAGTTTGACTTGGGGGGGCGGTGAGTTAGTAG 
+CAGTGGGTGGCAAAGTAGCTTT +>chr3_haplotype1 +TGCATTTACAATTCATGTGACGGTATTGATACTGTTGAAAGGTGTTCTATTTGCTCGTAGCTCGCGTTTA +ATACCAGATAAAGCAAATCTTGGTTTTCGTTTCCCTTGTGATGGGCCGGGAAGAGGAGGAACATGTCAAG +TATCTGCTTGGGATCATGTCTTCTTAGGACTATTCTGGATGTACAATGCTATTTCCGTAGTAATATTCCA +TTTCAGTTGGAAAATGCAGTCAGATGTTTGGGGTAGTATAAGTGATCAAGGGGTGGTAACTCATATTACT +GGAGGAAACTTTGCACAGAGTTCCATTACTATTAATGGGTGGCTCCGCGATTTCTTATGGGCACAAGCAT +CTCAGGTAATTCAGTCTTATGGTTCTTCGTTATCTGCATATGGTCTTTTTTTCCTAGGTGCTCATTTTGT +ATGGGCTTTCAGTTTAATGTTTCTATTCAGCGGGCGTGGTTATTGGCAAGAACTTATTGAATCCATTGTT +TGGGCTCATAATAAATTAAAAGTTGCTCCTGCTACTCAGCCTAGAGCCTTGAGCATTATACAAGGACGTG +CTGTAGGAGTAACCCATTACCTTCTGGGTGGAATTGCCACAACATGGGCGTTCTTCTTAGCAAGAATTAT +TGCAGTAGGATAAAACTGGGGTATTGGTCATGGTATAAAAGATATTTTAGAGGCTCATAAGTTACCTATT +CCATTAGGAACGGCAGATTTTTTGGTACATCATATTCA +>chr1_haplotype2 +ATGGAATTAAGATTTCCCAGGTTTAGCCAAGGCTTAGCTCAGGACCCCACTACTCGTCGTATTTGGTTTG +GTATTGCTACCGCACATGATTTCGAAAGTCATGATGATATTACTGAGGAACGTCTTTATCAGAACATTTT +TGCTTCTCACTTTGGGCAGTTAGCAATAATCTTTCTATGGACGTCCGGAAATCTGTTTCATGTAGCTTGG +CAAGGAAATTTTGAATCATGGATACAGGATCCTTTACACGTAAGACCTATTGCTCATGCCATTTGGGATC +CTCATTTTGGGCAACCCGCTGTGGAAGCCTTTACTCGAGGAGGTGCTGCCGGTCCAGTGAATATCGCTTA +TTCTGGGGTTTATCAGTGGTGGTATACAATTGGATTGCGCACCAATGAAGATCTTTATACTGGAGCTCTT +TTTCTATTATTTCTTTCTACGCTATCCTTAATAGGGGGTTGGTTACATCTACAACCCAAATGGAAGCCAA +GCCTTTCGTGGTTCAAAAACGCCGAATCTCGTCTGAATCATCATTTGTCAGGACTTTTCGGAGTAAGTTC +TTTGGCTTGGACAGGACATTTAGTTCATGTTGCTATTCCCGGATCCTCTAGGGGGGAGTACGTTCGATGG +AATAATTTCTTAGATGTATTACCCTATCCCCAGGGGTTGGGTCCCCTTCTGACGGGTCAGTGGAATCTTT +ATGCCCAAAATCCTGATTCGAGTAATCATTTATTTGGTACCACTCAAGGAGCGGGAACTGCCATTCTGAC +CCTTCTTGGGGGATTCC +>chr2_haplotype2 +ATTGCATTTATTTTTCTCATTGCCGGTCATATGTATCGAACTAACTTCGGAATTGGGCACAGTATCAAAG +ATCTTTTAGAAGCACATACTCCTCCGGGGGGTCGATTAGGACGTGGGCATAAAGGCCTTTATGATACAAT +CAATAATTCGATTCATTTTCAATTAGGCCTTGCTCTAGCTTCCTTAGGGGTTATTACTTCCTTAGTAGCT +CAACATATGTACTCTTTACCTGCTTATGCATTCATAGCACAAGACTTTACTACTCAAGCTGCTTTATATA 
+CTCATCACCAATACATTGCAGGGTTCATCATGACAGGGGCTTTTGCTCATGGAGCTATTTTTTTCATTAG +GGATTACAATCCGGAACAGAATGAAGATAATGTATTGGCAAGAATGTTAGACCATAAGGAAGCTATCATA +TCTCATTTAAGTTGGGCTAGCCTCTTCCTAGGATTCCATACCTTGGGCCCTTATGTTCATAACGACGTTA +TGCTTGCTTTTGGTACTCCAGAAAAGCAAATCTTGATTGAACCTATATTTGCCCAATGGATACAATCTGC +TCATGGTAAGACGACATATGGGTTCGATATACTCTTATCTTCAACGAATGGCCCCACTTTCAATGCAGGT +CGAAACATATGGTTGCCCGGATGGTTGAATGCTGTTAATGAGAATAGTAATTCGCTTTTCTTAACAATAG +GACCTGGGGATTTCTTGGTTCATCATGCTATTGCTCTAGGTTTGCATACAACTACATTGATTTTAGTAAA +GGGTGCTTTAGATGCACGCGGTTCCAAATTAATGCCGGATAAAAAGGATTTCGGGTATAGTTTT +>chr3_haplotype2 +GACGGCCCAGGGCGCGGCGGTACTTGTGATATTTCTGCTTGGGACGCGTTTTATTTGGCAGTTTTCTGGA +TGTTAAATACCATTGGATGGGTTACTTTTTATTGGCATTGGAAACACATTACATTATGGCAGGGCAACGT +TTCACAATTTAATGAATCCTCCACTTATTTGATGGGATGGTTAAGAGATTACCTATGGTTAAACTCTTCA +CAACTTATTAATGGATATAATCCTTTTGGGATGAATAGTTTATCAGTATGGGCTTGGATGTTCTTATTTG +GACATCTTGTTTGGGCTACAGGATTTATGTTCTTAATTTCCTGGCGTGGATATTGGCAGGAATTAATTGA +GACTTTAGCATGGGCTCATGAACGGACACCTTTGGCTAATTTAATTCGCTGGAGAGATAAGCCCGTGGCT +CTTTCCGTTGTGCAAGCAAGATTGGTCGGATTAGCCCACTTTTCCGTGGGTTATATATTCACTTATGCAG +CTTTCTTGATTGCCTCAACATCAGGCAAGTTCGGTTAAATCCACAAACACAAAGTTTGTGGCTGACCGAT +ATTGCTCATCATCATTTAGCTCCTTGC \ No newline at end of file diff --git a/test_data/dip_test_contigs.fasta b/test_data/dip_test_contigs.fasta new file mode 100644 index 0000000000..753ec42c48 --- /dev/null +++ b/test_data/dip_test_contigs.fasta @@ -0,0 +1,37 @@ +>cont_1 +ATGATTATTCGTTCGCCGGAACCAGAAGTCAAAATTTTGGTAGATAGGGATCCCATAAAAACTTCTTTCG +AGGAATGGGCTAAACCCGGTCATTTCTCAAGAACAATAGCTAAGGGACCTGATACTACCACTTGGATCTG +GAACCTACATGCTGATGCTCACGATTTTGATAGTCATACCAGTGATTTGGAGGAAATCTCTCGAAAAGTA +TTTAGTGCCCATTTCGGCCAACTCTCTATCATCTTTCTTTGGCTGAGTGGCATGTATTTCCATGGTGCTC +GTTTTTCCAATTATGAAGCATGGCTGAGTGATCCTACTCACATTGGACCTAGTGCTCAGGTGGTTTGGCC +AATAGTGGGCCAAGAAATCCTGAATGGAGATGTGGGCGGAGGCTTCCGAGGAATACAAATAACCTCAGGC +>cont_2 +TTTTTTCAGATTTGGCGAGCATCCGGAATAACTAGTGAATTACAACTTTATTGTACCGCAATTGGCGCAT 
+TGGTCTTCGCAGCCTTAATGCTTTTTGCTGGTTGGTTCCATTATCACAAAGCAGCTCCAAAATTGGCTTG +GTTCCAAGATGTAGAATCTATGTTGAATCACCATTTAGCAGGGCTACTAGGACTTGGGTCCCTTTCTTGG +GCAGGACATCAAGTACATGTATCTTTACCGATTAACCAATTTCTAAACGCTGGAGTAGATCCTAAAGAAA +TACCGCTTCCTCATGAATTTATCTTGAATCGGGATCTTTTGGCTCAACTTTATCCAAGTTTTGCTGAAGG +AGCAACTCCCTTTTTTACCTTAAATTGGTCAAAATACTCGGAATTTCTTACTTTTCGTGGCGGATTAGAT +CCAGTGACTGGGGGTCTATGGTTAACCGATATAGCACATCATCATTTAGCTATCGCAATTCTTTTTCTAA +TCGCGGGTCATATGTATAGGACC +>cont_3 +AGGTCCATTTACAGGCCAAGGCCATAAAGGCCTATATGAAATTCTAACAACATCATGGCATGCTCAATTA +TCTCTTAACCTAGCTATGTTAGGCTCTTTAACCATTATTGTAGCTCACCATATGTATTCCATGCCCCCTT +ATCCATATCTAGCTACTGACTATGCTACACAACTGTCATTGTTCACACATCACATGTGGATTGGTGGATT +TCTCATAGTTGGTGCTGCTGCGCATGCAGCCATTTTTATGGTAAGAGACTATGATCCAACTAATCGATAT +AACGATTTATTAGATCGTGTCCTGAGGCATCGCGATGCAATCATATCACATCTCAACTGGGTATGTATAT +TTC +>cont_4 +ATGGAATTAAGATTTCCCAGGTTTAGCCAAGGCTTAGCTCAGGACCCCACTACTCGTCGTATTTGGTTTG +GTATTGCTACCGCACATGATTTCGAAAGTCATGATGATATTACTGAGGAACGTCTTTATCAGAACATTTT +TGCTTCTCACTTTGGGCAGTTAGCAATAATCTTTCTATGGACGTCCGGAAATCTGTTTCATGTAGCTTGG +CAAGGAAATTTTGAATCATGGATACAGGATCCTTTACACGTAAGACCTATTGCTCATGCCATTTGGGATC +CTCATTCATAAAGGCCTTTATGATACAAT +>cont_5 +CAATAATTCGATTCATTTTCAATTAGGCCTTGCTCTAGCTTCCTTAGGGGTTATTACTTCCTTAGTAGCT +CAACATATGTACTCTTTACCTGCTTATGCATTCATAGCACAAGACTTTACTACTCAAGCTGCTTTATATA +CTCATCACCAATACATTGCAGGGTTCATCATGACAGGGGCTTTTGCTCATGGAGCTATTTTTTTCATTAG +GGATTACAATCCGGAACAGAATGAAGATAATGTATTGGCAAGAATGTTAGACCATAAGGAAGCTATCATA +TCTCATTTAAGTTGGGCTAGCCTCTTCCTAGGATTCCATACCTTGGGCCCTTATGTTCATAACGACGTTA +TGCTTGCTTTTGGTACTCCAGAAAAGCAAATCTTGATTGAACCTATATTTGCCCAATGGATACAATCTGC +TCATGGTAAGACGACAT \ No newline at end of file