Skip to content

Commit

Permalink
Added diputils.py. Added the ability to create a dictionary separatel…
Browse files Browse the repository at this point in the history
…y by haplotypes. Added function to calculate reference length by haplotypes.
  • Loading branch information
albidgy committed Feb 13, 2023
1 parent 98e0bec commit 8cb01e6
Show file tree
Hide file tree
Showing 5 changed files with 150 additions and 1 deletion.
7 changes: 7 additions & 0 deletions quast_libs/basic_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from quast_libs import fastaparser, qconfig, qutils, reporting, plotter
from quast_libs.circos import set_window_size
from quast_libs.log import get_logger
from quast_libs.diputils import DipQuastAnalyzer

logger = get_logger(qconfig.LOGGER_DEFAULT_NAME)
MIN_HISTOGRAM_POINTS = 5
MIN_GC_WINDOW_SIZE = qconfig.GC_window_size // 2
Expand Down Expand Up @@ -323,6 +325,11 @@ def do(ref_fpath, contigs_fpaths, output_dirpath, results_dir):
report.add_field(reporting.Fields.UNCALLED_PERCENT, ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))))
if ref_fpath:
report.add_field(reporting.Fields.REFLEN, int(reference_length))

dipquast = DipQuastAnalyzer()
_, genome_size_by_haplotypes = dipquast.fill_dip_dict_by_chromosomes(ref_fpath)
report.add_field(reporting.Fields.REFLEN_HAPLOTYPE1, int(genome_size_by_haplotypes['haplotype1']))
report.add_field(reporting.Fields.REFLEN_HAPLOTYPE2, int(genome_size_by_haplotypes['haplotype2']))
report.add_field(reporting.Fields.REF_FRAGMENTS, reference_fragments)
if not qconfig.is_combined_ref:
report.add_field(reporting.Fields.REFGC, ('%.2f' % reference_GC if reference_GC is not None else None))
Expand Down
30 changes: 30 additions & 0 deletions quast_libs/diputils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from quast_libs.fastaparser import read_fasta

class DipQuastAnalyzer:
def __init__(self):
self.dip_genome_by_chr = {}
self.dip_genome_by_chr_len = {}

This comment has been minimized.

Copy link
@alexeigurevich

alexeigurevich Feb 13, 2023

Contributor

Let's not store chr lengths here, we need just mapping of haplotypes and their corresponding chromosome names to answer the following questions:
(1) To which haplotype belongs this chromosome
(2) List all chromosomes of this haplotype
(3) List all haplotypes

We can compute the length per haplotype when we calculate the Ref length in basic_stats.py (we calculate all chr lengths there anyway).

self.genome_size_by_haplotypes = {}
self.__remember_haplotypes = []

This comment has been minimized.

Copy link
@alexeigurevich

alexeigurevich Feb 13, 2023

Contributor

The empty line between functions is missing

def fill_dip_dict_by_chromosomes(self, fasta_fpath):

This comment has been minimized.

Copy link
@alexeigurevich

alexeigurevich Feb 13, 2023

Contributor

we can make the filling right in the init, so not necessary to have a separate function for it (at least now)

for name, seq in read_fasta(fasta_fpath):
chr_name, haplotype = name.strip('\n').split('_')

This comment has been minimized.

Copy link
@alexeigurevich

alexeigurevich Feb 13, 2023

Contributor

Will crash on e.g., "chr1_is_my_best_chr_of_haplotype1

chr_len = len(seq)
if haplotype not in self.dip_genome_by_chr_len.keys():
self.dip_genome_by_chr_len[haplotype] = {}
self.dip_genome_by_chr[haplotype] = {}
self.__remember_haplotypes.append(haplotype)

This comment has been minimized.

Copy link
@alexeigurevich

alexeigurevich Feb 13, 2023

Contributor

The field self.__remember_haplotypes in the current form is excessive, you can always get the same list via self.dip_genome_by_chr.keys(). If you want this list to be sorted, use collection.OrderedDict() instead of {} for self.dip_genome_by_chr

self.dip_genome_by_chr_len[haplotype][chr_name] = chr_len
self.dip_genome_by_chr[haplotype][chr_name] = seq

for haplotype_n in self.__remember_haplotypes:
self.genome_size_by_haplotypes[haplotype_n] = sum(self.dip_genome_by_chr_len[haplotype_n].values())

return self.dip_genome_by_chr, self.genome_size_by_haplotypes







4 changes: 3 additions & 1 deletion quast_libs/reporting.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,8 @@ class Fields:

# Reference statistics
REFLEN = 'Reference length'
REFLEN_HAPLOTYPE1 = 'Reference length of haplotype 1'
REFLEN_HAPLOTYPE2 = 'Reference length of haplotype 2'

This comment has been minimized.

Copy link
@alexeigurevich

alexeigurevich Feb 13, 2023

Contributor

Let's make it more generic to potentially support an unlimited number of haplotypes. See "CONTIGS__FOR_THRESHOLDS" as an example of a metric without the hardcoded prerequisites. We can hardcode the upper limit, e.g., 8 (or have a new option "--max-ploidy " with default e.g., 8, but I think the constant is good enough).
Later on, we can simply remove unnecessary lines from the report.
Alternatively, you can "process" the reference early in quast.py (right after parsing the options) and know the exact number of haplotypes and used it for the metric "initiations", but I prefer the way with the constant.

ESTREFLEN = 'Estimated reference length'
REF_FRAGMENTS = 'Reference fragments'
REFGC = 'Reference GC (%)'
Expand Down Expand Up @@ -200,7 +202,7 @@ class Fields:
SIMILAR_MIS_BLOCKS = '# similar misassembled blocks'

### content and order of metrics in MAIN REPORT (<quast_output_dir>/report.txt, .tex, .tsv):
order = [NAME, CONTIGS__FOR_THRESHOLDS, TOTALLENS__FOR_THRESHOLDS, CONTIGS, LARGCONTIG, TOTALLEN, REFLEN, ESTREFLEN, GC, REFGC,
order = [NAME, CONTIGS__FOR_THRESHOLDS, TOTALLENS__FOR_THRESHOLDS, CONTIGS, LARGCONTIG, TOTALLEN, REFLEN, REFLEN_HAPLOTYPE1, REFLEN_HAPLOTYPE2, ESTREFLEN, GC, REFGC,
N50, NG50, Nx, NGx, auN, auNG, L50, LG50, Lx, LGx,
TOTAL_READS, LEFT_READS, RIGHT_READS,
MAPPED_READS_PCNT, REF_MAPPED_READS_PCNT,
Expand Down
73 changes: 73 additions & 0 deletions test_data/dip_reference.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
>chr1_haplotype1
ATGATTATTCGTTCGCCGGAACCAGAAGTCAAAATTTTGGTAGATAGGGATCCCATAAAAACTTCTTTCG
AGGAATGGGCTAAACCCGGTCATTTCTCAAGAACAATAGCTAAGGGACCTGATACTACCACTTGGATCTG
GAACCTACATGCTGATGCTCACGATTTTGATAGTCATACCAGTGATTTGGAGGAAATCTCTCGAAAAGTA
TTTAGTGCCCATTTCGGCCAACTCTCTATCATCTTTCTTTGGCTGAGTGGCATGTATTTCCATGGTGCTC
GTTTTTCCAATTATGAAGCATGGCTGAGTGATCCTACTCACATTGGACCTAGTGCTCAGGTGGTTTGGCC
AATAGTGGGCCAAGAAATCCTGAATGGAGATGTGGGCGGAGGCTTCCGAGGAATACAAATAACCTCAGGC
TTTTTTCAGATTTGGCGAGCATCCGGAATAACTAGTGAATTACAACTTTATTGTACCGCAATTGGCGCAT
TGGTCTTCGCAGCCTTAATGCTTTTTGCTGGTTGGTTCCATTATCACAAAGCAGCTCCAAAATTGGCTTG
GTTCCAAGATGTAGAATCTATGTTGAATCACCATTTAGCAGGGCTACTAGGACTTGGGTCCCTTTCTTGG
GCAGGACATCAAGTACATGTATCTTTACCGATTAACCAATTTCTAAACGCTGGAGTAGATCCTAAAGAAA
TACCGCTTCCTCATGAATTTATCTTGAATCGGGATCTTTTGGCTCAACTTTATCCAAGTTTTGCTGAAGG
AGCAACTCCCTTTTTTACCTTAAATTGGTCAAAATACTCGGAATTTCTTACTTTTCGTGGCGGATTAGAT
CCAGTGACTGGGGGTCTATGGTTAACCGATATAGCACATCATCATTTAGCTATCGCAATTCTTTTTCTAA
TCGCGGGTCATATGTATAGGACC
>chr2_haplotype1
AGGTCCATTTACAGGCCAAGGCCATAAAGGCCTATATGAAATTCTAACAACATCATGGCATGCTCAATTA
TCTCTTAACCTAGCTATGTTAGGCTCTTTAACCATTATTGTAGCTCACCATATGTATTCCATGCCCCCTT
ATCCATATCTAGCTACTGACTATGCTACACAACTGTCATTGTTCACACATCACATGTGGATTGGTGGATT
TCTCATAGTTGGTGCTGCTGCGCATGCAGCCATTTTTATGGTAAGAGACTATGATCCAACTAATCGATAT
AACGATTTATTAGATCGTGTCCTGAGGCATCGCGATGCAATCATATCACATCTCAACTGGGTATGTATAT
TTCTAGGCTTCCACAGTTTTGGTTTGTATATTCATAATGATACCATGAGTGCTTTAGGGCGTCCACAAGA
TATGTTTTCAGATACTGCTATACAATTACAACCAGTCTTTGCTCAATGGATACAAAATACCCATGCTTTA
GCACCTGGTGTAACAGCCCCTGGTGAAACAGCGAGCACCAGTTTGACTTGGGGGGGCGGTGAGTTAGTAG
CAGTGGGTGGCAAAGTAGCTTT
>chr3_haplotype1
TGCATTTACAATTCATGTGACGGTATTGATACTGTTGAAAGGTGTTCTATTTGCTCGTAGCTCGCGTTTA
ATACCAGATAAAGCAAATCTTGGTTTTCGTTTCCCTTGTGATGGGCCGGGAAGAGGAGGAACATGTCAAG
TATCTGCTTGGGATCATGTCTTCTTAGGACTATTCTGGATGTACAATGCTATTTCCGTAGTAATATTCCA
TTTCAGTTGGAAAATGCAGTCAGATGTTTGGGGTAGTATAAGTGATCAAGGGGTGGTAACTCATATTACT
GGAGGAAACTTTGCACAGAGTTCCATTACTATTAATGGGTGGCTCCGCGATTTCTTATGGGCACAAGCAT
CTCAGGTAATTCAGTCTTATGGTTCTTCGTTATCTGCATATGGTCTTTTTTTCCTAGGTGCTCATTTTGT
ATGGGCTTTCAGTTTAATGTTTCTATTCAGCGGGCGTGGTTATTGGCAAGAACTTATTGAATCCATTGTT
TGGGCTCATAATAAATTAAAAGTTGCTCCTGCTACTCAGCCTAGAGCCTTGAGCATTATACAAGGACGTG
CTGTAGGAGTAACCCATTACCTTCTGGGTGGAATTGCCACAACATGGGCGTTCTTCTTAGCAAGAATTAT
TGCAGTAGGATAAAACTGGGGTATTGGTCATGGTATAAAAGATATTTTAGAGGCTCATAAGTTACCTATT
CCATTAGGAACGGCAGATTTTTTGGTACATCATATTCA
>chr1_haplotype2
ATGGAATTAAGATTTCCCAGGTTTAGCCAAGGCTTAGCTCAGGACCCCACTACTCGTCGTATTTGGTTTG
GTATTGCTACCGCACATGATTTCGAAAGTCATGATGATATTACTGAGGAACGTCTTTATCAGAACATTTT
TGCTTCTCACTTTGGGCAGTTAGCAATAATCTTTCTATGGACGTCCGGAAATCTGTTTCATGTAGCTTGG
CAAGGAAATTTTGAATCATGGATACAGGATCCTTTACACGTAAGACCTATTGCTCATGCCATTTGGGATC
CTCATTTTGGGCAACCCGCTGTGGAAGCCTTTACTCGAGGAGGTGCTGCCGGTCCAGTGAATATCGCTTA
TTCTGGGGTTTATCAGTGGTGGTATACAATTGGATTGCGCACCAATGAAGATCTTTATACTGGAGCTCTT
TTTCTATTATTTCTTTCTACGCTATCCTTAATAGGGGGTTGGTTACATCTACAACCCAAATGGAAGCCAA
GCCTTTCGTGGTTCAAAAACGCCGAATCTCGTCTGAATCATCATTTGTCAGGACTTTTCGGAGTAAGTTC
TTTGGCTTGGACAGGACATTTAGTTCATGTTGCTATTCCCGGATCCTCTAGGGGGGAGTACGTTCGATGG
AATAATTTCTTAGATGTATTACCCTATCCCCAGGGGTTGGGTCCCCTTCTGACGGGTCAGTGGAATCTTT
ATGCCCAAAATCCTGATTCGAGTAATCATTTATTTGGTACCACTCAAGGAGCGGGAACTGCCATTCTGAC
CCTTCTTGGGGGATTCC
>chr2_haplotype2
ATTGCATTTATTTTTCTCATTGCCGGTCATATGTATCGAACTAACTTCGGAATTGGGCACAGTATCAAAG
ATCTTTTAGAAGCACATACTCCTCCGGGGGGTCGATTAGGACGTGGGCATAAAGGCCTTTATGATACAAT
CAATAATTCGATTCATTTTCAATTAGGCCTTGCTCTAGCTTCCTTAGGGGTTATTACTTCCTTAGTAGCT
CAACATATGTACTCTTTACCTGCTTATGCATTCATAGCACAAGACTTTACTACTCAAGCTGCTTTATATA
CTCATCACCAATACATTGCAGGGTTCATCATGACAGGGGCTTTTGCTCATGGAGCTATTTTTTTCATTAG
GGATTACAATCCGGAACAGAATGAAGATAATGTATTGGCAAGAATGTTAGACCATAAGGAAGCTATCATA
TCTCATTTAAGTTGGGCTAGCCTCTTCCTAGGATTCCATACCTTGGGCCCTTATGTTCATAACGACGTTA
TGCTTGCTTTTGGTACTCCAGAAAAGCAAATCTTGATTGAACCTATATTTGCCCAATGGATACAATCTGC
TCATGGTAAGACGACATATGGGTTCGATATACTCTTATCTTCAACGAATGGCCCCACTTTCAATGCAGGT
CGAAACATATGGTTGCCCGGATGGTTGAATGCTGTTAATGAGAATAGTAATTCGCTTTTCTTAACAATAG
GACCTGGGGATTTCTTGGTTCATCATGCTATTGCTCTAGGTTTGCATACAACTACATTGATTTTAGTAAA
GGGTGCTTTAGATGCACGCGGTTCCAAATTAATGCCGGATAAAAAGGATTTCGGGTATAGTTTT
>chr3_haplotype2
GACGGCCCAGGGCGCGGCGGTACTTGTGATATTTCTGCTTGGGACGCGTTTTATTTGGCAGTTTTCTGGA
TGTTAAATACCATTGGATGGGTTACTTTTTATTGGCATTGGAAACACATTACATTATGGCAGGGCAACGT
TTCACAATTTAATGAATCCTCCACTTATTTGATGGGATGGTTAAGAGATTACCTATGGTTAAACTCTTCA
CAACTTATTAATGGATATAATCCTTTTGGGATGAATAGTTTATCAGTATGGGCTTGGATGTTCTTATTTG
GACATCTTGTTTGGGCTACAGGATTTATGTTCTTAATTTCCTGGCGTGGATATTGGCAGGAATTAATTGA
GACTTTAGCATGGGCTCATGAACGGACACCTTTGGCTAATTTAATTCGCTGGAGAGATAAGCCCGTGGCT
CTTTCCGTTGTGCAAGCAAGATTGGTCGGATTAGCCCACTTTTCCGTGGGTTATATATTCACTTATGCAG
CTTTCTTGATTGCCTCAACATCAGGCAAGTTCGGTTAAATCCACAAACACAAAGTTTGTGGCTGACCGAT
ATTGCTCATCATCATTTAGCTCCTTGC
37 changes: 37 additions & 0 deletions test_data/dip_test_contigs.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
>cont_1
ATGATTATTCGTTCGCCGGAACCAGAAGTCAAAATTTTGGTAGATAGGGATCCCATAAAAACTTCTTTCG
AGGAATGGGCTAAACCCGGTCATTTCTCAAGAACAATAGCTAAGGGACCTGATACTACCACTTGGATCTG
GAACCTACATGCTGATGCTCACGATTTTGATAGTCATACCAGTGATTTGGAGGAAATCTCTCGAAAAGTA
TTTAGTGCCCATTTCGGCCAACTCTCTATCATCTTTCTTTGGCTGAGTGGCATGTATTTCCATGGTGCTC
GTTTTTCCAATTATGAAGCATGGCTGAGTGATCCTACTCACATTGGACCTAGTGCTCAGGTGGTTTGGCC
AATAGTGGGCCAAGAAATCCTGAATGGAGATGTGGGCGGAGGCTTCCGAGGAATACAAATAACCTCAGGC
>cont_2
TTTTTTCAGATTTGGCGAGCATCCGGAATAACTAGTGAATTACAACTTTATTGTACCGCAATTGGCGCAT
TGGTCTTCGCAGCCTTAATGCTTTTTGCTGGTTGGTTCCATTATCACAAAGCAGCTCCAAAATTGGCTTG
GTTCCAAGATGTAGAATCTATGTTGAATCACCATTTAGCAGGGCTACTAGGACTTGGGTCCCTTTCTTGG
GCAGGACATCAAGTACATGTATCTTTACCGATTAACCAATTTCTAAACGCTGGAGTAGATCCTAAAGAAA
TACCGCTTCCTCATGAATTTATCTTGAATCGGGATCTTTTGGCTCAACTTTATCCAAGTTTTGCTGAAGG
AGCAACTCCCTTTTTTACCTTAAATTGGTCAAAATACTCGGAATTTCTTACTTTTCGTGGCGGATTAGAT
CCAGTGACTGGGGGTCTATGGTTAACCGATATAGCACATCATCATTTAGCTATCGCAATTCTTTTTCTAA
TCGCGGGTCATATGTATAGGACC
>cont_3
AGGTCCATTTACAGGCCAAGGCCATAAAGGCCTATATGAAATTCTAACAACATCATGGCATGCTCAATTA
TCTCTTAACCTAGCTATGTTAGGCTCTTTAACCATTATTGTAGCTCACCATATGTATTCCATGCCCCCTT
ATCCATATCTAGCTACTGACTATGCTACACAACTGTCATTGTTCACACATCACATGTGGATTGGTGGATT
TCTCATAGTTGGTGCTGCTGCGCATGCAGCCATTTTTATGGTAAGAGACTATGATCCAACTAATCGATAT
AACGATTTATTAGATCGTGTCCTGAGGCATCGCGATGCAATCATATCACATCTCAACTGGGTATGTATAT
TTC
>cont_4
ATGGAATTAAGATTTCCCAGGTTTAGCCAAGGCTTAGCTCAGGACCCCACTACTCGTCGTATTTGGTTTG
GTATTGCTACCGCACATGATTTCGAAAGTCATGATGATATTACTGAGGAACGTCTTTATCAGAACATTTT
TGCTTCTCACTTTGGGCAGTTAGCAATAATCTTTCTATGGACGTCCGGAAATCTGTTTCATGTAGCTTGG
CAAGGAAATTTTGAATCATGGATACAGGATCCTTTACACGTAAGACCTATTGCTCATGCCATTTGGGATC
CTCATTCATAAAGGCCTTTATGATACAAT
>cont_5
CAATAATTCGATTCATTTTCAATTAGGCCTTGCTCTAGCTTCCTTAGGGGTTATTACTTCCTTAGTAGCT
CAACATATGTACTCTTTACCTGCTTATGCATTCATAGCACAAGACTTTACTACTCAAGCTGCTTTATATA
CTCATCACCAATACATTGCAGGGTTCATCATGACAGGGGCTTTTGCTCATGGAGCTATTTTTTTCATTAG
GGATTACAATCCGGAACAGAATGAAGATAATGTATTGGCAAGAATGTTAGACCATAAGGAAGCTATCATA
TCTCATTTAAGTTGGGCTAGCCTCTTCCTAGGATTCCATACCTTGGGCCCTTATGTTCATAACGACGTTA
TGCTTGCTTTTGGTACTCCAGAAAAGCAAATCTTGATTGAACCTATATTTGCCCAATGGATACAATCTGC
TCATGGTAAGACGACAT

0 comments on commit 8cb01e6

Please sign in to comment.