From 2a3f4ced4b4f3d7c74e5b7019ac3f9c4e64acbb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 29 Apr 2022 17:31:49 +0200 Subject: [PATCH 01/20] Make Docstring --- VERSION | 2 +- ppanggolin/align/alignOnPang.py | 227 ++++++++++++++++++++++++-------- 2 files changed, 170 insertions(+), 59 deletions(-) diff --git a/VERSION b/VERSION index 19838b23..60e5fb3c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.92 +1.2.93 diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index c11ede62..ed402e1d 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -2,38 +2,56 @@ # coding:utf-8 # default libraries +from _io import TextIOWrapper import logging import tempfile import subprocess import argparse from collections import defaultdict +from typing import Tuple # local libraries from ppanggolin.formats import check_pangenome_info from ppanggolin.utils import mk_outdir, read_compressed_or_not from ppanggolin.pangenome import Pangenome +from ppanggolin.region import Spot from ppanggolin.figures.draw_spot import draw_selected_spots, subgraph -def createdb(file_obj, tmpdir): +def createdb(file_obj: TextIOWrapper, tmpdir: tempfile.TemporaryDirectory) -> TextIOWrapper: """ Create a MMseqs2 sequence database with the given fasta file :param file_obj: Fasta file - :type file_obj: _io.TextIOWrapper :param tmpdir: temporary directory - :type tmpdir: tempfile.TemporaryDirectory :return: DB file - :rtype: _io.TextIOWrapper """ seqdb = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.name) cmd = ["mmseqs", "createdb", file_obj.name, seqdb.name, '--dbtype', '0'] + logging.getLogger().debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL) return seqdb -def align_seq_to_pang(pang_file, seq_file, output, tmpdir, cpu=1, no_defrag=False, identity=0.8, coverage=0.8): +def align_seq_to_pang(pang_file: TextIOWrapper, seq_file: TextIOWrapper, output: str, + tmpdir: tempfile.TemporaryDirectory, cpu: 
int = 1, no_defrag: bool = False, + identity: float = 0.8, coverage: float = 0.8) -> str: + """ + Align pangenome sequences against fasta sequence + + :param pang_file: File with sequences in pangenome + :param seq_file: File with sequences from input file + :param output: Path of the output directory + :param tmpdir: Temporary directory to align sequences + :param cpu: Number of available cpu + :param no_defrag: Allow to pass the defragmentation step + :param identity: minimal identity threshold for the alignment + :param coverage: minimal identity threshold for the alignment + + :return: Alignement result file + """ + pang_db = createdb(pang_file, tmpdir) seq_db = createdb(seq_file, tmpdir) cov_mode = "0" # coverage of query and target @@ -57,10 +75,18 @@ def align_seq_to_pang(pang_file, seq_file, output, tmpdir, cpu=1, no_defrag=Fals return outfile -def read_alignments(outfile, pangenome): +def read_alignments(aln_res: str, pangenome: Pangenome) -> Tuple[dict, str]: + """ + Read alignment result to link input sequence to pangenome + + :param aln_res: Alignement result file + :param pangenome: Input pangenome + + :return: Dictionnary with sequence link to pangenome and actual name of resulting alignment file + """ seq2pang = {} - outname = open(outfile.replace("_tmp", ""), "w") # write the actual result file - with open(outfile, "r") as alnFile: + outname = open(aln_res.replace("_tmp", ""), "w") # write the actual result file + with open(aln_res, "r") as alnFile: for line in alnFile: line = line.replace("ppanggolin_", "") # remove the 'ppanggolin_' bit of the id outname.write(line) @@ -71,7 +97,13 @@ def read_alignments(outfile, pangenome): return seq2pang, outname.name -def get_seq(seq_file): +def get_seq(seq_file: TextIOWrapper) -> set: + """ + get sequence from sequence input file + + :param seq_file: file containing sequences + :return: + """ seqset = set() for line in seq_file: if line.startswith(">"): @@ -79,39 +111,67 @@ def get_seq(seq_file): return 
seqset -def write_gene_fam_sequences(pangenome, file_obj, add=""): +def write_gene_fam_sequences(pangenome: Pangenome, file_obj: TextIOWrapper, add: str = ""): + """ + Export the sequence of genes in families + + :param pangenome: Pangenome containing families + :param file_obj: Temporary file where sequences will be written + :param add: Add prefix to sequence name + """ for fam in pangenome.gene_families: file_obj.write(">" + add + fam.name + "\n") file_obj.write(fam.sequence + "\n") file_obj.flush() -def project_partition(seq2pang, seq_set, output): +def project_partition(seq_to_pang: dict, seq_set: set, output: str) -> str: + """ + Project the partition of each sequence from the input file + + :param seq_to_pang: dictionnary which link sequence and pangenome + :param seq_set: input sequences + :param output: Path of the output directory + + :return: Path to file which contain partition projection + """ + partition_proj = output + "/sequences_partition_projection.tsv" with open(partition_proj, "w") as partProjFile: - for key, pangFam in seq2pang.items(): + for key, pangFam in seq_to_pang.items(): partProjFile.write(key + "\t" + pangFam.named_partition + "\n") - for remainingSeq in (seq2pang.keys() & seq_set): + for remainingSeq in (seq_to_pang.keys() & seq_set): partProjFile.write(remainingSeq + "\tcloud\n") # if there is no hit, it's going to be cloud genes. 
return partition_proj -def get_fam_to_rgp(pangenome, multigenics): - """associates families to the RGP they belong to, and those they are bordering""" +def get_fam_to_rgp(pangenome, multigenics: set) -> dict: + """ + Associate families to the RGP they belong to, and those they are bordering + + :param pangenome: Input pangenome + :param multigenics: multigenics families + + :return: Dictionnary link families to RGP + """ fam2rgp = defaultdict(list) for rgp in pangenome.regions: for fam in rgp.families: fam2rgp[fam].append(rgp.name) - for fam in [gene.family for border in - rgp.get_bordering_genes(pangenome.parameters["spots"]["set_size"], multigenics) for gene in border]: + for fam in [gene.family for border in rgp.get_bordering_genes(pangenome.parameters["spots"]["set_size"], + multigenics) for gene in border]: fam2rgp[fam].append(rgp.name) return fam2rgp -def get_fam_to_spot(pangenome, multigenics): +def get_fam_to_spot(pangenome: Pangenome, multigenics: set) -> Tuple[dict, dict]: """ - reads a pangenome object and returns a dictionary of family to RGP and family to spot, - that indicates where each family is + Reads a pangenome object to link families and spots and indicate where each family is. + + :param pangenome: Input pangenome + :param multigenics: multigenics families + + :return: Dictionary of family to RGP and family to spot """ # those are to be replaced as spots should be stored in the pangenome, and in the h5. 
fam2spot = defaultdict(list) @@ -121,37 +181,62 @@ def get_fam_to_spot(pangenome, multigenics): fams_border = set() for rgp in spot.regions: fams |= rgp.families - fams_border |= set( - [gene.family for border in - rgp.get_bordering_genes(pangenome.parameters["spots"]["set_size"], multigenics) - for gene in border]) + fams_border |= set([gene.family for border in # Set of families in border of spot + rgp.get_bordering_genes(pangenome.parameters["spots"]["set_size"], multigenics) + for gene in border]) for fam in fams: fam2spot[fam].append(spot) for fam in fams_border: fam2border[fam].append(spot) - return fam2spot, fam2border, multigenics + return fam2spot, fam2border -def add_spot_str(a): - return "spot_" + str(a.ID) +def add_spot_str(spot: Spot) -> str: + """ + allow to map spot set + :param spot: spot which will be return -def draw_spot_gexf(spots, output, multigenics, fam2mod, set_size=3): + :return: Str with spot ID + """ + return "spot_" + str(spot.ID) + + +def draw_spot_gexf(spots: set, output: str, multigenics: set, fam_to_mod: dict, set_size: int = 3): + """ + Draw a gexf graph of the spot + + :param spots: spot find in the alignment between pangenome and input sequences + :param output: Path of the output directory + :param multigenics: multigenics families + :param fam_to_mod: dictionnary which link families and modules + :param set_size: + """ for spot in spots: fname = output + "/spot_" + str(spot.ID) + ".gexf" - subgraph(spot, fname, set_size=set_size, multigenics=multigenics, fam2mod=fam2mod) + subgraph(spot, fname, set_size=set_size, multigenics=multigenics, fam2mod=fam_to_mod) -def get_seq_info(seq2pang, pangenome, output, draw_related, disable_bar=False): +def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: str, draw_related: bool = False, disable_bar=False): + """ + Get sequences information after alignment + + :param seq_to_pang: Alignment result + :param pangenome: Pangenome which contain information + :param output: Path of the 
output directory + :param draw_related: Draw figures and graphs in a gexf format of spots associated to the input sequences + :param disable_bar: disable progress bar + :return: + """ logging.getLogger().info("Writing RGP and spot information related to hits in the pan") multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"]) finfo = open(output + "/info_input_seq.tsv", "w") finfo.write("input\tfamily\tpartition\tspot_list_as_member\tspot_list_as_border\trgp_list\n") fam2rgp = get_fam_to_rgp(pangenome, multigenics) - fam2spot, fam2border, multigenics = get_fam_to_spot(pangenome, multigenics) + fam2spot, fam2border = get_fam_to_spot(pangenome, multigenics) spot_list = set() - for seq, panfam in seq2pang.items(): + for seq, panfam in seq_to_pang.items(): finfo.write(seq + '\t' + panfam.name + "\t" + panfam.named_partition + "\t" + ",".join( map(add_spot_str, fam2spot[panfam])) + "\t" + ",".join( map(add_spot_str, fam2border[panfam])) + "\t" + ','.join(fam2rgp[panfam]) + "\n") @@ -163,48 +248,40 @@ def get_seq_info(seq2pang, pangenome, output, draw_related, disable_bar=False): for spot in spot_list: if len(spot.get_uniq_ordered_set()) > 1: drawn_spots.add(spot) - logging.getLogger().info( - f"Drawing the {len(drawn_spots)} spots with more than 1 organization " - f"related to hits of the input sequences...") + logging.getLogger().info(f"Drawing the {len(drawn_spots)} spots with more than 1 organization " + f"related to hits of the input sequences...") draw_selected_spots(drawn_spots, pangenome, output, pangenome.parameters["spots"]["overlapping_match"], pangenome.parameters["spots"]["exact_match"], pangenome.parameters["spots"]["set_size"], disable_bar=disable_bar) - # fam2module - fam2mod = {} + + fam2mod = {} # fam2module if pangenome.status["modules"] != "No": for mod in pangenome.modules: for fam in mod.families: fam2mod[fam] = f"module_{mod.ID}" - draw_spot_gexf(drawn_spots, output, multigenics=multigenics, fam2mod=fam2mod) + 
draw_spot_gexf(drawn_spots, output, multigenics=multigenics, fam_to_mod=fam2mod) - logging.getLogger().info( - f"File listing RGP and spots where sequences of interest are located : '{output + '/info_input_seq.tsv'}'") + logging.getLogger().info(f"File listing RGP and spots where sequences of interest are located : " + f"'{output + '/info_input_seq.tsv'}'") -def get_seq2pang(pangenome, sequence_file, output, tmpdir, cpu=1, no_defrag=False, identity=0.8, coverage=0.8): +def get_seq2pang(pangenome: Pangenome, sequence_file: str, output: str, tmpdir: tempfile.TemporaryDirectory, + cpu: int = 1, no_defrag: bool = False, identity: float = 0.8, + coverage: float = 0.8) -> Tuple[set, str, dict]: """ Assign a pangenome gene family to the input sequences. :param pangenome: Pangenome with gene families to align with the given input sequences - :type pangenome: Pangenome :param sequence_file: Sequences in a .fasta file to align with the given Pangenome - :type sequence_file: str - :param output: Output directory - :type output: str + :param output: Path of the output directory :param tmpdir: Temporary directory - :type tmpdir: tempfile.TemporaryDirectory - :param cpu: number of CPU cores to use - :type cpu: int + :param cpu: number of CPU cores to use :param no_defrag: do not use the defrag workflow if true - :type no_defrag: Boolean :param identity: minimal identity threshold for the alignment - :type identity: float :param coverage: minimal identity threshold for the alignment - :type coverage: float :return: sequence set, blast-tab result file string, and sequences aligned with families - :rtype: set, str, dic """ tmp_pang_file = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir.name) @@ -221,8 +298,25 @@ def get_seq2pang(pangenome, sequence_file, output, tmpdir, cpu=1, no_defrag=Fals return seq_set, align_file, seq2pang -def align(pangenome, sequence_file, output, tmpdir, identity=0.8, coverage=0.8, no_defrag=False, cpu=1, getinfo=False, - draw_related=False, 
disable_bar=False): +def align(pangenome: Pangenome, sequence_file: str, output: str, tmpdir: str, identity: float = 0.8, + coverage: float = 0.8, no_defrag: bool = False, cpu: int = 1, getinfo: bool = False, + draw_related: bool = False, disable_bar: bool = False): + """ + Main function to align pangenome sequences with fasta file using MMSeqs2 + + :param pangenome: Pangenome with gene families to align with the given input sequences + :param sequence_file: Sequences in a .fasta file to align with the given Pangenome + :param output: Path of the output directory + :param tmpdir: Temporary directory + :param identity: minimal identity threshold for the alignment + :param coverage: minimal identity threshold for the alignment + :param no_defrag: do not use the defrag workflow if true + :param cpu: number of CPU cores to use + :param getinfo: Extract info related to the best hit of each query, such as the RGP it is in, or the spots. + :param draw_related: Draw figures and graphs in a gexf format of spots associated to the input sequences + :param disable_bar: Disable the progresse bar + """ + if pangenome.status["geneFamilySequences"] not in ["inFile", "Loaded", "Computed"]: raise Exception("Cannot use this function as your pangenome does not have gene families representatives " "associated to it. 
For now this works only if the clustering is realised by PPanGGOLiN.") @@ -244,7 +338,7 @@ def align(pangenome, sequence_file, output, tmpdir, identity=0.8, coverage=0.8, seq_set, align_file, seq2pang = get_seq2pang(pangenome, sequence_file, output, new_tmpdir, cpu, no_defrag, identity, coverage) - if getinfo or draw_related: + if getinfo or draw_related: # TODO Add getinfo to function and remove if get_seq_info(seq2pang, pangenome, output, draw_related, disable_bar=disable_bar) part_proj = project_partition(seq2pang, seq_set, output) # write the partition assignation only logging.getLogger().info(f"sequences partition projection : '{part_proj}'") @@ -254,7 +348,12 @@ def align(pangenome, sequence_file, output, tmpdir, identity=0.8, coverage=0.8, new_tmpdir.cleanup() -def launch(args): +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) @@ -267,13 +366,25 @@ def launch(args): draw_related=args.draw_related, disable_bar=args.disable_prog_bar) -def subparser(sub_parser): +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in Command line + + :param sub_parser : sub_parser for align command + + :return : parser arguments for align command + """ parser = sub_parser.add_parser("align", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser_align(parser) return parser -def parser_align(parser): +def parser_align(parser: argparse.ArgumentParser): + """ + Parser for specific argument of align command + + :param parser: parser for align argument + """ required = parser.add_argument_group(title="Required arguments", description="All of the following arguments are required :") required.add_argument('-S', '--sequences', required=True, type=str, From 50bb823ee0f293dcca1a510307abcb39b5734720 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 2 May 2022 17:37:25 +0200 Subject: [PATCH 02/20] Define docstring for annotation module --- VERSION | 2 +- ppanggolin/annotate/annotate.py | 220 +++++++++++++++++++++++--------- ppanggolin/annotate/synta.py | 151 +++++++++++++++------- ppanggolin/utils.py | 18 ++- 4 files changed, 284 insertions(+), 107 deletions(-) diff --git a/VERSION b/VERSION index 60e5fb3c..4deca18a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.93 +1.2.94 diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index ea750905..74544f69 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -14,15 +14,19 @@ # local libraries from ppanggolin.annotate.synta import annotate_organism, read_fasta, get_dna_sequence from ppanggolin.pangenome import Pangenome -from ppanggolin.genome import Organism, Gene, RNA +from ppanggolin.genome import Organism, Gene, RNA, Contig from ppanggolin.utils import read_compressed_or_not, mk_file_name, min_one from ppanggolin.formats import write_pangenome def detect_filetype(filename): """ - detects whether the current file is gff3, gbk/gbff, fasta or unknown. + Detects whether the current file is gff3, gbk/gbff, fasta or unknown. If unknown, it will raise an error + + :param filename: path to file + + :return: current file type """ with read_compressed_or_not(filename) as f: first_line = f.readline() @@ -33,13 +37,33 @@ def detect_filetype(filename): elif first_line.startswith(">"): return 'fasta' else: - raise Exception( - "Filetype was not gff3 (file starts with '##gff-version 3') nor gbff/gbk (file starts with 'LOCUS ')." - " Only those two file formats are supported (for now).") + raise Exception("Filetype was not gff3 (file starts with '##gff-version 3') " + "nor gbff/gbk (file starts with 'LOCUS '). 
" + "Only those two file formats are supported (for now).") -def create_gene(org, contig, gene_counter, rna_counter, gene_id, dbxref, start, stop, strand, gene_type, position=None, - gene_name="", product="", genetic_code=11, protein_id=""): +def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxref: set, + start: int, stop: int, strand: str, gene_type: str, position: int = None, gene_name: str = "", + product: str = "", genetic_code: int = 11, protein_id: str = ""): + """ + Create a Gene object and associate to contig and Organism + + :param org: Organism to add gene + :param contig: Contig to add gene + :param gene_counter: Gene counter to name gene + :param rna_counter: RNA counter to name RNA + :param gene_id: local identifier + :param dbxref: cross-reference to external DB + :param start: Gene start position + :param stop: Gene stop position + :param strand: gene strand association + :param gene_type: gene type + :param position: position in contig + :param gene_name: Gene name + :param product: Function of gene + :param genetic_code: Genetic code used + :param protein_id: Protein identifier + """ if any('MaGe' or 'SEED' in dbref for dbref in dbxref): if gene_name == "": gene_name = gene_id @@ -80,8 +104,17 @@ def create_gene(org, contig, gene_counter, rna_counter, gene_id, dbxref, start, new_gene.fill_parents(org, contig) -def read_org_gbff(organism, gbff_file_path, circular_contigs, pseudo=False): - """ reads a gbff file and fills Organism, Contig and Genes objects based on information contained in this file """ +def read_org_gbff(organism: str, gbff_file_path: str, circular_contigs: list, pseudo: bool = False) -> (Organism, bool): + """ + Read a GBFF file and fills Organism, Contig and Genes objects based on information contained in this file + + :param organism: Organism name + :param gbff_file_path: Path to corresponding GFF file + :param circular_contigs: list of contigs + :param pseudo: Allow to read 
pseudogène + + :return: Organism complete and true for sequence in file + """ org = Organism(organism) logging.getLogger().debug(f"Extracting genes informations from the given gbff {gbff_file_path.split('/')[-1]}") @@ -177,7 +210,7 @@ def read_org_gbff(organism, gbff_file_path, circular_contigs, pseudo=False): elif line[21:].startswith('/gene'): # gene name gene_name = line.split("=")[1].replace('"', '').strip() elif line[21:].startswith('/transl_table'): - genetic_code = line.split("=")[1].replace('"', '').strip() + genetic_code = int(line.split("=")[1].replace('"', '').strip()) elif line[21:].startswith('/product'): # need to loop as it can be more than one line long product = line.split('=')[1].replace('"', '').strip() if line.count('"') == 1: # then the product line is on multiple lines @@ -215,18 +248,27 @@ def read_org_gbff(organism, gbff_file_path, circular_contigs, pseudo=False): return org, True -def read_org_gff(organism, gff_file_path, circular_contigs, pseudo=False): +def read_org_gff(organism: str, gff_file_path: str, circular_contigs, pseudo: bool = False) -> (Organism, bool): + """ + Read annotation from GFF file + + :param organism: Organism name + :param gff_file_path: Path to corresponding GFF file + :param circular_contigs: list of contigs + :param pseudo: Allow to read pseudogène + + :return: Organism object and if there are sequences associate or not + """ (GFF_seqname, _, GFF_type, GFF_start, GFF_end, _, GFF_strand, _, GFF_attribute) = range(0, 9) # missing values : source, score, frame. They are unused. - def get_gff_attributes(gff_fields): + def get_gff_attributes(gff_fields: list) -> dict: """ - Parses the gff attribute's line and outputs the attributes_get in a dict structure. - :param gff_fields: a gff line stored as a list. Each element of the list is a column of the gff. - :type gff_fields: list - :return: attributes_get - :rtype: dict + Parses the gff attribute's line and outputs the attributes_get in a dict structure. 
+ :param gff_fields: a gff line stored as a list. Each element of the list is a column of the gff. + + :return: attributes get """ attributes_field = [f for f in gff_fields[GFF_attribute].strip().split(';') if len(f) > 0] attributes_get = {} @@ -238,21 +280,18 @@ def get_gff_attributes(gff_fields): pass # we assume that it is a strange, but useless field for our analysis return attributes_get - def get_id_attribute(attributes_dict): + def get_id_attribute(attributes_dict: dict) -> str: """ - Gets the ID of the element from which the provided attributes_get were extracted. - Raises an error if no ID is found. - :param attributes_dict: - :type attributes_dict: dict - :return: element_id: - :rtype: string + Gets the ID of the element from which the provided attributes_get were extracted. + Raises an error if no ID is found. + :param attributes_dict: attributes from one gff line + + :return: CDS identifier """ element_id = attributes_dict.get("ID") if not element_id: - logging.getLogger().error( - "Each CDS type of the gff files must own a unique ID attribute. Not the case for file: " + - gff_file_path) - exit(1) + raise Exception(f"Each CDS type of the gff files must own a unique ID attribute. 
" + f"Not the case for file: {gff_file_path}") return element_id contig = None # initialize contig @@ -305,9 +344,9 @@ def get_id_attribute(attributes_dict): product = "" try: - genetic_code = attributes.pop("TRANSL_TABLE") + genetic_code = int(attributes.pop("TRANSL_TABLE")) except KeyError: - genetic_code = "11" + genetic_code = 11 if contig is None or contig.name != fields_gff[GFF_seqname]: # get the current contig contig = org.get_or_add_contig(fields_gff[GFF_seqname], @@ -352,11 +391,26 @@ def get_id_attribute(attributes_dict): return org, has_fasta -def launch_read_anno(args): +def launch_read_anno(args: tuple) -> (Organism, bool): + """ Allow to launch in multiprocessing the read of genome annotation + + :param args: Pack of argument for annotate_organism function + + :return: Organism object for pangenome + """ return read_anno_file(*args) -def read_anno_file(organism_name, filename, circular_contigs, pseudo): +def read_anno_file(organism_name: str, filename: str, circular_contigs: list, pseudo: bool = False) -> (Organism, bool): + """ + Read a GBFF file for one organism + + :param organism_name: Name of the organism + :param filename: Path to the corresponding file + :param circular_contigs: list of sequence in contig + :param pseudo: allow to read pseudogène + :return: + """ filetype = detect_filetype(filename) if filetype == "gff": try: @@ -368,9 +422,9 @@ def read_anno_file(organism_name, filename, circular_contigs, pseudo): return read_org_gbff(organism_name, filename, circular_contigs, pseudo) except Exception: raise Exception(f"Reading the gbff file '{filename}' raised an error.") - else: - raise Exception( - "Wrong file type provided. This looks like a fasta file. You may be able to use --fasta instead.") + else: # Fasta type obligatory because unknow raise an error in detect_filetype function + raise Exception("Wrong file type provided. This looks like a fasta file. 
" + "You may be able to use --fasta instead.") def chose_gene_identifiers(pangenome): @@ -396,7 +450,17 @@ def chose_gene_identifiers(pangenome): return True -def read_annotations(pangenome, organisms_file, cpu, pseudo=False, disable_bar=False): +def read_annotations(pangenome: Pangenome, organisms_file: str, cpu: int = 1, pseudo: bool = False, + disable_bar: bool = False): + """ + Read the annotation from GBFF file + + :param pangenome: pangenome object + :param organisms_file: List of GBFF files for each organism + :param cpu: number of CPU cores to use + :param pseudo: + :param disable_bar: Disable the progresse bar + """ logging.getLogger().info("Reading " + organisms_file + " the list of organism files ...") pangenome.status["geneSequences"] = "Computed" @@ -408,20 +472,18 @@ def read_annotations(pangenome, organisms_file, cpu, pseudo=False, disable_bar=F if len(elements) <= 1: raise Exception(f"No tabulation separator found in given --fasta file: '{organisms_file}'") args.append((elements[0], elements[1], elements[2:], pseudo)) - bar = tqdm(range(len(args)), unit="file", disable=disable_bar) with get_context('fork').Pool(cpu) as p: - for org, flag in p.imap_unordered(launch_read_anno, args): + for org, flag in tqdm(p.imap_unordered(launch_read_anno, args), unit="file", total=len(args), + disable=disable_bar): pangenome.add_organism(org) if not flag: pangenome.status["geneSequences"] = "No" - bar.update() - bar.close() # decide whether we use local ids or ppanggolin ids. 
used_local_identifiers = chose_gene_identifiers(pangenome) if used_local_identifiers: - logging.getLogger().info( - "gene identifiers used in the provided annotation files were unique, PPanGGOLiN will use them.") + logging.getLogger().info("gene identifiers used in the provided annotation files were unique, " + "PPanGGOLiN will use them.") else: logging.getLogger().info("gene identifiers used in the provided annotation files were not unique, " "PPanGGOLiN will use self-generated identifiers.") @@ -434,6 +496,12 @@ def read_annotations(pangenome, organisms_file, cpu, pseudo=False, disable_bar=F def get_gene_sequences_from_fastas(pangenome, fasta_file): + """ + + :param pangenome: + :param fasta_file: + :return: + """ fasta_dict = {} for line in read_compressed_or_not(fasta_file): elements = [el.strip() for el in line.split("\t")] @@ -469,33 +537,52 @@ def get_gene_sequences_from_fastas(pangenome, fasta_file): pangenome.status["geneSequences"] = "Computed" -def launch_annotate_organism(pack): +def launch_annotate_organism(pack: tuple) -> Organism: + """ Allow to launch in multiprocessing the genome annotation + + :param pack: Pack of argument for annotate_organism function + + :return: Organism object for pangenome + """ return annotate_organism(*pack) -def annotate_pangenome(pangenome, fasta_list, tmpdir, cpu, translation_table="11", kingdom="bacteria", norna=False, - overlap=True, contig_filter=1, disable_bar=False): +def annotate_pangenome(pangenome: Pangenome, fasta_list: str, tmpdir: str, cpu: int = 1, translation_table: int = 11, + kingdom: str = "bacteria", norna: bool = False, overlap: bool = True, contig_filter: int = 1, + disable_bar: bool = False): + """ + Main function to annotate a pangenome + + :param pangenome: Pangenome with gene families to align with the given input sequences + :param fasta_list: List of fasta file containing sequences that will be base of pangenome + :param tmpdir: Path to temporary directory + :param cpu: number of CPU cores to 
use + :param translation_table: Translation table (genetic code) to use. + :param kingdom: Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation. + :param norna: Use to avoid annotating RNA features. + :param overlap: Use to not remove genes overlapping with RNA features + :param contig_filter: Filter the contig size + :param disable_bar: Disable the progresse bar + :return: + """ logging.getLogger().info(f"Reading {fasta_list} the list of organism files") - arguments = [] + arguments = [] # Argument given to annotate organism in same order than prototype for line in read_compressed_or_not(fasta_list): elements = [el.strip() for el in line.split("\t")] if len(elements) <= 1: - logging.getLogger().error("No tabulation separator found in organisms file") - exit(1) - arguments.append((elements[0], elements[1], elements[2:], translation_table, kingdom, norna, - tmpdir, overlap, contig_filter)) + raise Exception("No tabulation separator found in organisms file") + arguments.append((elements[0], elements[1], elements[2:], tmpdir, translation_table, + norna, kingdom, overlap, contig_filter)) if len(arguments) == 0: raise Exception("There are no genomes in the provided file") logging.getLogger().info(f"Annotating {len(arguments)} genomes using {cpu} cpus...") with get_context('fork').Pool(processes=cpu) as p: - bar = tqdm(range(len(arguments)), unit="genome", disable=disable_bar) - for organism in p.imap_unordered(launch_annotate_organism, arguments): - bar.update() + for organism in tqdm(p.imap_unordered(launch_annotate_organism, arguments), unit="genome", + total=len(arguments), disable=disable_bar): pangenome.add_organism(organism) p.close() p.join() - bar.close() logging.getLogger().info("Done annotating genomes") pangenome.status["genomesAnnotated"] = "Computed" # the pangenome is now annotated. 
@@ -509,7 +596,12 @@ def annotate_pangenome(pangenome, fasta_list, tmpdir, cpu, translation_table="11 pangenome.parameters["annotation"]["read_annotations_from_file"] = False -def launch(args): +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ if not any([args.fasta, args.anno]): raise Exception("At least one of --fasta or --anno must be given") filename = mk_file_name(args.basename, args.output, args.force) @@ -532,13 +624,25 @@ def launch(args): write_pangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar) -def subparser(sub_parser): +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in Command line + + :param sub_parser : sub_parser for align command + + :return : parser arguments for align command + """ parser = sub_parser.add_parser("annotate", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser_annot(parser) return parser -def parser_annot(parser): +def parser_annot(parser: argparse.ArgumentParser): + """ + Parser for specific argument of annotate command + + :param parser: parser for align argument + """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") required.add_argument('--fasta', required=False, type=str, @@ -562,7 +666,7 @@ def parser_annot(parser): choices=["bacteria", "archaea"], help="Kingdom to which the prokaryota belongs to, " "to know which models to use for rRNA annotation.") - optional.add_argument("--translation_table", required=False, default="11", + optional.add_argument("--translation_table", required=False, type=int, default=11, help="Translation table (genetic code) to use.") optional.add_argument("--basename", required=False, default="pangenome", help="basename for the output file") optional.add_argument("--use_pseudo", required=False, action="store_true", diff --git 
a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 2b8a28e6..ec25805f 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -2,33 +2,45 @@ # coding:utf-8 # default libraries +import logging import os import tempfile +from io import TextIOWrapper from subprocess import Popen, PIPE import ast from collections import defaultdict +from typing import Union # local libraries from ppanggolin.genome import Organism, Gene, RNA from ppanggolin.utils import is_compressed, read_compressed_or_not -def reverse_complement(seq): - """ reverse complement the given dna sequence """ +def reverse_complement(seq: str): + """reverse complement the given dna sequence + + :param seq: sequence which need to be reversed + + :return: reverse sequence + """ + complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N', 'R': 'Y', 'Y': 'R', 'S': 'S', 'W': 'W', 'K': 'M', 'M': 'K', 'B': 'V', 'V': 'B', 'D': 'H', 'H': 'D'} # see https://www.bioinformatics.org/sms/iupac.html for the code. - # complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N' } ## basic rcseq = "" for i in reversed(seq): rcseq += complement[i] return rcseq -def launch_aragorn(fna_file, org): +def launch_aragorn(fna_file: str, org: Organism) -> defaultdict: """ - launches Aragorn to annotate tRNAs. Takes a fna file name and a locustag to give an ID to the found genes. - returns the annotated genes in a list of gene objects. + Launches Aragorn to annotate tRNAs. 
+ + :param fna_file: file-like object containing the uncompressed fasta sequences + :param org: Organism which will be annotated + + :return: Annotated genes in a list of gene objects """ locustag = org.name cmd = ["aragorn", "-t", "-gcbact", "-l", "-w", fna_file] @@ -48,19 +60,24 @@ def launch_aragorn(fna_file, org): start, stop = ast.literal_eval(line_data[2].replace("c", "")) c += 1 gene = RNA(identifier=locustag + '_tRNA_' + str(c).zfill(3)) - gene.fill_annotations(start=start, stop=stop, strand="-" if line_data[2].startswith( - "c") else "+", gene_type="tRNA", product=line_data[1] + line_data[4]) + gene.fill_annotations(start=start, stop=stop, gene_type="tRNA", product=line_data[1] + line_data[4], + strand="-" if line_data[2].startswith("c") else "+",) gene_objs[header].add(gene) return gene_objs -def launch_prodigal(fna_file, org, code): +def launch_prodigal(fna_file: str, org: Organism, code: int = 11) -> defaultdict: """ - launches Prodigal to annotate CDS. Takes a fna file name and a locustag to give an ID to the found genes. - returns the annotated genes in a list of gene objects. + Launches Prodigal to annotate CDS. Takes a fna file name and a locustag to give an ID to the found genes. + + :param fna_file: file-like object containing the uncompressed fasta sequences + :param org: Organism which will be annotated + :param code: Translation table (genetic code) to use. 
+ + :return: Annotated genes in a list of gene objects """ locustag = org.name - cmd = ["prodigal", "-f", "sco", "-g", code, "-m", "-c", "-i", fna_file, "-p", "single", "-q"] + cmd = list(map(str, ["prodigal", "-f", "sco", "-g", code, "-m", "-c", "-i", fna_file, "-p", "single", "-q"])) p = Popen(cmd, stdout=PIPE) gene_objs = defaultdict(set) @@ -71,7 +88,6 @@ def launch_prodigal(fna_file, org, code): for data in line.split(";"): if data.startswith("seqhdr"): header = data.split("=")[1].replace('"', "").split()[0] - # print(header) elif line.startswith(">"): c += 1 @@ -84,11 +100,16 @@ def launch_prodigal(fna_file, org, code): return gene_objs -def launch_infernal(fna_file, org, kingdom, tmpdir): +def launch_infernal(fna_file: str, org: Organism, tmpdir: str, kingdom: str = "bacteria") -> defaultdict: """ - launches Infernal in hmmer-only mode to annotate rRNAs. - Takes a fna file name and a locustag to give an ID to the found genes. - returns the annotated genes in a list of gene objects. + Launches Infernal in hmmer-only mode to annotate rRNAs. + + :param fna_file: file-like object containing the uncompressed fasta sequences + :param org: Organism which will be annotated + :param kingdom: Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation. + :param tmpdir: Path to temporary directory + + :return: Annotated genes in a list of gene objects. """ locustag = org.name modelfile = "" @@ -131,9 +152,14 @@ def launch_infernal(fna_file, org, kingdom, tmpdir): return gene_objs -def read_fasta(org, fna_file, contig_filter=1): - """ - Reads a fna file (or stream, or string) and stores it in a dictionary with contigs as key and sequence as value. +def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list], contig_filter: int = 1) -> dict: + """ Reads a fna file (or stream, or string) and stores it in a dictionary with contigs as key and sequence as value. 
+ + :param org: Organism corresponding to fasta file + :param fna_file: Input fasta file with sequences or list of each line as sequence + :param contig_filter: Filter the contig by size + + :return: Dictionnary with contig_name as keys and contig sequence in values """ try: contigs = {} @@ -148,23 +174,29 @@ def read_fasta(org, fna_file, contig_filter=1): contig = org.get_or_add_contig(line.split()[0][1:]) else: contig_seq += line.strip() - # processing the last contig - if len(contig_seq) >= contig_filter: + if len(contig_seq) >= contig_filter: # processing the last contig contigs[contig.name] = contig_seq.upper() except AttributeError as e: raise AttributeError(f"{e}\nAn error was raised when reading file: '{fna_file.name}'. " f"One possibility for this error is that the file did not start with a '>' " f"as it would be expected from a fna file.") + except Exception: # To manage other exception which can occur + raise Exception("Unexpected error. Please check your input file and if everything looks fine, " + "please post an issue on our github") return contigs -def write_tmp_fasta(contigs, tmpdir): +def write_tmp_fasta(contigs: dict, tmpdir: str) -> tempfile._TemporaryFileWrapper: """ - Writes a temporary fna formated file, and returns the file-like object. - This is for the cases where the given file is compressed, - then we write a temporary file for the annotation tools to read from. - The file will be deleted when close() is called. + Writes a temporary fna formated file and returns the file-like object. Useful in case of compressed input file. + The file will be deleted when close() is called. 
+ + :param contigs: Contigs sequences of each contig + :param tmpdir: path to temporary directory + + :return: fasta file """ + tmp_file = tempfile.NamedTemporaryFile(mode="w", dir=tmpdir) for header in contigs.keys(): tmp_file.write(f">{header}\n") @@ -176,32 +208,44 @@ def write_tmp_fasta(contigs, tmpdir): return tmp_file -def syntaxic_annotation(org, fasta_file, norna, kingdom, code, tmpdir): +def syntaxic_annotation(org: Organism, fasta_file: TextIOWrapper, tmpdir: str, norna: bool = False, + kingdom: str = "bacteria", code: int = 11) -> defaultdict: """ - Runs the different software for the syntaxic annotation. + Runs the different software for the syntaxic annotation. - Takes in the file-like object containing the uncompressed fasta sequences to annotate - the number of cpus that we can use. - whether to annotate rna or not - the locustag to give gene IDs. + :param org: Organism which will be annotated + :param fasta_file: file-like object containing the uncompressed fasta sequences + :param tmpdir: Path to temporary directory + :param norna: Use to avoid annotating RNA features. + :param kingdom: Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation. + :param code: Translation table (genetic code) to use. 
+ + :return: list of genes in the organism """ + # launching tools for syntaxic annotation genes = defaultdict(list) - for key, items in launch_prodigal(fasta_file.name, org, code).items(): + for key, items in launch_prodigal(fna_file=fasta_file.name, org=org, code=code).items(): genes[key].extend(items) if not norna: - for key, items in launch_aragorn(fasta_file.name, org).items(): + for key, items in launch_aragorn(fna_file=fasta_file.name, org=org).items(): genes[key].extend(items) - for key, items in launch_infernal(fasta_file.name, org, kingdom, tmpdir).items(): + for key, items in launch_infernal(fna_file=fasta_file.name, org=org, kingdom=kingdom, tmpdir=tmpdir).items(): genes[key].extend(items) fasta_file.close() # closing either tmp file or original fasta file. return genes -def overlap_filter(all_genes, overlap): +def overlap_filter(all_genes: defaultdict, overlap: bool = True) -> defaultdict: """ - Removes the CDS that overlap with RNA genes. + Removes the CDS that overlap with RNA genes. 
+ + :param all_genes: Dictionary with complete list of genes + :param overlap: + + :return: Dictionary with genes filtered """ + sorted_genes = defaultdict(list) for key, genes in all_genes.items(): tmp_genes = sorted(genes, key=lambda x: x.start) @@ -226,25 +270,46 @@ def overlap_filter(all_genes, overlap): return sorted_genes -def get_dna_sequence(contig_seq, gene): +def get_dna_sequence(contig_seq: str, gene: Gene) -> str: + """ + Return the gene sequence + + :param contig_seq: Contig sequence + :param gene: Gene + + :return: str + """ if gene.strand == "+": return contig_seq[gene.start - 1:gene.stop] elif gene.strand == "-": return reverse_complement(contig_seq[gene.start - 1:gene.stop]) -def annotate_organism(org_name, file_name, circular_contigs, code, kingdom, norna, tmpdir, overlap, contig_filter): +def annotate_organism(org_name: str, file_name: str, circular_contigs, tmpdir: str, code: int = 11, norna: bool = False, + kingdom: str = "bacteria", overlap: bool = True, contig_filter: int = 1) -> Organism: """ - Function to annotate a single organism + Function to annotate a single organism + + :param org_name: Name of the organism / genome + :param file_name: Path to the fasta file containing organism sequences + :param circular_contigs: + :param code: Translation table (genetic code) to use. + :param kingdom: Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation. + :param norna: Use to avoid annotating RNA features. 
+ :param tmpdir: Path to temporary directory + :param overlap: Use to not remove genes overlapping with RNA features + :param contig_filter: + + :return: Complete organism object for pangenome """ org = Organism(org_name) fasta_file = read_compressed_or_not(file_name) contig_sequences = read_fasta(org, fasta_file, contig_filter) - if is_compressed(file_name): + if is_compressed(file_name): # TODO simply copy file with shutil.copyfileobj fasta_file = write_tmp_fasta(contig_sequences, tmpdir) - genes = syntaxic_annotation(org, fasta_file, norna, kingdom, code, tmpdir) + genes = syntaxic_annotation(org, fasta_file, tmpdir, norna, kingdom, code) genes = overlap_filter(genes, overlap) for contigName, genes in genes.items(): diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index 2e4a42e2..a0949197 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -9,6 +9,7 @@ import argparse from io import TextIOWrapper from pathlib import Path +from typing import TextIO, Union, BinaryIO import pkg_resources from numpy import repeat @@ -102,10 +103,14 @@ def jaccard_similarities(mat, jaccard_similarity_th): return similarities -def read_compressed_or_not(file_or_file_path): +def read_compressed_or_not(file_or_file_path: Union[str, BinaryIO, TextIOWrapper, TextIO]) -> Union[TextIOWrapper, + BinaryIO, TextIO]: """ - reads a file object or file path, uncompresses it, if need be. - returns a TextIO object in read only. + Reads a file object or file path, uncompresses it, if need be. 
+ + :param file_or_file_path: Path to the input file + + :return: TextIO object in read only """ file = file_or_file_path if isinstance(file, str): @@ -135,8 +140,11 @@ def write_compressed_or_not(file_path, compress): def is_compressed(file_or_file_path): - """ - Checks is a file, or file path given is compressed or not + """ Checks is a file, or file path given is compressed or not + + :param file_or_file_path: + + :return: """ file = file_or_file_path if isinstance(file, str): From 0950bde5bfb83c10771ab120144d5b3659acba76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 5 May 2022 16:18:40 +0200 Subject: [PATCH 03/20] Add docstring and few refactoring --- VERSION | 3 +- ppanggolin/align/alignOnPang.py | 4 +- ppanggolin/annotate/annotate.py | 5 +- ppanggolin/annotate/synta.py | 2 + ppanggolin/cluster/cluster.py | 376 ++++++++++++++++-------- ppanggolin/context/searchGeneContext.py | 139 +++------ ppanggolin/figures/draw_spot.py | 174 +++++++++-- ppanggolin/figures/drawing.py | 26 +- ppanggolin/figures/tile_plot.py | 12 +- ppanggolin/figures/ucurve.py | 11 +- ppanggolin/region.py | 42 +++ ppanggolin/workflow/all.py | 8 +- ppanggolin/workflow/workflow.py | 8 +- 13 files changed, 536 insertions(+), 274 deletions(-) diff --git a/VERSION b/VERSION index 8be93473..15caaaae 100644 --- a/VERSION +++ b/VERSION @@ -1,2 +1 @@ -1.2.95 -..1 +1.2.96 diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index ed402e1d..4b38cee2 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -214,7 +214,7 @@ def draw_spot_gexf(spots: set, output: str, multigenics: set, fam_to_mod: dict, """ for spot in spots: fname = output + "/spot_" + str(spot.ID) + ".gexf" - subgraph(spot, fname, set_size=set_size, multigenics=multigenics, fam2mod=fam_to_mod) + subgraph(spot, fname, set_size=set_size, multigenics=multigenics, fam_to_mod=fam_to_mod) def get_seq_info(seq_to_pang: dict, pangenome: Pangenome, output: 
str, draw_related: bool = False, disable_bar=False): @@ -309,7 +309,7 @@ def align(pangenome: Pangenome, sequence_file: str, output: str, tmpdir: str, id :param output: Path of the output directory :param tmpdir: Temporary directory :param identity: minimal identity threshold for the alignment - :param coverage: minimal identity threshold for the alignment + :param coverage: minimal coverage threshold for the alignment :param no_defrag: do not use the defrag workflow if true :param cpu: number of CPU cores to use :param getinfo: Extract info related to the best hit of each query, such as the RGP it is in, or the spots. diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index 0c3b7f06..e4f9103c 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -409,7 +409,8 @@ def read_anno_file(organism_name: str, filename: str, circular_contigs: list, ps :param filename: Path to the corresponding file :param circular_contigs: list of sequence in contig :param pseudo: allow to read pseudogène - :return: + + :return: Annotated organism for pangenome """ filetype = detect_filetype(filename) if filetype == "gff": @@ -460,7 +461,7 @@ def read_annotations(pangenome: Pangenome, organisms_file: str, cpu: int = 1, ps :param pangenome: pangenome object :param organisms_file: List of GBFF files for each organism :param cpu: number of CPU cores to use - :param pseudo: + :param pseudo: allow to read pseudogène :param disable_bar: Disable the progresse bar """ logging.getLogger().info("Reading " + organisms_file + " the list of organism files ...") diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index 0049fcd3..f753aadb 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -179,6 +179,7 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list], contig_filte contig_seq += line.strip() if len(contig_seq) >= contig_filter: # processing the last contig contigs[contig.name] 
= contig_seq.upper() + all_contig_len += len(contig_seq) except AttributeError as e: raise AttributeError(f"{e}\nAn error was raised when reading file: '{fna_file.name}'. " f"One possibility for this error is that the file did not start with a '>' " @@ -316,6 +317,7 @@ def annotate_organism(org_name: str, file_name: str, circular_contigs, tmpdir: s if is_compressed(file_name): # TODO simply copy file with shutil.copyfileobj fasta_file = write_tmp_fasta(contig_sequences, tmpdir) if procedure is None: # prodigal procedure is not force by user + logging.getLogger().debug(all_contig_len) if all_contig_len < 20000: # case of short sequence procedure = "meta" else: diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index 8d6fc23e..ec82efbe 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -8,6 +8,7 @@ from collections import defaultdict import os import argparse +from typing import io # installed libraries from networkx import Graph @@ -22,63 +23,108 @@ from ppanggolin.formats.writeSequences import write_gene_sequences_from_annotations -def align_rep(faa_file, tmpdir, cpu, coverage, identity): - seqdb = tmpdir.name + '/rep_sequence_db' - cmd = ["mmseqs", "createdb", faa_file, seqdb] - logging.getLogger().debug(" ".join(cmd)) - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - alndb = tmpdir.name + '/rep_alignment_db' - cmd = ["mmseqs", "search", seqdb, seqdb, alndb, tmpdir.name, "-a", "--min-seq-id", str(identity), "-c", - str(coverage), "--cov-mode", "1", "--threads", str(cpu)] - logging.getLogger().debug(" ".join(cmd)) - logging.getLogger().info("Aligning cluster representatives...") - subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - outfile = tmpdir.name + '/rep_families.tsv' - cmd = ["mmseqs", "convertalis", seqdb, seqdb, alndb, outfile, "--format-output", "query,target,qlen,tlen,bits"] - logging.getLogger().debug(" ".join(cmd)) - logging.getLogger().info("Extracting alignments...") - 
subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) - return outfile +# Global functions +def check_pangenome_former_clustering(pangenome: Pangenome, force: bool = False): + """ + Checks pangenome status and .h5 files for former clusterings, delete them if allowed or raise an error + + :param pangenome: Annotated Pangenome + :param force: Force to write on existing pangenome information + """ + if pangenome.status["genesClustered"] == "inFile" and not force: + raise Exception("You are trying to cluster genes that are already clustered together. If you REALLY want to " + "do that, use --force (it will erase everything except annotation data in your HDF5 file!)") + elif pangenome.status["genesClustered"] == "inFile" and force: + erase_pangenome(pangenome, gene_families=True) -def first_clustering(sequences, tmpdir, cpu, code, coverage, identity, mode): +# Clustering functions +def check_pangenome_for_clustering(pangenome: Pangenome, tmp_file: io.TextIO, force: bool = False, + disable_bar: bool = False): + """ + Check the pangenome statuses and write the gene sequences in the provided tmpFile. + (whether they are written in the .h5 file or currently in memory) + + :param pangenome: Annotated Pangenome + :param tmp_file: Temporary file + :param force: Force to write on existing pangenome information + :param disable_bar: Allow to disable progress bar + """ + check_pangenome_former_clustering(pangenome, force) + if pangenome.status["geneSequences"] in ["Computed", "Loaded"]: + # we append the gene ids by 'ppanggolin' to avoid crashes from mmseqs when sequence IDs are only numeric. + write_gene_sequences_from_annotations(pangenome, tmp_file, add="ppanggolin_", disable_bar=disable_bar) + elif pangenome.status["geneSequences"] == "inFile": + get_gene_sequences_from_file(pangenome.file, tmp_file, add="ppanggolin_", + disable_bar=disable_bar) # write CDS sequences to the tmpFile + else: + tmp_file.close() # closing the tmp file since an exception will be raised. 
+ raise Exception("The pangenome does not include gene sequences, thus it is impossible to cluster " + "the genes in gene families. Either provide clustering results (see --clusters), " + "or provide a way to access the gene sequence during the annotation step " + "(having the fasta in the gff files, or providing the fasta files through the --fasta option)") + + +def first_clustering(sequences: io.TextIO, tmpdir: tempfile.TemporaryDirectory, cpu: int = 1, code: int = 11, + coverage: float = 0.8, identity: float = 0.8, mode: int = 1) -> (str, str): + """ + Make a first clustering of all sequences in pangenome + + :param sequences: Sequence from pangenome + :param tmpdir: Temporary directory + :param cpu: number of CPU cores to use + :param code: Genetic code used + :param coverage: minimal coverage threshold for the alignment + :param identity: minimal identity threshold for the alignment + :param mode: MMseqs2 clustering mode + + :return: path to representative sequence file and path to tsv clustering result + """ seq_nucdb = tmpdir.name + '/nucleotid_sequences_db' - cmd = ["mmseqs", "createdb", sequences.name, seq_nucdb] + cmd = list(map(str, ["mmseqs", "createdb", sequences.name, seq_nucdb])) logging.getLogger().debug(" ".join(cmd)) logging.getLogger().info("Creating sequence database...") subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + logging.getLogger().debug("Translate sequence ...") seqdb = tmpdir.name + '/aa_db' - cmd = ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", str(cpu), "--translation-table", code] + cmd = list(map(str, ["mmseqs", "translatenucs", seq_nucdb, seqdb, "--threads", cpu, "--translation-table", code])) logging.getLogger().debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + logging.getLogger().info("Clustering sequences...") cludb = tmpdir.name + '/cluster_db' - cmd = ["mmseqs", "cluster", seqdb, cludb, tmpdir.name, "--cluster-mode", mode, "--min-seq-id", str(identity), "-c", - 
str(coverage), "--threads", str(cpu), "--kmer-per-seq", "80", "--max-seqs", "300"] + cmd = list(map(str, ["mmseqs", "cluster", seqdb, cludb, tmpdir.name, "--cluster-mode", mode, "--min-seq-id", + identity, "-c", coverage, "--threads", cpu, "--kmer-per-seq", 80, "--max-seqs", 300])) logging.getLogger().debug(" ".join(cmd)) - logging.getLogger().info("Clustering sequences...") subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) logging.getLogger().info("Extracting cluster representatives...") repdb = tmpdir.name + '/representative_db' - cmd = ["mmseqs", "result2repseq", seqdb, cludb, repdb] + cmd = list(map(str, ["mmseqs", "result2repseq", seqdb, cludb, repdb])) logging.getLogger().debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) reprfa = tmpdir.name + '/representative_sequences.fasta' cmd = ["mmseqs", "result2flat", seqdb, seqdb, repdb, reprfa, "--use-fasta-header"] logging.getLogger().debug(" ".join(cmd)) subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + logging.getLogger().info("Writing gene to family informations") outtsv = tmpdir.name + '/families_tsv' - cmd = ["mmseqs", "createtsv", seqdb, seqdb, cludb, outtsv, "--threads", str(cpu), "--full-header"] + cmd = list(map(str, ["mmseqs", "createtsv", seqdb, seqdb, cludb, outtsv, "--threads", cpu, "--full-header"])) logging.getLogger().debug(" ".join(cmd)) - logging.getLogger().info("Writing gene to family informations") subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) return reprfa, outtsv -def read_faa(faa_file_name): +def read_faa(faa_file_name: str) -> dict: + """ + Read a faa file to link pangenome families to sequences. 
+ + :param faa_file_name: path to the faa file + + :return: dictionary with families ID as key and sequence as value + """ fam2seq = {} head = "" - with open(faa_file_name, "r") as faFile: - for line in faFile: + with open(faa_file_name, "r") as faaFile: + for line in faaFile: if line.startswith('>'): head = line[1:].strip().replace("ppanggolin_", "") # remove the eventual addition else: @@ -86,14 +132,44 @@ def read_faa(faa_file_name): return fam2seq -def read_tsv(tsv_file_name): +def align_rep(faa_file: str, tmpdir: tempfile.TemporaryDirectory, cpu: int = 1, + coverage: float = 0.8, identity: float = 0.8) -> str: """ - reading tsv file + Align representative sequence + + :param faa_file: sequence of representative family + :param tmpdir: Temporary directory + :param cpu: number of CPU cores to use + :param coverage: minimal coverage threshold for the alignment + :param identity: minimal identity threshold for the alignment + + :return: Result of alignment + """ + logging.getLogger().debug("Create database") + seqdb = tmpdir.name + '/rep_sequence_db' + cmd = ["mmseqs", "createdb", faa_file, seqdb] + logging.getLogger().debug(" ".join(cmd)) + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + logging.getLogger().info("Aligning cluster representatives...") + alndb = tmpdir.name + '/rep_alignment_db' + cmd = list(map(str, ["mmseqs", "search", seqdb, seqdb, alndb, tmpdir.name, "-a", "--min-seq-id", identity, + "-c", coverage, "--cov-mode", 1, "--threads", cpu])) + logging.getLogger().debug(" ".join(cmd)) + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + logging.getLogger().info("Extracting alignments...") + outfile = tmpdir.name + '/rep_families.tsv' + cmd = ["mmseqs", "convertalis", seqdb, seqdb, alndb, outfile, "--format-output", "query,target,qlen,tlen,bits"] + logging.getLogger().debug(" ".join(cmd)) + subprocess.run(cmd, stdout=subprocess.DEVNULL, check=True) + return outfile + + +def read_tsv(tsv_file_name: str) -> (dict, dict): + 
"""Reading tsv file + :param tsv_file_name: path to the tsv - :type tsv_file_name: str :return: two dictionnary which link genes and families - :rtype: tuple(dict, dict) """ genes2fam = {} fam2genes = defaultdict(set) @@ -106,13 +182,23 @@ def read_tsv(tsv_file_name): return genes2fam, fam2genes -def refine_clustering(tsv, aln_file, fam_to_seq): +def refine_clustering(tsv: str, aln_file: str, fam_to_seq: dict) -> (dict, dict): + """ + Refine clustering by removing fragment + + :param tsv: First clusterin result + :param aln_file: Reprensentative alignment result + :param fam_to_seq: Dictionary which link families to sequence + + :return: Two dictionary which link genes and families + """ simgraph = Graph() genes2fam, fam2genes = read_tsv(tsv) logging.getLogger().info(f"Starting with {len(fam_to_seq)} families") # create the nodes for fam, genes in fam2genes.items(): simgraph.add_node(fam, nbgenes=len(genes)) + # add the edges with open(aln_file, "r") as alnfile: for line in alnfile: @@ -122,6 +208,7 @@ def refine_clustering(tsv, aln_file, fam_to_seq): simgraph.add_edge(line[0], line[1], score=float(line[4])) simgraph.nodes[line[0]]["length"] = int(line[2]) simgraph.nodes[line[1]]["length"] = int(line[3]) + for node, nodedata in simgraph.nodes(data=True): choice = (None, 0, 0, 0) for neighbor in simgraph.neighbors(node): @@ -131,10 +218,11 @@ def refine_clustering(tsv, aln_file, fam_to_seq): choice = (genes2fam[neighbor][0], nei["length"], nei["nbgenes"], score) # `genes2fam[neighbor]` instead of just neighbor in case that family has been assigned already # (this is for smaller fragments that are closer to other fragments than the actual gene family) + if choice[0] is not None: genestochange = fam2genes[node] for gene in genestochange: - genes2fam[gene] = (choice[0], True) + genes2fam[gene] = (str(choice[0]), True) fam2genes[choice[0]].add(gene) del fam2genes[node] new_fam_to_seq = {} @@ -144,19 +232,38 @@ def refine_clustering(tsv, aln_file, fam_to_seq): return 
genes2fam, new_fam_to_seq -def read_gene2fam(pangenome, gene2fam, disable_bar=False): - logging.getLogger().info(f"Adding {len(gene2fam)} genes to the gene families") +def read_fam2seq(pangenome: Pangenome, fam_to_seq: dict): + """ + Add gene family to pangenome and sequences to gene families + + :param pangenome: Annotated pangenome + :param fam_to_seq: Dictionary which link families and sequences + """ + logging.getLogger().info("Adding protein sequences to the gene families") + for family, protein in fam_to_seq.items(): + fam = pangenome.add_gene_family(family) + fam.add_sequence(protein) + + +def read_gene2fam(pangenome: Pangenome, gene_to_fam: dict, disable_bar: bool = False): + """ + Add gene to pangenome families + + :param pangenome: Annotated Pangenome + :param gene_to_fam: Dictionary which link gene to families + :param disable_bar: Allow to disable progress bar + """ + logging.getLogger().info(f"Adding {len(gene_to_fam)} genes to the gene families") link = True if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] else False if link: - if len(gene2fam) != len(pangenome.genes): # then maybe there are genes with identical IDs + if len(gene_to_fam) != len(pangenome.genes): # then maybe there are genes with identical IDs raise Exception("Something unexpected happened during clustering " "(have less genes clustered than genes in the pangenome). " "A probable reason is that two genes in two different organisms have the same IDs; " "If you are sure that all of your genes have non identical IDs, " "please post an issue at https://github.com/labgem/PPanGGOLiN/") - bar = tqdm(gene2fam.items(), unit="gene", disable=disable_bar) - for gene, (family, is_frag) in bar: + for gene, (family, is_frag) in tqdm(gene_to_fam.items(), unit="gene", total=len(gene_to_fam), disable=disable_bar): fam = pangenome.add_gene_family(family) if link: # doing the linking if the annotations are loaded. 
gene_obj = pangenome.get_gene(gene) @@ -164,69 +271,36 @@ def read_gene2fam(pangenome, gene2fam, disable_bar=False): gene_obj = Gene(gene) gene_obj.is_fragment = is_frag fam.add_gene(gene_obj) - bar.close() -def read_fam2seq(pangenome, fam2seq): - logging.getLogger().info("Adding protein sequences to the gene families") - for family, protein in fam2seq.items(): - fam = pangenome.add_gene_family(family) - fam.add_sequence(protein) - - -def check_pangenome_former_clustering(pangenome, force): - """ checks pangenome status and .h5 files for former clusterings, delete them if allowed or raise an error """ - if pangenome.status["genesClustered"] == "inFile" and not force: - raise Exception("You are trying to cluster genes that are already clustered together. If you REALLY want to " - "do that, use --force (it will erase everything except annotation data in your HDF5 file!)") - elif pangenome.status["genesClustered"] == "inFile" and force: - erase_pangenome(pangenome, gene_families=True) - - -def check_pangenome_for_clustering(pangenome, tmp_file, force, disable_bar=False): +def clustering(pangenome: Pangenome, tmpdir: str, cpu: int = 1, defrag: bool = True, code: int = 11, + coverage: float = 0.8, identity: float = 0.8, mode: int = 1, force: bool = False, + disable_bar: bool = False): """ - Check the pangenome statuses and write the gene sequences in the provided tmpFile. 
- (whether they are written in the .h5 file or currently in memory) + Main function to cluster pangenome gene sequences into families + + :param pangenome: Annoatated Pangenome + :param tmpdir: Path to temporary directory + :param cpu: number of CPU cores to use + :param defrag: Allow to remove fragment + :param code: Genetic code used + :param coverage: minimal coverage threshold for the alignment + :param identity: minimal identity threshold for the alignment + :param mode: MMseqs2 clustering mode + :param force: force to write in the pangenome + :param disable_bar: Allow to disable progress bar """ - check_pangenome_former_clustering(pangenome, force) - if pangenome.status["geneSequences"] in ["Computed", "Loaded"]: - # we append the gene ids by 'ppanggolin' to avoid crashes from mmseqs when sequence IDs are only numeric. - write_gene_sequences_from_annotations(pangenome, tmp_file, add="ppanggolin_", disable_bar=disable_bar) - elif pangenome.status["geneSequences"] == "inFile": - get_gene_sequences_from_file(pangenome.file, tmp_file, add="ppanggolin_", - disable_bar=disable_bar) # write CDS sequences to the tmpFile - else: - tmp_file.close() # closing the tmp file since an exception will be raised. - raise Exception("The pangenome does not include gene sequences, thus it is impossible to cluster " - "the genes in gene families. 
Either provide clustering results (see --clusters), " - "or provide a way to access the gene sequence during the annotation step " - "(having the fasta in the gff files, or providing the fasta files through the --fasta option)") - -def infer_singletons(pangenome): - """creates a new family for each gene with no associated family""" - singleton_counter = 0 - for gene in pangenome.genes: - if gene.family is None: - pangenome.add_gene_family(gene.ID).add_gene(gene) - singleton_counter += 1 - logging.getLogger().info(f"Inferred {singleton_counter} singleton families") - - -def clustering(pangenome, tmpdir, cpu, defrag=True, code="11", coverage=0.8, identity=0.8, mode="1", force=False, - disable_bar=False): newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir) - sequence_file = open(newtmpdir.name + '/nucleotid_sequences', "w") + with open(newtmpdir.name + '/nucleotid_sequences', "w") as sequence_file: + check_pangenome_for_clustering(pangenome, sequence_file, force, disable_bar=disable_bar) + logging.getLogger().info("Clustering all of the genes sequences...") + rep, tsv = first_clustering(sequence_file, newtmpdir, cpu, code, coverage, identity, mode) - check_pangenome_for_clustering(pangenome, sequence_file, force, disable_bar=disable_bar) - logging.getLogger().info("Clustering all of the genes sequences...") - rep, tsv = first_clustering(sequence_file, newtmpdir, cpu, code, coverage, identity, mode) - - sequence_file.close() fam2seq = read_faa(rep) if not defrag: logging.getLogger().debug("No defragmentation") - genes2fam = read_tsv(tsv)[0] + genes2fam, _ = read_tsv(tsv) else: logging.getLogger().info("Associating fragments to their original gene family...") aln = align_rep(rep, newtmpdir, cpu, coverage, identity) @@ -247,9 +321,13 @@ def clustering(pangenome, tmpdir, cpu, defrag=True, code="11", coverage=0.8, ide pangenome.parameters["cluster"]["read_clustering_from_file"] = False -def mk_local_to_gene(pangenome): - """ - Creates a dictionary that stores local 
identifiers, if all local identifiers are unique (and if they exist) +# Read clustering +def mk_local_to_gene(pangenome: Pangenome) -> dict: + """Creates a dictionary that stores local identifiers, if all local identifiers are unique (and if they exist) + + :param pangenome: Input Pangenome + + :return: Dictionary with local identifiers """ local_dict = {} for gene in pangenome.genes: @@ -267,10 +345,30 @@ def mk_local_to_gene(pangenome): return local_dict -def read_clustering(pangenome, families_tsv_file, infer_singleton=False, force=False, disable_bar=False): +def infer_singletons(pangenome: Pangenome): + """Creates a new family for each gene with no associated family + + :param pangenome: Input pangenome """ - Creates the pan, the gene families and the genes with an associated gene family. - Reads a families tsv file from mmseqs2 output and adds the gene families and the genes to the pan. + singleton_counter = 0 + for gene in pangenome.genes: + if gene.family is None: + pangenome.add_gene_family(gene.ID).add_gene(gene) + singleton_counter += 1 + logging.getLogger().info(f"Inferred {singleton_counter} singleton families") + + +def read_clustering(pangenome: Pangenome, families_tsv_file: str, infer_singleton: bool = False, force: bool = False, + disable_bar: bool = False): + """ + Get the pangenome information, the gene families and the genes with an associated gene family. + Reads a families tsv file from mmseqs2 output and adds the gene families and the genes to the pangenome. 
+ + :param pangenome: Input Pangenome + :param families_tsv_file: MMseqs2 clustering results + :param infer_singleton: creates a new family for each gene with no associated family + :param force: force to write in the pangenome + :param disable_bar: Allow to disable progress bar """ check_pangenome_former_clustering(pangenome, force) check_pangenome_info(pangenome, need_annotations=True, disable_bar=disable_bar) @@ -278,8 +376,7 @@ def read_clustering(pangenome, families_tsv_file, infer_singleton=False, force=F logging.getLogger().info("Reading " + families_tsv_file + " the gene families file ...") filesize = os.stat(families_tsv_file).st_size families_tsv_file = read_compressed_or_not(families_tsv_file) - frag = False - # the genome annotations are necessarily loaded. + frag = False # the genome annotations are necessarily loaded. nb_gene_with_fam = 0 local_dict = mk_local_to_gene(pangenome) bar = tqdm(total=filesize, unit="bytes", disable=disable_bar) @@ -291,7 +388,7 @@ def read_clustering(pangenome, families_tsv_file, infer_singleton=False, force=F elements = [el.strip() for el in line.split()] # 2 or 3 fields expected if len(elements) <= 1: raise ValueError("No tabulation separator found in gene families file") - (fam_id, gene_id, is_frag) = elements if len(elements) == 3 else elements + [None] + (fam_id, gene_id, is_frag) = elements if len(elements) == 3 else elements + ["Na"] # case of 2 fields try: gene_obj = pangenome.get_gene(gene_id) except KeyError: @@ -299,7 +396,7 @@ def read_clustering(pangenome, families_tsv_file, infer_singleton=False, force=F if gene_obj is not None: nb_gene_with_fam += 1 fam = pangenome.add_gene_family(fam_id) - gene_obj.is_fragment = True if is_frag == "F" else False + gene_obj.is_fragment = True if is_frag == "F" else False # F for Fragment fam.add_gene(gene_obj) if is_frag == "F": frag = True @@ -328,52 +425,75 @@ def read_clustering(pangenome, families_tsv_file, infer_singleton=False, force=F 
pangenome.parameters["cluster"]["infer_singletons"] = infer_singleton -def launch(args): - """ launch the clustering step""" +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ pangenome = Pangenome() pangenome.add_file(args.pangenome) if args.clusters is None: + if args.infer_singletons is not None: + logging.getLogger().warning("--infer_singletons option is not compatible with clustering creation. " + "To infer singleton you should give a clustering") clustering(pangenome, args.tmpdir, args.cpu, defrag=not args.no_defrag, code=args.translation_table, coverage=args.coverage, identity=args.identity, mode=args.mode, force=args.force, disable_bar=args.disable_prog_bar) logging.getLogger().info("Done with the clustering") else: + if None in [args.tmpdir, args.cpu, args.no_defrag, args.translation_table, + args.coverage, args.identity, args.mode]: + logging.getLogger().warning("You are using an option compatible only with clustering creation.") read_clustering(pangenome, args.clusters, args.infer_singletons, args.force, disable_bar=args.disable_prog_bar) logging.getLogger().info("Done reading the cluster file") write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) -def subparser(sub_parser): +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in Command line + + :param sub_parser : sub_parser for align command + + :return : parser arguments for align command + """ parser = sub_parser.add_parser("cluster", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser_clust(parser) return parser -def parser_clust(parser): +def parser_clust(parser: argparse.ArgumentParser): + """ + Parser for specific argument of cluster command + + :param parser: parser for align argument + """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") 
required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") - optional = parser.add_argument_group(title="Optional arguments") - optional.add_argument("--defrag", required=False, action="store_true", - help=argparse.SUPPRESS) # This ensures compatibility with the old option "defrag" - optional.add_argument('--no_defrag', required=False, default=False, action="store_true", - help="DO NOT Use the defragmentation strategy to link potential fragments " - "with their original gene family.") - optional.add_argument("--translation_table", required=False, default="11", - help="Translation table (genetic code) to use.") - optional.add_argument('--clusters', required=False, type=str, - help="A tab-separated list containing the result of a clustering. One line per gene. " - "First column is cluster ID, and second is gene ID") - optional.add_argument("--infer_singletons", required=False, action="store_true", - help="When reading a clustering result with --clusters, if a gene is not in the provided file" - " it will be placed in a cluster where the gene is the only member.") - optional.add_argument("--mode", required=False, default="1", choices=["0", "1", "2", "3"], - help="the cluster mode of MMseqs2. 
0: Setcover, 1: single linkage (or connected component)," - " 2: CD-HIT-like, 3: CD-HIT-like (lowmem)") - optional.add_argument("--coverage", required=False, type=restricted_float, default=0.8, - help="Minimal coverage of the alignment for two proteins to be in the same cluster") - optional.add_argument("--identity", required=False, type=restricted_float, default=0.8, - help="Minimal identity percent for two proteins to be in the same cluster") + clust = parser.add_argument_group(title="Clustering arguments") + clust.add_argument("--identity", required=False, type=restricted_float, default=0.8, + help="Minimal identity percent for two proteins to be in the same cluster") + clust.add_argument("--coverage", required=False, type=restricted_float, default=0.8, + help="Minimal coverage of the alignment for two proteins to be in the same cluster") + clust.add_argument("--mode", required=False, default="1", choices=["0", "1", "2", "3"], + help="the cluster mode of MMseqs2. 0: Setcover, 1: single linkage (or connected component)," + " 2: CD-HIT-like, 3: CD-HIT-like (lowmem)") + clust.add_argument('--no_defrag', required=False, default=False, action="store_true", + help="DO NOT Use the defragmentation strategy to link potential fragments " + "with their original gene family.") + clust.add_argument("--defrag", required=False, action="store_true", + help=argparse.SUPPRESS) # This ensures compatibility with the old option "defrag" + clust.add_argument("--translation_table", required=False, default="11", + help="Translation table (genetic code) to use.") + read = parser.add_argument_group(title="Read clustering arguments") + read.add_argument('--clusters', required=False, type=str, + help="A tab-separated list containing the result of a clustering. One line per gene. 
" + "First column is cluster ID, and second is gene ID") + read.add_argument("--infer_singletons", required=False, action="store_true", + help="When reading a clustering result with --clusters, if a gene is not in the provided file" + " it will be placed in a cluster where the gene is the only member.") if __name__ == '__main__': diff --git a/ppanggolin/context/searchGeneContext.py b/ppanggolin/context/searchGeneContext.py index 82c25f81..75e2350c 100644 --- a/ppanggolin/context/searchGeneContext.py +++ b/ppanggolin/context/searchGeneContext.py @@ -14,84 +14,32 @@ # local libraries from ppanggolin.formats import check_pangenome_info +from ppanggolin.genome import Gene, Contig from ppanggolin.utils import mk_outdir, restricted_float, add_gene, connected_components from ppanggolin.pangenome import Pangenome from ppanggolin.align.alignOnPang import get_seq2pang, project_partition -from ppanggolin.geneFamily import GeneFamily +from ppanggolin.region import GeneContext -class GeneContext: - """ - A class used to represent a gene context - - Attributes - ---------- - gc_id : int - ID of the Gene context - families : set - Gene families related to the GeneContext - - Methods - ------- - """ - - def __init__(self, gc_id, families=None): - """ Initial methods - - :param gc_id: ID of the GeneContext - :type gc_id: int - :param families: Gene families related to the GeneContext - :type families: set - """ - self.ID = gc_id - self.families = set() - if families is not None: - if not all(isinstance(fam, GeneFamily) for fam in families): - raise Exception(f"You provided elements that were not GeneFamily object." - f" GeneContext are only made of GeneFamily") - self.families |= set(families) - - def add_family(self, family): - """ - Allow to add one family in the GeneContext - :param family: family to add - :type family: GeneFamily - """ - if not isinstance(family, GeneFamily): - raise Exception("You did not provide a GenFamily object. 
Modules are only made of GeneFamily") - self.families.add(family) - - -def search_gene_context_in_pangenome(pangenome, output, tmpdir, sequences=None, families=None, transitive=4, - identity=0.5, coverage=0.8, jaccard=0.85, no_defrag=False, cpu=1, - disable_bar=True): +def search_gene_context_in_pangenome(pangenome: Pangenome, output: str, tmpdir: str, sequences: str = None, + families: str = None, transitive: int = 4, identity: float = 0.5, + coverage: float = 0.8, jaccard: float = 0.85, no_defrag: bool = False, + cpu: int = 1, disable_bar=True): """ Main function to search common gene contexts between sequence set and pangenome families :param pangenome: Pangenome containing GeneFamilies to align with sequence set - :type pangenome: Pangenome :param sequences: Path to file containing the sequences - :type sequences: str :param families: Path to file containing families name - :type families: str :param output: Path to output directory - :type output: str :param tmpdir: Path to temporary directory - :type tmpdir: str :param transitive: number of genes to check on both sides of a family aligned with an input sequence - :type transitive: int :param identity: minimum identity threshold between sequences and gene families for the alignment - :type identity: float :param coverage: minimum coverage threshold between sequences and gene families for the alignment - :type coverage: float :param jaccard: Jaccard index to filter edges in graph - :type jaccard: float :param no_defrag: do not use the defrag workflow if true - :type no_defrag: Boolean :param cpu: Number of core used to process - :type cpu: int :param disable_bar: Allow preventing bar progress print - :type disable_bar: Boolean """ # check statuses and load info @@ -141,20 +89,17 @@ def search_gene_context_in_pangenome(pangenome, output, tmpdir, sequences=None, logging.getLogger().info(f"Computing gene contexts took {round(time.time() - start_time, 2)} seconds") -def compute_gene_context_graph(families, t, 
disable_bar=False): +def compute_gene_context_graph(families: dict, t: int = 4, disable_bar: bool = False) -> nx.Graph: """ Construct the graph of gene contexts between families of the pan :param families: Gene families of interest - :type families: dict :param t: transitive value - :type t: int :param disable_bar: Prevents progress bar printing - :type disable_bar: Boolean :return: Graph of gene contexts between interesting gene families of the pan - :rtype: nx.Graph """ + g = nx.Graph() for family in tqdm(families.values(), unit="families", disable=disable_bar): for gene in family.genes: @@ -166,19 +111,16 @@ def compute_gene_context_graph(families, t, disable_bar=False): return g -def _compute_gene_context_graph(g, env_gene, contig, pos_r): +def _compute_gene_context_graph(g: nx.Graph, env_gene: Gene, contig: Contig, pos_r: int): """ Compute graph of gene contexts between one gene and the other part of the contig :param: Graph of gene contexts between interesting gene families of the pan - :type: nx.Graph :param env_gene: Gene of the current position - :type env_gene: Gene :param contig: Current contig to search a gene context - :type contig: Contig :param pos_r: Gene to search a gene context - :type pos_r: Gene """ + g.add_node(env_gene.family) add_gene(g.nodes[env_gene.family], env_gene, fam_split=False) pos = env_gene.position + 1 @@ -191,22 +133,18 @@ def _compute_gene_context_graph(g, env_gene, contig, pos_r): pos += 1 -def extract_gene_context(gene, contig, families, t=4): +def extract_gene_context(gene: Gene, contig: list, families: dict, t: int = 4) -> (int, bool, int, bool): """ Extract gene context and whether said gene context exists :param gene: Gene of interest - :type gene: Gene - :param contig: Gene's contig - :type contig: Contig + :param contig: list of genes in contig :param families: Alignment results - :param families: dict :param t: transitive value - :type t: int :return: Position of the context and if it exists for each side ('left' and 
'right') - :rtype: (int, Bool, int, Bool) """ + pos_left, pos_right = (max(0, gene.position - t), min(gene.position + t, len(contig) - 1)) # Gene positions to compare family in_context_left, in_context_right = (False, False) @@ -225,18 +163,16 @@ def extract_gene_context(gene, contig, families, t=4): return pos_left, in_context_left, pos_right, in_context_right -def compute_gene_context(g, jaccard=0.85): +def compute_gene_context(g: nx.Graph, jaccard: float = 0.85) -> set: """ Compute the gene contexts in the graph :param g: Graph of gene contexts between interesting gene families of the pan - :type g: nx.Graph :param jaccard: Jaccard index - :type jaccard: float :return: Set of gene contexts find in graph - :rtype: Set """ + gene_contexts = set() c = 1 for comp in connected_components(g, removed=set(), weight=jaccard): @@ -245,15 +181,16 @@ def compute_gene_context(g, jaccard=0.85): return gene_contexts -def fam2seq(seq_to_pan): +def fam2seq(seq_to_pan: dict) -> dict: """ Create a dictionary with gene families as keys and list of sequences id as values :param seq_to_pan: Dictionary storing the sequence ids as keys and the gene families - to which they are assigned as values + to which they are assigned as values + :return: Dictionary reversed - :rtype: dict """ + fam_2_seq = {} for sequence, family in seq_to_pan.items(): if family.ID in fam_2_seq.keys(): @@ -263,28 +200,25 @@ def fam2seq(seq_to_pan): return fam_2_seq -def export_to_dataframe(families, gene_contexts, fam_2_seq, output): +def export_to_dataframe(families: set, gene_contexts: set, fam_to_seq: dict, output: str): """ Export the results into dataFrame :param families: Families related to the connected components - :type families: set :param gene_contexts: connected components found in the pan - :type gene_contexts: set - :param fam_2_seq: Dictionary with gene families as keys and list of sequence ids as values - :type fam_2_seq: dict + :param fam_to_seq: Dictionary with gene families as keys and list 
of sequence ids as values :param output: output path - :type output: str """ + logging.getLogger().debug(f"There are {len(families)} families among {len(gene_contexts)} gene contexts") lines = [] for gene_context in gene_contexts: for family in gene_context.families: line = [gene_context.ID] - if fam_2_seq is None or fam_2_seq.get(family.ID) is None: + if fam_to_seq is None or fam_to_seq.get(family.ID) is None: line += [family.name, None, len(family.organisms), family.named_partition] else: - line += [family.name, ','.join(fam_2_seq.get(family.ID)), + line += [family.name, ','.join(fam_to_seq.get(family.ID)), len(family.organisms), family.named_partition] lines.append(line) df = pd.DataFrame(lines, @@ -295,9 +229,15 @@ def export_to_dataframe(families, gene_contexts, fam_2_seq, output): logging.getLogger(f"detected gene context(s) are listed in: '{output}/gene_contexts.tsv'") -def launch(args): +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ + if not any([args.sequences, args.family]): - raise Exception("At least one of --sequences or --family must be given") + raise Exception("At least one of --sequences or --family option must be given") mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) @@ -307,22 +247,27 @@ def launch(args): no_defrag=args.no_defrag, cpu=args.cpu, disable_bar=args.disable_prog_bar) -def subparser(sub_parser): +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: """ - Parser arguments specific to context command + Subparser to launch PPanGGOLiN in Command line :param sub_parser : sub_parser for align command - :type sub_parser : argparse._SubParsersAction :return : parser arguments for align command - :rtype : argparse.ArgumentParser """ + parser = sub_parser.add_parser("context", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser_context(parser) return parser -def parser_context(parser): +def 
parser_context(parser: argparse.ArgumentParser): + """ + Parser for specific argument of context command + + :param parser: parser for align argument + """ + required = parser.add_argument_group(title="Required arguments", description="All of the following arguments are required :") required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome.h5 file") diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index 53da1750..ee1754d9 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -9,6 +9,8 @@ from math import pi # local libraries +from ppanggolin.pangenome import Pangenome +from ppanggolin.region import Spot from ppanggolin.utils import jaccard_similarities from ppanggolin.formats import check_pangenome_info from ppanggolin.RGP.spot import comp_border @@ -20,10 +22,10 @@ import networkx as nx from tqdm import tqdm -from bokeh.plotting import ColumnDataSource, figure, save +from bokeh.plotting import ColumnDataSource, figure, save, Figure from bokeh.io import output_file from bokeh.layouts import column, row -from bokeh.models import WheelZoomTool, LabelSet, Slider, CustomJS, HoverTool, RadioGroup, Div +from bokeh.models import WheelZoomTool, LabelSet, Slider, CustomJS, HoverTool, RadioGroup, Div, Column, GlyphRenderer def check_predicted_spots(pangenome): @@ -33,8 +35,15 @@ def check_predicted_spots(pangenome): "Please see the 'spot' subcommand.") -def make_colors_for_iterable(it): - """randomly picks a color for all elements of a given iterable""" +def make_colors_for_iterable(it: set) -> dict: + """ + Randomly picks a color for all elements of a given iterable + + :param it: Iterable families or modules + + :return: Dictionary with for each element a random color associate + """ + famcol = {} for element in it: col = list(random.choices(range(256), k=3)) @@ -45,12 +54,29 @@ def make_colors_for_iterable(it): return famcol -def order_gene_lists(gene_lists, overlapping_match, 
exact_match, set_size): - gene_lists = line_order_gene_lists(gene_lists, overlapping_match, exact_match, set_size) +def order_gene_lists(gene_lists: list, overlapping_match: int, exact_match: int, set_size: int): + """ + Order all rgps the same way, and order them by similarity in gene content. + + :param gene_lists: List of genes in rgps + :param overlapping_match: Allowed number of missing persistent genes when comparing flanking genes + :param exact_match: Number of perfectly matching flanking single copy markers required to associate RGPs + :param set_size: Number of single copy markers to use as flanking genes for RGP + + :return: List of ordered genes + """ + line_order_gene_lists(gene_lists, overlapping_match, exact_match, set_size) return row_order_gene_lists(gene_lists) -def row_order_gene_lists(gene_lists): +def row_order_gene_lists(gene_lists: list) -> list: + """ + Row ordering of all rgps + + :param gene_lists: + + :return : An ordered genes list + """ fam_dict = defaultdict(set) for index, genelist in enumerate([genelist[0] for genelist in gene_lists]): @@ -76,7 +102,15 @@ def row_order_gene_lists(gene_lists): return new_gene_lists -def line_order_gene_lists(gene_lists, overlapping_match, exact_match, set_size): +def line_order_gene_lists(gene_lists: list, overlapping_match: int, exact_match: int, set_size: int): + """ + Line ordering of all rgps + + :param gene_lists: list + :param overlapping_match: Allowed number of missing persistent genes when comparing flanking genes + :param exact_match: Number of perfectly matching flanking single copy markers required to associate RGPs + :param set_size: Number of single copy markers to use as flanking genes for RGP + """ classified = {0} # first gene list has the right order new_classify = set() @@ -109,11 +143,20 @@ def line_order_gene_lists(gene_lists, overlapping_match, exact_match, set_size): classified |= new_classify # the newly classified will help to check the unclassified, # the formerly classified 
are not useful for what remains (if something remains) new_classify = set() - return gene_lists -def subgraph(spot, outname, with_border=True, set_size=3, multigenics=None, fam2mod=None): - """ write a pangeome subgraph of the gene families of a spot in gexf format""" +def subgraph(spot: Spot, outname: str, with_border: bool = True, set_size: int = 3, + multigenics: set = None, fam_to_mod: dict = None): + """ + Write a pangeome subgraph of the gene families of a spot in gexf format + + :param spot: + :param outname: + :param with_border: + :param set_size: + :param multigenics: + :param fam_to_mod: + """ g = nx.Graph() for rgp in spot.regions: @@ -128,8 +171,8 @@ def subgraph(spot, outname, with_border=True, set_size=3, multigenics=None, fam2 prev = None for gene in gene_list: g.add_node(gene.family.name, partition=gene.family.named_partition) - if fam2mod is not None: - curr_mod = fam2mod.get(gene.family) + if fam_to_mod is not None: + curr_mod = fam_to_mod.get(gene.family) if curr_mod is not None: g.nodes[gene.family.name]["module"] = curr_mod try: @@ -161,7 +204,14 @@ def subgraph(spot, outname, with_border=True, set_size=3, multigenics=None, fam2 nx.write_gexf(g, outname) -def mk_source_data(genelists, fam_col, fam_to_mod): +def mk_source_data(genelists: list, fam_col: dict, fam_to_mod: dict) -> (ColumnDataSource, list): + """ + + :param genelists: + :param fam_col: Dictionary with for each family the corresponding color + :param fam_to_mod: Dictionary with the correspondance modules families + :return: + """ partition_colors = {"shell": "#00D860", "persistent": "#F7A507", "cloud": "#79DEFF"} df = {'name': [], 'ordered': [], 'strand': [], "start": [], "stop": [], "length": [], 'module': [], @@ -251,11 +301,22 @@ def mk_source_data(genelists, fam_col, fam_to_mod): return ColumnDataSource(data=df), tooltips -def add_gene_tools(recs, source_data): - """ define tools to change the outline and fill colors of genes""" +def add_gene_tools(recs: GlyphRenderer, 
source_data: ColumnDataSource) -> Column: + """ + Define tools to change the outline and fill colors of genes + + :param recs: + :param source_data: + :return: + """ - def color_str(color_element): - """javascript code to switch between partition, family and module color for the given 'color_element'""" + def color_str(color_element: str) -> str: + """ Javascript code to switch between partition, family and module color for the given 'color_element' + + :param color_element: + + :return: Javascript code + """ return f""" if(this.active == 0){{ source.data['{color_element}'] = source.data['partition_color']; @@ -294,7 +355,13 @@ def color_str(color_element): gene_outline_size) -def add_gene_labels(fig, source_data): +def add_gene_labels(fig: Figure, source_data: ColumnDataSource) -> (Column, LabelSet): + """ + + :param fig: + :param source_data: + :return: + """ labels = LabelSet(x='x_label', y='y_label', text='label', source=source_data, render_mode='canvas', text_font_size="18px") slider_font = Slider(start=0, end=64, value=16, step=1, title="Gene label font size in px") @@ -341,7 +408,13 @@ def add_gene_labels(fig, source_data): return labels_block, labels -def mk_genomes(gene_lists, ordered_counts): +def mk_genomes(gene_lists: list, ordered_counts: list) -> (ColumnDataSource, list): + """ + + :param gene_lists: + :param ordered_counts: + :return: + """ df = {"name": [], "width": [], "occurrences": [], 'x': [], 'y': [], "x_label": []} for index, GeneList in enumerate(gene_lists): @@ -364,7 +437,19 @@ def mk_genomes(gene_lists, ordered_counts): return ColumnDataSource(data=df), tooltip -def add_genome_tools(fig, gene_recs, genome_recs, gene_source, genome_source, nb, gene_labels): +def add_genome_tools(fig: Figure, gene_recs: GlyphRenderer, genome_recs: GlyphRenderer, gene_source: ColumnDataSource, + genome_source: ColumnDataSource, nb: int, gene_labels: LabelSet): + """ + + :param fig: + :param gene_recs: + :param genome_recs: + :param gene_source: + :param 
genome_source: + :param nb: + :param gene_labels: + :return: + """ # add genome labels genome_labels = LabelSet(x='x_label', y='y', x_offset=-20, text='name', text_align="right", source=genome_source, render_mode='canvas', text_font_size="16px") @@ -411,9 +496,18 @@ def add_genome_tools(fig, gene_recs, genome_recs, gene_source, genome_source, nb return column(genome_header, slider_spacing, slider_font, slider_offset) -def draw_curr_spot(gene_lists, ordered_counts, fam_to_mod, fam_col, file_name): - # prepare the source data +def draw_curr_spot(gene_lists: list, ordered_counts: list, fam_to_mod: dict, fam_col: dict, file_name: str): + """ + + :param gene_lists: + :param ordered_counts: + :param fam_to_mod: + :param fam_col: Dictionnary with for each family the corresponding color + :param file_name: + :return: + """ + # Prepare the source data output_file(file_name + ".html") # generate the figure and add some tools to it @@ -448,18 +542,30 @@ def draw_curr_spot(gene_lists, ordered_counts, fam_to_mod, fam_col, file_name): save(column(fig, row(labels_tools, gene_tools), row(genome_tools))) -def draw_selected_spots(selected_spots, pangenome, output, overlapping_match, exact_match, set_size, disable_bar): +def draw_selected_spots(selected_spots: list, pangenome: Pangenome, output: str, overlapping_match: int, + exact_match: int, set_size: int, disable_bar: bool = False): + """ + Draw only the selected spot and give parameters + + :param selected_spots: List of the selected spot by user + :param pangenome: Pangenome containing spot + :param output: Path to output directory + :param overlapping_match: Allowed number of missing persistent genes when comparing flanking genes + :param exact_match: + :param set_size: + :param disable_bar: Allow preventing bar progress print + """ + logging.getLogger().info("Ordering genes among regions, and drawing spots...") multigenics = pangenome.get_multigenics(pangenome.parameters["RGP"]["dup_margin"]) - # bar = 
tqdm(range(len(selected_spots)), unit = "spot", disable = disable_bar) fam2mod = {} for mod in pangenome.modules: for fam in mod.families: fam2mod[fam] = f"module_{mod.ID}" - for spot in tqdm(selected_spots, unit="spot", disable=disable_bar): + for spot in tqdm(selected_spots, total=len(selected_spots), unit="spot", disable=disable_bar): fname = output + '/spot_' + str(spot.ID) @@ -507,11 +613,19 @@ def draw_selected_spots(selected_spots, pangenome, output, overlapping_match, ex ordered_counts.append(curr_genelist_count) draw_curr_spot(uniq_gene_lists, ordered_counts, fam2mod, famcolors, fname) - subgraph(spot, fname + ".gexf", set_size=set_size, multigenics=multigenics, fam2mod=fam2mod) + subgraph(spot, fname + ".gexf", set_size=set_size, multigenics=multigenics, fam_to_mod=fam2mod) logging.getLogger().info(f"Done drawing spot(s), they can be found in the directory: '{output}'") -def draw_spots(pangenome, output, spot_list, disable_bar): +def draw_spots(pangenome: Pangenome, output: str, spot_list: str, disable_bar: bool = False): + """ + Main function to draw spot + + :param pangenome: Pangenome with spot predicted + :param output: Path to output directory + :param spot_list: List of spot to draw separate by ',' + :param disable_bar: Allow to disable progress bar + """ # check that the pangenome has spots check_predicted_spots(pangenome) @@ -530,8 +644,8 @@ def draw_spots(pangenome, output, spot_list, disable_bar): else: selected_spots = [s for s in pangenome.spots if "spot_" + str(s.ID) in curated_spot_list] if len(selected_spots) < 10: - logging.getLogger().info( - f"Drawing the following spots: {','.join(['spot_' + str(s.ID) for s in selected_spots])}") + logging.getLogger().info(f"Drawing the following spots: " + f"{','.join(['spot_' + str(s.ID) for s in selected_spots])}") else: logging.getLogger().info(f"Drawing {len(selected_spots)} spots") diff --git a/ppanggolin/figures/drawing.py b/ppanggolin/figures/drawing.py index f8276687..fc13ff91 100644 --- 
a/ppanggolin/figures/drawing.py +++ b/ppanggolin/figures/drawing.py @@ -14,7 +14,13 @@ from ppanggolin.figures.ucurve import draw_ucurve -def launch(args): +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ + mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) @@ -26,13 +32,27 @@ def launch(args): draw_spots(pangenome=pangenome, output=args.output, spot_list=args.spots, disable_bar=args.disable_prog_bar) -def subparser(sub_parser): +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in Command line + + :param sub_parser : sub_parser for align command + + :return : parser arguments for align command + """ + parser = sub_parser.add_parser("draw", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser_draw(parser) return parser -def parser_draw(parser): +def parser_draw(parser: argparse.ArgumentParser): + """ + Parser for specific argument of draw command + + :param parser: parser for align argument + """ + required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome.h5 file") diff --git a/ppanggolin/figures/tile_plot.py b/ppanggolin/figures/tile_plot.py index d72538fb..2826391b 100644 --- a/ppanggolin/figures/tile_plot.py +++ b/ppanggolin/figures/tile_plot.py @@ -15,10 +15,20 @@ import colorlover as cl # local libraries from ppanggolin.formats import check_pangenome_info +from ppanggolin.pangenome import Pangenome from ppanggolin.utils import jaccard_similarities -def draw_tile_plot(pangenome, output, nocloud=False, disable_bar=False): +def draw_tile_plot(pangenome: Pangenome, output: str, nocloud: bool = False, disable_bar: bool = False): + """ + Draw a tile plot from a partitioned pangenome + + :param pangenome: Partitioned pangenome 
+ :param output: Path to output directory + :param nocloud: Do not draw the cloud partition + :param disable_bar: Allow to disable progress bar + """ + check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=True, disable_bar=disable_bar) if pangenome.status["partitioned"] == "No": raise Exception("Cannot draw the tile plot as your pangenome has not been partitioned") diff --git a/ppanggolin/figures/ucurve.py b/ppanggolin/figures/ucurve.py index b4173bc2..a95ea288 100644 --- a/ppanggolin/figures/ucurve.py +++ b/ppanggolin/figures/ucurve.py @@ -7,9 +7,18 @@ import plotly.offline as out_plotly # local libraries from ppanggolin.formats import check_pangenome_info +from ppanggolin.pangenome import Pangenome -def draw_ucurve(pangenome, output, soft_core=0.95, disable_bar=False): +def draw_ucurve(pangenome: Pangenome, output: str, soft_core: float = 0.95, disable_bar: bool = False): + """ + + :param pangenome: Partitioned pangenome + :param output: Path to output directory + :param soft_core: Soft core threshold to use + :param disable_bar: Allow to disable progress bar + :return: + """ check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=True, disable_bar=disable_bar) logging.getLogger().info("Drawing the U-shaped curve...") max_bar = 0 diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 53b1ee5e..4857dab8 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -314,3 +314,45 @@ def mk_bitarray(self, index, partition='all'): self.bitarray[index[fam]] = 1 else: raise Exception("There is not any partition corresponding please report a github issue") + + +class GeneContext: + """ + A class used to represent a gene context + + Attributes + ---------- + gc_id : int + ID of the Gene context + families : set + Gene families related to the GeneContext + + Methods + ------- + """ + + def __init__(self, gc_id, families=None): + """ Initial methods + + :param gc_id: ID of the GeneContext + :type 
gc_id: int + :param families: Gene families related to the GeneContext + :type families: set + """ + self.ID = gc_id + self.families = set() + if families is not None: + if not all(isinstance(fam, GeneFamily) for fam in families): + raise Exception(f"You provided elements that were not GeneFamily object." + f" GeneContext are only made of GeneFamily") + self.families |= set(families) + + def add_family(self, family): + """ + Allow to add one family in the GeneContext + :param family: family to add + :type family: GeneFamily + """ + if not isinstance(family, GeneFamily): + raise Exception("You did not provide a GenFamily object. Modules are only made of GeneFamily") + self.families.add(family) \ No newline at end of file diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index d53f7f38..d6ae6efa 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -52,8 +52,8 @@ def launch(args): read_clustering(pangenome, args.clusters, disable_bar=args.disable_prog_bar) elif args.clusters is None: # we should have the sequences here. 
- clustering(pangenome, args.tmpdir, args.cpu, identity=args.identity, coverage=args.coverage, mode=args.mode, - defrag=not args.no_defrag, disable_bar=args.disable_prog_bar) + clustering(pangenome, args.tmpdir, args.cpu, defrag=not args.no_defrag, coverage=args.coverage, + identity=args.identity, mode=args.mode, disable_bar=args.disable_prog_bar) clust_time = time.time() - start_clust elif args.fasta is not None: start_anno = time.time() @@ -64,8 +64,8 @@ def launch(args): write_pangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar) writing_time = time.time() - start_writing start_clust = time.time() - clustering(pangenome, args.tmpdir, args.cpu, identity=args.identity, coverage=args.coverage, mode=args.mode, - defrag=not args.no_defrag, disable_bar=args.disable_prog_bar) + clustering(pangenome, args.tmpdir, args.cpu, defrag=not args.no_defrag, coverage=args.coverage, + identity=args.identity, mode=args.mode, disable_bar=args.disable_prog_bar) clust_time = time.time() - start_clust write_pangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar) diff --git a/ppanggolin/workflow/workflow.py b/ppanggolin/workflow/workflow.py index 1ffb214b..de623936 100644 --- a/ppanggolin/workflow/workflow.py +++ b/ppanggolin/workflow/workflow.py @@ -42,15 +42,15 @@ def launch(args): read_clustering(pangenome, args.clusters, disable_bar=args.disable_prog_bar) elif args.clusters is None: # we should have the sequences here. 
- clustering(pangenome, tmpdir=args.tmpdir, cpu=args.cpu, identity=args.identity, coverage=args.coverage, - mode=args.mode, defrag=not args.no_defrag, disable_bar=args.disable_prog_bar) + clustering(pangenome, tmpdir=args.tmpdir, cpu=args.cpu, defrag=not args.no_defrag, coverage=args.coverage, + identity=args.identity, mode=args.mode, disable_bar=args.disable_prog_bar) elif args.fasta is not None: pangenome = Pangenome() annotate_pangenome(pangenome, args.fasta, args.tmpdir, args.cpu, contig_filter=args.contig_filter, disable_bar=args.disable_prog_bar) write_pangenome(pangenome, filename, args.force, disable_bar=args.disable_prog_bar) - clustering(pangenome, tmpdir=args.tmpdir, cpu=args.cpu, identity=args.identity, coverage=args.coverage, - mode=args.mode, defrag=not args.no_defrag, disable_bar=args.disable_prog_bar) + clustering(pangenome, tmpdir=args.tmpdir, cpu=args.cpu, defrag=not args.no_defrag, coverage=args.coverage, + identity=args.identity, mode=args.mode, disable_bar=args.disable_prog_bar) compute_neighbors_graph(pangenome, disable_bar=args.disable_prog_bar) From 070955ab1e3bf14505ec95f32fbc9dcbaafc8a3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Wed, 18 May 2022 10:28:25 +0200 Subject: [PATCH 04/20] Refactoring and documentation of code --- VERSION | 2 +- ppanggolin/RGP/genomicIsland.py | 112 +++++-- ppanggolin/RGP/spot.py | 127 ++++++-- ppanggolin/align/alignOnPang.py | 16 +- ppanggolin/annotate/annotate.py | 51 +-- ppanggolin/annotate/synta.py | 21 +- ppanggolin/edge.py | 36 +-- ppanggolin/figures/draw_spot.py | 22 +- ppanggolin/figures/drawing.py | 2 + ppanggolin/figures/tile_plot.py | 2 +- ppanggolin/formats/readBinaries.py | 306 ++++++++++++------ ppanggolin/formats/writeBinaries.py | 458 ++++++++++++++++++++------- ppanggolin/formats/writeFlat.py | 323 +++++++++++++++---- ppanggolin/formats/writeMSA.py | 139 ++++++-- ppanggolin/formats/writeSequences.py | 213 ++++++++++--- ppanggolin/geneFamily.py | 73 +++-- 
ppanggolin/genome.py | 168 ++++++++-- ppanggolin/graph/makeGraph.py | 59 +++- ppanggolin/info/info.py | 33 +- ppanggolin/main.py | 22 +- ppanggolin/metrics/fluidity.py | 24 +- ppanggolin/metrics/metrics.py | 60 ++-- ppanggolin/mod/module.py | 142 +++++---- ppanggolin/nem/partition.py | 250 +++++++++------ ppanggolin/nem/rarefaction.py | 168 +++++++--- ppanggolin/pangenome.py | 270 ++++++++-------- ppanggolin/region.py | 222 +++++++++---- ppanggolin/utils.py | 148 +++++++-- ppanggolin/workflow/all.py | 18 +- ppanggolin/workflow/panModule.py | 18 +- ppanggolin/workflow/panRGP.py | 16 +- ppanggolin/workflow/workflow.py | 16 +- tests/genome/test_Contig.py | 2 +- tests/genome/test_Gene.py | 2 +- tests/genome/test_Organism.py | 13 +- tests/region/test_Region.py | 2 +- tests/test_Pangenome.py | 8 +- 37 files changed, 2488 insertions(+), 1076 deletions(-) diff --git a/VERSION b/VERSION index 87c40b64..dd91a8b7 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.97 +1.2.98 diff --git a/ppanggolin/RGP/genomicIsland.py b/ppanggolin/RGP/genomicIsland.py index 66a83180..99adb011 100644 --- a/ppanggolin/RGP/genomicIsland.py +++ b/ppanggolin/RGP/genomicIsland.py @@ -9,6 +9,7 @@ from tqdm import tqdm # local libraries +from ppanggolin.genome import Organism, Contig from ppanggolin.pangenome import Pangenome from ppanggolin.region import Region from ppanggolin.formats import check_pangenome_info, write_pangenome, erase_pangenome @@ -83,8 +84,17 @@ def rewrite_matrix(contig, matrix, index, persistent, continuity, multi): next_node = matrix[index] -def init_matrices(contig, persistent_penalty, variable_gain, multi): - """initialize the vector of score/state nodes""" +def init_matrices(contig: Contig, multi: set, persistent_penalty: int = 3, variable_gain: int = 1) -> list: + """ + Initialize the vector of score/state nodes + + :param contig: Current contig from one organism + :param persistent_penalty: Penalty score to apply to persistent genes + :param variable_gain: Gain score to 
apply to variable genes + :param multi: multigenic persistent families of the pangenome graph. + + :return: Initialized matrice + """ mat = [] prev = None nb_perc = 0 @@ -137,8 +147,22 @@ def init_matrices(contig, persistent_penalty, variable_gain, multi): return mat -def mk_regions(contig, matrix, min_length, min_score, persistent, continuity, multi, naming="contig"): - # processing matrix and 'emptying' it to get the regions. +def mk_regions(contig: Contig, matrix: list, multi: set, min_length: int = 3000, min_score: int = 4, + persistent: int = 3, continuity: int = 1, naming: str = "contig") -> set: + """ + Processing matrix and 'emptying' it to get the regions. + + :param contig: Current contig from one organism + :param matrix: Initialized matrix + :param multi: multigenic persistent families of the pangenome graph. + :param min_length: Minimum length (bp) of a region to be considered RGP + :param min_score: Minimal score wanted for considering a region as being RGP + :param persistent: Penalty score to apply to persistent genes + :param continuity: Gain score to apply to variable genes + :param naming: + + :return: + """ def max_index_node(lst): """gets the last node with the highest score from a list of matriceNode""" if isinstance(lst, list): @@ -165,18 +189,19 @@ def max_index_node(lst): return contig_regions -def compute_org_rgp(organism, persistent_penalty, variable_gain, min_length, min_score, multigenics, naming="contig"): +def compute_org_rgp(organism: Organism, multigenics: set, persistent_penalty: int = 3, variable_gain: int = 1, + min_length: int = 3000, min_score: int = 4, naming: str = "contig") -> set: org_regions = set() for contig in organism.contigs: if len(contig.genes) != 0: # some contigs have no coding genes... # can definitely multiprocess this part, as not THAT much information is needed... 
- matrix = init_matrices(contig, persistent_penalty, variable_gain, multigenics) - org_regions |= mk_regions(contig, matrix, min_length, min_score, persistent_penalty, variable_gain, - multigenics, naming=naming) + matrix = init_matrices(contig, multigenics, persistent_penalty, variable_gain) + org_regions |= mk_regions(contig, matrix, multigenics, min_length, min_score, persistent_penalty, + variable_gain, naming=naming) return org_regions -def test_naming_scheme(pangenome): +def naming_scheme(pangenome: Pangenome): contigsids = set() for org in pangenome.organisms: for contig in org.contigs: @@ -189,18 +214,34 @@ def test_naming_scheme(pangenome): return "contig" -def check_pangenome_former_rgp(pangenome, force): - """ checks pan status and .h5 files for former rgp, delete them if allowed or raise an error """ +def check_pangenome_former_rgp(pangenome: Pangenome, force: bool = False): + """ checks pangenome status and .h5 files for former rgp, delete them if allowed or raise an error + + :param pangenome: Pangenome object + :param force: Allow to force write on Pangenome file + """ if pangenome.status["predictedRGP"] == "inFile" and not force: - raise Exception("You are trying to predict RGPs in a pan that already have them predicted. " + raise Exception("You are trying to predict RGPs in a pangenome that already have them predicted. 
" "If you REALLY want to do that, use --force " "(it will erase RGPs and every feature computed from them).") elif pangenome.status["predictedRGP"] == "inFile" and force: erase_pangenome(pangenome, rgp=True) -def predict_rgp(pangenome, force=False, persistent_penalty=3, variable_gain=1, min_length=3000, min_score=4, - dup_margin=0.05, disable_bar=False): +def predict_rgp(pangenome: Pangenome, persistent_penalty: int = 3, variable_gain: int = 1, min_length: int = 3000, + min_score: int = 4, dup_margin: float = 0.05, force: bool = False, disable_bar: bool = False): + """ + Main function to predict region of genomic plasticity + + :param pangenome: blank pangenome object + :param persistent_penalty: Penalty score to apply to persistent genes + :param variable_gain: Gain score to apply to variable genes + :param min_length: Minimum length (bp) of a region to be considered RGP + :param min_score: Minimal score wanted for considering a region as being RGP + :param dup_margin: minimum ratio of organisms in which family must have multiple genes to be considered duplicated + :param force: Allow to force write on Pangenome file + :param disable_bar: Disable progress bar + """ # check statuses and load info check_pangenome_former_rgp(pangenome, force) check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=False, need_partitions=True, @@ -209,12 +250,10 @@ def predict_rgp(pangenome, force=False, persistent_penalty=3, variable_gain=1, m logging.getLogger().info("Detecting multigenic families...") multigenics = pangenome.get_multigenics(dup_margin) logging.getLogger().info("Compute Regions of Genomic Plasticity ...") - naming_scheme = test_naming_scheme(pangenome) - bar = tqdm(pangenome.organisms, unit="genomes", disable=disable_bar) - for org in bar: - pangenome.add_regions( - compute_org_rgp(org, persistent_penalty, variable_gain, min_length, min_score, multigenics, - naming=naming_scheme)) + name_scheme = naming_scheme(pangenome) + for org in 
tqdm(pangenome.organisms, total=pangenome.number_of_organisms(), unit="genomes", disable=disable_bar): + pangenome.add_regions(compute_org_rgp(org, multigenics, persistent_penalty, variable_gain, min_length, + min_score, naming=name_scheme)) logging.getLogger().info(f"Predicted {len(pangenome.regions)} RGP") # save parameters and save status @@ -227,25 +266,42 @@ def predict_rgp(pangenome, force=False, persistent_penalty=3, variable_gain=1, m pangenome.status['predictedRGP'] = "Computed" -def launch(args): +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ pangenome = Pangenome() pangenome.add_file(args.pan) - predict_rgp(pangenome, force=args.force, persistent_penalty=args.persistent_penalty, - variable_gain=args.variable_gain, min_length=args.min_length, min_score=args.min_score, - dup_margin=args.dup_margin, disable_bar=args.disable_prog_bar) + predict_rgp(pangenome, persistent_penalty=args.persistent_penalty, variable_gain=args.variable_gain, + min_length=args.min_length, min_score=args.min_score, dup_margin=args.dup_margin, force=args.force, + disable_bar=args.disable_prog_bar) write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) -def subparser(sub_parser): +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in Command line + + :param sub_parser : sub_parser for align command + + :return : parser arguments for align command + """ parser = sub_parser.add_parser("rgp", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser_rgp(parser) return parser -def parser_rgp(parser): +def parser_rgp(parser: argparse.ArgumentParser): + """ + Parser for specific argument of rgp command + + :param parser: parser for align argument + """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") - required.add_argument('-p', '--pan', 
required=True, type=str, help="The pan .h5 file") + required.add_argument('-p', '--pan', required=True, type=str, help="The pangenome .h5 file") optional = parser.add_argument_group(title="Optional arguments") optional.add_argument('--persistent_penalty', required=False, type=int, default=3, @@ -278,6 +334,6 @@ def parser_rgp(parser): help="disables the progress bars") common.add_argument("-c", "--cpu", required=False, default=1, type=int, help="Number of available cpus") common.add_argument('-f', '--force', action="store_true", - help="Force writing in output directory and in pan output file.") + help="Force writing in output directory and in pangenome output file.") set_verbosity_level(main_parser.parse_args()) launch(main_parser.parse_args()) diff --git a/ppanggolin/RGP/spot.py b/ppanggolin/RGP/spot.py index bf20f50e..356273be 100644 --- a/ppanggolin/RGP/spot.py +++ b/ppanggolin/RGP/spot.py @@ -12,12 +12,24 @@ # local libraries from ppanggolin.pangenome import Pangenome -from ppanggolin.region import Spot +from ppanggolin.region import Region, Spot from ppanggolin.formats import check_pangenome_info, write_pangenome, erase_pangenome from ppanggolin.utils import mk_outdir -def comp_border(border1, border2, overlapping_match, exact_match, set_size): +def comp_border(border1: list, border2: list, overlapping_match: int = 2, + set_size: int = 3, exact_match: int = 1) -> bool: + """ + Compare two border + + :param border1: + :param border2: + :param overlapping_match: + :param set_size: + :param exact_match: + + :return: + """ if border1[0:exact_match] == border2[0:exact_match]: return True elif len(border1) == set_size and len(border2) == set_size: @@ -30,16 +42,25 @@ def comp_border(border1, border2, overlapping_match, exact_match, set_size): return False -def check_sim(pair_border1, pair_border2, overlapping_match, exact_match, set_size): +def check_sim(pair_border1: list, pair_border2: list, overlapping_match: int = 2, + set_size: int = 3, exact_match: int = 1) 
-> bool: """ - Checks if the two pairs of 'exact_match' first gene families are identical, + Checks if the two pairs of exact_match first gene families are identical, or eventually if they overlap in an ordered way at least 'overlapping_match' + + :param pair_border1: First flanking gene families pair + :param pair_border2: Second flanking gene families pair + :param overlapping_match: Number of missing persistent genes allowed when comparing flanking genes + :param set_size: Number of single copy markers to use as flanking genes for RGP during hotspot computation + :param exact_match: Number of perfectly matching flanking single copy markers required to associate RGPs + + :return: Whether identical gene families or not """ b1pair = [False, False] b2pair = [False, False] for countb1, b1 in enumerate(pair_border1): for countb2, b2 in enumerate(pair_border2): - if comp_border(b2, b1, overlapping_match, exact_match, set_size): + if comp_border(b2, b1, overlapping_match, set_size, exact_match): b1pair[countb1] = True b2pair[countb2] = True @@ -48,8 +69,30 @@ def check_sim(pair_border1, pair_border2, overlapping_match, exact_match, set_si return False -def make_spot_graph(rgps, multigenics, output, spot_graph=False, overlapping_match=2, set_size=3, exact_match=1): - def add_new_node(g, region, borders): +def make_spot_graph(rgps: list, multigenics: set, output: str, spot_graph: bool = False, overlapping_match: int = 2, + set_size: int = 3, exact_match: int = 1) -> list: + """ + Create a spot graph from pangenome RGP + + :param rgps: list of pangenome RGP + :param multigenics: pangenome graph multigenic persistent families + :param output: Output directory to save the spot graph + :param spot_graph: Writes gexf graph of pairs of blocks of single copy markers flanking RGPs from same hotspot + :param overlapping_match: Number of missing persistent genes allowed when comparing flanking genes + :param set_size: Number of single copy markers to use as flanking genes for RGP 
during hotspot computation + :param exact_match: Number of perfectly matching flanking single copy markers required to associate RGPs + + :return: list of computed spot + """ + + def add_new_node(g: nx.Graph, region: Region, borders: list): + """ + Add bordering region as node to graph + + :param g: spot graph + :param region: region in spot + :param borders: bordering families in spot + """ blocks = str(sorted([[gene.family.ID for gene in borders[0]], [gene.family.ID for gene in borders[1]]], key=lambda x: x[0])) g.add_node(blocks) @@ -82,7 +125,7 @@ def add_new_node(g, region, borders): node_obj_i = graph_spot.nodes[nodei] node_obj_j = graph_spot.nodes[nodej] if check_sim([node_obj_i["border0"], node_obj_i["border1"]], [node_obj_j["border0"], node_obj_j["border1"]], - overlapping_match, exact_match, set_size): + overlapping_match, set_size, exact_match): graph_spot.add_edge(nodei, nodej) spots = [] spot_id = 0 @@ -103,23 +146,45 @@ def add_new_node(g, region, borders): return spots -def check_pangenome_former_spots(pangenome, force): - """ checks pan status and .h5 files for former spots, delete them if allowed or raise an error """ +def check_pangenome_former_spots(pangenome: Pangenome, force: bool = False): + """ + checks pangenome status and .h5 files for former spots, delete them if allowed or raise an error + + :param pangenome: Pangenome object + :param force: Allow to force write on Pangenome file + """ if pangenome.status["spots"] == "inFile" and not force: - raise Exception("You are trying to detect spots on a pan which already has predicted spots. " + raise Exception("You are trying to detect spots on a pangenome which already has predicted spots. 
" "If you REALLY want to do that, use --force (it will erase spots previously predicted).") elif pangenome.status["spots"] == "inFile" and force: erase_pangenome(pangenome, spots=True) -def predict_hotspots(pangenome, output, force=False, spot_graph=False, overlapping_match=2, set_size=3, - exact_match=1, disable_bar=False): +def predict_hotspots(pangenome: Pangenome, output: str, spot_graph: bool = False, overlapping_match: int = 2, + set_size: int = 3, exact_match: int = 1, force: bool = False, disable_bar: bool = False): + """ + Main function to predict hotspot + + :param pangenome: Blank pangenome object + :param output: Output directory to save the spot graph + :param spot_graph: Writes gexf graph of pairs of blocks of single copy markers flanking RGPs from same hotspot + :param overlapping_match: Number of missing persistent genes allowed when comparing flanking genes + :param set_size: Number of single copy markers to use as flanking genes for RGP during hotspot computation + :param exact_match: Number of perfectly matching flanking single copy markers required to associate RGPs + :param force: Allow to force write on Pangenome file + :param disable_bar: Disable progress bar + """ # check that given parameters for hotspot computation make sense - check_parameter_logic(overlapping_match, set_size, exact_match) + if overlapping_match >= set_size: + raise Exception(f'--overlapping_match_hotspot ({overlapping_match}) cannot be bigger than (or equal to) ' + f'--set_size_hotspot ({set_size})') + if exact_match > set_size: + raise Exception(f'--exact_match_size_hotspot ({exact_match}) cannot be bigger than ' + f'--set_size_hotspot ({set_size})') # check for formerly computed stuff, and erase if allowed check_pangenome_former_spots(pangenome, force) # check statuses and load info - check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_graph=False, need_partitions=True, + check_pangenome_info(pangenome, need_annotations=True, 
need_families=True, need_partitions=True, need_rgp=True, disable_bar=disable_bar) # get multigenic gene families @@ -145,16 +210,12 @@ def predict_hotspots(pangenome, output, force=False, spot_graph=False, overlappi pangenome.parameters["spots"]["exact_match"] = exact_match -def check_parameter_logic(overlapping_match, set_size, exact_match): - if overlapping_match >= set_size: - raise Exception(f'--overlapping_match_hotspot ({overlapping_match}) cannot be bigger than (or equal to) ' - f'--set_size_hotspot ({set_size})') - if exact_match > set_size: - raise Exception(f'--exact_match_size_hotspot ({exact_match}) cannot be bigger than ' - f'--set_size_hotspot ({set_size})') - +def launch(args: argparse.Namespace): + """ + Command launcher -def launch(args): + :param args: All arguments provide by user + """ pangenome = Pangenome() pangenome.add_file(args.pangenome) if args.spot_graph: @@ -169,16 +230,28 @@ def launch(args): write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) -def subparser(sub_parser): +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in Command line + + :param sub_parser : sub_parser for align command + + :return : parser arguments for align command + """ parser = sub_parser.add_parser("spot", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser_spot(parser) return parser -def parser_spot(parser): +def parser_spot(parser: argparse.ArgumentParser): + """ + Parser for specific argument of spot command + + :param parser: parser for align argument + """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") - required.add_argument('-p', '--pangenome', required=True, type=str, help="The pan .h5 file") + required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") optional = parser.add_argument_group(title="Optional arguments") 
optional.add_argument('-o', '--output', required=False, type=str, default="ppanggolin_output" + time.strftime("_DATE%Y-%m-%d_HOUR%H.%M.%S", diff --git a/ppanggolin/align/alignOnPang.py b/ppanggolin/align/alignOnPang.py index 4b38cee2..762917c9 100644 --- a/ppanggolin/align/alignOnPang.py +++ b/ppanggolin/align/alignOnPang.py @@ -8,10 +8,11 @@ import subprocess import argparse from collections import defaultdict -from typing import Tuple +from typing import Tuple, Set, Dict # local libraries from ppanggolin.formats import check_pangenome_info +from ppanggolin.geneFamily import GeneFamily from ppanggolin.utils import mk_outdir, read_compressed_or_not from ppanggolin.pangenome import Pangenome from ppanggolin.region import Spot @@ -75,14 +76,14 @@ def align_seq_to_pang(pang_file: TextIOWrapper, seq_file: TextIOWrapper, output: return outfile -def read_alignments(aln_res: str, pangenome: Pangenome) -> Tuple[dict, str]: +def read_alignments(aln_res: str, pangenome: Pangenome) -> Tuple[Dict[str, GeneFamily], str]: """ Read alignment result to link input sequence to pangenome :param aln_res: Alignement result file :param pangenome: Input pangenome - :return: Dictionnary with sequence link to pangenome and actual name of resulting alignment file + :return: Dictionnary with sequence link to pangenome gene families and actual name of resulting alignment file """ seq2pang = {} outname = open(aln_res.replace("_tmp", ""), "w") # write the actual result file @@ -97,12 +98,13 @@ def read_alignments(aln_res: str, pangenome: Pangenome) -> Tuple[dict, str]: return seq2pang, outname.name -def get_seq(seq_file: TextIOWrapper) -> set: +def get_seq(seq_file: TextIOWrapper) -> Set[str]: """ get sequence from sequence input file :param seq_file: file containing sequences - :return: + + :return: set of sequences """ seqset = set() for line in seq_file: @@ -125,7 +127,7 @@ def write_gene_fam_sequences(pangenome: Pangenome, file_obj: TextIOWrapper, add: file_obj.flush() -def 
project_partition(seq_to_pang: dict, seq_set: set, output: str) -> str: +def project_partition(seq_to_pang: Dict[str, GeneFamily], seq_set: Set[str], output: str) -> str: """ Project the partition of each sequence from the input file @@ -164,7 +166,7 @@ def get_fam_to_rgp(pangenome, multigenics: set) -> dict: return fam2rgp -def get_fam_to_spot(pangenome: Pangenome, multigenics: set) -> Tuple[dict, dict]: +def get_fam_to_spot(pangenome: Pangenome, multigenics: Set[GeneFamily]) -> Tuple[dict, dict]: """ Reads a pangenome object to link families and spots and indicate where each family is. diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py index e4f9103c..9faae5dc 100644 --- a/ppanggolin/annotate/annotate.py +++ b/ppanggolin/annotate/annotate.py @@ -82,23 +82,13 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i # but was when cases like this were encountered) new_gene = Gene(org.name + "_CDS_" + str(gene_counter).zfill(4)) - new_gene.fill_annotations(start=start, - stop=stop, - strand=strand, - gene_type=gene_type, - position=position, - name=gene_name, - product=product, - genetic_code=genetic_code, - local_identifier=gene_id) + new_gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type=gene_type, name=gene_name, + position=position, product=product, local_identifier=gene_id, + genetic_code=genetic_code) contig.add_gene(new_gene) else: # if not CDS, it is RNA new_gene = RNA(org.name + "_RNA_" + str(rna_counter).zfill(4)) - new_gene.fill_annotations(start=start, - stop=stop, - strand=strand, - gene_type=gene_type, - name=gene_name, + new_gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type=gene_type, name=gene_name, product=product) contig.add_rna(new_gene) new_gene.fill_parents(org, contig) @@ -141,7 +131,7 @@ def read_org_gbff(organism: str, gbff_file_path: str, circular_contigs: list, ps if contig_id != "": if contig_id in circular_contigs: is_circ = True - contig = 
org.get_or_add_contig(contig_id, is_circ) + contig = org.get_contig(contig_id, is_circ) set_contig = True line = lines.pop() if not set_contig: @@ -150,7 +140,7 @@ def read_org_gbff(organism: str, gbff_file_path: str, circular_contigs: list, ps # might still be the same even though it should not(?) if contig_locus_id in circular_contigs: is_circ = True - contig = org.get_or_add_contig(contig_locus_id, is_circ) + contig = org.get_contig(contig_locus_id, is_circ) # start of the feature object. dbxref = set() gene_name = "" @@ -310,7 +300,7 @@ def get_id_attribute(attributes_dict: dict) -> str: has_fasta = True elif line.startswith('sequence-region', 2, 17): fields = [el.strip() for el in line.split()] - contig = org.get_or_add_contig(fields[1], True if fields[1] in circular_contigs else False) + contig = org.get_contig(fields[1], True if fields[1] in circular_contigs else False) continue elif line.startswith('#'): # comment lines to be ignores by parsers continue @@ -349,33 +339,24 @@ def get_id_attribute(attributes_dict: dict) -> str: genetic_code = 11 if contig is None or contig.name != fields_gff[GFF_seqname]: # get the current contig - contig = org.get_or_add_contig(fields_gff[GFF_seqname], - True if fields_gff[GFF_seqname] in circular_contigs else False) + contig = org.get_contig(fields_gff[GFF_seqname], + True if fields_gff[GFF_seqname] in circular_contigs else False) if fields_gff[GFF_type] == "CDS" and (not pseudogene or (pseudogene and pseudo)): gene = Gene(org.name + "_CDS_" + str(gene_counter).zfill(4)) # here contig is filled in order, so position is the number of genes already stored in the contig. 
- gene.fill_annotations(start=int(fields_gff[GFF_start]), - stop=int(fields_gff[GFF_end]), - strand=fields_gff[GFF_strand], - gene_type=fields_gff[GFF_type], - position=len(contig.genes), - name=name, - product=product, - genetic_code=genetic_code, - local_identifier=gene_id) + gene.fill_annotations(start=int(fields_gff[GFF_start]), stop=int(fields_gff[GFF_end]), + strand=fields_gff[GFF_strand], gene_type=fields_gff[GFF_type], name=name, + position=len(contig.genes), product=product, local_identifier=gene_id, + genetic_code=genetic_code) gene.fill_parents(org, contig) contig.add_gene(gene) gene_counter += 1 elif "RNA" in fields_gff[GFF_type]: rna = RNA(org.name + "_CDS_" + str(rna_counter).zfill(4)) - rna.fill_annotations(start=int(fields_gff[GFF_start]), - stop=int(fields_gff[GFF_end]), - strand=fields_gff[GFF_strand], - gene_type=fields_gff[GFF_type], - name=name, - product=product, - local_identifier=gene_id) + rna.fill_annotations(start=int(fields_gff[GFF_start]), stop=int(fields_gff[GFF_end]), + strand=fields_gff[GFF_strand], gene_type=fields_gff[GFF_type], name=name, + product=product, local_identifier=gene_id) rna.fill_parents(org, contig) contig.add_rna(rna) rna_counter += 1 diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py index f753aadb..e55aee3e 100644 --- a/ppanggolin/annotate/synta.py +++ b/ppanggolin/annotate/synta.py @@ -57,11 +57,11 @@ def launch_aragorn(fna_file: str, org: Organism) -> defaultdict: file_data.pop() # then next line must be removed too. elif len(line) > 0: # if the line isn't empty, there's data to get. 
line_data = line.split() - start, stop = ast.literal_eval(line_data[2].replace("c", "")) + start, stop = map(int, ast.literal_eval(line_data[2].replace("c", ""))) c += 1 - gene = RNA(identifier=locustag + '_tRNA_' + str(c).zfill(3)) - gene.fill_annotations(start=start, stop=stop, gene_type="tRNA", product=line_data[1] + line_data[4], - strand="-" if line_data[2].startswith("c") else "+",) + gene = RNA(rna_id=locustag + '_tRNA_' + str(c).zfill(3)) + gene.fill_annotations(start=start, stop=stop, strand="-" if line_data[2].startswith("c") else "+", + gene_type="tRNA", product=line_data[1] + line_data[4]) gene_objs[header].add(gene) return gene_objs @@ -95,8 +95,8 @@ def launch_prodigal(fna_file: str, org: Organism, code: int = 11, procedure: str c += 1 line_data = line[1:].split("_") # not considering the '>' gene = Gene(gene_id=locustag + "_CDS_" + str(c).zfill(4)) - gene.fill_annotations(start=line_data[1], stop=line_data[2], strand=line_data[3], - gene_type="CDS", genetic_code=code) + gene.fill_annotations(start=int(line_data[1]), stop=int(line_data[2]), strand=line_data[3], gene_type="CDS", + genetic_code=code) gene_objs[header].add(gene) return gene_objs @@ -144,9 +144,8 @@ def launch_infernal(fna_file: str, org: Organism, tmpdir: str, kingdom: str = " start = line_data[8] stop = line_data[7] else: - start = line_data[7] - stop = line_data[8] - gene = RNA(identifier=locustag + "_rRNA_" + str(c).zfill(3)) + start, stop = map(int, (line_data[7], line_data[8])) + gene = RNA(rna_id=locustag + "_rRNA_" + str(c).zfill(3)) gene.fill_annotations(start=start, stop=stop, strand=strand, gene_type="rRNA", product=" ".join(line_data[17:])) gene_objs[line_data[2]].add(gene) @@ -174,7 +173,7 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list], contig_filte contigs[contig.name] = contig_seq.upper() all_contig_len += len(contig_seq) contig_seq = "" - contig = org.get_or_add_contig(line.split()[0][1:]) + contig = org.get_contig(line.split()[0][1:]) else: 
contig_seq += line.strip() if len(contig_seq) >= contig_filter: # processing the last contig @@ -326,7 +325,7 @@ def annotate_organism(org_name: str, file_name: str, circular_contigs, tmpdir: s genes = overlap_filter(genes, overlap) for contigName, genes in genes.items(): - contig = org.get_or_add_contig(contigName) + contig = org.get_contig(contigName) if contig.name in circular_contigs: contig.is_circular = True for gene in genes: diff --git a/ppanggolin/edge.py b/ppanggolin/edge.py index fc1bf127..6c9db0a8 100644 --- a/ppanggolin/edge.py +++ b/ppanggolin/edge.py @@ -3,6 +3,9 @@ # default libraries from collections import defaultdict +from typing import Dict, List, Tuple + +from ppanggolin.genome import Gene, Organism class Edge: @@ -10,18 +13,16 @@ class Edge: organisms in which the neighborship is found, and all the involved genes as well. :param source_gene: a first gene to initialize the edge - :type source_gene: :class:`ppanggolin.genome.Gene` :param target_gene: a second gene to initialize the edge - :type target_gene: :class:`ppanggolin.genome.Gene` """ - def __init__(self, source_gene, target_gene): + def __init__(self, source_gene: Gene, target_gene: Gene): if source_gene.family is None: - raise Exception( - f"You cannot create a graph without gene families. gene {source_gene.ID} did not have a gene family.") + raise Exception(f"You cannot create a graph without gene families. " + f"gene {source_gene.ID} did not have a gene family.") if target_gene.family is None: - raise Exception( - f"You cannot create a graph without gene families. gene {target_gene.ID} did not have a gene family.") + raise Exception(f"You cannot create a graph without gene families. 
" + f"gene {target_gene.ID} did not have a gene family.") self.source = source_gene.family self.target = target_gene.family self.source._edges[self.target] = self @@ -29,32 +30,27 @@ def __init__(self, source_gene, target_gene): self.organisms = defaultdict(list) self.add_genes(source_gene, target_gene) - def get_org_dict(self): - """ + def get_org_dict(self) -> Dict[Organism, List[Tuple[Gene, Gene]]]: + """ Create a dictionnary of the Organisms in which the edge is found - :return: A dictionnary of the Organisms in which the edge is found, with organisms as key and an iterable of the - pairs of genes as value - :rtype: dict[:class:`ppanggolin.genome.Organism`, tuple[:class:`ppanggolin.genome.Gene`, - :class:`ppanggolin.genome.Gene`]] + :return: Dictionary with organisms as key and an iterable of the pairs of genes as value """ return self.organisms @property - def gene_pairs(self): - """ - + def gene_pairs(self) -> List[Tuple[Gene, Gene]]: + """ Get list of all the gene pairs of the Edge + :return: A list of all the gene pairs of the Edge - :rtype: list[tuple[:class:`ppanggolin.genome.Gene`, :class:`ppanggolin.genome.Gene`]] """ return [gene_pair for gene_list in self.organisms.values() for gene_pair in gene_list] - def add_genes(self, source_gene, target_gene): + def add_genes(self, source_gene: Gene, target_gene: Gene): """Adds genes to the edge. They are supposed to be on the same organism. :param source_gene: a source gene to add to the edge - :type source_gene: :class:`ppanggolin.genome.Gene` :param target_gene: a target gene to add to the edge - :type target_gene: :class:`ppanggolin.genome.Gene` + :raises Exception: If the genes are not on the same organism. 
""" org = source_gene.organism diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index ee1754d9..02c69332 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -8,13 +8,6 @@ import random from math import pi -# local libraries -from ppanggolin.pangenome import Pangenome -from ppanggolin.region import Spot -from ppanggolin.utils import jaccard_similarities -from ppanggolin.formats import check_pangenome_info -from ppanggolin.RGP.spot import comp_border - # installed libraries from scipy.spatial.distance import pdist from scipy.sparse import csc_matrix @@ -27,6 +20,13 @@ from bokeh.layouts import column, row from bokeh.models import WheelZoomTool, LabelSet, Slider, CustomJS, HoverTool, RadioGroup, Div, Column, GlyphRenderer +# local libraries +from ppanggolin.pangenome import Pangenome +from ppanggolin.region import Spot +from ppanggolin.utils import jaccard_similarities +from ppanggolin.formats import check_pangenome_info +from ppanggolin.RGP.spot import comp_border + def check_predicted_spots(pangenome): """ checks pangenome status and .h5 files for predicted spots, raises an error if they were not predicted""" @@ -123,12 +123,12 @@ def line_order_gene_lists(gene_lists: list, overlapping_match: int, exact_match: for unclassIndex in list(to_classify): border1 = [gene.family for gene in gene_lists[unclassIndex][1][0]] border2 = [gene.family for gene in gene_lists[unclassIndex][1][1]] - if comp_border(base_border1, border1, overlapping_match, exact_match, set_size) and \ - comp_border(base_border2, border2, overlapping_match, exact_match, set_size): + if comp_border(base_border1, border1, overlapping_match, set_size, exact_match) and \ + comp_border(base_border2, border2, overlapping_match, set_size, exact_match): to_classify.discard(unclassIndex) new_classify.add(unclassIndex) - elif comp_border(base_border2, border1, overlapping_match, exact_match, set_size) and \ - comp_border(base_border1, border2, 
overlapping_match, exact_match, set_size): + elif comp_border(base_border2, border1, overlapping_match, set_size, exact_match) and \ + comp_border(base_border1, border2, overlapping_match, set_size, exact_match): # reverse the order of the genes to match the 'reference' gene_lists[unclassIndex][0] = gene_lists[unclassIndex][0][::-1] # inverse the borders diff --git a/ppanggolin/figures/drawing.py b/ppanggolin/figures/drawing.py index fc13ff91..231a55d9 100644 --- a/ppanggolin/figures/drawing.py +++ b/ppanggolin/figures/drawing.py @@ -6,6 +6,8 @@ import time import os +# Installed libraries + # local libraries from ppanggolin.utils import mk_outdir from ppanggolin.pangenome import Pangenome diff --git a/ppanggolin/figures/tile_plot.py b/ppanggolin/figures/tile_plot.py index 2826391b..9a2b7caa 100644 --- a/ppanggolin/figures/tile_plot.py +++ b/ppanggolin/figures/tile_plot.py @@ -45,7 +45,7 @@ def draw_tile_plot(pangenome: Pangenome, output: str, nocloud: bool = False, dis families = {fam for fam in pangenome.gene_families if not fam.partition.startswith("C")} else: families = set(pangenome.gene_families) - org_index = pangenome.get_index() + org_index = pangenome.get_org_index() index2org = {} for org, index in org_index.items(): index2org[index] = org diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index f49f7139..df6928da 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -6,16 +6,25 @@ import sys # installed libraries +from typing import TextIO + +from tables import Table from tqdm import tqdm import tables # local libraries from ppanggolin.genome import Organism, Gene, RNA +from ppanggolin.pangenome import Pangenome from ppanggolin.region import Spot, Module -def get_number_of_organisms(pangenome): - """ standalone function to get the number of organisms in a pan""" +def get_number_of_organisms(pangenome: Pangenome) -> int: + """ Standalone function to get the number of organisms in a 
pangenome + + :param pangenome: Annotated pangenome + + :return: Number of organisms in the pangenome + """ if hasattr(pangenome, "file"): filename = pangenome.file else: @@ -31,9 +40,12 @@ def get_number_of_organisms(pangenome): return len(org_set) -def get_status(pangenome, pangenome_file): +def get_status(pangenome: Pangenome, pangenome_file: str): """ - Checks which elements are already present in the file. + Checks which elements are already present in the file. + + :param pangenome: Blank pangenome + :param pangenome_file: path to the pangenome file """ h5f = tables.open_file(pangenome_file, "r") logging.getLogger().info("Getting the current pangenome status") @@ -73,47 +85,70 @@ def get_status(pangenome, pangenome_file): h5f.close() -def read_chunks(table, column=None, chunk=10000): +def read_chunks(table: Table, column: str = None, chunk: int = 10000): """ - Reading entirely the provided table (or column if specified) chunk per chunk to limit RAM usage. + Reading entirely the provided table (or column if specified) chunk per chunk to limit RAM usage. + + :param table: + :param column: + :param chunk: """ for i in range(0, table.nrows, chunk): for row in table.read(start=i, stop=i + chunk, field=column): yield row -def get_gene_sequences_from_file(filename, file_obj, list_cds=None, add='', disable_bar=False): +def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter = None, add: str = '', + disable_bar: bool = False): """ - Writes the CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS, - and adds the eventual str 'add' in front of the identifiers - Loads the sequences from a .h5 pangenome file + Writes the CDS sequences of the Pangenome object to a File object that can be filtered or not by a list of CDS, + and adds the eventual str 'add' in front of the identifiers. Loads the sequences from a .h5 pangenome file. 
+ + :param filename: Name of the pangenome file + :param file_obj: Name of the output file + :param list_cds: An iterable object of CDS + :param add: Add a prefix to sequence header + :param disable_bar: Prevent to print disable progress bar """ - logging.getLogger().info("Extracting and writing CDS sequences from a .h5 pangenome file to a fasta file...") + logging.getLogger().info(f"Extracting and writing CDS sequences from a {filename} file to a fasta file...") h5f = tables.open_file(filename, "r", driver_core_backing_store=0) table = h5f.root.geneSequences - bar = tqdm(range(table.nrows), unit="gene", disable=disable_bar) list_cds = set(list_cds) if list_cds is not None else None - for row in read_chunks(table, - chunk=20000): # reading the table chunk per chunk otherwise RAM dies on big pangenomes + for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): + # Read the table chunk per chunk otherwise RAM dies on big pangenomes name_cds = row["gene"].decode() if row["type"] == b"CDS" and (list_cds is None or name_cds in list_cds): file_obj.write('>' + add + name_cds + "\n") file_obj.write(row["dna"].decode() + "\n") - bar.update() file_obj.flush() - bar.close() h5f.close() -def launch_read_organism(args): +def launch_read_organism(args) -> None: + """ + Allow to launch read organism in multiprocessing + + :param args: (pangenome: Pangenome, org_name: str, contig_dict: dict, circular_contigs: dict, link: bool) + + :return: Nothing function not called yet + """ return read_organism(*args) -def read_organism(pangenome, org_name, contig_dict, circular_contigs, link=False): +def read_organism(pangenome: Pangenome, org_name: str, contig_dict: dict, circular_contigs: dict, link: bool = False): + """ + Read information from pangenome to assign to organism object + + :param pangenome: Input pangenome + :param org_name: Name of the organism + :param contig_dict: Dictionary with all contig and associate genes + :param 
circular_contigs: Dictionary of contigs + :param link: get the gene object if the genes are clustered + """ org = Organism(org_name) gene, gene_type = (None, None) for contigName, geneList in contig_dict.items(): - contig = org.get_or_add_contig(contigName, is_circular=circular_contigs[contigName]) + contig = org.get_contig(contigName, is_circular=circular_contigs[contigName]) for row in geneList: if link: # if the gene families are already computed/loaded the gene exists. gene = pangenome.get_gene(row["ID"].decode()) @@ -127,10 +162,15 @@ def read_organism(pangenome, org_name, contig_dict, circular_contigs, link=False local = row["local"].decode() except ValueError: local = "" - gene.fill_annotations(start=row["start"], stop=row["stop"], strand=row["strand"].decode(), - gene_type=row["type"].decode(), name=row["name"].decode(), - product=row["product"].decode(), local_identifier=local, position=row["position"], - genetic_code=row["genetic_code"]) + if isinstance(gene, Gene): + gene.fill_annotations(start=row["start"], stop=row["stop"], strand=row["strand"].decode(), + gene_type=row["type"].decode(), name=row["name"].decode(), position=row['position'], + genetic_code=row["genetic_code"], product=row["product"].decode(), + local_identifier=local) + else: + gene.fill_annotations(start=row["start"], stop=row["stop"], strand=row["strand"].decode(), + gene_type=row["type"].decode(), name=row["name"].decode(), + product=row["product"].decode(), local_identifier=local) gene.is_fragment = row["is_fragment"] gene.fill_parents(org, contig) if gene_type == "CDS": @@ -142,52 +182,63 @@ def read_organism(pangenome, org_name, contig_dict, circular_contigs, link=False pangenome.add_organism(org) -def read_graph(pangenome, h5f, disable_bar=False): +def read_graph(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): + """ + Read information about graph in pangenome hdf5 file to add in pangenome object + + :param pangenome: Pangenome object without graph information + 
:param h5f: Pangenome HDF5 file with graph information + :param disable_bar: Disable the progress bar + """ table = h5f.root.edges if not pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] or \ not pangenome.status["genesClustered"] in ["Computed", "Loaded"]: raise Exception("It's not possible to read the graph " "if the annotations and the gene families have not been loaded.") - - bar = tqdm(range(table.nrows), unit="contig adjacency", disable=disable_bar) - for row in read_chunks(table): + for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="contig adjacency", disable=disable_bar): source = pangenome.get_gene(row["geneSource"].decode()) target = pangenome.get_gene(row["geneTarget"].decode()) pangenome.add_edge(source, target) - bar.update() - bar.close() pangenome.status["neighborsGraph"] = "Loaded" -def read_gene_families(pangenome, h5f, disable_bar=False): +def read_gene_families(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): + """ + Read gene families in pangenome hdf5 file to add in pangenome object + + :param pangenome: Pangenome object without gene families + :param h5f: Pangenome HDF5 file with gene families information + :param disable_bar: Disable the progress bar + """ table = h5f.root.gene_families link = True if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] else False - bar = tqdm(range(table.nrows), unit="gene", disable=disable_bar) - for row in read_chunks(table): + for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene family", disable=disable_bar): fam = pangenome.add_gene_family(row["geneFam"].decode()) if link: # linking if we have loaded the annotations gene_obj = pangenome.get_gene(row["gene"].decode()) else: # else, no gene_obj = Gene(row["gene"].decode()) fam.add_gene(gene_obj) - bar.update() - bar.close() pangenome.status["genesClustered"] = "Loaded" -def read_gene_families_info(pangenome, h5f, disable_bar=False): +def 
read_gene_families_info(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): + """ + Read information about gene families in pangenome hdf5 file to add in pangenome object + + :param pangenome: Pangenome object without gene families information + :param h5f: Pangenome HDF5 file with gene families information + :param disable_bar: Disable the progress bar + """ table = h5f.root.geneFamiliesInfo - bar = tqdm(range(table.nrows), unit="gene family", disable=disable_bar) - for row in read_chunks(table): + for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene family", disable=disable_bar): fam = pangenome.add_gene_family(row["name"].decode()) fam.add_partition(row["partition"].decode()) fam.add_sequence(row["protein"].decode()) - bar.update() - bar.close() if h5f.root.status._v_attrs.Partitioned: pangenome.status["partitioned"] = "Loaded" @@ -195,111 +246,134 @@ def read_gene_families_info(pangenome, h5f, disable_bar=False): pangenome.status["geneFamilySequences"] = "Loaded" -def read_gene_sequences(pangenome, h5f, disable_bar=False): +def read_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): + """ + Read gene sequences in pangenome hdf5 file to add in pangenome object + + :param pangenome: Pangenome object without gene sequence associate to gene + :param h5f: Pangenome HDF5 file with gene sequence associate to gene + :param disable_bar: Disable the progress bar + """ if not pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"]: raise Exception("It's not possible to read the pangenome gene dna sequences " "if the annotations have not been loaded.") table = h5f.root.geneSequences - bar = tqdm(range(table.nrows), unit="gene", disable=disable_bar) - for row in read_chunks(table): + for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): gene = pangenome.get_gene(row['gene'].decode()) gene.add_dna(row['dna'].decode()) - bar.update() - bar.close() 
pangenome.status["geneSequences"] = "Loaded" -def read_rgp(pangenome, h5f, disable_bar=False): +def read_rgp(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): + """ + Read region of genomic plasticty in pangenome hdf5 file to add in pangenome object + + :param pangenome: Pangenome object without RGP + :param h5f: Pangenome HDF5 file with RGP computed + :param disable_bar: Disable the progress bar + """ if not pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] or \ not pangenome.status["genesClustered"] in ["Computed", "Loaded"]: raise Exception("It's not possible to read the RGP " "if the annotations and the gene families have not been loaded.") table = h5f.root.RGP - bar = tqdm(range(table.nrows), unit="gene", disable=disable_bar) - for row in read_chunks(table): - region = pangenome.get_or_add_region(row["RGP"].decode()) + for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="region", disable=disable_bar): + region = pangenome.get_region(row["RGP"].decode()) region.append(pangenome.get_gene(row["gene"].decode())) - bar.update() - bar.close() # order the genes properly in the regions for region in pangenome.regions: region.genes = sorted(region.genes, key=lambda x: x.position) # order the same way as on the contig pangenome.status["predictedRGP"] = "Loaded" -def read_spots(pangenome, h5f, disable_bar=False): +def read_spots(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): + """ + Read hotspot in pangenome hdf5 file to add in pangenome object + + :param pangenome: Pangenome object without spot + :param h5f: Pangenome HDF5 file with spot computed + :param disable_bar: Disable the progress bar + """ table = h5f.root.spots - bar = tqdm(range(table.nrows), unit="region", disable=disable_bar) spots = {} - for row in read_chunks(table): + for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="spot", disable=disable_bar): curr_spot = spots.get(row["spot"]) if curr_spot is None: 
curr_spot = Spot(row["spot"]) spots[row["spot"]] = curr_spot - curr_spot.add_region(pangenome.get_or_add_region(row["RGP"].decode())) + curr_spot.add_region(pangenome.get_region(row["RGP"].decode())) curr_spot.spot_2_families() - bar.update() - bar.close() pangenome.add_spots(spots.values()) pangenome.status["spots"] = "Loaded" -def read_modules(pangenome, h5f, disable_bar=False): +def read_modules(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): + """ + Read modules in pangenome hdf5 file to add in pangenome object + + :param pangenome: Pangenome object without modules + :param h5f: Pangenome HDF5 file with modules computed + :param disable_bar: Disable the progress bar + """ if not pangenome.status["genesClustered"] in ["Computed", "Loaded"]: raise Exception("It's not possible to read the modules if the gene families have not been loaded.") table = h5f.root.modules - bar = tqdm(range(table.nrows), unit="module", disable=disable_bar) modules = {} # id2mod - for row in read_chunks(table): + for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="module", disable=disable_bar): curr_module = modules.get(row['module']) if curr_module is None: curr_module = Module(row['module']) modules[row["module"]] = curr_module curr_module.add_family(pangenome.get_gene_family(row['geneFam'].decode())) - bar.update() - bar.close() pangenome.add_modules(modules.values()) pangenome.status["modules"] = "Loaded" -def read_annotation(pangenome, h5f, disable_bar=False): +def read_annotation(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): + """ + Read annotation in pangenome hdf5 file to add in pangenome object + + :param pangenome: Pangenome object without annotation + :param h5f: Pangenome HDF5 file with annotation + :param disable_bar: Disable the progress bar + """ annotations = h5f.root.annotations table = annotations.genes - bar = tqdm(range(table.nrows), unit="gene", disable=disable_bar) pangenome_dict = {} circular_contigs = 
{} - for row in read_chunks(table): + for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): + decode_org = row["organism"].decode() try: - pangenome_dict[row["organism"].decode()][row["contig"]["name"].decode()].append( - row["gene"]) # new gene, seen contig, seen org + # new gene, seen contig, seen org + pangenome_dict[decode_org][row["contig"]["name"].decode()].append(row["gene"]) except KeyError: try: - pangenome_dict[row["organism"].decode()][row["contig"]["name"].decode()] = [ - row["gene"]] # new contig, seen org - circular_contigs[row["organism"].decode()][row["contig"]["name"].decode()] = \ - row["contig"]["is_circular"] + # new contig, seen org + pangenome_dict[decode_org][row["contig"]["name"].decode()] = [row["gene"]] + circular_contigs[decode_org][row["contig"]["name"].decode()] = row["contig"]["is_circular"] except KeyError: - pangenome_dict[sys.intern(row["organism"].decode())] = { - row["contig"]["name"].decode(): [row["gene"]]} # new org - circular_contigs[row["organism"].decode()] = { - row["contig"]["name"].decode(): row["contig"]["is_circular"]} - bar.update() - bar.close() + # new org + pangenome_dict[sys.intern(decode_org)] = {row["contig"]["name"].decode(): [row["gene"]]} + circular_contigs[decode_org] = {row["contig"]["name"].decode(): row["contig"]["is_circular"]} link = True if pangenome.status["genesClustered"] in ["Computed", "Loaded"] else False - bar = tqdm(range(len(pangenome_dict)), unit="organism", disable=disable_bar) - for orgName, contigDict in pangenome_dict.items(): + for orgName, contigDict in tqdm(pangenome_dict.items(), total=len(pangenome_dict), + unit="organism", disable=disable_bar): + # TODO read organism in multiprocessing read_organism(pangenome, orgName, contigDict, circular_contigs[orgName], link) - bar.update() - bar.close() pangenome.status["genomesAnnotated"] = "Loaded" -def read_info(h5f): +def read_info(h5f: tables.File): + """ + Read the pangenome content + + 
:param h5f: Pangenome HDF5 file + """ if "/info" in h5f: info_group = h5f.root.info @@ -345,10 +419,14 @@ def read_info(h5f): else: print(f"Modules: {info_group._v_attrs['numberOfModules']}") print(f"Families in Modules: {info_group._v_attrs['numberOfFamiliesInModules']}") - # readModulesInfo(h5f) -def read_modules_info(h5f): +def read_modules_info(h5f: tables.File): + """ + Read modules information in pangenome hdf5 file + + :param h5f: Pangenome HDF5 file with RGP computed + """ if "/info" in h5f: info_group = h5f.root.info if all(x in info_group._v_attrs._f_list() for x in ['CloudSpecInModules', 'PersistentSpecInModules', @@ -366,7 +444,12 @@ def read_modules_info(h5f): f"mean: {info_group._v_attrs['StatOfFamiliesInModules']['mean']}") -def read_parameters(h5f): +def read_parameters(h5f: tables.File): + """ + Read pangenome parameters + + :param h5f: Pangenome HDF5 file + """ if "/info" in h5f: info_group = h5f.root.info if "parameters" in info_group._v_attrs._f_list(): @@ -376,11 +459,22 @@ def read_parameters(h5f): print(f" {key2} : {val}") -def read_pangenome(pangenome, annotation=False, gene_families=False, graph=False, rgp=False, spots=False, - gene_sequences=False, modules=False, disable_bar=False): +def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = False, graph: bool = False, + rgp: bool = False, spots: bool = False, gene_sequences: bool = False, modules: bool = False, + disable_bar: bool = False): """ - Reads a previously written pan, with all of its parts, depending on what is asked, - with regard to what is filled in the 'status' field of the hdf5 file. + Reads a previously written pan, with all of its parts, depending on what is asked, + with regard to what is filled in the 'status' field of the hdf5 file. 
+ + :param pangenome: Pangenome object without some information + :param annotation: get annotation + :param gene_families: get gene families + :param graph: get graph + :param rgp: get RGP + :param spots: get hotspot + :param gene_sequences: get gene sequences + :param modules: get modules + :param disable_bar: Allow to disable the progress bar """ if hasattr(pangenome, "file"): filename = pangenome.file @@ -414,38 +508,50 @@ def read_pangenome(pangenome, annotation=False, gene_families=False, graph=False logging.getLogger().info("Reading the neighbors graph edges...") read_graph(pangenome, h5f, disable_bar=disable_bar) else: - raise Exception( - f"The pangenome in file '{filename}' does not have graph information, or has been improperly filled") + raise Exception(f"The pangenome in file '{filename}' does not have graph information, " + f"or has been improperly filled") if rgp: if h5f.root.status._v_attrs.predictedRGP: logging.getLogger().info("Reading the RGP...") read_rgp(pangenome, h5f, disable_bar=disable_bar) else: - raise Exception( - f"The pangenome in file '{filename}' does not have RGP information, or has been improperly filled") + raise Exception(f"The pangenome in file '{filename}' does not have RGP information, " + f"or has been improperly filled") if spots: if h5f.root.status._v_attrs.spots: logging.getLogger().info("Reading the spots...") read_spots(pangenome, h5f, disable_bar=disable_bar) else: - raise Exception( - f"The pangenome in file '{filename}' does not have spots information, or has been improperly filled") + raise Exception(f"The pangenome in file '{filename}' does not have spots information, " + f"or has been improperly filled") if modules: if h5f.root.status._v_attrs.modules: logging.getLogger().info("Reading the modules...") read_modules(pangenome, h5f, disable_bar=disable_bar) else: - raise Exception( - f"The pangenome in file '{filename}' does not have modules information, or has been improperly filled") + raise Exception(f"The 
pangenome in file '{filename}' does not have modules information, " + f"or has been improperly filled") h5f.close() -def check_pangenome_info(pangenome, need_annotations=False, need_families=False, need_graph=False, - need_partitions=False, need_rgp=False, need_spots=False, need_gene_sequences=False, - need_modules=False, disable_bar=False): +def check_pangenome_info(pangenome, need_annotations: bool = False, need_families: bool = False, + need_graph: bool = False, need_partitions: bool = False, need_rgp: bool = False, + need_spots: bool = False, need_gene_sequences: bool = False, need_modules: bool = False, + disable_bar: bool = False): """ - defines what needs to be read depending on what is needed, and automatically checks if the required elements + Defines what needs to be read depending on what is needed, and automatically checks if the required elements have been computed with regard to the `pangenome.status` + + :param pangenome: Pangenome object without some information + :param need_annotations: get annotation + :param need_families: get gene families + :param need_graph: get graph + :param need_partitions: get partition + :param need_rgp: get RGP + :param need_spots: get hotspot + :param need_gene_sequences: get gene sequences + :param need_modules: get modules + :param disable_bar: Allow to disable the progress bar """ annotation = False gene_families = False diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index 5a9a8ca6..bdde667a 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -12,8 +12,23 @@ import tables from gmpy2 import popcount +from ppanggolin.pangenome import Pangenome -def gene_desc(org_len, contig_len, id_len, type_len, name_len, product_len, max_local_id): + +def gene_desc(org_len, contig_len, id_len, type_len, name_len, product_len, max_local_id) -> dict: + """ + Create a table to save gene description + + :param org_len: Maximum size of organism + :param 
contig_len: Maximum size of contigs + :param id_len: Maximum size of gene ID + :param type_len: Maximum size of gene Type + :param name_len: Maximum size of gene name + :param product_len: Maximum size of gene product + :param max_local_id: Maximum size of gene local identifier + + :return: Formatted table + """ return { 'organism': tables.StringCol(itemsize=org_len), "contig": { @@ -36,7 +51,13 @@ def gene_desc(org_len, contig_len, id_len, type_len, name_len, product_len, max_ } -def get_max_len_annotations(pangenome): +def get_max_len_annotations(pangenome: Pangenome) -> (int, int, int, int, int, int, int): + """ + Get the maximum size of each annotation information to optimize saving + + :param pangenome: Annotated pangenome + :return: maximum size of each annotation + """ max_org_len = 1 max_contig_len = 1 max_gene_id_len = 1 @@ -76,17 +97,20 @@ def get_max_len_annotations(pangenome): return max_org_len, max_contig_len, max_gene_id_len, max_type_len, max_name_len, max_product_len, max_local_id -def write_annotations(pangenome, h5f, disable_bar=False): +def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): """ - Function writing all the pangenome annotations + Function writing all the pangenome annotations + + :param pangenome: Annotated pangenome + :param h5f: Pangenome HDF5 file + :param disable_bar: Alow to disable progress bar """ annotation = h5f.create_group("/", "annotations", "Annotations of the pangenome organisms") gene_table = h5f.create_table(annotation, "genes", gene_desc(*get_max_len_annotations(pangenome)), expectedrows=len(pangenome.genes)) - bar = tqdm(pangenome.organisms, unit="genome", disable=disable_bar) gene_row = gene_table.row - for org in bar: + for org in tqdm(pangenome.organisms, total=pangenome.number_of_organisms(), unit="genome", disable=disable_bar): for contig in org.contigs: for gene in contig.genes: gene_row["organism"] = org.name @@ -118,10 +142,15 @@ def write_annotations(pangenome, h5f, 
disable_bar=False): gene_row["gene/is_fragment"] = rna.is_fragment gene_row.append() gene_table.flush() - bar.close() -def get_gene_sequences_len(pangenome): +def get_gene_sequences_len(pangenome: Pangenome) -> (int, int, int): + """ + Get the maximum size of gene sequences to optimize saving + + :param pangenome: Annotated pangenome + :return: maximum size of each annotation + """ max_seq_len = 1 max_gene_id_len = 1 max_gene_type = 1 @@ -135,7 +164,16 @@ def get_gene_sequences_len(pangenome): return max_gene_id_len, max_seq_len, max_gene_type -def gene_sequences_desc(gene_id_len, gene_seq_len, gene_type_len): +def gene_sequences_desc(gene_id_len, gene_seq_len, gene_type_len) -> dict: + """ + Create table to save gene sequences + + :param gene_id_len: Maximum size of gene sequence identifier + :param gene_seq_len: Maximum size of gene sequences + :param gene_type_len: Maximum size of gene type + + :return: Formated table + """ return { "gene": tables.StringCol(itemsize=gene_id_len), "dna": tables.StringCol(itemsize=gene_seq_len), @@ -143,21 +181,35 @@ def gene_sequences_desc(gene_id_len, gene_seq_len, gene_type_len): } -def write_gene_sequences(pangenome, h5f, disable_bar=False): +def write_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): + """ + Function writing all the pangenome gene sequences + + :param pangenome: Pangenome with gene sequences + :param h5f: Pangenome HDF5 file without sequences + :param disable_bar: Disable progress bar + """ gene_seq = h5f.create_table("/", "geneSequences", gene_sequences_desc(*get_gene_sequences_len(pangenome)), expectedrows=len(pangenome.genes)) gene_row = gene_seq.row - bar = tqdm(pangenome.genes, unit="gene", disable=disable_bar) - for gene in bar: + for gene in tqdm(pangenome.genes, total=pangenome.number_of_gene(), unit="gene", disable=disable_bar): gene_row["gene"] = gene.ID gene_row["dna"] = gene.dna gene_row["type"] = gene.type gene_row.append() gene_seq.flush() - bar.close() -def 
gene_fam_desc(max_name_len, max_sequence_length, max_part_len): +def gene_fam_desc(max_name_len: int, max_sequence_length: int, max_part_len: int) -> dict: + """ + Create a formated table for gene families description + + :param max_name_len: Maximum size of gene family name + :param max_sequence_length: Maximum size of gene family representing gene sequences + :param max_part_len: Maximum size of gene family partition + + :return: Formated table + """ return { "name": tables.StringCol(itemsize=max_name_len), "protein": tables.StringCol(itemsize=max_sequence_length), @@ -165,7 +217,14 @@ def gene_fam_desc(max_name_len, max_sequence_length, max_part_len): } -def get_gene_fam_len(pangenome): +def get_gene_fam_len(pangenome: Pangenome) -> (int, int, int): + """ + Get maximum size of gene families information + + :param pangenome: Pangenome with gene families computed + + :return: Maximum size of each element + """ max_gene_fam_name_len = 1 max_gene_fam_seq_len = 1 max_part_len = 1 @@ -179,9 +238,14 @@ def get_gene_fam_len(pangenome): return max_gene_fam_name_len, max_gene_fam_seq_len, max_part_len -def write_gene_fam_info(pangenome, h5f, force, disable_bar=False): +def write_gene_fam_info(pangenome: Pangenome, h5f: tables.File, force: bool = False, disable_bar: bool = False): """ - Writing a table containing the protein sequences of each family + Writing a table containing the protein sequences of each family + + :param pangenome: Pangenome with gene families computed + :param h5f: HDF5 file to write gene families + :param force: force to write information if precedent information exist + :param disable_bar: Disable progress bar """ if '/geneFamiliesInfo' in h5f and force is True: logging.getLogger().info("Erasing the formerly computed gene family representative sequences...") @@ -190,24 +254,38 @@ def write_gene_fam_info(pangenome, h5f, force, disable_bar=False): expectedrows=len(pangenome.gene_families)) row = gene_fam_seq.row - bar = tqdm(pangenome.gene_families, 
unit="gene family", disable=disable_bar) - for fam in bar: + for fam in tqdm(pangenome.gene_families, total=pangenome.number_of_gene_families(), + unit="gene family", disable=disable_bar): row["name"] = fam.name row["protein"] = fam.sequence row["partition"] = fam.partition row.append() gene_fam_seq.flush() - bar.close() -def gene_to_fam_desc(gene_fam_name_len, gene_id_len): +def gene_to_fam_desc(gene_fam_name_len: int, gene_id_len: int) -> dict: + """ + Create a formated table for gene in gene families information + + :param gene_fam_name_len: Maximum size of gene family names + :param gene_id_len: Maximum sizez of gene identifier + + :return: formated table + """ return { "geneFam": tables.StringCol(itemsize=gene_fam_name_len), "gene": tables.StringCol(itemsize=gene_id_len) } -def get_gene_to_fam_len(pangenome): +def get_gene_to_fam_len(pangenome: Pangenome): + """ + Get maximum size of gene in gene families information + + :param pangenome: Pangenome with gene families computed + + :return: Maximum size of each element + """ max_gene_fam_name = 1 max_gene_id = 1 for geneFam in pangenome.gene_families: @@ -219,33 +297,51 @@ def get_gene_to_fam_len(pangenome): return max_gene_fam_name, max_gene_id -def write_gene_families(pangenome, h5f, force, disable_bar=False): +def write_gene_families(pangenome: Pangenome, h5f: tables.File, force: bool = False, disable_bar: bool = False): """ - Function writing all the pangenome gene families + Function writing all the pangenome gene families + + :param pangenome: pangenome with gene families computed + :param h5f: HDF5 file to save pangenome with gene families + :param force: Force to write gene families in hdf5 file if there is already gene families + :param disable_bar: Disable progress bar """ if '/gene_families' in h5f and force is True: logging.getLogger().info("Erasing the formerly computed gene family to gene associations...") h5f.remove_node('/', 'gene_families') # erasing the table, and rewriting a new one. 
gene_families = h5f.create_table("/", "gene_families", gene_to_fam_desc(*get_gene_to_fam_len(pangenome))) gene_row = gene_families.row - bar = tqdm(pangenome.gene_families, unit="gene family", disable=disable_bar) - for geneFam in bar: + for geneFam in tqdm(pangenome.gene_families, total=pangenome.number_of_gene_families(), unit="gene family", + disable=disable_bar): for gene in geneFam.genes: gene_row["gene"] = gene.ID gene_row["geneFam"] = geneFam.name gene_row.append() gene_families.flush() - bar.close() def graph_desc(max_gene_id_len): + """ + Create a formated table for pangenome graph + + :param max_gene_id_len: Maximum size of gene id + + :return: formated table + """ return { 'geneTarget': tables.StringCol(itemsize=max_gene_id_len), 'geneSource': tables.StringCol(itemsize=max_gene_id_len) } -def get_gene_id_len(pangenome): +def get_gene_id_len(pangenome: Pangenome): + """ + Get maximum size of gene id in pangenome graph + + :param pangenome: Pangenome with graph computed + + :return: Maximum size of gene id + """ max_gene_len = 1 for gene in pangenome.genes: if len(gene.ID) > max_gene_len: @@ -253,35 +349,56 @@ def get_gene_id_len(pangenome): return max_gene_len -def write_graph(pangenome, h5f, force, disable_bar=False): - # if we want to be able to read the graph without reading the annotations - # (because it's one of the most time consumming parts to read), - # it might be good to add the organism name in the table here. for now, forcing the read of annotations. 
+def write_graph(pangenome: Pangenome, h5f: tables.File, force: bool = False, disable_bar: bool = False): + """ + Function writing the pangenome graph + + :param pangenome: pangenome with graph computed + :param h5f: HDF5 file to save pangenome graph + :param force: Force to write graph in hdf5 file if there is already one + :param disable_bar: Disable progress bar + """ + # TODO if we want to be able to read the graph without reading the annotations (because it's one of the most time + # consumming parts to read), it might be good to add the organism name in the table here. + # for now, forcing the read of annotations. if '/edges' in h5f and force is True: logging.getLogger().info("Erasing the formerly computed edges") h5f.remove_node("/", "edges") edge_table = h5f.create_table("/", "edges", graph_desc(get_gene_id_len(pangenome)), expectedrows=len(pangenome.edges)) edge_row = edge_table.row - bar = tqdm(pangenome.edges, unit="edge", disable=disable_bar) - for edge in bar: + for edge in tqdm(pangenome.edges, total=pangenome.number_of_edge(), unit="edge", disable=disable_bar): for genePairs in edge.organisms.values(): for gene1, gene2 in genePairs: edge_row["geneTarget"] = gene1.ID edge_row["geneSource"] = gene2.ID edge_row.append() - bar.close() edge_table.flush() def rgp_desc(max_rgp_len, max_gene_len): + """ + Create a formated table for region of genomic plasticity + + :param max_rgp_len: Maximum size of RGP + :param max_gene_len: Maximum sizez of gene + + :return: formated table + """ return { 'RGP': tables.StringCol(itemsize=max_rgp_len), 'gene': tables.StringCol(itemsize=max_gene_len) } -def get_rgp_len(pangenome): +def get_rgp_len(pangenome: Pangenome): + """ + Get maximum size of region of genomic plasticity and gene + + :param pangenome: Pangenome with gene families computed + + :return: Maximum size of each element + """ max_gene_len = 1 max_rgp_len = 1 for region in pangenome.regions: @@ -293,7 +410,15 @@ def get_rgp_len(pangenome): return max_rgp_len, 
max_gene_len -def write_rgp(pangenome, h5f, force, disable_bar=False): +def write_rgp(pangenome: Pangenome, h5f: tables.File, force: bool = False, disable_bar: bool = False): + """ + Function writing all the region of genomic plasticity in pangenome + + :param pangenome: pangenome with RGP computed + :param h5f: HDF5 file to save pangenome with RGP + :param force: Force to write gene families in hdf5 file if there is already RGP + :param disable_bar: Disable progress bar + """ if '/RGP' in h5f and force is True: logging.getLogger().info("Erasing the formerly computer RGP") h5f.remove_node('/', 'RGP') @@ -301,24 +426,36 @@ def write_rgp(pangenome, h5f, force, disable_bar=False): rgp_table = h5f.create_table('/', 'RGP', rgp_desc(*get_rgp_len(pangenome)), expectedrows=sum([len(region.genes) for region in pangenome.regions])) rgp_row = rgp_table.row - bar = tqdm(pangenome.regions, unit="region", disable=disable_bar) - for region in bar: + for region in tqdm(pangenome.regions, total=pangenome.number_of_rgp(), unit="region", disable=disable_bar): for gene in region.genes: rgp_row["RGP"] = region.name rgp_row["gene"] = gene.ID rgp_row.append() - bar.close() rgp_table.flush() def spot_desc(max_rgp_len): + """ + Create a formated table for hotspot + + :param max_rgp_len: Maximum size of RGP + + :return: formated table + """ return { 'spot': tables.UInt32Col(), 'RGP': tables.StringCol(itemsize=max_rgp_len) } -def get_spot_desc(pangenome): +def get_spot_desc(pangenome: Pangenome): + """ + Get maximum size of region of genomic plasticity in hotspot + + :param pangenome: Pangenome with gene families computed + + :return: Maximum size of each element + """ max_rgp_len = 1 for spot in pangenome.spots: for region in spot.regions: @@ -327,7 +464,15 @@ def get_spot_desc(pangenome): return max_rgp_len -def write_spots(pangenome, h5f, force, disable_bar=False): +def write_spots(pangenome: Pangenome, h5f: tables.File, force: bool = False, disable_bar: bool = False): + """ + Function 
writing all the pangenome hotspot + + :param pangenome: pangenome with spot computed + :param h5f: HDF5 file to save pangenome with spot + :param force: Force to write gene families in hdf5 file if there is already spot + :param disable_bar: Disable progress bar + """ if '/spots' in h5f and force is True: logging.getLogger().info("Erasing the formerly computed spots") h5f.remove_node("/", "spots") @@ -335,25 +480,36 @@ def write_spots(pangenome, h5f, force, disable_bar=False): spot_table = h5f.create_table("/", "spots", spot_desc(get_spot_desc(pangenome)), expectedrows=sum([len(spot.regions) for spot in pangenome.spots])) spot_row = spot_table.row - bar = tqdm(pangenome.spots, unit="spot", disable=disable_bar) - for spot in pangenome.spots: + for spot in tqdm(pangenome.spots, total=pangenome.number_of_spots(), unit="spot", disable=disable_bar): for region in spot.regions: spot_row["spot"] = spot.ID spot_row["RGP"] = region.name spot_row.append() - bar.update() - bar.close() spot_table.flush() def mod_desc(gene_fam_name_len): + """ + Create a formated table for hotspot + + :param gene_fam_name_len: Maximum size of gene families name + + :return: formated table + """ return { "geneFam": tables.StringCol(itemsize=gene_fam_name_len), "module": tables.UInt32Col(), } -def get_mod_desc(pangenome): +def get_mod_desc(pangenome: Pangenome): + """ + Get maximum size of gene families name in modules + + :param pangenome: Pangenome with modules computed + + :return: Maximum size of each element + """ max_fam_len = 1 for mod in pangenome.modules: for fam in mod.families: @@ -362,7 +518,15 @@ def get_mod_desc(pangenome): return max_fam_len -def write_modules(pangenome, h5f, force, disable_bar=False): +def write_modules(pangenome: Pangenome, h5f: tables.File, force: bool = False, disable_bar: bool = False): + """ + Function writing all the pangenome modules + + :param pangenome: pangenome with spot computed + :param h5f: HDF5 file to save pangenome with spot + :param force: Force 
to write gene families in hdf5 file if there is already spot + :param disable_bar: Disable progress bar + """ if '/modules' in h5f and force is True: logging.getLogger().info("Erasing the formerly computed modules") h5f.remove_node("/", "modules") @@ -371,17 +535,21 @@ def write_modules(pangenome, h5f, force, disable_bar=False): expectedrows=sum([len(mod.families) for mod in pangenome.modules])) mod_row = mod_table.row - bar = tqdm(pangenome.modules, unit="modules", disable=disable_bar) - for mod in bar: + for mod in tqdm(pangenome.modules, total=pangenome.number_of_modules(), unit="modules", disable=disable_bar): for fam in mod.families: mod_row["geneFam"] = fam.name mod_row["module"] = mod.ID mod_row.append() - bar.close() mod_table.flush() -def write_status(pangenome, h5f): +def write_status(pangenome: Pangenome, h5f: tables.File): + """ + Write pangenome status in HDF5 file + + :param pangenome: Pangenome object + :param h5f: Pangenome file + """ if "/status" in h5f: # if statuses are already written status_group = h5f.root.status else: # else create the status group. 
@@ -398,7 +566,7 @@ def write_status(pangenome, h5f): status_group._v_attrs.NeighborsGraph = True if pangenome.status["neighborsGraph"] in ["Computed", "Loaded", "inFile"] else False status_group._v_attrs.Partitioned = True if pangenome.status["partitioned"] in ["Computed", "Loaded", - "inFile"] else False + "inFile"] else False status_group._v_attrs.defragmented = True if pangenome.status["defragmented"] in ["Computed", "Loaded", "inFile"] else False status_group._v_attrs.predictedRGP = True if pangenome.status["predictedRGP"] in ["Computed", "Loaded", @@ -408,32 +576,49 @@ def write_status(pangenome, h5f): status_group._v_attrs.version = pkg_resources.get_distribution("ppanggolin").version -def write_info(pangenome, h5f): - """ writes information and numbers to be eventually called with the 'info' submodule """ +def write_info(pangenome: Pangenome, h5f: tables.File): + """ + Writes information and numbers to be eventually called with the 'info' submodule + + :param pangenome: Pangenome object with some information computed + :param h5f: Pangenome file to save information + """ - def getmean(arg): - if len(arg) == 0: - return 0 - else: - return round(statistics.mean(arg), 2) + def getmean(arg: iter) -> float: + """ Compute the mean of arguments if exist 0 else - def getstdev(arg): - if len(arg) <= 1: - return 0 - else: - return round(statistics.stdev(arg), 2) + :param arg: list of values - def getmax(arg): - if len(arg) == 0: - return 0 - else: - return round(max(arg), 2) + :return: return the mean + """ + return 0 if len(arg) == 0 else round(statistics.mean(arg), 2) - def getmin(arg): - if len(arg) == 0: - return 0 - else: - return round(min(arg), 2) + def getstdev(arg: iter) -> float: + """ Compute the standard deviation of arguments if exist 0 else + + :param arg: list of values + + :return: return the sd + """ + return 0 if len(arg) <= 1 else round(statistics.stdev(arg), 2) + + def getmax(arg: iter) -> float: + """ Get the maximum of arguments if exist 0 else 
+ + :param arg: list of values + + :return: return the maximum + """ + return 0 if len(arg) == 0 else round(max(arg), 2) + + def getmin(arg: iter) -> float: + """ Get the minimum of arguments if exist 0 else + + :param arg: list of values + + :return: return the minimum + """ + return 0 if len(arg) == 0 else round(min(arg), 2) if "/info" in h5f: info_group = h5f.root.info @@ -485,37 +670,63 @@ def getmin(arg): info_group._v_attrs.parameters = pangenome.parameters # saving the pangenome parameters -def write_info_modules(pangenome, h5f): - def getmean(arg): - if len(arg) == 0: - return 0 - else: - return round(statistics.mean(arg), 2) +def write_info_modules(pangenome: Pangenome, h5f: tables.File): + """ + Writes more information about modules if computed by metrics subpackage + + :param pangenome: Pangenome object with some information computed + :param h5f: Pangenome file to save information + """ + + def getmean(arg: iter) -> float: + """ Compute the mean of arguments if exist 0 else + + :param arg: list of values + + :return: return the mean + """ + return 0 if len(arg) == 0 else round(statistics.mean(arg), 2) + + def getstdev(arg: iter) -> float: + """ Compute the standard deviation of arguments if exist 0 else - def getstdev(arg): - if len(arg) <= 1: - return 0 - else: - return round(statistics.stdev(arg), 2) + :param arg: list of values - def getmax(arg): - if len(arg) == 0: - return 0 - else: - return round(max(arg), 2) + :return: return the sd + """ + return 0 if len(arg) <= 1 else round(statistics.stdev(arg), 2) - def getmin(arg): - if len(arg) == 0: - return 0 - else: - return round(min(arg), 2) + def getmax(arg: iter) -> float: + """ Get the maximum of arguments if exist 0 else + + :param arg: list of values + + :return: return the maximum + """ + return 0 if len(arg) == 0 else round(max(arg), 2) + + def getmin(arg: iter) -> float: + """ Get the minimum of arguments if exist 0 else + + :param arg: list of values + + :return: return the minimum + """ + 
return 0 if len(arg) == 0 else round(min(arg), 2) if "/info" not in h5f: write_info(pangenome, h5f) info_group = h5f.root.info if pangenome.status["modules"] in ["Computed", "Loaded"]: - def part_spec(part): + def part_spec(part: str) -> list: + """ + Get the list of module for a specific partition of pangenome + + :param part: pangenome partition name + + :return: list of module specific to partition + """ pangenome.compute_mod_bitarrays(part) return [popcount(module.bitarray) for module in pangenome.modules] @@ -546,36 +757,51 @@ def part_spec(part): raise Exception("Modules were not computed in your pangenome. Please see the module subcommand.") -def update_gene_fam_partition(pangenome, h5f, disable_bar=False): +def update_gene_fam_partition(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): + """ + Update the gene families table with partition information + + :param pangenome: Partitioned pangenome + :param h5f: HDF5 file with gene families + :param disable_bar: Allow to disable progress bar + """ logging.getLogger().info("Updating gene families with partition information") table = h5f.root.geneFamiliesInfo - bar = tqdm(range(table.nrows), unit="gene family", disable=disable_bar) - for row in table: + for row in tqdm(table, total=table.nrows, unit="gene family", disable=disable_bar): row["partition"] = pangenome.get_gene_family(row["name"].decode()).partition row.update() - bar.update() - bar.close() -def update_gene_fragments(pangenome, h5f, disable_bar=False): +def update_gene_fragments(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): """ - updates the annotation table with the fragmentation information from the defrag pipeline + Updates the annotation table with the fragmentation information from the defrag pipeline + + :param pangenome: Annotated pangenome + :param h5f: HDF5 pangenome file + :param disable_bar: Allow to disable progress bar """ logging.getLogger().info("Updating annotations with fragment information") 
table = h5f.root.annotations.genes - bar = tqdm(range(table.nrows), unit="gene", disable=disable_bar) - for row in table: + for row in tqdm(table, total=table.nrows, unit="gene", disable=disable_bar): if row['gene/type'].decode() == 'CDS': row['gene/is_fragment'] = pangenome.get_gene(row['gene/ID'].decode()).is_fragment row.update() - bar.update() - bar.close() table.flush() -def erase_pangenome(pangenome, graph=False, gene_families=False, partition=False, rgp=False, spots=False, - modules=False): - """ erases tables from a pangenome .h5 file """ +def erase_pangenome(pangenome: Pangenome, graph: bool = False, gene_families: bool = False, partition: bool = False, + rgp: bool = False, spots: bool = False, modules: bool = False): + """ + Erases tables from a pangenome .h5 file + + :param pangenome: Pangenome + :param graph: remove graph information + :param gene_families: remove gene families information + :param partition: remove partition information + :param rgp: remove rgp information + :param spots: remove spots information + :param modules: remove modules information + """ h5f = tables.open_file(pangenome.file, "a") status_group = h5f.root.status @@ -650,10 +876,14 @@ def erase_pangenome(pangenome, graph=False, gene_families=False, partition=False h5f.close() -def write_pangenome(pangenome, filename, force, disable_bar=False): +def write_pangenome(pangenome: Pangenome, filename, force: bool = False, disable_bar: bool = False): """ - Writes or updates a pangenome file - pangenome is the corresponding pangenome object, filename the h5 file and status what has been modified. 
+ Writes or updates a pangenome file + + :param pangenome: pangenome object + :param filename: HDF5 file to save pangenome + :param force: force to write on pangenome if information already exist + :param disable_bar: Allow to disable progress bar """ if pangenome.status["genomesAnnotated"] == "Computed": diff --git a/ppanggolin/formats/writeFlat.py b/ppanggolin/formats/writeFlat.py index f5f032a5..14686964 100644 --- a/ppanggolin/formats/writeFlat.py +++ b/ppanggolin/formats/writeFlat.py @@ -6,11 +6,15 @@ from multiprocessing import get_context from collections import Counter, defaultdict import logging +from typing import TextIO import pkg_resources from statistics import median, mean, stdev import os # local libraries +from ppanggolin.edge import Edge +from ppanggolin.geneFamily import GeneFamily +from ppanggolin.genome import Organism from ppanggolin.pangenome import Pangenome from ppanggolin.utils import write_compressed_or_not, mk_outdir, restricted_float from ppanggolin.formats.readBinaries import check_pangenome_info @@ -27,7 +31,11 @@ ignore_err = False -def write_json_header(json): +def write_json_header(json: TextIO): + """Write the header of json file to save graph + + :param json: file-like object, compressed or not + """ json.write('{"directed": false, "multigraph": false,') json.write(' "graph": {') json.write(' "organisms": {') @@ -36,8 +44,8 @@ def write_json_header(json): orgstr.append('"' + org.name + '": {') contigstr = [] for contig in org.contigs: - contigstr.append( - '"' + contig.name + '": {"is_circular": ' + ('true' if contig.is_circular else 'false') + '}') + contigstr.append(f'"{contig.name}": ' + '{"is_circular: ' + + ('true' if contig.is_circular else 'false') + '}') orgstr[-1] += ', '.join(contigstr) + "}" json.write(', '.join(orgstr) + "}") @@ -45,7 +53,12 @@ def write_json_header(json): json.write('},') -def write_json_gene_fam(gene_fam, json): +def write_json_gene_fam(gene_fam: GeneFamily, json: TextIO): + """Write the gene 
families corresponding to node graph in json file + + :param gene_fam: file-like object, compressed or not + :param json: file-like object, compressed or not + """ json.write('{' + f'"id": "{gene_fam.name}", "nb_genes": {len(gene_fam.genes)}, ' f'"partition": "{gene_fam.named_partition}", "subpartition": "{gene_fam.partition}"' + '}') org_dict = {} @@ -86,7 +99,11 @@ def write_json_gene_fam(gene_fam, json): json.write(", ".join(orgstr) + "}}") -def write_json_nodes(json): +def write_json_nodes(json: TextIO): + """Write the node graph in json file + + :param json: file-like object, compressed or not + """ json.write('"nodes": [') fam_list = list(pan.gene_families) first_fam = fam_list[0] @@ -97,7 +114,12 @@ def write_json_nodes(json): json.write(']') -def write_json_edge(edge, json): +def write_json_edge(edge: Edge, json: TextIO): + """Write the edge graph in json file + + :param edge: file-like object, compressed or not + :param json: file-like object, compressed or not + """ json.write("{") json.write(f'"weight": {len(edge.gene_pairs)}, "source": "{edge.source.name}", "target": "{edge.target.name}"') json.write(', "organisms": {') @@ -113,8 +135,12 @@ def write_json_edge(edge, json): def write_json_edges(json): + """Write the edge graph in json file + + :param json: file-like object, compressed or not + """ json.write(', "links": [') - edgelist = list(pan.edges) + edgelist = pan.edges write_json_edge(edgelist[0], json) for edge in edgelist[1:]: json.write(", ") @@ -122,7 +148,12 @@ def write_json_edges(json): json.write(']') -def write_json(output, compress): +def write_json(output: str, compress: bool = False): + """Writes the graph in a json file format + + :param output: Path to output directory + :param compress: Compress the file in .gz + """ logging.getLogger().info("Writing the json file for the pangenome graph...") outname = output + "/pangenomeGraph.json" with write_compressed_or_not(outname, compress) as json: @@ -133,10 +164,15 @@ def write_json(output, 
compress): logging.getLogger().info(f"Done writing the json file : '{outname}'") -def write_gexf_header(gexf, light): +def write_gexf_header(gexf: TextIO, light: bool = True): + """Write the header of gexf file to save graph + + :param gexf: file-like object, compressed or not + :param light: save the light version of the pangenome graph + """ index = None if not light: - index = pan.get_index() # has been computed already + index = pan.get_org_index() # has been computed already gexf.write('\n\n') # TODO update link gexf.write(' \n') @@ -168,13 +204,19 @@ def write_gexf_header(gexf, light): gexf.write(' \n') -def write_gexf_nodes(gexf, light, soft_core=0.95): +def write_gexf_nodes(gexf: TextIO, light: bool = True, soft_core: False = 0.95): + """Write the node of pangenome graph in gexf file + + :param gexf: file-like object, compressed or not + :param light: save the light version of the pangenome graph + :param soft_core: Soft core threshold to use + """ index = None gexf.write(' \n') colors = {"persistent": 'a="0" b="7" g="165" r="247"', 'shell': 'a="0" b="96" g="216" r="0"', 'cloud': 'a="0" b="255" g="222" r="121"'} if not light: - index = pan.get_index() + index = pan.get_org_index() for fam in pan.gene_families: name = Counter() @@ -216,10 +258,15 @@ def write_gexf_nodes(gexf, light, soft_core=0.95): gexf.write(' \n') -def write_gexf_edges(gexf, light): +def write_gexf_edges(gexf: TextIO, light: bool = True): + """Write the edge of pangenome graph in gexf file + + :param gexf: file-like object, compressed or not + :param light: save the light version of the pangenome graph + """ gexf.write(' \n') edgeids = 0 - index = pan.get_index() + index = pan.get_org_index() for edge in pan.edges: gexf.write(f' ") gexf.write("") -def write_gexf(output, light=True, compress=False): - txt = "Writing the gexf file for the pangenome graph..." - if light: - txt = "Writing the light gexf file for the pangenome graph..." 
+def write_gexf(output: str, light: bool = True, compress: bool = False): + """Write the node of pangenome in gexf file + + :param output: Path to output directory + :param light: save the light version of the pangenome graph + :param compress: Compress the file in .gz + """ + txt = "Writing the " + txt += "light gexf file for the pangenome graph..." if light else "gexf file for the pangenome graph..." + logging.getLogger().info(txt) outname = output + "/pangenomeGraph" outname += "_light" if light else "" @@ -257,7 +314,17 @@ def write_gexf(output, light=True, compress=False): logging.getLogger().info(f"Done writing the gexf file : '{outname}'") -def write_matrix(sep, ext, output, compress=False, gene_names=False): +def write_matrix(output: str, sep: str = ',', ext: str = 'csv', compress: bool = False, gene_names: bool = False): + """ + Write a csv file format as used by Roary, among others. + The alternative gene ID will be the partition, if there is one + + :param sep: Column field separator + :param ext: file extension + :param output: Path to output directory + :param compress: Compress the file in .gz + :param gene_names: write the genes name if there are saved in pangenome + """ logging.getLogger().info(f"Writing the .{ext} file ...") outname = output + "/matrix." 
+ ext with write_compressed_or_not(outname, compress) as matrix: @@ -284,15 +351,15 @@ def write_matrix(sep, ext, output, compress=False, gene_names=False): '"Avg group size nuc"'] # 14 + ['"' + str(org) + '"' for org in pan.organisms]) + "\n") # 15 default_genes = ['""'] * len(pan.organisms) if gene_names else ["0"] * len(pan.organisms) - org_index = pan.get_index() # should just return things + org_index = pan.get_org_index() # should just return things for fam in pan.gene_families: genes = default_genes.copy() lis = [] genenames = Counter() product = Counter() for org, gene_list in fam.get_org_dict().items(): - genes[org_index[org]] = " ".join(['"' + str(gene) + '"' for gene in gene_list]) if gene_names else str( - len(gene_list)) + genes[org_index[org]] = " ".join(['"' + str(gene) + + '"' for gene in gene_list]) if gene_names else str(len(gene_list)) for gene in gene_list: lis.append(gene.stop - gene.start) product[gene.product] += 1 @@ -322,7 +389,13 @@ def write_matrix(sep, ext, output, compress=False, gene_names=False): logging.getLogger().info(f"Done writing the matrix : '{outname}'") -def write_gene_presence_absence(output, compress=False): +def write_gene_presence_absence(output: str, compress: bool = False): + """ + Write the gene presence absence matrix + + :param output: Path to output directory + :param compress: Compress the file in .gz + """ logging.getLogger().info(f"Writing the gene presence absence file ...") outname = output + "/gene_presence_absence.Rtab" with write_compressed_or_not(outname, compress) as matrix: @@ -332,10 +405,10 @@ def write_gene_presence_absence(output, compress=False): default_dat.append('0') index_org[org] = index - matrix.write('\t'.join(['Gene'] # 14 - + [str(org) for org in pan.organisms]) + "\n") # 15 + matrix.write('\t'.join(['Gene'] + # 14 + [str(org) for org in pan.organisms]) + "\n") # 15 default_genes = ["0"] * len(pan.organisms) - org_index = pan.get_index() # should just return things + org_index = 
pan.get_org_index() # should just return things for fam in pan.gene_families: genes = default_genes.copy() for org in fam.organisms: @@ -346,14 +419,22 @@ def write_gene_presence_absence(output, compress=False): logging.getLogger().info(f"Done writing the gene presence absence file : '{outname}'") -def write_stats(output, soft_core, dup_margin, compress=False): +def write_stats(output: str, soft_core: float = 0.95, dup_margin: float = 0.05, compress: bool = False): + """ + Write pangenome statistics + + :param output: Path to output directory + :param soft_core: Soft core threshold to use + :param dup_margin: minimum ratio of organisms in which family must have multiple genes to be considered duplicated + :param compress: Compress the file in .gz + """ logging.getLogger().info("Writing pangenome statistics...") logging.getLogger().info("Writing statistics on persistent duplication...") single_copy_markers = set() # could use bitarrays if speed is needed with write_compressed_or_not(output + "/mean_persistent_duplication.tsv", compress) as outfile: outfile.write(f"#duplication_margin={round(dup_margin, 3)}\n") - outfile.write( - "\t".join(["persistent_family", "duplication_ratio", "mean_presence", "is_single_copy_marker"]) + "\n") + outfile.write("\t".join(["persistent_family", "duplication_ratio", "mean_presence", "is_single_copy_marker"]) + + "\n") for fam in pan.gene_families: if fam.named_partition == "persistent": mean_pres = len(fam.genes) / len(fam.organisms) @@ -439,7 +520,14 @@ def write_stats(output, soft_core, dup_margin, compress=False): logging.getLogger().info("Done writing genome per genome statistics") -def write_org_file(org, output, compress=False): +def write_org_file(org: Organism, output: str, compress: bool = False): + """ + Write the projection of pangenome for one organism + + :param org: Projected organism + :param output: Path to output directory + :param compress: Compress the file in .gz + """ with write_compressed_or_not(output + "/" + 
org.name + ".tsv", compress) as outfile: header = ["gene", "contig", "start", "stop", "strand", "family", "nb_copy_in_org", "partition", "persistent_neighbors", "shell_neighbors", "cloud_neighbors"] @@ -484,7 +572,13 @@ def write_org_file(org, output, compress=False): outfile.write("\t".join(map(str, row)) + "\n") -def write_projections(output, compress=False): +def write_projections(output: str, compress: bool = False): + """ + Write the projection of pangenome for all organisms + + :param output: Path to output directory + :param compress: Compress the file in .gz + """ logging.getLogger().info("Writing the projection files...") outdir = output + "/projection" if not os.path.exists(outdir): @@ -494,8 +588,14 @@ def write_projections(output, compress=False): logging.getLogger().info("Done writing the projection files") -def write_parts(output, soft_core): - logging.getLogger().info("Writing the list of gene families for each partitions...") +def write_parts(output: str, soft_core: float = 0.95): + """ + Write the list of gene families for each partition + + :param output: Path to output directory + :param soft_core: Soft core threshold to use + """ + logging.getLogger().info("Writing the list of gene families for each partition ...") if not os.path.exists(output + "/partitions"): os.makedirs(output + "/partitions") part_sets = defaultdict(set) @@ -525,7 +625,13 @@ def write_parts(output, soft_core): logging.getLogger().info("Done writing the list of gene families for each partition") -def write_gene_families_tsv(output, compress=False): +def write_gene_families_tsv(output: str, compress: bool = False): + """ + Write the file providing the association between genes and gene families + + :param output: Path to output directory + :param compress: Compress the file in .gz + """ logging.getLogger().info("Writing the file providing the association between genes and gene families...") outname = output + "/gene_families.tsv" with write_compressed_or_not(outname, compress) 
as tsv: @@ -533,11 +639,17 @@ def write_gene_families_tsv(output, compress=False): for gene in fam.genes: tsv.write("\t".join([fam.name, gene.ID if gene.local_identifier == "" else gene.local_identifier, "F" if gene.is_fragment else ""]) + "\n") - logging.getLogger().info( - f"Done writing the file providing the association between genes and gene families : '{outname}'") + logging.getLogger().info("Done writing the file providing the association between genes and " + f"gene families : '{outname}'") def write_regions(output, compress=False): + """ + Write the file providing information about RGP content + + :param output: Path to output directory + :param compress: Compress the file in .gz + """ fname = output + "/plastic_regions.tsv" with write_compressed_or_not(fname, compress) as tab: tab.write("region\torganism\tcontig\tstart\tstop\tgenes\tcontigBorder\twholeContig\n") @@ -547,13 +659,17 @@ def write_regions(output, compress=False): len(region.genes), region.is_contig_border, region.is_whole_contig])) + "\n") -def summarize_spots(spots, output, compress): - def r_and_s(value): - """ rounds to dp figures and returns a str of the provided value""" - if isinstance(value, float): - return str(round(value, 3)) - else: - return str(value) +def summarize_spots(spots: set, output: str, compress: bool = False): + """ + Write a file providing summarize information about hotspots + + :param spots: set of spots in pangenome + :param output: Path to output directory + :param compress: Compress the file in .gz + """ + def r_and_s(value: float): + """rounds to dp figures and returns a str of the provided value""" + return str(round(value, 3)) if isinstance(value, float) else str(value) with write_compressed_or_not(output + "/summarize_spots.tsv", compress) as fout: fout.write("spot\tnb_rgp\tnb_families\tnb_unique_family_sets\tmean_nb_genes\t" @@ -570,13 +686,18 @@ def r_and_s(value): stdev_size = stdev(size_list) if len(size_list) > 1 else 0 max_size = max(size_list) min_size 
= min(size_list) - fout.write("\t".join(map(r_and_s, - [f"spot_{spot.ID}", len(rgp_list), len(tot_fams), len_uniq_content, mean_size, - stdev_size, max_size, min_size])) + "\n") + fout.write("\t".join(map(r_and_s, [f"spot_{spot.ID}", len(rgp_list), len(tot_fams), len_uniq_content, + mean_size, stdev_size, max_size, min_size])) + "\n") logging.getLogger().info(f"Done writing spots in : '{output + '/summarize_spots.tsv'}'") -def spot2rgp(spots, output, compress): +def spot2rgp(spots: set, output: str, compress: bool = False): + """Write a tsv file providing association between spot and rgp + + :param spots: set of spots in pangenome + :param output: Path to output directory + :param compress: Compress the file in .gz + """ with write_compressed_or_not(output + "/spots.tsv", compress) as fout: fout.write("spot_id\trgp_id\n") for spot in spots: @@ -585,12 +706,23 @@ def spot2rgp(spots, output, compress): def write_spots(output, compress): + """ Write tsv files providing spots information and association with RGP + + :param output: Path to output directory + :param compress: Compress the file in .gz + """ if len(pan.spots) > 0: spot2rgp(pan.spots, output, compress) summarize_spots(pan.spots, output, compress) -def write_borders(output, dup_margin, compress): +def write_borders(output: str, dup_margin: float = 0.05, compress: bool = False): + """Write all gene families bordering each spot + + :param output: Path to output directory + :param compress: Compress the file in .gz + :param dup_margin: minimum ratio of organisms in which family must have multiple genes to be considered duplicated + """ multigenics = pan.get_multigenics(dup_margin=dup_margin) all_fams = set() with write_compressed_or_not(output + "/spot_borders.tsv", compress) as fout: @@ -610,7 +742,13 @@ def write_borders(output, dup_margin, compress): fout.write(f"{fam.sequence}\n") -def write_module_summary(output, compress): +def write_module_summary(output: str, compress: bool = False): + """ + Write a 
file providing summarize information about modules + + :param output: Path to output directory + :param compress: Compress the file in .gz + """ logging.getLogger().info("Writing functional modules summary...") with write_compressed_or_not(output + "/modules_summary.tsv", compress) as fout: fout.write("module_id\tnb_families\tnb_organisms\tpartition\tmean_number_of_occurrence\n") @@ -629,7 +767,12 @@ def write_module_summary(output, compress): logging.getLogger().info(f"Done writing module summary: '{output + '/modules_summary.tsv'}'") -def write_modules(output, compress): +def write_modules(output: str, compress: bool = False): + """Write a tsv file providing association between modules and gene families + + :param output: Path to output directory + :param compress: Compress the file in .gz + """ logging.getLogger().info("Writing functional modules...") with write_compressed_or_not(output + "/functional_modules.tsv", compress) as fout: fout.write("module_id\tfamily_id\n") @@ -642,6 +785,11 @@ def write_modules(output, compress): def write_org_modules(output, compress): + """Write a tsv file providing association between modules and organisms + + :param output: Path to output directory + :param compress: Compress the file in .gz + """ logging.getLogger().info("Writing modules to organisms associations...") with write_compressed_or_not(output + "/modules_in_organisms.tsv", compress) as fout: fout.write("module_id\torganism\tcompletion\n") @@ -658,6 +806,11 @@ def write_org_modules(output, compress): def write_spot_modules(output, compress): + """Write a tsv file providing association between modules and spots + + :param output: Path to output directory + :param compress: Compress the file in .gz + """ logging.getLogger().info("Writing modules to spot associations...") fam2mod = {} @@ -685,6 +838,11 @@ def write_spot_modules(output, compress): def write_rgp_modules(output, compress): + """Write a tsv file providing association between modules and RGP + + :param 
output: Path to output directory + :param compress: Compress the file in .gz + """ logging.getLogger().info("Clustering RGPs based on module content...") lists = write_compressed_or_not(output + "/modules_RGP_lists.tsv", compress) @@ -723,10 +881,38 @@ def write_rgp_modules(output, compress): logging.getLogger().info(f"RGP and associated modules are listed in : {output + '/modules_RGP_lists.tsv'}") -def write_flat_files(pangenome, output, cpu=1, soft_core=0.95, dup_margin=0.05, csv=False, gene_pa=False, gexf=False, - light_gexf=False, projection=False, stats=False, json=False, partitions=False, regions=False, - families_tsv=False, spots=False, borders=False, modules=False, spot_modules=False, compress=False, - disable_bar=False): +def write_flat_files(pangenome: Pangenome, output: str, cpu: int = 1, soft_core: float = 0.95, dup_margin: float = 0.05, + csv: bool = False, gene_pa: bool = False, gexf: bool = False, light_gexf: bool = False, + projection: bool = False, stats: bool = False, json: bool = False, partitions: bool = False, + regions: bool = False, families_tsv: bool = False, spots: bool = False, borders: bool = False, + modules: bool = False, spot_modules: bool = False, compress: bool = False, + disable_bar: bool = False): + """ + Main function to write flat files from pangenome + + :param pangenome: Pangenome object + :param output: Path to output directory + :param cpu: Number of available core + :param soft_core: Soft core threshold to use + :param dup_margin: minimum ratio of organisms in which family must have multiple genes to be considered duplicated + :param csv: write csv file format as used by Roary + :param gene_pa: write gene presence abscence matrix + :param gexf: write pangenome graph in gexf format + :param light_gexf: write pangenome graph with only gene families + :param projection: write projection of pangenome for organisms + :param stats: write statistics about pangenome + :param json: write pangenome graph in json file + :param 
partitions: write the gene families for each partition + :param regions: write information on RGP + :param families_tsv: write gene families information + :param spots: write information on spots + :param borders: write gene families bordering spots + :param modules: write information about modules + :param spot_modules: write association between modules and RGP and modules and spots + :param compress: Compress the file in .gz + :param disable_bar: Disable progress bar + """ + # TODO Add force parameter to check if output already exist if not any(x for x in [csv, gene_pa, gexf, light_gexf, projection, stats, json, partitions, regions, spots, borders, families_tsv, modules, spot_modules]): raise Exception("You did not indicate what file you wanted to write.") @@ -767,10 +953,10 @@ def write_flat_files(pangenome, output, cpu=1, soft_core=0.95, dup_margin=0.05, need_partitions=needPartitions, need_rgp=needRegions, need_spots=needSpots, need_modules=needModules, disable_bar=disable_bar) - pan.get_index() # make the index because it will be used most likely + pan.get_org_index() # make the index because it will be used most likely with get_context('fork').Pool(processes=cpu) as p: if csv: - processes.append(p.apply_async(func=write_matrix, args=(',', "csv", output, compress, True))) + processes.append(p.apply_async(func=write_matrix, args=(output, ',', "csv", compress, True))) if gene_pa: processes.append(p.apply_async(func=write_gene_presence_absence, args=(output, compress))) if gexf: @@ -805,7 +991,12 @@ def write_flat_files(pangenome, output, cpu=1, soft_core=0.95, dup_margin=0.05, process.get() # get all the results -def launch(args): +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ mk_outdir(args.output, args.force) global pan pan.add_file(args.pangenome) @@ -816,13 +1007,25 @@ def launch(args): spot_modules=args.spot_modules, compress=args.compress, disable_bar=args.disable_prog_bar) -def 
subparser(sub_parser): +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in Command line + + :param sub_parser : sub_parser for align command + + :return : parser arguments for align command + """ parser = sub_parser.add_parser("write", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser_flat(parser) return parser -def parser_flat(parser): +def parser_flat(parser: argparse.ArgumentParser): + """ + Parser for specific argument of write command + + :param parser: parser for align argument + """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index 17edf966..557be9fd 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -13,13 +13,23 @@ from tqdm import tqdm # local libraries +from ppanggolin.geneFamily import GeneFamily from ppanggolin.pangenome import Pangenome from ppanggolin.utils import mk_outdir, restricted_float from ppanggolin.formats.readBinaries import check_pangenome_info from ppanggolin.genetic_codes import genetic_codes -def get_families_to_write(pangenome, partition_filter, soft_core=0.95): +def get_families_to_write(pangenome: Pangenome, partition_filter: str = 'core', soft_core: float = 0.95): + """ + Get families corresponding to the given partition + + :param pangenome: Partitioned pangenome + :param partition_filter: choice of partition to compute Multiple Sequence Alignement of the gene families + :param soft_core: Soft core threshold to use + + :return: set of families unique to one partition + """ fams = set() if partition_filter == "all": return set(pangenome.gene_families) @@ -44,8 +54,14 @@ def get_families_to_write(pangenome, partition_filter, soft_core=0.95): return fams -def 
translate(seq, code): - """ translates the given dna sequence with the given translation table""" +def translate(seq: str, code: dict): + """translates the given dna sequence with the given translation table + + :param seq: given dna sequence + :param code: translation table corresponding to genetic code to use + + :return: protein sequence + """ # code: https://www.bioinformatics.org/sms/iupac.html start_table = code["start_table"] table = code["trans_table"] @@ -59,12 +75,22 @@ def translate(seq, code): except KeyError: # codon was not planned for. Probably can't determine it. protein += 'X' # X is for unknown else: - raise IndexError( - "Given sequence length modulo 3 was different than 0, which is unexpected.") + raise IndexError("Given sequence length modulo 3 was different than 0, which is unexpected.") return protein -def write_fasta_families(family, tmpdir, source, use_gene_id, code_table): +def write_fasta_families(family: GeneFamily, tmpdir: tempfile.TemporaryDirectory, code_table: dict, + source: str = 'protein', use_gene_id: bool = False): + """Write fasta files for each gene family + + :param family: gene family to write + :param tmpdir: path to temporary directory + :param source: indicates whether to use protein or dna sequences to compute the msa + :param use_gene_id: Use gene identifiers rather than organism names for sequences in the family MSA + :param code_table: Genetic code to use + + :return: path to fasta file + """ # have a directory for each gene family, to make deletion of tmp files simpler f_name = tmpdir.name + "/" + family.name + ".fasta" @@ -92,31 +118,55 @@ def write_fasta_families(family, tmpdir, source, use_gene_id, code_table): def launch_mafft(fname, output, fam_name): + """ + Compute the MSA with mafft + + :param fname: family gene sequence in fasta + :param output: directory to save alignment + :param fam_name: Name of the gene family + """ outname = output + "/" + fam_name + ".aln" cmd = ["mafft", "--thread", "1", fname] 
logging.getLogger().debug("command: " + " ".join(cmd)) - subprocess.run(cmd, stdout=open(outname, "w"), stderr=subprocess.DEVNULL, check=True) # + subprocess.run(cmd, stdout=open(outname, "w"), stderr=subprocess.DEVNULL, check=True) def launch_multi_mafft(args): + """ Allow to launch mafft in multiprocessing + + :param args: Pack of argument for launch_mafft + + :return: Organism object for pangenome + """ launch_mafft(*args) -def compute_msa(families, output, cpu, tmpdir, source, use_gene_id, code, disable_bar=False): +def compute_msa(families: set, output: str, tmpdir: str, cpu: int = 1, source: str = "protein", + use_gene_id: bool = False, code: int = 11, disable_bar: bool = False): + """ + Compute MSA between pangenome gene families + + :param families: Set of families specific to given partition + :param output: output directory name for families alignment + :param cpu: number of available core + :param tmpdir: path to temporary directory + :param source: indicates whether to use protein or dna sequences to compute the msa + :param use_gene_id: Use gene identifiers rather than organism names for sequences in the family MSA + :param code: Genetic code to use + :param disable_bar: Disable progress bar + """ newtmpdir = tempfile.TemporaryDirectory(dir=tmpdir) write_total = 0 args = [] logging.getLogger().info("Preparing input files for MSA...") - bar = tqdm(families, unit="family", disable=disable_bar) - code_table = genetic_codes(code) + code_table = genetic_codes(str(code)) - for family in bar: + for family in tqdm(families, unit="family", disable=disable_bar): start_write = time.time() - fname = write_fasta_families(family, newtmpdir, source, use_gene_id, code_table) + fname = write_fasta_families(family, newtmpdir, code_table, source, use_gene_id) write_total = write_total + (time.time() - start_write) args.append((fname, output, family.name)) - bar.close() logging.getLogger().info("Computing the MSA ...") bar = tqdm(range(len(families)), unit="family", 
disable=disable_bar) @@ -126,7 +176,17 @@ def compute_msa(families, output, cpu, tmpdir, source, use_gene_id, code, disabl bar.close() -def write_whole_genome_msa(pangenome, families, phylo_name, outname, use_gene_id=False): +def write_whole_genome_msa(pangenome: Pangenome, families: set, phylo_name: str, outname: str, + use_gene_id: bool = False): + """ + Writes a whole genome msa file for additional phylogenetic analysis + + :param pangenome: Pangenome object + :param families: Set of families specific to given partition + :param phylo_name: output file name for phylo alignment + :param outname: output directory name for families alignment + :param use_gene_id: Use gene identifiers rather than organism names for sequences in the family MSA + """ phylo_dict = {} for org in pangenome.organisms: phylo_dict[org.name] = "" @@ -178,8 +238,25 @@ def write_whole_genome_msa(pangenome, families, phylo_name, outname, use_gene_id fout.close() -def write_msa_files(pangenome, output, cpu=1, partition="core", tmpdir="/tmp", source="protein", soft_core=0.95, - phylo=False, use_gene_id=False, translation_table="11", force=False, disable_bar=False): +def write_msa_files(pangenome: Pangenome, output: str, cpu: int = 1, partition: str = "core", tmpdir: str = "/tmp", + source: str = "protein", soft_core=0.95, phylo: bool = False, use_gene_id: bool = False, + translation_table: int = 11, force: bool = False, disable_bar: bool = False): + """ + Main function to write MSA files + + :param pangenome: Pangenome object with partition + :param output: Path to output directory + :param cpu: number of available core + :param partition: choice of partition to compute Multiple Sequence Alignement of the gene families + :param tmpdir: path to temporary directory + :param source: indicates whether to use protein or dna sequences to compute the msa + :param soft_core: Soft core threshold to use + :param phylo: Writes a whole genome msa file for additional phylogenetic analysis + :param 
use_gene_id: Use gene identifiers rather than organism names for sequences in the family MSA + :param translation_table: Translation table (genetic code) to use. + :param force: force to write in the directory + :param disable_bar: Disable progress bar + """ need_partitions = False if partition in ["persistent", "shell", "cloud"]: need_partitions = True @@ -195,8 +272,9 @@ def write_msa_files(pangenome, output, cpu=1, partition="core", tmpdir="/tmp", s # check that the code is similar than the one used previously, if there is one if 'translation_table' in pangenome.parameters["cluster"]: if pangenome.parameters["cluster"]["translation_table"] != translation_table: - logging.getLogger().warning( - f"The translation table used during clustering ('{pangenome.parameters['cluster']['translation_table']}') is different than the one provided now ('{translation_table}')") + logging.getLogger().warning("The translation table used during clustering " + f"('{pangenome.parameters['cluster']['translation_table']}') " + f"is different than the one provided now ('{translation_table}')") code = translation_table compute_msa(families, outname, cpu=cpu, tmpdir=tmpdir, source=source, use_gene_id=use_gene_id, code=code, @@ -213,7 +291,12 @@ def write_msa_files(pangenome, output, cpu=1, partition="core", tmpdir="/tmp", s logging.getLogger().info(f"Done writing the {partition} genome alignment in: '{phylo_name}'") -def launch(args): +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) @@ -222,13 +305,25 @@ def launch(args): translation_table=args.translation_table, force=args.force, disable_bar=args.disable_prog_bar) -def subparser(sub_parser): +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in Command line + + :param sub_parser : sub_parser for align command 
+ + :return : parser arguments for align command + """ parser = sub_parser.add_parser("msa", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser_msa(parser) return parser -def parser_msa(parser): +def parser_msa(parser: argparse.ArgumentParser): + """ + Parser for specific argument of msa command + + :param parser: parser for align argument + """ required = parser.add_argument_group(title="Required arguments", description="The following arguments are required :") required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") @@ -251,7 +346,7 @@ def parser_msa(parser): optional.add_argument("--use_gene_id", required=False, action='store_true', help="Use gene identifiers rather than organism names for sequences in the family MSA" " (organism names are used by default)") - optional.add_argument("--translation_table", required=False, default="11", + optional.add_argument("--translation_table", required=False, default=11, type=int, help="Translation table (genetic code) to use.") diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py index 5bd562d5..4f9a78b7 100644 --- a/ppanggolin/formats/writeSequences.py +++ b/ppanggolin/formats/writeSequences.py @@ -6,6 +6,8 @@ import logging # installed libraries +from typing import TextIO + from tqdm import tqdm # local libraries @@ -18,11 +20,18 @@ "'core', 'module_X' with X being a module id." -def write_gene_sequences_from_annotations(pangenome, file_obj, list_cds=None, add='', disable_bar=False): +def write_gene_sequences_from_annotations(pangenome: Pangenome, file_obj: TextIO, list_cds: list = None, add: str = '', + disable_bar: bool = False): """ Writes the CDS sequences given through list_CDS of the Pangenome object to a tmpFile object, and adds the str provided through add in front of it. 
Loads the sequences from previously computed or loaded annotations + + :param pangenome: Pangenome object with gene families sequences + :param file_obj: Output file to write sequences + :param list_cds: Selected genes + :param add: Add prefix to gene ID + :param disable_bar: Disable progress bar """ counter = 0 if list_cds is None: @@ -36,7 +45,18 @@ def write_gene_sequences_from_annotations(pangenome, file_obj, list_cds=None, ad file_obj.flush() -def write_gene_sequences(pangenome, output, compress, genes, soft_core=0.95, disable_bar=False): +def write_gene_sequences(pangenome: Pangenome, output: str, genes: str, soft_core: float = 0.95, + compress: bool = False, disable_bar: bool = False): + """ + Write all nucleotide CDS sequences + + :param pangenome: Pangenome object with gene families sequences + :param output: Path to output directory + :param genes: Selected partition of gene + :param soft_core: Soft core threshold to use + :param compress: Compress the file in .gz + :param disable_bar: Disable progress bar + """ logging.getLogger().info("Writing all the gene nucleotide sequences...") outname = output + f"/{genes}_genes.fna" @@ -59,9 +79,16 @@ def write_gene_sequences(pangenome, output, compress, genes, soft_core=0.95, dis logging.getLogger().info(f"Done writing the gene sequences : '{outname}'") -def select_families(pangenome, partition, type_name, soft_core): - """ function used to filter down families to the given partition +def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_core: float = 0.95) -> set: + """ + function used to filter down families to the given partition + + :param pangenome: Pangenome object + :param partition: Selected partition + :param type_name: Which type of sequence we want. 
Gene families, protein, gene + :param soft_core: Soft core threshold to use + :return: Selected gene families """ genefams = set() if partition == 'all': @@ -99,7 +126,18 @@ def select_families(pangenome, partition, type_name, soft_core): return genefams -def write_fasta_gene_fam(pangenome, output, compress, gene_families, soft_core=0.95, disable_bar=False): +def write_fasta_gene_fam(pangenome: Pangenome, output: str, gene_families: str, soft_core: float = 0.95, + compress: bool = False, disable_bar=False): + """ + Write representative nucleotide sequences of gene families + + :param pangenome: Pangenome object with gene families sequences + :param output: Path to output directory + :param gene_families: Selected partition of gene families + :param soft_core: Soft core threshold to use + :param compress: Compress the file in .gz + :param disable_bar: Disable progress bar + """ outname = output + f"/{gene_families}_nucleotide_families.fasta" genefams = select_families(pangenome, gene_families, "representative nucleotide sequences of the gene families", @@ -111,22 +149,38 @@ def write_fasta_gene_fam(pangenome, output, compress, gene_families, soft_core=0 logging.getLogger().info(f"Done writing the representative nucleotide sequences of the gene families : '{outname}'") -def write_fasta_prot_fam(pangenome, output, compress, prot_families, soft_core=0.95, disable_bar=False): +def write_fasta_prot_fam(pangenome: Pangenome, output: str, prot_families: str, soft_core: float = 0.95, + compress: bool = False, disable_bar: bool = False): + """ + Write representative amino acid sequences of gene families. 
+ + :param pangenome: Pangenome object with gene families sequences + :param output: Path to output directory + :param prot_families: Selected partition of protein families + :param soft_core: Soft core threshold to use + :param compress: Compress the file in .gz + :param disable_bar: Disable progress bar + """ outname = output + f"/{prot_families}_protein_families.faa" genefams = select_families(pangenome, prot_families, "representative amino acid sequences of the gene families", soft_core) with write_compressed_or_not(outname, compress) as fasta: - bar = tqdm(genefams, unit="prot families", disable=disable_bar) - for fam in bar: + for fam in tqdm(genefams, unit="prot families", disable=disable_bar): fasta.write('>' + fam.name + "\n") fasta.write(fam.sequence + "\n") - bar.close() logging.getLogger().info(f"Done writing the representative amino acid sequences of the gene families : '{outname}'") -def read_fasta_or_gff(filename): +def read_fasta_or_gff(filename: str) -> dict: + """ + Read the genome file in fasta or gbff format + + :param filename: Path to genome file + + :return: Dictionary with all sequences associated to contig + """ sequence_dict = {} seqname = "" seq = "" @@ -148,6 +202,13 @@ def read_fasta_or_gff(filename): def read_fasta_gbk(filename): + """ + Read the genome file in gbk format + + :param filename: Path to genome file + + :return: Dictionary with all sequences associated to contig + """ # line.startswith("ORIGIN"): sequence_dict = {} lines = read_compressed_or_not(filename).readlines()[::-1] @@ -178,7 +239,15 @@ def read_fasta_gbk(filename): return sequence_dict -def read_genome_file(file_dict, genome_name): +def read_genome_file(file_dict: dict, genome_name: str) -> dict: + """ + Read the genome file associated to organism + + :param file_dict: Dictionary given association between organism and fasta file + :param genome_name: organism name + + :return: Dictionary with all sequences associated to contig + """ filetype = 
detect_filetype(file_dict[genome_name]) if filetype in ["fasta", "gff"]: return read_fasta_or_gff(file_dict[genome_name]) @@ -188,7 +257,14 @@ def read_genome_file(file_dict, genome_name): raise Exception(f"Unknown filetype detected: '{file_dict[genome_name]}'") -def write_spaced_fasta(sequence, space): +def write_spaced_fasta(sequence: str, space: int = 60): + """Write a maximum of element per line + + :param sequence: sequence to write + :param space: maximum of size for one line + + :return: a sequence of maximum space caracter + """ seq = "" j = 0 while j < len(sequence): @@ -197,15 +273,25 @@ def write_spaced_fasta(sequence, space): return seq -def write_regions_sequences(pangenome, output, compress, regions, fasta, anno, disable_bar=False): +def write_regions_sequences(pangenome: Pangenome, output: str, regions: str, fasta: str, anno: str, + compress: bool = False, disable_bar: bool = False): + """ + Write representative amino acid sequences of gene families. + + :param pangenome: Pangenome object with gene families sequences + :param output: Path to output directory + :param regions: Write the RGP nucleotide sequences + :param fasta: A tab-separated file listing the organism names, fasta filepath of its genomic sequences + :param anno: A tab-separated file listing the organism names, and the gff/gbff filepath of its annotations + :param compress: Compress the file in .gz + :param disable_bar: Disable progress bar + """ organisms_file = fasta if fasta is not None else anno org_dict = {} for line in read_compressed_or_not(organisms_file): elements = [el.strip() for el in line.split("\t")] if len(elements) <= 1: - logging.getLogger().error( - f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'") - exit(1) + raise Exception(f"No tabulation separator found in given --fasta or --anno file: '{organisms_file}'") org_dict[elements[0]] = elements[1] logging.getLogger().info(f"Writing {regions} rgp genomic sequences...") @@ -223,19 
+309,33 @@ def write_regions_sequences(pangenome, output, compress, regions, fasta, anno, d outname = output + f"/{regions}_rgp_genomic_sequences.fasta" with write_compressed_or_not(outname, compress) as fasta: loaded_genome = "" - bar = tqdm(regions_to_write, unit="rgp", disable=disable_bar) - for region in bar: + for region in tqdm(regions_to_write, unit="rgp", disable=disable_bar): if region.organism.name != loaded_genome: loaded_genome = region.organism.name genome_sequence = read_genome_file(org_dict, loaded_genome) fasta.write(f">{region.name}\n") fasta.write(write_spaced_fasta(genome_sequence[region.contig.name][region.start:region.stop], 60)) - bar.close() logging.getLogger().info(f"Done writing the regions nucleotide sequences: '{outname}'") -def write_sequence_files(pangenome, output, fasta=None, anno=None, soft_core=0.95, regions=None, genes=None, - gene_families=None, prot_families=None, compress=False, disable_bar=False): +def write_sequence_files(pangenome: Pangenome, output: str, fasta: str = None, anno: str = None, + soft_core: float = 0.95, regions: str = None, genes: str = None, gene_families: str = None, + prot_families: str = None, compress: bool = False, disable_bar: bool = False): + """ + Main function to write sequence file from pangenome + + :param pangenome: Pangenome object containing sequences + :param output: Path to output directory + :param fasta: A tab-separated file listing the organism names, fasta filepath of its genomic sequences + :param anno: A tab-separated file listing the organism names, and the gff/gbff filepath of its annotations + :param soft_core: Soft core threshold to use + :param regions: Write the RGP nucleotide sequences + :param genes: Write all nucleotide CDS sequences + :param gene_families: Write representative nucleotide sequences of gene families. + :param prot_families: Write representative amino acid sequences of gene families. 
+ :param compress: Compress the file in .gz + :param disable_bar: Disable progress bar + """ if not any(x for x in [regions, genes, prot_families, gene_families]): raise Exception("You did not indicate what file you wanted to write.") @@ -271,8 +371,8 @@ def write_sequence_files(pangenome, output, fasta=None, anno=None, soft_core=0.9 provided_filter = prot_families if regions is not None: provided_filter = regions - raise Exception( - f"The filter that you indicated '{provided_filter}' was not understood by PPanGGOLiN. {poss_values_log}") + raise Exception(f"The filter that you indicated '{provided_filter}' was not understood by PPanGGOLiN. " + f"{poss_values_log}") ex_gene_sequences = Exception("The provided pangenome has no gene sequences. " "This is not compatible with any of the following options : --genes, --gene_families") ex_gene_family_sequences = Exception("The provided pangenome has no gene families. " @@ -284,28 +384,28 @@ def write_sequence_files(pangenome, output, fasta=None, anno=None, soft_core=0.9 raise ex_gene_family_sequences check_pangenome_info(pangenome, need_annotations=need_annotations, need_families=need_families, - need_graph=need_graph, - need_partitions=need_partitions, need_rgp=need_regions, need_spots=need_spots, - need_modules=need_modules, disable_bar=disable_bar) + need_graph=need_graph, need_partitions=need_partitions, need_rgp=need_regions, + need_spots=need_spots, need_modules=need_modules, disable_bar=disable_bar) if prot_families is not None: - write_fasta_prot_fam(pangenome, output, compress, prot_families, soft_core=soft_core, disable_bar=disable_bar) + write_fasta_prot_fam(pangenome, output, prot_families, soft_core, compress, disable_bar) if gene_families is not None: - write_fasta_gene_fam(pangenome, output, compress, gene_families, soft_core=soft_core, disable_bar=disable_bar) + write_fasta_gene_fam(pangenome, output, gene_families, soft_core, compress, disable_bar) if genes is not None: - write_gene_sequences(pangenome, 
output, compress, genes, soft_core=soft_core, disable_bar=disable_bar) + write_gene_sequences(pangenome, output, genes, soft_core, compress, disable_bar) if regions is not None: - write_regions_sequences(pangenome, output, compress, regions, fasta, anno, disable_bar=disable_bar) + write_regions_sequences(pangenome, output, regions, fasta, anno, compress, disable_bar) -def check_options(args): - if hasattr(args, "regions") and args.regions is not None and args.fasta is None and args.anno is None: +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ + if args.regions is not None and args.fasta is None and args.anno is None: raise Exception("The --regions options requires the use of --anno or --fasta " "(You need to provide the same file used to compute the pan)") - - -def launch(args): - check_options(args) mk_outdir(args.output, args.force) pangenome = Pangenome() pangenome.add_file(args.pangenome) @@ -314,13 +414,25 @@ def launch(args): prot_families=args.prot_families, compress=args.compress, disable_bar=args.disable_prog_bar) -def subparser(sub_parser): +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in Command line + + :param sub_parser : sub_parser for align command + + :return : parser arguments for align command + """ parser = sub_parser.add_parser("fasta", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser_seq(parser) return parser -def parser_seq(parser): +def parser_seq(parser: argparse.ArgumentParser): + """ + Parser for specific argument of fasta command + + :param parser: parser for align argument + """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") @@ -336,20 +448,23 @@ def parser_seq(parser): help="A tab-separated file listing the 
organism names, and the gff/gbff filepath of its " "annotations (the files can be compressed with gzip). One line per organism. " "If this is provided, those annotations will be used.") - - optional = parser.add_argument_group( - title="Optional arguments. Indicating 'all' writes all elements. Writing a partition " - "('persistent', 'shell' or 'cloud') write the elements associated to said partition. " - "Writing 'rgp' writes elements associated to RGPs.") + onereq = parser.add_argument_group(title="Output file", + description="At least one of the following argument is required. " + "Indicating 'all' writes all elements. Writing a partition " + "('persistent', 'shell' or 'cloud') write the elements associated " + "to said partition. Writing 'rgp' writes elements associated to RGPs" + ) + onereq.add_argument("--genes", required=False, type=str, + help=f"Write all nucleotide CDS sequences. {poss_values_log}") + onereq.add_argument("--prot_families", required=False, type=str, + help=f"Write representative amino acid sequences of gene families. {poss_values_log}") + onereq.add_argument("--gene_families", required=False, type=str, + help=f"Write representative nucleotide sequences of gene families. {poss_values_log}") + optional = parser.add_argument_group(title="Optional arguments") # could make choice to allow customization - optional.add_argument("--regions", required=False, choices=["all", "complete"], + optional.add_argument("--regions", required=False, type=str, choices=["all", "complete"], help="Write the RGP nucleotide sequences (requires --anno or --fasta used to compute " "the pangenome to be given)") - optional.add_argument("--genes", required=False, help=f"Write all nucleotide CDS sequences. {poss_values_log}") - optional.add_argument("--prot_families", required=False, - help=f"Write representative amino acid sequences of gene families. 
{poss_values_log}") - optional.add_argument("--gene_families", required=False, - help=f"Write representative nucleotide sequences of gene families. {poss_values_log}") optional.add_argument("--soft_core", required=False, type=restricted_float, default=0.95, help="Soft core threshold to use if 'softcore' partition is chosen") optional.add_argument("--compress", required=False, action="store_true", help="Compress the files in .gz") diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py index 5317c88c..de4c2af0 100644 --- a/ppanggolin/geneFamily.py +++ b/ppanggolin/geneFamily.py @@ -2,29 +2,31 @@ # coding: utf8 # default libraries +from __future__ import annotations from collections import defaultdict import logging # installed libraries +from typing import Dict, Set, List + import gmpy2 # local libraries -from ppanggolin.genome import Gene +from ppanggolin.edge import Edge +from ppanggolin.genome import Gene, Organism class GeneFamily: """ - This represents a single gene family. It will be a node in the pan graph, and be aware of its genes and edges. - """ + This represents a single gene family. It will be a node in the pangenome graph, and be aware of its genes and edges. - def __init__(self, family_id, name): - """Constructor method + :param family_id: The internal identifier to give to the gene family + :type family_id: any + :param name: The name of the gene family (to be printed in output files) + :type name: str + """ - :param family_id: The internal identifier to give to the gene family - :type family_id: any - :param name: The name of the gene family (to be printed in output files) - :type name: str - """ + def __init__(self, family_id: int, name: str): self.name = str(name) self.ID = family_id self._edges = {} @@ -37,29 +39,27 @@ def __init__(self, family_id, name): self.modules = set() self.bitarray = None - def add_sequence(self, seq): + def add_sequence(self, seq: str): """Assigns a protein sequence to the gene family. 
:param seq: the sequence to add to the gene family - :type seq: str """ self.sequence = seq - def add_partition(self, partition): + def add_partition(self, partition: str): """Assigns a partition to the gene family. It should be the raw partition name provided by NEM. :param partition: The partition - :type partition: str """ self.partition = partition @property - def named_partition(self): - """Reads the :attr:partition attribute and returns a meaningful name + def named_partition(self) -> str: + """Reads the partition attribute and returns a meaningful name :raises Exception: If the gene family has no partition assigned + :return: the partition name of the gene family - :rtype: str """ if self.partition == "": raise Exception("The gene family has not beed associated to a partition") @@ -72,11 +72,11 @@ def named_partition(self): else: return "undefined" - def add_gene(self, gene): + def add_gene(self, gene: Gene): """Add a gene to the gene family, and sets the gene's :attr:family accordingly. :param gene: the gene to add - :type gene: :class:`ppanggolin.genome.Gene` + :raises TypeError: If the provided `gene` is of the wrong type """ if not isinstance(gene, Gene): @@ -86,14 +86,12 @@ def add_gene(self, gene): if hasattr(gene, "organism"): self._genePerOrg[gene.organism].add(gene) - def mk_bitarray(self, index, partition='all'): - """Produces a bitarray representing the presence/absence of the family in the pan using the provided index + def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): + """Produces a bitarray representing the presence/absence of the family in the pangenome using the provided index The bitarray is stored in the :attr:`bitarray` attribute and is a :class:`gmpy2.xmpz` type. 
:param index: The index computed by :func:`ppanggolin.pan.Pangenome.getIndex` - :type index: dict[:class:`ppanggolin.genome.Organism`, int] :param partition: partition used to compute bitarray - :type partition: str """ self.bitarray = gmpy2.xmpz() # pylint: disable=no-member if partition == 'all': @@ -111,11 +109,10 @@ def mk_bitarray(self, index, partition='all'): for org in self.organisms: self.bitarray[index[org]] = 1 - def get_org_dict(self): + def get_org_dict(self) -> Dict[Organism, Set[Gene]]: """Returns the organisms and the genes belonging to the gene family :return: a dictionnary of organism as key and set of genes as values - :rtype: dict[ :class:`ppanggolin.genome.Organism` ,set[:class:`ppanggolin.genome.Gene`] """ try: return self._genePerOrg @@ -123,14 +120,15 @@ def get_org_dict(self): for gene in self.genes: self._genePerOrg[gene.organism].add(gene) return self._genePerOrg + except Exception: + raise Exception("An unexpected error occurs. Please report in our GitHub") - def get_genes_per_org(self, org): + def get_genes_per_org(self, org: Organism) -> Set[Gene]: """Returns the genes belonging to the gene family in the given Organism :param org: Organism to look for - :type org: :class:`ppanggolin.genome.Organism` + :return: a set of gene(s) - :rtype: set[:class:`ppanggolin.genome.Gene`] """ try: return self._genePerOrg[org] @@ -138,31 +136,30 @@ def get_genes_per_org(self, org): for gene in self.genes: self._genePerOrg[gene.organism].add(gene) return self._genePerOrg[org] + except Exception: + raise Exception("An unexpected error occurs. 
Please report in our GitHub") @property - def neighbors(self): - """Returns all the :class:`ppanggolin.geneFamily.GeneFamily` that are linked with an edge + def neighbors(self) -> Set[GeneFamily]: + """Returns all the GeneFamilies that are linked with an edge :return: Neighbors - :rtype: set[:class:`ppanggolin.geneFamily.GeneFamily`] """ return set(self._edges.keys()) @property - def edges(self): - """Returns all the :class:`ppanggolin.pan.Edge` that are linked to this gene family + def edges(self) -> List[Edge]: + """Returns all Edges that are linked to this gene family :return: Edges of the gene family - :rtype: list[:class:`ppanggolin.pangenome.Edge`] """ return list(self._edges.values()) @property - def organisms(self): - """Returns all the :class:`ppanggolin.genome.Organism` that have this gene family + def organisms(self) -> Set[Organism]: + """Returns all the Organisms that have this gene family :return: Organisms that have this gene family - :rtype: set[:class:`ppanggolin.genome.Organism`] """ try: return set(self._genePerOrg.keys()) @@ -170,3 +167,5 @@ def organisms(self): for gene in self.genes: self._genePerOrg[gene.organism].add(gene) return set(self._genePerOrg.keys()) + except Exception: + raise Exception("An unexpected error occurs. 
Please report in our GitHub") diff --git a/ppanggolin/genome.py b/ppanggolin/genome.py index f8b9c58e..0b9d6b76 100644 --- a/ppanggolin/genome.py +++ b/ppanggolin/genome.py @@ -1,14 +1,22 @@ #!/usr/bin/env python3 # coding: utf8 +from __future__ import annotations + # installed libraries import logging +from typing import Iterator, Dict import gmpy2 class Feature: - def __init__(self, identifier): + """This is a general class representation of Gene, RNA + + :param identifier: Identifier of the feature given by PPanGGOLiN + """ + + def __init__(self, identifier: str): self.ID = identifier self.is_fragment = False self.type = "" @@ -22,33 +30,63 @@ def __init__(self, identifier): self.contig = None self.dna = None - def fill_annotations(self, start, stop, strand, gene_type="", name="", product="", local_identifier="", - position=None, genetic_code=11): - # genetic code, and position are not used in the default function. - self.start = int(start) - self.stop = int(stop) + def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = "", name: str = "", + product: str = "", local_identifier: str = ""): + """ + Fill general annotation for child classes + + :param start: Start position + :param stop: Stop position + :param strand: associated strand + :param gene_type: Type of gene + :param name: Name of the feature + :param product: Associated product + :param local_identifier: Identifier provided by the original file + """ + self.start = start if isinstance(start, int) else int(start) + self.stop = stop if isinstance(stop, int) else int(stop) self.type = gene_type self.strand = strand self.product = product self.name = name self.local_identifier = local_identifier - def fill_parents(self, organism, contig): + def fill_parents(self, organism: Organism, contig: Contig): + """ Associate object to an organism and a contig + + :param organism: Parent organism + :param contig: Parent contig + """ self.organism = organism self.contig = contig def 
add_dna(self, dna): + """ Add DNA sequence to feature + + :param dna: DNA sequence + """ if not isinstance(dna, str): raise TypeError(f"'str' type was expected but you provided a '{type(dna)}' type object") self.dna = dna class RNA(Feature): - pass + """Save RNA from genome as an Object with some information for Pangenome + + :param rna_id: Identifier of the rna + """ + + def __init__(self, rna_id: str): + super().__init__(rna_id) class Gene(Feature): - def __init__(self, gene_id): + """Save gene from genome as an Object with some information for Pangenome + + :param gene_id: Identifier of the gene + """ + + def __init__(self, gene_id: str): super().__init__(gene_id) self.position = None self.family = None @@ -59,20 +97,43 @@ def __init__(self, gene_id): def __str__(self): return str(self.ID) - def fill_annotations(self, start, stop, strand, gene_type="", name="", product="", local_identifier="", - position=None, genetic_code=11): + def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = "", name: str = "", + product: str = "", local_identifier: str = "", position: int = None, genetic_code: int = 11): + """ + Fill Gene annotation provide by PPanGGOLiN dependencies + + :param start: Start position + :param stop: Stop position + :param strand: associated strand + :param gene_type: Type of gene + :param name: Gene name + :param product: Associated product + :param local_identifier: Identifier provided by the original file + :param position: Gene localisation in genome + :param genetic_code: Genetic code associated to gene + """ super().fill_annotations(start, stop, strand, gene_type, name, product, local_identifier) self.position = position self.genetic_code = genetic_code - def add_protein(self, protein): + def add_protein(self, protein: str): + """ Add protein sequence corresponding to translated gene + + :param protein: Protein sequence + """ if not isinstance(protein, str): raise TypeError(f"'str' type was expected but you provided a 
'{type(protein)}' type object") self.protein = protein class Contig: - def __init__(self, name, is_circular=False): + """ + Describe the contig content and some information + + :param name: Name of the contig + :param is_circular: save if the contig is circular + """ + def __init__(self, name: str, is_circular: bool = False): self.name = name self.is_circular = is_circular self.RNAs = set() # saving the rna annotations. We're not using them in the vast majority of cases. @@ -80,7 +141,11 @@ def __init__(self, name, is_circular=False): self._genes_position = [] @property - def genes(self): + def genes(self) -> list: + """ Give the gene content of the contig + + :return: list of gene in contig + """ return self._genes_position def __str__(self): @@ -90,7 +155,7 @@ def __iter__(self): return iter(self.genes) # retrieve gene by start position - def __getitem__(self, index): + def __getitem__(self, index: int): gene = self._genes_start.get(index) if not gene: if not isinstance(index, int): @@ -98,12 +163,20 @@ def __getitem__(self, index): raise IndexError(f"No gene start at the given position {index}") return gene - def add_rna(self, gene): - if not isinstance(gene, RNA): - raise TypeError(f"'RNA' type was expected but you provided a '{type(gene)}' type object") - self.RNAs.add(gene) + def add_rna(self, rna: RNA): + """ Add RNA to contig + + :param rna: RNA object to add + """ + if not isinstance(rna, RNA): + raise TypeError(f"'RNA' type was expected but you provided a '{type(rna)}' type object") + self.RNAs.add(rna) - def add_gene(self, gene): + def add_gene(self, gene: Gene): + """ Add gene to Contig + + :param gene: Gene object to add + """ if not isinstance(gene, Gene): raise TypeError(f"'Gene' type was expected but you provided a '{type(gene)}' type object") if gene.position is None: @@ -117,51 +190,76 @@ def add_gene(self, gene): class Organism: - def __init__(self, name): + """ + Describe the Genome content and some information + + :param name: Name of the 
genome + """ + def __init__(self, name: str): self.name = name self._contigs_getter = {} self.bitarray = None @property - def families(self): - """returns the gene families present in the organism""" + def families(self) -> set: + """ returns the gene families present in the organism + + :return: set of gene families in organism + """ return {gene.family for contig in self.contigs for gene in contig.genes} @property - def genes(self): + def genes(self) -> Iterator[Gene]: + """ Generator to get genes in organism """ for contig in self.contigs: for gene in contig.genes: yield gene - def number_of_genes(self): + def number_of_genes(self) -> int: + """ Get number of genes in organism + + :return: Number of gene in organism + """ return sum([len(list(contig.genes)) for contig in self.contigs]) @property - def contigs(self): + def contigs(self) -> dict.values: + """ Get contigs in organism + + :return: values in contig dictionary from organism + """ return self._contigs_getter.values() def __str__(self): return self.name - def get_or_add_contig(self, key, is_circular=False): - contig = self._contigs_getter.get(key) + def get_contig(self, contig_id: str, is_circular: bool = False): + """ + Get contig with the given identifier in the organim, if it does not exist in organism,the contig is added + + :param contig_id: Contig idenitifier + :param is_circular: save if the contig is circular + + :return: the contig with the given identifier + """ + contig = self._contigs_getter.get(contig_id) if contig is None: - contig = self._create_contig(key, is_circular) + contig = self._create_contig(contig_id, is_circular) return contig - def _create_contig(self, key, is_circular=False): - new_contig = Contig(key, is_circular) - self._contigs_getter[key] = new_contig + def _create_contig(self, contig_id: str, is_circular: bool = False): + new_contig = Contig(contig_id, is_circular) + self._contigs_getter[contig_id] = new_contig return new_contig - def mk_bitarray(self, index, 
partition='all'): + def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """Produces a bitarray representing the presence / absence of families in the organism using the provided index The bitarray is stored in the :attr:`bitarray` attribute and is a :class:`gmpy2.xmpz` type. + :param partition: Filter partition - type partition: str :param index: The index computed by :func:`ppanggolin.pan.Pangenome.getIndex` - :type index: dict[:class:`ppanggolin.genome.Organism`, int] """ + self.bitarray = gmpy2.xmpz() # pylint: disable=no-member if partition == 'all': logging.getLogger().debug(f"all") diff --git a/ppanggolin/graph/makeGraph.py b/ppanggolin/graph/makeGraph.py index 14a4c118..aef655b3 100644 --- a/ppanggolin/graph/makeGraph.py +++ b/ppanggolin/graph/makeGraph.py @@ -13,8 +13,13 @@ from ppanggolin.formats import read_pangenome, write_pangenome, erase_pangenome -def check_pangenome_former_graph(pangenome, force): - """ checks pangenome status and .h5 files for former neighbors graph, delete it if allowed or raise an error """ +def check_pangenome_former_graph(pangenome: Pangenome, force: bool = False): + """ + Checks pangenome status and .h5 files for former neighbors graph, delete it if allowed or raise an error + + :param pangenome: Pangenome object + :param force: Allow to force write on Pangenome file + """ if pangenome.status["neighborsGraph"] == "inFile" and not force: raise Exception("You are trying to make a neighbors graph that is already built. " "If you REALLY want to do that, use --force (it will erase everything except annotation data !)" @@ -25,9 +30,14 @@ def check_pangenome_former_graph(pangenome, force): def check_pangenome_for_neighbors_graph(pangenome, force, disable_bar=False): """ - Checks the pangenome for neighbors graph computing. + Checks and read the pangenome for neighbors graph computing. 
+ + :param pangenome: Pangenome object + :param force: Allow to force write on Pangenome file + :param disable_bar: Disable progress bar """ check_pangenome_former_graph(pangenome, force) + # TODO Check if possible to change for check_pangenome_info if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] and \ pangenome.status["genesClustered"] in ["Computed", "Loaded"]: pass # nothing to do, can just continue. @@ -47,17 +57,27 @@ def check_pangenome_for_neighbors_graph(pangenome, force, disable_bar=False): def remove_high_copy_number(pangenome, number): - """ removes families present more than 'number' times from the pangenome graph""" + """Removes families present more than 'number' times from the pangenome graph + + :param pangenome: Pangenome object + :param number: Maximum authorized repeat presence + """ for fam in pangenome.gene_families: for gene_list in fam.get_org_dict().values(): if len(gene_list) >= number: fam.removed = True -def compute_neighbors_graph(pangenome, remove_copy_number=0, force=False, disable_bar=False): +def compute_neighbors_graph(pangenome: Pangenome, remove_copy_number: int = 0, + force: bool = False, disable_bar: bool = False): """ - Creates the Pangenome Graph. Will either load the information from the pangenome file if they are not loaded, - or use the information loaded if they are. + Creates the Pangenome Graph. Will either load the information from the pangenome file if they are not loaded, + or use the information loaded if they are. + + :param pangenome: Pangenome object + :param remove_copy_number: Maximum authorized repeat presence of gene families. 
if zero no remove + :param force: Allow to force write on Pangenome file + :param disable_bar: Disable progress bar """ check_pangenome_for_neighbors_graph(pangenome, force, disable_bar=disable_bar) @@ -80,6 +100,8 @@ def compute_neighbors_graph(pangenome, remove_copy_number=0, force=False, disabl prev = gene except AttributeError: raise AttributeError("a Gene does not have a GeneFamily object associated") + except Exception: + raise Exception("Unexpected error. Please report on our github.") if prev is not None and contig.is_circular and len(contig.genes) > 0: # if prev is None, the contig is entirely made of duplicated genes, so no edges are added pangenome.add_edge(contig.genes[0], prev) @@ -93,20 +115,37 @@ def compute_neighbors_graph(pangenome, remove_copy_number=0, force=False, disabl pangenome.parameters["graph"]["removed_high_copy_number_of_families_above"] = remove_copy_number -def launch(args): +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ pangenome = Pangenome() pangenome.add_file(args.pangenome) compute_neighbors_graph(pangenome, args.remove_high_copy_number, args.force, disable_bar=args.disable_prog_bar) write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) -def subparser(sub_parser): +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in Command line + + :param sub_parser : sub_parser for align command + + :return : parser arguments for align command + """ parser = sub_parser.add_parser("graph", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser_graph(parser) return parser -def parser_graph(parser): +def parser_graph(parser: argparse.ArgumentParser): + """ + Parser for specific argument of graph command + + :param parser: parser for align argument + """ parser.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") 
parser.add_argument('-r', '--remove_high_copy_number', type=int, default=0, help="Positive Number: Remove families having a number of copy of gene in a single organism " diff --git a/ppanggolin/info/info.py b/ppanggolin/info/info.py index 48bae437..73ef75b5 100644 --- a/ppanggolin/info/info.py +++ b/ppanggolin/info/info.py @@ -11,7 +11,15 @@ from ppanggolin.formats import read_info, read_parameters -def print_info(pangenome, status=False, content=False, parameters=False): +def print_info(pangenome: str, status: bool = False, content: bool = False, parameters: bool = False): + """ + Main function to return information about pangenome + + :param pangenome: Pangenome file + :param status: Get pangenome status + :param content: Get pangenome content + :param parameters: Get pangenome parameters + """ if status or content or parameters: h5f = tables.open_file(pangenome, "r") if status: @@ -52,17 +60,34 @@ def print_info(pangenome, status=False, content=False, parameters=False): print("Please select what information you want by using --parameters, --content or --status") -def launch(args): +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ print_info(args.pangenome, args.status, args.content, args.parameters) -def subparser(sub_parser): +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in Command line + + :param sub_parser : sub_parser for align command + + :return : parser arguments for align command + """ parser = sub_parser.add_parser("info", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser_info(parser) return parser -def parser_info(parser): +def parser_info(parser: argparse.ArgumentParser): + """ + Parser for specific argument of graph command + + :param parser: parser for align argument + """ required = parser.add_argument_group(title="Required arguments", description="The following arguments is required :") 
required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") diff --git a/ppanggolin/main.py b/ppanggolin/main.py index a4f28d5b..87cfb0ce 100644 --- a/ppanggolin/main.py +++ b/ppanggolin/main.py @@ -22,19 +22,19 @@ import ppanggolin.figures import ppanggolin.formats import ppanggolin.info -import ppanggolin.metrics.metrics +import ppanggolin.metrics import ppanggolin.align -import ppanggolin.RGP.genomicIsland -import ppanggolin.RGP.spot +import ppanggolin.RGP import ppanggolin.mod import ppanggolin.context -import ppanggolin.workflow.workflow -import ppanggolin.workflow.panRGP -import ppanggolin.workflow.panModule -import ppanggolin.workflow.all +import ppanggolin.workflow -def cmd_line(): +def cmd_line() -> argparse.Namespace: + """ Manage the command line argument given by user + + :return: arguments given and readable by PPanGGOLiN + """ # need to manually write the description so that it's displayed into groups of subcommands .... desc = "\n" desc += "All of the following subcommands have their own set of options. 
To see them for a given subcommand," \ @@ -53,7 +53,7 @@ def cmd_line(): desc += " cluster Cluster proteins in protein families\n" desc += " graph Create the pangenome graph\n" desc += " partition Partition the pangenome graph\n" - desc += " rarefaction Compute the rarefaction curve of the pan\n" + desc += " rarefaction Compute the rarefaction curve of the pangenome\n" desc += " msa Compute Multiple Sequence Alignments for pangenome gene families\n" desc += " \n" desc += " Output:\n" @@ -132,6 +132,10 @@ def cmd_line(): def main(): + """ Run the command given by user and set / check some things + + :return: + """ args = cmd_line() if hasattr(args, "pangenome"): diff --git a/ppanggolin/metrics/fluidity.py b/ppanggolin/metrics/fluidity.py index 193c03aa..f7fd1282 100644 --- a/ppanggolin/metrics/fluidity.py +++ b/ppanggolin/metrics/fluidity.py @@ -14,16 +14,13 @@ from ppanggolin.formats import check_pangenome_info -def gen_fluidity(pangenome, disable_bar=False): +def gen_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict: """ Compute the genomes' fluidity from the pan :param pangenome: pangenome which will be used to compute the genomes' fluidity - :type pangenome: Pangenome :param disable_bar: Disable the progress bar - :type disable_bar: bool - :return: Genomes fluidity value from the pan - :rtype:float + :return: Genomes fluidity value from the pangenome for each partition """ # check statuses and load info @@ -47,17 +44,14 @@ def gen_fluidity(pangenome, disable_bar=False): return fluidity_dict -def nb_fam_per_org(pangenome, disable_bar=False): +def nb_fam_per_org(pangenome: Pangenome, disable_bar: bool = False) -> dict: """ Create a dictionary with for each organism the number of gene families :param pangenome: Pangenome which contain the organisms and gene families - :type pangenome: Pangenome :param disable_bar: Disable the progress bar - :type disable_bar: bool :return: Dictionary with organisms as key and number of families as value - :rtype: 
dict """ org2_nb_fam = dict() for org in tqdm(pangenome.organisms, unit='organism', disable=disable_bar): @@ -71,16 +65,13 @@ def nb_fam_per_org(pangenome, disable_bar=False): # TODO Function to compute mash distance between genome for normalization -def fam_fluidity(pangenome, disable_bar=False): +def fam_fluidity(pangenome: Pangenome, disable_bar: bool = False) -> dict: """ Compute the family fluidity from the pan :param pangenome: pangenome which will be used to compute the genomes' fluidity - :type pangenome: Pangenome :param disable_bar: Disable the progress bar - :type disable_bar: bool - :return: family fluidity value from the pan - :rtype:float + :return: family fluidity value from the pangenome for each partition """ # check statuses and load info logging.getLogger().info("Check information in pan") @@ -104,17 +95,14 @@ def fam_fluidity(pangenome, disable_bar=False): return fluidity_dict -def nb_org_per_fam(pangenome, disable_bar=False): +def nb_org_per_fam(pangenome: Pangenome, disable_bar: bool = False) -> dict: """ Create a dictionary with for each gene families the number of organism :param pangenome: Pangenome which contain the organisms and gene families - :type pangenome: Pangenome :param disable_bar: Disable the progress bar - :type disable_bar: bool :return: Dictionary with organisms as key and number of families as value - :rtype: dict """ fam_2_nb_org = dict() for fam in tqdm(pangenome.gene_families, unit='gene families', disable=disable_bar): diff --git a/ppanggolin/metrics/metrics.py b/ppanggolin/metrics/metrics.py index 8ec7928c..cade18e5 100644 --- a/ppanggolin/metrics/metrics.py +++ b/ppanggolin/metrics/metrics.py @@ -15,44 +15,47 @@ from ppanggolin.metrics.fluidity import gen_fluidity, fam_fluidity -def check_metric(pangenome, genomes_fluidity=False, families_fluidity=False, info_modules=False, force=False): +def check_metric(pangenome: Pangenome, genomes_fluidity: bool = False, families_fluidity: bool = False, + info_modules: bool = 
False): + """ + Check if one of the asked metrics is not already computed + + :param pangenome: pangenome object + :param genomes_fluidity: Ask to compute genome fluidity + :param families_fluidity: Ask to compute family fluidity + :param info_modules: Ask to compute more information about module + """ with tables.open_file(pangenome.file, "a") as h5f: info_group = h5f.root.info if genomes_fluidity: - if 'genomes_fluidity' in info_group._v_attrs._f_list() and not force: + if 'genomes_fluidity' in info_group._v_attrs._f_list(): raise Exception("Genome fluidity was already compute. " "Please use -f option if you REALLY want to compute again") if families_fluidity: - if 'families_fluidity' in info_group._v_attrs._f_list() and not force: + if 'families_fluidity' in info_group._v_attrs._f_list(): raise Exception("Family fluidity was already compute. " "Please use -f option if you REALLY want to compute again") if info_modules: if any(x in info_group._v_attrs._f_list() for x in ['CloudSpecInModules', 'PersistentSpecInModules', 'ShellSpecInModules', 'numberOfFamiliesInModules', - 'StatOfFamiliesInModules']) and not force: + 'StatOfFamiliesInModules']): raise Exception("Supplementary information on modules was already compute. 
" "Please use -f option if you REALLY want to compute again") -def compute_metrics(pangenome, genomes_fluidity=False, families_fluidity=False, info_modules=False, - disable_bar=False): +def compute_metrics(pangenome: Pangenome, genomes_fluidity: bool = False, families_fluidity: bool = False, + info_modules: bool = False, disable_bar: bool = False) -> dict: """Compute the metrics + :param pangenome: pangenome which will be used to compute the genomes' fluidity - :type pangenome: Pangenome :param genomes_fluidity: Ask to compute genome fluidity - :type genomes_fluidity: bool :param families_fluidity: Ask to compute family fluidity - :type families_fluidity: bool :param info_modules: Ask to compute more information about module - :type info_modules: bool :param disable_bar: Disable the progress bar - :type disable_bar: bool - :return: dictionary with all the metrics computed - :rtype: dict """ metrics_dict = {} @@ -66,15 +69,13 @@ def compute_metrics(pangenome, genomes_fluidity=False, families_fluidity=False, return metrics_dict -def write_metrics(pangenome, metrics_dict, no_print_info=False): +def write_metrics(pangenome: Pangenome, metrics_dict: dict, no_print_info: bool = False): """ - Write the metrics computed in the pan + Write the metrics computed in the pangenome + :param pangenome: pangenome which will be used to compute the genomes' fluidity - :type pangenome: Pangenome :param metrics_dict: dictionary with all the metrics computed - :type metrics_dict: dict :param no_print_info: disable print of information - :type no_print_info: bool """ with tables.open_file(pangenome.file, "a") as h5f: info_group = h5f.root.info @@ -96,7 +97,12 @@ def write_metrics(pangenome, metrics_dict, no_print_info=False): read_info(h5f) -def launch(args): +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ if not any(x for x in [args.genome_fluidity, args.family_fluidity, args.info_modules, args.all]): raise 
Exception("You did not indicate which metric you want to compute.") args_dict = {'genomes_fluidity': args.genome_fluidity, @@ -110,7 +116,8 @@ def launch(args): pangenome.add_file(args.pangenome) logging.getLogger().debug("Check if one of the metrics was already compute") - check_metric(pangenome, force=args.force, **args_dict) + if not args.force: + check_metric(pangenome, **args_dict) logging.getLogger().info("Metrics computation begin") metrics_dictionary = compute_metrics(pangenome, disable_bar=args.disable_prog_bar, **args_dict) logging.getLogger().info("Metrics computation done") @@ -118,22 +125,25 @@ def launch(args): write_metrics(pangenome, metrics_dictionary, no_print_info=args.no_print_info) -def subparser(sub_parser): +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: """ - Parser arguments specific to metrics command + Subparser to launch PPanGGOLiN in Command line :param sub_parser : sub_parser for align command - :type sub_parser : argparse._SubParsersAction :return : parser arguments for align command - :rtype : argparse.ArgumentParser """ parser = sub_parser.add_parser("metrics", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser_metrics(parser) return parser -def parser_metrics(parser): +def parser_metrics(parser: argparse.ArgumentParser): + """ + Parser for specific argument of metrics command + + :param parser: parser for align argument + """ required = parser.add_argument_group(title="Required arguments", description="All of the following arguments are required :") required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") diff --git a/ppanggolin/mod/module.py b/ppanggolin/mod/module.py index 1c539212..b37ce976 100644 --- a/ppanggolin/mod/module.py +++ b/ppanggolin/mod/module.py @@ -18,14 +18,20 @@ from gmpy2 import xmpz, popcount # pylint: disable=no-name-in-module # local libraries +from ppanggolin.genome import Organism from ppanggolin.pangenome import 
Pangenome from ppanggolin.region import Module from ppanggolin.formats import check_pangenome_info, write_pangenome, erase_pangenome from ppanggolin.utils import mk_outdir, restricted_float, add_gene, connected_components -def check_pangenome_former_modules(pangenome, force): - """ checks pangenome status and .h5 files for former modules, delete them if allowed or raise an error """ +def check_pangenome_former_modules(pangenome: Pangenome, force: bool = False): + """ + Checks pangenome status and .h5 files for former modules, delete them if allowed or raise an error + + :param pangenome: Pangenome object + :param force: Allow to force write on pangenome by erasing already present modules + """ if pangenome.status["modules"] == "inFile" and not force: raise Exception("You are trying to detect modules on a pangenome which already has predicted modules. " "If you REALLY want to do that, use --force (it will erase modules previously predicted).") @@ -33,55 +39,13 @@ def check_pangenome_former_modules(pangenome, force): erase_pangenome(pangenome, modules=True) -def predict_modules(pangenome, cpu, tmpdir, force=False, dup_margin=0.05, size=3, min_presence=2, transitive=4, - jaccard=0.85, disable_bar=False): - # check statuses and load info - check_pangenome_former_modules(pangenome, force) - check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_partitions=True, - disable_bar=disable_bar) - - # compute the graph with transitive closure size provided as parameter - start_time = time.time() - logging.getLogger().info("Building the graph...") - g = compute_mod_graph(pangenome.organisms, t=transitive, disable_bar=disable_bar) - logging.getLogger().info(f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find modules in") - logging.getLogger().info(f"There are {nx.number_of_nodes(g)} nodes and {nx.number_of_edges(g)} edges") - - start_time = time.time() - # get all multigenic gene families - multi = 
pangenome.get_multigenics(dup_margin, persistent=False) - - # extract the modules from the graph - modules = compute_modules(g, multi, jaccard, min_presence, size=size) - - fams = set() - for mod in modules: - fams |= mod.families - - logging.getLogger().info(f"There are {len(fams)} families among {len(modules)} modules") - logging.getLogger().info(f"Computing modules took {round(time.time() - start_time, 2)} seconds") - - pangenome.add_modules(modules) - - pangenome.status["modules"] = "Computed" - pangenome.parameters["modules"] = {} - pangenome.parameters["modules"]["size"] = size - pangenome.parameters["modules"]["min_presence"] = min_presence - pangenome.parameters["modules"]["transitive"] = transitive - pangenome.parameters["modules"]["jaccard"] = jaccard - pangenome.parameters["modules"]["dup_margin"] = dup_margin - - -def compute_mod_graph(organisms, t=1, disable_bar=False): +def compute_mod_graph(organisms: list, t: int = 1, disable_bar: bool = False): """ Computes a graph using all provided genomes with a transitive closure of size t :param organisms: the list of organisms to compute the graph with - :type organisms: list[:class:`ppanggolin.genome.Organism`] :param t: the size of the transitive closure - :type t: int :param disable_bar: whether to show a progress bar or not - :type disable_bar: bool """ g = nx.Graph() @@ -103,19 +67,16 @@ def compute_mod_graph(organisms, t=1, disable_bar=False): return g -def compute_modules(g, multi, weight, min_fam, size): +def compute_modules(g: nx.Graph, multi: set, weight: float = 0.85, min_fam: int = 2, size: int = 3): """ Computes modules using a graph built by :func:`ppanggolin.mod.module.compute_mod_graph` and different parameters defining how restrictive the modules will be. 
:param g: The networkx graph from :func:`ppanggolin.mod.module.compute_mod_graph` - :type g: :class:`networkx.Graph` :param multi: a set of families :class:`ppanggolin.geneFamily.GeneFamily` considered multigenic - :type multi: set :param weight: the minimal jaccard under which edges are not considered - :type weight: float :param min_fam: the minimal number of presence under which the family is not considered - :type min_fam: int + : param size: Minimal number of gene family in a module """ # removing families with low presence @@ -133,22 +94,93 @@ def compute_modules(g, multi, weight, min_fam, size): return modules -def launch(args): +def predict_modules(pangenome: Pangenome, tmpdir: str, cpu: int = 1, dup_margin: float = 0.05, + size: int = 3, min_presence: int = 2, transitive: int = 4, jaccard: float = 0.85, + force: bool = False, disable_bar: bool = False): + """ + Main function to predict module + + :param pangenome: Pangenome object with Gene Families, Annotation and Partition + :param tmpdir: Path to temporary directory + :param cpu: Number of available core + :param dup_margin: minimum ratio of organisms in which family must have multiple genes to be considered duplicated + :param size: Minimal number of gene family in a module + :param min_presence: Minimum number of times the module needs to be present in the pangenome to be reported. + :param transitive: Size of the transitive closure used to build the graph. + :param jaccard: minimum jaccard similarity used to filter edges between gene families. 
+ :param force: Allow to force write on Pangenome file + :param disable_bar: Disable progress bar + """ + # check statuses and load info + check_pangenome_former_modules(pangenome, force) + check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_partitions=True, + disable_bar=disable_bar) + + # compute the graph with transitive closure size provided as parameter + start_time = time.time() + logging.getLogger().info("Building the graph...") + g = compute_mod_graph(pangenome.organisms, t=transitive, disable_bar=disable_bar) + logging.getLogger().info(f"Took {round(time.time() - start_time, 2)} seconds to build the graph to find modules in") + logging.getLogger().info(f"There are {nx.number_of_nodes(g)} nodes and {nx.number_of_edges(g)} edges") + + start_time = time.time() + # get all multigenic gene families + multi = pangenome.get_multigenics(dup_margin, persistent=False) + + # extract the modules from the graph + modules = compute_modules(g, multi, jaccard, min_presence, size=size) + + fams = set() + for mod in modules: + fams |= mod.families + + logging.getLogger().info(f"There are {len(fams)} families among {len(modules)} modules") + logging.getLogger().info(f"Computing modules took {round(time.time() - start_time, 2)} seconds") + + pangenome.add_modules(modules) + + pangenome.status["modules"] = "Computed" + pangenome.parameters["modules"] = {} + pangenome.parameters["modules"]["size"] = size + pangenome.parameters["modules"]["min_presence"] = min_presence + pangenome.parameters["modules"]["transitive"] = transitive + pangenome.parameters["modules"]["jaccard"] = jaccard + pangenome.parameters["modules"]["dup_margin"] = dup_margin + + +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ pangenome = Pangenome() pangenome.add_file(args.pangenome) - predict_modules(pangenome=pangenome, cpu=args.cpu, tmpdir=args.tmpdir, force=args.force, dup_margin=args.dup_margin, - 
size=args.size, min_presence=args.min_presence, transitive=args.transitive, jaccard=args.jaccard, + predict_modules(pangenome=pangenome, tmpdir=args.tmpdir, cpu=args.cpu, dup_margin=args.dup_margin, size=args.size, + min_presence=args.min_presence, transitive=args.transitive, jaccard=args.jaccard, force=args.force, disable_bar=args.disable_prog_bar) write_pangenome(pangenome, pangenome.file, args.force, disable_bar=args.disable_prog_bar) -def subparser(sub_parser): +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in Command line + + :param sub_parser : sub_parser for align command + + :return : parser arguments for align command + """ parser = sub_parser.add_parser("module", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser_module(parser) return parser -def parser_module(parser): +def parser_module(parser: argparse.ArgumentParser): + """ + Parser for specific argument of module command + + :param parser: parser for align argument + """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") diff --git a/ppanggolin/nem/partition.py b/ppanggolin/nem/partition.py index e01a0613..809315d0 100644 --- a/ppanggolin/nem/partition.py +++ b/ppanggolin/nem/partition.py @@ -14,6 +14,8 @@ from shutil import copytree # installed libraries +from typing import Union, Tuple + from tqdm import tqdm import plotly.offline as out_plotly import plotly.graph_objs as go @@ -30,8 +32,26 @@ samples = [] -def run_partitioning(nem_dir_path, nb_org, beta, free_dispersion, kval=3, seed=42, init="param_file", keep_files=False, - itermax=100, just_log_likelihood=False): +def run_partitioning(nem_dir_path: str, nb_org: int, beta: float = 2.5, free_dispersion: bool = False, kval: int = 3, + seed: int = 42, init: str = "param_file", keep_files: bool 
= False, itermax: int = 100, + just_log_likelihood: bool = False) -> Union[Tuple[dict, None, None], Tuple[int, float, float], + Tuple[dict, dict, float]]: + """ + Main function to make partitionning + + :param nem_dir_path: Path to directory with nem files + :param nb_org: Number of organisms + :param beta: strength of the smoothing using the graph topology during partitioning. 0 deactivate spatial smoothing + :param free_dispersion: use if the dispersion around the centroid vector of each partition during must be free. + :param kval: Number of partitions to use. Must be at least 2. If under 2, it will be detected automatically. + :param seed: seed used to generate random numbers + :param init: Initiate nem parameters with pangenome parameters or randomly + :param keep_files: True if you want to keep the NEM files + :param itermax: Maximum iteration to compute partitioning + :param just_log_likelihood: Return only nem parameter result + + :return: Nem parameters and if not just log likelihood the families associated to partition + """ logging.getLogger().debug("run_partitioning...") if init == "param_file": with open(nem_dir_path + "/nem_file_init_" + str(kval) + ".m", "w") as m_file: @@ -67,34 +87,15 @@ def run_partitioning(nem_dir_path, nb_org, beta, free_dispersion, kval=3, seed=4 # (INIT_SORT, init_random, init_param_file, INIT_FILE, INIT_LABEL, INIT_NB) = range(0,6) init_random, init_param_file = range(1, 3) logging.getLogger().debug("Running NEM...") - logging.getLogger().debug([nem_dir_path.encode('ascii') + b"/nem_file", - kval, - algo, - beta, - convergence, - convergence_th, - b"fuzzy", - itermax, - True, - model, - proportion, - variance_model, + logging.getLogger().debug([nem_dir_path.encode('ascii') + b"/nem_file", kval, algo, beta, convergence, + convergence_th, b"fuzzy", itermax, True, model, proportion, variance_model, init_param_file if init in ["param_file", "init_from_old"] else init_random, nem_dir_path.encode('ascii') + b"/nem_file_init_" + 
str(kval).encode('ascii') + b".m", nem_dir_path.encode('ascii') + b"/nem_file_" + str(kval).encode('ascii'), seed]) - nem_stats.nem(Fname=nem_dir_path.encode('ascii') + b"/nem_file", - nk=kval, - algo=algo, - beta=beta, - convergence=convergence, - convergence_th=convergence_th, - format=b"fuzzy", - it_max=itermax, - dolog=True, - model_family=model, - proportion=proportion, - dispersion=variance_model, + nem_stats.nem(Fname=nem_dir_path.encode('ascii') + b"/nem_file", nk=kval, algo=algo, beta=beta, + convergence=convergence, convergence_th=convergence_th, format=b"fuzzy", it_max=itermax, + dolog=True, model_family=model, proportion=proportion, dispersion=variance_model, init_mode=init_param_file if init in ["param_file", "init_from_old"] else init_random, init_file=nem_dir_path.encode('ascii') + b"/nem_file_init_" + str(kval).encode('ascii') + b".m", out_file_prefix=nem_dir_path.encode('ascii') + b"/nem_file_" + str(kval).encode('ascii'), @@ -165,7 +166,7 @@ def run_partitioning(nem_dir_path, nb_org, beta, free_dispersion, kval=3, seed=4 logging.getLogger().debug("partitioning did not work (the number of organisms used is probably too low), " "see logs here to obtain more details " + nem_dir_path + "/nem_file_" + str(kval) + ".log") - return [{}, None, None] # return empty objects. 
+ return {}, None, None # return empty objects except ValueError: # return the default partitions_list which correspond to undefined pass @@ -182,16 +183,40 @@ def run_partitioning(nem_dir_path, nb_org, beta, free_dispersion, kval=3, seed=4 os.remove(nem_dir_path + "/nem_file.str") if just_log_likelihood: - return tuple([kval, log_likelihood, entropy]) + return kval, log_likelihood, entropy else: return dict(zip(index_fam, partitions_list)), all_parameters, log_likelihood -def nem_single(args): +def nem_single(args: tuple) -> Union[Tuple[dict, None, None], Tuple[int, float, float], Tuple[dict, dict, float]]: + """ + Allow to run partitioning in multiprocessing to evaluate partition number + + :param args: {nem_dir_path: str, nb_org: int, beta: float, free_dispersion: bool, kval: int, seed: int, + init: str, keep_files: bool, itermax: int, just_log_likelihood: bool} + :return: Result of run partitioning + """ return run_partitioning(*args) -def partition_nem(index, tmpdir, beta, sm_degree, free_dispersion, kval, seed, init, keep_tmp_files): +def partition_nem(index: int, tmpdir: str, kval: int, beta: float = 2.5, sm_degree: int = 10, + free_dispersion: bool = False, seed: int = 42, init: str = "param_file", + keep_tmp_files: bool = False) -> Union[Tuple[dict, None, None], Tuple[int, float, float], + Tuple[dict, dict, float]]: + """ + + :param index: Index of the sample group + :param tmpdir: temporary directory path + :param kval: Number of partitions to use + :param beta: strength of the smoothing using the graph topology during partitioning. 0 deactivate spatial smoothing + :param sm_degree: Maximum degree of the nodes to be included in the smoothing process. + :param free_dispersion: use if the dispersion around the centroid vector of each partition during must be free. 
+ :param seed: seed used to generate random numbers + :param init: Initiate nem parameters with pangenome parameters or randomly + :param keep_tmp_files: True if you want to keep the temporary NEM files + + :return: + """ currtmpdir = tmpdir + "/" + str(index) # unique directory name samp = samples[index] # org_samples accessible because it is a global variable. @@ -201,12 +226,26 @@ def partition_nem(index, tmpdir, beta, sm_degree, free_dispersion, kval, seed, i seed=seed, init=init, keep_files=keep_tmp_files) -def nem_samples(pack): - # run partitioning +def nem_samples(pack: tuple) -> Union[Tuple[dict, None, None], Tuple[int, float, float], Tuple[dict, dict, float]]: + """ + run partitioning + :param pack: {index: int, tmpdir: str, beta: float, sm_degree: int, free_dispersion: bool, + kval: int, seed: int, init: str, keep_tmp_files: bool} + :return: + """ return partition_nem(*pack) -def write_nem_input_files(tmpdir, organisms, sm_degree): +def write_nem_input_files(tmpdir: str, organisms: set, sm_degree: int = 10) -> (float, int): + """ + Create and format input files for partitioning with NEM + + :param tmpdir: temporary directory path + :param organisms: Set of organism from pangenome + :param sm_degree: Maximum degree of the nodes to be included in the smoothing process. 
+ + :return: total edge weigth to ponderate beta and number of families + """ mk_outdir(tmpdir, force=False) total_edges_weight = 0 @@ -265,24 +304,27 @@ def write_nem_input_files(tmpdir, organisms, sm_degree): return total_edges_weight / 2, len(index_fam) -def evaluate_nb_partitions(organisms, sm_degree, free_dispersion, chunk_size, krange, icl_margin, draw_icl, - cpu, tmpdir, seed, outputdir, disable_bar=False): +def evaluate_nb_partitions(organisms: set, tmpdir: str, outputdir: str = None, sm_degree: int = 10, + free_dispersion: bool = False, chunk_size: int = 500, krange: list = None, + icl_margin: float = 0.05, draw_icl: bool = False, cpu: int = 1, seed: int = 42, + disable_bar: bool = False) -> int: """ Evaluate the optimal number of partition for the pangenome - :param organisms: - :param sm_degree: - :param free_dispersion: - :param chunk_size: - :param krange: - :param icl_margin: - :param draw_icl: - :param cpu: - :param tmpdir: - :param seed: - :param outputdir: - :param disable_bar: - :return: + :param organisms: Set of organisms from pangenome + :param tmpdir: temporary directory path + :param outputdir: output directory path to draw ICL + :param sm_degree: Maximum degree of the nodes to be included in the smoothing process. + :param free_dispersion: use if the dispersion around the centroid vector of each partition during must be free. + :param chunk_size: Size of the chunks when performing partitioning using chunks of organisms. + :param krange: Range of K values to test when detecting K automatically. + :param icl_margin: margin use to select the lowest K in maximizing ICL + :param draw_icl: draw the ICL curve for all the tested K values. 
+ :param cpu: Number of available core + :param seed: seed used to generate random numbers + :param disable_bar: Disable progress bar + + :return: Ideal number of partition computed """ newtmpdir = tmpdir + "/eval_partitions" @@ -301,7 +343,7 @@ def evaluate_nb_partitions(organisms, sm_degree, free_dispersion, chunk_size, kr all_log_likelihood = [] if cpu > 1: - bar = tqdm(range(len(args_partitionning)), unit="Number of number of partitions", disable=disable_bar) + bar = tqdm(range(len(args_partitionning)), unit="Number of partitions", disable=disable_bar) with get_context('fork').Pool(processes=cpu) as p: for result in p.imap_unordered(nem_single, args_partitionning): all_log_likelihood.append(result) @@ -313,16 +355,13 @@ def evaluate_nb_partitions(organisms, sm_degree, free_dispersion, chunk_size, kr for arguments in args_partitionning: all_log_likelihood.append(nem_single(arguments)) - def calculate_bic(log_llhood, nb_parameters, nb_points): - return log_llhood - 0.5 * (math.log(nb_points) * nb_parameters) - all_bics = defaultdict(float) all_icls = defaultdict(float) all_lls = defaultdict(float) for k_candidate, log_likelihood, entropy in all_log_likelihood: if log_likelihood is not None: nb_params = k_candidate * (len(select_organisms) + 1 + (len(select_organisms) if free_dispersion else 1)) - all_bics[k_candidate] = calculate_bic(log_likelihood, nb_params, nb_fam) + all_bics[k_candidate] = log_likelihood - 0.5 * (math.log(nb_params) * nb_fam) # Calculate BIC all_icls[k_candidate] = all_bics[k_candidate] - entropy all_lls[k_candidate] = log_likelihood @@ -376,8 +415,12 @@ def calculate_bic(log_llhood, nb_parameters, nb_points): return chosen_k -def check_pangenome_former_partition(pangenome, force): - """ checks pangenome status and .h5 files for former partitions, delete them if allowed or raise an error """ +def check_pangenome_former_partition(pangenome: Pangenome, force: bool = False): + """checks pangenome status and .h5 files for former partitions, 
delete them if allowed or raise an error + + :param pangenome: Pangenome object + :param force: Allow to force write on Pangenome file + """ if pangenome.status["partitioned"] == "inFile" and not force: raise Exception("You are trying to partition a pangenome already partitioned." " If you REALLY want to do that, " @@ -386,47 +429,29 @@ def check_pangenome_former_partition(pangenome, force): erase_pangenome(pangenome, partition=True) -def partition(pangenome, tmpdir, outputdir=None, force=False, beta=2.5, sm_degree=10, free_dispersion=False, - chunk_size=500, kval=-1, krange=None, icl_margin=0.05, draw_icl=False, cpu=1, seed=42, - keep_tmp_files=False, - disable_bar=False): +def partition(pangenome: Pangenome, tmpdir: str, outputdir: str = None, beta: float = 2.5, sm_degree: int = 10, + free_dispersion: bool = False, chunk_size: int = 500, kval: int = -1, krange: list = None, + icl_margin: float = 0.05, draw_icl: bool = False, cpu: int = 1, seed: int = 42, + keep_tmp_files: bool = False, force: bool = False, disable_bar: bool = False): """ - Partitioning the pangenome + Partitioning the pangenome :param pangenome: Pangenome containing GeneFamilies to align with sequence set - :type pangenome: Pangenome :param tmpdir: temporary directory path - :type tmpdir: str - :param outputdir: output directory path - :type outputdir: str - :param force: force writing in the pangenome and output directory - :param force: bool - :param beta: - :param beta: - :param sm_degree: - :param sm_degree: - :param free_dispersion: - :param free_dispersion: - :param chunk_size: - :param chunk_size: - :param kval: - :param kval: - :param krange: - :param krange: - :param icl_margin: - :param icl_margin: - :param draw_icl: - :param draw_icl: - :param cpu: - :param cpu: - :param seed: - :param seed: - :param keep_tmp_files: - :param keep_tmp_files: - :param disable_bar: - :param disable_bar: - - :return: + :param outputdir: output directory path to draw ICL + :param beta: strength of the 
smoothing using the graph topology during partitioning. 0 deactivate spatial smoothing + :param sm_degree: Maximum degree of the nodes to be included in the smoothing process. + :param free_dispersion: use if the dispersion around the centroid vector of each partition during must be free. + :param chunk_size: Size of the chunks when performing partitioning using chunks of organisms. + :param kval: Number of partitions to use. Must be at least 2. If under 2, it will be detected automatically. + :param krange: Range of K values to test when detecting K automatically. + :param icl_margin: margin use to select the lowest K in maximizing ICL + :param draw_icl: draw the ICL curve for all the tested K values. + :param cpu: Number of available core + :param seed: seed used to generate random numbers + :param keep_tmp_files: True if you want to keep the temporary NEM files + :param force: Allow to force write on Pangenome file + :param disable_bar: Disable progress bar """ kmm = [3, 20] if krange is None else krange global samples @@ -456,8 +481,8 @@ def partition(pangenome, tmpdir, outputdir=None, force=False, beta=2.5, sm_degre if kval < 2: pangenome.parameters["partition"]["computed_K"] = True logging.getLogger().info("Estimating the optimal number of partitions...") - kval = evaluate_nb_partitions(organisms, sm_degree, free_dispersion, chunk_size, kmm, icl_margin, - draw_icl, cpu, tmp_dir.name, seed, outputdir, disable_bar=disable_bar) + kval = evaluate_nb_partitions(organisms, tmp_dir.name, outputdir, sm_degree, free_dispersion, chunk_size, kmm, + icl_margin, draw_icl, cpu, seed, disable_bar=disable_bar) logging.getLogger().info(f"The number of partitions has been evaluated at {kval}") pangenome.parameters["partition"]["K"] = kval @@ -481,7 +506,12 @@ def partition(pangenome, tmpdir, outputdir=None, force=False, beta=2.5, sm_degre if chunk_size < len(organisms): validated = set() - def validate_family(res): + def validate_family(res: Union[(dict, None, None), (int, 
float, float), (dict, dict, float)]): + """ + Validate partition assignation to families + + :param res: Partitioning results + """ for node, nem_class in res[0].items(): cpt_partition[node][nem_class[0]] += 1 sum_partionning = sum(cpt_partition[node].values()) @@ -512,7 +542,7 @@ def validate_family(res): args = [] # tmpdir, beta, sm_degree, free_dispersion, K, seed for i, _ in enumerate(samples[prev:], start=prev): - args.append((i, tmp_dir.name, beta, sm_degree, free_dispersion, kval, seed, init, + args.append((i, tmp_dir.name, kval, beta, sm_degree, free_dispersion, seed, init, keep_tmp_files)) logging.getLogger().info("Launching NEM") @@ -561,29 +591,43 @@ def validate_family(res): copytree(tmp_dir.name, outputdir + "/NEM_files/") -def launch(args): +def launch(args: argparse.Namespace): """ - main code when launch partition from the command line. + Command launcher + + :param args: All arguments provide by user """ if args.draw_ICL or args.keep_tmp_files: mk_outdir(args.output, args.force) global pan pan.add_file(args.pangenome) - partition(pan, args.tmpdir, args.output, args.force, args.beta, args.max_degree_smoothing, args.free_dispersion, + partition(pan, args.tmpdir, args.output, args.beta, args.max_degree_smoothing, args.free_dispersion, args.chunk_size, args.nb_of_partitions, args.krange, args.ICL_margin, args.draw_ICL, args.cpu, args.seed, - args.keep_tmp_files, disable_bar=args.disable_prog_bar) + args.keep_tmp_files, args.force, disable_bar=args.disable_prog_bar) logging.getLogger().debug("Write partition in pangenome") write_pangenome(pan, pan.file, args.force, disable_bar=args.disable_prog_bar) logging.getLogger().debug("Partitioning is finished") -def subparser(sub_parser): +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in Command line + + :param sub_parser : sub_parser for align command + + :return : parser arguments for align command + """ parser = 
sub_parser.add_parser("partition", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser_partition(parser) return parser -def parser_partition(parser): +def parser_partition(parser: argparse.ArgumentParser): + """ + Parser for specific argument of partition command + + :param parser: parser for align argument + """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome.h5 file") @@ -617,7 +661,7 @@ def parser_partition(parser): "lowest K that is found within a given 'margin' of the maximal ICL value. Basically, " "change this option only if you truly understand it, otherwise just leave it be.") optional.add_argument("--draw_ICL", required=False, default=False, action="store_true", - help="Use if you can to draw the ICL curve for all of the tested K values. " + help="Use if you want to draw the ICL curve for all the tested K values. 
" "Will not be done if K is given.") optional.add_argument("--keep_tmp_files", required=False, default=False, action="store_true", help="Use if you want to keep the temporary NEM files") diff --git a/ppanggolin/nem/rarefaction.py b/ppanggolin/nem/rarefaction.py index c71fb7cd..c20e1441 100644 --- a/ppanggolin/nem/rarefaction.py +++ b/ppanggolin/nem/rarefaction.py @@ -13,6 +13,8 @@ import warnings # installed libraries +from typing import Union, Tuple + from tqdm import tqdm import gmpy2 import numpy @@ -32,25 +34,47 @@ samples = [] -def raref_nem(index, tmpdir, beta, sm_degree, free_dispersion, chunk_size, k, krange, seed): +def raref_nem(index: int, tmpdir: str, beta: float = 2.5, sm_degree: int = 10, free_dispersion: bool = False, + chunk_size: int = 500, kval: int = -1, krange: list = None, seed: int = 42) -> (dict, int): + """ + + :param index: Index of the sample group organisms + :param tmpdir: temporary directory path + :param beta: strength of the smoothing using the graph topology during partitioning. 0 deactivate spatial smoothing + :param sm_degree: Maximum degree of the nodes to be included in the smoothing process. + :param free_dispersion: use if the dispersion around the centroid vector of each partition during must be free. + :param chunk_size: Size of the chunks when performing partitioning using chunks of organisms. + :param kval: Number of partitions to use + :param krange: Range of K values to test when detecting K automatically. 
+ :param seed: seed used to generate random numbers + + :return: Count of each partition and paremeters for the given sample index + """ samp = samples[index] currtmpdir = tmpdir + "/" + str(index) + "/" - if k < 3: - k = ppp.evaluate_nb_partitions(samp, sm_degree, free_dispersion, chunk_size, krange, 0.05, False, 1, - tmpdir + "/" + str(index) + "_eval", seed, None) + kmm = [3, 20] if krange is None else krange + + if kval < 3: + kval = ppp.evaluate_nb_partitions(samp, tmpdir + "/" + str(index) + "_eval", None, sm_degree, free_dispersion, + chunk_size, kmm, 0.05, False, 1, seed) if len(samp) <= chunk_size: # all good, just write stuff. edges_weight, nb_fam = ppp.write_nem_input_files(tmpdir=currtmpdir, organisms=set(samp), sm_degree=sm_degree) cpt_partition = ppp.run_partitioning(currtmpdir, len(samp), beta * (nb_fam / edges_weight), free_dispersion, - kval=k, seed=seed, init="param_file")[0] + kval=kval, seed=seed, init="param_file")[0] else: # going to need multiple partitioning for this sample... 
families = set() cpt_partition = {} validated = set() cpt = 0 - def validate_family(result): + def validate_family(result: Union[Tuple[dict, None, None], Tuple[int, float, float], Tuple[dict, dict, float]]): + """ + Validate partition assignation to families + + :param result: Partitioning results + """ for node, nem_class in result[0].items(): cpt_partition[node][nem_class[0]] += 1 sum_partitioning = sum(cpt_partition[node].values()) @@ -87,14 +111,14 @@ def validate_family(result): for samp in org_samples: edges_weight, nb_fam = ppp.write_nem_input_files(currtmpdir + "/" + str(cpt) + "/", samp, sm_degree=sm_degree) - validate_family( - ppp.run_partitioning(currtmpdir + "/" + str(cpt) + "/", len(samp), beta * (nb_fam / edges_weight), - free_dispersion, kval=k, seed=seed, init="param_file")) + validate_family(ppp.run_partitioning(currtmpdir + "/" + str(cpt) + "/", len(samp), + beta * (nb_fam / edges_weight), free_dispersion, kval=kval, + seed=seed, init="param_file")) cpt += 1 if len(cpt_partition) == 0: - counts = {"persistent": "NA", "shell": "NA", "cloud": "NA", "undefined": "NA", "K": k} + counts = {"persistent": "NA", "shell": "NA", "cloud": "NA", "undefined": "NA", "K": kval} else: - counts = {"persistent": 0, "shell": 0, "cloud": 0, "undefined": 0, "K": k} + counts = {"persistent": 0, "shell": 0, "cloud": 0, "undefined": 0, "K": kval} for val in cpt_partition.values(): if isinstance(val, str): @@ -112,29 +136,41 @@ def validate_family(result): return counts, index -def launch_raref_nem(args): +def launch_raref_nem(args: tuple) -> (dict, int): + """ + Launch raref_nem in multiprocessing + + :param args: {index: int, tmpdir: str, beta: float, sm_degree: int, free_dispersion: bool, + chunk_size: int, kval: int, krange: list, seed: int} + :return: Count of each partition and paremeters for the given sample index + """ return raref_nem(*args) -def draw_curve(output, max_sampling, data): +def draw_curve(output: str, data: list, max_sampling: int = 10): + """ + 
Draw the rarefaction curve and associated data + + :param output: output directory path to draw the rarefaction curve and associated data + :param max_sampling: Maximum number of organisms in a sample + :param data: + """ logging.getLogger().info("Drawing the rarefaction curve ...") raref_name = output + "/rarefaction.csv" raref = open(raref_name, "w") - raref.write(",".join( - ["nb_org", "persistent", "shell", "cloud", "undefined", "exact_core", "exact_accessory", "soft_core", - "soft_accessory", "pan", "K"]) + "\n") + raref.write(",".join(["nb_org", "persistent", "shell", "cloud", "undefined", "exact_core", "exact_accessory", + "soft_core", "soft_accessory", "pangenome", "K"]) + "\n") for part in data: - raref.write(",".join(map(str, - [part["nborgs"], part["persistent"], part["shell"], part["cloud"], part["undefined"], - part["exact_core"], part["exact_accessory"], part["soft_core"], - part["soft_accessory"], part["exact_core"] + part["exact_accessory"], - part["K"]])) + "\n") + raref.write(",".join(map(str, [part["nborgs"], part["persistent"], part["shell"], part["cloud"], + part["undefined"], part["exact_core"], part["exact_accessory"], + part["soft_core"], part["soft_accessory"], part["exact_core"] + + part["exact_accessory"], part["K"]])) + "\n") raref.close() - def heap_law(n, p_kappa, p_gamma): + def heap_law(n, p_kappa, p_gamma) -> float: return p_kappa * n ** p_gamma - def poly_area(p_x, p_y): + def poly_area(p_x: list, p_y: list) -> float: return 0.5 * numpy.abs(numpy.dot(p_x, numpy.roll(p_y, 1)) - numpy.dot(p_y, numpy.roll(p_x, 1))) annotations = [] @@ -143,7 +179,7 @@ def poly_area(p_x, p_y): params_file = open(output + "/rarefaction_parameters" + ".csv", "w") params_file.write("partition,kappa,gamma,kappa_std_error,gamma_std_error,IQR_area\n") for partition in ["persistent", "shell", "cloud", "undefined", "exact_core", "exact_accessory", "soft_core", - "soft_accessory", "pan"]: + "soft_accessory", "pangenome"]: percentiles_75 = Series({i: 
numpy.nanpercentile(data_raref[data_raref["nb_org"] == i][partition], 75) for i in range(1, max_sampling + 1)}).dropna() percentiles_25 = Series({i: numpy.nanpercentile(data_raref[data_raref["nb_org"] == i][partition], 25) for i in @@ -161,7 +197,7 @@ def poly_area(p_x, p_y): x += list(reversed(percentiles_25.index.tolist())) area_iqr = poly_area(x, percentiles_25.tolist() + percentiles_75.tolist()) nb_org_min_fitting = 15 - colors = {"pan": "black", "exact_accessory": "#EB37ED", "exact_core": "#FF2828", "soft_core": "#c7c938", + colors = {"pangenome": "black", "exact_accessory": "#EB37ED", "exact_core": "#FF2828", "soft_core": "#c7c938", "soft_accessory": "#996633", "shell": "#00D860", "persistent": "#F7A507", "cloud": "#79DEFF", "undefined": "#828282"} try: @@ -289,9 +325,32 @@ def poly_area(p_x, p_y): params_file.close() -def make_rarefaction_curve(pangenome, output, tmpdir, beta=2.5, depth=30, min_sampling=1, max_sampling=100, - sm_degree=10, free_dispersion=False, chunk_size=500, k=-1, cpu=1, seed=42, kestimate=False, - krange=None, soft_core=0.95, disable_bar=False): +def make_rarefaction_curve(pangenome: Pangenome, output: str, tmpdir: str, beta: float = 2.5, depth: int = 30, + min_sampling: int = 1, max_sampling: int = 100, sm_degree: int = 10, + free_dispersion: bool = False, chunk_size: int = 500, kval: int = -1, krange: list = None, + cpu: int = 1, seed: int = 42, kestimate: bool = False, soft_core: float = 0.95, + disable_bar: bool = False): + """ + Main function to make the rarefaction curve + + :param pangenome: Pangenome containing GeneFamilies to align with sequence set + :param output: output directory path to draw the rarefaction curve and associated data + :param tmpdir: temporary directory path + :param beta: strength of the smoothing using the graph topology during partitioning. 
0 deactivate spatial smoothing + :param depth: Number of samplings at each sampling point + :param min_sampling: Minimum number of organisms in a sample + :param max_sampling: Maximum number of organisms in a sample + :param sm_degree: Maximum degree of the nodes to be included in the smoothing process. + :param free_dispersion: use if the dispersion around the centroid vector of each partition during must be free. + :param chunk_size: Size of the chunks when performing partitioning using chunks of organisms. + :param kval: Number of partitions to use. Must be at least 2. If under 2, it will be detected automatically. + :param krange: Range of K values to test when detecting K automatically. + :param cpu: Number of available core + :param seed: seed used to generate random numbers + :param kestimate: recompute the number of partitions for each sample between the values provided by krange + :param soft_core: Soft core threshold + :param disable_bar: Disable progress bar + """ if krange is None: krange = [3, -1] ppp.pan = pangenome # use the global from partition to store the pan, so that it is usable @@ -311,15 +370,15 @@ def make_rarefaction_curve(pangenome, output, tmpdir, beta=2.5, depth=30, min_sa else: max_sampling = int(max_sampling) - if k < 3 and kestimate is False: # estimate K once and for all. + if kval < 3 and kestimate is False: # estimate K once and for all. 
try: - k = ppp.pan.parameters["partition"]["K"] - logging.getLogger().info(f"Reuse the number of partitions {k}") + kval = ppp.pan.parameters["partition"]["K"] + logging.getLogger().info(f"Reuse the number of partitions {kval}") except KeyError: logging.getLogger().info("Estimating the number of partitions...") - k = ppp.evaluate_nb_partitions(pangenome.organisms, sm_degree, free_dispersion, chunk_size, krange, 0.05, - False, cpu, tmpdir, seed, None) - logging.getLogger().info(f"The number of partitions has been evaluated at {k}") + kval = ppp.evaluate_nb_partitions(set(pangenome.organisms), tmpdir, None, sm_degree, free_dispersion, + chunk_size, krange, 0.05, False, cpu, seed) + logging.getLogger().info(f"The number of partitions has been evaluated at {kval}") logging.getLogger().info("Extracting samples ...") all_samples = [] @@ -331,8 +390,8 @@ def make_rarefaction_curve(pangenome, output, tmpdir, beta=2.5, depth=30, min_sa logging.getLogger().info("Computing bitarrays for each family...") index_org = pangenome.compute_family_bitarrays() - logging.getLogger().info( - f"Done computing bitarrays. Comparing them to get exact and soft core stats for {len(all_samples)} samples...") + logging.getLogger().info("Done computing bitarrays. Comparing them to get exact and soft core stats for " + f"{len(all_samples)} samples...") bar = tqdm(range(len(all_samples) * len(pangenome.gene_families)), unit="gene family", disable=disable_bar) for samp in all_samples: # make the sample's organism bitarray. 
@@ -368,7 +427,7 @@ def make_rarefaction_curve(pangenome, output, tmpdir, beta=2.5, depth=30, min_sa args = [] for index, samp in enumerate(samples): - args.append((index, tmpdir, beta, sm_degree, free_dispersion, chunk_size, k, krange, seed)) + args.append((index, tmpdir, beta, sm_degree, free_dispersion, chunk_size, kval, krange, seed)) with get_context('fork').Pool(processes=cpu) as p: # launch partitioning @@ -382,37 +441,52 @@ def make_rarefaction_curve(pangenome, output, tmpdir, beta=2.5, depth=30, min_sa logging.getLogger().info("Done partitioning everything") warnings.filterwarnings("ignore") - draw_curve(output, max_sampling, samp_nb_per_part) + draw_curve(output, samp_nb_per_part, max_sampling) warnings.resetwarnings() tmpdir_obj.cleanup() logging.getLogger().info("Done making the rarefaction curves") -def launch(args): +def launch(args: argparse.Namespace): """ - main code when launch partition from the command line. + Command launcher + + :param args: All arguments provide by user """ mk_outdir(args.output, args.force) pangenome = Pangenome() - pangenome.add_file(args.pan) + pangenome.add_file(args.pangenome) make_rarefaction_curve(pangenome=pangenome, output=args.output, tmpdir=args.tmpdir, beta=args.beta, depth=args.depth, min_sampling=args.min, max_sampling=args.max, sm_degree=args.max_degree_smoothing, free_dispersion=args.free_dispersion, - chunk_size=args.chunk_size, k=args.nb_of_partitions, cpu=args.cpu, seed=args.seed, - kestimate=args.reestimate_K, krange=args.krange, soft_core=args.soft_core, + chunk_size=args.chunk_size, kval=args.nb_of_partitions, krange=args.krange, cpu=args.cpu, + seed=args.seed, kestimate=args.reestimate_K, soft_core=args.soft_core, disable_bar=args.disable_prog_bar) -def subparser(sub_parser): - parser = sub_parser.add_parser("rarefaction", formatter_class=argparse.ArgumentDefaultsHelpFormatter) +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in 
Command line + + :param sub_parser : sub_parser for align command + + :return : parser arguments for align command + """ + parser = sub_parser.add_parser("rarefaction", description='Compute the rarefaction curve of the pangenome', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser_rarefaction(parser) return parser -def parser_rarefaction(parser): +def parser_rarefaction(parser: argparse.ArgumentParser): + """ + Parser for specific argument of graph command + + :param parser: parser for align argument + """ required = parser.add_argument_group(title="Required arguments", description="One of the following arguments is required :") - required.add_argument('-p', '--pan', required=True, type=str, help="The pan .h5 file") + required.add_argument('-p', '--pangenome', required=True, type=str, help="The pangenome .h5 file") optional = parser.add_argument_group(title="Optional arguments") optional.add_argument("-b", "--beta", required=False, default=2.5, type=float, diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 7ac296c1..ddfe094f 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -2,18 +2,18 @@ # coding: utf8 # default libraries -from collections.abc import Iterable +from typing import Iterator, List, Union, Dict, Set, Iterable # local libraries -from ppanggolin.genome import Organism -from ppanggolin.region import Region +from ppanggolin.genome import Organism, Gene +from ppanggolin.region import Region, Spot, Module from ppanggolin.geneFamily import GeneFamily from ppanggolin.edge import Edge class Pangenome: """ - This is a class representing your pan. It is used as a basic unit for all the analysis to access to the + This is a class representing your pangenome. It is used as a basic unit for all the analysis to access to the different elements of your pan, such as organisms, contigs, genes or gene families. 
It has setter and getter methods for most elements in your pan, and you can use those to add new elements to it, or get objects that have a specific identifier to manipulate them directly. @@ -49,14 +49,13 @@ def __init__(self): } self.parameters = {} - def add_file(self, pangenome_file): - """Links an HDF5 file to the pan. If needed elements will be loaded from this file, + def add_file(self, pangenome_file: str): + """Links an HDF5 file to the pangenome. If needed elements will be loaded from this file, and anything that is computed will be saved to this file when :func:`ppanggolin.formats.writeBinaries.writePangenome` is called. - :param pangenome_file: A string representing the filepath to the hdf5 pan file + :param pangenome_file: A string representing the filepath to the hdf5 pangenome file to be either used or created - :type pangenome_file: str """ from ppanggolin.formats.readBinaries import get_status # importing on call instead of importing on top to avoid cross-reference problems. @@ -65,11 +64,10 @@ def add_file(self, pangenome_file): """ Gene Methods""" @property - def genes(self): + def genes(self) -> list: """Creates the geneGetter if it does not exist, and returns all the genes of all organisms in the pangenome. 
:return: list of :class:`ppanggolin.genome.Gene` - :rtype: list """ try: return list(self._geneGetter.values()) @@ -77,12 +75,10 @@ def genes(self): self._mk_gene_getter() # make it return self.genes # return what was expected - def _yield_genes(self): - """ - Use a generator to get all the genes of a pan + def _yield_genes(self) -> Iterator[Gene]: + """ Use a generator to get all the genes of a pangenome - :return: an iterator of :class:`ppanggolin.genome.Gene` - :rtype: Iterator[:class:`ppanggolin.genome.Gene`] + :return: an iterator of Gene """ if self.number_of_organisms() > 0: # if we have organisms, they're supposed to have genes for org in self.organisms: @@ -97,85 +93,91 @@ def _yield_genes(self): def _mk_gene_getter(self): """ - Builds the :attr:`ppanggolin.pan.Pangenome._geneGetter` of the pan + Builds the attribute _geneGetter of the pangenome - Since the genes are never explicitly 'added' to a pan (but rather to a gene family, or a contig), - the pan cannot directly extract a gene from a geneID since it does not 'know' them. - if at some point we want to extract genes from a pan we'll create a geneGetter. - The assumption behind this is that the pan has been filled and no more gene will be added. + Since the genes are never explicitly 'added' to a pangenome (but rather to a gene family, or a contig), + the pangenome cannot directly extract a gene from a geneID since it does not 'know' them. + if at some point we want to extract genes from a pangenome we'll create a geneGetter. + The assumption behind this is that the pangenome has been filled and no more gene will be added. 
""" self._geneGetter = {} for gene in self._yield_genes(): self._geneGetter[gene.ID] = gene - def get_gene(self, gene_id): - """returns the gene that has the given `geneID` + def get_gene(self, gene_id: str) -> Gene: + """returns the gene that has the given geneID :param gene_id: The gene ID to look for - :type gene_id: any + :return: returns the gene that has the ID `geneID` - :rtype: :class:`ppanggolin.genome.Gene` - :raises KeyError: If the `geneID` is not in the pan + + :raises KeyError: If the `geneID` is not in the pangenome """ try: return self._geneGetter[gene_id] except AttributeError: - # in that case, either the gene getter has not been computed, or the geneID is not in the pan. + # in that case, either the gene getter has not been computed, or the geneID is not in the pangenome. self._mk_gene_getter() # make it - return self.get_gene( - gene_id) # return what was expected. If the geneID does not exist it will raise an error. + return self.get_gene(gene_id) # return what was expected. If geneID does not exist it will raise an error. 
except KeyError: - raise KeyError(f"{gene_id} does not exist in the pan.") + raise KeyError(f"{gene_id} does not exist in the pangenome.") + + def number_of_gene(self) -> int: + """Returns the number of gene present in the pangenome + + :return: the number of gene families + """ + try: + return len(self._geneGetter) + except AttributeError: # in that case the gene getter has not been computed + self._mk_gene_getter() # make it + return len(self._geneGetter) """Gene families methods""" @property - def gene_families(self): - """returns all the gene families in the pan + def gene_families(self) -> List[GeneFamily]: + """returns all the gene families in the pangenome :return: list of :class:`ppanggolin.geneFamily.GeneFamily` - :rtype: list """ return list(self._famGetter.values()) - def _create_gene_family(self, name): + def _create_gene_family(self, name: str) -> GeneFamily: """Creates a gene family object with the given `name` :param name: the name to give to the gene family. Must not exist already. 
- :type name: any + :return: the created GeneFamily object - :rtype: :class:`ppanggolin.geneFamily.GeneFamily` """ new_fam = GeneFamily(family_id=self.max_fam_id, name=name) self.max_fam_id += 1 self._famGetter[new_fam.name] = new_fam return new_fam - def number_of_gene_families(self): - """Returns the number of gene families present in the pan + def number_of_gene_families(self) -> int: + """Returns the number of gene families present in the pangenome :return: the number of gene families - :rtype: int """ return len(self._famGetter) - def get_gene_family(self, name): + def get_gene_family(self, name: str) -> GeneFamily: """returns the gene family that has the given `name` :param name: The gene family name to look for - :type name: any + :return: returns the gene family that has the name `name` - :rtype: :class:`ppanggolin.geneFamily.GeneFamily` """ return self._famGetter[name] - def add_gene_family(self, name): + def add_gene_family(self, name: str): """ - Get the :class:`ppanggolin.geneFamily.GeneFamily` object that has the given `name`. If it does not exist, - creates it. - returns the geneFamily object. + Get the :class:`ppanggolin.geneFamily.GeneFamily` object that has the given `name`. + If it does not exist, creates it. + + :param name: The gene family name to get if it exists, and create otherwise. - :param name: The gene family name to get if it exists, and create otherwise. - :type name: str + :return: GeneFamily object. 
""" fam = self._famGetter.get(name) if fam is None: @@ -184,25 +186,22 @@ def add_gene_family(self, name): """Graph methods""" @property - def edges(self): - """returns all the edges in the pan graph + def edges(self) -> list: + """returns all the edges in the pangenome graph - :return: list of :class:`ppanggolin.pan.Edge` - :rtype: list + :return: list of :class:`ppanggolin.pangenome.Edge` """ return list(self._edgeGetter.values()) - def add_edge(self, gene1, gene2): + def add_edge(self, gene1: Gene, gene2: Gene) -> Edge: """ - Adds an edge between the two gene families that the two given genes belong to. Genes object are expected, - and they are also expected to have a family assigned + Adds an edge between the two gene families that the two given genes belong to. + Genes object are expected, and they are also expected to have a family assigned :param gene1: The first gene - :type gene1: :class:`ppanggolin.genome.Gene` :param gene2: The second gene - :type gene2: :class:`ppanggolin.genome.Gene` + :return: the created Edge - :rtype: :class:`ppanggolin.pangenome.Edge` """ key = frozenset([gene1.family, gene2.family]) edge = self._edgeGetter.get(key) @@ -213,51 +212,56 @@ def add_edge(self, gene1, gene2): edge.add_genes(gene1, gene2) return edge + def number_of_edge(self) -> int: + """Returns the number of edge present in the pangenome + + :return: the number of gene families + """ + return len(self._edgeGetter) + """Organism methods""" @property - def organisms(self): - """returns all the organisms in the pan + def organisms(self) -> List[Organism]: + """returns all the organisms in the pangenome :return: list of :class:`ppanggolin.genome.Organism` - :rtype: list """ return list(self._orgGetter.values()) - def number_of_organisms(self): - """Returns the number of organisms present in the pan + def number_of_organisms(self) -> int: + """Returns the number of organisms present in the pangenome :return: the number of organism - :rtype: int """ return 
len(self._orgGetter) - def get_organism(self, org_name): + def get_organism(self, org_name: str) -> Organism: """ - Get an organism that is expected to be in the pan using its name, which is supposedly unique. + Get an organism that is expected to be in the pangenome using its name, which is supposedly unique. Raises an error if the organism does not exist. - :param org_name: Name of the :class:`ppanggolin.genome.Organism` to get - :type org_name: str + :param org_name: Name of the Organism to get + :return: The related Organism object - :rtype: :class:`ppanggolin.genome.Organism` - :raises KeyError: If the provided name is not in the pan + + :raises KeyError: If the provided name is not in the pangenome """ try: return self._orgGetter[org_name] except KeyError: - raise KeyError(f"{org_name} does not seem to be in your pan") + raise KeyError(f"{org_name} does not seem to be in your pangenome") - def add_organism(self, new_org): + def add_organism(self, new_org: Union[Organism, str]) -> Organism: """ - adds an organism that did not exist previously in the pan if an :class:`ppanggolin.genome.Organism` - object is provided. If an organism with the same name exists it will raise an error. - If a :class:`str` object is provided, will return the corresponding organism that has this name + adds an organism that did not exist previously in the pangenome if an Organism object is provided. + If an organism with the same name exists it will raise an error. + If a str object is provided, will return the corresponding organism that has this name OR create a new one if it does not exist. 
- :param new_org: Organism to add to the pan - :type new_org: :class:`ppanggolin.genome.Organism` or str + :param new_org: Organism to add to the pangenome + :return: The created organism - :rtype: :class:`ppanggolin.genome.Organism` + :raises TypeError: if the provided `newOrg` is neither a str nor a :class:`ppanggolin.genome.Organism` """ if isinstance(new_org, Organism): @@ -276,11 +280,10 @@ def add_organism(self, new_org): raise TypeError("Provide an Organism object or a str that will serve as organism name") return new_org - def get_index(self): # will not make a new index if it exists already + def get_org_index(self) -> Dict[Organism, int]: # will not make a new index if it exists already """Creates an index for Organisms (each organism is assigned an Integer). - :return: A dictionary with :class:`ppanggolin.genome.Organism` as key and `int` as value. - :rtype: dict[:class:`ppanggolin.genome.Organism`, int] + :return: The index of organisms in pangenome """ if self._org_index is None: # then the index does not exist yet self._org_index = {} @@ -288,29 +291,29 @@ def get_index(self): # will not make a new index if it exists already self._org_index[org] = index return self._org_index - def compute_family_bitarrays(self, part='all'): - """Based on the index generated by :meth:`ppanggolin.pan.Pangenome.getIndex`, generated a bitarray - for each gene family. + def compute_family_bitarrays(self, part: str = 'all') -> Dict[Organism, int]: + """ + Based on the index generated by get_org_index, generate a bitarray for each gene family. If the family j is present in the organism with the index i, the bit at position i will be 1. If it is not, the bit will be 0. The bitarrays are gmpy2.xmpz object. - :return: A dictionnary with :class:`ppanggolin.genome.Organism` as key and `int` as value. 
- :rtype: dict[:class:`ppanggolin.genome.Organism`, int] + :param part: Filter the organism in function of the given partition + + :return: the index of organisms in pangenome """ if self._org_index is None: # then the bitarrays don't exist yet, since the org index does not exist either. - self.get_index() + self.get_org_index() for fam in self.gene_families: fam.mk_bitarray(self._org_index, partition=part) # case where there is an index but the bitarrays have not been computed??? return self._org_index - def get_fam_index(self): # will not make a new index if it exists already + def get_fam_index(self) -> Dict[GeneFamily, int]: # will not make a new index if it exists already """Creates an index for gene families (each family is assigned an Integer). - :return: A dictionary with :class:`ppanggolin.genome.Organism` as key and `int` as value. - :rtype: dict[:class:`ppanggolin.genome.Organism`, int] + :return: The index of families in pangenome """ if self._fam_index is None: # then the index does not exist yet self._fam_index = {} @@ -318,15 +321,16 @@ def get_fam_index(self): # will not make a new index if it exists already self._fam_index[fam] = index return self._fam_index - def compute_org_bitarrays(self, part='all'): - """Based on the index generated by :meth:`ppanggolin.pan.Pangenome.get_fam_index`, generated a bitarray - for each gene family. + def compute_org_bitarrays(self, part='all') -> Dict[GeneFamily, int]: + """ + Based on the index generated by get_fam_index, generate a bitarray for each gene family. If the family j is present in the organism with the index i, the bit at position i will be 1. If it is not, the bit will be 0. The bitarrays are gmpy2.xmpz object. - :return: A dictionary with :class:`ppanggolin.genome.Organism` as key and `int` as value. 
- :rtype: dict[:class:`ppanggolin.genome.Organism`, int] + :param part: Filter the organism in function of the given partition + + :return: The index of gene families in pangenome """ if self._fam_index is None: # then the bitarrays don't exist yet, since the org index does not exist either. @@ -338,40 +342,36 @@ def compute_org_bitarrays(self, part='all'): """RGP methods""" @property - def regions(self): - """returns all the regions (RGP) in the pan + def regions(self) -> list: + """returns all the regions (RGP) in the pangenome - :return: list of :class:`ppanggolin.region.Region` - :rtype: list + :return: list of RGP """ return list(self._regionGetter.values()) - def get_or_add_region(self, region_name): - """Returns a region with the given `regionName`. Creates it if it does not exist. + def get_region(self, region_name: str) -> Region: + """Returns a region with the given region_name. Creates it if it does not exist. :param region_name: The name of the region to return - :type region_name: str + :return: The region - :rtype: :class:`ppanggolin.region.Region` """ try: return self._regionGetter[region_name] - except KeyError: # then the region is not stored in this pan. + except KeyError: # then the region is not stored in this pangenome. new_region = Region(region_name) self._regionGetter[region_name] = new_region return new_region - def get_multigenics(self, dup_margin, persistent=True): + def get_multigenics(self, dup_margin: float, persistent: bool = True) -> Set[GeneFamily]: """ - Returns the multigenic persistent families of the pan graph. A family will be considered multigenic + Returns the multigenic persistent families of the pangenome graph. A family will be considered multigenic if it is duplicated in more than `dup_margin` of the genomes where it is present. 
:param dup_margin: the ratio of presence in multicopy above which a gene family is considered multigenic - :type dup_margin: float :param persistent: if we consider only the persistent genes - :type persistent: bool - :return: a `set` of gene families considered multigenic - :rtype: set[:class:`ppanggolin.geneFamily.GeneFamily`] + + :return: set of gene families considered multigenic """ multigenics = set() for fam in self.gene_families: @@ -384,12 +384,12 @@ def get_multigenics(self, dup_margin, persistent=True): # (duplicated in more than {dup_margin} of the genomes)") return multigenics - def add_regions(self, region_group): - """Takes an Iterable or a Region object and adds it to the pan + def add_regions(self, region_group: Union[Region, Iterable[Region]]): + """Takes an Iterable or a Region object and adds it to the pangenome + + :param region_group: a region or an Iterable of regions to add to the pangenome - :param region_group: a region or an Iterable of regions to add to the pan - :type region_group: :class:`ppanggolin.region.Region` or Iterable[:class:`ppanggolin.region.Region`] - :raises TypeError: if regionGroup is neither a Region nor an Iterable[:class:`ppanggolin.region.Region`] + :raises TypeError: if regionGroup is neither a Region nor an Iterable[Region] """ old_len = len(self._regionGetter) if isinstance(region_group, Iterable): @@ -403,33 +403,46 @@ def add_regions(self, region_group): raise TypeError(f"An iterable or a 'Region' type object were expected, " f"but you provided a {type(region_group)} type object") + def number_of_rgp(self) -> int: + """Returns the number of gene families present in the pan + + :return: the number of gene families + """ + return len(self._regionGetter) + """Spot methods""" - def add_spots(self, spots): - """Adds the given iterable of spots to the pan. + def add_spots(self, spots: Iterable[Spot]): + """Adds the given iterable of spots to the pangenome. :param spots: An iterable of :class:`ppanggolin.region.Spot`. 
- :type spots: Iterable[:class:`ppanggolin.region.Spot`] """ self.spots |= set(spots) + def number_of_spots(self) -> int: + """Returns the number of spots present in the pangenome + + :return: the number of spots + """ + return len(self.spots) + """Modules methods""" - def add_modules(self, modules): - """Adds the given iterable of modules to the pan + def add_modules(self, modules: Iterable[Module]): + """Adds the given iterable of modules to the pangenome :param modules: an iterable of :class:`ppanggolin.module.Module` - :type modules: Iterable[:class:`ppanggolin.module.Module`] """ self.modules |= set(modules) - def compute_mod_bitarrays(self, part='all'): - """Based on the index generated by :meth:`ppanggolin.pan.Pangenome.get_fam_index`, generated a bitarray - for each gene family. + def compute_mod_bitarrays(self, part: str = 'all') -> Dict[GeneFamily, int]: + """Based on the index generated by get_fam_index, generates a bitarray + for each gene family present in modules. If the family j is present in the module with the index i, the bit at position i will be 1. If it is not, the bit will be 0. The bitarrays are gmpy2.xmpz object. - :return: A dictionary with :class:`ppanggolin.genome.Organism` as key and `int` as value. - :rtype: dict[:class:`ppanggolin.genome.Organism`, int] + :param part: Filter the organism in function of the given partition + + :return: A dictionary with GeneFamily as key and int as value. """ if self._fam_index is None: # then the bitarrays don't exist yet, since the org index does not exist either. @@ -438,3 +451,10 @@ def compute_mod_bitarrays(self, part='all'): module.mk_bitarray(index=self._fam_index, partition=part) # case where there is an index but the bitarrays have not been computed???
return self._fam_index + + def number_of_modules(self) -> int: + """Returns the number of modules present in the pangenome + + :return: the number of modules + """ + return len(self.modules) diff --git a/ppanggolin/region.py b/ppanggolin/region.py index 4857dab8..700f5bf1 100644 --- a/ppanggolin/region.py +++ b/ppanggolin/region.py @@ -2,19 +2,28 @@ # coding: utf8 # default libraries +from __future__ import annotations import logging from collections.abc import Iterable # installed libraries +from typing import Dict + import gmpy2 # local libraries -from ppanggolin.genome import Gene +from ppanggolin.genome import Gene, Organism, Contig from ppanggolin.geneFamily import GeneFamily class Region: - def __init__(self, region_id): + """ + This class represent a region of genomic plasticity. + + :param region_id: identifier of the region + """ + + def __init__(self, region_id: str): self.genes = [] self.name = region_id self.score = 0 @@ -22,8 +31,14 @@ def __init__(self, region_id): def __hash__(self): return id(self) - def __eq__(self, other): - """ expects another Region type object. Will test whether two Region objects have the same gene families""" + def __eq__(self, other: Region) -> bool: + """ + Expects another Region type object. Will test whether two Region objects have the same gene families + + :param other: Other region to test equality of region + + :return: equal or not + """ if not isinstance(other, Region): raise TypeError(f"'Region' type object was expected, but '{type(other)}' type object was provided.") if [gene.family for gene in self.genes] == [gene.family for gene in other.genes]: @@ -32,52 +47,93 @@ def __eq__(self, other): return True return False + def __len__(self): + return len(self.genes) + + def __getitem__(self, index): + return self.genes[index] + def append(self, value): # allowing only gene-class objects in a region. 
if isinstance(value, Gene): self.genes.append(value) value.RGP.add(self) else: - raise TypeError( - "Unexpected class / type for " + type(value) + " when adding it to a region of genomic plasticity") + raise TypeError("Unexpected class / type for " + type(value) + + " when adding it to a region of genomic plasticity") @property - def families(self): + def families(self) -> set: + """Get the gene families in the RGP + + :return: Set of gene families + """ return {gene.family for gene in self.genes} @property - def start(self): + def start(self) -> int: + """ Get RGP starting position + + :return: Start position + """ return min(self.genes, key=lambda x: x.start).start - @property - def start_gene(self): + @property # TODO try to change start with this method + def start_gene(self) -> Gene: + """ Get RGP starting gene + + :return: Start gene + """ return min(self.genes, key=lambda x: x.position) @property - def stop_gene(self): + def stop_gene(self) -> Gene: + """ Get RGP stopping gene + + :return: Stop gene + """ return max(self.genes, key=lambda x: x.position) @property def stop(self): + """ Get RGP stopping position + + :return: Stop position + """ return max(self.genes, key=lambda x: x.stop).stop @property - def organism(self): + def organism(self) -> Organism: + """ Get the Organism link to RGP + + :return: Organism + """ return self.genes[0].organism @property - def contig(self): + def contig(self) -> Contig: + """ Get the Contig link to RGP + + :return: Contig + """ return self.genes[0].contig @property - def is_whole_contig(self): - """ Indicates if the region is an entire contig """ + def is_whole_contig(self) -> bool: + """Indicates if the region is an entire contig + + :return: True if whole contig + """ if self.start_gene.position == 0 and self.stop_gene.position == len(self.contig.genes) - 1: return True return False @property - def is_contig_border(self): + def is_contig_border(self) -> bool: + """Indicates if the region is bordering a contig +
+ :return: True if bordering + """ if len(self.genes) == 0: raise Exception("Your region has no genes. Something wrong happenned.") if self.start_gene.position == 0 and not self.contig.is_circular: @@ -86,20 +142,25 @@ def is_contig_border(self): return True return False - def get_rnas(self): + def get_rnas(self) -> set: + """ Get RNA in region + + :return: Set of RNA + """ rnas = set() for rna in self.contig.RNAs: if self.start < rna.start < self.stop: rnas.add(rna) return rnas - def __len__(self): - return len(self.genes) + def get_bordering_genes(self, n: int, multigenics: set) -> list: + """ Get the bordered genes in the region - def __getitem__(self, index): - return self.genes[index] + :param n: number of genes to get + :param multigenics: pangenome graph multigenic persistent families - def get_bordering_genes(self, n, multigenics): + :return: A list of bordering gene in start and stop position List[List[Start Gene], [Stop Gene]] + """ border = [[], []] pos = self.start_gene.position init = pos @@ -138,6 +199,11 @@ def get_bordering_genes(self, n, multigenics): class Spot: + """ + This class represent a hotspot. 
+ + :param spot_id: identifier of the spot + """ def __init__(self, spot_id): self.ID = spot_id self.regions = set() @@ -147,7 +213,12 @@ def __init__(self, spot_id): self._compContent = False @property - def families(self): + def families(self) -> set: + """Get the gene families in the RGP + + :return: Set of gene families + """ + union = set() for region in self.regions: union |= region.families @@ -157,6 +228,8 @@ def add_regions(self, regions): """ Adds region(s) contained in an Iterable to the spot which all have the same bordering persistent genes provided with 'borders' + + :param regions: Iterable list of RGP to add to spot """ if isinstance(regions, Iterable): for region in regions: @@ -165,15 +238,27 @@ def add_regions(self, regions): raise Exception("The provided 'regions' variable was not an Iterable") def add_region(self, region): + """ + Add one RGP to the spot + + :param region: RGP to add to spot + """ if isinstance(region, Region): self.regions.add(region) def spot_2_families(self): + """Add to Gene Families a link to spot""" for family in self.families: family.spot.add(self) - def borders(self, set_size, multigenics): - """ extracts all the borders of all RGPs belonging to the spot""" + def borders(self, set_size: int, multigenics): + """ Extracts all the borders of all RGPs belonging to the spot + + :param set_size: number of genes to get + :param multigenics: pangenome graph multigenic persistent families + + :return: families that bordering spot + """ all_borders = [] for rgp in self.regions: all_borders.append(rgp.get_bordering_genes(set_size, multigenics)) @@ -217,47 +302,71 @@ def _mk_uniq_content(self): self._uniqContent[rgp] = {rgp} def _get_content(self): - """Creates the _uniqContent object if it was never computed. Return it in any case""" + """Creates the _uniqContent object if it was never computed. 
Return it in any case + + :return: RGP groups that have identical gene content + """ if not self._compContent: self._mk_uniq_content() self._compContent = True return self._uniqContent def _get_ordered_set(self): - """Creates the _uniqSyn object if it was never computed. Return it in any case""" + """ Creates the _uniqSyn object if it was never computed. Return it in any case + + :return: RGP groups that have an identical synteny + """ if not self._compOrderedSet: self._mk_uniq_ordered_set_obj() self._compOrderedSet = True return self._uniqOrderedSet - def get_uniq_to_rgp(self): - """ returns the dictionnary with a representing RGP as key, and all identical RGPs as value""" + def get_uniq_to_rgp(self) -> dict: + """ Get dictionnary with a representing RGP as key, and all identical RGPs as value + + :return: Dictionnary with a representing RGP as key, and all identical RGPs as value + """ return self._get_ordered_set() def get_uniq_ordered_set(self): - """ returns an Iterable of all the unique syntenies in the spot""" + """Get an Iterable of all the unique syntenies in the spot + + :return: Iterable of all the unique syntenies in the spot + """ return set(self._get_ordered_set().keys()) def get_uniq_content(self): - """ returns an Iterable of all the unique rgp (in terms of gene family content) in the spot""" + """ Get an Iterable of all the unique rgp (in terms of gene family content) in the spot + + :return: Iterable of all the unique rgp (in terms of gene family content) in the spot + """ return set(self._get_content().keys()) - def count_uniq_content(self): + def count_uniq_content(self) -> dict: """ - Returns a counter with a representative rgp as key and - the number of identical rgp in terms of gene family content as value + Get a counter of uniq RGP and number of identical RGP (in terms of gene family content) + + :return: dictionary with a representative rgp as key and number of identical rgp as value """ return dict([(key, len(val)) for key, val in 
self._get_content().items()]) def count_uniq_ordered_set(self): """ - Returns a counter with a representative rgp as key and the number of identical rgp in terms of synteny as value + Get a counter of uniq RGP and number of identical RGP (in terms of synteny content) + + :return: dictionary with a representative rgp as key and number of identical rgp as value """ return dict([(key, len(val)) for key, val in self._get_ordered_set().items()]) class Module: - def __init__(self, module_id, families=None): + """ + This class represents a module. + + :param module_id: identifier of the module + :param families: Set of families which define the module + """ + def __init__(self, module_id: int, families: set = None): """ 'core' are gene families that define the module. 'associated_families' are gene families that you believe are associated to the module in some way, @@ -267,30 +376,28 @@ def __init__(self, module_id, families=None): self.families = set() if families is not None: if not all(isinstance(fam, GeneFamily) for fam in families): - raise Exception( - f"You provided elements that were not GeneFamily object. Modules are only made of GeneFamily") + raise Exception(f"You provided elements that were not GeneFamily object." + f" Modules are only made of GeneFamily") self.families |= set(families) self.bitarray = None - def add_family(self, family): + def add_family(self, family: GeneFamily): """ Add a family to the module :param family: the family that will ba added to the module - :type family: GeneFamily """ if not isinstance(family, GeneFamily): raise Exception("You did not provide a GenFamily object.
Modules are only made of GeneFamily") family.modules.add(self) self.families.add(family) - def mk_bitarray(self, index, partition='all'): + def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'): """Produces a bitarray representing the presence / absence of families in the organism using the provided index The bitarray is stored in the :attr:`bitarray` attribute and is a :class:`gmpy2.xmpz` type. + :param partition: filter module by partition - :type partition: str :param index: The index computed by :func:`ppanggolin.pan.Pangenome.getIndex` - :type index: dict[:class:`ppanggolin.genome.Organism`, int] """ self.bitarray = gmpy2.xmpz() # pylint: disable=no-member if partition == 'all': @@ -318,27 +425,13 @@ def mk_bitarray(self, index, partition='all'): class GeneContext: """ - A class used to represent a gene context + A class used to represent a gene context - Attributes - ---------- - gc_id : int - ID of the Gene context - families : set - Gene families related to the GeneContext - - Methods - ------- - """ - - def __init__(self, gc_id, families=None): - """ Initial methods + :param gc_id : identifier of the Gene context + :param families: Gene families related to the GeneContext + """ - :param gc_id: ID of the GeneContext - :type gc_id: int - :param families: Gene families related to the GeneContext - :type families: set - """ + def __init__(self, gc_id: int, families: set = None): self.ID = gc_id self.families = set() if families is not None: @@ -347,12 +440,11 @@ def __init__(self, gc_id, families=None): f" GeneContext are only made of GeneFamily") self.families |= set(families) - def add_family(self, family): + def add_family(self, family: GeneFamily): """ Allow to add one family in the GeneContext :param family: family to add - :type family: GeneFamily """ if not isinstance(family, GeneFamily): raise Exception("You did not provide a GenFamily object. 
Modules are only made of GeneFamily") - self.families.add(family) \ No newline at end of file + self.families.add(family) diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index a0949197..ecafa29e 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -5,27 +5,49 @@ import sys import os import gzip -import mmap import argparse from io import TextIOWrapper from pathlib import Path from typing import TextIO, Union, BinaryIO +import networkx as nx import pkg_resources from numpy import repeat import logging +from scipy.sparse import csc_matrix -def check_log(name): +from ppanggolin.geneFamily import GeneFamily + + +def check_log(name: str) -> TextIO: + """Check if the output log is writable + + :param name: Path to the log output + + :return: output for log + """ if name == "stdout": return sys.stdout elif name == "stderr": return sys.stderr else: - return open(name, "w") + try: + log_file = open(name, "w") + except IOError: + raise IOError("The given log file does not appear.") + except Exception: + raise Exception("An unexpected error happened with your logfile. Please check if he is accessible." + "If everything looks good, please report an issue on our GitHub.") + else: + return log_file def check_tsv_sanity(tsv): + """ Check if the given tsv is readable for the next PPanGGOLiN step + + :param tsv: Path to the input tsv + """ f = open(tsv, "r") name_set = set() duplicated_names = set() @@ -48,14 +70,16 @@ def check_tsv_sanity(tsv): raise Exception(f"Some of the given files do not exist. The non-existing files are the following : " f"'{' '.join(non_existing_files)}'") if len(duplicated_names) != 0: - raise Exception( - f"Some of your genomes have identical names. The duplicated names are the following : " - f"'{' '.join(duplicated_names)}'") + raise Exception(f"Some of your genomes have identical names. 
The duplicated names are the following : " + f"'{' '.join(duplicated_names)}'") -def check_input_files(anno=None, pangenome=None, fasta=None): - """ - Checks if the provided input files exist and are of the proper format +def check_input_files(anno: str = None, pangenome: str = None, fasta: str = None): + """ Checks if the provided input files exist and are of the proper format + + :param anno: Path to the annotation file + :param pangenome: Path to the pangenome hdf5 file + :param fasta: path to the fasta file """ if pangenome is not None and not os.path.exists(pangenome): raise FileNotFoundError(f"No such file or directory: '{pangenome}'") @@ -72,6 +96,10 @@ def check_input_files(anno=None, pangenome=None, fasta=None): def set_verbosity_level(args): + """Set the verbosity level + + :param args: argument pass by command line + """ level = logging.INFO # info, warnings and errors, default verbose == 1 if hasattr(args, "verbose"): if args.verbose == 2: @@ -89,7 +117,14 @@ def set_verbosity_level(args): logging.getLogger().info("PPanGGOLiN version: " + pkg_resources.get_distribution("ppanggolin").version) -def jaccard_similarities(mat, jaccard_similarity_th): +def jaccard_similarities(mat: csc_matrix, jaccard_similarity_th) -> csc_matrix: + """ Compute the jaccard similarities + + :param mat: + :param jaccard_similarity_th: threshold + + :return: + """ cols_sum = mat.getnnz(axis=0) ab = mat.T * mat # for rows @@ -129,9 +164,14 @@ def read_compressed_or_not(file_or_file_path: Union[str, BinaryIO, TextIOWrapper return file -def write_compressed_or_not(file_path, compress): +def write_compressed_or_not(file_path: str, compress: bool = False) -> Union[gzip.GzipFile, TextIO]: """ - Returns a file-like object, compressed or not. + Create a file-like object, compressed or not. 
+ + :param file_path: Path to the file + :param compress: Compress the file in .gz + + :return: file-like object, compressed or not """ if compress: return gzip.open(file_path + ".gz", mode="wt") @@ -139,12 +179,12 @@ def write_compressed_or_not(file_path, compress): return open(file_path, "w") -def is_compressed(file_or_file_path): +def is_compressed(file_or_file_path: Union[str, TextIO, gzip.GzipFile]): """ Checks is a file, or file path given is compressed or not - :param file_or_file_path: + :param file_or_file_path: Input file - :return: + :return: Get if the file is compressed """ file = file_or_file_path if isinstance(file, str): @@ -160,25 +200,28 @@ def is_compressed(file_or_file_path): return False -def get_num_lines(file): - fp = open(file, "r+") - buf = mmap.mmap(fp.fileno(), 0) - lines = 0 - while buf.readline(): - lines += 1 - return lines +def mk_outdir(output, force): + """ Create a directory at the given output if it doesn't exist already + :param output: Path where to create directory + :param force: Force to write in the directory -def mk_outdir(output, force): + :raise FileExistError: The current path already exist and force is false + """ if not os.path.exists(output): os.makedirs(output) elif not force: raise FileExistsError(f"{output} already exists. Use -f if you want to overwrite the files in the directory") -def mk_file_name(basename, output, force): - """ - Returns a usable filename for a ppanggolin output file, or crashes. +def mk_file_name(basename: str, output: str, force: bool = False) -> Path: + """Returns a usable filename for a ppanggolin output file, or crashes. 
+ + :param basename: basename for the file + :param output: Path to save the file + :param force: Force to write the file + + :return: Path to the file """ filename = Path(output + "/" + basename) if filename.suffix != ".h5": @@ -191,23 +234,43 @@ def mk_file_name(basename, output, force): return filename -def restricted_float(x): +def restricted_float(x) -> float: + """Decrease the choice possibility of float in argparse + + :param x: given float by user + + :return: given float if it is acceptable + + :raise argparse.ArgumentTypeError: The float is not acceptable + """ x = float(x) if x < 0.0 or x > 1.0: raise argparse.ArgumentTypeError("%r not in range [0.0, 1.0]" % (x,)) return x -def min_one(x): +def min_one(x) -> int: + """Check if the given int is superior to one + + :param x: given float by user + + :return: given float if it is acceptable + + :raise argparse.ArgumentTypeError: The float is not acceptable + """ x = int(x) if x < 1: raise argparse.ArgumentTypeError("%r is inferior to 1" % (x,)) return x -def connected_components(g, removed, weight): +def connected_components(g: nx.Graph, removed: set, weight: float): """ - Yields subgraphs of each connected component you get when filtering edges based on the given weight. + Yields subgraphs of each connected component you get when filtering edges based on the given weight. 
+ + :param g: Subgraph + :param removed: removed node + :param weight: threshold to remove node or not """ for v in g.nodes: if v not in removed: @@ -216,8 +279,16 @@ def connected_components(g, removed, weight): removed.update(c) -def _plain_bfs(g, source, removed, weight): - """A fast BFS node generator, copied from networkx then adapted to the current use case""" +def _plain_bfs(g: nx.Graph, source: GeneFamily, removed: set, weight: float): + """ + A fast BFS node generator, copied from networkx then adapted to the current use case + + :param g: graph with the nodes + :param source: current node + :param removed: set of removed nodes + :param weight:threshold to remove node or not + """ + nextlevel = {source} while nextlevel: thislevel = nextlevel @@ -237,7 +308,13 @@ def _plain_bfs(g, source, removed, weight): nextlevel.add(n) -def add_gene(obj, gene, fam_split=True): +def add_gene(obj, gene, fam_split: bool = True): + """ + + :param obj: + :param gene: + :param fam_split: + """ if fam_split: try: obj["genes"][gene.family].add(gene) @@ -254,6 +331,11 @@ def add_gene(obj, gene, fam_split=True): def check_option_workflow(args): + """ + Check if the given argument to a workflow command is usable + + :param args: list of arguments + """ if args.clusters is not None and not any([args.fasta, args.anno]): raise Exception("If you give --clusters option, you must give at least --fasta or --anno") diff --git a/ppanggolin/workflow/all.py b/ppanggolin/workflow/all.py index d6ae6efa..3a081a53 100644 --- a/ppanggolin/workflow/all.py +++ b/ppanggolin/workflow/all.py @@ -28,7 +28,12 @@ """a global workflow that does everything in one go.""" -def launch(args): +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ check_option_workflow(args) pangenome = Pangenome() filename = mk_file_name(args.basename, args.output, args.force) @@ -91,7 +96,7 @@ def launch(args): spot_time = time.time() - start_spots start_mods = 
time.time() - predict_modules(pangenome=pangenome, cpu=args.cpu, tmpdir=args.tmpdir, disable_bar=args.disable_prog_bar) + predict_modules(pangenome=pangenome, tmpdir=args.tmpdir, cpu=args.cpu, disable_bar=args.disable_prog_bar) mod_time = time.time() - start_mods start_writing = time.time() @@ -130,7 +135,14 @@ def launch(args): print_info(filename, content=True) -def subparser(sub_parser): +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in Command line + + :param sub_parser : sub_parser for align command + + :return : parser arguments for align command + """ parser = sub_parser.add_parser("all", formatter_class=argparse.ArgumentDefaultsHelpFormatter) required = parser.add_argument_group(title="Input arguments", description="The possible input arguments :") diff --git a/ppanggolin/workflow/panModule.py b/ppanggolin/workflow/panModule.py index 3459e3fa..2b90f52c 100644 --- a/ppanggolin/workflow/panModule.py +++ b/ppanggolin/workflow/panModule.py @@ -26,7 +26,12 @@ """a global workflow that does everything in one go.""" -def launch(args): +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ check_option_workflow(args) pangenome = Pangenome() filename = mk_file_name(args.basename, args.output, args.force) @@ -78,7 +83,7 @@ def launch(args): writing_time = writing_time + time.time() - start_writing start_mods = time.time() - predict_modules(pangenome=pangenome, cpu=args.cpu, tmpdir=args.tmpdir, disable_bar=args.disable_prog_bar) + predict_modules(pangenome=pangenome, tmpdir=args.tmpdir, cpu=args.cpu, disable_bar=args.disable_prog_bar) mod_time = time.time() - start_mods start_writing = time.time() @@ -106,7 +111,14 @@ def launch(args): print_info(filename, content=True) -def subparser(sub_parser): +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in Command line 
+ + :param sub_parser : sub_parser for align command + + :return : parser arguments for align command + """ parser = sub_parser.add_parser("panmodule", formatter_class=argparse.ArgumentDefaultsHelpFormatter) required = parser.add_argument_group(title="Input arguments", description="The possible input arguments :") diff --git a/ppanggolin/workflow/panRGP.py b/ppanggolin/workflow/panRGP.py index 1a396fcd..7c0a8445 100644 --- a/ppanggolin/workflow/panRGP.py +++ b/ppanggolin/workflow/panRGP.py @@ -27,7 +27,12 @@ """a global workflow that does everything in one go.""" -def launch(args): +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ check_option_workflow(args) pangenome = Pangenome() filename = mk_file_name(args.basename, args.output, args.force) @@ -117,7 +122,14 @@ def launch(args): print_info(filename, content=True) -def subparser(sub_parser): +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in Command line + + :param sub_parser : sub_parser for align command + + :return : parser arguments for align command + """ parser = sub_parser.add_parser("panrgp", formatter_class=argparse.ArgumentDefaultsHelpFormatter) required = parser.add_argument_group(title="Input arguments", description="The possible input arguments :") diff --git a/ppanggolin/workflow/workflow.py b/ppanggolin/workflow/workflow.py index de623936..b6ad7000 100644 --- a/ppanggolin/workflow/workflow.py +++ b/ppanggolin/workflow/workflow.py @@ -24,7 +24,12 @@ """ a global workflow that does everything in one go. 
""" -def launch(args): +def launch(args: argparse.Namespace): + """ + Command launcher + + :param args: All arguments provide by user + """ check_option_workflow(args) pangenome = Pangenome() filename = mk_file_name(args.basename, args.output, args.force) @@ -70,7 +75,14 @@ def launch(args): print_info(filename, content=True) -def subparser(sub_parser): +def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: + """ + Subparser to launch PPanGGOLiN in Command line + + :param sub_parser : sub_parser for align command + + :return : parser arguments for align command + """ parser = sub_parser.add_parser("workflow", formatter_class=argparse.ArgumentDefaultsHelpFormatter) required = parser.add_argument_group(title="Input arguments", description="The possible input arguments :") diff --git a/tests/genome/test_Contig.py b/tests/genome/test_Contig.py index eb3b1ed4..59acdbe1 100644 --- a/tests/genome/test_Contig.py +++ b/tests/genome/test_Contig.py @@ -47,7 +47,7 @@ def l_genes(): l_genes = [] for i in range(6, 0, -1): o_gene = Gene(i) - o_gene.fill_annotations(i, i, i, position=i - 1) + o_gene.fill_annotations(i, i, i) l_genes.append(o_gene) return l_genes diff --git a/tests/genome/test_Gene.py b/tests/genome/test_Gene.py index 0680db69..3f6da237 100644 --- a/tests/genome/test_Gene.py +++ b/tests/genome/test_Gene.py @@ -45,7 +45,7 @@ def test_fill_annotations(o_gene): strand = "plus" position = "44" genetic_code = "le code" - o_gene.fill_annotations(start, stop, strand, position=position, genetic_code=genetic_code) + o_gene.fill_annotations(start, stop, strand) assert o_gene.position == position assert o_gene.genetic_code == genetic_code diff --git a/tests/genome/test_Organism.py b/tests/genome/test_Organism.py index f2788a2d..8c2f29a3 100644 --- a/tests/genome/test_Organism.py +++ b/tests/genome/test_Organism.py @@ -26,8 +26,7 @@ def o_org(): def test_get_or_add_contig(o_org): - # FIXME: shouldn't the method be called getContig ? 
- o_ctg = o_org.get_or_add_contig('i') + o_ctg = o_org.get_contig('i') assert isinstance(o_ctg, Contig) @@ -35,10 +34,10 @@ def test_get_or_add_contig(o_org): def t_filled_org(o_org): n = 0 for k in "azerty'": - o_ctg = o_org.get_or_add_contig(k) + o_ctg = o_org.get_contig(k) for i in range(randint(0, 5)): o_gene = Gene(k + "-" + str(i)) - o_gene.fill_annotations(6, 1, k, position=i) + o_gene.fill_annotations(6, 1, k) o_ctg.add_gene(o_gene) n += 1 @@ -63,14 +62,14 @@ def get_genes(): o_gene = Gene(str(i)) start = randint(0, 100) stop = randint(0, 100) - o_gene.fill_annotations(start, stop, 'x', position=i) + o_gene.fill_annotations(start, stop, 'x') yield o_gene def test_contigs(o_org): l_contigs = [] for k in "azer'": - o_ctg = o_org.get_or_add_contig(k) + o_ctg = o_org.get_contig(k) for o_gene in get_genes(): o_ctg.add_gene(o_gene) l_contigs.append(o_ctg) @@ -79,7 +78,7 @@ def test_contigs(o_org): def test_genes(o_org): - o_ctg = o_org.get_or_add_contig("scrap") + o_ctg = o_org.get_contig("scrap") for o_gene in get_genes(): o_ctg.add_gene(o_gene) diff --git a/tests/region/test_Region.py b/tests/region/test_Region.py index 3fca27ae..25165c0d 100644 --- a/tests/region/test_Region.py +++ b/tests/region/test_Region.py @@ -57,7 +57,7 @@ def l_genes(o_org, o_contig): "lolo", "lala", "lili", "lulu", ]): gene = Gene(gene_id) - gene.fill_annotations(c, c + 30, "+", position=i) + gene.fill_annotations(c, c + 30, "+") gene.fill_parents(o_org, o_contig) o_contig.add_gene(gene) gene.family = GeneFamily(i, gene_id) diff --git a/tests/test_Pangenome.py b/tests/test_Pangenome.py index 7e78ca80..beca8cbc 100644 --- a/tests/test_Pangenome.py +++ b/tests/test_Pangenome.py @@ -177,7 +177,7 @@ def _make_org_with_genes(org): l_genes = [] o_org = Organism(org) for i in range(randint(2, 10)): - o_ctg = o_org.get_or_add_contig("k_{}".format(i)) + o_ctg = o_org.get_contig("k_{}".format(i)) for j in range(randint(2, 10)): name = "{}.{}.{}".format(org, o_ctg.name, j) o_gene = Gene(name) @@ 
-295,10 +295,10 @@ def test_edges_several(o_pang, make_gene_pair): def test_get_index(o_pang, l_orgs): for o_org in l_orgs: o_pang.add_organism(o_org) - idx = o_pang.get_index() + idx = o_pang.get_org_index() # after the method, the index exist - assert o_pang.get_index() is idx + assert o_pang.get_org_index() is idx # all orgs are in the index l_observed = sorted(idx.keys(), key=lambda x: x.name) @@ -309,7 +309,7 @@ def test_get_index(o_pang, l_orgs): def test_compute_family_bitarrays(o_pang, l_orgs): for o_org in l_orgs: o_pang.add_organism(o_org) - idx = o_pang.get_index() + idx = o_pang.get_org_index() assert o_pang.compute_family_bitarrays() is idx From 47c510713791639e0a7f5aba91c96baf5de6d98a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 19 May 2022 14:30:13 +0200 Subject: [PATCH 05/20] Add position argument in test --- VERSION | 2 +- tests/genome/test_Contig.py | 5 +++-- tests/genome/test_Gene.py | 12 ++++-------- tests/genome/test_Organism.py | 4 ++-- tests/region/test_Region.py | 2 +- 5 files changed, 11 insertions(+), 14 deletions(-) diff --git a/VERSION b/VERSION index dd91a8b7..a67e110c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.98 +1.2.99 diff --git a/tests/genome/test_Contig.py b/tests/genome/test_Contig.py index 59acdbe1..6d7c79aa 100644 --- a/tests/genome/test_Contig.py +++ b/tests/genome/test_Contig.py @@ -1,4 +1,5 @@ #! 
/usr/bin/env python3 +import random import pytest @@ -45,9 +46,9 @@ def test_add_rna(o_ctg): @pytest.fixture() def l_genes(): l_genes = [] - for i in range(6, 0, -1): + for i in range(6, -1, -1): # Create 7 Gene o_gene = Gene(i) - o_gene.fill_annotations(i, i, i) + o_gene.fill_annotations(start=i*10, stop=i*10 - 1, strand='+', position=i) l_genes.append(o_gene) return l_genes diff --git a/tests/genome/test_Gene.py b/tests/genome/test_Gene.py index 3f6da237..45e02713 100644 --- a/tests/genome/test_Gene.py +++ b/tests/genome/test_Gene.py @@ -30,9 +30,7 @@ def o_gene(): def test_fill_annotations_defaults(o_gene): - start, stop = 1, 9 - strand = "plus" - o_gene.fill_annotations(start, stop, strand) + o_gene.fill_annotations(start=1, stop=9, strand='+') for attr in "position", "genetic_code": assert hasattr(o_gene, attr) @@ -41,11 +39,9 @@ def test_fill_annotations_defaults(o_gene): def test_fill_annotations(o_gene): - start, stop = 1, 9 - strand = "plus" - position = "44" - genetic_code = "le code" - o_gene.fill_annotations(start, stop, strand) + position = 44 + genetic_code = 11 + o_gene.fill_annotations(start=1, stop=9, strand='+', position=44, genetic_code=11) assert o_gene.position == position assert o_gene.genetic_code == genetic_code diff --git a/tests/genome/test_Organism.py b/tests/genome/test_Organism.py index 8c2f29a3..18c7a2cf 100644 --- a/tests/genome/test_Organism.py +++ b/tests/genome/test_Organism.py @@ -37,7 +37,7 @@ def t_filled_org(o_org): o_ctg = o_org.get_contig(k) for i in range(randint(0, 5)): o_gene = Gene(k + "-" + str(i)) - o_gene.fill_annotations(6, 1, k) + o_gene.fill_annotations(6, 1, k, position=i) o_ctg.add_gene(o_gene) n += 1 @@ -62,7 +62,7 @@ def get_genes(): o_gene = Gene(str(i)) start = randint(0, 100) stop = randint(0, 100) - o_gene.fill_annotations(start, stop, 'x') + o_gene.fill_annotations(start, stop, 'x', position=i) yield o_gene diff --git a/tests/region/test_Region.py b/tests/region/test_Region.py index 25165c0d..3fca27ae 
100644 --- a/tests/region/test_Region.py +++ b/tests/region/test_Region.py @@ -57,7 +57,7 @@ def l_genes(o_org, o_contig): "lolo", "lala", "lili", "lulu", ]): gene = Gene(gene_id) - gene.fill_annotations(c, c + 30, "+") + gene.fill_annotations(c, c + 30, "+", position=i) gene.fill_parents(o_org, o_contig) o_contig.add_gene(gene) gene.family = GeneFamily(i, gene_id) From 819c9cca8f0b8b95a409e031befa4ef0d2f86aac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 19 May 2022 14:32:01 +0200 Subject: [PATCH 06/20] Change indentation to autogen documentation --- VERSION | 2 +- ppanggolin/nem/partition.py | 7 +++---- ppanggolin/pangenome.py | 3 +-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/VERSION b/VERSION index a67e110c..85de19b1 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.99 +1.2.100 diff --git a/ppanggolin/nem/partition.py b/ppanggolin/nem/partition.py index 809315d0..4ac38bdc 100644 --- a/ppanggolin/nem/partition.py +++ b/ppanggolin/nem/partition.py @@ -227,10 +227,9 @@ def partition_nem(index: int, tmpdir: str, kval: int, beta: float = 2.5, sm_degr def nem_samples(pack: tuple) -> Union[Tuple[dict, None, None], Tuple[int, float, float], Tuple[dict, dict, float]]: - """ - run partitioning - :param pack: {index: int, tmpdir: str, beta: float, sm_degree: int, free_dispersion: bool, - kval: int, seed: int, init: str, keep_tmp_files: bool} + """ run partitioning + :param pack: {index: int, tmpdir: str, beta: float, sm_degree: int, free_dispersion: bool, kval: int, seed: int, init: str, keep_tmp_files: bool} + :return: """ return partition_nem(*pack) diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index ddfe094f..6cc43f3f 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -54,8 +54,7 @@ def add_file(self, pangenome_file: str): and anything that is computed will be saved to this file when :func:`ppanggolin.formats.writeBinaries.writePangenome` is called. 
- :param pangenome_file: A string representing the filepath to the hdf5 pangenome file - to be either used or created + :param pangenome_file: A string representing filepath to hdf5 pangenome file to be either used or created """ from ppanggolin.formats.readBinaries import get_status # importing on call instead of importing on top to avoid cross-reference problems. From 79b1e00db101a1c9f8586c2b49ffedc6cc16e0d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 19 May 2022 15:19:59 +0200 Subject: [PATCH 07/20] Remove python 3.6 support --- .github/workflows/main.yml | 2 +- VERSION | 2 +- requirements.txt | 20 ++++++++++---------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c0468864..9073a76a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -17,7 +17,7 @@ jobs: strategy: matrix: os: ['ubuntu-latest', 'macos-latest'] - python-version: ['3.6', '3.7', '3.8', '3.9', '3.10'] + python-version: ['3.7', '3.8', '3.9', '3.10'] steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v2 diff --git a/VERSION b/VERSION index 85de19b1..ed0f5eb6 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.100 +1.2.101 diff --git a/requirements.txt b/requirements.txt index f4132370..1c219882 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,16 @@ -tqdm>=4.7.0 -pytables>=3.6.1 +tqdm>=4.64 +pytables>=3.7 prodigal>=2.6.3 -aragorn>=1.2.38 +aragorn>=1.2.41 infernal>=1.1.4 mmseqs2>=13.45111 networkx>=2.3 -dataclasses==0.8 -scipy>=1.5.3 +dataclasses>=0.8 +scipy>=1.7.3 plotly>=4.14.3 -gmpy2>=2.1.0b5 +gmpy2>=2.1.2 pandas>=0.25.3 -colorlover>=0.3 -mafft>=7.490 -numpy>=1.19.5 -bokeh>=2.3.3 +colorlover>=0.3.0 +mafft>=7.505 +numpy>=1.21.6 +bokeh>=2.4.2 From 7a97d4b793438c172c1f9ea54aafc3b12f91678d Mon Sep 17 00:00:00 2001 From: Adelme Bazin Date: Fri, 20 May 2022 17:43:49 +0200 Subject: [PATCH 08/20] 
add 'single_copy' branch to release1.3 --- ppanggolin/formats/writeMSA.py | 94 ++++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 26 deletions(-) diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index 557be9fd..3f353e4b 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -20,37 +20,70 @@ from ppanggolin.genetic_codes import genetic_codes -def get_families_to_write(pangenome: Pangenome, partition_filter: str = 'core', soft_core: float = 0.95): +def is_single_copy(fam, dup_margin): + """ + Check if a gene family can be considered 'single copy' or not + + :param fam: GeneFamily object + :param dup_margin: maximal number of genomes in which the gene family can have multiple members and still be considered a 'single copy' gene family + """ + nb_multi = 0 + for gene_list in fam.getOrgDict().values(): + if len(gene_list) > 1: + nb_multi += 1 + dup_ratio = nb_multi / len(fam.organisms) + if dup_ratio < dup_margin: + return True + return False + + +def getFamiliesToWrite(pangenome, partitionFilter, soft_core=0.95, dup_margin=0.95, single_copy=True): + """ Get families corresponding to the given partition :param pangenome: Partitioned pangenome :param partition_filter: choice of partition to compute Multiple Sequence Alignement of the gene families :param soft_core: Soft core threshold to use + :param dup_margin: maximal number of genomes in which the gene family can have multiple members and still be considered a 'single copy' gene family + :param single_copy: Use "single copy" (defined by dup_margin) gene families only :return: set of families unique to one partition """ fams = set() - if partition_filter == "all": - return set(pangenome.gene_families) - if partition_filter in ["persistent", "shell", "cloud"]: - for fam in pangenome.gene_families: - if fam.named_partition == partition_filter: - fams.add(fam) - elif partition_filter in ["core", "accessory", "softcore"]: - nb_org = 
pangenome.number_of_organisms() - if partition_filter == "core": - for fam in pangenome.gene_families: - if len(fam.organisms) == nb_org: + nb_org = pangenome.number_of_organisms() + + if partitionFilter == "all": + return set(pangenome.geneFamilies) + if partitionFilter in ["persistent", "shell", "cloud"]: + for fam in pangenome.geneFamilies: + if fam.namedPartition == partitionFilter: + if single_copy and is_single_copy(fam, dup_margin): fams.add(fam) - elif partition_filter == "accessory": - for fam in pangenome.gene_families: - if len(fam.organisms) < nb_org: + elif not single_copy: fams.add(fam) - elif partition_filter == "softcore": - for fam in pangenome.gene_families: + elif partitionFilter in ["core", "accessory", "softcore"]: + if partitionFilter == "core": + for fam in pangenome.geneFamilies: + if len(fam.organisms) == nb_org: + if single_copy and is_single_copy(fam, dup_margin): + fams.add(fam) + elif not single_copy: + fams.add(fam) + elif partitionFilter == "accessory": + for fam in pangenome.geneFamilies: + if len(fam.organisms) < nb_org: + if single_copy and is_single_copy(fam, dup_margin): + fams.add(fam) + elif not single_copy: + fams.add(fam) + elif partitionFilter == "softcore": + for fam in pangenome.geneFamilies: if len(fam.organisms) >= nb_org * soft_core: - fams.add(fam) + if single_copy and is_single_copy(fam, dup_margin): + fams.add(fam) + elif not single_copy: + fams.add(fam) return fams @@ -238,9 +271,9 @@ def write_whole_genome_msa(pangenome: Pangenome, families: set, phylo_name: str, fout.close() -def write_msa_files(pangenome: Pangenome, output: str, cpu: int = 1, partition: str = "core", tmpdir: str = "/tmp", - source: str = "protein", soft_core=0.95, phylo: bool = False, use_gene_id: bool = False, - translation_table: int = 11, force: bool = False, disable_bar: bool = False): +def writeMSAFiles(pangenome, output, cpu=1, partition="core", tmpdir="/tmp", source="protein", soft_core=0.95, + phylo=False, use_gene_id=False, 
translation_table="11", dup_margin = 0.95, single_copy=True, force=False, disable_bar=False): + """ Main function to write MSA files @@ -254,6 +287,8 @@ def write_msa_files(pangenome: Pangenome, output: str, cpu: int = 1, partition: :param phylo: Writes a whole genome msa file for additional phylogenetic analysis :param use_gene_id: Use gene identifiers rather than organism names for sequences in the family MSA :param translation_table: Translation table (genetic code) to use. + :param dup_margin: maximal number of genomes in which the gene family can have multiple members and still be considered a 'single copy' gene family + :param single_copy: Use "single copy" (defined by dup_margin) gene families only :param force: force to write in the directory :param disable_bar: Disable progress bar """ @@ -267,7 +302,7 @@ def write_msa_files(pangenome: Pangenome, output: str, cpu: int = 1, partition: check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_partitions=need_partitions, need_gene_sequences=True, disable_bar=disable_bar) logging.getLogger().info(f"Doing MSA for {partition} families...") - families = get_families_to_write(pangenome, partition_filter=partition, soft_core=soft_core) + families = getFamiliesToWrite(pangenome, partitionFilter=partition, soft_core=soft_core, dup_margin=dup_margin, single_copy=single_copy) # check that the code is similar than the one used previously, if there is one if 'translation_table' in pangenome.parameters["cluster"]: @@ -299,10 +334,11 @@ def launch(args: argparse.Namespace): """ mk_outdir(args.output, args.force) pangenome = Pangenome() - pangenome.add_file(args.pangenome) - write_msa_files(pangenome, args.output, cpu=args.cpu, partition=args.partition, tmpdir=args.tmpdir, - source=args.source, soft_core=args.soft_core, phylo=args.phylo, use_gene_id=args.use_gene_id, - translation_table=args.translation_table, force=args.force, disable_bar=args.disable_prog_bar) + pangenome.addFile(args.pangenome) + 
writeMSAFiles(pangenome, args.output, cpu=args.cpu, partition=args.partition, tmpdir=args.tmpdir, + source=args.source, soft_core=args.soft_core, phylo=args.phylo, use_gene_id=args.use_gene_id, + translation_table=args.translation_table, dup_margin=args.dup_margin, + single_copy=args.single_copy, force=args.force, disable_bar=args.disable_prog_bar) def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser: @@ -336,6 +372,12 @@ def parser_msa(parser: argparse.ArgumentParser): # could make choice to allow customization optional.add_argument("--soft_core", required=False, type=restricted_float, default=0.95, help="Soft core threshold to use if 'softcore' partition is chosen") + optional.add_argument("--dup_margin", required=False, type=restricted_float, default=0.05, + help="minimum ratio of organisms in which the family must have multiple genes " + "for it to be considered 'duplicated'") + optional.add_argument("--single_copy", required=False, action="store_true", default=False, + help="Use report gene families that are considered 'single copy', for details see " + "option --dup_margin") optional.add_argument("--partition", required=False, default="core", choices=["all", "persistent", "shell", "cloud", "core", "accessory", 'softcore'], help="compute Multiple Sequence Alignement of the gene families in the given partition") From aff17f89af3b01c964732beac14d0b11d6d6ccdf Mon Sep 17 00:00:00 2001 From: Adelme Bazin Date: Fri, 20 May 2022 18:04:48 +0200 Subject: [PATCH 09/20] fix 'writeMSA' to fit the new API --- ppanggolin/formats/writeMSA.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/ppanggolin/formats/writeMSA.py b/ppanggolin/formats/writeMSA.py index 3f353e4b..e23931c7 100644 --- a/ppanggolin/formats/writeMSA.py +++ b/ppanggolin/formats/writeMSA.py @@ -28,7 +28,7 @@ def is_single_copy(fam, dup_margin): :param dup_margin: maximal number of genomes in which the gene family can have multiple 
members and still be considered a 'single copy' gene family """ nb_multi = 0 - for gene_list in fam.getOrgDict().values(): + for gene_list in fam.get_org_dict().values(): if len(gene_list) > 1: nb_multi += 1 dup_ratio = nb_multi / len(fam.organisms) @@ -37,7 +37,7 @@ def is_single_copy(fam, dup_margin): return False -def getFamiliesToWrite(pangenome, partitionFilter, soft_core=0.95, dup_margin=0.95, single_copy=True): +def getFamiliesToWrite(pangenome, partition_filter, soft_core=0.95, dup_margin=0.95, single_copy=True): """ Get families corresponding to the given partition @@ -53,32 +53,32 @@ def getFamiliesToWrite(pangenome, partitionFilter, soft_core=0.95, dup_margin=0. fams = set() nb_org = pangenome.number_of_organisms() - if partitionFilter == "all": - return set(pangenome.geneFamilies) - if partitionFilter in ["persistent", "shell", "cloud"]: - for fam in pangenome.geneFamilies: - if fam.namedPartition == partitionFilter: + if partition_filter == "all": + return set(pangenome.gene_families) + if partition_filter in ["persistent", "shell", "cloud"]: + for fam in pangenome.gene_families: + if fam.named_partition == partition_filter: if single_copy and is_single_copy(fam, dup_margin): fams.add(fam) elif not single_copy: fams.add(fam) - elif partitionFilter in ["core", "accessory", "softcore"]: - if partitionFilter == "core": - for fam in pangenome.geneFamilies: + elif partition_filter in ["core", "accessory", "softcore"]: + if partition_filter == "core": + for fam in pangenome.gene_families: if len(fam.organisms) == nb_org: if single_copy and is_single_copy(fam, dup_margin): fams.add(fam) elif not single_copy: fams.add(fam) - elif partitionFilter == "accessory": - for fam in pangenome.geneFamilies: + elif partition_filter == "accessory": + for fam in pangenome.gene_families: if len(fam.organisms) < nb_org: if single_copy and is_single_copy(fam, dup_margin): fams.add(fam) elif not single_copy: fams.add(fam) - elif partitionFilter == "softcore": - for fam in 
pangenome.geneFamilies: + elif partition_filter == "softcore": + for fam in pangenome.gene_families: if len(fam.organisms) >= nb_org * soft_core: if single_copy and is_single_copy(fam, dup_margin): fams.add(fam) @@ -302,7 +302,7 @@ def writeMSAFiles(pangenome, output, cpu=1, partition="core", tmpdir="/tmp", sou check_pangenome_info(pangenome, need_annotations=True, need_families=True, need_partitions=need_partitions, need_gene_sequences=True, disable_bar=disable_bar) logging.getLogger().info(f"Doing MSA for {partition} families...") - families = getFamiliesToWrite(pangenome, partitionFilter=partition, soft_core=soft_core, dup_margin=dup_margin, single_copy=single_copy) + families = getFamiliesToWrite(pangenome, partition_filter=partition, soft_core=soft_core, dup_margin=dup_margin, single_copy=single_copy) # check that the code is similar than the one used previously, if there is one if 'translation_table' in pangenome.parameters["cluster"]: @@ -334,7 +334,7 @@ def launch(args: argparse.Namespace): """ mk_outdir(args.output, args.force) pangenome = Pangenome() - pangenome.addFile(args.pangenome) + pangenome.add_file(args.pangenome) writeMSAFiles(pangenome, args.output, cpu=args.cpu, partition=args.partition, tmpdir=args.tmpdir, source=args.source, soft_core=args.soft_core, phylo=args.phylo, use_gene_id=args.use_gene_id, translation_table=args.translation_table, dup_margin=args.dup_margin, From e3a1914e36cf7e815a5874b7fb15d4cf2201911d Mon Sep 17 00:00:00 2001 From: Adelme Bazin Date: Fri, 20 May 2022 18:06:39 +0200 Subject: [PATCH 10/20] add '--single_copy' option to the testing workflow --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 9073a76a..1115de65 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -78,7 +78,7 @@ jobs: run: | cd testingDataset ppanggolin workflow --cpu 1 --anno organisms.gbff.list --output myannopang - 
ppanggolin msa --pangenome myannopang/pangenome.h5 --source dna --partition core -o myannopang/ -f --use_gene_id --phylo + ppanggolin msa --pangenome myannopang/pangenome.h5 --source dna --partition core -o myannopang/ -f --use_gene_id --phylo --single_copy cd - - name: clusters reading from external file shell: bash -l {0} From 78390bb1e8fc308d61868103968f66b4c952630a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Fri, 10 Jun 2022 11:40:52 +0200 Subject: [PATCH 11/20] Fix bug in Partition and GeneFamilies reading --- VERSION | 2 +- ppanggolin/formats/readBinaries.py | 7 ++++++- ppanggolin/formats/writeBinaries.py | 2 +- ppanggolin/info/info.py | 9 ++++++++- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/VERSION b/VERSION index ed0f5eb6..bd65ca14 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.101 +1.2.102 diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index df6928da..471d7c23 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -63,8 +63,13 @@ def get_status(pangenome: Pangenome, pangenome_file: str): if 'Partitionned' in status_group._v_attrs._f_list(): # Partitionned keep working with older version + h5f.close() + h5f = tables.open_file(pangenome_file, "a") + status_group = h5f.root.status if status_group._v_attrs.Partitionned: status_group._v_attrs.Partitioned = True + else: + status_group._v_attrs.Partitioned = False del status_group._v_attrs.Partitionned if status_group._v_attrs.Partitioned: @@ -211,7 +216,7 @@ def read_gene_families(pangenome: Pangenome, h5f: tables.File, disable_bar: bool :param h5f: Pangenome HDF5 file with gene families information :param disable_bar: Disable the progress bar """ - table = h5f.root.gene_families + table = h5f.root.geneFamilies link = True if pangenome.status["genomesAnnotated"] in ["Computed", "Loaded"] else False diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py 
index bdde667a..de48e8bb 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -309,7 +309,7 @@ def write_gene_families(pangenome: Pangenome, h5f: tables.File, force: bool = Fa if '/gene_families' in h5f and force is True: logging.getLogger().info("Erasing the formerly computed gene family to gene associations...") h5f.remove_node('/', 'gene_families') # erasing the table, and rewriting a new one. - gene_families = h5f.create_table("/", "gene_families", gene_to_fam_desc(*get_gene_to_fam_len(pangenome))) + gene_families = h5f.create_table("/", "geneFamilies", gene_to_fam_desc(*get_gene_to_fam_len(pangenome))) gene_row = gene_families.row for geneFam in tqdm(pangenome.gene_families, total=pangenome.number_of_gene_families(), unit="gene family", disable=disable_bar): diff --git a/ppanggolin/info/info.py b/ppanggolin/info/info.py index 73ef75b5..a21d2ea6 100644 --- a/ppanggolin/info/info.py +++ b/ppanggolin/info/info.py @@ -5,6 +5,8 @@ import argparse # installed libraries +import time + import tables # local libraries @@ -21,7 +23,7 @@ def print_info(pangenome: str, status: bool = False, content: bool = False, para :param parameters: Get pangenome parameters """ if status or content or parameters: - h5f = tables.open_file(pangenome, "r") + h5f = tables.open_file(pangenome, "r+") if status: status_group = h5f.root.status print(f"genomes annotated : {'true' if status_group._v_attrs.genomesAnnotated else 'false'}") @@ -32,8 +34,13 @@ def print_info(pangenome: str, status: bool = False, content: bool = False, para print(f"neighbors graph : {'true' if status_group._v_attrs.NeighborsGraph else 'false'}") if 'Partitionned' in status_group._v_attrs._f_list(): # Partitionned keep working with older version + h5f.close() + h5f = tables.open_file(pangenome, "a") + status_group = h5f.root.status if status_group._v_attrs.Partitionned: status_group._v_attrs.Partitioned = True + else: + status_group._v_attrs.Partitioned = False del 
status_group._v_attrs.Partitionned if status_group._v_attrs.Partitioned: print("pangenome partitioned : true") From 70b4e3fb0b92bedd59695bc4d093a332edf1d8a3 Mon Sep 17 00:00:00 2001 From: Adelme Bazin Date: Mon, 8 Aug 2022 14:08:31 +0200 Subject: [PATCH 12/20] store sequences only once in a new table, and store sequence ids in geneSequences --- ppanggolin/formats/readBinaries.py | 22 +++++++-- ppanggolin/formats/writeBinaries.py | 72 ++++++++++++++++++++++------- 2 files changed, 74 insertions(+), 20 deletions(-) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 471d7c23..ac548bf7 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -103,6 +103,18 @@ def read_chunks(table: Table, column: str = None, chunk: int = 10000): yield row +def read_sequences(h5f: tables.File) -> dict: + """ + Reads the sequences table and returns a seqid2seq dictionnary + :param h5f: the hdf5 file handler + :return: dictionnary linking sequences to the seq identifier + """ + table = h5f.root.sequences + seqid2seq = {} + for row in read_chunks(table,chunk=20000): + seqid2seq[row["seqid"].decode()] = row['dna'].decode() + return seqid2seq + def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter = None, add: str = '', disable_bar: bool = False): """ @@ -119,12 +131,13 @@ def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter h5f = tables.open_file(filename, "r", driver_core_backing_store=0) table = h5f.root.geneSequences list_cds = set(list_cds) if list_cds is not None else None + seqid2seq = read_sequences(h5f) for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): # Read the table chunk per chunk otherwise RAM dies on big pangenomes name_cds = row["gene"].decode() if row["type"] == b"CDS" and (list_cds is None or name_cds in list_cds): file_obj.write('>' + add + name_cds + "\n") - file_obj.write(row["dna"].decode() 
+ "\n") + file_obj.write(seqid2seq[row["seqid"].decode()] + "\n") file_obj.flush() h5f.close() @@ -144,7 +157,7 @@ def read_organism(pangenome: Pangenome, org_name: str, contig_dict: dict, circul """ Read information from pangenome to assign to organism object - :param pangenome: Input pangenome + :param pangenome: Input pangenome :param org_name: Name of the organism :param contig_dict: Dictionary with all contig and associate genes :param circular_contigs: Dictionary of contigs @@ -264,9 +277,10 @@ def read_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: boo "if the annotations have not been loaded.") table = h5f.root.geneSequences + seqid2seq = read_sequences(h5f) for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): gene = pangenome.get_gene(row['gene'].decode()) - gene.add_dna(row['dna'].decode()) + gene.add_dna(seqid2seq[row['seqid'].decode()]) pangenome.status["geneSequences"] = "Loaded" @@ -493,7 +507,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa else: raise Exception(f"The pangenome in file '{filename}' has not been annotated, or has been improperly filled") if gene_sequences: - if h5f.root.status._v_attrs.geneSequences: + if h5f.root.status._v_attrs.geneSequences and h5f.root.status._v_attrs.sequences: logging.getLogger().info("Reading pangenome gene dna sequences...") read_gene_sequences(pangenome, h5f, disable_bar=disable_bar) else: diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index de48e8bb..d8f016c9 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -5,6 +5,7 @@ import logging from collections import Counter, defaultdict import statistics +from typing import Tuple import pkg_resources # installed libraries @@ -12,6 +13,7 @@ import tables from gmpy2 import popcount +#local libraries from ppanggolin.pangenome import Pangenome @@ -51,7 +53,7 @@ def 
gene_desc(org_len, contig_len, id_len, type_len, name_len, product_len, max_ } -def get_max_len_annotations(pangenome: Pangenome) -> (int, int, int, int, int, int, int): +def get_max_len_annotations(pangenome: Pangenome) -> Tuple[int, int, int, int, int, int, int]: """ Get the maximum size of each annotation information to optimize saving @@ -144,42 +146,62 @@ def write_annotations(pangenome: Pangenome, h5f: tables.File, disable_bar: bool gene_table.flush() -def get_gene_sequences_len(pangenome: Pangenome) -> (int, int, int): +def get_gene_sequences_len(pangenome: Pangenome) -> Tuple[int, int]: """ - Get the maximum size of gene sequences to optimize saving + Get the maximum size of gene sequences to optimize disk space :param pangenome: Annotated pangenome :return: maximum size of each annotation """ - max_seq_len = 1 max_gene_id_len = 1 max_gene_type = 1 for gene in pangenome.genes: - if len(gene.dna) > max_seq_len: - max_seq_len = len(gene.dna) if len(gene.ID) > max_gene_id_len: max_gene_id_len = len(gene.ID) if len(gene.type) > max_gene_type: max_gene_type = len(gene.type) - return max_gene_id_len, max_seq_len, max_gene_type + return max_gene_id_len, max_gene_type -def gene_sequences_desc(gene_id_len, gene_seq_len, gene_type_len) -> dict: +def gene_sequences_desc(gene_id_len, gene_type_len) -> dict: """ Create table to save gene sequences :param gene_id_len: Maximum size of gene sequence identifier - :param gene_seq_len: Maximum size of gene sequences :param gene_type_len: Maximum size of gene type :return: Formated table """ return { "gene": tables.StringCol(itemsize=gene_id_len), - "dna": tables.StringCol(itemsize=gene_seq_len), + "seqid": tables.UInt32Col(), "type": tables.StringCol(itemsize=gene_type_len) } +def get_sequence_len(pangenome: Pangenome) -> int: + """ + Get the maximum size of gene sequences to optimize disk space + + :param pangenome: Annotated pangenome + :return: maximum size of each annotation + """ + max_seq_len = 1 + for gene in 
pangenome.genes: + if len(gene.dna) > max_seq_len: + max_seq_len = len(gene.dna) + return max_seq_len + +def sequence_desc(max_seq_len: int) -> dict: + """ + Table description to save sequences + :param max_seq_len: Maximum size of gene type + + :return: Formated table + """ + return { + "seqid": tables.UInt32Col(), + "dna": tables.StringCol(itemsize=max_seq_len) + } def write_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bool = False): """ @@ -191,14 +213,32 @@ def write_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: bo """ gene_seq = h5f.create_table("/", "geneSequences", gene_sequences_desc(*get_gene_sequences_len(pangenome)), expectedrows=len(pangenome.genes)) + #process sequences to save them only once + seq2seqid = {} + id_counter = 0 gene_row = gene_seq.row for gene in tqdm(pangenome.genes, total=pangenome.number_of_gene(), unit="gene", disable=disable_bar): + curr_seq_id = seq2seqid.get(gene.dna) + if curr_seq_id is None: + curr_seq_id = id_counter + seq2seqid[gene.dna] = id_counter + id_counter+=1 gene_row["gene"] = gene.ID - gene_row["dna"] = gene.dna + gene_row["seqid"] = curr_seq_id gene_row["type"] = gene.type gene_row.append() gene_seq.flush() + seq_table = h5f.create_table("/","sequences", sequence_desc(get_sequence_len(pangenome)), + expectedrows=len(seq2seqid)) + + seq_row = seq_table.row + for seq, seqid in seq2seqid.items(): + seq_row["dna"] = seq + seq_row["seqid"] = seqid + seq_row.append() + seq_table.flush() + def gene_fam_desc(max_name_len: int, max_sequence_length: int, max_part_len: int) -> dict: """ @@ -217,7 +257,7 @@ def gene_fam_desc(max_name_len: int, max_sequence_length: int, max_part_len: int } -def get_gene_fam_len(pangenome: Pangenome) -> (int, int, int): +def get_gene_fam_len(pangenome: Pangenome) -> Tuple[int, int, int]: """ Get maximum size of gene families information @@ -334,7 +374,7 @@ def graph_desc(max_gene_id_len): } -def get_gene_id_len(pangenome: Pangenome): +def 
get_gene_id_len(pangenome: Pangenome) -> int: """ Get maximum size of gene id in pangenome graph @@ -391,7 +431,7 @@ def rgp_desc(max_rgp_len, max_gene_len): } -def get_rgp_len(pangenome: Pangenome): +def get_rgp_len(pangenome: Pangenome) -> Tuple[int, int]: """ Get maximum size of region of genomic plasticity and gene @@ -448,7 +488,7 @@ def spot_desc(max_rgp_len): } -def get_spot_desc(pangenome: Pangenome): +def get_spot_desc(pangenome: Pangenome) -> int: """ Get maximum size of region of genomic plasticity in hotspot @@ -502,7 +542,7 @@ def mod_desc(gene_fam_name_len): } -def get_mod_desc(pangenome: Pangenome): +def get_mod_desc(pangenome: Pangenome) -> int: """ Get maximum size of gene families name in modules From aa677b2053522a8c84f99d0d14b024ee8b229e3e Mon Sep 17 00:00:00 2001 From: Adelme Bazin Date: Mon, 8 Aug 2022 18:32:58 +0200 Subject: [PATCH 13/20] do not decode UInt32 seqid --- ppanggolin/formats/readBinaries.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index ac548bf7..734c6db9 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -112,7 +112,7 @@ def read_sequences(h5f: tables.File) -> dict: table = h5f.root.sequences seqid2seq = {} for row in read_chunks(table,chunk=20000): - seqid2seq[row["seqid"].decode()] = row['dna'].decode() + seqid2seq[row["seqid"]] = row['dna'].decode() return seqid2seq def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter = None, add: str = '', @@ -137,7 +137,7 @@ def get_gene_sequences_from_file(filename: str, file_obj: TextIO, list_cds: iter name_cds = row["gene"].decode() if row["type"] == b"CDS" and (list_cds is None or name_cds in list_cds): file_obj.write('>' + add + name_cds + "\n") - file_obj.write(seqid2seq[row["seqid"].decode()] + "\n") + file_obj.write(seqid2seq[row["seqid"]] + "\n") file_obj.flush() h5f.close() @@ -280,7 +280,7 @@ def 
read_gene_sequences(pangenome: Pangenome, h5f: tables.File, disable_bar: boo seqid2seq = read_sequences(h5f) for row in tqdm(read_chunks(table, chunk=20000), total=table.nrows, unit="gene", disable=disable_bar): gene = pangenome.get_gene(row['gene'].decode()) - gene.add_dna(seqid2seq[row['seqid'].decode()]) + gene.add_dna(seqid2seq[row['seqid']]) pangenome.status["geneSequences"] = "Loaded" From 34d0015ca392fa574e8428c9ec908444ae5fd416 Mon Sep 17 00:00:00 2001 From: Adelme Bazin Date: Mon, 8 Aug 2022 18:57:03 +0200 Subject: [PATCH 14/20] remove useless check of sequences in status --- ppanggolin/formats/readBinaries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 734c6db9..7a1ea65a 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -507,7 +507,7 @@ def read_pangenome(pangenome, annotation: bool = False, gene_families: bool = Fa else: raise Exception(f"The pangenome in file '{filename}' has not been annotated, or has been improperly filled") if gene_sequences: - if h5f.root.status._v_attrs.geneSequences and h5f.root.status._v_attrs.sequences: + if h5f.root.status._v_attrs.geneSequences: logging.getLogger().info("Reading pangenome gene dna sequences...") read_gene_sequences(pangenome, h5f, disable_bar=disable_bar) else: From 71cfd301e6daeefa8efd55494061e195531b829e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Mon, 7 Nov 2022 17:59:33 +0100 Subject: [PATCH 15/20] Fix name of metrics in read --- VERSION | 2 +- ppanggolin/formats/readBinaries.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index bd65ca14..0a88fc62 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.102 +1.2.103 diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 471d7c23..b7e2eb3c 100644 --- a/ppanggolin/formats/readBinaries.py +++ 
b/ppanggolin/formats/readBinaries.py @@ -403,7 +403,7 @@ def read_info(h5f: tables.File): if info_group._v_attrs['numberOfPartitions'] != 3: for key, val in info_group._v_attrs['numberOfSubpartitions'].items(): print(f"Shell {key} : {val}") - if 'genome_fluidity' in info_group._v_attrs._f_list(): + if 'genomes_fluidity' in info_group._v_attrs._f_list(): out = "Genomes fluidity: " + \ ", ".join(f"{subset}={round(value, 3)}" for subset, value in info_group._v_attrs['genomes_fluidity'].items()) From 079c517a5df22a43a4b9a1e9a86b516f84e5040a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 17 Nov 2022 15:42:29 +0100 Subject: [PATCH 16/20] Fix bokeh dependencies problem with python 3.8 --- VERSION | 2 +- ppanggolin/figures/draw_spot.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/VERSION b/VERSION index 0a88fc62..9915bd79 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.103 +1.2.104 diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py index 02c69332..79d22b8f 100644 --- a/ppanggolin/figures/draw_spot.py +++ b/ppanggolin/figures/draw_spot.py @@ -15,7 +15,7 @@ import networkx as nx from tqdm import tqdm -from bokeh.plotting import ColumnDataSource, figure, save, Figure +from bokeh.plotting import ColumnDataSource, figure, save from bokeh.io import output_file from bokeh.layouts import column, row from bokeh.models import WheelZoomTool, LabelSet, Slider, CustomJS, HoverTool, RadioGroup, Div, Column, GlyphRenderer @@ -355,7 +355,7 @@ def color_str(color_element: str) -> str: gene_outline_size) -def add_gene_labels(fig: Figure, source_data: ColumnDataSource) -> (Column, LabelSet): +def add_gene_labels(fig, source_data: ColumnDataSource) -> (Column, LabelSet): """ :param fig: @@ -437,7 +437,7 @@ def mk_genomes(gene_lists: list, ordered_counts: list) -> (ColumnDataSource, lis return ColumnDataSource(data=df), tooltip -def add_genome_tools(fig: Figure, gene_recs: GlyphRenderer, genome_recs: 
GlyphRenderer, gene_source: ColumnDataSource, +def add_genome_tools(fig, gene_recs: GlyphRenderer, genome_recs: GlyphRenderer, gene_source: ColumnDataSource, genome_source: ColumnDataSource, nb: int, gene_labels: LabelSet): """ From 56f862837abc30ce5876adebe90a3e074ef082f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Arnoux?= Date: Thu, 17 Nov 2022 17:09:04 +0100 Subject: [PATCH 17/20] Fix type problem --- VERSION | 2 +- ppanggolin/nem/partition.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/VERSION b/VERSION index 9915bd79..e7d3ff23 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.104 +1.2.105 diff --git a/ppanggolin/nem/partition.py b/ppanggolin/nem/partition.py index 4ac38bdc..79637079 100644 --- a/ppanggolin/nem/partition.py +++ b/ppanggolin/nem/partition.py @@ -505,7 +505,7 @@ def partition(pangenome: Pangenome, tmpdir: str, outputdir: str = None, beta: fl if chunk_size < len(organisms): validated = set() - def validate_family(res: Union[(dict, None, None), (int, float, float), (dict, dict, float)]): + def validate_family(res): """ Validate partition assignation to families From 8f3fd5f8870ce5985c5776707b78e33a9f1484d8 Mon Sep 17 00:00:00 2001 From: Adelme Bazin Date: Tue, 7 Feb 2023 19:28:33 +0000 Subject: [PATCH 18/20] fix wrong renaming of the geneFamilies tables in hdf5 --- ppanggolin/formats/writeBinaries.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index 9a344d53..719b6517 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -309,9 +309,9 @@ def write_gene_families(pangenome: Pangenome, h5f: tables.File, force: bool = Fa :param force: Force to write gene families in hdf5 file if there is already gene families :param disable_bar: Disable progress bar """ - if '/gene_families' in h5f and force is True: + if '/geneFamilies' in h5f and force is True: 
logging.getLogger().info("Erasing the formerly computed gene family to gene associations...") - h5f.remove_node('/', 'gene_families') # erasing the table, and rewriting a new one. + h5f.remove_node('/', 'geneFamilies') # erasing the table, and rewriting a new one. gene_families = h5f.create_table("/", "geneFamilies", gene_to_fam_desc(*get_gene_to_fam_len(pangenome))) gene_row = gene_families.row for geneFam in tqdm(pangenome.gene_families, total=pangenome.number_of_gene_families(), unit="gene family", @@ -816,9 +816,9 @@ def erase_pangenome(pangenome: Pangenome, graph: bool = False, gene_families: bo status_group._v_attrs.NeighborsGraph = False pangenome.status["neighborsGraph"] = "No" h5f.del_node_attr(info_group, "numberOfEdges") - if '/gene_families' in h5f and gene_families: + if '/geneFamilies' in h5f and gene_families: logging.getLogger().info("Erasing the formerly computed gene family to gene associations...") - h5f.remove_node('/', 'gene_families') # erasing the table, and rewriting a new one. + h5f.remove_node('/', 'geneFamilies') # erasing the table, and rewriting a new one. 
pangenome.status["defragmented"] = "No" pangenome.status["genesClustered"] = "No" status_group._v_attrs.defragmented = False From 4a0a2d20c47beaf96ba4258a2d3b725f7d985dac Mon Sep 17 00:00:00 2001 From: Adelme Bazin Date: Tue, 7 Feb 2023 19:34:09 +0000 Subject: [PATCH 19/20] fix unexpected warning message about infer_singletons when redoing a clustering --- ppanggolin/cluster/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py index ec82efbe..f3abf286 100644 --- a/ppanggolin/cluster/cluster.py +++ b/ppanggolin/cluster/cluster.py @@ -434,7 +434,7 @@ def launch(args: argparse.Namespace): pangenome = Pangenome() pangenome.add_file(args.pangenome) if args.clusters is None: - if args.infer_singletons is not None: + if args.infer_singletons is True: logging.getLogger().warning("--infer_singletons option is not compatible with clustering creation. " "To infer singleton you should give a clustering") clustering(pangenome, args.tmpdir, args.cpu, defrag=not args.no_defrag, code=args.translation_table, From 684d79d1371ffb7cc13d53384fb1c5d80924706c Mon Sep 17 00:00:00 2001 From: Adelme Bazin Date: Tue, 7 Feb 2023 20:01:58 +0000 Subject: [PATCH 20/20] fix partitions not getting written properly in the hdf5 --- ppanggolin/formats/writeBinaries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py index 719b6517..711f8cda 100644 --- a/ppanggolin/formats/writeBinaries.py +++ b/ppanggolin/formats/writeBinaries.py @@ -230,7 +230,7 @@ def get_gene_fam_len(pangenome: Pangenome) -> Tuple[int, int, int]: """ max_gene_fam_name_len = 1 max_gene_fam_seq_len = 1 - max_part_len = 1 + max_part_len = 3 for genefam in pangenome.gene_families: if len(genefam.sequence) > max_gene_fam_seq_len: max_gene_fam_seq_len = len(genefam.sequence)