Merge pull request #182 from labgem/dev

Patches on the v2.0.2
labgem · Feb 22, 2024 · 9d0821e · 9d0821e
2 parents c83ceb7 + 9070614
commit 9d0821e
Show file tree

Hide file tree

Showing 12 changed files with 129 additions and 69 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -72,6 +72,12 @@ jobs:
         ppanggolin write_pangenome -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06  --gexf --light_gexf --csv --Rtab --stats --partitions --compress --json --spots --regions --borders --families_tsv --cpu 1 
         ppanggolin write_genomes  -p stepbystep/pangenome.h5 --output stepbystep -f --fasta genomes.fasta.list --gff --proksee --table
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta genomes.fasta.list
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families rgp --gene_families rgp 
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families softcore --gene_families softcore 
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families core
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0
+
         ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
         ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log
         cd - 

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.0.2
+2.0.3
diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py
@@ -101,8 +101,10 @@ def launch_prodigal(contig_sequences: Dict[str, str], org: Organism, code: int =
         mask=True,  # -m: Treat runs of N as masked sequence; don't build genes across them.
         min_gene=120  # This is to prevent error with mmseqs translatenucs that cut too short sequences
     )
-    gene_finder.train(max(sequences.values(), key=len), force_nonsd=False,
-                      translation_table=code)  # -g: Specify a translation table to use (default 11).
+
+    if not use_meta:
+        gene_finder.train(*contig_sequences.values(), force_nonsd=False,
+                        translation_table=code)  # -g: Specify a translation table to use (default 11).
     gene_counter = 1
     for contig_name, sequence in sequences.items():
         for pred in gene_finder.find_genes(sequence):
@@ -333,6 +335,8 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str
         max_contig_len = max(len(contig) for contig in org.contigs)
         if max_contig_len < 20000:  # case of short sequence
             use_meta = True
+            logging.getLogger("PPanGGOLiN").info(f"Using the metagenomic mode to predict genes for {org_name}, as all its contigs are < 20KB in size.")
+
         else:
             use_meta = False
     else:

diff --git a/ppanggolin/cluster/cluster.py b/ppanggolin/cluster/cluster.py
@@ -303,7 +303,9 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool =
     """
 
     if keep_tmp_files:
-        dir_name = 'clustering_tmpdir' + time.strftime("_%Y-%m-%d_%H.%M.%S", time.localtime())
+        date = time.strftime("_%Y-%m-%d_%H-%M-%S", time.localtime())
+
+        dir_name = f'clustering_tmpdir_{date}_PID{os.getpid()}'
         tmp_path = Path(tmpdir) / dir_name
         mk_outdir(tmp_path, force=True)
     else:

diff --git a/ppanggolin/figures/draw_spot.py b/ppanggolin/figures/draw_spot.py
@@ -22,7 +22,9 @@
 from bokeh.plotting import ColumnDataSource, figure, save
 from bokeh.io import output_file
 from bokeh.layouts import column, row
-from bokeh.models import WheelZoomTool, LabelSet, Slider, CustomJS, HoverTool, RadioGroup, Div, Column, GlyphRenderer
+from bokeh.models import WheelZoomTool, LabelSet, Slider, CustomJS, HoverTool, RadioGroup, Div, Column, GlyphRenderer, RadioButtonGroup
+from bokeh.io import show
+
 
 # local libraries
 from ppanggolin.pangenome import Pangenome
@@ -320,44 +322,48 @@ def add_gene_tools(recs: GlyphRenderer, source_data: ColumnDataSource) -> Column
     """
 
     def color_str(color_element: str) -> str:
-        """ Javascript code to switch between partition, family and module color for the given 'color_element'
+        """ 
+        Javascript code to switch between partition, family and module color for the given 'color_element'
 
         :param color_element:
 
         :return: Javascript code
         """
         return f"""
-            if(this.active == 0){{
+            if(btn.active == 0){{
                 source.data['{color_element}'] = source.data['partition_color'];
-            }}else if(this.active == 1){{
+                console.log('partition_color')
+            }}else if(btn.active == 1){{
                 source.data['{color_element}'] = source.data['family_color'];
-            }}else if(this.active == 2){{
+                console.log('family_color')
+            }}else if(btn.active == 2){{
                 source.data['{color_element}'] = source.data['module_color'];
+                console.log('module_color')
             }}
             recs.{color_element} = source.data['{color_element}'];
             source.change.emit();
         """
 
-    radio_line_color = RadioGroup(labels=["partition", "family", "module"], active=0)
-    radio_fill_color = RadioGroup(labels=["partition", "family", "module"], active=1)
-    radio_line_color.js_on_change('streaming',
-                                  CustomJS(args=dict(recs=recs, source=source_data),
+    radio_line_color = RadioButtonGroup(labels=["partition", "family", "module"], active=0)
+    radio_fill_color = RadioButtonGroup(labels=["partition", "family", "module"], active=1)
+
+    radio_line_color.js_on_event("button_click",
+                                  CustomJS(args=dict(recs=recs, source=source_data, btn=radio_line_color),
                                            code=color_str("line_color")))
 
-    radio_fill_color.js_on_change('streaming',
-                                  CustomJS(args=dict(recs=recs, source=source_data),
+    radio_fill_color.js_on_event("button_click",
+                                  CustomJS(args=dict(recs=recs, source=source_data, btn=radio_fill_color),
                                            code=color_str("fill_color")))
 
     color_header = Div(text="<b>Genes:</b>")
-    line_title = Div(text="""Color to use for gene outlines:""",
-                     width=200, height=100)
-    fill_title = Div(text="""Color to fill genes with:""",
-                     width=200, height=100)
+    line_title = Div(text="""Color to use for gene outlines:""")
+    fill_title = Div(text="""Color to fill genes with:""")
 
     gene_outline_size = Slider(start=0, end=10, value=5, step=0.1, title="Gene outline size:")
     gene_outline_size.js_on_change('value', CustomJS(args=dict(other=recs),
                                                      code="""
                 other.glyph.line_width = this.value;
+                console.log('SLIDER: active=' + this.value, this.toString())
                 """
                                                      ))
 
@@ -376,8 +382,8 @@ def add_gene_labels(fig, source_data: ColumnDataSource) -> (Column, LabelSet):
     slider_font = Slider(start=0, end=64, value=16, step=1, title="Gene label font size in px")
     slider_angle = Slider(start=0, end=pi / 2, value=0, step=0.01, title="Gene label angle in radian")
 
-    radio_label_type = RadioGroup(labels=["name", "product", "family", "local identifier", "gene ID", "none"],
-                                  active=0)
+    radio_label_type = RadioButtonGroup(labels=["name", "product", "family", "local identifier", "gene ID", "none"],
+                                  active=1)
 
     slider_angle.js_link('value', labels, 'angle')
 
@@ -387,30 +393,29 @@ def add_gene_labels(fig, source_data: ColumnDataSource) -> (Column, LabelSet):
                                       )
                              )
 
-    radio_label_type.js_on_change('streaming',
-                                  CustomJS(args=dict(other=labels, source=source_data),
+    radio_label_type.js_on_event("button_click",
+                                  CustomJS(args=dict(other=labels, source=source_data, btn=radio_label_type),
                                            code="""
-                if(this.active == 5){
+                if(btn.active == 5){
                     source.data['label'] = [];
                     for(var i=0;i<source.data['name'].length;i++){
                         source.data['label'].push('');
                     }
-                }else if(this.active == 3){
+                }else if(btn.active == 3){
                     source.data['label'] = source.data['gene_local_ID'];
-                }else if(this.active == 4){
+                }else if(btn.active == 4){
                     source.data['label'] = source.data['gene_ID'];
                 }
                 else{
-                    source.data['label'] = source.data[this.labels[this.active]];
+                    source.data['label'] = source.data[btn.labels[btn.active]];
                 }
                 other.source = source;
                 source.change.emit();
                 """
                                            ))
 
     label_header = Div(text="<b>Gene labels:</b>")
-    radio_title = Div(text="""Gene labels to use:""",
-                      width=200, height=100)
+    radio_title = Div(text="""Gene labels to use:""",)
     labels_block = column(label_header, row(slider_font, slider_angle), column(radio_title, radio_label_type))
 
     fig.add_layout(labels)

diff --git a/ppanggolin/formats/writeSequences.py b/ppanggolin/formats/writeSequences.py
@@ -55,7 +55,6 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co
     :param compress: Compress the file in .gz
     :param disable_bar: Disable progress bar
     """
-    assert genes in poss_values, f"Selected part to write genes not in {poss_values}"
 
     logging.getLogger("PPanGGOLiN").info("Writing all the gene nucleotide sequences...")
     outpath = output / f"{genes}_genes.fna"
@@ -94,35 +93,40 @@ def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_c
     if partition == 'all':
         logging.getLogger("PPanGGOLiN").info(f"Writing all of the {type_name}...")
         genefams = pangenome.gene_families
+
     elif partition in ['persistent', 'shell', 'cloud']:
         logging.getLogger("PPanGGOLiN").info(f"Writing the {type_name} of the {partition}...")
         for fam in pangenome.gene_families:
             if fam.named_partition == partition:
                 genefams.add(fam)
+
     elif partition == "rgp":
         logging.getLogger("PPanGGOLiN").info(f"Writing the {type_name} in RGPs...")
         for region in pangenome.regions:
-            genefams |= region.families
+            genefams |= set(region.families)
+
     elif partition == "softcore":
         logging.getLogger("PPanGGOLiN").info(
             f"Writing the {type_name} in {partition} genome, that are present in more than {soft_core} of genomes")
         threshold = pangenome.number_of_organisms * soft_core
         for fam in pangenome.gene_families:
             if fam.number_of_organisms >= threshold:
                 genefams.add(fam)
+
     elif partition == "core":
         logging.getLogger("PPanGGOLiN").info(f"Writing the representative {type_name} of the {partition} "
                                              "gene families...")
         for fam in pangenome.gene_families:
             if fam.number_of_organisms == pangenome.number_of_organisms:
                 genefams.add(fam)
+
     elif "module_" in partition:
         logging.getLogger("PPanGGOLiN").info(f"Writing the representation {type_name} of {partition} gene families...")
         mod_id = int(partition.replace("module_", ""))
         for mod in pangenome.modules:
             # could be way more efficient with a dict structure instead of a set
             if mod.ID == mod_id:
-                genefams |= mod.families
+                genefams |= set(mod.families)
                 break
     return genefams
 
@@ -139,7 +143,6 @@ def write_fasta_gene_fam(pangenome: Pangenome, output: Path, gene_families: str,
     :param compress: Compress the file in .gz
     :param disable_bar: Disable progress bar
     """
-    assert gene_families in poss_values, f"Selected part to write gene families not in {poss_values}"
 
     outpath = output / f"{gene_families}_nucleotide_families.fasta"
 
@@ -165,8 +168,6 @@ def write_fasta_prot_fam(pangenome: Pangenome, output: Path, prot_families: str,
     :param compress: Compress the file in .gz
     :param disable_bar: Disable progress bar
     """
-    assert prot_families in poss_values, (f"Selected part: {prot_families} "
-                                          f"to write protein families not in {poss_values}")
 
     outpath = output / f"{prot_families}_protein_families.faa"
 
@@ -371,6 +372,8 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None,
 
     if prot_families is not None:
         need_families = True
+        if prot_families in ["core", "softcore"]:
+            need_annotations = True
 
     if any(x is not None for x in [regions, genes, gene_families]):
         need_annotations = True
@@ -380,7 +383,7 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None,
         need_regions = True
     if any(x in ["persistent", "shell", "cloud"] for x in (genes, gene_families, prot_families)):
         need_partitions = True
-    for x in (genes, gene_families):
+    for x in (genes, gene_families, prot_families):
         if x is not None and 'module_' in x:
             need_modules = True
 
@@ -452,13 +455,29 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser
     parser_seq(parser)
     return parser
 
+def filter_values(arg_value: str):
+    """
+    Check filter value to ensure they are in the expected format.
+
+    :param arg_value: Argument value that is being tested.
+
+    :return: The same argument if it is valid.
+
+    :raises argparse.ArgumentTypeError: If the argument value is not in the expected format.
+    """
+    if arg_value in poss_values or module_regex.match(arg_value):
+        return arg_value
+    else:
+        raise argparse.ArgumentTypeError(f"Invalid choice '{arg_value}'. {poss_values_log}")
+
 
 def parser_seq(parser: argparse.ArgumentParser):
     """
     Parser for specific argument of fasta command
 
     :param parser: parser for align argument
     """
+
     required = parser.add_argument_group(title="Required arguments",
                                          description="One of the following arguments is required :")
     required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file")
@@ -474,18 +493,20 @@ def parser_seq(parser: argparse.ArgumentParser):
                          help="A tab-separated file listing the genome names, and the gff/gbff filepath of its "
                               "annotations (the files can be compressed with gzip). One line per genome. "
                               "If this is provided, those annotations will be used.")
+
     onereq = parser.add_argument_group(title="Output file",
                                        description="At least one of the following argument is required. "
                                                    "Indicating 'all' writes all elements. Writing a partition "
                                                    "('persistent', 'shell' or 'cloud') write the elements associated "
                                                    "to said partition. Writing 'rgp' writes elements associated to RGPs"
                                        )
-    onereq.add_argument("--genes", required=False, type=str, choices=poss_values,
+    onereq.add_argument("--genes", required=False, type=filter_values,
                         help=f"Write all nucleotide CDS sequences. {poss_values_log}")
-    onereq.add_argument("--prot_families", required=False, type=str, choices=poss_values,
+    onereq.add_argument("--prot_families", required=False, type=filter_values,
                         help=f"Write representative amino acid sequences of gene families. {poss_values_log}")
-    onereq.add_argument("--gene_families", required=False, type=str, choices=poss_values,
+    onereq.add_argument("--gene_families", required=False, type=filter_values,
                         help=f"Write representative nucleotide sequences of gene families. {poss_values_log}")
+
     optional = parser.add_argument_group(title="Optional arguments")
     # could make choice to allow customization
     optional.add_argument("--regions", required=False, type=str, choices=["all", "complete"],

diff --git a/ppanggolin/geneFamily.py b/ppanggolin/geneFamily.py
@@ -386,12 +386,12 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'):
     def get_org_dict(self) -> Dict[Organism, Set[Gene]]:
         """Returns the organisms and the genes belonging to the gene family
 
-        :return: A dictionnary of organism as key and set of genes as values
+        :return: A dictionary of organism as key and set of genes as values
         """
         if len(self._genePerOrg) == 0:
             for gene in self.genes:
                 if gene.organism is None:
-                    raise AttributeError(f"Gene: {gene.name} is not fill with organism")
+                    raise AttributeError(f"Gene: {gene.name} is not fill with genome")
                 self._genePerOrg[gene.organism].add(gene)
         return self._genePerOrg