Skip to content

Commit

Permalink
Merge pull request #182 from labgem/dev
Browse files Browse the repository at this point in the history
Patches on the v2.0.2
  • Loading branch information
JeanMainguy authored Feb 22, 2024
2 parents c83ceb7 + 9070614 commit 9d0821e
Show file tree
Hide file tree
Showing 12 changed files with 129 additions and 69 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,12 @@ jobs:
ppanggolin write_pangenome -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06 --gexf --light_gexf --csv --Rtab --stats --partitions --compress --json --spots --regions --borders --families_tsv --cpu 1
ppanggolin write_genomes -p stepbystep/pangenome.h5 --output stepbystep -f --fasta genomes.fasta.list --gff --proksee --table
ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta genomes.fasta.list
ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families rgp --gene_families rgp
ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families softcore --gene_families softcore
ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0
ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families core
ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0
ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log
cd -
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.0.2
2.0.3
8 changes: 6 additions & 2 deletions ppanggolin/annotate/synta.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,10 @@ def launch_prodigal(contig_sequences: Dict[str, str], org: Organism, code: int =
mask=True, # -m: Treat runs of N as masked sequence; don't build genes across them.
min_gene=120 # This is to prevent error with mmseqs translatenucs that cut too short sequences
)
gene_finder.train(max(sequences.values(), key=len), force_nonsd=False,
translation_table=code) # -g: Specify a translation table to use (default 11).

if not use_meta:
gene_finder.train(*contig_sequences.values(), force_nonsd=False,
translation_table=code) # -g: Specify a translation table to use (default 11).
gene_counter = 1
for contig_name, sequence in sequences.items():
for pred in gene_finder.find_genes(sequence):
Expand Down Expand Up @@ -333,6 +335,8 @@ def annotate_organism(org_name: str, file_name: Path, circular_contigs: List[str
max_contig_len = max(len(contig) for contig in org.contigs)
if max_contig_len < 20000: # case of short sequence
use_meta = True
logging.getLogger("PPanGGOLiN").info(f"Using the metagenomic mode to predict genes for {org_name}, as all its contigs are < 20KB in size.")

else:
use_meta = False
else:
Expand Down
4 changes: 3 additions & 1 deletion ppanggolin/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,9 @@ def clustering(pangenome: Pangenome, tmpdir: Path, cpu: int = 1, defrag: bool =
"""

if keep_tmp_files:
dir_name = 'clustering_tmpdir' + time.strftime("_%Y-%m-%d_%H.%M.%S", time.localtime())
date = time.strftime("_%Y-%m-%d_%H-%M-%S", time.localtime())

dir_name = f'clustering_tmpdir_{date}_PID{os.getpid()}'
tmp_path = Path(tmpdir) / dir_name
mk_outdir(tmp_path, force=True)
else:
Expand Down
55 changes: 30 additions & 25 deletions ppanggolin/figures/draw_spot.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@
from bokeh.plotting import ColumnDataSource, figure, save
from bokeh.io import output_file
from bokeh.layouts import column, row
from bokeh.models import WheelZoomTool, LabelSet, Slider, CustomJS, HoverTool, RadioGroup, Div, Column, GlyphRenderer
from bokeh.models import WheelZoomTool, LabelSet, Slider, CustomJS, HoverTool, RadioGroup, Div, Column, GlyphRenderer, RadioButtonGroup
from bokeh.io import show


# local libraries
from ppanggolin.pangenome import Pangenome
Expand Down Expand Up @@ -320,44 +322,48 @@ def add_gene_tools(recs: GlyphRenderer, source_data: ColumnDataSource) -> Column
"""

def color_str(color_element: str) -> str:
""" Javascript code to switch between partition, family and module color for the given 'color_element'
"""
Javascript code to switch between partition, family and module color for the given 'color_element'
:param color_element:
:return: Javascript code
"""
return f"""
if(this.active == 0){{
if(btn.active == 0){{
source.data['{color_element}'] = source.data['partition_color'];
}}else if(this.active == 1){{
console.log('partition_color')
}}else if(btn.active == 1){{
source.data['{color_element}'] = source.data['family_color'];
}}else if(this.active == 2){{
console.log('family_color')
}}else if(btn.active == 2){{
source.data['{color_element}'] = source.data['module_color'];
console.log('module_color')
}}
recs.{color_element} = source.data['{color_element}'];
source.change.emit();
"""

radio_line_color = RadioGroup(labels=["partition", "family", "module"], active=0)
radio_fill_color = RadioGroup(labels=["partition", "family", "module"], active=1)
radio_line_color.js_on_change('streaming',
CustomJS(args=dict(recs=recs, source=source_data),
radio_line_color = RadioButtonGroup(labels=["partition", "family", "module"], active=0)
radio_fill_color = RadioButtonGroup(labels=["partition", "family", "module"], active=1)

radio_line_color.js_on_event("button_click",
CustomJS(args=dict(recs=recs, source=source_data, btn=radio_line_color),
code=color_str("line_color")))

radio_fill_color.js_on_change('streaming',
CustomJS(args=dict(recs=recs, source=source_data),
radio_fill_color.js_on_event("button_click",
CustomJS(args=dict(recs=recs, source=source_data, btn=radio_fill_color),
code=color_str("fill_color")))

color_header = Div(text="<b>Genes:</b>")
line_title = Div(text="""Color to use for gene outlines:""",
width=200, height=100)
fill_title = Div(text="""Color to fill genes with:""",
width=200, height=100)
line_title = Div(text="""Color to use for gene outlines:""")
fill_title = Div(text="""Color to fill genes with:""")

gene_outline_size = Slider(start=0, end=10, value=5, step=0.1, title="Gene outline size:")
gene_outline_size.js_on_change('value', CustomJS(args=dict(other=recs),
code="""
other.glyph.line_width = this.value;
console.log('SLIDER: active=' + this.value, this.toString())
"""
))

Expand All @@ -376,8 +382,8 @@ def add_gene_labels(fig, source_data: ColumnDataSource) -> (Column, LabelSet):
slider_font = Slider(start=0, end=64, value=16, step=1, title="Gene label font size in px")
slider_angle = Slider(start=0, end=pi / 2, value=0, step=0.01, title="Gene label angle in radian")

radio_label_type = RadioGroup(labels=["name", "product", "family", "local identifier", "gene ID", "none"],
active=0)
radio_label_type = RadioButtonGroup(labels=["name", "product", "family", "local identifier", "gene ID", "none"],
active=1)

slider_angle.js_link('value', labels, 'angle')

Expand All @@ -387,30 +393,29 @@ def add_gene_labels(fig, source_data: ColumnDataSource) -> (Column, LabelSet):
)
)

radio_label_type.js_on_change('streaming',
CustomJS(args=dict(other=labels, source=source_data),
radio_label_type.js_on_event("button_click",
CustomJS(args=dict(other=labels, source=source_data, btn=radio_label_type),
code="""
if(this.active == 5){
if(btn.active == 5){
source.data['label'] = [];
for(var i=0;i<source.data['name'].length;i++){
source.data['label'].push('');
}
}else if(this.active == 3){
}else if(btn.active == 3){
source.data['label'] = source.data['gene_local_ID'];
}else if(this.active == 4){
}else if(btn.active == 4){
source.data['label'] = source.data['gene_ID'];
}
else{
source.data['label'] = source.data[this.labels[this.active]];
source.data['label'] = source.data[btn.labels[btn.active]];
}
other.source = source;
source.change.emit();
"""
))

label_header = Div(text="<b>Gene labels:</b>")
radio_title = Div(text="""Gene labels to use:""",
width=200, height=100)
radio_title = Div(text="""Gene labels to use:""",)
labels_block = column(label_header, row(slider_font, slider_angle), column(radio_title, radio_label_type))

fig.add_layout(labels)
Expand Down
41 changes: 31 additions & 10 deletions ppanggolin/formats/writeSequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ def write_gene_sequences(pangenome: Pangenome, output: Path, genes: str, soft_co
:param compress: Compress the file in .gz
:param disable_bar: Disable progress bar
"""
assert genes in poss_values, f"Selected part to write genes not in {poss_values}"

logging.getLogger("PPanGGOLiN").info("Writing all the gene nucleotide sequences...")
outpath = output / f"{genes}_genes.fna"
Expand Down Expand Up @@ -94,35 +93,40 @@ def select_families(pangenome: Pangenome, partition: str, type_name: str, soft_c
if partition == 'all':
logging.getLogger("PPanGGOLiN").info(f"Writing all of the {type_name}...")
genefams = pangenome.gene_families

elif partition in ['persistent', 'shell', 'cloud']:
logging.getLogger("PPanGGOLiN").info(f"Writing the {type_name} of the {partition}...")
for fam in pangenome.gene_families:
if fam.named_partition == partition:
genefams.add(fam)

elif partition == "rgp":
logging.getLogger("PPanGGOLiN").info(f"Writing the {type_name} in RGPs...")
for region in pangenome.regions:
genefams |= region.families
genefams |= set(region.families)

elif partition == "softcore":
logging.getLogger("PPanGGOLiN").info(
f"Writing the {type_name} in {partition} genome, that are present in more than {soft_core} of genomes")
threshold = pangenome.number_of_organisms * soft_core
for fam in pangenome.gene_families:
if fam.number_of_organisms >= threshold:
genefams.add(fam)

elif partition == "core":
logging.getLogger("PPanGGOLiN").info(f"Writing the representative {type_name} of the {partition} "
"gene families...")
for fam in pangenome.gene_families:
if fam.number_of_organisms == pangenome.number_of_organisms:
genefams.add(fam)

elif "module_" in partition:
logging.getLogger("PPanGGOLiN").info(f"Writing the representation {type_name} of {partition} gene families...")
mod_id = int(partition.replace("module_", ""))
for mod in pangenome.modules:
# could be way more efficient with a dict structure instead of a set
if mod.ID == mod_id:
genefams |= mod.families
genefams |= set(mod.families)
break
return genefams

Expand All @@ -139,7 +143,6 @@ def write_fasta_gene_fam(pangenome: Pangenome, output: Path, gene_families: str,
:param compress: Compress the file in .gz
:param disable_bar: Disable progress bar
"""
assert gene_families in poss_values, f"Selected part to write gene families not in {poss_values}"

outpath = output / f"{gene_families}_nucleotide_families.fasta"

Expand All @@ -165,8 +168,6 @@ def write_fasta_prot_fam(pangenome: Pangenome, output: Path, prot_families: str,
:param compress: Compress the file in .gz
:param disable_bar: Disable progress bar
"""
assert prot_families in poss_values, (f"Selected part: {prot_families} "
f"to write protein families not in {poss_values}")

outpath = output / f"{prot_families}_protein_families.faa"

Expand Down Expand Up @@ -371,6 +372,8 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None,

if prot_families is not None:
need_families = True
if prot_families in ["core", "softcore"]:
need_annotations = True

if any(x is not None for x in [regions, genes, gene_families]):
need_annotations = True
Expand All @@ -380,7 +383,7 @@ def write_sequence_files(pangenome: Pangenome, output: Path, fasta: Path = None,
need_regions = True
if any(x in ["persistent", "shell", "cloud"] for x in (genes, gene_families, prot_families)):
need_partitions = True
for x in (genes, gene_families):
for x in (genes, gene_families, prot_families):
if x is not None and 'module_' in x:
need_modules = True

Expand Down Expand Up @@ -452,13 +455,29 @@ def subparser(sub_parser: argparse._SubParsersAction) -> argparse.ArgumentParser
parser_seq(parser)
return parser

def filter_values(arg_value: str):
"""
Check filter value to ensure they are in the expected format.
:param arg_value: Argument value that is being tested.
:return: The same argument if it is valid.
:raises argparse.ArgumentTypeError: If the argument value is not in the expected format.
"""
if arg_value in poss_values or module_regex.match(arg_value):
return arg_value
else:
raise argparse.ArgumentTypeError(f"Invalid choice '{arg_value}'. {poss_values_log}")


def parser_seq(parser: argparse.ArgumentParser):
"""
Parser for specific argument of fasta command
:param parser: parser for align argument
"""

required = parser.add_argument_group(title="Required arguments",
description="One of the following arguments is required :")
required.add_argument('-p', '--pangenome', required=False, type=Path, help="The pangenome .h5 file")
Expand All @@ -474,18 +493,20 @@ def parser_seq(parser: argparse.ArgumentParser):
help="A tab-separated file listing the genome names, and the gff/gbff filepath of its "
"annotations (the files can be compressed with gzip). One line per genome. "
"If this is provided, those annotations will be used.")

onereq = parser.add_argument_group(title="Output file",
description="At least one of the following argument is required. "
"Indicating 'all' writes all elements. Writing a partition "
"('persistent', 'shell' or 'cloud') write the elements associated "
"to said partition. Writing 'rgp' writes elements associated to RGPs"
)
onereq.add_argument("--genes", required=False, type=str, choices=poss_values,
onereq.add_argument("--genes", required=False, type=filter_values,
help=f"Write all nucleotide CDS sequences. {poss_values_log}")
onereq.add_argument("--prot_families", required=False, type=str, choices=poss_values,
onereq.add_argument("--prot_families", required=False, type=filter_values,
help=f"Write representative amino acid sequences of gene families. {poss_values_log}")
onereq.add_argument("--gene_families", required=False, type=str, choices=poss_values,
onereq.add_argument("--gene_families", required=False, type=filter_values,
help=f"Write representative nucleotide sequences of gene families. {poss_values_log}")

optional = parser.add_argument_group(title="Optional arguments")
# could make choice to allow customization
optional.add_argument("--regions", required=False, type=str, choices=["all", "complete"],
Expand Down
4 changes: 2 additions & 2 deletions ppanggolin/geneFamily.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,12 +386,12 @@ def mk_bitarray(self, index: Dict[Organism, int], partition: str = 'all'):
def get_org_dict(self) -> Dict[Organism, Set[Gene]]:
"""Returns the organisms and the genes belonging to the gene family
:return: A dictionnary of organism as key and set of genes as values
:return: A dictionary of organism as key and set of genes as values
"""
if len(self._genePerOrg) == 0:
for gene in self.genes:
if gene.organism is None:
raise AttributeError(f"Gene: {gene.name} is not fill with organism")
raise AttributeError(f"Gene: {gene.name} is not fill with genome")
self._genePerOrg[gene.organism].add(gene)
return self._genePerOrg

Expand Down
Loading

0 comments on commit 9d0821e

Please sign in to comment.