Skip to content

Commit

Permalink
Merge pull request #187 from labgem/dev
Browse files Browse the repository at this point in the history
Patches on the 2.0.3
  • Loading branch information
jpjarnoux authored Mar 7, 2024
2 parents 9d0821e + 58993b3 commit ed7bbfe
Show file tree
Hide file tree
Showing 6 changed files with 30 additions and 24 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.0.3
2.0.4
28 changes: 17 additions & 11 deletions ppanggolin/annotate/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li
line = lines.pop()

if contig.length != len(sequence):
raise ValueError("The contig lenght defined is different than the sequence length")
raise ValueError("The contig length defined is different than the sequence length")
# get each gene's sequence.
for gene in contig.genes:
gene.add_sequence(get_dna_sequence(sequence, gene))
Expand All @@ -253,7 +253,7 @@ def read_org_gff(organism: str, gff_file_path: Path, circular_contigs: List[str]
:param organism: Organism name
:param gff_file_path: Path corresponding to GFF file
:param circular_contigs: List of circular contigs
:param pseudo: Allow to read pseudogène
:param pseudo: Allow reading pseudogenes
:return: Organism object and if there are sequences associated or not
"""
Expand Down Expand Up @@ -293,7 +293,7 @@ def get_id_attribute(attributes_dict: dict) -> str:
element_id = attributes_dict.get("ID")
if not element_id:
raise Exception(f"Each CDS type of the gff files must own a unique ID attribute. "
f"Not the case for file: {gff_file_path}")
f"Not the case for file: {gff_file_path} with ID {element_id}")
return element_id

contig = None # initialize contig
Expand Down Expand Up @@ -419,7 +419,7 @@ def read_anno_file(organism_name: str, filename: Path, circular_contigs: list,
:param organism_name: Name of the organism
:param filename: Path to the corresponding file
:param circular_contigs: list of sequence in contig
:param pseudo: allow to read pseudogène
:param pseudo: allow reading pseudogenes
:return: Annotated organism for pangenome and true for sequence in file
"""
Expand All @@ -428,16 +428,22 @@ def read_anno_file(organism_name: str, filename: Path, circular_contigs: list,
if filetype == "gff":
try:
return read_org_gff(organism_name, filename, circular_contigs, pseudo)
except Exception:
raise Exception(f"Reading the gff3 file '{filename}' raised an error.")
except Exception as err:
raise Exception(f"Reading the gff3 file '{filename}' raised an error. {err}")
elif filetype == "gbff":
try:
return read_org_gbff(organism_name, filename, circular_contigs, pseudo)
except Exception:
raise Exception(f"Reading the gbff file '{filename}' raised an error.")
else: # Fasta type obligatory because unknown raise an error in detect_filetype function
raise Exception("Wrong file type provided. This looks like a fasta file. "
"You may be able to use --fasta instead.")
except Exception as err:
raise Exception(f"Reading the gbff file '{filename}' raised an error. {err}")

elif filetype == "fasta":
raise ValueError(f"Invalid file type provided for parameter '--anno'. The file '{filename}' looks like a fasta file. "
"Please use a .gff or .gbff file. You may be able to use --fasta instead of --anno.")

else:
raise ValueError(f"Invalid file type provided for parameter '--anno'. The file '{filename}' appears to be of type '{filetype}'. "
"Please use .gff or .gbff files.")



def chose_gene_identifiers(pangenome: Pangenome) -> bool:
Expand Down
6 changes: 3 additions & 3 deletions ppanggolin/annotate/synta.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str,
:param org: Organism corresponding to fasta file
:param fna_file: Input fasta file with sequences or list of each line as sequence
:return: Dictionnary with contig_name as keys and contig sequence in values
:return: Dictionary with contig_name as keys and contig sequence in values
"""
global contig_counter
try:
Expand Down Expand Up @@ -199,8 +199,8 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str,
raise AttributeError(f"{e}\nAn error was raised when reading file: '{fna_file.name}'. "
f"One possibility for this error is that the file did not start with a '>' "
f"as it would be expected from a fna file.")
except Exception: # To manage other exception which can occur
raise Exception("Unexpected error. Please check your input file and if everything looks fine, "
except Exception as err: # To manage other exception which can occur
raise Exception(f"{err}: Please check your input file and if everything looks fine, "
"please post an issue on our github")
return contigs

Expand Down
7 changes: 4 additions & 3 deletions ppanggolin/formats/writeBinaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,7 @@ def part_spec(part: str) -> list:


mod_fam = [len(module) for module in pangenome.modules]
sum_mod_fam = sum(mod_fam)

info_group._v_attrs.StatOfFamiliesInModules = {"min": getmin(mod_fam),
"max": getmax(mod_fam),
Expand All @@ -536,19 +537,19 @@ def part_spec(part: str) -> list:
spec_shell = part_spec(part='shell')
spec_cloud = part_spec(part='cloud')

info_group._v_attrs.PersistentSpecInModules = {"percent": round((sum(spec_pers) / sum(mod_fam)) * 100, 2),
info_group._v_attrs.PersistentSpecInModules = {"percent": round((sum(spec_pers) / sum_mod_fam) * 100, 2) if sum_mod_fam > 0 else 0,
"min": getmin(spec_pers),
"max": getmax(spec_pers),
"sd": getstdev(spec_pers),
"mean": getmean(spec_pers)}

info_group._v_attrs.ShellSpecInModules = {"percent": round((sum(spec_shell) / sum(mod_fam)) * 100, 2),
info_group._v_attrs.ShellSpecInModules = {"percent": round((sum(spec_shell) / sum_mod_fam) * 100, 2) if sum_mod_fam > 0 else 0,
"min": getmin(spec_shell),
"max": getmax(spec_shell),
"sd": getstdev(spec_shell),
"mean": getmean(spec_shell)}

info_group._v_attrs.CloudSpecInModules = {"percent": round((sum(spec_cloud) / sum(mod_fam)) * 100, 2),
info_group._v_attrs.CloudSpecInModules = {"percent": round((sum(spec_cloud) / sum_mod_fam) * 100, 2) if sum_mod_fam > 0 else 0,
"min": getmin(spec_cloud),
"max": getmax(spec_cloud),
"sd": getstdev(spec_cloud),
Expand Down
2 changes: 0 additions & 2 deletions ppanggolin/meta/meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,6 @@ def check_metadata_format(metadata: Path, metatype: str) -> pd.DataFrame:
if not colname_check.match(column):
raise ValueError(f"column name is not a valid identifier: {column}; "
f"it does not match the pattern {colname_check.pattern}")
if column != metatype and metadata_df.dtypes[column] == object:
pd.to_numeric(metadata_df[column], downcast='integer', errors='ignore')

return metadata_df

Expand Down
9 changes: 5 additions & 4 deletions ppanggolin/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import tempfile
import time
from itertools import zip_longest
import re

import networkx as nx
from importlib.metadata import distribution
Expand Down Expand Up @@ -307,7 +308,7 @@ def mk_file_name(basename: str, output: Path, force: bool = False) -> Path:

def detect_filetype(filename: Path) -> str:
"""
Detects whether the current file is gff3, gbk/gbff, fasta or unknown.
Detects whether the current file is gff3, gbk/gbff, fasta, tsv or unknown.
If unknown, it will raise an error
:param filename: path to file
Expand All @@ -318,16 +319,16 @@ def detect_filetype(filename: Path) -> str:
first_line = f.readline()
if first_line.startswith("LOCUS "): # then this is probably a gbff/gbk file
return "gbff"
elif first_line.startswith("##gff-version 3") or first_line.startswith("##gff-version 3"): # prodigal gff header has two spaces betwene gff-version and 3...
elif re.match(r"##gff-version\s{1,3}3", first_line): # prodigal gff headers have two spaces between gff-version and 3; some user-provided gff files use a tab instead
return 'gff'
elif first_line.startswith(">"):
return 'fasta'
elif "\t" in first_line:
return "tsv"
else:
raise Exception(f"Filetype {filename} was not gff3 (file starts with '##gff-version 3') "
"nor gbff/gbk (file starts with 'LOCUS '). "
"Only those two file formats are supported (for now).")
"nor gbff/gbk (file starts with 'LOCUS ') "
"nor fasta (file starts with '>') nor tsv (file has '\t' in the first line). ")


def restricted_float(x: Union[int, float]) -> float:
Expand Down

0 comments on commit ed7bbfe

Please sign in to comment.