Merge pull request #187 from labgem/dev

Patches on the 2.0.3
labgem · Mar 7, 2024 · ed7bbfe · ed7bbfe
2 parents 9d0821e + 58993b3
commit ed7bbfe
Show file tree

Hide file tree

Showing 6 changed files with 30 additions and 24 deletions.
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.0.3
+2.0.4
diff --git a/ppanggolin/annotate/annotate.py b/ppanggolin/annotate/annotate.py
@@ -238,7 +238,7 @@ def read_org_gbff(organism_name: str, gbff_file_path: Path, circular_contigs: Li
             line = lines.pop()
 
         if contig.length != len(sequence):
-            raise ValueError("The contig lenght defined is different than the sequence length")
+            raise ValueError("The contig length defined is different than the sequence length")
         # get each gene's sequence.
         for gene in contig.genes:
             gene.add_sequence(get_dna_sequence(sequence, gene))
@@ -253,7 +253,7 @@ def read_org_gff(organism: str, gff_file_path: Path, circular_contigs: List[str]
     :param organism: Organism name
     :param gff_file_path: Path corresponding to GFF file
     :param circular_contigs: List of circular contigs
-    :param pseudo: Allow to read pseudogène
+    :param pseudo: Allow to read pseudogene
 
     :return: Organism object and if there are sequences associated or not
     """
@@ -293,7 +293,7 @@ def get_id_attribute(attributes_dict: dict) -> str:
         element_id = attributes_dict.get("ID")
         if not element_id:
             raise Exception(f"Each CDS type of the gff files must own a unique ID attribute. "
-                            f"Not the case for file: {gff_file_path}")
+                            f"Not the case for file: {gff_file_path} with ID {element_id}")
         return element_id
 
     contig = None  # initialize contig
@@ -419,7 +419,7 @@ def read_anno_file(organism_name: str, filename: Path, circular_contigs: list,
     :param organism_name: Name of the organism
     :param filename: Path to the corresponding file
     :param circular_contigs: list of sequence in contig
-    :param pseudo: allow to read pseudogène
+    :param pseudo: allow to read pseudogene
 
     :return: Annotated organism for pangenome and true for sequence in file
     """
@@ -428,16 +428,22 @@ def read_anno_file(organism_name: str, filename: Path, circular_contigs: list,
     if filetype == "gff":
         try:
             return read_org_gff(organism_name, filename, circular_contigs, pseudo)
-        except Exception:
-            raise Exception(f"Reading the gff3 file '{filename}' raised an error.")
+        except Exception as err:
+            raise Exception(f"Reading the gff3 file '{filename}' raised an error. {err}")
     elif filetype == "gbff":
         try:
             return read_org_gbff(organism_name, filename, circular_contigs, pseudo)
-        except Exception:
-            raise Exception(f"Reading the gbff file '{filename}' raised an error.")
-    else:  # Fasta type obligatory because unknown raise an error in detect_filetype function
-        raise Exception("Wrong file type provided. This looks like a fasta file. "
-                        "You may be able to use --fasta instead.")
+        except Exception as err:
+            raise Exception(f"Reading the gbff file '{filename}' raised an error. {err}")
+
+    elif filetype == "fasta":
+        raise ValueError(f"Invalid file type provided for parameter '--anno'. The file '{filename}' looks like a fasta file. "
+                        "Please use a .gff or .gbff file. You may be able to use --fasta instead of --anno.")
+
+    else:
+        raise ValueError(f"Invalid file type provided for parameter '--anno'. The file '{filename}' appears to be of type '{filetype}'. "
+                        "Please use .gff or .gbff files.")
+
 
 
 def chose_gene_identifiers(pangenome: Pangenome) -> bool:

diff --git a/ppanggolin/annotate/synta.py b/ppanggolin/annotate/synta.py
@@ -169,7 +169,7 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str,
     :param org: Organism corresponding to fasta file
     :param fna_file: Input fasta file with sequences or list of each line as sequence
 
-    :return: Dictionnary with contig_name as keys and contig sequence in values
+    :return: Dictionary with contig_name as keys and contig sequence in values
     """
     global contig_counter
     try:
@@ -199,8 +199,8 @@ def read_fasta(org: Organism, fna_file: Union[TextIOWrapper, list]) -> Dict[str,
         raise AttributeError(f"{e}\nAn error was raised when reading file: '{fna_file.name}'. "
                              f"One possibility for this error is that the file did not start with a '>' "
                              f"as it would be expected from a fna file.")
-    except Exception:  # To manage other exception which can occur
-        raise Exception("Unexpected error. Please check your input file and if everything looks fine, "
+    except Exception as err:  # Handle any other exception that may occur
+        raise Exception(f"{err}: Please check your input file and if everything looks fine, "
                         "please post an issue on our github")
     return contigs
 

diff --git a/ppanggolin/formats/writeBinaries.py b/ppanggolin/formats/writeBinaries.py
@@ -526,6 +526,7 @@ def part_spec(part: str) -> list:
 
 
     mod_fam = [len(module) for module in pangenome.modules]
+    sum_mod_fam = sum(mod_fam)
 
     info_group._v_attrs.StatOfFamiliesInModules = {"min": getmin(mod_fam),
                                                     "max": getmax(mod_fam),
@@ -536,19 +537,19 @@ def part_spec(part: str) -> list:
     spec_shell = part_spec(part='shell')
     spec_cloud = part_spec(part='cloud')
 
-    info_group._v_attrs.PersistentSpecInModules = {"percent": round((sum(spec_pers) / sum(mod_fam)) * 100, 2),
+    info_group._v_attrs.PersistentSpecInModules = {"percent": round((sum(spec_pers) / sum_mod_fam) * 100, 2) if sum_mod_fam > 0 else 0,
                                                     "min": getmin(spec_pers),
                                                     "max": getmax(spec_pers),
                                                     "sd": getstdev(spec_pers),
                                                     "mean": getmean(spec_pers)}
 
-    info_group._v_attrs.ShellSpecInModules = {"percent": round((sum(spec_shell) / sum(mod_fam)) * 100, 2),
+    info_group._v_attrs.ShellSpecInModules = {"percent": round((sum(spec_shell) / sum_mod_fam) * 100, 2) if sum_mod_fam > 0 else 0,
                                                 "min": getmin(spec_shell),
                                                 "max": getmax(spec_shell),
                                                 "sd": getstdev(spec_shell),
                                                 "mean": getmean(spec_shell)}
 
-    info_group._v_attrs.CloudSpecInModules = {"percent": round((sum(spec_cloud) / sum(mod_fam)) * 100, 2),
+    info_group._v_attrs.CloudSpecInModules = {"percent": round((sum(spec_cloud) / sum_mod_fam) * 100, 2) if sum_mod_fam > 0 else 0,
                                                 "min": getmin(spec_cloud),
                                                 "max": getmax(spec_cloud),
                                                 "sd": getstdev(spec_cloud),

diff --git a/ppanggolin/meta/meta.py b/ppanggolin/meta/meta.py
@@ -76,8 +76,6 @@ def check_metadata_format(metadata: Path, metatype: str) -> pd.DataFrame:
         if not colname_check.match(column):
             raise ValueError(f"column name is not a valid identifier: {column}; "
                              f"it does not match the pattern {colname_check.pattern}")
-        if column != metatype and metadata_df.dtypes[column] == object:
-            pd.to_numeric(metadata_df[column], downcast='integer', errors='ignore')
 
     return metadata_df
 

diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py
@@ -14,6 +14,7 @@
 import tempfile
 import time
 from itertools import zip_longest
+import re
 
 import networkx as nx
 from importlib.metadata import distribution
@@ -307,7 +308,7 @@ def mk_file_name(basename: str, output: Path, force: bool = False) -> Path:
 
 def detect_filetype(filename: Path) -> str:
     """
-    Detects whether the current file is gff3, gbk/gbff, fasta or unknown.
+    Detects whether the current file is gff3, gbk/gbff, fasta, tsv or unknown.
     If unknown, it will raise an error
 
     :param filename: path to file
@@ -318,16 +319,16 @@ def detect_filetype(filename: Path) -> str:
         first_line = f.readline()
     if first_line.startswith("LOCUS       "):  # then this is probably a gbff/gbk file
         return "gbff"
-    elif first_line.startswith("##gff-version 3") or first_line.startswith("##gff-version  3"): # prodigal gff header has two spaces betwene gff-version and 3... 
+    elif re.match(r"##gff-version\s{1,3}3", first_line):  # prodigal gff headers have two spaces between gff-version and 3; some user-provided gff files use a tab instead
         return 'gff'
     elif first_line.startswith(">"):
         return 'fasta'
     elif "\t" in first_line:
         return "tsv"
     else:
         raise Exception(f"Filetype {filename} was not gff3 (file starts with '##gff-version 3') "
-                        "nor gbff/gbk (file starts with 'LOCUS       '). "
-                        "Only those two file formats are supported (for now).")
+                        "nor gbff/gbk (file starts with 'LOCUS       ') "
+                        "nor fasta (file starts with '>') nor tsv (file has '\t' in the first line). ")
 
 
 def restricted_float(x: Union[int, float]) -> float: