Skip to content

Commit

Permalink
Merge pull request #290 from labgem/handle_partial_genes
Browse files Browse the repository at this point in the history
Handling of partial genes in PPanGGOLiN
  • Loading branch information
jpjarnoux authored Oct 18, 2024
2 parents 0c1b85e + 6940253 commit 561d81b
Show file tree
Hide file tree
Showing 9 changed files with 591 additions and 48,109 deletions.
1 change: 1 addition & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ jobs:
shell: bash -l {0}
run: |
cd testingDataset
cat myannopang/gene_families.tsv | cut -f1,2,4 > clusters.tsv
ppanggolin panrgp --anno genomes.gbff.list --cluster clusters.tsv --output readclusterpang --cpu $NUM_CPUS
ppanggolin annotate --anno genomes.gbff.list --output readclusters --cpu $NUM_CPUS
awk 'BEGIN{FS=OFS="\t"} {$1 = $1 OFS $1} 1' clusters.tsv > clusters_with_reprez.tsv;
Expand Down
384 changes: 285 additions & 99 deletions ppanggolin/annotate/annotate.py

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions ppanggolin/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -475,8 +475,11 @@ def read_clustering_file(families_tsv_path: Path) -> Tuple[pd.DataFrame, bool]:
families_df["representative"] = families_df["representative"].astype(str)

# Check for duplicate gene IDs
if families_df["gene"].duplicated().any():
raise Exception("It seems that there is duplicated gene id in your clustering.")
duplicates = families_df[families_df["gene"].duplicated()]["gene"].unique()

if len(duplicates) > 0:
raise ValueError(f"Duplicate gene IDs found in your clustering: {', '.join(duplicates)}")


return families_df[["family", "representative", "gene", "is_frag"]], families_df["is_frag"].any()

Expand Down
69 changes: 57 additions & 12 deletions ppanggolin/genome.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from ppanggolin.metadata import MetaFeatures
from ppanggolin.utils import get_consecutive_region_positions


class Feature(MetaFeatures):
"""This is a general class representation of Gene, RNA
Expand Down Expand Up @@ -74,9 +75,10 @@ def __len__(self) -> int:
"""

try:
return sum([(stop - start +1) for start, stop in self.coordinates ])
return sum([(stop - start + 1) for start, stop in self.coordinates])
except TypeError:
raise ValueError(f"Coordinates of gene {self} have not been defined. Getting its length is then impossible.")
raise ValueError(
f"Coordinates of gene {self} have not been defined. Getting its length is then impossible.")

@property
def has_joined_coordinates(self) -> bool:
Expand Down Expand Up @@ -141,7 +143,7 @@ def contig(self, contig: Contig):
self._contig = contig

def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str = "", name: str = "",
product: str = "", local_identifier: str = "", coordinates: List[Tuple[int]] = None):
product: str = "", local_identifier: str = "", coordinates: List[Tuple[int, int]] = None):
"""
Fill general annotation for child classes
Expand Down Expand Up @@ -173,21 +175,25 @@ def fill_annotations(self, start: int, stop: int, strand: str, gene_type: str =
if not isinstance(product, str):
raise TypeError(f"Product should be str. Got {type(product)} instead in {self} from {self.organism}.")
if not isinstance(local_identifier, str):
raise TypeError(f"Local identifier should be str. Got {type(local_identifier)} instead in {self} from {self.organism}.")
raise TypeError(
f"Local identifier should be str. Got {type(local_identifier)} instead in {self} from {self.organism}.")
if strand not in ["+", "-"]:
raise ValueError(f"Strand should be '+' or '-'. Got {strand} instead in {self} from {self.organism}.")
if not isinstance(coordinates, list):
raise TypeError(f"Coordinates should be of type list. Got {type(coordinates)} instead in {self} from {self.organism}.")
raise TypeError(
f"Coordinates should be of type list. Got {type(coordinates)} instead in {self} from {self.organism}.")

for start_i, stop_i in coordinates:
if not isinstance(start_i, int):
raise TypeError(f"Start should be int. Got {type(start_i)} instead in {self} from {self.organism}.")
if not isinstance(stop_i, int):
raise TypeError(f"Stop should be int. Got {type(stop_i)} instead in {self} from {self.organism}.")
if stop_i < start_i:
raise ValueError(f"Wrong coordinates: {coordinates}. Start ({start_i}) should not be greater than stop ({stop_i}) in {self} from {self.organism}.")
raise ValueError(
f"Wrong coordinates: {coordinates}. Start ({start_i}) should not be greater than stop ({stop_i}) in {self} from {self.organism}.")
if start_i < 1 or stop_i < 1:
raise ValueError(f"Wrong coordinates: {coordinates}. Start ({start_i}) and stop ({stop_i}) should be greater than 0 in {self} from {self.organism}.")
raise ValueError(
f"Wrong coordinates: {coordinates}. Start ({start_i}) and stop ({stop_i}) should be greater than 0 in {self} from {self.organism}.")

self.start = start
self.stop = stop
Expand Down Expand Up @@ -222,7 +228,8 @@ def add_sequence(self, sequence):
:raise AssertionError: Sequence must be a string
"""
assert isinstance(sequence, str), f"'str' type was expected for dna sequence but you provided a '{type(sequence)}' type object"
assert isinstance(sequence,
str), f"'str' type was expected for dna sequence but you provided a '{type(sequence)}' type object"

self.dna = sequence

Expand All @@ -249,6 +256,7 @@ def stop_relative_to(self, gene):
if gene.start > self.stop:
return self.stop + self.contig.length


class RNA(Feature):
"""Save RNA from genome as an Object with some information for Pangenome
Expand Down Expand Up @@ -286,6 +294,8 @@ def __init__(self, gene_id: str):
self._RGP = None
self.genetic_code = None
self.protein = None
self.is_partial = False # is the gene a partial gene ?
self._frame = None # One of '0', '1' or '2'. '0' indicates that the first base of the feature is the first base of a codon, '1' that the second base is the first base of a codon, and so on..

@property
def family(self):
Expand Down Expand Up @@ -349,11 +359,15 @@ def module(self):
"""
return self.family.module

def fill_annotations(self, position: int = None, genetic_code: int = 11, **kwargs):
def fill_annotations(self, position: int = None, genetic_code: int = 11, is_partial: bool = False, frame: int = 0,
**kwargs):
"""Fill Gene annotation provide by PPanGGOLiN dependencies
:param position: Gene localization in genome
:param genetic_code: Genetic code associated to gene
:param is_partial: is the gene a partial gene
:param frame: One of '0', '1' or '2'. '0' indicates that the first base of the feature is the first base of a codon,
'1' that the second base is the first base of a codon, and so on..
:param kwargs: look at Feature.fill_annotations methods
:raises TypeError: If position or genetic code value is not instance integers
Expand All @@ -363,8 +377,14 @@ def fill_annotations(self, position: int = None, genetic_code: int = 11, **kwarg
raise TypeError("position should be an integer")
if not isinstance(genetic_code, int):
raise TypeError("Genetic code should be an integer")

if not isinstance(is_partial, bool):
raise TypeError("partial code should be an boolean")

self.position = position
self.genetic_code = genetic_code
self.is_partial = is_partial
self.frame = frame

def add_protein(self, protein: str):
"""Add a protein sequence corresponding to translated gene
Expand All @@ -377,6 +397,29 @@ def add_protein(self, protein: str):
raise TypeError(f"'str' type was expected but you provided a '{type(protein)}' type object")
self.protein = protein

@property
def frame(self) -> int:
"""
Get the frame of the gene
"""
assert self._frame is not None, "frame is already set and should not be set another time."

return self._frame

@frame.setter
def frame(self, frame: int):
"""Set the length of the contig
:param contig_len: length of the contig
"""
assert self._frame is None, "frame is already set and should not be set another time."

if frame not in [0, 1, 2]:
raise ValueError("Frame should be equal to 0, 1 or 2.")

self._frame = frame


class Contig(MetaFeatures):
"""
Expand Down Expand Up @@ -743,7 +786,6 @@ def modules(self):
modules.add(module)
yield from modules


def get_ordered_consecutive_genes(self, genes: Iterable[Gene]) -> List[List[Gene]]:
"""
Order the given genes considering the circularity of the contig.
Expand All @@ -754,12 +796,15 @@ def get_ordered_consecutive_genes(self, genes: Iterable[Gene]) -> List[List[Gene
gene_positions = [gene.position for gene in genes]

# Determine consecutive region positions
consecutive_region_positions = get_consecutive_region_positions(region_positions=gene_positions, contig_gene_count=self.number_of_genes)
consecutive_region_positions = get_consecutive_region_positions(region_positions=gene_positions,
contig_gene_count=self.number_of_genes)

consecutive_genes_lists = [[self[position] for position in consecutive_positions] for consecutive_positions in consecutive_region_positions]
consecutive_genes_lists = [[self[position] for position in consecutive_positions] for consecutive_positions in
consecutive_region_positions]

return consecutive_genes_lists


class Organism(MetaFeatures):
"""
Describe the Genome content and some information
Expand Down
Loading

0 comments on commit 561d81b

Please sign in to comment.