Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Filter non ASCII character #291

Merged
merged 2 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion ppanggolin/annotate/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
init_contig_counter, contig_counter)
from ppanggolin.pangenome import Pangenome
from ppanggolin.genome import Organism, Gene, RNA, Contig
from ppanggolin.utils import read_compressed_or_not, mk_file_name, detect_filetype, check_input_files
from ppanggolin.utils import read_compressed_or_not, mk_file_name, detect_filetype, check_input_files, has_non_ascii, replace_non_ascii
from ppanggolin.formats import write_pangenome
from ppanggolin.metadata import Metadata

Expand Down Expand Up @@ -53,6 +53,8 @@ def check_annotate_args(args: argparse.Namespace):
check_input_files(args.anno, True)




def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: int, gene_id: str, dbxrefs: Set[str],
coordinates: List[Tuple[int, int]], strand: str, gene_type: str, position: int = None,
gene_name: str = "", product: str = "", genetic_code: int = 11, protein_id: str = "") -> Gene:
Expand All @@ -74,6 +76,15 @@ def create_gene(org: Organism, contig: Contig, gene_counter: int, rna_counter: i
:param genetic_code: Genetic code used
:param protein_id: Protein identifier
"""
# check for non ascii character in product field
if has_non_ascii(product):

logging.getLogger("PPanGGOLiN").warning(
f"In genome '{org.name}', the 'product' field of gene '{gene_id}' contains non-ASCII characters: '{product}'. "
"These characters cannot be stored in the HDF5 file and will be replaced by underscores."
)
product = replace_non_ascii(product)


start, stop = coordinates[0][0], coordinates[-1][1]

Expand Down Expand Up @@ -889,6 +900,15 @@ def check_chevrons_in_start_and_stop(start: str, stop: str) -> Tuple[int, int, b
is_partial = False

product = attributes.pop('PRODUCT', "")

if has_non_ascii(product):

logging.getLogger("PPanGGOLiN").warning(
f"In genome '{organism}', the 'product' field of gene '{gene_id}' contains non-ASCII characters: '{product}'. "
"These characters cannot be stored in the HDF5 file and will be replaced by underscores."
)
product = replace_non_ascii(product)


if contig is None or contig.name != fields_gff[gff_seqname]:
# get the current contig
Expand Down
25 changes: 25 additions & 0 deletions ppanggolin/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1254,3 +1254,28 @@ def run_subprocess(cmd: List[str], output: Path = None, msg: str = "Subprocess f
if output is not None:
with open(output, 'w') as fout:
fout.write(result.stdout)



def has_non_ascii(string_to_test: str) -> bool:
    """
    Check if a string contains any non-ASCII characters.

    An empty string contains no characters at all, so it yields False.

    :param string_to_test: The string to check for non-ASCII characters.
    :return: True if the string contains non-ASCII characters, False otherwise.
    """
    # str.isascii() answers the question directly, without building a throwaway
    # bytes object or relying on an exception for control flow.
    return not string_to_test.isascii()

def replace_non_ascii(string_with_ascii: str, replacement_string: str = "_") -> str:
    """
    Replace non-ASCII characters in a string with a specified replacement string.

    Each run of consecutive non-ASCII characters is replaced by a single copy of
    the replacement string (e.g. two adjacent non-ASCII characters become one '_').
    The replacement string is inserted literally: backslashes in it are not
    interpreted as regular-expression escapes or group references.

    :param string_with_ascii: The string potentially containing non-ASCII characters.
    :param replacement_string: The string to replace non-ASCII characters with (default is '_').
    :return: A new string where all non-ASCII characters have been replaced.
    """
    # A callable replacement makes re.sub() insert the text verbatim. Passing the
    # string itself would make re.sub() process backslash escapes, so a replacement
    # such as "\\" would raise re.error instead of being inserted.
    return re.sub(r'[^\x00-\x7F]+', lambda _match: replacement_string, string_with_ascii)
27 changes: 25 additions & 2 deletions tests/utils/test_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,7 @@
import zipfile
from typing import Generator

from ppanggolin.utils import is_compressed, read_compressed_or_not, write_compressed_or_not

from ppanggolin.utils import is_compressed, read_compressed_or_not, write_compressed_or_not, has_non_ascii, replace_non_ascii

class TestCompressed:
"""
Expand Down Expand Up @@ -157,3 +156,27 @@ def test_write_uncompressed(self, plain_file_path: Path) -> None:
f.write("Test data")
with open(plain_file_path, 'r') as f:
assert f.read() == "Test data"


# Test cases for has_non_ascii
@pytest.mark.parametrize(
    "input_string, expected",
    [
        # Pure ASCII inputs must not be flagged.
        ("Escherichia_coli", False),
        # A single accented letter is enough to flag the string.
        ("Escherichia_colí", True),
        # Another plain ASCII input.
        ("simple_string", False),
        # A leading Greek letter is non-ASCII.
        ("Ωmega", True),
        # The empty string has no characters, hence nothing non-ASCII.
        ("", False),
    ],
)
def test_has_non_ascii(input_string, expected):
    """Check that has_non_ascii reports the expected flag for each sample string."""
    detected = has_non_ascii(input_string)
    assert detected == expected

# Test cases for replace_non_ascii
@pytest.mark.parametrize(
    "input_string, replacement, expected",
    [
        # Nothing to replace when the input is already ASCII-only.
        ("Escherichia_coli", "_", "Escherichia_coli"),
        # Trailing 'í' becomes the replacement '_'.
        ("Escherichia_colí", "_", "Escherichia_col_"),
        # Leading 'Ω' becomes the replacement '-'.
        ("Ωmega", "-", "-mega"),
        # 'Ω' in the middle of the string becomes the replacement 'X'.
        ("Escherichia_Ωcoli", "X", "Escherichia_Xcoli"),
        # The empty string is returned unchanged.
        ("", "_", ""),
    ],
)
def test_replace_non_ascii(input_string, replacement, expected):
    """Check that replace_non_ascii produces the expected output for each sample."""
    cleaned = replace_non_ascii(input_string, replacement)
    assert cleaned == expected