diff --git a/data/bad_seq.txt b/data/bad_seq.txt new file mode 100644 index 0000000..5fba2e1 --- /dev/null +++ b/data/bad_seq.txt @@ -0,0 +1,4 @@ +>P_1|1|training +CGCCUCCCACGCGGGAGACCCGGGUUCAAUUCCCGGCCAAU +>P_21|training +CCGGGUUCAAUUCCCGGCCACUGCACGUGGUUGUUUUUCAC diff --git a/util/FileProcessing.py b/util/FileProcessing.py index 5926a22..bc596c2 100644 --- a/util/FileProcessing.py +++ b/util/FileProcessing.py @@ -34,7 +34,7 @@ def __init__(self, file): else: - self.error_msg = 'File format error.' + pass def read_fasta(self, file): """ @@ -55,6 +55,8 @@ def read_fasta(self, file): header, sequence = array[0].split()[0], re.sub('[^ACDEFGHIKLMNPQRSTUVWY-]', '-', ''.join(array[1:]).upper()) header_array = header.split('|') name = header_array[0] + if len(header_array) != 3: + return [], None, f"fasta file parsing failed at \"{header}\"" label = header_array[1] if len(header_array) >= 2 else '0' label_train = header_array[2] if len(header_array) >= 3 else 'training' fasta_sequences.append([name, sequence, label, label_train])