Skip to content

Commit

Permalink
Added a duplicate check for log files; fixed error catching
Browse files Browse the repository at this point in the history
  • Loading branch information
lnalinaf committed Mar 28, 2021
1 parent a8f0db5 commit 4748195
Show file tree
Hide file tree
Showing 8 changed files with 69 additions and 28 deletions.
39 changes: 23 additions & 16 deletions pipeline/get_ortho_nucleotides.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,13 @@ def check_start_codon(seq, file_out_number, protein_id, start_codons="ATG"):
def check_accordance_with_protein_length(seq_length, protein_length, protein_id, file_out_number):
global BROKEN_ACCORDANCE
n = seq_length - 3 * protein_length
if n == 0:
logging.info(
"check_accordance_with_protein_length-OK for file {} protein_id {}".format(file_out_number, protein_id))
if n in range(-9, 10): # do softer condition to exclude very rough mistaken sequences
if n == 0:
logging.info(
"check_accordance_with_protein_length-OK for file {} protein_id {}".format(file_out_number, protein_id))
else:
logging.info("check_accordance_with_protein_length-NEAR OK: delta = seq_length - 3 * protein_length = {}\n"
"for file {} protein_id {}".format(n, file_out_number, protein_id))
try:
BROKEN_ACCORDANCE.get(file_out_number).remove(protein_id)
if not BROKEN_ACCORDANCE.get(file_out_number):
Expand Down Expand Up @@ -105,8 +109,7 @@ def check_multiple_of_three(seq, file_out_number, protein_id):


def check_common_accordance(check_multiple, check_start, check_stop, check_accordance):
if (check_start or not check_start) and (check_accordance or not check_accordance) and (
check_stop or not check_stop) and check_multiple:
if (check_start or not check_start) and (check_stop or not check_stop) and check_multiple and check_accordance:
logging.info("Check common accordance - OK: check_multiple - {}, check_start - {}, check_stop - {}, "
"check_accordance "
"- {}".format(check_multiple,
Expand Down Expand Up @@ -205,7 +208,7 @@ def check_translate(seq, protein_translation, initfna_filepath, feature, record_
logging.info("from check_translate:\n protein counted length={}\nprotein translation length from .gbff={}\n "
"length seq[:-3]=length"
"without right stop codon(check_stop=True)={}\n"
"length of man extracted seq={}"
"length of man extracted seq={}\n"
"protein counted:\n{}\n"
"protein translation:\n{}\n"
"nuc sequence, show stop codon:\n{}\n"
Expand All @@ -216,7 +219,7 @@ def check_translate(seq, protein_translation, initfna_filepath, feature, record_
else:
logging.info("from check_translate:\n protein counted length={}\nprotein translation length from .gbff={}\n "
"length seq=length with broken stop codon(check_stop=False)={}\n"
"length of man extracted seq={}"
"length of man extracted seq={}\n"
"protein counted:\n{}\n"
"protein translation:\n{}\n"
"nuc sequence, show broken stop codon\n{}"
Expand Down Expand Up @@ -339,6 +342,7 @@ def get_seq_from_gbff(gb_file, ortho_protein_ids):


def get_seq_record_from_cds(cds_from_genomic_file, protein_id, species_numerating):
seq_record, gene_name = "", ""
for record in SeqIO.parse(cds_from_genomic_file, "fasta"):
if protein_id in record.name:
if "gene=" in record.description:
Expand All @@ -355,14 +359,17 @@ def get_from_cds_and_write(cds_from_genomic_file, ortho_protein_ids, species_num
if protein_id == '*' or not protein_id:
continue
seq_record, gene_name = get_seq_record_from_cds(cds_from_genomic_file, protein_id, species_numerating)
seq_length = len(seq_record.seq)
file_out_number = str(idx[0] + 1)
if not seq_record: # no seq_record when this species is not in the group
if not ABSENT_IN_CDS.get(species_numerating):
ABSENT_IN_CDS[species_numerating] = list()
ABSENT_IN_CDS.get(species_numerating).append(protein_id)
logging.info("protein_id {} is absent in {}".format(protein_id, cds_from_genomic_file))
continue
else:
seq_length = len(seq_record.seq)
if not gene_name:
logging.warning("empty gene_name")
if anti_repeat_check(protein_id, seq_record, seq_store):
write_fasta_file(directory_out, file_out_number, seq_record, species_numerating)
log_file = os.path.join(directory_out, file_out_number + ".log")
Expand Down Expand Up @@ -411,7 +418,7 @@ def get_and_write_nucleotide_seq(gb_file, cds_from_genomic_file, ortho_protein_i
check_stop = check_stop_codon(nucleotide_seq, file_out_number, protein_id)
check_multiple = check_multiple_of_three(nucleotide_seq, file_out_number, protein_id)
if check_common_accordance(check_multiple, check_start, check_stop, check_accordance):
delete_from_broken(file_out_number, protein_id) # common: if check_multiple-ok, then OK
delete_from_broken(file_out_number, protein_id) # common: if check_multiple, check_accordance-ok, then OK
else: # extract from genome .fna
extracted_seq, check_trans_result = check_translate(nucleotide_seq, protein_translation,
genome_fna_path, feature, record_id,
Expand Down Expand Up @@ -485,7 +492,7 @@ def replace_broken_files(directory_out):
os.path.join(broken_multiple_folder, file_number + ".log"))


def main(orthodata_filepath, annotation_gbff, cds_from_genomic, initfna_filepath, species, group, directory_out):
def main(orthodata_filepath, annotation_gbff, cds_from_genomic, initfna_filepath, species, directory_out):
global NUMBER_OF_NEED_TO_BE_WRITTEN
global BROKEN_LIST
if not os.path.isdir(directory_out):
Expand All @@ -510,11 +517,6 @@ def main(orthodata_filepath, annotation_gbff, cds_from_genomic, initfna_filepath
get_and_write_nucleotide_seq(annotation_gbff_path, cds_from_genomic_path, ortho_protein_ids, directory_out,
species_numerating, initfna_filepath)

for file, written_species in PROCESSED_FILES.items():
if written_species < group:
if file not in BROKEN_SPECIES:
BROKEN_SPECIES.append(file)


if __name__ == '__main__':
parser = argparse.ArgumentParser()
Expand All @@ -533,9 +535,10 @@ def main(orthodata_filepath, annotation_gbff, cds_from_genomic, initfna_filepath
parser.add_argument('--group', help='Minimal size of species group', nargs='?')
parser.add_argument('--out', help='Path to the folder for result write out', nargs='?')
args = parser.parse_args()
group = int(args.group)

try:
main(args.ortho, args.gbff, args.cds, args.genome, int(args.species), int(args.group), args.out)
main(args.ortho, args.gbff, args.cds, args.genome, int(args.species), args.out)
written_files_number = len(PROCESSED_FILES)
delta = NUMBER_OF_NEED_TO_BE_WRITTEN - written_files_number
if delta == 0:
Expand All @@ -544,6 +547,10 @@ def main(orthodata_filepath, annotation_gbff, cds_from_genomic, initfna_filepath
logging.info("NUMBER_OF_NEED_TO_BE_WRITTEN = {}, WRITTEN_FILES = {}, where {} in BROKEN_SPECIES list: {}"
.format(NUMBER_OF_NEED_TO_BE_WRITTEN, written_files_number, len(BROKEN_SPECIES),
repr(BROKEN_SPECIES)))
for file, written_species in PROCESSED_FILES.items():
if written_species < group:
if file not in BROKEN_SPECIES:
BROKEN_SPECIES.append(file)
if BROKEN_SPECIES:
replace_broken_files(args.out)
residue = written_files_number - len(BROKEN_SPECIES)
Expand Down
34 changes: 34 additions & 0 deletions usages/check_duplicates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import argparse
import logging
import os
import re

"""
check duplicates in log files
"""
# All log output from this script is appended to this file in the current
# working directory (basicConfig attaches a FileHandler to the root logger).
LOG_FILE = "check_duplicates.log"
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO, filename=LOG_FILE)


def main(in_dir):
    """Scan every ``*.log`` file in *in_dir* and report duplicated species ids.

    A species entry is a line ending in ``- <digits>``; a file is reported
    as OK (INFO) when every captured number is unique, and flagged with a
    WARNING when the same number occurs on more than one line.

    :param in_dir: path to the folder containing ``.log`` files to analyze
    """
    # Hoist the pattern out of the per-line loop; the original also called
    # re.search() twice per matching line (once to test, once to extract).
    pattern = re.compile(r'-\s(\d+)$')
    for entry in os.scandir(in_dir):
        # endswith is stricter than split('.')[-1]: a file named exactly
        # "log" (no extension) is no longer misclassified as a log file.
        if not entry.name.endswith('.log'):
            continue
        species_names = []
        # entry.path is already os.path.join(in_dir, entry.name)
        with open(entry.path, 'r') as f:
            for line in f:
                match = pattern.search(line)
                if match:
                    species_names.append(match.group(1))
        if len(species_names) == len(set(species_names)):
            logging.info("infile {} - OK".format(entry.name))
        else:
            logging.warning("duplicates in file {}".format(entry.name))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--i', help='Path to the folder with .log files to analyze', nargs='?')
    args = parser.parse_args()
    try:
        # BUG FIX: the option is registered as --i, so it is stored as
        # args.i; the original called main(args.infolder), which raised
        # AttributeError before main() ever ran.
        main(args.i)
    except Exception as err:
        # Exception (not BaseException) so Ctrl-C / SystemExit still work;
        # logging.exception records the full traceback, not just the message.
        logging.exception("Unexpected error: {}".format(err))
4 changes: 2 additions & 2 deletions usages/clean_folder_from_odd.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ def main(broken_folder, cleaning_folder):
args = parser.parse_args()
try:
main(args.broken, args.clean)
except:
logging.exception("Unexpected error")
except BaseException as err:
logging.info("Unexpected error: {}".format(err))
logging.info("BROKEN_FASTA list of length {}:\n{}".format(len(BROKEN_FASTA), BROKEN_FASTA))
logging.info("REMOVED_FASTA list of length {}:\n{}".format(len(REMOVED_FASTA), REMOVED_FASTA))
logging.info("REMOVED_LOG list of length {}:\n{}".format(len(REMOVED_LOG), REMOVED_LOG))
Expand Down
4 changes: 2 additions & 2 deletions usages/correct_broken_ortho_nuc.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,8 +470,8 @@ def main(orthodata_filepath, annotation_gbff, annotation_csv, initfna_filepath,
logging.info("removed broken species, broken multiple files into folders 'broken_species_files'"
"'broken_multiple_files' in cwd,"
"please check out folder for .fna files number: {}".format(residue))
except:
logging.exception("Unexpected error")
except BaseException as err:
logging.info("Unexpected error: {}".format(err))

logging.warning("BROKEN_SPECIES {} : {}".format(len(BROKEN_SPECIES), BROKEN_SPECIES))
logging.warning("BROKEN_STOP_CODON {} : {}".format(len(BROKEN_STOP_CODON), BROKEN_STOP_CODON))
Expand Down
4 changes: 2 additions & 2 deletions usages/count_correct_rst_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,5 +53,5 @@ def main(folder_in):
args = parser.parse_args()
try:
main(args.infolder)
except:
logging.exception("Unexpected error")
except BaseException as e:
logging.info("Unexpected error: {}".format(e))
4 changes: 2 additions & 2 deletions usages/fasta2paml_one group.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,6 @@ def main(folder_in, folder_out, species, group):
logging.warning("NOT_NEEDED_SPECIES {}:{}".format(len(NOT_NEEDED_SPECIES), NOT_NEEDED_SPECIES))
logging.warning("NOT_MULTIPLE_OF_THREE {}:{}".format(len(NOT_MULTIPLE_OF_THREE), NOT_MULTIPLE_OF_THREE))
logging.warning("EDITED_MULT_OF_THREE {}:{}".format(len(EDITED_MULT_OF_THREE), EDITED_MULT_OF_THREE))
except:
logging.exception("Unexpected error")
except BaseException as err:
logging.info("Unexpected error: {}".format(err))
logging.info("The work has been completed")
4 changes: 2 additions & 2 deletions usages/remove_broken.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ def main(outfolder):
args = parser.parse_args()
try:
main(args.outfolder)
except:
logging.exception("Unexpected error")
except BaseException as err:
logging.info("Unexpected error: {}".format(err))
logging.info("NOT_FOUND_FILES list of length {}:\n{}".format(len(NOT_FOUND_FILES), NOT_FOUND_FILES))


4 changes: 2 additions & 2 deletions usages/rename_seq.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def rename_seq(infile, out_folder_fas, out_folder_log):
logger.setLevel(logging.INFO)
pool = multiprocessing.Pool(threads)
pool.starmap(rename_seq, zip(inputs, len(inputs) * [args.outfolder_fas], len(inputs) * [args.outfolder_txt]))
except:
logging.exception("Unexpected error")
except BaseException as err:
logging.info("Unexpected error: {}".format(err))

logging.info("The work has been completed")

0 comments on commit 4748195

Please sign in to comment.