auto selection of gblocks params; sort by folders-groups in beginning
lnalinaf committed May 26, 2021
1 parent 26bd9e1 commit 7bd2857
Showing 6 changed files with 66 additions and 50 deletions.
44 changes: 30 additions & 14 deletions pipeline/gblocks_alignment.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import math
import multiprocessing
import os
import logging
@@ -18,19 +19,32 @@ def init_counter(args):
counter_file = args


def parse_dir(in_dir):
for infile in os.listdir(in_dir):
if infile.split('.')[-1] == 'fas':
yield os.path.join(in_dir, infile)
def parse_dir(input_dir):
for species_folder in os.scandir(input_dir):
if os.path.isdir(species_folder):
for infile in os.scandir(species_folder):
if infile.name.split('.')[-1] == 'fas':  # os.scandir yields DirEntry objects, so test .name
yield input_dir, species_folder.name, infile.name  # yield names, not DirEntry objects


def launch_gblocks(infile, exec_path, child_logger):
def launch_gblocks(input_tuple, auto_flag, exec_path, child_logger):
global counter_file
file_number = re.search(r'\/(\d+)\.', infile).group(1)
launch = '{} {} -t=c -b1=3 -b2=4 -b3=8 -b4=10 -b5=n ' \
'-p=Yes >> {}'.format(exec_path, infile, LOG_FILE) # TODO: > LOG_FILE: to do multiprocessing
input_dir, species_folder, infile = input_tuple
infile_path = os.path.join(input_dir, species_folder, infile)
if auto_flag == 'n':
params_string = '-t=c -b1=3 -b2=4 -b3=8 -b4=10 -b5=n -p=y'
else:
if len(species_folder) <= 9:
number_of_species = len(species_folder)
else:
number_of_species = (len(species_folder) - 9) / 2 + 9
b1 = math.ceil(number_of_species / 2 + 1)
b2 = math.ceil(number_of_species * 0.85)
params_string = '-t=c -b1={} -b2={} -b3=8 -b4=9 -b5=n -p=y'.format(b1, b2)

launch = '{} {} {} >> {}'.format(exec_path, infile_path, params_string, LOG_FILE) # TODO: > LOG_FILE: to do multiprocessing
os.system(launch)
child_logger.info("Gblocks processed file {} with params {}".format(file_number, '-t=c -b1=3 -b2=4 -b3=7 -b4=6 -b5=h'))
child_logger.info("Gblocks processed file {} with params {}".format(infile, params_string))
with counter_file.get_lock():
counter_file.value += 1
child_logger.info("Counter (processed files) = {}".format(counter_file.value))
@@ -41,6 +55,8 @@ def launch_gblocks(infile, exec_path, child_logger):
parser.add_argument('--i', help='Path to the folder with input files for Gblocks. '
'FASTA format is accepted', nargs='?')
parser.add_argument('--exec', help='Path to the Gblocks executable', nargs='?')
parser.add_argument('--auto', help='\'y\' or \'n\': automatic selection of basic parameters according to group size',
nargs='?')
parser.add_argument('--threads', help='Number of threads', nargs='?')
args = parser.parse_args()
threads = int(args.threads)
@@ -53,16 +69,16 @@ def launch_gblocks(infile, exec_path, child_logger):
pool = multiprocessing.Pool(processes=threads, initializer=init_counter, initargs=(counter_file,))
in_dir = args.i
executable_path = args.exec
inputs = list(parse_dir(in_dir))
len_inputs = len(inputs)
input_tuples = list(parse_dir(in_dir))
len_inputs = len(input_tuples)
logger.info("Path to the folder with input files for Gblocks: {}\nExecutable path: {}".
format(in_dir, executable_path))
i = pool.starmap_async(launch_gblocks, zip(inputs, len_inputs * [executable_path], len_inputs * [logger]))
format(in_dir, executable_path))
i = pool.starmap_async(launch_gblocks, zip(input_tuples, len_inputs * [args.auto], len_inputs * [executable_path], len_inputs * [logger]))  # launch_gblocks expects (input_tuple, auto_flag, exec_path, child_logger)
i.wait()
i.get()
except BaseException as e:
logger.exception("Unexpected error: {}".format(e))
logger.info("Number of processed files = {}".format(counter_file.value))

raise e
logger.info("Number of processed files = {}".format(counter_file.value))
logger.info("The work has been completed")
38 changes: 22 additions & 16 deletions pipeline/get_ortho_nucleotides.py
@@ -280,15 +280,16 @@ def anti_repeat_check_with_comparing(anti_repeat_store, protein_id, nucleotide_s
return nucleotide_seq


def write_fasta_file(directory_out, file_out_number, seq, species_numerating):
def write_fasta_file(directory_out, file_out_number, seq_record, species_numerating):
global PROCESSED_FILES
with open(os.path.join(directory_out, file_out_number + ".fna"), "a") as ofile:
SeqIO.write(seq, ofile, "fasta")
SeqIO.write(seq_record, ofile, "fasta")
logging.info("Sequence corresponding to the species number {} has been added to the file {}".format(
species_numerating, file_out_number))
if not PROCESSED_FILES.get(file_out_number):
PROCESSED_FILES[file_out_number] = 0
PROCESSED_FILES[file_out_number] += 1
PROCESSED_FILES[file_out_number] = ["", 0]
PROCESSED_FILES[file_out_number][0] += seq_record.id
PROCESSED_FILES[file_out_number][1] += 1


def write_log_file(log_file, gene, protein_id, seq_length,
@@ -478,25 +479,17 @@ def conv_string(val):

def replace_broken_files(directory_out):
broken_species_folder = "broken_species_files"
broken_multiple_folder = "broken_multiple_files"
os.makedirs(broken_species_folder)
for file_number in BROKEN_SPECIES:
os.replace(os.path.join(directory_out, file_number + ".fna"),
os.path.join(broken_species_folder, file_number + ".fna"))
os.replace(os.path.join(directory_out, file_number + ".log"),
os.path.join(broken_species_folder, file_number + ".log"))
for file_number in BROKEN_MULTIPLE_THREE:
os.replace(os.path.join(directory_out, file_number + ".fna"),
os.path.join(broken_multiple_folder, file_number + ".fna"))
os.replace(os.path.join(directory_out, file_number + ".log"),
os.path.join(broken_multiple_folder, file_number + ".log"))


def main(orthodata_filepath, annotation_gbff, cds_from_genomic, initfna_filepath, species, directory_out):
global NUMBER_OF_NEED_TO_BE_WRITTEN
global BROKEN_LIST
if not os.path.isdir(directory_out):
os.makedirs(directory_out)
try:
ortho_data = pd.read_csv(orthodata_filepath, sep='\t', usecols=range(3, 3 + species))
except ValueError:
@@ -543,19 +536,32 @@ def main(orthodata_filepath, annotation_gbff, cds_from_genomic, initfna_filepath
group = int(args.group)

try:
main(args.ortho, args.gbff, args.cds, args.genome, int(args.species), args.out)
directory_out = args.out
if not os.path.isdir(directory_out):
os.makedirs(directory_out)
main(args.ortho, args.gbff, args.cds, args.genome, int(args.species), directory_out)
written_files_number = len(PROCESSED_FILES)
delta = NUMBER_OF_NEED_TO_BE_WRITTEN - written_files_number
if delta == 0:
logging.info("All files are written")

logging.info("NUMBER_OF_NEED_TO_BE_WRITTEN = {}, WRITTEN_FILES = {}, where {} in BROKEN_SPECIES list: {}"
.format(NUMBER_OF_NEED_TO_BE_WRITTEN, written_files_number, len(BROKEN_SPECIES),
repr(BROKEN_SPECIES)))
for file, written_species in PROCESSED_FILES.items():
if written_species < group:

for file, species_info in PROCESSED_FILES.items():
if species_info[1] < group:
if file not in BROKEN_SPECIES:
BROKEN_SPECIES.append(file)
else:
"sort by folders-groups"
list_of_species = list(species_info[0])
list_of_species.sort()
group_folder_name = "".join(list_of_species)
group_folder_path = os.path.join(directory_out, group_folder_name)
if not os.path.isdir(group_folder_path):
os.makedirs(group_folder_path)
os.replace(os.path.join(directory_out, file + ".fna"), os.path.join(group_folder_path, file + ".fna"))

if BROKEN_SPECIES:
replace_broken_files(args.out)
residue = written_files_number - len(BROKEN_SPECIES)
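The grouping step above (the "# sort by folders-groups" branch) names a destination folder by sorting the species ids accumulated per output file in PROCESSED_FILES, so every alignment covering the same species set lands in the same folder. A minimal sketch of that rule, assuming single-character species ids as the concatenated id string implies (group_folder_for is a hypothetical helper):

import os

def group_folder_for(directory_out, species_ids):
    # "3142" and "2413" both map to the same "1234" group folder
    group_folder_name = "".join(sorted(species_ids))
    group_folder_path = os.path.join(directory_out, group_folder_name)
    os.makedirs(group_folder_path, exist_ok=True)
    return group_folder_path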
3 changes: 2 additions & 1 deletion pipeline/get_orthologs_table.py
@@ -14,7 +14,7 @@

def main(proteinortho_file, species, required):
out_file_name = "{}{}{}".format(os.path.split(proteinortho_file)[0], "/formed_", os.path.split(proteinortho_file)[1])
proteinortho_data = pd.read_csv(proteinortho_file, sep='\t')
proteinortho_data = pd.read_csv(proteinortho_file, sep=',')
logging.info("Input file {}\nOutput file {}".format(proteinortho_file, out_file_name))
result = []
for species in species.split(','):
@@ -52,4 +52,5 @@ def main(proteinortho_file, species, required):
main(args.ortho, args.species, args.required)
except BaseException as e:
logging.exception("Unexpected error: {}".format(e))
raise e
logging.info("The orthologs table was recorded")
12 changes: 6 additions & 6 deletions pipeline/prank_alignment.py
@@ -68,11 +68,11 @@ def launch_prank(infile, folder_out, tree, format_out):
global PROCESSED_FILES
launch_command = get_launch_command(infile, final_file_path, outfile_path_without_extension, tree, format_out)
if not os.system(launch_command):
logging.info("prank completed task for file {}".format(file_number))
# logging.info("prank completed task for file {}".format(file_number))
if file_number not in PROCESSED_FILES:
PROCESSED_FILES.append(file_number)
with counter.get_lock():
counter.value += 1
counter.value += 1 # TODO: wrong number
# logging.info("Counter (ALIGNED_FILES) = {}".format(counter.value)) # TODO: multiprocessing
except BaseException as err:
global EXCEPTION_NUMBER
@@ -114,8 +114,8 @@ def launch_prank(infile, folder_out, tree, format_out):
i.get()
except BaseException as e:
logging.exception("Unexpected error: {}".format(e))
logging.info("Number of PROCESSED_FILES = {}".format(counter.value))
logging.info("Number of prank exceptions = {}".format(EXCEPTION_NUMBER))
logging.info("Number of PROCESSED_FILES = {}".format(counter.value))
logging.info("Number of prank exceptions = {}".format(EXCEPTION_NUMBER))
# logging.info("Number of PROCESSED_FILES = {}".format(counter.value))
# logging.info("Number of prank exceptions = {}".format(EXCEPTION_NUMBER))
# logging.info("Number of PROCESSED_FILES = {}".format(counter.value))
# logging.info("Number of prank exceptions = {}".format(EXCEPTION_NUMBER))
logging.info("The work has been completed")
10 changes: 0 additions & 10 deletions usages/adjust_pvalue.py
@@ -27,16 +27,6 @@ def main(sheet_path, out_path):
sheet['Significance_FDR'] = sheet.apply(lambda row: get_FDR_significance(row), axis=1)
sheet.to_excel(writer, sheet_name=name)
writer.save()
# new_sheet = {'Gene name': list(), '2a, Dn/Ds foreground': list(), 'P-value': list()}
# for row in sheet.iterrows():
# if row[1]['Gene name'] in target_list:
# print(name)
# print(row[1]['Gene name'])
# new_sheet['Gene name'].append(row[1]['Gene name'])
# new_sheet['2a, Dn/Ds foreground'].append(row[1]['2a, Dn/Ds foreground'])
# new_sheet['P-value'].append(row[1]['P-value'])
# full_table = full_table.append(sheet)
# df = pd.DataFrame(summary_sheet, columns=['Gene name', 'NCBI protein_id', 'p-value'])


if __name__ == '__main__':
9 changes: 6 additions & 3 deletions usages/compare_with_data.py
@@ -6,9 +6,9 @@ def main(data_sheet_path, adjust_sheet_path, out_sheet_path):
adjust_sheet = pd.io.excel.read_excel(adjust_sheet_path, sheet_name=None, engine='openpyxl')
data_sheet = pd.io.excel.read_excel(data_sheet_path, sheet_name=None, engine='openpyxl')
writer = pd.ExcelWriter(out_sheet_path, engine='openpyxl')
full_out_table = pd.DataFrame()
for name_data, sheet_data in data_sheet.items():
if name_data == 'Datasheet S6':
if name_data == 'Datasheet S6' or name_data == 'Datasheet S5a':
full_out_table = pd.DataFrame()
for name_adjust, sheet_adjust in adjust_sheet.items():
merged = pd.merge(sheet_data, sheet_adjust, how='inner', on=['Gene name'],
suffixes=("_article", "_new"), indicator=True)
@@ -18,7 +18,10 @@ def main(data_sheet_path, adjust_sheet_path, out_sheet_path):
}
merged['_merge'] = merged['_merge'].map(d)
full_out_table = full_out_table.append(merged)
full_out_table.to_excel(writer, sheet_name='Datasheet S6', index=False)
if name_data == 'Datasheet S6':
full_out_table.to_excel(writer, sheet_name='Datasheet S6', index=False)
elif name_data == 'Datasheet S5a':
full_out_table.to_excel(writer, sheet_name='Datasheet S5a', index=False)
writer.save()
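The comparison above hinges on pandas' merge indicator: with indicator=True, pandas adds a _merge column valued 'both', 'left_only', or 'right_only', which the dict d (elided above) remaps to readable labels before the sheets are written. A tiny self-contained example of the idiom (the frames and the label mapping are hypothetical):

import pandas as pd

article = pd.DataFrame({'Gene name': ['TP53', 'BRCA1'], 'P-value': [0.010, 0.200]})
new = pd.DataFrame({'Gene name': ['TP53', 'MDM2'], 'P-value': [0.030, 0.045]})
merged = pd.merge(article, new, how='inner', on=['Gene name'],
                  suffixes=("_article", "_new"), indicator=True)
merged['_merge'] = merged['_merge'].map({'both': 'found in both sheets'})
print(merged)  # one row: TP53 with P-value_article, P-value_new, and the label

With how='inner' only 'both' rows survive, so the mapping needs just that key; an outer merge would also surface left_only and right_only rows.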

