auto selection of gblocks params; sort by folders-groups in beginning
lnalinaf committed May 26, 2021
1 parent 26bd9e1 commit 7bd2857
Showing 6 changed files with 66 additions and 50 deletions.
44 changes: 30 additions & 14 deletions pipeline/gblocks_alignment.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import math
import multiprocessing
import os
import logging
@@ -18,19 +19,32 @@ def init_counter(args):
counter_file = args


def parse_dir(in_dir):
for infile in os.listdir(in_dir):
if infile.split('.')[-1] == 'fas':
yield os.path.join(in_dir, infile)
def parse_dir(input_dir):
for species_folder in os.scandir(input_dir):
if os.path.isdir(species_folder):
for infile in os.scandir(species_folder):
if infile.name.split('.')[-1] == 'fas':  # os.scandir yields DirEntry objects, so test .name
yield input_dir, species_folder.name, infile.name  # yield names, not DirEntry objects


def launch_gblocks(infile, exec_path, child_logger):
def launch_gblocks(input_tuple, auto_flag, exec_path, child_logger):
global counter_file
file_number = re.search(r'\/(\d+)\.', infile).group(1)
launch = '{} {} -t=c -b1=3 -b2=4 -b3=8 -b4=10 -b5=n ' \
'-p=Yes >> {}'.format(exec_path, infile, LOG_FILE) # TODO: > LOG_FILE: to do multiprocessing
input_dir, species_folder, infile = input_tuple
infile_path = os.path.join(input_dir, species_folder, infile)
if auto_flag == 'n':
params_string = '-t=c -b1=3 -b2=4 -b3=8 -b4=10 -b5=n -p=y'
else:
if len(species_folder) <= 9:
number_of_species = len(species_folder)
else:
number_of_species = (len(species_folder) - 9) / 2 + 9
b1 = math.ceil(number_of_species / 2 + 1)
b2 = math.ceil(number_of_species * 0.85)
params_string = '-t=c -b1={} -b2={} -b3=8 -b4=9 -b5=n -p=y'.format(b1, b2)

launch = '{} {} {} >> {}'.format(exec_path, infile_path, params_string, LOG_FILE) # TODO: > LOG_FILE: to do multiprocessing
os.system(launch)
child_logger.info("Gblocks processed file {} with params {}".format(file_number, '-t=c -b1=3 -b2=4 -b3=7 -b4=6 -b5=h'))
child_logger.info("Gblocks processed file {} with params {}".format(infile, params_string))
with counter_file.get_lock():
counter_file.value += 1
child_logger.info("Counter (processed files) = {}".format(counter_file.value))
@@ -41,6 +55,8 @@ def launch_gblocks(infile, exec_path, child_logger):
parser.add_argument('--i', help='Path to the folder with input files for Gblocks. '
'FASTA format is accepted', nargs='?')
parser.add_argument('--exec', help='Path to the Gblocks executable', nargs='?')
parser.add_argument('--auto', help='\'y\' or \'n\': automatic selection of basic parameters according to group size',
nargs='?')
parser.add_argument('--threads', help='Number of threads', nargs='?')
args = parser.parse_args()
threads = int(args.threads)
@@ -53,16 +69,16 @@ def launch_gblocks(infile, exec_path, child_logger):
pool = multiprocessing.Pool(processes=threads, initializer=init_counter, initargs=(counter_file,))
in_dir = args.i
executable_path = args.exec
inputs = list(parse_dir(in_dir))
len_inputs = len(inputs)
input_tuples = list(parse_dir(in_dir))
len_inputs = len(input_tuples)
logger.info("Path to the folder with input files for Gblocks: {}\nExecutable path: {}".
format(in_dir, executable_path))
i = pool.starmap_async(launch_gblocks, zip(inputs, len_inputs * [executable_path], len_inputs * [logger]))
format(in_dir, executable_path))
i = pool.starmap_async(launch_gblocks, zip(input_tuples, len_inputs * [args.auto], len_inputs * [executable_path], len_inputs * [logger]))  # launch_gblocks expects (input_tuple, auto_flag, exec_path, child_logger)
i.wait()
i.get()
except BaseException as e:
logger.exception("Unexpected error: {}".format(e))
logger.info("Number of processed files = {}".format(counter_file.value))

raise e
logger.info("Number of processed files = {}".format(counter_file.value))
logger.info("The work has been completed")
38 changes: 22 additions & 16 deletions pipeline/get_ortho_nucleotides.py
@@ -280,15 +280,16 @@ def anti_repeat_check_with_comparing(anti_repeat_store, protein_id, nucleotide_s
return nucleotide_seq


def write_fasta_file(directory_out, file_out_number, seq, species_numerating):
def write_fasta_file(directory_out, file_out_number, seq_record, species_numerating):
global PROCESSED_FILES
with open(os.path.join(directory_out, file_out_number + ".fna"), "a") as ofile:
SeqIO.write(seq, ofile, "fasta")
SeqIO.write(seq_record, ofile, "fasta")
logging.info("Sequence corresponding to the species number {} has been added to the file {}".format(
species_numerating, file_out_number))
if not PROCESSED_FILES.get(file_out_number):
PROCESSED_FILES[file_out_number] = 0
PROCESSED_FILES[file_out_number] += 1
PROCESSED_FILES[file_out_number] = ["", 0]
PROCESSED_FILES[file_out_number][0] += seq_record.id
PROCESSED_FILES[file_out_number][1] += 1


def write_log_file(log_file, gene, protein_id, seq_length,
@@ -478,25 +479,17 @@ def conv_string(val):

def replace_broken_files(directory_out):
broken_species_folder = "broken_species_files"
broken_multiple_folder = "broken_multiple_files"
os.makedirs(broken_species_folder)
for file_number in BROKEN_SPECIES:
os.replace(os.path.join(directory_out, file_number + ".fna"),
os.path.join(broken_species_folder, file_number + ".fna"))
os.replace(os.path.join(directory_out, file_number + ".log"),
os.path.join(broken_species_folder, file_number + ".log"))
for file_number in BROKEN_MULTIPLE_THREE:
os.replace(os.path.join(directory_out, file_number + ".fna"),
os.path.join(broken_multiple_folder, file_number + ".fna"))
os.replace(os.path.join(directory_out, file_number + ".log"),
os.path.join(broken_multiple_folder, file_number + ".log"))


def main(orthodata_filepath, annotation_gbff, cds_from_genomic, initfna_filepath, species, directory_out):
global NUMBER_OF_NEED_TO_BE_WRITTEN
global BROKEN_LIST
if not os.path.isdir(directory_out):
os.makedirs(directory_out)
try:
ortho_data = pd.read_csv(orthodata_filepath, sep='\t', usecols=range(3, 3 + species))
except ValueError:
@@ -543,19 +536,32 @@ def main(orthodata_filepath, annotation_gbff, cds_from_genomic, initfna_filepath
group = int(args.group)

try:
main(args.ortho, args.gbff, args.cds, args.genome, int(args.species), args.out)
directory_out = args.out
if not os.path.isdir(directory_out):
os.makedirs(directory_out)
main(args.ortho, args.gbff, args.cds, args.genome, int(args.species), directory_out)
written_files_number = len(PROCESSED_FILES)
delta = NUMBER_OF_NEED_TO_BE_WRITTEN - written_files_number
if delta == 0:
logging.info("All files are written")

logging.info("NUMBER_OF_NEED_TO_BE_WRITTEN = {}, WRITTEN_FILES = {}, where {} in BROKEN_SPECIES list: {}"
.format(NUMBER_OF_NEED_TO_BE_WRITTEN, written_files_number, len(BROKEN_SPECIES),
repr(BROKEN_SPECIES)))
for file, written_species in PROCESSED_FILES.items():
if written_species < group:

for file, species_info in PROCESSED_FILES.items():
if species_info[1] < group:
if file not in BROKEN_SPECIES:
BROKEN_SPECIES.append(file)
else:
"sort by folders-groups"
list_of_species = list(species_info[0])
list_of_species.sort()
group_folder_name = "".join(list_of_species)
group_folder_path = os.path.join(directory_out, group_folder_name)
if not os.path.isdir(group_folder_path):
os.makedirs(group_folder_path)
os.replace(os.path.join(directory_out, file + ".fna"), os.path.join(group_folder_path, file + ".fna"))

if BROKEN_SPECIES:
replace_broken_files(args.out)
residue = written_files_number - len(BROKEN_SPECIES)
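The grouping step above (the "# sort by folders-groups" branch) names a destination folder by sorting the species ids accumulated per output file in PROCESSED_FILES, so every alignment covering the same species set lands in the same folder. A minimal sketch of that rule, assuming single-character species ids as the concatenated id string implies (group_folder_for is a hypothetical helper):

import os

def group_folder_for(directory_out, species_ids):
    # "3142" and "2413" both map to the same "1234" group folder
    group_folder_name = "".join(sorted(species_ids))
    group_folder_path = os.path.join(directory_out, group_folder_name)
    os.makedirs(group_folder_path, exist_ok=True)
    return group_folder_path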
3 changes: 2 additions & 1 deletion pipeline/get_orthologs_table.py
@@ -14,7 +14,7 @@

def main(proteinortho_file, species, required):
out_file_name = "{}{}{}".format(os.path.split(proteinortho_file)[0], "/formed_", os.path.split(proteinortho_file)[1])
proteinortho_data = pd.read_csv(proteinortho_file, sep='\t')
proteinortho_data = pd.read_csv(proteinortho_file, sep=',')
logging.info("Input file {}\nOutput file {}".format(proteinortho_file, out_file_name))
result = []
for species in species.split(','):
@@ -52,4 +52,5 @@ def main(proteinortho_file, species, required):
main(args.ortho, args.species, args.required)
except BaseException as e:
logging.exception("Unexpected error: {}".format(e))
raise e
logging.info("The orthologs table was recorded")
12 changes: 6 additions & 6 deletions pipeline/prank_alignment.py
@@ -68,11 +68,11 @@ def launch_prank(infile, folder_out, tree, format_out):
global PROCESSED_FILES
launch_command = get_launch_command(infile, final_file_path, outfile_path_without_extension, tree, format_out)
if not os.system(launch_command):
logging.info("prank completed task for file {}".format(file_number))
# logging.info("prank completed task for file {}".format(file_number))
if file_number not in PROCESSED_FILES:
PROCESSED_FILES.append(file_number)
with counter.get_lock():
counter.value += 1
counter.value += 1 # TODO: wrong number
# logging.info("Counter (ALIGNED_FILES) = {}".format(counter.value)) # TODO: multiprocessing
except BaseException as err:
global EXCEPTION_NUMBER
@@ -114,8 +114,8 @@ def launch_prank(infile, folder_out, tree, format_out):
i.get()
except BaseException as e:
logging.exception("Unexpected error: {}".format(e))
logging.info("Number of PROCESSED_FILES = {}".format(counter.value))
logging.info("Number of prank exceptions = {}".format(EXCEPTION_NUMBER))
logging.info("Number of PROCESSED_FILES = {}".format(counter.value))
logging.info("Number of prank exceptions = {}".format(EXCEPTION_NUMBER))
# logging.info("Number of PROCESSED_FILES = {}".format(counter.value))
# logging.info("Number of prank exceptions = {}".format(EXCEPTION_NUMBER))
# logging.info("Number of PROCESSED_FILES = {}".format(counter.value))
# logging.info("Number of prank exceptions = {}".format(EXCEPTION_NUMBER))
logging.info("The work has been completed")
10 changes: 0 additions & 10 deletions usages/adjust_pvalue.py
@@ -27,16 +27,6 @@ def main(sheet_path, out_path):
sheet['Significance_FDR'] = sheet.apply(lambda row: get_FDR_significance(row), axis=1)
sheet.to_excel(writer, sheet_name=name)
writer.save()
# new_sheet = {'Gene name': list(), '2a, Dn/Ds foreground': list(), 'P-value': list()}
# for row in sheet.iterrows():
# if row[1]['Gene name'] in target_list:
# print(name)
# print(row[1]['Gene name'])
# new_sheet['Gene name'].append(row[1]['Gene name'])
# new_sheet['2a, Dn/Ds foreground'].append(row[1]['2a, Dn/Ds foreground'])
# new_sheet['P-value'].append(row[1]['P-value'])
# full_table = full_table.append(sheet)
# df = pd.DataFrame(summary_sheet, columns=['Gene name', 'NCBI protein_id', 'p-value'])


if __name__ == '__main__':
9 changes: 6 additions & 3 deletions usages/compare_with_data.py
@@ -6,9 +6,9 @@ def main(data_sheet_path, adjust_sheet_path, out_sheet_path):
adjust_sheet = pd.io.excel.read_excel(adjust_sheet_path, sheet_name=None, engine='openpyxl')
data_sheet = pd.io.excel.read_excel(data_sheet_path, sheet_name=None, engine='openpyxl')
writer = pd.ExcelWriter(out_sheet_path, engine='openpyxl')
full_out_table = pd.DataFrame()
for name_data, sheet_data in data_sheet.items():
if name_data == 'Datasheet S6':
if name_data == 'Datasheet S6' or name_data == 'Datasheet S5a':
full_out_table = pd.DataFrame()
for name_adjust, sheet_adjust in adjust_sheet.items():
merged = pd.merge(sheet_data, sheet_adjust, how='inner', on=['Gene name'],
suffixes=("_article", "_new"), indicator=True)
@@ -18,7 +18,10 @@ def main(data_sheet_path, adjust_sheet_path, out_sheet_path):
}
merged['_merge'] = merged['_merge'].map(d)
full_out_table = full_out_table.append(merged)
full_out_table.to_excel(writer, sheet_name='Datasheet S6', index=False)
if name_data == 'Datasheet S6':
full_out_table.to_excel(writer, sheet_name='Datasheet S6', index=False)
elif name_data == 'Datasheet S5a':
full_out_table.to_excel(writer, sheet_name='Datasheet S5a', index=False)
writer.save()
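The comparison above hinges on pandas' merge indicator: with indicator=True, pandas adds a _merge column valued 'both', 'left_only', or 'right_only', which the dict d (elided above) remaps to readable labels before the sheets are written. A tiny self-contained example of the idiom (the frames and the label mapping are hypothetical):

import pandas as pd

article = pd.DataFrame({'Gene name': ['TP53', 'BRCA1'], 'P-value': [0.010, 0.200]})
new = pd.DataFrame({'Gene name': ['TP53', 'MDM2'], 'P-value': [0.030, 0.045]})
merged = pd.merge(article, new, how='inner', on=['Gene name'],
                  suffixes=("_article", "_new"), indicator=True)
merged['_merge'] = merged['_merge'].map({'both': 'found in both sheets'})
print(merged)  # one row: TP53 with P-value_article, P-value_new, and the label

With how='inner' only 'both' rows survive, so the mapping needs just that key; an outer merge would also surface left_only and right_only rows.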

