diff --git a/bin/AsaruSim.py b/bin/AsaruSim.py
new file mode 100644
index 0000000..12989ea
--- /dev/null
+++ b/bin/AsaruSim.py
@@ -0,0 +1,36 @@
+import argparse
+from badread_caller import badread_caller, setup_badread_command
+from template_maker import template_maker, setup_template_parameters
+from PCR_amplificator import PCR_amplificator, setup_PCR_parameters
+
+def setup_parent_parser():
+    parent_parser = argparse.ArgumentParser(add_help=False)
+    parent_parser.add_argument('--debug', action='store_true', help='Enable debug mode')
+    return parent_parser
+
+
+def main():
+    parent_parser = setup_parent_parser()
+    main_parser = argparse.ArgumentParser()
+
+    subparsers = main_parser.add_subparsers(title="command", dest="command", required=True)
+
+    subparsers.add_parser('call_badread', parents=[setup_badread_command(parent_parser)],
+                          add_help=False)
+    subparsers.add_parser('template_maker', parents=[setup_template_parameters(parent_parser)],
+                          add_help=False)
+    subparsers.add_parser('PCR', parents=[setup_PCR_parameters(parent_parser)],
+                          add_help=False)
+
+    args = main_parser.parse_args()
+
+    if args.command == 'call_badread':
+        badread_caller(args)
+    elif args.command == 'template_maker':
+        template_maker(args)
+    elif args.command == 'PCR':
+        PCR_amplificator(args)
+
+
+if __name__ == "__main__":
+    main()
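For orientation: AsaruSim.py turns the three scripts below into subcommands of a single CLI. Hypothetical example invocations (file names are placeholders; the flags are the ones defined by the imported subparsers):

    python bin/AsaruSim.py template_maker --matrix matrix.csv --transcriptome ref.fa
    python bin/AsaruSim.py PCR -f template.fa --cycles 5 --dup 0.7
    python bin/AsaruSim.py call_badread -t template.fa -o sim_read.fq --thread 4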
diff --git a/bin/PCR.py b/bin/PCR_amplificator.py
similarity index 74%
rename from bin/PCR.py
rename to bin/PCR_amplificator.py
index 6a5b514..ee8c16b 100755
--- a/bin/PCR.py
+++ b/bin/PCR_amplificator.py
@@ -4,23 +4,33 @@ import random
 import os, sys, math
 
 from toolkit import multiprocessing_submit
+from toolkit import read_template
 
 YELLOW = "\033[93m"
 GRAY = "\033[90m"
 RESET = "\033[0m"
 
 logging.basicConfig(level=logging.INFO, format=GRAY+ '%(message)s' +RESET)
 
-parser = argparse.ArgumentParser(description="Script for generating template sequences for scRNAseq simulation.")
-
-parser.add_argument('-f','--template', type=str, required=True, help="Path to the template FASTA file.")
-parser.add_argument('-o','--out', type=str, default="out.fa", help="Path to the output FASTA file.")
-parser.add_argument('-c','--cycles', type=int, default=5, help="number of cycles.")
-parser.add_argument('-d','--dup', type=float, default=0.7, help="duplication rate.")
-parser.add_argument('-e','--error', type=float, default=0.00003, help="error rate.")
-parser.add_argument('-t','--thread', type=int, default=4, help="number of threads to use.")
-parser.add_argument('-b','--batch_size', type=int, default=500, help="batch size.")
-parser.add_argument('-n','--totalNamber', type=int, default=None, help="total number of sequence to select from the finel pool.")
-parser.add_argument('-s','--seed', type=int, default=2024, help="seed value.")
+
+
+def setup_PCR_parameters(parent_parser):
+    description="PCR amplification simulator."
+    if parent_parser:
+        parser = argparse.ArgumentParser(parents=[parent_parser],
+                                         description=description)
+    else:
+        parser = argparse.ArgumentParser(description=description)
+
+    parser.add_argument('-f','--template', type=str, required=True, help="Path to the template FASTA file.")
+    parser.add_argument('-o','--out', type=str, default="out.fa", help="Path to the output FASTA file.")
+    parser.add_argument('-c','--cycles', type=int, default=5, help="Number of PCR cycles.")
+    parser.add_argument('-d','--dup', type=float, default=0.7, help="Duplication rate.")
+    parser.add_argument('-e','--error', type=float, default=0.00003, help="Error rate.")
+    parser.add_argument('-t','--thread', type=int, default=4, help="Number of threads to use.")
+    parser.add_argument('-b','--batch_size', type=int, default=500, help="Batch size.")
+    parser.add_argument('-n','--totalNamber', type=int, default=None, help="Total number of sequences to select from the final pool.")
+    parser.add_argument('-s','--seed', type=int, default=2024, help="Seed value.")
+    return parser
 
 class Molecule ():
     def __init__(self, name, length, seq, root, inherited_mut):
@@ -113,39 +123,8 @@ def PCR(template, cycles, dup_rate, error_rate, totalNumber, outfile):
     sequencing(pool_list, outfile, totalNumber)
 
 
-def read_template(fasta, thread, batch_size):
-    """
-    Function to read and extract sequences from a FASTQ file. It supports both plain text and gzipped files.
-    Inputs:
-        fastq: String, path to the FASTQ file.
-    Returns: List of sequences encoded in ASCII.
-    """
-    if thread==1:
-        with open(fasta, 'rt') as f:
-            while True:
-                header = f.readline()
-                if not header: break
-                name = header[1:].strip()
-                seq = f.readline().strip()
-                yield name, seq
-    else:
-        batch = []
-        with open(fasta, 'r') as f:
-            while True:
-                header = f.readline()
-                if not header: break
-                name = header[1:].strip()
-                seq = f.readline().strip()
-                batch.append([name, seq])
-                if len(batch) == batch_size:
-                    yield batch
-                    batch = []
-            if len(batch) > 0:
-                yield batch
-
-
-def main():
-    args = parser.parse_args()
+def PCR_amplificator(args):
+
     logging.info("_____________________________")
     logging.info("Template file name : %s" , args.template)
     logging.info("PCR cycles : %s" , args.cycles)
@@ -206,4 +185,6 @@ def main():
     logging.info("DONE!_______________________________")
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    args = setup_PCR_parameters(None).parse_args()
+    PCR_amplificator(args)
\ No newline at end of file
diff --git a/bin/QC.py b/bin/QC.py
index 5044866..b8ce168 100755
--- a/bin/QC.py
+++ b/bin/QC.py
@@ -1,12 +1,11 @@
 import pandas as pd
 import argparse
 
-from libs import qc_generator
-from libs.figs import over_time_graph
-from libs.figs import ATGC_graph
-from libs.figs import GC_content_plot
-from libs.report import make_report
-from libs.reporte_maker import reporter
+from QC_tools import qc_generator
+from QC_tools.figs import over_time_graph
+from QC_tools.figs import ATGC_graph
+from QC_tools.figs import GC_content_plot
+from QC_tools.reporte_maker import reporter
 
 import plotly.graph_objects as go
 
 parser = argparse.ArgumentParser(description='Process some integers.')
@@ -65,8 +64,8 @@ def create_table(stats_data, table_width=500, table_height=400, margin=10):
         data=[go.Table(
             header=dict(
                 values=['', 'Value'],
-                fill_color='#EBEBEB', # Couleur de fond de l'en-tête
-                font=dict(color='black', size=12) # Couleur et taille du texte de l'en-tête
+                fill_color='#EBEBEB',
+                font=dict(color='black', size=12)
             ),
             cells=dict(
                 values=[
@@ -78,8 +77,8
@@ def create_table(stats_data, table_width=500, table_height=400, margin=10): int(int(stats_data['Simulated UMI counts']) / int(stats_data['Simulated Cell BC'])) ] ], - fill_color='#f4f4f4', # Couleur de fond des cellules - font=dict(color='black', size=11) # Couleur et taille du texte des cellules + fill_color='#f4f4f4', + font=dict(color='black', size=11) ) )] @@ -131,7 +130,6 @@ def main(): pie_fig = pie_unknown(df_stats) stats_table = create_table(df_stats) - #make_report(Q_over_time, atgc, gc, args.project, args.positions) reporter(stats_table, pie_fig, Q_over_time, diff --git a/bin/QC_tools/.DS_Store b/bin/QC_tools/.DS_Store new file mode 100644 index 0000000..b10f6c9 Binary files /dev/null and b/bin/QC_tools/.DS_Store differ diff --git a/bin/QC_tools/._.DS_Store b/bin/QC_tools/._.DS_Store new file mode 100644 index 0000000..043f343 Binary files /dev/null and b/bin/QC_tools/._.DS_Store differ diff --git a/bin/QC_tools/._css_style.py b/bin/QC_tools/._css_style.py new file mode 100644 index 0000000..f561729 Binary files /dev/null and b/bin/QC_tools/._css_style.py differ diff --git a/bin/QC_tools/._qc_generator.py b/bin/QC_tools/._qc_generator.py new file mode 100644 index 0000000..faef07f Binary files /dev/null and b/bin/QC_tools/._qc_generator.py differ diff --git a/bin/libs/css_style.py b/bin/QC_tools/css_style.py similarity index 100% rename from bin/libs/css_style.py rename to bin/QC_tools/css_style.py diff --git a/bin/libs/fastq_reader.cpp b/bin/QC_tools/fastq_reader.cpp similarity index 100% rename from bin/libs/fastq_reader.cpp rename to bin/QC_tools/fastq_reader.cpp diff --git a/bin/libs/figs.py b/bin/QC_tools/figs.py similarity index 99% rename from bin/libs/figs.py rename to bin/QC_tools/figs.py index a4a61f4..50a5ca6 100755 --- a/bin/libs/figs.py +++ b/bin/QC_tools/figs.py @@ -52,7 +52,7 @@ def make_plots(df_stats, positions, q1, q_scores, q3): percentage_T=df_stats["%T"], result_directory='your/directory/path', graph_name='Base % over sequence', - yaxis_title='Q Score', + yaxis_title='Base (%)', log=False, sigma=1, yaxis_starts_zero=False, diff --git a/bin/libs/qc_generator.py b/bin/QC_tools/qc_generator.py similarity index 82% rename from bin/libs/qc_generator.py rename to bin/QC_tools/qc_generator.py index 6381a2b..1a5724c 100755 --- a/bin/libs/qc_generator.py +++ b/bin/QC_tools/qc_generator.py @@ -5,7 +5,7 @@ from tqdm import tqdm from concurrent.futures import ProcessPoolExecutor, as_completed -from fqread.fastq_reader import SequenceStats +from QC_tools.fqread.fastq_reader import SequenceStats def process_fastq_file(filename, pos, reverse): @@ -16,14 +16,14 @@ def process_fastq_file(filename, pos, reverse): with open_fn(filename[0], 'rb') as f: while sequence_count < 100000: header = f.readline() - if not header: break # EOF + if not header: break if reverse: seq = f.readline().strip()[::-1][:pos] - f.readline() # Plus line + f.readline() quality = f.readline().strip()[::-1] else: seq = f.readline().strip()[:pos] - f.readline() # Plus line + f.readline() quality = f.readline().strip() stats.add_sequence(seq, quality) diff --git a/bin/libs/reporte_maker.py b/bin/QC_tools/reporte_maker.py similarity index 99% rename from bin/libs/reporte_maker.py rename to bin/QC_tools/reporte_maker.py index 15f9115..7cf5dbd 100644 --- a/bin/libs/reporte_maker.py +++ b/bin/QC_tools/reporte_maker.py @@ -4,7 +4,7 @@ from dominate.tags import * from dominate.util import raw from datetime import date, datetime -from libs.css_style import body_style, head_style, div_summary_style +from 
QC_tools.css_style import body_style, head_style, div_summary_style import plotly.express as px df2 = px.data.iris() diff --git a/bin/SLSim/error_simulator.py b/bin/SLSim/error_simulator.py deleted file mode 100755 index d4d0467..0000000 --- a/bin/SLSim/error_simulator.py +++ /dev/null @@ -1,187 +0,0 @@ -""" -This script is based on the original work from https://github.com/youyupei/SLSim/ -All original rights are acknowledged. -""" -import sys -import argparse -import Bio.SeqIO -import textwrap -from tqdm import tqdm -from badread.simulate import sequence_fragment, ErrorModel, QScoreModel, Identities -import os -import multiprocessing as mp -import random -import numpy as np - -import helper -#import config - -def parse_arg(): - parser = argparse.ArgumentParser( - description=textwrap.dedent( - ''' - Descript come later - '''), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - # Required positional argument - parser.add_argument('-t', '--template-fasta', type=str,required=True, - help='Template fastq file.') - # Required positional argument - parser.add_argument('--badread-error-model', type=str, default='nanopore2020', - help=textwrap.dedent(''' - Error model from babread.[reference needed] - ''' - )) - parser.add_argument('--badread-qscore-model', type=str, default='nanopore2020', - help=textwrap.dedent(''' - Error model from babread.[reference needed] - ''' - )) - parser.add_argument('--badread-identity', type=str, default='87.5,5,97.5', - help=textwrap.dedent(''' - Identity/accuracy (in percentage) parameter pass to badread: format mean,st,max. - ''' - )) - - - parser.add_argument('-o','--output-filename', type=str, default='sim_read.fq', - help=textwrap.dedent(''' - Filename of simulated reads with errors. - ''' - )) - parser.add_argument('--batch-size', type=int, default=500, - help=textwrap.dedent(''' - Batch size - ''' - )) - parser.add_argument('--thread', type=int, default = mp.cpu_count()-1, - help=textwrap.dedent(''' - Thread - ''' - )) - parser.add_argument('--test-error-dist', action='store_true', - help=textwrap.dedent(''' - Test distribution of badread - ''' - )) - - - args = parser.parse_args() - - if not args.test_error_dist: - helper.check_exist([args.template_fasta]) - return args - - -def sim_read(perfect_read, args, error_model, qscore_model, identities): - """Simulate error into perfect read using Badread error and qscore model - - read (str): perfect reads - """ - #random.seed("123") - #np.random.seed("123") - output=sys.stderr - seq, quals, actual_identity, identity_by_qscores = \ - sequence_fragment(perfect_read, identities.get_identity(), - error_model, qscore_model) - return seq, quals - - - -def read_batch_generator(fasta_fns, batch_size): - """Generator of barches of reads from list of fasta files - Args: - fasta_fns (list): fasta filenames - batch_size (int, optional): Defaults to 100. 
- """ - for fn in fasta_fns: - if str(fn).endswith('.gz'): - with gzip.open(fn, "rt") as handle: - fasta = Bio.SeqIO.parse(handle, "fasta") - read_batch = helper.batch_iterator(fasta, batch_size=batch_size) - for batch in read_batch: - yield batch - else: - fasta = Bio.SeqIO.parse(fn, "fasta") - read_batch = helper.batch_iterator(fasta, batch_size=batch_size) - for batch in read_batch: - yield batch - - -def sim_read_batch(read_batch, args, error_model, qscore_model, identities): - fastq_lines = [] - for read in read_batch: - seq, qscore = sim_read(str(read.seq), args, error_model, - qscore_model, identities) - # write read.id, se quals\ - fastq_lines.extend([f'@{read.id}', seq, '+' , qscore]) - return fastq_lines - - -def main(output=sys.stderr): - args = parse_arg() - - # load BadRead models - error_model = ErrorModel(args.badread_error_model, output) - qscore_model = QScoreModel(args.badread_qscore_model, output) - - # error rate distribution, .get_identity method generate random error rate - # from beta distribution - mean, sd, maxi = [float(x) for x in args.badread_identity.split(',')] - identities = Identities(mean, sd, maxi, output) - if args.test_error_dist: - sys.exit(0) - - # input template file - if os.path.isdir(args.template_fasta): - fns = helper.get_files(fastq_dir, ['*.fastq', '*.fq', '*.fastq.gz', '*.fq.gz', - '*.fasta', '*.fa', '*.fasta.gz', '*.fa.gz']) - elif os.path.isfile(args.template_fasta): - fns = [args.template_fasta] - else: - helper.err_msg("Invalid input of template fasta file") - sys.exit(1) - - read_batchs = read_batch_generator(fns, args.batch_size) - - - # single threading mode (good for debuging) - if args.thread == 1: - pbar = tqdm(unit = 'read', desc='Processed') - for batch in read_batchs: - fastq_records = sim_read_batch(batch, args, error_model, - qscore_model, identities) - - with open(args.output_filename, 'w') as f: - f.write('\n'.join(fastq_records) + '\n') - pbar.update(args.batch_size) - - helper.green_msg(f'Simulation finished!') - return None - else: - # multiprocessing mode - pass - - rst_futures = helper.multiprocessing_submit(sim_read_batch, - read_batchs, n_process=args.thread, - pbar_update=args.batch_size, - args=args, - error_model=error_model, - qscore_model=qscore_model, - identities=identities) - - for idx, f in enumerate(rst_futures): - fastq_records = f.result() #write rst_df out - if idx == 0: - with open(args.output_filename, 'w') as f: - f.write('\n'.join(fastq_records)+'\n') - else: - with open(args.output_filename, 'a') as f: - f.write('\n'.join(fastq_records)+'\n') - - helper.green_msg(f'Simulation finished!') - return None - - -if __name__ == '__main__': - main() diff --git a/bin/SLSim/helper.py b/bin/SLSim/helper.py deleted file mode 100755 index 07faad7..0000000 --- a/bin/SLSim/helper.py +++ /dev/null @@ -1,169 +0,0 @@ -""" -This script is based on the original work from https://github.com/youyupei/SLSim/tree/main. -It has been modified and adapted -to meet specific requirements and functionalities not present in the original version. -All original rights are acknowledged. 
-""" -import numpy as np -import concurrent.futures -from concurrent.futures import ThreadPoolExecutor, as_completed -import multiprocessing as mp -from tqdm import tqdm -import os -import sys - -def err_msg(msg): - CRED = '\033[91m' - CEND = '\033[0m' - print(CRED + msg + CEND) - -def warning_msg(msg): - CRED = '\033[93m' - CEND = '\033[0m' - print(CRED + msg + CEND) - -def green_msg(msg): - CRED = '\033[92m' - CEND = '\033[0m' - print(CRED + msg + CEND) - - - -class param: - def __init__(self, **kwargs): - for key in kwargs.keys(): - self.__dict__[key] = kwargs[key] - def add(self, attr_name, attr_val, overwrite = True): - if attr_name in __dict__.keys() and not overwrite: - pass - else: - self.__dict__[attr_name] = attr_val - def rm(self, attr_name): - if attr_name in self.__dict__.keys(): - del self.__dict__[attr_name] - def __str__(self): - return str(self.__dict__) - - def check(self, attr_list, add_none = True, silent = True): - """ - Check whether the attributes in a given list are present. - - Parameters - ---------- - attr_list : LIST - list of strings of the attributes to check - add_none : BOOL - whether or not to create the missed attributes with value None - silent : BOOL - always return True if silent = True - Returns - ------- - True/False if all attributes is present - """ - try: - assert isinstance(add_none, bool) - assert isinstance(silent, bool) - except (AttributeError, TypeError): - raise AssertionError("add_none and silent must be bool variable.") - - check_res = True - for attr in attr_list: - if attr not in self.__dict__.keys(): - check_res = False - self.__dict__[attr] = None - return check_res if not silent else True - -# multiprocessing -def multiprocessing_submit(func, iterator, n_process=mp.cpu_count()-1 ,pbar = True, pbar_update = 500, *arg, **kwargs): - executor = concurrent.futures.ProcessPoolExecutor(n_process) - - # A dictionary which will contain the future object - max_queue = n_process * 2 - if pbar: - pbar = tqdm(unit = 'read', desc='Processed') - - futures = {} - n_job_in_queue = 0 - while True: - while n_job_in_queue < max_queue: - i = next(iterator, None) - if not i: - break - futures[executor.submit(func, i, *arg, **kwargs)] = None - n_job_in_queue += 1 - - # will wait until as least one job finished - job = next(as_completed(futures), None) - - # no more job - if job is None: - break - # otherwise - else: - n_job_in_queue -= 1 - # update pregress bar based on batch size - pbar.update(pbar_update) - yield job - del futures[job] - - -# check file exist -def check_exist(file_list): - exit_code = 0 - for fn in file_list: - if not os.path.exists(fn): - exit_code = 1 - err_msg(f'Error: can not find {fn}') - if exit_code == 1: - sys.exit() - -# get file with a certian extensions -def get_files(search_dir, extensions, recursive=True): - files = [] - if recursive: - for i in extensions: - files.extend(Path(search_dir).rglob(i)) - return files - else: - for i in extensions: - files.extend(Path(search_dir).glob(i)) - return files - - -# split any iterator in to batches -def batch_iterator(iterator, batch_size): - """generateor of batches of items in a iterator with batch_size. 
- """ - batch = [] - i=0 - for entry in iterator: - i += 1 - batch.append(entry) - if i == batch_size: - yield batch - batch = [] - i = 0 - if len(batch): - yield batch - - -def round_to_nearest(value, base=10): - return base * round(value / base) - - -def fastq_batch_generator(fastq, batch_size): - open_fn = gzip.open if fastq.endswith('.gz') else open - - with open_fn(fastq, 'rt') as fq: - batch = [] - line_num = 0 - for line in fq: - line_num += 1 - if line_num % 4 == 0: - batch.append(line.rstrip()) - if len(batch) == batch_size: - yield batch - batch = [] - - if len(batch) > 0: - yield batch \ No newline at end of file diff --git a/bin/badread_caller.py b/bin/badread_caller.py new file mode 100644 index 0000000..c84ee8f --- /dev/null +++ b/bin/badread_caller.py @@ -0,0 +1,99 @@ +""" +This script is based on the original work from https://github.com/youyupei/SLSim/ +All original rights are acknowledged. +""" +import sys, os +import argparse +from tqdm import tqdm +import logging +from badread.simulate import sequence_fragment, ErrorModel, QScoreModel, Identities +from toolkit import multiprocessing_submit, read_template + +YELLOW = "\033[93m" +GRAY = "\033[90m" +RESET = "\033[0m" + +logging.basicConfig(level=logging.INFO, format=GRAY+ '%(message)s' +RESET) + +def setup_badread_command(parent_parser): + description="Nanopore sequencing error simulator." + if parent_parser: + parser = argparse.ArgumentParser(parents=[parent_parser], + description=description) + else : + parser = argparse.ArgumentParser(description=description) + + parser.add_argument('-t', '--template', type=str,required=True, + help='Template fastq file.') + parser.add_argument('--badread-error-model', type=str, default='nanopore2020', + help="Error model from babread.") + parser.add_argument('--badread-qscore-model', type=str, default='nanopore2020', + help="Error model from babread.") + parser.add_argument('--badread-identity', type=str, default='87.5,5,97.5', + help="Identity/accuracy (in percentage) parameter pass to badread: format mean,st,max.") + parser.add_argument('-o','--output', type=str, default='sim_read.fq', + help="Filename of simulated reads with errors.") + parser.add_argument('--batch-size', type=int, default=500, + help= "Batch size") + parser.add_argument('--thread', type=int, default = 1, help="Thread") + return parser + + +def write_fastq(output, name, seq, quals): + with open(output, 'a') as outFasta: + outFasta.write(f"@{name}\n" + f"{seq}\n" + "+\n" + f"{quals}\n") + + +def error_simulator(template, identities, err_model, q_model): + reads=[] + for name, read in template: + err_seq, quals, _, _= sequence_fragment(read, identities, err_model, q_model) + reads.append((name, err_seq, quals)) + return reads + + +def badread_caller(args): + stderr = sys.stderr + + logging.info("_____________________________") + logging.info("Input file name : %s" , args.template) + logging.info("Badread error model: %s", args.badread_error_model) + logging.info("Badread Qscore model: %s", args.badread_error_model) + logging.info("Badread identity parameters: %s", args.badread_identity) + logging.info("Output file name : %s", args.output) + logging.info("Threads: %d", args.thread) + logging.info("_______________________________") + + if os.path.exists(args.output): + sys.exit(f'Oups! 
Output file {args.output} already exists.') + + error_model = ErrorModel(args.badread_error_model, stderr) + qscore_model = QScoreModel(args.badread_qscore_model, stderr) + + badread_identity = args.badread_identity.split(',') + mean, sd, mx = [float(x) for x in badread_identity] + identities = Identities(mean, sd, mx, stderr).get_identity() + + template_chunks = read_template(args.template, + thread=args.thread, + batch_size=args.batch_size) + + results = multiprocessing_submit(error_simulator, + template_chunks, + identities=identities, + err_model=error_model, + q_model=qscore_model, + n_process=args.thread, + pbar_update=args.batch_size) + + for f in results: + reads_res = f.result() + for name, err_seq, quals in reads_res: + write_fastq(args.output, name, err_seq, quals) + +if __name__ == '__main__': + args = setup_badread_command(None).parse_args() + badread_caller(args) \ No newline at end of file diff --git a/bin/counts_simulator.R b/bin/counts_simulator.R index fd95a78..7a6ecfa 100755 --- a/bin/counts_simulator.R +++ b/bin/counts_simulator.R @@ -3,6 +3,8 @@ library(dplyr) library(SPARSim) +'%!in%' <- function(x,y)!('%in%'(x,y)) + calc.intensity <- function(mtx, ids, type) { cells <- ids %>% filter(cell_type == type) %>% @@ -30,44 +32,54 @@ calc_count_variability <- function(mtx, df, cell_type) { run_simulation <- function(matrix_file, cell_types_file, output_file) { - cat("Loading data ...\n") - mtx <- read.csv(matrix_file, sep=",", header = TRUE, row.names = 1) - cell_types <- read.csv(cell_types_file, sep=",", header = TRUE, row.names = 1) - - cat("Normalization ...\n") - mtx.norm <- scran_normalization(mtx, positive = TRUE) - - cat("Parameters estimation ...\n") - params_list <- list() - for (type in unique(cell_types$cell_type)) { - params_list[[type]] <- list( - intensity = calc.intensity(mtx.norm, cell_types, type), - variability = calc_count_variability(mtx.norm, cell_types, type), - lib_size = calc.library.size(mtx, cell_types, type) - ) - } - - simulation_params <- list() - for (type in unique(cell_types$cell_type)) { - param <- SPARSim_create_simulation_parameter( - intensity = params_list[[type]]$intensity, - variability = params_list[[type]]$variability, - library_size = params_list[[type]]$lib_size, - condition_name = type - ) - simulation_params[[type]] <- param + preset <- c('Bacher', 'Camp', 'Chu', 'Engel', 'Horning', 'Tung', 'Zheng', 'Macosko', 'Saunders') + preset <- paste0(preset, '_param_preset') + + if (matrix_file %in% preset){ + data(list=matrix_file) + simulation_params <- get(matrix_file) + }else{ + cat("Loading data ...\n") + mtx <- read.csv(matrix_file, sep=",", header = TRUE, row.names = 1) + cell_types <- read.csv(cell_types_file, sep=",", header = TRUE, row.names = 1) + + cat("Normalization ...\n") + mtx.norm <- scran_normalization(mtx, positive = TRUE) + + cat("Parameters estimation ...\n") + params_list <- list() + for (type in unique(cell_types$cell_type)) { + params_list[[type]] <- list( + intensity = calc.intensity(mtx.norm, cell_types, type), + variability = calc_count_variability(mtx.norm, cell_types, type), + lib_size = calc.library.size(mtx, cell_types, type) + ) + } + + simulation_params <- list() + for (type in unique(cell_types$cell_type)) { + param <- SPARSim_create_simulation_parameter( + intensity = params_list[[type]]$intensity, + variability = params_list[[type]]$variability, + library_size = params_list[[type]]$lib_size, + condition_name = type + ) + simulation_params[[type]] <- param + } } cat("Simulation ...\n") SPARSim_result <- 
SPARSim_simulation(simulation_params) sim.mtx <- SPARSim_result$count_matrix - - rownames(sim.mtx) <- names(params_list[[1]]$intensity) - cols <- c() - for (el in params_list) { - cols <- c(cols, names(el$lib_size)) + + if (matrix_file %!in% preset){ + rownames(sim.mtx) <- names(params_list[[1]]$intensity) + cols <- c() + for (el in params_list) { + cols <- c(cols, names(el$lib_size)) + } + colnames(sim.mtx) <- cols } - colnames(sim.mtx) <- cols cat("Saving ...\n") write.csv(sim.mtx, file=output_file) @@ -82,6 +94,6 @@ if (length(args) != 3) { tryCatch({ run_simulation(args[1], args[2], args[3]) }, error = function(e) { - cat("An error occurred: ", e$message, "\n") + cat(paste0("An error occurred: ", e$message, "\n")) }) diff --git a/bin/fqread/fastq_reader.cpython-311-x86_64-linux-gnu.so b/bin/fqread/fastq_reader.cpython-311-x86_64-linux-gnu.so deleted file mode 100755 index f1248a6..0000000 Binary files a/bin/fqread/fastq_reader.cpython-311-x86_64-linux-gnu.so and /dev/null differ diff --git a/bin/libs/.ipynb_checkpoints/figs-checkpoint.py b/bin/libs/.ipynb_checkpoints/figs-checkpoint.py deleted file mode 100755 index 2af2ef7..0000000 --- a/bin/libs/.ipynb_checkpoints/figs-checkpoint.py +++ /dev/null @@ -1,532 +0,0 @@ -from scipy.ndimage.filters import gaussian_filter1d -import plotly.graph_objs as go -from plotly.subplots import make_subplots -import numpy as np -import scipy.stats as stats - -interpolation_point_count_dict = { - 'read_length_distribution': (None, 10000, 3), - 'yield_plot': (None, 10000, 3), - 'phred_score_density': (None, 1000, 3), - 'over_time_graph': (None, 1000, 3), - 'scatterplot': (10000, 4000, 3), - 'phred_violin': (10000, 4000, 3), -} - -toulligqc_colors = {'all': '#fca311', # Yellow - 'all_1d2': '#fca311', # Yellow - 'pass': '#51a96d', # Green - 'fail': '#d90429', # Red - 'barcode_pass': '#79bf90', # Green - 'barcode_fail': '#fb1941', # Red - 'sequence_length_over_time': '#205b47', - 'phred_score_over_time': '#7aaceb', - 'speed_over_time': '#AE3F7B', - 'nseq_over_time': '#edb773', - 'pie_chart_palette': ["#f3a683", "#f7d794", "#778beb", "#e77f67", "#cf6a87", "#786fa6", "#f8a5c2", - "#63cdda", "#ea8685", "#596275"], - 'green_zone_color': 'rgba(0,100,0,.1)' - } - -def make_plots(df_stats, positions, q1, q_scores, q3): - fig = make_subplots(rows=3, cols=1) - fig.add_trace( - over_time_graph(time_series=positions, - percentile_25_series=q1, - percentile_50_series=q_scores, - percentile_75_series=q3, - result_directory='your/directory/path', - graph_name='Qscore over position', - yaxis_title='Q Score', - log=False, - sigma=1, - yaxis_starts_zero=False, - green_zone_starts_at=None), - row=3, col=1) - - fig.add_trace( - ATGC_graph(time_series=positions, - percentage_G=df_stats["%G"], - percentage_C=df_stats["%C"], - percentage_A=df_stats["%A"], - percentage_T=df_stats["%T"], - result_directory='your/directory/path', - graph_name='Base % over sequence', - yaxis_title='Q Score', - log=False, - sigma=1, - yaxis_starts_zero=False, - green_zone_starts_at=None), - row=3, col=1) - - - fig.add_trace( - GC_content_plot(df_stats['GC_content']), - row=3, col=1) - - fig.update_layout(height=600, width=800, title_text="Side By Side Subplots") - fig.write_html("cc.html") - - -def over_time_graph(time_series, - percentile_25_series, - percentile_50_series, - percentile_75_series, - result_directory, - graph_name, - yaxis_title, - color=toulligqc_colors['phred_score_over_time'], - log=False, - sigma=1, - yaxis_starts_zero=False, - green_zone_starts_at=None, - 
green_zone_color=toulligqc_colors['phred_score_over_time']): - - # Apply Gaussian filter to the percentile series - filtered_25 = gaussian_filter1d(percentile_25_series[0], sigma=sigma) - filtered_50 = gaussian_filter1d(percentile_50_series[0], sigma=sigma) - filtered_75 = gaussian_filter1d(percentile_75_series[0], sigma=sigma) - - # Create a Plotly figure - fig = go.Figure() - - # Add the 25th percentile trace - fig.add_trace(go.Scatter( - x=time_series[0], - y=filtered_25, - name="Lower percentile", - mode='lines', - line=dict(color=color, width=2), - fill=None - )) - - # Add the 75th percentile trace - fig.add_trace(go.Scatter( - x=time_series[0], - y=filtered_75, - name="Uper percentile", - mode='lines', - line=dict(color=color, width=2), - fill='tonexty', # Fill the area between this line and the previous line - fillcolor='rgba(0, 100, 0, 0.2)' # Semi-transparent fill - )) - - # Add the 50th percentile (median) trace - fig.add_trace(go.Scatter( - x=time_series[0], - y=filtered_50, - name="Mean", - mode='lines', - line=dict(color='black', width=2) - )) - - # Define the green zone if required - if green_zone_starts_at is not None: - min_x = min(time_series[0]) - max_x = max(time_series[0]) - max_y = max(filtered_75) * 1.05 - fig.add_trace(go.Scatter( - mode="lines", - x=[min_x, max_x], - y=[max_y, max_y], - line=dict(width=0), - hoverinfo="skip", - showlegend=False, - )) - fig.add_trace(go.Scatter( - mode="lines", - name="Green Zone", - x=[min_x, max_x], - y=[green_zone_starts_at, green_zone_starts_at], - fill='tonexty', - fillcolor=green_zone_color, - line=dict(width=0), - hoverinfo="skip", - )) - - ###### - ### add REVERSE - ##### - - # Apply Gaussian filter to the percentile series - filtered_25 = gaussian_filter1d(percentile_25_series[1], sigma=sigma) - filtered_50 = gaussian_filter1d(percentile_50_series[1], sigma=sigma) - filtered_75 = gaussian_filter1d(percentile_75_series[1], sigma=sigma) - - # Add the 25th percentile trace - fig.add_trace(go.Scatter( - x=time_series[1].multiply(other = -1), - y=filtered_25, - name="Lower percentile", - mode='lines', - line=dict(color=color, width=2), - fill=None, - visible=False - )) - - # Add the 75th percentile trace - fig.add_trace(go.Scatter( - x=time_series[1].multiply(other = -1), - y=filtered_75, - name="Uper percentile", - mode='lines', - line=dict(color=color, width=2), - fill='tonexty', # Fill the area between this line and the previous line - fillcolor='rgba(0, 100, 0, 0.2)', - visible=False - )) - - # Add the 50th percentile (median) trace - fig.add_trace(go.Scatter( - x=time_series[1].multiply(other = -1), - y=filtered_50, - name="Mean", - mode='lines', - line=dict(color='black', width=2), - visible=False - )) - - # Define the green zone if required - if green_zone_starts_at is not None: - min_x = min(time_series[1]) - max_x = max(time_series[1]) - max_y = max(filtered_75) * 1.05 - fig.add_trace(go.Scatter( - mode="lines", - x=[min_x, max_x], - y=[max_y, max_y], - line=dict(width=0), - hoverinfo="skip", - showlegend=False, - visible=False - )) - fig.add_trace(go.Scatter( - mode="lines", - name="Green Zone", - x=[min_x, max_x], - y=[green_zone_starts_at, green_zone_starts_at], - fill='tonexty', - fillcolor=green_zone_color, - line=dict(width=0), - hoverinfo="skip", - visible=False - )) - - ######## - # Add buttons - ############### - - fig.update_layout( - updatemenus=[ - dict( - type="buttons", - direction="down", - buttons=list([ - dict( - args=[{'visible': [True, True, True, False, False, False, ]}, - {**_xaxis('Q score', 
dict(visible=True)), - **_yaxis('Position', dict(visible=True)), - 'plot_bgcolor': '#e5ecf6'}], - label="Begining ", - method="update" - ), - dict( - args=[{'visible': [False, False, False, True, True, True, ]}, - {**_xaxis('Q score', dict(visible=True)), - **_yaxis('Position', dict(visible=True)), - 'plot_bgcolor': '#e5ecf6'}], - label="End", - method="update" - ) - ]), - pad={"r": 20, "t": 160, "l": 40, "b": 20}, - showactive=True, - x=1.0, - xanchor="left", - y=1.25, - yanchor="top" - ) - ] - ) - - # Set minimal value of y axis to 0 if required - y_axis_range_mode = 'tozero' if yaxis_starts_zero else 'normal' - - # Update layout - fig.update_layout( - title=graph_name, - autosize=False, - width=1400, - height=400, - xaxis_title=' Position ', - yaxis_title='' +yaxis_title +'', - hovermode='x unified', - yaxis=dict(rangemode=y_axis_range_mode) - ) - - # Set logarithmic scale if required - if log: - fig.update_yaxes(type="log") - - # Show the figure - return fig #.write_html("Qscore_over_positions.html") #fig.show() - - # Dummy return values (modify as needed for your application) - #table_html = None - #div = None - #output_file = None - #return graph_name, output_file, table_html, div - -def ATGC_graph(time_series, - percentage_G, - percentage_C, - percentage_A, - percentage_T, - result_directory, - graph_name, - yaxis_title, - color=toulligqc_colors['phred_score_over_time'], - log=False, - sigma=1, - yaxis_starts_zero=False, - green_zone_starts_at=None, - green_zone_color=toulligqc_colors['phred_score_over_time']): - - # Apply Gaussian filter to the percentile series - filtered_G = gaussian_filter1d(percentage_G[0], sigma=sigma) - filtered_C = gaussian_filter1d(percentage_C[0], sigma=sigma) - filtered_A = gaussian_filter1d(percentage_A[0], sigma=sigma) - filtered_T = gaussian_filter1d(percentage_T[0], sigma=sigma) - - # Create a Plotly figure - fig = go.Figure() - - # Add the 25th percentile trace - fig.add_trace(go.Scatter( - x=time_series[0], - y=filtered_G, - name="Base G Percentage", - mode='lines', - line=dict(color="red", width=2), - fill=None - )) - - # Add the 75th percentile trace - fig.add_trace(go.Scatter( - x=time_series[0], - y=filtered_A, - name="Base A Percentage", - mode='lines', - line=dict(color="green", width=2), - #fill='tonexty', # Fill the area between this line and the previous line - #fillcolor='rgba(0, 100, 0, 0.2)' # Semi-transparent fill - )) - - # Add the 75th percentile trace - fig.add_trace(go.Scatter( - x=time_series[0], - y=filtered_T, - name="Base T Percentage", - mode='lines', - line=dict(color="blue", width=2), - #fill='tonexty', # Fill the area between this line and the previous line - #fillcolor='rgba(0, 100, 0, 0.2)' # Semi-transparent fill - )) - - # Add the 50th percentile (median) trace - fig.add_trace(go.Scatter( - x=time_series[0], - y=filtered_C, - name="Base C Percentage", - mode='lines', - line=dict(color='black', width=2) - )) - - ###### - ### add REVERSE - ##### - # Apply Gaussian filter to the percentile series - filtered_G = gaussian_filter1d(percentage_G[1], sigma=sigma) - filtered_C = gaussian_filter1d(percentage_C[1], sigma=sigma) - filtered_A = gaussian_filter1d(percentage_A[1], sigma=sigma) - filtered_T = gaussian_filter1d(percentage_T[1], sigma=sigma) - # Add the 25th percentile trace - fig.add_trace(go.Scatter( - x=time_series[1].multiply(other = -1), - y=filtered_G, - name="Base G Percentage", - mode='lines', - line=dict(color="red", width=2), - fill=None, - visible=False - )) - - # Add the 75th percentile trace - 
fig.add_trace(go.Scatter( - x=time_series[1].multiply(other = -1), - y=filtered_A, - name="Base A Percentage", - mode='lines', - line=dict(color="green", width=2), - visible=False - #fill='tonexty', # Fill the area between this line and the previous line - #fillcolor='rgba(0, 100, 0, 0.2)' # Semi-transparent fill - )) - - # Add the 75th percentile trace - fig.add_trace(go.Scatter( - x=time_series[1].multiply(other = -1), - y=filtered_T, - name="Base T Percentage", - mode='lines', - line=dict(color="blue", width=2), - visible=False - #fill='tonexty', # Fill the area between this line and the previous line - #fillcolor='rgba(0, 100, 0, 0.2)' # Semi-transparent fill - )) - - # Add the 50th percentile (median) trace - fig.add_trace(go.Scatter( - x=time_series[1].multiply(other = -1), - y=filtered_C, - name="Base C Percentage", - mode='lines', - line=dict(color='black', width=2), - visible=False - )) - - ######## - # Add buttons - ############### - - fig.update_layout( - updatemenus=[ - dict( - type="buttons", - direction="down", - buttons=list([ - dict( - args=[{'visible': [True, True, True, True, False, False, False, False]}, - {**_xaxis('Q score', dict(visible=True)), - **_yaxis('Position', dict(visible=True)), - 'plot_bgcolor': '#e5ecf6'}], - label="Begining ", - method="update" - ), - dict( - args=[{'visible': [False, False, False, False, True, True, True, True]}, - {**_xaxis('Q score', dict(visible=True)), - **_yaxis('Position', dict(visible=True)), - 'plot_bgcolor': '#e5ecf6'}], - label="End", - method="update" - ) - ]), - pad={"r": 20, "t": 160, "l": 40, "b": 20}, - showactive=True, - x=1.0, - xanchor="left", - y=1.25, - yanchor="top" - ) - ] - ) - # Set minimal value of y axis to 0 if required - y_axis_range_mode = 'tozero' if yaxis_starts_zero else 'normal' - - # Update layout - fig.update_layout( - title=graph_name, - autosize=False, - width=1400, - height=400, - xaxis_title=' Position ', - yaxis_title= '' + yaxis_title + '', - hovermode='x unified', - yaxis=dict(rangemode=y_axis_range_mode) - ) - - # Set logarithmic scale if required - if log: - fig.update_yaxes(type="log") - - # Show the figure - return fig #.write_html("ATGC_graph.html") #fig.show() - - # Dummy return values (modify as needed for your application) - #table_html = None - #div = None - #output_file = None - #return graph_name, output_file, table_html, div - - -def GC_content_plot(gc_content_percentages): - # Distribution réelle - max_gc = 100 # Pourcentage max - gc_distribution = [0] * (max_gc + 1) - for gc_content in gc_content_percentages: - gc_distribution[int(gc_content)] += 1 - - # affichage en ligne - real_distribution_smooth = np.convolve(gc_distribution, np.ones(5)/5, mode='same') - - # Distribution théorique - mean_gc = np.mean(gc_content_percentages) - std_gc = np.std(gc_content_percentages) - theoretical_distribution = [stats.norm.pdf(gc, mean_gc, std_gc) * len(gc_content_percentages) for gc in range(max_gc + 1)] - - - fig = go.Figure() - fig.add_trace(go.Scatter(x=list(range(max_gc + 1)), y=real_distribution_smooth, mode='lines', name='Distribution GC réelle')) - fig.add_trace(go.Scatter(x=list(range(max_gc + 1)), y=theoretical_distribution, mode='lines', name='Distribution GC théorique')) - - # Mise en forme - fig.update_layout(title='Distribution du contenu en GC', - autosize=False, - width=1400, - height=400, - xaxis_title=' Contenu en GC (%) ', - yaxis_title=' Compte ', - legend_title=' Légende ') - - - return fig #fig.write_html("GC_content.html") #.show() - - -def interpolation_points(series, 
graph_name):
-    count = len(series)
-    threshold, npoints, sigma = interpolation_point_count_dict[graph_name]
-
-    if threshold is not None:
-        if count > threshold:
-            result = npoints
-        else:
-            result = count
-    else:
-        result = npoints
-
-    return result, sigma
-
-
-def _xaxis(title, args=None):
-    axis_dict = dict(
-        title='' + title + '',
-        titlefont_size=14,
-        tickfont_size=12)
-
-    if args is not None:
-        axis_dict.update(dict(**args))
-
-    return dict(xaxis=axis_dict)
-
-def _yaxis(title, args=None):
-    axis_dict = dict(
-        title='' + title + '',
-        titlefont_size=14,
-        tickfont_size=12,
-        fixedrange=True)
-
-    if args is not None:
-        axis_dict.update(dict(**args))
-
-    return dict(yaxis=axis_dict)
\ No newline at end of file
diff --git a/bin/libs/.ipynb_checkpoints/report-checkpoint.py b/bin/libs/.ipynb_checkpoints/report-checkpoint.py
deleted file mode 100755
index 7c9ef81..0000000
--- a/bin/libs/.ipynb_checkpoints/report-checkpoint.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import plotly.io as io
-
-def make_report(Q_over_time, atgc, gc, project, pos):
-    html_string = '''
-
-
-
-
-    ''' + project + '''
-
-
-
-    Qscore across ''' + str(pos) + ''' bases
-
-    ''' + io.to_html(Q_over_time) + '''
-
-
-
-    Sequence content across ''' + str(pos) + ''' bases
-
-    ''' + io.to_html(atgc) + '''
-
-
-    Per base GC content
-
-    ''' + io.to_html(gc) + '''
-
-    '''
-
-    out_name = project+'.html'
-    f = open(out_name,'w')
-    f.write(html_string)
-    f.close()
\ No newline at end of file
diff --git a/bin/libs/__pycache__/figs.cpython-311.pyc b/bin/libs/__pycache__/figs.cpython-311.pyc
deleted file mode 100755
index 88fa020..0000000
Binary files a/bin/libs/__pycache__/figs.cpython-311.pyc and /dev/null differ
diff --git a/bin/libs/__pycache__/report.cpython-311.pyc b/bin/libs/__pycache__/report.cpython-311.pyc
deleted file mode 100755
index 9c6f75d..0000000
Binary files a/bin/libs/__pycache__/report.cpython-311.pyc and /dev/null differ
diff --git a/bin/libs/__pycache__/toullig.cpython-311.pyc b/bin/libs/__pycache__/toullig.cpython-311.pyc
deleted file mode 100755
index 0a5b239..0000000
Binary files a/bin/libs/__pycache__/toullig.cpython-311.pyc and /dev/null differ
diff --git a/bin/libs/report.py b/bin/libs/report.py
deleted file mode 100755
index 7c9ef81..0000000
--- a/bin/libs/report.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import plotly.io as io
-
-def make_report(Q_over_time, atgc, gc, project, pos):
-    html_string = '''
-
-
-
-
-    ''' + project + '''
-
-
-
-    Qscore across ''' + str(pos) + ''' bases
-
-    ''' + io.to_html(Q_over_time) + '''
-
-
-
-    Sequence content across ''' + str(pos) + ''' bases
-
-    ''' + io.to_html(atgc) + '''
-
-
-    Per base GC content
-
-    ''' + io.to_html(gc) + '''
-
-    '''
-
-    out_name = project+'.html'
-    f = open(out_name,'w')
-    f.write(html_string)
-    f.close()
\ No newline at end of file
diff --git a/bin/template_maker.py b/bin/template_maker.py
index df72339..61f02cd 100755
--- a/bin/template_maker.py
+++ b/bin/template_maker.py
@@ -20,26 +20,33 @@
 RESET = "\033[0m"
 
 logging.basicConfig(level=logging.INFO, format=GRAY+ '%(message)s' +RESET)
 
-parser = argparse.ArgumentParser(description="Script for generating template sequences for scRNAseq simulation.")
-parser.add_argument('-r','--transcriptome', type=str, help="Path to the transcriptome FASTA file.")
-parser.add_argument('-m','--matrix', type=str, help="Path to the SPARSim matrix CSV file.")
-parser.add_argument('-g','--gtf', type=str, help="Path to the annotation GTF file.")
-parser.add_argument('-b','--unfilteredBC', type=str, help="Path to the unfiltered barcode counts CSV file.")
+def setup_template_parameters(parent_parser):
+    description="Nanopore template maker."
+    if parent_parser:
+        parser = argparse.ArgumentParser(parents=[parent_parser],
+                                         description=description)
+    else:
+        parser = argparse.ArgumentParser(description=description)
+    parser.add_argument('-r','--transcriptome', type=str, help="Path to the transcriptome FASTA file.")
+    parser.add_argument('-m','--matrix', type=str, help="Path to the SPARSim matrix CSV file.")
+    parser.add_argument('-g','--gtf', type=str, help="Path to the annotation GTF file.")
+    parser.add_argument('-b','--unfilteredBC', type=str, help="Path to the unfiltered barcode counts CSV file.")
 
-parser.add_argument('-o','--outFasta', type=str, default="template.fa", help="Output file name for the generated template sequences.")
-parser.add_argument('-t','--threads', type=int, default=1, help="Number of threads to use.")
-parser.add_argument('-f','--features', type=str, default="transcript_id", help="Feature rownames in input matrix.")
-parser.add_argument('--on_disk', action='store_true', help="Whether to use Faidx to index the FASTA or load the FASTA in memory.")
-parser.add_argument('-c','--amp', type=int, default=1, help="amplification rate.")
+    parser.add_argument('-o','--outFasta', type=str, default="template.fa", help="Output file name for the generated template sequences.")
+    parser.add_argument('-t','--threads', type=int, default=1, help="Number of threads to use.")
+    parser.add_argument('-f','--features', type=str, default="transcript_id", help="Feature rownames in input matrix.")
+    parser.add_argument('--on_disk', action='store_true', help="Whether to use Faidx to index the FASTA or load the FASTA in memory.")
+    parser.add_argument('-c','--amp', type=int, default=1, help="Amplification rate.")
 
-parser.add_argument('--full_length', action='store_true', help="Simulate a full length transcripts.")
-parser.add_argument('--length_dist', type=str, default="0.37,0.0,824.94", help="amplification rate.")
+    parser.add_argument('--full_length', action='store_true', help="Simulate full-length transcripts.")
+    parser.add_argument('--length_dist', type=str, default="0.37,0.0,824.94", help="Read length distribution parameters.")
 
-parser.add_argument('--adapter', type=str, default="ATGCGTAGTCAGTCATGATC", help="Adapter sequence.")
-parser.add_argument('--TSO', type=str, default="ATGCGTAGTCAGTCATGATC", help="TSO sequence.")
-parser.add_argument('--len_dT', type=str, default=15, help="Poly-dT sequence.")
-parser.add_argument('--log', type=str, help="Path to the log file CSV.")
+    parser.add_argument('--adapter', type=str, default="ATGCGTAGTCAGTCATGATC", help="Adapter sequence.")
+    parser.add_argument('--TSO', type=str, default="ATGCGTAGTCAGTCATGATC", help="TSO sequence.")
+    parser.add_argument('--len_dT', type=int, default=15, help="Length of the poly-dT tail.")
+    parser.add_argument('--log', type=str, help="Path to the log file CSV.")
+    return parser
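+
+# Hypothetical standalone example; the same flags are reachable through the
+# combined CLI (`python bin/AsaruSim.py template_maker ...`):
+#   python bin/template_maker.py -m matrix.csv -r transcriptome.fa -o template.fa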
help="Adapter sequence.") + parser.add_argument('--TSO', type=str, default="ATGCGTAGTCAGTCATGATC", help="TSO sequence.") + parser.add_argument('--len_dT', type=str, default=15, help="Poly-dT sequence.") + parser.add_argument('--log', type=str, help="Path to the log file CSV.") + return parser class Transcriptome: @@ -159,9 +166,8 @@ def make_template(self, cell, umi_counts, transcripts_index, features): self.unfound += unfound -def main(): +def template_maker(args): - args = parser.parse_args() logging.info("_____________________________") logging.info("Output file name : %s" , args.outFasta) logging.info("Threads: %d", args.threads) @@ -295,6 +301,7 @@ def main(): str(generator.counter)+ "\nUnknown transcript counts: "+str(generator.unfound)) - if __name__ == "__main__": - main() + args = setup_template_parameters(None).parse_args() + if args: + template_maker(args) diff --git a/bin/toolkit.py b/bin/toolkit.py index 94c489e..3ac2ba8 100755 --- a/bin/toolkit.py +++ b/bin/toolkit.py @@ -232,6 +232,37 @@ def cDNA_pool(trns, transcriptome): return sequence[:] +def read_template(fasta, thread=1, batch_size=0): + """ + Function to read and extract sequences from a FASTQ file. It supports both plain text and gzipped files. + Inputs: + fastq: String, path to the FASTQ file. + Returns: List of sequences encoded in ASCII. + """ + if thread==1: + with open(fasta, 'rt') as f: + while True: + header = f.readline() + if not header: break + name = header[1:].strip() + seq = f.readline().strip() + yield name, seq + else: + batch = [] + with open(fasta, 'r') as f: + while True: + header = f.readline() + if not header: break + name = header[1:].strip() + seq = f.readline().strip() + batch.append([name, seq]) + if len(batch) == batch_size: + yield batch + batch = [] + if len(batch) > 0: + yield batch + + def multiprocessing_submit(func, iterator, n_process=mp.cpu_count()-1 ,pbar = True, pbar_update = 500, *arg, **kwargs): executor = ProcessPoolExecutor(n_process) diff --git a/main.nf b/main.nf index 015e22f..6b636c8 100755 --- a/main.nf +++ b/main.nf @@ -1,3 +1,46 @@ +params.help = false + +if (params.help) { + println """ + Usage: + nextflow run main.nf --matrix [options] + + Description: + ASARU - SINGLE CELL NANOPORE READS SIMULATOR PIPELINE + Simulates nanopore reads from single-cell data. + + Required parameters: + --matrix Path to the expression matrix. + --transcriptome Path to the transcriptome file. + --matrix_rownames Row names for the matrix. + + Optional parameters: + --barcodes_counts Distribution of barcode counts. + --full_length Indicates if transcripts are full length [default: false]. + --simulate_cell_types Simulate cell types [default: false]. + --cell_type_annotation Path to cell type annotation. + --gtf Path to the GTF file. + --trained_model Pre-trained error model. + --identity_model Identity model for Badread. + --error_model Custom error model. + --qscore_model Q score model. + --build_model Build an error model [default: false]. + --fastq_model Path to the FASTQ model. + --ref_genome Path to the reference genome. + --umi_duplication UMI duplication. + --pcr_cycles Number of PCR amplification cycles. + --pcr_dup_rate PCR duplication rate. + --pcr_error_rate PCR error rate. + --pcr_total_reads Total number of PCR reads. + --outdir Output directory. 
+ + Example: + nextflow run main.nf --matrix 'path/to/matrix.csv' --transcriptome 'path/to/transcriptome.fa' --outdir 'results' + + """ + System.exit(0) +} + log.info """\ ASARU - SINGLE CELL NANOPORE READS SIMULATOR P I P E L I N E =================================== @@ -44,16 +87,24 @@ include { ERRORS_SIMULATOR } from './modules/modules.nf' include { GROUND_TRUTH } from './modules/modules.nf' include { QC } from './modules/modules.nf' +def validSPARSIMOptions = ['Bacher', 'Camp', 'Chu', 'Engel', 'Horning', 'Tung', 'Zheng', 'Macosko', 'Saunders'] +validSPARSIMOptions = validSPARSIMOptions.collect { it + '_param_preset' } + workflow { transcriptome_ch = Channel.fromPath(params.transcriptome, checkIfExists: true) - matrix_ch = params.matrix != null ? file(params.matrix, checkIfExists: true) : + if (params.matrix in validSPARSIMOptions){ + matrix_ch = file(params.matrix, type: "file") + + } else { + matrix_ch = params.matrix != null ? file(params.matrix, checkIfExists: true) : file("no_matrix_counts", type: "file") + } barcodes_ch = params.bc_counts != null ? file(params.bc_counts, checkIfExists: true) : file("no_barcode_counts", type: "file") - if (barcodes_ch.name == "no_barcode_counts" && matrix_ch.name == "no_matrix_counts") { + if (barcodes_ch.name == "no_barcode_counts" && matrix_ch.name == "no_matrix_counts" ) { log.error("Please provide one of the parameters: 'matrix counts' or 'barcodes counts'.") System.exit(1) } @@ -61,7 +112,7 @@ workflow { gtf_ch = params.features != "transcript_id" ? Channel.fromPath(params.gtf, checkIfExists: true) : file("no_gtf", type: "file") - cell_types_ch = params.sim_celltypes == true ? Channel.fromPath(params.cell_types_annotation, checkIfExists: true) : + cell_types_ch = params.sim_celltypes == true && (params.matrix !in validSPARSIMOptions)? Channel.fromPath(params.cell_types_annotation, checkIfExists: true) : file("no_cell_types", type: "file") error_model_ch = params.error_model != null ? file(params.error_model) : diff --git a/modules/PCR.nf b/modules/PCR.nf index 65528e9..753a0bf 100644 --- a/modules/PCR.nf +++ b/modules/PCR.nf @@ -11,7 +11,7 @@ process PCR_SIMULATOR { def totalNamber = params.pcr_total_reads == null? "" : "--totalNamber $params.pcr_total_reads" """ - python3.11 $projectDir/bin/PCR.py -f ${fasta} \ + python3.11 $projectDir/bin/AsaruSim.py PCR -f ${fasta} \ --cycles $params.pcr_cycles \ --dup $params.pcr_dup_rate \ --error $params.pcr_error_rate \ diff --git a/modules/modules.nf b/modules/modules.nf index 1764112..dde3fb3 100755 --- a/modules/modules.nf +++ b/modules/modules.nf @@ -1,5 +1,6 @@ process COUNT_SIMULATOR { publishDir params.outdir, mode:'copy' + cache false input: path matrix @@ -35,7 +36,7 @@ process TEMPLATE_MAKER { def unfiltered = barcodes.name != "no_barcode_counts"? "--unfilteredBC $barcodes" : "" def full_length = params.full_length? "--full_length" : "" """ - python3.11 $projectDir/bin/template_maker.py --matrix ${matrix} \ + python3.11 $projectDir/bin/AsaruSim.py template_maker --matrix ${matrix} \ --transcriptome $transcriptome \ $unfiltered \ $gtf \ @@ -74,7 +75,7 @@ process ERRORS_SIMULATOR { qscore_model_arg = qscore_model.name != "no_qscore_model" ? 
"--badread-qscore-model $qscore_model" : "" } """ - python3.11 $projectDir/bin/SLSim/error_simulator.py --thread $params.threads \ + python3.11 $projectDir/bin/AsaruSim.py call_badread --thread $params.threads \ -t $template \ --badread-identity ${identity.trim()} \ $error_model_arg \ @@ -101,7 +102,6 @@ process GROUND_TRUTH { process QC { publishDir params.outdir, mode:'copy' - cache false input: path fastq