From 7df41d622cc19d40c97ab953b8d244654fba302e Mon Sep 17 00:00:00 2001 From: Leon Hafner Date: Fri, 2 Aug 2024 15:54:18 +0200 Subject: [PATCH 1/6] added end2end rna-seq draft --- assets/end_to_end.py | 89 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 assets/end_to_end.py diff --git a/assets/end_to_end.py b/assets/end_to_end.py new file mode 100644 index 0000000..b40259c --- /dev/null +++ b/assets/end_to_end.py @@ -0,0 +1,89 @@ +import os +import argparse +import pandas as pd + +parser = argparse.ArgumentParser() + +parser.add_argument('--profile', type=str, required=False, help='nextflow profile', default="apptainer") +parser.add_argument('--outdir', type=str, required=False, help='Path to output directory') +parser.add_argument('--fasta', type=str, required=False, help='Path to the FASTA file') +parser.add_argument('--gtf', type=str, required=False, help='Path to the GTF file') + +parser.add_argument('--rna_seq', type=str, required=False, help='RNA-Seq directory') +parser.add_argument('--chip_seq', type=str, required=False, help='ChIP-Seq directory') +parser.add_argument('--atac_seq', type=str, required=False, help='ATAC-Seq directory') + +args = parser.parse_args() + +profile = args.profile +outdir = args.outdir +fasta = args.fasta +gtf = args.gtf + +rna_seq = args.rna_seq +chip_seq = args.chip_seq +atac_seq = args.atac_seq + +fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genome.fa" +gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genes.gtf" +rna_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/rna_seq" +outdir = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/output" + + +# Start only if output directory does not exist +#if os.path.exists(outdir): +# raise FileExistsError("Output directory already existing") + + +def list_files_recursive(directory): + file_list = [] + for root, _, files in os.walk(directory): + for file in files: + file_list.append(os.path.join(root, file)) + return file_list + + + +# Run nf-core/rnaseq pipeline +path_rnaseq = os.path.join(outdir, 'nfcore-rnaseq') +path_outdir_rnaseq = os.path.join(path_rnaseq, 'output') +path_samplesheet_rnaseq = os.path.join(path_rnaseq, 'samplesheet_rnaseq.csv') + +if not os.path.exists(path_rnaseq): + os.makedirs(path_rnaseq) + +file_paths = list_files_recursive(rna_seq) + +samples = {} +for file_path in file_paths: + condition = file_path.split('/')[-2] + basename = os.path.basename(file_path) + + # Remove .fq.gz and split file name + sample, rep, read = basename.split('.')[0].split('_') + + sample_name = "_".join([condition, sample, rep]) + + if sample_name not in samples: + samples[sample_name] = {'sample': sample_name, 'fastq_1': '', 'fastq_2': '', 'strandedness': 'auto'} + + if read == 'R1': + samples[sample_name]['fastq_1'] = file_path + elif read == 'R2': + samples[sample_name]['fastq_2'] = file_path + +df = pd.DataFrame.from_dict(samples, orient='index').to_csv(path_samplesheet_rnaseq, index=False) + +rnaseq_run = f""" +nextflow run \ + nf-core/rnaseq \ + --input {path_samplesheet_rnaseq} \ + --outdir {path_outdir_rnaseq} \ + --gtf {gtf} \ + --fasta {fasta} \ + --igenomes_ignore \ + --genome null \ + -profile {profile} +""" +os.chdir(path_rnaseq) +os.system(rnaseq_run) From de9fa815a41092337cb4e519019ae939e46b62a1 Mon Sep 17 00:00:00 2001 From: Leon Hafner Date: Mon, 2 Sep 2024 22:11:49 +0200 Subject: [PATCH 2/6] add chipseq --- assets/end_to_end.py | 75 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 66 insertions(+), 9 deletions(-) diff --git a/assets/end_to_end.py b/assets/end_to_end.py index b40259c..2ce4c71 100644 --- a/assets/end_to_end.py +++ b/assets/end_to_end.py @@ -8,6 +8,7 @@ parser.add_argument('--outdir', type=str, required=False, help='Path to output directory') parser.add_argument('--fasta', type=str, required=False, help='Path to the FASTA file') parser.add_argument('--gtf', type=str, required=False, help='Path to the GTF file') +parser.add_argument('--apptainer_cache', type=str, required=False, help='Path to the apptainer cache directory') parser.add_argument('--rna_seq', type=str, required=False, help='RNA-Seq directory') parser.add_argument('--chip_seq', type=str, required=False, help='ChIP-Seq directory') @@ -19,6 +20,7 @@ outdir = args.outdir fasta = args.fasta gtf = args.gtf +apptainer_cache = args.apptainer_cache rna_seq = args.rna_seq chip_seq = args.chip_seq @@ -27,12 +29,16 @@ fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genome.fa" gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genes.gtf" rna_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/rna_seq" +chip_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/chip_seq" outdir = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/output" +apptainer_cache = "/nfs/scratch/apptainer_cache" + +os.environ["NXF_APPTAINER_CACHEDIR"] = apptainer_cache # Start only if output directory does not exist -#if os.path.exists(outdir): -# raise FileExistsError("Output directory already existing") +if not os.path.exists(outdir): + os.makedirs(outdir) def list_files_recursive(directory): @@ -45,12 +51,12 @@ def list_files_recursive(directory): # Run nf-core/rnaseq pipeline -path_rnaseq = os.path.join(outdir, 'nfcore-rnaseq') -path_outdir_rnaseq = os.path.join(path_rnaseq, 'output') -path_samplesheet_rnaseq = os.path.join(path_rnaseq, 'samplesheet_rnaseq.csv') +path_nfcore_rnaseq = os.path.join(outdir, 'nfcore-rnaseq') +path_outdir_rnaseq = os.path.join(path_nfcore_rnaseq, 'output') +path_samplesheet_rnaseq = os.path.join(path_nfcore_rnaseq, 'samplesheet_rnaseq.csv') -if not os.path.exists(path_rnaseq): - os.makedirs(path_rnaseq) +if not os.path.exists(path_nfcore_rnaseq): + os.makedirs(path_nfcore_rnaseq) file_paths = list_files_recursive(rna_seq) @@ -83,7 +89,58 @@ def list_files_recursive(directory): --fasta {fasta} \ --igenomes_ignore \ --genome null \ - -profile {profile} + -profile {profile} \ + -resume \ + --skip_deseq2_qc true """ -os.chdir(path_rnaseq) +os.chdir(path_nfcore_rnaseq) os.system(rnaseq_run) + + +# Run nfcore/chipseq pipeline +path_nfcore_chipseq = os.path.join(outdir, 'nfcore-chipseq') +path_outdir_chipseq = os.path.join(path_nfcore_chipseq, 'output') +path_samplesheet_chipseq = os.path.join(path_nfcore_chipseq, 'samplesheet_chipeq.csv') + +if not os.path.exists(path_nfcore_chipseq): + os.makedirs(path_nfcore_chipseq) + +file_paths = list_files_recursive(chip_seq) + +samples = {} +for file_path in file_paths: + condition = file_path.split('/')[-3] + antibody = file_path.split('/')[-2] + basename = os.path.basename(file_path) + + # Remove .fq.gz and split file name + sample, rep, read = basename.split('.')[0].split('_') + + sample_name = "_".join([condition, antibody, sample, rep]) + + if sample_name not in samples: + samples[sample_name] = {'sample': sample_name, 'fastq_1': '', 'fastq_2': '', 'antibody': antibody, 'control': f'{condition}_CONTROL_{sample}_{rep}'} + + if read == 'R1': + samples[sample_name]['fastq_1'] = file_path + elif read == 'R2': + samples[sample_name]['fastq_2'] = file_path + +chipseq_samplesheet = pd.DataFrame.from_dict(samples, orient='index') +chipseq_samplesheet['control'] = chipseq_samplesheet['control'].apply(lambda x: x if x in chipseq_samplesheet['sample'].values else '') +chipseq_samplesheet.to_csv(path_samplesheet_chipseq, index=False) + +chipseq_run = f""" +nextflow run \ + nf-core/chipseq \ + --input {path_samplesheet_chipseq} \ + --outdir {path_outdir_chipseq} \ + --gtf {gtf} \ + --fasta {fasta} \ + --read_length 150 \ + -profile docker \ + -resume +""" + +os.chdir(path_nfcore_chipseq) +os.system(chipseq_run) \ No newline at end of file From 49f2acbc37d077198771a71796e2053dbd3aa93a Mon Sep 17 00:00:00 2001 From: Leon Hafner Date: Fri, 13 Sep 2024 12:30:07 +0200 Subject: [PATCH 3/6] Use nf-core/chipseq dev revision --- assets/end_to_end.py | 53 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 12 deletions(-) diff --git a/assets/end_to_end.py b/assets/end_to_end.py index 2ce4c71..ed69db1 100644 --- a/assets/end_to_end.py +++ b/assets/end_to_end.py @@ -1,5 +1,6 @@ import os import argparse +import string import pandas as pd parser = argparse.ArgumentParser() @@ -9,6 +10,7 @@ parser.add_argument('--fasta', type=str, required=False, help='Path to the FASTA file') parser.add_argument('--gtf', type=str, required=False, help='Path to the GTF file') parser.add_argument('--apptainer_cache', type=str, required=False, help='Path to the apptainer cache directory') +parser.add_argument('--process_executor', type=str, required=False, help='Executor for the nextflow pipelines', default='local') parser.add_argument('--rna_seq', type=str, required=False, help='RNA-Seq directory') parser.add_argument('--chip_seq', type=str, required=False, help='ChIP-Seq directory') @@ -21,22 +23,25 @@ fasta = args.fasta gtf = args.gtf apptainer_cache = args.apptainer_cache +process_executor = args.process_executor rna_seq = args.rna_seq chip_seq = args.chip_seq atac_seq = args.atac_seq +# TODO: Remove hard coded parameters fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genome.fa" gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genes.gtf" rna_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/rna_seq" chip_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/chip_seq" outdir = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/output" apptainer_cache = "/nfs/scratch/apptainer_cache" +process_executor = 'local' os.environ["NXF_APPTAINER_CACHEDIR"] = apptainer_cache -# Start only if output directory does not exist +# Create output directory if not exisiting if not os.path.exists(outdir): os.makedirs(outdir) @@ -49,7 +54,6 @@ def list_files_recursive(directory): return file_list - # Run nf-core/rnaseq pipeline path_nfcore_rnaseq = os.path.join(outdir, 'nfcore-rnaseq') path_outdir_rnaseq = os.path.join(path_nfcore_rnaseq, 'output') @@ -71,7 +75,10 @@ def list_files_recursive(directory): sample_name = "_".join([condition, sample, rep]) if sample_name not in samples: - samples[sample_name] = {'sample': sample_name, 'fastq_1': '', 'fastq_2': '', 'strandedness': 'auto'} + samples[sample_name] = {'sample': sample_name, + 'fastq_1': '', + 'fastq_2': '', + 'strandedness': 'auto'} if read == 'R1': samples[sample_name]['fastq_1'] = file_path @@ -90,6 +97,7 @@ def list_files_recursive(directory): --igenomes_ignore \ --genome null \ -profile {profile} \ + -process.executor {process_executor} \ -resume \ --skip_deseq2_qc true """ @@ -100,7 +108,7 @@ def list_files_recursive(directory): # Run nfcore/chipseq pipeline path_nfcore_chipseq = os.path.join(outdir, 'nfcore-chipseq') path_outdir_chipseq = os.path.join(path_nfcore_chipseq, 'output') -path_samplesheet_chipseq = os.path.join(path_nfcore_chipseq, 'samplesheet_chipeq.csv') +path_samplesheet_chipseq = os.path.join(path_nfcore_chipseq, 'samplesheet_chipseq.csv') if not os.path.exists(path_nfcore_chipseq): os.makedirs(path_nfcore_chipseq) @@ -116,30 +124,51 @@ def list_files_recursive(directory): # Remove .fq.gz and split file name sample, rep, read = basename.split('.')[0].split('_') - sample_name = "_".join([condition, antibody, sample, rep]) + condition_antibody_sample = "_".join([condition, antibody, sample]) + + sample_rep = '_'.join([condition_antibody_sample, rep]) - if sample_name not in samples: - samples[sample_name] = {'sample': sample_name, 'fastq_1': '', 'fastq_2': '', 'antibody': antibody, 'control': f'{condition}_CONTROL_{sample}_{rep}'} + # Strip 'REP' from replicate + if sample_rep not in samples: + samples[sample_rep] = {'sample': condition_antibody_sample, + 'fastq_1': '', + 'fastq_2': '', + 'replicate': rep.strip(string.ascii_letters), + 'antibody': antibody, + 'control': f'{condition}_CONTROL_{sample}', + 'control_replicate': rep.strip(string.ascii_letters)} if read == 'R1': - samples[sample_name]['fastq_1'] = file_path + samples[sample_rep]['fastq_1'] = file_path elif read == 'R2': - samples[sample_name]['fastq_2'] = file_path + samples[sample_rep]['fastq_2'] = file_path chipseq_samplesheet = pd.DataFrame.from_dict(samples, orient='index') + +# Remove control files that do not exist chipseq_samplesheet['control'] = chipseq_samplesheet['control'].apply(lambda x: x if x in chipseq_samplesheet['sample'].values else '') + +# Remove antibody, control and control_replicate column from dataframe +chipseq_samplesheet.loc[chipseq_samplesheet['sample'].str.contains('CONTROL'), ['antibody', 'control', 'control_replicate']] = '' chipseq_samplesheet.to_csv(path_samplesheet_chipseq, index=False) +# Runs on revision dev as the pipeline has not had an release for some time +# TODO change parameter read_length +# TODO change parameter skip_preseq +# TODO disable processes not needed for downstream pipelines chipseq_run = f""" nextflow run \ nf-core/chipseq \ + -r dev \ --input {path_samplesheet_chipseq} \ --outdir {path_outdir_chipseq} \ --gtf {gtf} \ --fasta {fasta} \ - --read_length 150 \ - -profile docker \ - -resume + --read_length 50 \ + -profile {profile} \ + -process.executor {process_executor} \ + -resume \ + --skip_preseq true """ os.chdir(path_nfcore_chipseq) From 0071e70210b52e38212823190e92e04f84316f9b Mon Sep 17 00:00:00 2001 From: Leon Hafner Date: Fri, 13 Sep 2024 12:47:07 +0200 Subject: [PATCH 4/6] fix linting --- assets/end_to_end.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/assets/end_to_end.py b/assets/end_to_end.py index ed69db1..cf1cfd9 100644 --- a/assets/end_to_end.py +++ b/assets/end_to_end.py @@ -68,18 +68,18 @@ def list_files_recursive(directory): for file_path in file_paths: condition = file_path.split('/')[-2] basename = os.path.basename(file_path) - + # Remove .fq.gz and split file name sample, rep, read = basename.split('.')[0].split('_') sample_name = "_".join([condition, sample, rep]) - + if sample_name not in samples: samples[sample_name] = {'sample': sample_name, 'fastq_1': '', 'fastq_2': '', 'strandedness': 'auto'} - + if read == 'R1': samples[sample_name]['fastq_1'] = file_path elif read == 'R2': @@ -120,14 +120,14 @@ def list_files_recursive(directory): condition = file_path.split('/')[-3] antibody = file_path.split('/')[-2] basename = os.path.basename(file_path) - + # Remove .fq.gz and split file name sample, rep, read = basename.split('.')[0].split('_') condition_antibody_sample = "_".join([condition, antibody, sample]) sample_rep = '_'.join([condition_antibody_sample, rep]) - + # Strip 'REP' from replicate if sample_rep not in samples: samples[sample_rep] = {'sample': condition_antibody_sample, @@ -137,7 +137,7 @@ def list_files_recursive(directory): 'antibody': antibody, 'control': f'{condition}_CONTROL_{sample}', 'control_replicate': rep.strip(string.ascii_letters)} - + if read == 'R1': samples[sample_rep]['fastq_1'] = file_path elif read == 'R2': @@ -172,4 +172,4 @@ def list_files_recursive(directory): """ os.chdir(path_nfcore_chipseq) -os.system(chipseq_run) \ No newline at end of file +os.system(chipseq_run) From d53112c7f7a0df8e73cba5d5d81e99095d9eef1e Mon Sep 17 00:00:00 2001 From: Leon Hafner Date: Fri, 13 Sep 2024 14:17:23 +0200 Subject: [PATCH 5/6] add BAM file output --- assets/end_to_end.py | 1 + 1 file changed, 1 insertion(+) diff --git a/assets/end_to_end.py b/assets/end_to_end.py index cf1cfd9..3c8f46e 100644 --- a/assets/end_to_end.py +++ b/assets/end_to_end.py @@ -165,6 +165,7 @@ def list_files_recursive(directory): --gtf {gtf} \ --fasta {fasta} \ --read_length 50 \ + --save_align_intermeds \ -profile {profile} \ -process.executor {process_executor} \ -resume \ From fe41445070a1c9e487d1fab5c39c57a37988f4ad Mon Sep 17 00:00:00 2001 From: Leon Hafner Date: Thu, 19 Sep 2024 13:58:33 +0200 Subject: [PATCH 6/6] added tfactivtiy --- .gitignore | 1 + assets/end_to_end.py | 197 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 171 insertions(+), 27 deletions(-) diff --git a/.gitignore b/.gitignore index 5124c9a..d69d807 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ results/ testing/ testing* *.pyc +pyrightconfig.json diff --git a/assets/end_to_end.py b/assets/end_to_end.py index 3c8f46e..7be5303 100644 --- a/assets/end_to_end.py +++ b/assets/end_to_end.py @@ -1,56 +1,87 @@ import os +import sys import argparse import string import pandas as pd +from typing import List parser = argparse.ArgumentParser() -parser.add_argument('--profile', type=str, required=False, help='nextflow profile', default="apptainer") -parser.add_argument('--outdir', type=str, required=False, help='Path to output directory') parser.add_argument('--fasta', type=str, required=False, help='Path to the FASTA file') parser.add_argument('--gtf', type=str, required=False, help='Path to the GTF file') -parser.add_argument('--apptainer_cache', type=str, required=False, help='Path to the apptainer cache directory') -parser.add_argument('--process_executor', type=str, required=False, help='Executor for the nextflow pipelines', default='local') +parser.add_argument('--genome', type=str, required=False, help='genome name/version') +parser.add_argument('--motifs', type=str, required=False, help='Path to JASPAR motif file') +parser.add_argument('--taxon_id', type=str, required=False, help='JASPAR taxon ID for motif download') +parser.add_argument('--outdir', type=str, required=False, help='Path to output directory') parser.add_argument('--rna_seq', type=str, required=False, help='RNA-Seq directory') parser.add_argument('--chip_seq', type=str, required=False, help='ChIP-Seq directory') -parser.add_argument('--atac_seq', type=str, required=False, help='ATAC-Seq directory') + +parser.add_argument('--profile', type=str, required=False, help='nextflow profile', default="apptainer") +parser.add_argument('--apptainer_cache', type=str, required=False, help='Path to the apptainer cache directory') +parser.add_argument('--process_executor', type=str, required=False, help='Executor for the nextflow pipelines', default='local') +parser.add_argument('--process_queue', type=str, required=False, help='scheduler queue') args = parser.parse_args() -profile = args.profile -outdir = args.outdir +if args.taxon_id is None: + if args.genome is None or args.motifs is None: + print('Error: If taxon_id is not specified, both fasta and motifs must be provided.') + sys.exit(1) + fasta = args.fasta gtf = args.gtf -apptainer_cache = args.apptainer_cache -process_executor = args.process_executor +genome = args.genome +motifs = args.motifs +taxon_id = args.taxon_id +outdir = args.outdir rna_seq = args.rna_seq chip_seq = args.chip_seq -atac_seq = args.atac_seq -# TODO: Remove hard coded parameters -fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genome.fa" -gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genes.gtf" -rna_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/rna_seq" -chip_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/chip_seq" -outdir = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/output" -apptainer_cache = "/nfs/scratch/apptainer_cache" -process_executor = 'local' +profile = args.profile +apptainer_cache = args.apptainer_cache +process_executor = args.process_executor +process_queue = args.process_queue -os.environ["NXF_APPTAINER_CACHEDIR"] = apptainer_cache +os.environ["NXF_APPTAINER_CACHEDIR"] = apptainer_cache # Create output directory if not exisiting if not os.path.exists(outdir): os.makedirs(outdir) -def list_files_recursive(directory): +def list_files(directory, prefix=None, suffix=None, recursive=False) -> List[str]: + """ + List files in the specified directory, optionally including subdirectories. + + Parameters: + directory (str): The root directory to start the search. + prefix (str, optional): Filter files that start with this prefix. + suffix (str, optional): Filter files that end with this suffix. + recursive (bool, optional): If True, search subdirectories as well. + If False, search only the specified directory. + Default is False. + + Returns: + List[str]: List of file paths matching the criteria. + """ + file_list = [] - for root, _, files in os.walk(directory): - for file in files: - file_list.append(os.path.join(root, file)) + + if recursive: + for root, _, files in os.walk(directory): + for file in files: + if (prefix is None or file.startswith(prefix)) and (suffix is None or file.endswith(suffix)): + file_list.append(os.path.join(root, file)) + else: + for file in os.listdir(directory): + full_path = os.path.join(directory, file) + if os.path.isfile(full_path): + if (prefix is None or file.startswith(prefix)) and (suffix is None or file.endswith(suffix)): + file_list.append(full_path) + return file_list @@ -62,7 +93,7 @@ def list_files_recursive(directory): if not os.path.exists(path_nfcore_rnaseq): os.makedirs(path_nfcore_rnaseq) -file_paths = list_files_recursive(rna_seq) +file_paths = list_files(rna_seq, recursive=True) samples = {} for file_path in file_paths: @@ -98,11 +129,12 @@ def list_files_recursive(directory): --genome null \ -profile {profile} \ -process.executor {process_executor} \ + -process.queue {process_queue} \ -resume \ --skip_deseq2_qc true """ os.chdir(path_nfcore_rnaseq) -os.system(rnaseq_run) +#os.system(rnaseq_run) # Run nfcore/chipseq pipeline @@ -113,7 +145,7 @@ def list_files_recursive(directory): if not os.path.exists(path_nfcore_chipseq): os.makedirs(path_nfcore_chipseq) -file_paths = list_files_recursive(chip_seq) +file_paths = list_files(chip_seq, recursive=True) samples = {} for file_path in file_paths: @@ -168,9 +200,120 @@ def list_files_recursive(directory): --save_align_intermeds \ -profile {profile} \ -process.executor {process_executor} \ + -process.queue {process_queue} \ -resume \ --skip_preseq true """ os.chdir(path_nfcore_chipseq) -os.system(chipseq_run) +#os.system(chipseq_run) + + +# Run nfcore/tfactivity pipeline +path_nfcore_tfactivity = os.path.join(outdir, 'nfcore-tfactivity') +path_outdir_tfactivity = os.path.join(path_nfcore_tfactivity, 'output') +path_samplesheet_tfactivity_peaks = os.path.join(path_nfcore_tfactivity, 'samplesheet_tfactivity_peaks.csv') +path_samplesheet_tfactivity_bams = os.path.join(path_nfcore_tfactivity, 'samplesheet_tfactivity_bams.csv') + +path_design_tfactivity_rna = os.path.join(path_nfcore_tfactivity, 'design_tfactivity_rna.csv') +path_counts_tfactivity_rna = os.path.join(path_nfcore_tfactivity, 'counts_tfactivity_rna.csv') + +if not os.path.exists(path_nfcore_tfactivity): + os.makedirs(path_nfcore_tfactivity) + +# Create samplesheet peaks +file_paths = list_files(os.path.join(path_outdir_chipseq, 'bwa', 'merged_library', 'macs3', 'broad_peak'), suffix='.broadPeak') + +samples = {} +for file_path in file_paths: + basename = os.path.basename(file_path) + condition, antibody, sample, rep, _ = basename.split('_') + + sample_id = '_'.join([condition, antibody, sample, rep]) + + if sample_id not in samples: + # TODO: Set 'footprinting' for ChIP-Seq but not for ATAC-Seq and DNase-Seq + samples[sample_id] = { + 'sample': sample_id, + 'condition': condition, + 'assay': antibody, + 'peak_file': file_path, + } + else: + raise ValueError('Duplicate sample_id detected!') + +pd.DataFrame.from_dict(samples, orient='index').sort_values(by='sample').to_csv(path_samplesheet_tfactivity_peaks, index=False) + +# Create samplesheet bams +file_paths = list_files(os.path.join(path_outdir_chipseq, 'bwa', 'merged_library'), suffix='.mLb.clN.sorted.bam') + +samples = {} +controls = {} +for file_path in file_paths: + basename = os.path.basename(file_path) + condition, antibody, sample, rep = basename.split('.')[0].split('_') + + sample_id = '_'.join([condition, antibody, sample, rep]) + + if antibody != 'CONTROL' and sample_id not in samples: + samples[sample_id] = { + 'sample': sample_id, + 'condition': condition, + 'assay': antibody, + 'signal': file_path, + 'merge_key': f'{condition}_{sample}_{rep}', + } + elif antibody == 'CONTROL' and sample_id not in controls: + controls[sample_id] = { + 'merge_key': f'{condition}_{sample}_{rep}', + 'control': file_path, + } + else: + raise ValueError('Duplicated sample_id detected!') + +samplesheet_bam = pd.DataFrame.from_dict(samples, orient='index') +controls = pd.DataFrame.from_dict(controls, orient='index') + +samplesheet_bam = samplesheet_bam.merge(controls, on='merge_key', how='inner').drop(columns='merge_key').sort_values(by='sample') +samplesheet_bam.to_csv(path_samplesheet_tfactivity_bams, index=False) + +# Convert rna counts to csv +# TODO: Remove hardcoded aligner 'star_salmon' when adding aligner choice +counts_rna_tsv = pd.read_csv(os.path.join(path_outdir_rnaseq, 'star_salmon', 'salmon.merged.gene_counts.tsv'), sep='\t') +counts_rna_tsv['gene_id'].to_csv(path_counts_tfactivity_rna, header=False, index=False) + +counts_design = {} +for sample in counts_rna_tsv.drop(columns=['gene_id', 'gene_name']).columns: + condition = sample.split('_')[0] + path_sample_counts = os.path.join(path_nfcore_tfactivity, f'{sample}_counts.txt') + + counts_rna_tsv[sample].to_csv(path_sample_counts, index=False, header=False) + counts_design[sample] = { + 'sample': sample, + 'condition': condition, + 'counts_file': path_sample_counts, + } + +pd.DataFrame.from_dict(counts_design, orient='index').to_csv(path_design_tfactivity_rna, index=False) + +tfactivity_run = f""" +nextflow run \ + /nfs/home/students/l.hafner/inspect/tfactivity/dev/main.nf \ + nf-core/tfactivity \ + --input {path_samplesheet_tfactivity_peaks} \ + --input_bam {path_samplesheet_tfactivity_bams} \ + --counts {path_counts_tfactivity_rna} \ + --counts_design {path_design_tfactivity_rna} \ + --motifs {motifs} \ + --genome {genome} \ + --fasta {fasta} \ + --gtf {gtf} \ + --outdir {path_outdir_tfactivity} \ + -profile {profile} \ + -process.executor {process_executor} \ + -process.queue {process_queue} \ + -resume +""" + +os.chdir(path_nfcore_tfactivity) +os.system(tfactivity_run)