From fe41445070a1c9e487d1fab5c39c57a37988f4ad Mon Sep 17 00:00:00 2001
From: Leon Hafner
Date: Thu, 19 Sep 2024 13:58:33 +0200
Subject: [PATCH] added tfactivity

---
 .gitignore           |   1 +
 assets/end_to_end.py | 197 +++++++++++++++++++++++++++++++++++++------
 2 files changed, 171 insertions(+), 27 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5124c9a..d69d807 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ results/
 testing/
 testing*
 *.pyc
+pyrightconfig.json
diff --git a/assets/end_to_end.py b/assets/end_to_end.py
index 3c8f46e..7be5303 100644
--- a/assets/end_to_end.py
+++ b/assets/end_to_end.py
@@ -1,56 +1,87 @@
 import os
+import sys
 import argparse
 import string
 
 import pandas as pd
+from typing import List
 
 parser = argparse.ArgumentParser()
-parser.add_argument('--profile', type=str, required=False, help='nextflow profile', default="apptainer")
-parser.add_argument('--outdir', type=str, required=False, help='Path to output directory')
 parser.add_argument('--fasta', type=str, required=False, help='Path to the FASTA file')
 parser.add_argument('--gtf', type=str, required=False, help='Path to the GTF file')
-parser.add_argument('--apptainer_cache', type=str, required=False, help='Path to the apptainer cache directory')
-parser.add_argument('--process_executor', type=str, required=False, help='Executor for the nextflow pipelines', default='local')
+parser.add_argument('--genome', type=str, required=False, help='genome name/version')
+parser.add_argument('--motifs', type=str, required=False, help='Path to JASPAR motif file')
+parser.add_argument('--taxon_id', type=str, required=False, help='JASPAR taxon ID for motif download')
+parser.add_argument('--outdir', type=str, required=False, help='Path to output directory')
 parser.add_argument('--rna_seq', type=str, required=False, help='RNA-Seq directory')
 parser.add_argument('--chip_seq', type=str, required=False, help='ChIP-Seq directory')
-parser.add_argument('--atac_seq', type=str, required=False, help='ATAC-Seq directory')
+
+parser.add_argument('--profile', type=str, required=False, help='nextflow profile', default="apptainer")
+parser.add_argument('--apptainer_cache', type=str, required=False, help='Path to the apptainer cache directory')
+parser.add_argument('--process_executor', type=str, required=False, help='Executor for the nextflow pipelines', default='local')
+parser.add_argument('--process_queue', type=str, required=False, help='scheduler queue')
 
 args = parser.parse_args()
 
-profile = args.profile
-outdir = args.outdir
+if args.taxon_id is None:
+    if args.genome is None or args.motifs is None:
+        print('Error: If taxon_id is not specified, both genome and motifs must be provided.')
+        sys.exit(1)
+
 fasta = args.fasta
 gtf = args.gtf
-apptainer_cache = args.apptainer_cache
-process_executor = args.process_executor
+genome = args.genome
+motifs = args.motifs
+taxon_id = args.taxon_id
+outdir = args.outdir
 rna_seq = args.rna_seq
 chip_seq = args.chip_seq
-atac_seq = args.atac_seq
-
-# TODO: Remove hard coded parameters
-fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genome.fa"
-gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genes.gtf"
-rna_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/rna_seq"
-chip_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/chip_seq"
-outdir = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/output"
-apptainer_cache = "/nfs/scratch/apptainer_cache"
-process_executor = 'local'
+profile = args.profile
+apptainer_cache = args.apptainer_cache
+process_executor = args.process_executor
+process_queue = args.process_queue
 
-os.environ["NXF_APPTAINER_CACHEDIR"] = apptainer_cache
+os.environ["NXF_APPTAINER_CACHEDIR"] = apptainer_cache
 
 # Create output directory if not exisiting
 if not os.path.exists(outdir):
     os.makedirs(outdir)
 
 
-def list_files_recursive(directory):
+def list_files(directory, prefix=None, suffix=None, recursive=False) -> List[str]:
+    """
+    List files in the specified directory, optionally including subdirectories.
+
+    Parameters:
+        directory (str): The root directory to start the search.
+        prefix (str, optional): Filter files that start with this prefix.
+        suffix (str, optional): Filter files that end with this suffix.
+        recursive (bool, optional): If True, search subdirectories as well.
+            If False, search only the specified directory.
+            Default is False.
+
+    Returns:
+        List[str]: List of file paths matching the criteria.
+    """
+
     file_list = []
-    for root, _, files in os.walk(directory):
-        for file in files:
-            file_list.append(os.path.join(root, file))
+
+    if recursive:
+        for root, _, files in os.walk(directory):
+            for file in files:
+                if (prefix is None or file.startswith(prefix)) and (suffix is None or file.endswith(suffix)):
+                    file_list.append(os.path.join(root, file))
+    else:
+        for file in os.listdir(directory):
+            full_path = os.path.join(directory, file)
+            if os.path.isfile(full_path):
+                if (prefix is None or file.startswith(prefix)) and (suffix is None or file.endswith(suffix)):
+                    file_list.append(full_path)
+
     return file_list
@@ -62,7 +93,7 @@ def list_files_recursive(directory):
 if not os.path.exists(path_nfcore_rnaseq):
     os.makedirs(path_nfcore_rnaseq)
 
-file_paths = list_files_recursive(rna_seq)
+file_paths = list_files(rna_seq, recursive=True)
 
 samples = {}
 for file_path in file_paths:
@@ -98,11 +129,12 @@ def list_files_recursive(directory):
     --genome null \
     -profile {profile} \
     -process.executor {process_executor} \
+    -process.queue {process_queue} \
     -resume \
     --skip_deseq2_qc true
 """
 os.chdir(path_nfcore_rnaseq)
-os.system(rnaseq_run)
+#os.system(rnaseq_run)
 
 
 # Run nfcore/chipseq pipeline
@@ -113,7 +145,7 @@ def list_files_recursive(directory):
 if not os.path.exists(path_nfcore_chipseq):
     os.makedirs(path_nfcore_chipseq)
 
-file_paths = list_files_recursive(chip_seq)
+file_paths = list_files(chip_seq, recursive=True)
 
 samples = {}
 for file_path in file_paths:
@@ -168,9 +200,120 @@ def list_files_recursive(directory):
     --save_align_intermeds \
     -profile {profile} \
     -process.executor {process_executor} \
+    -process.queue {process_queue} \
     -resume \
     --skip_preseq true
 """
 os.chdir(path_nfcore_chipseq)
-os.system(chipseq_run)
+#os.system(chipseq_run)
+
+
+# Run nfcore/tfactivity pipeline
+path_nfcore_tfactivity = os.path.join(outdir, 'nfcore-tfactivity')
+path_outdir_tfactivity = os.path.join(path_nfcore_tfactivity, 'output')
+path_samplesheet_tfactivity_peaks = os.path.join(path_nfcore_tfactivity, 'samplesheet_tfactivity_peaks.csv')
+path_samplesheet_tfactivity_bams = os.path.join(path_nfcore_tfactivity, 'samplesheet_tfactivity_bams.csv')
+
+path_design_tfactivity_rna = os.path.join(path_nfcore_tfactivity, 'design_tfactivity_rna.csv')
+path_counts_tfactivity_rna = os.path.join(path_nfcore_tfactivity, 'counts_tfactivity_rna.csv')
+
+if not os.path.exists(path_nfcore_tfactivity):
+    os.makedirs(path_nfcore_tfactivity)
+
+# Create samplesheet peaks
+file_paths = list_files(os.path.join(path_outdir_chipseq, 'bwa', 'merged_library', 'macs3', 'broad_peak'), suffix='.broadPeak')
+
+samples = {}
+for file_path in file_paths:
+    basename = os.path.basename(file_path)
+    condition, antibody, sample, rep, _ = basename.split('_')
+
+    sample_id = '_'.join([condition, antibody, sample, rep])
+
+    if sample_id not in samples:
+        # TODO: Set 'footprinting' for ChIP-Seq but not for ATAC-Seq and DNase-Seq
+        samples[sample_id] = {
+            'sample': sample_id,
+            'condition': condition,
+            'assay': antibody,
+            'peak_file': file_path,
+        }
+    else:
+        raise ValueError('Duplicate sample_id detected!')
+
+pd.DataFrame.from_dict(samples, orient='index').sort_values(by='sample').to_csv(path_samplesheet_tfactivity_peaks, index=False)
+
+# Create samplesheet bams
+file_paths = list_files(os.path.join(path_outdir_chipseq, 'bwa', 'merged_library'), suffix='.mLb.clN.sorted.bam')
+
+samples = {}
+controls = {}
+for file_path in file_paths:
+    basename = os.path.basename(file_path)
+    condition, antibody, sample, rep = basename.split('.')[0].split('_')
+
+    sample_id = '_'.join([condition, antibody, sample, rep])
+
+    if antibody != 'CONTROL' and sample_id not in samples:
+        samples[sample_id] = {
+            'sample': sample_id,
+            'condition': condition,
+            'assay': antibody,
+            'signal': file_path,
+            'merge_key': f'{condition}_{sample}_{rep}',
+        }
+    elif antibody == 'CONTROL' and sample_id not in controls:
+        controls[sample_id] = {
+            'merge_key': f'{condition}_{sample}_{rep}',
+            'control': file_path,
+        }
+    else:
+        raise ValueError('Duplicated sample_id detected!')
+
+samplesheet_bam = pd.DataFrame.from_dict(samples, orient='index')
+controls = pd.DataFrame.from_dict(controls, orient='index')
+
+samplesheet_bam = samplesheet_bam.merge(controls, on='merge_key', how='inner').drop(columns='merge_key').sort_values(by='sample')
+samplesheet_bam.to_csv(path_samplesheet_tfactivity_bams, index=False)
+
+# Convert rna counts to csv
+# TODO: Remove hardcoded aligner 'star_salmon' when adding aligner choice
+counts_rna_tsv = pd.read_csv(os.path.join(path_outdir_rnaseq, 'star_salmon', 'salmon.merged.gene_counts.tsv'), sep='\t')
+counts_rna_tsv['gene_id'].to_csv(path_counts_tfactivity_rna, header=False, index=False)
+
+counts_design = {}
+for sample in counts_rna_tsv.drop(columns=['gene_id', 'gene_name']).columns:
+    condition = sample.split('_')[0]
+    path_sample_counts = os.path.join(path_nfcore_tfactivity, f'{sample}_counts.txt')
+
+    counts_rna_tsv[sample].to_csv(path_sample_counts, index=False, header=False)
+    counts_design[sample] = {
+        'sample': sample,
+        'condition': condition,
+        'counts_file': path_sample_counts,
+    }
+
+pd.DataFrame.from_dict(counts_design, orient='index').to_csv(path_design_tfactivity_rna, index=False)
+
+tfactivity_run = f"""
+nextflow run \
+    /nfs/home/students/l.hafner/inspect/tfactivity/dev/main.nf \
+    nf-core/tfactivity \
+    --input {path_samplesheet_tfactivity_peaks} \
+    --input_bam {path_samplesheet_tfactivity_bams} \
+    --counts {path_counts_tfactivity_rna} \
+    --counts_design {path_design_tfactivity_rna} \
+    --motifs {motifs} \
+    --genome {genome} \
+    --fasta {fasta} \
+    --gtf {gtf} \
+    --outdir {path_outdir_tfactivity} \
+    -profile {profile} \
+    -process.executor {process_executor} \
+    -process.queue {process_queue} \
+    -resume
+"""
+
+os.chdir(path_nfcore_tfactivity)
+os.system(tfactivity_run)
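
For reference, a minimal usage sketch (not part of the patch) of how the updated script might be invoked once this change is applied; every path, the genome label, and the queue name below are placeholder values:

    python assets/end_to_end.py \
        --fasta /path/to/genome.fa \
        --gtf /path/to/genes.gtf \
        --genome GRCh38 \
        --motifs /path/to/jaspar_motifs.txt \
        --rna_seq /path/to/rna_seq \
        --chip_seq /path/to/chip_seq \
        --outdir /path/to/output \
        --profile apptainer \
        --apptainer_cache /path/to/apptainer_cache \
        --process_executor slurm \
        --process_queue normal

Because --taxon_id is omitted here, the new guard at the top of the script requires both --genome and --motifs to be set; supplying --taxon_id instead lets the motifs be downloaded from JASPAR.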