From fe41445070a1c9e487d1fab5c39c57a37988f4ad Mon Sep 17 00:00:00 2001
From: Leon Hafner
Date: Thu, 19 Sep 2024 13:58:33 +0200
Subject: [PATCH] added tfactivity

---
 .gitignore           |   1 +
 assets/end_to_end.py | 197 +++++++++++++++++++++++++++++++++++++------
 2 files changed, 171 insertions(+), 27 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5124c9a..d69d807 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ results/
 testing/
 testing*
 *.pyc
+pyrightconfig.json
diff --git a/assets/end_to_end.py b/assets/end_to_end.py
index 3c8f46e..7be5303 100644
--- a/assets/end_to_end.py
+++ b/assets/end_to_end.py
@@ -1,56 +1,87 @@
 import os
+import sys
 import argparse
 import string
 
 import pandas as pd
+from typing import List
 
 parser = argparse.ArgumentParser()
-parser.add_argument('--profile', type=str, required=False, help='nextflow profile', default="apptainer")
-parser.add_argument('--outdir', type=str, required=False, help='Path to output directory')
 parser.add_argument('--fasta', type=str, required=False, help='Path to the FASTA file')
 parser.add_argument('--gtf', type=str, required=False, help='Path to the GTF file')
-parser.add_argument('--apptainer_cache', type=str, required=False, help='Path to the apptainer cache directory')
-parser.add_argument('--process_executor', type=str, required=False, help='Executor for the nextflow pipelines', default='local')
+parser.add_argument('--genome', type=str, required=False, help='genome name/version')
+parser.add_argument('--motifs', type=str, required=False, help='Path to JASPAR motif file')
+parser.add_argument('--taxon_id', type=str, required=False, help='JASPAR taxon ID for motif download')
+parser.add_argument('--outdir', type=str, required=False, help='Path to output directory')
 parser.add_argument('--rna_seq', type=str, required=False, help='RNA-Seq directory')
 parser.add_argument('--chip_seq', type=str, required=False, help='ChIP-Seq directory')
-parser.add_argument('--atac_seq', type=str, required=False, help='ATAC-Seq directory')
+
+parser.add_argument('--profile', type=str, required=False, help='nextflow profile', default="apptainer")
+parser.add_argument('--apptainer_cache', type=str, required=False, help='Path to the apptainer cache directory')
+parser.add_argument('--process_executor', type=str, required=False, help='Executor for the nextflow pipelines', default='local')
+parser.add_argument('--process_queue', type=str, required=False, help='scheduler queue')
 
 args = parser.parse_args()
 
-profile = args.profile
-outdir = args.outdir
+if args.taxon_id is None:
+    if args.genome is None or args.motifs is None:
+        print('Error: If taxon_id is not specified, both genome and motifs must be provided.')
+        sys.exit(1)
+
 fasta = args.fasta
 gtf = args.gtf
-apptainer_cache = args.apptainer_cache
-process_executor = args.process_executor
+genome = args.genome
+motifs = args.motifs
+taxon_id = args.taxon_id
+outdir = args.outdir
 rna_seq = args.rna_seq
 chip_seq = args.chip_seq
-atac_seq = args.atac_seq
-
-# TODO: Remove hard coded parameters
-fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genome.fa"
-gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genes.gtf"
-rna_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/rna_seq"
-chip_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/chip_seq"
-outdir = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/output"
-apptainer_cache = "/nfs/scratch/apptainer_cache"
-process_executor = 'local'
+profile = args.profile
+apptainer_cache = args.apptainer_cache
+process_executor = args.process_executor
+process_queue = args.process_queue
 
-os.environ["NXF_APPTAINER_CACHEDIR"] = apptainer_cache
+os.environ["NXF_APPTAINER_CACHEDIR"] = apptainer_cache
 
 # Create output directory if not exisiting
 if not os.path.exists(outdir):
     os.makedirs(outdir)
 
 
-def list_files_recursive(directory):
+def list_files(directory, prefix=None, suffix=None, recursive=False) -> List[str]:
+    """
+    List files in the specified directory, optionally including subdirectories.
+
+    Parameters:
+        directory (str): The root directory to start the search.
+        prefix (str, optional): Filter files that start with this prefix.
+        suffix (str, optional): Filter files that end with this suffix.
+        recursive (bool, optional): If True, search subdirectories as well.
+            If False, search only the specified directory.
+            Default is False.
+
+    Returns:
+        List[str]: List of file paths matching the criteria.
+    """
+
     file_list = []
-    for root, _, files in os.walk(directory):
-        for file in files:
-            file_list.append(os.path.join(root, file))
+
+    if recursive:
+        for root, _, files in os.walk(directory):
+            for file in files:
+                if (prefix is None or file.startswith(prefix)) and (suffix is None or file.endswith(suffix)):
+                    file_list.append(os.path.join(root, file))
+    else:
+        for file in os.listdir(directory):
+            full_path = os.path.join(directory, file)
+            if os.path.isfile(full_path):
+                if (prefix is None or file.startswith(prefix)) and (suffix is None or file.endswith(suffix)):
+                    file_list.append(full_path)
+
     return file_list
@@ -62,7 +93,7 @@ def list_files_recursive(directory):
 if not os.path.exists(path_nfcore_rnaseq):
     os.makedirs(path_nfcore_rnaseq)
 
-file_paths = list_files_recursive(rna_seq)
+file_paths = list_files(rna_seq, recursive=True)
 
 samples = {}
 for file_path in file_paths:
@@ -98,11 +129,12 @@ def list_files_recursive(directory):
     --genome null \
     -profile {profile} \
     -process.executor {process_executor} \
+    -process.queue {process_queue} \
     -resume \
     --skip_deseq2_qc true
 """
 os.chdir(path_nfcore_rnaseq)
-os.system(rnaseq_run)
+#os.system(rnaseq_run)
 
 
 # Run nfcore/chipseq pipeline
@@ -113,7 +145,7 @@ def list_files_recursive(directory):
 if not os.path.exists(path_nfcore_chipseq):
     os.makedirs(path_nfcore_chipseq)
 
-file_paths = list_files_recursive(chip_seq)
+file_paths = list_files(chip_seq, recursive=True)
 
 samples = {}
 for file_path in file_paths:
@@ -168,9 +200,120 @@ def list_files_recursive(directory):
     --save_align_intermeds \
     -profile {profile} \
     -process.executor {process_executor} \
+    -process.queue {process_queue} \
     -resume \
     --skip_preseq true
 """
 os.chdir(path_nfcore_chipseq)
-os.system(chipseq_run)
+#os.system(chipseq_run)
+
+
+# Run nfcore/tfactivity pipeline
+path_nfcore_tfactivity = os.path.join(outdir, 'nfcore-tfactivity')
+path_outdir_tfactivity = os.path.join(path_nfcore_tfactivity, 'output')
+path_samplesheet_tfactivity_peaks = os.path.join(path_nfcore_tfactivity, 'samplesheet_tfactivity_peaks.csv')
+path_samplesheet_tfactivity_bams = os.path.join(path_nfcore_tfactivity, 'samplesheet_tfactivity_bams.csv')
+
+path_design_tfactivity_rna = os.path.join(path_nfcore_tfactivity, 'design_tfactivity_rna.csv')
+path_counts_tfactivity_rna = os.path.join(path_nfcore_tfactivity, 'counts_tfactivity_rna.csv')
+
+if not os.path.exists(path_nfcore_tfactivity):
+    os.makedirs(path_nfcore_tfactivity)
+
+# Create samplesheet peaks
+file_paths = list_files(os.path.join(path_outdir_chipseq, 'bwa', 'merged_library', 'macs3', 'broad_peak'), suffix='.broadPeak')
+
+samples = {}
+for file_path in file_paths:
+    basename = os.path.basename(file_path)
+    condition, antibody, sample, rep, _ = basename.split('_')
+
+    sample_id = '_'.join([condition, antibody, sample, rep])
+
+    if sample_id not in samples:
+        # TODO: Set 'footprinting' for ChIP-Seq but not for ATAC-Seq and DNase-Seq
+        samples[sample_id] = {
+            'sample': sample_id,
+            'condition': condition,
+            'assay': antibody,
+            'peak_file': file_path,
+        }
+    else:
+        raise ValueError('Duplicate sample_id detected!')
+
+pd.DataFrame.from_dict(samples, orient='index').sort_values(by='sample').to_csv(path_samplesheet_tfactivity_peaks, index=False)
+
+# Create samplesheet bams
+file_paths = list_files(os.path.join(path_outdir_chipseq, 'bwa', 'merged_library'), suffix='.mLb.clN.sorted.bam')
+
+samples = {}
+controls = {}
+for file_path in file_paths:
+    basename = os.path.basename(file_path)
+    condition, antibody, sample, rep = basename.split('.')[0].split('_')
+
+    sample_id = '_'.join([condition, antibody, sample, rep])
+
+    if antibody != 'CONTROL' and sample_id not in samples:
+        samples[sample_id] = {
+            'sample': sample_id,
+            'condition': condition,
+            'assay': antibody,
+            'signal': file_path,
+            'merge_key': f'{condition}_{sample}_{rep}',
+        }
+    elif antibody == 'CONTROL' and sample_id not in controls:
+        controls[sample_id] = {
+            'merge_key': f'{condition}_{sample}_{rep}',
+            'control': file_path,
+        }
+    else:
+        raise ValueError('Duplicated sample_id detected!')
+
+samplesheet_bam = pd.DataFrame.from_dict(samples, orient='index')
+controls = pd.DataFrame.from_dict(controls, orient='index')
+
+samplesheet_bam = samplesheet_bam.merge(controls, on='merge_key', how='inner').drop(columns='merge_key').sort_values(by='sample')
+samplesheet_bam.to_csv(path_samplesheet_tfactivity_bams, index=False)
+
+# Convert rna counts to csv
+# TODO: Remove hardcoded aligner 'star_salmon' when adding aligner choice
+counts_rna_tsv = pd.read_csv(os.path.join(path_outdir_rnaseq, 'star_salmon', 'salmon.merged.gene_counts.tsv'), sep='\t')
+counts_rna_tsv['gene_id'].to_csv(path_counts_tfactivity_rna, header=False, index=False)
+
+counts_design = {}
+for sample in counts_rna_tsv.drop(columns=['gene_id', 'gene_name']).columns:
+    condition = sample.split('_')[0]
+    path_sample_counts = os.path.join(path_nfcore_tfactivity, f'{sample}_counts.txt')
+
+    counts_rna_tsv[sample].to_csv(path_sample_counts, index=False, header=False)
+    counts_design[sample] = {
+        'sample': sample,
+        'condition': condition,
+        'counts_file': path_sample_counts,
+    }
+
+pd.DataFrame.from_dict(counts_design, orient='index').to_csv(path_design_tfactivity_rna, index=False)
+
+tfactivity_run = f"""
+nextflow run \
+    /nfs/home/students/l.hafner/inspect/tfactivity/dev/main.nf \
+    nf-core/tfactivity \
+    --input {path_samplesheet_tfactivity_peaks} \
+    --input_bam {path_samplesheet_tfactivity_bams} \
+    --counts {path_counts_tfactivity_rna} \
+    --counts_design {path_design_tfactivity_rna} \
+    --motifs {motifs} \
+    --genome {genome} \
+    --fasta {fasta} \
+    --gtf {gtf} \
+    --outdir {path_outdir_tfactivity} \
+    -profile {profile} \
+    -process.executor {process_executor} \
+    -process.queue {process_queue} \
+    -resume
+"""
+
+os.chdir(path_nfcore_tfactivity)
+os.system(tfactivity_run)
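
For reference, a minimal usage sketch (not part of the patch) of how the updated script might be invoked once this change is applied; every path, the genome label, and the queue name below are placeholder values:

    python assets/end_to_end.py \
        --fasta /path/to/genome.fa \
        --gtf /path/to/genes.gtf \
        --genome GRCh38 \
        --motifs /path/to/jaspar_motifs.txt \
        --rna_seq /path/to/rna_seq \
        --chip_seq /path/to/chip_seq \
        --outdir /path/to/output \
        --profile apptainer \
        --apptainer_cache /path/to/apptainer_cache \
        --process_executor slurm \
        --process_queue normal

Because --taxon_id is omitted here, the new guard at the top of the script requires both --genome and --motifs to be set; supplying --taxon_id instead lets the motifs be downloaded from JASPAR.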