From 7df41d622cc19d40c97ab953b8d244654fba302e Mon Sep 17 00:00:00 2001
From: Leon Hafner <leon.hafner@tum.de>
Date: Fri, 2 Aug 2024 15:54:18 +0200
Subject: [PATCH 1/6] added end2end rna-seq draft

---
 assets/end_to_end.py | 89 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 assets/end_to_end.py

diff --git a/assets/end_to_end.py b/assets/end_to_end.py
new file mode 100644
index 0000000..b40259c
--- /dev/null
+++ b/assets/end_to_end.py
@@ -0,0 +1,89 @@
+import os
+import argparse
+import pandas as pd
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('--profile', type=str, required=False, help='nextflow profile', default="apptainer")
+parser.add_argument('--outdir', type=str, required=False, help='Path to output directory')
+parser.add_argument('--fasta', type=str, required=False, help='Path to the FASTA file')
+parser.add_argument('--gtf', type=str, required=False, help='Path to the GTF file')
+
+parser.add_argument('--rna_seq', type=str, required=False, help='RNA-Seq directory')
+parser.add_argument('--chip_seq', type=str, required=False, help='ChIP-Seq directory')
+parser.add_argument('--atac_seq', type=str, required=False, help='ATAC-Seq directory')
+
+args = parser.parse_args()
+
+profile = args.profile
+outdir = args.outdir
+fasta = args.fasta
+gtf = args.gtf
+
+rna_seq = args.rna_seq
+chip_seq = args.chip_seq
+atac_seq = args.atac_seq
+
+fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genome.fa"
+gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genes.gtf"
+rna_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/rna_seq"
+outdir = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/output"
+
+
+# Start only if output directory does not exist
+#if os.path.exists(outdir):
+#    raise FileExistsError("Output directory already existing")
+
+
+def list_files_recursive(directory):
+    file_list = []
+    for root, _, files in os.walk(directory):
+        for file in files:
+            file_list.append(os.path.join(root, file))
+    return file_list
+
+
+
+# Run nf-core/rnaseq pipeline
+path_rnaseq = os.path.join(outdir, 'nfcore-rnaseq')
+path_outdir_rnaseq = os.path.join(path_rnaseq, 'output')
+path_samplesheet_rnaseq = os.path.join(path_rnaseq, 'samplesheet_rnaseq.csv')
+
+if not os.path.exists(path_rnaseq):
+    os.makedirs(path_rnaseq)
+
+file_paths = list_files_recursive(rna_seq)
+
+samples = {}
+for file_path in file_paths:
+    condition = file_path.split('/')[-2]
+    basename = os.path.basename(file_path)
+    
+    # Remove .fq.gz and split file name
+    sample, rep, read = basename.split('.')[0].split('_')
+
+    sample_name = "_".join([condition, sample, rep])
+    
+    if sample_name not in samples:
+        samples[sample_name] = {'sample': sample_name, 'fastq_1': '', 'fastq_2': '', 'strandedness': 'auto'}
+    
+    if read == 'R1':
+        samples[sample_name]['fastq_1'] = file_path
+    elif read == 'R2':
+        samples[sample_name]['fastq_2'] = file_path
+
+df = pd.DataFrame.from_dict(samples, orient='index').to_csv(path_samplesheet_rnaseq, index=False)
+
+rnaseq_run = f"""
+nextflow run \
+    nf-core/rnaseq \
+    --input {path_samplesheet_rnaseq} \
+    --outdir {path_outdir_rnaseq} \
+    --gtf {gtf} \
+    --fasta {fasta} \
+    --igenomes_ignore \
+    --genome null \
+    -profile {profile}
+"""
+os.chdir(path_rnaseq)
+os.system(rnaseq_run)

From de9fa815a41092337cb4e519019ae939e46b62a1 Mon Sep 17 00:00:00 2001
From: Leon Hafner <leon.hafner@tum.de>
Date: Mon, 2 Sep 2024 22:11:49 +0200
Subject: [PATCH 2/6] add chipseq

---
 assets/end_to_end.py | 75 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 66 insertions(+), 9 deletions(-)

diff --git a/assets/end_to_end.py b/assets/end_to_end.py
index b40259c..2ce4c71 100644
--- a/assets/end_to_end.py
+++ b/assets/end_to_end.py
@@ -8,6 +8,7 @@
 parser.add_argument('--outdir', type=str, required=False, help='Path to output directory')
 parser.add_argument('--fasta', type=str, required=False, help='Path to the FASTA file')
 parser.add_argument('--gtf', type=str, required=False, help='Path to the GTF file')
+parser.add_argument('--apptainer_cache', type=str, required=False, help='Path to the apptainer cache directory')
 
 parser.add_argument('--rna_seq', type=str, required=False, help='RNA-Seq directory')
 parser.add_argument('--chip_seq', type=str, required=False, help='ChIP-Seq directory')
@@ -19,6 +20,7 @@
 outdir = args.outdir
 fasta = args.fasta
 gtf = args.gtf
+apptainer_cache = args.apptainer_cache
 
 rna_seq = args.rna_seq
 chip_seq = args.chip_seq
@@ -27,12 +29,16 @@
 fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genome.fa"
 gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genes.gtf"
 rna_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/rna_seq"
+chip_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/chip_seq"
 outdir = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/output"
+apptainer_cache = "/nfs/scratch/apptainer_cache"
+
+os.environ["NXF_APPTAINER_CACHEDIR"] = apptainer_cache
 
 
 # Start only if output directory does not exist
-#if os.path.exists(outdir):
-#    raise FileExistsError("Output directory already existing")
+if not os.path.exists(outdir):
+    os.makedirs(outdir)
 
 
 def list_files_recursive(directory):
@@ -45,12 +51,12 @@ def list_files_recursive(directory):
 
 
 # Run nf-core/rnaseq pipeline
-path_rnaseq = os.path.join(outdir, 'nfcore-rnaseq')
-path_outdir_rnaseq = os.path.join(path_rnaseq, 'output')
-path_samplesheet_rnaseq = os.path.join(path_rnaseq, 'samplesheet_rnaseq.csv')
+path_nfcore_rnaseq = os.path.join(outdir, 'nfcore-rnaseq')
+path_outdir_rnaseq = os.path.join(path_nfcore_rnaseq, 'output')
+path_samplesheet_rnaseq = os.path.join(path_nfcore_rnaseq, 'samplesheet_rnaseq.csv')
 
-if not os.path.exists(path_rnaseq):
-    os.makedirs(path_rnaseq)
+if not os.path.exists(path_nfcore_rnaseq):
+    os.makedirs(path_nfcore_rnaseq)
 
 file_paths = list_files_recursive(rna_seq)
 
@@ -83,7 +89,58 @@ def list_files_recursive(directory):
     --fasta {fasta} \
     --igenomes_ignore \
     --genome null \
-    -profile {profile}
+    -profile {profile} \
+    -resume \
+    --skip_deseq2_qc true
 """
-os.chdir(path_rnaseq)
+os.chdir(path_nfcore_rnaseq)
 os.system(rnaseq_run)
+
+
+# Run nfcore/chipseq pipeline
+path_nfcore_chipseq = os.path.join(outdir, 'nfcore-chipseq')
+path_outdir_chipseq = os.path.join(path_nfcore_chipseq, 'output')
+path_samplesheet_chipseq = os.path.join(path_nfcore_chipseq, 'samplesheet_chipeq.csv')
+
+if not os.path.exists(path_nfcore_chipseq):
+    os.makedirs(path_nfcore_chipseq)
+
+file_paths = list_files_recursive(chip_seq)
+
+samples = {}
+for file_path in file_paths:
+    condition = file_path.split('/')[-3]
+    antibody = file_path.split('/')[-2]
+    basename = os.path.basename(file_path)
+    
+    # Remove .fq.gz and split file name
+    sample, rep, read = basename.split('.')[0].split('_')
+
+    sample_name = "_".join([condition, antibody, sample, rep])
+    
+    if sample_name not in samples:
+        samples[sample_name] = {'sample': sample_name, 'fastq_1': '', 'fastq_2': '', 'antibody': antibody, 'control': f'{condition}_CONTROL_{sample}_{rep}'}
+    
+    if read == 'R1':
+        samples[sample_name]['fastq_1'] = file_path
+    elif read == 'R2':
+        samples[sample_name]['fastq_2'] = file_path
+
+chipseq_samplesheet = pd.DataFrame.from_dict(samples, orient='index')
+chipseq_samplesheet['control'] = chipseq_samplesheet['control'].apply(lambda x: x if x in chipseq_samplesheet['sample'].values else '')
+chipseq_samplesheet.to_csv(path_samplesheet_chipseq, index=False)
+
+chipseq_run = f"""
+nextflow run \
+    nf-core/chipseq \
+    --input {path_samplesheet_chipseq} \
+    --outdir {path_outdir_chipseq} \
+    --gtf {gtf} \
+    --fasta {fasta} \
+    --read_length 150 \
+    -profile docker \
+    -resume
+"""
+
+os.chdir(path_nfcore_chipseq)
+os.system(chipseq_run)
\ No newline at end of file

From 49f2acbc37d077198771a71796e2053dbd3aa93a Mon Sep 17 00:00:00 2001
From: Leon Hafner <leon.hafner@tum.de>
Date: Fri, 13 Sep 2024 12:30:07 +0200
Subject: [PATCH 3/6] Use nf-core/chipseq dev revision

---
 assets/end_to_end.py | 53 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 41 insertions(+), 12 deletions(-)

diff --git a/assets/end_to_end.py b/assets/end_to_end.py
index 2ce4c71..ed69db1 100644
--- a/assets/end_to_end.py
+++ b/assets/end_to_end.py
@@ -1,5 +1,6 @@
 import os
 import argparse
+import string
 import pandas as pd
 
 parser = argparse.ArgumentParser()
@@ -9,6 +10,7 @@
 parser.add_argument('--fasta', type=str, required=False, help='Path to the FASTA file')
 parser.add_argument('--gtf', type=str, required=False, help='Path to the GTF file')
 parser.add_argument('--apptainer_cache', type=str, required=False, help='Path to the apptainer cache directory')
+parser.add_argument('--process_executor', type=str, required=False, help='Executor for the nextflow pipelines', default='local')
 
 parser.add_argument('--rna_seq', type=str, required=False, help='RNA-Seq directory')
 parser.add_argument('--chip_seq', type=str, required=False, help='ChIP-Seq directory')
@@ -21,22 +23,25 @@
 fasta = args.fasta
 gtf = args.gtf
 apptainer_cache = args.apptainer_cache
+process_executor = args.process_executor
 
 rna_seq = args.rna_seq
 chip_seq = args.chip_seq
 atac_seq = args.atac_seq
 
+# TODO: Remove hard coded parameters
 fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genome.fa"
 gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genes.gtf"
 rna_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/rna_seq"
 chip_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/chip_seq"
 outdir = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/output"
 apptainer_cache = "/nfs/scratch/apptainer_cache"
+process_executor = 'local'
 
 os.environ["NXF_APPTAINER_CACHEDIR"] = apptainer_cache
 
 
-# Start only if output directory does not exist
+# Create output directory if not exisiting
 if not os.path.exists(outdir):
     os.makedirs(outdir)
 
@@ -49,7 +54,6 @@ def list_files_recursive(directory):
     return file_list
 
 
-
 # Run nf-core/rnaseq pipeline
 path_nfcore_rnaseq = os.path.join(outdir, 'nfcore-rnaseq')
 path_outdir_rnaseq = os.path.join(path_nfcore_rnaseq, 'output')
@@ -71,7 +75,10 @@ def list_files_recursive(directory):
     sample_name = "_".join([condition, sample, rep])
     
     if sample_name not in samples:
-        samples[sample_name] = {'sample': sample_name, 'fastq_1': '', 'fastq_2': '', 'strandedness': 'auto'}
+        samples[sample_name] = {'sample': sample_name,
+                                'fastq_1': '',
+                                'fastq_2': '',
+                                'strandedness': 'auto'}
     
     if read == 'R1':
         samples[sample_name]['fastq_1'] = file_path
@@ -90,6 +97,7 @@ def list_files_recursive(directory):
     --igenomes_ignore \
     --genome null \
     -profile {profile} \
+    -process.executor {process_executor} \
     -resume \
     --skip_deseq2_qc true
 """
@@ -100,7 +108,7 @@ def list_files_recursive(directory):
 # Run nfcore/chipseq pipeline
 path_nfcore_chipseq = os.path.join(outdir, 'nfcore-chipseq')
 path_outdir_chipseq = os.path.join(path_nfcore_chipseq, 'output')
-path_samplesheet_chipseq = os.path.join(path_nfcore_chipseq, 'samplesheet_chipeq.csv')
+path_samplesheet_chipseq = os.path.join(path_nfcore_chipseq, 'samplesheet_chipseq.csv')
 
 if not os.path.exists(path_nfcore_chipseq):
     os.makedirs(path_nfcore_chipseq)
@@ -116,30 +124,51 @@ def list_files_recursive(directory):
     # Remove .fq.gz and split file name
     sample, rep, read = basename.split('.')[0].split('_')
 
-    sample_name = "_".join([condition, antibody, sample, rep])
+    condition_antibody_sample = "_".join([condition, antibody, sample])
+
+    sample_rep = '_'.join([condition_antibody_sample, rep])
     
-    if sample_name not in samples:
-        samples[sample_name] = {'sample': sample_name, 'fastq_1': '', 'fastq_2': '', 'antibody': antibody, 'control': f'{condition}_CONTROL_{sample}_{rep}'}
+    # Strip 'REP' from replicate
+    if sample_rep not in samples:
+        samples[sample_rep] = {'sample': condition_antibody_sample,
+                               'fastq_1': '',
+                               'fastq_2': '',
+                               'replicate': rep.strip(string.ascii_letters),
+                               'antibody': antibody,
+                               'control': f'{condition}_CONTROL_{sample}',
+                               'control_replicate': rep.strip(string.ascii_letters)}
     
     if read == 'R1':
-        samples[sample_name]['fastq_1'] = file_path
+        samples[sample_rep]['fastq_1'] = file_path
     elif read == 'R2':
-        samples[sample_name]['fastq_2'] = file_path
+        samples[sample_rep]['fastq_2'] = file_path
 
 chipseq_samplesheet = pd.DataFrame.from_dict(samples, orient='index')
+
+# Remove control files that do not exist
 chipseq_samplesheet['control'] = chipseq_samplesheet['control'].apply(lambda x: x if x in chipseq_samplesheet['sample'].values else '')
+
+# Remove antibody, control and control_replicate column from dataframe
+chipseq_samplesheet.loc[chipseq_samplesheet['sample'].str.contains('CONTROL'), ['antibody', 'control', 'control_replicate']] = ''
 chipseq_samplesheet.to_csv(path_samplesheet_chipseq, index=False)
 
+# Runs on revision dev as the pipeline has not had an release for some time
+# TODO change parameter read_length
+# TODO change parameter skip_preseq
+# TODO disable processes not needed for downstream pipelines
 chipseq_run = f"""
 nextflow run \
     nf-core/chipseq \
+    -r dev \
     --input {path_samplesheet_chipseq} \
     --outdir {path_outdir_chipseq} \
     --gtf {gtf} \
     --fasta {fasta} \
-    --read_length 150 \
-    -profile docker \
-    -resume
+    --read_length 50 \
+    -profile {profile} \
+    -process.executor {process_executor} \
+    -resume \
+    --skip_preseq true
 """
 
 os.chdir(path_nfcore_chipseq)

From 0071e70210b52e38212823190e92e04f84316f9b Mon Sep 17 00:00:00 2001
From: Leon Hafner <leon.hafner@tum.de>
Date: Fri, 13 Sep 2024 12:47:07 +0200
Subject: [PATCH 4/6] fix linting

---
 assets/end_to_end.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/assets/end_to_end.py b/assets/end_to_end.py
index ed69db1..cf1cfd9 100644
--- a/assets/end_to_end.py
+++ b/assets/end_to_end.py
@@ -68,18 +68,18 @@ def list_files_recursive(directory):
 for file_path in file_paths:
     condition = file_path.split('/')[-2]
     basename = os.path.basename(file_path)
-    
+
     # Remove .fq.gz and split file name
     sample, rep, read = basename.split('.')[0].split('_')
 
     sample_name = "_".join([condition, sample, rep])
-    
+
     if sample_name not in samples:
         samples[sample_name] = {'sample': sample_name,
                                 'fastq_1': '',
                                 'fastq_2': '',
                                 'strandedness': 'auto'}
-    
+
     if read == 'R1':
         samples[sample_name]['fastq_1'] = file_path
     elif read == 'R2':
@@ -120,14 +120,14 @@ def list_files_recursive(directory):
     condition = file_path.split('/')[-3]
     antibody = file_path.split('/')[-2]
     basename = os.path.basename(file_path)
-    
+
     # Remove .fq.gz and split file name
     sample, rep, read = basename.split('.')[0].split('_')
 
     condition_antibody_sample = "_".join([condition, antibody, sample])
 
     sample_rep = '_'.join([condition_antibody_sample, rep])
-    
+
     # Strip 'REP' from replicate
     if sample_rep not in samples:
         samples[sample_rep] = {'sample': condition_antibody_sample,
@@ -137,7 +137,7 @@ def list_files_recursive(directory):
                                'antibody': antibody,
                                'control': f'{condition}_CONTROL_{sample}',
                                'control_replicate': rep.strip(string.ascii_letters)}
-    
+
     if read == 'R1':
         samples[sample_rep]['fastq_1'] = file_path
     elif read == 'R2':
@@ -172,4 +172,4 @@ def list_files_recursive(directory):
 """
 
 os.chdir(path_nfcore_chipseq)
-os.system(chipseq_run)
\ No newline at end of file
+os.system(chipseq_run)

From d53112c7f7a0df8e73cba5d5d81e99095d9eef1e Mon Sep 17 00:00:00 2001
From: Leon Hafner <leon.hafner@tum.de>
Date: Fri, 13 Sep 2024 14:17:23 +0200
Subject: [PATCH 5/6] add BAM file output

---
 assets/end_to_end.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/assets/end_to_end.py b/assets/end_to_end.py
index cf1cfd9..3c8f46e 100644
--- a/assets/end_to_end.py
+++ b/assets/end_to_end.py
@@ -165,6 +165,7 @@ def list_files_recursive(directory):
     --gtf {gtf} \
     --fasta {fasta} \
     --read_length 50 \
+    --save_align_intermeds \
     -profile {profile} \
     -process.executor {process_executor} \
     -resume \

From fe41445070a1c9e487d1fab5c39c57a37988f4ad Mon Sep 17 00:00:00 2001
From: Leon Hafner <leon.hafner@tum.de>
Date: Thu, 19 Sep 2024 13:58:33 +0200
Subject: [PATCH 6/6] added tfactivtiy

---
 .gitignore           |   1 +
 assets/end_to_end.py | 197 +++++++++++++++++++++++++++++++++++++------
 2 files changed, 171 insertions(+), 27 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5124c9a..d69d807 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ results/
 testing/
 testing*
 *.pyc
+pyrightconfig.json
diff --git a/assets/end_to_end.py b/assets/end_to_end.py
index 3c8f46e..7be5303 100644
--- a/assets/end_to_end.py
+++ b/assets/end_to_end.py
@@ -1,56 +1,87 @@
 import os
+import sys
 import argparse
 import string
 import pandas as pd
+from typing import List
 
 parser = argparse.ArgumentParser()
 
-parser.add_argument('--profile', type=str, required=False, help='nextflow profile', default="apptainer")
-parser.add_argument('--outdir', type=str, required=False, help='Path to output directory')
 parser.add_argument('--fasta', type=str, required=False, help='Path to the FASTA file')
 parser.add_argument('--gtf', type=str, required=False, help='Path to the GTF file')
-parser.add_argument('--apptainer_cache', type=str, required=False, help='Path to the apptainer cache directory')
-parser.add_argument('--process_executor', type=str, required=False, help='Executor for the nextflow pipelines', default='local')
+parser.add_argument('--genome', type=str, required=False, help='genome name/version')
+parser.add_argument('--motifs', type=str, required=False, help='Path to JASPAR motif file')
+parser.add_argument('--taxon_id', type=str, required=False, help='JASPAR taxon ID for motif download')
+parser.add_argument('--outdir', type=str, required=False, help='Path to output directory')
 
 parser.add_argument('--rna_seq', type=str, required=False, help='RNA-Seq directory')
 parser.add_argument('--chip_seq', type=str, required=False, help='ChIP-Seq directory')
-parser.add_argument('--atac_seq', type=str, required=False, help='ATAC-Seq directory')
+
+parser.add_argument('--profile', type=str, required=False, help='nextflow profile', default="apptainer")
+parser.add_argument('--apptainer_cache', type=str, required=False, help='Path to the apptainer cache directory')
+parser.add_argument('--process_executor', type=str, required=False, help='Executor for the nextflow pipelines', default='local')
+parser.add_argument('--process_queue', type=str, required=False, help='scheduler queue')
 
 args = parser.parse_args()
 
-profile = args.profile
-outdir = args.outdir
+if args.taxon_id is None:
+    if args.genome is None or args.motifs is None:
+        print('Error: If taxon_id is not specified, both fasta and motifs must be provided.')
+        sys.exit(1)
+
 fasta = args.fasta
 gtf = args.gtf
-apptainer_cache = args.apptainer_cache
-process_executor = args.process_executor
+genome = args.genome
+motifs = args.motifs
+taxon_id = args.taxon_id
+outdir = args.outdir
 
 rna_seq = args.rna_seq
 chip_seq = args.chip_seq
-atac_seq = args.atac_seq
 
-# TODO: Remove hard coded parameters
-fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genome.fa"
-gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genes.gtf"
-rna_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/rna_seq"
-chip_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/chip_seq"
-outdir = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/output"
-apptainer_cache = "/nfs/scratch/apptainer_cache"
-process_executor = 'local'
+profile = args.profile
+apptainer_cache = args.apptainer_cache
+process_executor = args.process_executor
+process_queue = args.process_queue
 
-os.environ["NXF_APPTAINER_CACHEDIR"] = apptainer_cache
 
+os.environ["NXF_APPTAINER_CACHEDIR"] = apptainer_cache
 
 # Create output directory if not exisiting
 if not os.path.exists(outdir):
     os.makedirs(outdir)
 
 
-def list_files_recursive(directory):
+def list_files(directory, prefix=None, suffix=None, recursive=False) -> List[str]:
+    """
+    List files in the specified directory, optionally including subdirectories.
+
+    Parameters:
+        directory (str): The root directory to start the search.
+        prefix (str, optional): Filter files that start with this prefix.
+        suffix (str, optional): Filter files that end with this suffix.
+        recursive (bool, optional): If True, search subdirectories as well.
+                                    If False, search only the specified directory.
+                                    Default is False.
+
+    Returns:
+        List[str]: List of file paths matching the criteria.
+    """
+
     file_list = []
-    for root, _, files in os.walk(directory):
-        for file in files:
-            file_list.append(os.path.join(root, file))
+
+    if recursive:
+        for root, _, files in os.walk(directory):
+            for file in files:
+                if (prefix is None or file.startswith(prefix)) and (suffix is None or file.endswith(suffix)):
+                    file_list.append(os.path.join(root, file))
+    else:
+        for file in os.listdir(directory):
+            full_path = os.path.join(directory, file)
+            if os.path.isfile(full_path):
+                if (prefix is None or file.startswith(prefix)) and (suffix is None or file.endswith(suffix)):
+                    file_list.append(full_path)
+
     return file_list
 
 
@@ -62,7 +93,7 @@ def list_files_recursive(directory):
 if not os.path.exists(path_nfcore_rnaseq):
     os.makedirs(path_nfcore_rnaseq)
 
-file_paths = list_files_recursive(rna_seq)
+file_paths = list_files(rna_seq, recursive=True)
 
 samples = {}
 for file_path in file_paths:
@@ -98,11 +129,12 @@ def list_files_recursive(directory):
     --genome null \
     -profile {profile} \
     -process.executor {process_executor} \
+    -process.queue {process_queue} \
     -resume \
     --skip_deseq2_qc true
 """
 os.chdir(path_nfcore_rnaseq)
-os.system(rnaseq_run)
+#os.system(rnaseq_run)
 
 
 # Run nfcore/chipseq pipeline
@@ -113,7 +145,7 @@ def list_files_recursive(directory):
 if not os.path.exists(path_nfcore_chipseq):
     os.makedirs(path_nfcore_chipseq)
 
-file_paths = list_files_recursive(chip_seq)
+file_paths = list_files(chip_seq, recursive=True)
 
 samples = {}
 for file_path in file_paths:
@@ -168,9 +200,120 @@ def list_files_recursive(directory):
     --save_align_intermeds \
     -profile {profile} \
     -process.executor {process_executor} \
+    -process.queue {process_queue} \
     -resume \
     --skip_preseq true
 """
 
 os.chdir(path_nfcore_chipseq)
-os.system(chipseq_run)
+#os.system(chipseq_run)
+
+
+# Run nfcore/tfactivity pipeline
+path_nfcore_tfactivity = os.path.join(outdir, 'nfcore-tfactivity')
+path_outdir_tfactivity = os.path.join(path_nfcore_tfactivity, 'output')
+path_samplesheet_tfactivity_peaks = os.path.join(path_nfcore_tfactivity, 'samplesheet_tfactivity_peaks.csv')
+path_samplesheet_tfactivity_bams = os.path.join(path_nfcore_tfactivity, 'samplesheet_tfactivity_bams.csv')
+
+path_design_tfactivity_rna = os.path.join(path_nfcore_tfactivity, 'design_tfactivity_rna.csv')
+path_counts_tfactivity_rna = os.path.join(path_nfcore_tfactivity, 'counts_tfactivity_rna.csv')
+
+if not os.path.exists(path_nfcore_tfactivity):
+    os.makedirs(path_nfcore_tfactivity)
+
+# Create samplesheet peaks
+file_paths = list_files(os.path.join(path_outdir_chipseq, 'bwa', 'merged_library', 'macs3', 'broad_peak'), suffix='.broadPeak')
+
+samples = {}
+for file_path in file_paths:
+    basename = os.path.basename(file_path)
+    condition, antibody, sample, rep, _ = basename.split('_')
+
+    sample_id = '_'.join([condition, antibody, sample, rep])
+
+    if sample_id not in samples:
+        # TODO: Set 'footprinting' for ChIP-Seq but not for ATAC-Seq and DNase-Seq
+        samples[sample_id] = {
+            'sample': sample_id,
+            'condition': condition,
+            'assay': antibody,
+            'peak_file': file_path,
+        }
+    else:
+        raise ValueError('Duplicate sample_id detected!')
+
+pd.DataFrame.from_dict(samples, orient='index').sort_values(by='sample').to_csv(path_samplesheet_tfactivity_peaks, index=False)
+
+# Create samplesheet bams
+file_paths = list_files(os.path.join(path_outdir_chipseq, 'bwa', 'merged_library'), suffix='.mLb.clN.sorted.bam')
+
+samples = {}
+controls = {}
+for file_path in file_paths:
+    basename = os.path.basename(file_path)
+    condition, antibody, sample, rep = basename.split('.')[0].split('_')
+
+    sample_id = '_'.join([condition, antibody, sample, rep])
+
+    if antibody != 'CONTROL' and sample_id not in samples:
+        samples[sample_id] = {
+            'sample': sample_id,
+            'condition': condition,
+            'assay': antibody,
+            'signal': file_path,
+            'merge_key': f'{condition}_{sample}_{rep}',
+        }
+    elif antibody == 'CONTROL' and sample_id not in controls:
+        controls[sample_id] = {
+            'merge_key': f'{condition}_{sample}_{rep}',
+            'control': file_path,
+        }
+    else:
+        raise ValueError('Duplicated sample_id detected!')
+
+samplesheet_bam = pd.DataFrame.from_dict(samples, orient='index')
+controls = pd.DataFrame.from_dict(controls, orient='index')
+
+samplesheet_bam = samplesheet_bam.merge(controls, on='merge_key', how='inner').drop(columns='merge_key').sort_values(by='sample')
+samplesheet_bam.to_csv(path_samplesheet_tfactivity_bams, index=False)
+
+# Convert rna counts to csv
+# TODO: Remove hardcoded aligner 'star_salmon' when adding aligner choice
+counts_rna_tsv = pd.read_csv(os.path.join(path_outdir_rnaseq, 'star_salmon', 'salmon.merged.gene_counts.tsv'), sep='\t')
+counts_rna_tsv['gene_id'].to_csv(path_counts_tfactivity_rna, header=False, index=False)
+
+counts_design = {}
+for sample in counts_rna_tsv.drop(columns=['gene_id', 'gene_name']).columns:
+    condition = sample.split('_')[0]
+    path_sample_counts = os.path.join(path_nfcore_tfactivity, f'{sample}_counts.txt')
+
+    counts_rna_tsv[sample].to_csv(path_sample_counts, index=False, header=False)
+    counts_design[sample] = {
+        'sample': sample,
+        'condition': condition,
+        'counts_file': path_sample_counts,
+    }
+
+pd.DataFrame.from_dict(counts_design, orient='index').to_csv(path_design_tfactivity_rna, index=False)
+
+tfactivity_run = f"""
+nextflow run \
+    /nfs/home/students/l.hafner/inspect/tfactivity/dev/main.nf \
+    nf-core/tfactivity \
+    --input {path_samplesheet_tfactivity_peaks} \
+    --input_bam {path_samplesheet_tfactivity_bams} \
+    --counts {path_counts_tfactivity_rna} \
+    --counts_design {path_design_tfactivity_rna} \
+    --motifs {motifs} \
+    --genome {genome} \
+    --fasta {fasta} \
+    --gtf {gtf} \
+    --outdir {path_outdir_tfactivity} \
+    -profile {profile} \
+    -process.executor {process_executor} \
+    -process.queue {process_queue} \
+    -resume
+"""
+
+os.chdir(path_nfcore_tfactivity)
+os.system(tfactivity_run)