Commit

added tfactivity
LeonHafner committed Sep 19, 2024
1 parent d53112c commit fe41445
Showing 2 changed files with 171 additions and 27 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -6,3 +6,4 @@ results/
testing/
testing*
*.pyc
pyrightconfig.json
197 changes: 170 additions & 27 deletions assets/end_to_end.py
@@ -1,56 +1,87 @@
import os
import sys
import argparse
import string
import pandas as pd
from typing import List

parser = argparse.ArgumentParser()

parser.add_argument('--profile', type=str, required=False, help='nextflow profile', default="apptainer")
parser.add_argument('--outdir', type=str, required=False, help='Path to output directory')
parser.add_argument('--fasta', type=str, required=False, help='Path to the FASTA file')
parser.add_argument('--gtf', type=str, required=False, help='Path to the GTF file')
parser.add_argument('--apptainer_cache', type=str, required=False, help='Path to the apptainer cache directory')
parser.add_argument('--process_executor', type=str, required=False, help='Executor for the nextflow pipelines', default='local')
parser.add_argument('--genome', type=str, required=False, help='genome name/version')
parser.add_argument('--motifs', type=str, required=False, help='Path to JASPAR motif file')
parser.add_argument('--taxon_id', type=str, required=False, help='JASPAR taxon ID for motif download')
parser.add_argument('--outdir', type=str, required=False, help='Path to output directory')

parser.add_argument('--rna_seq', type=str, required=False, help='RNA-Seq directory')
parser.add_argument('--chip_seq', type=str, required=False, help='ChIP-Seq directory')
parser.add_argument('--atac_seq', type=str, required=False, help='ATAC-Seq directory')

parser.add_argument('--profile', type=str, required=False, help='nextflow profile', default="apptainer")
parser.add_argument('--apptainer_cache', type=str, required=False, help='Path to the apptainer cache directory')
parser.add_argument('--process_executor', type=str, required=False, help='Executor for the nextflow pipelines', default='local')
parser.add_argument('--process_queue', type=str, required=False, help='scheduler queue')

args = parser.parse_args()

profile = args.profile
outdir = args.outdir
if args.taxon_id is None:
    if args.genome is None or args.motifs is None:
        print('Error: If taxon_id is not specified, both genome and motifs must be provided.')
        sys.exit(1)

fasta = args.fasta
gtf = args.gtf
apptainer_cache = args.apptainer_cache
process_executor = args.process_executor
genome = args.genome
motifs = args.motifs
taxon_id = args.taxon_id
outdir = args.outdir

rna_seq = args.rna_seq
chip_seq = args.chip_seq
atac_seq = args.atac_seq

# TODO: Remove hard coded parameters
fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genome.fa"
gtf = "https://raw.githubusercontent.com/nf-core/test-datasets/rnaseq/reference/genes.gtf"
rna_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/rna_seq"
chip_seq = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/data/chip_seq"
outdir = "/nfs/home/students/l.hafner/inspect/tfactivity/dev/testing/end_to_end/output"
apptainer_cache = "/nfs/scratch/apptainer_cache"
process_executor = 'local'
profile = args.profile
apptainer_cache = args.apptainer_cache
process_executor = args.process_executor
process_queue = args.process_queue

os.environ["NXF_APPTAINER_CACHEDIR"] = apptainer_cache

os.environ["NXF_APPTAINER_CACHEDIR"] = apptainer_cache

# Create output directory if not existing
if not os.path.exists(outdir):
    os.makedirs(outdir)


def list_files_recursive(directory):
def list_files(directory, prefix=None, suffix=None, recursive=False) -> List[str]:
    """
    List files in the specified directory, optionally including subdirectories.
    Parameters:
        directory (str): The root directory to start the search.
        prefix (str, optional): Filter files that start with this prefix.
        suffix (str, optional): Filter files that end with this suffix.
        recursive (bool, optional): If True, search subdirectories as well.
                                    If False, search only the specified directory.
                                    Default is False.
    Returns:
        List[str]: List of file paths matching the criteria.
    """

    file_list = []
    for root, _, files in os.walk(directory):
        for file in files:
            file_list.append(os.path.join(root, file))

    if recursive:
        for root, _, files in os.walk(directory):
            for file in files:
                if (prefix is None or file.startswith(prefix)) and (suffix is None or file.endswith(suffix)):
                    file_list.append(os.path.join(root, file))
    else:
        for file in os.listdir(directory):
            full_path = os.path.join(directory, file)
            if os.path.isfile(full_path):
                if (prefix is None or file.startswith(prefix)) and (suffix is None or file.endswith(suffix)):
                    file_list.append(full_path)

    return file_list


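# Editor's note: a minimal, hypothetical usage sketch of the new list_files helper.
# The 'example_data' directory and the .fastq.gz suffix are purely illustrative and not part of this commit.
if os.path.isdir('example_data'):
    top_level_fastqs = list_files('example_data', suffix='.fastq.gz')
    nested_fastqs = list_files('example_data', suffix='.fastq.gz', recursive=True)
    print(top_level_fastqs, nested_fastqs)
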
@@ -62,7 +93,7 @@ def list_files_recursive(directory):
if not os.path.exists(path_nfcore_rnaseq):
    os.makedirs(path_nfcore_rnaseq)

file_paths = list_files_recursive(rna_seq)
file_paths = list_files(rna_seq, recursive=True)

samples = {}
for file_path in file_paths:
@@ -98,11 +129,12 @@ def list_files_recursive(directory):
--genome null \
-profile {profile} \
-process.executor {process_executor} \
-process.queue {process_queue} \
-resume \
--skip_deseq2_qc true
"""
os.chdir(path_nfcore_rnaseq)
os.system(rnaseq_run)
#os.system(rnaseq_run)


# Run nfcore/chipseq pipeline
@@ -113,7 +145,7 @@ def list_files_recursive(directory):
if not os.path.exists(path_nfcore_chipseq):
    os.makedirs(path_nfcore_chipseq)

file_paths = list_files_recursive(chip_seq)
file_paths = list_files(chip_seq, recursive=True)

samples = {}
for file_path in file_paths:
@@ -168,9 +200,120 @@ def list_files_recursive(directory):
--save_align_intermeds \
-profile {profile} \
-process.executor {process_executor} \
-process.queue {process_queue} \
-resume \
--skip_preseq true
"""

os.chdir(path_nfcore_chipseq)
os.system(chipseq_run)
#os.system(chipseq_run)


# Run nfcore/tfactivity pipeline
path_nfcore_tfactivity = os.path.join(outdir, 'nfcore-tfactivity')
path_outdir_tfactivity = os.path.join(path_nfcore_tfactivity, 'output')
path_samplesheet_tfactivity_peaks = os.path.join(path_nfcore_tfactivity, 'samplesheet_tfactivity_peaks.csv')
path_samplesheet_tfactivity_bams = os.path.join(path_nfcore_tfactivity, 'samplesheet_tfactivity_bams.csv')

path_design_tfactivity_rna = os.path.join(path_nfcore_tfactivity, 'design_tfactivity_rna.csv')
path_counts_tfactivity_rna = os.path.join(path_nfcore_tfactivity, 'counts_tfactivity_rna.csv')

if not os.path.exists(path_nfcore_tfactivity):
    os.makedirs(path_nfcore_tfactivity)

# Create samplesheet peaks
file_paths = list_files(os.path.join(path_outdir_chipseq, 'bwa', 'merged_library', 'macs3', 'broad_peak'), suffix='.broadPeak')

samples = {}
for file_path in file_paths:
    basename = os.path.basename(file_path)
    condition, antibody, sample, rep, _ = basename.split('_')

    sample_id = '_'.join([condition, antibody, sample, rep])

    if sample_id not in samples:
        # TODO: Set 'footprinting' for ChIP-Seq but not for ATAC-Seq and DNase-Seq
        samples[sample_id] = {
            'sample': sample_id,
            'condition': condition,
            'assay': antibody,
            'peak_file': file_path,
        }
    else:
        raise ValueError('Duplicate sample_id detected!')

pd.DataFrame.from_dict(samples, orient='index').sort_values(by='sample').to_csv(path_samplesheet_tfactivity_peaks, index=False)
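
# Editor's note: illustration of the filename convention assumed by the split above. The broadPeak
# basename is expected to look like <condition>_<antibody>_<sample>_<rep>_peaks.broadPeak; the name
# below is made up for demonstration only.
_demo_peak = 'treated_H3K27ac_sampleA_REP1_peaks.broadPeak'
_condition, _antibody, _sample, _rep, _ = _demo_peak.split('_')
assert '_'.join([_condition, _antibody, _sample, _rep]) == 'treated_H3K27ac_sampleA_REP1'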

# Create samplesheet bams
file_paths = list_files(os.path.join(path_outdir_chipseq, 'bwa', 'merged_library'), suffix='.mLb.clN.sorted.bam')

samples = {}
controls = {}
for file_path in file_paths:
    basename = os.path.basename(file_path)
    condition, antibody, sample, rep = basename.split('.')[0].split('_')

    sample_id = '_'.join([condition, antibody, sample, rep])

    if antibody != 'CONTROL' and sample_id not in samples:
        samples[sample_id] = {
            'sample': sample_id,
            'condition': condition,
            'assay': antibody,
            'signal': file_path,
            'merge_key': f'{condition}_{sample}_{rep}',
        }
    elif antibody == 'CONTROL' and sample_id not in controls:
        controls[sample_id] = {
            'merge_key': f'{condition}_{sample}_{rep}',
            'control': file_path,
        }
    else:
        raise ValueError('Duplicated sample_id detected!')

samplesheet_bam = pd.DataFrame.from_dict(samples, orient='index')
controls = pd.DataFrame.from_dict(controls, orient='index')

samplesheet_bam = samplesheet_bam.merge(controls, on='merge_key', how='inner').drop(columns='merge_key').sort_values(by='sample')
samplesheet_bam.to_csv(path_samplesheet_tfactivity_bams, index=False)
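
# Editor's note: small sketch of the merge_key join above, using made-up rows; each signal BAM is
# paired with the CONTROL BAM that shares its <condition>_<sample>_<rep> key, then the key is dropped.
_demo_signal = pd.DataFrame([{'sample': 'treated_H3K27ac_sampleA_REP1', 'condition': 'treated',
                              'assay': 'H3K27ac', 'signal': 'signal.bam',
                              'merge_key': 'treated_sampleA_REP1'}])
_demo_control = pd.DataFrame([{'merge_key': 'treated_sampleA_REP1', 'control': 'control.bam'}])
print(_demo_signal.merge(_demo_control, on='merge_key', how='inner').drop(columns='merge_key'))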

# Convert rna counts to csv
# TODO: Remove hardcoded aligner 'star_salmon' when adding aligner choice
counts_rna_tsv = pd.read_csv(os.path.join(path_outdir_rnaseq, 'star_salmon', 'salmon.merged.gene_counts.tsv'), sep='\t')
counts_rna_tsv['gene_id'].to_csv(path_counts_tfactivity_rna, header=False, index=False)

counts_design = {}
for sample in counts_rna_tsv.drop(columns=['gene_id', 'gene_name']).columns:
    condition = sample.split('_')[0]
    path_sample_counts = os.path.join(path_nfcore_tfactivity, f'{sample}_counts.txt')

    counts_rna_tsv[sample].to_csv(path_sample_counts, index=False, header=False)
    counts_design[sample] = {
        'sample': sample,
        'condition': condition,
        'counts_file': path_sample_counts,
    }

pd.DataFrame.from_dict(counts_design, orient='index').to_csv(path_design_tfactivity_rna, index=False)
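
# Editor's note: rough, hypothetical shape of the RNA design sheet written above; the sample names
# and paths are invented, the real values come from the nf-core/rnaseq count matrix.
_demo_design = pd.DataFrame([
    {'sample': 'treated_REP1', 'condition': 'treated', 'counts_file': 'treated_REP1_counts.txt'},
    {'sample': 'control_REP1', 'condition': 'control', 'counts_file': 'control_REP1_counts.txt'},
])
print(_demo_design.to_csv(index=False))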

tfactivity_run = f"""
nextflow run \
/nfs/home/students/l.hafner/inspect/tfactivity/dev/main.nf \
nf-core/tfactivity \
--input {path_samplesheet_tfactivity_peaks} \
--input_bam {path_samplesheet_tfactivity_bams} \
--counts {path_counts_tfactivity_rna} \
--counts_design {path_design_tfactivity_rna} \
--motifs {motifs} \
--genome {genome} \
--fasta {fasta} \
--gtf {gtf} \
--outdir {path_outdir_tfactivity} \
-profile {profile} \
-process.executor {process_executor} \
-process.queue {process_queue} \
-resume
"""

os.chdir(path_nfcore_tfactivity)
os.system(tfactivity_run)
