Merge pull request #6 from nf-core/motif-files

Simplify handling of motif files
nf-core · May 30, 2024 · 31b5c43 · 31b5c43
2 parents 9c629d9 + 2656f79
commit 31b5c43
Show file tree

Hide file tree

Showing 22 changed files with 474 additions and 466 deletions.
diff --git a/conf/igenomes.config b/conf/igenomes.config
diff --git a/main.nf b/main.nf
@@ -33,7 +33,11 @@ include { PREPARE_GENOME          } from './subworkflows/local/prepare_genome'
 params.fasta     = getGenomeAttribute('fasta')
 params.gtf       = getGenomeAttribute('gtf')
 params.blacklist = getGenomeAttribute('blacklist')
-params.pwms      = getGenomeAttribute('pwms')
+params.taxon_id  = getGenomeAttribute('taxon_id')
+
+if (!params.motifs && !params.taxon_id) {
+    error "Please provide either a motifs file or a taxon ID"
+}
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -55,11 +59,12 @@ workflow NFCORE_TFACTIVITY {
 
     ch_versions = Channel.empty()
 
-    ch_fasta = Channel.value(file(params.fasta))
-    ch_gtf   = Channel.value(file(params.gtf))
-    ch_blacklist = Channel.value(file(params.blacklist))
-    ch_pwms  = Channel.value(file(params.pwms))
-    ch_counts = Channel.value(file(params.counts))
+    ch_fasta = Channel.value(file(params.fasta, checkIfExists: true))
+    ch_gtf   = Channel.value(file(params.gtf, checkIfExists: true))
+    ch_blacklist = params.blacklist ? Channel.value(file(params.blacklist, checkIfExists: true)) : Channel.value([])
+    ch_motifs  = params.motifs ? Channel.value(file(params.motifs, checkIfExists: true)) : Channel.empty()
+    ch_counts = Channel.value(file(params.counts, checkIfExists: true))
+    ch_taxon_id = (!params.motifs && params.taxon_id) ? Channel.value(params.taxon_id) : Channel.empty()
 
     //
     // SUBWORKFLOW: Prepare genome
@@ -81,34 +86,42 @@ workflow NFCORE_TFACTIVITY {
         PREPARE_GENOME.out.fasta,
         PREPARE_GENOME.out.gtf,
         ch_blacklist,
-        ch_pwms,
+        ch_motifs,
+        ch_taxon_id,
         PREPARE_GENOME.out.gene_lengths,
         PREPARE_GENOME.out.gene_map,
-        ch_counts,
-        ch_extra_counts,
-        Channel.value(file(params.counts_design, checkIfExists: true))
-            .map{ design -> [[id: "design"], design]},
-        samplesheet_bam,
         PREPARE_GENOME.out.chrom_sizes,
+
+        // ChromHMM
+        samplesheet_bam,
         params.chromhmm_states,
         params.chromhmm_threshold,
         params.chromhmm_marks.split(','),
+
+        // Peaks
         params.window_size,
         params.decay,
         params.merge_samples,
         params.affinity_aggregation,
 
+        // Counts
+        ch_counts,
+        ch_extra_counts,
+        Channel.value(file(params.counts_design, checkIfExists: true))
+            .map{ design -> [[id: "design"], design]},
         params.min_count,
         params.min_tpm,
         params.expression_aggregation,
         params.min_count_tf,
         params.min_tpm_tf,
 
+        // Dynamite
         params.dynamite_ofolds,
         params.dynamite_ifolds,
         params.dynamite_alpha,
         params.dynamite_randomize,
 
+        // Ranking
         params.alpha,
 
         ch_versions

diff --git a/modules/local/fimo/filter_motifs/main.nf b/modules/local/fimo/filter_motifs/main.nf
@@ -1,23 +1,26 @@
 process FILTER_MOTIFS {
+    // Selects the motifs that belong to the TFs listed in `tfs_jaspar_ids`.
+    // The filter_motifs.py template splits the combined MEME file and writes
+    // one MEME file per selected motif into the motifs/ directory.
 
-    conda 'conda-forge::python==3.9.5'
+    conda "conda-forge::pandas==1.5.2"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/python:3.9--1':
-        'biocontainers/python:3.9--1' }"
+        'https://depot.galaxyproject.org/singularity/pandas:1.5.2':
+        'biocontainers/pandas:1.5.2' }"
 
     input:
+        // Tab-separated TF table; the template reads the TF symbols from its first column
         tuple val(meta), path(tfs_jaspar_ids)
-        path jaspar_motifs
+        // Single MEME file holding all candidate motifs
+        tuple val(meta2), path(meme_motifs)
 
     output:
-        tuple val(meta), path("sign_motifs/*.meme"), emit: motifs
-        path "versions.yml",                         emit: versions
+        tuple val(meta), path("motifs/*.meme"), emit: motifs
+        path "versions.yml",                    emit: versions
 
     script:
     template "filter_motifs.py"
 
     stub:
     """
-    touch motifs.meme
+    mkdir motifs
+    touch motifs/MA0778.1.meme
+    touch motifs/MA0938.3.meme
+    touch motifs/MA1272.1.meme
+    # versions.yml is a declared (non-optional) output, so the stub must create it as well
+    touch versions.yml
     """
 }
diff --git a/modules/local/fimo/filter_motifs/templates/filter_motifs.py b/modules/local/fimo/filter_motifs/templates/filter_motifs.py
@@ -1,9 +1,41 @@
 #!/usr/bin/env python3
 
 from os import mkdir
-from os.path import exists
-from shutil import copy
+import pandas as pd
 import platform
+from collections import defaultdict
+
+
+def parse_meme_file(path_meme_file):
+    """
+    Split a multi-motif MEME file into one standalone MEME text per motif.
+
+    Every line before the first MOTIF line is treated as the shared header and
+    is prepended to each motif, so that every returned text is a complete file.
+
+    Args:
+        path_meme_file: Path of the MEME file to read.
+
+    Returns:
+        A tuple (meme_to_matrix, symbol_to_meme). meme_to_matrix maps each motif
+        identifier to its header + motif text. symbol_to_meme is a defaultdict
+        mapping each motif symbol to the set of identifiers carrying that symbol.
+
+    Raises:
+        ValueError: If a MOTIF line carries no identifier.
+    """
+    with open(path_meme_file, "r") as f:
+        meme_file = f.read()
+
+    # NOTE(review): the doubled backslash is presumably unescaped by the
+    # template engine before Python runs - confirm before changing it.
+    lines = meme_file.split('\\n')
+    header = []
+    meme_to_matrix = {}
+    symbol_to_meme = defaultdict(set)
+    current_motif = []
+    current_motif_meme = ""
+    is_header = True
+
+    for line in lines:
+        if line.startswith("MOTIF"):
+            # List not empty -> not first motif
+            if current_motif:
+                meme_to_matrix[current_motif_meme] = '\\n'.join(header + current_motif)
+                current_motif = []
+            fields = line.split()
+            if len(fields) < 2:
+                raise ValueError(f"MOTIF line without identifier: {line!r}")
+            current_motif_meme = fields[1]
+            # The alternate name is optional in the MEME format; fall back to
+            # the identifier so that such motifs stay addressable.
+            current_motif_symbol = fields[2] if len(fields) > 2 else current_motif_meme
+            symbol_to_meme[current_motif_symbol].add(current_motif_meme)
+            is_header = False
+        if is_header:
+            header.append(line)
+        else:
+            current_motif.append(line)
+
+    # Flush the last motif, which is not followed by another MOTIF line
+    if current_motif:
+        meme_to_matrix[current_motif_meme] = '\\n'.join(header + current_motif)
+
+    return meme_to_matrix, symbol_to_meme
 
 
 def format_yaml_like(data: dict, indent: int = 0) -> str:
@@ -26,28 +58,39 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
     return yaml_str
 
 
-tfs_jaspar_ids = "${tfs_jaspar_ids}"
-jaspar_motifs = "${jaspar_motifs}"
+# Input paths; the placeholders are named after the process inputs
+# (presumably substituted by the template engine before Python runs).
+tfs_ranking_file = '${tfs_jaspar_ids}'
+path_meme_file = '${meme_motifs}'
+
 
-# Read differentially expressed (DE) transcription factors (TF)
-with open(tfs_jaspar_ids, "r") as f:
-    tfs_jaspar_ids = f.read().split('\\n')
+# Parse tfs_ranking
+# Only the index (first column) of the tab-separated table is used; its
+# entries are the symbols looked up in the MEME file below.
+tfs_ranking = pd.read_csv(tfs_ranking_file, sep='\\t', index_col=0).index.tolist()
 
-# Create directory for significant motif files
-mkdir("sign_motifs")
+# Parse meme file
+meme_to_matrix, symbol_to_meme = parse_meme_file(path_meme_file)
 
-# Iterate over TFs and store meme files for DE TFs
-for jaspar_id in tfs_jaspar_ids:
-    if exists(f"jaspar_motifs/{jaspar_id}.meme"):
-        copy(f"jaspar_motifs/{jaspar_id}.meme", f"sign_motifs/{jaspar_id}.meme")
+# Write one MEME file per motif of every symbol that can be resolved
+mkdir('motifs')
+for symbol in tfs_ranking:
+    if symbol not in symbol_to_meme:
+        # Check if symbol without version is in dictionary
+        base_symbol = symbol.split('.')[0]
+        if base_symbol not in symbol_to_meme:
+            # Unresolvable symbols are reported and skipped, not treated as errors
+            print(f'Symbol {symbol} not found')
+            continue
+        # Remove version from symbol
+        symbol = base_symbol
+    # A symbol may map to several motif identifiers; each gets its own file
+    for meme_id in symbol_to_meme[symbol]:
+        with open(f'motifs/{meme_id}.meme', 'w') as f:
+            f.write(meme_to_matrix[meme_id])
 
 
 # Create version file
 versions = {
     "${task.process}" : {
-        "python": platform.python_version()
+        "python": platform.python_version(),
+        "pandas": pd.__version__,
     }
 }
 
+# Write version file
 with open("versions.yml", "w") as f:
     f.write(format_yaml_like(versions))
diff --git a/modules/local/fimo/jaspar_download/main.nf b/modules/local/fimo/jaspar_download/main.nf
diff --git a/modules/local/fimo/jaspar_mapping/main.nf b/modules/local/fimo/jaspar_mapping/main.nf
diff --git a/modules/local/fimo/jaspar_mapping/templates/jaspar_mapping.py b/modules/local/fimo/jaspar_mapping/templates/jaspar_mapping.py
diff --git a/modules/local/motifs/convert_motifs/main.nf b/modules/local/motifs/convert_motifs/main.nf
@@ -0,0 +1,21 @@
+process CONVERT_MOTIFS {
+    // Converts a motif file from `in_type` to `out_type`.
+    // The conversion itself lives in templates/convert.R (not shown here);
+    // presumably it uses the universalmotif package provided by the environment.
+    tag "$meta.id"
+    label "process_single"
+
+    // Conda specs use `channel::package=version`; the `--<build>` suffix is
+    // container-tag syntax and only belongs in the image names below.
+    conda "bioconda::bioconductor-universalmotif=1.20.0"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/bioconductor-universalmotif:1.20.0--r43hf17093f_0':
+        'biocontainers/bioconductor-universalmotif:1.20.0--r43hf17093f_0' }"
+
+    input:
+    // in_type: format of in_file
+    tuple val(meta), path(in_file), val(in_type)
+    // out_type: target format; also used as the output file extension
+    val(out_type)
+
+    output:
+    tuple val(meta), path("${out_file}"), emit: converted
+    path "versions.yml"                 , emit: versions
+
+    script:
+    // Assigned without `def`, presumably so that the output block above can resolve it
+    out_file = "${meta.id}.converted.${out_type}"
+    template "convert.R"
+}