Several fixes after running a full dataset #24

Open · wants to merge 14 commits into base: dev
bin/DYNAMITE.R: 6 changes (4 additions, 2 deletions)

@@ -186,12 +186,14 @@ for(Sample in FileList){
Test_Data<-c()
Train_Data<-c()
for (j in 1:length(subM)){
-rndselect=sample(x=nrow(subM[[j]]), size=mSize/test_size)
+# Test on a single example if dataset size is too small
+rndselect=sample(x=nrow(subM[[j]]), size=ifelse(mSize/test_size < 1, 1, mSize/test_size))
Test_Data<-rbind(Test_Data,subM[[j]][rndselect,])
Train_Data<-rbind(Train_Data,subM[[j]][-rndselect,])
}
}else{
-rndselect=sample(x=nrow(M),size=as.numeric(argsL$testsize)*nrow(M))
+# Test on a single example if dataset size is too small
+rndselect=sample(x=nrow(M),size=ifelse(as.numeric(argsL$testsize)*nrow(M) < 1, 1, as.numeric(argsL$testsize)*nrow(M)))
Comment on lines -194 to +196

Collaborator:
This file is a copy from here and I would like to keep it identical if possible.

Contributor Author:
We could open a PR to their repository; however, the tool does not appear to be actively maintained. The last open PR is from 2020 and still unanswered.

The changes improve the tool's robustness when handling small dataset sizes, which makes them essential for a reliable pipeline and for the run on our lactation data.

Test_Data<-M[rndselect,]
Train_Data<-M[-rndselect,]
}
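For context, the clamping added above only ensures that at least one row is sampled for the test set, since truncating a fractional size can otherwise yield an empty test set on small datasets. A minimal Python sketch of the same guard (function and variable names are hypothetical, mirroring the second branch above):

def test_set_size(n_rows: int, test_fraction: float) -> int:
    # Truncating a fractional size can produce 0 rows, so clamp to at least 1.
    return max(int(n_rows * test_fraction), 1)

assert test_set_size(3, 0.2) == 1     # small dataset: still one test row
assert test_set_size(100, 0.2) == 20  # normal case is unchanged
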
conf/modules.config: 13 changes (12 additions, 1 deletion)

@@ -34,11 +34,17 @@ process {
}

withName: SORT_BED {
ext.args = "-k1,1 -k2,2n"
ext.args = "-V -k1,1 -k2,2n"
ext.prefix = {"${meta.id}_sorted"}
ext.suffix = "bed"
}

+withName: SORT_CHROM_SIZES {
+ext.args = "-V -k1,1 -k2,2n"
+ext.prefix = {"${meta.id}_sorted"}
+ext.suffix = "fa.fai"
+}

withName: CONSTRUCT_TSS {
ext.args = "-b ${params.rose_tss_window}"
ext.prefix = "tss"
@@ -119,6 +125,11 @@ process {
ext.suffix = "tsv"
}

withName: "RUN_DYNAMITE" {
//Produces an error if the input is too small and ends with exitStatus 139
errorStrategy = { task.exitStatus == 139 ? 'ignore' : 'retry' }
}

withName: ".*RANKING:CREATE_RANKING" {
publishDir = [
path: { "${params.outdir}/specific_ranking" },
modules/local/chromhmm/binarize_bams/main.nf: 3 changes (2 additions, 1 deletion)

@@ -22,7 +22,8 @@ process BINARIZE_BAMS {
$chromsizes \\
input \\
$table \\
-output
+output \\
+-Xmx${task.memory.toMega()}M

cat <<-END_VERSIONS > versions.yml
"${task.process}":
modules/local/chromhmm/learn_model/main.nf: 3 changes (2 additions, 1 deletion)

@@ -24,7 +24,8 @@ process LEARN_MODEL {
input \\
output \\
$states \\
-PLACEHOLDER
+PLACEHOLDER \\
+-Xmx${task.memory.toMega()}M

cat <<-END_VERSIONS > versions.yml
"${task.process}":
@@ -34,7 +34,10 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
df_lengths = df_lengths / 1e3
df_lengths = df_lengths.groupby(df_lengths.index).mean()

-df_lengths = df_lengths.loc[df_counts.index]
+# Subset gene lengths and counts to common index
+shared_index = df_lengths.index.intersection(df_counts.index)
+df_lengths = df_lengths.loc[shared_index]
+df_counts = df_counts.loc[shared_index]

# Calculate TPM
df_rpk = df_counts.div(df_lengths["length"], axis=0)
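For reference, the surrounding template computes TPM from these two frames; a self-contained pandas sketch of that computation on toy data (frame and column names assumed to match the template, lengths already in kilobases):

import pandas as pd

# Toy inputs: raw counts per sample and gene lengths in kilobases.
df_counts = pd.DataFrame({"s1": [10, 200], "s2": [5, 100]}, index=["g1", "g2"])
df_lengths = pd.DataFrame({"length": [2.0, 4.0]}, index=["g1", "g2"])

# Subset both frames to their shared gene index, as in the fix above.
shared_index = df_lengths.index.intersection(df_counts.index)
df_lengths = df_lengths.loc[shared_index]
df_counts = df_counts.loc[shared_index]

# Reads per kilobase, then scale each sample so its values sum to one million.
df_rpk = df_counts.div(df_lengths["length"], axis=0)
df_tpm = df_rpk.div(df_rpk.sum(axis=0), axis=1) * 1e6
print(df_tpm)  # each column sums to 1e6
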
@@ -38,6 +38,10 @@ def remove_version(gene_id):
df_affinities = df_affinities.loc[gene_intersection]
df_expression = df_expression.loc[gene_intersection]

+# Aggregate duplicated genes from version clipping
+df_affinities = df_affinities.groupby(df_affinities.index).mean()
+df_expression = df_expression.groupby(df_expression.index).mean()

df_affinities["Expression"] = 0
df_affinities.loc[df_expression["log2FoldChange"] > 0, "Expression"] = 1

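A short sketch of why this aggregation is needed: clipping Ensembl version suffixes can map two versioned IDs onto the same gene, and the resulting duplicate index labels prevent pandas from aligning the frames reliably; averaging the duplicates restores a unique index (the IDs below are made up):

import pandas as pd

def remove_version(gene_id):
    # Strip the trailing ".N" Ensembl version suffix.
    return gene_id.split(".")[0]

df = pd.DataFrame({"affinity": [0.2, 0.4]},
                  index=["ENSG00000000001.14", "ENSG00000000001.15"])
df.index = df.index.map(remove_version)

# Both rows now carry the same label; average them into one.
df = df.groupby(df.index).mean()
print(df)  # a single row with affinity 0.3
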
modules/local/fimo/combine_results/main.nf: 2 changes (1 addition, 1 deletion)

@@ -7,7 +7,7 @@ process COMBINE_RESULTS {
'biocontainers/python:3.9--1' }"

input:
-tuple val(meta), path(motif_files)
+tuple val(meta), path(motif_files, stageAs: "fimo/*")

output:
tuple val(meta), path("${meta.id}.tsv"), emit: tsv
modules/local/fimo/combine_results/templates/combine_results.py: 39 changes (18 additions, 21 deletions)
@@ -1,5 +1,6 @@
#!/usr/bin/env python3

+import os
import platform

def format_yaml_like(data: dict, indent: int = 0) -> str:
@@ -14,38 +15,34 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
"""
yaml_str = ""
for key, value in data.items():
spaces = " " * indent
spaces = " " * indent

Collaborator:
I guess this was not on purpose.

Contributor Author:
I think four spaces are the default for nf-core modules.

        if isinstance(value, dict):
            yaml_str += f"{spaces}{key}:\\n{format_yaml_like(value, indent + 1)}"
        else:
            yaml_str += f"{spaces}{key}: {value}\\n"
    return yaml_str


output_dirs = "${motif_files}".split(',')
output_dirs = [os.path.join('fimo', d) for d in os.listdir('fimo') if os.path.isdir(os.path.join('fimo', d))]

tsvs = []
gffs = []
for output in output_dirs:
with open(f'{output}/fimo.tsv', 'r') as f:
tsv = f.read().split('\\n')
with open(f'{output}/fimo.gff', 'r') as f:
gff = f.read().split('\\n')
output_tsv = "${meta.id}.tsv"
output_gff = "${meta.id}.gff"

tsvs.extend(tsv)
gffs.extend(gff)
with open(output_tsv, 'w') as tsv_out, open(output_gff, 'w') as gff_out:
tsv_out.write('motif_id\\tmotif_alt_id\\tsequence_name\\tstart\\tstop\\tstrand\\tscore\\tp-value\\tq-value\\tmatched_sequence\\n')

tsvs = [line for line in tsvs if not line.startswith('#') and not line.startswith('motif_id') and not line == '']
gffs = [line for line in gffs if not line.startswith('#') and not line == '']

tsvs = ['motif_id\\tmotif_alt_id\\tsequence_name\\tstart\\tstop\\tstrand\\tscore\\tp-value\\tq-value\\tmatched_sequence'] + tsvs

with open('${meta.id}.tsv', 'w') as f:
f.write('\\n'.join(tsvs))

with open('${meta.id}.gff', 'w') as f:
f.write('\\n'.join(gffs))
for output in output_dirs:
with open(f"{output}/fimo.tsv", "r") as f:
for line in f:
line = line.strip()
if line and not line.startswith('#') and not line.startswith('motif_id'):
tsv_out.write(line + "\\n")

with open(f"{output}/fimo.gff", "r") as f:
for line in f:
line = line.strip()
if line and not line.startswith('#'):
gff_out.write(line + "\\n")

# Create version file
versions = {
modules/local/ranking/tf_tg_score/templates/tf_tg_score.py: 4 changes (4 additions, 0 deletions)

@@ -40,6 +40,10 @@ def remove_version(gene_id):
df_affinities = df_affinities.loc[gene_intersection]
df_differential = df_differential.loc[gene_intersection]

+# Aggregate duplicated genes from version clipping
+df_affinities = df_affinities.groupby(df_affinities.index).mean()
+df_differential = df_differential.groupby(df_differential.index).mean()

# Make sure TFs are in common between the affinities and coefficients files
tf_intersection = df_affinities.columns.intersection(df_coefficients.index)
assert len(tf_intersection) > 0, "No TFs found in common between the affinities and coefficients files"
modules/local/report/create/main.nf: 2 changes (1 addition, 1 deletion)
@@ -1,7 +1,7 @@
import groovy.json.JsonOutput

process CREATE {
label "process_low"
label "process_medium"

conda "bioconda::mulled-v2-ab48c38c3be93a696d7773767d9287b4a0d3bf19==e3c8a1ac0a27058d7922e8b6d02f303c30d93e3a-0"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
subworkflows/local/rose.nf: 9 changes (7 additions, 2 deletions)
@@ -1,5 +1,6 @@
include { GAWK as FILTER_CONVERT_GTF } from '../../modules/nf-core/gawk'
include { GNU_SORT as SORT_BED } from '../../modules/nf-core/gnu/sort'
+include { GNU_SORT as SORT_CHROM_SIZES } from '../../modules/nf-core/gnu/sort'
include { BEDTOOLS_SLOP as CONSTRUCT_TSS } from '../../modules/nf-core/bedtools/slop'
include { BEDTOOLS_SUBTRACT as FILTER_PREDICTIONS } from '../../modules/nf-core/bedtools/subtract'
include { BEDTOOLS_COMPLEMENT as INVERT_TSS } from '../../modules/nf-core/bedtools/complement'
@@ -26,10 +27,13 @@ workflow ROSE {
// Downstream methods require sorted inputs
SORT_BED(FILTER_CONVERT_GTF.out.output)

+// Sort chrom_sizes to have same ordering as bed file
+SORT_CHROM_SIZES(chrom_sizes)
+
// Construct 2 * params.rose_tss_window bps window around transcription start site (TSS)
-CONSTRUCT_TSS(SORT_BED.out.sorted, chrom_sizes.map{meta, file -> file})
+CONSTRUCT_TSS(SORT_BED.out.sorted, SORT_CHROM_SIZES.out.sorted.map{meta, file -> file})

-INVERT_TSS(CONSTRUCT_TSS.out.bed, chrom_sizes.map{meta, file -> file})
+INVERT_TSS(CONSTRUCT_TSS.out.bed, SORT_CHROM_SIZES.out.sorted.map{meta, file -> file})

predicted_regions = ch_bed.branch{
meta, file ->
@@ -87,6 +91,7 @@ workflow ROSE {
ch_versions = ch_versions.mix(
FILTER_CONVERT_GTF.out.versions,
SORT_BED.out.versions,
+SORT_CHROM_SIZES.out.versions,
CONSTRUCT_TSS.out.versions,
INVERT_TSS.out.versions,
FILTER_PREDICTIONS.out.versions,
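As background on the -V flag wired into SORT_BED and SORT_CHROM_SIZES above: GNU sort's version sort places chr2 before chr10, and this change sorts the chromosome sizes file the same way so its ordering matches the sorted BED file, as the workflow comment notes. A rough Python illustration of the difference (helper name is hypothetical):

import re

def natural_key(chrom):
    # Split into text and digit runs so numbers compare numerically.
    return [int(part) if part.isdigit() else part for part in re.split(r"(\d+)", chrom)]

chroms = ["chr10", "chr2", "chr1", "chrX"]
print(sorted(chroms))                   # ['chr1', 'chr10', 'chr2', 'chrX'] (lexicographic)
print(sorted(chroms, key=natural_key))  # ['chr1', 'chr2', 'chr10', 'chrX'] (version-style)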