test v0.1.1

pirl-unc · Aug 1, 2024 · e2ba435 · e2ba435
1 parent c555752
commit e2ba435
Show file tree

Hide file tree

Showing 10 changed files with 285 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -24,6 +24,7 @@ Download the latest stable release [here](https://github.com/pirl-unc/nexus/rele
 ```
 conda create -n nexus python=3.10
 conda activate nexus
+conda install nextflow==23.10.0
 pip install nexus-<version>.tar.gz --verbose
 ```
 
@@ -82,7 +83,7 @@ For more on this particular workflow, check out [here](/src/nexuslib/pipelines/a
 
 ## 05. Documentation for Available Workflows
 
-A list of links to documentation for all available workflows in `v0.1.0` is provided below:
+A list of links to documentation for all available workflows in the latest version is provided below:
 
 | Category                | Workflow                                                                                                                                              |
 |:------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------|
@@ -104,6 +105,7 @@ A list of links to documentation for all available workflows in `v0.1.0` is prov
 | Utilities               | [**fastq_to_unaligned_bam.nf**](/src/nexuslib/pipelines/utilities/fastq_to_unaligned_bam/)                                                            |
 | Utilities               | [**fastqc.nf**](/src/nexuslib/pipelines/utilities/fastqc/)                                                                                            |
 | Utilities               | [**sequencing_coverage.nf**](/src/nexuslib/pipelines/utilities/sequencing_coverage/)                                                                  |
+| Variant calling (DNA)   | [**long_read_dna_variant_calling_clairs.nf**](/src/nexuslib/pipelines/variant_calling/long_read_dna_variant_calling_clairs/)                          |
 | Variant calling (DNA)   | [**long_read_dna_variant_calling_cutesv.nf**](/src/nexuslib/pipelines/variant_calling/long_read_dna_variant_calling_cutesv/)                          |
 | Variant calling (DNA)   | [**long_read_dna_variant_calling_deepvariant.nf**](/src/nexuslib/pipelines/variant_calling/long_read_dna_variant_calling_deepvariant/)                |
 | Variant calling (DNA)   | [**long_read_dna_variant_calling_pbsv.nf**](/src/nexuslib/pipelines/variant_calling/long_read_dna_variant_calling_pbsv/)                              |

diff --git a/nextflow/nexus_v0.1.0_nextflow_slurm.config b/nextflow/nexus_v0.1.0_nextflow_slurm.config
@@ -36,6 +36,12 @@ process {
         memory = '192.GB'
         time = '7days'
     }
+    withLabel: 'clairs' {
+        container = 'hkubal/clairs:v0.3.0'
+        cpus = 48
+        memory = '192.GB'
+        time = '7days'
+    }
     withLabel: 'copy_bam_file' {
         cpus = 2
         memory = '16.GB'
@@ -320,7 +326,7 @@ process {
         container = 'ajslee/svaba:1.2.0'
         cpus = 48
         memory = '192.GB'
-        time = '7days'
+        time = '14days'
     }
     withLabel: 'svim' {
         container = 'ajslee/svim:2.0.0'

diff --git a/pyproject.toml b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
 
 [project]
 name = "nexus"
-version = "0.1.0"
+version = "0.1.1"
 description = "NEXflow's Ultimate Streamliner"
 requires-python = ">=3.10"
 readme = "README.md"

diff --git a/scripts/data/fastq/create_paired-end_read_translocation_fastq_files.py b/scripts/data/fastq/create_paired-end_read_translocation_fastq_files.py
diff --git a/scripts/unittests/unittest_variant_calling_dna_long_reads.sh b/scripts/unittests/unittest_variant_calling_dna_long_reads.sh
@@ -3,7 +3,8 @@ pytest \
   --cov-report=term-missing \
   --cov=nexuslib \
   test/ \
-  -k "test_long_read_dna_variant_calling_cutesv or \
+  -k "test_long_read_dna_variant_calling_clairs or \
+      test_long_read_dna_variant_calling_cutesv or \
       test_long_read_dna_variant_calling_deepvariant_github or \
       test_long_read_dna_variant_calling_pbsv or \
       test_long_read_dna_variant_calling_savana or \

diff --git a/src/nexuslib/pipelines/modules/clairs.nf b/src/nexuslib/pipelines/modules/clairs.nf
@@ -0,0 +1,35 @@
+#!/usr/bin/env nextflow
+
+process runClairs {    // Runs `run_clairs` once per incoming tumor/normal BAM tuple.
+
+    label 'clairs'    // matched by the `withLabel: 'clairs'` entries in the Nextflow config files
+    tag "${sample_id}"
+    debug true
+
+    publishDir(    // declared outputs are copied into ${output_dir}/
+        path: "${output_dir}/",
+        mode: 'copy'
+    )
+
+    input:
+        tuple val(sample_id), path(tumor_bam_file), path(tumor_bam_bai_file), path(normal_bam_file), path(normal_bam_bai_file)
+        path(reference_genome_fasta_file)
+        path(reference_genome_fasta_fai_file)    // staged as an input but never referenced in the command below
+        val(params_clairs)    // extra `run_clairs` arguments, appended verbatim at the end of the command
+        val(output_dir)
+
+    output:
+        tuple val(sample_id), path("${sample_id}_clairs_outputs/"), emit: f    // emits the whole output directory on channel `f`
+
+    script:
+        """
+        mkdir -p ${sample_id}_clairs_outputs/
+        run_clairs \
+            --tumor_bam $tumor_bam_file \
+            --normal_bam $normal_bam_file \
+            --ref_fn $reference_genome_fasta_file \
+            --output_dir ${sample_id}_clairs_outputs/ \
+            --threads ${task.cpus} \
+            $params_clairs
+        """
+}
diff --git a/...xuslib/pipelines/variant_calling/long_read_dna_variant_calling_clairs/README.md b/...xuslib/pipelines/variant_calling/long_read_dna_variant_calling_clairs/README.md
@@ -0,0 +1,79 @@
+## long_read_dna_variant_calling_clairs.nf
+
+Identifies somatic small DNA variants in tumor and normal long-read DNA BAM files using [ClairS](https://github.com/HKU-BAL/ClairS).
+
+### Inputs / Outputs
+
+| I/O    | Description                                   |
+|:-------|:----------------------------------------------|
+| Input  | Tumor and normal `bam` files for each sample. | 
+| Output | `vcf` file for each sample.                   |
+
+### Dependencies
+
+* `ClairS`
+
+### Example
+
+```
+nexus run --nf-workflow long_read_dna_variant_calling_clairs.nf \
+    -c NEXTFLOW_CONFIG_FILE \
+    -w WORK_DIR \
+    --samples_tsv_file SAMPLES_TSV_FILE \
+    --output_dir OUTPUT_DIR \
+    --reference_genome_fasta_file REFERENCE_GENOME_FASTA_FILE \
+    --reference_genome_fasta_fai_file REFERENCE_GENOME_FASTA_FAI_FILE \
+    --params_clairs '"--platform hifi_revio"'
+```
+
+### Usage
+
+```
+workflow:
+    1. Run ClairS.
+
+usage: nexus run --nf-workflow long_read_dna_variant_calling_clairs.nf [required] [optional] [--help]
+
+required arguments:
+    -c                                  :   Nextflow .config file.
+    -w                                  :   Nextflow work directory path.
+    --samples_tsv_file                  :   TSV file with the following columns: 'sample_id', 'tumor_bam_file', 'tumor_bam_bai_file', 'normal_bam_file', 'normal_bam_bai_file'.
+    --output_dir                        :   Directory to which output files will be copied.
+
+optional arguments:
+    --reference_genome_fasta_file       :   Reference genome FASTA file (default: /datastore/lbcfs/collaborations/pirl/seqdata/references/hg38.fa).
+    --reference_genome_fasta_fai_file   :   Reference genome FASTA.FAI file (default: /datastore/lbcfs/collaborations/pirl/seqdata/references/hg38.fa.fai).
+    --params_clairs                     :   ClairS parameters (default: '"--platform hifi_revio"').
+                                            Note that the parameters need to be wrapped in quotes.
+    --delete_work_dir                   :   Delete work directory (default: false).
+```
+
+### Parameters
+
+`-c`
+* Nextflow config file can be downloaded [here](https://github.com/pirl-unc/nexus/tree/main/nextflow)
+
+`--samples_tsv_file`
+
+| Header              | Description                        |
+|---------------------|------------------------------------|
+| sample_id           | Sample ID                          |
+| tumor_bam_file      | Full path to tumor `bam` file      |
+| tumor_bam_bai_file  | Full path to tumor `bam.bai` file  |
+| normal_bam_file     | Full path to normal `bam` file     |
+| normal_bam_bai_file | Full path to normal `bam.bai` file |
+
+`--reference_genome_fasta_file`
+* Reference genome FASTA files can be found in /datastore/lbcfs/collaborations/pirl/seqdata/references/ on LBG.
+
+`--reference_genome_fasta_fai_file`
+* Reference genome FASTA.FAI files can be found in /datastore/lbcfs/collaborations/pirl/seqdata/references/ on LBG.
+
+`--params_clairs`
+* Refer to the [ClairS documentation](https://github.com/HKU-BAL/ClairS).
+* The following parameters for `run_clairs` are already included in `nexus` module for `clairs` and should not be specified:
+  * `--tumor_bam`
+  * `--normal_bam`
+  * `--ref_fn`
+  * `--output_dir`
+  * `--threads`
diff --git a/...iant_calling/long_read_dna_variant_calling_clairs/long_read_dna_variant_calling_clairs.nf b/...iant_calling/long_read_dna_variant_calling_clairs/long_read_dna_variant_calling_clairs.nf
@@ -0,0 +1,109 @@
+#!/usr/bin/env nextflow
+
+nextflow.enable.dsl=2
+
+// Step 1. Import Nextflow modules
+include { runClairs } from '../../modules/clairs'
+
+// Step 2. Declare command-line parameters and their defaults
+params.help = ''
+// Required (empty by default; must be supplied by the caller)
+params.samples_tsv_file = ''
+params.output_dir = ''
+// Optional
+params.reference_genome_fasta_file = '/datastore/lbcfs/collaborations/pirl/seqdata/references/hg38.fa'
+params.reference_genome_fasta_fai_file = '/datastore/lbcfs/collaborations/pirl/seqdata/references/hg38.fa.fai'
+params.params_clairs = '--platform hifi_revio'
+params.delete_work_dir = false
+
+// A `--params_clairs` value equal to `true` is treated as "no extra ClairS
+// arguments"; any other value is forwarded unchanged.
+params_clairs = (params.params_clairs == true) ? '' : params.params_clairs
+
+// Step 3. Print inputs and help
+// Always print the workflow banner first.
+log.info """\
+         ==================================================================================
+         Identify somatic small variants in long-read DNA sequencing BAM files using ClairS
+         ==================================================================================
+         """.stripIndent()
+
+// With --help: print the usage text and stop. Otherwise: echo the resolved inputs.
+if (params.help) {
+    log.info"""\
+    workflow:
+        1. Run ClairS.
+
+    usage: nexus run --nf-workflow long_read_dna_variant_calling_clairs.nf [required] [optional] [--help]
+
+    required arguments:
+        -c                                  :   Nextflow .config file.
+        -w                                  :   Nextflow work directory path.
+        --samples_tsv_file                  :   TSV file with the following columns: 'sample_id', 'tumor_bam_file', 'tumor_bam_bai_file', 'normal_bam_file', 'normal_bam_bai_file'.
+        --output_dir                        :   Directory to which output files will be copied.
+
+    optional arguments:
+        --reference_genome_fasta_file       :   Reference genome FASTA file (default: /datastore/lbcfs/collaborations/pirl/seqdata/references/hg38.fa).
+        --reference_genome_fasta_fai_file   :   Reference genome FASTA.FAI file (default: /datastore/lbcfs/collaborations/pirl/seqdata/references/hg38.fa.fai).
+        --params_clairs                     :   ClairS parameters (default: '"--platform hifi_revio"').
+                                                Note that the parameters need to be wrapped in quotes.
+        --delete_work_dir                   :   Delete work directory (default: false).
+    """.stripIndent()
+    exit 0
+} else {
+    // `params_clairs` is the normalised value, not the raw `params.params_clairs`.
+    log.info"""\
+        samples_tsv_file                    :   ${params.samples_tsv_file}
+        output_dir                          :   ${params.output_dir}
+        reference_genome_fasta_file         :   ${params.reference_genome_fasta_file}
+        reference_genome_fasta_fai_file     :   ${params.reference_genome_fasta_fai_file}
+        params_clairs                       :   ${params_clairs}
+        delete_work_dir                     :   ${params.delete_work_dir}
+    """.stripIndent()
+}
+
+// Step 4. Set channels
+Channel
+    .fromPath( params.samples_tsv_file )
+    .splitCsv( header: true, sep: '\t' )    // tab-separated; the first line supplies the column names
+    .map { row -> tuple(    // one tuple per TSV row, each field interpolated into a string
+        "${row.sample_id}",
+        "${row.tumor_bam_file}",
+        "${row.tumor_bam_bai_file}",
+        "${row.normal_bam_file}",
+        "${row.normal_bam_bai_file}") }
+    .set { input_bam_files_ch }    // passed to LONG_READ_DNA_VARIANT_CALLING_CLAIRS by the entry workflow
+
+// Step 5. Workflow
+workflow LONG_READ_DNA_VARIANT_CALLING_CLAIRS {    // Named workflow: forwards its five inputs to runClairs unchanged.
+    take:
+        input_bam_files_ch             // channel: [val(sample_id), path(tumor_bam_file), path(tumor_bam_bai_file), path(normal_bam_file), path(normal_bam_bai_file)]
+        reference_genome_fasta_file
+        reference_genome_fasta_fai_file
+        params_clairs
+        output_dir
+
+    main:
+        runClairs(    // argument order follows the order of the `take:` declarations above
+            input_bam_files_ch,
+            reference_genome_fasta_file,
+            reference_genome_fasta_fai_file,
+            params_clairs,
+            output_dir
+        )
+}
+
+workflow {    // Entry workflow: wires the samples channel and the parameters into the named workflow.
+    LONG_READ_DNA_VARIANT_CALLING_CLAIRS(
+        input_bam_files_ch,
+        params.reference_genome_fasta_file,
+        params.reference_genome_fasta_fai_file,
+        params_clairs,    // normalised value, not the raw params.params_clairs
+        params.output_dir
+    )
+}
+
+workflow.onComplete {    // Completion handler: optionally removes the Nextflow work directory.
+    if ( params.delete_work_dir == true || params.delete_work_dir == 1 ) {    // opt-in via --delete_work_dir (compared against true and 1)
+        workflow.workDir.deleteDir()
+    }
+}
diff --git a/test/data/nextflow/nextflow_test_docker.config b/test/data/nextflow/nextflow_test_docker.config
@@ -28,6 +28,12 @@ process {
         memory = '4.GB'
         time = '1hour'
     }
+    withLabel: 'clairs' {
+        container = 'hkubal/clairs:v0.3.0'
+        cpus = 2
+        memory = '4.GB'
+        time = '1hour'
+    }
     withLabel: 'copy_bam_file' {
         cpus = 2
         memory = '4.GB'

diff --git a/test/variant_calling/test_long_read_dna_variant_calling_clairs.py b/test/variant_calling/test_long_read_dna_variant_calling_clairs.py
@@ -0,0 +1,43 @@
+import pandas as pd
+import os
+from nexuslib.main import run_workflow
+from ..data import get_data_path
+
+
+def test_long_read_dna_variant_calling_clairs():
+    nextflow_config_file = get_data_path(name='nextflow/nextflow_test_docker.config')
+    tumor_dna_bam_file = get_data_path(name='bam/hg38_tp53_tumor_long_read_dna.bam')
+    tumor_dna_bam_bai_file = get_data_path(name='bam/hg38_tp53_tumor_long_read_dna.bam.bai')
+    normal_dna_bam_file = get_data_path(name='bam/hg38_tp53_normal_long_read_dna.bam')
+    normal_dna_bam_bai_file = get_data_path(name='bam/hg38_tp53_normal_long_read_dna.bam.bai')
+    reference_genome_fasta_file = get_data_path(name='fasta/hg38_chr17_1-8000000.fa')
+    reference_genome_fasta_fai_file = get_data_path(name='fasta/hg38_chr17_1-8000000.fa.fai')
+    temp_dir = os.getcwd() + '/tmp'
+    intermediate_dir = temp_dir + '/intermediate/test_long_read_dna_variant_calling_clairs'
+    work_dir = temp_dir + '/work/test_long_read_dna_variant_calling_clairs'
+    output_dir = temp_dir + '/outputs/test_long_read_dna_variant_calling_clairs'
+    if not os.path.exists(intermediate_dir):
+        os.makedirs(intermediate_dir)
+    if not os.path.exists(work_dir):
+        os.makedirs(work_dir)
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    pd.DataFrame({
+        'sample_id': ['tumor'],
+        'tumor_bam_file': [tumor_dna_bam_file],
+        'tumor_bam_bai_file': [tumor_dna_bam_bai_file],
+        'normal_bam_file': [normal_dna_bam_file],
+        'normal_bam_bai_file': [normal_dna_bam_bai_file]
+    }).to_csv(intermediate_dir + "/samples.tsv", sep='\t', index=False)
+    workflow_args = [
+        '-c', nextflow_config_file,
+        '-w', work_dir,
+        '--samples_tsv_file', intermediate_dir + '/samples.tsv',
+        '--reference_genome_fasta_file', reference_genome_fasta_file,
+        '--reference_genome_fasta_fai_file', reference_genome_fasta_fai_file,
+        '--params_clairs', '"--platform hifi_revio"',
+        '--output_dir', output_dir,
+    ]
+    run_workflow(workflow='long_read_dna_variant_calling_clairs.nf',
+                 nextflow='nextflow',
+                 workflow_args=workflow_args)