diff --git a/.gitignore b/.gitignore index e7a4bef..920fef1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ .nextflow* work/ .DS_Store +nextflow_*.config diff --git a/Dockerfile b/Dockerfile index 1cdfe40..30d636d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,7 +19,7 @@ COPY requirements.yml /requirements.yml RUN /usr/miniconda3/bin/conda env create -f /requirements.yml && \ /usr/miniconda3/bin/conda clean -a -RUN git clone https://github.com/ChrisHIV/shiver.git +RUN git clone -b v1.5.8 https://github.com/ChrisHIV/shiver.git ENV PATH=/usr/miniconda3/envs/time_analysis/bin/:/shiver:/shiver/tools:$PATH diff --git a/README.md b/README.md index ece04df..d25fafe 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,13 @@ # TIME_pipeline -Determining time of infection for human immunodeficiency viruses +Determining time of infection for human immunodeficiency viruses. + +The analysis builds upon the [Shiver tool](https://github.com/ChrisHIV/shiver) for mapping paired-end short reads to a +custom reference sequence constructed using do novo assembled contigs. Base frequencies from the alignment are then +used to calculate the time of infection based on the accumulated mutations in the pol gene as described in the +publication by Puller et al. + +> Neher R, Albert J (2017), Estimating time of HIV-1 infection from next-generation sequence diversity. +PLoS Comput Biol 13(10): e1005775. https://doi.org/10.1371/journal.pcbi.1005775 ## Installation and set up @@ -21,7 +29,7 @@ Important settings to change include: | Settings | Description | | ------------------------- | --------------------------------------------------------------- | -| hostGenome | Directory with human reference genome database, indexed for bwa | +| hostGenome | Directory with human reference genome database (bwa) | | hostGenomeBase | The name (base) of the database files | | cache | Directory for cache files | | process.clusterOptions | Cluster options if using slurm for execution | @@ -29,6 +37,14 @@ Important settings to change include: In addition, default settings for primers, adapters and similar configurations are described in more detail [here](data/README.md). +### Download host reference genome +Set `nextflow.config` parameter `hostFasta` to the path of the host reference genome of your choice. +Then set up the host database with the following command. The database will be placed in the `hostGenome` directory +and will be namned as `hostGenomeBase`. +``` +nextflow run main.nf --setup -profile slurm,singularity --outdir +``` + ### Run Shiver initialisation Shiver initilisation directories are included in this repository. Information on these are available [here](data/README.md). To create your own initilisation directory, run the following command: diff --git a/bin/calculate_eti.py b/bin/calculate_eti.py index 5469d70..77e6491 100755 --- a/bin/calculate_eti.py +++ b/bin/calculate_eti.py @@ -59,17 +59,11 @@ def get_region_data(base_frequency_file, pos_start, start_base, pos_fin, step): "HXB2_position": str, "Reference_position": str, "Reference_Base": str, - "A": float, - "C": float, - "G": float, - "T": float, - "gap": float, - "N": float, } all_data = pd.read_csv(f, header=0, names=headers) - all_data.astype(columns) + converted_data = all_data.astype(columns, copy=True, errors='raise') positions_of_interest = range(pos_start + start_base - 1, pos_fin + 1, step) - pol = all_data[all_data["HXB2_position"].isin(str(i) for i in positions_of_interest)] + pol = converted_data[converted_data["HXB2_position"].isin(str(i) for i in positions_of_interest)] return pol diff --git a/configs/executor_options.config b/configs/executor_options.config index ca62a28..1f7cd74 100644 --- a/configs/executor_options.config +++ b/configs/executor_options.config @@ -1,4 +1,19 @@ process { + withLabel: version { + cpus = 1 + memory = '1 GB' + time = '10m' + } + withLabel: info { + cpus = 1 + memory = '1 GB' + time = '10m' + } + withLabel: buildDatabase { + cpus = 1 + memory = '8 GB' + time = '30m' + } withLabel: trimming { cpus = 6 memory = '10 GB' diff --git a/configs/executor_options_large.config b/configs/executor_options_large.config index 20b816e..a7de7f7 100644 --- a/configs/executor_options_large.config +++ b/configs/executor_options_large.config @@ -1,4 +1,19 @@ process { + withLabel: version { + cpus = 1 + memory = '1 GB' + time = '10m' + } + withLabel: info { + cpus = 1 + memory = '1 GB' + time = '10m' + } + withLabel: buildDatabase { + cpus = 1 + memory = '8 GB' + time = '30m' + } withLabel: trimming { cpus = 6 memory = '20 GB' diff --git a/main.nf b/main.nf index f17af80..374ce3d 100644 --- a/main.nf +++ b/main.nf @@ -5,38 +5,47 @@ nextflow.enable.dsl=2 def help() { log.info""" - nextflow run main.nf - - Required options: - -profile Comma-separated list of run profiles to use: local,slurm,docker,singularity - --input Input directory with fastq files - --outdir Directory for results - --ticket Batch name - - Additional options: - --primers Primers used during amplification [Default:?] - --adapters Sequencing adapters [Default:?] - --initDir Shiver initialization directory for configurations - --config Shiver configuration file [Default:?] - --min_cov_eti Coverage threshold in time of infection calculations [Default: 300] - --large Use more computing resources - --help Display this help message and exit - - Initialisation of Shiver directory: - nextflow run main.nf --init - - Required options: - -profile Comma-separated list of run profiles to use: local,slurm,docker,singularity - --primers Primers used during amplification [Default:?] - --adapters Sequencing adapters [Default:?] - --config Shiver configuration file [Default:?] - --references HIV reference dataset + Run the analysis: + nextflow run main.nf -profile --input --outdir --ticket + + Initialize the Shiver input: + nextflow run main.nf --init -profile --outdir + + Create database for host genome: + nextflow run main.nf --setup -profile --outdir Nextflow options: - Specify config file with Nextflow options: - -c Path to additional config file [Default: Nextflow uses nextflow.config in current and script - directory in addition to configurations in ~/.nextflow/config] - -C Path to config file. Other config files are ignored. + -c Path to additional config file [Default: Nextflow uses nextflow.config in current and script + directory in addition to configurations in ~/.nextflow/config] + -C Path to config file. Other config files are ignored. + + Options: + -profile Comma-separated list of run profiles to use: local,slurm,docker,singularity + --input Input directory with fastq files + --outdir Directory for results + --ticket Batch name + --primers Primers used during amplification [Default:?] + --adapters Sequencing adapters [Default:?] + --initDir Shiver initialization directory for configurations + --config Shiver configuration file [Default:?] + --references HIV reference dataset + --hostGenome Directory with reference database for host genome + --hostGenomeBase Name of reference database for host genome + --hostURL URL to download host reference genome from + --min_cov_eti Coverage threshold in time of infection calculations [Default: 300] + --large Use more computing resources + --help Display this help message and exit + + Initialization options: + --primers Primers used during amplification [Default:?] + --adapters Sequencing adapters [Default:?] + --config Shiver configuration file [Default:?] + --references HIV reference dataset + + Setup options: + --hostGenome Directory to create reference database for host genome in + --hostGenomeBase Name of reference database for host genome + --hostFasta Host reference genome path """ } @@ -46,27 +55,43 @@ if (params.help) { } // Verify input parameters -if (!params.primers){ - println("Please specify primers with --primers") - System.exit(1) -} -if (!params.adapters){ - println("Please specify adapters with --adapters") - System.exit(1) -} -if (!params.config){ - println("Please specify Shiver config file with --config") - System.exit(1) -} + if (!params.outdir){ - println("Please specify directory for results with --outdir") + println("Please specify directory for output with --outdir") System.exit(1) } -if ( params.init ) { + +if ( params.setup ) { + if (!params.hostGenome){ + println("Please specify reference database directory with --hostGenome") + System.exit(1) + } + if (!params.hostGenomeBase){ + println("Please specify reference database name with --hostGenomeBase") + System.exit(1) + } + if (!params.hostFasta){ + println("Please specify url for host reference fasta with --hostFasta") + System.exit(1) + } +} +else if ( params.init ) { if (!params.references){ println("Please specify reference fasta file with --references") System.exit(1) } + if (!params.primers){ + println("Please specify primers with --primers") + System.exit(1) + } + if (!params.adapters){ + println("Please specify adapters with --adapters") + System.exit(1) + } + if (!params.config){ + println("Please specify Shiver config file with --config") + System.exit(1) + } } else { if (!params.input){ @@ -81,26 +106,50 @@ else { println("Please specify Shiver intialisation directory with --initDir") System.exit(1) } + if (!params.primers){ + println("Please specify primers with --primers") + System.exit(1) + } + if (!params.adapters){ + println("Please specify adapters with --adapters") + System.exit(1) + } + if (!params.config){ + println("Please specify Shiver config file with --config") + System.exit(1) + } } -// include workflows +// include workflows and modules include {timeAnalysis} from './workflows/time.nf' include {initialisation} from './workflows/initialisation.nf' +include {setup} from './workflows/pipeline_setup.nf' +include {getVersion} from './modules/version.nf' workflow { + getVersion() + + if ( params.setup ) { + // Set initialisation input + Channel.fromPath(params.hostFasta) + .set{ ch_hostFasta } - // Set general workflow input - Channel.fromPath(params.primers) - .set{ ch_primersFile } - Channel.fromPath(params.adapters) - .set{ ch_adaptersFile } - Channel.fromPath(params.config) - .set{ ch_shiverConfigFile } + // Run setup + main: + setup(ch_hostFasta) + } - if ( params.init ) { + else if ( params.init ) { // Set initialisation input Channel.fromPath(params.references) .set{ ch_referenceFile } + Channel.fromPath(params.primers) + .set{ ch_primersFile } + Channel.fromPath(params.adapters) + .set{ ch_adaptersFile } + Channel.fromPath(params.config) + .set{ ch_shiverConfigFile } + // Run initialisation main: initialisation(ch_primersFile, ch_adaptersFile, ch_shiverConfigFile, ch_referenceFile) @@ -115,6 +164,12 @@ workflow { .set{ ch_shiverInitDir } Channel.fromPath(params.hostGenome) .set{ ch_hostGenome } + Channel.fromPath(params.primers) + .set{ ch_primersFile } + Channel.fromPath(params.adapters) + .set{ ch_adaptersFile } + Channel.fromPath(params.config) + .set{ ch_shiverConfigFile } // run analysis main: diff --git a/modules/setup.nf b/modules/setup.nf new file mode 100644 index 0000000..21ca012 --- /dev/null +++ b/modules/setup.nf @@ -0,0 +1,18 @@ +process buildDatabase { + + label 'buildDatabase' + + publishDir "${params.outdir}/database/", mode: 'copy' + publishDir "${params.hostGenome}", mode: 'copy' + + input: + path(fasta) + + output: + path("${params.hostGenomeBase}*") + + script: + """ + bwa index -p ${params.hostGenomeBase} ${fasta} + """ +} diff --git a/modules/version.nf b/modules/version.nf new file mode 100644 index 0000000..120b239 --- /dev/null +++ b/modules/version.nf @@ -0,0 +1,34 @@ +process getVersion { + + label 'version' + + publishDir "${params.outdir}/store/", mode: 'copy' + + output: + path("*.txt") + + script: + """ + echo Version: ${workflow.manifest.version} > pipeline.version.txt + """ +} + +process getInfo { + + label 'info' + + publishDir "${params.outdir}/store/", mode: 'copy' + + output: + path("*.txt") + + script: + """ + echo Primers: ${params.primers} >> pipeline.info.txt + echo Adapters: ${params.adapters} >> pipeline.info.txt + echo Shiver initialization directory: ${params.initDir} >> pipeline.info.txt + echo Pipeline config: ${params.config} >> pipeline.info.txt + echo Host genome: ${params.hostGenome}/${params.hostGenomeBase} >> pipeline.info.txt + echo Coverage threshold: ${params.coverage_threshold_eti} >> pipeline.info.txt + """ +} diff --git a/nextflow.config b/nextflow.config index 97841fc..6a7dd02 100644 --- a/nextflow.config +++ b/nextflow.config @@ -2,15 +2,19 @@ params { // Sample preparation options primers = "$baseDir/data/primers/Primers_A_elife-11282-supp2-v2_PCR1_primers_A_primers_RC.fasta" adapters = "$baseDir/data/adapters/NexteraPE-PE.fa" + + // Configurations initDir = "$baseDir/data/initdirs/InitDirShiver190405_BQ30/" config = "$baseDir/data/configs/shiver_config_BQ30_notrimming.sh" + references = "$baseDir/data/references/HIV1_COM_2020_547-9592_DNA.fasta" // Databases - hostGenome = "/path/to/human_genome_bwa/" + hostGenome = "$baseDir/data/databases/human_genome_bwa/" hostGenomeBase = "GRCh38_genome" + hostFasta = false // Parameters for estimating time of infection - coverage_threshold_eti = '300' + coverage_threshold_eti = '100' // Output options tracedir = "${params.outdir}/store" @@ -19,7 +23,9 @@ params { // General options help = false input = false + outdir = false init = false + setup = false large = false } @@ -46,12 +52,12 @@ profiles { docker.enabled = true fixOwnership = true runOptions = "-u \$(id -u):\$(id -g)" - process.container = "talnor/hiv_time_analysis:0.1.0" + process.container = "talnor/hiv_time_analysis:0.2.0" } singularity { singularity.enabled = true singularity.autoMounts = true - process.container = "talnor/hiv_time_analysis:0.1.0" + process.container = "talnor/hiv_time_analysis:0.2.0" if (params.cache) { singularity.cacheDir = params.cache env.NXF_TEMP="${params.cache}" @@ -89,5 +95,5 @@ dag { manifest { description = 'Analysis pipeline for estimation of time of infection of HIV samples in the TIME study' mainScript = 'main.nf' - version = '0.1.0' + version = '0.2.0' } diff --git a/workflows/pipeline_setup.nf b/workflows/pipeline_setup.nf new file mode 100644 index 0000000..9db24df --- /dev/null +++ b/workflows/pipeline_setup.nf @@ -0,0 +1,14 @@ +// enable dsl2 +nextflow.enable.dsl=2 + +// import modules +include {buildDatabase} from '../modules/setup.nf' + +workflow setup { + take: + ch_hostFasta + + main: + buildDatabase(ch_hostFasta) + +} diff --git a/workflows/time.nf b/workflows/time.nf index 3182c2a..76bf96c 100644 --- a/workflows/time.nf +++ b/workflows/time.nf @@ -2,6 +2,7 @@ nextflow.enable.dsl=2 // import modules +include {getInfo} from '../modules/version.nf' include {trimming} from '../modules/preprocessing.nf' include {hostDepletion} from '../modules/preprocessing.nf' include {hostStats} from '../modules/preprocessing.nf' @@ -20,6 +21,7 @@ workflow timeAnalysis { ch_hostGenome main: + getInfo() trimming(ch_fastq.combine(ch_adaptersFile).combine(ch_primersFile)) hostDepletion(trimming.out.trim.combine(ch_hostGenome)) hostStats(hostDepletion.out.bam)