From 757a0852b40a8d408e3e0b5aa298a77ff9be8edc Mon Sep 17 00:00:00 2001 From: Ahsan Date: Tue, 25 Jan 2022 15:05:56 -0500 Subject: [PATCH] added summary statistics --- Dockerfile | 11 +++-- NanoCaller | 18 +++++++- NanoCaller_WGS | 21 +++++++++- docs/Install.md | 16 +++++--- docs/Usage.md | 77 +++++++++++++++++++---------------- environment.yml | 2 +- nanocaller_src/indelCaller.py | 2 +- 7 files changed, 100 insertions(+), 47 deletions(-) diff --git a/Dockerfile b/Dockerfile index 622d12b..2f834dc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,8 +10,13 @@ RUN conda env create -f environment.yml && conda clean --yes --all RUN conda init bash -RUN echo "conda activate NanoCaller" > ~/.bashrc +RUN echo "conda activate nanocaller_env" > ~/.bashrc -ENV PATH=/opt/conda/envs/NanoCaller/bin:$PATH +ENV PATH=/opt/conda/envs/nanocaller_env/bin:$PATH -COPY ./scripts ./ +COPY ./nanocaller_src /opt/conda/envs/nanocaller_env/bin/nanocaller_src +COPY ./NanoCaller /opt/conda/envs/nanocaller_env/bin +COPY ./NanoCaller_WGS /opt/conda/envs/nanocaller_env/bin + +RUN chmod +x /opt/conda/envs/nanocaller_env/bin/NanoCaller +RUN chmod +x /opt/conda/envs/nanocaller_env/bin/NanoCaller_WGS diff --git a/NanoCaller b/NanoCaller index fbccdda..86fb9fd 100755 --- a/NanoCaller +++ b/NanoCaller @@ -138,7 +138,8 @@ def run(args): pool.close() pool.join() - + + if __name__ == '__main__': t=time.time() @@ -257,5 +258,20 @@ if __name__ == '__main__': run(args) + final_file_name={'snps_unphased':'snps','snps':'snps.phased', 'both':'final', 'indels':'indels'} + + final_output=os.path.join(args.output,'%s.%s.vcf.gz' %(args.prefix, final_file_name[args.mode])) + + print('\n\n-----------------------------------------------------------\n\n',) + print ('\n%s: Variant calling output can be found here: %s' %(str(datetime.datetime.now()), final_output)) + + print('\n%s: Printing variant calling summary statistics' %str(datetime.datetime.now())) + stats=run_cmd('rtg vcfstats %s' %final_output, output=True, verbose=True, error=True) + + print('\n%s: Saving variant calling summary statistics to: %s' %(str(datetime.datetime.now()), os.path.join(args.output,'%s.summary' %args.prefix))) + with open(os.path.join(args.output,'%s.summary' %args.prefix), 'w') as stats_file: + stats_file.write(stats) + elapsed=time.time()-t + print ('\n%s: Total Time Elapsed: %.2f seconds' %(str(datetime.datetime.now()), elapsed)) diff --git a/NanoCaller_WGS b/NanoCaller_WGS index 1c9abc0..18471b6 100755 --- a/NanoCaller_WGS +++ b/NanoCaller_WGS @@ -92,7 +92,8 @@ if __name__ == '__main__': indel_group.add_argument("-win_size", "--win_size", help="Size of the sliding window in which the number of indels is counted to determine indel candidate site. Only indels longer than 2bp are counted in this window. Larger window size can increase recall, but use a maximum of 50 only", type=int, default=40) indel_group.add_argument("-small_win_size", "--small_win_size", help="Size of the sliding window in which indel frequency is determined for small indels", type=int, default=4) - + indel_group.add_argument('-impute_indel_phase','--impute_indel_phase', help='Infer read phase by rudimentary allele clustering if the no or insufficient phasing information is available, can be useful for datasets without SNPs or regions with poor phasing quality', default=False, action='store_true') + #phasing phase_group.add_argument('-phase_bam','--phase_bam', help='Phase bam files if snps mode is selected. This will phase bam file without indel calling.', default=False, action='store_true') @@ -213,7 +214,7 @@ if __name__ == '__main__': for mbase in range(1,chr_end,10000000): job_id='%s_%d_%d' %(chrom, mbase, min(chr_end,mbase+10000000-1)) out_path=os.path.join(args.output, 'intermediate_files', job_id) - job_command='python %s -chrom %s %s -cpu 1 --output %s -start %d -end %d -prefix %s > %s/%s 2>&1' %(os.path.join(dirname,'NanoCaller.py'), chrom, cmd, out_path ,mbase, min(chr_end,mbase+10000000-1),job_id, log_path,job_id) + job_command='%s -chrom %s %s -cpu 1 --output %s -start %d -end %d -prefix %s > %s/%s 2>&1' %(os.path.join(dirname,'NanoCaller'), chrom, cmd, out_path ,mbase, min(chr_end,mbase+10000000-1),job_id, log_path,job_id) job_dict[job_id]=job_command wg_commands.write('%s\n' %job_command) @@ -269,5 +270,21 @@ if __name__ == '__main__': if 'ls: cannot access' in final_logs: print('VARIANT CALLING FAILED. Please check any errors printed above, or in the log files here: %s.\n' %log_path, flush=True) + elapsed=time.time()-t + + final_file_name={'snps_unphased':'snps','snps':'snps.phased', 'both':'final', 'indels':'indels'} + + final_output=os.path.join(args.output,'%s.%s.vcf.gz' %(args.prefix, final_file_name[args.mode])) + + print('\n\n-----------------------------------------------------------\n\n',) + print ('\n%s: Variant calling output can be found here: %s' %(str(datetime.datetime.now()), final_output)) + + print('\n%s: Printing variant calling summary statistics' %str(datetime.datetime.now())) + stats=run_cmd('rtg vcfstats %s' %final_output, output=True, verbose=True, error=True) + + print('\n%s: Saving variant calling summary statistics to: %s' %(str(datetime.datetime.now()), os.path.join(args.output,'%s.summary' %args.prefix))) + with open(os.path.join(args.output,'%s.summary' %args.prefix), 'w') as stats_file: + stats_file.write(stats) + elapsed=time.time()-t print ('\n%s: Total Time Elapsed: %.2f seconds' %(str(datetime.datetime.now()), elapsed), flush=True) diff --git a/docs/Install.md b/docs/Install.md index aae8b64..83aae54 100644 --- a/docs/Install.md +++ b/docs/Install.md @@ -11,8 +11,8 @@ For instructions regarding Docker installation, please visit [Docker website](ht ### 1) via Docker Hub (preferred) You can pull NanoCaller docker images from Docker Hub by specifiying a version number. ``` -VERSION="1.0.1" -docker run genomicslab/nanocaller:${VERSION} python NanoCaller.py --help +VERSION="2.0.0" +docker run genomicslab/nanocaller:${VERSION} NanoCaller --help ``` ### 2) Locally build docker image @@ -21,7 +21,7 @@ If you want to build an image for the latest commit of NanoCaller Github reposit ``` git clone https://github.com/WGLab/NanoCaller.git docker build -t nanocaller NanoCaller -docker run nanocaller python NanoCaller.py --help +docker run nanocaller NanoCaller --help ``` ## Singularity @@ -40,12 +40,18 @@ curl -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh bash Miniconda3-latest-Linux-x86_64.sh ``` -Go through all the prompts (installation in `$HOME` is recommended). After Anaconda is installed successfully, simply run: +Go through all the prompts (installation in `$HOME` is recommended). Make sure you are using conda version >=4.9. After Anaconda is installed successfully, simply run: ``` git clone https://github.com/WGLab/NanoCaller.git cd NanoCaller conda env create -f environment.yml -conda activate NanoCaller +chmod +x NanoCaller NanoCaller_WGS + +conda env config vars set -n nanocaller_env PATH=$PWD:$PATH # this will add NanoCaller repository to PATH everytime you activate nanocaller_env environment. If you dont want to do this, then you would need to run NanoCaller by giving full path to repository + +conda activate nanocaller_env ``` The installation should take about 10 minutes, including the installation of Miniconda. + + diff --git a/docs/Usage.md b/docs/Usage.md index 33bddf8..365ebb1 100644 --- a/docs/Usage.md +++ b/docs/Usage.md @@ -13,7 +13,7 @@ docker run -itd \ -v 'YOUR_INPUT_DIR':'/input/' \ -v 'YOUR_WORKING_DIR':'/output/' \ genomicslab/nanocaller:${VERSION} \ -python NanoCaller_WGS.py \ +NanoCaller_WGS \ -bam /input/YOUR_BAM \ -ref /input/YOUR_REF \ -o /output \ @@ -30,7 +30,7 @@ docker run -itd \ -v 'YOUR_INPUT_DIR':'/input/' \ -v 'YOUR_WORKING_DIR':'/output/' \ genomicslab/nanocaller:${VERSION} \ -python NanoCaller.py \ +NanoCaller \ -chrom CHROMOSOME \ -bam /input/YOUR_BAM \ -ref /input/YOUR_REF \ @@ -46,7 +46,7 @@ python NanoCaller.py \ For whole genome variant calling, or calling variants on several chromosomes, use `NanoCaller_WGS.py` to call variants. Assuming NanoCaller respository is cloned and located at `PATH_TO_NANOCALLER_REPOSITORY` directory, run the following command. ``` -python PATH_TO_NANOCALLER_REPOSITORY/scripts/NanoCaller_WGS.py +NanoCaller_WGS -bam YOUR_BAM \ -ref YOUR_REF \ -o OUTPUT_DIRECTORY \ @@ -54,10 +54,10 @@ python PATH_TO_NANOCALLER_REPOSITORY/scripts/NanoCaller_WGS.py -p PRESET ``` -Another way to run NanoCaller for whole genome variant calling is to use `NanoCaller.py` or `NanoCaller_WGS.py` on each chromosome separately by setting `chrom` argument. This option is suitable if you have a large computing cluster with a lot of computing resources. For instance, on a Sun Grid Engine, you can submit a separate job for each chromosome like this, using 16 CPUs per job: +Another way to run NanoCaller for whole genome variant calling is to use `NanoCaller` or `NanoCaller_WGS` on each chromosome separately by setting `chrom` argument. This option is suitable if you have a large computing cluster with a lot of computing resources. For instance, on a Sun Grid Engine, you can submit a separate job for each chromosome like this, using 16 CPUs per job: ``` -for i in {1..22};do echo "python PATH_TO_NANOCALLER_REPOSITORY/scripts/NanoCaller.py +for i in {1..22};do echo "NanoCaller -chrom chr$i -bam YOUR_BAM \ -ref YOUR_REF \ @@ -70,11 +70,11 @@ for i in {1..22};do echo "python PATH_TO_NANOCALLER_REPOSITORY/scripts/NanoCalle #### Single Chromosome Variant Calling -For calling variants on single chromosomes, use `NanoCaller.py` to call variants. +For calling variants on single chromosomes, use `NanoCaller` to call variants. Assuming NanoCaller respository is cloned and located at `PATH_TO_NANOCALLER_REPOSITORY` directory, run the following command. ``` -python PATH_TO_NANOCALLER_REPOSITORY/scripts/NanoCaller_WGS.py +NanoCaller_WGS -chrom CHROMOSOME \ -bam YOUR_BAM \ -ref YOUR_REF \ @@ -87,19 +87,17 @@ python PATH_TO_NANOCALLER_REPOSITORY/scripts/NanoCaller_WGS.py ### Single Chromosome Variant Calling ``` -usage: NanoCaller.py [-h] [-mode MODE] [-seq SEQUENCING] [-cpu CPU] - [-mincov MINCOV] [-maxcov MAXCOV] [-keep_bam] [-o OUTPUT] - [-prefix PREFIX] [-sample SAMPLE] - [-include_bed INCLUDE_BED] [-exclude_bed EXCLUDE_BED] - [-start START] [-end END] [-p PRESET] -bam BAM -ref REF - -chrom CHROM [-snp_model SNP_MODEL] - [-min_allele_freq MIN_ALLELE_FREQ] - [-min_nbr_sites MIN_NBR_SITES] - [-nbr_t NEIGHBOR_THRESHOLD] [-sup] - [-indel_model INDEL_MODEL] [-ins_t INS_THRESHOLD] - [-del_t DEL_THRESHOLD] [-win_size WIN_SIZE] - [-small_win_size SMALL_WIN_SIZE] [-phase_bam] - [-enable_whatshap] +usage: NanoCaller [-h] [-mode MODE] [-seq SEQUENCING] [-cpu CPU] + [-mincov MINCOV] [-maxcov MAXCOV] [-keep_bam] [-o OUTPUT] + [-prefix PREFIX] [-sample SAMPLE] [-include_bed INCLUDE_BED] + [-exclude_bed EXCLUDE_BED] [-start START] [-end END] + [-p PRESET] -bam BAM -ref REF -chrom CHROM + [-snp_model SNP_MODEL] [-min_allele_freq MIN_ALLELE_FREQ] + [-min_nbr_sites MIN_NBR_SITES] [-nbr_t NEIGHBOR_THRESHOLD] + [-sup] [-indel_model INDEL_MODEL] [-ins_t INS_THRESHOLD] + [-del_t DEL_THRESHOLD] [-win_size WIN_SIZE] + [-small_win_size SMALL_WIN_SIZE] [-impute_indel_phase] + [-phase_bam] [-enable_whatshap] optional arguments: -h, --help show this help message and exit @@ -203,6 +201,11 @@ Indel Calling: -small_win_size SMALL_WIN_SIZE, --small_win_size SMALL_WIN_SIZE Size of the sliding window in which indel frequency is determined for small indels (default: 4) + -impute_indel_phase, --impute_indel_phase + Infer read phase by rudimentary allele clustering if + the no or insufficient phasing information is + available, can be useful for datasets without SNPs or + regions with poor phasing quality (default: False) Output Options: -keep_bam, --keep_bam @@ -233,20 +236,20 @@ Phasing: ### Whole Genome Variant Calling ``` -usage: NanoCaller_WGS.py [-h] [-mode MODE] [-seq SEQUENCING] [-cpu CPU] - [-mincov MINCOV] [-maxcov MAXCOV] [-keep_bam] - [-o OUTPUT] [-prefix PREFIX] [-sample SAMPLE] - [-chrom [CHROM [CHROM ...]]] - [-include_bed INCLUDE_BED] [-exclude_bed EXCLUDE_BED] - [-wgs_contigs_type WGS_CONTIGS_TYPE] [-p PRESET] -bam - BAM -ref REF [-snp_model SNP_MODEL] - [-min_allele_freq MIN_ALLELE_FREQ] - [-min_nbr_sites MIN_NBR_SITES] - [-nbr_t NEIGHBOR_THRESHOLD] [-sup] - [-indel_model INDEL_MODEL] [-ins_t INS_THRESHOLD] - [-del_t DEL_THRESHOLD] [-win_size WIN_SIZE] - [-small_win_size SMALL_WIN_SIZE] [-phase_bam] - [-enable_whatshap] +usage: NanoCaller_WGS [-h] [-mode MODE] [-seq SEQUENCING] [-cpu CPU] + [-mincov MINCOV] [-maxcov MAXCOV] [-keep_bam] + [-o OUTPUT] [-prefix PREFIX] [-sample SAMPLE] + [-chrom [CHROM [CHROM ...]]] [-include_bed INCLUDE_BED] + [-exclude_bed EXCLUDE_BED] + [-wgs_contigs_type WGS_CONTIGS_TYPE] [-p PRESET] -bam + BAM -ref REF [-snp_model SNP_MODEL] + [-min_allele_freq MIN_ALLELE_FREQ] + [-min_nbr_sites MIN_NBR_SITES] + [-nbr_t NEIGHBOR_THRESHOLD] [-sup] + [-indel_model INDEL_MODEL] [-ins_t INS_THRESHOLD] + [-del_t DEL_THRESHOLD] [-win_size WIN_SIZE] + [-small_win_size SMALL_WIN_SIZE] [-impute_indel_phase] + [-phase_bam] [-enable_whatshap] optional arguments: -h, --help show this help message and exit @@ -356,6 +359,11 @@ Indel Calling: -small_win_size SMALL_WIN_SIZE, --small_win_size SMALL_WIN_SIZE Size of the sliding window in which indel frequency is determined for small indels (default: 4) + -impute_indel_phase, --impute_indel_phase + Infer read phase by rudimentary allele clustering if + the no or insufficient phasing information is + available, can be useful for datasets without SNPs or + regions with poor phasing quality (default: False) Output Options: -keep_bam, --keep_bam @@ -440,6 +448,7 @@ Preset `ccs` is equivalent to: --ins_threshold 0.4 --del_threshold 0.4 --enable_whatshap +--impute_indel_phase ``` Preset `clr` is equivalent to: diff --git a/environment.yml b/environment.yml index 85c4a40..1c7f69e 100644 --- a/environment.yml +++ b/environment.yml @@ -1,4 +1,4 @@ -name: NanoCaller +name: nanocaller_env channels: - conda-forge - bioconda diff --git a/nanocaller_src/indelCaller.py b/nanocaller_src/indelCaller.py index 60857d1..f28c5d1 100644 --- a/nanocaller_src/indelCaller.py +++ b/nanocaller_src/indelCaller.py @@ -195,4 +195,4 @@ def test_model(params,pool): run_cmd("bcftools sort %s.raw.vcf|bgziptabix %s.raw.vcf.gz" %(outfile, outfile)) - return outfile \ No newline at end of file + return outfile