Skip to content

Commit

Permalink
added summary statistics
Browse files Browse the repository at this point in the history
  • Loading branch information
umahsn committed Jan 25, 2022
1 parent d5f4b11 commit 757a085
Show file tree
Hide file tree
Showing 7 changed files with 100 additions and 47 deletions.
11 changes: 8 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,13 @@ RUN conda env create -f environment.yml && conda clean --yes --all

RUN conda init bash

RUN echo "conda activate NanoCaller" > ~/.bashrc
RUN echo "conda activate nanocaller_env" > ~/.bashrc

ENV PATH=/opt/conda/envs/NanoCaller/bin:$PATH
ENV PATH=/opt/conda/envs/nanocaller_env/bin:$PATH

COPY ./scripts ./
COPY ./nanocaller_src /opt/conda/envs/nanocaller_env/bin/nanocaller_src
COPY ./NanoCaller /opt/conda/envs/nanocaller_env/bin
COPY ./NanoCaller_WGS /opt/conda/envs/nanocaller_env/bin

RUN chmod +x /opt/conda/envs/nanocaller_env/bin/NanoCaller
RUN chmod +x /opt/conda/envs/nanocaller_env/bin/NanoCaller_WGS
18 changes: 17 additions & 1 deletion NanoCaller
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,8 @@ def run(args):

pool.close()
pool.join()



if __name__ == '__main__':

t=time.time()
Expand Down Expand Up @@ -257,5 +258,20 @@ if __name__ == '__main__':

run(args)

final_file_name={'snps_unphased':'snps','snps':'snps.phased', 'both':'final', 'indels':'indels'}

final_output=os.path.join(args.output,'%s.%s.vcf.gz' %(args.prefix, final_file_name[args.mode]))

print('\n\n-----------------------------------------------------------\n\n',)
print ('\n%s: Variant calling output can be found here: %s' %(str(datetime.datetime.now()), final_output))

print('\n%s: Printing variant calling summary statistics' %str(datetime.datetime.now()))
stats=run_cmd('rtg vcfstats %s' %final_output, output=True, verbose=True, error=True)

print('\n%s: Saving variant calling summary statistics to: %s' %(str(datetime.datetime.now()), os.path.join(args.output,'%s.summary' %args.prefix)))
with open(os.path.join(args.output,'%s.summary' %args.prefix), 'w') as stats_file:
stats_file.write(stats)

elapsed=time.time()-t

print ('\n%s: Total Time Elapsed: %.2f seconds' %(str(datetime.datetime.now()), elapsed))
21 changes: 19 additions & 2 deletions NanoCaller_WGS
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ if __name__ == '__main__':
indel_group.add_argument("-win_size", "--win_size", help="Size of the sliding window in which the number of indels is counted to determine indel candidate site. Only indels longer than 2bp are counted in this window. Larger window size can increase recall, but use a maximum of 50 only", type=int, default=40)
indel_group.add_argument("-small_win_size", "--small_win_size", help="Size of the sliding window in which indel frequency is determined for small indels", type=int, default=4)


indel_group.add_argument('-impute_indel_phase','--impute_indel_phase', help='Infer read phase by rudimentary allele clustering if the no or insufficient phasing information is available, can be useful for datasets without SNPs or regions with poor phasing quality', default=False, action='store_true')

#phasing
phase_group.add_argument('-phase_bam','--phase_bam', help='Phase bam files if snps mode is selected. This will phase bam file without indel calling.', default=False, action='store_true')

Expand Down Expand Up @@ -213,7 +214,7 @@ if __name__ == '__main__':
for mbase in range(1,chr_end,10000000):
job_id='%s_%d_%d' %(chrom, mbase, min(chr_end,mbase+10000000-1))
out_path=os.path.join(args.output, 'intermediate_files', job_id)
job_command='python %s -chrom %s %s -cpu 1 --output %s -start %d -end %d -prefix %s > %s/%s 2>&1' %(os.path.join(dirname,'NanoCaller.py'), chrom, cmd, out_path ,mbase, min(chr_end,mbase+10000000-1),job_id, log_path,job_id)
job_command='%s -chrom %s %s -cpu 1 --output %s -start %d -end %d -prefix %s > %s/%s 2>&1' %(os.path.join(dirname,'NanoCaller'), chrom, cmd, out_path ,mbase, min(chr_end,mbase+10000000-1),job_id, log_path,job_id)
job_dict[job_id]=job_command
wg_commands.write('%s\n' %job_command)

Expand Down Expand Up @@ -269,5 +270,21 @@ if __name__ == '__main__':
if 'ls: cannot access' in final_logs:
print('VARIANT CALLING FAILED. Please check any errors printed above, or in the log files here: %s.\n' %log_path, flush=True)

elapsed=time.time()-t

final_file_name={'snps_unphased':'snps','snps':'snps.phased', 'both':'final', 'indels':'indels'}

final_output=os.path.join(args.output,'%s.%s.vcf.gz' %(args.prefix, final_file_name[args.mode]))

print('\n\n-----------------------------------------------------------\n\n',)
print ('\n%s: Variant calling output can be found here: %s' %(str(datetime.datetime.now()), final_output))

print('\n%s: Printing variant calling summary statistics' %str(datetime.datetime.now()))
stats=run_cmd('rtg vcfstats %s' %final_output, output=True, verbose=True, error=True)

print('\n%s: Saving variant calling summary statistics to: %s' %(str(datetime.datetime.now()), os.path.join(args.output,'%s.summary' %args.prefix)))
with open(os.path.join(args.output,'%s.summary' %args.prefix), 'w') as stats_file:
stats_file.write(stats)

elapsed=time.time()-t
print ('\n%s: Total Time Elapsed: %.2f seconds' %(str(datetime.datetime.now()), elapsed), flush=True)
16 changes: 11 additions & 5 deletions docs/Install.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ For instructions regarding Docker installation, please visit [Docker website](ht
### 1) via Docker Hub (preferred)
You can pull NanoCaller docker images from Docker Hub by specifiying a version number.
```
VERSION="1.0.1"
docker run genomicslab/nanocaller:${VERSION} python NanoCaller.py --help
VERSION="2.0.0"
docker run genomicslab/nanocaller:${VERSION} NanoCaller --help
```

### 2) Locally build docker image
Expand All @@ -21,7 +21,7 @@ If you want to build an image for the latest commit of NanoCaller Github reposit
```
git clone https://github.com/WGLab/NanoCaller.git
docker build -t nanocaller NanoCaller
docker run nanocaller python NanoCaller.py --help
docker run nanocaller NanoCaller --help
```

## Singularity
Expand All @@ -40,12 +40,18 @@ curl -O https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh
```

Go through all the prompts (installation in `$HOME` is recommended). After Anaconda is installed successfully, simply run:
Go through all the prompts (installation in `$HOME` is recommended). Make sure you are using conda version >=4.9. After Anaconda is installed successfully, simply run:

```
git clone https://github.com/WGLab/NanoCaller.git
cd NanoCaller
conda env create -f environment.yml
conda activate NanoCaller
chmod +x NanoCaller NanoCaller_WGS
conda env config vars set -n nanocaller_env PATH=$PWD:$PATH # this will add NanoCaller repository to PATH everytime you activate nanocaller_env environment. If you dont want to do this, then you would need to run NanoCaller by giving full path to repository
conda activate nanocaller_env
```
The installation should take about 10 minutes, including the installation of Miniconda.


77 changes: 43 additions & 34 deletions docs/Usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ docker run -itd \
-v 'YOUR_INPUT_DIR':'/input/' \
-v 'YOUR_WORKING_DIR':'/output/' \
genomicslab/nanocaller:${VERSION} \
python NanoCaller_WGS.py \
NanoCaller_WGS \
-bam /input/YOUR_BAM \
-ref /input/YOUR_REF \
-o /output \
Expand All @@ -30,7 +30,7 @@ docker run -itd \
-v 'YOUR_INPUT_DIR':'/input/' \
-v 'YOUR_WORKING_DIR':'/output/' \
genomicslab/nanocaller:${VERSION} \
python NanoCaller.py \
NanoCaller \
-chrom CHROMOSOME \
-bam /input/YOUR_BAM \
-ref /input/YOUR_REF \
Expand All @@ -46,18 +46,18 @@ python NanoCaller.py \

For whole genome variant calling, or calling variants on several chromosomes, use `NanoCaller_WGS.py` to call variants. Assuming NanoCaller respository is cloned and located at `PATH_TO_NANOCALLER_REPOSITORY` directory, run the following command.
```
python PATH_TO_NANOCALLER_REPOSITORY/scripts/NanoCaller_WGS.py
NanoCaller_WGS
-bam YOUR_BAM \
-ref YOUR_REF \
-o OUTPUT_DIRECTORY \
-cpu NUMBER_OF_CPUS_TO_USE
-p PRESET
```

Another way to run NanoCaller for whole genome variant calling is to use `NanoCaller.py` or `NanoCaller_WGS.py` on each chromosome separately by setting `chrom` argument. This option is suitable if you have a large computing cluster with a lot of computing resources. For instance, on a Sun Grid Engine, you can submit a separate job for each chromosome like this, using 16 CPUs per job:
Another way to run NanoCaller for whole genome variant calling is to use `NanoCaller` or `NanoCaller_WGS` on each chromosome separately by setting `chrom` argument. This option is suitable if you have a large computing cluster with a lot of computing resources. For instance, on a Sun Grid Engine, you can submit a separate job for each chromosome like this, using 16 CPUs per job:

```
for i in {1..22};do echo "python PATH_TO_NANOCALLER_REPOSITORY/scripts/NanoCaller.py
for i in {1..22};do echo "NanoCaller
-chrom chr$i
-bam YOUR_BAM \
-ref YOUR_REF \
Expand All @@ -70,11 +70,11 @@ for i in {1..22};do echo "python PATH_TO_NANOCALLER_REPOSITORY/scripts/NanoCalle

#### Single Chromosome Variant Calling

For calling variants on single chromosomes, use `NanoCaller.py` to call variants.
For calling variants on single chromosomes, use `NanoCaller` to call variants.

Assuming NanoCaller respository is cloned and located at `PATH_TO_NANOCALLER_REPOSITORY` directory, run the following command.
```
python PATH_TO_NANOCALLER_REPOSITORY/scripts/NanoCaller_WGS.py
NanoCaller_WGS
-chrom CHROMOSOME \
-bam YOUR_BAM \
-ref YOUR_REF \
Expand All @@ -87,19 +87,17 @@ python PATH_TO_NANOCALLER_REPOSITORY/scripts/NanoCaller_WGS.py

### Single Chromosome Variant Calling
```
usage: NanoCaller.py [-h] [-mode MODE] [-seq SEQUENCING] [-cpu CPU]
[-mincov MINCOV] [-maxcov MAXCOV] [-keep_bam] [-o OUTPUT]
[-prefix PREFIX] [-sample SAMPLE]
[-include_bed INCLUDE_BED] [-exclude_bed EXCLUDE_BED]
[-start START] [-end END] [-p PRESET] -bam BAM -ref REF
-chrom CHROM [-snp_model SNP_MODEL]
[-min_allele_freq MIN_ALLELE_FREQ]
[-min_nbr_sites MIN_NBR_SITES]
[-nbr_t NEIGHBOR_THRESHOLD] [-sup]
[-indel_model INDEL_MODEL] [-ins_t INS_THRESHOLD]
[-del_t DEL_THRESHOLD] [-win_size WIN_SIZE]
[-small_win_size SMALL_WIN_SIZE] [-phase_bam]
[-enable_whatshap]
usage: NanoCaller [-h] [-mode MODE] [-seq SEQUENCING] [-cpu CPU]
[-mincov MINCOV] [-maxcov MAXCOV] [-keep_bam] [-o OUTPUT]
[-prefix PREFIX] [-sample SAMPLE] [-include_bed INCLUDE_BED]
[-exclude_bed EXCLUDE_BED] [-start START] [-end END]
[-p PRESET] -bam BAM -ref REF -chrom CHROM
[-snp_model SNP_MODEL] [-min_allele_freq MIN_ALLELE_FREQ]
[-min_nbr_sites MIN_NBR_SITES] [-nbr_t NEIGHBOR_THRESHOLD]
[-sup] [-indel_model INDEL_MODEL] [-ins_t INS_THRESHOLD]
[-del_t DEL_THRESHOLD] [-win_size WIN_SIZE]
[-small_win_size SMALL_WIN_SIZE] [-impute_indel_phase]
[-phase_bam] [-enable_whatshap]
optional arguments:
-h, --help show this help message and exit
Expand Down Expand Up @@ -203,6 +201,11 @@ Indel Calling:
-small_win_size SMALL_WIN_SIZE, --small_win_size SMALL_WIN_SIZE
Size of the sliding window in which indel frequency is
determined for small indels (default: 4)
-impute_indel_phase, --impute_indel_phase
Infer read phase by rudimentary allele clustering if
the no or insufficient phasing information is
available, can be useful for datasets without SNPs or
regions with poor phasing quality (default: False)
Output Options:
-keep_bam, --keep_bam
Expand Down Expand Up @@ -233,20 +236,20 @@ Phasing:

### Whole Genome Variant Calling
```
usage: NanoCaller_WGS.py [-h] [-mode MODE] [-seq SEQUENCING] [-cpu CPU]
[-mincov MINCOV] [-maxcov MAXCOV] [-keep_bam]
[-o OUTPUT] [-prefix PREFIX] [-sample SAMPLE]
[-chrom [CHROM [CHROM ...]]]
[-include_bed INCLUDE_BED] [-exclude_bed EXCLUDE_BED]
[-wgs_contigs_type WGS_CONTIGS_TYPE] [-p PRESET] -bam
BAM -ref REF [-snp_model SNP_MODEL]
[-min_allele_freq MIN_ALLELE_FREQ]
[-min_nbr_sites MIN_NBR_SITES]
[-nbr_t NEIGHBOR_THRESHOLD] [-sup]
[-indel_model INDEL_MODEL] [-ins_t INS_THRESHOLD]
[-del_t DEL_THRESHOLD] [-win_size WIN_SIZE]
[-small_win_size SMALL_WIN_SIZE] [-phase_bam]
[-enable_whatshap]
usage: NanoCaller_WGS [-h] [-mode MODE] [-seq SEQUENCING] [-cpu CPU]
[-mincov MINCOV] [-maxcov MAXCOV] [-keep_bam]
[-o OUTPUT] [-prefix PREFIX] [-sample SAMPLE]
[-chrom [CHROM [CHROM ...]]] [-include_bed INCLUDE_BED]
[-exclude_bed EXCLUDE_BED]
[-wgs_contigs_type WGS_CONTIGS_TYPE] [-p PRESET] -bam
BAM -ref REF [-snp_model SNP_MODEL]
[-min_allele_freq MIN_ALLELE_FREQ]
[-min_nbr_sites MIN_NBR_SITES]
[-nbr_t NEIGHBOR_THRESHOLD] [-sup]
[-indel_model INDEL_MODEL] [-ins_t INS_THRESHOLD]
[-del_t DEL_THRESHOLD] [-win_size WIN_SIZE]
[-small_win_size SMALL_WIN_SIZE] [-impute_indel_phase]
[-phase_bam] [-enable_whatshap]
optional arguments:
-h, --help show this help message and exit
Expand Down Expand Up @@ -356,6 +359,11 @@ Indel Calling:
-small_win_size SMALL_WIN_SIZE, --small_win_size SMALL_WIN_SIZE
Size of the sliding window in which indel frequency is
determined for small indels (default: 4)
-impute_indel_phase, --impute_indel_phase
Infer read phase by rudimentary allele clustering if
the no or insufficient phasing information is
available, can be useful for datasets without SNPs or
regions with poor phasing quality (default: False)
Output Options:
-keep_bam, --keep_bam
Expand Down Expand Up @@ -440,6 +448,7 @@ Preset `ccs` is equivalent to:
--ins_threshold 0.4
--del_threshold 0.4
--enable_whatshap
--impute_indel_phase
```

Preset `clr` is equivalent to:
Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: NanoCaller
name: nanocaller_env
channels:
- conda-forge
- bioconda
Expand Down
2 changes: 1 addition & 1 deletion nanocaller_src/indelCaller.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,4 +195,4 @@ def test_model(params,pool):

run_cmd("bcftools sort %s.raw.vcf|bgziptabix %s.raw.vcf.gz" %(outfile, outfile))

return outfile
return outfile

0 comments on commit 757a085

Please sign in to comment.