From b59abbe80f6d7b8bda0a3ba65e1f19612f59a6dd Mon Sep 17 00:00:00 2001 From: Quentin Clayssen <37511834+qclayssen@users.noreply.github.com> Date: Tue, 16 Sep 2025 09:38:56 +1000 Subject: [PATCH 01/22] Release v0.2.15 (#14) * update doc * bump gpgr version --- CHANGELOG.md | 2 ++ README.md | 14 +++++++------- pyproject.toml | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f7c841..1d8ba01 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## dev +- [14](https://github.com/umccr/bolt/pull/14) - gpgr version bump to 2.2.0 + - [3](https://github.com/scwatts/bolt/pull/3) - Improve PCGR / CPSR argument handling - [6](https://github.com/umccr/bolt/pull/6) - Change oncoanalyser v2.0.0 uptade, with switch sv caller from GRIPSS to eSVee \ No newline at end of file diff --git a/README.md b/README.md index 323a77f..f1db850 100644 --- a/README.md +++ b/README.md @@ -54,12 +54,12 @@ the software environment and dependencies. Consequently, dependencies are split | Name | Docker image URI | Commands | | --- | --- | --- | -| pcgr | ghcr.io/scwatts/bolt:0.2.14-pcgr | • `bolt smlv_germline report`
• `bolt smlv_somatic annotate`
• `bolt smlv_somatic report`
| -| gpgr | ghcr.io/scwatts/bolt:0.2.14-gpgr | • `bolt other cancer_report` | -| snpeff | ghcr.io/scwatts/bolt:0.2.14-snpeff | • `bolt sv_somatic annotate` | -| circos | ghcr.io/scwatts/bolt:0.2.14-circos | • `bolt other purple_baf_plot` | -| multiqc | ghcr.io/scwatts/bolt:0.2.14-multiqc | • `bolt other multiqc_report` | -| base | ghcr.io/scwatts/bolt:0.2.14 | • `bolt smlv_germline prepare`
• `bolt smlv_somatic rescue`
• `bolt smlv_somatic filter`
• `bolt sv_somatic prioritise`
| +| pcgr | ghcr.io/scwatts/bolt:0.2.15-pcgr | • `bolt smlv_germline report`
• `bolt smlv_somatic annotate`
• `bolt smlv_somatic report`
| +| gpgr | ghcr.io/scwatts/bolt:0.2.15-gpgr | • `bolt other cancer_report` | +| snpeff | ghcr.io/scwatts/bolt:0.2.15-snpeff | • `bolt sv_somatic annotate` | +| circos | ghcr.io/scwatts/bolt:0.2.15-circos | • `bolt other purple_baf_plot` | +| multiqc | ghcr.io/scwatts/bolt:0.2.15-multiqc | • `bolt other multiqc_report` | +| base | ghcr.io/scwatts/bolt:0.2.15 | • `bolt smlv_germline prepare`
• `bolt smlv_somatic rescue`
• `bolt smlv_somatic filter`
• `bolt sv_somatic prioritise`
| ## Usage @@ -67,7 +67,7 @@ Given the nature of software dependencies required, it is strongly recommended t [Docker images](#docker-images): ```bash -docker run -ti -v $(pwd):$(pwd) -w $(pwd) ghcr.io/scwatts/bolt:0.2.14 \ +docker run -ti -v $(pwd):$(pwd) -w $(pwd) ghcr.io/scwatts/bolt:0.2.15 \ bolt smlv_somatic filter \ --tumor_name tumor_sample \ --vcf_fp tumor_sample.vcf.gz \ diff --git a/pyproject.toml b/pyproject.toml index ae8b9bb..149727c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ include = ["bolt*"] [project] name = "bolt" -version = "0.2.14" +version = "0.2.15" authors = [ {name = "Stephen Watts", email = "stephen.watts@umccr.org"}, ] From f3696db8731e41ba191fe7190818351b6f5be15b Mon Sep 17 00:00:00 2001 From: Quentin Clayssen <37511834+qclayssen@users.noreply.github.com> Date: Tue, 23 Sep 2025 17:00:22 +1000 Subject: [PATCH 02/22] fix version (#18) --- .bumpversion.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 977b262..c5ffb18 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.14 +current_version = 0.2.15 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P[a-z0-9+]+) From a4bfad1307b6c7a8c2bf7506e586cfc15e69174c Mon Sep 17 00:00:00 2001 From: Quentin Clayssen Date: Wed, 24 Sep 2025 14:45:11 +1000 Subject: [PATCH 03/22] Bump gpgr version to 2.2.0 --- docker/Dockerfile.gpgr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.gpgr b/docker/Dockerfile.gpgr index b8504e9..f1835d3 100644 --- a/docker/Dockerfile.gpgr +++ b/docker/Dockerfile.gpgr @@ -18,7 +18,7 @@ RUN \ RUN \ conda install --prefix /env/ \ - 'r-gpgr ==2.1.3' \ + 'r-gpgr ==2.2.0' \ 'r-sigrap ==0.1.1' \ 'bioconductor-bsgenome.hsapiens.ucsc.hg38 ==1.4.5' \ 'bioconductor-txdb.hsapiens.ucsc.hg38.knowngene ==3.16.0' \ From 05c1fa298a85eccd04ba638bc5640ca4bc3a0e28 Mon Sep 17 00:00:00 2001 From: Quentin Clayssen 
<37511834+qclayssen@users.noreply.github.com> Date: Wed, 1 Oct 2025 11:36:54 +1000 Subject: [PATCH 04/22] Release 0.2.16 (#19) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Bump version: 0.2.15 → 0.2.16 * Change hrd facultative for cancer report (#17) * change hrd file facultatif * fix arg position and assign default value * fix version (#18) * Bump gpgr version to 2.2.0 --- .bumpversion.cfg | 2 +- README.md | 14 +++++++------- bolt/workflows/other/cancer_report.py | 11 ++++++++--- pyproject.toml | 2 +- 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index c5ffb18..37d2a42 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.15 +current_version = 0.2.16 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P[a-z0-9+]+) diff --git a/README.md b/README.md index f1db850..5e5d8c6 100644 --- a/README.md +++ b/README.md @@ -54,12 +54,12 @@ the software environment and dependencies. Consequently, dependencies are split | Name | Docker image URI | Commands | | --- | --- | --- | -| pcgr | ghcr.io/scwatts/bolt:0.2.15-pcgr | • `bolt smlv_germline report`
• `bolt smlv_somatic annotate`
• `bolt smlv_somatic report`
| -| gpgr | ghcr.io/scwatts/bolt:0.2.15-gpgr | • `bolt other cancer_report` | -| snpeff | ghcr.io/scwatts/bolt:0.2.15-snpeff | • `bolt sv_somatic annotate` | -| circos | ghcr.io/scwatts/bolt:0.2.15-circos | • `bolt other purple_baf_plot` | -| multiqc | ghcr.io/scwatts/bolt:0.2.15-multiqc | • `bolt other multiqc_report` | -| base | ghcr.io/scwatts/bolt:0.2.15 | • `bolt smlv_germline prepare`
• `bolt smlv_somatic rescue`
• `bolt smlv_somatic filter`
• `bolt sv_somatic prioritise`
| +| pcgr | ghcr.io/scwatts/bolt:0.2.16-pcgr | • `bolt smlv_germline report`
• `bolt smlv_somatic annotate`
• `bolt smlv_somatic report`
| +| gpgr | ghcr.io/scwatts/bolt:0.2.16-gpgr | • `bolt other cancer_report` | +| snpeff | ghcr.io/scwatts/bolt:0.2.16-snpeff | • `bolt sv_somatic annotate` | +| circos | ghcr.io/scwatts/bolt:0.2.16-circos | • `bolt other purple_baf_plot` | +| multiqc | ghcr.io/scwatts/bolt:0.2.16-multiqc | • `bolt other multiqc_report` | +| base | ghcr.io/scwatts/bolt:0.2.16 | • `bolt smlv_germline prepare`
• `bolt smlv_somatic rescue`
• `bolt smlv_somatic filter`
• `bolt sv_somatic prioritise`
| ## Usage @@ -67,7 +67,7 @@ Given the nature of software dependencies required, it is strongly recommended t [Docker images](#docker-images): ```bash -docker run -ti -v $(pwd):$(pwd) -w $(pwd) ghcr.io/scwatts/bolt:0.2.15 \ +docker run -ti -v $(pwd):$(pwd) -w $(pwd) ghcr.io/scwatts/bolt:0.2.16 \ bolt smlv_somatic filter \ --tumor_name tumor_sample \ --vcf_fp tumor_sample.vcf.gz \ diff --git a/bolt/workflows/other/cancer_report.py b/bolt/workflows/other/cancer_report.py index 0799cd1..983f596 100644 --- a/bolt/workflows/other/cancer_report.py +++ b/bolt/workflows/other/cancer_report.py @@ -29,7 +29,7 @@ @click.option('--purple_dir', required=True, type=click.Path(exists=True)) @click.option('--virusbreakend_dir', required=True, type=click.Path(exists=True)) -@click.option('--dragen_hrd_fp', required=True, type=click.Path(exists=True)) +@click.option('--dragen_hrd_fp', required=False, type=click.Path(exists=True)) @click.option('--cancer_genes_fp', required=True, type=click.Path(exists=True)) @click.option('--oncokb_genes_fp', required=True, type=click.Path(exists=True)) @@ -62,6 +62,11 @@ def entry(ctx, **kwargs): batch_name = f'{kwargs["subject_name"]}_{kwargs["tumor_name"]}' output_table_dir = output_dir / 'cancer_report_tables' + # Optional dragen hrd argument + dragen_hrd_arg = '' + if kwargs['dragen_hrd_fp']: + dragen_hrd_arg = f"--dragen_hrd {kwargs['dragen_hrd_fp']}" + # Run gpgr canrep command = fr''' gpgr.R canrep \ @@ -88,7 +93,7 @@ def entry(ctx, **kwargs): --virusbreakend_tsv {kwargs['virusbreakend_dir']}/{kwargs['tumor_name']}.virusbreakend.vcf.summary.tsv \ --virusbreakend_vcf {kwargs['virusbreakend_dir']}/{kwargs['tumor_name']}.virusbreakend.vcf \ \ - --dragen_hrd {kwargs['dragen_hrd_fp']} \ + {dragen_hrd_arg} \ --bcftools_stats {kwargs['smlv_somatic_bcftools_stats_fp']} \ \ --key_genes {kwargs['cancer_genes_fp']} \ @@ -96,7 +101,7 @@ def entry(ctx, **kwargs): \ --img_dir {output_image_dir}/ \ --result_outdir {output_table_dir}/ \ - --out_file 
{output_dir}/{kwargs["tumor_name"]}.cancer_report.html + --out_file {output_dir}/{kwargs['tumor_name']}.cancer_report.html ''' util.execute_command(command) diff --git a/pyproject.toml b/pyproject.toml index 149727c..e050686 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ include = ["bolt*"] [project] name = "bolt" -version = "0.2.15" +version = "0.2.16" authors = [ {name = "Stephen Watts", email = "stephen.watts@umccr.org"}, ] From 6dd0b0997227b02b0b4f4a64c3c189891bcc8df6 Mon Sep 17 00:00:00 2001 From: Quentin Clayssen <37511834+qclayssen@users.noreply.github.com> Date: Wed, 1 Oct 2025 12:18:46 +1000 Subject: [PATCH 05/22] Release 0.2.17 (#20) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Bump version: 0.2.16 → 0.2.17 * bump gpgr to v2.2.1 in docker files (#21) --- .bumpversion.cfg | 2 +- CHANGELOG.md | 2 ++ README.md | 14 +++++++------- docker/Dockerfile.gpgr | 2 +- pyproject.toml | 2 +- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 37d2a42..fe18769 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.16 +current_version = 0.2.17 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P[a-z0-9+]+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d8ba01..88a28d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## dev +- [17](https://github.com/umccr/bolt/pull/17) - change dragen HRD file optional + - [14](https://github.com/umccr/bolt/pull/14) - gpgr version bump to 2.2.0 - [3](https://github.com/scwatts/bolt/pull/3) - Improve PCGR / CPSR argument handling diff --git a/README.md b/README.md index 5e5d8c6..504b052 100644 --- a/README.md +++ b/README.md @@ -54,12 +54,12 @@ the software environment and dependencies. Consequently, dependencies are split | Name | Docker image URI | Commands | | --- | --- | --- | -| pcgr | ghcr.io/scwatts/bolt:0.2.16-pcgr | • `bolt smlv_germline report`
• `bolt smlv_somatic annotate`
• `bolt smlv_somatic report`
| -| gpgr | ghcr.io/scwatts/bolt:0.2.16-gpgr | • `bolt other cancer_report` | -| snpeff | ghcr.io/scwatts/bolt:0.2.16-snpeff | • `bolt sv_somatic annotate` | -| circos | ghcr.io/scwatts/bolt:0.2.16-circos | • `bolt other purple_baf_plot` | -| multiqc | ghcr.io/scwatts/bolt:0.2.16-multiqc | • `bolt other multiqc_report` | -| base | ghcr.io/scwatts/bolt:0.2.16 | • `bolt smlv_germline prepare`
• `bolt smlv_somatic rescue`
• `bolt smlv_somatic filter`
• `bolt sv_somatic prioritise`
| +| pcgr | ghcr.io/scwatts/bolt:0.2.17-pcgr | • `bolt smlv_germline report`
• `bolt smlv_somatic annotate`
• `bolt smlv_somatic report`
| +| gpgr | ghcr.io/scwatts/bolt:0.2.17-gpgr | • `bolt other cancer_report` | +| snpeff | ghcr.io/scwatts/bolt:0.2.17-snpeff | • `bolt sv_somatic annotate` | +| circos | ghcr.io/scwatts/bolt:0.2.17-circos | • `bolt other purple_baf_plot` | +| multiqc | ghcr.io/scwatts/bolt:0.2.17-multiqc | • `bolt other multiqc_report` | +| base | ghcr.io/scwatts/bolt:0.2.17 | • `bolt smlv_germline prepare`
• `bolt smlv_somatic rescue`
• `bolt smlv_somatic filter`
• `bolt sv_somatic prioritise`
| ## Usage @@ -67,7 +67,7 @@ Given the nature of software dependencies required, it is strongly recommended t [Docker images](#docker-images): ```bash -docker run -ti -v $(pwd):$(pwd) -w $(pwd) ghcr.io/scwatts/bolt:0.2.16 \ +docker run -ti -v $(pwd):$(pwd) -w $(pwd) ghcr.io/scwatts/bolt:0.2.17 \ bolt smlv_somatic filter \ --tumor_name tumor_sample \ --vcf_fp tumor_sample.vcf.gz \ diff --git a/docker/Dockerfile.gpgr b/docker/Dockerfile.gpgr index f1835d3..8b3a157 100644 --- a/docker/Dockerfile.gpgr +++ b/docker/Dockerfile.gpgr @@ -18,7 +18,7 @@ RUN \ RUN \ conda install --prefix /env/ \ - 'r-gpgr ==2.2.0' \ + 'r-gpgr ==2.2.1' \ 'r-sigrap ==0.1.1' \ 'bioconductor-bsgenome.hsapiens.ucsc.hg38 ==1.4.5' \ 'bioconductor-txdb.hsapiens.ucsc.hg38.knowngene ==3.16.0' \ diff --git a/pyproject.toml b/pyproject.toml index e050686..e7325a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ include = ["bolt*"] [project] name = "bolt" -version = "0.2.16" +version = "0.2.17" authors = [ {name = "Stephen Watts", email = "stephen.watts@umccr.org"}, ] From d7ca46b2ddb8aef1b1e9568d034b8fb66e0634d7 Mon Sep 17 00:00:00 2001 From: Quentin Clayssen <37511834+qclayssen@users.noreply.github.com> Date: Wed, 27 Aug 2025 14:40:51 +1000 Subject: [PATCH 06/22] Add option for sash sigrap individual process input to cancer report (#11) * add params input for sigrap tools --- bolt/workflows/other/cancer_report.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/bolt/workflows/other/cancer_report.py b/bolt/workflows/other/cancer_report.py index 983f596..60b4da3 100644 --- a/bolt/workflows/other/cancer_report.py +++ b/bolt/workflows/other/cancer_report.py @@ -28,6 +28,10 @@ @click.option('--purple_dir', required=True, type=click.Path(exists=True)) @click.option('--virusbreakend_dir', required=True, type=click.Path(exists=True)) +@click.option('--mutpat_dir', required=True, type=click.Path(exists=True)) + +@click.option('--hrdetect_file', required=True, 
type=click.Path(exists=True)) +@click.option('--chord_file', required=True, type=click.Path(exists=True)) @click.option('--dragen_hrd_fp', required=False, type=click.Path(exists=True)) @@ -99,6 +103,10 @@ def entry(ctx, **kwargs): --key_genes {kwargs['cancer_genes_fp']} \ --oncokb_genes {kwargs['oncokb_genes_fp']} \ \ + --mutpat_dir {kwargs['mutpat_dir']} \ + --hrdetect_file {kwargs['hrdetect_file']} \ + --chord_file {kwargs['chord_file']} \ + \ --img_dir {output_image_dir}/ \ --result_outdir {output_table_dir}/ \ --out_file {output_dir}/{kwargs['tumor_name']}.cancer_report.html From f33c48c7a96ebe4e2edb6cc0f00ea9fe01bdb7c9 Mon Sep 17 00:00:00 2001 From: Quentin Clayssen <37511834+qclayssen@users.noreply.github.com> Date: Fri, 29 Aug 2025 15:47:40 +1000 Subject: [PATCH 07/22] Hypermutation Handling in PCGR (#9) --- .gitignore | 2 + CHANGELOG.md | 4 +- bolt/common/constants.py | 52 ++++- bolt/common/pcgr.py | 234 ++++++++++++++++++++-- bolt/logging_config.py | 34 ++++ bolt/util.py | 256 +++++++++++++++++++++--- bolt/workflows/other/cancer_report.py | 6 + bolt/workflows/smlv_germline/report.py | 4 + bolt/workflows/smlv_somatic/annotate.py | 59 +++--- bolt/workflows/smlv_somatic/filter.py | 3 + bolt/workflows/smlv_somatic/report.py | 85 +++++++- bolt/workflows/smlv_somatic/rescue.py | 7 + bolt/workflows/sv_somatic/annotate.py | 3 + conda/env/bolt_env.yml | 1 + pyproject.toml | 1 + 15 files changed, 677 insertions(+), 74 deletions(-) create mode 100644 bolt/logging_config.py diff --git a/.gitignore b/.gitignore index 59a0c10..e4ef844 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ __pycache__/ build/ venv/ working/ +data/ +workspace/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 88a28d9..90b7815 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,4 +8,6 @@ - [3](https://github.com/scwatts/bolt/pull/3) - Improve PCGR / CPSR argument handling -- [6](https://github.com/umccr/bolt/pull/6) - Change oncoanalyser v2.0.0 uptade, with switch 
sv caller from GRIPSS to eSVee \ No newline at end of file +- [6](https://github.com/umccr/bolt/pull/6) - Change oncoanalyser v2.0.0 uptade, with switch sv caller from GRIPSS to eSVee + +-[9](https://github.com/umccr/bolt/pull/9) Add hypermutation sample handling \ No newline at end of file diff --git a/bolt/common/constants.py b/bolt/common/constants.py index e38ed3d..29216bc 100644 --- a/bolt/common/constants.py +++ b/bolt/common/constants.py @@ -4,7 +4,7 @@ ###################################### ## Variation selection (annotation) ## ###################################### -MAX_SOMATIC_VARIANTS = 500_000 +MAX_SOMATIC_VARIANTS = 450_000 MAX_SOMATIC_VARIANTS_GNOMAD_FILTER = 0.01 @@ -41,6 +41,43 @@ } +################################ +## Hypermutated report filter ## +################################ +PCGR_TIERS_FILTERING = ( + 'TIER_1', + 'TIER_2', + 'TIER_3', + 'TIER_4', + 'NONCODING', +) + +VEP_IMPACTS_FILTER = ( + 'intergenic', + 'intronic', + 'downstream', + 'upstream', + 'impacts_other', +) + +GENOMIC_REGIONS_FILTERING = ( + 'difficult', + 'none', + 'giab_conf', +) + +HOTSPOT_FIELDS_FILTERING = ( + 'SAGE_HOTSPOT', + 'hotspot', + 'PCGR_MUTATION_HOTSPOT', +) + +RETAIN_FIELDS_FILTERING = ( + 'PANEL', + *HOTSPOT_FIELDS_FILTERING, +) + + ################################################## ## VCF FILTER tags and FORMAT, INFO annotations ## ################################################## @@ -61,6 +98,8 @@ class VcfFilter(enum.Enum): ENCODE = 'ENCODE' GNOMAD_COMMON = 'gnomAD_common' + PCGR_COUNT_LIMIT = 'PCGR_count_limit' + @property def namespace(self): return 'FILTER' @@ -121,6 +160,8 @@ class VcfInfo(enum.Enum): RESCUED_FILTERS_EXISTING = 'RESCUED_FILTERS_EXISTING' RESCUED_FILTERS_PENDING = 'RESCUED_FILTERS_PENDING' + PANEL = 'PANEL' + @property def namespace(self): return 'INFO' @@ -187,6 +228,9 @@ def namespace(self): 'Description': f'gnomAD AF >= {MAX_GNOMAD_AF}', }, + VcfFilter.PCGR_COUNT_LIMIT: { + 'Description': 'Manually filtered to meet PCGR 500,000 
variant limit', + }, # INFO VcfInfo.TUMOR_AF: { @@ -350,6 +394,12 @@ def namespace(self): 'Description': 'Filters pending prior to variant rescue', }, + VcfInfo.PANEL: { + 'Number': '0', + 'Type': 'Flag', + 'Description': 'UMCCR somatic panel CDS (2,000 bp padding)', + }, + # FORMAT VcfFormat.SAGE_AD: { diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index fce4c56..296fb48 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -1,9 +1,12 @@ +import collections import csv +import functools +import itertools import pathlib import re import shutil import tempfile - +import logging import cyvcf2 @@ -11,6 +14,8 @@ from .. import util from ..common import constants +# Use the existing logger configuration +logger = logging.getLogger(__name__) def prepare_vcf_somatic(input_fp, tumor_name, normal_name, output_dir): @@ -110,13 +115,18 @@ def get_minimal_header(input_fh): return '\n'.join([filetype_line, *chrom_lines, *format_lines, column_line]) -def run_somatic(input_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=None, pcgrr_conda=None, purity=None, ploidy=None, sample_id=None): +def run_somatic(input_fp, pcgr_refdata_dir, pcgr_pcgr_output_dir, chunk_nbr=None, chunk_nbr=None, threads=1, pcgr_conda=None, pcgrr_conda=None, purity=None, ploidy=None, sample_id=None): # NOTE(SW): Nextflow FusionFS v2.2.8 does not support PCGR output to S3; instead write to a # temporary directory outside of the FusionFS mounted directory then manually copy across temp_dir = tempfile.TemporaryDirectory() - pcgr_output_dir = output_dir / 'pcgr/' + temp_dir_path = pathlib.Path(temp_dir.name) + pcgr_output_dir = pcgr_output_dir / f"pcgr_{chunk_nbr}" if chunk_nbr is not None else pcgr_output_dir # Check if the output directory already exists + if pcgr_output_dir.exists(): + logger.warning(f"Warning: Output directory '{pcgr_output_dir}' already exists and will be overwrited") + shutil.rmtree(pcgr_output_dir) + if not sample_id: sample_id = 'nosampleset' @@ -171,7 +181,7 @@ def 
run_somatic(input_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=No command = fr''' pcgr \ - {command_args_str} + {command_args_str} ''' if pcgr_conda: @@ -179,11 +189,24 @@ def run_somatic(input_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=No command_formatting = '\n' + ' ' * 4 command = command_formatting + command_conda + command - util.execute_command(command) + # Log file path + log_file_path = temp_dir_path / "run_somatic.log" + + # Run the command and redirect output to the log file + util.execute_command(command, log_file_path=log_file_path) shutil.copytree(temp_dir.name, pcgr_output_dir) - return pcgr_output_dir + pcgr_tsv_fp = pathlib.Path(pcgr_output_dir) / 'nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv' + pcgr_vcf_fp = pathlib.Path(pcgr_output_dir) / 'nosampleset.pcgr_acmg.grch38.vcf.gz' + + # Check if both files exist + if not pcgr_tsv_fp.exists(): + raise FileNotFoundError(f"Expected file {pcgr_tsv_fp} not found.") + if not pcgr_vcf_fp.exists(): + raise FileNotFoundError(f"Expected file {pcgr_vcf_fp} not found.") + + return pcgr_tsv_fp, pcgr_vcf_fp def run_germline(input_fp, panel_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=None, pcgrr_conda=None, sample_id=None): @@ -240,7 +263,7 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, output_dir, threads=1, pc return cpsr_output_dir -def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, output_dir): +def transfer_annotations_somatic(input_fp, tumor_name, pcgr_vcf_fp, pcgr_tsv_fp, output_dir): # Set destination INFO field names and source TSV fields info_field_map = { constants.VcfInfo.PCGR_MUTATION_HOTSPOT: 'MUTATION_HOTSPOT', @@ -249,9 +272,6 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou constants.VcfInfo.PCGR_CSQ: 'CSQ', } - pcgr_tsv_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv' - pcgr_vcf_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr_acmg.grch38.vcf.gz' - # 
Enforce matching defined and source INFO annotations check_annotation_headers(info_field_map, pcgr_vcf_fp) @@ -277,10 +297,6 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou # Do not process chrM since *snvs_indels.tiers.tsv does not include these annotations if record.CHROM == 'chrM': continue - # Immediately print out variants that were not annotated - if filter_name in record.FILTERS: - output_fh.write_record(record) - continue # Annotate and write record_ann = annotate_record(record, pcgr_data) output_fh.write_record(record_ann) @@ -481,3 +497,193 @@ def annotate_record(record, annotations, *, allow_missing=False): record.INFO[info_enum.value] = v return record + +def split_vcf(input_vcf, output_dir): + """ + Splits a VCF file into multiple chunks, each containing up to max_variants variants. + Each chunk includes the VCF header. + Ensures no overlapping positions between chunks. + """ + output_dir = pathlib.Path(output_dir / "vcf_chunks") + output_dir.mkdir(parents=True, exist_ok=True) + chunk_files = [] + chunk_number = 1 + variant_count = 0 + base_filename = pathlib.Path(input_vcf).stem + chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf" + base_filename = input_vcf.stem + chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf" + chunk_files.append(chunk_filename) + # Open the input VCF using cyvcf2 + vcf_in = cyvcf2.VCF(input_vcf) + # Create a new VCF file for the first chunk + vcf_out = cyvcf2.Writer(str(chunk_filename), vcf_in) + last_position = None + for record in vcf_in: + current_position = record.POS + # Check if we need to start a new chunk + if variant_count >= constants.MAX_SOMATIC_VARIANTS and (last_position is None or current_position != last_position): + # Close the current chunk file and start a new one + vcf_out.close() + chunk_number += 1 + chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf" + chunk_files.append(chunk_filename) + vcf_out = 
cyvcf2.Writer(str(chunk_filename), vcf_in) + variant_count = 0 + # Write the record to the current chunk + vcf_out.write_record(record) + variant_count += 1 + last_position = current_position + # Close the last chunk file + vcf_out.close() + vcf_in.close() + logger.info(f"VCF file split into {len(chunk_files)} chunks.") + return chunk_files + +def run_somatic_chunck(vcf_chunks, pcgr_data_dir, output_dir, pcgr_output_dir, max_threads, pcgr_conda, pcgrr_conda): + pcgr_tsv_files = [] + pcgr_vcf_files = [] + num_chunks = len(vcf_chunks) + # Ensure we don't use more workers than available threads, and each worker has at least 2 threads + max_workers = min(num_chunks, max_threads // 2) + threads_quot, threads_rem = divmod(max_threads, num_chunks) + threads_per_chunk = max(2, threads_quot) + # Limit the number of workers to the smaller of num_chunks or max_threads // 2 + with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor: + futures = {} + for chunk_number, vcf_file in enumerate(vcf_chunks, start=1): + # Assign extra thread to the first 'threads_rem' chunks + additional_thread = 1 if chunk_number <= threads_rem else 0 + total_threads = threads_per_chunk + additional_thread + futures[executor.submit(run_somatic, vcf_file, pcgr_data_dir, pcgr_output_dir, chunk_number, total_threads, pcgr_conda, pcgrr_conda)] = chunk_number + for future in concurrent.futures.as_completed(futures): + try: + pcgr_tsv_fp, pcgr_vcf_fp = future.result() + if pcgr_tsv_fp: + pcgr_tsv_files.append(pcgr_tsv_fp) + if pcgr_vcf_fp: + pcgr_vcf_files.append(pcgr_vcf_fp) + except Exception as e: + print(f"Exception occurred: {e}") + merged_vcf_fp, merged_tsv_fp = merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_files) + return merged_tsv_fp, merged_vcf_fp + +def merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_fp): + pcgr_dir = pathlib.Path(output_dir) / 'pcgr' + pcgr_dir.mkdir(exist_ok=True) + + # Merge all TSV files into a single file in the pcgr directory + 
merged_tsv_fp = pcgr_dir / "nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv" + util.merge_tsv_files(pcgr_tsv_fp, merged_tsv_fp) + + # Step 5: Merge all VCF files into a single file in the pcgr directory + merged_vcf_path = pcgr_dir / "nosampleset.pcgr_acmg.grch38" + merged_vcf = util.merge_vcf_files(pcgr_vcf_files, merged_vcf_path) + + return merged_vcf, merged_tsv_fp + + +def get_variant_filter_data(variant): + attribute_names = ( + 'tier', + 'difficult', + 'giab_conf', + 'intergenic', + 'intronic', + 'downstream', + 'upstream', + 'impacts_other', + ) + + data = {e: None for e in attribute_names} + + + data['tier'] = variant.INFO['PCGR_TIER'] + + + info_keys = [k for k, v in variant.INFO] + + data['difficult'] = any(e.startswith('DIFFICULT') for e in info_keys) + data['giab_conf'] = 'GIAB_CONF' in info_keys + + + # NOTE(SW): GIAB_CONF always overrides DIFFICULT tags + if data['giab_conf'] and data['difficult']: + data['difficult']= False + + + for impact in get_impacts(variant.INFO['PCGR_CSQ']): + if impact == 'intergenic_variant': + data['intergenic'] = True + elif impact == 'intron_variant': + data['intronic'] = True + elif impact == 'downstream_gene_variant': + data['downstream'] = True + elif impact == 'upstream_gene_variant': + data['upstream'] = True + elif impact: + data['impacts_other'] = True + else: + assert False + + return data + + +def get_impacts(csq_str_full): + impacts = set() + for csq_str in csq_str_full.split(','): + csq_tokens = csq_str.split('|') + impact_str = csq_tokens[1] + impacts.update(impact_str.split('&')) + return impacts + + +def determine_filter(data): + + for impact, region in get_ordering(tiers=False): + + # NOTE(SW): this is less efficient than nested loops since the outer block is reevaluated + # within what would be the inner loop each cycle; taking this route for cleaner code + + impacts_higher = get_impacts_higher(impact) + impact_filter = bool(data[impact]) and not any(bool(data[e]) for e in impacts_higher) + + 
region_filter = False + if region == 'none': + region_filter = not (data['difficult'] or data['giab_conf']) + else: + region_filter = data[region] + + if impact_filter and region_filter: + return (impact, region) + + return False + + +def get_variant_repr(variant): + return (variant.CHROM, variant.POS, variant.REF, tuple(variant.ALT)) + + +@functools.cache +def get_ordering(tiers=True, impacts=True, regions=True): + categories = [ + constants.PCGR_TIERS_FILTERING if tiers else None, + constants.VEP_IMPACTS_FILTER if impacts else None, + constants.GENOMIC_REGIONS_FILTERING if regions else None, + ] + + # NOTE(SW): I'm not aware of any noncoding impacts for TIER_[1-4] other than TERT but keeping + # in to be overly cautious + ordering_iter = itertools.product(*(c for c in categories if c)) + + return tuple(ordering_iter) + + +@functools.cache +def get_impacts_higher(impact): + impact_index = constants.VEP_IMPACTS_FILTER.index(impact) + if impact_index + 1 < len(constants.VEP_IMPACTS_FILTER): + impacts_higher = constants.VEP_IMPACTS_FILTER[impact_index+1:len(constants.VEP_IMPACTS_FILTER)] + else: + impacts_higher = list() + return impacts_higher diff --git a/bolt/logging_config.py b/bolt/logging_config.py new file mode 100644 index 0000000..0ddda2b --- /dev/null +++ b/bolt/logging_config.py @@ -0,0 +1,34 @@ +import logging +import sys +import pathlib +from datetime import datetime + +class IgnoreTinfoFilter(logging.Filter): + def filter(self, record): + # Exclude messages that contain the unwanted text. + if "no version information available" in record.getMessage(): + return False + return True + +def setup_logging(output_dir, script_name): + # Create a timestamp for the log file + timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') + log_filename = f"{script_name}_{timestamp}.log" + log_file = pathlib.Path(output_dir) / log_filename + + # Create individual handlers. 
+ console_handler = logging.StreamHandler(sys.stdout) + file_handler = logging.FileHandler(log_file) + + # Instantiate and attach the filter to both handlers. + tinfo_filter = IgnoreTinfoFilter() + console_handler.addFilter(tinfo_filter) + file_handler.addFilter(tinfo_filter) + + logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[file_handler, console_handler] + ) + logger = logging.getLogger(__name__) + logger.info("Logging setup complete") \ No newline at end of file diff --git a/bolt/util.py b/bolt/util.py index b7333d9..b2ce7df 100644 --- a/bolt/util.py +++ b/bolt/util.py @@ -1,11 +1,14 @@ +import os import pathlib import subprocess -import sys import textwrap - +import logging +from types import SimpleNamespace from .common import constants +# Set up logging +logger = logging.getLogger(__name__) # TODO(SW): create note that number this assumes location of `//` def get_project_root(): @@ -15,46 +18,58 @@ def get_project_root(): return project_root -def execute_command(command): - command_prepared = command_prepare(command) +def execute_command(command, log_file_path=None): + logger.info("Executing command: %s", command.strip()) - print(command_prepared) + # Open the log file if provided + log_file = log_file_path.open('a', encoding='utf-8') if log_file_path else None - process = subprocess.run( - command_prepared, + # Launch process with combined stdout and stderr streams, and line buffering enabled. 
+ process = subprocess.Popen( + command, shell=True, executable='/bin/bash', - capture_output=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, encoding='utf-8', + bufsize=1 # line buffered ) - if process.returncode != 0: - print(process) - print(process.stderr) - sys.exit(1) + output_lines = [] + # Iterate over each line as it becomes available + with process.stdout: + for line in iter(process.stdout.readline, ''): + if line: + logger.info(line.strip()) + output_lines.append(line) + if log_file: + log_file.write(line) + log_file.flush() # flush immediately for real-time logging + process.wait() # wait for the process to complete + + if log_file: + log_file.close() - return process + result = SimpleNamespace( + stdout=''.join(output_lines), + returncode=process.returncode, + pid=process.pid, + command=command + ) + return result def command_prepare(command): return f'set -o pipefail; {textwrap.dedent(command)}' - -#def count_vcf_records(fp, exclude_args=None): -# args = list() -# if exclude_args: -# args.append(f'-e \'{exclude_args}\'') -# -# args_str = ' '.join(args) -# command = f'bcftools view -H {args_str} {fp} | wc -l' -# -# result = execute_command(command) -# return int(result.stdout) - - def count_vcf_records(fp): - result = execute_command(f'bcftools view -H {fp} | wc -l') - return int(result.stdout) + result = subprocess.run(f'bcftools view -H {fp} | wc -l', + shell=True, + executable="/bin/bash", + capture_output=True, + text=True ) + return int(result.stdout.strip()) def add_vcf_header_entry(fh, anno_enum): @@ -94,8 +109,185 @@ def get_qualified_vcf_annotation(anno_enum): assert anno_enum in constants.VcfInfo or anno_enum in constants.VcfFormat return f'{anno_enum.namespace}/{anno_enum.value}' +def split_vcf(input_vcf, output_dir): + """ + Splits a VCF file into multiple chunks, each containing up to max_variants variants. + Each chunk includes the VCF header. + Ensures no overlapping positions between chunks. 
+ """ + output_dir = pathlib.Path(output_dir / "vcf_chunks") + output_dir.mkdir(parents=True, exist_ok=True) + + chunk_files = [] + chunk_number = 1 + variant_count = 0 + base_filename = pathlib.Path(input_vcf).stem + chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf" + base_filename = input_vcf.stem + chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf" + chunk_files.append(chunk_filename) + + # Open the input VCF using cyvcf2 + vcf_in = cyvcf2.VCF(input_vcf) + # Create a new VCF file for the first chunk + vcf_out = cyvcf2.Writer(str(chunk_filename), vcf_in) + + last_position = None + + for record in vcf_in: + current_position = record.POS + # Check if we need to start a new chunk + if variant_count >= constants.MAX_SOMATIC_VARIANTS and (last_position is None or current_position != last_position): + # Close the current chunk file and start a new one + vcf_out.close() + chunk_number += 1 + chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf" + chunk_files.append(chunk_filename) + vcf_out = cyvcf2.Writer(str(chunk_filename), vcf_in) + variant_count = 0 + + # Write the record to the current chunk + vcf_out.write_record(record) + variant_count += 1 + last_position = current_position + + # Close the last chunk file + vcf_out.close() + vcf_in.close() + + logger.info(f"VCF file split into {len(chunk_files)} chunks.") + + return chunk_files + +def merge_tsv_files(tsv_files, merged_tsv_fp): + """ + Merges all TSV files into a single TSV. + """ + with open(merged_tsv_fp, 'w') as merged_tsv: + for i, tsv_file in enumerate(tsv_files): + with open(tsv_file, 'r') as infile: + for line_number, line in enumerate(infile): + # Skip header except for the first file + if i > 0 and line_number == 0: + continue + merged_tsv.write(line) + logger.info(f"Merged TSV written to: {merged_tsv_fp}") + + +def merge_vcf_files(vcf_files, merged_vcf_fp): + """ + Merges multiple VCF files into a single sorted VCF file using bcftools. 
+ + Parameters: + - vcf_files: List of paths to VCF files to be merged. + - merged_vcf_fp: Path to the output merged VCF file (without extension). + + Returns: + - Path to the sorted merged VCF file. + """ + merged_vcf_fp = pathlib.Path(merged_vcf_fp) + merged_unsorted_vcf = merged_vcf_fp.with_suffix('.unsorted.vcf.gz') + merged_vcf = merged_vcf_fp.with_suffix('.vcf.gz') + + # Prepare the bcftools merge command arguments + command_args = [ + 'bcftools merge', + '-m all', + '-Oz', + f'-o {merged_unsorted_vcf}', + ] + [str(vcf_file) for vcf_file in vcf_files] + + # Format the command for readability + delimiter_padding = ' ' * 10 + delimiter = f' \\\n{delimiter_padding}' + command_args_str = delimiter.join(command_args) + + command = f''' + {command_args_str} + ''' + + # Run the bcftools merge command + logger.info("Running bcftools merge...") + execute_command(command) + logger.info(f"Merged VCF written to: {merged_unsorted_vcf}") + + # Sort the merged VCF file + sort_command_args = [ + 'bcftools sort', + '-Oz', + f'-o {merged_vcf}', + f'{merged_unsorted_vcf}' + ] + sort_command_args_str = delimiter.join(sort_command_args) + sort_command = f''' + {sort_command_args_str} + ''' + + logger.info("Sorting merged VCF file...") + execute_command(sort_command) + logger.info(f"Sorted merged VCF written to: {merged_vcf}") + + # Index the sorted merged VCF file + index_command_args = [ + 'bcftools index', + '-t', + f'{merged_vcf}' + ] + index_command_args_str = delimiter.join(index_command_args) + index_command = f''' + {index_command_args_str} + ''' + + logger.info("Indexing sorted merged VCF file...") + execute_command(index_command) + logger.info(f"Indexed merged VCF file: {merged_vcf}.tbi") + + # Optionally, remove the unsorted merged VCF file + if merged_unsorted_vcf.exists(): + merged_unsorted_vcf.unlink() + + return merged_vcf + +def merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_fp): + # Step 3: Merge all chunk VCF files into a single file + pcgr_dir = 
output_dir / 'pcgr/' + pcgr_dir.mkdir(exist_ok=True) + # Merge all TSV files into a single file in the pcgr directory merged_tsv_fp = os.path.join(pcgr_dir, "nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv") + merged_tsv_fp = os.path.join(pcgr_dir, "nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv") + merge_tsv_files(pcgr_tsv_fp, merged_tsv_fp) + # Step 5: Merge all VCF files into a single file in the pcgr directory + merged_vcf_path = os.path.join(pcgr_dir, "nosampleset.pcgr_acmg.grch38") + merged_vcf = merge_vcf_files(pcgr_vcf_files, merged_vcf_path) + return merged_vcf, merged_tsv_fp + +def run_somatic_chunck(vcf_chunks, pcgr_data_dir, output_dir, pcgr_output_dir, max_threads, pcgr_conda, pcgrr_conda): + pcgr_tsv_files = [] + pcgr_vcf_files = [] + + num_chunks = len(vcf_chunks) + # Ensure we don't use more workers than available threads, and each worker has at least 2 threads + max_workers = min(num_chunks, max_threads // 2) + threads_quot, threads_rem = divmod(max_threads, num_chunks) + threads_per_chunk = max(2, threads_quot) + + # Limit the number of workers to the smaller of num_chunks or max_threads // 2 + with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor: + futures = {} + for chunk_number, vcf_file in enumerate(vcf_chunks, start=1): + # Assign extra thread to the first 'threads_rem' chunks + additional_thread = 1 if chunk_number <= threads_rem else 0 + total_threads = threads_per_chunk + additional_thread + futures[executor.submit(pcgr.run_somatic, vcf_file, pcgr_data_dir, pcgr_output_dir, chunk_number, total_threads, pcgr_conda, pcgrr_conda)] = chunk_number + + for future in concurrent.futures.as_completed(futures): + try: + pcgr_tsv_fp, pcgr_vcf_fp = future.result() + if pcgr_tsv_fp: + pcgr_tsv_files.append(pcgr_tsv_fp) + if pcgr_vcf_fp: + pcgr_vcf_files.append(pcgr_vcf_fp) + except Exception as e: + print(f"Exception occurred: {e}") -#def add_vcf_filter(record, filter_enum): -# existing_filters = [e for e in 
record.FILTERS if e != 'PASS'] -# assert filter_enum.value not in existing_filters -# return ';'.join([*existing_filters, filter_enum.value]) + merged_vcf_fp, merged_tsv_fp = merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_files) + return merged_tsv_fp, merged_vcf_fp \ No newline at end of file diff --git a/bolt/workflows/other/cancer_report.py b/bolt/workflows/other/cancer_report.py index 60b4da3..1902079 100644 --- a/bolt/workflows/other/cancer_report.py +++ b/bolt/workflows/other/cancer_report.py @@ -5,6 +5,9 @@ from ... import util +from ...logging_config import setup_logging + + @click.command(name='cancer_report') @@ -48,6 +51,9 @@ def entry(ctx, **kwargs): output_dir = pathlib.Path(kwargs['output_dir']) output_dir.mkdir(mode=0o755, parents=True, exist_ok=True) + script_name = pathlib.Path(__file__).stem + setup_logging(output_dir, script_name) + # Normalise SAGE variants and remove duplicates that arise for MutationalPattern compatibility decomposed_snv_vcf = normalise_and_dedup_sage_variants( kwargs['smlv_somatic_vcf_fp'], diff --git a/bolt/workflows/smlv_germline/report.py b/bolt/workflows/smlv_germline/report.py index 0742c19..0a10711 100644 --- a/bolt/workflows/smlv_germline/report.py +++ b/bolt/workflows/smlv_germline/report.py @@ -1,5 +1,6 @@ import pathlib import yaml +from ...logging_config import setup_logging import click @@ -35,6 +36,9 @@ def entry(ctx, **kwargs): output_dir = pathlib.Path(kwargs['output_dir']) output_dir.mkdir(mode=0o755, parents=True, exist_ok=True) + # Set up logging + script_name = pathlib.Path(__file__).stem + setup_logging(output_dir, script_name) # BCFtools stats run_bcftool_stats(kwargs['vcf_unfiltered_fp'], kwargs['normal_name'], output_dir) diff --git a/bolt/workflows/smlv_somatic/annotate.py b/bolt/workflows/smlv_somatic/annotate.py index f163d5f..fe0007a 100644 --- a/bolt/workflows/smlv_somatic/annotate.py +++ b/bolt/workflows/smlv_somatic/annotate.py @@ -1,14 +1,13 @@ +import logging import pathlib - - import 
click import cyvcf2 - from ... import util from ...common import constants from ...common import pcgr - +from ...logging_config import setup_logging +logger = logging.getLogger(__name__) @click.command(name='annotate') @click.pass_context @@ -44,6 +43,9 @@ def entry(ctx, **kwargs): output_dir = pathlib.Path(kwargs['output_dir']) output_dir.mkdir(mode=0o755, parents=True, exist_ok=True) + script_name = pathlib.Path(__file__).stem + setup_logging(output_dir, script_name) + # Set all FILTER="." to FILTER="PASS" as required by PURPLE filter_pass_fp = set_filter_pass(kwargs['vcf_fp'], kwargs['tumor_name'], output_dir) @@ -73,7 +75,6 @@ def entry(ctx, **kwargs): ) # Annotate with cancer-related and functional information from a range of sources using PCGR - # - Select variants to process - there is an upper limit for PCGR of around 500k # - Set tumor and normal AF and DP in INFO for PCGR and remove all other annotations # - Run PCGR on minimal VCF (pcgr_prep_fp) # - Transfer selected PCGR annotations to unfiltered VCF (selected_fp) @@ -84,30 +85,40 @@ def entry(ctx, **kwargs): # - Hits in COSMIC [INFO/PCGR_COSMIC_COUNT] # - Hits in TCGA [INFO/PCGR_TCGA_PANCANCER_COUNT] # - Hits in PCAWG [INFO/PCGR_ICGC_PCAWG_COUNT] - # Set selected data or full input - selection_data = select_variants( - pon_fp, - kwargs['tumor_name'], - kwargs['cancer_genes_fp'], - output_dir, - ) - - if not (pcgr_prep_input_fp := selection_data.get('filtered')): - pcgr_prep_input_fp = selection_data['selected'] # Prepare VCF for PCGR annotation pcgr_prep_fp = pcgr.prepare_vcf_somatic( - pcgr_prep_input_fp, + pon_fp, kwargs['tumor_name'], kwargs['normal_name'], output_dir, ) - # Run PCGR - pcgr_dir = pcgr.run_somatic( - pcgr_prep_fp, + pcgr_output_dir = output_dir / 'pcgr' + total_variants = util.count_vcf_records(pcgr_prep_fp) + print(f"Total number of variants in the input VCF: {total_variants}") + + # Run PCGR in chunks if the total number of variants exceeds the maximum allowed for somatic variants 
+ if total_variants > constants.MAX_SOMATIC_VARIANTS: + vcf_chunks = util.split_vcf( + pcgr_prep_fp, + output_dir + ) + pcgr_tsv_fp, pcgr_vcf_fp = util.run_somatic_chunck( + vcf_chunks, kwargs['pcgr_data_dir'], output_dir, + pcgr_output_dir, + kwargs['threads'], + kwargs['pcgr_conda'], + kwargs['pcgrr_conda'] + ) + else: + pcgr_tsv_fp, pcgr_vcf_fp = pcgr.run_somatic( + pcgr_prep_fp, + kwargs['pcgr_data_dir'], + pcgr_output_dir, + chunk_nbr=None, threads=kwargs['threads'], pcgr_conda=kwargs['pcgr_conda'], pcgrr_conda=kwargs['pcgrr_conda'], @@ -115,13 +126,13 @@ def entry(ctx, **kwargs): # Transfer PCGR annotations to full set of variants pcgr.transfer_annotations_somatic( - selection_data['selected'], + pon_fp, kwargs['tumor_name'], - selection_data.get('filter_name'), - pcgr_dir, + pcgr_vcf_fp, + pcgr_tsv_fp, output_dir, ) - + logger.info("Annotation process completed") def set_filter_pass(input_fp, tumor_name, output_dir): output_fp = output_dir / f'{tumor_name}.set_filter_pass.vcf.gz' @@ -136,7 +147,6 @@ def set_filter_pass(input_fp, tumor_name, output_dir): return output_fp - def general_annotations(input_fp, tumor_name, threads, annotations_dir, output_dir): toml_fp = pathlib.Path(annotations_dir) / 'vcfanno_annotations.toml' @@ -169,7 +179,6 @@ def panel_of_normal_annotations(input_fp, tumor_name, threads, pon_dir, output_d util.execute_command(command) return output_fp - def select_variants(input_fp, tumor_name, cancer_genes_fp, output_dir): # Exclude variants until we hopefully move the needle below the threshold diff --git a/bolt/workflows/smlv_somatic/filter.py b/bolt/workflows/smlv_somatic/filter.py index e46f285..104296d 100644 --- a/bolt/workflows/smlv_somatic/filter.py +++ b/bolt/workflows/smlv_somatic/filter.py @@ -26,6 +26,9 @@ def entry(ctx, **kwargs): output_dir = pathlib.Path(kwargs['output_dir']) output_dir.mkdir(mode=0o755, parents=True, exist_ok=True) + script_name = pathlib.Path(__file__).stem + setup_logging(output_dir, script_name) + # Open 
input VCF and set required header entries for output in_fh = cyvcf2.VCF(kwargs['vcf_fp']) header_filters = ( diff --git a/bolt/workflows/smlv_somatic/report.py b/bolt/workflows/smlv_somatic/report.py index bdf908f..b2bead1 100644 --- a/bolt/workflows/smlv_somatic/report.py +++ b/bolt/workflows/smlv_somatic/report.py @@ -1,3 +1,4 @@ +import collections import csv import json import pathlib @@ -45,6 +46,9 @@ def entry(ctx, **kwargs): output_dir = pathlib.Path(kwargs['output_dir']) output_dir.mkdir(mode=0o755, parents=True, exist_ok=True) + script_name = pathlib.Path(__file__).stem + setup_logging(output_dir, script_name) + # BCFtools stats bcftools_vcf_fp = bcftools_stats_prepare(kwargs['vcf_fp'], kwargs['tumor_name'], output_dir) run_bcftools_stats(bcftools_vcf_fp, kwargs['tumor_name'], output_dir) @@ -105,10 +109,20 @@ def entry(ctx, **kwargs): fh.write('\n') # PCGR report + if variant_counts_process['filter_pass'] <= constants.MAX_SOMATIC_VARIANTS: + pcgr_input_vcf_fp = kwargs['vcf_fp'] + else: + pcgr_input_vcf_fp = select_pcgr_variants( + kwargs['vcf_fp'], + kwargs['cancer_genes_fp'], + kwargs['tumor_name'], + output_dir, + ) + purple_data = parse_purple_purity_file(kwargs['purple_purity_fp']) pcgr_prep_fp = pcgr.prepare_vcf_somatic( - kwargs['vcf_fp'], + pcgr_input_vcf_fp, kwargs['tumor_name'], kwargs['normal_name'], output_dir, @@ -284,6 +298,75 @@ def count_variant_process(vcf_fp): return counts +def select_pcgr_variants(vcf_fp, cancer_genes_fp, tumor_name, output_dir): + # Annotate variants in UMCCR somatic gene panel + fp_annotated_out = output_dir / f'{tumor_name}.umccr_panel_variants_annotated.vcf.gz' + util.execute_command(fr''' + bcftools annotate \ + --annotations <(awk 'BEGIN {{ OFS="\t" }} {{ print $1, $2-2000, $3+2000, "1" }}' {cancer_genes_fp}) \ + --header-line '{util.get_vcf_header_line(constants.VcfInfo.PANEL)}' \ + --columns CHROM,FROM,TO,{constants.VcfInfo.PANEL.value} \ + --output {fp_annotated_out} \ + {vcf_fp} + ''') + + # Set filter 
category for each variant + variants_sorted = collections.defaultdict(list) + for variant_count, variant in enumerate(cyvcf2.VCF(fp_annotated_out), 1): + if any(variant.INFO.get(e) for e in constants.RETAIN_FIELDS_FILTERING): + continue + + data = pcgr.get_variant_filter_data(variant) + variant_filter = pcgr.determine_filter(data) + assert variant_filter + + filter_category = (data['tier'], *variant_filter) + variant_repr = pcgr.get_variant_repr(variant) + variants_sorted[filter_category].append(variant_repr) + + + # Determine the set of filter categories to come under the PCGR 500,000 variant threshold + filter_sum = 0 + filter_categories = list() + for key in pcgr.get_ordering(): + + if (variant_count - filter_sum) <= constants.MAX_SOMATIC_VARIANTS: + break + + filter_sum += len(variants_sorted.get(key, [])) + filter_categories.append(key) + + # Set FILTERS and write out records + filter_variants = set() + for key in filter_categories: + filter_variants.update(variants_sorted[key]) + + fh_in = cyvcf2.VCF(vcf_fp) + util.add_vcf_header_entry(fh_in, constants.VcfFilter.PCGR_COUNT_LIMIT) + + # NOTE(SW): creating an additional VCF with all records for traceability + fp_out = output_dir / f'{tumor_name}.pcgr_hypermutated.pass.vcf.gz' + fp_set_out = output_dir / f'{tumor_name}.pcgr_hypermutated.filters_set.vcf.gz' + + fh_out = cyvcf2.Writer(fp_out, fh_in) + fh_set_out = cyvcf2.Writer(fp_set_out, fh_in) + + for variant in fh_in: + variant_repr = pcgr.get_variant_repr(variant) + if variant_repr not in filter_variants: + # Write only passing + fh_out.write_record(variant) + else: + variant.FILTER = constants.VcfFilter.PCGR_COUNT_LIMIT.value + # Write all variants including those with FILTER set + fh_set_out.write_record(variant) + + fh_out.close() + fh_set_out.close() + + return fp_out + + def parse_purple_purity_file(fp): with open(fp, 'r') as fh: entries = list(csv.DictReader(fh, delimiter='\t')) diff --git a/bolt/workflows/smlv_somatic/rescue.py 
b/bolt/workflows/smlv_somatic/rescue.py index c92df94..bfd89b9 100644 --- a/bolt/workflows/smlv_somatic/rescue.py +++ b/bolt/workflows/smlv_somatic/rescue.py @@ -7,10 +7,14 @@ import click import cyvcf2 +import logging from ... import util from ...common import constants +from ...logging_config import setup_logging + +logger = logging.getLogger(__name__) @click.command(name='rescue') @@ -37,6 +41,9 @@ def entry(ctx, **kwargs): output_dir = pathlib.Path(kwargs['output_dir']) output_dir.mkdir(mode=0o755, parents=True, exist_ok=True) + script_name = pathlib.Path(__file__).stem + setup_logging(output_dir, script_name) + # Select PASS SAGE variants in hotspots and then split into existing and novel calls sage_pass_vcf_fp = select_sage_pass_hotspot( kwargs['sage_vcf_fp'], diff --git a/bolt/workflows/sv_somatic/annotate.py b/bolt/workflows/sv_somatic/annotate.py index 2988e53..8577922 100644 --- a/bolt/workflows/sv_somatic/annotate.py +++ b/bolt/workflows/sv_somatic/annotate.py @@ -5,10 +5,13 @@ import click import cyvcf2 import pysam +import logging from ... 
import util +logger = logging.getLogger(__name__) + @click.command(name='annotate') @click.pass_context diff --git a/conda/env/bolt_env.yml b/conda/env/bolt_env.yml index e27bd24..71e90c9 100644 --- a/conda/env/bolt_env.yml +++ b/conda/env/bolt_env.yml @@ -16,3 +16,4 @@ dependencies: - python >=3.10 - pyyaml - vcfanno ==0.3.5 + - ncurses>=6.3 diff --git a/pyproject.toml b/pyproject.toml index e7325a3..e98f470 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ dependencies = [ "cyvcf2", "pysam", "pyyaml", + "future" ] [project.scripts] From cb5ee4d510acd85db1061a65a0c42a0806ae430f Mon Sep 17 00:00:00 2001 From: Quentin Clayssen <37511834+qclayssen@users.noreply.github.com> Date: Tue, 2 Sep 2025 15:31:24 +1000 Subject: [PATCH 08/22] Feature/pcgr v2.2.1 (#10) --- bolt/common/constants.py | 78 +++++------- bolt/common/pcgr.py | 152 +++++++++++++----------- bolt/util.py | 1 + bolt/workflows/smlv_germline/report.py | 2 + bolt/workflows/smlv_somatic/annotate.py | 16 ++- bolt/workflows/smlv_somatic/filter.py | 16 +-- bolt/workflows/smlv_somatic/report.py | 4 +- docker/Dockerfile.pcgr | 4 +- tests/test_smlv_somatic_filter.py | 12 +- 9 files changed, 145 insertions(+), 140 deletions(-) diff --git a/bolt/common/constants.py b/bolt/common/constants.py index 29216bc..7cebe78 100644 --- a/bolt/common/constants.py +++ b/bolt/common/constants.py @@ -35,9 +35,9 @@ 'pathogenic', 'uncertain_significance', } -PCGR_TIERS_RESCUE = { - 'TIER_1', - 'TIER_2', +PCGR_ACTIONABILITY_TIER_RESCUE = { + '1', + '2', } @@ -116,16 +116,14 @@ class VcfInfo(enum.Enum): SAGE_NOVEL = 'SAGE_NOVEL' SAGE_RESCUE = 'SAGE_RESCUE' - PCGR_TIER = 'PCGR_TIER' + PCGR_ACTIONABILITY_TIER = 'PCGR_ACTIONABILITY_TIER' PCGR_CSQ = 'PCGR_CSQ' PCGR_MUTATION_HOTSPOT = 'PCGR_MUTATION_HOTSPOT' - PCGR_CLINVAR_CLNSIG = 'PCGR_CLINVAR_CLNSIG' + PCGR_CLINVAR_CLASSIFICATION = 'PCGR_CLINVAR_CLASSIFICATION' PCGR_COSMIC_COUNT = 'PCGR_COSMIC_COUNT' PCGR_TCGA_PANCANCER_COUNT = 'PCGR_TCGA_PANCANCER_COUNT' 
PCGR_ICGC_PCAWG_COUNT = 'PCGR_ICGC_PCAWG_COUNT' - CPSR_FINAL_CLASSIFICATION = 'CPSR_FINAL_CLASSIFICATION' - CPSR_PATHOGENICITY_SCORE = 'CPSR_PATHOGENICITY_SCORE' CPSR_CLINVAR_CLASSIFICATION = 'CPSR_CLINVAR_CLASSIFICATION' CPSR_CSQ = 'CPSR_CSQ' @@ -151,7 +149,7 @@ class VcfInfo(enum.Enum): GNOMAD_AF = 'gnomAD_AF' - PCGR_TIER_RESCUE = 'PCGR_TIER_RESCUE' + PCGR_ACTIONABILITY_TIER_RESCUE = 'PCGR_ACTIONABILITY_TIER_RESCUE' SAGE_HOTSPOT_RESCUE = 'SAGE_HOTSPOT_RESCUE' CLINICAL_POTENTIAL_RESCUE = 'CLINICAL_POTENTIAL_RESCUE' @@ -270,7 +268,7 @@ def namespace(self): 'Description': 'Variant rescued by a matching SAGE call', }, - VcfInfo.PCGR_TIER: { + VcfInfo.PCGR_ACTIONABILITY_TIER: { 'Number': '1', 'Type': 'String', 'Description': ( @@ -281,28 +279,29 @@ def namespace(self): }, VcfInfo.PCGR_CSQ: { 'Number': '.', - 'Type': 'String', - 'Description': ( - 'Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|' - 'Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|' - 'CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|' - 'DISTANCE|STRAND|FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|' - 'MANE_SELECT|MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|' - 'UNIPROT_ISOFORM|RefSeq|DOMAINS|HGVS_OFFSET|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|' - 'gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|' - 'gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|CLIN_SIG|SOMATIC|PHENO|CHECK_REF|' - 'NearestExonJB' - ), + 'Type': 'String', + 'Description': ( + 'Consequence annotations from Ensembl VEP. 
Format: ' + 'Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|' + 'HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|' + 'ALLELE_NUM|DISTANCE|STRAND|FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|' + 'MANE|MANE_SELECT|MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|' + 'UNIPROT_ISOFORM|RefSeq|DOMAINS|HGVS_OFFSET|gnomADe_AF|gnomADe_AFR_AF|gnomADe_AMR_AF|' + 'gnomADe_ASJ_AF|gnomADe_EAS_AF|gnomADe_FIN_AF|gnomADe_MID_AF|gnomADe_NFE_AF|' + 'gnomADe_REMAINING_AF|gnomADe_SAS_AF|CLIN_SIG|SOMATIC|PHENO|CHECK_REF|MOTIF_NAME|' + 'MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS|NearestExonJB|' + 'MaxEntScan_alt|MaxEntScan_diff|MaxEntScan_ref' + ), }, VcfInfo.PCGR_MUTATION_HOTSPOT: { 'Number': '.', 'Type': 'String', - 'Description': 'Known cancer mutation hotspot, as found in cancerhotspots.org_v2, Gene|Codon|Q-value', + 'Description': 'Known cancer mutation hotspot, as found in cancerhotspots.org. 
Format: GeneSymbol|Entrez_ID|CodonRefAA|Alt_AA|Q-value', }, - VcfInfo.PCGR_CLINVAR_CLNSIG: { + VcfInfo.PCGR_CLINVAR_CLASSIFICATION: { 'Number': '.', 'Type': 'String', - 'Description': 'ClinVar clinical significance', + 'Description': 'ClinVar - clinical significance - per phenotype submission', }, VcfInfo.PCGR_COSMIC_COUNT: { 'Number': '1', @@ -320,23 +319,10 @@ def namespace(self): 'Description': 'Count of ICGC PCAWG hits', }, - VcfInfo.CPSR_FINAL_CLASSIFICATION: { - 'Number': '1', - 'Type': 'String', - 'Description': ( - 'Final variant classification based on the combination of CLINVAR_CLASSIFICTION (for ' - 'ClinVar-classified variants), and CPSR_CLASSIFICATION (for novel variants)' - ), - }, - VcfInfo.CPSR_PATHOGENICITY_SCORE: { - 'Number': '1', - 'Type': 'Float', - 'Description': 'Aggregated CPSR pathogenicity score', - }, VcfInfo.CPSR_CLINVAR_CLASSIFICATION: { 'Number': '1', 'Type': 'String', - 'Description': 'Clinical significance of variant on a five-tiered scale', + 'Description': 'ClinVar - Overall clinical significance of variant on a five-tiered scale', }, VcfInfo.CPSR_CSQ: { 'Number': '.', @@ -345,13 +331,13 @@ def namespace(self): 'Consequence annotations from Ensembl VEP. 
Format: Allele|Consequence|IMPACT|SYMBOL|' 'Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|' 'Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|' - 'FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|MANE_SELECT|' - 'MANE_PLUS_CLINICAL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|UNIPROT_ISOFORM|RefSeq|' - 'DOMAINS|HGVS_OFFSET|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|gnomAD_AF|gnomAD_AFR_AF|' - 'gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|' - 'gnomAD_SAS_AF|CLIN_SIG|SOMATIC|PHENO|CHECK_REF|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|' - 'MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS|NearestExonJB|LoF|LoF_filter|LoF_flags|' - 'LoF_info' + 'FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|MANE|MANE_SELECT|' + 'MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|UNIPROT_ISOFORM|RefSeq|' + 'DOMAINS|HGVS_OFFSET|gnomADe_AF|gnomADe_AFR_AF|gnomADe_AMR_AF|gnomADe_ASJ_AF|' + 'gnomADe_EAS_AF|gnomADe_FIN_AF|gnomADe_MID_AF|gnomADe_NFE_AF|gnomADe_REMAINING_AF|' + 'gnomADe_SAS_AF|CLIN_SIG|SOMATIC|PHENO|CHECK_REF|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|' + 'MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS|NearestExonJB|MaxEntScan_alt|MaxEntScan_diff|' + 'MaxEntScan_ref' ), }, @@ -360,7 +346,7 @@ def namespace(self): 'Type': 'Flag', 'Description': '', }, - VcfInfo.PCGR_TIER_RESCUE: { + VcfInfo.PCGR_ACTIONABILITY_TIER_RESCUE: { 'Number': '0', 'Type': 'Flag', 'Description': '', diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index 296fb48..8a17865 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -1,6 +1,7 @@ import collections import csv import functools +import gzip import itertools import pathlib import re @@ -115,18 +116,17 @@ def get_minimal_header(input_fh): return '\n'.join([filetype_line, *chrom_lines, *format_lines, column_line]) -def run_somatic(input_fp, pcgr_refdata_dir, pcgr_pcgr_output_dir, chunk_nbr=None, chunk_nbr=None, threads=1, pcgr_conda=None, 
pcgrr_conda=None, purity=None, ploidy=None, sample_id=None): +def run_somatic(input_fp, pcgr_refdata_dir, vep_dir, output_dir, chunk_nbr=None, threads=1, pcgr_conda=None, pcgrr_conda=None, purity=None, ploidy=None, sample_id=None): - # NOTE(SW): Nextflow FusionFS v2.2.8 does not support PCGR output to S3; instead write to a - # temporary directory outside of the FusionFS mounted directory then manually copy across - temp_dir = tempfile.TemporaryDirectory() - temp_dir_path = pathlib.Path(temp_dir.name) - pcgr_output_dir = pcgr_output_dir / f"pcgr_{chunk_nbr}" if chunk_nbr is not None else pcgr_output_dir # Check if the output directory already exists - if pcgr_output_dir.exists(): - logger.warning(f"Warning: Output directory '{pcgr_output_dir}' already exists and will be overwrited") - shutil.rmtree(pcgr_output_dir) + output_dir = output_dir / f"pcgr_{chunk_nbr}" if chunk_nbr is not None else output_dir + if output_dir.exists(): + logger.warning(f"Output directory '{output_dir}' already exists and will be overwritten") + shutil.rmtree(output_dir) + + # Create output directory + output_dir.mkdir(parents=True, exist_ok=True) if not sample_id: sample_id = 'nosampleset' @@ -134,19 +134,20 @@ def run_somatic(input_fp, pcgr_refdata_dir, pcgr_pcgr_output_dir, chunk_nbr=None command_args = [ f'--sample_id {sample_id}', f'--input_vcf {input_fp}', + f'--vep_dir {vep_dir}', + f'--refdata_dir {pcgr_refdata_dir}', f'--tumor_dp_tag TUMOR_DP', f'--tumor_af_tag TUMOR_AF', f'--control_dp_tag NORMAL_DP', f'--control_af_tag NORMAL_AF', - f'--pcgr_dir {pcgr_refdata_dir}', f'--genome_assembly grch38', f'--assay WGS', f'--estimate_signatures', - f'--estimate_msi_status', + f'--estimate_msi', f'--estimate_tmb', - f'--show_noncoding', f'--vcfanno_n_proc {threads}', - f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane', + f'--vep_n_forks 4', + f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane_plus_clinical,mane_select', ] # NOTE(SW): VEP pick order is 
applied as a successive filter: @@ -172,7 +173,7 @@ def run_somatic(input_fp, pcgr_refdata_dir, pcgr_pcgr_output_dir, chunk_nbr=None command_args.append(f'--tumor_ploidy {ploidy}') # NOTE(SW): placed here to always have output directory last - command_args.append(f'--output_dir {temp_dir.name}') + command_args.append(f'--output_dir {output_dir}') delimiter_padding = ' ' * 10 delimiter = f' \\\n{delimiter_padding}' @@ -190,18 +191,16 @@ def run_somatic(input_fp, pcgr_refdata_dir, pcgr_pcgr_output_dir, chunk_nbr=None command = command_formatting + command_conda + command # Log file path - log_file_path = temp_dir_path / "run_somatic.log" + log_file_path = output_dir / "run_somatic.log" # Run the command and redirect output to the log file util.execute_command(command, log_file_path=log_file_path) - shutil.copytree(temp_dir.name, pcgr_output_dir) - - pcgr_tsv_fp = pathlib.Path(pcgr_output_dir) / 'nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv' - pcgr_vcf_fp = pathlib.Path(pcgr_output_dir) / 'nosampleset.pcgr_acmg.grch38.vcf.gz' + pcgr_tsv_fp = pathlib.Path(output_dir) / f'{sample_id}.pcgr.grch38.snv_indel_ann.tsv.gz' + pcgr_vcf_fp = pathlib.Path(output_dir) / f'{sample_id}.pcgr.grch38.pass.vcf.gz' # Check if both files exist - if not pcgr_tsv_fp.exists(): + if not pcgr_tsv_fp.exists(): raise FileNotFoundError(f"Expected file {pcgr_tsv_fp} not found.") if not pcgr_vcf_fp.exists(): raise FileNotFoundError(f"Expected file {pcgr_vcf_fp} not found.") @@ -209,37 +208,42 @@ def run_somatic(input_fp, pcgr_refdata_dir, pcgr_pcgr_output_dir, chunk_nbr=None return pcgr_tsv_fp, pcgr_vcf_fp -def run_germline(input_fp, panel_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=None, pcgrr_conda=None, sample_id=None): +def run_germline(input_fp, panel_fp, pcgr_refdata_dir, vep_dir, output_dir, threads=1, pcgr_conda=None, pcgrr_conda=None, sample_id=None): if not sample_id: sample_id = 'nosampleset' - # NOTE(SW): Nextflow FusionFS v2.2.8 does not support PCGR output to S3; 
instead write to a - # temporary directory outside of the FusionFS mounted directory then manually copy across - temp_dir = tempfile.TemporaryDirectory() cpsr_output_dir = output_dir / 'cpsr/' + if cpsr_output_dir.exists(): + logger.warning(f"Output directory '{cpsr_output_dir}' already exists and will be overwritten") + shutil.rmtree(cpsr_output_dir) + + # Create output directory + cpsr_output_dir.mkdir(parents=True, exist_ok=True) + command_args = [ f'--sample_id {sample_id}', f'--input_vcf {input_fp}', f'--genome_assembly grch38', f'--custom_list {panel_fp}', + f'--vep_dir {pcgr_refdata_dir}', + f'--refdata_dir {pcgr_refdata_dir}', # NOTE(SW): probably useful to add versioning information here; weigh against maintainence # burden f'--custom_list_name umccr_germline_panel', f'--pop_gnomad global', f'--classify_all', - f'--pcgr_dir {pcgr_refdata_dir}', f'--vcfanno_n_proc {threads}', - f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane', + f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane_plus_clinical,mane_select', ] if pcgrr_conda: command_args.append(f'--pcgrr_conda {pcgrr_conda}') # NOTE(SW): placed here to always have output directory last - command_args.append(f'--output_dir {temp_dir.name}') + command_args.append(f'--output_dir {cpsr_output_dir}') delimiter_padding = ' ' * 10 delimiter = f' \\\n{delimiter_padding}' @@ -258,8 +262,6 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, output_dir, threads=1, pc util.execute_command(command) - shutil.copytree(temp_dir.name, cpsr_output_dir) - return cpsr_output_dir @@ -267,11 +269,14 @@ def transfer_annotations_somatic(input_fp, tumor_name, pcgr_vcf_fp, pcgr_tsv_fp, # Set destination INFO field names and source TSV fields info_field_map = { constants.VcfInfo.PCGR_MUTATION_HOTSPOT: 'MUTATION_HOTSPOT', - constants.VcfInfo.PCGR_CLINVAR_CLNSIG: 'CLINVAR_CLNSIG', + constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION: 'CLINVAR_CLASSIFICATION', 
constants.VcfInfo.PCGR_TCGA_PANCANCER_COUNT: 'TCGA_PANCANCER_COUNT', constants.VcfInfo.PCGR_CSQ: 'CSQ', } + pcgr_tsv_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv' + pcgr_vcf_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr_acmg.grch38.vcf.gz' + # Enforce matching defined and source INFO annotations check_annotation_headers(info_field_map, pcgr_vcf_fp) @@ -281,13 +286,11 @@ def transfer_annotations_somatic(input_fp, tumor_name, pcgr_vcf_fp, pcgr_tsv_fp, # Open filehandles, set required header entries input_fh = cyvcf2.VCF(input_fp) - util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_TIER) + util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_ACTIONABILITY_TIER) util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CSQ) util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_MUTATION_HOTSPOT) - util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CLINVAR_CLNSIG) - util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_COSMIC_COUNT) + util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION) util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_TCGA_PANCANCER_COUNT) - util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_ICGC_PCAWG_COUNT) output_fp = output_dir / f'{tumor_name}.annotations.vcf.gz' output_fh = cyvcf2.Writer(output_fp, input_fh, 'wz') @@ -298,20 +301,19 @@ def transfer_annotations_somatic(input_fp, tumor_name, pcgr_vcf_fp, pcgr_tsv_fp, if record.CHROM == 'chrM': continue # Annotate and write - record_ann = annotate_record(record, pcgr_data) + record_ann = annotate_record(record, pcgr_data, allow_missing=True) output_fh.write_record(record_ann) def transfer_annotations_germline(input_fp, normal_name, cpsr_dir, output_dir): # Set destination INFO field names and source TSV fields + # Note: Only include fields that exist in CPSR v2.2.1 output info_field_map = { - constants.VcfInfo.CPSR_FINAL_CLASSIFICATION: 'FINAL_CLASSIFICATION', - 
constants.VcfInfo.CPSR_PATHOGENICITY_SCORE: 'CPSR_PATHOGENICITY_SCORE', constants.VcfInfo.CPSR_CLINVAR_CLASSIFICATION: 'CLINVAR_CLASSIFICATION', constants.VcfInfo.CPSR_CSQ: 'CSQ', } - cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.snvs_indels.tiers.tsv' + cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.classification.tsv.gz' cpsr_vcf_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.vcf.gz' # Enforce matching defined and source INFO annotations @@ -323,8 +325,6 @@ def transfer_annotations_germline(input_fp, normal_name, cpsr_dir, output_dir): # Open filehandles, set required header entries input_fh = cyvcf2.VCF(input_fp) - util.add_vcf_header_entry(input_fh, constants.VcfInfo.CPSR_FINAL_CLASSIFICATION) - util.add_vcf_header_entry(input_fh, constants.VcfInfo.CPSR_PATHOGENICITY_SCORE) util.add_vcf_header_entry(input_fh, constants.VcfInfo.CPSR_CLINVAR_CLASSIFICATION) util.add_vcf_header_entry(input_fh, constants.VcfInfo.CPSR_CSQ) @@ -352,13 +352,14 @@ def check_annotation_headers(info_field_map, vcf_fp): try: header_src_entry = vcf_fh.get_header_type(header_src) except KeyError: + print(f"Missing in VCF: {header_src}") continue header_dst_entry = util.get_vcf_header_entry(header_dst) # Remove leading and trailing quotes from source header_src_description_unquoted = header_src_entry['Description'].strip('"') - assert header_src_description_unquoted == header_dst_entry['Description'] - + if header_src_description_unquoted != header_dst_entry['Description']: + raise AssertionError(f"Mismatch for {header_src}:\nVCF: {header_src_description_unquoted}\nExpected: {header_dst_entry['Description']}") def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map): # Gather all annotations from TSV @@ -368,27 +369,13 @@ def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map): key, record_ann = get_annotation_entry_tsv(record, info_field_map) assert key not in data_tsv - # Process PCGR_TIER - # TIER_1, TIER_2, TIER_3, 
TIER_4, NONCODING - record_ann[constants.VcfInfo.PCGR_TIER] = record['TIER'].replace(' ', '_') - - # Count COSMIC hits - if record['COSMIC_MUTATION_ID'] == 'NA': - cosmic_count = 0 - else: - cosmic_count = len(record['COSMIC_MUTATION_ID'].split('&')) - record_ann[constants.VcfInfo.PCGR_COSMIC_COUNT] = cosmic_count - - # Count ICGC-PCAWG hits by taking sum of affected donors where the annotation value has - # the following format: project_code|tumor_type|affected_donors|tested_donors|frequency - icgc_pcawg_count = 0 - if record['ICGC_PCAWG_OCCURRENCE'] != 'NA': - for pcawg_hit_data in record['ICGC_PCAWG_OCCURRENCE'].split(','): - pcawrg_hit_data_fields = pcawg_hit_data.split('|') - affected_donors = int(pcawrg_hit_data_fields[2]) - icgc_pcawg_count += affected_donors - assert icgc_pcawg_count > 0 - record_ann[constants.VcfInfo.PCGR_ICGC_PCAWG_COUNT] = icgc_pcawg_count + # Process PCGR_ACTIONABILITY_TIER + # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING + record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER'].replace(' ', '_') + + # Process PCGR_ACTIONABILITY_TIER + # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING + record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER'] # Store annotation data data_tsv[key] = record_ann @@ -404,7 +391,7 @@ def collect_cpsr_annotation_data(tsv_fp, vcf_fp, info_field_map): # Gather annotations from TSV data_tsv = dict() gdot_re = re.compile('^(?P[\dXYM]+):g\.(?P\d+)(?P[A-Z]+)>(?P[A-Z]+)$') - with open(tsv_fp, 'r') as tsv_fh: + with gzip.open(tsv_fp, 'rt') as tsv_fh: for record in csv.DictReader(tsv_fh, delimiter='\t'): # Decompose CPSR 'GENOMIC_CHANGE' field into CHROM, POS, REF, and ALT re_result = gdot_re.match(record['GENOMIC_CHANGE']) @@ -426,6 +413,24 @@ def collect_cpsr_annotation_data(tsv_fp, vcf_fp, info_field_map): # Compile annotations, prefering TSV source return compile_annotation_data(data_tsv, data_vcf) +def parse_genomic_change(genomic_change): + """ + Parse a genomic 
change string, e.g., "3:g.41224645T>C" + Returns a tuple: (chrom, pos, ref, alt) + """ + # Regular expression for the format "chrom:g.posRef>Alt" + pattern = r'^(?P\w+):g\.(?P\d+)(?P\w+)>(?P\w+)$' + match = re.match(pattern, genomic_change) + if not match: + raise ValueError(f"Format not recognized: {genomic_change}") + + # Get values and format as needed + chrom = f"chr{match.group('chrom')}" + pos = int(match.group('pos')) + ref = match.group('ref') + alt = match.group('alt') + return chrom, pos, ref, alt + def get_annotations_vcf(vcf_fp, info_field_map): data_vcf = dict() @@ -445,10 +450,14 @@ def get_annotations_vcf(vcf_fp, info_field_map): def get_annotation_entry_tsv(record, info_field_map): - # Set lookup key; PCGR/CPSR strips leading 'chr' from contig names - chrom = f'chr{record["CHROM"]}' - pos = int(record['POS']) - key = (chrom, pos, record['REF'], record['ALT']) + # If GENOMIC_CHANGE is present, parse it for coordinates; otherwise, use separate fields. + if "GENOMIC_CHANGE" in record and record["GENOMIC_CHANGE"]: + chrom, pos, ref, alt = parse_genomic_change(record["GENOMIC_CHANGE"]) + + if not chrom.startswith('chr'): + chrom = f'chr{chrom}' + + key = (chrom, pos, ref, alt) record_ann = dict() for info_dst, info_src in info_field_map.items(): @@ -510,7 +519,6 @@ def split_vcf(input_vcf, output_dir): chunk_number = 1 variant_count = 0 base_filename = pathlib.Path(input_vcf).stem - chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf" base_filename = input_vcf.stem chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf" chunk_files.append(chunk_filename) @@ -540,7 +548,7 @@ def split_vcf(input_vcf, output_dir): logger.info(f"VCF file split into {len(chunk_files)} chunks.") return chunk_files -def run_somatic_chunck(vcf_chunks, pcgr_data_dir, output_dir, pcgr_output_dir, max_threads, pcgr_conda, pcgrr_conda): +def run_somatic_chunck(vcf_chunks, pcgr_data_dir, vep_dir, output_dir, pcgr_output_dir, max_threads, 
pcgr_conda, pcgrr_conda): pcgr_tsv_files = [] pcgr_vcf_files = [] num_chunks = len(vcf_chunks) @@ -555,7 +563,7 @@ def run_somatic_chunck(vcf_chunks, pcgr_data_dir, output_dir, pcgr_output_dir, m # Assign extra thread to the first 'threads_rem' chunks additional_thread = 1 if chunk_number <= threads_rem else 0 total_threads = threads_per_chunk + additional_thread - futures[executor.submit(run_somatic, vcf_file, pcgr_data_dir, pcgr_output_dir, chunk_number, total_threads, pcgr_conda, pcgrr_conda)] = chunk_number + futures[executor.submit(run_somatic, vcf_file, pcgr_data_dir, vep_dir, pcgr_output_dir, chunk_number, total_threads, pcgr_conda, pcgrr_conda)] = chunk_number for future in concurrent.futures.as_completed(futures): try: pcgr_tsv_fp, pcgr_vcf_fp = future.result() @@ -577,7 +585,7 @@ def merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_fp): util.merge_tsv_files(pcgr_tsv_fp, merged_tsv_fp) # Step 5: Merge all VCF files into a single file in the pcgr directory - merged_vcf_path = pcgr_dir / "nosampleset.pcgr_acmg.grch38" + merged_vcf_path = pcgr_dir / "nosampleset.pcgr.grch38.pass.vcf.gz" merged_vcf = util.merge_vcf_files(pcgr_vcf_files, merged_vcf_path) return merged_vcf, merged_tsv_fp diff --git a/bolt/util.py b/bolt/util.py index b2ce7df..34df80f 100644 --- a/bolt/util.py +++ b/bolt/util.py @@ -1,3 +1,4 @@ +import gzip import os import pathlib import subprocess diff --git a/bolt/workflows/smlv_germline/report.py b/bolt/workflows/smlv_germline/report.py index 0a10711..395268f 100644 --- a/bolt/workflows/smlv_germline/report.py +++ b/bolt/workflows/smlv_germline/report.py @@ -23,6 +23,7 @@ @click.option('--germline_panel_list_fp', required=True, type=click.Path(exists=True)) @click.option('--pcgr_data_dir', required=True, type=click.Path(exists=True)) +@click.option('--vep_dir', required=True, type=click.Path(exists=True)) @click.option('--threads', required=True, type=int, default=1) @@ -73,6 +74,7 @@ def entry(ctx, **kwargs): cpsr_prep_fp, 
kwargs['germline_panel_list_fp'], kwargs['pcgr_data_dir'], + kwargs['vep_dir'], output_dir, threads=kwargs['threads'], pcgr_conda=kwargs['pcgr_conda'], diff --git a/bolt/workflows/smlv_somatic/annotate.py b/bolt/workflows/smlv_somatic/annotate.py index fe0007a..471b816 100644 --- a/bolt/workflows/smlv_somatic/annotate.py +++ b/bolt/workflows/smlv_somatic/annotate.py @@ -22,6 +22,7 @@ @click.option('--pon_dir', required=True, type=click.Path(exists=True)) @click.option('--pcgr_data_dir', required=True, type=click.Path(exists=True)) +@click.option('--vep_dir', required=True, type=click.Path(exists=True)) @click.option('--pcgr_conda', required=False, type=str) @click.option('--pcgrr_conda', required=False, type=str) @@ -81,8 +82,7 @@ def entry(ctx, **kwargs): # - PCGR ACMG TIER [INFO/PCGR_TIER] # - VEP consequence [INFO/PCR_CSQ] # - Known mutation hotspot [INFO/PCGR_MUTATION_HOTSPOT] - # - ClinVar clinical significant [INFO/PCGR_CLINVAR_CLNSIG] - # - Hits in COSMIC [INFO/PCGR_COSMIC_COUNT] + # - ClinVar clinical significant [INFO/PCGR_CLNSIG] # - Hits in TCGA [INFO/PCGR_TCGA_PANCANCER_COUNT] # - Hits in PCAWG [INFO/PCGR_ICGC_PCAWG_COUNT] @@ -107,6 +107,7 @@ def entry(ctx, **kwargs): pcgr_tsv_fp, pcgr_vcf_fp = util.run_somatic_chunck( vcf_chunks, kwargs['pcgr_data_dir'], + kwargs['vep_dir'], output_dir, pcgr_output_dir, kwargs['threads'], @@ -117,6 +118,17 @@ def entry(ctx, **kwargs): pcgr_tsv_fp, pcgr_vcf_fp = pcgr.run_somatic( pcgr_prep_fp, kwargs['pcgr_data_dir'], + output_dir, + pcgr_output_dir, + kwargs['threads'], + kwargs['pcgr_conda'], + kwargs['pcgrr_conda'] + ) + else: + pcgr_tsv_fp, pcgr_vcf_fp = pcgr.run_somatic( + pcgr_prep_fp, + kwargs['pcgr_data_dir'], + kwargs['vep_dir'], pcgr_output_dir, chunk_nbr=None, threads=kwargs['threads'], diff --git a/bolt/workflows/smlv_somatic/filter.py b/bolt/workflows/smlv_somatic/filter.py index 104296d..814863c 100644 --- a/bolt/workflows/smlv_somatic/filter.py +++ b/bolt/workflows/smlv_somatic/filter.py @@ -40,7 +40,7 @@ 
def entry(ctx, **kwargs): constants.VcfFilter.ENCODE, constants.VcfFilter.GNOMAD_COMMON, constants.VcfInfo.SAGE_HOTSPOT_RESCUE, - constants.VcfInfo.PCGR_TIER_RESCUE, + constants.VcfInfo.PCGR_ACTIONABILITY_TIER_RESCUE, constants.VcfInfo.CLINICAL_POTENTIAL_RESCUE, constants.VcfInfo.RESCUED_FILTERS_EXISTING, constants.VcfInfo.RESCUED_FILTERS_PENDING, @@ -165,9 +165,9 @@ def set_filter_data(record, tumor_index): ## # PCGR tier rescue ## - pcgr_tier = record.INFO.get(constants.VcfInfo.PCGR_TIER.value) - if pcgr_tier in constants.PCGR_TIERS_RESCUE: - info_rescue.append(constants.VcfInfo.PCGR_TIER_RESCUE) + pcgr_tier = record.INFO.get(constants.VcfInfo.PCGR_ACTIONABILITY_TIER.value) + if pcgr_tier in constants.PCGR_ACTIONABILITY_TIER_RESCUE: + info_rescue.append(constants.VcfInfo.PCGR_ACTIONABILITY_TIER_RESCUE) ## # SAGE hotspot rescue @@ -184,19 +184,15 @@ def set_filter_data(record, tumor_index): # single CLINICAL_POTENTIAL_RESCUE flag # Get ClinVar clinical significance entries - clinvar_clinsig = record.INFO.get(constants.VcfInfo.PCGR_CLINVAR_CLNSIG.value, '') + clinvar_clinsig = record.INFO.get(constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION.value, '') clinvar_clinsigs = clinvar_clinsig.split(',') # Hit counts in relevant reference somatic mutation databases - cosmic_count = record.INFO.get(constants.VcfInfo.PCGR_COSMIC_COUNT.value, 0) tcga_pancancer_count = record.INFO.get(constants.VcfInfo.PCGR_TCGA_PANCANCER_COUNT.value, 0) - icgc_pcawg_count = record.INFO.get(constants.VcfInfo.PCGR_ICGC_PCAWG_COUNT.value, 0) if ( record.INFO.get(constants.VcfInfo.HMF_HOTSPOT.value) is not None or record.INFO.get(constants.VcfInfo.PCGR_MUTATION_HOTSPOT.value) is not None or any(e in clinvar_clinsigs for e in constants.CLINVAR_CLINSIGS_RESCUE) or - cosmic_count >= constants.MIN_COSMIC_COUNT_RESCUE or - tcga_pancancer_count >= constants.MIN_TCGA_PANCANCER_COUNT_RESCUE or - icgc_pcawg_count >= constants.MIN_ICGC_PCAWG_COUNT_RESCUE + tcga_pancancer_count >= 
constants.MIN_TCGA_PANCANCER_COUNT_RESCUE ): info_rescue.append(constants.VcfInfo.CLINICAL_POTENTIAL_RESCUE) diff --git a/bolt/workflows/smlv_somatic/report.py b/bolt/workflows/smlv_somatic/report.py index b2bead1..ede90ca 100644 --- a/bolt/workflows/smlv_somatic/report.py +++ b/bolt/workflows/smlv_somatic/report.py @@ -28,6 +28,7 @@ @click.option('--pcgrr_conda', required=False, type=str) @click.option('--pcgr_data_dir', required=False, type=str) +@click.option('--vep_dir', required=True, type=click.Path(exists=True)) @click.option('--purple_purity_fp', required=True, type=click.Path(exists=True)) @click.option('--cancer_genes_fp', required=True, type=click.Path(exists=True)) @@ -131,7 +132,8 @@ def entry(ctx, **kwargs): pcgr.run_somatic( pcgr_prep_fp, kwargs['pcgr_data_dir'], - output_dir, + kwargs['vep_dir'], + pcgr_output_dir, threads=kwargs['threads'], pcgr_conda=kwargs['pcgr_conda'], pcgrr_conda=kwargs['pcgrr_conda'], diff --git a/docker/Dockerfile.pcgr b/docker/Dockerfile.pcgr index 4ea4932..ca984e8 100644 --- a/docker/Dockerfile.pcgr +++ b/docker/Dockerfile.pcgr @@ -15,13 +15,13 @@ RUN \ conda create \ --solver libmamba \ --name pcgr \ - --file https://raw.githubusercontent.com/sigven/pcgr/v1.4.1/conda/env/lock/pcgr-linux-64.lock + --file https://raw.githubusercontent.com/sigven/pcgr/refs/tags/v2.2.1/conda/env/lock/pcgr-linux-64.lock RUN \ conda create \ --solver libmamba \ --name pcgrr \ - --file https://raw.githubusercontent.com/sigven/pcgr/v1.4.1/conda/env/lock/pcgrr-linux-64.lock + --file https://raw.githubusercontent.com/sigven/pcgr/refs/tags/v2.2.1/conda/env/lock/pcgrr-linux-64.lock COPY ./conda/env/bolt_env.yml /tmp/ RUN \ diff --git a/tests/test_smlv_somatic_filter.py b/tests/test_smlv_somatic_filter.py index 41a0e96..6455162 100644 --- a/tests/test_smlv_somatic_filter.py +++ b/tests/test_smlv_somatic_filter.py @@ -160,15 +160,15 @@ def test_common_population_filter(self): def test_pcgr_tier_rescue(self): pcgr_tiers = [ - 'TIER_1', - 'TIER_2', + 
'1', + '2', ] - rescue_tag_str = bolt_constants.VcfInfo.PCGR_TIER_RESCUE.value + rescue_tag_str = bolt_constants.VcfInfo.PCGR_ACTIONABILITY_TIER_RESCUE.value for pcgr_tier in pcgr_tiers: record = get_record( **self.records['filter_min_af9.9'], - info_data={'PCGR_TIER': pcgr_tier}, + info_data={'PCGR_ACTIONABILITY_TIER': pcgr_tier}, ) smlv_somatic_filter.set_filter_data(record, 0) assert not record.FILTER @@ -189,9 +189,7 @@ def test_clinical_potential_rescue_general(self): info_data_sets = [ {'HMF_HOTSPOT': ''}, {'PCGR_MUTATION_HOTSPOT': ''}, - {'PCGR_COSMIC_COUNT': 11}, {'PCGR_TCGA_PANCANCER_COUNT': 6}, - {'PCGR_ICGC_PCAWG_COUNT': 6}, ] rescue_tag_str = bolt_constants.VcfInfo.CLINICAL_POTENTIAL_RESCUE.value @@ -217,7 +215,7 @@ def test_clinical_potential_rescue_clinvar_clinsig(self): for clinsig in clinsigs: record = get_record( **self.records['filter_min_af9.9'], - info_data={'PCGR_CLINVAR_CLNSIG': clinsig}, + info_data={'PCGR_CLINVAR_CLASSIFICATION': clinsig}, ) smlv_somatic_filter.set_filter_data(record, 0) assert not record.FILTER From 9d8e4d11292d5875dda0dbc901eda0940903bdb4 Mon Sep 17 00:00:00 2001 From: Quentin Clayssen Date: Tue, 16 Sep 2025 10:34:44 +1000 Subject: [PATCH 09/22] fix merge duplication and imports --- bolt/common/pcgr.py | 55 ++++++++++++++++--------- bolt/util.py | 51 ++--------------------- bolt/workflows/smlv_somatic/annotate.py | 53 +++++++++--------------- pyproject.toml | 1 - 4 files changed, 59 insertions(+), 101 deletions(-) diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index 8a17865..72376fc 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -8,6 +8,7 @@ import shutil import tempfile import logging +import concurrent.futures import cyvcf2 @@ -228,7 +229,7 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, vep_dir, output_dir, thre f'--input_vcf {input_fp}', f'--genome_assembly grch38', f'--custom_list {panel_fp}', - f'--vep_dir {pcgr_refdata_dir}', + f'--vep_dir {vep_dir}', f'--refdata_dir 
{pcgr_refdata_dir}', # NOTE(SW): probably useful to add versioning information here; weigh against maintainence # burden @@ -274,8 +275,9 @@ def transfer_annotations_somatic(input_fp, tumor_name, pcgr_vcf_fp, pcgr_tsv_fp, constants.VcfInfo.PCGR_CSQ: 'CSQ', } - pcgr_tsv_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv' - pcgr_vcf_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr_acmg.grch38.vcf.gz' + # Respect paths provided; do not override + pcgr_tsv_fp = pathlib.Path(pcgr_tsv_fp) + pcgr_vcf_fp = pathlib.Path(pcgr_vcf_fp) # Enforce matching defined and source INFO annotations check_annotation_headers(info_field_map, pcgr_vcf_fp) @@ -364,18 +366,27 @@ def check_annotation_headers(info_field_map, vcf_fp): def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map): # Gather all annotations from TSV data_tsv = dict() - with open(tsv_fp, 'r') as tsv_fh: + # Read gz or plain text based on extension + open_fn = gzip.open if str(tsv_fp).endswith('.gz') else open + with open_fn(tsv_fp, 'rt') as tsv_fh: for record in csv.DictReader(tsv_fh, delimiter='\t'): key, record_ann = get_annotation_entry_tsv(record, info_field_map) assert key not in data_tsv - # Process PCGR_ACTIONABILITY_TIER - # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING - record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER'].replace(' ', '_') - - # Process PCGR_ACTIONABILITY_TIER - # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING - record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER'] + # Normalize PCGR actionability tier to simple values: '1','2','3','4','N' + raw_tier = (record.get('ACTIONABILITY_TIER') or '').strip() + tier_norm = raw_tier.replace('_', ' ').upper() + if tier_norm in ('TIER 1','TIER1','1'): + tier_val = '1' + elif tier_norm in ('TIER 2','TIER2','2'): + tier_val = '2' + elif tier_norm in ('TIER 3','TIER3','3'): + tier_val = '3' + elif tier_norm in ('TIER 4','TIER4','4'): + tier_val = '4' + else: + 
tier_val = 'N' + record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = tier_val # Store annotation data data_tsv[key] = record_ann @@ -450,12 +461,18 @@ def get_annotations_vcf(vcf_fp, info_field_map): def get_annotation_entry_tsv(record, info_field_map): - # If GENOMIC_CHANGE is present, parse it for coordinates; otherwise, use separate fields. - if "GENOMIC_CHANGE" in record and record["GENOMIC_CHANGE"]: - chrom, pos, ref, alt = parse_genomic_change(record["GENOMIC_CHANGE"]) - - if not chrom.startswith('chr'): - chrom = f'chr{chrom}' + # If GENOMIC_CHANGE is present, parse it; otherwise, fallback to CHROM/POS/REF/ALT fields + chrom = pos = ref = alt = None + if 'GENOMIC_CHANGE' in record and record['GENOMIC_CHANGE']: + chrom, pos, ref, alt = parse_genomic_change(record['GENOMIC_CHANGE']) + else: + chrom = record.get('CHROM') or record.get('Chromosome') + pos = int(record.get('POS') or record.get('Start_position')) + ref = record.get('REF') + alt = record.get('ALT') + # Ensure chrom has 'chr' prefix + if chrom and not str(chrom).startswith('chr'): + chrom = f'chr{chrom}' key = (chrom, pos, ref, alt) @@ -518,7 +535,7 @@ def split_vcf(input_vcf, output_dir): chunk_files = [] chunk_number = 1 variant_count = 0 - base_filename = pathlib.Path(input_vcf).stem + input_vcf = pathlib.Path(input_vcf) base_filename = input_vcf.stem chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf" chunk_files.append(chunk_filename) @@ -606,7 +623,7 @@ def get_variant_filter_data(variant): data = {e: None for e in attribute_names} - data['tier'] = variant.INFO['PCGR_TIER'] + data['tier'] = variant.INFO.get('PCGR_ACTIONABILITY_TIER') info_keys = [k for k, v in variant.INFO] diff --git a/bolt/util.py b/bolt/util.py index 34df80f..7310b89 100644 --- a/bolt/util.py +++ b/bolt/util.py @@ -1,11 +1,11 @@ -import gzip -import os import pathlib import subprocess import textwrap import logging from types import SimpleNamespace +import cyvcf2 + from .common import constants # 
Set up logging @@ -122,8 +122,7 @@ def split_vcf(input_vcf, output_dir): chunk_files = [] chunk_number = 1 variant_count = 0 - base_filename = pathlib.Path(input_vcf).stem - chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf" + input_vcf = pathlib.Path(input_vcf) base_filename = input_vcf.stem chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf" chunk_files.append(chunk_filename) @@ -248,47 +247,3 @@ def merge_vcf_files(vcf_files, merged_vcf_fp): merged_unsorted_vcf.unlink() return merged_vcf - -def merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_fp): - # Step 3: Merge all chunk VCF files into a single file - pcgr_dir = output_dir / 'pcgr/' - pcgr_dir.mkdir(exist_ok=True) - # Merge all TSV files into a single file in the pcgr directory merged_tsv_fp = os.path.join(pcgr_dir, "nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv") - merged_tsv_fp = os.path.join(pcgr_dir, "nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv") - merge_tsv_files(pcgr_tsv_fp, merged_tsv_fp) - # Step 5: Merge all VCF files into a single file in the pcgr directory - merged_vcf_path = os.path.join(pcgr_dir, "nosampleset.pcgr_acmg.grch38") - merged_vcf = merge_vcf_files(pcgr_vcf_files, merged_vcf_path) - return merged_vcf, merged_tsv_fp - -def run_somatic_chunck(vcf_chunks, pcgr_data_dir, output_dir, pcgr_output_dir, max_threads, pcgr_conda, pcgrr_conda): - pcgr_tsv_files = [] - pcgr_vcf_files = [] - - num_chunks = len(vcf_chunks) - # Ensure we don't use more workers than available threads, and each worker has at least 2 threads - max_workers = min(num_chunks, max_threads // 2) - threads_quot, threads_rem = divmod(max_threads, num_chunks) - threads_per_chunk = max(2, threads_quot) - - # Limit the number of workers to the smaller of num_chunks or max_threads // 2 - with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor: - futures = {} - for chunk_number, vcf_file in enumerate(vcf_chunks, start=1): - # Assign extra thread 
to the first 'threads_rem' chunks - additional_thread = 1 if chunk_number <= threads_rem else 0 - total_threads = threads_per_chunk + additional_thread - futures[executor.submit(pcgr.run_somatic, vcf_file, pcgr_data_dir, pcgr_output_dir, chunk_number, total_threads, pcgr_conda, pcgrr_conda)] = chunk_number - - for future in concurrent.futures.as_completed(futures): - try: - pcgr_tsv_fp, pcgr_vcf_fp = future.result() - if pcgr_tsv_fp: - pcgr_tsv_files.append(pcgr_tsv_fp) - if pcgr_vcf_fp: - pcgr_vcf_files.append(pcgr_vcf_fp) - except Exception as e: - print(f"Exception occurred: {e}") - - merged_vcf_fp, merged_tsv_fp = merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_files) - return merged_tsv_fp, merged_vcf_fp \ No newline at end of file diff --git a/bolt/workflows/smlv_somatic/annotate.py b/bolt/workflows/smlv_somatic/annotate.py index 471b816..d0efa93 100644 --- a/bolt/workflows/smlv_somatic/annotate.py +++ b/bolt/workflows/smlv_somatic/annotate.py @@ -98,43 +98,30 @@ def entry(ctx, **kwargs): total_variants = util.count_vcf_records(pcgr_prep_fp) print(f"Total number of variants in the input VCF: {total_variants}") - # Run PCGR in chunks if the total number of variants exceeds the maximum allowed for somatic variants + # Run PCGR in chunks if exceeding the maximum allowed for somatic variants if total_variants > constants.MAX_SOMATIC_VARIANTS: - vcf_chunks = util.split_vcf( - pcgr_prep_fp, - output_dir + vcf_chunks = pcgr.split_vcf(pcgr_prep_fp, output_dir) + pcgr_tsv_fp, pcgr_vcf_fp = pcgr.run_somatic_chunck( + vcf_chunks, + kwargs['pcgr_data_dir'], + kwargs['vep_dir'], + output_dir, + pcgr_output_dir, + kwargs['threads'], + kwargs['pcgr_conda'], + kwargs['pcgrr_conda'], ) - pcgr_tsv_fp, pcgr_vcf_fp = util.run_somatic_chunck( - vcf_chunks, - kwargs['pcgr_data_dir'], - kwargs['vep_dir'], - output_dir, - pcgr_output_dir, - kwargs['threads'], - kwargs['pcgr_conda'], - kwargs['pcgrr_conda'] - ) - else: - pcgr_tsv_fp, pcgr_vcf_fp = pcgr.run_somatic( - 
pcgr_prep_fp, - kwargs['pcgr_data_dir'], - output_dir, - pcgr_output_dir, - kwargs['threads'], - kwargs['pcgr_conda'], - kwargs['pcgrr_conda'] - ) else: pcgr_tsv_fp, pcgr_vcf_fp = pcgr.run_somatic( - pcgr_prep_fp, - kwargs['pcgr_data_dir'], - kwargs['vep_dir'], - pcgr_output_dir, - chunk_nbr=None, - threads=kwargs['threads'], - pcgr_conda=kwargs['pcgr_conda'], - pcgrr_conda=kwargs['pcgrr_conda'], - ) + pcgr_prep_fp, + kwargs['pcgr_data_dir'], + kwargs['vep_dir'], + pcgr_output_dir, + chunk_nbr=None, + threads=kwargs['threads'], + pcgr_conda=kwargs['pcgr_conda'], + pcgrr_conda=kwargs['pcgrr_conda'], + ) # Transfer PCGR annotations to full set of variants pcgr.transfer_annotations_somatic( diff --git a/pyproject.toml b/pyproject.toml index e98f470..e7325a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,6 @@ dependencies = [ "cyvcf2", "pysam", "pyyaml", - "future" ] [project.scripts] From 8db35b30bd3c6134314905d68fab6f66250c3457 Mon Sep 17 00:00:00 2001 From: Quentin Clayssen Date: Thu, 2 Oct 2025 14:15:26 +1000 Subject: [PATCH 10/22] bump pcgr version --- docker/Dockerfile.pcgr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.pcgr b/docker/Dockerfile.pcgr index ca984e8..8615e50 100644 --- a/docker/Dockerfile.pcgr +++ b/docker/Dockerfile.pcgr @@ -39,7 +39,7 @@ RUN \ RUN \ conda clean -afy -FROM quay.io/bioconda/base-glibc-busybox-bash:2.1.0 +FROM quay.io/bioconda/base-glibc-busybox-bash:2.2.5 # Copy Conda install and all environments COPY --from=build /opt/conda/ /opt/conda/ From c67b30ad4d5ecfa9e28817a61bc341d8f0c71847 Mon Sep 17 00:00:00 2001 From: Quentin Clayssen Date: Thu, 2 Oct 2025 14:19:01 +1000 Subject: [PATCH 11/22] bump gpgr version --- docker/Dockerfile.gpgr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.gpgr b/docker/Dockerfile.gpgr index 8b3a157..6b76ecb 100644 --- a/docker/Dockerfile.gpgr +++ b/docker/Dockerfile.gpgr @@ -18,7 +18,7 @@ RUN \ RUN \ conda install 
--prefix /env/ \ - 'r-gpgr ==2.2.1' \ + 'r-gpgr ==2.2.11' \ 'r-sigrap ==0.1.1' \ 'bioconductor-bsgenome.hsapiens.ucsc.hg38 ==1.4.5' \ 'bioconductor-txdb.hsapiens.ucsc.hg38.knowngene ==3.16.0' \ From f9a7d3b9ce7912b41f6ef97fe42302201bf91c3f Mon Sep 17 00:00:00 2001 From: Quentin Clayssen Date: Fri, 17 Oct 2025 09:26:19 +1100 Subject: [PATCH 12/22] change print to logging --- bolt/common/pcgr.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index 72376fc..423a8b5 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -588,8 +588,9 @@ def run_somatic_chunck(vcf_chunks, pcgr_data_dir, vep_dir, output_dir, pcgr_outp pcgr_tsv_files.append(pcgr_tsv_fp) if pcgr_vcf_fp: pcgr_vcf_files.append(pcgr_vcf_fp) - except Exception as e: - print(f"Exception occurred: {e}") + except Exception: + chunk_number = futures[future] + logger.exception(f"Exception occurred while processing PCGR chunk {chunk_number}.") merged_vcf_fp, merged_tsv_fp = merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_files) return merged_tsv_fp, merged_vcf_fp From b8ee4a0b1e2b54aaf6b99d2f3e7571448db8c3c9 Mon Sep 17 00:00:00 2001 From: Quentin Clayssen Date: Fri, 17 Oct 2025 10:02:11 +1100 Subject: [PATCH 13/22] add argument chunck size --- bolt/common/pcgr.py | 9 +++++++-- bolt/workflows/smlv_somatic/annotate.py | 10 ++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index 423a8b5..54c87aa 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -524,12 +524,17 @@ def annotate_record(record, annotations, *, allow_missing=False): return record -def split_vcf(input_vcf, output_dir): +def split_vcf(input_vcf, output_dir, *, max_variants=None): """ Splits a VCF file into multiple chunks, each containing up to max_variants variants. Each chunk includes the VCF header. Ensures no overlapping positions between chunks. 
""" + if max_variants is None: + max_variants = constants.MAX_SOMATIC_VARIANTS + elif max_variants <= 0: + raise ValueError("max_variants must be a positive integer.") + output_dir = pathlib.Path(output_dir / "vcf_chunks") output_dir.mkdir(parents=True, exist_ok=True) chunk_files = [] @@ -547,7 +552,7 @@ def split_vcf(input_vcf, output_dir): for record in vcf_in: current_position = record.POS # Check if we need to start a new chunk - if variant_count >= constants.MAX_SOMATIC_VARIANTS and (last_position is None or current_position != last_position): + if variant_count >= max_variants and (last_position is None or current_position != last_position): # Close the current chunk file and start a new one vcf_out.close() chunk_number += 1 diff --git a/bolt/workflows/smlv_somatic/annotate.py b/bolt/workflows/smlv_somatic/annotate.py index d0efa93..32b85b3 100644 --- a/bolt/workflows/smlv_somatic/annotate.py +++ b/bolt/workflows/smlv_somatic/annotate.py @@ -30,6 +30,7 @@ @click.option('--threads', required=False, default=4, type=int) @click.option('--output_dir', required=True, type=click.Path()) +@click.option('--pcgr_variant_chunk_size', required=False, type=int, help='Override maximum variants per PCGR chunk.') def entry(ctx, **kwargs): '''Annotate variants with information from several sources\f @@ -99,8 +100,13 @@ def entry(ctx, **kwargs): print(f"Total number of variants in the input VCF: {total_variants}") # Run PCGR in chunks if exceeding the maximum allowed for somatic variants - if total_variants > constants.MAX_SOMATIC_VARIANTS: - vcf_chunks = pcgr.split_vcf(pcgr_prep_fp, output_dir) + chunk_size = kwargs.get('pcgr_variant_chunk_size') + if chunk_size is not None and chunk_size <= 0: + raise click.BadParameter('must be a positive integer', param_hint='--pcgr_variant_chunk_size') + chunk_size = chunk_size or constants.MAX_SOMATIC_VARIANTS + + if total_variants > chunk_size: + vcf_chunks = pcgr.split_vcf(pcgr_prep_fp, output_dir, max_variants=chunk_size) 
pcgr_tsv_fp, pcgr_vcf_fp = pcgr.run_somatic_chunck( vcf_chunks, kwargs['pcgr_data_dir'], From 8223ef9a729119bdbfefe7a16b3b7b4b25439200 Mon Sep 17 00:00:00 2001 From: Quentin Clayssen Date: Fri, 17 Oct 2025 11:44:28 +1100 Subject: [PATCH 14/22] fix version software pcgr container --- docker/Dockerfile.pcgr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.pcgr b/docker/Dockerfile.pcgr index 8615e50..ca984e8 100644 --- a/docker/Dockerfile.pcgr +++ b/docker/Dockerfile.pcgr @@ -39,7 +39,7 @@ RUN \ RUN \ conda clean -afy -FROM quay.io/bioconda/base-glibc-busybox-bash:2.2.5 +FROM quay.io/bioconda/base-glibc-busybox-bash:2.1.0 # Copy Conda install and all environments COPY --from=build /opt/conda/ /opt/conda/ From f83c23aec1273df1fa9ddeaab354c279b3375352 Mon Sep 17 00:00:00 2001 From: Quentin Clayssen Date: Fri, 17 Oct 2025 15:50:20 +1100 Subject: [PATCH 15/22] add logging and filter fonction to read value and replace placeholder --- bolt/workflows/smlv_somatic/filter.py | 59 +++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 8 deletions(-) diff --git a/bolt/workflows/smlv_somatic/filter.py b/bolt/workflows/smlv_somatic/filter.py index 814863c..bb5e8f1 100644 --- a/bolt/workflows/smlv_somatic/filter.py +++ b/bolt/workflows/smlv_somatic/filter.py @@ -1,12 +1,14 @@ import click import pathlib - +import logging import cyvcf2 from ... 
import util from ...common import constants +from ...logging_config import setup_logging +logger = logging.getLogger(__name__) @click.command(name='filter') @@ -129,7 +131,11 @@ def set_filter_data(record, tumor_index): # PON filter ## # NOTE(SW): 'max' is inclusive - keeps variants with 0 to n-1 PON hits; preserved from Umccrise - pon_count = record.INFO.get(constants.VcfInfo.PON_COUNT.value, 0) + pon_count = get_record_value( + record, + constants.VcfInfo.PON_COUNT.value, + default=0 + ) if pon_count >= constants.PON_HIT_THRESHOLD: filters.append(constants.VcfFilter.PON) @@ -144,7 +150,14 @@ def set_filter_data(record, tumor_index): ## # NOTE(SW): rounding is essential here for accurate comparison; cyvcf2 floating-point error # means INFO/gnomAD_AF=0.01 can be represented as 0.009999999776482582 - gnomad_af = round(record.INFO.get(constants.VcfInfo.GNOMAD_AF.value, 0), 3) + gnomad_af = round( + get_record_value( + record, + constants.VcfInfo.GNOMAD_AF.value, + default=0.0 ), + ), + 3, + ) if gnomad_af >= constants.MAX_GNOMAD_AF: filters.append(constants.VcfFilter.GNOMAD_COMMON) @@ -173,7 +186,7 @@ def set_filter_data(record, tumor_index): # SAGE hotspot rescue ## # NOTE(SW): effectively reverts any FILTERs that may have been applied above - if record.INFO.get(constants.VcfInfo.SAGE_HOTSPOT.value) is not None: + if get_record_value(record, constants.VcfInfo.SAGE_HOTSPOT.value) is not None: info_rescue.append(constants.VcfInfo.SAGE_HOTSPOT_RESCUE) ## @@ -184,13 +197,24 @@ def set_filter_data(record, tumor_index): # single CLINICAL_POTENTIAL_RESCUE flag # Get ClinVar clinical significance entries - clinvar_clinsig = record.INFO.get(constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION.value, '') + clinvar_clinsig = get_record_value( + record, + constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION.value, + default='', + ) clinvar_clinsigs = clinvar_clinsig.split(',') # Hit counts in relevant reference somatic mutation databases - tcga_pancancer_count = 
record.INFO.get(constants.VcfInfo.PCGR_TCGA_PANCANCER_COUNT.value, 0) + tcga_pancancer_count = get_record_value( + record, + constants.VcfInfo.PCGR_TCGA_PANCANCER_COUNT.value, + default=0 + ) + hmf_present = get_record_value(record, constants.VcfInfo.HMF_HOTSPOT.value) + pcgr_hotspot_present = get_record_value(record, constants.VcfInfo.PCGR_MUTATION_HOTSPOT.value) + if ( - record.INFO.get(constants.VcfInfo.HMF_HOTSPOT.value) is not None or - record.INFO.get(constants.VcfInfo.PCGR_MUTATION_HOTSPOT.value) is not None or + hmf_present is not None or + pcgr_hotspot_present is not None or any(e in clinvar_clinsigs for e in constants.CLINVAR_CLINSIGS_RESCUE) or tcga_pancancer_count >= constants.MIN_TCGA_PANCANCER_COUNT_RESCUE ): @@ -227,3 +251,22 @@ def set_filter_data(record, tumor_index): filters_existing = [e for e in record.FILTERS if e != 'PASS'] assert all(e not in filters_existing for e in filters_value) record.FILTER = ';'.join([*filters_existing, *filters_value]) + + +def get_record_value(record, key, *, default=None): + ''' + Return a INFO value, replacing '.', '' and missing entries by default value. + + This prevents placeholder values emitted by pcgr from triggering rescues + or filters. For example, an INFO placeholder '.' for + PCGR_TCGA_PANCANCER_COUNT will resolve to the integer 0 when called with + `get_record_value(record, 'PCGR_TCGA_PANCANCER_COUNT', default=0)`. 
+ ''' + value = record.INFO.get(key) + if value is None: + return default + if value=='' or value=='.': + return default + if not isinstance(value, (int, float, str)): + logger.error(f'record ID: {record.ID} INFO value for {key}: {value} ({type(value)}, not int, float, str)') + return value \ No newline at end of file From 22b5c1291b7d7432a0c65c7ff5c75bdf07cd71ce Mon Sep 17 00:00:00 2001 From: Quentin Clayssen Date: Fri, 17 Oct 2025 16:02:38 +1100 Subject: [PATCH 16/22] fix indentation --- bolt/workflows/smlv_somatic/filter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bolt/workflows/smlv_somatic/filter.py b/bolt/workflows/smlv_somatic/filter.py index bb5e8f1..c820385 100644 --- a/bolt/workflows/smlv_somatic/filter.py +++ b/bolt/workflows/smlv_somatic/filter.py @@ -154,7 +154,7 @@ def set_filter_data(record, tumor_index): get_record_value( record, constants.VcfInfo.GNOMAD_AF.value, - default=0.0 ), + default=0.0 ), 3, ) @@ -214,7 +214,7 @@ def set_filter_data(record, tumor_index): if ( hmf_present is not None or - pcgr_hotspot_present is not None or + pcgr_hotspot_present or any(e in clinvar_clinsigs for e in constants.CLINVAR_CLINSIGS_RESCUE) or tcga_pancancer_count >= constants.MIN_TCGA_PANCANCER_COUNT_RESCUE ): From 529309c03d4838f8c5fe71c704dfee298a932933 Mon Sep 17 00:00:00 2001 From: Quentin Clayssen Date: Fri, 17 Oct 2025 16:36:13 +1100 Subject: [PATCH 17/22] read gz tsv from pcgr --- bolt/common/pcgr.py | 2 +- bolt/util.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index 54c87aa..a45154a 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -604,7 +604,7 @@ def merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_fp): pcgr_dir.mkdir(exist_ok=True) # Merge all TSV files into a single file in the pcgr directory - merged_tsv_fp = pcgr_dir / "nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv" + merged_tsv_fp = pcgr_dir / 
"nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv.gz" util.merge_tsv_files(pcgr_tsv_fp, merged_tsv_fp) # Step 5: Merge all VCF files into a single file in the pcgr directory diff --git a/bolt/util.py b/bolt/util.py index 7310b89..8065302 100644 --- a/bolt/util.py +++ b/bolt/util.py @@ -1,3 +1,4 @@ +import gzip import pathlib import subprocess import textwrap @@ -161,11 +162,12 @@ def split_vcf(input_vcf, output_dir): def merge_tsv_files(tsv_files, merged_tsv_fp): """ - Merges all TSV files into a single TSV. + Merge gzipped TSV files into a single gzipped TSV. """ - with open(merged_tsv_fp, 'w') as merged_tsv: + + with gzip.open(merged_tsv_fp, 'wt', encoding='utf-8') as merged_tsv: for i, tsv_file in enumerate(tsv_files): - with open(tsv_file, 'r') as infile: + with gzip.open(tsv_file, 'rt', encoding='utf-8') as infile: for line_number, line in enumerate(infile): # Skip header except for the first file if i > 0 and line_number == 0: From d3dd1820ba346e513e79f4427dade621baca0184 Mon Sep 17 00:00:00 2001 From: Quentin Clayssen Date: Mon, 20 Oct 2025 14:54:57 +1100 Subject: [PATCH 18/22] change PCGR_CLINVAR_CLASSIFICATION --- bolt/common/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/common/constants.py b/bolt/common/constants.py index 7cebe78..cfc6aa5 100644 --- a/bolt/common/constants.py +++ b/bolt/common/constants.py @@ -301,7 +301,7 @@ def namespace(self): VcfInfo.PCGR_CLINVAR_CLASSIFICATION: { 'Number': '.', 'Type': 'String', - 'Description': 'ClinVar - clinical significance - per phenotype submission', + 'Description': 'ClinVar - Overall clinical significance of variant on a five-tiered scale', }, VcfInfo.PCGR_COSMIC_COUNT: { 'Number': '1', From a5fc92311f4607d0d528151ca83394c150f74b68 Mon Sep 17 00:00:00 2001 From: Quentin Clayssen Date: Mon, 20 Oct 2025 15:00:46 +1100 Subject: [PATCH 19/22] Change pcgr header check --- bolt/common/pcgr.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git 
a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index a45154a..9dba668 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -348,6 +348,7 @@ def check_annotation_headers(info_field_map, vcf_fp):
     # Ensure header descriptions from source INFO annotations match those defined here for the
     # output file; force manual inspection where they do not match
     vcf_fh = cyvcf2.VCF(vcf_fp)
+    mismatches = list()

     for header_dst, header_src in info_field_map.items():
         # Skip header lines that do not have an equivalent entry in the PCGR/CPSR VCF
@@ -361,7 +362,15 @@
         # Remove leading and trailing quotes from source
         header_src_description_unquoted = header_src_entry['Description'].strip('"')
         if header_src_description_unquoted != header_dst_entry['Description']:
-            raise AssertionError(f"Mismatch for {header_src}:\nVCF: {header_src_description_unquoted}\nExpected: {header_dst_entry['Description']}")
+            mismatches.append(
+                f"Mismatch for {header_src}:\n"
+                f"VCF: {header_src_description_unquoted}\n"
+                f"Expected: {header_dst_entry['Description']}"
+            )
+
+    if mismatches:
+        mismatch_msg = "\n\n".join(mismatches)
+        raise AssertionError(f"Mismatched INFO annotations:\n{mismatch_msg}")

 def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map):
     # Gather all annotations from TSV
@@ -608,7 +617,7 @@ def merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_fp):
     util.merge_tsv_files(pcgr_tsv_fp, merged_tsv_fp)

     # Step 5: Merge all VCF files into a single file in the pcgr directory
-    merged_vcf_path = pcgr_dir / "nosampleset.pcgr.grch38.pass.vcf.gz"
+    merged_vcf_path = pcgr_dir / "nosampleset.pcgr.grch38.pass"
     merged_vcf = util.merge_vcf_files(pcgr_vcf_files, merged_vcf_path)

     return merged_vcf, merged_tsv_fp

From bf930e78188c403f530ed754f1f9b451b70e0691 Mon Sep 17 00:00:00 2001
From: Quentin Clayssen
Date: Fri, 24 Oct 2025 13:56:03 +1100
Subject: [PATCH 20/22] typo

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md index 90b7815..25ce706 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,4 +10,4 @@ - [6](https://github.com/umccr/bolt/pull/6) - Change oncoanalyser v2.0.0 uptade, with switch sv caller from GRIPSS to eSVee --[9](https://github.com/umccr/bolt/pull/9) Add hypermutation sample handling \ No newline at end of file +- [9](https://github.com/umccr/bolt/pull/9) Add hypermutation sample handling \ No newline at end of file From 8ea1edc8cd5c64afff30640471f515f279b2c90a Mon Sep 17 00:00:00 2001 From: Quentin Clayssen Date: Fri, 24 Oct 2025 13:57:30 +1100 Subject: [PATCH 21/22] typo --- bolt/common/pcgr.py | 2 +- bolt/workflows/other/cancer_report.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index 9dba668..95c0f47 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -649,7 +649,7 @@ def get_variant_filter_data(variant): # NOTE(SW): GIAB_CONF always overrides DIFFICULT tags if data['giab_conf'] and data['difficult']: - data['difficult']= False + data['difficult'] = False for impact in get_impacts(variant.INFO['PCGR_CSQ']): diff --git a/bolt/workflows/other/cancer_report.py b/bolt/workflows/other/cancer_report.py index 1902079..9ad3d93 100644 --- a/bolt/workflows/other/cancer_report.py +++ b/bolt/workflows/other/cancer_report.py @@ -69,7 +69,7 @@ def entry(ctx, **kwargs): ) # Set other required argument values - batch_name = f'{kwargs["subject_name"]}_{kwargs["tumor_name"]}' + batch_name = f'{kwargs['subject_name']}_{kwargs['tumor_name']}' output_table_dir = output_dir / 'cancer_report_tables' # Optional dragen hrd argument From 443c5be931e062b00eb56a74e7bb3221c87a4bdc Mon Sep 17 00:00:00 2001 From: Quentin Clayssen Date: Fri, 24 Oct 2025 13:57:56 +1100 Subject: [PATCH 22/22] add missing variable pcgr_output_dir --- bolt/workflows/smlv_somatic/report.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bolt/workflows/smlv_somatic/report.py 
b/bolt/workflows/smlv_somatic/report.py index ede90ca..65ce535 100644 --- a/bolt/workflows/smlv_somatic/report.py +++ b/bolt/workflows/smlv_somatic/report.py @@ -129,6 +129,7 @@ def entry(ctx, **kwargs): output_dir, ) + pcgr_output_dir = output_dir / 'pcgr' pcgr.run_somatic( pcgr_prep_fp, kwargs['pcgr_data_dir'],