From b59abbe80f6d7b8bda0a3ba65e1f19612f59a6dd Mon Sep 17 00:00:00 2001
From: Quentin Clayssen <37511834+qclayssen@users.noreply.github.com>
Date: Tue, 16 Sep 2025 09:38:56 +1000
Subject: [PATCH 01/22] Release v0.2.15 (#14)
* update doc
* bump gpgr version
---
CHANGELOG.md | 2 ++
README.md | 14 +++++++-------
pyproject.toml | 2 +-
3 files changed, 10 insertions(+), 8 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0f7c841..1d8ba01 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@
## dev
+- [14](https://github.com/umccr/bolt/pull/14) - gpgr version bump to 2.2.0
+
- [3](https://github.com/scwatts/bolt/pull/3) - Improve PCGR / CPSR argument handling
- [6](https://github.com/umccr/bolt/pull/6) - Change oncoanalyser v2.0.0 uptade, with switch sv caller from GRIPSS to eSVee
\ No newline at end of file
diff --git a/README.md b/README.md
index 323a77f..f1db850 100644
--- a/README.md
+++ b/README.md
@@ -54,12 +54,12 @@ the software environment and dependencies. Consequently, dependencies are split
| Name | Docker image URI | Commands |
| --- | --- | --- |
-| pcgr | ghcr.io/scwatts/bolt:0.2.14-pcgr | • `bolt smlv_germline report`
• `bolt smlv_somatic annotate`
• `bolt smlv_somatic report`
|
-| gpgr | ghcr.io/scwatts/bolt:0.2.14-gpgr | • `bolt other cancer_report` |
-| snpeff | ghcr.io/scwatts/bolt:0.2.14-snpeff | • `bolt sv_somatic annotate` |
-| circos | ghcr.io/scwatts/bolt:0.2.14-circos | • `bolt other purple_baf_plot` |
-| multiqc | ghcr.io/scwatts/bolt:0.2.14-multiqc | • `bolt other multiqc_report` |
-| base | ghcr.io/scwatts/bolt:0.2.14 | • `bolt smlv_germline prepare`
• `bolt smlv_somatic rescue`
• `bolt smlv_somatic filter`
• `bolt sv_somatic prioritise`
|
+| pcgr | ghcr.io/scwatts/bolt:0.2.15-pcgr | • `bolt smlv_germline report`
• `bolt smlv_somatic annotate`
• `bolt smlv_somatic report`
|
+| gpgr | ghcr.io/scwatts/bolt:0.2.15-gpgr | • `bolt other cancer_report` |
+| snpeff | ghcr.io/scwatts/bolt:0.2.15-snpeff | • `bolt sv_somatic annotate` |
+| circos | ghcr.io/scwatts/bolt:0.2.15-circos | • `bolt other purple_baf_plot` |
+| multiqc | ghcr.io/scwatts/bolt:0.2.15-multiqc | • `bolt other multiqc_report` |
+| base | ghcr.io/scwatts/bolt:0.2.15 | • `bolt smlv_germline prepare`
• `bolt smlv_somatic rescue`
• `bolt smlv_somatic filter`
• `bolt sv_somatic prioritise`
|
## Usage
@@ -67,7 +67,7 @@ Given the nature of software dependencies required, it is strongly recommended t
[Docker images](#docker-images):
```bash
-docker run -ti -v $(pwd):$(pwd) -w $(pwd) ghcr.io/scwatts/bolt:0.2.14 \
+docker run -ti -v $(pwd):$(pwd) -w $(pwd) ghcr.io/scwatts/bolt:0.2.15 \
bolt smlv_somatic filter \
--tumor_name tumor_sample \
--vcf_fp tumor_sample.vcf.gz \
diff --git a/pyproject.toml b/pyproject.toml
index ae8b9bb..149727c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ include = ["bolt*"]
[project]
name = "bolt"
-version = "0.2.14"
+version = "0.2.15"
authors = [
{name = "Stephen Watts", email = "stephen.watts@umccr.org"},
]
From f3696db8731e41ba191fe7190818351b6f5be15b Mon Sep 17 00:00:00 2001
From: Quentin Clayssen <37511834+qclayssen@users.noreply.github.com>
Date: Tue, 23 Sep 2025 17:00:22 +1000
Subject: [PATCH 02/22] fix version (#18)
---
.bumpversion.cfg | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 977b262..c5ffb18 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.2.14
+current_version = 0.2.15
commit = True
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>[a-z0-9+]+)
From a4bfad1307b6c7a8c2bf7506e586cfc15e69174c Mon Sep 17 00:00:00 2001
From: Quentin Clayssen
Date: Wed, 24 Sep 2025 14:45:11 +1000
Subject: [PATCH 03/22] Bump gpgr version to 2.2.0
---
docker/Dockerfile.gpgr | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docker/Dockerfile.gpgr b/docker/Dockerfile.gpgr
index b8504e9..f1835d3 100644
--- a/docker/Dockerfile.gpgr
+++ b/docker/Dockerfile.gpgr
@@ -18,7 +18,7 @@ RUN \
RUN \
conda install --prefix /env/ \
- 'r-gpgr ==2.1.3' \
+ 'r-gpgr ==2.2.0' \
'r-sigrap ==0.1.1' \
'bioconductor-bsgenome.hsapiens.ucsc.hg38 ==1.4.5' \
'bioconductor-txdb.hsapiens.ucsc.hg38.knowngene ==3.16.0' \
From 05c1fa298a85eccd04ba638bc5640ca4bc3a0e28 Mon Sep 17 00:00:00 2001
From: Quentin Clayssen <37511834+qclayssen@users.noreply.github.com>
Date: Wed, 1 Oct 2025 11:36:54 +1000
Subject: [PATCH 04/22] Release 0.2.16 (#19)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* Bump version: 0.2.15 → 0.2.16
* Make the DRAGEN HRD input optional for the cancer report (#17)
* make the hrd file optional
* fix arg position and assign default value
* fix version (#18)
* Bump gpgr version to 2.2.0
---
.bumpversion.cfg | 2 +-
README.md | 14 +++++++-------
bolt/workflows/other/cancer_report.py | 11 ++++++++---
pyproject.toml | 2 +-
4 files changed, 17 insertions(+), 12 deletions(-)
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index c5ffb18..37d2a42 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.2.15
+current_version = 0.2.16
commit = True
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>[a-z0-9+]+)
diff --git a/README.md b/README.md
index f1db850..5e5d8c6 100644
--- a/README.md
+++ b/README.md
@@ -54,12 +54,12 @@ the software environment and dependencies. Consequently, dependencies are split
| Name | Docker image URI | Commands |
| --- | --- | --- |
-| pcgr | ghcr.io/scwatts/bolt:0.2.15-pcgr | • `bolt smlv_germline report`
• `bolt smlv_somatic annotate`
• `bolt smlv_somatic report`
|
-| gpgr | ghcr.io/scwatts/bolt:0.2.15-gpgr | • `bolt other cancer_report` |
-| snpeff | ghcr.io/scwatts/bolt:0.2.15-snpeff | • `bolt sv_somatic annotate` |
-| circos | ghcr.io/scwatts/bolt:0.2.15-circos | • `bolt other purple_baf_plot` |
-| multiqc | ghcr.io/scwatts/bolt:0.2.15-multiqc | • `bolt other multiqc_report` |
-| base | ghcr.io/scwatts/bolt:0.2.15 | • `bolt smlv_germline prepare`
• `bolt smlv_somatic rescue`
• `bolt smlv_somatic filter`
• `bolt sv_somatic prioritise`
|
+| pcgr | ghcr.io/scwatts/bolt:0.2.16-pcgr | • `bolt smlv_germline report`
• `bolt smlv_somatic annotate`
• `bolt smlv_somatic report`
|
+| gpgr | ghcr.io/scwatts/bolt:0.2.16-gpgr | • `bolt other cancer_report` |
+| snpeff | ghcr.io/scwatts/bolt:0.2.16-snpeff | • `bolt sv_somatic annotate` |
+| circos | ghcr.io/scwatts/bolt:0.2.16-circos | • `bolt other purple_baf_plot` |
+| multiqc | ghcr.io/scwatts/bolt:0.2.16-multiqc | • `bolt other multiqc_report` |
+| base | ghcr.io/scwatts/bolt:0.2.16 | • `bolt smlv_germline prepare`
• `bolt smlv_somatic rescue`
• `bolt smlv_somatic filter`
• `bolt sv_somatic prioritise`
|
## Usage
@@ -67,7 +67,7 @@ Given the nature of software dependencies required, it is strongly recommended t
[Docker images](#docker-images):
```bash
-docker run -ti -v $(pwd):$(pwd) -w $(pwd) ghcr.io/scwatts/bolt:0.2.15 \
+docker run -ti -v $(pwd):$(pwd) -w $(pwd) ghcr.io/scwatts/bolt:0.2.16 \
bolt smlv_somatic filter \
--tumor_name tumor_sample \
--vcf_fp tumor_sample.vcf.gz \
diff --git a/bolt/workflows/other/cancer_report.py b/bolt/workflows/other/cancer_report.py
index 0799cd1..983f596 100644
--- a/bolt/workflows/other/cancer_report.py
+++ b/bolt/workflows/other/cancer_report.py
@@ -29,7 +29,7 @@
@click.option('--purple_dir', required=True, type=click.Path(exists=True))
@click.option('--virusbreakend_dir', required=True, type=click.Path(exists=True))
-@click.option('--dragen_hrd_fp', required=True, type=click.Path(exists=True))
+@click.option('--dragen_hrd_fp', required=False, type=click.Path(exists=True))
@click.option('--cancer_genes_fp', required=True, type=click.Path(exists=True))
@click.option('--oncokb_genes_fp', required=True, type=click.Path(exists=True))
@@ -62,6 +62,11 @@ def entry(ctx, **kwargs):
batch_name = f'{kwargs["subject_name"]}_{kwargs["tumor_name"]}'
output_table_dir = output_dir / 'cancer_report_tables'
+ # Optional dragen hrd argument
+ dragen_hrd_arg = ''
+ if kwargs['dragen_hrd_fp']:
+ dragen_hrd_arg = f"--dragen_hrd {kwargs['dragen_hrd_fp']}"
+
# Run gpgr canrep
command = fr'''
gpgr.R canrep \
@@ -88,7 +93,7 @@ def entry(ctx, **kwargs):
--virusbreakend_tsv {kwargs['virusbreakend_dir']}/{kwargs['tumor_name']}.virusbreakend.vcf.summary.tsv \
--virusbreakend_vcf {kwargs['virusbreakend_dir']}/{kwargs['tumor_name']}.virusbreakend.vcf \
\
- --dragen_hrd {kwargs['dragen_hrd_fp']} \
+ {dragen_hrd_arg} \
--bcftools_stats {kwargs['smlv_somatic_bcftools_stats_fp']} \
\
--key_genes {kwargs['cancer_genes_fp']} \
@@ -96,7 +101,7 @@ def entry(ctx, **kwargs):
\
--img_dir {output_image_dir}/ \
--result_outdir {output_table_dir}/ \
- --out_file {output_dir}/{kwargs["tumor_name"]}.cancer_report.html
+ --out_file {output_dir}/{kwargs['tumor_name']}.cancer_report.html
'''
util.execute_command(command)
diff --git a/pyproject.toml b/pyproject.toml
index 149727c..e050686 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ include = ["bolt*"]
[project]
name = "bolt"
-version = "0.2.15"
+version = "0.2.16"
authors = [
{name = "Stephen Watts", email = "stephen.watts@umccr.org"},
]
From 6dd0b0997227b02b0b4f4a64c3c189891bcc8df6 Mon Sep 17 00:00:00 2001
From: Quentin Clayssen <37511834+qclayssen@users.noreply.github.com>
Date: Wed, 1 Oct 2025 12:18:46 +1000
Subject: [PATCH 05/22] Release 0.2.17 (#20)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* Bump version: 0.2.16 → 0.2.17
* bump gpgr to v2.2.1 in docker files (#21)
---
.bumpversion.cfg | 2 +-
CHANGELOG.md | 2 ++
README.md | 14 +++++++-------
docker/Dockerfile.gpgr | 2 +-
pyproject.toml | 2 +-
5 files changed, 12 insertions(+), 10 deletions(-)
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
index 37d2a42..fe18769 100644
--- a/.bumpversion.cfg
+++ b/.bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 0.2.16
+current_version = 0.2.17
commit = True
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>[a-z0-9+]+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1d8ba01..88a28d9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,8 @@
## dev
+- [17](https://github.com/umccr/bolt/pull/17) - change dragen HRD file optional
+
- [14](https://github.com/umccr/bolt/pull/14) - gpgr version bump to 2.2.0
- [3](https://github.com/scwatts/bolt/pull/3) - Improve PCGR / CPSR argument handling
diff --git a/README.md b/README.md
index 5e5d8c6..504b052 100644
--- a/README.md
+++ b/README.md
@@ -54,12 +54,12 @@ the software environment and dependencies. Consequently, dependencies are split
| Name | Docker image URI | Commands |
| --- | --- | --- |
-| pcgr | ghcr.io/scwatts/bolt:0.2.16-pcgr | • `bolt smlv_germline report`
• `bolt smlv_somatic annotate`
• `bolt smlv_somatic report`
|
-| gpgr | ghcr.io/scwatts/bolt:0.2.16-gpgr | • `bolt other cancer_report` |
-| snpeff | ghcr.io/scwatts/bolt:0.2.16-snpeff | • `bolt sv_somatic annotate` |
-| circos | ghcr.io/scwatts/bolt:0.2.16-circos | • `bolt other purple_baf_plot` |
-| multiqc | ghcr.io/scwatts/bolt:0.2.16-multiqc | • `bolt other multiqc_report` |
-| base | ghcr.io/scwatts/bolt:0.2.16 | • `bolt smlv_germline prepare`
• `bolt smlv_somatic rescue`
• `bolt smlv_somatic filter`
• `bolt sv_somatic prioritise`
|
+| pcgr | ghcr.io/scwatts/bolt:0.2.17-pcgr | • `bolt smlv_germline report`
• `bolt smlv_somatic annotate`
• `bolt smlv_somatic report`
|
+| gpgr | ghcr.io/scwatts/bolt:0.2.17-gpgr | • `bolt other cancer_report` |
+| snpeff | ghcr.io/scwatts/bolt:0.2.17-snpeff | • `bolt sv_somatic annotate` |
+| circos | ghcr.io/scwatts/bolt:0.2.17-circos | • `bolt other purple_baf_plot` |
+| multiqc | ghcr.io/scwatts/bolt:0.2.17-multiqc | • `bolt other multiqc_report` |
+| base | ghcr.io/scwatts/bolt:0.2.17 | • `bolt smlv_germline prepare`
• `bolt smlv_somatic rescue`
• `bolt smlv_somatic filter`
• `bolt sv_somatic prioritise`
|
## Usage
@@ -67,7 +67,7 @@ Given the nature of software dependencies required, it is strongly recommended t
[Docker images](#docker-images):
```bash
-docker run -ti -v $(pwd):$(pwd) -w $(pwd) ghcr.io/scwatts/bolt:0.2.16 \
+docker run -ti -v $(pwd):$(pwd) -w $(pwd) ghcr.io/scwatts/bolt:0.2.17 \
bolt smlv_somatic filter \
--tumor_name tumor_sample \
--vcf_fp tumor_sample.vcf.gz \
diff --git a/docker/Dockerfile.gpgr b/docker/Dockerfile.gpgr
index f1835d3..8b3a157 100644
--- a/docker/Dockerfile.gpgr
+++ b/docker/Dockerfile.gpgr
@@ -18,7 +18,7 @@ RUN \
RUN \
conda install --prefix /env/ \
- 'r-gpgr ==2.2.0' \
+ 'r-gpgr ==2.2.1' \
'r-sigrap ==0.1.1' \
'bioconductor-bsgenome.hsapiens.ucsc.hg38 ==1.4.5' \
'bioconductor-txdb.hsapiens.ucsc.hg38.knowngene ==3.16.0' \
diff --git a/pyproject.toml b/pyproject.toml
index e050686..e7325a3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ include = ["bolt*"]
[project]
name = "bolt"
-version = "0.2.16"
+version = "0.2.17"
authors = [
{name = "Stephen Watts", email = "stephen.watts@umccr.org"},
]
From d7ca46b2ddb8aef1b1e9568d034b8fb66e0634d7 Mon Sep 17 00:00:00 2001
From: Quentin Clayssen <37511834+qclayssen@users.noreply.github.com>
Date: Wed, 27 Aug 2025 14:40:51 +1000
Subject: [PATCH 06/22] Add option for sash sigrap individual process input to
cancer report (#11)
* add params input for sigrap tools
---
bolt/workflows/other/cancer_report.py | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/bolt/workflows/other/cancer_report.py b/bolt/workflows/other/cancer_report.py
index 983f596..60b4da3 100644
--- a/bolt/workflows/other/cancer_report.py
+++ b/bolt/workflows/other/cancer_report.py
@@ -28,6 +28,10 @@
@click.option('--purple_dir', required=True, type=click.Path(exists=True))
@click.option('--virusbreakend_dir', required=True, type=click.Path(exists=True))
+@click.option('--mutpat_dir', required=True, type=click.Path(exists=True))
+
+@click.option('--hrdetect_file', required=True, type=click.Path(exists=True))
+@click.option('--chord_file', required=True, type=click.Path(exists=True))
@click.option('--dragen_hrd_fp', required=False, type=click.Path(exists=True))
@@ -99,6 +103,10 @@ def entry(ctx, **kwargs):
--key_genes {kwargs['cancer_genes_fp']} \
--oncokb_genes {kwargs['oncokb_genes_fp']} \
\
+ --mutpat_dir {kwargs['mutpat_dir']} \
+ --hrdetect_file {kwargs['hrdetect_file']} \
+ --chord_file {kwargs['chord_file']} \
+ \
--img_dir {output_image_dir}/ \
--result_outdir {output_table_dir}/ \
--out_file {output_dir}/{kwargs['tumor_name']}.cancer_report.html
From f33c48c7a96ebe4e2edb6cc0f00ea9fe01bdb7c9 Mon Sep 17 00:00:00 2001
From: Quentin Clayssen <37511834+qclayssen@users.noreply.github.com>
Date: Fri, 29 Aug 2025 15:47:40 +1000
Subject: [PATCH 07/22] Hypermutation Handling in PCGR (#9)
---
.gitignore | 2 +
CHANGELOG.md | 4 +-
bolt/common/constants.py | 52 ++++-
bolt/common/pcgr.py | 234 ++++++++++++++++++++--
bolt/logging_config.py | 34 ++++
bolt/util.py | 256 +++++++++++++++++++++---
bolt/workflows/other/cancer_report.py | 6 +
bolt/workflows/smlv_germline/report.py | 4 +
bolt/workflows/smlv_somatic/annotate.py | 59 +++---
bolt/workflows/smlv_somatic/filter.py | 3 +
bolt/workflows/smlv_somatic/report.py | 85 +++++++-
bolt/workflows/smlv_somatic/rescue.py | 7 +
bolt/workflows/sv_somatic/annotate.py | 3 +
conda/env/bolt_env.yml | 1 +
pyproject.toml | 1 +
15 files changed, 677 insertions(+), 74 deletions(-)
create mode 100644 bolt/logging_config.py
diff --git a/.gitignore b/.gitignore
index 59a0c10..e4ef844 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,5 @@ __pycache__/
build/
venv/
working/
+data/
+workspace/
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 88a28d9..90b7815 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,4 +8,6 @@
- [3](https://github.com/scwatts/bolt/pull/3) - Improve PCGR / CPSR argument handling
-- [6](https://github.com/umccr/bolt/pull/6) - Change oncoanalyser v2.0.0 uptade, with switch sv caller from GRIPSS to eSVee
\ No newline at end of file
+- [6](https://github.com/umccr/bolt/pull/6) - Change oncoanalyser v2.0.0 uptade, with switch sv caller from GRIPSS to eSVee
+
+- [9](https://github.com/umccr/bolt/pull/9) - Add hypermutation sample handling
\ No newline at end of file
diff --git a/bolt/common/constants.py b/bolt/common/constants.py
index e38ed3d..29216bc 100644
--- a/bolt/common/constants.py
+++ b/bolt/common/constants.py
@@ -4,7 +4,7 @@
######################################
## Variation selection (annotation) ##
######################################
-MAX_SOMATIC_VARIANTS = 500_000
+MAX_SOMATIC_VARIANTS = 450_000
MAX_SOMATIC_VARIANTS_GNOMAD_FILTER = 0.01
@@ -41,6 +41,43 @@
}
+################################
+## Hypermutated report filter ##
+################################
+PCGR_TIERS_FILTERING = (
+ 'TIER_1',
+ 'TIER_2',
+ 'TIER_3',
+ 'TIER_4',
+ 'NONCODING',
+)
+
+VEP_IMPACTS_FILTER = (
+ 'intergenic',
+ 'intronic',
+ 'downstream',
+ 'upstream',
+ 'impacts_other',
+)
+
+GENOMIC_REGIONS_FILTERING = (
+ 'difficult',
+ 'none',
+ 'giab_conf',
+)
+
+HOTSPOT_FIELDS_FILTERING = (
+ 'SAGE_HOTSPOT',
+ 'hotspot',
+ 'PCGR_MUTATION_HOTSPOT',
+)
+
+RETAIN_FIELDS_FILTERING = (
+ 'PANEL',
+ *HOTSPOT_FIELDS_FILTERING,
+)
+
+
##################################################
## VCF FILTER tags and FORMAT, INFO annotations ##
##################################################
@@ -61,6 +98,8 @@ class VcfFilter(enum.Enum):
ENCODE = 'ENCODE'
GNOMAD_COMMON = 'gnomAD_common'
+ PCGR_COUNT_LIMIT = 'PCGR_count_limit'
+
@property
def namespace(self):
return 'FILTER'
@@ -121,6 +160,8 @@ class VcfInfo(enum.Enum):
RESCUED_FILTERS_EXISTING = 'RESCUED_FILTERS_EXISTING'
RESCUED_FILTERS_PENDING = 'RESCUED_FILTERS_PENDING'
+ PANEL = 'PANEL'
+
@property
def namespace(self):
return 'INFO'
@@ -187,6 +228,9 @@ def namespace(self):
'Description': f'gnomAD AF >= {MAX_GNOMAD_AF}',
},
+ VcfFilter.PCGR_COUNT_LIMIT: {
+ 'Description': 'Manually filtered to meet PCGR 500,000 variant limit',
+ },
# INFO
VcfInfo.TUMOR_AF: {
@@ -350,6 +394,12 @@ def namespace(self):
'Description': 'Filters pending prior to variant rescue',
},
+ VcfInfo.PANEL: {
+ 'Number': '0',
+ 'Type': 'Flag',
+ 'Description': 'UMCCR somatic panel CDS (2,000 bp padding)',
+ },
+
# FORMAT
VcfFormat.SAGE_AD: {
diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index fce4c56..296fb48 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -1,9 +1,12 @@
+import collections
import csv
+import functools
+import itertools
import pathlib
import re
import shutil
import tempfile
-
+import logging
import cyvcf2
@@ -11,6 +14,8 @@
from .. import util
from ..common import constants
+# Use the existing logger configuration
+logger = logging.getLogger(__name__)
def prepare_vcf_somatic(input_fp, tumor_name, normal_name, output_dir):
@@ -110,13 +115,18 @@ def get_minimal_header(input_fh):
return '\n'.join([filetype_line, *chrom_lines, *format_lines, column_line])
-def run_somatic(input_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=None, pcgrr_conda=None, purity=None, ploidy=None, sample_id=None):
+def run_somatic(input_fp, pcgr_refdata_dir, pcgr_output_dir, chunk_nbr=None, threads=1, pcgr_conda=None, pcgrr_conda=None, purity=None, ploidy=None, sample_id=None):
# NOTE(SW): Nextflow FusionFS v2.2.8 does not support PCGR output to S3; instead write to a
# temporary directory outside of the FusionFS mounted directory then manually copy across
temp_dir = tempfile.TemporaryDirectory()
- pcgr_output_dir = output_dir / 'pcgr/'
+ temp_dir_path = pathlib.Path(temp_dir.name)
+ pcgr_output_dir = pcgr_output_dir / f"pcgr_{chunk_nbr}" if chunk_nbr is not None else pcgr_output_dir # Check if the output directory already exists
+ if pcgr_output_dir.exists():
+ logger.warning(f"Warning: Output directory '{pcgr_output_dir}' already exists and will be overwrited")
+ shutil.rmtree(pcgr_output_dir)
+
if not sample_id:
sample_id = 'nosampleset'
@@ -171,7 +181,7 @@ def run_somatic(input_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=No
command = fr'''
pcgr \
- {command_args_str}
+ {command_args_str}
'''
if pcgr_conda:
@@ -179,11 +189,24 @@ def run_somatic(input_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=No
command_formatting = '\n' + ' ' * 4
command = command_formatting + command_conda + command
- util.execute_command(command)
+ # Log file path
+ log_file_path = temp_dir_path / "run_somatic.log"
+
+ # Run the command and redirect output to the log file
+ util.execute_command(command, log_file_path=log_file_path)
shutil.copytree(temp_dir.name, pcgr_output_dir)
- return pcgr_output_dir
+ pcgr_tsv_fp = pathlib.Path(pcgr_output_dir) / 'nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv'
+ pcgr_vcf_fp = pathlib.Path(pcgr_output_dir) / 'nosampleset.pcgr_acmg.grch38.vcf.gz'
+
+ # Check if both files exist
+ if not pcgr_tsv_fp.exists():
+ raise FileNotFoundError(f"Expected file {pcgr_tsv_fp} not found.")
+ if not pcgr_vcf_fp.exists():
+ raise FileNotFoundError(f"Expected file {pcgr_vcf_fp} not found.")
+
+ return pcgr_tsv_fp, pcgr_vcf_fp
def run_germline(input_fp, panel_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=None, pcgrr_conda=None, sample_id=None):
@@ -240,7 +263,7 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, output_dir, threads=1, pc
return cpsr_output_dir
-def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, output_dir):
+def transfer_annotations_somatic(input_fp, tumor_name, pcgr_vcf_fp, pcgr_tsv_fp, output_dir):
# Set destination INFO field names and source TSV fields
info_field_map = {
constants.VcfInfo.PCGR_MUTATION_HOTSPOT: 'MUTATION_HOTSPOT',
@@ -249,9 +272,6 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou
constants.VcfInfo.PCGR_CSQ: 'CSQ',
}
- pcgr_tsv_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv'
- pcgr_vcf_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr_acmg.grch38.vcf.gz'
-
# Enforce matching defined and source INFO annotations
check_annotation_headers(info_field_map, pcgr_vcf_fp)
@@ -277,10 +297,6 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou
# Do not process chrM since *snvs_indels.tiers.tsv does not include these annotations
if record.CHROM == 'chrM':
continue
- # Immediately print out variants that were not annotated
- if filter_name in record.FILTERS:
- output_fh.write_record(record)
- continue
# Annotate and write
record_ann = annotate_record(record, pcgr_data)
output_fh.write_record(record_ann)
@@ -481,3 +497,193 @@ def annotate_record(record, annotations, *, allow_missing=False):
record.INFO[info_enum.value] = v
return record
+
+def split_vcf(input_vcf, output_dir):
+ """
+ Splits a VCF file into multiple chunks, each containing up to max_variants variants.
+ Each chunk includes the VCF header.
+ Ensures no overlapping positions between chunks.
+ """
+ output_dir = pathlib.Path(output_dir / "vcf_chunks")
+ output_dir.mkdir(parents=True, exist_ok=True)
+ chunk_files = []
+ chunk_number = 1
+ variant_count = 0
+ base_filename = pathlib.Path(input_vcf).stem
+ chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
+ base_filename = input_vcf.stem
+ chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
+ chunk_files.append(chunk_filename)
+ # Open the input VCF using cyvcf2
+ vcf_in = cyvcf2.VCF(input_vcf)
+ # Create a new VCF file for the first chunk
+ vcf_out = cyvcf2.Writer(str(chunk_filename), vcf_in)
+ last_position = None
+ for record in vcf_in:
+ current_position = record.POS
+ # Check if we need to start a new chunk
+ if variant_count >= constants.MAX_SOMATIC_VARIANTS and (last_position is None or current_position != last_position):
+ # Close the current chunk file and start a new one
+ vcf_out.close()
+ chunk_number += 1
+ chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
+ chunk_files.append(chunk_filename)
+ vcf_out = cyvcf2.Writer(str(chunk_filename), vcf_in)
+ variant_count = 0
+ # Write the record to the current chunk
+ vcf_out.write_record(record)
+ variant_count += 1
+ last_position = current_position
+ # Close the last chunk file
+ vcf_out.close()
+ vcf_in.close()
+ logger.info(f"VCF file split into {len(chunk_files)} chunks.")
+ return chunk_files
+
+def run_somatic_chunck(vcf_chunks, pcgr_data_dir, output_dir, pcgr_output_dir, max_threads, pcgr_conda, pcgrr_conda):
+ pcgr_tsv_files = []
+ pcgr_vcf_files = []
+ num_chunks = len(vcf_chunks)
+ # Ensure we don't use more workers than available threads, and each worker has at least 2 threads
+ max_workers = min(num_chunks, max_threads // 2)
+ threads_quot, threads_rem = divmod(max_threads, num_chunks)
+ threads_per_chunk = max(2, threads_quot)
+ # Limit the number of workers to the smaller of num_chunks or max_threads // 2
+ with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
+ futures = {}
+ for chunk_number, vcf_file in enumerate(vcf_chunks, start=1):
+ # Assign extra thread to the first 'threads_rem' chunks
+ additional_thread = 1 if chunk_number <= threads_rem else 0
+ total_threads = threads_per_chunk + additional_thread
+ futures[executor.submit(run_somatic, vcf_file, pcgr_data_dir, pcgr_output_dir, chunk_number, total_threads, pcgr_conda, pcgrr_conda)] = chunk_number
+ for future in concurrent.futures.as_completed(futures):
+ try:
+ pcgr_tsv_fp, pcgr_vcf_fp = future.result()
+ if pcgr_tsv_fp:
+ pcgr_tsv_files.append(pcgr_tsv_fp)
+ if pcgr_vcf_fp:
+ pcgr_vcf_files.append(pcgr_vcf_fp)
+ except Exception as e:
+ print(f"Exception occurred: {e}")
+ merged_vcf_fp, merged_tsv_fp = merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_files)
+ return merged_tsv_fp, merged_vcf_fp
+
+def merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_fp):
+ pcgr_dir = pathlib.Path(output_dir) / 'pcgr'
+ pcgr_dir.mkdir(exist_ok=True)
+
+ # Merge all TSV files into a single file in the pcgr directory
+ merged_tsv_fp = pcgr_dir / "nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv"
+ util.merge_tsv_files(pcgr_tsv_fp, merged_tsv_fp)
+
+ # Step 5: Merge all VCF files into a single file in the pcgr directory
+ merged_vcf_path = pcgr_dir / "nosampleset.pcgr_acmg.grch38"
+ merged_vcf = util.merge_vcf_files(pcgr_vcf_files, merged_vcf_path)
+
+ return merged_vcf, merged_tsv_fp
+
+
+def get_variant_filter_data(variant):
+ attribute_names = (
+ 'tier',
+ 'difficult',
+ 'giab_conf',
+ 'intergenic',
+ 'intronic',
+ 'downstream',
+ 'upstream',
+ 'impacts_other',
+ )
+
+ data = {e: None for e in attribute_names}
+
+
+ data['tier'] = variant.INFO['PCGR_TIER']
+
+
+ info_keys = [k for k, v in variant.INFO]
+
+ data['difficult'] = any(e.startswith('DIFFICULT') for e in info_keys)
+ data['giab_conf'] = 'GIAB_CONF' in info_keys
+
+
+ # NOTE(SW): GIAB_CONF always overrides DIFFICULT tags
+ if data['giab_conf'] and data['difficult']:
+ data['difficult']= False
+
+
+ for impact in get_impacts(variant.INFO['PCGR_CSQ']):
+ if impact == 'intergenic_variant':
+ data['intergenic'] = True
+ elif impact == 'intron_variant':
+ data['intronic'] = True
+ elif impact == 'downstream_gene_variant':
+ data['downstream'] = True
+ elif impact == 'upstream_gene_variant':
+ data['upstream'] = True
+ elif impact:
+ data['impacts_other'] = True
+ else:
+ assert False
+
+ return data
+
+
+def get_impacts(csq_str_full):
+ impacts = set()
+ for csq_str in csq_str_full.split(','):
+ csq_tokens = csq_str.split('|')
+ impact_str = csq_tokens[1]
+ impacts.update(impact_str.split('&'))
+ return impacts
+
+
+def determine_filter(data):
+
+ for impact, region in get_ordering(tiers=False):
+
+ # NOTE(SW): this is less efficient than nested loops since the outer block is reevaluated
+ # within what would be the inner loop each cycle; taking this route for cleaner code
+
+ impacts_higher = get_impacts_higher(impact)
+ impact_filter = bool(data[impact]) and not any(bool(data[e]) for e in impacts_higher)
+
+ region_filter = False
+ if region == 'none':
+ region_filter = not (data['difficult'] or data['giab_conf'])
+ else:
+ region_filter = data[region]
+
+ if impact_filter and region_filter:
+ return (impact, region)
+
+ return False
+
+
+def get_variant_repr(variant):
+ return (variant.CHROM, variant.POS, variant.REF, tuple(variant.ALT))
+
+
+@functools.cache
+def get_ordering(tiers=True, impacts=True, regions=True):
+ categories = [
+ constants.PCGR_TIERS_FILTERING if tiers else None,
+ constants.VEP_IMPACTS_FILTER if impacts else None,
+ constants.GENOMIC_REGIONS_FILTERING if regions else None,
+ ]
+
+ # NOTE(SW): I'm not aware of any noncoding impacts for TIER_[1-4] other than TERT but keeping
+ # in to be overly cautious
+ ordering_iter = itertools.product(*(c for c in categories if c))
+
+ return tuple(ordering_iter)
+
+
+@functools.cache
+def get_impacts_higher(impact):
+ impact_index = constants.VEP_IMPACTS_FILTER.index(impact)
+ if impact_index + 1 < len(constants.VEP_IMPACTS_FILTER):
+ impacts_higher = constants.VEP_IMPACTS_FILTER[impact_index+1:len(constants.VEP_IMPACTS_FILTER)]
+ else:
+ impacts_higher = list()
+ return impacts_higher
diff --git a/bolt/logging_config.py b/bolt/logging_config.py
new file mode 100644
index 0000000..0ddda2b
--- /dev/null
+++ b/bolt/logging_config.py
@@ -0,0 +1,34 @@
+import logging
+import sys
+import pathlib
+from datetime import datetime
+
+class IgnoreTinfoFilter(logging.Filter):
+ def filter(self, record):
+ # Exclude messages that contain the unwanted text.
+ if "no version information available" in record.getMessage():
+ return False
+ return True
+
+def setup_logging(output_dir, script_name):
+ # Create a timestamp for the log file
+ timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+ log_filename = f"{script_name}_{timestamp}.log"
+ log_file = pathlib.Path(output_dir) / log_filename
+
+ # Create individual handlers.
+ console_handler = logging.StreamHandler(sys.stdout)
+ file_handler = logging.FileHandler(log_file)
+
+ # Instantiate and attach the filter to both handlers.
+ tinfo_filter = IgnoreTinfoFilter()
+ console_handler.addFilter(tinfo_filter)
+ file_handler.addFilter(tinfo_filter)
+
+ logging.basicConfig(
+ level=logging.DEBUG,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+ handlers=[file_handler, console_handler]
+ )
+ logger = logging.getLogger(__name__)
+ logger.info("Logging setup complete")
\ No newline at end of file
diff --git a/bolt/util.py b/bolt/util.py
index b7333d9..b2ce7df 100644
--- a/bolt/util.py
+++ b/bolt/util.py
@@ -1,11 +1,14 @@
+import os
import pathlib
import subprocess
-import sys
import textwrap
-
+import logging
+from types import SimpleNamespace
from .common import constants
+# Set up logging
+logger = logging.getLogger(__name__)
# TODO(SW): create note that number this assumes location of `//`
def get_project_root():
@@ -15,46 +18,58 @@ def get_project_root():
return project_root
-def execute_command(command):
- command_prepared = command_prepare(command)
+def execute_command(command, log_file_path=None):
+ logger.info("Executing command: %s", command.strip())
- print(command_prepared)
+ # Open the log file if provided
+ log_file = log_file_path.open('a', encoding='utf-8') if log_file_path else None
- process = subprocess.run(
- command_prepared,
+ # Launch process with combined stdout and stderr streams, and line buffering enabled.
+ process = subprocess.Popen(
+ command,
shell=True,
executable='/bin/bash',
- capture_output=True,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True,
encoding='utf-8',
+ bufsize=1 # line buffered
)
- if process.returncode != 0:
- print(process)
- print(process.stderr)
- sys.exit(1)
+ output_lines = []
+ # Iterate over each line as it becomes available
+ with process.stdout:
+ for line in iter(process.stdout.readline, ''):
+ if line:
+ logger.info(line.strip())
+ output_lines.append(line)
+ if log_file:
+ log_file.write(line)
+ log_file.flush() # flush immediately for real-time logging
+ process.wait() # wait for the process to complete
+
+ if log_file:
+ log_file.close()
- return process
+ result = SimpleNamespace(
+ stdout=''.join(output_lines),
+ returncode=process.returncode,
+ pid=process.pid,
+ command=command
+ )
+ return result
def command_prepare(command):
return f'set -o pipefail; {textwrap.dedent(command)}'
-
-#def count_vcf_records(fp, exclude_args=None):
-# args = list()
-# if exclude_args:
-# args.append(f'-e \'{exclude_args}\'')
-#
-# args_str = ' '.join(args)
-# command = f'bcftools view -H {args_str} {fp} | wc -l'
-#
-# result = execute_command(command)
-# return int(result.stdout)
-
-
def count_vcf_records(fp):
- result = execute_command(f'bcftools view -H {fp} | wc -l')
- return int(result.stdout)
+ result = subprocess.run(f'bcftools view -H {fp} | wc -l',
+ shell=True,
+ executable="/bin/bash",
+ capture_output=True,
+ text=True )
+ return int(result.stdout.strip())
def add_vcf_header_entry(fh, anno_enum):
@@ -94,8 +109,185 @@ def get_qualified_vcf_annotation(anno_enum):
assert anno_enum in constants.VcfInfo or anno_enum in constants.VcfFormat
return f'{anno_enum.namespace}/{anno_enum.value}'
+def split_vcf(input_vcf, output_dir):
+ """
+    Splits a VCF file into chunks of at most constants.MAX_SOMATIC_VARIANTS variants each.
+ Each chunk includes the VCF header.
+ Ensures no overlapping positions between chunks.
+ """
+ output_dir = pathlib.Path(output_dir / "vcf_chunks")
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ chunk_files = []
+ chunk_number = 1
+ variant_count = 0
+ base_filename = pathlib.Path(input_vcf).stem
+ chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
+ base_filename = input_vcf.stem
+ chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
+ chunk_files.append(chunk_filename)
+
+ # Open the input VCF using cyvcf2
+ vcf_in = cyvcf2.VCF(input_vcf)
+ # Create a new VCF file for the first chunk
+ vcf_out = cyvcf2.Writer(str(chunk_filename), vcf_in)
+
+ last_position = None
+
+ for record in vcf_in:
+ current_position = record.POS
+ # Check if we need to start a new chunk
+ if variant_count >= constants.MAX_SOMATIC_VARIANTS and (last_position is None or current_position != last_position):
+ # Close the current chunk file and start a new one
+ vcf_out.close()
+ chunk_number += 1
+ chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
+ chunk_files.append(chunk_filename)
+ vcf_out = cyvcf2.Writer(str(chunk_filename), vcf_in)
+ variant_count = 0
+
+ # Write the record to the current chunk
+ vcf_out.write_record(record)
+ variant_count += 1
+ last_position = current_position
+
+ # Close the last chunk file
+ vcf_out.close()
+ vcf_in.close()
+
+ logger.info(f"VCF file split into {len(chunk_files)} chunks.")
+
+ return chunk_files
+
+def merge_tsv_files(tsv_files, merged_tsv_fp):
+ """
+ Merges all TSV files into a single TSV.
+ """
+ with open(merged_tsv_fp, 'w') as merged_tsv:
+ for i, tsv_file in enumerate(tsv_files):
+ with open(tsv_file, 'r') as infile:
+ for line_number, line in enumerate(infile):
+ # Skip header except for the first file
+ if i > 0 and line_number == 0:
+ continue
+ merged_tsv.write(line)
+ logger.info(f"Merged TSV written to: {merged_tsv_fp}")
+
+
+def merge_vcf_files(vcf_files, merged_vcf_fp):
+ """
+ Merges multiple VCF files into a single sorted VCF file using bcftools.
+
+ Parameters:
+ - vcf_files: List of paths to VCF files to be merged.
+ - merged_vcf_fp: Path to the output merged VCF file (without extension).
+
+ Returns:
+ - Path to the sorted merged VCF file.
+ """
+ merged_vcf_fp = pathlib.Path(merged_vcf_fp)
+ merged_unsorted_vcf = merged_vcf_fp.with_suffix('.unsorted.vcf.gz')
+ merged_vcf = merged_vcf_fp.with_suffix('.vcf.gz')
+
+ # Prepare the bcftools merge command arguments
+ command_args = [
+ 'bcftools merge',
+ '-m all',
+ '-Oz',
+ f'-o {merged_unsorted_vcf}',
+ ] + [str(vcf_file) for vcf_file in vcf_files]
+
+ # Format the command for readability
+ delimiter_padding = ' ' * 10
+ delimiter = f' \\\n{delimiter_padding}'
+ command_args_str = delimiter.join(command_args)
+
+ command = f'''
+ {command_args_str}
+ '''
+
+ # Run the bcftools merge command
+ logger.info("Running bcftools merge...")
+ execute_command(command)
+ logger.info(f"Merged VCF written to: {merged_unsorted_vcf}")
+
+ # Sort the merged VCF file
+ sort_command_args = [
+ 'bcftools sort',
+ '-Oz',
+ f'-o {merged_vcf}',
+ f'{merged_unsorted_vcf}'
+ ]
+ sort_command_args_str = delimiter.join(sort_command_args)
+ sort_command = f'''
+ {sort_command_args_str}
+ '''
+
+ logger.info("Sorting merged VCF file...")
+ execute_command(sort_command)
+ logger.info(f"Sorted merged VCF written to: {merged_vcf}")
+
+ # Index the sorted merged VCF file
+ index_command_args = [
+ 'bcftools index',
+ '-t',
+ f'{merged_vcf}'
+ ]
+ index_command_args_str = delimiter.join(index_command_args)
+ index_command = f'''
+ {index_command_args_str}
+ '''
+
+ logger.info("Indexing sorted merged VCF file...")
+ execute_command(index_command)
+ logger.info(f"Indexed merged VCF file: {merged_vcf}.tbi")
+
+ # Optionally, remove the unsorted merged VCF file
+ if merged_unsorted_vcf.exists():
+ merged_unsorted_vcf.unlink()
+
+ return merged_vcf
+
+def merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_fp):
+    # Merge the per-chunk PCGR outputs back into single TSV and VCF files
+ pcgr_dir = output_dir / 'pcgr/'
+ pcgr_dir.mkdir(exist_ok=True)
+    # Merge all TSV files into a single file in the pcgr directory
+ merged_tsv_fp = os.path.join(pcgr_dir, "nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv")
+ merge_tsv_files(pcgr_tsv_fp, merged_tsv_fp)
+    # Merge all VCF files into a single file in the pcgr directory
+ merged_vcf_path = os.path.join(pcgr_dir, "nosampleset.pcgr_acmg.grch38")
+ merged_vcf = merge_vcf_files(pcgr_vcf_files, merged_vcf_path)
+ return merged_vcf, merged_tsv_fp
+
+def run_somatic_chunck(vcf_chunks, pcgr_data_dir, output_dir, pcgr_output_dir, max_threads, pcgr_conda, pcgrr_conda):
+ pcgr_tsv_files = []
+ pcgr_vcf_files = []
+
+ num_chunks = len(vcf_chunks)
+ # Ensure we don't use more workers than available threads, and each worker has at least 2 threads
+ max_workers = min(num_chunks, max_threads // 2)
+ threads_quot, threads_rem = divmod(max_threads, num_chunks)
+ threads_per_chunk = max(2, threads_quot)
+
+ # Limit the number of workers to the smaller of num_chunks or max_threads // 2
+ with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
+ futures = {}
+ for chunk_number, vcf_file in enumerate(vcf_chunks, start=1):
+ # Assign extra thread to the first 'threads_rem' chunks
+ additional_thread = 1 if chunk_number <= threads_rem else 0
+ total_threads = threads_per_chunk + additional_thread
+ futures[executor.submit(pcgr.run_somatic, vcf_file, pcgr_data_dir, pcgr_output_dir, chunk_number, total_threads, pcgr_conda, pcgrr_conda)] = chunk_number
+
+ for future in concurrent.futures.as_completed(futures):
+ try:
+ pcgr_tsv_fp, pcgr_vcf_fp = future.result()
+ if pcgr_tsv_fp:
+ pcgr_tsv_files.append(pcgr_tsv_fp)
+ if pcgr_vcf_fp:
+ pcgr_vcf_files.append(pcgr_vcf_fp)
+ except Exception as e:
+ print(f"Exception occurred: {e}")
-#def add_vcf_filter(record, filter_enum):
-# existing_filters = [e for e in record.FILTERS if e != 'PASS']
-# assert filter_enum.value not in existing_filters
-# return ';'.join([*existing_filters, filter_enum.value])
+ merged_vcf_fp, merged_tsv_fp = merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_files)
+ return merged_tsv_fp, merged_vcf_fp
\ No newline at end of file
diff --git a/bolt/workflows/other/cancer_report.py b/bolt/workflows/other/cancer_report.py
index 60b4da3..1902079 100644
--- a/bolt/workflows/other/cancer_report.py
+++ b/bolt/workflows/other/cancer_report.py
@@ -5,6 +5,9 @@
from ... import util
+from ...logging_config import setup_logging
+
+
@click.command(name='cancer_report')
@@ -48,6 +51,9 @@ def entry(ctx, **kwargs):
output_dir = pathlib.Path(kwargs['output_dir'])
output_dir.mkdir(mode=0o755, parents=True, exist_ok=True)
+ script_name = pathlib.Path(__file__).stem
+ setup_logging(output_dir, script_name)
+
# Normalise SAGE variants and remove duplicates that arise for MutationalPattern compatibility
decomposed_snv_vcf = normalise_and_dedup_sage_variants(
kwargs['smlv_somatic_vcf_fp'],
diff --git a/bolt/workflows/smlv_germline/report.py b/bolt/workflows/smlv_germline/report.py
index 0742c19..0a10711 100644
--- a/bolt/workflows/smlv_germline/report.py
+++ b/bolt/workflows/smlv_germline/report.py
@@ -1,5 +1,6 @@
import pathlib
import yaml
+from ...logging_config import setup_logging
import click
@@ -35,6 +36,9 @@ def entry(ctx, **kwargs):
output_dir = pathlib.Path(kwargs['output_dir'])
output_dir.mkdir(mode=0o755, parents=True, exist_ok=True)
+ # Set up logging
+ script_name = pathlib.Path(__file__).stem
+ setup_logging(output_dir, script_name)
# BCFtools stats
run_bcftool_stats(kwargs['vcf_unfiltered_fp'], kwargs['normal_name'], output_dir)
diff --git a/bolt/workflows/smlv_somatic/annotate.py b/bolt/workflows/smlv_somatic/annotate.py
index f163d5f..fe0007a 100644
--- a/bolt/workflows/smlv_somatic/annotate.py
+++ b/bolt/workflows/smlv_somatic/annotate.py
@@ -1,14 +1,13 @@
+import logging
import pathlib
-
-
import click
import cyvcf2
-
from ... import util
from ...common import constants
from ...common import pcgr
-
+from ...logging_config import setup_logging
+logger = logging.getLogger(__name__)
@click.command(name='annotate')
@click.pass_context
@@ -44,6 +43,9 @@ def entry(ctx, **kwargs):
output_dir = pathlib.Path(kwargs['output_dir'])
output_dir.mkdir(mode=0o755, parents=True, exist_ok=True)
+ script_name = pathlib.Path(__file__).stem
+ setup_logging(output_dir, script_name)
+
# Set all FILTER="." to FILTER="PASS" as required by PURPLE
filter_pass_fp = set_filter_pass(kwargs['vcf_fp'], kwargs['tumor_name'], output_dir)
@@ -73,7 +75,6 @@ def entry(ctx, **kwargs):
)
# Annotate with cancer-related and functional information from a range of sources using PCGR
- # - Select variants to process - there is an upper limit for PCGR of around 500k
# - Set tumor and normal AF and DP in INFO for PCGR and remove all other annotations
# - Run PCGR on minimal VCF (pcgr_prep_fp)
# - Transfer selected PCGR annotations to unfiltered VCF (selected_fp)
@@ -84,30 +85,40 @@ def entry(ctx, **kwargs):
# - Hits in COSMIC [INFO/PCGR_COSMIC_COUNT]
# - Hits in TCGA [INFO/PCGR_TCGA_PANCANCER_COUNT]
# - Hits in PCAWG [INFO/PCGR_ICGC_PCAWG_COUNT]
- # Set selected data or full input
- selection_data = select_variants(
- pon_fp,
- kwargs['tumor_name'],
- kwargs['cancer_genes_fp'],
- output_dir,
- )
-
- if not (pcgr_prep_input_fp := selection_data.get('filtered')):
- pcgr_prep_input_fp = selection_data['selected']
# Prepare VCF for PCGR annotation
pcgr_prep_fp = pcgr.prepare_vcf_somatic(
- pcgr_prep_input_fp,
+ pon_fp,
kwargs['tumor_name'],
kwargs['normal_name'],
output_dir,
)
- # Run PCGR
- pcgr_dir = pcgr.run_somatic(
- pcgr_prep_fp,
+ pcgr_output_dir = output_dir / 'pcgr'
+ total_variants = util.count_vcf_records(pcgr_prep_fp)
+ print(f"Total number of variants in the input VCF: {total_variants}")
+
+ # Run PCGR in chunks if the total number of variants exceeds the maximum allowed for somatic variants
+ if total_variants > constants.MAX_SOMATIC_VARIANTS:
+ vcf_chunks = util.split_vcf(
+ pcgr_prep_fp,
+ output_dir
+ )
+ pcgr_tsv_fp, pcgr_vcf_fp = util.run_somatic_chunck(
+ vcf_chunks,
kwargs['pcgr_data_dir'],
output_dir,
+ pcgr_output_dir,
+ kwargs['threads'],
+ kwargs['pcgr_conda'],
+ kwargs['pcgrr_conda']
+ )
+ else:
+ pcgr_tsv_fp, pcgr_vcf_fp = pcgr.run_somatic(
+ pcgr_prep_fp,
+ kwargs['pcgr_data_dir'],
+ pcgr_output_dir,
+ chunk_nbr=None,
threads=kwargs['threads'],
pcgr_conda=kwargs['pcgr_conda'],
pcgrr_conda=kwargs['pcgrr_conda'],
@@ -115,13 +126,13 @@ def entry(ctx, **kwargs):
# Transfer PCGR annotations to full set of variants
pcgr.transfer_annotations_somatic(
- selection_data['selected'],
+ pon_fp,
kwargs['tumor_name'],
- selection_data.get('filter_name'),
- pcgr_dir,
+ pcgr_vcf_fp,
+ pcgr_tsv_fp,
output_dir,
)
-
+ logger.info("Annotation process completed")
def set_filter_pass(input_fp, tumor_name, output_dir):
output_fp = output_dir / f'{tumor_name}.set_filter_pass.vcf.gz'
@@ -136,7 +147,6 @@ def set_filter_pass(input_fp, tumor_name, output_dir):
return output_fp
-
def general_annotations(input_fp, tumor_name, threads, annotations_dir, output_dir):
toml_fp = pathlib.Path(annotations_dir) / 'vcfanno_annotations.toml'
@@ -169,7 +179,6 @@ def panel_of_normal_annotations(input_fp, tumor_name, threads, pon_dir, output_d
util.execute_command(command)
return output_fp
-
def select_variants(input_fp, tumor_name, cancer_genes_fp, output_dir):
# Exclude variants until we hopefully move the needle below the threshold
diff --git a/bolt/workflows/smlv_somatic/filter.py b/bolt/workflows/smlv_somatic/filter.py
index e46f285..104296d 100644
--- a/bolt/workflows/smlv_somatic/filter.py
+++ b/bolt/workflows/smlv_somatic/filter.py
@@ -26,6 +26,9 @@ def entry(ctx, **kwargs):
output_dir = pathlib.Path(kwargs['output_dir'])
output_dir.mkdir(mode=0o755, parents=True, exist_ok=True)
+ script_name = pathlib.Path(__file__).stem
+ setup_logging(output_dir, script_name)
+
# Open input VCF and set required header entries for output
in_fh = cyvcf2.VCF(kwargs['vcf_fp'])
header_filters = (
diff --git a/bolt/workflows/smlv_somatic/report.py b/bolt/workflows/smlv_somatic/report.py
index bdf908f..b2bead1 100644
--- a/bolt/workflows/smlv_somatic/report.py
+++ b/bolt/workflows/smlv_somatic/report.py
@@ -1,3 +1,4 @@
+import collections
import csv
import json
import pathlib
@@ -45,6 +46,9 @@ def entry(ctx, **kwargs):
output_dir = pathlib.Path(kwargs['output_dir'])
output_dir.mkdir(mode=0o755, parents=True, exist_ok=True)
+ script_name = pathlib.Path(__file__).stem
+ setup_logging(output_dir, script_name)
+
# BCFtools stats
bcftools_vcf_fp = bcftools_stats_prepare(kwargs['vcf_fp'], kwargs['tumor_name'], output_dir)
run_bcftools_stats(bcftools_vcf_fp, kwargs['tumor_name'], output_dir)
@@ -105,10 +109,20 @@ def entry(ctx, **kwargs):
fh.write('\n')
# PCGR report
+ if variant_counts_process['filter_pass'] <= constants.MAX_SOMATIC_VARIANTS:
+ pcgr_input_vcf_fp = kwargs['vcf_fp']
+ else:
+ pcgr_input_vcf_fp = select_pcgr_variants(
+ kwargs['vcf_fp'],
+ kwargs['cancer_genes_fp'],
+ kwargs['tumor_name'],
+ output_dir,
+ )
+
purple_data = parse_purple_purity_file(kwargs['purple_purity_fp'])
pcgr_prep_fp = pcgr.prepare_vcf_somatic(
- kwargs['vcf_fp'],
+ pcgr_input_vcf_fp,
kwargs['tumor_name'],
kwargs['normal_name'],
output_dir,
@@ -284,6 +298,75 @@ def count_variant_process(vcf_fp):
return counts
+def select_pcgr_variants(vcf_fp, cancer_genes_fp, tumor_name, output_dir):
+ # Annotate variants in UMCCR somatic gene panel
+ fp_annotated_out = output_dir / f'{tumor_name}.umccr_panel_variants_annotated.vcf.gz'
+ util.execute_command(fr'''
+ bcftools annotate \
+ --annotations <(awk 'BEGIN {{ OFS="\t" }} {{ print $1, $2-2000, $3+2000, "1" }}' {cancer_genes_fp}) \
+ --header-line '{util.get_vcf_header_line(constants.VcfInfo.PANEL)}' \
+ --columns CHROM,FROM,TO,{constants.VcfInfo.PANEL.value} \
+ --output {fp_annotated_out} \
+ {vcf_fp}
+ ''')
+
+ # Set filter category for each variant
+ variants_sorted = collections.defaultdict(list)
+ for variant_count, variant in enumerate(cyvcf2.VCF(fp_annotated_out), 1):
+ if any(variant.INFO.get(e) for e in constants.RETAIN_FIELDS_FILTERING):
+ continue
+
+ data = pcgr.get_variant_filter_data(variant)
+ variant_filter = pcgr.determine_filter(data)
+ assert variant_filter
+
+ filter_category = (data['tier'], *variant_filter)
+ variant_repr = pcgr.get_variant_repr(variant)
+ variants_sorted[filter_category].append(variant_repr)
+
+
+ # Determine the set of filter categories to come under the PCGR 500,000 variant threshold
+ filter_sum = 0
+ filter_categories = list()
+ for key in pcgr.get_ordering():
+
+ if (variant_count - filter_sum) <= constants.MAX_SOMATIC_VARIANTS:
+ break
+
+ filter_sum += len(variants_sorted.get(key, []))
+ filter_categories.append(key)
+
+ # Set FILTERS and write out records
+ filter_variants = set()
+ for key in filter_categories:
+ filter_variants.update(variants_sorted[key])
+
+ fh_in = cyvcf2.VCF(vcf_fp)
+ util.add_vcf_header_entry(fh_in, constants.VcfFilter.PCGR_COUNT_LIMIT)
+
+ # NOTE(SW): creating an additional VCF with all records for traceability
+ fp_out = output_dir / f'{tumor_name}.pcgr_hypermutated.pass.vcf.gz'
+ fp_set_out = output_dir / f'{tumor_name}.pcgr_hypermutated.filters_set.vcf.gz'
+
+ fh_out = cyvcf2.Writer(fp_out, fh_in)
+ fh_set_out = cyvcf2.Writer(fp_set_out, fh_in)
+
+ for variant in fh_in:
+ variant_repr = pcgr.get_variant_repr(variant)
+ if variant_repr not in filter_variants:
+ # Write only passing
+ fh_out.write_record(variant)
+ else:
+ variant.FILTER = constants.VcfFilter.PCGR_COUNT_LIMIT.value
+ # Write all variants including those with FILTER set
+ fh_set_out.write_record(variant)
+
+ fh_out.close()
+ fh_set_out.close()
+
+ return fp_out
+
+
def parse_purple_purity_file(fp):
with open(fp, 'r') as fh:
entries = list(csv.DictReader(fh, delimiter='\t'))
diff --git a/bolt/workflows/smlv_somatic/rescue.py b/bolt/workflows/smlv_somatic/rescue.py
index c92df94..bfd89b9 100644
--- a/bolt/workflows/smlv_somatic/rescue.py
+++ b/bolt/workflows/smlv_somatic/rescue.py
@@ -7,10 +7,14 @@
import click
import cyvcf2
+import logging
from ... import util
from ...common import constants
+from ...logging_config import setup_logging
+
+logger = logging.getLogger(__name__)
@click.command(name='rescue')
@@ -37,6 +41,9 @@ def entry(ctx, **kwargs):
output_dir = pathlib.Path(kwargs['output_dir'])
output_dir.mkdir(mode=0o755, parents=True, exist_ok=True)
+ script_name = pathlib.Path(__file__).stem
+ setup_logging(output_dir, script_name)
+
# Select PASS SAGE variants in hotspots and then split into existing and novel calls
sage_pass_vcf_fp = select_sage_pass_hotspot(
kwargs['sage_vcf_fp'],
diff --git a/bolt/workflows/sv_somatic/annotate.py b/bolt/workflows/sv_somatic/annotate.py
index 2988e53..8577922 100644
--- a/bolt/workflows/sv_somatic/annotate.py
+++ b/bolt/workflows/sv_somatic/annotate.py
@@ -5,10 +5,13 @@
import click
import cyvcf2
import pysam
+import logging
from ... import util
+logger = logging.getLogger(__name__)
+
@click.command(name='annotate')
@click.pass_context
diff --git a/conda/env/bolt_env.yml b/conda/env/bolt_env.yml
index e27bd24..71e90c9 100644
--- a/conda/env/bolt_env.yml
+++ b/conda/env/bolt_env.yml
@@ -16,3 +16,4 @@ dependencies:
- python >=3.10
- pyyaml
- vcfanno ==0.3.5
+ - ncurses>=6.3
diff --git a/pyproject.toml b/pyproject.toml
index e7325a3..e98f470 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,6 +17,7 @@ dependencies = [
"cyvcf2",
"pysam",
"pyyaml",
+ "future"
]
[project.scripts]
From cb5ee4d510acd85db1061a65a0c42a0806ae430f Mon Sep 17 00:00:00 2001
From: Quentin Clayssen <37511834+qclayssen@users.noreply.github.com>
Date: Tue, 2 Sep 2025 15:31:24 +1000
Subject: [PATCH 08/22] Feature/pcgr v2.2.1 (#10)
---
bolt/common/constants.py | 78 +++++-------
bolt/common/pcgr.py | 152 +++++++++++++-----------
bolt/util.py | 1 +
bolt/workflows/smlv_germline/report.py | 2 +
bolt/workflows/smlv_somatic/annotate.py | 16 ++-
bolt/workflows/smlv_somatic/filter.py | 16 +--
bolt/workflows/smlv_somatic/report.py | 4 +-
docker/Dockerfile.pcgr | 4 +-
tests/test_smlv_somatic_filter.py | 12 +-
9 files changed, 145 insertions(+), 140 deletions(-)
diff --git a/bolt/common/constants.py b/bolt/common/constants.py
index 29216bc..7cebe78 100644
--- a/bolt/common/constants.py
+++ b/bolt/common/constants.py
@@ -35,9 +35,9 @@
'pathogenic',
'uncertain_significance',
}
-PCGR_TIERS_RESCUE = {
- 'TIER_1',
- 'TIER_2',
+PCGR_ACTIONABILITY_TIER_RESCUE = {
+ '1',
+ '2',
}
@@ -116,16 +116,14 @@ class VcfInfo(enum.Enum):
SAGE_NOVEL = 'SAGE_NOVEL'
SAGE_RESCUE = 'SAGE_RESCUE'
- PCGR_TIER = 'PCGR_TIER'
+ PCGR_ACTIONABILITY_TIER = 'PCGR_ACTIONABILITY_TIER'
PCGR_CSQ = 'PCGR_CSQ'
PCGR_MUTATION_HOTSPOT = 'PCGR_MUTATION_HOTSPOT'
- PCGR_CLINVAR_CLNSIG = 'PCGR_CLINVAR_CLNSIG'
+ PCGR_CLINVAR_CLASSIFICATION = 'PCGR_CLINVAR_CLASSIFICATION'
PCGR_COSMIC_COUNT = 'PCGR_COSMIC_COUNT'
PCGR_TCGA_PANCANCER_COUNT = 'PCGR_TCGA_PANCANCER_COUNT'
PCGR_ICGC_PCAWG_COUNT = 'PCGR_ICGC_PCAWG_COUNT'
- CPSR_FINAL_CLASSIFICATION = 'CPSR_FINAL_CLASSIFICATION'
- CPSR_PATHOGENICITY_SCORE = 'CPSR_PATHOGENICITY_SCORE'
CPSR_CLINVAR_CLASSIFICATION = 'CPSR_CLINVAR_CLASSIFICATION'
CPSR_CSQ = 'CPSR_CSQ'
@@ -151,7 +149,7 @@ class VcfInfo(enum.Enum):
GNOMAD_AF = 'gnomAD_AF'
- PCGR_TIER_RESCUE = 'PCGR_TIER_RESCUE'
+ PCGR_ACTIONABILITY_TIER_RESCUE = 'PCGR_ACTIONABILITY_TIER_RESCUE'
SAGE_HOTSPOT_RESCUE = 'SAGE_HOTSPOT_RESCUE'
CLINICAL_POTENTIAL_RESCUE = 'CLINICAL_POTENTIAL_RESCUE'
@@ -270,7 +268,7 @@ def namespace(self):
'Description': 'Variant rescued by a matching SAGE call',
},
- VcfInfo.PCGR_TIER: {
+ VcfInfo.PCGR_ACTIONABILITY_TIER: {
'Number': '1',
'Type': 'String',
'Description': (
@@ -281,28 +279,29 @@ def namespace(self):
},
VcfInfo.PCGR_CSQ: {
'Number': '.',
- 'Type': 'String',
- 'Description': (
- 'Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|'
- 'Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|'
- 'CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|'
- 'DISTANCE|STRAND|FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|'
- 'MANE_SELECT|MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|'
- 'UNIPROT_ISOFORM|RefSeq|DOMAINS|HGVS_OFFSET|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|'
- 'gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|'
- 'gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|CLIN_SIG|SOMATIC|PHENO|CHECK_REF|'
- 'NearestExonJB'
- ),
+ 'Type': 'String',
+ 'Description': (
+ 'Consequence annotations from Ensembl VEP. Format: '
+ 'Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|'
+ 'HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|'
+ 'ALLELE_NUM|DISTANCE|STRAND|FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|'
+ 'MANE|MANE_SELECT|MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|'
+ 'UNIPROT_ISOFORM|RefSeq|DOMAINS|HGVS_OFFSET|gnomADe_AF|gnomADe_AFR_AF|gnomADe_AMR_AF|'
+ 'gnomADe_ASJ_AF|gnomADe_EAS_AF|gnomADe_FIN_AF|gnomADe_MID_AF|gnomADe_NFE_AF|'
+ 'gnomADe_REMAINING_AF|gnomADe_SAS_AF|CLIN_SIG|SOMATIC|PHENO|CHECK_REF|MOTIF_NAME|'
+ 'MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS|NearestExonJB|'
+ 'MaxEntScan_alt|MaxEntScan_diff|MaxEntScan_ref'
+ ),
},
VcfInfo.PCGR_MUTATION_HOTSPOT: {
'Number': '.',
'Type': 'String',
- 'Description': 'Known cancer mutation hotspot, as found in cancerhotspots.org_v2, Gene|Codon|Q-value',
+ 'Description': 'Known cancer mutation hotspot, as found in cancerhotspots.org. Format: GeneSymbol|Entrez_ID|CodonRefAA|Alt_AA|Q-value',
},
- VcfInfo.PCGR_CLINVAR_CLNSIG: {
+ VcfInfo.PCGR_CLINVAR_CLASSIFICATION: {
'Number': '.',
'Type': 'String',
- 'Description': 'ClinVar clinical significance',
+ 'Description': 'ClinVar - clinical significance - per phenotype submission',
},
VcfInfo.PCGR_COSMIC_COUNT: {
'Number': '1',
@@ -320,23 +319,10 @@ def namespace(self):
'Description': 'Count of ICGC PCAWG hits',
},
- VcfInfo.CPSR_FINAL_CLASSIFICATION: {
- 'Number': '1',
- 'Type': 'String',
- 'Description': (
- 'Final variant classification based on the combination of CLINVAR_CLASSIFICTION (for '
- 'ClinVar-classified variants), and CPSR_CLASSIFICATION (for novel variants)'
- ),
- },
- VcfInfo.CPSR_PATHOGENICITY_SCORE: {
- 'Number': '1',
- 'Type': 'Float',
- 'Description': 'Aggregated CPSR pathogenicity score',
- },
VcfInfo.CPSR_CLINVAR_CLASSIFICATION: {
'Number': '1',
'Type': 'String',
- 'Description': 'Clinical significance of variant on a five-tiered scale',
+ 'Description': 'ClinVar - Overall clinical significance of variant on a five-tiered scale',
},
VcfInfo.CPSR_CSQ: {
'Number': '.',
@@ -345,13 +331,13 @@ def namespace(self):
'Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|'
'Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|'
'Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|'
- 'FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|MANE_SELECT|'
- 'MANE_PLUS_CLINICAL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|UNIPROT_ISOFORM|RefSeq|'
- 'DOMAINS|HGVS_OFFSET|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|gnomAD_AF|gnomAD_AFR_AF|'
- 'gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|'
- 'gnomAD_SAS_AF|CLIN_SIG|SOMATIC|PHENO|CHECK_REF|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|'
- 'MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS|NearestExonJB|LoF|LoF_filter|LoF_flags|'
- 'LoF_info'
+ 'FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|MANE|MANE_SELECT|'
+ 'MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|UNIPROT_ISOFORM|RefSeq|'
+ 'DOMAINS|HGVS_OFFSET|gnomADe_AF|gnomADe_AFR_AF|gnomADe_AMR_AF|gnomADe_ASJ_AF|'
+ 'gnomADe_EAS_AF|gnomADe_FIN_AF|gnomADe_MID_AF|gnomADe_NFE_AF|gnomADe_REMAINING_AF|'
+ 'gnomADe_SAS_AF|CLIN_SIG|SOMATIC|PHENO|CHECK_REF|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|'
+ 'MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS|NearestExonJB|MaxEntScan_alt|MaxEntScan_diff|'
+ 'MaxEntScan_ref'
),
},
@@ -360,7 +346,7 @@ def namespace(self):
'Type': 'Flag',
'Description': '',
},
- VcfInfo.PCGR_TIER_RESCUE: {
+ VcfInfo.PCGR_ACTIONABILITY_TIER_RESCUE: {
'Number': '0',
'Type': 'Flag',
'Description': '',
diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index 296fb48..8a17865 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -1,6 +1,7 @@
import collections
import csv
import functools
+import gzip
import itertools
import pathlib
import re
@@ -115,18 +116,17 @@ def get_minimal_header(input_fh):
return '\n'.join([filetype_line, *chrom_lines, *format_lines, column_line])
-def run_somatic(input_fp, pcgr_refdata_dir, pcgr_pcgr_output_dir, chunk_nbr=None, chunk_nbr=None, threads=1, pcgr_conda=None, pcgrr_conda=None, purity=None, ploidy=None, sample_id=None):
+def run_somatic(input_fp, pcgr_refdata_dir, vep_dir, output_dir, chunk_nbr=None, threads=1, pcgr_conda=None, pcgrr_conda=None, purity=None, ploidy=None, sample_id=None):
- # NOTE(SW): Nextflow FusionFS v2.2.8 does not support PCGR output to S3; instead write to a
- # temporary directory outside of the FusionFS mounted directory then manually copy across
- temp_dir = tempfile.TemporaryDirectory()
- temp_dir_path = pathlib.Path(temp_dir.name)
- pcgr_output_dir = pcgr_output_dir / f"pcgr_{chunk_nbr}" if chunk_nbr is not None else pcgr_output_dir # Check if the output directory already exists
- if pcgr_output_dir.exists():
- logger.warning(f"Warning: Output directory '{pcgr_output_dir}' already exists and will be overwrited")
- shutil.rmtree(pcgr_output_dir)
+ output_dir = output_dir / f"pcgr_{chunk_nbr}" if chunk_nbr is not None else output_dir
+ if output_dir.exists():
+ logger.warning(f"Output directory '{output_dir}' already exists and will be overwritten")
+ shutil.rmtree(output_dir)
+
+ # Create output directory
+ output_dir.mkdir(parents=True, exist_ok=True)
if not sample_id:
sample_id = 'nosampleset'
@@ -134,19 +134,20 @@ def run_somatic(input_fp, pcgr_refdata_dir, pcgr_pcgr_output_dir, chunk_nbr=None
command_args = [
f'--sample_id {sample_id}',
f'--input_vcf {input_fp}',
+ f'--vep_dir {vep_dir}',
+ f'--refdata_dir {pcgr_refdata_dir}',
f'--tumor_dp_tag TUMOR_DP',
f'--tumor_af_tag TUMOR_AF',
f'--control_dp_tag NORMAL_DP',
f'--control_af_tag NORMAL_AF',
- f'--pcgr_dir {pcgr_refdata_dir}',
f'--genome_assembly grch38',
f'--assay WGS',
f'--estimate_signatures',
- f'--estimate_msi_status',
+ f'--estimate_msi',
f'--estimate_tmb',
- f'--show_noncoding',
f'--vcfanno_n_proc {threads}',
- f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane',
+ f'--vep_n_forks 4',
+ f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane_plus_clinical,mane_select',
]
# NOTE(SW): VEP pick order is applied as a successive filter:
@@ -172,7 +173,7 @@ def run_somatic(input_fp, pcgr_refdata_dir, pcgr_pcgr_output_dir, chunk_nbr=None
command_args.append(f'--tumor_ploidy {ploidy}')
# NOTE(SW): placed here to always have output directory last
- command_args.append(f'--output_dir {temp_dir.name}')
+ command_args.append(f'--output_dir {output_dir}')
delimiter_padding = ' ' * 10
delimiter = f' \\\n{delimiter_padding}'
@@ -190,18 +191,16 @@ def run_somatic(input_fp, pcgr_refdata_dir, pcgr_pcgr_output_dir, chunk_nbr=None
command = command_formatting + command_conda + command
# Log file path
- log_file_path = temp_dir_path / "run_somatic.log"
+ log_file_path = output_dir / "run_somatic.log"
# Run the command and redirect output to the log file
util.execute_command(command, log_file_path=log_file_path)
- shutil.copytree(temp_dir.name, pcgr_output_dir)
-
- pcgr_tsv_fp = pathlib.Path(pcgr_output_dir) / 'nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv'
- pcgr_vcf_fp = pathlib.Path(pcgr_output_dir) / 'nosampleset.pcgr_acmg.grch38.vcf.gz'
+ pcgr_tsv_fp = pathlib.Path(output_dir) / f'{sample_id}.pcgr.grch38.snv_indel_ann.tsv.gz'
+ pcgr_vcf_fp = pathlib.Path(output_dir) / f'{sample_id}.pcgr.grch38.pass.vcf.gz'
# Check if both files exist
- if not pcgr_tsv_fp.exists():
+ if not pcgr_tsv_fp.exists():
raise FileNotFoundError(f"Expected file {pcgr_tsv_fp} not found.")
if not pcgr_vcf_fp.exists():
raise FileNotFoundError(f"Expected file {pcgr_vcf_fp} not found.")
@@ -209,37 +208,42 @@ def run_somatic(input_fp, pcgr_refdata_dir, pcgr_pcgr_output_dir, chunk_nbr=None
return pcgr_tsv_fp, pcgr_vcf_fp
-def run_germline(input_fp, panel_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=None, pcgrr_conda=None, sample_id=None):
+def run_germline(input_fp, panel_fp, pcgr_refdata_dir, vep_dir, output_dir, threads=1, pcgr_conda=None, pcgrr_conda=None, sample_id=None):
if not sample_id:
sample_id = 'nosampleset'
- # NOTE(SW): Nextflow FusionFS v2.2.8 does not support PCGR output to S3; instead write to a
- # temporary directory outside of the FusionFS mounted directory then manually copy across
- temp_dir = tempfile.TemporaryDirectory()
cpsr_output_dir = output_dir / 'cpsr/'
+ if cpsr_output_dir.exists():
+ logger.warning(f"Output directory '{cpsr_output_dir}' already exists and will be overwritten")
+ shutil.rmtree(cpsr_output_dir)
+
+ # Create output directory
+ cpsr_output_dir.mkdir(parents=True, exist_ok=True)
+
command_args = [
f'--sample_id {sample_id}',
f'--input_vcf {input_fp}',
f'--genome_assembly grch38',
f'--custom_list {panel_fp}',
+ f'--vep_dir {pcgr_refdata_dir}',
+ f'--refdata_dir {pcgr_refdata_dir}',
# NOTE(SW): probably useful to add versioning information here; weigh against maintainence
# burden
f'--custom_list_name umccr_germline_panel',
f'--pop_gnomad global',
f'--classify_all',
- f'--pcgr_dir {pcgr_refdata_dir}',
f'--vcfanno_n_proc {threads}',
- f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane',
+ f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane_plus_clinical,mane_select',
]
if pcgrr_conda:
command_args.append(f'--pcgrr_conda {pcgrr_conda}')
# NOTE(SW): placed here to always have output directory last
- command_args.append(f'--output_dir {temp_dir.name}')
+ command_args.append(f'--output_dir {cpsr_output_dir}')
delimiter_padding = ' ' * 10
delimiter = f' \\\n{delimiter_padding}'
@@ -258,8 +262,6 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, output_dir, threads=1, pc
util.execute_command(command)
- shutil.copytree(temp_dir.name, cpsr_output_dir)
-
return cpsr_output_dir
@@ -267,11 +269,14 @@ def transfer_annotations_somatic(input_fp, tumor_name, pcgr_vcf_fp, pcgr_tsv_fp,
# Set destination INFO field names and source TSV fields
info_field_map = {
constants.VcfInfo.PCGR_MUTATION_HOTSPOT: 'MUTATION_HOTSPOT',
- constants.VcfInfo.PCGR_CLINVAR_CLNSIG: 'CLINVAR_CLNSIG',
+ constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION: 'CLINVAR_CLASSIFICATION',
constants.VcfInfo.PCGR_TCGA_PANCANCER_COUNT: 'TCGA_PANCANCER_COUNT',
constants.VcfInfo.PCGR_CSQ: 'CSQ',
}
+ pcgr_tsv_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv'
+ pcgr_vcf_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr_acmg.grch38.vcf.gz'
+
# Enforce matching defined and source INFO annotations
check_annotation_headers(info_field_map, pcgr_vcf_fp)
@@ -281,13 +286,11 @@ def transfer_annotations_somatic(input_fp, tumor_name, pcgr_vcf_fp, pcgr_tsv_fp,
# Open filehandles, set required header entries
input_fh = cyvcf2.VCF(input_fp)
- util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_TIER)
+ util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_ACTIONABILITY_TIER)
util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CSQ)
util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_MUTATION_HOTSPOT)
- util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CLINVAR_CLNSIG)
- util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_COSMIC_COUNT)
+ util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION)
util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_TCGA_PANCANCER_COUNT)
- util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_ICGC_PCAWG_COUNT)
output_fp = output_dir / f'{tumor_name}.annotations.vcf.gz'
output_fh = cyvcf2.Writer(output_fp, input_fh, 'wz')
@@ -298,20 +301,19 @@ def transfer_annotations_somatic(input_fp, tumor_name, pcgr_vcf_fp, pcgr_tsv_fp,
if record.CHROM == 'chrM':
continue
# Annotate and write
- record_ann = annotate_record(record, pcgr_data)
+ record_ann = annotate_record(record, pcgr_data, allow_missing=True)
output_fh.write_record(record_ann)
def transfer_annotations_germline(input_fp, normal_name, cpsr_dir, output_dir):
# Set destination INFO field names and source TSV fields
+ # Note: Only include fields that exist in CPSR v2.2.1 output
info_field_map = {
- constants.VcfInfo.CPSR_FINAL_CLASSIFICATION: 'FINAL_CLASSIFICATION',
- constants.VcfInfo.CPSR_PATHOGENICITY_SCORE: 'CPSR_PATHOGENICITY_SCORE',
constants.VcfInfo.CPSR_CLINVAR_CLASSIFICATION: 'CLINVAR_CLASSIFICATION',
constants.VcfInfo.CPSR_CSQ: 'CSQ',
}
- cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.snvs_indels.tiers.tsv'
+ cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.classification.tsv.gz'
cpsr_vcf_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.vcf.gz'
# Enforce matching defined and source INFO annotations
@@ -323,8 +325,6 @@ def transfer_annotations_germline(input_fp, normal_name, cpsr_dir, output_dir):
# Open filehandles, set required header entries
input_fh = cyvcf2.VCF(input_fp)
- util.add_vcf_header_entry(input_fh, constants.VcfInfo.CPSR_FINAL_CLASSIFICATION)
- util.add_vcf_header_entry(input_fh, constants.VcfInfo.CPSR_PATHOGENICITY_SCORE)
util.add_vcf_header_entry(input_fh, constants.VcfInfo.CPSR_CLINVAR_CLASSIFICATION)
util.add_vcf_header_entry(input_fh, constants.VcfInfo.CPSR_CSQ)
@@ -352,13 +352,14 @@ def check_annotation_headers(info_field_map, vcf_fp):
try:
header_src_entry = vcf_fh.get_header_type(header_src)
except KeyError:
+ print(f"Missing in VCF: {header_src}")
continue
header_dst_entry = util.get_vcf_header_entry(header_dst)
# Remove leading and trailing quotes from source
header_src_description_unquoted = header_src_entry['Description'].strip('"')
- assert header_src_description_unquoted == header_dst_entry['Description']
-
+ if header_src_description_unquoted != header_dst_entry['Description']:
+ raise AssertionError(f"Mismatch for {header_src}:\nVCF: {header_src_description_unquoted}\nExpected: {header_dst_entry['Description']}")
def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map):
# Gather all annotations from TSV
@@ -368,27 +369,13 @@ def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map):
key, record_ann = get_annotation_entry_tsv(record, info_field_map)
assert key not in data_tsv
- # Process PCGR_TIER
- # TIER_1, TIER_2, TIER_3, TIER_4, NONCODING
- record_ann[constants.VcfInfo.PCGR_TIER] = record['TIER'].replace(' ', '_')
-
- # Count COSMIC hits
- if record['COSMIC_MUTATION_ID'] == 'NA':
- cosmic_count = 0
- else:
- cosmic_count = len(record['COSMIC_MUTATION_ID'].split('&'))
- record_ann[constants.VcfInfo.PCGR_COSMIC_COUNT] = cosmic_count
-
- # Count ICGC-PCAWG hits by taking sum of affected donors where the annotation value has
- # the following format: project_code|tumor_type|affected_donors|tested_donors|frequency
- icgc_pcawg_count = 0
- if record['ICGC_PCAWG_OCCURRENCE'] != 'NA':
- for pcawg_hit_data in record['ICGC_PCAWG_OCCURRENCE'].split(','):
- pcawrg_hit_data_fields = pcawg_hit_data.split('|')
- affected_donors = int(pcawrg_hit_data_fields[2])
- icgc_pcawg_count += affected_donors
- assert icgc_pcawg_count > 0
- record_ann[constants.VcfInfo.PCGR_ICGC_PCAWG_COUNT] = icgc_pcawg_count
+ # Process PCGR_ACTIONABILITY_TIER
+ # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING
+ record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER'].replace(' ', '_')
+
+ # Process PCGR_ACTIONABILITY_TIER
+ # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING
+ record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER']
# Store annotation data
data_tsv[key] = record_ann
@@ -404,7 +391,7 @@ def collect_cpsr_annotation_data(tsv_fp, vcf_fp, info_field_map):
# Gather annotations from TSV
data_tsv = dict()
    gdot_re = re.compile('^(?P<chrom>[\dXYM]+):g\.(?P<pos>\d+)(?P<ref>[A-Z]+)>(?P<alt>[A-Z]+)$')
- with open(tsv_fp, 'r') as tsv_fh:
+ with gzip.open(tsv_fp, 'rt') as tsv_fh:
for record in csv.DictReader(tsv_fh, delimiter='\t'):
# Decompose CPSR 'GENOMIC_CHANGE' field into CHROM, POS, REF, and ALT
re_result = gdot_re.match(record['GENOMIC_CHANGE'])
@@ -426,6 +413,24 @@ def collect_cpsr_annotation_data(tsv_fp, vcf_fp, info_field_map):
# Compile annotations, prefering TSV source
return compile_annotation_data(data_tsv, data_vcf)
+def parse_genomic_change(genomic_change):
+ """
+ Parse a genomic change string, e.g., "3:g.41224645T>C"
+ Returns a tuple: (chrom, pos, ref, alt)
+ """
+ # Regular expression for the format "chrom:g.posRef>Alt"
+    pattern = r'^(?P<chrom>\w+):g\.(?P<pos>\d+)(?P<ref>\w+)>(?P<alt>\w+)$'
+ match = re.match(pattern, genomic_change)
+ if not match:
+ raise ValueError(f"Format not recognized: {genomic_change}")
+
+ # Get values and format as needed
+ chrom = f"chr{match.group('chrom')}"
+ pos = int(match.group('pos'))
+ ref = match.group('ref')
+ alt = match.group('alt')
+ return chrom, pos, ref, alt
+
def get_annotations_vcf(vcf_fp, info_field_map):
data_vcf = dict()
@@ -445,10 +450,14 @@ def get_annotations_vcf(vcf_fp, info_field_map):
def get_annotation_entry_tsv(record, info_field_map):
- # Set lookup key; PCGR/CPSR strips leading 'chr' from contig names
- chrom = f'chr{record["CHROM"]}'
- pos = int(record['POS'])
- key = (chrom, pos, record['REF'], record['ALT'])
+ # If GENOMIC_CHANGE is present, parse it for coordinates; otherwise, use separate fields.
+ if "GENOMIC_CHANGE" in record and record["GENOMIC_CHANGE"]:
+ chrom, pos, ref, alt = parse_genomic_change(record["GENOMIC_CHANGE"])
+
+ if not chrom.startswith('chr'):
+ chrom = f'chr{chrom}'
+
+ key = (chrom, pos, ref, alt)
record_ann = dict()
for info_dst, info_src in info_field_map.items():
@@ -510,7 +519,6 @@ def split_vcf(input_vcf, output_dir):
chunk_number = 1
variant_count = 0
base_filename = pathlib.Path(input_vcf).stem
- chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
base_filename = input_vcf.stem
chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
chunk_files.append(chunk_filename)
@@ -540,7 +548,7 @@ def split_vcf(input_vcf, output_dir):
logger.info(f"VCF file split into {len(chunk_files)} chunks.")
return chunk_files
-def run_somatic_chunck(vcf_chunks, pcgr_data_dir, output_dir, pcgr_output_dir, max_threads, pcgr_conda, pcgrr_conda):
+def run_somatic_chunck(vcf_chunks, pcgr_data_dir, vep_dir, output_dir, pcgr_output_dir, max_threads, pcgr_conda, pcgrr_conda):
pcgr_tsv_files = []
pcgr_vcf_files = []
num_chunks = len(vcf_chunks)
@@ -555,7 +563,7 @@ def run_somatic_chunck(vcf_chunks, pcgr_data_dir, output_dir, pcgr_output_dir, m
# Assign extra thread to the first 'threads_rem' chunks
additional_thread = 1 if chunk_number <= threads_rem else 0
total_threads = threads_per_chunk + additional_thread
- futures[executor.submit(run_somatic, vcf_file, pcgr_data_dir, pcgr_output_dir, chunk_number, total_threads, pcgr_conda, pcgrr_conda)] = chunk_number
+ futures[executor.submit(run_somatic, vcf_file, pcgr_data_dir, vep_dir, pcgr_output_dir, chunk_number, total_threads, pcgr_conda, pcgrr_conda)] = chunk_number
for future in concurrent.futures.as_completed(futures):
try:
pcgr_tsv_fp, pcgr_vcf_fp = future.result()
@@ -577,7 +585,7 @@ def merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_fp):
util.merge_tsv_files(pcgr_tsv_fp, merged_tsv_fp)
# Step 5: Merge all VCF files into a single file in the pcgr directory
- merged_vcf_path = pcgr_dir / "nosampleset.pcgr_acmg.grch38"
+ merged_vcf_path = pcgr_dir / "nosampleset.pcgr.grch38.pass.vcf.gz"
merged_vcf = util.merge_vcf_files(pcgr_vcf_files, merged_vcf_path)
return merged_vcf, merged_tsv_fp
diff --git a/bolt/util.py b/bolt/util.py
index b2ce7df..34df80f 100644
--- a/bolt/util.py
+++ b/bolt/util.py
@@ -1,3 +1,4 @@
+import gzip
import os
import pathlib
import subprocess
diff --git a/bolt/workflows/smlv_germline/report.py b/bolt/workflows/smlv_germline/report.py
index 0a10711..395268f 100644
--- a/bolt/workflows/smlv_germline/report.py
+++ b/bolt/workflows/smlv_germline/report.py
@@ -23,6 +23,7 @@
@click.option('--germline_panel_list_fp', required=True, type=click.Path(exists=True))
@click.option('--pcgr_data_dir', required=True, type=click.Path(exists=True))
+@click.option('--vep_dir', required=True, type=click.Path(exists=True))
@click.option('--threads', required=True, type=int, default=1)
@@ -73,6 +74,7 @@ def entry(ctx, **kwargs):
cpsr_prep_fp,
kwargs['germline_panel_list_fp'],
kwargs['pcgr_data_dir'],
+ kwargs['vep_dir'],
output_dir,
threads=kwargs['threads'],
pcgr_conda=kwargs['pcgr_conda'],
diff --git a/bolt/workflows/smlv_somatic/annotate.py b/bolt/workflows/smlv_somatic/annotate.py
index fe0007a..471b816 100644
--- a/bolt/workflows/smlv_somatic/annotate.py
+++ b/bolt/workflows/smlv_somatic/annotate.py
@@ -22,6 +22,7 @@
@click.option('--pon_dir', required=True, type=click.Path(exists=True))
@click.option('--pcgr_data_dir', required=True, type=click.Path(exists=True))
+@click.option('--vep_dir', required=True, type=click.Path(exists=True))
@click.option('--pcgr_conda', required=False, type=str)
@click.option('--pcgrr_conda', required=False, type=str)
@@ -81,8 +82,7 @@ def entry(ctx, **kwargs):
# - PCGR ACMG TIER [INFO/PCGR_TIER]
# - VEP consequence [INFO/PCR_CSQ]
# - Known mutation hotspot [INFO/PCGR_MUTATION_HOTSPOT]
- # - ClinVar clinical significant [INFO/PCGR_CLINVAR_CLNSIG]
- # - Hits in COSMIC [INFO/PCGR_COSMIC_COUNT]
+    #   - ClinVar clinical significance [INFO/PCGR_CLINVAR_CLASSIFICATION]
# - Hits in TCGA [INFO/PCGR_TCGA_PANCANCER_COUNT]
# - Hits in PCAWG [INFO/PCGR_ICGC_PCAWG_COUNT]
@@ -107,6 +107,7 @@ def entry(ctx, **kwargs):
pcgr_tsv_fp, pcgr_vcf_fp = util.run_somatic_chunck(
vcf_chunks,
kwargs['pcgr_data_dir'],
+ kwargs['vep_dir'],
output_dir,
pcgr_output_dir,
kwargs['threads'],
@@ -117,6 +118,17 @@ def entry(ctx, **kwargs):
pcgr_tsv_fp, pcgr_vcf_fp = pcgr.run_somatic(
pcgr_prep_fp,
kwargs['pcgr_data_dir'],
+ output_dir,
+ pcgr_output_dir,
+ kwargs['threads'],
+ kwargs['pcgr_conda'],
+ kwargs['pcgrr_conda']
+ )
+ else:
+ pcgr_tsv_fp, pcgr_vcf_fp = pcgr.run_somatic(
+ pcgr_prep_fp,
+ kwargs['pcgr_data_dir'],
+ kwargs['vep_dir'],
pcgr_output_dir,
chunk_nbr=None,
threads=kwargs['threads'],
diff --git a/bolt/workflows/smlv_somatic/filter.py b/bolt/workflows/smlv_somatic/filter.py
index 104296d..814863c 100644
--- a/bolt/workflows/smlv_somatic/filter.py
+++ b/bolt/workflows/smlv_somatic/filter.py
@@ -40,7 +40,7 @@ def entry(ctx, **kwargs):
constants.VcfFilter.ENCODE,
constants.VcfFilter.GNOMAD_COMMON,
constants.VcfInfo.SAGE_HOTSPOT_RESCUE,
- constants.VcfInfo.PCGR_TIER_RESCUE,
+ constants.VcfInfo.PCGR_ACTIONABILITY_TIER_RESCUE,
constants.VcfInfo.CLINICAL_POTENTIAL_RESCUE,
constants.VcfInfo.RESCUED_FILTERS_EXISTING,
constants.VcfInfo.RESCUED_FILTERS_PENDING,
@@ -165,9 +165,9 @@ def set_filter_data(record, tumor_index):
##
# PCGR tier rescue
##
- pcgr_tier = record.INFO.get(constants.VcfInfo.PCGR_TIER.value)
- if pcgr_tier in constants.PCGR_TIERS_RESCUE:
- info_rescue.append(constants.VcfInfo.PCGR_TIER_RESCUE)
+ pcgr_tier = record.INFO.get(constants.VcfInfo.PCGR_ACTIONABILITY_TIER.value)
+ if pcgr_tier in constants.PCGR_ACTIONABILITY_TIER_RESCUE:
+ info_rescue.append(constants.VcfInfo.PCGR_ACTIONABILITY_TIER_RESCUE)
##
# SAGE hotspot rescue
@@ -184,19 +184,15 @@ def set_filter_data(record, tumor_index):
# single CLINICAL_POTENTIAL_RESCUE flag
# Get ClinVar clinical significance entries
- clinvar_clinsig = record.INFO.get(constants.VcfInfo.PCGR_CLINVAR_CLNSIG.value, '')
+ clinvar_clinsig = record.INFO.get(constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION.value, '')
clinvar_clinsigs = clinvar_clinsig.split(',')
# Hit counts in relevant reference somatic mutation databases
- cosmic_count = record.INFO.get(constants.VcfInfo.PCGR_COSMIC_COUNT.value, 0)
tcga_pancancer_count = record.INFO.get(constants.VcfInfo.PCGR_TCGA_PANCANCER_COUNT.value, 0)
- icgc_pcawg_count = record.INFO.get(constants.VcfInfo.PCGR_ICGC_PCAWG_COUNT.value, 0)
if (
record.INFO.get(constants.VcfInfo.HMF_HOTSPOT.value) is not None or
record.INFO.get(constants.VcfInfo.PCGR_MUTATION_HOTSPOT.value) is not None or
any(e in clinvar_clinsigs for e in constants.CLINVAR_CLINSIGS_RESCUE) or
- cosmic_count >= constants.MIN_COSMIC_COUNT_RESCUE or
- tcga_pancancer_count >= constants.MIN_TCGA_PANCANCER_COUNT_RESCUE or
- icgc_pcawg_count >= constants.MIN_ICGC_PCAWG_COUNT_RESCUE
+ tcga_pancancer_count >= constants.MIN_TCGA_PANCANCER_COUNT_RESCUE
):
info_rescue.append(constants.VcfInfo.CLINICAL_POTENTIAL_RESCUE)
diff --git a/bolt/workflows/smlv_somatic/report.py b/bolt/workflows/smlv_somatic/report.py
index b2bead1..ede90ca 100644
--- a/bolt/workflows/smlv_somatic/report.py
+++ b/bolt/workflows/smlv_somatic/report.py
@@ -28,6 +28,7 @@
@click.option('--pcgrr_conda', required=False, type=str)
@click.option('--pcgr_data_dir', required=False, type=str)
+@click.option('--vep_dir', required=True, type=click.Path(exists=True))
@click.option('--purple_purity_fp', required=True, type=click.Path(exists=True))
@click.option('--cancer_genes_fp', required=True, type=click.Path(exists=True))
@@ -131,7 +132,8 @@ def entry(ctx, **kwargs):
pcgr.run_somatic(
pcgr_prep_fp,
kwargs['pcgr_data_dir'],
- output_dir,
+ kwargs['vep_dir'],
+ pcgr_output_dir,
threads=kwargs['threads'],
pcgr_conda=kwargs['pcgr_conda'],
pcgrr_conda=kwargs['pcgrr_conda'],
diff --git a/docker/Dockerfile.pcgr b/docker/Dockerfile.pcgr
index 4ea4932..ca984e8 100644
--- a/docker/Dockerfile.pcgr
+++ b/docker/Dockerfile.pcgr
@@ -15,13 +15,13 @@ RUN \
conda create \
--solver libmamba \
--name pcgr \
- --file https://raw.githubusercontent.com/sigven/pcgr/v1.4.1/conda/env/lock/pcgr-linux-64.lock
+ --file https://raw.githubusercontent.com/sigven/pcgr/refs/tags/v2.2.1/conda/env/lock/pcgr-linux-64.lock
RUN \
conda create \
--solver libmamba \
--name pcgrr \
- --file https://raw.githubusercontent.com/sigven/pcgr/v1.4.1/conda/env/lock/pcgrr-linux-64.lock
+ --file https://raw.githubusercontent.com/sigven/pcgr/refs/tags/v2.2.1/conda/env/lock/pcgrr-linux-64.lock
COPY ./conda/env/bolt_env.yml /tmp/
RUN \
diff --git a/tests/test_smlv_somatic_filter.py b/tests/test_smlv_somatic_filter.py
index 41a0e96..6455162 100644
--- a/tests/test_smlv_somatic_filter.py
+++ b/tests/test_smlv_somatic_filter.py
@@ -160,15 +160,15 @@ def test_common_population_filter(self):
def test_pcgr_tier_rescue(self):
pcgr_tiers = [
- 'TIER_1',
- 'TIER_2',
+ '1',
+ '2',
]
- rescue_tag_str = bolt_constants.VcfInfo.PCGR_TIER_RESCUE.value
+ rescue_tag_str = bolt_constants.VcfInfo.PCGR_ACTIONABILITY_TIER_RESCUE.value
for pcgr_tier in pcgr_tiers:
record = get_record(
**self.records['filter_min_af9.9'],
- info_data={'PCGR_TIER': pcgr_tier},
+ info_data={'PCGR_ACTIONABILITY_TIER': pcgr_tier},
)
smlv_somatic_filter.set_filter_data(record, 0)
assert not record.FILTER
@@ -189,9 +189,7 @@ def test_clinical_potential_rescue_general(self):
info_data_sets = [
{'HMF_HOTSPOT': ''},
{'PCGR_MUTATION_HOTSPOT': ''},
- {'PCGR_COSMIC_COUNT': 11},
{'PCGR_TCGA_PANCANCER_COUNT': 6},
- {'PCGR_ICGC_PCAWG_COUNT': 6},
]
rescue_tag_str = bolt_constants.VcfInfo.CLINICAL_POTENTIAL_RESCUE.value
@@ -217,7 +215,7 @@ def test_clinical_potential_rescue_clinvar_clinsig(self):
for clinsig in clinsigs:
record = get_record(
**self.records['filter_min_af9.9'],
- info_data={'PCGR_CLINVAR_CLNSIG': clinsig},
+ info_data={'PCGR_CLINVAR_CLASSIFICATION': clinsig},
)
smlv_somatic_filter.set_filter_data(record, 0)
assert not record.FILTER
From 9d8e4d11292d5875dda0dbc901eda0940903bdb4 Mon Sep 17 00:00:00 2001
From: Quentin Clayssen
Date: Tue, 16 Sep 2025 10:34:44 +1000
Subject: [PATCH 09/22] fix merge duplication and imports
---
bolt/common/pcgr.py | 55 ++++++++++++++++---------
bolt/util.py | 51 ++---------------------
bolt/workflows/smlv_somatic/annotate.py | 53 +++++++++---------------
pyproject.toml | 1 -
4 files changed, 59 insertions(+), 101 deletions(-)
diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index 8a17865..72376fc 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -8,6 +8,7 @@
import shutil
import tempfile
import logging
+import concurrent.futures
import cyvcf2
@@ -228,7 +229,7 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, vep_dir, output_dir, thre
f'--input_vcf {input_fp}',
f'--genome_assembly grch38',
f'--custom_list {panel_fp}',
- f'--vep_dir {pcgr_refdata_dir}',
+ f'--vep_dir {vep_dir}',
f'--refdata_dir {pcgr_refdata_dir}',
# NOTE(SW): probably useful to add versioning information here; weigh against maintainence
# burden
@@ -274,8 +275,9 @@ def transfer_annotations_somatic(input_fp, tumor_name, pcgr_vcf_fp, pcgr_tsv_fp,
constants.VcfInfo.PCGR_CSQ: 'CSQ',
}
- pcgr_tsv_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv'
- pcgr_vcf_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr_acmg.grch38.vcf.gz'
+ # Respect paths provided; do not override
+ pcgr_tsv_fp = pathlib.Path(pcgr_tsv_fp)
+ pcgr_vcf_fp = pathlib.Path(pcgr_vcf_fp)
# Enforce matching defined and source INFO annotations
check_annotation_headers(info_field_map, pcgr_vcf_fp)
@@ -364,18 +366,27 @@ def check_annotation_headers(info_field_map, vcf_fp):
def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map):
# Gather all annotations from TSV
data_tsv = dict()
- with open(tsv_fp, 'r') as tsv_fh:
+ # Read gz or plain text based on extension
+ open_fn = gzip.open if str(tsv_fp).endswith('.gz') else open
+ with open_fn(tsv_fp, 'rt') as tsv_fh:
for record in csv.DictReader(tsv_fh, delimiter='\t'):
key, record_ann = get_annotation_entry_tsv(record, info_field_map)
assert key not in data_tsv
- # Process PCGR_ACTIONABILITY_TIER
- # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING
- record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER'].replace(' ', '_')
-
- # Process PCGR_ACTIONABILITY_TIER
- # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING
- record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER']
+ # Normalize PCGR actionability tier to simple values: '1','2','3','4','N'
+ raw_tier = (record.get('ACTIONABILITY_TIER') or '').strip()
+ tier_norm = raw_tier.replace('_', ' ').upper()
+ if tier_norm in ('TIER 1','TIER1','1'):
+ tier_val = '1'
+ elif tier_norm in ('TIER 2','TIER2','2'):
+ tier_val = '2'
+ elif tier_norm in ('TIER 3','TIER3','3'):
+ tier_val = '3'
+ elif tier_norm in ('TIER 4','TIER4','4'):
+ tier_val = '4'
+ else:
+ tier_val = 'N'
+ record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = tier_val
# Store annotation data
data_tsv[key] = record_ann
@@ -450,12 +461,18 @@ def get_annotations_vcf(vcf_fp, info_field_map):
def get_annotation_entry_tsv(record, info_field_map):
- # If GENOMIC_CHANGE is present, parse it for coordinates; otherwise, use separate fields.
- if "GENOMIC_CHANGE" in record and record["GENOMIC_CHANGE"]:
- chrom, pos, ref, alt = parse_genomic_change(record["GENOMIC_CHANGE"])
-
- if not chrom.startswith('chr'):
- chrom = f'chr{chrom}'
+ # If GENOMIC_CHANGE is present, parse it; otherwise, fallback to CHROM/POS/REF/ALT fields
+ chrom = pos = ref = alt = None
+ if 'GENOMIC_CHANGE' in record and record['GENOMIC_CHANGE']:
+ chrom, pos, ref, alt = parse_genomic_change(record['GENOMIC_CHANGE'])
+ else:
+ chrom = record.get('CHROM') or record.get('Chromosome')
+ pos = int(record.get('POS') or record.get('Start_position'))
+ ref = record.get('REF')
+ alt = record.get('ALT')
+ # Ensure chrom has 'chr' prefix
+ if chrom and not str(chrom).startswith('chr'):
+ chrom = f'chr{chrom}'
key = (chrom, pos, ref, alt)
@@ -518,7 +535,7 @@ def split_vcf(input_vcf, output_dir):
chunk_files = []
chunk_number = 1
variant_count = 0
- base_filename = pathlib.Path(input_vcf).stem
+ input_vcf = pathlib.Path(input_vcf)
base_filename = input_vcf.stem
chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
chunk_files.append(chunk_filename)
@@ -606,7 +623,7 @@ def get_variant_filter_data(variant):
data = {e: None for e in attribute_names}
- data['tier'] = variant.INFO['PCGR_TIER']
+ data['tier'] = variant.INFO.get('PCGR_ACTIONABILITY_TIER')
info_keys = [k for k, v in variant.INFO]
diff --git a/bolt/util.py b/bolt/util.py
index 34df80f..7310b89 100644
--- a/bolt/util.py
+++ b/bolt/util.py
@@ -1,11 +1,11 @@
-import gzip
-import os
import pathlib
import subprocess
import textwrap
import logging
from types import SimpleNamespace
+import cyvcf2
+
from .common import constants
# Set up logging
@@ -122,8 +122,7 @@ def split_vcf(input_vcf, output_dir):
chunk_files = []
chunk_number = 1
variant_count = 0
- base_filename = pathlib.Path(input_vcf).stem
- chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
+ input_vcf = pathlib.Path(input_vcf)
base_filename = input_vcf.stem
chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
chunk_files.append(chunk_filename)
@@ -248,47 +247,3 @@ def merge_vcf_files(vcf_files, merged_vcf_fp):
merged_unsorted_vcf.unlink()
return merged_vcf
-
-def merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_fp):
- # Step 3: Merge all chunk VCF files into a single file
- pcgr_dir = output_dir / 'pcgr/'
- pcgr_dir.mkdir(exist_ok=True)
- # Merge all TSV files into a single file in the pcgr directory merged_tsv_fp = os.path.join(pcgr_dir, "nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv")
- merged_tsv_fp = os.path.join(pcgr_dir, "nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv")
- merge_tsv_files(pcgr_tsv_fp, merged_tsv_fp)
- # Step 5: Merge all VCF files into a single file in the pcgr directory
- merged_vcf_path = os.path.join(pcgr_dir, "nosampleset.pcgr_acmg.grch38")
- merged_vcf = merge_vcf_files(pcgr_vcf_files, merged_vcf_path)
- return merged_vcf, merged_tsv_fp
-
-def run_somatic_chunck(vcf_chunks, pcgr_data_dir, output_dir, pcgr_output_dir, max_threads, pcgr_conda, pcgrr_conda):
- pcgr_tsv_files = []
- pcgr_vcf_files = []
-
- num_chunks = len(vcf_chunks)
- # Ensure we don't use more workers than available threads, and each worker has at least 2 threads
- max_workers = min(num_chunks, max_threads // 2)
- threads_quot, threads_rem = divmod(max_threads, num_chunks)
- threads_per_chunk = max(2, threads_quot)
-
- # Limit the number of workers to the smaller of num_chunks or max_threads // 2
- with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
- futures = {}
- for chunk_number, vcf_file in enumerate(vcf_chunks, start=1):
- # Assign extra thread to the first 'threads_rem' chunks
- additional_thread = 1 if chunk_number <= threads_rem else 0
- total_threads = threads_per_chunk + additional_thread
- futures[executor.submit(pcgr.run_somatic, vcf_file, pcgr_data_dir, pcgr_output_dir, chunk_number, total_threads, pcgr_conda, pcgrr_conda)] = chunk_number
-
- for future in concurrent.futures.as_completed(futures):
- try:
- pcgr_tsv_fp, pcgr_vcf_fp = future.result()
- if pcgr_tsv_fp:
- pcgr_tsv_files.append(pcgr_tsv_fp)
- if pcgr_vcf_fp:
- pcgr_vcf_files.append(pcgr_vcf_fp)
- except Exception as e:
- print(f"Exception occurred: {e}")
-
- merged_vcf_fp, merged_tsv_fp = merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_files)
- return merged_tsv_fp, merged_vcf_fp
\ No newline at end of file
diff --git a/bolt/workflows/smlv_somatic/annotate.py b/bolt/workflows/smlv_somatic/annotate.py
index 471b816..d0efa93 100644
--- a/bolt/workflows/smlv_somatic/annotate.py
+++ b/bolt/workflows/smlv_somatic/annotate.py
@@ -98,43 +98,30 @@ def entry(ctx, **kwargs):
total_variants = util.count_vcf_records(pcgr_prep_fp)
print(f"Total number of variants in the input VCF: {total_variants}")
- # Run PCGR in chunks if the total number of variants exceeds the maximum allowed for somatic variants
+ # Run PCGR in chunks if exceeding the maximum allowed for somatic variants
if total_variants > constants.MAX_SOMATIC_VARIANTS:
- vcf_chunks = util.split_vcf(
- pcgr_prep_fp,
- output_dir
+ vcf_chunks = pcgr.split_vcf(pcgr_prep_fp, output_dir)
+ pcgr_tsv_fp, pcgr_vcf_fp = pcgr.run_somatic_chunck(
+ vcf_chunks,
+ kwargs['pcgr_data_dir'],
+ kwargs['vep_dir'],
+ output_dir,
+ pcgr_output_dir,
+ kwargs['threads'],
+ kwargs['pcgr_conda'],
+ kwargs['pcgrr_conda'],
)
- pcgr_tsv_fp, pcgr_vcf_fp = util.run_somatic_chunck(
- vcf_chunks,
- kwargs['pcgr_data_dir'],
- kwargs['vep_dir'],
- output_dir,
- pcgr_output_dir,
- kwargs['threads'],
- kwargs['pcgr_conda'],
- kwargs['pcgrr_conda']
- )
- else:
- pcgr_tsv_fp, pcgr_vcf_fp = pcgr.run_somatic(
- pcgr_prep_fp,
- kwargs['pcgr_data_dir'],
- output_dir,
- pcgr_output_dir,
- kwargs['threads'],
- kwargs['pcgr_conda'],
- kwargs['pcgrr_conda']
- )
else:
pcgr_tsv_fp, pcgr_vcf_fp = pcgr.run_somatic(
- pcgr_prep_fp,
- kwargs['pcgr_data_dir'],
- kwargs['vep_dir'],
- pcgr_output_dir,
- chunk_nbr=None,
- threads=kwargs['threads'],
- pcgr_conda=kwargs['pcgr_conda'],
- pcgrr_conda=kwargs['pcgrr_conda'],
- )
+ pcgr_prep_fp,
+ kwargs['pcgr_data_dir'],
+ kwargs['vep_dir'],
+ pcgr_output_dir,
+ chunk_nbr=None,
+ threads=kwargs['threads'],
+ pcgr_conda=kwargs['pcgr_conda'],
+ pcgrr_conda=kwargs['pcgrr_conda'],
+ )
# Transfer PCGR annotations to full set of variants
pcgr.transfer_annotations_somatic(
diff --git a/pyproject.toml b/pyproject.toml
index e98f470..e7325a3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -17,7 +17,6 @@ dependencies = [
"cyvcf2",
"pysam",
"pyyaml",
- "future"
]
[project.scripts]
From 8db35b30bd3c6134314905d68fab6f66250c3457 Mon Sep 17 00:00:00 2001
From: Quentin Clayssen
Date: Thu, 2 Oct 2025 14:15:26 +1000
Subject: [PATCH 10/22] bump pcgr version
---
docker/Dockerfile.pcgr | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docker/Dockerfile.pcgr b/docker/Dockerfile.pcgr
index ca984e8..8615e50 100644
--- a/docker/Dockerfile.pcgr
+++ b/docker/Dockerfile.pcgr
@@ -39,7 +39,7 @@ RUN \
RUN \
conda clean -afy
-FROM quay.io/bioconda/base-glibc-busybox-bash:2.1.0
+FROM quay.io/bioconda/base-glibc-busybox-bash:2.2.5
# Copy Conda install and all environments
COPY --from=build /opt/conda/ /opt/conda/
From c67b30ad4d5ecfa9e28817a61bc341d8f0c71847 Mon Sep 17 00:00:00 2001
From: Quentin Clayssen
Date: Thu, 2 Oct 2025 14:19:01 +1000
Subject: [PATCH 11/22] bump gpgr version
---
docker/Dockerfile.gpgr | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docker/Dockerfile.gpgr b/docker/Dockerfile.gpgr
index 8b3a157..6b76ecb 100644
--- a/docker/Dockerfile.gpgr
+++ b/docker/Dockerfile.gpgr
@@ -18,7 +18,7 @@ RUN \
RUN \
conda install --prefix /env/ \
- 'r-gpgr ==2.2.1' \
+ 'r-gpgr ==2.2.11' \
'r-sigrap ==0.1.1' \
'bioconductor-bsgenome.hsapiens.ucsc.hg38 ==1.4.5' \
'bioconductor-txdb.hsapiens.ucsc.hg38.knowngene ==3.16.0' \
From f9a7d3b9ce7912b41f6ef97fe42302201bf91c3f Mon Sep 17 00:00:00 2001
From: Quentin Clayssen
Date: Fri, 17 Oct 2025 09:26:19 +1100
Subject: [PATCH 12/22] change print to logging
---
bolt/common/pcgr.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index 72376fc..423a8b5 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -588,8 +588,9 @@ def run_somatic_chunck(vcf_chunks, pcgr_data_dir, vep_dir, output_dir, pcgr_outp
pcgr_tsv_files.append(pcgr_tsv_fp)
if pcgr_vcf_fp:
pcgr_vcf_files.append(pcgr_vcf_fp)
- except Exception as e:
- print(f"Exception occurred: {e}")
+ except Exception:
+ chunk_number = futures[future]
+ logger.exception(f"Exception occurred while processing PCGR chunk {chunk_number}.")
merged_vcf_fp, merged_tsv_fp = merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_files)
return merged_tsv_fp, merged_vcf_fp
From b8ee4a0b1e2b54aaf6b99d2f3e7571448db8c3c9 Mon Sep 17 00:00:00 2001
From: Quentin Clayssen
Date: Fri, 17 Oct 2025 10:02:11 +1100
Subject: [PATCH 13/22] add argument chunck size
---
bolt/common/pcgr.py | 9 +++++++--
bolt/workflows/smlv_somatic/annotate.py | 10 ++++++++--
2 files changed, 15 insertions(+), 4 deletions(-)
diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index 423a8b5..54c87aa 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -524,12 +524,17 @@ def annotate_record(record, annotations, *, allow_missing=False):
return record
-def split_vcf(input_vcf, output_dir):
+def split_vcf(input_vcf, output_dir, *, max_variants=None):
"""
Splits a VCF file into multiple chunks, each containing up to max_variants variants.
Each chunk includes the VCF header.
Ensures no overlapping positions between chunks.
"""
+ if max_variants is None:
+ max_variants = constants.MAX_SOMATIC_VARIANTS
+ elif max_variants <= 0:
+ raise ValueError("max_variants must be a positive integer.")
+
output_dir = pathlib.Path(output_dir / "vcf_chunks")
output_dir.mkdir(parents=True, exist_ok=True)
chunk_files = []
@@ -547,7 +552,7 @@ def split_vcf(input_vcf, output_dir):
for record in vcf_in:
current_position = record.POS
# Check if we need to start a new chunk
- if variant_count >= constants.MAX_SOMATIC_VARIANTS and (last_position is None or current_position != last_position):
+ if variant_count >= max_variants and (last_position is None or current_position != last_position):
# Close the current chunk file and start a new one
vcf_out.close()
chunk_number += 1
diff --git a/bolt/workflows/smlv_somatic/annotate.py b/bolt/workflows/smlv_somatic/annotate.py
index d0efa93..32b85b3 100644
--- a/bolt/workflows/smlv_somatic/annotate.py
+++ b/bolt/workflows/smlv_somatic/annotate.py
@@ -30,6 +30,7 @@
@click.option('--threads', required=False, default=4, type=int)
@click.option('--output_dir', required=True, type=click.Path())
+@click.option('--pcgr_variant_chunk_size', required=False, type=int, help='Override maximum variants per PCGR chunk.')
def entry(ctx, **kwargs):
'''Annotate variants with information from several sources\f
@@ -99,8 +100,13 @@ def entry(ctx, **kwargs):
print(f"Total number of variants in the input VCF: {total_variants}")
# Run PCGR in chunks if exceeding the maximum allowed for somatic variants
- if total_variants > constants.MAX_SOMATIC_VARIANTS:
- vcf_chunks = pcgr.split_vcf(pcgr_prep_fp, output_dir)
+ chunk_size = kwargs.get('pcgr_variant_chunk_size')
+ if chunk_size is not None and chunk_size <= 0:
+ raise click.BadParameter('must be a positive integer', param_hint='--pcgr_variant_chunk_size')
+ chunk_size = chunk_size or constants.MAX_SOMATIC_VARIANTS
+
+ if total_variants > chunk_size:
+ vcf_chunks = pcgr.split_vcf(pcgr_prep_fp, output_dir, max_variants=chunk_size)
pcgr_tsv_fp, pcgr_vcf_fp = pcgr.run_somatic_chunck(
vcf_chunks,
kwargs['pcgr_data_dir'],
From 8223ef9a729119bdbfefe7a16b3b7b4b25439200 Mon Sep 17 00:00:00 2001
From: Quentin Clayssen
Date: Fri, 17 Oct 2025 11:44:28 +1100
Subject: [PATCH 14/22] fix version software pcgr container
---
docker/Dockerfile.pcgr | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docker/Dockerfile.pcgr b/docker/Dockerfile.pcgr
index 8615e50..ca984e8 100644
--- a/docker/Dockerfile.pcgr
+++ b/docker/Dockerfile.pcgr
@@ -39,7 +39,7 @@ RUN \
RUN \
conda clean -afy
-FROM quay.io/bioconda/base-glibc-busybox-bash:2.2.5
+FROM quay.io/bioconda/base-glibc-busybox-bash:2.1.0
# Copy Conda install and all environments
COPY --from=build /opt/conda/ /opt/conda/
From f83c23aec1273df1fa9ddeaab354c279b3375352 Mon Sep 17 00:00:00 2001
From: Quentin Clayssen
Date: Fri, 17 Oct 2025 15:50:20 +1100
Subject: [PATCH 15/22] add logging and a filter function to read values and
 replace placeholders
---
bolt/workflows/smlv_somatic/filter.py | 59 +++++++++++++++++++++++----
1 file changed, 51 insertions(+), 8 deletions(-)
diff --git a/bolt/workflows/smlv_somatic/filter.py b/bolt/workflows/smlv_somatic/filter.py
index 814863c..bb5e8f1 100644
--- a/bolt/workflows/smlv_somatic/filter.py
+++ b/bolt/workflows/smlv_somatic/filter.py
@@ -1,12 +1,14 @@
import click
import pathlib
-
+import logging
import cyvcf2
from ... import util
from ...common import constants
+from ...logging_config import setup_logging
+logger = logging.getLogger(__name__)
@click.command(name='filter')
@@ -129,7 +131,11 @@ def set_filter_data(record, tumor_index):
# PON filter
##
# NOTE(SW): 'max' is inclusive - keeps variants with 0 to n-1 PON hits; preserved from Umccrise
- pon_count = record.INFO.get(constants.VcfInfo.PON_COUNT.value, 0)
+ pon_count = get_record_value(
+ record,
+ constants.VcfInfo.PON_COUNT.value,
+ default=0
+ )
if pon_count >= constants.PON_HIT_THRESHOLD:
filters.append(constants.VcfFilter.PON)
@@ -144,7 +150,14 @@ def set_filter_data(record, tumor_index):
##
# NOTE(SW): rounding is essential here for accurate comparison; cyvcf2 floating-point error
# means INFO/gnomAD_AF=0.01 can be represented as 0.009999999776482582
- gnomad_af = round(record.INFO.get(constants.VcfInfo.GNOMAD_AF.value, 0), 3)
+ gnomad_af = round(
+ get_record_value(
+ record,
+ constants.VcfInfo.GNOMAD_AF.value,
+ default=0.0 ),
+ ),
+ 3,
+ )
if gnomad_af >= constants.MAX_GNOMAD_AF:
filters.append(constants.VcfFilter.GNOMAD_COMMON)
@@ -173,7 +186,7 @@ def set_filter_data(record, tumor_index):
# SAGE hotspot rescue
##
# NOTE(SW): effectively reverts any FILTERs that may have been applied above
- if record.INFO.get(constants.VcfInfo.SAGE_HOTSPOT.value) is not None:
+ if get_record_value(record, constants.VcfInfo.SAGE_HOTSPOT.value) is not None:
info_rescue.append(constants.VcfInfo.SAGE_HOTSPOT_RESCUE)
##
@@ -184,13 +197,24 @@ def set_filter_data(record, tumor_index):
# single CLINICAL_POTENTIAL_RESCUE flag
# Get ClinVar clinical significance entries
- clinvar_clinsig = record.INFO.get(constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION.value, '')
+ clinvar_clinsig = get_record_value(
+ record,
+ constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION.value,
+ default='',
+ )
clinvar_clinsigs = clinvar_clinsig.split(',')
# Hit counts in relevant reference somatic mutation databases
- tcga_pancancer_count = record.INFO.get(constants.VcfInfo.PCGR_TCGA_PANCANCER_COUNT.value, 0)
+ tcga_pancancer_count = get_record_value(
+ record,
+ constants.VcfInfo.PCGR_TCGA_PANCANCER_COUNT.value,
+ default=0
+ )
+ hmf_present = get_record_value(record, constants.VcfInfo.HMF_HOTSPOT.value)
+ pcgr_hotspot_present = get_record_value(record, constants.VcfInfo.PCGR_MUTATION_HOTSPOT.value)
+
if (
- record.INFO.get(constants.VcfInfo.HMF_HOTSPOT.value) is not None or
- record.INFO.get(constants.VcfInfo.PCGR_MUTATION_HOTSPOT.value) is not None or
+ hmf_present is not None or
+ pcgr_hotspot_present is not None or
any(e in clinvar_clinsigs for e in constants.CLINVAR_CLINSIGS_RESCUE) or
tcga_pancancer_count >= constants.MIN_TCGA_PANCANCER_COUNT_RESCUE
):
@@ -227,3 +251,22 @@ def set_filter_data(record, tumor_index):
filters_existing = [e for e in record.FILTERS if e != 'PASS']
assert all(e not in filters_existing for e in filters_value)
record.FILTER = ';'.join([*filters_existing, *filters_value])
+
+
+def get_record_value(record, key, *, default=None):
+ '''
+    Return an INFO value, replacing '.', '' and missing entries with the default value.
+
+ This prevents placeholder values emitted by pcgr from triggering rescues
+ or filters. For example, an INFO placeholder '.' for
+ PCGR_TCGA_PANCANCER_COUNT will resolve to the integer 0 when called with
+ `get_record_value(record, 'PCGR_TCGA_PANCANCER_COUNT', default=0)`.
+ '''
+ value = record.INFO.get(key)
+ if value is None:
+ return default
+    if value == '' or value == '.':
+ return default
+ if not isinstance(value, (int, float, str)):
+ logger.error(f'record ID: {record.ID} INFO value for {key}: {value} ({type(value)}, not int, float, str)')
+ return value
\ No newline at end of file
From 22b5c1291b7d7432a0c65c7ff5c75bdf07cd71ce Mon Sep 17 00:00:00 2001
From: Quentin Clayssen
Date: Fri, 17 Oct 2025 16:02:38 +1100
Subject: [PATCH 16/22] fix indentation
---
bolt/workflows/smlv_somatic/filter.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/bolt/workflows/smlv_somatic/filter.py b/bolt/workflows/smlv_somatic/filter.py
index bb5e8f1..c820385 100644
--- a/bolt/workflows/smlv_somatic/filter.py
+++ b/bolt/workflows/smlv_somatic/filter.py
@@ -154,7 +154,7 @@ def set_filter_data(record, tumor_index):
get_record_value(
record,
constants.VcfInfo.GNOMAD_AF.value,
- default=0.0 ),
+ default=0.0
),
3,
)
@@ -214,7 +214,7 @@ def set_filter_data(record, tumor_index):
if (
hmf_present is not None or
- pcgr_hotspot_present is not None or
+ pcgr_hotspot_present or
any(e in clinvar_clinsigs for e in constants.CLINVAR_CLINSIGS_RESCUE) or
tcga_pancancer_count >= constants.MIN_TCGA_PANCANCER_COUNT_RESCUE
):
From 529309c03d4838f8c5fe71c704dfee298a932933 Mon Sep 17 00:00:00 2001
From: Quentin Clayssen
Date: Fri, 17 Oct 2025 16:36:13 +1100
Subject: [PATCH 17/22] read gz tsv from pcgr
---
bolt/common/pcgr.py | 2 +-
bolt/util.py | 8 +++++---
2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index 54c87aa..a45154a 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -604,7 +604,7 @@ def merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_fp):
pcgr_dir.mkdir(exist_ok=True)
# Merge all TSV files into a single file in the pcgr directory
- merged_tsv_fp = pcgr_dir / "nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv"
+ merged_tsv_fp = pcgr_dir / "nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv.gz"
util.merge_tsv_files(pcgr_tsv_fp, merged_tsv_fp)
# Step 5: Merge all VCF files into a single file in the pcgr directory
diff --git a/bolt/util.py b/bolt/util.py
index 7310b89..8065302 100644
--- a/bolt/util.py
+++ b/bolt/util.py
@@ -1,3 +1,4 @@
+import gzip
import pathlib
import subprocess
import textwrap
@@ -161,11 +162,12 @@ def split_vcf(input_vcf, output_dir):
def merge_tsv_files(tsv_files, merged_tsv_fp):
"""
- Merges all TSV files into a single TSV.
+ Merge gzipped TSV files into a single gzipped TSV.
"""
- with open(merged_tsv_fp, 'w') as merged_tsv:
+
+ with gzip.open(merged_tsv_fp, 'wt', encoding='utf-8') as merged_tsv:
for i, tsv_file in enumerate(tsv_files):
- with open(tsv_file, 'r') as infile:
+ with gzip.open(tsv_file, 'rt', encoding='utf-8') as infile:
for line_number, line in enumerate(infile):
# Skip header except for the first file
if i > 0 and line_number == 0:
From d3dd1820ba346e513e79f4427dade621baca0184 Mon Sep 17 00:00:00 2001
From: Quentin Clayssen
Date: Mon, 20 Oct 2025 14:54:57 +1100
Subject: [PATCH 18/22] change PCGR_CLINVAR_CLASSIFICATION
---
bolt/common/constants.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/bolt/common/constants.py b/bolt/common/constants.py
index 7cebe78..cfc6aa5 100644
--- a/bolt/common/constants.py
+++ b/bolt/common/constants.py
@@ -301,7 +301,7 @@ def namespace(self):
VcfInfo.PCGR_CLINVAR_CLASSIFICATION: {
'Number': '.',
'Type': 'String',
- 'Description': 'ClinVar - clinical significance - per phenotype submission',
+ 'Description': 'ClinVar - Overall clinical significance of variant on a five-tiered scale',
},
VcfInfo.PCGR_COSMIC_COUNT: {
'Number': '1',
From a5fc92311f4607d0d528151ca83394c150f74b68 Mon Sep 17 00:00:00 2001
From: Quentin Clayssen
Date: Mon, 20 Oct 2025 15:00:46 +1100
Subject: [PATCH 19/22] Change pcgr header check
---
bolt/common/pcgr.py | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)
diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index a45154a..9dba668 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -348,6 +348,7 @@ def check_annotation_headers(info_field_map, vcf_fp):
# Ensure header descriptions from source INFO annotations match those defined here for the
# output file; force manual inspection where they do not match
vcf_fh = cyvcf2.VCF(vcf_fp)
+ mismatches = list()
for header_dst, header_src in info_field_map.items():
# Skip header lines that do not have an equivalent entry in the PCGR/CPSR VCF
@@ -361,7 +362,15 @@ def check_annotation_headers(info_field_map, vcf_fp):
# Remove leading and trailing quotes from source
header_src_description_unquoted = header_src_entry['Description'].strip('"')
if header_src_description_unquoted != header_dst_entry['Description']:
- raise AssertionError(f"Mismatch for {header_src}:\nVCF: {header_src_description_unquoted}\nExpected: {header_dst_entry['Description']}")
+ mismatches.append(
+ f"Mismatch for {header_src}:\n"
+ f"VCF: {header_src_description_unquoted}\n"
+ f"Expected: {header_dst_entry['Description']}"
+ )
+
+ if mismatches:
+ mismatch_msg = "\n\n".join(mismatches)
+ raise AssertionError(f"Mismatched INFO annotations:\n{mismatch_msg}")
def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map):
# Gather all annotations from TSV
@@ -608,7 +617,7 @@ def merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_fp):
util.merge_tsv_files(pcgr_tsv_fp, merged_tsv_fp)
# Step 5: Merge all VCF files into a single file in the pcgr directory
- merged_vcf_path = pcgr_dir / "nosampleset.pcgr.grch38.pass.vcf.gz"
+ merged_vcf_path = pcgr_dir / "nosampleset.pcgr.grch38.pass"
merged_vcf = util.merge_vcf_files(pcgr_vcf_files, merged_vcf_path)
return merged_vcf, merged_tsv_fp
From bf930e78188c403f530ed754f1f9b451b70e0691 Mon Sep 17 00:00:00 2001
From: Quentin Clayssen
Date: Fri, 24 Oct 2025 13:56:03 +1100
Subject: [PATCH 20/22] typo
---
CHANGELOG.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 90b7815..25ce706 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,4 +10,4 @@
- [6](https://github.com/umccr/bolt/pull/6) - Change oncoanalyser v2.0.0 uptade, with switch sv caller from GRIPSS to eSVee
--[9](https://github.com/umccr/bolt/pull/9) Add hypermutation sample handling
\ No newline at end of file
+- [9](https://github.com/umccr/bolt/pull/9) Add hypermutation sample handling
\ No newline at end of file
From 8ea1edc8cd5c64afff30640471f515f279b2c90a Mon Sep 17 00:00:00 2001
From: Quentin Clayssen
Date: Fri, 24 Oct 2025 13:57:30 +1100
Subject: [PATCH 21/22] typo
---
bolt/common/pcgr.py | 2 +-
bolt/workflows/other/cancer_report.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index 9dba668..95c0f47 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -649,7 +649,7 @@ def get_variant_filter_data(variant):
# NOTE(SW): GIAB_CONF always overrides DIFFICULT tags
if data['giab_conf'] and data['difficult']:
- data['difficult']= False
+ data['difficult'] = False
for impact in get_impacts(variant.INFO['PCGR_CSQ']):
diff --git a/bolt/workflows/other/cancer_report.py b/bolt/workflows/other/cancer_report.py
index 1902079..9ad3d93 100644
--- a/bolt/workflows/other/cancer_report.py
+++ b/bolt/workflows/other/cancer_report.py
@@ -69,7 +69,7 @@ def entry(ctx, **kwargs):
)
# Set other required argument values
- batch_name = f'{kwargs["subject_name"]}_{kwargs["tumor_name"]}'
+    batch_name = f"{kwargs['subject_name']}_{kwargs['tumor_name']}"
output_table_dir = output_dir / 'cancer_report_tables'
# Optional dragen hrd argument
From 443c5be931e062b00eb56a74e7bb3221c87a4bdc Mon Sep 17 00:00:00 2001
From: Quentin Clayssen
Date: Fri, 24 Oct 2025 13:57:56 +1100
Subject: [PATCH 22/22] add missing variable pcgr_output_dir
---
bolt/workflows/smlv_somatic/report.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/bolt/workflows/smlv_somatic/report.py b/bolt/workflows/smlv_somatic/report.py
index ede90ca..65ce535 100644
--- a/bolt/workflows/smlv_somatic/report.py
+++ b/bolt/workflows/smlv_somatic/report.py
@@ -129,6 +129,7 @@ def entry(ctx, **kwargs):
output_dir,
)
+ pcgr_output_dir = output_dir / 'pcgr'
pcgr.run_somatic(
pcgr_prep_fp,
kwargs['pcgr_data_dir'],
]