From 107a9149650ccb9ad3b31e554ef72da2f1c7f66d Mon Sep 17 00:00:00 2001
From: qclayssen <quentin.clayssen@gmail.com>
Date: Mon, 7 Apr 2025 09:56:34 +1000
Subject: [PATCH 01/25] uptade docker image

---
 docker/Dockerfile.pcgr | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile.pcgr b/docker/Dockerfile.pcgr
index 4ea4932..ca984e8 100644
--- a/docker/Dockerfile.pcgr
+++ b/docker/Dockerfile.pcgr
@@ -15,13 +15,13 @@ RUN \
   conda create \
     --solver libmamba \
     --name pcgr \
-    --file https://raw.githubusercontent.com/sigven/pcgr/v1.4.1/conda/env/lock/pcgr-linux-64.lock
+    --file https://raw.githubusercontent.com/sigven/pcgr/refs/tags/v2.2.1/conda/env/lock/pcgr-linux-64.lock
 
 RUN \
   conda create \
     --solver libmamba \
     --name pcgrr \
-    --file https://raw.githubusercontent.com/sigven/pcgr/v1.4.1/conda/env/lock/pcgrr-linux-64.lock
+    --file https://raw.githubusercontent.com/sigven/pcgr/refs/tags/v2.2.1/conda/env/lock/pcgrr-linux-64.lock
 
 COPY ./conda/env/bolt_env.yml /tmp/
 RUN \

From 0ec6c095c0ba79fdf59109317e98db4259778361 Mon Sep 17 00:00:00 2001
From: qclayssen <quentin.clayssen@gmail.com>
Date: Mon, 14 Apr 2025 17:45:00 +1000
Subject: [PATCH 02/25] chang command pcgr

---
 bolt/common/pcgr.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index fce4c56..aa4751e 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -124,19 +124,20 @@ def run_somatic(input_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=No
     command_args = [
         f'--sample_id {sample_id}',
         f'--input_vcf {input_fp}',
+        f'--vep_dir {pcgr_refdata_dir}',
+        f'--refdata_dir {pcgr_refdata_dir}',
         f'--tumor_dp_tag TUMOR_DP',
         f'--tumor_af_tag TUMOR_AF',
         f'--control_dp_tag NORMAL_DP',
         f'--control_af_tag NORMAL_AF',
-        f'--pcgr_dir {pcgr_refdata_dir}',
         f'--genome_assembly grch38',
         f'--assay WGS',
         f'--estimate_signatures',
-        f'--estimate_msi_status',
+        f'--estimate_msi',
         f'--estimate_tmb',
-        f'--show_noncoding',
+        #f'--show_noncoding',
         f'--vcfanno_n_proc {threads}',
-        f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane',
+        f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane_plus_clinical,mane_select',
     ]
 
     # NOTE(SW): VEP pick order is applied as a successive filter:
@@ -209,7 +210,7 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, output_dir, threads=1, pc
         f'--classify_all',
         f'--pcgr_dir {pcgr_refdata_dir}',
         f'--vcfanno_n_proc {threads}',
-        f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane',
+        f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane_plus_clinical,mane_select',
     ]
 
     if pcgrr_conda:
@@ -249,8 +250,8 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou
         constants.VcfInfo.PCGR_CSQ: 'CSQ',
     }
 
-    pcgr_tsv_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv'
-    pcgr_vcf_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr_acmg.grch38.vcf.gz'
+    pcgr_tsv_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.snvs_indels.tiers.tsv'
+    pcgr_vcf_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.vcf.gz'
 
     # Enforce matching defined and source INFO annotations
     check_annotation_headers(info_field_map, pcgr_vcf_fp)

From c51feda01ddcbfb30feb2cda4825ce8a57ecb100 Mon Sep 17 00:00:00 2001
From: qclayssen <quentin.clayssen@gmail.com>
Date: Mon, 14 Apr 2025 17:45:35 +1000
Subject: [PATCH 03/25] change  pcgr vcf header/info

---
 bolt/common/constants.py | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/bolt/common/constants.py b/bolt/common/constants.py
index e38ed3d..601c0b0 100644
--- a/bolt/common/constants.py
+++ b/bolt/common/constants.py
@@ -236,29 +236,30 @@ def namespace(self):
         ),
     },
     VcfInfo.PCGR_CSQ: {
-        'Number': '.',
-        'Type': 'String',
-        'Description': (
-            'Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|'
-            'Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|'
-            'CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|'
-            'DISTANCE|STRAND|FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|'
-            'MANE_SELECT|MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|'
-            'UNIPROT_ISOFORM|RefSeq|DOMAINS|HGVS_OFFSET|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|'
-            'gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|'
-            'gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|CLIN_SIG|SOMATIC|PHENO|CHECK_REF|'
-            'NearestExonJB'
-        ),
+    'Number': '.',
+    'Type': 'String',
+    'Description': (
+        'Consequence annotations from Ensembl VEP. Format: '
+        'Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|'
+        'HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|'
+        'ALLELE_NUM|DISTANCE|STRAND|FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|'
+        'MANE|MANE_SELECT|MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|'
+        'UNIPROT_ISOFORM|RefSeq|DOMAINS|HGVS_OFFSET|gnomADe_AF|gnomADe_AFR_AF|gnomADe_AMR_AF|'
+        'gnomADe_ASJ_AF|gnomADe_EAS_AF|gnomADe_FIN_AF|gnomADe_MID_AF|gnomADe_NFE_AF|'
+        'gnomADe_REMAINING_AF|gnomADe_SAS_AF|CLIN_SIG|SOMATIC|PHENO|CHECK_REF|MOTIF_NAME|'
+        'MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS|NearestExonJB|'
+        'MaxEntScan_alt|MaxEntScan_diff|MaxEntScan_ref'
+    ),
     },
     VcfInfo.PCGR_MUTATION_HOTSPOT: {
         'Number': '.',
         'Type': 'String',
-        'Description': 'Known cancer mutation hotspot, as found in cancerhotspots.org_v2, Gene|Codon|Q-value',
+        'Description': 'Known cancer mutation hotspot, as found in cancerhotspots.org. Format: GeneSymbol|Entrez_ID|CodonRefAA|Alt_AA|Q-value',
     },
     VcfInfo.PCGR_CLINVAR_CLNSIG: {
         'Number': '.',
         'Type': 'String',
-        'Description': 'ClinVar clinical significance',
+        'Description': 'ClinVar - clinical significance - per phenotype submissione',
     },
     VcfInfo.PCGR_COSMIC_COUNT: {
         'Number': '1',

From feace9f9320516a5e7899ef2d85abc07f45b30e7 Mon Sep 17 00:00:00 2001
From: qclayssen <quentin.clayssen@gmail.com>
Date: Tue, 15 Apr 2025 14:22:20 +1000
Subject: [PATCH 04/25] uptade pcgr tier

---
 bolt/common/constants.py | 16 ++++++++--------
 bolt/common/pcgr.py      | 16 +++++++++-------
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/bolt/common/constants.py b/bolt/common/constants.py
index 601c0b0..1fa73bf 100644
--- a/bolt/common/constants.py
+++ b/bolt/common/constants.py
@@ -35,9 +35,9 @@
     'pathogenic',
     'uncertain_significance',
 }
-PCGR_TIERS_RESCUE = {
-    'TIER_1',
-    'TIER_2',
+PCGR_ACTIONABILITY_TIER_RESCUE = {
+    '1',
+    '2',
 }
 
 
@@ -77,7 +77,7 @@ class VcfInfo(enum.Enum):
     SAGE_NOVEL = 'SAGE_NOVEL'
     SAGE_RESCUE = 'SAGE_RESCUE'
 
-    PCGR_TIER = 'PCGR_TIER'
+    PCGR_ACTIONABILITY_TIER = 'PCGR_ACTIONABILITY_TIER'
     PCGR_CSQ = 'PCGR_CSQ'
     PCGR_MUTATION_HOTSPOT = 'PCGR_MUTATION_HOTSPOT'
     PCGR_CLINVAR_CLNSIG = 'PCGR_CLINVAR_CLNSIG'
@@ -112,7 +112,7 @@ class VcfInfo(enum.Enum):
 
     GNOMAD_AF = 'gnomAD_AF'
 
-    PCGR_TIER_RESCUE = 'PCGR_TIER_RESCUE'
+    PCGR_ACTIONABILITY_TIER_RESCUE = 'PCGR_ACTIONABILITY_TIER_RESCUE'
     SAGE_HOTSPOT_RESCUE = 'SAGE_HOTSPOT_RESCUE'
     CLINICAL_POTENTIAL_RESCUE = 'CLINICAL_POTENTIAL_RESCUE'
 
@@ -226,7 +226,7 @@ def namespace(self):
         'Description': 'Variant rescued by a matching SAGE call',
     },
 
-    VcfInfo.PCGR_TIER: {
+    VcfInfo.PCGR_ACTIONABILITY_TIER: {
         'Number': '1',
         'Type': 'String',
         'Description': (
@@ -259,7 +259,7 @@ def namespace(self):
     VcfInfo.PCGR_CLINVAR_CLNSIG: {
         'Number': '.',
         'Type': 'String',
-        'Description': 'ClinVar - clinical significance - per phenotype submissione',
+        'Description': 'ClinVar - clinical significance - per phenotype submission',
     },
     VcfInfo.PCGR_COSMIC_COUNT: {
         'Number': '1',
@@ -317,7 +317,7 @@ def namespace(self):
         'Type': 'Flag',
         'Description': '',
     },
-    VcfInfo.PCGR_TIER_RESCUE: {
+    VcfInfo.PCGR_ACTIONABILITY_TIER_RESCUE: {
         'Number': '0',
         'Type': 'Flag',
         'Description': '',
diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index aa4751e..9962da4 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -3,6 +3,7 @@
 import re
 import shutil
 import tempfile
+import gzip  # added import
 
 
 import cyvcf2
@@ -250,7 +251,7 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou
         constants.VcfInfo.PCGR_CSQ: 'CSQ',
     }
 
-    pcgr_tsv_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.snvs_indels.tiers.tsv'
+    pcgr_tsv_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.snv_indel_ann.tsv.gz'
     pcgr_vcf_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.vcf.gz'
 
     # Enforce matching defined and source INFO annotations
@@ -262,7 +263,7 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou
     # Open filehandles, set required header entries
     input_fh = cyvcf2.VCF(input_fp)
 
-    util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_TIER)
+    util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_ACTIONABILITY_TIER)
     util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CSQ)
     util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_MUTATION_HOTSPOT)
     util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CLINVAR_CLNSIG)
@@ -296,7 +297,7 @@ def transfer_annotations_germline(input_fp, normal_name, cpsr_dir, output_dir):
         constants.VcfInfo.CPSR_CSQ: 'CSQ',
     }
 
-    cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.snvs_indels.tiers.tsv'
+    cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.snv_indel_ann.tsv.gzv'
     cpsr_vcf_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.vcf.gz'
 
     # Enforce matching defined and source INFO annotations
@@ -337,6 +338,7 @@ def check_annotation_headers(info_field_map, vcf_fp):
         try:
             header_src_entry = vcf_fh.get_header_type(header_src)
         except KeyError:
+            print(f"Missing in VCF: {header_src}")
             continue
 
         header_dst_entry = util.get_vcf_header_entry(header_dst)
@@ -348,14 +350,14 @@ def check_annotation_headers(info_field_map, vcf_fp):
 def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map):
     # Gather all annotations from TSV
     data_tsv = dict()
-    with open(tsv_fp, 'r') as tsv_fh:
+    with gzip.open(tsv_fp, 'rt') as tsv_fh:
         for record in csv.DictReader(tsv_fh, delimiter='\t'):
             key, record_ann = get_annotation_entry_tsv(record, info_field_map)
             assert key not in data_tsv
 
-            # Process PCGR_TIER
-            # TIER_1, TIER_2, TIER_3, TIER_4, NONCODING
-            record_ann[constants.VcfInfo.PCGR_TIER] = record['TIER'].replace(' ', '_')
+            # Process PCGR_ACTIONABILITY_TIER
+            # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING
+            record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER'].replace(' ', '_')
 
             # Count COSMIC hits
             if record['COSMIC_MUTATION_ID'] == 'NA':

From d21d145fd40ead1dea23499333d77e24431fcb40 Mon Sep 17 00:00:00 2001
From: qclayssen <quentin.clayssen@gmail.com>
Date: Tue, 15 Apr 2025 15:27:09 +1000
Subject: [PATCH 05/25] remove in annotation cosmic and ICG un uptade variable
 name

---
 bolt/common/constants.py                |  6 +--
 bolt/common/pcgr.py                     | 57 ++++++++++++-------------
 bolt/workflows/smlv_somatic/annotate.py |  5 +--
 bolt/workflows/smlv_somatic/filter.py   |  2 +-
 4 files changed, 34 insertions(+), 36 deletions(-)

diff --git a/bolt/common/constants.py b/bolt/common/constants.py
index 1fa73bf..8aa8608 100644
--- a/bolt/common/constants.py
+++ b/bolt/common/constants.py
@@ -80,7 +80,7 @@ class VcfInfo(enum.Enum):
     PCGR_ACTIONABILITY_TIER = 'PCGR_ACTIONABILITY_TIER'
     PCGR_CSQ = 'PCGR_CSQ'
     PCGR_MUTATION_HOTSPOT = 'PCGR_MUTATION_HOTSPOT'
-    PCGR_CLINVAR_CLNSIG = 'PCGR_CLINVAR_CLNSIG'
+    PCGR_CLINVAR_CLASSIFICATION = 'PCGR_CLINVAR_CLASSIFICATION'
     PCGR_COSMIC_COUNT = 'PCGR_COSMIC_COUNT'
     PCGR_TCGA_PANCANCER_COUNT = 'PCGR_TCGA_PANCANCER_COUNT'
     PCGR_ICGC_PCAWG_COUNT = 'PCGR_ICGC_PCAWG_COUNT'
@@ -256,10 +256,10 @@ def namespace(self):
         'Type': 'String',
         'Description': 'Known cancer mutation hotspot, as found in cancerhotspots.org. Format: GeneSymbol|Entrez_ID|CodonRefAA|Alt_AA|Q-value',
     },
-    VcfInfo.PCGR_CLINVAR_CLNSIG: {
+    VcfInfo.PCGR_CLINVAR_CLASSIFICATION: {
         'Number': '.',
         'Type': 'String',
-        'Description': 'ClinVar - clinical significance - per phenotype submission',
+        'Description': 'ClinVar - Overall clinical significance of variant on a five-tiered scale',
     },
     VcfInfo.PCGR_COSMIC_COUNT: {
         'Number': '1',
diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index 9962da4..4a9346f 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -246,7 +246,7 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou
     # Set destination INFO field names and source TSV fields
     info_field_map = {
         constants.VcfInfo.PCGR_MUTATION_HOTSPOT: 'MUTATION_HOTSPOT',
-        constants.VcfInfo.PCGR_CLINVAR_CLNSIG: 'CLINVAR_CLNSIG',
+        constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION: 'CLINVAR_CLASSIFICATION',
         constants.VcfInfo.PCGR_TCGA_PANCANCER_COUNT: 'TCGA_PANCANCER_COUNT',
         constants.VcfInfo.PCGR_CSQ: 'CSQ',
     }
@@ -266,10 +266,8 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou
     util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_ACTIONABILITY_TIER)
     util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CSQ)
     util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_MUTATION_HOTSPOT)
-    util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CLINVAR_CLNSIG)
-    util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_COSMIC_COUNT)
+    util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION)
     util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_TCGA_PANCANCER_COUNT)
-    util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_ICGC_PCAWG_COUNT)
 
     output_fp = output_dir / f'{tumor_name}.annotations.vcf.gz'
     output_fh = cyvcf2.Writer(output_fp, input_fh, 'wz')
@@ -344,8 +342,8 @@ def check_annotation_headers(info_field_map, vcf_fp):
         header_dst_entry = util.get_vcf_header_entry(header_dst)
         # Remove leading and trailing quotes from source
         header_src_description_unquoted = header_src_entry['Description'].strip('"')
-        assert  header_src_description_unquoted == header_dst_entry['Description']
-
+        if header_src_description_unquoted != header_dst_entry['Description']:
+            raise AssertionError(f"Mismatch for {header_src}:\nVCF: {header_src_description_unquoted}\nExpected: {header_dst_entry['Description']}")
 
 def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map):
     # Gather all annotations from TSV
@@ -357,25 +355,7 @@ def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map):
 
             # Process PCGR_ACTIONABILITY_TIER
             # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING
-            record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER'].replace(' ', '_')
-
-            # Count COSMIC hits
-            if record['COSMIC_MUTATION_ID'] == 'NA':
-                cosmic_count = 0
-            else:
-                cosmic_count = len(record['COSMIC_MUTATION_ID'].split('&'))
-            record_ann[constants.VcfInfo.PCGR_COSMIC_COUNT] = cosmic_count
-
-            # Count ICGC-PCAWG hits by taking sum of affected donors where the annotation value has
-            # the following format: project_code|tumor_type|affected_donors|tested_donors|frequency
-            icgc_pcawg_count = 0
-            if record['ICGC_PCAWG_OCCURRENCE'] != 'NA':
-                for pcawg_hit_data in record['ICGC_PCAWG_OCCURRENCE'].split(','):
-                    pcawrg_hit_data_fields = pcawg_hit_data.split('|')
-                    affected_donors = int(pcawrg_hit_data_fields[2])
-                    icgc_pcawg_count += affected_donors
-                assert icgc_pcawg_count > 0
-            record_ann[constants.VcfInfo.PCGR_ICGC_PCAWG_COUNT] = icgc_pcawg_count
+            record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER']
 
             # Store annotation data
             data_tsv[key] = record_ann
@@ -413,6 +393,24 @@ def collect_cpsr_annotation_data(tsv_fp, vcf_fp, info_field_map):
     # Compile annotations, prefering TSV source
     return compile_annotation_data(data_tsv, data_vcf)
 
+def parse_genomic_change(genomic_change):
+    """
+    Parse a genomic change string, e.g., "3:g.41224645T>C"
+    Returns a tuple: (chrom, pos, ref, alt)
+    """
+    # Regular expression for the format "chrom:g.posRef>Alt"
+    pattern = r'^(?P<chrom>\w+):g\.(?P<pos>\d+)(?P<ref>\w+)>(?P<alt>\w+)$'
+    match = re.match(pattern, genomic_change)
+    if not match:
+        raise ValueError(f"Format not recognized: {genomic_change}")
+    
+    # Get values and format as needed
+    chrom = f"chr{match.group('chrom')}"
+    pos = int(match.group('pos'))
+    ref = match.group('ref')
+    alt = match.group('alt')
+    return chrom, pos, ref, alt
+
 
 def get_annotations_vcf(vcf_fp, info_field_map):
     data_vcf = dict()
@@ -432,10 +430,11 @@ def get_annotations_vcf(vcf_fp, info_field_map):
 
 
 def get_annotation_entry_tsv(record, info_field_map):
-    # Set lookup key; PCGR/CPSR strips leading 'chr' from contig names
-    chrom = f'chr{record["CHROM"]}'
-    pos = int(record['POS'])
-    key = (chrom, pos, record['REF'], record['ALT'])
+    # If GENOMIC_CHANGE is present, parse it for coordinates; otherwise, use separate fields.
+    if "GENOMIC_CHANGE" in record and record["GENOMIC_CHANGE"]:
+        chrom, pos, ref, alt = parse_genomic_change(record["GENOMIC_CHANGE"])
+
+    key = (chrom, pos, ref, alt)
 
     record_ann = dict()
     for info_dst, info_src in info_field_map.items():
diff --git a/bolt/workflows/smlv_somatic/annotate.py b/bolt/workflows/smlv_somatic/annotate.py
index f163d5f..687f17f 100644
--- a/bolt/workflows/smlv_somatic/annotate.py
+++ b/bolt/workflows/smlv_somatic/annotate.py
@@ -43,6 +43,7 @@ def entry(ctx, **kwargs):
     # Create output directory
     output_dir = pathlib.Path(kwargs['output_dir'])
     output_dir.mkdir(mode=0o755, parents=True, exist_ok=True)
+    pcgr_dir = output_dir / 'pcgr/'
 
     # Set all FILTER="." to FILTER="PASS" as required by PURPLE
     filter_pass_fp = set_filter_pass(kwargs['vcf_fp'], kwargs['tumor_name'], output_dir)
@@ -80,10 +81,8 @@ def entry(ctx, **kwargs):
     #       - PCGR ACMG TIER [INFO/PCGR_TIER]
     #       - VEP consequence [INFO/PCR_CSQ]
     #       - Known mutation hotspot [INFO/PCGR_MUTATION_HOTSPOT]
-    #       - ClinVar clinical significant [INFO/PCGR_CLINVAR_CLNSIG]
-    #       - Hits in COSMIC [INFO/PCGR_COSMIC_COUNT]
+    #       - ClinVar clinical significant [INFO/PCGR_CLNSIG]
     #       - Hits in TCGA [INFO/PCGR_TCGA_PANCANCER_COUNT]
-    #       - Hits in PCAWG [INFO/PCGR_ICGC_PCAWG_COUNT]
     # Set selected data or full input
     selection_data = select_variants(
         pon_fp,
diff --git a/bolt/workflows/smlv_somatic/filter.py b/bolt/workflows/smlv_somatic/filter.py
index e46f285..3228419 100644
--- a/bolt/workflows/smlv_somatic/filter.py
+++ b/bolt/workflows/smlv_somatic/filter.py
@@ -181,7 +181,7 @@ def set_filter_data(record, tumor_index):
     # single CLINICAL_POTENTIAL_RESCUE flag
 
     # Get ClinVar clinical significance entries
-    clinvar_clinsig = record.INFO.get(constants.VcfInfo.PCGR_CLINVAR_CLNSIG.value, '')
+    clinvar_clinsig = record.INFO.get(constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION.value, '')
     clinvar_clinsigs = clinvar_clinsig.split(',')
     # Hit counts in relevant reference somatic mutation databases
     cosmic_count = record.INFO.get(constants.VcfInfo.PCGR_COSMIC_COUNT.value, 0)

From 55f0f00e162ccdf96f982ab5fc8150bf218fb2f7 Mon Sep 17 00:00:00 2001
From: qclayssen <quentin.clayssen@gmail.com>
Date: Tue, 15 Apr 2025 16:59:53 +1000
Subject: [PATCH 06/25] change filter remove COSMIC_COUN and ICGC_PCAWG and
 change name pcgr tier use

---
 bolt/workflows/smlv_somatic/annotate.py |  1 -
 bolt/workflows/smlv_somatic/filter.py   | 14 +++++---------
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/bolt/workflows/smlv_somatic/annotate.py b/bolt/workflows/smlv_somatic/annotate.py
index 687f17f..385fcc7 100644
--- a/bolt/workflows/smlv_somatic/annotate.py
+++ b/bolt/workflows/smlv_somatic/annotate.py
@@ -43,7 +43,6 @@ def entry(ctx, **kwargs):
     # Create output directory
     output_dir = pathlib.Path(kwargs['output_dir'])
     output_dir.mkdir(mode=0o755, parents=True, exist_ok=True)
-    pcgr_dir = output_dir / 'pcgr/'
 
     # Set all FILTER="." to FILTER="PASS" as required by PURPLE
     filter_pass_fp = set_filter_pass(kwargs['vcf_fp'], kwargs['tumor_name'], output_dir)
diff --git a/bolt/workflows/smlv_somatic/filter.py b/bolt/workflows/smlv_somatic/filter.py
index 3228419..ea8b3c0 100644
--- a/bolt/workflows/smlv_somatic/filter.py
+++ b/bolt/workflows/smlv_somatic/filter.py
@@ -37,7 +37,7 @@ def entry(ctx, **kwargs):
         constants.VcfFilter.ENCODE,
         constants.VcfFilter.GNOMAD_COMMON,
         constants.VcfInfo.SAGE_HOTSPOT_RESCUE,
-        constants.VcfInfo.PCGR_TIER_RESCUE,
+        constants.VcfInfo.PCGR_ACTIONABILITY_TIER_RESCUE,
         constants.VcfInfo.CLINICAL_POTENTIAL_RESCUE,
         constants.VcfInfo.RESCUED_FILTERS_EXISTING,
         constants.VcfInfo.RESCUED_FILTERS_PENDING,
@@ -162,9 +162,9 @@ def set_filter_data(record, tumor_index):
     ##
     # PCGR tier rescue
     ##
-    pcgr_tier = record.INFO.get(constants.VcfInfo.PCGR_TIER.value)
-    if pcgr_tier in constants.PCGR_TIERS_RESCUE:
-        info_rescue.append(constants.VcfInfo.PCGR_TIER_RESCUE)
+    pcgr_tier = record.INFO.get(constants.VcfInfo.PCGR_ACTIONABILITY_TIER.value)
+    if pcgr_tier in constants.PCGR_ACTIONABILITY_TIER_RESCUE:
+        info_rescue.append(constants.VcfInfo.PCGR_ACTIONABILITY_TIER_RESCUE)
 
     ##
     # SAGE hotspot rescue
@@ -184,16 +184,12 @@ def set_filter_data(record, tumor_index):
     clinvar_clinsig = record.INFO.get(constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION.value, '')
     clinvar_clinsigs = clinvar_clinsig.split(',')
     # Hit counts in relevant reference somatic mutation databases
-    cosmic_count = record.INFO.get(constants.VcfInfo.PCGR_COSMIC_COUNT.value, 0)
     tcga_pancancer_count = record.INFO.get(constants.VcfInfo.PCGR_TCGA_PANCANCER_COUNT.value, 0)
-    icgc_pcawg_count = record.INFO.get(constants.VcfInfo.PCGR_ICGC_PCAWG_COUNT.value, 0)
     if (
         record.INFO.get(constants.VcfInfo.HMF_HOTSPOT.value) is not None or
         record.INFO.get(constants.VcfInfo.PCGR_MUTATION_HOTSPOT.value) is not None or
         any(e in clinvar_clinsigs for e in constants.CLINVAR_CLINSIGS_RESCUE) or
-        cosmic_count >= constants.MIN_COSMIC_COUNT_RESCUE or
-        tcga_pancancer_count >= constants.MIN_TCGA_PANCANCER_COUNT_RESCUE or
-        icgc_pcawg_count >= constants.MIN_ICGC_PCAWG_COUNT_RESCUE
+        tcga_pancancer_count >= constants.MIN_TCGA_PANCANCER_COUNT_RESCUE
     ):
         info_rescue.append(constants.VcfInfo.CLINICAL_POTENTIAL_RESCUE)
 

From f381589477561d84dd7fba9554daa99a058b29af Mon Sep 17 00:00:00 2001
From: qclayssen <quentin.clayssen@gmail.com>
Date: Tue, 15 Apr 2025 17:00:22 +1000
Subject: [PATCH 07/25] change test

---
 tests/test_smlv_somatic_filter.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/test_smlv_somatic_filter.py b/tests/test_smlv_somatic_filter.py
index 41a0e96..caebea8 100644
--- a/tests/test_smlv_somatic_filter.py
+++ b/tests/test_smlv_somatic_filter.py
@@ -160,15 +160,15 @@ def test_common_population_filter(self):
 
     def test_pcgr_tier_rescue(self):
         pcgr_tiers = [
-            'TIER_1',
-            'TIER_2',
+            '1',
+            '2',
         ]
-        rescue_tag_str = bolt_constants.VcfInfo.PCGR_TIER_RESCUE.value
+        rescue_tag_str = bolt_constants.VcfInfo.PCGR_ACTIONABILITY_TIER_RESCUE.value
 
         for pcgr_tier in pcgr_tiers:
             record = get_record(
                 **self.records['filter_min_af9.9'],
-                info_data={'PCGR_TIER': pcgr_tier},
+                info_data={'PCGR_ACTIONABILITY_TIER': pcgr_tier},
             )
             smlv_somatic_filter.set_filter_data(record, 0)
             assert not record.FILTER
@@ -217,7 +217,7 @@ def test_clinical_potential_rescue_clinvar_clinsig(self):
         for clinsig in clinsigs:
             record = get_record(
                 **self.records['filter_min_af9.9'],
-                info_data={'PCGR_CLINVAR_CLNSIG': clinsig},
+                info_data={'PCGR_CLINVAR_CLASSIFICATION': clinsig},
             )
             smlv_somatic_filter.set_filter_data(record, 0)
             assert not record.FILTER

From 642d6917bb4d0b95b8514e3090f6222eb66298d9 Mon Sep 17 00:00:00 2001
From: qclayssen <quentin.clayssen@gmail.com>
Date: Fri, 27 Jun 2025 16:05:39 +1000
Subject: [PATCH 08/25] add input vep dir for pcgr 2.2.1

---
 bolt/common/pcgr.py                     | 203 +++++++++++++++++++++++-
 bolt/workflows/smlv_germline/report.py  |   2 +
 bolt/workflows/smlv_somatic/annotate.py |  14 ++
 bolt/workflows/smlv_somatic/report.py   |   4 +-
 4 files changed, 218 insertions(+), 5 deletions(-)

diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index 4a9346f..7ad39be 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -111,7 +111,7 @@ def get_minimal_header(input_fh):
     return '\n'.join([filetype_line, *chrom_lines, *format_lines, column_line])
 
 
-def run_somatic(input_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=None, pcgrr_conda=None, purity=None, ploidy=None, sample_id=None):
+def run_somatic(input_fp, pcgr_refdata_dir, vep_dir, output_dir, chunk_nbr=None, threads=1, pcgr_conda=None, pcgrr_conda=None, purity=None, ploidy=None, sample_id=None):
 
     # NOTE(SW): Nextflow FusionFS v2.2.8 does not support PCGR output to S3; instead write to a
     # temporary directory outside of the FusionFS mounted directory then manually copy across
@@ -125,7 +125,7 @@ def run_somatic(input_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=No
     command_args = [
         f'--sample_id {sample_id}',
         f'--input_vcf {input_fp}',
-        f'--vep_dir {pcgr_refdata_dir}',
+        f'--vep_dir {vep_dir}',
         f'--refdata_dir {pcgr_refdata_dir}',
         f'--tumor_dp_tag TUMOR_DP',
         f'--tumor_af_tag TUMOR_AF',
@@ -188,7 +188,7 @@ def run_somatic(input_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=No
     return pcgr_output_dir
 
 
-def run_germline(input_fp, panel_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=None, pcgrr_conda=None, sample_id=None):
+def run_germline(input_fp, panel_fp, pcgr_refdata_dir, vep_dir, output_dir, threads=1, pcgr_conda=None, pcgrr_conda=None, sample_id=None):
 
     if not sample_id:
         sample_id = 'nosampleset'
@@ -199,17 +199,22 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, output_dir, threads=1, pc
     temp_dir = tempfile.TemporaryDirectory()
     cpsr_output_dir = output_dir / 'cpsr/'
 
+    if cpsr_output_dir.exists():
+        logger.warning(f"Output directory '{cpsr_output_dir}' already exists and will be overwritten")
+        shutil.rmtree(cpsr_output_dir)
+
     command_args = [
         f'--sample_id {sample_id}',
         f'--input_vcf {input_fp}',
         f'--genome_assembly grch38',
         f'--custom_list {panel_fp}',
+        f'--vep_dir {vep_dir}',
+        f'--refdata_dir {pcgr_refdata_dir}',
         # NOTE(SW): probably useful to add versioning information here; weigh against maintainence
         # burden
         f'--custom_list_name umccr_germline_panel',
         f'--pop_gnomad global',
         f'--classify_all',
-        f'--pcgr_dir {pcgr_refdata_dir}',
         f'--vcfanno_n_proc {threads}',
         f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane_plus_clinical,mane_select',
     ]
@@ -483,3 +488,193 @@ def annotate_record(record, annotations, *, allow_missing=False):
         record.INFO[info_enum.value] = v
 
     return record
+
+def split_vcf(input_vcf, output_dir):
+    """
+    Splits a VCF file into multiple chunks, each containing up to max_variants variants.
+    Each chunk includes the VCF header.
+    Ensures no overlapping positions between chunks.
+    """
+    output_dir = pathlib.Path(output_dir / "vcf_chunks")
+    output_dir.mkdir(parents=True, exist_ok=True)
+    chunk_files = []
+    chunk_number = 1
+    variant_count = 0
+    base_filename = pathlib.Path(input_vcf).stem
+    chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
+    base_filename = input_vcf.stem
+    chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
+    chunk_files.append(chunk_filename)
+    # Open the input VCF using cyvcf2
+    vcf_in = cyvcf2.VCF(input_vcf)
+    # Create a new VCF file for the first chunk
+    vcf_out = cyvcf2.Writer(str(chunk_filename), vcf_in)
+    last_position = None
+    for record in vcf_in:
+        current_position = record.POS
+        # Check if we need to start a new chunk
+        if variant_count >= constants.MAX_SOMATIC_VARIANTS and (last_position is None or current_position != last_position):
+            # Close the current chunk file and start a new one
+            vcf_out.close()
+            chunk_number += 1
+            chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
+            chunk_files.append(chunk_filename)
+            vcf_out = cyvcf2.Writer(str(chunk_filename), vcf_in)
+            variant_count = 0
+        # Write the record to the current chunk
+        vcf_out.write_record(record)
+        variant_count += 1
+        last_position = current_position
+    # Close the last chunk file
+    vcf_out.close()
+    vcf_in.close()
+    logger.info(f"VCF file split into {len(chunk_files)} chunks.")
+    return chunk_files
+
+def run_somatic_chunck(vcf_chunks, pcgr_data_dir, vep_dir, output_dir, pcgr_output_dir, max_threads, pcgr_conda, pcgrr_conda):
+    pcgr_tsv_files = []
+    pcgr_vcf_files = []
+    num_chunks = len(vcf_chunks)
+    # Ensure we don't use more workers than available threads, and each worker has at least 2 threads
+    max_workers = min(num_chunks, max_threads // 2)
+    threads_quot, threads_rem = divmod(max_threads, num_chunks)
+    threads_per_chunk = max(2, threads_quot)
+    # Limit the number of workers to the smaller of num_chunks or max_threads // 2
+    with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
+        futures = {}
+        for chunk_number, vcf_file in enumerate(vcf_chunks, start=1):
+            # Assign extra thread to the first 'threads_rem' chunks
+            additional_thread = 1 if chunk_number <= threads_rem else 0
+            total_threads = threads_per_chunk + additional_thread
+            futures[executor.submit(run_somatic, vcf_file, pcgr_data_dir, vep_dir, pcgr_output_dir, chunk_number, total_threads, pcgr_conda, pcgrr_conda)] = chunk_number
+        for future in concurrent.futures.as_completed(futures):
+            try:
+                pcgr_tsv_fp, pcgr_vcf_fp = future.result()
+                if pcgr_tsv_fp:
+                    pcgr_tsv_files.append(pcgr_tsv_fp)
+                if pcgr_vcf_fp:
+                    pcgr_vcf_files.append(pcgr_vcf_fp)
+            except Exception as e:
+                print(f"Exception occurred: {e}")
+    merged_vcf_fp, merged_tsv_fp = merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_files)
+    return merged_tsv_fp, merged_vcf_fp
+
+def merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_fp):
+    pcgr_dir = pathlib.Path(output_dir) / 'pcgr'
+    pcgr_dir.mkdir(exist_ok=True)
+
+    # Merge all TSV files into a single file in the pcgr directory
+    merged_tsv_fp = pcgr_dir / "nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv"
+    util.merge_tsv_files(pcgr_tsv_fp, merged_tsv_fp)
+
+    # Step 5: Merge all VCF files into a single file in the pcgr directory
+    merged_vcf_path = pcgr_dir / "nosampleset.pcgr_acmg.grch38"
+    merged_vcf = util.merge_vcf_files(pcgr_vcf_files, merged_vcf_path)
+
+    return merged_vcf, merged_tsv_fp
+
+
+def get_variant_filter_data(variant):
+    attribute_names = (
+        'tier',
+        'difficult',
+        'giab_conf',
+        'intergenic',
+        'intronic',
+        'downstream',
+        'upstream',
+        'impacts_other',
+    )
+
+    data = {e: None for e in attribute_names}
+
+
+    data['tier'] = variant.INFO['PCGR_TIER']
+
+
+    info_keys = [k for k, v in variant.INFO]
+
+    data['difficult'] = any(e.startswith('DIFFICULT') for e in info_keys)
+    data['giab_conf'] = 'GIAB_CONF' in info_keys
+
+
+    # NOTE(SW): GIAB_CONF always overrides DIFFICULT tags
+    if data['giab_conf'] and data['difficult']:
+        data['difficult']= False
+
+
+    for impact in get_impacts(variant.INFO['PCGR_CSQ']):
+        if impact == 'intergenic_variant':
+            data['intergenic'] = True
+        elif impact == 'intron_variant':
+            data['intronic'] = True
+        elif impact == 'downstream_gene_variant':
+            data['downstream'] = True
+        elif impact == 'upstream_gene_variant':
+            data['upstream'] = True
+        elif impact:
+            data['impacts_other'] = True
+        else:
+            assert False
+
+    return data
+
+
+def get_impacts(csq_str_full):
+    impacts = set()
+    for csq_str in csq_str_full.split(','):
+        csq_tokens = csq_str.split('|')
+        impact_str = csq_tokens[1]
+        impacts.update(impact_str.split('&'))
+    return impacts
+
+
+def determine_filter(data):
+
+    for impact, region in get_ordering(tiers=False):
+
+        # NOTE(SW): this is less efficient than nested loops since the outer block is reevaluated
+        # within what would be the inner loop each cycle; taking this route for cleaner code
+
+        impacts_higher = get_impacts_higher(impact)
+        impact_filter = bool(data[impact]) and not any(bool(data[e]) for e in impacts_higher)
+
+        region_filter = False
+        if region == 'none':
+            region_filter = not (data['difficult'] or data['giab_conf'])
+        else:
+            region_filter = data[region]
+
+        if impact_filter and region_filter:
+            return (impact, region)
+
+    return False
+
+
+def get_variant_repr(variant):
+    return (variant.CHROM, variant.POS, variant.REF, tuple(variant.ALT))
+
+
+@functools.cache
+def get_ordering(tiers=True, impacts=True, regions=True):
+    categories = [
+        constants.PCGR_TIERS_FILTERING if tiers else None,
+        constants.VEP_IMPACTS_FILTER if impacts else None,
+        constants.GENOMIC_REGIONS_FILTERING if regions else None,
+    ]
+
+    # NOTE(SW): I'm not aware of any noncoding impacts for TIER_[1-4] other than TERT but keeping
+    # in to be overly cautious
+    ordering_iter = itertools.product(*(c for c in categories if c))
+
+    return tuple(ordering_iter)
+
+
+@functools.cache
+def get_impacts_higher(impact):
+    impact_index = constants.VEP_IMPACTS_FILTER.index(impact)
+    if impact_index + 1 < len(constants.VEP_IMPACTS_FILTER):
+        impacts_higher = constants.VEP_IMPACTS_FILTER[impact_index+1:len(constants.VEP_IMPACTS_FILTER)]
+    else:
+        impacts_higher = list()
+    return impacts_higher
diff --git a/bolt/workflows/smlv_germline/report.py b/bolt/workflows/smlv_germline/report.py
index 0742c19..8248a19 100644
--- a/bolt/workflows/smlv_germline/report.py
+++ b/bolt/workflows/smlv_germline/report.py
@@ -22,6 +22,7 @@
 
 @click.option('--germline_panel_list_fp', required=True, type=click.Path(exists=True))
 @click.option('--pcgr_data_dir', required=True, type=click.Path(exists=True))
+@click.option('--vep_dir', required=True, type=click.Path(exists=True))
 
 @click.option('--threads', required=True, type=int, default=1)
 
@@ -69,6 +70,7 @@ def entry(ctx, **kwargs):
         cpsr_prep_fp,
         kwargs['germline_panel_list_fp'],
         kwargs['pcgr_data_dir'],
+        kwargs['vep_dir'],
         output_dir,
         threads=kwargs['threads'],
         pcgr_conda=kwargs['pcgr_conda'],
diff --git a/bolt/workflows/smlv_somatic/annotate.py b/bolt/workflows/smlv_somatic/annotate.py
index 385fcc7..543755d 100644
--- a/bolt/workflows/smlv_somatic/annotate.py
+++ b/bolt/workflows/smlv_somatic/annotate.py
@@ -23,6 +23,7 @@
 @click.option('--pon_dir', required=True, type=click.Path(exists=True))
 
 @click.option('--pcgr_data_dir', required=True, type=click.Path(exists=True))
+@click.option('--vep_dir', required=True, type=click.Path(exists=True))
 
 @click.option('--pcgr_conda', required=False, type=str)
 @click.option('--pcgrr_conda', required=False, type=str)
@@ -105,7 +106,20 @@ def entry(ctx, **kwargs):
     pcgr_dir = pcgr.run_somatic(
         pcgr_prep_fp,
         kwargs['pcgr_data_dir'],
+        kwargs['vep_dir'],
         output_dir,
+        pcgr_output_dir,
+        kwargs['threads'],
+        kwargs['pcgr_conda'],
+        kwargs['pcgrr_conda']
+    )
+    else:
+        pcgr_tsv_fp, pcgr_vcf_fp = pcgr.run_somatic(
+        pcgr_prep_fp,
+        kwargs['pcgr_data_dir'],
+        kwargs['vep_dir'],
+        pcgr_output_dir,
+        chunk_nbr=None,
         threads=kwargs['threads'],
         pcgr_conda=kwargs['pcgr_conda'],
         pcgrr_conda=kwargs['pcgrr_conda'],
diff --git a/bolt/workflows/smlv_somatic/report.py b/bolt/workflows/smlv_somatic/report.py
index bdf908f..bf17ffb 100644
--- a/bolt/workflows/smlv_somatic/report.py
+++ b/bolt/workflows/smlv_somatic/report.py
@@ -27,6 +27,7 @@
 @click.option('--pcgrr_conda', required=False, type=str)
 
 @click.option('--pcgr_data_dir', required=False, type=str)
+@click.option('--vep_dir', required=True, type=click.Path(exists=True))
 @click.option('--purple_purity_fp', required=True, type=click.Path(exists=True))
 
 @click.option('--cancer_genes_fp', required=True, type=click.Path(exists=True))
@@ -117,7 +118,8 @@ def entry(ctx, **kwargs):
     pcgr.run_somatic(
         pcgr_prep_fp,
         kwargs['pcgr_data_dir'],
-        output_dir,
+        kwargs['vep_dir'],
+        pcgr_output_dir,
         threads=kwargs['threads'],
         pcgr_conda=kwargs['pcgr_conda'],
         pcgrr_conda=kwargs['pcgrr_conda'],

From a7a36ea83dedc41e5ccf966d0b727b1c740d8b19 Mon Sep 17 00:00:00 2001
From: qclayssen <quentin.clayssen@gmail.com>
Date: Tue, 1 Jul 2025 13:14:05 +1000
Subject: [PATCH 09/25] update CPSR-related constants and file naming
 conventions for pcgr v2.1

---
 bolt/common/constants.py | 31 ++++++++-----------------------
 bolt/common/pcgr.py      | 11 ++++-------
 2 files changed, 12 insertions(+), 30 deletions(-)

diff --git a/bolt/common/constants.py b/bolt/common/constants.py
index 8aa8608..1f84f51 100644
--- a/bolt/common/constants.py
+++ b/bolt/common/constants.py
@@ -85,8 +85,6 @@ class VcfInfo(enum.Enum):
     PCGR_TCGA_PANCANCER_COUNT = 'PCGR_TCGA_PANCANCER_COUNT'
     PCGR_ICGC_PCAWG_COUNT = 'PCGR_ICGC_PCAWG_COUNT'
 
-    CPSR_FINAL_CLASSIFICATION = 'CPSR_FINAL_CLASSIFICATION'
-    CPSR_PATHOGENICITY_SCORE = 'CPSR_PATHOGENICITY_SCORE'
     CPSR_CLINVAR_CLASSIFICATION = 'CPSR_CLINVAR_CLASSIFICATION'
     CPSR_CSQ = 'CPSR_CSQ'
 
@@ -277,23 +275,10 @@ def namespace(self):
         'Description': 'Count of ICGC PCAWG hits',
     },
 
-    VcfInfo.CPSR_FINAL_CLASSIFICATION: {
-        'Number': '1',
-        'Type': 'String',
-        'Description': (
-            'Final variant classification based on the combination of CLINVAR_CLASSIFICTION (for '
-            'ClinVar-classified variants), and CPSR_CLASSIFICATION (for novel variants)'
-        ),
-    },
-    VcfInfo.CPSR_PATHOGENICITY_SCORE: {
-        'Number': '1',
-        'Type': 'Float',
-        'Description': 'Aggregated CPSR pathogenicity score',
-    },
     VcfInfo.CPSR_CLINVAR_CLASSIFICATION: {
         'Number': '1',
         'Type': 'String',
-        'Description': 'Clinical significance of variant on a five-tiered scale',
+        'Description': 'ClinVar - Overall clinical significance of variant on a five-tiered scale',
     },
     VcfInfo.CPSR_CSQ: {
         'Number': '.',
@@ -302,13 +287,13 @@ def namespace(self):
             'Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|'
             'Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|'
             'Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|'
-            'FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|MANE_SELECT|'
-            'MANE_PLUS_CLINICAL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|UNIPROT_ISOFORM|RefSeq|'
-            'DOMAINS|HGVS_OFFSET|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|gnomAD_AF|gnomAD_AFR_AF|'
-            'gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|'
-            'gnomAD_SAS_AF|CLIN_SIG|SOMATIC|PHENO|CHECK_REF|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|'
-            'MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS|NearestExonJB|LoF|LoF_filter|LoF_flags|'
-            'LoF_info'
+            'FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|MANE|MANE_SELECT|'
+            'MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|UNIPROT_ISOFORM|RefSeq|'
+            'DOMAINS|HGVS_OFFSET|gnomADe_AF|gnomADe_AFR_AF|gnomADe_AMR_AF|gnomADe_ASJ_AF|'
+            'gnomADe_EAS_AF|gnomADe_FIN_AF|gnomADe_MID_AF|gnomADe_NFE_AF|gnomADe_REMAINING_AF|'
+            'gnomADe_SAS_AF|CLIN_SIG|SOMATIC|PHENO|CHECK_REF|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|'
+            'MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS|NearestExonJB|MaxEntScan_alt|MaxEntScan_diff|'
+            'MaxEntScan_ref'
         ),
     },
 
diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index 7ad39be..b017aa4 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -293,14 +293,13 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou
 
 def transfer_annotations_germline(input_fp, normal_name, cpsr_dir, output_dir):
     # Set destination INFO field names and source TSV fields
+    # Note: Only include fields that exist in CPSR v2.2.1 output
     info_field_map = {
-        constants.VcfInfo.CPSR_FINAL_CLASSIFICATION: 'FINAL_CLASSIFICATION',
-        constants.VcfInfo.CPSR_PATHOGENICITY_SCORE: 'CPSR_PATHOGENICITY_SCORE',
         constants.VcfInfo.CPSR_CLINVAR_CLASSIFICATION: 'CLINVAR_CLASSIFICATION',
         constants.VcfInfo.CPSR_CSQ: 'CSQ',
     }
 
-    cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.snv_indel_ann.tsv.gzv'
+    cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.classification.tsv.gz'
     cpsr_vcf_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.vcf.gz'
 
     # Enforce matching defined and source INFO annotations
@@ -312,8 +311,6 @@ def transfer_annotations_germline(input_fp, normal_name, cpsr_dir, output_dir):
     # Open filehandles, set required header entries
     input_fh = cyvcf2.VCF(input_fp)
 
-    util.add_vcf_header_entry(input_fh, constants.VcfInfo.CPSR_FINAL_CLASSIFICATION)
-    util.add_vcf_header_entry(input_fh, constants.VcfInfo.CPSR_PATHOGENICITY_SCORE)
     util.add_vcf_header_entry(input_fh, constants.VcfInfo.CPSR_CLINVAR_CLASSIFICATION)
     util.add_vcf_header_entry(input_fh, constants.VcfInfo.CPSR_CSQ)
 
@@ -353,7 +350,7 @@ def check_annotation_headers(info_field_map, vcf_fp):
 def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map):
     # Gather all annotations from TSV
     data_tsv = dict()
-    with gzip.open(tsv_fp, 'rt') as tsv_fh:
+    with open(tsv_fp, 'r') as tsv_fh:
         for record in csv.DictReader(tsv_fh, delimiter='\t'):
             key, record_ann = get_annotation_entry_tsv(record, info_field_map)
             assert key not in data_tsv
@@ -376,7 +373,7 @@ def collect_cpsr_annotation_data(tsv_fp, vcf_fp, info_field_map):
     # Gather annotations from TSV
     data_tsv = dict()
     gdot_re = re.compile('^(?P<chrom>[\dXYM]+):g\.(?P<pos>\d+)(?P<ref>[A-Z]+)>(?P<alt>[A-Z]+)$')
-    with open(tsv_fp, 'r') as tsv_fh:
+    with gzip.open(tsv_fp, 'rt') as tsv_fh:
         for record in csv.DictReader(tsv_fh, delimiter='\t'):
             # Decompose CPSR 'GENOMIC_CHANGE' field into CHROM, POS, REF, and ALT
             re_result = gdot_re.match(record['GENOMIC_CHANGE'])

From 16118696d5961af6c831111ddbe1541a758f3e72 Mon Sep 17 00:00:00 2001
From: qclayssen <quentin.clayssen@gmail.com>
Date: Tue, 1 Jul 2025 17:59:02 +1000
Subject: [PATCH 10/25] remove unused PCGR_ACTIONABILITY_TIER header entry and
 update TSV file handling

---
 bolt/common/pcgr.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index b017aa4..f6fc600 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -268,7 +268,6 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou
     # Open filehandles, set required header entries
     input_fh = cyvcf2.VCF(input_fp)
 
-    util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_ACTIONABILITY_TIER)
     util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CSQ)
     util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_MUTATION_HOTSPOT)
     util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION)
@@ -350,15 +349,11 @@ def check_annotation_headers(info_field_map, vcf_fp):
 def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map):
     # Gather all annotations from TSV
     data_tsv = dict()
-    with open(tsv_fp, 'r') as tsv_fh:
+    with gzip.open(tsv_fp, 'rt') as tsv_fh:
         for record in csv.DictReader(tsv_fh, delimiter='\t'):
             key, record_ann = get_annotation_entry_tsv(record, info_field_map)
             assert key not in data_tsv
 
-            # Process PCGR_ACTIONABILITY_TIER
-            # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING
-            record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER']
-
             # Store annotation data
             data_tsv[key] = record_ann
 
@@ -436,6 +431,9 @@ def get_annotation_entry_tsv(record, info_field_map):
     if "GENOMIC_CHANGE" in record and record["GENOMIC_CHANGE"]:
         chrom, pos, ref, alt = parse_genomic_change(record["GENOMIC_CHANGE"])
 
+    if not chrom.startswith('chr'):
+        chrom = f'chr{chrom}'
+
     key = (chrom, pos, ref, alt)
 
     record_ann = dict()

From 64c3486cd14390316c9e565b060efe8a8da969d3 Mon Sep 17 00:00:00 2001
From: qclayssen <quentin.clayssen@gmail.com>
Date: Thu, 3 Jul 2025 10:46:01 +1000
Subject: [PATCH 11/25] add back  PCGR_ACTIONABILITY_TIER

---
 bolt/common/pcgr.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index f6fc600..7297ad8 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -268,6 +268,7 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou
     # Open filehandles, set required header entries
     input_fh = cyvcf2.VCF(input_fp)
 
+    util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_ACTIONABILITY_TIER)
     util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CSQ)
     util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_MUTATION_HOTSPOT)
     util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION)
@@ -354,6 +355,11 @@ def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map):
             key, record_ann = get_annotation_entry_tsv(record, info_field_map)
             assert key not in data_tsv
 
+
+            # Process PCGR_ACTIONABILITY_TIER
+            # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING
+            record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER']
+
             # Store annotation data
             data_tsv[key] = record_ann
 

From ecede4b97ac442cc7336736f165ac6ed8164359e Mon Sep 17 00:00:00 2001
From: qclayssen <quentin.clayssen@gmail.com>
Date: Mon, 21 Jul 2025 10:14:19 +1000
Subject: [PATCH 12/25] Allow missing pcgr annotation for somatic record

---
 bolt/common/pcgr.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index 7297ad8..f59ecbc 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -183,9 +183,18 @@ def run_somatic(input_fp, pcgr_refdata_dir, vep_dir, output_dir, chunk_nbr=None,
 
     util.execute_command(command)
 
-    shutil.copytree(temp_dir.name, pcgr_output_dir)
+    shutil.copytree(temp_dir.name, output_dir)
 
-    return pcgr_output_dir
+    pcgr_tsv_fp = pathlib.Path(output_dir) / f'{sample_id}.pcgr.grch38.snv_indel_ann.tsv.gz'
+    pcgr_vcf_fp = pathlib.Path(output_dir) / f'{sample_id}.pcgr.grch38.pass.vcf.gz'
+
+    # Check if both files exist
+    if not pcgr_tsv_fp.exists():
+        raise FileNotFoundError(f"Expected file {pcgr_tsv_fp} not found.")
+    if not pcgr_vcf_fp.exists():
+        raise FileNotFoundError(f"Expected file {pcgr_vcf_fp} not found.")
+
+    return pcgr_tsv_fp, pcgr_vcf_fp
 
 
 def run_germline(input_fp, panel_fp, pcgr_refdata_dir, vep_dir, output_dir, threads=1, pcgr_conda=None, pcgrr_conda=None, sample_id=None):
@@ -287,7 +296,7 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou
             output_fh.write_record(record)
             continue
         # Annotate and write
-        record_ann = annotate_record(record, pcgr_data)
+        record_ann = annotate_record(record, pcgr_data, allow_missing=True)
         output_fh.write_record(record_ann)
 
 
@@ -502,7 +511,6 @@ def split_vcf(input_vcf, output_dir):
     chunk_number = 1
     variant_count = 0
     base_filename = pathlib.Path(input_vcf).stem
-    chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
     base_filename = input_vcf.stem
     chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf"
     chunk_files.append(chunk_filename)

From 33c4bf945cd9bec0c836c1eebf6c65c6e5e82abd Mon Sep 17 00:00:00 2001
From: qclayssen <quentin.clayssen@gmail.com>
Date: Wed, 23 Jul 2025 09:43:44 +1000
Subject: [PATCH 13/25] remove PCGR temp dir made for fusion we no longer use

---
 bolt/common/pcgr.py | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index f59ecbc..87284e7 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -113,11 +113,13 @@ def get_minimal_header(input_fh):
 
 def run_somatic(input_fp, pcgr_refdata_dir, vep_dir, output_dir, chunk_nbr=None, threads=1, pcgr_conda=None, pcgrr_conda=None, purity=None, ploidy=None, sample_id=None):
 
-    # NOTE(SW): Nextflow FusionFS v2.2.8 does not support PCGR output to S3; instead write to a
-    # temporary directory outside of the FusionFS mounted directory then manually copy across
 
-    temp_dir = tempfile.TemporaryDirectory()
-    pcgr_output_dir = output_dir / 'pcgr/'
+    output_dir = output_dir / f"pcgr_{chunk_nbr}" if chunk_nbr is not None else output_dir
+
+    if output_dir.exists():
+        logger.warning(f"Output directory '{output_dir}' already exists and will be overwritten")
+        shutil.rmtree(output_dir)
+
 
     if not sample_id:
         sample_id = 'nosampleset'
@@ -164,7 +166,7 @@ def run_somatic(input_fp, pcgr_refdata_dir, vep_dir, output_dir, chunk_nbr=None,
         command_args.append(f'--tumor_ploidy {ploidy}')
 
     # NOTE(SW): placed here to always have output directory last
-    command_args.append(f'--output_dir {temp_dir.name}')
+    command_args.append(f'--output_dir {output_dir}')
 
     delimiter_padding = ' ' * 10
     delimiter = f' \\\n{delimiter_padding}'
@@ -181,9 +183,11 @@ def run_somatic(input_fp, pcgr_refdata_dir, vep_dir, output_dir, chunk_nbr=None,
         command_formatting = '\n' + ' ' * 4
         command = command_formatting + command_conda + command
 
-    util.execute_command(command)
+    # Log file path
+    log_file_path = output_dir / "run_somatic.log"
 
-    shutil.copytree(temp_dir.name, output_dir)
+    # Run the command and redirect output to the log file
+    util.execute_command(command, log_file_path=log_file_path)
 
     pcgr_tsv_fp = pathlib.Path(output_dir) / f'{sample_id}.pcgr.grch38.snv_indel_ann.tsv.gz'
     pcgr_vcf_fp = pathlib.Path(output_dir) / f'{sample_id}.pcgr.grch38.pass.vcf.gz'
@@ -202,10 +206,7 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, vep_dir, output_dir, thre
     if not sample_id:
         sample_id = 'nosampleset'
 
-    # NOTE(SW): Nextflow FusionFS v2.2.8 does not support PCGR output to S3; instead write to a
-    # temporary directory outside of the FusionFS mounted directory then manually copy across
 
-    temp_dir = tempfile.TemporaryDirectory()
     cpsr_output_dir = output_dir / 'cpsr/'
 
     if cpsr_output_dir.exists():
@@ -232,7 +233,7 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, vep_dir, output_dir, thre
         command_args.append(f'--pcgrr_conda {pcgrr_conda}')
 
     # NOTE(SW): placed here to always have output directory last
-    command_args.append(f'--output_dir {temp_dir.name}')
+    command_args.append(f'--output_dir {cpsr_output_dir}')
 
     delimiter_padding = ' ' * 10
     delimiter = f' \\\n{delimiter_padding}'
@@ -251,8 +252,6 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, vep_dir, output_dir, thre
 
     util.execute_command(command)
 
-    shutil.copytree(temp_dir.name, cpsr_output_dir)
-
     return cpsr_output_dir
 
 

From d7e0e7344b461f1ce4306b7975ee59453ef460b1 Mon Sep 17 00:00:00 2001
From: qclayssen <quentin.clayssen@gmail.com>
Date: Wed, 23 Jul 2025 15:36:26 +1000
Subject: [PATCH 14/25] creat pcgr output dir

---
 bolt/common/pcgr.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index 87284e7..0d53e30 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -120,6 +120,8 @@ def run_somatic(input_fp, pcgr_refdata_dir, vep_dir, output_dir, chunk_nbr=None,
         logger.warning(f"Output directory '{output_dir}' already exists and will be overwritten")
         shutil.rmtree(output_dir)
 
+    # Create output directory
+    output_dir.mkdir(parents=True, exist_ok=True)
 
     if not sample_id:
         sample_id = 'nosampleset'
@@ -213,6 +215,9 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, vep_dir, output_dir, thre
         logger.warning(f"Output directory '{cpsr_output_dir}' already exists and will be overwritten")
         shutil.rmtree(cpsr_output_dir)
 
+    # Create output directory
+    cpsr_output_dir.mkdir(parents=True, exist_ok=True)
+
     command_args = [
         f'--sample_id {sample_id}',
         f'--input_vcf {input_fp}',

From ef9fb08f85b30111c8ce140dc147ab57a70ededc Mon Sep 17 00:00:00 2001
From: qclayssen <quentin.clayssen@gmail.com>
Date: Fri, 27 Jun 2025 12:45:14 +1000
Subject: [PATCH 15/25] update pcgr germline command

---
 bolt/common/pcgr.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index 0d53e30..2c441de 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -223,7 +223,7 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, vep_dir, output_dir, thre
         f'--input_vcf {input_fp}',
         f'--genome_assembly grch38',
         f'--custom_list {panel_fp}',
-        f'--vep_dir {vep_dir}',
+        f'--vep_dir {pcgr_refdata_dir}',
         f'--refdata_dir {pcgr_refdata_dir}',
         # NOTE(SW): probably useful to add versioning information here; weigh against maintainence
         # burden
@@ -269,8 +269,8 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou
         constants.VcfInfo.PCGR_CSQ: 'CSQ',
     }
 
-    pcgr_tsv_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.snv_indel_ann.tsv.gz'
-    pcgr_vcf_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.vcf.gz'
+    pcgr_tsv_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr.grch38.snv_indel_ann.tsv.gz'
+    pcgr_vcf_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr.grch38.vcf.gz'
 
     # Enforce matching defined and source INFO annotations
     check_annotation_headers(info_field_map, pcgr_vcf_fp)

From 9d695b39da1c94bc1b5cc47c29d57adfeea10826 Mon Sep 17 00:00:00 2001
From: qclayssen <quentin.clayssen@gmail.com>
Date: Fri, 25 Jul 2025 09:56:49 +1000
Subject: [PATCH 16/25] change vep #fork to recommanded one "Number of forks
 that VEP can use during annotation must be above 1 and not more than 8
 (recommended is 4)"

---
 bolt/common/pcgr.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index 2c441de..e8d5a43 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -142,6 +142,7 @@ def run_somatic(input_fp, pcgr_refdata_dir, vep_dir, output_dir, chunk_nbr=None,
         f'--estimate_tmb',
         #f'--show_noncoding',
         f'--vcfanno_n_proc {threads}',
+        f'--vep_n_forks 4',
         f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane_plus_clinical,mane_select',
     ]
 

From 2e6c3eaea2400a7d85794edd614be7b85fe362b2 Mon Sep 17 00:00:00 2001
From: qclayssen <quentin.clayssen@gmail.com>
Date: Thu, 17 Jul 2025 11:17:44 +1000
Subject: [PATCH 17/25] Fix I/O compress file and add logs

---
 bolt/common/pcgr.py |  3 +-
 bolt/util.py        | 95 +++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 93 insertions(+), 5 deletions(-)

diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index e8d5a43..81f2c6d 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -364,6 +364,7 @@ def check_annotation_headers(info_field_map, vcf_fp):
 def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map):
     # Gather all annotations from TSV
     data_tsv = dict()
+    logger.info(f"Collecting PCGR annotations from {tsv_fp} and {vcf_fp}")
     with gzip.open(tsv_fp, 'rt') as tsv_fh:
         for record in csv.DictReader(tsv_fh, delimiter='\t'):
             key, record_ann = get_annotation_entry_tsv(record, info_field_map)
@@ -582,7 +583,7 @@ def merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_fp):
     util.merge_tsv_files(pcgr_tsv_fp, merged_tsv_fp)
 
     # Step 5: Merge all VCF files into a single file in the pcgr directory
-    merged_vcf_path = pcgr_dir / "nosampleset.pcgr_acmg.grch38"
+    merged_vcf_path = pcgr_dir / "nosampleset.pcgr.grch38.pass.vcf.gz"
     merged_vcf = util.merge_vcf_files(pcgr_vcf_files, merged_vcf_path)
 
     return merged_vcf, merged_tsv_fp
diff --git a/bolt/util.py b/bolt/util.py
index b7333d9..b99362c 100644
--- a/bolt/util.py
+++ b/bolt/util.py
@@ -1,3 +1,4 @@
+import gzip
 import pathlib
 import subprocess
 import sys
@@ -95,7 +96,93 @@ def get_qualified_vcf_annotation(anno_enum):
     return f'{anno_enum.namespace}/{anno_enum.value}'
 
 
-#def add_vcf_filter(record, filter_enum):
-#    existing_filters = [e for e in record.FILTERS if e != 'PASS']
-#    assert filter_enum.value not in existing_filters
-#    return ';'.join([*existing_filters, filter_enum.value])
+def merge_tsv_files(tsv_files, merged_tsv_fp):
+    """
+    Merges all TSV files into a single TSV.
+    """
+    logger.info("Merging TSV files...")
+    with gzip.open(merged_tsv_fp, 'wt') as merged_tsv:
+        for i, tsv_file in enumerate(tsv_files):
+            with gzip.open(tsv_file, 'rt') as infile:
+                for line_number, line in enumerate(infile):
+                    # Skip header except for the first file
+                    if i > 0 and line_number == 0:
+                        continue
+                    merged_tsv.write(line)
+    logger.info(f"Merged TSV written to: {merged_tsv_fp}")
+
+
+def merge_vcf_files(vcf_files, merged_vcf_fp):
+    """
+    Merges multiple VCF files into a single sorted VCF file using bcftools.
+
+    Parameters:
+    - vcf_files: List of paths to VCF files to be merged.
+    - merged_vcf_fp: Path to the output merged VCF file (without extension).
+
+    Returns:
+    - Path to the sorted merged VCF file.
+    """
+    logger.info("Merging VCF files...")
+    merged_vcf_fp = pathlib.Path(merged_vcf_fp)
+    merged_unsorted_vcf = merged_vcf_fp.with_suffix('.unsorted.vcf.gz')
+    merged_vcf = merged_vcf_fp.with_suffix('.vcf.gz')
+
+    # Prepare the bcftools merge command arguments
+    command_args = [
+        'bcftools merge',
+        '-m all',
+        '-Oz',
+        f'-o {merged_unsorted_vcf}',
+    ] + [str(vcf_file) for vcf_file in vcf_files]
+
+    # Format the command for readability
+    delimiter_padding = ' ' * 10
+    delimiter = f' \\\n{delimiter_padding}'
+    command_args_str = delimiter.join(command_args)
+
+    command = f'''
+    {command_args_str}
+    '''
+
+    # Run the bcftools merge command
+    logger.info("Running bcftools merge...")
+    execute_command(command)
+    logger.info(f"Merged VCF written to: {merged_unsorted_vcf}")
+
+    # Sort the merged VCF file
+    sort_command_args = [
+        'bcftools sort',
+        '-Oz',
+        f'-o {merged_vcf}',
+        f'{merged_unsorted_vcf}'
+    ]
+    sort_command_args_str = delimiter.join(sort_command_args)
+    sort_command = f'''
+    {sort_command_args_str}
+    '''
+
+    logger.info("Sorting merged VCF file...")
+    execute_command(sort_command)
+    logger.info(f"Sorted merged VCF written to: {merged_vcf}")
+
+    # Index the sorted merged VCF file
+    index_command_args = [
+        'bcftools index',
+        '-t',
+        f'{merged_vcf}'
+    ]
+    index_command_args_str = delimiter.join(index_command_args)
+    index_command = f'''
+    {index_command_args_str}
+    '''
+
+    logger.info("Indexing sorted merged VCF file...")
+    execute_command(index_command)
+    logger.info(f"Indexed merged VCF file: {merged_vcf}.tbi")
+
+    # Optionally, remove the unsorted merged VCF file
+    if merged_unsorted_vcf.exists():
+        merged_unsorted_vcf.unlink()
+
+    return merged_vcf
\ No newline at end of file

From 94d4875d73d6da9aabbc212498bd9faf290d8468 Mon Sep 17 00:00:00 2001
From: Stephen Watts <hello@stephen.ac>
Date: Wed, 22 Jan 2025 15:10:49 +1100
Subject: [PATCH 18/25] Add input variant filter for final PCGR report

---
 bolt/common/constants.py              | 50 ++++++++++++++++
 bolt/common/pcgr.py                   |  3 +
 bolt/workflows/smlv_somatic/report.py | 82 ++++++++++++++++++++++++++-
 3 files changed, 134 insertions(+), 1 deletion(-)

diff --git a/bolt/common/constants.py b/bolt/common/constants.py
index 1f84f51..2b21ee1 100644
--- a/bolt/common/constants.py
+++ b/bolt/common/constants.py
@@ -41,6 +41,43 @@
 }
 
 
+################################
+## Hypermutated report filter ##
+################################
+PCGR_TIERS_FILTERING = (
+    'TIER_1',
+    'TIER_2',
+    'TIER_3',
+    'TIER_4',
+    'NONCODING',
+)
+
+VEP_IMPACTS_FILTER = (
+    'intergenic',
+    'intronic',
+    'downstream',
+    'upstream',
+    'impacts_other',
+)
+
+GENOMIC_REGIONS_FILTERING = (
+    'difficult',
+    'none',
+    'giab_conf',
+)
+
+HOTSPOT_FIELDS_FILTERING = (
+    'SAGE_HOTSPOT',
+    'hotspot',
+    'PCGR_MUTATION_HOTSPOT',
+)
+
+RETAIN_FIELDS_FILTERING = (
+    'PANEL',
+    *HOTSPOT_FIELDS_FILTERING,
+)
+
+
 ##################################################
 ## VCF FILTER tags and FORMAT, INFO annotations ##
 ##################################################
@@ -61,6 +98,8 @@ class VcfFilter(enum.Enum):
     ENCODE = 'ENCODE'
     GNOMAD_COMMON = 'gnomAD_common'
 
+    PCGR_COUNT_LIMIT = 'PCGR_count_limit'
+
     @property
     def namespace(self):
         return 'FILTER'
@@ -119,6 +158,8 @@ class VcfInfo(enum.Enum):
     RESCUED_FILTERS_EXISTING = 'RESCUED_FILTERS_EXISTING'
     RESCUED_FILTERS_PENDING = 'RESCUED_FILTERS_PENDING'
 
+    PANEL = 'PANEL'
+
     @property
     def namespace(self):
         return 'INFO'
@@ -185,6 +226,9 @@ def namespace(self):
         'Description': f'gnomAD AF >= {MAX_GNOMAD_AF}',
     },
 
+    VcfFilter.PCGR_COUNT_LIMIT: {
+        'Description': 'Manually filtered to meet PCGR 500,000 variant limit',
+    },
 
     # INFO
     VcfInfo.TUMOR_AF: {
@@ -336,6 +380,12 @@ def namespace(self):
         'Description': 'Filters pending prior to variant rescue',
     },
 
+    VcfInfo.PANEL: {
+        'Number': '0',
+        'Type': 'Flag',
+        'Description': 'UMCCR somatic panel CDS (2,000 bp padding)',
+    },
+
 
     # FORMAT
     VcfFormat.SAGE_AD: {
diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index 81f2c6d..6c4e26d 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -1,4 +1,7 @@
+import collections
 import csv
+import functools
+import itertools
 import pathlib
 import re
 import shutil
diff --git a/bolt/workflows/smlv_somatic/report.py b/bolt/workflows/smlv_somatic/report.py
index bf17ffb..9f68c3a 100644
--- a/bolt/workflows/smlv_somatic/report.py
+++ b/bolt/workflows/smlv_somatic/report.py
@@ -1,3 +1,4 @@
+import collections
 import csv
 import json
 import pathlib
@@ -106,10 +107,20 @@ def entry(ctx, **kwargs):
         fh.write('\n')
 
     # PCGR report
+    if variant_counts_process['filter_pass'] <= constants.MAX_SOMATIC_VARIANTS:
+        pcgr_input_vcf_fp = kwargs['vcf_fp']
+    else:
+        pcgr_input_vcf_fp = select_pcgr_variants(
+            kwargs['vcf_fp'],
+            kwargs['cancer_genes_fp'],
+            kwargs['tumor_name'],
+            output_dir,
+        )
+
     purple_data = parse_purple_purity_file(kwargs['purple_purity_fp'])
 
     pcgr_prep_fp = pcgr.prepare_vcf_somatic(
-        kwargs['vcf_fp'],
+        pcgr_input_vcf_fp,
         kwargs['tumor_name'],
         kwargs['normal_name'],
         output_dir,
@@ -286,6 +297,75 @@ def count_variant_process(vcf_fp):
     return counts
 
 
+def select_pcgr_variants(vcf_fp, cancer_genes_fp, tumor_name, output_dir):
+    # Annotate variants in UMCCR somatic gene panel
+    fp_annotated_out = output_dir / f'{tumor_name}.umccr_panel_variants_annotated.vcf.gz'
+    util.execute_command(fr'''
+        bcftools annotate \
+            --annotations <(awk 'BEGIN {{ OFS="\t" }} {{ print $1, $2-2000, $3+2000, "1" }}' {cancer_genes_fp}) \
+            --header-line '{util.get_vcf_header_line(constants.VcfInfo.PANEL)}' \
+            --columns CHROM,FROM,TO,{constants.VcfInfo.PANEL.value} \
+            --output {fp_annotated_out} \
+            {vcf_fp}
+    ''')
+
+    # Set filter category for each variant
+    variants_sorted = collections.defaultdict(list)
+    for variant_count, variant in enumerate(cyvcf2.VCF(fp_annotated_out), 1):
+        if any(variant.INFO.get(e) for e in constants.RETAIN_FIELDS_FILTERING):
+            continue
+
+        data = pcgr.get_variant_filter_data(variant)
+        variant_filter = pcgr.determine_filter(data)
+        assert variant_filter
+
+        filter_category = (data['tier'], *variant_filter)
+        variant_repr = pcgr.get_variant_repr(variant)
+        variants_sorted[filter_category].append(variant_repr)
+
+
+    # Determine the set of filter categories to come under the PCGR 500,000 variant threshold
+    filter_sum = 0
+    filter_categories = list()
+    for key in pcgr.get_ordering():
+
+        if (variant_count - filter_sum) <= constants.MAX_SOMATIC_VARIANTS:
+            break
+
+        filter_sum += len(variants_sorted.get(key, []))
+        filter_categories.append(key)
+
+    # Set FILTERS and write out records
+    filter_variants = set()
+    for key in filter_categories:
+        filter_variants.update(variants_sorted[key])
+
+    fh_in = cyvcf2.VCF(vcf_fp)
+    util.add_vcf_header_entry(fh_in, constants.VcfFilter.PCGR_COUNT_LIMIT)
+
+    # NOTE(SW): creating an additional VCF with all records for traceability
+    fp_out = output_dir / f'{tumor_name}.pcgr_hypermutated.pass.vcf.gz'
+    fp_set_out = output_dir / f'{tumor_name}.pcgr_hypermutated.filters_set.vcf.gz'
+
+    fh_out = cyvcf2.Writer(fp_out, fh_in)
+    fh_set_out = cyvcf2.Writer(fp_set_out, fh_in)
+
+    for variant in fh_in:
+        variant_repr = pcgr.get_variant_repr(variant)
+        if variant_repr not in filter_variants:
+            # Write only passing
+            fh_out.write_record(variant)
+        else:
+            variant.FILTER = constants.VcfFilter.PCGR_COUNT_LIMIT.value
+        # Write all variants including those with FILTER set
+        fh_set_out.write_record(variant)
+
+    fh_out.close()
+    fh_set_out.close()
+
+    return fp_out
+
+
 def parse_purple_purity_file(fp):
     with open(fp, 'r') as fh:
         entries = list(csv.DictReader(fh, delimiter='\t'))

From d6e8b53610d040c07d0d47663ade0ae5fe710ae5 Mon Sep 17 00:00:00 2001
From: qclayssen <quentin.clayssen@gmail.com>
Date: Tue, 15 Apr 2025 14:22:20 +1000
Subject: [PATCH 19/25] uptade pcgr tier

---
 bolt/common/constants.py | 39 +--------------------------------------
 bolt/common/pcgr.py      | 10 ++++++----
 2 files changed, 7 insertions(+), 42 deletions(-)

diff --git a/bolt/common/constants.py b/bolt/common/constants.py
index 2b21ee1..09ad0de 100644
--- a/bolt/common/constants.py
+++ b/bolt/common/constants.py
@@ -41,43 +41,6 @@
 }
 
 
-################################
-## Hypermutated report filter ##
-################################
-PCGR_TIERS_FILTERING = (
-    'TIER_1',
-    'TIER_2',
-    'TIER_3',
-    'TIER_4',
-    'NONCODING',
-)
-
-VEP_IMPACTS_FILTER = (
-    'intergenic',
-    'intronic',
-    'downstream',
-    'upstream',
-    'impacts_other',
-)
-
-GENOMIC_REGIONS_FILTERING = (
-    'difficult',
-    'none',
-    'giab_conf',
-)
-
-HOTSPOT_FIELDS_FILTERING = (
-    'SAGE_HOTSPOT',
-    'hotspot',
-    'PCGR_MUTATION_HOTSPOT',
-)
-
-RETAIN_FIELDS_FILTERING = (
-    'PANEL',
-    *HOTSPOT_FIELDS_FILTERING,
-)
-
-
 ##################################################
 ## VCF FILTER tags and FORMAT, INFO annotations ##
 ##################################################
@@ -301,7 +264,7 @@ def namespace(self):
     VcfInfo.PCGR_CLINVAR_CLASSIFICATION: {
         'Number': '.',
         'Type': 'String',
-        'Description': 'ClinVar - Overall clinical significance of variant on a five-tiered scale',
+        'Description': 'ClinVar - clinical significance - per phenotype submission',
     },
     VcfInfo.PCGR_COSMIC_COUNT: {
         'Number': '1',
diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index 6c4e26d..29738bd 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -273,8 +273,8 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou
         constants.VcfInfo.PCGR_CSQ: 'CSQ',
     }
 
-    pcgr_tsv_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr.grch38.snv_indel_ann.tsv.gz'
-    pcgr_vcf_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr.grch38.vcf.gz'
+    pcgr_tsv_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.snv_indel_ann.tsv.gz'
+    pcgr_vcf_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.vcf.gz'
 
     # Enforce matching defined and source INFO annotations
     check_annotation_headers(info_field_map, pcgr_vcf_fp)
@@ -316,7 +316,7 @@ def transfer_annotations_germline(input_fp, normal_name, cpsr_dir, output_dir):
         constants.VcfInfo.CPSR_CSQ: 'CSQ',
     }
 
-    cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.classification.tsv.gz'
+    cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.snv_indel_ann.tsv.gzv'
     cpsr_vcf_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.vcf.gz'
 
     # Enforce matching defined and source INFO annotations
@@ -367,12 +367,14 @@ def check_annotation_headers(info_field_map, vcf_fp):
 def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map):
     # Gather all annotations from TSV
     data_tsv = dict()
-    logger.info(f"Collecting PCGR annotations from {tsv_fp} and {vcf_fp}")
     with gzip.open(tsv_fp, 'rt') as tsv_fh:
         for record in csv.DictReader(tsv_fh, delimiter='\t'):
             key, record_ann = get_annotation_entry_tsv(record, info_field_map)
             assert key not in data_tsv
 
+            # Process PCGR_ACTIONABILITY_TIER
+            # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING
+            record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER'].replace(' ', '_')
 
             # Process PCGR_ACTIONABILITY_TIER
             # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING

From 59be8dce414fff9d2412c4244fa1fdddc9206ce4 Mon Sep 17 00:00:00 2001
From: qclayssen <quentin.clayssen@gmail.com>
Date: Tue, 1 Jul 2025 13:14:05 +1000
Subject: [PATCH 20/25] update CPSR-related constants and file naming
 conventions for pcgr v2.1

---
 bolt/common/pcgr.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index 29738bd..3d897ab 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -273,8 +273,8 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou
         constants.VcfInfo.PCGR_CSQ: 'CSQ',
     }
 
-    pcgr_tsv_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.snv_indel_ann.tsv.gz'
-    pcgr_vcf_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.vcf.gz'
+    pcgr_tsv_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv'
+    pcgr_vcf_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr_acmg.grch38.vcf.gz'
 
     # Enforce matching defined and source INFO annotations
     check_annotation_headers(info_field_map, pcgr_vcf_fp)
@@ -316,7 +316,7 @@ def transfer_annotations_germline(input_fp, normal_name, cpsr_dir, output_dir):
         constants.VcfInfo.CPSR_CSQ: 'CSQ',
     }
 
-    cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.snv_indel_ann.tsv.gzv'
+    cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.classification.tsv.gz'
     cpsr_vcf_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.vcf.gz'
 
     # Enforce matching defined and source INFO annotations
@@ -367,7 +367,7 @@ def check_annotation_headers(info_field_map, vcf_fp):
 def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map):
     # Gather all annotations from TSV
     data_tsv = dict()
-    with gzip.open(tsv_fp, 'rt') as tsv_fh:
+    with open(tsv_fp, 'r') as tsv_fh:
         for record in csv.DictReader(tsv_fh, delimiter='\t'):
             key, record_ann = get_annotation_entry_tsv(record, info_field_map)
             assert key not in data_tsv

From eb2483600696ed959050b82608b3160d94136409 Mon Sep 17 00:00:00 2001
From: Quentin Clayssen <quentin.clayssen@gmail.com>
Date: Thu, 28 Aug 2025 16:12:55 +1000
Subject: [PATCH 21/25] linting

---
 bolt/common/constants.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bolt/common/constants.py b/bolt/common/constants.py
index 09ad0de..775530e 100644
--- a/bolt/common/constants.py
+++ b/bolt/common/constants.py
@@ -241,7 +241,7 @@ def namespace(self):
         ),
     },
     VcfInfo.PCGR_CSQ: {
-    'Number': '.',
+        'Number': '.',
     'Type': 'String',
     'Description': (
         'Consequence annotations from Ensembl VEP. Format: '

From 5109ebbaa8c947241a718b268e88474c9f333852 Mon Sep 17 00:00:00 2001
From: Quentin Clayssen <quentin.clayssen@gmail.com>
Date: Thu, 28 Aug 2025 16:13:42 +1000
Subject: [PATCH 22/25] Fix import

---
 bolt/common/pcgr.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index 3d897ab..5d06188 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -1,13 +1,12 @@
 import collections
 import csv
 import functools
+import gzip
 import itertools
 import pathlib
 import re
 import shutil
 import tempfile
-import gzip  # added import
-
 
 import cyvcf2
 

From a91ff863a3a68c5050b134387c5bd9a1b751298b Mon Sep 17 00:00:00 2001
From: Quentin Clayssen <quentin.clayssen@gmail.com>
Date: Thu, 28 Aug 2025 16:15:36 +1000
Subject: [PATCH 23/25] remove dead code

---
 bolt/common/pcgr.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py
index 5d06188..108633c 100644
--- a/bolt/common/pcgr.py
+++ b/bolt/common/pcgr.py
@@ -142,7 +142,6 @@ def run_somatic(input_fp, pcgr_refdata_dir, vep_dir, output_dir, chunk_nbr=None,
         f'--estimate_signatures',
         f'--estimate_msi',
         f'--estimate_tmb',
-        #f'--show_noncoding',
         f'--vcfanno_n_proc {threads}',
         f'--vep_n_forks 4',
         f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane_plus_clinical,mane_select',

From b0cfc8e7e3ab5501f88bc7e4189e0f5fe7e12d42 Mon Sep 17 00:00:00 2001
From: Quentin Clayssen <quentin.clayssen@gmail.com>
Date: Thu, 28 Aug 2025 16:28:09 +1000
Subject: [PATCH 24/25] remove test assert absence `FILTER` column because now
 needed for breakend pairing

---
 tests/test_smlv_somatic_filter.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_smlv_somatic_filter.py b/tests/test_smlv_somatic_filter.py
index caebea8..6f0e54d 100644
--- a/tests/test_smlv_somatic_filter.py
+++ b/tests/test_smlv_somatic_filter.py
@@ -201,7 +201,6 @@ def test_clinical_potential_rescue_general(self):
                 info_data=info_data_set,
             )
             smlv_somatic_filter.set_filter_data(record, 0)
-            assert not record.FILTER
             assert record.INFO.get(rescue_tag_str) is not None
 
 

From acb6862c568b6992cb003b4c4f6427848c151c01 Mon Sep 17 00:00:00 2001
From: Stephen Watts <hello@stephen.ac>
Date: Thu, 28 Aug 2025 18:17:50 +1000
Subject: [PATCH 25/25] Fix, update clinical potential rescue test

---
 tests/test_smlv_somatic_filter.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/test_smlv_somatic_filter.py b/tests/test_smlv_somatic_filter.py
index 6f0e54d..6455162 100644
--- a/tests/test_smlv_somatic_filter.py
+++ b/tests/test_smlv_somatic_filter.py
@@ -189,9 +189,7 @@ def test_clinical_potential_rescue_general(self):
         info_data_sets = [
             {'HMF_HOTSPOT': ''},
             {'PCGR_MUTATION_HOTSPOT': ''},
-            {'PCGR_COSMIC_COUNT': 11},
             {'PCGR_TCGA_PANCANCER_COUNT': 6},
-            {'PCGR_ICGC_PCAWG_COUNT': 6},
         ]
         rescue_tag_str = bolt_constants.VcfInfo.CLINICAL_POTENTIAL_RESCUE.value
 
@@ -201,6 +199,7 @@ def test_clinical_potential_rescue_general(self):
                 info_data=info_data_set,
             )
             smlv_somatic_filter.set_filter_data(record, 0)
+            assert not record.FILTER
             assert record.INFO.get(rescue_tag_str) is not None