From 107a9149650ccb9ad3b31e554ef72da2f1c7f66d Mon Sep 17 00:00:00 2001 From: qclayssen Date: Mon, 7 Apr 2025 09:56:34 +1000 Subject: [PATCH 01/25] uptade docker image --- docker/Dockerfile.pcgr | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.pcgr b/docker/Dockerfile.pcgr index 4ea4932..ca984e8 100644 --- a/docker/Dockerfile.pcgr +++ b/docker/Dockerfile.pcgr @@ -15,13 +15,13 @@ RUN \ conda create \ --solver libmamba \ --name pcgr \ - --file https://raw.githubusercontent.com/sigven/pcgr/v1.4.1/conda/env/lock/pcgr-linux-64.lock + --file https://raw.githubusercontent.com/sigven/pcgr/refs/tags/v2.2.1/conda/env/lock/pcgr-linux-64.lock RUN \ conda create \ --solver libmamba \ --name pcgrr \ - --file https://raw.githubusercontent.com/sigven/pcgr/v1.4.1/conda/env/lock/pcgrr-linux-64.lock + --file https://raw.githubusercontent.com/sigven/pcgr/refs/tags/v2.2.1/conda/env/lock/pcgrr-linux-64.lock COPY ./conda/env/bolt_env.yml /tmp/ RUN \ From 0ec6c095c0ba79fdf59109317e98db4259778361 Mon Sep 17 00:00:00 2001 From: qclayssen Date: Mon, 14 Apr 2025 17:45:00 +1000 Subject: [PATCH 02/25] chang command pcgr --- bolt/common/pcgr.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index fce4c56..aa4751e 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -124,19 +124,20 @@ def run_somatic(input_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=No command_args = [ f'--sample_id {sample_id}', f'--input_vcf {input_fp}', + f'--vep_dir {pcgr_refdata_dir}', + f'--refdata_dir {pcgr_refdata_dir}', f'--tumor_dp_tag TUMOR_DP', f'--tumor_af_tag TUMOR_AF', f'--control_dp_tag NORMAL_DP', f'--control_af_tag NORMAL_AF', - f'--pcgr_dir {pcgr_refdata_dir}', f'--genome_assembly grch38', f'--assay WGS', f'--estimate_signatures', - f'--estimate_msi_status', + f'--estimate_msi', f'--estimate_tmb', - f'--show_noncoding', + #f'--show_noncoding', f'--vcfanno_n_proc {threads}', - f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane', + f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane_plus_clinical,mane_select', ] # NOTE(SW): VEP pick order is applied as a successive filter: @@ -209,7 +210,7 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, output_dir, threads=1, pc f'--classify_all', f'--pcgr_dir {pcgr_refdata_dir}', f'--vcfanno_n_proc {threads}', - f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane', + f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane_plus_clinical,mane_select', ] if pcgrr_conda: @@ -249,8 +250,8 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou constants.VcfInfo.PCGR_CSQ: 'CSQ', } - pcgr_tsv_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv' - pcgr_vcf_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr_acmg.grch38.vcf.gz' + pcgr_tsv_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.snvs_indels.tiers.tsv' + pcgr_vcf_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.vcf.gz' # Enforce matching defined and source INFO annotations check_annotation_headers(info_field_map, pcgr_vcf_fp) From c51feda01ddcbfb30feb2cda4825ce8a57ecb100 Mon Sep 17 00:00:00 2001 From: qclayssen Date: Mon, 14 Apr 2025 17:45:35 +1000 Subject: [PATCH 03/25] change pcgr vcf header/info --- bolt/common/constants.py | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/bolt/common/constants.py b/bolt/common/constants.py index e38ed3d..601c0b0 100644 --- a/bolt/common/constants.py +++ b/bolt/common/constants.py @@ -236,29 +236,30 @@ def namespace(self): ), }, VcfInfo.PCGR_CSQ: { - 'Number': '.', - 'Type': 'String', - 'Description': ( - 'Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|' - 'Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|' - 'CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|' - 'DISTANCE|STRAND|FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|' - 'MANE_SELECT|MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|' - 'UNIPROT_ISOFORM|RefSeq|DOMAINS|HGVS_OFFSET|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|' - 'gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|' - 'gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|CLIN_SIG|SOMATIC|PHENO|CHECK_REF|' - 'NearestExonJB' - ), + 'Number': '.', + 'Type': 'String', + 'Description': ( + 'Consequence annotations from Ensembl VEP. Format: ' + 'Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|' + 'HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|' + 'ALLELE_NUM|DISTANCE|STRAND|FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|' + 'MANE|MANE_SELECT|MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|' + 'UNIPROT_ISOFORM|RefSeq|DOMAINS|HGVS_OFFSET|gnomADe_AF|gnomADe_AFR_AF|gnomADe_AMR_AF|' + 'gnomADe_ASJ_AF|gnomADe_EAS_AF|gnomADe_FIN_AF|gnomADe_MID_AF|gnomADe_NFE_AF|' + 'gnomADe_REMAINING_AF|gnomADe_SAS_AF|CLIN_SIG|SOMATIC|PHENO|CHECK_REF|MOTIF_NAME|' + 'MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS|NearestExonJB|' + 'MaxEntScan_alt|MaxEntScan_diff|MaxEntScan_ref' + ), }, VcfInfo.PCGR_MUTATION_HOTSPOT: { 'Number': '.', 'Type': 'String', - 'Description': 'Known cancer mutation hotspot, as found in cancerhotspots.org_v2, Gene|Codon|Q-value', + 'Description': 'Known cancer mutation hotspot, as found in cancerhotspots.org. Format: GeneSymbol|Entrez_ID|CodonRefAA|Alt_AA|Q-value', }, VcfInfo.PCGR_CLINVAR_CLNSIG: { 'Number': '.', 'Type': 'String', - 'Description': 'ClinVar clinical significance', + 'Description': 'ClinVar - clinical significance - per phenotype submissione', }, VcfInfo.PCGR_COSMIC_COUNT: { 'Number': '1', From feace9f9320516a5e7899ef2d85abc07f45b30e7 Mon Sep 17 00:00:00 2001 From: qclayssen Date: Tue, 15 Apr 2025 14:22:20 +1000 Subject: [PATCH 04/25] uptade pcgr tier --- bolt/common/constants.py | 16 ++++++++-------- bolt/common/pcgr.py | 16 +++++++++------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/bolt/common/constants.py b/bolt/common/constants.py index 601c0b0..1fa73bf 100644 --- a/bolt/common/constants.py +++ b/bolt/common/constants.py @@ -35,9 +35,9 @@ 'pathogenic', 'uncertain_significance', } -PCGR_TIERS_RESCUE = { - 'TIER_1', - 'TIER_2', +PCGR_ACTIONABILITY_TIER_RESCUE = { + '1', + '2', } @@ -77,7 +77,7 @@ class VcfInfo(enum.Enum): SAGE_NOVEL = 'SAGE_NOVEL' SAGE_RESCUE = 'SAGE_RESCUE' - PCGR_TIER = 'PCGR_TIER' + PCGR_ACTIONABILITY_TIER = 'PCGR_ACTIONABILITY_TIER' PCGR_CSQ = 'PCGR_CSQ' PCGR_MUTATION_HOTSPOT = 'PCGR_MUTATION_HOTSPOT' PCGR_CLINVAR_CLNSIG = 'PCGR_CLINVAR_CLNSIG' @@ -112,7 +112,7 @@ class VcfInfo(enum.Enum): GNOMAD_AF = 'gnomAD_AF' - PCGR_TIER_RESCUE = 'PCGR_TIER_RESCUE' + PCGR_ACTIONABILITY_TIER_RESCUE = 'PCGR_ACTIONABILITY_TIER_RESCUE' SAGE_HOTSPOT_RESCUE = 'SAGE_HOTSPOT_RESCUE' CLINICAL_POTENTIAL_RESCUE = 'CLINICAL_POTENTIAL_RESCUE' @@ -226,7 +226,7 @@ def namespace(self): 'Description': 'Variant rescued by a matching SAGE call', }, - VcfInfo.PCGR_TIER: { + VcfInfo.PCGR_ACTIONABILITY_TIER: { 'Number': '1', 'Type': 'String', 'Description': ( @@ -259,7 +259,7 @@ def namespace(self): VcfInfo.PCGR_CLINVAR_CLNSIG: { 'Number': '.', 'Type': 'String', - 'Description': 'ClinVar - clinical significance - per phenotype submissione', + 'Description': 'ClinVar - clinical significance - per phenotype submission', }, VcfInfo.PCGR_COSMIC_COUNT: { 'Number': '1', @@ -317,7 +317,7 @@ def namespace(self): 'Type': 'Flag', 'Description': '', }, - VcfInfo.PCGR_TIER_RESCUE: { + VcfInfo.PCGR_ACTIONABILITY_TIER_RESCUE: { 'Number': '0', 'Type': 'Flag', 'Description': '', diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index aa4751e..9962da4 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -3,6 +3,7 @@ import re import shutil import tempfile +import gzip # added import import cyvcf2 @@ -250,7 +251,7 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou constants.VcfInfo.PCGR_CSQ: 'CSQ', } - pcgr_tsv_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.snvs_indels.tiers.tsv' + pcgr_tsv_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.snv_indel_ann.tsv.gz' pcgr_vcf_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.vcf.gz' # Enforce matching defined and source INFO annotations @@ -262,7 +263,7 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou # Open filehandles, set required header entries input_fh = cyvcf2.VCF(input_fp) - util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_TIER) + util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_ACTIONABILITY_TIER) util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CSQ) util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_MUTATION_HOTSPOT) util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CLINVAR_CLNSIG) @@ -296,7 +297,7 @@ def transfer_annotations_germline(input_fp, normal_name, cpsr_dir, output_dir): constants.VcfInfo.CPSR_CSQ: 'CSQ', } - cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.snvs_indels.tiers.tsv' + cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.snv_indel_ann.tsv.gzv' cpsr_vcf_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.vcf.gz' # Enforce matching defined and source INFO annotations @@ -337,6 +338,7 @@ def check_annotation_headers(info_field_map, vcf_fp): try: header_src_entry = vcf_fh.get_header_type(header_src) except KeyError: + print(f"Missing in VCF: {header_src}") continue header_dst_entry = util.get_vcf_header_entry(header_dst) @@ -348,14 +350,14 @@ def check_annotation_headers(info_field_map, vcf_fp): def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map): # Gather all annotations from TSV data_tsv = dict() - with open(tsv_fp, 'r') as tsv_fh: + with gzip.open(tsv_fp, 'rt') as tsv_fh: for record in csv.DictReader(tsv_fh, delimiter='\t'): key, record_ann = get_annotation_entry_tsv(record, info_field_map) assert key not in data_tsv - # Process PCGR_TIER - # TIER_1, TIER_2, TIER_3, TIER_4, NONCODING - record_ann[constants.VcfInfo.PCGR_TIER] = record['TIER'].replace(' ', '_') + # Process PCGR_ACTIONABILITY_TIER + # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING + record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER'].replace(' ', '_') # Count COSMIC hits if record['COSMIC_MUTATION_ID'] == 'NA': From d21d145fd40ead1dea23499333d77e24431fcb40 Mon Sep 17 00:00:00 2001 From: qclayssen Date: Tue, 15 Apr 2025 15:27:09 +1000 Subject: [PATCH 05/25] remove in annotation cosmic and ICG un uptade variable name --- bolt/common/constants.py | 6 +-- bolt/common/pcgr.py | 57 ++++++++++++------------- bolt/workflows/smlv_somatic/annotate.py | 5 +-- bolt/workflows/smlv_somatic/filter.py | 2 +- 4 files changed, 34 insertions(+), 36 deletions(-) diff --git a/bolt/common/constants.py b/bolt/common/constants.py index 1fa73bf..8aa8608 100644 --- a/bolt/common/constants.py +++ b/bolt/common/constants.py @@ -80,7 +80,7 @@ class VcfInfo(enum.Enum): PCGR_ACTIONABILITY_TIER = 'PCGR_ACTIONABILITY_TIER' PCGR_CSQ = 'PCGR_CSQ' PCGR_MUTATION_HOTSPOT = 'PCGR_MUTATION_HOTSPOT' - PCGR_CLINVAR_CLNSIG = 'PCGR_CLINVAR_CLNSIG' + PCGR_CLINVAR_CLASSIFICATION = 'PCGR_CLINVAR_CLASSIFICATION' PCGR_COSMIC_COUNT = 'PCGR_COSMIC_COUNT' PCGR_TCGA_PANCANCER_COUNT = 'PCGR_TCGA_PANCANCER_COUNT' PCGR_ICGC_PCAWG_COUNT = 'PCGR_ICGC_PCAWG_COUNT' @@ -256,10 +256,10 @@ def namespace(self): 'Type': 'String', 'Description': 'Known cancer mutation hotspot, as found in cancerhotspots.org. Format: GeneSymbol|Entrez_ID|CodonRefAA|Alt_AA|Q-value', }, - VcfInfo.PCGR_CLINVAR_CLNSIG: { + VcfInfo.PCGR_CLINVAR_CLASSIFICATION: { 'Number': '.', 'Type': 'String', - 'Description': 'ClinVar - clinical significance - per phenotype submission', + 'Description': 'ClinVar - Overall clinical significance of variant on a five-tiered scale', }, VcfInfo.PCGR_COSMIC_COUNT: { 'Number': '1', diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index 9962da4..4a9346f 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -246,7 +246,7 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou # Set destination INFO field names and source TSV fields info_field_map = { constants.VcfInfo.PCGR_MUTATION_HOTSPOT: 'MUTATION_HOTSPOT', - constants.VcfInfo.PCGR_CLINVAR_CLNSIG: 'CLINVAR_CLNSIG', + constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION: 'CLINVAR_CLASSIFICATION', constants.VcfInfo.PCGR_TCGA_PANCANCER_COUNT: 'TCGA_PANCANCER_COUNT', constants.VcfInfo.PCGR_CSQ: 'CSQ', } @@ -266,10 +266,8 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_ACTIONABILITY_TIER) util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CSQ) util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_MUTATION_HOTSPOT) - util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CLINVAR_CLNSIG) - util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_COSMIC_COUNT) + util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION) util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_TCGA_PANCANCER_COUNT) - util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_ICGC_PCAWG_COUNT) output_fp = output_dir / f'{tumor_name}.annotations.vcf.gz' output_fh = cyvcf2.Writer(output_fp, input_fh, 'wz') @@ -344,8 +342,8 @@ def check_annotation_headers(info_field_map, vcf_fp): header_dst_entry = util.get_vcf_header_entry(header_dst) # Remove leading and trailing quotes from source header_src_description_unquoted = header_src_entry['Description'].strip('"') - assert header_src_description_unquoted == header_dst_entry['Description'] - + if header_src_description_unquoted != header_dst_entry['Description']: + raise AssertionError(f"Mismatch for {header_src}:\nVCF: {header_src_description_unquoted}\nExpected: {header_dst_entry['Description']}") def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map): # Gather all annotations from TSV @@ -357,25 +355,7 @@ def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map): # Process PCGR_ACTIONABILITY_TIER # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING - record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER'].replace(' ', '_') - - # Count COSMIC hits - if record['COSMIC_MUTATION_ID'] == 'NA': - cosmic_count = 0 - else: - cosmic_count = len(record['COSMIC_MUTATION_ID'].split('&')) - record_ann[constants.VcfInfo.PCGR_COSMIC_COUNT] = cosmic_count - - # Count ICGC-PCAWG hits by taking sum of affected donors where the annotation value has - # the following format: project_code|tumor_type|affected_donors|tested_donors|frequency - icgc_pcawg_count = 0 - if record['ICGC_PCAWG_OCCURRENCE'] != 'NA': - for pcawg_hit_data in record['ICGC_PCAWG_OCCURRENCE'].split(','): - pcawrg_hit_data_fields = pcawg_hit_data.split('|') - affected_donors = int(pcawrg_hit_data_fields[2]) - icgc_pcawg_count += affected_donors - assert icgc_pcawg_count > 0 - record_ann[constants.VcfInfo.PCGR_ICGC_PCAWG_COUNT] = icgc_pcawg_count + record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER'] # Store annotation data data_tsv[key] = record_ann @@ -413,6 +393,24 @@ def collect_cpsr_annotation_data(tsv_fp, vcf_fp, info_field_map): # Compile annotations, prefering TSV source return compile_annotation_data(data_tsv, data_vcf) +def parse_genomic_change(genomic_change): + """ + Parse a genomic change string, e.g., "3:g.41224645T>C" + Returns a tuple: (chrom, pos, ref, alt) + """ + # Regular expression for the format "chrom:g.posRef>Alt" + pattern = r'^(?P\w+):g\.(?P\d+)(?P\w+)>(?P\w+)$' + match = re.match(pattern, genomic_change) + if not match: + raise ValueError(f"Format not recognized: {genomic_change}") + + # Get values and format as needed + chrom = f"chr{match.group('chrom')}" + pos = int(match.group('pos')) + ref = match.group('ref') + alt = match.group('alt') + return chrom, pos, ref, alt + def get_annotations_vcf(vcf_fp, info_field_map): data_vcf = dict() @@ -432,10 +430,11 @@ def get_annotations_vcf(vcf_fp, info_field_map): def get_annotation_entry_tsv(record, info_field_map): - # Set lookup key; PCGR/CPSR strips leading 'chr' from contig names - chrom = f'chr{record["CHROM"]}' - pos = int(record['POS']) - key = (chrom, pos, record['REF'], record['ALT']) + # If GENOMIC_CHANGE is present, parse it for coordinates; otherwise, use separate fields. + if "GENOMIC_CHANGE" in record and record["GENOMIC_CHANGE"]: + chrom, pos, ref, alt = parse_genomic_change(record["GENOMIC_CHANGE"]) + + key = (chrom, pos, ref, alt) record_ann = dict() for info_dst, info_src in info_field_map.items(): diff --git a/bolt/workflows/smlv_somatic/annotate.py b/bolt/workflows/smlv_somatic/annotate.py index f163d5f..687f17f 100644 --- a/bolt/workflows/smlv_somatic/annotate.py +++ b/bolt/workflows/smlv_somatic/annotate.py @@ -43,6 +43,7 @@ def entry(ctx, **kwargs): # Create output directory output_dir = pathlib.Path(kwargs['output_dir']) output_dir.mkdir(mode=0o755, parents=True, exist_ok=True) + pcgr_dir = output_dir / 'pcgr/' # Set all FILTER="." to FILTER="PASS" as required by PURPLE filter_pass_fp = set_filter_pass(kwargs['vcf_fp'], kwargs['tumor_name'], output_dir) @@ -80,10 +81,8 @@ def entry(ctx, **kwargs): # - PCGR ACMG TIER [INFO/PCGR_TIER] # - VEP consequence [INFO/PCR_CSQ] # - Known mutation hotspot [INFO/PCGR_MUTATION_HOTSPOT] - # - ClinVar clinical significant [INFO/PCGR_CLINVAR_CLNSIG] - # - Hits in COSMIC [INFO/PCGR_COSMIC_COUNT] + # - ClinVar clinical significant [INFO/PCGR_CLNSIG] # - Hits in TCGA [INFO/PCGR_TCGA_PANCANCER_COUNT] - # - Hits in PCAWG [INFO/PCGR_ICGC_PCAWG_COUNT] # Set selected data or full input selection_data = select_variants( pon_fp, diff --git a/bolt/workflows/smlv_somatic/filter.py b/bolt/workflows/smlv_somatic/filter.py index e46f285..3228419 100644 --- a/bolt/workflows/smlv_somatic/filter.py +++ b/bolt/workflows/smlv_somatic/filter.py @@ -181,7 +181,7 @@ def set_filter_data(record, tumor_index): # single CLINICAL_POTENTIAL_RESCUE flag # Get ClinVar clinical significance entries - clinvar_clinsig = record.INFO.get(constants.VcfInfo.PCGR_CLINVAR_CLNSIG.value, '') + clinvar_clinsig = record.INFO.get(constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION.value, '') clinvar_clinsigs = clinvar_clinsig.split(',') # Hit counts in relevant reference somatic mutation databases cosmic_count = record.INFO.get(constants.VcfInfo.PCGR_COSMIC_COUNT.value, 0) From 55f0f00e162ccdf96f982ab5fc8150bf218fb2f7 Mon Sep 17 00:00:00 2001 From: qclayssen Date: Tue, 15 Apr 2025 16:59:53 +1000 Subject: [PATCH 06/25] change filter remove COSMIC_COUN and ICGC_PCAWG and change name pcgr tier use --- bolt/workflows/smlv_somatic/annotate.py | 1 - bolt/workflows/smlv_somatic/filter.py | 14 +++++--------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/bolt/workflows/smlv_somatic/annotate.py b/bolt/workflows/smlv_somatic/annotate.py index 687f17f..385fcc7 100644 --- a/bolt/workflows/smlv_somatic/annotate.py +++ b/bolt/workflows/smlv_somatic/annotate.py @@ -43,7 +43,6 @@ def entry(ctx, **kwargs): # Create output directory output_dir = pathlib.Path(kwargs['output_dir']) output_dir.mkdir(mode=0o755, parents=True, exist_ok=True) - pcgr_dir = output_dir / 'pcgr/' # Set all FILTER="." to FILTER="PASS" as required by PURPLE filter_pass_fp = set_filter_pass(kwargs['vcf_fp'], kwargs['tumor_name'], output_dir) diff --git a/bolt/workflows/smlv_somatic/filter.py b/bolt/workflows/smlv_somatic/filter.py index 3228419..ea8b3c0 100644 --- a/bolt/workflows/smlv_somatic/filter.py +++ b/bolt/workflows/smlv_somatic/filter.py @@ -37,7 +37,7 @@ def entry(ctx, **kwargs): constants.VcfFilter.ENCODE, constants.VcfFilter.GNOMAD_COMMON, constants.VcfInfo.SAGE_HOTSPOT_RESCUE, - constants.VcfInfo.PCGR_TIER_RESCUE, + constants.VcfInfo.PCGR_ACTIONABILITY_TIER_RESCUE, constants.VcfInfo.CLINICAL_POTENTIAL_RESCUE, constants.VcfInfo.RESCUED_FILTERS_EXISTING, constants.VcfInfo.RESCUED_FILTERS_PENDING, @@ -162,9 +162,9 @@ def set_filter_data(record, tumor_index): ## # PCGR tier rescue ## - pcgr_tier = record.INFO.get(constants.VcfInfo.PCGR_TIER.value) - if pcgr_tier in constants.PCGR_TIERS_RESCUE: - info_rescue.append(constants.VcfInfo.PCGR_TIER_RESCUE) + pcgr_tier = record.INFO.get(constants.VcfInfo.PCGR_ACTIONABILITY_TIER.value) + if pcgr_tier in constants.PCGR_ACTIONABILITY_TIER_RESCUE: + info_rescue.append(constants.VcfInfo.PCGR_ACTIONABILITY_TIER_RESCUE) ## # SAGE hotspot rescue @@ -184,16 +184,12 @@ def set_filter_data(record, tumor_index): clinvar_clinsig = record.INFO.get(constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION.value, '') clinvar_clinsigs = clinvar_clinsig.split(',') # Hit counts in relevant reference somatic mutation databases - cosmic_count = record.INFO.get(constants.VcfInfo.PCGR_COSMIC_COUNT.value, 0) tcga_pancancer_count = record.INFO.get(constants.VcfInfo.PCGR_TCGA_PANCANCER_COUNT.value, 0) - icgc_pcawg_count = record.INFO.get(constants.VcfInfo.PCGR_ICGC_PCAWG_COUNT.value, 0) if ( record.INFO.get(constants.VcfInfo.HMF_HOTSPOT.value) is not None or record.INFO.get(constants.VcfInfo.PCGR_MUTATION_HOTSPOT.value) is not None or any(e in clinvar_clinsigs for e in constants.CLINVAR_CLINSIGS_RESCUE) or - cosmic_count >= constants.MIN_COSMIC_COUNT_RESCUE or - tcga_pancancer_count >= constants.MIN_TCGA_PANCANCER_COUNT_RESCUE or - icgc_pcawg_count >= constants.MIN_ICGC_PCAWG_COUNT_RESCUE + tcga_pancancer_count >= constants.MIN_TCGA_PANCANCER_COUNT_RESCUE ): info_rescue.append(constants.VcfInfo.CLINICAL_POTENTIAL_RESCUE) From f381589477561d84dd7fba9554daa99a058b29af Mon Sep 17 00:00:00 2001 From: qclayssen Date: Tue, 15 Apr 2025 17:00:22 +1000 Subject: [PATCH 07/25] change test --- tests/test_smlv_somatic_filter.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_smlv_somatic_filter.py b/tests/test_smlv_somatic_filter.py index 41a0e96..caebea8 100644 --- a/tests/test_smlv_somatic_filter.py +++ b/tests/test_smlv_somatic_filter.py @@ -160,15 +160,15 @@ def test_common_population_filter(self): def test_pcgr_tier_rescue(self): pcgr_tiers = [ - 'TIER_1', - 'TIER_2', + '1', + '2', ] - rescue_tag_str = bolt_constants.VcfInfo.PCGR_TIER_RESCUE.value + rescue_tag_str = bolt_constants.VcfInfo.PCGR_ACTIONABILITY_TIER_RESCUE.value for pcgr_tier in pcgr_tiers: record = get_record( **self.records['filter_min_af9.9'], - info_data={'PCGR_TIER': pcgr_tier}, + info_data={'PCGR_ACTIONABILITY_TIER': pcgr_tier}, ) smlv_somatic_filter.set_filter_data(record, 0) assert not record.FILTER @@ -217,7 +217,7 @@ def test_clinical_potential_rescue_clinvar_clinsig(self): for clinsig in clinsigs: record = get_record( **self.records['filter_min_af9.9'], - info_data={'PCGR_CLINVAR_CLNSIG': clinsig}, + info_data={'PCGR_CLINVAR_CLASSIFICATION': clinsig}, ) smlv_somatic_filter.set_filter_data(record, 0) assert not record.FILTER From 642d6917bb4d0b95b8514e3090f6222eb66298d9 Mon Sep 17 00:00:00 2001 From: qclayssen Date: Fri, 27 Jun 2025 16:05:39 +1000 Subject: [PATCH 08/25] add input vep dir for pcgr 2.2.1 --- bolt/common/pcgr.py | 203 +++++++++++++++++++++++- bolt/workflows/smlv_germline/report.py | 2 + bolt/workflows/smlv_somatic/annotate.py | 14 ++ bolt/workflows/smlv_somatic/report.py | 4 +- 4 files changed, 218 insertions(+), 5 deletions(-) diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index 4a9346f..7ad39be 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -111,7 +111,7 @@ def get_minimal_header(input_fh): return '\n'.join([filetype_line, *chrom_lines, *format_lines, column_line]) -def run_somatic(input_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=None, pcgrr_conda=None, purity=None, ploidy=None, sample_id=None): +def run_somatic(input_fp, pcgr_refdata_dir, vep_dir, output_dir, chunk_nbr=None, threads=1, pcgr_conda=None, pcgrr_conda=None, purity=None, ploidy=None, sample_id=None): # NOTE(SW): Nextflow FusionFS v2.2.8 does not support PCGR output to S3; instead write to a # temporary directory outside of the FusionFS mounted directory then manually copy across @@ -125,7 +125,7 @@ def run_somatic(input_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=No command_args = [ f'--sample_id {sample_id}', f'--input_vcf {input_fp}', - f'--vep_dir {pcgr_refdata_dir}', + f'--vep_dir {vep_dir}', f'--refdata_dir {pcgr_refdata_dir}', f'--tumor_dp_tag TUMOR_DP', f'--tumor_af_tag TUMOR_AF', @@ -188,7 +188,7 @@ def run_somatic(input_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=No return pcgr_output_dir -def run_germline(input_fp, panel_fp, pcgr_refdata_dir, output_dir, threads=1, pcgr_conda=None, pcgrr_conda=None, sample_id=None): +def run_germline(input_fp, panel_fp, pcgr_refdata_dir, vep_dir, output_dir, threads=1, pcgr_conda=None, pcgrr_conda=None, sample_id=None): if not sample_id: sample_id = 'nosampleset' @@ -199,17 +199,22 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, output_dir, threads=1, pc temp_dir = tempfile.TemporaryDirectory() cpsr_output_dir = output_dir / 'cpsr/' + if cpsr_output_dir.exists(): + logger.warning(f"Output directory '{cpsr_output_dir}' already exists and will be overwritten") + shutil.rmtree(cpsr_output_dir) + command_args = [ f'--sample_id {sample_id}', f'--input_vcf {input_fp}', f'--genome_assembly grch38', f'--custom_list {panel_fp}', + f'--vep_dir {vep_dir}', + f'--refdata_dir {pcgr_refdata_dir}', # NOTE(SW): probably useful to add versioning information here; weigh against maintainence # burden f'--custom_list_name umccr_germline_panel', f'--pop_gnomad global', f'--classify_all', - f'--pcgr_dir {pcgr_refdata_dir}', f'--vcfanno_n_proc {threads}', f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane_plus_clinical,mane_select', ] @@ -483,3 +488,193 @@ def annotate_record(record, annotations, *, allow_missing=False): record.INFO[info_enum.value] = v return record + +def split_vcf(input_vcf, output_dir): + """ + Splits a VCF file into multiple chunks, each containing up to max_variants variants. + Each chunk includes the VCF header. + Ensures no overlapping positions between chunks. + """ + output_dir = pathlib.Path(output_dir / "vcf_chunks") + output_dir.mkdir(parents=True, exist_ok=True) + chunk_files = [] + chunk_number = 1 + variant_count = 0 + base_filename = pathlib.Path(input_vcf).stem + chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf" + base_filename = input_vcf.stem + chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf" + chunk_files.append(chunk_filename) + # Open the input VCF using cyvcf2 + vcf_in = cyvcf2.VCF(input_vcf) + # Create a new VCF file for the first chunk + vcf_out = cyvcf2.Writer(str(chunk_filename), vcf_in) + last_position = None + for record in vcf_in: + current_position = record.POS + # Check if we need to start a new chunk + if variant_count >= constants.MAX_SOMATIC_VARIANTS and (last_position is None or current_position != last_position): + # Close the current chunk file and start a new one + vcf_out.close() + chunk_number += 1 + chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf" + chunk_files.append(chunk_filename) + vcf_out = cyvcf2.Writer(str(chunk_filename), vcf_in) + variant_count = 0 + # Write the record to the current chunk + vcf_out.write_record(record) + variant_count += 1 + last_position = current_position + # Close the last chunk file + vcf_out.close() + vcf_in.close() + logger.info(f"VCF file split into {len(chunk_files)} chunks.") + return chunk_files + +def run_somatic_chunck(vcf_chunks, pcgr_data_dir, vep_dir, output_dir, pcgr_output_dir, max_threads, pcgr_conda, pcgrr_conda): + pcgr_tsv_files = [] + pcgr_vcf_files = [] + num_chunks = len(vcf_chunks) + # Ensure we don't use more workers than available threads, and each worker has at least 2 threads + max_workers = min(num_chunks, max_threads // 2) + threads_quot, threads_rem = divmod(max_threads, num_chunks) + threads_per_chunk = max(2, threads_quot) + # Limit the number of workers to the smaller of num_chunks or max_threads // 2 + with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor: + futures = {} + for chunk_number, vcf_file in enumerate(vcf_chunks, start=1): + # Assign extra thread to the first 'threads_rem' chunks + additional_thread = 1 if chunk_number <= threads_rem else 0 + total_threads = threads_per_chunk + additional_thread + futures[executor.submit(run_somatic, vcf_file, pcgr_data_dir, vep_dir, pcgr_output_dir, chunk_number, total_threads, pcgr_conda, pcgrr_conda)] = chunk_number + for future in concurrent.futures.as_completed(futures): + try: + pcgr_tsv_fp, pcgr_vcf_fp = future.result() + if pcgr_tsv_fp: + pcgr_tsv_files.append(pcgr_tsv_fp) + if pcgr_vcf_fp: + pcgr_vcf_files.append(pcgr_vcf_fp) + except Exception as e: + print(f"Exception occurred: {e}") + merged_vcf_fp, merged_tsv_fp = merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_files) + return merged_tsv_fp, merged_vcf_fp + +def merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_fp): + pcgr_dir = pathlib.Path(output_dir) / 'pcgr' + pcgr_dir.mkdir(exist_ok=True) + + # Merge all TSV files into a single file in the pcgr directory + merged_tsv_fp = pcgr_dir / "nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv" + util.merge_tsv_files(pcgr_tsv_fp, merged_tsv_fp) + + # Step 5: Merge all VCF files into a single file in the pcgr directory + merged_vcf_path = pcgr_dir / "nosampleset.pcgr_acmg.grch38" + merged_vcf = util.merge_vcf_files(pcgr_vcf_files, merged_vcf_path) + + return merged_vcf, merged_tsv_fp + + +def get_variant_filter_data(variant): + attribute_names = ( + 'tier', + 'difficult', + 'giab_conf', + 'intergenic', + 'intronic', + 'downstream', + 'upstream', + 'impacts_other', + ) + + data = {e: None for e in attribute_names} + + + data['tier'] = variant.INFO['PCGR_TIER'] + + + info_keys = [k for k, v in variant.INFO] + + data['difficult'] = any(e.startswith('DIFFICULT') for e in info_keys) + data['giab_conf'] = 'GIAB_CONF' in info_keys + + + # NOTE(SW): GIAB_CONF always overrides DIFFICULT tags + if data['giab_conf'] and data['difficult']: + data['difficult']= False + + + for impact in get_impacts(variant.INFO['PCGR_CSQ']): + if impact == 'intergenic_variant': + data['intergenic'] = True + elif impact == 'intron_variant': + data['intronic'] = True + elif impact == 'downstream_gene_variant': + data['downstream'] = True + elif impact == 'upstream_gene_variant': + data['upstream'] = True + elif impact: + data['impacts_other'] = True + else: + assert False + + return data + + +def get_impacts(csq_str_full): + impacts = set() + for csq_str in csq_str_full.split(','): + csq_tokens = csq_str.split('|') + impact_str = csq_tokens[1] + impacts.update(impact_str.split('&')) + return impacts + + +def determine_filter(data): + + for impact, region in get_ordering(tiers=False): + + # NOTE(SW): this is less efficient than nested loops since the outer block is reevaluated + # within what would be the inner loop each cycle; taking this route for cleaner code + + impacts_higher = get_impacts_higher(impact) + impact_filter = bool(data[impact]) and not any(bool(data[e]) for e in impacts_higher) + + region_filter = False + if region == 'none': + region_filter = not (data['difficult'] or data['giab_conf']) + else: + region_filter = data[region] + + if impact_filter and region_filter: + return (impact, region) + + return False + + +def get_variant_repr(variant): + return (variant.CHROM, variant.POS, variant.REF, tuple(variant.ALT)) + + +@functools.cache +def get_ordering(tiers=True, impacts=True, regions=True): + categories = [ + constants.PCGR_TIERS_FILTERING if tiers else None, + constants.VEP_IMPACTS_FILTER if impacts else None, + constants.GENOMIC_REGIONS_FILTERING if regions else None, + ] + + # NOTE(SW): I'm not aware of any noncoding impacts for TIER_[1-4] other than TERT but keeping + # in to be overly cautious + ordering_iter = itertools.product(*(c for c in categories if c)) + + return tuple(ordering_iter) + + +@functools.cache +def get_impacts_higher(impact): + impact_index = constants.VEP_IMPACTS_FILTER.index(impact) + if impact_index + 1 < len(constants.VEP_IMPACTS_FILTER): + impacts_higher = constants.VEP_IMPACTS_FILTER[impact_index+1:len(constants.VEP_IMPACTS_FILTER)] + else: + impacts_higher = list() + return impacts_higher diff --git a/bolt/workflows/smlv_germline/report.py b/bolt/workflows/smlv_germline/report.py index 0742c19..8248a19 100644 --- a/bolt/workflows/smlv_germline/report.py +++ b/bolt/workflows/smlv_germline/report.py @@ -22,6 +22,7 @@ @click.option('--germline_panel_list_fp', required=True, type=click.Path(exists=True)) @click.option('--pcgr_data_dir', required=True, type=click.Path(exists=True)) +@click.option('--vep_dir', required=True, type=click.Path(exists=True)) @click.option('--threads', required=True, type=int, default=1) @@ -69,6 +70,7 @@ def entry(ctx, **kwargs): cpsr_prep_fp, kwargs['germline_panel_list_fp'], kwargs['pcgr_data_dir'], + kwargs['vep_dir'], output_dir, threads=kwargs['threads'], pcgr_conda=kwargs['pcgr_conda'], diff --git a/bolt/workflows/smlv_somatic/annotate.py b/bolt/workflows/smlv_somatic/annotate.py index 385fcc7..543755d 100644 --- a/bolt/workflows/smlv_somatic/annotate.py +++ b/bolt/workflows/smlv_somatic/annotate.py @@ -23,6 +23,7 @@ @click.option('--pon_dir', required=True, type=click.Path(exists=True)) @click.option('--pcgr_data_dir', required=True, type=click.Path(exists=True)) +@click.option('--vep_dir', required=True, type=click.Path(exists=True)) @click.option('--pcgr_conda', required=False, type=str) @click.option('--pcgrr_conda', required=False, type=str) @@ -105,7 +106,20 @@ def entry(ctx, **kwargs): pcgr_dir = pcgr.run_somatic( pcgr_prep_fp, kwargs['pcgr_data_dir'], + kwargs['vep_dir'], output_dir, + pcgr_output_dir, + kwargs['threads'], + kwargs['pcgr_conda'], + kwargs['pcgrr_conda'] + ) + else: + pcgr_tsv_fp, pcgr_vcf_fp = pcgr.run_somatic( + pcgr_prep_fp, + kwargs['pcgr_data_dir'], + kwargs['vep_dir'], + pcgr_output_dir, + chunk_nbr=None, threads=kwargs['threads'], pcgr_conda=kwargs['pcgr_conda'], pcgrr_conda=kwargs['pcgrr_conda'], diff --git a/bolt/workflows/smlv_somatic/report.py b/bolt/workflows/smlv_somatic/report.py index bdf908f..bf17ffb 100644 --- a/bolt/workflows/smlv_somatic/report.py +++ b/bolt/workflows/smlv_somatic/report.py @@ -27,6 +27,7 @@ @click.option('--pcgrr_conda', required=False, type=str) @click.option('--pcgr_data_dir', required=False, type=str) +@click.option('--vep_dir', required=True, type=click.Path(exists=True)) @click.option('--purple_purity_fp', required=True, type=click.Path(exists=True)) @click.option('--cancer_genes_fp', required=True, type=click.Path(exists=True)) @@ -117,7 +118,8 @@ def entry(ctx, **kwargs): pcgr.run_somatic( pcgr_prep_fp, kwargs['pcgr_data_dir'], - output_dir, + kwargs['vep_dir'], + pcgr_output_dir, threads=kwargs['threads'], pcgr_conda=kwargs['pcgr_conda'], pcgrr_conda=kwargs['pcgrr_conda'], From a7a36ea83dedc41e5ccf966d0b727b1c740d8b19 Mon Sep 17 00:00:00 2001 From: qclayssen Date: Tue, 1 Jul 2025 13:14:05 +1000 Subject: [PATCH 09/25] update CPSR-related constants and file naming conventions for pcgr v2.1 --- bolt/common/constants.py | 31 ++++++++----------------------- bolt/common/pcgr.py | 11 ++++------- 2 files changed, 12 insertions(+), 30 deletions(-) diff --git a/bolt/common/constants.py b/bolt/common/constants.py index 8aa8608..1f84f51 100644 --- a/bolt/common/constants.py +++ b/bolt/common/constants.py @@ -85,8 +85,6 @@ class VcfInfo(enum.Enum): PCGR_TCGA_PANCANCER_COUNT = 'PCGR_TCGA_PANCANCER_COUNT' PCGR_ICGC_PCAWG_COUNT = 'PCGR_ICGC_PCAWG_COUNT' - CPSR_FINAL_CLASSIFICATION = 'CPSR_FINAL_CLASSIFICATION' - CPSR_PATHOGENICITY_SCORE = 'CPSR_PATHOGENICITY_SCORE' CPSR_CLINVAR_CLASSIFICATION = 'CPSR_CLINVAR_CLASSIFICATION' CPSR_CSQ = 'CPSR_CSQ' @@ -277,23 +275,10 @@ def namespace(self): 'Description': 'Count of ICGC PCAWG hits', }, - VcfInfo.CPSR_FINAL_CLASSIFICATION: { - 'Number': '1', - 'Type': 'String', - 'Description': ( - 'Final variant classification based on the combination of CLINVAR_CLASSIFICTION (for ' - 'ClinVar-classified variants), and CPSR_CLASSIFICATION (for novel variants)' - ), - }, - VcfInfo.CPSR_PATHOGENICITY_SCORE: { - 'Number': '1', - 'Type': 'Float', - 'Description': 'Aggregated CPSR pathogenicity score', - }, VcfInfo.CPSR_CLINVAR_CLASSIFICATION: { 'Number': '1', 'Type': 'String', - 'Description': 'Clinical significance of variant on a five-tiered scale', + 'Description': 'ClinVar - Overall clinical significance of variant on a five-tiered scale', }, VcfInfo.CPSR_CSQ: { 'Number': '.', @@ -302,13 +287,13 @@ def namespace(self): 'Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|' 'Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|' 'Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|' - 'FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|MANE_SELECT|' - 'MANE_PLUS_CLINICAL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|UNIPROT_ISOFORM|RefSeq|' - 'DOMAINS|HGVS_OFFSET|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|gnomAD_AF|gnomAD_AFR_AF|' - 'gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|' - 'gnomAD_SAS_AF|CLIN_SIG|SOMATIC|PHENO|CHECK_REF|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|' - 'MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS|NearestExonJB|LoF|LoF_filter|LoF_flags|' - 'LoF_info' + 'FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|MANE|MANE_SELECT|' + 'MANE_PLUS_CLINICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|UNIPROT_ISOFORM|RefSeq|' + 'DOMAINS|HGVS_OFFSET|gnomADe_AF|gnomADe_AFR_AF|gnomADe_AMR_AF|gnomADe_ASJ_AF|' + 'gnomADe_EAS_AF|gnomADe_FIN_AF|gnomADe_MID_AF|gnomADe_NFE_AF|gnomADe_REMAINING_AF|' + 'gnomADe_SAS_AF|CLIN_SIG|SOMATIC|PHENO|CHECK_REF|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|' + 'MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS|NearestExonJB|MaxEntScan_alt|MaxEntScan_diff|' + 'MaxEntScan_ref' ), }, diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index 7ad39be..b017aa4 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -293,14 +293,13 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou def transfer_annotations_germline(input_fp, normal_name, cpsr_dir, output_dir): # Set destination INFO field names and source TSV fields + # Note: Only include fields that exist in CPSR v2.2.1 output info_field_map = { - constants.VcfInfo.CPSR_FINAL_CLASSIFICATION: 'FINAL_CLASSIFICATION', - constants.VcfInfo.CPSR_PATHOGENICITY_SCORE: 'CPSR_PATHOGENICITY_SCORE', constants.VcfInfo.CPSR_CLINVAR_CLASSIFICATION: 'CLINVAR_CLASSIFICATION', constants.VcfInfo.CPSR_CSQ: 'CSQ', } - cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.snv_indel_ann.tsv.gzv' + cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.classification.tsv.gz' cpsr_vcf_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.vcf.gz' # Enforce matching defined and source INFO annotations @@ -312,8 +311,6 @@ def transfer_annotations_germline(input_fp, normal_name, cpsr_dir, output_dir): # Open filehandles, set required header entries input_fh = cyvcf2.VCF(input_fp) - util.add_vcf_header_entry(input_fh, constants.VcfInfo.CPSR_FINAL_CLASSIFICATION) - util.add_vcf_header_entry(input_fh, constants.VcfInfo.CPSR_PATHOGENICITY_SCORE) util.add_vcf_header_entry(input_fh, constants.VcfInfo.CPSR_CLINVAR_CLASSIFICATION) util.add_vcf_header_entry(input_fh, constants.VcfInfo.CPSR_CSQ) @@ -353,7 +350,7 @@ def check_annotation_headers(info_field_map, vcf_fp): def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map): # Gather all annotations from TSV data_tsv = dict() - with gzip.open(tsv_fp, 'rt') as tsv_fh: + with open(tsv_fp, 'r') as tsv_fh: for record in csv.DictReader(tsv_fh, delimiter='\t'): key, record_ann = get_annotation_entry_tsv(record, info_field_map) assert key not in data_tsv @@ -376,7 +373,7 @@ def collect_cpsr_annotation_data(tsv_fp, vcf_fp, info_field_map): # Gather annotations from TSV data_tsv = dict() gdot_re = re.compile('^(?P[\dXYM]+):g\.(?P\d+)(?P[A-Z]+)>(?P[A-Z]+)$') - with open(tsv_fp, 'r') as tsv_fh: + with gzip.open(tsv_fp, 'rt') as tsv_fh: for record in csv.DictReader(tsv_fh, delimiter='\t'): # Decompose CPSR 'GENOMIC_CHANGE' field into CHROM, POS, REF, and ALT re_result = gdot_re.match(record['GENOMIC_CHANGE']) From 16118696d5961af6c831111ddbe1541a758f3e72 Mon Sep 17 00:00:00 2001 From: qclayssen Date: Tue, 1 Jul 2025 17:59:02 +1000 Subject: [PATCH 10/25] remove unused PCGR_ACTIONABILITY_TIER header entry and update TSV file handling --- bolt/common/pcgr.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index b017aa4..f6fc600 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -268,7 +268,6 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou # Open filehandles, set required header entries input_fh = cyvcf2.VCF(input_fp) - util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_ACTIONABILITY_TIER) util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CSQ) util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_MUTATION_HOTSPOT) util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION) @@ -350,15 +349,11 @@ def check_annotation_headers(info_field_map, vcf_fp): def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map): # Gather all annotations from TSV data_tsv = dict() - with open(tsv_fp, 'r') as tsv_fh: + with gzip.open(tsv_fp, 'rt') as tsv_fh: for record in csv.DictReader(tsv_fh, delimiter='\t'): key, record_ann = get_annotation_entry_tsv(record, info_field_map) assert key not in data_tsv - # Process PCGR_ACTIONABILITY_TIER - # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING - record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER'] - # Store annotation data data_tsv[key] = record_ann @@ -436,6 +431,9 @@ def get_annotation_entry_tsv(record, info_field_map): if "GENOMIC_CHANGE" in record and record["GENOMIC_CHANGE"]: chrom, pos, ref, alt = parse_genomic_change(record["GENOMIC_CHANGE"]) + if not chrom.startswith('chr'): + chrom = f'chr{chrom}' + key = (chrom, pos, ref, alt) record_ann = dict() From 64c3486cd14390316c9e565b060efe8a8da969d3 Mon Sep 17 00:00:00 2001 From: qclayssen Date: Thu, 3 Jul 2025 10:46:01 +1000 Subject: [PATCH 11/25] add back PCGR_ACTIONABILITY_TIER --- bolt/common/pcgr.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index f6fc600..7297ad8 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -268,6 +268,7 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou # Open filehandles, set required header entries input_fh = cyvcf2.VCF(input_fp) + util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_ACTIONABILITY_TIER) util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CSQ) util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_MUTATION_HOTSPOT) util.add_vcf_header_entry(input_fh, constants.VcfInfo.PCGR_CLINVAR_CLASSIFICATION) @@ -354,6 +355,11 @@ def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map): key, record_ann = get_annotation_entry_tsv(record, info_field_map) assert key not in data_tsv + + # Process PCGR_ACTIONABILITY_TIER + # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING + record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER'] + # Store annotation data data_tsv[key] = record_ann From ecede4b97ac442cc7336736f165ac6ed8164359e Mon Sep 17 00:00:00 2001 From: qclayssen Date: Mon, 21 Jul 2025 10:14:19 +1000 Subject: [PATCH 12/25] Allow missing pcgr annotation for somatic record --- bolt/common/pcgr.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index 7297ad8..f59ecbc 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -183,9 +183,18 @@ def run_somatic(input_fp, pcgr_refdata_dir, vep_dir, output_dir, chunk_nbr=None, util.execute_command(command) - shutil.copytree(temp_dir.name, pcgr_output_dir) + shutil.copytree(temp_dir.name, output_dir) - return pcgr_output_dir + pcgr_tsv_fp = pathlib.Path(output_dir) / f'{sample_id}.pcgr.grch38.snv_indel_ann.tsv.gz' + pcgr_vcf_fp = pathlib.Path(output_dir) / f'{sample_id}.pcgr.grch38.pass.vcf.gz' + + # Check if both files exist + if not pcgr_tsv_fp.exists(): + raise FileNotFoundError(f"Expected file {pcgr_tsv_fp} not found.") + if not pcgr_vcf_fp.exists(): + raise FileNotFoundError(f"Expected file {pcgr_vcf_fp} not found.") + + return pcgr_tsv_fp, pcgr_vcf_fp def run_germline(input_fp, panel_fp, pcgr_refdata_dir, vep_dir, output_dir, threads=1, pcgr_conda=None, pcgrr_conda=None, sample_id=None): @@ -287,7 +296,7 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou output_fh.write_record(record) continue # Annotate and write - record_ann = annotate_record(record, pcgr_data) + record_ann = annotate_record(record, pcgr_data, allow_missing=True) output_fh.write_record(record_ann) @@ -502,7 +511,6 @@ def split_vcf(input_vcf, output_dir): chunk_number = 1 variant_count = 0 base_filename = pathlib.Path(input_vcf).stem - chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf" base_filename = input_vcf.stem chunk_filename = output_dir / f"{base_filename}_chunk{chunk_number}.vcf" chunk_files.append(chunk_filename) From 33c4bf945cd9bec0c836c1eebf6c65c6e5e82abd Mon Sep 17 00:00:00 2001 From: qclayssen Date: Wed, 23 Jul 2025 09:43:44 +1000 Subject: [PATCH 13/25] remove PCGR temp dir made for fusion we no longer use --- bolt/common/pcgr.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index f59ecbc..87284e7 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -113,11 +113,13 @@ def get_minimal_header(input_fh): def run_somatic(input_fp, pcgr_refdata_dir, vep_dir, output_dir, chunk_nbr=None, threads=1, pcgr_conda=None, pcgrr_conda=None, purity=None, ploidy=None, sample_id=None): - # NOTE(SW): Nextflow FusionFS v2.2.8 does not support PCGR output to S3; instead write to a - # temporary directory outside of the FusionFS mounted directory then manually copy across - temp_dir = tempfile.TemporaryDirectory() - pcgr_output_dir = output_dir / 'pcgr/' + output_dir = output_dir / f"pcgr_{chunk_nbr}" if chunk_nbr is not None else output_dir + + if output_dir.exists(): + logger.warning(f"Output directory '{output_dir}' already exists and will be overwritten") + shutil.rmtree(output_dir) + if not sample_id: sample_id = 'nosampleset' @@ -164,7 +166,7 @@ def run_somatic(input_fp, pcgr_refdata_dir, vep_dir, output_dir, chunk_nbr=None, command_args.append(f'--tumor_ploidy {ploidy}') # NOTE(SW): placed here to always have output directory last - command_args.append(f'--output_dir {temp_dir.name}') + command_args.append(f'--output_dir {output_dir}') delimiter_padding = ' ' * 10 delimiter = f' \\\n{delimiter_padding}' @@ -181,9 +183,11 @@ def run_somatic(input_fp, pcgr_refdata_dir, vep_dir, output_dir, chunk_nbr=None, command_formatting = '\n' + ' ' * 4 command = command_formatting + command_conda + command - util.execute_command(command) + # Log file path + log_file_path = output_dir / "run_somatic.log" - shutil.copytree(temp_dir.name, output_dir) + # Run the command and redirect output to the log file + util.execute_command(command, log_file_path=log_file_path) pcgr_tsv_fp = pathlib.Path(output_dir) / f'{sample_id}.pcgr.grch38.snv_indel_ann.tsv.gz' pcgr_vcf_fp = pathlib.Path(output_dir) / f'{sample_id}.pcgr.grch38.pass.vcf.gz' @@ -202,10 +206,7 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, vep_dir, output_dir, thre if not sample_id: sample_id = 'nosampleset' - # NOTE(SW): Nextflow FusionFS v2.2.8 does not support PCGR output to S3; instead write to a - # temporary directory outside of the FusionFS mounted directory then manually copy across - temp_dir = tempfile.TemporaryDirectory() cpsr_output_dir = output_dir / 'cpsr/' if cpsr_output_dir.exists(): @@ -232,7 +233,7 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, vep_dir, output_dir, thre command_args.append(f'--pcgrr_conda {pcgrr_conda}') # NOTE(SW): placed here to always have output directory last - command_args.append(f'--output_dir {temp_dir.name}') + command_args.append(f'--output_dir {cpsr_output_dir}') delimiter_padding = ' ' * 10 delimiter = f' \\\n{delimiter_padding}' @@ -251,8 +252,6 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, vep_dir, output_dir, thre util.execute_command(command) - shutil.copytree(temp_dir.name, cpsr_output_dir) - return cpsr_output_dir From d7e0e7344b461f1ce4306b7975ee59453ef460b1 Mon Sep 17 00:00:00 2001 From: qclayssen Date: Wed, 23 Jul 2025 15:36:26 +1000 Subject: [PATCH 14/25] creat pcgr output dir --- bolt/common/pcgr.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index 87284e7..0d53e30 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -120,6 +120,8 @@ def run_somatic(input_fp, pcgr_refdata_dir, vep_dir, output_dir, chunk_nbr=None, logger.warning(f"Output directory '{output_dir}' already exists and will be overwritten") shutil.rmtree(output_dir) + # Create output directory + output_dir.mkdir(parents=True, exist_ok=True) if not sample_id: sample_id = 'nosampleset' @@ -213,6 +215,9 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, vep_dir, output_dir, thre logger.warning(f"Output directory '{cpsr_output_dir}' already exists and will be overwritten") shutil.rmtree(cpsr_output_dir) + # Create output directory + cpsr_output_dir.mkdir(parents=True, exist_ok=True) + command_args = [ f'--sample_id {sample_id}', f'--input_vcf {input_fp}', From ef9fb08f85b30111c8ce140dc147ab57a70ededc Mon Sep 17 00:00:00 2001 From: qclayssen Date: Fri, 27 Jun 2025 12:45:14 +1000 Subject: [PATCH 15/25] update pcgr germline command --- bolt/common/pcgr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index 0d53e30..2c441de 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -223,7 +223,7 @@ def run_germline(input_fp, panel_fp, pcgr_refdata_dir, vep_dir, output_dir, thre f'--input_vcf {input_fp}', f'--genome_assembly grch38', f'--custom_list {panel_fp}', - f'--vep_dir {vep_dir}', + f'--vep_dir {pcgr_refdata_dir}', f'--refdata_dir {pcgr_refdata_dir}', # NOTE(SW): probably useful to add versioning information here; weigh against maintainence # burden @@ -269,8 +269,8 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou constants.VcfInfo.PCGR_CSQ: 'CSQ', } - pcgr_tsv_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.snv_indel_ann.tsv.gz' - pcgr_vcf_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.vcf.gz' + pcgr_tsv_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr.grch38.snv_indel_ann.tsv.gz' + pcgr_vcf_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr.grch38.vcf.gz' # Enforce matching defined and source INFO annotations check_annotation_headers(info_field_map, pcgr_vcf_fp) From 9d695b39da1c94bc1b5cc47c29d57adfeea10826 Mon Sep 17 00:00:00 2001 From: qclayssen Date: Fri, 25 Jul 2025 09:56:49 +1000 Subject: [PATCH 16/25] change vep #fork to recommanded one "Number of forks that VEP can use during annotation must be above 1 and not more than 8 (recommended is 4)" --- bolt/common/pcgr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index 2c441de..e8d5a43 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -142,6 +142,7 @@ def run_somatic(input_fp, pcgr_refdata_dir, vep_dir, output_dir, chunk_nbr=None, f'--estimate_tmb', #f'--show_noncoding', f'--vcfanno_n_proc {threads}', + f'--vep_n_forks 4', f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane_plus_clinical,mane_select', ] From 2e6c3eaea2400a7d85794edd614be7b85fe362b2 Mon Sep 17 00:00:00 2001 From: qclayssen Date: Thu, 17 Jul 2025 11:17:44 +1000 Subject: [PATCH 17/25] Fix I/O compress file and add logs --- bolt/common/pcgr.py | 3 +- bolt/util.py | 95 +++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 93 insertions(+), 5 deletions(-) diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index e8d5a43..81f2c6d 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -364,6 +364,7 @@ def check_annotation_headers(info_field_map, vcf_fp): def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map): # Gather all annotations from TSV data_tsv = dict() + logger.info(f"Collecting PCGR annotations from {tsv_fp} and {vcf_fp}") with gzip.open(tsv_fp, 'rt') as tsv_fh: for record in csv.DictReader(tsv_fh, delimiter='\t'): key, record_ann = get_annotation_entry_tsv(record, info_field_map) @@ -582,7 +583,7 @@ def merging_pcgr_files(output_dir, pcgr_vcf_files, pcgr_tsv_fp): util.merge_tsv_files(pcgr_tsv_fp, merged_tsv_fp) # Step 5: Merge all VCF files into a single file in the pcgr directory - merged_vcf_path = pcgr_dir / "nosampleset.pcgr_acmg.grch38" + merged_vcf_path = pcgr_dir / "nosampleset.pcgr.grch38.pass.vcf.gz" merged_vcf = util.merge_vcf_files(pcgr_vcf_files, merged_vcf_path) return merged_vcf, merged_tsv_fp diff --git a/bolt/util.py b/bolt/util.py index b7333d9..b99362c 100644 --- a/bolt/util.py +++ b/bolt/util.py @@ -1,3 +1,4 @@ +import gzip import pathlib import subprocess import sys @@ -95,7 +96,93 @@ def get_qualified_vcf_annotation(anno_enum): return f'{anno_enum.namespace}/{anno_enum.value}' -#def add_vcf_filter(record, filter_enum): -# existing_filters = [e for e in record.FILTERS if e != 'PASS'] -# assert filter_enum.value not in existing_filters -# return ';'.join([*existing_filters, filter_enum.value]) +def merge_tsv_files(tsv_files, merged_tsv_fp): + """ + Merges all TSV files into a single TSV. + """ + logger.info("Merging TSV files...") + with gzip.open(merged_tsv_fp, 'wt') as merged_tsv: + for i, tsv_file in enumerate(tsv_files): + with gzip.open(tsv_file, 'rt') as infile: + for line_number, line in enumerate(infile): + # Skip header except for the first file + if i > 0 and line_number == 0: + continue + merged_tsv.write(line) + logger.info(f"Merged TSV written to: {merged_tsv_fp}") + + +def merge_vcf_files(vcf_files, merged_vcf_fp): + """ + Merges multiple VCF files into a single sorted VCF file using bcftools. + + Parameters: + - vcf_files: List of paths to VCF files to be merged. + - merged_vcf_fp: Path to the output merged VCF file (without extension). + + Returns: + - Path to the sorted merged VCF file. + """ + logger.info("Merging VCF files...") + merged_vcf_fp = pathlib.Path(merged_vcf_fp) + merged_unsorted_vcf = merged_vcf_fp.with_suffix('.unsorted.vcf.gz') + merged_vcf = merged_vcf_fp.with_suffix('.vcf.gz') + + # Prepare the bcftools merge command arguments + command_args = [ + 'bcftools merge', + '-m all', + '-Oz', + f'-o {merged_unsorted_vcf}', + ] + [str(vcf_file) for vcf_file in vcf_files] + + # Format the command for readability + delimiter_padding = ' ' * 10 + delimiter = f' \\\n{delimiter_padding}' + command_args_str = delimiter.join(command_args) + + command = f''' + {command_args_str} + ''' + + # Run the bcftools merge command + logger.info("Running bcftools merge...") + execute_command(command) + logger.info(f"Merged VCF written to: {merged_unsorted_vcf}") + + # Sort the merged VCF file + sort_command_args = [ + 'bcftools sort', + '-Oz', + f'-o {merged_vcf}', + f'{merged_unsorted_vcf}' + ] + sort_command_args_str = delimiter.join(sort_command_args) + sort_command = f''' + {sort_command_args_str} + ''' + + logger.info("Sorting merged VCF file...") + execute_command(sort_command) + logger.info(f"Sorted merged VCF written to: {merged_vcf}") + + # Index the sorted merged VCF file + index_command_args = [ + 'bcftools index', + '-t', + f'{merged_vcf}' + ] + index_command_args_str = delimiter.join(index_command_args) + index_command = f''' + {index_command_args_str} + ''' + + logger.info("Indexing sorted merged VCF file...") + execute_command(index_command) + logger.info(f"Indexed merged VCF file: {merged_vcf}.tbi") + + # Optionally, remove the unsorted merged VCF file + if merged_unsorted_vcf.exists(): + merged_unsorted_vcf.unlink() + + return merged_vcf \ No newline at end of file From 94d4875d73d6da9aabbc212498bd9faf290d8468 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Wed, 22 Jan 2025 15:10:49 +1100 Subject: [PATCH 18/25] Add input variant filter for final PCGR report --- bolt/common/constants.py | 50 ++++++++++++++++ bolt/common/pcgr.py | 3 + bolt/workflows/smlv_somatic/report.py | 82 ++++++++++++++++++++++++++- 3 files changed, 134 insertions(+), 1 deletion(-) diff --git a/bolt/common/constants.py b/bolt/common/constants.py index 1f84f51..2b21ee1 100644 --- a/bolt/common/constants.py +++ b/bolt/common/constants.py @@ -41,6 +41,43 @@ } +################################ +## Hypermutated report filter ## +################################ +PCGR_TIERS_FILTERING = ( + 'TIER_1', + 'TIER_2', + 'TIER_3', + 'TIER_4', + 'NONCODING', +) + +VEP_IMPACTS_FILTER = ( + 'intergenic', + 'intronic', + 'downstream', + 'upstream', + 'impacts_other', +) + +GENOMIC_REGIONS_FILTERING = ( + 'difficult', + 'none', + 'giab_conf', +) + +HOTSPOT_FIELDS_FILTERING = ( + 'SAGE_HOTSPOT', + 'hotspot', + 'PCGR_MUTATION_HOTSPOT', +) + +RETAIN_FIELDS_FILTERING = ( + 'PANEL', + *HOTSPOT_FIELDS_FILTERING, +) + + ################################################## ## VCF FILTER tags and FORMAT, INFO annotations ## ################################################## @@ -61,6 +98,8 @@ class VcfFilter(enum.Enum): ENCODE = 'ENCODE' GNOMAD_COMMON = 'gnomAD_common' + PCGR_COUNT_LIMIT = 'PCGR_count_limit' + @property def namespace(self): return 'FILTER' @@ -119,6 +158,8 @@ class VcfInfo(enum.Enum): RESCUED_FILTERS_EXISTING = 'RESCUED_FILTERS_EXISTING' RESCUED_FILTERS_PENDING = 'RESCUED_FILTERS_PENDING' + PANEL = 'PANEL' + @property def namespace(self): return 'INFO' @@ -185,6 +226,9 @@ def namespace(self): 'Description': f'gnomAD AF >= {MAX_GNOMAD_AF}', }, + VcfFilter.PCGR_COUNT_LIMIT: { + 'Description': 'Manually filtered to meet PCGR 500,000 variant limit', + }, # INFO VcfInfo.TUMOR_AF: { @@ -336,6 +380,12 @@ def namespace(self): 'Description': 'Filters pending prior to variant rescue', }, + VcfInfo.PANEL: { + 'Number': '0', + 'Type': 'Flag', + 'Description': 'UMCCR somatic panel CDS (2,000 bp padding)', + }, + # FORMAT VcfFormat.SAGE_AD: { diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index 81f2c6d..6c4e26d 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -1,4 +1,7 @@ +import collections import csv +import functools +import itertools import pathlib import re import shutil diff --git a/bolt/workflows/smlv_somatic/report.py b/bolt/workflows/smlv_somatic/report.py index bf17ffb..9f68c3a 100644 --- a/bolt/workflows/smlv_somatic/report.py +++ b/bolt/workflows/smlv_somatic/report.py @@ -1,3 +1,4 @@ +import collections import csv import json import pathlib @@ -106,10 +107,20 @@ def entry(ctx, **kwargs): fh.write('\n') # PCGR report + if variant_counts_process['filter_pass'] <= constants.MAX_SOMATIC_VARIANTS: + pcgr_input_vcf_fp = kwargs['vcf_fp'] + else: + pcgr_input_vcf_fp = select_pcgr_variants( + kwargs['vcf_fp'], + kwargs['cancer_genes_fp'], + kwargs['tumor_name'], + output_dir, + ) + purple_data = parse_purple_purity_file(kwargs['purple_purity_fp']) pcgr_prep_fp = pcgr.prepare_vcf_somatic( - kwargs['vcf_fp'], + pcgr_input_vcf_fp, kwargs['tumor_name'], kwargs['normal_name'], output_dir, @@ -286,6 +297,75 @@ def count_variant_process(vcf_fp): return counts +def select_pcgr_variants(vcf_fp, cancer_genes_fp, tumor_name, output_dir): + # Annotate variants in UMCCR somatic gene panel + fp_annotated_out = output_dir / f'{tumor_name}.umccr_panel_variants_annotated.vcf.gz' + util.execute_command(fr''' + bcftools annotate \ + --annotations <(awk 'BEGIN {{ OFS="\t" }} {{ print $1, $2-2000, $3+2000, "1" }}' {cancer_genes_fp}) \ + --header-line '{util.get_vcf_header_line(constants.VcfInfo.PANEL)}' \ + --columns CHROM,FROM,TO,{constants.VcfInfo.PANEL.value} \ + --output {fp_annotated_out} \ + {vcf_fp} + ''') + + # Set filter category for each variant + variants_sorted = collections.defaultdict(list) + for variant_count, variant in enumerate(cyvcf2.VCF(fp_annotated_out), 1): + if any(variant.INFO.get(e) for e in constants.RETAIN_FIELDS_FILTERING): + continue + + data = pcgr.get_variant_filter_data(variant) + variant_filter = pcgr.determine_filter(data) + assert variant_filter + + filter_category = (data['tier'], *variant_filter) + variant_repr = pcgr.get_variant_repr(variant) + variants_sorted[filter_category].append(variant_repr) + + + # Determine the set of filter categories to come under the PCGR 500,000 variant threshold + filter_sum = 0 + filter_categories = list() + for key in pcgr.get_ordering(): + + if (variant_count - filter_sum) <= constants.MAX_SOMATIC_VARIANTS: + break + + filter_sum += len(variants_sorted.get(key, [])) + filter_categories.append(key) + + # Set FILTERS and write out records + filter_variants = set() + for key in filter_categories: + filter_variants.update(variants_sorted[key]) + + fh_in = cyvcf2.VCF(vcf_fp) + util.add_vcf_header_entry(fh_in, constants.VcfFilter.PCGR_COUNT_LIMIT) + + # NOTE(SW): creating an additional VCF with all records for traceability + fp_out = output_dir / f'{tumor_name}.pcgr_hypermutated.pass.vcf.gz' + fp_set_out = output_dir / f'{tumor_name}.pcgr_hypermutated.filters_set.vcf.gz' + + fh_out = cyvcf2.Writer(fp_out, fh_in) + fh_set_out = cyvcf2.Writer(fp_set_out, fh_in) + + for variant in fh_in: + variant_repr = pcgr.get_variant_repr(variant) + if variant_repr not in filter_variants: + # Write only passing + fh_out.write_record(variant) + else: + variant.FILTER = constants.VcfFilter.PCGR_COUNT_LIMIT.value + # Write all variants including those with FILTER set + fh_set_out.write_record(variant) + + fh_out.close() + fh_set_out.close() + + return fp_out + + def parse_purple_purity_file(fp): with open(fp, 'r') as fh: entries = list(csv.DictReader(fh, delimiter='\t')) From d6e8b53610d040c07d0d47663ade0ae5fe710ae5 Mon Sep 17 00:00:00 2001 From: qclayssen Date: Tue, 15 Apr 2025 14:22:20 +1000 Subject: [PATCH 19/25] uptade pcgr tier --- bolt/common/constants.py | 39 +-------------------------------------- bolt/common/pcgr.py | 10 ++++++---- 2 files changed, 7 insertions(+), 42 deletions(-) diff --git a/bolt/common/constants.py b/bolt/common/constants.py index 2b21ee1..09ad0de 100644 --- a/bolt/common/constants.py +++ b/bolt/common/constants.py @@ -41,43 +41,6 @@ } -################################ -## Hypermutated report filter ## -################################ -PCGR_TIERS_FILTERING = ( - 'TIER_1', - 'TIER_2', - 'TIER_3', - 'TIER_4', - 'NONCODING', -) - -VEP_IMPACTS_FILTER = ( - 'intergenic', - 'intronic', - 'downstream', - 'upstream', - 'impacts_other', -) - -GENOMIC_REGIONS_FILTERING = ( - 'difficult', - 'none', - 'giab_conf', -) - -HOTSPOT_FIELDS_FILTERING = ( - 'SAGE_HOTSPOT', - 'hotspot', - 'PCGR_MUTATION_HOTSPOT', -) - -RETAIN_FIELDS_FILTERING = ( - 'PANEL', - *HOTSPOT_FIELDS_FILTERING, -) - - ################################################## ## VCF FILTER tags and FORMAT, INFO annotations ## ################################################## @@ -301,7 +264,7 @@ def namespace(self): VcfInfo.PCGR_CLINVAR_CLASSIFICATION: { 'Number': '.', 'Type': 'String', - 'Description': 'ClinVar - Overall clinical significance of variant on a five-tiered scale', + 'Description': 'ClinVar - clinical significance - per phenotype submission', }, VcfInfo.PCGR_COSMIC_COUNT: { 'Number': '1', diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index 6c4e26d..29738bd 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -273,8 +273,8 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou constants.VcfInfo.PCGR_CSQ: 'CSQ', } - pcgr_tsv_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr.grch38.snv_indel_ann.tsv.gz' - pcgr_vcf_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr.grch38.vcf.gz' + pcgr_tsv_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.snv_indel_ann.tsv.gz' + pcgr_vcf_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.vcf.gz' # Enforce matching defined and source INFO annotations check_annotation_headers(info_field_map, pcgr_vcf_fp) @@ -316,7 +316,7 @@ def transfer_annotations_germline(input_fp, normal_name, cpsr_dir, output_dir): constants.VcfInfo.CPSR_CSQ: 'CSQ', } - cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.classification.tsv.gz' + cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.snv_indel_ann.tsv.gzv' cpsr_vcf_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.vcf.gz' # Enforce matching defined and source INFO annotations @@ -367,12 +367,14 @@ def check_annotation_headers(info_field_map, vcf_fp): def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map): # Gather all annotations from TSV data_tsv = dict() - logger.info(f"Collecting PCGR annotations from {tsv_fp} and {vcf_fp}") with gzip.open(tsv_fp, 'rt') as tsv_fh: for record in csv.DictReader(tsv_fh, delimiter='\t'): key, record_ann = get_annotation_entry_tsv(record, info_field_map) assert key not in data_tsv + # Process PCGR_ACTIONABILITY_TIER + # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING + record_ann[constants.VcfInfo.PCGR_ACTIONABILITY_TIER] = record['ACTIONABILITY_TIER'].replace(' ', '_') # Process PCGR_ACTIONABILITY_TIER # TIER 1, TIER 2, TIER 3, TIER 4, NONCODING From 59be8dce414fff9d2412c4244fa1fdddc9206ce4 Mon Sep 17 00:00:00 2001 From: qclayssen Date: Tue, 1 Jul 2025 13:14:05 +1000 Subject: [PATCH 20/25] update CPSR-related constants and file naming conventions for pcgr v2.1 --- bolt/common/pcgr.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index 29738bd..3d897ab 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -273,8 +273,8 @@ def transfer_annotations_somatic(input_fp, tumor_name, filter_name, pcgr_dir, ou constants.VcfInfo.PCGR_CSQ: 'CSQ', } - pcgr_tsv_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.snv_indel_ann.tsv.gz' - pcgr_vcf_fp = pathlib.Path(pcgr_dir) / 'nosampleset.pcgr.grch38.vcf.gz' + pcgr_tsv_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr_acmg.grch38.snvs_indels.tiers.tsv' + pcgr_vcf_fp = pathlib.Path(output_dir) / 'nosampleset.pcgr_acmg.grch38.vcf.gz' # Enforce matching defined and source INFO annotations check_annotation_headers(info_field_map, pcgr_vcf_fp) @@ -316,7 +316,7 @@ def transfer_annotations_germline(input_fp, normal_name, cpsr_dir, output_dir): constants.VcfInfo.CPSR_CSQ: 'CSQ', } - cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.snv_indel_ann.tsv.gzv' + cpsr_tsv_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.classification.tsv.gz' cpsr_vcf_fp = pathlib.Path(cpsr_dir) / f'{normal_name}.cpsr.grch38.vcf.gz' # Enforce matching defined and source INFO annotations @@ -367,7 +367,7 @@ def check_annotation_headers(info_field_map, vcf_fp): def collect_pcgr_annotation_data(tsv_fp, vcf_fp, info_field_map): # Gather all annotations from TSV data_tsv = dict() - with gzip.open(tsv_fp, 'rt') as tsv_fh: + with open(tsv_fp, 'r') as tsv_fh: for record in csv.DictReader(tsv_fh, delimiter='\t'): key, record_ann = get_annotation_entry_tsv(record, info_field_map) assert key not in data_tsv From eb2483600696ed959050b82608b3160d94136409 Mon Sep 17 00:00:00 2001 From: Quentin Clayssen Date: Thu, 28 Aug 2025 16:12:55 +1000 Subject: [PATCH 21/25] linting --- bolt/common/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/common/constants.py b/bolt/common/constants.py index 09ad0de..775530e 100644 --- a/bolt/common/constants.py +++ b/bolt/common/constants.py @@ -241,7 +241,7 @@ def namespace(self): ), }, VcfInfo.PCGR_CSQ: { - 'Number': '.', + 'Number': '.', 'Type': 'String', 'Description': ( 'Consequence annotations from Ensembl VEP. Format: ' From 5109ebbaa8c947241a718b268e88474c9f333852 Mon Sep 17 00:00:00 2001 From: Quentin Clayssen Date: Thu, 28 Aug 2025 16:13:42 +1000 Subject: [PATCH 22/25] Fix import --- bolt/common/pcgr.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index 3d897ab..5d06188 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -1,13 +1,12 @@ import collections import csv import functools +import gzip import itertools import pathlib import re import shutil import tempfile -import gzip # added import - import cyvcf2 From a91ff863a3a68c5050b134387c5bd9a1b751298b Mon Sep 17 00:00:00 2001 From: Quentin Clayssen Date: Thu, 28 Aug 2025 16:15:36 +1000 Subject: [PATCH 23/25] remove dead code --- bolt/common/pcgr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bolt/common/pcgr.py b/bolt/common/pcgr.py index 5d06188..108633c 100644 --- a/bolt/common/pcgr.py +++ b/bolt/common/pcgr.py @@ -142,7 +142,6 @@ def run_somatic(input_fp, pcgr_refdata_dir, vep_dir, output_dir, chunk_nbr=None, f'--estimate_signatures', f'--estimate_msi', f'--estimate_tmb', - #f'--show_noncoding', f'--vcfanno_n_proc {threads}', f'--vep_n_forks 4', f'--vep_pick_order biotype,rank,appris,tsl,ccds,canonical,length,mane_plus_clinical,mane_select', From b0cfc8e7e3ab5501f88bc7e4189e0f5fe7e12d42 Mon Sep 17 00:00:00 2001 From: Quentin Clayssen Date: Thu, 28 Aug 2025 16:28:09 +1000 Subject: [PATCH 24/25] remove test assert absence `FILTER` column because now needed for breakend pairing --- tests/test_smlv_somatic_filter.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_smlv_somatic_filter.py b/tests/test_smlv_somatic_filter.py index caebea8..6f0e54d 100644 --- a/tests/test_smlv_somatic_filter.py +++ b/tests/test_smlv_somatic_filter.py @@ -201,7 +201,6 @@ def test_clinical_potential_rescue_general(self): info_data=info_data_set, ) smlv_somatic_filter.set_filter_data(record, 0) - assert not record.FILTER assert record.INFO.get(rescue_tag_str) is not None From acb6862c568b6992cb003b4c4f6427848c151c01 Mon Sep 17 00:00:00 2001 From: Stephen Watts Date: Thu, 28 Aug 2025 18:17:50 +1000 Subject: [PATCH 25/25] Fix, update clinical potential rescue test --- tests/test_smlv_somatic_filter.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_smlv_somatic_filter.py b/tests/test_smlv_somatic_filter.py index 6f0e54d..6455162 100644 --- a/tests/test_smlv_somatic_filter.py +++ b/tests/test_smlv_somatic_filter.py @@ -189,9 +189,7 @@ def test_clinical_potential_rescue_general(self): info_data_sets = [ {'HMF_HOTSPOT': ''}, {'PCGR_MUTATION_HOTSPOT': ''}, - {'PCGR_COSMIC_COUNT': 11}, {'PCGR_TCGA_PANCANCER_COUNT': 6}, - {'PCGR_ICGC_PCAWG_COUNT': 6}, ] rescue_tag_str = bolt_constants.VcfInfo.CLINICAL_POTENTIAL_RESCUE.value @@ -201,6 +199,7 @@ def test_clinical_potential_rescue_general(self): info_data=info_data_set, ) smlv_somatic_filter.set_filter_data(record, 0) + assert not record.FILTER assert record.INFO.get(rescue_tag_str) is not None