diff --git a/pcgr/arg_checker.py b/pcgr/arg_checker.py index f2dc325b..8b87d92b 100644 --- a/pcgr/arg_checker.py +++ b/pcgr/arg_checker.py @@ -181,8 +181,8 @@ def verify_args(arg_dict): warn_message(warn_msg, logger) # Check that threshold for gains/amplifications are properly set, and that segment overlap with transcripts are set appropriately - if arg_dict['n_copy_gain'] <= 0: - err_msg = f"Totaly copy number threshold for gains/amplifications ('--n_copy_gain' = {arg_dict['n_copy_gain']}) should be > 0" + if arg_dict['n_copy_gain'] <= 2: + err_msg = f"Total copy number threshold for gains/amplifications ('--n_copy_gain' = {arg_dict['n_copy_gain']}) should be > 2" error_message(err_msg, logger) if arg_dict['cna_overlap_pct'] > 100 or arg_dict['cna_overlap_pct'] <= 0: err_msg = f"Minimum percent overlap between copy number segment and gene transcript ('--cna_overlap_pct' = {arg_dict['cna_overlap_pct']}) must be within (0, 100]" @@ -220,15 +220,11 @@ def define_output_files(arg_dict, cpsr = False): output_data['yaml']= f"{output_prefix}.conf.yaml" if not cpsr: - output_data['cna'] = f"{output_prefix}.cna_segments.tsv.gz" - output_data['expression'] = f"{output_prefix}.expression.tsv.gz" - output_data['csq_expression'] = f"{output_prefix}.csq_expression.tsv.gz" - output_data['expression_outliers'] = f"{output_prefix}.expression_outliers.tsv.gz" - output_data['expression_similarity'] = f"{output_prefix}.expression_similarity.tsv.gz" - output_data['snv_indel_ann'] = f"{output_prefix}.snv_indel_ann.tsv.gz" + for otype in ['cna_gene','cna_segment','expression','expression_outliers', + 'expression_similarity','snv_indel_ann','msigs']: + output_data[otype] = f"{output_prefix}.{otype}.tsv.gz" output_data['maf'] = f"{output_prefix}.maf" output_data['tmb'] = f"{output_prefix}.tmb.tsv" - output_data['msigs'] = f"{output_prefix}.msigs.tsv.gz" else: output_data['classification'] = f"{output_prefix}.classification.tsv.gz" @@ -240,7 +236,7 @@ def define_output_files(arg_dict, cpsr = False): error_message(err_msg, logger) if not cpsr: - for otype in ['cna', 'expression', 'expression_outliers', 'snv_indel_ann', + for otype in ['cna_gene', 'cna_segment','expression', 'expression_outliers', 'snv_indel_ann', 'expression_similarity','maf','tmb','msigs']: # if annotated output cna segments exist and overwrite not set if os.path.exists(output_data[otype]) and arg_dict["force_overwrite"] is False: @@ -267,6 +263,7 @@ def verify_input_files(arg_dict): input_cna_dir = 'NA' input_rna_fusion_dir = 'NA' input_germline_dir = 'NA' + input_germline_yaml_dir = 'NA' input_rna_expression_dir = 'NA' pon_vcf_dir = 'NA' db_dir = 'NA' @@ -277,6 +274,7 @@ def verify_input_files(arg_dict): input_rna_fusion_basename = 'NA' input_rna_expression_basename = 'NA' input_germline_basename = 'NA' + input_germline_yaml_basename = 'NA' arg_dict['rna_fusion_tumor'] = None # create output folder (if not already exists) @@ -366,19 +364,30 @@ def verify_input_files(arg_dict): os.path.abspath(arg_dict["input_rna_exp"])) # check if input germline calls (CPSR) exist - #if not arg_dict["input_germline"] is None: - # if not os.path.exists(os.path.abspath(arg_dict["input_germline"])): - # err_msg = "Input file (" + \ - # str(arg_dict["input_germline"]) + ") does not exist" - # error_message(err_msg, logger) - # if not (os.path.abspath(arg_dict["input_germline"]).endswith(".tsv.gz")): - # err_msg = "File with CPSR-classified germline calls (" + os.path.abspath( - # arg_dict["input_germline"]) + ") does not have the correct file extension (.json.gz)" - # error_message(err_msg, logger) - # input_germline_basename = os.path.basename( - # str(arg_dict["input_germline"])) - # input_germline_dir = os.path.dirname( - # os.path.abspath(arg_dict["input_germline"])) + if not arg_dict["input_cpsr"] is None: + if not os.path.exists(os.path.abspath(arg_dict["input_cpsr"])): + err_msg = "Input file (" + \ + str(arg_dict["input_cpsr"]) + ") does not exist" + error_message(err_msg, logger) + if not (os.path.abspath(arg_dict["input_cpsr"]).endswith(".tsv.gz")): + err_msg = "File with CPSR-classified germline calls (" + os.path.abspath( + arg_dict["input_cpsr"]) + ") does not have the correct file extension (.tsv.gz)" + error_message(err_msg, logger) + + if arg_dict["input_cpsr_yaml"] is None: + err_msg = "Input file with CPSR configuration settings (--input_cpsr_yaml) is missing" + error_message(err_msg, logger) + else: + check_file_exists(os.path.abspath(arg_dict["input_cpsr_yaml"]), strict = True, logger = logger) + input_germline_yaml_basename = os.path.basename( + str(arg_dict["input_cpsr_yaml"])) + input_germline_yaml_dir = os.path.dirname( + os.path.abspath(arg_dict["input_cpsr_yaml"])) + + input_germline_basename = os.path.basename( + str(arg_dict["input_cpsr"])) + input_germline_dir = os.path.dirname( + os.path.abspath(arg_dict["input_cpsr"])) vep_dir = verify_vep_cache(arg_dict, logger) refdata_assembly_dir = verify_refdata(arg_dict, logger, cpsr = False) @@ -391,6 +400,8 @@ def verify_input_files(arg_dict): "rna_expression_dir": input_rna_expression_dir, "germline_dir": input_germline_dir, "germline_basename": input_germline_basename, + "germline_yaml_dir": input_germline_yaml_dir, + "germline_yaml_basename": input_germline_yaml_basename, "pon_vcf_dir": pon_vcf_dir, "refdata_assembly_dir": refdata_assembly_dir, "vep_dir": vep_dir, diff --git a/pcgr/main.py b/pcgr/main.py index 3134f058..168ba5b2 100755 --- a/pcgr/main.py +++ b/pcgr/main.py @@ -39,10 +39,9 @@ def cli(): optional_signatures = parser.add_argument_group("Mutational signature options") optional_cna = parser.add_argument_group("Somatic copy number alteration (CNA) data options") optional_rna = parser.add_argument_group("Bulk RNA-seq and RNA fusion data options") - #optional_germline = parser.add_argument_group("Germline variant options") + optional_germline = parser.add_argument_group("Germline variant options") optional_other = parser.add_argument_group("Other options") - required.add_argument("--input_vcf", dest="input_vcf", help="VCF input file with somatic variants in tumor sample, SNVs/InDels", required=True) required.add_argument("--vep_dir", dest="vep_dir", help="Directory of VEP cache, e.g. $HOME/.vep", required=True) required.add_argument("--refdata_dir", dest="refdata_dir", help="Directory where PCGR reference data bundle was downloaded and unpacked", required=True) @@ -57,7 +56,6 @@ def cli(): optional_sample.add_argument("--tumor_purity", type=float, dest="tumor_purity", help="Estimated tumor purity (between 0 and 1) (default: %(default)s)") optional_sample.add_argument("--tumor_ploidy", type=float, dest="tumor_ploidy", help="Estimated tumor ploidy (default: %(default)s)") - optional_allelic_support.add_argument("--tumor_dp_tag", dest="tumor_dp_tag", default="_NA_", help="Specify VCF INFO tag for sequencing depth (tumor, must be Type=Integer, default: %(default)s") optional_allelic_support.add_argument("--tumor_af_tag", dest="tumor_af_tag", default="_NA_", help="Specify VCF INFO tag for variant allelic fraction (tumor, must be Type=Float, default: %(default)s") optional_allelic_support.add_argument("--control_dp_tag", dest="control_dp_tag", default="_NA_", help="Specify VCF INFO tag for sequencing depth (control, must be Type=Integer, default: %(default)s") @@ -88,7 +86,7 @@ def cli(): optional_vep.add_argument("--vep_n_forks", default=4, type=int, help="Number of forks (VEP option '--fork'), default: %(default)s") optional_vep.add_argument("--vep_buffer_size", default=500, type=int, help=f"Variant buffer size (variants read into memory simultaneously, VEP option '--buffer_size')\n- set lower to reduce memory usage, default: %(default)s") - optional_vep.add_argument("--vep_pick_order", default="mane_select,mane_plus_clinical,canonical,appris,tsl,biotype,ccds,rank,length", help=f"Comma-separated string " + \ + optional_vep.add_argument("--vep_pick_order", default="mane_select,mane_plus_clinical,canonical,biotype,ccds,rank,tsl,appris,length", help=f"Comma-separated string " + \ "of ordered transcript/variant properties for selection of primary variant consequence\n(option '--pick_order' in VEP), default: %(default)s") optional_vep.add_argument("--vep_no_intergenic", action="store_true", help="Skip intergenic variants during variant annotation (VEP option '--no_intergenic' in VEP), default: %(default)s") optional_vep.add_argument("--vep_regulatory", action="store_true", help="Add VEP regulatory annotations (VEP option '--regulatory') or non-coding interpretation, default: %(default)s") @@ -111,13 +109,13 @@ def cli(): optional_cna.add_argument("--cna_overlap_pct", type=float, default=50, dest="cna_overlap_pct", help="Mean percent overlap between copy number segment and gene transcripts for reporting of gains/losses in tumor suppressor genes/oncogenes, (default: %(default)s)") #optional_rna.add_argument("--input_rna_fusion", dest = "input_rna_fusion", help = "File with RNA fusion transcripts detected in tumor (tab-separated values)") optional_rna.add_argument("--input_rna_expression", dest = "input_rna_exp", help = "File with bulk RNA expression counts (TPM) of transcripts in tumor (tab-separated values)") - optional_rna.add_argument('--expression_sim', action='store_true', help="Compare expression profile of tumor sample to known expression profiles (default: %(default)s)") + optional_rna.add_argument('--expression_sim', action='store_true', help="Compare expression profile of tumor sample to expression profiles of other tumor samples (default: %(default)s)") optional_rna.add_argument("--expression_sim_db", dest = "expression_sim_db", default="tcga,depmap,treehouse", help=f"Comma-separated string " + \ "of databases for used in RNA expression similarity analysis, default: %(default)s") - - #optional_germline.add_argument("--input_germline", dest="input_germline", help="CPSR-classified germline calls (file '.cpsr..classification.tsv.gz')") - #optional_germline.add_argument("--sample_id_germline", dest="sample_id_germline", help="Sample identifier for germline calls - used for verification of input_germline file") + optional_germline.add_argument("--input_cpsr", dest="input_cpsr", help="CPSR-classified germline calls (file '.cpsr..classification.tsv.gz')") + optional_germline.add_argument("--input_cpsr_yaml", dest="input_cpsr_yaml", help="CPSR YAML configuration file (file '.cpsr..conf.yaml')") + optional_germline.add_argument("--cpsr_ignore_vus", action="store_true", help="Do not show variants of uncertain significance (VUS) in the germline section of the HTML report (default: %(default)s)") optional_other.add_argument("--vcf2maf", action="store_true", help="Generate a MAF file for input VCF using https://github.com/mskcc/vcf2maf (default: %(default)s)") optional_other.add_argument("--vcfanno_n_proc", default=4, type=int, help="Number of vcfanno processes (option '-p' in vcfanno), default: %(default)s") @@ -127,6 +125,7 @@ def cli(): optional_other.add_argument("--force_overwrite", action="store_true", help="By default, the script will fail with an error if any output file already exists. You can force the overwrite of existing result files by using this flag, default: %(default)s") optional_other.add_argument("--version", action="version", version="%(prog)s " + str(pcgr_vars.PCGR_VERSION)) optional_other.add_argument("--no_reporting", action="store_true", help="Run functional variant annotation on VCF through VEP/vcfanno, omit other analyses (i.e. Tier assignment/MSI/TMB/Signatures etc. and report generation (STEP 4), default: %(default)s") + optional_other.add_argument("--no_html", action="store_true", help="Do not generate HTML report (default: %(default)s)") optional_other.add_argument("--debug", action="store_true", help="Print full commands to log") optional_other.add_argument("--pcgrr_conda", default="pcgrr", help="pcgrr conda env name (default: %(default)s)") @@ -144,7 +143,7 @@ def cli(): # Run PCGR workflow run_pcgr(input_data, output_data, conf_options) -def run_pcgr(input_data, output_data,conf_options): +def run_pcgr(input_data, output_data, conf_options): """ Main function to run the PCGR workflow """ @@ -173,7 +172,8 @@ def run_pcgr(input_data, output_data,conf_options): input_cna = 'None' input_rna_fusion = 'None' input_rna_expression = 'None' - input_germline_cpsr = 'None' + input_cpsr_calls = 'None' + input_cpsr_yaml = 'None' pon_vcf = 'None' pon_annotation = 0 variant_set = pd.DataFrame @@ -189,8 +189,10 @@ def run_pcgr(input_data, output_data,conf_options): input_rna_fusion = os.path.join(input_data['rna_fusion_dir'], input_data['rna_fusion_basename']) if input_data['rna_expression_basename'] != 'NA': input_rna_expression = os.path.join(input_data['rna_expression_dir'], input_data['rna_expression_basename']) - #if input_data['germline_basename'] != 'NA': - # input_germline_cpsr = os.path.join(input_data['germline_dir'], input_data['germline_basename']) + if input_data['germline_basename'] != 'NA': + input_cpsr_calls = os.path.join(input_data['germline_dir'], input_data['germline_basename']) + if input_data['germline_yaml_basename'] != 'NA': + input_cpsr_yaml = os.path.join(input_data['germline_yaml_dir'], input_data['germline_yaml_basename']) if input_data['pon_vcf_basename'] != 'NA': pon_vcf = os.path.join(input_data['pon_vcf_dir'], input_data['pon_vcf_basename']) @@ -233,9 +235,11 @@ def run_pcgr(input_data, output_data,conf_options): f'{input_cna} ' f'{input_rna_fusion} ' f'{input_rna_expression} ' + f'{input_cpsr_calls} ' f'{pon_vcf} ' f'{conf_options["assay_properties"]["vcf_tumor_only"]} ' f'{conf_options["sample_id"]} ' + f'{conf_options["genome_assembly"]} ' f'{conf_options["other"]["retained_vcf_info_tags"]} ' f'{conf_options["somatic_snv"]["allelic_support"]["tumor_dp_tag"]} ' f'{conf_options["somatic_snv"]["allelic_support"]["tumor_af_tag"]} ' @@ -284,9 +288,10 @@ def run_pcgr(input_data, output_data,conf_options): logger.info(f'RNA expression similarity analysis: {rnaseq_sim_analysis_set}') #logger.info(f'Include molecularly targeted clinical trials (beta): {clinical_trials_set}') - # PCGR|Generate YAML file - containing configuration options and paths to annotated molecular profile datasets - # - VCF/TSV files (SNVs/InDels) - # - TSV files (copy number aberrations) + # PCGR|Generate YAML file - containing configuration options and paths to first-pass annotation of molecular profile datasets + # - VCF/TSV files (somatic SNVs/InDels) + # - TSV file (germline SNVs/InDels - CPSR) + # - TSV files (somatic copy number aberrations) # - TSV files (TMB) # - TSV files (RNA expression) # - TSV files (RNA fusion) - COMING @@ -302,7 +307,12 @@ def run_pcgr(input_data, output_data,conf_options): if conf_options['somatic_snv']['tmb']['run'] == 1: conf_options['molecular_data']['fname_tmb_tsv'] = tmb_fname if not input_cna == 'None': - conf_options['molecular_data']['fname_cna_tsv'] = output_data['cna'] + conf_options['molecular_data']['fname_cna_gene_tsv'] = output_data['cna_gene'] + conf_options['molecular_data']['fname_cna_segment_tsv'] = output_data['cna_segment'] + if not input_cpsr_calls == 'None': + conf_options['molecular_data']['fname_germline_tsv'] = input_cpsr_calls + if not input_cpsr_yaml == 'None': + conf_options['molecular_data']['fname_germline_yaml'] = input_cpsr_yaml if not input_rna_expression == 'None': conf_options['molecular_data']['fname_expression_tsv'] = output_data['expression'] conf_options['molecular_data']['fname_expression_outliers_tsv'] = output_data['expression_outliers'] @@ -412,7 +422,7 @@ def run_pcgr(input_data, output_data,conf_options): pcgr_summarise_command = ( f'pcgr_summarise.py {vep_vcfanno_vcf}.gz {vep_vcfanno_summarised_vcf} {pon_annotation} ' f'{yaml_data["conf"]["vep"]["vep_regulatory"]} {oncogenicity_annotation} ' - f'{yaml_data["conf"]["sample_properties"]["site2"]} {yaml_data["conf"]["vep"]["vep_pick_order"]} ' + f'{yaml_data["conf"]["sample_properties"]["site2"]} {yaml_data["genome_assembly"]} {yaml_data["conf"]["vep"]["vep_pick_order"]} ' f'{input_data["refdata_assembly_dir"]} --compress_output_vcf ' f'{"--debug" if debug else ""}' ) @@ -425,7 +435,7 @@ def run_pcgr(input_data, output_data,conf_options): logger.info(summarise_db_src_msg1) logger.info(summarise_db_src_msg2) - logger.info('Variant oncogenicity classification according to ClinGen/VICC recommendations (Horak et al., Genet Med, 2022)') + logger.info('Variant oncogenicity classification according to ClinGen/CGC/VICC standard operating procedures (Horak et al., Genet Med, 2022)') logger.info('Variant biomarker matching (CIViC, CGI) at multiple resolutions (genes, exons, amino acid positions, hgvsp/hgvsc, genomic)') logger.info('Tumor suppressor/oncogene annotations based on multiple sources (NCG, CGC, CancerMine)') check_subprocess(logger, pcgr_summarise_command, debug) @@ -483,7 +493,7 @@ def run_pcgr(input_data, output_data,conf_options): input_data["refdata_assembly_dir"], logger = logger) ## Write transcript-level expression data to TSV if 'transcript' in expression_data.keys(): - if not expression_data['transcript'] is None: + if not expression_data['transcript'] is None: expression_data['transcript'].fillna('.').to_csv( yaml_data['molecular_data']['fname_expression_tsv'], sep = "\t", compression = "gzip", index = False) @@ -492,12 +502,12 @@ def run_pcgr(input_data, output_data,conf_options): #exp_to_cons.fillna('.').to_csv( # yaml_data['molecular_data']['fname_csq_expression_tsv'], sep = "\t", # compression = "gzip", index = False) - else: - if 'gene' in expression_data.keys(): - if not expression_data['gene'] is None: - expression_data['gene'].fillna('.').to_csv( - yaml_data['molecular_data']['fname_expression_tsv'], sep = "\t", - compression = "gzip", index = False) + else: + if 'gene' in expression_data.keys(): + if not expression_data['gene'] is None: + expression_data['gene'].fillna('.').to_csv( + yaml_data['molecular_data']['fname_expression_tsv'], sep = "\t", + compression = "gzip", index = False) ## Merge expression data with somatic SNV/InDel variant set variant_set = integrate_variant_expression( @@ -570,6 +580,7 @@ def run_pcgr(input_data, output_data,conf_options): expression_data, yaml_data, input_data["refdata_assembly_dir"], + protein_coding_only = True, logger = logger ) if not expression_outliers.empty: @@ -609,7 +620,8 @@ def run_pcgr(input_data, output_data,conf_options): logger = getlogger("pcgr-annotate-cna-segments") logger.info('PCGR - STEP 5: Annotation of copy number segments - cytobands, overlapping transcripts, and biomarkers') cna_annotation = cna.annotate_cna_segments( - output_fname = output_data['cna'], + output_segment_gene_fname = output_data['cna_gene'], + output_segment_fname = output_data['cna_segment'], output_dir = output_data['dir'], cna_segment_file = input_cna, build = yaml_data['genome_assembly'], @@ -621,6 +633,9 @@ def run_pcgr(input_data, output_data,conf_options): logger = logger) if cna_annotation == 0: logger.info('Finished pcgr-annotate-cna-segments') + else: + yaml_data['molecular_data']['fname_cna_gene_tsv'] = "None" + yaml_data['molecular_data']['fname_cna_segment_tsv'] = "None" print('----') else: logger = getlogger("pcgr-annotate-cna-segments") diff --git a/pcgrr/R/main.R b/pcgrr/R/main.R index c3a76269..0f54c5e1 100644 --- a/pcgrr/R/main.R +++ b/pcgrr/R/main.R @@ -47,7 +47,19 @@ generate_report <- settings = settings) } - + ## Load pre-classified germline variants (output from CPSR) + if(settings$molecular_data$fname_germline_tsv != "None" & + file.exists(settings$molecular_data$fname_germline_tsv)){ + rep[['content']][['germline_classified']] <- + pcgrr::load_cpsr_classified_variants( + fname_cpsr_tsv = settings$molecular_data$fname_germline_tsv, + fname_cpsr_yaml = settings$molecular_data$fname_germline_yaml, + ignore_vus = as.logical(settings$conf$germline$ignore_vus), + cols = pcgrr::data_coltype_defs$snv_indel_germline_cpsr, + ref_data = ref_data + ) + rep[["content"]][['germline_classified']][["eval"]] <- TRUE + } conf_somatic_snv <- settings$conf$somatic_snv @@ -73,8 +85,6 @@ generate_report <- # a_elem = "clinicaltrials") #} - - rep[['content']][['snv_indel']][['callset']] <- callset_snv rep[['content']][['snv_indel']][['eval']] <- @@ -182,10 +192,12 @@ generate_report <- ## Load somatic CNA data if available callset_cna <- NULL - if (settings$molecular_data$fname_cna_tsv != "None") { + if (settings$molecular_data$fname_cna_gene_tsv != "None" & + settings$molecular_data$fname_cna_segment_tsv != "None") { callset_cna <- pcgrr::load_somatic_cna( - fname = settings$molecular_data$fname_cna_tsv, + fname_cna_segment = settings$molecular_data$fname_cna_segment_tsv, + fname_cna_gene = settings$molecular_data$fname_cna_gene_tsv, ref_data = ref_data, settings = settings ) @@ -197,12 +209,6 @@ generate_report <- callset = callset_cna, vartype = "cna", name = "vstats")[['vstats']] - #rep[['content']][['cna']][['cnaqc']] <- - # pcgrr::make_cnaqc_object( - # callset_cna = callset_cna, - # callset_snv = callset_snv, - # settings = settings - # ) rep[['content']][['cna']][['eval']] <- TRUE } @@ -764,7 +770,7 @@ generate_tier_tsv <- function(variant_set, #' #' @param report List object with all report data, settings etc. #' @param output_type character indicating output type for TSV, -#' i.e. 'snv_indel' or 'cna_gene', 'msigs' +#' i.e. 'snv_indel', 'snv_indel_unfiltered', 'cna_gene', or 'msigs' #' @export #' write_report_tsv <- function(report = NULL, output_type = 'snv_indel'){ @@ -792,6 +798,12 @@ write_report_tsv <- function(report = NULL, output_type = 'snv_indel'){ report$content$snv_indel$eval == TRUE){ eval_output <- TRUE } + if(output_type == "snv_indel_unfiltered" & + report$content$snv_indel$eval == TRUE & + as.logical( + report$settings$conf$assay_properties$vcf_tumor_only) == TRUE){ + eval_output <- TRUE + } ## Mutational signatures @@ -847,6 +859,45 @@ write_report_tsv <- function(report = NULL, output_type = 'snv_indel'){ } } + ## SNVs/InDels + if(output_type == 'snv_indel_unfiltered' & + !is.null(report$content$snv_indel) & + report$content$snv_indel$eval == TRUE){ + + snv_indel_cols <- pcgrr::tsv_cols$snv_indel_unfiltered + if(report$settings$conf$other$retained_vcf_info_tags != "None"){ + snv_indel_cols <- c( + snv_indel_cols, report$settings$conf$other$retained_vcf_info_tags) + } + + if(!is.null(report$content$snv_indel$callset)){ + if(is.data.frame(report$content$snv_indel$callset$variant_unfiltered)){ + output_data <- as.data.frame( + report$content$snv_indel$callset$variant_unfiltered |> + dplyr::select( + dplyr::any_of(snv_indel_cols)) + ) + if(NROW(output_data) > 0 & + "SOMATIC_CLASSIFICATION" %in% colnames(output_data) & + "ACTIONABILITY_TIER" %in% colnames(output_data) & + "ONCOGENICITY_SCORE" %in% colnames(output_data)){ + output_data$somatic_score <- 0 + output_data <- output_data |> + dplyr::mutate(somatic_score = dplyr::case_when( + .data$SOMATIC_CLASSIFICATION == "Somatic" ~ 1, + TRUE ~ as.numeric(.data$somatic_score) + )) + output_data <- output_data |> + dplyr::arrange( + dplyr::desc(.data$somatic_score), + .data$ACTIONABILITY_TIER, + dplyr::desc(.data$ONCOGENICITY_SCORE)) |> + dplyr::select(-c("somatic_score")) + } + } + } + } + if(NROW(output_data) > 0){ pcgrr::log4r_info("------") pcgrr::log4r_info(paste0( @@ -925,6 +976,10 @@ write_report_quarto_html <- function(report = NULL){ ## Save sample PCGR report object in temporary quarto rendering directory rds_report_path <- file.path( tmp_quarto_dir, "pcgr_report.rds") + + ## Remove ref_data from report object + #report$settings$chrom_coordinates <- + # report$ref_data$assembly$chrom_coordinates report$ref_data <- NULL saveRDS(report, file = rds_report_path) @@ -1000,15 +1055,17 @@ write_report_excel <- function(report = NULL){ i <- 15 for(elem in c('SAMPLE_ASSAY', - 'SNV_INDEL', - 'SNV_INDEL_BIOMARKER', - 'CNA', - 'CNA_BIOMARKER', + 'SOMATIC_SNV_INDEL', + 'SOMATIC_SNV_INDEL_BIOMARKER', + 'SOMATIC_CNA', + 'SOMATIC_CNA_BIOMARKER', + 'GERMLINE_SNV_INDEL', 'TMB', 'MSI', 'MUTATIONAL_SIGNATURE', - 'KATAEGIS', - 'IMMUNE_CONTEXTURE')){ + 'KATAEGIS_EVENTS', + 'RNA_EXPRESSION_OUTLIERS', + 'RNA_IMMUNE_CONTEXTURE')){ if(elem %in% names(excel_output)){ if(is.data.frame(excel_output[[elem]]) & NROW(excel_output[[elem]]) > 0 & diff --git a/pcgrr/R/report.R b/pcgrr/R/report.R index e19ddc28..e93a807e 100644 --- a/pcgrr/R/report.R +++ b/pcgrr/R/report.R @@ -86,10 +86,10 @@ init_report <- function(yaml_fname = NULL, "mutational_signatures", "tmb", "msi", + "germline_classified", "rainfall", "kataegis", - "expression", - "predisposition")){ + "expression")){ #"clinicaltrials")) { report[["content"]][[a_elem]] <- list() report[["content"]][[a_elem]][["eval"]] <- FALSE @@ -112,6 +112,12 @@ init_report <- function(yaml_fname = NULL, init_rainfall_content() } + if (a_elem == "germline_classified") { + report[["content"]][[a_elem]][['callset']] <- list() + report[["content"]][[a_elem]][['panel_info']] <- list() + report[["content"]][[a_elem]]$sample_id <- "NA" + } + if (a_elem == "snv_indel" | a_elem == "cna") { report[["content"]][[a_elem]][['callset']] <- list() @@ -122,8 +128,6 @@ init_report <- function(yaml_fname = NULL, if (a_elem == "cna") { report[["content"]][[a_elem]][['vstats']] <- init_cna_vstats() - report[["content"]][[a_elem]][['cnaqc']] <- - list() } } if (a_elem == "clinicaltrials") { @@ -548,12 +552,17 @@ load_yaml <- function(yml_fname, report_mode = "CPSR") { ref_data <- list() if (dir.exists( - report_settings[['reference_data']][['path']] - )) { + report_settings[['reference_data']][['path']])) { ref_data <- load_reference_data( pcgr_db_assembly_dir = report_settings[['reference_data']][['path']], genome_assembly = report_settings[['genome_assembly']] ) + }else{ + log4r_fatal( + paste0("Reference data directory ", + report_settings[['reference_data']][['path']], + " does not exist - exiting")) + } if (identical( @@ -686,12 +695,11 @@ load_yaml <- function(yml_fname, report_mode = "CPSR") { report_settings$conf$report_color <- pcgrr::color_palette[["report_color"]][["values"]][1] - #report_settings$conf$visual_reporting[["color_palette"]] <- - # pcgrr::color_palette - #report_settings$conf$visual_reporting[["color_none"]] <- - # pcgrr::color_palette[["none"]][["values"]][1] - #report_settings$conf$visual_reporting[["color_value_box"]] <- - # pcgrr::color_palette[["report_color"]][["values"]][1] + if(!is.null(ref_data$assembly$chrom_coordinates)){ + report_settings$chrom_coordinates <- + ref_data$assembly$chrom_coordinates + } + if (report_mode == "PCGR" & !is.null(report_settings$conf$assay_properties)) { if (report_settings$conf$assay_properties$vcf_tumor_only == 1) { diff --git a/pcgrr/R/variant_annotation.R b/pcgrr/R/variant_annotation.R index 0ea594e2..394da65d 100644 --- a/pcgrr/R/variant_annotation.R +++ b/pcgrr/R/variant_annotation.R @@ -270,7 +270,8 @@ append_tcga_var_link <- function(var_df, dplyr::select(.data$VAR_ID, .data$TCGALINK) |> dplyr::rename(TCGA_FREQUENCY = "TCGALINK") #magrittr::set_colnames(c("VAR_ID", "TCGA_FREQUENCY")) - var_df <- dplyr::rename(var_df, TCGA_FREQUENCY_RAW = .data$TCGA_FREQUENCY) + var_df <- dplyr::rename( + var_df, TCGA_FREQUENCY_RAW = .data$TCGA_FREQUENCY) var_df <- dplyr::left_join(var_df, var_df_links, by = c("VAR_ID" = "VAR_ID")) }else{ diff --git a/scripts/pcgr_validate_input.py b/scripts/pcgr_validate_input.py index 1d2e56e6..0e680b21 100755 --- a/scripts/pcgr_validate_input.py +++ b/scripts/pcgr_validate_input.py @@ -8,12 +8,14 @@ import sys import pandas as np from cyvcf2 import VCF +import gzip from pcgr import vcf, cna from pcgr.annoutils import read_infotag_file, read_vcfanno_tag_file from pcgr.utils import error_message, check_subprocess, remove_file, random_id_generator, getlogger from pcgr.cna import is_valid_cna from pcgr.vcf import check_existing_vcf_info_tags, check_retained_vcf_info_tags +from pcgr import pcgr_vars def __main__(): @@ -25,9 +27,11 @@ def __main__(): parser.add_argument('input_cna', help='Somatic (tumor) copy number query segments (tab-separated values)') parser.add_argument('input_rna_fusion', help='Tumor RNA fusion variants (tab-separated values)') parser.add_argument('input_rna_exp', help='Tumor gene expression estimates (tab-separated values)') + parser.add_argument('input_cpsr', help='Classified germline calls from CPSR (tab-separated values)') parser.add_argument('panel_normal_vcf',help="VCF file with germline calls from panel of normals") parser.add_argument('tumor_only',type=int, default=0,choices=[0,1],help="Tumor only sequencing") parser.add_argument('sample_id',help='PCGR sample_name') + parser.add_argument('build',help='Genome build (grch37/grch38)') parser.add_argument('retained_info_tags', help="Comma-separated string of custom VCF INFO tags to be kept in PCGR output") parser.add_argument('tumor_dp_tag', help='VCF INFO tag that denotes tumor sequencing depth') parser.add_argument('tumor_af_tag', help='VCF INFO tag that denotes tumor variant allelic fraction') @@ -47,6 +51,7 @@ def __main__(): args.input_cna, args.input_rna_fusion, args.input_rna_exp, + args.input_cpsr, args.tumor_dp_tag, args.tumor_af_tag, args.control_dp_tag, @@ -58,6 +63,7 @@ def __main__(): args.retained_info_tags, args.tumor_only, args.sample_id, + args.build, args.keep_uncompressed, args.output_dir, args.debug) @@ -133,6 +139,30 @@ def is_valid_rna_expression(rna_exp_file, logger): logger.info("RNA expression file ('" + str(os.path.basename(rna_exp_file)) + "') adheres to the correct format") return 0 +def is_valid_germline(germline_file, build, logger): + """ + Function that checks whether the germline variants file (pre-classified by CPSR) adheres to the correct format + """ + + if not os.path.isfile(germline_file): + err_msg = "Germline variants file (" + str(germline_file) + ") does not exist" + return error_message(err_msg, logger) + + if not str(germline_file).endswith(f'cpsr.{build}.classification.tsv.gz'): + err_msg = "Germline variants file (" + str(germline_file) + ") does not adhere to the correct naming format - wrong build or file type" + return error_message(err_msg, logger) + + with gzip.open(germline_file, 'rt') as f: + germline_reader = csv.DictReader(f, delimiter='\t') + ## check that required columns are present + for col in pcgr_vars.germline_input_required_cols: + if col not in germline_reader.fieldnames: + err_msg = "Germline variants file (" + str(germline_file) + ") is missing required column: " + str(col) + return error_message(err_msg, logger) + + logger.info("Germline variants file ('" + str(os.path.basename(germline_file)) + "') adheres to the correct format") + return 0 + def validate_panel_normal_vcf(vcf, logger): """ @@ -260,6 +290,7 @@ def validate_pcgr_input(refdata_assembly_dir, input_cna, input_rna_fusion, input_rna_expression, + input_cpsr, tumor_dp_tag, tumor_af_tag, control_dp_tag, @@ -271,11 +302,18 @@ def validate_pcgr_input(refdata_assembly_dir, retained_info_tags, tumor_only, sample_id, + build, keep_uncompressed, output_dir, debug): """ - Function that reads the input files to PCGR (VCF file and Tab-separated values file with copy number segments) and performs the following checks: + Function that checks the format of input files to PCGR + - VCF file with somatic SNVs/InDels - mandatory + - Tab-separated values file with somatic copy number segments - optional + - Tab-separated values file with RNA fusion variants - optional + - Tab-separated values file with RNA expression values - optional + - Tab-separated values file with CPSR-classified germline mutations - optional + Function performs the following checks: 1. No INFO annotation tags in the input VCF coincides with those generated by PCGR 2. Provided columns for tumor/normal coverage and allelic depths are found in VCF 3. Provided retained VCF INFO tags are present in VCF file @@ -285,6 +323,7 @@ def validate_pcgr_input(refdata_assembly_dir, 7. Check that copy number segment file has required columns and correct data types (and range) 8. Check that RNA fusion variant file has required columns and correct data types 9. Check that RNA expression file has required columns and correct data types + 10. Check that germline mutation file has required columns and correct data types """ logger = getlogger('pcgr-validate-input-arguments') @@ -356,6 +395,12 @@ def validate_pcgr_input(refdata_assembly_dir, valid_cna = is_valid_cna(input_cna, logger) if valid_cna == -1: return -1 + + ## Check whether file with classified germline calls is properly formatted + if not input_cpsr == 'None': + valid_germline = is_valid_germline(input_cpsr, build, logger) + if valid_germline == -1: + return -1 ## Check whether file with RNA fusion variants is properly formatted if not input_rna_fusion == 'None':