From f271e48bd2d0ac56cb472f0e6c254283e30083de Mon Sep 17 00:00:00 2001 From: dapregi Date: Wed, 11 Sep 2024 16:48:44 +0100 Subject: [PATCH] qc: draft code migration from zetta-bioinfo #TASK-6766 --- .../app/analysis/qc/family_qc/family_qc.py | 253 +++++++---- .../qc/individual_qc/individual_qc.py | 411 +++++++++++++++--- .../app/analysis/qc/sample_qc/sample_qc.py | 79 +++- .../app/analysis/qc/variant_qc.main.py | 4 +- 4 files changed, 570 insertions(+), 177 deletions(-) diff --git a/opencga-app/app/analysis/qc/family_qc/family_qc.py b/opencga-app/app/analysis/qc/family_qc/family_qc.py index 88698ba8aa..6537724d37 100644 --- a/opencga-app/app/analysis/qc/family_qc/family_qc.py +++ b/opencga-app/app/analysis/qc/family_qc/family_qc.py @@ -9,6 +9,7 @@ LOGGER = logging.getLogger('variant_qc_logger') + class FamilyQCExecutor: def __init__(self, vcf_file, info_file, bam_file, config, output_parent_dir, sample_ids, id_): """Create output dir @@ -29,19 +30,24 @@ def __init__(self, vcf_file, info_file, bam_file, config, output_parent_dir, sam self.sample_ids = sample_ids self.id_ = id_ - def filter_rename_variants_vcf(self,pop_freq_fpath,outdir_fpath): + # Loading configuration + config_fhand = open(self.config, 'r') + self.config_json = json.load(config_fhand) + config_fhand.close() + + def filter_rename_variants_vcf(self, pop_freq_fpath, outdir_fpath): # Reading VCF - vcf_fhand = gzip.open(self.vcf_file,'r') + vcf_fhand = gzip.open(self.vcf_file, 'r') # Reading pop_freq file - input_pop_freq_fhand = open(str(pop_freq_fpath),'r') + input_pop_freq_fhand = open(str(pop_freq_fpath), 'r') LOGGER.debug('Getting variant IDs to include in the VCF from file: "{}"'.format(pop_freq_fpath)) - variant_ids_to_include = [line.strip().split()[1] for line in input_pop_freq_fhand] + variant_ids_to_include = [line.strip().split()[1] for line in input_pop_freq_fhand] # Create output dir and file - filtered_vcf_outdir_fpath = create_output_dir(path_elements=[str(outdir_fpath),'filtered_vcf']) 
+ filtered_vcf_outdir_fpath = create_output_dir(path_elements=[str(outdir_fpath), 'filtered_vcf']) output_file_name = 'filtered_vcf_' + str(self.vcf_file.split('/')[-1]) - filtered_vcf_fpath = os.path.join(filtered_vcf_outdir_fpath,output_file_name) - filtered_vcf_fhand = gzip.open(filtered_vcf_fpath,'wt') + filtered_vcf_fpath = os.path.join(filtered_vcf_outdir_fpath, output_file_name) + filtered_vcf_fhand = gzip.open(filtered_vcf_fpath, 'wt') LOGGER.debug('Generating filtered VCF with variant IDs under ID column: "{}"'.format(filtered_vcf_fpath)) # Generate VCF with variant IDs under ID column @@ -54,7 +60,7 @@ def filter_rename_variants_vcf(self,pop_freq_fpath,outdir_fpath): else: # Getting variant data variant_items = line.strip().split() - variant_id = ':'.join(variant_items[0],variant_items[1],variant_items[3],variant_items[4]) + variant_id = ':'.join([variant_items[0], variant_items[1], variant_items[3], variant_items[4]]) if 'chr' not in variant_items[0]: variant_id = 'chr' + variant_id if variant_id in variant_ids_to_include: @@ -66,10 +72,12 @@ def filter_rename_variants_vcf(self,pop_freq_fpath,outdir_fpath): return filtered_vcf_fpath def get_samples_individuals_info(self): - family_info_json = json.load(self.info_file) + family_info_fhand = open(self.info_file) + family_info_json = json.load(family_info_fhand) samples_individuals = {} for sample in self.sample_ids: - samples_individuals[sample] = {'individualId': '','individualSex': 0, 'fatherId': 'NA','motherId': 'NA', 'familyMembersRoles': 'NA'} + samples_individuals[sample] = {'individualId': '', 'individualSex': 0, 'fatherId': 'NA', 'motherId': 'NA', + 'familyMembersRoles': 'NA'} LOGGER.debug('Getting individual information for each sample') for member in family_info_json['members']: @@ -83,7 +91,9 @@ def get_samples_individuals_info(self): elif (member['sex']['id']).upper() == 'FEMALE' or member['karyotypicSex'] == 'XX': samples_individuals[sample_member['id']]['individualSex'] = 2 else: - 
LOGGER.warning('Sex information for individual "{}" (sample "{}") is not available. Hence, sex code for the fam file will be 0.'.format(member['id'],sample_member['id'])) + LOGGER.warning( + 'Sex information for individual "{}" (sample "{}") is not available. Hence, sex code for the fam file will be 0.'.format( + member['id'], sample_member['id'])) pass # Filling in father info if 'id' in member['father'].keys(): @@ -92,10 +102,11 @@ def get_samples_individuals_info(self): if 'id' in member['mother'].keys(): samples_individuals[sample_member['id']]['motherId'] = member['mother']['id'] # Filling in family roles info for the individual - samples_individuals[sample_member['id']]['familyMembersRoles'] = family_info_json['roles'][member['id']] + samples_individuals[sample_member['id']]['familyMembersRoles'] = family_info_json['roles'][ + member['id']] # Checking if individual information for each sample was found - for sample,individual_info in samples_individuals.items(): + for sample, individual_info in samples_individuals.items(): if individual_info['individualId'] == '': LOGGER.warning('No individual information available for sample "{}".'.format(sample['id'])) else: @@ -104,30 +115,33 @@ def get_samples_individuals_info(self): # Return samples_individuals dictionary return samples_individuals - def generate_files_for_plink_fam_file(self,outdir_fpath): + def generate_files_for_plink_fam_file(self, outdir_fpath): # Getting family id and sample_individuals_info family_id = self.id_ samples_individuals = self.get_samples_individuals_info() # Generating text file to update sex information sex_information_output_file_name = family_id + '_individual_sample_sex_information.txt' - sex_information_output_fpath = os.path.join(str(outdir_fpath),sex_information_output_file_name) - sex_information_output_fhand = open(sex_information_output_fpath,'w') - LOGGER.debug('Generating text file to update individual, sample, sex information: "{}"'.format(sex_information_output_fpath)) + 
sex_information_output_fpath = os.path.join(str(outdir_fpath), sex_information_output_file_name) + sex_information_output_fhand = open(sex_information_output_fpath, 'w') + LOGGER.debug('Generating text file to update individual, sample, sex information: "{}"'.format( + sex_information_output_fpath)) # Generating text file to update parent-offspring relationships parent_offspring_output_file_name = family_id + '_parent_offspring_relationships.txt' - parent_offspring_output_fpath = os.path.join(str(outdir_fpath),parent_offspring_output_file_name) - parent_offspring_output_fhand = open(parent_offspring_output_fpath,'w') - LOGGER.debug('Generating text file to update parent-offspring relationships: "{}"'.format(parent_offspring_output_fpath)) + parent_offspring_output_fpath = os.path.join(str(outdir_fpath), parent_offspring_output_file_name) + parent_offspring_output_fhand = open(parent_offspring_output_fpath, 'w') + LOGGER.debug( + 'Generating text file to update parent-offspring relationships: "{}"'.format(parent_offspring_output_fpath)) for sample in self.sample_ids: # Individual information for that sample individual_info = samples_individuals[sample] # Structure = FamilyID SampleID Sex - sex_information_output_fhand.write(('/t'.join([family_id,sample,individual_info['individualSex']])) + '\n') + sex_information_output_fhand.write( + ('\t'.join([family_id, sample, str(individual_info['individualSex'])])) + '\n') # Structure = FamilyID SampleID FatherID MotherID - parent_offspring_info = [family_id,sample,0,0] + parent_offspring_info = [family_id, sample, str(0), str(0)] father_id = individual_info['fatherId'] mother_id = individual_info['motherId'] if father_id != 'NA': @@ -140,61 +154,80 @@ def generate_files_for_plink_fam_file(self,outdir_fpath): if individual_info['individualId'] == mother_id: parent_offspring_info[3] = sample_id break - parent_offspring_output_fhand.write(('/t'.join(parent_offspring_info)) + '\n') - LOGGER.info('Text file generated to update 
individual, sample, sex information: "{}"'.format(sex_information_output_fpath)) - LOGGER.info('Text file generated to update parent-offspring relationships: "{}"'.format(parent_offspring_output_fpath)) + parent_offspring_output_fhand.write(('\t'.join(parent_offspring_info)) + '\n') + LOGGER.info('Text file generated to update individual, sample, sex information: "{}"'.format( + sex_information_output_fpath)) + LOGGER.info( + 'Text file generated to update parent-offspring relationships: "{}"'.format(parent_offspring_output_fpath)) # Return paths of text files generated. First path: individual, sample, sex information file. Second path: parent-offspring information file. - return [sex_information_output_fpath,parent_offspring_output_fpath] + return [sex_information_output_fpath, parent_offspring_output_fpath] def relatedness_results_data_model(method): - relatedness_json = [ - { - "method": "", - "maf": "", - "scores": [ - { - "sampleId1": "", - "sampleId2": "", - "reportedRelationship": "", - "inferredRelationship": "", - "validation": "", - "values": { - "RT": "", - "ez": "", - "z0": "", - "z1": "", - "z2": "", - "PiHat": "" - } + relatedness_json = { + "method": "", + "software": { + "name": "", + "version": "", + "commit": "", + "params": {} + }, + "scores": [ + { + "sampleId1": "", + "sampleId2": "", + "reportedRelationship": "", + "inferredRelationship": "", + "validation": "", + "values": { + "RT": "", + "ez": "", + "z0": "", + "z1": "", + "z2": "", + "PiHat": "" } - ] + } + ], + "images": [ + { + "name": "", + "base64": "", + "description": "" + } + ], + "attributes": { + "cli": "", + "files": [], + "JOB_ID": "" } - ] + } + method = str(method) if method == "PLINK/IBD": LOGGER.info('Relatedness method to be used: "{}"'.format(method)) relatedness_json["method"] = "PLINK/IBD" - relatedness_json["maf"] = "1000G:ALL>0.3" + relatedness_json["software"]["name"] = "plink" + relatedness_json["software"]["version"] = "1.9" else: LOGGER.info('Relatedness method to 
be used: "{}"'.format(method)) - relatedness_json["method"] = str(method) - relatedness_json["maf"] = "NA" + relatedness_json["method"] = method # Return relatedness json data model with method info filled in. return relatedness_json - def relatedness_plink(self,filtered_vcf_fpath,pop_freq_fpath,pop_exclude_var_fpath,outdir_fpath,plink_path,method="PLINK/IBD"): + def relatedness_plink(self, filtered_vcf_fpath, pop_freq_fpath, pop_exclude_var_fpath, outdir_fpath, plink_path, + method="PLINK/IBD"): LOGGER.info('Method: {}'.format(method)) - plink_outdir_fpath = create_output_dir(path_elements=[str(outdir_fpath),'plink_IBD']) + plink_outdir_fpath = create_output_dir(path_elements=[str(outdir_fpath), 'plink_IBD']) sex_info_fpath, parent_offspring_fpath = self.generate_files_for_plink_fam_file(outdir_fpath=plink_outdir_fpath) # Preparing PLINK commands plink_path = str(plink_path) files_prefix = self.id_ + "_plink_relatedness_results" - plink_output_folder_files_prefix = os.path.join(plink_outdir_fpath,files_prefix) + plink_output_folder_files_prefix = os.path.join(plink_outdir_fpath, files_prefix) cmd_plink_files = ' '.join([plink_path, "--vcf", str(filtered_vcf_fpath), "--make-bed", "--const-fid", self.id_, - "--chr","1-22", + "--chr", "1-22", "--not-chr", "X,Y,MT", "--allow-extra-chr", "--snps-only", @@ -207,33 +240,35 @@ def relatedness_plink(self,filtered_vcf_fpath,pop_freq_fpath,pop_exclude_var_fpa plink_files = execute_bash_command(cmd_plink_files) if plink_files[0] == 0: plink_files_generated = os.listdir(plink_outdir_fpath) - LOGGER.info('Files available in directory "{}":\n{}'.format(plink_outdir_fpath,plink_files_generated)) + LOGGER.info('Files available in directory "{}":\n{}'.format(plink_outdir_fpath, plink_files_generated)) cmd_plink_ibd = ' '.join([plink_path, - "--bfile", plink_output_folder_files_prefix, - "--genome", "rel-check", - "--read-freq", str(pop_freq_fpath), - "--exclude", str(pop_exclude_var_fpath), - "--out", 
plink_output_folder_files_prefix]) + "--bfile", plink_output_folder_files_prefix, + "--genome", "rel-check", + "--read-freq", str(pop_freq_fpath), + "--exclude", str(pop_exclude_var_fpath), + "--out", plink_output_folder_files_prefix]) LOGGER.debug("Performing IBD analysis") plink_ibd = execute_bash_command(cmd_plink_ibd) if plink_ibd[0] == 0: plink_files_generated = os.listdir(plink_outdir_fpath) - LOGGER.info('Files available in directory "{}":\n{}'.format(plink_outdir_fpath,plink_files_generated)) + LOGGER.info('Files available in directory "{}":\n{}'.format(plink_outdir_fpath, plink_files_generated)) plink_genome_fpath = plink_output_folder_files_prefix + '.genome' if os.path.isfile(plink_genome_fpath) == False: - LOGGER.error('File "{}" does not exist. Check:\nSTDOUT: "{}"\nSTDERR: "{}"'.format(plink_genome_fpath,plink_ibd[1],plink_ibd[2])) - raise Exception('File "{}" does not exist. Check:\nSTDOUT: "{}"\nSTDERR: "{}"'.format(plink_genome_fpath,plink_ibd[1],plink_ibd[2])) + LOGGER.error('File "{}" does not exist. Check:\nSTDOUT: "{}"\nSTDERR: "{}"'.format(plink_genome_fpath, + plink_ibd[1], + plink_ibd[2])) + raise Exception( + 'File "{}" does not exist. 
Check:\nSTDOUT: "{}"\nSTDERR: "{}"'.format(plink_genome_fpath, + plink_ibd[1], plink_ibd[2])) else: # Return method used and path of .genome file generated by PLINK - return [method,plink_genome_fpath] + return [method, plink_genome_fpath] - def relatedness_validation(reported_result,inferred_result): + def relatedness_validation(reported_result, inferred_result): LOGGER.debug('Comparing reported {} and inferred {} results'.format(reported_result, inferred_result)) - if 'UNKNOWN' in reported_result or 'UNKNOWN' in inferred_result: - validation = "UNKNOWN" - elif reported_result == "" or inferred_result == "": + if 'UNKNOWN' in reported_result or reported_result == "": validation = "UNKNOWN" else: if reported_result == inferred_result: @@ -243,17 +278,17 @@ def relatedness_validation(reported_result,inferred_result): # Return validation result return validation - def relatedness_inference(self,relatedness_thresholds_fpath,method,plink_genome_fpath): + def relatedness_inference(self, relatedness_thresholds_fpath, method, plink_genome_fpath): # Reading relatedness thresholds file (.tsv) LOGGER.debug('Getting relatedness thresholds from file: "{}"'.format(relatedness_thresholds_fpath)) relatedness_thresholds_fhand = open(str(relatedness_thresholds_fpath)) relationship_groups_thresholds_dict = {} - for index,line in enumerate(relatedness_thresholds_fhand): + for index, line in enumerate(relatedness_thresholds_fhand): relatedness_thresholds_row_values = line.strip().split() if index == 0: relatedness_thresholds_file_header = relatedness_thresholds_row_values continue - for column,value in enumerate(relatedness_thresholds_row_values): + for column, value in enumerate(relatedness_thresholds_row_values): if relatedness_thresholds_file_header[column] == 'relationship': relationship_groups_thresholds_dict[value] == {} else: @@ -265,7 +300,7 @@ def relatedness_inference(self,relatedness_thresholds_fpath,method,plink_genome_ relatedness_results = 
self.relatedness_results_data_model(method) relatedness_scores = [] - for index,line in enumerate(input_genome_file_fhand): + for index, line in enumerate(input_genome_file_fhand): genome_file_row_values = line.strip().split() if index == 0: continue @@ -281,76 +316,101 @@ def relatedness_inference(self,relatedness_thresholds_fpath,method,plink_genome_ score["values"]["PiHat"] = str(genome_file_row_values[9]) # Inferring family relationship block: - LOGGER.debug("Inferring family relationship between sample {} and sample {} ".format(str(genome_file_row_values[1]),str(genome_file_row_values[3]))) + LOGGER.debug( + "Inferring family relationship between sample {} and sample {} ".format(str(genome_file_row_values[1]), + str(genome_file_row_values[3]))) inference_groups = [] - for relationship,values in relationship_groups_thresholds_dict.items(): + for relationship, values in relationship_groups_thresholds_dict.items(): # Check if PI_HAT, Z0, Z1, Z2 values (from PLINK .genome file) are within range (internal thresholds) - if (values['minPiHat'] <= score["values"]["PiHat"] <= values['maxPiHat']) and (values['minZ0'] <= score["values"]["z0"] <= values['maxZ0']) and (values['minZ1'] <= score["values"]["z1"] <= values['maxZ1']) and (values['minZ2'] <= score["values"]["z2"] <= values['maxZ2']): + if (values['minPiHat'] <= score["values"]["PiHat"] <= values['maxPiHat']) and ( + values['minZ0'] <= score["values"]["z0"] <= values['maxZ0']) and ( + values['minZ1'] <= score["values"]["z1"] <= values['maxZ1']) and ( + values['minZ2'] <= score["values"]["z2"] <= values['maxZ2']): inference_groups.append(str(relationship)) continue if len(inference_groups) == 0: score["inferredRelationship"] = "UNKNOWN" - LOGGER.info("UNKNOWN family relationship inferred between sample {} and sample {} ".format(str(genome_file_row_values[1]),str(genome_file_row_values[3]))) + LOGGER.info("UNKNOWN family relationship inferred between sample {} and sample {} ".format( + 
str(genome_file_row_values[1]), str(genome_file_row_values[3]))) else: score["inferredRelationship"] = ', '.join(inference_groups) - LOGGER.info("Family relationship inferred between sample {} and sample {} ".format(str(genome_file_row_values[1]),str(genome_file_row_values[3]))) + LOGGER.info( + "Family relationship inferred between sample {} and sample {} ".format(str(genome_file_row_values[1]), + str(genome_file_row_values[3]))) relatedness_scores.append(score) relatedness_results["scores"] = relatedness_scores # Return dict/json with plink and inferred results return relatedness_results - def relatedness_report(self,relatedness_inference_results): + def relatedness_report(self, relatedness_inference_results): samples_individuals = self.get_samples_individuals_info() # Getting reported family relationship block: relatedness_results = relatedness_inference_results for score_result in relatedness_inference_results["scores"]: - LOGGER.debug('Getting reported relatedness information for sample {} and sample {}'.format(score_result["sampleId1"],score_result["sampleId2"])) + LOGGER.debug( + 'Getting reported relatedness information for sample {} and sample {}'.format(score_result["sampleId1"], + score_result[ + "sampleId2"])) reported_relationship = [] individual1_info = samples_individuals[score_result["sampleId1"]] individual2_info = samples_individuals[score_result["sampleId2"]] if individual1_info["individualId"] == "" or individual2_info["individualId"] == "": - LOGGER.warning('No individual information available for sample {} and sample {}). Hence reported family relationship UNKNOWN'.format(score_result["sampleId1"],score_result["sampleId2"])) + LOGGER.warning( + 'No individual information available for sample {} and sample {}). 
Hence reported family relationship UNKNOWN'.format( + score_result["sampleId1"], score_result["sampleId2"])) relatedness_results["scores"]["reportedRelationship"] = "UNKNOWN" continue else: - unknown_results = [False,False] + unknown_results = [False, False] if individual1_info["individualId"] in individual2_info["familyMembersRoles"].keys(): - reported_relationship.append(individual2_info["familyMembersRoles"][individual1_info["individualId"]]) + reported_relationship.append( + individual2_info["familyMembersRoles"][individual1_info["individualId"]]) else: reported_relationship.append("UNKNOWN") unknown_results[0] = True if individual2_info["individualId"] in individual1_info["familyMembersRoles"].keys(): - reported_relationship.append(individual1_info["familyMembersRoles"][individual2_info["individualId"]]) + reported_relationship.append( + individual1_info["familyMembersRoles"][individual2_info["individualId"]]) else: reported_relationship.append("UNKNOWN") unknown_results[1] = True if all(unknown_results): relatedness_results["scores"]["reportedRelationship"] = "UNRELATED" - LOGGER.info("UNRELATED family relationship found for sample {} (individual: {}) and sample {} (individual: {})".format(score_result["sampleId1"], individual1_info["individualId"],score_result["sampleId2"], individual2_info["individualId"])) + LOGGER.info( + "UNRELATED family relationship found for sample {} (individual: {}) and sample {} (individual: {})".format( + score_result["sampleId1"], individual1_info["individualId"], score_result["sampleId2"], + individual2_info["individualId"])) elif any(unknown_results): - LOGGER.warning('Family relationship discrepancy found for sample {} (individual: {}) and sample {} (individual: {}). 
Hence reported family relationship UNKNOWN'.format(score_result["sampleId1"], individual1_info["individualId"],score_result["sampleId2"], individual2_info["individualId"])) + LOGGER.warning( + 'Family relationship discrepancy found for sample {} (individual: {}) and sample {} (individual: {}). Hence reported family relationship UNKNOWN'.format( + score_result["sampleId1"], individual1_info["individualId"], score_result["sampleId2"], + individual2_info["individualId"])) relatedness_results["scores"]["reportedRelationship"] = "UNKNOWN" else: relatedness_results["scores"]["reportedRelationship"] = ', '.join(reported_relationship) - LOGGER.info("Family relationship reported for sample {} (individual: {}) and sample {} (individual: {})".format(score_result["sampleId1"], individual1_info["individualId"],score_result["sampleId2"], individual2_info["individualId"])) + LOGGER.info( + "Family relationship reported for sample {} (individual: {}) and sample {} (individual: {})".format( + score_result["sampleId1"], individual1_info["individualId"], score_result["sampleId2"], + individual2_info["individualId"])) # Validating reported vs inferred family relationship results block: - validation_result = self.relatedness_validation(relatedness_results["scores"]["reportedRelationship"],relatedness_results["scores"]["inferredRelationship"]) + validation_result = self.relatedness_validation(relatedness_results["scores"]["reportedRelationship"], + relatedness_results["scores"]["inferredRelationship"]) relatedness_results["scores"]["validation"] = validation_result # Return dict/json with plink, inferred, reported and validation results return relatedness_results - def relatedness_results_json(self,relatedness_results, outdir_fpath): + def relatedness_results_json(self, relatedness_results, outdir_fpath): relatedness_output_dir_fpath = outdir_fpath # Generating json file with relatedness results relatedness_results_file_name = 'relatedness.json' - relatedness_results_fpath = 
os.path.join(relatedness_output_dir_fpath,relatedness_results_file_name) + relatedness_results_fpath = os.path.join(relatedness_output_dir_fpath, relatedness_results_file_name) LOGGER.debug('Generating json file with relatedness results. File path: "{}"'.format(relatedness_results_fpath)) - json.dump(relatedness_results,open(relatedness_results_fpath,'w')) + json.dump(relatedness_results, open(relatedness_results_fpath, 'w')) LOGGER.info('Json file with relatedness results generated. File path: "{}"'.format(relatedness_results_fpath)) # Return json file path with relatedness results @@ -363,18 +423,21 @@ def relatedness(self): relatedness_thresholds_fpath = "/path/to/relatedness_thresholds.tsv" # Create output dir for relatedness analysis - relatedness_output_dir_fpath = create_output_dir(path_elements=[self.output_parent_dir,'relatedness']) + relatedness_output_dir_fpath = create_output_dir(path_elements=[self.output_parent_dir, 'relatedness']) # Filtering VCF and renaming variants - filtered_vcf_fpath = self.filter_rename_variants_vcf(pop_freq_fpath,relatedness_output_dir_fpath) + filtered_vcf_fpath = self.filter_rename_variants_vcf(pop_freq_fpath, relatedness_output_dir_fpath) # Performing IBD analysis from PLINK - method, plink_genome_fpath = self.relatedness_plink(filtered_vcf_fpath,pop_freq_fpath,pop_exclude_var_fpath,relatedness_output_dir_fpath) + method, plink_genome_fpath = self.relatedness_plink(filtered_vcf_fpath, pop_freq_fpath, pop_exclude_var_fpath, + relatedness_output_dir_fpath) # Inferring family relationships - relatedness_inference_dict = self.relatedness_inference(relatedness_thresholds_fpath,method,plink_genome_fpath) + relatedness_inference_dict = self.relatedness_inference(relatedness_thresholds_fpath, method, + plink_genome_fpath) # Getting reported family relationships and validating inferred vs reported results relatedness_results_dict = self.relatedness_report(relatedness_inference_dict) # Generating file with results - 
relatedness_results_json_fpath = self.relatedness_results_json(relatedness_results_dict,relatedness_output_dir_fpath) + relatedness_results_json_fpath = self.relatedness_results_json(relatedness_results_dict, + relatedness_output_dir_fpath) def run(self): # Checking data diff --git a/opencga-app/app/analysis/qc/individual_qc/individual_qc.py b/opencga-app/app/analysis/qc/individual_qc/individual_qc.py index 075087ff98..9e0628ce75 100644 --- a/opencga-app/app/analysis/qc/individual_qc/individual_qc.py +++ b/opencga-app/app/analysis/qc/individual_qc/individual_qc.py @@ -1,15 +1,16 @@ #!/usr/bin/env python3 -import subprocess -import sys -#import bokeh import os import logging - -from utils import create_output_dir +import json +import gzip +from utils import create_output_dir, execute_bash_command LOGGER = logging.getLogger('variant_qc_logger') +chrx_vars = "data/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chrX.recalibrated_variants_filtered_annotated_chrX.prune.in" +chrx_var_frq = "data/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_chrX.recalibrated_variants_filtered_annotated_chrX.frq" + class IndividualQCExecutor: def __init__(self, vcf_file, info_file, bam_file, config, output_parent_dir, sample_ids, id_): """Create output dir @@ -30,85 +31,351 @@ def __init__(self, vcf_file, info_file, bam_file, config, output_parent_dir, sam self.sample_ids = sample_ids self.id_ = id_ - def run(self): + # Loading configuration + config_fhand = open(self.config, 'r') + self.config_json = json.load(config_fhand) + config_fhand.close() + +# def run(self): # Checking data # self.checking_data() # TODO check input data # Running sample QC steps # self.step1() # TODO run all encessary steps for this QC (e.g. 
relatedness) - + if self.bam_file != None: + self.compute_coverage_metrics(bam_file=self.bam_file, output_dir=self.output_parent_dir) + # check the output of the compute_coverage_metrics is OK and continue with the checks + self.calculate_aneuploidies() + self.calculate_coverage_based_inferred_sex() + else: + LOGGER.warning("Skipping coverage-based inferred sex: BIGWIG file not found for sample '" + sample_id + "'") + pass # Return results - # ... # TODO return results +# # ... # TODO return results pass - def step1(self): +# def step1(self): # Create output dir for this step output_dir = create_output_dir([self.output_parent_dir, 'step1']) # Run step1 # ... # TODO execute this step commands +# pass + + def compute_coverage_metrics(self, bam_file, output_dir): + """ + Given a BigWig file, it calculates the average coverage per chromosome and autosomes + :param bw_file: Path to the BigWig file for the sample + :param output_dir: Output directory + + :return: dict with median coverage per chr1-22,X,Y and autosomes + """ + # Create output dir for this step + output_dir = create_output_dir([self.output_parent_dir, 'coverage_metrics']) + + bam_file = "file.bam" + #cr = crpb.CountReadsPerBin([bam_file], binLength=50, stepSize=50) + + def calculate_aneuploidies(self, coverage, output_dir): + # Create output dir for this step + output_dir = create_output_dir([self.output_parent_dir, 'aneuploidies']) + pass + + def calculate_coverage_based_inferred_sex(bw_file, config, coverage, output_dir): + """ + Calculates inferred sex based on coverage if BAM information is available and checks if matches against the expected + :param bw_file: Path to the BigWig file for the sample + :param output_dir: Output directory + :return: + """ + chromosomes = ['chrX', 'chrY', "autosomes"] + if all(chromosomes) in coverage and all(coverage[chromosomes]) is not None: + # Calculate ratio X-chrom / autosomes + ratio_chrX = float(coverage["chrX"] / coverage["autosomes"]) + # Calculate ratio Y-chrom 
/ autosomes + ratio_chrY = float(coverage["chrY"] / coverage["autosomes"]) + + + else: + LOGGER.warning("Coverage-based inferred sex not calculate for sample {sample}. Missing or invalid coverage " + "values".format(sample=sample_ids)) + + def filter_variants(self, chrx_vars, outdir_path): + """ + Annotates input VCF with chromosome coordinate ID and filters for LD pruned PASS chr X variants using BCFTools. + :param self: Input data + :param chrX_vars: List of LD pruned chr X variants + :param outdir_path: Output directory path + :return: + """ + individual_id = self.id_ + # Read in input files: + individual_info_json = json.load(self.info_file) + vcf_file = gzip.open(self.vcf_file,'r') + good_vars = open(chrx_vars, 'r') + for sample in self.sample_ids: + sample_id = sample + # Create output directory: + filtered_vcf_name = individual_id + "_" + sample_id + '_passed_variants.vcf.gz' + filtered_vcf_path = os.path.join(str(outdir_path),filtered_vcf_name) + + # Annotate input VCF with chromosome coordinate ID: + bcftools_annotate_cmd = "bcftools annotate --set-id '%CHROM\:%POS\:%REF\:%FIRST_ALT' " + vcf_file + " -Oz -o " + os.path.join(outdir_path, "/annotated.vcf.gz") + execute_bash_command(cmd=bcftools_annotate_cmd) + LOGGER.info("{file} annotated with chromosome coordinate IDs".format(file=vcf_file)) + + # Filter for chromosome X LD pruned variants: + bcftools_variant_filter_cmd = "bcftools view --include ID==@" + good_vars + " " + os.path.join(outdir_path, "/annotated.vcf.gz") + " -Oz -o " + os.path.join(outdir_path, "/varfilter.vcf.gz") + execute_bash_command(cmd=bcftools_variant_filter_cmd) + LOGGER.info("annotated.vcf.gz filtered for chr X LD pruned variants") + + # Check if FILTER field contains PASS variants: + varcount_all_cmd = "bcftools view -i 'FILTER="PASS"' " + vcf_file + " zgrep -vw '#/|CHROM' | wc -l" + all_pass_vars = execute_bash_command(cmd=varcount_all_cmd) + LOGGER.info("checking for PASS variants in the FILTER field of '{}'".format(vcf_file)) + 
+ # Filter for PASS variants: + bcftools_pass_filter_cmd = "bcftools view -i 'FILTER="PASS"' " + os.path.join(outdir_path, "/varfilter.vcf.gz") + " -Oz -o " + filtered_vcf_path + execute_bash_command(cmd=bcftools_pass_filter_cmd) + varcount_filt_cmd = "bcftools view -i 'FILTER="PASS"' " + filtered_vcf_path + " zgrep -vw '#\|CHROM' | wc -l" + chrX_variants = execute_bash_command(cmd=varcount_filt_cmd) + if all_pass_vars == 0: + LOGGER.info("WARNING: no FILTER information available, input data will not be filtered for PASS variants, results may be unreliable. There are '{}' chr X LD pruned variants after filtering".format(chrX_variants)) + elif chrX_variants < 1000: # NB, need to come up with a good threshold - this is a random suggestion + LOGGER.info("WARNING: poor quality data, results may be unreliable. There are '{}' good quality chr X LD pruned variants after filtering".format(chrX_variants)) + else: + LOGGER.info("There are '{}' good quality chr X LD pruned variants after filtering") + # Return annotated, filtered VCF path: + return filtered_vcf_path pass -vcf_file = "/home/mbleda/Downloads/KFSHRC/CGMQ2022-02850.vcf.gz" -output_dir = "/tmp/qc_tests/" -sample_ids = "" -info_file = "" -config = "" - - -def run(vcf_file, sample_ids, info_file, config, output_dir): - inferred_sex(vcf_file=vcf_file, output_dir=output_dir) - # missingness() - # heterozygosity () - # roh() - # upd() - -def exec_bash_command(cmd_line): - """ - Run a bash command (e.g. 
bcftools), and return output - """ - po = subprocess.Popen(cmd_line, - shell=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - - stdout, stderr = po.communicate() - po.wait() - return_code = po.returncode - - if return_code != 0: - raise Exception( - "Command line {} got return code {}.\nSTDOUT: {}\nSTDERR: {}".format(cmd_line, return_code, stdout, - stderr)) - - return po.returncode, stdout - - -def bcftools_stats(vcf_file, output_dir): - """ - Calculates VCF stats using BCFTools - :param vcf_file: VCF file to get stats from - :param output_dir: Output directory - :return: - """ - bcftools_stats_output = exec_bash_command(cmd_line='bcftools stats -v ' + vcf_file + ' > ' + output_dir + '/bcftools_stats.txt') - if bcftools_stats_output[0] == 0: - print("BCFTools stats calculated successfully for {file}".format(file=vcf_file)) - #plot_bcftools_stats(file=output_dir + '/bcftools_stats.txt', prefix="stats", output=output_dir) - -def inferred_sex(vcf_file, output_dir): - """ - Calculates inferred sex and checks if matches against the expected - :param vcf_file: VCF file containing variants for chrX and chrY. This VCF should have - been previously filtered to contain variants - :param output_dir: Output directory - :return: - """ - binary = "/home/mbleda/soft/plink/plink_linux_x86_64_20231018/plink" - plink_sex_output = exec_bash_command(cmd_line=binary + 'bcftools stats -v ' + vcf_file + ' > ' + output_dir + '/bcftools_stats.txt') - # if bcftools_stats_output[0] == 0: - # print("BCFTools stats calculated successfully for {file}".format(file=vcf_file)) - # plot_bcftools_stats(file=output_dir + '/bcftools_stats.txt', prefix="stats", output=output_dir) - -if __name__ == '__main__': - sys.exit(run(vcf_file=vcf_file, sample_ids=sample_ids, info_file=info_file, config=config, output_dir=output_dir)) + + def get_individual_sex(self): + """ + Retrieve individual sex for sample and recode for Plink input. 
+        """
+        individual_info_json = json.load(open(self.info_file, 'r'))
+        indsex = individual_info_json['sex']['id']
+        karsex = individual_info_json['karyotypicSex']
+        ind_metadata = {}
+        for sample in self.sample_ids:
+            ind_metadata[sample] = {'individualId': individual_info_json['id'], 'individualSex': 0, 'sampleId': 'NA'}
+
+        LOGGER.debug("Retrieve sex information and recode")
+        for sam in individual_info_json['samples']:
+            if sam['id'] in self.sample_ids:
+                ind_metadata[sam['id']]['sampleId'] = sam['id']
+                if indsex.upper() == 'MALE' or karsex == 'XY':
+                    ind_metadata[sam['id']]['individualSex'] = 1
+                elif indsex.upper() == 'FEMALE' or karsex == 'XX':
+                    ind_metadata[sam['id']]['individualSex'] = 2
+                else:
+                    LOGGER.info("Sex information for Individual '{}' (Sample '{}') is not available, sex will be inferred".format(individual_info_json['id'],sam['id']))
+        # Check individual information for each sample is present:
+        for sample,individual_info in ind_metadata.items():
+            if individual_info['individualSex'] == 0:
+                LOGGER.warning("No individual information available for sample '{}'.".format(individual_info['sampleId']))
+            else:
+                LOGGER.info("Individual information for sample '{}' found".format(individual_info['sampleId']))
+        # Return sex information:
+        return ind_metadata
+        pass
+
+
+    def generate_plink_fam_file_input(self,output_dir):
+        # Retrieve sex information:
+        individual_id = self.id_
+        ind_metadata = self.get_individual_sex()
+
+        # Generate a text file to update the Plink fam file with sex information:
+        sex_info_file_name = individual_id + '_sex_information.txt'
+        sex_info_path = os.path.join(str(output_dir),sex_info_file_name)
+        sex_info_input = open(sex_info_path,'w')
+        LOGGER.debug("Generating text file to update the input sex information: '{}'".format(sex_info_path))
+
+        for sample in self.sample_ids:
+            # Individual information for sample:
+            individual_info = ind_metadata[sample]
+            # Plink sex update file format: familyId individualId sex (note, for the purpose of simplicity, 
individualId replaces familyId below)
+            ###### BECAUSE IT'S POSSIBLE TO HAVE MULTIPLE FAMILY IDS & THE FAMILY ID ISN'T ACTUALLY REQUIRED FOR THE ANALYSIS
+            sex_info_input.write(('\t'.join([sample,sample,str(individual_info['individualSex'])])) + '\n')
+        LOGGER.info("Text file generated to update the Plink fam file with sex information: '{}'".format(sex_info_path))
+        sex_info_input.close()
+        # Return paths of text files generated:
+        return sex_info_path
+        pass
+
+
+    def inferred_sex_results_data_model(self, sex_method):
+        inferredSex = [
+            {
+                "method": "",
+                "sampleId": "",
+                "inferredKaryotypicSex": "", #### I'M NOT SURE I LIKE THIS - I'D RATHER HAVE "inferredSex" - it's specific to the coverage based inference
+                "software": {
+                    "name": "plink",
+                    "version": "1.9",
+                    "commit": "",
+                    "params": {
+                        "key": "value"
+                    }
+                },
+                "values": {
+                    "sampleId": "",
+                    "PEDSEX": "",
+                    "SNPSEX": "",
+                    "STATUS": "",
+                    "F": ""
+                },
+                "images": [
+                    {
+                        "name": "Sex check",
+                        "base64": "",
+                        "description": "TO UPDATE"
+                    }
+                ],
+                "attributes": {
+                    "cli": "",
+                    "files": [],
+                    "JOB_ID": ""
+                }
+            }
+        ]
+        if sex_method == "Plink/check-sex":
+            LOGGER.info("Relatedness method used: '{}'".format(sex_method))
+            inferredSex[0]['method'] = str(sex_method)
+        elif sex_method == "Plink/impute-sex":
+            LOGGER.info("Relatedness method used: '{}'".format(sex_method))
+            inferredSex[0]['method'] = str(sex_method)
+            inferredSex[0]['values']['PEDSEX'] = "NA"
+            inferredSex[0]['values']['STATUS'] = "INFERRED"
+# elif sex_method == "MartaCoverageBasedSexMethodName":
+# LOGGER.info("Relatedness method used: '{}'".format(sex_method))
+# #etc
+        else:
+            LOGGER.warning("No method for sex inference defined.")
+        # Return inferredSex data model json with method specific fields filled in:
+        return inferredSex
+        pass
+
+
+    def check_sex_plink(self,filtered_vcf_path,chrx_var_frq,plink_outdir,sex_method="Plink/check-sex"):
+        # Define Plink check-sex methodology:
+        LOGGER.info("Method: {}".format(sex_method))
+        plink_outdir_path = create_output_dir(path_elements=[str(plink_outdir),"plink_check_sex"])
+        
sex_info_path = self.generate_plink_fam_file_input(output_dir=plink_outdir_path)
+
+        # Plink parameters:
+        file_prefix = self.id_ + "plink_sex_check_results"
+        plink_output_prefix = os.path.join(plink_outdir,file_prefix)
+
+        # Convert input to Plink format:
+        plink_convert_input = ' '.join([plink,
+                                        "--vcf", str(filtered_vcf_path),
+                                        "--make-bed",
+                                        "--keep-allele-order",
+                                        "--update-sex", sex_info_path,
+                                        "--split-x", "hg38", "no-fail",
+                                        "--out", plink_output_prefix])
+        LOGGER.debug("Generating Plink check-sex input files")
+        plink_input_files = execute_bash_command(plink_convert_input)
+        if plink_input_files[0] == 0:
+            plink_files_generated = os.listdir(plink_outdir)
+            LOGGER.info("Files available: '{}':\n{}".format(plink_outdir,plink_files_generated))
+
+        # Run Plink check-sex analysis:
+        plink_check_sex_input = ' '.join([plink,
+                                          "--bfile", plink_output_prefix,
+                                          "--read-freq", str(chrx_var_frq),
+                                          "--check-sex",
+                                          "--out", plink_output_prefix])
+        LOGGER.debug("Performing Plink check-sex")
+        plink_sexcheck = execute_bash_command(plink_check_sex_input)
+        if plink_sexcheck[0] == 0:
+            plink_files_generated = os.listdir(plink_outdir)
+            LOGGER.info("Files available: '{}':\n{}".format(plink_outdir,plink_files_generated))
+
+        plink_sexcheck_path = plink_output_prefix + '.sexcheck'
+        if os.path.isfile(plink_sexcheck_path) == False:
+            LOGGER.error("File '{}' does not exist. Check:\nSTDOUT: '{}'\nSTDERR: '{}'".format(plink_sexcheck_path,plink_sexcheck[1],plink_sexcheck[2]))
+            raise Exception("File '{}' does not exist. 
Check:\nSTDOUT: '{}'\nSTDERR: '{}'".format(plink_sexcheck_path,plink_sexcheck[1],plink_sexcheck[2]))
+        else:
+            # Return method used and path of the output .sexcheck file generated by Plink
+            return [sex_method,plink_sexcheck_path]
+        pass
+
+
+    def impute_sex_plink(self,filtered_vcf_path,chrx_var_frq,plink_outdir,sex_method="Plink/impute-sex"):
+        # Define Plink impute-sex methodology:
+        LOGGER.info("Method: {}".format(sex_method))
+        plink_outdir_path = create_output_dir(path_elements=[str(plink_outdir),"plink_impute_sex"])
+
+        # Plink parameters:
+        file_prefix = self.id_ + "plink_impute_check_results"
+        plink_output_prefix = os.path.join(plink_outdir,file_prefix)
+
+        # Convert input to Plink format:
+        plink_convert_input = ' '.join([plink,
+                                        "--vcf", str(filtered_vcf_path),
+                                        "--make-bed",
+                                        "--keep-allele-order",
+                                        "--split-x", "hg38", "no-fail",
+                                        "--out", plink_output_prefix])
+        LOGGER.debug("Generating Plink sex-check input files")
+        plink_input_files = execute_bash_command(plink_convert_input)
+        if plink_input_files[0] == 0:
+            plink_files_generated = os.listdir(plink_outdir)
+            LOGGER.info("Files available: '{}':\n{}".format(plink_outdir,plink_files_generated))
+
+        # Run Plink impute-sex analysis:
+        plink_impute_sex_input = ' '.join([plink,
+                                           "--bfile", plink_output_prefix,
+                                           "--read-freq", str(chrx_var_frq),
+                                           "--impute-sex",
+                                           "--out", plink_output_prefix])
+        LOGGER.debug("Performing Plink impute-sex")
+        plink_imputesex = execute_bash_command(plink_impute_sex_input)
+        if plink_imputesex[0] == 0:
+            plink_files_generated = os.listdir(plink_outdir)
+            LOGGER.info("Files available: '{}':\n{}".format(plink_outdir,plink_files_generated))
+
+        plink_sexcheck_path = plink_output_prefix + '.sexcheck'
+        if os.path.isfile(plink_sexcheck_path) == False:
+            LOGGER.error("File '{}' does not exist. Check:\nSTDOUT: '{}'\nSTDERR: '{}'".format(plink_sexcheck_path,plink_imputesex[1],plink_imputesex[2]))
+            raise Exception("File '{}' does not exist. 
Check:\nSTDOUT: '{}'\nSTDERR: '{}'".format(plink_sexcheck_path,plink_sexcheck[1],plink_sexcheck[2])) + else: + # Return method used and path of the output .sexcheck file generated by Plink + return [sex_method,plink_sexcheck_path] + pass + + + def run_inferred_sex(self): + """ + Run Plink check-sex or impute-sex as appropriate, dependent on the availability of user supplied sex information. + :param self: Input data + :filtered_vcf_path: Input VCF filtered for LD pruned chr X variants + :param chrX_var_frq: 1000 Genomes population based allele frequencies for the selected LD pruned chr X variants + :param plink_outdir: Output directory path + :return: + """ + # Retrieve sex information: + individual_id = self.id_ + ind_metadata = self.get_individual_sex() + + # Run inferred sex analysis: + if ind_metadata['individualSex'] == 0: + inferred_sex_output = impute_sex_plink(TO UPDATE) + LOGGER.info("Running Plink impute-sex") + else: + inferred_sex_output = check_sex_plink(TO UPDATE) + LOGGER.info("Running Plink check-sex") + + + + + def plot_results(TO UPDATE): + # Plot results: + pass + + + diff --git a/opencga-app/app/analysis/qc/sample_qc/sample_qc.py b/opencga-app/app/analysis/qc/sample_qc/sample_qc.py index 3b7f0756bd..66fa020b75 100644 --- a/opencga-app/app/analysis/qc/sample_qc/sample_qc.py +++ b/opencga-app/app/analysis/qc/sample_qc/sample_qc.py @@ -3,12 +3,13 @@ import sys import os import logging - +import json from utils import create_output_dir, execute_bash_command LOGGER = logging.getLogger('variant_qc_logger') + class SampleQCExecutor: def __init__(self, vcf_file, info_file, bam_file, config, output_parent_dir, sample_ids, id_): """Create output dir @@ -29,16 +30,21 @@ def __init__(self, vcf_file, info_file, bam_file, config, output_parent_dir, sam self.sample_ids = sample_ids self.id_ = id_ + # Loading configuration + config_fhand = open(self.config, 'r') + self.config_json = json.load(config_fhand) + config_fhand.close() + + def run(self): - # 
check_data() - # relatedness() - # inferred_sex() - # if info_file.somatic == True && config.mutational_signature.skip == False: - # mutational_signature() - # mendelian_errors() - # return ; - self.bcftools_stats(vcf_file=self.vcf_file) + # Genome plot + if 'skip' not in self.config_json or 'genomePlot' not in self.config_json['skip']: + self.create_genome_plot() + + + + # self.bcftools_stats(vcf_file=self.vcf_file) # missingness() # heterozygosity () @@ -49,6 +55,61 @@ def run(self): # ... # TODO return results pass + def get_genome_plot_vcfs(self, output_dir, gp_config_json): + + # Creating VCF files for each variant type + snvs_fpath = os.path.join(output_dir, 'snvs.tsv') + indels_fpath = os.path.join(output_dir, 'indels.tsv') + cnvs_fpath = os.path.join(output_dir, 'cnvs.tsv') + rearrs_fpath = os.path.join(output_dir, 'rearrs.tsv') + + # TODO Filtering VCF + vcf_fhand = open(self.vcf_file, 'r') + # /opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/genomePlot/GenomePlotLocalAnalysisExecutor.java + vcf_fhand.close() + + return snvs_fpath, indels_fpath, cnvs_fpath, rearrs_fpath + + def create_genome_plot(self): + # Creating output dir for this step + output_dir = create_output_dir([self.output_parent_dir, 'genome-plot']) + + # Getting genome plot config file + gp_config_fpath = self.config_json['genomePlot']['configFile'] + gp_config_fhand = open(gp_config_fpath, 'r') + gp_config_json = json.load(gp_config_fhand) + gp_config_fhand.close() + + # Getting variants + snvs_fpath, indels_fpath, cnvs_fpath, rearrs_fpath = self.get_genome_plot_vcfs(output_dir, gp_config_json) + + # Creating CMD + # /analysis/R/genome-plot/circos.R + # https://github.com/opencb/opencga/blob/TASK-6766/opencga-analysis/src/main/R/genome-plot/circos.R + # TODO input/output paths + cmd = ( + ' R CMD Rscript --vanilla /data/input/circos.R' + ' --genome_version hg38' + ' --out_path /data/output' + ' --plot_title {plot_title}' + ' --out_format {out_format}' + ' 
/data/output/{snvs_fname}' + ' /data/output/{indels_fname}' + ' /data/output/{cnvs_fname}' + ' /data/output/{rearrs_fname}' + ' {sample_id}' + ).format( + plot_title=gp_config_json['title'], + out_format='png', + snvs_fname=os.path.basename(snvs_fpath), + indels_fname=os.path.basename(indels_fpath), + cnvs_fname=os.path.basename(cnvs_fpath), + rearrs_fname=os.path.basename(rearrs_fpath), + sample_id=self.id_ + ) + + return_code, stdout, stderr = execute_bash_command(cmd) + def bcftools_stats(self, vcf_file): """ diff --git a/opencga-app/app/analysis/qc/variant_qc.main.py b/opencga-app/app/analysis/qc/variant_qc.main.py index 263686e869..581056c28b 100644 --- a/opencga-app/app/analysis/qc/variant_qc.main.py +++ b/opencga-app/app/analysis/qc/variant_qc.main.py @@ -35,7 +35,7 @@ def get_parser(): help='comma-separated BAM file paths') parser.add_argument('-q', '--qc-type', dest='qc_type', choices = ['sample', 'individual', 'family'], required=True, help='type of QC') - parser.add_argument('-c', '--config', dest='config', + parser.add_argument('-c', '--config', dest='config', required=True, help='configuration file path') parser.add_argument('-o', '--output-dir', dest='output_dir', help='output directory path') @@ -217,6 +217,8 @@ def main(): shutil.copy(vcf_files[i], qc_outdir_fpath) LOGGER.debug('Copying info JSON file "{}" in output directory "{}"'.format(info_jsons[i], qc_outdir_fpath)) shutil.copy(info_jsons[i], qc_outdir_fpath) + LOGGER.debug('Copying config JSON file "{}" in output directory "{}"'.format(config, qc_outdir_fpath)) + shutil.copy(config, qc_outdir_fpath) # Execute QC qc_executor(