diff --git a/scripts/python/cache/clinvar_cache.py b/scripts/python/cache/clinvar_cache.py new file mode 100644 index 00000000..db878124 --- /dev/null +++ b/scripts/python/cache/clinvar_cache.py @@ -0,0 +1,140 @@ +"""Cache data on variants related to human health, from NCBI ClinVar + +Example: +python clinvar_cache.py +""" + +import csv +import json +import gzip + +clinical_concerns = ['Likely_pathogenic', 'Pathogenic/Likely_pathogenic', 'Pathogenic'] +robust_review_statuses = [ + 'criteria_provided,_multiple_submitters,_no_conflicts', + 'reviewed_by_expert_panel', + 'practice_guideline' +] + +disease_ids_and_names = [] +variant_types = [] +molecular_consequences = [] + +def get_is_relevant(fields): + is_clinical_concern = False + is_robustly_reviewed = False + for field in fields: + [name, value] = field.split('=') + + if name == 'CLNSIG' and value in clinical_concerns: + is_clinical_concern = True + + if name == 'CLNREVSTAT' and value in robust_review_statuses: + is_robustly_reviewed = True + + return is_clinical_concern and is_robustly_reviewed + +def trim_info_fields(fields): + slim_fields = [] + names_to_keep = ['CLNREVSTAT', 'CLNSIG', 'CLNVC', 'MC', 'ORIGIN', 'RS'] + disease_indexes = [] + disease_names = [] + disease_ids = [] + for field in fields: + [name, value] = field.split('=') + + if name == 'CLNDISDB': # "Clinical disease database" + entries = value.split('|') + for entry in entries: + full_values = entry.split(',') + has_mondo = False + for fv in full_values: + split_fv = fv.split(':') + db_name = split_fv[0] + db_value = split_fv[-1] + if db_name == 'MONDO': + disease_ids.append(db_value) + has_mondo = True + if not has_mondo: + disease_ids.append('-1') + + elif name == 'CLNDN': # "Clinical disease name" + disease_names = value.split('|') + + elif name == 'CLNVC': + if value not in variant_types: + variant_types.append(value) + variant_type = variant_types.index(value) + slim_fields.append(str(variant_type)) + + elif name == 'MC': + entries = value.split(',') + slim_mc = [] + for entry in entries: + if entry not in molecular_consequences: + molecular_consequences.append(entry) + molecular_consequence = molecular_consequences.index(entry) + slim_mc.append(str(molecular_consequence)) + slim_fields.append(','.join(slim_mc)) + + elif name in names_to_keep: + if name == 'CLNSIG': + value = clinical_concerns.index(value) + elif name == 'CLNREVSTAT': + value = robust_review_statuses.index(value) + slim_fields.append(str(value)) + + for (i, disease_id) in enumerate(disease_ids): + if disease_id == '-1': + continue + disease_name = disease_names[i] + disease_id_and_name = disease_id + '|' + disease_name + if disease_id_and_name not in disease_ids_and_names: + disease_ids_and_names.append(disease_id_and_name) + disease_index = disease_ids_and_names.index(disease_id_and_name) + disease_indexes.append(str(disease_index)) + + disease_indexes_string = ','.join(disease_indexes) + slim_fields.insert(0, disease_indexes_string) + + slim_info = '\t'.join(slim_fields) + return slim_info + +output_rows = [] + +# https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20241215.vcf.gz +# Source: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/ +with open('clinvar_20241215.vcf') as file: + reader = csv.reader(file, delimiter="\t") + for row in reader: + if row[0][0] == '#': + continue + # print('row', row) + info = row[7] + fields = info.split(';') + is_relevant = get_is_relevant(fields) + if not is_relevant: + continue + slim_info = trim_info_fields(fields) + del row[5:7] + row[5] = slim_info + + output_rows.append('\t'.join(row)) + +content = '\n'.join(output_rows) + +disease_map = json.dumps(disease_ids_and_names) +column_names = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'DISEASE_IDS', 'CLNREVSTAT', 'CLNSIG', 'CLNVC', 'MC', 'ORIGIN', 'RS'] +headers = '\n'.join([ + '# disease_mondo_ids_and_names = ' + disease_map, + '# variant_types = ' + str(variant_types), + '# molecular_consequences = ' + str(molecular_consequences), + '\t'.join(column_names) +]) +content = headers + '\n' + content + + +output_path = 'clinvar_priority_20241215.tsv' +with open(output_path, "w") as f: + f.write(content) +with gzip.open(f"{output_path}.gz", "wt") as f: + f.write(content) diff --git a/test/offline/gene-structure.test.js b/test/offline/gene-structure.test.js index f4a94119..dd4a2e3a 100644 --- a/test/offline/gene-structure.test.js +++ b/test/offline/gene-structure.test.js @@ -22,7 +22,7 @@ describe('Ideogram gene structure functionality', function() { const apoeLabel = document.querySelector('#_c18_a3 path'); apoeLabel.dispatchEvent(new Event('mouseover')); const subparts = document.querySelectorAll('rect.subpart'); - assert.equal(subparts.length, 7); // spliced, without introns + assert.equal(subparts.length, 9); // spliced, without introns done(); }, 500); } @@ -162,7 +162,7 @@ describe('Ideogram gene structure functionality', function() { const sKeydown = new KeyboardEvent('keydown', {key: 's'}); document.dispatchEvent(sKeydown); let subparts = document.querySelectorAll('rect.subpart'); - assert.equal(subparts.length, 10); // includes introns + assert.equal(subparts.length, 12); // includes introns document.dispatchEvent(sKeydown); setTimeout(async function() {