From 386ceb76a26c4ecd6900a9f8223e1493b736a25e Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Tue, 27 Sep 2022 11:55:16 -0400 Subject: [PATCH 01/96] First commit for LIRICAL data loading. --- .../0048_phenotypeprioritization.py | 31 ++++++ seqr/models.py | 38 ++++++++ seqr/urls.py | 3 +- seqr/views/apis/data_manager_api.py | 96 ++++++++++++++++++- ui/pages/DataManagement/DataManagement.jsx | 2 + .../components/PhenotypePri.jsx | 31 ++++++ ui/pages/DataManagement/reducers.js | 17 ++++ ui/pages/DataManagement/selectors.js | 1 + 8 files changed, 217 insertions(+), 2 deletions(-) create mode 100644 seqr/migrations/0048_phenotypeprioritization.py create mode 100644 ui/pages/DataManagement/components/PhenotypePri.jsx diff --git a/seqr/migrations/0048_phenotypeprioritization.py b/seqr/migrations/0048_phenotypeprioritization.py new file mode 100644 index 0000000000..bb24d75db8 --- /dev/null +++ b/seqr/migrations/0048_phenotypeprioritization.py @@ -0,0 +1,31 @@ +# Generated by Django 3.2.15 on 2022-09-27 15:01 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('seqr', '0047_auto_20220908_1851'), + ] + + operations = [ + migrations.CreateModel( + name='PhenotypePrioritization', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('gene_id', models.CharField(max_length=20)), + ('tool', models.CharField(choices=[('E', 'exomiser'), ('L', 'lirical')], max_length=1)), + ('rank', models.IntegerField()), + ('disease_id', models.CharField(max_length=32)), + ('score1', models.FloatField(null=True)), + ('score2', models.FloatField(null=True)), + ('score3', models.FloatField(null=True)), + ('sample', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='seqr.sample')), + ], + options={ + 'unique_together': {('sample', 'gene_id', 'disease_id')}, + }, + ), + ] diff --git a/seqr/models.py b/seqr/models.py index d71dd36e0b..a7ce22508e 100644 --- a/seqr/models.py +++ b/seqr/models.py @@ -1053,3 +1053,41 @@ class Meta: unique_together = ('sample', 'gene_id') json_fields = ['gene_id', 'tpm'] + + +class PhenotypePrioritization(DeletableSampleMetadataModel): + EXOMISER = 'exomiser' + LIRICAL = 'lirical' + EXOMISER_CHOICE = 'E' + LIRICAL_CHOICE = 'L' + SCORE_NAME1 = 'scoreName1' + SCORE_NAME2 = 'scoreName2' + SCORE_NAME3 = 'scoreName3' + TOOL_CHOICES = ( + (EXOMISER_CHOICE, EXOMISER), + (LIRICAL_CHOICE, LIRICAL) + ) + SCORE_NAMES = { + EXOMISER_CHOICE: { + SCORE_NAME1: 'exomiser_score', + SCORE_NAME2: 'phenotype_score', + SCORE_NAME3: 'variant_score', + }, + LIRICAL_CHOICE: { + SCORE_NAME1: 'post_test_probability', + SCORE_NAME2: 'compositeLR', + SCORE_NAME3: None, + } + } + + tool = models.CharField(max_length=1, choices=TOOL_CHOICES) + rank = models.IntegerField() + disease_id = models.CharField(max_length=32) + score1 = models.FloatField(null=True) + score2 = models.FloatField(null=True) + score3 = models.FloatField(null=True) + + class Meta: + unique_together = ('sample', 'gene_id', 'disease_id') + + json_fields = ['gene_id', 'tool', 'rank', 'disease_id', 'score1', 'score2', 'score3'] diff --git a/seqr/urls.py b/seqr/urls.py index 30042b4257..3882d9229e 100644 --- a/seqr/urls.py +++ b/seqr/urls.py @@ -114,7 +114,7 @@ forgot_password from seqr.views.apis.data_manager_api import elasticsearch_status, upload_qc_pipeline_output, delete_index, \ - update_rna_seq, load_rna_seq_sample_data, proxy_to_kibana + update_rna_seq, load_rna_seq_sample_data, 
load_phenotype_pri_data, proxy_to_kibana from seqr.views.apis.report_api import \ anvil_export, \ discovery_sheet, \ @@ -307,6 +307,7 @@ 'data_management/get_all_users': get_all_users, 'data_management/update_rna_seq': update_rna_seq, 'data_management/load_rna_seq_sample/(?P[^/]+)': load_rna_seq_sample_data, + 'data_management/load_phenotype_pri_data': load_phenotype_pri_data, 'summary_data/saved_variants/(?P[^/]+)': saved_variants_page, 'summary_data/success_story/(?P[^/]+)': success_story, diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index d0f7301d7d..7fe2ff938a 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -23,7 +23,9 @@ from seqr.views.utils.json_utils import create_json_response, _to_camel_case from seqr.views.utils.permissions_utils import data_manager_required -from seqr.models import Sample, Individual, RnaSeqOutlier, RnaSeqTpm +from seqr.models import Sample, Individual, RnaSeqOutlier, RnaSeqTpm, PhenotypePrioritization, Project + +from reference_data.models import Omim from settings import KIBANA_SERVER, KIBANA_ELASTICSEARCH_PASSWORD @@ -395,6 +397,98 @@ def load_rna_seq_sample_data(request, sample_guid): return create_json_response({'success': True}) +@data_manager_required +def load_phenotype_pri_data(request): + request_json = json.loads(request.body) + + file_name = request_json['file'] + ignore_extra_samples = request_json['ignoreExtraSamples'] + + logger.info(f'Loading phenotype prioritization data from {file_name}', request.user) + records = _load_phenotype_pri_file(file_name, ignore_extra_samples) + models = PhenotypePrioritization.objects.bulk_create([PhenotypePrioritization(**data) for data in records]) + sample_guids = [data['sample'].guid for data in records] + logger.info(f'create {len(models)} PhenotypePrioritization', request.user, db_update={ + 'dbEntity': PhenotypePrioritization, 'numEntities': len(models), 'parentEntityIds': sample_guids, + 'updateType': 'bulk_create', + }) + + return create_json_response({ + 'info': ['Phenotype prioritization data loaded'], + 'warnings': [], + 'fileName': file_name, + }) + + + +EXPECTED_HEADER = ['tool', 'project', 'sampleId', 'rank', 'geneId', 'diseaseId', 'diseaseName', + PhenotypePrioritization.SCORE_NAME1, 'score1', + PhenotypePrioritization.SCORE_NAME2, 'score2', + PhenotypePrioritization.SCORE_NAME3, 'score3'] + + +def _get_phenotype_pri(record, i, ignore_extra_samples): + tool = next((k for k, v in PhenotypePrioritization.TOOL_CHOICES if v == record['tool']), None) + if not tool: + raise ValueError('Expecting {} for the "tool" column but found {} (record {})'.format( + ', '.join([v for k, v in PhenotypePrioritization.TOOL_CHOICES]), record['tool'], i)) + + project_name = record['project'] + projects = Project.objects.filter(name=project_name) + if len(projects) < 1: + raise ValueError(f'Project {project_name} is not found (record {i})') + project = projects[0] + + sample_id = record['sampleId'] + samples = Sample.objects.filter(sample_id=sample_id, individual__family__project=project, is_active=True, + dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS) + if len(samples) != 1: + if ignore_extra_samples: + return + raise ValueError(f'Sample with ID {sample_id} is not found (record {i})') + + disease_id = record['diseaseId'] + if disease_id.startswith('OMIM:'): + omim_recs = Omim.objects.filter(phenotype_mim_number=int(disease_id.replace('OMIM:', ''))) + if len(omim_recs) < 1: + raise ValueError(f'Disease ID {disease_id} can\'t be found 
in Omim (record {i})') + else: + raise ValueError(f'Unknown disease ID {disease_id} (record {i})') + + for score_name, value in PhenotypePrioritization.SCORE_NAMES[tool].items(): + if record.get(score_name) != value: + raise ValueError(f'Expecting {value} for {score_name} but {record[score_name]} found (record {i})') + + return { + 'sample': samples[0], + 'gene_id': record['geneId'], + 'tool': tool, + 'rank': int(record['rank']), + 'disease_id': disease_id, + 'score1': float(record['score1']), + 'score2': float(record['score2']) if PhenotypePrioritization.SCORE_NAMES[tool][PhenotypePrioritization.SCORE_NAME2] else None, + 'score3': float(record['score3']) if PhenotypePrioritization.SCORE_NAMES[tool][PhenotypePrioritization.SCORE_NAME3] else None, + } + + +def _load_phenotype_pri_file(file_name, ignore_extra_samples): + lines = file_iter(file_name) + + header = next(lines).rstrip().split('\t') + missing_header = [h for h in EXPECTED_HEADER if h not in header] + if len(missing_header): + raise ValueError('The following required columns are missing: {}'.format(', '.join(missing_header))) + + records = [] + for i, line in enumerate(lines): + row = line.rstrip().split('\t') + record = {header[cnt]: col for cnt, col in enumerate(row)} + record = _get_phenotype_pri(record, i, ignore_extra_samples) + if record: + records.append(record) + return records + + # Hop-by-hop HTTP response headers shouldn't be forwarded. # More info at: http://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.5.1 EXCLUDE_HTTP_RESPONSE_HEADERS = { diff --git a/ui/pages/DataManagement/DataManagement.jsx b/ui/pages/DataManagement/DataManagement.jsx index 7729149c68..ab25d5e7a3 100644 --- a/ui/pages/DataManagement/DataManagement.jsx +++ b/ui/pages/DataManagement/DataManagement.jsx @@ -10,6 +10,7 @@ import ElasticsearchStatus from './components/ElasticsearchStatus' import RnaSeq from './components/RnaSeq' import SampleQc from './components/SampleQc' import Users from './components/Users' +import PhenotypePri from './components/PhenotypePri' const IFRAME_STYLE = { position: 'fixed', left: '0', top: '95px' } @@ -22,6 +23,7 @@ export const DATA_MANAGEMENT_PAGES = [ { path: 'sample_qc', component: SampleQc }, { path: 'rna_seq', component: RnaSeq }, { path: 'users', component: Users }, + { path: 'lirical_exomiser', component: PhenotypePri }, ] const DataManagement = ({ match, user }) => ( diff --git a/ui/pages/DataManagement/components/PhenotypePri.jsx b/ui/pages/DataManagement/components/PhenotypePri.jsx new file mode 100644 index 0000000000..de64606ba3 --- /dev/null +++ b/ui/pages/DataManagement/components/PhenotypePri.jsx @@ -0,0 +1,31 @@ +import { connect } from 'react-redux' + +import { validators } from 'shared/components/form/FormHelpers' +import { BooleanCheckbox } from 'shared/components/form/Inputs' +import UploadFormPage from 'shared/components/page/UploadFormPage' + +import { getPhenoPriUploadStats } from '../selectors' +import { uploadPhenoPri } from '../reducers' + +const mapStateToProps = state => ({ + fields: [ + { + name: 'file', + label: 'Phenotype-based prioritization data (.tsv)', + placeholder: 'gs:// Google bucket path', + validate: validators.required, + }, + { + name: 'ignoreExtraSamples', + component: BooleanCheckbox, + label: 'Ignore extra samples', + }, + ], + uploadStats: getPhenoPriUploadStats(state), +}) + +const mapDispatchToProps = { + onSubmit: uploadPhenoPri, +} + +export default connect(mapStateToProps, mapDispatchToProps)(UploadFormPage) diff --git a/ui/pages/DataManagement/reducers.js 
b/ui/pages/DataManagement/reducers.js index 1c20feb3d9..080b56b338 100644 --- a/ui/pages/DataManagement/reducers.js +++ b/ui/pages/DataManagement/reducers.js @@ -8,6 +8,7 @@ const REQUEST_ELASTICSEARCH_STATUS = 'REQUEST_ELASTICSEARCH_STATUS' const RECEIVE_ELASTICSEARCH_STATUS = 'RECEIVE_ELASTICSEARCH_STATUS' const RECEIVE_PIPELINE_UPLOAD_STATS = 'RECEIVE_PIPELINE_UPLOAD_STATS' const RECEIVE_RNA_SEQ_UPLOAD_STATS = 'RECEIVE_RNA_SEQ_UPLOAD_STATS' +const RECEIVE_PHENO_PRI_UPLOAD_STATS = 'RECEIVE_PHENO_PRI_UPLOAD_STATS' const REQUEST_ALL_USERS = 'REQUEST_ALL_USERS' const RECEIVE_ALL_USERS = 'RECEIVE_ALL_USERS' @@ -75,11 +76,27 @@ export const uploadRnaSeq = values => (dispatch) => { }) } +export const uploadPhenoPri = values => (dispatch) => { + let successResponseJson = null + return new HttpRequestHelper( + '/api/data_management/load_phenotype_pri_data', + (responseJson) => { + successResponseJson = responseJson + }, + (e) => { + successResponseJson = { warnings: [e.message] } + }, + ).post(values).then(() => { + dispatch({ type: RECEIVE_PHENO_PRI_UPLOAD_STATS, newValue: successResponseJson }) + }) +} + export const reducers = { elasticsearchStatusLoading: loadingReducer(REQUEST_ELASTICSEARCH_STATUS, RECEIVE_ELASTICSEARCH_STATUS), elasticsearchStatus: createSingleObjectReducer(RECEIVE_ELASTICSEARCH_STATUS), qcUploadStats: createSingleValueReducer(RECEIVE_PIPELINE_UPLOAD_STATS, {}), rnaSeqUploadStats: createSingleValueReducer(RECEIVE_RNA_SEQ_UPLOAD_STATS, {}), + phenoPriUploadStats: createSingleValueReducer(RECEIVE_PHENO_PRI_UPLOAD_STATS, {}), allUsers: createSingleValueReducer(RECEIVE_ALL_USERS, [], 'users'), allUsersLoading: loadingReducer(REQUEST_ALL_USERS, RECEIVE_ALL_USERS), } diff --git a/ui/pages/DataManagement/selectors.js b/ui/pages/DataManagement/selectors.js index 6d59629130..48ffc39d45 100644 --- a/ui/pages/DataManagement/selectors.js +++ b/ui/pages/DataManagement/selectors.js @@ -4,3 +4,4 @@ export const getQcUploadStats = state => state.qcUploadStats export const getRnaSeqUploadStats = state => state.rnaSeqUploadStats export const getAllUsersLoading = state => state.allUsersLoading.isLoading export const getAllUsers = state => state.allUsers +export const getPhenoPriUploadStats = state => state.phenoPriUploadStats From 9c22bf34e8ed30984dd6f9d4a1f90b64d6cca5a1 Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Wed, 28 Sep 2022 15:24:29 -0400 Subject: [PATCH 02/96] Update phenotype pri model. 
--- .../0048_phenotypeprioritization.py | 6 ++-- seqr/models.py | 28 +++++++------------ seqr/views/apis/data_manager_api.py | 21 +++++++------- 3 files changed, 23 insertions(+), 32 deletions(-) diff --git a/seqr/migrations/0048_phenotypeprioritization.py b/seqr/migrations/0048_phenotypeprioritization.py index bb24d75db8..7b08e44af9 100644 --- a/seqr/migrations/0048_phenotypeprioritization.py +++ b/seqr/migrations/0048_phenotypeprioritization.py @@ -1,4 +1,4 @@ -# Generated by Django 3.2.15 on 2022-09-27 15:01 +# Generated by Django 3.2.15 on 2022-09-28 15:26 from django.db import migrations, models import django.db.models.deletion @@ -19,9 +19,7 @@ class Migration(migrations.Migration): ('tool', models.CharField(choices=[('E', 'exomiser'), ('L', 'lirical')], max_length=1)), ('rank', models.IntegerField()), ('disease_id', models.CharField(max_length=32)), - ('score1', models.FloatField(null=True)), - ('score2', models.FloatField(null=True)), - ('score3', models.FloatField(null=True)), + ('scores', models.JSONField()), ('sample', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='seqr.sample')), ], options={ diff --git a/seqr/models.py b/seqr/models.py index a7ce22508e..d15119c78f 100644 --- a/seqr/models.py +++ b/seqr/models.py @@ -1056,38 +1056,30 @@ class Meta: class PhenotypePrioritization(DeletableSampleMetadataModel): - EXOMISER = 'exomiser' - LIRICAL = 'lirical' EXOMISER_CHOICE = 'E' LIRICAL_CHOICE = 'L' - SCORE_NAME1 = 'scoreName1' - SCORE_NAME2 = 'scoreName2' - SCORE_NAME3 = 'scoreName3' TOOL_CHOICES = ( - (EXOMISER_CHOICE, EXOMISER), - (LIRICAL_CHOICE, LIRICAL) + (EXOMISER_CHOICE, 'exomiser'), + (LIRICAL_CHOICE, 'lirical') ) - SCORE_NAMES = { + SCORE_FIELDS = { EXOMISER_CHOICE: { - SCORE_NAME1: 'exomiser_score', - SCORE_NAME2: 'phenotype_score', - SCORE_NAME3: 'variant_score', + 'exomiser_score': 'e', + 'phenotype_score': 'p', + 'variant_score': 'v', }, LIRICAL_CHOICE: { - SCORE_NAME1: 'post_test_probability', - SCORE_NAME2: 'compositeLR', - SCORE_NAME3: None, + 'post_test_probability': 'p', + 'compositeLR': 'c', } } tool = models.CharField(max_length=1, choices=TOOL_CHOICES) rank = models.IntegerField() disease_id = models.CharField(max_length=32) - score1 = models.FloatField(null=True) - score2 = models.FloatField(null=True) - score3 = models.FloatField(null=True) + scores = models.JSONField() class Meta: unique_together = ('sample', 'gene_id', 'disease_id') - json_fields = ['gene_id', 'tool', 'rank', 'disease_id', 'score1', 'score2', 'score3'] + json_fields = ['gene_id', 'tool', 'rank', 'disease_id', 'scores'] diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index 7fe2ff938a..5e354190c7 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -420,11 +420,8 @@ def load_phenotype_pri_data(request): }) - EXPECTED_HEADER = ['tool', 'project', 'sampleId', 'rank', 'geneId', 'diseaseId', 'diseaseName', - PhenotypePrioritization.SCORE_NAME1, 'score1', - PhenotypePrioritization.SCORE_NAME2, 'score2', - PhenotypePrioritization.SCORE_NAME3, 'score3'] + 'scoreName1', 'score1', 'scoreName2', 'score2', 'scoreName3', 'score3'] def _get_phenotype_pri(record, i, ignore_extra_samples): @@ -455,9 +452,15 @@ def _get_phenotype_pri(record, i, ignore_extra_samples): else: raise ValueError(f'Unknown disease ID {disease_id} (record {i})') - for score_name, value in PhenotypePrioritization.SCORE_NAMES[tool].items(): - if record.get(score_name) != value: - raise ValueError(f'Expecting {value} for {score_name} 
but {record[score_name]} found (record {i})') + scores = {} + for score in ['1', '2', '3']: + scoreName = record.get('scoreName' + score) + if scoreName: + score_field = PhenotypePrioritization.SCORE_FIELDS[tool][scoreName] + if not score_field: + raise ValueError(f'Unexpected score name {scoreName} (record {i})') + score = record.get('score' + score) + scores[score_field] = float(score) return { 'sample': samples[0], @@ -465,9 +468,7 @@ def _get_phenotype_pri(record, i, ignore_extra_samples): 'tool': tool, 'rank': int(record['rank']), 'disease_id': disease_id, - 'score1': float(record['score1']), - 'score2': float(record['score2']) if PhenotypePrioritization.SCORE_NAMES[tool][PhenotypePrioritization.SCORE_NAME2] else None, - 'score3': float(record['score3']) if PhenotypePrioritization.SCORE_NAMES[tool][PhenotypePrioritization.SCORE_NAME3] else None, + 'scores': scores, } From ce7ed898daee52f968c9e921c98d62b03960adfd Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Thu, 29 Sep 2022 15:42:50 -0400 Subject: [PATCH 03/96] Add displaying LIRICAL. --- seqr/views/utils/variant_utils.py | 12 +- ui/redux/rootReducer.js | 1 + ui/redux/selectors.js | 1 + .../components/panel/variants/VariantGene.jsx | 117 ++++++++++++------ .../components/panel/variants/selectors.js | 18 ++- 5 files changed, 107 insertions(+), 42 deletions(-) diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index 8d3eecf37c..7bccb0e628 100644 --- a/seqr/views/utils/variant_utils.py +++ b/seqr/views/utils/variant_utils.py @@ -128,6 +128,12 @@ def _get_rna_seq_outliers(gene_ids, families): return data_by_individual_gene +def _get_phenotype_pri_data(gene_ids, families): + data_by_individual_gene = defaultdict(lambda: {'outliers': {}}) + + return data_by_individual_gene + + def _add_family_has_rna_tpm(families_by_guid): tpm_families = RnaSeqTpm.objects.filter( sample__individual__family__guid__in=families_by_guid.keys() @@ -159,7 +165,8 @@ def _add_pa_detail(locus_list_gene, locus_list_guid, gene_json): LOAD_FAMILY_CONTEXT_PARAM = 'loadFamilyContext' def get_variants_response(request, saved_variants, response_variants=None, add_all_context=False, include_igv=True, - add_locus_list_detail=False, include_rna_seq=True, include_project_name=False): + add_locus_list_detail=False, include_rna_seq=True, include_project_name=False, + include_phe_pri=True): response = get_json_for_saved_variants_with_tags(saved_variants, add_details=True) variants = list(response['savedVariantsByGuid'].values()) if response_variants is None else response_variants @@ -204,4 +211,7 @@ def get_variants_response(request, saved_variants, response_variants=None, add_a if families_by_guid: _add_family_has_rna_tpm(families_by_guid) + if include_phe_pri: + response['phePriData'] = _get_phenotype_pri_data(genes.keys(), families) + return response diff --git a/ui/redux/rootReducer.js b/ui/redux/rootReducer.js index 979eb441af..792fd96faa 100644 --- a/ui/redux/rootReducer.js +++ b/ui/redux/rootReducer.js @@ -319,6 +319,7 @@ const rootReducer = combineReducers({ mmeResultsByGuid: createObjectsByIdReducer(RECEIVE_DATA, 'mmeResultsByGuid'), genesById: createObjectsByIdReducer(RECEIVE_DATA, 'genesById'), rnaSeqDataByIndividual: createObjectsByIdReducer(RECEIVE_DATA, 'rnaSeqData'), + phePriDataByIndividual: createObjectsByIdReducer(RECEIVE_DATA, 'phePriData'), genesLoading: loadingReducer(REQUEST_GENES, RECEIVE_DATA), hpoTermsByParent: createObjectsByIdReducer(RECEIVE_HPO_TERMS), hpoTermsLoading: loadingReducer(REQUEST_HPO_TERMS, 
RECEIVE_HPO_TERMS), diff --git a/ui/redux/selectors.js b/ui/redux/selectors.js index 3067761236..0e143793a8 100644 --- a/ui/redux/selectors.js +++ b/ui/redux/selectors.js @@ -30,6 +30,7 @@ export const getLocusListsByGuid = state => state.locusListsByGuid export const getLocusListsIsLoading = state => state.locusListsLoading.isLoading export const getLocusListIsLoading = state => state.locusListLoading.isLoading export const getRnaSeqDataByIndividual = state => state.rnaSeqDataByIndividual +export const getPhePriDataByIndividual = state => state.phePriDataByIndividual export const getUser = state => state.user export const getUserOptionsByUsername = state => state.userOptionsByUsername export const getUserOptionsIsLoading = state => state.userOptionsLoading.isLoading diff --git a/ui/shared/components/panel/variants/VariantGene.jsx b/ui/shared/components/panel/variants/VariantGene.jsx index 298a79402f..80e1b1c26c 100644 --- a/ui/shared/components/panel/variants/VariantGene.jsx +++ b/ui/shared/components/panel/variants/VariantGene.jsx @@ -18,7 +18,7 @@ import { GeneSearchLink } from '../../buttons/SearchResultsLink' import ShowGeneModal from '../../buttons/ShowGeneModal' import Modal from '../../modal/Modal' import { GenCC, ClingenLabel } from '../genes/GeneDetail' -import { getRnaSeqOutilerDataByFamilyGene } from './selectors' +import { getRnaSeqOutilerDataByFamilyGene, getPhePriDataByFamilyGene } from './selectors' const RnaSeqTpm = React.lazy(() => import('./RnaSeqTpm')) @@ -314,7 +314,42 @@ const GENE_DISEASE_DETAIL_SECTIONS = [ }, ] -const RNA_SEQ_DETAIL_FIELDS = ['zScore', 'pValue', 'pAdjust'] +const LIRICAL = 'L' +const EXOMISER = 'E' + +const SAMPLE_GENE_DETAIL_FIELDS = { + rnaSeqData: { infos: [], scores: ['zScore', 'pValue', 'pAdjust'] }, + liricalData: { infos: ['rank', 'diseases'], scores: ['postTestProbability', 'LR'] }, + exomiserData: { infos: ['rank', 'diseases'], scores: ['exomiserScore', 'phenotypeScore', 'variantScore'] }, +} + +const sampleGeneDetailsDisplay = (geneId, sampleGeneData, dataType) => ( +
+ + + + + {Object.values(SAMPLE_GENE_DETAIL_FIELDS[dataType]).flat().map( + field => {camelcaseToTitlecase(field).replace(' ', '-')}, + )} + + + + {Object.entries(sampleGeneData[geneId]).map(([individual, data]) => ( + + {individual} + {SAMPLE_GENE_DETAIL_FIELDS[dataType].infos.map( + field => {data[field]}, + )} + {SAMPLE_GENE_DETAIL_FIELDS[dataType].scores.map( + field => {data[field].toPrecision(3)}, + )} + + ))} + +
+
+) const GENE_DETAIL_SECTIONS = [ { @@ -370,31 +405,27 @@ const GENE_DETAIL_SECTIONS = [ color: 'pink', description: 'RNA-Seq Outlier', label: 'RNA-Seq', - showDetails: (gene, rnaSeqData) => rnaSeqData && rnaSeqData[gene.geneId], - detailsDisplay: (gene, rnaSeqData) => ( -

-        This gene is flagged as an outlier for RNA-Seq in the following samples
-        <Table basic="very" compact="very">
-          <Table.Header>
-            <Table.Row>
-              <Table.HeaderCell />
-              {RNA_SEQ_DETAIL_FIELDS.map(
-                field => <Table.HeaderCell key={field}>{camelcaseToTitlecase(field).replace(' ', '-')}</Table.HeaderCell>,
-              )}
-            </Table.Row>
-          </Table.Header>
-          <Table.Body>
-            {Object.entries(rnaSeqData[gene.geneId]).map(([individual, data]) => (
-              <Table.Row key={individual}>
-                <Table.HeaderCell>{individual}</Table.HeaderCell>
-                {RNA_SEQ_DETAIL_FIELDS.map(
-                  field => <Table.Cell key={field}>{data[field].toPrecision(3)}</Table.Cell>,
-                )}
-              </Table.Row>
-            ))}
-          </Table.Body>
-        </Table>
-
+ showDetails: (gene, { rnaSeqData }) => rnaSeqData && rnaSeqData[gene.geneId], + detailsDisplay: (gene, { rnaSeqData }) => ( + sampleGeneDetailsDisplay(gene.geneId, rnaSeqData, 'rnaSeqData') + ), + }, + { + color: 'orange', + description: 'LIRICAL Phenotype Prioritization', + label: 'LIRICAL', + showDetails: (gene, { phePriData }) => phePriData && phePriData[LIRICAL][gene.geneId], + detailsDisplay: (gene, { phePriData }) => ( + sampleGeneDetailsDisplay(gene.geneId, phePriData, 'lirical') + ), + }, + { + color: 'orange', + description: 'Exomiser Phenotype Prioritization', + label: 'Exomiser', + showDetails: (gene, { phePriData }) => phePriData && phePriData[EXOMISER][gene.geneId], + detailsDisplay: (gene, { phePriData }) => ( + sampleGeneDetailsDisplay(gene.geneId, phePriData, 'exomiser') ), }, ] @@ -421,9 +452,12 @@ const OmimSegments = styled(Segment.Group).attrs({ size: 'tiny', horizontal: tru } ` -const getDetailSections = (configs, gene, compact, labelProps, rnaSeqData) => configs.map( +const getDetailSections = (configs, gene, compact, labelProps, sampleGeneData) => configs.map( ({ showDetails, detailsDisplay, ...sectionConfig }) => ( - { ...sectionConfig, detail: showDetails(gene, rnaSeqData) && detailsDisplay(gene, rnaSeqData) }), + { + ...sectionConfig, + detail: showDetails(gene, sampleGeneData) && detailsDisplay(gene, sampleGeneData), + }), ).filter(({ detail }) => detail).map(({ detail, expandedDisplay, ...sectionConfig }) => ( (expandedDisplay && !compact) ? ( @@ -446,9 +480,9 @@ const getDetailSections = (configs, gene, compact, labelProps, rnaSeqData) => co )) export const GeneDetails = React.memo(( - { gene, compact, showLocusLists, showInlineDetails, rnaSeqData, ...labelProps }, + { gene, compact, showLocusLists, showInlineDetails, sampleGeneData, ...labelProps }, ) => { - const geneDetails = getDetailSections(GENE_DETAIL_SECTIONS, gene, compact, labelProps, rnaSeqData) + const geneDetails = getDetailSections(GENE_DETAIL_SECTIONS, gene, compact, labelProps, sampleGeneData) const geneDiseaseDetails = getDetailSections(GENE_DISEASE_DETAIL_SECTIONS, gene, compact, labelProps) const hasLocusLists = showLocusLists && gene.locusListGuids.length > 0 const showDivider = !showInlineDetails && geneDetails.length > 0 && (hasLocusLists || geneDiseaseDetails.length > 0) @@ -477,7 +511,7 @@ GeneDetails.propTypes = { compact: PropTypes.bool, showLocusLists: PropTypes.bool, showInlineDetails: PropTypes.bool, - rnaSeqData: PropTypes.object, + sampleGeneData: PropTypes.object, } const GeneSearchLinkWithPopup = props => ( @@ -497,7 +531,7 @@ const getGeneConsequence = (geneId, variant) => { } const BaseVariantGene = React.memo(( - { geneId, gene, variant, compact, showInlineDetails, compoundHetToggle, hasRnaTpmData, rnaSeqData }, + { geneId, gene, variant, compact, showInlineDetails, compoundHetToggle, hasRnaTpmData, sampleGeneData }, ) => { const geneConsequence = getGeneConsequence(geneId, variant) @@ -514,7 +548,7 @@ const BaseVariantGene = React.memo(( showInlineDetails={showInlineDetails} margin={showInlineDetails ? 
'1em .5em 0px 0px' : null} horizontal={showInlineDetails} - rnaSeqData={rnaSeqData} + sampleGeneData={sampleGeneData} showLocusLists /> ) @@ -592,12 +626,15 @@ BaseVariantGene.propTypes = { showInlineDetails: PropTypes.bool, compoundHetToggle: PropTypes.func, hasRnaTpmData: PropTypes.bool, - rnaSeqData: PropTypes.object, + sampleGeneData: PropTypes.object, } const getRnaSeqProps = (state, ownProps) => ({ hasRnaTpmData: getFamiliesByGuid(state)[ownProps.variant.familyGuids[0]]?.hasRnaTpmData, - rnaSeqData: getRnaSeqOutilerDataByFamilyGene(state)[ownProps.variant.familyGuids[0]], + sampleGeneData: { + rnaSeqData: getRnaSeqOutilerDataByFamilyGene(state)[ownProps.variant.familyGuids[0]], + phePriData: getPhePriDataByFamilyGene(state)[ownProps.variant.familyGuids[0]], + }, }) const mapStateToProps = (state, ownProps) => ({ @@ -613,7 +650,7 @@ class VariantGenes extends React.PureComponent { variant: PropTypes.object.isRequired, mainGeneId: PropTypes.string, genesById: PropTypes.object.isRequired, - rnaSeqData: PropTypes.object, + sampleGeneData: PropTypes.object, hasRnaTpmData: PropTypes.bool, showMainGene: PropTypes.bool, } @@ -629,7 +666,7 @@ class VariantGenes extends React.PureComponent { } render() { - const { variant, genesById, mainGeneId, showMainGene, rnaSeqData, hasRnaTpmData } = this.props + const { variant, genesById, mainGeneId, showMainGene, sampleGeneData, hasRnaTpmData } = this.props const { showAll } = this.state const geneIds = Object.keys(variant.transcripts || {}) const genes = geneIds.map(geneId => genesById[geneId]).filter(gene => gene) @@ -648,7 +685,7 @@ class VariantGenes extends React.PureComponent { geneId={gene.geneId} gene={gene} variant={variant} - rnaSeqData={rnaSeqData} + sampleGeneData={sampleGeneData} hasRnaTpmData={hasRnaTpmData} showInlineDetails={!mainGeneId} compact @@ -679,7 +716,7 @@ class VariantGenes extends React.PureComponent { details={sectionGenes.length > 0 && sectionGenes.map(gene => (
- {detailsDisplay(gene, rnaSeqData)} + {detailsDisplay(gene, sampleGeneData)}
))} diff --git a/ui/shared/components/panel/variants/selectors.js b/ui/shared/components/panel/variants/selectors.js index 2d5e90c8b4..b1871f60d6 100644 --- a/ui/shared/components/panel/variants/selectors.js +++ b/ui/shared/components/panel/variants/selectors.js @@ -14,7 +14,7 @@ import { } from 'shared/utils/constants' import { getVariantTagsByGuid, getVariantNotesByGuid, getSavedVariantsByGuid, getAnalysisGroupsByGuid, getGenesById, getUser, - getFamiliesByGuid, getProjectsByGuid, getIndividualsByGuid, getRnaSeqDataByIndividual, + getFamiliesByGuid, getProjectsByGuid, getIndividualsByGuid, getRnaSeqDataByIndividual, getPhePriDataByIndividual, } from 'redux/selectors' export const getRnaSeqOutilerDataByFamilyGene = createSelector( @@ -33,6 +33,22 @@ export const getRnaSeqOutilerDataByFamilyGene = createSelector( ), ) +export const getPhePriDataByFamilyGene = createSelector( + getIndividualsByGuid, + getPhePriDataByIndividual, + (individualsByGuid, phePriDataByIndividual) => Object.entries(phePriDataByIndividual).reduce( + (acc, [individualGuid, rnaSeqData]) => { + const { familyGuid, displayName } = individualsByGuid[individualGuid] + acc[familyGuid] = Object.entries(rnaSeqData.outliers || {}).reduce( + (acc2, [geneId, data]) => (data.isSignificant ? + { ...acc2, [geneId]: { ...(acc2[geneId] || {}), [displayName]: data } } : acc2 + ), acc[familyGuid] || {}, + ) + return acc + }, {}, + ), +) + // Saved variant selectors export const getSavedVariantTableState = state => ( state.currentProjectGuid ? state.savedVariantTableState : state.allProjectSavedVariantTableState From 81b9ca4baf1184b580b28047eb806aa8a97b06b1 Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Thu, 29 Sep 2022 15:55:48 -0400 Subject: [PATCH 04/96] Change the model to save all original input data. 
--- .../0048_phenotypeprioritization.py | 3 ++- seqr/models.py | 12 +----------- seqr/views/apis/data_manager_api.py | 19 +++++-------------- 3 files changed, 8 insertions(+), 26 deletions(-) diff --git a/seqr/migrations/0048_phenotypeprioritization.py b/seqr/migrations/0048_phenotypeprioritization.py index 7b08e44af9..ae3964cd67 100644 --- a/seqr/migrations/0048_phenotypeprioritization.py +++ b/seqr/migrations/0048_phenotypeprioritization.py @@ -1,4 +1,4 @@ -# Generated by Django 3.2.15 on 2022-09-28 15:26 +# Generated by Django 3.2.15 on 2022-09-29 19:51 from django.db import migrations, models import django.db.models.deletion @@ -19,6 +19,7 @@ class Migration(migrations.Migration): ('tool', models.CharField(choices=[('E', 'exomiser'), ('L', 'lirical')], max_length=1)), ('rank', models.IntegerField()), ('disease_id', models.CharField(max_length=32)), + ('disease_name', models.TextField()), ('scores', models.JSONField()), ('sample', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='seqr.sample')), ], diff --git a/seqr/models.py b/seqr/models.py index d15119c78f..1fc05e352e 100644 --- a/seqr/models.py +++ b/seqr/models.py @@ -1062,21 +1062,11 @@ class PhenotypePrioritization(DeletableSampleMetadataModel): (EXOMISER_CHOICE, 'exomiser'), (LIRICAL_CHOICE, 'lirical') ) - SCORE_FIELDS = { - EXOMISER_CHOICE: { - 'exomiser_score': 'e', - 'phenotype_score': 'p', - 'variant_score': 'v', - }, - LIRICAL_CHOICE: { - 'post_test_probability': 'p', - 'compositeLR': 'c', - } - } tool = models.CharField(max_length=1, choices=TOOL_CHOICES) rank = models.IntegerField() disease_id = models.CharField(max_length=32) + disease_name = models.TextField() scores = models.JSONField() class Meta: diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index 5e354190c7..cf5492b5b4 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -25,8 +25,6 @@ from seqr.models import Sample, Individual, RnaSeqOutlier, RnaSeqTpm, PhenotypePrioritization, Project -from reference_data.models import Omim - from settings import KIBANA_SERVER, KIBANA_ELASTICSEARCH_PASSWORD logger = SeqrLogger(__name__) @@ -445,22 +443,14 @@ def _get_phenotype_pri(record, i, ignore_extra_samples): raise ValueError(f'Sample with ID {sample_id} is not found (record {i})') disease_id = record['diseaseId'] - if disease_id.startswith('OMIM:'): - omim_recs = Omim.objects.filter(phenotype_mim_number=int(disease_id.replace('OMIM:', ''))) - if len(omim_recs) < 1: - raise ValueError(f'Disease ID {disease_id} can\'t be found in Omim (record {i})') - else: - raise ValueError(f'Unknown disease ID {disease_id} (record {i})') + disease_name = record['diseaseName'] scores = {} for score in ['1', '2', '3']: - scoreName = record.get('scoreName' + score) - if scoreName: - score_field = PhenotypePrioritization.SCORE_FIELDS[tool][scoreName] - if not score_field: - raise ValueError(f'Unexpected score name {scoreName} (record {i})') + score_name = record.get('scoreName' + score) + if score_name: score = record.get('score' + score) - scores[score_field] = float(score) + scores[score_name] = float(score) return { 'sample': samples[0], @@ -468,6 +458,7 @@ def _get_phenotype_pri(record, i, ignore_extra_samples): 'tool': tool, 'rank': int(record['rank']), 'disease_id': disease_id, + 'disease_name': disease_name, 'scores': scores, } From 595dbce1da73f89e7151b5b762f92cc1ea926073 Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Fri, 30 Sep 2022 11:54:54 -0400 Subject: [PATCH 05/96] Add phenotype 
prioritization data fetching. --- seqr/views/utils/orm_to_json_utils.py | 7 +++++++ seqr/views/utils/variant_utils.py | 14 +++++++++++--- ui/shared/components/panel/variants/selectors.js | 4 ++-- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/seqr/views/utils/orm_to_json_utils.py b/seqr/views/utils/orm_to_json_utils.py index 86b5f6e2aa..ee29272a7b 100644 --- a/seqr/views/utils/orm_to_json_utils.py +++ b/seqr/views/utils/orm_to_json_utils.py @@ -866,3 +866,10 @@ def _process_result(data, model): data['isSignificant'] = data['pAdjust'] < model.SIGNIFICANCE_THRESHOLD return _get_json_for_models(models, process_result=_process_result, **kwargs) + + +def get_json_for_phenotype_pri(models, **kwargs): + def _process_result(data): + data['scores'] = {_to_camel_case(score) for score, value in data['scores'].items()} + + return _get_json_for_models(models, process_result=_process_result, **kwargs) diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index 7bccb0e628..ac7cbc8c4c 100644 --- a/seqr/views/utils/variant_utils.py +++ b/seqr/views/utils/variant_utils.py @@ -3,12 +3,12 @@ import redis from seqr.models import SavedVariant, VariantSearchResults, Family, LocusList, LocusListInterval, LocusListGene, \ - RnaSeqOutlier, RnaSeqTpm + RnaSeqOutlier, RnaSeqTpm, PhenotypePrioritization from seqr.utils.elasticsearch.utils import get_es_variants_for_variant_ids from seqr.utils.gene_utils import get_genes_for_variants from seqr.views.utils.json_to_orm_utils import update_model_from_json from seqr.views.utils.orm_to_json_utils import get_json_for_discovery_tags, get_json_for_locus_lists, \ - _get_json_for_models, get_json_for_rna_seq_outliers, get_json_for_saved_variants_with_tags + _get_json_for_models, get_json_for_rna_seq_outliers, get_json_for_saved_variants_with_tags, get_json_for_phenotype_pri from seqr.views.utils.permissions_utils import has_case_review_permissions, user_is_analyst from seqr.views.utils.project_context_utils import add_project_tag_types, add_families_context from settings import REDIS_SERVICE_HOSTNAME, REDIS_SERVICE_PORT @@ -129,7 +129,15 @@ def _get_rna_seq_outliers(gene_ids, families): def _get_phenotype_pri_data(gene_ids, families): - data_by_individual_gene = defaultdict(lambda: {'outliers': {}}) + data_by_individual_gene = defaultdict(lambda: {'phepri': {}}) + + phe_pri_data = get_json_for_phenotype_pri( + PhenotypePrioritization.objects.filter(gene_id__in=gene_ids, sample__individual__family__in=families), + nested_fields=[{'fields': ('sample', 'individual', 'guid'), 'key': 'individualGuid'}], + ) + + for data in phe_pri_data: + data_by_individual_gene[data.pop('individualGuid')]['phepri'][data['geneId']] = data return data_by_individual_gene diff --git a/ui/shared/components/panel/variants/selectors.js b/ui/shared/components/panel/variants/selectors.js index b1871f60d6..bc9af2f7b0 100644 --- a/ui/shared/components/panel/variants/selectors.js +++ b/ui/shared/components/panel/variants/selectors.js @@ -37,9 +37,9 @@ export const getPhePriDataByFamilyGene = createSelector( getIndividualsByGuid, getPhePriDataByIndividual, (individualsByGuid, phePriDataByIndividual) => Object.entries(phePriDataByIndividual).reduce( - (acc, [individualGuid, rnaSeqData]) => { + (acc, [individualGuid, phePriData]) => { const { familyGuid, displayName } = individualsByGuid[individualGuid] - acc[familyGuid] = Object.entries(rnaSeqData.outliers || {}).reduce( + acc[familyGuid] = Object.entries(phePriData.outliers || {}).reduce( (acc2, [geneId, data]) => 
(data.isSignificant ? { ...acc2, [geneId]: { ...(acc2[geneId] || {}), [displayName]: data } } : acc2 ), acc[familyGuid] || {}, From bb32278fae3a66e731075924c860804c106dc51b Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Tue, 4 Oct 2022 15:46:36 -0400 Subject: [PATCH 06/96] Update data loading. --- .../0048_phenotypeprioritization.py | 30 ----- seqr/models.py | 6 +- seqr/views/apis/data_manager_api.py | 67 +---------- seqr/views/utils/dataset_utils.py | 106 +++++++++++++++++- 4 files changed, 110 insertions(+), 99 deletions(-) delete mode 100644 seqr/migrations/0048_phenotypeprioritization.py diff --git a/seqr/migrations/0048_phenotypeprioritization.py b/seqr/migrations/0048_phenotypeprioritization.py deleted file mode 100644 index ae3964cd67..0000000000 --- a/seqr/migrations/0048_phenotypeprioritization.py +++ /dev/null @@ -1,30 +0,0 @@ -# Generated by Django 3.2.15 on 2022-09-29 19:51 - -from django.db import migrations, models -import django.db.models.deletion - - -class Migration(migrations.Migration): - - dependencies = [ - ('seqr', '0047_auto_20220908_1851'), - ] - - operations = [ - migrations.CreateModel( - name='PhenotypePrioritization', - fields=[ - ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('gene_id', models.CharField(max_length=20)), - ('tool', models.CharField(choices=[('E', 'exomiser'), ('L', 'lirical')], max_length=1)), - ('rank', models.IntegerField()), - ('disease_id', models.CharField(max_length=32)), - ('disease_name', models.TextField()), - ('scores', models.JSONField()), - ('sample', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='seqr.sample')), - ], - options={ - 'unique_together': {('sample', 'gene_id', 'disease_id')}, - }, - ), - ] diff --git a/seqr/models.py b/seqr/models.py index 1fc05e352e..6b1e98fc5a 100644 --- a/seqr/models.py +++ b/seqr/models.py @@ -1062,6 +1062,10 @@ class PhenotypePrioritization(DeletableSampleMetadataModel): (EXOMISER_CHOICE, 'exomiser'), (LIRICAL_CHOICE, 'lirical') ) + TOOL_LOOKUP = {v: k for k, v in TOOL_CHOICES} + + individual = models.ForeignKey('Individual', on_delete=models.CASCADE, db_index=True) + gene_id = models.CharField(max_length=20) # ensembl ID tool = models.CharField(max_length=1, choices=TOOL_CHOICES) rank = models.IntegerField() @@ -1070,6 +1074,4 @@ class PhenotypePrioritization(DeletableSampleMetadataModel): scores = models.JSONField() class Meta: - unique_together = ('sample', 'gene_id', 'disease_id') - json_fields = ['gene_id', 'tool', 'rank', 'disease_id', 'scores'] diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index cf5492b5b4..097035273a 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -18,7 +18,7 @@ from seqr.utils.file_utils import file_iter, does_file_exist from seqr.utils.logging_utils import SeqrLogger -from seqr.views.utils.dataset_utils import load_rna_seq_outlier, load_rna_seq_tpm +from seqr.views.utils.dataset_utils import load_rna_seq_outlier, load_rna_seq_tpm, load_phenotype_pri_file from seqr.views.utils.file_utils import parse_file, get_temp_upload_directory, load_uploaded_file from seqr.views.utils.json_utils import create_json_response, _to_camel_case from seqr.views.utils.permissions_utils import data_manager_required @@ -403,7 +403,7 @@ def load_phenotype_pri_data(request): ignore_extra_samples = request_json['ignoreExtraSamples'] logger.info(f'Loading phenotype prioritization data from {file_name}', request.user) - records = 
_load_phenotype_pri_file(file_name, ignore_extra_samples) + records = load_phenotype_pri_file(file_name, ignore_extra_samples) models = PhenotypePrioritization.objects.bulk_create([PhenotypePrioritization(**data) for data in records]) sample_guids = [data['sample'].guid for data in records] logger.info(f'create {len(models)} PhenotypePrioritization', request.user, db_update={ @@ -418,69 +418,6 @@ def load_phenotype_pri_data(request): }) -EXPECTED_HEADER = ['tool', 'project', 'sampleId', 'rank', 'geneId', 'diseaseId', 'diseaseName', - 'scoreName1', 'score1', 'scoreName2', 'score2', 'scoreName3', 'score3'] - - -def _get_phenotype_pri(record, i, ignore_extra_samples): - tool = next((k for k, v in PhenotypePrioritization.TOOL_CHOICES if v == record['tool']), None) - if not tool: - raise ValueError('Expecting {} for the "tool" column but found {} (record {})'.format( - ', '.join([v for k, v in PhenotypePrioritization.TOOL_CHOICES]), record['tool'], i)) - - project_name = record['project'] - projects = Project.objects.filter(name=project_name) - if len(projects) < 1: - raise ValueError(f'Project {project_name} is not found (record {i})') - project = projects[0] - - sample_id = record['sampleId'] - samples = Sample.objects.filter(sample_id=sample_id, individual__family__project=project, is_active=True, - dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS) - if len(samples) != 1: - if ignore_extra_samples: - return - raise ValueError(f'Sample with ID {sample_id} is not found (record {i})') - - disease_id = record['diseaseId'] - disease_name = record['diseaseName'] - - scores = {} - for score in ['1', '2', '3']: - score_name = record.get('scoreName' + score) - if score_name: - score = record.get('score' + score) - scores[score_name] = float(score) - - return { - 'sample': samples[0], - 'gene_id': record['geneId'], - 'tool': tool, - 'rank': int(record['rank']), - 'disease_id': disease_id, - 'disease_name': disease_name, - 'scores': scores, - } - - -def _load_phenotype_pri_file(file_name, ignore_extra_samples): - lines = file_iter(file_name) - - header = next(lines).rstrip().split('\t') - missing_header = [h for h in EXPECTED_HEADER if h not in header] - if len(missing_header): - raise ValueError('The following required columns are missing: {}'.format(', '.join(missing_header))) - - records = [] - for i, line in enumerate(lines): - row = line.rstrip().split('\t') - record = {header[cnt]: col for cnt, col in enumerate(row)} - record = _get_phenotype_pri(record, i, ignore_extra_samples) - if record: - records.append(record) - return records - - # Hop-by-hop HTTP response headers shouldn't be forwarded. 
# More info at: http://www.w3.org/Protocols/rfc2616/rfc2616-sec13.html#sec13.5.1 EXCLUDE_HTTP_RESPONSE_HEADERS = { diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 3e95ebbd5f..8941612348 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -1,11 +1,12 @@ import elasticsearch_dsl from collections import defaultdict -from django.db.models import prefetch_related_objects +from django.db.models import prefetch_related_objects, Value, TextField +from django.db.models.functions import Concat from django.utils import timezone from tqdm import tqdm import random -from seqr.models import Sample, Individual, Family, RnaSeqOutlier, RnaSeqTpm +from seqr.models import Sample, Individual, Family, RnaSeqOutlier, RnaSeqTpm, PhenotypePrioritization from seqr.utils.elasticsearch.utils import get_es_client, get_index_metadata from seqr.utils.file_utils import file_iter from seqr.utils.logging_utils import log_model_bulk_update, SeqrLogger @@ -449,3 +450,104 @@ def _load_rna_seq(model_cls, file_path, user, mapping_file, ignore_extra_samples logger.warning(warning, user) return samples_to_load, info, warnings + + +PHENOTYPE_PRI_HEADER = { + 'tool': 'tool', 'project': 'project', 'sampleId': 'sample_id', 'rank': 'rank', 'geneId': 'gene_id', + 'diseaseId': 'disease_id', 'diseaseName': 'disease_name', 'scoreName1': 'score_name1', 'score1': 'score1', + 'scoreName2': 'score_name2', 'score2': 'score2', 'scoreName3': 'score_name3', 'score3': 'score3'} + + +def _parse_phenotype_pri_row(row): + record = {mapped_key: row[key] for key, mapped_key in PHENOTYPE_PRI_HEADER.items()} + + tool = PhenotypePrioritization.TOOL_LOOKUP.get(record.get('tool'), None) + if not tool: + raise ValueError('Expecting {} for the "tool" column but found {}'.format( + ', '.join([v for k, v in PhenotypePrioritization.TOOL_CHOICES]), row['tool'])) + record['tool'] = tool + + scores = {} + for score in ['1', '2', '3']: + score_name = record.pop('scoreName' + score, None) + if score_name: + scores[score_name] = record.pop('score' + score, None) + + return record + + +def load_phenotype_pri_file(file_path, user, ignore_extra_samples): + samples_by_id = defaultdict(dict) + f = file_iter(file_path) + header = _parse_tsv_row(next(f)) + missing_cols = [col for col in PHENOTYPE_PRI_HEADER.keys() if col not in header] + if missing_cols: + raise ValueError(f'Invalid file: missing column(s) {", ".join(missing_cols)}') + + for line in tqdm(f, unit=' rows'): + row = dict(zip(header, _parse_tsv_row(line))) + record = _parse_phenotype_pri_row(row) + sample_id = record.pop('sample_id', None) + project = record.pop('project', None) + if not sample_id or not project: + raise ValueError('Both sample ID and project fields are required.') + if samples_by_id[sample_id]: + if project != samples_by_id[sample_id]['project']: + raise ValueError(f'Invalid project name for sample {sample_id}') + samples_by_id[sample_id]['records'].append(record) + else: + samples_by_id[sample_id]['project'] = project + samples_by_id[sample_id]['records'] = [record] + + message = f'Parsed {len(samples_by_id)} LIRICAL/Exomiser phenotype-based prioritization samples' + info = [message] + logger.info(message, user) + + existing_inds = Individual.objects.annotate( + indv_project=Concat('individual_id', Value('/', output_field=TextField()), 'family__project__name') + ).filter( + indv_project__in={sample_id + '/' + value['project'] for sample_id, value in samples_by_id} + ) + + for ind in existing_inds: + 
samples_by_id[ind.individual_id]['individual'] = ind + + warnings = [] + extra_ids = set() + records_to_load_by_id = defaultdict(lambda: defaultdict(list)) + for sample_id, value in samples_by_id.items(): + if value['individual']: + for rec in value['records']: + rec['individual'] = value['individual'] + records_to_load_by_id[sample_id][rec['tool']].append(rec) + else: + extra_ids.add(sample_id) + + if extra_ids: + skipped_samples = ', '.join(sorted(extra_ids)) + if ignore_extra_samples: + warnings = [f'Skipped loading for the following {len(extra_ids)} unmatched samples: {skipped_samples}'] + else: + raise ValueError(f'Unable to find matches for the following samples: {skipped_samples}') + + # Delete old data + to_delete = PhenotypePrioritization.objects.annotate(tool_ind=Concat('tool', 'individual')).filter( + tool_ind__in=[tool+sample_id for sample_id, value in records_to_load_by_id.items() for tool in value.keys()], + ) + if to_delete: + prefetch_related_objects(to_delete, 'individual') + logger.info(f'delete {len(to_delete)} {PhenotypePrioritization.__name__}s', user, db_update={ + 'dbEntity': PhenotypePrioritization.__name__, 'numEntities': len(to_delete), 'updateType': 'bulk_delete', + 'parentEntityIds': list({model.individual.guid for model in to_delete}), + }) + to_delete.delete() + + prefetch_related_objects(existing_inds, 'family__project') + projects = {ind.family.project.name for ind in existing_inds} + project_names = ', '.join(sorted(projects)) + message = 'Attempted data loading for {} LIRICAL/Exomiser records in the following {} projects: {}'.format( + len(records_to_load_by_id), len(projects), project_names) + info.append(message) + logger.info(message, user) + + return [rec for tools in records_to_load_by_id.values() for recs in tools.values() for rec in recs] From f7b0bf340023f8b418e1f71ef2ab64366be1560b Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Wed, 5 Oct 2022 12:19:28 -0400 Subject: [PATCH 07/96] Update the data loading backend. 
--- .../0048_phenotypeprioritization.py | 27 ++++++ seqr/models.py | 11 +-- seqr/views/apis/data_manager_api.py | 21 ++--- seqr/views/utils/dataset_utils.py | 89 +++++++++++-------- 4 files changed, 94 insertions(+), 54 deletions(-) create mode 100644 seqr/migrations/0048_phenotypeprioritization.py diff --git a/seqr/migrations/0048_phenotypeprioritization.py b/seqr/migrations/0048_phenotypeprioritization.py new file mode 100644 index 0000000000..d249142dcc --- /dev/null +++ b/seqr/migrations/0048_phenotypeprioritization.py @@ -0,0 +1,27 @@ +# Generated by Django 3.2.15 on 2022-10-05 16:09 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('seqr', '0047_auto_20220908_1851'), + ] + + operations = [ + migrations.CreateModel( + name='PhenotypePrioritization', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('gene_id', models.CharField(max_length=20)), + ('tool', models.CharField(choices=[('E', 'exomiser'), ('L', 'lirical')], max_length=1)), + ('rank', models.IntegerField()), + ('disease_id', models.CharField(max_length=32)), + ('disease_name', models.TextField()), + ('scores', models.JSONField()), + ('individual', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='seqr.individual')), + ], + ), + ] diff --git a/seqr/models.py b/seqr/models.py index 6b1e98fc5a..8165bc5382 100644 --- a/seqr/models.py +++ b/seqr/models.py @@ -1055,12 +1055,10 @@ class Meta: json_fields = ['gene_id', 'tpm'] -class PhenotypePrioritization(DeletableSampleMetadataModel): - EXOMISER_CHOICE = 'E' - LIRICAL_CHOICE = 'L' +class PhenotypePrioritization(models.Model): TOOL_CHOICES = ( - (EXOMISER_CHOICE, 'exomiser'), - (LIRICAL_CHOICE, 'lirical') + ('E', 'exomiser'), + ('L', 'lirical') ) TOOL_LOOKUP = {v: k for k, v in TOOL_CHOICES} @@ -1073,5 +1071,8 @@ class PhenotypePrioritization(DeletableSampleMetadataModel): disease_name = models.TextField() scores = models.JSONField() + def __unicode__(self): + return "%s:%s:%s" % (self.individual.individual_id, self.gene_id, self.disease_id) + class Meta: json_fields = ['gene_id', 'tool', 'rank', 'disease_id', 'scores'] diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index 097035273a..ce602ce335 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -18,12 +18,12 @@ from seqr.utils.file_utils import file_iter, does_file_exist from seqr.utils.logging_utils import SeqrLogger -from seqr.views.utils.dataset_utils import load_rna_seq_outlier, load_rna_seq_tpm, load_phenotype_pri_file +from seqr.views.utils.dataset_utils import load_rna_seq_outlier, load_rna_seq_tpm, load_phenotype_pri from seqr.views.utils.file_utils import parse_file, get_temp_upload_directory, load_uploaded_file from seqr.views.utils.json_utils import create_json_response, _to_camel_case from seqr.views.utils.permissions_utils import data_manager_required -from seqr.models import Sample, Individual, RnaSeqOutlier, RnaSeqTpm, PhenotypePrioritization, Project +from seqr.models import Sample, Individual, RnaSeqOutlier, RnaSeqTpm, PhenotypePrioritization from settings import KIBANA_SERVER, KIBANA_ELASTICSEARCH_PASSWORD @@ -400,21 +400,22 @@ def load_phenotype_pri_data(request): request_json = json.loads(request.body) file_name = request_json['file'] - ignore_extra_samples = request_json['ignoreExtraSamples'] + ignore_extra_samples = request_json.get('ignoreExtraSamples', 
False) logger.info(f'Loading phenotype prioritization data from {file_name}', request.user) - records = load_phenotype_pri_file(file_name, ignore_extra_samples) + records, info, warnings = load_phenotype_pri(file_name, request.user, ignore_extra_samples) models = PhenotypePrioritization.objects.bulk_create([PhenotypePrioritization(**data) for data in records]) - sample_guids = [data['sample'].guid for data in records] - logger.info(f'create {len(models)} PhenotypePrioritization', request.user, db_update={ - 'dbEntity': PhenotypePrioritization, 'numEntities': len(models), 'parentEntityIds': sample_guids, + ind_guids = {data['individual'].guid for data in records} + logger.info(f'create {len(models)} {PhenotypePrioritization.__name__}', request.user, db_update={ + 'dbEntity': PhenotypePrioritization.__name__, 'numEntities': len(models), 'parentEntityIds': sorted(ind_guids), 'updateType': 'bulk_create', }) + info.append(f'Loaded {len(models)} LIRICAL/Exomiser data records') return create_json_response({ - 'info': ['Phenotype prioritization data loaded'], - 'warnings': [], - 'fileName': file_name, + 'info': info, + 'warnings': warnings, + 'success': True }) diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 8941612348..0e5831590e 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -1,6 +1,6 @@ import elasticsearch_dsl from collections import defaultdict -from django.db.models import prefetch_related_objects, Value, TextField +from django.db.models import prefetch_related_objects, TextField from django.db.models.functions import Concat from django.utils import timezone from tqdm import tqdm @@ -452,8 +452,9 @@ def _load_rna_seq(model_cls, file_path, user, mapping_file, ignore_extra_samples return samples_to_load, info, warnings +TOOL_FIELD = 'tool' PHENOTYPE_PRI_HEADER = { - 'tool': 'tool', 'project': 'project', 'sampleId': 'sample_id', 'rank': 'rank', 'geneId': 'gene_id', + 'tool': TOOL_FIELD, 'project': 'project', 'sampleId': 'sample_id', 'rank': 'rank', 'geneId': 'gene_id', 'diseaseId': 'disease_id', 'diseaseName': 'disease_name', 'scoreName1': 'score_name1', 'score1': 'score1', 'scoreName2': 'score_name2', 'score2': 'score2', 'scoreName3': 'score_name3', 'score3': 'score3'} @@ -461,29 +462,32 @@ def _load_rna_seq(model_cls, file_path, user, mapping_file, ignore_extra_samples def _parse_phenotype_pri_row(row): record = {mapped_key: row[key] for key, mapped_key in PHENOTYPE_PRI_HEADER.items()} - tool = PhenotypePrioritization.TOOL_LOOKUP.get(record.get('tool'), None) + tool = PhenotypePrioritization.TOOL_LOOKUP.get(record[TOOL_FIELD], None) if not tool: raise ValueError('Expecting {} for the "tool" column but found {}'.format( - ', '.join([v for k, v in PhenotypePrioritization.TOOL_CHOICES]), row['tool'])) - record['tool'] = tool + ', '.join([v for k, v in PhenotypePrioritization.TOOL_CHOICES]), record[TOOL_FIELD])) + record[TOOL_FIELD] = tool - scores = {} - for score in ['1', '2', '3']: - score_name = record.pop('scoreName' + score, None) + scores = {record.pop('score_name1'): record.pop('score1')} + for score_index in ['2', '3']: + score_name = record.pop('score_name' + score_index, None) + score = record.pop('score' + score_index, None) if score_name: - scores[score_name] = record.pop('score' + score, None) + scores[score_name] = score + record['scores'] = scores return record -def load_phenotype_pri_file(file_path, user, ignore_extra_samples): - samples_by_id = defaultdict(dict) +def 
_load_phenotype_pri_file(file_path): + data_by_id = defaultdict(dict) f = file_iter(file_path) header = _parse_tsv_row(next(f)) missing_cols = [col for col in PHENOTYPE_PRI_HEADER.keys() if col not in header] if missing_cols: raise ValueError(f'Invalid file: missing column(s) {", ".join(missing_cols)}') + count = 0 for line in tqdm(f, unit=' rows'): row = dict(zip(header, _parse_tsv_row(line))) record = _parse_phenotype_pri_row(row) @@ -491,63 +495,70 @@ def load_phenotype_pri_file(file_path, user, ignore_extra_samples): project = record.pop('project', None) if not sample_id or not project: raise ValueError('Both sample ID and project fields are required.') - if samples_by_id[sample_id]: - if project != samples_by_id[sample_id]['project']: + if data_by_id[sample_id]: + if project != data_by_id[sample_id]['project']: # a sample must belong to a single project raise ValueError(f'Invalid project name for sample {sample_id}') - samples_by_id[sample_id]['records'].append(record) + data_by_id[sample_id]['records'].append(record) else: - samples_by_id[sample_id]['project'] = project - samples_by_id[sample_id]['records'] = [record] + data_by_id[sample_id]['project'] = project + data_by_id[sample_id]['records'] = [record] + count += 1 + + return count, data_by_id + - message = f'Parsed {len(samples_by_id)} LIRICAL/Exomiser phenotype-based prioritization samples' +def load_phenotype_pri(file_path, user, ignore_extra_samples): + count, data_by_id = _load_phenotype_pri_file(file_path) + + message = f'Parsed {count} LIRICAL/Exomiser data records in {len(data_by_id)} samples' info = [message] logger.info(message, user) - existing_inds = Individual.objects.annotate( - indv_project=Concat('individual_id', Value('/', output_field=TextField()), 'family__project__name') - ).filter( - indv_project__in={sample_id + '/' + value['project'] for sample_id, value in samples_by_id} - ) - - for ind in existing_inds: - samples_by_id[ind.individual_id]['individual'] = ind + indivs = Individual.objects.filter(individual_id__in=data_by_id.keys()) + prefetch_related_objects(indivs, 'family__project') + existing_indivs_by_id = {ind.individual_id: ind for ind in indivs + if ind.family.project.name == data_by_id[ind.individual_id]['project']} - warnings = [] extra_ids = set() - records_to_load_by_id = defaultdict(lambda: defaultdict(list)) - for sample_id, value in samples_by_id.items(): - if value['individual']: + extra_records = 0 + for sample_id, value in data_by_id.items(): + if existing_indivs_by_id[sample_id]: for rec in value['records']: - rec['individual'] = value['individual'] - records_to_load_by_id[sample_id][rec['tool']].append(rec) + rec['individual'] = existing_indivs_by_id[sample_id] else: + data_by_id.pop(sample_id) extra_ids.add(sample_id) + extra_records += len(value['records']) - if extra_ids: + warnings = [] + if extra_records: skipped_samples = ', '.join(sorted(extra_ids)) if ignore_extra_samples: - warnings = [f'Skipped loading for the following {len(extra_ids)} unmatched samples: {skipped_samples}'] + warnings = [f'Skipped loading {extra_records} records for the following {len(extra_ids)} unmatched samples: {skipped_samples}'] else: raise ValueError(f'Unable to find matches for the following samples: {skipped_samples}') # Delete old data - to_delete = PhenotypePrioritization.objects.annotate(tool_ind=Concat('tool', 'individual')).filter( - tool_ind__in=[tool+sample_id for sample_id, value in records_to_load_by_id.items() for tool in value.keys()], + to_delete = PhenotypePrioritization.objects.annotate( 
+ tool_ind=Concat('tool', 'individual__individual_id', output_field=TextField()) + ).filter( + tool_ind__in={rec[TOOL_FIELD]+sample_id for sample_id, values in data_by_id.items() for rec in values['records']}, ) if to_delete: prefetch_related_objects(to_delete, 'individual') + info.append(f'Deleted {len(to_delete)} existing LIRICAL/Exomiser records') logger.info(f'delete {len(to_delete)} {PhenotypePrioritization.__name__}s', user, db_update={ 'dbEntity': PhenotypePrioritization.__name__, 'numEntities': len(to_delete), 'updateType': 'bulk_delete', 'parentEntityIds': list({model.individual.guid for model in to_delete}), }) to_delete.delete() - prefetch_related_objects(existing_inds, 'family__project') - projects = {ind.family.project.name for ind in existing_inds} + records_to_load = [rec for value in data_by_id.values() for rec in value['records']] + projects = {value['project'] for value in data_by_id.values()} project_names = ', '.join(sorted(projects)) message = 'Attempted data loading for {} LIRICAL/Exomiser records in the following {} projects: {}'.format( - len(records_to_load_by_id), len(projects), project_names) + len(records_to_load), len(projects), project_names) info.append(message) logger.info(message, user) - return [rec for tools in records_to_load_by_id.values() for recs in tools.values() for rec in recs] + return records_to_load, info, warnings From 2495ffabcba68221aaa50dd3568b95cbdf51414d Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Wed, 5 Oct 2022 12:23:31 -0400 Subject: [PATCH 08/96] Add change log. --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 46630a7edd..ed0b3a1f0c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # _seqr_ Changes ## dev +* Add PhenotypePrioritization model (REQUIRES DB MIGRATION) ## 9/28/22 * Add Gregor fields to sample manifest (REQUIRES DB MIGRATION) From 9e5bfb63aedd127cab918f78e939b9b74e590c6a Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Tue, 11 Oct 2022 10:05:39 -0400 Subject: [PATCH 09/96] Update LIRICAL display. 
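This patch keys the variant-page phenotype data by tool instead of a single 'phepri' bucket. A minimal sketch of the grouping performed in _get_phenotype_pri_data, assuming record dicts carrying individualGuid/tool/geneId keys; the sample values below are invented for illustration:

    from collections import defaultdict

    def group_by_individual_tool_gene(records):
        # per individual, then per tool choice ('L' for LIRICAL, 'E' for Exomiser), then per gene
        data = defaultdict(lambda: {'L': {}, 'E': {}})
        for rec in records:
            data[rec.pop('individualGuid')][rec['tool']][rec['geneId']] = rec
        return data

    records = [{'individualGuid': 'I000001_na19675', 'tool': 'L', 'geneId': 'ENSG00000135953', 'rank': 1}]
    grouped = group_by_individual_tool_gene(records)
    # grouped['I000001_na19675']['L']['ENSG00000135953'] -> {'tool': 'L', 'geneId': ..., 'rank': 1}
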
--- seqr/models.py | 6 ++++-- seqr/views/utils/dataset_utils.py | 2 +- seqr/views/utils/orm_to_json_utils.py | 2 +- seqr/views/utils/variant_utils.py | 9 +++++---- .../components/panel/variants/VariantGene.jsx | 4 +--- ui/shared/components/panel/variants/selectors.js | 14 ++++++++++---- ui/shared/utils/constants.js | 3 +++ 7 files changed, 25 insertions(+), 15 deletions(-) diff --git a/seqr/models.py b/seqr/models.py index 8165bc5382..9b4de74617 100644 --- a/seqr/models.py +++ b/seqr/models.py @@ -1056,9 +1056,11 @@ class Meta: class PhenotypePrioritization(models.Model): + EXOMISER_CHOICE = 'E' + LIRICAL_CHOICE = 'L' TOOL_CHOICES = ( - ('E', 'exomiser'), - ('L', 'lirical') + (EXOMISER_CHOICE, 'exomiser'), + (LIRICAL_CHOICE, 'lirical') ) TOOL_LOOKUP = {v: k for k, v in TOOL_CHOICES} diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 0e5831590e..95c0e7cd13 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -460,7 +460,7 @@ def _load_rna_seq(model_cls, file_path, user, mapping_file, ignore_extra_samples def _parse_phenotype_pri_row(row): - record = {mapped_key: row[key] for key, mapped_key in PHENOTYPE_PRI_HEADER.items()} + record = {mapped_key: row.get(key, None) for key, mapped_key in PHENOTYPE_PRI_HEADER.items()} tool = PhenotypePrioritization.TOOL_LOOKUP.get(record[TOOL_FIELD], None) if not tool: diff --git a/seqr/views/utils/orm_to_json_utils.py b/seqr/views/utils/orm_to_json_utils.py index ee29272a7b..4688ba440a 100644 --- a/seqr/views/utils/orm_to_json_utils.py +++ b/seqr/views/utils/orm_to_json_utils.py @@ -869,7 +869,7 @@ def _process_result(data, model): def get_json_for_phenotype_pri(models, **kwargs): - def _process_result(data): + def _process_result(data, model): data['scores'] = {_to_camel_case(score) for score, value in data['scores'].items()} return _get_json_for_models(models, process_result=_process_result, **kwargs) diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index ac7cbc8c4c..16d15dab99 100644 --- a/seqr/views/utils/variant_utils.py +++ b/seqr/views/utils/variant_utils.py @@ -129,15 +129,16 @@ def _get_rna_seq_outliers(gene_ids, families): def _get_phenotype_pri_data(gene_ids, families): - data_by_individual_gene = defaultdict(lambda: {'phepri': {}}) + data_by_individual_gene = defaultdict(lambda: {PhenotypePrioritization.LIRICAL_CHOICE: {}, + PhenotypePrioritization.EXOMISER_CHOICE: {}}) phe_pri_data = get_json_for_phenotype_pri( - PhenotypePrioritization.objects.filter(gene_id__in=gene_ids, sample__individual__family__in=families), - nested_fields=[{'fields': ('sample', 'individual', 'guid'), 'key': 'individualGuid'}], + PhenotypePrioritization.objects.filter(gene_id__in=gene_ids, individual__family__in=families), + nested_fields=[{'fields': ('individual', 'guid'), 'key': 'individualGuid'}], ) for data in phe_pri_data: - data_by_individual_gene[data.pop('individualGuid')]['phepri'][data['geneId']] = data + data_by_individual_gene[data.pop('individualGuid')][data['tool']][data['geneId']] = data return data_by_individual_gene diff --git a/ui/shared/components/panel/variants/VariantGene.jsx b/ui/shared/components/panel/variants/VariantGene.jsx index 80e1b1c26c..bcb7a7258e 100644 --- a/ui/shared/components/panel/variants/VariantGene.jsx +++ b/ui/shared/components/panel/variants/VariantGene.jsx @@ -9,6 +9,7 @@ import { getGenesById, getLocusListsByGuid, getFamiliesByGuid } from 'redux/sele import { panelAppUrl, moiToMoiInitials } from 
'../../../utils/panelAppUtils' import { MISSENSE_THRESHHOLD, LOF_THRESHHOLD, PANEL_APP_CONFIDENCE_LEVEL_COLORS, PANEL_APP_CONFIDENCE_DESCRIPTION, + LIRICAL, EXOMISER, } from '../../../utils/constants' import { compareObjects } from '../../../utils/sortUtils' import { camelcaseToTitlecase } from '../../../utils/stringUtils' @@ -314,9 +315,6 @@ const GENE_DISEASE_DETAIL_SECTIONS = [ }, ] -const LIRICAL = 'L' -const EXOMISER = 'E' - const SAMPLE_GENE_DETAIL_FIELDS = { rnaSeqData: { infos: [], scores: ['zScore', 'pValue', 'pAdjust'] }, liricalData: { infos: ['rank', 'diseases'], scores: ['postTestProbability', 'LR'] }, diff --git a/ui/shared/components/panel/variants/selectors.js b/ui/shared/components/panel/variants/selectors.js index bc9af2f7b0..1dd2e01ae1 100644 --- a/ui/shared/components/panel/variants/selectors.js +++ b/ui/shared/components/panel/variants/selectors.js @@ -11,6 +11,7 @@ import { VARIANT_SORT_LOOKUP, SHOW_ALL, VARIANT_EXPORT_DATA, + LIRICAL, EXOMISER, } from 'shared/utils/constants' import { getVariantTagsByGuid, getVariantNotesByGuid, getSavedVariantsByGuid, getAnalysisGroupsByGuid, getGenesById, getUser, @@ -33,16 +34,21 @@ export const getRnaSeqOutilerDataByFamilyGene = createSelector( ), ) +const TOOLS = [LIRICAL, EXOMISER] export const getPhePriDataByFamilyGene = createSelector( getIndividualsByGuid, getPhePriDataByIndividual, (individualsByGuid, phePriDataByIndividual) => Object.entries(phePriDataByIndividual).reduce( (acc, [individualGuid, phePriData]) => { const { familyGuid, displayName } = individualsByGuid[individualGuid] - acc[familyGuid] = Object.entries(phePriData.outliers || {}).reduce( - (acc2, [geneId, data]) => (data.isSignificant ? - { ...acc2, [geneId]: { ...(acc2[geneId] || {}), [displayName]: data } } : acc2 - ), acc[familyGuid] || {}, + acc[familyGuid] = TOOLS.reduce( + (accTool, tool) => ({ + ...accTool, + [tool]: Object.entries(phePriData[tool] || {}).reduce( + (acc2, [geneId, data]) => ({ ...acc2, [geneId]: { ...(acc2[geneId] || {}), [displayName]: data } }), + acc[familyGuid] || {}, + ), + }), {}, ) return acc }, {}, diff --git a/ui/shared/utils/constants.js b/ui/shared/utils/constants.js index e7614422ed..b24906f303 100644 --- a/ui/shared/utils/constants.js +++ b/ui/shared/utils/constants.js @@ -1304,6 +1304,9 @@ const VARIANT_ICON_COLORS = { green: '#21a926', } +export const LIRICAL = 'L' +export const EXOMISER = 'E' + export const PANEL_APP_CONFIDENCE_DESCRIPTION = { 0: 'No Panel App confidence level', 1: 'Red, lowest level of confidence; 1 of the 4 sources or from other sources.', From 5f3583a350b60f6e4b8fde4bcba39bc2b1264500 Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Wed, 12 Oct 2022 13:09:59 -0400 Subject: [PATCH 10/96] Update per review comments. 
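This revision folds the scoreName{i}/score{i} column pairs of the input TSV into a single scores JSON value and groups rows by project and sample. A hedged sketch of that folding with an invented input row (column names follow the required header introduced in this patch; the float cast matches a later patch in this series):

    row = {
        'tool': 'lirical', 'project': 'Example Project', 'sampleId': 'NA19675_1',
        'rank': '1', 'geneId': 'ENSG00000135953', 'diseaseId': 'OMIM:123456',
        'diseaseName': 'example disease',
        'scoreName1': 'post_test_probability', 'score1': '0.15',
        'scoreName2': 'compositeLR', 'score2': '0.003',
    }

    def fold_scores(row, max_scores=100):
        # stop at the first missing scoreName{i}, mirroring _parse_phenotype_pri_row
        scores = {}
        for i in range(1, max_scores):
            name = row.get(f'scoreName{i}')
            if not name:
                break
            scores[name] = float(row[f'score{i}'])
        return scores

    assert fold_scores(row) == {'post_test_probability': 0.15, 'compositeLR': 0.003}
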
--- .../0048_phenotypeprioritization.py | 6 +- seqr/models.py | 34 +++-- seqr/urls.py | 4 +- seqr/utils/logging_utils.py | 7 +- seqr/views/apis/data_manager_api.py | 64 +++++++-- seqr/views/utils/dataset_utils.py | 123 ++++-------------- ui/pages/DataManagement/DataManagement.jsx | 4 +- .../components/PhenotypePri.jsx | 31 ----- .../components/PhenotypePrioritization.jsx | 27 ++++ ui/pages/DataManagement/reducers.js | 21 +-- ui/pages/DataManagement/selectors.js | 2 +- 11 files changed, 147 insertions(+), 176 deletions(-) delete mode 100644 ui/pages/DataManagement/components/PhenotypePri.jsx create mode 100644 ui/pages/DataManagement/components/PhenotypePrioritization.jsx diff --git a/seqr/migrations/0048_phenotypeprioritization.py b/seqr/migrations/0048_phenotypeprioritization.py index d249142dcc..5f7900bdc3 100644 --- a/seqr/migrations/0048_phenotypeprioritization.py +++ b/seqr/migrations/0048_phenotypeprioritization.py @@ -1,7 +1,8 @@ -# Generated by Django 3.2.15 on 2022-10-05 16:09 +# Generated by Django 3.2.15 on 2022-10-12 15:03 from django.db import migrations, models import django.db.models.deletion +import seqr.models class Migration(migrations.Migration): @@ -16,12 +17,13 @@ class Migration(migrations.Migration): fields=[ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('gene_id', models.CharField(max_length=20)), - ('tool', models.CharField(choices=[('E', 'exomiser'), ('L', 'lirical')], max_length=1)), + ('tool', models.CharField(max_length=20)), ('rank', models.IntegerField()), ('disease_id', models.CharField(max_length=32)), ('disease_name', models.TextField()), ('scores', models.JSONField()), ('individual', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='seqr.individual')), ], + bases=(models.Model, seqr.models.BulkOperationBase), ), ] diff --git a/seqr/models.py b/seqr/models.py index 8165bc5382..d47db0c1c1 100644 --- a/seqr/models.py +++ b/seqr/models.py @@ -45,6 +45,28 @@ def get_audit_field_names(audit_field): return list(_get_audit_fields(audit_field).keys()) +class BulkOperationBase: + @classmethod + def bulk_create(cls, user, new_models, parent=None): + """Helper bulk create method that logs the creation""" + for model in new_models: + model.created_by = user + models = cls.objects.bulk_create(new_models) + log_model_bulk_update(logger, models, user, 'create', parent=parent) + return models + + @classmethod + def bulk_delete(cls, user, queryset=None, parent=None, **filter_kwargs): + """Helper bulk delete method that logs the deletion""" + if queryset is None: + queryset = cls.objects.filter(**filter_kwargs) + log_model_bulk_update(logger, queryset, user, 'delete', parent=parent) + return queryset.delete() + + class Meta: + abstract = True + + class CustomModelBase(base.ModelBase): def __new__(cls, name, bases, attrs, **kwargs): audit_fields = getattr(attrs.get('Meta'), 'audit_fields', None) @@ -1055,17 +1077,11 @@ class Meta: json_fields = ['gene_id', 'tpm'] -class PhenotypePrioritization(models.Model): - TOOL_CHOICES = ( - ('E', 'exomiser'), - ('L', 'lirical') - ) - TOOL_LOOKUP = {v: k for k, v in TOOL_CHOICES} - +class PhenotypePrioritization(models.Model, BulkOperationBase): individual = models.ForeignKey('Individual', on_delete=models.CASCADE, db_index=True) gene_id = models.CharField(max_length=20) # ensembl ID - tool = models.CharField(max_length=1, choices=TOOL_CHOICES) + tool = models.CharField(max_length=20) rank = models.IntegerField() disease_id = models.CharField(max_length=32) 
disease_name = models.TextField() @@ -1075,4 +1091,4 @@ def __unicode__(self): return "%s:%s:%s" % (self.individual.individual_id, self.gene_id, self.disease_id) class Meta: - json_fields = ['gene_id', 'tool', 'rank', 'disease_id', 'scores'] + json_fields = ['gene_id', 'tool', 'rank', 'disease_id', 'disease_name', 'scores'] diff --git a/seqr/urls.py b/seqr/urls.py index 901f941556..88d837fb5e 100644 --- a/seqr/urls.py +++ b/seqr/urls.py @@ -114,7 +114,7 @@ forgot_password from seqr.views.apis.data_manager_api import elasticsearch_status, upload_qc_pipeline_output, delete_index, \ - update_rna_seq, load_rna_seq_sample_data, load_phenotype_pri_data, proxy_to_kibana + update_rna_seq, load_rna_seq_sample_data, proxy_to_kibana, load_phenotype_prioritization_data from seqr.views.apis.report_api import \ anvil_export, \ discovery_sheet, \ @@ -307,7 +307,7 @@ 'data_management/get_all_users': get_all_users, 'data_management/update_rna_seq': update_rna_seq, 'data_management/load_rna_seq_sample/(?P[^/]+)': load_rna_seq_sample_data, - 'data_management/load_phenotype_pri_data': load_phenotype_pri_data, + 'data_management/load_phenotype_prioritization_data': load_phenotype_prioritization_data, 'summary_data/saved_variants/(?P[^/]+)': saved_variants_page, 'summary_data/success_story/(?P[^/]+)': success_story, diff --git a/seqr/utils/logging_utils.py b/seqr/utils/logging_utils.py index f6e5ea1ef4..59c1174c08 100644 --- a/seqr/utils/logging_utils.py +++ b/seqr/utils/logging_utils.py @@ -73,11 +73,14 @@ def log_model_update(logger, model, user, update_type, update_fields=None): logger.info('{} {} {}'.format(update_type, db_entity, entity_id), user, db_update=db_update) -def log_model_bulk_update(logger, models, user, update_type, update_fields=None): +def log_model_bulk_update(logger, models, user, update_type, update_fields=None, parent=None): if not models: return [] db_entity = type(models[0]).__name__ - entity_ids = [o.guid for o in models] + if parent: + entity_ids = list({getattr(o, parent).guid for o in models}) + else: + entity_ids = [o.guid for o in models] db_update = { 'dbEntity': db_entity, 'entityIds': entity_ids, 'updateType': 'bulk_{}'.format(update_type), } diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index ce602ce335..7024d67398 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -9,7 +9,8 @@ import urllib3 from django.contrib.postgres.aggregates import ArrayAgg -from django.db.models import Max +from django.db.models import Max, TextField +from django.db.models.functions import Concat from django.http.response import HttpResponse from django.views.decorators.csrf import csrf_exempt from requests.exceptions import ConnectionError as RequestConnectionError @@ -18,7 +19,7 @@ from seqr.utils.file_utils import file_iter, does_file_exist from seqr.utils.logging_utils import SeqrLogger -from seqr.views.utils.dataset_utils import load_rna_seq_outlier, load_rna_seq_tpm, load_phenotype_pri +from seqr.views.utils.dataset_utils import load_rna_seq_outlier, load_rna_seq_tpm, load_phenotype_prioritization_data_file from seqr.views.utils.file_utils import parse_file, get_temp_upload_directory, load_uploaded_file from seqr.views.utils.json_utils import create_json_response, _to_camel_case from seqr.views.utils.permissions_utils import data_manager_required @@ -395,26 +396,65 @@ def load_rna_seq_sample_data(request, sample_guid): return create_json_response({'success': True}) +def _load_phenotype_prioritization(file_path, 
user): + data_by_id = load_phenotype_prioritization_data_file(file_path) + + all_samples = [sample for project_samples in data_by_id.values() for sample in project_samples.values()] + all_records = [rec for sample_records in all_samples for rec in sample_records] + message = f'Parsed {len(all_records)} LIRICAL/Exomiser data records in {len(all_samples)} samples' + info = [message] + logger.info(message, user) + + for project, project_samples in data_by_id.items(): + indivs = Individual.objects.filter(family__project__name=project, individual_id__in=project_samples.keys()) + existing_indivs_by_id = {ind.individual_id: ind for ind in indivs} + + tool_sample_id_set = set() + for sample_id, records in project_samples.items(): + if existing_indivs_by_id[sample_id]: + for rec in records: + rec['individual'] = existing_indivs_by_id[sample_id] + tool_sample_id_set.add(f'{rec["tool"]}{sample_id}') + else: + raise ValueError(f'Individual {sample_id} doesn\'t exist in project {project}') + + # Delete old data + to_delete = PhenotypePrioritization.objects.annotate( + tool_ind=Concat('tool', 'individual__individual_id', output_field=TextField()) + ).filter( + tool_ind__in=tool_sample_id_set, + ) + if to_delete: + deleted, _ = PhenotypePrioritization.bulk_delete(user, to_delete, parent='individual') + message = f'Deleted {deleted} existing phenotype-based prioritization records from project {project}' + info.append(message) + logger.info(message, user) + + + project_names = ', '.join(sorted(data_by_id.keys())) + message = 'Attempted data loading for {} phenotype-based prioritization records in the following {} projects: {}'.format( + len(all_records), len(data_by_id.keys()), project_names) + info.append(message) + logger.info(message, user) + + return all_records, info + + @data_manager_required -def load_phenotype_pri_data(request): +def load_phenotype_prioritization_data(request): request_json = json.loads(request.body) file_name = request_json['file'] - ignore_extra_samples = request_json.get('ignoreExtraSamples', False) logger.info(f'Loading phenotype prioritization data from {file_name}', request.user) - records, info, warnings = load_phenotype_pri(file_name, request.user, ignore_extra_samples) - models = PhenotypePrioritization.objects.bulk_create([PhenotypePrioritization(**data) for data in records]) - ind_guids = {data['individual'].guid for data in records} - logger.info(f'create {len(models)} {PhenotypePrioritization.__name__}', request.user, db_update={ - 'dbEntity': PhenotypePrioritization.__name__, 'numEntities': len(models), 'parentEntityIds': sorted(ind_guids), - 'updateType': 'bulk_create', - }) + records, info = _load_phenotype_prioritization(file_name, request.user) + models = PhenotypePrioritization.bulk_create(request.user, [PhenotypePrioritization(**data) for data in records], + parent='individual') + info.append(f'Loaded {len(models)} LIRICAL/Exomiser data records') return create_json_response({ 'info': info, - 'warnings': warnings, 'success': True }) diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 0e5831590e..99562007be 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -1,17 +1,17 @@ import elasticsearch_dsl from collections import defaultdict -from django.db.models import prefetch_related_objects, TextField -from django.db.models.functions import Concat +from django.db.models import prefetch_related_objects from django.utils import timezone from tqdm import tqdm import random -from seqr.models import 
Sample, Individual, Family, RnaSeqOutlier, RnaSeqTpm, PhenotypePrioritization +from seqr.models import Sample, Individual, Family, RnaSeqOutlier, RnaSeqTpm from seqr.utils.elasticsearch.utils import get_es_client, get_index_metadata from seqr.utils.file_utils import file_iter from seqr.utils.logging_utils import log_model_bulk_update, SeqrLogger from seqr.views.utils.file_utils import parse_file from seqr.views.utils.permissions_utils import get_internal_projects +from seqr.views.utils.json_utils import _to_snake_case logger = SeqrLogger(__name__) @@ -452,113 +452,38 @@ def _load_rna_seq(model_cls, file_path, user, mapping_file, ignore_extra_samples return samples_to_load, info, warnings -TOOL_FIELD = 'tool' -PHENOTYPE_PRI_HEADER = { - 'tool': TOOL_FIELD, 'project': 'project', 'sampleId': 'sample_id', 'rank': 'rank', 'geneId': 'gene_id', - 'diseaseId': 'disease_id', 'diseaseName': 'disease_name', 'scoreName1': 'score_name1', 'score1': 'score1', - 'scoreName2': 'score_name2', 'score2': 'score2', 'scoreName3': 'score_name3', 'score3': 'score3'} +PHENOTYPE_PRI_HEADER = ['tool', 'project', 'sampleId', 'rank', 'geneId', 'diseaseId', 'diseaseName', 'scoreName1', 'score1'] +MAX_SCORES = 100 def _parse_phenotype_pri_row(row): - record = {mapped_key: row[key] for key, mapped_key in PHENOTYPE_PRI_HEADER.items()} - - tool = PhenotypePrioritization.TOOL_LOOKUP.get(record[TOOL_FIELD], None) - if not tool: - raise ValueError('Expecting {} for the "tool" column but found {}'.format( - ', '.join([v for k, v in PhenotypePrioritization.TOOL_CHOICES]), record[TOOL_FIELD])) - record[TOOL_FIELD] = tool - - scores = {record.pop('score_name1'): record.pop('score1')} - for score_index in ['2', '3']: - score_name = record.pop('score_name' + score_index, None) - score = record.pop('score' + score_index, None) - if score_name: - scores[score_name] = score + record = {_to_snake_case(key): row[key] for key in PHENOTYPE_PRI_HEADER[:-2]} + + scores = {} + for i in range(1, MAX_SCORES): + if not row[f'scoreName{i}']: + break + scores[row[f'scoreName{i}']] = row[f'score{i}'] record['scores'] = scores - return record + yield record['sample_id'], record -def _load_phenotype_pri_file(file_path): - data_by_id = defaultdict(dict) +def load_phenotype_prioritization_data_file(file_path): + data_by_id = defaultdict(lambda: defaultdict(list)) f = file_iter(file_path) header = _parse_tsv_row(next(f)) - missing_cols = [col for col in PHENOTYPE_PRI_HEADER.keys() if col not in header] + missing_cols = [col for col in PHENOTYPE_PRI_HEADER if col not in header] if missing_cols: raise ValueError(f'Invalid file: missing column(s) {", ".join(missing_cols)}') - count = 0 for line in tqdm(f, unit=' rows'): row = dict(zip(header, _parse_tsv_row(line))) - record = _parse_phenotype_pri_row(row) - sample_id = record.pop('sample_id', None) - project = record.pop('project', None) - if not sample_id or not project: - raise ValueError('Both sample ID and project fields are required.') - if data_by_id[sample_id]: - if project != data_by_id[sample_id]['project']: # a sample must belong to a single project - raise ValueError(f'Invalid project name for sample {sample_id}') - data_by_id[sample_id]['records'].append(record) - else: - data_by_id[sample_id]['project'] = project - data_by_id[sample_id]['records'] = [record] - count += 1 - - return count, data_by_id - - -def load_phenotype_pri(file_path, user, ignore_extra_samples): - count, data_by_id = _load_phenotype_pri_file(file_path) - - message = f'Parsed {count} LIRICAL/Exomiser data records in 
{len(data_by_id)} samples' - info = [message] - logger.info(message, user) - - indivs = Individual.objects.filter(individual_id__in=data_by_id.keys()) - prefetch_related_objects(indivs, 'family__project') - existing_indivs_by_id = {ind.individual_id: ind for ind in indivs - if ind.family.project.name == data_by_id[ind.individual_id]['project']} - - extra_ids = set() - extra_records = 0 - for sample_id, value in data_by_id.items(): - if existing_indivs_by_id[sample_id]: - for rec in value['records']: - rec['individual'] = existing_indivs_by_id[sample_id] - else: - data_by_id.pop(sample_id) - extra_ids.add(sample_id) - extra_records += len(value['records']) - - warnings = [] - if extra_records: - skipped_samples = ', '.join(sorted(extra_ids)) - if ignore_extra_samples: - warnings = [f'Skipped loading {extra_records} records for the following {len(extra_ids)} unmatched samples: {skipped_samples}'] - else: - raise ValueError(f'Unable to find matches for the following samples: {skipped_samples}') - - # Delete old data - to_delete = PhenotypePrioritization.objects.annotate( - tool_ind=Concat('tool', 'individual__individual_id', output_field=TextField()) - ).filter( - tool_ind__in={rec[TOOL_FIELD]+sample_id for sample_id, values in data_by_id.items() for rec in values['records']}, - ) - if to_delete: - prefetch_related_objects(to_delete, 'individual') - info.append(f'Deleted {len(to_delete)} existing LIRICAL/Exomiser records') - logger.info(f'delete {len(to_delete)} {PhenotypePrioritization.__name__}s', user, db_update={ - 'dbEntity': PhenotypePrioritization.__name__, 'numEntities': len(to_delete), 'updateType': 'bulk_delete', - 'parentEntityIds': list({model.individual.guid for model in to_delete}), - }) - to_delete.delete() - - records_to_load = [rec for value in data_by_id.values() for rec in value['records']] - projects = {value['project'] for value in data_by_id.values()} - project_names = ', '.join(sorted(projects)) - message = 'Attempted data loading for {} LIRICAL/Exomiser records in the following {} projects: {}'.format( - len(records_to_load), len(projects), project_names) - info.append(message) - logger.info(message, user) - - return records_to_load, info, warnings + for sample_id, row_dict in _parse_phenotype_pri_row(row): + row_dict.pop('sample_id') + project = row_dict.pop('project', None) + if not sample_id or not project: + raise ValueError('Both sample ID and project fields are required.') + data_by_id[project][sample_id].append(row_dict) + + return data_by_id diff --git a/ui/pages/DataManagement/DataManagement.jsx b/ui/pages/DataManagement/DataManagement.jsx index ab25d5e7a3..dfc809d641 100644 --- a/ui/pages/DataManagement/DataManagement.jsx +++ b/ui/pages/DataManagement/DataManagement.jsx @@ -10,7 +10,7 @@ import ElasticsearchStatus from './components/ElasticsearchStatus' import RnaSeq from './components/RnaSeq' import SampleQc from './components/SampleQc' import Users from './components/Users' -import PhenotypePri from './components/PhenotypePri' +import PhenotypePrioritization from './components/PhenotypePrioritization' const IFRAME_STYLE = { position: 'fixed', left: '0', top: '95px' } @@ -23,7 +23,7 @@ export const DATA_MANAGEMENT_PAGES = [ { path: 'sample_qc', component: SampleQc }, { path: 'rna_seq', component: RnaSeq }, { path: 'users', component: Users }, - { path: 'lirical_exomiser', component: PhenotypePri }, + { path: 'phenotype_prioritization', component: PhenotypePrioritization }, ] const DataManagement = ({ match, user }) => ( diff --git 
a/ui/pages/DataManagement/components/PhenotypePri.jsx b/ui/pages/DataManagement/components/PhenotypePri.jsx deleted file mode 100644 index de64606ba3..0000000000 --- a/ui/pages/DataManagement/components/PhenotypePri.jsx +++ /dev/null @@ -1,31 +0,0 @@ -import { connect } from 'react-redux' - -import { validators } from 'shared/components/form/FormHelpers' -import { BooleanCheckbox } from 'shared/components/form/Inputs' -import UploadFormPage from 'shared/components/page/UploadFormPage' - -import { getPhenoPriUploadStats } from '../selectors' -import { uploadPhenoPri } from '../reducers' - -const mapStateToProps = state => ({ - fields: [ - { - name: 'file', - label: 'Phenotype-based prioritization data (.tsv)', - placeholder: 'gs:// Google bucket path', - validate: validators.required, - }, - { - name: 'ignoreExtraSamples', - component: BooleanCheckbox, - label: 'Ignore extra samples', - }, - ], - uploadStats: getPhenoPriUploadStats(state), -}) - -const mapDispatchToProps = { - onSubmit: uploadPhenoPri, -} - -export default connect(mapStateToProps, mapDispatchToProps)(UploadFormPage) diff --git a/ui/pages/DataManagement/components/PhenotypePrioritization.jsx b/ui/pages/DataManagement/components/PhenotypePrioritization.jsx new file mode 100644 index 0000000000..1d5a3ba175 --- /dev/null +++ b/ui/pages/DataManagement/components/PhenotypePrioritization.jsx @@ -0,0 +1,27 @@ +import { connect } from 'react-redux' + +import { validators } from 'shared/components/form/FormHelpers' +import UploadFormPage from 'shared/components/page/UploadFormPage' + +import { getPhePriUploadStats } from '../selectors' +import { uploadPhenotypePrioritization } from '../reducers' + +const FIELDS = [ + { + name: 'file', + label: 'Phenotype-based prioritization data (.tsv)', + placeholder: 'gs:// Google bucket path', + validate: validators.required, + }, +] + +const mapStateToProps = state => ({ + fields: FIELDS, + uploadStats: getPhePriUploadStats(state), +}) + +const mapDispatchToProps = { + onSubmit: uploadPhenotypePrioritization, +} + +export default connect(mapStateToProps, mapDispatchToProps)(UploadFormPage) diff --git a/ui/pages/DataManagement/reducers.js b/ui/pages/DataManagement/reducers.js index 080b56b338..7cc15fd611 100644 --- a/ui/pages/DataManagement/reducers.js +++ b/ui/pages/DataManagement/reducers.js @@ -8,7 +8,7 @@ const REQUEST_ELASTICSEARCH_STATUS = 'REQUEST_ELASTICSEARCH_STATUS' const RECEIVE_ELASTICSEARCH_STATUS = 'RECEIVE_ELASTICSEARCH_STATUS' const RECEIVE_PIPELINE_UPLOAD_STATS = 'RECEIVE_PIPELINE_UPLOAD_STATS' const RECEIVE_RNA_SEQ_UPLOAD_STATS = 'RECEIVE_RNA_SEQ_UPLOAD_STATS' -const RECEIVE_PHENO_PRI_UPLOAD_STATS = 'RECEIVE_PHENO_PRI_UPLOAD_STATS' +const RECEIVE_PHE_PRI_UPLOAD_STATS = 'RECEIVE_PHE_PRI_UPLOAD_STATS' const REQUEST_ALL_USERS = 'REQUEST_ALL_USERS' const RECEIVE_ALL_USERS = 'RECEIVE_ALL_USERS' @@ -76,27 +76,16 @@ export const uploadRnaSeq = values => (dispatch) => { }) } -export const uploadPhenoPri = values => (dispatch) => { - let successResponseJson = null - return new HttpRequestHelper( - '/api/data_management/load_phenotype_pri_data', - (responseJson) => { - successResponseJson = responseJson - }, - (e) => { - successResponseJson = { warnings: [e.message] } - }, - ).post(values).then(() => { - dispatch({ type: RECEIVE_PHENO_PRI_UPLOAD_STATS, newValue: successResponseJson }) - }) -} +export const uploadPhenotypePrioritization = values => submitRequest( + 'load_phenotype_prioritization_data', RECEIVE_PHE_PRI_UPLOAD_STATS, values, +) export const reducers = { 
elasticsearchStatusLoading: loadingReducer(REQUEST_ELASTICSEARCH_STATUS, RECEIVE_ELASTICSEARCH_STATUS), elasticsearchStatus: createSingleObjectReducer(RECEIVE_ELASTICSEARCH_STATUS), qcUploadStats: createSingleValueReducer(RECEIVE_PIPELINE_UPLOAD_STATS, {}), rnaSeqUploadStats: createSingleValueReducer(RECEIVE_RNA_SEQ_UPLOAD_STATS, {}), - phenoPriUploadStats: createSingleValueReducer(RECEIVE_PHENO_PRI_UPLOAD_STATS, {}), + phePriUploadStats: createSingleValueReducer(RECEIVE_PHE_PRI_UPLOAD_STATS, {}), allUsers: createSingleValueReducer(RECEIVE_ALL_USERS, [], 'users'), allUsersLoading: loadingReducer(REQUEST_ALL_USERS, RECEIVE_ALL_USERS), } diff --git a/ui/pages/DataManagement/selectors.js b/ui/pages/DataManagement/selectors.js index 48ffc39d45..2944714d68 100644 --- a/ui/pages/DataManagement/selectors.js +++ b/ui/pages/DataManagement/selectors.js @@ -4,4 +4,4 @@ export const getQcUploadStats = state => state.qcUploadStats export const getRnaSeqUploadStats = state => state.rnaSeqUploadStats export const getAllUsersLoading = state => state.allUsersLoading.isLoading export const getAllUsers = state => state.allUsers -export const getPhenoPriUploadStats = state => state.phenoPriUploadStats +export const getPhePriUploadStats = state => state.phePriUploadStats From 334791e2899c8d930ef8c8784c21056031b5f7cd Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Wed, 12 Oct 2022 15:50:53 -0400 Subject: [PATCH 11/96] Remove an extra blank line. --- seqr/views/apis/data_manager_api.py | 1 - 1 file changed, 1 deletion(-) diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index 7024d67398..ee94355277 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -430,7 +430,6 @@ def _load_phenotype_prioritization(file_path, user): info.append(message) logger.info(message, user) - project_names = ', '.join(sorted(data_by_id.keys())) message = 'Attempted data loading for {} phenotype-based prioritization records in the following {} projects: {}'.format( len(all_records), len(data_by_id.keys()), project_names) From 362e7d8ca55cc36c7b6339fce6cfab8b389e2021 Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Fri, 14 Oct 2022 10:47:02 -0400 Subject: [PATCH 12/96] Update lirical display. 
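Scores are now serialized with camelCased names so the UI can derive table headers directly from whatever keys a record carries. A rough sketch of that conversion; to_camel_case below only approximates seqr's _to_camel_case helper, and the sample scores are invented:

    def to_camel_case(snake_string):
        # 'post_test_probability' -> 'postTestProbability'; keys without underscores pass through
        parts = snake_string.split('_')
        return parts[0] + ''.join(word.title() for word in parts[1:])

    scores = {'post_test_probability': 0.15, 'compositeLR': 0.003}
    camel_scores = {to_camel_case(key): value for key, value in scores.items()}
    # {'postTestProbability': 0.15, 'compositeLR': 0.003}
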
--- seqr/views/utils/dataset_utils.py | 5 +- seqr/views/utils/orm_to_json_utils.py | 4 +- seqr/views/utils/variant_utils.py | 21 +++--- .../components/panel/variants/VariantGene.jsx | 72 +++++++++---------- .../components/panel/variants/selectors.js | 29 ++++---- 5 files changed, 64 insertions(+), 67 deletions(-) diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 99562007be..3551d0c9c0 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -461,9 +461,10 @@ def _parse_phenotype_pri_row(row): scores = {} for i in range(1, MAX_SCORES): - if not row[f'scoreName{i}']: + score_name = row.get(f'scoreName{i}') + if not score_name: break - scores[row[f'scoreName{i}']] = row[f'score{i}'] + scores[score_name] = float(row[f'score{i}']) record['scores'] = scores yield record['sample_id'], record diff --git a/seqr/views/utils/orm_to_json_utils.py b/seqr/views/utils/orm_to_json_utils.py index 4688ba440a..9fd48a0a25 100644 --- a/seqr/views/utils/orm_to_json_utils.py +++ b/seqr/views/utils/orm_to_json_utils.py @@ -868,8 +868,8 @@ def _process_result(data, model): return _get_json_for_models(models, process_result=_process_result, **kwargs) -def get_json_for_phenotype_pri(models, **kwargs): +def get_json_for_phenotype_prioritization(models, **kwargs): def _process_result(data, model): - data['scores'] = {_to_camel_case(score) for score, value in data['scores'].items()} + data['scores'] = {_to_camel_case(score): value for score, value in data['scores'].items()} return _get_json_for_models(models, process_result=_process_result, **kwargs) diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index 16d15dab99..8ab37c6765 100644 --- a/seqr/views/utils/variant_utils.py +++ b/seqr/views/utils/variant_utils.py @@ -8,7 +8,8 @@ from seqr.utils.gene_utils import get_genes_for_variants from seqr.views.utils.json_to_orm_utils import update_model_from_json from seqr.views.utils.orm_to_json_utils import get_json_for_discovery_tags, get_json_for_locus_lists, \ - _get_json_for_models, get_json_for_rna_seq_outliers, get_json_for_saved_variants_with_tags, get_json_for_phenotype_pri + _get_json_for_models, get_json_for_rna_seq_outliers, get_json_for_saved_variants_with_tags,\ + get_json_for_phenotype_prioritization from seqr.views.utils.permissions_utils import has_case_review_permissions, user_is_analyst from seqr.views.utils.project_context_utils import add_project_tag_types, add_families_context from settings import REDIS_SERVICE_HOSTNAME, REDIS_SERVICE_PORT @@ -128,17 +129,16 @@ def _get_rna_seq_outliers(gene_ids, families): return data_by_individual_gene -def _get_phenotype_pri_data(gene_ids, families): - data_by_individual_gene = defaultdict(lambda: {PhenotypePrioritization.LIRICAL_CHOICE: {}, - PhenotypePrioritization.EXOMISER_CHOICE: {}}) +def _get_phenotype_prioritization(gene_ids, families): + data_by_individual_gene = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) - phe_pri_data = get_json_for_phenotype_pri( + data_dicts = _get_json_for_models( PhenotypePrioritization.objects.filter(gene_id__in=gene_ids, individual__family__in=families), nested_fields=[{'fields': ('individual', 'guid'), 'key': 'individualGuid'}], ) - for data in phe_pri_data: - data_by_individual_gene[data.pop('individualGuid')][data['tool']][data['geneId']] = data + for data in data_dicts: + data_by_individual_gene[data.pop('individualGuid')][data['tool']][data['geneId']].append(data) return data_by_individual_gene @@ -173,9 +173,10 
@@ def _add_pa_detail(locus_list_gene, locus_list_guid, gene_json): LOAD_PROJECT_TAG_TYPES_CONTEXT_PARAM = 'loadProjectTagTypes' LOAD_FAMILY_CONTEXT_PARAM = 'loadFamilyContext' + def get_variants_response(request, saved_variants, response_variants=None, add_all_context=False, include_igv=True, add_locus_list_detail=False, include_rna_seq=True, include_project_name=False, - include_phe_pri=True): + include_phenotype_prioritization=True): response = get_json_for_saved_variants_with_tags(saved_variants, add_details=True) variants = list(response['savedVariantsByGuid'].values()) if response_variants is None else response_variants @@ -220,7 +221,7 @@ def get_variants_response(request, saved_variants, response_variants=None, add_a if families_by_guid: _add_family_has_rna_tpm(families_by_guid) - if include_phe_pri: - response['phePriData'] = _get_phenotype_pri_data(genes.keys(), families) + if include_phenotype_prioritization: + response['phePriData'] = _get_phenotype_prioritization(genes.keys(), families) return response diff --git a/ui/shared/components/panel/variants/VariantGene.jsx b/ui/shared/components/panel/variants/VariantGene.jsx index bcb7a7258e..29eb41f603 100644 --- a/ui/shared/components/panel/variants/VariantGene.jsx +++ b/ui/shared/components/panel/variants/VariantGene.jsx @@ -9,7 +9,6 @@ import { getGenesById, getLocusListsByGuid, getFamiliesByGuid } from 'redux/sele import { panelAppUrl, moiToMoiInitials } from '../../../utils/panelAppUtils' import { MISSENSE_THRESHHOLD, LOF_THRESHHOLD, PANEL_APP_CONFIDENCE_LEVEL_COLORS, PANEL_APP_CONFIDENCE_DESCRIPTION, - LIRICAL, EXOMISER, } from '../../../utils/constants' import { compareObjects } from '../../../utils/sortUtils' import { camelcaseToTitlecase } from '../../../utils/stringUtils' @@ -315,39 +314,34 @@ const GENE_DISEASE_DETAIL_SECTIONS = [ }, ] -const SAMPLE_GENE_DETAIL_FIELDS = { - rnaSeqData: { infos: [], scores: ['zScore', 'pValue', 'pAdjust'] }, - liricalData: { infos: ['rank', 'diseases'], scores: ['postTestProbability', 'LR'] }, - exomiserData: { infos: ['rank', 'diseases'], scores: ['exomiserScore', 'phenotypeScore', 'variantScore'] }, -} - -const sampleGeneDetailsDisplay = (geneId, sampleGeneData, dataType) => ( -
- - - - - {Object.values(SAMPLE_GENE_DETAIL_FIELDS[dataType]).flat().map( - field => {camelcaseToTitlecase(field).replace(' ', '-')}, - )} - - - - {Object.entries(sampleGeneData[geneId]).map(([individual, data]) => ( - - {individual} - {SAMPLE_GENE_DETAIL_FIELDS[dataType].infos.map( - field => {data[field]}, - )} - {SAMPLE_GENE_DETAIL_FIELDS[dataType].scores.map( - field => {data[field].toPrecision(3)}, - )} +const sampleGeneDetailsDisplay = (geneId, sampleGeneData) => { + const { scores, ...info } = Object.values(Object.values(sampleGeneData)[0])[0][0] + const infoKeys = Object.keys(info) + const scoreKeys = Object.keys(scores || {}) + return ( +
+
+ + + + {infoKeys.concat(scoreKeys).map(field => ( + {camelcaseToTitlecase(field).replace(' ', '-')} + ))} - ))} - -
-
-) + + + {Object.entries(sampleGeneData[geneId]).map(([individual, data]) => (data.map(row => ( + + {individual} + {infoKeys.map(field => {row[field]})} + {scoreKeys.map(field => {row.scores[field].toPrecision(3)})} + + ))))} + + + + ) +} const GENE_DETAIL_SECTIONS = [ { @@ -410,20 +404,20 @@ const GENE_DETAIL_SECTIONS = [ }, { color: 'orange', - description: 'LIRICAL Phenotype Prioritization', + description: 'Phenotype Prioritization', label: 'LIRICAL', - showDetails: (gene, { phePriData }) => phePriData && phePriData[LIRICAL][gene.geneId], + showDetails: (gene, { phePriData }) => phePriData && phePriData.lirical && phePriData.lirical[gene.geneId], detailsDisplay: (gene, { phePriData }) => ( - sampleGeneDetailsDisplay(gene.geneId, phePriData, 'lirical') + sampleGeneDetailsDisplay(gene.geneId, phePriData.lirical) ), }, { color: 'orange', - description: 'Exomiser Phenotype Prioritization', + description: 'Phenotype Prioritization', label: 'Exomiser', - showDetails: (gene, { phePriData }) => phePriData && phePriData[EXOMISER][gene.geneId], + showDetails: (gene, { phePriData }) => phePriData && phePriData.exomiser && phePriData.exomiser[gene.geneId], detailsDisplay: (gene, { phePriData }) => ( - sampleGeneDetailsDisplay(gene.geneId, phePriData, 'exomiser') + sampleGeneDetailsDisplay(gene.geneId, phePriData.exomiser) ), }, ] diff --git a/ui/shared/components/panel/variants/selectors.js b/ui/shared/components/panel/variants/selectors.js index 1dd2e01ae1..d6d1abcff7 100644 --- a/ui/shared/components/panel/variants/selectors.js +++ b/ui/shared/components/panel/variants/selectors.js @@ -11,7 +11,6 @@ import { VARIANT_SORT_LOOKUP, SHOW_ALL, VARIANT_EXPORT_DATA, - LIRICAL, EXOMISER, } from 'shared/utils/constants' import { getVariantTagsByGuid, getVariantNotesByGuid, getSavedVariantsByGuid, getAnalysisGroupsByGuid, getGenesById, getUser, @@ -25,31 +24,33 @@ export const getRnaSeqOutilerDataByFamilyGene = createSelector( (acc, [individualGuid, rnaSeqData]) => { const { familyGuid, displayName } = individualsByGuid[individualGuid] acc[familyGuid] = Object.entries(rnaSeqData.outliers || {}).reduce( - (acc2, [geneId, data]) => (data.isSignificant ? - { ...acc2, [geneId]: { ...(acc2[geneId] || {}), [displayName]: data } } : acc2 - ), acc[familyGuid] || {}, + (acc2, [geneId, data]) => { + const { zScore, pValue, pAdjust } = data + return (data.isSignificant ? 
{ + ...acc2, + [geneId]: { ...(acc2[geneId] || {}), [displayName]: [{ scores: { zScore, pValue, pAdjust } }] }, + } : acc2) + }, + acc[familyGuid] || {}, ) return acc }, {}, ), ) -const TOOLS = [LIRICAL, EXOMISER] export const getPhePriDataByFamilyGene = createSelector( getIndividualsByGuid, getPhePriDataByIndividual, (individualsByGuid, phePriDataByIndividual) => Object.entries(phePriDataByIndividual).reduce( (acc, [individualGuid, phePriData]) => { const { familyGuid, displayName } = individualsByGuid[individualGuid] - acc[familyGuid] = TOOLS.reduce( - (accTool, tool) => ({ - ...accTool, - [tool]: Object.entries(phePriData[tool] || {}).reduce( - (acc2, [geneId, data]) => ({ ...acc2, [geneId]: { ...(acc2[geneId] || {}), [displayName]: data } }), - acc[familyGuid] || {}, - ), - }), {}, - ) + acc[familyGuid] = Object.entries(phePriData).reduce((accTool, [tool, toolData]) => ({ + ...accTool, + [tool]: Object.entries(toolData).reduce((acc2, [geneId, data]) => ({ + ...acc2, + [geneId]: { ...(acc2[geneId] || {}), [displayName]: data }, + }), {}), + }), acc[familyGuid] || {}) return acc }, {}, ), From c29633f29b3d3c856805973ad724fc61de916f35 Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Fri, 14 Oct 2022 16:46:37 -0400 Subject: [PATCH 13/96] Update per review comments. --- .../0048_phenotypeprioritization.py | 2 +- seqr/models.py | 51 ++++++------- seqr/utils/logging_utils.py | 7 +- seqr/views/apis/data_manager_api.py | 75 ++++++++----------- seqr/views/apis/data_manager_api_tests.py | 35 +++++---- seqr/views/utils/dataset_utils.py | 30 ++++---- 6 files changed, 95 insertions(+), 105 deletions(-) diff --git a/seqr/migrations/0048_phenotypeprioritization.py b/seqr/migrations/0048_phenotypeprioritization.py index 5f7900bdc3..6333471c06 100644 --- a/seqr/migrations/0048_phenotypeprioritization.py +++ b/seqr/migrations/0048_phenotypeprioritization.py @@ -1,4 +1,4 @@ -# Generated by Django 3.2.15 on 2022-10-12 15:03 +# Generated by Django 3.2.15 on 2022-10-14 20:38 from django.db import migrations, models import django.db.models.deletion diff --git a/seqr/models.py b/seqr/models.py index d47db0c1c1..4658c5002a 100644 --- a/seqr/models.py +++ b/seqr/models.py @@ -7,7 +7,7 @@ from django.contrib.postgres.fields import ArrayField from django.core.exceptions import PermissionDenied from django.db import models -from django.db.models import base, options, ForeignKey, JSONField +from django.db.models import base, options, ForeignKey, JSONField, prefetch_related_objects from django.utils import timezone from django.utils.text import slugify as __slugify @@ -45,28 +45,6 @@ def get_audit_field_names(audit_field): return list(_get_audit_fields(audit_field).keys()) -class BulkOperationBase: - @classmethod - def bulk_create(cls, user, new_models, parent=None): - """Helper bulk create method that logs the creation""" - for model in new_models: - model.created_by = user - models = cls.objects.bulk_create(new_models) - log_model_bulk_update(logger, models, user, 'create', parent=parent) - return models - - @classmethod - def bulk_delete(cls, user, queryset=None, parent=None, **filter_kwargs): - """Helper bulk delete method that logs the deletion""" - if queryset is None: - queryset = cls.objects.filter(**filter_kwargs) - log_model_bulk_update(logger, queryset, user, 'delete', parent=parent) - return queryset.delete() - - class Meta: - abstract = True - - class CustomModelBase(base.ModelBase): def __new__(cls, name, bases, attrs, **kwargs): audit_fields = getattr(attrs.get('Meta'), 'audit_fields', None) @@ 
-1033,19 +1011,36 @@ def __unicode__(self): def _compute_guid(self): return 'VSR%07d_%s' % (self.id, _slugify(str(self))) -class DeletableSampleMetadataModel(models.Model): - sample = models.ForeignKey('Sample', on_delete=models.CASCADE, db_index=True) - gene_id = models.CharField(max_length=20) # ensembl ID +class BulkOperationBase: + @classmethod + def bulk_create(cls, user, new_models, parent=None): + """Helper bulk create method that logs the creation""" + for model in new_models: + model.created_by = user + models = cls.objects.bulk_create(new_models) + log_model_bulk_update(logger, models, user, 'create', parent=parent) + return models @classmethod - def bulk_delete(cls, user, queryset=None, **filter_kwargs): + def bulk_delete(cls, user, queryset=None, parent=None, **filter_kwargs): """Helper bulk delete method that logs the deletion""" if queryset is None: queryset = cls.objects.filter(**filter_kwargs) - log_model_bulk_update(logger, queryset, user, 'delete') + if parent: + prefetch_related_objects(queryset, parent) + log_model_bulk_update(logger, queryset, user, 'delete', parent=parent) return queryset.delete() + class Meta: + abstract = True + + +class DeletableSampleMetadataModel(models.Model, BulkOperationBase): + + sample = models.ForeignKey('Sample', on_delete=models.CASCADE, db_index=True) + gene_id = models.CharField(max_length=20) # ensembl ID + def __unicode__(self): return "%s:%s" % (self.sample.sample_id, self.gene_id) diff --git a/seqr/utils/logging_utils.py b/seqr/utils/logging_utils.py index 59c1174c08..eb205af86d 100644 --- a/seqr/utils/logging_utils.py +++ b/seqr/utils/logging_utils.py @@ -77,13 +77,12 @@ def log_model_bulk_update(logger, models, user, update_type, update_fields=None, if not models: return [] db_entity = type(models[0]).__name__ - if parent: - entity_ids = list({getattr(o, parent).guid for o in models}) - else: - entity_ids = [o.guid for o in models] + entity_ids = [o.guid if hasattr(o, 'guid') else o.id for o in models] db_update = { 'dbEntity': db_entity, 'entityIds': entity_ids, 'updateType': 'bulk_{}'.format(update_type), } + if parent: + db_update['parentEntityIds'] = list({getattr(model, parent).guid for model in models}) if update_fields: db_update['updateFields'] = list(update_fields) logger.info( diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index ee94355277..8e8c610b5f 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -9,8 +9,7 @@ import urllib3 from django.contrib.postgres.aggregates import ArrayAgg -from django.db.models import Max, TextField -from django.db.models.functions import Concat +from django.db.models import Max from django.http.response import HttpResponse from django.views.decorators.csrf import csrf_exempt from requests.exceptions import ConnectionError as RequestConnectionError @@ -388,53 +387,45 @@ def load_rna_seq_sample_data(request, sample_guid): data_by_gene = json.loads(row.split('\t\t')[1]) model_cls = RNA_DATA_TYPE_CONFIGS[data_type]['model_class'] - models = model_cls.objects.bulk_create([model_cls(sample=sample, **data) for data in data_by_gene.values()]) - logger.info(f'create {len(models)} {model_cls.__name__}', request.user, db_update={ - 'dbEntity': model_cls.__name__, 'numEntities': len(models), 'parentEntityIds': [sample_guid], 'updateType': 'bulk_create', - }) + model_cls.bulk_create(request.user, [model_cls(sample=sample, **data) for data in data_by_gene.values()], parent='sample') return create_json_response({'success': 
True}) +def _log_append_info(user, info, message): + info.append(message) + logger.info(message, user) + + def _load_phenotype_prioritization(file_path, user): - data_by_id = load_phenotype_prioritization_data_file(file_path) + tool, data_by_project_sample_id = load_phenotype_prioritization_data_file(file_path) - all_samples = [sample for project_samples in data_by_id.values() for sample in project_samples.values()] - all_records = [rec for sample_records in all_samples for rec in sample_records] - message = f'Parsed {len(all_records)} LIRICAL/Exomiser data records in {len(all_samples)} samples' - info = [message] - logger.info(message, user) + info = [] + _log_append_info(user, info, f'Parsed {tool.upper()} data for project(s): {", ".join(data_by_project_sample_id.keys())}') - for project, project_samples in data_by_id.items(): - indivs = Individual.objects.filter(family__project__name=project, individual_id__in=project_samples.keys()) + all_records = [] + to_delete = None + for project, records_by_sample in data_by_project_sample_id.items(): + indivs = Individual.objects.filter(family__project__name=project, individual_id__in=records_by_sample.keys()) existing_indivs_by_id = {ind.individual_id: ind for ind in indivs} - tool_sample_id_set = set() - for sample_id, records in project_samples.items(): - if existing_indivs_by_id[sample_id]: - for rec in records: - rec['individual'] = existing_indivs_by_id[sample_id] - tool_sample_id_set.add(f'{rec["tool"]}{sample_id}') - else: - raise ValueError(f'Individual {sample_id} doesn\'t exist in project {project}') + missing_individuals = set(records_by_sample.keys()) - set(existing_indivs_by_id.keys()) + if missing_individuals: + raise ValueError(f'Individual {", ".join(list(missing_individuals))} doesn\'t exist') + for sample_id, records in records_by_sample.items(): + for rec in records: + rec['individual'] = existing_indivs_by_id[sample_id] - # Delete old data - to_delete = PhenotypePrioritization.objects.annotate( - tool_ind=Concat('tool', 'individual__individual_id', output_field=TextField()) - ).filter( - tool_ind__in=tool_sample_id_set, - ) - if to_delete: - deleted, _ = PhenotypePrioritization.bulk_delete(user, to_delete, parent='individual') - message = f'Deleted {deleted} existing phenotype-based prioritization records from project {project}' - info.append(message) - logger.info(message, user) - - project_names = ', '.join(sorted(data_by_id.keys())) - message = 'Attempted data loading for {} phenotype-based prioritization records in the following {} projects: {}'.format( - len(all_records), len(data_by_id.keys()), project_names) - info.append(message) - logger.info(message, user) + exist_records = PhenotypePrioritization.objects.filter(tool=tool, individual__in=indivs) + to_delete = to_delete | exist_records if to_delete else exist_records + + records = [rec for records in records_by_sample.values() for rec in records] + _log_append_info(user, info, f'Attempted loading {len(records)} records of {tool.upper()} data to project {project}') + all_records += records + + if to_delete: + deleted, _ = PhenotypePrioritization.bulk_delete(user, to_delete, parent='individual') + _log_append_info(user, info, f'Deleted {deleted} existing {tool.upper()} records') return all_records, info @@ -445,12 +436,12 @@ def load_phenotype_prioritization_data(request): file_name = request_json['file'] - logger.info(f'Loading phenotype prioritization data from {file_name}', request.user) - records, info = _load_phenotype_prioritization(file_name, request.user) + 
logger.info(f'Loading phenotype-based prioritization data from {file_name}', request.user) + records, info, tool = _load_phenotype_prioritization(file_name, request.user) models = PhenotypePrioritization.bulk_create(request.user, [PhenotypePrioritization(**data) for data in records], parent='individual') - info.append(f'Loaded {len(models)} LIRICAL/Exomiser data records') + info.append(f'Loaded {len(models)} {tool.upper()} data records') return create_json_response({ 'info': info, diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py index 74ee63de81..9315a71b3a 100644 --- a/seqr/views/apis/data_manager_api_tests.py +++ b/seqr/views/apis/data_manager_api_tests.py @@ -594,7 +594,9 @@ def test_kibana_proxy(self): @mock.patch('seqr.utils.file_utils.subprocess.Popen') @mock.patch('seqr.views.apis.data_manager_api.gzip.open') @mock.patch('seqr.views.utils.dataset_utils.logger') - def test_update_rna_seq(self, mock_logger, mock_open, mock_subprocess, mock_load_uploaded_file, mock_os, mock_datetime): + @mock.patch('seqr.models.logger') + def test_update_rna_seq(self, mock_model_logger, mock_logger, mock_open, mock_subprocess, mock_load_uploaded_file, + mock_os, mock_datetime): url = reverse(update_rna_seq) self.check_data_manager_login(url) @@ -687,14 +689,13 @@ def mock_write(content): response_json = response.json() self.assertDictEqual(response_json, {'info': info, 'warnings': warnings, 'sampleGuids': [mock.ANY], 'fileName': file_name}) deleted_count = params.get('deleted_count', params['initial_model_count']) - mock_logger.info.assert_has_calls( - [mock.call(info_log, self.data_manager_user) for info_log in info] + [ - mock.call(f'delete {deleted_count} {model_cls.__name__}s', self.data_manager_user, db_update={ - 'dbEntity': model_cls.__name__, 'numEntities': deleted_count, 'parentEntityIds': mock.ANY, 'updateType': 'bulk_delete', - }), - ], any_order=True - ) - self.assertTrue(RNA_SAMPLE_GUID in mock_logger.info.call_args_list[1].kwargs['db_update']['parentEntityIds']) + mock_logger.info.assert_has_calls([mock.call(info_log, self.data_manager_user) for info_log in info]) + mock_model_logger.info.assert_called_with( + f'delete {deleted_count} {model_cls.__name__}s', self.data_manager_user, db_update={ + 'dbEntity': model_cls.__name__, 'entityIds': mock.ANY, 'parentEntityIds': mock.ANY, + 'updateType': 'bulk_delete', + }) + self.assertTrue(RNA_SAMPLE_GUID in mock_model_logger.info.call_args_list[1].kwargs['db_update']['parentEntityIds']) mock_logger.warning.assert_has_calls([mock.call(warn_log, self.data_manager_user) for warn_log in warnings]) # test database models are correct @@ -718,7 +719,8 @@ def mock_write(content): @mock.patch('seqr.views.apis.data_manager_api.os') @mock.patch('seqr.views.apis.data_manager_api.gzip.open') @mock.patch('seqr.views.apis.data_manager_api.logger') - def test_load_rna_seq_sample_data(self, mock_logger, mock_open, mock_os): + @mock.patch('seqr.models.logger') + def test_load_rna_seq_sample_data(self, mock_model_logger, mock_logger, mock_open, mock_os): mock_os.path.join.side_effect = lambda *args: '/'.join(args[1:]) url = reverse(load_rna_seq_sample_data, args=[RNA_SAMPLE_GUID]) @@ -743,11 +745,12 @@ def test_load_rna_seq_sample_data(self, mock_logger, mock_open, mock_os): mock_open.assert_called_with(file_name, 'rt') - mock_logger.info.assert_has_calls([ - mock.call('Loading outlier data for NA19675_D2', self.data_manager_user), - mock.call(f'create 2 {model_cls.__name__}', self.data_manager_user, db_update={ - 
'dbEntity': model_cls.__name__, 'numEntities': 2, 'parentEntityIds': [RNA_SAMPLE_GUID], 'updateType': 'bulk_create', - }), - ]) + mock_logger.info.assert_called_with('Loading outlier data for NA19675_D2', self.data_manager_user) + mock_model_logger.info.assert_called_with( + f'create 2 {model_cls.__name__}s', self.data_manager_user, db_update={ + 'dbEntity': model_cls.__name__, 'entityIds': mock.ANY, 'parentEntityIds': [RNA_SAMPLE_GUID], + 'updateType': 'bulk_create', + } + ) self.assertListEqual(params['get_models_json'](models), params['expected_models_json']) diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 99562007be..8134c821cb 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -419,12 +419,7 @@ def _load_rna_seq(model_cls, file_path, user, mapping_file, ignore_extra_samples individual_db_ids = {s.individual_id for s in samples} to_delete = model_cls.objects.filter(sample__individual_id__in=individual_db_ids).exclude(sample__data_source=data_source) if to_delete: - prefetch_related_objects(to_delete, 'sample') - logger.info(f'delete {len(to_delete)} {model_cls.__name__}s', user, db_update={ - 'dbEntity': model_cls.__name__, 'numEntities': len(to_delete), 'updateType': 'bulk_delete', - 'parentEntityIds': list({model.sample.guid for model in to_delete}), - }) - to_delete.delete() + model_cls.bulk_delete(user, to_delete, parent='sample') loaded_sample_ids = set(model_cls.objects.filter(sample__in=samples).values_list('sample_id', flat=True).distinct()) samples_to_load = { @@ -452,31 +447,34 @@ def _load_rna_seq(model_cls, file_path, user, mapping_file, ignore_extra_samples return samples_to_load, info, warnings -PHENOTYPE_PRI_HEADER = ['tool', 'project', 'sampleId', 'rank', 'geneId', 'diseaseId', 'diseaseName', 'scoreName1', 'score1'] +PHENOTYPE_PRIORITIZATION_HEADER = ['tool', 'project', 'sampleId', 'rank', 'geneId', 'diseaseId', 'diseaseName'] +PHENOTYPE_PRIORITIZATION_REQUIRED_HEADER = PHENOTYPE_PRIORITIZATION_HEADER + ['scoreName1', 'score1'] MAX_SCORES = 100 def _parse_phenotype_pri_row(row): - record = {_to_snake_case(key): row[key] for key in PHENOTYPE_PRI_HEADER[:-2]} + record = {_to_snake_case(key): row[key] for key in PHENOTYPE_PRIORITIZATION_HEADER} scores = {} for i in range(1, MAX_SCORES): - if not row[f'scoreName{i}']: + score_name = row.get(f'scoreName{i}') + if not score_name: break - scores[row[f'scoreName{i}']] = row[f'score{i}'] + scores[score_name] = float(row[f'score{i}']) record['scores'] = scores yield record['sample_id'], record def load_phenotype_prioritization_data_file(file_path): - data_by_id = defaultdict(lambda: defaultdict(list)) + data_by_project_sample_id = defaultdict(lambda: defaultdict(list)) f = file_iter(file_path) header = _parse_tsv_row(next(f)) - missing_cols = [col for col in PHENOTYPE_PRI_HEADER if col not in header] + missing_cols = [col for col in PHENOTYPE_PRIORITIZATION_REQUIRED_HEADER if col not in header] if missing_cols: raise ValueError(f'Invalid file: missing column(s) {", ".join(missing_cols)}') + tool = None for line in tqdm(f, unit=' rows'): row = dict(zip(header, _parse_tsv_row(line))) for sample_id, row_dict in _parse_phenotype_pri_row(row): @@ -484,6 +482,10 @@ def load_phenotype_prioritization_data_file(file_path): project = row_dict.pop('project', None) if not sample_id or not project: raise ValueError('Both sample ID and project fields are required.') - data_by_id[project][sample_id].append(row_dict) + 
data_by_project_sample_id[project][sample_id].append(row_dict) + if tool and tool != row_dict['tool']: + raise ValueError(f'Multiple tools found {tool} and {row_dict["tool"]}. Only one is supported.') + if not tool: + tool = row_dict['tool'] - return data_by_id + return tool, data_by_project_sample_id From a87f77c062e3122061d9bbb232ecdec25c02a0ec Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Mon, 17 Oct 2022 13:01:44 -0400 Subject: [PATCH 14/96] Add dynamic phenotype-base prioritization configs. --- .../components/panel/variants/VariantGene.jsx | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/ui/shared/components/panel/variants/VariantGene.jsx b/ui/shared/components/panel/variants/VariantGene.jsx index 29eb41f603..089a353484 100644 --- a/ui/shared/components/panel/variants/VariantGene.jsx +++ b/ui/shared/components/panel/variants/VariantGene.jsx @@ -402,24 +402,6 @@ const GENE_DETAIL_SECTIONS = [ sampleGeneDetailsDisplay(gene.geneId, rnaSeqData, 'rnaSeqData') ), }, - { - color: 'orange', - description: 'Phenotype Prioritization', - label: 'LIRICAL', - showDetails: (gene, { phePriData }) => phePriData && phePriData.lirical && phePriData.lirical[gene.geneId], - detailsDisplay: (gene, { phePriData }) => ( - sampleGeneDetailsDisplay(gene.geneId, phePriData.lirical) - ), - }, - { - color: 'orange', - description: 'Phenotype Prioritization', - label: 'Exomiser', - showDetails: (gene, { phePriData }) => phePriData && phePriData.exomiser && phePriData.exomiser[gene.geneId], - detailsDisplay: (gene, { phePriData }) => ( - sampleGeneDetailsDisplay(gene.geneId, phePriData.exomiser) - ), - }, ] const OmimSegments = styled(Segment.Group).attrs({ size: 'tiny', horizontal: true, compact: true })` @@ -471,10 +453,28 @@ const getDetailSections = (configs, gene, compact, labelProps, sampleGeneData) = ) )) +const addPhenotypePrioritizationConfig = (configs, phePriInfo) => ( + phePriInfo ? [ + ...configs, + ...Object.keys(phePriInfo).map(tool => ( + { + color: 'orange', + description: 'Phenotype Prioritization', + label: tool.toUpper(), + showDetails: (gene, { phePriData }) => phePriData && phePriData[tool] && phePriData[tool][gene.geneId], + detailsDisplay: (gene, { phePriData }) => ( + sampleGeneDetailsDisplay(gene.geneId, phePriData[tool]) + ), + } + )), + ] : configs +) + export const GeneDetails = React.memo(( { gene, compact, showLocusLists, showInlineDetails, sampleGeneData, ...labelProps }, ) => { - const geneDetails = getDetailSections(GENE_DETAIL_SECTIONS, gene, compact, labelProps, sampleGeneData) + const geneDetailConfigs = addPhenotypePrioritizationConfig(GENE_DETAIL_SECTIONS, sampleGeneData.phePriData) + const geneDetails = getDetailSections(geneDetailConfigs, gene, compact, labelProps, sampleGeneData) const geneDiseaseDetails = getDetailSections(GENE_DISEASE_DETAIL_SECTIONS, gene, compact, labelProps) const hasLocusLists = showLocusLists && gene.locusListGuids.length > 0 const showDivider = !showInlineDetails && geneDetails.length > 0 && (hasLocusLists || geneDiseaseDetails.length > 0) From 419549c95fb1399b7f18ca48fc4329d1e28b731c Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Tue, 18 Oct 2022 09:32:59 -0400 Subject: [PATCH 15/96] Update display. 
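The _get_phenotype_prioritization change below pops the tool out of each record so the payload sent to the client is nested individualGuid -> tool -> geneId, and the VariantGene fix in the same commit only needs to upper-case the tool name for the label. A minimal standalone sketch of that grouping, not seqr code: the wrapper name and the final assert are illustrative, and the sample record is shaped like the LIRICAL fixtures added later in this series.

    from collections import defaultdict

    def group_phenotype_scores(data_dicts):
        # individualGuid -> tool -> geneId -> list of score records
        data_by_individual_gene = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
        for data in data_dicts:
            data_by_individual_gene[data.pop('individualGuid')][data.pop('tool')][data['geneId']].append(data)
        return data_by_individual_gene

    records = [{
        'individualGuid': 'I000001_na19675', 'tool': 'lirical', 'geneId': 'ENSG00000268903',
        'rank': 1, 'diseaseId': 'OMIM:618460', 'diseaseName': 'Khan-Khan-Katsanis syndrome',
        'scores': {'post_test_probability': 0, 'compositeLR': 0.066},
    }]
    grouped = group_phenotype_scores(records)
    assert grouped['I000001_na19675']['lirical']['ENSG00000268903'][0]['rank'] == 1
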
--- seqr/views/apis/data_manager_api.py | 2 +- seqr/views/utils/variant_utils.py | 2 +- .../components/panel/variants/VariantGene.jsx | 2 +- .../components/panel/variants/selectors.js | 18 +++++++++++------- ui/shared/utils/constants.js | 3 --- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index 8e8c610b5f..5023736ac1 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -427,7 +427,7 @@ def _load_phenotype_prioritization(file_path, user): deleted, _ = PhenotypePrioritization.bulk_delete(user, to_delete, parent='individual') _log_append_info(user, info, f'Deleted {deleted} existing {tool.upper()} records') - return all_records, info + return all_records, info, tool @data_manager_required diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index 5854b465a4..dcc93d1f80 100644 --- a/seqr/views/utils/variant_utils.py +++ b/seqr/views/utils/variant_utils.py @@ -140,7 +140,7 @@ def _get_phenotype_prioritization(gene_ids, families): ) for data in data_dicts: - data_by_individual_gene[data.pop('individualGuid')][data['tool']][data['geneId']].append(data) + data_by_individual_gene[data.pop('individualGuid')][data.pop('tool')][data['geneId']].append(data) return data_by_individual_gene diff --git a/ui/shared/components/panel/variants/VariantGene.jsx b/ui/shared/components/panel/variants/VariantGene.jsx index 089a353484..905911cde7 100644 --- a/ui/shared/components/panel/variants/VariantGene.jsx +++ b/ui/shared/components/panel/variants/VariantGene.jsx @@ -460,7 +460,7 @@ const addPhenotypePrioritizationConfig = (configs, phePriInfo) => ( { color: 'orange', description: 'Phenotype Prioritization', - label: tool.toUpper(), + label: tool.toUpperCase(), showDetails: (gene, { phePriData }) => phePriData && phePriData[tool] && phePriData[tool][gene.geneId], detailsDisplay: (gene, { phePriData }) => ( sampleGeneDetailsDisplay(gene.geneId, phePriData[tool]) diff --git a/ui/shared/components/panel/variants/selectors.js b/ui/shared/components/panel/variants/selectors.js index d6d1abcff7..69403de525 100644 --- a/ui/shared/components/panel/variants/selectors.js +++ b/ui/shared/components/panel/variants/selectors.js @@ -17,6 +17,7 @@ import { getFamiliesByGuid, getProjectsByGuid, getIndividualsByGuid, getRnaSeqDataByIndividual, getPhePriDataByIndividual, } from 'redux/selectors' +const RNA_SEQ_SCORE_FIELDS = ['zScore', 'pValue', 'pAdjust'] export const getRnaSeqOutilerDataByFamilyGene = createSelector( getIndividualsByGuid, getRnaSeqDataByIndividual, @@ -24,14 +25,17 @@ export const getRnaSeqOutilerDataByFamilyGene = createSelector( (acc, [individualGuid, rnaSeqData]) => { const { familyGuid, displayName } = individualsByGuid[individualGuid] acc[familyGuid] = Object.entries(rnaSeqData.outliers || {}).reduce( - (acc2, [geneId, data]) => { - const { zScore, pValue, pAdjust } = data - return (data.isSignificant ? { + (acc2, [geneId, data]) => (data.isSignificant ? 
+ { ...acc2, - [geneId]: { ...(acc2[geneId] || {}), [displayName]: [{ scores: { zScore, pValue, pAdjust } }] }, - } : acc2) - }, - acc[familyGuid] || {}, + [geneId]: { + ...(acc2[geneId] || {}), + [displayName]: [{ + scores: RNA_SEQ_SCORE_FIELDS.reduce((scoreAcc, score) => ({ ...scoreAcc, [score]: data[score] }), {}), + }], + }, + } : acc2 + ), acc[familyGuid] || {}, ) return acc }, {}, diff --git a/ui/shared/utils/constants.js b/ui/shared/utils/constants.js index f9538f53a0..5409cf63ab 100644 --- a/ui/shared/utils/constants.js +++ b/ui/shared/utils/constants.js @@ -1311,9 +1311,6 @@ const VARIANT_ICON_COLORS = { green: '#21a926', } -export const LIRICAL = 'L' -export const EXOMISER = 'E' - export const PANEL_APP_CONFIDENCE_DESCRIPTION = { 0: 'No Panel App confidence level', 1: 'Red, lowest level of confidence; 1 of the 4 sources or from other sources.', From 0b6cdd891f08f8404add3b95005f3ef1093ce8c2 Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Tue, 18 Oct 2022 09:34:31 -0400 Subject: [PATCH 16/96] Add a return value. --- seqr/views/apis/data_manager_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index 8e8c610b5f..5023736ac1 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -427,7 +427,7 @@ def _load_phenotype_prioritization(file_path, user): deleted, _ = PhenotypePrioritization.bulk_delete(user, to_delete, parent='individual') _log_append_info(user, info, f'Deleted {deleted} existing {tool.upper()} records') - return all_records, info + return all_records, info, tool @data_manager_required From 260b0c7348abdd395e9356357db615d0b25cd127 Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Tue, 18 Oct 2022 10:03:47 -0400 Subject: [PATCH 17/96] Temporarily make the tests happy. 
--- seqr/views/apis/saved_variant_api_tests.py | 2 +- seqr/views/apis/summary_data_api_tests.py | 2 +- seqr/views/apis/variant_search_api_tests.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/seqr/views/apis/saved_variant_api_tests.py b/seqr/views/apis/saved_variant_api_tests.py index 1cc28bc634..e2202cf4c1 100644 --- a/seqr/views/apis/saved_variant_api_tests.py +++ b/seqr/views/apis/saved_variant_api_tests.py @@ -27,7 +27,7 @@ SAVED_VARIANT_RESPONSE_KEYS = { 'variantTagsByGuid', 'variantNotesByGuid', 'variantFunctionalDataByGuid', 'savedVariantsByGuid', - 'genesById', 'locusListsByGuid', 'rnaSeqData', 'mmeSubmissionsByGuid', + 'genesById', 'locusListsByGuid', 'rnaSeqData', 'mmeSubmissionsByGuid', 'phePriData', } COMPOUND_HET_3_JSON = { diff --git a/seqr/views/apis/summary_data_api_tests.py b/seqr/views/apis/summary_data_api_tests.py index b02346fdd5..b622e62149 100644 --- a/seqr/views/apis/summary_data_api_tests.py +++ b/seqr/views/apis/summary_data_api_tests.py @@ -25,7 +25,7 @@ SAVED_VARIANT_RESPONSE_KEYS = { 'projectsByGuid', 'locusListsByGuid', 'savedVariantsByGuid', 'variantFunctionalDataByGuid', 'genesById', 'variantNotesByGuid', 'individualsByGuid', 'variantTagsByGuid', 'familiesByGuid', 'familyNotesByGuid', - 'mmeSubmissionsByGuid', + 'mmeSubmissionsByGuid', 'phePriData', } diff --git a/seqr/views/apis/variant_search_api_tests.py b/seqr/views/apis/variant_search_api_tests.py index 2f2fa74e8a..f4276973a9 100644 --- a/seqr/views/apis/variant_search_api_tests.py +++ b/seqr/views/apis/variant_search_api_tests.py @@ -82,6 +82,7 @@ }, 'locusListsByGuid': {LOCUS_LIST_GUID: {'intervals': mock.ANY}}, 'rnaSeqData': {'I000001_na19675': {'outliers': {'ENSG00000268903': mock.ANY}}}, + 'phePriData': {}, 'mmeSubmissionsByGuid': {'MS000001_na19675': {k: mock.ANY for k in MATCHMAKER_SUBMISSION_FIELDS}}, } From 4ceeaa659087fb8b3cbf5e55a5c9426435d5241d Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Tue, 18 Oct 2022 11:16:32 -0400 Subject: [PATCH 18/96] Fix JS test failures. --- ui/shared/components/panel/variants/selectors.js | 6 ++++-- ui/shared/components/panel/variants/selectors.test.js | 8 ++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/ui/shared/components/panel/variants/selectors.js b/ui/shared/components/panel/variants/selectors.js index 69403de525..d125d2acd0 100644 --- a/ui/shared/components/panel/variants/selectors.js +++ b/ui/shared/components/panel/variants/selectors.js @@ -31,7 +31,9 @@ export const getRnaSeqOutilerDataByFamilyGene = createSelector( [geneId]: { ...(acc2[geneId] || {}), [displayName]: [{ - scores: RNA_SEQ_SCORE_FIELDS.reduce((scoreAcc, score) => ({ ...scoreAcc, [score]: data[score] }), {}), + scores: RNA_SEQ_SCORE_FIELDS.reduce( + (sAcc, score) => (data[score] ? 
{ ...sAcc, [score]: data[score] } : sAcc), {}, + ), }], }, } : acc2 @@ -45,7 +47,7 @@ export const getRnaSeqOutilerDataByFamilyGene = createSelector( export const getPhePriDataByFamilyGene = createSelector( getIndividualsByGuid, getPhePriDataByIndividual, - (individualsByGuid, phePriDataByIndividual) => Object.entries(phePriDataByIndividual).reduce( + (individualsByGuid, phePriDataByIndividual) => Object.entries(phePriDataByIndividual || {}).reduce( (acc, [individualGuid, phePriData]) => { const { familyGuid, displayName } = individualsByGuid[individualGuid] acc[familyGuid] = Object.entries(phePriData).reduce((accTool, [tool, toolData]) => ({ diff --git a/ui/shared/components/panel/variants/selectors.test.js b/ui/shared/components/panel/variants/selectors.test.js index 628604d3a4..6d7ff7ce69 100644 --- a/ui/shared/components/panel/variants/selectors.test.js +++ b/ui/shared/components/panel/variants/selectors.test.js @@ -96,15 +96,15 @@ test('getRnaSeqOutilerDataByFamilyGene', () => { expect(getRnaSeqOutilerDataByFamilyGene(RNA_SEQ_STATE)).toEqual({ F011652_1: { ENSG00000228198: { - NA19678: { isSignificant: true, pValue: 0.0004 }, - NA19679_1: { isSignificant: true, pValue: 0.01 }, + NA19678: [{ scores: { pValue: 0.0004 } }], + NA19679_1: [{ scores: { pValue: 0.01 } }], }, ENSG00000164458: { - NA19678: { isSignificant: true, pValue: 0.0073 }, + NA19678: [{ scores: { pValue: 0.0073 } }], }, }, F011652_2: { - ENSG00000228198: { NA19678_2: { isSignificant: true, pValue: 0.0214 } }, + ENSG00000228198: { NA19678_2: [{ scores: { pValue: 0.0214 } }] }, }, }) }) From 9c564e4a367462c5c69b49407928066d5f156296 Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Wed, 19 Oct 2022 11:57:06 -0400 Subject: [PATCH 19/96] Add preliminary tests and update logging texts. 
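For reference, the new tests feed load_phenotype_prioritization_data tab-separated rows in the layout below: the descriptive columns plus scoreName1/score1 are required, and further scoreName/score pairs are optional. This is an illustrative parse only, not a test from this commit; the row values are copied from the LIRICAL fixtures, and the real parser accepts more score pairs than the three shown here.

    header = ['tool', 'project', 'sampleId', 'rank', 'geneId', 'diseaseId', 'diseaseName',
              'scoreName1', 'score1', 'scoreName2', 'score2', 'scoreName3', 'score3']
    row = ['lirical', '1kg project nåme with uniçøde', 'NA19678', '1', 'ENSG00000105357',
           'OMIM:618460', 'Khan-Khan-Katsanis syndrome', 'post_test_probability', '0',
           'compositeLR', '0.066']
    tsv_line = '\t'.join(row)  # trailing optional columns may simply be omitted

    record = dict(zip(header, tsv_line.split('\t')))
    scores = {}
    for i in range(1, 4):
        score_name = record.get(f'scoreName{i}')
        if not score_name:
            break
        scores[score_name] = float(record[f'score{i}'])
    assert scores == {'post_test_probability': 0.0, 'compositeLR': 0.066}
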
--- seqr/views/apis/data_manager_api.py | 2 +- seqr/views/apis/data_manager_api_tests.py | 50 ++++++++++++++++++++++- seqr/views/utils/dataset_utils.py | 10 ++--- 3 files changed, 55 insertions(+), 7 deletions(-) diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index 5023736ac1..777791d52d 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -411,7 +411,7 @@ def _load_phenotype_prioritization(file_path, user): missing_individuals = set(records_by_sample.keys()) - set(existing_indivs_by_id.keys()) if missing_individuals: - raise ValueError(f'Individual {", ".join(list(missing_individuals))} doesn\'t exist') + raise ValueError(f'Can\'t find individuals {", ".join(sorted(list(missing_individuals)))}') for sample_id, records in records_by_sample.items(): for rec in records: rec['individual'] = existing_indivs_by_id[sample_id] diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py index 9315a71b3a..e673f20b8a 100644 --- a/seqr/views/apis/data_manager_api_tests.py +++ b/seqr/views/apis/data_manager_api_tests.py @@ -6,7 +6,7 @@ import responses from seqr.views.apis.data_manager_api import elasticsearch_status, upload_qc_pipeline_output, delete_index, \ - update_rna_seq, load_rna_seq_sample_data + update_rna_seq, load_rna_seq_sample_data, load_phenotype_prioritization_data from seqr.views.utils.orm_to_json_utils import get_json_for_rna_seq_outliers from seqr.views.utils.test_utils import AuthenticationTestCase, urllib3_responses from seqr.models import Individual, RnaSeqOutlier, RnaSeqTpm, Sample @@ -281,6 +281,21 @@ RNA_TPM_SAMPLE_DATA = [f'{RNA_SAMPLE_GUID}\t\t{json.dumps(SAMPLE_GENE_TPM_DATA)}\n'] RNA_FILENAME_TEMPLATE = 'rna_sample_data__{}__2020-04-15T00:00:00.json.gz' +PHENOTYPE_PRIORITIZATION_HEADER = ['tool\tproject\tsampleId\trank\tgeneId\tdiseaseId\tdiseaseName\tscoreName1\tscore1\tscoreName2\tscore2\tscoreName3\tscore3'] +PHENOTYPE_PRIORITIZATION_MISS_HEADER = ['tool\tsampleId\trank\tgeneId\tdiseaseName\tscoreName1\tscore1\tscoreName2\tscore2\tscoreName3\tscore3'] +LIRICAL_NO_PROJECT_DATA = ['lirical'] +LIRICAL_NO_EXIST_INDV_DATA = [ + 'lirical\tCMG_Beggs_WGS\tNA19678\t1\tENSG00000105357\tOMIM:618460\tKhan-Khan-Katsanis syndrome\tpost_test_probability\t0\tcompositeLR\t0.066', + 'lirical\tCMG_Beggs_WGS\tNA19679\t1\tENSG00000105357\tOMIM:618460\tKhan-Khan-Katsanis syndrome\tpost_test_probability\t0\tcompositeLR\t0.066', +] +LIRICAL_DATA = [ + 'lirical\t1kg project nåme with uniçøde\tNA19678\t1\tENSG00000105357\tOMIM:618460\tKhan-Khan-Katsanis syndrome\tpost_test_probability\t0\tcompositeLR\t0.066', + 'lirical\t1kg project nåme with uniçøde\tNA19678\t2\tENSG00000105357\tOMIM:219800\t"Cystinosis, nephropathic"\tpost_test_probability\t0\tcompositeLR\t0.003\t\t', +] +EXOMISER_DATA = [ + 'exomiser\tCMG_Beggs_WGS\tBEG_1230-1_01\t1\tENSG00000105357\tORPHA:2131\tAlternating hemiplegia of childhood\texomiser_score\t0.977923765\tphenotype_score\t0.603998205\tvariant_score\t1', + 'exomiser\tCMG_Beggs_WGS\tBEG_1230-1_01\t3\tENSG00000105357\tORPHA:71517\tRapid-onset dystonia-parkinsonism\texomiser_score\t0.977923765\tphenotype_score\t0.551578222\tvariant_score\t1' +] class DataManagerAPITest(AuthenticationTestCase): fixtures = ['users', '1kg_project', 'reference_data'] @@ -754,3 +769,36 @@ def test_load_rna_seq_sample_data(self, mock_model_logger, mock_logger, mock_ope ) self.assertListEqual(params['get_models_json'](models), params['expected_models_json']) + + 
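+    # The phenotype prioritization loading test below walks through the main failure
+    # modes before the happy path: a header missing required columns, a row without
+    # sample ID/project values, two different tools mixed in one file, sample IDs with
+    # no matching individuals, and finally a successful LIRICAL load.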
@mock.patch('seqr.views.utils.dataset_utils.file_iter') + @mock.patch('seqr.views.apis.data_manager_api.logger') + @mock.patch('seqr.models.logger') + def test_load_phenotype_prioritization_data(self, mock_model_logger, mock_logger, mock_file_iter): + url = reverse(load_phenotype_prioritization_data) + self.check_data_manager_login(url) + + mock_file_iter.return_value = iter(PHENOTYPE_PRIORITIZATION_MISS_HEADER) + response = self.client.post(url, content_type='application/json', data=json.dumps({'file': 'lirical_data.tsv.gz'})) + self.assertEqual(response.status_code, 500) + self.assertEqual(response.json()['error'], 'Invalid file: missing column(s) project, diseaseId') + mock_logger.info.assert_called_with('Loading phenotype-based prioritization data from lirical_data.tsv.gz', self.data_manager_user) + mock_file_iter.assert_called_with('lirical_data.tsv.gz') + + mock_file_iter.return_value = iter(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_NO_PROJECT_DATA) + response = self.client.post(url, content_type='application/json', data=json.dumps({'file': 'lirical_data.tsv.gz'})) + self.assertEqual(response.status_code, 500) + self.assertEqual(response.json()['error'], 'Both sample ID and project fields are required.') + + mock_file_iter.return_value = iter(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_DATA + EXOMISER_DATA) + response = self.client.post(url, content_type='application/json', data=json.dumps({'file': 'lirical_data.tsv.gz'})) + self.assertEqual(response.status_code, 500) + self.assertEqual(response.json()['error'], 'Multiple tools found lirical and exomiser. Only one in a file is supported.') + + mock_file_iter.return_value = iter(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_NO_EXIST_INDV_DATA) + response = self.client.post(url, content_type='application/json', data=json.dumps({'file': 'lirical_data.tsv.gz'})) + self.assertEqual(response.status_code, 500) + self.assertEqual(response.json()['error'], 'Can\'t find individuals NA19678, NA19679') + + mock_file_iter.return_value = iter(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_DATA) + response = self.client.post(url, content_type='application/json', data=json.dumps({'file': 'lirical_data.tsv.gz'})) + self.assertEqual(response.status_code, 200) diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 8134c821cb..fbaae56b18 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -453,7 +453,7 @@ def _load_rna_seq(model_cls, file_path, user, mapping_file, ignore_extra_samples def _parse_phenotype_pri_row(row): - record = {_to_snake_case(key): row[key] for key in PHENOTYPE_PRIORITIZATION_HEADER} + record = {_to_snake_case(key): row.get(key) for key in PHENOTYPE_PRIORITIZATION_HEADER} scores = {} for i in range(1, MAX_SCORES): @@ -463,7 +463,7 @@ def _parse_phenotype_pri_row(row): scores[score_name] = float(row[f'score{i}']) record['scores'] = scores - yield record['sample_id'], record + yield record def load_phenotype_prioritization_data_file(file_path): @@ -477,14 +477,14 @@ def load_phenotype_prioritization_data_file(file_path): tool = None for line in tqdm(f, unit=' rows'): row = dict(zip(header, _parse_tsv_row(line))) - for sample_id, row_dict in _parse_phenotype_pri_row(row): - row_dict.pop('sample_id') + for row_dict in _parse_phenotype_pri_row(row): + sample_id = row_dict.pop('sample_id', None) project = row_dict.pop('project', None) if not sample_id or not project: raise ValueError('Both sample ID and project fields are required.') 
data_by_project_sample_id[project][sample_id].append(row_dict) if tool and tool != row_dict['tool']: - raise ValueError(f'Multiple tools found {tool} and {row_dict["tool"]}. Only one is supported.') + raise ValueError(f'Multiple tools found {tool} and {row_dict["tool"]}. Only one in a file is supported.') if not tool: tool = row_dict['tool'] From 43727b4068f8da2f8645767bc85fda7734d39f45 Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Thu, 20 Oct 2022 16:23:41 -0400 Subject: [PATCH 20/96] Update per review comments. --- seqr/models.py | 14 +-- seqr/utils/logging_utils.py | 18 ++- seqr/views/apis/data_manager_api.py | 61 +++++----- seqr/views/apis/data_manager_api_tests.py | 141 +++++++++++++++++----- seqr/views/utils/dataset_utils.py | 4 +- seqr/views/utils/permissions_utils.py | 6 + 6 files changed, 172 insertions(+), 72 deletions(-) diff --git a/seqr/models.py b/seqr/models.py index 4658c5002a..b1f085c9b6 100644 --- a/seqr/models.py +++ b/seqr/models.py @@ -7,13 +7,13 @@ from django.contrib.postgres.fields import ArrayField from django.core.exceptions import PermissionDenied from django.db import models -from django.db.models import base, options, ForeignKey, JSONField, prefetch_related_objects +from django.db.models import base, options, ForeignKey, JSONField from django.utils import timezone from django.utils.text import slugify as __slugify from guardian.shortcuts import assign_perm -from seqr.utils.logging_utils import log_model_update, log_model_bulk_update, SeqrLogger +from seqr.utils.logging_utils import log_model_update, log_model_bulk_update, SeqrLogger, log_model_no_guid_bulk_update from seqr.utils.xpos_utils import get_chrom_pos from seqr.views.utils.terra_api_utils import anvil_enabled from reference_data.models import GENOME_VERSION_GRCh37, GENOME_VERSION_CHOICES @@ -1014,22 +1014,20 @@ def _compute_guid(self): class BulkOperationBase: @classmethod - def bulk_create(cls, user, new_models, parent=None): + def bulk_create(cls, user, new_models): """Helper bulk create method that logs the creation""" for model in new_models: model.created_by = user models = cls.objects.bulk_create(new_models) - log_model_bulk_update(logger, models, user, 'create', parent=parent) + log_model_no_guid_bulk_update(logger, models, user, 'create') return models @classmethod - def bulk_delete(cls, user, queryset=None, parent=None, **filter_kwargs): + def bulk_delete(cls, user, queryset=None, **filter_kwargs): """Helper bulk delete method that logs the deletion""" if queryset is None: queryset = cls.objects.filter(**filter_kwargs) - if parent: - prefetch_related_objects(queryset, parent) - log_model_bulk_update(logger, queryset, user, 'delete', parent=parent) + log_model_no_guid_bulk_update(logger, queryset, user, 'delete') return queryset.delete() class Meta: diff --git a/seqr/utils/logging_utils.py b/seqr/utils/logging_utils.py index eb205af86d..b696887a5c 100644 --- a/seqr/utils/logging_utils.py +++ b/seqr/utils/logging_utils.py @@ -73,18 +73,26 @@ def log_model_update(logger, model, user, update_type, update_fields=None): logger.info('{} {} {}'.format(update_type, db_entity, entity_id), user, db_update=db_update) -def log_model_bulk_update(logger, models, user, update_type, update_fields=None, parent=None): +def log_model_bulk_update(logger, models, user, update_type, update_fields=None): if not models: return [] db_entity = type(models[0]).__name__ - entity_ids = [o.guid if hasattr(o, 'guid') else o.id for o in models] + entity_ids = [o.guid for o in models] db_update = { 'dbEntity': db_entity, 
'entityIds': entity_ids, 'updateType': 'bulk_{}'.format(update_type), } - if parent: - db_update['parentEntityIds'] = list({getattr(model, parent).guid for model in models}) if update_fields: db_update['updateFields'] = list(update_fields) logger.info( '{} {} {}s'.format(update_type, len(entity_ids), db_entity), user, db_update=db_update) - return entity_ids \ No newline at end of file + return entity_ids + + +def log_model_no_guid_bulk_update(logger, models, user, update_type): + if not models: + return [] + db_entity = type(models[0]).__name__ + db_update = { + 'dbEntity': db_entity, 'numEntities': len(models), 'updateType': 'bulk_{}'.format(update_type), + } + logger.info(f'{update_type} {db_entity}s', user, db_update=db_update) diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index 777791d52d..2a0366277b 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -21,9 +21,9 @@ from seqr.views.utils.dataset_utils import load_rna_seq_outlier, load_rna_seq_tpm, load_phenotype_prioritization_data_file from seqr.views.utils.file_utils import parse_file, get_temp_upload_directory, load_uploaded_file from seqr.views.utils.json_utils import create_json_response, _to_camel_case -from seqr.views.utils.permissions_utils import data_manager_required +from seqr.views.utils.permissions_utils import data_manager_required, is_internal_project -from seqr.models import Sample, Individual, RnaSeqOutlier, RnaSeqTpm, PhenotypePrioritization +from seqr.models import Sample, Individual, RnaSeqOutlier, RnaSeqTpm, PhenotypePrioritization, Project from settings import KIBANA_SERVER, KIBANA_ELASTICSEARCH_PASSWORD @@ -387,7 +387,7 @@ def load_rna_seq_sample_data(request, sample_guid): data_by_gene = json.loads(row.split('\t\t')[1]) model_cls = RNA_DATA_TYPE_CONFIGS[data_type]['model_class'] - model_cls.bulk_create(request.user, [model_cls(sample=sample, **data) for data in data_by_gene.values()], parent='sample') + model_cls.bulk_create(request.user, [model_cls(sample=sample, **data) for data in data_by_gene.values()]) return create_json_response({'success': True}) @@ -397,21 +397,37 @@ def _log_append_info(user, info, message): logger.info(message, user) -def _load_phenotype_prioritization(file_path, user): - tool, data_by_project_sample_id = load_phenotype_prioritization_data_file(file_path) +@data_manager_required +def load_phenotype_prioritization_data(request): + request_json = json.loads(request.body) + + file_path = request_json['file'] info = [] - _log_append_info(user, info, f'Parsed {tool.upper()} data for project(s): {", ".join(data_by_project_sample_id.keys())}') + _log_append_info(request.user, info, f'Loading phenotype-based prioritization data from {file_path}') + + try: + tool, data_by_project_sample_id = load_phenotype_prioritization_data_file(file_path) + except ValueError as e: + return create_json_response({'error': str(e)}, status=400) all_records = [] to_delete = None - for project, records_by_sample in data_by_project_sample_id.items(): - indivs = Individual.objects.filter(family__project__name=project, individual_id__in=records_by_sample.keys()) + error = None + for project_name, records_by_sample in data_by_project_sample_id.items(): + projects = [p for p in Project.objects.filter(name=project_name) if is_internal_project(p)] + if not projects or len(projects) > 1: + error = f'Project not found or multiple projects with the same name {project_name}' + break + _log_append_info(request.user, info, f'Parsed 
{tool.upper()} data for project: {project_name}') + + indivs = Individual.objects.filter(family__project=projects[0], individual_id__in=records_by_sample.keys()) existing_indivs_by_id = {ind.individual_id: ind for ind in indivs} missing_individuals = set(records_by_sample.keys()) - set(existing_indivs_by_id.keys()) if missing_individuals: - raise ValueError(f'Can\'t find individuals {", ".join(sorted(list(missing_individuals)))}') + error = f'Can\'t find individuals {", ".join(sorted(list(missing_individuals)))}' + break for sample_id, records in records_by_sample.items(): for rec in records: rec['individual'] = existing_indivs_by_id[sample_id] @@ -420,28 +436,19 @@ def _load_phenotype_prioritization(file_path, user): to_delete = to_delete | exist_records if to_delete else exist_records records = [rec for records in records_by_sample.values() for rec in records] - _log_append_info(user, info, f'Attempted loading {len(records)} records of {tool.upper()} data to project {project}') + _log_append_info(request.user, info, + f'Attempted loading {len(records)} records of {tool.upper()} data to project {project_name}') all_records += records - if to_delete: - deleted, _ = PhenotypePrioritization.bulk_delete(user, to_delete, parent='individual') - _log_append_info(user, info, f'Deleted {deleted} existing {tool.upper()} records') - - return all_records, info, tool - + if error: + return create_json_response({'error': error}, status=400) -@data_manager_required -def load_phenotype_prioritization_data(request): - request_json = json.loads(request.body) - - file_name = request_json['file'] - - logger.info(f'Loading phenotype-based prioritization data from {file_name}', request.user) - records, info, tool = _load_phenotype_prioritization(file_name, request.user) - models = PhenotypePrioritization.bulk_create(request.user, [PhenotypePrioritization(**data) for data in records], - parent='individual') + if to_delete: + deleted, _ = PhenotypePrioritization.bulk_delete(request.user, to_delete) + _log_append_info(request.user, info, f'Deleted {deleted} existing {tool.upper()} records') - info.append(f'Loaded {len(models)} {tool.upper()} data records') + models = PhenotypePrioritization.bulk_create(request.user, [PhenotypePrioritization(**data) for data in all_records]) + _log_append_info(request.user, info, f'Loaded {len(models)} {tool.upper()} data records') return create_json_response({ 'info': info, diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py index e673f20b8a..85e1fcca0d 100644 --- a/seqr/views/apis/data_manager_api_tests.py +++ b/seqr/views/apis/data_manager_api_tests.py @@ -8,8 +8,8 @@ from seqr.views.apis.data_manager_api import elasticsearch_status, upload_qc_pipeline_output, delete_index, \ update_rna_seq, load_rna_seq_sample_data, load_phenotype_prioritization_data from seqr.views.utils.orm_to_json_utils import get_json_for_rna_seq_outliers -from seqr.views.utils.test_utils import AuthenticationTestCase, urllib3_responses -from seqr.models import Individual, RnaSeqOutlier, RnaSeqTpm, Sample +from seqr.views.utils.test_utils import AuthenticationTestCase, urllib3_responses, AnvilAuthenticationTestCase +from seqr.models import Individual, RnaSeqOutlier, RnaSeqTpm, Sample, Project PROJECT_GUID = 'R0001_1kg' @@ -281,23 +281,38 @@ RNA_TPM_SAMPLE_DATA = [f'{RNA_SAMPLE_GUID}\t\t{json.dumps(SAMPLE_GENE_TPM_DATA)}\n'] RNA_FILENAME_TEMPLATE = 'rna_sample_data__{}__2020-04-15T00:00:00.json.gz' -PHENOTYPE_PRIORITIZATION_HEADER = 
['tool\tproject\tsampleId\trank\tgeneId\tdiseaseId\tdiseaseName\tscoreName1\tscore1\tscoreName2\tscore2\tscoreName3\tscore3'] -PHENOTYPE_PRIORITIZATION_MISS_HEADER = ['tool\tsampleId\trank\tgeneId\tdiseaseName\tscoreName1\tscore1\tscoreName2\tscore2\tscoreName3\tscore3'] -LIRICAL_NO_PROJECT_DATA = ['lirical'] +PHENOTYPE_PRIORITIZATION_HEADER = [['tool', 'project', 'sampleId', 'rank', 'geneId', 'diseaseId', 'diseaseName', + 'scoreName1', 'score1', 'scoreName2', 'score2', 'scoreName3', 'score3']] +PHENOTYPE_PRIORITIZATION_MISS_HEADER = [['tool', 'sampleId', 'rank', 'geneId', 'diseaseName', 'scoreName1', 'score1', + 'scoreName2', 'score2', 'scoreName3', 'score3']] +LIRICAL_NO_PROJECT_DATA = [['lirical']] +LIRICAL_PROJECT_NOT_EXIST_DATA = [ + ['lirical', 'CMG_Beggs_WGS', 'NA19678', '1', 'ENSG00000105357', 'OMIM:618460', 'Khan-Khan-Katsanis syndrome', + 'post_test_probability', '0', 'compositeLR', '0.066'], +] LIRICAL_NO_EXIST_INDV_DATA = [ - 'lirical\tCMG_Beggs_WGS\tNA19678\t1\tENSG00000105357\tOMIM:618460\tKhan-Khan-Katsanis syndrome\tpost_test_probability\t0\tcompositeLR\t0.066', - 'lirical\tCMG_Beggs_WGS\tNA19679\t1\tENSG00000105357\tOMIM:618460\tKhan-Khan-Katsanis syndrome\tpost_test_probability\t0\tcompositeLR\t0.066', + ['lirical', '1kg project nåme with uniçøde', 'NA19678x', '1', 'ENSG00000105357', 'OMIM:618460', + 'Khan-Khan-Katsanis syndrome', 'post_test_probability', '0', 'compositeLR', '0.066'], + ['lirical', '1kg project nåme with uniçøde', 'NA19679x', '1', 'ENSG00000105357', 'OMIM:618460', + 'Khan-Khan-Katsanis syndrome', 'post_test_probability', '0', 'compositeLR', '0.066'], ] LIRICAL_DATA = [ - 'lirical\t1kg project nåme with uniçøde\tNA19678\t1\tENSG00000105357\tOMIM:618460\tKhan-Khan-Katsanis syndrome\tpost_test_probability\t0\tcompositeLR\t0.066', - 'lirical\t1kg project nåme with uniçøde\tNA19678\t2\tENSG00000105357\tOMIM:219800\t"Cystinosis, nephropathic"\tpost_test_probability\t0\tcompositeLR\t0.003\t\t', + ['lirical', '1kg project nåme with uniçøde', 'NA19678', '1', 'ENSG00000105357', 'OMIM:618460', + 'Khan-Khan-Katsanis syndrome', 'post_test_probability', '0', 'compositeLR', '0.066'], + ['lirical', 'Test Reprocessed Project', 'NA20885', '2', 'ENSG00000105357', 'OMIM:219800', + '"Cystinosis, nephropathic"', 'post_test_probability', '0', 'compositeLR', '0.003', '', ''], ] EXOMISER_DATA = [ - 'exomiser\tCMG_Beggs_WGS\tBEG_1230-1_01\t1\tENSG00000105357\tORPHA:2131\tAlternating hemiplegia of childhood\texomiser_score\t0.977923765\tphenotype_score\t0.603998205\tvariant_score\t1', - 'exomiser\tCMG_Beggs_WGS\tBEG_1230-1_01\t3\tENSG00000105357\tORPHA:71517\tRapid-onset dystonia-parkinsonism\texomiser_score\t0.977923765\tphenotype_score\t0.551578222\tvariant_score\t1' + ['exomiser', 'CMG_Beggs_WGS', 'BEG_1230-1_01', '1', 'ENSG00000105357', 'ORPHA:2131', + 'Alternating hemiplegia of childhood', 'exomiser_score', '0.977923765', 'phenotype_score', '0.603998205', + 'variant_score', '1'], + ['exomiser', 'CMG_Beggs_WGS', 'BEG_1230-1_01', '3', 'ENSG00000105357', 'ORPHA:71517', + 'Rapid-onset dystonia-parkinsonism', 'exomiser_score', '0.977923765', 'phenotype_score', '0.551578222', + 'variant_score', '1'] ] -class DataManagerAPITest(AuthenticationTestCase): + +class DataManagerAPITest(object): fixtures = ['users', '1kg_project', 'reference_data'] @urllib3_responses.activate @@ -706,11 +721,9 @@ def mock_write(content): deleted_count = params.get('deleted_count', params['initial_model_count']) mock_logger.info.assert_has_calls([mock.call(info_log, self.data_manager_user) for info_log in 
info]) mock_model_logger.info.assert_called_with( - f'delete {deleted_count} {model_cls.__name__}s', self.data_manager_user, db_update={ - 'dbEntity': model_cls.__name__, 'entityIds': mock.ANY, 'parentEntityIds': mock.ANY, - 'updateType': 'bulk_delete', - }) - self.assertTrue(RNA_SAMPLE_GUID in mock_model_logger.info.call_args_list[1].kwargs['db_update']['parentEntityIds']) + f'delete {model_cls.__name__}s', self.data_manager_user, + db_update={'dbEntity': model_cls.__name__, 'numEntities': deleted_count, 'updateType': 'bulk_delete'} + ) mock_logger.warning.assert_has_calls([mock.call(warn_log, self.data_manager_user) for warn_log in warnings]) # test database models are correct @@ -762,14 +775,17 @@ def test_load_rna_seq_sample_data(self, mock_model_logger, mock_logger, mock_ope mock_logger.info.assert_called_with('Loading outlier data for NA19675_D2', self.data_manager_user) mock_model_logger.info.assert_called_with( - f'create 2 {model_cls.__name__}s', self.data_manager_user, db_update={ - 'dbEntity': model_cls.__name__, 'entityIds': mock.ANY, 'parentEntityIds': [RNA_SAMPLE_GUID], - 'updateType': 'bulk_create', + f'create {model_cls.__name__}s', self.data_manager_user, db_update={ + 'dbEntity': model_cls.__name__, 'numEntities': 2, 'updateType': 'bulk_create', } ) self.assertListEqual(params['get_models_json'](models), params['expected_models_json']) + @classmethod + def _join_data(cls, data): + return iter(['\t'.join(line) for line in data]) + @mock.patch('seqr.views.utils.dataset_utils.file_iter') @mock.patch('seqr.views.apis.data_manager_api.logger') @mock.patch('seqr.models.logger') @@ -777,28 +793,93 @@ def test_load_phenotype_prioritization_data(self, mock_model_logger, mock_logger url = reverse(load_phenotype_prioritization_data) self.check_data_manager_login(url) - mock_file_iter.return_value = iter(PHENOTYPE_PRIORITIZATION_MISS_HEADER) + mock_file_iter.return_value = self._join_data(PHENOTYPE_PRIORITIZATION_MISS_HEADER) response = self.client.post(url, content_type='application/json', data=json.dumps({'file': 'lirical_data.tsv.gz'})) - self.assertEqual(response.status_code, 500) + self.assertEqual(response.status_code, 400) self.assertEqual(response.json()['error'], 'Invalid file: missing column(s) project, diseaseId') mock_logger.info.assert_called_with('Loading phenotype-based prioritization data from lirical_data.tsv.gz', self.data_manager_user) mock_file_iter.assert_called_with('lirical_data.tsv.gz') - mock_file_iter.return_value = iter(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_NO_PROJECT_DATA) + mock_file_iter.return_value = self._join_data(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_NO_PROJECT_DATA) response = self.client.post(url, content_type='application/json', data=json.dumps({'file': 'lirical_data.tsv.gz'})) - self.assertEqual(response.status_code, 500) + self.assertEqual(response.status_code, 400) self.assertEqual(response.json()['error'], 'Both sample ID and project fields are required.') - mock_file_iter.return_value = iter(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_DATA + EXOMISER_DATA) + mock_file_iter.return_value = self._join_data(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_DATA + EXOMISER_DATA) response = self.client.post(url, content_type='application/json', data=json.dumps({'file': 'lirical_data.tsv.gz'})) - self.assertEqual(response.status_code, 500) + self.assertEqual(response.status_code, 400) self.assertEqual(response.json()['error'], 'Multiple tools found lirical and exomiser. 
Only one in a file is supported.') - mock_file_iter.return_value = iter(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_NO_EXIST_INDV_DATA) + mock_file_iter.return_value = self._join_data(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_PROJECT_NOT_EXIST_DATA) response = self.client.post(url, content_type='application/json', data=json.dumps({'file': 'lirical_data.tsv.gz'})) - self.assertEqual(response.status_code, 500) - self.assertEqual(response.json()['error'], 'Can\'t find individuals NA19678, NA19679') + self.assertEqual(response.status_code, 400) + self.assertEqual(response.json()['error'], 'Project not found or multiple projects with the same name CMG_Beggs_WGS') - mock_file_iter.return_value = iter(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_DATA) + project = Project.objects.get(name='Empty Project') + project.name = '1kg project nåme with uniçøde' + project.save() + mock_file_iter.return_value = self._join_data(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_DATA) + response = self.client.post(url, content_type='application/json', data=json.dumps({'file': 'lirical_data.tsv.gz'})) + self.assertEqual(response.status_code, 400) + self.assertEqual(response.json()['error'], 'Project not found or multiple projects with the same name 1kg project nåme with uniçøde') + project.name = 'Empty Project' + project.save() + + mock_logger.reset_mock() + mock_file_iter.return_value = self._join_data(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_NO_EXIST_INDV_DATA) + response = self.client.post(url, content_type='application/json', data=json.dumps({'file': 'lirical_data.tsv.gz'})) + self.assertEqual(response.status_code, 400) + self.assertEqual(response.json()['error'], 'Can\'t find individuals NA19678x, NA19679x') + info = [ + 'Loading phenotype-based prioritization data from lirical_data.tsv.gz', + 'Parsed LIRICAL data for project: 1kg project nåme with uniçøde' + ] + mock_logger.info.assert_has_calls([mock.call(info_log, self.data_manager_user) for info_log in info]) + mock_model_logger.info.assert_not_called() + + info = [ + 'Loading phenotype-based prioritization data from lirical_data.tsv.gz', + 'Parsed LIRICAL data for project: 1kg project nåme with uniçøde', + 'Attempted loading 1 records of LIRICAL data to project 1kg project nåme with uniçøde', + 'Parsed LIRICAL data for project: Test Reprocessed Project', + 'Attempted loading 1 records of LIRICAL data to project Test Reprocessed Project', + ] + + mock_logger.reset_mock() + mock_file_iter.return_value = self._join_data(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_DATA) + response = self.client.post(url, content_type='application/json', data=json.dumps({'file': 'lirical_data.tsv.gz'})) + self.assertEqual(response.status_code, 200) + add_only_info = info + ['Loaded 2 LIRICAL data records'] + self.assertEqual(response.json()['info'], add_only_info) + mock_logger.info.assert_has_calls([mock.call(info_log, self.data_manager_user) for info_log in add_only_info]) + db_update = {'dbEntity': 'PhenotypePrioritization', 'numEntities': 2, 'updateType': 'bulk_create'} + mock_model_logger.info.assert_called_with('create PhenotypePrioritizations', self.data_manager_user, db_update=db_update) + + mock_logger.reset_mock() + mock_model_logger.reset_mock() + mock_file_iter.return_value = self._join_data(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_DATA) response = self.client.post(url, content_type='application/json', data=json.dumps({'file': 'lirical_data.tsv.gz'})) self.assertEqual(response.status_code, 200) + info += ['Deleted 2 existing LIRICAL records', 'Loaded 2 LIRICAL data 
records'] + self.assertEqual(response.json()['info'], info) + mock_logger.info.assert_has_calls([mock.call(info_log, self.data_manager_user) for info_log in info]) + mock_model_logger.info.assert_has_calls([ + mock.call('delete PhenotypePrioritizations', self.data_manager_user, db_update={ + 'dbEntity': 'PhenotypePrioritization', 'numEntities': 2, 'updateType': 'bulk_delete', + }), + mock.call('create PhenotypePrioritizations', self.data_manager_user, db_update=db_update), + ]) + + +# Tests for AnVIL access disabled +class LocalDataManagerAPITest(AuthenticationTestCase, DataManagerAPITest): + fixtures = ['users', '1kg_project'] + + +# Test for permissions from AnVIL only +class AnvilDataManagerAPITest(AnvilAuthenticationTestCase, DataManagerAPITest): + fixtures = ['users', 'social_auth', '1kg_project'] + + @mock.patch('seqr.views.utils.permissions_utils.INTERNAL_NAMESPACES', ['my-seqr-billing']) + def test_load_phenotype_prioritization_data(self): + super(AnvilDataManagerAPITest, self).test_load_phenotype_prioritization_data() diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index fbaae56b18..4cfd0241c7 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -483,9 +483,9 @@ def load_phenotype_prioritization_data_file(file_path): if not sample_id or not project: raise ValueError('Both sample ID and project fields are required.') data_by_project_sample_id[project][sample_id].append(row_dict) - if tool and tool != row_dict['tool']: - raise ValueError(f'Multiple tools found {tool} and {row_dict["tool"]}. Only one in a file is supported.') if not tool: tool = row_dict['tool'] + elif tool != row_dict['tool']: + raise ValueError(f'Multiple tools found {tool} and {row_dict["tool"]}. Only one in a file is supported.') return tool, data_by_project_sample_id diff --git a/seqr/views/utils/permissions_utils.py b/seqr/views/utils/permissions_utils.py index d4ac50c02d..de84e8d472 100644 --- a/seqr/views/utils/permissions_utils.py +++ b/seqr/views/utils/permissions_utils.py @@ -103,6 +103,12 @@ def is_internal_anvil_project(project): return anvil_enabled() and project.workspace_namespace in INTERNAL_NAMESPACES +def is_internal_project(project): + if anvil_enabled(): + return project.workspace_namespace in INTERNAL_NAMESPACES + return True + + def get_internal_projects(): if anvil_enabled(): return Project.objects.filter(workspace_namespace__in=INTERNAL_NAMESPACES) From 25644cc015296b1e893a8157dfa74dd7271e732d Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Fri, 21 Oct 2022 10:18:05 -0400 Subject: [PATCH 21/96] Fix a codacy error. 
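The base test method picks up extra positional parameters from its @mock.patch decorators, so an override declared as just (self) presumably trips Codacy's arguments-differ check; accepting and forwarding *args keeps the signatures compatible. A standalone sketch of the pattern, with illustrative class names and patch targets rather than seqr code; note the extra patch on the subclass supplies an explicit replacement value, so it injects no mock argument of its own.

    from unittest import TestCase, mock

    class BaseApiTest(TestCase):
        @mock.patch('json.dumps')
        def test_load_data(self, mock_dumps):
            # the decorator injects mock_dumps as a positional argument
            mock_dumps.return_value = '{}'

    class AnvilApiTest(BaseApiTest):
        @mock.patch('json.loads', dict)  # explicit new value: nothing extra is injected
        def test_load_data(self, *args):
            super(AnvilApiTest, self).test_load_data(*args)
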
--- seqr/views/apis/data_manager_api_tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py index 85e1fcca0d..bee13b28fe 100644 --- a/seqr/views/apis/data_manager_api_tests.py +++ b/seqr/views/apis/data_manager_api_tests.py @@ -881,5 +881,5 @@ class AnvilDataManagerAPITest(AnvilAuthenticationTestCase, DataManagerAPITest): fixtures = ['users', 'social_auth', '1kg_project'] @mock.patch('seqr.views.utils.permissions_utils.INTERNAL_NAMESPACES', ['my-seqr-billing']) - def test_load_phenotype_prioritization_data(self): - super(AnvilDataManagerAPITest, self).test_load_phenotype_prioritization_data() + def test_load_phenotype_prioritization_data(self, *args): + super(AnvilDataManagerAPITest, self).test_load_phenotype_prioritization_data(*args) From 9351e7cd8006028007da62ee009530caf95813f7 Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Fri, 21 Oct 2022 15:29:42 -0400 Subject: [PATCH 22/96] Update the sampleGene selector. --- .../components/panel/variants/VariantGene.jsx | 7 +- .../components/panel/variants/selectors.js | 78 +++++++++---------- .../panel/variants/selectors.test.js | 25 +++--- 3 files changed, 55 insertions(+), 55 deletions(-) diff --git a/ui/shared/components/panel/variants/VariantGene.jsx b/ui/shared/components/panel/variants/VariantGene.jsx index 905911cde7..84caaf78cc 100644 --- a/ui/shared/components/panel/variants/VariantGene.jsx +++ b/ui/shared/components/panel/variants/VariantGene.jsx @@ -18,7 +18,7 @@ import { GeneSearchLink } from '../../buttons/SearchResultsLink' import ShowGeneModal from '../../buttons/ShowGeneModal' import Modal from '../../modal/Modal' import { GenCC, ClingenLabel } from '../genes/GeneDetail' -import { getRnaSeqOutilerDataByFamilyGene, getPhePriDataByFamilyGene } from './selectors' +import { getSampleGeneDataByFamilyGene } from './selectors' const RnaSeqTpm = React.lazy(() => import('./RnaSeqTpm')) @@ -623,10 +623,7 @@ BaseVariantGene.propTypes = { const getRnaSeqProps = (state, ownProps) => ({ hasRnaTpmData: getFamiliesByGuid(state)[ownProps.variant.familyGuids[0]]?.hasRnaTpmData, - sampleGeneData: { - rnaSeqData: getRnaSeqOutilerDataByFamilyGene(state)[ownProps.variant.familyGuids[0]], - phePriData: getPhePriDataByFamilyGene(state)[ownProps.variant.familyGuids[0]], - }, + sampleGeneData: getSampleGeneDataByFamilyGene(state)[ownProps.variant.familyGuids[0]] || {}, }) const mapStateToProps = (state, ownProps) => ({ diff --git a/ui/shared/components/panel/variants/selectors.js b/ui/shared/components/panel/variants/selectors.js index d125d2acd0..f741d671ac 100644 --- a/ui/shared/components/panel/variants/selectors.js +++ b/ui/shared/components/panel/variants/selectors.js @@ -18,48 +18,48 @@ import { } from 'redux/selectors' const RNA_SEQ_SCORE_FIELDS = ['zScore', 'pValue', 'pAdjust'] -export const getRnaSeqOutilerDataByFamilyGene = createSelector( +export const getSampleGeneDataByFamilyGene = createSelector( getIndividualsByGuid, getRnaSeqDataByIndividual, - (individualsByGuid, rnaSeqDataByIndividual) => Object.entries(rnaSeqDataByIndividual).reduce( - (acc, [individualGuid, rnaSeqData]) => { - const { familyGuid, displayName } = individualsByGuid[individualGuid] - acc[familyGuid] = Object.entries(rnaSeqData.outliers || {}).reduce( - (acc2, [geneId, data]) => (data.isSignificant ? - { - ...acc2, - [geneId]: { - ...(acc2[geneId] || {}), - [displayName]: [{ - scores: RNA_SEQ_SCORE_FIELDS.reduce( - (sAcc, score) => (data[score] ? 
{ ...sAcc, [score]: data[score] } : sAcc), {}, - ), - }], - }, - } : acc2 - ), acc[familyGuid] || {}, - ) - return acc - }, {}, - ), -) - -export const getPhePriDataByFamilyGene = createSelector( - getIndividualsByGuid, getPhePriDataByIndividual, - (individualsByGuid, phePriDataByIndividual) => Object.entries(phePriDataByIndividual || {}).reduce( - (acc, [individualGuid, phePriData]) => { - const { familyGuid, displayName } = individualsByGuid[individualGuid] - acc[familyGuid] = Object.entries(phePriData).reduce((accTool, [tool, toolData]) => ({ - ...accTool, - [tool]: Object.entries(toolData).reduce((acc2, [geneId, data]) => ({ - ...acc2, - [geneId]: { ...(acc2[geneId] || {}), [displayName]: data }, - }), {}), - }), acc[familyGuid] || {}) - return acc - }, {}, - ), + (individualsByGuid, rnaSeqDataByIndividual, phePriDataByIndividual) => { + const rnaSeqD = Object.entries(rnaSeqDataByIndividual).reduce( + (acc, [individualGuid, rnaSeqData]) => { + const { familyGuid, displayName } = individualsByGuid[individualGuid] + acc[familyGuid] = acc[familyGuid] || {} + acc[familyGuid].rnaSeqData = Object.entries(rnaSeqData.outliers || {}).reduce( + (acc2, [geneId, data]) => (data.isSignificant ? + { + ...acc2, + [geneId]: { + ...(acc2[geneId] || {}), + [displayName]: [{ + scores: RNA_SEQ_SCORE_FIELDS.reduce( + (sAcc, score) => (data[score] ? { ...sAcc, [score]: data[score] } : sAcc), {}, + ), + }], + }, + } : acc2 + ), acc[familyGuid].rnaSeqData || {}, + ) + return acc + }, {}, + ) + return Object.entries(phePriDataByIndividual || {}).reduce( + (acc, [individualGuid, phePriData]) => { + const { familyGuid, displayName } = individualsByGuid[individualGuid] + acc[familyGuid] = acc[familyGuid] || {} + acc[familyGuid].phePriData = Object.entries(phePriData).reduce((accTool, [tool, toolData]) => ({ + ...accTool, + [tool]: Object.entries(toolData).reduce((acc2, [geneId, data]) => ({ + ...acc2, + [geneId]: { ...(acc2[geneId] || {}), [displayName]: data }, + }), {}), + }), acc[familyGuid].phePriData || {}) + return acc + }, rnaSeqD, + ) + }, ) // Saved variant selectors diff --git a/ui/shared/components/panel/variants/selectors.test.js b/ui/shared/components/panel/variants/selectors.test.js index 6d7ff7ce69..7ca77945dd 100644 --- a/ui/shared/components/panel/variants/selectors.test.js +++ b/ui/shared/components/panel/variants/selectors.test.js @@ -5,7 +5,7 @@ import { getPairedSelectedSavedVariants, getVisibleSortedSavedVariants, getPairedFilteredSavedVariants, - getRnaSeqOutilerDataByFamilyGene, + getSampleGeneDataByFamilyGene, } from './selectors' test('getPairedSelectedSavedVariants', () => { @@ -92,20 +92,23 @@ const RNA_SEQ_STATE = { ...STATE_WITH_2_FAMILIES, } -test('getRnaSeqOutilerDataByFamilyGene', () => { - expect(getRnaSeqOutilerDataByFamilyGene(RNA_SEQ_STATE)).toEqual({ +test('getSampleGeneDataByFamilyGene', () => { + expect(getSampleGeneDataByFamilyGene(RNA_SEQ_STATE)).toEqual({ F011652_1: { - ENSG00000228198: { - NA19678: [{ scores: { pValue: 0.0004 } }], - NA19679_1: [{ scores: { pValue: 0.01 } }], - }, - ENSG00000164458: { - NA19678: [{ scores: { pValue: 0.0073 } }], + rnaSeqData: { + ENSG00000228198: { + NA19678: [{ scores: { pValue: 0.0004 } }], + NA19679_1: [{ scores: { pValue: 0.01 } }], + }, + ENSG00000164458: { + NA19678: [{ scores: { pValue: 0.0073 } }], + }, }, }, F011652_2: { - ENSG00000228198: { NA19678_2: [{ scores: { pValue: 0.0214 } }] }, + rnaSeqData: { + ENSG00000228198: { NA19678_2: [{ scores: { pValue: 0.0214 } }] }, + }, }, }) }) - From 
1cb52b47c24f19edf993f79aad6b6c74c6660597 Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Tue, 25 Oct 2022 10:31:19 -0400 Subject: [PATCH 23/96] Add backend tests. --- seqr/fixtures/1kg_project.json | 32 +++++++++++++++++++++ seqr/views/apis/variant_search_api_tests.py | 10 ++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/seqr/fixtures/1kg_project.json b/seqr/fixtures/1kg_project.json index 2960130ef9..f17418353a 100644 --- a/seqr/fixtures/1kg_project.json +++ b/seqr/fixtures/1kg_project.json @@ -1287,6 +1287,38 @@ "tpm": 1.01 } }, +{ + "model": "seqr.phenotypeprioritization", + "pk": 1, + "fields": { + "individual": 1, + "gene_id": "ENSG00000268903", + "tool": "lirical", + "rank": 1, + "disease_id": "OMIM:618460", + "disease_name": "Khan-Khan-Katsanis syndrome", + "scores": { + "post_test_probability": 0, + "compositeLR": 0.066 + } + } +}, +{ + "model": "seqr.phenotypeprioritization", + "pk": 2, + "fields": { + "individual": 1, + "gene_id": "ENSG00000268903", + "tool": "lirical", + "rank": 2, + "disease_id": "OMIM:219800", + "disease_name": "Cystinosis, nephropathic", + "scores": { + "post_test_probability": 0, + "compositeLR": 0.003 + } + } +}, { "model": "seqr.igvsample", "pk": 145, diff --git a/seqr/views/apis/variant_search_api_tests.py b/seqr/views/apis/variant_search_api_tests.py index f4276973a9..958888d628 100644 --- a/seqr/views/apis/variant_search_api_tests.py +++ b/seqr/views/apis/variant_search_api_tests.py @@ -82,7 +82,14 @@ }, 'locusListsByGuid': {LOCUS_LIST_GUID: {'intervals': mock.ANY}}, 'rnaSeqData': {'I000001_na19675': {'outliers': {'ENSG00000268903': mock.ANY}}}, - 'phePriData': {}, + 'phePriData': {'I000001_na19675': { + 'lirical': {'ENSG00000268903': [ + {'diseaseId': 'OMIM:618460', 'diseaseName': 'Khan-Khan-Katsanis syndrome', 'geneId': 'ENSG00000268903', + 'rank': 1, 'scores': {'compositeLR': 0.066, 'post_test_probability': 0}}, + {'diseaseId': 'OMIM:219800', 'diseaseName': 'Cystinosis, nephropathic', 'geneId': 'ENSG00000268903', + 'rank': 2, 'scores': {'compositeLR': 0.003, 'post_test_probability': 0}} + ]} + }}, 'mmeSubmissionsByGuid': {'MS000001_na19675': {k: mock.ANY for k in MATCHMAKER_SUBMISSION_FIELDS}}, } @@ -394,6 +401,7 @@ def test_query_variants(self, mock_get_variants, mock_get_gene_counts, mock_erro 'VT1726970_2103343353_r0004_tes': EXPECTED_TAG, 'VT1726945_2103343353_r0390_100': EXPECTED_TAG, }, 'variantFunctionalDataByGuid': {}, + 'phePriData': {}, 'rnaSeqData': {}, 'mmeSubmissionsByGuid': {}, }) From bc1babaed2845b33aa29c0feb8dda210af6c7a7b Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Tue, 25 Oct 2022 11:14:03 -0400 Subject: [PATCH 24/96] Add frontend tests. 
--- .../panel/variants/selectors.test.js | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/ui/shared/components/panel/variants/selectors.test.js b/ui/shared/components/panel/variants/selectors.test.js index 7ca77945dd..4bd954b31e 100644 --- a/ui/shared/components/panel/variants/selectors.test.js +++ b/ui/shared/components/panel/variants/selectors.test.js @@ -73,7 +73,7 @@ test('getVisibleSortedSavedVariants', () => { expect(savedVariants[0].variantGuid).toEqual('SV0000002_1248367227_r0390_100') }) -const RNA_SEQ_STATE = { +const RNA_SEQ_PHENOTYPE_PRIORITIZATION_STATE = { rnaSeqDataByIndividual: { I021476_na19678_1: { outliers: { @@ -89,11 +89,24 @@ const RNA_SEQ_STATE = { }, I021476_na19678_2: { outliers: { ENSG00000228198: { isSignificant: true, pValue: 0.0214 } } }, }, + phePriDataByIndividual: { + I021476_na19678_1: { + lirical: { + ENSG00000228198: [{ + diseaseId: 'OMIM:618460', + diseaseName: 'Khan-Khan-Katsanis syndrome', + geneId: 'ENSG00000228198', + rank: 1, + scores: { compositeLR: 0.066, post_test_probability: 0 }, + }], + }, + }, + }, ...STATE_WITH_2_FAMILIES, } test('getSampleGeneDataByFamilyGene', () => { - expect(getSampleGeneDataByFamilyGene(RNA_SEQ_STATE)).toEqual({ + expect(getSampleGeneDataByFamilyGene(RNA_SEQ_PHENOTYPE_PRIORITIZATION_STATE)).toEqual({ F011652_1: { rnaSeqData: { ENSG00000228198: { @@ -104,6 +117,19 @@ test('getSampleGeneDataByFamilyGene', () => { NA19678: [{ scores: { pValue: 0.0073 } }], }, }, + phePriData: { + lirical: { + ENSG00000228198: { + NA19678: [{ + diseaseId: 'OMIM:618460', + diseaseName: 'Khan-Khan-Katsanis syndrome', + geneId: 'ENSG00000228198', + rank: 1, + scores: { compositeLR: 0.066, post_test_probability: 0 }, + }], + }, + }, + }, }, F011652_2: { rnaSeqData: { From 1b728e061b55f1e199bcc98bd2133ad85a0eff36 Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Thu, 27 Oct 2022 15:40:22 -0400 Subject: [PATCH 25/96] Update to use Datatable. 
--- seqr/views/apis/saved_variant_api_tests.py | 2 +- seqr/views/apis/summary_data_api_tests.py | 2 +- seqr/views/apis/variant_search_api_tests.py | 4 +- seqr/views/utils/variant_utils.py | 4 +- ui/redux/rootReducer.js | 2 +- ui/redux/selectors.js | 2 +- .../components/panel/variants/VariantGene.jsx | 134 ++++++++++-------- .../components/panel/variants/selectors.js | 73 +++++----- .../panel/variants/selectors.test.js | 71 +++++----- 9 files changed, 148 insertions(+), 146 deletions(-) diff --git a/seqr/views/apis/saved_variant_api_tests.py b/seqr/views/apis/saved_variant_api_tests.py index e2202cf4c1..62b6adc481 100644 --- a/seqr/views/apis/saved_variant_api_tests.py +++ b/seqr/views/apis/saved_variant_api_tests.py @@ -27,7 +27,7 @@ SAVED_VARIANT_RESPONSE_KEYS = { 'variantTagsByGuid', 'variantNotesByGuid', 'variantFunctionalDataByGuid', 'savedVariantsByGuid', - 'genesById', 'locusListsByGuid', 'rnaSeqData', 'mmeSubmissionsByGuid', 'phePriData', + 'genesById', 'locusListsByGuid', 'rnaSeqData', 'mmeSubmissionsByGuid', 'phenotypeGeneScores', } COMPOUND_HET_3_JSON = { diff --git a/seqr/views/apis/summary_data_api_tests.py b/seqr/views/apis/summary_data_api_tests.py index b622e62149..260337ddba 100644 --- a/seqr/views/apis/summary_data_api_tests.py +++ b/seqr/views/apis/summary_data_api_tests.py @@ -25,7 +25,7 @@ SAVED_VARIANT_RESPONSE_KEYS = { 'projectsByGuid', 'locusListsByGuid', 'savedVariantsByGuid', 'variantFunctionalDataByGuid', 'genesById', 'variantNotesByGuid', 'individualsByGuid', 'variantTagsByGuid', 'familiesByGuid', 'familyNotesByGuid', - 'mmeSubmissionsByGuid', 'phePriData', + 'mmeSubmissionsByGuid', 'phenotypeGeneScores', } diff --git a/seqr/views/apis/variant_search_api_tests.py b/seqr/views/apis/variant_search_api_tests.py index 958888d628..471bc0e943 100644 --- a/seqr/views/apis/variant_search_api_tests.py +++ b/seqr/views/apis/variant_search_api_tests.py @@ -82,7 +82,7 @@ }, 'locusListsByGuid': {LOCUS_LIST_GUID: {'intervals': mock.ANY}}, 'rnaSeqData': {'I000001_na19675': {'outliers': {'ENSG00000268903': mock.ANY}}}, - 'phePriData': {'I000001_na19675': { + 'phenotypeGeneScores': {'I000001_na19675': { 'lirical': {'ENSG00000268903': [ {'diseaseId': 'OMIM:618460', 'diseaseName': 'Khan-Khan-Katsanis syndrome', 'geneId': 'ENSG00000268903', 'rank': 1, 'scores': {'compositeLR': 0.066, 'post_test_probability': 0}}, @@ -401,7 +401,7 @@ def test_query_variants(self, mock_get_variants, mock_get_gene_counts, mock_erro 'VT1726970_2103343353_r0004_tes': EXPECTED_TAG, 'VT1726945_2103343353_r0390_100': EXPECTED_TAG, }, 'variantFunctionalDataByGuid': {}, - 'phePriData': {}, + 'phenotypeGeneScores': {}, 'rnaSeqData': {}, 'mmeSubmissionsByGuid': {}, }) diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index dcc93d1f80..f43f98243d 100644 --- a/seqr/views/utils/variant_utils.py +++ b/seqr/views/utils/variant_utils.py @@ -140,7 +140,7 @@ def _get_phenotype_prioritization(gene_ids, families): ) for data in data_dicts: - data_by_individual_gene[data.pop('individualGuid')][data.pop('tool')][data['geneId']].append(data) + data_by_individual_gene[data.pop('individualGuid')][data.pop('geneId')][data.pop('tool')].append(data) return data_by_individual_gene @@ -237,6 +237,6 @@ def get_variants_response(request, saved_variants, response_variants=None, add_a _add_family_has_rna_tpm(families_by_guid) if include_phenotype_prioritization: - response['phePriData'] = _get_phenotype_prioritization(genes.keys(), families) + response['phenotypeGeneScores'] = 
_get_phenotype_prioritization(genes.keys(), families) return response diff --git a/ui/redux/rootReducer.js b/ui/redux/rootReducer.js index 792fd96faa..9d505084d2 100644 --- a/ui/redux/rootReducer.js +++ b/ui/redux/rootReducer.js @@ -319,7 +319,7 @@ const rootReducer = combineReducers({ mmeResultsByGuid: createObjectsByIdReducer(RECEIVE_DATA, 'mmeResultsByGuid'), genesById: createObjectsByIdReducer(RECEIVE_DATA, 'genesById'), rnaSeqDataByIndividual: createObjectsByIdReducer(RECEIVE_DATA, 'rnaSeqData'), - phePriDataByIndividual: createObjectsByIdReducer(RECEIVE_DATA, 'phePriData'), + phenotypeGeneScoresByIndividual: createObjectsByIdReducer(RECEIVE_DATA, 'phenotypeGeneScores'), genesLoading: loadingReducer(REQUEST_GENES, RECEIVE_DATA), hpoTermsByParent: createObjectsByIdReducer(RECEIVE_HPO_TERMS), hpoTermsLoading: loadingReducer(REQUEST_HPO_TERMS, RECEIVE_HPO_TERMS), diff --git a/ui/redux/selectors.js b/ui/redux/selectors.js index 0e143793a8..e71de638f9 100644 --- a/ui/redux/selectors.js +++ b/ui/redux/selectors.js @@ -30,7 +30,7 @@ export const getLocusListsByGuid = state => state.locusListsByGuid export const getLocusListsIsLoading = state => state.locusListsLoading.isLoading export const getLocusListIsLoading = state => state.locusListLoading.isLoading export const getRnaSeqDataByIndividual = state => state.rnaSeqDataByIndividual -export const getPhePriDataByIndividual = state => state.phePriDataByIndividual +export const getPhenotypeGeneScoresByIndividual = state => state.phenotypeGeneScoresByIndividual export const getUser = state => state.user export const getUserOptionsByUsername = state => state.userOptionsByUsername export const getUserOptionsIsLoading = state => state.userOptionsLoading.isLoading diff --git a/ui/shared/components/panel/variants/VariantGene.jsx b/ui/shared/components/panel/variants/VariantGene.jsx index 84caaf78cc..2edb303fc2 100644 --- a/ui/shared/components/panel/variants/VariantGene.jsx +++ b/ui/shared/components/panel/variants/VariantGene.jsx @@ -6,19 +6,19 @@ import { NavLink } from 'react-router-dom' import { Label, Popup, List, Header, Segment, Divider, Table, Button, Loader } from 'semantic-ui-react' import { getGenesById, getLocusListsByGuid, getFamiliesByGuid } from 'redux/selectors' +import DataTable from 'shared/components/table/DataTable' import { panelAppUrl, moiToMoiInitials } from '../../../utils/panelAppUtils' import { MISSENSE_THRESHHOLD, LOF_THRESHHOLD, PANEL_APP_CONFIDENCE_LEVEL_COLORS, PANEL_APP_CONFIDENCE_DESCRIPTION, } from '../../../utils/constants' import { compareObjects } from '../../../utils/sortUtils' -import { camelcaseToTitlecase } from '../../../utils/stringUtils' import { HorizontalSpacer, VerticalSpacer } from '../../Spacers' import { InlineHeader, NoBorderTable, ButtonLink, ColoredLabel } from '../../StyledComponents' import { GeneSearchLink } from '../../buttons/SearchResultsLink' import ShowGeneModal from '../../buttons/ShowGeneModal' import Modal from '../../modal/Modal' import { GenCC, ClingenLabel } from '../genes/GeneDetail' -import { getSampleGeneDataByFamilyGene } from './selectors' +import { getIndividualGeneDataByFamilyGene } from './selectors' const RnaSeqTpm = React.lazy(() => import('./RnaSeqTpm')) @@ -314,33 +314,31 @@ const GENE_DISEASE_DETAIL_SECTIONS = [ }, ] -const sampleGeneDetailsDisplay = (geneId, sampleGeneData) => { - const { scores, ...info } = Object.values(Object.values(sampleGeneData)[0])[0][0] - const infoKeys = Object.keys(info) - const scoreKeys = Object.keys(scores || {}) - return ( -
- - - - - {infoKeys.concat(scoreKeys).map(field => ( - {camelcaseToTitlecase(field).replace(' ', '-')} - ))} - - - - {Object.entries(sampleGeneData[geneId]).map(([individual, data]) => (data.map(row => ( - - {individual} - {infoKeys.map(field => {row[field]})} - {scoreKeys.map(field => {row.scores[field].toPrecision(3)})} - - ))))} - -
-
- ) +const RNA_SEQ_COLUMNS = [ + { name: 'individual', content: '', width: 3 }, + { name: 'zScore', content: 'Z-Score', width: 3, format: ({ zScore }) => (zScore ? zScore.toPrecision(3) : null) }, + { name: 'pValue', content: 'P-Value', width: 3, format: ({ pValue }) => (pValue ? pValue.toPrecision(3) : null) }, + { name: 'pAdjust', content: 'P-Adjust', width: 3, format: ({ pAdjust }) => (pAdjust ? pAdjust.toPrecision(3) : null) }, +] + +const PHENOTYPE_GENE_INFO_COLUMNS = [ + { name: 'individual', content: '', width: 3 }, + { name: 'rank', content: 'Rank', width: 3 }, + { name: 'diseaseName', content: 'Disease', width: 3, format: ({ diseaseName, diseaseId }) => `${diseaseName} (${diseaseId})` }, +] + +const PHENOTYPE_GENE_SCORE_COLUMNS = { + lirical: [ + ...PHENOTYPE_GENE_INFO_COLUMNS, + { name: 'scores.post_test_probability', content: 'Posttest-Probability', width: 3, format: ({ scores }) => (scores.post_test_probability.toPrecision(3)) }, + { name: 'scores.compositeLR', content: 'Composite-LR', width: 3, format: ({ scores }) => (scores.compositeLR.toPrecision(3)) }, + ], + exomiser: [ + ...PHENOTYPE_GENE_INFO_COLUMNS, + { name: 'scores.exomiser_score', content: 'Exomiser-Score', width: 3, format: ({ scores }) => (scores.exomiser_score.toPrecision(3)) }, + { name: 'scores.phenotype_score', content: 'Phenotype-Score', width: 3, format: ({ scores }) => (scores.phenotype_score.toPrecision(3)) }, + { name: 'scores.variant_score', content: 'Variant-Score', width: 3, format: ({ scores }) => (scores.variant_score.toPrecision(3)) }, + ], } const GENE_DETAIL_SECTIONS = [ @@ -399,9 +397,34 @@ const GENE_DETAIL_SECTIONS = [ label: 'RNA-Seq', showDetails: (gene, { rnaSeqData }) => rnaSeqData && rnaSeqData[gene.geneId], detailsDisplay: (gene, { rnaSeqData }) => ( - sampleGeneDetailsDisplay(gene.geneId, rnaSeqData, 'rnaSeqData') + ), }, + { + color: 'orange', + description: 'Phenotype Prioritization', + lable: 'PhenotypeGene', + showDetails: (gene, { phenotypeGeneScores }) => phenotypeGeneScores && phenotypeGeneScores[gene.geneId], + detailsDisplay: (gene, { phenotypeGeneScores }) => (Object.entries(phenotypeGeneScores[gene.geneId]).map( + ([tool, data]) => ([ + tool, + ( + + ), + ]), + )), + }, ] const OmimSegments = styled(Segment.Group).attrs({ size: 'tiny', horizontal: true, compact: true })` @@ -426,13 +449,18 @@ const OmimSegments = styled(Segment.Group).attrs({ size: 'tiny', horizontal: tru } ` -const getDetailSections = (configs, gene, compact, labelProps, sampleGeneData) => configs.map( +const getDetailSections = (configs, gene, compact, labelProps, individualGeneData) => configs.map( ({ showDetails, detailsDisplay, ...sectionConfig }) => ( { ...sectionConfig, - detail: showDetails(gene, sampleGeneData) && detailsDisplay(gene, sampleGeneData), + detail: showDetails(gene, individualGeneData) && detailsDisplay(gene, individualGeneData), }), -).filter(({ detail }) => detail).map(({ detail, expandedDisplay, ...sectionConfig }) => ( +).reduce((acc, config) => (Array.isArray(config.detail) ? + [ + ...acc, + ...config.detail.map(([tool, detail]) => ({ ...config, label: tool.toUpperCase(), detail })), + ] : [...acc, config]), +[]).filter(({ detail }) => detail).map(({ detail, expandedDisplay, ...sectionConfig }) => ( (expandedDisplay && !compact) ? ( @@ -453,28 +481,10 @@ const getDetailSections = (configs, gene, compact, labelProps, sampleGeneData) = ) )) -const addPhenotypePrioritizationConfig = (configs, phePriInfo) => ( - phePriInfo ? 
[ - ...configs, - ...Object.keys(phePriInfo).map(tool => ( - { - color: 'orange', - description: 'Phenotype Prioritization', - label: tool.toUpperCase(), - showDetails: (gene, { phePriData }) => phePriData && phePriData[tool] && phePriData[tool][gene.geneId], - detailsDisplay: (gene, { phePriData }) => ( - sampleGeneDetailsDisplay(gene.geneId, phePriData[tool]) - ), - } - )), - ] : configs -) - export const GeneDetails = React.memo(( - { gene, compact, showLocusLists, showInlineDetails, sampleGeneData, ...labelProps }, + { gene, compact, showLocusLists, showInlineDetails, individualGeneData, ...labelProps }, ) => { - const geneDetailConfigs = addPhenotypePrioritizationConfig(GENE_DETAIL_SECTIONS, sampleGeneData.phePriData) - const geneDetails = getDetailSections(geneDetailConfigs, gene, compact, labelProps, sampleGeneData) + const geneDetails = getDetailSections(GENE_DETAIL_SECTIONS, gene, compact, labelProps, individualGeneData) const geneDiseaseDetails = getDetailSections(GENE_DISEASE_DETAIL_SECTIONS, gene, compact, labelProps) const hasLocusLists = showLocusLists && gene.locusListGuids.length > 0 const showDivider = !showInlineDetails && geneDetails.length > 0 && (hasLocusLists || geneDiseaseDetails.length > 0) @@ -503,7 +513,7 @@ GeneDetails.propTypes = { compact: PropTypes.bool, showLocusLists: PropTypes.bool, showInlineDetails: PropTypes.bool, - sampleGeneData: PropTypes.object, + individualGeneData: PropTypes.object, } const GeneSearchLinkWithPopup = props => ( @@ -523,7 +533,7 @@ const getGeneConsequence = (geneId, variant) => { } const BaseVariantGene = React.memo(( - { geneId, gene, variant, compact, showInlineDetails, compoundHetToggle, hasRnaTpmData, sampleGeneData }, + { geneId, gene, variant, compact, showInlineDetails, compoundHetToggle, hasRnaTpmData, individualGeneData }, ) => { const geneConsequence = getGeneConsequence(geneId, variant) @@ -540,7 +550,7 @@ const BaseVariantGene = React.memo(( showInlineDetails={showInlineDetails} margin={showInlineDetails ? 
'1em .5em 0px 0px' : null} horizontal={showInlineDetails} - sampleGeneData={sampleGeneData} + individualGeneData={individualGeneData} showLocusLists /> ) @@ -618,12 +628,12 @@ BaseVariantGene.propTypes = { showInlineDetails: PropTypes.bool, compoundHetToggle: PropTypes.func, hasRnaTpmData: PropTypes.bool, - sampleGeneData: PropTypes.object, + individualGeneData: PropTypes.object, } const getRnaSeqProps = (state, ownProps) => ({ hasRnaTpmData: getFamiliesByGuid(state)[ownProps.variant.familyGuids[0]]?.hasRnaTpmData, - sampleGeneData: getSampleGeneDataByFamilyGene(state)[ownProps.variant.familyGuids[0]] || {}, + individualGeneData: getIndividualGeneDataByFamilyGene(state)[ownProps.variant.familyGuids[0]] || {}, }) const mapStateToProps = (state, ownProps) => ({ @@ -639,7 +649,7 @@ class VariantGenes extends React.PureComponent { variant: PropTypes.object.isRequired, mainGeneId: PropTypes.string, genesById: PropTypes.object.isRequired, - sampleGeneData: PropTypes.object, + individualGeneData: PropTypes.object, hasRnaTpmData: PropTypes.bool, showMainGene: PropTypes.bool, } @@ -655,7 +665,7 @@ class VariantGenes extends React.PureComponent { } render() { - const { variant, genesById, mainGeneId, showMainGene, sampleGeneData, hasRnaTpmData } = this.props + const { variant, genesById, mainGeneId, showMainGene, individualGeneData, hasRnaTpmData } = this.props const { showAll } = this.state const geneIds = Object.keys(variant.transcripts || {}) const genes = geneIds.map(geneId => genesById[geneId]).filter(gene => gene) @@ -674,7 +684,7 @@ class VariantGenes extends React.PureComponent { geneId={gene.geneId} gene={gene} variant={variant} - sampleGeneData={sampleGeneData} + individualGeneData={individualGeneData} hasRnaTpmData={hasRnaTpmData} showInlineDetails={!mainGeneId} compact @@ -705,7 +715,7 @@ class VariantGenes extends React.PureComponent { details={sectionGenes.length > 0 && sectionGenes.map(gene => (
- {detailsDisplay(gene, sampleGeneData)} + {detailsDisplay(gene, individualGeneData)}
))} diff --git a/ui/shared/components/panel/variants/selectors.js b/ui/shared/components/panel/variants/selectors.js index f741d671ac..3cb862df91 100644 --- a/ui/shared/components/panel/variants/selectors.js +++ b/ui/shared/components/panel/variants/selectors.js @@ -14,52 +14,43 @@ import { } from 'shared/utils/constants' import { getVariantTagsByGuid, getVariantNotesByGuid, getSavedVariantsByGuid, getAnalysisGroupsByGuid, getGenesById, getUser, - getFamiliesByGuid, getProjectsByGuid, getIndividualsByGuid, getRnaSeqDataByIndividual, getPhePriDataByIndividual, + getFamiliesByGuid, getProjectsByGuid, getIndividualsByGuid, getRnaSeqDataByIndividual, + getPhenotypeGeneScoresByIndividual, } from 'redux/selectors' -const RNA_SEQ_SCORE_FIELDS = ['zScore', 'pValue', 'pAdjust'] -export const getSampleGeneDataByFamilyGene = createSelector( +export const getIndividualGeneDataByFamilyGene = createSelector( getIndividualsByGuid, getRnaSeqDataByIndividual, - getPhePriDataByIndividual, - (individualsByGuid, rnaSeqDataByIndividual, phePriDataByIndividual) => { - const rnaSeqD = Object.entries(rnaSeqDataByIndividual).reduce( - (acc, [individualGuid, rnaSeqData]) => { - const { familyGuid, displayName } = individualsByGuid[individualGuid] - acc[familyGuid] = acc[familyGuid] || {} - acc[familyGuid].rnaSeqData = Object.entries(rnaSeqData.outliers || {}).reduce( - (acc2, [geneId, data]) => (data.isSignificant ? - { - ...acc2, - [geneId]: { - ...(acc2[geneId] || {}), - [displayName]: [{ - scores: RNA_SEQ_SCORE_FIELDS.reduce( - (sAcc, score) => (data[score] ? { ...sAcc, [score]: data[score] } : sAcc), {}, - ), - }], - }, - } : acc2 - ), acc[familyGuid].rnaSeqData || {}, - ) - return acc - }, {}, - ) - return Object.entries(phePriDataByIndividual || {}).reduce( - (acc, [individualGuid, phePriData]) => { - const { familyGuid, displayName } = individualsByGuid[individualGuid] - acc[familyGuid] = acc[familyGuid] || {} - acc[familyGuid].phePriData = Object.entries(phePriData).reduce((accTool, [tool, toolData]) => ({ - ...accTool, - [tool]: Object.entries(toolData).reduce((acc2, [geneId, data]) => ({ - ...acc2, - [geneId]: { ...(acc2[geneId] || {}), [displayName]: data }, - }), {}), - }), acc[familyGuid].phePriData || {}) + getPhenotypeGeneScoresByIndividual, + (individualsByGuid, rnaSeqDataByIndividual, phenotypeGeneScoresByIndividual) => ( + Object.values(individualsByGuid).reduce((acc, { individualGuid, familyGuid, displayName }) => { + const rnaSeqData = rnaSeqDataByIndividual && rnaSeqDataByIndividual[individualGuid]?.outliers + const phenotypeGeneScores = phenotypeGeneScoresByIndividual && phenotypeGeneScoresByIndividual[individualGuid] + if (!rnaSeqData && !phenotypeGeneScores) { return acc - }, rnaSeqD, - ) - }, + } + return { + ...acc, + [familyGuid]: { + rnaSeqData: Object.entries(rnaSeqData || {}).reduce( + (acc2, [geneId, data]) => (data.isSignificant ? 
{ + ...acc2, + [geneId]: [...(acc2[geneId] || []), { ...data, individual: displayName }], + } : acc2), acc[familyGuid]?.rnaSeqData || {}, + ), + phenotypeGeneScores: Object.entries(phenotypeGeneScores || {}).reduce( + (acc2, [geneId, dataByTool]) => ({ + ...acc2, + [geneId]: Object.entries(dataByTool).reduce((acc3, [tool, data]) => ({ + ...acc3, + [tool]: [...(acc3[tool] || []), ...data.map(d => ({ ...d, individual: displayName }))], + }), acc2[geneId] || {}), + }), acc[familyGuid]?.phenotypeGeneScores || {}, + ), + }, + } + }, {}) + ), ) // Saved variant selectors diff --git a/ui/shared/components/panel/variants/selectors.test.js b/ui/shared/components/panel/variants/selectors.test.js index 4bd954b31e..7379f72273 100644 --- a/ui/shared/components/panel/variants/selectors.test.js +++ b/ui/shared/components/panel/variants/selectors.test.js @@ -5,7 +5,7 @@ import { getPairedSelectedSavedVariants, getVisibleSortedSavedVariants, getPairedFilteredSavedVariants, - getSampleGeneDataByFamilyGene, + getIndividualGeneDataByFamilyGene, } from './selectors' test('getPairedSelectedSavedVariants', () => { @@ -89,7 +89,7 @@ const RNA_SEQ_PHENOTYPE_PRIORITIZATION_STATE = { }, I021476_na19678_2: { outliers: { ENSG00000228198: { isSignificant: true, pValue: 0.0214 } } }, }, - phePriDataByIndividual: { + phenotypeGeneScoresByIndividual: { I021476_na19678_1: { lirical: { ENSG00000228198: [{ @@ -105,36 +105,37 @@ const RNA_SEQ_PHENOTYPE_PRIORITIZATION_STATE = { ...STATE_WITH_2_FAMILIES, } -test('getSampleGeneDataByFamilyGene', () => { - expect(getSampleGeneDataByFamilyGene(RNA_SEQ_PHENOTYPE_PRIORITIZATION_STATE)).toEqual({ - F011652_1: { - rnaSeqData: { - ENSG00000228198: { - NA19678: [{ scores: { pValue: 0.0004 } }], - NA19679_1: [{ scores: { pValue: 0.01 } }], - }, - ENSG00000164458: { - NA19678: [{ scores: { pValue: 0.0073 } }], - }, - }, - phePriData: { - lirical: { - ENSG00000228198: { - NA19678: [{ - diseaseId: 'OMIM:618460', - diseaseName: 'Khan-Khan-Katsanis syndrome', - geneId: 'ENSG00000228198', - rank: 1, - scores: { compositeLR: 0.066, post_test_probability: 0 }, - }], - }, - }, - }, - }, - F011652_2: { - rnaSeqData: { - ENSG00000228198: { NA19678_2: [{ scores: { pValue: 0.0214 } }] }, - }, - }, - }) -}) +// Temporarily remove the test. +// test('getIndividualGeneDataByFamilyGene', () => { +// expect(getIndividualGeneDataByFamilyGene(RNA_SEQ_PHENOTYPE_PRIORITIZATION_STATE)).toEqual({ +// F011652_1: { +// rnaSeqData: { +// ENSG00000228198: [ +// { individual: 'NA19678', isSignificant: true, pValue: 0.0004 }, +// { individual: 'NA19679_1', isSignificant: true, pValue: 0.01 }, +// ], +// ENSG00000164458: [ +// { individual: 'NA19678', isSignificant: true, pValue: 0.0073 }, +// ], +// }, +// phenotypeGeneScores: { +// ENSG00000228198: { +// lirical: [{ +// individual: 'NA19678', +// diseaseId: 'OMIM:618460', +// diseaseName: 'Khan-Khan-Katsanis syndrome', +// geneId: 'ENSG00000228198', +// rank: 1, +// scores: { compositeLR: 0.066, post_test_probability: 0 }, +// }], +// }, +// }, +// }, +// F011652_2: { +// rnaSeqData: { +// ENSG00000228198: [{ individual: 'NA19678_2', isSignificant: true, pValue: 0.0214 }], +// }, +// phenotypeGeneScores: {}, +// }, +// }) +// }) From f447e94618d02fbc4315b43b86aafe99a1ba348c Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Thu, 27 Oct 2022 17:30:42 -0400 Subject: [PATCH 26/96] Resolve the test issues. 
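The selector fix here is about how individualsByGuid is walked: Object.values drops the guid key that indexes rnaSeqDataByIndividual and phenotypeGeneScoresByIndividual, so the reduce now iterates Object.entries and takes the guid from the key. The same pitfall expressed in Python, as a standalone sketch with assumed toy data:

    individuals_by_guid = {'I000001': {'familyGuid': 'F000001', 'displayName': 'NA19675'}}
    rna_seq_by_individual = {'I000001': {'outliers': {'ENSG00000268903': {'pValue': 0.0004}}}}

    # Iterating values alone leaves nothing to join on:
    for individual in individuals_by_guid.values():
        guid = individual.get('individualGuid')  # None for these toy fixtures

    # Iterating items keeps the key available for the cross-lookup:
    for guid, individual in individuals_by_guid.items():
        outliers = rna_seq_by_individual.get(guid, {}).get('outliers', {})

The test fixture is reshaped to gene -> tool at the same time so it matches the new backend grouping, and the temporarily disabled selector test is re-enabled.
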
--- seqr/views/apis/variant_search_api_tests.py | 13 ++-- .../components/panel/variants/selectors.js | 2 +- .../panel/variants/selectors.test.js | 71 +++++++++---------- 3 files changed, 44 insertions(+), 42 deletions(-) diff --git a/seqr/views/apis/variant_search_api_tests.py b/seqr/views/apis/variant_search_api_tests.py index 471bc0e943..5dd1574274 100644 --- a/seqr/views/apis/variant_search_api_tests.py +++ b/seqr/views/apis/variant_search_api_tests.py @@ -83,11 +83,11 @@ 'locusListsByGuid': {LOCUS_LIST_GUID: {'intervals': mock.ANY}}, 'rnaSeqData': {'I000001_na19675': {'outliers': {'ENSG00000268903': mock.ANY}}}, 'phenotypeGeneScores': {'I000001_na19675': { - 'lirical': {'ENSG00000268903': [ - {'diseaseId': 'OMIM:618460', 'diseaseName': 'Khan-Khan-Katsanis syndrome', 'geneId': 'ENSG00000268903', - 'rank': 1, 'scores': {'compositeLR': 0.066, 'post_test_probability': 0}}, - {'diseaseId': 'OMIM:219800', 'diseaseName': 'Cystinosis, nephropathic', 'geneId': 'ENSG00000268903', - 'rank': 2, 'scores': {'compositeLR': 0.003, 'post_test_probability': 0}} + 'ENSG00000268903': {'lirical': [ + {'diseaseId': 'OMIM:618460', 'diseaseName': 'Khan-Khan-Katsanis syndrome', 'rank': 1, + 'scores': {'compositeLR': 0.066, 'post_test_probability': 0}}, + {'diseaseId': 'OMIM:219800', 'diseaseName': 'Cystinosis, nephropathic', 'rank': 2, + 'scores': {'compositeLR': 0.003, 'post_test_probability': 0}} ]} }}, 'mmeSubmissionsByGuid': {'MS000001_na19675': {k: mock.ANY for k in MATCHMAKER_SUBMISSION_FIELDS}}, @@ -282,6 +282,7 @@ def test_query_variants(self, mock_get_variants, mock_get_gene_counts, mock_erro self.assertEqual(response.status_code, 200) response_json = response.json() self.assertSetEqual(set(response_json.keys()), set(EXPECTED_SEARCH_RESPONSE.keys())) + self.maxDiff = None self.assertDictEqual(response_json, EXPECTED_SEARCH_RESPONSE) self.assertSetEqual( set(response_json['search']['projectFamilies'][0]['familyGuids']), {'F000001_1', 'F000002_2'}) @@ -495,6 +496,7 @@ def _get_variants(results_model, **kwargs): self.assertEqual(response.status_code, 200) response_json = response.json() self.assertSetEqual(set(response_json.keys()), set(EXPECTED_SEARCH_RESPONSE.keys())) + self.maxDiff = None self.assertDictEqual(response_json, EXPECTED_SEARCH_RESPONSE) self._assert_expected_results_context(response_json) self.assertSetEqual( @@ -642,6 +644,7 @@ def test_query_single_variant(self, mock_get_variant): expected_search_response['variantNotesByGuid'].pop('VN0714935_2103343353_r0390_100') expected_search_response['genesById'].pop('ENSG00000233653') expected_search_response['searchedVariants'] = [single_family_variant] + self.maxDiff = None self.assertDictEqual(response_json, expected_search_response) self._assert_expected_results_family_context(response_json, locus_list_detail=True) self.assertSetEqual(set(response_json['projectsByGuid'][PROJECT_GUID].keys()), PROJECT_TAG_TYPE_FIELDS) diff --git a/ui/shared/components/panel/variants/selectors.js b/ui/shared/components/panel/variants/selectors.js index 3cb862df91..15a9871c31 100644 --- a/ui/shared/components/panel/variants/selectors.js +++ b/ui/shared/components/panel/variants/selectors.js @@ -23,7 +23,7 @@ export const getIndividualGeneDataByFamilyGene = createSelector( getRnaSeqDataByIndividual, getPhenotypeGeneScoresByIndividual, (individualsByGuid, rnaSeqDataByIndividual, phenotypeGeneScoresByIndividual) => ( - Object.values(individualsByGuid).reduce((acc, { individualGuid, familyGuid, displayName }) => { + Object.entries(individualsByGuid).reduce((acc, 
[individualGuid, { familyGuid, displayName }]) => { const rnaSeqData = rnaSeqDataByIndividual && rnaSeqDataByIndividual[individualGuid]?.outliers const phenotypeGeneScores = phenotypeGeneScoresByIndividual && phenotypeGeneScoresByIndividual[individualGuid] if (!rnaSeqData && !phenotypeGeneScores) { diff --git a/ui/shared/components/panel/variants/selectors.test.js b/ui/shared/components/panel/variants/selectors.test.js index 7379f72273..ddb45c94e6 100644 --- a/ui/shared/components/panel/variants/selectors.test.js +++ b/ui/shared/components/panel/variants/selectors.test.js @@ -91,8 +91,8 @@ const RNA_SEQ_PHENOTYPE_PRIORITIZATION_STATE = { }, phenotypeGeneScoresByIndividual: { I021476_na19678_1: { - lirical: { - ENSG00000228198: [{ + ENSG00000228198: { + lirical: [{ diseaseId: 'OMIM:618460', diseaseName: 'Khan-Khan-Katsanis syndrome', geneId: 'ENSG00000228198', @@ -105,37 +105,36 @@ const RNA_SEQ_PHENOTYPE_PRIORITIZATION_STATE = { ...STATE_WITH_2_FAMILIES, } -// Temporarily remove the test. -// test('getIndividualGeneDataByFamilyGene', () => { -// expect(getIndividualGeneDataByFamilyGene(RNA_SEQ_PHENOTYPE_PRIORITIZATION_STATE)).toEqual({ -// F011652_1: { -// rnaSeqData: { -// ENSG00000228198: [ -// { individual: 'NA19678', isSignificant: true, pValue: 0.0004 }, -// { individual: 'NA19679_1', isSignificant: true, pValue: 0.01 }, -// ], -// ENSG00000164458: [ -// { individual: 'NA19678', isSignificant: true, pValue: 0.0073 }, -// ], -// }, -// phenotypeGeneScores: { -// ENSG00000228198: { -// lirical: [{ -// individual: 'NA19678', -// diseaseId: 'OMIM:618460', -// diseaseName: 'Khan-Khan-Katsanis syndrome', -// geneId: 'ENSG00000228198', -// rank: 1, -// scores: { compositeLR: 0.066, post_test_probability: 0 }, -// }], -// }, -// }, -// }, -// F011652_2: { -// rnaSeqData: { -// ENSG00000228198: [{ individual: 'NA19678_2', isSignificant: true, pValue: 0.0214 }], -// }, -// phenotypeGeneScores: {}, -// }, -// }) -// }) +test('getIndividualGeneDataByFamilyGene', () => { + expect(getIndividualGeneDataByFamilyGene(RNA_SEQ_PHENOTYPE_PRIORITIZATION_STATE)).toEqual({ + F011652_1: { + rnaSeqData: { + ENSG00000228198: [ + { individual: 'NA19678', isSignificant: true, pValue: 0.0004 }, + { individual: 'NA19679_1', isSignificant: true, pValue: 0.01 }, + ], + ENSG00000164458: [ + { individual: 'NA19678', isSignificant: true, pValue: 0.0073 }, + ], + }, + phenotypeGeneScores: { + ENSG00000228198: { + lirical: [{ + individual: 'NA19678', + diseaseId: 'OMIM:618460', + diseaseName: 'Khan-Khan-Katsanis syndrome', + geneId: 'ENSG00000228198', + rank: 1, + scores: { compositeLR: 0.066, post_test_probability: 0 }, + }], + }, + }, + }, + F011652_2: { + rnaSeqData: { + ENSG00000228198: [{ individual: 'NA19678_2', isSignificant: true, pValue: 0.0214 }], + }, + phenotypeGeneScores: {}, + }, + }) +}) From c5e0cbe2509e65a84a11802f89c8ad0a8db1b894 Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Fri, 28 Oct 2022 10:08:10 -0400 Subject: [PATCH 27/96] Update tests and score headers. 
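Two adjustments land here: the phenotype score columns are built from whatever score keys a tool reports instead of a hard-coded per-tool list, and the API tests stop depending on the order in which the bulk-created LIRICAL rows come back by comparing them sorted by disease ID. A minimal illustration of that order-independent assertion pattern (a standalone sketch, not the seqr test code):

    expected = [
        {'diseaseId': 'OMIM:219800', 'rank': 2},
        {'diseaseId': 'OMIM:618460', 'rank': 1},
    ]
    actual = [
        {'diseaseId': 'OMIM:618460', 'rank': 1},
        {'diseaseId': 'OMIM:219800', 'rank': 2},
    ]
    assert sorted(actual, key=lambda d: d['diseaseId']) == expected
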
--- seqr/views/apis/variant_search_api_tests.py | 23 +++++---- .../components/panel/variants/VariantGene.jsx | 50 +++++++++---------- 2 files changed, 39 insertions(+), 34 deletions(-) diff --git a/seqr/views/apis/variant_search_api_tests.py b/seqr/views/apis/variant_search_api_tests.py index 5dd1574274..50a0c6477b 100644 --- a/seqr/views/apis/variant_search_api_tests.py +++ b/seqr/views/apis/variant_search_api_tests.py @@ -83,16 +83,18 @@ 'locusListsByGuid': {LOCUS_LIST_GUID: {'intervals': mock.ANY}}, 'rnaSeqData': {'I000001_na19675': {'outliers': {'ENSG00000268903': mock.ANY}}}, 'phenotypeGeneScores': {'I000001_na19675': { - 'ENSG00000268903': {'lirical': [ - {'diseaseId': 'OMIM:618460', 'diseaseName': 'Khan-Khan-Katsanis syndrome', 'rank': 1, - 'scores': {'compositeLR': 0.066, 'post_test_probability': 0}}, - {'diseaseId': 'OMIM:219800', 'diseaseName': 'Cystinosis, nephropathic', 'rank': 2, - 'scores': {'compositeLR': 0.003, 'post_test_probability': 0}} - ]} + 'ENSG00000268903': {'lirical': mock.ANY} }}, 'mmeSubmissionsByGuid': {'MS000001_na19675': {k: mock.ANY for k in MATCHMAKER_SUBMISSION_FIELDS}}, } +EXPECTED_LIRICAL_DATA = [ + {'diseaseId': 'OMIM:219800', 'diseaseName': 'Cystinosis, nephropathic', 'rank': 2, + 'scores': {'compositeLR': 0.003, 'post_test_probability': 0}}, + {'diseaseId': 'OMIM:618460', 'diseaseName': 'Khan-Khan-Katsanis syndrome', 'rank': 1, + 'scores': {'compositeLR': 0.066, 'post_test_probability': 0}}, +] + EXPECTED_SEARCH_CONTEXT_RESPONSE = { 'savedSearchesByGuid': { 'VS0000001_de_novo_dominant_res': mock.ANY, 'VS0000002_recessive_restrictiv': mock.ANY, 'VS0000003_de_novo_dominant_per': mock.ANY, @@ -282,8 +284,9 @@ def test_query_variants(self, mock_get_variants, mock_get_gene_counts, mock_erro self.assertEqual(response.status_code, 200) response_json = response.json() self.assertSetEqual(set(response_json.keys()), set(EXPECTED_SEARCH_RESPONSE.keys())) - self.maxDiff = None self.assertDictEqual(response_json, EXPECTED_SEARCH_RESPONSE) + lirical_data = response_json['phenotypeGeneScores']['I000001_na19675']['ENSG00000268903']['lirical'] + self.assertListEqual(sorted(lirical_data, key=lambda d: d['diseaseId']), EXPECTED_LIRICAL_DATA) self.assertSetEqual( set(response_json['search']['projectFamilies'][0]['familyGuids']), {'F000001_1', 'F000002_2'}) self._assert_expected_results_context(response_json) @@ -496,8 +499,9 @@ def _get_variants(results_model, **kwargs): self.assertEqual(response.status_code, 200) response_json = response.json() self.assertSetEqual(set(response_json.keys()), set(EXPECTED_SEARCH_RESPONSE.keys())) - self.maxDiff = None self.assertDictEqual(response_json, EXPECTED_SEARCH_RESPONSE) + lirical_data = response_json['phenotypeGeneScores']['I000001_na19675']['ENSG00000268903']['lirical'] + self.assertListEqual(sorted(lirical_data, key=lambda d: d['diseaseId']), EXPECTED_LIRICAL_DATA) self._assert_expected_results_context(response_json) self.assertSetEqual( set(response_json['search']['projectFamilies'][0]['familyGuids']), expected_searched_families) @@ -644,8 +648,9 @@ def test_query_single_variant(self, mock_get_variant): expected_search_response['variantNotesByGuid'].pop('VN0714935_2103343353_r0390_100') expected_search_response['genesById'].pop('ENSG00000233653') expected_search_response['searchedVariants'] = [single_family_variant] - self.maxDiff = None self.assertDictEqual(response_json, expected_search_response) + lirical_data = response_json['phenotypeGeneScores']['I000001_na19675']['ENSG00000268903']['lirical'] + 
self.assertListEqual(sorted(lirical_data, key=lambda d: d['diseaseId']), EXPECTED_LIRICAL_DATA) self._assert_expected_results_family_context(response_json, locus_list_detail=True) self.assertSetEqual(set(response_json['projectsByGuid'][PROJECT_GUID].keys()), PROJECT_TAG_TYPE_FIELDS) self.assertSetEqual(set(response_json['familiesByGuid'].keys()), {'F000001_1'}) diff --git a/ui/shared/components/panel/variants/VariantGene.jsx b/ui/shared/components/panel/variants/VariantGene.jsx index 2edb303fc2..17f886ee39 100644 --- a/ui/shared/components/panel/variants/VariantGene.jsx +++ b/ui/shared/components/panel/variants/VariantGene.jsx @@ -12,6 +12,7 @@ import { MISSENSE_THRESHHOLD, LOF_THRESHHOLD, PANEL_APP_CONFIDENCE_LEVEL_COLORS, PANEL_APP_CONFIDENCE_DESCRIPTION, } from '../../../utils/constants' import { compareObjects } from '../../../utils/sortUtils' +import { camelcaseToTitlecase, snakecaseToTitlecase } from '../../../utils/stringUtils' import { HorizontalSpacer, VerticalSpacer } from '../../Spacers' import { InlineHeader, NoBorderTable, ButtonLink, ColoredLabel } from '../../StyledComponents' import { GeneSearchLink } from '../../buttons/SearchResultsLink' @@ -327,19 +328,7 @@ const PHENOTYPE_GENE_INFO_COLUMNS = [ { name: 'diseaseName', content: 'Disease', width: 3, format: ({ diseaseName, diseaseId }) => `${diseaseName} (${diseaseId})` }, ] -const PHENOTYPE_GENE_SCORE_COLUMNS = { - lirical: [ - ...PHENOTYPE_GENE_INFO_COLUMNS, - { name: 'scores.post_test_probability', content: 'Posttest-Probability', width: 3, format: ({ scores }) => (scores.post_test_probability.toPrecision(3)) }, - { name: 'scores.compositeLR', content: 'Composite-LR', width: 3, format: ({ scores }) => (scores.compositeLR.toPrecision(3)) }, - ], - exomiser: [ - ...PHENOTYPE_GENE_INFO_COLUMNS, - { name: 'scores.exomiser_score', content: 'Exomiser-Score', width: 3, format: ({ scores }) => (scores.exomiser_score.toPrecision(3)) }, - { name: 'scores.phenotype_score', content: 'Phenotype-Score', width: 3, format: ({ scores }) => (scores.phenotype_score.toPrecision(3)) }, - { name: 'scores.variant_score', content: 'Variant-Score', width: 3, format: ({ scores }) => (scores.variant_score.toPrecision(3)) }, - ], -} +const PHENOTYPE_GENE_SCORE_COLUMNS = {} const GENE_DETAIL_SECTIONS = [ { @@ -411,18 +400,29 @@ const GENE_DETAIL_SECTIONS = [ lable: 'PhenotypeGene', showDetails: (gene, { phenotypeGeneScores }) => phenotypeGeneScores && phenotypeGeneScores[gene.geneId], detailsDisplay: (gene, { phenotypeGeneScores }) => (Object.entries(phenotypeGeneScores[gene.geneId]).map( - ([tool, data]) => ([ - tool, - ( - - ), - ]), + ([tool, data]) => { + PHENOTYPE_GENE_SCORE_COLUMNS[tool] = [ + ...PHENOTYPE_GENE_INFO_COLUMNS, + ...Object.keys(data[0].scores).map(score => ({ + name: score, + content: snakecaseToTitlecase(camelcaseToTitlecase(score)).replace(' ', '-'), + width: 3, + format: ({ scores }) => (scores[score].toPrecision(3)), + })), + ] + return ([ + tool, + ( + + ), + ]) + }, )), }, ] From 11b73db4cfc480e9a39d233636d33a8a562060f4 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Fri, 28 Oct 2022 17:12:42 -0400 Subject: [PATCH 28/96] update 38 igv reference --- seqr/urls.py | 2 +- seqr/views/apis/igv_api.py | 10 ++++++++-- ui/shared/components/panel/family/constants.js | 16 ++++++++-------- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/seqr/urls.py b/seqr/urls.py index 5f1142dcca..84553fb6e4 100644 --- a/seqr/urls.py +++ b/seqr/urls.py @@ -254,7 +254,7 @@ 'gene_info/(?P[^/]+)/note/(?P[^/]+)/delete': 
delete_gene_note_handler, 'hpo_terms/(?P[^/]+)': get_hpo_terms, - 'igv_genomes/(?P.*)': igv_genomes_proxy, + 'igv_genomes/(?P[^/]+)/(?P.*)': igv_genomes_proxy, 'locus_lists/(?P[^/]+)/update': update_locus_list_handler, 'locus_lists/(?P[^/]+)/delete': delete_locus_list_handler, diff --git a/seqr/views/apis/igv_api.py b/seqr/views/apis/igv_api.py index 5eb009455c..d21e2768b1 100644 --- a/seqr/views/apis/igv_api.py +++ b/seqr/views/apis/igv_api.py @@ -202,7 +202,13 @@ def _stream_file(request, path): return resp -def igv_genomes_proxy(request, file_path): +CLOUD_HOSTS = { + 's3': 'https://s3.amazonaws.com', + 'gs': 'https://storage.googleapis.com', +} + + +def igv_genomes_proxy(request, cloud_host, file_path): # IGV does not properly set CORS header and cannot directly access the genomes resource from the browser without # using this server-side proxy headers = {} @@ -210,7 +216,7 @@ def igv_genomes_proxy(request, file_path): if range_header: headers['Range'] = range_header - genome_response = requests.get('https://s3.amazonaws.com/igv.{}'.format(file_path), headers=headers) + genome_response = requests.get(f'{CLOUD_HOSTS[cloud_host]}/{file_path}', headers=headers) proxy_response = HttpResponse( content=genome_response.content, status=genome_response.status_code, diff --git a/ui/shared/components/panel/family/constants.js b/ui/shared/components/panel/family/constants.js index fd76518e24..908205a332 100644 --- a/ui/shared/components/panel/family/constants.js +++ b/ui/shared/components/panel/family/constants.js @@ -88,23 +88,23 @@ const BASE_REFERENCE_URL = '/api/igv_genomes' const REFERENCE_URLS = [ { key: 'fastaURL', - baseUrl: `${BASE_REFERENCE_URL}/broadinstitute.org/genomes/seq`, + baseUrl: BASE_REFERENCE_URL, path: { - 37: 'hg19/hg19.fasta', - 38: 'hg38/hg38.fa', + 37: 's3/igv.broadinstitute.org/genomes/seq/hg19/hg19.fasta', + 38: 'gs/gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta', }, }, { key: 'cytobandURL', - baseUrl: BASE_REFERENCE_URL, + baseUrl: `${BASE_REFERENCE_URL}/s3`, path: { - 37: 'broadinstitute.org/genomes/seq/hg19/cytoBand.txt', - 38: 'org.genomes/hg38/annotations/cytoBandIdeo.txt.gz', + 37: 'igv.broadinstitute.org/genomes/seq/hg19/cytoBand.txt', + 38: 'igv.org.genomes/hg38/annotations/cytoBandIdeo.txt.gz', }, }, { key: 'aliasURL', - baseUrl: `${BASE_REFERENCE_URL}/org.genomes`, + baseUrl: `${BASE_REFERENCE_URL}/s3/igv.org.genomes`, path: { 37: 'hg19/hg19_alias.tab', 38: 'hg38/hg38_alias.tab', @@ -127,7 +127,7 @@ const REFERENCE_TRACKS = [ { name: 'Refseq', indexPostfix: 'tbi', - baseUrl: `${BASE_REFERENCE_URL}/org.genomes`, + baseUrl: `${BASE_REFERENCE_URL}/s3/igv.org.genomes`, path: { 37: 'hg19/refGene.sorted.txt.gz', 38: 'hg38/refGene.sorted.txt.gz', From b42aa13b815ee129f3142a5cff44ee8703da8c24 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 31 Oct 2022 10:43:30 -0400 Subject: [PATCH 29/96] clean up --- seqr/views/apis/igv_api.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/seqr/views/apis/igv_api.py b/seqr/views/apis/igv_api.py index d21e2768b1..1d4bdd76d3 100644 --- a/seqr/views/apis/igv_api.py +++ b/seqr/views/apis/igv_api.py @@ -16,7 +16,11 @@ login_and_policies_required, pm_or_data_manager_required GS_STORAGE_ACCESS_CACHE_KEY = 'gs_storage_access_cache_entry' - +GS_STORAGE_URL = 'https://storage.googleapis.com' +CLOUD_STORAGE_URLS = { + 's3': 'https://s3.amazonaws.com', + 'gs': GS_STORAGE_URL, +} @pm_or_data_manager_required def receive_igv_table_handler(request, project_guid): @@ -141,7 +145,7 @@ 
def _stream_gs(request, gs_path): headers = _get_gs_rest_api_headers(request.META.get('HTTP_RANGE'), gs_path, user=request.user) response = requests.get( - 'https://storage.googleapis.com/{}'.format(gs_path.replace('gs://', '', 1)), + f"{GS_STORAGE_URL}/{gs_path.replace('gs://', '', 1)}", headers=headers, stream=True) @@ -202,12 +206,6 @@ def _stream_file(request, path): return resp -CLOUD_HOSTS = { - 's3': 'https://s3.amazonaws.com', - 'gs': 'https://storage.googleapis.com', -} - - def igv_genomes_proxy(request, cloud_host, file_path): # IGV does not properly set CORS header and cannot directly access the genomes resource from the browser without # using this server-side proxy @@ -216,7 +214,7 @@ def igv_genomes_proxy(request, cloud_host, file_path): if range_header: headers['Range'] = range_header - genome_response = requests.get(f'{CLOUD_HOSTS[cloud_host]}/{file_path}', headers=headers) + genome_response = requests.get(f'{CLOUD_STORAGE_URLS[cloud_host]}/{file_path}', headers=headers) proxy_response = HttpResponse( content=genome_response.content, status=genome_response.status_code, From 2346883cf4d01b6636c9c69e232b40efbeaceff6 Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Mon, 31 Oct 2022 10:43:33 -0400 Subject: [PATCH 30/96] Update condition for loading individual gene data and optimize frontend. --- seqr/views/apis/summary_data_api.py | 2 +- seqr/views/apis/summary_data_api_tests.py | 2 +- seqr/views/utils/orm_to_json_utils.py | 7 -- seqr/views/utils/variant_utils.py | 8 +- ui/shared/components/panel/fixtures.js | 1 + .../components/panel/variants/VariantGene.jsx | 89 ++++++++++--------- .../components/panel/variants/selectors.js | 46 +++++----- .../panel/variants/selectors.test.js | 12 +-- 8 files changed, 82 insertions(+), 85 deletions(-) diff --git a/seqr/views/apis/summary_data_api.py b/seqr/views/apis/summary_data_api.py index 718781e1af..05bdff681d 100644 --- a/seqr/views/apis/summary_data_api.py +++ b/seqr/views/apis/summary_data_api.py @@ -97,7 +97,7 @@ def saved_variants_page(request, tag): response_json = get_variants_response( request, saved_variant_models, add_all_context=True, include_igv=False, add_locus_list_detail=True, - include_rna_seq=False, include_project_name=True, + include_individual_gene_scores=False, include_project_name=True, ) return create_json_response(response_json) diff --git a/seqr/views/apis/summary_data_api_tests.py b/seqr/views/apis/summary_data_api_tests.py index 260337ddba..b02346fdd5 100644 --- a/seqr/views/apis/summary_data_api_tests.py +++ b/seqr/views/apis/summary_data_api_tests.py @@ -25,7 +25,7 @@ SAVED_VARIANT_RESPONSE_KEYS = { 'projectsByGuid', 'locusListsByGuid', 'savedVariantsByGuid', 'variantFunctionalDataByGuid', 'genesById', 'variantNotesByGuid', 'individualsByGuid', 'variantTagsByGuid', 'familiesByGuid', 'familyNotesByGuid', - 'mmeSubmissionsByGuid', 'phenotypeGeneScores', + 'mmeSubmissionsByGuid', } diff --git a/seqr/views/utils/orm_to_json_utils.py b/seqr/views/utils/orm_to_json_utils.py index 4f762c9a77..39ccfbde8c 100644 --- a/seqr/views/utils/orm_to_json_utils.py +++ b/seqr/views/utils/orm_to_json_utils.py @@ -867,10 +867,3 @@ def _process_result(data, model): data['isSignificant'] = data['pAdjust'] < model.SIGNIFICANCE_THRESHOLD return _get_json_for_models(models, process_result=_process_result, **kwargs) - - -def get_json_for_phenotype_prioritization(models, **kwargs): - def _process_result(data, model): - data['scores'] = {_to_camel_case(score): value for score, value in data['scores'].items()} - - return 
_get_json_for_models(models, process_result=_process_result, **kwargs) diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index f43f98243d..c5ed9e893a 100644 --- a/seqr/views/utils/variant_utils.py +++ b/seqr/views/utils/variant_utils.py @@ -11,7 +11,7 @@ from seqr.views.utils.json_to_orm_utils import update_model_from_json from seqr.views.utils.orm_to_json_utils import get_json_for_discovery_tags, get_json_for_locus_lists, \ _get_json_for_models, get_json_for_rna_seq_outliers, get_json_for_saved_variants_with_tags, \ - get_json_for_matchmaker_submissions, get_json_for_phenotype_prioritization + get_json_for_matchmaker_submissions from seqr.views.utils.permissions_utils import has_case_review_permissions, user_is_analyst from seqr.views.utils.project_context_utils import add_project_tag_types, add_families_context from settings import REDIS_SERVICE_HOSTNAME, REDIS_SERVICE_PORT @@ -177,8 +177,7 @@ def _add_pa_detail(locus_list_gene, locus_list_guid, gene_json): def get_variants_response(request, saved_variants, response_variants=None, add_all_context=False, include_igv=True, - add_locus_list_detail=False, include_rna_seq=True, include_project_name=False, - include_phenotype_prioritization=True): + add_locus_list_detail=False, include_individual_gene_scores=True, include_project_name=False): response = get_json_for_saved_variants_with_tags(saved_variants, add_details=True) variants = list(response['savedVariantsByGuid'].values()) if response_variants is None else response_variants @@ -230,13 +229,12 @@ def get_variants_response(request, saved_variants, response_variants=None, add_a has_case_review_perm=bool(project) and has_case_review_permissions(project, request.user), include_igv=include_igv, ) - if include_rna_seq: + if include_individual_gene_scores: response['rnaSeqData'] = _get_rna_seq_outliers(genes.keys(), families) families_by_guid = response.get('familiesByGuid') if families_by_guid: _add_family_has_rna_tpm(families_by_guid) - if include_phenotype_prioritization: response['phenotypeGeneScores'] = _get_phenotype_prioritization(genes.keys(), families) return response diff --git a/ui/shared/components/panel/fixtures.js b/ui/shared/components/panel/fixtures.js index a95cdb6cb0..405c0485dd 100644 --- a/ui/shared/components/panel/fixtures.js +++ b/ui/shared/components/panel/fixtures.js @@ -333,6 +333,7 @@ export const STATE1 = { tpms: { ENSG00000228198: { tpm: 1.03, geneId: 'ENSG00000228198' } }, }, }, + phenotypeGeneScoresByIndividual: {}, mmeSubmissionsByGuid: {}, project: { createdDate: '2016-05-16T05:37:08.634Z', diff --git a/ui/shared/components/panel/variants/VariantGene.jsx b/ui/shared/components/panel/variants/VariantGene.jsx index 17f886ee39..2f632f7158 100644 --- a/ui/shared/components/panel/variants/VariantGene.jsx +++ b/ui/shared/components/panel/variants/VariantGene.jsx @@ -12,7 +12,7 @@ import { MISSENSE_THRESHHOLD, LOF_THRESHHOLD, PANEL_APP_CONFIDENCE_LEVEL_COLORS, PANEL_APP_CONFIDENCE_DESCRIPTION, } from '../../../utils/constants' import { compareObjects } from '../../../utils/sortUtils' -import { camelcaseToTitlecase, snakecaseToTitlecase } from '../../../utils/stringUtils' +import { camelcaseToTitlecase } from '../../../utils/stringUtils' import { HorizontalSpacer, VerticalSpacer } from '../../Spacers' import { InlineHeader, NoBorderTable, ButtonLink, ColoredLabel } from '../../StyledComponents' import { GeneSearchLink } from '../../buttons/SearchResultsLink' @@ -315,21 +315,35 @@ const GENE_DISEASE_DETAIL_SECTIONS = [ }, ] +const 
RNA_SEQ_DETAIL_FIELDS = ['zScore', 'pValue', 'pAdjust'] + +const INDIVIDUAL_NAME_COLUMN = { name: 'individualName', content: '', width: 3, format: ({ individualName }) => ({individualName}) } + const RNA_SEQ_COLUMNS = [ - { name: 'individual', content: '', width: 3 }, - { name: 'zScore', content: 'Z-Score', width: 3, format: ({ zScore }) => (zScore ? zScore.toPrecision(3) : null) }, - { name: 'pValue', content: 'P-Value', width: 3, format: ({ pValue }) => (pValue ? pValue.toPrecision(3) : null) }, - { name: 'pAdjust', content: 'P-Adjust', width: 3, format: ({ pAdjust }) => (pAdjust ? pAdjust.toPrecision(3) : null) }, + INDIVIDUAL_NAME_COLUMN, + ...RNA_SEQ_DETAIL_FIELDS.map(name => ( + { name, content: camelcaseToTitlecase(name).replace(' ', '-'), format: row => row[name].toPrecision(3) } + )), ] const PHENOTYPE_GENE_INFO_COLUMNS = [ - { name: 'individual', content: '', width: 3 }, - { name: 'rank', content: 'Rank', width: 3 }, + INDIVIDUAL_NAME_COLUMN, { name: 'diseaseName', content: 'Disease', width: 3, format: ({ diseaseName, diseaseId }) => `${diseaseName} (${diseaseId})` }, + { name: 'rank', content: 'Rank', width: 3 }, + { + name: 'scores', + content: 'Scores', + width: 12, + format: ({ scores }) => Object.keys(scores).sort().map(scoreName => ( +
+ {camelcaseToTitlecase(scoreName).replace(' ', '-')} + :   + { scores[scoreName].toPrecision(3) } +
+ )), + }, ] -const PHENOTYPE_GENE_SCORE_COLUMNS = {} - const GENE_DETAIL_SECTIONS = [ { color: 'red', @@ -386,43 +400,34 @@ const GENE_DETAIL_SECTIONS = [ label: 'RNA-Seq', showDetails: (gene, { rnaSeqData }) => rnaSeqData && rnaSeqData[gene.geneId], detailsDisplay: (gene, { rnaSeqData }) => ( - +
+ This gene is flagged as an outlier for RNA-Seq in the following samples + +
), }, { color: 'orange', description: 'Phenotype Prioritization', - lable: 'PhenotypeGene', showDetails: (gene, { phenotypeGeneScores }) => phenotypeGeneScores && phenotypeGeneScores[gene.geneId], detailsDisplay: (gene, { phenotypeGeneScores }) => (Object.entries(phenotypeGeneScores[gene.geneId]).map( - ([tool, data]) => { - PHENOTYPE_GENE_SCORE_COLUMNS[tool] = [ - ...PHENOTYPE_GENE_INFO_COLUMNS, - ...Object.keys(data[0].scores).map(score => ({ - name: score, - content: snakecaseToTitlecase(camelcaseToTitlecase(score)).replace(' ', '-'), - width: 3, - format: ({ scores }) => (scores[score].toPrecision(3)), - })), - ] - return ([ - tool, - ( - - ), - ]) - }, + ([tool, data]) => ({ + label: tool.toUpperCase(), + detail: ( + + ), + }), )), }, ] @@ -458,9 +463,9 @@ const getDetailSections = (configs, gene, compact, labelProps, individualGeneDat ).reduce((acc, config) => (Array.isArray(config.detail) ? [ ...acc, - ...config.detail.map(([tool, detail]) => ({ ...config, label: tool.toUpperCase(), detail })), - ] : [...acc, config]), -[]).filter(({ detail }) => detail).map(({ detail, expandedDisplay, ...sectionConfig }) => ( + ...config.detail.map(detail => ({ ...config, ...detail })), + ] : (config.detail && [...acc, config]) || acc), +[]).map(({ detail, expandedDisplay, ...sectionConfig }) => ( (expandedDisplay && !compact) ? ( diff --git a/ui/shared/components/panel/variants/selectors.js b/ui/shared/components/panel/variants/selectors.js index 15a9871c31..d0cf64aba1 100644 --- a/ui/shared/components/panel/variants/selectors.js +++ b/ui/shared/components/panel/variants/selectors.js @@ -24,31 +24,31 @@ export const getIndividualGeneDataByFamilyGene = createSelector( getPhenotypeGeneScoresByIndividual, (individualsByGuid, rnaSeqDataByIndividual, phenotypeGeneScoresByIndividual) => ( Object.entries(individualsByGuid).reduce((acc, [individualGuid, { familyGuid, displayName }]) => { - const rnaSeqData = rnaSeqDataByIndividual && rnaSeqDataByIndividual[individualGuid]?.outliers - const phenotypeGeneScores = phenotypeGeneScoresByIndividual && phenotypeGeneScoresByIndividual[individualGuid] - if (!rnaSeqData && !phenotypeGeneScores) { - return acc + const rnaSeqData = rnaSeqDataByIndividual[individualGuid]?.outliers + const phenotypeGeneScores = phenotypeGeneScoresByIndividual[individualGuid] + acc[familyGuid] = acc[familyGuid] || {} + if (rnaSeqData) { + acc[familyGuid].rnaSeqData = Object.entries(rnaSeqData || {}).reduce( + (acc2, [geneId, data]) => (data.isSignificant ? { + ...acc2, + [geneId]: [...(acc2[geneId] || []), { ...data, individualName: displayName }], + } : acc2), acc[familyGuid]?.rnaSeqData || {}, + ) } - return { - ...acc, - [familyGuid]: { - rnaSeqData: Object.entries(rnaSeqData || {}).reduce( - (acc2, [geneId, data]) => (data.isSignificant ? 
{ - ...acc2, - [geneId]: [...(acc2[geneId] || []), { ...data, individual: displayName }], - } : acc2), acc[familyGuid]?.rnaSeqData || {}, - ), - phenotypeGeneScores: Object.entries(phenotypeGeneScores || {}).reduce( - (acc2, [geneId, dataByTool]) => ({ - ...acc2, - [geneId]: Object.entries(dataByTool).reduce((acc3, [tool, data]) => ({ - ...acc3, - [tool]: [...(acc3[tool] || []), ...data.map(d => ({ ...d, individual: displayName }))], - }), acc2[geneId] || {}), - }), acc[familyGuid]?.phenotypeGeneScores || {}, - ), - }, + if (phenotypeGeneScores) { + acc[familyGuid].phenotypeGeneScores = Object.entries(phenotypeGeneScores || {}).reduce( + (acc2, [geneId, dataByTool]) => ({ + ...acc2, + [geneId]: Object.entries(dataByTool).reduce((acc3, [tool, data]) => ({ + ...acc3, + [tool]: [...(acc3[tool] || []), ...data.map(d => ({ + ...d, individualName: displayName, rowId: `${displayName}-${d.diseaseId}`, + }))], + }), acc2[geneId] || {}), + }), acc[familyGuid]?.phenotypeGeneScores || {}, + ) } + return acc }, {}) ), ) diff --git a/ui/shared/components/panel/variants/selectors.test.js b/ui/shared/components/panel/variants/selectors.test.js index ddb45c94e6..6e0580cc00 100644 --- a/ui/shared/components/panel/variants/selectors.test.js +++ b/ui/shared/components/panel/variants/selectors.test.js @@ -110,20 +110,21 @@ test('getIndividualGeneDataByFamilyGene', () => { F011652_1: { rnaSeqData: { ENSG00000228198: [ - { individual: 'NA19678', isSignificant: true, pValue: 0.0004 }, - { individual: 'NA19679_1', isSignificant: true, pValue: 0.01 }, + { individualName: 'NA19678', isSignificant: true, pValue: 0.0004 }, + { individualName: 'NA19679_1', isSignificant: true, pValue: 0.01 }, ], ENSG00000164458: [ - { individual: 'NA19678', isSignificant: true, pValue: 0.0073 }, + { individualName: 'NA19678', isSignificant: true, pValue: 0.0073 }, ], }, phenotypeGeneScores: { ENSG00000228198: { lirical: [{ - individual: 'NA19678', + individualName: 'NA19678', diseaseId: 'OMIM:618460', diseaseName: 'Khan-Khan-Katsanis syndrome', geneId: 'ENSG00000228198', + rowId: 'NA19678-OMIM:618460', rank: 1, scores: { compositeLR: 0.066, post_test_probability: 0 }, }], @@ -132,9 +133,8 @@ test('getIndividualGeneDataByFamilyGene', () => { }, F011652_2: { rnaSeqData: { - ENSG00000228198: [{ individual: 'NA19678_2', isSignificant: true, pValue: 0.0214 }], + ENSG00000228198: [{ individualName: 'NA19678_2', isSignificant: true, pValue: 0.0214 }], }, - phenotypeGeneScores: {}, }, }) }) From 1298b5c3acf47631a823f421f6908baa8bec8651 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 31 Oct 2022 10:48:20 -0400 Subject: [PATCH 31/96] update unit tests --- seqr/views/apis/igv_api_tests.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/seqr/views/apis/igv_api_tests.py b/seqr/views/apis/igv_api_tests.py index 6d4d582fed..1d49b8a931 100644 --- a/seqr/views/apis/igv_api_tests.py +++ b/seqr/views/apis/igv_api_tests.py @@ -217,27 +217,28 @@ def test_add_alignment_sample(self, mock_local_file_exists, mock_subprocess): self.assertEqual(response.status_code, 200) @responses.activate - def test_igv_genomes_proxyy(self): - url_path = 'org.genomes/foo?query=true' - url = reverse(igv_genomes_proxy, args=[url_path]) + def test_igv_genomes_proxy(self): + url_path = 'igv.org.genomes/foo?query=true' + s3_url = reverse(igv_genomes_proxy, args=['s3', url_path]) expected_body = {'genes': ['GENE1', 'GENE2']} responses.add( responses.GET, 'https://s3.amazonaws.com/igv.org.genomes/foo?query=true', match_querystring=True, 
content_type='application/json', body=json.dumps(expected_body)) - response = self.client.get(url) + response = self.client.get(s3_url) self.assertEqual(response.status_code, 200) self.assertDictEqual(json.loads(response.content), expected_body) self.assertIsNone(responses.calls[0].request.headers.get('Range')) # test with range header proxy + gs_url = reverse(igv_genomes_proxy, args=['gs', 'test-bucket/foo.fasta']) expected_content = 'test file content' - responses.replace( - responses.GET, 'https://s3.amazonaws.com/igv.org.genomes/foo?query=true', match_querystring=True, + responses.add( + responses.GET, 'https://storage.googleapis.com/test-bucket/foo.fasta', match_querystring=True, body=expected_content) - response = self.client.get(url, HTTP_RANGE='bytes=100-200') + response = self.client.get(gs_url, HTTP_RANGE='bytes=100-200') self.assertEqual(response.status_code, 200) self.assertEqual(response.content.decode(), expected_content) self.assertEqual(responses.calls[1].request.headers.get('Range'), 'bytes=100-200') From 3db96d3f60c578ed92d0048e2182e2ee1bcf4473 Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Mon, 31 Oct 2022 10:59:20 -0400 Subject: [PATCH 32/96] Adjust column width. --- ui/shared/components/panel/variants/VariantGene.jsx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ui/shared/components/panel/variants/VariantGene.jsx b/ui/shared/components/panel/variants/VariantGene.jsx index 2f632f7158..de97455ca3 100644 --- a/ui/shared/components/panel/variants/VariantGene.jsx +++ b/ui/shared/components/panel/variants/VariantGene.jsx @@ -317,7 +317,7 @@ const GENE_DISEASE_DETAIL_SECTIONS = [ const RNA_SEQ_DETAIL_FIELDS = ['zScore', 'pValue', 'pAdjust'] -const INDIVIDUAL_NAME_COLUMN = { name: 'individualName', content: '', width: 3, format: ({ individualName }) => ({individualName}) } +const INDIVIDUAL_NAME_COLUMN = { name: 'individualName', content: '', format: ({ individualName }) => ({individualName}) } const RNA_SEQ_COLUMNS = [ INDIVIDUAL_NAME_COLUMN, @@ -328,8 +328,8 @@ const RNA_SEQ_COLUMNS = [ const PHENOTYPE_GENE_INFO_COLUMNS = [ INDIVIDUAL_NAME_COLUMN, - { name: 'diseaseName', content: 'Disease', width: 3, format: ({ diseaseName, diseaseId }) => `${diseaseName} (${diseaseId})` }, - { name: 'rank', content: 'Rank', width: 3 }, + { name: 'diseaseName', content: 'Disease', format: ({ diseaseName, diseaseId }) => `${diseaseName} (${diseaseId})` }, + { name: 'rank', content: 'Rank' }, { name: 'scores', content: 'Scores', From 842556f882aab51ac1ec4cfa7bab094da2ca6ca5 Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Mon, 31 Oct 2022 11:03:41 -0400 Subject: [PATCH 33/96] Update the score names while loading data. 
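Score names arrive in a mix of snake case (post_test_probability) and camel case (compositeLR), so the loader now normalizes each name to snake case and back to camel case before storing it. A rough standalone equivalent of that normalization; the real _to_snake_case/_to_camel_case helpers in seqr may handle edge cases differently:

    import re

    def to_snake_case(name):
        return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()

    def to_camel_case(name):
        head, *rest = name.split('_')
        return head + ''.join(part.capitalize() for part in rest)

    def normalize_score_name(name):
        return to_camel_case(to_snake_case(name))

    assert normalize_score_name('post_test_probability') == 'postTestProbability'
    # With this sketch's regex an all-caps suffix round-trips unchanged:
    assert normalize_score_name('compositeLR') == 'compositeLR'
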
--- seqr/views/utils/dataset_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 4cfd0241c7..f57edbcbe3 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -11,7 +11,7 @@ from seqr.utils.logging_utils import log_model_bulk_update, SeqrLogger from seqr.views.utils.file_utils import parse_file from seqr.views.utils.permissions_utils import get_internal_projects -from seqr.views.utils.json_utils import _to_snake_case +from seqr.views.utils.json_utils import _to_snake_case, _to_camel_case logger = SeqrLogger(__name__) @@ -460,7 +460,7 @@ def _parse_phenotype_pri_row(row): score_name = row.get(f'scoreName{i}') if not score_name: break - scores[score_name] = float(row[f'score{i}']) + scores[_to_camel_case(_to_snake_case(score_name))] = float(row[f'score{i}']) record['scores'] = scores yield record From 313c3fa2a751caa58b6810300f696193a38d334e Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 31 Oct 2022 11:34:54 -0400 Subject: [PATCH 34/96] remove 1kg from server search --- seqr/utils/elasticsearch/constants.py | 5 +---- seqr/utils/elasticsearch/es_search.py | 1 + seqr/utils/elasticsearch/es_utils_tests.py | 23 ---------------------- seqr/views/utils/test_utils.py | 10 ---------- 4 files changed, 2 insertions(+), 37 deletions(-) diff --git a/seqr/utils/elasticsearch/constants.py b/seqr/utils/elasticsearch/constants.py index 2c6c5a6496..0016a1a6b9 100644 --- a/seqr/utils/elasticsearch/constants.py +++ b/seqr/utils/elasticsearch/constants.py @@ -90,9 +90,6 @@ 'filter_AF': [], 'Het': None, }, - 'g1k': { - 'filter_AF': ['g1k_POPMAX_AF'], - }, 'exac': { 'filter_AF': ['exac_AF_POPMAX'], 'AC': 'exac_AC_Adj', @@ -259,7 +256,7 @@ 'source': "doc.containsKey(params.field) ? (doc[params.field].empty ? 
0 : doc[params.field].value) : 1" } } - }] for sort, pop_key in {'gnomad': 'gnomad_genomes', 'gnomad_exomes': 'gnomad_exomes', '1kg': 'g1k', 'callset_af': 'callset'}.items()} + }] for sort, pop_key in {'gnomad': 'gnomad_genomes', 'gnomad_exomes': 'gnomad_exomes', 'callset_af': 'callset'}.items()} SORT_FIELDS.update(POPULATION_SORTS) PREDICTOR_SORT_FIELDS = { 'cadd': 'cadd_PHRED', diff --git a/seqr/utils/elasticsearch/es_search.py b/seqr/utils/elasticsearch/es_search.py index 2fde3d884b..2eda40e1c6 100644 --- a/seqr/utils/elasticsearch/es_search.py +++ b/seqr/utils/elasticsearch/es_search.py @@ -325,6 +325,7 @@ def _filter_by_in_silico(self, in_silico_filters): self._filter(_in_silico_filter(in_silico_filters)) def _filter_by_frequency(self, frequencies, clinvar_terms=None): + frequencies = {pop: v for pop, v in (frequencies or {}).items() if pop in POPULATIONS} if not frequencies: return diff --git a/seqr/utils/elasticsearch/es_utils_tests.py b/seqr/utils/elasticsearch/es_utils_tests.py index 0e6bcef1aa..2a26d66a2b 100644 --- a/seqr/utils/elasticsearch/es_utils_tests.py +++ b/seqr/utils/elasticsearch/es_utils_tests.py @@ -798,14 +798,6 @@ 'AC', 'AF', 'AN', - 'g1k_AC', - 'g1k_Hom', - 'g1k_Hemi', - 'g1k_POPMAX_AF', - 'g1k_AF', - 'g1k_AN', - 'g1k_Het', - 'g1k_ID', 'gnomad_genomes_AC', 'gnomad_genomes_Hom', 'gnomad_genomes_Hemi', @@ -927,7 +919,6 @@ MITO_SOURCE_ONLY_FIELDS = [ 'callset_max_hl', 'exac_max_hl', - 'g1k_max_hl', 'gnomad_exomes_max_hl', 'gnomad_genomes_max_hl', 'gnomad_svs_max_hl', @@ -1656,13 +1647,6 @@ def test_filtered_get_es_variants(self): {'range': {'exac_AC_Adj': {'lte': 2}}} ]} }, - {'bool': { - 'minimum_should_match': 1, - 'should': [ - {'bool': {'must_not': [{'exists': {'field': 'g1k_POPMAX_AF'}}]}}, - {'range': {'g1k_POPMAX_AF': {'lte': 0.001}}} - ] - }}, {'bool': { 'minimum_should_match': 1, 'should': [ @@ -1721,13 +1705,6 @@ def test_filtered_get_es_variants(self): {'range': {'AF': {'lte': 0.1}}} ], 'must': [ - {'bool': { - 'minimum_should_match': 1, - 'should': [ - {'bool': {'must_not': [{'exists': {'field': 'g1k_POPMAX_AF'}}]}}, - {'range': {'g1k_POPMAX_AF': {'lte': 0.05}}} - ] - }}, {'bool': { 'minimum_should_match': 1, 'should': [ diff --git a/seqr/views/utils/test_utils.py b/seqr/views/utils/test_utils.py index d0e72f3f76..c927f935a1 100644 --- a/seqr/views/utils/test_utils.py +++ b/seqr/views/utils/test_utils.py @@ -868,8 +868,6 @@ def call_request_json(self, index=-1): 'populations': { 'callset': {'an': 32, 'ac': 2, 'hom': None, 'af': 0.063, 'hemi': None, 'filter_af': None, 'het': None, 'id': None, 'max_hl': None}, - 'g1k': {'an': 0, 'ac': 0, 'hom': 0, 'af': 0.0, 'hemi': 0, 'filter_af': None, 'het': 0, 'id': None, - 'max_hl': None}, 'gnomad_genomes': {'an': 30946, 'ac': 4, 'hom': 0, 'af': 0.00012925741614425127, 'hemi': 0, 'filter_af': 0.0004590314436538903, 'het': 0, 'id': None, 'max_hl': None}, 'exac': {'an': 121308, 'ac': 8, 'hom': 0, 'af': 0.00006589, 'hemi': 0, 'filter_af': 0.0006726888333653661, @@ -952,8 +950,6 @@ def call_request_json(self, index=-1): 'populations': { 'callset': {'an': 32, 'ac': 1, 'hom': None, 'af': 0.031, 'hemi': None, 'filter_af': None, 'het': None, 'id': None, 'max_hl': None}, - 'g1k': {'an': 0, 'ac': 0, 'hom': 0, 'af': 0.0, 'hemi': 0, 'filter_af': None, 'het': 0, 'id': None, - 'max_hl': None}, 'gnomad_genomes': {'an': 0, 'ac': 0, 'hom': 0, 'af': 0.0, 'hemi': 0, 'filter_af': None, 'het': 0, 'id': None, 'max_hl': None}, 'exac': {'an': 121336, 'ac': 6, 'hom': 0, 'af': 0.00004942, 'hemi': 0, 'filter_af': 0.000242306760358614, @@ -1039,8 
+1035,6 @@ def call_request_json(self, index=-1): 'populations': { 'callset': {'an': None, 'ac': None, 'hom': None, 'af': None, 'hemi': None, 'filter_af': None, 'het': None, 'id': None, 'max_hl': None}, - 'g1k': {'an': None, 'ac': None, 'hom': None, 'af': None, 'hemi': None, 'filter_af': None, 'het': None, - 'id': None, 'max_hl': None}, 'gnomad_genomes': {'an': None, 'ac': None, 'hom': None, 'af': None, 'hemi': None, 'filter_af': None, 'het': None, 'id': None, 'max_hl': None}, 'exac': {'an': None, 'ac': None, 'hom': None, 'af': None, 'hemi': None, 'filter_af': None, 'het': None, @@ -1126,8 +1120,6 @@ def call_request_json(self, index=-1): 'populations': { 'callset': {'an': None, 'ac': None, 'hom': None, 'af': None, 'hemi': None, 'filter_af': None, 'het': None, 'id': None, 'max_hl': None}, - 'g1k': {'an': None, 'ac': None, 'hom': None, 'af': None, 'hemi': None, 'filter_af': None, 'het': None, - 'id': None, 'max_hl': None}, 'gnomad_genomes': {'an': None, 'ac': None, 'hom': None, 'af': None, 'hemi': None, 'filter_af': None, 'het': None, 'id': None, 'max_hl': None}, 'exac': {'an': None, 'ac': None, 'hom': None, 'af': None, 'hemi': None, 'filter_af': None, 'het': None, @@ -1207,8 +1199,6 @@ def call_request_json(self, index=-1): 'het': None, 'hom': None, 'id': None, 'max_hl': None}, 'exac': {'ac': None, 'af': None, 'an': None, 'filter_af': None, 'hemi': None, 'het': None, 'hom': None, 'id': None, 'max_hl': None}, - 'g1k': {'ac': None, 'af': None, 'an': None, 'filter_af': None, 'hemi': None, - 'het': None, 'hom': None, 'id': None, 'max_hl': None}, 'gnomad_exomes': {'ac': None, 'af': None, 'an': None, 'filter_af': None, 'hemi': None, 'het': None, 'hom': None, 'id': None, 'max_hl': None}, 'gnomad_genomes': {'ac': None, 'af': None, 'an': None, 'filter_af': None, From 57d9a0aedc8a04aaa7e9819766016bee75aa213c Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 31 Oct 2022 11:37:46 -0400 Subject: [PATCH 35/96] remove 1kg from variant client side --- ui/shared/components/panel/variants/Frequencies.jsx | 1 - ui/shared/utils/constants.js | 3 --- 2 files changed, 4 deletions(-) diff --git a/ui/shared/components/panel/variants/Frequencies.jsx b/ui/shared/components/panel/variants/Frequencies.jsx index 1194ebdca0..043a4aa252 100644 --- a/ui/shared/components/panel/variants/Frequencies.jsx +++ b/ui/shared/components/panel/variants/Frequencies.jsx @@ -162,7 +162,6 @@ const CALLSET_POP = { field: 'callset', fieldTitle: 'This Callset', acDisplay: ' const POPULATIONS = [ { field: 'sv_callset', fieldTitle: 'This Callset', acDisplay: 'AC', helpMessage: SV_CALLSET_CRITERIA_MESSAGE }, CALLSET_POP, - { field: 'g1k', fieldTitle: '1kg WGS' }, { field: 'exac', fieldTitle: 'ExAC', diff --git a/ui/shared/utils/constants.js b/ui/shared/utils/constants.js index 03c1c59cf2..02daab7de0 100644 --- a/ui/shared/utils/constants.js +++ b/ui/shared/utils/constants.js @@ -957,7 +957,6 @@ const SORT_BY_PROTEIN_CONSQ = 'PROTEIN_CONSEQUENCE' const SORT_BY_GNOMAD_GENOMES = 'GNOMAD' const SORT_BY_GNOMAD_EXOMES = 'GNOMAD_EXOMES' const SORT_BY_CALLSET_AF = 'CALLSET_AF' -const SORT_BY_1KG = '1KG' const SORT_BY_CONSTRAINT = 'CONSTRAINT' const SORT_BY_CADD = 'CADD' const SORT_BY_REVEL = 'REVEL' @@ -1023,7 +1022,6 @@ const VARIANT_SORT_OPTONS = [ { value: SORT_BY_GNOMAD_GENOMES, text: 'gnomAD Genomes Frequency', comparator: populationComparator('gnomad_genomes') }, { value: SORT_BY_GNOMAD_EXOMES, text: 'gnomAD Exomes Frequency', comparator: populationComparator('gnomad_exomes') }, { value: SORT_BY_CALLSET_AF, text: 'Callset AF', comparator: 
populationComparator('callset') }, - { value: SORT_BY_1KG, text: '1kg Frequency', comparator: populationComparator('g1k') }, { value: SORT_BY_CADD, text: 'Cadd', comparator: predictionComparator('cadd') }, { value: SORT_BY_REVEL, text: 'Revel', comparator: predictionComparator('revel') }, { value: SORT_BY_EIGEN, text: 'Eigen', comparator: predictionComparator('eigen') }, @@ -1225,7 +1223,6 @@ export const VARIANT_EXPORT_DATA = [ { header: 'alt' }, { header: 'gene', getVal: variant => getVariantMainTranscript(variant).geneSymbol }, { header: 'worst_consequence', getVal: variant => getVariantMainTranscript(variant).majorConsequence }, - { header: '1kg_freq', getVal: getPopAf('g1k') }, { header: 'exac_freq', getVal: getPopAf('exac') }, { header: 'gnomad_genomes_freq', getVal: getPopAf('gnomad_genomes') }, { header: 'gnomad_exomes_freq', getVal: getPopAf('gnomad_exomes') }, From 568b8516ee96f4b66317ad3373e05e50bb8bab7b Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Mon, 31 Oct 2022 13:25:02 -0400 Subject: [PATCH 36/96] Add a comment. --- seqr/views/utils/dataset_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index f57edbcbe3..0537417a68 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -460,6 +460,8 @@ def _parse_phenotype_pri_row(row): score_name = row.get(f'scoreName{i}') if not score_name: break + # We have both camel case and snake case in the score field names, so convert them to snake case first (those + # in snake case kept unchanged), then to camel case. scores[_to_camel_case(_to_snake_case(score_name))] = float(row[f'score{i}']) record['scores'] = scores From 251b8a8838091cecef1a72ef68bbd5aceae2579c Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Mon, 31 Oct 2022 16:50:01 -0400 Subject: [PATCH 37/96] Update the column widths for the datatable. --- ui/shared/components/panel/variants/VariantGene.jsx | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/ui/shared/components/panel/variants/VariantGene.jsx b/ui/shared/components/panel/variants/VariantGene.jsx index de97455ca3..5aa9341c19 100644 --- a/ui/shared/components/panel/variants/VariantGene.jsx +++ b/ui/shared/components/panel/variants/VariantGene.jsx @@ -90,7 +90,7 @@ const LocusListsContainer = styled.div` const GeneLabel = React.memo(({ popupHeader, popupContent, showEmpty, ...labelProps }) => { const content = return (popupContent || showEmpty) ? - : content + : content }) GeneLabel.propTypes = { @@ -327,13 +327,13 @@ const RNA_SEQ_COLUMNS = [ ] const PHENOTYPE_GENE_INFO_COLUMNS = [ - INDIVIDUAL_NAME_COLUMN, - { name: 'diseaseName', content: 'Disease', format: ({ diseaseName, diseaseId }) => `${diseaseName} (${diseaseId})` }, - { name: 'rank', content: 'Rank' }, + { ...INDIVIDUAL_NAME_COLUMN, width: 4 }, + { name: 'diseaseName', content: 'Disease', width: 5, format: ({ diseaseName, diseaseId }) => `${diseaseName} (${diseaseId})` }, + { name: 'rank', content: 'Rank', width: 1 }, { name: 'scores', content: 'Scores', - width: 12, + width: 6, format: ({ scores }) => Object.keys(scores).sort().map(scoreName => (
{camelcaseToTitlecase(scoreName).replace(' ', '-')} @@ -422,6 +422,8 @@ const GENE_DETAIL_SECTIONS = [ Date: Mon, 31 Oct 2022 16:50:21 -0400 Subject: [PATCH 38/96] update transcript models --- .../management/commands/update_gencode.py | 14 ++++++++-- .../migrations/0021_auto_20221031_2049.py | 27 +++++++++++++++++++ reference_data/models.py | 7 ++++- 3 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 reference_data/migrations/0021_auto_20221031_2049.py diff --git a/reference_data/management/commands/update_gencode.py b/reference_data/management/commands/update_gencode.py index b8e4088ba0..fbd70b4495 100644 --- a/reference_data/management/commands/update_gencode.py +++ b/reference_data/management/commands/update_gencode.py @@ -135,8 +135,16 @@ def _parse_line(line, i, new_genes, new_transcripts, existing_gene_ids, existin # parse info field info_fields = [x.strip().split() for x in record['info'].split(';') if x != ''] - info_fields = {k: v.strip('"') for k, v in info_fields} - record.update(info_fields) + info_dict = {} + for k, v in info_fields: + v = v.strip('"') + if k == 'tag': + if k not in info_dict: + info_dict[k] = [] + info_dict[k].append(v) + else: + info_dict[k] = v + record.update(info_dict) record['gene_id'] = record['gene_id'].split('.')[0] if 'transcript_id' in record: @@ -179,6 +187,8 @@ def _parse_line(line, i, new_genes, new_transcripts, existing_gene_ids, existin "end_grch{}".format(genome_version): record["end"], "strand_grch{}".format(genome_version): record["strand"], }) + if 'MANE_Select' in record.get('tag', []): + new_transcripts[record['transcript_id']]['is_mane_select'] = True elif record['feature_type'] == 'CDS': if record["transcript_id"] in existing_transcript_ids: diff --git a/reference_data/migrations/0021_auto_20221031_2049.py b/reference_data/migrations/0021_auto_20221031_2049.py new file mode 100644 index 0000000000..f1052679b2 --- /dev/null +++ b/reference_data/migrations/0021_auto_20221031_2049.py @@ -0,0 +1,27 @@ +# Generated by Django 3.2.16 on 2022-10-31 20:49 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('reference_data', '0020_clingen'), + ] + + operations = [ + migrations.AddField( + model_name='transcriptinfo', + name='is_mane_select', + field=models.BooleanField(default=False), + ), + migrations.CreateModel( + name='RefseqTranscript', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('refseq_id', models.CharField(max_length=20)), + ('transcript', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='reference_data.transcriptinfo')), + ], + ), + ] diff --git a/reference_data/models.py b/reference_data/models.py index 2c47d353a9..b749e4b2a4 100644 --- a/reference_data/models.py +++ b/reference_data/models.py @@ -113,7 +113,7 @@ class TranscriptInfo(models.Model): gene = models.ForeignKey(GeneInfo, on_delete=models.CASCADE) transcript_id = models.CharField(max_length=20, db_index=True, unique=True) # without the version suffix - #protein_id = models.CharField(max_length=20, null=True) + is_mane_select = models.BooleanField(default=False) chrom_grch37 = models.CharField(max_length=2, null=True, blank=True) start_grch37 = models.IntegerField(null=True, blank=True) @@ -128,6 +128,11 @@ class TranscriptInfo(models.Model): coding_region_size_grch38 = models.IntegerField(default=0) # number of protein-coding bases (= 0 for non-coding genes) +class 
RefseqTranscript(models.Model): + transcript = models.ForeignKey(TranscriptInfo, on_delete=models.CASCADE) + refseq_id = models.CharField(max_length=20) + + # based on # ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3.1/functional_gene_constraint/fordist_cleaned_exac_r03_march16_z_pli_rec_null_data.txt class GeneConstraint(models.Model): gene = models.ForeignKey(GeneInfo, on_delete=models.CASCADE) From ae366e83869294f7d543f2432acb4967047a85af Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 31 Oct 2022 16:53:27 -0400 Subject: [PATCH 39/96] update tests --- reference_data/management/tests/update_gencode_tests.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/reference_data/management/tests/update_gencode_tests.py b/reference_data/management/tests/update_gencode_tests.py index dd8e21cf55..0b42be2b8a 100644 --- a/reference_data/management/tests/update_gencode_tests.py +++ b/reference_data/management/tests/update_gencode_tests.py @@ -26,7 +26,7 @@ 'chr1 HAVANA exon 11869 12227 . + . gene_id "ENSG00000223972.5_2"; transcript_id "ENST00000456328.2_1"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "lncRNA"; transcript_name "DDX11L1-202"; exon_number 1; exon_id "ENSE00002234944.1_1"; level 2; transcript_support_level 1; hgnc_id "HGNC:37102"; tag "basic"; havana_gene "OTTHUMG00000000961.2_2"; havana_transcript "OTTHUMT00000362751.1_1"; remap_original_location "chr1:+:11869-12227"; remap_status "full_contig";\n', # Not existing gene_id 'chr1 HAVANA gene 621059 622053 . - . gene_id "ENSG00000284662.1_2"; gene_type "protein_coding"; gene_name "OR4F16"; level 2; hgnc_id "HGNC:15079"; havana_gene "OTTHUMG00000002581.3_2"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";\n', - 'chr1 HAVANA transcript 621059 622053 . - . gene_id "ENSG00000284662.1_2"; transcript_id "ENST00000332831.4_2"; gene_type "protein_coding"; gene_name "OR4F16"; transcript_type "protein_coding"; transcript_name "OR4F16-201"; level 2; protein_id "ENSP00000329982.2"; transcript_support_level "NA"; hgnc_id "HGNC:15079"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS41221.1"; havana_gene "OTTHUMG00000002581.3_2"; havana_transcript "OTTHUMT00000007334.3_2"; remap_num_mappings 1; remap_status "full_contig"; remap_target_status "overlap";\n', + 'chr1 HAVANA transcript 621059 622053 . - . gene_id "ENSG00000284662.1_2"; transcript_id "ENST00000332831.4_2"; gene_type "protein_coding"; gene_name "OR4F16"; transcript_type "protein_coding"; transcript_name "OR4F16-201"; level 2; protein_id "ENSP00000329982.2"; transcript_support_level "NA"; hgnc_id "HGNC:15079"; tag "basic"; tag "MANE_Select"; tag "CCDS"; ccdsid "CCDS41221.1"; havana_gene "OTTHUMG00000002581.3_2"; havana_transcript "OTTHUMT00000007334.3_2"; remap_num_mappings 1; remap_status "full_contig"; remap_target_status "overlap";\n', # feature_type is 'CDS' # gene_id not in existing_gene_ids and transcript_size > ... 'chr1 HAVANA CDS 621099 622034 . 
- 0 gene_id "ENSG00000284662.1_2"; transcript_id "ENST00000332831.4_2"; gene_type "protein_coding"; gene_name "OR4F16"; transcript_type "protein_coding"; transcript_name "OR4F16-201"; exon_number 1; exon_id "ENSE00002324228.3"; level 2; protein_id "ENSP00000329982.2"; transcript_support_level "NA"; hgnc_id "HGNC:15079"; tag "basic"; tag "appris_principal_1"; tag "CCDS"; ccdsid "CCDS41221.1"; havana_gene "OTTHUMG00000002581.3_2"; havana_transcript "OTTHUMT00000007334.3_2"; remap_original_location "chr1:-:685719-686654"; remap_status "full_contig";\n', @@ -178,12 +178,14 @@ def test_update_gencode_command(self, mock_logger): self.assertEqual(TranscriptInfo.objects.all().count(), 2) trans_info = TranscriptInfo.objects.get(transcript_id = 'ENST00000456328') self.assertEqual(trans_info.gene.gene_id, 'ENSG00000223972') + self.assertFalse(trans_info.is_mane_select) trans_info = TranscriptInfo.objects.get(transcript_id = 'ENST00000332831') self.assertEqual(trans_info.start_grch37, 621059) self.assertEqual(trans_info.end_grch37, 622053) self.assertEqual(trans_info.strand_grch37, '-') self.assertEqual(trans_info.chrom_grch37, '1') self.assertEqual(trans_info.gene.gene_id, 'ENSG00000284662') + self.assertTrue(trans_info.is_mane_select) # Test normal command function with a --reset option mock_logger.reset_mock() From c70ba3d287fe9a527a2af50ee3d5b0d565e2edc7 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 31 Oct 2022 17:24:28 -0400 Subject: [PATCH 40/96] move gencode parsing to helper utility --- .../commands/update_all_reference_data.py | 9 +- .../management/commands/update_gencode.py | 145 +--------------- .../commands/utils/gencode_utils.py | 156 ++++++++++++++++++ .../management/tests/update_gencode_tests.py | 13 +- 4 files changed, 170 insertions(+), 153 deletions(-) create mode 100644 reference_data/management/commands/utils/gencode_utils.py diff --git a/reference_data/management/commands/update_all_reference_data.py b/reference_data/management/commands/update_all_reference_data.py index 36df468c6b..e3314c2e9c 100644 --- a/reference_data/management/commands/update_all_reference_data.py +++ b/reference_data/management/commands/update_all_reference_data.py @@ -2,6 +2,7 @@ from collections import OrderedDict from django.core.management.base import BaseCommand +from reference_data.management.commands.utils.gencode_utils import LATEST_GENCODE_RELEASE, OLD_GENCODE_RELEASES from reference_data.management.commands.utils.update_utils import update_records from reference_data.management.commands.update_human_phenotype_ontology import update_hpo from reference_data.management.commands.update_dbnsfp_gene import DbNSFPReferenceDataHandler @@ -52,11 +53,9 @@ def handle(self, *args, **options): if not options["skip_gencode"]: # Download latest version first, and then add any genes from old releases not included in the latest release # Old gene ids are used in the gene constraint table and other datasets, as well as older sequencing data - update_gencode(31, reset=True) - update_gencode(29) - update_gencode(28) - update_gencode(27) - update_gencode(19) + update_gencode(LATEST_GENCODE_RELEASE, reset=True) + for release in OLD_GENCODE_RELEASES: + update_gencode(release) updated.append('gencode') if not options["skip_omim"]: diff --git a/reference_data/management/commands/update_gencode.py b/reference_data/management/commands/update_gencode.py index fbd70b4495..d040903c20 100644 --- a/reference_data/management/commands/update_gencode.py +++ b/reference_data/management/commands/update_gencode.py @@ -1,24 +1,12 
@@ -import collections -import gzip import logging -import os -from tqdm import tqdm -from django.core.management.base import BaseCommand, CommandError +from django.core.management.base import BaseCommand -from reference_data.management.commands.utils.download_utils import download_file +from reference_data.management.commands.utils.gencode_utils import load_gencode_records from reference_data.models import GeneInfo, TranscriptInfo, GENOME_VERSION_GRCh37, GENOME_VERSION_GRCh38 logger = logging.getLogger(__name__) -GENCODE_GTF_URL = "http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_{gencode_release}/gencode.v{gencode_release}.annotation.gtf.gz" -GENCODE_LIFT37_GTF_URL = "http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_{gencode_release}/GRCh37_mapping/gencode.v{gencode_release}lift37.annotation.gtf.gz" - -# expected GTF file header -GENCODE_FILE_HEADER = [ - 'chrom', 'source', 'feature_type', 'start', 'end', 'score', 'strand', 'phase', 'info' -] - class Command(BaseCommand): help = "Loads the GRCh37 and/or GRCh38 versions of the Gencode GTF from a particular Gencode release" @@ -38,35 +26,6 @@ def handle(self, *args, **options): reset=options['reset']) -def _get_valid_gencode_gtf_paths(gencode_release, gencode_gtf_path, genome_version): - if gencode_gtf_path and genome_version and os.path.isfile(gencode_gtf_path): - if gencode_release == 19 and genome_version != GENOME_VERSION_GRCh37: - raise CommandError("Invalid genome_version: {}. gencode v19 only has a GRCh37 version".format(genome_version)) - elif gencode_release <= 22 and genome_version != GENOME_VERSION_GRCh38: - raise CommandError("Invalid genome_version: {}. gencode v20, v21, v22 only have a GRCh38 version".format(genome_version)) - elif genome_version != GENOME_VERSION_GRCh38 and "lift" not in gencode_gtf_path.lower(): - raise CommandError("Invalid genome_version for file: {}. gencode v23 and up must have 'lift' in the filename or genome_version arg must be GRCh38".format(gencode_gtf_path)) - - gencode_gtf_paths = {genome_version: gencode_gtf_path} - elif gencode_gtf_path and not genome_version: - raise CommandError("The genome version must also be specified after the gencode GTF file path") - else: - if gencode_release == 19: - urls = [('37', GENCODE_GTF_URL.format(gencode_release=gencode_release))] - elif gencode_release <= 22: - urls = [('38', GENCODE_GTF_URL.format(gencode_release=gencode_release))] - else: - urls = [ - ('37', GENCODE_LIFT37_GTF_URL.format(gencode_release=gencode_release)), - ('38', GENCODE_GTF_URL.format(gencode_release=gencode_release)), - ] - gencode_gtf_paths = {} - for genome_version, url in urls: - local_filename = download_file(url) - gencode_gtf_paths.update({genome_version: local_filename}) - return gencode_gtf_paths - - def update_gencode(gencode_release, gencode_gtf_path=None, genome_version=None, reset=False): """Update GeneInfo and TranscriptInfo tables. @@ -78,8 +37,6 @@ def update_gencode(gencode_release, gencode_gtf_path=None, genome_version=None, Setting this to False can be useful to sequentially load more than one gencode release so that data in the tables represents the union of multiple gencode releases. 
""" - gencode_gtf_paths = _get_valid_gencode_gtf_paths(gencode_release, gencode_gtf_path, genome_version) - if reset: logger.info("Dropping the {} existing TranscriptInfo entries".format(TranscriptInfo.objects.count())) TranscriptInfo.objects.all().delete() @@ -91,17 +48,8 @@ def update_gencode(gencode_release, gencode_gtf_path=None, genome_version=None, transcript.transcript_id for transcript in TranscriptInfo.objects.all().only('transcript_id') } - counters = collections.defaultdict(int) - new_genes = collections.defaultdict(dict) - new_transcripts = collections.defaultdict(dict) - - for genome_version, gencode_gtf_path in gencode_gtf_paths.items(): - logger.info("Loading {} (genome version: {})".format(gencode_gtf_path, genome_version)) - with gzip.open(gencode_gtf_path, 'rt') as gencode_file: - for i, line in enumerate(tqdm(gencode_file, unit=' gencode records')): - _parse_line( - line, i, new_genes, new_transcripts, existing_gene_ids, existing_transcript_ids, counters, - genome_version, gencode_release) + new_genes, new_transcripts, counters = load_gencode_records( + gencode_release, gencode_gtf_path, genome_version, existing_gene_ids, existing_transcript_ids) logger.info('Creating {} GeneInfo records'.format(len(new_genes))) counters["genes_created"] = len(new_genes) @@ -118,88 +66,3 @@ def update_gencode(gencode_release, gencode_gtf_path=None, genome_version=None, logger.info("Stats: ") for k, v in counters.items(): logger.info(" %s: %s" % (k, v)) - -def _parse_line(line, i, new_genes, new_transcripts, existing_gene_ids, existing_transcript_ids, counters, genome_version, gencode_release): - line = line.rstrip('\r\n') - if not line or line.startswith('#'): - return - fields = line.split('\t') - - if len(fields) != len(GENCODE_FILE_HEADER): - raise ValueError("Unexpected number of fields on line #%s: %s" % (i, fields)) - - record = dict(zip(GENCODE_FILE_HEADER, fields)) - - if record['feature_type'] not in ('gene', 'transcript', 'CDS'): - return - - # parse info field - info_fields = [x.strip().split() for x in record['info'].split(';') if x != ''] - info_dict = {} - for k, v in info_fields: - v = v.strip('"') - if k == 'tag': - if k not in info_dict: - info_dict[k] = [] - info_dict[k].append(v) - else: - info_dict[k] = v - record.update(info_dict) - - record['gene_id'] = record['gene_id'].split('.')[0] - if 'transcript_id' in record: - record['transcript_id'] = record['transcript_id'].split('.')[0] - record['chrom'] = record['chrom'].replace("chr", "").upper() - record['start'] = int(record['start']) - record['end'] = int(record['end']) - - if len(record["chrom"]) > 2: - return # skip super-contigs - - if record['feature_type'] == 'gene': - if record["gene_id"] in existing_gene_ids: - counters["genes_skipped"] += 1 - return - - new_genes[record['gene_id']].update({ - "gene_id": record["gene_id"], - "gene_symbol": record["gene_name"], - - "chrom_grch{}".format(genome_version): record["chrom"], - "start_grch{}".format(genome_version): record["start"], - "end_grch{}".format(genome_version): record["end"], - "strand_grch{}".format(genome_version): record["strand"], - - "gencode_gene_type": record["gene_type"], - "gencode_release": int(gencode_release), - }) - - elif record['feature_type'] == 'transcript': - if record["transcript_id"] in existing_transcript_ids: - counters["transcripts_skipped"] += 1 - return - - new_transcripts[record['transcript_id']].update({ - "gene_id": record["gene_id"], - "transcript_id": record["transcript_id"], - "chrom_grch{}".format(genome_version): 
record["chrom"], - "start_grch{}".format(genome_version): record["start"], - "end_grch{}".format(genome_version): record["end"], - "strand_grch{}".format(genome_version): record["strand"], - }) - if 'MANE_Select' in record.get('tag', []): - new_transcripts[record['transcript_id']]['is_mane_select'] = True - - elif record['feature_type'] == 'CDS': - if record["transcript_id"] in existing_transcript_ids: - return - - coding_region_size_field_name = "coding_region_size_grch{}".format(genome_version) - # add + 1 because GTF has 1-based coords. (https://useast.ensembl.org/info/website/upload/gff.html) - transcript_size = record["end"] - record["start"] + 1 - transcript_size += new_transcripts[record['transcript_id']].get(coding_region_size_field_name, 0) - new_transcripts[record['transcript_id']][coding_region_size_field_name] = transcript_size - - if record['gene_id'] not in existing_gene_ids and \ - transcript_size > new_genes[record['gene_id']].get(coding_region_size_field_name, 0): - new_genes[record['gene_id']][coding_region_size_field_name] = transcript_size \ No newline at end of file diff --git a/reference_data/management/commands/utils/gencode_utils.py b/reference_data/management/commands/utils/gencode_utils.py new file mode 100644 index 0000000000..a0fbefa54d --- /dev/null +++ b/reference_data/management/commands/utils/gencode_utils.py @@ -0,0 +1,156 @@ +import collections +import gzip +import logging +import os +from tqdm import tqdm + +from django.core.management.base import CommandError + +from reference_data.management.commands.utils.download_utils import download_file +from reference_data.models import GENOME_VERSION_GRCh37, GENOME_VERSION_GRCh38 + +logger = logging.getLogger(__name__) + +LATEST_GENCODE_RELEASE = 31 +OLD_GENCODE_RELEASES = [29, 28, 27, 19] + +GENCODE_GTF_URL = "http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_{gencode_release}/gencode.v{gencode_release}.annotation.gtf.gz" +GENCODE_LIFT37_GTF_URL = "http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_{gencode_release}/GRCh37_mapping/gencode.v{gencode_release}lift37.annotation.gtf.gz" + +# expected GTF file header +GENCODE_FILE_HEADER = [ + 'chrom', 'source', 'feature_type', 'start', 'end', 'score', 'strand', 'phase', 'info' +] + + +def _get_valid_gencode_gtf_paths(gencode_release, gencode_gtf_path, genome_version): + if gencode_gtf_path and genome_version and os.path.isfile(gencode_gtf_path): + if gencode_release == 19 and genome_version != GENOME_VERSION_GRCh37: + raise CommandError("Invalid genome_version: {}. gencode v19 only has a GRCh37 version".format(genome_version)) + elif gencode_release <= 22 and genome_version != GENOME_VERSION_GRCh38: + raise CommandError("Invalid genome_version: {}. gencode v20, v21, v22 only have a GRCh38 version".format(genome_version)) + elif genome_version != GENOME_VERSION_GRCh38 and "lift" not in gencode_gtf_path.lower(): + raise CommandError("Invalid genome_version for file: {}. 
gencode v23 and up must have 'lift' in the filename or genome_version arg must be GRCh38".format(gencode_gtf_path)) + + gencode_gtf_paths = {genome_version: gencode_gtf_path} + elif gencode_gtf_path and not genome_version: + raise CommandError("The genome version must also be specified after the gencode GTF file path") + else: + if gencode_release == 19: + urls = [('37', GENCODE_GTF_URL.format(gencode_release=gencode_release))] + elif gencode_release <= 22: + urls = [('38', GENCODE_GTF_URL.format(gencode_release=gencode_release))] + else: + urls = [ + ('37', GENCODE_LIFT37_GTF_URL.format(gencode_release=gencode_release)), + ('38', GENCODE_GTF_URL.format(gencode_release=gencode_release)), + ] + gencode_gtf_paths = {} + for genome_version, url in urls: + local_filename = download_file(url) + gencode_gtf_paths.update({genome_version: local_filename}) + return gencode_gtf_paths + + +def load_gencode_records(gencode_release, gencode_gtf_path=None, genome_version=None, existing_gene_ids=None, existing_transcript_ids=None): + gencode_gtf_paths = _get_valid_gencode_gtf_paths(gencode_release, gencode_gtf_path, genome_version) + + counters = collections.defaultdict(int) + new_genes = collections.defaultdict(dict) + new_transcripts = collections.defaultdict(dict) + + for genome_version, gencode_gtf_path in gencode_gtf_paths.items(): + logger.info("Loading {} (genome version: {})".format(gencode_gtf_path, genome_version)) + with gzip.open(gencode_gtf_path, 'rt') as gencode_file: + for i, line in enumerate(tqdm(gencode_file, unit=' gencode records')): + _parse_line( + line, i, new_genes, new_transcripts, existing_gene_ids, existing_transcript_ids, counters, + genome_version, gencode_release) + + return new_genes, new_transcripts, counters + + +def _parse_line(line, i, new_genes, new_transcripts, existing_gene_ids, existing_transcript_ids, counters, genome_version, gencode_release): + line = line.rstrip('\r\n') + if not line or line.startswith('#'): + return + fields = line.split('\t') + + if len(fields) != len(GENCODE_FILE_HEADER): + raise ValueError("Unexpected number of fields on line #%s: %s" % (i, fields)) + + record = dict(zip(GENCODE_FILE_HEADER, fields)) + + if record['feature_type'] not in ('gene', 'transcript', 'CDS'): + return + + # parse info field + info_fields = [x.strip().split() for x in record['info'].split(';') if x != ''] + info_dict = {} + for k, v in info_fields: + v = v.strip('"') + if k == 'tag': + if k not in info_dict: + info_dict[k] = [] + info_dict[k].append(v) + else: + info_dict[k] = v + record.update(info_dict) + + record['gene_id'] = record['gene_id'].split('.')[0] + if 'transcript_id' in record: + record['transcript_id'] = record['transcript_id'].split('.')[0] + record['chrom'] = record['chrom'].replace("chr", "").upper() + record['start'] = int(record['start']) + record['end'] = int(record['end']) + + if len(record["chrom"]) > 2: + return # skip super-contigs + + if record['feature_type'] == 'gene': + if record["gene_id"] in existing_gene_ids: + counters["genes_skipped"] += 1 + return + + new_genes[record['gene_id']].update({ + "gene_id": record["gene_id"], + "gene_symbol": record["gene_name"], + + "chrom_grch{}".format(genome_version): record["chrom"], + "start_grch{}".format(genome_version): record["start"], + "end_grch{}".format(genome_version): record["end"], + "strand_grch{}".format(genome_version): record["strand"], + + "gencode_gene_type": record["gene_type"], + "gencode_release": int(gencode_release), + }) + + elif record['feature_type'] == 'transcript': + if 
record["transcript_id"] in existing_transcript_ids: + counters["transcripts_skipped"] += 1 + return + + new_transcripts[record['transcript_id']].update({ + "gene_id": record["gene_id"], + "transcript_id": record["transcript_id"], + "chrom_grch{}".format(genome_version): record["chrom"], + "start_grch{}".format(genome_version): record["start"], + "end_grch{}".format(genome_version): record["end"], + "strand_grch{}".format(genome_version): record["strand"], + }) + if 'MANE_Select' in record.get('tag', []): + new_transcripts[record['transcript_id']]['is_mane_select'] = True + + elif record['feature_type'] == 'CDS': + if record["transcript_id"] in existing_transcript_ids: + return + + coding_region_size_field_name = "coding_region_size_grch{}".format(genome_version) + # add + 1 because GTF has 1-based coords. (https://useast.ensembl.org/info/website/upload/gff.html) + transcript_size = record["end"] - record["start"] + 1 + transcript_size += new_transcripts[record['transcript_id']].get(coding_region_size_field_name, 0) + new_transcripts[record['transcript_id']][coding_region_size_field_name] = transcript_size + + if record['gene_id'] not in existing_gene_ids and \ + transcript_size > new_genes[record['gene_id']].get(coding_region_size_field_name, 0): + new_genes[record['gene_id']][coding_region_size_field_name] = transcript_size \ No newline at end of file diff --git a/reference_data/management/tests/update_gencode_tests.py b/reference_data/management/tests/update_gencode_tests.py index 0b42be2b8a..a115984ddd 100644 --- a/reference_data/management/tests/update_gencode_tests.py +++ b/reference_data/management/tests/update_gencode_tests.py @@ -97,7 +97,7 @@ def test_update_gencode_command_arguments(self, mock_isfile): mock_isfile.assert_called_with('mock_path/tmp2.gz') self.assertEqual(str(ce.exception), "Invalid genome_version for file: mock_path/tmp2.gz. 
gencode v23 and up must have 'lift' in the filename or genome_version arg must be GRCh38") - @mock.patch('reference_data.management.commands.update_gencode.logger') + @mock.patch('reference_data.management.commands.utils.gencode_utils.logger') def test_update_gencode_command_bad_gtf_data(self, mock_logger): # Test wrong number data feilds in a line temp_bad_file_path = os.path.join(self.test_dir, 'bad.gencode.v23lift37.annotation.gtf.gz') @@ -148,13 +148,13 @@ def test_update_gencode_command_url_generation(self, mock_tempfile, mock_logger) self.assertEqual(responses.calls[0].request.url, url_23_lift) self.assertEqual(responses.calls[2].request.url, url_23) + @mock.patch('reference_data.management.commands.utils.gencode_utils.logger') @mock.patch('reference_data.management.commands.update_gencode.logger') - def test_update_gencode_command(self, mock_logger): + def test_update_gencode_command(self, mock_logger, mock_utils_logger): # Test normal command function call_command('update_gencode', '--gencode-release=31', self.temp_file_path, '37') + mock_utils_logger.info.assert_called_with('Loading {} (genome version: 37)'.format(self.temp_file_path)) calls = [ - mock.call( - 'Loading {} (genome version: 37)'.format(self.temp_file_path)), mock.call('Creating 1 GeneInfo records'), mock.call('Creating 2 TranscriptInfo records'), mock.call('Done'), @@ -190,11 +190,10 @@ def test_update_gencode_command(self, mock_logger): # Test normal command function with a --reset option mock_logger.reset_mock() call_command('update_gencode', '--reset', '--gencode-release=31', self.temp_file_path, '37') + mock_utils_logger.info.assert_called_with('Loading {} (genome version: 37)'.format(self.temp_file_path)) calls = [ mock.call('Dropping the 2 existing TranscriptInfo entries'), mock.call('Dropping the 50 existing GeneInfo entries'), - mock.call( - 'Loading {} (genome version: 37)'.format(self.temp_file_path)), mock.call('Creating 2 GeneInfo records'), mock.call('Creating 2 TranscriptInfo records'), mock.call('Done'), @@ -202,7 +201,7 @@ def test_update_gencode_command(self, mock_logger): mock.call(' genes_created: 2'), mock.call(' transcripts_created: 2') ] - mock_logger.info.assert_has_calls(calls) + # mock_logger.info.assert_has_calls(calls) self.assertEqual(GeneInfo.objects.all().count(), 2) gene_info = GeneInfo.objects.get(gene_id = 'ENSG00000223972') From d8e0fc4ccb3c7d6a0daf4a27589582b8aaae96f0 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 31 Oct 2022 17:49:56 -0400 Subject: [PATCH 41/96] add manage command for updating latest transcripts --- .../management/commands/update_gencode.py | 8 ++----- .../commands/update_gencode_transcripts.py | 21 +++++++++++++++++++ .../commands/utils/gencode_utils.py | 15 ++++++++++--- .../management/tests/update_gencode_tests.py | 16 +++++++++----- 4 files changed, 46 insertions(+), 14 deletions(-) create mode 100644 reference_data/management/commands/update_gencode_transcripts.py diff --git a/reference_data/management/commands/update_gencode.py b/reference_data/management/commands/update_gencode.py index d040903c20..58b49f2750 100644 --- a/reference_data/management/commands/update_gencode.py +++ b/reference_data/management/commands/update_gencode.py @@ -2,7 +2,7 @@ from django.core.management.base import BaseCommand -from reference_data.management.commands.utils.gencode_utils import load_gencode_records +from reference_data.management.commands.utils.gencode_utils import load_gencode_records, create_transcript_info from reference_data.models import GeneInfo, 
TranscriptInfo, GENOME_VERSION_GRCh37, GENOME_VERSION_GRCh38 logger = logging.getLogger(__name__) @@ -54,13 +54,9 @@ def update_gencode(gencode_release, gencode_gtf_path=None, genome_version=None, logger.info('Creating {} GeneInfo records'.format(len(new_genes))) counters["genes_created"] = len(new_genes) GeneInfo.objects.bulk_create([GeneInfo(**record) for record in new_genes.values()]) - gene_id_to_gene_info = {g.gene_id: g for g in GeneInfo.objects.all().only('gene_id')} - logger.info('Creating {} TranscriptInfo records'.format(len(new_transcripts))) counters["transcripts_created"] = len(new_transcripts) - TranscriptInfo.objects.bulk_create([ - TranscriptInfo(gene=gene_id_to_gene_info[record.pop('gene_id')], **record) for record in new_transcripts.values() - ], batch_size=50000) + create_transcript_info(new_transcripts) logger.info("Done") logger.info("Stats: ") diff --git a/reference_data/management/commands/update_gencode_transcripts.py b/reference_data/management/commands/update_gencode_transcripts.py new file mode 100644 index 0000000000..d7d48d89d1 --- /dev/null +++ b/reference_data/management/commands/update_gencode_transcripts.py @@ -0,0 +1,21 @@ +import logging + +from django.core.management.base import BaseCommand + +from reference_data.management.commands.utils.gencode_utils import load_gencode_records, create_transcript_info, \ + LATEST_GENCODE_RELEASE +from reference_data.models import TranscriptInfo + +logger = logging.getLogger(__name__) + + +class Command(BaseCommand): + help = "Reloads just the Gencode transcripts from the latest Gencode release" + + def handle(self, *args, **options): + transcripts = TranscriptInfo.objects.filter(gene__gencode_release=LATEST_GENCODE_RELEASE) + logger.info("Dropping the {} existing TranscriptInfo entries".format(transcripts.count())) + transcripts.delete() + + _, new_transcripts, _ = load_gencode_records(LATEST_GENCODE_RELEASE) + create_transcript_info(new_transcripts) diff --git a/reference_data/management/commands/utils/gencode_utils.py b/reference_data/management/commands/utils/gencode_utils.py index a0fbefa54d..3466941b07 100644 --- a/reference_data/management/commands/utils/gencode_utils.py +++ b/reference_data/management/commands/utils/gencode_utils.py @@ -7,7 +7,7 @@ from django.core.management.base import CommandError from reference_data.management.commands.utils.download_utils import download_file -from reference_data.models import GENOME_VERSION_GRCh37, GENOME_VERSION_GRCh38 +from reference_data.models import GeneInfo, TranscriptInfo, GENOME_VERSION_GRCh37, GENOME_VERSION_GRCh38 logger = logging.getLogger(__name__) @@ -64,12 +64,21 @@ def load_gencode_records(gencode_release, gencode_gtf_path=None, genome_version= with gzip.open(gencode_gtf_path, 'rt') as gencode_file: for i, line in enumerate(tqdm(gencode_file, unit=' gencode records')): _parse_line( - line, i, new_genes, new_transcripts, existing_gene_ids, existing_transcript_ids, counters, - genome_version, gencode_release) + line, i, new_genes, new_transcripts, existing_gene_ids or set(), existing_transcript_ids or set(), + counters, genome_version, gencode_release) return new_genes, new_transcripts, counters +def create_transcript_info(new_transcripts): + gene_id_to_gene_info = {g.gene_id: g for g in GeneInfo.objects.order_by('gencode_release').only('gene_id')} + logger.info('Creating {} TranscriptInfo records'.format(len(new_transcripts))) + TranscriptInfo.objects.bulk_create([ + TranscriptInfo(gene=gene_id_to_gene_info[record.pop('gene_id')], **record) for record in + 
new_transcripts.values() + ], batch_size=50000) + + def _parse_line(line, i, new_genes, new_transcripts, existing_gene_ids, existing_transcript_ids, counters, genome_version, gencode_release): line = line.rstrip('\r\n') if not line or line.startswith('#'): diff --git a/reference_data/management/tests/update_gencode_tests.py b/reference_data/management/tests/update_gencode_tests.py index a115984ddd..e87f9e1680 100644 --- a/reference_data/management/tests/update_gencode_tests.py +++ b/reference_data/management/tests/update_gencode_tests.py @@ -153,10 +153,12 @@ def test_update_gencode_command_url_generation(self, mock_tempfile, mock_logger) def test_update_gencode_command(self, mock_logger, mock_utils_logger): # Test normal command function call_command('update_gencode', '--gencode-release=31', self.temp_file_path, '37') - mock_utils_logger.info.assert_called_with('Loading {} (genome version: 37)'.format(self.temp_file_path)) + mock_utils_logger.info.assert_has_calls([ + mock.call('Loading {} (genome version: 37)'.format(self.temp_file_path)), + mock.call('Creating 2 TranscriptInfo records'), + ]) calls = [ mock.call('Creating 1 GeneInfo records'), - mock.call('Creating 2 TranscriptInfo records'), mock.call('Done'), mock.call('Stats: '), mock.call(' genes_skipped: 1'), @@ -178,6 +180,7 @@ def test_update_gencode_command(self, mock_logger, mock_utils_logger): self.assertEqual(TranscriptInfo.objects.all().count(), 2) trans_info = TranscriptInfo.objects.get(transcript_id = 'ENST00000456328') self.assertEqual(trans_info.gene.gene_id, 'ENSG00000223972') + self.assertEqual(trans_info.gene.gencode_release, 27) self.assertFalse(trans_info.is_mane_select) trans_info = TranscriptInfo.objects.get(transcript_id = 'ENST00000332831') self.assertEqual(trans_info.start_grch37, 621059) @@ -185,23 +188,26 @@ def test_update_gencode_command(self, mock_logger, mock_utils_logger): self.assertEqual(trans_info.strand_grch37, '-') self.assertEqual(trans_info.chrom_grch37, '1') self.assertEqual(trans_info.gene.gene_id, 'ENSG00000284662') + self.assertEqual(trans_info.gene.gencode_release, 31) self.assertTrue(trans_info.is_mane_select) # Test normal command function with a --reset option mock_logger.reset_mock() call_command('update_gencode', '--reset', '--gencode-release=31', self.temp_file_path, '37') - mock_utils_logger.info.assert_called_with('Loading {} (genome version: 37)'.format(self.temp_file_path)) + mock_utils_logger.info.assert_has_calls([ + mock.call('Loading {} (genome version: 37)'.format(self.temp_file_path)), + mock.call('Creating 2 TranscriptInfo records'), + ]) calls = [ mock.call('Dropping the 2 existing TranscriptInfo entries'), mock.call('Dropping the 50 existing GeneInfo entries'), mock.call('Creating 2 GeneInfo records'), - mock.call('Creating 2 TranscriptInfo records'), mock.call('Done'), mock.call('Stats: '), mock.call(' genes_created: 2'), mock.call(' transcripts_created: 2') ] - # mock_logger.info.assert_has_calls(calls) + mock_logger.info.assert_has_calls(calls) self.assertEqual(GeneInfo.objects.all().count(), 2) gene_info = GeneInfo.objects.get(gene_id = 'ENSG00000223972') From 2e3427c9202cdee6f050d6e4b176804a0c423c47 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 31 Oct 2022 18:04:04 -0400 Subject: [PATCH 42/96] add manage command for updating latest transcripts --- .../management/tests/update_gencode_tests.py | 61 ++++++++++++++----- 1 file changed, 47 insertions(+), 14 deletions(-) diff --git a/reference_data/management/tests/update_gencode_tests.py 
b/reference_data/management/tests/update_gencode_tests.py index e87f9e1680..80f7cdba80 100644 --- a/reference_data/management/tests/update_gencode_tests.py +++ b/reference_data/management/tests/update_gencode_tests.py @@ -148,9 +148,26 @@ def test_update_gencode_command_url_generation(self, mock_tempfile, mock_logger) self.assertEqual(responses.calls[0].request.url, url_23_lift) self.assertEqual(responses.calls[2].request.url, url_23) + def _has_expected_new_transcripts(self): + self.assertEqual(TranscriptInfo.objects.all().count(), 2) + trans_info = TranscriptInfo.objects.get(transcript_id='ENST00000456328') + self.assertEqual(trans_info.gene.gene_id, 'ENSG00000223972') + self.assertEqual(trans_info.gene.gencode_release, 27) + self.assertFalse(trans_info.is_mane_select) + trans_info = TranscriptInfo.objects.get(transcript_id='ENST00000332831') + self.assertEqual(trans_info.start_grch37, 621059) + self.assertEqual(trans_info.end_grch37, 622053) + self.assertEqual(trans_info.strand_grch37, '-') + self.assertEqual(trans_info.chrom_grch37, '1') + self.assertEqual(trans_info.gene.gene_id, 'ENSG00000284662') + self.assertEqual(trans_info.gene.gencode_release, 31) + self.assertTrue(trans_info.is_mane_select) + + @responses.activate @mock.patch('reference_data.management.commands.utils.gencode_utils.logger') + @mock.patch('reference_data.management.commands.update_gencode_transcripts.logger') @mock.patch('reference_data.management.commands.update_gencode.logger') - def test_update_gencode_command(self, mock_logger, mock_utils_logger): + def test_update_gencode_command(self, mock_logger, mock_update_transcripts_logger, mock_utils_logger): # Test normal command function call_command('update_gencode', '--gencode-release=31', self.temp_file_path, '37') mock_utils_logger.info.assert_has_calls([ @@ -177,19 +194,7 @@ def test_update_gencode_command(self, mock_logger, mock_utils_logger): self.assertEqual(gene_info.gencode_gene_type, 'protein_coding') self.assertEqual(gene_info.gene_symbol, 'OR4F16') - self.assertEqual(TranscriptInfo.objects.all().count(), 2) - trans_info = TranscriptInfo.objects.get(transcript_id = 'ENST00000456328') - self.assertEqual(trans_info.gene.gene_id, 'ENSG00000223972') - self.assertEqual(trans_info.gene.gencode_release, 27) - self.assertFalse(trans_info.is_mane_select) - trans_info = TranscriptInfo.objects.get(transcript_id = 'ENST00000332831') - self.assertEqual(trans_info.start_grch37, 621059) - self.assertEqual(trans_info.end_grch37, 622053) - self.assertEqual(trans_info.strand_grch37, '-') - self.assertEqual(trans_info.chrom_grch37, '1') - self.assertEqual(trans_info.gene.gene_id, 'ENSG00000284662') - self.assertEqual(trans_info.gene.gencode_release, 31) - self.assertTrue(trans_info.is_mane_select) + self._has_expected_new_transcripts() # Test normal command function with a --reset option mock_logger.reset_mock() @@ -222,3 +227,31 @@ def test_update_gencode_command(self, mock_logger, mock_utils_logger): self.assertEqual(gene_info.gene_symbol, 'OR4F16') self.assertEqual(gene_info.end_grch37, 622053) self.assertEqual(gene_info.strand_grch37, '-') + + # Test only reloading transcripts + tmp_dir = tempfile.gettempdir() + mock_tempfile.gettempdir.return_value = tmp_dir + with open(self.temp_file_path, 'rb') as f: + gtf_content = f.read() + + url = 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_31/gencode.v31.annotation.gtf.gz' + responses.add(responses.HEAD, url, headers={"Content-Length": "1024"}) + responses.add(responses.GET, url, body=gtf_content, 
stream=True) + url_lift = 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_31/GRCh37_mapping/gencode.v31lift37.annotation.gtf.gz' + responses.add(responses.HEAD, url_lift, headers={"Content-Length": "1024"}) + responses.add(responses.GET, url_lift, body=gtf_content, stream=True) + + call_command('update_gencode_transcripts') + + self.assertEqual(GeneInfo.objects.all().count(), 2) + self._has_expected_new_transcripts() + mock_utils_logger.info.assert_has_calls([ + mock.call('Loading {} (genome version: 37)'.format(self.temp_file_path)), + mock.call('Creating 2 TranscriptInfo records'), + ]) + mock_update_transcripts_logger.info.assert_has_calls([ + mock.call('Dropping the 2 existing TranscriptInfo entries'), + ]) + + self.assertEqual(responses.calls[0].request.url, url_lift) + self.assertEqual(responses.calls[2].request.url, url) From 2595bfe8e819e4dda6e072ba254f23ac728e8bc7 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 31 Oct 2022 18:08:53 -0400 Subject: [PATCH 43/96] add update transcript tests --- reference_data/management/tests/update_gencode_tests.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/reference_data/management/tests/update_gencode_tests.py b/reference_data/management/tests/update_gencode_tests.py index 80f7cdba80..881366246f 100644 --- a/reference_data/management/tests/update_gencode_tests.py +++ b/reference_data/management/tests/update_gencode_tests.py @@ -148,11 +148,11 @@ def test_update_gencode_command_url_generation(self, mock_tempfile, mock_logger) self.assertEqual(responses.calls[0].request.url, url_23_lift) self.assertEqual(responses.calls[2].request.url, url_23) - def _has_expected_new_transcripts(self): + def _has_expected_new_transcripts(self, expected_release=27): self.assertEqual(TranscriptInfo.objects.all().count(), 2) trans_info = TranscriptInfo.objects.get(transcript_id='ENST00000456328') self.assertEqual(trans_info.gene.gene_id, 'ENSG00000223972') - self.assertEqual(trans_info.gene.gencode_release, 27) + self.assertEqual(trans_info.gene.gencode_release, expected_release) self.assertFalse(trans_info.is_mane_select) trans_info = TranscriptInfo.objects.get(transcript_id='ENST00000332831') self.assertEqual(trans_info.start_grch37, 621059) @@ -164,10 +164,11 @@ def _has_expected_new_transcripts(self): self.assertTrue(trans_info.is_mane_select) @responses.activate + @mock.patch('reference_data.management.commands.utils.download_utils.tempfile') @mock.patch('reference_data.management.commands.utils.gencode_utils.logger') @mock.patch('reference_data.management.commands.update_gencode_transcripts.logger') @mock.patch('reference_data.management.commands.update_gencode.logger') - def test_update_gencode_command(self, mock_logger, mock_update_transcripts_logger, mock_utils_logger): + def test_update_gencode_command(self, mock_logger, mock_update_transcripts_logger, mock_utils_logger, mock_tempfile): # Test normal command function call_command('update_gencode', '--gencode-release=31', self.temp_file_path, '37') mock_utils_logger.info.assert_has_calls([ @@ -244,7 +245,7 @@ def test_update_gencode_command(self, mock_logger, mock_update_transcripts_logge call_command('update_gencode_transcripts') self.assertEqual(GeneInfo.objects.all().count(), 2) - self._has_expected_new_transcripts() + self._has_expected_new_transcripts(expected_release=31) mock_utils_logger.info.assert_has_calls([ mock.call('Loading {} (genome version: 37)'.format(self.temp_file_path)), mock.call('Creating 2 TranscriptInfo records'), From 
9060c3abad9d0b2fa90f3f32fbf7f75ff0415962 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 31 Oct 2022 18:24:16 -0400 Subject: [PATCH 44/96] clean up --- reference_data/management/commands/utils/gencode_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reference_data/management/commands/utils/gencode_utils.py b/reference_data/management/commands/utils/gencode_utils.py index 3466941b07..3df38101dc 100644 --- a/reference_data/management/commands/utils/gencode_utils.py +++ b/reference_data/management/commands/utils/gencode_utils.py @@ -71,7 +71,7 @@ def load_gencode_records(gencode_release, gencode_gtf_path=None, genome_version= def create_transcript_info(new_transcripts): - gene_id_to_gene_info = {g.gene_id: g for g in GeneInfo.objects.order_by('gencode_release').only('gene_id')} + gene_id_to_gene_info = {g.gene_id: g for g in GeneInfo.objects.all().only('gene_id')} logger.info('Creating {} TranscriptInfo records'.format(len(new_transcripts))) TranscriptInfo.objects.bulk_create([ TranscriptInfo(gene=gene_id_to_gene_info[record.pop('gene_id')], **record) for record in From f02ec02a3cee8c7f9ec7f7165c78824459c53e0f Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 1 Nov 2022 10:53:31 -0400 Subject: [PATCH 45/96] update refseq manage command; --- .../management/commands/update_refseq.py | 47 +++++++++++++++++++ .../commands/utils/gencode_utils.py | 12 ++--- .../management/commands/utils/update_utils.py | 3 +- .../migrations/0021_auto_20221031_2049.py | 2 +- reference_data/models.py | 2 +- 5 files changed, 57 insertions(+), 9 deletions(-) create mode 100644 reference_data/management/commands/update_refseq.py diff --git a/reference_data/management/commands/update_refseq.py b/reference_data/management/commands/update_refseq.py new file mode 100644 index 0000000000..4dee3114f6 --- /dev/null +++ b/reference_data/management/commands/update_refseq.py @@ -0,0 +1,47 @@ +import logging +from django.core.management.base import CommandError + +from reference_data.management.commands.utils.gencode_utils import GENCODE_URL_TEMPLATE, LATEST_GENCODE_RELEASE +from reference_data.management.commands.utils.update_utils import GeneCommand, ReferenceDataHandler +from reference_data.models import TranscriptInfo, RefseqTranscript + +logger = logging.getLogger(__name__) + + +class RefseqReferenceDataHandler(ReferenceDataHandler): + + model_cls = RefseqTranscript + url = GENCODE_URL_TEMPLATE.format(path='', file='.metadata.RefSeq.gz', gencode_release=LATEST_GENCODE_RELEASE) + gene_key = 'transcript' + + def __init__(self, **kwargs): + if TranscriptInfo.objects.count() == 0: + raise CommandError("TranscriptInfo table is empty. 
Run './manage.py update_gencode' before running this command.") + + self.transcript_id_map = { + t.transcript_id: t for t in TranscriptInfo.objects.all().only('transcript_id') + } + + @staticmethod + def get_file_header(f): + return ['transcript_id', 'refseq_id', 'additional_info'] + + @staticmethod + def parse_record(record): + yield { + 'transcript_id': record['transcript_id'].split('.')[0], + 'refseq_id': record['refseq_id'], + } + + def get_gene_for_record(self, record): + transcript_id = record.pop('transcript_id') + # only create a record for the first occurrence of a given transcript + transcript = self.transcript_id_map.pop(transcript_id, None) + + if not transcript: + raise ValueError(f'Transcript "{transcript_id}" not found in the TranscriptInfo table') + return transcript + + +class Command(GeneCommand): + reference_data_handler = RefseqReferenceDataHandler diff --git a/reference_data/management/commands/utils/gencode_utils.py b/reference_data/management/commands/utils/gencode_utils.py index 3df38101dc..1bd0d42ec0 100644 --- a/reference_data/management/commands/utils/gencode_utils.py +++ b/reference_data/management/commands/utils/gencode_utils.py @@ -14,8 +14,7 @@ LATEST_GENCODE_RELEASE = 31 OLD_GENCODE_RELEASES = [29, 28, 27, 19] -GENCODE_GTF_URL = "http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_{gencode_release}/gencode.v{gencode_release}.annotation.gtf.gz" -GENCODE_LIFT37_GTF_URL = "http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_{gencode_release}/GRCh37_mapping/gencode.v{gencode_release}lift37.annotation.gtf.gz" +GENCODE_URL_TEMPLATE = 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_{gencode_release}/{path}gencode.v{gencode_release}{file}' # expected GTF file header GENCODE_FILE_HEADER = [ @@ -36,14 +35,15 @@ def _get_valid_gencode_gtf_paths(gencode_release, gencode_gtf_path, genome_versi elif gencode_gtf_path and not genome_version: raise CommandError("The genome version must also be specified after the gencode GTF file path") else: + gtf_url = GENCODE_URL_TEMPLATE.format(path='', file='.annotation.gtf.gz', gencode_release=gencode_release) if gencode_release == 19: - urls = [('37', GENCODE_GTF_URL.format(gencode_release=gencode_release))] + urls = [('37', gtf_url)] elif gencode_release <= 22: - urls = [('38', GENCODE_GTF_URL.format(gencode_release=gencode_release))] + urls = [('38', gtf_url)] else: urls = [ - ('37', GENCODE_LIFT37_GTF_URL.format(gencode_release=gencode_release)), - ('38', GENCODE_GTF_URL.format(gencode_release=gencode_release)), + ('37', GENCODE_URL_TEMPLATE.format(path='GRCh37_mapping/', file='lift37.annotation.gtf.gz', gencode_release=gencode_release)), + ('38', gtf_url), ] gencode_gtf_paths = {} for genome_version, url in urls: diff --git a/reference_data/management/commands/utils/update_utils.py b/reference_data/management/commands/utils/update_utils.py index 6f1780e16b..780f27470a 100644 --- a/reference_data/management/commands/utils/update_utils.py +++ b/reference_data/management/commands/utils/update_utils.py @@ -19,6 +19,7 @@ class ReferenceDataHandler(object): post_process_models = None batch_size = None keep_existing_records = False + gene_key = 'gene' def __init__(self, **kwargs): if GeneInfo.objects.count() == 0: @@ -96,7 +97,7 @@ def update_records(reference_data_handler, file_path=None): continue try: - record['gene'] = reference_data_handler.get_gene_for_record(record) + record[reference_data_handler.gene_key] = reference_data_handler.get_gene_for_record(record) except ValueError as e: 
skip_counter += 1 logger.debug(e) diff --git a/reference_data/migrations/0021_auto_20221031_2049.py b/reference_data/migrations/0021_auto_20221031_2049.py index f1052679b2..7d2a512654 100644 --- a/reference_data/migrations/0021_auto_20221031_2049.py +++ b/reference_data/migrations/0021_auto_20221031_2049.py @@ -21,7 +21,7 @@ class Migration(migrations.Migration): fields=[ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('refseq_id', models.CharField(max_length=20)), - ('transcript', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='reference_data.transcriptinfo')), + ('transcript', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, to='reference_data.transcriptinfo')), ], ), ] diff --git a/reference_data/models.py b/reference_data/models.py index b749e4b2a4..d268ff2c4f 100644 --- a/reference_data/models.py +++ b/reference_data/models.py @@ -129,7 +129,7 @@ class TranscriptInfo(models.Model): class RefseqTranscript(models.Model): - transcript = models.ForeignKey(TranscriptInfo, on_delete=models.CASCADE) + transcript = models.OneToOneField(TranscriptInfo, on_delete=models.CASCADE) refseq_id = models.CharField(max_length=20) From ed4b97502242962583f0059b6ea78da60bb6d031 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 1 Nov 2022 10:54:58 -0400 Subject: [PATCH 46/96] bump changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 48373552fb..6deccafec4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,8 @@ # _seqr_ Changes ## dev +* Add Refseq and MANE transcript info (REQUIRES DB MIGRATION) + * To add new data, run the `update_gencode_transcripts` and `update_refseq` commands ## 10/13/22 * Link MME submissions to saved variants (REQUIRES DB MIGRATION) From 30483de5dc806d587ebb31eee59f7993ba5ebce5 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 1 Nov 2022 10:58:47 -0400 Subject: [PATCH 47/96] add refseq to all reference data command; --- .../management/commands/update_all_reference_data.py | 2 ++ .../tests/update_all_reference_data_tests.py | 10 +++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/reference_data/management/commands/update_all_reference_data.py b/reference_data/management/commands/update_all_reference_data.py index e3314c2e9c..94435b27d6 100644 --- a/reference_data/management/commands/update_all_reference_data.py +++ b/reference_data/management/commands/update_all_reference_data.py @@ -14,6 +14,7 @@ from reference_data.management.commands.update_gene_cn_sensitivity import CNSensitivityReferenceDataHandler from reference_data.management.commands.update_gencc import GenCCReferenceDataHandler from reference_data.management.commands.update_clingen import ClinGenReferenceDataHandler +from reference_data.management.commands.update_refseq import RefseqReferenceDataHandler logger = logging.getLogger(__name__) @@ -26,6 +27,7 @@ ("mgi", MGIReferenceDataHandler), ("gencc", GenCCReferenceDataHandler), ("clingen", ClinGenReferenceDataHandler), + ("refseq", RefseqReferenceDataHandler), ("hpo", None), ]) diff --git a/reference_data/management/tests/update_all_reference_data_tests.py b/reference_data/management/tests/update_all_reference_data_tests.py index 5510244401..8548b934a1 100644 --- a/reference_data/management/tests/update_all_reference_data_tests.py +++ b/reference_data/management/tests/update_all_reference_data_tests.py @@ -18,7 +18,7 @@ def mgi_exception(): SKIP_ARGS = [ '--skip-gencode', '--skip-dbnsfp-gene', 
'--skip-gene-constraint', '--skip-primate-ai', '--skip-mgi', '--skip-hpo', - '--skip-gene-cn-sensitivity', '--skip-gencc', '--skip-clingen', + '--skip-gene-cn-sensitivity', '--skip-gencc', '--skip-clingen', '--skip-refseq', ] class UpdateAllReferenceDataTest(TestCase): @@ -41,6 +41,9 @@ def setUp(self): patcher = mock.patch('reference_data.management.commands.update_clingen.ClinGenReferenceDataHandler', lambda: 'clingen') patcher.start() self.addCleanup(patcher.stop) + patcher = mock.patch('reference_data.management.commands.update_refseq.RefseqReferenceDataHandler', lambda: 'refseq') + patcher.start() + self.addCleanup(patcher.stop) patcher = mock.patch('reference_data.management.commands.update_mgi.MGIReferenceDataHandler') patcher.start().side_effect = mgi_exception @@ -91,7 +94,7 @@ def test_update_all_reference_data_command(self): self.mock_omim.assert_called_with('test_key') self.mock_cached_omim.assert_not_called() - self.assertEqual(self.mock_update_records.call_count, 6) + self.assertEqual(self.mock_update_records.call_count, 7) calls = [ mock.call('omim'), mock.call('dbnsfp_gene'), @@ -99,6 +102,7 @@ def test_update_all_reference_data_command(self): mock.call('gene_cn_sensitivity'), mock.call('gencc'), mock.call('clingen'), + mock.call('refseq'), ] self.mock_update_records.assert_has_calls(calls) @@ -106,7 +110,7 @@ def test_update_all_reference_data_command(self): calls = [ mock.call('Done'), - mock.call('Updated: gencode, omim, dbnsfp_gene, gene_constraint, gene_cn_sensitivity, gencc, clingen, hpo'), + mock.call('Updated: gencode, omim, dbnsfp_gene, gene_constraint, gene_cn_sensitivity, gencc, clingen, refseq, hpo'), mock.call('Failed to Update: primate_ai, mgi') ] self.mock_logger.info.assert_has_calls(calls) From 645f5f76e503dd7d135101a0fb646be5f7311f08 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 1 Nov 2022 11:33:25 -0400 Subject: [PATCH 48/96] add refseq to all reference data command; --- .../management/tests/update_gencode_tests.py | 5 ++- .../management/tests/update_refseq_tests.py | 23 ++++++++++ seqr/fixtures/reference_data.json | 43 +++++++++++++++++++ 3 files changed, 69 insertions(+), 2 deletions(-) create mode 100644 reference_data/management/tests/update_refseq_tests.py diff --git a/reference_data/management/tests/update_gencode_tests.py b/reference_data/management/tests/update_gencode_tests.py index 881366246f..1c22824111 100644 --- a/reference_data/management/tests/update_gencode_tests.py +++ b/reference_data/management/tests/update_gencode_tests.py @@ -149,7 +149,6 @@ def test_update_gencode_command_url_generation(self, mock_tempfile, mock_logger) self.assertEqual(responses.calls[2].request.url, url_23) def _has_expected_new_transcripts(self, expected_release=27): - self.assertEqual(TranscriptInfo.objects.all().count(), 2) trans_info = TranscriptInfo.objects.get(transcript_id='ENST00000456328') self.assertEqual(trans_info.gene.gene_id, 'ENSG00000223972') self.assertEqual(trans_info.gene.gencode_release, expected_release) @@ -195,6 +194,7 @@ def test_update_gencode_command(self, mock_logger, mock_update_transcripts_logge self.assertEqual(gene_info.gencode_gene_type, 'protein_coding') self.assertEqual(gene_info.gene_symbol, 'OR4F16') + self.assertEqual(TranscriptInfo.objects.all().count(), 4) self._has_expected_new_transcripts() # Test normal command function with a --reset option @@ -205,7 +205,7 @@ def test_update_gencode_command(self, mock_logger, mock_update_transcripts_logge mock.call('Creating 2 TranscriptInfo records'), ]) calls = [ - 
mock.call('Dropping the 2 existing TranscriptInfo entries'), + mock.call('Dropping the 4 existing TranscriptInfo entries'), mock.call('Dropping the 50 existing GeneInfo entries'), mock.call('Creating 2 GeneInfo records'), mock.call('Done'), @@ -245,6 +245,7 @@ def test_update_gencode_command(self, mock_logger, mock_update_transcripts_logge call_command('update_gencode_transcripts') self.assertEqual(GeneInfo.objects.all().count(), 2) + self.assertEqual(TranscriptInfo.objects.all().count(), 2) self._has_expected_new_transcripts(expected_release=31) mock_utils_logger.info.assert_has_calls([ mock.call('Loading {} (genome version: 37)'.format(self.temp_file_path)), diff --git a/reference_data/management/tests/update_refseq_tests.py b/reference_data/management/tests/update_refseq_tests.py new file mode 100644 index 0000000000..47cdfef776 --- /dev/null +++ b/reference_data/management/tests/update_refseq_tests.py @@ -0,0 +1,23 @@ +from reference_data.models import RefseqTranscript +from reference_data.management.tests.test_utils import ReferenceDataCommandTestCase + + +class UpdateRefseqTest(ReferenceDataCommandTestCase): + URL = 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_31/gencode.v31.metadata.RefSeq.gz' + DATA = [ + 'ENST00000258436.1 NR_026874.2 \n', + 'ENST00000258436.1 NR_122045.1 \n', + 'ENST00000342066.8 NM_152486.3 NP_689699.2\n', + 'ENST00000505820.7 NM_015658.4 NP_056473.3\n', + ] + + def test_update_refseq_command(self): + self._test_update_command( + 'update_refseq', 'RefseqTranscript', created_records=2, skipped_records=2) + + self.assertEqual(RefseqTranscript.objects.count(), 2) + self.assertListEqual( + list(RefseqTranscript.objects.order_by('transcript_id').values('transcript__transcript_id', 'refseq_id')), [ + {'transcript__transcript_id': 'ENST00000258436', 'refseq_id': 'NR_026874.2'}, + {'transcript__transcript_id': 'ENST00000505820', 'refseq_id': 'NM_015658.4'} + ]) diff --git a/seqr/fixtures/reference_data.json b/seqr/fixtures/reference_data.json index c95aa8a911..95136d98ce 100644 --- a/seqr/fixtures/reference_data.json +++ b/seqr/fixtures/reference_data.json @@ -978,6 +978,49 @@ "gencode_gene_type": "antisense_RNA", "gencode_release": 27 } +}, { + "model": "reference_data.transcriptinfo", + "pk": 1, + "fields": { + "gene_id": 48, + "transcript_id": "ENST00000258436", + "is_mane_select": true, + "chrom_grch37": "1", + "start_grch37": 696291, + "end_grch37": 697369, + "strand_grch37": "+", + "coding_region_size_grch37": 0, + "chrom_grch38": "1", + "start_grch38": 760911, + "end_grch38": 761989, + "strand_grch38": "+", + "coding_region_size_grch38": 0 + } +}, { + "model": "reference_data.transcriptinfo", + "pk": 2, + "fields": { + "gene_id": 6, + "transcript_id": "ENST00000505820", + "is_mane_select": false, + "chrom_grch37": "1", + "start_grch37": 696291, + "end_grch37": 697369, + "strand_grch37": "+", + "coding_region_size_grch37": 0, + "chrom_grch38": "1", + "start_grch38": 760911, + "end_grch38": 761989, + "strand_grch38": "+", + "coding_region_size_grch38": 0 + } +}, { + "model": "reference_data.refseqtranscript", + "pk": 1, + "fields": { + "transcript_id": 1, + "refseq_id": "NM_017900.2" + } }, { "model": "reference_data.omim", From 0c2eae42b41b439b42ca0106ae948149e7a566a4 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 1 Nov 2022 14:14:57 -0400 Subject: [PATCH 49/96] return transcripts to variant response --- reference_data/models.py | 3 +++ seqr/views/utils/orm_to_json_utils.py | 2 +- seqr/views/utils/variant_utils.py | 29 
+++++++++++++++++++-------- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/reference_data/models.py b/reference_data/models.py index d268ff2c4f..ce221425aa 100644 --- a/reference_data/models.py +++ b/reference_data/models.py @@ -127,6 +127,9 @@ class TranscriptInfo(models.Model): strand_grch38 = models.CharField(max_length=1, null=True, blank=True) coding_region_size_grch38 = models.IntegerField(default=0) # number of protein-coding bases (= 0 for non-coding genes) + class Meta: + json_fields = ['transcript_id', 'is_mane_select'] + class RefseqTranscript(models.Model): transcript = models.OneToOneField(TranscriptInfo, on_delete=models.CASCADE) diff --git a/seqr/views/utils/orm_to_json_utils.py b/seqr/views/utils/orm_to_json_utils.py index 0a549b56c3..f5ab626949 100644 --- a/seqr/views/utils/orm_to_json_utils.py +++ b/seqr/views/utils/orm_to_json_utils.py @@ -72,7 +72,7 @@ def _get_json_for_models(models, nested_fields=None, user=None, is_analyst=None, if not field_value: field_value = model for field in nested_field['fields']: - field_value = getattr(field_value, field) if field_value else None + field_value = getattr(field_value, field, None) if field_value else None result[nested_field.get('key', _to_camel_case('_'.join(nested_field['fields'])))] = field_value diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index 182730abfc..797c00e18d 100644 --- a/seqr/views/utils/variant_utils.py +++ b/seqr/views/utils/variant_utils.py @@ -4,6 +4,7 @@ import redis from matchmaker.models import MatchmakerSubmissionGenes, MatchmakerSubmission +from reference_data.models import TranscriptInfo from seqr.models import SavedVariant, VariantSearchResults, Family, LocusList, LocusListInterval, LocusListGene, \ RnaSeqOutlier, RnaSeqTpm from seqr.utils.elasticsearch.utils import get_es_variants_for_variant_ids @@ -79,19 +80,30 @@ def get_variant_key(xpos=None, ref=None, alt=None, genomeVersion=None, **kwargs) return '{}-{}-{}_{}'.format(xpos, ref, alt, genomeVersion) -def _saved_variant_genes(variants): +def _saved_variant_genes_transcripts(variants): gene_ids = set() + transcript_ids = set() for variant in variants: - if isinstance(variant, list): - for compound_het in variant: - gene_ids.update(list(compound_het.get('transcripts', {}).keys())) - else: - gene_ids.update(list(variant.get('transcripts', {}).keys())) + if not isinstance(variant, list): + variant = [variant] + for var in variant: + for gene_id, transcripts in var.get('transcripts', {}).items(): + gene_ids.add(gene_id) + transcript_ids.update([t['transcriptId'] for t in transcripts if t.get('transcriptId')]) + genes = get_genes_for_variants(gene_ids) for gene in genes.values(): if gene: gene['locusListGuids'] = [] - return genes + + transcripts = { + t['transcriptId']: t for t in _get_json_for_models( + TranscriptInfo.objects.filter(transcript_id__in=transcript_ids), + nested_fields=[{'fields': ('refseqtranscript', 'refseq_id'), 'key': 'refseqId'}] + ) + } + + return genes, transcripts def _add_locus_lists(projects, genes, add_list_detail=False, user=None, is_analyst=None): @@ -180,7 +192,8 @@ def get_variants_response(request, saved_variants, response_variants=None, add_a discovery_tags, discovery_response = get_json_for_discovery_tags(response['savedVariantsByGuid'].values(), request.user) response.update(discovery_response) - genes = _saved_variant_genes(variants) + genes, transcripts = _saved_variant_genes_transcripts(variants) + response['transcriptsById'] = transcripts 
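Illustrative sketch only, not part of the committed diff: the nested_fields entry above pulls transcript.refseqtranscript.refseq_id into a top-level refseqId key, and it relies on the _get_json_for_models change in this same patch (getattr(field_value, field, None)) so a transcript with no linked RefseqTranscript row serializes refseqId as None. A minimal standalone approximation of that traversal, using a hypothetical helper name:

def resolve_nested_field(model, fields):
    # Walk a chain of attributes, returning None as soon as any link is missing.
    value = model
    for field in fields:
        value = getattr(value, field, None) if value else None
    return value

# e.g. resolve_nested_field(transcript_info, ('refseqtranscript', 'refseq_id'))
# -> the RefSeq ID string, or None when no RefseqTranscript exists for that transcript.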
response['locusListsByGuid'] = _add_locus_lists( projects, genes, add_list_detail=add_locus_list_detail, user=request.user, is_analyst=is_analyst) From efa92a836ebda28ec33bb7ead55e7f03c49c4698 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 1 Nov 2022 14:27:03 -0400 Subject: [PATCH 50/96] update unit tests --- reference_data/management/tests/update_refseq_tests.py | 4 ++-- seqr/fixtures/reference_data.json | 4 ++-- seqr/views/apis/saved_variant_api_tests.py | 7 ++++++- seqr/views/apis/variant_search_api_tests.py | 2 ++ 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/reference_data/management/tests/update_refseq_tests.py b/reference_data/management/tests/update_refseq_tests.py index 47cdfef776..c99d14a1bd 100644 --- a/reference_data/management/tests/update_refseq_tests.py +++ b/reference_data/management/tests/update_refseq_tests.py @@ -8,7 +8,7 @@ class UpdateRefseqTest(ReferenceDataCommandTestCase): 'ENST00000258436.1 NR_026874.2 \n', 'ENST00000258436.1 NR_122045.1 \n', 'ENST00000342066.8 NM_152486.3 NP_689699.2\n', - 'ENST00000505820.7 NM_015658.4 NP_056473.3\n', + 'ENST00000624735.7 NM_015658.4 NP_056473.3\n', ] def test_update_refseq_command(self): @@ -19,5 +19,5 @@ def test_update_refseq_command(self): self.assertListEqual( list(RefseqTranscript.objects.order_by('transcript_id').values('transcript__transcript_id', 'refseq_id')), [ {'transcript__transcript_id': 'ENST00000258436', 'refseq_id': 'NR_026874.2'}, - {'transcript__transcript_id': 'ENST00000505820', 'refseq_id': 'NM_015658.4'} + {'transcript__transcript_id': 'ENST00000624735', 'refseq_id': 'NM_015658.4'} ]) diff --git a/seqr/fixtures/reference_data.json b/seqr/fixtures/reference_data.json index 95136d98ce..b1a52fbf07 100644 --- a/seqr/fixtures/reference_data.json +++ b/seqr/fixtures/reference_data.json @@ -1000,8 +1000,8 @@ "model": "reference_data.transcriptinfo", "pk": 2, "fields": { - "gene_id": 6, - "transcript_id": "ENST00000505820", + "gene_id": 2, + "transcript_id": "ENST00000624735", "is_mane_select": false, "chrom_grch37": "1", "start_grch37": 696291, diff --git a/seqr/views/apis/saved_variant_api_tests.py b/seqr/views/apis/saved_variant_api_tests.py index 1cc28bc634..22a9fe7927 100644 --- a/seqr/views/apis/saved_variant_api_tests.py +++ b/seqr/views/apis/saved_variant_api_tests.py @@ -27,7 +27,7 @@ SAVED_VARIANT_RESPONSE_KEYS = { 'variantTagsByGuid', 'variantNotesByGuid', 'variantFunctionalDataByGuid', 'savedVariantsByGuid', - 'genesById', 'locusListsByGuid', 'rnaSeqData', 'mmeSubmissionsByGuid', + 'genesById', 'locusListsByGuid', 'rnaSeqData', 'mmeSubmissionsByGuid', 'transcriptsById', } COMPOUND_HET_3_JSON = { @@ -162,6 +162,11 @@ def test_saved_variant_data(self): self.assertSetEqual(set(response_json['genesById'].keys()), {'ENSG00000135953'}) self.assertSetEqual(set(response_json['genesById']['ENSG00000135953'].keys()), gene_fields) + self.assertDictEqual( + response_json['transcriptsById'], + {'ENST00000258436': {'isManeSelect': True, 'refseqId': 'NM_017900.2', 'transcriptId': 'ENST00000258436'}}, + ) + self.assertDictEqual(response_json['rnaSeqData'], {'I000001_na19675': { 'outliers': { 'ENSG00000135953': { diff --git a/seqr/views/apis/variant_search_api_tests.py b/seqr/views/apis/variant_search_api_tests.py index ed484dc19c..d132673fa5 100644 --- a/seqr/views/apis/variant_search_api_tests.py +++ b/seqr/views/apis/variant_search_api_tests.py @@ -65,6 +65,7 @@ 'SV0000002_1248367227_r0390_100': EXPECTED_SAVED_VARIANT, }, 'genesById': {'ENSG00000227232': expected_pa_gene, 'ENSG00000268903': 
EXPECTED_GENE, 'ENSG00000233653': EXPECTED_GENE}, + 'transcriptsById': {'ENST00000624735': {'isManeSelect': False, 'refseqId': None, 'transcriptId': 'ENST00000624735'}}, 'search': { 'search': SEARCH, 'projectFamilies': [{'projectGuid': PROJECT_GUID, 'familyGuids': mock.ANY}], @@ -388,6 +389,7 @@ def test_query_variants(self, mock_get_variants, mock_get_gene_counts, mock_erro 'searchedVariants': COMP_HET_VARAINTS, 'savedVariantsByGuid': {'SV0000002_1248367227_r0390_100': EXPECTED_SAVED_VARIANT}, 'genesById': {'ENSG00000233653': EXPECTED_GENE}, + 'transcriptsById': {}, 'variantTagsByGuid': { 'VT1726970_2103343353_r0004_tes': EXPECTED_TAG, 'VT1726945_2103343353_r0390_100': EXPECTED_TAG, }, From 1d15b6cbb9ab5affd852be96b3a523dee08f7742 Mon Sep 17 00:00:00 2001 From: Shifa Zhang Date: Tue, 1 Nov 2022 14:31:34 -0400 Subject: [PATCH 51/96] Update bulk op logs. --- seqr/models.py | 3 ++ seqr/utils/logging_utils.py | 17 +++--- seqr/views/apis/data_manager_api.py | 28 ++++------ seqr/views/apis/data_manager_api_tests.py | 63 +++++++++++------------ seqr/views/utils/dataset_utils.py | 2 +- 5 files changed, 53 insertions(+), 60 deletions(-) diff --git a/seqr/models.py b/seqr/models.py index b1f085c9b6..bda7a18131 100644 --- a/seqr/models.py +++ b/seqr/models.py @@ -1035,6 +1035,7 @@ class Meta: class DeletableSampleMetadataModel(models.Model, BulkOperationBase): + PARENT_FIELD = 'sample' sample = models.ForeignKey('Sample', on_delete=models.CASCADE, db_index=True) gene_id = models.CharField(max_length=20) # ensembl ID @@ -1071,6 +1072,8 @@ class Meta: class PhenotypePrioritization(models.Model, BulkOperationBase): + PARENT_FIELD = 'individual' + individual = models.ForeignKey('Individual', on_delete=models.CASCADE, db_index=True) gene_id = models.CharField(max_length=20) # ensembl ID diff --git a/seqr/utils/logging_utils.py b/seqr/utils/logging_utils.py index b696887a5c..1d1c58769e 100644 --- a/seqr/utils/logging_utils.py +++ b/seqr/utils/logging_utils.py @@ -1,6 +1,7 @@ import json import logging +from django.db.models import prefetch_related_objects from settings import DEPLOYMENT_TYPE from typing import Optional @@ -89,10 +90,12 @@ def log_model_bulk_update(logger, models, user, update_type, update_fields=None) def log_model_no_guid_bulk_update(logger, models, user, update_type): - if not models: - return [] - db_entity = type(models[0]).__name__ - db_update = { - 'dbEntity': db_entity, 'numEntities': len(models), 'updateType': 'bulk_{}'.format(update_type), - } - logger.info(f'{update_type} {db_entity}s', user, db_update=db_update) + if models: + db_entity = type(models[0]).__name__ + prefetch_related_objects(models, models[0].PARENT_FIELD) + parent_ids = {getattr(model, models[0].PARENT_FIELD).guid for model in models} + db_update = { + 'dbEntity': db_entity, 'numEntities': len(models), 'parentEntityIds': parent_ids, + 'updateType': 'bulk_{}'.format(update_type), + } + logger.info(f'{update_type} {db_entity}s', user, db_update=db_update) diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index 2a0366277b..33079c5e60 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -392,34 +392,27 @@ def load_rna_seq_sample_data(request, sample_guid): return create_json_response({'success': True}) -def _log_append_info(user, info, message): - info.append(message) - logger.info(message, user) - - @data_manager_required def load_phenotype_prioritization_data(request): request_json = json.loads(request.body) file_path = 
request_json['file'] - info = [] - _log_append_info(request.user, info, f'Loading phenotype-based prioritization data from {file_path}') - try: tool, data_by_project_sample_id = load_phenotype_prioritization_data_file(file_path) except ValueError as e: return create_json_response({'error': str(e)}, status=400) + info = [f'Loaded {tool.upper()} data from {file_path}'] + all_records = [] - to_delete = None error = None for project_name, records_by_sample in data_by_project_sample_id.items(): projects = [p for p in Project.objects.filter(name=project_name) if is_internal_project(p)] if not projects or len(projects) > 1: - error = f'Project not found or multiple projects with the same name {project_name}' + error = f'Multiple projects with the same name {project_name}'\ + if projects else f'Project ({project_name}) not found' break - _log_append_info(request.user, info, f'Parsed {tool.upper()} data for project: {project_name}') indivs = Individual.objects.filter(family__project=projects[0], individual_id__in=records_by_sample.keys()) existing_indivs_by_id = {ind.individual_id: ind for ind in indivs} @@ -433,22 +426,19 @@ def load_phenotype_prioritization_data(request): rec['individual'] = existing_indivs_by_id[sample_id] exist_records = PhenotypePrioritization.objects.filter(tool=tool, individual__in=indivs) - to_delete = to_delete | exist_records if to_delete else exist_records + deleted, _ = PhenotypePrioritization.bulk_delete(request.user, exist_records) records = [rec for records in records_by_sample.values() for rec in records] - _log_append_info(request.user, info, - f'Attempted loading {len(records)} records of {tool.upper()} data to project {project_name}') + delete_info = f'deleted {deleted} record(s), ' if deleted else '' + info.append(f'Project {project_name}: {delete_info}loaded {len(records)} record(s)') all_records += records if error: return create_json_response({'error': error}, status=400) - if to_delete: - deleted, _ = PhenotypePrioritization.bulk_delete(request.user, to_delete) - _log_append_info(request.user, info, f'Deleted {deleted} existing {tool.upper()} records') + PhenotypePrioritization.bulk_create(request.user, [PhenotypePrioritization(**data) for data in all_records]) - models = PhenotypePrioritization.bulk_create(request.user, [PhenotypePrioritization(**data) for data in all_records]) - _log_append_info(request.user, info, f'Loaded {len(models)} {tool.upper()} data records') + logger.info('\n'.join(info), request.user) return create_json_response({ 'info': info, diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py index bee13b28fe..9e60c82107 100644 --- a/seqr/views/apis/data_manager_api_tests.py +++ b/seqr/views/apis/data_manager_api_tests.py @@ -722,7 +722,8 @@ def mock_write(content): mock_logger.info.assert_has_calls([mock.call(info_log, self.data_manager_user) for info_log in info]) mock_model_logger.info.assert_called_with( f'delete {model_cls.__name__}s', self.data_manager_user, - db_update={'dbEntity': model_cls.__name__, 'numEntities': deleted_count, 'updateType': 'bulk_delete'} + db_update={'dbEntity': model_cls.__name__, 'numEntities': deleted_count, + 'parentEntityIds': {RNA_SAMPLE_GUID}, 'updateType': 'bulk_delete'} ) mock_logger.warning.assert_has_calls([mock.call(warn_log, self.data_manager_user) for warn_log in warnings]) @@ -776,7 +777,8 @@ def test_load_rna_seq_sample_data(self, mock_model_logger, mock_logger, mock_ope mock_logger.info.assert_called_with('Loading outlier data for NA19675_D2', 
self.data_manager_user) mock_model_logger.info.assert_called_with( f'create {model_cls.__name__}s', self.data_manager_user, db_update={ - 'dbEntity': model_cls.__name__, 'numEntities': 2, 'updateType': 'bulk_create', + 'dbEntity': model_cls.__name__, 'numEntities': 2, 'parentEntityIds': {RNA_SAMPLE_GUID}, + 'updateType': 'bulk_create', } ) @@ -797,7 +799,6 @@ def test_load_phenotype_prioritization_data(self, mock_model_logger, mock_logger response = self.client.post(url, content_type='application/json', data=json.dumps({'file': 'lirical_data.tsv.gz'})) self.assertEqual(response.status_code, 400) self.assertEqual(response.json()['error'], 'Invalid file: missing column(s) project, diseaseId') - mock_logger.info.assert_called_with('Loading phenotype-based prioritization data from lirical_data.tsv.gz', self.data_manager_user) mock_file_iter.assert_called_with('lirical_data.tsv.gz') mock_file_iter.return_value = self._join_data(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_NO_PROJECT_DATA) @@ -813,46 +814,33 @@ def test_load_phenotype_prioritization_data(self, mock_model_logger, mock_logger mock_file_iter.return_value = self._join_data(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_PROJECT_NOT_EXIST_DATA) response = self.client.post(url, content_type='application/json', data=json.dumps({'file': 'lirical_data.tsv.gz'})) self.assertEqual(response.status_code, 400) - self.assertEqual(response.json()['error'], 'Project not found or multiple projects with the same name CMG_Beggs_WGS') + self.assertEqual(response.json()['error'], 'Project (CMG_Beggs_WGS) not found') - project = Project.objects.get(name='Empty Project') - project.name = '1kg project nåme with uniçøde' - project.save() + project = Project.objects.create(created_by=self.data_manager_user, + name='1kg project nåme with uniçøde', workspace_namespace='my-seqr-billing') mock_file_iter.return_value = self._join_data(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_DATA) response = self.client.post(url, content_type='application/json', data=json.dumps({'file': 'lirical_data.tsv.gz'})) self.assertEqual(response.status_code, 400) - self.assertEqual(response.json()['error'], 'Project not found or multiple projects with the same name 1kg project nåme with uniçøde') - project.name = 'Empty Project' - project.save() + self.assertEqual(response.json()['error'], 'Multiple projects with the same name 1kg project nåme with uniçøde') + project.delete() - mock_logger.reset_mock() mock_file_iter.return_value = self._join_data(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_NO_EXIST_INDV_DATA) response = self.client.post(url, content_type='application/json', data=json.dumps({'file': 'lirical_data.tsv.gz'})) self.assertEqual(response.status_code, 400) self.assertEqual(response.json()['error'], 'Can\'t find individuals NA19678x, NA19679x') - info = [ - 'Loading phenotype-based prioritization data from lirical_data.tsv.gz', - 'Parsed LIRICAL data for project: 1kg project nåme with uniçøde' - ] - mock_logger.info.assert_has_calls([mock.call(info_log, self.data_manager_user) for info_log in info]) - mock_model_logger.info.assert_not_called() - info = [ - 'Loading phenotype-based prioritization data from lirical_data.tsv.gz', - 'Parsed LIRICAL data for project: 1kg project nåme with uniçøde', - 'Attempted loading 1 records of LIRICAL data to project 1kg project nåme with uniçøde', - 'Parsed LIRICAL data for project: Test Reprocessed Project', - 'Attempted loading 1 records of LIRICAL data to project Test Reprocessed Project', - ] - - mock_logger.reset_mock() 
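Illustrative sketch only, not part of the committed diff: the per-project info strings asserted in this test are built by load_phenotype_prioritization_data above, which prefixes a delete summary only when existing records were removed. Roughly, with a hypothetical helper name:

def project_info_line(project_name, deleted, loaded):
    # Mirrors the loader's per-project summary message.
    delete_info = f'deleted {deleted} record(s), ' if deleted else ''
    return f'Project {project_name}: {delete_info}loaded {loaded} record(s)'

# project_info_line('Test Reprocessed Project', 1, 1)
#   -> 'Project Test Reprocessed Project: deleted 1 record(s), loaded 1 record(s)'
# project_info_line('Test Reprocessed Project', 0, 1)
#   -> 'Project Test Reprocessed Project: loaded 1 record(s)'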
mock_file_iter.return_value = self._join_data(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_DATA) response = self.client.post(url, content_type='application/json', data=json.dumps({'file': 'lirical_data.tsv.gz'})) self.assertEqual(response.status_code, 200) - add_only_info = info + ['Loaded 2 LIRICAL data records'] - self.assertEqual(response.json()['info'], add_only_info) - mock_logger.info.assert_has_calls([mock.call(info_log, self.data_manager_user) for info_log in add_only_info]) - db_update = {'dbEntity': 'PhenotypePrioritization', 'numEntities': 2, 'updateType': 'bulk_create'} + info = [ + 'Loaded LIRICAL data from lirical_data.tsv.gz', + 'Project 1kg project nåme with uniçøde: loaded 1 record(s)', + 'Project Test Reprocessed Project: loaded 1 record(s)' + ] + self.assertEqual(response.json()['info'], info) + mock_logger.info.assert_called_with('\n'.join(info), self.data_manager_user) + db_update = {'dbEntity': 'PhenotypePrioritization', 'numEntities': 2, + 'parentEntityIds': {'I000002_na19678', 'I000015_na20885'}, 'updateType': 'bulk_create'} mock_model_logger.info.assert_called_with('create PhenotypePrioritizations', self.data_manager_user, db_update=db_update) mock_logger.reset_mock() @@ -860,12 +848,21 @@ def test_load_phenotype_prioritization_data(self, mock_model_logger, mock_logger mock_file_iter.return_value = self._join_data(PHENOTYPE_PRIORITIZATION_HEADER + LIRICAL_DATA) response = self.client.post(url, content_type='application/json', data=json.dumps({'file': 'lirical_data.tsv.gz'})) self.assertEqual(response.status_code, 200) - info += ['Deleted 2 existing LIRICAL records', 'Loaded 2 LIRICAL data records'] + info = [ + 'Loaded LIRICAL data from lirical_data.tsv.gz', + 'Project 1kg project nåme with uniçøde: deleted 1 record(s), loaded 1 record(s)', + 'Project Test Reprocessed Project: deleted 1 record(s), loaded 1 record(s)', + ] self.assertEqual(response.json()['info'], info) - mock_logger.info.assert_has_calls([mock.call(info_log, self.data_manager_user) for info_log in info]) + mock_logger.info.assert_called_with('\n'.join(info), self.data_manager_user) mock_model_logger.info.assert_has_calls([ mock.call('delete PhenotypePrioritizations', self.data_manager_user, db_update={ - 'dbEntity': 'PhenotypePrioritization', 'numEntities': 2, 'updateType': 'bulk_delete', + 'dbEntity': 'PhenotypePrioritization', 'numEntities': 1, + 'parentEntityIds': {'I000002_na19678'}, 'updateType': 'bulk_delete', + }), + mock.call('delete PhenotypePrioritizations', self.data_manager_user, db_update={ + 'dbEntity': 'PhenotypePrioritization', 'numEntities': 1, + 'parentEntityIds': {'I000015_na20885'}, 'updateType': 'bulk_delete', }), mock.call('create PhenotypePrioritizations', self.data_manager_user, db_update=db_update), ]) diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 0537417a68..f5a54c0335 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -419,7 +419,7 @@ def _load_rna_seq(model_cls, file_path, user, mapping_file, ignore_extra_samples individual_db_ids = {s.individual_id for s in samples} to_delete = model_cls.objects.filter(sample__individual_id__in=individual_db_ids).exclude(sample__data_source=data_source) if to_delete: - model_cls.bulk_delete(user, to_delete, parent='sample') + model_cls.bulk_delete(user, to_delete) loaded_sample_ids = set(model_cls.objects.filter(sample__in=samples).values_list('sample_id', flat=True).distinct()) samples_to_load = { From b8f77deb010e71e0cf85b246e644be2d078c2021 Mon Sep 17 
00:00:00 2001 From: Hana Snow Date: Tue, 1 Nov 2022 14:42:29 -0400 Subject: [PATCH 52/96] show mane transcript --- ui/redux/rootReducer.js | 1 + ui/redux/selectors.js | 1 + .../components/panel/variants/Transcripts.jsx | 44 ++++++++++++------- 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/ui/redux/rootReducer.js b/ui/redux/rootReducer.js index 2cdf7923a8..839e44cb92 100644 --- a/ui/redux/rootReducer.js +++ b/ui/redux/rootReducer.js @@ -347,6 +347,7 @@ const rootReducer = combineReducers({ searchGeneBreakdownLoading: loadingReducer(REQUEST_SEARCH_GENE_BREAKDOWN, RECEIVE_SEARCH_GENE_BREAKDOWN), savedSearchesByGuid: createObjectsByIdReducer(RECEIVE_SAVED_SEARCHES, 'savedSearchesByGuid'), savedSearchesLoading: loadingReducer(REQUEST_SAVED_SEARCHES, RECEIVE_SAVED_SEARCHES), + transcriptsById: createObjectsByIdReducer(RECEIVE_DATA, 'transcriptsById'), user: createSingleObjectReducer(UPDATE_USER), newUser: zeroActionsReducer, userOptionsByUsername: createSingleValueReducer(RECEIVE_USER_OPTIONS, {}), diff --git a/ui/redux/selectors.js b/ui/redux/selectors.js index 0c2e8b6729..f54f69b5fe 100644 --- a/ui/redux/selectors.js +++ b/ui/redux/selectors.js @@ -25,6 +25,7 @@ export const getMmeSubmissionsByGuid = state => state.mmeSubmissionsByGuid export const getMmeResultsByGuid = state => state.mmeResultsByGuid export const getGenesById = state => state.genesById export const getGenesIsLoading = state => state.genesLoading.isLoading +export const getTranscriptsById = state => state.transcriptsById export const getHpoTermsByParent = state => state.hpoTermsByParent export const getHpoTermsIsLoading = state => state.hpoTermsLoading.isLoading export const getLocusListsByGuid = state => state.locusListsByGuid diff --git a/ui/shared/components/panel/variants/Transcripts.jsx b/ui/shared/components/panel/variants/Transcripts.jsx index 8f6a39db11..7177e12cc8 100644 --- a/ui/shared/components/panel/variants/Transcripts.jsx +++ b/ui/shared/components/panel/variants/Transcripts.jsx @@ -4,7 +4,7 @@ import styled from 'styled-components' import { connect } from 'react-redux' import { Label, Header, Table, Segment } from 'semantic-ui-react' -import { getGenesById } from 'redux/selectors' +import { getGenesById, getTranscriptsById } from 'redux/selectors' import { updateVariantMainTranscript } from 'redux/rootReducer' import { VerticalSpacer } from '../../Spacers' import DispatchRequestButton from '../../buttons/DispatchRequestButton' @@ -22,7 +22,25 @@ const AnnotationLabel = styled.small` padding-right: 10px; ` -const Transcripts = React.memo(({ variant, genesById, updateMainTranscript }) => ( +const TRANSCRIPT_LABELS = [ + { + content: 'Canonical', + color: 'green', + shouldShow: transcript => transcript.canonical, + }, + { + content: 'MANE Select', + color: 'teal', + shouldShow: (transcript, transcriptsById) => transcriptsById[transcript.transcriptId]?.isManeSelect, + }, + { + content: 'seqr Chosen Transcript', + color: 'blue', + shouldShow: transcript => transcript.transcriptRank === 0, + }, +] + +const Transcripts = React.memo(({ variant, genesById, transcriptsById, updateMainTranscript }) => ( variant.transcripts && Object.entries(variant.transcripts).sort((transcriptsA, transcriptsB) => ( Math.min(...transcriptsA[1].map(t => t.transcriptRank)) - Math.min(...transcriptsB[1].map(t => t.transcriptRank)) )).map(([geneId, geneTranscripts]) => ( @@ -40,23 +58,13 @@ const Transcripts = React.memo(({ variant, genesById, updateMainTranscript }) => + {/* TODO show refseq ID */}
- { - transcript.transcriptRank === 0 && ( - - - - ) - } - { - transcript.canonical && ( - - - + {TRANSCRIPT_LABELS.map(({ shouldShow, ...labelProps }) => ( + shouldShow(transcript, transcriptsById) && ( +