diff --git a/CHANGELOG.md b/CHANGELOG.md index 9808f0fea5..73d72d8dda 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,8 @@ # _seqr_ Changes ## dev + +## 8/22/23 * Add db indices to optimize RNA data queries (REQUIRES DB MIGRATION) ## 7/11/23 diff --git a/deploy/docker/seqr/Dockerfile b/deploy/docker/seqr/Dockerfile index 2093aef700..22a7b63a27 100644 --- a/deploy/docker/seqr/Dockerfile +++ b/deploy/docker/seqr/Dockerfile @@ -93,6 +93,7 @@ EXPOSE 8000 ENV TERM=xterm COPY deploy/docker/seqr/readiness_probe / +COPY deploy/docker/seqr/wait_for_routes / COPY deploy/docker/seqr/bin/*.sh /usr/local/bin/ COPY deploy/docker/seqr/config/*.py ./ COPY deploy/docker/seqr/bashrc /root/.bashrc diff --git a/deploy/docker/seqr/wait_for_routes b/deploy/docker/seqr/wait_for_routes new file mode 100755 index 0000000000..f8263fe1ac --- /dev/null +++ b/deploy/docker/seqr/wait_for_routes @@ -0,0 +1,26 @@ +#!/bin/bash + +### +# Waits for network endpoints. Intended usage is within Kubernetes CronJobs to wait for sidecar availability. +# Usage: ./wait_for_routes https://www.google.com/ https://www.broadinstitute.org https://www.broadins.org +### + +RETRY_COUNT=10 +SLEEP_S=2 + +for route in "$@" +do + retries=0 + until [ "$retries" -ge "$RETRY_COUNT" ] + do + curl -s $route -o /dev/null && echo "Successful ping of $route" && break + retries=$((retries+1)) + if [ "$retries" -eq "$RETRY_COUNT" ]; then + echo "Route ${route} wasn't available after ${RETRY_COUNT} connection attempts" + exit 1 + else + echo "Unable to connect to ${route}, retrying. 
Attempt ${retries}/${RETRY_COUNT}" + sleep $SLEEP_S + fi + done +done diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index 8580ae9aee..eb2a55f875 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -778,6 +778,29 @@ def _omim_sort(cls, r, omim_gene_set): def _gene_rank_sort(cls, r, gene_ranks): return [hl.min(cls._gene_ids_expr(r).map(gene_ranks.get))] + def gene_counts(self): + selects = { + 'gene_ids': self._gene_ids_expr, + 'families': self.BASE_ANNOTATION_FIELDS['familyGuids'], + } + ch_ht = None + if self._comp_het_ht: + ch_ht = self._comp_het_ht.explode(self._comp_het_ht[GROUPED_VARIANTS_FIELD]) + ch_ht = ch_ht.select(**{k: v(ch_ht[GROUPED_VARIANTS_FIELD]) for k, v in selects.items()}) + + if self._ht: + ht = self._ht.select(**{k: v(self._ht) for k, v in selects.items()}) + if ch_ht: + ht = ht.join(ch_ht, 'outer') + ht = ht.transmute(**{k: hl.or_else(ht[k], ht[f'{k}_1']) for k in selects}) + else: + ht = ch_ht + + ht = ht.explode('gene_ids').explode('families') + return ht.aggregate(hl.agg.group_by( + ht.gene_ids, hl.struct(total=hl.agg.count(), families=hl.agg.counter(ht.families)) + )) + class VariantHailTableQuery(BaseHailTableQuery): diff --git a/hail_search/search.py b/hail_search/search.py index eec334a80f..9e9fc4dd7e 100644 --- a/hail_search/search.py +++ b/hail_search/search.py @@ -1,7 +1,7 @@ from hail_search.hail_search_query import QUERY_CLASS_MAP -def search_hail_backend(request): +def search_hail_backend(request, gene_counts=False): sample_data = request.pop('sample_data', {}) genome_version = request.pop('genome_version') @@ -12,7 +12,10 @@ def search_hail_backend(request): query_cls = QUERY_CLASS_MAP[single_data_type] query = query_cls(sample_data, genome_version, **request) - return query.search() + if gene_counts: + return query.gene_counts() + else: + return query.search() def load_globals(): diff --git a/hail_search/test_search.py b/hail_search/test_search.py index 
42c03b4f5d..71ebe42b92 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -3,8 +3,8 @@ from hail_search.test_utils import get_hail_search_body, FAMILY_2_VARIANT_SAMPLE_DATA, FAMILY_2_MISSING_SAMPLE_DATA, \ VARIANT1, VARIANT2, VARIANT3, VARIANT4, MULTI_PROJECT_SAMPLE_DATA, MULTI_PROJECT_MISSING_SAMPLE_DATA, \ - LOCATION_SEARCH, EXCLUDE_LOCATION_SEARCH, VARIANT_ID_SEARCH, RSID_SEARCH, SV_WGS_SAMPLE_DATA, SV_VARIANT1, \ - SV_VARIANT2, SV_VARIANT3, SV_VARIANT4 + LOCATION_SEARCH, EXCLUDE_LOCATION_SEARCH, VARIANT_ID_SEARCH, RSID_SEARCH, GENE_COUNTS, SV_WGS_SAMPLE_DATA, \ + SV_VARIANT1, SV_VARIANT2, SV_VARIANT3, SV_VARIANT4 from hail_search.web_app import init_web_app PROJECT_2_VARIANT = { @@ -119,7 +119,7 @@ async def test_status(self): resp_json = await resp.json() self.assertDictEqual(resp_json, {'success': True}) - async def _assert_expected_search(self, results, **search_kwargs): + async def _assert_expected_search(self, results, gene_counts=None, **search_kwargs): search_body = get_hail_search_body(**search_kwargs) async with self.client.request('POST', '/search', json=search_body) as resp: self.assertEqual(resp.status, 200) @@ -129,9 +129,18 @@ async def _assert_expected_search(self, results, **search_kwargs): for i, result in enumerate(resp_json['results']): self.assertEqual(result, results[i]) + if gene_counts: + async with self.client.request('POST', '/gene_counts', json=search_body) as resp: + self.assertEqual(resp.status, 200) + gene_counts_json = await resp.json() + self.assertDictEqual(gene_counts_json, gene_counts) + async def test_single_family_search(self): await self._assert_expected_search( - [VARIANT1, VARIANT2, VARIANT3, VARIANT4], sample_data=FAMILY_2_VARIANT_SAMPLE_DATA, + [VARIANT1, VARIANT2, VARIANT3, VARIANT4], sample_data=FAMILY_2_VARIANT_SAMPLE_DATA, gene_counts={ + 'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}}, + 'ENSG00000177000': {'total': 2, 'families': {'F000002_2': 2}}, + } ) await 
self._assert_expected_search( @@ -140,13 +149,16 @@ async def test_single_family_search(self): async def test_single_project_search(self): await self._assert_expected_search( - [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], omit_sample_type='SV_WES', + [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], omit_sample_type='SV_WES', gene_counts={ + 'ENSG00000097046': {'total': 3, 'families': {'F000002_2': 2, 'F000003_3': 1}}, + 'ENSG00000177000': {'total': 3, 'families': {'F000002_2': 2, 'F000003_3': 1}}, + } ) async def test_multi_project_search(self): await self._assert_expected_search( [PROJECT_2_VARIANT, MULTI_PROJECT_VARIANT1, MULTI_PROJECT_VARIANT2, VARIANT3, VARIANT4], - sample_data=MULTI_PROJECT_SAMPLE_DATA, + gene_counts=GENE_COUNTS, sample_data=MULTI_PROJECT_SAMPLE_DATA, ) async def test_inheritance_filter(self): @@ -195,8 +207,10 @@ async def test_inheritance_filter(self): inheritance_mode = 'compound_het' await self._assert_expected_search( - [[VARIANT3, VARIANT4]], inheritance_mode=inheritance_mode, sample_data=MULTI_PROJECT_SAMPLE_DATA, - **COMP_HET_ALL_PASS_FILTERS, + [[VARIANT3, VARIANT4]], inheritance_mode=inheritance_mode, sample_data=MULTI_PROJECT_SAMPLE_DATA, gene_counts={ + 'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}}, + 'ENSG00000177000': {'total': 1, 'families': {'F000002_2': 1}}, + }, **COMP_HET_ALL_PASS_FILTERS, ) await self._assert_expected_search( @@ -206,8 +220,10 @@ async def test_inheritance_filter(self): inheritance_mode = 'recessive' await self._assert_expected_search( - [PROJECT_2_VARIANT1, VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode=inheritance_mode, - sample_data=MULTI_PROJECT_SAMPLE_DATA, **COMP_HET_ALL_PASS_FILTERS, + [PROJECT_2_VARIANT1, VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode=inheritance_mode, gene_counts={ + 'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}}, + 'ENSG00000177000': {'total': 2, 'families': {'F000002_2': 2}}, + }, sample_data=MULTI_PROJECT_SAMPLE_DATA, 
**COMP_HET_ALL_PASS_FILTERS, ) await self._assert_expected_search( diff --git a/hail_search/test_utils.py b/hail_search/test_utils.py index 85a942107e..993f5fc63f 100644 --- a/hail_search/test_utils.py +++ b/hail_search/test_utils.py @@ -537,6 +537,11 @@ VARIANT_ID_SEARCH = {'variant_ids': [['1', 10439, 'AC', 'A'], ['1', 91511686, 'TCA', 'G']], 'rs_ids': []} RSID_SEARCH = {'variant_ids': [], 'rs_ids': ['rs1801131']} +GENE_COUNTS = { + 'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}}, + 'ENSG00000177000': {'total': 3, 'families': {'F000002_2': 2, 'F000011_11': 1}}, +} + def get_hail_search_body(genome_version='GRCh38', num_results=100, sample_data=None, omit_sample_type=None, **search_body): sample_data = sample_data or EXPECTED_SAMPLE_DATA diff --git a/hail_search/web_app.py b/hail_search/web_app.py index 0dc5775a6b..303ab82f5c 100644 --- a/hail_search/web_app.py +++ b/hail_search/web_app.py @@ -14,6 +14,10 @@ def hl_json_dumps(obj): return json.dumps(obj, default=_hl_json_default) +async def gene_counts(request: web.Request) -> web.Response: + return web.json_response(search_hail_backend(await request.json(), gene_counts=True), dumps=hl_json_dumps) + + async def search(request: web.Request) -> web.Response: hail_results, total_results = search_hail_backend(await request.json()) return web.json_response({'results': hail_results, 'total': total_results}, dumps=hl_json_dumps) @@ -28,6 +32,7 @@ def init_web_app(): app.add_routes([ web.get('/status', status), web.post('/search', search), + web.post('/gene_counts', gene_counts), ]) load_globals() return app diff --git a/seqr/utils/search/hail_search_utils_tests.py b/seqr/utils/search/hail_search_utils_tests.py index b4dc36b882..9090249b2a 100644 --- a/seqr/utils/search/hail_search_utils_tests.py +++ b/seqr/utils/search/hail_search_utils_tests.py @@ -8,10 +8,10 @@ from seqr.models import Family from seqr.utils.search.utils import get_variant_query_gene_counts, query_variants, get_single_variant, \ 
get_variants_for_variant_ids, InvalidSearchException -from seqr.utils.search.search_utils_tests import SearchTestHelper, MOCK_COUNTS +from seqr.utils.search.search_utils_tests import SearchTestHelper from hail_search.test_utils import get_hail_search_body, EXPECTED_SAMPLE_DATA, FAMILY_1_SAMPLE_DATA, \ FAMILY_2_ALL_SAMPLE_DATA, ALL_AFFECTED_SAMPLE_DATA, CUSTOM_AFFECTED_SAMPLE_DATA, HAIL_BACKEND_VARIANTS, \ - LOCATION_SEARCH, EXCLUDE_LOCATION_SEARCH, VARIANT_ID_SEARCH, RSID_SEARCH + LOCATION_SEARCH, EXCLUDE_LOCATION_SEARCH, VARIANT_ID_SEARCH, RSID_SEARCH, GENE_COUNTS MOCK_HOST = 'http://test-hail-host' @@ -155,10 +155,10 @@ def test_query_variants(self): @responses.activate def test_get_variant_query_gene_counts(self): - responses.add(responses.POST, f'{MOCK_HOST}:5000/gene_counts', json=MOCK_COUNTS, status=200) + responses.add(responses.POST, f'{MOCK_HOST}:5000/gene_counts', json=GENE_COUNTS, status=200) gene_counts = get_variant_query_gene_counts(self.results_model, self.user) - self.assertDictEqual(gene_counts, MOCK_COUNTS) + self.assertDictEqual(gene_counts, GENE_COUNTS) self.assert_cached_results({'gene_aggs': gene_counts}) self._test_expected_search_call(sort=None) diff --git a/seqr/utils/search/search_utils_tests.py b/seqr/utils/search/search_utils_tests.py index afb0816f98..ccfec426da 100644 --- a/seqr/utils/search/search_utils_tests.py +++ b/seqr/utils/search/search_utils_tests.py @@ -4,17 +4,12 @@ import json import mock +from hail_search.test_utils import GENE_COUNTS from seqr.models import Family, Sample, VariantSearch, VariantSearchResults from seqr.utils.search.utils import get_single_variant, get_variants_for_variant_ids, get_variant_query_gene_counts, \ query_variants, InvalidSearchException from seqr.views.utils.test_utils import PARSED_VARIANTS, PARSED_COMPOUND_HET_VARIANTS_MULTI_PROJECT, GENE_FIELDS -MOCK_COUNTS = { - 'ENSG00000135953': {'total': 3, 'families': {'F000003_3': 2, 'F000002_2': 1, 'F000005_5': 1}}, - 'ENSG00000228198': {'total': 5, 
'families': {'F000003_3': 4, 'F000002_2': 1, 'F000005_5': 1}}, - 'ENSG00000240361': {'total': 2, 'families': {'F000003_3': 2}}, -} - class SearchTestHelper(object): @@ -354,12 +349,12 @@ def test_invalid_search_get_variant_query_gene_counts(self): def test_get_variant_query_gene_counts(self, mock_get_variants): def _mock_get_variants(families, search, user, previous_search_results, genome_version, **kwargs): - previous_search_results['gene_aggs'] = MOCK_COUNTS - return MOCK_COUNTS + previous_search_results['gene_aggs'] = GENE_COUNTS + return GENE_COUNTS mock_get_variants.side_effect = _mock_get_variants gene_counts = get_variant_query_gene_counts(self.results_model, self.user) - self.assertDictEqual(gene_counts, MOCK_COUNTS) + self.assertDictEqual(gene_counts, GENE_COUNTS) results_cache = {'gene_aggs': gene_counts} self.assert_cached_results(results_cache) self._test_expected_search_call( diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py index cdc87d1cdb..2bae0faebe 100644 --- a/seqr/views/apis/report_api.py +++ b/seqr/views/apis/report_api.py @@ -713,8 +713,9 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False): READ_RNA_TABLE_COLUMNS.insert(READ_RNA_TABLE_COLUMNS.index('gene_annotation')+1, 'gene_annotation_details') READ_RNA_TABLE_COLUMNS.insert(READ_RNA_TABLE_COLUMNS.index('alignment_log_file')+1, 'alignment_postprocessing') READ_SET_TABLE_COLUMNS = ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id'] +CALLED_VARIANT_FILE_COLUMN = 'called_variants_dna_file' CALLED_TABLE_COLUMNS = [ - 'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum', + 'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', CALLED_VARIANT_FILE_COLUMN, 'md5sum', 'caller_software', 'variant_types', 'analysis_details', ] @@ -912,7 +913,9 @@ def gregor_export(request): ('experiment_dna_short_read', EXPERIMENT_TABLE_COLUMNS, airtable_rows), 
('aligned_dna_short_read', READ_TABLE_COLUMNS, airtable_rows), ('aligned_dna_short_read_set', READ_SET_TABLE_COLUMNS, airtable_rows), - ('called_variants_dna_short_read', CALLED_TABLE_COLUMNS, airtable_rows), + ('called_variants_dna_short_read', CALLED_TABLE_COLUMNS, [ + row for row in airtable_rows if row.get(CALLED_VARIANT_FILE_COLUMN) + ]), ('experiment_rna_short_read', EXPERIMENT_RNA_TABLE_COLUMNS, airtable_rna_rows), ('aligned_rna_short_read', READ_RNA_TABLE_COLUMNS, airtable_rna_rows), ('experiment', EXPERIMENT_LOOKUP_TABLE_COLUMNS, experiment_lookup_rows), @@ -1033,6 +1036,19 @@ def _get_experiment_lookup_row(is_rna, row_data): } +is_integer = lambda val, *args: val.isnumeric() or re.match(r'^\d{1,3}(,\d{3})*$', val) +DATA_TYPE_VALIDATORS = { + 'string': lambda val, validator: (not validator.get('is_bucket_path')) or val.startswith('gs://'), + 'enumeration': lambda val, validator: val in validator['enumerations'], + 'integer': is_integer, + 'float': lambda val, validator: is_integer(val) or re.match(r'^\d+\.\d+$', val), + 'date': lambda val, validator: bool(re.match(r'^\d{4}-\d{2}-\d{2}$', val)), +} +DATA_TYPE_ERROR_FORMATTERS = { + 'string': lambda validator: ' are a google bucket path starting with gs://', + 'enumeration': lambda validator: f': {", ".join(validator["enumerations"])}', +} + def _validate_gregor_files(file_data): errors = [] warnings = [] @@ -1070,6 +1086,26 @@ def _validate_gregor_files(file_data): warnings.append( f'The following columns are included in the "{file_name}" data model but are missing in the report: {col_summary}' ) + invalid_data_type_columns = { + col: validator['data_type'] for col, validator in table_validator.items() + if validator.get('data_type') and validator['data_type'] not in DATA_TYPE_VALIDATORS + } + if invalid_data_type_columns: + col_summary = ', '.join(sorted([f'{col} ({data_type})' for col, data_type in invalid_data_type_columns.items()])) + warnings.append( + f'The following columns are included in the 
"{file_name}" data model but have an unsupported data type: {col_summary}' + ) + invalid_enum_columns = [ + col for col, validator in table_validator.items() + if validator.get('data_type') == 'enumeration' and not validator.get('enumerations') + ] + if invalid_enum_columns: + for col in invalid_enum_columns: + table_validator[col]['data_type'] = None + col_summary = ', '.join(sorted(invalid_enum_columns)) + warnings.append( + f'The following columns are specified as "enumeration" in the "{file_name}" data model but are missing the allowed values definition: {col_summary}' + ) for column in columns: _validate_column_data( @@ -1112,15 +1148,18 @@ def _has_required_table(table, validator, tables): def _validate_column_data(column, file_name, data, column_validator, warnings, errors): - enum = column_validator.get('enumerations') + data_type = column_validator.get('data_type') + data_type_validator = DATA_TYPE_VALIDATORS.get(data_type) + unique = column_validator.get('is_unique') required = column_validator.get('required') recommended = column in WARN_MISSING_TABLE_COLUMNS.get(file_name, []) - if not (required or enum or recommended): + if not (required or unique or recommended or data_type_validator): return missing = [] warn_missing = [] invalid = [] + grouped_values = defaultdict(set) for row in data: value = row.get(column) if not value: @@ -1130,9 +1169,13 @@ def _validate_column_data(column, file_name, data, column_validator, warnings, e check_recommend_condition = WARN_MISSING_CONDITIONAL_COLUMNS.get(column) if not check_recommend_condition or check_recommend_condition(row): warn_missing.append(_get_row_id(row)) - elif enum and value not in enum: + elif data_type_validator and not data_type_validator(value, column_validator): invalid.append(f'{_get_row_id(row)} ({value})') - if missing or warn_missing or invalid: + elif unique: + grouped_values[value].add(_get_row_id(row)) + + duplicates = [f'{k} ({", ".join(sorted(v))})' for k, v in grouped_values.items() if 
len(v) > 1] + if missing or warn_missing or invalid or duplicates: airtable_summary = ' (from Airtable)' if column in ALL_AIRTABLE_COLUMNS else '' error_template = f'The following entries {{issue}} "{column}"{airtable_summary} in the "{file_name}" table' if missing: @@ -1141,8 +1184,14 @@ def _validate_column_data(column, file_name, data, column_validator, warnings, e ) if invalid: invalid_values = f'Invalid values: {", ".join(sorted(invalid))}' + allowed = DATA_TYPE_ERROR_FORMATTERS[data_type](column_validator) \ + if data_type in DATA_TYPE_ERROR_FORMATTERS else f' have data type {data_type}' + errors.append( + f'{error_template.format(issue="have invalid values for")}. Allowed values{allowed}. {invalid_values}' + ) + if duplicates: errors.append( - f'{error_template.format(issue="have invalid values for")}. Allowed values: {", ".join(enum)}. {invalid_values}' + f'{error_template.format(issue="have non-unique values for")}: {", ".join(sorted(duplicates))}' ) if warn_missing: warnings.append( diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index b03234cebe..ee2d6c3bf3 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -209,7 +209,7 @@ 'target_insert_size_wes': '385', 'sequencing_platform_wes': 'NovaSeq', 'aligned_dna_short_read_file_wes': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.cram', - 'aligned_dna_short_read_index_file_wes': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.crai', + 'aligned_dna_short_read_index_file_wes': 'NA', 'md5sum_wes': '129c28163df082', 'reference_assembly': 'GRCh38', 'alignment_software_dna': 'BWA-MEM-2.3', @@ -295,13 +295,13 @@ 'md5sum_wes': 'a6f6308866765ce8', 'md5sum_wgs': '2aa33e8c32020b1c', 'reference_assembly': 'GRCh38', - 'alignment_software_dna': 'BWA 0.7.15.r1140', + 'alignment_software_dna': 'BWA-MEM-2.3', 'mean_coverage_wes': '42.8', 'mean_coverage_wgs': '36.1', 'analysis_details': '', - 
'called_variants_dna_short_read_id': 'NA', + 'called_variants_dna_short_read_id': '', 'aligned_dna_short_read_set_id': 'Broad_NA20888_D1', - 'called_variants_dna_file': 'NA', + 'called_variants_dna_file': '', 'caller_software': 'NA', 'variant_types': 'SNV', }, @@ -388,23 +388,23 @@ 'table': 'participant', 'required': True, 'columns': [ - {'column': 'participant_id', 'required': True}, - {'column': 'internal_project_id'}, - {'column': 'gregor_center', 'required': True, 'enumerations': ['BCM', 'BROAD', 'UW']}, - {'column': 'consent_code', 'required': True, 'enumerations': ['GRU', 'HMB']}, - {'column': 'recontactable', 'enumerations': ['Yes', 'No']}, - {'column': 'prior_testing'}, + {'column': 'participant_id', 'required': True, 'data_type': 'string'}, + {'column': 'internal_project_id', 'data_type': 'reference'}, + {'column': 'gregor_center', 'required': True, 'data_type': 'enumeration', 'enumerations': ['BCM', 'BROAD', 'UW']}, + {'column': 'consent_code', 'required': True, 'data_type': 'enumeration', 'enumerations': ['GRU', 'HMB']}, + {'column': 'recontactable', 'data_type': 'enumeration', 'enumerations': ['Yes', 'No']}, + {'column': 'prior_testing', 'data_type': 'enumeration'}, {'column': 'family_id', 'required': True}, {'column': 'paternal_id'}, {'column': 'maternal_id'}, {'column': 'proband_relationship', 'required': True}, - {'column': 'sex', 'required': True, 'enumerations': ['Male', 'Female', 'Unknown']}, - {'column': 'reported_race', 'enumerations': ['Asian', 'White', 'Black']}, - {'column': 'reported_ethnicity', 'enumerations': ['Hispanic or Latino', 'Not Hispanic or Latino']}, + {'column': 'sex', 'required': True, 'data_type': 'enumeration', 'enumerations': ['Male', 'Female', 'Unknown']}, + {'column': 'reported_race', 'data_type': 'enumeration', 'enumerations': ['Asian', 'White', 'Black']}, + {'column': 'reported_ethnicity', 'data_type': 'enumeration', 'enumerations': ['Hispanic or Latino', 'Not Hispanic or Latino']}, {'column': 'ancestry_metadata'}, - 
{'column': 'affected_status', 'required': True, 'enumerations': ['Affected', 'Unaffected', 'Unknown']}, + {'column': 'affected_status', 'required': True, 'data_type': 'enumeration', 'enumerations': ['Affected', 'Unaffected', 'Unknown']}, {'column': 'phenotype_description'}, - {'column': 'age_at_enrollment'}, + {'column': 'age_at_enrollment', 'data_type': 'date'}, ], }, { @@ -413,13 +413,13 @@ 'columns': [ {'column': 'aligned_dna_short_read_id', 'required': True}, {'column': 'experiment_dna_short_read_id', 'required': True}, - {'column': 'aligned_dna_short_read_file'}, - {'column': 'aligned_dna_short_read_index_file'}, - {'column': 'alignment_software'}, + {'column': 'aligned_dna_short_read_file', 'is_unique': True, 'data_type': 'string', 'is_bucket_path': True}, + {'column': 'aligned_dna_short_read_index_file', 'data_type': 'string', 'is_bucket_path': True}, + {'column': 'alignment_software', 'is_unique': True}, {'column': 'analysis_details'}, - {'column': 'md5sum'}, - {'column': 'mean_coverage', 'required': True}, - {'column': 'reference_assembly'}, + {'column': 'md5sum', 'is_unique': True}, + {'column': 'mean_coverage', 'required': True, 'data_type': 'float'}, + {'column': 'reference_assembly', 'data_type': 'integer'}, {'column': 'reference_assembly_details'}, {'column': 'reference_assembly_uri'}, {'column': 'quality_issues'}, @@ -441,6 +441,33 @@ 'required': 'CONDITIONAL (aligned_dna_short_read_set, dna_read_data)', 'columns': [{'column': 'analyte_id', 'required': True}], }, + { + 'table': 'experiment_rna_short_read', + 'columns': [ + {'column': 'experiment_rna_short_read_id', 'required': True}, + {'column': 'analyte_id', 'required': True}, + {'column': 'experiment_sample_id'}, + {'column': 'seq_library_prep_kit_method'}, + {'column': 'library_prep_type'}, + {'column': 'experiment_type'}, + {'column': 'read_length', 'data_type': 'integer'}, + {'column': 'single_or_paired_ends'}, + {'column': 'date_data_generation', 'data_type': 'float'}, + {'column': 
'sequencing_platform'}, + {'column': 'within_site_batch_name'}, + {'column': 'RIN', 'data_type': 'float'}, + {'column': 'estimated_library_size'}, + {'column': 'total_reads', 'data_type': 'integer'}, + {'column': 'percent_rRNA', 'data_type': 'float'}, + {'column': 'percent_mRNA', 'data_type': 'float'}, + {'column': 'percent_mtRNA', 'data_type': 'float'}, + {'column': 'percent_Globin', 'data_type': 'float'}, + {'column': 'percent_UMI', 'data_type': 'float'}, + {'column': '5prime3prime_bias', 'data_type': 'float'}, + {'column': 'percent_GC', 'data_type': 'float'}, + {'column': 'percent_chrX_Y', 'data_type': 'float'}, + ], + }, ] } @@ -816,15 +843,22 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat 'The following tables are required in the data model but absent from the reports: subject, dna_read_data_set', 'The following columns are included in the "participant" table but are missing from the data model: age_at_last_observation, ancestry_detail, pmid_id, proband_relationship_detail, sex_detail, twin_id', 'The following columns are included in the "participant" data model but are missing in the report: ancestry_metadata', + 'The following columns are included in the "participant" data model but have an unsupported data type: internal_project_id (reference)', + 'The following columns are specified as "enumeration" in the "participant" data model but are missing the allowed values definition: prior_testing', 'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881', 'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', 'The following entries are missing 
recommended "phenotype_description" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', 'The following entries are missing recommended "age_at_enrollment" in the "participant" table: Broad_HG00731, Broad_NA20870, Broad_NA20872, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', - ] + skipped_file_validation_warnings[1:5] + skipped_file_validation_warnings[7:]) + ] + skipped_file_validation_warnings[1:5] + skipped_file_validation_warnings[7:8] + skipped_file_validation_warnings[9:]) self.assertListEqual(response.json()['errors'], [ 'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', 'The following entries have invalid values for "reported_race" in the "participant" table. Allowed values: Asian, White, Black. Invalid values: Broad_NA19675_1 (Middle Eastern or North African)', + 'The following entries have invalid values for "age_at_enrollment" in the "participant" table. Allowed values have data type date. Invalid values: Broad_NA19675_1 (18)', + 'The following entries have invalid values for "aligned_dna_short_read_index_file" (from Airtable) in the "aligned_dna_short_read" table. Allowed values are a google bucket path starting with gs://. Invalid values: VCGS_FAM203_621_D2 (NA)', + 'The following entries have invalid values for "reference_assembly" (from Airtable) in the "aligned_dna_short_read" table. Allowed values have data type integer. 
Invalid values: NA20888 (GRCh38), VCGS_FAM203_621_D2 (GRCh38)', 'The following entries are missing required "mean_coverage" (from Airtable) in the "aligned_dna_short_read" table: VCGS_FAM203_621_D2', + 'The following entries have non-unique values for "alignment_software" (from Airtable) in the "aligned_dna_short_read" table: BWA-MEM-2.3 (NA20888, VCGS_FAM203_621_D2)', + 'The following entries have invalid values for "date_data_generation" (from Airtable) in the "experiment_rna_short_read" table. Allowed values have data type float. Invalid values: NA19679 (2023-02-11)', ]) responses.add(responses.GET, MOCK_DATA_MODEL_URL, status=404) @@ -976,20 +1010,19 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False): self.assertIn([ 'Broad_exome_VCGS_FAM203_621_D2_1', 'Broad_exome_VCGS_FAM203_621_D2', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.cram', - 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.crai', '129c28163df082', 'GRCh38', - '', '', '', 'BWA-MEM-2.3', 'DOI:10.5281/zenodo.4469317', '', + 'NA', '129c28163df082', 'GRCh38', '', '', '', 'BWA-MEM-2.3', 'DOI:10.5281/zenodo.4469317', '', ], read_file) self.assertIn([ 'Broad_exome_NA20888_1', 'Broad_exome_NA20888', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.cram', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.crai', 'a6f6308866765ce8', 'GRCh38', '', '', - '42.8', 'BWA 0.7.15.r1140', '', '', + '42.8', 'BWA-MEM-2.3', '', '', ], read_file) self.assertEqual([ 'Broad_genome_NA20888_1_1', 'Broad_genome_NA20888_1', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888_1.cram', 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888_1.crai', '2aa33e8c32020b1c', 'GRCh38', '', '', - '36.1', 'BWA 0.7.15.r1140', '', '', + '36.1', 'BWA-MEM-2.3', '', '', ] in read_file, has_second_project) self.assertEqual(len(read_set_file), num_airtable_rows) @@ -998,7 +1031,7 @@ def _assert_expected_gregor_files(self, mock_open, 
has_second_project=False): self.assertIn(['Broad_NA20888_D1', 'Broad_exome_NA20888_1'], read_set_file) self.assertEqual(['Broad_NA20888_D1', 'Broad_genome_NA20888_1_1'] in read_set_file, has_second_project) - self.assertEqual(len(called_file), num_airtable_rows) + self.assertEqual(len(called_file), 2) self.assertEqual(called_file[0], [ 'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum', 'caller_software', 'variant_types', 'analysis_details', @@ -1007,9 +1040,6 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False): 'SX2-3', 'BCM_H7YG5DSX2', 'gs://fc-fed09429-e563-44a7-aaeb-776c8336ba02/COL_FAM1_1_D1.SV.vcf', '129c28163df082', 'gatk4.1.2', 'SNV', 'DOI:10.5281/zenodo.4469317', ], called_file) - self.assertIn(['NA', 'Broad_NA20888_D1', 'NA', 'a6f6308866765ce8', 'NA', 'SNV', ''], called_file) - self.assertEqual( - ['NA', 'Broad_NA20888_D1', 'NA', '2aa33e8c32020b1c', 'NA', 'SNV', ''] in called_file, has_second_project) self.assertEqual(len(experiment_rna_file), 2) self.assertEqual(experiment_rna_file[0], [ diff --git a/ui/shared/components/panel/variants/Predictions.jsx b/ui/shared/components/panel/variants/Predictions.jsx index c4dcd7e5d9..7899621545 100644 --- a/ui/shared/components/panel/variants/Predictions.jsx +++ b/ui/shared/components/panel/variants/Predictions.jsx @@ -8,7 +8,7 @@ import { getGenesById } from 'redux/selectors' import { PREDICTOR_FIELDS, getVariantMainGeneId } from 'shared/utils/constants' import { snakecaseToTitlecase } from 'shared/utils/stringUtils' import { HorizontalSpacer } from '../../Spacers' -import { ButtonLink } from '../../StyledComponents' +import { ButtonLink, ColoredIcon } from '../../StyledComponents' const PredictionValue = styled.span` margin-left: 5px; @@ -19,8 +19,10 @@ const PredictionValue = styled.span` const NUM_TO_SHOW_ABOVE_THE_FOLD = 6 // how many predictors to show immediately +const PRED_COLOR_MAP = ['green', 'olive', 'grey', 'yellow', 
'red', '#8b0000'] + const predictionFieldValue = ( - predictions, { field, dangerThreshold, warningThreshold, indicatorMap, infoField, infoTitle }, + predictions, { field, thresholds, indicatorMap, infoField, infoTitle }, ) => { let value = predictions[field] if (value === null || value === undefined) { @@ -29,22 +31,23 @@ const predictionFieldValue = ( const infoValue = predictions[infoField] - if (dangerThreshold) { - value = parseFloat(value).toPrecision(2) - let color = 'green' - if (value >= dangerThreshold) { - color = 'red' - } else if (value >= warningThreshold) { - color = 'yellow' - } - return { value, color, infoValue, infoTitle, dangerThreshold, warningThreshold } + if (thresholds) { + value = parseFloat(value).toPrecision(3) + const color = PRED_COLOR_MAP.find( + (clr, i) => (thresholds[i - 1] || thresholds[i]) && + (thresholds[i - 1] === undefined || value >= thresholds[i - 1]) && + (thresholds[i] === undefined || value < thresholds[i]), + ) + return { value, color, infoValue, infoTitle, thresholds } } return indicatorMap[value[0]] || indicatorMap[value] } +const coloredIcon = color => React.createElement(color.startsWith('#') ? ColoredIcon : Icon, { name: 'circle', size: 'small', color }) + const Prediction = ( - { field, fieldTitle, value, color, infoValue, infoTitle, warningThreshold, dangerThreshold, href }, + { field, fieldTitle, value, color, infoValue, infoTitle, thresholds, href }, ) => { const indicator = infoValue ? ( } /> - ) : + ) : coloredIcon(color) const fieldName = fieldTitle || snakecaseToTitlecase(field) - const fieldDisplay = dangerThreshold ? ( + const fieldDisplay = thresholds ? ( -
{`Red > ${dangerThreshold}`}
- {warningThreshold < dangerThreshold &&
{`Yellow > ${warningThreshold}`}
} - + PRED_COLOR_MAP.map((c, i) => { + if (thresholds[i] === undefined && thresholds[i - 1] === undefined) { + return null + } + return ( +
+ {coloredIcon(c)} + {thresholds[i] === undefined ? ` >= ${thresholds[i - 1]}` : ` < ${thresholds[i]}`} +
+ ) + }) } trigger={{fieldName}} /> @@ -85,8 +95,7 @@ Prediction.propTypes = { infoTitle: PropTypes.string, fieldTitle: PropTypes.string, color: PropTypes.string, - warningThreshold: PropTypes.number, - dangerThreshold: PropTypes.number, + thresholds: PropTypes.arrayOf(PropTypes.number), href: PropTypes.string, } @@ -116,8 +125,8 @@ class Predictions extends React.PureComponent { if (gene && gene.primateAi) { genePredictors.primate_ai = { field: 'primate_ai', - warningThreshold: gene.primateAi.percentile25, - dangerThreshold: gene.primateAi.percentile75, + thresholds: [undefined, undefined, gene.primateAi.percentile25.toPrecision(3), + gene.primateAi.percentile75.toPrecision(3), undefined], } } diff --git a/ui/shared/utils/constants.js b/ui/shared/utils/constants.js index 709b72b471..9334446cab 100644 --- a/ui/shared/utils/constants.js +++ b/ui/shared/utils/constants.js @@ -1317,15 +1317,14 @@ export const NO_SV_IN_SILICO_GROUPS = [MISSENSE_IN_SILICO_GROUP, CODING_IN_SILIC export const SPLICE_AI_FIELD = 'splice_ai' export const PREDICTOR_FIELDS = [ - { field: 'cadd', group: CODING_IN_SILICO_GROUP, warningThreshold: 10, dangerThreshold: 20, min: 1, max: 99 }, - { field: 'revel', group: MISSENSE_IN_SILICO_GROUP, warningThreshold: 0.5, dangerThreshold: 0.75 }, - { field: 'primate_ai', group: MISSENSE_IN_SILICO_GROUP, warningThreshold: 0.5, dangerThreshold: 0.7 }, - { field: 'mpc', group: MISSENSE_IN_SILICO_GROUP, warningThreshold: 1, dangerThreshold: 2, max: 5 }, + { field: 'cadd', group: CODING_IN_SILICO_GROUP, thresholds: [0.151, 22.8, 25.3, 28.1, undefined], min: 1, max: 99 }, + { field: 'revel', group: MISSENSE_IN_SILICO_GROUP, thresholds: [0.0161, 0.291, 0.644, 0.773, 0.932] }, + { field: 'primate_ai', group: MISSENSE_IN_SILICO_GROUP, thresholds: [undefined, 0.484, 0.79, 0.867, undefined] }, + { field: 'mpc', group: MISSENSE_IN_SILICO_GROUP, thresholds: [undefined, undefined, 1.36, 1.828, undefined], max: 5 }, { field: SPLICE_AI_FIELD, group: 
SPLICING_IN_SILICO_GROUP, - warningThreshold: 0.5, - dangerThreshold: 0.8, + thresholds: [undefined, undefined, 0.5, 0.8, undefined], infoField: 'splice_ai_consequence', infoTitle: 'Predicted Consequence', fieldTitle: 'SpliceAI', @@ -1333,20 +1332,25 @@ export const PREDICTOR_FIELDS = [ `https://spliceailookup.broadinstitute.org/#variant=${chrom}-${pos}-${ref}-${alt}&hg=${genomeVersion}&distance=1000&mask=1` ), }, - { field: 'eigen', group: CODING_IN_SILICO_GROUP, warningThreshold: 1, dangerThreshold: 2, max: 99 }, - { field: 'dann', displayOnly: true, warningThreshold: 0.93, dangerThreshold: 0.96 }, - { field: 'strvctvre', group: SV_IN_SILICO_GROUP, warningThreshold: 0.5, dangerThreshold: 0.75 }, + { field: 'eigen', group: CODING_IN_SILICO_GROUP, thresholds: [undefined, undefined, 1, 2, undefined], max: 99 }, + { field: 'dann', displayOnly: true, thresholds: [undefined, undefined, 0.93, 0.96, undefined] }, + { field: 'strvctvre', group: SV_IN_SILICO_GROUP, thresholds: [undefined, undefined, 0.5, 0.75, undefined] }, { field: 'polyphen', group: MISSENSE_IN_SILICO_GROUP, indicatorMap: POLYPHEN_MAP }, { field: 'sift', group: MISSENSE_IN_SILICO_GROUP, indicatorMap: INDICATOR_MAP }, { field: 'mut_taster', group: MISSENSE_IN_SILICO_GROUP, indicatorMap: MUTTASTER_MAP }, { field: 'fathmm', group: MISSENSE_IN_SILICO_GROUP, indicatorMap: FATHMM_MAP }, - { field: 'vest', warningThreshold: 0.5, dangerThreshold: 0.764 }, - { field: 'mut_pred', warningThreshold: 0.392, dangerThreshold: 0.737 }, - { field: 'apogee', warningThreshold: 0.5, dangerThreshold: 0.5 }, - { field: 'gnomad_noncoding', fieldTitle: 'gnomAD Constraint', displayOnly: true, warningThreshold: 2.18, dangerThreshold: 4 }, + { field: 'vest', thresholds: [undefined, 0.45, 0.764, 0.861, 0.965] }, + { field: 'mut_pred', thresholds: [0.0101, 0.392, 0.737, 0.829, 0.932] }, + { field: 'apogee', thresholds: [undefined, undefined, 0.5, 0.5, undefined] }, + { + field: 'gnomad_noncoding', + fieldTitle: 'gnomAD Constraint', 
+ displayOnly: true, + thresholds: [undefined, undefined, 2.18, 4, undefined], + }, { field: 'haplogroup_defining', indicatorMap: { Y: { color: 'green', value: '' } } }, { field: 'mitotip', indicatorMap: MITOTIP_MAP }, - { field: 'hmtvar', warningThreshold: 0.35, dangerThreshold: 0.35 }, + { field: 'hmtvar', thresholds: [undefined, undefined, 0.35, 0.35, undefined] }, ] export const getVariantMainGeneId = ({ transcripts = {}, mainTranscriptId, selectedMainTranscriptId }) => {