From b4bde4e279df82a1a0609f413499eef748ed8dce Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 15 Feb 2024 17:00:20 -0500 Subject: [PATCH 1/9] fix mixed data typoe cmp het sort --- hail_search/queries/base.py | 2 +- hail_search/queries/multi_data_types.py | 2 +- hail_search/test_search.py | 76 +++++++++++++++++++++---- 3 files changed, 67 insertions(+), 13 deletions(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index a87c8b5d8e..6b4f0ce538 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -976,7 +976,7 @@ def _get_sort_expressions(self, ht, sort): if sort in self.PREDICTION_FIELDS_CONFIG: prediction_path = self.PREDICTION_FIELDS_CONFIG[sort] - return [-hl.float64(ht[prediction_path.source][prediction_path.field])] + return [hl.or_else(-hl.float64(ht[prediction_path.source][prediction_path.field]), 0)] if sort == OMIM_SORT: return self._omim_sort(ht, hl.set(set(self._sort_metadata))) diff --git a/hail_search/queries/multi_data_types.py b/hail_search/queries/multi_data_types.py index 33fae6d12d..eaaecd5723 100644 --- a/hail_search/queries/multi_data_types.py +++ b/hail_search/queries/multi_data_types.py @@ -126,7 +126,7 @@ def format_search_ht(self): v2=self._format_comp_het_result(ch_ht.v2, data_type), ) hts.append(ch_ht.select( - _sort=hl.sorted([ch_ht.v1._sort, ch_ht.v2._sort])[0], + _sort=hl.sorted([ch_ht.v1._sort.map(hl.float64), ch_ht.v2._sort.map(hl.float64)])[0], **{f'comp_het_{data_type}': ch_ht.row}, )) diff --git a/hail_search/test_search.py b/hail_search/test_search.py index 4d3be16d2c..8cc0c03d98 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -179,6 +179,8 @@ 'ENSG00000277972': {'total': 1, 'families': {'F000002_2': 1}}, } +OMIM_SORT_METADATA = ['ENSG00000177000', 'ENSG00000097046', 'ENSG00000275023'] + def _sorted(variant, sorts): return {**variant, '_sort': sorts + variant['_sort']} @@ -203,6 +205,9 @@ async def _assert_expected_search(self, results, gene_counts=None, **search_kwar self.assertSetEqual(set(resp_json.keys()), {'results', 'total'}) self.assertEqual(resp_json['total'], len(results)) for i, result in enumerate(resp_json['results']): + if result != results[i]: + exp = results[i] + import pdb; pdb.set_trace() self.assertEqual(result, results[i]) if gene_counts: @@ -1100,22 +1105,21 @@ async def test_sort(self): _sorted(VARIANT1, [None])], omit_sample_type='SV_WES', sort='splice_ai', ) - omim_sort_metadata = ['ENSG00000177000', 'ENSG00000097046', 'ENSG00000275023'] sort = 'in_omim' await self._assert_expected_search( [_sorted(MULTI_FAMILY_VARIANT, [0, -2]), _sorted(VARIANT2, [0, -1]), _sorted(VARIANT4, [0, -1]), _sorted(VARIANT1, [1, 0])], - omit_sample_type='SV_WES', sort=sort, sort_metadata=omim_sort_metadata, + omit_sample_type='SV_WES', sort=sort, sort_metadata=OMIM_SORT_METADATA, ) await self._assert_expected_search( [_sorted(GCNV_VARIANT3, [-1]), _sorted(GCNV_VARIANT4, [-1]), _sorted(GCNV_VARIANT1, [0]), _sorted(GCNV_VARIANT2, [0])], - omit_sample_type='SNV_INDEL', sort=sort, sort_metadata=omim_sort_metadata, + omit_sample_type='SNV_INDEL', sort=sort, sort_metadata=OMIM_SORT_METADATA, ) await self._assert_expected_search( [_sorted(MULTI_FAMILY_VARIANT, [0, -2]), _sorted(VARIANT2, [0, -1]), _sorted(VARIANT4, [0, -1]), _sorted(GCNV_VARIANT3, [0, -1]), _sorted(GCNV_VARIANT4, [0, -1]), _sorted(GCNV_VARIANT1, [0, 0]), - _sorted(GCNV_VARIANT2, [0, 0]), _sorted(VARIANT1, [1, 0])], sort=sort, sort_metadata=omim_sort_metadata, + _sorted(GCNV_VARIANT2, [0, 0]), _sorted(VARIANT1, [1, 0])], sort=sort, sort_metadata=OMIM_SORT_METADATA, ) await self._assert_expected_search( @@ -1160,13 +1164,6 @@ async def test_sort(self): ) # sort applies to compound hets - await self._assert_expected_search( - [[_sorted(GCNV_VARIANT4, [0]), _sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [11, 11])], - _sorted(GCNV_VARIANT3, [4.5, 0]), [_sorted(GCNV_VARIANT3, [0]), _sorted(GCNV_VARIANT4, [0])], - _sorted(VARIANT2, [11, 11]), [_sorted(VARIANT4, [11, 11]), _sorted(VARIANT3, [22, 24])]], - sort='protein_consequence', inheritance_mode='recessive', **COMP_HET_ALL_PASS_FILTERS, - ) - await self._assert_expected_search( [[_sorted(VARIANT4, [-0.5260000228881836]), _sorted(VARIANT3, [None])], _sorted(VARIANT2, [-0.19699999690055847])], @@ -1177,3 +1174,60 @@ async def test_sort(self): [[_sorted(VARIANT3, [-0.009999999776482582]), _sorted(VARIANT4, [0])], _sorted(VARIANT2, [0])], sort='splice_ai', inheritance_mode='recessive', omit_sample_type='SV_WES', **COMP_HET_ALL_PASS_FILTERS, ) + + async def test_multi_data_type_comp_het_sort(self): + # await self._assert_expected_search( + # [[_sorted(GCNV_VARIANT4, [0]), _sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [11, 11])], + # _sorted(GCNV_VARIANT3, [4.5, 0]), [_sorted(GCNV_VARIANT3, [0]), _sorted(GCNV_VARIANT4, [0])], + # _sorted(VARIANT2, [11, 11]), [_sorted(VARIANT4, [11, 11]), _sorted(VARIANT3, [22, 24])]], + # sort='protein_consequence', inheritance_mode='recessive', **COMP_HET_ALL_PASS_FILTERS, + # ) + # + # await self._assert_expected_search( + # [[_sorted(GCNV_VARIANT4, [-14487]), _sorted(GCNV_VARIANT3, [-2666])], + # [_sorted(GCNV_VARIANT4, [-14487]), MULTI_DATA_TYPE_COMP_HET_VARIANT2], + # [VARIANT3, VARIANT4]], + # sort='size', inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, + # ) + # + # await self._assert_expected_search( + # [[_sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [8]), GCNV_VARIANT4], + # [_sorted(VARIANT3, [12.5]), _sorted(VARIANT4, [12.5])], + # [GCNV_VARIANT3, GCNV_VARIANT4]], + # sort='pathogenicity', inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, + # ) + # + # await self._assert_expected_search( + # [[_sorted(VARIANT3, [-0.009999999776482582]), _sorted(VARIANT4, [0])], + # [_sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [0]), GCNV_VARIANT4], + # [GCNV_VARIANT3, GCNV_VARIANT4]], + # sort='splice_ai', inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, + # ) + + # await self._assert_expected_search( + # [[_sorted(GCNV_VARIANT3, [-0.7860000133514404]), _sorted(GCNV_VARIANT4, [-0.7099999785423279])], + # [_sorted(GCNV_VARIANT4, [-0.7099999785423279]), MULTI_DATA_TYPE_COMP_HET_VARIANT2], + # [VARIANT3, VARIANT4]], + # sort='strvctvre', inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, + # ) + + # await self._assert_expected_search( + # [[_sorted(GCNV_VARIANT3, [0.0015185698866844177]), _sorted(GCNV_VARIANT4, [0.004989586770534515])], + # [_sorted(GCNV_VARIANT4, [0.004989586770534515]), _sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [0.31111112236976624])], + # [_sorted(VARIANT4, [0.02222222276031971]), _sorted(VARIANT3, [0.6666666865348816])]], + # sort='callset_af', inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, + # ) + # + # await self._assert_expected_search( + # [[_sorted(VARIANT3, [0]), _sorted(VARIANT4, [0])], + # [_sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [0.28899794816970825]), GCNV_VARIANT4], + # [GCNV_VARIANT3, GCNV_VARIANT4]], + # sort='gnomad_exomes', inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, + # ) + + await self._assert_expected_search( + [[_sorted(VARIANT3, [0, -2]), _sorted(VARIANT4, [0, -1])], + [_sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [0, -1]), _sorted(GCNV_VARIANT4, [0, -1])], + [_sorted(GCNV_VARIANT3, [0, -1]), _sorted(GCNV_VARIANT4, [0, -1])]], + sort='in_omim', sort_metadata=OMIM_SORT_METADATA, inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, + ) From 3c08e873fc871bb5b1e4a1c1778d71056515115b Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 15 Feb 2024 17:01:59 -0500 Subject: [PATCH 2/9] fix mixed data typoe cmp het sort --- hail_search/queries/multi_data_types.py | 2 ++ hail_search/web_app.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/hail_search/queries/multi_data_types.py b/hail_search/queries/multi_data_types.py index eaaecd5723..d47003500c 100644 --- a/hail_search/queries/multi_data_types.py +++ b/hail_search/queries/multi_data_types.py @@ -162,6 +162,8 @@ def _format_collected_row( row, data_types): data_type = next(data_type for data_type in data_types if row.get(data_type)) formatted_row = row.get(data_type) if 'comp_het' in data_type: + if data_type.startswith('SV'): + import pdb; pdb.set_trace() formatted_row = {GROUPED_VARIANTS_FIELD: sorted([formatted_row.v1, formatted_row.v2], key=lambda x: x._sort)} return formatted_row diff --git a/hail_search/web_app.py b/hail_search/web_app.py index f18cc79c2d..5209d12daa 100644 --- a/hail_search/web_app.py +++ b/hail_search/web_app.py @@ -65,7 +65,7 @@ async def init_web_app(): spark_conf['spark.driver.memory'] = f'{int((int(MACHINE_MEM)-11)*JVM_MEMORY_FRACTION)}g' if JAVA_OPTS_XSS: spark_conf.update({f'spark.{field}.extraJavaOptions': f'-Xss{JAVA_OPTS_XSS}' for field in ['driver', 'executor']}) - hl.init(idempotent=True, spark_conf=spark_conf or None) + hl.init(idempotent=True, spark_conf=spark_conf or None, backend='local') hl._set_flags(use_new_shuffle='1') load_globals() app = web.Application(middlewares=[error_middleware], client_max_size=(1024**2)*10) From a35aa1e1f1c8548d9b238a2528d0c9b7ed95d4a4 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 15 Feb 2024 17:28:28 -0500 Subject: [PATCH 3/9] merge comp het sort --- hail_search/queries/multi_data_types.py | 25 +++--- hail_search/test_search.py | 103 ++++++++++++------------ 2 files changed, 63 insertions(+), 65 deletions(-) diff --git a/hail_search/queries/multi_data_types.py b/hail_search/queries/multi_data_types.py index d47003500c..2d41037f1f 100644 --- a/hail_search/queries/multi_data_types.py +++ b/hail_search/queries/multi_data_types.py @@ -115,9 +115,7 @@ def format_search_ht(self): dt_ht = query.format_search_ht() if dt_ht is None: continue - merged_sort_expr = self._merged_sort_expr(data_type, dt_ht) - if merged_sort_expr is not None: - dt_ht = dt_ht.annotate(_sort=merged_sort_expr) + dt_ht = self._merged_sort(data_type, dt_ht) hts.append(dt_ht.select('_sort', **{data_type: dt_ht.row})) for data_type, ch_ht in self._comp_het_hts.items(): @@ -137,21 +135,26 @@ def format_search_ht(self): return ht def _format_comp_het_result(self, v, data_type): - return self._data_type_queries[data_type]._format_results(v) + result = self._data_type_queries[data_type]._format_results(v) + return self._merged_sort(data_type, result) - def _merged_sort_expr(self, data_type, ht): + def _merged_sort(self, data_type, ht): # Certain sorts have an extra element for variant-type data, so need to add an element for SV data if not data_type.startswith('SV'): - return None + return ht + sort_expr = None if self._sort == CONSEQUENCE_SORT: - return hl.array([hl.float64(4.5)]).extend(ht._sort.map(hl.float64)) + sort_expr = hl.array([hl.float64(4.5)]).extend(ht._sort.map(hl.float64)) elif self._sort == OMIM_SORT: - return hl.array([hl.int64(0)]).extend(ht._sort) + sort_expr = hl.array([hl.int64(0)]).extend(ht._sort) elif self._sort_metadata: - return ht._sort[:1].extend(ht._sort) + sort_expr = ht._sort[:1].extend(ht._sort) + + if sort_expr is not None: + ht = ht.annotate(_sort=sort_expr) - return None + return ht def _format_collected_rows(self, collected): data_types = [*self._data_type_queries, *[f'comp_het_{data_type}' for data_type in self._comp_het_hts]] @@ -162,8 +165,6 @@ def _format_collected_row( row, data_types): data_type = next(data_type for data_type in data_types if row.get(data_type)) formatted_row = row.get(data_type) if 'comp_het' in data_type: - if data_type.startswith('SV'): - import pdb; pdb.set_trace() formatted_row = {GROUPED_VARIANTS_FIELD: sorted([formatted_row.v1, formatted_row.v2], key=lambda x: x._sort)} return formatted_row diff --git a/hail_search/test_search.py b/hail_search/test_search.py index 8cc0c03d98..4ce5fb0e75 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -205,9 +205,6 @@ async def _assert_expected_search(self, results, gene_counts=None, **search_kwar self.assertSetEqual(set(resp_json.keys()), {'results', 'total'}) self.assertEqual(resp_json['total'], len(results)) for i, result in enumerate(resp_json['results']): - if result != results[i]: - exp = results[i] - import pdb; pdb.set_trace() self.assertEqual(result, results[i]) if gene_counts: @@ -1176,58 +1173,58 @@ async def test_sort(self): ) async def test_multi_data_type_comp_het_sort(self): - # await self._assert_expected_search( - # [[_sorted(GCNV_VARIANT4, [0]), _sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [11, 11])], - # _sorted(GCNV_VARIANT3, [4.5, 0]), [_sorted(GCNV_VARIANT3, [0]), _sorted(GCNV_VARIANT4, [0])], - # _sorted(VARIANT2, [11, 11]), [_sorted(VARIANT4, [11, 11]), _sorted(VARIANT3, [22, 24])]], - # sort='protein_consequence', inheritance_mode='recessive', **COMP_HET_ALL_PASS_FILTERS, - # ) - # - # await self._assert_expected_search( - # [[_sorted(GCNV_VARIANT4, [-14487]), _sorted(GCNV_VARIANT3, [-2666])], - # [_sorted(GCNV_VARIANT4, [-14487]), MULTI_DATA_TYPE_COMP_HET_VARIANT2], - # [VARIANT3, VARIANT4]], - # sort='size', inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, - # ) - # - # await self._assert_expected_search( - # [[_sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [8]), GCNV_VARIANT4], - # [_sorted(VARIANT3, [12.5]), _sorted(VARIANT4, [12.5])], - # [GCNV_VARIANT3, GCNV_VARIANT4]], - # sort='pathogenicity', inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, - # ) - # - # await self._assert_expected_search( - # [[_sorted(VARIANT3, [-0.009999999776482582]), _sorted(VARIANT4, [0])], - # [_sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [0]), GCNV_VARIANT4], - # [GCNV_VARIANT3, GCNV_VARIANT4]], - # sort='splice_ai', inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, - # ) - - # await self._assert_expected_search( - # [[_sorted(GCNV_VARIANT3, [-0.7860000133514404]), _sorted(GCNV_VARIANT4, [-0.7099999785423279])], - # [_sorted(GCNV_VARIANT4, [-0.7099999785423279]), MULTI_DATA_TYPE_COMP_HET_VARIANT2], - # [VARIANT3, VARIANT4]], - # sort='strvctvre', inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, - # ) - - # await self._assert_expected_search( - # [[_sorted(GCNV_VARIANT3, [0.0015185698866844177]), _sorted(GCNV_VARIANT4, [0.004989586770534515])], - # [_sorted(GCNV_VARIANT4, [0.004989586770534515]), _sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [0.31111112236976624])], - # [_sorted(VARIANT4, [0.02222222276031971]), _sorted(VARIANT3, [0.6666666865348816])]], - # sort='callset_af', inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, - # ) - # - # await self._assert_expected_search( - # [[_sorted(VARIANT3, [0]), _sorted(VARIANT4, [0])], - # [_sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [0.28899794816970825]), GCNV_VARIANT4], - # [GCNV_VARIANT3, GCNV_VARIANT4]], - # sort='gnomad_exomes', inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, - # ) + await self._assert_expected_search( + [[_sorted(GCNV_VARIANT4, [0]), _sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [11, 11])], + _sorted(GCNV_VARIANT3, [4.5, 0]), [_sorted(GCNV_VARIANT3, [0]), _sorted(GCNV_VARIANT4, [0])], + _sorted(VARIANT2, [11, 11]), [_sorted(VARIANT4, [11, 11]), _sorted(VARIANT3, [22, 24])]], + sort='protein_consequence', inheritance_mode='recessive', **COMP_HET_ALL_PASS_FILTERS, + ) + + await self._assert_expected_search( + [[_sorted(GCNV_VARIANT4, [-14487]), _sorted(GCNV_VARIANT3, [-2666])], + [_sorted(GCNV_VARIANT4, [-14487]), MULTI_DATA_TYPE_COMP_HET_VARIANT2], + [VARIANT3, VARIANT4]], + sort='size', inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, + ) + + await self._assert_expected_search( + [[_sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [8]), GCNV_VARIANT4], + [_sorted(VARIANT3, [12.5]), _sorted(VARIANT4, [12.5])], + [GCNV_VARIANT3, GCNV_VARIANT4]], + sort='pathogenicity', inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, + ) + + await self._assert_expected_search( + [[_sorted(VARIANT3, [-0.009999999776482582]), _sorted(VARIANT4, [0])], + [_sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [0]), GCNV_VARIANT4], + [GCNV_VARIANT3, GCNV_VARIANT4]], + sort='splice_ai', inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, + ) + + await self._assert_expected_search( + [[_sorted(GCNV_VARIANT3, [-0.7860000133514404]), _sorted(GCNV_VARIANT4, [-0.7099999785423279])], + [_sorted(GCNV_VARIANT4, [-0.7099999785423279]), MULTI_DATA_TYPE_COMP_HET_VARIANT2], + [VARIANT3, VARIANT4]], + sort='strvctvre', inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, + ) + + await self._assert_expected_search( + [[_sorted(GCNV_VARIANT3, [0.0015185698866844177]), _sorted(GCNV_VARIANT4, [0.004989586770534515])], + [_sorted(GCNV_VARIANT4, [0.004989586770534515]), _sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [0.31111112236976624])], + [_sorted(VARIANT4, [0.02222222276031971]), _sorted(VARIANT3, [0.6666666865348816])]], + sort='callset_af', inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, + ) + + await self._assert_expected_search( + [[_sorted(VARIANT3, [0]), _sorted(VARIANT4, [0])], + [_sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [0.28899794816970825]), GCNV_VARIANT4], + [GCNV_VARIANT3, GCNV_VARIANT4]], + sort='gnomad_exomes', inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, + ) await self._assert_expected_search( [[_sorted(VARIANT3, [0, -2]), _sorted(VARIANT4, [0, -1])], - [_sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [0, -1]), _sorted(GCNV_VARIANT4, [0, -1])], - [_sorted(GCNV_VARIANT3, [0, -1]), _sorted(GCNV_VARIANT4, [0, -1])]], + [_sorted(GCNV_VARIANT3, [-1]), _sorted(GCNV_VARIANT4, [-1])], + [_sorted(GCNV_VARIANT4, [0, -1]), _sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [1, -1])]], sort='in_omim', sort_metadata=OMIM_SORT_METADATA, inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, ) From a8915bf99d8a621a7c8af5c6da17df66233497b9 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 15 Feb 2024 17:29:33 -0500 Subject: [PATCH 4/9] undo debug --- hail_search/web_app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hail_search/web_app.py b/hail_search/web_app.py index 5209d12daa..f18cc79c2d 100644 --- a/hail_search/web_app.py +++ b/hail_search/web_app.py @@ -65,7 +65,7 @@ async def init_web_app(): spark_conf['spark.driver.memory'] = f'{int((int(MACHINE_MEM)-11)*JVM_MEMORY_FRACTION)}g' if JAVA_OPTS_XSS: spark_conf.update({f'spark.{field}.extraJavaOptions': f'-Xss{JAVA_OPTS_XSS}' for field in ['driver', 'executor']}) - hl.init(idempotent=True, spark_conf=spark_conf or None, backend='local') + hl.init(idempotent=True, spark_conf=spark_conf or None) hl._set_flags(use_new_shuffle='1') load_globals() app = web.Application(middlewares=[error_middleware], client_max_size=(1024**2)*10) From 3114c7c9f918c5dba73979b20378b057429bea04 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Fri, 16 Feb 2024 12:59:18 -0500 Subject: [PATCH 5/9] fix tests --- hail_search/test_search.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/hail_search/test_search.py b/hail_search/test_search.py index 4ce5fb0e75..4c20cfc556 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -1094,12 +1094,12 @@ async def test_sort(self): await self._assert_expected_search( [_sorted(VARIANT4, [-0.5260000228881836]), _sorted(VARIANT2, [-0.19699999690055847]), - _sorted(VARIANT1, [None]), _sorted(MULTI_FAMILY_VARIANT, [None])], omit_sample_type='SV_WES', sort='revel', + _sorted(VARIANT1, [0]), _sorted(MULTI_FAMILY_VARIANT, [0])], omit_sample_type='SV_WES', sort='revel', ) await self._assert_expected_search( [_sorted(MULTI_FAMILY_VARIANT, [-0.009999999776482582]), _sorted(VARIANT2, [0]), _sorted(VARIANT4, [0]), - _sorted(VARIANT1, [None])], omit_sample_type='SV_WES', sort='splice_ai', + _sorted(VARIANT1, [0])], omit_sample_type='SV_WES', sort='splice_ai', ) sort = 'in_omim' @@ -1162,7 +1162,7 @@ async def test_sort(self): # sort applies to compound hets await self._assert_expected_search( - [[_sorted(VARIANT4, [-0.5260000228881836]), _sorted(VARIANT3, [None])], + [[_sorted(VARIANT4, [-0.5260000228881836]), _sorted(VARIANT3, [0])], _sorted(VARIANT2, [-0.19699999690055847])], sort='revel', inheritance_mode='recessive', omit_sample_type='SV_WES', **COMP_HET_ALL_PASS_FILTERS, ) @@ -1174,8 +1174,8 @@ async def test_sort(self): async def test_multi_data_type_comp_het_sort(self): await self._assert_expected_search( - [[_sorted(GCNV_VARIANT4, [0]), _sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [11, 11])], - _sorted(GCNV_VARIANT3, [4.5, 0]), [_sorted(GCNV_VARIANT3, [0]), _sorted(GCNV_VARIANT4, [0])], + [_sorted(GCNV_VARIANT3, [4.5, 0]), [_sorted(GCNV_VARIANT3, [0]), _sorted(GCNV_VARIANT4, [0])], + [_sorted(GCNV_VARIANT4, [4.5, 0]), _sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [11, 11])], _sorted(VARIANT2, [11, 11]), [_sorted(VARIANT4, [11, 11]), _sorted(VARIANT3, [22, 24])]], sort='protein_consequence', inheritance_mode='recessive', **COMP_HET_ALL_PASS_FILTERS, ) @@ -1194,6 +1194,13 @@ async def test_multi_data_type_comp_het_sort(self): sort='pathogenicity', inheritance_mode='compound_het', **COMP_HET_ALL_PASS_FILTERS, ) + await self._assert_expected_search( + [[_sorted(VARIANT4, [-0.6869999766349792]), _sorted(VARIANT3, [0])], _sorted(VARIANT2, [0]), + [_sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [0]), GCNV_VARIANT4], + GCNV_VARIANT3, [GCNV_VARIANT3, GCNV_VARIANT4]], + sort='mut_pred', inheritance_mode='recessive', **COMP_HET_ALL_PASS_FILTERS, + ) + await self._assert_expected_search( [[_sorted(VARIANT3, [-0.009999999776482582]), _sorted(VARIANT4, [0])], [_sorted(MULTI_DATA_TYPE_COMP_HET_VARIANT2, [0]), GCNV_VARIANT4], From 672a189aad368c54fb4c28862f620938e94c4baf Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Fri, 16 Feb 2024 13:23:22 -0500 Subject: [PATCH 6/9] do not use interval prefilter for lareg gene lists --- hail_search/queries/base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index a87c8b5d8e..6c8bb5d432 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -561,7 +561,7 @@ def _filter_rs_ids(self, ht, rs_ids): rs_id_set = hl.set(rs_ids) return ht.filter(rs_id_set.contains(ht.rsid)) - def _parse_intervals(self, intervals, **kwargs): + def _parse_intervals(self, intervals, gene_ids=None, **kwargs): parsed_variant_keys = self._parse_variant_keys(**kwargs) if parsed_variant_keys: self._load_table_kwargs['variant_ht'] = hl.Table.parallelize(parsed_variant_keys).key_by(*self.KEY_FIELD) @@ -582,6 +582,9 @@ def _parse_intervals(self, intervals, **kwargs): reference_genome = hl.get_reference(self.GENOME_VERSION) intervals = (intervals or []) + [reference_genome.x_contigs[0]] + if len(intervals) > 100 and len(intervals) == len(gene_ids or []): + return [] + parsed_intervals = [ hl.eval(hl.parse_locus_interval(interval, reference_genome=self.GENOME_VERSION, invalid_missing=True)) for interval in intervals From 90375e316492d6e22b53f1d4a9a251372605a186 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 16 Feb 2024 13:28:33 -0500 Subject: [PATCH 7/9] change mito field name (#3895) --- .../MITO/annotations.ht/.README.txt.crc | Bin 12 -> 12 bytes .../MITO/annotations.ht/.metadata.json.gz.crc | Bin 16 -> 16 bytes .../GRCh38/MITO/annotations.ht/README.txt | 4 ++-- .../.index.crc | Bin .../.metadata.json.gz.crc | Bin .../index | Bin .../metadata.json.gz | Bin .../MITO/annotations.ht/metadata.json.gz | Bin 829 -> 829 bytes .../annotations.ht/rows/.metadata.json.gz.crc | Bin 20 -> 20 bytes .../MITO/annotations.ht/rows/metadata.json.gz | Bin 1168 -> 1168 bytes ...0-9762ffa7-64ef-4769-b166-fbd6e240e32d.crc | Bin 12 -> 0 bytes ...0-cce43fcc-aae8-40b4-a3d9-61782f574461.crc | Bin 0 -> 12 bytes ...art-0-9762ffa7-64ef-4769-b166-fbd6e240e32d | Bin 467 -> 0 bytes ...art-0-cce43fcc-aae8-40b4-a3d9-61782f574461 | Bin 0 -> 461 bytes hail_search/queries/mito.py | 4 +++- 15 files changed, 5 insertions(+), 3 deletions(-) rename hail_search/fixtures/GRCh38/MITO/annotations.ht/index/{part-0-9762ffa7-64ef-4769-b166-fbd6e240e32d.idx => part-0-cce43fcc-aae8-40b4-a3d9-61782f574461.idx}/.index.crc (100%) rename hail_search/fixtures/GRCh38/MITO/annotations.ht/index/{part-0-9762ffa7-64ef-4769-b166-fbd6e240e32d.idx => part-0-cce43fcc-aae8-40b4-a3d9-61782f574461.idx}/.metadata.json.gz.crc (100%) rename hail_search/fixtures/GRCh38/MITO/annotations.ht/index/{part-0-9762ffa7-64ef-4769-b166-fbd6e240e32d.idx => part-0-cce43fcc-aae8-40b4-a3d9-61782f574461.idx}/index (100%) rename hail_search/fixtures/GRCh38/MITO/annotations.ht/index/{part-0-9762ffa7-64ef-4769-b166-fbd6e240e32d.idx => part-0-cce43fcc-aae8-40b4-a3d9-61782f574461.idx}/metadata.json.gz (100%) delete mode 100644 hail_search/fixtures/GRCh38/MITO/annotations.ht/rows/parts/.part-0-9762ffa7-64ef-4769-b166-fbd6e240e32d.crc create mode 100644 hail_search/fixtures/GRCh38/MITO/annotations.ht/rows/parts/.part-0-cce43fcc-aae8-40b4-a3d9-61782f574461.crc delete mode 100644 hail_search/fixtures/GRCh38/MITO/annotations.ht/rows/parts/part-0-9762ffa7-64ef-4769-b166-fbd6e240e32d create mode 100644 hail_search/fixtures/GRCh38/MITO/annotations.ht/rows/parts/part-0-cce43fcc-aae8-40b4-a3d9-61782f574461 diff --git a/hail_search/fixtures/GRCh38/MITO/annotations.ht/.README.txt.crc b/hail_search/fixtures/GRCh38/MITO/annotations.ht/.README.txt.crc index 4a38aced39041db3aac0d113493f5ead2d64436f..1db259a0c4f065f37c86d5e683d7a33a90a41b23 100644 GIT binary patch literal 12 TcmYc;N@ieSU}9kUwB`x`5{d(_ literal 12 TcmYc;N@ieSU}DJB;H&@u5GDdm diff --git a/hail_search/fixtures/GRCh38/MITO/annotations.ht/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/MITO/annotations.ht/.metadata.json.gz.crc index c8d8e4108e7a858a7fbc52a440dc405363dfb576..0fa5ab54bf8c25cac1859a19222b9640968eb46c 100644 GIT binary patch literal 16 XcmYc;N@ieSU}89X#_`cIhb%1sD6R!@ literal 16 XcmYc;N@ieSU}A_(EDvhh8nOleA=(7_ diff --git a/hail_search/fixtures/GRCh38/MITO/annotations.ht/README.txt b/hail_search/fixtures/GRCh38/MITO/annotations.ht/README.txt index 2ea61e4a2b..4398c58b04 100644 --- a/hail_search/fixtures/GRCh38/MITO/annotations.ht/README.txt +++ b/hail_search/fixtures/GRCh38/MITO/annotations.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. - Written with version 0.2.124-13536b531342 - Created at 2023/11/22 10:50:28 \ No newline at end of file + Written with version 0.2.120-f00f916faf78 + Created at 2024/02/15 17:49:47 \ No newline at end of file diff --git a/hail_search/fixtures/GRCh38/MITO/annotations.ht/index/part-0-9762ffa7-64ef-4769-b166-fbd6e240e32d.idx/.index.crc b/hail_search/fixtures/GRCh38/MITO/annotations.ht/index/part-0-cce43fcc-aae8-40b4-a3d9-61782f574461.idx/.index.crc similarity index 100% rename from hail_search/fixtures/GRCh38/MITO/annotations.ht/index/part-0-9762ffa7-64ef-4769-b166-fbd6e240e32d.idx/.index.crc rename to hail_search/fixtures/GRCh38/MITO/annotations.ht/index/part-0-cce43fcc-aae8-40b4-a3d9-61782f574461.idx/.index.crc diff --git a/hail_search/fixtures/GRCh38/MITO/annotations.ht/index/part-0-9762ffa7-64ef-4769-b166-fbd6e240e32d.idx/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/MITO/annotations.ht/index/part-0-cce43fcc-aae8-40b4-a3d9-61782f574461.idx/.metadata.json.gz.crc similarity index 100% rename from hail_search/fixtures/GRCh38/MITO/annotations.ht/index/part-0-9762ffa7-64ef-4769-b166-fbd6e240e32d.idx/.metadata.json.gz.crc rename to hail_search/fixtures/GRCh38/MITO/annotations.ht/index/part-0-cce43fcc-aae8-40b4-a3d9-61782f574461.idx/.metadata.json.gz.crc diff --git a/hail_search/fixtures/GRCh38/MITO/annotations.ht/index/part-0-9762ffa7-64ef-4769-b166-fbd6e240e32d.idx/index b/hail_search/fixtures/GRCh38/MITO/annotations.ht/index/part-0-cce43fcc-aae8-40b4-a3d9-61782f574461.idx/index similarity index 100% rename from hail_search/fixtures/GRCh38/MITO/annotations.ht/index/part-0-9762ffa7-64ef-4769-b166-fbd6e240e32d.idx/index rename to hail_search/fixtures/GRCh38/MITO/annotations.ht/index/part-0-cce43fcc-aae8-40b4-a3d9-61782f574461.idx/index diff --git a/hail_search/fixtures/GRCh38/MITO/annotations.ht/index/part-0-9762ffa7-64ef-4769-b166-fbd6e240e32d.idx/metadata.json.gz b/hail_search/fixtures/GRCh38/MITO/annotations.ht/index/part-0-cce43fcc-aae8-40b4-a3d9-61782f574461.idx/metadata.json.gz similarity index 100% rename from hail_search/fixtures/GRCh38/MITO/annotations.ht/index/part-0-9762ffa7-64ef-4769-b166-fbd6e240e32d.idx/metadata.json.gz rename to hail_search/fixtures/GRCh38/MITO/annotations.ht/index/part-0-cce43fcc-aae8-40b4-a3d9-61782f574461.idx/metadata.json.gz diff --git a/hail_search/fixtures/GRCh38/MITO/annotations.ht/metadata.json.gz b/hail_search/fixtures/GRCh38/MITO/annotations.ht/metadata.json.gz index 8c63b43ec0c56b8dba7340601b0bc83912ba71fa..dfe2f808e2b08c3ce87d5367ca4548526ca053d1 100644 GIT binary patch delta 819 zcmV-31I+xr2E7K57k{M5&~$$5GIS`m6-86*B?tt?rfsdVFL>Y@-dxE($8n-DNW}WK^Q1V zL4k-t69t@<)Z_^6vjvnJ^eN zLjt5=Mp_6HSr~IB9;xzTvJpBh2Z&6_f+ho14xBv?HHGO#v6h0CJ&_e9jZLGd!=@DP zA^KmK`Ev9l)FFtfcq>&4smElka(R7qPf7(zbg2~GuJ*y#;m^uYgAwkjHlU6m>T#a? zBZiqax;EFn+<(iqRdo68Raz+MB*^_*nAx-rRW_TG1A6n5 z5C_mF_*xIN8Q7y}!Gt69Vz&gZ&+0GL*!%th-t{D$KdMD$rho*Mx5Sxxs(WmoVN^Q;OpI)5t zj_#51?%mFj$-ca4;$JRohj`)230GWF8?~U%BdgHCoE;Jc;bK zw`|yWAO@SL-=~!@Z6{@tCrd+>Zmj=sYB$$Gk7AKu_~Y@FtnrC}8>W;3gwa8?3-+%A z_fycqz8da+eO-9SvV_<@JlfZW$JeNpPjB~gV00=e@^=}y0`vpideg#L#=$&(HCx*_ xxGWiMDPRzxRW?3DRKhggR&4n8J8U5dBD^4|aqWA5_h53a^eN?aP25 zFv&yekRl1~f(_$9o00PbIY4AW7Bn5Ga^UP`s3}Y@inSE9?1`)>X<`~h9X6$S z57Ga^%$K7dp$mbfQ>kTypTcaxLd^YbI^ejkx!a1j!d8GnX5J zR60aFaL}tz$!E{TS%2d1_W#4z`*|K+9(mlPWJQ7P z!sRo`0V^X*PIa-3KTFAhiZG`omzk1PTO6iI3L7TS-J|fP4|ZW2<@Nd!OzlRPXjX?R zi!R87{l!s;^Xc>dSBta>+M{T}gd{X!scvVA3W@iGJIWo3B}kjmcS8&UNOxAAJ?W$` zw6oefzJG%^ec5I-X-Q&<>=`=1D#Uu)#uPNeUt6)RSEnAmlkw}Zsog5Xo@7+?>AaLu zTl4wpt1Fs!zG(8k%%^G&bO zHr&pW$nJW(h>Ztgu&DZdS_#v3QYLx2FjVQr`WL5mu}ylku4{Z9;BqOY0AX|x?}Giq zz@c0_F^7-w44vbDEMQ)CrORgW_&YR}WG7jeP%V=fe x;4D(KX~G~vt89FRsFZ2EiNx^jH`qcN#8`a5qJ-Yxy_KA6{RgmJr1PW+006mBnc@Hd diff --git a/hail_search/fixtures/GRCh38/MITO/annotations.ht/rows/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/MITO/annotations.ht/rows/.metadata.json.gz.crc index 24d59c359e4151388e39c7844d3c6d61732ef082..b7402c926974a343112904394260520750c88435 100644 GIT binary patch literal 20 bcmYc;N@ieSU}C6wwP~FzxB0m;H;#A!KPCow literal 20 bcmYc;N@ieSU}8wNIbSDqj-$ht-Rvy@Hh=~G diff --git a/hail_search/fixtures/GRCh38/MITO/annotations.ht/rows/metadata.json.gz b/hail_search/fixtures/GRCh38/MITO/annotations.ht/rows/metadata.json.gz index 41d8e9c190309f912b327ac5a6ff70597a67c3e4..c0a74926e737c314c170227f99f35deb62a86f81 100644 GIT binary patch delta 1157 zcmV;01bX|B36Kep8h^1}-C{Q>47A1~4)(BtAutpraU+phlFr%<^xt==Pg0VTxIixr zf*>S^oOy@ChlHLThEn2q&YA_}K!V%9zHBps$BygxkL2ih_fEp`UP2os2}y`_b})o_ zK(ID+yytp_yJJRzRWkrqM0df+hwCMhWUIs;tLI`l&+5XQDSsO2KFW|dx*3;LQO=-e z&D3O^yOnR~xDo;#?`zYYi+P%I<|q6=KPHL@o+U`8NAG&cc|s6#W0WO45`3R|rcWyU zfP|D$RyZz=b2p`mD{7l6!H}N`5>Tw8@@e4JafxU$_e(|R2@+DTQVmNk6bXDK5R+KY zOhJB`B>(OS!+#*U(rh|FDP`P8mFwX&vV zgksEipj^R@wh3^JIDJTd>K6f3@fIdK7c!mj&}S%u-*#VgUq};}H8jZx0RDSy61Q`o zkf4ZJaUCGPMMSfy8$2)>X1?A>YfHB4o4aBYjp)Dc6@Sv(a)+c2!ye7ui18E!{iUup zWjNsNYNe~LZQfMPn}j3nAw>s2h7GCP+JL7A4$|gdPSQmR_o;D<|z5n|kC7 zZ(3iRDoCc~Nn$QYGod^VDYB14hE&XSJ~QgRrcI*< zCzOGpet-T-daTW=4HJEsS@dTk2I6ne+J|AmR2{A2-!o-YX2g?VtB{bj!KsNzN+K$h zbSodWg+_4!05}5_3SE#$n^vl-d~3WuRd}_)`c40R_w{xaPZsYRZk8^<5G0>w<^VDW zXxU@sX-xJhX@oa_UsAA)>NN;93cC2KrnmI7;D1+**6gdACHjm;RyS<$Rh&P2t+5+0E zE)1Gk;OTL!kFWuN4KLB4>n#F|$a+z4ydhq7QS~aRtOL;_jY+6JpHARuh)!mWzRV^Q zO@9yjFeIY$K>T*SycjQ*?U3u>m?nW?vII&zf>_JwL3eKH4=|WrUQTBHX@5%<6jRTf z2iYSi?as{g4~U@pN!-z}uKS7?vLK1x7U=)@@sGk>_d*uJ;vM{^$FX_Sgj7g<(htpx z0^Z=(tGBg<5$_j1taF!#p)}q$h%sJb#DDgY=ewcXsFh*}v1(QeIQQymqUy!eiRL#c zK)^La$p-4?=L)LuCkPuZp`v2-g+%<$X7>xR?D^$ISGo}d*c%K7b!Htl8L(=vue__f ztNQ#(9g=cF$nRLZkhGtCpB6c2vtzH4J9_a933WsD*9h<4(T&lk5yoUX2{9faM1B^d z>8FS32u*^|quIrLF%B>1)9LJ@_;~TpEn*Q_!rd6?`OgmA>q2Q^<);q#r>tS{ievE| z!l`T6@ut)9=e+$G0`9-U_>}`2?hqEdag~edd@-??xdVe$u?%H6*#~4>Mg_7-(|4^s X=$>2)K}rejD9`@~A3os)#}NPk2gx>> delta 1157 zcmV;01bX|B36Kep8h^DD-C{Q>479-_4)(BtA+rSVO zgkm$%3!V=a?)Et)%Vq#(QPTw@AFdWi(Y2O4te;DBfzyQr*MBTFeT*Zy_j93`W`aY{ zitEX^@N3`DaVaI*-yf!Kd@Qm|@Guqsg$dPEiabRs+Xq(*AySICpP)Pyu@t*JuzfNW z5*jhicb z7O+&tN z85!2o7bs8#zTgl!UQ(t&^8l9X0?}9yIc;{NmKST z;IQ&itFE22Hf|0hCwMdQC7Fg~I-V33k~Ry9@;GG3Jq|h2iGX5a*gjY=lVv%=`bB%O z4i)6Q!+(cY;D0MyWZ>JpQu@2K2XQ?Za@uOz)lI-*IhK zW-L;&)<~+ugkuYlQdHI`=~jMG7_vUrWObPO&HR1$^>&#|&fho4tX9KBh~_vh2WU9} z%GWU2rQ~ry_W5Y3pTDp8SBZWNe@%d0@YTy(?teMht49yitD6<{tVVulxX7zJE#kcZ zyh`C9?vBvTS4%a$V>tLpHUi(OMPrGy2t<&tF z>{S;Q#~kGJ2-b(!0J?@vG|+lW0G6_z>YHGQR9#5D$|&bR^cZ8asZXXOU>f3*U85D1 zwtu@QqO$WW{C2fC8=o)QA>YFZONljQ0R(t}K%7Zi)466pV8ZP4=gF+U>~Ez>x%9+! zkQs`y?#jaaKoHD4ggY^;4t?djSkly_9Qr?g{G*(1dZAXL^7Z?sC$W8hL`*7kil3Ty z0=&GPw{ANZMuK1XoGyG3Map`yAkIaF2!Gc{QEY}TqBayO5T_nZ0H@wsjjEoej+)=3 z00G4ur8iJFKUYwtKS}v;36&LVE*lzlHoISlRnIRky2_2Dz*RCV>cl#1GT_u+Kk{ze z-KbBm)S*z01oAr;FC^_JpQdFF+U&Si3eOnNk<{1Be1iz?J<}L{8ePn1<0wM&(SB@7 zqtSFeyBKZHX0uVWB{Mpnexj2xDL+~KbBlOP7jW+pv;NtGJ6mcKSoMJe{;6shyy8TD z2RVlt_JZkjd{MNYK*0T17{Byj!#&E$8?Bs8=jRjG%pFWvmSz~o>5kBKr3%8PP2ZLA XV0sD@2+~>_M@8{JnaZU)y%7Ka3Vt^u diff --git a/hail_search/fixtures/GRCh38/MITO/annotations.ht/rows/parts/.part-0-9762ffa7-64ef-4769-b166-fbd6e240e32d.crc b/hail_search/fixtures/GRCh38/MITO/annotations.ht/rows/parts/.part-0-9762ffa7-64ef-4769-b166-fbd6e240e32d.crc deleted file mode 100644 index 692d08a7702501a92b4fca5e30152b5b5f7b61f2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}ErkIcqKe6Ep** diff --git a/hail_search/fixtures/GRCh38/MITO/annotations.ht/rows/parts/.part-0-cce43fcc-aae8-40b4-a3d9-61782f574461.crc b/hail_search/fixtures/GRCh38/MITO/annotations.ht/rows/parts/.part-0-cce43fcc-aae8-40b4-a3d9-61782f574461.crc new file mode 100644 index 0000000000000000000000000000000000000000..b28a06e3c18b65052ab5fa932a19606a219800e7 GIT binary patch literal 12 TcmYc;N@ieSU}DHqXte_X5a0rg literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/MITO/annotations.ht/rows/parts/part-0-9762ffa7-64ef-4769-b166-fbd6e240e32d b/hail_search/fixtures/GRCh38/MITO/annotations.ht/rows/parts/part-0-9762ffa7-64ef-4769-b166-fbd6e240e32d deleted file mode 100644 index 1f232e10bc2bafbd571c94fc711bda67be538cbd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 467 zcmV;^0WAK$0RR9*0ssIgwJ-f(L;-~j0EU-ZL?Dee9S8`BR--WCt!konjT=D_I?9?O z{sG;bt3h;L$)fk%Md?4XM{}Q_J5ODsz1xlK-TwYXfW(6LpRJcJQBRso0Ym^t07U>7 zVrV6X!7jiJya!-h4p(Vi@Q4D`*nJ&j5Z-kAa$Ogz)z|8x-_h8I$s#R$9V)w=^^;a*jGqHJN%ESAkHf~T`VfVmt0 zr<8#hG{@5L{+wuD~^Xh6`v1aO7u#1Pv=# zsAj_8*`-($!S&j!G4C@<5EyI&XB%>6yUt7k;eqRmlgFM7fYGqP9t{8h0000004TLD J{U87V004L@&i()Z diff --git a/hail_search/fixtures/GRCh38/MITO/annotations.ht/rows/parts/part-0-cce43fcc-aae8-40b4-a3d9-61782f574461 b/hail_search/fixtures/GRCh38/MITO/annotations.ht/rows/parts/part-0-cce43fcc-aae8-40b4-a3d9-61782f574461 new file mode 100644 index 0000000000000000000000000000000000000000..74f06cabec202b8f9cbcfdecc797cc722914b855 GIT binary patch literal 461 zcmV;;0W$u$0RR9*0ssIgwJ-f(L;+O|0A?3iLlBQP9e@N-4MahM`BpGF2;4XT;0k*t z5{^LbWa-*q;Jf{S-0pye^$v$Y8LYgP3TIuqREXLjXqrMF8K0 z7+QHOA+e%#(mJio*Eu=4>(=eSzSeVV8|^T3HNv0Y1Lm zN&kNvxAgIK9_V9<@3+oceL`dv>x`L4R%^A<^T8L-=VJ7|-7K8Xb5yH(&#ba}54VAMs&$A+=bHu%k}BK|B_&Riy{Y6^6`S*t8ySHW>qW3;!+8 zDCr=z$$}3aPNr@LDzpY1B{&V<32J4es+_2Bya0+e0Yr16ze?zkn5Gnx6(bDHa8`N= zc!StVwHY%|&|Dbif!Q*hw$ct7900ePc_0v6W@oOzHGGB(Xa{iQXMzL`D_Eyy0^!-E zSQEkZ+N&||GfEH`Yy)Q-a%N?nnGeDP*B2*`JsSX{V1Ydi000000000ewJ-f300961 DYE8hl literal 0 HcmV?d00001 diff --git a/hail_search/queries/mito.py b/hail_search/queries/mito.py index 0b48d592f6..00cca55968 100644 --- a/hail_search/queries/mito.py +++ b/hail_search/queries/mito.py @@ -58,7 +58,9 @@ class MitoHailTableQuery(BaseHailTableQuery): CORE_FIELDS = BaseHailTableQuery.CORE_FIELDS + ['rsid'] MITO_ANNOTATION_FIELDS = { 'commonLowHeteroplasmy': lambda r: r.common_low_heteroplasmy, - 'highConstraintRegion': lambda r: r.high_constraint_region, + 'highConstraintRegion': ( + lambda r: r.high_constraint_region if hasattr(r, 'high_constraint_region') else r.high_constraint_region_mito + ), 'mitomapPathogenic': lambda r: r.mitomap.pathogenic, } BASE_ANNOTATION_FIELDS = { From 48d436e672800313762727a00a06f07c5c2d72be Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Fri, 16 Feb 2024 13:46:17 -0500 Subject: [PATCH 8/9] fix performace for many project search --- hail_search/queries/base.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index a87c8b5d8e..480c22c5a1 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -245,13 +245,19 @@ def _read_table(self, path, drop_globals=None, use_ssd_dir=False, skip_missing_f table_path = self._get_table_path(path, use_ssd_dir=use_ssd_dir) if 'variant_ht' in self._load_table_kwargs: ht = self._query_table_annotations(self._load_table_kwargs['variant_ht'], table_path) - if skip_missing_field and not ht.any(hl.is_defined(ht[skip_missing_field])): + if self._should_skip_ht(ht, skip_missing_field): return None ht_globals = hl.read_table(table_path).globals if drop_globals: ht_globals = ht_globals.drop(*drop_globals) return ht.annotate_globals(**hl.eval(ht_globals)) - return hl.read_table(table_path, **self._load_table_kwargs) + + ht = hl.read_table(table_path, **self._load_table_kwargs) + return None if self._should_skip_ht(ht, skip_missing_field) else ht + + @staticmethod + def _should_skip_ht(ht, skip_missing_field): + return skip_missing_field and not ht.any(hl.is_defined(ht[skip_missing_field])) @staticmethod def _query_table_annotations(ht, query_table_path): @@ -290,6 +296,8 @@ def _load_filtered_project_hts(self, project_samples, skip_all_missing=False, ** if exception_messages: raise HTTPBadRequest(reason='; '.join(exception_messages)) + if len(project_samples) > len(filtered_project_hts): + logger.info(f'Found {len(filtered_project_hts)} {self.DATA_TYPE} projects with matched entries') return filtered_project_hts def import_filtered_table(self, project_samples, num_families, intervals=None, **kwargs): From 7773271961ae1bcd38160b0e3bdeee388942e134 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Fri, 16 Feb 2024 14:18:22 -0500 Subject: [PATCH 9/9] no magic numbers --- hail_search/queries/base.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index 2e4b86e504..4ff212a0e2 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -13,6 +13,10 @@ DATASETS_DIR = os.environ.get('DATASETS_DIR', '/hail_datasets') SSD_DATASETS_DIR = os.environ.get('SSD_DATASETS_DIR', DATASETS_DIR) +# Number of filtered genes at which pre-filtering a table by gene-intervals does not improve performance +# Estimated based on behavior for several representative gene lists +MAX_GENE_INTERVALS = 100 + logger = logging.getLogger(__name__) @@ -590,7 +594,7 @@ def _parse_intervals(self, intervals, gene_ids=None, **kwargs): reference_genome = hl.get_reference(self.GENOME_VERSION) intervals = (intervals or []) + [reference_genome.x_contigs[0]] - if len(intervals) > 100 and len(intervals) == len(gene_ids or []): + if len(intervals) > MAX_GENE_INTERVALS and len(intervals) == len(gene_ids or []): return [] parsed_intervals = [