From 9f84c5c48f392d652b1ec27f03f339f52e36e87a Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 22 Aug 2023 14:58:29 -0400 Subject: [PATCH 01/16] clean up --- hail_search/test_search.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/hail_search/test_search.py b/hail_search/test_search.py index 6abe0eeba8..83cc30fbcc 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -121,10 +121,6 @@ async def _assert_expected_search(self, results, **search_kwargs): self.assertSetEqual(set(resp_json.keys()), {'results', 'total'}) self.assertEqual(resp_json['total'], len(results)) for i, result in enumerate(resp_json['results']): - if result != results[i]: - self.assertSetEqual(set(result.keys()), set(results[i].keys())) - diff_k = {k for k, v in results[i].items() if v != result[k]} - import pdb; pdb.set_trace() self.assertEqual(result, results[i]) async def test_single_family_search(self): From 521f591363821bfddc8304aa1412bad9ec6cebc3 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 22 Aug 2023 17:03:21 -0400 Subject: [PATCH 02/16] hadle gcnv missing ref calls --- hail_search/hail_search_query.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index 154f36a58b..91009584de 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -40,6 +40,7 @@ class BaseHailTableQuery(object): HAS_ALT: lambda gt: gt.is_non_ref(), HAS_REF: lambda gt: gt.is_hom_ref() | gt.is_het_ref(), } + MISSING_NUM_ALT = -1 GENOTYPE_FIELDS = {} NESTED_GENOTYPE_FIELDS = {} @@ -97,7 +98,7 @@ def annotation_fields(self): lambda gt: hl.is_defined(gt.individualGuid) ).group_by(lambda x: x.individualGuid).map_values(lambda x: x[0].select( 'sampleId', 'individualGuid', 'familyGuid', - numAlt=hl.if_else(hl.is_defined(x[0].GT), x[0].GT.n_alt_alleles(), -1), + numAlt=hl.if_else(hl.is_defined(x[0].GT), x[0].GT.n_alt_alleles(), self.MISSING_NUM_ALT), **{k: x[0][field] for k, field in self.GENOTYPE_FIELDS.items()}, **{_to_camel_case(k): x[0][field][k] for field, v in self.NESTED_GENOTYPE_FIELDS.items() for k in v}, )), @@ -365,7 +366,7 @@ def _add_entry_sample_families(cls, ht, sample_data): ht = ht.transmute( family_entries=family_sample_indices.map(lambda sample_indices: sample_indices.map( - lambda i: hl.or_else(ht.entries[i], cls._missing_entry(ht.entries[i])).annotate( + lambda i: ht.entries[i].annotate( sampleId=sample_index_id_map.get(i), individualGuid=sample_index_individual_map.get(i), familyGuid=sample_index_family_map.get(i), @@ -376,11 +377,6 @@ def _add_entry_sample_families(cls, ht, sample_data): return ht, sample_id_family_index_map, num_families - @classmethod - def _missing_entry(cls, entry): - entry_type = dict(**entry.dtype) - return hl.struct(**{k: hl.missing(v) for k, v in entry_type.items()}) - def _filter_inheritance(self, ht, inheritance_mode, inheritance_filter, sample_data, sample_id_family_index_map): any_valid_entry = lambda x: self.GENOTYPE_QUERY_MAP[HAS_ALT](x.GT) @@ -1162,10 +1158,14 @@ class GcnvHailTableQuery(SvHailTableQuery): DATA_TYPE = 'SV_WES' + # gCNV data has no ref/ref calls so a missing entry indicates ref/ref GENOTYPE_QUERY_MAP = { **BaseHailTableQuery.GENOTYPE_QUERY_MAP, + REF_REF: hl.is_missing, + HAS_REF: lambda gt: hl.is_missing(gt) | gt.is_het_ref(), COMP_HET_ALT: BaseHailTableQuery.GENOTYPE_QUERY_MAP[HAS_ALT], } + MISSING_NUM_ALT = 0 GENOTYPE_FIELDS = { **SvHailTableQuery.GENOTYPE_FIELDS, @@ -1173,6 +1173,7 @@ class GcnvHailTableQuery(SvHailTableQuery): **{_to_camel_case(f): f'sample_{f}' for f in ['start', 'end', 'num_exon', 'gene_ids']}, } del GENOTYPE_FIELDS['gq'] + GENOTYPE_QUERY_FIELDS = {} NESTED_GENOTYPE_FIELDS = { 'concordance': SvHailTableQuery.NESTED_GENOTYPE_FIELDS['concordance'][:-1] + ['prev_overlap'] } @@ -1197,15 +1198,10 @@ def _get_genotype_override_field(r, field, agg, default=None): if default is None: default = r[field] return hl.if_else( - entries.any(lambda g: g.GT.is_non_ref() & hl.is_missing(g[sample_field])), + entries.any(lambda g: hl.is_defined(g.GT) & hl.is_missing(g[sample_field])), default, agg(entries.map(lambda g: g[sample_field])) ) - @classmethod - def _missing_entry(cls, entry): - # gCNV data has no ref/ref calls so a missing entry indicates ref/ref - return super()._missing_entry(entry).annotate(GT=hl.Call([0, 0])) - def _filter_annotated_table(self, **kwargs): # sorted_gene_consequences may contain genes absent from the queried families, so remove those before filtering empty_gene_set = hl.empty_set(hl.tstr) From 734c4c117e05d9d95a5ffaf60ff146a1a25f6e57 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 23 Aug 2023 11:43:52 -0400 Subject: [PATCH 03/16] fux gene id override behavior --- hail_search/hail_search_query.py | 36 ++++++++++++-------- seqr/utils/search/elasticsearch/constants.py | 2 +- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index 91009584de..90b92b0ac3 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -64,7 +64,7 @@ class BaseHailTableQuery(object): 'response_key': 'transcripts', 'empty_array': True, 'format_value': lambda value: value.rename({k: _to_camel_case(k) for k in value.keys()}), - 'format_values': lambda values: values.group_by(lambda t: t.geneId), + 'format_values': lambda values, *args: values.group_by(lambda t: t.geneId), }, } LIFTOVER_ANNOTATION_FIELDS = { @@ -145,7 +145,7 @@ def _get_enum_lookup(self, field, subfield): def _get_enum_terms_ids(self, field, subfield, terms): enum = self._get_enum_lookup(field, subfield) - return {enum[t] for t in terms if enum.get(t)} + return {enum[t] for t in terms if enum.get(t) is not None} def _format_enum_response(self, k, enum): enum_config = self.ENUM_ANNOTATION_FIELDS.get(k, {}) @@ -163,7 +163,7 @@ def _format_enum(cls, r, field, enum, empty_array=False, format_values=None, **k value = hl.or_else(value, hl.empty_array(value.dtype.element_type)) value = value.map(lambda x: cls._enum_field(x, enum, **kwargs)) if format_values: - value = format_values(value) + value = format_values(value, r) return value return cls._enum_field(value, enum, **kwargs) @@ -630,6 +630,7 @@ def _filter_by_annotations(self, pathogenicity, annotations, annotations_seconda annotation_filter = self._ht[HAS_ALLOWED_ANNOTATION] if has_secondary_annotations: annotation_filter |= self._ht[HAS_ALLOWED_SECONDARY_ANNOTATION] + self._ht = self._ht.filter(annotation_filter) def _get_allowed_consequences_annotations(self, annotations, annotation_filters, is_secondary=False): @@ -1163,7 +1164,8 @@ class GcnvHailTableQuery(SvHailTableQuery): **BaseHailTableQuery.GENOTYPE_QUERY_MAP, REF_REF: hl.is_missing, HAS_REF: lambda gt: hl.is_missing(gt) | gt.is_het_ref(), - COMP_HET_ALT: BaseHailTableQuery.GENOTYPE_QUERY_MAP[HAS_ALT], + HAS_ALT: hl.is_defined, + COMP_HET_ALT: hl.is_defined, } MISSING_NUM_ALT = 0 @@ -1189,6 +1191,14 @@ class GcnvHailTableQuery(SvHailTableQuery): } del BASE_ANNOTATION_FIELDS['bothsidesSupport'] + TRANSCRIPTS_ENUM_FIELD = SvHailTableQuery.ENUM_ANNOTATION_FIELDS[SvHailTableQuery.TRANSCRIPTS_FIELD] + ENUM_ANNOTATION_FIELDS = {SvHailTableQuery.TRANSCRIPTS_FIELD: { + **TRANSCRIPTS_ENUM_FIELD, + 'format_values': lambda values, r: GcnvHailTableQuery.TRANSCRIPTS_ENUM_FIELD['format_values']( + GcnvHailTableQuery._get_gene_id_transcripts_override(values, r), r + ), + }} + POPULATIONS = {k: v for k, v in SvHailTableQuery.POPULATIONS.items() if k != 'gnomad_svs'} @staticmethod @@ -1202,21 +1212,19 @@ def _get_genotype_override_field(r, field, agg, default=None): default, agg(entries.map(lambda g: g[sample_field])) ) - def _filter_annotated_table(self, **kwargs): - # sorted_gene_consequences may contain genes absent from the queried families, so remove those before filtering + @classmethod + def _get_gene_id_transcripts_override(cls, transcripts, r): empty_gene_set = hl.empty_set(hl.tstr) - geneotype_gene_ids_expr = self._get_genotype_override_field( - self._ht, 'gene_ids', + geneotype_gene_ids_expr = cls._get_genotype_override_field( + r, 'gene_ids', lambda entry_gene_ids: entry_gene_ids.fold(lambda s1, s2: s1.union(s2), empty_gene_set), default=hl.missing(empty_gene_set.dtype)) - self._ht = self._ht.annotate(sorted_gene_consequences=hl.bind( + return hl.bind( lambda gene_ids: hl.if_else( - hl.is_missing(gene_ids), self._ht.sorted_gene_consequences, - self._ht.sorted_gene_consequences.filter(lambda t: gene_ids.contains(t.gene_id)), + hl.is_missing(gene_ids), transcripts, + transcripts.filter(lambda t: gene_ids.contains(t.geneId)), ), geneotype_gene_ids_expr, - )) - - return super()._filter_annotated_table(**kwargs) + ) def _additional_annotation_fields(self): return {} diff --git a/seqr/utils/search/elasticsearch/constants.py b/seqr/utils/search/elasticsearch/constants.py index 0bd970693a..2053c16de5 100644 --- a/seqr/utils/search/elasticsearch/constants.py +++ b/seqr/utils/search/elasticsearch/constants.py @@ -437,7 +437,7 @@ def get_prediction_response_key(key): SV_SAMPLE_OVERRIDE_FIELD_CONFIGS = { 'pos': {'select_val': min, 'genotype_field': 'start'}, 'end': {'select_val': max}, - 'numExon':{'select_val': max}, + 'numExon': {'select_val': max}, 'geneIds': { 'select_val': lambda gene_lists: set([gene_id for gene_list in gene_lists for gene_id in (gene_list or [])]), 'equal': lambda a, b: set(a or []) == set(b or []) From 5a7d7b5bbd458457254644f0099252dc692ca263 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 23 Aug 2023 11:52:00 -0400 Subject: [PATCH 04/16] clean up --- hail_search/hail_search_query.py | 1 - seqr/utils/search/elasticsearch/constants.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index aaed72054e..a26c51b959 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -630,7 +630,6 @@ def _filter_by_annotations(self, pathogenicity, annotations, annotations_seconda annotation_filter = self._ht[HAS_ALLOWED_ANNOTATION] if has_secondary_annotations: annotation_filter |= self._ht[HAS_ALLOWED_SECONDARY_ANNOTATION] - self._ht = self._ht.filter(annotation_filter) def _get_allowed_consequences_annotations(self, annotations, annotation_filters, is_secondary=False): diff --git a/seqr/utils/search/elasticsearch/constants.py b/seqr/utils/search/elasticsearch/constants.py index 2053c16de5..0bd970693a 100644 --- a/seqr/utils/search/elasticsearch/constants.py +++ b/seqr/utils/search/elasticsearch/constants.py @@ -437,7 +437,7 @@ def get_prediction_response_key(key): SV_SAMPLE_OVERRIDE_FIELD_CONFIGS = { 'pos': {'select_val': min, 'genotype_field': 'start'}, 'end': {'select_val': max}, - 'numExon': {'select_val': max}, + 'numExon':{'select_val': max}, 'geneIds': { 'select_val': lambda gene_lists: set([gene_id for gene_list in gene_lists for gene_id in (gene_list or [])]), 'equal': lambda a, b: set(a or []) == set(b or []) From 7866de2eecbbb7724be3e04312f18ffd1d2e1415 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 24 Aug 2023 14:20:12 -0400 Subject: [PATCH 05/16] remove endcrhom from gcnv results --- hail_search/hail_search_query.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index 450e30367d..bf4004570f 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -1065,7 +1065,6 @@ class SvHailTableQuery(BaseHailTableQuery): BASE_ANNOTATION_FIELDS = { 'bothsidesSupport': lambda r: r.bothsides_support, 'chrom': lambda r: r.start_locus.contig.replace('^chr', ''), - 'endChrom': lambda r: hl.or_missing(r.start_locus.contig != r.end_locus.contig, r.end_locus.contig.replace('^chr', '')), 'pos': lambda r: r.start_locus.position, 'end': lambda r: r.end_locus.position, 'rg37LocusEnd': lambda r: hl.or_missing( @@ -1149,7 +1148,7 @@ def _get_annotation_override_filters(self, annotations, **kwargs): def _additional_annotation_fields(self): sv_type_enum = self._enums['sv_type'] insertion_type_id = sv_type_enum.index('INS') - get_end_chrom = self.BASE_ANNOTATION_FIELDS['endChrom'] + get_end_chrom = lambda r: hl.or_missing(r.start_locus.contig != r.end_locus.contig, r.end_locus.contig.replace('^chr', '')) return { 'cpxIntervals': lambda r: self._format_enum( r, 'cpx_intervals', {'type': sv_type_enum}, annotate_value=lambda val, *args: { From 42a704477dff0834c2eb8ad0138f30266d5dc829 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 24 Aug 2023 14:29:27 -0400 Subject: [PATCH 06/16] handle gcnv specific sv type --- hail_search/hail_search_query.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index bf4004570f..b09f186de8 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -1139,12 +1139,15 @@ def _get_consequence_filter(self, allowed_consequence_ids, annotation_exprs): def _get_annotation_override_filters(self, annotations, **kwargs): annotation_filters = [] if annotations.get(STRUCTURAL_ANNOTATION_FIELD): - allowed_type_ids = self._get_enum_terms_ids('sv_type', None, annotations[STRUCTURAL_ANNOTATION_FIELD]) + allowed_type_ids = self._get_allowed_sv_types(annotations[STRUCTURAL_ANNOTATION_FIELD]) if allowed_type_ids: annotation_filters.append(hl.set(allowed_type_ids).contains(self._ht.sv_type_id)) return annotation_filters + def _get_allowed_sv_types(self, sv_types): + return self._get_enum_terms_ids('sv_type', None, sv_types) + def _additional_annotation_fields(self): sv_type_enum = self._enums['sv_type'] insertion_type_id = sv_type_enum.index('INS') @@ -1237,6 +1240,11 @@ def _get_gene_id_transcripts_override(cls, transcripts, r): ), geneotype_gene_ids_expr, ) + def _get_allowed_sv_types(self, sv_types): + return super()._get_allowed_sv_types([ + type.replace('gCNV_', '') for type in sv_types if type.startswith('gCNV_') + ]) + def _additional_annotation_fields(self): return {} From c6c3a47d5fa3231bbe7c7c694b7d8e78ba123e2e Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 24 Aug 2023 14:38:04 -0400 Subject: [PATCH 07/16] clean up --- hail_search/hail_search_query.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index b09f186de8..678b9b3343 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -1139,13 +1139,13 @@ def _get_consequence_filter(self, allowed_consequence_ids, annotation_exprs): def _get_annotation_override_filters(self, annotations, **kwargs): annotation_filters = [] if annotations.get(STRUCTURAL_ANNOTATION_FIELD): - allowed_type_ids = self._get_allowed_sv_types(annotations[STRUCTURAL_ANNOTATION_FIELD]) + allowed_type_ids = self.get_allowed_sv_type_ids(annotations[STRUCTURAL_ANNOTATION_FIELD]) if allowed_type_ids: annotation_filters.append(hl.set(allowed_type_ids).contains(self._ht.sv_type_id)) return annotation_filters - def _get_allowed_sv_types(self, sv_types): + def get_allowed_sv_type_ids(self, sv_types): return self._get_enum_terms_ids('sv_type', None, sv_types) def _additional_annotation_fields(self): @@ -1240,8 +1240,8 @@ def _get_gene_id_transcripts_override(cls, transcripts, r): ), geneotype_gene_ids_expr, ) - def _get_allowed_sv_types(self, sv_types): - return super()._get_allowed_sv_types([ + def get_allowed_sv_type_ids(self, sv_types): + return super().get_allowed_sv_type_ids([ type.replace('gCNV_', '') for type in sv_types if type.startswith('gCNV_') ]) From 881df7dcbd64092505afb3d89cfb5a6b4b121b37 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Fri, 25 Aug 2023 10:48:03 -0400 Subject: [PATCH 08/16] add wes fixture data --- .../GRCh38/SV_WES/annotations.ht/.README.txt.crc | Bin 0 -> 12 bytes .../GRCh38/SV_WES/annotations.ht/._SUCCESS.crc | Bin 0 -> 8 bytes .../SV_WES/annotations.ht/.metadata.json.gz.crc | Bin 0 -> 12 bytes .../GRCh38/SV_WES/annotations.ht/README.txt | 3 +++ .../GRCh38/SV_WES/annotations.ht/_SUCCESS | 0 .../annotations.ht/globals/.metadata.json.gz.crc | Bin 0 -> 12 bytes .../annotations.ht/globals/metadata.json.gz | Bin 0 -> 366 bytes .../annotations.ht/globals/parts/.part-0.crc | Bin 0 -> 12 bytes .../SV_WES/annotations.ht/globals/parts/part-0 | Bin 0 -> 330 bytes .../.index.crc | Bin 0 -> 12 bytes .../.metadata.json.gz.crc | Bin 0 -> 12 bytes .../index | Bin 0 -> 104 bytes .../metadata.json.gz | Bin 0 -> 161 bytes .../GRCh38/SV_WES/annotations.ht/metadata.json.gz | Bin 0 -> 443 bytes .../annotations.ht/rows/.metadata.json.gz.crc | Bin 0 -> 16 bytes .../SV_WES/annotations.ht/rows/metadata.json.gz | Bin 0 -> 733 bytes ...0-0-0-7fdcd45d-8bea-106b-f455-e20e0468960a.crc | Bin 0 -> 12 bytes ...0-550-0-0-7fdcd45d-8bea-106b-f455-e20e0468960a | Bin 0 -> 366 bytes .../SV_WES/families/F000002_2.ht/.README.txt.crc | Bin 0 -> 12 bytes .../SV_WES/families/F000002_2.ht/._SUCCESS.crc | Bin 0 -> 8 bytes .../families/F000002_2.ht/.metadata.json.gz.crc | Bin 0 -> 12 bytes .../SV_WES/families/F000002_2.ht/README.txt | 3 +++ .../GRCh38/SV_WES/families/F000002_2.ht/_SUCCESS | 0 .../F000002_2.ht/globals/.metadata.json.gz.crc | Bin 0 -> 12 bytes .../F000002_2.ht/globals/metadata.json.gz | Bin 0 -> 295 bytes .../F000002_2.ht/globals/parts/.part-0.crc | Bin 0 -> 12 bytes .../families/F000002_2.ht/globals/parts/part-0 | Bin 0 -> 477 bytes .../.index.crc | Bin 0 -> 12 bytes .../.metadata.json.gz.crc | Bin 0 -> 12 bytes .../index | Bin 0 -> 104 bytes .../metadata.json.gz | Bin 0 -> 161 bytes .../SV_WES/families/F000002_2.ht/metadata.json.gz | Bin 0 -> 368 bytes .../F000002_2.ht/rows/.metadata.json.gz.crc | Bin 0 -> 16 bytes .../families/F000002_2.ht/rows/metadata.json.gz | Bin 0 -> 669 bytes ...8-0-0-a8c75457-2bef-dd0d-b9b9-f76af029cc48.crc | Bin 0 -> 12 bytes ...0-668-0-0-a8c75457-2bef-dd0d-b9b9-f76af029cc48 | Bin 0 -> 200 bytes .../SV_WES/projects/R0001_1kg.ht/.README.txt.crc | Bin 0 -> 12 bytes .../SV_WES/projects/R0001_1kg.ht/._SUCCESS.crc | Bin 0 -> 8 bytes .../projects/R0001_1kg.ht/.metadata.json.gz.crc | Bin 0 -> 12 bytes .../SV_WES/projects/R0001_1kg.ht/README.txt | 3 +++ .../GRCh38/SV_WES/projects/R0001_1kg.ht/_SUCCESS | 0 .../R0001_1kg.ht/globals/.metadata.json.gz.crc | Bin 0 -> 12 bytes .../R0001_1kg.ht/globals/metadata.json.gz | Bin 0 -> 295 bytes .../R0001_1kg.ht/globals/parts/.part-0.crc | Bin 0 -> 12 bytes .../projects/R0001_1kg.ht/globals/parts/part-0 | Bin 0 -> 485 bytes .../.index.crc | Bin 0 -> 12 bytes .../.metadata.json.gz.crc | Bin 0 -> 12 bytes .../index | Bin 0 -> 104 bytes .../metadata.json.gz | Bin 0 -> 161 bytes .../SV_WES/projects/R0001_1kg.ht/metadata.json.gz | Bin 0 -> 368 bytes .../R0001_1kg.ht/rows/.metadata.json.gz.crc | Bin 0 -> 16 bytes .../projects/R0001_1kg.ht/rows/metadata.json.gz | Bin 0 -> 667 bytes ...6-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.crc | Bin 0 -> 12 bytes ...0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7 | Bin 0 -> 227 bytes 54 files changed, 9 insertions(+) create mode 100644 hail_search/fixtures/GRCh38/SV_WES/annotations.ht/.README.txt.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/annotations.ht/._SUCCESS.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/annotations.ht/.metadata.json.gz.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/annotations.ht/README.txt create mode 100644 hail_search/fixtures/GRCh38/SV_WES/annotations.ht/_SUCCESS create mode 100644 hail_search/fixtures/GRCh38/SV_WES/annotations.ht/globals/.metadata.json.gz.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/annotations.ht/globals/metadata.json.gz create mode 100644 hail_search/fixtures/GRCh38/SV_WES/annotations.ht/globals/parts/.part-0.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/annotations.ht/globals/parts/part-0 create mode 100644 hail_search/fixtures/GRCh38/SV_WES/annotations.ht/index/part-0-550-0-0-7fdcd45d-8bea-106b-f455-e20e0468960a.idx/.index.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/annotations.ht/index/part-0-550-0-0-7fdcd45d-8bea-106b-f455-e20e0468960a.idx/.metadata.json.gz.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/annotations.ht/index/part-0-550-0-0-7fdcd45d-8bea-106b-f455-e20e0468960a.idx/index create mode 100644 hail_search/fixtures/GRCh38/SV_WES/annotations.ht/index/part-0-550-0-0-7fdcd45d-8bea-106b-f455-e20e0468960a.idx/metadata.json.gz create mode 100644 hail_search/fixtures/GRCh38/SV_WES/annotations.ht/metadata.json.gz create mode 100644 hail_search/fixtures/GRCh38/SV_WES/annotations.ht/rows/.metadata.json.gz.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/annotations.ht/rows/metadata.json.gz create mode 100644 hail_search/fixtures/GRCh38/SV_WES/annotations.ht/rows/parts/.part-0-550-0-0-7fdcd45d-8bea-106b-f455-e20e0468960a.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/annotations.ht/rows/parts/part-0-550-0-0-7fdcd45d-8bea-106b-f455-e20e0468960a create mode 100644 hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/.README.txt.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/._SUCCESS.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/.metadata.json.gz.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/README.txt create mode 100644 hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/_SUCCESS create mode 100644 hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/globals/.metadata.json.gz.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/globals/metadata.json.gz create mode 100644 hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/globals/parts/.part-0.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/globals/parts/part-0 create mode 100644 hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/index/part-0-668-0-0-a8c75457-2bef-dd0d-b9b9-f76af029cc48.idx/.index.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/index/part-0-668-0-0-a8c75457-2bef-dd0d-b9b9-f76af029cc48.idx/.metadata.json.gz.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/index/part-0-668-0-0-a8c75457-2bef-dd0d-b9b9-f76af029cc48.idx/index create mode 100644 hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/index/part-0-668-0-0-a8c75457-2bef-dd0d-b9b9-f76af029cc48.idx/metadata.json.gz create mode 100644 hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/metadata.json.gz create mode 100644 hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/rows/.metadata.json.gz.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/rows/metadata.json.gz create mode 100644 hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/rows/parts/.part-0-668-0-0-a8c75457-2bef-dd0d-b9b9-f76af029cc48.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/rows/parts/part-0-668-0-0-a8c75457-2bef-dd0d-b9b9-f76af029cc48 create mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/.README.txt.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/._SUCCESS.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/.metadata.json.gz.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/README.txt create mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/_SUCCESS create mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/globals/.metadata.json.gz.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/globals/metadata.json.gz create mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/globals/parts/.part-0.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/globals/parts/part-0 create mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/.index.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/.metadata.json.gz.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/index create mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/metadata.json.gz create mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/metadata.json.gz create mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/.metadata.json.gz.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/metadata.json.gz create mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/parts/.part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/parts/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7 diff --git a/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/.README.txt.crc b/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/.README.txt.crc new file mode 100644 index 0000000000000000000000000000000000000000..b1f590f9a02bb013dcccb7f6b37c306cce9139d8 GIT binary patch literal 12 TcmYc;N@ieSU}9)c=$s7z5sd=> literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/._SUCCESS.crc b/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/.metadata.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..15fb1af5aa824b7bac050f8349593a1996d87c7f GIT binary patch literal 12 TcmYc;N@ieSU}7j-QDOlA5?liS literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/README.txt b/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/README.txt new file mode 100644 index 0000000000..dc2b042c6b --- /dev/null +++ b/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.109-b71b065e4bb6 + Created at 2023/08/24 17:42:53 \ No newline at end of file diff --git a/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/_SUCCESS b/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/_SUCCESS new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/globals/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/globals/.metadata.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..5b3d27b2cbfff9db3ceb6448774f417fd5329a8e GIT binary patch literal 12 TcmYc;N@ieSU}BJKyz?Fa5+Vbo literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/globals/metadata.json.gz b/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/globals/metadata.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..13ff2b0cc2090baa841fbd177dd73e6633c717b1 GIT binary patch literal 366 zcmV-!0g?V6iwFP!000000F_c(OT#b}{x5m8f=c!6&6?wLnrxs;Xt6 z3uS;ODM-KprG_sHl0na((`}(H`C*cCfh{>T2W!inBN)vx4JJGpf>IS*U?xee;4SKw z&M#qqTtcV*uz0U>uWT~}fi4R$QFvgYK`7~;b(%$2K M00ILrdW~*mo#PA!Qk~<}r-o z@fl@;qmVF)C7H+U9(=bG#05oB2mFC1j*}-DBFb99ERmPMZ%@wW@XAoq+vy t*QoMkqiJB|!X_(K>h4147u*}(-OA^(kRPkfdQ};@RBwgUpW9L!@DHALUmXAd literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/index/part-0-550-0-0-7fdcd45d-8bea-106b-f455-e20e0468960a.idx/.index.crc b/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/index/part-0-550-0-0-7fdcd45d-8bea-106b-f455-e20e0468960a.idx/.index.crc new file mode 100644 index 0000000000000000000000000000000000000000..399bfc17c871350c80f57e12e1ae79569e924312 GIT binary patch literal 12 TcmYc;N@ieSU}Cs&B!w9O6ZZo? literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/index/part-0-550-0-0-7fdcd45d-8bea-106b-f455-e20e0468960a.idx/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/index/part-0-550-0-0-7fdcd45d-8bea-106b-f455-e20e0468960a.idx/.metadata.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..d82735252bf5ccf12488fef777e0997b07455a6a GIT binary patch literal 12 TcmYc;N@ieSU}C6Ka$F7o5eEXp literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/index/part-0-550-0-0-7fdcd45d-8bea-106b-f455-e20e0468960a.idx/index b/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/index/part-0-550-0-0-7fdcd45d-8bea-106b-f455-e20e0468960a.idx/index new file mode 100644 index 0000000000000000000000000000000000000000..7f91fffbf24726a4068986ff0fc32f802e19088b GIT binary patch literal 104 zcmZ=}U|P=$GYN<=Y_K#nHUXWz=X-wd^tEO@gh{vGdjkr_33u>C2K;VoQ;d2XhKcN5y*87Yw4Ie~ PvXt@x0+ti5w*UYD`u|7U literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/metadata.json.gz b/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/metadata.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..f735bf3d0f716d2bd74ce095700aeed316a41c8d GIT binary patch literal 443 zcmV;s0Yv^EiwFP!000000F6`MPunmM{$KJ0l0l{In&G#q6C2VrO)%}Hsw~Gj(lEyk z_9akN{@-W2aoWL+wvV~!(ipy5`(UlA4YArFl&k^;On5Oob+LX@HT+SVQ{-R0bEIoajzIvp z9z=pmX#wehL|UUCK_IKTx*TF|`13sYPe$_wYUY zdbirnKA)2cnMICc!^<0bBQ?j_lq^^DRz>5-V0_a}vV42da>Or*!HZS{VyapGwo-n6 z0x;jqEAZVtj_V4|Mff<=o8<=V(3O7~v{V#snw4_Tz0yJ+x=~}6(s&?DMswy(^4RsqF z<~Rt3^51uzK)wj-$773B!S%V<=iYPdWJ?5H0P)C`V6X;u_xt>Q34u)v;=jO#cn@Sn zC8t99oRLS21~G#`V~DriC;!|mflbNS|C)18t?HqCF z1&nYpuQ}mb|NrviZt~O{eyx@*7G*bW7C}#!pVZ=Wrd3fQI>H z^rjoCZtIZCL|bxQxVY~acn2omKkvvCTPTcIVwbqUM@~pEjP;^yADak zr^cq?H_hVMi7IzL*t@msXR(U{DQI+qDt}D=d_M`ITkzape#j&6TdE$~rii@FA_Z0g z9&y%P`JJvQ+h{oDkbawdoBDBIbx}S-_)}U9r}N6wEN`|NLld15U1W*3{{_mC=$9=0 z!?aA-wM_jPge+i=%go^nsCjDjXJ+WQt_ijUmeU`e^(~sejXmE{F^6%S3a0C9N&ZmL zjnrP-Jd$7!eC?mwj%!=~`LA2e0H1NkfM;&cDREo`QOeMCd1Y(?%Ggww(2-1&p-M?) Pfnxs$zqifLR|)_C=rwH! literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/rows/parts/.part-0-550-0-0-7fdcd45d-8bea-106b-f455-e20e0468960a.crc b/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/rows/parts/.part-0-550-0-0-7fdcd45d-8bea-106b-f455-e20e0468960a.crc new file mode 100644 index 0000000000000000000000000000000000000000..f57036f8ca98914611ae7808b7c1bf6ce8a2a59c GIT binary patch literal 12 TcmYc;N@ieSU}9iBt^N=I5hMc> literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/rows/parts/part-0-550-0-0-7fdcd45d-8bea-106b-f455-e20e0468960a b/hail_search/fixtures/GRCh38/SV_WES/annotations.ht/rows/parts/part-0-550-0-0-7fdcd45d-8bea-106b-f455-e20e0468960a new file mode 100644 index 0000000000000000000000000000000000000000..26a44a296a4d81299ce4996a5101e477514833c5 GIT binary patch literal 366 zcmWNJ%`0?K0LGtthrx|I=Gsg)7W@JC+;i?Z_xPHUk0z2xv2hkkyi1dkjU6CZ%jU+FoDnjK_{is@8uu^xsG%?@eAlB2KZZ!9P8{@!M7%up`fwUo;M@%qie+n= zQzw?4&$J}9qEdhFJ{88^pRb}Jg(?TV!t(eY2*(6TLlaePx@Tp%VZln4;E@iWP5nyD7Ve< E3);YQKL7v# literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/.README.txt.crc b/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/.README.txt.crc new file mode 100644 index 0000000000000000000000000000000000000000..17e76b8f5b74d8de753995b0ff9788fbcbea6317 GIT binary patch literal 12 TcmYc;N@ieSU}A_74iX0d599(9 literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/._SUCCESS.crc b/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/.metadata.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..d51b8848d746ac5febcdf9b99b63652a9a7c3f2e GIT binary patch literal 12 TcmYc;N@ieSU}Erj_ml?!6Ho(Y literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/README.txt b/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/README.txt new file mode 100644 index 0000000000..440bda49da --- /dev/null +++ b/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.109-b71b065e4bb6 + Created at 2023/08/25 10:47:38 \ No newline at end of file diff --git a/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/_SUCCESS b/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/_SUCCESS new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/globals/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/globals/.metadata.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..d2dc39b830e972bd0f8cbab05cf11594ac345777 GIT binary patch literal 12 TcmYc;N@ieSU}CUeE&Kog5rYF5 literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/globals/metadata.json.gz b/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/globals/metadata.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..be8cfbcd3f386024b3c5ddbf688fc61596f64fa7 GIT binary patch literal 295 zcmV+?0oeW@iwFP!000000F{wXPlG@Z#lOp5i-}gF4L7M64^2#p@gQl)c6Y$Fu#o*@ zf`sqx47IIbFLGMm@4YuH(=p8}U?F=p5)8=o>qF9jn6rRoJ7`&$Ztx^(34(MMc89=A z+kjji0lI)BB(pf;77}N?aGh@?cVNR=d~XbIQ-n%$({2LGtEPlZ$#XF0l`7k;@Mm#P z<34}JO$H_4Qk0Y~MkDqO2QaE1p`m)~^Bm0N!mBvCnlGXg3Yd6TrTk@y%3AEOWn|it zn5Zu_T+i=r7k^``x0|Qsko{*6zZrNnP4@KQEgEjzqbkAT78$)$V0 literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/globals/parts/part-0 b/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/globals/parts/part-0 new file mode 100644 index 0000000000000000000000000000000000000000..d08c20b4765418d26668dcc7e1325bab2c8a85e9 GIT binary patch literal 477 zcmXAkO-|cT6oth5))$cvjT>c zQ3tG}i|7_Q>k=x_1@u}Bo<2R@bI-ZwHv_)6fWITgt}m-S$Eot$#ZH#iX_eQNb1#$= z#lu2}PCSZ|^Dn2#!Qv*(lB2TD7iBsng>;4|QKE#>B9v~DmgP;I)_FEK#@mNMmCokV z{Ag4q6S6GRSw5KLS<5(#gb1`WVJQ=DhTgQk3gv0xB@b8A>tym!2q8JT0f(iDA<-r7 za;$nBJxRmO+2t5pQqhRsD#_N$M?}8%^cWT?JG5a-!D0Lam;31Y8G3q+WBC`Vt88{b z86B*L9k3TfLFmgfEK^A1*o68-3N{3@CD{>sirhX18_c33Oae)|?gnc;Vk>Cwa>>2@ z2k2m^rR)A+aeeo#uFh~yUr)TCSv3B^R%F;!1+|H>?YRZ4pbz7TaUH|>#BAk%Mtja$ neVB7*skniyskOnEY~6GsD!S5qe($s%v`N>;$}1ZeLMi+Qjf`G+ literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/index/part-0-668-0-0-a8c75457-2bef-dd0d-b9b9-f76af029cc48.idx/.index.crc b/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/index/part-0-668-0-0-a8c75457-2bef-dd0d-b9b9-f76af029cc48.idx/.index.crc new file mode 100644 index 0000000000000000000000000000000000000000..e911752221cc1e519f5019165a26c23a55dcce78 GIT binary patch literal 12 TcmYc;N@ieSU}AX4DPso!5tRae literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/index/part-0-668-0-0-a8c75457-2bef-dd0d-b9b9-f76af029cc48.idx/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/index/part-0-668-0-0-a8c75457-2bef-dd0d-b9b9-f76af029cc48.idx/.metadata.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..d82735252bf5ccf12488fef777e0997b07455a6a GIT binary patch literal 12 TcmYc;N@ieSU}C6Ka$F7o5eEXp literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/index/part-0-668-0-0-a8c75457-2bef-dd0d-b9b9-f76af029cc48.idx/index b/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/index/part-0-668-0-0-a8c75457-2bef-dd0d-b9b9-f76af029cc48.idx/index new file mode 100644 index 0000000000000000000000000000000000000000..6ba08ef5883c0e5facfd4027a17a9e3be7e129ca GIT binary patch literal 104 zcmZ=}U|P=0FbRk-Y_K#nHUX=;&m;>pLL7)ca4<45FkmqU0AK7D3;+NC literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/index/part-0-668-0-0-a8c75457-2bef-dd0d-b9b9-f76af029cc48.idx/metadata.json.gz b/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/index/part-0-668-0-0-a8c75457-2bef-dd0d-b9b9-f76af029cc48.idx/metadata.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..c951bb1eef5c6a28538ddc6c8c5164018acd0fb5 GIT binary patch literal 161 zcmV;S0ABweiwFP!0000009B1K4#F@DMepJ{10ZPC9Ds!ZBoHfdTeq=54oY1_RORj% zHa6@3_n#LS_(4}n6Gs7G?doRZpiL-UU>Wz=X-wd^tEO@gh{vGdjkr_33u>C2K;VoQ;d2XhKcN5y*87Yw4Ie~ PvXt@x0+ti5w*UYD`u|7U literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/metadata.json.gz b/hail_search/fixtures/GRCh38/SV_WES/families/F000002_2.ht/metadata.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..6a81de8537f587e4d4f1f9124f192692de52dcca GIT binary patch literal 368 zcmV-$0gwJ4iwFP!000000F6>jPs1<_{VzR-sT*Sxxj~T-7X)#DLq9_>7XoK}zjiYmk`L;m{RQryzOkY$IIPm<$}&JVv)Bs8q#< z)leC7m08X2tcCVODk|_> z0`MuT>K`~=-_Xncs0VNm@_AJBPLQ!CL(v3+`W&BCG>zM}% zRLA^(Jd0|o1V6R9bkq*48=>fVEFM=pqOsU(d<6PU-_ufru8dX`w+?t09QQGl3aZ1N z-rgS~FNqC`Vb!PodVNo*-v8e>h{mGO)9&i+(_7mIBIyc##_&NO39|cx7C|U!T!1G}WUh1S+kyg6$DIhoUX0kgki!#NrSr zx?BP;LSwpGhv0)mY$A2r$&e*Tz&09ap917&<-;zLmAqI}r88`FmtB2(%&vY+Pglu2 zF*r#LNXy7m zR%fvZ0#Fq*-j0XYPJdV|S%f`_ke5P+VZ@%1%309DfK%~;0d1$(?bp+8f4kpDY3@4{ zv2mcQYGz`7*o82oS7~CBVd*3)2=A$WLj3CLa~f_?6h26Y?oE?e9+W_+*qCUC{zpB# zu7$H6<4x}7A{JM?*QPkJq9BMt=jEuBk)~SU;hyCt&9*N1l-QiP*RZ%W1NrW?y(_zZ z6uaQSOB&wcDgWQ`r~4_8%?FfrC!l&Ete*;fAB>(#qHZLSQdfUs{e9Zi=e!=br zj11|92F50qrUDFMX6DA`h5`)ljVuf-ER8u@wg6S}$ht5X$m;1H_TLvMH84Ly2=j{<<|GH6! z!C2-$qbP&wiz%WE&a#XQ0`?5vmZrugaf~0Bm+ZXF#lZ0I_)RWm9<~3B{}~wo5OXuA literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/.README.txt.crc b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/.README.txt.crc new file mode 100644 index 0000000000000000000000000000000000000000..17c414e9d5b7ded20df83dfa3a45f6c4cce0c1f5 GIT binary patch literal 12 TcmYc;N@ieSU}E6d*>(#65wQbY literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/._SUCCESS.crc b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/.metadata.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..d51b8848d746ac5febcdf9b99b63652a9a7c3f2e GIT binary patch literal 12 TcmYc;N@ieSU}Erj_ml?!6Ho(Y literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/README.txt b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/README.txt new file mode 100644 index 0000000000..e9bf5b62a9 --- /dev/null +++ b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.109-b71b065e4bb6 + Created at 2023/08/25 10:45:26 \ No newline at end of file diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/_SUCCESS b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/_SUCCESS new file mode 100644 index 0000000000..e69de29bb2 diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/globals/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/globals/.metadata.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..d2dc39b830e972bd0f8cbab05cf11594ac345777 GIT binary patch literal 12 TcmYc;N@ieSU}CUeE&Kog5rYF5 literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/globals/metadata.json.gz b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/globals/metadata.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..be8cfbcd3f386024b3c5ddbf688fc61596f64fa7 GIT binary patch literal 295 zcmV+?0oeW@iwFP!000000F{wXPlG@Z#lOp5i-}gF4L7M64^2#p@gQl)c6Y$Fu#o*@ zf`sqx47IIbFLGMm@4YuH(=p8}U?F=p5)8=o>qF9jn6rRoJ7`&$Ztx^(34(MMc89=A z+kjji0lI)BB(pf;77}N?aGh@?cVNR=d~XbIQ-n%$({2LGtEPlZ$#XF0l`7k;@Mm#P z<34}JO$H_4Qk0Y~MkDqO2QaE1p`m)~^Bm0N!mBvCnlGXg3Yd6TrTk@y%3AEOWn|it zn5Zu_T+i=r7k^``x0|Qsko{*6zZrNnP4@KQEgEjzqbkAT78$)#?&bCf8dvQMnVKyny{3K=M`;yd8b4eh;8eGP@WcEa(zC#NT%0?5R#)8 za9Elc5?$gh$C}5{mo(g-pG~kM6^-bvk!-!ZLF8*sPhgRzB!R`?Sp)ZfnrjW+53H6B-YzSscvLp5txoren%%USq0!e!A7Hi#L zCur_+$-VO%=whU$>prl!{(f3lM>wXhC+^WK8h>FcGVG~>+QfM6xdp6X0OOW%9m9CV uY~{a2XTe$nm9Bkf3is-l)^tN(qJe6 literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/.index.crc b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/.index.crc new file mode 100644 index 0000000000000000000000000000000000000000..a2063fcbcdcd520eae71faab30752ce90e7472dd GIT binary patch literal 12 TcmYc;N@ieSU}8Axuuu>H5~l+4 literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/.metadata.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..d82735252bf5ccf12488fef777e0997b07455a6a GIT binary patch literal 12 TcmYc;N@ieSU}C6Ka$F7o5eEXp literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/index b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/index new file mode 100644 index 0000000000000000000000000000000000000000..d9f7a479646071fbb913a7ccf66d675f9cd4d029 GIT binary patch literal 104 zcmZ=}U|P>>FbRk-Y_K#nHUX>p$|MUkLL7)ca4<45FkmqU0Bax?NB{r; literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/metadata.json.gz b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/metadata.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..c951bb1eef5c6a28538ddc6c8c5164018acd0fb5 GIT binary patch literal 161 zcmV;S0ABweiwFP!0000009B1K4#F@DMepJ{10ZPC9Ds!ZBoHfdTeq=54oY1_RORj% zHa6@3_n#LS_(4}n6Gs7G?doRZpiL-UU>Wz=X-wd^tEO@gh{vGdjkr_33u>C2K;VoQ;d2XhKcN5y*87Yw4Ie~ PvXt@x0+ti5w*UYD`u|7U literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/metadata.json.gz b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/metadata.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..6a81de8537f587e4d4f1f9124f192692de52dcca GIT binary patch literal 368 zcmV-$0gwJ4iwFP!000000F6>jPs1<_{VzR-sT*Sxxj~T-7X)#DLq9_>7XoK}zjiYmk`L;m{RQryzOkY$IIPm<$}&JVv)Bs8q#< z)leC7m08X2tcCVODk|_> z0`MuT>K`~=-_Xncs0VNm@_AJBPLQ!CL(v3+`W&BCG>zM}% zRLA^(Jd0|o1V6R9bkq*48=>fVEFM=pqOsU(d<6PU-_ufru8dX`w+?t09QQGl3aZ1N z-rgS~FNqC`Vb!PodVNo*-v8e>h{bF8@ixfp^Nc&KPtYtkW+nBYr zUMLj#-#csY-6%;P0}?W4XJ*fwJ!AVw1l<5}$xQgL2mj&c%{&HX5rcToa3Jo2?5N~a zD3AMOi71PFU^<4l$2|G*5Q9G{7BC7lgexDWW17HR$t_dIvXYB97gQdMfQJfX;!ZzH zNe_!0Um75k##_$NcvPTaI3h0KOUoDsH;a&2viucgxv|NL(vXoNaHdxHGBvp zU9Z6}Lle4*L-0T#HWSLWa%2qxu#FDd=K!(Yc(9K|BQH@@svMi#9j%@`I;bL9rHwU#iL8WvjZXKdE^=hltDOQJ69ZGczRpr;zs6(Pcp+cYu($ex& z)md$W0A$0Ax6^UY8jZ^(gRmza`jwDz7_n!hQVUvXaL)f?fNc#2qjuWuFX!7N%Y9=a zHV$;t%uMwMyWo2C8cob{ESp3L;XPGPh+kcO%)&K_!Ux&Vy=oGxgA#C=XcO(x|ETA{ zF>uxs{8PBOjKvlAr72F0C` zo76A+s*Cak!tc^*sLl(|pRCzZhI({LbdebE=U?zR68$ov|Crb2z!~5#e*GQ**nQI) z!nL__U}&=5YGpDu7zNC-sT~Znw~pTrRIFhfZH2Gv97*<=(2Z1HPh65<4}9 zdpAGsH3NJoT?+ivx1A!63l?sDG+kC1TYv(a>Jl1BH5sauL>741{{ZA&-4{Oz001x# BKyv^9 literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/parts/.part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.crc b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/parts/.part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.crc new file mode 100644 index 0000000000000000000000000000000000000000..c0e184f89f44249ff5950572cdd5f48baaf0a082 GIT binary patch literal 12 TcmYc;N@ieSU}A_~y8S2s6AlBn literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/parts/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7 b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/parts/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7 new file mode 100644 index 0000000000000000000000000000000000000000..0dca74ce0f4f9ec367c684dd58d234018ba50a4b GIT binary patch literal 227 zcmcc5z`(GRk%8d@2P1=EacNpwW<|WAk%_61LA*<700RpHBf|rr^tP9?`Iz}#{es;M z7#Y$H4UA1JO$8Xj%*>6=4Fwq98(A1wSQ>M*Yyqm`k#%7(kkEW zf=vueEsei22uf%$TwC3@q#!gg7!7 z3a-!FBgFo7qaK5?%zs8v2Gti+L>UBR9T|iL92vYVO^r=T7%wc@d7F!Y;otF_T+BRb Z`3%ew3?CTQKDrAO`@ZA`P;3b!0|1FnI^6&O literal 0 HcmV?d00001 From 919744d858c86830c929aa4cce1c01d32a5a2072 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Fri, 25 Aug 2023 11:19:57 -0400 Subject: [PATCH 09/16] initial response --- hail_search/hail_search_query.py | 2 + hail_search/test_search.py | 1013 +++++++++++++++--------------- hail_search/test_utils.py | 165 +++++ 3 files changed, 686 insertions(+), 494 deletions(-) diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index 678b9b3343..775248242d 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -1245,6 +1245,8 @@ def get_allowed_sv_type_ids(self, sv_types): type.replace('gCNV_', '') for type in sv_types if type.startswith('gCNV_') ]) + # TODO override genotype fields in genotypes response, actually return geneIds + def _additional_annotation_fields(self): return {} diff --git a/hail_search/test_search.py b/hail_search/test_search.py index 71ebe42b92..7b772f0afc 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -4,7 +4,7 @@ from hail_search.test_utils import get_hail_search_body, FAMILY_2_VARIANT_SAMPLE_DATA, FAMILY_2_MISSING_SAMPLE_DATA, \ VARIANT1, VARIANT2, VARIANT3, VARIANT4, MULTI_PROJECT_SAMPLE_DATA, MULTI_PROJECT_MISSING_SAMPLE_DATA, \ LOCATION_SEARCH, EXCLUDE_LOCATION_SEARCH, VARIANT_ID_SEARCH, RSID_SEARCH, GENE_COUNTS, SV_WGS_SAMPLE_DATA, \ - SV_VARIANT1, SV_VARIANT2, SV_VARIANT3, SV_VARIANT4 + SV_VARIANT1, SV_VARIANT2, SV_VARIANT3, SV_VARIANT4, GCNV_VARIANT1, GCNV_VARIANT2, GCNV_VARIANT3, GCNV_VARIANT4 from hail_search.web_app import init_web_app PROJECT_2_VARIANT = { @@ -113,11 +113,11 @@ class HailSearchTestCase(AioHTTPTestCase): async def get_application(self): return init_web_app() - async def test_status(self): - async with self.client.request('GET', '/status') as resp: - self.assertEqual(resp.status, 200) - resp_json = await resp.json() - self.assertDictEqual(resp_json, {'success': True}) + # async def test_status(self): + # async with self.client.request('GET', '/status') as resp: + # self.assertEqual(resp.status, 200) + # resp_json = await resp.json() + # self.assertDictEqual(resp_json, {'success': True}) async def _assert_expected_search(self, results, gene_counts=None, **search_kwargs): search_body = get_hail_search_body(**search_kwargs) @@ -127,12 +127,16 @@ async def _assert_expected_search(self, results, gene_counts=None, **search_kwar self.assertSetEqual(set(resp_json.keys()), {'results', 'total'}) self.assertEqual(resp_json['total'], len(results)) for i, result in enumerate(resp_json['results']): + if result != results[i]: + import pdb; pdb.set_trace() self.assertEqual(result, results[i]) if gene_counts: async with self.client.request('POST', '/gene_counts', json=search_body) as resp: self.assertEqual(resp.status, 200) gene_counts_json = await resp.json() + if gene_counts_json != gene_counts: + import pdb; pdb.set_trace() self.assertDictEqual(gene_counts_json, gene_counts) async def test_single_family_search(self): @@ -144,497 +148,518 @@ async def test_single_family_search(self): ) await self._assert_expected_search( - [SV_VARIANT1, SV_VARIANT2, SV_VARIANT3, SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, - ) - - async def test_single_project_search(self): - await self._assert_expected_search( - [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], omit_sample_type='SV_WES', gene_counts={ - 'ENSG00000097046': {'total': 3, 'families': {'F000002_2': 2, 'F000003_3': 1}}, - 'ENSG00000177000': {'total': 3, 'families': {'F000002_2': 2, 'F000003_3': 1}}, + [GCNV_VARIANT1, GCNV_VARIANT2, GCNV_VARIANT3, GCNV_VARIANT4], omit_sample_type='VARIANTS', gene_counts={ + 'ENSG00000129562': {'total': 1, 'families': {'F000002_2': 1}}, + 'ENSG00000013364': {'total': 1, 'families': {'F000002_2': 1}}, + 'ENSG00000079616': {'total': 1, 'families': {'F000002_2': 1}}, + 'ENSG00000103495': {'total': 1, 'families': {'F000002_2': 1}}, + 'ENSG00000167371': {'total': 1, 'families': {'F000002_2': 1}}, + 'ENSG00000280789': {'total': 1, 'families': {'F000002_2': 1}}, + 'ENSG00000280893': {'total': 1, 'families': {'F000002_2': 1}}, + 'ENSG00000281348': {'total': 1, 'families': {'F000002_2': 1}}, + 'ENSG00000275023': {'total': 2, 'families': {'F000002_2': 2}}, + 'ENSG00000277258': {'total': 1, 'families': {'F000002_2': 1}}, + 'ENSG00000277972': {'total': 1, 'families': {'F000002_2': 1}}, } ) - async def test_multi_project_search(self): - await self._assert_expected_search( - [PROJECT_2_VARIANT, MULTI_PROJECT_VARIANT1, MULTI_PROJECT_VARIANT2, VARIANT3, VARIANT4], - gene_counts=GENE_COUNTS, sample_data=MULTI_PROJECT_SAMPLE_DATA, - ) - - async def test_inheritance_filter(self): - inheritance_mode = 'any_affected' - await self._assert_expected_search( - [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES', - ) - - await self._assert_expected_search( - [SV_VARIANT1, SV_VARIANT2, SV_VARIANT3, SV_VARIANT4], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, - ) - - await self._assert_expected_search( - [SV_VARIANT2], inheritance_mode=inheritance_mode, annotations=NEW_SV_FILTER, sample_data=SV_WGS_SAMPLE_DATA, - ) - - inheritance_mode = 'de_novo' - await self._assert_expected_search( - [VARIANT1, FAMILY_3_VARIANT, VARIANT4], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES', - ) - - await self._assert_expected_search( - [SV_VARIANT1], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, - ) - - inheritance_mode = 'x_linked_recessive' - await self._assert_expected_search([], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES') - await self._assert_expected_search([], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA) - - inheritance_mode = 'homozygous_recessive' - await self._assert_expected_search( - [VARIANT2], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES', - ) - - await self._assert_expected_search( - [PROJECT_2_VARIANT1, VARIANT2], inheritance_mode=inheritance_mode, sample_data=MULTI_PROJECT_SAMPLE_DATA, - ) - - await self._assert_expected_search( - [SV_VARIANT4], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, - ) - - gt_inheritance_filter = {'genotype': {'I000006_hg00733': 'has_alt', 'I000005_hg00732': 'ref_ref'}} - await self._assert_expected_search( - [VARIANT2, VARIANT3], inheritance_filter=gt_inheritance_filter, sample_data=FAMILY_2_VARIANT_SAMPLE_DATA) - - inheritance_mode = 'compound_het' - await self._assert_expected_search( - [[VARIANT3, VARIANT4]], inheritance_mode=inheritance_mode, sample_data=MULTI_PROJECT_SAMPLE_DATA, gene_counts={ - 'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}}, - 'ENSG00000177000': {'total': 1, 'families': {'F000002_2': 1}}, - }, **COMP_HET_ALL_PASS_FILTERS, - ) - - await self._assert_expected_search( - [[SV_VARIANT1, SV_VARIANT2]], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, - **COMP_HET_ALL_PASS_FILTERS, - ) - - inheritance_mode = 'recessive' - await self._assert_expected_search( - [PROJECT_2_VARIANT1, VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode=inheritance_mode, gene_counts={ - 'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}}, - 'ENSG00000177000': {'total': 2, 'families': {'F000002_2': 2}}, - }, sample_data=MULTI_PROJECT_SAMPLE_DATA, **COMP_HET_ALL_PASS_FILTERS, - ) - - await self._assert_expected_search( - [[SV_VARIANT1, SV_VARIANT2], SV_VARIANT4], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, - **COMP_HET_ALL_PASS_FILTERS, - ) - - async def test_quality_filter(self): - quality_filter = {'vcf_filter': 'pass'} - await self._assert_expected_search( - [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', - ) - - await self._assert_expected_search([SV_VARIANT4], quality_filter=quality_filter, sample_data=SV_WGS_SAMPLE_DATA) - - await self._assert_expected_search( - [VARIANT2, MULTI_FAMILY_VARIANT], quality_filter={'min_gq': 40}, omit_sample_type='SV_WES', - ) - - sv_quality_filter = {'min_gq_sv': 40} - await self._assert_expected_search( - [SV_VARIANT3, SV_VARIANT4], quality_filter=sv_quality_filter, sample_data=SV_WGS_SAMPLE_DATA, - ) - - await self._assert_expected_search( - [], annotations=NEW_SV_FILTER, quality_filter=sv_quality_filter, sample_data=SV_WGS_SAMPLE_DATA, - ) - - await self._assert_expected_search( - [VARIANT2, MULTI_FAMILY_VARIANT], quality_filter={'min_gq': 40, 'vcf_filter': 'pass'}, omit_sample_type='SV_WES', - ) - - await self._assert_expected_search( - [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT], quality_filter={'min_gq': 60, 'affected_only': True}, - omit_sample_type='SV_WES', - ) - - await self._assert_expected_search( - [SV_VARIANT3, SV_VARIANT4], quality_filter={'min_gq_sv': 60, 'affected_only': True}, sample_data=SV_WGS_SAMPLE_DATA, - ) - - await self._assert_expected_search( - [VARIANT1, VARIANT2, FAMILY_3_VARIANT], quality_filter={'min_ab': 50}, omit_sample_type='SV_WES', - ) - - await self._assert_expected_search( - [VARIANT2, VARIANT3], quality_filter={'min_ab': 70, 'affected_only': True}, - omit_sample_type='SV_WES', - ) - - quality_filter = {'min_gq': 40, 'min_ab': 50} - await self._assert_expected_search( - [VARIANT2, FAMILY_3_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', - ) - - annotations = {'splice_ai': '0.0'} # Ensures no variants are filtered out by annotation/path filters - await self._assert_expected_search( - [VARIANT1, VARIANT2, FAMILY_3_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', - annotations=annotations, pathogenicity={'clinvar': ['likely_pathogenic', 'vus_or_conflicting']}, - ) - - await self._assert_expected_search( - [VARIANT2, FAMILY_3_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', - annotations=annotations, pathogenicity={'clinvar': ['pathogenic']}, - ) - - async def test_location_search(self): - await self._assert_expected_search( - [VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], omit_sample_type='SV_WES', **LOCATION_SEARCH, - ) - - sv_intervals = ['1:9310023-9380264'] - await self._assert_expected_search( - [SV_VARIANT1, SV_VARIANT2], sample_data=SV_WGS_SAMPLE_DATA, intervals=sv_intervals, gene_ids=['ENSG00000171621'], - ) - - await self._assert_expected_search( - [VARIANT1], omit_sample_type='SV_WES', **EXCLUDE_LOCATION_SEARCH, - ) - - await self._assert_expected_search( - [SV_VARIANT3, SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, intervals=sv_intervals, exclude_intervals=True, - ) - - await self._assert_expected_search( - [SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT], omit_sample_type='SV_WES', - intervals=LOCATION_SEARCH['intervals'][-1:], gene_ids=LOCATION_SEARCH['gene_ids'][:1] - ) - - async def test_variant_id_search(self): - await self._assert_expected_search([VARIANT2], omit_sample_type='SV_WES', **RSID_SEARCH) - - await self._assert_expected_search([VARIANT1], omit_sample_type='SV_WES', **VARIANT_ID_SEARCH) - - await self._assert_expected_search( - [VARIANT1], omit_sample_type='SV_WES', variant_ids=VARIANT_ID_SEARCH['variant_ids'][:1], - ) - - await self._assert_expected_search( - [], omit_sample_type='SV_WES', variant_ids=VARIANT_ID_SEARCH['variant_ids'][1:], - ) - - await self._assert_expected_search([SV_VARIANT2, SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, variant_keys=[ - 'cohort_2911.chr1.final_cleanup_INS_chr1_160', 'phase2_DEL_chr14_4640', - ]) - - async def test_frequency_filter(self): - await self._assert_expected_search( - [VARIANT1, VARIANT4], frequencies={'seqr': {'af': 0.2}}, omit_sample_type='SV_WES', - ) - - await self._assert_expected_search( - [MULTI_FAMILY_VARIANT, VARIANT4], frequencies={'seqr': {'ac': 4}}, omit_sample_type='SV_WES', - ) - - await self._assert_expected_search( - [MULTI_FAMILY_VARIANT, VARIANT4], frequencies={'seqr': {'hh': 1}}, omit_sample_type='SV_WES', - ) - - await self._assert_expected_search( - [VARIANT4], frequencies={'seqr': {'ac': 4, 'hh': 0}}, omit_sample_type='SV_WES', - ) - - await self._assert_expected_search( - [SV_VARIANT1], frequencies={'sv_callset': {'af': 0.05}}, sample_data=SV_WGS_SAMPLE_DATA, - ) - - await self._assert_expected_search( - [VARIANT1, VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.05}}, omit_sample_type='SV_WES', - ) - - await self._assert_expected_search( - [VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.05, 'hh': 1}}, omit_sample_type='SV_WES', - ) - - await self._assert_expected_search( - [VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.005}}, omit_sample_type='SV_WES', - ) - - await self._assert_expected_search( - [SV_VARIANT1, SV_VARIANT3, SV_VARIANT4], frequencies={'gnomad_svs': {'af': 0.001}}, sample_data=SV_WGS_SAMPLE_DATA, - ) - await self._assert_expected_search( - [VARIANT4], frequencies={'seqr': {'af': 0.2}, 'gnomad_genomes': {'ac': 50}}, - omit_sample_type='SV_WES', - ) - - await self._assert_expected_search( - [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], frequencies={'seqr': {}, 'gnomad_genomes': {'af': None}}, - omit_sample_type='SV_WES', - ) - - annotations = {'splice_ai': '0.0'} # Ensures no variants are filtered out by annotation/path filters - await self._assert_expected_search( - [VARIANT1, VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.01}}, omit_sample_type='SV_WES', - annotations=annotations, pathogenicity={'clinvar': ['pathogenic', 'likely_pathogenic', 'vus_or_conflicting']}, - ) - - await self._assert_expected_search( - [VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.01}}, omit_sample_type='SV_WES', - annotations=annotations, pathogenicity={'clinvar': ['pathogenic', 'vus_or_conflicting']}, - ) - - async def test_annotations_filter(self): - await self._assert_expected_search([VARIANT2], pathogenicity={'hgmd': ['hgmd_other']}, omit_sample_type='SV_WES') - - pathogenicity = {'clinvar': ['likely_pathogenic', 'vus_or_conflicting', 'benign']} - await self._assert_expected_search([VARIANT1, VARIANT2], pathogenicity=pathogenicity, omit_sample_type='SV_WES') - - pathogenicity['clinvar'] = pathogenicity['clinvar'][:1] - await self._assert_expected_search( - [VARIANT1, VARIANT4], pathogenicity=pathogenicity, annotations={'SCREEN': ['CTCF-only', 'DNase-only']}, - omit_sample_type='SV_WES', - ) - - annotations = { - 'missense': ['missense_variant'], 'in_frame': ['inframe_insertion', 'inframe_deletion'], 'frameshift': None, - 'structural_consequence': ['INTRONIC'], - } - await self._assert_expected_search( - [VARIANT1, VARIANT2, VARIANT4], pathogenicity=pathogenicity, annotations=annotations, omit_sample_type='SV_WES', - ) - - await self._assert_expected_search([VARIANT2, VARIANT4], annotations=annotations, omit_sample_type='SV_WES') - - await self._assert_expected_search([SV_VARIANT1], annotations=annotations, sample_data=SV_WGS_SAMPLE_DATA) - - annotations['splice_ai'] = '0.005' - await self._assert_expected_search( - [VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], annotations=annotations, omit_sample_type='SV_WES', - ) - - annotations['structural'] = ['DEL'] - await self._assert_expected_search([SV_VARIANT1, SV_VARIANT4], annotations=annotations, sample_data=SV_WGS_SAMPLE_DATA) - - annotations = {'other': ['non_coding_transcript_exon_variant']} - await self._assert_expected_search( - [VARIANT1, SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, SELECTED_ANNOTATION_TRANSCRIPT_MULTI_FAMILY_VARIANT], - pathogenicity=pathogenicity, annotations=annotations, omit_sample_type='SV_WES', - ) - - await self._assert_expected_search( - [SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT], - gene_ids=LOCATION_SEARCH['gene_ids'][:1], annotations=annotations, omit_sample_type='SV_WES', - ) - - async def test_secondary_annotations_filter(self): - annotations_1 = {'missense': ['missense_variant']} - annotations_2 = {'other': ['intron_variant']} - - await self._assert_expected_search( - [[VARIANT3, VARIANT4]], inheritance_mode='compound_het', omit_sample_type='SV_WES', - annotations=annotations_1, annotations_secondary=annotations_2, - ) - - await self._assert_expected_search( - [VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES', - annotations=annotations_1, annotations_secondary=annotations_2, - ) - - await self._assert_expected_search( - [[VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES', - annotations=annotations_2, annotations_secondary=annotations_1, - ) - - sv_annotations_1 = {'structural': ['INS']} - sv_annotations_2 = {'structural': ['DEL'], 'structural_consequence': ['INTRONIC']} - - await self._assert_expected_search( - [[SV_VARIANT1, SV_VARIANT2]], sample_data=SV_WGS_SAMPLE_DATA, inheritance_mode='compound_het', - annotations=sv_annotations_1, annotations_secondary=sv_annotations_2, - ) - - await self._assert_expected_search( - [[SV_VARIANT1, SV_VARIANT2], SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, inheritance_mode='recessive', - annotations=sv_annotations_2, annotations_secondary=sv_annotations_1, - ) - - pathogenicity = {'clinvar': ['likely_pathogenic', 'vus_or_conflicting']} - await self._assert_expected_search( - [VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES', - annotations=annotations_2, annotations_secondary=annotations_1, pathogenicity=pathogenicity, - ) - - screen_annotations = {'SCREEN': ['CTCF-only']} - await self._assert_expected_search( - [], inheritance_mode='recessive', omit_sample_type='SV_WES', - annotations=screen_annotations, annotations_secondary=annotations_1, - ) - - await self._assert_expected_search( - [[VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES', - annotations=screen_annotations, annotations_secondary=annotations_2, - ) - - selected_transcript_annotations = {'other': ['non_coding_transcript_exon_variant']} - await self._assert_expected_search( - [VARIANT2, [SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_3, VARIANT4]], inheritance_mode='recessive', - annotations=screen_annotations, annotations_secondary=selected_transcript_annotations, - pathogenicity=pathogenicity, omit_sample_type='SV_WES', - ) - - await self._assert_expected_search( - [SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, [SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_3, VARIANT4]], - annotations={**selected_transcript_annotations, **screen_annotations}, annotations_secondary=annotations_2, - inheritance_mode='recessive', omit_sample_type='SV_WES', - ) - - async def test_in_silico_filter(self): - in_silico = {'eigen': '5.5', 'mut_taster': 'P'} - await self._assert_expected_search( - [VARIANT1, VARIANT2, VARIANT4], in_silico=in_silico, omit_sample_type='SV_WES', - ) - - in_silico['requireScore'] = True - await self._assert_expected_search( - [VARIANT2, VARIANT4], in_silico=in_silico, omit_sample_type='SV_WES', - ) - - await self._assert_expected_search( - [SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, in_silico={'strvctvre': 0.1, 'requireScore': True}, - ) - - async def test_search_errors(self): - search_body = get_hail_search_body(sample_data=FAMILY_2_MISSING_SAMPLE_DATA) - async with self.client.request('POST', '/search', json=search_body) as resp: - self.assertEqual(resp.status, 400) - reason = resp.reason - self.assertEqual(reason, 'The following samples are available in seqr but missing the loaded data: NA19675, NA19678') - - search_body = get_hail_search_body(sample_data=MULTI_PROJECT_MISSING_SAMPLE_DATA) - async with self.client.request('POST', '/search', json=search_body) as resp: - self.assertEqual(resp.status, 400) - reason = resp.reason - self.assertEqual(reason, 'The following samples are available in seqr but missing the loaded data: NA19675, NA19678') - - search_body = get_hail_search_body( - intervals=LOCATION_SEARCH['intervals'] + ['1:1-99999999999'], omit_sample_type='SV_WES', - ) - async with self.client.request('POST', '/search', json=search_body) as resp: - self.assertEqual(resp.status, 400) - reason = resp.reason - self.assertEqual(reason, 'Invalid intervals: 1:1-99999999999') - - async def test_sort(self): - await self._assert_expected_search( - [_sorted(VARIANT2, [11, 11]), _sorted(VARIANT4, [11, 11]), _sorted(MULTI_FAMILY_VARIANT, [22, 24]), - _sorted(VARIANT1, [None, None])], omit_sample_type='SV_WES', sort='protein_consequence', - ) - - await self._assert_expected_search( - [_sorted(SV_VARIANT1, [11]), _sorted(SV_VARIANT2, [12]), _sorted(SV_VARIANT3, [12]), _sorted(SV_VARIANT4, [12])], - sample_data=SV_WGS_SAMPLE_DATA, sort='protein_consequence', - ) - - await self._assert_expected_search( - [_sorted(VARIANT4, [11, 11]), _sorted(SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, [11, 22]), - _sorted(SELECTED_ANNOTATION_TRANSCRIPT_MULTI_FAMILY_VARIANT, [22, 22])], - omit_sample_type='SV_WES', sort='protein_consequence', - annotations={'other': ['non_coding_transcript_exon_variant'], 'splice_ai': '0'}, - ) - - await self._assert_expected_search( - [_sorted(VARIANT1, [4]), _sorted(VARIANT2, [8]), _sorted(MULTI_FAMILY_VARIANT, [12.5]), - _sorted(VARIANT4, [12.5])], omit_sample_type='SV_WES', sort='pathogenicity', - ) - - await self._assert_expected_search( - [_sorted(VARIANT1, [4, None]), _sorted(VARIANT2, [8, 3]), _sorted(MULTI_FAMILY_VARIANT, [12.5, None]), - _sorted(VARIANT4, [12.5, None])], omit_sample_type='SV_WES', sort='pathogenicity_hgmd', - ) - - await self._assert_expected_search( - [_sorted(VARIANT2, [0]), _sorted(VARIANT4, [0.00026519427774474025]), - _sorted(VARIANT1, [0.034449315071105957]), _sorted(MULTI_FAMILY_VARIANT, [0.38041073083877563])], - omit_sample_type='SV_WES', sort='gnomad', - ) - - await self._assert_expected_search( - [_sorted(VARIANT1, [0]), _sorted(MULTI_FAMILY_VARIANT, [0]), _sorted(VARIANT4, [0]), - _sorted(VARIANT2, [0.28899794816970825])], omit_sample_type='SV_WES', sort='gnomad_exomes', - ) - - await self._assert_expected_search( - [_sorted(VARIANT4, [0.02222222276031971]), _sorted(VARIANT1, [0.10000000149011612]), - _sorted(VARIANT2, [0.31111112236976624]), _sorted(MULTI_FAMILY_VARIANT, [0.6666666865348816])], - omit_sample_type='SV_WES', sort='callset_af', - ) - - await self._assert_expected_search( - [_sorted(VARIANT4, [-29.899999618530273]), _sorted(VARIANT2, [-20.899999618530273]), - _sorted(VARIANT1, [-4.668000221252441]), _sorted(MULTI_FAMILY_VARIANT, [-2.753999948501587]), ], - omit_sample_type='SV_WES', sort='cadd', - ) - - await self._assert_expected_search( - [_sorted(VARIANT4, [-0.5260000228881836]), _sorted(VARIANT2, [-0.19699999690055847]), - _sorted(VARIANT1, [None]), _sorted(MULTI_FAMILY_VARIANT, [None])], omit_sample_type='SV_WES', sort='revel', - ) - - await self._assert_expected_search( - [_sorted(MULTI_FAMILY_VARIANT, [-0.009999999776482582]), _sorted(VARIANT2, [0]), _sorted(VARIANT4, [0]), - _sorted(VARIANT1, [None])], omit_sample_type='SV_WES', sort='splice_ai', - ) - - await self._assert_expected_search( - [_sorted(MULTI_FAMILY_VARIANT, [0, -2]), _sorted(VARIANT2, [0, -1]), _sorted(VARIANT4, [0, -1]), _sorted(VARIANT1, [1, 0])], - omit_sample_type='SV_WES', sort='in_omim', sort_metadata=['ENSG00000177000', 'ENSG00000097046'], - ) - - await self._assert_expected_search( - [_sorted(VARIANT2, [0, -1]), _sorted(MULTI_FAMILY_VARIANT, [1, -1]), _sorted(VARIANT1, [1, 0]), _sorted(VARIANT4, [1, 0])], - omit_sample_type='SV_WES', sort='in_omim', sort_metadata=['ENSG00000177000'], - ) - - await self._assert_expected_search( - [_sorted(VARIANT2, [2, 2]), _sorted(MULTI_FAMILY_VARIANT, [4, 2]), _sorted(VARIANT4, [4, 4]), - _sorted(VARIANT1, [None, None])], omit_sample_type='SV_WES', sort='constraint', - sort_metadata={'ENSG00000177000': 2, 'ENSG00000097046': 4}, - ) - - await self._assert_expected_search( - [_sorted(VARIANT2, [3, 3]), _sorted(MULTI_FAMILY_VARIANT, [None, 3]), _sorted(VARIANT1, [None, None]), - _sorted(VARIANT4, [None, None])], omit_sample_type='SV_WES', sort='prioritized_gene', - sort_metadata={'ENSG00000177000': 3}, - ) - - # size sort only applies to SVs, so has no impact on other variants - await self._assert_expected_search( - [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], sort='size', omit_sample_type='SV_WES', - ) - - await self._assert_expected_search( - [_sorted(SV_VARIANT4, [-46343]), _sorted(SV_VARIANT1, [-104]), _sorted(SV_VARIANT2, [-50]), - _sorted(SV_VARIANT3, [-50])], sample_data=SV_WGS_SAMPLE_DATA, sort='size', - ) - - # sort applies to compound hets - await self._assert_expected_search( - [_sorted(VARIANT2, [11, 11]), [_sorted(VARIANT4, [11, 11]), _sorted(VARIANT3, [22, 24])]], - sort='protein_consequence', inheritance_mode='recessive', omit_sample_type='SV_WES', **COMP_HET_ALL_PASS_FILTERS, - ) - - await self._assert_expected_search( - [[_sorted(VARIANT4, [-0.5260000228881836]), _sorted(VARIANT3, [None])], - _sorted(VARIANT2, [-0.19699999690055847])], - sort='revel', inheritance_mode='recessive', omit_sample_type='SV_WES', **COMP_HET_ALL_PASS_FILTERS, + [SV_VARIANT1, SV_VARIANT2, SV_VARIANT3, SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, gene_counts={ + 'ENSG00000171621': {'total': 2, 'families': {'F000011_11': 2}}, + 'ENSG00000083544': {'total': 1, 'families': {'F000011_11': 1}}, + 'ENSG00000184986': {'total': 1, 'families': {'F000011_11': 1}}, + 'null': {'total': 1, 'families': {'F000011_11': 1}}, + } ) - await self._assert_expected_search( - [[_sorted(VARIANT3, [-0.009999999776482582]), _sorted(VARIANT4, [0])], _sorted(VARIANT2, [0])], - sort='splice_ai', inheritance_mode='recessive', omit_sample_type='SV_WES', **COMP_HET_ALL_PASS_FILTERS, - ) + # async def test_single_project_search(self): + # await self._assert_expected_search( + # [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], omit_sample_type='SV_WES', gene_counts={ + # 'ENSG00000097046': {'total': 3, 'families': {'F000002_2': 2, 'F000003_3': 1}}, + # 'ENSG00000177000': {'total': 3, 'families': {'F000002_2': 2, 'F000003_3': 1}}, + # } + # ) + # + # async def test_multi_project_search(self): + # await self._assert_expected_search( + # [PROJECT_2_VARIANT, MULTI_PROJECT_VARIANT1, MULTI_PROJECT_VARIANT2, VARIANT3, VARIANT4], + # gene_counts=GENE_COUNTS, sample_data=MULTI_PROJECT_SAMPLE_DATA, + # ) + # + # async def test_inheritance_filter(self): + # inheritance_mode = 'any_affected' + # await self._assert_expected_search( + # [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search( + # [SV_VARIANT1, SV_VARIANT2, SV_VARIANT3, SV_VARIANT4], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, + # ) + # + # await self._assert_expected_search( + # [SV_VARIANT2], inheritance_mode=inheritance_mode, annotations=NEW_SV_FILTER, sample_data=SV_WGS_SAMPLE_DATA, + # ) + # + # inheritance_mode = 'de_novo' + # await self._assert_expected_search( + # [VARIANT1, FAMILY_3_VARIANT, VARIANT4], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search( + # [SV_VARIANT1], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, + # ) + # + # inheritance_mode = 'x_linked_recessive' + # await self._assert_expected_search([], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES') + # await self._assert_expected_search([], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA) + # + # inheritance_mode = 'homozygous_recessive' + # await self._assert_expected_search( + # [VARIANT2], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search( + # [PROJECT_2_VARIANT1, VARIANT2], inheritance_mode=inheritance_mode, sample_data=MULTI_PROJECT_SAMPLE_DATA, + # ) + # + # await self._assert_expected_search( + # [SV_VARIANT4], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, + # ) + # + # gt_inheritance_filter = {'genotype': {'I000006_hg00733': 'has_alt', 'I000005_hg00732': 'ref_ref'}} + # await self._assert_expected_search( + # [VARIANT2, VARIANT3], inheritance_filter=gt_inheritance_filter, sample_data=FAMILY_2_VARIANT_SAMPLE_DATA) + # + # inheritance_mode = 'compound_het' + # await self._assert_expected_search( + # [[VARIANT3, VARIANT4]], inheritance_mode=inheritance_mode, sample_data=MULTI_PROJECT_SAMPLE_DATA, gene_counts={ + # 'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}}, + # 'ENSG00000177000': {'total': 1, 'families': {'F000002_2': 1}}, + # }, **COMP_HET_ALL_PASS_FILTERS, + # ) + # + # await self._assert_expected_search( + # [[SV_VARIANT1, SV_VARIANT2]], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, + # **COMP_HET_ALL_PASS_FILTERS, + # ) + # + # inheritance_mode = 'recessive' + # await self._assert_expected_search( + # [PROJECT_2_VARIANT1, VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode=inheritance_mode, gene_counts={ + # 'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}}, + # 'ENSG00000177000': {'total': 2, 'families': {'F000002_2': 2}}, + # }, sample_data=MULTI_PROJECT_SAMPLE_DATA, **COMP_HET_ALL_PASS_FILTERS, + # ) + # + # await self._assert_expected_search( + # [[SV_VARIANT1, SV_VARIANT2], SV_VARIANT4], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, + # **COMP_HET_ALL_PASS_FILTERS, + # ) + # + # async def test_quality_filter(self): + # quality_filter = {'vcf_filter': 'pass'} + # await self._assert_expected_search( + # [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search([SV_VARIANT4], quality_filter=quality_filter, sample_data=SV_WGS_SAMPLE_DATA) + # + # await self._assert_expected_search( + # [VARIANT2, MULTI_FAMILY_VARIANT], quality_filter={'min_gq': 40}, omit_sample_type='SV_WES', + # ) + # + # sv_quality_filter = {'min_gq_sv': 40} + # await self._assert_expected_search( + # [SV_VARIANT3, SV_VARIANT4], quality_filter=sv_quality_filter, sample_data=SV_WGS_SAMPLE_DATA, + # ) + # + # await self._assert_expected_search( + # [], annotations=NEW_SV_FILTER, quality_filter=sv_quality_filter, sample_data=SV_WGS_SAMPLE_DATA, + # ) + # + # await self._assert_expected_search( + # [VARIANT2, MULTI_FAMILY_VARIANT], quality_filter={'min_gq': 40, 'vcf_filter': 'pass'}, omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search( + # [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT], quality_filter={'min_gq': 60, 'affected_only': True}, + # omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search( + # [SV_VARIANT3, SV_VARIANT4], quality_filter={'min_gq_sv': 60, 'affected_only': True}, sample_data=SV_WGS_SAMPLE_DATA, + # ) + # + # await self._assert_expected_search( + # [VARIANT1, VARIANT2, FAMILY_3_VARIANT], quality_filter={'min_ab': 50}, omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search( + # [VARIANT2, VARIANT3], quality_filter={'min_ab': 70, 'affected_only': True}, + # omit_sample_type='SV_WES', + # ) + # + # quality_filter = {'min_gq': 40, 'min_ab': 50} + # await self._assert_expected_search( + # [VARIANT2, FAMILY_3_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', + # ) + # + # annotations = {'splice_ai': '0.0'} # Ensures no variants are filtered out by annotation/path filters + # await self._assert_expected_search( + # [VARIANT1, VARIANT2, FAMILY_3_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', + # annotations=annotations, pathogenicity={'clinvar': ['likely_pathogenic', 'vus_or_conflicting']}, + # ) + # + # await self._assert_expected_search( + # [VARIANT2, FAMILY_3_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', + # annotations=annotations, pathogenicity={'clinvar': ['pathogenic']}, + # ) + # + # async def test_location_search(self): + # await self._assert_expected_search( + # [VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], omit_sample_type='SV_WES', **LOCATION_SEARCH, + # ) + # + # sv_intervals = ['1:9310023-9380264'] + # await self._assert_expected_search( + # [SV_VARIANT1, SV_VARIANT2], sample_data=SV_WGS_SAMPLE_DATA, intervals=sv_intervals, gene_ids=['ENSG00000171621'], + # ) + # + # await self._assert_expected_search( + # [VARIANT1], omit_sample_type='SV_WES', **EXCLUDE_LOCATION_SEARCH, + # ) + # + # await self._assert_expected_search( + # [SV_VARIANT3, SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, intervals=sv_intervals, exclude_intervals=True, + # ) + # + # await self._assert_expected_search( + # [SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT], omit_sample_type='SV_WES', + # intervals=LOCATION_SEARCH['intervals'][-1:], gene_ids=LOCATION_SEARCH['gene_ids'][:1] + # ) + # + # async def test_variant_id_search(self): + # await self._assert_expected_search([VARIANT2], omit_sample_type='SV_WES', **RSID_SEARCH) + # + # await self._assert_expected_search([VARIANT1], omit_sample_type='SV_WES', **VARIANT_ID_SEARCH) + # + # await self._assert_expected_search( + # [VARIANT1], omit_sample_type='SV_WES', variant_ids=VARIANT_ID_SEARCH['variant_ids'][:1], + # ) + # + # await self._assert_expected_search( + # [], omit_sample_type='SV_WES', variant_ids=VARIANT_ID_SEARCH['variant_ids'][1:], + # ) + # + # await self._assert_expected_search([SV_VARIANT2, SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, variant_keys=[ + # 'cohort_2911.chr1.final_cleanup_INS_chr1_160', 'phase2_DEL_chr14_4640', + # ]) + # + # async def test_frequency_filter(self): + # await self._assert_expected_search( + # [VARIANT1, VARIANT4], frequencies={'seqr': {'af': 0.2}}, omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search( + # [MULTI_FAMILY_VARIANT, VARIANT4], frequencies={'seqr': {'ac': 4}}, omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search( + # [MULTI_FAMILY_VARIANT, VARIANT4], frequencies={'seqr': {'hh': 1}}, omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search( + # [VARIANT4], frequencies={'seqr': {'ac': 4, 'hh': 0}}, omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search( + # [SV_VARIANT1], frequencies={'sv_callset': {'af': 0.05}}, sample_data=SV_WGS_SAMPLE_DATA, + # ) + # + # await self._assert_expected_search( + # [VARIANT1, VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.05}}, omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search( + # [VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.05, 'hh': 1}}, omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search( + # [VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.005}}, omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search( + # [SV_VARIANT1, SV_VARIANT3, SV_VARIANT4], frequencies={'gnomad_svs': {'af': 0.001}}, sample_data=SV_WGS_SAMPLE_DATA, + # ) + # + # await self._assert_expected_search( + # [VARIANT4], frequencies={'seqr': {'af': 0.2}, 'gnomad_genomes': {'ac': 50}}, + # omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search( + # [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], frequencies={'seqr': {}, 'gnomad_genomes': {'af': None}}, + # omit_sample_type='SV_WES', + # ) + # + # annotations = {'splice_ai': '0.0'} # Ensures no variants are filtered out by annotation/path filters + # await self._assert_expected_search( + # [VARIANT1, VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.01}}, omit_sample_type='SV_WES', + # annotations=annotations, pathogenicity={'clinvar': ['pathogenic', 'likely_pathogenic', 'vus_or_conflicting']}, + # ) + # + # await self._assert_expected_search( + # [VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.01}}, omit_sample_type='SV_WES', + # annotations=annotations, pathogenicity={'clinvar': ['pathogenic', 'vus_or_conflicting']}, + # ) + # + # async def test_annotations_filter(self): + # await self._assert_expected_search([VARIANT2], pathogenicity={'hgmd': ['hgmd_other']}, omit_sample_type='SV_WES') + # + # pathogenicity = {'clinvar': ['likely_pathogenic', 'vus_or_conflicting', 'benign']} + # await self._assert_expected_search([VARIANT1, VARIANT2], pathogenicity=pathogenicity, omit_sample_type='SV_WES') + # + # pathogenicity['clinvar'] = pathogenicity['clinvar'][:1] + # await self._assert_expected_search( + # [VARIANT1, VARIANT4], pathogenicity=pathogenicity, annotations={'SCREEN': ['CTCF-only', 'DNase-only']}, + # omit_sample_type='SV_WES', + # ) + # + # annotations = { + # 'missense': ['missense_variant'], 'in_frame': ['inframe_insertion', 'inframe_deletion'], 'frameshift': None, + # 'structural_consequence': ['INTRONIC'], + # } + # await self._assert_expected_search( + # [VARIANT1, VARIANT2, VARIANT4], pathogenicity=pathogenicity, annotations=annotations, omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search([VARIANT2, VARIANT4], annotations=annotations, omit_sample_type='SV_WES') + # + # await self._assert_expected_search([SV_VARIANT1], annotations=annotations, sample_data=SV_WGS_SAMPLE_DATA) + # + # annotations['splice_ai'] = '0.005' + # await self._assert_expected_search( + # [VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], annotations=annotations, omit_sample_type='SV_WES', + # ) + # + # annotations['structural'] = ['DEL'] + # await self._assert_expected_search([SV_VARIANT1, SV_VARIANT4], annotations=annotations, sample_data=SV_WGS_SAMPLE_DATA) + # + # annotations = {'other': ['non_coding_transcript_exon_variant']} + # await self._assert_expected_search( + # [VARIANT1, SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, SELECTED_ANNOTATION_TRANSCRIPT_MULTI_FAMILY_VARIANT], + # pathogenicity=pathogenicity, annotations=annotations, omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search( + # [SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT], + # gene_ids=LOCATION_SEARCH['gene_ids'][:1], annotations=annotations, omit_sample_type='SV_WES', + # ) + # + # async def test_secondary_annotations_filter(self): + # annotations_1 = {'missense': ['missense_variant']} + # annotations_2 = {'other': ['intron_variant']} + # + # await self._assert_expected_search( + # [[VARIANT3, VARIANT4]], inheritance_mode='compound_het', omit_sample_type='SV_WES', + # annotations=annotations_1, annotations_secondary=annotations_2, + # ) + # + # await self._assert_expected_search( + # [VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES', + # annotations=annotations_1, annotations_secondary=annotations_2, + # ) + # + # await self._assert_expected_search( + # [[VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES', + # annotations=annotations_2, annotations_secondary=annotations_1, + # ) + # + # sv_annotations_1 = {'structural': ['INS']} + # sv_annotations_2 = {'structural': ['DEL'], 'structural_consequence': ['INTRONIC']} + # + # await self._assert_expected_search( + # [[SV_VARIANT1, SV_VARIANT2]], sample_data=SV_WGS_SAMPLE_DATA, inheritance_mode='compound_het', + # annotations=sv_annotations_1, annotations_secondary=sv_annotations_2, + # ) + # + # await self._assert_expected_search( + # [[SV_VARIANT1, SV_VARIANT2], SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, inheritance_mode='recessive', + # annotations=sv_annotations_2, annotations_secondary=sv_annotations_1, + # ) + # + # pathogenicity = {'clinvar': ['likely_pathogenic', 'vus_or_conflicting']} + # await self._assert_expected_search( + # [VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES', + # annotations=annotations_2, annotations_secondary=annotations_1, pathogenicity=pathogenicity, + # ) + # + # screen_annotations = {'SCREEN': ['CTCF-only']} + # await self._assert_expected_search( + # [], inheritance_mode='recessive', omit_sample_type='SV_WES', + # annotations=screen_annotations, annotations_secondary=annotations_1, + # ) + # + # await self._assert_expected_search( + # [[VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES', + # annotations=screen_annotations, annotations_secondary=annotations_2, + # ) + # + # selected_transcript_annotations = {'other': ['non_coding_transcript_exon_variant']} + # await self._assert_expected_search( + # [VARIANT2, [SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_3, VARIANT4]], inheritance_mode='recessive', + # annotations=screen_annotations, annotations_secondary=selected_transcript_annotations, + # pathogenicity=pathogenicity, omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search( + # [SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, [SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_3, VARIANT4]], + # annotations={**selected_transcript_annotations, **screen_annotations}, annotations_secondary=annotations_2, + # inheritance_mode='recessive', omit_sample_type='SV_WES', + # ) + # + # async def test_in_silico_filter(self): + # in_silico = {'eigen': '5.5', 'mut_taster': 'P'} + # await self._assert_expected_search( + # [VARIANT1, VARIANT2, VARIANT4], in_silico=in_silico, omit_sample_type='SV_WES', + # ) + # + # in_silico['requireScore'] = True + # await self._assert_expected_search( + # [VARIANT2, VARIANT4], in_silico=in_silico, omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search( + # [SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, in_silico={'strvctvre': 0.1, 'requireScore': True}, + # ) + # + # async def test_search_errors(self): + # search_body = get_hail_search_body(sample_data=FAMILY_2_MISSING_SAMPLE_DATA) + # async with self.client.request('POST', '/search', json=search_body) as resp: + # self.assertEqual(resp.status, 400) + # reason = resp.reason + # self.assertEqual(reason, 'The following samples are available in seqr but missing the loaded data: NA19675, NA19678') + # + # search_body = get_hail_search_body(sample_data=MULTI_PROJECT_MISSING_SAMPLE_DATA) + # async with self.client.request('POST', '/search', json=search_body) as resp: + # self.assertEqual(resp.status, 400) + # reason = resp.reason + # self.assertEqual(reason, 'The following samples are available in seqr but missing the loaded data: NA19675, NA19678') + # + # search_body = get_hail_search_body( + # intervals=LOCATION_SEARCH['intervals'] + ['1:1-99999999999'], omit_sample_type='SV_WES', + # ) + # async with self.client.request('POST', '/search', json=search_body) as resp: + # self.assertEqual(resp.status, 400) + # reason = resp.reason + # self.assertEqual(reason, 'Invalid intervals: 1:1-99999999999') + # + # async def test_sort(self): + # await self._assert_expected_search( + # [_sorted(VARIANT2, [11, 11]), _sorted(VARIANT4, [11, 11]), _sorted(MULTI_FAMILY_VARIANT, [22, 24]), + # _sorted(VARIANT1, [None, None])], omit_sample_type='SV_WES', sort='protein_consequence', + # ) + # + # await self._assert_expected_search( + # [_sorted(SV_VARIANT1, [11]), _sorted(SV_VARIANT2, [12]), _sorted(SV_VARIANT3, [12]), _sorted(SV_VARIANT4, [12])], + # sample_data=SV_WGS_SAMPLE_DATA, sort='protein_consequence', + # ) + # + # await self._assert_expected_search( + # [_sorted(VARIANT4, [11, 11]), _sorted(SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, [11, 22]), + # _sorted(SELECTED_ANNOTATION_TRANSCRIPT_MULTI_FAMILY_VARIANT, [22, 22])], + # omit_sample_type='SV_WES', sort='protein_consequence', + # annotations={'other': ['non_coding_transcript_exon_variant'], 'splice_ai': '0'}, + # ) + # + # await self._assert_expected_search( + # [_sorted(VARIANT1, [4]), _sorted(VARIANT2, [8]), _sorted(MULTI_FAMILY_VARIANT, [12.5]), + # _sorted(VARIANT4, [12.5])], omit_sample_type='SV_WES', sort='pathogenicity', + # ) + # + # await self._assert_expected_search( + # [_sorted(VARIANT1, [4, None]), _sorted(VARIANT2, [8, 3]), _sorted(MULTI_FAMILY_VARIANT, [12.5, None]), + # _sorted(VARIANT4, [12.5, None])], omit_sample_type='SV_WES', sort='pathogenicity_hgmd', + # ) + # + # await self._assert_expected_search( + # [_sorted(VARIANT2, [0]), _sorted(VARIANT4, [0.00026519427774474025]), + # _sorted(VARIANT1, [0.034449315071105957]), _sorted(MULTI_FAMILY_VARIANT, [0.38041073083877563])], + # omit_sample_type='SV_WES', sort='gnomad', + # ) + # + # await self._assert_expected_search( + # [_sorted(VARIANT1, [0]), _sorted(MULTI_FAMILY_VARIANT, [0]), _sorted(VARIANT4, [0]), + # _sorted(VARIANT2, [0.28899794816970825])], omit_sample_type='SV_WES', sort='gnomad_exomes', + # ) + # + # await self._assert_expected_search( + # [_sorted(VARIANT4, [0.02222222276031971]), _sorted(VARIANT1, [0.10000000149011612]), + # _sorted(VARIANT2, [0.31111112236976624]), _sorted(MULTI_FAMILY_VARIANT, [0.6666666865348816])], + # omit_sample_type='SV_WES', sort='callset_af', + # ) + # + # await self._assert_expected_search( + # [_sorted(VARIANT4, [-29.899999618530273]), _sorted(VARIANT2, [-20.899999618530273]), + # _sorted(VARIANT1, [-4.668000221252441]), _sorted(MULTI_FAMILY_VARIANT, [-2.753999948501587]), ], + # omit_sample_type='SV_WES', sort='cadd', + # ) + # + # await self._assert_expected_search( + # [_sorted(VARIANT4, [-0.5260000228881836]), _sorted(VARIANT2, [-0.19699999690055847]), + # _sorted(VARIANT1, [None]), _sorted(MULTI_FAMILY_VARIANT, [None])], omit_sample_type='SV_WES', sort='revel', + # ) + # + # await self._assert_expected_search( + # [_sorted(MULTI_FAMILY_VARIANT, [-0.009999999776482582]), _sorted(VARIANT2, [0]), _sorted(VARIANT4, [0]), + # _sorted(VARIANT1, [None])], omit_sample_type='SV_WES', sort='splice_ai', + # ) + # + # await self._assert_expected_search( + # [_sorted(MULTI_FAMILY_VARIANT, [0, -2]), _sorted(VARIANT2, [0, -1]), _sorted(VARIANT4, [0, -1]), _sorted(VARIANT1, [1, 0])], + # omit_sample_type='SV_WES', sort='in_omim', sort_metadata=['ENSG00000177000', 'ENSG00000097046'], + # ) + # + # await self._assert_expected_search( + # [_sorted(VARIANT2, [0, -1]), _sorted(MULTI_FAMILY_VARIANT, [1, -1]), _sorted(VARIANT1, [1, 0]), _sorted(VARIANT4, [1, 0])], + # omit_sample_type='SV_WES', sort='in_omim', sort_metadata=['ENSG00000177000'], + # ) + # + # await self._assert_expected_search( + # [_sorted(VARIANT2, [2, 2]), _sorted(MULTI_FAMILY_VARIANT, [4, 2]), _sorted(VARIANT4, [4, 4]), + # _sorted(VARIANT1, [None, None])], omit_sample_type='SV_WES', sort='constraint', + # sort_metadata={'ENSG00000177000': 2, 'ENSG00000097046': 4}, + # ) + # + # await self._assert_expected_search( + # [_sorted(VARIANT2, [3, 3]), _sorted(MULTI_FAMILY_VARIANT, [None, 3]), _sorted(VARIANT1, [None, None]), + # _sorted(VARIANT4, [None, None])], omit_sample_type='SV_WES', sort='prioritized_gene', + # sort_metadata={'ENSG00000177000': 3}, + # ) + # + # # size sort only applies to SVs, so has no impact on other variants + # await self._assert_expected_search( + # [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], sort='size', omit_sample_type='SV_WES', + # ) + # + # await self._assert_expected_search( + # [_sorted(SV_VARIANT4, [-46343]), _sorted(SV_VARIANT1, [-104]), _sorted(SV_VARIANT2, [-50]), + # _sorted(SV_VARIANT3, [-50])], sample_data=SV_WGS_SAMPLE_DATA, sort='size', + # ) + # + # # sort applies to compound hets + # await self._assert_expected_search( + # [_sorted(VARIANT2, [11, 11]), [_sorted(VARIANT4, [11, 11]), _sorted(VARIANT3, [22, 24])]], + # sort='protein_consequence', inheritance_mode='recessive', omit_sample_type='SV_WES', **COMP_HET_ALL_PASS_FILTERS, + # ) + # + # await self._assert_expected_search( + # [[_sorted(VARIANT4, [-0.5260000228881836]), _sorted(VARIANT3, [None])], + # _sorted(VARIANT2, [-0.19699999690055847])], + # sort='revel', inheritance_mode='recessive', omit_sample_type='SV_WES', **COMP_HET_ALL_PASS_FILTERS, + # ) + # + # await self._assert_expected_search( + # [[_sorted(VARIANT3, [-0.009999999776482582]), _sorted(VARIANT4, [0])], _sorted(VARIANT2, [0])], + # sort='splice_ai', inheritance_mode='recessive', omit_sample_type='SV_WES', **COMP_HET_ALL_PASS_FILTERS, + # ) diff --git a/hail_search/test_utils.py b/hail_search/test_utils.py index 993f5fc63f..446cd13040 100644 --- a/hail_search/test_utils.py +++ b/hail_search/test_utils.py @@ -528,6 +528,171 @@ }, '_sort': [14106694244], } +GCNV_VARIANT1 = { + 'variantId': 'suffix_95340_DUP', + 'chrom': '14', + 'pos': 22438910, + 'end': 22469796, + 'genomeVersion': '38', + 'liftedOverGenomeVersion': '37', + 'liftedOverChrom': '14', + 'liftedOverPos': 22886546, + 'rg37LocusEnd': {'contig': '14', 'position': 23058228}, + 'xpos': 14022417556, + 'familyGuids': ['F000002_2'], + 'genotypeFilters': '', + 'genotypes': { + 'I000004_hg00731': { + 'sampleId': 'HG00731', 'individualGuid': 'I000004_hg00731', 'familyGuid': 'F000002_2', + 'numAlt': 1, 'cn': 3, 'qs': 38, 'defragged': False, 'start': 22438910, 'end': 22469796, 'numExon': 0, + 'geneIds': None, 'newCall': False, 'prevCall': True, 'prevOverlap': False, + }, + 'I000005_hg00732': { + 'sampleId': 'HG00732', 'individualGuid': 'I000005_hg00732', 'familyGuid': 'F000002_2', + 'numAlt': 0, 'cn': None, 'qs': None, 'defragged': None, 'start': None, 'end': None, 'numExon': None, + 'geneIds': None, 'newCall': None, 'prevCall': None, 'prevOverlap': None, + }, + 'I000006_hg00733': { + 'sampleId': 'HG00733', 'individualGuid': 'I000006_hg00733', 'familyGuid': 'F000002_2', + 'numAlt': 0, 'cn': None, 'qs': None, 'defragged': None, 'start': None, 'end': None, 'numExon': None, + 'geneIds': None, 'newCall': None, 'prevCall': None, 'prevOverlap': None, + } + }, + 'populations': {'sv_callset': {'af': 0.076492540538311, 'ac': 1763, 'an': 23048, 'hom': 0, 'het': 0}}, + 'predictions': {'strvctvre': 0.1809999942779541}, + 'numExon': 0, + 'svType': 'DUP', + 'transcripts': { + 'ENSG00000129562': [{'geneId': 'ENSG00000129562', 'majorConsequence': 'COPY_GAIN'}], + }, + '_sort': [14022417556], + +} +GCNV_VARIANT2 = { + 'variantId': 'suffix_124520_DUP', + 'chrom': '16', + 'pos': 29809156, + 'end': 29815990, + 'xpos': 16029802672, + 'genomeVersion': '38', + 'liftedOverGenomeVersion': '37', + 'liftedOverChrom': '16', + 'liftedOverPos': 29813993, + 'rg37LocusEnd': {'contig': '16', 'position': 29831761}, + 'familyGuids': ['F000002_2'], + 'genotypeFilters': '', + 'genotypes': { + 'I000004_hg00731': { + 'sampleId': 'HG00731', 'individualGuid': 'I000004_hg00731', 'familyGuid': 'F000002_2', + 'numAlt': 1, 'cn': 3, 'qs': 29, 'defragged': False, 'start': 29809156, 'end': 29815990, 'numExon': 8, + 'geneIds': None, 'newCall': False, 'prevCall': True, 'prevOverlap': False, + }, + 'I000005_hg00732': { + 'sampleId': 'HG00732', 'individualGuid': 'I000005_hg00732', 'familyGuid': 'F000002_2', + 'numAlt': 1, 'cn': 3, 'qs': 46, 'defragged': False, 'start': 29809156, 'end': 29815990, 'numExon': 8, + 'geneIds': None, 'newCall': False, 'prevCall': True, 'prevOverlap': False, + }, + 'I000006_hg00733': { + 'sampleId': 'HG00733', 'individualGuid': 'I000006_hg00733', 'familyGuid': 'F000002_2', + 'numAlt': 1, 'cn': 3, 'qs': 37, 'defragged': False, 'start': 29809156, 'end': 29815990, 'numExon': 8, + 'geneIds': None, 'newCall': False, 'prevCall': True, 'prevOverlap': False, + } + }, + 'populations': {'sv_callset': {'af': 0.012322110123932362, 'ac': 284, 'an': 23047, 'hom': 0, 'het': 0}}, + 'predictions': {'strvctvre': 0.5479999780654907}, + 'numExon': 8, + 'svType': 'DUP', + 'transcripts': { + 'ENSG00000013364': [{'geneId': 'ENSG00000013364', 'majorConsequence': 'LOF'}], + 'ENSG00000079616': [{'geneId': 'ENSG00000079616', 'majorConsequence': 'LOF'}], + 'ENSG00000103495': [{'geneId': 'ENSG00000103495', 'majorConsequence': 'COPY_GAIN'}], + 'ENSG00000167371': [{'geneId': 'ENSG00000167371', 'majorConsequence': 'COPY_GAIN'}], + 'ENSG00000280789': [{'geneId': 'ENSG00000280789', 'majorConsequence': 'LOF'}], + 'ENSG00000280893': [{'geneId': 'ENSG00000280893', 'majorConsequence': 'COPY_GAIN'}], + 'ENSG00000281348': [{'geneId': 'ENSG00000281348', 'majorConsequence': 'LOF'}], + }, + '_sort': [16029802672], +} +GCNV_VARIANT3 = { + 'variantId': 'suffix_140593_DUP', + 'chrom': '17', + 'pos': 38717327, + 'end': 38719636, + 'xpos': 17038717327, + 'genomeVersion': '38', + 'liftedOverGenomeVersion': '37', + 'liftedOverChrom': '17', + 'liftedOverPos': 36873580, + 'rg37LocusEnd': {'contig': '17', 'position': 36876246}, + 'familyGuids': ['F000002_2'], + 'genotypeFilters': '', + 'genotypes': { + 'I000004_hg00731': { + 'sampleId': 'HG00731', 'individualGuid': 'I000004_hg00731', 'familyGuid': 'F000002_2', + 'numAlt': 2, 'cn': 4, 'qs': 13, 'defragged': True, 'start': 38717327, 'end': 38719636, 'numExon': None, + 'geneIds': None, 'newCall': True, 'prevCall': False, 'prevOverlap': False, + }, + 'I000005_hg00732': { + 'sampleId': 'HG00732', 'individualGuid': 'I000005_hg00732', 'familyGuid': 'F000002_2', + 'numAlt': 1, 'cn': 3, 'qs': 7, 'defragged': False, 'start': 38717327, 'end': 38719636, 'numExon': None, + 'geneIds': None, 'newCall': False, 'prevCall': False, 'prevOverlap': True, + }, + 'I000006_hg00733': { + 'sampleId': 'HG00733', 'individualGuid': 'I000006_hg00733', 'familyGuid': 'F000002_2', 'numAlt': 0, + 'cn': None, 'qs': None, 'defragged': None, 'start': None, 'end': None, 'numExon': None, 'geneIds': None, + 'newCall': None, 'prevCall': None, 'prevOverlap': None, + }, + }, + 'populations': {'sv_callset': {'af': 0.0015185698866844177, 'ac': 35, 'an': 23048, 'hom': 0, 'het': 0}}, + 'predictions': {'strvctvre': 0.7860000133514404}, + 'numExon': 3, + 'svType': 'DEL', + 'transcripts': { + 'ENSG00000275023': [{'geneId': 'ENSG00000275023', 'majorConsequence': 'LOF'}], + }, + '_sort': [17038717327], +} +GCNV_VARIANT4 = { + 'variantId': 'suffix_140608_DUP', + 'chrom': '17', + 'pos': 38721781, + 'end': 38735703, + 'genomeVersion': '38', + 'liftedOverGenomeVersion': '37', + 'liftedOverChrom': '17', + 'liftedOverPos': 36878034, + 'rg37LocusEnd': {'contig': '17', 'position': 36892521}, + 'familyGuids': ['F000002_2'], + 'genotypeFilters': '', + 'xpos': 17038721781, + 'genotypes': { + 'I000004_hg00731': { + 'sampleId': 'HG00731', 'individualGuid': 'I000004_hg00731', 'familyGuid': 'F000002_2', + 'numAlt': 1, 'cn': 3, 'qs': 28, 'defragged': False, 'start': 38721781, 'end': 38735703, 'numExon': 7, + 'geneIds': None, 'newCall': False, 'prevCall': True, 'prevOverlap': False, + }, + 'I000005_hg00732': { + 'sampleId': 'HG00732', 'individualGuid': 'I000005_hg00732', 'familyGuid': 'F000002_2', + 'numAlt': 0, 'cn': None, 'qs': None, 'defragged': None, 'start': None, 'end': None, 'numExon': None, + 'geneIds': None, 'newCall': None, 'prevCall': None, 'prevOverlap': None, + }, + 'I000006_hg00733': { + 'sampleId': 'HG00733', 'individualGuid': 'I000006_hg00733', 'familyGuid': 'F000002_2', + 'numAlt': 1, 'cn': 3, 'qs': 29, 'defragged': False, 'start': 38721781, 'end': 38734440, 'numExon': 7, + 'geneIds': None, 'newCall': False, 'prevCall': True, 'prevOverlap': False, + } + }, + 'populations': {'sv_callset': {'af': 0.004989586770534515, 'ac': 115, 'an': 23048, 'hom': 0, 'het': 0}}, + 'predictions': {'strvctvre': 0.7099999785423279}, + 'numExon': 7, + 'svType': 'DUP', + 'transcripts': { + 'ENSG00000275023': [{'geneId': 'ENSG00000275023', 'majorConsequence': 'LOF'}], + 'ENSG00000277258': [{'geneId': 'ENSG00000277258', 'majorConsequence': 'LOF'}], + 'ENSG00000277972': [{'geneId': 'ENSG00000277972', 'majorConsequence': 'COPY_GAIN'}], + }, + '_sort': [17038721781], +} LOCATION_SEARCH = { 'gene_ids': ['ENSG00000177000', 'ENSG00000097046'], From c91f63c5aa92b55845de5284defe2926a326506e Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Fri, 25 Aug 2023 12:36:17 -0400 Subject: [PATCH 10/16] initial test and transcript overrde --- .../projects/R0001_1kg.ht/.README.txt.crc | Bin 12 -> 12 bytes .../SV_WES/projects/R0001_1kg.ht/README.txt | 2 +- .../.index.crc | Bin .../.metadata.json.gz.crc | Bin .../index | Bin .../metadata.json.gz | Bin .../R0001_1kg.ht/rows/.metadata.json.gz.crc | Bin 16 -> 16 bytes .../R0001_1kg.ht/rows/metadata.json.gz | Bin 667 -> 668 bytes ...0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.crc | Bin 12 -> 0 bytes ...0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa.crc | Bin 0 -> 12 bytes ...-0-0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa} | Bin 227 -> 235 bytes hail_search/hail_search_query.py | 4 +- hail_search/test_search.py | 40 +++++++++++++----- hail_search/test_utils.py | 40 +++++++++++++++--- 14 files changed, 66 insertions(+), 20 deletions(-) rename hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/{part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx => part-0-702-0-0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa.idx}/.index.crc (100%) rename hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/{part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx => part-0-702-0-0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa.idx}/.metadata.json.gz.crc (100%) rename hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/{part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx => part-0-702-0-0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa.idx}/index (100%) rename hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/{part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx => part-0-702-0-0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa.idx}/metadata.json.gz (100%) delete mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/parts/.part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.crc create mode 100644 hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/parts/.part-0-702-0-0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa.crc rename hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/parts/{part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7 => part-0-702-0-0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa} (60%) diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/.README.txt.crc b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/.README.txt.crc index 17c414e9d5b7ded20df83dfa3a45f6c4cce0c1f5..a50f0cb506c8fc42d14688e60a15b6bf87baa97d 100644 GIT binary patch literal 12 TcmYc;N@ieSU}AWYb?zSk6!-*L literal 12 TcmYc;N@ieSU}E6d*>(#65wQbY diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/README.txt b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/README.txt index e9bf5b62a9..3bd7ff0ea8 100644 --- a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/README.txt +++ b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. Written with version 0.2.109-b71b065e4bb6 - Created at 2023/08/25 10:45:26 \ No newline at end of file + Created at 2023/08/25 11:56:33 \ No newline at end of file diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/.index.crc b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-702-0-0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa.idx/.index.crc similarity index 100% rename from hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/.index.crc rename to hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-702-0-0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa.idx/.index.crc diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-702-0-0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa.idx/.metadata.json.gz.crc similarity index 100% rename from hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/.metadata.json.gz.crc rename to hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-702-0-0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa.idx/.metadata.json.gz.crc diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/index b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-702-0-0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa.idx/index similarity index 100% rename from hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/index rename to hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-702-0-0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa.idx/index diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/metadata.json.gz b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-702-0-0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa.idx/metadata.json.gz similarity index 100% rename from hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.idx/metadata.json.gz rename to hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/index/part-0-702-0-0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa.idx/metadata.json.gz diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/.metadata.json.gz.crc b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/.metadata.json.gz.crc index 9cba05dcb8675e53e6e06d64fcb287acdd42349a..5dc5b05e4bf9fc30be96d841f2634f10565f1fa7 100644 GIT binary patch literal 16 XcmYc;N@ieSU}BiCVz!BF#?RRRBx40z literal 16 XcmYc;N@ieSU}9iyt8)ID+1myHAqWKW diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/metadata.json.gz b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/metadata.json.gz index da058c7333047f2ddd7f9435bba544cc03a546e4..ee8e1b9e6ff26e9b4757b255eac866996373b7f4 100644 GIT binary patch literal 668 zcmV;N0%QFjiwFP!000000NqwibJ{Qv{V%%KPQlno@U0NqGLy-qA?=|I;}O!Ds4+4l z3kqTU-@7vS1KUiJ9>amLpLSPopWfQu6G@jqd@`0Cw!j~L-Aq=%9Ac5+8Fs{X$PL3=<0=CgY`xqeC%K)~qtmP$|DxG7)yZq|gV}A8xa=c1s zsnKzB)Ul}`B0??-{U)HN2dI?pz^z5Jx?WASTE%LSszs@Gp{o4q8nsANC{zeEL|R6k zsyg#^7=o&q@pd%mI@g18$s+7YM7$C*h+_7PR8E5C2Aqf&4Cpymm)Fg-+u!c@VV3*O zL~I=BvYwfmA9f+k=+&AS=U6t03Zi?epAf&e_?(3s6on77p?lLLRtF^%DlsP7qW@9P zOV`3#Pw=L2a~X>({%cd5T5%XApo?-;%2-n^@NiFZ({@{zd@5{C+-q2zn}K5Y#@@AE zKZ;#&;3bXj@Ra}W__O^K$o2!z{1xDH1%leCHf?i2-e!?QtOD~CYOnlG*A#7>IHgIy zPQFh4ysz3QpCSAqt%mBn^8U%1EoC^3PKhb9!sq!{JdVVCOqd(HV_$U~{O{NMd# zx^~x_gYB`-rR{oQ-*&)rq0jmr4{7$*@#lfc1q|Y~<{a C_D%Z$ literal 667 zcmV;M0%ZLkiwFP!000000NqyGZrVT){g-{Mod(-n>bF8@ixfp^Nc&KPtYtkW+nBYr zUMLj#-#csY-6%;P0}?W4XJ*fwJ!AVw1l<5}$xQgL2mj&c%{&HX5rcToa3Jo2?5N~a zD3AMOi71PFU^<4l$2|G*5Q9G{7BC7lgexDWW17HR$t_dIvXYB97gQdMfQJfX;!ZzH zNe_!0Um75k##_$NcvPTaI3h0KOUoDsH;a&2viucgxv|NL(vXoNaHdxHGBvp zU9Z6}Lle4*L-0T#HWSLWa%2qxu#FDd=K!(Yc(9K|BQH@@svMi#9j%@`I;bL9rHwU#iL8WvjZXKdE^=hltDOQJ69ZGczRpr;zs6(Pcp+cYu($ex& z)md$W0A$0Ax6^UY8jZ^(gRmza`jwDz7_n!hQVUvXaL)f?fNc#2qjuWuFX!7N%Y9=a zHV$;t%uMwMyWo2C8cob{ESp3L;XPGPh+kcO%)&K_!Ux&Vy=oGxgA#C=XcO(x|ETA{ zF>uxs{8PBOjKvlAr72F0C` zo76A+s*Cak!tc^*sLl(|pRCzZhI({LbdebE=U?zR68$ov|Crb2z!~5#e*GQ**nQI) z!nL__U}&=5YGpDu7zNC-sT~Znw~pTrRIFhfZH2Gv97*<=(2Z1HPh65<4}9 zdpAGsH3NJoT?+ivx1A!63l?sDG+kC1TYv(a>Jl1BH5sauL>741{{ZA&-4{Oz001x# BKyv^9 diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/parts/.part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.crc b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/parts/.part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7.crc deleted file mode 100644 index c0e184f89f44249ff5950572cdd5f48baaf0a082..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}A_~y8S2s6AlBn diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/parts/.part-0-702-0-0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa.crc b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/parts/.part-0-702-0-0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa.crc new file mode 100644 index 0000000000000000000000000000000000000000..f83b4fa7bdceeec297d4e8ced85f14d5ef4ac890 GIT binary patch literal 12 TcmYc;N@ieSU}E?uvCS0#6LAAf literal 0 HcmV?d00001 diff --git a/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/parts/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7 b/hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/parts/part-0-702-0-0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa similarity index 60% rename from hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/parts/part-0-646-0-0-0d1eee35-27e0-b4e7-c1bb-cc68fc23a3f7 rename to hail_search/fixtures/GRCh38/SV_WES/projects/R0001_1kg.ht/rows/parts/part-0-702-0-0-4d0242f4-15be-5c06-45f9-2e54e9c95dfa index 0dca74ce0f4f9ec367c684dd58d234018ba50a4b..0c51afbfb967f29486ce15cdfa25d352f5519579 100644 GIT binary patch delta 39 xcmV+?0NDTI0qX$;=Kufz$&m%k8CR|1-3kHp0Pp}ZIW;yi0QaQW3IhzJ0RStR4ebB` delta 31 ncmaFO_?VIPJ_7^8&WWsN1wSyXeRP+Lf#Lg-8(hphOBfjd!io!d diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index 775248242d..c1803a7bdf 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -1208,7 +1208,7 @@ class GcnvHailTableQuery(SvHailTableQuery): TRANSCRIPTS_ENUM_FIELD = SvHailTableQuery.ENUM_ANNOTATION_FIELDS[SvHailTableQuery.TRANSCRIPTS_FIELD] ENUM_ANNOTATION_FIELDS = {SvHailTableQuery.TRANSCRIPTS_FIELD: { **TRANSCRIPTS_ENUM_FIELD, - 'format_values': lambda values, r: GcnvHailTableQuery.TRANSCRIPTS_ENUM_FIELD['format_values']( + 'format_array_values': lambda values, r: GcnvHailTableQuery.TRANSCRIPTS_ENUM_FIELD['format_array_values']( GcnvHailTableQuery._get_gene_id_transcripts_override(values, r), r ), }} @@ -1247,6 +1247,8 @@ def get_allowed_sv_type_ids(self, sv_types): # TODO override genotype fields in genotypes response, actually return geneIds + # TODO filter family transcripts for gene counts + def _additional_annotation_fields(self): return {} diff --git a/hail_search/test_search.py b/hail_search/test_search.py index 7b772f0afc..6ceffe16b6 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -4,7 +4,8 @@ from hail_search.test_utils import get_hail_search_body, FAMILY_2_VARIANT_SAMPLE_DATA, FAMILY_2_MISSING_SAMPLE_DATA, \ VARIANT1, VARIANT2, VARIANT3, VARIANT4, MULTI_PROJECT_SAMPLE_DATA, MULTI_PROJECT_MISSING_SAMPLE_DATA, \ LOCATION_SEARCH, EXCLUDE_LOCATION_SEARCH, VARIANT_ID_SEARCH, RSID_SEARCH, GENE_COUNTS, SV_WGS_SAMPLE_DATA, \ - SV_VARIANT1, SV_VARIANT2, SV_VARIANT3, SV_VARIANT4, GCNV_VARIANT1, GCNV_VARIANT2, GCNV_VARIANT3, GCNV_VARIANT4 + SV_VARIANT1, SV_VARIANT2, SV_VARIANT3, SV_VARIANT4, GCNV_VARIANT1, GCNV_VARIANT2, GCNV_VARIANT3, GCNV_VARIANT4, \ + GCNV_MULTI_FAMILY_VARIANT1, GCNV_MULTI_FAMILY_VARIANT2, SV_WES_SAMPLE_DATA from hail_search.web_app import init_web_app PROJECT_2_VARIANT = { @@ -128,7 +129,8 @@ async def _assert_expected_search(self, results, gene_counts=None, **search_kwar self.assertEqual(resp_json['total'], len(results)) for i, result in enumerate(resp_json['results']): if result != results[i]: - import pdb; pdb.set_trace() + diff_k = {ky for ky, val in results[i].items() if val != result[ky]} + import pdb; pdb.set_trace() # TODO self.assertEqual(result, results[i]) if gene_counts: @@ -136,7 +138,7 @@ async def _assert_expected_search(self, results, gene_counts=None, **search_kwar self.assertEqual(resp.status, 200) gene_counts_json = await resp.json() if gene_counts_json != gene_counts: - import pdb; pdb.set_trace() + import pdb; pdb.set_trace() # TODO self.assertDictEqual(gene_counts_json, gene_counts) async def test_single_family_search(self): @@ -172,14 +174,30 @@ async def test_single_family_search(self): } ) - # async def test_single_project_search(self): - # await self._assert_expected_search( - # [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], omit_sample_type='SV_WES', gene_counts={ - # 'ENSG00000097046': {'total': 3, 'families': {'F000002_2': 2, 'F000003_3': 1}}, - # 'ENSG00000177000': {'total': 3, 'families': {'F000002_2': 2, 'F000003_3': 1}}, - # } - # ) - # + async def test_single_project_search(self): + await self._assert_expected_search( + [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], omit_sample_type='SV_WES', gene_counts={ + 'ENSG00000097046': {'total': 3, 'families': {'F000002_2': 2, 'F000003_3': 1}}, + 'ENSG00000177000': {'total': 3, 'families': {'F000002_2': 2, 'F000003_3': 1}}, + } + ) + + await self._assert_expected_search( + [GCNV_MULTI_FAMILY_VARIANT1, GCNV_MULTI_FAMILY_VARIANT2, GCNV_VARIANT3, GCNV_VARIANT4], sample_data=SV_WES_SAMPLE_DATA, gene_counts={ + 'ENSG00000129562': {'total': 2, 'families': {'F000002_2': 1, 'F000003_3': 1}}, + 'ENSG00000013364': {'total': 2, 'families': {'F000002_2': 1, 'F000003_3': 1}}, + 'ENSG00000079616': {'total': 2, 'families': {'F000002_2': 1, 'F000003_3': 1}}, + 'ENSG00000103495': {'total': 2, 'families': {'F000002_2': 1, 'F000003_3': 1}}, + 'ENSG00000167371': {'total': 2, 'families': {'F000002_2': 1, 'F000003_3': 1}}, + 'ENSG00000280789': {'total': 2, 'families': {'F000002_2': 1, 'F000003_3': 1}}, + 'ENSG00000280893': {'total': 2, 'families': {'F000002_2': 1, 'F000003_3': 1}}, + 'ENSG00000281348': {'total': 2, 'families': {'F000002_2': 1, 'F000003_3': 1}}, + 'ENSG00000275023': {'total': 2, 'families': {'F000002_2': 2}}, + 'ENSG00000277258': {'total': 1, 'families': {'F000002_2': 1}}, + 'ENSG00000277972': {'total': 1, 'families': {'F000002_2': 1}}, + } + ) + # async def test_multi_project_search(self): # await self._assert_expected_search( # [PROJECT_2_VARIANT, MULTI_PROJECT_VARIANT1, MULTI_PROJECT_VARIANT2, VARIANT3, VARIANT4], diff --git a/hail_search/test_utils.py b/hail_search/test_utils.py index 446cd13040..7d0dc9b3de 100644 --- a/hail_search/test_utils.py +++ b/hail_search/test_utils.py @@ -60,6 +60,8 @@ 'sample_id': 'NA20883', 'individual_guid': 'I000035_na20883', 'family_guid': 'F000011_11', 'project_guid': 'R0003_test', 'affected': 'N', 'sex': 'F', }]} +SV_WES_SAMPLE_DATA = {'SV_WES': EXPECTED_SAMPLE_DATA['SV_WES'] + [FAMILY_3_SAMPLE]} + VARIANT1 = { 'variantId': '1-10439-AC-A', 'chrom': '1', @@ -562,9 +564,7 @@ 'predictions': {'strvctvre': 0.1809999942779541}, 'numExon': 0, 'svType': 'DUP', - 'transcripts': { - 'ENSG00000129562': [{'geneId': 'ENSG00000129562', 'majorConsequence': 'COPY_GAIN'}], - }, + 'transcripts': {}, '_sort': [14022417556], } @@ -603,13 +603,9 @@ 'numExon': 8, 'svType': 'DUP', 'transcripts': { - 'ENSG00000013364': [{'geneId': 'ENSG00000013364', 'majorConsequence': 'LOF'}], - 'ENSG00000079616': [{'geneId': 'ENSG00000079616', 'majorConsequence': 'LOF'}], 'ENSG00000103495': [{'geneId': 'ENSG00000103495', 'majorConsequence': 'COPY_GAIN'}], 'ENSG00000167371': [{'geneId': 'ENSG00000167371', 'majorConsequence': 'COPY_GAIN'}], - 'ENSG00000280789': [{'geneId': 'ENSG00000280789', 'majorConsequence': 'LOF'}], 'ENSG00000280893': [{'geneId': 'ENSG00000280893', 'majorConsequence': 'COPY_GAIN'}], - 'ENSG00000281348': [{'geneId': 'ENSG00000281348', 'majorConsequence': 'LOF'}], }, '_sort': [16029802672], } @@ -694,6 +690,36 @@ '_sort': [17038721781], } +GCNV_MULTI_FAMILY_VARIANT1 = deepcopy(GCNV_VARIANT1) +GCNV_MULTI_FAMILY_VARIANT1.update({ + 'pos': 22418039, + 'end': 22507821, + 'transcripts': { + 'ENSG00000129562': [{'geneId': 'ENSG00000129562', 'majorConsequence': 'COPY_GAIN'}], + }, +}) +GCNV_MULTI_FAMILY_VARIANT1['familyGuids'].append('F000003_3') +GCNV_MULTI_FAMILY_VARIANT1['genotypes'].update({'I000007_na20870': { + 'sampleId': 'NA20870', 'individualGuid': 'I000007_na20870', 'familyGuid': 'F000003_3', + 'numAlt': 1, 'cn': 3, 'qs': 164, 'defragged': False, 'start': 22418039, 'end': 22507821, 'numExon': 0, + 'geneIds': None, 'newCall': False, 'prevCall': True, 'prevOverlap': False, +}}) + +GCNV_MULTI_FAMILY_VARIANT2 = deepcopy(GCNV_VARIANT2) +GCNV_MULTI_FAMILY_VARIANT2['numExon'] = 26 +GCNV_MULTI_FAMILY_VARIANT2['familyGuids'].append('F000003_3') +GCNV_MULTI_FAMILY_VARIANT2['genotypes'].update({'I000007_na20870': { + 'sampleId': 'NA20870', 'individualGuid': 'I000007_na20870', 'familyGuid': 'F000003_3', + 'numAlt': 1, 'cn': 3, 'qs': 40, 'defragged': False, 'start': 29809156, 'end': 29815990, 'numExon': None, + 'geneIds': None, 'newCall': False, 'prevCall': True, 'prevOverlap': False, +}}) +GCNV_MULTI_FAMILY_VARIANT2['transcripts'].update({ + 'ENSG00000013364': [{'geneId': 'ENSG00000013364', 'majorConsequence': 'LOF'}], + 'ENSG00000079616': [{'geneId': 'ENSG00000079616', 'majorConsequence': 'LOF'}], + 'ENSG00000281348': [{'geneId': 'ENSG00000281348', 'majorConsequence': 'LOF'}], + 'ENSG00000280789': [{'geneId': 'ENSG00000280789', 'majorConsequence': 'LOF'}], +}) + LOCATION_SEARCH = { 'gene_ids': ['ENSG00000177000', 'ENSG00000097046'], 'intervals': ['2:1234-5678', '7:1-11100', '1:11785723-11806455', '1:91500851-91525764'], From 2e4f372f3b4432efa80b186250f32f462c65be3a Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Fri, 25 Aug 2023 16:49:16 -0400 Subject: [PATCH 11/16] genotype override fields --- hail_search/hail_search_query.py | 62 ++++++++++++++++---------------- hail_search/test_utils.py | 23 ++++++------ 2 files changed, 44 insertions(+), 41 deletions(-) diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index c1803a7bdf..d2dac80def 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -43,7 +43,7 @@ class BaseHailTableQuery(object): MISSING_NUM_ALT = -1 GENOTYPE_FIELDS = {} - NESTED_GENOTYPE_FIELDS = {} + COMPUTED_GENOTYPE_FIELDS = {} GENOTYPE_QUERY_FIELDS = {} QUALITY_FILTER_FORMAT = {} POPULATIONS = {} @@ -100,7 +100,7 @@ def annotation_fields(self): 'sampleId', 'individualGuid', 'familyGuid', numAlt=hl.if_else(hl.is_defined(x[0].GT), x[0].GT.n_alt_alleles(), self.MISSING_NUM_ALT), **{k: x[0][field] for k, field in self.GENOTYPE_FIELDS.items()}, - **{_to_camel_case(k): x[0][field][k] for field, v in self.NESTED_GENOTYPE_FIELDS.items() for k in v}, + **{_to_camel_case(k): v(x[0], k, r) for k, v in self.COMPUTED_GENOTYPE_FIELDS.items()}, )), 'populations': lambda r: hl.struct(**{ population: self.population_expression(r, population) for population in self.POPULATIONS.keys() @@ -1056,7 +1056,9 @@ class SvHailTableQuery(BaseHailTableQuery): DATA_TYPE = 'SV_WGS' GENOTYPE_FIELDS = {_to_camel_case(f): f for f in ['CN', 'GQ']} - NESTED_GENOTYPE_FIELDS = {'concordance': ['new_call', 'prev_call', 'prev_num_alt']} + COMPUTED_GENOTYPE_FIELDS = { + k: lambda entry, field, *args: entry.concordance[field] for k in ['new_call', 'prev_call', 'prev_num_alt'] + } GENOTYPE_QUERY_FIELDS = {'gq_sv': 'GQ', 'gq': None} TRANSCRIPTS_FIELD = 'sorted_gene_consequences' @@ -1186,22 +1188,31 @@ class GcnvHailTableQuery(SvHailTableQuery): GENOTYPE_FIELDS = { **SvHailTableQuery.GENOTYPE_FIELDS, **{f.lower(): f for f in ['QS', 'defragged']}, - **{_to_camel_case(f): f'sample_{f}' for f in ['start', 'end', 'num_exon', 'gene_ids']}, } del GENOTYPE_FIELDS['gq'] GENOTYPE_QUERY_FIELDS = {} - NESTED_GENOTYPE_FIELDS = { - 'concordance': SvHailTableQuery.NESTED_GENOTYPE_FIELDS['concordance'][:-1] + ['prev_overlap'] + GENOTYPE_OVERRIDE_FIELDS = { + 'start': (hl.min, lambda r: r.start_locus.position), + 'end': (hl.max, lambda r: r.end_locus.position), + 'num_exon': (hl.max, lambda r: r.num_exon), + 'gene_ids': ( + lambda entry_gene_ids: entry_gene_ids.fold(lambda s1, s2: s1.union(s2), hl.empty_set(hl.tstr)), + lambda r: hl.missing(hl.tset(hl.tstr)), + ), + } + COMPUTED_GENOTYPE_FIELDS = { + **SvHailTableQuery.COMPUTED_GENOTYPE_FIELDS, + **{k: lambda entry, field, r: hl.or_missing(r[field] != entry[f'sample_{field}'], entry[f'sample_{field}']) + for k in GENOTYPE_OVERRIDE_FIELDS.keys()}, } + COMPUTED_GENOTYPE_FIELDS['prev_overlap'] = COMPUTED_GENOTYPE_FIELDS.pop('prev_num_alt') CORE_FIELDS = BaseHailTableQuery.CORE_FIELDS BASE_ANNOTATION_FIELDS = { **SvHailTableQuery.BASE_ANNOTATION_FIELDS, - 'pos': lambda r: GcnvHailTableQuery._get_genotype_override_field( - r, 'start', hl.min, default=r.start_locus.position), - 'end': lambda r: GcnvHailTableQuery._get_genotype_override_field( - r, 'end', hl.max, default=r.end_locus.position), - 'numExon': lambda r: GcnvHailTableQuery._get_genotype_override_field(r, 'num_exon', hl.max), + 'pos': lambda r: r.start, + 'end': lambda r: r.end, + 'numExon': lambda r: r.num_exon, } del BASE_ANNOTATION_FIELDS['bothsidesSupport'] @@ -1209,44 +1220,33 @@ class GcnvHailTableQuery(SvHailTableQuery): ENUM_ANNOTATION_FIELDS = {SvHailTableQuery.TRANSCRIPTS_FIELD: { **TRANSCRIPTS_ENUM_FIELD, 'format_array_values': lambda values, r: GcnvHailTableQuery.TRANSCRIPTS_ENUM_FIELD['format_array_values']( - GcnvHailTableQuery._get_gene_id_transcripts_override(values, r), r + hl.if_else(hl.is_missing(r.gene_ids), values, values.filter(lambda t: r.gene_ids.contains(t.geneId))), r, ), }} POPULATIONS = {k: v for k, v in SvHailTableQuery.POPULATIONS.items() if k != 'gnomad_svs'} @staticmethod - def _get_genotype_override_field(r, field, agg, default=None): + def _get_genotype_override_field(r, field, agg, get_default): sample_field = f'sample_{field}' entries = r.family_entries.flatmap(lambda x: x) - if default is None: - default = r[field] return hl.if_else( entries.any(lambda g: hl.is_defined(g.GT) & hl.is_missing(g[sample_field])), - default, agg(entries.map(lambda g: g[sample_field])) + get_default(r), agg(entries.map(lambda g: g[sample_field])) ) - @classmethod - def _get_gene_id_transcripts_override(cls, transcripts, r): - empty_gene_set = hl.empty_set(hl.tstr) - geneotype_gene_ids_expr = cls._get_genotype_override_field( - r, 'gene_ids', - lambda entry_gene_ids: entry_gene_ids.fold(lambda s1, s2: s1.union(s2), empty_gene_set), - default=hl.missing(empty_gene_set.dtype)) - return hl.bind( - lambda gene_ids: hl.if_else( - hl.is_missing(gene_ids), transcripts, - transcripts.filter(lambda t: gene_ids.contains(t.geneId)), - ), geneotype_gene_ids_expr, - ) + # TODO actually return geneIds in genotypes + def _format_results(self, ht, annotation_fields): + ht = ht.annotate(**{ + k: self._get_genotype_override_field(ht, k, *args) for k, args in self.GENOTYPE_OVERRIDE_FIELDS.items() + }) + return super()._format_results(ht, annotation_fields) def get_allowed_sv_type_ids(self, sv_types): return super().get_allowed_sv_type_ids([ type.replace('gCNV_', '') for type in sv_types if type.startswith('gCNV_') ]) - # TODO override genotype fields in genotypes response, actually return geneIds - # TODO filter family transcripts for gene counts def _additional_annotation_fields(self): diff --git a/hail_search/test_utils.py b/hail_search/test_utils.py index 7d0dc9b3de..01e0dcf9f3 100644 --- a/hail_search/test_utils.py +++ b/hail_search/test_utils.py @@ -546,7 +546,7 @@ 'genotypes': { 'I000004_hg00731': { 'sampleId': 'HG00731', 'individualGuid': 'I000004_hg00731', 'familyGuid': 'F000002_2', - 'numAlt': 1, 'cn': 3, 'qs': 38, 'defragged': False, 'start': 22438910, 'end': 22469796, 'numExon': 0, + 'numAlt': 1, 'cn': 3, 'qs': 38, 'defragged': False, 'start': None, 'end': None, 'numExon': None, 'geneIds': None, 'newCall': False, 'prevCall': True, 'prevOverlap': False, }, 'I000005_hg00732': { @@ -584,17 +584,17 @@ 'genotypes': { 'I000004_hg00731': { 'sampleId': 'HG00731', 'individualGuid': 'I000004_hg00731', 'familyGuid': 'F000002_2', - 'numAlt': 1, 'cn': 3, 'qs': 29, 'defragged': False, 'start': 29809156, 'end': 29815990, 'numExon': 8, + 'numAlt': 1, 'cn': 3, 'qs': 29, 'defragged': False, 'start': None, 'end': None, 'numExon': None, 'geneIds': None, 'newCall': False, 'prevCall': True, 'prevOverlap': False, }, 'I000005_hg00732': { 'sampleId': 'HG00732', 'individualGuid': 'I000005_hg00732', 'familyGuid': 'F000002_2', - 'numAlt': 1, 'cn': 3, 'qs': 46, 'defragged': False, 'start': 29809156, 'end': 29815990, 'numExon': 8, + 'numAlt': 1, 'cn': 3, 'qs': 46, 'defragged': False, 'start': None, 'end': None, 'numExon': None, 'geneIds': None, 'newCall': False, 'prevCall': True, 'prevOverlap': False, }, 'I000006_hg00733': { 'sampleId': 'HG00733', 'individualGuid': 'I000006_hg00733', 'familyGuid': 'F000002_2', - 'numAlt': 1, 'cn': 3, 'qs': 37, 'defragged': False, 'start': 29809156, 'end': 29815990, 'numExon': 8, + 'numAlt': 1, 'cn': 3, 'qs': 37, 'defragged': False, 'start': None, 'end': None, 'numExon': None, 'geneIds': None, 'newCall': False, 'prevCall': True, 'prevOverlap': False, } }, @@ -625,12 +625,12 @@ 'genotypes': { 'I000004_hg00731': { 'sampleId': 'HG00731', 'individualGuid': 'I000004_hg00731', 'familyGuid': 'F000002_2', - 'numAlt': 2, 'cn': 4, 'qs': 13, 'defragged': True, 'start': 38717327, 'end': 38719636, 'numExon': None, + 'numAlt': 2, 'cn': 4, 'qs': 13, 'defragged': True, 'start': None, 'end': None, 'numExon': None, 'geneIds': None, 'newCall': True, 'prevCall': False, 'prevOverlap': False, }, 'I000005_hg00732': { 'sampleId': 'HG00732', 'individualGuid': 'I000005_hg00732', 'familyGuid': 'F000002_2', - 'numAlt': 1, 'cn': 3, 'qs': 7, 'defragged': False, 'start': 38717327, 'end': 38719636, 'numExon': None, + 'numAlt': 1, 'cn': 3, 'qs': 7, 'defragged': False, 'start': None, 'end': None, 'numExon': None, 'geneIds': None, 'newCall': False, 'prevCall': False, 'prevOverlap': True, }, 'I000006_hg00733': { @@ -664,7 +664,7 @@ 'genotypes': { 'I000004_hg00731': { 'sampleId': 'HG00731', 'individualGuid': 'I000004_hg00731', 'familyGuid': 'F000002_2', - 'numAlt': 1, 'cn': 3, 'qs': 28, 'defragged': False, 'start': 38721781, 'end': 38735703, 'numExon': 7, + 'numAlt': 1, 'cn': 3, 'qs': 28, 'defragged': False, 'start': None, 'end': None, 'numExon': None, 'geneIds': None, 'newCall': False, 'prevCall': True, 'prevOverlap': False, }, 'I000005_hg00732': { @@ -674,7 +674,7 @@ }, 'I000006_hg00733': { 'sampleId': 'HG00733', 'individualGuid': 'I000006_hg00733', 'familyGuid': 'F000002_2', - 'numAlt': 1, 'cn': 3, 'qs': 29, 'defragged': False, 'start': 38721781, 'end': 38734440, 'numExon': 7, + 'numAlt': 1, 'cn': 3, 'qs': 29, 'defragged': False, 'start': None, 'end': 38734440, 'numExon': None, 'geneIds': None, 'newCall': False, 'prevCall': True, 'prevOverlap': False, } }, @@ -701,16 +701,19 @@ GCNV_MULTI_FAMILY_VARIANT1['familyGuids'].append('F000003_3') GCNV_MULTI_FAMILY_VARIANT1['genotypes'].update({'I000007_na20870': { 'sampleId': 'NA20870', 'individualGuid': 'I000007_na20870', 'familyGuid': 'F000003_3', - 'numAlt': 1, 'cn': 3, 'qs': 164, 'defragged': False, 'start': 22418039, 'end': 22507821, 'numExon': 0, + 'numAlt': 1, 'cn': 3, 'qs': 164, 'defragged': False, 'start': None, 'end': None, 'numExon': None, 'geneIds': None, 'newCall': False, 'prevCall': True, 'prevOverlap': False, }}) +GCNV_MULTI_FAMILY_VARIANT1['genotypes']['I000004_hg00731'].update({'start': 22438910, 'end': 22469796}) GCNV_MULTI_FAMILY_VARIANT2 = deepcopy(GCNV_VARIANT2) GCNV_MULTI_FAMILY_VARIANT2['numExon'] = 26 GCNV_MULTI_FAMILY_VARIANT2['familyGuids'].append('F000003_3') +for genotype in GCNV_MULTI_FAMILY_VARIANT2['genotypes'].values(): + genotype.update({'numExon': 8}) GCNV_MULTI_FAMILY_VARIANT2['genotypes'].update({'I000007_na20870': { 'sampleId': 'NA20870', 'individualGuid': 'I000007_na20870', 'familyGuid': 'F000003_3', - 'numAlt': 1, 'cn': 3, 'qs': 40, 'defragged': False, 'start': 29809156, 'end': 29815990, 'numExon': None, + 'numAlt': 1, 'cn': 3, 'qs': 40, 'defragged': False, 'start': None, 'end': None, 'numExon': None, 'geneIds': None, 'newCall': False, 'prevCall': True, 'prevOverlap': False, }}) GCNV_MULTI_FAMILY_VARIANT2['transcripts'].update({ From bea8c3071f7def8e1ddf5fe4584683d008ed4076 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Fri, 25 Aug 2023 17:50:52 -0400 Subject: [PATCH 12/16] fix geneotype geen id response --- hail_search/hail_search_query.py | 6 +++--- hail_search/test_search.py | 1 + hail_search/test_utils.py | 4 ++-- hail_search/web_app.py | 2 ++ 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index d2dac80def..61937f962f 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -1202,8 +1202,9 @@ class GcnvHailTableQuery(SvHailTableQuery): } COMPUTED_GENOTYPE_FIELDS = { **SvHailTableQuery.COMPUTED_GENOTYPE_FIELDS, - **{k: lambda entry, field, r: hl.or_missing(r[field] != entry[f'sample_{field}'], entry[f'sample_{field}']) - for k in GENOTYPE_OVERRIDE_FIELDS.keys()}, + **{k: lambda entry, field, r: hl.or_missing( + hl.is_missing(r[field]) | (r[field] != entry[f'sample_{field}']), entry[f'sample_{field}'] + ) for k in GENOTYPE_OVERRIDE_FIELDS.keys()}, } COMPUTED_GENOTYPE_FIELDS['prev_overlap'] = COMPUTED_GENOTYPE_FIELDS.pop('prev_num_alt') @@ -1235,7 +1236,6 @@ def _get_genotype_override_field(r, field, agg, get_default): get_default(r), agg(entries.map(lambda g: g[sample_field])) ) - # TODO actually return geneIds in genotypes def _format_results(self, ht, annotation_fields): ht = ht.annotate(**{ k: self._get_genotype_override_field(ht, k, *args) for k, args in self.GENOTYPE_OVERRIDE_FIELDS.items() diff --git a/hail_search/test_search.py b/hail_search/test_search.py index 6ceffe16b6..6018be0253 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -151,6 +151,7 @@ async def test_single_family_search(self): await self._assert_expected_search( [GCNV_VARIANT1, GCNV_VARIANT2, GCNV_VARIANT3, GCNV_VARIANT4], omit_sample_type='VARIANTS', gene_counts={ + # TODO should be filtered to returned transcripts - just use entries? 'ENSG00000129562': {'total': 1, 'families': {'F000002_2': 1}}, 'ENSG00000013364': {'total': 1, 'families': {'F000002_2': 1}}, 'ENSG00000079616': {'total': 1, 'families': {'F000002_2': 1}}, diff --git a/hail_search/test_utils.py b/hail_search/test_utils.py index 01e0dcf9f3..30d0d08f19 100644 --- a/hail_search/test_utils.py +++ b/hail_search/test_utils.py @@ -704,13 +704,13 @@ 'numAlt': 1, 'cn': 3, 'qs': 164, 'defragged': False, 'start': None, 'end': None, 'numExon': None, 'geneIds': None, 'newCall': False, 'prevCall': True, 'prevOverlap': False, }}) -GCNV_MULTI_FAMILY_VARIANT1['genotypes']['I000004_hg00731'].update({'start': 22438910, 'end': 22469796}) +GCNV_MULTI_FAMILY_VARIANT1['genotypes']['I000004_hg00731'].update({'start': 22438910, 'end': 22469796, 'geneIds': []}) GCNV_MULTI_FAMILY_VARIANT2 = deepcopy(GCNV_VARIANT2) GCNV_MULTI_FAMILY_VARIANT2['numExon'] = 26 GCNV_MULTI_FAMILY_VARIANT2['familyGuids'].append('F000003_3') for genotype in GCNV_MULTI_FAMILY_VARIANT2['genotypes'].values(): - genotype.update({'numExon': 8}) + genotype.update({'numExon': 8, 'geneIds': ['ENSG00000103495', 'ENSG00000167371', 'ENSG00000280893']}) GCNV_MULTI_FAMILY_VARIANT2['genotypes'].update({'I000007_na20870': { 'sampleId': 'NA20870', 'individualGuid': 'I000007_na20870', 'familyGuid': 'F000003_3', 'numAlt': 1, 'cn': 3, 'qs': 40, 'defragged': False, 'start': None, 'end': None, 'numExon': None, diff --git a/hail_search/web_app.py b/hail_search/web_app.py index 303ab82f5c..c6a3f5aa24 100644 --- a/hail_search/web_app.py +++ b/hail_search/web_app.py @@ -8,6 +8,8 @@ def _hl_json_default(o): if isinstance(o, hl.Struct) or isinstance(o, hl.utils.frozendict): return dict(o) + elif isinstance(o, set): + return sorted(o) def hl_json_dumps(obj): From fe4f735b55f77984d74a940e7b71072f88d3d206 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 5 Sep 2023 12:16:53 -0400 Subject: [PATCH 13/16] family specific sv gene counts --- hail_search/hail_search_query.py | 16 +- hail_search/test_search.py | 987 +++++++++++++++---------------- 2 files changed, 498 insertions(+), 505 deletions(-) diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index 61937f962f..dd4974d39c 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -1227,8 +1227,9 @@ class GcnvHailTableQuery(SvHailTableQuery): POPULATIONS = {k: v for k, v in SvHailTableQuery.POPULATIONS.items() if k != 'gnomad_svs'} - @staticmethod - def _get_genotype_override_field(r, field, agg, get_default): + @classmethod + def _get_genotype_override_field(cls, r, field): + agg, get_default = cls.GENOTYPE_OVERRIDE_FIELDS[field] sample_field = f'sample_{field}' entries = r.family_entries.flatmap(lambda x: x) return hl.if_else( @@ -1237,9 +1238,7 @@ def _get_genotype_override_field(r, field, agg, get_default): ) def _format_results(self, ht, annotation_fields): - ht = ht.annotate(**{ - k: self._get_genotype_override_field(ht, k, *args) for k, args in self.GENOTYPE_OVERRIDE_FIELDS.items() - }) + ht = ht.annotate(**{k: self._get_genotype_override_field(ht, k) for k in self.GENOTYPE_OVERRIDE_FIELDS}) return super()._format_results(ht, annotation_fields) def get_allowed_sv_type_ids(self, sv_types): @@ -1247,7 +1246,12 @@ def get_allowed_sv_type_ids(self, sv_types): type.replace('gCNV_', '') for type in sv_types if type.startswith('gCNV_') ]) - # TODO filter family transcripts for gene counts + @classmethod + def _gene_ids_expr(cls, ht): + gene_ids_expr = getattr(ht, 'gene_ids', None) + if gene_ids_expr is None: + gene_ids_expr = cls._get_genotype_override_field(ht, 'gene_ids') + return hl.or_else(gene_ids_expr, super()._gene_ids_expr(ht)) def _additional_annotation_fields(self): return {} diff --git a/hail_search/test_search.py b/hail_search/test_search.py index 6018be0253..2a94444220 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -114,11 +114,11 @@ class HailSearchTestCase(AioHTTPTestCase): async def get_application(self): return init_web_app() - # async def test_status(self): - # async with self.client.request('GET', '/status') as resp: - # self.assertEqual(resp.status, 200) - # resp_json = await resp.json() - # self.assertDictEqual(resp_json, {'success': True}) + async def test_status(self): + async with self.client.request('GET', '/status') as resp: + self.assertEqual(resp.status, 200) + resp_json = await resp.json() + self.assertDictEqual(resp_json, {'success': True}) async def _assert_expected_search(self, results, gene_counts=None, **search_kwargs): search_body = get_hail_search_body(**search_kwargs) @@ -128,17 +128,12 @@ async def _assert_expected_search(self, results, gene_counts=None, **search_kwar self.assertSetEqual(set(resp_json.keys()), {'results', 'total'}) self.assertEqual(resp_json['total'], len(results)) for i, result in enumerate(resp_json['results']): - if result != results[i]: - diff_k = {ky for ky, val in results[i].items() if val != result[ky]} - import pdb; pdb.set_trace() # TODO self.assertEqual(result, results[i]) if gene_counts: async with self.client.request('POST', '/gene_counts', json=search_body) as resp: self.assertEqual(resp.status, 200) gene_counts_json = await resp.json() - if gene_counts_json != gene_counts: - import pdb; pdb.set_trace() # TODO self.assertDictEqual(gene_counts_json, gene_counts) async def test_single_family_search(self): @@ -151,15 +146,9 @@ async def test_single_family_search(self): await self._assert_expected_search( [GCNV_VARIANT1, GCNV_VARIANT2, GCNV_VARIANT3, GCNV_VARIANT4], omit_sample_type='VARIANTS', gene_counts={ - # TODO should be filtered to returned transcripts - just use entries? - 'ENSG00000129562': {'total': 1, 'families': {'F000002_2': 1}}, - 'ENSG00000013364': {'total': 1, 'families': {'F000002_2': 1}}, - 'ENSG00000079616': {'total': 1, 'families': {'F000002_2': 1}}, 'ENSG00000103495': {'total': 1, 'families': {'F000002_2': 1}}, 'ENSG00000167371': {'total': 1, 'families': {'F000002_2': 1}}, - 'ENSG00000280789': {'total': 1, 'families': {'F000002_2': 1}}, 'ENSG00000280893': {'total': 1, 'families': {'F000002_2': 1}}, - 'ENSG00000281348': {'total': 1, 'families': {'F000002_2': 1}}, 'ENSG00000275023': {'total': 2, 'families': {'F000002_2': 2}}, 'ENSG00000277258': {'total': 1, 'families': {'F000002_2': 1}}, 'ENSG00000277972': {'total': 1, 'families': {'F000002_2': 1}}, @@ -199,486 +188,486 @@ async def test_single_project_search(self): } ) - # async def test_multi_project_search(self): - # await self._assert_expected_search( - # [PROJECT_2_VARIANT, MULTI_PROJECT_VARIANT1, MULTI_PROJECT_VARIANT2, VARIANT3, VARIANT4], - # gene_counts=GENE_COUNTS, sample_data=MULTI_PROJECT_SAMPLE_DATA, - # ) - # - # async def test_inheritance_filter(self): - # inheritance_mode = 'any_affected' - # await self._assert_expected_search( - # [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search( - # [SV_VARIANT1, SV_VARIANT2, SV_VARIANT3, SV_VARIANT4], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, - # ) - # - # await self._assert_expected_search( - # [SV_VARIANT2], inheritance_mode=inheritance_mode, annotations=NEW_SV_FILTER, sample_data=SV_WGS_SAMPLE_DATA, - # ) - # - # inheritance_mode = 'de_novo' - # await self._assert_expected_search( - # [VARIANT1, FAMILY_3_VARIANT, VARIANT4], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search( - # [SV_VARIANT1], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, - # ) - # - # inheritance_mode = 'x_linked_recessive' - # await self._assert_expected_search([], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES') - # await self._assert_expected_search([], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA) - # - # inheritance_mode = 'homozygous_recessive' - # await self._assert_expected_search( - # [VARIANT2], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search( - # [PROJECT_2_VARIANT1, VARIANT2], inheritance_mode=inheritance_mode, sample_data=MULTI_PROJECT_SAMPLE_DATA, - # ) - # - # await self._assert_expected_search( - # [SV_VARIANT4], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, - # ) - # - # gt_inheritance_filter = {'genotype': {'I000006_hg00733': 'has_alt', 'I000005_hg00732': 'ref_ref'}} - # await self._assert_expected_search( - # [VARIANT2, VARIANT3], inheritance_filter=gt_inheritance_filter, sample_data=FAMILY_2_VARIANT_SAMPLE_DATA) - # - # inheritance_mode = 'compound_het' - # await self._assert_expected_search( - # [[VARIANT3, VARIANT4]], inheritance_mode=inheritance_mode, sample_data=MULTI_PROJECT_SAMPLE_DATA, gene_counts={ - # 'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}}, - # 'ENSG00000177000': {'total': 1, 'families': {'F000002_2': 1}}, - # }, **COMP_HET_ALL_PASS_FILTERS, - # ) - # - # await self._assert_expected_search( - # [[SV_VARIANT1, SV_VARIANT2]], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, - # **COMP_HET_ALL_PASS_FILTERS, - # ) - # - # inheritance_mode = 'recessive' - # await self._assert_expected_search( - # [PROJECT_2_VARIANT1, VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode=inheritance_mode, gene_counts={ - # 'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}}, - # 'ENSG00000177000': {'total': 2, 'families': {'F000002_2': 2}}, - # }, sample_data=MULTI_PROJECT_SAMPLE_DATA, **COMP_HET_ALL_PASS_FILTERS, - # ) - # - # await self._assert_expected_search( - # [[SV_VARIANT1, SV_VARIANT2], SV_VARIANT4], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, - # **COMP_HET_ALL_PASS_FILTERS, - # ) - # - # async def test_quality_filter(self): - # quality_filter = {'vcf_filter': 'pass'} - # await self._assert_expected_search( - # [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search([SV_VARIANT4], quality_filter=quality_filter, sample_data=SV_WGS_SAMPLE_DATA) - # - # await self._assert_expected_search( - # [VARIANT2, MULTI_FAMILY_VARIANT], quality_filter={'min_gq': 40}, omit_sample_type='SV_WES', - # ) - # - # sv_quality_filter = {'min_gq_sv': 40} - # await self._assert_expected_search( - # [SV_VARIANT3, SV_VARIANT4], quality_filter=sv_quality_filter, sample_data=SV_WGS_SAMPLE_DATA, - # ) - # - # await self._assert_expected_search( - # [], annotations=NEW_SV_FILTER, quality_filter=sv_quality_filter, sample_data=SV_WGS_SAMPLE_DATA, - # ) - # - # await self._assert_expected_search( - # [VARIANT2, MULTI_FAMILY_VARIANT], quality_filter={'min_gq': 40, 'vcf_filter': 'pass'}, omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search( - # [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT], quality_filter={'min_gq': 60, 'affected_only': True}, - # omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search( - # [SV_VARIANT3, SV_VARIANT4], quality_filter={'min_gq_sv': 60, 'affected_only': True}, sample_data=SV_WGS_SAMPLE_DATA, - # ) - # - # await self._assert_expected_search( - # [VARIANT1, VARIANT2, FAMILY_3_VARIANT], quality_filter={'min_ab': 50}, omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search( - # [VARIANT2, VARIANT3], quality_filter={'min_ab': 70, 'affected_only': True}, - # omit_sample_type='SV_WES', - # ) - # - # quality_filter = {'min_gq': 40, 'min_ab': 50} - # await self._assert_expected_search( - # [VARIANT2, FAMILY_3_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', - # ) - # - # annotations = {'splice_ai': '0.0'} # Ensures no variants are filtered out by annotation/path filters - # await self._assert_expected_search( - # [VARIANT1, VARIANT2, FAMILY_3_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', - # annotations=annotations, pathogenicity={'clinvar': ['likely_pathogenic', 'vus_or_conflicting']}, - # ) - # - # await self._assert_expected_search( - # [VARIANT2, FAMILY_3_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', - # annotations=annotations, pathogenicity={'clinvar': ['pathogenic']}, - # ) - # - # async def test_location_search(self): - # await self._assert_expected_search( - # [VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], omit_sample_type='SV_WES', **LOCATION_SEARCH, - # ) - # - # sv_intervals = ['1:9310023-9380264'] - # await self._assert_expected_search( - # [SV_VARIANT1, SV_VARIANT2], sample_data=SV_WGS_SAMPLE_DATA, intervals=sv_intervals, gene_ids=['ENSG00000171621'], - # ) - # - # await self._assert_expected_search( - # [VARIANT1], omit_sample_type='SV_WES', **EXCLUDE_LOCATION_SEARCH, - # ) - # - # await self._assert_expected_search( - # [SV_VARIANT3, SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, intervals=sv_intervals, exclude_intervals=True, - # ) - # - # await self._assert_expected_search( - # [SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT], omit_sample_type='SV_WES', - # intervals=LOCATION_SEARCH['intervals'][-1:], gene_ids=LOCATION_SEARCH['gene_ids'][:1] - # ) - # - # async def test_variant_id_search(self): - # await self._assert_expected_search([VARIANT2], omit_sample_type='SV_WES', **RSID_SEARCH) - # - # await self._assert_expected_search([VARIANT1], omit_sample_type='SV_WES', **VARIANT_ID_SEARCH) - # - # await self._assert_expected_search( - # [VARIANT1], omit_sample_type='SV_WES', variant_ids=VARIANT_ID_SEARCH['variant_ids'][:1], - # ) - # - # await self._assert_expected_search( - # [], omit_sample_type='SV_WES', variant_ids=VARIANT_ID_SEARCH['variant_ids'][1:], - # ) - # - # await self._assert_expected_search([SV_VARIANT2, SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, variant_keys=[ - # 'cohort_2911.chr1.final_cleanup_INS_chr1_160', 'phase2_DEL_chr14_4640', - # ]) - # - # async def test_frequency_filter(self): - # await self._assert_expected_search( - # [VARIANT1, VARIANT4], frequencies={'seqr': {'af': 0.2}}, omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search( - # [MULTI_FAMILY_VARIANT, VARIANT4], frequencies={'seqr': {'ac': 4}}, omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search( - # [MULTI_FAMILY_VARIANT, VARIANT4], frequencies={'seqr': {'hh': 1}}, omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search( - # [VARIANT4], frequencies={'seqr': {'ac': 4, 'hh': 0}}, omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search( - # [SV_VARIANT1], frequencies={'sv_callset': {'af': 0.05}}, sample_data=SV_WGS_SAMPLE_DATA, - # ) - # - # await self._assert_expected_search( - # [VARIANT1, VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.05}}, omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search( - # [VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.05, 'hh': 1}}, omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search( - # [VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.005}}, omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search( - # [SV_VARIANT1, SV_VARIANT3, SV_VARIANT4], frequencies={'gnomad_svs': {'af': 0.001}}, sample_data=SV_WGS_SAMPLE_DATA, - # ) - # - # await self._assert_expected_search( - # [VARIANT4], frequencies={'seqr': {'af': 0.2}, 'gnomad_genomes': {'ac': 50}}, - # omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search( - # [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], frequencies={'seqr': {}, 'gnomad_genomes': {'af': None}}, - # omit_sample_type='SV_WES', - # ) - # - # annotations = {'splice_ai': '0.0'} # Ensures no variants are filtered out by annotation/path filters - # await self._assert_expected_search( - # [VARIANT1, VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.01}}, omit_sample_type='SV_WES', - # annotations=annotations, pathogenicity={'clinvar': ['pathogenic', 'likely_pathogenic', 'vus_or_conflicting']}, - # ) - # - # await self._assert_expected_search( - # [VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.01}}, omit_sample_type='SV_WES', - # annotations=annotations, pathogenicity={'clinvar': ['pathogenic', 'vus_or_conflicting']}, - # ) - # - # async def test_annotations_filter(self): - # await self._assert_expected_search([VARIANT2], pathogenicity={'hgmd': ['hgmd_other']}, omit_sample_type='SV_WES') - # - # pathogenicity = {'clinvar': ['likely_pathogenic', 'vus_or_conflicting', 'benign']} - # await self._assert_expected_search([VARIANT1, VARIANT2], pathogenicity=pathogenicity, omit_sample_type='SV_WES') - # - # pathogenicity['clinvar'] = pathogenicity['clinvar'][:1] - # await self._assert_expected_search( - # [VARIANT1, VARIANT4], pathogenicity=pathogenicity, annotations={'SCREEN': ['CTCF-only', 'DNase-only']}, - # omit_sample_type='SV_WES', - # ) - # - # annotations = { - # 'missense': ['missense_variant'], 'in_frame': ['inframe_insertion', 'inframe_deletion'], 'frameshift': None, - # 'structural_consequence': ['INTRONIC'], - # } - # await self._assert_expected_search( - # [VARIANT1, VARIANT2, VARIANT4], pathogenicity=pathogenicity, annotations=annotations, omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search([VARIANT2, VARIANT4], annotations=annotations, omit_sample_type='SV_WES') - # - # await self._assert_expected_search([SV_VARIANT1], annotations=annotations, sample_data=SV_WGS_SAMPLE_DATA) - # - # annotations['splice_ai'] = '0.005' - # await self._assert_expected_search( - # [VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], annotations=annotations, omit_sample_type='SV_WES', - # ) - # - # annotations['structural'] = ['DEL'] - # await self._assert_expected_search([SV_VARIANT1, SV_VARIANT4], annotations=annotations, sample_data=SV_WGS_SAMPLE_DATA) - # - # annotations = {'other': ['non_coding_transcript_exon_variant']} - # await self._assert_expected_search( - # [VARIANT1, SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, SELECTED_ANNOTATION_TRANSCRIPT_MULTI_FAMILY_VARIANT], - # pathogenicity=pathogenicity, annotations=annotations, omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search( - # [SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT], - # gene_ids=LOCATION_SEARCH['gene_ids'][:1], annotations=annotations, omit_sample_type='SV_WES', - # ) - # - # async def test_secondary_annotations_filter(self): - # annotations_1 = {'missense': ['missense_variant']} - # annotations_2 = {'other': ['intron_variant']} - # - # await self._assert_expected_search( - # [[VARIANT3, VARIANT4]], inheritance_mode='compound_het', omit_sample_type='SV_WES', - # annotations=annotations_1, annotations_secondary=annotations_2, - # ) - # - # await self._assert_expected_search( - # [VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES', - # annotations=annotations_1, annotations_secondary=annotations_2, - # ) - # - # await self._assert_expected_search( - # [[VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES', - # annotations=annotations_2, annotations_secondary=annotations_1, - # ) - # - # sv_annotations_1 = {'structural': ['INS']} - # sv_annotations_2 = {'structural': ['DEL'], 'structural_consequence': ['INTRONIC']} - # - # await self._assert_expected_search( - # [[SV_VARIANT1, SV_VARIANT2]], sample_data=SV_WGS_SAMPLE_DATA, inheritance_mode='compound_het', - # annotations=sv_annotations_1, annotations_secondary=sv_annotations_2, - # ) - # - # await self._assert_expected_search( - # [[SV_VARIANT1, SV_VARIANT2], SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, inheritance_mode='recessive', - # annotations=sv_annotations_2, annotations_secondary=sv_annotations_1, - # ) - # - # pathogenicity = {'clinvar': ['likely_pathogenic', 'vus_or_conflicting']} - # await self._assert_expected_search( - # [VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES', - # annotations=annotations_2, annotations_secondary=annotations_1, pathogenicity=pathogenicity, - # ) - # - # screen_annotations = {'SCREEN': ['CTCF-only']} - # await self._assert_expected_search( - # [], inheritance_mode='recessive', omit_sample_type='SV_WES', - # annotations=screen_annotations, annotations_secondary=annotations_1, - # ) - # - # await self._assert_expected_search( - # [[VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES', - # annotations=screen_annotations, annotations_secondary=annotations_2, - # ) - # - # selected_transcript_annotations = {'other': ['non_coding_transcript_exon_variant']} - # await self._assert_expected_search( - # [VARIANT2, [SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_3, VARIANT4]], inheritance_mode='recessive', - # annotations=screen_annotations, annotations_secondary=selected_transcript_annotations, - # pathogenicity=pathogenicity, omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search( - # [SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, [SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_3, VARIANT4]], - # annotations={**selected_transcript_annotations, **screen_annotations}, annotations_secondary=annotations_2, - # inheritance_mode='recessive', omit_sample_type='SV_WES', - # ) - # - # async def test_in_silico_filter(self): - # in_silico = {'eigen': '5.5', 'mut_taster': 'P'} - # await self._assert_expected_search( - # [VARIANT1, VARIANT2, VARIANT4], in_silico=in_silico, omit_sample_type='SV_WES', - # ) - # - # in_silico['requireScore'] = True - # await self._assert_expected_search( - # [VARIANT2, VARIANT4], in_silico=in_silico, omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search( - # [SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, in_silico={'strvctvre': 0.1, 'requireScore': True}, - # ) - # - # async def test_search_errors(self): - # search_body = get_hail_search_body(sample_data=FAMILY_2_MISSING_SAMPLE_DATA) - # async with self.client.request('POST', '/search', json=search_body) as resp: - # self.assertEqual(resp.status, 400) - # reason = resp.reason - # self.assertEqual(reason, 'The following samples are available in seqr but missing the loaded data: NA19675, NA19678') - # - # search_body = get_hail_search_body(sample_data=MULTI_PROJECT_MISSING_SAMPLE_DATA) - # async with self.client.request('POST', '/search', json=search_body) as resp: - # self.assertEqual(resp.status, 400) - # reason = resp.reason - # self.assertEqual(reason, 'The following samples are available in seqr but missing the loaded data: NA19675, NA19678') - # - # search_body = get_hail_search_body( - # intervals=LOCATION_SEARCH['intervals'] + ['1:1-99999999999'], omit_sample_type='SV_WES', - # ) - # async with self.client.request('POST', '/search', json=search_body) as resp: - # self.assertEqual(resp.status, 400) - # reason = resp.reason - # self.assertEqual(reason, 'Invalid intervals: 1:1-99999999999') - # - # async def test_sort(self): - # await self._assert_expected_search( - # [_sorted(VARIANT2, [11, 11]), _sorted(VARIANT4, [11, 11]), _sorted(MULTI_FAMILY_VARIANT, [22, 24]), - # _sorted(VARIANT1, [None, None])], omit_sample_type='SV_WES', sort='protein_consequence', - # ) - # - # await self._assert_expected_search( - # [_sorted(SV_VARIANT1, [11]), _sorted(SV_VARIANT2, [12]), _sorted(SV_VARIANT3, [12]), _sorted(SV_VARIANT4, [12])], - # sample_data=SV_WGS_SAMPLE_DATA, sort='protein_consequence', - # ) - # - # await self._assert_expected_search( - # [_sorted(VARIANT4, [11, 11]), _sorted(SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, [11, 22]), - # _sorted(SELECTED_ANNOTATION_TRANSCRIPT_MULTI_FAMILY_VARIANT, [22, 22])], - # omit_sample_type='SV_WES', sort='protein_consequence', - # annotations={'other': ['non_coding_transcript_exon_variant'], 'splice_ai': '0'}, - # ) - # - # await self._assert_expected_search( - # [_sorted(VARIANT1, [4]), _sorted(VARIANT2, [8]), _sorted(MULTI_FAMILY_VARIANT, [12.5]), - # _sorted(VARIANT4, [12.5])], omit_sample_type='SV_WES', sort='pathogenicity', - # ) - # - # await self._assert_expected_search( - # [_sorted(VARIANT1, [4, None]), _sorted(VARIANT2, [8, 3]), _sorted(MULTI_FAMILY_VARIANT, [12.5, None]), - # _sorted(VARIANT4, [12.5, None])], omit_sample_type='SV_WES', sort='pathogenicity_hgmd', - # ) - # - # await self._assert_expected_search( - # [_sorted(VARIANT2, [0]), _sorted(VARIANT4, [0.00026519427774474025]), - # _sorted(VARIANT1, [0.034449315071105957]), _sorted(MULTI_FAMILY_VARIANT, [0.38041073083877563])], - # omit_sample_type='SV_WES', sort='gnomad', - # ) - # - # await self._assert_expected_search( - # [_sorted(VARIANT1, [0]), _sorted(MULTI_FAMILY_VARIANT, [0]), _sorted(VARIANT4, [0]), - # _sorted(VARIANT2, [0.28899794816970825])], omit_sample_type='SV_WES', sort='gnomad_exomes', - # ) - # - # await self._assert_expected_search( - # [_sorted(VARIANT4, [0.02222222276031971]), _sorted(VARIANT1, [0.10000000149011612]), - # _sorted(VARIANT2, [0.31111112236976624]), _sorted(MULTI_FAMILY_VARIANT, [0.6666666865348816])], - # omit_sample_type='SV_WES', sort='callset_af', - # ) - # - # await self._assert_expected_search( - # [_sorted(VARIANT4, [-29.899999618530273]), _sorted(VARIANT2, [-20.899999618530273]), - # _sorted(VARIANT1, [-4.668000221252441]), _sorted(MULTI_FAMILY_VARIANT, [-2.753999948501587]), ], - # omit_sample_type='SV_WES', sort='cadd', - # ) - # - # await self._assert_expected_search( - # [_sorted(VARIANT4, [-0.5260000228881836]), _sorted(VARIANT2, [-0.19699999690055847]), - # _sorted(VARIANT1, [None]), _sorted(MULTI_FAMILY_VARIANT, [None])], omit_sample_type='SV_WES', sort='revel', - # ) - # - # await self._assert_expected_search( - # [_sorted(MULTI_FAMILY_VARIANT, [-0.009999999776482582]), _sorted(VARIANT2, [0]), _sorted(VARIANT4, [0]), - # _sorted(VARIANT1, [None])], omit_sample_type='SV_WES', sort='splice_ai', - # ) - # - # await self._assert_expected_search( - # [_sorted(MULTI_FAMILY_VARIANT, [0, -2]), _sorted(VARIANT2, [0, -1]), _sorted(VARIANT4, [0, -1]), _sorted(VARIANT1, [1, 0])], - # omit_sample_type='SV_WES', sort='in_omim', sort_metadata=['ENSG00000177000', 'ENSG00000097046'], - # ) - # - # await self._assert_expected_search( - # [_sorted(VARIANT2, [0, -1]), _sorted(MULTI_FAMILY_VARIANT, [1, -1]), _sorted(VARIANT1, [1, 0]), _sorted(VARIANT4, [1, 0])], - # omit_sample_type='SV_WES', sort='in_omim', sort_metadata=['ENSG00000177000'], - # ) - # - # await self._assert_expected_search( - # [_sorted(VARIANT2, [2, 2]), _sorted(MULTI_FAMILY_VARIANT, [4, 2]), _sorted(VARIANT4, [4, 4]), - # _sorted(VARIANT1, [None, None])], omit_sample_type='SV_WES', sort='constraint', - # sort_metadata={'ENSG00000177000': 2, 'ENSG00000097046': 4}, - # ) - # - # await self._assert_expected_search( - # [_sorted(VARIANT2, [3, 3]), _sorted(MULTI_FAMILY_VARIANT, [None, 3]), _sorted(VARIANT1, [None, None]), - # _sorted(VARIANT4, [None, None])], omit_sample_type='SV_WES', sort='prioritized_gene', - # sort_metadata={'ENSG00000177000': 3}, - # ) - # - # # size sort only applies to SVs, so has no impact on other variants - # await self._assert_expected_search( - # [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], sort='size', omit_sample_type='SV_WES', - # ) - # - # await self._assert_expected_search( - # [_sorted(SV_VARIANT4, [-46343]), _sorted(SV_VARIANT1, [-104]), _sorted(SV_VARIANT2, [-50]), - # _sorted(SV_VARIANT3, [-50])], sample_data=SV_WGS_SAMPLE_DATA, sort='size', - # ) - # - # # sort applies to compound hets - # await self._assert_expected_search( - # [_sorted(VARIANT2, [11, 11]), [_sorted(VARIANT4, [11, 11]), _sorted(VARIANT3, [22, 24])]], - # sort='protein_consequence', inheritance_mode='recessive', omit_sample_type='SV_WES', **COMP_HET_ALL_PASS_FILTERS, - # ) - # - # await self._assert_expected_search( - # [[_sorted(VARIANT4, [-0.5260000228881836]), _sorted(VARIANT3, [None])], - # _sorted(VARIANT2, [-0.19699999690055847])], - # sort='revel', inheritance_mode='recessive', omit_sample_type='SV_WES', **COMP_HET_ALL_PASS_FILTERS, - # ) - # - # await self._assert_expected_search( - # [[_sorted(VARIANT3, [-0.009999999776482582]), _sorted(VARIANT4, [0])], _sorted(VARIANT2, [0])], - # sort='splice_ai', inheritance_mode='recessive', omit_sample_type='SV_WES', **COMP_HET_ALL_PASS_FILTERS, - # ) + async def test_multi_project_search(self): + await self._assert_expected_search( + [PROJECT_2_VARIANT, MULTI_PROJECT_VARIANT1, MULTI_PROJECT_VARIANT2, VARIANT3, VARIANT4], + gene_counts=GENE_COUNTS, sample_data=MULTI_PROJECT_SAMPLE_DATA, + ) + + async def test_inheritance_filter(self): + inheritance_mode = 'any_affected' + await self._assert_expected_search( + [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [SV_VARIANT1, SV_VARIANT2, SV_VARIANT3, SV_VARIANT4], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, + ) + + await self._assert_expected_search( + [SV_VARIANT2], inheritance_mode=inheritance_mode, annotations=NEW_SV_FILTER, sample_data=SV_WGS_SAMPLE_DATA, + ) + + inheritance_mode = 'de_novo' + await self._assert_expected_search( + [VARIANT1, FAMILY_3_VARIANT, VARIANT4], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [SV_VARIANT1], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, + ) + + inheritance_mode = 'x_linked_recessive' + await self._assert_expected_search([], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES') + await self._assert_expected_search([], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA) + + inheritance_mode = 'homozygous_recessive' + await self._assert_expected_search( + [VARIANT2], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [PROJECT_2_VARIANT1, VARIANT2], inheritance_mode=inheritance_mode, sample_data=MULTI_PROJECT_SAMPLE_DATA, + ) + + await self._assert_expected_search( + [SV_VARIANT4], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, + ) + + gt_inheritance_filter = {'genotype': {'I000006_hg00733': 'has_alt', 'I000005_hg00732': 'ref_ref'}} + await self._assert_expected_search( + [VARIANT2, VARIANT3], inheritance_filter=gt_inheritance_filter, sample_data=FAMILY_2_VARIANT_SAMPLE_DATA) + + inheritance_mode = 'compound_het' + await self._assert_expected_search( + [[VARIANT3, VARIANT4]], inheritance_mode=inheritance_mode, sample_data=MULTI_PROJECT_SAMPLE_DATA, gene_counts={ + 'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}}, + 'ENSG00000177000': {'total': 1, 'families': {'F000002_2': 1}}, + }, **COMP_HET_ALL_PASS_FILTERS, + ) + + await self._assert_expected_search( + [[SV_VARIANT1, SV_VARIANT2]], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, + **COMP_HET_ALL_PASS_FILTERS, + ) + + inheritance_mode = 'recessive' + await self._assert_expected_search( + [PROJECT_2_VARIANT1, VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode=inheritance_mode, gene_counts={ + 'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}}, + 'ENSG00000177000': {'total': 2, 'families': {'F000002_2': 2}}, + }, sample_data=MULTI_PROJECT_SAMPLE_DATA, **COMP_HET_ALL_PASS_FILTERS, + ) + + await self._assert_expected_search( + [[SV_VARIANT1, SV_VARIANT2], SV_VARIANT4], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, + **COMP_HET_ALL_PASS_FILTERS, + ) + + async def test_quality_filter(self): + quality_filter = {'vcf_filter': 'pass'} + await self._assert_expected_search( + [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', + ) + + await self._assert_expected_search([SV_VARIANT4], quality_filter=quality_filter, sample_data=SV_WGS_SAMPLE_DATA) + + await self._assert_expected_search( + [VARIANT2, MULTI_FAMILY_VARIANT], quality_filter={'min_gq': 40}, omit_sample_type='SV_WES', + ) + + sv_quality_filter = {'min_gq_sv': 40} + await self._assert_expected_search( + [SV_VARIANT3, SV_VARIANT4], quality_filter=sv_quality_filter, sample_data=SV_WGS_SAMPLE_DATA, + ) + + await self._assert_expected_search( + [], annotations=NEW_SV_FILTER, quality_filter=sv_quality_filter, sample_data=SV_WGS_SAMPLE_DATA, + ) + + await self._assert_expected_search( + [VARIANT2, MULTI_FAMILY_VARIANT], quality_filter={'min_gq': 40, 'vcf_filter': 'pass'}, omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT], quality_filter={'min_gq': 60, 'affected_only': True}, + omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [SV_VARIANT3, SV_VARIANT4], quality_filter={'min_gq_sv': 60, 'affected_only': True}, sample_data=SV_WGS_SAMPLE_DATA, + ) + + await self._assert_expected_search( + [VARIANT1, VARIANT2, FAMILY_3_VARIANT], quality_filter={'min_ab': 50}, omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [VARIANT2, VARIANT3], quality_filter={'min_ab': 70, 'affected_only': True}, + omit_sample_type='SV_WES', + ) + + quality_filter = {'min_gq': 40, 'min_ab': 50} + await self._assert_expected_search( + [VARIANT2, FAMILY_3_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', + ) + + annotations = {'splice_ai': '0.0'} # Ensures no variants are filtered out by annotation/path filters + await self._assert_expected_search( + [VARIANT1, VARIANT2, FAMILY_3_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', + annotations=annotations, pathogenicity={'clinvar': ['likely_pathogenic', 'vus_or_conflicting']}, + ) + + await self._assert_expected_search( + [VARIANT2, FAMILY_3_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', + annotations=annotations, pathogenicity={'clinvar': ['pathogenic']}, + ) + + async def test_location_search(self): + await self._assert_expected_search( + [VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], omit_sample_type='SV_WES', **LOCATION_SEARCH, + ) + + sv_intervals = ['1:9310023-9380264'] + await self._assert_expected_search( + [SV_VARIANT1, SV_VARIANT2], sample_data=SV_WGS_SAMPLE_DATA, intervals=sv_intervals, gene_ids=['ENSG00000171621'], + ) + + await self._assert_expected_search( + [VARIANT1], omit_sample_type='SV_WES', **EXCLUDE_LOCATION_SEARCH, + ) + + await self._assert_expected_search( + [SV_VARIANT3, SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, intervals=sv_intervals, exclude_intervals=True, + ) + + await self._assert_expected_search( + [SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT], omit_sample_type='SV_WES', + intervals=LOCATION_SEARCH['intervals'][-1:], gene_ids=LOCATION_SEARCH['gene_ids'][:1] + ) + + async def test_variant_id_search(self): + await self._assert_expected_search([VARIANT2], omit_sample_type='SV_WES', **RSID_SEARCH) + + await self._assert_expected_search([VARIANT1], omit_sample_type='SV_WES', **VARIANT_ID_SEARCH) + + await self._assert_expected_search( + [VARIANT1], omit_sample_type='SV_WES', variant_ids=VARIANT_ID_SEARCH['variant_ids'][:1], + ) + + await self._assert_expected_search( + [], omit_sample_type='SV_WES', variant_ids=VARIANT_ID_SEARCH['variant_ids'][1:], + ) + + await self._assert_expected_search([SV_VARIANT2, SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, variant_keys=[ + 'cohort_2911.chr1.final_cleanup_INS_chr1_160', 'phase2_DEL_chr14_4640', + ]) + + async def test_frequency_filter(self): + await self._assert_expected_search( + [VARIANT1, VARIANT4], frequencies={'seqr': {'af': 0.2}}, omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [MULTI_FAMILY_VARIANT, VARIANT4], frequencies={'seqr': {'ac': 4}}, omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [MULTI_FAMILY_VARIANT, VARIANT4], frequencies={'seqr': {'hh': 1}}, omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [VARIANT4], frequencies={'seqr': {'ac': 4, 'hh': 0}}, omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [SV_VARIANT1], frequencies={'sv_callset': {'af': 0.05}}, sample_data=SV_WGS_SAMPLE_DATA, + ) + + await self._assert_expected_search( + [VARIANT1, VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.05}}, omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.05, 'hh': 1}}, omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.005}}, omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [SV_VARIANT1, SV_VARIANT3, SV_VARIANT4], frequencies={'gnomad_svs': {'af': 0.001}}, sample_data=SV_WGS_SAMPLE_DATA, + ) + + await self._assert_expected_search( + [VARIANT4], frequencies={'seqr': {'af': 0.2}, 'gnomad_genomes': {'ac': 50}}, + omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], frequencies={'seqr': {}, 'gnomad_genomes': {'af': None}}, + omit_sample_type='SV_WES', + ) + + annotations = {'splice_ai': '0.0'} # Ensures no variants are filtered out by annotation/path filters + await self._assert_expected_search( + [VARIANT1, VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.01}}, omit_sample_type='SV_WES', + annotations=annotations, pathogenicity={'clinvar': ['pathogenic', 'likely_pathogenic', 'vus_or_conflicting']}, + ) + + await self._assert_expected_search( + [VARIANT2, VARIANT4], frequencies={'gnomad_genomes': {'af': 0.01}}, omit_sample_type='SV_WES', + annotations=annotations, pathogenicity={'clinvar': ['pathogenic', 'vus_or_conflicting']}, + ) + + async def test_annotations_filter(self): + await self._assert_expected_search([VARIANT2], pathogenicity={'hgmd': ['hgmd_other']}, omit_sample_type='SV_WES') + + pathogenicity = {'clinvar': ['likely_pathogenic', 'vus_or_conflicting', 'benign']} + await self._assert_expected_search([VARIANT1, VARIANT2], pathogenicity=pathogenicity, omit_sample_type='SV_WES') + + pathogenicity['clinvar'] = pathogenicity['clinvar'][:1] + await self._assert_expected_search( + [VARIANT1, VARIANT4], pathogenicity=pathogenicity, annotations={'SCREEN': ['CTCF-only', 'DNase-only']}, + omit_sample_type='SV_WES', + ) + + annotations = { + 'missense': ['missense_variant'], 'in_frame': ['inframe_insertion', 'inframe_deletion'], 'frameshift': None, + 'structural_consequence': ['INTRONIC'], + } + await self._assert_expected_search( + [VARIANT1, VARIANT2, VARIANT4], pathogenicity=pathogenicity, annotations=annotations, omit_sample_type='SV_WES', + ) + + await self._assert_expected_search([VARIANT2, VARIANT4], annotations=annotations, omit_sample_type='SV_WES') + + await self._assert_expected_search([SV_VARIANT1], annotations=annotations, sample_data=SV_WGS_SAMPLE_DATA) + + annotations['splice_ai'] = '0.005' + await self._assert_expected_search( + [VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], annotations=annotations, omit_sample_type='SV_WES', + ) + + annotations['structural'] = ['DEL'] + await self._assert_expected_search([SV_VARIANT1, SV_VARIANT4], annotations=annotations, sample_data=SV_WGS_SAMPLE_DATA) + + annotations = {'other': ['non_coding_transcript_exon_variant']} + await self._assert_expected_search( + [VARIANT1, SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, SELECTED_ANNOTATION_TRANSCRIPT_MULTI_FAMILY_VARIANT], + pathogenicity=pathogenicity, annotations=annotations, omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, SELECTED_TRANSCRIPT_MULTI_FAMILY_VARIANT], + gene_ids=LOCATION_SEARCH['gene_ids'][:1], annotations=annotations, omit_sample_type='SV_WES', + ) + + async def test_secondary_annotations_filter(self): + annotations_1 = {'missense': ['missense_variant']} + annotations_2 = {'other': ['intron_variant']} + + await self._assert_expected_search( + [[VARIANT3, VARIANT4]], inheritance_mode='compound_het', omit_sample_type='SV_WES', + annotations=annotations_1, annotations_secondary=annotations_2, + ) + + await self._assert_expected_search( + [VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES', + annotations=annotations_1, annotations_secondary=annotations_2, + ) + + await self._assert_expected_search( + [[VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES', + annotations=annotations_2, annotations_secondary=annotations_1, + ) + + sv_annotations_1 = {'structural': ['INS']} + sv_annotations_2 = {'structural': ['DEL'], 'structural_consequence': ['INTRONIC']} + + await self._assert_expected_search( + [[SV_VARIANT1, SV_VARIANT2]], sample_data=SV_WGS_SAMPLE_DATA, inheritance_mode='compound_het', + annotations=sv_annotations_1, annotations_secondary=sv_annotations_2, + ) + + await self._assert_expected_search( + [[SV_VARIANT1, SV_VARIANT2], SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, inheritance_mode='recessive', + annotations=sv_annotations_2, annotations_secondary=sv_annotations_1, + ) + + pathogenicity = {'clinvar': ['likely_pathogenic', 'vus_or_conflicting']} + await self._assert_expected_search( + [VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES', + annotations=annotations_2, annotations_secondary=annotations_1, pathogenicity=pathogenicity, + ) + + screen_annotations = {'SCREEN': ['CTCF-only']} + await self._assert_expected_search( + [], inheritance_mode='recessive', omit_sample_type='SV_WES', + annotations=screen_annotations, annotations_secondary=annotations_1, + ) + + await self._assert_expected_search( + [[VARIANT3, VARIANT4]], inheritance_mode='recessive', omit_sample_type='SV_WES', + annotations=screen_annotations, annotations_secondary=annotations_2, + ) + + selected_transcript_annotations = {'other': ['non_coding_transcript_exon_variant']} + await self._assert_expected_search( + [VARIANT2, [SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_3, VARIANT4]], inheritance_mode='recessive', + annotations=screen_annotations, annotations_secondary=selected_transcript_annotations, + pathogenicity=pathogenicity, omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, [SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_3, VARIANT4]], + annotations={**selected_transcript_annotations, **screen_annotations}, annotations_secondary=annotations_2, + inheritance_mode='recessive', omit_sample_type='SV_WES', + ) + + async def test_in_silico_filter(self): + in_silico = {'eigen': '5.5', 'mut_taster': 'P'} + await self._assert_expected_search( + [VARIANT1, VARIANT2, VARIANT4], in_silico=in_silico, omit_sample_type='SV_WES', + ) + + in_silico['requireScore'] = True + await self._assert_expected_search( + [VARIANT2, VARIANT4], in_silico=in_silico, omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, in_silico={'strvctvre': 0.1, 'requireScore': True}, + ) + + async def test_search_errors(self): + search_body = get_hail_search_body(sample_data=FAMILY_2_MISSING_SAMPLE_DATA) + async with self.client.request('POST', '/search', json=search_body) as resp: + self.assertEqual(resp.status, 400) + reason = resp.reason + self.assertEqual(reason, 'The following samples are available in seqr but missing the loaded data: NA19675, NA19678') + + search_body = get_hail_search_body(sample_data=MULTI_PROJECT_MISSING_SAMPLE_DATA) + async with self.client.request('POST', '/search', json=search_body) as resp: + self.assertEqual(resp.status, 400) + reason = resp.reason + self.assertEqual(reason, 'The following samples are available in seqr but missing the loaded data: NA19675, NA19678') + + search_body = get_hail_search_body( + intervals=LOCATION_SEARCH['intervals'] + ['1:1-99999999999'], omit_sample_type='SV_WES', + ) + async with self.client.request('POST', '/search', json=search_body) as resp: + self.assertEqual(resp.status, 400) + reason = resp.reason + self.assertEqual(reason, 'Invalid intervals: 1:1-99999999999') + + async def test_sort(self): + await self._assert_expected_search( + [_sorted(VARIANT2, [11, 11]), _sorted(VARIANT4, [11, 11]), _sorted(MULTI_FAMILY_VARIANT, [22, 24]), + _sorted(VARIANT1, [None, None])], omit_sample_type='SV_WES', sort='protein_consequence', + ) + + await self._assert_expected_search( + [_sorted(SV_VARIANT1, [11]), _sorted(SV_VARIANT2, [12]), _sorted(SV_VARIANT3, [12]), _sorted(SV_VARIANT4, [12])], + sample_data=SV_WGS_SAMPLE_DATA, sort='protein_consequence', + ) + + await self._assert_expected_search( + [_sorted(VARIANT4, [11, 11]), _sorted(SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, [11, 22]), + _sorted(SELECTED_ANNOTATION_TRANSCRIPT_MULTI_FAMILY_VARIANT, [22, 22])], + omit_sample_type='SV_WES', sort='protein_consequence', + annotations={'other': ['non_coding_transcript_exon_variant'], 'splice_ai': '0'}, + ) + + await self._assert_expected_search( + [_sorted(VARIANT1, [4]), _sorted(VARIANT2, [8]), _sorted(MULTI_FAMILY_VARIANT, [12.5]), + _sorted(VARIANT4, [12.5])], omit_sample_type='SV_WES', sort='pathogenicity', + ) + + await self._assert_expected_search( + [_sorted(VARIANT1, [4, None]), _sorted(VARIANT2, [8, 3]), _sorted(MULTI_FAMILY_VARIANT, [12.5, None]), + _sorted(VARIANT4, [12.5, None])], omit_sample_type='SV_WES', sort='pathogenicity_hgmd', + ) + + await self._assert_expected_search( + [_sorted(VARIANT2, [0]), _sorted(VARIANT4, [0.00026519427774474025]), + _sorted(VARIANT1, [0.034449315071105957]), _sorted(MULTI_FAMILY_VARIANT, [0.38041073083877563])], + omit_sample_type='SV_WES', sort='gnomad', + ) + + await self._assert_expected_search( + [_sorted(VARIANT1, [0]), _sorted(MULTI_FAMILY_VARIANT, [0]), _sorted(VARIANT4, [0]), + _sorted(VARIANT2, [0.28899794816970825])], omit_sample_type='SV_WES', sort='gnomad_exomes', + ) + + await self._assert_expected_search( + [_sorted(VARIANT4, [0.02222222276031971]), _sorted(VARIANT1, [0.10000000149011612]), + _sorted(VARIANT2, [0.31111112236976624]), _sorted(MULTI_FAMILY_VARIANT, [0.6666666865348816])], + omit_sample_type='SV_WES', sort='callset_af', + ) + + await self._assert_expected_search( + [_sorted(VARIANT4, [-29.899999618530273]), _sorted(VARIANT2, [-20.899999618530273]), + _sorted(VARIANT1, [-4.668000221252441]), _sorted(MULTI_FAMILY_VARIANT, [-2.753999948501587]), ], + omit_sample_type='SV_WES', sort='cadd', + ) + + await self._assert_expected_search( + [_sorted(VARIANT4, [-0.5260000228881836]), _sorted(VARIANT2, [-0.19699999690055847]), + _sorted(VARIANT1, [None]), _sorted(MULTI_FAMILY_VARIANT, [None])], omit_sample_type='SV_WES', sort='revel', + ) + + await self._assert_expected_search( + [_sorted(MULTI_FAMILY_VARIANT, [-0.009999999776482582]), _sorted(VARIANT2, [0]), _sorted(VARIANT4, [0]), + _sorted(VARIANT1, [None])], omit_sample_type='SV_WES', sort='splice_ai', + ) + + await self._assert_expected_search( + [_sorted(MULTI_FAMILY_VARIANT, [0, -2]), _sorted(VARIANT2, [0, -1]), _sorted(VARIANT4, [0, -1]), _sorted(VARIANT1, [1, 0])], + omit_sample_type='SV_WES', sort='in_omim', sort_metadata=['ENSG00000177000', 'ENSG00000097046'], + ) + + await self._assert_expected_search( + [_sorted(VARIANT2, [0, -1]), _sorted(MULTI_FAMILY_VARIANT, [1, -1]), _sorted(VARIANT1, [1, 0]), _sorted(VARIANT4, [1, 0])], + omit_sample_type='SV_WES', sort='in_omim', sort_metadata=['ENSG00000177000'], + ) + + await self._assert_expected_search( + [_sorted(VARIANT2, [2, 2]), _sorted(MULTI_FAMILY_VARIANT, [4, 2]), _sorted(VARIANT4, [4, 4]), + _sorted(VARIANT1, [None, None])], omit_sample_type='SV_WES', sort='constraint', + sort_metadata={'ENSG00000177000': 2, 'ENSG00000097046': 4}, + ) + + await self._assert_expected_search( + [_sorted(VARIANT2, [3, 3]), _sorted(MULTI_FAMILY_VARIANT, [None, 3]), _sorted(VARIANT1, [None, None]), + _sorted(VARIANT4, [None, None])], omit_sample_type='SV_WES', sort='prioritized_gene', + sort_metadata={'ENSG00000177000': 3}, + ) + + # size sort only applies to SVs, so has no impact on other variants + await self._assert_expected_search( + [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], sort='size', omit_sample_type='SV_WES', + ) + + await self._assert_expected_search( + [_sorted(SV_VARIANT4, [-46343]), _sorted(SV_VARIANT1, [-104]), _sorted(SV_VARIANT2, [-50]), + _sorted(SV_VARIANT3, [-50])], sample_data=SV_WGS_SAMPLE_DATA, sort='size', + ) + + # sort applies to compound hets + await self._assert_expected_search( + [_sorted(VARIANT2, [11, 11]), [_sorted(VARIANT4, [11, 11]), _sorted(VARIANT3, [22, 24])]], + sort='protein_consequence', inheritance_mode='recessive', omit_sample_type='SV_WES', **COMP_HET_ALL_PASS_FILTERS, + ) + + await self._assert_expected_search( + [[_sorted(VARIANT4, [-0.5260000228881836]), _sorted(VARIANT3, [None])], + _sorted(VARIANT2, [-0.19699999690055847])], + sort='revel', inheritance_mode='recessive', omit_sample_type='SV_WES', **COMP_HET_ALL_PASS_FILTERS, + ) + + await self._assert_expected_search( + [[_sorted(VARIANT3, [-0.009999999776482582]), _sorted(VARIANT4, [0])], _sorted(VARIANT2, [0])], + sort='splice_ai', inheritance_mode='recessive', omit_sample_type='SV_WES', **COMP_HET_ALL_PASS_FILTERS, + ) From 7f5863af9285f8c4b2bab7f042382385b62d0461 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 5 Sep 2023 13:05:37 -0400 Subject: [PATCH 14/16] add initial gcnv filter tests --- hail_search/test_search.py | 123 ++++++++++++++++++++++++++++++++++--- 1 file changed, 115 insertions(+), 8 deletions(-) diff --git a/hail_search/test_search.py b/hail_search/test_search.py index 2a94444220..2c9caf94d2 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -200,10 +200,19 @@ async def test_inheritance_filter(self): [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES', ) + await self._assert_expected_search( + [GCNV_VARIANT1, GCNV_VARIANT2, GCNV_VARIANT3, GCNV_VARIANT4], inheritance_mode=inheritance_mode, + omit_sample_type='VARIANTS', + ) + await self._assert_expected_search( [SV_VARIANT1, SV_VARIANT2, SV_VARIANT3, SV_VARIANT4], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, ) + await self._assert_expected_search( + [GCNV_VARIANT3], inheritance_mode=inheritance_mode, annotations=NEW_SV_FILTER, omit_sample_type='VARIANTS', + ) + await self._assert_expected_search( [SV_VARIANT2], inheritance_mode=inheritance_mode, annotations=NEW_SV_FILTER, sample_data=SV_WGS_SAMPLE_DATA, ) @@ -213,12 +222,17 @@ async def test_inheritance_filter(self): [VARIANT1, FAMILY_3_VARIANT, VARIANT4], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES', ) + await self._assert_expected_search( + [GCNV_VARIANT1], inheritance_mode=inheritance_mode, omit_sample_type='VARIANTS', + ) + await self._assert_expected_search( [SV_VARIANT1], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, ) inheritance_mode = 'x_linked_recessive' await self._assert_expected_search([], inheritance_mode=inheritance_mode, omit_sample_type='SV_WES') + await self._assert_expected_search([], inheritance_mode=inheritance_mode, omit_sample_type='VARIANTS') await self._assert_expected_search([], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA) inheritance_mode = 'homozygous_recessive' @@ -230,6 +244,10 @@ async def test_inheritance_filter(self): [PROJECT_2_VARIANT1, VARIANT2], inheritance_mode=inheritance_mode, sample_data=MULTI_PROJECT_SAMPLE_DATA, ) + await self._assert_expected_search( + [GCNV_VARIANT1], inheritance_mode=inheritance_mode, omit_sample_type='VARIANTS', + ) + await self._assert_expected_search( [SV_VARIANT4], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, ) @@ -246,9 +264,17 @@ async def test_inheritance_filter(self): }, **COMP_HET_ALL_PASS_FILTERS, ) + await self._assert_expected_search( + [[GCNV_VARIANT3, GCNV_VARIANT4]], inheritance_mode=inheritance_mode, omit_sample_type='VARIANTS', gene_counts={ + 'ENSG00000275023': {'total': 2, 'families': {'F000002_2': 2}}, + 'ENSG00000277258': {'total': 1, 'families': {'F000002_2': 1}}, + 'ENSG00000277972': {'total': 1, 'families': {'F000002_2': 1}}, + }, **COMP_HET_ALL_PASS_FILTERS, + ) + await self._assert_expected_search( [[SV_VARIANT1, SV_VARIANT2]], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, - **COMP_HET_ALL_PASS_FILTERS, + **COMP_HET_ALL_PASS_FILTERS, gene_counts={'ENSG00000171621': {'total': 2, 'families': {'F000002_2': 2}}}, ) inheritance_mode = 'recessive' @@ -259,9 +285,20 @@ async def test_inheritance_filter(self): }, sample_data=MULTI_PROJECT_SAMPLE_DATA, **COMP_HET_ALL_PASS_FILTERS, ) + await self._assert_expected_search( + [GCNV_VARIANT1, [GCNV_VARIANT3, GCNV_VARIANT4]], inheritance_mode=inheritance_mode, omit_sample_type='VARIANTS', gene_counts={ + 'ENSG00000275023': {'total': 2, 'families': {'F000002_2': 2}}, + 'ENSG00000277258': {'total': 1, 'families': {'F000002_2': 1}}, + 'ENSG00000277972': {'total': 1, 'families': {'F000002_2': 1}}, + }, **COMP_HET_ALL_PASS_FILTERS, + ) + await self._assert_expected_search( [[SV_VARIANT1, SV_VARIANT2], SV_VARIANT4], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, - **COMP_HET_ALL_PASS_FILTERS, + **COMP_HET_ALL_PASS_FILTERS, gene_counts={ + 'ENSG00000171621': {'total': 2, 'families': {'F000002_2': 2}}, + 'ENSG00000184986': {'total': 1, 'families': {'F000002_2': 1}}, + } ) async def test_quality_filter(self): @@ -270,12 +307,26 @@ async def test_quality_filter(self): [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT], quality_filter=quality_filter, omit_sample_type='SV_WES', ) + await self._assert_expected_search( + [GCNV_VARIANT1, GCNV_VARIANT2, GCNV_VARIANT3, GCNV_VARIANT4], quality_filter=quality_filter, + omit_sample_type='VARIANTS', + ) + await self._assert_expected_search([SV_VARIANT4], quality_filter=quality_filter, sample_data=SV_WGS_SAMPLE_DATA) await self._assert_expected_search( [VARIANT2, MULTI_FAMILY_VARIANT], quality_filter={'min_gq': 40}, omit_sample_type='SV_WES', ) + gcnv_quality_filter = {'min_qs': 20} + await self._assert_expected_search( + [GCNV_VARIANT1, GCNV_VARIANT2, GCNV_VARIANT4], quality_filter=gcnv_quality_filter, omit_sample_type='VARIANTS', + ) + + await self._assert_expected_search( + [], annotations=NEW_SV_FILTER, quality_filter=gcnv_quality_filter, omit_sample_type='VARIANTS', + ) + sv_quality_filter = {'min_gq_sv': 40} await self._assert_expected_search( [SV_VARIANT3, SV_VARIANT4], quality_filter=sv_quality_filter, sample_data=SV_WGS_SAMPLE_DATA, @@ -294,6 +345,11 @@ async def test_quality_filter(self): omit_sample_type='SV_WES', ) + await self._assert_expected_search( + [GCNV_VARIANT1, GCNV_VARIANT2, GCNV_VARIANT3, GCNV_VARIANT4], quality_filter={'min_qs': 10, 'affected_only': True}, + omit_sample_type='VARIANTS', + ) + await self._assert_expected_search( [SV_VARIANT3, SV_VARIANT4], quality_filter={'min_gq_sv': 60, 'affected_only': True}, sample_data=SV_WGS_SAMPLE_DATA, ) @@ -328,7 +384,11 @@ async def test_location_search(self): [VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], omit_sample_type='SV_WES', **LOCATION_SEARCH, ) - sv_intervals = ['1:9310023-9380264'] + sv_intervals = ['1:9310023-9380264', '17:38717636-38724781'] + await self._assert_expected_search( + [GCNV_VARIANT3, GCNV_VARIANT4], intervals=sv_intervals, gene_ids=['ENSG00000275023'], omit_sample_type='VARIANTS', + ) + await self._assert_expected_search( [SV_VARIANT1, SV_VARIANT2], sample_data=SV_WGS_SAMPLE_DATA, intervals=sv_intervals, gene_ids=['ENSG00000171621'], ) @@ -337,6 +397,10 @@ async def test_location_search(self): [VARIANT1], omit_sample_type='SV_WES', **EXCLUDE_LOCATION_SEARCH, ) + await self._assert_expected_search( + [GCNV_VARIANT1, GCNV_VARIANT2], intervals=sv_intervals, exclude_intervals=True, omit_sample_type='VARIANTS', + ) + await self._assert_expected_search( [SV_VARIANT3, SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, intervals=sv_intervals, exclude_intervals=True, ) @@ -359,6 +423,10 @@ async def test_variant_id_search(self): [], omit_sample_type='SV_WES', variant_ids=VARIANT_ID_SEARCH['variant_ids'][1:], ) + await self._assert_expected_search([GCNV_VARIANT1, GCNV_VARIANT4], omit_sample_type='VARIANTS', variant_keys=[ + 'suffix_95340_DUP', 'suffix_140608_DUP', + ]) + await self._assert_expected_search([SV_VARIANT2, SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, variant_keys=[ 'cohort_2911.chr1.final_cleanup_INS_chr1_160', 'phase2_DEL_chr14_4640', ]) @@ -380,8 +448,13 @@ async def test_frequency_filter(self): [VARIANT4], frequencies={'seqr': {'ac': 4, 'hh': 0}}, omit_sample_type='SV_WES', ) + sv_callset_filter = {'sv_callset': {'af': 0.05}} + await self._assert_expected_search( + [GCNV_VARIANT2, GCNV_VARIANT3, GCNV_VARIANT4], frequencies=sv_callset_filter, omit_sample_type='VARIANTS', + ) + await self._assert_expected_search( - [SV_VARIANT1], frequencies={'sv_callset': {'af': 0.05}}, sample_data=SV_WGS_SAMPLE_DATA, + [SV_VARIANT1], frequencies=sv_callset_filter, sample_data=SV_WGS_SAMPLE_DATA, ) await self._assert_expected_search( @@ -435,7 +508,7 @@ async def test_annotations_filter(self): annotations = { 'missense': ['missense_variant'], 'in_frame': ['inframe_insertion', 'inframe_deletion'], 'frameshift': None, - 'structural_consequence': ['INTRONIC'], + 'structural_consequence': ['INTRONIC', 'LOF'], } await self._assert_expected_search( [VARIANT1, VARIANT2, VARIANT4], pathogenicity=pathogenicity, annotations=annotations, omit_sample_type='SV_WES', @@ -443,6 +516,10 @@ async def test_annotations_filter(self): await self._assert_expected_search([VARIANT2, VARIANT4], annotations=annotations, omit_sample_type='SV_WES') + await self._assert_expected_search( + [GCNV_VARIANT3, GCNV_VARIANT4], annotations=annotations, omit_sample_type='VARIANTS', + ) + await self._assert_expected_search([SV_VARIANT1], annotations=annotations, sample_data=SV_WGS_SAMPLE_DATA) annotations['splice_ai'] = '0.005' @@ -453,6 +530,11 @@ async def test_annotations_filter(self): annotations['structural'] = ['DEL'] await self._assert_expected_search([SV_VARIANT1, SV_VARIANT4], annotations=annotations, sample_data=SV_WGS_SAMPLE_DATA) + await self._assert_expected_search([], annotations=annotations, omit_sample_type='VARIANTS') + + annotations['structural'].append('gCNV_DEL') + await self._assert_expected_search([GCNV_VARIANT3], annotations=annotations, omit_sample_type='VARIANTS') + annotations = {'other': ['non_coding_transcript_exon_variant']} await self._assert_expected_search( [VARIANT1, SELECTED_ANNOTATION_TRANSCRIPT_VARIANT_2, SELECTED_ANNOTATION_TRANSCRIPT_MULTI_FAMILY_VARIANT], @@ -483,8 +565,18 @@ async def test_secondary_annotations_filter(self): annotations=annotations_2, annotations_secondary=annotations_1, ) - sv_annotations_1 = {'structural': ['INS']} - sv_annotations_2 = {'structural': ['DEL'], 'structural_consequence': ['INTRONIC']} + sv_annotations_1 = {'structural': ['INS', 'LOF']} + sv_annotations_2 = {'structural': ['DEL', 'gCNV_DUP'], 'structural_consequence': ['INTRONIC']} + + await self._assert_expected_search( + [[GCNV_VARIANT3, GCNV_VARIANT4]], omit_sample_type='VARIANTS', inheritance_mode='compound_het', + annotations=sv_annotations_1, annotations_secondary=sv_annotations_2, + ) + + await self._assert_expected_search( + [GCNV_VARIANT1, [GCNV_VARIANT3, GCNV_VARIANT4]], omit_sample_type='VARIANTS', inheritance_mode='recessive', + annotations=sv_annotations_2, annotations_secondary=sv_annotations_1, + ) await self._assert_expected_search( [[SV_VARIANT1, SV_VARIANT2]], sample_data=SV_WGS_SAMPLE_DATA, inheritance_mode='compound_het', @@ -537,8 +629,13 @@ async def test_in_silico_filter(self): [VARIANT2, VARIANT4], in_silico=in_silico, omit_sample_type='SV_WES', ) + sv_in_silico = {'strvctvre': 0.1, 'requireScore': True} await self._assert_expected_search( - [SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, in_silico={'strvctvre': 0.1, 'requireScore': True}, + [GCNV_VARIANT1, GCNV_VARIANT2, GCNV_VARIANT3, GCNV_VARIANT4], omit_sample_type='VARIANTS', in_silico=sv_in_silico, + ) + + await self._assert_expected_search( + [SV_VARIANT4], sample_data=SV_WGS_SAMPLE_DATA, in_silico=sv_in_silico, ) async def test_search_errors(self): @@ -568,6 +665,11 @@ async def test_sort(self): _sorted(VARIANT1, [None, None])], omit_sample_type='SV_WES', sort='protein_consequence', ) + await self._assert_expected_search( + [_sorted(GCNV_VARIANT2, [0]), _sorted(GCNV_VARIANT3, [0]), _sorted(GCNV_VARIANT4, [0]), + _sorted(GCNV_VARIANT1, [3])], omit_sample_type='VARIANTS', sort='protein_consequence', + ) + await self._assert_expected_search( [_sorted(SV_VARIANT1, [11]), _sorted(SV_VARIANT2, [12]), _sorted(SV_VARIANT3, [12]), _sorted(SV_VARIANT4, [12])], sample_data=SV_WGS_SAMPLE_DATA, sort='protein_consequence', @@ -650,6 +752,11 @@ async def test_sort(self): [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], sort='size', omit_sample_type='SV_WES', ) + await self._assert_expected_search( + [_sorted(GCNV_VARIANT1, [-30886]), _sorted(GCNV_VARIANT4, [-13922]), _sorted(GCNV_VARIANT2, [-6834]), + _sorted(GCNV_VARIANT3, [-2309])], omit_sample_type='VARIANTS', sort='protein_consequence', + ) + await self._assert_expected_search( [_sorted(SV_VARIANT4, [-46343]), _sorted(SV_VARIANT1, [-104]), _sorted(SV_VARIANT2, [-50]), _sorted(SV_VARIANT3, [-50])], sample_data=SV_WGS_SAMPLE_DATA, sort='size', From 420c5e84a0b6831bd327dcb29c5a17a85a2ae043 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 5 Sep 2023 15:38:06 -0400 Subject: [PATCH 15/16] fix unit tests --- hail_search/hail_search_query.py | 19 ++++++++-------- hail_search/test_search.py | 38 ++++++++++++++++---------------- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py index dd4974d39c..fd0e81fe61 100644 --- a/hail_search/hail_search_query.py +++ b/hail_search/hail_search_query.py @@ -658,7 +658,7 @@ def _filter_compound_hets(self): ch_ht = ch_ht.filter(ch_ht.comp_het_family_entries.any(hl.is_defined)) # Get possible pairs of variants within the same gene - ch_ht = ch_ht.annotate(gene_ids=self._gene_ids_expr(ch_ht)) + ch_ht = ch_ht.annotate(gene_ids=self._gene_ids_expr(ch_ht, comp_het=True)) ch_ht = ch_ht.explode(ch_ht.gene_ids) formatted_rows_expr = hl.agg.collect(ch_ht.row) if HAS_ALLOWED_SECONDARY_ANNOTATION in self._ht.row: @@ -693,7 +693,7 @@ def _filter_compound_hets(self): return ch_ht @classmethod - def _gene_ids_expr(cls, ht): + def _gene_ids_expr(cls, ht, comp_het=False): return hl.set(ht[cls.TRANSCRIPTS_FIELD].map(lambda t: t.gene_id)) def _is_valid_comp_het_family(self, entries_1, entries_2): @@ -1228,10 +1228,10 @@ class GcnvHailTableQuery(SvHailTableQuery): POPULATIONS = {k: v for k, v in SvHailTableQuery.POPULATIONS.items() if k != 'gnomad_svs'} @classmethod - def _get_genotype_override_field(cls, r, field): + def _get_genotype_override_field(cls, r, field, family_entries_field=None): agg, get_default = cls.GENOTYPE_OVERRIDE_FIELDS[field] sample_field = f'sample_{field}' - entries = r.family_entries.flatmap(lambda x: x) + entries = r[family_entries_field or 'family_entries'].flatmap(lambda x: x) return hl.if_else( entries.any(lambda g: hl.is_defined(g.GT) & hl.is_missing(g[sample_field])), get_default(r), agg(entries.map(lambda g: g[sample_field])) @@ -1247,11 +1247,12 @@ def get_allowed_sv_type_ids(self, sv_types): ]) @classmethod - def _gene_ids_expr(cls, ht): - gene_ids_expr = getattr(ht, 'gene_ids', None) - if gene_ids_expr is None: - gene_ids_expr = cls._get_genotype_override_field(ht, 'gene_ids') - return hl.or_else(gene_ids_expr, super()._gene_ids_expr(ht)) + def _gene_ids_expr(cls, ht, comp_het=False): + family_entries_field = 'comp_het_family_entries' if comp_het else None + return hl.or_else( + cls._get_genotype_override_field(ht, 'gene_ids', family_entries_field=family_entries_field), + super()._gene_ids_expr(ht), + ) def _additional_annotation_fields(self): return {} diff --git a/hail_search/test_search.py b/hail_search/test_search.py index 2c9caf94d2..e162353634 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -99,7 +99,7 @@ # Ensures no variants are filtered out by annotation/path filters for compound hets COMP_HET_ALL_PASS_FILTERS = { 'annotations': {'splice_ai': '0.0'}, 'pathogenicity': {'clinvar': ['likely_pathogenic']}, - 'structural': ['DEL', 'CPX', 'INS'], + 'structural': ['DEL', 'CPX', 'INS', 'gCNV_DEL', 'gCNV_DUP'], } NEW_SV_FILTER = {'new_structural_variants': ['NEW']} @@ -245,7 +245,7 @@ async def test_inheritance_filter(self): ) await self._assert_expected_search( - [GCNV_VARIANT1], inheritance_mode=inheritance_mode, omit_sample_type='VARIANTS', + [GCNV_VARIANT3], inheritance_mode=inheritance_mode, omit_sample_type='VARIANTS', ) await self._assert_expected_search( @@ -274,7 +274,7 @@ async def test_inheritance_filter(self): await self._assert_expected_search( [[SV_VARIANT1, SV_VARIANT2]], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, - **COMP_HET_ALL_PASS_FILTERS, gene_counts={'ENSG00000171621': {'total': 2, 'families': {'F000002_2': 2}}}, + **COMP_HET_ALL_PASS_FILTERS, gene_counts={'ENSG00000171621': {'total': 2, 'families': {'F000011_11': 2}}}, ) inheritance_mode = 'recessive' @@ -286,8 +286,8 @@ async def test_inheritance_filter(self): ) await self._assert_expected_search( - [GCNV_VARIANT1, [GCNV_VARIANT3, GCNV_VARIANT4]], inheritance_mode=inheritance_mode, omit_sample_type='VARIANTS', gene_counts={ - 'ENSG00000275023': {'total': 2, 'families': {'F000002_2': 2}}, + [GCNV_VARIANT3, [GCNV_VARIANT3, GCNV_VARIANT4]], inheritance_mode=inheritance_mode, omit_sample_type='VARIANTS', gene_counts={ + 'ENSG00000275023': {'total': 3, 'families': {'F000002_2': 3}}, 'ENSG00000277258': {'total': 1, 'families': {'F000002_2': 1}}, 'ENSG00000277972': {'total': 1, 'families': {'F000002_2': 1}}, }, **COMP_HET_ALL_PASS_FILTERS, @@ -517,7 +517,7 @@ async def test_annotations_filter(self): await self._assert_expected_search([VARIANT2, VARIANT4], annotations=annotations, omit_sample_type='SV_WES') await self._assert_expected_search( - [GCNV_VARIANT3, GCNV_VARIANT4], annotations=annotations, omit_sample_type='VARIANTS', + [GCNV_VARIANT2, GCNV_VARIANT3, GCNV_VARIANT4], annotations=annotations, omit_sample_type='VARIANTS', ) await self._assert_expected_search([SV_VARIANT1], annotations=annotations, sample_data=SV_WGS_SAMPLE_DATA) @@ -527,13 +527,10 @@ async def test_annotations_filter(self): [VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], annotations=annotations, omit_sample_type='SV_WES', ) - annotations['structural'] = ['DEL'] - await self._assert_expected_search([SV_VARIANT1, SV_VARIANT4], annotations=annotations, sample_data=SV_WGS_SAMPLE_DATA) - - await self._assert_expected_search([], annotations=annotations, omit_sample_type='VARIANTS') + annotations['structural'] = ['gCNV_DUP', 'DEL'] + await self._assert_expected_search([GCNV_VARIANT1, GCNV_VARIANT2, GCNV_VARIANT3, GCNV_VARIANT4], annotations=annotations, omit_sample_type='VARIANTS') - annotations['structural'].append('gCNV_DEL') - await self._assert_expected_search([GCNV_VARIANT3], annotations=annotations, omit_sample_type='VARIANTS') + await self._assert_expected_search([SV_VARIANT1, SV_VARIANT4], annotations=annotations, sample_data=SV_WGS_SAMPLE_DATA) annotations = {'other': ['non_coding_transcript_exon_variant']} await self._assert_expected_search( @@ -565,19 +562,22 @@ async def test_secondary_annotations_filter(self): annotations=annotations_2, annotations_secondary=annotations_1, ) - sv_annotations_1 = {'structural': ['INS', 'LOF']} - sv_annotations_2 = {'structural': ['DEL', 'gCNV_DUP'], 'structural_consequence': ['INTRONIC']} + gcnv_annotations_1 = {'structural': ['gCNV_DUP']} + gcnv_annotations_2 = {'structural_consequence': ['LOF']} await self._assert_expected_search( [[GCNV_VARIANT3, GCNV_VARIANT4]], omit_sample_type='VARIANTS', inheritance_mode='compound_het', - annotations=sv_annotations_1, annotations_secondary=sv_annotations_2, + annotations=gcnv_annotations_1, annotations_secondary=gcnv_annotations_2, ) await self._assert_expected_search( - [GCNV_VARIANT1, [GCNV_VARIANT3, GCNV_VARIANT4]], omit_sample_type='VARIANTS', inheritance_mode='recessive', - annotations=sv_annotations_2, annotations_secondary=sv_annotations_1, + [GCNV_VARIANT3, [GCNV_VARIANT3, GCNV_VARIANT4]], omit_sample_type='VARIANTS', inheritance_mode='recessive', + annotations=gcnv_annotations_2, annotations_secondary=gcnv_annotations_1, ) + sv_annotations_1 = {'structural': ['INS', 'LOF']} + sv_annotations_2 = {'structural': ['DEL', 'gCNV_DUP'], 'structural_consequence': ['INTRONIC']} + await self._assert_expected_search( [[SV_VARIANT1, SV_VARIANT2]], sample_data=SV_WGS_SAMPLE_DATA, inheritance_mode='compound_het', annotations=sv_annotations_1, annotations_secondary=sv_annotations_2, @@ -753,8 +753,8 @@ async def test_sort(self): ) await self._assert_expected_search( - [_sorted(GCNV_VARIANT1, [-30886]), _sorted(GCNV_VARIANT4, [-13922]), _sorted(GCNV_VARIANT2, [-6834]), - _sorted(GCNV_VARIANT3, [-2309])], omit_sample_type='VARIANTS', sort='protein_consequence', + [_sorted(GCNV_VARIANT1, [-171766]), _sorted(GCNV_VARIANT2, [-17768]), _sorted(GCNV_VARIANT4, [-14487]), + _sorted(GCNV_VARIANT3, [-2666])], omit_sample_type='VARIANTS', sort='size', ) await self._assert_expected_search( From 4ce6fe5e907e5424e6cce5e264c5af0befdb1858 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 5 Sep 2023 15:39:35 -0400 Subject: [PATCH 16/16] fix family id --- hail_search/test_search.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hail_search/test_search.py b/hail_search/test_search.py index e162353634..18940d3a83 100644 --- a/hail_search/test_search.py +++ b/hail_search/test_search.py @@ -296,8 +296,8 @@ async def test_inheritance_filter(self): await self._assert_expected_search( [[SV_VARIANT1, SV_VARIANT2], SV_VARIANT4], inheritance_mode=inheritance_mode, sample_data=SV_WGS_SAMPLE_DATA, **COMP_HET_ALL_PASS_FILTERS, gene_counts={ - 'ENSG00000171621': {'total': 2, 'families': {'F000002_2': 2}}, - 'ENSG00000184986': {'total': 1, 'families': {'F000002_2': 1}}, + 'ENSG00000171621': {'total': 2, 'families': {'F000011_11': 2}}, + 'ENSG00000184986': {'total': 1, 'families': {'F000011_11': 1}}, } )