Skip to content

Commit

Permalink
Merge pull request #3882 from broadinstitute/comp-het-different-family-samples
Browse files Browse the repository at this point in the history

accurately compute comp hets when data types have different samples
  • Loading branch information
hanars authored Feb 14, 2024
2 parents 5cdb7c4 + 4f6ec12 commit 4985353
Show file tree
Hide file tree
Showing 6 changed files with 72 additions and 23 deletions.
14 changes: 8 additions & 6 deletions hail_search/queries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ def __init__(self, sample_data, sort=XPOS, sort_metadata=None, num_results=100,
self._is_multi_data_type_comp_het = False
self.max_unaffected_samples = None
self._load_table_kwargs = {}
self.entry_samples_by_family_guid = {}

if sample_data:
self._load_filtered_table(sample_data, **kwargs)
Expand Down Expand Up @@ -366,8 +367,7 @@ def _filter_entries_table(self, ht, sample_data, inheritance_filter=None, qualit

return ht, ch_ht

@classmethod
def _add_entry_sample_families(cls, ht, sample_data):
def _add_entry_sample_families(self, ht, sample_data):
ht_globals = hl.eval(ht.globals)

missing_samples = set()
Expand All @@ -380,12 +380,14 @@ def _add_entry_sample_families(cls, ht, sample_data):
if missing_family_samples:
missing_samples.update(missing_family_samples)
else:
sample_index_data = [
(ht_family_samples.index(s['sample_id']), self._sample_entry_data(s, family_guid, ht_globals))
for s in samples
]
family_sample_index_data.append(
(ht_globals.family_guids.index(family_guid), [
(ht_family_samples.index(s['sample_id']), cls._sample_entry_data(s, family_guid, ht_globals))
for s in samples
])
(ht_globals.family_guids.index(family_guid), sample_index_data)
)
self.entry_samples_by_family_guid[family_guid] = [s['sampleId'] for _, s in sample_index_data]

if missing_samples:
raise HTTPBadRequest(
Expand Down
20 changes: 17 additions & 3 deletions hail_search/queries/multi_data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,13 @@ def _load_filtered_table(self, *args, **kwargs):
for data_type in sv_data_types:
self._current_sv_data_type = data_type
sv_query = self._data_type_queries[data_type]
self.max_unaffected_samples = max(variant_query.max_unaffected_samples, sv_query.max_unaffected_samples)
merged_ht = self._filter_data_type_comp_hets(variant_ht, variant_families, sv_query)
self.max_unaffected_samples = min(variant_query.max_unaffected_samples, sv_query.max_unaffected_samples)
merged_ht = self._filter_data_type_comp_hets(variant_query, variant_families, sv_query)
if merged_ht is not None:
self._comp_het_hts[data_type] = merged_ht.key_by()

def _filter_data_type_comp_hets(self, variant_ht, variant_families, sv_query):
def _filter_data_type_comp_hets(self, variant_query, variant_families, sv_query):
variant_ht = variant_query.unfiltered_comp_het_ht
sv_ht = sv_query.unfiltered_comp_het_ht
sv_type_del_ids = sv_query.get_allowed_sv_type_ids([f'{getattr(sv_query, "SV_TYPE_PREFIX", "")}DEL'])
self._sv_type_del_id = list(sv_type_del_ids)[0] if sv_type_del_ids else None
Expand All @@ -63,6 +64,19 @@ def _filter_data_type_comp_hets(self, variant_ht, variant_families, sv_query):
if variant_families != sv_families:
variant_ht = self._family_filtered_ch_ht(variant_ht, overlapped_families, variant_families)
sv_ht = self._family_filtered_ch_ht(sv_ht, overlapped_families, sv_families)
else:
overlapped_families = variant_families

variant_samples_by_family = variant_query.entry_samples_by_family_guid
sv_samples_by_family = sv_query.entry_samples_by_family_guid
if any(f for f in overlapped_families if variant_samples_by_family[f] != sv_samples_by_family[f]):
sv_sample_indices = hl.array([[
sv_samples_by_family[f].index(s) if s in sv_samples_by_family[f] else None
for s in variant_samples_by_family[f]
] for f in overlapped_families])
sv_ht = sv_ht.annotate(family_entries=hl.enumerate(sv_sample_indices).starmap(
lambda family_i, indices: indices.map(lambda sample_i: sv_ht.family_entries[family_i][sample_i])
))

variant_ch_ht = variant_ht.group_by('gene_ids').aggregate(v1=hl.agg.collect(variant_ht.row))
sv_ch_ht = sv_ht.group_by('gene_ids').aggregate(v2=hl.agg.collect(sv_ht.row))
Expand Down
22 changes: 17 additions & 5 deletions hail_search/queries/sv.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,17 +56,18 @@ class SvHailTableQuery(BaseHailTableQuery):
def _get_sample_type(cls, *args):
return cls.DATA_TYPE.split('_')[-1]

def import_filtered_table(self, project_samples, *args, parsed_intervals=None, **kwargs):
if len(project_samples) > 1 and parsed_intervals:
def import_filtered_table(self, project_samples, *args, parsed_intervals=None, padded_interval=None, **kwargs):
if len(project_samples) > 1 and (parsed_intervals or padded_interval):
# For multi-project interval search, faster to first read and filter the annotation table and then add entries
ht = self._read_table('annotations.ht')
ht = self._filter_annotated_table(ht, parsed_intervals=parsed_intervals)
ht = self._filter_annotated_table(ht, parsed_intervals=parsed_intervals, padded_interval=padded_interval)
self._load_table_kwargs['variant_ht'] = ht.select()
parsed_intervals = None
padded_interval = None

return super().import_filtered_table(project_samples, *args, parsed_intervals=parsed_intervals, **kwargs)
return super().import_filtered_table(project_samples, *args, parsed_intervals=parsed_intervals, padded_interval=padded_interval, **kwargs)

def _filter_annotated_table(self, ht, *args, parsed_intervals=None, exclude_intervals=False, **kwargs):
def _filter_annotated_table(self, ht, *args, parsed_intervals=None, exclude_intervals=False, padded_interval=None, **kwargs):
if parsed_intervals:
interval_filter = hl.array(parsed_intervals).any(lambda interval: hl.if_else(
ht.start_locus.contig == ht.end_locus.contig,
Expand All @@ -76,9 +77,20 @@ def _filter_annotated_table(self, ht, *args, parsed_intervals=None, exclude_inte
if exclude_intervals:
interval_filter = ~interval_filter
ht = ht.filter(interval_filter)
if padded_interval:
padding = int((padded_interval['end'] - padded_interval['start']) * padded_interval['padding'])
ht = ht.filter(hl.all([
ht.start_locus.contig == f"chr{padded_interval['chrom']}",
self._locus_in_range(ht.start_locus, padded_interval['start'], padding),
self._locus_in_range(ht.end_locus, padded_interval['end'], padding)
]))

return super()._filter_annotated_table(ht, *args, **kwargs)

@staticmethod
def _locus_in_range(locus, position, padding):
return (max(position - padding, 1) < locus.position) & (min(position + padding, 3e8) > locus.position)

def _parse_annotations(self, annotations, *args, **kwargs):
parsed_annotations = super()._parse_annotations(annotations, *args, **kwargs)
parsed_annotations[NEW_SV_FIELD] = (annotations or {}).get(NEW_SV_FIELD)
Expand Down
27 changes: 27 additions & 0 deletions hail_search/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,21 @@ async def test_location_search(self):
intervals=LOCATION_SEARCH['intervals'][-1:], gene_ids=LOCATION_SEARCH['gene_ids'][:1]
)

await self._assert_expected_search(
[GCNV_VARIANT4], padded_interval={'chrom': '17', 'start': 38720781, 'end': 38738703, 'padding': 0.2},
omit_sample_type='SNV_INDEL',
)

await self._assert_expected_search(
[], padded_interval={'chrom': '17', 'start': 38720781, 'end': 38738703, 'padding': 0.1},
omit_sample_type='SNV_INDEL',
)

await self._assert_expected_search(
[SV_VARIANT4], padded_interval={'chrom': '14', 'start': 106692244, 'end': 106742587, 'padding': 0.1},
sample_data=SV_WGS_SAMPLE_DATA,
)

async def test_variant_id_search(self):
await self._assert_expected_search([VARIANT2], omit_sample_type='SV_WES', **RSID_SEARCH)

Expand Down Expand Up @@ -910,6 +925,18 @@ async def test_secondary_annotations_filter(self):
annotations=gcnv_annotations_2, annotations_secondary=selected_transcript_annotations,
)

# Search works with a different number of samples within the family
missing_gt_gcnv_variant = {
**GCNV_VARIANT4, 'genotypes': {k: v for k, v in GCNV_VARIANT4['genotypes'].items() if k != 'I000005_hg00732'}
}
await self._assert_expected_search(
[[MULTI_DATA_TYPE_COMP_HET_VARIANT2, missing_gt_gcnv_variant]],
inheritance_mode='compound_het', pathogenicity=pathogenicity,
annotations=gcnv_annotations_2, annotations_secondary=selected_transcript_annotations,
sample_data={**EXPECTED_SAMPLE_DATA, 'SV_WES': [EXPECTED_SAMPLE_DATA['SV_WES'][0], EXPECTED_SAMPLE_DATA['SV_WES'][2]]}

)

# Do not return pairs where annotations match in a non-paired gene
await self._assert_expected_search(
[GCNV_VARIANT3], inheritance_mode='recessive',
Expand Down
9 changes: 1 addition & 8 deletions seqr/utils/search/hail_search_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,17 +96,10 @@ def hail_variant_lookup(user, variant_id, samples=None, dataset_type=Sample.DATA
variants = [variant]

if is_sv and sample_data and variant['svType'] in {'DEL', 'DUP'}:
start = variant['pos']
end = variant['end']
offset = 0.2
if variant.get('endChrom'):
start -= 50
end += 50
offset = None
del body['variant_id']
body.update({
'sample_data': sample_data,
'intervals': [_format_interval(chrom=variant['chrom'], start=start, end=end, offset=offset)],
'padded_interval': {'chrom': variant['chrom'], 'start': variant['pos'], 'end': variant['end'], 'padding': 0.2},
'annotations': {'structural': [variant['svType'], f"gCNV_{variant['svType']}"]}
})
variants += _execute_search(body, user)['results']
Expand Down
3 changes: 2 additions & 1 deletion seqr/utils/search/hail_search_utils_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,8 @@ def test_variant_lookup(self):
})
self._test_minimal_search_call(expected_search_body={
'genome_version': 'GRCh38', 'data_type': 'SV_WES', 'annotations': {'structural': ['DEL', 'gCNV_DEL']},
'intervals': ['17:38718997-38738487'], 'sample_data': {'SV_WGS': SV_WGS_SAMPLE_DATA},
'padded_interval': {'chrom': '17', 'start': 38721781, 'end': 38735703, 'padding': 0.2},
'sample_data': {'SV_WGS': SV_WGS_SAMPLE_DATA},
})

# No second lookup call is made for non DELs/DUPs
Expand Down

0 comments on commit 4985353

Please sign in to comment.