Skip to content

Commit

Permalink
Merge pull request #3879 from broadinstitute/sv-lookup
Browse files Browse the repository at this point in the history
Sv lookup
  • Loading branch information
hanars authored Feb 14, 2024
2 parents f6e32f9 + e81d795 commit 2476aa9
Show file tree
Hide file tree
Showing 11 changed files with 128 additions and 48 deletions.
10 changes: 6 additions & 4 deletions hail_search/queries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class BaseHailTableQuery(object):

GENOTYPE_FIELDS = {}
COMPUTED_GENOTYPE_FIELDS = {}
GENOTYPE_OVERRIDE_FIELDS = {}
GENOTYPE_QUERY_FIELDS = {}
QUALITY_FILTER_FORMAT = {}
POPULATIONS = {}
Expand Down Expand Up @@ -92,15 +93,16 @@ def _format_population_config(cls, pop_config):
base_pop_config.pop('sort', None)
return base_pop_config

def annotation_fields(self):
def annotation_fields(self, include_genotype_overrides=True):
annotation_fields = {
GENOTYPES_FIELD: lambda r: r.family_entries.flatmap(lambda x: x).filter(
lambda gt: hl.is_defined(gt.individualGuid)
).group_by(lambda x: x.individualGuid).map_values(lambda x: x[0].select(
'sampleId', 'sampleType', 'individualGuid', 'familyGuid',
numAlt=hl.if_else(hl.is_defined(x[0].GT), x[0].GT.n_alt_alleles(), self.MISSING_NUM_ALT),
**{k: x[0][field] for k, field in self.GENOTYPE_FIELDS.items()},
**{_to_camel_case(k): v(x[0], k, r) for k, v in self.COMPUTED_GENOTYPE_FIELDS.items()},
**{_to_camel_case(k): v(x[0], k, r) for k, v in self.COMPUTED_GENOTYPE_FIELDS.items()
if include_genotype_overrides or k not in self.GENOTYPE_OVERRIDE_FIELDS},
)),
'populations': lambda r: hl.struct(**{
population: self.population_expression(r, population) for population in self.POPULATIONS.keys()
Expand Down Expand Up @@ -1029,15 +1031,15 @@ def lookup_variant(self, variant_id, sample_data=None):
ht = self._read_table('annotations.ht', drop_globals=['paths', 'versions'])
ht = ht.filter(hl.is_defined(ht[XPOS]))

annotation_fields = self.annotation_fields()
annotation_fields = self.annotation_fields(include_genotype_overrides=False)
entry_annotations = {k: annotation_fields[k] for k in [FAMILY_GUID_FIELD, GENOTYPES_FIELD]}
annotation_fields.update({
FAMILY_GUID_FIELD: lambda ht: hl.empty_array(hl.tstr),
GENOTYPES_FIELD: lambda ht: hl.empty_dict(hl.tstr, hl.tstr),
'genotypeFilters': lambda ht: hl.str(''),
})

formatted = self._format_results(ht.key_by(), annotation_fields=annotation_fields, include_genotype_overrides=bool(sample_data))
formatted = self._format_results(ht.key_by(), annotation_fields=annotation_fields, include_genotype_overrides=False)

variants = formatted.aggregate(hl.agg.take(formatted.row, 1))
if not variants:
Expand Down
2 changes: 1 addition & 1 deletion hail_search/queries/sv.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def _parse_annotations(self, annotations, *args, **kwargs):

def _get_family_passes_quality_filter(self, quality_filter, parsed_annotations=None, **kwargs):
passes_quality = super()._get_family_passes_quality_filter(quality_filter)
if not parsed_annotations[NEW_SV_FIELD]:
if not (parsed_annotations or {}).get(NEW_SV_FIELD):
return passes_quality

entries_has_new_call = lambda entries: entries.any(lambda x: x.concordance.new_call)
Expand Down
15 changes: 15 additions & 0 deletions hail_search/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -662,6 +662,11 @@ async def test_variant_lookup(self):
resp_json = await resp.json()
self.assertDictEqual(resp_json, {**SV_VARIANT4, 'familyGuids': [], 'genotypes': {}, 'genotypeFilters': ''})

async with self.client.request('POST', '/lookup', json={**body, 'sample_data': SV_WGS_SAMPLE_DATA['SV_WGS']}) as resp:
self.assertEqual(resp.status, 200)
resp_json = await resp.json()
self.assertDictEqual(resp_json, SV_VARIANT4)

body.update({'variant_id': 'suffix_140608_DUP', 'data_type': 'SV_WES'})
async with self.client.request('POST', '/lookup', json=body) as resp:
self.assertEqual(resp.status, 200)
Expand All @@ -670,6 +675,16 @@ async def test_variant_lookup(self):
**GCNV_VARIANT4, 'numExon': 8, 'end': 38736268, 'familyGuids': [], 'genotypes': {}, 'genotypeFilters': '',
})

async with self.client.request('POST', '/lookup', json={**body, 'sample_data': EXPECTED_SAMPLE_DATA['SV_WES']}) as resp:
self.assertEqual(resp.status, 200)
resp_json = await resp.json()
self.assertDictEqual(resp_json, {
**GCNV_VARIANT4, 'numExon': 8, 'end': 38736268, 'genotypes': {
individual: {k: v for k, v in genotype.items() if k not in {'start', 'end', 'numExon', 'geneIds'}}
for individual, genotype in GCNV_VARIANT4['genotypes'].items()
}
})

async def test_frequency_filter(self):
sv_callset_filter = {'sv_callset': {'af': 0.05}}
await self._assert_expected_search(
Expand Down
36 changes: 33 additions & 3 deletions seqr/utils/search/hail_search_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,44 @@ def get_hail_variants_for_variant_ids(samples, genome_version, parsed_variant_id
return response_json['results']


def hail_variant_lookup(user, variant_id, samples=None, **kwargs):
def hail_variant_lookup(user, variant_id, samples=None, dataset_type=Sample.DATASET_TYPE_VARIANT_CALLS, sample_type=None, **kwargs):
data_type = dataset_type.replace('_only', '')
is_sv = data_type == Sample.DATASET_TYPE_SV_CALLS
if is_sv:
if not sample_type:
from seqr.utils.search.utils import InvalidSearchException
raise InvalidSearchException('Sample type must be specified to look up a structural variant')
data_type = f'{data_type}_{sample_type}'

body = {
'variant_id': variant_id,
'data_type': data_type,
**kwargs,
}
sample_data = None
if samples:
body['sample_data'] = _get_sample_data(samples)[Sample.DATASET_TYPE_VARIANT_CALLS]
return _execute_search(body, user, path='lookup', exception_map={404: 'Variant not present in seqr'})
sample_data = _get_sample_data(samples)
body['sample_data'] = sample_data.pop(data_type)
variant = _execute_search(body, user, path='lookup', exception_map={404: 'Variant not present in seqr'})
variants = [variant]

if is_sv and sample_data and variant['svType'] in {'DEL', 'DUP'}:
start = variant['pos']
end = variant['end']
offset = 0.2
if variant.get('endChrom'):
start -= 50
end += 50
offset = None
del body['variant_id']
body.update({
'sample_data': sample_data,
'intervals': [_format_interval(chrom=variant['chrom'], start=start, end=end, offset=offset)],
'annotations': {'structural': [variant['svType'], f"gCNV_{variant['svType']}"]}
})
variants += _execute_search(body, user)['results']

return variants


def _format_search_body(samples, genome_version, num_results, search):
Expand Down
61 changes: 46 additions & 15 deletions seqr/utils/search/hail_search_utils_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,15 @@
from hail_search.test_utils import get_hail_search_body, EXPECTED_SAMPLE_DATA, FAMILY_1_SAMPLE_DATA, \
FAMILY_2_ALL_SAMPLE_DATA, ALL_AFFECTED_SAMPLE_DATA, CUSTOM_AFFECTED_SAMPLE_DATA, HAIL_BACKEND_VARIANTS, \
LOCATION_SEARCH, EXCLUDE_LOCATION_SEARCH, VARIANT_ID_SEARCH, RSID_SEARCH, GENE_COUNTS, FAMILY_2_VARIANT_SAMPLE_DATA, \
FAMILY_2_MITO_SAMPLE_DATA, EXPECTED_SAMPLE_DATA_WITH_SEX, VARIANT_LOOKUP_VARIANT, MULTI_PROJECT_SAMPLE_DATA
FAMILY_2_MITO_SAMPLE_DATA, EXPECTED_SAMPLE_DATA_WITH_SEX, VARIANT_LOOKUP_VARIANT, MULTI_PROJECT_SAMPLE_DATA, \
GCNV_VARIANT4, SV_VARIANT2
MOCK_HOST = 'http://test-hail-host'

SV_WGS_SAMPLE_DATA = [{
'individual_guid': 'I000018_na21234', 'family_guid': 'F000014_14', 'project_guid': 'R0004_non_analyst_project',
'affected': 'A', 'sample_id': 'NA21234',
}]


@mock.patch('seqr.utils.search.hail_search_utils.HAIL_BACKEND_SERVICE_HOSTNAME', MOCK_HOST)
class HailSearchUtilsTests(SearchTestHelper, TestCase):
Expand All @@ -28,11 +34,12 @@ def setUp(self):
'results': HAIL_BACKEND_VARIANTS, 'total': 5,
})

def _test_minimal_search_call(self, expected_search_body=None, **kwargs):
def _test_minimal_search_call(self, expected_search_body=None, call_offset=-1, url_path='search', **kwargs):
expected_search = expected_search_body or get_hail_search_body(genome_version='GRCh37', **kwargs)

executed_request = responses.calls[-1].request
executed_request = responses.calls[call_offset].request
self.assertEqual(executed_request.headers.get('From'), 'test_user@broadinstitute.org')
self.assertEqual(executed_request.url.split('/')[-1], url_path)
self.assertDictEqual(json.loads(executed_request.body), expected_search)

def _test_expected_search_call(self, search_fields=None, gene_ids=None, intervals=None, exclude_intervals= None,
Expand Down Expand Up @@ -149,8 +156,7 @@ def test_query_variants(self):
query_variants(self.results_model, user=self.user)
sv_sample_data = {
'SV_WES': FAMILY_2_VARIANT_SAMPLE_DATA['SNV_INDEL'],
'SV_WGS': [{'individual_guid': 'I000018_na21234', 'family_guid': 'F000014_14',
'project_guid': 'R0004_non_analyst_project', 'affected': 'A', 'sample_id': 'NA21234'}],
'SV_WGS': SV_WGS_SAMPLE_DATA,
}
self._test_expected_search_call(search_fields=['annotations'], dataset_type='SV', sample_data=sv_sample_data)

Expand Down Expand Up @@ -208,34 +214,59 @@ def test_get_variant_query_gene_counts(self):
gene_counts = get_variant_query_gene_counts(self.results_model, self.user)
self.assertDictEqual(gene_counts, GENE_COUNTS)
self.assert_cached_results({'gene_aggs': gene_counts})
self._test_expected_search_call(sort=None)
self._test_expected_search_call(url_path='gene_counts', sort=None)

@responses.activate
def test_variant_lookup(self):
responses.add(responses.POST, f'{MOCK_HOST}:5000/lookup', status=200, json=VARIANT_LOOKUP_VARIANT)
variant = variant_lookup(self.user, '1-10439-AC-A', genome_version='37', foo='bar')
self.assertDictEqual(variant, VARIANT_LOOKUP_VARIANT)
self._test_minimal_search_call(expected_search_body={
'variant_id': ['1', 10439, 'AC', 'A'], 'genome_version': 'GRCh37', 'foo': 'bar',
self.assertListEqual(variant, [VARIANT_LOOKUP_VARIANT])
self._test_minimal_search_call(url_path='lookup', expected_search_body={
'variant_id': ['1', 10439, 'AC', 'A'], 'genome_version': 'GRCh37', 'foo': 'bar', 'data_type': 'SNV_INDEL',
})

variant_lookup(self.user, '1-10439-AC-A', genome_version='37', families=self.families)
self._test_minimal_search_call(expected_search_body={
self._test_minimal_search_call(url_path='lookup', expected_search_body={
'variant_id': ['1', 10439, 'AC', 'A'], 'genome_version': 'GRCh37',
'sample_data': ALL_AFFECTED_SAMPLE_DATA['SNV_INDEL'],
'sample_data': ALL_AFFECTED_SAMPLE_DATA['SNV_INDEL'], 'data_type': 'SNV_INDEL',
})

with self.assertRaises(InvalidSearchException) as cm:
variant_lookup(self.user, 'prefix_123_DEL')
self.assertEqual(str(cm.exception), 'Invalid variant prefix_123_DEL')
variant_lookup(self.user, 'suffix_140608_DUP')
self.assertEqual(str(cm.exception), 'Sample type must be specified to look up a structural variant')

responses.add(responses.POST, f'{MOCK_HOST}:5000/lookup', status=200, json=GCNV_VARIANT4)
variant_lookup(self.user, 'suffix_140608_DUP', sample_type='WES')
self._test_minimal_search_call(url_path='lookup', expected_search_body={
'variant_id': 'suffix_140608_DUP', 'genome_version': 'GRCh38', 'data_type': 'SV_WES',
})

sv_families = Family.objects.filter(id__in=[2, 14])
variant_lookup(self.user, 'suffix_140608_DUP', sample_type='WES', families=sv_families)
self._test_minimal_search_call(url_path='lookup', call_offset=-2, expected_search_body={
'variant_id': 'suffix_140608_DUP', 'genome_version': 'GRCh38', 'data_type': 'SV_WES',
'sample_data': ALL_AFFECTED_SAMPLE_DATA['SV_WES']
})
self._test_minimal_search_call(expected_search_body={
'genome_version': 'GRCh38', 'data_type': 'SV_WES', 'annotations': {'structural': ['DEL', 'gCNV_DEL']},
'intervals': ['17:38718997-38738487'], 'sample_data': {'SV_WGS': SV_WGS_SAMPLE_DATA},
})

# No second lookup call is made for non DELs/DUPs
responses.add(responses.POST, f'{MOCK_HOST}:5000/lookup', status=200, json=SV_VARIANT2)
variant_lookup(self.user, 'cohort_2911.chr1.final_cleanup_INS_chr1_160', sample_type='WGS', families=sv_families)
self._test_minimal_search_call(url_path='lookup', expected_search_body={
'variant_id': 'cohort_2911.chr1.final_cleanup_INS_chr1_160', 'genome_version': 'GRCh38', 'data_type': 'SV_WGS',
'sample_data': SV_WGS_SAMPLE_DATA
})

responses.add(responses.POST, f'{MOCK_HOST}:5000/lookup', status=404)
with self.assertRaises(HTTPError) as cm:
variant_lookup(self.user, '1-10439-AC-A')
self.assertEqual(cm.exception.response.status_code, 404)
self.assertEqual(str(cm.exception), 'Variant not present in seqr')
self._test_minimal_search_call(expected_search_body={
'variant_id': ['1', 10439, 'AC', 'A'], 'genome_version': 'GRCh38'
self._test_minimal_search_call(url_path='lookup', expected_search_body={
'variant_id': ['1', 10439, 'AC', 'A'], 'genome_version': 'GRCh38', 'data_type': 'SNV_INDEL',
})

@responses.activate
Expand Down
10 changes: 4 additions & 6 deletions seqr/utils/search/search_utils_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,20 +55,18 @@ def test_variant_lookup(self, mock_variant_lookup):
mock_variant_lookup.return_value = VARIANT_LOOKUP_VARIANT
variant = variant_lookup(self.user, '1-10439-AC-A', genome_version='38')
self.assertDictEqual(variant, VARIANT_LOOKUP_VARIANT)
mock_variant_lookup.assert_called_with(self.user, ('1', 10439, 'AC', 'A'), genome_version='GRCh38')
mock_variant_lookup.assert_called_with(self.user, ('1', 10439, 'AC', 'A'), genome_version='GRCh38',
dataset_type='SNV_INDEL_only')
cache_key = 'variant_lookup_results__1-10439-AC-A__38__test_user'
self.assert_cached_results(variant, cache_key=cache_key)

variant = variant_lookup(self.user, '1-10439-AC-A', genome_version='37', families=self.families)
self.assertDictEqual(variant, VARIANT_LOOKUP_VARIANT)
mock_variant_lookup.assert_called_with(self.user, ('1', 10439, 'AC', 'A'), genome_version='GRCh37', samples=mock.ANY)
mock_variant_lookup.assert_called_with(self.user, ('1', 10439, 'AC', 'A'), genome_version='GRCh37', samples=mock.ANY,
dataset_type='SNV_INDEL_only')
expected_samples = {s for s in self.search_samples if s.guid not in NON_SNP_INDEL_SAMPLES}
self.assertSetEqual(set(mock_variant_lookup.call_args.kwargs['samples']), expected_samples)

with self.assertRaises(InvalidSearchException) as cm:
variant_lookup(self.user, '100-10439-AC-A')
self.assertEqual(str(cm.exception), 'Invalid variant 100-10439-AC-A')

mock_variant_lookup.reset_mock()
self.set_cache(variant)
cached_variant = variant_lookup(self.user, '1-10439-AC-A', genome_version='38')
Expand Down
7 changes: 3 additions & 4 deletions seqr/utils/search/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,15 +161,14 @@ def variant_lookup(user, variant_id, families=None, genome_version=None, **kwarg
return variant

parsed_variant_id = _parse_variant_id(variant_id)
if not parsed_variant_id:
raise InvalidSearchException(f'Invalid variant {variant_id}')
dataset_type = DATASET_TYPE_SNP_INDEL_ONLY if parsed_variant_id else Sample.DATASET_TYPE_SV_CALLS

if families:
samples, _ = _get_families_search_data(families, dataset_type=DATASET_TYPE_SNP_INDEL_ONLY)
samples, _ = _get_families_search_data(families, dataset_type=dataset_type)
kwargs['samples'] = samples

lookup_func = backend_specific_call(_raise_search_error('Hail backend is disabled'), hail_variant_lookup)
variant = lookup_func(user, parsed_variant_id, genome_version=GENOME_VERSION_LOOKUP[genome_version], **kwargs)
variant = lookup_func(user, parsed_variant_id or variant_id, genome_version=GENOME_VERSION_LOOKUP[genome_version], dataset_type=dataset_type, **kwargs)
safe_redis_set_json(cache_key, variant, expire=timedelta(weeks=2))
return variant

Expand Down
8 changes: 4 additions & 4 deletions seqr/views/apis/variant_search_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,12 +541,12 @@ def variant_lookup_handler(request):
kwargs.get('genome_version', GENOME_VERSION_GRCh38), request.user,
)

variant = variant_lookup(request.user, families=families, **kwargs)
saved_variants, _ = _get_saved_variant_models([variant], families) if families else (None, None)
variants = variant_lookup(request.user, families=families, **kwargs)
saved_variants, _ = _get_saved_variant_models(variants, families) if families else (None, None)
response = get_variants_response(
request, saved_variants=saved_variants, response_variants=[variant],
request, saved_variants=saved_variants, response_variants=variants,
add_all_context=include_genotypes, add_locus_list_detail=include_genotypes,
)
response['variant'] = variant
response['variants'] = variants

return create_json_response(response)
12 changes: 6 additions & 6 deletions seqr/views/apis/variant_search_api_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -761,7 +761,7 @@ def _assert_expected_single_variant_results_context(self, response_json, omit_fi

@mock.patch('seqr.views.apis.variant_search_api.variant_lookup')
def test_variant_lookup(self, mock_variant_lookup):
mock_variant_lookup.return_value = VARIANT_LOOKUP_VARIANT
mock_variant_lookup.return_value = [VARIANT_LOOKUP_VARIANT]

url = f'{reverse(variant_lookup_handler)}?variantId=1-10439-AC-A&genomeVersion=38'
self.check_require_login(url)
Expand All @@ -776,15 +776,15 @@ def test_variant_lookup(self, mock_variant_lookup):
'rnaSeqData': {},
'savedVariantsByGuid': {},
'transcriptsById': {},
'variant': VARIANT_LOOKUP_VARIANT,
'variants': [VARIANT_LOOKUP_VARIANT],
}
self.assertDictEqual(response.json(), expected_body)
mock_variant_lookup.assert_called_with(self.no_access_user, variant_id='1-10439-AC-A', genome_version='38', families=None)

variant = {**VARIANTS[0], 'familyGuids': [], 'genotypes': {}}
mock_variant_lookup.return_value = variant
mock_variant_lookup.return_value = [variant]
expected_body.update({
'variant': variant,
'variants': [variant],
'genesById': {'ENSG00000227232': EXPECTED_GENE, 'ENSG00000268903': EXPECTED_GENE},
'transcriptsById': EXPECTED_SEARCH_RESPONSE['transcriptsById'],
})
Expand All @@ -794,11 +794,11 @@ def test_variant_lookup(self, mock_variant_lookup):
self.assertDictEqual(response.json(), expected_body)

self.login_collaborator()
mock_variant_lookup.return_value = SINGLE_FAMILY_VARIANT
mock_variant_lookup.return_value = [SINGLE_FAMILY_VARIANT]
response = self.client.get(f'{url.replace("38", "37")}&include_genotypes=true')
self.assertEqual(response.status_code, 200)
self._assert_expected_single_variant_results_context(
response.json(), variant=SINGLE_FAMILY_VARIANT, omit_fields={'searchedVariants'},
response.json(), variants=[SINGLE_FAMILY_VARIANT], omit_fields={'searchedVariants'},
)
mock_variant_lookup.assert_called_with(
self.collaborator_user, variant_id='1-10439-AC-A', genome_version='37', families=mock.ANY,
Expand Down
4 changes: 2 additions & 2 deletions ui/pages/SummaryData/components/VariantLookup.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@ const FIELDS = [
{ required: true, ...GENOME_VERSION_FIELD },
]

const VariantDisplay = ({ variant }) => (variant ? <Variant variant={variant} /> : null)
const VariantDisplay = ({ variants }) => (variants || []).map(variant => <Variant variant={variant} />)

VariantDisplay.propTypes = {
variant: PropTypes.object,
variants: PropTypes.arrayOf(PropTypes.object),
}

const onSubmit = updateQueryParams => (data) => {
Expand Down
Loading

0 comments on commit 2476aa9

Please sign in to comment.