Merge branch 'dev' of https://github.com/broadinstitute/seqr into hail-backend-sv-wgs
hanars committed Aug 24, 2023
2 parents e4b6953 + f065c72 commit 4cdc65b
Showing 14 changed files with 266 additions and 98 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,8 @@
# _seqr_ Changes

## dev

## 8/22/23
* Add db indices to optimize RNA data queries (REQUIRES DB MIGRATION)

## 7/11/23
1 change: 1 addition & 0 deletions deploy/docker/seqr/Dockerfile
@@ -93,6 +93,7 @@ EXPOSE 8000
ENV TERM=xterm

COPY deploy/docker/seqr/readiness_probe /
COPY deploy/docker/seqr/wait_for_routes /
COPY deploy/docker/seqr/bin/*.sh /usr/local/bin/
COPY deploy/docker/seqr/config/*.py ./
COPY deploy/docker/seqr/bashrc /root/.bashrc
26 changes: 26 additions & 0 deletions deploy/docker/seqr/wait_for_routes
@@ -0,0 +1,26 @@
#!/bin/bash

###
# Waits for network endpoints. Intended usage is within Kubernetes CronJobs to wait for sidecar availability.
# Usage: ./wait_for_routes https://www.google.com/ https://www.broadinstitute.org https://www.broadins.org
###

RETRY_COUNT=10
SLEEP_S=2

for route in "$@"
do
    retries=0
    until [ "$retries" -ge "$RETRY_COUNT" ]
    do
        curl -s "$route" -o /dev/null && echo "Successful ping of $route" && break
        retries=$((retries+1))
        if [ "$retries" -eq "$RETRY_COUNT" ]; then
            echo "Route ${route} wasn't available after ${RETRY_COUNT} connection attempts"
            exit 1
        else
            echo "Unable to connect to ${route}, retrying. Attempt ${retries}/${RETRY_COUNT}"
            sleep $SLEEP_S
        fi
    done
done
23 changes: 23 additions & 0 deletions hail_search/hail_search_query.py
@@ -778,6 +778,29 @@ def _omim_sort(cls, r, omim_gene_set):
    def _gene_rank_sort(cls, r, gene_ranks):
        return [hl.min(cls._gene_ids_expr(r).map(gene_ranks.get))]

    def gene_counts(self):
        selects = {
            'gene_ids': self._gene_ids_expr,
            'families': self.BASE_ANNOTATION_FIELDS['familyGuids'],
        }
        ch_ht = None
        if self._comp_het_ht:
            ch_ht = self._comp_het_ht.explode(self._comp_het_ht[GROUPED_VARIANTS_FIELD])
            ch_ht = ch_ht.select(**{k: v(ch_ht[GROUPED_VARIANTS_FIELD]) for k, v in selects.items()})

        if self._ht:
            ht = self._ht.select(**{k: v(self._ht) for k, v in selects.items()})
            if ch_ht:
                ht = ht.join(ch_ht, 'outer')
                ht = ht.transmute(**{k: hl.or_else(ht[k], ht[f'{k}_1']) for k in selects})
        else:
            ht = ch_ht

        ht = ht.explode('gene_ids').explode('families')
        return ht.aggregate(hl.agg.group_by(
            ht.gene_ids, hl.struct(total=hl.agg.count(), families=hl.agg.counter(ht.families))
        ))


class VariantHailTableQuery(BaseHailTableQuery):

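For reference, a minimal sketch (not part of the commit) of the mapping gene_counts() aggregates to once serialized to JSON; the gene and family IDs mirror the test fixtures below, and the counts are illustrative.

    gene_counts = {
        'ENSG00000097046': {'total': 3, 'families': {'F000002_2': 2, 'F000003_3': 1}},
        'ENSG00000177000': {'total': 3, 'families': {'F000002_2': 2, 'F000003_3': 1}},
    }

Each entry counts passing variants per gene with a per-family breakdown; compound-het groups are exploded first, so both variants of a pair contribute to the totals.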
7 changes: 5 additions & 2 deletions hail_search/search.py
@@ -1,7 +1,7 @@
from hail_search.hail_search_query import QUERY_CLASS_MAP


def search_hail_backend(request):
def search_hail_backend(request, gene_counts=False):
    sample_data = request.pop('sample_data', {})
    genome_version = request.pop('genome_version')

@@ -12,7 +12,10 @@ def search_hail_backend(request):
    query_cls = QUERY_CLASS_MAP[single_data_type]

    query = query_cls(sample_data, genome_version, **request)
    return query.search()
    if gene_counts:
        return query.gene_counts()
    else:
        return query.search()


def load_globals():
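A minimal call sketch (not part of the commit): search_hail_backend pops keys off the request dict, so each call below gets its own copy. The fixture helpers come from hail_search.test_utils in this same change.

    from hail_search.search import search_hail_backend
    from hail_search.test_utils import get_hail_search_body, FAMILY_2_VARIANT_SAMPLE_DATA

    body = get_hail_search_body(sample_data=FAMILY_2_VARIANT_SAMPLE_DATA)
    results, total = search_hail_backend(dict(body))             # variant search
    counts = search_hail_backend(dict(body), gene_counts=True)   # per-gene aggregation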
36 changes: 26 additions & 10 deletions hail_search/test_search.py
@@ -3,8 +3,8 @@

from hail_search.test_utils import get_hail_search_body, FAMILY_2_VARIANT_SAMPLE_DATA, FAMILY_2_MISSING_SAMPLE_DATA, \
    VARIANT1, VARIANT2, VARIANT3, VARIANT4, MULTI_PROJECT_SAMPLE_DATA, MULTI_PROJECT_MISSING_SAMPLE_DATA, \
    LOCATION_SEARCH, EXCLUDE_LOCATION_SEARCH, VARIANT_ID_SEARCH, RSID_SEARCH, SV_WGS_SAMPLE_DATA, SV_VARIANT1, \
    SV_VARIANT2, SV_VARIANT3, SV_VARIANT4
    LOCATION_SEARCH, EXCLUDE_LOCATION_SEARCH, VARIANT_ID_SEARCH, RSID_SEARCH, GENE_COUNTS, SV_WGS_SAMPLE_DATA, \
    SV_VARIANT1, SV_VARIANT2, SV_VARIANT3, SV_VARIANT4
from hail_search.web_app import init_web_app

PROJECT_2_VARIANT = {
@@ -119,7 +119,7 @@ async def test_status(self):
            resp_json = await resp.json()
        self.assertDictEqual(resp_json, {'success': True})

    async def _assert_expected_search(self, results, **search_kwargs):
    async def _assert_expected_search(self, results, gene_counts=None, **search_kwargs):
        search_body = get_hail_search_body(**search_kwargs)
        async with self.client.request('POST', '/search', json=search_body) as resp:
            self.assertEqual(resp.status, 200)
@@ -129,9 +129,18 @@ async def _assert_expected_search(self, results, **search_kwargs):
        for i, result in enumerate(resp_json['results']):
            self.assertEqual(result, results[i])

        if gene_counts:
            async with self.client.request('POST', '/gene_counts', json=search_body) as resp:
                self.assertEqual(resp.status, 200)
                gene_counts_json = await resp.json()
                self.assertDictEqual(gene_counts_json, gene_counts)

    async def test_single_family_search(self):
        await self._assert_expected_search(
            [VARIANT1, VARIANT2, VARIANT3, VARIANT4], sample_data=FAMILY_2_VARIANT_SAMPLE_DATA,
            [VARIANT1, VARIANT2, VARIANT3, VARIANT4], sample_data=FAMILY_2_VARIANT_SAMPLE_DATA, gene_counts={
                'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}},
                'ENSG00000177000': {'total': 2, 'families': {'F000002_2': 2}},
            }
        )

        await self._assert_expected_search(
@@ -140,13 +149,16 @@ async def test_single_family_search(self):

    async def test_single_project_search(self):
        await self._assert_expected_search(
            [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], omit_sample_type='SV_WES',
            [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], omit_sample_type='SV_WES', gene_counts={
                'ENSG00000097046': {'total': 3, 'families': {'F000002_2': 2, 'F000003_3': 1}},
                'ENSG00000177000': {'total': 3, 'families': {'F000002_2': 2, 'F000003_3': 1}},
            }
        )

    async def test_multi_project_search(self):
        await self._assert_expected_search(
            [PROJECT_2_VARIANT, MULTI_PROJECT_VARIANT1, MULTI_PROJECT_VARIANT2, VARIANT3, VARIANT4],
            sample_data=MULTI_PROJECT_SAMPLE_DATA,
            gene_counts=GENE_COUNTS, sample_data=MULTI_PROJECT_SAMPLE_DATA,
        )

    async def test_inheritance_filter(self):
@@ -195,8 +207,10 @@

        inheritance_mode = 'compound_het'
        await self._assert_expected_search(
            [[VARIANT3, VARIANT4]], inheritance_mode=inheritance_mode, sample_data=MULTI_PROJECT_SAMPLE_DATA,
            **COMP_HET_ALL_PASS_FILTERS,
            [[VARIANT3, VARIANT4]], inheritance_mode=inheritance_mode, sample_data=MULTI_PROJECT_SAMPLE_DATA, gene_counts={
                'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}},
                'ENSG00000177000': {'total': 1, 'families': {'F000002_2': 1}},
            }, **COMP_HET_ALL_PASS_FILTERS,
        )

        await self._assert_expected_search(
@@ -206,8 +220,10 @@

        inheritance_mode = 'recessive'
        await self._assert_expected_search(
            [PROJECT_2_VARIANT1, VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode=inheritance_mode,
            sample_data=MULTI_PROJECT_SAMPLE_DATA, **COMP_HET_ALL_PASS_FILTERS,
            [PROJECT_2_VARIANT1, VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode=inheritance_mode, gene_counts={
                'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}},
                'ENSG00000177000': {'total': 2, 'families': {'F000002_2': 2}},
            }, sample_data=MULTI_PROJECT_SAMPLE_DATA, **COMP_HET_ALL_PASS_FILTERS,
        )

        await self._assert_expected_search(
5 changes: 5 additions & 0 deletions hail_search/test_utils.py
@@ -537,6 +537,11 @@
VARIANT_ID_SEARCH = {'variant_ids': [['1', 10439, 'AC', 'A'], ['1', 91511686, 'TCA', 'G']], 'rs_ids': []}
RSID_SEARCH = {'variant_ids': [], 'rs_ids': ['rs1801131']}

GENE_COUNTS = {
    'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}},
    'ENSG00000177000': {'total': 3, 'families': {'F000002_2': 2, 'F000011_11': 1}},
}


def get_hail_search_body(genome_version='GRCh38', num_results=100, sample_data=None, omit_sample_type=None, **search_body):
    sample_data = sample_data or EXPECTED_SAMPLE_DATA
5 changes: 5 additions & 0 deletions hail_search/web_app.py
@@ -14,6 +14,10 @@ def hl_json_dumps(obj):
    return json.dumps(obj, default=_hl_json_default)


async def gene_counts(request: web.Request) -> web.Response:
    return web.json_response(search_hail_backend(await request.json(), gene_counts=True), dumps=hl_json_dumps)


async def search(request: web.Request) -> web.Response:
    hail_results, total_results = search_hail_backend(await request.json())
    return web.json_response({'results': hail_results, 'total': total_results}, dumps=hl_json_dumps)
@@ -28,6 +32,7 @@ def init_web_app():
    app.add_routes([
        web.get('/status', status),
        web.post('/search', search),
        web.post('/gene_counts', gene_counts),
    ])
    load_globals()
    return app
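A short client sketch (not part of the commit), assuming the service listens on localhost:5000 as in the seqr tests; /gene_counts accepts the same JSON body as /search.

    import aiohttp

    async def fetch_gene_counts(search_body):
        # POST the /search body to the new /gene_counts route.
        async with aiohttp.ClientSession() as session:
            async with session.post('http://localhost:5000/gene_counts', json=search_body) as resp:
                resp.raise_for_status()
                # -> {gene_id: {'total': int, 'families': {family_guid: int}}}
                return await resp.json()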
8 changes: 4 additions & 4 deletions seqr/utils/search/hail_search_utils_tests.py
@@ -8,10 +8,10 @@
from seqr.models import Family
from seqr.utils.search.utils import get_variant_query_gene_counts, query_variants, get_single_variant, \
    get_variants_for_variant_ids, InvalidSearchException
from seqr.utils.search.search_utils_tests import SearchTestHelper, MOCK_COUNTS
from seqr.utils.search.search_utils_tests import SearchTestHelper
from hail_search.test_utils import get_hail_search_body, EXPECTED_SAMPLE_DATA, FAMILY_1_SAMPLE_DATA, \
    FAMILY_2_ALL_SAMPLE_DATA, ALL_AFFECTED_SAMPLE_DATA, CUSTOM_AFFECTED_SAMPLE_DATA, HAIL_BACKEND_VARIANTS, \
    LOCATION_SEARCH, EXCLUDE_LOCATION_SEARCH, VARIANT_ID_SEARCH, RSID_SEARCH
    LOCATION_SEARCH, EXCLUDE_LOCATION_SEARCH, VARIANT_ID_SEARCH, RSID_SEARCH, GENE_COUNTS
MOCK_HOST = 'http://test-hail-host'


@@ -155,10 +155,10 @@ def test_query_variants(self):

    @responses.activate
    def test_get_variant_query_gene_counts(self):
        responses.add(responses.POST, f'{MOCK_HOST}:5000/gene_counts', json=MOCK_COUNTS, status=200)
        responses.add(responses.POST, f'{MOCK_HOST}:5000/gene_counts', json=GENE_COUNTS, status=200)

        gene_counts = get_variant_query_gene_counts(self.results_model, self.user)
        self.assertDictEqual(gene_counts, MOCK_COUNTS)
        self.assertDictEqual(gene_counts, GENE_COUNTS)
        self.assert_cached_results({'gene_aggs': gene_counts})
        self._test_expected_search_call(sort=None)

13 changes: 4 additions & 9 deletions seqr/utils/search/search_utils_tests.py
@@ -4,17 +4,12 @@
import json
import mock

from hail_search.test_utils import GENE_COUNTS
from seqr.models import Family, Sample, VariantSearch, VariantSearchResults
from seqr.utils.search.utils import get_single_variant, get_variants_for_variant_ids, get_variant_query_gene_counts, \
    query_variants, InvalidSearchException
from seqr.views.utils.test_utils import PARSED_VARIANTS, PARSED_COMPOUND_HET_VARIANTS_MULTI_PROJECT, GENE_FIELDS

MOCK_COUNTS = {
    'ENSG00000135953': {'total': 3, 'families': {'F000003_3': 2, 'F000002_2': 1, 'F000005_5': 1}},
    'ENSG00000228198': {'total': 5, 'families': {'F000003_3': 4, 'F000002_2': 1, 'F000005_5': 1}},
    'ENSG00000240361': {'total': 2, 'families': {'F000003_3': 2}},
}


class SearchTestHelper(object):

@@ -354,12 +349,12 @@ def test_invalid_search_get_variant_query_gene_counts(self):

    def test_get_variant_query_gene_counts(self, mock_get_variants):
        def _mock_get_variants(families, search, user, previous_search_results, genome_version, **kwargs):
            previous_search_results['gene_aggs'] = MOCK_COUNTS
            return MOCK_COUNTS
            previous_search_results['gene_aggs'] = GENE_COUNTS
            return GENE_COUNTS
        mock_get_variants.side_effect = _mock_get_variants

        gene_counts = get_variant_query_gene_counts(self.results_model, self.user)
        self.assertDictEqual(gene_counts, MOCK_COUNTS)
        self.assertDictEqual(gene_counts, GENE_COUNTS)
        results_cache = {'gene_aggs': gene_counts}
        self.assert_cached_results(results_cache)
        self._test_expected_search_call(
63 changes: 56 additions & 7 deletions seqr/views/apis/report_api.py
@@ -713,8 +713,9 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):
READ_RNA_TABLE_COLUMNS.insert(READ_RNA_TABLE_COLUMNS.index('gene_annotation')+1, 'gene_annotation_details')
READ_RNA_TABLE_COLUMNS.insert(READ_RNA_TABLE_COLUMNS.index('alignment_log_file')+1, 'alignment_postprocessing')
READ_SET_TABLE_COLUMNS = ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id']
CALLED_VARIANT_FILE_COLUMN = 'called_variants_dna_file'
CALLED_TABLE_COLUMNS = [
    'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum',
    'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', CALLED_VARIANT_FILE_COLUMN, 'md5sum',
    'caller_software', 'variant_types', 'analysis_details',
]

@@ -912,7 +913,9 @@ def gregor_export(request):
        ('experiment_dna_short_read', EXPERIMENT_TABLE_COLUMNS, airtable_rows),
        ('aligned_dna_short_read', READ_TABLE_COLUMNS, airtable_rows),
        ('aligned_dna_short_read_set', READ_SET_TABLE_COLUMNS, airtable_rows),
        ('called_variants_dna_short_read', CALLED_TABLE_COLUMNS, airtable_rows),
        ('called_variants_dna_short_read', CALLED_TABLE_COLUMNS, [
            row for row in airtable_rows if row.get(CALLED_VARIANT_FILE_COLUMN)
        ]),
        ('experiment_rna_short_read', EXPERIMENT_RNA_TABLE_COLUMNS, airtable_rna_rows),
        ('aligned_rna_short_read', READ_RNA_TABLE_COLUMNS, airtable_rna_rows),
        ('experiment', EXPERIMENT_LOOKUP_TABLE_COLUMNS, experiment_lookup_rows),
@@ -1033,6 +1036,19 @@ def _get_experiment_lookup_row(is_rna, row_data):
    }


is_integer = lambda val, *args: val.isnumeric() or bool(re.match(r'^\d{1,3}(,\d{3})*$', val))
DATA_TYPE_VALIDATORS = {
    'string': lambda val, validator: (not validator.get('is_bucket_path')) or val.startswith('gs://'),
    'enumeration': lambda val, validator: val in validator['enumerations'],
    'integer': is_integer,
    'float': lambda val, validator: is_integer(val) or bool(re.match(r'^\d+\.\d+$', val)),
    'date': lambda val, validator: bool(re.match(r'^\d{4}-\d{2}-\d{2}$', val)),
}
DATA_TYPE_ERROR_FORMATTERS = {
    'string': lambda validator: ' are a google bucket path starting with gs://',
    'enumeration': lambda validator: f': {", ".join(validator["enumerations"])}',
}

def _validate_gregor_files(file_data):
    errors = []
    warnings = []
@@ -1070,6 +1086,26 @@ def _validate_gregor_files(file_data):
        warnings.append(
            f'The following columns are included in the "{file_name}" data model but are missing in the report: {col_summary}'
        )
        invalid_data_type_columns = {
            col: validator['data_type'] for col, validator in table_validator.items()
            if validator.get('data_type') and validator['data_type'] not in DATA_TYPE_VALIDATORS
        }
        if invalid_data_type_columns:
            col_summary = ', '.join(sorted([f'{col} ({data_type})' for col, data_type in invalid_data_type_columns.items()]))
            warnings.append(
                f'The following columns are included in the "{file_name}" data model but have an unsupported data type: {col_summary}'
            )
        invalid_enum_columns = [
            col for col, validator in table_validator.items()
            if validator.get('data_type') == 'enumeration' and not validator.get('enumerations')
        ]
        if invalid_enum_columns:
            for col in invalid_enum_columns:
                table_validator[col]['data_type'] = None
            col_summary = ', '.join(sorted(invalid_enum_columns))
            warnings.append(
                f'The following columns are specified as "enumeration" in the "{file_name}" data model but are missing the allowed values definition: {col_summary}'
            )

        for column in columns:
            _validate_column_data(
@@ -1112,15 +1148,18 @@ def _has_required_table(table, validator, tables):


def _validate_column_data(column, file_name, data, column_validator, warnings, errors):
    enum = column_validator.get('enumerations')
    data_type = column_validator.get('data_type')
    data_type_validator = DATA_TYPE_VALIDATORS.get(data_type)
    unique = column_validator.get('is_unique')
    required = column_validator.get('required')
    recommended = column in WARN_MISSING_TABLE_COLUMNS.get(file_name, [])
    if not (required or enum or recommended):
    if not (required or unique or recommended or data_type_validator):
        return

    missing = []
    warn_missing = []
    invalid = []
    grouped_values = defaultdict(set)
    for row in data:
        value = row.get(column)
        if not value:
@@ -1130,9 +1169,13 @@ def _validate_column_data(column, file_name, data, column_validator, warnings, errors):
                check_recommend_condition = WARN_MISSING_CONDITIONAL_COLUMNS.get(column)
                if not check_recommend_condition or check_recommend_condition(row):
                    warn_missing.append(_get_row_id(row))
        elif enum and value not in enum:
        elif data_type_validator and not data_type_validator(value, column_validator):
            invalid.append(f'{_get_row_id(row)} ({value})')
        elif unique:
            grouped_values[value].add(_get_row_id(row))

    duplicates = [f'{k} ({", ".join(sorted(v))})' for k, v in grouped_values.items() if len(v) > 1]
    if missing or warn_missing or invalid:
    if missing or warn_missing or invalid or duplicates:
        airtable_summary = ' (from Airtable)' if column in ALL_AIRTABLE_COLUMNS else ''
        error_template = f'The following entries {{issue}} "{column}"{airtable_summary} in the "{file_name}" table'
        if missing:
@@ -1141,8 +1184,14 @@
            )
        if invalid:
            invalid_values = f'Invalid values: {", ".join(sorted(invalid))}'
            allowed = DATA_TYPE_ERROR_FORMATTERS[data_type](column_validator) \
                if data_type in DATA_TYPE_ERROR_FORMATTERS else f' have data type {data_type}'
            errors.append(
                f'{error_template.format(issue="have invalid values for")}. Allowed values: {", ".join(enum)}. {invalid_values}'
                f'{error_template.format(issue="have invalid values for")}. Allowed values{allowed}. {invalid_values}'
            )
        if duplicates:
            errors.append(
                f'{error_template.format(issue="have non-unique values for")}: {", ".join(sorted(duplicates))}'
            )
        if warn_missing:
            warnings.append(
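A quick illustration (not part of the commit) of how the new data-type validators behave on made-up values; each validator takes the raw string value plus its column definition from the data model.

    from seqr.views.apis.report_api import DATA_TYPE_VALIDATORS

    assert DATA_TYPE_VALIDATORS['integer']('1,024', {})            # comma-grouped integer
    assert DATA_TYPE_VALIDATORS['float']('3.14', {})               # plain decimal
    assert DATA_TYPE_VALIDATORS['date']('2023-08-24', {})          # ISO date
    assert DATA_TYPE_VALIDATORS['string']('gs://bucket/a.vcf', {'is_bucket_path': True})
    assert not DATA_TYPE_VALIDATORS['enumeration']('maybe', {'enumerations': ['yes', 'no']})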