Skip to content

Commit

Permalink
Merge branch 'rna-seq-optimization' of https://github.com/broadinstit…
Browse files Browse the repository at this point in the history
…ute/seqr into rna-seq-manage-cleanup
  • Loading branch information
hanars committed Feb 6, 2024
2 parents e6b9a9e + b57c19e commit 7736faf
Show file tree
Hide file tree
Showing 13 changed files with 136 additions and 57 deletions.
23 changes: 15 additions & 8 deletions hail_search/queries/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,10 +250,12 @@ def _load_filtered_table(self, sample_data, intervals=None, **kwargs):
def _get_table_path(cls, path, use_ssd_dir=False):
return f'{SSD_DATASETS_DIR if use_ssd_dir else DATASETS_DIR}/{cls.GENOME_VERSION}/{cls.DATA_TYPE}/{path}'

def _read_table(self, path, drop_globals=None, use_ssd_dir=False):
def _read_table(self, path, drop_globals=None, use_ssd_dir=False, skip_missing_field=None):
table_path = self._get_table_path(path, use_ssd_dir=use_ssd_dir)
if 'variant_ht' in self._load_table_kwargs:
ht = self._query_table_annotations(self._load_table_kwargs['variant_ht'], table_path)
if skip_missing_field and not ht.any(hl.is_defined(ht[skip_missing_field])):
return None
ht_globals = hl.read_table(table_path).globals
if drop_globals:
ht_globals = ht_globals.drop(*drop_globals)
Expand All @@ -275,11 +277,17 @@ def _parse_sample_data(self, sample_data):
logger.info(f'Loading {self.DATA_TYPE} data for {len(family_samples)} families in {len(project_samples)} projects')
return project_samples, family_samples

def _load_filtered_project_hts(self, project_samples, **kwargs):
def _load_filtered_project_hts(self, project_samples, skip_all_missing=False, **kwargs):
filtered_project_hts = []
exception_messages = set()
for project_guid, project_sample_data in project_samples.items():
project_ht = self._read_table(f'projects/{project_guid}.ht', use_ssd_dir=True)
for i, (project_guid, project_sample_data) in enumerate(project_samples.items()):
project_ht = self._read_table(
f'projects/{project_guid}.ht',
use_ssd_dir=True,
skip_missing_field='entries' if skip_all_missing or i > 0 else None,
)
if project_ht is None:
continue
try:
filtered_project_hts.append(self._filter_entries_table(project_ht, project_sample_data, **kwargs))
except HTTPBadRequest as e:
Expand Down Expand Up @@ -960,10 +968,9 @@ def lookup_variant(self, variant_id, sample_data=None):

if sample_data:
project_samples, _ = self._parse_sample_data(sample_data)
for pht, _ in self._load_filtered_project_hts(project_samples):
for pht, _ in self._load_filtered_project_hts(project_samples, skip_all_missing=True):
project_entries = pht.aggregate(hl.agg.take(hl.struct(**{k: v(pht) for k, v in entry_annotations.items()}), 1))
if project_entries:
variant[FAMILY_GUID_FIELD] += project_entries[0][FAMILY_GUID_FIELD]
variant[GENOTYPES_FIELD].update(project_entries[0][GENOTYPES_FIELD])
variant[FAMILY_GUID_FIELD] += project_entries[0][FAMILY_GUID_FIELD]
variant[GENOTYPES_FIELD].update(project_entries[0][GENOTYPES_FIELD])

return variant
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ def test_command(self, mock_logger, mock_send_email, mock_send_slack, mock_subpr
])
mock_logger.warining.assert_not_called()

mock_redis.return_value.delete.assert_called_with('search_results__*')
mock_utils_logger.info.assert_called_with('Reset 1 cached results')
mock_redis.return_value.delete.assert_called_with('search_results__*', 'variant_lookup_results__*')
mock_utils_logger.info.assert_called_with('Reset 2 cached results')

# Tests Sample models created/updated
updated_sample_models = Sample.objects.filter(guid__in={
Expand Down
7 changes: 4 additions & 3 deletions seqr/management/tests/reset_cached_search_results_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def setUpTestData(cls):
@mock.patch('seqr.views.utils.variant_utils.logger')
@mock.patch('seqr.management.commands.reset_cached_search_results.logger')
def test_command(self, mock_command_logger, mock_utils_logger, mock_redis):
mock_redis.return_value.keys.side_effect = lambda pattern: [pattern]
mock_redis.return_value.keys.side_effect = lambda pattern: [pattern] if pattern != 'variant_lookup_results__*' else []

# Test command with a --project argument
call_command('reset_cached_search_results', '--project={}'.format(PROJECT_NAME))
Expand All @@ -47,9 +47,10 @@ def test_command(self, mock_command_logger, mock_utils_logger, mock_redis):

# Test command for reset metadata
mock_redis.reset_mock()
mock_redis.return_value.keys.side_effect = lambda pattern: [pattern]
call_command('reset_cached_search_results', '--reset-index-metadata')
mock_redis.return_value.delete.assert_called_with('search_results__*', 'index_metadata__*')
mock_utils_logger.info.assert_called_with('Reset 2 cached results')
mock_redis.return_value.delete.assert_called_with('search_results__*', 'variant_lookup_results__*', 'index_metadata__*')
mock_utils_logger.info.assert_called_with('Reset 3 cached results')
mock_command_logger.info.assert_called_with('Reset cached search results for all projects')

# Test with connection error
Expand Down
5 changes: 2 additions & 3 deletions seqr/utils/search/hail_search_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from django.db.models import F, Min, Count

import requests
from reference_data.models import Omim, GeneConstraint, GENOME_VERSION_LOOKUP, GENOME_VERSION_GRCh38
from reference_data.models import Omim, GeneConstraint, GENOME_VERSION_LOOKUP
from seqr.models import Sample, PhenotypePrioritization
from seqr.utils.search.constants import PRIORITIZED_GENE_SORT, X_LINKED_RECESSIVE
from seqr.utils.xpos_utils import MIN_POS, MAX_POS
Expand Down Expand Up @@ -74,9 +74,8 @@ def get_hail_variants_for_variant_ids(samples, genome_version, parsed_variant_id
return response_json['results']


def hail_variant_lookup(user, variant_id, genome_version=None, samples=None, **kwargs):
def hail_variant_lookup(user, variant_id, samples=None, **kwargs):
body = {
'genome_version': GENOME_VERSION_LOOKUP[genome_version or GENOME_VERSION_GRCh38],
'variant_id': variant_id,
**kwargs,
}
Expand Down
17 changes: 13 additions & 4 deletions seqr/utils/search/search_utils_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ def set_up(self):
def set_cache(self, cached):
self.mock_redis.get.return_value = json.dumps(cached)

def assert_cached_results(self, expected_results, sort='xpos'):
cache_key = f'search_results__{self.results_model.guid}__{sort}'
def assert_cached_results(self, expected_results, sort='xpos', cache_key=None):
cache_key = cache_key or f'search_results__{self.results_model.guid}__{sort}'
self.mock_redis.set.assert_called_with(cache_key, mock.ANY)
self.assertEqual(json.loads(self.mock_redis.set.call_args.args[1]), expected_results)
self.mock_redis.expire.assert_called_with(cache_key, timedelta(weeks=2))
Expand All @@ -55,18 +55,27 @@ def test_variant_lookup(self, mock_variant_lookup):
mock_variant_lookup.return_value = VARIANT_LOOKUP_VARIANT
variant = variant_lookup(self.user, '1-10439-AC-A', genome_version='38')
self.assertDictEqual(variant, VARIANT_LOOKUP_VARIANT)
mock_variant_lookup.assert_called_with(self.user, ('1', 10439, 'AC', 'A'), genome_version='38')
mock_variant_lookup.assert_called_with(self.user, ('1', 10439, 'AC', 'A'), genome_version='GRCh38')
cache_key = 'variant_lookup_results__1-10439-AC-A__38__test_user'
self.assert_cached_results(variant, cache_key=cache_key)

variant = variant_lookup(self.user, '1-10439-AC-A', genome_version='37', families=self.families)
self.assertDictEqual(variant, VARIANT_LOOKUP_VARIANT)
mock_variant_lookup.assert_called_with(self.user, ('1', 10439, 'AC', 'A'), genome_version='37', samples=mock.ANY)
mock_variant_lookup.assert_called_with(self.user, ('1', 10439, 'AC', 'A'), genome_version='GRCh37', samples=mock.ANY)
expected_samples = {s for s in self.search_samples if s.guid not in NON_SNP_INDEL_SAMPLES}
self.assertSetEqual(set(mock_variant_lookup.call_args.kwargs['samples']), expected_samples)

with self.assertRaises(InvalidSearchException) as cm:
variant_lookup(self.user, '100-10439-AC-A')
self.assertEqual(str(cm.exception), 'Invalid variant 100-10439-AC-A')

mock_variant_lookup.reset_mock()
self.set_cache(variant)
cached_variant = variant_lookup(self.user, '1-10439-AC-A', genome_version='38')
self.assertDictEqual(variant, cached_variant)
mock_variant_lookup.assert_not_called()
self.mock_redis.get.assert_called_with(cache_key)

def test_get_single_variant(self, mock_get_variants_for_ids):
mock_get_variants_for_ids.return_value = [PARSED_VARIANTS[0]]
variant = get_single_variant(self.families, '2-103343353-GAGA-G', user=self.user)
Expand Down
13 changes: 11 additions & 2 deletions seqr/utils/search/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from copy import deepcopy
from datetime import timedelta

from reference_data.models import GENOME_VERSION_LOOKUP, GENOME_VERSION_GRCh38
from seqr.models import Sample, Individual, Project
from seqr.utils.redis_utils import safe_redis_get_json, safe_redis_set_json
from seqr.utils.search.constants import XPOS_SORT_KEY, PRIORITIZED_GENE_SORT, RECESSIVE, COMPOUND_HET, \
Expand Down Expand Up @@ -152,7 +153,13 @@ def _get_variants_for_variant_ids(families, variant_ids, user, dataset_type=None
)


def variant_lookup(user, variant_id, families=None, **kwargs):
def variant_lookup(user, variant_id, families=None, genome_version=None, **kwargs):
genome_version = genome_version or GENOME_VERSION_GRCh38
cache_key = f'variant_lookup_results__{variant_id}__{genome_version}__{user}'
variant = safe_redis_get_json(cache_key)
if variant:
return variant

parsed_variant_id = _parse_variant_id(variant_id)
if not parsed_variant_id:
raise InvalidSearchException(f'Invalid variant {variant_id}')
Expand All @@ -162,7 +169,9 @@ def variant_lookup(user, variant_id, families=None, **kwargs):
kwargs['samples'] = samples

lookup_func = backend_specific_call(_raise_search_error('Hail backend is disabled'), hail_variant_lookup)
return lookup_func(user, parsed_variant_id, **kwargs)
variant = lookup_func(user, parsed_variant_id, genome_version=GENOME_VERSION_LOOKUP[genome_version], **kwargs)
safe_redis_set_json(cache_key, variant, expire=timedelta(weeks=2))
return variant


def _get_search_cache_key(search_model, sort=None):
Expand Down
7 changes: 7 additions & 0 deletions seqr/views/apis/data_manager_api_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,13 @@ def _set_file_iter_stdout(rows):
mock_file_iter.stdout = [('\t'.join([str(col) for col in row]) + '\n').encode() for row in rows]
mock_subprocess.side_effect = [mock_does_file_exist, mock_file_iter]

_set_file_iter_stdout([])
invalid_body = {**body, 'file': body['file'].replace('tsv', 'xlsx')}
response = self.client.post(url, content_type='application/json', data=json.dumps(invalid_body))
self.assertEqual(response.status_code, 400)
self.assertDictEqual(
response.json(), {'error': 'Unexpected iterated file type: gs://rna_data/muscle_samples.xlsx'})

_set_file_iter_stdout([['']])
response = self.client.post(url, content_type='application/json', data=json.dumps(body))
self.assertEqual(response.status_code, 400)
Expand Down
6 changes: 5 additions & 1 deletion seqr/views/utils/dataset_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
logger = SeqrLogger(__name__)


MAX_UNSAVED_DATA_PER_SAMPLE = 5000


def load_mapping_file(mapping_file_path, user):
file_content = parse_file(mapping_file_path, file_iter(mapping_file_path, user=user))
return load_mapping_file_content(file_content)
Expand Down Expand Up @@ -389,7 +392,8 @@ def _load_rna_seq_file(
continue

if current_sample != sample_guid:
if len(samples_by_guid[current_sample]) > 5000:
# If a large amount of data has been parsed for the previous sample, save and do not keep in memory
if len(samples_by_guid[current_sample]) > MAX_UNSAVED_DATA_PER_SAMPLE:
save_sample_data(current_sample, samples_by_guid[current_sample])
del samples_by_guid[current_sample]
current_sample = sample_guid
Expand Down
2 changes: 1 addition & 1 deletion seqr/views/utils/file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def parse_file(filename, stream, iter_file=False):
elif filename.endswith('.json') and not iter_file:
return json.loads(stream.read())

raise ValueError("Unexpected file type: {}".format(filename))
raise ValueError(f"Unexpected{' iterated' if iter_file else ''} file type: {filename}")


def _parse_excel_string_cell(cell):
Expand Down
2 changes: 1 addition & 1 deletion seqr/views/utils/orm_to_json_utils_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def test_json_for_saved_variants_with_tags(self):
self.assertSetEqual(set(var_2['noteGuids']), set(v2_note_guids))

self.assertSetEqual(set(json['variantTagsByGuid'].keys()), v1_tag_guids | v2_tag_guids)
self.assertSetEqual(set(next(iter(json['variantTagsByGuid'].values())).keys()), TAG_FIELDS)
self.assertSetEqual(set(json['variantTagsByGuid']['VT1726961_2103343353_r0390_100'].keys()), TAG_FIELDS)
for tag_guid in v1_tag_guids:
self.assertListEqual(json['variantTagsByGuid'][tag_guid]['variantGuids'], [variant_guid_1])
for tag_guid in v2_tag_guids:
Expand Down
1 change: 1 addition & 0 deletions seqr/views/utils/variant_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ def reset_cached_search_results(project, reset_index_metadata=False):
keys_to_delete += redis_client.keys(pattern='search_results__{}*'.format(guid))
else:
keys_to_delete = redis_client.keys(pattern='search_results__*')
keys_to_delete += redis_client.keys(pattern='variant_lookup_results__*')
if reset_index_metadata:
keys_to_delete += redis_client.keys(pattern='index_metadata__*')
if keys_to_delete:
Expand Down
96 changes: 68 additions & 28 deletions ui/shared/components/panel/variants/Annotations.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import { Popup, Label, Icon } from 'semantic-ui-react'

import {
getGenesById,
getTranscriptsById,
getLocusListIntervalsByChromProject,
getOmimIntervalsByChrom,
getFamiliesByGuid,
Expand Down Expand Up @@ -218,6 +219,21 @@ const getLitSearch = (genes, variations) => {
return search
}

const shouldShowNonDefaultTranscriptInfoIcon = (variant, transcript, transcriptsById) => {
const allVariantTranscripts = Object.values(variant.transcripts)?.flat() || []
const canonical = allVariantTranscripts.find(t => t.canonical) || null
const mane = allVariantTranscripts.find(
t => transcriptsById[t.transcriptId]?.isManeSelect || false,
) || null

const result = canonical !== null &&
mane !== null &&
transcript.transcriptId !== canonical.transcriptId &&
transcript.transcriptId !== mane.transcriptId

return result
}

const VARIANT_LINKS = [
{
name: 'gnomAD',
Expand Down Expand Up @@ -417,7 +433,7 @@ const svSizeDisplay = (size) => {
return `${(size / 1000000).toFixed(2) / 1}Mb`
}

const Annotations = React.memo(({ variant, mainGeneId, showMainGene }) => {
const Annotations = React.memo(({ variant, mainGeneId, showMainGene, transcriptsById }) => {
const {
rsid, svType, numExon, pos, end, svTypeDetail, svSourceDetail, cpxIntervals, algorithms, bothsidesSupport,
endChrom,
Expand Down Expand Up @@ -457,32 +473,51 @@ const Annotations = React.memo(({ variant, mainGeneId, showMainGene }) => {
return (
<div>
{(mainTranscript.majorConsequence || svType) && (
<Modal
modalName={`${variant.variantId}-annotations`}
title="Transcripts"
size="large"
trigger={
<ButtonLink size={svType && 'big'}>
{svType ? (SVTYPE_LOOKUP[svType] || svType) : mainTranscript.majorConsequence.replace(/_/g, ' ')}
{svType && (svTypeDetail || svSourceDetail) && (
<Popup
trigger={<Icon name="info circle" size="small" corner="top right" />}
content={
<div>
{(SVTYPE_DETAILS[svType] || {})[svTypeDetail] || svTypeDetail || ''}
{svTypeDetail && <br />}
{svSourceDetail && `Inserted from chr${svSourceDetail.chrom}`}
</div>
}
position="top center"
/>
)}
</ButtonLink>
}
popup={transcriptPopupProps}
>
<Transcripts variant={variant} />
</Modal>
<div>
<Modal
modalName={`${variant.variantId}-annotations`}
title="Transcripts"
size="large"
trigger={
<ButtonLink size={svType && 'big'}>
{svType ? (SVTYPE_LOOKUP[svType] || svType) : mainTranscript.majorConsequence.replace(/_/g, ' ')}
{svType && (svTypeDetail || svSourceDetail) && (
<Popup
trigger={<Icon name="info circle" size="small" corner="top right" />}
content={
<div>
{(SVTYPE_DETAILS[svType] || {})[svTypeDetail] || svTypeDetail || ''}
{svTypeDetail && <br />}
{svSourceDetail && `Inserted from chr${svSourceDetail.chrom}`}
</div>
}
position="top center"
/>
)}
</ButtonLink>
}
popup={transcriptPopupProps}
>
<Transcripts variant={variant} />
</Modal>
<HorizontalSpacer width={2} />
{shouldShowNonDefaultTranscriptInfoIcon(variant, mainTranscript, transcriptsById) && (
<span>
<Popup
trigger={<Icon name="info circle" color="yellow" />}
content={
<div>
This transcript is neither the Gencode Canonical transcript nor the MANE transcript.
It has been selected by seqr as it has the most severe consequence for the variant
given your search parameters.
Click on the consequence to see alternate transcripts which may have other consequences.
</div>
}
position="top left"
/>
</span>
)}
</div>
)}
{svType && end && !endChrom && end !== pos && (
<b>
Expand Down Expand Up @@ -602,6 +637,11 @@ Annotations.propTypes = {
variant: PropTypes.object,
mainGeneId: PropTypes.string,
showMainGene: PropTypes.bool,
transcriptsById: PropTypes.object.isRequired,
}

export default Annotations
const mapAnnotationsStateToProps = state => ({
transcriptsById: getTranscriptsById(state),
})

export default connect(mapAnnotationsStateToProps)(Annotations)
10 changes: 6 additions & 4 deletions ui/shared/components/panel/variants/Annotations.test.js
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
import React from 'react'
import { shallow, configure } from 'enzyme'
import Adapter from '@wojtekmaj/enzyme-adapter-react-17'
import { getUser } from 'redux/selectors'
import configureStore from 'redux-mock-store'
import Annotations from './Annotations'
import { VARIANT, SV_VARIANT } from '../fixtures'
import { STATE1, VARIANT, SV_VARIANT } from '../fixtures'

configure({ adapter: new Adapter() })

test('shallow-render without crashing', () => {
shallow(<Annotations variant={VARIANT} />)
shallow(<Annotations variant={SV_VARIANT} />)
const store = configureStore()(STATE1)

shallow(<Annotations store={store} variant={VARIANT} />)
shallow(<Annotations store={store} variant={SV_VARIANT} />)
})

0 comments on commit 7736faf

Please sign in to comment.