Merge pull request #4381 from broadinstitute/dev
Dev
hanars authored Sep 19, 2024
2 parents 4d44c7d + e204ced commit 8d474f7
Showing 43 changed files with 772 additions and 506 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,10 @@

## dev

+## 9/19/24
+* Update Biosample choices (REQUIRES DB MIGRATION)
+* Add support for Azure OAuth
+
## 8/14/24
* Remove ONT support (REQUIRES DB MIGRATION)
* Add "Validated Name" functional tag (REQUIRES DB MIGRATION)
2 changes: 2 additions & 0 deletions hail_search/constants.py
@@ -88,3 +88,5 @@
('likely_disease_causing', 'DM?', 'DM?'),
('hgmd_other', 'DP', None),
]

+MAX_LOAD_INTERVALS = 1000
8 changes: 5 additions & 3 deletions hail_search/queries/base.py
@@ -6,15 +6,15 @@

from hail_search.constants import AFFECTED_ID, ALT_ALT, ANNOTATION_OVERRIDE_FIELDS, ANY_AFFECTED, COMP_HET_ALT, \
COMPOUND_HET, GENOME_VERSION_GRCh38, GROUPED_VARIANTS_FIELD, ALLOWED_TRANSCRIPTS, ALLOWED_SECONDARY_TRANSCRIPTS, HAS_ANNOTATION_OVERRIDE, \
-    HAS_ALT, HAS_REF,INHERITANCE_FILTERS, PATH_FREQ_OVERRIDE_CUTOFF, MALE, RECESSIVE, REF_ALT, REF_REF, \
+    HAS_ALT, HAS_REF,INHERITANCE_FILTERS, PATH_FREQ_OVERRIDE_CUTOFF, MALE, RECESSIVE, REF_ALT, REF_REF, MAX_LOAD_INTERVALS, \
UNAFFECTED_ID, X_LINKED_RECESSIVE, XPOS, OMIM_SORT, FAMILY_GUID_FIELD, GENOTYPES_FIELD, AFFECTED_ID_MAP

DATASETS_DIR = os.environ.get('DATASETS_DIR', '/hail_datasets')
SSD_DATASETS_DIR = os.environ.get('SSD_DATASETS_DIR', DATASETS_DIR)

# Number of filtered genes at which pre-filtering a table by gene-intervals does not improve performance
# Estimated based on behavior for several representative gene lists
-MAX_GENE_INTERVALS = int(os.environ.get('MAX_GENE_INTERVALS', 100))
+MAX_GENE_INTERVALS = int(os.environ.get('MAX_GENE_INTERVALS', MAX_LOAD_INTERVALS))

# Optimal number of entry table partitions, balancing parallelization with partition overhead
# Experimentally determined based on compound het search performance:
@@ -92,6 +92,7 @@ def load_globals(cls):
ht_path = cls._get_table_path('annotations.ht')
ht_globals = hl.eval(hl.read_table(ht_path).globals.select(*cls.GLOBALS))
cls.LOADED_GLOBALS = {k: ht_globals[k] for k in cls.GLOBALS}
+        return cls.LOADED_GLOBALS

@classmethod
def _format_population_config(cls, pop_config):
@@ -237,7 +238,8 @@ def __init__(self, sample_data, sort=XPOS, sort_metadata=None, num_results=100,
self._has_secondary_annotations = False
self._is_multi_data_type_comp_het = False
self.max_unaffected_samples = None
-        self._load_table_kwargs = {'_n_partitions': min(MAX_PARTITIONS, (os.cpu_count() or 2)-1)}
+        self._n_partitions = min(MAX_PARTITIONS, (os.cpu_count() or 2)-1)
+        self._load_table_kwargs = {'_n_partitions': self._n_partitions}
self.entry_samples_by_family_guid = {}

if sample_data:
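The extracted `_n_partitions` is just the old inline expression given a name so subclasses (see mito.py below) can reuse it. A small illustration of how the cap resolves; the `MAX_PARTITIONS` value here is a placeholder, not the one defined in base.py:

```python
import os

MAX_PARTITIONS = 12  # placeholder; base.py defines the real cap
n_partitions = min(MAX_PARTITIONS, (os.cpu_count() or 2) - 1)
# e.g. on an 8-CPU machine: min(12, 7) -> 7
print(n_partitions)
```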
10 changes: 7 additions & 3 deletions hail_search/queries/mito.py
@@ -6,10 +6,9 @@

from hail_search.constants import ABSENT_PATH_SORT_OFFSET, CLINVAR_KEY, CLINVAR_MITO_KEY, CLINVAR_LIKELY_PATH_FILTER, CLINVAR_PATH_FILTER, \
CLINVAR_PATH_RANGES, CLINVAR_PATH_SIGNIFICANCES, ALLOWED_TRANSCRIPTS, ALLOWED_SECONDARY_TRANSCRIPTS, PATHOGENICTY_SORT_KEY, CONSEQUENCE_SORT, \
-    PATHOGENICTY_HGMD_SORT_KEY
+    PATHOGENICTY_HGMD_SORT_KEY, MAX_LOAD_INTERVALS
from hail_search.queries.base import BaseHailTableQuery, PredictionPath, QualityFilterFormat

-MAX_LOAD_INTERVALS = 1000

logger = logging.getLogger(__name__)

@@ -211,10 +210,15 @@ def _parse_variant_keys(self, variant_ids=None, **kwargs):
]

def _prefilter_entries_table(self, ht, parsed_intervals=None, exclude_intervals=False, **kwargs):
+        num_intervals = len(parsed_intervals or [])
if exclude_intervals and parsed_intervals:
ht = hl.filter_intervals(ht, parsed_intervals, keep=False)
-        elif len(parsed_intervals or []) >= MAX_LOAD_INTERVALS:
+        elif num_intervals >= MAX_LOAD_INTERVALS:
ht = hl.filter_intervals(ht, parsed_intervals)

+        if '_n_partitions' not in self._load_table_kwargs and num_intervals > self._n_partitions:
+            ht = ht.naive_coalesce(self._n_partitions)

return ht

def _get_allowed_consequence_ids(self, annotations):
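Why coalesce here? Filtering a table down to many small intervals can leave it with far more partitions than remaining work, and `naive_coalesce` merges adjacent partitions without a shuffle. A self-contained sketch of the effect (table size and partition counts are arbitrary, not values from the diff):

```python
import hail as hl

ht = hl.utils.range_table(10_000, n_partitions=200)
ht = ht.filter(ht.idx % 100 == 0)  # sparse survivors, still 200 partitions
ht = ht.naive_coalesce(8)          # merge adjacent partitions without a shuffle
print(ht.n_partitions())           # -> 8
```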
6 changes: 4 additions & 2 deletions hail_search/search.py
@@ -33,5 +33,7 @@ def lookup_variants(request):


def load_globals():
-    for cls in QUERY_CLASS_MAP.values():
-        cls.load_globals()
+    return {
+        str(k): v.load_globals()
+        for k, v in QUERY_CLASS_MAP.items()
+    }
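`load_globals` previously returned nothing; it now returns the loaded globals keyed by the stringified `QUERY_CLASS_MAP` keys, which is what the new `/reload_globals` route serializes. A hedged sketch of the shape (actual values depend on the loaded Hail tables):

```python
from hail_search.search import load_globals

globals_by_query = load_globals()
# e.g. {"('SNV_INDEL', 'GRCh38')": {'enums': {...}, 'versions': {...}}, ...}
print(list(globals_by_query))
```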
27 changes: 27 additions & 0 deletions hail_search/test_search.py
@@ -1,6 +1,7 @@
from aiohttp.test_utils import AioHTTPTestCase
import asyncio
from copy import deepcopy
+import hail as hl
import time
from unittest import mock

@@ -219,6 +220,32 @@ async def test_status(self):
resp_json = await resp.json()
self.assertDictEqual(resp_json, {'success': True})

+    async def test_reload_globals(self):
+        async with self.client.request('POST', '/reload_globals') as resp:
+            resp_json = await resp.json()
+        self.assertTrue(
+            resp_json["('SNV_INDEL', 'GRCh38')"]['versions']['gnomad_genomes'],
+        )
+        with mock.patch('hail_search.queries.base.hl.read_table') as mock_read_table:
+            mock_read_table.return_value = hl.Table.parallelize(
+                [],
+                hl.tstruct(),
+                globals=hl.Struct(
+                    enums=hl.Struct(reloaded_enum=1),
+                    versions=hl.Struct(reloaded_version=2),
+                )
+            )
+            async with self.client.request('POST', '/reload_globals') as resp:
+                self.assertEqual(resp.status, 200)
+                resp_json = await resp.json()
+        self.assertDictEqual(
+            resp_json["('SNV_INDEL', 'GRCh38')"],
+            {
+                'enums': {'reloaded_enum': 1},
+                'versions': {'reloaded_version': 2},
+            },
+        )

async def _assert_expected_search(self, results, gene_counts=None, **search_kwargs):
search_body = get_hail_search_body(**search_kwargs)
async with self.client.request('POST', '/search', json=search_body) as resp:
9 changes: 9 additions & 0 deletions hail_search/web_app.py
@@ -97,6 +97,14 @@ async def multi_lookup(request: web.Request) -> web.Response:
return web.json_response({'results': result}, dumps=hl_json_dumps)


+async def reload_globals(request: web.Request) -> web.Response:
+    result = await sync_to_async_hail_query(request, lambda _: load_globals())
+    return web.json_response(
+        result,
+        dumps=hl_json_dumps
+    )
+
+
async def status(request: web.Request) -> web.Response:
# Make sure the hail backend process is still alive.
await sync_to_async_hail_query(request, lambda _: hl.eval(1 + 1))
@@ -116,6 +124,7 @@ async def init_web_app():
app = web.Application(middlewares=[error_middleware], client_max_size=(1024**2)*10)
app.add_routes([
web.get('/status', status),
+        web.post('/reload_globals', reload_globals),
web.post('/search', search),
web.post('/gene_counts', gene_counts),
web.post('/lookup', lookup),
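One way to exercise the new route once the service is running; the host, port, and use of `requests` below are assumptions for illustration, not values from the diff:

```python
import requests

# POST with no body; the handler takes no parameters from the request.
resp = requests.post('http://localhost:5000/reload_globals', timeout=60)
resp.raise_for_status()
print(resp.json()["('SNV_INDEL', 'GRCh38')"]['versions'])
```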
3 changes: 2 additions & 1 deletion requirements-dev.txt
@@ -22,7 +22,7 @@ click==8.1.3
# via pip-tools
coverage==5.1
# via -r requirements-dev.in
-django==4.2.15
+django==4.2.16
# via
# -c requirements.txt
# django-appconf
@@ -83,3 +83,4 @@ wheel==0.38.4
# The following packages are considered to be unsafe in a requirements file:
# pip
# setuptools
+zipp>=3.19.1 # not directly required, pinned by Snyk to avoid a vulnerability
2 changes: 1 addition & 1 deletion requirements.txt
@@ -26,7 +26,7 @@ defusedxml==0.7.1
# via
# python3-openid
# social-auth-core
-django==4.2.15
+django==4.2.16
# via
# -r requirements.in
# django-anymail
22 changes: 22 additions & 0 deletions seqr/migrations/0074_alter_individual_primary_biosample.py
@@ -0,0 +1,22 @@
+# Generated by Django 4.2.15 on 2024-09-17 15:33
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('seqr', '0073_alter_variantfunctionaldata_functional_data_tag'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='individual',
+            name='primary_biosample',
+            field=models.CharField(blank=True, choices=[
+                ('IP', 'CL:0000034'), ('MO', 'CL:0000576'), ('LY', 'CL:0000542'), ('FI', 'CL:0000057'),
+                ('EM', 'UBERON:0005291'), ('NP', 'CL:0011020'), ('CE', 'UBERON:0002037'), ('CA', 'UBERON:0001133')
+            ], max_length=2, null=True),
+        ),
+    ]
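Per the changelog entry above, this change requires a migration run at deploy time. A minimal sketch of the standard Django invocation; how seqr deployments actually wrap this step is not shown in this diff:

```python
# Equivalent to `python manage.py migrate seqr`; requires DJANGO_SETTINGS_MODULE.
import django
from django.core.management import call_command

django.setup()
call_command('migrate', 'seqr')
```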

8 changes: 8 additions & 0 deletions seqr/models.py
@@ -553,6 +553,14 @@ class Individual(ModelWithGUID):
('CF', 'UBERON:0001359'), # cerebrospinal fluid
('U', 'UBERON:0001088'), # urine
('NE', 'UBERON:0019306'), # nose epithelium
+        ('IP', 'CL:0000034'),  # iPSC
+        ('MO', 'CL:0000576'),  # monocytes - PBMCs
+        ('LY', 'CL:0000542'),  # lymphocytes - LCLs
+        ('FI', 'CL:0000057'),  # fibroblasts
+        ('EM', 'UBERON:0005291'),  # embryonic tissue
+        ('NP', 'CL:0011020'),  # iPSC NPC
+        ('CE', 'UBERON:0002037'),  # cerebellum tissue
+        ('CA', 'UBERON:0001133'),  # cardiac tissue
]

ANALYTE_CHOICES = [
3 changes: 1 addition & 2 deletions seqr/urls.py
@@ -136,7 +136,7 @@
from seqr.views.apis.awesomebar_api import awesomebar_autocomplete_handler
from seqr.views.apis.auth_api import login_required_error, login_view, logout_view, policies_required_error
from seqr.views.apis.igv_api import fetch_igv_track, receive_igv_table_handler, update_individual_igv_sample, \
-    igv_genomes_proxy, receive_bulk_igv_table_handler
+    receive_bulk_igv_table_handler
from seqr.views.apis.analysis_group_api import update_analysis_group_handler, delete_analysis_group_handler, \
update_dynamic_analysis_group_handler, delete_dynamic_analysis_group_handler
from seqr.views.apis.project_api import create_project_handler, update_project_handler, delete_project_handler, \
@@ -277,7 +277,6 @@
'gene_info/(?P<gene_id>[^/]+)/note/(?P<note_guid>[^/]+)/delete': delete_gene_note_handler,

'hpo_terms/(?P<hpo_parent_id>[^/]+)': get_hpo_terms,
-    'igv_genomes/(?P<cloud_host>[^/]+)/(?P<file_path>.*)': igv_genomes_proxy,

'locus_lists/(?P<locus_list_guid>[^/]+)/update': update_locus_list_handler,
'locus_lists/(?P<locus_list_guid>[^/]+)/delete': delete_locus_list_handler,
6 changes: 6 additions & 0 deletions seqr/utils/social_auth_pipeline.py
@@ -37,3 +37,9 @@ def log_signed_in(backend, response, is_new=False, *args, **kwargs):
logger.info('Logged in {} ({})'.format(response['email'], backend.name), extra={'user_email': response['email']})
if is_new:
logger.info('Created user {} ({})'.format(response['email'], backend.name), extra={'user_email': response['email']})


+def log_azure_signed_in(backend, details, is_new=False, *args, **kwargs):
+    logger.info('Logged in {} ({})'.format(details['email'], backend.name), extra={'user_email': details['email']})
+    if is_new:
+        logger.info('Created user {} ({})'.format(details['email'], backend.name), extra={'user_email': details['email']})
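How this logging step gets attached to the Azure backend is not shown in this excerpt. In python-social-auth, a custom step like this is typically appended to the pipeline setting; a hedged sketch follows, where the `social_core` step names are the library's standard pipeline entries and the exact setting seqr uses is an assumption:

```python
# Sketch only: seqr's real pipeline definition lives in settings.py and is
# not part of this diff.
SOCIAL_AUTH_PIPELINE = (
    'social_core.pipeline.social_auth.social_details',
    'social_core.pipeline.social_auth.social_uid',
    'social_core.pipeline.social_auth.social_user',
    'social_core.pipeline.user.create_user',
    'social_core.pipeline.social_auth.associate_user',
    'seqr.utils.social_auth_pipeline.log_azure_signed_in',
)
```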
4 changes: 2 additions & 2 deletions seqr/views/apis/auth_api.py
@@ -12,14 +12,14 @@

from seqr.utils.logging_utils import SeqrLogger
from seqr.views.utils.json_utils import create_json_response
-from seqr.views.utils.terra_api_utils import google_auth_enabled, remove_token
+from seqr.views.utils.terra_api_utils import oauth_enabled, remove_token
from settings import LOGIN_URL, POLICY_REQUIRED_URL

logger = SeqrLogger(__name__)


def login_view(request):
-    if google_auth_enabled():
+    if oauth_enabled():
raise PermissionDenied('Username/ password authentication is disabled')

request_json = json.loads(request.body)
4 changes: 3 additions & 1 deletion seqr/views/apis/auth_api_tests.py
@@ -6,6 +6,8 @@
from seqr.views.apis.auth_api import login_view, logout_view, login_required_error, policies_required_error
from django.contrib.auth.models import User

+from seqr.views.utils.test_utils import TEST_OAUTH2_PROVIDER


class AuthAPITest(TestCase):
fixtures = ['users']
@@ -88,7 +90,7 @@ def test_login_view(self):
self.assertEqual(response.status_code, 401)
self.assertEqual(response.reason_phrase, 'Invalid credentials')

-    @mock.patch('seqr.views.utils.terra_api_utils.SOCIAL_AUTH_GOOGLE_OAUTH2_KEY', 'test_key')
+    @mock.patch('seqr.views.utils.terra_api_utils.SOCIAL_AUTH_PROVIDER', TEST_OAUTH2_PROVIDER)
def test_login_view_with_google(self):
url = reverse(login_view)
response = self.client.post(url)
20 changes: 15 additions & 5 deletions seqr/views/apis/data_manager_api.py
@@ -33,8 +33,8 @@

from seqr.models import Sample, RnaSample, Individual, Project, PhenotypePrioritization

-from settings import KIBANA_SERVER, KIBANA_ELASTICSEARCH_PASSWORD, SEQR_SLACK_LOADING_NOTIFICATION_CHANNEL, BASE_URL, \
-    LOADING_DATASETS_DIR, PIPELINE_RUNNER_SERVER
+from settings import KIBANA_SERVER, KIBANA_ELASTICSEARCH_PASSWORD, KIBANA_ELASTICSEARCH_USER, \
+    SEQR_SLACK_LOADING_NOTIFICATION_CHANNEL, BASE_URL, LOADING_DATASETS_DIR, PIPELINE_RUNNER_SERVER

logger = SeqrLogger(__name__)

@@ -448,11 +448,19 @@ def validate_callset(request):
def validate_callset(request):
request_json = json.loads(request.body)
validate_vcf_exists(
-        request_json['filePath'], request.user, allowed_exts=DATA_TYPE_FILE_EXTS.get(request_json['datasetType'])
+        _callset_path(request_json), request.user, allowed_exts=DATA_TYPE_FILE_EXTS.get(request_json['datasetType']),
+        path_name=request_json['filePath'],
)
return create_json_response({'success': True})


+def _callset_path(request_json):
+    file_path = request_json['filePath']
+    if not AirtableSession.is_airtable_enabled():
+        file_path = os.path.join(LOADING_DATASETS_DIR, file_path.lstrip('/'))
+    return file_path


@pm_or_data_manager_required
def get_loaded_projects(request, sample_type, dataset_type):
projects = get_internal_projects().filter(is_demo=False)
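The effect of the new `_callset_path` helper, illustrated with placeholder values (neither path below comes from the diff): when Airtable is disabled, the user-supplied path is re-rooted under `LOADING_DATASETS_DIR`, and the `lstrip('/')` keeps `os.path.join` from discarding the directory prefix on absolute inputs.

```python
import os

LOADING_DATASETS_DIR = '/local_datasets'  # placeholder for the settings value
file_path = '/data/my_callset.vcf.gz'     # placeholder user-supplied 'filePath'

# os.path.join would return '/data/my_callset.vcf.gz' unchanged if the second
# argument kept its leading slash; stripping it re-roots the path.
print(os.path.join(LOADING_DATASETS_DIR, file_path.lstrip('/')))
# -> /local_datasets/data/my_callset.vcf.gz
```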
@@ -515,7 +523,7 @@ def load_data(request):
individual_ids = _get_valid_project_samples(project_samples, sample_type, request.user)

loading_args = (
-        project_models, sample_type, dataset_type, request_json['genomeVersion'], request_json['filePath'],
+        project_models, sample_type, dataset_type, request_json['genomeVersion'], _callset_path(request_json),
)
if has_airtable:
success_message = f'*{request.user.email}* triggered loading internal {sample_type} {dataset_type} data for {len(projects)} projects'
@@ -529,6 +537,8 @@
*loading_args, user=request.user, pedigree_dir=LOADING_DATASETS_DIR, raise_pedigree_error=True,
)
response = requests.post(f'{PIPELINE_RUNNER_SERVER}/loading_pipeline_enqueue', json=request_json, timeout=60)
+    if response.status_code == 409:
+        raise ErrorsWarningsException(['Loading pipeline is already running. Wait for it to complete and resubmit'])
response.raise_for_status()
logger.info('Triggered loading pipeline', request.user, detail=request_json)

@@ -618,7 +628,7 @@ def proxy_to_kibana(request):
headers = convert_django_meta_to_http_headers(request)
headers['Host'] = KIBANA_SERVER
if KIBANA_ELASTICSEARCH_PASSWORD:
-        token = base64.b64encode('kibana:{}'.format(KIBANA_ELASTICSEARCH_PASSWORD).encode('utf-8'))
+        token = base64.b64encode('{}:{}'.format(KIBANA_ELASTICSEARCH_USER, KIBANA_ELASTICSEARCH_PASSWORD).encode('utf-8'))
headers['Authorization'] = 'Basic {}'.format(token.decode('utf-8'))

url = "http://{host}{path}".format(host=KIBANA_SERVER, path=request.get_full_path())
Expand Down
(Diffs for the remaining changed files were not loaded.)
