From 4f34d3c727462339659a6be993756991d42fa169 Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Thu, 10 Aug 2023 13:33:25 -0400
Subject: [PATCH 1/5] cache globals for hail backend

---
 hail_search/constants.py         |  2 +-
 hail_search/hail_search_query.py | 45 ++++++++++++++++++++------------
 hail_search/search.py            | 11 +++++---
 hail_search/web_app.py           |  7 +++--
 4 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/hail_search/constants.py b/hail_search/constants.py
index 4da596dc3b..1c24cf4421 100644
--- a/hail_search/constants.py
+++ b/hail_search/constants.py
@@ -1,4 +1,4 @@
-GENOME_VERSION_GRCh38_DISPLAY = 'GRCh38'
+GENOME_VERSION_GRCh38 = 'GRCh38'
 
 AFFECTED = 'A'
 UNAFFECTED = 'N'
diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py
index f2df77100b..542de7f157 100644
--- a/hail_search/hail_search_query.py
+++ b/hail_search/hail_search_query.py
@@ -5,7 +5,7 @@ import os
 
 from hail_search.constants import AFFECTED, UNAFFECTED, AFFECTED_ID, UNAFFECTED_ID, MALE, VARIANT_DATASET, \
-    VARIANT_KEY_FIELD, GNOMAD_GENOMES_FIELD, XPOS, GENOME_VERSION_GRCh38_DISPLAY, INHERITANCE_FILTERS, \
+    VARIANT_KEY_FIELD, GNOMAD_GENOMES_FIELD, XPOS, GENOME_VERSION_GRCh38, INHERITANCE_FILTERS, \
     ANY_AFFECTED, X_LINKED_RECESSIVE, REF_REF, REF_ALT, COMP_HET_ALT, ALT_ALT, HAS_ALT, HAS_REF, \
     ANNOTATION_OVERRIDE_FIELDS, SCREEN_KEY, SPLICE_AI_FIELD, CLINVAR_KEY, HGMD_KEY, CLINVAR_PATH_SIGNIFICANCES, \
     CLINVAR_PATH_FILTER, CLINVAR_LIKELY_PATH_FILTER, CLINVAR_PATH_RANGES, HGMD_PATH_RANGES, PATH_FREQ_OVERRIDE_CUTOFF
@@ -26,6 +26,8 @@ def _to_camel_case(snake_case_str):
 
 
 class BaseHailTableQuery(object):
+    DATA_TYPE = None
+
     GENOTYPE_QUERY_MAP = {
         REF_REF: lambda gt: gt.is_hom_ref(),
         REF_ALT: lambda gt: gt.is_het(),
@@ -42,6 +44,7 @@ class BaseHailTableQuery(object):
     POPULATION_KEYS = ['AF', 'AC', 'AN', 'Hom', 'Hemi', 'Het']
     PREDICTION_FIELDS_CONFIG = {}
 
+    GENOME_VERSIONS = [GENOME_VERSION_GRCh38]
     GLOBALS = ['enums']
     CORE_FIELDS = [XPOS]
     BASE_ANNOTATION_FIELDS = {
@@ -59,6 +62,15 @@ class BaseHailTableQuery(object):
         XPOS: lambda r: [r.xpos],
     }
 
+    @classmethod
+    def load_globals(cls):
+        globals = {}
+        for genome_version in cls.GENOME_VERSIONS:
+            ht_path = cls._get_generic_table_path(genome_version, 'annotations.ht')
+            globals_ht = hl.read_table(ht_path).head(0).select()
+            globals[genome_version] = {k: hl.eval(globals_ht[k]) for k in cls.GLOBALS}
+        return globals
+
     @classmethod
     def _format_population_config(cls, pop_config):
         base_pop_config = {field.lower(): field for field in cls.POPULATION_KEYS}
@@ -89,7 +101,7 @@ def annotation_fields(self):
             for k, enum_config in self.ENUM_ANNOTATION_FIELDS.items()
         })
 
-        if self._genome_version == GENOME_VERSION_GRCh38_DISPLAY:
+        if self._genome_version == GENOME_VERSION_GRCh38:
             annotation_fields.update(self.LIFTOVER_ANNOTATION_FIELDS)
 
         return annotation_fields
@@ -139,17 +151,19 @@ def _enum_field(value, enum, ht_globals=None, annotate_value=None, format_value=
 
         return value
 
-    def __init__(self, data_type, sample_data, genome_version, sort=XPOS, num_results=100, **kwargs):
+    def __init__(self, sample_data, genome_version, globals, sort=XPOS, num_results=100, **kwargs):
         self._genome_version = genome_version
         self._sort = sort
         self._num_results = num_results
-        self._data_type = data_type
         self._ht = None
-        self._enums = None
-        self._globals = None
+        self._globals = globals
 
         self._load_filtered_table(sample_data, **kwargs)
 
+    @property
+    def _enums(self):
+        return self._globals['enums']
+
     def _load_filtered_table(self, sample_data, intervals=None, exclude_intervals=False, variant_ids=None, **kwargs):
         parsed_intervals, variant_ids = self._parse_intervals(intervals, variant_ids)
         excluded_intervals = None
@@ -160,8 +174,12 @@ def _load_filtered_table(self, sample_data, intervals=None, exclude_intervals=Fa
         self.import_filtered_table(
             sample_data, excluded_intervals=excluded_intervals, variant_ids=variant_ids, **kwargs)
 
+    @classmethod
+    def _get_generic_table_path(cls, genome_version, path):
+        return f'{DATASETS_DIR}/{genome_version}/{cls.DATA_TYPE}/{path}'
+
     def _get_table_path(self, path):
-        return f'{DATASETS_DIR}/{self._genome_version}/{self._data_type}/{path}'
+        return self._get_generic_table_path(self._genome_version, path)
 
     def _read_table(self, path):
         return hl.read_table(self._get_table_path(path), **self._load_table_kwargs)
@@ -173,7 +191,7 @@ def import_filtered_table(self, sample_data, **kwargs):
             family_samples[s['family_guid']].append(s)
             project_samples[s['project_guid']].append(s)
 
-        logger.info(f'Loading {self._data_type} data for {len(family_samples)} families in {len(project_samples)} projects')
+        logger.info(f'Loading {self.DATA_TYPE} data for {len(family_samples)} families in {len(project_samples)} projects')
         if len(family_samples) == 1:
             family_guid, family_sample_data = list(family_samples.items())[0]
             family_ht = self._read_table(f'families/{family_guid}.ht')
@@ -209,11 +227,6 @@ def import_filtered_table(self, sample_data, **kwargs):
                 annotations_ht_path, families_ht.key).first().drop(*families_ht.key)
         ht = families_ht.annotate(**annotation_ht_query_result)
 
-        # Get globals
-        annotation_globals_ht = hl.read_table(annotations_ht_path).head(0).select()
-        self._globals = {k: hl.eval(annotation_globals_ht[k]) for k in self.GLOBALS}
-        self._enums = self._globals.pop('enums')
-
         self._ht = ht.transmute(
             genotypes=ht.family_entries.flatmap(lambda x: x).filter(
                 lambda gt: hl.is_defined(gt.individualGuid)
@@ -576,6 +589,8 @@ def _get_sort_expressions(self, ht, sort):
 
 
 class VariantHailTableQuery(BaseHailTableQuery):
+    DATA_TYPE = VARIANT_DATASET
+
     GENOTYPE_FIELDS = {f.lower(): f for f in ['DP', 'GQ', 'AB']}
     QUALITY_FILTER_FORMAT = {
         'AB': QualityFilterFormat(override=lambda gt: ~gt.GT.is_het(), scale=100),
@@ -779,6 +794,4 @@ def _format_results(self, ht):
         return super()._format_results(ht)
 
 
-QUERY_CLASS_MAP = {
-    VARIANT_DATASET: VariantHailTableQuery,
-}
+QUERY_CLASS_MAP = {cls.DATA_TYPE: cls for cls in [VariantHailTableQuery]}
diff --git a/hail_search/search.py b/hail_search/search.py
index 716aae5e7b..fb45b765e9 100644
--- a/hail_search/search.py
+++ b/hail_search/search.py
@@ -1,15 +1,20 @@
 from hail_search.hail_search_query import QUERY_CLASS_MAP
 
 
-def search_hail_backend(request):
+def search_hail_backend(request, all_globals):
     sample_data = request.pop('sample_data', {})
+    genome_version = request.pop('genome_version')
 
     data_types = list(sample_data.keys())
     single_data_type = data_types[0] if len(data_types) == 1 else None
 
     sample_data = sample_data[single_data_type]
-    data_type = single_data_type
     query_cls = QUERY_CLASS_MAP[single_data_type]
+    globals = all_globals[single_data_type][genome_version]
 
-    query = query_cls(data_type, sample_data=sample_data, **request)
+    query = query_cls(sample_data, genome_version, globals, **request)
     return query.search()
+
+
+def load_globals():
+    return {k: cls.load_globals() for k, cls in QUERY_CLASS_MAP.items()}
diff --git a/hail_search/web_app.py b/hail_search/web_app.py
index cf538cf751..02d2ae5293 100644
--- a/hail_search/web_app.py
+++ b/hail_search/web_app.py
@@ -2,7 +2,9 @@
 import json
 import hail as hl
 
-from hail_search.search import search_hail_backend
+from hail_search.search import search_hail_backend, load_globals
+
+APP_GLOBALS = 'APP_GLOBALS'
 
 
 def _hl_json_default(o):
@@ -15,7 +17,7 @@ def hl_json_dumps(obj):
 
 
 async def search(request: web.Request) -> web.Response:
-    hail_results, total_results = search_hail_backend(await request.json())
+    hail_results, total_results = search_hail_backend(await request.json(), request.app[APP_GLOBALS])
     return web.json_response({'results': hail_results, 'total': total_results}, dumps=hl_json_dumps)
 
 
@@ -29,4 +31,5 @@ def init_web_app():
         web.get('/status', status),
         web.post('/search', search),
     ])
+    app[APP_GLOBALS] = load_globals()
    return app
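
Patch 1 wires the cache through standard aiohttp application state: load the expensive table globals once in `init_web_app`, stash them on the `Application`, and read them back from `request.app` in handlers. A minimal standalone sketch of that pattern follows; `CACHE_KEY`, `handle`, `init_app`, and the stand-in payload are illustrative names, not from seqr.

```python
# Minimal aiohttp sketch of the app-state caching pattern; only the
# Application-as-dict mechanics mirror the real change above.
from aiohttp import web

CACHE_KEY = 'CACHED_GLOBALS'  # hypothetical key, analogous to APP_GLOBALS


async def handle(request: web.Request) -> web.Response:
    # Read the cached payload instead of re-loading it on every request
    cached = request.app[CACHE_KEY]
    return web.json_response({'genome_versions': sorted(cached)})


def init_app() -> web.Application:
    app = web.Application()
    app.add_routes([web.get('/', handle)])
    # Populate the cache once at startup (stand-in for load_globals())
    app[CACHE_KEY] = {'GRCh38': {'enums': {}}}
    return app


if __name__ == '__main__':
    web.run_app(init_app())
```
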
From fe930454d191904f4cef8373b533dc4043f5c3e8 Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Thu, 10 Aug 2023 13:44:14 -0400
Subject: [PATCH 2/5] codacy tweaks

---
 hail_search/hail_search_query.py | 10 +++++-----
 hail_search/search.py            |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py
index 542de7f157..e12eaf025f 100644
--- a/hail_search/hail_search_query.py
+++ b/hail_search/hail_search_query.py
@@ -64,12 +64,12 @@ class BaseHailTableQuery(object):
 
     @classmethod
     def load_globals(cls):
-        globals = {}
+        loaded_globals = {}
         for genome_version in cls.GENOME_VERSIONS:
             ht_path = cls._get_generic_table_path(genome_version, 'annotations.ht')
             globals_ht = hl.read_table(ht_path).head(0).select()
-            globals[genome_version] = {k: hl.eval(globals_ht[k]) for k in cls.GLOBALS}
-        return globals
+            loaded_globals[genome_version] = {k: hl.eval(globals_ht[k]) for k in cls.GLOBALS}
+        return loaded_globals
 
     @classmethod
     def _format_population_config(cls, pop_config):
@@ -151,12 +151,12 @@ def _enum_field(value, enum, ht_globals=None, annotate_value=None, format_value=
 
         return value
 
-    def __init__(self, sample_data, genome_version, globals, sort=XPOS, num_results=100, **kwargs):
+    def __init__(self, sample_data, genome_version, data_type_globals, sort=XPOS, num_results=100, **kwargs):
         self._genome_version = genome_version
         self._sort = sort
         self._num_results = num_results
         self._ht = None
-        self._globals = globals
+        self._globals = data_type_globals
 
         self._load_filtered_table(sample_data, **kwargs)
 
diff --git a/hail_search/search.py b/hail_search/search.py
index fb45b765e9..428b226378 100644
--- a/hail_search/search.py
+++ b/hail_search/search.py
@@ -10,9 +10,9 @@ def search_hail_backend(request, all_globals):
 
     sample_data = sample_data[single_data_type]
     query_cls = QUERY_CLASS_MAP[single_data_type]
-    globals = all_globals[single_data_type][genome_version]
+    data_type_globals = all_globals[single_data_type][genome_version]
 
-    query = query_cls(sample_data, genome_version, globals, **request)
+    query = query_cls(sample_data, genome_version, data_type_globals, **request)
     return query.search()
 
 

From cea4096d1e167c02f221de381c4bace09707f03a Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Thu, 10 Aug 2023 15:17:49 -0400
Subject: [PATCH 3/5] pr feedback

---
 hail_search/hail_search_query.py | 15 +++++++++------
 hail_search/search.py            |  8 ++++----
 hail_search/web_app.py           |  6 ++----
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py
index e12eaf025f..4cd234a52d 100644
--- a/hail_search/hail_search_query.py
+++ b/hail_search/hail_search_query.py
@@ -27,6 +27,7 @@ def _to_camel_case(snake_case_str):
 
 class BaseHailTableQuery(object):
     DATA_TYPE = None
+    LOADED_GLOBALS = None
 
     GENOTYPE_QUERY_MAP = {
         REF_REF: lambda gt: gt.is_hom_ref(),
@@ -64,12 +65,11 @@ class BaseHailTableQuery(object):
 
     @classmethod
     def load_globals(cls):
-        loaded_globals = {}
+        cls.LOADED_GLOBALS = {}
         for genome_version in cls.GENOME_VERSIONS:
             ht_path = cls._get_generic_table_path(genome_version, 'annotations.ht')
-            globals_ht = hl.read_table(ht_path).head(0).select()
-            loaded_globals[genome_version] = {k: hl.eval(globals_ht[k]) for k in cls.GLOBALS}
-        return loaded_globals
+            ht_globals = hl.eval(hl.read_table(ht_path).globals.select(*cls.GLOBALS))
+            cls.LOADED_GLOBALS[genome_version] = {k: ht_globals[k] for k in cls.GLOBALS}
 
     @classmethod
     def _format_population_config(cls, pop_config):
@@ -151,15 +151,18 @@ def _enum_field(value, enum, ht_globals=None, annotate_value=None, format_value=
 
         return value
 
-    def __init__(self, sample_data, genome_version, data_type_globals, sort=XPOS, num_results=100, **kwargs):
+    def __init__(self, sample_data, genome_version, sort=XPOS, num_results=100, **kwargs):
         self._genome_version = genome_version
         self._sort = sort
         self._num_results = num_results
         self._ht = None
-        self._globals = data_type_globals
 
         self._load_filtered_table(sample_data, **kwargs)
 
+    @property
+    def _globals(self):
+        return self.LOADED_GLOBALS[self._genome_version]
+
     @property
     def _enums(self):
         return self._globals['enums']
diff --git a/hail_search/search.py b/hail_search/search.py
index 428b226378..eec334a80f 100644
--- a/hail_search/search.py
+++ b/hail_search/search.py
@@ -1,7 +1,7 @@
 from hail_search.hail_search_query import QUERY_CLASS_MAP
 
 
-def search_hail_backend(request, all_globals):
+def search_hail_backend(request):
     sample_data = request.pop('sample_data', {})
     genome_version = request.pop('genome_version')
 
@@ -10,11 +10,11 @@ def search_hail_backend(request, all_globals):
 
     sample_data = sample_data[single_data_type]
     query_cls = QUERY_CLASS_MAP[single_data_type]
-    data_type_globals = all_globals[single_data_type][genome_version]
 
-    query = query_cls(sample_data, genome_version, data_type_globals, **request)
+    query = query_cls(sample_data, genome_version, **request)
     return query.search()
 
 
 def load_globals():
-    return {k: cls.load_globals() for k, cls in QUERY_CLASS_MAP.items()}
+    for cls in QUERY_CLASS_MAP.values():
+        cls.load_globals()
diff --git a/hail_search/web_app.py b/hail_search/web_app.py
index 02d2ae5293..0dc5775a6b 100644
--- a/hail_search/web_app.py
+++ b/hail_search/web_app.py
@@ -4,8 +4,6 @@
 
 from hail_search.search import search_hail_backend, load_globals
 
-APP_GLOBALS = 'APP_GLOBALS'
-
 
 def _hl_json_default(o):
     if isinstance(o, hl.Struct) or isinstance(o, hl.utils.frozendict):
@@ -17,7 +15,7 @@ def hl_json_dumps(obj):
 
 
 async def search(request: web.Request) -> web.Response:
-    hail_results, total_results = search_hail_backend(await request.json(), request.app[APP_GLOBALS])
+    hail_results, total_results = search_hail_backend(await request.json())
     return web.json_response({'results': hail_results, 'total': total_results}, dumps=hl_json_dumps)
 
 
@@ -31,5 +29,5 @@ def init_web_app():
         web.get('/status', status),
         web.post('/search', search),
     ])
-    app[APP_GLOBALS] = load_globals()
+    load_globals()
     return app
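
Patch 3 drops the instance-level plumbing entirely: `load_globals` writes into a class attribute once at startup, and each query instance resolves its globals lazily through a property. A runnable toy version of that shape, with the Hail read stubbed out so the pattern stands alone (`_read_annotation_globals`, `BaseQuery`, and `VariantQuery` are illustrative names):

```python
# Toy version of the class-level cache adopted in patch 3; the stub below
# stands in for the hl.eval(hl.read_table(...).globals.select(...)) call.
GENOME_VERSION_GRCh38 = 'GRCh38'


def _read_annotation_globals(genome_version, data_type):
    # Pretend this is the expensive Hail table read
    return {'enums': {'data_type': data_type, 'genome_version': genome_version}}


class BaseQuery:
    DATA_TYPE = None
    GENOME_VERSIONS = [GENOME_VERSION_GRCh38]
    LOADED_GLOBALS = None

    @classmethod
    def load_globals(cls):
        # One read per genome version at startup, cached on the class
        cls.LOADED_GLOBALS = {
            gv: _read_annotation_globals(gv, cls.DATA_TYPE) for gv in cls.GENOME_VERSIONS
        }

    def __init__(self, genome_version):
        self._genome_version = genome_version

    @property
    def _globals(self):
        return self.LOADED_GLOBALS[self._genome_version]

    @property
    def _enums(self):
        return self._globals['enums']


class VariantQuery(BaseQuery):
    DATA_TYPE = 'VARIANTS'


VariantQuery.load_globals()  # startup, e.g. from init_web_app
print(VariantQuery(GENOME_VERSION_GRCh38)._enums)  # per-request, no table read
```

One subtlety this mirrors: because `load_globals` is invoked on each subclass in `QUERY_CLASS_MAP`, the assignment to `cls.LOADED_GLOBALS` lands on the subclass, so each data type keeps its own cache rather than sharing the base class attribute.
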
From 6aaa66bd4097882cc42d2c23e02dce9d52b1343a Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Tue, 15 Aug 2023 13:06:05 -0400
Subject: [PATCH 4/5] handle dup individual ids for sample metadata

---
 seqr/fixtures/1kg_project.json      |  2 +-
 seqr/views/apis/report_api.py       | 14 ++++++++------
 seqr/views/apis/report_api_tests.py | 18 ++++++++++--------
 3 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/seqr/fixtures/1kg_project.json b/seqr/fixtures/1kg_project.json
index 1c522b8bf8..7137b39277 100644
--- a/seqr/fixtures/1kg_project.json
+++ b/seqr/fixtures/1kg_project.json
@@ -695,7 +695,7 @@
         "created_by": null,
         "last_modified_date": "2017-03-13T09:07:50.158Z",
         "family": 8,
-        "individual_id": "NA20877",
+        "individual_id": "NA20888",
         "mother_id": null,
         "father_id": null,
         "sex": "M",
diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py
index d5ae198e92..fa105833f3 100644
--- a/seqr/views/apis/report_api.py
+++ b/seqr/views/apis/report_api.py
@@ -205,9 +205,9 @@ def sample_metadata_export(request, project_guid):
     )
     family_rows_by_id = {row['family_id']: row for row in family_rows}
 
-    rows_by_subject_id = {row['subject_id']: row for row in subject_rows}
+    rows_by_subject_family_id = {(row['subject_id'], row['family_guid']): row for row in subject_rows}
     for row in sample_rows:
-        rows_by_subject_id[row['subject_id']].update(row)
+        rows_by_subject_family_id[(row['subject_id'], row['family_guid'])].update(row)
 
     for rows in discovery_rows:
         for i, row in enumerate(rows):
@@ -216,9 +216,9 @@ def sample_metadata_export(request, project_guid):
             parsed_row.update({
                 '{}-{}'.format(k, i + 1): row[k] for k in DISCOVERY_TABLE_METADATA_VARIANT_COLUMNS if row.get(k)
             })
-            rows_by_subject_id[row['subject_id']].update(parsed_row)
+            rows_by_subject_family_id[(row['subject_id'], row['family_guid'])].update(parsed_row)
 
-    rows = list(rows_by_subject_id.values())
+    rows = list(rows_by_subject_family_id.values())
     all_features = set()
     for row in rows:
         row.update(family_rows_by_id[row['family_id']])
@@ -349,9 +349,10 @@ def _parse_anvil_metadata(individual_samples, user, include_collaborator=False,
             subject_rows.append(subject_row)
 
             sample_row = _get_sample_row(sample, has_dbgap_submission, airtable_metadata)
+            sample_row['family_guid'] = family_subject_row['family_guid']
             sample_rows.append(sample_row)
 
-            discovery_row = _get_discovery_rows(sample, parsed_variants, male_individual_guids)
+            discovery_row = _get_discovery_rows(sample, parsed_variants, male_individual_guids, family_subject_row['family_guid'])
             discovery_rows.append(discovery_row)
 
     return subject_rows, sample_rows, family_rows, discovery_rows
@@ -561,12 +562,13 @@ def _get_sample_row(sample, has_dbgap_submission, airtable_metadata):
         sample_row['dbgap_sample_id'] = airtable_metadata.get('dbgap_sample_id', '')
     return sample_row
 
-def _get_discovery_rows(sample, parsed_variants, male_individual_guids):
+def _get_discovery_rows(sample, parsed_variants, male_individual_guids, family_guid):
     individual = sample.individual
     discovery_row = {
         'entity:discovery_id': individual.individual_id,
         'subject_id': individual.individual_id,
         'sample_id': sample.sample_id,
+        'family_guid': family_guid,
     }
     discovery_rows = []
     for genotypes, parsed_variant in parsed_variants:
diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py
index 11fb5818e0..a99b5ce0c8 100644
--- a/seqr/views/apis/report_api_tests.py
+++ b/seqr/views/apis/report_api_tests.py
@@ -613,6 +613,7 @@ def test_sample_metadata_export(self, mock_google_authenticated):
         self.assertEqual(response.status_code, 200)
         response_json = response.json()
         self.assertListEqual(list(response_json.keys()), ['rows'])
+        self.assertEqual(len(response_json['rows']), 16 + len(self.ADDITIONAL_SAMPLES))
         expected_samples.update({
             'NA19679', 'NA20870', 'HG00732', 'NA20876', 'NA20874', 'NA20875',
             'NA19678', 'NA19675', 'HG00731', 'NA20872', 'NA20881', 'HG00733',
@@ -621,6 +622,7 @@ def test_sample_metadata_export(self, mock_google_authenticated):
         self.assertSetEqual({r['sample_id'] for r in response_json['rows']}, expected_samples)
         test_row = next(r for r in response_json['rows'] if r['sample_id'] == 'NA20889')
         self.assertDictEqual(EXPECTED_NO_AIRTABLE_SAMPLE_METADATA_ROW, test_row)
+        self.assertEqual(len([r['subject_id'] for r in response_json['rows'] if r['subject_id'] == 'NA20888']), 2)
 
         self.check_no_analyst_no_access(url)
 
@@ -687,13 +689,13 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
             'The following tables are required in the data model but absent from the reports: subject',
             'The following columns are included in the "participant" table but are missing from the data model: age_at_last_observation, ancestry_detail, pmid_id, proband_relationship_detail, sex_detail, twin_id',
             'The following columns are included in the "participant" data model but are missing in the report: ancestry_metadata',
-            'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881',
-            'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881',
-            'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881',
-            'The following entries are missing recommended "age_at_enrollment" in the "participant" table: Broad_HG00731, Broad_NA20870, Broad_NA20872, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881',
+            'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
+            'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
+            'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
+            'The following entries are missing recommended "age_at_enrollment" in the "participant" table: Broad_HG00731, Broad_NA20870, Broad_NA20872, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
         ] + skipped_file_validation_warnings[1:6] + skipped_file_validation_warnings[7:])
         self.assertListEqual(response.json()['errors'], [
-            'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20877, Broad_NA20881',
+            'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
             'The following entries have invalid values for "reported_race" in the "participant" table. Allowed values: Asian, White, Black. Invalid values: Broad_NA19675_1 (Middle Eastern or North African)',
             'The following entries are missing required "aligned_dna_short_read_set_id" (from Airtable) in the "aligned_dna_short_read_set" table: NA19675_1',
         ])
@@ -813,15 +815,15 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
         sample_filter = "OR({CollaboratorSampleID}='HG00731',{CollaboratorSampleID}='HG00732',{CollaboratorSampleID}='HG00733'," \
                         "{CollaboratorSampleID}='NA19675_1',{CollaboratorSampleID}='NA19678',{CollaboratorSampleID}='NA19679'," \
                         "{CollaboratorSampleID}='NA20870',{CollaboratorSampleID}='NA20872',{CollaboratorSampleID}='NA20874'," \
-                        "{CollaboratorSampleID}='NA20875',{CollaboratorSampleID}='NA20876',{CollaboratorSampleID}='NA20877'," \
-                        "{CollaboratorSampleID}='NA20881')"
+                        "{CollaboratorSampleID}='NA20875',{CollaboratorSampleID}='NA20876',{CollaboratorSampleID}='NA20881'," \
+                        "{CollaboratorSampleID}='NA20888')"
         sample_fields = ['CollaboratorSampleID', 'SMID', 'CollaboratorSampleID', 'Recontactable']
         self._assert_expected_airtable_call(0, sample_filter, sample_fields)
         secondary_sample_filter = "OR({SeqrCollaboratorSampleID}='HG00731',{SeqrCollaboratorSampleID}='HG00732'," \
             "{SeqrCollaboratorSampleID}='HG00733',{SeqrCollaboratorSampleID}='NA19678'," \
             "{SeqrCollaboratorSampleID}='NA19679',{SeqrCollaboratorSampleID}='NA20870',{SeqrCollaboratorSampleID}='NA20872'," \
             "{SeqrCollaboratorSampleID}='NA20874',{SeqrCollaboratorSampleID}='NA20875',{SeqrCollaboratorSampleID}='NA20876'," \
-            "{SeqrCollaboratorSampleID}='NA20877',{SeqrCollaboratorSampleID}='NA20881')"
+            "{SeqrCollaboratorSampleID}='NA20881',{SeqrCollaboratorSampleID}='NA20888')"
         sample_fields[0] = 'SeqrCollaboratorSampleID'
         self._assert_expected_airtable_call(1, secondary_sample_filter, sample_fields)
         metadata_fields = [
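
The core of patch 4 in isolation: once two families can contain the same individual ID, a dict keyed on `subject_id` alone silently merges their rows, while the `(subject_id, family_guid)` composite key keeps both. A self-contained illustration (the family GUIDs here are made up):

```python
# Hypothetical rows: the same individual ID appears in two families,
# like the NA20888 fixture duplicated in this patch.
subject_rows = [
    {'subject_id': 'NA20888', 'family_guid': 'F000008', 'sex': 'M'},
    {'subject_id': 'NA20888', 'family_guid': 'F000012', 'sex': 'F'},
]

# Old keying: the second row silently clobbers the first
rows_by_subject_id = {row['subject_id']: row for row in subject_rows}
assert len(rows_by_subject_id) == 1

# New keying: both rows survive
rows_by_subject_family_id = {(row['subject_id'], row['family_guid']): row for row in subject_rows}
assert len(rows_by_subject_family_id) == 2
```
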
From 13c3dc366009515dfc1136d8f35b9f1144513666 Mon Sep 17 00:00:00 2001
From: snyk-bot
Date: Wed, 16 Aug 2023 14:06:01 +0000
Subject: [PATCH 5/5] fix: requirements.txt to reduce vulnerabilities

The following vulnerabilities are fixed by pinning transitive dependencies:
- https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-5813746

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 111982db5f..da497692d4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,7 +18,7 @@ cffi==1.15.1
     # via cryptography
 charset-normalizer==3.0.1
     # via requests
-cryptography==41.0.2
+cryptography==41.0.3
     # via social-auth-core
 defusedxml==0.7.1
     # via
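
A quick way to confirm an environment actually picked up the pinned fix; this is an illustrative check, not part of the patch, and assumes the `packaging` helper is installed (version strings need a real comparator, since plain string comparison mis-orders releases like 41.0.10 vs 41.0.3):

```python
# Sanity check for the cryptography pin from patch 5.
from importlib.metadata import version

from packaging.version import Version

assert Version(version('cryptography')) >= Version('41.0.3')
```
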