From e93cb3d01b51b628e3750ad25724a1e2ac40d3e9 Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Fri, 11 Aug 2023 14:50:02 -0400
Subject: [PATCH 01/16] add gene counts endpoint

---
 hail_search/hail_search_query.py | 17 +++++++++++++++++
 hail_search/search.py            |  7 +++++--
 hail_search/web_app.py           |  4 ++++
 3 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py
index 3c4fc4a51d..3466f5aa58 100644
--- a/hail_search/hail_search_query.py
+++ b/hail_search/hail_search_query.py
@@ -685,6 +685,23 @@ def _sort_order(self, ht):
     def _get_sort_expressions(self, ht, sort):
         return self.SORTS[sort](ht)
 
+    def gene_counts(self):
+        if self._comp_het_ht:
+            ht = self._comp_het_ht.explode(self._comp_het_ht[GROUPED_VARIANTS_FIELD])
+            ht = ht.transmute(**ht[GROUPED_VARIANTS_FIELD])
+            if self._ht:
+                ht = ht.join(self._ht, 'outer')
+        else:
+            ht = self._ht
+
+        ht = ht.select(
+            gene_ids=hl.set(ht.sortedTranscriptConsequences.map(lambda t: t.gene_id)),
+            families=self.BASE_ANNOTATION_FIELDS['familyGuids'](ht),
+        ).explode('gene_ids').explode('families')
+        return ht.aggregate(hl.agg.group_by(
+            ht.gene_ids, hl.struct(total=hl.agg.count(), families=hl.agg.counter(ht.families))
+        ))
+
 
 class VariantHailTableQuery(BaseHailTableQuery):
 
diff --git a/hail_search/search.py b/hail_search/search.py
index 716aae5e7b..07db813797 100644
--- a/hail_search/search.py
+++ b/hail_search/search.py
@@ -1,7 +1,7 @@
 from hail_search.hail_search_query import QUERY_CLASS_MAP
 
 
-def search_hail_backend(request):
+def search_hail_backend(request, gene_counts=False):
     sample_data = request.pop('sample_data', {})
 
     data_types = list(sample_data.keys())
@@ -12,4 +12,7 @@ def search_hail_backend(request):
     query_cls = QUERY_CLASS_MAP[single_data_type]
 
     query = query_cls(data_type, sample_data=sample_data, **request)
-    return query.search()
+    if gene_counts:
+        return query.gene_counts()
+    else:
+        return query.search()
diff --git a/hail_search/web_app.py b/hail_search/web_app.py
index cf538cf751..d56bc331ec 100644
--- a/hail_search/web_app.py
+++ b/hail_search/web_app.py
@@ -14,6 +14,10 @@ def hl_json_dumps(obj):
     return json.dumps(obj, default=_hl_json_default)
 
 
+async def gene_counts(request: web.Request) -> web.Response:
+    return web.json_response(search_hail_backend(await request.json(), gene_counts=True))
+
+
 async def search(request: web.Request) -> web.Response:
     hail_results, total_results = search_hail_backend(await request.json())
     return web.json_response({'results': hail_results, 'total': total_results}, dumps=hl_json_dumps)

From 79d4589379795005f88c4d3fd647b3b965aa9d13 Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Fri, 11 Aug 2023 14:51:49 -0400
Subject: [PATCH 02/16] use gene_ids helper

---
 hail_search/hail_search_query.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py
index 43db291bc5..85781b77b5 100644
--- a/hail_search/hail_search_query.py
+++ b/hail_search/hail_search_query.py
@@ -728,7 +728,7 @@ def gene_counts(self):
             ht = self._ht
 
         ht = ht.select(
-            gene_ids=hl.set(ht.sortedTranscriptConsequences.map(lambda t: t.gene_id)),
+            gene_ids=self._gene_ids_expr(ht),
             families=self.BASE_ANNOTATION_FIELDS['familyGuids'](ht),
         ).explode('gene_ids').explode('families')
         return ht.aggregate(hl.agg.group_by(

From 6b6d844f7ed038f252f1df659965c2017398a4a7 Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Fri, 11 Aug 2023 15:40:28 -0400
Subject: [PATCH 03/16] test gene counts

---
 hail_search/hail_search_query.py             | 24 ++++++++------
 hail_search/test_search.py                   | 34 ++++++++++++++------
 hail_search/test_utils.py                    |  5 +++
 hail_search/web_app.py                       |  3 +-
 seqr/utils/search/hail_search_utils_tests.py |  8 ++---
 seqr/utils/search/search_utils_tests.py      | 13 +++-----
 6 files changed, 55 insertions(+), 32 deletions(-)

diff --git a/hail_search/hail_search_query.py b/hail_search/hail_search_query.py
index 85781b77b5..9a2b7214ef 100644
--- a/hail_search/hail_search_query.py
+++ b/hail_search/hail_search_query.py
@@ -719,18 +719,24 @@ def _gene_rank_sort(cls, r, gene_ranks):
         return []
 
     def gene_counts(self):
+        selects = {
+            'gene_ids': self._gene_ids_expr,
+            'families': self.BASE_ANNOTATION_FIELDS['familyGuids'],
+        }
+        ch_ht = None
         if self._comp_het_ht:
-            ht = self._comp_het_ht.explode(self._comp_het_ht[GROUPED_VARIANTS_FIELD])
-            ht = ht.transmute(**ht[GROUPED_VARIANTS_FIELD])
-            if self._ht:
-                ht = ht.join(self._ht, 'outer')
+            ch_ht = self._comp_het_ht.explode(self._comp_het_ht[GROUPED_VARIANTS_FIELD])
+            ch_ht = ch_ht.select(**{k: v(ch_ht[GROUPED_VARIANTS_FIELD]) for k, v in selects.items()})
+
+        if self._ht:
+            ht = self._ht.select(**{k: v(self._ht) for k, v in selects.items()})
+            if ch_ht:
+                ht = ht.join(ch_ht, 'outer')
+                ht = ht.transmute(**{k: hl.or_else(ht[k], ht[f'{k}_1']) for k in selects})
         else:
-            ht = self._ht
+            ht = ch_ht
 
-        ht = ht.select(
-            gene_ids=self._gene_ids_expr(ht),
-            families=self.BASE_ANNOTATION_FIELDS['familyGuids'](ht),
-        ).explode('gene_ids').explode('families')
+        ht = ht.explode('gene_ids').explode('families')
         return ht.aggregate(hl.agg.group_by(
             ht.gene_ids, hl.struct(total=hl.agg.count(), families=hl.agg.counter(ht.families))
         ))
diff --git a/hail_search/test_search.py b/hail_search/test_search.py
index b260b8d6c7..39d344909e 100644
--- a/hail_search/test_search.py
+++ b/hail_search/test_search.py
@@ -3,7 +3,7 @@
 
 from hail_search.test_utils import get_hail_search_body, FAMILY_2_VARIANT_SAMPLE_DATA, FAMILY_2_MISSING_SAMPLE_DATA, \
     VARIANT1, VARIANT2, VARIANT3, VARIANT4, MULTI_PROJECT_SAMPLE_DATA, MULTI_PROJECT_MISSING_SAMPLE_DATA, \
-    LOCATION_SEARCH, EXCLUDE_LOCATION_SEARCH, VARIANT_ID_SEARCH, RSID_SEARCH
+    LOCATION_SEARCH, EXCLUDE_LOCATION_SEARCH, VARIANT_ID_SEARCH, RSID_SEARCH, GENE_COUNTS
 from hail_search.web_app import init_web_app
 
 PROJECT_2_VARIANT = {
@@ -113,7 +113,7 @@ async def test_status(self):
             resp_json = await resp.json()
         self.assertDictEqual(resp_json, {'success': True})
 
-    async def _assert_expected_search(self, results, **search_kwargs):
+    async def _assert_expected_search(self, results, gene_counts=None, **search_kwargs):
         search_body = get_hail_search_body(**search_kwargs)
         async with self.client.request('POST', '/search', json=search_body) as resp:
             self.assertEqual(resp.status, 200)
@@ -123,20 +123,32 @@ async def _assert_expected_search(self, results, **search_kwargs):
         for i, result in enumerate(resp_json['results']):
             self.assertEqual(result, results[i])
 
+        if gene_counts:
+            async with self.client.request('POST', '/gene_counts', json=search_body) as resp:
+                self.assertEqual(resp.status, 200)
+                gene_counts_json = await resp.json()
+            self.assertDictEqual(gene_counts_json, gene_counts)
+
     async def test_single_family_search(self):
         await self._assert_expected_search(
-            [VARIANT1, VARIANT2, VARIANT3, VARIANT4], sample_data=FAMILY_2_VARIANT_SAMPLE_DATA,
+            [VARIANT1, VARIANT2, VARIANT3, VARIANT4], sample_data=FAMILY_2_VARIANT_SAMPLE_DATA, gene_counts={
+                'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}},
+                'ENSG00000177000': {'total': 2, 'families': {'F000002_2': 2}},
+            }
         )
 
     async def test_single_project_search(self):
         await self._assert_expected_search(
-            [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], omit_sample_type='SV_WES',
+            [VARIANT1, VARIANT2, MULTI_FAMILY_VARIANT, VARIANT4], omit_sample_type='SV_WES', gene_counts={
+                'ENSG00000097046': {'total': 3, 'families': {'F000002_2': 2, 'F000003_3': 1}},
+                'ENSG00000177000': {'total': 3, 'families': {'F000002_2': 2, 'F000003_3': 1}},
+            }
         )
 
     async def test_multi_project_search(self):
         await self._assert_expected_search(
             [PROJECT_2_VARIANT, MULTI_PROJECT_VARIANT1, MULTI_PROJECT_VARIANT2, VARIANT3, VARIANT4],
-            sample_data=MULTI_PROJECT_SAMPLE_DATA,
+            gene_counts=GENE_COUNTS, sample_data=MULTI_PROJECT_SAMPLE_DATA,
         )
 
     async def test_inheritance_filter(self):
@@ -163,13 +175,17 @@ async def test_inheritance_filter(self):
             [VARIANT2, VARIANT3], inheritance_filter=gt_inheritance_filter, sample_data=FAMILY_2_VARIANT_SAMPLE_DATA)
 
         await self._assert_expected_search(
-            [[VARIANT3, VARIANT4]], inheritance_mode='compound_het', sample_data=MULTI_PROJECT_SAMPLE_DATA,
-            **COMP_HET_ALL_PASS_FILTERS,
+            [[VARIANT3, VARIANT4]], inheritance_mode='compound_het', sample_data=MULTI_PROJECT_SAMPLE_DATA, gene_counts={
+                'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}},
+                'ENSG00000177000': {'total': 1, 'families': {'F000002_2': 1}},
+            }, **COMP_HET_ALL_PASS_FILTERS,
         )
 
         await self._assert_expected_search(
-            [PROJECT_2_VARIANT1, VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode='recessive',
-            sample_data=MULTI_PROJECT_SAMPLE_DATA, **COMP_HET_ALL_PASS_FILTERS,
+            [PROJECT_2_VARIANT1, VARIANT2, [VARIANT3, VARIANT4]], inheritance_mode='recessive', gene_counts={
+                'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}},
+                'ENSG00000177000': {'total': 2, 'families': {'F000002_2': 2}},
+            }, sample_data=MULTI_PROJECT_SAMPLE_DATA, **COMP_HET_ALL_PASS_FILTERS,
         )
 
     async def test_quality_filter(self):
diff --git a/hail_search/test_utils.py b/hail_search/test_utils.py
index 27ef225dc8..6ac29fe9ab 100644
--- a/hail_search/test_utils.py
+++ b/hail_search/test_utils.py
@@ -362,6 +362,11 @@
 VARIANT_ID_SEARCH = {'variant_ids': [['1', 10439, 'AC', 'A'], ['1', 91511686, 'TCA', 'G']], 'rs_ids': []}
 RSID_SEARCH = {'variant_ids': [], 'rs_ids': ['rs1801131']}
 
+GENE_COUNTS = {
+    'ENSG00000097046': {'total': 2, 'families': {'F000002_2': 2}},
+    'ENSG00000177000': {'total': 3, 'families': {'F000002_2': 2, 'F000011_11': 1}},
+}
+
 
 def get_hail_search_body(genome_version='GRCh38', num_results=100, sample_data=None, omit_sample_type=None, **search_body):
     sample_data = sample_data or EXPECTED_SAMPLE_DATA
diff --git a/hail_search/web_app.py b/hail_search/web_app.py
index d56bc331ec..8a288e64db 100644
--- a/hail_search/web_app.py
+++ b/hail_search/web_app.py
@@ -15,7 +15,7 @@ def hl_json_dumps(obj):
 
 
 async def gene_counts(request: web.Request) -> web.Response:
-    return web.json_response(search_hail_backend(await request.json(), gene_counts=True))
+    return web.json_response(search_hail_backend(await request.json(), gene_counts=True), dumps=hl_json_dumps)
 
 
 async def search(request: web.Request) -> web.Response:
@@ -32,5 +32,6 @@ def init_web_app():
     app.add_routes([
         web.get('/status', status),
         web.post('/search', search),
+        web.post('/gene_counts', gene_counts),
     ])
     return app
diff --git a/seqr/utils/search/hail_search_utils_tests.py b/seqr/utils/search/hail_search_utils_tests.py
index b4dc36b882..9090249b2a 100644
--- a/seqr/utils/search/hail_search_utils_tests.py
+++ b/seqr/utils/search/hail_search_utils_tests.py
@@ -8,10 +8,10 @@
 from seqr.models import Family
 from seqr.utils.search.utils import get_variant_query_gene_counts, query_variants, get_single_variant, \
     get_variants_for_variant_ids, InvalidSearchException
-from seqr.utils.search.search_utils_tests import SearchTestHelper, MOCK_COUNTS
+from seqr.utils.search.search_utils_tests import SearchTestHelper
 from hail_search.test_utils import get_hail_search_body, EXPECTED_SAMPLE_DATA, FAMILY_1_SAMPLE_DATA, \
     FAMILY_2_ALL_SAMPLE_DATA, ALL_AFFECTED_SAMPLE_DATA, CUSTOM_AFFECTED_SAMPLE_DATA, HAIL_BACKEND_VARIANTS, \
-    LOCATION_SEARCH, EXCLUDE_LOCATION_SEARCH, VARIANT_ID_SEARCH, RSID_SEARCH
+    LOCATION_SEARCH, EXCLUDE_LOCATION_SEARCH, VARIANT_ID_SEARCH, RSID_SEARCH, GENE_COUNTS
 MOCK_HOST = 'http://test-hail-host'
 
 
@@ -155,10 +155,10 @@ def test_query_variants(self):
 
     @responses.activate
     def test_get_variant_query_gene_counts(self):
-        responses.add(responses.POST, f'{MOCK_HOST}:5000/gene_counts', json=MOCK_COUNTS, status=200)
+        responses.add(responses.POST, f'{MOCK_HOST}:5000/gene_counts', json=GENE_COUNTS, status=200)
 
         gene_counts = get_variant_query_gene_counts(self.results_model, self.user)
-        self.assertDictEqual(gene_counts, MOCK_COUNTS)
+        self.assertDictEqual(gene_counts, GENE_COUNTS)
         self.assert_cached_results({'gene_aggs': gene_counts})
         self._test_expected_search_call(sort=None)
 
diff --git a/seqr/utils/search/search_utils_tests.py b/seqr/utils/search/search_utils_tests.py
index afb0816f98..ccfec426da 100644
--- a/seqr/utils/search/search_utils_tests.py
+++ b/seqr/utils/search/search_utils_tests.py
@@ -4,17 +4,12 @@
 import json
 import mock
 
+from hail_search.test_utils import GENE_COUNTS
 from seqr.models import Family, Sample, VariantSearch, VariantSearchResults
 from seqr.utils.search.utils import get_single_variant, get_variants_for_variant_ids, get_variant_query_gene_counts, \
     query_variants, InvalidSearchException
 from seqr.views.utils.test_utils import PARSED_VARIANTS, PARSED_COMPOUND_HET_VARIANTS_MULTI_PROJECT, GENE_FIELDS
 
-MOCK_COUNTS = {
-    'ENSG00000135953': {'total': 3, 'families': {'F000003_3': 2, 'F000002_2': 1, 'F000005_5': 1}},
-    'ENSG00000228198': {'total': 5, 'families': {'F000003_3': 4, 'F000002_2': 1, 'F000005_5': 1}},
-    'ENSG00000240361': {'total': 2, 'families': {'F000003_3': 2}},
-}
-
 
 class SearchTestHelper(object):
 
@@ -354,12 +349,12 @@ def test_invalid_search_get_variant_query_gene_counts(self):
 
     def test_get_variant_query_gene_counts(self, mock_get_variants):
         def _mock_get_variants(families, search, user, previous_search_results, genome_version, **kwargs):
-            previous_search_results['gene_aggs'] = MOCK_COUNTS
-            return MOCK_COUNTS
+            previous_search_results['gene_aggs'] = GENE_COUNTS
+            return GENE_COUNTS
         mock_get_variants.side_effect = _mock_get_variants
 
         gene_counts = get_variant_query_gene_counts(self.results_model, self.user)
-        self.assertDictEqual(gene_counts, MOCK_COUNTS)
+        self.assertDictEqual(gene_counts, GENE_COUNTS)
         results_cache = {'gene_aggs': gene_counts}
         self.assert_cached_results(results_cache)
         self._test_expected_search_call(

From 0cd2e89f17132628d05a523a4d28211e43e4a439 Mon Sep 17 00:00:00 2001
From: Shifa Zhang <zhangshifa07504@gmail.com>
Date: Wed, 16 Aug 2023 14:52:40 -0400
Subject: [PATCH 04/16] First draft for the new in-silico thresholds.

---
 .../components/panel/variants/Predictions.jsx | 43 ++++++------
 ui/shared/utils/constants.js                  | 68 ++++++++++++++-----
 2 files changed, 74 insertions(+), 37 deletions(-)

diff --git a/ui/shared/components/panel/variants/Predictions.jsx b/ui/shared/components/panel/variants/Predictions.jsx
index c4dcd7e5d9..300a1671d9 100644
--- a/ui/shared/components/panel/variants/Predictions.jsx
+++ b/ui/shared/components/panel/variants/Predictions.jsx
@@ -5,7 +5,7 @@ import { connect } from 'react-redux'
 import { Icon, Transition, Popup } from 'semantic-ui-react'
 
 import { getGenesById } from 'redux/selectors'
-import { PREDICTOR_FIELDS, getVariantMainGeneId } from 'shared/utils/constants'
+import { PREDICTOR_FIELDS, getPredictColor, getVariantMainGeneId } from 'shared/utils/constants'
 import { snakecaseToTitlecase } from 'shared/utils/stringUtils'
 import { HorizontalSpacer } from '../../Spacers'
 import { ButtonLink } from '../../StyledComponents'
@@ -20,7 +20,7 @@ const PredictionValue = styled.span`
 const NUM_TO_SHOW_ABOVE_THE_FOLD = 6 // how many predictors to show immediately
 
 const predictionFieldValue = (
-  predictions, { field, dangerThreshold, warningThreshold, indicatorMap, infoField, infoTitle },
+  predictions, { field, pathHigher, thresholds, indicatorMap, infoField, infoTitle },
 ) => {
   let value = predictions[field]
   if (value === null || value === undefined) {
@@ -29,22 +29,19 @@ const predictionFieldValue = (
 
   const infoValue = predictions[infoField]
 
-  if (dangerThreshold) {
-    value = parseFloat(value).toPrecision(2)
-    let color = 'green'
-    if (value >= dangerThreshold) {
-      color = 'red'
-    } else if (value >= warningThreshold) {
-      color = 'yellow'
-    }
-    return { value, color, infoValue, infoTitle, dangerThreshold, warningThreshold }
+  if (thresholds) {
+    value = parseFloat(value).toPrecision(3)
+    const color = getPredictColor(value, pathHigher, thresholds)
+    return { value, color, infoValue, infoTitle, pathHigher, thresholds }
   }
 
   return indicatorMap[value[0]] || indicatorMap[value]
 }
 
+const PATHOGENIC_COLORS = ['green', 'light green', 'grey', 'yellow', 'red', 'dark red']
+
 const Prediction = (
-  { field, fieldTitle, value, color, infoValue, infoTitle, warningThreshold, dangerThreshold, href },
+  { field, fieldTitle, value, color, infoValue, infoTitle, pathHigher, thresholds, href },
 ) => {
   const indicator = infoValue ? (
     <Popup
@@ -54,13 +51,21 @@ const Prediction = (
     />
   ) : <Icon name="circle" size="small" color={color} />
   const fieldName = fieldTitle || snakecaseToTitlecase(field)
-  const fieldDisplay = dangerThreshold ? (
+  const fieldDisplay = thresholds ? (
     <Popup
       header={`${fieldName} Color Ranges`}
       content={
         <div>
-          <div>{`Red > ${dangerThreshold}`}</div>
-          {warningThreshold < dangerThreshold && <div>{`Yellow > ${warningThreshold}`}</div>}
+          {thresholds.map((th, i) => {
+            if (!th) {
+              return null
+            }
+            const t = pathHigher ? th : -1 * th
+            if (i < 3) {
+              return <div>{`${PATHOGENIC_COLORS[i]} ${pathHigher ? '<' : '>'}= ${t}`}</div>
+            }
+            return <div>{`${PATHOGENIC_COLORS[i]} ${pathHigher ? '>' : '<'}= ${t}`}</div>
+          }).filter(e => !!e)}
         </div>
       }
       trigger={<span>{fieldName}</span>}
@@ -85,8 +90,8 @@ Prediction.propTypes = {
   infoTitle: PropTypes.string,
   fieldTitle: PropTypes.string,
   color: PropTypes.string,
-  warningThreshold: PropTypes.number,
-  dangerThreshold: PropTypes.number,
+  pathHigher: PropTypes.bool,
+  thresholds: PropTypes.arrayOf(PropTypes.number),
   href: PropTypes.string,
 }
 
@@ -116,8 +121,8 @@ class Predictions extends React.PureComponent {
     if (gene && gene.primateAi) {
       genePredictors.primate_ai = {
         field: 'primate_ai',
-        warningThreshold: gene.primateAi.percentile25,
-        dangerThreshold: gene.primateAi.percentile75,
+        pathHigher: gene.primateAi.percentile75 >= gene.primateAi.percentile25,
+        thresholds: [null, null, gene.primateAi.percentile25, gene.primateAi.percentile75, null],
       }
     }
 
diff --git a/ui/shared/utils/constants.js b/ui/shared/utils/constants.js
index 709b72b471..644941f16a 100644
--- a/ui/shared/utils/constants.js
+++ b/ui/shared/utils/constants.js
@@ -1316,16 +1316,37 @@ export const SV_IN_SILICO_GROUP = 'Structural'
 export const NO_SV_IN_SILICO_GROUPS = [MISSENSE_IN_SILICO_GROUP, CODING_IN_SILICO_GROUP]
 export const SPLICE_AI_FIELD = 'splice_ai'
 
+const PRED_COLOR_MAP = ['green', '#90ee90', 'grey', 'yellow', 'red', '#8b0000']
+
+export const getPredictColor = (value, pathHigher, thresholds) => {
+  let colorIndex = 0
+  const v = pathHigher ? value : -1 * value
+  if (v <= thresholds[1]) {
+    if (v > thresholds[0]) {
+      colorIndex = 1
+    }
+  } else {
+    colorIndex = 5
+    if (v < thresholds[2]) {
+      colorIndex = 2
+    } else if (v < thresholds[3]) {
+      colorIndex = 3
+    } else if (v < thresholds[4]) {
+      colorIndex = 4
+    }
+  }
+  return PRED_COLOR_MAP[colorIndex]
+}
+
 export const PREDICTOR_FIELDS = [
-  { field: 'cadd', group: CODING_IN_SILICO_GROUP, warningThreshold: 10, dangerThreshold: 20, min: 1, max: 99 },
-  { field: 'revel', group: MISSENSE_IN_SILICO_GROUP, warningThreshold: 0.5, dangerThreshold: 0.75 },
-  { field: 'primate_ai', group: MISSENSE_IN_SILICO_GROUP, warningThreshold: 0.5, dangerThreshold: 0.7 },
-  { field: 'mpc', group: MISSENSE_IN_SILICO_GROUP, warningThreshold: 1, dangerThreshold: 2, max: 5 },
+  { field: 'cadd', group: CODING_IN_SILICO_GROUP, thresholds: [0.15, 22.7, 25.3, 28.1, null], min: 1, max: 99 },
+  { field: 'revel', group: MISSENSE_IN_SILICO_GROUP, thresholds: [0.016, 0.29, 0.644, 0.773, 0.932] },
+  { field: 'primate_ai', group: MISSENSE_IN_SILICO_GROUP, thresholds: [null, 0.483, 0.79, 0.867, null] },
+  { field: 'mpc', group: MISSENSE_IN_SILICO_GROUP, thresholds: [null, null, 1.36, 1.828, null], max: 5 },
   {
     field: SPLICE_AI_FIELD,
     group: SPLICING_IN_SILICO_GROUP,
-    warningThreshold: 0.5,
-    dangerThreshold: 0.8,
+    thresholds: [null, null, 0.5, 0.8, null],
     infoField: 'splice_ai_consequence',
     infoTitle: 'Predicted Consequence',
     fieldTitle: 'SpliceAI',
@@ -1333,21 +1354,32 @@ export const PREDICTOR_FIELDS = [
       `https://spliceailookup.broadinstitute.org/#variant=${chrom}-${pos}-${ref}-${alt}&hg=${genomeVersion}&distance=1000&mask=1`
     ),
   },
-  { field: 'eigen', group: CODING_IN_SILICO_GROUP, warningThreshold: 1, dangerThreshold: 2, max: 99 },
-  { field: 'dann', displayOnly: true, warningThreshold: 0.93, dangerThreshold: 0.96 },
-  { field: 'strvctvre', group: SV_IN_SILICO_GROUP, warningThreshold: 0.5, dangerThreshold: 0.75 },
-  { field: 'polyphen', group: MISSENSE_IN_SILICO_GROUP, indicatorMap: POLYPHEN_MAP },
-  { field: 'sift', group: MISSENSE_IN_SILICO_GROUP, indicatorMap: INDICATOR_MAP },
+  { field: 'eigen', group: CODING_IN_SILICO_GROUP, thresholds: [null, null, 1, 2, null], max: 99 },
+  { field: 'dann', displayOnly: true, thresholds: [null, null, 0.93, 0.96, null] },
+  { field: 'strvctvre', group: SV_IN_SILICO_GROUP, thresholds: [null, null, 0.5, 0.75, null] },
+  { field: 'polyphen', group: MISSENSE_IN_SILICO_GROUP, thresholds: [null, 0.113, 0.978, 0.999, null], indicatorMap: POLYPHEN_MAP },
+  { field: 'sift', group: MISSENSE_IN_SILICO_GROUP, thresholds: [null, 0.08, 0.001, 0, null], indicatorMap: INDICATOR_MAP },
   { field: 'mut_taster', group: MISSENSE_IN_SILICO_GROUP, indicatorMap: MUTTASTER_MAP },
-  { field: 'fathmm', group: MISSENSE_IN_SILICO_GROUP, indicatorMap: FATHMM_MAP },
-  { field: 'vest', warningThreshold: 0.5, dangerThreshold: 0.764 },
-  { field: 'mut_pred', warningThreshold: 0.392, dangerThreshold: 0.737 },
-  { field: 'apogee', warningThreshold: 0.5, dangerThreshold: 0.5 },
-  { field: 'gnomad_noncoding', fieldTitle: 'gnomAD Constraint', displayOnly: true, warningThreshold: 2.18, dangerThreshold: 4 },
+  { field: 'fathmm', group: MISSENSE_IN_SILICO_GROUP, thresholds: [null, 3.32, -4.14, -5.04, null], indicatorMap: FATHMM_MAP },
+  { field: 'vest', thresholds: [null, 0.449, 0.764, 0.861, 0.965] },
+  { field: 'mut_pred', thresholds: [0.01, 0.391, 0.737, 0.829, 0.932] },
+  { field: 'apogee', thresholds: [null, null, 0.5, 0.5, null] },
+  { field: 'gnomad_noncoding', fieldTitle: 'gnomAD Constraint', displayOnly: true, thresholds: [null, null, 2.18, 4, null], warningThreshold: 2.18, dangerThreshold: 4 },
   { field: 'haplogroup_defining', indicatorMap: { Y: { color: 'green', value: '' } } },
   { field: 'mitotip', indicatorMap: MITOTIP_MAP },
-  { field: 'hmtvar', warningThreshold: 0.35, dangerThreshold: 0.35 },
-]
+  { field: 'hmtvar', thresholds: [null, null, 0.35, 0.35, null] },
+].map(({ thresholds, ...pred }) => {
+  if (!thresholds) {
+    return pred
+  }
+  const noneNullThresholds = thresholds.filter(t => t)
+  const pathHigher = noneNullThresholds[1] >= noneNullThresholds[0]
+  return {
+    ...pred,
+    pathHigher,
+    thresholds: pathHigher ? thresholds : thresholds.map(t => (t === null ? null : -1 * t)),
+  }
+})
 
 export const getVariantMainGeneId = ({ transcripts = {}, mainTranscriptId, selectedMainTranscriptId }) => {
   if (selectedMainTranscriptId || mainTranscriptId) {

From 7357125bbc080ddecea1005bc41b8692cb3277fe Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Wed, 16 Aug 2023 15:09:16 -0400
Subject: [PATCH 05/16] better validation

---
 seqr/views/apis/report_api.py | 57 ++++++++++++++++++++++++++++++++---
 1 file changed, 52 insertions(+), 5 deletions(-)

diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py
index 3e1e67995c..3ff0f8e08f 100644
--- a/seqr/views/apis/report_api.py
+++ b/seqr/views/apis/report_api.py
@@ -1033,6 +1033,21 @@ def _get_experiment_lookup_row(is_rna, row_data):
     }
 
 
+DATA_TYPE_VALIDATORS = {
+    'string': (
+        lambda val, validator: (not validator.get('is_bucket_path')) or val.startswith('gs://'),
+        lambda validator: ' are a google bucket path starting with gs://'
+    ),
+    'enumeration': (
+        lambda val, validator: val in validator['enumerations'],
+        lambda validator: f': {", ".join(column_validator["enumerations"])}',
+    ),
+    'integer': (lambda val, validator: val.replace(',', '').isnumeric(), None),
+    'float': (lambda val, validator: val.replace(',', '').replace('.', '').isnumeric(), None),
+    'date': (lambda val, validator: bool(re.match(r'^\d{4}-\d{2}-\d{2}$', val)), None),
+}
+
+
 def _validate_gregor_files(file_data):
     errors = []
     warnings = []
@@ -1070,6 +1085,26 @@ def _validate_gregor_files(file_data):
             warnings.append(
                 f'The following columns are included in the "{file_name}" data model but are missing in the report: {col_summary}'
             )
+        invalid_data_type_columns = {
+            col: validator['data_type'] for col, validator in table_validator.items()
+            if validator.get('data_type') and validator['data_type'] not in DATA_TYPE_VALIDATORS
+        }
+        if invalid_data_type_columns:
+            col_summary = ', '.join(sorted([f'{col} ({data_type})' for col, data_type in invalid_data_type_columns.items()]))
+            warnings.append(
+                f'The following columns are included in the "{file_name}" data model but have an unsupported data type: {col_summary}'
+            )
+        invalid_enum_columns = [
+            col for col, validator in table_validator.items()
+            if validator.get('data_type') == 'enumeration' and not validator.get('enumerations')
+        ]
+        if invalid_enum_columns:
+            for col in invalid_enum_columns:
+                table_validator[col]['data_type'] = None
+            col_summary = ', '.join(sorted(invalid_enum_columns))
+            warnings.append(
+                f'The following columns are specified as "enumeration" in the "{file_name}" data model but are missing the allowed values definition: {col_summary}'
+            )
 
         for column in columns:
             _validate_column_data(
@@ -1112,15 +1147,18 @@ def _has_required_table(table, validator, tables):
 
 
 def _validate_column_data(column, file_name, data, column_validator, warnings, errors):
-    enum = column_validator.get('enumerations')
+    data_type = column_validator.get('data_type')
+    data_type_validator, allowed_formatter = DATA_TYPE_VALIDATORS.get(data_type)
+    unique = column_validator.get('is_unique')
     required = column_validator.get('required')
     recommended = column in WARN_MISSING_TABLE_COLUMNS.get(file_name, [])
-    if not (required or enum or recommended):
+    if not (required or unique or recommended or data_type_validator):
         return
 
     missing = []
     warn_missing = []
     invalid = []
+    grouped_values = defaultdict(set)
     for row in data:
         value = row.get(column)
         if not value:
@@ -1130,9 +1168,13 @@ def _validate_column_data(column, file_name, data, column_validator, warnings, e
                 check_recommend_condition = WARN_MISSING_CONDITIONAL_COLUMNS.get(column)
                 if not check_recommend_condition or check_recommend_condition(row):
                     warn_missing.append(_get_row_id(row))
-        elif enum and value not in enum:
+        elif data_type_validator and not data_type_validator(value, column_validator):
             invalid.append(f'{_get_row_id(row)} ({value})')
-    if missing or warn_missing or invalid:
+        elif unique:
+            grouped_values[value].add(_get_row_id(row))
+
+    duplicates = [f'{k} ({", ".join(v)})' for k, v in grouped_values.items() if len(v) > 1]
+    if missing or warn_missing or invalid or duplicates:
         airtable_summary = ' (from Airtable)' if column in ALL_AIRTABLE_COLUMNS else ''
         error_template = f'The following entries {{issue}} "{column}"{airtable_summary} in the "{file_name}" table'
         if missing:
@@ -1141,8 +1183,13 @@ def _validate_column_data(column, file_name, data, column_validator, warnings, e
             )
         if invalid:
             invalid_values = f'Invalid values: {", ".join(sorted(invalid))}'
+            allowed = allowed_formatter(column_validator) if allowed_formatter else f' have data type {data_type}'
+            errors.append(
+                f'{error_template.format(issue="have invalid values for")}. Allowed values{allowed}. {invalid_values}'
+            )
+        if duplicates:
             errors.append(
-                f'{error_template.format(issue="have invalid values for")}. Allowed values: {", ".join(enum)}. {invalid_values}'
+                f'{error_template.format(issue="have non-unique values for")}: {", ".join(sorted(duplicates))}'
             )
         if warn_missing:
             warnings.append(

From c3586be3fc204a8846d9b4ff3df3534e258402a5 Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Wed, 16 Aug 2023 15:53:08 -0400
Subject: [PATCH 06/16] add tests

---
 seqr/views/apis/report_api.py       | 28 ++++++++--------
 seqr/views/apis/report_api_tests.py | 51 ++++++++++++++++-------------
 2 files changed, 41 insertions(+), 38 deletions(-)

diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py
index 3ff0f8e08f..4a9fef873f 100644
--- a/seqr/views/apis/report_api.py
+++ b/seqr/views/apis/report_api.py
@@ -1034,19 +1034,16 @@ def _get_experiment_lookup_row(is_rna, row_data):
 
 
 DATA_TYPE_VALIDATORS = {
-    'string': (
-        lambda val, validator: (not validator.get('is_bucket_path')) or val.startswith('gs://'),
-        lambda validator: ' are a google bucket path starting with gs://'
-    ),
-    'enumeration': (
-        lambda val, validator: val in validator['enumerations'],
-        lambda validator: f': {", ".join(column_validator["enumerations"])}',
-    ),
-    'integer': (lambda val, validator: val.replace(',', '').isnumeric(), None),
-    'float': (lambda val, validator: val.replace(',', '').replace('.', '').isnumeric(), None),
-    'date': (lambda val, validator: bool(re.match(r'^\d{4}-\d{2}-\d{2}$', val)), None),
+    'string': lambda val, validator: (not validator.get('is_bucket_path')) or val.startswith('gs://'),
+    'enumeration': lambda val, validator: val in validator['enumerations'],
+    'integer': lambda val, validator: val.replace(',', '').isnumeric(),
+    'float': lambda val, validator: val.replace(',', '').replace('.', '').isnumeric(),
+    'date': lambda val, validator: bool(re.match(r'^\d{4}-\d{2}-\d{2}$', val)),
+}
+DATA_TYPE_ERROR_FORMATTERS = {
+    'string': lambda validator: ' are a google bucket path starting with gs://',
+    'enumeration': lambda validator: f': {", ".join(validator["enumerations"])}',
 }
-
 
 def _validate_gregor_files(file_data):
     errors = []
@@ -1148,7 +1145,7 @@ def _has_required_table(table, validator, tables):
 
 def _validate_column_data(column, file_name, data, column_validator, warnings, errors):
     data_type = column_validator.get('data_type')
-    data_type_validator, allowed_formatter = DATA_TYPE_VALIDATORS.get(data_type)
+    data_type_validator = DATA_TYPE_VALIDATORS.get(data_type)
     unique = column_validator.get('is_unique')
     required = column_validator.get('required')
     recommended = column in WARN_MISSING_TABLE_COLUMNS.get(file_name, [])
@@ -1173,7 +1170,7 @@ def _validate_column_data(column, file_name, data, column_validator, warnings, e
         elif unique:
             grouped_values[value].add(_get_row_id(row))
 
-    duplicates = [f'{k} ({", ".join(v)})' for k, v in grouped_values.items() if len(v) > 1]
+    duplicates = [f'{k} ({", ".join(sorted(v))})' for k, v in grouped_values.items() if len(v) > 1]
     if missing or warn_missing or invalid or duplicates:
         airtable_summary = ' (from Airtable)' if column in ALL_AIRTABLE_COLUMNS else ''
         error_template = f'The following entries {{issue}} "{column}"{airtable_summary} in the "{file_name}" table'
@@ -1183,7 +1180,8 @@ def _validate_column_data(column, file_name, data, column_validator, warnings, e
             )
         if invalid:
             invalid_values = f'Invalid values: {", ".join(sorted(invalid))}'
-            allowed = allowed_formatter(column_validator) if allowed_formatter else f' have data type {data_type}'
+            allowed = DATA_TYPE_ERROR_FORMATTERS[data_type](column_validator) \
+                if data_type in DATA_TYPE_ERROR_FORMATTERS else f' have data type {data_type}'
             errors.append(
                 f'{error_template.format(issue="have invalid values for")}. Allowed values{allowed}. {invalid_values}'
             )
diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py
index b03234cebe..d01af29f22 100644
--- a/seqr/views/apis/report_api_tests.py
+++ b/seqr/views/apis/report_api_tests.py
@@ -209,7 +209,7 @@
         'target_insert_size_wes': '385',
         'sequencing_platform_wes': 'NovaSeq',
         'aligned_dna_short_read_file_wes': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.cram',
-        'aligned_dna_short_read_index_file_wes': 'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.crai',
+        'aligned_dna_short_read_index_file_wes': 'NA',
         'md5sum_wes': '129c28163df082',
         'reference_assembly': 'GRCh38',
         'alignment_software_dna': 'BWA-MEM-2.3',
@@ -295,7 +295,7 @@
         'md5sum_wes': 'a6f6308866765ce8',
         'md5sum_wgs': '2aa33e8c32020b1c',
         'reference_assembly': 'GRCh38',
-        'alignment_software_dna': 'BWA 0.7.15.r1140',
+        'alignment_software_dna': 'BWA-MEM-2.3',
         'mean_coverage_wes': '42.8',
         'mean_coverage_wgs': '36.1',
         'analysis_details': '',
@@ -388,23 +388,23 @@
             'table': 'participant',
             'required': True,
             'columns': [
-                {'column': 'participant_id', 'required': True},
-                {'column': 'internal_project_id'},
-                {'column': 'gregor_center', 'required': True, 'enumerations': ['BCM', 'BROAD', 'UW']},
-                {'column': 'consent_code', 'required': True, 'enumerations': ['GRU', 'HMB']},
-                {'column': 'recontactable', 'enumerations': ['Yes', 'No']},
-                {'column': 'prior_testing'},
+                {'column': 'participant_id', 'required': True, 'data_type': 'string'},
+                {'column': 'internal_project_id', 'data_type': 'reference'},
+                {'column': 'gregor_center', 'required': True, 'data_type': 'enumeration', 'enumerations': ['BCM', 'BROAD', 'UW']},
+                {'column': 'consent_code', 'required': True, 'data_type': 'enumeration', 'enumerations': ['GRU', 'HMB']},
+                {'column': 'recontactable', 'data_type': 'enumeration', 'enumerations': ['Yes', 'No']},
+                {'column': 'prior_testing', 'data_type': 'enumeration'},
                 {'column': 'family_id', 'required': True},
                 {'column': 'paternal_id'},
                 {'column': 'maternal_id'},
                 {'column': 'proband_relationship', 'required': True},
-                {'column': 'sex', 'required': True, 'enumerations': ['Male', 'Female', 'Unknown']},
-                {'column': 'reported_race', 'enumerations': ['Asian', 'White', 'Black']},
-                {'column': 'reported_ethnicity', 'enumerations': ['Hispanic or Latino', 'Not Hispanic or Latino']},
+                {'column': 'sex', 'required': True, 'data_type': 'enumeration', 'enumerations': ['Male', 'Female', 'Unknown']},
+                {'column': 'reported_race', 'data_type': 'enumeration', 'enumerations': ['Asian', 'White', 'Black']},
+                {'column': 'reported_ethnicity', 'data_type': 'enumeration', 'enumerations': ['Hispanic or Latino', 'Not Hispanic or Latino']},
                 {'column': 'ancestry_metadata'},
-                {'column': 'affected_status', 'required': True, 'enumerations': ['Affected', 'Unaffected', 'Unknown']},
+                {'column': 'affected_status', 'required': True, 'data_type': 'enumeration', 'enumerations': ['Affected', 'Unaffected', 'Unknown']},
                 {'column': 'phenotype_description'},
-                {'column': 'age_at_enrollment'},
+                {'column': 'age_at_enrollment', 'data_type': 'date'},
             ],
         },
         {
@@ -413,13 +413,13 @@
             'columns': [
                 {'column': 'aligned_dna_short_read_id', 'required': True},
                 {'column': 'experiment_dna_short_read_id', 'required': True},
-                {'column': 'aligned_dna_short_read_file'},
-                {'column': 'aligned_dna_short_read_index_file'},
-                {'column': 'alignment_software'},
+                {'column': 'aligned_dna_short_read_file', 'is_unique': True, 'data_type': 'string', 'is_bucket_path': True},
+                {'column': 'aligned_dna_short_read_index_file', 'data_type': 'string', 'is_bucket_path': True},
+                {'column': 'alignment_software', 'is_unique': True},
                 {'column': 'analysis_details'},
-                {'column': 'md5sum'},
-                {'column': 'mean_coverage', 'required': True},
-                {'column': 'reference_assembly'},
+                {'column': 'md5sum', 'is_unique': True},
+                {'column': 'mean_coverage', 'required': True, 'data_type': 'float'},
+                {'column': 'reference_assembly', 'data_type': 'integer'},
                 {'column': 'reference_assembly_details'},
                 {'column': 'reference_assembly_uri'},
                 {'column': 'quality_issues'},
@@ -816,6 +816,8 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
             'The following tables are required in the data model but absent from the reports: subject, dna_read_data_set',
             'The following columns are included in the "participant" table but are missing from the data model: age_at_last_observation, ancestry_detail, pmid_id, proband_relationship_detail, sex_detail, twin_id',
             'The following columns are included in the "participant" data model but are missing in the report: ancestry_metadata',
+            'The following columns are included in the "participant" data model but have an unsupported data type: internal_project_id (reference)',
+            'The following columns are specified as "enumeration" in the "participant" data model but are missing the allowed values definition: prior_testing',
             'The following entries are missing recommended "recontactable" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881',
             'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
             'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
@@ -824,7 +826,11 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
         self.assertListEqual(response.json()['errors'], [
             'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
             'The following entries have invalid values for "reported_race" in the "participant" table. Allowed values: Asian, White, Black. Invalid values: Broad_NA19675_1 (Middle Eastern or North African)',
+            'The following entries have invalid values for "age_at_enrollment" in the "participant" table. Allowed values have data type date. Invalid values: Broad_NA19675_1 (18)',
+            'The following entries have invalid values for "aligned_dna_short_read_index_file" (from Airtable) in the "aligned_dna_short_read" table. Allowed values are a google bucket path starting with gs://. Invalid values: VCGS_FAM203_621_D2 (NA)',
+            'The following entries have invalid values for "reference_assembly" (from Airtable) in the "aligned_dna_short_read" table. Allowed values have data type integer. Invalid values: NA20888 (GRCh38), VCGS_FAM203_621_D2 (GRCh38)',
             'The following entries are missing required "mean_coverage" (from Airtable) in the "aligned_dna_short_read" table: VCGS_FAM203_621_D2',
+            'The following entries have non-unique values for "alignment_software" (from Airtable) in the "aligned_dna_short_read" table: BWA-MEM-2.3 (NA20888, VCGS_FAM203_621_D2)',
         ])
 
         responses.add(responses.GET, MOCK_DATA_MODEL_URL, status=404)
@@ -976,20 +982,19 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False):
         self.assertIn([
             'Broad_exome_VCGS_FAM203_621_D2_1', 'Broad_exome_VCGS_FAM203_621_D2',
             'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.cram',
-            'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_COL_FAM1_1_D1.crai', '129c28163df082', 'GRCh38',
-            '', '', '', 'BWA-MEM-2.3', 'DOI:10.5281/zenodo.4469317', '',
+            'NA', '129c28163df082', 'GRCh38', '', '', '', 'BWA-MEM-2.3', 'DOI:10.5281/zenodo.4469317', '',
         ], read_file)
         self.assertIn([
             'Broad_exome_NA20888_1', 'Broad_exome_NA20888',
             'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.cram',
             'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888.crai', 'a6f6308866765ce8', 'GRCh38', '', '',
-            '42.8', 'BWA 0.7.15.r1140', '', '',
+            '42.8', 'BWA-MEM-2.3', '', '',
         ], read_file)
         self.assertEqual([
              'Broad_genome_NA20888_1_1', 'Broad_genome_NA20888_1',
              'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888_1.cram',
              'gs://fc-eb352699-d849-483f-aefe-9d35ce2b21ac/Broad_NA20888_1.crai', '2aa33e8c32020b1c', 'GRCh38', '', '',
-             '36.1', 'BWA 0.7.15.r1140', '', '',
+             '36.1', 'BWA-MEM-2.3', '', '',
         ] in read_file, has_second_project)
 
         self.assertEqual(len(read_set_file), num_airtable_rows)

From 78fc0b721843909f6776d5c1b56b558d123c3578 Mon Sep 17 00:00:00 2001
From: Shifa Zhang <zhangshifa07504@gmail.com>
Date: Thu, 17 Aug 2023 16:32:18 -0400
Subject: [PATCH 07/16] Update per review.

---
 .../components/panel/variants/Predictions.jsx | 69 +++++++++++++------
 ui/shared/utils/constants.js                  | 68 ++++++------------
 2 files changed, 69 insertions(+), 68 deletions(-)

diff --git a/ui/shared/components/panel/variants/Predictions.jsx b/ui/shared/components/panel/variants/Predictions.jsx
index 300a1671d9..fd60ceefc7 100644
--- a/ui/shared/components/panel/variants/Predictions.jsx
+++ b/ui/shared/components/panel/variants/Predictions.jsx
@@ -5,10 +5,10 @@ import { connect } from 'react-redux'
 import { Icon, Transition, Popup } from 'semantic-ui-react'
 
 import { getGenesById } from 'redux/selectors'
-import { PREDICTOR_FIELDS, getPredictColor, getVariantMainGeneId } from 'shared/utils/constants'
+import { PREDICTOR_FIELDS, PRED_COLOR_MAP, getVariantMainGeneId } from 'shared/utils/constants'
 import { snakecaseToTitlecase } from 'shared/utils/stringUtils'
 import { HorizontalSpacer } from '../../Spacers'
-import { ButtonLink } from '../../StyledComponents'
+import { ButtonLink, ColoredIcon } from '../../StyledComponents'
 
 const PredictionValue = styled.span`
   margin-left: 5px;
@@ -19,8 +19,27 @@ const PredictionValue = styled.span`
 
 const NUM_TO_SHOW_ABOVE_THE_FOLD = 6 // how many predictors to show immediately
 
+const comparePathScores = (value, i, thresholds) => {
+  if (i < 2) { // Benign thresholds
+    if (i === 0) {
+      return value <= thresholds[0]
+    }
+    return (thresholds[0] === undefined || value > thresholds[0]) && value <= thresholds[1]
+  }
+
+  if (i === 2) { // Grey area
+    return (thresholds[1] === undefined || value > thresholds[1]) && value < thresholds[2]
+  }
+
+  // Pathogenic thresholds
+  if (i === 5) {
+    return true
+  }
+  return value >= thresholds[i - 1] && (thresholds[i] === undefined || value < thresholds[i])
+}
+
 const predictionFieldValue = (
-  predictions, { field, pathHigher, thresholds, indicatorMap, infoField, infoTitle },
+  predictions, { field, thresholds, indicatorMap, infoField, infoTitle },
 ) => {
   let value = predictions[field]
   if (value === null || value === undefined) {
@@ -31,17 +50,17 @@ const predictionFieldValue = (
 
   if (thresholds) {
     value = parseFloat(value).toPrecision(3)
-    const color = getPredictColor(value, pathHigher, thresholds)
-    return { value, color, infoValue, infoTitle, pathHigher, thresholds }
+    const color = PRED_COLOR_MAP.find((clr, i) => comparePathScores(value, i, thresholds))
+    return { value, color, infoValue, infoTitle, thresholds }
   }
 
   return indicatorMap[value[0]] || indicatorMap[value]
 }
 
-const PATHOGENIC_COLORS = ['green', 'light green', 'grey', 'yellow', 'red', 'dark red']
+const coloredIcon = color => <ColoredIcon name="circle" size="small" color={color} />
 
 const Prediction = (
-  { field, fieldTitle, value, color, infoValue, infoTitle, pathHigher, thresholds, href },
+  { field, fieldTitle, value, color, infoValue, infoTitle, thresholds, href },
 ) => {
   const indicator = infoValue ? (
     <Popup
@@ -49,23 +68,32 @@ const Prediction = (
       content={infoValue}
       trigger={<Icon name="question circle" size="small" color={color} />}
     />
-  ) : <Icon name="circle" size="small" color={color} />
+  ) : coloredIcon(color)
   const fieldName = fieldTitle || snakecaseToTitlecase(field)
   const fieldDisplay = thresholds ? (
     <Popup
       header={`${fieldName} Color Ranges`}
       content={
         <div>
-          {thresholds.map((th, i) => {
-            if (!th) {
-              return null
-            }
-            const t = pathHigher ? th : -1 * th
-            if (i < 3) {
-              return <div>{`${PATHOGENIC_COLORS[i]} ${pathHigher ? '<' : '>'}= ${t}`}</div>
-            }
-            return <div>{`${PATHOGENIC_COLORS[i]} ${pathHigher ? '>' : '<'}= ${t}`}</div>
-          }).filter(e => !!e)}
+          {[0, 1].map(i => thresholds[i] !== undefined && (
+            <div>
+              {coloredIcon(PRED_COLOR_MAP[i])}
+              {i > 0 && thresholds[i - 1] !== undefined && ` > ${thresholds[i - 1]} and`}
+              {` <= ${thresholds[i]}`}
+            </div>
+          ))}
+          <div>
+            {coloredIcon(PRED_COLOR_MAP[2])}
+            {thresholds[1] === undefined ? '' : ` > ${thresholds[1]} and`}
+            {` < ${thresholds[2]}`}
+          </div>
+          {[2, 3, 4].map(i => thresholds[i] !== undefined && (
+            <div>
+              {coloredIcon(PRED_COLOR_MAP[i + 1])}
+              {` >= ${thresholds[i]}`}
+              {i < 4 && thresholds[i + 1] !== undefined && ` and < ${thresholds[i + 1]}`}
+            </div>
+          ))}
         </div>
       }
       trigger={<span>{fieldName}</span>}
@@ -90,7 +118,6 @@ Prediction.propTypes = {
   infoTitle: PropTypes.string,
   fieldTitle: PropTypes.string,
   color: PropTypes.string,
-  pathHigher: PropTypes.bool,
   thresholds: PropTypes.arrayOf(PropTypes.number),
   href: PropTypes.string,
 }
@@ -121,8 +148,8 @@ class Predictions extends React.PureComponent {
     if (gene && gene.primateAi) {
       genePredictors.primate_ai = {
         field: 'primate_ai',
-        pathHigher: gene.primateAi.percentile75 >= gene.primateAi.percentile25,
-        thresholds: [null, null, gene.primateAi.percentile25, gene.primateAi.percentile75, null],
+        thresholds: [undefined, undefined, gene.primateAi.percentile25.toPrecision(3),
+          gene.primateAi.percentile75.toPrecision(3), undefined],
       }
     }
 
diff --git a/ui/shared/utils/constants.js b/ui/shared/utils/constants.js
index 644941f16a..7b87421bfe 100644
--- a/ui/shared/utils/constants.js
+++ b/ui/shared/utils/constants.js
@@ -1316,37 +1316,17 @@ export const SV_IN_SILICO_GROUP = 'Structural'
 export const NO_SV_IN_SILICO_GROUPS = [MISSENSE_IN_SILICO_GROUP, CODING_IN_SILICO_GROUP]
 export const SPLICE_AI_FIELD = 'splice_ai'
 
-const PRED_COLOR_MAP = ['green', '#90ee90', 'grey', 'yellow', 'red', '#8b0000']
-
-export const getPredictColor = (value, pathHigher, thresholds) => {
-  let colorIndex = 0
-  const v = pathHigher ? value : -1 * value
-  if (v <= thresholds[1]) {
-    if (v > thresholds[0]) {
-      colorIndex = 1
-    }
-  } else {
-    colorIndex = 5
-    if (v < thresholds[2]) {
-      colorIndex = 2
-    } else if (v < thresholds[3]) {
-      colorIndex = 3
-    } else if (v < thresholds[4]) {
-      colorIndex = 4
-    }
-  }
-  return PRED_COLOR_MAP[colorIndex]
-}
+export const PRED_COLOR_MAP = ['green', '#90ee90', 'grey', 'yellow', 'red', '#8b0000']
 
 export const PREDICTOR_FIELDS = [
-  { field: 'cadd', group: CODING_IN_SILICO_GROUP, thresholds: [0.15, 22.7, 25.3, 28.1, null], min: 1, max: 99 },
+  { field: 'cadd', group: CODING_IN_SILICO_GROUP, thresholds: [0.15, 22.7, 25.3, 28.1, undefined], min: 1, max: 99 },
   { field: 'revel', group: MISSENSE_IN_SILICO_GROUP, thresholds: [0.016, 0.29, 0.644, 0.773, 0.932] },
-  { field: 'primate_ai', group: MISSENSE_IN_SILICO_GROUP, thresholds: [null, 0.483, 0.79, 0.867, null] },
-  { field: 'mpc', group: MISSENSE_IN_SILICO_GROUP, thresholds: [null, null, 1.36, 1.828, null], max: 5 },
+  { field: 'primate_ai', group: MISSENSE_IN_SILICO_GROUP, thresholds: [undefined, 0.483, 0.79, 0.867, undefined] },
+  { field: 'mpc', group: MISSENSE_IN_SILICO_GROUP, thresholds: [undefined, undefined, 1.36, 1.828, undefined], max: 5 },
   {
     field: SPLICE_AI_FIELD,
     group: SPLICING_IN_SILICO_GROUP,
-    thresholds: [null, null, 0.5, 0.8, null],
+    thresholds: [undefined, undefined, 0.5, 0.8, undefined],
     infoField: 'splice_ai_consequence',
     infoTitle: 'Predicted Consequence',
     fieldTitle: 'SpliceAI',
@@ -1354,32 +1334,26 @@ export const PREDICTOR_FIELDS = [
       `https://spliceailookup.broadinstitute.org/#variant=${chrom}-${pos}-${ref}-${alt}&hg=${genomeVersion}&distance=1000&mask=1`
     ),
   },
-  { field: 'eigen', group: CODING_IN_SILICO_GROUP, thresholds: [null, null, 1, 2, null], max: 99 },
-  { field: 'dann', displayOnly: true, thresholds: [null, null, 0.93, 0.96, null] },
-  { field: 'strvctvre', group: SV_IN_SILICO_GROUP, thresholds: [null, null, 0.5, 0.75, null] },
-  { field: 'polyphen', group: MISSENSE_IN_SILICO_GROUP, thresholds: [null, 0.113, 0.978, 0.999, null], indicatorMap: POLYPHEN_MAP },
-  { field: 'sift', group: MISSENSE_IN_SILICO_GROUP, thresholds: [null, 0.08, 0.001, 0, null], indicatorMap: INDICATOR_MAP },
+  { field: 'eigen', group: CODING_IN_SILICO_GROUP, thresholds: [undefined, undefined, 1, 2, undefined], max: 99 },
+  { field: 'dann', displayOnly: true, thresholds: [undefined, undefined, 0.93, 0.96, undefined] },
+  { field: 'strvctvre', group: SV_IN_SILICO_GROUP, thresholds: [undefined, undefined, 0.5, 0.75, undefined] },
+  { field: 'polyphen', group: MISSENSE_IN_SILICO_GROUP, indicatorMap: POLYPHEN_MAP },
+  { field: 'sift', group: MISSENSE_IN_SILICO_GROUP, indicatorMap: INDICATOR_MAP },
   { field: 'mut_taster', group: MISSENSE_IN_SILICO_GROUP, indicatorMap: MUTTASTER_MAP },
-  { field: 'fathmm', group: MISSENSE_IN_SILICO_GROUP, thresholds: [null, 3.32, -4.14, -5.04, null], indicatorMap: FATHMM_MAP },
-  { field: 'vest', thresholds: [null, 0.449, 0.764, 0.861, 0.965] },
+  { field: 'fathmm', group: MISSENSE_IN_SILICO_GROUP, indicatorMap: FATHMM_MAP },
+  { field: 'vest', thresholds: [undefined, 0.449, 0.764, 0.861, 0.965] },
   { field: 'mut_pred', thresholds: [0.01, 0.391, 0.737, 0.829, 0.932] },
-  { field: 'apogee', thresholds: [null, null, 0.5, 0.5, null] },
-  { field: 'gnomad_noncoding', fieldTitle: 'gnomAD Constraint', displayOnly: true, thresholds: [null, null, 2.18, 4, null], warningThreshold: 2.18, dangerThreshold: 4 },
+  { field: 'apogee', thresholds: [undefined, undefined, 0.5, 0.5, undefined] },
+  {
+    field: 'gnomad_noncoding',
+    fieldTitle: 'gnomAD Constraint',
+    displayOnly: true,
+    thresholds: [undefined, undefined, 2.18, 4, undefined],
+  },
   { field: 'haplogroup_defining', indicatorMap: { Y: { color: 'green', value: '' } } },
   { field: 'mitotip', indicatorMap: MITOTIP_MAP },
-  { field: 'hmtvar', thresholds: [null, null, 0.35, 0.35, null] },
-].map(({ thresholds, ...pred }) => {
-  if (!thresholds) {
-    return pred
-  }
-  const noneNullThresholds = thresholds.filter(t => t)
-  const pathHigher = noneNullThresholds[1] >= noneNullThresholds[0]
-  return {
-    ...pred,
-    pathHigher,
-    thresholds: pathHigher ? thresholds : thresholds.map(t => (t === null ? null : -1 * t)),
-  }
-})
+  { field: 'hmtvar', thresholds: [undefined, undefined, 0.35, 0.35, undefined] },
+]
 
 export const getVariantMainGeneId = ({ transcripts = {}, mainTranscriptId, selectedMainTranscriptId }) => {
   if (selectedMainTranscriptId || mainTranscriptId) {

From 43ae552a71258ffe2c735df06e503d60524d9c4d Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Fri, 18 Aug 2023 10:10:22 -0400
Subject: [PATCH 08/16] update called variant file filter missing

---
 seqr/views/apis/report_api.py       | 7 +++++--
 seqr/views/apis/report_api_tests.py | 9 +++------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py
index 4a9fef873f..1d422526b3 100644
--- a/seqr/views/apis/report_api.py
+++ b/seqr/views/apis/report_api.py
@@ -713,8 +713,9 @@ def _get_sample_airtable_metadata(sample_ids, user, include_collaborator=False):
 READ_RNA_TABLE_COLUMNS.insert(10, 'gene_annotation_details')
 READ_RNA_TABLE_COLUMNS.insert(13, 'alignment_postprocessing')
 READ_SET_TABLE_COLUMNS = ['aligned_dna_short_read_set_id', 'aligned_dna_short_read_id']
+CALLED_VARIANT_FILE_COLUMN = 'called_variants_dna_file'
 CALLED_TABLE_COLUMNS = [
-    'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum',
+    'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', CALLED_VARIANT_FILE_COLUMN, 'md5sum',
     'caller_software', 'variant_types', 'analysis_details',
 ]
 
@@ -912,7 +913,9 @@ def gregor_export(request):
         ('experiment_dna_short_read', EXPERIMENT_TABLE_COLUMNS, airtable_rows),
         ('aligned_dna_short_read', READ_TABLE_COLUMNS, airtable_rows),
         ('aligned_dna_short_read_set', READ_SET_TABLE_COLUMNS, airtable_rows),
-        ('called_variants_dna_short_read', CALLED_TABLE_COLUMNS, airtable_rows),
+        ('called_variants_dna_short_read', CALLED_TABLE_COLUMNS, [
+            row for row in airtable_rows if row.get(CALLED_VARIANT_FILE_COLUMN)
+        ]),
         ('experiment_rna_short_read', EXPERIMENT_RNA_TABLE_COLUMNS, airtable_rna_rows),
         ('aligned_rna_short_read', READ_RNA_TABLE_COLUMNS, airtable_rna_rows),
         ('experiment', EXPERIMENT_LOOKUP_TABLE_COLUMNS, experiment_lookup_rows),
diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py
index d01af29f22..ff6f324ad5 100644
--- a/seqr/views/apis/report_api_tests.py
+++ b/seqr/views/apis/report_api_tests.py
@@ -299,9 +299,9 @@
         'mean_coverage_wes': '42.8',
         'mean_coverage_wgs': '36.1',
         'analysis_details': '',
-        'called_variants_dna_short_read_id': 'NA',
+        'called_variants_dna_short_read_id': '',
         'aligned_dna_short_read_set_id': 'Broad_NA20888_D1',
-        'called_variants_dna_file': 'NA',
+        'called_variants_dna_file': '',
         'caller_software': 'NA',
         'variant_types': 'SNV',
       },
@@ -1003,7 +1003,7 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False):
         self.assertIn(['Broad_NA20888_D1', 'Broad_exome_NA20888_1'], read_set_file)
         self.assertEqual(['Broad_NA20888_D1', 'Broad_genome_NA20888_1_1'] in read_set_file, has_second_project)
 
-        self.assertEqual(len(called_file), num_airtable_rows)
+        self.assertEqual(len(called_file), 2)
         self.assertEqual(called_file[0], [
             'called_variants_dna_short_read_id', 'aligned_dna_short_read_set_id', 'called_variants_dna_file', 'md5sum',
             'caller_software', 'variant_types', 'analysis_details',
@@ -1012,9 +1012,6 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False):
             'SX2-3', 'BCM_H7YG5DSX2', 'gs://fc-fed09429-e563-44a7-aaeb-776c8336ba02/COL_FAM1_1_D1.SV.vcf',
             '129c28163df082', 'gatk4.1.2', 'SNV', 'DOI:10.5281/zenodo.4469317',
         ], called_file)
-        self.assertIn(['NA', 'Broad_NA20888_D1', 'NA', 'a6f6308866765ce8', 'NA', 'SNV', ''], called_file)
-        self.assertEqual(
-            ['NA', 'Broad_NA20888_D1', 'NA', '2aa33e8c32020b1c', 'NA', 'SNV', ''] in called_file, has_second_project)
 
         self.assertEqual(len(experiment_rna_file), 2)
         self.assertEqual(experiment_rna_file[0], [

From 0a858611be6974df64913fd01f435e517b56f51c Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Fri, 18 Aug 2023 10:38:18 -0400
Subject: [PATCH 09/16] better float and int validation and testing

---
 seqr/views/apis/report_api.py       |  5 +++--
 seqr/views/apis/report_api_tests.py | 30 ++++++++++++++++++++++++++++-
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py
index 4a9fef873f..b52ac8cc6f 100644
--- a/seqr/views/apis/report_api.py
+++ b/seqr/views/apis/report_api.py
@@ -1033,11 +1033,12 @@ def _get_experiment_lookup_row(is_rna, row_data):
     }
 
 
+is_integer = lambda val, *args: val.isnumeric() or re.match(r'^[\d{3},]*\d{3}$', val)
 DATA_TYPE_VALIDATORS = {
     'string': lambda val, validator: (not validator.get('is_bucket_path')) or val.startswith('gs://'),
     'enumeration': lambda val, validator: val in validator['enumerations'],
-    'integer': lambda val, validator: val.replace(',', '').isnumeric(),
-    'float': lambda val, validator: val.replace(',', '').replace('.', '').isnumeric(),
+    'integer': is_integer,
+    'float': lambda val, validator: is_integer(val) or re.match(r'^\d+.\d+$', val),
     'date': lambda val, validator: bool(re.match(r'^\d{4}-\d{2}-\d{2}$', val)),
 }
 DATA_TYPE_ERROR_FORMATTERS = {
diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py
index d01af29f22..e3a46a930d 100644
--- a/seqr/views/apis/report_api_tests.py
+++ b/seqr/views/apis/report_api_tests.py
@@ -441,6 +441,33 @@
             'required': 'CONDITIONAL (aligned_dna_short_read_set, dna_read_data)',
             'columns': [{'column': 'analyte_id', 'required': True}],
         },
+        {
+            'table': 'experiment_rna_short_read',
+            'columns': [
+                {'column': 'experiment_rna_short_read_id', 'required': True},
+                {'column': 'analyte_id', 'required': True},
+                {'column': 'experiment_sample_id'},
+                {'column': 'seq_library_prep_kit_method'},
+                {'column': 'library_prep_type'},
+                {'column': 'experiment_type'},
+                {'column': 'read_length', 'data_type': 'integer'},
+                {'column': 'single_or_paired_ends'},
+                {'column': 'date_data_generation', 'data_type': 'float'},
+                {'column': 'sequencing_platform'},
+                {'column': 'within_site_batch_name'},
+                {'column': 'RIN', 'data_type': 'float'},
+                {'column': 'estimated_library_size'},
+                {'column': 'total_reads', 'data_type': 'integer'},
+                {'column': 'percent_rRNA', 'data_type': 'float'},
+                {'column': 'percent_mRNA', 'data_type': 'float'},
+                {'column': 'percent_mtRNA', 'data_type': 'float'},
+                {'column': 'percent_Globin', 'data_type': 'float'},
+                {'column': 'percent_UMI', 'data_type': 'float'},
+                {'column': '5prime3prime_bias', 'data_type': 'float'},
+                {'column': 'percent_GC', 'data_type': 'float'},
+                {'column': 'percent_chrX_Y', 'data_type': 'float'},
+            ],
+        },
     ]
 }
 
@@ -822,7 +849,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
             'The following entries are missing recommended "reported_race" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
             'The following entries are missing recommended "phenotype_description" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
             'The following entries are missing recommended "age_at_enrollment" in the "participant" table: Broad_HG00731, Broad_NA20870, Broad_NA20872, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
-        ] + skipped_file_validation_warnings[1:5] + skipped_file_validation_warnings[7:])
+        ] + skipped_file_validation_warnings[1:5] + skipped_file_validation_warnings[7:8] + skipped_file_validation_warnings[9:])
         self.assertListEqual(response.json()['errors'], [
             'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00731, Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888',
             'The following entries have invalid values for "reported_race" in the "participant" table. Allowed values: Asian, White, Black. Invalid values: Broad_NA19675_1 (Middle Eastern or North African)',
@@ -831,6 +858,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat
             'The following entries have invalid values for "reference_assembly" (from Airtable) in the "aligned_dna_short_read" table. Allowed values have data type integer. Invalid values: NA20888 (GRCh38), VCGS_FAM203_621_D2 (GRCh38)',
             'The following entries are missing required "mean_coverage" (from Airtable) in the "aligned_dna_short_read" table: VCGS_FAM203_621_D2',
             'The following entries have non-unique values for "alignment_software" (from Airtable) in the "aligned_dna_short_read" table: BWA-MEM-2.3 (NA20888, VCGS_FAM203_621_D2)',
+            'The following entries have invalid values for "date_data_generation" (from Airtable) in the "experiment_rna_short_read" table. Allowed values have data type float. Invalid values: NA19679 (2023-02-11)',
         ])
 
         responses.add(responses.GET, MOCK_DATA_MODEL_URL, status=404)

From 04676d87f2f72b90345bfcfc3701159163bb59fd Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <b.p.blankenmeister@gmail.com>
Date: Fri, 18 Aug 2023 13:26:48 -0400
Subject: [PATCH 10/16] Add "wait_for_routes" helper script for CronJobs.
 (#3565)

* add wait_for_routes script

* add wait

* wait for routes

* Add wait_for_routes helper script

* Flesh out description

* Update wait_for_routes
---
 deploy/docker/seqr/Dockerfile      |  1 +
 deploy/docker/seqr/wait_for_routes | 26 ++++++++++++++++++++++++++
 2 files changed, 27 insertions(+)
 create mode 100755 deploy/docker/seqr/wait_for_routes

diff --git a/deploy/docker/seqr/Dockerfile b/deploy/docker/seqr/Dockerfile
index 2093aef700..22a7b63a27 100644
--- a/deploy/docker/seqr/Dockerfile
+++ b/deploy/docker/seqr/Dockerfile
@@ -93,6 +93,7 @@ EXPOSE 8000
 ENV TERM=xterm
 
 COPY deploy/docker/seqr/readiness_probe /
+COPY deploy/docker/seqr/wait_for_routes /
 COPY deploy/docker/seqr/bin/*.sh /usr/local/bin/
 COPY deploy/docker/seqr/config/*.py ./
 COPY deploy/docker/seqr/bashrc /root/.bashrc
diff --git a/deploy/docker/seqr/wait_for_routes b/deploy/docker/seqr/wait_for_routes
new file mode 100755
index 0000000000..f8263fe1ac
--- /dev/null
+++ b/deploy/docker/seqr/wait_for_routes
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+###
+# Waits for network endpoints.  Intended usage is within Kubernetes CronJobs to wait for sidecar availability.
+# Usage: ./wait_for_routes https://www.google.com/ https://www.broadinstitute.org https://www.broadins.org
+###  Routes are checked in order; exits 1 on the first route that never answers within RETRY_COUNT attempts.
+
+RETRY_COUNT=10  # maximum connection attempts per route
+SLEEP_S=2       # seconds to wait between attempts
+
+for route in "$@"
+do
+    retries=0
+    until [ "$retries" -ge "$RETRY_COUNT" ]
+    do
+        curl -s "$route" -o /dev/null && echo "Successful ping of $route" && break
+        retries=$((retries+1))
+        if [ "$retries" -eq "$RETRY_COUNT" ]; then
+            echo "Route ${route} wasn't available after ${RETRY_COUNT} connection attempts"
+            exit 1
+        else
+            echo "Unable to connect to ${route}, retrying. Attempt ${retries}/${RETRY_COUNT}"
+            sleep "$SLEEP_S"
+        fi
+    done
+done

From 0cbc4af350974ba2415636c553d7e3ff8f2f27f3 Mon Sep 17 00:00:00 2001
From: Shifa Zhang <zhangshifa07504@gmail.com>
Date: Fri, 18 Aug 2023 14:06:24 -0400
Subject: [PATCH 11/16] Update the threshold search and popup info.

---
 .../components/panel/variants/Predictions.jsx | 58 ++++++-------------
 ui/shared/utils/constants.js                  | 12 ++--
 2 files changed, 23 insertions(+), 47 deletions(-)

diff --git a/ui/shared/components/panel/variants/Predictions.jsx b/ui/shared/components/panel/variants/Predictions.jsx
index fd60ceefc7..b569b56c0a 100644
--- a/ui/shared/components/panel/variants/Predictions.jsx
+++ b/ui/shared/components/panel/variants/Predictions.jsx
@@ -19,25 +19,6 @@ const PredictionValue = styled.span`
 
 const NUM_TO_SHOW_ABOVE_THE_FOLD = 6 // how many predictors to show immediately
 
-const comparePathScores = (value, i, thresholds) => {
-  if (i < 2) { // Benign thresholds
-    if (i === 0) {
-      return value <= thresholds[0]
-    }
-    return (thresholds[0] === undefined || value > thresholds[0]) && value <= thresholds[1]
-  }
-
-  if (i === 2) { // Grey area
-    return (thresholds[1] === undefined || value > thresholds[1]) && value < thresholds[2]
-  }
-
-  // Pathogenic thresholds
-  if (i === 5) {
-    return true
-  }
-  return value >= thresholds[i - 1] && (thresholds[i] === undefined || value < thresholds[i])
-}
-
 const predictionFieldValue = (
   predictions, { field, thresholds, indicatorMap, infoField, infoTitle },
 ) => {
@@ -50,14 +31,19 @@ const predictionFieldValue = (
 
   if (thresholds) {
     value = parseFloat(value).toPrecision(3)
-    const color = PRED_COLOR_MAP.find((clr, i) => comparePathScores(value, i, thresholds))
+    const color = PRED_COLOR_MAP.find(
+      (clr, i) => (thresholds[i - 1] || thresholds[i - 1]) &&
+        (thresholds[i - 1] === undefined || value >= thresholds[i - 1]) &&
+        (thresholds[i] === undefined || value < thresholds[i]),
+    )
     return { value, color, infoValue, infoTitle, thresholds }
   }
 
   return indicatorMap[value[0]] || indicatorMap[value]
 }
 
-const coloredIcon = color => <ColoredIcon name="circle" size="small" color={color} />
+const coloredIcon = color => (color === 'darkred' ? <ColoredIcon name="circle" size="small" color="#8b0000" /> :
+<Icon name="circle" size="small" color={color} />)
 
 const Prediction = (
   { field, fieldTitle, value, color, infoValue, infoTitle, thresholds, href },
@@ -74,27 +60,17 @@ const Prediction = (
     <Popup
       header={`${fieldName} Color Ranges`}
       content={
-        <div>
-          {[0, 1].map(i => thresholds[i] !== undefined && (
-            <div>
-              {coloredIcon(PRED_COLOR_MAP[i])}
-              {i > 0 && thresholds[i - 1] !== undefined && ` > ${thresholds[i - 1]} and`}
-              {` <= ${thresholds[i]}`}
-            </div>
-          ))}
-          <div>
-            {coloredIcon(PRED_COLOR_MAP[2])}
-            {thresholds[1] === undefined ? '' : ` > ${thresholds[1]} and`}
-            {` < ${thresholds[2]}`}
-          </div>
-          {[2, 3, 4].map(i => thresholds[i] !== undefined && (
-            <div>
-              {coloredIcon(PRED_COLOR_MAP[i + 1])}
-              {` >= ${thresholds[i]}`}
-              {i < 4 && thresholds[i + 1] !== undefined && ` and < ${thresholds[i + 1]}`}
+        PRED_COLOR_MAP.map((c, i) => {
+          if (thresholds[i] === undefined && thresholds[i - 1] === undefined) {
+            return null
+          }
+          return (
+            <div key={c}>
+              {coloredIcon(c)}
+              {thresholds[i] === undefined ? ` >= ${thresholds[i - 1]}` : ` < ${thresholds[i]}`}
             </div>
-          ))}
-        </div>
+          )
+        })
       }
       trigger={<span>{fieldName}</span>}
     />
diff --git a/ui/shared/utils/constants.js b/ui/shared/utils/constants.js
index 7b87421bfe..675ea87708 100644
--- a/ui/shared/utils/constants.js
+++ b/ui/shared/utils/constants.js
@@ -1316,12 +1316,12 @@ export const SV_IN_SILICO_GROUP = 'Structural'
 export const NO_SV_IN_SILICO_GROUPS = [MISSENSE_IN_SILICO_GROUP, CODING_IN_SILICO_GROUP]
 export const SPLICE_AI_FIELD = 'splice_ai'
 
-export const PRED_COLOR_MAP = ['green', '#90ee90', 'grey', 'yellow', 'red', '#8b0000']
+export const PRED_COLOR_MAP = ['green', 'olive', 'grey', 'yellow', 'red', 'darkred']
 
 export const PREDICTOR_FIELDS = [
-  { field: 'cadd', group: CODING_IN_SILICO_GROUP, thresholds: [0.15, 22.7, 25.3, 28.1, undefined], min: 1, max: 99 },
-  { field: 'revel', group: MISSENSE_IN_SILICO_GROUP, thresholds: [0.016, 0.29, 0.644, 0.773, 0.932] },
-  { field: 'primate_ai', group: MISSENSE_IN_SILICO_GROUP, thresholds: [undefined, 0.483, 0.79, 0.867, undefined] },
+  { field: 'cadd', group: CODING_IN_SILICO_GROUP, thresholds: [0.151, 22.8, 25.3, 28.1, undefined], min: 1, max: 99 },
+  { field: 'revel', group: MISSENSE_IN_SILICO_GROUP, thresholds: [0.0161, 0.291, 0.644, 0.773, 0.932] },
+  { field: 'primate_ai', group: MISSENSE_IN_SILICO_GROUP, thresholds: [undefined, 0.484, 0.79, 0.867, undefined] },
   { field: 'mpc', group: MISSENSE_IN_SILICO_GROUP, thresholds: [undefined, undefined, 1.36, 1.828, undefined], max: 5 },
   {
     field: SPLICE_AI_FIELD,
@@ -1341,8 +1341,8 @@ export const PREDICTOR_FIELDS = [
   { field: 'sift', group: MISSENSE_IN_SILICO_GROUP, indicatorMap: INDICATOR_MAP },
   { field: 'mut_taster', group: MISSENSE_IN_SILICO_GROUP, indicatorMap: MUTTASTER_MAP },
   { field: 'fathmm', group: MISSENSE_IN_SILICO_GROUP, indicatorMap: FATHMM_MAP },
-  { field: 'vest', thresholds: [undefined, 0.449, 0.764, 0.861, 0.965] },
-  { field: 'mut_pred', thresholds: [0.01, 0.391, 0.737, 0.829, 0.932] },
+  { field: 'vest', thresholds: [undefined, 0.450, 0.764, 0.861, 0.965] },
+  { field: 'mut_pred', thresholds: [0.0101, 0.392, 0.737, 0.829, 0.932] },
   { field: 'apogee', thresholds: [undefined, undefined, 0.5, 0.5, undefined] },
   {
     field: 'gnomad_noncoding',

From 9d755dbb6ef868d8615136402d567cea4b85276b Mon Sep 17 00:00:00 2001
From: Shifa Zhang <zhangshifa07504@gmail.com>
Date: Mon, 21 Aug 2023 15:35:26 -0400
Subject: [PATCH 12/16] Update per review.

---
 ui/shared/components/panel/variants/Predictions.jsx | 5 ++---
 ui/shared/utils/constants.js                        | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/ui/shared/components/panel/variants/Predictions.jsx b/ui/shared/components/panel/variants/Predictions.jsx
index b569b56c0a..3b80cb6842 100644
--- a/ui/shared/components/panel/variants/Predictions.jsx
+++ b/ui/shared/components/panel/variants/Predictions.jsx
@@ -32,7 +32,7 @@ const predictionFieldValue = (
   if (thresholds) {
     value = parseFloat(value).toPrecision(3)
     const color = PRED_COLOR_MAP.find(
-      (clr, i) => (thresholds[i - 1] || thresholds[i - 1]) &&
+      (clr, i) => (thresholds[i - 1] || thresholds[i]) &&
         (thresholds[i - 1] === undefined || value >= thresholds[i - 1]) &&
         (thresholds[i] === undefined || value < thresholds[i]),
     )
@@ -42,8 +42,7 @@ const predictionFieldValue = (
   return indicatorMap[value[0]] || indicatorMap[value]
 }
 
-const coloredIcon = color => (color === 'darkred' ? <ColoredIcon name="circle" size="small" color="#8b0000" /> :
-<Icon name="circle" size="small" color={color} />)
+const coloredIcon = color => React.createElement(color.startsWith('#') ? ColoredIcon : Icon, { name: 'circle', size: 'small', color })
 
 const Prediction = (
   { field, fieldTitle, value, color, infoValue, infoTitle, thresholds, href },
diff --git a/ui/shared/utils/constants.js b/ui/shared/utils/constants.js
index 675ea87708..1323503fd2 100644
--- a/ui/shared/utils/constants.js
+++ b/ui/shared/utils/constants.js
@@ -1316,7 +1316,7 @@ export const SV_IN_SILICO_GROUP = 'Structural'
 export const NO_SV_IN_SILICO_GROUPS = [MISSENSE_IN_SILICO_GROUP, CODING_IN_SILICO_GROUP]
 export const SPLICE_AI_FIELD = 'splice_ai'
 
-export const PRED_COLOR_MAP = ['green', 'olive', 'grey', 'yellow', 'red', 'darkred']
+export const PRED_COLOR_MAP = ['green', 'olive', 'grey', 'yellow', 'red', '#8b0000']
 
 export const PREDICTOR_FIELDS = [
   { field: 'cadd', group: CODING_IN_SILICO_GROUP, thresholds: [0.151, 22.8, 25.3, 28.1, undefined], min: 1, max: 99 },

From 09f7036b7f032492aa8e726bbfc552086040a97e Mon Sep 17 00:00:00 2001
From: Shifa Zhang <zhangshifa07504@gmail.com>
Date: Tue, 22 Aug 2023 09:27:50 -0400
Subject: [PATCH 13/16] Move color map definition.

---
 ui/shared/components/panel/variants/Predictions.jsx | 4 +++-
 ui/shared/utils/constants.js                        | 2 --
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ui/shared/components/panel/variants/Predictions.jsx b/ui/shared/components/panel/variants/Predictions.jsx
index 3b80cb6842..395bb5b0e5 100644
--- a/ui/shared/components/panel/variants/Predictions.jsx
+++ b/ui/shared/components/panel/variants/Predictions.jsx
@@ -5,7 +5,7 @@ import { connect } from 'react-redux'
 import { Icon, Transition, Popup } from 'semantic-ui-react'
 
 import { getGenesById } from 'redux/selectors'
-import { PREDICTOR_FIELDS, PRED_COLOR_MAP, getVariantMainGeneId } from 'shared/utils/constants'
+import { PREDICTOR_FIELDS, getVariantMainGeneId } from 'shared/utils/constants'
 import { snakecaseToTitlecase } from 'shared/utils/stringUtils'
 import { HorizontalSpacer } from '../../Spacers'
 import { ButtonLink, ColoredIcon } from '../../StyledComponents'
@@ -19,6 +19,8 @@ const PredictionValue = styled.span`
 
 const NUM_TO_SHOW_ABOVE_THE_FOLD = 6 // how many predictors to show immediately
 
+export const PRED_COLOR_MAP = ['green', 'olive', 'grey', 'yellow', 'red', '#8b0000']
+
 const predictionFieldValue = (
   predictions, { field, thresholds, indicatorMap, infoField, infoTitle },
 ) => {
diff --git a/ui/shared/utils/constants.js b/ui/shared/utils/constants.js
index 1323503fd2..caf2f5919f 100644
--- a/ui/shared/utils/constants.js
+++ b/ui/shared/utils/constants.js
@@ -1316,8 +1316,6 @@ export const SV_IN_SILICO_GROUP = 'Structural'
 export const NO_SV_IN_SILICO_GROUPS = [MISSENSE_IN_SILICO_GROUP, CODING_IN_SILICO_GROUP]
 export const SPLICE_AI_FIELD = 'splice_ai'
 
-export const PRED_COLOR_MAP = ['green', 'olive', 'grey', 'yellow', 'red', '#8b0000']
-
 export const PREDICTOR_FIELDS = [
   { field: 'cadd', group: CODING_IN_SILICO_GROUP, thresholds: [0.151, 22.8, 25.3, 28.1, undefined], min: 1, max: 99 },
   { field: 'revel', group: MISSENSE_IN_SILICO_GROUP, thresholds: [0.0161, 0.291, 0.644, 0.773, 0.932] },

From c6e60a241f6e4442d0fb050e803d9b205bd16eb5 Mon Sep 17 00:00:00 2001
From: Shifa Zhang <zhangshifa07504@gmail.com>
Date: Tue, 22 Aug 2023 09:47:43 -0400
Subject: [PATCH 14/16] Fix codacy.

---
 ui/shared/utils/constants.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ui/shared/utils/constants.js b/ui/shared/utils/constants.js
index caf2f5919f..9334446cab 100644
--- a/ui/shared/utils/constants.js
+++ b/ui/shared/utils/constants.js
@@ -1339,7 +1339,7 @@ export const PREDICTOR_FIELDS = [
   { field: 'sift', group: MISSENSE_IN_SILICO_GROUP, indicatorMap: INDICATOR_MAP },
   { field: 'mut_taster', group: MISSENSE_IN_SILICO_GROUP, indicatorMap: MUTTASTER_MAP },
   { field: 'fathmm', group: MISSENSE_IN_SILICO_GROUP, indicatorMap: FATHMM_MAP },
-  { field: 'vest', thresholds: [undefined, 0.450, 0.764, 0.861, 0.965] },
+  { field: 'vest', thresholds: [undefined, 0.45, 0.764, 0.861, 0.965] },
   { field: 'mut_pred', thresholds: [0.0101, 0.392, 0.737, 0.829, 0.932] },
   { field: 'apogee', thresholds: [undefined, undefined, 0.5, 0.5, undefined] },
   {

From 60cc895dbf4285c8f977b7620c9a26c26549fb14 Mon Sep 17 00:00:00 2001
From: Hana Snow <hsnow@broadinstitute.org>
Date: Tue, 22 Aug 2023 10:13:35 -0400
Subject: [PATCH 15/16] bump changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9808f0fea5..73d72d8dda 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,8 @@
 # _seqr_ Changes
 
 ## dev
+
+## 8/22/23
 * Add db indices to optimize RNA data queries (REQUIRES DB MIGRATION)
 
 ## 7/11/23

From dea5ab2da1fff967e089dde0e33827802101c5a3 Mon Sep 17 00:00:00 2001
From: Shifa Zhang <zhangshifa07504@gmail.com>
Date: Tue, 22 Aug 2023 10:28:08 -0400
Subject: [PATCH 16/16] Remove unnecessary export.

---
 ui/shared/components/panel/variants/Predictions.jsx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ui/shared/components/panel/variants/Predictions.jsx b/ui/shared/components/panel/variants/Predictions.jsx
index 395bb5b0e5..7899621545 100644
--- a/ui/shared/components/panel/variants/Predictions.jsx
+++ b/ui/shared/components/panel/variants/Predictions.jsx
@@ -19,7 +19,7 @@ const PredictionValue = styled.span`
 
 const NUM_TO_SHOW_ABOVE_THE_FOLD = 6 // how many predictors to show immediately
 
-export const PRED_COLOR_MAP = ['green', 'olive', 'grey', 'yellow', 'red', '#8b0000']
+const PRED_COLOR_MAP = ['green', 'olive', 'grey', 'yellow', 'red', '#8b0000']
 
 const predictionFieldValue = (
   predictions, { field, thresholds, indicatorMap, infoField, infoTitle },