From 641edd57a72725aff3172932cbe4ab36609ce431 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Wed, 13 Mar 2024 17:04:06 -0400 Subject: [PATCH 01/19] stream to file no dedup --- seqr/views/apis/data_manager_api.py | 15 +++++++++------ seqr/views/utils/dataset_utils.py | 19 ++++++++++--------- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index 3e1739435a..cd83b47e2d 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -272,10 +272,13 @@ def update_rna_seq(request): file_name_prefix = f'rna_sample_data__{data_type}__{datetime.now().isoformat()}' + sample_files = {} + def _save_sample_data(sample_guid, sample_data): - file_name = os.path.join(get_temp_upload_directory(), _get_sample_file_name(file_name_prefix, sample_guid)) - with gzip.open(file_name, 'wt') as f: - json.dump(sample_data, f) + if sample_guid not in sample_files: + file_name = os.path.join(get_temp_upload_directory(), _get_sample_file_name(file_name_prefix, sample_guid)) + sample_files[sample_guid] = gzip.open(file_name, 'wt') + sample_files[sample_guid].wirte(json.dumps(sample_data)) try: sample_guids, info, warnings = load_rna_seq( @@ -300,7 +303,7 @@ def _load_saved_sample_data(file_name_prefix, sample_guid): file_name = os.path.join(get_temp_upload_directory(), _get_sample_file_name(file_name_prefix, sample_guid)) if os.path.exists(file_name): with gzip.open(file_name, 'rt') as f: - return json.load(f) + return [json.loads(line) for line in f.readlines()] return None @@ -312,10 +315,10 @@ def load_rna_seq_sample_data(request, sample_guid): request_json = json.loads(request.body) file_name = request_json['fileName'] data_type = request_json['dataType'] - data_by_gene = _load_saved_sample_data(file_name, sample_guid) + data_rows = _load_saved_sample_data(file_name, sample_guid) model_cls = RNA_DATA_TYPE_CONFIGS[data_type]['model_class'] - model_cls.bulk_create(request.user, [model_cls(sample=sample, **data) for data in data_by_gene.values()]) + model_cls.bulk_create(request.user, [model_cls(sample=sample, **data) for data in data_rows]) update_model_from_json(sample, {'is_active': True}, user=request.user) return create_json_response({'success': True}) diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 8824860637..9a196a51ee 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -402,7 +402,8 @@ def _load_rna_seq_file( if existing_data and existing_data != row_dict: mismatches[sample_guid].add(gene_or_unique_id) - samples_by_guid[sample_guid][gene_or_unique_id] = row_dict + #samples_by_guid[sample_guid][gene_or_unique_id] = row_dict + save_sample_data(sample_guid, row_dict) errors, warnings = _process_rna_errors( gene_ids, missing_required_fields, unmatched_samples, ignore_extra_samples, loaded_samples, @@ -508,18 +509,18 @@ def save_sample_data(sample_guid, sample_data): update_sample_models() created_samples.update(samples_to_create.keys()) - prev_data = load_saved_data(sample_guid) or {} - new_mismatches = {k for k, v in prev_data.items() if k in sample_data and v != sample_data[k]} - if new_mismatches: - mismatches[sample_guid].update(new_mismatches) - sample_data.update(prev_data) + # prev_data = load_saved_data(sample_guid) or {} + # new_mismatches = {k for k, v in prev_data.items() if k in sample_data and v != sample_data[k]} + # if new_mismatches: + # mismatches[sample_guid].update(new_mismatches) + # 
sample_data.update(prev_data) - if post_process: - post_process(sample_data) + # if post_process: + # post_process(sample_data) sample_guids_to_load.add(sample_guid) save_data(sample_guid, sample_data) - return new_mismatches + #return new_mismatches def get_matched_sample(sample_key, unmatched_samples, sample_id_to_individual_id_mapping): if sample_key in potential_samples: From fa2eee179c3a211235421d67466f8c93919bcd22 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 14 Mar 2024 10:51:23 -0400 Subject: [PATCH 02/19] fix typo --- seqr/views/apis/data_manager_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index cd83b47e2d..aa41218645 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -278,7 +278,7 @@ def _save_sample_data(sample_guid, sample_data): if sample_guid not in sample_files: file_name = os.path.join(get_temp_upload_directory(), _get_sample_file_name(file_name_prefix, sample_guid)) sample_files[sample_guid] = gzip.open(file_name, 'wt') - sample_files[sample_guid].wirte(json.dumps(sample_data)) + sample_files[sample_guid].write(json.dumps(sample_data)) try: sample_guids, info, warnings = load_rna_seq( From 9c4056f65602e79febb07b6bad4c10145a984b01 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 14 Mar 2024 11:34:21 -0400 Subject: [PATCH 03/19] update parsing code --- seqr/views/apis/data_manager_api.py | 12 +++-- seqr/views/utils/dataset_utils.py | 72 +++++++++++------------------ 2 files changed, 35 insertions(+), 49 deletions(-) diff --git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index aa41218645..b726d32eba 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -20,7 +20,8 @@ from seqr.utils.vcf_utils import validate_vcf_exists from seqr.views.utils.airflow_utils import trigger_data_loading, write_data_loading_pedigree -from seqr.views.utils.dataset_utils import load_rna_seq, load_phenotype_prioritization_data_file, RNA_DATA_TYPE_CONFIGS +from seqr.views.utils.dataset_utils import load_rna_seq, load_phenotype_prioritization_data_file, RNA_DATA_TYPE_CONFIGS, \ + post_process_rna_data from seqr.views.utils.file_utils import parse_file, get_temp_upload_directory, load_uploaded_file from seqr.views.utils.json_utils import create_json_response from seqr.views.utils.json_to_orm_utils import update_model_from_json @@ -277,12 +278,12 @@ def update_rna_seq(request): def _save_sample_data(sample_guid, sample_data): if sample_guid not in sample_files: file_name = os.path.join(get_temp_upload_directory(), _get_sample_file_name(file_name_prefix, sample_guid)) - sample_files[sample_guid] = gzip.open(file_name, 'wt') + sample_files[sample_guid] = gzip.open(file_name, 'a') sample_files[sample_guid].write(json.dumps(sample_data)) try: sample_guids, info, warnings = load_rna_seq( - data_type, file_path, _save_sample_data, lambda sample_guid: _load_saved_sample_data(file_name_prefix, sample_guid), + data_type, file_path, _save_sample_data, user=request.user, mapping_file=mapping_file, ignore_extra_samples=request_json.get('ignoreExtraSamples')) except ValueError as e: return create_json_response({'error': str(e)}, status=400) @@ -315,9 +316,12 @@ def load_rna_seq_sample_data(request, sample_guid): request_json = json.loads(request.body) file_name = request_json['fileName'] data_type = request_json['dataType'] + config = RNA_DATA_TYPE_CONFIGS[data_type] + data_rows = 
_load_saved_sample_data(file_name, sample_guid) + post_process_rna_data(sample_guid, data_rows, **config.get('post_process_kwargs', {})) - model_cls = RNA_DATA_TYPE_CONFIGS[data_type]['model_class'] + model_cls = config['model_class'] model_cls.bulk_create(request.user, [model_cls(sample=sample, **data) for data in data_rows]) update_model_from_json(sample, {'is_active': True}, user=request.user) diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 9a196a51ee..42cfc7b6d1 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -20,9 +20,6 @@ logger = SeqrLogger(__name__) -MAX_UNSAVED_DATA_PER_SAMPLE = 5000 - - def load_mapping_file(mapping_file_path, user): file_content = parse_file(mapping_file_path, file_iter(mapping_file_path, user=user)) return load_mapping_file_content(file_content) @@ -283,7 +280,7 @@ def _get_splice_id(row): def _add_splice_rank(sample_data_rows): - sorted_data_rows = sorted([data_row for data_row in sample_data_rows.values()], key=lambda d: d[P_VALUE_COL]) + sorted_data_rows = sorted([data_row for data_row in sample_data_rows], key=lambda d: d[P_VALUE_COL]) for i, data_row in enumerate(sorted_data_rows): data_row['rank'] = i @@ -305,8 +302,10 @@ def _add_splice_rank(sample_data_rows): 'additional_kwargs': { 'format_fields': SPLICE_OUTLIER_FORMATTER, 'allow_missing_gene': True, - 'get_unique_key': _get_splice_id, + }, + 'post_process_kwargs': { 'post_process': _add_splice_rank, + 'get_unique_key': _get_splice_id, }, }, } @@ -330,8 +329,8 @@ def _validate_rna_header(header, column_map): def _load_rna_seq_file( - file_path, user, potential_loaded_samples, update_sample_models, save_sample_data, get_matched_sample, mismatches, - column_map, mapping_file=None, get_unique_key=None, allow_missing_gene=False, ignore_extra_samples=False, + file_path, user, potential_loaded_samples, update_sample_models, save_sample_data, get_matched_sample, + column_map, mapping_file=None, allow_missing_gene=False, ignore_extra_samples=False, should_skip=None, format_fields=None, ): @@ -339,7 +338,6 @@ def _load_rna_seq_file( if mapping_file: sample_id_to_individual_id_mapping = load_mapping_file_content(mapping_file) - samples_by_guid = defaultdict(dict) f = file_iter(file_path, user=user) parsed_f = parse_file(file_path.replace('.gz', ''), f, iter_file=True) header = next(parsed_f) @@ -349,7 +347,6 @@ def _load_rna_seq_file( unmatched_samples = set() missing_required_fields = defaultdict(set) gene_ids = set() - current_sample = None for line in tqdm(parsed_f, unit=' rows'): row = dict(zip(header, line)) if should_skip and should_skip(row): @@ -390,35 +387,12 @@ def _load_rna_seq_file( # If there are definite errors, do not process/save data, just continue to check for additional errors continue - if current_sample != sample_guid: - # If a large amount of data has been parsed for the previous sample, save and do not keep in memory - if len(samples_by_guid[current_sample]) > MAX_UNSAVED_DATA_PER_SAMPLE: - save_sample_data(current_sample, samples_by_guid[current_sample]) - del samples_by_guid[current_sample] - current_sample = sample_guid - - gene_or_unique_id = get_unique_key(row_dict) if get_unique_key else gene_id - existing_data = samples_by_guid[sample_guid].get(gene_or_unique_id) - if existing_data and existing_data != row_dict: - mismatches[sample_guid].add(gene_or_unique_id) - - #samples_by_guid[sample_guid][gene_or_unique_id] = row_dict save_sample_data(sample_guid, row_dict) errors, warnings = _process_rna_errors( 
gene_ids, missing_required_fields, unmatched_samples, ignore_extra_samples, loaded_samples, ) - if not errors: - for sample_guid, sample_data in samples_by_guid.items(): - save_sample_data(sample_guid, sample_data) - - if mismatches: - errors = [ - f'Error in {sample_guid.split("_", 1)[-1].upper()}: mismatched entries for {", ".join(mismatch_ids)}' - for sample_guid, mismatch_ids in mismatches.items() - ] + errors - if errors: raise ErrorsWarningsException(errors) @@ -454,7 +428,7 @@ def _process_rna_errors(gene_ids, missing_required_fields, unmatched_samples, ig return errors, warnings -def _load_rna_seq(model_cls, file_path, save_data, load_saved_data, *args, user=None, create_models_before_save=False, post_process=None, **kwargs): +def _load_rna_seq(model_cls, file_path, save_data, *args, user=None, create_models_before_save=False, **kwargs): projects = get_internal_projects() data_source = file_path.split('/')[-1].split('_-_')[-1] @@ -474,7 +448,6 @@ def _load_rna_seq(model_cls, file_path, save_data, load_saved_data, *args, user= existing_samples_by_guid = {} samples_to_create = {} created_samples = set() - mismatches = defaultdict(set) def update_sample_models(): remaining_samples_to_create = [s for key, s in samples_to_create.items() if key not in created_samples] @@ -509,18 +482,8 @@ def save_sample_data(sample_guid, sample_data): update_sample_models() created_samples.update(samples_to_create.keys()) - # prev_data = load_saved_data(sample_guid) or {} - # new_mismatches = {k for k, v in prev_data.items() if k in sample_data and v != sample_data[k]} - # if new_mismatches: - # mismatches[sample_guid].update(new_mismatches) - # sample_data.update(prev_data) - - # if post_process: - # post_process(sample_data) - sample_guids_to_load.add(sample_guid) save_data(sample_guid, sample_data) - #return new_mismatches def get_matched_sample(sample_key, unmatched_samples, sample_id_to_individual_id_mapping): if sample_key in potential_samples: @@ -542,7 +505,7 @@ def get_matched_sample(sample_key, unmatched_samples, sample_id_to_individual_id warnings, not_loaded_count = _load_rna_seq_file( file_path, user, potential_loaded_samples, update_sample_models, save_sample_data, get_matched_sample, - mismatches, *args, **kwargs) + *args, **kwargs) message = f'Parsed {len(sample_guids_to_load) + not_loaded_count} RNA-seq samples' info = [message] logger.info(message, user) @@ -565,6 +528,25 @@ def get_matched_sample(sample_key, unmatched_samples, sample_id_to_individual_id return sample_guids_to_load, info, warnings +def post_process_rna_data(sample_guid, data, get_unique_key=None, post_process=None): + mismatches = set() + + data_by_key = {} + for row in data: + gene_or_unique_id = get_unique_key(row) if get_unique_key else row[GENE_ID_COL] + existing_data = data_by_key.get(gene_or_unique_id) + if existing_data and existing_data != row: + mismatches.add(gene_or_unique_id) + + if mismatches: + raise ErrorsWarningsException([ + f'Error in {sample_guid.split("_", 1)[-1].upper()}: mismatched entries for {", ".join(mismatches)}' + ]) + + if post_process: + post_process(data) + + RNA_MODEL_DISPLAY_NAME = { RnaSeqOutlier: 'Expression Outlier', RnaSeqSpliceOutlier: 'Splice Outlier', From ac512b2da28c31493407be7fcb62dedaade12d7c Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 14 Mar 2024 12:11:38 -0400 Subject: [PATCH 04/19] fix sample processing --- seqr/views/apis/data_manager_api.py | 8 +++++--- seqr/views/utils/dataset_utils.py | 12 ++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff 
--git a/seqr/views/apis/data_manager_api.py b/seqr/views/apis/data_manager_api.py index b726d32eba..a9706a8e02 100644 --- a/seqr/views/apis/data_manager_api.py +++ b/seqr/views/apis/data_manager_api.py @@ -278,8 +278,8 @@ def update_rna_seq(request): def _save_sample_data(sample_guid, sample_data): if sample_guid not in sample_files: file_name = os.path.join(get_temp_upload_directory(), _get_sample_file_name(file_name_prefix, sample_guid)) - sample_files[sample_guid] = gzip.open(file_name, 'a') - sample_files[sample_guid].write(json.dumps(sample_data)) + sample_files[sample_guid] = gzip.open(file_name, 'at') + sample_files[sample_guid].write(f'{json.dumps(sample_data)}\n') try: sample_guids, info, warnings = load_rna_seq( @@ -319,7 +319,9 @@ def load_rna_seq_sample_data(request, sample_guid): config = RNA_DATA_TYPE_CONFIGS[data_type] data_rows = _load_saved_sample_data(file_name, sample_guid) - post_process_rna_data(sample_guid, data_rows, **config.get('post_process_kwargs', {})) + data_rows, error = post_process_rna_data(sample_guid, data_rows, **config.get('post_process_kwargs', {})) + if error: + return create_json_response({'error': error}, status=400) model_cls = config['model_class'] model_cls.bulk_create(request.user, [model_cls(sample=sample, **data) for data in data_rows]) diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 42cfc7b6d1..88d4a553d3 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -537,15 +537,15 @@ def post_process_rna_data(sample_guid, data, get_unique_key=None, post_process=N existing_data = data_by_key.get(gene_or_unique_id) if existing_data and existing_data != row: mismatches.add(gene_or_unique_id) + data_by_key[gene_or_unique_id] = row - if mismatches: - raise ErrorsWarningsException([ - f'Error in {sample_guid.split("_", 1)[-1].upper()}: mismatched entries for {", ".join(mismatches)}' - ]) - - if post_process: + error = f'Error in {sample_guid.split("_", 1)[-1].upper()}: mismatched entries for {", ".join(mismatches)}' if mismatches else None + data = data_by_key.values() + if post_process and not error: post_process(data) + return data, error + RNA_MODEL_DISPLAY_NAME = { RnaSeqOutlier: 'Expression Outlier', From 175bfbeebb07781ec3e880fa19f780749193bb97 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 14 Mar 2024 12:50:39 -0400 Subject: [PATCH 05/19] update tests --- seqr/views/apis/data_manager_api_tests.py | 104 +++++++++++----------- 1 file changed, 51 insertions(+), 53 deletions(-) diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py index 71eb3ccf4d..59d0a7ed12 100644 --- a/seqr/views/apis/data_manager_api_tests.py +++ b/seqr/views/apis/data_manager_api_tests.py @@ -275,44 +275,42 @@ RNA_SPLICE_SAMPLE_GUID = 'S000151_na19675_1' PLACEHOLDER_GUID = 'S0000100' RNA_FILE_ID = 'gs://rna_data/new_muscle_samples.tsv.gz' -SAMPLE_GENE_OUTLIER_DATA = { - 'ENSG00000240361': {'gene_id': 'ENSG00000240361', 'p_value': '0.01', 'p_adjust': '0.13', 'z_score': '-3.1'}, - 'ENSG00000233750': {'gene_id': 'ENSG00000233750', 'p_value': '0.064', 'p_adjust': '0.0000057', 'z_score': '7.8'}, -} -SAMPLE_GENE_TPM_DATA = { - 'ENSG00000240361': {'gene_id': 'ENSG00000240361', 'tpm': '7.8'}, - 'ENSG00000233750': {'gene_id': 'ENSG00000233750', 'tpm': '0.0'}, -} -SAMPLE_GENE_SPLICE_DATA = { - 'ENSG00000233750-2-167254166-167258349-*-psi3': { +SAMPLE_GENE_OUTLIER_DATA = [ + {'gene_id': 'ENSG00000240361', 'p_value': '0.01', 'p_adjust': '0.13', 'z_score': '-3.1'}, + 
{'gene_id': 'ENSG00000233750', 'p_value': '0.064', 'p_adjust': '0.0000057', 'z_score': '7.8'}, +] +SAMPLE_GENE_TPM_DATA = [ + {'gene_id': 'ENSG00000240361', 'tpm': '7.8'}, + {'gene_id': 'ENSG00000233750', 'tpm': '0.0'}, +] +SAMPLE_GENE_SPLICE_DATA = [ + { 'chrom': '2', 'start': 167254166, 'end': 167258349, 'strand': '*', 'type': 'psi3', 'p_value': 1.56e-25, 'z_score': -4.9, 'delta_psi': -0.46, 'read_count': 166, 'gene_id': 'ENSG00000233750', - 'rare_disease_samples_with_junction': 1, 'rare_disease_samples_total': 20, 'rank': 1, + 'rare_disease_samples_with_junction': 1, 'rare_disease_samples_total': 20, }, - 'ENSG00000240361-7-132885746-132975168-*-psi5': { + { 'chrom': '7', 'start': 132885746, 'end': 132975168, 'strand': '*', 'type': 'psi5', 'p_value': 1.08e-56, 'z_score': -6.53, 'delta_psi': -0.85, 'read_count': 231, 'gene_id': 'ENSG00000240361', - 'rare_disease_samples_with_junction': 1, 'rare_disease_samples_total': 20, 'rank': 0, + 'rare_disease_samples_with_junction': 1, 'rare_disease_samples_total': 20, }, -} +] SAMPLE_GENE_SPLICE_DATA2 = { - '-2-167258096-167258349-*-psi3': { 'chrom': '2', 'start': 167258096, 'end': 167258349, 'strand': '*', 'type': 'psi3', 'p_value': 1.56e-25, 'z_score': 6.33, 'delta_psi': 0.45, 'read_count': 143, 'gene_id': '', - 'rare_disease_samples_with_junction': 1, 'rare_disease_samples_total': 20, 'rank': 0, + 'rare_disease_samples_with_junction': 1, 'rare_disease_samples_total': 20, } -} RNA_OUTLIER_SAMPLE_DATA = { - RNA_MUSCLE_SAMPLE_GUID: json.dumps(SAMPLE_GENE_OUTLIER_DATA), - PLACEHOLDER_GUID: json.dumps({'ENSG00000240361': {'gene_id': 'ENSG00000240361', 'p_value': '0.04', 'p_adjust': '0.112', 'z_score': '1.9'}}), + RNA_MUSCLE_SAMPLE_GUID: '\n'.join([json.dumps(row) for row in SAMPLE_GENE_OUTLIER_DATA]) + '\n', + PLACEHOLDER_GUID: json.dumps({'gene_id': 'ENSG00000240361', 'p_value': '0.04', 'p_adjust': '0.112', 'z_score': '1.9'}) + '\n', } RNA_TPM_SAMPLE_DATA = { - RNA_MUSCLE_SAMPLE_GUID: json.dumps(SAMPLE_GENE_TPM_DATA), - PLACEHOLDER_GUID: json.dumps({'ENSG00000240361': {'gene_id': 'ENSG00000240361', 'tpm': '0.112'}}), + RNA_MUSCLE_SAMPLE_GUID: '\n'.join([json.dumps(row) for row in SAMPLE_GENE_TPM_DATA]) + '\n', + PLACEHOLDER_GUID: json.dumps({'gene_id': 'ENSG00000240361', 'tpm': '0.112'}) + '\n', } RNA_SPLICE_SAMPLE_DATA = { - RNA_SPLICE_SAMPLE_GUID: json.dumps(SAMPLE_GENE_SPLICE_DATA), - PLACEHOLDER_GUID: json.dumps(SAMPLE_GENE_SPLICE_DATA2), + RNA_SPLICE_SAMPLE_GUID: '\n'.join([json.dumps(row) for row in SAMPLE_GENE_SPLICE_DATA]) + '\n', + PLACEHOLDER_GUID: json.dumps(SAMPLE_GENE_SPLICE_DATA2) + '\n', } RNA_FILENAME_TEMPLATE = 'rna_sample_data__{}__2020-04-15T00:00:00' @@ -670,8 +668,8 @@ def test_kibana_proxy(self): ['NA20870', '1kg project nåme with uniçøde', 'ENSG00000240361', 'fibroblasts', 'detail2', 0.01, 0.13, -3.1], ], 'write_data': { - '{"ENSG00000233750": {"gene_id": "ENSG00000233750", "p_value": "0.064", "p_adjust": "0.0000057", "z_score": "7.8"}}', - '{"ENSG00000240361": {"gene_id": "ENSG00000240361", "p_value": "0.01", "p_adjust": "0.13", "z_score": "-3.1"}}' + '{"gene_id": "ENSG00000233750", "p_value": "0.064", "p_adjust": "0.0000057", "z_score": "7.8"}\n', + '{"gene_id": "ENSG00000240361", "p_value": "0.01", "p_adjust": "0.13", "z_score": "-3.1"}\n' }, 'new_data': [ ['NA19675_D2', '1kg project nåme with uniçøde', 'ENSG00000240361', 'muscle', 'detail1', 0.01, 0.13, -3.1], @@ -702,8 +700,8 @@ def test_kibana_proxy(self): ['NA20870', 'Test Reprocessed Project', 'ENSG00000240361', 'NA20870', 'muscle', 7.8], ['NA20870', '1kg project nåme 
with uniçøde', 'ENSG00000233750', 'NA20870', 'fibroblasts', 0.0], ], - 'write_data': {'{"ENSG00000240361": {"gene_id": "ENSG00000240361", "tpm": "7.8"}}', - '{"ENSG00000233750": {"gene_id": "ENSG00000233750", "tpm": "0.0"}}'}, + 'write_data': {'{"gene_id": "ENSG00000240361", "tpm": "7.8"}\n', + '{"gene_id": "ENSG00000233750", "tpm": "0.0"}\n'}, 'new_data': [ # existing sample NA19675_D2 ['NA19675_D2', '1kg project nåme with uniçøde', 'ENSG00000240361', 'NA19675_D2', 'muscle', 7.8], @@ -726,6 +724,7 @@ def test_kibana_proxy(self): 'get_models_json': lambda models: list(models.values_list('gene_id', 'tpm')), 'expected_models_json': [('ENSG00000240361', 7.8), ('ENSG00000233750', 0.0)], 'sample_guid': RNA_MUSCLE_SAMPLE_GUID, + 'mismatch_field': 'tpm', }, 'splice_outlier': { 'model_cls': RnaSeqSpliceOutlier, @@ -744,14 +743,14 @@ def test_kibana_proxy(self): ['NA20870', '1kg project nåme with uniçøde', 'ENSG00000135953', 'chr2', 167258096, 167258349, '*', 'XIRP2', 'psi3', 1.56E-25, 6.33, 0.45, 143, 'muscle', 0.03454739, 1, 20], ], - 'write_data': {'{"ENSG00000233750-2-167258096-167258349-*-psi3": {"chrom": "2", "start": 167258096,' + 'write_data': {'{"chrom": "2", "start": 167258096,' ' "end": 167258349, "strand": "*", "type": "psi3", "p_value": 1.56e-25, "z_score": 6.33,' ' "delta_psi": 0.45, "read_count": 143, "gene_id": "ENSG00000233750",' - ' "rare_disease_samples_with_junction": 1, "rare_disease_samples_total": 20, "rank": 0}}', - '{"ENSG00000135953-2-167258096-167258349-*-psi3": {"chrom": "2", "start": 167258096,' + ' "rare_disease_samples_with_junction": 1, "rare_disease_samples_total": 20}\n', + '{"chrom": "2", "start": 167258096,' ' "end": 167258349, "strand": "*", "type": "psi3", "p_value": 1.56e-25, "z_score": 6.33,' ' "delta_psi": 0.45, "read_count": 143, "gene_id": "ENSG00000135953",' - ' "rare_disease_samples_with_junction": 1, "rare_disease_samples_total": 20, "rank": 0}}', + ' "rare_disease_samples_with_junction": 1, "rare_disease_samples_total": 20}\n', }, 'new_data': [ # existing sample NA19675_1 @@ -778,13 +777,13 @@ def test_kibana_proxy(self): 'allow_missing_gene': True, 'get_models_json': lambda models: list( models.values_list('gene_id', 'chrom', 'start', 'end', 'strand', 'type', 'p_value', 'z_score', 'delta_psi', - 'read_count', 'rare_disease_samples_with_junction', 'rare_disease_samples_total')), + 'read_count', 'rare_disease_samples_with_junction', 'rare_disease_samples_total', 'rank')), 'expected_models_json': [ - ('ENSG00000233750', '2', 167254166, 167258349, '*', 'psi3', 1.56e-25, -4.9, -0.46, 166, 1, 20), - ('ENSG00000240361', '7', 132885746, 132975168, '*', 'psi5', 1.08e-56, -6.53, -0.85, 231, 1, 20) + ('ENSG00000233750', '2', 167254166, 167258349, '*', 'psi3', 1.56e-25, -4.9, -0.46, 166, 1, 20, 1), + ('ENSG00000240361', '7', 132885746, 132975168, '*', 'psi5', 1.08e-56, -6.53, -0.85, 231, 1, 20, 0) ], 'sample_guid': RNA_SPLICE_SAMPLE_GUID, - 'row_id': 'ENSG00000240361-7-132885746-132886973-*-psi5', + 'row_id': 'ENSG00000233750-2-167254166-167258349-*-psi3', }, } @@ -876,15 +875,6 @@ def _set_file_iter_stdout(rows): f'{", ".join(sorted([col for col in header if col not in params["optional_headers"]]))}', }) - mismatch_row = loaded_data_row[:-1] + [loaded_data_row[-1] - 2] - _set_file_iter_stdout([header, loaded_data_row, loaded_data_row, mismatch_row]) - response = self.client.post(url, content_type='application/json', data=json.dumps(body)) - self.assertEqual(response.status_code, 400) - self.assertDictEqual(response.json(), { - 'errors': [f'Error in 
{loaded_data_row[0]}: mismatched entries for {params.get("row_id", mismatch_row[2])}'], - 'warnings': None, - }) - missing_sample_row = ['NA19675_D3'] + loaded_data_row[1:] _set_file_iter_stdout([header, loaded_data_row, missing_sample_row]) response = self.client.post(url, content_type='application/json', data=json.dumps(body)) @@ -1007,9 +997,9 @@ def _test_basic_data_loading(data, num_parsed_samples, num_loaded_samples, new_s f'{RNA_FILENAME_TEMPLATE.format(data_type)}__{new_sample_guid if sample_guid == PLACEHOLDER_GUID else sample_guid}.json.gz': data for sample_guid, data in params['parsed_file_data'].items() } - mock_open.assert_has_calls([mock.call(filename, 'wt') for filename in expected_files]) + mock_open.assert_has_calls([mock.call(filename, 'at') for filename in expected_files]) self.assertEqual( - ''.join([call.args[0] for call in mock_files[filename].__enter__.return_value.write.call_args_list]), + ''.join([call.args[0] for call in mock_files[filename].write.call_args_list]), expected_files[filename], ) @@ -1025,7 +1015,7 @@ def _test_basic_data_loading(data, num_parsed_samples, num_loaded_samples, new_s num_created_samples=2) self.assertSetEqual( - {''.join([call.args[0] for call in mock_file.__enter__.return_value.write.call_args_list]) for mock_file in mock_files.values()}, + {''.join([call.args[0] for call in mock_file.write.call_args_list]) for mock_file in mock_files.values()}, params['write_data'], ) @@ -1042,11 +1032,11 @@ def _test_basic_data_loading(data, num_parsed_samples, num_loaded_samples, new_s self.assertTrue(second_tissue_sample_guid != new_sample_guid) self.assertTrue(second_tissue_sample_guid in response_json['sampleGuids']) mock_open.assert_has_calls([ - mock.call(f'{RNA_FILENAME_TEMPLATE.format(data_type)}__{sample_guid}.json.gz', 'wt') + mock.call(f'{RNA_FILENAME_TEMPLATE.format(data_type)}__{sample_guid}.json.gz', 'at') for sample_guid in response_json['sampleGuids'] ]) self.assertSetEqual( - {''.join([call.args[0] for call in mock_file.__enter__.return_value.write.call_args_list]) for mock_file in mock_files.values()}, + {''.join([call.args[0] for call in mock_file.write.call_args_list]) for mock_file in mock_files.values()}, params['write_data'], ) @@ -1066,12 +1056,12 @@ def test_load_rna_seq_sample_data(self, mock_open, mock_os): model_cls = params['model_cls'] model_cls.objects.all().delete() self.reset_logs() - mock_open.return_value.__enter__.return_value.read.return_value = params['parsed_file_data'][sample_guid] + parsed_file_lines = params['parsed_file_data'][sample_guid].strip().split('\n') + mock_open.return_value.__enter__.return_value.readlines.return_value = parsed_file_lines file_name = RNA_FILENAME_TEMPLATE.format(data_type) - response = self.client.post(url, content_type='application/json', data=json.dumps({ - 'fileName': file_name, 'dataType': data_type, - })) + body = {'fileName': file_name, 'dataType': data_type} + response = self.client.post(url, content_type='application/json', data=json.dumps(body)) self.assertEqual(response.status_code, 200) self.assertDictEqual(response.json(), {'success': True}) @@ -1092,6 +1082,14 @@ def test_load_rna_seq_sample_data(self, mock_open, mock_os): self.assertListEqual(list(params['get_models_json'](models)), params['expected_models_json']) + mismatch_row = {**json.loads(parsed_file_lines[0]), params.get('mismatch_field', 'p_value'): '0.05'} + mock_open.return_value.__enter__.return_value.readlines.return_value = parsed_file_lines + [json.dumps(mismatch_row)] + response = self.client.post(url, 
content_type='application/json', data=json.dumps(body)) + self.assertEqual(response.status_code, 400) + self.assertDictEqual(response.json(), { + 'error': f'Error in {sample_guid.split("_", 1)[-1].upper()}: mismatched entries for {params.get("row_id", mismatch_row["gene_id"])}' + }) + @classmethod def _join_data(cls, data): return ['\t'.join(line).encode('utf-8') for line in data] From 71d732d2047c2a51bde2f901d0aec95c74fa5a22 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Thu, 14 Mar 2024 15:25:45 -0400 Subject: [PATCH 06/19] fix loading command --- seqr/management/commands/load_rna_seq.py | 40 +++++++++++++++++------ seqr/views/apis/data_manager_api_tests.py | 4 +-- seqr/views/utils/dataset_utils.py | 14 +++----- 3 files changed, 36 insertions(+), 22 deletions(-) diff --git a/seqr/management/commands/load_rna_seq.py b/seqr/management/commands/load_rna_seq.py index 8aff327956..fa00715bb3 100644 --- a/seqr/management/commands/load_rna_seq.py +++ b/seqr/management/commands/load_rna_seq.py @@ -1,9 +1,10 @@ import logging +from collections import defaultdict from django.core.management.base import BaseCommand from seqr.models import Sample from seqr.views.utils.file_utils import parse_file -from seqr.views.utils.dataset_utils import load_rna_seq, RNA_DATA_TYPE_CONFIGS +from seqr.views.utils.dataset_utils import load_rna_seq, post_process_rna_data, RNA_DATA_TYPE_CONFIGS logger = logging.getLogger(__name__) @@ -24,18 +25,37 @@ def handle(self, *args, **options): mapping_file = parse_file(options['mapping_file'], f) data_type = options['data_type'] - self.model_cls = RNA_DATA_TYPE_CONFIGS[data_type]['model_class'] + model_cls = RNA_DATA_TYPE_CONFIGS[data_type]['model_class'] - sample_guids, _, _ = load_rna_seq( - data_type, options['input_file'], self._save_sample_data, lambda *args: {}, create_models_before_save=True, + sample_data_by_guid = defaultdict(list) + + def _save_sample_data(sample_guid, row): + sample_data_by_guid[sample_guid].append(row) + + possible_sample_guids, _, _ = load_rna_seq( + data_type, options['input_file'], _save_sample_data, mapping_file=mapping_file, ignore_extra_samples=options['ignore_extra_samples']) + sample_models_by_guid = { + s['guid']: s for s in Sample.objects.filter(guid__in=sample_data_by_guid).values('guid', 'id', 'sample_id') + } + errors = [] + sample_guids = [] + for sample_guid in possible_sample_guids: + data_rows, error = post_process_rna_data(sample_guid, sample_data_by_guid[sample_guid]) + if error: + errors.append(error) + continue + + sample_guids.append(sample_guid) + sample_model = sample_models_by_guid[sample_guid] + models = model_cls.objects.bulk_create( + [model_cls(sample_id=sample_model['id'], **data) for data in data_rows], batch_size=1000) + logger.info(f'create {len(models)} {model_cls.__name__} for {sample_model["sample_id"]}') + Sample.bulk_update(user=None, update_json={'is_active': True}, guid__in=sample_guids) - logger.info('DONE') + for error in errors: + logger.info(error) - def _save_sample_data(self, sample_guid, data_by_gene): - sample = Sample.objects.get(guid=sample_guid) - models = self.model_cls.objects.bulk_create( - [self.model_cls(sample=sample, **data) for data in data_by_gene.values()], batch_size=1000) - logger.info(f'create {len(models)} {self.model_cls.__name__} for {sample.sample_id}') + logger.info('DONE') diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py index 59d0a7ed12..cbaf6fe977 100644 --- a/seqr/views/apis/data_manager_api_tests.py +++ 
b/seqr/views/apis/data_manager_api_tests.py @@ -969,7 +969,7 @@ def _test_basic_data_loading(data, num_parsed_samples, num_loaded_samples, new_s 'parentEntityIds': [params['sample_guid']], 'updateType': 'bulk_delete'}}), ('update 1 Samples', {'dbUpdate': { 'dbEntity': 'Sample', 'entityIds': [params['sample_guid']], - 'updateType': 'bulk_update', 'updateFields': ['data_source']}}), + 'updateType': 'bulk_update', 'updateFields': ['data_source', 'is_active']}}), ]) self.assertTrue(params['sample_guid'] in response_json['sampleGuids']) self.assertEqual(mock_send_slack.call_count, 2) @@ -987,7 +987,7 @@ def _test_basic_data_loading(data, num_parsed_samples, num_loaded_samples, new_s # test database models are correct self.assertEqual(model_cls.objects.count(), params['initial_model_count'] - deleted_count) sample_guid = self._check_rna_sample_model(individual_id=1, data_source='new_muscle_samples.tsv.gz', - tissue_type=params.get('sample_tissue_type')) + tissue_type=params.get('sample_tissue_type'), is_active_sample=False) self.assertSetEqual(set(response_json['sampleGuids']), {sample_guid, new_sample_guid}) # test correct file interactions diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 88d4a553d3..fdf1f4746b 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -428,7 +428,7 @@ def _process_rna_errors(gene_ids, missing_required_fields, unmatched_samples, ig return errors, warnings -def _load_rna_seq(model_cls, file_path, save_data, *args, user=None, create_models_before_save=False, **kwargs): +def _load_rna_seq(model_cls, file_path, save_data, *args, user=None, **kwargs): projects = get_internal_projects() data_source = file_path.split('/')[-1].split('_-_')[-1] @@ -447,13 +447,11 @@ def _load_rna_seq(model_cls, file_path, save_data, *args, user=None, create_mode sample_guids_to_load = set() existing_samples_by_guid = {} samples_to_create = {} - created_samples = set() def update_sample_models(): - remaining_samples_to_create = [s for key, s in samples_to_create.items() if key not in created_samples] - if remaining_samples_to_create: + if samples_to_create: _create_samples( - remaining_samples_to_create, + samples_to_create.values(), user=user, data_source=data_source, sample_type=Sample.SAMPLE_TYPE_RNA, @@ -470,7 +468,7 @@ def update_sample_models(): if to_delete: model_cls.bulk_delete(user, to_delete) - Sample.bulk_update(user, {'data_source': data_source}, guid__in=existing_samples_by_guid) + Sample.bulk_update(user, {'data_source': data_source, 'is_active': False}, guid__in=existing_samples_by_guid) for guid in to_delete_sample_individuals: existing_samples_by_guid[guid]['dataSource'] = data_source @@ -478,10 +476,6 @@ def save_sample_data(sample_guid, sample_data): if not sample_data: return - if create_models_before_save: - update_sample_models() - created_samples.update(samples_to_create.keys()) - sample_guids_to_load.add(sample_guid) save_data(sample_guid, sample_data) From 796c71b0c8484789eaa753910113f702cd7e19cc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 16 Mar 2024 23:13:43 +0000 Subject: [PATCH 07/19] Bump follow-redirects from 1.15.4 to 1.15.6 in /ui Bumps [follow-redirects](https://github.com/follow-redirects/follow-redirects) from 1.15.4 to 1.15.6. 
- [Release notes](https://github.com/follow-redirects/follow-redirects/releases) - [Commits](https://github.com/follow-redirects/follow-redirects/compare/v1.15.4...v1.15.6) --- updated-dependencies: - dependency-name: follow-redirects dependency-type: indirect ... Signed-off-by: dependabot[bot] --- ui/package-lock.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ui/package-lock.json b/ui/package-lock.json index 080a18222d..ab391c8066 100644 --- a/ui/package-lock.json +++ b/ui/package-lock.json @@ -8485,9 +8485,9 @@ "dev": true }, "node_modules/follow-redirects": { - "version": "1.15.4", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", - "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true, "funding": [ { @@ -25853,9 +25853,9 @@ "dev": true }, "follow-redirects": { - "version": "1.15.4", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", - "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true }, "foreach": { From b83121a3dee1e89587761d942d530d8f4a2fb317 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 18 Mar 2024 12:30:20 -0400 Subject: [PATCH 08/19] properly parse clinvar significance --- ui/pages/Report/components/VariantMetadata.jsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ui/pages/Report/components/VariantMetadata.jsx b/ui/pages/Report/components/VariantMetadata.jsx index 6f03815d0e..2f7799961b 100644 --- a/ui/pages/Report/components/VariantMetadata.jsx +++ b/ui/pages/Report/components/VariantMetadata.jsx @@ -1,7 +1,7 @@ import React from 'react' import LoadReportTable from 'shared/components/table/LoadReportTable' -import { VARIANT_METADATA_COLUMNS } from 'shared/utils/constants' +import { clinvarSignificance, VARIANT_METADATA_COLUMNS } from 'shared/utils/constants' const VIEW_ALL_PAGES = [ { name: 'GREGoR', downloadName: 'GREGoR', path: 'gregor' }, @@ -13,7 +13,7 @@ const COLUMNS = [ ...VARIANT_METADATA_COLUMNS.slice(0, -1), { name: 'allele_balance_or_heteroplasmy_percentage' }, { name: 'ClinGen allele ID', format: ({ clinvar }) => clinvar?.alleleId }, - { name: 'ClinVar Clinical Significance', format: ({ clinvar }) => clinvar?.clinicalSignificance }, + { name: 'ClinVar Clinical Significance', format: ({ clinvar }) => clinvarSignificance(clinvar).pathogenicity }, { name: 'ClinVar gold star', format: ({ clinvar }) => clinvar?.goldStars }, { name: 'known_condition_name' }, { name: 'condition_id' }, From e14de3e3383b68b3707cdef69fcaafd46e50cfe0 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 18 Mar 2024 13:24:25 -0400 Subject: [PATCH 09/19] add better filtration for reloading SNV/INDEL variants --- seqr/views/utils/variant_utils.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/seqr/views/utils/variant_utils.py b/seqr/views/utils/variant_utils.py index 3daba835ca..4af1fb3450 100644 --- a/seqr/views/utils/variant_utils.py 
+++ b/seqr/views/utils/variant_utils.py @@ -102,10 +102,13 @@ def update_project_saved_variant_json(project_id, family_guids=None, dataset_typ def saved_variants_dataset_type_filter(dataset_type): xpos_filter_key = 'xpos__gte' if dataset_type == Sample.DATASET_TYPE_MITO_CALLS else 'xpos__lt' - return { - 'alt__isnull': dataset_type == Sample.DATASET_TYPE_SV_CALLS, - xpos_filter_key: get_xpos('M', 1), - } + dataset_filter = {xpos_filter_key: get_xpos('M', 1)} + if dataset_type == Sample.DATASET_TYPE_SV_CALLS: + dataset_filter['alt__isnull'] = True + else: + # Filter out manual variants with invalid characters, such as those used for STRs + dataset_filter['alt__regex'] = '^[ACGT]$' + return dataset_filter def parse_saved_variant_json(variant_json, family): From a4c09a4ccb54d209eba1bb39a3b6719fbd36a145 Mon Sep 17 00:00:00 2001 From: Julia Klugherz Date: Mon, 18 Mar 2024 13:25:56 -0400 Subject: [PATCH 10/19] add clinvar pathogenicity to ui constants --- ui/shared/utils/constants.js | 1 + 1 file changed, 1 insertion(+) diff --git a/ui/shared/utils/constants.js b/ui/shared/utils/constants.js index c8d0671dc7..a7c0a96132 100644 --- a/ui/shared/utils/constants.js +++ b/ui/shared/utils/constants.js @@ -513,6 +513,7 @@ const CLINVAR_MIN_RISK_PATHOGENICITY = 'likely_risk_allele' const CLINVAR_PATHOGENICITIES = [ 'pathogenic', 'pathogenic/likely_pathogenic', + 'pathogenic/likely_pathogenic/established_risk_allele', 'pathogenic/likely_pathogenic/likely_risk_allele', 'pathogenic/likely_risk_allele', 'likely_pathogenic', From 4d8ca4d372b958f8496f526cf0065d88dd7694af Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 18 Mar 2024 16:43:51 -0400 Subject: [PATCH 11/19] add conditional column validation --- seqr/views/apis/report_api.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py index fceb6bfcdf..840300aa7b 100644 --- a/seqr/views/apis/report_api.py +++ b/seqr/views/apis/report_api.py @@ -695,6 +695,24 @@ def _has_required_table(table, validator, tables): return tables.isdisjoint(validator) +def _is_required_col(required_validator, row): + if not required_validator: + return False + + if required_validator is True: + return True + + match = re.match(r'CONDITIONAL \(([\w+(\s)?]+) = ([\w+(\s)?]+)\)', required_validator) + if not match: + return True + + field, value = match.groups() + return row[field] == value + + + + + def _validate_column_data(column, file_name, data, column_validator, warnings, errors): data_type = column_validator.get('data_type') data_type_validator = DATA_TYPE_VALIDATORS.get(data_type) @@ -712,7 +730,7 @@ def _validate_column_data(column, file_name, data, column_validator, warnings, e for row in data: value = row.get(column) if not value: - if required: + if _is_required_col(required, row): missing.append(_get_row_id(row)) elif recommended: check_recommend_condition = WARN_MISSING_CONDITIONAL_COLUMNS.get(column) From 4879c39262cdc2b96b310d360150780f179bc255 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 18 Mar 2024 17:16:31 -0400 Subject: [PATCH 12/19] update tests --- seqr/views/apis/report_api_tests.py | 6 +++--- seqr/views/utils/anvil_metadata_utils.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index 647ff3d730..f37031f63b 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -447,7 +447,7 @@ 'participant': { 'internal_project_id': 
{'data_type': 'reference'}, 'prior_testing': {'data_type': 'enumeration'}, - 'proband_relationship': {'required': True}, + 'proband_relationship': {'required': 'CONDITIONAL (sex = Male)'}, 'reported_race': {'enumerations': ['Asian', 'White', 'Black']}, 'age_at_enrollment': {'data_type': 'date'} }, @@ -717,7 +717,7 @@ def test_gregor_export(self, mock_subprocess, mock_temp_dir, mock_open, mock_dat ] + [ 'The following tables are required in the data model but absent from the reports: subject, dna_read_data_set', ] + [ - 'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00732, Broad_HG00733, Broad_NA19678, Broad_NA19679, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881, Broad_NA20888', + 'The following entries are missing required "proband_relationship" in the "participant" table: Broad_HG00732, Broad_NA19678, Broad_NA20870, Broad_NA20872, Broad_NA20874, Broad_NA20875, Broad_NA20876, Broad_NA20881', 'The following entries have invalid values for "reported_race" in the "participant" table. Allowed values: Asian, White, Black. Invalid values: Broad_NA19675_1 (Middle Eastern or North African)', 'The following entries have invalid values for "age_at_enrollment" in the "participant" table. Allowed values have data type date. Invalid values: Broad_NA19675_1 (18)', 'The following entries have invalid values for "reference_assembly" (from Airtable) in the "aligned_dna_short_read" table. Allowed values have data type integer. Invalid values: NA20888 (GRCh38), VCGS_FAM203_621_D2 (GRCh38)', @@ -1126,7 +1126,7 @@ def test_family_metadata(self): 'consanguinity': 'Unknown', 'condition_id': 'OMIM:615123', 'known_condition_name': '', - 'condition_inheritance': '', + 'condition_inheritance': 'Unknown', }) # Test empty project diff --git a/seqr/views/utils/anvil_metadata_utils.py b/seqr/views/utils/anvil_metadata_utils.py index 800887229e..4ba6094f0e 100644 --- a/seqr/views/utils/anvil_metadata_utils.py +++ b/seqr/views/utils/anvil_metadata_utils.py @@ -565,5 +565,5 @@ def _format_omim_conditions(conditions): 'known_condition_name': '|'.join(sorted({o['phenotype_description'] for o in conditions if o.get('phenotype_description')})), 'condition_inheritance': '|'.join(sorted({ MIM_INHERITANCE_MAP.get(i, i) for o in conditions if o.get('phenotype_inheritance') for i in o['phenotype_inheritance'].split(', ') - })) + })) or 'Unknown', } From ed11ef2d82b0d47dbbd263036d300633c4394322 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 18 Mar 2024 21:51:08 +0000 Subject: [PATCH 13/19] Bump django from 3.2.24 to 3.2.25 Bumps [django](https://github.com/django/django) from 3.2.24 to 3.2.25. - [Commits](https://github.com/django/django/compare/3.2.24...3.2.25) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- requirements-dev.txt | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index e22f64d378..20a944635d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -22,7 +22,7 @@ click==8.1.3 # via pip-tools coverage==5.1 # via -r requirements-dev.in -django==3.2.24 +django==3.2.25 # via # -c requirements.txt # django-appconf diff --git a/requirements.txt b/requirements.txt index 774e584491..453c702f08 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,7 +26,7 @@ defusedxml==0.7.1 # via # python3-openid # social-auth-core -django==3.2.24 +django==3.2.25 # via # -r requirements.in # django-anymail From eb3a67b3884b1ce3da5d186df638cb03767bf3d0 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 19 Mar 2024 15:57:55 -0400 Subject: [PATCH 14/19] use tags to determine gene_known_for_phenotype --- seqr/views/apis/report_api.py | 1 - seqr/views/utils/anvil_metadata_utils.py | 17 +++++------------ 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/seqr/views/apis/report_api.py b/seqr/views/apis/report_api.py index 840300aa7b..682eba561d 100644 --- a/seqr/views/apis/report_api.py +++ b/seqr/views/apis/report_api.py @@ -893,7 +893,6 @@ def _add_row(row, family_id, row_type): individual_data_types={i.individual_id: i.data_types for i in individuals}, add_row=_add_row, variant_json_fields=['clinvar', 'variantId'], - saved_variant_annotations={'tags': ArrayAgg('varianttag__variant_tag_type__name', distinct=True)}, mme_values={'variant_ids': ArrayAgg('matchmakersubmissiongenes__saved_variant__saved_variant_json__variantId')}, include_metadata=True, include_mondo=True, diff --git a/seqr/views/utils/anvil_metadata_utils.py b/seqr/views/utils/anvil_metadata_utils.py index 4ba6094f0e..e7dc41a648 100644 --- a/seqr/views/utils/anvil_metadata_utils.py +++ b/seqr/views/utils/anvil_metadata_utils.py @@ -126,7 +126,7 @@ def parse_anvil_metadata( variant_json_fields: Iterable[str] = None, post_process_variant: Callable[[dict, list[dict]], dict] = None, include_no_individual_families: bool = False, omit_airtable: bool = False, include_metadata: bool = False, include_discovery_sample_id: bool = False, include_mondo: bool = False, include_parent_mnvs: bool = False, - proband_only_variants: bool = False, saved_variant_annotations: dict = None): + proband_only_variants: bool = False): individual_samples = individual_samples or (_get_loaded_before_date_project_individual_samples(projects, max_loaded_date) \ if max_loaded_date else _get_all_project_individual_samples(projects)) @@ -147,7 +147,6 @@ def parse_anvil_metadata( saved_variants_by_family = _get_parsed_saved_discovery_variants_by_family( list(family_data_by_id.keys()), variant_filter=variant_filter, variant_json_fields=variant_json_fields, - saved_variant_annotations=saved_variant_annotations, ) condition_map = _get_condition_map(family_data_by_id.values()) @@ -285,21 +284,14 @@ def _post_process_variant_metadata(v, gene_variants, include_parent_mnvs=False): def _get_parsed_saved_discovery_variants_by_family( - families: Iterable[Family], variant_filter: dict, variant_json_fields: list[str], saved_variant_annotations: dict, + families: Iterable[Family], variant_filter: dict, variant_json_fields: list[str], ): tag_types = VariantTagType.objects.filter(project__isnull=True, category=DISCOVERY_CATEGORY) - annotations = { - 'gene_known_for_phenotype': Case(When( - Q(family__post_discovery_omim_numbers__len=0, 
family__mondo_id__isnull=True), - then=Value('Candidate')), default=Value('Known') - ), - **(saved_variant_annotations or {}), - } project_saved_variants = SavedVariant.objects.filter( varianttag__variant_tag_type__in=tag_types, family__id__in=families, **(variant_filter or {}), - ).order_by('created_date').distinct().annotate(**annotations) + ).order_by('created_date').distinct().annotate(tags=ArrayAgg('varianttag__variant_tag_type__name', distinct=True)) variants = [] gene_ids = set() @@ -321,8 +313,9 @@ def _get_parsed_saved_discovery_variants_by_family( 'hgvsc': (main_transcript.get('hgvsc') or '').split(':')[-1], 'hgvsp': (main_transcript.get('hgvsp') or '').split(':')[-1], 'seqr_chosen_consequence': main_transcript.get('majorConsequence'), + 'gene_known_for_phenotype': 'Known' if 'Known gene for phenotype' in variant.tags else 'Candidate', **{k: variant_json.get(k) for k in ['genotypes', 'svType', 'svName', 'end'] + (variant_json_fields or [])}, - **{k: getattr(variant, k) for k in ['family_id', 'ref', 'alt', *annotations.keys()]}, + **{k: getattr(variant, k) for k in ['family_id', 'ref', 'alt', 'tags']}, }) genes_by_id = get_genes(gene_ids) From c8c8b4df376e3882b7d4e60eaf0d0beec9bb11b2 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 19 Mar 2024 16:09:51 -0400 Subject: [PATCH 15/19] update tests --- seqr/views/apis/report_api_tests.py | 12 ++++++------ seqr/views/apis/summary_data_api_tests.py | 7 +++++-- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/seqr/views/apis/report_api_tests.py b/seqr/views/apis/report_api_tests.py index f37031f63b..55ba602a9a 100644 --- a/seqr/views/apis/report_api_tests.py +++ b/seqr/views/apis/report_api_tests.py @@ -616,7 +616,7 @@ def test_anvil_export(self, mock_google_authenticated, mock_zip): '1_248367227_HG00731', 'HG00731', 'HG00731', 'RP11', 'Known', 'paternal', 'Homozygous', 'GRCh37', '1', '248367227', 'TC', 'T', '-', '-', '-', '-', '-', '-', '-'], discovery_file) self.assertIn([ - '21_3343353_NA19675_1', 'NA19675_1', 'NA19675', 'RP11', 'Known', 'de novo', + '21_3343353_NA19675_1', 'NA19675_1', 'NA19675', 'RP11', 'Candidate', 'de novo', 'Heterozygous', 'GRCh37', '21', '3343353', 'GAGA', 'G', 'c.375_377delTCT', 'p.Leu126del', 'ENST00000258436', '-', '-', '-', '-'], discovery_file) self.assertIn([ @@ -993,7 +993,7 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False): ]) self.assertIn([ 'Broad_NA19675_1_21_3343353', 'Broad_NA19675_1', '', 'SNV/INDEL', 'GRCh37', '21', '3343353', 'GAGA', 'G', '', - 'RP11', 'ENST00000258436', 'c.375_377delTCT', 'p.Leu126del', 'Heterozygous', '', 'de novo', '', '', 'Known', + 'RP11', 'ENST00000258436', 'c.375_377delTCT', 'p.Leu126del', 'Heterozygous', '', 'de novo', '', '', 'Candidate', 'Myasthenic syndrome, congenital, 8, with pre- and postsynaptic defects', 'OMIM:615120', 'Autosomal recessive|X-linked', 'Full', '', '', 'SR-ES', '', ], genetic_findings_file) @@ -1006,12 +1006,12 @@ def _assert_expected_gregor_files(self, mock_open, has_second_project=False): self.assertIn([ 'Broad_NA20889_1_248367227', 'Broad_NA20889', '', 'SNV/INDEL', 'GRCh37', '1', '248367227', 'TC', 'T', '', 'OR4G11P', 'ENST00000505820', 'c.3955G>A', 'c.1586-17C>G', 'Heterozygous', '', 'unknown', - 'Broad_NA20889_1_249045487', '', 'Known', 'IRIDA syndrome', 'MONDO:0008788', 'Autosomal dominant', + 'Broad_NA20889_1_249045487', '', 'Candidate', 'IRIDA syndrome', 'MONDO:0008788', 'Autosomal dominant', 'Full', '', '', 'SR-ES', '', ], genetic_findings_file) self.assertIn([ 'Broad_NA20889_1_249045487', 
'Broad_NA20889', '', 'SNV/INDEL', 'GRCh37', '1', '249045487', 'A', 'G', '', - 'OR4G11P', '', '', '', 'Heterozygous', '', 'unknown', 'Broad_NA20889_1_248367227', '', 'Known', + 'OR4G11P', '', '', '', 'Heterozygous', '', 'unknown', 'Broad_NA20889_1_248367227', '', 'Candidate', 'IRIDA syndrome', 'MONDO:0008788', 'Autosomal dominant', 'Full', '', '', 'SR-ES', '', ], genetic_findings_file) @@ -1225,7 +1225,7 @@ def test_variant_metadata(self): 'family_history': 'Yes', 'gene': 'OR4G11P', 'gene_id': 'ENSG00000240361', - 'gene_known_for_phenotype': 'Known', + 'gene_known_for_phenotype': 'Candidate', 'genetic_findings_id': 'NA20889_1_248367227', 'hgvsc': 'c.3955G>A', 'hgvsp': 'c.1586-17C>G', @@ -1253,7 +1253,7 @@ def test_variant_metadata(self): 'family_history': 'Yes', 'gene': None, 'gene_id': None, - 'gene_known_for_phenotype': 'Known', + 'gene_known_for_phenotype': 'Candidate', 'genetic_findings_id': 'NA20889_1_249045487', 'participant_id': 'NA20889', 'pos': 249045487, diff --git a/seqr/views/apis/summary_data_api_tests.py b/seqr/views/apis/summary_data_api_tests.py index 62b682bc93..c1b94f1e08 100644 --- a/seqr/views/apis/summary_data_api_tests.py +++ b/seqr/views/apis/summary_data_api_tests.py @@ -37,8 +37,8 @@ "num_saved_variants": 2, "solve_status": "Unsolved", "sample_id": "NA20889", - "gene_known_for_phenotype-1": "Known", - "gene_known_for_phenotype-2": "Known", + "gene_known_for_phenotype-1": "Candidate", + "gene_known_for_phenotype-2": "Candidate", "variant_inheritance-1": "unknown", "variant_inheritance-2": "unknown", 'genetic_findings_id-1': 'NA20889_1_248367227', @@ -105,6 +105,8 @@ 'allele_balance_or_heteroplasmy_percentage-2': None, 'notes-1': None, 'notes-2': None, + 'tags-1': ['Tier 1 - Novel gene and phenotype'], + 'tags-2': ['Tier 1 - Novel gene and phenotype'], } EXPECTED_SAMPLE_METADATA_ROW = { "dbgap_submission": "No", @@ -147,6 +149,7 @@ 'alt-1': 'T', 'chrom-1': '1', 'gene_known_for_phenotype-1': 'Candidate', + 'tags-1': ['Tier 1 - Novel gene and phenotype'], 'pos-1': 248367227, 'end-1': None, 'ref-1': 'TC', From 4a37a44f55aaadcd4bc03b9eeebf287cdc59f9dd Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 19 Mar 2024 16:12:22 -0400 Subject: [PATCH 16/19] add hail to status endpoint --- hail_search/web_app.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/hail_search/web_app.py b/hail_search/web_app.py index fc274d2c31..bc5b3aab61 100644 --- a/hail_search/web_app.py +++ b/hail_search/web_app.py @@ -47,8 +47,12 @@ def hl_json_dumps(obj): return json.dumps(obj, default=_hl_json_default) async def sync_to_async_hail_query(request: web.Request, query: Callable, *args, timeout_s=QUERY_TIMEOUT_S, **kwargs): + request_body = None + if request.body_exists: + request_body = await request.json() + loop = asyncio.get_running_loop() - future = loop.run_in_executor(request.app.pool, functools.partial(query, await request.json(), *args, **kwargs)) + future = loop.run_in_executor(request.app.pool, functools.partial(query, request_body, *args, **kwargs)) try: return await asyncio.wait_for(future, timeout_s) except asyncio.TimeoutError: @@ -94,6 +98,7 @@ async def multi_lookup(request: web.Request) -> web.Response: async def status(request: web.Request) -> web.Response: + _ = await sync_to_async_hail_query(request, lambda _: hl.eval(1 + 1)) return web.json_response({'success': True}) From 5d4bf803379e6de8cce69f3f5c75c0e23ca2c2c6 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 19 Mar 2024 16:17:04 -0400 Subject: [PATCH 17/19] do not allow unneccessary 
GTEX data in rna upload --- seqr/management/tests/load_rna_seq_tests.py | 2 -- seqr/views/apis/data_manager_api_tests.py | 2 -- seqr/views/utils/dataset_utils.py | 6 ++---- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/seqr/management/tests/load_rna_seq_tests.py b/seqr/management/tests/load_rna_seq_tests.py index 6353809f5a..936967322a 100644 --- a/seqr/management/tests/load_rna_seq_tests.py +++ b/seqr/management/tests/load_rna_seq_tests.py @@ -68,9 +68,7 @@ def test_tpm(self, mock_utils_logger): 'NA19675_D2\t1kg project nåme with uniçøde\t\tENSG00000240361\t12.6\t\n', 'NA19675_D2\t1kg project nåme with uniçøde\t\tENSG00000233750\t1.26\t\n', 'NA19678_D1\t1kg project nåme with uniçøde\t\tENSG00000233750\t 6.04\twhole_blood\n', - 'GTEX-001\t1kg project nåme with uniçøde\t\tENSG00000240361\t3.1\tinvalid\n', 'NA19677\t1kg project nåme with uniçøde\t\tENSG00000233750\t5.31\tmuscle\n', - 'GTEX-001\t1kg project nåme with uniçøde\t\tENSG00000233750\t7.8\tmuscle\n', 'NA19678\tTest Reprocessed Project\t\tENSG00000240361\t0.2\twhole_blood\n', ], unmatched_samples='NA19677, NA19678, NA19678_D1', diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py index cbaf6fe977..5f6c6bbc9b 100644 --- a/seqr/views/apis/data_manager_api_tests.py +++ b/seqr/views/apis/data_manager_api_tests.py @@ -708,8 +708,6 @@ def test_kibana_proxy(self): ['NA19675_D2', '1kg project nåme with uniçøde', 'ENSG00000233750', 'NA19675_D2', 'muscle', 0.0], # no matched individual NA19675_D3 ['NA19675_D3', '1kg project nåme with uniçøde', 'ENSG00000233750', 'NA19675_D3', 'fibroblasts', 0.064], - # skip GTEX samples - ['GTEX_001', '1kg project nåme with uniçøde', 'ENSG00000233750', 'NA19675_D3', 'whole_blood', 1.95], # a different project sample NA20888 ['NA20888', 'Test Reprocessed Project', 'ENSG00000240361', 'NA20888', 'muscle', 0.112], # a project mismatched sample NA20878 diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index fdf1f4746b..2738652a76 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -294,7 +294,7 @@ def _add_splice_rank(sample_data_rows): 'tpm': { 'model_class': RnaSeqTpm, 'columns': TPM_HEADER_COLS, - 'additional_kwargs': {'should_skip': lambda row: row[SAMPLE_ID_COL].startswith('GTEX')}, + 'additional_kwargs': {}, }, 'splice_outlier': { 'model_class': RnaSeqSpliceOutlier, @@ -331,7 +331,7 @@ def _validate_rna_header(header, column_map): def _load_rna_seq_file( file_path, user, potential_loaded_samples, update_sample_models, save_sample_data, get_matched_sample, column_map, mapping_file=None, allow_missing_gene=False, ignore_extra_samples=False, - should_skip=None, format_fields=None, + format_fields=None, ): sample_id_to_individual_id_mapping = {} @@ -349,8 +349,6 @@ def _load_rna_seq_file( gene_ids = set() for line in tqdm(parsed_f, unit=' rows'): row = dict(zip(header, line)) - if should_skip and should_skip(row): - continue row_dict = {mapped_key: row[col] for mapped_key, col in column_map.items()} for mapped_key, format_func in (format_fields or {}).items(): From 2f348151f7bdb5d3af6f3f4396a2a24a9e722b75 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 19 Mar 2024 16:19:49 -0400 Subject: [PATCH 18/19] Add comment --- hail_search/web_app.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hail_search/web_app.py b/hail_search/web_app.py index bc5b3aab61..83efad67af 100644 --- a/hail_search/web_app.py +++ b/hail_search/web_app.py @@ -98,7 +98,8 @@ 
async def multi_lookup(request: web.Request) -> web.Response: async def status(request: web.Request) -> web.Response: - _ = await sync_to_async_hail_query(request, lambda _: hl.eval(1 + 1)) + # Make sure the hail backend process is still alive. + await sync_to_async_hail_query(request, lambda _: hl.eval(1 + 1)) return web.json_response({'success': True}) From 841188995c35df5ab482413761d449e045bf9f4f Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Tue, 19 Mar 2024 16:38:39 -0400 Subject: [PATCH 19/19] more decriptive error for unmatched sample ID --- seqr/management/tests/load_rna_seq_tests.py | 8 ++++---- seqr/views/apis/data_manager_api_tests.py | 8 ++++---- seqr/views/utils/dataset_utils.py | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/seqr/management/tests/load_rna_seq_tests.py b/seqr/management/tests/load_rna_seq_tests.py index 936967322a..2b95be2185 100644 --- a/seqr/management/tests/load_rna_seq_tests.py +++ b/seqr/management/tests/load_rna_seq_tests.py @@ -71,7 +71,7 @@ def test_tpm(self, mock_utils_logger): 'NA19677\t1kg project nåme with uniçøde\t\tENSG00000233750\t5.31\tmuscle\n', 'NA19678\tTest Reprocessed Project\t\tENSG00000240361\t0.2\twhole_blood\n', ], - unmatched_samples='NA19677, NA19678, NA19678_D1', + unmatched_samples='NA19677 (1kg project nåme with uniçøde), NA19678 (Test Reprocessed Project), NA19678_D1 (1kg project nåme with uniçøde)', additional_errors=['Samples missing required "tissue": NA19675_D2'], ) @@ -106,7 +106,7 @@ def test_tpm(self, mock_utils_logger): mock.call('DONE'), ]) mock_utils_logger.warning.assert_has_calls([ - mock.call('Skipped loading for the following 2 unmatched samples: NA19677, NA19678', None), + mock.call('Skipped loading for the following 2 unmatched samples: NA19677 (1kg project nåme with uniçøde), NA19678 (Test Reprocessed Project)', None), ]) # Test a new sample created for a mismatched tissue and a row with 0.0 tpm @@ -134,13 +134,13 @@ def test_outlier(self): 'NA19675_D3\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n', 'NA19675_D4\t1kg project nåme with uniçøde\tENSG00000233750\tdetail1\t0.064\t0.0000057\t7.8\tmuscle\n', ], - unmatched_samples='NA19675_D3, NA19675_D4', + unmatched_samples='NA19675_D3 (1kg project nåme with uniçøde), NA19675_D4 (1kg project nåme with uniçøde)', ) self.mock_open.return_value.__enter__.return_value.__iter__.return_value = ['NA19675_D4\tNA19678'] with self.assertRaises(ErrorsWarningsException) as e: call_command('load_rna_seq', 'outlier', RNA_FILE_ID, '--mapping-file', 'map.tsv') - self.assertEqual(e.exception.errors, ['Unable to find matches for the following samples: NA19675_D3']) + self.assertEqual(e.exception.errors, ['Unable to find matches for the following samples: NA19675_D3 (1kg project nåme with uniçøde)']) call_command('load_rna_seq', 'outlier', RNA_FILE_ID, '--ignore-extra-samples') diff --git a/seqr/views/apis/data_manager_api_tests.py b/seqr/views/apis/data_manager_api_tests.py index 5f6c6bbc9b..1fbadfd7ef 100644 --- a/seqr/views/apis/data_manager_api_tests.py +++ b/seqr/views/apis/data_manager_api_tests.py @@ -678,7 +678,7 @@ def test_kibana_proxy(self): ['NA19675_D3', 'Test Reprocessed Project', 'ENSG00000233750', 'muscle', 'detail1', 0.064, '0.0000057', 7.8], ['NA20888', 'Test Reprocessed Project', 'ENSG00000240361', 'muscle', '', 0.04, 0.112, 1.9], ], - 'skipped_samples': 'NA19675_D3', + 'skipped_samples': 'NA19675_D3 (Test Reprocessed Project)', 'sample_tissue_type': 'M', 'num_parsed_samples': 3, 'initial_model_count': 
3, @@ -713,7 +713,7 @@ def test_kibana_proxy(self): # a project mismatched sample NA20878 ['NA20878', 'Test Reprocessed Project', 'ENSG00000233750', 'NA20878', 'fibroblasts', 0.064], ], - 'skipped_samples': 'NA19675_D3, NA20878', + 'skipped_samples': 'NA19675_D3 (1kg project nåme with uniçøde), NA20878 (Test Reprocessed Project)', 'sample_tissue_type': 'M', 'num_parsed_samples': 4, 'initial_model_count': 4, @@ -766,7 +766,7 @@ def test_kibana_proxy(self): ['NA20878', 'Test Reprocessed Project', 'ENSG00000233750', 'chr2', 167258096, 167258349, '*', 'XIRP2', 'psi3', 1.56E-25, 6.33, 0.45, 143, 'fibroblasts', 0.03454739, 1, 20], ], - 'skipped_samples': 'NA19675_D3, NA20878', + 'skipped_samples': 'NA19675_D3 (1kg project nåme with uniçøde), NA20878 (Test Reprocessed Project)', 'sample_tissue_type': 'F', 'num_parsed_samples': 4, 'initial_model_count': 7, @@ -877,7 +877,7 @@ def _set_file_iter_stdout(rows): _set_file_iter_stdout([header, loaded_data_row, missing_sample_row]) response = self.client.post(url, content_type='application/json', data=json.dumps(body)) self.assertEqual(response.status_code, 400) - self.assertDictEqual(response.json(), {'errors': ['Unable to find matches for the following samples: NA19675_D3'], 'warnings': None}) + self.assertDictEqual(response.json(), {'errors': ['Unable to find matches for the following samples: NA19675_D3 (1kg project nåme with uniçøde)'], 'warnings': None}) unknown_gene_id_row1 = loaded_data_row[:2] + ['NOT_A_GENE_ID1'] + loaded_data_row[3:] unknown_gene_id_row2 = loaded_data_row[:2] + ['NOT_A_GENE_ID2'] + loaded_data_row[3:] diff --git a/seqr/views/utils/dataset_utils.py b/seqr/views/utils/dataset_utils.py index 2738652a76..087b327718 100644 --- a/seqr/views/utils/dataset_utils.py +++ b/seqr/views/utils/dataset_utils.py @@ -414,7 +414,7 @@ def _process_rna_errors(gene_ids, missing_required_fields, unmatched_samples, ig errors.append(f'Unknown Gene IDs: {", ".join(sorted(unknown_gene_ids))}') if unmatched_samples: - unmatched_sample_ids = ', '.join(sorted([sample_key[0] for sample_key in unmatched_samples])) + unmatched_sample_ids = ', '.join(sorted({f'{sample_key[0]} ({sample_key[1]})' for sample_key in unmatched_samples})) if ignore_extra_samples: warnings.append(f'Skipped loading for the following {len(unmatched_samples)} unmatched samples: {unmatched_sample_ids}') else:
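
For reference, the behaviour PATCH 19/19 is driving at can be sketched in isolation as below. This is an illustrative stand-alone snippet, not part of the patch: the helper name format_unmatched_samples and the (sample_id, project_name) tuple layout are assumptions for the example, while in seqr the logic lives inside _process_rna_errors in seqr/views/utils/dataset_utils.py. Unmatched sample IDs are now reported together with their project name, de-duplicated via a set comprehension before sorting, and routed to a warning when ignore_extra_samples is set or to an error otherwise, matching the updated expectations in load_rna_seq_tests.py and data_manager_api_tests.py.

# Stand-alone sketch of the reworked unmatched-sample reporting (assumed names, see note above).
def format_unmatched_samples(unmatched_samples, ignore_extra_samples):
    errors, warnings = [], []
    if unmatched_samples:
        # The set comprehension de-duplicates repeated (sample_id, project) pairs before sorting.
        unmatched_sample_ids = ', '.join(sorted(
            {f'{sample_id} ({project})' for sample_id, project in unmatched_samples}))
        if ignore_extra_samples:
            warnings.append(
                f'Skipped loading for the following {len(unmatched_samples)} unmatched samples: {unmatched_sample_ids}')
        else:
            errors.append(f'Unable to find matches for the following samples: {unmatched_sample_ids}')
    return errors, warnings

# Example with two hypothetical samples from different projects:
# format_unmatched_samples({('NA19677', 'Project A'), ('NA19678', 'Project B')}, ignore_extra_samples=False)
# -> (['Unable to find matches for the following samples: NA19677 (Project A), NA19678 (Project B)'], [])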