From 4b56a07d51b6204e7b47395205387b176b7086b9 Mon Sep 17 00:00:00 2001
From: Hana Snow
Date: Sun, 11 Feb 2024 23:53:47 -0500
Subject: [PATCH 1/7] maintain separate comp het and main hts

---
 hail_search/queries/base.py             | 203 +++++++++++-------------
 hail_search/queries/mito.py             |  12 +-
 hail_search/queries/multi_data_types.py |  29 +++-
 hail_search/queries/snv_indel.py        |   8 +-
 hail_search/queries/snv_indel_37.py     |   4 +-
 hail_search/queries/sv.py               |  22 +--
 hail_search/web_app.py                  |   2 +-
 7 files changed, 149 insertions(+), 131 deletions(-)

diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py
index ddae69ea30..b449d58b48 100644
--- a/hail_search/queries/base.py
+++ b/hail_search/queries/base.py
@@ -233,20 +233,6 @@ def _load_filtered_table(self, sample_data, intervals=None, **kwargs):
         self.import_filtered_table(
             sample_data, parsed_intervals=parsed_intervals, **kwargs)

-        if self._has_comp_het_search:
-            self._comp_het_ht = self._filter_compound_hets()
-            if self._is_recessive_search:
-                self._ht = self._ht.filter(self._ht.family_entries.any(hl.is_defined))
-                if self._has_secondary_annotations:
-                    annotation_filters = self._get_annotation_filters(self._ht)
-                    if annotation_filters:
-                        self._ht = self._ht.filter(hl.any(annotation_filters))
-                    else:
-                        # Data type only has annotations for second hit
-                        self._ht = None
-            else:
-                self._ht = None
-
     @classmethod
     def _get_table_path(cls, path, use_ssd_dir=False):
         return f'{SSD_DATASETS_DIR if use_ssd_dir else DATASETS_DIR}/{cls.GENOME_VERSION}/{cls.DATA_TYPE}/{path}'
@@ -292,7 +278,7 @@ def _load_filtered_project_hts(self, project_samples, skip_all_missing=False, **
                 continue
             try:
                 filtered_project_hts.append(
-                    (self._filter_entries_table(project_ht, project_sample_data, **kwargs), len(project_sample_data))
+                    (*self._filter_entries_table(project_ht, project_sample_data, **kwargs), len(project_sample_data))
                 )
             except HTTPBadRequest as e:
                 exception_messages.add(e.reason)
@@ -312,37 +298,43 @@ def import_filtered_table(self, sample_data, intervals=None, **kwargs):
             family_ht = family_ht.annotate_globals(
                 family_guids=[family_guid], family_samples={family_guid: family_ht.sample_ids},
             )
-            families_ht = self._filter_entries_table(family_ht, family_sample_data, **kwargs)
+            families_ht, comp_het_families_ht = self._filter_entries_table(family_ht, family_sample_data, **kwargs)
         else:
             filtered_project_hts = self._load_filtered_project_hts(project_samples, **kwargs)
-            families_ht, num_families = filtered_project_hts[0]
-            entry_type = families_ht.family_entries.dtype.element_type
-            for project_ht, num_project_families in filtered_project_hts[1:]:
-                families_ht = families_ht.join(project_ht, how='outer')
-                families_ht = families_ht.select_globals(
-                    family_guids=families_ht.family_guids.extend(families_ht.family_guids_1)
-                )
-                select_fields = {
-                    'filters': families_ht.filters.union(families_ht.filters_1),
-                    'family_entries': hl.bind(
-                        lambda a1, a2: a1.extend(a2),
-                        hl.or_else(families_ht.family_entries, hl.empty_array(entry_type)),
-                        hl.or_else(families_ht.family_entries_1, hl.empty_array(entry_type)),
-                    ),
-                }
-                if 'comp_het_family_entries_1' in families_ht.row:
-                    missing_arr = lambda count: hl.range(count).map(lambda i: hl.missing(entry_type))
-                    select_fields['comp_het_family_entries'] = hl.bind(
-                        lambda a1, a2: a1.extend(a2),
-                        hl.or_else(families_ht.comp_het_family_entries, missing_arr(num_families)),
-                        hl.or_else(families_ht.comp_het_family_entries_1, missing_arr(num_project_families)),
-                    )
-                families_ht = families_ht.select(**select_fields)
+            families_ht, 
comp_het_families_ht, num_families = filtered_project_hts[0] + main_ht = comp_het_families_ht if families_ht is None else families_ht + entry_type = main_ht.family_entries.dtype.element_type + for project_ht, comp_het_project_ht, num_project_families in filtered_project_hts[1:]: + if families_ht is not None: + families_ht = _add_project_ht(self, families_ht, project_ht, entry_type) + if comp_het_families_ht is not None: + comp_het_families_ht = _add_project_ht(self, comp_het_families_ht, comp_het_project_ht, entry_type) num_families += num_project_families - self._ht = self._query_table_annotations(families_ht, self._get_table_path('annotations.ht')) + # TODO add pre-processing for annotations so do not even read in tables if not going to have vaild annotations + if comp_het_families_ht is not None: + comp_het_ht = self._query_table_annotations(comp_het_families_ht, self._get_table_path('annotations.ht')) + self._comp_het_ht = self._filter_annotated_table(comp_het_ht, is_comp_het=True, **kwargs) + self._comp_het_ht = self._filter_compound_hets() - self._filter_annotated_table(**kwargs) + if families_ht is not None: + ht = self._query_table_annotations(families_ht, self._get_table_path('annotations.ht')) + self._ht = self._filter_annotated_table(ht, **kwargs) + + def _add_project_ht(self, families_ht, project_ht, entry_type): + families_ht = families_ht.join(project_ht, how='outer') + families_ht = families_ht.select_globals( + family_guids=families_ht.family_guids.extend(families_ht.family_guids_1) + ) + select_fields = { + 'filters': families_ht.filters.union(families_ht.filters_1), + 'family_entries': hl.bind( + lambda a1, a2: a1.extend(a2), + hl.or_else(families_ht.family_entries, hl.empty_array(entry_type)), + hl.or_else(families_ht.family_entries_1, hl.empty_array(entry_type)), + ), + } + return families_ht.select(**select_fields) def _filter_entries_table(self, ht, sample_data, inheritance_mode=None, inheritance_filter=None, quality_filter=None, **kwargs): @@ -362,11 +354,11 @@ def _filter_entries_table(self, ht, sample_data, inheritance_mode=None, inherita )) ht = ht.filter(ht.family_entries.any(hl.is_defined)) - ht = self._filter_inheritance( + ht, ch_ht = self._filter_inheritance( ht, inheritance_mode, inheritance_filter, sorted_family_sample_data, ) - return ht.select_globals('family_guids') + return ht, ch_ht @classmethod def _add_entry_sample_families(cls, ht, sample_data): @@ -432,25 +424,20 @@ def _filter_inheritance(self, ht, inheritance_mode, inheritance_filter, sorted_f lambda entries: hl.or_missing(entries.any(any_valid_entry), entries)) ) - filter_mode_map = {} - if (inheritance_filter or inheritance_mode) and not is_any_affected: - filter_mode_map[inheritance_mode] = 'family_entries' + comp_het_ht = None if self._has_comp_het_search: - filter_mode_map[COMPOUND_HET] = 'comp_het_family_entries' - - for mode, field in sorted(filter_mode_map.items()): - ht = self._filter_families_inheritance( - ht, mode, inheritance_filter, sorted_family_sample_data, field, + comp_het_ht = self._filter_families_inheritance( + ht, COMPOUND_HET, inheritance_filter, sorted_family_sample_data, ) - filter_expr = ht.family_entries.any(hl.is_defined) - if self._has_comp_het_search: - ch_filter = ht.comp_het_family_entries.any(hl.is_defined) - filter_expr = (filter_expr | ch_filter) if self._is_recessive_search else ch_filter + if (inheritance_filter or inheritance_mode) and not is_any_affected: + ht = None if inheritance_mode == COMPOUND_HET else self._filter_families_inheritance( + ht, 
inheritance_mode, inheritance_filter, sorted_family_sample_data, + ) - return ht.filter(filter_expr) + return ht, comp_het_ht - def _filter_families_inheritance(self, ht, inheritance_mode, inheritance_filter, sorted_family_sample_data, field): + def _filter_families_inheritance(self, ht, inheritance_mode, inheritance_filter, sorted_family_sample_data): individual_genotype_filter = (inheritance_filter or {}).get('genotype') entry_indices_by_gt = defaultdict(lambda: defaultdict(list)) @@ -467,12 +454,11 @@ def _filter_families_inheritance(self, ht, inheritance_mode, inheritance_filter, for genotype, entry_indices in entry_indices_by_gt.items(): entry_indices = hl.dict(entry_indices) - family_entries = ht[field] if field in ht.row else ht.family_entries - ht = ht.annotate(**{field: hl.enumerate(family_entries).map( + ht = ht.annotate(family_entries=hl.enumerate(ht.family_entries).map( lambda x: self._valid_genotype_family_entries(x[1], entry_indices.get(x[0]), genotype, inheritance_mode) - )}) + )) - return ht + return ht.filter(ht.family_entries.any(hl.is_defined)).select_globals('family_guids') @classmethod def _valid_genotype_family_entries(cls, entries, gentoype_entry_indices, genotype, inheritance_mode): @@ -525,30 +511,30 @@ def _parse_variant_keys(self, variant_keys=None, **kwargs): def _prefilter_entries_table(self, ht, **kwargs): return ht - def _filter_annotated_table(self, gene_ids=None, rs_ids=None, frequencies=None, in_silico=None, pathogenicity=None, - annotations=None, annotations_secondary=None, **kwargs): + def _filter_annotated_table(self, ht, gene_ids=None, rs_ids=None, frequencies=None, in_silico=None, pathogenicity=None, + annotations=None, annotations_secondary=None, is_comp_het=False, **kwargs): if gene_ids: - self._filter_by_gene_ids(gene_ids) + ht = self._filter_by_gene_ids(ht, gene_ids) if rs_ids: - self._filter_rs_ids(rs_ids) + ht = self._filter_rs_ids(ht, rs_ids) - self._filter_by_frequency(frequencies, pathogenicity) + ht = self._filter_by_frequency(ht, frequencies, pathogenicity) - self._filter_by_in_silico(in_silico) + ht = self._filter_by_in_silico(ht, in_silico) - self._filter_by_annotations(pathogenicity, annotations, annotations_secondary) + return self._filter_by_annotations(ht, pathogenicity, annotations, annotations_secondary, is_comp_het) - def _filter_by_gene_ids(self, gene_ids): + def _filter_by_gene_ids(self, ht, gene_ids): gene_ids = hl.set(gene_ids) - self._ht = self._ht.annotate( - gene_transcripts=self._ht[self.TRANSCRIPTS_FIELD].filter(lambda t: gene_ids.contains(t.gene_id)) + ht = ht.annotate( + gene_transcripts=ht[self.TRANSCRIPTS_FIELD].filter(lambda t: gene_ids.contains(t.gene_id)) ) - self._ht = self._ht.filter(hl.is_defined(self._ht.gene_transcripts.first())) + return ht.filter(hl.is_defined(ht.gene_transcripts.first())) - def _filter_rs_ids(self, rs_ids): + def _filter_rs_ids(self, ht, rs_ids): rs_id_set = hl.set(rs_ids) - self._ht = self._ht.filter(rs_id_set.contains(self._ht.rsid)) + return ht.filter(rs_id_set.contains(ht.rsid)) def _parse_intervals(self, intervals, **kwargs): parsed_variant_keys = self._parse_variant_keys(**kwargs) @@ -584,16 +570,16 @@ def _parse_intervals(self, intervals, **kwargs): def _should_add_chr_prefix(self): return True - def _filter_by_frequency(self, frequencies, pathogenicity): + def _filter_by_frequency(self, ht, frequencies, pathogenicity): frequencies = {k: v for k, v in (frequencies or {}).items() if k in self.POPULATIONS} if not frequencies: - return + return ht - path_override_filter = 
self._frequency_override_filter(pathogenicity) + path_override_filter = self._frequency_override_filter(ht, pathogenicity) filters = [] for pop, freqs in sorted(frequencies.items()): pop_filters = [] - pop_expr = self._ht[self.POPULATION_FIELDS.get(pop, pop)] + pop_expr = ht[self.POPULATION_FIELDS.get(pop, pop)] pop_config = self._format_population_config(self.POPULATIONS[pop]) if freqs.get('af') is not None: af_field = pop_config.get('filter_af') or pop_config['af'] @@ -618,22 +604,23 @@ def _filter_by_frequency(self, frequencies, pathogenicity): filters.append(hl.is_missing(pop_expr) | hl.all(pop_filters)) if filters: - self._ht = self._ht.filter(hl.all(filters)) + ht = ht.filter(hl.all(filters)) + return ht - def _frequency_override_filter(self, pathogenicity): + def _frequency_override_filter(self, ht, pathogenicity): return None - def _filter_by_in_silico(self, in_silico_filters): + def _filter_by_in_silico(self, ht, in_silico_filters): in_silico_filters = in_silico_filters or {} require_score = in_silico_filters.get('requireScore', False) in_silico_filters = {k: v for k, v in in_silico_filters.items() if k in self.PREDICTION_FIELDS_CONFIG and v} if not in_silico_filters: - return + return ht in_silico_qs = [] missing_qs = [] for in_silico, value in in_silico_filters.items(): - score_filter, ht_value = self._get_in_silico_filter(in_silico, value) + score_filter, ht_value = self._get_in_silico_filter(ht, in_silico, value) in_silico_qs.append(score_filter) if not require_score: missing_qs.append(hl.is_missing(ht_value)) @@ -641,41 +628,46 @@ def _filter_by_in_silico(self, in_silico_filters): if missing_qs: in_silico_qs.append(hl.all(missing_qs)) - self._ht = self._ht.filter(hl.any(in_silico_qs)) + return ht.filter(hl.any(in_silico_qs)) - def _get_in_silico_filter(self, in_silico, value): + def _get_in_silico_filter(self, ht, in_silico, value): score_path = self.PREDICTION_FIELDS_CONFIG[in_silico] enum_lookup = self._get_enum_lookup(*score_path[:2]) if enum_lookup is not None: - ht_value = self._ht[score_path.source][f'{score_path.field}_id'] + ht_value = ht[score_path.source][f'{score_path.field}_id'] score_filter = ht_value == enum_lookup[value] else: - ht_value = self._ht[score_path.source][score_path.field] + ht_value = ht[score_path.source][score_path.field] score_filter = ht_value >= float(value) return score_filter, ht_value - def _filter_by_annotations(self, pathogenicity, annotations, annotations_secondary): + def _filter_by_annotations(self, ht, pathogenicity, annotations, annotations_secondary, is_comp_het): annotations = annotations or {} - annotation_override_filters = self._get_annotation_override_filters(annotations, pathogenicity=pathogenicity) + annotation_override_filters = self._get_annotation_override_filters(ht, annotations, pathogenicity=pathogenicity) - annotation_exprs, _ = self._get_allowed_consequences_annotations(annotations, annotation_override_filters) - if self._has_comp_het_search: + annotation_exprs, _ = self._get_allowed_consequences_annotations(ht, annotations, annotation_override_filters) + if is_comp_het or (self._has_comp_het_search and not annotation_exprs): secondary_exprs, allowed_secondary_consequences = self._get_allowed_consequences_annotations( - annotations_secondary or {}, annotation_override_filters, is_secondary=True) + ht, annotations_secondary or {}, annotation_override_filters, is_secondary=True) if secondary_exprs: annotation_exprs.update({f'{k}_secondary': v for k, v in secondary_exprs.items()}) if secondary_exprs or 
allowed_secondary_consequences: self._has_secondary_annotations = True + if self._has_secondary_annotations and not is_comp_het: + # Data type only has annotations for second hit, so no query to be done on the main ht + return None if not annotation_exprs: - return + return ht - self._ht = self._ht.annotate(**annotation_exprs) - annotation_filters = self._get_annotation_filters(self._ht) + self._get_annotation_filters(self._ht, is_secondary=True) - self._ht = self._ht.filter(hl.any(annotation_filters)) + ht = ht.annotate(**annotation_exprs) + annotation_filters = self._get_annotation_filters(ht) + if is_comp_het: + annotation_filters += self._get_annotation_filters(ht, is_secondary=True) + return ht.filter(hl.any(annotation_filters)) - def _get_allowed_consequences_annotations(self, annotations, annotation_filters, is_secondary=False): + def _get_allowed_consequences_annotations(self, ht, annotations, annotation_filters, is_secondary=False): allowed_consequences = { ann for field, anns in annotations.items() if anns and (field not in ANNOTATION_OVERRIDE_FIELDS) for ann in anns @@ -686,7 +678,7 @@ def _get_allowed_consequences_annotations(self, annotations, annotation_filters, transcript_consequence_filter = self._get_transcript_consequence_filter(allowed_consequence_ids, allowed_consequences) has_consequence_filter = transcript_consequence_filter is not None if has_consequence_filter: - allowed_transcripts = self._ht[self.TRANSCRIPTS_FIELD].filter(transcript_consequence_filter) + allowed_transcripts = ht[self.TRANSCRIPTS_FIELD].filter(transcript_consequence_filter) annotation_exprs[ALLOWED_TRANSCRIPTS] = allowed_transcripts if annotation_filters: annotation_exprs[HAS_ANNOTATION_OVERRIDE] = hl.any(annotation_filters) @@ -702,11 +694,12 @@ def _get_transcript_consequence_filter(self, allowed_consequence_ids, allowed_co allowed_consequence_ids = hl.set(allowed_consequence_ids) return lambda gc: allowed_consequence_ids.contains(gc.major_consequence_id) - def _get_annotation_override_filters(self, annotations, **kwargs): + def _get_annotation_override_filters(self, ht, annotations, **kwargs): return [] @staticmethod def _get_annotation_filters(ht, is_secondary=False): + # TODO not needed for anything except comp het search, just directly filter for everything else suffix = '_secondary' if is_secondary else '' annotation_filters = [] @@ -722,9 +715,7 @@ def _get_annotation_filters(ht, is_secondary=False): def _filter_compound_hets(self): # pylint: disable=pointless-string-statement - ch_ht = self._ht - if self._is_recessive_search: - ch_ht = ch_ht.filter(ch_ht.comp_het_family_entries.any(hl.is_defined)) + ch_ht = self._comp_het_ht # Get possible pairs of variants within the same gene ch_ht = ch_ht.annotate(gene_ids=self._gene_ids_expr(ch_ht, comp_het=True)) @@ -813,8 +804,8 @@ def _filter_compound_hets(self): def _filter_grouped_compound_hets(self, ch_ht): # Filter variant pairs for family and genotype - ch_ht = ch_ht.annotate(valid_families=hl.enumerate(ch_ht.v1.comp_het_family_entries).map( - lambda x: self._is_valid_comp_het_family(ch_ht, x[1], ch_ht.v2.comp_het_family_entries[x[0]]) + ch_ht = ch_ht.annotate(valid_families=hl.enumerate(ch_ht.v1.family_entries).map( + lambda x: self._is_valid_comp_het_family(ch_ht, x[1], ch_ht.v2.family_entries[x[0]]) )) ch_ht = ch_ht.filter(ch_ht.valid_families.any(lambda x: x)) ch_ht = ch_ht.select(**{k: self._annotated_comp_het_variant(ch_ht, k) for k in ['v1', 'v2']}) @@ -827,7 +818,7 @@ def _annotated_comp_het_variant(ch_ht, field): return 
variant.annotate( comp_het_gene_ids=ch_ht.comp_het_gene_ids, family_entries=hl.enumerate(ch_ht.valid_families).filter( - lambda x: x[1]).map(lambda x: variant.comp_het_family_entries[x[0]]), + lambda x: x[1]).map(lambda x: variant.family_entries[x[0]]), ) @classmethod @@ -974,7 +965,7 @@ def lookup_variant(self, variant_id, sample_data=None): if sample_data: project_samples, _ = self._parse_sample_data(sample_data) - for pht, _ in self._load_filtered_project_hts(project_samples, skip_all_missing=True): + for pht, _, _ in self._load_filtered_project_hts(project_samples, skip_all_missing=True): project_entries = pht.aggregate(hl.agg.take(hl.struct(**{k: v(pht) for k, v in entry_annotations.items()}), 1)) variant[FAMILY_GUID_FIELD] += project_entries[0][FAMILY_GUID_FIELD] variant[GENOTYPES_FIELD].update(project_entries[0][GENOTYPES_FIELD]) diff --git a/hail_search/queries/mito.py b/hail_search/queries/mito.py index d0c3830bd5..46e5525fd8 100644 --- a/hail_search/queries/mito.py +++ b/hail_search/queries/mito.py @@ -219,19 +219,19 @@ def _get_transcript_consequence_filter(self, allowed_consequence_ids, allowed_co if canonical_consequences else allowed_consequence_ids ).contains) - def _get_annotation_override_filters(self, annotations, pathogenicity=None, **kwargs): + def _get_annotation_override_filters(self, ht, annotations, pathogenicity=None, **kwargs): annotation_filters = [] for key in self.PATHOGENICITY_FILTERS.keys(): path_terms = (pathogenicity or {}).get(key) if path_terms: - annotation_filters.append(self._has_path_expr(path_terms, key)) + annotation_filters.append(self._has_path_expr(ht,path_terms, key)) return annotation_filters - def _frequency_override_filter(self, pathogenicity): + def _frequency_override_filter(self, ht, pathogenicity): path_terms = self._get_clinvar_path_filters(pathogenicity) - return self._has_path_expr(path_terms, CLINVAR_KEY) if path_terms else None + return self._has_path_expr(ht, path_terms, CLINVAR_KEY) if path_terms else None @staticmethod def _get_clinvar_path_filters(pathogenicity): @@ -239,7 +239,7 @@ def _get_clinvar_path_filters(pathogenicity): f for f in (pathogenicity or {}).get(CLINVAR_KEY) or [] if f in CLINVAR_PATH_SIGNIFICANCES } - def _has_path_expr(self, terms, field): + def _has_path_expr(self, ht, terms, field): subfield, range_configs = self.PATHOGENICITY_FILTERS[field] field_name = self.PATHOGENICITY_FIELD_MAP.get(field, field) enum_lookup = self._get_enum_lookup(field_name, subfield) @@ -254,7 +254,7 @@ def _has_path_expr(self, terms, field): ranges.append([None, None]) ranges = [r for r in ranges if r[0] is not None] - value = self._ht[field_name][f'{subfield}_id'] + value = ht[field_name][f'{subfield}_id'] return hl.any(lambda r: (value >= r[0]) & (value <= r[1]), ranges) def _format_results(self, ht, *args, **kwargs): diff --git a/hail_search/queries/multi_data_types.py b/hail_search/queries/multi_data_types.py index 44d74a382b..cee14dbac6 100644 --- a/hail_search/queries/multi_data_types.py +++ b/hail_search/queries/multi_data_types.py @@ -71,7 +71,7 @@ def _filter_data_type_comp_hets(self, variant_ht, variant_families, sv_query): @staticmethod def _family_filtered_ch_ht(ht, overlapped_families, families, key): family_indices = hl.array([families.index(family_guid) for family_guid in overlapped_families]) - ht = ht.annotate(comp_het_family_entries=family_indices.map(lambda i: ht.comp_het_family_entries[i])) + ht = ht.annotate(family_entries=family_indices.map(lambda i: ht.family_entries[i])) return 
ht.group_by('gene_ids').aggregate(**{key: hl.agg.collect(ht.row)}) def _is_valid_comp_het_family(self, ch_ht, entries_1, entries_2): @@ -93,6 +93,9 @@ def _comp_het_entry_has_ref(self, gt1, gt2): def format_search_ht(self): hts = [] + import logging + import time + logger = logging.getLogger(__name__) for data_type, query in self._data_type_queries.items(): dt_ht = query.format_search_ht() if dt_ht is None: @@ -101,6 +104,28 @@ def format_search_ht(self): if merged_sort_expr is not None: dt_ht = dt_ht.annotate(_sort=merged_sort_expr) hts.append(dt_ht.select('_sort', **{data_type: dt_ht.row})) + # start = time.perf_counter() + # logger.info(f'{data_type}: {dt_ht.count()} ({time.perf_counter() - start:0.4f}s)') + """ + Hom-recessive only: + SV_WGS: 0 (5.9890s) + MITO: 0 (2.4309s) + SNV_INDEL: 3 (16.8396s) + + All recessive (with comp het) + SV_WGS: 0 (14.6799s) + MITO: 0 (8.7807s) + SNV_INDEL: 11 (170.8936s) + comp het SV_WGS: 0 (86.7876s) + Actual total: ~304s + + With updates: + SV_WGS: 0 (20.0788s) + MITO: 0 (9.6441s) + SNV_INDEL: 11 (106.1276s) + SV_WGS: 0 (82.6384s) + Actual total: ~217s + """ for data_type, ch_ht in self._comp_het_hts.items(): ch_ht = ch_ht.annotate( @@ -111,6 +136,8 @@ def format_search_ht(self): _sort=hl.sorted([ch_ht.v1._sort, ch_ht.v2._sort])[0], **{f'comp_het_{data_type}': ch_ht.row}, )) + # start = time.perf_counter() + # logger.info(f'comp het {data_type}: {ch_ht.count()} ({time.perf_counter() - start:0.4f}s)') ht = hts[0] for sub_ht in hts[1:]: diff --git a/hail_search/queries/snv_indel.py b/hail_search/queries/snv_indel.py index bcc06d0015..0eac76c7f9 100644 --- a/hail_search/queries/snv_indel.py +++ b/hail_search/queries/snv_indel.py @@ -93,14 +93,14 @@ def _get_gnomad_af_prefilter(self, frequencies=None, pathogenicity=None, **kwarg return 'is_gt_10_percent' if af_cutoff > PREFILTER_FREQ_CUTOFF else True - def _get_annotation_override_filters(self, annotations, *args, **kwargs): - annotation_filters = super()._get_annotation_override_filters(annotations, *args, **kwargs) + def _get_annotation_override_filters(self, ht, annotations, *args, **kwargs): + annotation_filters = super()._get_annotation_override_filters(ht, annotations, *args, **kwargs) if annotations.get(SCREEN_KEY): allowed_consequences = hl.set(self._get_enum_terms_ids(SCREEN_KEY.lower(), 'region_type', annotations[SCREEN_KEY])) - annotation_filters.append(allowed_consequences.contains(self._ht.screen.region_type_ids.first())) + annotation_filters.append(allowed_consequences.contains(ht.screen.region_type_ids.first())) if annotations.get(SPLICE_AI_FIELD): - score_filter, _ = self._get_in_silico_filter(SPLICE_AI_FIELD, annotations[SPLICE_AI_FIELD]) + score_filter, _ = self._get_in_silico_filter(ht, SPLICE_AI_FIELD, annotations[SPLICE_AI_FIELD]) annotation_filters.append(score_filter) return annotation_filters diff --git a/hail_search/queries/snv_indel_37.py b/hail_search/queries/snv_indel_37.py index b52b14f53a..2ef261692b 100644 --- a/hail_search/queries/snv_indel_37.py +++ b/hail_search/queries/snv_indel_37.py @@ -11,6 +11,6 @@ class SnvIndelHailTableQuery37(SnvIndelHailTableQuery): def _should_add_chr_prefix(self): return False - def _get_annotation_override_filters(self, annotations, *args, **kwargs): + def _get_annotation_override_filters(self, ht, annotations, *args, **kwargs): annotations = {k: v for k, v in annotations.items() if k != SCREEN_KEY} - return super()._get_annotation_override_filters(annotations, *args, **kwargs) + return super()._get_annotation_override_filters(ht, annotations, 
*args, **kwargs) diff --git a/hail_search/queries/sv.py b/hail_search/queries/sv.py index 5e363adbfc..f88f722a27 100644 --- a/hail_search/queries/sv.py +++ b/hail_search/queries/sv.py @@ -54,18 +54,18 @@ class SvHailTableQuery(BaseHailTableQuery): def _get_sample_type(cls, *args): return cls.DATA_TYPE.split('_')[-1] - def _filter_annotated_table(self, *args, parsed_intervals=None, exclude_intervals=False, **kwargs): + def _filter_annotated_table(self, ht, *args, parsed_intervals=None, exclude_intervals=False, **kwargs): if parsed_intervals: interval_filter = hl.array(parsed_intervals).any(lambda interval: hl.if_else( - self._ht.start_locus.contig == self._ht.end_locus.contig, - interval.overlaps(hl.interval(self._ht.start_locus, self._ht.end_locus)), - interval.contains(self._ht.start_locus) | interval.contains(self._ht.end_locus), + ht.start_locus.contig == ht.end_locus.contig, + interval.overlaps(hl.interval(ht.start_locus, ht.end_locus)), + interval.contains(ht.start_locus) | interval.contains(ht.end_locus), )) if exclude_intervals: interval_filter = ~interval_filter - self._ht = self._ht.filter(interval_filter) + ht = ht.filter(interval_filter) - return super()._filter_annotated_table(*args, **kwargs) + return super()._filter_annotated_table(ht, *args, **kwargs) def _get_family_passes_quality_filter(self, quality_filter, annotations=None, **kwargs): passes_quality = super()._get_family_passes_quality_filter(quality_filter) @@ -78,18 +78,18 @@ def _get_family_passes_quality_filter(self, quality_filter, annotations=None, ** return lambda entries: entries_has_new_call(entries) & passes_quality(entries) - def _get_allowed_consequences_annotations(self, annotations, annotation_filters, is_secondary=False): + def _get_allowed_consequences_annotations(self, ht, annotations, annotation_filters, is_secondary=False): if is_secondary: # SV search can specify secondary SV types, as well as secondary consequences - annotation_filters = self._get_annotation_override_filters(annotations) - return super()._get_allowed_consequences_annotations(annotations, annotation_filters) + annotation_filters = self._get_annotation_override_filters(ht, annotations) + return super()._get_allowed_consequences_annotations(ht, annotations, annotation_filters) - def _get_annotation_override_filters(self, annotations, **kwargs): + def _get_annotation_override_filters(self, ht, annotations, **kwargs): annotation_filters = [] if annotations.get(STRUCTURAL_ANNOTATION_FIELD): allowed_type_ids = self.get_allowed_sv_type_ids(annotations[STRUCTURAL_ANNOTATION_FIELD]) if allowed_type_ids: - annotation_filters.append(hl.set(allowed_type_ids).contains(self._ht.sv_type_id)) + annotation_filters.append(hl.set(allowed_type_ids).contains(ht.sv_type_id)) return annotation_filters diff --git a/hail_search/web_app.py b/hail_search/web_app.py index 4d091df91c..fa171b8a18 100644 --- a/hail_search/web_app.py +++ b/hail_search/web_app.py @@ -65,7 +65,7 @@ async def init_web_app(): spark_conf['spark.driver.memory'] = f'{int((int(MACHINE_MEM)-11)*JVM_MEMORY_FRACTION)}g' if JAVA_OPTS_XSS: spark_conf.update({f'spark.{field}.extraJavaOptions': f'-Xss{JAVA_OPTS_XSS}' for field in ['driver', 'executor']}) - hl.init(idempotent=True, spark_conf=spark_conf or None) + hl.init(idempotent=True, spark_conf=spark_conf or None, backend='local') load_globals() app = web.Application(middlewares=[error_middleware], client_max_size=(1024**2)*10) app.add_routes([ From 414f87d790836cb57a73dcb9cf2cee9650bce6f2 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 12 
Feb 2024 00:23:11 -0500 Subject: [PATCH 2/7] fix bugs --- hail_search/queries/base.py | 12 ++++-------- hail_search/queries/gcnv.py | 9 ++++----- hail_search/queries/multi_data_types.py | 2 ++ 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index b449d58b48..d32a6d73ea 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -212,10 +212,6 @@ def __init__(self, sample_data, sort=XPOS, sort_metadata=None, num_results=100, if sample_data: self._load_filtered_table(sample_data, inheritance_mode=inheritance_mode, **kwargs) - @property - def _is_recessive_search(self): - return self._inheritance_mode == RECESSIVE - @property def _has_comp_het_search(self): return self._inheritance_mode in {RECESSIVE, COMPOUND_HET} @@ -306,9 +302,9 @@ def import_filtered_table(self, sample_data, intervals=None, **kwargs): entry_type = main_ht.family_entries.dtype.element_type for project_ht, comp_het_project_ht, num_project_families in filtered_project_hts[1:]: if families_ht is not None: - families_ht = _add_project_ht(self, families_ht, project_ht, entry_type) + families_ht = self._add_project_ht(self, families_ht, project_ht, entry_type) if comp_het_families_ht is not None: - comp_het_families_ht = _add_project_ht(self, comp_het_families_ht, comp_het_project_ht, entry_type) + comp_het_families_ht = self._add_project_ht(self, comp_het_families_ht, comp_het_project_ht, entry_type) num_families += num_project_families # TODO add pre-processing for annotations so do not even read in tables if not going to have vaild annotations @@ -718,7 +714,7 @@ def _filter_compound_hets(self): ch_ht = self._comp_het_ht # Get possible pairs of variants within the same gene - ch_ht = ch_ht.annotate(gene_ids=self._gene_ids_expr(ch_ht, comp_het=True)) + ch_ht = ch_ht.annotate(gene_ids=self._gene_ids_expr(ch_ht)) ch_ht = ch_ht.explode(ch_ht.gene_ids) # Filter allowed transcripts to the grouped gene @@ -822,7 +818,7 @@ def _annotated_comp_het_variant(ch_ht, field): ) @classmethod - def _gene_ids_expr(cls, ht, comp_het=False): + def _gene_ids_expr(cls, ht): return hl.set(ht[cls.TRANSCRIPTS_FIELD].map(lambda t: t.gene_id)) def _is_valid_comp_het_family(self, ch_ht, entries_1, entries_2): diff --git a/hail_search/queries/gcnv.py b/hail_search/queries/gcnv.py index 3ac38ef235..868020e3fe 100644 --- a/hail_search/queries/gcnv.py +++ b/hail_search/queries/gcnv.py @@ -62,10 +62,10 @@ class GcnvHailTableQuery(SvHailTableQuery): POPULATIONS = {k: v for k, v in SvHailTableQuery.POPULATIONS.items() if k != 'gnomad_svs'} @classmethod - def _get_genotype_override_field(cls, r, field, family_entries_field=None): + def _get_genotype_override_field(cls, r, field): agg, get_default = cls.GENOTYPE_OVERRIDE_FIELDS[field] sample_field = f'sample_{field}' - entries = r[family_entries_field or 'family_entries'].flatmap(lambda x: x) + entries = r.family_entries.flatmap(lambda x: x) return hl.if_else( entries.any(lambda g: hl.is_defined(g.GT) & hl.is_missing(g[sample_field])), get_default(r), agg(entries.map(lambda g: g[sample_field])) @@ -85,10 +85,9 @@ def get_allowed_sv_type_ids(self, sv_types): ]) @classmethod - def _gene_ids_expr(cls, ht, comp_het=False): - family_entries_field = 'comp_het_family_entries' if comp_het else None + def _gene_ids_expr(cls, ht): return hl.or_else( - cls._get_genotype_override_field(ht, 'gene_ids', family_entries_field=family_entries_field), + cls._get_genotype_override_field(ht, 'gene_ids'), super()._gene_ids_expr(ht), ) diff 
--git a/hail_search/queries/multi_data_types.py b/hail_search/queries/multi_data_types.py index cee14dbac6..01420934e9 100644 --- a/hail_search/queries/multi_data_types.py +++ b/hail_search/queries/multi_data_types.py @@ -70,6 +70,7 @@ def _filter_data_type_comp_hets(self, variant_ht, variant_families, sv_query): @staticmethod def _family_filtered_ch_ht(ht, overlapped_families, families, key): + # TODO only remap families if different family_indices = hl.array([families.index(family_guid) for family_guid in overlapped_families]) ht = ht.annotate(family_entries=family_indices.map(lambda i: ht.family_entries[i])) return ht.group_by('gene_ids').aggregate(**{key: hl.agg.collect(ht.row)}) @@ -125,6 +126,7 @@ def format_search_ht(self): SNV_INDEL: 11 (106.1276s) SV_WGS: 0 (82.6384s) Actual total: ~217s + (actual-actual: 244.699374) """ for data_type, ch_ht in self._comp_het_hts.items(): From f14e486325ea849b409884a73b024fcbdff154d1 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 12 Feb 2024 01:09:59 -0500 Subject: [PATCH 3/7] fix multi project comp het search --- hail_search/queries/base.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index d32a6d73ea..969367c730 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -302,9 +302,13 @@ def import_filtered_table(self, sample_data, intervals=None, **kwargs): entry_type = main_ht.family_entries.dtype.element_type for project_ht, comp_het_project_ht, num_project_families in filtered_project_hts[1:]: if families_ht is not None: - families_ht = self._add_project_ht(self, families_ht, project_ht, entry_type) + families_ht = self._add_project_ht(families_ht, project_ht, default=hl.empty_array(entry_type)) if comp_het_families_ht is not None: - comp_het_families_ht = self._add_project_ht(self, comp_het_families_ht, comp_het_project_ht, entry_type) + comp_het_families_ht = self._add_project_ht( + comp_het_families_ht, comp_het_project_ht, + default=hl.range(num_families).map(lambda i: hl.missing(entry_type)), + default_1=hl.range(num_project_families).map(lambda i: hl.missing(entry_type)), + ) num_families += num_project_families # TODO add pre-processing for annotations so do not even read in tables if not going to have vaild annotations @@ -317,20 +321,22 @@ def import_filtered_table(self, sample_data, intervals=None, **kwargs): ht = self._query_table_annotations(families_ht, self._get_table_path('annotations.ht')) self._ht = self._filter_annotated_table(ht, **kwargs) - def _add_project_ht(self, families_ht, project_ht, entry_type): + def _add_project_ht(self, families_ht, project_ht, default, default_1=None): + if default_1 is None: + default_1 = default + families_ht = families_ht.join(project_ht, how='outer') families_ht = families_ht.select_globals( family_guids=families_ht.family_guids.extend(families_ht.family_guids_1) ) - select_fields = { - 'filters': families_ht.filters.union(families_ht.filters_1), - 'family_entries': hl.bind( + return families_ht.select( + filters=families_ht.filters.union(families_ht.filters_1), + family_entries=hl.bind( lambda a1, a2: a1.extend(a2), - hl.or_else(families_ht.family_entries, hl.empty_array(entry_type)), - hl.or_else(families_ht.family_entries_1, hl.empty_array(entry_type)), + hl.or_else(families_ht.family_entries, default), + hl.or_else(families_ht.family_entries_1, default_1), ), - } - return families_ht.select(**select_fields) + ) def _filter_entries_table(self, ht, 
sample_data, inheritance_mode=None, inheritance_filter=None, quality_filter=None, **kwargs): @@ -567,6 +573,7 @@ def _should_add_chr_prefix(self): return True def _filter_by_frequency(self, ht, frequencies, pathogenicity): + # TODO do not filter if af == 1 frequencies = {k: v for k, v in (frequencies or {}).items() if k in self.POPULATIONS} if not frequencies: return ht @@ -642,6 +649,8 @@ def _filter_by_annotations(self, ht, pathogenicity, annotations, annotations_sec annotations = annotations or {} annotation_override_filters = self._get_annotation_override_filters(ht, annotations, pathogenicity=pathogenicity) + # TODO confirm primary and secondary annotations are actually different before annotating etc - + # ignore empty arrays and data-type specific fields annotation_exprs, _ = self._get_allowed_consequences_annotations(ht, annotations, annotation_override_filters) if is_comp_het or (self._has_comp_het_search and not annotation_exprs): secondary_exprs, allowed_secondary_consequences = self._get_allowed_consequences_annotations( From e5cb704dc26f81d43e4d0c6229ae0df8a59eede3 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 12 Feb 2024 01:47:18 -0500 Subject: [PATCH 4/7] fix no inheritance search --- hail_search/queries/base.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index 969367c730..27e947659f 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -432,10 +432,13 @@ def _filter_inheritance(self, ht, inheritance_mode, inheritance_filter, sorted_f ht, COMPOUND_HET, inheritance_filter, sorted_family_sample_data, ) - if (inheritance_filter or inheritance_mode) and not is_any_affected: - ht = None if inheritance_mode == COMPOUND_HET else self._filter_families_inheritance( - ht, inheritance_mode, inheritance_filter, sorted_family_sample_data, - ) + if is_any_affected or not (inheritance_filter and inheritance_mode): + # No sample-specific inheritance filtering needed + sorted_family_sample_data = [] + + ht = None if inheritance_mode == COMPOUND_HET else self._filter_families_inheritance( + ht, inheritance_mode, inheritance_filter, sorted_family_sample_data, + ) return ht, comp_het_ht @@ -650,7 +653,9 @@ def _filter_by_annotations(self, ht, pathogenicity, annotations, annotations_sec annotation_override_filters = self._get_annotation_override_filters(ht, annotations, pathogenicity=pathogenicity) # TODO confirm primary and secondary annotations are actually different before annotating etc - - # ignore empty arrays and data-type specific fields + # ignore empty arrays and data-type specific fields from other data types and different sorts + # Run _get_allowed_consequence_ids on both before loading to determine if different + # also check diff overrides somehow annotation_exprs, _ = self._get_allowed_consequences_annotations(ht, annotations, annotation_override_filters) if is_comp_het or (self._has_comp_het_search and not annotation_exprs): secondary_exprs, allowed_secondary_consequences = self._get_allowed_consequences_annotations( From d624bec8aa2b1e1e9fd603027dc22de2ce9ff758 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 12 Feb 2024 02:05:29 -0500 Subject: [PATCH 5/7] oop --- hail_search/queries/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index 27e947659f..a274dc3d05 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -432,7 +432,7 @@ def 
_filter_inheritance(self, ht, inheritance_mode, inheritance_filter, sorted_f ht, COMPOUND_HET, inheritance_filter, sorted_family_sample_data, ) - if is_any_affected or not (inheritance_filter and inheritance_mode): + if is_any_affected or not (inheritance_filter or inheritance_mode): # No sample-specific inheritance filtering needed sorted_family_sample_data = [] @@ -654,8 +654,8 @@ def _filter_by_annotations(self, ht, pathogenicity, annotations, annotations_sec # TODO confirm primary and secondary annotations are actually different before annotating etc - # ignore empty arrays and data-type specific fields from other data types and different sorts - # Run _get_allowed_consequence_ids on both before loading to determine if different - # also check diff overrides somehow + # Run _get_allowed_consequence_ids on both before loading to determine if different + # also check diff overrides somehow annotation_exprs, _ = self._get_allowed_consequences_annotations(ht, annotations, annotation_override_filters) if is_comp_het or (self._has_comp_het_search and not annotation_exprs): secondary_exprs, allowed_secondary_consequences = self._get_allowed_consequences_annotations( From fe1c7dc540153f59f887e44823bf95e8ee85b09a Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 12 Feb 2024 02:08:59 -0500 Subject: [PATCH 6/7] clean up --- hail_search/queries/base.py | 7 ------ hail_search/queries/multi_data_types.py | 30 ------------------------- hail_search/web_app.py | 2 +- 3 files changed, 1 insertion(+), 38 deletions(-) diff --git a/hail_search/queries/base.py b/hail_search/queries/base.py index a274dc3d05..bd8364ce3e 100644 --- a/hail_search/queries/base.py +++ b/hail_search/queries/base.py @@ -311,7 +311,6 @@ def import_filtered_table(self, sample_data, intervals=None, **kwargs): ) num_families += num_project_families - # TODO add pre-processing for annotations so do not even read in tables if not going to have vaild annotations if comp_het_families_ht is not None: comp_het_ht = self._query_table_annotations(comp_het_families_ht, self._get_table_path('annotations.ht')) self._comp_het_ht = self._filter_annotated_table(comp_het_ht, is_comp_het=True, **kwargs) @@ -576,7 +575,6 @@ def _should_add_chr_prefix(self): return True def _filter_by_frequency(self, ht, frequencies, pathogenicity): - # TODO do not filter if af == 1 frequencies = {k: v for k, v in (frequencies or {}).items() if k in self.POPULATIONS} if not frequencies: return ht @@ -652,10 +650,6 @@ def _filter_by_annotations(self, ht, pathogenicity, annotations, annotations_sec annotations = annotations or {} annotation_override_filters = self._get_annotation_override_filters(ht, annotations, pathogenicity=pathogenicity) - # TODO confirm primary and secondary annotations are actually different before annotating etc - - # ignore empty arrays and data-type specific fields from other data types and different sorts - # Run _get_allowed_consequence_ids on both before loading to determine if different - # also check diff overrides somehow annotation_exprs, _ = self._get_allowed_consequences_annotations(ht, annotations, annotation_override_filters) if is_comp_het or (self._has_comp_het_search and not annotation_exprs): secondary_exprs, allowed_secondary_consequences = self._get_allowed_consequences_annotations( @@ -709,7 +703,6 @@ def _get_annotation_override_filters(self, ht, annotations, **kwargs): @staticmethod def _get_annotation_filters(ht, is_secondary=False): - # TODO not needed for anything except comp het search, just directly filter for 
everything else suffix = '_secondary' if is_secondary else '' annotation_filters = [] diff --git a/hail_search/queries/multi_data_types.py b/hail_search/queries/multi_data_types.py index 01420934e9..dd950954f0 100644 --- a/hail_search/queries/multi_data_types.py +++ b/hail_search/queries/multi_data_types.py @@ -70,7 +70,6 @@ def _filter_data_type_comp_hets(self, variant_ht, variant_families, sv_query): @staticmethod def _family_filtered_ch_ht(ht, overlapped_families, families, key): - # TODO only remap families if different family_indices = hl.array([families.index(family_guid) for family_guid in overlapped_families]) ht = ht.annotate(family_entries=family_indices.map(lambda i: ht.family_entries[i])) return ht.group_by('gene_ids').aggregate(**{key: hl.agg.collect(ht.row)}) @@ -94,9 +93,6 @@ def _comp_het_entry_has_ref(self, gt1, gt2): def format_search_ht(self): hts = [] - import logging - import time - logger = logging.getLogger(__name__) for data_type, query in self._data_type_queries.items(): dt_ht = query.format_search_ht() if dt_ht is None: @@ -105,30 +101,6 @@ def format_search_ht(self): if merged_sort_expr is not None: dt_ht = dt_ht.annotate(_sort=merged_sort_expr) hts.append(dt_ht.select('_sort', **{data_type: dt_ht.row})) - # start = time.perf_counter() - # logger.info(f'{data_type}: {dt_ht.count()} ({time.perf_counter() - start:0.4f}s)') - """ - Hom-recessive only: - SV_WGS: 0 (5.9890s) - MITO: 0 (2.4309s) - SNV_INDEL: 3 (16.8396s) - - All recessive (with comp het) - SV_WGS: 0 (14.6799s) - MITO: 0 (8.7807s) - SNV_INDEL: 11 (170.8936s) - comp het SV_WGS: 0 (86.7876s) - Actual total: ~304s - - With updates: - SV_WGS: 0 (20.0788s) - MITO: 0 (9.6441s) - SNV_INDEL: 11 (106.1276s) - SV_WGS: 0 (82.6384s) - Actual total: ~217s - (actual-actual: 244.699374) - """ - for data_type, ch_ht in self._comp_het_hts.items(): ch_ht = ch_ht.annotate( v1=self._format_comp_het_result(ch_ht.v1, SNV_INDEL_DATA_TYPE), @@ -138,8 +110,6 @@ def format_search_ht(self): _sort=hl.sorted([ch_ht.v1._sort, ch_ht.v2._sort])[0], **{f'comp_het_{data_type}': ch_ht.row}, )) - # start = time.perf_counter() - # logger.info(f'comp het {data_type}: {ch_ht.count()} ({time.perf_counter() - start:0.4f}s)') ht = hts[0] for sub_ht in hts[1:]: diff --git a/hail_search/web_app.py b/hail_search/web_app.py index fa171b8a18..4d091df91c 100644 --- a/hail_search/web_app.py +++ b/hail_search/web_app.py @@ -65,7 +65,7 @@ async def init_web_app(): spark_conf['spark.driver.memory'] = f'{int((int(MACHINE_MEM)-11)*JVM_MEMORY_FRACTION)}g' if JAVA_OPTS_XSS: spark_conf.update({f'spark.{field}.extraJavaOptions': f'-Xss{JAVA_OPTS_XSS}' for field in ['driver', 'executor']}) - hl.init(idempotent=True, spark_conf=spark_conf or None, backend='local') + hl.init(idempotent=True, spark_conf=spark_conf or None) load_globals() app = web.Application(middlewares=[error_middleware], client_max_size=(1024**2)*10) app.add_routes([ From 5cbe0d440f9906df9b4b3299a74294f6445f3fb7 Mon Sep 17 00:00:00 2001 From: Hana Snow Date: Mon, 12 Feb 2024 02:18:59 -0500 Subject: [PATCH 7/7] clean up --- hail_search/queries/multi_data_types.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hail_search/queries/multi_data_types.py b/hail_search/queries/multi_data_types.py index dd950954f0..f0ee5be221 100644 --- a/hail_search/queries/multi_data_types.py +++ b/hail_search/queries/multi_data_types.py @@ -101,6 +101,7 @@ def format_search_ht(self): if merged_sort_expr is not None: dt_ht = dt_ht.annotate(_sort=merged_sort_expr) hts.append(dt_ht.select('_sort', **{data_type: 
dt_ht.row}))
+
         for data_type, ch_ht in self._comp_het_hts.items():
             ch_ht = ch_ht.annotate(
                 v1=self._format_comp_het_result(ch_ht.v1, SNV_INDEL_DATA_TYPE),
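The series above keeps candidate compound hets in their own Hail table and only ever pairs variants that share a gene before validating each pair per family. A rough, self-contained sketch of that grouping-and-pairing idea follows (toy rows and illustrative field names, not code from these patches; the real queries also check per-family genotype entries and annotations before a pair is returned):

import hail as hl

# Toy variant table: each row lists the genes the variant overlaps.
ht = hl.Table.parallelize(
    [
        {'variant_id': 'v1', 'gene_ids': ['GENE_A']},
        {'variant_id': 'v2', 'gene_ids': ['GENE_A', 'GENE_B']},
        {'variant_id': 'v3', 'gene_ids': ['GENE_B']},
    ],
    hl.tstruct(variant_id=hl.tstr, gene_ids=hl.tarray(hl.tstr)),
)

# One row per (variant, gene), then collect the variants that share each gene.
ht = ht.explode(ht.gene_ids)
grouped = ht.group_by('gene_ids').aggregate(variants=hl.agg.collect(ht.row))

# Enumerate unordered pairs of distinct variants within a gene - the candidate
# compound-het pairs that a dedicated comp-het table would then filter per family.
pairs = grouped.select(pairs=hl.range(hl.len(grouped.variants)).flatmap(
    lambda i: hl.range(i + 1, hl.len(grouped.variants)).map(
        lambda j: hl.struct(v1=grouped.variants[i].variant_id, v2=grouped.variants[j].variant_id),
    ),
))
pairs = pairs.explode(pairs.pairs)
pairs.show()  # expect (v1, v2) for GENE_A and (v2, v3) for GENE_B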