Skip to content

Commit

Permalink
Merge pull request #3575 from broadinstitute/hail-backend-sv-wes
Browse files Browse the repository at this point in the history
Hail backend - SV WES
  • Loading branch information
hanars authored Sep 8, 2023
2 parents 48143e0 + 9b8eddd commit e944207
Show file tree
Hide file tree
Showing 58 changed files with 463 additions and 31 deletions.
Binary file not shown.
Binary file not shown.
Binary file not shown.
3 changes: 3 additions & 0 deletions hail_search/fixtures/GRCh38/SV_WES/annotations.ht/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.109-b71b065e4bb6
Created at 2023/08/24 17:42:53
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.109-b71b065e4bb6
Created at 2023/08/25 10:47:38
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.109-b71b065e4bb6
Created at 2023/08/25 11:56:33
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
125 changes: 106 additions & 19 deletions hail_search/hail_search_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,10 @@ class BaseHailTableQuery(object):
HAS_ALT: lambda gt: gt.is_non_ref(),
HAS_REF: lambda gt: gt.is_hom_ref() | gt.is_het_ref(),
}
MISSING_NUM_ALT = -1

GENOTYPE_FIELDS = {}
NESTED_GENOTYPE_FIELDS = {}
COMPUTED_GENOTYPE_FIELDS = {}
GENOTYPE_QUERY_FIELDS = {}
QUALITY_FILTER_FORMAT = {}
POPULATIONS = {}
Expand All @@ -63,7 +64,7 @@ class BaseHailTableQuery(object):
'response_key': 'transcripts',
'empty_array': True,
'format_value': lambda value: value.rename({k: _to_camel_case(k) for k in value.keys()}),
'format_array_values': lambda values: values.group_by(lambda t: t.geneId),
'format_array_values': lambda values, *args: values.group_by(lambda t: t.geneId),
},
}
LIFTOVER_ANNOTATION_FIELDS = {
Expand Down Expand Up @@ -97,9 +98,9 @@ def annotation_fields(self):
lambda gt: hl.is_defined(gt.individualGuid)
).group_by(lambda x: x.individualGuid).map_values(lambda x: x[0].select(
'sampleId', 'individualGuid', 'familyGuid',
numAlt=hl.if_else(hl.is_defined(x[0].GT), x[0].GT.n_alt_alleles(), -1),
numAlt=hl.if_else(hl.is_defined(x[0].GT), x[0].GT.n_alt_alleles(), self.MISSING_NUM_ALT),
**{k: x[0][field] for k, field in self.GENOTYPE_FIELDS.items()},
**{_to_camel_case(k): x[0][field][k] for field, v in self.NESTED_GENOTYPE_FIELDS.items() for k in v},
**{_to_camel_case(k): v(x[0], k, r) for k, v in self.COMPUTED_GENOTYPE_FIELDS.items()},
)),
'populations': lambda r: hl.struct(**{
population: self.population_expression(r, population) for population in self.POPULATIONS.keys()
Expand Down Expand Up @@ -144,7 +145,7 @@ def _get_enum_lookup(self, field, subfield):

def _get_enum_terms_ids(self, field, subfield, terms):
enum = self._get_enum_lookup(field, subfield)
return {enum[t] for t in terms if enum.get(t)}
return {enum[t] for t in terms if enum.get(t) is not None}

def _format_enum_response(self, k, enum):
enum_config = self.ENUM_ANNOTATION_FIELDS.get(k, {})
Expand All @@ -162,7 +163,7 @@ def _format_enum(cls, r, field, enum, empty_array=False, format_array_values=Non
value = hl.or_else(value, hl.empty_array(value.dtype.element_type))
value = value.map(lambda x: cls._enum_field(x, enum, **kwargs))
if format_array_values:
value = format_array_values(value)
value = format_array_values(value, r)
return value

return cls._enum_field(value, enum, **kwargs)
Expand Down Expand Up @@ -365,7 +366,7 @@ def _add_entry_sample_families(cls, ht, sample_data):

ht = ht.transmute(
family_entries=family_sample_indices.map(lambda sample_indices: sample_indices.map(
lambda i: hl.or_else(ht.entries[i], cls._missing_entry(ht.entries[i])).annotate(
lambda i: ht.entries[i].annotate(
sampleId=sample_index_id_map.get(i),
individualGuid=sample_index_individual_map.get(i),
familyGuid=sample_index_family_map.get(i),
Expand All @@ -376,11 +377,6 @@ def _add_entry_sample_families(cls, ht, sample_data):

return ht, sample_id_family_index_map, num_families

@classmethod
def _missing_entry(cls, entry):
entry_type = dict(**entry.dtype)
return hl.struct(**{k: hl.missing(v) for k, v in entry_type.items()})

def _filter_inheritance(self, ht, inheritance_mode, inheritance_filter, sample_data, sample_id_family_index_map):
any_valid_entry = lambda x: self.GENOTYPE_QUERY_MAP[HAS_ALT](x.GT)

Expand Down Expand Up @@ -662,7 +658,7 @@ def _filter_compound_hets(self):
ch_ht = ch_ht.filter(ch_ht.comp_het_family_entries.any(hl.is_defined))

# Get possible pairs of variants within the same gene
ch_ht = ch_ht.annotate(gene_ids=self._gene_ids_expr(ch_ht))
ch_ht = ch_ht.annotate(gene_ids=self._gene_ids_expr(ch_ht, comp_het=True))
ch_ht = ch_ht.explode(ch_ht.gene_ids)
formatted_rows_expr = hl.agg.collect(ch_ht.row)
if HAS_ALLOWED_SECONDARY_ANNOTATION in self._ht.row:
Expand Down Expand Up @@ -697,7 +693,7 @@ def _filter_compound_hets(self):
return ch_ht

@classmethod
def _gene_ids_expr(cls, ht):
def _gene_ids_expr(cls, ht, comp_het=False):
return hl.set(ht[cls.TRANSCRIPTS_FIELD].map(lambda t: t.gene_id))

def _is_valid_comp_het_family(self, entries_1, entries_2):
Expand Down Expand Up @@ -1060,7 +1056,9 @@ class SvHailTableQuery(BaseHailTableQuery):
DATA_TYPE = 'SV_WGS'

GENOTYPE_FIELDS = {_to_camel_case(f): f for f in ['CN', 'GQ']}
NESTED_GENOTYPE_FIELDS = {'concordance': ['new_call', 'prev_call', 'prev_num_alt']}
COMPUTED_GENOTYPE_FIELDS = {
k: lambda entry, field, *args: entry.concordance[field] for k in ['new_call', 'prev_call', 'prev_num_alt']
}
GENOTYPE_QUERY_FIELDS = {'gq_sv': 'GQ', 'gq': None}

TRANSCRIPTS_FIELD = 'sorted_gene_consequences'
Expand All @@ -1069,7 +1067,6 @@ class SvHailTableQuery(BaseHailTableQuery):
BASE_ANNOTATION_FIELDS = {
'bothsidesSupport': lambda r: r.bothsides_support,
'chrom': lambda r: r.start_locus.contig.replace('^chr', ''),
'endChrom': lambda r: hl.or_missing(r.start_locus.contig != r.end_locus.contig, r.end_locus.contig.replace('^chr', '')),
'pos': lambda r: r.start_locus.position,
'end': lambda r: r.end_locus.position,
'rg37LocusEnd': lambda r: hl.or_missing(
Expand Down Expand Up @@ -1144,16 +1141,19 @@ def _get_consequence_filter(self, allowed_consequence_ids, annotation_exprs):
def _get_annotation_override_filters(self, annotations, **kwargs):
annotation_filters = []
if annotations.get(STRUCTURAL_ANNOTATION_FIELD):
allowed_type_ids = self._get_enum_terms_ids('sv_type', None, annotations[STRUCTURAL_ANNOTATION_FIELD])
allowed_type_ids = self.get_allowed_sv_type_ids(annotations[STRUCTURAL_ANNOTATION_FIELD])
if allowed_type_ids:
annotation_filters.append(hl.set(allowed_type_ids).contains(self._ht.sv_type_id))

return annotation_filters

def get_allowed_sv_type_ids(self, sv_types):
return self._get_enum_terms_ids('sv_type', None, sv_types)

def _additional_annotation_fields(self):
sv_type_enum = self._enums['sv_type']
insertion_type_id = sv_type_enum.index('INS')
get_end_chrom = self.BASE_ANNOTATION_FIELDS['endChrom']
get_end_chrom = lambda r: hl.or_missing(r.start_locus.contig != r.end_locus.contig, r.end_locus.contig.replace('^chr', ''))
return {
'cpxIntervals': lambda r: self._format_enum(
r, 'cpx_intervals', {'type': sv_type_enum}, annotate_value=lambda val, *args: {
Expand All @@ -1171,4 +1171,91 @@ def _additional_annotation_fields(self):
}


QUERY_CLASS_MAP = {cls.DATA_TYPE: cls for cls in [VariantHailTableQuery, SvHailTableQuery]}
class GcnvHailTableQuery(SvHailTableQuery):

DATA_TYPE = 'SV_WES'

# gCNV data has no ref/ref calls so a missing entry indicates ref/ref
GENOTYPE_QUERY_MAP = {
**BaseHailTableQuery.GENOTYPE_QUERY_MAP,
REF_REF: hl.is_missing,
HAS_REF: lambda gt: hl.is_missing(gt) | gt.is_het_ref(),
HAS_ALT: hl.is_defined,
COMP_HET_ALT: hl.is_defined,
}
MISSING_NUM_ALT = 0

GENOTYPE_FIELDS = {
**SvHailTableQuery.GENOTYPE_FIELDS,
**{f.lower(): f for f in ['QS', 'defragged']},
}
del GENOTYPE_FIELDS['gq']
GENOTYPE_QUERY_FIELDS = {}
GENOTYPE_OVERRIDE_FIELDS = {
'start': (hl.min, lambda r: r.start_locus.position),
'end': (hl.max, lambda r: r.end_locus.position),
'num_exon': (hl.max, lambda r: r.num_exon),
'gene_ids': (
lambda entry_gene_ids: entry_gene_ids.fold(lambda s1, s2: s1.union(s2), hl.empty_set(hl.tstr)),
lambda r: hl.missing(hl.tset(hl.tstr)),
),
}
COMPUTED_GENOTYPE_FIELDS = {
**SvHailTableQuery.COMPUTED_GENOTYPE_FIELDS,
**{k: lambda entry, field, r: hl.or_missing(
hl.is_missing(r[field]) | (r[field] != entry[f'sample_{field}']), entry[f'sample_{field}']
) for k in GENOTYPE_OVERRIDE_FIELDS.keys()},
}
COMPUTED_GENOTYPE_FIELDS['prev_overlap'] = COMPUTED_GENOTYPE_FIELDS.pop('prev_num_alt')

CORE_FIELDS = BaseHailTableQuery.CORE_FIELDS
BASE_ANNOTATION_FIELDS = {
**SvHailTableQuery.BASE_ANNOTATION_FIELDS,
'pos': lambda r: r.start,
'end': lambda r: r.end,
'numExon': lambda r: r.num_exon,
}
del BASE_ANNOTATION_FIELDS['bothsidesSupport']

TRANSCRIPTS_ENUM_FIELD = SvHailTableQuery.ENUM_ANNOTATION_FIELDS[SvHailTableQuery.TRANSCRIPTS_FIELD]
ENUM_ANNOTATION_FIELDS = {SvHailTableQuery.TRANSCRIPTS_FIELD: {
**TRANSCRIPTS_ENUM_FIELD,
'format_array_values': lambda values, r: GcnvHailTableQuery.TRANSCRIPTS_ENUM_FIELD['format_array_values'](
hl.if_else(hl.is_missing(r.gene_ids), values, values.filter(lambda t: r.gene_ids.contains(t.geneId))), r,
),
}}

POPULATIONS = {k: v for k, v in SvHailTableQuery.POPULATIONS.items() if k != 'gnomad_svs'}

@classmethod
def _get_genotype_override_field(cls, r, field, family_entries_field=None):
agg, get_default = cls.GENOTYPE_OVERRIDE_FIELDS[field]
sample_field = f'sample_{field}'
entries = r[family_entries_field or 'family_entries'].flatmap(lambda x: x)
return hl.if_else(
entries.any(lambda g: hl.is_defined(g.GT) & hl.is_missing(g[sample_field])),
get_default(r), agg(entries.map(lambda g: g[sample_field]))
)

def _format_results(self, ht, annotation_fields):
ht = ht.annotate(**{k: self._get_genotype_override_field(ht, k) for k in self.GENOTYPE_OVERRIDE_FIELDS})
return super()._format_results(ht, annotation_fields)

def get_allowed_sv_type_ids(self, sv_types):
return super().get_allowed_sv_type_ids([
type.replace('gCNV_', '') for type in sv_types if type.startswith('gCNV_')
])

@classmethod
def _gene_ids_expr(cls, ht, comp_het=False):
family_entries_field = 'comp_het_family_entries' if comp_het else None
return hl.or_else(
cls._get_genotype_override_field(ht, 'gene_ids', family_entries_field=family_entries_field),
super()._gene_ids_expr(ht),
)

def _additional_annotation_fields(self):
return {}


QUERY_CLASS_MAP = {cls.DATA_TYPE: cls for cls in [VariantHailTableQuery, SvHailTableQuery, GcnvHailTableQuery]}
Loading

0 comments on commit e944207

Please sign in to comment.