Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hail backend - SV WES #3575

Merged
merged 20 commits into from
Sep 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
Binary file not shown.
Binary file not shown.
3 changes: 3 additions & 0 deletions hail_search/fixtures/GRCh38/SV_WES/annotations.ht/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.109-b71b065e4bb6
Created at 2023/08/24 17:42:53
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.109-b71b065e4bb6
Created at 2023/08/25 10:47:38
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.109-b71b065e4bb6
Created at 2023/08/25 11:56:33
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
125 changes: 106 additions & 19 deletions hail_search/hail_search_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,10 @@ class BaseHailTableQuery(object):
HAS_ALT: lambda gt: gt.is_non_ref(),
HAS_REF: lambda gt: gt.is_hom_ref() | gt.is_het_ref(),
}
MISSING_NUM_ALT = -1

GENOTYPE_FIELDS = {}
NESTED_GENOTYPE_FIELDS = {}
COMPUTED_GENOTYPE_FIELDS = {}
GENOTYPE_QUERY_FIELDS = {}
QUALITY_FILTER_FORMAT = {}
POPULATIONS = {}
Expand All @@ -63,7 +64,7 @@ class BaseHailTableQuery(object):
'response_key': 'transcripts',
'empty_array': True,
'format_value': lambda value: value.rename({k: _to_camel_case(k) for k in value.keys()}),
'format_array_values': lambda values: values.group_by(lambda t: t.geneId),
'format_array_values': lambda values, *args: values.group_by(lambda t: t.geneId),
},
}
LIFTOVER_ANNOTATION_FIELDS = {
Expand Down Expand Up @@ -97,9 +98,9 @@ def annotation_fields(self):
lambda gt: hl.is_defined(gt.individualGuid)
).group_by(lambda x: x.individualGuid).map_values(lambda x: x[0].select(
'sampleId', 'individualGuid', 'familyGuid',
numAlt=hl.if_else(hl.is_defined(x[0].GT), x[0].GT.n_alt_alleles(), -1),
numAlt=hl.if_else(hl.is_defined(x[0].GT), x[0].GT.n_alt_alleles(), self.MISSING_NUM_ALT),
**{k: x[0][field] for k, field in self.GENOTYPE_FIELDS.items()},
**{_to_camel_case(k): x[0][field][k] for field, v in self.NESTED_GENOTYPE_FIELDS.items() for k in v},
**{_to_camel_case(k): v(x[0], k, r) for k, v in self.COMPUTED_GENOTYPE_FIELDS.items()},
)),
'populations': lambda r: hl.struct(**{
population: self.population_expression(r, population) for population in self.POPULATIONS.keys()
Expand Down Expand Up @@ -144,7 +145,7 @@ def _get_enum_lookup(self, field, subfield):

def _get_enum_terms_ids(self, field, subfield, terms):
enum = self._get_enum_lookup(field, subfield)
return {enum[t] for t in terms if enum.get(t)}
return {enum[t] for t in terms if enum.get(t) is not None}

def _format_enum_response(self, k, enum):
enum_config = self.ENUM_ANNOTATION_FIELDS.get(k, {})
Expand All @@ -162,7 +163,7 @@ def _format_enum(cls, r, field, enum, empty_array=False, format_array_values=Non
value = hl.or_else(value, hl.empty_array(value.dtype.element_type))
value = value.map(lambda x: cls._enum_field(x, enum, **kwargs))
if format_array_values:
value = format_array_values(value)
value = format_array_values(value, r)
return value

return cls._enum_field(value, enum, **kwargs)
Expand Down Expand Up @@ -365,7 +366,7 @@ def _add_entry_sample_families(cls, ht, sample_data):

ht = ht.transmute(
family_entries=family_sample_indices.map(lambda sample_indices: sample_indices.map(
lambda i: hl.or_else(ht.entries[i], cls._missing_entry(ht.entries[i])).annotate(
lambda i: ht.entries[i].annotate(
sampleId=sample_index_id_map.get(i),
individualGuid=sample_index_individual_map.get(i),
familyGuid=sample_index_family_map.get(i),
Expand All @@ -376,11 +377,6 @@ def _add_entry_sample_families(cls, ht, sample_data):

return ht, sample_id_family_index_map, num_families

@classmethod
def _missing_entry(cls, entry):
entry_type = dict(**entry.dtype)
return hl.struct(**{k: hl.missing(v) for k, v in entry_type.items()})

def _filter_inheritance(self, ht, inheritance_mode, inheritance_filter, sample_data, sample_id_family_index_map):
any_valid_entry = lambda x: self.GENOTYPE_QUERY_MAP[HAS_ALT](x.GT)

Expand Down Expand Up @@ -662,7 +658,7 @@ def _filter_compound_hets(self):
ch_ht = ch_ht.filter(ch_ht.comp_het_family_entries.any(hl.is_defined))

# Get possible pairs of variants within the same gene
ch_ht = ch_ht.annotate(gene_ids=self._gene_ids_expr(ch_ht))
ch_ht = ch_ht.annotate(gene_ids=self._gene_ids_expr(ch_ht, comp_het=True))
ch_ht = ch_ht.explode(ch_ht.gene_ids)
formatted_rows_expr = hl.agg.collect(ch_ht.row)
if HAS_ALLOWED_SECONDARY_ANNOTATION in self._ht.row:
Expand Down Expand Up @@ -697,7 +693,7 @@ def _filter_compound_hets(self):
return ch_ht

@classmethod
def _gene_ids_expr(cls, ht):
def _gene_ids_expr(cls, ht, comp_het=False):
return hl.set(ht[cls.TRANSCRIPTS_FIELD].map(lambda t: t.gene_id))

def _is_valid_comp_het_family(self, entries_1, entries_2):
Expand Down Expand Up @@ -1060,7 +1056,9 @@ class SvHailTableQuery(BaseHailTableQuery):
DATA_TYPE = 'SV_WGS'

GENOTYPE_FIELDS = {_to_camel_case(f): f for f in ['CN', 'GQ']}
NESTED_GENOTYPE_FIELDS = {'concordance': ['new_call', 'prev_call', 'prev_num_alt']}
COMPUTED_GENOTYPE_FIELDS = {
k: lambda entry, field, *args: entry.concordance[field] for k in ['new_call', 'prev_call', 'prev_num_alt']
}
GENOTYPE_QUERY_FIELDS = {'gq_sv': 'GQ', 'gq': None}

TRANSCRIPTS_FIELD = 'sorted_gene_consequences'
Expand All @@ -1069,7 +1067,6 @@ class SvHailTableQuery(BaseHailTableQuery):
BASE_ANNOTATION_FIELDS = {
'bothsidesSupport': lambda r: r.bothsides_support,
'chrom': lambda r: r.start_locus.contig.replace('^chr', ''),
'endChrom': lambda r: hl.or_missing(r.start_locus.contig != r.end_locus.contig, r.end_locus.contig.replace('^chr', '')),
'pos': lambda r: r.start_locus.position,
'end': lambda r: r.end_locus.position,
'rg37LocusEnd': lambda r: hl.or_missing(
Expand Down Expand Up @@ -1144,16 +1141,19 @@ def _get_consequence_filter(self, allowed_consequence_ids, annotation_exprs):
def _get_annotation_override_filters(self, annotations, **kwargs):
annotation_filters = []
if annotations.get(STRUCTURAL_ANNOTATION_FIELD):
allowed_type_ids = self._get_enum_terms_ids('sv_type', None, annotations[STRUCTURAL_ANNOTATION_FIELD])
allowed_type_ids = self.get_allowed_sv_type_ids(annotations[STRUCTURAL_ANNOTATION_FIELD])
if allowed_type_ids:
annotation_filters.append(hl.set(allowed_type_ids).contains(self._ht.sv_type_id))

return annotation_filters

def get_allowed_sv_type_ids(self, sv_types):
return self._get_enum_terms_ids('sv_type', None, sv_types)

def _additional_annotation_fields(self):
sv_type_enum = self._enums['sv_type']
insertion_type_id = sv_type_enum.index('INS')
get_end_chrom = self.BASE_ANNOTATION_FIELDS['endChrom']
get_end_chrom = lambda r: hl.or_missing(r.start_locus.contig != r.end_locus.contig, r.end_locus.contig.replace('^chr', ''))
return {
'cpxIntervals': lambda r: self._format_enum(
r, 'cpx_intervals', {'type': sv_type_enum}, annotate_value=lambda val, *args: {
Expand All @@ -1171,4 +1171,91 @@ def _additional_annotation_fields(self):
}


QUERY_CLASS_MAP = {cls.DATA_TYPE: cls for cls in [VariantHailTableQuery, SvHailTableQuery]}
class GcnvHailTableQuery(SvHailTableQuery):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the file is getting bigger, how about splitting them into per-class files?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, Ben suggested the same thing on the other PR. I agree, but to keep the diff sane I will do a follow up PR to move everything into its own file


DATA_TYPE = 'SV_WES'

# gCNV data has no ref/ref calls so a missing entry indicates ref/ref
GENOTYPE_QUERY_MAP = {
**BaseHailTableQuery.GENOTYPE_QUERY_MAP,
REF_REF: hl.is_missing,
HAS_REF: lambda gt: hl.is_missing(gt) | gt.is_het_ref(),
HAS_ALT: hl.is_defined,
COMP_HET_ALT: hl.is_defined,
}
MISSING_NUM_ALT = 0

GENOTYPE_FIELDS = {
**SvHailTableQuery.GENOTYPE_FIELDS,
**{f.lower(): f for f in ['QS', 'defragged']},
}
del GENOTYPE_FIELDS['gq']
GENOTYPE_QUERY_FIELDS = {}
GENOTYPE_OVERRIDE_FIELDS = {
'start': (hl.min, lambda r: r.start_locus.position),
'end': (hl.max, lambda r: r.end_locus.position),
'num_exon': (hl.max, lambda r: r.num_exon),
'gene_ids': (
lambda entry_gene_ids: entry_gene_ids.fold(lambda s1, s2: s1.union(s2), hl.empty_set(hl.tstr)),
lambda r: hl.missing(hl.tset(hl.tstr)),
),
}
COMPUTED_GENOTYPE_FIELDS = {
**SvHailTableQuery.COMPUTED_GENOTYPE_FIELDS,
**{k: lambda entry, field, r: hl.or_missing(
hl.is_missing(r[field]) | (r[field] != entry[f'sample_{field}']), entry[f'sample_{field}']
) for k in GENOTYPE_OVERRIDE_FIELDS.keys()},
}
COMPUTED_GENOTYPE_FIELDS['prev_overlap'] = COMPUTED_GENOTYPE_FIELDS.pop('prev_num_alt')

CORE_FIELDS = BaseHailTableQuery.CORE_FIELDS
BASE_ANNOTATION_FIELDS = {
**SvHailTableQuery.BASE_ANNOTATION_FIELDS,
'pos': lambda r: r.start,
'end': lambda r: r.end,
'numExon': lambda r: r.num_exon,
}
del BASE_ANNOTATION_FIELDS['bothsidesSupport']

TRANSCRIPTS_ENUM_FIELD = SvHailTableQuery.ENUM_ANNOTATION_FIELDS[SvHailTableQuery.TRANSCRIPTS_FIELD]
ENUM_ANNOTATION_FIELDS = {SvHailTableQuery.TRANSCRIPTS_FIELD: {
**TRANSCRIPTS_ENUM_FIELD,
'format_array_values': lambda values, r: GcnvHailTableQuery.TRANSCRIPTS_ENUM_FIELD['format_array_values'](
hl.if_else(hl.is_missing(r.gene_ids), values, values.filter(lambda t: r.gene_ids.contains(t.geneId))), r,
),
}}

POPULATIONS = {k: v for k, v in SvHailTableQuery.POPULATIONS.items() if k != 'gnomad_svs'}

@classmethod
def _get_genotype_override_field(cls, r, field, family_entries_field=None):
agg, get_default = cls.GENOTYPE_OVERRIDE_FIELDS[field]
sample_field = f'sample_{field}'
entries = r[family_entries_field or 'family_entries'].flatmap(lambda x: x)
return hl.if_else(
entries.any(lambda g: hl.is_defined(g.GT) & hl.is_missing(g[sample_field])),
get_default(r), agg(entries.map(lambda g: g[sample_field]))
)

def _format_results(self, ht, annotation_fields):
ht = ht.annotate(**{k: self._get_genotype_override_field(ht, k) for k in self.GENOTYPE_OVERRIDE_FIELDS})
return super()._format_results(ht, annotation_fields)

def get_allowed_sv_type_ids(self, sv_types):
return super().get_allowed_sv_type_ids([
type.replace('gCNV_', '') for type in sv_types if type.startswith('gCNV_')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a case for doing this replacement in the pipeline? So that instead of passing the enum ids for gCNV_DEL and gCNV_DUP we instead pass the ids for DEL and DUP? Or are gCNV_DEL and gCNV_DUP used elsewhere?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tbh I had thought we already did, because gCNV_DUP and gCNV_DEL are in the enum, but then after I saw that they weren't I though this approach was easier. We need to return them to the client without the prefix so it would be doing an additional annotation to format them and remove the prefix after the query, and doing a one time string replace in python before doing the query seemed more efficient than having to modify every row in the table

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I agree the string replace in python is better. I guess, if we'd like to remove gCNV_DUP and gCNV_DEL from the enum it's no problem on my end.

])

@classmethod
def _gene_ids_expr(cls, ht, comp_het=False):
family_entries_field = 'comp_het_family_entries' if comp_het else None
return hl.or_else(
cls._get_genotype_override_field(ht, 'gene_ids', family_entries_field=family_entries_field),
super()._gene_ids_expr(ht),
)

def _additional_annotation_fields(self):
return {}


QUERY_CLASS_MAP = {cls.DATA_TYPE: cls for cls in [VariantHailTableQuery, SvHailTableQuery, GcnvHailTableQuery]}
Loading
Loading