Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hail backend - SV WES #3575

Merged
merged 20 commits into from
Sep 8, 2023
Merged
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 82 additions & 11 deletions hail_search/hail_search_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class BaseHailTableQuery(object):
HAS_ALT: lambda gt: gt.is_non_ref(),
HAS_REF: lambda gt: gt.is_hom_ref() | gt.is_het_ref(),
}
MISSING_NUM_ALT = -1

GENOTYPE_FIELDS = {}
NESTED_GENOTYPE_FIELDS = {}
Expand All @@ -63,7 +64,7 @@ class BaseHailTableQuery(object):
'response_key': 'transcripts',
'empty_array': True,
'format_value': lambda value: value.rename({k: _to_camel_case(k) for k in value.keys()}),
'format_values': lambda values: values.group_by(lambda t: t.geneId),
'format_values': lambda values, *args: values.group_by(lambda t: t.geneId),
},
}
LIFTOVER_ANNOTATION_FIELDS = {
Expand Down Expand Up @@ -97,7 +98,7 @@ def annotation_fields(self):
lambda gt: hl.is_defined(gt.individualGuid)
).group_by(lambda x: x.individualGuid).map_values(lambda x: x[0].select(
'sampleId', 'individualGuid', 'familyGuid',
numAlt=hl.if_else(hl.is_defined(x[0].GT), x[0].GT.n_alt_alleles(), -1),
numAlt=hl.if_else(hl.is_defined(x[0].GT), x[0].GT.n_alt_alleles(), self.MISSING_NUM_ALT),
**{k: x[0][field] for k, field in self.GENOTYPE_FIELDS.items()},
**{_to_camel_case(k): x[0][field][k] for field, v in self.NESTED_GENOTYPE_FIELDS.items() for k in v},
)),
Expand Down Expand Up @@ -144,7 +145,7 @@ def _get_enum_lookup(self, field, subfield):

def _get_enum_terms_ids(self, field, subfield, terms):
enum = self._get_enum_lookup(field, subfield)
return {enum[t] for t in terms if enum.get(t)}
return {enum[t] for t in terms if enum.get(t) is not None}

def _format_enum_response(self, k, enum):
enum_config = self.ENUM_ANNOTATION_FIELDS.get(k, {})
Expand All @@ -162,7 +163,7 @@ def _format_enum(cls, r, field, enum, empty_array=False, format_values=None, **k
value = hl.or_else(value, hl.empty_array(value.dtype.element_type))
value = value.map(lambda x: cls._enum_field(x, enum, **kwargs))
if format_values:
value = format_values(value)
value = format_values(value, r)
return value

return cls._enum_field(value, enum, **kwargs)
Expand Down Expand Up @@ -365,7 +366,7 @@ def _add_entry_sample_families(cls, ht, sample_data):

ht = ht.transmute(
family_entries=family_sample_indices.map(lambda sample_indices: sample_indices.map(
lambda i: hl.or_else(ht.entries[i], cls._missing_entry(ht.entries[i])).annotate(
lambda i: ht.entries[i].annotate(
sampleId=sample_index_id_map.get(i),
individualGuid=sample_index_individual_map.get(i),
familyGuid=sample_index_family_map.get(i),
Expand All @@ -376,11 +377,6 @@ def _add_entry_sample_families(cls, ht, sample_data):

return ht, sample_id_family_index_map, num_families

@classmethod
def _missing_entry(cls, entry):
entry_type = dict(**entry.dtype)
return hl.struct(**{k: hl.missing(v) for k, v in entry_type.items()})

def _filter_inheritance(self, ht, inheritance_mode, inheritance_filter, sample_data, sample_id_family_index_map):
any_valid_entry = lambda x: self.GENOTYPE_QUERY_MAP[HAS_ALT](x.GT)

Expand Down Expand Up @@ -1158,4 +1154,79 @@ def _additional_annotation_fields(self):
}


QUERY_CLASS_MAP = {cls.DATA_TYPE: cls for cls in [VariantHailTableQuery, SvHailTableQuery]}
class GcnvHailTableQuery(SvHailTableQuery):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the file is getting bigger, how about splitting them into per-class files?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, Ben suggested the same thing on the other PR. I agree, but to keep the diff sane I will do a follow up PR to move everything into its own file


DATA_TYPE = 'SV_WES'

# gCNV data has no ref/ref calls so a missing entry indicates ref/ref
GENOTYPE_QUERY_MAP = {
**BaseHailTableQuery.GENOTYPE_QUERY_MAP,
REF_REF: hl.is_missing,
HAS_REF: lambda gt: hl.is_missing(gt) | gt.is_het_ref(),
HAS_ALT: hl.is_defined,
COMP_HET_ALT: hl.is_defined,
}
MISSING_NUM_ALT = 0

GENOTYPE_FIELDS = {
**SvHailTableQuery.GENOTYPE_FIELDS,
**{f.lower(): f for f in ['QS', 'defragged']},
**{_to_camel_case(f): f'sample_{f}' for f in ['start', 'end', 'num_exon', 'gene_ids']},
}
del GENOTYPE_FIELDS['gq']
GENOTYPE_QUERY_FIELDS = {}
NESTED_GENOTYPE_FIELDS = {
'concordance': SvHailTableQuery.NESTED_GENOTYPE_FIELDS['concordance'][:-1] + ['prev_overlap']
}

CORE_FIELDS = BaseHailTableQuery.CORE_FIELDS
BASE_ANNOTATION_FIELDS = {
**SvHailTableQuery.BASE_ANNOTATION_FIELDS,
'pos': lambda r: GcnvHailTableQuery._get_genotype_override_field(
r, 'start', hl.min, default=r.start_locus.position),
'end': lambda r: GcnvHailTableQuery._get_genotype_override_field(
r, 'end', hl.max, default=r.end_locus.position),
'numExon': lambda r: GcnvHailTableQuery._get_genotype_override_field(r, 'num_exon', hl.max),
}
del BASE_ANNOTATION_FIELDS['bothsidesSupport']

TRANSCRIPTS_ENUM_FIELD = SvHailTableQuery.ENUM_ANNOTATION_FIELDS[SvHailTableQuery.TRANSCRIPTS_FIELD]
ENUM_ANNOTATION_FIELDS = {SvHailTableQuery.TRANSCRIPTS_FIELD: {
**TRANSCRIPTS_ENUM_FIELD,
'format_values': lambda values, r: GcnvHailTableQuery.TRANSCRIPTS_ENUM_FIELD['format_values'](
GcnvHailTableQuery._get_gene_id_transcripts_override(values, r), r
),
}}

POPULATIONS = {k: v for k, v in SvHailTableQuery.POPULATIONS.items() if k != 'gnomad_svs'}

@staticmethod
def _get_genotype_override_field(r, field, agg, default=None):
sample_field = f'sample_{field}'
entries = r.family_entries.flatmap(lambda x: x)
if default is None:
default = r[field]
return hl.if_else(
entries.any(lambda g: hl.is_defined(g.GT) & hl.is_missing(g[sample_field])),
default, agg(entries.map(lambda g: g[sample_field]))
)

@classmethod
def _get_gene_id_transcripts_override(cls, transcripts, r):
empty_gene_set = hl.empty_set(hl.tstr)
geneotype_gene_ids_expr = cls._get_genotype_override_field(
r, 'gene_ids',
lambda entry_gene_ids: entry_gene_ids.fold(lambda s1, s2: s1.union(s2), empty_gene_set),
default=hl.missing(empty_gene_set.dtype))
return hl.bind(
lambda gene_ids: hl.if_else(
hl.is_missing(gene_ids), transcripts,
transcripts.filter(lambda t: gene_ids.contains(t.geneId)),
), geneotype_gene_ids_expr,
)

def _additional_annotation_fields(self):
return {}


QUERY_CLASS_MAP = {cls.DATA_TYPE: cls for cls in [VariantHailTableQuery, SvHailTableQuery, GcnvHailTableQuery]}
Loading