-
Notifications
You must be signed in to change notification settings - Fork 88
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Hail backend - SV WES #3575
Hail backend - SV WES #3575
Changes from all commits
9f84c5c
521f591
734c4c1
ef924c9
5a7d7b5
784f386
7866de2
42a7044
c6c3a47
881df7d
919744d
c91f63c
2e4f372
bea8c30
fe4f735
7f5863a
420c5e8
4ce6fe5
98bfe91
9b8eddd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
This folder comprises a Hail (www.hail.is) native Table or MatrixTable. | ||
Written with version 0.2.109-b71b065e4bb6 | ||
Created at 2023/08/24 17:42:53 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
This folder comprises a Hail (www.hail.is) native Table or MatrixTable. | ||
Written with version 0.2.109-b71b065e4bb6 | ||
Created at 2023/08/25 10:47:38 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
This folder comprises a Hail (www.hail.is) native Table or MatrixTable. | ||
Written with version 0.2.109-b71b065e4bb6 | ||
Created at 2023/08/25 11:56:33 |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,9 +40,10 @@ class BaseHailTableQuery(object): | |
HAS_ALT: lambda gt: gt.is_non_ref(), | ||
HAS_REF: lambda gt: gt.is_hom_ref() | gt.is_het_ref(), | ||
} | ||
MISSING_NUM_ALT = -1 | ||
|
||
GENOTYPE_FIELDS = {} | ||
NESTED_GENOTYPE_FIELDS = {} | ||
COMPUTED_GENOTYPE_FIELDS = {} | ||
GENOTYPE_QUERY_FIELDS = {} | ||
QUALITY_FILTER_FORMAT = {} | ||
POPULATIONS = {} | ||
|
@@ -63,7 +64,7 @@ class BaseHailTableQuery(object): | |
'response_key': 'transcripts', | ||
'empty_array': True, | ||
'format_value': lambda value: value.rename({k: _to_camel_case(k) for k in value.keys()}), | ||
'format_array_values': lambda values: values.group_by(lambda t: t.geneId), | ||
'format_array_values': lambda values, *args: values.group_by(lambda t: t.geneId), | ||
}, | ||
} | ||
LIFTOVER_ANNOTATION_FIELDS = { | ||
|
@@ -97,9 +98,9 @@ def annotation_fields(self): | |
lambda gt: hl.is_defined(gt.individualGuid) | ||
).group_by(lambda x: x.individualGuid).map_values(lambda x: x[0].select( | ||
'sampleId', 'individualGuid', 'familyGuid', | ||
numAlt=hl.if_else(hl.is_defined(x[0].GT), x[0].GT.n_alt_alleles(), -1), | ||
numAlt=hl.if_else(hl.is_defined(x[0].GT), x[0].GT.n_alt_alleles(), self.MISSING_NUM_ALT), | ||
**{k: x[0][field] for k, field in self.GENOTYPE_FIELDS.items()}, | ||
**{_to_camel_case(k): x[0][field][k] for field, v in self.NESTED_GENOTYPE_FIELDS.items() for k in v}, | ||
**{_to_camel_case(k): v(x[0], k, r) for k, v in self.COMPUTED_GENOTYPE_FIELDS.items()}, | ||
)), | ||
'populations': lambda r: hl.struct(**{ | ||
population: self.population_expression(r, population) for population in self.POPULATIONS.keys() | ||
|
@@ -144,7 +145,7 @@ def _get_enum_lookup(self, field, subfield): | |
|
||
def _get_enum_terms_ids(self, field, subfield, terms): | ||
enum = self._get_enum_lookup(field, subfield) | ||
return {enum[t] for t in terms if enum.get(t)} | ||
return {enum[t] for t in terms if enum.get(t) is not None} | ||
|
||
def _format_enum_response(self, k, enum): | ||
enum_config = self.ENUM_ANNOTATION_FIELDS.get(k, {}) | ||
|
@@ -162,7 +163,7 @@ def _format_enum(cls, r, field, enum, empty_array=False, format_array_values=Non | |
value = hl.or_else(value, hl.empty_array(value.dtype.element_type)) | ||
value = value.map(lambda x: cls._enum_field(x, enum, **kwargs)) | ||
if format_array_values: | ||
value = format_array_values(value) | ||
value = format_array_values(value, r) | ||
return value | ||
|
||
return cls._enum_field(value, enum, **kwargs) | ||
|
@@ -365,7 +366,7 @@ def _add_entry_sample_families(cls, ht, sample_data): | |
|
||
ht = ht.transmute( | ||
family_entries=family_sample_indices.map(lambda sample_indices: sample_indices.map( | ||
lambda i: hl.or_else(ht.entries[i], cls._missing_entry(ht.entries[i])).annotate( | ||
lambda i: ht.entries[i].annotate( | ||
sampleId=sample_index_id_map.get(i), | ||
individualGuid=sample_index_individual_map.get(i), | ||
familyGuid=sample_index_family_map.get(i), | ||
|
@@ -376,11 +377,6 @@ def _add_entry_sample_families(cls, ht, sample_data): | |
|
||
return ht, sample_id_family_index_map, num_families | ||
|
||
@classmethod | ||
def _missing_entry(cls, entry): | ||
entry_type = dict(**entry.dtype) | ||
return hl.struct(**{k: hl.missing(v) for k, v in entry_type.items()}) | ||
|
||
def _filter_inheritance(self, ht, inheritance_mode, inheritance_filter, sample_data, sample_id_family_index_map): | ||
any_valid_entry = lambda x: self.GENOTYPE_QUERY_MAP[HAS_ALT](x.GT) | ||
|
||
|
@@ -662,7 +658,7 @@ def _filter_compound_hets(self): | |
ch_ht = ch_ht.filter(ch_ht.comp_het_family_entries.any(hl.is_defined)) | ||
|
||
# Get possible pairs of variants within the same gene | ||
ch_ht = ch_ht.annotate(gene_ids=self._gene_ids_expr(ch_ht)) | ||
ch_ht = ch_ht.annotate(gene_ids=self._gene_ids_expr(ch_ht, comp_het=True)) | ||
ch_ht = ch_ht.explode(ch_ht.gene_ids) | ||
formatted_rows_expr = hl.agg.collect(ch_ht.row) | ||
if HAS_ALLOWED_SECONDARY_ANNOTATION in self._ht.row: | ||
|
@@ -697,7 +693,7 @@ def _filter_compound_hets(self): | |
return ch_ht | ||
|
||
@classmethod | ||
def _gene_ids_expr(cls, ht): | ||
def _gene_ids_expr(cls, ht, comp_het=False): | ||
return hl.set(ht[cls.TRANSCRIPTS_FIELD].map(lambda t: t.gene_id)) | ||
|
||
def _is_valid_comp_het_family(self, entries_1, entries_2): | ||
|
@@ -1060,7 +1056,9 @@ class SvHailTableQuery(BaseHailTableQuery): | |
DATA_TYPE = 'SV_WGS' | ||
|
||
GENOTYPE_FIELDS = {_to_camel_case(f): f for f in ['CN', 'GQ']} | ||
NESTED_GENOTYPE_FIELDS = {'concordance': ['new_call', 'prev_call', 'prev_num_alt']} | ||
COMPUTED_GENOTYPE_FIELDS = { | ||
k: lambda entry, field, *args: entry.concordance[field] for k in ['new_call', 'prev_call', 'prev_num_alt'] | ||
} | ||
GENOTYPE_QUERY_FIELDS = {'gq_sv': 'GQ', 'gq': None} | ||
|
||
TRANSCRIPTS_FIELD = 'sorted_gene_consequences' | ||
|
@@ -1069,7 +1067,6 @@ class SvHailTableQuery(BaseHailTableQuery): | |
BASE_ANNOTATION_FIELDS = { | ||
'bothsidesSupport': lambda r: r.bothsides_support, | ||
'chrom': lambda r: r.start_locus.contig.replace('^chr', ''), | ||
'endChrom': lambda r: hl.or_missing(r.start_locus.contig != r.end_locus.contig, r.end_locus.contig.replace('^chr', '')), | ||
'pos': lambda r: r.start_locus.position, | ||
'end': lambda r: r.end_locus.position, | ||
'rg37LocusEnd': lambda r: hl.or_missing( | ||
|
@@ -1144,16 +1141,19 @@ def _get_consequence_filter(self, allowed_consequence_ids, annotation_exprs): | |
def _get_annotation_override_filters(self, annotations, **kwargs): | ||
annotation_filters = [] | ||
if annotations.get(STRUCTURAL_ANNOTATION_FIELD): | ||
allowed_type_ids = self._get_enum_terms_ids('sv_type', None, annotations[STRUCTURAL_ANNOTATION_FIELD]) | ||
allowed_type_ids = self.get_allowed_sv_type_ids(annotations[STRUCTURAL_ANNOTATION_FIELD]) | ||
if allowed_type_ids: | ||
annotation_filters.append(hl.set(allowed_type_ids).contains(self._ht.sv_type_id)) | ||
|
||
return annotation_filters | ||
|
||
def get_allowed_sv_type_ids(self, sv_types): | ||
return self._get_enum_terms_ids('sv_type', None, sv_types) | ||
|
||
def _additional_annotation_fields(self): | ||
sv_type_enum = self._enums['sv_type'] | ||
insertion_type_id = sv_type_enum.index('INS') | ||
get_end_chrom = self.BASE_ANNOTATION_FIELDS['endChrom'] | ||
get_end_chrom = lambda r: hl.or_missing(r.start_locus.contig != r.end_locus.contig, r.end_locus.contig.replace('^chr', '')) | ||
return { | ||
'cpxIntervals': lambda r: self._format_enum( | ||
r, 'cpx_intervals', {'type': sv_type_enum}, annotate_value=lambda val, *args: { | ||
|
@@ -1171,4 +1171,91 @@ def _additional_annotation_fields(self): | |
} | ||
|
||
|
||
QUERY_CLASS_MAP = {cls.DATA_TYPE: cls for cls in [VariantHailTableQuery, SvHailTableQuery]} | ||
class GcnvHailTableQuery(SvHailTableQuery): | ||
|
||
DATA_TYPE = 'SV_WES' | ||
|
||
# gCNV data has no ref/ref calls so a missing entry indicates ref/ref | ||
GENOTYPE_QUERY_MAP = { | ||
**BaseHailTableQuery.GENOTYPE_QUERY_MAP, | ||
REF_REF: hl.is_missing, | ||
HAS_REF: lambda gt: hl.is_missing(gt) | gt.is_het_ref(), | ||
HAS_ALT: hl.is_defined, | ||
COMP_HET_ALT: hl.is_defined, | ||
} | ||
MISSING_NUM_ALT = 0 | ||
|
||
GENOTYPE_FIELDS = { | ||
**SvHailTableQuery.GENOTYPE_FIELDS, | ||
**{f.lower(): f for f in ['QS', 'defragged']}, | ||
} | ||
del GENOTYPE_FIELDS['gq'] | ||
GENOTYPE_QUERY_FIELDS = {} | ||
GENOTYPE_OVERRIDE_FIELDS = { | ||
'start': (hl.min, lambda r: r.start_locus.position), | ||
'end': (hl.max, lambda r: r.end_locus.position), | ||
'num_exon': (hl.max, lambda r: r.num_exon), | ||
'gene_ids': ( | ||
lambda entry_gene_ids: entry_gene_ids.fold(lambda s1, s2: s1.union(s2), hl.empty_set(hl.tstr)), | ||
lambda r: hl.missing(hl.tset(hl.tstr)), | ||
), | ||
} | ||
COMPUTED_GENOTYPE_FIELDS = { | ||
**SvHailTableQuery.COMPUTED_GENOTYPE_FIELDS, | ||
**{k: lambda entry, field, r: hl.or_missing( | ||
hl.is_missing(r[field]) | (r[field] != entry[f'sample_{field}']), entry[f'sample_{field}'] | ||
) for k in GENOTYPE_OVERRIDE_FIELDS.keys()}, | ||
} | ||
COMPUTED_GENOTYPE_FIELDS['prev_overlap'] = COMPUTED_GENOTYPE_FIELDS.pop('prev_num_alt') | ||
|
||
CORE_FIELDS = BaseHailTableQuery.CORE_FIELDS | ||
BASE_ANNOTATION_FIELDS = { | ||
**SvHailTableQuery.BASE_ANNOTATION_FIELDS, | ||
'pos': lambda r: r.start, | ||
'end': lambda r: r.end, | ||
'numExon': lambda r: r.num_exon, | ||
} | ||
del BASE_ANNOTATION_FIELDS['bothsidesSupport'] | ||
|
||
TRANSCRIPTS_ENUM_FIELD = SvHailTableQuery.ENUM_ANNOTATION_FIELDS[SvHailTableQuery.TRANSCRIPTS_FIELD] | ||
ENUM_ANNOTATION_FIELDS = {SvHailTableQuery.TRANSCRIPTS_FIELD: { | ||
**TRANSCRIPTS_ENUM_FIELD, | ||
'format_array_values': lambda values, r: GcnvHailTableQuery.TRANSCRIPTS_ENUM_FIELD['format_array_values']( | ||
hl.if_else(hl.is_missing(r.gene_ids), values, values.filter(lambda t: r.gene_ids.contains(t.geneId))), r, | ||
), | ||
}} | ||
|
||
POPULATIONS = {k: v for k, v in SvHailTableQuery.POPULATIONS.items() if k != 'gnomad_svs'} | ||
|
||
@classmethod | ||
def _get_genotype_override_field(cls, r, field, family_entries_field=None): | ||
agg, get_default = cls.GENOTYPE_OVERRIDE_FIELDS[field] | ||
sample_field = f'sample_{field}' | ||
entries = r[family_entries_field or 'family_entries'].flatmap(lambda x: x) | ||
return hl.if_else( | ||
entries.any(lambda g: hl.is_defined(g.GT) & hl.is_missing(g[sample_field])), | ||
get_default(r), agg(entries.map(lambda g: g[sample_field])) | ||
) | ||
|
||
def _format_results(self, ht, annotation_fields): | ||
ht = ht.annotate(**{k: self._get_genotype_override_field(ht, k) for k in self.GENOTYPE_OVERRIDE_FIELDS}) | ||
return super()._format_results(ht, annotation_fields) | ||
|
||
def get_allowed_sv_type_ids(self, sv_types): | ||
return super().get_allowed_sv_type_ids([ | ||
type.replace('gCNV_', '') for type in sv_types if type.startswith('gCNV_') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a case for doing this replacement in the pipeline? So that instead of passing the enum ids for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. tbh I had thought we already did, because There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I agree the string replace in python is better. I guess, if we'd like to remove |
||
]) | ||
|
||
@classmethod | ||
def _gene_ids_expr(cls, ht, comp_het=False): | ||
family_entries_field = 'comp_het_family_entries' if comp_het else None | ||
return hl.or_else( | ||
cls._get_genotype_override_field(ht, 'gene_ids', family_entries_field=family_entries_field), | ||
super()._gene_ids_expr(ht), | ||
) | ||
|
||
def _additional_annotation_fields(self): | ||
return {} | ||
|
||
|
||
QUERY_CLASS_MAP = {cls.DATA_TYPE: cls for cls in [VariantHailTableQuery, SvHailTableQuery, GcnvHailTableQuery]} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since the file is getting bigger, how about splitting them into per-class files?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yeah, Ben suggested the same thing on the other PR. I agree, but to keep the diff sane I will do a follow up PR to move everything into its own file