Skip to content

Commit

Permalink
Merge pull request #3597 from broadinstitute/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
hanars authored Sep 11, 2023
2 parents cd73180 + c6832b3 commit efc3193
Show file tree
Hide file tree
Showing 68 changed files with 991 additions and 454 deletions.
2 changes: 2 additions & 0 deletions hail_search/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
# NOTE(review): "PATHOGENICTY" is misspelled ("PATHOGENICITY"), but the names
# are imported elsewhere in the package, so they are preserved for compatibility.
PATHOGENICTY_SORT_KEY = 'pathogenicity'
PATHOGENICTY_HGMD_SORT_KEY = 'pathogenicity_hgmd'
# Sort offset used when a pathogenicity annotation is absent — presumably pushes
# un-annotated rows after annotated ones; confirm against the sort implementation
ABSENT_PATH_SORT_OFFSET = 12.5
# Keys identifying the named sort modes shared across query classes
CONSEQUENCE_SORT = 'protein_consequence'
OMIM_SORT = 'in_omim'

# Inheritance-mode genotype keys
ALT_ALT = 'alt_alt'
REF_REF = 'ref_ref'
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
3 changes: 3 additions & 0 deletions hail_search/fixtures/GRCh38/SV_WES/annotations.ht/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.109-b71b065e4bb6
Created at 2023/08/24 17:42:53
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.109-b71b065e4bb6
Created at 2023/08/25 10:47:38
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.109-b71b065e4bb6
Created at 2023/08/25 11:56:33
Empty file.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Empty file added hail_search/queries/__init__.py
Empty file.
436 changes: 35 additions & 401 deletions hail_search/hail_search_query.py → hail_search/queries/base.py

Large diffs are not rendered by default.

91 changes: 91 additions & 0 deletions hail_search/queries/gcnv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import hail as hl

from hail_search.constants import COMP_HET_ALT, HAS_ALT, HAS_REF, REF_REF
from hail_search.queries.sv import SvHailTableQuery


class GcnvHailTableQuery(SvHailTableQuery):
    """Hail table query for the gCNV (exome CNV) callset.

    Subclasses SvHailTableQuery, overriding genotype handling for a callset in
    which ref/ref calls are not stored: a missing entry means ref/ref. Also
    supports per-sample overrides of row-level fields (start/end/num_exon/
    gene_ids) via ``GENOTYPE_OVERRIDE_FIELDS``.
    """

    DATA_TYPE = 'SV_WES'

    # gCNV data has no ref/ref calls so a missing entry indicates ref/ref
    GENOTYPE_QUERY_MAP = {
        **SvHailTableQuery.GENOTYPE_QUERY_MAP,
        REF_REF: hl.is_missing,
        HAS_REF: lambda gt: hl.is_missing(gt) | gt.is_het_ref(),
        HAS_ALT: hl.is_defined,
        COMP_HET_ALT: hl.is_defined,
    }
    # num_alt value reported for entries absent from the callset (i.e. ref/ref)
    MISSING_NUM_ALT = 0

    # gCNV entries carry QS/defragged; the WGS GQ field does not apply
    GENOTYPE_FIELDS = {
        **SvHailTableQuery.GENOTYPE_FIELDS,
        **{f.lower(): f for f in ['QS', 'defragged']},
    }
    del GENOTYPE_FIELDS['gq']
    GENOTYPE_QUERY_FIELDS = {}
    # field -> (aggregator over per-sample values, row-level default expression);
    # used to recompute row fields from sample-level overrides
    GENOTYPE_OVERRIDE_FIELDS = {
        'start': (hl.min, lambda r: r.start_locus.position),
        'end': (hl.max, lambda r: r.end_locus.position),
        'num_exon': (hl.max, lambda r: r.num_exon),
        'gene_ids': (
            # union of all per-sample gene-id sets; no row-level default
            lambda entry_gene_ids: entry_gene_ids.fold(lambda s1, s2: s1.union(s2), hl.empty_set(hl.tstr)),
            lambda r: hl.missing(hl.tset(hl.tstr)),
        ),
    }
    # Expose the per-sample value only when it differs from (or the row lacks)
    # the row-level value; otherwise the genotype field is missing
    COMPUTED_GENOTYPE_FIELDS = {
        **SvHailTableQuery.COMPUTED_GENOTYPE_FIELDS,
        **{k: lambda entry, field, r: hl.or_missing(
            hl.is_missing(r[field]) | (r[field] != entry[f'sample_{field}']), entry[f'sample_{field}']
        ) for k in GENOTYPE_OVERRIDE_FIELDS.keys()},
    }
    # gCNV concordance uses prev_overlap in place of the WGS prev_num_alt field
    COMPUTED_GENOTYPE_FIELDS['prev_overlap'] = COMPUTED_GENOTYPE_FIELDS.pop('prev_num_alt')

    # Drop the last core field ('algorithms', appended by SvHailTableQuery)
    CORE_FIELDS = SvHailTableQuery.CORE_FIELDS[:-1]
    BASE_ANNOTATION_FIELDS = {
        **SvHailTableQuery.BASE_ANNOTATION_FIELDS,
        # pos/end/numExon read the override-annotated row fields added in
        # _format_results, not the raw locus fields used by the WGS parent
        'pos': lambda r: r.start,
        'end': lambda r: r.end,
        'numExon': lambda r: r.num_exon,
    }
    del BASE_ANNOTATION_FIELDS['bothsidesSupport']

    TRANSCRIPTS_ENUM_FIELD = SvHailTableQuery.ENUM_ANNOTATION_FIELDS[SvHailTableQuery.TRANSCRIPTS_FIELD]
    # Restrict formatted transcripts to the sample-level gene_ids override, when present
    ENUM_ANNOTATION_FIELDS = {SvHailTableQuery.TRANSCRIPTS_FIELD: {
        **TRANSCRIPTS_ENUM_FIELD,
        'format_array_values': lambda values, r: GcnvHailTableQuery.TRANSCRIPTS_ENUM_FIELD['format_array_values'](
            hl.if_else(hl.is_missing(r.gene_ids), values, values.filter(lambda t: r.gene_ids.contains(t.geneId))), r,
        ),
    }}

    # gnomad SV population frequencies are not available for the exome callset
    POPULATIONS = {k: v for k, v in SvHailTableQuery.POPULATIONS.items() if k != 'gnomad_svs'}

    @classmethod
    def _get_genotype_override_field(cls, r, field, family_entries_field=None):
        """Aggregate per-sample ``sample_<field>`` values across family entries.

        Falls back to the row-level default when any called sample (defined GT)
        has no sample-level value, i.e. the row-level value applies to it.
        """
        agg, get_default = cls.GENOTYPE_OVERRIDE_FIELDS[field]
        sample_field = f'sample_{field}'
        entries = r[family_entries_field or 'family_entries'].flatmap(lambda x: x)
        return hl.if_else(
            entries.any(lambda g: hl.is_defined(g.GT) & hl.is_missing(g[sample_field])),
            get_default(r), agg(entries.map(lambda g: g[sample_field]))
        )

    def _format_results(self, ht, annotation_fields):
        # Annotate row-level start/end/num_exon/gene_ids with their
        # sample-aware override values before standard formatting
        ht = ht.annotate(**{k: self._get_genotype_override_field(ht, k) for k in self.GENOTYPE_OVERRIDE_FIELDS})
        return super()._format_results(ht, annotation_fields)

    def get_allowed_sv_type_ids(self, sv_types):
        # Only 'gCNV_'-prefixed types apply to this data type; strip the prefix
        # before mapping to the shared sv_type enum ids
        return super().get_allowed_sv_type_ids([
            type.replace('gCNV_', '') for type in sv_types if type.startswith('gCNV_')
        ])

    @classmethod
    def _gene_ids_expr(cls, ht, comp_het=False):
        family_entries_field = 'comp_het_family_entries' if comp_het else None
        # NOTE(review): the fallback calls super()._gene_ids_expr(ht) without
        # forwarding comp_het — confirm the base implementation does not need it
        return hl.or_else(
            cls._get_genotype_override_field(ht, 'gene_ids', family_entries_field=family_entries_field),
            super()._gene_ids_expr(ht),
        )

    def _additional_annotation_fields(self):
        # None of the WGS-specific fields (cpxIntervals/endChrom/svSourceDetail) apply
        return {}
62 changes: 62 additions & 0 deletions hail_search/queries/multi_data_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import hail as hl

from hail_search.constants import CONSEQUENCE_SORT, OMIM_SORT
from hail_search.queries.base import BaseHailTableQuery
from hail_search.queries.variants import VariantHailTableQuery
from hail_search.queries.sv import SvHailTableQuery
from hail_search.queries.gcnv import GcnvHailTableQuery

# DATA_TYPE string -> concrete query class, used to dispatch per data type
QUERY_CLASS_MAP = {cls.DATA_TYPE: cls for cls in [VariantHailTableQuery, SvHailTableQuery, GcnvHailTableQuery]}


class MultiDataTypeHailTableQuery(BaseHailTableQuery):
    """Runs a search across multiple data types and merges the results.

    Delegates filtering to one concrete query per data type, then outer-joins
    the formatted per-data-type tables into a single table with a unified
    ``_sort`` field.
    """

    def __init__(self, sample_data, *args, **kwargs):
        # One concrete query per data type, keyed by DATA_TYPE
        self._data_type_queries = {k: QUERY_CLASS_MAP[k](v, *args, **kwargs) for k, v in sample_data.items()}
        super().__init__(sample_data, *args, **kwargs)

    def _load_filtered_table(self, *args, **kwargs):
        # Loading/filtering is performed by the per-data-type queries
        pass

    def format_search_ht(self):
        """Outer-join the per-data-type formatted tables.

        Each row nests the source table's full row under its data type key;
        the ``_sort`` fields are coalesced across tables so the merged table
        sorts uniformly.
        """
        ht = None
        for data_type, query in self._data_type_queries.items():
            dt_ht = query.format_search_ht()
            # Reshape SV sort values to match variant sort values where needed
            merged_sort_expr = self._merged_sort_expr(data_type, dt_ht)
            if merged_sort_expr is not None:
                dt_ht = dt_ht.annotate(_sort=merged_sort_expr)
            dt_ht = dt_ht.select('_sort', **{data_type: dt_ht.row})
            if ht is None:
                ht = dt_ht
            else:
                ht = ht.join(dt_ht, 'outer')
                # hail renames the right table's _sort to _sort_1 on join
                ht = ht.transmute(_sort=hl.or_else(ht._sort, ht._sort_1))
        return ht

    def _merged_sort_expr(self, data_type, ht):
        # Certain sorts have an extra element for variant-type data, so need to add an element for SV data
        if not data_type.startswith('SV'):
            return None

        if self._sort == CONSEQUENCE_SORT:
            # 4.5 presumably slots SVs between variant consequence ranks 4 and 5
            # — confirm against the consequence rank enum
            return hl.array([hl.float64(4.5)]).extend(ht._sort.map(hl.float64))
        elif self._sort == OMIM_SORT:
            return hl.array([hl.int64(0)]).extend(ht._sort)
        elif self._sort_metadata:
            # Duplicate the first sort element to pad to the variant shape
            return ht._sort[:1].extend(ht._sort)

        return None

    def _format_collected_rows(self, collected):
        # Unwrap each joined row to whichever data type's nested row is present
        return super()._format_collected_rows([
            next(row.get(data_type) for data_type in self._data_type_queries if row.get(data_type))
            for row in collected
        ])

    def format_gene_counts_ht(self):
        # Outer-join per-data-type gene count tables, coalescing the shared
        # gene_ids/families fields (hail suffixes the right table's with _1)
        hts = [query.format_gene_counts_ht() for query in self._data_type_queries.values()]
        ht = hts[0]
        for dt_ht in hts[1:]:
            ht = ht.join(dt_ht, 'outer')
            ht = ht.transmute(**{k: hl.or_else(ht[k], ht[f'{k}_1']) for k in ['gene_ids', 'families']})
        return ht
116 changes: 116 additions & 0 deletions hail_search/queries/sv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import hail as hl


from hail_search.constants import CONSEQUENCE_SORT, NEW_SV_FIELD, STRUCTURAL_ANNOTATION_FIELD
from hail_search.queries.base import BaseHailTableQuery, PredictionPath


class SvHailTableQuery(BaseHailTableQuery):
    """Hail table query for genome-based (WGS) structural variant data.

    Extends BaseHailTableQuery with SV-specific genotype/annotation fields,
    interval filtering that accounts for SVs spanning two contigs, SV type
    filtering, and new-call quality filtering.
    """

    DATA_TYPE = 'SV_WGS'

    GENOTYPE_FIELDS = {f.lower(): f for f in ['CN', 'GQ']}
    # Call-concordance fields are nested under entry.concordance in the table
    COMPUTED_GENOTYPE_FIELDS = {
        k: lambda entry, field, *args: entry.concordance[field] for k in ['new_call', 'prev_call', 'prev_num_alt']
    }
    # 'gq' maps to None so the generic gq quality filter is disabled; gq_sv is used instead
    GENOTYPE_QUERY_FIELDS = {'gq_sv': 'GQ', 'gq': None}

    TRANSCRIPTS_FIELD = 'sorted_gene_consequences'
    TRANSCRIPT_CONSEQUENCE_FIELD = 'major_consequence'
    CORE_FIELDS = BaseHailTableQuery.CORE_FIELDS + ['algorithms']
    BASE_ANNOTATION_FIELDS = {
        'bothsidesSupport': lambda r: r.bothsides_support,
        'chrom': lambda r: r.start_locus.contig.replace('^chr', ''),
        'pos': lambda r: r.start_locus.position,
        'end': lambda r: r.end_locus.position,
        # GRCh37 lift-over of the end locus, when available
        'rg37LocusEnd': lambda r: hl.or_missing(
            hl.is_defined(r.rg37_locus_end), hl.struct(contig=r.rg37_locus_end.contig, position=r.rg37_locus_end.position)
        ),
        **BaseHailTableQuery.BASE_ANNOTATION_FIELDS,
    }
    ENUM_ANNOTATION_FIELDS = {
        TRANSCRIPTS_FIELD: BaseHailTableQuery.ENUM_ANNOTATION_FIELDS['transcripts'],
    }

    POPULATIONS = {
        'sv_callset': {'hemi': None, 'sort': 'callset_af'},
        'gnomad_svs': {'id': 'ID', 'ac': None, 'an': None, 'hom': None, 'hemi': None, 'het': None, 'sort': 'gnomad'},
    }
    POPULATION_FIELDS = {'sv_callset': 'gt_stats'}
    PREDICTION_FIELDS_CONFIG = {
        'strvctvre': PredictionPath('strvctvre', 'score'),
    }

    SORTS = {
        **BaseHailTableQuery.SORTS,
        CONSEQUENCE_SORT: lambda r: [hl.min(r.sorted_gene_consequences.map(lambda g: g.major_consequence_id))],
        # NOTE(review): start - end is the negated SV length, so ascending sort
        # puts the largest SVs first; -50 is the fallback for SVs spanning two
        # contigs — confirm intended ordering
        'size': lambda r: [hl.if_else(
            r.start_locus.contig == r.end_locus.contig, r.start_locus.position - r.end_locus.position, -50,
        )],
    }

    def _parse_intervals(self, intervals, variant_ids, variant_keys=None, **kwargs):
        # SVs are looked up by variant_keys, not variant_ids, so ids are dropped
        parsed_intervals, _ = super()._parse_intervals(intervals, variant_ids=None, **kwargs)
        return parsed_intervals, variant_keys

    def _filter_annotated_table(self, *args, parsed_intervals=None, exclude_intervals=False, **kwargs):
        """Apply interval filtering before the base annotated-table filters.

        An SV matches an interval if it overlaps it; for SVs whose start and
        end are on different contigs, either endpoint being contained counts.
        """
        if parsed_intervals:
            interval_filter = hl.array(parsed_intervals).any(lambda interval: hl.if_else(
                self._ht.start_locus.contig == self._ht.end_locus.contig,
                interval.overlaps(hl.interval(self._ht.start_locus, self._ht.end_locus)),
                interval.contains(self._ht.start_locus) | interval.contains(self._ht.end_locus),
            ))
            if exclude_intervals:
                interval_filter = ~interval_filter
            self._ht = self._ht.filter(interval_filter)

        return super()._filter_annotated_table(*args, **kwargs)

    def _get_family_passes_quality_filter(self, quality_filter, annotations=None, **kwargs):
        # When the new-SV annotation filter is requested, additionally require
        # at least one entry in the family to be a newly called SV
        passes_quality = super()._get_family_passes_quality_filter(quality_filter)
        if not (annotations or {}).get(NEW_SV_FIELD):
            return passes_quality

        entries_has_new_call = lambda entries: entries.any(lambda x: x.concordance.new_call)
        if passes_quality is None:
            return entries_has_new_call

        return lambda entries: entries_has_new_call(entries) & passes_quality(entries)

    def _get_allowed_consequences_annotations(self, annotations, annotation_filters, is_secondary=False):
        if is_secondary:
            # SV search can specify secondary SV types, as well as secondary consequences
            annotation_filters = self._get_annotation_override_filters(annotations)
        return super()._get_allowed_consequences_annotations(annotations, annotation_filters)

    def _get_annotation_override_filters(self, annotations, **kwargs):
        # Filter rows to the requested structural variant types, if any
        annotation_filters = []
        if annotations.get(STRUCTURAL_ANNOTATION_FIELD):
            allowed_type_ids = self.get_allowed_sv_type_ids(annotations[STRUCTURAL_ANNOTATION_FIELD])
            if allowed_type_ids:
                annotation_filters.append(hl.set(allowed_type_ids).contains(self._ht.sv_type_id))

        return annotation_filters

    def get_allowed_sv_type_ids(self, sv_types):
        # Map SV type names to their ids in the table's sv_type enum
        return self._get_enum_terms_ids('sv_type', None, sv_types)

    def _additional_annotation_fields(self):
        """SV-specific response fields: cpxIntervals, endChrom, svSourceDetail."""
        sv_type_enum = self._enums['sv_type']
        insertion_type_id = sv_type_enum.index('INS')
        # End contig, only when it differs from the start contig
        get_end_chrom = lambda r: hl.or_missing(r.start_locus.contig != r.end_locus.contig, r.end_locus.contig.replace('^chr', ''))
        return {
            'cpxIntervals': lambda r: self._format_enum(
                r, 'cpx_intervals', {'type': sv_type_enum}, annotate_value=lambda val, *args: {
                    'chrom': val.start.contig,
                    'start': val.start.position,
                    'end': val.end.position,
                },
            ),
            # For insertions, end_locus represents the svSourceDetail, otherwise represents the endChrom
            'endChrom': lambda r: hl.or_missing(r.sv_type_id != insertion_type_id, get_end_chrom(r)),
            'svSourceDetail': lambda r: hl.or_missing(r.sv_type_id == insertion_type_id, hl.bind(
                lambda end_chrom: hl.or_missing(hl.is_defined(end_chrom), hl.struct(chrom=end_chrom)),
                get_end_chrom(r),
            )),
        }
Loading

0 comments on commit efc3193

Please sign in to comment.