Skip to content

Commit

Permalink
Refactor hgvs_string creation to use mavehgvs
Browse files Browse the repository at this point in the history
  • Loading branch information
sallybg committed Jun 6, 2024
1 parent edc2ccf commit 88acaa5
Show file tree
Hide file tree
Showing 2 changed files with 194 additions and 3 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ dependencies = [
"pydantic>=2",
"python-dotenv",
"setuptools>=68.0", # tmp -- ensure 3.12 compatibility
"mavehgvs==0.6.1"
]
dynamic = ["version"]

Expand Down
196 changes: 193 additions & 3 deletions src/dcd_mapping/vrs_map.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Map transcripts to VRS objects."""
import logging
from itertools import cycle
import re

import click
from Bio.Seq import Seq
Expand All @@ -15,6 +16,12 @@
SequenceString,
)
from ga4gh.vrs.normalize import normalize
from mavehgvs.util import parse_variant_strings
from mavehgvs.variant import (
Variant,
VariantPosition
)
from mavehgvs import patterns

from dcd_mapping.lookup import (
get_chromosome_identifier,
Expand Down Expand Up @@ -64,6 +71,179 @@ def _process_any_aa_code(hgvs_pro_string: str) -> str:
return hgvs_pro_string


# create hgvs strings functions
# pre map
# post map
# protein-coding hgvs / transcript reference
# noncoding hgvs / chromosome reference (blat)


def _create_hgvs_strings_pre_protein(
raw_description: str,
sequence_id: str,
) -> list[str]:

raw_variants = _parse_raw_variant_str(raw_description)

# pass the list to mavehgvs parser, return list of variant objects
variants, errors = parse_variant_strings(raw_variants)

hgvs_strings = []

# for each parsed variant object:
for variant, error in zip(variants, errors):
if error is not None:
raise ValueError("Variant could not be parsed by mavehgvs: " + error)

if not isinstance(variant.positions, VariantPosition):
raise NotImplementedError("Pre-map VRS translation for variants spanning multiple positions has not been implemented.")

# TODO protein fs hgvs strings are not supported because they can't be handled by vrs allele translator
if re.search(patterns.protein.pro_fs, str(variant)):
raise NotImplementedError("Pre-map VRS translation not supported for fs variants denoted with protein hgvs strings")

hgvs_strings.append('ga4gh:' + sequence_id + ':' + str(variant))

return hgvs_strings


def _create_hgvs_strings_post_protein(
alignment: AlignmentResult,
raw_description: str,
tx: TxSelectResult,
) -> list[str]:

# assume annotation layer is protein

raw_variants = _parse_raw_variant_str(raw_description)

# pass the list to mavehgvs parser, return list of variant objects
variants, errors = parse_variant_strings(raw_variants)

hgvs_strings = []

# for each parsed variant object:
for variant, error in zip(variants, errors):
if error is not None:
raise ValueError("Variant could not be parsed by mavehgvs: " + error)

if not isinstance(variant.positions, VariantPosition):
raise NotImplementedError("Post-map VRS translation for variants spanning multiple positions has not been implemented.")

# TODO protein fs hgvs strings are not supported because they can't be handled by vrs allele translator
if re.search(patterns.protein.pro_fs, str(variant)):
raise NotImplementedError("Post-map VRS translation not supported for fs variants denoted with protein hgvs strings")

# adjust position (need offset)
# TODO check offset is accurate
variant.positions.position = variant.positions.position + tx.start
# create hgvs string
hgvs_strings.append(tx.np + ':' + str(variant))

# return a list of hgvs strings
return hgvs_strings


def _create_hgvs_strings_post_genomic(
alignment: AlignmentResult,
raw_description: str,
) -> list[str]:
# assume annotation layer is genomic

raw_variants = _parse_raw_variant_str(raw_description)

# pass the list to mavehgvs parser, return list of variant objects
variants, errors = parse_variant_strings(raw_variants)

hgvs_strings = []

# for each parsed variant object:
for variant, error in zip(variants, errors):
if error is not None:
raise ValueError("Variant could not be parsed by mavehgvs: " + error)

if not isinstance(variant.positions, VariantPosition):
raise NotImplementedError("Post-map VRS translation for variants spanning multiple positions has not been implemented.")

variant = _adjust_variant_to_ref(variant, alignment)

variant._prefix = 'g'

hgvs_strings.append(alignment.chrom + ':' + str(variant))

# return a list of hgvs strings
return hgvs_strings


def _adjust_variant_to_ref(
variant: Variant,
alignment: AlignmentResult,
) -> Variant:
"""
Takes a variant object relative to a scoreset's target sequence,
and returns a variant object that describes the variant
relative to the provided alignment result
"""
start = variant.positions.position - 1 #hgvs uses 1-based numbering for c. sequences, while blat hits are 0-based

# get hit
query_subrange_containing_hit = None
target_subrange_containing_hit = None

for query_subrange, target_subrange in zip(alignment.query_subranges, alignment.hit_subranges):
if start >= query_subrange.start and start < query_subrange.end:
query_subrange_containing_hit = query_subrange
target_subrange_containing_hit = target_subrange
break

# if hit is on positive strand
if alignment.strand == 1:
# get variant start relative to the reference (the "hit")
# distance from beginning of query to variant start position:
query_to_start = start - query_subrange_containing_hit.start
# distance from beginning of ref to the variant start position:
ref_to_start = target_subrange_containing_hit.start + query_to_start
# hgvs is 1-based, so convert back to 1-based
variant.positions.position = ref_to_start + 1

else: # if hit is on negative strand
# in this case, picture the rev comp of the query/variant as mapping to the positive strand of the ref
# the start of the reverse complement of the variant is the end of the "original" variant
# so we need to know where the end of the original variant is, relative to the query molecule
# for single-position variants, we'll assume the end (rev comp view) is equal to: start - 1

# the distance between the start and end of the variant is dependent on the number of positions covered by the variant!
# this is hardcoded for single-position variants, for now
end = start
# subtract 1 from end of hit range, because blat ranges are 0-based [start, end)
ref_to_start = (target_subrange_containing_hit.end -1 ) - (end - query_subrange_containing_hit.start)
# or could do ranges[i][0] + (end - hits[i][1]), is one better than the other? any cases where one might be inaccurate?
# hgvs is 1-based, so convert back to 1-based
variant.positions.position = ref_to_start + 1

# this is only tested for single position variants

revcomp_sequences_list = []
for sequence in variant._sequences:
revcomp_sequences_list.append(str(Seq(sequence).reverse_complement()))
variant._sequences = tuple(revcomp_sequences_list)

return variant


def _parse_raw_variant_str(raw_description: str) -> list[str]:
"""
Takes a string that may contain a list of variant descriptions
or a single variant description,
and returns a list of variant description strings
"""
if "[" in raw_description:
prefix = raw_description[0:2]
return [prefix + var for var in set(raw_description[3:-1].split(";"))]
else:
return [raw_description]


def _create_hgvs_strings(
alignment: AlignmentResult,
raw_description: str,
Expand Down Expand Up @@ -138,9 +318,15 @@ def _map_protein_coding_pro(
pre_mapped=vrs_variation,
post_mapped=vrs_variation,
)
hgvs_strings = _create_hgvs_strings(
align_result, row.hgvs_pro, AnnotationLayer.PROTEIN, transcript
)

# can't translate output of this function currently
pre_mapped_hgvs_strings = _create_hgvs_strings_pre_protein(row.hgvs_pro, sequence_id)

post_mapped_hgvs_strings = _create_hgvs_strings_post_protein(align_result, row.hgvs_pro, transcript)


hgvs_strings = _create_hgvs_strings(align_result, row.hgvs_pro, AnnotationLayer.PROTEIN, transcript)

pre_mapped_protein = _get_variation(
hgvs_strings,
AnnotationLayer.PROTEIN,
Expand Down Expand Up @@ -244,9 +430,13 @@ def _map_protein_coding(
variations.append(hgvs_pro_mappings)
if not _hgvs_nt_is_valid(row.hgvs_nt):
continue

post_mapped_hgvs_strings = _create_hgvs_strings_post_genomic(align_result, row.hgvs_nt)

hgvs_strings = _create_hgvs_strings(
align_result, row.hgvs_nt, AnnotationLayer.GENOMIC
)

pre_mapped_genomic = _get_variation(
hgvs_strings,
AnnotationLayer.GENOMIC,
Expand Down

0 comments on commit 88acaa5

Please sign in to comment.