Skip to content

Commit

Permalink
feat!: update tx segment with new cool-seq-tool structure changes (#176)
Browse files Browse the repository at this point in the history
* build!: update tx segment with new cool-seq-tool structure changes

* fix: kwarg name for chromosome genomic_ac changed

* add back internal sequence location method for setting ids for sequence location

* feat: assume coordinates are inter-residue when not specified

* test: update test that verifies templated sequence element defaults to inter-residue coords

* add null-safes

* add null-safes

* feat: update tx segment construction with new names from cool-seq-tool

* docs: remove strand from kwargs comment since it's not accepted anymore

* feat: update cool-seq-tool with latest release and fix tests

* removing unneeded check

* renaming chromosome to genomic_ac for consistency with cool-seq-tool

* handle potential errors returned from cool-seq-tool

* refactor: use sequencelocations directly from cst instead of creating new ones

* style: making whitespace nicer
  • Loading branch information
katiestahl authored Aug 21, 2024
1 parent 311aede commit b6988b7
Show file tree
Hide file tree
Showing 4 changed files with 132 additions and 119 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ dependencies = [
"ga4gh.vrs ~=2.0.0a10",
"biocommons.seqrepo",
"gene-normalizer ==0.4.0",
"cool-seq-tool ~=0.5.1",
"cool-seq-tool ~=0.7.0",
]
dynamic=["version"]

Expand Down
161 changes: 91 additions & 70 deletions src/fusor/fusor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from bioutils.accessions import coerce_namespace
from cool_seq_tool.app import CoolSeqTool
from cool_seq_tool.schemas import ResidueMode, Strand
from cool_seq_tool.schemas import CoordinateType, Strand
from ga4gh.core import ga4gh_identify
from ga4gh.core.domain_models import Gene
from ga4gh.vrs import models
Expand Down Expand Up @@ -231,7 +231,7 @@ async def transcript_segment_element(
:param kwargs:
If ``tx_to_genomic_coords``, possible key word arguments:
(From `cool_seq_tool.transcript_to_genomic_coords <https://coolseqtool.readthedocs.io/stable/reference/api/mappers/cool_seq_tool.mappers.exon_genomic_coords.html>`_)
(From `cool_seq_tool.tx_segment_to_genomic <https://coolseqtool.readthedocs.io/stable/reference/api/mappers/cool_seq_tool.mappers.exon_genomic_coords.html>`_)
* **gene** (``str | None = None``)
* **transcript** (``str | None = None``)
Expand All @@ -242,80 +242,69 @@ async def transcript_segment_element(
else:
(From `cool_seq_tool.genomic_to_transcript_exon_coordinates <https://coolseqtool.readthedocs.io/stable/reference/api/mappers/cool_seq_tool.mappers.exon_genomic_coords.html>`_)
(From `cool_seq_tool.genomic_to_tx_segment <https://coolseqtool.readthedocs.io/stable/reference/api/mappers/cool_seq_tool.mappers.exon_genomic_coords.html>`_)
* **chromosome**: (``Union[str, int]``)
* **start**: (``Optional[int] = None``)
* **end**: (``Optional[int] = None``)
* **strand**: (``Optional[int] = None``)
* **genomic_ac**: (``str``)
* **seg_start_genomic**: (``Optional[int] = None``)
* **seg_end_genomic**: (``Optional[int] = None``)
* **transcript**: (``Optional[str] = None``)
* **gene**: (``Optional[str] = None``)
* **residue_mode**: (``ResidueMode = ResidueMode.RESIDUE``)
:return: Transcript Segment Element, warning
"""
if tx_to_genomic_coords:
data = await self.cool_seq_tool.ex_g_coords_mapper.transcript_to_genomic_coordinates(
data = await self.cool_seq_tool.ex_g_coords_mapper.tx_segment_to_genomic(
**kwargs
)
else:
if "chromosome" in kwargs and kwargs.get("chromosome") is None:
if "genomic_ac" in kwargs and kwargs.get("genomic_ac") is None:
msg = (
"`chromosome` is required when going from genomic to"
"`genomic_ac` is required when going from genomic to"
" transcript exon coordinates"
)
_logger.warning(msg)
return None, [msg]
residue_mode = kwargs.get("residue_mode")
# TODO: Remove once fixed in cool_seq_tool - need to verify that this code isn't still needed
if residue_mode != ResidueMode.INTER_RESIDUE:
start = kwargs.get("start")
kwargs["start"] = start - 1 if start is not None else None
kwargs["residue_mode"] = "inter-residue"
chromosome = kwargs.get("chromosome")
# if chromosome is a string, assume it's an accession, fix it for the kwargs since CST expects this as alt_ac
if type(chromosome) is str:
kwargs["alt_ac"] = chromosome
data = await self.cool_seq_tool.ex_g_coords_mapper.genomic_to_transcript_exon_coordinates(
data = await self.cool_seq_tool.ex_g_coords_mapper.genomic_to_tx_segment(
**kwargs
)

if data.genomic_data is None:
return None, data.warnings
if data.errors:
return None, data.errors

genomic_data = data.genomic_data
genomic_data.transcript = coerce_namespace(genomic_data.transcript)
data.tx_ac = coerce_namespace(data.tx_ac)

normalized_gene_response = self._normalized_gene(
genomic_data.gene, use_minimal_gene=use_minimal_gene
data.gene, use_minimal_gene=use_minimal_gene
)
if not normalized_gene_response[0] and normalized_gene_response[1]:
return None, [normalized_gene_response[1]]

seg_start = data.seg_start
genomic_start_location = seg_start.genomic_location if seg_start else None
if genomic_start_location:
self._add_ids_to_sequence_location(
genomic_start_location, data.genomic_ac, seq_id_target_namespace
)

seg_end = data.seg_end
genomic_end_location = seg_end.genomic_location if seg_end else None
if genomic_end_location:
self._add_ids_to_sequence_location(
genomic_end_location, data.genomic_ac, seq_id_target_namespace
)

return (
TranscriptSegmentElement(
transcript=genomic_data.transcript,
exonStart=genomic_data.exon_start,
exonStartOffset=genomic_data.exon_start_offset,
exonEnd=genomic_data.exon_end,
exonEndOffset=genomic_data.exon_end_offset,
transcript=data.tx_ac,
# offset by 1 because in CST exons are 0-based
exonStart=seg_start.exon_ord + 1 if seg_start else None,
exonStartOffset=seg_start.offset if seg_start else None,
# offset by 1 because in CST exons are 0-based
exonEnd=seg_end.exon_ord + 1 if seg_end else None,
exonEndOffset=seg_end.offset if seg_end else None,
gene=normalized_gene_response[0],
elementGenomicStart=self._sequence_location(
genomic_data.start,
genomic_data.start + 1,
genomic_data.chr,
seq_id_target_namespace=seq_id_target_namespace,
)
if genomic_data.start
else None,
elementGenomicEnd=self._sequence_location(
genomic_data.end,
genomic_data.end + 1,
genomic_data.chr,
seq_id_target_namespace=seq_id_target_namespace,
)
if genomic_data.end
else None,
elementGenomicStart=genomic_start_location,
elementGenomicEnd=genomic_end_location,
),
None,
)
Expand All @@ -342,7 +331,7 @@ def templated_sequence_element(
end: int,
sequence_id: str,
strand: Strand,
residue_mode: ResidueMode = ResidueMode.RESIDUE,
coordinate_type: CoordinateType = CoordinateType.INTER_RESIDUE,
seq_id_target_namespace: str | None = None,
) -> TemplatedSequenceElement:
"""Create templated sequence element
Expand All @@ -351,13 +340,13 @@ def templated_sequence_element(
:param end: Genomic end
:param sequence_id: Chromosome accession for sequence
:param strand: Strand
:param residue_mode: Determines coordinate base used. Must be one of ``residue``
:param coordinate_type: Determines coordinate base used. Must be one of ``residue``
or ``inter-residue``.
:param seq_id_target_namespace: If want to use digest for ``sequence_id``, set
this to the namespace you want the digest for. Otherwise, leave as ``None``.
:return: Templated Sequence Element
"""
if residue_mode == ResidueMode.RESIDUE:
if coordinate_type == CoordinateType.RESIDUE:
start -= 1

region = self._sequence_location(
Expand Down Expand Up @@ -523,25 +512,9 @@ def _sequence_location(
:param seq_id_target_namespace: If want to use digest for ``sequence_id``, set
this to the namespace you want the digest for. Otherwise, leave as ``None``.
"""
try:
sequence_id = coerce_namespace(sequence_id)
except ValueError:
if not re.match(CURIE.__metadata__[0].pattern, sequence_id):
sequence_id = f"sequence.id:{sequence_id}"

if seq_id_target_namespace:
try:
seq_id = translate_identifier(
self.seqrepo, sequence_id, target_namespace=seq_id_target_namespace
)
except IDTranslationException:
_logger.warning(
"Unable to translate %s using %s as the target namespace",
sequence_id,
seq_id_target_namespace,
)
else:
sequence_id = seq_id
sequence_id = self._get_coerced_sequence_id(
sequence_id, seq_id_target_namespace
)

refget_accession = translate_identifier(self.seqrepo, sequence_id)

Expand Down Expand Up @@ -586,6 +559,54 @@ def _normalized_gene(
return gene, None
return None, f"gene-normalizer unable to normalize {query}"

def _add_ids_to_sequence_location(
self,
sequence_location: SequenceLocation,
genomic_ac: str,
seq_id_target_namespace: str | None = None,
) -> None:
"""Modify the sequence_location to have ga4gh_identified id and its sequenceReference with id from target namespace (refseq default)
:param sequence_location: the SequenceLocation to add/modify id's of
:param seq_id_target_namespace: the target namespace for the SequenceLocation's sequenceReference id, which is the genomic_ac
(defaults to refseq if none given)
"""
seq_ref_id = self._get_coerced_sequence_id(genomic_ac, seq_id_target_namespace)

if sequence_location:
sequence_location.id = ga4gh_identify(sequence_location)
if sequence_location.sequenceReference:
sequence_location.sequenceReference.id = seq_ref_id

def _get_coerced_sequence_id(
self, sequence_id: str, seq_id_target_namespace: str | None = None
) -> str:
"""Get the coerced sequence_id using a target namespace and log any errors
:param sequence_id: the sequence id to coerce
:param seq_id_target_namespace: the target namespace
"""
try:
sequence_id = coerce_namespace(sequence_id)
except ValueError:
if not re.match(CURIE.__metadata__[0].pattern, sequence_id):
sequence_id = f"sequence.id:{sequence_id}"

if seq_id_target_namespace:
try:
seq_id = translate_identifier(
self.seqrepo, sequence_id, target_namespace=seq_id_target_namespace
)
except IDTranslationException:
_logger.warning(
"Unable to translate %s using %s as the target namespace",
sequence_id,
seq_id_target_namespace,
)
else:
sequence_id = seq_id
return sequence_id

def generate_nomenclature(self, fusion: Fusion) -> str:
"""Generate human-readable nomenclature describing provided fusion
Expand Down
2 changes: 1 addition & 1 deletion src/fusor/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def check_exons(cls, values):
msg = "Must give values for either `exonStart`, `exonEnd`, or both"
exon_start = values.get("exonStart")
exon_end = values.get("exonEnd")
if (not exon_start) and (not exon_end):
if (exon_start is None) and (exon_end is None):
raise ValueError(msg)

if exon_start:
Expand Down
Loading

0 comments on commit b6988b7

Please sign in to comment.