From 442083c09778e97d8d6fe53b2c35cc54bcde1d48 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Wed, 17 Jul 2024 13:12:23 -0400 Subject: [PATCH 01/82] build!: remove vrsatile --- pyproject.toml | 5 ++--- src/fusor/fusor.py | 8 -------- src/fusor/models.py | 7 ------- src/fusor/nomenclature.py | 1 - src/fusor/tools.py | 1 - tests/test_fusor.py | 1 - 6 files changed, 2 insertions(+), 21 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 81b62fa..195ebba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,10 +26,9 @@ description = "Computable object representation and validation for gene fusions" license = {file = "LICENSE"} dependencies = [ "pydantic == 2.*", - "ga4gh.vrsatile.pydantic ~=0.2.0", - "ga4gh.vrs ~=0.8.1", + "ga4gh.vrs ~=2.0.0a8", "biocommons.seqrepo", - "gene-normalizer ~=0.1.40-dev1", + "gene-normalizer ~=0.4.0", "cool-seq-tool ~=0.5.0", ] dynamic=["version"] diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 6d12c26..7dcc674 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -9,14 +9,6 @@ from cool_seq_tool.schemas import ResidueMode from ga4gh.core import ga4gh_identify from ga4gh.vrs import models -from ga4gh.vrsatile.pydantic.vrs_models import ( - CURIE, - Number, - SequenceInterval, - SequenceLocation, - VRSTypes, -) -from ga4gh.vrsatile.pydantic.vrsatile_models import GeneDescriptor, LocationDescriptor from gene.database import AbstractDatabase as GeneDatabase from gene.database import create_db from gene.query import QueryHandler diff --git a/src/fusor/models.py b/src/fusor/models.py index 124ee79..d5c4fa1 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -4,13 +4,6 @@ from enum import Enum from typing import Any, Literal -from ga4gh.vrsatile.pydantic import return_value -from ga4gh.vrsatile.pydantic.vrsatile_models import ( - CURIE, - GeneDescriptor, - LocationDescriptor, - SequenceDescriptor, -) from pydantic import ( BaseModel, ConfigDict, diff --git a/src/fusor/nomenclature.py b/src/fusor/nomenclature.py index ca755bc..c250df0 100644 --- a/src/fusor/nomenclature.py +++ b/src/fusor/nomenclature.py @@ -1,7 +1,6 @@ """Provide helper methods for fusion nomenclature generation.""" from biocommons.seqrepo.seqrepo import SeqRepo -from ga4gh.vrsatile.pydantic.vrs_models import SequenceLocation from fusor.exceptions import IDTranslationException from fusor.models import ( diff --git a/src/fusor/tools.py b/src/fusor/tools.py index 2e42ccc..fa3cf5a 100644 --- a/src/fusor/tools.py +++ b/src/fusor/tools.py @@ -3,7 +3,6 @@ import logging from biocommons.seqrepo.seqrepo import SeqRepo -from ga4gh.vrsatile.pydantic.vrs_models import CURIE from fusor.exceptions import IDTranslationException diff --git a/tests/test_fusor.py b/tests/test_fusor.py index 1039316..da57f19 100644 --- a/tests/test_fusor.py +++ b/tests/test_fusor.py @@ -3,7 +3,6 @@ import copy import pytest -from ga4gh.vrsatile.pydantic.vrsatile_models import GeneDescriptor, LocationDescriptor from fusor.exceptions import FUSORParametersException from fusor.models import ( From 4a538bb7cdcff86f1b9e842596623fdd065f8483 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Wed, 17 Jul 2024 13:23:19 -0400 Subject: [PATCH 02/82] wip: remove gene descriptor --- src/fusor/fusor.py | 52 ++++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 7dcc674..c9625fc 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -8,10 +8,13 @@ from cool_seq_tool.app import CoolSeqTool from cool_seq_tool.schemas import ResidueMode from ga4gh.core import ga4gh_identify +from ga4gh.core.domain_models import Gene from ga4gh.vrs import models +from ga4gh.vrs.models import SequenceLocation from gene.database import AbstractDatabase as GeneDatabase from gene.database import create_db from gene.query import QueryHandler +from gene.schemas import CURIE from pydantic import ValidationError from fusor.exceptions import FUSORParametersException, IDTranslationException @@ -92,7 +95,7 @@ def _contains_element_type(kwargs: dict, elm_type: StructuralElementType) -> boo def fusion(self, fusion_type: FusionType | None = None, **kwargs) -> Fusion: """Construct fusion object. - :param fusion_type: explicitly specify fusion type. Unecessary if providing + :param fusion_type: explicitly specify fusion type. Unnecessary if providing fusion object in keyword args that includes ``type`` attribute. :return: constructed fusion object if successful :raise: FUSORParametersException if fusion type unable to be determined, @@ -284,7 +287,7 @@ async def transcript_segment_element( exon_end=genomic_data.exon_end, exon_end_offset=genomic_data.exon_end_offset, gene_descriptor=normalized_gene_response[0], - element_genomic_start=self._location_descriptor( + element_genomic_start=self._sequence_location( genomic_data.start, genomic_data.start + 1, genomic_data.chr, @@ -293,7 +296,7 @@ async def transcript_segment_element( ) if genomic_data.start else None, - element_genomic_end=self._location_descriptor( + element_genomic_end=self._sequence_location( genomic_data.end, genomic_data.end + 1, genomic_data.chr, @@ -353,7 +356,7 @@ def templated_sequence_element( if residue_mode == ResidueMode.RESIDUE: start -= 1 - region = self._location_descriptor( + region = self._sequence_location( start, end, sequence_id, @@ -460,7 +463,7 @@ def functional_domain( if not gene_descr: return None, warning - loc_descr = self._location_descriptor( + loc_descr = self._sequence_location( start, end, sequence_id, seq_id_target_namespace=seq_id_target_namespace ) @@ -511,7 +514,7 @@ def regulatory_element( _logger.warning(msg) return None, msg - def _location_descriptor( + def _sequence_location( self, start: int, end: int, @@ -519,7 +522,7 @@ def _location_descriptor( label: str | None = None, seq_id_target_namespace: str | None = None, use_location_id: bool = False, - ) -> LocationDescriptor: + ) -> SequenceLocation: """Create location descriptor :param start: Start position @@ -556,22 +559,11 @@ def _location_descriptor( else: sequence_id = seq_id - location = SequenceLocation( + return SequenceLocation( sequence_id=sequence_id, - interval=SequenceInterval(start=Number(value=start), end=Number(value=end)), + start=start, end=end, ) - if use_location_id: - _id = self._location_id(location.model_dump()) - else: - quote_id = quote(label) if label else quote(seq_id_input) - _id = f"fusor.location_descriptor:{quote_id}" - - location_descr = LocationDescriptor(id=_id, location=location) - - if label: - location_descr.label = label - return location_descr def add_additional_fields( self, @@ -628,7 +620,7 @@ def add_location_id(self, fusion: Fusion) -> Fusion: ]: if element_genomic: location = element_genomic.location - if location.type == VRSTypes.SEQUENCE_LOCATION.value: + if location.type == SequenceLocation: location_id = self._location_id(location.model_dump()) element_genomic.location_id = location_id if isinstance(fusion, CategoricalFusion) and fusion.critical_functional_domains: @@ -640,7 +632,7 @@ def add_location_id(self, fusion: Fusion) -> Fusion: element = fusion.regulatory_element if element.feature_location: location = element.feature_location - if location.type == VRSTypes.SEQUENCE_LOCATION.value: + if location.type == SequenceLocation: location_id = self._location_id(location.model_dump()) element.feature_location.location_id = location_id return fusion @@ -666,7 +658,7 @@ def add_translated_sequence_id( for element in fusion.structural_elements: if isinstance(element, TemplatedSequenceElement): location = element.region.location - if location.type == VRSTypes.SEQUENCE_LOCATION.value: + if location.type == SequenceLocation: try: new_id = translate_identifier( self.seqrepo, location.sequence_id, target_namespace @@ -682,7 +674,7 @@ def add_translated_sequence_id( ]: if loc_descr: location = loc_descr.location - if location.type == VRSTypes.SEQUENCE_LOCATION.value: + if location.type == SequenceLocation: try: new_id = translate_identifier( self.seqrepo, location.sequence_id, target_namespace @@ -708,6 +700,7 @@ def add_translated_sequence_id( domain.sequence_location.location.sequence_id = new_id return fusion + # TODO: should this be adding to the gene extensions or something instead? def add_gene_descriptor(self, fusion: Fusion) -> Fusion: """Add additional fields to ``gene_descriptor`` in fusion object @@ -722,7 +715,7 @@ def add_gene_descriptor(self, fusion: Fusion) -> Fusion: for obj in prop: if "gene_descriptor" in obj.model_fields: label = obj.gene_descriptor.label - norm_gene_descr, _ = self._normalized_gene_descriptor( + norm_gene_descr, _ = self._normalized_gene( label, use_minimal_gene_descr=False ) if norm_gene_descr: @@ -730,16 +723,16 @@ def add_gene_descriptor(self, fusion: Fusion) -> Fusion: if fusion.regulatory_element and fusion.regulatory_element.associated_gene: reg_el = fusion.regulatory_element label = reg_el.associated_gene.label - norm_gene_descr, _ = self._normalized_gene_descriptor( + norm_gene_descr, _ = self._normalized_gene( label, use_minimal_gene_descr=False ) if norm_gene_descr: reg_el.associated_gene = norm_gene_descr return fusion - def _normalized_gene_descriptor( + def _normalized_gene( self, query: str, use_minimal_gene_descr: bool = True - ) -> tuple[GeneDescriptor | None, str | None]: + ) -> tuple[Gene | None, str | None]: """Return gene descriptor from normalized response. :param query: Gene query @@ -753,7 +746,8 @@ def _normalized_gene_descriptor( if gene_norm_resp.match_type: gene_descr = gene_norm_resp.gene_descriptor if use_minimal_gene_descr: - gene_descr = GeneDescriptor( + # TODO: how to handle gene_id here? add to extensions?? + gene_descr = Gene( id=gene_descr.id, gene_id=gene_descr.gene_id, label=gene_descr.label ) return gene_descr, None From c0e8626015705fc5a3c33262c35f7ae34db70e51 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Wed, 17 Jul 2024 13:31:57 -0400 Subject: [PATCH 03/82] wip: remove gene descriptor --- src/fusor/models.py | 76 ++++++++++++++++----------------------------- 1 file changed, 27 insertions(+), 49 deletions(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index d5c4fa1..d557569 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -4,6 +4,9 @@ from enum import Enum from typing import Any, Literal +from ga4gh.core.domain_models import Gene +from ga4gh.vrs.models import SequenceLocation +from gene.schemas import CURIE from pydantic import ( BaseModel, ConfigDict, @@ -27,7 +30,8 @@ class FUSORTypes(str, Enum): TRANSCRIPT_SEGMENT_ELEMENT = "TranscriptSegmentElement" TEMPLATED_SEQUENCE_ELEMENT = "TemplatedSequenceElement" LINKER_SEQUENCE_ELEMENT = "LinkerSequenceElement" - GENE_ELEMENT = "GeneElement" + # TODO: I'm not sure if this needs to still be here or not + GENE = "Gene" UNKNOWN_GENE_ELEMENT = "UnknownGeneElement" MULTIPLE_POSSIBLE_GENES_ELEMENT = "MultiplePossibleGenesElement" REGULATORY_ELEMENT = "RegulatoryElement" @@ -56,11 +60,12 @@ class FunctionalDomain(BaseModel): type: Literal[FUSORTypes.FUNCTIONAL_DOMAIN] = FUSORTypes.FUNCTIONAL_DOMAIN status: DomainStatus - associated_gene: GeneDescriptor + associated_gene: Gene id: CURIE | None = Field(None, alias="_id") label: StrictStr | None = None - sequence_location: LocationDescriptor | None = None + sequence_location: SequenceLocation | None = None + # TODO: is this obsolete now that vrsatile has been removed? _get_id_val = field_validator("id")(return_value) model_config = ConfigDict( @@ -100,7 +105,7 @@ class StructuralElementType(str, Enum): TRANSCRIPT_SEGMENT_ELEMENT = FUSORTypes.TRANSCRIPT_SEGMENT_ELEMENT.value TEMPLATED_SEQUENCE_ELEMENT = FUSORTypes.TEMPLATED_SEQUENCE_ELEMENT.value LINKER_SEQUENCE_ELEMENT = FUSORTypes.LINKER_SEQUENCE_ELEMENT.value - GENE_ELEMENT = FUSORTypes.GENE_ELEMENT.value + GENE_ELEMENT = Gene UNKNOWN_GENE_ELEMENT = FUSORTypes.UNKNOWN_GENE_ELEMENT.value MULTIPLE_POSSIBLE_GENES_ELEMENT = FUSORTypes.MULTIPLE_POSSIBLE_GENES_ELEMENT.value @@ -122,9 +127,9 @@ class TranscriptSegmentElement(BaseStructuralElement): exon_start_offset: StrictInt | None = 0 exon_end: StrictInt | None = None exon_end_offset: StrictInt | None = 0 - gene_descriptor: GeneDescriptor - element_genomic_start: LocationDescriptor | None = None - element_genomic_end: LocationDescriptor | None = None + gene: Gene + element_genomic_start: SequenceLocation | None = None + element_genomic_end: SequenceLocation | None = None @model_validator(mode="before") def check_exons(cls, values): @@ -209,7 +214,7 @@ class LinkerElement(BaseStructuralElement, extra="forbid"): type: Literal[FUSORTypes.LINKER_SEQUENCE_ELEMENT] = ( FUSORTypes.LINKER_SEQUENCE_ELEMENT ) - linker_sequence: SequenceDescriptor + linker_sequence: SequenceLocation @field_validator("linker_sequence", mode="before") def validate_sequence(cls, v): @@ -219,7 +224,7 @@ def validate_sequence(cls, v): v["sequence"] = v["sequence"].upper() except KeyError as e: raise TypeError from e - elif isinstance(v, SequenceDescriptor): + elif isinstance(v, SequenceLocation): v.sequence = v.sequence.upper() else: raise TypeError @@ -257,7 +262,7 @@ class TemplatedSequenceElement(BaseStructuralElement): type: Literal[FUSORTypes.TEMPLATED_SEQUENCE_ELEMENT] = ( FUSORTypes.TEMPLATED_SEQUENCE_ELEMENT ) - region: LocationDescriptor + region: SequenceLocation strand: Strand model_config = ConfigDict( @@ -285,27 +290,6 @@ class TemplatedSequenceElement(BaseStructuralElement): ) -class GeneElement(BaseStructuralElement): - """Define Gene Element class.""" - - type: Literal[FUSORTypes.GENE_ELEMENT] = FUSORTypes.GENE_ELEMENT - gene_descriptor: GeneDescriptor - - model_config = ConfigDict( - json_schema_extra={ - "example": { - "type": "GeneElement", - "gene_descriptor": { - "id": "gene:BRAF", - "gene_id": "hgnc:1097", - "label": "BRAF", - "type": "GeneDescriptor", - }, - } - }, - ) - - class UnknownGeneElement(BaseStructuralElement): """Define UnknownGene class. This is primarily intended to represent a partner in the result of a fusion partner-agnostic assay, which identifies @@ -381,8 +365,8 @@ class RegulatoryElement(BaseModel): type: Literal[FUSORTypes.REGULATORY_ELEMENT] = FUSORTypes.REGULATORY_ELEMENT regulatory_class: RegulatoryClass feature_id: str | None = None - associated_gene: GeneDescriptor | None = None - feature_location: LocationDescriptor | None = None + associated_gene: Gene | None = None + feature_location: SequenceLocation | None = None _get_ref_id_val = field_validator("feature_id")(return_value) @@ -607,7 +591,7 @@ class Assay(BaseModelForbidExtra): AssayedFusionElements = list[ TranscriptSegmentElement - | GeneElement + | Gene | TemplatedSequenceElement | LinkerElement | UnknownGeneElement @@ -675,13 +659,10 @@ class AssayedFusion(AbstractFusion): }, "structural_elements": [ { - "type": "GeneElement", - "gene_descriptor": { - "id": "gene:EWSR1", - "gene_id": "hgnc:3058", - "label": "EWSR1", - "type": "GeneDescriptor", - }, + "type": "Gene", + "id": "gene:EWSR1", + "gene_id": "hgnc:3058", + "label": "EWSR1", }, {"type": "UnknownGeneElement"}, ], @@ -692,7 +673,7 @@ class AssayedFusion(AbstractFusion): CategoricalFusionElements = list[ TranscriptSegmentElement - | GeneElement + | Gene | TemplatedSequenceElement | LinkerElement | MultiplePossibleGenesElement @@ -774,13 +755,10 @@ class CategoricalFusion(AbstractFusion): }, }, { - "type": "GeneElement", - "gene_descriptor": { - "id": "gene:ALK", - "type": "GeneDescriptor", - "gene_id": "hgnc:427", - "label": "ALK", - }, + "type": "Gene", + "id": "gene:ALK", + "gene_id": "hgnc:427", + "label": "ALK", }, ], "regulatory_element": { From 24e1a4ca2b7851ba29e770a0d97c30f5692a04ea Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Wed, 17 Jul 2024 14:05:32 -0400 Subject: [PATCH 04/82] progress updating models and adding back gene element wrapper --- src/fusor/fusor.py | 20 ++++++++------- src/fusor/models.py | 52 ++++++++++++++++++++++++++++++++++++--- src/fusor/nomenclature.py | 18 ++++++++------ tests/conftest.py | 2 +- tests/test_fusor.py | 12 ++++----- tests/test_models.py | 22 ++++++----------- 6 files changed, 83 insertions(+), 43 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index c9625fc..348f7d6 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -2,7 +2,6 @@ import logging import re -from urllib.parse import quote from bioutils.accessions import coerce_namespace from cool_seq_tool.app import CoolSeqTool @@ -32,7 +31,6 @@ FunctionalDomain, Fusion, FusionType, - GeneElement, LinkerElement, MultiplePossibleGenesElement, RegulatoryClass, @@ -311,7 +309,7 @@ async def transcript_segment_element( def gene_element( self, gene: str, use_minimal_gene_descr: bool = True - ) -> tuple[GeneElement | None, str | None]: + ) -> tuple[Gene | None, str | None]: """Create gene element :param str gene: Gene @@ -320,12 +318,15 @@ def gene_element( gene-normalizer's gene descriptor will be used :return: GeneElement, warning """ - gene_descr, warning = self._normalized_gene_descriptor( + gene_descr, warning = self._normalized_gene( gene, use_minimal_gene_descr=use_minimal_gene_descr ) if not gene_descr: return None, warning - return GeneElement(gene_descriptor=gene_descr), None + # TODO: I don't think I can actually pass in label and gene_id here... extensions?? + return Gene( + id=gene_descr.id, label=gene_descr.label, gene_id=gene_descr.gene_id + ), None def templated_sequence_element( self, @@ -561,10 +562,10 @@ def _sequence_location( return SequenceLocation( sequence_id=sequence_id, - start=start, end=end, + start=start, + end=end, ) - def add_additional_fields( self, fusion: Fusion, @@ -780,9 +781,10 @@ def generate_nomenclature(self, fusion: Fusion) -> str: parts.append(tx_segment_nomenclature(element)) elif isinstance(element, TemplatedSequenceElement): parts.append(templated_seq_nomenclature(element, self.seqrepo)) - elif isinstance(element, GeneElement): + # TODO: ? + elif isinstance(element, Gene): if not any( - [gene == element.gene_descriptor.label for gene in element_genes] # noqa: C419 + [gene == element.label for gene in element_genes] # noqa: C419 ): parts.append(gene_nomenclature(element)) else: diff --git a/src/fusor/models.py b/src/fusor/models.py index d557569..ba00f0a 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -19,6 +19,32 @@ from pydantic.fields import Field +def return_value(cls, v): + """Return value from object. + + :param ModelMetaclass cls: Pydantic Model ModelMetaclass + :param v: Model from vrs or vrsatile + :return: Value + """ + if v is not None: + try: + if isinstance(v, list): + tmp = list() + for item in v: + while True: + try: + item = item.root + except AttributeError: + break + tmp.append(item) + v = tmp + else: + v = v.root + except AttributeError: + pass + return v + + class BaseModelForbidExtra(BaseModel, extra="forbid"): """Base Pydantic model class with extra values forbidden.""" @@ -30,8 +56,7 @@ class FUSORTypes(str, Enum): TRANSCRIPT_SEGMENT_ELEMENT = "TranscriptSegmentElement" TEMPLATED_SEQUENCE_ELEMENT = "TemplatedSequenceElement" LINKER_SEQUENCE_ELEMENT = "LinkerSequenceElement" - # TODO: I'm not sure if this needs to still be here or not - GENE = "Gene" + GENE_ELEMENT = "GeneElement" UNKNOWN_GENE_ELEMENT = "UnknownGeneElement" MULTIPLE_POSSIBLE_GENES_ELEMENT = "MultiplePossibleGenesElement" REGULATORY_ELEMENT = "RegulatoryElement" @@ -80,7 +105,7 @@ class FunctionalDomain(BaseModel): "id": "gene:NTRK1", "gene_id": "hgnc:8031", "label": "8031", - "type": "GeneDescriptor", + "type": "Gene", }, "sequence_location": { "id": "fusor.location_descriptor:NP_002520.2", @@ -105,7 +130,7 @@ class StructuralElementType(str, Enum): TRANSCRIPT_SEGMENT_ELEMENT = FUSORTypes.TRANSCRIPT_SEGMENT_ELEMENT.value TEMPLATED_SEQUENCE_ELEMENT = FUSORTypes.TEMPLATED_SEQUENCE_ELEMENT.value LINKER_SEQUENCE_ELEMENT = FUSORTypes.LINKER_SEQUENCE_ELEMENT.value - GENE_ELEMENT = Gene + GENE_ELEMENT = FUSORTypes.GENE_ELEMENT.value UNKNOWN_GENE_ELEMENT = FUSORTypes.UNKNOWN_GENE_ELEMENT.value MULTIPLE_POSSIBLE_GENES_ELEMENT = FUSORTypes.MULTIPLE_POSSIBLE_GENES_ELEMENT.value @@ -245,6 +270,25 @@ def validate_sequence(cls, v): }, ) +class GeneElement(BaseStructuralElement): + """Define Gene Element class.""" + + type: Literal[FUSORTypes.GENE_ELEMENT] = FUSORTypes.GENE_ELEMENT + gene: Gene + + model_config = ConfigDict( + json_schema_extra={ + "example": { + "type": "GeneElement", + "gene": { + "id": "gene:BRAF", + "gene_id": "hgnc:1097", + "label": "BRAF", + "type": "Gene", + }, + } + }, + ) class Strand(str, Enum): """Define possible values for strand""" diff --git a/src/fusor/nomenclature.py b/src/fusor/nomenclature.py index c250df0..6e826c2 100644 --- a/src/fusor/nomenclature.py +++ b/src/fusor/nomenclature.py @@ -1,10 +1,11 @@ """Provide helper methods for fusion nomenclature generation.""" from biocommons.seqrepo.seqrepo import SeqRepo +from ga4gh.core.domain_models import Gene +from ga4gh.vrs.models import SequenceLocation from fusor.exceptions import IDTranslationException from fusor.models import ( - GeneElement, RegulatoryClass, RegulatoryElement, TemplatedSequenceElement, @@ -115,20 +116,21 @@ def templated_seq_nomenclature(element: TemplatedSequenceElement, sr: SeqRepo) - raise ValueError -def gene_nomenclature(element: GeneElement) -> str: +def gene_nomenclature(element: Gene) -> str: """Return fusion nomenclature for gene element. :param element: a gene element object :return: element nomenclature representation :raises ValueError: if unable to retrieve gene ID """ - if element.gene_descriptor.gene_id: - gene_id = gene_id = element.gene_descriptor.gene_id + if element.gene_id: + gene_id = gene_id = element.gene_id if element.gene_descriptor.gene_id: - gene_id = element.gene_descriptor.gene_id - elif element.gene_descriptor.gene and element.gene_descriptor.gene.gene_id: - gene_id = element.gene_descriptor.gene.gene_id + gene_id = element.gene_id + # TODO: fix this? unsure where to store these so unsure where to access them + elif element.gene and element.gene.gene_id: + gene_id = element.gene.gene_id else: raise ValueError - return f"{element.gene_descriptor.label}({gene_id})" + return f"{element.label}({gene_id})" diff --git a/tests/conftest.py b/tests/conftest.py index 77814ed..418d595 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -50,7 +50,7 @@ def braf_gene_descriptor(): """Create gene descriptor params for BRAF.""" return { "id": "normalize.gene:BRAF", - "type": "GeneDescriptor", + "type": "Gene", "label": "BRAF", "xrefs": ["ensembl:ENSG00000157764", "ncbigene:673"], "alternate_labels": ["BRAF1", "BRAF-1", "NS7", "B-raf", "B-RAF1", "RAFB1"], diff --git a/tests/test_fusor.py b/tests/test_fusor.py index da57f19..c660db7 100644 --- a/tests/test_fusor.py +++ b/tests/test_fusor.py @@ -3,13 +3,13 @@ import copy import pytest +from ga4gh.core.domain_models import Gene from fusor.exceptions import FUSORParametersException from fusor.models import ( AssayedFusion, CategoricalFusion, FunctionalDomain, - GeneElement, LinkerElement, MultiplePossibleGenesElement, RegulatoryClass, @@ -23,13 +23,13 @@ @pytest.fixture(scope="module") def braf_gene_descr_min(): """Create minimal gene descriptor for BRAF""" - return GeneDescriptor(id="normalize.gene:BRAF", label="BRAF", gene_id="hgnc:1097") + return Gene(id="normalize.gene:BRAF", label="BRAF", gene_id="hgnc:1097") @pytest.fixture(scope="module") def braf_gene_descr(braf_gene_descriptor): """Create gene descriptor object for braf""" - return GeneDescriptor(**braf_gene_descriptor) + return Gene(**braf_gene_descriptor) @pytest.fixture(scope="module") @@ -481,7 +481,7 @@ def test__normalized_gene_descriptor(fusor_instance): resp = fusor_instance._normalized_gene_descriptor("BRAF") assert resp[0] assert resp[1] is None - assert isinstance(resp[0], GeneDescriptor) + assert isinstance(resp[0], Gene) resp = fusor_instance._normalized_gene_descriptor("B R A F") assert resp[0] is None @@ -781,7 +781,7 @@ def test_gene_element(fusor_instance, braf_gene_descr_min, braf_gene_descr): gc = fusor_instance.gene_element("BRAF", use_minimal_gene_descr=True) assert gc[0] assert gc[1] is None - assert isinstance(gc[0], GeneElement) + assert isinstance(gc[0], Gene) compare_gene_descriptor( gc[0].gene_descriptor.model_dump(), braf_gene_descr_min.model_dump() ) @@ -789,7 +789,7 @@ def test_gene_element(fusor_instance, braf_gene_descr_min, braf_gene_descr): gc = fusor_instance.gene_element("BRAF", use_minimal_gene_descr=False) assert gc[0] assert gc[1] is None - assert isinstance(gc[0], GeneElement) + assert isinstance(gc[0], Gene) compare_gene_descriptor( gc[0].gene_descriptor.model_dump(), braf_gene_descr.model_dump() ) diff --git a/tests/test_models.py b/tests/test_models.py index c77c031..2a93281 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -3,6 +3,7 @@ import copy import pytest +from ga4gh.core.domain_models import Gene from pydantic import ValidationError from fusor.models import ( @@ -13,7 +14,6 @@ CausativeEvent, EventType, FunctionalDomain, - GeneElement, LinkerElement, MultiplePossibleGenesElement, RegulatoryElement, @@ -593,7 +593,7 @@ def assert_genomic_region_test_element(test): def test_gene_element(gene_descriptors): """Test that Gene Element initializes correctly.""" - test_element = GeneElement(gene_descriptor=gene_descriptors[0]) + test_element = Gene(**gene_descriptors[0]) assert test_element.type == "GeneElement" assert test_element.gene_descriptor.id == "gene:G1" assert test_element.gene_descriptor.label == "G1" @@ -601,21 +601,15 @@ def test_gene_element(gene_descriptors): # test CURIE requirement with pytest.raises(ValidationError) as exc_info: - GeneElement( - gene_descriptor={ - "id": "G1", - "gene": {"gene_id": "hgnc:9339"}, - "label": "G1", - } + Gene( + id="G1", gene={"gene_id": "hgnc:9339"}, label="G1" ) msg = "String should match pattern '^\\w[^:]*:.+$'" check_validation_error(exc_info, msg) # test enum validation with pytest.raises(ValidationError) as exc_info: - assert GeneElement( - type="UnknownGeneElement", gene_descriptor=gene_descriptors[0] - ) + assert Gene(type="UnknownGeneElement", **gene_descriptors[0]) msg = "Input should be " check_validation_error(exc_info, msg) @@ -722,10 +716,9 @@ def test_fusion( ], regulatory_element=None, ) - assert fusion.structural_elements[0].type == "GeneElement" + assert fusion.structural_elements[0].type == "Gene" assert fusion.structural_elements[0].gene_descriptor.id == "gene:NTRK1" - assert fusion.structural_elements[1].type == "GeneElement" - assert fusion.structural_elements[1].gene_descriptor.type == "GeneDescriptor" + assert fusion.structural_elements[1].type == "Gene" # test that non-element properties are optional assert CategoricalFusion( @@ -914,7 +907,6 @@ def test_model_examples(): TranscriptSegmentElement, LinkerElement, TemplatedSequenceElement, - GeneElement, UnknownGeneElement, MultiplePossibleGenesElement, RegulatoryElement, From 3e52ff253981e0e9fe2d4288af1e390da3995d59 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Wed, 17 Jul 2024 14:07:41 -0400 Subject: [PATCH 05/82] adding back gene element --- src/fusor/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index ba00f0a..0f08236 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -635,7 +635,7 @@ class Assay(BaseModelForbidExtra): AssayedFusionElements = list[ TranscriptSegmentElement - | Gene + | GeneElement | TemplatedSequenceElement | LinkerElement | UnknownGeneElement @@ -717,7 +717,7 @@ class AssayedFusion(AbstractFusion): CategoricalFusionElements = list[ TranscriptSegmentElement - | Gene + | GeneElement | TemplatedSequenceElement | LinkerElement | MultiplePossibleGenesElement From 12ee93184dbc026e0d6e2d85f8e6be4eb9f85cc2 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Wed, 17 Jul 2024 14:43:43 -0400 Subject: [PATCH 06/82] Revert "progress updating models and adding back gene element wrapper" This reverts commit 24e1a4ca2b7851ba29e770a0d97c30f5692a04ea. --- src/fusor/fusor.py | 20 +++++++-------- src/fusor/models.py | 52 +++------------------------------------ src/fusor/nomenclature.py | 18 ++++++-------- tests/conftest.py | 2 +- tests/test_fusor.py | 12 ++++----- tests/test_models.py | 22 +++++++++++------ 6 files changed, 43 insertions(+), 83 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 348f7d6..c9625fc 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -2,6 +2,7 @@ import logging import re +from urllib.parse import quote from bioutils.accessions import coerce_namespace from cool_seq_tool.app import CoolSeqTool @@ -31,6 +32,7 @@ FunctionalDomain, Fusion, FusionType, + GeneElement, LinkerElement, MultiplePossibleGenesElement, RegulatoryClass, @@ -309,7 +311,7 @@ async def transcript_segment_element( def gene_element( self, gene: str, use_minimal_gene_descr: bool = True - ) -> tuple[Gene | None, str | None]: + ) -> tuple[GeneElement | None, str | None]: """Create gene element :param str gene: Gene @@ -318,15 +320,12 @@ def gene_element( gene-normalizer's gene descriptor will be used :return: GeneElement, warning """ - gene_descr, warning = self._normalized_gene( + gene_descr, warning = self._normalized_gene_descriptor( gene, use_minimal_gene_descr=use_minimal_gene_descr ) if not gene_descr: return None, warning - # TODO: I don't think I can actually pass in label and gene_id here... extensions?? - return Gene( - id=gene_descr.id, label=gene_descr.label, gene_id=gene_descr.gene_id - ), None + return GeneElement(gene_descriptor=gene_descr), None def templated_sequence_element( self, @@ -562,10 +561,10 @@ def _sequence_location( return SequenceLocation( sequence_id=sequence_id, - start=start, - end=end, + start=start, end=end, ) + def add_additional_fields( self, fusion: Fusion, @@ -781,10 +780,9 @@ def generate_nomenclature(self, fusion: Fusion) -> str: parts.append(tx_segment_nomenclature(element)) elif isinstance(element, TemplatedSequenceElement): parts.append(templated_seq_nomenclature(element, self.seqrepo)) - # TODO: ? - elif isinstance(element, Gene): + elif isinstance(element, GeneElement): if not any( - [gene == element.label for gene in element_genes] # noqa: C419 + [gene == element.gene_descriptor.label for gene in element_genes] # noqa: C419 ): parts.append(gene_nomenclature(element)) else: diff --git a/src/fusor/models.py b/src/fusor/models.py index 0f08236..beb058b 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -19,32 +19,6 @@ from pydantic.fields import Field -def return_value(cls, v): - """Return value from object. - - :param ModelMetaclass cls: Pydantic Model ModelMetaclass - :param v: Model from vrs or vrsatile - :return: Value - """ - if v is not None: - try: - if isinstance(v, list): - tmp = list() - for item in v: - while True: - try: - item = item.root - except AttributeError: - break - tmp.append(item) - v = tmp - else: - v = v.root - except AttributeError: - pass - return v - - class BaseModelForbidExtra(BaseModel, extra="forbid"): """Base Pydantic model class with extra values forbidden.""" @@ -56,7 +30,8 @@ class FUSORTypes(str, Enum): TRANSCRIPT_SEGMENT_ELEMENT = "TranscriptSegmentElement" TEMPLATED_SEQUENCE_ELEMENT = "TemplatedSequenceElement" LINKER_SEQUENCE_ELEMENT = "LinkerSequenceElement" - GENE_ELEMENT = "GeneElement" + # TODO: I'm not sure if this needs to still be here or not + GENE = "Gene" UNKNOWN_GENE_ELEMENT = "UnknownGeneElement" MULTIPLE_POSSIBLE_GENES_ELEMENT = "MultiplePossibleGenesElement" REGULATORY_ELEMENT = "RegulatoryElement" @@ -105,7 +80,7 @@ class FunctionalDomain(BaseModel): "id": "gene:NTRK1", "gene_id": "hgnc:8031", "label": "8031", - "type": "Gene", + "type": "GeneDescriptor", }, "sequence_location": { "id": "fusor.location_descriptor:NP_002520.2", @@ -130,7 +105,7 @@ class StructuralElementType(str, Enum): TRANSCRIPT_SEGMENT_ELEMENT = FUSORTypes.TRANSCRIPT_SEGMENT_ELEMENT.value TEMPLATED_SEQUENCE_ELEMENT = FUSORTypes.TEMPLATED_SEQUENCE_ELEMENT.value LINKER_SEQUENCE_ELEMENT = FUSORTypes.LINKER_SEQUENCE_ELEMENT.value - GENE_ELEMENT = FUSORTypes.GENE_ELEMENT.value + GENE_ELEMENT = Gene UNKNOWN_GENE_ELEMENT = FUSORTypes.UNKNOWN_GENE_ELEMENT.value MULTIPLE_POSSIBLE_GENES_ELEMENT = FUSORTypes.MULTIPLE_POSSIBLE_GENES_ELEMENT.value @@ -270,25 +245,6 @@ def validate_sequence(cls, v): }, ) -class GeneElement(BaseStructuralElement): - """Define Gene Element class.""" - - type: Literal[FUSORTypes.GENE_ELEMENT] = FUSORTypes.GENE_ELEMENT - gene: Gene - - model_config = ConfigDict( - json_schema_extra={ - "example": { - "type": "GeneElement", - "gene": { - "id": "gene:BRAF", - "gene_id": "hgnc:1097", - "label": "BRAF", - "type": "Gene", - }, - } - }, - ) class Strand(str, Enum): """Define possible values for strand""" diff --git a/src/fusor/nomenclature.py b/src/fusor/nomenclature.py index 6e826c2..c250df0 100644 --- a/src/fusor/nomenclature.py +++ b/src/fusor/nomenclature.py @@ -1,11 +1,10 @@ """Provide helper methods for fusion nomenclature generation.""" from biocommons.seqrepo.seqrepo import SeqRepo -from ga4gh.core.domain_models import Gene -from ga4gh.vrs.models import SequenceLocation from fusor.exceptions import IDTranslationException from fusor.models import ( + GeneElement, RegulatoryClass, RegulatoryElement, TemplatedSequenceElement, @@ -116,21 +115,20 @@ def templated_seq_nomenclature(element: TemplatedSequenceElement, sr: SeqRepo) - raise ValueError -def gene_nomenclature(element: Gene) -> str: +def gene_nomenclature(element: GeneElement) -> str: """Return fusion nomenclature for gene element. :param element: a gene element object :return: element nomenclature representation :raises ValueError: if unable to retrieve gene ID """ - if element.gene_id: - gene_id = gene_id = element.gene_id + if element.gene_descriptor.gene_id: + gene_id = gene_id = element.gene_descriptor.gene_id if element.gene_descriptor.gene_id: - gene_id = element.gene_id - # TODO: fix this? unsure where to store these so unsure where to access them - elif element.gene and element.gene.gene_id: - gene_id = element.gene.gene_id + gene_id = element.gene_descriptor.gene_id + elif element.gene_descriptor.gene and element.gene_descriptor.gene.gene_id: + gene_id = element.gene_descriptor.gene.gene_id else: raise ValueError - return f"{element.label}({gene_id})" + return f"{element.gene_descriptor.label}({gene_id})" diff --git a/tests/conftest.py b/tests/conftest.py index 418d595..77814ed 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -50,7 +50,7 @@ def braf_gene_descriptor(): """Create gene descriptor params for BRAF.""" return { "id": "normalize.gene:BRAF", - "type": "Gene", + "type": "GeneDescriptor", "label": "BRAF", "xrefs": ["ensembl:ENSG00000157764", "ncbigene:673"], "alternate_labels": ["BRAF1", "BRAF-1", "NS7", "B-raf", "B-RAF1", "RAFB1"], diff --git a/tests/test_fusor.py b/tests/test_fusor.py index c660db7..da57f19 100644 --- a/tests/test_fusor.py +++ b/tests/test_fusor.py @@ -3,13 +3,13 @@ import copy import pytest -from ga4gh.core.domain_models import Gene from fusor.exceptions import FUSORParametersException from fusor.models import ( AssayedFusion, CategoricalFusion, FunctionalDomain, + GeneElement, LinkerElement, MultiplePossibleGenesElement, RegulatoryClass, @@ -23,13 +23,13 @@ @pytest.fixture(scope="module") def braf_gene_descr_min(): """Create minimal gene descriptor for BRAF""" - return Gene(id="normalize.gene:BRAF", label="BRAF", gene_id="hgnc:1097") + return GeneDescriptor(id="normalize.gene:BRAF", label="BRAF", gene_id="hgnc:1097") @pytest.fixture(scope="module") def braf_gene_descr(braf_gene_descriptor): """Create gene descriptor object for braf""" - return Gene(**braf_gene_descriptor) + return GeneDescriptor(**braf_gene_descriptor) @pytest.fixture(scope="module") @@ -481,7 +481,7 @@ def test__normalized_gene_descriptor(fusor_instance): resp = fusor_instance._normalized_gene_descriptor("BRAF") assert resp[0] assert resp[1] is None - assert isinstance(resp[0], Gene) + assert isinstance(resp[0], GeneDescriptor) resp = fusor_instance._normalized_gene_descriptor("B R A F") assert resp[0] is None @@ -781,7 +781,7 @@ def test_gene_element(fusor_instance, braf_gene_descr_min, braf_gene_descr): gc = fusor_instance.gene_element("BRAF", use_minimal_gene_descr=True) assert gc[0] assert gc[1] is None - assert isinstance(gc[0], Gene) + assert isinstance(gc[0], GeneElement) compare_gene_descriptor( gc[0].gene_descriptor.model_dump(), braf_gene_descr_min.model_dump() ) @@ -789,7 +789,7 @@ def test_gene_element(fusor_instance, braf_gene_descr_min, braf_gene_descr): gc = fusor_instance.gene_element("BRAF", use_minimal_gene_descr=False) assert gc[0] assert gc[1] is None - assert isinstance(gc[0], Gene) + assert isinstance(gc[0], GeneElement) compare_gene_descriptor( gc[0].gene_descriptor.model_dump(), braf_gene_descr.model_dump() ) diff --git a/tests/test_models.py b/tests/test_models.py index 2a93281..c77c031 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -3,7 +3,6 @@ import copy import pytest -from ga4gh.core.domain_models import Gene from pydantic import ValidationError from fusor.models import ( @@ -14,6 +13,7 @@ CausativeEvent, EventType, FunctionalDomain, + GeneElement, LinkerElement, MultiplePossibleGenesElement, RegulatoryElement, @@ -593,7 +593,7 @@ def assert_genomic_region_test_element(test): def test_gene_element(gene_descriptors): """Test that Gene Element initializes correctly.""" - test_element = Gene(**gene_descriptors[0]) + test_element = GeneElement(gene_descriptor=gene_descriptors[0]) assert test_element.type == "GeneElement" assert test_element.gene_descriptor.id == "gene:G1" assert test_element.gene_descriptor.label == "G1" @@ -601,15 +601,21 @@ def test_gene_element(gene_descriptors): # test CURIE requirement with pytest.raises(ValidationError) as exc_info: - Gene( - id="G1", gene={"gene_id": "hgnc:9339"}, label="G1" + GeneElement( + gene_descriptor={ + "id": "G1", + "gene": {"gene_id": "hgnc:9339"}, + "label": "G1", + } ) msg = "String should match pattern '^\\w[^:]*:.+$'" check_validation_error(exc_info, msg) # test enum validation with pytest.raises(ValidationError) as exc_info: - assert Gene(type="UnknownGeneElement", **gene_descriptors[0]) + assert GeneElement( + type="UnknownGeneElement", gene_descriptor=gene_descriptors[0] + ) msg = "Input should be " check_validation_error(exc_info, msg) @@ -716,9 +722,10 @@ def test_fusion( ], regulatory_element=None, ) - assert fusion.structural_elements[0].type == "Gene" + assert fusion.structural_elements[0].type == "GeneElement" assert fusion.structural_elements[0].gene_descriptor.id == "gene:NTRK1" - assert fusion.structural_elements[1].type == "Gene" + assert fusion.structural_elements[1].type == "GeneElement" + assert fusion.structural_elements[1].gene_descriptor.type == "GeneDescriptor" # test that non-element properties are optional assert CategoricalFusion( @@ -907,6 +914,7 @@ def test_model_examples(): TranscriptSegmentElement, LinkerElement, TemplatedSequenceElement, + GeneElement, UnknownGeneElement, MultiplePossibleGenesElement, RegulatoryElement, From 61204737354ce8015a4ee65b8ed33542bc11a04b Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Wed, 17 Jul 2024 14:43:52 -0400 Subject: [PATCH 07/82] Revert "adding back gene element" This reverts commit 3e52ff253981e0e9fe2d4288af1e390da3995d59. --- src/fusor/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index beb058b..d557569 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -591,7 +591,7 @@ class Assay(BaseModelForbidExtra): AssayedFusionElements = list[ TranscriptSegmentElement - | GeneElement + | Gene | TemplatedSequenceElement | LinkerElement | UnknownGeneElement @@ -673,7 +673,7 @@ class AssayedFusion(AbstractFusion): CategoricalFusionElements = list[ TranscriptSegmentElement - | GeneElement + | Gene | TemplatedSequenceElement | LinkerElement | MultiplePossibleGenesElement From 6573780fab67ebc8e3ff118e07c2758ce4074ab1 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 18 Jul 2024 09:59:20 -0400 Subject: [PATCH 08/82] converting descriptors --- src/fusor/examples/bcr_abl1.json | 213 +++++++++++++++++-------------- src/fusor/fusor.py | 32 ++--- src/fusor/models.py | 81 ++++++++---- 3 files changed, 184 insertions(+), 142 deletions(-) diff --git a/src/fusor/examples/bcr_abl1.json b/src/fusor/examples/bcr_abl1.json index b7a1b8d..c033117 100644 --- a/src/fusor/examples/bcr_abl1.json +++ b/src/fusor/examples/bcr_abl1.json @@ -1,112 +1,133 @@ { "type": "CategoricalFusion", - "structural_elements": [ - { - "type": "TranscriptSegmentElement", - "transcript": "refseq:NM_004327.3", - "gene_descriptor": { - "type": "GeneDescriptor", - "id": "normalize.gene:BCR", - "gene_id": "hgnc:1014", - "label": "BCR" - }, - "element_genomic_end": { - "id": "fusor.location_descriptor:NC_000022.11", - "type": "LocationDescriptor", - "label": "NC_000022.11", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NC_000022.11", - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 23253980 - }, - "end": { - "type": "Number", - "value": 23253981 + "structure": { + "type": "Adjacency", + "adjoinedSequences": [ + { + "type": "SequenceLocation", + "sequenceReference": { + "id": "GRCh38:chr22", + "type": "SequenceReference", + "refgetAccession": "SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", + "residueAlphabet": "na" + }, + "end": 23290413, + "extensions": [ + { + "name": "NM_004327.4:e._14", + "description": "VICC exon representation of the aligned transcript boundary.", + "value": { + "exon_end": 14, + "exon_end_offset": 0, + "sequenceReference":{ + "type": "SequenceReference", + "id": "NM_004327.4", + "refgetAccession": "SQ.kpytJsXw3BwLC3oBSjHQS1kwxs4WO3I3", + "residueAlphabet": "na" } } - } - }, - "exon_end": 2, - "exon_end_offset": 182 - }, - { - "type": "LinkerSequenceElement", - "linker_sequence": { - "id": "sequence:ACTAAAGCG", - "type": "SequenceDescriptor", - "sequence": "ACTAAAGCG", - "residue_type": "SO:0000348" - } - }, - { - "type": "TranscriptSegmentElement", - "transcript": "refseq:NM_005157.5", - "exon_start": 2, - "exon_start_offset": -173, - "gene_descriptor": { - "id": "normalize.gene:ABL1", - "type": "GeneDescriptor", - "label": "ABL1", - "gene_id": "hgnc:76" - }, - "element_genomic_start": { - "id": "fusor.location_descriptor:NC_000009.12", - "type": "LocationDescriptor", - "label": "NC_000009.12", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NC_000009.12", - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 130854064 + }, + { + "name": "NM_004327.4:c._2782", + "description": "Transcript SequenceLocation of the aligned transcript boundary.", + "value": { + "type": "SequenceLocation", + "sequenceReference": { + "id": "NM_004327.4", + "type": "SequenceReference", + "refgetAccession": "SQ.kpytJsXw3BwLC3oBSjHQS1kwxs4WO3I3", + "residueAlphabet": "na" }, - "end": { - "type": "Number", - "value": 130854065 - } + "end": 3234 + } + }, + { + "name": "gene", + "description": "The gene concept (BCR) associated with this fusion partner.", + "value": { + "code": "hgnc:1014", + "system": "https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/", + "label": "BCR" } } - } + ]}, + { + "type": "SequenceLocation", + "sequenceReference": { + "id": "GRCh38:chr9", + "type": "SequenceReference", + "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI", + "residueAlphabet": "na" + }, + "start": 130854064, + "extensions": [ + { + "name": "NM_005157.6:e.2_", + "description": "VICC exon representation of the aligned transcript boundary.", + "value": { + "exon_start": 2, + "exon_start_offset": 0, + "sequenceReference":{ + "id": "NM_005157.6", + "type": "SequenceReference", + "refgetAccession": "SQ.w8Qg3x-PQ2akJrJQeGEN-_eBUMo1H1CL", + "residueAlphabet": "na" + } + } + }, + { + "name": "NM_005157.6:c.80_", + "description": "Transcript SequenceLocation of the aligned transcript boundary.", + "value": { + "type": "SequenceLocation", + "sequenceReference": { + "id": "NM_005157.6", + "type": "SequenceReference", + "refgetAccession": "SQ.w8Qg3x-PQ2akJrJQeGEN-_eBUMo1H1CL", + "residueAlphabet": "na" + }, + "end": 273 + } + }, + { + "name": "gene", + "description": "The gene concept (ABL1) associated with this fusion partner.", + "value": { + "code": "hgnc:76", + "system": "https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/", + "label": "ABL1" + } + } + ] + }], + "linker": { + "type": "LiteralSequenceExpression", + "sequence": "CCCGTC" } - ], - "r_frame_preserved": true, - "critical_functional_domains": [ + }, + "readingFramePreserved": true, + "criticalFunctionalDomains": [ { "type": "FunctionalDomain", "status": "preserved", - "associated_gene": { - "id": "normalize.gene:hgnc%3A76", - "type": "GeneDescriptor", - "label": "ABL1", - "gene_id": "hgnc:76" + "gene": { + "code": "hgnc:76", + "system": "https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/", + "label": "ABL1" }, - "_id": "interpro:IPR000980", + "id": "interpro:IPR000980", "label": "SH2 domain", - "sequence_location": { - "id": "fusor.location_descriptor:NP_005148.2", - "type": "LocationDescriptor", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NP_005148.2", - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 127 - }, - "end": { - "type": "Number", - "value": 202 - } - } - } + "sequenceLocation": { + "type": "SequenceLocation", + "sequenceReference": { + "id": "GRCh38:chr22", + "type": "SequenceReference", + "refgetAccession": "SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", + "residueAlphabet": "na" + }, + "start": 127, + "end": 202 } } ] -} +} \ No newline at end of file diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index c9625fc..242e1e5 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -32,8 +32,6 @@ FunctionalDomain, Fusion, FusionType, - GeneElement, - LinkerElement, MultiplePossibleGenesElement, RegulatoryClass, RegulatoryElement, @@ -311,7 +309,7 @@ async def transcript_segment_element( def gene_element( self, gene: str, use_minimal_gene_descr: bool = True - ) -> tuple[GeneElement | None, str | None]: + ) -> tuple[Gene | None, str | None]: """Create gene element :param str gene: Gene @@ -320,12 +318,12 @@ def gene_element( gene-normalizer's gene descriptor will be used :return: GeneElement, warning """ - gene_descr, warning = self._normalized_gene_descriptor( + gene_descr, warning = self._normalized_gene( gene, use_minimal_gene_descr=use_minimal_gene_descr ) if not gene_descr: return None, warning - return GeneElement(gene_descriptor=gene_descr), None + return Gene(gene_descriptor=gene_descr), None def templated_sequence_element( self, @@ -700,7 +698,7 @@ def add_translated_sequence_id( domain.sequence_location.location.sequence_id = new_id return fusion - # TODO: should this be adding to the gene extensions or something instead? + # TODO: do we still need this? def add_gene_descriptor(self, fusion: Fusion) -> Fusion: """Add additional fields to ``gene_descriptor`` in fusion object @@ -716,7 +714,7 @@ def add_gene_descriptor(self, fusion: Fusion) -> Fusion: if "gene_descriptor" in obj.model_fields: label = obj.gene_descriptor.label norm_gene_descr, _ = self._normalized_gene( - label, use_minimal_gene_descr=False + label ) if norm_gene_descr: obj.gene_descriptor = norm_gene_descr @@ -724,33 +722,25 @@ def add_gene_descriptor(self, fusion: Fusion) -> Fusion: reg_el = fusion.regulatory_element label = reg_el.associated_gene.label norm_gene_descr, _ = self._normalized_gene( - label, use_minimal_gene_descr=False + label ) if norm_gene_descr: reg_el.associated_gene = norm_gene_descr return fusion def _normalized_gene( - self, query: str, use_minimal_gene_descr: bool = True + self, query: str ) -> tuple[Gene | None, str | None]: - """Return gene descriptor from normalized response. + """Return gene from normalized response. :param query: Gene query - :param use_minimal_gene_descr: ``True`` if minimal gene descriptor - (``id``, ``gene_id``, ``label``) will be used. ``False`` if - gene-normalizer's gene descriptor will be used - :return: Tuple with gene descriptor and None value for warnings if + :return: Tuple with gene and None value for warnings if successful, and None value with warning string if unsuccessful """ gene_norm_resp = self.gene_normalizer.normalize(query) if gene_norm_resp.match_type: - gene_descr = gene_norm_resp.gene_descriptor - if use_minimal_gene_descr: - # TODO: how to handle gene_id here? add to extensions?? - gene_descr = Gene( - id=gene_descr.id, gene_id=gene_descr.gene_id, label=gene_descr.label - ) - return gene_descr, None + gene = gene_norm_resp.gene + return gene, None return None, f"gene-normalizer unable to normalize {query}" def generate_nomenclature(self, fusion: Fusion) -> str: diff --git a/src/fusor/models.py b/src/fusor/models.py index d557569..4210193 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -5,7 +5,7 @@ from typing import Any, Literal from ga4gh.core.domain_models import Gene -from ga4gh.vrs.models import SequenceLocation +from ga4gh.vrs.models import SequenceLocation, Adjacency, LiteralSequenceExpression, SequenceString from gene.schemas import CURIE from pydantic import ( BaseModel, @@ -14,10 +14,36 @@ StrictInt, StrictStr, field_validator, - model_validator, + model_validator, ValidationError, ) from pydantic.fields import Field +def return_value(cls, v): + """Return value from object. + + :param ModelMetaclass cls: Pydantic Model ModelMetaclass + :param v: Model from vrs or vrsatile + :return: Value + """ + if v is not None: + try: + if isinstance(v, list): + tmp = list() + for item in v: + while True: + try: + item = item.root + except AttributeError: + break + tmp.append(item) + v = tmp + else: + v = v.root + except AttributeError: + pass + return v + + class BaseModelForbidExtra(BaseModel, extra="forbid"): """Base Pydantic model class with extra values forbidden.""" @@ -30,8 +56,6 @@ class FUSORTypes(str, Enum): TRANSCRIPT_SEGMENT_ELEMENT = "TranscriptSegmentElement" TEMPLATED_SEQUENCE_ELEMENT = "TemplatedSequenceElement" LINKER_SEQUENCE_ELEMENT = "LinkerSequenceElement" - # TODO: I'm not sure if this needs to still be here or not - GENE = "Gene" UNKNOWN_GENE_ELEMENT = "UnknownGeneElement" MULTIPLE_POSSIBLE_GENES_ELEMENT = "MultiplePossibleGenesElement" REGULATORY_ELEMENT = "RegulatoryElement" @@ -45,7 +69,6 @@ class AdditionalFields(str, Enum): SEQUENCE_ID = "sequence_id" LOCATION_ID = "location_id" - GENE_DESCRIPTOR = "gene_descriptor" class DomainStatus(str, Enum): @@ -60,12 +83,11 @@ class FunctionalDomain(BaseModel): type: Literal[FUSORTypes.FUNCTIONAL_DOMAIN] = FUSORTypes.FUNCTIONAL_DOMAIN status: DomainStatus - associated_gene: Gene + gene: Gene id: CURIE | None = Field(None, alias="_id") label: StrictStr | None = None sequence_location: SequenceLocation | None = None - # TODO: is this obsolete now that vrsatile has been removed? _get_id_val = field_validator("id")(return_value) model_config = ConfigDict( @@ -76,22 +98,22 @@ class FunctionalDomain(BaseModel): "status": "lost", "label": "Tyrosine-protein kinase, catalytic domain", "_id": "interpro:IPR020635", - "associated_gene": { + "gene": { "id": "gene:NTRK1", "gene_id": "hgnc:8031", "label": "8031", - "type": "GeneDescriptor", + "type": "Gene", }, "sequence_location": { + # TODO: keep this? "id": "fusor.location_descriptor:NP_002520.2", - "type": "LocationDescriptor", - "location": { - "sequence_id": "ga4gh:SQ.vJvm06Wl5J7DXHynR9ksW7IK3_3jlFK6", - "type": "SequenceLocation", - "interval": { - "start": {"type": "Number", "value": 510}, - "end": {"type": "Number", "value": 781}, - }, + "type": "SequenceLocation", + "sequenceReference": { + "id": "GRCh38:chr22", + "type": "SequenceReference", + # TODO: get correct id here + "refgetAccession": "SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", + "residueAlphabet": "na" }, }, } @@ -116,6 +138,7 @@ class BaseStructuralElement(ABC, BaseModel): type: StructuralElementType +# TODO: remove and replace constructor with adjacency item? class TranscriptSegmentElement(BaseStructuralElement): """Define TranscriptSegment class""" @@ -207,13 +230,12 @@ def check_exons(cls, values): }, ) - class LinkerElement(BaseStructuralElement, extra="forbid"): """Define Linker class (linker sequence)""" - type: Literal[FUSORTypes.LINKER_SEQUENCE_ELEMENT] = ( + type: Literal[ FUSORTypes.LINKER_SEQUENCE_ELEMENT - ) + ] = FUSORTypes.LINKER_SEQUENCE_ELEMENT linker_sequence: SequenceLocation @field_validator("linker_sequence", mode="before") @@ -222,16 +244,24 @@ def validate_sequence(cls, v): if isinstance(v, dict): try: v["sequence"] = v["sequence"].upper() - except KeyError as e: - raise TypeError from e + seq = v["sequence"] + except KeyError: + raise TypeError elif isinstance(v, SequenceLocation): v.sequence = v.sequence.upper() + seq = v.sequence else: raise TypeError + try: + LiteralSequenceExpression(sequence=SequenceString(seq)) + except ValidationError: + raise AssertionError("sequence does not match regex '^[A-Za-z*\\-]*$'") + return v model_config = ConfigDict( + arbitrary_types_allowed=True, json_schema_extra={ "example": { "type": "LinkerSequenceElement", @@ -271,7 +301,7 @@ class TemplatedSequenceElement(BaseStructuralElement): "type": "TemplatedSequenceElement", "region": { "id": "chr12:44908821-44908822(+)", - "type": "LocationDescriptor", + "type": "SequenceLocation", "location_id": "ga4gh:VSL.AG54ZRBhg6pwpPLafF4KgaAHpdFio6l5", "location": { "type": "SequenceLocation", @@ -539,6 +569,7 @@ def structural_elements_ends(cls, values): if elements[0].exon_end is None and not values["regulatory_element"]: msg = "5' TranscriptSegmentElement fusion partner must contain ending exon position" raise ValueError(msg) + # TODO: how to verify this now with adjacency model? elif isinstance(elements[0], LinkerElement): msg = "First structural element cannot be LinkerSequence" raise ValueError(msg) @@ -593,7 +624,7 @@ class Assay(BaseModelForbidExtra): TranscriptSegmentElement | Gene | TemplatedSequenceElement - | LinkerElement + | Adjacency | UnknownGeneElement ] @@ -675,7 +706,7 @@ class AssayedFusion(AbstractFusion): TranscriptSegmentElement | Gene | TemplatedSequenceElement - | LinkerElement + | Adjacency | MultiplePossibleGenesElement ] From 44e057461965b24061c97807989daa23da19f8e0 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 18 Jul 2024 10:09:28 -0400 Subject: [PATCH 09/82] remove todo --- src/fusor/models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index 4210193..b42fd66 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -569,7 +569,6 @@ def structural_elements_ends(cls, values): if elements[0].exon_end is None and not values["regulatory_element"]: msg = "5' TranscriptSegmentElement fusion partner must contain ending exon position" raise ValueError(msg) - # TODO: how to verify this now with adjacency model? elif isinstance(elements[0], LinkerElement): msg = "First structural element cannot be LinkerSequence" raise ValueError(msg) From 77967326e9087ecef6c56e1ab8eae4c0f90123ac Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 18 Jul 2024 12:17:26 -0400 Subject: [PATCH 10/82] wip: adding back GeneElement wrapper, updating to camelCase, removing obsolete code --- src/fusor/fusor.py | 1 + src/fusor/models.py | 160 +++++++++++++++++++++----------------------- 2 files changed, 79 insertions(+), 82 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 242e1e5..322ea16 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -32,6 +32,7 @@ FunctionalDomain, Fusion, FusionType, + LinkerElement, MultiplePossibleGenesElement, RegulatoryClass, RegulatoryElement, diff --git a/src/fusor/models.py b/src/fusor/models.py index b42fd66..40a3285 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -5,7 +5,11 @@ from typing import Any, Literal from ga4gh.core.domain_models import Gene -from ga4gh.vrs.models import SequenceLocation, Adjacency, LiteralSequenceExpression, SequenceString +from ga4gh.vrs.models import ( + LiteralSequenceExpression, + SequenceLocation, + SequenceString, +) from gene.schemas import CURIE from pydantic import ( BaseModel, @@ -13,37 +17,12 @@ StrictBool, StrictInt, StrictStr, + ValidationError, field_validator, - model_validator, ValidationError, + model_validator, ) from pydantic.fields import Field - -def return_value(cls, v): - """Return value from object. - - :param ModelMetaclass cls: Pydantic Model ModelMetaclass - :param v: Model from vrs or vrsatile - :return: Value - """ - if v is not None: - try: - if isinstance(v, list): - tmp = list() - for item in v: - while True: - try: - item = item.root - except AttributeError: - break - tmp.append(item) - v = tmp - else: - v = v.root - except AttributeError: - pass - return v - - +# TODO: add back minimum information function from normalizers (id (hgnc or whatever passed by normalizer) and label only) class BaseModelForbidExtra(BaseModel, extra="forbid"): """Base Pydantic model class with extra values forbidden.""" @@ -56,6 +35,7 @@ class FUSORTypes(str, Enum): TRANSCRIPT_SEGMENT_ELEMENT = "TranscriptSegmentElement" TEMPLATED_SEQUENCE_ELEMENT = "TemplatedSequenceElement" LINKER_SEQUENCE_ELEMENT = "LinkerSequenceElement" + GENE_ELEMENT = "GeneElement" UNKNOWN_GENE_ELEMENT = "UnknownGeneElement" MULTIPLE_POSSIBLE_GENES_ELEMENT = "MultiplePossibleGenesElement" REGULATORY_ELEMENT = "RegulatoryElement" @@ -77,7 +57,6 @@ class DomainStatus(str, Enum): LOST = "lost" PRESERVED = "preserved" - class FunctionalDomain(BaseModel): """Define FunctionalDomain class""" @@ -86,9 +65,7 @@ class FunctionalDomain(BaseModel): gene: Gene id: CURIE | None = Field(None, alias="_id") label: StrictStr | None = None - sequence_location: SequenceLocation | None = None - - _get_id_val = field_validator("id")(return_value) + sequenceLocation: SequenceLocation | None = None model_config = ConfigDict( populate_by_name=True, @@ -97,6 +74,7 @@ class FunctionalDomain(BaseModel): "type": "FunctionalDomain", "status": "lost", "label": "Tyrosine-protein kinase, catalytic domain", + # TODO: verify this field isn't getting populated/used "_id": "interpro:IPR020635", "gene": { "id": "gene:NTRK1", @@ -105,7 +83,7 @@ class FunctionalDomain(BaseModel): "type": "Gene", }, "sequence_location": { - # TODO: keep this? + # TODO: keep this? - yes, use standardized ids as seen in Jeremy's PR "id": "fusor.location_descriptor:NP_002520.2", "type": "SequenceLocation", "sequenceReference": { @@ -113,7 +91,7 @@ class FunctionalDomain(BaseModel): "type": "SequenceReference", # TODO: get correct id here "refgetAccession": "SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", - "residueAlphabet": "na" + "residueAlphabet": "na", }, }, } @@ -138,7 +116,6 @@ class BaseStructuralElement(ABC, BaseModel): type: StructuralElementType -# TODO: remove and replace constructor with adjacency item? class TranscriptSegmentElement(BaseStructuralElement): """Define TranscriptSegment class""" @@ -146,13 +123,13 @@ class TranscriptSegmentElement(BaseStructuralElement): FUSORTypes.TRANSCRIPT_SEGMENT_ELEMENT ) transcript: CURIE - exon_start: StrictInt | None = None - exon_start_offset: StrictInt | None = 0 - exon_end: StrictInt | None = None - exon_end_offset: StrictInt | None = 0 + exonStart: StrictInt | None = None + exonStartOffset: StrictInt | None = 0 + exonEnd: StrictInt | None = None + exonEndOffset: StrictInt | None = 0 gene: Gene - element_genomic_start: SequenceLocation | None = None - element_genomic_end: SequenceLocation | None = None + elementGenomicStart: SequenceLocation | None = None + elementGenomicEnd: SequenceLocation | None = None @model_validator(mode="before") def check_exons(cls, values): @@ -182,7 +159,6 @@ def check_exons(cls, values): values["exon_end_offset"] = None return values - _get_transcript_val = field_validator("transcript")(return_value) model_config = ConfigDict( json_schema_extra={ "example": { @@ -230,13 +206,14 @@ def check_exons(cls, values): }, ) + class LinkerElement(BaseStructuralElement, extra="forbid"): """Define Linker class (linker sequence)""" - type: Literal[ + type: Literal[FUSORTypes.LINKER_SEQUENCE_ELEMENT] = ( FUSORTypes.LINKER_SEQUENCE_ELEMENT - ] = FUSORTypes.LINKER_SEQUENCE_ELEMENT - linker_sequence: SequenceLocation + ) + linkerSequence: LiteralSequenceExpression @field_validator("linker_sequence", mode="before") def validate_sequence(cls, v): @@ -253,6 +230,7 @@ def validate_sequence(cls, v): else: raise TypeError + # TODO: remove this validation try: LiteralSequenceExpression(sequence=SequenceString(seq)) except ValidationError: @@ -320,6 +298,25 @@ class TemplatedSequenceElement(BaseStructuralElement): ) +class GeneElement(BaseStructuralElement): + """Define Gene Element class.""" + + type: Literal[FUSORTypes.GENE_ELEMENT] = FUSORTypes.GENE_ELEMENT + gene: Gene + + model_config = ConfigDict( + json_schema_extra={ + "example": { + "type": "GeneElement", + "gene": { + "id": "hgnc:1097", + "label": "BRAF", + "type": "Gene", + }, + } + }, + ) + class UnknownGeneElement(BaseStructuralElement): """Define UnknownGene class. This is primarily intended to represent a partner in the result of a fusion partner-agnostic assay, which identifies @@ -393,12 +390,10 @@ class RegulatoryElement(BaseModel): """ type: Literal[FUSORTypes.REGULATORY_ELEMENT] = FUSORTypes.REGULATORY_ELEMENT - regulatory_class: RegulatoryClass - feature_id: str | None = None - associated_gene: Gene | None = None - feature_location: SequenceLocation | None = None - - _get_ref_id_val = field_validator("feature_id")(return_value) + regulatoryClass: RegulatoryClass + featureId: CURIE | None = None + associatedGene: Gene | None = None + featureLocation: SequenceLocation | None = None @model_validator(mode="before") def ensure_min_values(cls, values): @@ -451,8 +446,8 @@ class AbstractFusion(BaseModel, ABC): """Define Fusion class""" type: FusionType - regulatory_element: RegulatoryElement | None = None - structural_elements: list[BaseStructuralElement] + regulatoryElement: RegulatoryElement | None = None + structuralElements: list[BaseStructuralElement] @classmethod def _access_object_attr( @@ -528,12 +523,12 @@ def enforce_element_quantities(cls, values): "Fusions must contain >= 2 structural elements, or >=1 structural element " "and a regulatory element" ) - structural_elements = values.get("structural_elements", []) - if not structural_elements: + structure = values.get("structure", []) + if not structure: raise ValueError(qt_error_msg) - num_structural_elements = len(structural_elements) + num_structure = len(structure) reg_element = values.get("regulatory_element") - if (num_structural_elements + bool(reg_element)) < 2: + if (num_structure + bool(reg_element)) < 2: raise ValueError(qt_error_msg) uq_gene_msg = "Fusions must form a chimeric transcript from two or more genes, or a novel interaction between a rearranged regulatory element with the expressed product of a partner gene." @@ -545,7 +540,7 @@ def enforce_element_quantities(cls, values): if gene_id: gene_ids.append(gene_id) - for element in structural_elements: + for element in structure: gene_id = cls._fetch_gene_id( obj=element, gene_descriptor_field="gene_descriptor" ) @@ -554,17 +549,17 @@ def enforce_element_quantities(cls, values): unique_gene_ids = set(gene_ids) if len(unique_gene_ids) == 1 and len(gene_ids) == ( - num_structural_elements + bool(reg_element) + num_structure + bool(reg_element) ): raise ValueError(uq_gene_msg) return values @model_validator(mode="after") - def structural_elements_ends(cls, values): + def structure_ends(cls, values): """Ensure start/end elements are of legal types and have fields required by their position. """ - elements = values.structural_elements + elements = values.structure if isinstance(elements[0], TranscriptSegmentElement): if elements[0].exon_end is None and not values["regulatory_element"]: msg = "5' TranscriptSegmentElement fusion partner must contain ending exon position" @@ -599,13 +594,10 @@ class Assay(BaseModelForbidExtra): """Information pertaining to the assay used in identifying the fusion.""" type: Literal["Assay"] = "Assay" - assay_name: StrictStr | None = None - assay_id: CURIE | None = None - method_uri: CURIE | None = None - fusion_detection: Evidence | None = None - - _get_assay_id_val = field_validator("assay_id")(return_value) - _get_method_uri_val = field_validator("method_uri")(return_value) + assayName: StrictStr | None = None + assayId: CURIE | None = None + methodUri: CURIE | None = None + fusionDetection: Evidence | None = None model_config = ConfigDict( json_schema_extra={ @@ -621,9 +613,9 @@ class Assay(BaseModelForbidExtra): AssayedFusionElements = list[ TranscriptSegmentElement - | Gene + | GeneElement | TemplatedSequenceElement - | Adjacency + | LinkerElement | UnknownGeneElement ] @@ -645,8 +637,8 @@ class CausativeEvent(BaseModelForbidExtra): """ type: Literal[FUSORTypes.CAUSATIVE_EVENT] = FUSORTypes.CAUSATIVE_EVENT - event_type: EventType - event_description: StrictStr | None = None + eventType: EventType + eventDescription: StrictStr | None = None model_config = ConfigDict( json_schema_extra={ @@ -667,8 +659,8 @@ class AssayedFusion(AbstractFusion): """ type: Literal[FUSORTypes.ASSAYED_FUSION] = FUSORTypes.ASSAYED_FUSION - structural_elements: AssayedFusionElements - causative_event: CausativeEvent | None = None + structure: AssayedFusionElements + causativeEvent: CausativeEvent | None = None assay: Assay | None = None model_config = ConfigDict( @@ -687,12 +679,16 @@ class AssayedFusion(AbstractFusion): "assay_name": "fluorescence in-situ hybridization assay", "fusion_detection": "inferred", }, - "structural_elements": [ + "structure": [ { + "type": "GeneElement", + "gene": { "type": "Gene", "id": "gene:EWSR1", + # TODO: This should be mappings instead "gene_id": "hgnc:3058", "label": "EWSR1", + } }, {"type": "UnknownGeneElement"}, ], @@ -703,9 +699,9 @@ class AssayedFusion(AbstractFusion): CategoricalFusionElements = list[ TranscriptSegmentElement - | Gene + | GeneElement | TemplatedSequenceElement - | Adjacency + | LinkerElement | MultiplePossibleGenesElement ] @@ -718,9 +714,9 @@ class CategoricalFusion(AbstractFusion): """ type: Literal[FUSORTypes.CATEGORICAL_FUSION] = FUSORTypes.CATEGORICAL_FUSION - r_frame_preserved: StrictBool | None = None - critical_functional_domains: list[FunctionalDomain] | None = None - structural_elements: CategoricalFusionElements + readingFramePreserved: StrictBool | None = None + criticalFunctionalDomains: list[FunctionalDomain] | None = None + structure: CategoricalFusionElements model_config = ConfigDict( json_schema_extra={ @@ -741,7 +737,7 @@ class CategoricalFusion(AbstractFusion): }, } ], - "structural_elements": [ + "structure": [ { "type": "TranscriptSegmentElement", "transcript": "refseq:NM_152263.3", From c67d58889cdb3a3ace87aa187e1467e716ec1ee7 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 18 Jul 2024 12:27:56 -0400 Subject: [PATCH 11/82] updating models --- src/fusor/fusor.py | 18 +++++-------- src/fusor/models.py | 61 ++++++++++++++++----------------------------- 2 files changed, 28 insertions(+), 51 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 322ea16..e21e343 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -2,7 +2,6 @@ import logging import re -from urllib.parse import quote from bioutils.accessions import coerce_namespace from cool_seq_tool.app import CoolSeqTool @@ -52,6 +51,7 @@ _logger = logging.getLogger(__name__) +# TODO: add back minimum information function for fetches from normalizers class FUSOR: """Class for modifying fusion objects.""" @@ -560,10 +560,10 @@ def _sequence_location( return SequenceLocation( sequence_id=sequence_id, - start=start, end=end, + start=start, + end=end, ) - def add_additional_fields( self, fusion: Fusion, @@ -714,24 +714,18 @@ def add_gene_descriptor(self, fusion: Fusion) -> Fusion: for obj in prop: if "gene_descriptor" in obj.model_fields: label = obj.gene_descriptor.label - norm_gene_descr, _ = self._normalized_gene( - label - ) + norm_gene_descr, _ = self._normalized_gene(label) if norm_gene_descr: obj.gene_descriptor = norm_gene_descr if fusion.regulatory_element and fusion.regulatory_element.associated_gene: reg_el = fusion.regulatory_element label = reg_el.associated_gene.label - norm_gene_descr, _ = self._normalized_gene( - label - ) + norm_gene_descr, _ = self._normalized_gene(label) if norm_gene_descr: reg_el.associated_gene = norm_gene_descr return fusion - def _normalized_gene( - self, query: str - ) -> tuple[Gene | None, str | None]: + def _normalized_gene(self, query: str) -> tuple[Gene | None, str | None]: """Return gene from normalized response. :param query: Gene query diff --git a/src/fusor/models.py b/src/fusor/models.py index 40a3285..2f9d357 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -22,7 +22,6 @@ model_validator, ) from pydantic.fields import Field -# TODO: add back minimum information function from normalizers (id (hgnc or whatever passed by normalizer) and label only) class BaseModelForbidExtra(BaseModel, extra="forbid"): """Base Pydantic model class with extra values forbidden.""" @@ -57,6 +56,7 @@ class DomainStatus(str, Enum): LOST = "lost" PRESERVED = "preserved" + class FunctionalDomain(BaseModel): """Define FunctionalDomain class""" @@ -69,12 +69,12 @@ class FunctionalDomain(BaseModel): model_config = ConfigDict( populate_by_name=True, + # TODO: update this example json once models approved json_schema_extra={ "example": { "type": "FunctionalDomain", "status": "lost", "label": "Tyrosine-protein kinase, catalytic domain", - # TODO: verify this field isn't getting populated/used "_id": "interpro:IPR020635", "gene": { "id": "gene:NTRK1", @@ -83,13 +83,11 @@ class FunctionalDomain(BaseModel): "type": "Gene", }, "sequence_location": { - # TODO: keep this? - yes, use standardized ids as seen in Jeremy's PR "id": "fusor.location_descriptor:NP_002520.2", "type": "SequenceLocation", "sequenceReference": { "id": "GRCh38:chr22", "type": "SequenceReference", - # TODO: get correct id here "refgetAccession": "SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", "residueAlphabet": "na", }, @@ -160,6 +158,7 @@ def check_exons(cls, values): return values model_config = ConfigDict( + # TODO: update this example json once models approved json_schema_extra={ "example": { "type": "TranscriptSegmentElement", @@ -215,31 +214,9 @@ class LinkerElement(BaseStructuralElement, extra="forbid"): ) linkerSequence: LiteralSequenceExpression - @field_validator("linker_sequence", mode="before") - def validate_sequence(cls, v): - """Enforce nucleotide base code requirements on sequence literals.""" - if isinstance(v, dict): - try: - v["sequence"] = v["sequence"].upper() - seq = v["sequence"] - except KeyError: - raise TypeError - elif isinstance(v, SequenceLocation): - v.sequence = v.sequence.upper() - seq = v.sequence - else: - raise TypeError - - # TODO: remove this validation - try: - LiteralSequenceExpression(sequence=SequenceString(seq)) - except ValidationError: - raise AssertionError("sequence does not match regex '^[A-Za-z*\\-]*$'") - - return v - model_config = ConfigDict( arbitrary_types_allowed=True, + # TODO: update this example json once models approved json_schema_extra={ "example": { "type": "LinkerSequenceElement", @@ -274,6 +251,7 @@ class TemplatedSequenceElement(BaseStructuralElement): strand: Strand model_config = ConfigDict( + # TODO: update this example json once models approved json_schema_extra={ "example": { "type": "TemplatedSequenceElement", @@ -317,6 +295,7 @@ class GeneElement(BaseStructuralElement): }, ) + class UnknownGeneElement(BaseStructuralElement): """Define UnknownGene class. This is primarily intended to represent a partner in the result of a fusion partner-agnostic assay, which identifies @@ -408,6 +387,7 @@ def ensure_min_values(cls, values): return values model_config = ConfigDict( + # TODO: update this example json once models approved json_schema_extra={ "example": { "type": "RegulatoryElement", @@ -600,12 +580,13 @@ class Assay(BaseModelForbidExtra): fusionDetection: Evidence | None = None model_config = ConfigDict( + # TODO: verify this example json once models approved json_schema_extra={ "example": { - "method_uri": "pmid:33576979", - "assay_id": "obi:OBI_0003094", - "assay_name": "fluorescence in-situ hybridization assay", - "fusion_detection": "inferred", + "methodUri": "pmid:33576979", + "assayId": "obi:OBI_0003094", + "assayName": "fluorescence in-situ hybridization assay", + "fusionDetection": "inferred", } } ) @@ -641,11 +622,12 @@ class CausativeEvent(BaseModelForbidExtra): eventDescription: StrictStr | None = None model_config = ConfigDict( + # TODO: verify this example json once models approved json_schema_extra={ "example": { "type": "CausativeEvent", - "event_type": "rearrangement", - "event_description": "chr2:g.pter_8,247,756::chr11:g.15,825,273_cen_qter (der11) and chr11:g.pter_15,825,272::chr2:g.8,247,757_cen_qter (der2)", + "eventType": "rearrangement", + "eventDescription": "chr2:g.pter_8,247,756::chr11:g.15,825,273_cen_qter (der11) and chr11:g.pter_15,825,272::chr2:g.8,247,757_cen_qter (der2)", } }, ) @@ -664,6 +646,7 @@ class AssayedFusion(AbstractFusion): assay: Assay | None = None model_config = ConfigDict( + # TODO: update this example json once models approved json_schema_extra={ "example": { "type": "AssayedFusion", @@ -683,12 +666,11 @@ class AssayedFusion(AbstractFusion): { "type": "GeneElement", "gene": { - "type": "Gene", - "id": "gene:EWSR1", - # TODO: This should be mappings instead - "gene_id": "hgnc:3058", - "label": "EWSR1", - } + "type": "Gene", + "id": "gene:EWSR1", + "gene_id": "hgnc:3058", + "label": "EWSR1", + }, }, {"type": "UnknownGeneElement"}, ], @@ -719,6 +701,7 @@ class CategoricalFusion(AbstractFusion): structure: CategoricalFusionElements model_config = ConfigDict( + # TODO: update this example json once models approved json_schema_extra={ "example": { "type": "CategoricalFusion", From 6ad8e33c2bcec76b7028dd4e68b2c6acb0979a0b Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 18 Jul 2024 12:34:32 -0400 Subject: [PATCH 12/82] fix: gene element type --- src/fusor/models.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index 2f9d357..5017b22 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -8,7 +8,6 @@ from ga4gh.vrs.models import ( LiteralSequenceExpression, SequenceLocation, - SequenceString, ) from gene.schemas import CURIE from pydantic import ( @@ -17,12 +16,11 @@ StrictBool, StrictInt, StrictStr, - ValidationError, - field_validator, model_validator, ) from pydantic.fields import Field + class BaseModelForbidExtra(BaseModel, extra="forbid"): """Base Pydantic model class with extra values forbidden.""" @@ -103,7 +101,7 @@ class StructuralElementType(str, Enum): TRANSCRIPT_SEGMENT_ELEMENT = FUSORTypes.TRANSCRIPT_SEGMENT_ELEMENT.value TEMPLATED_SEQUENCE_ELEMENT = FUSORTypes.TEMPLATED_SEQUENCE_ELEMENT.value LINKER_SEQUENCE_ELEMENT = FUSORTypes.LINKER_SEQUENCE_ELEMENT.value - GENE_ELEMENT = Gene + GENE_ELEMENT = FUSORTypes.GENE_ELEMENT.value UNKNOWN_GENE_ELEMENT = FUSORTypes.UNKNOWN_GENE_ELEMENT.value MULTIPLE_POSSIBLE_GENES_ELEMENT = FUSORTypes.MULTIPLE_POSSIBLE_GENES_ELEMENT.value From c1e8fadc57296af32af3eb907ceb10ced959d1ad Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 18 Jul 2024 13:30:45 -0400 Subject: [PATCH 13/82] wip: update constructors with updated param names from models --- pyproject.toml | 3 +- src/fusor/fusor.py | 180 ++++++++++++++++++++------------------------- 2 files changed, 80 insertions(+), 103 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 195ebba..c338763 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -153,10 +153,11 @@ ignore = [ # S101 - assert # B011 - assert-false # N805 - invalid-first-argument-name-for-method +# N805 - invalid-argument-name # INP001 - implicit-namespace-package # SLF001 - private-member-access "tests/*" = ["ANN001", "ANN2", "ANN102", "S101", "B011", "INP001", "SLF001"] -"src/fusor/models.py" = ["ANN201", "N805", "ANN001", "ANN2", "ANN102"] +"src/fusor/models.py" = ["ANN201", "N803", "N805", "ANN001", "ANN2", "ANN102"] [tool.ruff.format] docstring-code-format = true diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index e21e343..0caa667 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -31,6 +31,7 @@ FunctionalDomain, Fusion, FusionType, + GeneElement, LinkerElement, MultiplePossibleGenesElement, RegulatoryClass, @@ -51,7 +52,6 @@ _logger = logging.getLogger(__name__) -# TODO: add back minimum information function for fetches from normalizers class FUSOR: """Class for modifying fusion objects.""" @@ -84,7 +84,7 @@ def _contains_element_type(kwargs: dict, elm_type: StructuralElementType) -> boo :param elm_type: element type to match :return: True if at least one element of given type is found, False otherwise. """ - for c in kwargs["structural_elements"]: + for c in kwargs["structure"]: if (isinstance(c, dict) and c.get("type") == elm_type) or ( isinstance(c, BaseStructuralElement) and c.type == elm_type ): @@ -122,8 +122,8 @@ def fusion(self, fusion_type: FusionType | None = None, **kwargs) -> Fusion: # try to infer from provided attributes categorical_attributes = any( [ - "critical_functional_domains" in kwargs, - "r_frame_preserved" in kwargs, + "criticalFunctionalDomains" in kwargs, + "readingFramePreserved" in kwargs, self._contains_element_type( kwargs, StructuralElementType.MULTIPLE_POSSIBLE_GENES_ELEMENT ), @@ -131,7 +131,7 @@ def fusion(self, fusion_type: FusionType | None = None, **kwargs) -> Fusion: ) assayed_attributes = any( [ - "causative_event" in kwargs, + "causativeEvent" in kwargs, "assay" in kwargs, self._contains_element_type( kwargs, StructuralElementType.UNKNOWN_GENE_ELEMENT @@ -156,27 +156,27 @@ def fusion(self, fusion_type: FusionType | None = None, **kwargs) -> Fusion: @staticmethod def categorical_fusion( - structural_elements: CategoricalFusionElements, + structure: CategoricalFusionElements, regulatory_element: RegulatoryElement | None = None, critical_functional_domains: list[FunctionalDomain] | None = None, - r_frame_preserved: bool | None = None, + reading_frame_preserved: bool | None = None, ) -> CategoricalFusion: """Construct a categorical fusion object - :param structural_elements: elements constituting the fusion - :param regulatory_element: affected regulatory element - :param critical_functional_domains: lost or preserved functional domains - :param r_frame_preserved: ``True`` if reading frame is preserved. ``False`` + :param structure: elements constituting the fusion + :param regulatoryElement: affected regulatory element + :param criticalFunctionalDomains: lost or preserved functional domains + :param readingFramePreserved: ``True`` if reading frame is preserved. ``False`` otherwise :return: CategoricalFusion if construction successful :raise: FUSORParametersException if given incorrect fusion properties """ try: fusion = CategoricalFusion( - structural_elements=structural_elements, - critical_functional_domains=critical_functional_domains, - r_frame_preserved=r_frame_preserved, - regulatory_element=regulatory_element, + structure=structure, + criticalFunctionalDomains=critical_functional_domains, + readingFramePreserved=reading_frame_preserved, + regulatoryElement=regulatory_element, ) except ValidationError as e: raise FUSORParametersException(str(e)) from e @@ -184,24 +184,24 @@ def categorical_fusion( @staticmethod def assayed_fusion( - structural_elements: AssayedFusionElements, + structure: AssayedFusionElements, causative_event: CausativeEvent | None = None, assay: Assay | None = None, regulatory_element: RegulatoryElement | None = None, ) -> AssayedFusion: """Construct an assayed fusion object - :param structural_elements: elements constituting the fusion - :param causative_event: event causing the fusion + :param structure: elements constituting the fusion + :param causativeEvent: event causing the fusion :param assay: how knowledge of the fusion was obtained - :param regulatory_element: affected regulatory elements + :param regulatoryElement: affected regulatory elements :return: Tuple containing optional AssayedFusion if construction successful, and any relevant validation warnings """ try: fusion = AssayedFusion( - structural_elements=structural_elements, - regulatory_element=regulatory_element, - causative_event=causative_event, + structure=structure, + regulatoryElement=regulatory_element, + causativeEvent=causative_event, assay=assay, ) except ValidationError as e: @@ -219,9 +219,9 @@ async def transcript_segment_element( :param tx_to_genomic_coords: `True` if going from transcript to genomic coordinates. ``False`` if going from genomic to transcript exon coordinates. - :param use_minimal_gene_descr: `True` if minimal gene descriptor - (``id``, ``gene_id``, ``label``) will be used. ``False`` if - gene-normalizer's gene descriptor will be used + :param use_minimal_gene_descr: `True` if minimal gene object + (``id``, ``label``) will be used. ``False`` if + gene-normalizer's entire gene object will be used :param seq_id_target_namespace: If want to use digest for ``sequence_id``, set this to the namespace you want the digest for. Otherwise, leave as ``None``. :param kwargs: @@ -272,7 +272,7 @@ async def transcript_segment_element( genomic_data = data.genomic_data genomic_data.transcript = coerce_namespace(genomic_data.transcript) - normalized_gene_response = self._normalized_gene_descriptor( + normalized_gene_response = self._normalized_gene( genomic_data.gene, use_minimal_gene_descr=use_minimal_gene_descr ) if not normalized_gene_response[0] and normalized_gene_response[1]: @@ -281,12 +281,13 @@ async def transcript_segment_element( return ( TranscriptSegmentElement( transcript=genomic_data.transcript, - exon_start=genomic_data.exon_start, - exon_start_offset=genomic_data.exon_start_offset, - exon_end=genomic_data.exon_end, - exon_end_offset=genomic_data.exon_end_offset, - gene_descriptor=normalized_gene_response[0], - element_genomic_start=self._sequence_location( + exonStart=genomic_data.exonStart, + exonStartOffset=genomic_data.exonStartOffset, + exonEnd=genomic_data.exonEnd, + exonEndOffset=genomic_data.exonEndOffset, + # TODO: make sure this is correct/works (might be response.gene?) + gene=normalized_gene_response[0], + elementGenomicStart=self._sequence_location( genomic_data.start, genomic_data.start + 1, genomic_data.chr, @@ -295,7 +296,7 @@ async def transcript_segment_element( ) if genomic_data.start else None, - element_genomic_end=self._sequence_location( + elementGenomicEnd=self._sequence_location( genomic_data.end, genomic_data.end + 1, genomic_data.chr, @@ -314,17 +315,17 @@ def gene_element( """Create gene element :param str gene: Gene - :param bool use_minimal_gene_descr: `True` if minimal gene descriptor + :param bool use_minimal_gene_descr: `True` if minimal gene object (`id`, `gene_id`, `label`) will be used. `False` if - gene-normalizer's gene descriptor will be used + gene-normalizer's gene object will be used :return: GeneElement, warning """ - gene_descr, warning = self._normalized_gene( + normalized_gene, warning = self._normalized_gene( gene, use_minimal_gene_descr=use_minimal_gene_descr ) - if not gene_descr: + if not normalized_gene: return None, warning - return Gene(gene_descriptor=gene_descr), None + return normalized_gene, None def templated_sequence_element( self, @@ -433,9 +434,9 @@ def functional_domain( :param sequence_id: protein sequence on which provided coordinates are located :param start: start position on sequence :param end: end position on sequence - :param use_minimal_gene_descr: ``True`` if minimal gene descriptor (``id``, + :param use_minimal_gene_descr: ``True`` if minimal gene object (``id``, ``gene_id``, ``label``) will be used. ``False`` if gene-normalizer's gene - descriptor will be used + object will be used :param seq_id_target_namespace: If want to use digest for ``sequence_id``, set this to the namespace you want the digest for. Otherwise, leave as ``None``. :return: Tuple with FunctionalDomain and None value for warnings if @@ -456,7 +457,7 @@ def functional_domain( if not seq: return None, warning - gene_descr, warning = self._normalized_gene_descriptor( + gene_descr, warning = self._normalized_gene( gene, use_minimal_gene_descr=use_minimal_gene_descr ) if not gene_descr: @@ -469,10 +470,10 @@ def functional_domain( try: return ( FunctionalDomain( - _id=functional_domain_id, + id=functional_domain_id, label=name, status=status, - associated_gene=gene_descr, + associatedGene=gene_descr, sequence_location=loc_descr, ), None, @@ -490,12 +491,12 @@ def regulatory_element( ) -> tuple[RegulatoryElement | None, str | None]: """Create RegulatoryElement :param regulatory_class: one of {"promoter", "enhancer"} - :param gene: gene term to fetch normalized descriptor for - :param use_minimal_gene_descr: whether to use the minimal gene descriptor + :param gene: gene term to fetch normalized gene object for + :param use_minimal_gene_descr: whether to use the minimal gene object :return: Tuple with RegulatoryElement instance and None value for warnings if successful, or a None value and warning message if unsuccessful """ - gene_descr, warning = self._normalized_gene_descriptor( + gene_descr, warning = self._normalized_gene( gene, use_minimal_gene_descr=use_minimal_gene_descr ) if not gene_descr: @@ -504,7 +505,7 @@ def regulatory_element( try: return ( RegulatoryElement( - regulatory_class=regulatory_class, associated_gene=gene_descr + regulatoryClass=regulatory_class, associatedGene=gene_descr ), None, ) @@ -520,24 +521,18 @@ def _sequence_location( sequence_id: str, label: str | None = None, seq_id_target_namespace: str | None = None, - use_location_id: bool = False, ) -> SequenceLocation: - """Create location descriptor + """Create sequence location :param start: Start position :param end: End position :param sequence_id: Accession for sequence :param label: label for location. If ``None``, ``sequence_id`` will be used as - Location Descriptor's ``id`` Else, label will be used as Location - Descriptor's ``id``. + Sequence Location's ``id`` Else, label will be used as Sequence Location's + ``id``. :param seq_id_target_namespace: If want to use digest for ``sequence_id``, set this to the namespace you want the digest for. Otherwise, leave as ``None``. - :param use_location_id: Takes precedence over ``label`` or ``sequence_id`` - becoming Location Descriptor's id. ``True`` if use ga4gh digest as Location - Descriptor's id. ``False``, use default of ``label`` > ``sequence_id`` """ - seq_id_input = sequence_id - try: sequence_id = coerce_namespace(sequence_id) except ValueError: @@ -559,7 +554,8 @@ def _sequence_location( sequence_id = seq_id return SequenceLocation( - sequence_id=sequence_id, + id=sequence_id, + label=label, start=start, end=end, ) @@ -607,28 +603,28 @@ def add_location_id(self, fusion: Fusion) -> Fusion: :param fusion: A valid Fusion object. :return: Updated fusion with `location_id` fields set """ - for structural_element in fusion.structural_elements: + for structural_element in fusion.structure: if isinstance(structural_element, TemplatedSequenceElement): location = structural_element.region.location location_id = self._location_id(location.model_dump()) structural_element.region.location_id = location_id elif isinstance(structural_element, TranscriptSegmentElement): for element_genomic in [ - structural_element.element_genomic_start, - structural_element.element_genomic_end, + structural_element.elementGenomicStart, + structural_element.elementGenomicEnd, ]: if element_genomic: location = element_genomic.location if location.type == SequenceLocation: location_id = self._location_id(location.model_dump()) element_genomic.location_id = location_id - if isinstance(fusion, CategoricalFusion) and fusion.critical_functional_domains: - for domain in fusion.critical_functional_domains: + if isinstance(fusion, CategoricalFusion) and fusion.criticalFunctionalDomains: + for domain in fusion.criticalFunctionalDomains: location = domain.sequence_location.location location_id = self._location_id(location.model_dump()) domain.sequence_location.location_id = location_id - if fusion.regulatory_element: - element = fusion.regulatory_element + if fusion.regulatoryElement: + element = fusion.regulatoryElement if element.feature_location: location = element.feature_location if location.type == SequenceLocation: @@ -654,7 +650,7 @@ def add_translated_sequence_id( :param target_namespace: ID namespace to translate sequence IDs to :return: Updated fusion with ``sequence_id`` fields set """ - for element in fusion.structural_elements: + for element in fusion.structure: if isinstance(element, TemplatedSequenceElement): location = element.region.location if location.type == SequenceLocation: @@ -668,8 +664,8 @@ def add_translated_sequence_id( element.region.location.sequence_id = new_id elif isinstance(element, TranscriptSegmentElement): for loc_descr in [ - element.element_genomic_start, - element.element_genomic_end, + element.elementGenomicStart, + element.elementGenomicEnd, ]: if loc_descr: location = loc_descr.location @@ -681,8 +677,9 @@ def add_translated_sequence_id( except IDTranslationException: continue loc_descr.location.sequence_id = new_id - if fusion.type == "CategoricalFusion" and fusion.critical_functional_domains: - for domain in fusion.critical_functional_domains: + if fusion.type == "CategoricalFusion" and fusion.criticalFunctionalDomains: + # TODO: unreachable code? + for domain in fusion.criticalFunctionalDomains: if ( domain.sequence_location and domain.sequence_location.location @@ -699,33 +696,9 @@ def add_translated_sequence_id( domain.sequence_location.location.sequence_id = new_id return fusion - # TODO: do we still need this? - def add_gene_descriptor(self, fusion: Fusion) -> Fusion: - """Add additional fields to ``gene_descriptor`` in fusion object - - :param fusion: A valid Fusion object - :return: Updated fusion with additional fields set in ``gene_descriptor`` - """ - properties = [fusion.structural_elements] - if fusion.type == FusionType.CATEGORICAL_FUSION: - properties.append(fusion.critical_functional_domains) - - for prop in properties: - for obj in prop: - if "gene_descriptor" in obj.model_fields: - label = obj.gene_descriptor.label - norm_gene_descr, _ = self._normalized_gene(label) - if norm_gene_descr: - obj.gene_descriptor = norm_gene_descr - if fusion.regulatory_element and fusion.regulatory_element.associated_gene: - reg_el = fusion.regulatory_element - label = reg_el.associated_gene.label - norm_gene_descr, _ = self._normalized_gene(label) - if norm_gene_descr: - reg_el.associated_gene = norm_gene_descr - return fusion - - def _normalized_gene(self, query: str) -> tuple[Gene | None, str | None]: + def _normalized_gene( + self, query: str, use_minimal_gene_descr: bool + ) -> tuple[Gene | None, str | None]: """Return gene from normalized response. :param query: Gene query @@ -735,7 +708,10 @@ def _normalized_gene(self, query: str) -> tuple[Gene | None, str | None]: gene_norm_resp = self.gene_normalizer.normalize(query) if gene_norm_resp.match_type: gene = gene_norm_resp.gene - return gene, None + if use_minimal_gene_descr: + return gene, None + # TODO: remove normalize.gene from id + return Gene(id=gene.id, label=gene.label) return None, f"gene-normalizer unable to normalize {query}" def generate_nomenclature(self, fusion: Fusion) -> str: @@ -747,11 +723,11 @@ def generate_nomenclature(self, fusion: Fusion) -> str: """ parts = [] element_genes = [] - if fusion.regulatory_element: + if fusion.regulatoryElement: parts.append( - reg_element_nomenclature(fusion.regulatory_element, self.seqrepo) + reg_element_nomenclature(fusion.regulatoryElement, self.seqrepo) ) - for element in fusion.structural_elements: + for element in fusion.structure: if isinstance(element, MultiplePossibleGenesElement): parts.append("v") elif isinstance(element, UnknownGeneElement): @@ -760,14 +736,14 @@ def generate_nomenclature(self, fusion: Fusion) -> str: parts.append(element.linker_sequence.sequence) elif isinstance(element, TranscriptSegmentElement): if not any( - [gene == element.gene_descriptor.label for gene in element_genes] # noqa: C419 + [gene == element.gene.label for gene in element_genes] # noqa: C419 ): parts.append(tx_segment_nomenclature(element)) elif isinstance(element, TemplatedSequenceElement): parts.append(templated_seq_nomenclature(element, self.seqrepo)) elif isinstance(element, GeneElement): if not any( - [gene == element.gene_descriptor.label for gene in element_genes] # noqa: C419 + [gene == element.gene.label for gene in element_genes] # noqa: C419 ): parts.append(gene_nomenclature(element)) else: From 7eabc25fbd048a1ae7361ff6b8a3ac0b62b6572a Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 18 Jul 2024 14:12:33 -0400 Subject: [PATCH 14/82] update constructors from model changes --- src/fusor/fusor.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 0caa667..aff80dc 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -9,7 +9,7 @@ from ga4gh.core import ga4gh_identify from ga4gh.core.domain_models import Gene from ga4gh.vrs import models -from ga4gh.vrs.models import SequenceLocation +from ga4gh.vrs.models import LiteralSequenceExpression, SequenceLocation, SequenceString from gene.database import AbstractDatabase as GeneDatabase from gene.database import create_db from gene.query import QueryHandler @@ -373,25 +373,18 @@ def templated_sequence_element( @staticmethod def linker_element( sequence: str, - residue_type: CURIE = "SO:0000348", ) -> tuple[LinkerElement | None, str | None]: """Create linker element :param sequence: Sequence - :param residue_type: Sequence Ontology code for residue type of ``sequence`` :return: Tuple containing a complete Linker element and None if successful, or a None value and warning message if unsuccessful """ try: - seq = sequence.upper() - params = { - "linker_sequence": { - "id": f"fusor.sequence:{seq}", - "sequence": seq, - "residue_type": residue_type, - } - } - return LinkerElement(**params), None + upper_seq = sequence.upper() + seq = SequenceString(upper_seq) + linker_sequence = LiteralSequenceExpression(sequence=seq) + return LinkerElement(linkerSequence=linker_sequence), None except ValidationError as e: msg = str(e) _logger.warning(msg) From 1e181441c1d1056a32ddb375cd7904fa31ef0361 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 18 Jul 2024 14:51:50 -0400 Subject: [PATCH 15/82] minor fixes --- pyproject.toml | 5 +++-- src/fusor/models.py | 7 +++---- src/fusor/tools.py | 1 + 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c338763..cd36df9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -152,12 +152,13 @@ ignore = [ # ANN102 - missing-type-cls # S101 - assert # B011 - assert-false +# N803 - invalid-argument-name # N805 - invalid-first-argument-name-for-method -# N805 - invalid-argument-name +# N815 - mixed-case-variable-in-class-scope # INP001 - implicit-namespace-package # SLF001 - private-member-access "tests/*" = ["ANN001", "ANN2", "ANN102", "S101", "B011", "INP001", "SLF001"] -"src/fusor/models.py" = ["ANN201", "N803", "N805", "ANN001", "ANN2", "ANN102"] +"src/fusor/models.py" = ["ANN201", "N803", "N805", "N815", "ANN001", "ANN2", "ANN102"] [tool.ruff.format] docstring-code-format = true diff --git a/src/fusor/models.py b/src/fusor/models.py index 38c35ef..f77adbc 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -165,11 +165,10 @@ def check_exons(cls, values): "exon_start_offset": 0, "exon_end": 8, "exon_end_offset": 0, - "gene_descriptor": { - "id": "normalize.gene:TPM3", - "type": "GeneDescriptor", + "gene": { + "id": "hgnc:12012", + "type": "Gene", "label": "TPM3", - "gene_id": "hgnc:12012", }, "element_genomic_start": { "id": "fusor.location_descriptor:NC_000001.11", diff --git a/src/fusor/tools.py b/src/fusor/tools.py index fa3cf5a..1cdc71e 100644 --- a/src/fusor/tools.py +++ b/src/fusor/tools.py @@ -3,6 +3,7 @@ import logging from biocommons.seqrepo.seqrepo import SeqRepo +from gene.schemas import CURIE from fusor.exceptions import IDTranslationException From 76ef031cb03d048f283074e6a36c557f6ec34f63 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 18 Jul 2024 15:40:55 -0400 Subject: [PATCH 16/82] fix: updating variable casing --- src/fusor/fusor.py | 2 +- src/fusor/models.py | 104 ++++++++++++++++++-------------------------- 2 files changed, 43 insertions(+), 63 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 64692d6..1d2f8aa 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -161,7 +161,7 @@ def categorical_fusion( reading_frame_preserved: bool | None = None, ) -> CategoricalFusion: """Construct a categorical fusion object - :param structural_elements: elements constituting the fusion + :param structure: elements constituting the fusion :param regulatory_element: affected regulatory element :param critical_functional_domains: lost or preserved functional domains :param reading_frame_preserved: ``True`` if reading frame is preserved. diff --git a/src/fusor/models.py b/src/fusor/models.py index f77adbc..bfcede4 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -67,26 +67,27 @@ class FunctionalDomain(BaseModel): model_config = ConfigDict( populate_by_name=True, - # TODO: update this example json once models approved json_schema_extra={ "example": { "type": "FunctionalDomain", "status": "lost", "label": "Tyrosine-protein kinase, catalytic domain", - "_id": "interpro:IPR020635", + "id": "interpro:IPR020635", "gene": { "id": "gene:NTRK1", "gene_id": "hgnc:8031", "label": "8031", "type": "Gene", }, - "sequence_location": { - "id": "fusor.location_descriptor:NP_002520.2", + "sequenceLocation": { + "id": "NP_002520.2", + "start": 510, + "end": "781", "type": "SequenceLocation", "sequenceReference": { "id": "GRCh38:chr22", "type": "SequenceReference", - "refgetAccession": "SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", + "refgetAccession": "SQ.vJvm06Wl5J7DXHynR9ksW7IK3_3jlFK6", "residueAlphabet": "na", }, }, @@ -135,68 +136,55 @@ def check_exons(cls, values): """ msg = "Must give values for either `exon_start`, `exon_end`, or both" - exon_start = values.get("exon_start") - exon_end = values.get("exon_end") + exon_start = values.get("exonStart") + exon_end = values.get("exonEnd") if (not exon_start) and (not exon_end): raise ValueError(msg) if exon_start: - if not values.get("element_genomic_start"): - msg = "Must give `element_genomic_start` if `exon_start` is given" + if not values.get("elementGenomicStart"): + msg = "Must give `elementGenomicStart` if `exonStart` is given" raise ValueError(msg) else: - values["exon_start_offset"] = None + values["exonStartOffset"] = None if exon_end: - if not values.get("element_genomic_end"): - msg = "Must give `element_genomic_end` if `exon_end` is given" + if not values.get("elementGenomicEnd"): + msg = "Must give `elementGenomicEnd` if `exonEnd` is given" raise ValueError(msg) else: - values["exon_end_offset"] = None + values["exonEndOffset"] = None return values model_config = ConfigDict( - # TODO: update this example json once models approved + # TODO: verify this example json_schema_extra={ "example": { "type": "TranscriptSegmentElement", "transcript": "refseq:NM_152263.3", - "exon_start": 1, - "exon_start_offset": 0, - "exon_end": 8, - "exon_end_offset": 0, + "exonStart": 1, + "exonStartOffset": 0, + "exonEnd": 8, + "exonEndOffset": 0, "gene": { "id": "hgnc:12012", "type": "Gene", "label": "TPM3", }, - "element_genomic_start": { - "id": "fusor.location_descriptor:NC_000001.11", - "type": "LocationDescriptor", + "elementGenomicStart": { + "id": "NC_000001.11", + "type": "SequenceLocation", "label": "NC_000001.11", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NC_000001.11", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 154192135}, - "end": {"type": "Number", "value": 154192136}, - }, - }, + "start": 154192135, + "end": 154192136, + # do we need a sequence reference here? }, - "element_genomic_end": { - "id": "fusor.location_descriptor:NC_000001.11", - "type": "LocationDescriptor", + "elementGenomicEnd": { + "id": "NC_000001.11", + "type": "SequenceLocation", "label": "NC_000001.11", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NC_000001.11", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 154170399}, - "end": {"type": "Number", "value": 154170400}, - }, - }, + "start": 154170399, + "end": 154170400, }, } }, @@ -217,7 +205,7 @@ class LinkerElement(BaseStructuralElement, extra="forbid"): json_schema_extra={ "example": { "type": "LinkerSequenceElement", - "linker_sequence": { + "linkerSequence": { "id": "sequence:ACGT", "type": "SequenceDescriptor", "sequence": "ACGT", @@ -377,9 +365,11 @@ def ensure_min_values(cls, values): `associated_gene` is set. """ if not ( - bool(values.get("feature_id")) ^ bool(values.get("feature_location")) - ) and not (values.get("associated_gene")): - msg = "Must set 1 of {`feature_id`, `associated_gene`} and/or `feature_location`" + bool(values.get("featureId")) ^ bool(values.get("featureLocation")) + ) and not (values.get("associatedGene")): + msg = ( + "Must set 1 of {`featureId`, `associatedGene`} and/or `featureLocation`" + ) raise ValueError(msg) return values @@ -459,7 +449,6 @@ def _access_object_attr( def _fetch_gene_id( cls, obj: dict | BaseModel, - gene_descriptor_field: str, ) -> str | None: """Get gene ID if element includes a gene annotation. @@ -468,14 +457,9 @@ def _fetch_gene_id( :param gene_descriptor_field: name of gene_descriptor field :return: gene ID if gene is defined """ - gene_descriptor = cls._access_object_attr(obj, gene_descriptor_field) - if gene_descriptor: - gene_value = cls._access_object_attr(gene_descriptor, "gene") - if gene_value: - gene_id = cls._access_object_attr(gene_value, "gene_id") - if gene_id: - return gene_id - gene_id = cls._access_object_attr(gene_descriptor, "gene_id") + gene_info = cls._access_object_attr(obj, "gene") + if gene_info: + gene_id = cls._access_object_attr(gene_info, "id") if gene_id: return gene_id return None @@ -505,23 +489,19 @@ def enforce_element_quantities(cls, values): if not structure: raise ValueError(qt_error_msg) num_structure = len(structure) - reg_element = values.get("regulatory_element") + reg_element = values.get("regulatoryElement") if (num_structure + bool(reg_element)) < 2: raise ValueError(qt_error_msg) uq_gene_msg = "Fusions must form a chimeric transcript from two or more genes, or a novel interaction between a rearranged regulatory element with the expressed product of a partner gene." gene_ids = [] if reg_element: - gene_id = cls._fetch_gene_id( - obj=reg_element, gene_descriptor_field="associated_gene" - ) + gene_id = cls._fetch_gene_id(obj=reg_element) if gene_id: gene_ids.append(gene_id) for element in structure: - gene_id = cls._fetch_gene_id( - obj=element, gene_descriptor_field="gene_descriptor" - ) + gene_id = cls._fetch_gene_id(obj=element) if gene_id: gene_ids.append(gene_id) From 6f740e9e98b81f0fa0570317736377a2e4d30748 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 18 Jul 2024 15:43:23 -0400 Subject: [PATCH 17/82] updating docstring --- src/fusor/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index bfcede4..1ebf1c2 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -130,8 +130,8 @@ class TranscriptSegmentElement(BaseStructuralElement): @model_validator(mode="before") def check_exons(cls, values): - """Check that at least one of {`exon_start`, `exon_end`} is set. - If set, check that the corresponding `element_genomic` field is set. + """Check that at least one of {`exonStart`, `exonEnd`} is set. + If set, check that the corresponding `elementGenomic` field is set. If not set, set corresponding offset to `None` """ From c928b355149b9957ab8caca226b6643ac6958339 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 18 Jul 2024 15:47:46 -0400 Subject: [PATCH 18/82] fix: variable casing and error messages --- src/fusor/models.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index 1ebf1c2..aca31de 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -135,7 +135,7 @@ def check_exons(cls, values): If not set, set corresponding offset to `None` """ - msg = "Must give values for either `exon_start`, `exon_end`, or both" + msg = "Must give values for either `exonStart`, `exonEnd`, or both" exon_start = values.get("exonStart") exon_end = values.get("exonEnd") if (not exon_start) and (not exon_end): @@ -348,7 +348,7 @@ class RegulatoryClass(str, Enum): class RegulatoryElement(BaseModel): """Define RegulatoryElement class. - `feature_id` would ideally be constrained as a CURIE, but Encode, our preferred + `featureId` would ideally be constrained as a CURIE, but Encode, our preferred feature ID source, doesn't currently have a registered CURIE structure for EH_ identifiers. Consequently, we permit any kind of free text. """ @@ -361,8 +361,8 @@ class RegulatoryElement(BaseModel): @model_validator(mode="before") def ensure_min_values(cls, values): - """Ensure that one of {`feature_id`, `feature_location`}, and/or - `associated_gene` is set. + """Ensure that one of {`featureId`, `featureLocation`}, and/or + `associatedGene` is set. """ if not ( bool(values.get("featureId")) ^ bool(values.get("featureLocation")) @@ -454,7 +454,6 @@ def _fetch_gene_id( :param obj: element to fetch gene from. Might not contain a gene (e.g. it's a TemplatedSequenceElement) so we have to use safe checks to fetch. - :param gene_descriptor_field: name of gene_descriptor field :return: gene ID if gene is defined """ gene_info = cls._access_object_attr(obj, "gene") @@ -519,7 +518,7 @@ def structure_ends(cls, values): """ elements = values.structure if isinstance(elements[0], TranscriptSegmentElement): - if elements[0].exon_end is None and not values["regulatory_element"]: + if elements[0].exonEnd is None and not values["regulatory_element"]: msg = "5' TranscriptSegmentElement fusion partner must contain ending exon position" raise ValueError(msg) elif isinstance(elements[0], LinkerElement): @@ -529,12 +528,12 @@ def structure_ends(cls, values): if len(elements) > 2: for element in elements[1:-1]: if isinstance(element, TranscriptSegmentElement) and ( - element.exon_start is None or element.exon_end is None + element.exonStart is None or element.exonEnd is None ): msg = "Connective TranscriptSegmentElement must include both start and end positions" raise ValueError(msg) if isinstance(elements[-1], TranscriptSegmentElement) and ( - elements[-1].exon_start is None + elements[-1].exonStart is None ): msg = "3' fusion partner junction must include " "starting position" raise ValueError From a2d2e10f8727616b91320b37042bacb3d3c29b2d Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 18 Jul 2024 16:03:25 -0400 Subject: [PATCH 19/82] revert featureId back to string --- src/fusor/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index aca31de..ea5cdd3 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -355,7 +355,7 @@ class RegulatoryElement(BaseModel): type: Literal[FUSORTypes.REGULATORY_ELEMENT] = FUSORTypes.REGULATORY_ELEMENT regulatoryClass: RegulatoryClass - featureId: CURIE | None = None + featureId: str | None = None associatedGene: Gene | None = None featureLocation: SequenceLocation | None = None From fe852974a390c6fd27094ff26fc251c3c4d2fc7f Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Fri, 19 Jul 2024 10:01:39 -0400 Subject: [PATCH 20/82] Update src/fusor/models.py Co-authored-by: Kori Kuzma --- src/fusor/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index ea5cdd3..a47947e 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -61,7 +61,7 @@ class FunctionalDomain(BaseModel): type: Literal[FUSORTypes.FUNCTIONAL_DOMAIN] = FUSORTypes.FUNCTIONAL_DOMAIN status: DomainStatus gene: Gene - id: CURIE | None = Field(None, alias="_id") + id: CURIE | None label: StrictStr | None = None sequenceLocation: SequenceLocation | None = None From eb9da5420368efa6575a7a2aaf2448277613e80c Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Fri, 19 Jul 2024 14:01:11 -0400 Subject: [PATCH 21/82] Update pyproject.toml Co-authored-by: Kori Kuzma --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index cd36df9..b757c8f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ description = "Computable object representation and validation for gene fusions" license = {file = "LICENSE"} dependencies = [ "pydantic == 2.*", - "ga4gh.vrs ~=2.0.0a8", + "ga4gh.vrs ~=2.0.0a10", "biocommons.seqrepo", "gene-normalizer ~=0.4.0", "cool-seq-tool ~=0.5.0", From bbae4bcd408812f4e271996077a93df8baa8d247 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Fri, 19 Jul 2024 14:01:17 -0400 Subject: [PATCH 22/82] Update pyproject.toml Co-authored-by: Kori Kuzma --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b757c8f..aff87b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ dependencies = [ "pydantic == 2.*", "ga4gh.vrs ~=2.0.0a10", "biocommons.seqrepo", - "gene-normalizer ~=0.4.0", + "gene-normalizer ~=0.4.1", "cool-seq-tool ~=0.5.0", ] dynamic=["version"] From 7634327cae44e8864678fa0662556be279e48c51 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Fri, 19 Jul 2024 14:11:02 -0400 Subject: [PATCH 23/82] Update src/fusor/models.py Co-authored-by: Kori Kuzma --- src/fusor/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index a47947e..d6b42da 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -87,7 +87,7 @@ class FunctionalDomain(BaseModel): "sequenceReference": { "id": "GRCh38:chr22", "type": "SequenceReference", - "refgetAccession": "SQ.vJvm06Wl5J7DXHynR9ksW7IK3_3jlFK6", + "refgetAccession": "SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", "residueAlphabet": "na", }, }, From a593c3b1d8d1d69befcff88d6c4f95abf6052559 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Fri, 19 Jul 2024 14:23:06 -0400 Subject: [PATCH 24/82] fixes from pr comments --- src/fusor/models.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index ea5cdd3..486a6ae 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -80,12 +80,13 @@ class FunctionalDomain(BaseModel): "type": "Gene", }, "sequenceLocation": { - "id": "NP_002520.2", + "id": "SL.vJvm06Wl5J7DXHynR9ksW7IK3_3jlFK6", "start": 510, "end": "781", "type": "SequenceLocation", "sequenceReference": { - "id": "GRCh38:chr22", + "id": "NP_002520.2", + "label": "GRCh38:chr22", "type": "SequenceReference", "refgetAccession": "SQ.vJvm06Wl5J7DXHynR9ksW7IK3_3jlFK6", "residueAlphabet": "na", @@ -175,14 +176,21 @@ def check_exons(cls, values): "id": "NC_000001.11", "type": "SequenceLocation", "label": "NC_000001.11", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO" + }, "start": 154192135, "end": 154192136, - # do we need a sequence reference here? }, "elementGenomicEnd": { "id": "NC_000001.11", "type": "SequenceLocation", "label": "NC_000001.11", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO" + }, "start": 154170399, "end": 154170400, }, @@ -414,7 +422,7 @@ class AbstractFusion(BaseModel, ABC): type: FusionType regulatoryElement: RegulatoryElement | None = None - structuralElements: list[BaseStructuralElement] + structure: list[BaseStructuralElement] readingFramePreserved: StrictBool | None = None @classmethod @@ -441,9 +449,7 @@ def _access_object_attr( return None elif isinstance(obj, dict): return obj.get(attr_name) - else: - msg = "Unrecognized type, should only pass entities with properties" - raise ValueError(msg) + @classmethod def _fetch_gene_id( From 1c3959bd55d167cbfb24f27917565a8bd66ac524 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Fri, 19 Jul 2024 14:23:32 -0400 Subject: [PATCH 25/82] fixes from pr comments --- src/fusor/models.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index 486a6ae..388fd87 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -178,7 +178,7 @@ def check_exons(cls, values): "label": "NC_000001.11", "sequenceReference": { "type": "SequenceReference", - "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO" + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", }, "start": 154192135, "end": 154192136, @@ -189,7 +189,7 @@ def check_exons(cls, values): "label": "NC_000001.11", "sequenceReference": { "type": "SequenceReference", - "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO" + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", }, "start": 154170399, "end": 154170400, @@ -450,7 +450,6 @@ def _access_object_attr( elif isinstance(obj, dict): return obj.get(attr_name) - @classmethod def _fetch_gene_id( cls, From 838e2a2e27c1a115b9fe230f572575cd3694050f Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Fri, 19 Jul 2024 14:23:58 -0400 Subject: [PATCH 26/82] wip: updating test examples with new models --- tests/test_models.py | 62 +++++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index e335875..e2ada5f 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -27,22 +27,20 @@ def gene_descriptors(): """Provide possible gene_descriptor input.""" return [ - {"id": "gene:G1", "gene": {"gene_id": "hgnc:9339"}, "label": "G1"}, - {"id": "gene:ABL", "gene": {"gene_id": "hgnc:76"}, "label": "ABL"}, - {"id": "gene:BCR1", "gene": {"gene_id": "hgnc:1014"}, "label": "BCR1"}, - {"id": "gene:NTRK1", "gene_id": "hgnc:8031", "label": "NTRK1"}, + {"id": "hgnc:9339", "label": "G1"}, + {"id": "hgnc:76", "label": "ABL"}, + {"id": "hgnc:1014", "label": "BCR1"}, + {"id": "hgnc:8031", "label": "NTRK1"}, { - "id": "gene:ALK", - "gene_id": "hgnc:1837", + "id": "hgnc:1837", "label": "ALK", }, - {"id": "gene:YAP1", "gene_id": "hgnc:16262", "label": "YAP1"}, + {"id": "hgnc:16262", "label": "YAP1"}, # alternate structure { - "id": "normalize.gene:BRAF", - "type": "GeneDescriptor", + "id": "hgnc:1097", + "type": "Gene", "label": "BRAF", - "gene_id": "hgnc:1097", }, ] @@ -53,7 +51,7 @@ def location_descriptors(): return [ { "id": "NC_000001.11:15455", - "type": "LocationDescriptor", + "type": "SequenceLocation", "location": { "sequence_id": "ncbi:NC_000001.11", "interval": { @@ -66,7 +64,7 @@ def location_descriptors(): }, { "id": "NC_000001.11:15566", - "type": "LocationDescriptor", + "type": "SequenceLocation", "location": { "sequence_id": "ncbi:NC_000001.11", "interval": { @@ -79,7 +77,7 @@ def location_descriptors(): }, { "id": "chr12:p12.1", - "type": "LocationDescriptor", + "type": "SequenceLocation", "location": { "species_id": "taxonomy:9606", "chr": "12", @@ -89,7 +87,7 @@ def location_descriptors(): }, { "id": "chr12:p12.2", - "type": "LocationDescriptor", + "type": "SequenceLocation", "location": { "species_id": "taxonomy:9606", "chr": "12", @@ -99,7 +97,7 @@ def location_descriptors(): }, { "id": "NC_000001.11:15455-15566", - "type": "LocationDescriptor", + "type": "SequenceLocation", "location": { "sequence_id": "ncbi:NC_000001.11", "interval": { @@ -112,7 +110,7 @@ def location_descriptors(): }, { "id": "chr12:p12.1-p12.2", - "type": "LocationDescriptor", + "type": "SequenceLocation", "location": { "species_id": "taxonomy:9606", "chr": "12", @@ -122,7 +120,7 @@ def location_descriptors(): }, { "id": "fusor.location_descriptor:NP_001123617.1", - "type": "LocationDescriptor", + "type": "SequenceLocation", "location": { "sequence_id": "ga4gh:SQ.sv5egNzqN5koJQH6w0M4tIK9tEDEfJl7", "type": "SequenceLocation", @@ -134,7 +132,7 @@ def location_descriptors(): }, { "id": "fusor.location_descriptor:NP_002520.2", - "type": "LocationDescriptor", + "type": "SequenceLocation", "location": { "sequence_id": "ga4gh:SQ.vJvm06Wl5J7DXHynR9ksW7IK3_3jlFK6", "type": "SequenceLocation", @@ -155,16 +153,16 @@ def functional_domains(gene_descriptors, location_descriptors): "type": "FunctionalDomain", "status": "preserved", "label": "WW domain", - "_id": "interpro:IPR001202", - "associated_gene": gene_descriptors[5], - "sequence_location": location_descriptors[6], + "id": "interpro:IPR001202", + "gene": gene_descriptors[5], + "sequenceLocation": location_descriptors[6], }, { "status": "lost", "label": "Tyrosine-protein kinase, catalytic domain", - "_id": "interpro:IPR020635", - "associated_gene": gene_descriptors[3], - "sequence_location": location_descriptors[7], + "id": "interpro:IPR020635", + "gene": gene_descriptors[3], + "sequenceLocation": location_descriptors[7], }, ] @@ -308,12 +306,11 @@ def test_functional_domain(functional_domains, gene_descriptors): assert test_domain.status == "preserved" assert test_domain.label == "WW domain" assert test_domain.id == "interpro:IPR001202" - assert test_domain.associated_gene.id == "gene:YAP1" - assert test_domain.associated_gene.gene_id == "hgnc:16262" - assert test_domain.associated_gene.label == "YAP1" - test_loc = test_domain.sequence_location + assert test_domain.gene.id == "hgnc:16262" + assert test_domain.gene.label == "YAP1" + test_loc = test_domain.sequenceLocation assert test_loc.id == "fusor.location_descriptor:NP_001123617.1" - assert test_loc.type == "LocationDescriptor" + assert test_loc.type == "SequenceLocation" assert test_loc.location.sequence_id == "ga4gh:SQ.sv5egNzqN5koJQH6w0M4tIK9tEDEfJl7" assert test_loc.location.interval.type == "SequenceInterval" assert test_loc.location.interval.start.value == 171 @@ -324,12 +321,11 @@ def test_functional_domain(functional_domains, gene_descriptors): assert test_domain.status == "lost" assert test_domain.label == "Tyrosine-protein kinase, catalytic domain" assert test_domain.id == "interpro:IPR020635" - assert test_domain.associated_gene.id == "gene:NTRK1" - assert test_domain.associated_gene.gene_id == "hgnc:8031" - assert test_domain.associated_gene.label == "NTRK1" + assert test_domain.gene.id == "hgnc:8031" + assert test_domain.gene.label == "NTRK1" test_loc = test_domain.sequence_location assert test_loc.id == "fusor.location_descriptor:NP_002520.2" - assert test_loc.type == "LocationDescriptor" + assert test_loc.type == "SequenceLocation" assert test_loc.location.sequence_id == "ga4gh:SQ.vJvm06Wl5J7DXHynR9ksW7IK3_3jlFK6" assert test_loc.location.interval.type == "SequenceInterval" assert test_loc.location.interval.start.value == 510 From f5f56892ba32b0009072c8127be02e5170f49e78 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Fri, 19 Jul 2024 14:28:41 -0400 Subject: [PATCH 27/82] adding back unreachable else because ruff will complain otherwise --- src/fusor/models.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index a11e26d..da8cb27 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -18,7 +18,6 @@ StrictStr, model_validator, ) -from pydantic.fields import Field class BaseModelForbidExtra(BaseModel, extra="forbid"): @@ -449,6 +448,9 @@ def _access_object_attr( return None elif isinstance(obj, dict): return obj.get(attr_name) + else: + msg = "Unrecognized type, should only pass entities with properties" + raise ValueError(msg) @classmethod def _fetch_gene_id( From c3136ce7eb901bc041bf26b2d0f62bb377888380 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Fri, 19 Jul 2024 15:05:31 -0400 Subject: [PATCH 28/82] fix: update example models with placeholders for sequence location and sequence reference ids --- src/fusor/models.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index da8cb27..dfa393d 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -172,7 +172,8 @@ def check_exons(cls, values): "label": "TPM3", }, "elementGenomicStart": { - "id": "NC_000001.11", + # TODO: this digest may not be correct, but putting a placeholder example here for now + "id": "SL.vJvm06Wl5J7DXHynR9ksW7IK3_3jlFK6", "type": "SequenceLocation", "label": "NC_000001.11", "sequenceReference": { @@ -183,7 +184,8 @@ def check_exons(cls, values): "end": 154192136, }, "elementGenomicEnd": { - "id": "NC_000001.11", + # TODO: this digest may not be correct, but putting a placeholder example here for now + "id": "SL.vJvm06Wl5J7DXHynR9ksW7IK3_3jlFK6", "type": "SequenceLocation", "label": "NC_000001.11", "sequenceReference": { From 686b000ea2cc6874d426dbfc100894dcdb9e3f01 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 12:25:51 -0400 Subject: [PATCH 29/82] fix: casing for data to/from cool-seq-tool --- src/fusor/fusor.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 1d2f8aa..e193826 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -272,6 +272,7 @@ async def transcript_segment_element( return None, data.warnings genomic_data = data.genomic_data + # should we be doing this before the call to cool_seq_tool? if the namespace prefix is present, that call will fail genomic_data.transcript = coerce_namespace(genomic_data.transcript) normalized_gene_response = self._normalized_gene( @@ -283,11 +284,10 @@ async def transcript_segment_element( return ( TranscriptSegmentElement( transcript=genomic_data.transcript, - exonStart=genomic_data.exonStart, - exonStartOffset=genomic_data.exonStartOffset, - exonEnd=genomic_data.exonEnd, - exonEndOffset=genomic_data.exonEndOffset, - # TODO: make sure this is correct/works (might be response.gene?) + exonStart=genomic_data.exon_start, + exonStartOffset=genomic_data.exon_start_offset, + exonEnd=genomic_data.exon_end, + exonEndOffset=genomic_data.exon_end_offset, gene=normalized_gene_response[0], elementGenomicStart=self._sequence_location( genomic_data.start, From 1ce7f23488651f31eeca5694c17485754217619f Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 12:28:55 -0400 Subject: [PATCH 30/82] Update src/fusor/fusor.py Co-authored-by: Kori Kuzma --- src/fusor/fusor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index e193826..5c8aa09 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -322,7 +322,7 @@ def gene_element( gene-normalizer's gene object will be used :return: GeneElement, warning """ - normalized_gene, warning = self._normalized_gene( + return self._normalized_gene( gene, use_minimal_gene_descr=use_minimal_gene_descr ) if not normalized_gene: From 485035cbd10168abbede37f202d535ff3a3ce0f8 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 12:29:11 -0400 Subject: [PATCH 31/82] Update src/fusor/fusor.py Co-authored-by: Kori Kuzma --- src/fusor/fusor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 5c8aa09..b7ae9ff 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -705,8 +705,7 @@ def _normalized_gene( gene = gene_norm_resp.gene if use_minimal_gene_descr: return gene, None - # TODO: remove normalize.gene from id - return Gene(id=gene.id, label=gene.label) + return Gene(id=gene_norm_resp.normalized_id, label=gene.label) return None, f"gene-normalizer unable to normalize {query}" def generate_nomenclature(self, fusion: Fusion) -> str: From 9f1ee60019737581d7f120633443dad75753bc30 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 12:35:41 -0400 Subject: [PATCH 32/82] fix: minimal gene response when creating gene --- src/fusor/fusor.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index b7ae9ff..f225367 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -312,22 +312,17 @@ async def transcript_segment_element( ) def gene_element( - self, gene: str, use_minimal_gene_descr: bool = True + self, gene: str, use_minimal_gene: bool = True ) -> tuple[Gene | None, str | None]: """Create gene element :param str gene: Gene - :param bool use_minimal_gene_descr: `True` if minimal gene object + :param bool use_minimal_gene: `True` if minimal gene object (`id`, `gene_id`, `label`) will be used. `False` if gene-normalizer's gene object will be used :return: GeneElement, warning """ - return self._normalized_gene( - gene, use_minimal_gene_descr=use_minimal_gene_descr - ) - if not normalized_gene: - return None, warning - return normalized_gene, None + return self._normalized_gene(gene, use_minimal_gene=use_minimal_gene) def templated_sequence_element( self, @@ -692,20 +687,21 @@ def add_translated_sequence_id( return fusion def _normalized_gene( - self, query: str, use_minimal_gene_descr: bool + self, query: str, use_minimal_gene: bool ) -> tuple[Gene | None, str | None]: """Return gene from normalized response. :param query: Gene query + :param use_minimal_gene: bool Use minimal gene representation (id and label only) :return: Tuple with gene and None value for warnings if successful, and None value with warning string if unsuccessful """ gene_norm_resp = self.gene_normalizer.normalize(query) if gene_norm_resp.match_type: gene = gene_norm_resp.gene - if use_minimal_gene_descr: + if not use_minimal_gene: return gene, None - return Gene(id=gene_norm_resp.normalized_id, label=gene.label) + return Gene(id=gene_norm_resp.normalized_id, label=gene.label), None return None, f"gene-normalizer unable to normalize {query}" def generate_nomenclature(self, fusion: Fusion) -> str: From 5bbc77cd687d54327b70a4710b73fb8580c5fde0 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 12:39:06 -0400 Subject: [PATCH 33/82] fix: naming --- src/fusor/fusor.py | 18 +++++++++--------- src/fusor/models.py | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index f225367..7deba05 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -213,7 +213,7 @@ def assayed_fusion( async def transcript_segment_element( self, tx_to_genomic_coords: bool = True, - use_minimal_gene_descr: bool = True, + use_minimal_gene: bool = True, seq_id_target_namespace: str | None = None, **kwargs, ) -> tuple[TranscriptSegmentElement | None, list[str] | None]: @@ -221,7 +221,7 @@ async def transcript_segment_element( :param tx_to_genomic_coords: `True` if going from transcript to genomic coordinates. ``False`` if going from genomic to transcript exon coordinates. - :param use_minimal_gene_descr: `True` if minimal gene object + :param use_minimal_gene: `True` if minimal gene object (``id``, ``label``) will be used. ``False`` if gene-normalizer's entire gene object will be used :param seq_id_target_namespace: If want to use digest for ``sequence_id``, set @@ -276,7 +276,7 @@ async def transcript_segment_element( genomic_data.transcript = coerce_namespace(genomic_data.transcript) normalized_gene_response = self._normalized_gene( - genomic_data.gene, use_minimal_gene_descr=use_minimal_gene_descr + genomic_data.gene, use_minimal_gene=use_minimal_gene ) if not normalized_gene_response[0] and normalized_gene_response[1]: return None, [normalized_gene_response[1]] @@ -412,7 +412,7 @@ def functional_domain( sequence_id: str, start: int, end: int, - use_minimal_gene_descr: bool = True, + use_minimal_gene: bool = True, seq_id_target_namespace: str | None = None, ) -> tuple[FunctionalDomain | None, str | None]: """Build functional domain instance. @@ -424,7 +424,7 @@ def functional_domain( :param sequence_id: protein sequence on which provided coordinates are located :param start: start position on sequence :param end: end position on sequence - :param use_minimal_gene_descr: ``True`` if minimal gene object (``id``, + :param use_minimal_gene: ``True`` if minimal gene object (``id``, ``gene_id``, ``label``) will be used. ``False`` if gene-normalizer's gene object will be used :param seq_id_target_namespace: If want to use digest for ``sequence_id``, set @@ -448,7 +448,7 @@ def functional_domain( return None, warning gene_descr, warning = self._normalized_gene( - gene, use_minimal_gene_descr=use_minimal_gene_descr + gene, use_minimal_gene=use_minimal_gene ) if not gene_descr: return None, warning @@ -477,17 +477,17 @@ def regulatory_element( self, regulatory_class: RegulatoryClass, gene: str, - use_minimal_gene_descr: bool = True, + use_minimal_gene: bool = True, ) -> tuple[RegulatoryElement | None, str | None]: """Create RegulatoryElement :param regulatory_class: one of {"promoter", "enhancer"} :param gene: gene term to fetch normalized gene object for - :param use_minimal_gene_descr: whether to use the minimal gene object + :param use_minimal_gene: whether to use the minimal gene object :return: Tuple with RegulatoryElement instance and None value for warnings if successful, or a None value and warning message if unsuccessful """ gene_descr, warning = self._normalized_gene( - gene, use_minimal_gene_descr=use_minimal_gene_descr + gene, use_minimal_gene=use_minimal_gene ) if not gene_descr: return None, warning diff --git a/src/fusor/models.py b/src/fusor/models.py index dfa393d..930c6d9 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -59,7 +59,7 @@ class FunctionalDomain(BaseModel): type: Literal[FUSORTypes.FUNCTIONAL_DOMAIN] = FUSORTypes.FUNCTIONAL_DOMAIN status: DomainStatus - gene: Gene + associatedGene: Gene id: CURIE | None label: StrictStr | None = None sequenceLocation: SequenceLocation | None = None @@ -72,7 +72,7 @@ class FunctionalDomain(BaseModel): "status": "lost", "label": "Tyrosine-protein kinase, catalytic domain", "id": "interpro:IPR020635", - "gene": { + "associatedGene": { "id": "gene:NTRK1", "gene_id": "hgnc:8031", "label": "8031", From 5a8bea2f00b9c2e7dc06254af89afec93d56d71b Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 12:40:39 -0400 Subject: [PATCH 34/82] Update src/fusor/models.py Co-authored-by: Kori Kuzma --- src/fusor/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index 930c6d9..87048d9 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -79,7 +79,7 @@ class FunctionalDomain(BaseModel): "type": "Gene", }, "sequenceLocation": { - "id": "SL.vJvm06Wl5J7DXHynR9ksW7IK3_3jlFK6", + "id": "ga4gh:SL.vJvm06Wl5J7DXHynR9ksW7IK3_3jlFK6", "start": 510, "end": "781", "type": "SequenceLocation", From ca97171524e4639fe822cfbade6500bc34674cf1 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 12:40:51 -0400 Subject: [PATCH 35/82] Update src/fusor/models.py Co-authored-by: Kori Kuzma --- src/fusor/models.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index 87048d9..087369a 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -73,9 +73,8 @@ class FunctionalDomain(BaseModel): "label": "Tyrosine-protein kinase, catalytic domain", "id": "interpro:IPR020635", "associatedGene": { - "id": "gene:NTRK1", - "gene_id": "hgnc:8031", - "label": "8031", + "id": "hgnc:8031", + "label": "NTRK1", "type": "Gene", }, "sequenceLocation": { From 92fe98f16bc687bd872acb5e468d8697f4f15e0d Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 12:44:07 -0400 Subject: [PATCH 36/82] Update src/fusor/models.py Co-authored-by: Kori Kuzma --- src/fusor/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index 087369a..a3d1374 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -83,7 +83,7 @@ class FunctionalDomain(BaseModel): "end": "781", "type": "SequenceLocation", "sequenceReference": { - "id": "NP_002520.2", + "id": "NC_000022.11", "label": "GRCh38:chr22", "type": "SequenceReference", "refgetAccession": "SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", From 43f347ffa29b41326c12e160cc9ffe25adc88042 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 12:44:13 -0400 Subject: [PATCH 37/82] Update src/fusor/models.py Co-authored-by: Kori Kuzma --- src/fusor/models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index a3d1374..8d565ac 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -183,8 +183,7 @@ def check_exons(cls, values): "end": 154192136, }, "elementGenomicEnd": { - # TODO: this digest may not be correct, but putting a placeholder example here for now - "id": "SL.vJvm06Wl5J7DXHynR9ksW7IK3_3jlFK6", + "id": "ga4gh:SL.rtR6x2NnJEpROlxiT_DY9C-spf6ijYQi", "type": "SequenceLocation", "label": "NC_000001.11", "sequenceReference": { From dba14caed1cc530bbc6596d0168a2d2be93c95a7 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 12:44:21 -0400 Subject: [PATCH 38/82] Update src/fusor/fusor.py Co-authored-by: Kori Kuzma --- src/fusor/fusor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 7deba05..8d1f8fa 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -318,7 +318,7 @@ def gene_element( :param str gene: Gene :param bool use_minimal_gene: `True` if minimal gene object - (`id`, `gene_id`, `label`) will be used. `False` if + (`id` and `label`) will be used. `False` if gene-normalizer's gene object will be used :return: GeneElement, warning """ From a3e67d1e48a19b2585af5876bac29654e005a4ba Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 12:44:34 -0400 Subject: [PATCH 39/82] Update src/fusor/models.py Co-authored-by: Kori Kuzma --- src/fusor/models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index 8d565ac..6088f6e 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -171,8 +171,7 @@ def check_exons(cls, values): "label": "TPM3", }, "elementGenomicStart": { - # TODO: this digest may not be correct, but putting a placeholder example here for now - "id": "SL.vJvm06Wl5J7DXHynR9ksW7IK3_3jlFK6", + "id": "ga4gh:SL.2K1vML0ofuYrYncrzzXUQOISRFJldZrO", "type": "SequenceLocation", "label": "NC_000001.11", "sequenceReference": { From 74db34c9bff5ffaf0551314257a01bfad0ac9710 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 13:27:06 -0400 Subject: [PATCH 40/82] updating constructor for SequenceLocation and adding SequenceReference --- src/fusor/fusor.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 7deba05..6fb16fe 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -9,7 +9,12 @@ from ga4gh.core import ga4gh_identify from ga4gh.core.domain_models import Gene from ga4gh.vrs import models -from ga4gh.vrs.models import LiteralSequenceExpression, SequenceLocation, SequenceString +from ga4gh.vrs.models import ( + LiteralSequenceExpression, + SequenceLocation, + SequenceReference, + SequenceString, +) from gene.database import AbstractDatabase as GeneDatabase from gene.database import create_db from gene.query import QueryHandler @@ -190,7 +195,7 @@ def assayed_fusion( ) -> AssayedFusion: """Construct an assayed fusion object :param structure: elements constituting the fusion - :param causativeEvent: event causing the fusion + :param causative_event: event causing the fusion :param assay: how knowledge of the fusion was obtained :param regulatory_element: affected regulatory elements :param reading_frame_preserved: ``True`` if reading frame is preserved. @@ -272,7 +277,6 @@ async def transcript_segment_element( return None, data.warnings genomic_data = data.genomic_data - # should we be doing this before the call to cool_seq_tool? if the namespace prefix is present, that call will fail genomic_data.transcript = coerce_namespace(genomic_data.transcript) normalized_gene_response = self._normalized_gene( @@ -293,7 +297,7 @@ async def transcript_segment_element( genomic_data.start, genomic_data.start + 1, genomic_data.chr, - label=genomic_data.chr, + label=genomic_data.transcript, seq_id_target_namespace=seq_id_target_namespace, ) if genomic_data.start @@ -302,7 +306,7 @@ async def transcript_segment_element( genomic_data.end, genomic_data.end + 1, genomic_data.chr, - label=genomic_data.chr, + label=genomic_data.transcript, seq_id_target_namespace=seq_id_target_namespace, ) if genomic_data.end @@ -543,11 +547,22 @@ def _sequence_location( else: sequence_id = seq_id + # TODO: unsure if this is correct - I'm getting SQ's instead of SL's + refget_accession = translate_identifier( + self.seqrepo, + label, + "ga4gh", + ) + return SequenceLocation( id=sequence_id, label=label, start=start, end=end, + # TODO: pretty sure this isn't quite right but along the right track for what we want, would love some guidance on this later + sequence_reference=SequenceReference( + id=label, refgetAccession=refget_accession.replace("ga4gh:", "") + ), ) def add_additional_fields( From cce615913e7a8281007657bae08c0163b2bea685 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 13:28:15 -0400 Subject: [PATCH 41/82] removing comment --- src/fusor/fusor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index c3d28ce..4e7fdc2 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -547,7 +547,6 @@ def _sequence_location( else: sequence_id = seq_id - # TODO: unsure if this is correct - I'm getting SQ's instead of SL's refget_accession = translate_identifier( self.seqrepo, label, From 58b0899c41bcedf044acb0265ad97ddecb8fb5fb Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 13:41:27 -0400 Subject: [PATCH 42/82] wip: start updates to nomenclature using new models --- src/fusor/nomenclature.py | 61 +++++++++++++++++---------------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/src/fusor/nomenclature.py b/src/fusor/nomenclature.py index c250df0..ce1f675 100644 --- a/src/fusor/nomenclature.py +++ b/src/fusor/nomenclature.py @@ -1,6 +1,7 @@ """Provide helper methods for fusion nomenclature generation.""" from biocommons.seqrepo.seqrepo import SeqRepo +from ga4gh.vrs.models import SequenceLocation from fusor.exceptions import IDTranslationException from fusor.models import ( @@ -23,18 +24,19 @@ def reg_element_nomenclature(element: RegulatoryElement, sr: SeqRepo) -> str: or if missing element reference ID, genomic location, and associated gene """ - element_class = element.regulatory_class.value + element_class = element.regulatoryClass.value if element_class == RegulatoryClass.ENHANCER: type_string = "e" elif element_class == RegulatoryClass.PROMOTER: type_string = "p" else: - type_string = f"{element.regulatory_class.value}" + type_string = f"{element.regulatoryClass.value}" feature_string = "" - if element.feature_id: - feature_string += f"_{element.feature_id}" - elif element.feature_location: - start = element.feature_location + if element.featureId: + feature_string += f"_{element.featureId}" + elif element.featureLocation: + start = element.featureLocation + # TODO: update this with new model sequence_id = start.location.sequence_id refseq_id = str(translate_identifier(sr, sequence_id, "refseq")).split(":")[1] try: @@ -42,17 +44,12 @@ def reg_element_nomenclature(element: RegulatoryElement, sr: SeqRepo) -> str: except IDTranslationException as e: raise ValueError from e feature_string += f"_{refseq_id}(chr {chrom}):g.{start.location.interval.start.value}_{start.location.interval.end.value}" - if element.associated_gene: - if element.associated_gene.gene_id: - gene_id = gene_id = element.associated_gene.gene_id - - if element.associated_gene.gene_id: - gene_id = element.associated_gene.gene_id - elif element.associated_gene.gene and element.associated_gene.gene.gene_id: - gene_id = element.associated_gene.gene.gene_id + if element.associatedGene: + if element.associatedGene.id: + gene_id = element.associatedGene.id else: raise ValueError - feature_string += f"@{element.associated_gene.label}({gene_id})" + feature_string += f"@{element.associatedGene.label}({gene_id})" if not feature_string: raise ValueError return f"reg_{type_string}{feature_string}" @@ -69,21 +66,21 @@ def tx_segment_nomenclature(element: TranscriptSegmentElement) -> str: if ":" in transcript: transcript = transcript.split(":")[1] - prefix = f"{transcript}({element.gene_descriptor.label})" - start = element.exon_start if element.exon_start else "" - if element.exon_start_offset: - if element.exon_start_offset > 0: - start_offset = f"+{element.exon_start_offset}" + prefix = f"{transcript}({element.gene.label})" + start = element.exonStart if element.exonStart else "" + if element.exonStartOffset: + if element.exonStartOffset > 0: + start_offset = f"+{element.exonStartOffset}" else: - start_offset = str(element.exon_start_offset) + start_offset = str(element.exonStartOffset) else: start_offset = "" - end = element.exon_end if element.exon_end else "" - if element.exon_end_offset: - if element.exon_end_offset > 0: - end_offset = f"+{element.exon_end_offset}" + end = element.exonEnd if element.exonEnd else "" + if element.exonEndOffset: + if element.exonEndOffset > 0: + end_offset = f"+{element.exonEndOffset}" else: - end_offset = str(element.exon_end_offset) + end_offset = str(element.exonEndOffset) else: end_offset = "" return f"{prefix}:e.{start}{start_offset}{'_' if start and end else ''}{end}{end_offset}" @@ -93,6 +90,7 @@ def templated_seq_nomenclature(element: TemplatedSequenceElement, sr: SeqRepo) - """Return fusion nomenclature for templated sequence element. :param element: a templated sequence element + :param sr: SeqRepo instance to use :return: element nomenclature representation :raises ValueError: if location isn't a SequenceLocation or if unable to retrieve region or location @@ -122,13 +120,8 @@ def gene_nomenclature(element: GeneElement) -> str: :return: element nomenclature representation :raises ValueError: if unable to retrieve gene ID """ - if element.gene_descriptor.gene_id: - gene_id = gene_id = element.gene_descriptor.gene_id - - if element.gene_descriptor.gene_id: - gene_id = element.gene_descriptor.gene_id - elif element.gene_descriptor.gene and element.gene_descriptor.gene.gene_id: - gene_id = element.gene_descriptor.gene.gene_id + if element.gene.id: + gene_id = element.gene.id else: raise ValueError - return f"{element.gene_descriptor.label}({gene_id})" + return f"{element.gene.label}({gene_id})" From 634ad7f3f26e2c34a02ceeb8bb305c8240c0671a Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 13:55:32 -0400 Subject: [PATCH 43/82] wip: progress on sequence location constructor --- src/fusor/fusor.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 4e7fdc2..cd77861 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -297,6 +297,7 @@ async def transcript_segment_element( genomic_data.start, genomic_data.start + 1, genomic_data.chr, + transcript=genomic_data.transcript, label=genomic_data.transcript, seq_id_target_namespace=seq_id_target_namespace, ) @@ -306,6 +307,7 @@ async def transcript_segment_element( genomic_data.end, genomic_data.end + 1, genomic_data.chr, + transcript=genomic_data.transcript, label=genomic_data.transcript, seq_id_target_namespace=seq_id_target_namespace, ) @@ -513,6 +515,7 @@ def _sequence_location( start: int, end: int, sequence_id: str, + transcript: str | None = None, label: str | None = None, seq_id_target_namespace: str | None = None, ) -> SequenceLocation: @@ -521,9 +524,10 @@ def _sequence_location( :param start: Start position :param end: End position :param sequence_id: Accession for sequence - :param label: label for location. If ``None``, ``sequence_id`` will be used as - Sequence Location's ``id`` Else, label will be used as Sequence Location's - ``id``. + :param transcript: Associated transcript for the sequence + :param label: label for Sequence Location. If ``None``, ``sequence_id`` will be used as + Sequence Location's ``id`` and ``label`` Else, label will be used as Sequence Location's Sequence Reference's + ``id`` if no transcript is provided. :param seq_id_target_namespace: If want to use digest for ``sequence_id``, set this to the namespace you want the digest for. Otherwise, leave as ``None``. """ @@ -549,18 +553,20 @@ def _sequence_location( refget_accession = translate_identifier( self.seqrepo, - label, + transcript, "ga4gh", ) return SequenceLocation( + # TODO: I think we want id here to be the ga4gh_identified SequenceLocation object here instead, but in order to get that, + # I need to make this object first? if I make the object to get that id and then make a new object with that id, that seems a bit too roundabout... id=sequence_id, + # TODO: and I think this is supposed to be sequence_id label=label, start=start, end=end, - # TODO: pretty sure this isn't quite right but along the right track for what we want, would love some guidance on this later sequence_reference=SequenceReference( - id=label, refgetAccession=refget_accession.replace("ga4gh:", "") + id=transcript, refgetAccession=refget_accession.replace("ga4gh:", "") ), ) From 7334ac881902eea93cf5870958012640820ad7e5 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 14:22:27 -0400 Subject: [PATCH 44/82] fix: tests and add sequence location id --- src/fusor/fusor.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index cd77861..bfb42a5 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -557,18 +557,19 @@ def _sequence_location( "ga4gh", ) - return SequenceLocation( - # TODO: I think we want id here to be the ga4gh_identified SequenceLocation object here instead, but in order to get that, - # I need to make this object first? if I make the object to get that id and then make a new object with that id, that seems a bit too roundabout... - id=sequence_id, - # TODO: and I think this is supposed to be sequence_id - label=label, + sequence_location_label = label if label else sequence_id + sequence_location = SequenceLocation( + label=sequence_location_label, start=start, end=end, sequence_reference=SequenceReference( id=transcript, refgetAccession=refget_accession.replace("ga4gh:", "") ), ) + sequence_location_id = ga4gh_identify(sequence_location) + sequence_location.id = sequence_location_id + + return sequence_location def add_additional_fields( self, From 9d08adf4106e690dda481b82cb1eb814aa6cd985 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 14:41:51 -0400 Subject: [PATCH 45/82] wip: update test examples --- tests/test_models.py | 209 +++++++++++++++++++------------------------ 1 file changed, 94 insertions(+), 115 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index e2ada5f..15f1899 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -24,8 +24,8 @@ @pytest.fixture(scope="module") -def gene_descriptors(): - """Provide possible gene_descriptor input.""" +def gene_examples(): + """Provide possible gene input.""" return [ {"id": "hgnc:9339", "label": "G1"}, {"id": "hgnc:76", "label": "ABL"}, @@ -46,35 +46,23 @@ def gene_descriptors(): @pytest.fixture(scope="module") -def location_descriptors(): - """Provide possible templated_sequence input.""" +def sequence_locations(): + """Provide possible sequence_location input.""" return [ { - "id": "NC_000001.11:15455", + "id": "NC_000001.11", "type": "SequenceLocation", - "location": { - "sequence_id": "ncbi:NC_000001.11", - "interval": { - "start": {"type": "Number", "value": 15455}, - "end": {"type": "Number", "value": 15456}, - }, - "type": "SequenceLocation", - }, - "label": "NC_000001.11:15455", + "start": 15455, + "end": 15456, + # TODO what to put for sequence reference here?? or other places in these examples? }, { - "id": "NC_000001.11:15566", + "id": "NC_000001.11", "type": "SequenceLocation", - "location": { - "sequence_id": "ncbi:NC_000001.11", - "interval": { - "start": {"type": "Number", "value": 15565}, - "end": {"type": "Number", "value": 15566}, - }, - "type": "SequenceLocation", - }, - "label": "NC_000001.11:15566", + "start": 15565, + "end": 15566, }, + # TODO: no clue what to do here - SequenceLocations don't support anything other than integers for start/end { "id": "chr12:p12.1", "type": "SequenceLocation", @@ -96,16 +84,10 @@ def location_descriptors(): "label": "chr12:p12.2", }, { - "id": "NC_000001.11:15455-15566", + "id": "NC_000001.11", "type": "SequenceLocation", - "location": { - "sequence_id": "ncbi:NC_000001.11", - "interval": { - "start": {"type": "Number", "value": 15455}, - "end": {"type": "Number", "value": 15566}, - }, - "type": "SequenceLocation", - }, + "start": 15455, + "end": 15566, "label": "NC_000001.11:15455-15566", }, { @@ -119,34 +101,34 @@ def location_descriptors(): "label": "chr12:p12.1-p12.2", }, { - "id": "fusor.location_descriptor:NP_001123617.1", + "id": "NP_001123617.1", "type": "SequenceLocation", - "location": { - "sequence_id": "ga4gh:SQ.sv5egNzqN5koJQH6w0M4tIK9tEDEfJl7", - "type": "SequenceLocation", - "interval": { - "start": {"type": "Number", "value": 171}, - "end": {"type": "Number", "value": 204}, - }, + "start": 171, + "end": 204, + "label": "NP_001123617.1", + "sequenceReference": { + "id": "NM_001130145.3", + "refgetAccession": "SQ.sv5egNzqN5koJQH6w0M4tIK9tEDEfJl7", + "type": "SequenceReference", }, }, { - "id": "fusor.location_descriptor:NP_002520.2", + "id": "NP_002520.2", "type": "SequenceLocation", - "location": { - "sequence_id": "ga4gh:SQ.vJvm06Wl5J7DXHynR9ksW7IK3_3jlFK6", - "type": "SequenceLocation", - "interval": { - "start": {"type": "Number", "value": 510}, - "end": {"type": "Number", "value": 781}, - }, + "start": 510, + "end": 781, + "label": "NP_002520.2", + "sequenceReference": { + "id": "NM_001130145.3", + "refgetAccession": "SQ.61McMQgrnlFJ2Vxh3SJ34hTzpxdubu--", + "type": "SequenceReference", }, }, ] @pytest.fixture(scope="module") -def functional_domains(gene_descriptors, location_descriptors): +def functional_domains(gene_examples, sequence_locations): """Provide possible functional_domains input.""" return [ { @@ -154,88 +136,88 @@ def functional_domains(gene_descriptors, location_descriptors): "status": "preserved", "label": "WW domain", "id": "interpro:IPR001202", - "gene": gene_descriptors[5], - "sequenceLocation": location_descriptors[6], + "associatedGene": gene_examples[5], + "sequenceLocation": sequence_locations[6], }, { "status": "lost", "label": "Tyrosine-protein kinase, catalytic domain", "id": "interpro:IPR020635", - "gene": gene_descriptors[3], - "sequenceLocation": location_descriptors[7], + "associatedGene": gene_examples[3], + "sequenceLocation": sequence_locations[7], }, ] @pytest.fixture(scope="module") -def transcript_segments(location_descriptors, gene_descriptors): +def transcript_segments(sequence_locations, gene_examples): """Provide possible transcript_segment input.""" return [ { "transcript": "refseq:NM_152263.3", - "exon_start": 1, - "exon_start_offset": -9, - "exon_end": 8, - "exon_end_offset": 7, - "gene_descriptor": gene_descriptors[0], - "element_genomic_start": location_descriptors[2], - "element_genomic_end": location_descriptors[3], + "exonStart": 1, + "exonStartOffset": -9, + "exonEnd": 8, + "exonEndOffset": 7, + "gene": gene_examples[0], + "elementGenomicStart": sequence_locations[2], + "elementGenomicEnd": sequence_locations[3], }, { "type": "TranscriptSegmentElement", "transcript": "refseq:NM_034348.3", - "exon_start": 1, - "exon_end": 8, - "gene_descriptor": gene_descriptors[3], - "element_genomic_start": location_descriptors[0], - "element_genomic_end": location_descriptors[1], + "exonStart": 1, + "exonEnd": 8, + "gene": gene_examples[3], + "elementGenomicStart": sequence_locations[0], + "elementGenomicEnd": sequence_locations[1], }, { "type": "TranscriptSegmentElement", "transcript": "refseq:NM_938439.4", - "exon_start": 7, - "exon_end": 14, - "exon_end_offset": -5, - "gene_descriptor": gene_descriptors[4], - "element_genomic_start": location_descriptors[0], - "element_genomic_end": location_descriptors[1], + "exonStart": 7, + "exonEnd": 14, + "exonEndOffset": -5, + "gene": gene_examples[4], + "elementGenomicStart": sequence_locations[0], + "elementGenomicEnd": sequence_locations[1], }, { "type": "TranscriptSegmentElement", "transcript": "refseq:NM_938439.4", - "exon_start": 7, - "gene_descriptor": gene_descriptors[4], - "element_genomic_start": location_descriptors[0], + "exonStart": 7, + "gene": gene_examples[4], + "elementGenomicStart": sequence_locations[0], }, ] @pytest.fixture(scope="module") -def gene_elements(gene_descriptors): +def gene_elements(gene_examples): """Provide possible gene element input data.""" return [ { "type": "GeneElement", - "gene_descriptor": gene_descriptors[1], + "gene": gene_examples[1], }, - {"type": "GeneElement", "gene_descriptor": gene_descriptors[0]}, + {"type": "GeneElement", "gene": gene_examples[0]}, {"type"}, ] @pytest.fixture(scope="module") -def templated_sequence_elements(location_descriptors): +def templated_sequence_elements(sequence_locations): """Provide possible templated sequence element input data.""" return [ { "type": "TemplatedSequenceElement", "strand": "+", - "region": location_descriptors[5], + "region": sequence_locations[5], }, { "type": "TemplatedSequenceElement", "strand": "-", - "region": location_descriptors[4], + "region": sequence_locations[4], }, ] @@ -248,19 +230,19 @@ def sequence_descriptors(): "id": "sequence:ACGT", "type": "SequenceDescriptor", "sequence": "ACGT", - "residue_type": "SO:0000348", + "residueType": "SO:0000348", }, { "id": "sequence:T", "type": "SequenceDescriptor", "sequence": "T", - "residue_type": "SO:0000348", + "residueType": "SO:0000348", }, { "id": "sequence:actgu", "type": "SequenceDescriptor", "sequence": "actgu", - "residue_type": "SO:0000348", + "residueType": "SO:0000348", }, ] @@ -269,9 +251,9 @@ def sequence_descriptors(): def linkers(sequence_descriptors): """Provide possible linker element input data.""" return [ - {"type": "LinkerSequenceElement", "linker_sequence": sequence_descriptors[0]}, - {"type": "LinkerSequenceElement", "linker_sequence": sequence_descriptors[1]}, - {"type": "LinkerSequenceElement", "linker_sequence": sequence_descriptors[2]}, + {"type": "LinkerSequenceElement", "linkerSequence": sequence_descriptors[0]}, + {"type": "LinkerSequenceElement", "linkerSequence": sequence_descriptors[1]}, + {"type": "LinkerSequenceElement", "linkerSequence": sequence_descriptors[2]}, ] @@ -282,9 +264,9 @@ def unknown_element(): @pytest.fixture(scope="module") -def regulatory_elements(gene_descriptors): +def regulatory_elements(gene_examples): """Provide possible regulatory_element input data.""" - return [{"regulatory_class": "promoter", "associated_gene": gene_descriptors[0]}] + return [{"regulatoryClass": "promoter", "associatedGene": gene_examples[0]}] def check_validation_error(exc_info, expected_msg: str, index: int = 0): @@ -299,30 +281,29 @@ def check_validation_error(exc_info, expected_msg: str, index: int = 0): assert exc_info.value.errors()[index]["msg"] == expected_msg -def test_functional_domain(functional_domains, gene_descriptors): +def test_functional_domain(functional_domains, gene_examples): """Test FunctionalDomain object initializes correctly""" test_domain = FunctionalDomain(**functional_domains[0]) assert test_domain.type == "FunctionalDomain" assert test_domain.status == "preserved" assert test_domain.label == "WW domain" assert test_domain.id == "interpro:IPR001202" - assert test_domain.gene.id == "hgnc:16262" - assert test_domain.gene.label == "YAP1" + assert test_domain.associatedGene.id == "hgnc:16262" + assert test_domain.associatedGene.label == "YAP1" test_loc = test_domain.sequenceLocation assert test_loc.id == "fusor.location_descriptor:NP_001123617.1" assert test_loc.type == "SequenceLocation" - assert test_loc.location.sequence_id == "ga4gh:SQ.sv5egNzqN5koJQH6w0M4tIK9tEDEfJl7" - assert test_loc.location.interval.type == "SequenceInterval" - assert test_loc.location.interval.start.value == 171 - assert test_loc.location.interval.end.value == 204 + assert test_loc.sequence_id == "ga4gh:SQ.sv5egNzqN5koJQH6w0M4tIK9tEDEfJl7" + assert test_loc.start.value == 171 + assert test_loc.end.value == 204 test_domain = FunctionalDomain(**functional_domains[1]) assert test_domain.type == "FunctionalDomain" assert test_domain.status == "lost" assert test_domain.label == "Tyrosine-protein kinase, catalytic domain" assert test_domain.id == "interpro:IPR020635" - assert test_domain.gene.id == "hgnc:8031" - assert test_domain.gene.label == "NTRK1" + assert test_domain.associatedGene.id == "hgnc:8031" + assert test_domain.associatedGene.label == "NTRK1" test_loc = test_domain.sequence_location assert test_loc.id == "fusor.location_descriptor:NP_002520.2" assert test_loc.type == "SequenceLocation" @@ -337,7 +318,7 @@ def test_functional_domain(functional_domains, gene_descriptors): status="gained", name="tyrosine kinase catalytic domain", id="interpro:IPR020635", - associated_gene=gene_descriptors[0], + associated_gene=gene_examples[0], ) msg = "Input should be 'lost' or 'preserved'" check_validation_error(exc_info, msg) @@ -348,7 +329,7 @@ def test_functional_domain(functional_domains, gene_descriptors): status="lost", label="tyrosine kinase catalytic domain", id="interpro_IPR020635", - associated_gene=gene_descriptors[0], + associated_gene=gene_examples[0], ) msg = "String should match pattern '^\\w[^:]*:.+$'" check_validation_error(exc_info, msg) @@ -540,7 +521,7 @@ def check_linker(actual, expected_id, expected_sequence): check_validation_error(exc_info, msg) -def test_genomic_region_element(templated_sequence_elements, location_descriptors): +def test_genomic_region_element(templated_sequence_elements, sequence_locations): """Test that TemplatedSequenceElement initializes correctly.""" def assert_genomic_region_test_element(test): @@ -581,15 +562,15 @@ def assert_genomic_region_test_element(test): # test enum validation with pytest.raises(ValidationError) as exc_info: assert TemplatedSequenceElement( - type="GeneElement", region=location_descriptors[0], strand="+" + type="GeneElement", region=sequence_locations[0], strand="+" ) msg = "Input should be " check_validation_error(exc_info, msg) -def test_gene_element(gene_descriptors): +def test_gene_element(gene_examples): """Test that Gene Element initializes correctly.""" - test_element = GeneElement(gene_descriptor=gene_descriptors[0]) + test_element = GeneElement(gene_descriptor=gene_examples[0]) assert test_element.type == "GeneElement" assert test_element.gene_descriptor.id == "gene:G1" assert test_element.gene_descriptor.label == "G1" @@ -609,9 +590,7 @@ def test_gene_element(gene_descriptors): # test enum validation with pytest.raises(ValidationError) as exc_info: - assert GeneElement( - type="UnknownGeneElement", gene_descriptor=gene_descriptors[0] - ) + assert GeneElement(type="UnknownGeneElement", gene_descriptor=gene_examples[0]) msg = "Input should be " check_validation_error(exc_info, msg) @@ -650,7 +629,7 @@ def test_event(): CausativeEvent(event_type="combination") -def test_regulatory_element(regulatory_elements, gene_descriptors): +def test_regulatory_element(regulatory_elements, gene_examples): """Test RegulatoryElement object initializes correctly""" test_reg_elmt = RegulatoryElement(**regulatory_elements[0]) assert test_reg_elmt.regulatory_class.value == "promoter" @@ -661,7 +640,7 @@ def test_regulatory_element(regulatory_elements, gene_descriptors): # check type constraint with pytest.raises(ValidationError) as exc_info: RegulatoryElement( - regulatory_class="notpromoter", associated_gene=gene_descriptors[0] + regulatory_class="notpromoter", associated_gene=gene_examples[0] ) assert exc_info.value.errors()[0]["msg"].startswith("Input should be") @@ -785,7 +764,7 @@ def test_fusion_element_count( unknown_element, gene_elements, transcript_segments, - gene_descriptors, + gene_examples, ): """Test fusion element count requirements.""" # elements are mandatory @@ -846,8 +825,8 @@ def test_fusion_element_count( assert AssayedFusion( type="AssayedFusion", structural_elements=[ - {"type": "GeneElement", "gene_descriptor": gene_descriptors[6]}, - {"type": "GeneElement", "gene_descriptor": gene_descriptors[6]}, + {"type": "GeneElement", "gene_descriptor": gene_examples[6]}, + {"type": "GeneElement", "gene_descriptor": gene_examples[6]}, ], causative_event={ "type": "CausativeEvent", @@ -865,13 +844,13 @@ def test_fusion_element_count( assert AssayedFusion( type="AssayedFusion", structural_elements=[ - {"type": "GeneElement", "gene_descriptor": gene_descriptors[6]}, + {"type": "GeneElement", "gene_descriptor": gene_examples[6]}, ], regulatory_element={ "type": "RegulatoryElement", "regulatory_class": "enhancer", "feature_id": "EH111111111", - "associated_gene": gene_descriptors[6], + "associated_gene": gene_examples[6], }, causative_event={ "type": "CausativeEvent", From fc649b59d493842c6687488b20e44f0240062e48 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 14:45:40 -0400 Subject: [PATCH 46/82] removing incorrect test cases- adding placeholders for now --- tests/test_models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index 15f1899..d7bcc29 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -107,7 +107,7 @@ def sequence_locations(): "end": 204, "label": "NP_001123617.1", "sequenceReference": { - "id": "NM_001130145.3", + "id": "placeholder", "refgetAccession": "SQ.sv5egNzqN5koJQH6w0M4tIK9tEDEfJl7", "type": "SequenceReference", }, @@ -119,7 +119,7 @@ def sequence_locations(): "end": 781, "label": "NP_002520.2", "sequenceReference": { - "id": "NM_001130145.3", + "id": "placeholder", "refgetAccession": "SQ.61McMQgrnlFJ2Vxh3SJ34hTzpxdubu--", "type": "SequenceReference", }, From 1e0c7df6d71ed8fb2ff9f388fbd3b7caef092233 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 15:35:36 -0400 Subject: [PATCH 47/82] fix constructing sequence location --- src/fusor/fusor.py | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index bfb42a5..14fc5ab 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -297,7 +297,6 @@ async def transcript_segment_element( genomic_data.start, genomic_data.start + 1, genomic_data.chr, - transcript=genomic_data.transcript, label=genomic_data.transcript, seq_id_target_namespace=seq_id_target_namespace, ) @@ -307,7 +306,6 @@ async def transcript_segment_element( genomic_data.end, genomic_data.end + 1, genomic_data.chr, - transcript=genomic_data.transcript, label=genomic_data.transcript, seq_id_target_namespace=seq_id_target_namespace, ) @@ -515,8 +513,6 @@ def _sequence_location( start: int, end: int, sequence_id: str, - transcript: str | None = None, - label: str | None = None, seq_id_target_namespace: str | None = None, ) -> SequenceLocation: """Create sequence location @@ -524,10 +520,6 @@ def _sequence_location( :param start: Start position :param end: End position :param sequence_id: Accession for sequence - :param transcript: Associated transcript for the sequence - :param label: label for Sequence Location. If ``None``, ``sequence_id`` will be used as - Sequence Location's ``id`` and ``label`` Else, label will be used as Sequence Location's Sequence Reference's - ``id`` if no transcript is provided. :param seq_id_target_namespace: If want to use digest for ``sequence_id``, set this to the namespace you want the digest for. Otherwise, leave as ``None``. """ @@ -551,19 +543,13 @@ def _sequence_location( else: sequence_id = seq_id - refget_accession = translate_identifier( - self.seqrepo, - transcript, - "ga4gh", - ) + refget_accession = translate_identifier(self.seqrepo, sequence_id) - sequence_location_label = label if label else sequence_id sequence_location = SequenceLocation( - label=sequence_location_label, start=start, end=end, sequence_reference=SequenceReference( - id=transcript, refgetAccession=refget_accession.replace("ga4gh:", "") + id=sequence_id, refgetAccession=refget_accession.replace("ga4gh:", "") ), ) sequence_location_id = ga4gh_identify(sequence_location) @@ -689,7 +675,6 @@ def add_translated_sequence_id( continue loc_descr.location.sequence_id = new_id if fusion.type == "CategoricalFusion" and fusion.criticalFunctionalDomains: - # TODO: unreachable code? for domain in fusion.criticalFunctionalDomains: if ( domain.sequence_location From 33e932b02878b036bd5dc71befa2f1c4cf8ff191 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 15:47:13 -0400 Subject: [PATCH 48/82] fix: casing for sequencelocation --- src/fusor/fusor.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 14fc5ab..5f7dac1 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -297,7 +297,6 @@ async def transcript_segment_element( genomic_data.start, genomic_data.start + 1, genomic_data.chr, - label=genomic_data.transcript, seq_id_target_namespace=seq_id_target_namespace, ) if genomic_data.start @@ -306,7 +305,6 @@ async def transcript_segment_element( genomic_data.end, genomic_data.end + 1, genomic_data.chr, - label=genomic_data.transcript, seq_id_target_namespace=seq_id_target_namespace, ) if genomic_data.end @@ -334,7 +332,6 @@ def templated_sequence_element( end: int, sequence_id: str, strand: Strand, - label: str | None = None, add_location_id: bool = False, residue_mode: ResidueMode = ResidueMode.RESIDUE, seq_id_target_namespace: str | None = None, @@ -345,7 +342,6 @@ def templated_sequence_element( :param end: Genomic end :param sequence_id: Chromosome accession for sequence :param strand: Strand - :param label: Label for genomic location :param add_location_id: ``True`` if ``location_id`` will be added to ``region``. ``False`` otherwise. :param residue_mode: Determines coordinate base used. Must be one of ``residue`` @@ -361,7 +357,6 @@ def templated_sequence_element( start, end, sequence_id, - label=label, seq_id_target_namespace=seq_id_target_namespace, ) @@ -548,7 +543,7 @@ def _sequence_location( sequence_location = SequenceLocation( start=start, end=end, - sequence_reference=SequenceReference( + sequenceReference=SequenceReference( id=sequence_id, refgetAccession=refget_accession.replace("ga4gh:", "") ), ) From 091e23ac8932e3cc66b640f40ae722ccfbdee282 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 16:07:51 -0400 Subject: [PATCH 49/82] updating sequence locations examples --- tests/test_models.py | 76 ++++++++++++++++---------------------------- 1 file changed, 28 insertions(+), 48 deletions(-) diff --git a/tests/test_models.py b/tests/test_models.py index d7bcc29..f76f982 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -50,79 +50,59 @@ def sequence_locations(): """Provide possible sequence_location input.""" return [ { - "id": "NC_000001.11", + "id": "ga4gh:SL.-xC3omZDIKZEuotbbHWQMTC8sS3nOxTb", "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceLocation", + }, "start": 15455, "end": 15456, - # TODO what to put for sequence reference here?? or other places in these examples? }, { - "id": "NC_000001.11", + "id": "ga4gh:SL.-xC3omZDIKZEuotbbHWQMTC8sS3nOxTb", "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceLocation", + }, "start": 15565, "end": 15566, }, - # TODO: no clue what to do here - SequenceLocations don't support anything other than integers for start/end - { - "id": "chr12:p12.1", - "type": "SequenceLocation", - "location": { - "species_id": "taxonomy:9606", - "chr": "12", - "interval": {"start": "p12.1", "end": "p12.1"}, - }, - "label": "chr12:p12.1", - }, { - "id": "chr12:p12.2", + "id": "ga4gh:SL.-xC3omZDIKZEuotbbHWQMTC8sS3nOxTb", "type": "SequenceLocation", - "location": { - "species_id": "taxonomy:9606", - "chr": "12", - "interval": {"start": "p12.2", "end": "p12.2"}, + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceLocation", }, - "label": "chr12:p12.2", - }, - { - "id": "NC_000001.11", - "type": "SequenceLocation", "start": 15455, "end": 15566, - "label": "NC_000001.11:15455-15566", }, { - "id": "chr12:p12.1-p12.2", + "id": "ga4gh:SL.VJLxl42yYoa-0ZMa8dfakhZfcP0nWgpl", "type": "SequenceLocation", - "location": { - "species_id": "taxonomy:9606", - "chr": "12", - "interval": {"start": "p12.1", "end": "p12.2"}, - }, - "label": "chr12:p12.1-p12.2", - }, - { - "id": "NP_001123617.1", - "type": "SequenceLocation", - "start": 171, - "end": 204, - "label": "NP_001123617.1", "sequenceReference": { - "id": "placeholder", + "id": "refseq:NP_001123617.1", "refgetAccession": "SQ.sv5egNzqN5koJQH6w0M4tIK9tEDEfJl7", - "type": "SequenceReference", + "type": "SequenceLocation", }, + "start": 171, + "end": 204, }, { - "id": "NP_002520.2", + "id": "ga4gh:SL.fZQW-qJwKlrVdae-idN_XXee5VTfEOgA", "type": "SequenceLocation", - "start": 510, - "end": 781, - "label": "NP_002520.2", "sequenceReference": { - "id": "placeholder", - "refgetAccession": "SQ.61McMQgrnlFJ2Vxh3SJ34hTzpxdubu--", - "type": "SequenceReference", + "id": "refseq:NP_002520.2", + "refgetAccession": "SQ.vJvm06Wl5J7DXHynR9ksW7IK3_3jlFK6", + "type": "SequenceLocation", }, + "start": 510, + "end": 781, }, ] From 1fbee7cba87b375702afb9bf379fb5039af94779 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 17:03:51 -0400 Subject: [PATCH 50/82] updating tests --- src/fusor/models.py | 4 +- tests/test_models.py | 210 ++++++++++++++++++++++++++----------------- 2 files changed, 128 insertions(+), 86 deletions(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index 6088f6e..bf010fb 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -207,15 +207,13 @@ class LinkerElement(BaseStructuralElement, extra="forbid"): model_config = ConfigDict( arbitrary_types_allowed=True, - # TODO: update this example json once models approved json_schema_extra={ "example": { "type": "LinkerSequenceElement", "linkerSequence": { "id": "sequence:ACGT", - "type": "SequenceDescriptor", + "type": "LiteralSequenceExpression", "sequence": "ACGT", - "residue_type": "SO:0000348", }, } }, diff --git a/tests/test_models.py b/tests/test_models.py index f76f982..79e8630 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -55,7 +55,7 @@ def sequence_locations(): "sequenceReference": { "id": "refseq:NC_000001.11", "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", - "type": "SequenceLocation", + "type": "SequenceReference", }, "start": 15455, "end": 15456, @@ -66,18 +66,52 @@ def sequence_locations(): "sequenceReference": { "id": "refseq:NC_000001.11", "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", - "type": "SequenceLocation", + "type": "SequenceReference", }, "start": 15565, "end": 15566, }, + # TODO: the following 3 examples were made when intervals supported strings and need updated data chr12:p12.1-p12.2. I put in placeholders for now + { + "id": "ga4gh:SL.PPQ-aYd6dsSj7ulUEeqK8xZJP-yPrfdP", + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000012.12", + "refgetAccession": "SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl", + "type": "SequenceReference", + }, + "start": 1, + "end": 2, + }, + { + "id": "ga4gh:SL.OBeSv2B0pURlocL7viFiRwajew_GYGqN", + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000012.12", + "refgetAccession": "SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl", + "type": "SequenceReference", + }, + "start": 2, + "end": 3, + }, + { + "id": "ga4gh:SL.OBeSv2B0pURlocL7viFiRwajew_GYGqN", + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000012.12", + "refgetAccession": "SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl", + "type": "SequenceReference", + }, + "start": 1, + "end": 3, + }, { "id": "ga4gh:SL.-xC3omZDIKZEuotbbHWQMTC8sS3nOxTb", "type": "SequenceLocation", "sequenceReference": { "id": "refseq:NC_000001.11", "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", - "type": "SequenceLocation", + "type": "SequenceReference", }, "start": 15455, "end": 15566, @@ -88,7 +122,7 @@ def sequence_locations(): "sequenceReference": { "id": "refseq:NP_001123617.1", "refgetAccession": "SQ.sv5egNzqN5koJQH6w0M4tIK9tEDEfJl7", - "type": "SequenceLocation", + "type": "SequenceReference", }, "start": 171, "end": 204, @@ -99,7 +133,7 @@ def sequence_locations(): "sequenceReference": { "id": "refseq:NP_002520.2", "refgetAccession": "SQ.vJvm06Wl5J7DXHynR9ksW7IK3_3jlFK6", - "type": "SequenceLocation", + "type": "SequenceReference", }, "start": 510, "end": 781, @@ -203,37 +237,46 @@ def templated_sequence_elements(sequence_locations): @pytest.fixture(scope="module") -def sequence_descriptors(): - """Provide possible SequenceDescriptor input data""" +def literal_sequence_expressions(): + """Provide possible LiteralSequenceExpression input data""" return [ { "id": "sequence:ACGT", - "type": "SequenceDescriptor", + "type": "LiteralSequenceExpression", "sequence": "ACGT", "residueType": "SO:0000348", }, { "id": "sequence:T", - "type": "SequenceDescriptor", + "type": "LiteralSequenceExpression", "sequence": "T", "residueType": "SO:0000348", }, { "id": "sequence:actgu", - "type": "SequenceDescriptor", - "sequence": "actgu", + "type": "LiteralSequenceExpression", + "sequence": "ACTGU", "residueType": "SO:0000348", }, ] @pytest.fixture(scope="module") -def linkers(sequence_descriptors): +def linkers(literal_sequence_expressions): """Provide possible linker element input data.""" return [ - {"type": "LinkerSequenceElement", "linkerSequence": sequence_descriptors[0]}, - {"type": "LinkerSequenceElement", "linkerSequence": sequence_descriptors[1]}, - {"type": "LinkerSequenceElement", "linkerSequence": sequence_descriptors[2]}, + { + "type": "LinkerSequenceElement", + "linkerSequence": literal_sequence_expressions[0], + }, + { + "type": "LinkerSequenceElement", + "linkerSequence": literal_sequence_expressions[1], + }, + { + "type": "LinkerSequenceElement", + "linkerSequence": literal_sequence_expressions[2], + }, ] @@ -271,11 +314,14 @@ def test_functional_domain(functional_domains, gene_examples): assert test_domain.associatedGene.id == "hgnc:16262" assert test_domain.associatedGene.label == "YAP1" test_loc = test_domain.sequenceLocation - assert test_loc.id == "fusor.location_descriptor:NP_001123617.1" + assert "ga4gh:SL" in test_loc.id assert test_loc.type == "SequenceLocation" - assert test_loc.sequence_id == "ga4gh:SQ.sv5egNzqN5koJQH6w0M4tIK9tEDEfJl7" - assert test_loc.start.value == 171 - assert test_loc.end.value == 204 + assert test_loc.start == 171 + assert test_loc.end == 204 + test_ref = test_loc.sequenceReference + assert test_ref.id == "refseq:NP_001123617.1" + assert "SQ." in test_ref.refgetAccession + assert test_ref.type == "SequenceReference" test_domain = FunctionalDomain(**functional_domains[1]) assert test_domain.type == "FunctionalDomain" @@ -284,13 +330,15 @@ def test_functional_domain(functional_domains, gene_examples): assert test_domain.id == "interpro:IPR020635" assert test_domain.associatedGene.id == "hgnc:8031" assert test_domain.associatedGene.label == "NTRK1" - test_loc = test_domain.sequence_location - assert test_loc.id == "fusor.location_descriptor:NP_002520.2" + test_loc = test_domain.sequenceLocation + assert "ga4gh:SL" in test_loc.id assert test_loc.type == "SequenceLocation" - assert test_loc.location.sequence_id == "ga4gh:SQ.vJvm06Wl5J7DXHynR9ksW7IK3_3jlFK6" - assert test_loc.location.interval.type == "SequenceInterval" - assert test_loc.location.interval.start.value == 510 - assert test_loc.location.interval.end.value == 781 + assert test_loc.start == 510 + assert test_loc.end == 781 + test_ref = test_loc.sequenceReference + assert test_ref.id == "refseq:NP_002520.2" + assert "SQ." in test_ref.refgetAccession + assert test_ref.type == "SequenceReference" # test status string with pytest.raises(ValidationError) as exc_info: @@ -298,7 +346,7 @@ def test_functional_domain(functional_domains, gene_examples): status="gained", name="tyrosine kinase catalytic domain", id="interpro:IPR020635", - associated_gene=gene_examples[0], + associatedGene=gene_examples[0], ) msg = "Input should be 'lost' or 'preserved'" check_validation_error(exc_info, msg) @@ -309,7 +357,7 @@ def test_functional_domain(functional_domains, gene_examples): status="lost", label="tyrosine kinase catalytic domain", id="interpro_IPR020635", - associated_gene=gene_examples[0], + associatedGene=gene_examples[0], ) msg = "String should match pattern '^\\w[^:]*:.+$'" check_validation_error(exc_info, msg) @@ -319,54 +367,45 @@ def test_transcript_segment_element(transcript_segments): """Test TranscriptSegmentElement object initializes correctly""" test_element = TranscriptSegmentElement(**transcript_segments[0]) assert test_element.transcript == "refseq:NM_152263.3" - assert test_element.exon_start == 1 - assert test_element.exon_start_offset == -9 - assert test_element.exon_end == 8 - assert test_element.exon_end_offset == 7 - assert test_element.gene_descriptor.id == "gene:G1" - assert test_element.gene_descriptor.label == "G1" - assert test_element.gene_descriptor.gene.gene_id == "hgnc:9339" - test_region_start = test_element.element_genomic_start - assert test_region_start.location.species_id == "taxonomy:9606" - assert test_region_start.location.type == "ChromosomeLocation" - assert test_region_start.location.chr == "12" - assert test_region_start.location.interval.start == "p12.1" - assert test_region_start.location.interval.end == "p12.1" - test_region_end = test_element.element_genomic_end - assert test_region_end.location.species_id == "taxonomy:9606" - assert test_region_end.location.type == "ChromosomeLocation" - assert test_region_end.location.chr == "12" - assert test_region_end.location.interval.start == "p12.2" - assert test_region_end.location.interval.end == "p12.2" + assert test_element.exonStart == 1 + assert test_element.exonStartOffset == -9 + assert test_element.exonEnd == 8 + assert test_element.exonEndOffset == 7 + assert test_element.gene.id == "hgnc:9339" + assert test_element.gene.label == "G1" + test_region_start = test_element.elementGenomicStart + assert test_region_start.type == "SequenceLocation" + test_region_end = test_element.elementGenomicEnd + assert test_region_end.type == "SequenceLocation" test_element = TranscriptSegmentElement(**transcript_segments[3]) assert test_element.transcript == "refseq:NM_938439.4" - assert test_element.exon_start == 7 - assert test_element.exon_start_offset == 0 - assert test_element.exon_end is None - assert test_element.exon_end_offset is None + assert test_element.exonStart == 7 + assert test_element.exonStartOffset == 0 + assert test_element.exonEnd is None + assert test_element.exonEndOffset is None # check CURIE requirement with pytest.raises(ValidationError) as exc_info: TranscriptSegmentElement( transcript="NM_152263.3", - exon_start="1", - exon_start_offset="-9", - exon_end="8", - exon_end_offset="7", - gene_descriptor={ - "id": "test:1", - "gene": {"id": "hgnc:1"}, + exonStart="1", + exonStartOffset="-9", + exonEnd="8", + exonEndOffset="7", + gene={ + "id": "hgnc:1", "label": "G1", }, - element_genomic_start={ + # TODO: get updated values for this from Jeremy + elementGenomicStart={ "location": { "species_id": "taxonomy:9606", "chr": "12", "interval": {"start": "p12.1", "end": "p12.1"}, } }, - element_genomic_end={ + elementGenomicEnd={ "location": { "species_id": "taxonomy:9606", "chr": "12", @@ -382,23 +421,23 @@ def test_transcript_segment_element(transcript_segments): assert TranscriptSegmentElement( type="TemplatedSequenceElement", transcript="NM_152263.3", - exon_start="1", - exon_start_offset="-9", - exon_end="8", - exon_end_offset="7", + exonStart="1", + exonStartOffset="-9", + exonEnd="8", + exonEndOffset="7", gene_descriptor={ "id": "test:1", "gene": {"id": "hgnc:1"}, "label": "G1", }, - element_genomic_start={ + elementGenomicStart={ "location": { "species_id": "taxonomy:9606", "chr": "12", "interval": {"start": "p12.1", "end": "p12.2"}, } }, - element_genomic_end={ + elementGenomicEnd={ "location": { "species_id": "taxonomy:9606", "chr": "12", @@ -414,37 +453,37 @@ def test_transcript_segment_element(transcript_segments): assert TranscriptSegmentElement( element_type="templated_sequence", transcript="NM_152263.3", - exon_start="1", - exon_start_offset="-9", + exonStart="1", + exonStartOffset="-9", gene_descriptor={ "id": "test:1", "gene": {"id": "hgnc:1"}, "label": "G1", }, ) - msg = "Value error, Must give `element_genomic_start` if `exon_start` is given" + msg = "Value error, Must give `elementGenomicStart` if `exonStart` is given" check_validation_error(exc_info, msg) - # Neither exon_start or exon_end given + # Neither exonStart or exonEnd given with pytest.raises(ValidationError) as exc_info: assert TranscriptSegmentElement( type="TranscriptSegmentElement", transcript="NM_152263.3", - exon_start_offset="-9", - exon_end_offset="7", + exonStartOffset="-9", + exonEndOffset="7", gene_descriptor={ "id": "test:1", "gene": {"id": "hgnc:1"}, "label": "G1", }, - element_genomic_start={ + elementGenomicStart={ "location": { "species_id": "taxonomy:9606", "chr": "12", "interval": {"start": "p12.1", "end": "p12.2"}, } }, - element_genomic_end={ + elementGenomicEnd={ "location": { "species_id": "taxonomy:9606", "chr": "12", @@ -452,7 +491,7 @@ def test_transcript_segment_element(transcript_segments): } }, ) - msg = "Value error, Must give values for either `exon_start`, `exon_end`, or both" + msg = "Value error, Must give values for either `exonStart`, `exonEnd`, or both" check_validation_error(exc_info, msg) @@ -461,10 +500,9 @@ def test_linker_element(linkers): def check_linker(actual, expected_id, expected_sequence): assert actual.type == "LinkerSequenceElement" - assert actual.linker_sequence.id == expected_id - assert actual.linker_sequence.sequence == expected_sequence - assert actual.linker_sequence.type == "SequenceDescriptor" - assert actual.linker_sequence.residue_type == "SO:0000348" + assert actual.linkerSequence.id == expected_id + assert actual.linkerSequence.sequence.root == expected_sequence + assert actual.linkerSequence.type == "LiteralSequenceExpression" for args in ( (LinkerElement(**linkers[0]), "sequence:ACGT", "ACGT"), @@ -475,7 +513,13 @@ def check_linker(actual, expected_id, expected_sequence): # check base validation with pytest.raises(ValidationError) as exc_info: - LinkerElement(linker_sequence={"id": "sequence:ACT1", "sequence": "ACT1"}) + LinkerElement(linkerSequence={"id": "sequence:ACT1", "sequence": "ACT1"}) + msg = "String should match pattern '^[A-Z*\\-]*$'" + check_validation_error(exc_info, msg) + + # check valid literal sequence expression + with pytest.raises(ValidationError) as exc_info: + LinkerElement(linkerSequence={"id": "sequence:actgu", "sequence": "actgu"}) msg = "String should match pattern '^[A-Z*\\-]*$'" check_validation_error(exc_info, msg) @@ -483,7 +527,7 @@ def check_linker(actual, expected_id, expected_sequence): with pytest.raises(ValidationError) as exc_info: assert LinkerElement( type="TemplatedSequenceElement", - linker_sequence={"id": "sequence:ATG", "sequence": "ATG"}, + linkerSequence={"id": "sequence:ATG", "sequence": "ATG"}, ) msg = ( "Input should be " @@ -494,7 +538,7 @@ def check_linker(actual, expected_id, expected_sequence): with pytest.raises(ValidationError) as exc_info: assert LinkerElement( type="LinkerSequenceElement", - linker_sequence={"id": "sequence:G", "sequence": "G"}, + linkerSequence={"id": "sequence:G", "sequence": "G"}, bonus_value="bonus", ) msg = "Extra inputs are not permitted" @@ -716,7 +760,7 @@ def test_fusion( structural_elements=[ { "type": "LinkerSequenceElement", - "linker_sequence": { + "linkerSequence": { "id": "a:b", "type": "SequenceDescriptor", "sequence": "AC", @@ -725,7 +769,7 @@ def test_fusion( }, { "type": "LinkerSequenceElement", - "linker_sequence": { + "linkerSequence": { "id": "a:b", "type": "SequenceDescriptor", "sequence": "AC", From c873d6a0087d2cb5cfeb8d5d4a08fc48b5468194 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 17:28:49 -0400 Subject: [PATCH 51/82] updating tests and adding option to getch gene id from alternate field --- src/fusor/models.py | 9 ++- tests/test_models.py | 153 +++++++++++++++++-------------------------- 2 files changed, 66 insertions(+), 96 deletions(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index bf010fb..6b87675 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -453,14 +453,19 @@ def _access_object_attr( def _fetch_gene_id( cls, obj: dict | BaseModel, + alt_field: str | None, ) -> str | None: """Get gene ID if element includes a gene annotation. :param obj: element to fetch gene from. Might not contain a gene (e.g. it's a TemplatedSequenceElement) so we have to use safe checks to fetch. + :param alt_field: the field to fetch the gene from, if it is not called "gene" (ex: associatedGene instead) :return: gene ID if gene is defined """ - gene_info = cls._access_object_attr(obj, "gene") + if alt_field: + gene_info = cls._access_object_attr(obj, alt_field) + else: + gene_info = cls._access_object_attr(obj, "gene") if gene_info: gene_id = cls._access_object_attr(gene_info, "id") if gene_id: @@ -499,7 +504,7 @@ def enforce_element_quantities(cls, values): uq_gene_msg = "Fusions must form a chimeric transcript from two or more genes, or a novel interaction between a rearranged regulatory element with the expressed product of a partner gene." gene_ids = [] if reg_element: - gene_id = cls._fetch_gene_id(obj=reg_element) + gene_id = cls._fetch_gene_id(obj=reg_element, alt_field="associatedGene") if gene_id: gene_ids.append(gene_id) diff --git a/tests/test_models.py b/tests/test_models.py index 79e8630..34cdf20 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -1,7 +1,5 @@ """Module for testing the fusion model.""" -import copy - import pytest from pydantic import ValidationError @@ -554,27 +552,15 @@ def assert_genomic_region_test_element(test): """ assert test.type == "TemplatedSequenceElement" assert test.strand.value == "+" - assert test.region.id == "chr12:p12.1-p12.2" - assert test.region.type == "LocationDescriptor" - assert test.region.location.species_id == "taxonomy:9606" - assert test.region.location.chr == "12" - assert test.region.location.interval.start == "p12.1" - assert test.region.location.interval.end == "p12.2" - assert test.region.label == "chr12:p12.1-p12.2" + assert "ga4gh:SL" in test.region.id + assert test.region.type == "SequenceLocation" + test_ref = test.region.sequenceReference + assert "refseq:" in test_ref.id + assert "SQ." in test_ref.refgetAccession test_element = TemplatedSequenceElement(**templated_sequence_elements[0]) assert_genomic_region_test_element(test_element) - genomic_region_elements_cpy = copy.deepcopy(templated_sequence_elements[0]) - genomic_region_elements_cpy["region"]["location"]["_id"] = "location:1" - test_element = TemplatedSequenceElement(**genomic_region_elements_cpy) - assert_genomic_region_test_element(test_element) - - genomic_region_elements_cpy = copy.deepcopy(templated_sequence_elements[0]) - genomic_region_elements_cpy["region"]["location_id"] = "location:1" - test_element = TemplatedSequenceElement(**genomic_region_elements_cpy) - assert_genomic_region_test_element(test_element) - with pytest.raises(ValidationError) as exc_info: TemplatedSequenceElement( region={"interval": {"start": 39408, "stop": 39414}}, @@ -594,23 +580,10 @@ def assert_genomic_region_test_element(test): def test_gene_element(gene_examples): """Test that Gene Element initializes correctly.""" - test_element = GeneElement(gene_descriptor=gene_examples[0]) + test_element = GeneElement(gene=gene_examples[0]) assert test_element.type == "GeneElement" - assert test_element.gene_descriptor.id == "gene:G1" - assert test_element.gene_descriptor.label == "G1" - assert test_element.gene_descriptor.gene.gene_id == "hgnc:9339" - - # test CURIE requirement - with pytest.raises(ValidationError) as exc_info: - GeneElement( - gene_descriptor={ - "id": "G1", - "gene": {"gene_id": "hgnc:9339"}, - "label": "G1", - } - ) - msg = "String should match pattern '^\\w[^:]*:.+$'" - check_validation_error(exc_info, msg) + assert test_element.gene.id == "hgnc:9339" + assert test_element.gene.label == "G1" # test enum validation with pytest.raises(ValidationError) as exc_info: @@ -646,34 +619,33 @@ def test_mult_gene_element(): def test_event(): """Test Event object initializes correctly""" rearrangement = EventType.REARRANGEMENT - test_event = CausativeEvent(event_type=rearrangement, event_description=None) - assert test_event.event_type == rearrangement + test_event = CausativeEvent(eventType=rearrangement, eventDescription=None) + assert test_event.eventType == rearrangement with pytest.raises(ValueError): # noqa: PT011 - CausativeEvent(event_type="combination") + CausativeEvent(eventType="combination") def test_regulatory_element(regulatory_elements, gene_examples): """Test RegulatoryElement object initializes correctly""" test_reg_elmt = RegulatoryElement(**regulatory_elements[0]) - assert test_reg_elmt.regulatory_class.value == "promoter" - assert test_reg_elmt.associated_gene.id == "gene:G1" - assert test_reg_elmt.associated_gene.gene.gene_id == "hgnc:9339" - assert test_reg_elmt.associated_gene.label == "G1" + assert test_reg_elmt.regulatoryClass.value == "promoter" + assert test_reg_elmt.associatedGene.id == "hgnc:9339" + assert test_reg_elmt.associatedGene.label == "G1" # check type constraint with pytest.raises(ValidationError) as exc_info: RegulatoryElement( - regulatory_class="notpromoter", associated_gene=gene_examples[0] + regulatoryClass="notpromoter", associatedGene=gene_examples[0] ) assert exc_info.value.errors()[0]["msg"].startswith("Input should be") # require minimum input with pytest.raises(ValidationError) as exc_info: - RegulatoryElement(regulatory_class="enhancer") + RegulatoryElement(regulatoryClass="enhancer") assert ( exc_info.value.errors()[0]["msg"] - == "Value error, Must set 1 of {`feature_id`, `associated_gene`} and/or `feature_location`" + == "Value error, Must set 1 of {`featureId`, `associatedGene`} and/or `featureLocation`" ) @@ -689,80 +661,77 @@ def test_fusion( """Test that Fusion object initializes correctly""" # test valid object fusion = CategoricalFusion( - reading_frame_preserved=True, - critical_functional_domains=[functional_domains[0]], - structural_elements=[transcript_segments[1], transcript_segments[2]], - regulatory_element=regulatory_elements[0], + readingFramePreserved=True, + criticalFunctionalDomains=[functional_domains[0]], + structure=[transcript_segments[1], transcript_segments[2]], + regulatoryElement=regulatory_elements[0], ) - assert fusion.structural_elements[0].transcript == "refseq:NM_034348.3" + assert fusion.structure[0].transcript == "refseq:NM_034348.3" # check correct parsing of nested items fusion = CategoricalFusion( - structural_elements=[ + structure=[ { "type": "GeneElement", - "gene_descriptor": { - "type": "GeneDescriptor", - "id": "gene:NTRK1", + "gene": { + "type": "Gene", + "id": "hgnc:8031", "label": "NTRK1", - "gene_id": "hgnc:8031", }, }, { "type": "GeneElement", - "gene_descriptor": { - "type": "GeneDescriptor", - "id": "gene:ABL1", + "gene": { + "type": "Gene", + "id": "hgnc:76", "label": "ABL1", - "gene_id": "hgnc:76", }, }, ], regulatory_element=None, ) - assert fusion.structural_elements[0].type == "GeneElement" - assert fusion.structural_elements[0].gene_descriptor.id == "gene:NTRK1" - assert fusion.structural_elements[1].type == "GeneElement" - assert fusion.structural_elements[1].gene_descriptor.type == "GeneDescriptor" + assert fusion.structure[0].type == "GeneElement" + assert fusion.structure[0].gene.label == "NTRK1" + assert fusion.structure[0].gene.id == "hgnc:8031" + assert fusion.structure[1].type == "GeneElement" + assert fusion.structure[1].gene.type == "Gene" # test that non-element properties are optional - assert CategoricalFusion( - structural_elements=[transcript_segments[1], transcript_segments[2]] - ) + assert CategoricalFusion(structure=[transcript_segments[1], transcript_segments[2]]) # test variety of element types assert AssayedFusion( type="AssayedFusion", - structural_elements=[ + structure=[ unknown_element, gene_elements[0], transcript_segments[2], templated_sequence_elements[1], linkers[0], ], - causative_event={ + causativeEvent={ "type": "CausativeEvent", - "event_type": "rearrangement", - "event_description": "chr2:g.pter_8,247,756::chr11:g.15,825,273_cen_qter (der11) and chr11:g.pter_15,825,272::chr2:g.8,247,757_cen_qter (der2)", + "eventType": "rearrangement", + "eventDescription": "chr2:g.pter_8,247,756::chr11:g.15,825,273_cen_qter (der11) and chr11:g.pter_15,825,272::chr2:g.8,247,757_cen_qter (der2)", }, assay={ "type": "Assay", - "method_uri": "pmid:33576979", - "assay_id": "obi:OBI_0003094", - "assay_name": "fluorescence in-situ hybridization assay", - "fusion_detection": "inferred", + "methodUri": "pmid:33576979", + "assayId": "obi:OBI_0003094", + "assayName": "fluorescence in-situ hybridization assay", + "fusionDetection": "inferred", }, ) with pytest.raises(ValidationError) as exc_info: assert CategoricalFusion( type="CategoricalFusion", - structural_elements=[ + structure=[ { "type": "LinkerSequenceElement", "linkerSequence": { "id": "a:b", - "type": "SequenceDescriptor", + "type": "LiteralSequenceExpression", "sequence": "AC", "residue_type": "SO:0000348", }, @@ -771,7 +740,7 @@ def test_fusion( "type": "LinkerSequenceElement", "linkerSequence": { "id": "a:b", - "type": "SequenceDescriptor", + "type": "LiteralSequenceExpression", "sequence": "AC", "residue_type": "SO:0000348", }, @@ -794,9 +763,9 @@ def test_fusion_element_count( # elements are mandatory with pytest.raises(ValidationError) as exc_info: assert AssayedFusion( - functional_domains=[functional_domains[1]], - causative_event="rearrangement", - regulatory_elements=[regulatory_elements[0]], + functionalDomains=[functional_domains[1]], + causativeEvent="rearrangement", + regulatoryElement=[regulatory_elements[0]], ) element_ct_msg = ( "Value error, Fusions must contain >= 2 structural elements, or >=1 structural element " @@ -807,8 +776,8 @@ def test_fusion_element_count( # must have >= 2 elements + regulatory elements with pytest.raises(ValidationError) as exc_info: assert AssayedFusion( - structural_elements=[unknown_element], - causative_event={ + structure=[unknown_element], + causativeEvent={ "type": "CausativeEvent", "event_type": "rearrangement", "event_description": "chr2:g.pter_8,247,756::chr11:g.15,825,273_cen_qter (der11) and chr11:g.pter_15,825,272::chr2:g.8,247,757_cen_qter (der2)", @@ -826,21 +795,17 @@ def test_fusion_element_count( # unique gene requirements uq_gene_error_msg = "Value error, Fusions must form a chimeric transcript from two or more genes, or a novel interaction between a rearranged regulatory element with the expressed product of a partner gene." with pytest.raises(ValidationError) as exc_info: - assert CategoricalFusion( - structural_elements=[gene_elements[0], gene_elements[0]] - ) + assert CategoricalFusion(structure=[gene_elements[0], gene_elements[0]]) check_validation_error(exc_info, uq_gene_error_msg) with pytest.raises(ValidationError) as exc_info: - assert CategoricalFusion( - structural_elements=[gene_elements[1], transcript_segments[0]] - ) + assert CategoricalFusion(structure=[gene_elements[1], transcript_segments[0]]) check_validation_error(exc_info, uq_gene_error_msg) with pytest.raises(ValidationError) as exc_info: assert CategoricalFusion( - regulatory_element=regulatory_elements[0], - structural_elements=[transcript_segments[0]], + regulatoryElement=regulatory_elements[0], + structure=[transcript_segments[0]], ) check_validation_error(exc_info, uq_gene_error_msg) @@ -848,7 +813,7 @@ def test_fusion_element_count( with pytest.raises(ValidationError) as exc_info: assert AssayedFusion( type="AssayedFusion", - structural_elements=[ + structure=[ {"type": "GeneElement", "gene_descriptor": gene_examples[6]}, {"type": "GeneElement", "gene_descriptor": gene_examples[6]}, ], @@ -867,10 +832,10 @@ def test_fusion_element_count( with pytest.raises(ValidationError) as exc_info: assert AssayedFusion( type="AssayedFusion", - structural_elements=[ + structure=[ {"type": "GeneElement", "gene_descriptor": gene_examples[6]}, ], - regulatory_element={ + regulatoryElement={ "type": "RegulatoryElement", "regulatory_class": "enhancer", "feature_id": "EH111111111", @@ -894,7 +859,7 @@ def test_fusion_abstraction_validator(transcript_segments, linkers): """Test that instantiation of abstract fusion fails.""" # can't create base fusion with pytest.raises(ValidationError) as exc_info: - assert AbstractFusion(structural_elements=[transcript_segments[2], linkers[0]]) + assert AbstractFusion(structure=[transcript_segments[2], linkers[0]]) check_validation_error( exc_info, "Value error, Cannot instantiate Fusion abstract class" ) From ac72ecd11e670002b15a9c38538ee1178b0c4961 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 17:59:23 -0400 Subject: [PATCH 52/82] fix: json schema examples --- src/fusor/models.py | 145 +++++++++++++++++--------------------------- 1 file changed, 56 insertions(+), 89 deletions(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index 6b87675..0160f06 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -78,9 +78,9 @@ class FunctionalDomain(BaseModel): "type": "Gene", }, "sequenceLocation": { - "id": "ga4gh:SL.vJvm06Wl5J7DXHynR9ksW7IK3_3jlFK6", + "id": "ga4gh:SL.ywhUSfEUrwG0E29Q3c47bbuc6gkqTGlO", "start": 510, - "end": "781", + "end": 781, "type": "SequenceLocation", "sequenceReference": { "id": "NC_000022.11", @@ -156,7 +156,6 @@ def check_exons(cls, values): return values model_config = ConfigDict( - # TODO: verify this example json_schema_extra={ "example": { "type": "TranscriptSegmentElement", @@ -240,22 +239,17 @@ class TemplatedSequenceElement(BaseStructuralElement): strand: Strand model_config = ConfigDict( - # TODO: update this example json once models approved json_schema_extra={ "example": { "type": "TemplatedSequenceElement", "region": { - "id": "chr12:44908821-44908822(+)", + "id": "ga4gh:SL.q_LeFVIakQtxnGHgxC4yehpLUxd6QsEr", "type": "SequenceLocation", - "location_id": "ga4gh:VSL.AG54ZRBhg6pwpPLafF4KgaAHpdFio6l5", - "location": { - "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 44908821}, - "end": {"type": "Number", "value": 44908822}, - }, + "start": 44908821, + "end": 44908822, + "sequenceReference": { + "id": "refseq:NC_000012.12", + "refgetAccession": "SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl", }, "label": "chr12:44908821-44908822(+)", }, @@ -378,23 +372,19 @@ def ensure_min_values(cls, values): return values model_config = ConfigDict( - # TODO: update this example json once models approved json_schema_extra={ "example": { "type": "RegulatoryElement", - "regulatory_class": "promoter", - "feature_location": { - "type": "LocationDescriptor", - "id": "fusor.location_descriptor:NC_000001.11", - "location": { - "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", - "type": "SequenceLocation", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 155593}, - "end": {"type": "Number", "value": 155610}, - }, + "regulatoryClass": "promoter", + "featureLocation": { + "id": "ga4gh:SL.9hqdPDfXC-m_t_bDH75FZHfaM6OKDtRw", + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", }, + "start": 155593, + "end": 155610, }, } }, @@ -453,7 +443,7 @@ def _access_object_attr( def _fetch_gene_id( cls, obj: dict | BaseModel, - alt_field: str | None, + alt_field: str | None = None, ) -> str | None: """Get gene ID if element includes a gene annotation. @@ -566,7 +556,6 @@ class Assay(BaseModelForbidExtra): fusionDetection: Evidence | None = None model_config = ConfigDict( - # TODO: verify this example json once models approved json_schema_extra={ "example": { "methodUri": "pmid:33576979", @@ -608,7 +597,6 @@ class CausativeEvent(BaseModelForbidExtra): eventDescription: StrictStr | None = None model_config = ConfigDict( - # TODO: verify this example json once models approved json_schema_extra={ "example": { "type": "CausativeEvent", @@ -632,29 +620,27 @@ class AssayedFusion(AbstractFusion): assay: Assay | None = None model_config = ConfigDict( - # TODO: update this example json once models approved json_schema_extra={ "example": { "type": "AssayedFusion", - "causative_event": { + "causativeEvent": { "type": "CausativeEvent", - "event_type": "rearrangement", - "event_description": "chr2:g.pter_8,247,756::chr11:g.15,825,273_cen_qter (der11) and chr11:g.pter_15,825,272::chr2:g.8,247,757_cen_qter (der2)", + "eventType": "rearrangement", + "eventDescription": "chr2:g.pter_8,247,756::chr11:g.15,825,273_cen_qter (der11) and chr11:g.pter_15,825,272::chr2:g.8,247,757_cen_qter (der2)", }, "assay": { "type": "Assay", - "method_uri": "pmid:33576979", - "assay_id": "obi:OBI_0003094", - "assay_name": "fluorescence in-situ hybridization assay", - "fusion_detection": "inferred", + "methodUri": "pmid:33576979", + "assayId": "obi:OBI_0003094", + "assayName": "fluorescence in-situ hybridization assay", + "fusionDetection": "inferred", }, "structure": [ { "type": "GeneElement", "gene": { "type": "Gene", - "id": "gene:EWSR1", - "gene_id": "hgnc:3058", + "id": "hgnc:3058", "label": "EWSR1", }, }, @@ -686,22 +672,20 @@ class CategoricalFusion(AbstractFusion): structure: CategoricalFusionElements model_config = ConfigDict( - # TODO: update this example json once models approved json_schema_extra={ "example": { "type": "CategoricalFusion", - "reading_frame_preserved": True, - "critical_functional_domains": [ + "readingFramePreserved": True, + "criticalFunctionalDomains": [ { "type": "FunctionalDomain", "status": "lost", "label": "cystatin domain", "id": "interpro:IPR000010", - "associated_gene": { - "id": "gene:CST1", - "gene_id": "hgnc:2743", + "associatedGene": { + "id": "hgnc:2743", "label": "CST1", - "type": "GeneDescriptor", + "type": "Gene", }, } ], @@ -709,59 +693,42 @@ class CategoricalFusion(AbstractFusion): { "type": "TranscriptSegmentElement", "transcript": "refseq:NM_152263.3", - "exon_start": 1, - "exon_start_offset": 0, - "exon_end": 8, - "exon_end_offset": 0, - "gene_descriptor": { - "id": "gene:TPM3", - "gene_id": "hgnc:12012", - "type": "GeneDescriptor", + "exonStart": 1, + "exonStartOffset": 0, + "exonEnd": 8, + "exonEndOffset": 0, + "gene": { + "id": "hgnc:12012", + "type": "Gene", "label": "TPM3", }, - "element_genomic_start": { + "elementGenomicStart": { "id": "TPM3:exon1", - "type": "LocationDescriptor", - "location_id": "ga4gh:VSL.vyyyExx4enSZdWZr3z67-T8uVKH50uLi", - "location": { - "sequence_id": "ga4gh:SQ.ijXOSP3XSsuLWZhXQ7_TJ5JXu4RJO6VT", - "type": "SequenceLocation", - "interval": { - "start": {"type": "Number", "value": 154192135}, - "end": {"type": "Number", "value": 154192136}, - "type": "SequenceInterval", - }, - }, + "type": "SequenceLocation", + "start": 154192135, + "end": 154192136, }, - "element_genomic_end": { + "elementGenomicEnd": { "id": "TPM3:exon8", - "type": "LocationDescriptor", - "location_id": "ga4gh:VSL._1bRdL4I6EtpBvVK5RUaXb0NN3k0gpqa", - "location": { - "sequence_id": "ga4gh:SQ.ijXOSP3XSsuLWZhXQ7_TJ5JXu4RJO6VT", - "type": "SequenceLocation", - "interval": { - "start": {"type": "Number", "value": 154170398}, - "end": {"type": "Number", "value": 154170399}, - "type": "SequenceInterval", - }, - }, + "type": "SequenceLocation", + "start": 154170398, + "end": 154170399, }, }, { - "type": "Gene", - "id": "gene:ALK", - "gene_id": "hgnc:427", - "label": "ALK", + "type": "GeneElement", + "gene": { + "id": "hgnc:427", + "label": "ALK", + }, }, ], - "regulatory_element": { + "regulatoryElement": { "type": "RegulatoryElement", - "regulatory_class": "promoter", - "associated_gene": { - "id": "gene:BRAF", - "type": "GeneDescriptor", - "gene_id": "hgnc:1097", + "regulatoryClass": "promoter", + "associatedGene": { + "type": "Gene", + "id": "hgnc:1097", "label": "BRAF", }, }, From 3a9c20a790cfe3f54718fcc28831d7d0661ea881 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 18:07:23 -0400 Subject: [PATCH 53/82] wip: updating fusor tests --- tests/test_fusor.py | 101 ++++++++++++++++---------------------------- 1 file changed, 37 insertions(+), 64 deletions(-) diff --git a/tests/test_fusor.py b/tests/test_fusor.py index 20e612e..fb1965e 100644 --- a/tests/test_fusor.py +++ b/tests/test_fusor.py @@ -3,6 +3,8 @@ import copy import pytest +from ga4gh.core.domain_models import Gene +from ga4gh.vrs.models import SequenceLocation from fusor.exceptions import FUSORParametersException from fusor.models import ( @@ -23,24 +25,23 @@ @pytest.fixture(scope="module") def braf_gene_descr_min(): """Create minimal gene descriptor for BRAF""" - return GeneDescriptor(id="normalize.gene:BRAF", label="BRAF", gene_id="hgnc:1097") + return Gene(label="BRAF", id="hgnc:1097") @pytest.fixture(scope="module") def braf_gene_descr(braf_gene_descriptor): """Create gene descriptor object for braf""" - return GeneDescriptor(**braf_gene_descriptor) + return Gene(**braf_gene_descriptor) @pytest.fixture(scope="module") def linker_element(): """Create linker element test fixture.""" params = { - "linker_sequence": { + "linkerSequence": { "id": "fusor.sequence:ACT", "sequence": "ACT", - "residue_type": "SO:0000348", - "type": "SequenceDescriptor", + "type": "LiteralSequenceExpression", }, "type": "LinkerSequenceElement", } @@ -52,20 +53,12 @@ def location_descriptor_braf_domain(): """Create location descriptor fixture for BRAF catalytic domain""" params = { "id": "fusor.location_descriptor:NP_004324.2", - "type": "LocationDescriptor", - "location": { - "sequence_id": "refseq:NP_004324.2", - "type": "SequenceLocation", - "interval": { - "start": {"type": "Number", "value": 458}, - "end": { - "type": "Number", - "value": 712, - }, - }, - }, + "type": "SequenceLocation", + "sequenceReference": {"id": "", "refgetAccession": ""}, + "start": 458, + "end": 712, } - return LocationDescriptor(**params) + return SequenceLocation(**params) @pytest.fixture(scope="module") @@ -73,20 +66,12 @@ def location_descriptor_braf_domain_seq_id(): """Create location descriptor fixture for BRAF catalytic domain""" params = { "id": "fusor.location_descriptor:NP_004324.2", - "type": "LocationDescriptor", - "location": { - "sequence_id": "ga4gh:SQ.cQvw4UsHHRRlogxbWCB8W-mKD4AraM9y", - "type": "SequenceLocation", - "interval": { - "start": {"type": "Number", "value": 458}, - "end": { - "type": "Number", - "value": 712, - }, - }, - }, + "type": "SequenceLocation", + "sequenceReference": {"id": "", "refgetAccession": ""}, + "start": 458, + "end": 712, } - return LocationDescriptor(**params) + return SequenceLocation(**params) @pytest.fixture(scope="module") @@ -96,8 +81,8 @@ def functional_domain_min(braf_gene_descr_min, location_descriptor_braf_domain): "status": "preserved", "label": "Serine-threonine/tyrosine-protein kinase, catalytic domain", "id": "interpro:IPR001245", - "associated_gene": braf_gene_descr_min, - "sequence_location": location_descriptor_braf_domain, + "associatedGene": braf_gene_descr_min, + "sequenceLocation": location_descriptor_braf_domain, } return FunctionalDomain(**params) @@ -109,8 +94,8 @@ def functional_domain(braf_gene_descr, location_descriptor_braf_domain): "status": "preserved", "label": "Serine-threonine/tyrosine-protein kinase, catalytic domain", "id": "interpro:IPR001245", - "associated_gene": braf_gene_descr, - "sequence_location": location_descriptor_braf_domain, + "associatedGene": braf_gene_descr, + "sequenceLocation": location_descriptor_braf_domain, } return FunctionalDomain(**params) @@ -124,8 +109,8 @@ def functional_domain_seq_id( "status": "preserved", "label": "Serine-threonine/tyrosine-protein kinase, catalytic domain", "id": "interpro:IPR001245", - "associated_gene": braf_gene_descr_min, - "sequence_location": location_descriptor_braf_domain_seq_id, + "associatedGene": braf_gene_descr_min, + "sequenceLocation": location_descriptor_braf_domain_seq_id, } return FunctionalDomain(**params) @@ -135,8 +120,8 @@ def regulatory_element(braf_gene_descr): """Create regulatory element test fixture.""" params = { "type": "RegulatoryElement", - "regulatory_class": "promoter", - "associated_gene": braf_gene_descr, + "regulatoryClass": "promoter", + "associatedGene": braf_gene_descr, } return RegulatoryElement(**params) @@ -144,7 +129,7 @@ def regulatory_element(braf_gene_descr): @pytest.fixture(scope="module") def regulatory_element_min(braf_gene_descr_min): """Create regulatory element test fixture with minimal gene descriptor.""" - params = {"regulatory_class": "promoter", "associated_gene": braf_gene_descr_min} + params = {"regulatoryClass": "promoter", "associatedGene": braf_gene_descr_min} return RegulatoryElement(**params) @@ -153,18 +138,12 @@ def location_descriptor_tpm3(): """Create location descriptor test fixture.""" params = { "id": "fusor.location_descriptor:NM_152263.3", - "type": "LocationDescriptor", - "location": { - "sequence_id": "refseq:NM_152263.3", - "type": "SequenceLocation", - "interval": { - "start": {"type": "Number", "value": 154170398}, - "end": {"type": "Number", "value": 154170399}, - "type": "SequenceInterval", - }, - }, + "type": "SequenceLocation", + "sequenceReference": {"id": "", "refgetAccession": ""}, + "start": 154170398, + "end": 154170399, } - return LocationDescriptor(**params) + return SequenceLocation(**params) @pytest.fixture(scope="module") @@ -174,16 +153,10 @@ def templated_sequence_element(): "type": "TemplatedSequenceElement", "region": { "id": "fusor.location_descriptor:NC_000001.11", - "type": "LocationDescriptor", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NC_000001.11", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 99}, - "end": {"type": "Number", "value": 150}, - }, - }, + "type": "SequenceLocation", + "sequenceReference": {"id": "", "refgetAccession": ""}, + "start": 99, + "end": 150, }, "strand": "+", } @@ -478,12 +451,12 @@ def test_add_location_id(fusor_instance, fusion_example, exhaustive_example): def test__normalized_gene_descriptor(fusor_instance): """Test that _normalized_gene_descriptor works correctly.""" # Actual response is tested in test_add_gene_descriptor - resp = fusor_instance._normalized_gene_descriptor("BRAF") + resp = fusor_instance._normalized_gene("BRAF") assert resp[0] assert resp[1] is None - assert isinstance(resp[0], GeneDescriptor) + assert isinstance(resp[0], Gene) - resp = fusor_instance._normalized_gene_descriptor("B R A F") + resp = fusor_instance._normalized_gene("B R A F") assert resp[0] is None assert resp[1] == "gene-normalizer unable to normalize B R A F" From ab3de62f8e874c5f7f70a22a4098bd30afbba814 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 18:15:24 -0400 Subject: [PATCH 54/82] update nomenclature to use new models --- src/fusor/nomenclature.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/fusor/nomenclature.py b/src/fusor/nomenclature.py index ce1f675..81415d4 100644 --- a/src/fusor/nomenclature.py +++ b/src/fusor/nomenclature.py @@ -35,15 +35,15 @@ def reg_element_nomenclature(element: RegulatoryElement, sr: SeqRepo) -> str: if element.featureId: feature_string += f"_{element.featureId}" elif element.featureLocation: - start = element.featureLocation + feature_location = element.featureLocation # TODO: update this with new model - sequence_id = start.location.sequence_id + sequence_id = feature_location.sequenceReference.id refseq_id = str(translate_identifier(sr, sequence_id, "refseq")).split(":")[1] try: chrom = str(translate_identifier(sr, sequence_id, "GRCh38")).split(":")[1] except IDTranslationException as e: raise ValueError from e - feature_string += f"_{refseq_id}(chr {chrom}):g.{start.location.interval.start.value}_{start.location.interval.end.value}" + feature_string += f"_{refseq_id}(chr {chrom}):g.{feature_location.start}_{feature_location.end}" if element.associatedGene: if element.associatedGene.id: gene_id = element.associatedGene.id @@ -95,13 +95,14 @@ def templated_seq_nomenclature(element: TemplatedSequenceElement, sr: SeqRepo) - :raises ValueError: if location isn't a SequenceLocation or if unable to retrieve region or location """ - if element.region and element.region.location: - location = element.region.location - if isinstance(location, SequenceLocation): - sequence_id = str(location.sequence_id) + region = element.region + if region: + sequence_reference = element.region.sequenceReference + if isinstance(sequence_reference, SequenceLocation): + sequence_id = str(sequence_reference.id) refseq_id = str(translate_identifier(sr, sequence_id, "refseq")) - start = location.interval.start.value - end = location.interval.end.value + start = region.start + end = region.end try: chrom = str(translate_identifier(sr, sequence_id, "GRCh38")).split(":")[ 1 From 6bf5f01503e9c1ae27d96b7628e6e6029aab37b8 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 22 Jul 2024 18:19:23 -0400 Subject: [PATCH 55/82] remove completed todo --- src/fusor/nomenclature.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/fusor/nomenclature.py b/src/fusor/nomenclature.py index 81415d4..4939bf8 100644 --- a/src/fusor/nomenclature.py +++ b/src/fusor/nomenclature.py @@ -36,7 +36,6 @@ def reg_element_nomenclature(element: RegulatoryElement, sr: SeqRepo) -> str: feature_string += f"_{element.featureId}" elif element.featureLocation: feature_location = element.featureLocation - # TODO: update this with new model sequence_id = feature_location.sequenceReference.id refseq_id = str(translate_identifier(sr, sequence_id, "refseq")).split(":")[1] try: From b952c9c3898b081d30a50710190f1d528587a562 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Tue, 23 Jul 2024 12:59:33 -0400 Subject: [PATCH 56/82] wip: updating fusor tests --- tests/test_fusor.py | 51 ++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/tests/test_fusor.py b/tests/test_fusor.py index fb1965e..1272572 100644 --- a/tests/test_fusor.py +++ b/tests/test_fusor.py @@ -52,9 +52,13 @@ def linker_element(): def location_descriptor_braf_domain(): """Create location descriptor fixture for BRAF catalytic domain""" params = { - "id": "fusor.location_descriptor:NP_004324.2", + "id": "ga4gh:SL.Lm-hzZHlA8FU_cYaOtAIbMLdf4Kk-SF8", "type": "SequenceLocation", - "sequenceReference": {"id": "", "refgetAccession": ""}, + "sequenceReference": { + "id": "refseq:NP_004324.2", + "refgetAccession": "SQ.cQvw4UsHHRRlogxbWCB8W-mKD4AraM9y", + "type": "SequenceReference", + }, "start": 458, "end": 712, } @@ -65,9 +69,13 @@ def location_descriptor_braf_domain(): def location_descriptor_braf_domain_seq_id(): """Create location descriptor fixture for BRAF catalytic domain""" params = { - "id": "fusor.location_descriptor:NP_004324.2", + "id": "ga4gh:SL.Lm-hzZHlA8FU_cYaOtAIbMLdf4Kk-SF8", "type": "SequenceLocation", - "sequenceReference": {"id": "", "refgetAccession": ""}, + "sequenceReference": { + "id": "refseq:NP_004324.2", + "refgetAccession": "SQ.cQvw4UsHHRRlogxbWCB8W-mKD4AraM9y", + "type": "SequenceReference", + }, "start": 458, "end": 712, } @@ -137,9 +145,13 @@ def regulatory_element_min(braf_gene_descr_min): def location_descriptor_tpm3(): """Create location descriptor test fixture.""" params = { - "id": "fusor.location_descriptor:NM_152263.3", + "id": "ga4gh:SL.0cMJgKuY32ate6k95oLua6vv8JAJ4PzO", "type": "SequenceLocation", - "sequenceReference": {"id": "", "refgetAccession": ""}, + "sequenceReference": { + "id": "NM_152263.3", + "refgetAccession": "SQ.ijXOSP3XSsuLWZhXQ7_TJ5JXu4RJO6VT", + "type": "SequenceReference", + }, "start": 154170398, "end": 154170399, } @@ -152,9 +164,13 @@ def templated_sequence_element(): params = { "type": "TemplatedSequenceElement", "region": { - "id": "fusor.location_descriptor:NC_000001.11", + "id": "ga4gh:SL.U7-HtnKxK9kKI1ZINiDM_m4I6O-p4Dc9", "type": "SequenceLocation", - "sequenceReference": {"id": "", "refgetAccession": ""}, + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceReference", + }, "start": 99, "end": 150, }, @@ -169,17 +185,10 @@ def templated_sequence_element_ensg(): params = { "type": "TemplatedSequenceElement", "region": { - "id": "fusor.location_descriptor:ENSG00000157764", - "type": "LocationDescriptor", - "location": { - "type": "SequenceLocation", - "sequence_id": "ensembl:ENSG00000157764", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 140719328}, - "end": {"type": "Number", "value": 140719400}, - }, - }, + "id": "ENSG00000157764", + "type": "SequenceLocation", + "start": 140719328, + "end": 140719400, }, "strand": "-", } @@ -448,8 +457,8 @@ def test_add_location_id(fusor_instance, fusion_example, exhaustive_example): ) -def test__normalized_gene_descriptor(fusor_instance): - """Test that _normalized_gene_descriptor works correctly.""" +def test__normalized_gene(fusor_instance): + """Test that _normalized_gene works correctly.""" # Actual response is tested in test_add_gene_descriptor resp = fusor_instance._normalized_gene("BRAF") assert resp[0] From 16def6be480ac041fe3dd1c4fb20abf5df842dc4 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Tue, 23 Jul 2024 14:02:52 -0400 Subject: [PATCH 57/82] Update locations for mane transcript segment fixture/tests --- tests/test_fusor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_fusor.py b/tests/test_fusor.py index 1272572..e35d2b3 100644 --- a/tests/test_fusor.py +++ b/tests/test_fusor.py @@ -290,8 +290,8 @@ def mane_transcript_segment_element(): "label": "NC_000011.10", "location": { "interval": { - "end": {"type": "Number", "value": 9576094}, - "start": {"type": "Number", "value": 9576093}, + "end": {"type": "Number", "value": 9575887}, + "start": {"type": "Number", "value": 9575886}, "type": "SequenceInterval", }, "sequence_id": "refseq:NC_000011.10", @@ -750,7 +750,7 @@ async def test_transcript_segment_element( tsg = await fusor_instance.transcript_segment_element( tx_to_genomic_coords=False, chromosome="NC_000011.10", - start=9576094, + start=9575887, gene="WEE1", ) assert tsg[0] From a624de52f8556b24c692ef3d9214b1aa1f6c2716 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Tue, 23 Jul 2024 17:21:02 -0400 Subject: [PATCH 58/82] wip: update tests with new models --- tests/test_nomenclature.py | 125 ++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 71 deletions(-) diff --git a/tests/test_nomenclature.py b/tests/test_nomenclature.py index 90a223e..90ee02c 100644 --- a/tests/test_nomenclature.py +++ b/tests/test_nomenclature.py @@ -1,6 +1,7 @@ """Test nomenclature generation.""" import pytest +from ga4gh.core.domain_models import Gene from fusor.models import AssayedFusion, CategoricalFusion, TranscriptSegmentElement from fusor.nomenclature import tx_segment_nomenclature @@ -11,38 +12,36 @@ def reg_example(): """Nonsense fusion testing correct regulatory element description.""" return AssayedFusion( type="AssayedFusion", - regulatory_element={ + regulatoryElement={ "type": "RegulatoryElement", - "regulatory_class": "riboswitch", - "associated_gene": { - "id": "normalize.gene:ABL1", - "type": "GeneDescriptor", + "regulatoryClass": "riboswitch", + "associatedGene": { + "type": "Gene", "label": "ABL1", - "gene_id": "hgnc:76", + "id": "hgnc:76", }, }, - structural_elements=[ + structure=[ { "type": "GeneElement", - "gene_descriptor": { - "id": "normalize.gene:BCR", - "type": "GeneDescriptor", + "gene": { + "type": "Gene", "label": "BCR", - "gene_id": "hgnc:1014", + "id": "hgnc:1014", }, }, {"type": "UnknownGeneElement"}, ], - causative_event={ + causativeEvent={ "type": "CausativeEvent", - "event_type": "rearrangement", + "eventType": "rearrangement", }, assay={ "type": "Assay", - "assay_name": "a", - "assay_id": "a:b", - "method_uri": "a:b", - "fusion_detection": "observed", + "assayName": "a", + "assayId": "a:b", + "methodUti": "a:b", + "fusionDetection": "observed", }, ) @@ -52,50 +51,45 @@ def reg_location_example(): """Nonsense fusion testing correct regulatory element description.""" return AssayedFusion( type="AssayedFusion", - regulatory_element={ + regulatoryElement={ "type": "RegulatoryElement", - "regulatory_class": "promoter", - "associated_gene": { - "id": "normalize.gene:P2RY8", - "type": "GeneDescriptor", + "regulatoryClass": "promoter", + "associatedGene": { + "type": "Gene", "label": "P2RY8", - "gene_id": "hgnc:15524", + "id": "hgnc:15524", }, - "feature_location": { - "type": "LocationDescriptor", - "id": "fusor.location_descriptor:NC_000023.11", - "location": { - "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", - "type": "SequenceLocation", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 1462581}, - "end": {"type": "Number", "value": 1534182}, - }, + "featureLocation": { + "type": "SequenceLocation", + "id": "ga4gh:SL.KMHXvX8m5fD8PcGlQu2Vja3m7bt2iqfK", + "sequenceReference": { + "id": "refseq:NC_000023.11", + "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", }, + "start": 1462581, + "end": 1534182, }, }, structural_elements=[ { "type": "GeneElement", - "gene_descriptor": { - "id": "normalize.gene:SOX5", - "type": "GeneDescriptor", + "gene": { + "type": "Gene", "label": "SOX5", - "gene_id": "hgnc:11201", + "id": "hgnc:11201", }, }, ], causative_event={ "type": "CausativeEvent", - "event_type": "rearrangement", + "eventType": "rearrangement", }, assay={ "type": "Assay", - "assay_name": "a", - "assay_id": "a:b", - "method_uri": "a:b", - "fusion_detection": "observed", + "assayName": "a", + "assayId": "a:b", + "methodUri": "a:b", + "fusionDetection": "observed", }, ) @@ -108,37 +102,31 @@ def exon_offset_example(): structural_elements=[ { "type": "GeneElement", - "gene_descriptor": { - "id": "normalize.gene:BRAF", - "type": "GeneDescriptor", + "gene": { + "type": "Gene", "label": "BRAF", - "gene_id": "hgnc:1097", + "id": "hgnc:1097", }, }, { "type": "TranscriptSegmentElement", "transcript": "refseq:NM_002529.3", - "exon_start": 2, - "exon_start_offset": 20, - "gene_descriptor": { - "id": "normalize.gene:NTRK1", - "type": "GeneDescriptor", + "exonStart": 2, + "exonStartOffset": 20, + "gene": { + "type": "Gene", "label": "NTRK1", - "gene_id": "hgnc:8031", + "id": "hgnc:8031", }, - "element_genomic_start": { - "id": "fusor.location_descriptor:NC_000001.11", - "type": "LocationDescriptor", - "label": "NC_000001.11", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NC_000001.11", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 156864428}, - "end": {"type": "Number", "value": 156864429}, - }, + "elementGenomicStart": { + "id": "ga4gh:SL.XEvDpRaKgoeQuQrhRwGzGK2uanHY4en8", + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", }, + "start": {"type": "Number", "value": 156864428}, + "end": {"type": "Number", "value": 156864429}, }, }, ], @@ -155,12 +143,7 @@ def tx_seg_example(): exon_start_offset=0, exon_end=8, exon_end_offset=0, - gene_descriptor={ - "id": "normalize.gene:TPM3", - "type": "GeneDescriptor", - "label": "TPM3", - "gene_id": "hgnc:12012", - }, + gene=Gene(id="hgnc:12012", label="TPM3"), element_genomic_start={ "id": "fusor.location_descriptor:NC_000001.11", "type": "LocationDescriptor", From 6a6211b79760e32ee066382d014912e7c9708016 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Wed, 24 Jul 2024 11:46:21 -0400 Subject: [PATCH 59/82] update tests and examples with new models --- src/fusor/examples/alk.json | 48 ++--- src/fusor/examples/bcr_abl1.json | 169 +++++++----------- src/fusor/examples/bcr_abl1_expanded.json | 151 ++++++---------- src/fusor/examples/ewsr1.json | 21 ++- src/fusor/examples/ewsr1_elements_only.json | 9 +- src/fusor/examples/ewsr1_no_assay.json | 13 +- .../examples/ewsr1_no_causative_event.json | 17 +- src/fusor/examples/igh_myc.json | 22 ++- src/fusor/examples/tpm3_ntrk1.json | 94 ++++------ src/fusor/examples/tpm3_pdgfrb.json | 160 +++++++---------- src/fusor/fusor.py | 4 +- tests/conftest.py | 34 ++-- tests/test_fusor.py | 139 +++++++------- tests/test_models.py | 36 ++-- tests/test_nomenclature.py | 94 +++++----- 15 files changed, 424 insertions(+), 587 deletions(-) diff --git a/src/fusor/examples/alk.json b/src/fusor/examples/alk.json index d536e6b..9939f69 100644 --- a/src/fusor/examples/alk.json +++ b/src/fusor/examples/alk.json @@ -1,49 +1,39 @@ { "type": "CategoricalFusion", - "critical_functional_domains": [ + "criticalFunctionalDomains": [ { "type": "FunctionalDomain", - "_id": "interpro:IPR017441", + "id": "interpro:IPR017441", "label": "Protein kinase, ATP binding site", "status": "preserved", - "associated_gene": { - "id": "normalize.gene:hgnc%3A427", - "type": "GeneDescriptor", + "associatedGene": { + "type": "Gene", "label": "ALK", - "gene_id": "hgnc:427" + "id": "hgnc:427" }, - "sequence_location": { - "id": "fusor.location_descriptor:NP_004295.2", - "type": "LocationDescriptor", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NP_004295.2", - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 1122 - }, - "end": { - "type": "Number", - "value": 1150 - } - } - } + "sequenceLocation": { + "id": "ga4gh:SL.zKwNiezVOyfNBKwAnFuFWMdMlrcc3kBA", + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NP_004295.2", + "refgetAccession": "SQ.q9CnK-HKWh9eqhOi8FlzR7M0pCmUrWPs", + "type": "SequenceReference" + }, + "start": 1122, + "end": 1150 } } ], - "structural_elements": [ + "structure": [ { "type": "MultiplePossibleGenesElement" }, { "type": "GeneElement", - "gene_descriptor": { - "id": "normalize.gene:ALK", - "type": "GeneDescriptor", + "gene": { + "type": "Gene", "label": "ALK", - "gene_id": "hgnc:427" + "id": "hgnc:427" } } ] diff --git a/src/fusor/examples/bcr_abl1.json b/src/fusor/examples/bcr_abl1.json index c033117..a2b0caf 100644 --- a/src/fusor/examples/bcr_abl1.json +++ b/src/fusor/examples/bcr_abl1.json @@ -1,133 +1,94 @@ { "type": "CategoricalFusion", - "structure": { - "type": "Adjacency", - "adjoinedSequences": [ - { + "structure": [ + { + "type": "TranscriptSegmentElement", + "transcript": "refseq:NM_004327.3", + "gene": { + "type": "Gene", + "id": "hgnc:1014", + "label": "BCR" + }, + "elementGenomicEnd": { + "id": "ga4gh:SL.5f5OVC3zReewA4S78X1eO0oJwWNbGYsY", "type": "SequenceLocation", + "description": null, + "xrefs": null, + "alternate_labels": null, + "extensions": null, "sequenceReference": { - "id": "GRCh38:chr22", - "type": "SequenceReference", + "id": "refseq:NC_000022.11", "refgetAccession": "SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", - "residueAlphabet": "na" - }, - "end": 23290413, - "extensions": [ - { - "name": "NM_004327.4:e._14", - "description": "VICC exon representation of the aligned transcript boundary.", - "value": { - "exon_end": 14, - "exon_end_offset": 0, - "sequenceReference":{ - "type": "SequenceReference", - "id": "NM_004327.4", - "refgetAccession": "SQ.kpytJsXw3BwLC3oBSjHQS1kwxs4WO3I3", - "residueAlphabet": "na" - } - } + "type": "SequenceReference" }, - { - "name": "NM_004327.4:c._2782", - "description": "Transcript SequenceLocation of the aligned transcript boundary.", - "value": { - "type": "SequenceLocation", - "sequenceReference": { - "id": "NM_004327.4", - "type": "SequenceReference", - "refgetAccession": "SQ.kpytJsXw3BwLC3oBSjHQS1kwxs4WO3I3", - "residueAlphabet": "na" - }, - "end": 3234 - } - }, - { - "name": "gene", - "description": "The gene concept (BCR) associated with this fusion partner.", - "value": { - "code": "hgnc:1014", - "system": "https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/", - "label": "BCR" - } - } - ]}, - { + "start": 23253980, + "end": 23253981 + }, + "exonEnd": 2, + "exonEndOffset": 182 + }, + { + "type": "LinkerSequenceElement", + "linkerSequence": { + "id": "sequence:ACTAAAGCG", + "type": "SequenceDescriptor", + "sequence": "ACTAAAGCG" + } + }, + { + "type": "TranscriptSegmentElement", + "transcript": "refseq:NM_005157.5", + "exonStart": 2, + "exonStartOffset": -173, + "gene": { + "type": "Gene", + "label": "ABL1", + "id": "hgnc:76" + }, + "elementGenomicStart": { + "id": "ga4gh:SL.tZYgaEJP2-d4Guv-n5gyhqOc07qH9xr6", + "description": null, + "xrefs": null, + "alternate_labels": null, + "extensions": null, "type": "SequenceLocation", "sequenceReference": { - "id": "GRCh38:chr9", - "type": "SequenceReference", + "id": "refseq:NC_000009.12", "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI", - "residueAlphabet": "na" + "type": "SequenceReference" }, "start": 130854064, - "extensions": [ - { - "name": "NM_005157.6:e.2_", - "description": "VICC exon representation of the aligned transcript boundary.", - "value": { - "exon_start": 2, - "exon_start_offset": 0, - "sequenceReference":{ - "id": "NM_005157.6", - "type": "SequenceReference", - "refgetAccession": "SQ.w8Qg3x-PQ2akJrJQeGEN-_eBUMo1H1CL", - "residueAlphabet": "na" - } - } - }, - { - "name": "NM_005157.6:c.80_", - "description": "Transcript SequenceLocation of the aligned transcript boundary.", - "value": { - "type": "SequenceLocation", - "sequenceReference": { - "id": "NM_005157.6", - "type": "SequenceReference", - "refgetAccession": "SQ.w8Qg3x-PQ2akJrJQeGEN-_eBUMo1H1CL", - "residueAlphabet": "na" - }, - "end": 273 - } - }, - { - "name": "gene", - "description": "The gene concept (ABL1) associated with this fusion partner.", - "value": { - "code": "hgnc:76", - "system": "https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/", - "label": "ABL1" - } - } - ] - }], - "linker": { - "type": "LiteralSequenceExpression", - "sequence": "CCCGTC" + "end": 130854065 + } } - }, + ], "readingFramePreserved": true, "criticalFunctionalDomains": [ { "type": "FunctionalDomain", "status": "preserved", - "gene": { - "code": "hgnc:76", - "system": "https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/", - "label": "ABL1" + "associatedGene": { + "type": "Gene", + "label": "ABL1", + "id": "hgnc:76" }, "id": "interpro:IPR000980", "label": "SH2 domain", "sequenceLocation": { + "id": "ga4gh:SL.VQe2sf2aYArPcvjygq38JvFxRuDniE15", + "description": null, + "xrefs": null, + "alternate_labels": null, + "extensions": null, "type": "SequenceLocation", "sequenceReference": { - "id": "GRCh38:chr22", - "type": "SequenceReference", - "refgetAccession": "SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", - "residueAlphabet": "na" + "id": "refseq:NP_005148.2", + "refgetAccession": "SQ.dmFigTG-0fY6I54swb7PoDuxCeT6O3Wg", + "type": "SequenceReference" }, "start": 127, "end": 202 } } ] -} \ No newline at end of file +} diff --git a/src/fusor/examples/bcr_abl1_expanded.json b/src/fusor/examples/bcr_abl1_expanded.json index b3a3fe7..e6870a3 100644 --- a/src/fusor/examples/bcr_abl1_expanded.json +++ b/src/fusor/examples/bcr_abl1_expanded.json @@ -1,56 +1,43 @@ { "type": "CategoricalFusion", - "regulatory_element": null, - "structural_elements": [ + "regulatoryElement": null, + "structure": [ { "type": "TranscriptSegmentElement", "transcript": "refseq:NM_004327.3", - "exon_start": null, - "exon_start_offset": null, - "exon_end": 2, - "exon_end_offset": 182, - "gene_descriptor": { - "id": "normalize.gene:BCR", - "type": "GeneDescriptor", + "exonStart": null, + "exonStartOffset": null, + "exonEnd": 2, + "exonEndOffset": 182, + "gene": { + "type": "Gene", "label": "BCR", "description": null, "xrefs": null, "alternate_labels": null, "extensions": null, - "gene_id": "hgnc:1014", - "gene": null + "id": "hgnc:1014" }, - "element_genomic_start": null, - "element_genomic_end": { - "id": "fusor.location_descriptor:NC_000022.11", - "type": "LocationDescriptor", - "label": "NC_000022.11", + "elementGenomicStart": null, + "elementGenomicEnd": { + "id": "ga4gh:SL.5f5OVC3zReewA4S78X1eO0oJwWNbGYsY", + "type": "SequenceLocation", "description": null, "xrefs": null, "alternate_labels": null, "extensions": null, - "location_id": "ga4gh:VSL.ZTj-QLP67Hy3fGiKeMaEFNxNXtxuHZmf", - "location": { - "id": null, - "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 23253980 - }, - "end": { - "type": "Number", - "value": 23253981 - } - } - } + "sequenceReference": { + "id": "refseq:NC_000022.11", + "refgetAccession": "SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", + "type": "SequenceReference" + }, + "start": 23253980, + "end": 23253981 } }, { "type": "LinkerSequenceElement", - "linker_sequence": { + "linkerSequence": { "id": "sequence:ACTAAAGCG", "type": "SequenceDescriptor", "label": null, @@ -58,101 +45,73 @@ "xrefs": null, "alternate_labels": null, "extensions": null, - "sequence_id": null, - "sequence": "ACTAAAGCG", - "residue_type": "SO:0000348" + "sequence": "ACTAAAGCG" } }, { "type": "TranscriptSegmentElement", "transcript": "refseq:NM_005157.5", - "exon_start": 2, - "exon_start_offset": -173, - "exon_end": null, - "exon_end_offset": null, - "gene_descriptor": { - "id": "normalize.gene:ABL1", - "type": "GeneDescriptor", + "exonStart": 2, + "exonStartOffset": -173, + "exonEnd": null, + "exonEndOffset": null, + "gene": { + "type": "Gene", "label": "ABL1", "description": null, "xrefs": null, "alternate_labels": null, "extensions": null, - "gene_id": "hgnc:76", - "gene": null + "id": "hgnc:76" }, - "element_genomic_start": { - "id": "fusor.location_descriptor:NC_000009.12", - "type": "LocationDescriptor", - "label": "NC_000009.12", + "elementGenomicStart": { + "id": "ga4gh:SL.tZYgaEJP2-d4Guv-n5gyhqOc07qH9xr6", "description": null, "xrefs": null, "alternate_labels": null, "extensions": null, - "location_id": "ga4gh:VSL.k4CwU2y9HyAyeloODZj7jly02KMMEzUl", - "location": { - "id": null, - "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI", - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 130854064 - }, - "end": { - "type": "Number", - "value": 130854065 - } - } - } + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000009.12", + "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI", + "type": "SequenceReference" + }, + "start": 130854064, + "end": 130854065 }, - "element_genomic_end": null + "elementGenomicEnd": null } ], - "reading_frame_preserved": true, - "critical_functional_domains": [ + "readingFramePreserved": true, + "criticalFunctionalDomains": [ { "type": "FunctionalDomain", "status": "preserved", - "associated_gene": { - "id": "normalize.gene:hgnc%3A76", - "type": "GeneDescriptor", + "associatedGene": { + "type": "Gene", "label": "ABL1", "description": null, "xrefs": null, "alternate_labels": null, "extensions": null, - "gene_id": "hgnc:76", - "gene": null + "id": "hgnc:76" }, "id": "interpro:IPR000980", "label": "SH2 domain", - "sequence_location": { - "id": "fusor.location_descriptor:NP_005148.2", - "type": "LocationDescriptor", - "label": null, + "sequenceLocation": { + "id": "ga4gh:SL.VQe2sf2aYArPcvjygq38JvFxRuDniE15", "description": null, "xrefs": null, "alternate_labels": null, "extensions": null, - "location_id": "ga4gh:VSL.qRY4R0LrYDKItXVfM80vS9p-d9pzuyQp", - "location": { - "id": null, - "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.dmFigTG-0fY6I54swb7PoDuxCeT6O3Wg", - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 127 - }, - "end": { - "type": "Number", - "value": 202 - } - } - } + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NP_005148.2", + "refgetAccession": "SQ.dmFigTG-0fY6I54swb7PoDuxCeT6O3Wg", + "type": "SequenceReference" + }, + "start": 127, + "end": 202 } } ] diff --git a/src/fusor/examples/ewsr1.json b/src/fusor/examples/ewsr1.json index 2b5b17b..1864e45 100644 --- a/src/fusor/examples/ewsr1.json +++ b/src/fusor/examples/ewsr1.json @@ -1,28 +1,27 @@ { "type": "AssayedFusion", - "structural_elements": [ + "structure": [ { "type": "GeneElement", - "gene_descriptor": { - "type": "GeneDescriptor", - "id": "normalize.gene:EWSR1", + "gene": { + "type": "Gene", "label": "EWSR1", - "gene_id": "hgnc:3508" + "id": "hgnc:3508" } }, { "type": "UnknownGeneElement" } ], - "causative_event": { + "causativeEvent": { "type": "CausativeEvent", - "event_type": "rearrangement" + "eventType": "rearrangement" }, "assay": { "type": "Assay", - "method_uri": "pmid:33576979", - "assay_id": "obi:OBI_0003094", - "assay_name": "fluorescence in-situ hybridization assay", - "fusion_detection": "inferred" + "methodUri": "pmid:33576979", + "assayId": "obi:OBI_0003094", + "assayName": "fluorescence in-situ hybridization assay", + "fusionDetection": "inferred" } } diff --git a/src/fusor/examples/ewsr1_elements_only.json b/src/fusor/examples/ewsr1_elements_only.json index 103a5f8..e1333be 100644 --- a/src/fusor/examples/ewsr1_elements_only.json +++ b/src/fusor/examples/ewsr1_elements_only.json @@ -1,13 +1,12 @@ { "type": "AssayedFusion", - "structural_elements": [ + "structure": [ { "type": "GeneElement", - "gene_descriptor": { - "type": "GeneDescriptor", - "id": "normalize.gene:EWSR1", + "gene": { + "type": "Gene", "label": "EWSR1", - "gene_id": "hgnc:3508" + "id": "hgnc:3508" } }, { diff --git a/src/fusor/examples/ewsr1_no_assay.json b/src/fusor/examples/ewsr1_no_assay.json index 691e6db..1c7d560 100644 --- a/src/fusor/examples/ewsr1_no_assay.json +++ b/src/fusor/examples/ewsr1_no_assay.json @@ -1,21 +1,20 @@ { "type": "AssayedFusion", - "structural_elements": [ + "structure": [ { "type": "GeneElement", - "gene_descriptor": { - "type": "GeneDescriptor", - "id": "normalize.gene:EWSR1", + "gene": { + "type": "Gene", "label": "EWSR1", - "gene_id": "hgnc:3508" + "id": "hgnc:3508" } }, { "type": "UnknownGeneElement" } ], - "causative_event": { + "causativeEvent": { "type": "CausativeEvent", - "event_type": "rearrangement" + "eventType": "rearrangement" } } diff --git a/src/fusor/examples/ewsr1_no_causative_event.json b/src/fusor/examples/ewsr1_no_causative_event.json index dfa2748..9df5dea 100644 --- a/src/fusor/examples/ewsr1_no_causative_event.json +++ b/src/fusor/examples/ewsr1_no_causative_event.json @@ -1,13 +1,12 @@ { "type": "AssayedFusion", - "structural_elements": [ + "structure": [ { "type": "GeneElement", - "gene_descriptor": { - "type": "GeneDescriptor", - "id": "normalize.gene:EWSR1", + "gene": { + "type": "Gene", "label": "EWSR1", - "gene_id": "hgnc:3508" + "id": "hgnc:3508" } }, { @@ -16,9 +15,9 @@ ], "assay": { "type": "Assay", - "method_uri": "pmid:33576979", - "assay_id": "obi:OBI_0003094", - "assay_name": "fluorescence in-situ hybridization assay", - "fusion_detection": "inferred" + "methodUri": "pmid:33576979", + "assayId": "obi:OBI_0003094", + "assayName": "fluorescence in-situ hybridization assay", + "fusionDetection": "inferred" } } diff --git a/src/fusor/examples/igh_myc.json b/src/fusor/examples/igh_myc.json index ac49180..396846d 100644 --- a/src/fusor/examples/igh_myc.json +++ b/src/fusor/examples/igh_myc.json @@ -1,24 +1,22 @@ { "type": "CategoricalFusion", - "regulatory_element": { + "regulatoryElement": { "type": "RegulatoryElement", - "regulatory_class": "enhancer", - "associated_gene": { - "type": "GeneDescriptor", + "regulatoryClass": "enhancer", + "associatedGene": { + "type": "Gene", "label": "IGH", - "gene_id": "hgnc:5477", - "id": "normalize.gene:IGH" + "id": "hgnc:5477" }, - "feature_id": "EH38E3121735" + "featureId": "EH38E3121735" }, - "structural_elements": [ + "structure": [ { "type": "GeneElement", - "gene_descriptor": { - "type": "GeneDescriptor", + "gene": { + "type": "Gene", "label": "MYC", - "gene_id": "hgnc:7553", - "id": "normalize.gene:MYC" + "id": "hgnc:7553" } } ] diff --git a/src/fusor/examples/tpm3_ntrk1.json b/src/fusor/examples/tpm3_ntrk1.json index 8f65f10..55d68b4 100644 --- a/src/fusor/examples/tpm3_ntrk1.json +++ b/src/fusor/examples/tpm3_ntrk1.json @@ -1,80 +1,60 @@ { "type": "AssayedFusion", - "structural_elements": [ + "structure": [ { "type": "TranscriptSegmentElement", "transcript": "refseq:NM_152263.3", - "exon_end": 8, - "exon_end_offset": 0, - "gene_descriptor": { - "id": "normalize.gene:TPM3", - "type": "GeneDescriptor", + "exonEnd": 8, + "exonEndOffset": 0, + "gene": { + "type": "Gene", "label": "TPM3", - "gene_id": "hgnc:12012" + "id": "hgnc:12012" }, - "element_genomic_end": { - "id": "fusor.location_descriptor:NC_000001.11", - "type": "LocationDescriptor", - "label": "NC_000001.11", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NC_000001.11", - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 154170399 - }, - "end": { - "type": "Number", - "value": 154170400 - } - } - } + "elementGenomicEnd": { + "id": "ga4gh:SL.rtR6x2NnJEpROlxiT_DY9C-spf6ijYQi", + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceReference" + }, + "start": 154170399, + "end": 154170400 } }, { "type": "TranscriptSegmentElement", "transcript": "refseq:NM_002529.3", - "exon_start": 10, - "exon_start_offset": 0, - "gene_descriptor": { - "id": "normalize.gene:NTRK1", - "type": "GeneDescriptor", + "exonStart": 10, + "exonStartOffset": 0, + "gene": { + "type": "Gene", "label": "NTRK1", - "gene_id": "hgnc:8031" + "id": "hgnc:8031" }, - "element_genomic_start": { - "id": "fusor.location_descriptor:NC_000001.11", - "type": "LocationDescriptor", - "label": "NC_000001.11", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NC_000001.11", - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 156874626 - }, - "end": { - "type": "Number", - "value": 156874627 - } - } - } + "elementGenomicStart": { + "id": "ga4gh:SL.vwzO5bvePEutAuXGJAQRhFZgWNSMJDeq", + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceReference" + }, + "start": 156874626, + "end": 156874627 } } ], - "causative_event": { + "causativeEvent": { "type": "CausativeEvent", - "event_type": "rearrangement" + "eventType": "rearrangement" }, "assay": { "type": "Assay", - "assay_name": "fluorescence in-situ hybridization assay", - "assay_id": "obi:OBI_0003094", - "fusion_detection": "inferred", - "method_uri": "pmid:33576979" + "assayName": "fluorescence in-situ hybridization assay", + "assayId": "obi:OBI_0003094", + "fusionDetection": "inferred", + "methodUri": "pmid:33576979" } } diff --git a/src/fusor/examples/tpm3_pdgfrb.json b/src/fusor/examples/tpm3_pdgfrb.json index 14b1599..ff737d2 100644 --- a/src/fusor/examples/tpm3_pdgfrb.json +++ b/src/fusor/examples/tpm3_pdgfrb.json @@ -1,124 +1,86 @@ { "type": "AssayedFusion", - "structural_elements": [ + "structure": [ { "type": "TranscriptSegmentElement", "transcript": "refseq:NM_152263.3", - "exon_start": 1, - "exon_start_offset": 0, - "exon_end": 8, - "exon_end_offset": 0, - "gene_descriptor": { - "id": "normalize.gene:TPM3", - "type": "GeneDescriptor", + "exonStart": 1, + "exonStartOffset": 0, + "exonEnd": 8, + "exonEndOffset": 0, + "gene": { + "type": "Gene", "label": "TPM3", - "gene_id": "hgnc:12012" + "id": "hgnc:12012" }, - "element_genomic_start": { - "id": "fusor.location_descriptor:NC_000001.11", - "type": "LocationDescriptor", - "label": "NC_000001.11", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NC_000001.11", - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 154192135 - }, - "end": { - "type": "Number", - "value": 154192136 - } - } - } + "elementGenomicStart": { + "id": "ga4gh:SL.2K1vML0ofuYrYncrzzXUQOISRFJldZrO", + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceReference" + }, + "start": 154192135, + "end": 154192136 }, - "element_genomic_end": { - "id": "fusor.location_descriptor:NC_000001.11", - "type": "LocationDescriptor", - "label": "NC_000001.11", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NC_000001.11", - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 154170399 - }, - "end": { - "type": "Number", - "value": 154170400 - } - } - } + "elementGenomicEnd": { + "id": "ga4gh:SL.rtR6x2NnJEpROlxiT_DY9C-spf6ijYQi", + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceReference" + }, + "start": 154170399, + "end": 154170400 } }, { "type": "TranscriptSegmentElement", "transcript": "refseq:NM_002609.3", - "exon_start": 11, - "exon_start_offset": 0, - "exon_end": 22, - "exon_end_offset": 0, - "gene_descriptor": { - "id": "normalize.gene:PDGFRB", - "type": "GeneDescriptor", + "exonStart": 11, + "exonStartOffset": 0, + "exonEnd": 22, + "exonEndOffset": 0, + "gene": { + "type": "Gene", "label": "PDGFRB", - "gene_id": "hgnc:8804" + "id": "hgnc:8804" }, - "element_genomic_start": { - "id": "fusor.location_descriptor:NC_000005.10", - "type": "LocationDescriptor", - "label": "NC_000005.10", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NC_000005.10", - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 150125577 - }, - "end": { - "type": "Number", - "value": 150125578 - } - } - } + "elementGenomicStart": { + "id": "ga4gh:SL.1mONzR4j65OuPg1yMgF-w-W2qu03qEc-", + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000005.10", + "refgetAccession": "SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI", + "type": "SequenceReference" + }, + "start": 150125577, + "end": 150125578 }, - "element_genomic_end": { - "id": "fusor.location_descriptor:NC_000005.10", - "type": "LocationDescriptor", - "label": "NC_000005.10", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NC_000005.10", - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 150113838 - }, - "end": { - "type": "Number", - "value": 150113839 - } - } - } + "elementGenomicEnd": { + "id": "ga4gh:SL.QFVGHvvSFg-9q_Tv04DjMbvTRnGs1kEr", + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000005.10", + "refgetAccession": "SQ.aUiQCzCPZ2d0csHbMSbh2NzInhonSXwI", + "type": "SequenceReference" + }, + "start": 150113838, + "end": 150113839 } } ], - "causative_event": { + "causativeEvent": { "type": "CausativeEvent", - "event_type": "rearrangement" + "eventType": "rearrangement" }, "assay": { "type": "Assay", - "assay_name": "RT-PCR", - "assay_id": "obi:OBI_0000552", - "method_uri": "pmid:24034314", - "fusion_detection": "observed" + "assayName": "RT-PCR", + "assayId": "obi:OBI_0000552", + "methodUri": "pmid:24034314", + "fusionDetection": "observed" } } diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 5f7dac1..0290af8 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -724,7 +724,7 @@ def generate_nomenclature(self, fusion: Fusion) -> str: elif isinstance(element, UnknownGeneElement): parts.append("?") elif isinstance(element, LinkerElement): - parts.append(element.linker_sequence.sequence) + parts.append(element.linkerSequence.sequence) elif isinstance(element, TranscriptSegmentElement): if not any( [gene == element.gene.label for gene in element_genes] # noqa: C419 @@ -742,7 +742,7 @@ def generate_nomenclature(self, fusion: Fusion) -> str: if ( isinstance(fusion, AssayedFusion) and fusion.assay - and fusion.assay.fusion_detection == Evidence.INFERRED + and fusion.assay.fusionDetection == Evidence.INFERRED ): divider = "(::)" else: diff --git a/tests/conftest.py b/tests/conftest.py index 432392a..bd6091d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -50,7 +50,7 @@ def braf_gene_descriptor(): """Create gene descriptor params for BRAF.""" return { "id": "normalize.gene:BRAF", - "type": "GeneDescriptor", + "type": "Gene", "label": "BRAF", "xrefs": ["ensembl:ENSG00000157764", "ncbigene:673"], "alternate_labels": ["BRAF1", "BRAF-1", "NS7", "B-raf", "B-RAF1", "RAFB1"], @@ -158,7 +158,7 @@ def alk_gene_descriptor(): """Create test fixture for ALK gene descriptor params""" return { "id": "normalize.gene:ALK", - "type": "GeneDescriptor", + "type": "Gene", "label": "ALK", "description": None, "xrefs": ["ensembl:ENSG00000171094", "ncbigene:238"], @@ -288,7 +288,7 @@ def exhaustive_example(alk_gene_descriptor, braf_gene_descriptor): "_id": "interpro:IPR020635", "label": "Tyrosine-protein kinase, catalytic domain", "status": "lost", - "associated_gene": alk_gene_descriptor, + "associatedGene": alk_gene_descriptor, "sequence_location": { "id": "fusor.location_descriptor:NP_004295.2", "type": "LocationDescriptor", @@ -311,7 +311,7 @@ def exhaustive_example(alk_gene_descriptor, braf_gene_descriptor): }, } ], - "structural_elements": [ + "structure": [ { "type": "TranscriptSegmentElement", "transcript": "refseq:NM_152263.3", @@ -321,7 +321,7 @@ def exhaustive_example(alk_gene_descriptor, braf_gene_descriptor): "exon_end_offset": 0, "gene_descriptor": { "id": "normalize.gene:TPM3", - "type": "GeneDescriptor", + "type": "Gene", "label": "TPM3", "description": None, "xrefs": ["ensembl:ENSG00000143549", "ncbigene:7170"], @@ -464,7 +464,7 @@ def exhaustive_example(alk_gene_descriptor, braf_gene_descriptor): "gene_id": "hgnc:12012", "gene": None, }, - "element_genomic_start": { + "elementGenomicStart": { "id": "fusor.location_descriptor:NC_000001.11", "type": "LocationDescriptor", "label": "NC_000001.11", @@ -484,7 +484,7 @@ def exhaustive_example(alk_gene_descriptor, braf_gene_descriptor): }, }, }, - "element_genomic_end": { + "elementGenomicEnd": { "id": "fusor.location_descriptor:NC_000001.11", "type": "LocationDescriptor", "label": "NC_000001.11", @@ -553,7 +553,7 @@ def exhaustive_example(alk_gene_descriptor, braf_gene_descriptor): "regulatory_element": { "type": "RegulatoryElement", "regulatory_class": "promoter", - "associated_gene": braf_gene_descriptor, + "associatedGene": braf_gene_descriptor, }, } @@ -570,9 +570,9 @@ def fusion_example(): "_id": "interpro:IPR020635", "label": "Tyrosine-protein kinase, catalytic domain", "status": "lost", - "associated_gene": { + "associatedGene": { "id": "normalize.gene:hgnc%3A427", - "type": "GeneDescriptor", + "type": "Gene", "label": "ALK", "gene_id": "hgnc:427", }, @@ -591,7 +591,7 @@ def fusion_example(): }, } ], - "structural_elements": [ + "structure": [ { "type": "TranscriptSegmentElement", "transcript": "refseq:NM_152263.3", @@ -601,11 +601,11 @@ def fusion_example(): "exon_end_offset": 0, "gene_descriptor": { "id": "normalize.gene:TPM3", - "type": "GeneDescriptor", + "type": "Gene", "label": "TPM3", "gene_id": "hgnc:12012", }, - "element_genomic_start": { + "elementGenomicStart": { "id": "fusor.location_descriptor:NC_000001.11", "type": "LocationDescriptor", "label": "NC_000001.11", @@ -619,7 +619,7 @@ def fusion_example(): }, }, }, - "element_genomic_end": { + "elementGenomicEnd": { "id": "fusor.location_descriptor:NC_000001.11", "type": "LocationDescriptor", "label": "NC_000001.11", @@ -638,7 +638,7 @@ def fusion_example(): "type": "GeneElement", "gene_descriptor": { "id": "normalize.gene:ALK", - "type": "GeneDescriptor", + "type": "Gene", "label": "ALK", "gene_id": "hgnc:427", }, @@ -675,9 +675,9 @@ def fusion_example(): "regulatory_element": { "type": "RegulatoryElement", "regulatory_class": "promoter", - "associated_gene": { + "associatedGene": { "id": "gene:BRAF", - "type": "GeneDescriptor", + "type": "Gene", "label": "BRAF", "gene_id": "hgnc:1097", }, diff --git a/tests/test_fusor.py b/tests/test_fusor.py index e35d2b3..8b0c32a 100644 --- a/tests/test_fusor.py +++ b/tests/test_fusor.py @@ -233,10 +233,10 @@ def transcript_segment_element(): "gene_id": "hgnc:12012", "id": "normalize.gene:TPM3", "label": "TPM3", - "type": "GeneDescriptor", + "type": "Gene", }, "transcript": "refseq:NM_152263.3", - "element_genomic_end": { + "elementGenomicEnd": { "id": "fusor.location_descriptor:NC_000001.11", "label": "NC_000001.11", "location": { @@ -250,7 +250,7 @@ def transcript_segment_element(): }, "type": "LocationDescriptor", }, - "element_genomic_start": { + "elementGenomicStart": { "id": "fusor.location_descriptor:NC_000001.11", "label": "NC_000001.11", "location": { @@ -281,11 +281,11 @@ def mane_transcript_segment_element(): "gene_id": "hgnc:12761", "id": "normalize.gene:WEE1", "label": "WEE1", - "type": "GeneDescriptor", + "type": "Gene", }, "transcript": "refseq:NM_003390.4", - "element_genomic_end": None, - "element_genomic_start": { + "elementGenomicEnd": None, + "elementGenomicStart": { "id": "fusor.location_descriptor:NC_000011.10", "label": "NC_000011.10", "location": { @@ -308,7 +308,7 @@ def fusion_ensg_sequence_id(templated_sequence_element_ensg): """Create fixture using Ensemble gene ID.""" params = { "type": "CategoricalFusion", - "structural_elements": [ + "structure": [ templated_sequence_element_ensg, {"type": "MultiplePossibleGenesElement"}, ], @@ -364,34 +364,34 @@ def test_add_additional_fields(fusor_instance, fusion_example, fusion_ensg_seque fusion = CategoricalFusion(**fusion_example) expected_fusion = copy.deepcopy(fusion) - expected_fusion.critical_functional_domains[ + expected_fusion.criticalFunctionalDomains[ 0 ].sequence_location.location_id = "ga4gh:VSL.2CWYzSpOJfZq7KW4VIUKeP5SJtepRar0" - expected_fusion.critical_functional_domains[ + expected_fusion.criticalFunctionalDomains[ 0 ].sequence_location.location.sequence_id = ( "ga4gh:SQ.q9CnK-HKWh9eqhOi8FlzR7M0pCmUrWPs" ) - expected_fusion.structural_elements[ + expected_fusion.structure[ 0 - ].element_genomic_start.location_id = "ga4gh:VSL.H0IOyJ-DB4jTbbSBjQFvuPvMrZHAWSrW" - expected_fusion.structural_elements[ + ].elementGenomicStart.location_id = "ga4gh:VSL.H0IOyJ-DB4jTbbSBjQFvuPvMrZHAWSrW" + expected_fusion.structure[ 0 - ].element_genomic_start.location.sequence_id = ( + ].elementGenomicStart.location.sequence_id = ( "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO" ) - expected_fusion.structural_elements[ + expected_fusion.structure[ 0 - ].element_genomic_end.location_id = "ga4gh:VSL.aarSLdMOQ8LoooPB2EoSth41yG_qRmDq" - expected_fusion.structural_elements[ + ].elementGenomicEnd.location_id = "ga4gh:VSL.aarSLdMOQ8LoooPB2EoSth41yG_qRmDq" + expected_fusion.structure[ 0 - ].element_genomic_end.location.sequence_id = ( + ].elementGenomicEnd.location.sequence_id = ( "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO" ) - expected_fusion.structural_elements[ + expected_fusion.structure[ 3 ].region.location_id = "ga4gh:VSL.zd12pX_ju2gLq9a9UOYgM8AtbkuhnyUu" - expected_fusion.structural_elements[ + expected_fusion.structure[ 3 ].region.location.sequence_id = "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP" @@ -401,7 +401,7 @@ def test_add_additional_fields(fusor_instance, fusion_example, fusion_ensg_seque # test handling of unrecognized sequence IDs expected_fusion = copy.deepcopy(fusion_ensg_sequence_id) fusion = fusor_instance.add_additional_fields(fusion_ensg_sequence_id) - ts_reg = fusion.structural_elements[0].region + ts_reg = fusion.structure[0].region assert ts_reg.location.sequence_id == "ensembl:ENSG00000157764" assert ts_reg.location_id == "ga4gh:VSL.dUll0TA05efQf0TsmcP03mtdGcpP9jPH" @@ -411,22 +411,22 @@ def test_add_translated_sequence_id(fusor_instance, fusion_example): fusion = CategoricalFusion(**fusion_example) expected_fusion = copy.deepcopy(fusion) - expected_fusion.critical_functional_domains[ + expected_fusion.criticalFunctionalDomains[ 0 ].sequence_location.location.sequence_id = ( "ga4gh:SQ.q9CnK-HKWh9eqhOi8FlzR7M0pCmUrWPs" ) - expected_fusion.structural_elements[ + expected_fusion.structure[ 0 - ].element_genomic_start.location.sequence_id = ( + ].elementGenomicStart.location.sequence_id = ( "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO" ) - expected_fusion.structural_elements[ + expected_fusion.structure[ 0 - ].element_genomic_end.location.sequence_id = ( + ].elementGenomicEnd.location.sequence_id = ( "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO" ) - expected_fusion.structural_elements[ + expected_fusion.structure[ 3 ].region.location.sequence_id = "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP" @@ -440,20 +440,19 @@ def test_add_location_id(fusor_instance, fusion_example, exhaustive_example): actual = CategoricalFusion(**exhaustive_example) assert ( - fusion.critical_functional_domains[0].sequence_location.location_id - == actual.critical_functional_domains[0].sequence_location.location_id + fusion.criticalFunctionalDomains[0].sequence_location.location_id + == actual.criticalFunctionalDomains[0].sequence_location.location_id ) assert ( - fusion.structural_elements[0].element_genomic_start.location_id - == actual.structural_elements[0].element_genomic_start.location_id + fusion.structure[0].elementGenomicStart.location_id + == actual.structure[0].elementGenomicStart.location_id ) assert ( - fusion.structural_elements[0].element_genomic_end.location_id - == actual.structural_elements[0].element_genomic_end.location_id + fusion.structure[0].elementGenomicEnd.location_id + == actual.structure[0].elementGenomicEnd.location_id ) assert ( - fusion.structural_elements[3].region.location_id - == actual.structural_elements[3].region.location_id + fusion.structure[3].region.location_id == actual.structure[3].region.location_id ) @@ -481,10 +480,10 @@ def test_add_gene_descriptor(fusor_instance, exhaustive_example, fusion_example) e_gds = set() t_gds = set() for e_field in [ - expected_fusion.critical_functional_domains, - expected_fusion.structural_elements, + expected_fusion.criticalFunctionalDomains, + expected_fusion.structure, ]: - for t_field in [actual.critical_functional_domains, actual.structural_elements]: + for t_field in [actual.criticalFunctionalDomains, actual.structure]: for e_obj in e_field: for t_obj in t_field: if "gene_descriptor" in e_obj.model_fields: @@ -501,8 +500,8 @@ def test_add_gene_descriptor(fusor_instance, exhaustive_example, fusion_example) assert t_gds == e_gds compare_gene_descriptor( - actual.regulatory_element.associated_gene.model_dump(), - expected_fusion.regulatory_element.associated_gene.model_dump(), + actual.regulatory_element.associatedGene.model_dump(), + expected_fusion.regulatory_element.associatedGene.model_dump(), ) @@ -516,50 +515,50 @@ def test_fusion( """Test that fusion methods work correctly.""" # infer type from properties f = fusor_instance.fusion( - structural_elements=[ + structure=[ templated_sequence_element, linker_element, UnknownGeneElement(), ], - causative_event={ + causativeEvent={ "type": "CausativeEvent", - "event_type": "rearrangement", + "eventType": "rearrangement", "event_description": "chr2:g.pter_8,247,756::chr11:g.15,825,273_cen_qter (der11) and chr11:g.pter_15,825,272::chr2:g.8,247,757_cen_qter (der2)", }, assay={ "type": "Assay", - "method_uri": "pmid:33576979", - "assay_id": "obi:OBI_0003094", - "assay_name": "fluorescence in-situ hybridization assay", - "fusion_detection": "inferred", + "methodUri": "pmid:33576979", + "assayId": "obi:OBI_0003094", + "assayName": "fluorescence in-situ hybridization assay", + "fusionDetection": "inferred", }, ) assert isinstance(f, AssayedFusion) f = fusor_instance.fusion( - structural_elements=[ + structure=[ transcript_segment_element, MultiplePossibleGenesElement(), ], - critical_functional_domains=[functional_domain], + criticalFunctionalDomains=[functional_domain], ) assert isinstance(f, CategoricalFusion) # catch conflicting property args with pytest.raises(FUSORParametersException) as excinfo: f = fusor_instance.fusion( - structural_elements=[ + structure=[ transcript_segment_element, UnknownGeneElement(), ], - causative_event="rearrangement", - critical_functional_domains=[functional_domain], + causativeEvent="rearrangement", + criticalFunctionalDomains=[functional_domain], ) assert str(excinfo.value) == "Received conflicting attributes" # handle indeterminate type with pytest.raises(FUSORParametersException) as excinfo: f = fusor_instance.fusion( - structural_elements=[ + structure=[ transcript_segment_element, templated_sequence_element, ] @@ -569,38 +568,38 @@ def test_fusion( # handle both type parameter options f = fusor_instance.fusion( fusion_type="AssayedFusion", - structural_elements=[ + structure=[ templated_sequence_element, linker_element, UnknownGeneElement(), ], - causative_event={ + causativeEvent={ "type": "CausativeEvent", - "event_type": "rearrangement", + "eventType": "rearrangement", }, assay={ "type": "Assay", - "method_uri": "pmid:33576979", - "assay_id": "obi:OBI_0003094", - "assay_name": "fluorescence in-situ hybridization assay", - "fusion_detection": "inferred", + "methodUri": "pmid:33576979", + "assayId": "obi:OBI_0003094", + "assayName": "fluorescence in-situ hybridization assay", + "fusionDetection": "inferred", }, ) assert isinstance(f, AssayedFusion) f = fusor_instance.fusion( type="CategoricalFusion", - structural_elements=[ + structure=[ transcript_segment_element, MultiplePossibleGenesElement(), ], - critical_functional_domains=[functional_domain], + criticalFunctionalDomains=[functional_domain], ) assert isinstance(f, CategoricalFusion) # catch and pass on validation errors with pytest.raises(FUSORParametersException) as excinfo: f = fusor_instance.fusion( - fusion_type="CategoricalFusion", structural_elements=[linker_element] + fusion_type="CategoricalFusion", structure=[linker_element] ) msg = "Fusions must contain >= 2 structural elements, or >=1 structural element and a regulatory element" assert msg in str(excinfo.value) @@ -657,11 +656,11 @@ async def test_transcript_segment_element( assert tsg[0].model_dump() == transcript_segment_element.model_dump() expected = copy.deepcopy(transcript_segment_element) - expected.element_genomic_start.location.sequence_id = ( + expected.elementGenomicStart.location.sequence_id = ( "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO" ) - expected.element_genomic_end.location.sequence_id = ( - expected.element_genomic_start.location.sequence_id + expected.elementGenomicEnd.location.sequence_id = ( + expected.elementGenomicStart.location.sequence_id ) # Transcript Input @@ -690,8 +689,8 @@ async def test_transcript_segment_element( assert tsg[0].model_dump() == expected.model_dump() expected.exon_end_offset = -5 - expected.element_genomic_end.location.interval.start.value = 154170404 - expected.element_genomic_end.location.interval.end.value = 154170405 + expected.elementGenomicEnd.location.interval.start.value = 154170404 + expected.elementGenomicEnd.location.interval.end.value = 154170405 # Transcript Input tsg = await fusor_instance.transcript_segment_element( @@ -721,7 +720,7 @@ async def test_transcript_segment_element( expected.exon_end = None expected.exon_end_offset = None - expected.element_genomic_end = None + expected.elementGenomicEnd = None # Transcript Input tsg = await fusor_instance.transcript_segment_element( @@ -877,7 +876,7 @@ def compare_domains(actual, expected): expected = expected.model_dump() assert actual.keys() == expected.keys() for key in expected: - if key == "associated_gene": + if key == "associatedGene": compare_gene_descriptor(actual[key], expected[key]) elif key == "sequence_location": act_ld = actual["sequence_location"] @@ -1011,7 +1010,7 @@ def compare_re(actual, expected): expected = expected.model_dump() assert actual.keys() == expected.keys() assert actual["type"] == expected["type"] - compare_gene_descriptor(actual["associated_gene"], expected["associated_gene"]) + compare_gene_descriptor(actual["associatedGene"], expected["associatedGene"]) re = fusor_instance.regulatory_element(RegulatoryClass.PROMOTER, "BRAF") compare_re(re, regulatory_element_min) diff --git a/tests/test_models.py b/tests/test_models.py index 34cdf20..397989f 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -779,15 +779,15 @@ def test_fusion_element_count( structure=[unknown_element], causativeEvent={ "type": "CausativeEvent", - "event_type": "rearrangement", + "eventType": "rearrangement", "event_description": "chr2:g.pter_8,247,756::chr11:g.15,825,273_cen_qter (der11) and chr11:g.pter_15,825,272::chr2:g.8,247,757_cen_qter (der2)", }, assay={ "type": "Assay", - "method_uri": "pmid:33576979", - "assay_id": "obi:OBI_0003094", - "assay_name": "fluorescence in-situ hybridization assay", - "fusion_detection": "inferred", + "methodUri": "pmid:33576979", + "assayId": "obi:OBI_0003094", + "assayName": "fluorescence in-situ hybridization assay", + "fusionDetection": "inferred", }, ) check_validation_error(exc_info, element_ct_msg) @@ -817,16 +817,16 @@ def test_fusion_element_count( {"type": "GeneElement", "gene_descriptor": gene_examples[6]}, {"type": "GeneElement", "gene_descriptor": gene_examples[6]}, ], - causative_event={ + causativeEvent={ "type": "CausativeEvent", - "event_type": "read-through", + "eventType": "read-through", }, assay={ "type": "Assay", - "method_uri": "pmid:33576979", - "assay_id": "obi:OBI_0003094", - "assay_name": "fluorescence in-situ hybridization assay", - "fusion_detection": "inferred", + "methodUri": "pmid:33576979", + "assayId": "obi:OBI_0003094", + "assayName": "fluorescence in-situ hybridization assay", + "fusionDetection": "inferred", }, ) with pytest.raises(ValidationError) as exc_info: @@ -839,18 +839,18 @@ def test_fusion_element_count( "type": "RegulatoryElement", "regulatory_class": "enhancer", "feature_id": "EH111111111", - "associated_gene": gene_examples[6], + "associatedGene": gene_examples[6], }, - causative_event={ + causativeEvent={ "type": "CausativeEvent", - "event_type": "read-through", + "eventType": "read-through", }, assay={ "type": "Assay", - "method_uri": "pmid:33576979", - "assay_id": "obi:OBI_0003094", - "assay_name": "fluorescence in-situ hybridization assay", - "fusion_detection": "inferred", + "methodUri": "pmid:33576979", + "assayId": "obi:OBI_0003094", + "assayName": "fluorescence in-situ hybridization assay", + "fusionDetection": "inferred", }, ) diff --git a/tests/test_nomenclature.py b/tests/test_nomenclature.py index 90ee02c..e1216b6 100644 --- a/tests/test_nomenclature.py +++ b/tests/test_nomenclature.py @@ -65,12 +65,13 @@ def reg_location_example(): "sequenceReference": { "id": "refseq:NC_000023.11", "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + "type": "SequenceReference", }, "start": 1462581, "end": 1534182, }, }, - structural_elements=[ + structure=[ { "type": "GeneElement", "gene": { @@ -80,7 +81,7 @@ def reg_location_example(): }, }, ], - causative_event={ + causativeEvent={ "type": "CausativeEvent", "eventType": "rearrangement", }, @@ -99,7 +100,7 @@ def exon_offset_example(): """Provide example of tx segment with positive exon end offset""" return CategoricalFusion( type="CategoricalFusion", - structural_elements=[ + structure=[ { "type": "GeneElement", "gene": { @@ -124,9 +125,10 @@ def exon_offset_example(): "sequenceReference": { "id": "refseq:NC_000001.11", "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceReference", }, - "start": {"type": "Number", "value": 156864428}, - "end": {"type": "Number", "value": 156864429}, + "start": 156864428, + "end": 156864429, }, }, ], @@ -139,38 +141,32 @@ def tx_seg_example(): return TranscriptSegmentElement( type="TranscriptSegmentElement", transcript="refseq:NM_152263.3", - exon_start=1, - exon_start_offset=0, - exon_end=8, - exon_end_offset=0, + exonStart=1, + exonStartOffset=0, + exonEnd=8, + exonEndOffset=0, gene=Gene(id="hgnc:12012", label="TPM3"), - element_genomic_start={ - "id": "fusor.location_descriptor:NC_000001.11", - "type": "LocationDescriptor", - "label": "NC_000001.11", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NC_000001.11", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 154192135}, - "end": {"type": "Number", "value": 154192136}, - }, + elementGenomicStart={ + "id": "ga4gh:SL.2K1vML0ofuYrYncrzzXUQOISRFJldZrO", + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceReference", }, + "start": {"type": "Number", "value": 154192135}, + "end": {"type": "Number", "value": 154192136}, }, - element_genomic_end={ - "id": "fusor.location_descriptor:NC_000001.11", - "type": "LocationDescriptor", - "label": "NC_000001.11", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NC_000001.11", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 154170399}, - "end": {"type": "Number", "value": 154170400}, - }, + elementGenomicEnd={ + "id": "ga4gh:SL.rtR6x2NnJEpROlxiT_DY9C-spf6ijYQi", + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceReference", }, + "start": {"type": "Number", "value": 154170399}, + "end": {"type": "Number", "value": 154170400}, }, ) @@ -181,27 +177,23 @@ def junction_example(): return TranscriptSegmentElement( type="TranscriptSegmentElement", transcript="refseq:NM_152263.3", - exon_end=8, - exon_end_offset=0, - gene_descriptor={ - "id": "normalize.gene:TPM3", - "type": "GeneDescriptor", + exonEnd=8, + exonEndOffset=0, + gene={ + "type": "Gene", "label": "TPM3", - "gene_id": "hgnc:12012", + "id": "hgnc:12012", }, - element_genomic_end={ - "id": "fusor.location_descriptor:NC_000001.11", - "type": "LocationDescriptor", - "label": "NC_000001.11", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NC_000001.11", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 154170399}, - "end": {"type": "Number", "value": 154170400}, - }, + elementGenomicEnd={ + "id": "ga4gh:SL.rtR6x2NnJEpROlxiT_DY9C-spf6ijYQi", + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceReference", }, + "start": {"type": "Number", "value": 154170399}, + "end": {"type": "Number", "value": 154170400}, }, ) From 3fd9615dbfbd0128fa2716d12a72f38649b0fd44 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Wed, 24 Jul 2024 12:11:09 -0400 Subject: [PATCH 60/82] update tests and examples with new models --- src/fusor/examples/bcr_abl1.json | 2 +- src/fusor/examples/bcr_abl1_expanded.json | 2 +- src/fusor/fusor.py | 45 --- tests/conftest.py | 406 +++++++++++++--------- tests/test_fusor.py | 24 -- tests/test_models.py | 19 +- tests/test_nomenclature.py | 2 +- 7 files changed, 260 insertions(+), 240 deletions(-) diff --git a/src/fusor/examples/bcr_abl1.json b/src/fusor/examples/bcr_abl1.json index a2b0caf..18b3c5b 100644 --- a/src/fusor/examples/bcr_abl1.json +++ b/src/fusor/examples/bcr_abl1.json @@ -31,7 +31,7 @@ "type": "LinkerSequenceElement", "linkerSequence": { "id": "sequence:ACTAAAGCG", - "type": "SequenceDescriptor", + "type": "LiteralSequenceExpression", "sequence": "ACTAAAGCG" } }, diff --git a/src/fusor/examples/bcr_abl1_expanded.json b/src/fusor/examples/bcr_abl1_expanded.json index e6870a3..d063d22 100644 --- a/src/fusor/examples/bcr_abl1_expanded.json +++ b/src/fusor/examples/bcr_abl1_expanded.json @@ -39,7 +39,7 @@ "type": "LinkerSequenceElement", "linkerSequence": { "id": "sequence:ACTAAAGCG", - "type": "SequenceDescriptor", + "type": "LiteralSequenceExpression", "label": null, "description": null, "xrefs": null, diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 0290af8..e6b7ce8 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -332,7 +332,6 @@ def templated_sequence_element( end: int, sequence_id: str, strand: Strand, - add_location_id: bool = False, residue_mode: ResidueMode = ResidueMode.RESIDUE, seq_id_target_namespace: str | None = None, ) -> TemplatedSequenceElement: @@ -342,8 +341,6 @@ def templated_sequence_element( :param end: Genomic end :param sequence_id: Chromosome accession for sequence :param strand: Strand - :param add_location_id: ``True`` if ``location_id`` will be added to ``region``. - ``False`` otherwise. :param residue_mode: Determines coordinate base used. Must be one of ``residue`` or ``inter-residue``. :param seq_id_target_namespace: If want to use digest for ``sequence_id``, set @@ -360,10 +357,6 @@ def templated_sequence_element( seq_id_target_namespace=seq_id_target_namespace, ) - if add_location_id: - location_id = self._location_id(region.location.model_dump()) - region.location_id = location_id - return TemplatedSequenceElement(region=region, strand=strand) @staticmethod @@ -575,7 +568,6 @@ def add_additional_fields( """ if add_all: self.add_translated_sequence_id(fusion, target_namespace) - self.add_location_id(fusion) else: if fields: for field in fields: @@ -583,47 +575,10 @@ def add_additional_fields( self.add_translated_sequence_id( fusion, target_namespace=target_namespace ) - elif field == AdditionalFields.LOCATION_ID.value: - self.add_location_id(fusion) else: _logger.warning("Invalid field: %s", field) return fusion - def add_location_id(self, fusion: Fusion) -> Fusion: - """Add `location_id` in fusion object. - - :param fusion: A valid Fusion object. - :return: Updated fusion with `location_id` fields set - """ - for structural_element in fusion.structure: - if isinstance(structural_element, TemplatedSequenceElement): - location = structural_element.region.location - location_id = self._location_id(location.model_dump()) - structural_element.region.location_id = location_id - elif isinstance(structural_element, TranscriptSegmentElement): - for element_genomic in [ - structural_element.elementGenomicStart, - structural_element.elementGenomicEnd, - ]: - if element_genomic: - location = element_genomic.location - if location.type == SequenceLocation: - location_id = self._location_id(location.model_dump()) - element_genomic.location_id = location_id - if isinstance(fusion, CategoricalFusion) and fusion.criticalFunctionalDomains: - for domain in fusion.criticalFunctionalDomains: - location = domain.sequence_location.location - location_id = self._location_id(location.model_dump()) - domain.sequence_location.location_id = location_id - if fusion.regulatoryElement: - element = fusion.regulatoryElement - if element.feature_location: - location = element.feature_location - if location.type == SequenceLocation: - location_id = self._location_id(location.model_dump()) - element.feature_location.location_id = location_id - return fusion - @staticmethod def _location_id(location: dict) -> CURIE: """Return GA4GH digest for location diff --git a/tests/conftest.py b/tests/conftest.py index bd6091d..cd16436 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -49,107 +49,122 @@ def fusor_instance(): def braf_gene_descriptor(): """Create gene descriptor params for BRAF.""" return { - "id": "normalize.gene:BRAF", "type": "Gene", + "id": "normalize.gene.hgnc:1097", "label": "BRAF", - "xrefs": ["ensembl:ENSG00000157764", "ncbigene:673"], - "alternate_labels": ["BRAF1", "BRAF-1", "NS7", "B-raf", "B-RAF1", "RAFB1"], + "mappings": [ + { + "coding": {"code": "673", "system": "ncbigene"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "ENSG00000157764", "system": "ensembl"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "CCDS5863", "system": "ccds"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "1943", "system": "iuphar"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "119066", "system": "orphanet"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "BRAF", "system": "cosmic"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "2284096", "system": "pubmed"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "uc003vwc.5", "system": "ucsc"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "164757", "system": "omim"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "NM_004333", "system": "refseq"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "CCDS87555", "system": "ccds"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "P15056", "system": "uniprot"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "M95712", "system": "ena.embl"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "OTTHUMG00000157457", "system": "vega"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "1565476", "system": "pubmed"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "CCDS94219", "system": "ccds"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "CCDS94218", "system": "ccds"}, + "relation": "relatedMatch", + }, + ], + "alternativeLabels": ["BRAF1", "BRAF-1", "RAFB1", "NS7", "B-RAF1", "B-raf"], "extensions": [ - {"type": "Extension", "name": "symbol_status", "value": "approved"}, { - "type": "Extension", "name": "approved_name", "value": "B-Raf proto-oncogene, serine/threonine kinase", }, { - "type": "Extension", - "name": "hgnc_locations", - "value": [ - { - "_id": "ga4gh:VCL.O6yCQ1cnThOrTfK9YUgMlTfM6HTqbrKw", - "type": "ChromosomeLocation", - "species_id": "taxonomy:9606", - "chr": "7", - "interval": { - "end": "q34", - "start": "q34", - "type": "CytobandInterval", - }, - } - ], - }, - { - "type": "Extension", "name": "ensembl_locations", "value": [ { - "_id": "ga4gh:VSL.amNWL6i7F2nbSZAf2QLTRTujxuDrd0pR", "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", - "interval": { - "start": {"type": "Number", "value": 140719326}, - "end": {"type": "Number", "value": 140924929}, - "type": "SequenceInterval", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", }, + "start": 140719326, + "end": 140924929, } ], }, { - "type": "Extension", "name": "ncbi_locations", "value": [ { - "_id": "ga4gh:VCL.O6yCQ1cnThOrTfK9YUgMlTfM6HTqbrKw", - "type": "ChromosomeLocation", - "species_id": "taxonomy:9606", - "chr": "7", - "interval": { - "end": "q34", - "start": "q34", - "type": "CytobandInterval", - }, - }, - { - "_id": "ga4gh:VSL.xZU3kL8F6t2ca6WH_26CWKfNW9-owhR4", "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", - "interval": { - "start": {"type": "Number", "value": 140713327}, - "end": {"type": "Number", "value": 140924929}, - "type": "SequenceInterval", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", }, - }, - ], - }, - { - "type": "Extension", - "name": "associated_with", - "value": [ - "pubmed:2284096", - "refseq:NM_004333", - "iuphar:1943", - "orphanet:119066", - "cosmic:BRAF", - "ena.embl:M95712", - "ccds:CCDS87555", - "ucsc:uc003vwc.5", - "pubmed:1565476", - "vega:OTTHUMG00000157457", - "uniprot:P15056", - "ccds:CCDS5863", - "omim:164757", + "start": 140713327, + "end": 140924929, + } ], }, + {"name": "ncbi_gene_type", "value": "protein-coding"}, { - "type": "Extension", "name": "hgnc_locus_type", "value": "gene with protein product", }, - {"type": "Extension", "name": "ncbi_gene_type", "value": "protein-coding"}, - {"type": "Extension", "name": "ensembl_biotype", "value": "protein_coding"}, - {"type": "Extension", "name": "strand", "value": "-"}, + {"name": "ensembl_biotype", "value": "protein_coding"}, + {"name": "strand", "value": "-"}, + {"name": "symbol_status", "value": "approved"}, ], - "gene_id": "hgnc:1097", } @@ -157,121 +172,198 @@ def braf_gene_descriptor(): def alk_gene_descriptor(): """Create test fixture for ALK gene descriptor params""" return { - "id": "normalize.gene:ALK", + "id": "normalize.gene.hgnc:1097", "type": "Gene", - "label": "ALK", + "label": "BRAF", "description": None, - "xrefs": ["ensembl:ENSG00000171094", "ncbigene:238"], - "alternate_labels": ["NBLST3", "CD246", "ALK1"], + "alternativeLabels": ["BRAF1", "B-raf", "B-RAF1", "BRAF-1", "NS7", "RAFB1"], "extensions": [ + {"name": "symbol_status", "value": "approved", "description": None}, { - "type": "Extension", - "name": "symbol_status", - "value": "approved", - }, - { - "type": "Extension", "name": "approved_name", - "value": "ALK receptor tyrosine kinase", - }, - { - "type": "Extension", - "name": "hgnc_locations", - "value": [ - { - "_id": "ga4gh:VCL.VE7uJHat7zIWFf9AzNM85jj05r1dLzsD", - "type": "ChromosomeLocation", - "species_id": "taxonomy:9606", - "chr": "2", - "interval": { - "end": "p23.1", - "start": "p23.2", - "type": "CytobandInterval", - }, - } - ], + "value": "B-Raf proto-oncogene, serine/threonine kinase", + "description": None, }, + {"name": "strand", "value": "-", "description": None}, { - "type": "Extension", "name": "ensembl_locations", "value": [ { - "_id": "ga4gh:VSL.-k3kxW3qMyV-oBTvTffVZojkJBLs0flu", + "id": "ga4gh:SL.fUv91vYrVHBMg-B_QW7UpOQj50g_49hb", "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g", - "interval": { - "start": {"type": "Number", "value": 29192773}, - "end": {"type": "Number", "value": 29921586}, - "type": "SequenceInterval", + "digest": "fUv91vYrVHBMg-B_QW7UpOQj50g_49hb", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", }, + "start": 140719326, + "end": 140924929, } ], + "description": None, }, { - "type": "Extension", "name": "ncbi_locations", "value": [ { - "_id": "ga4gh:VCL.VE7uJHat7zIWFf9AzNM85jj05r1dLzsD", - "type": "ChromosomeLocation", - "species_id": "taxonomy:9606", - "chr": "2", - "interval": { - "end": "p23.1", - "start": "p23.2", - "type": "CytobandInterval", - }, - }, - { - "_id": "ga4gh:VSL.-k3kxW3qMyV-oBTvTffVZojkJBLs0flu", + "id": "ga4gh:SL.0nPwKHYNnTmJ06G-gSmz8BEhB_NTp-0B", "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g", - "interval": { - "start": {"type": "Number", "value": 29192773}, - "end": {"type": "Number", "value": 29921586}, - "type": "SequenceInterval", + "digest": "0nPwKHYNnTmJ06G-gSmz8BEhB_NTp-0B", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", }, - }, - ], - }, - { - "type": "Extension", - "name": "associated_with", - "value": [ - "ccds:CCDS33172", - "pubmed:8122112", - "orphanet:160020", - "ccds:CCDS86828", - "cosmic:ALK", - "uniprot:Q9UM73", - "omim:105590", - "iuphar:1839", - "hcdmdb:CD246", - "vega:OTTHUMG00000152034", - "ena.embl:D45915", - "refseq:NM_004304", - "ucsc:uc002rmy.4", + "start": 140713327, + "end": 140924929, + } ], + "description": None, }, { - "type": "Extension", "name": "hgnc_locus_type", "value": "gene with protein product", + "description": None, + }, + {"name": "ncbi_gene_type", "value": "protein-coding", "description": None}, + {"name": "ensembl_biotype", "value": "protein_coding", "description": None}, + ], + "mappings": [ + { + "coding": { + "label": None, + "system": "ensembl", + "version": None, + "code": "ENSG00000157764", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "ncbigene", + "version": None, + "code": "673", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "cosmic", + "version": None, + "code": "BRAF", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "ena.embl", + "version": None, + "code": "M95712", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "omim", + "version": None, + "code": "164757", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "iuphar", + "version": None, + "code": "1943", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "ucsc", + "version": None, + "code": "uc003vwc.5", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "vega", + "version": None, + "code": "OTTHUMG00000157457", + }, + "relation": "relatedMatch", }, { - "type": "Extension", - "name": "ncbi_gene_type", - "value": "protein-coding", + "coding": { + "label": None, + "system": "ccds", + "version": None, + "code": "CCDS87555", + }, + "relation": "relatedMatch", }, { - "type": "Extension", - "name": "ensembl_biotype", - "value": "protein_coding", + "coding": { + "label": None, + "system": "uniprot", + "version": None, + "code": "P15056", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "refseq", + "version": None, + "code": "NM_004333", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "pubmed", + "version": None, + "code": "1565476", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "orphanet", + "version": None, + "code": "119066", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "pubmed", + "version": None, + "code": "2284096", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "ccds", + "version": None, + "code": "CCDS5863", + }, + "relation": "relatedMatch", }, - {"type": "Extension", "name": "strand", "value": "-"}, ], - "gene_id": "hgnc:427", - "gene": None, } @@ -513,7 +605,7 @@ def exhaustive_example(alk_gene_descriptor, braf_gene_descriptor): "type": "LinkerSequenceElement", "linker_sequence": { "id": "fusor.sequence:ACGT", - "type": "SequenceDescriptor", + "type": "LiteralSequenceExpression", "label": None, "description": None, "xrefs": None, @@ -647,7 +739,7 @@ def fusion_example(): "type": "LinkerSequenceElement", "linker_sequence": { "id": "fusor.sequence:ACGT", - "type": "SequenceDescriptor", + "type": "LiteralSequenceExpression", "sequence": "ACGT", "residue_type": "SO:0000348", }, diff --git a/tests/test_fusor.py b/tests/test_fusor.py index 8b0c32a..b702932 100644 --- a/tests/test_fusor.py +++ b/tests/test_fusor.py @@ -434,28 +434,6 @@ def test_add_translated_sequence_id(fusor_instance, fusion_example): assert actual_fusion.model_dump() == expected_fusion.model_dump() -def test_add_location_id(fusor_instance, fusion_example, exhaustive_example): - """Test that add_location_id method works correctly.""" - fusion = fusor_instance.add_location_id(CategoricalFusion(**fusion_example)) - actual = CategoricalFusion(**exhaustive_example) - - assert ( - fusion.criticalFunctionalDomains[0].sequence_location.location_id - == actual.criticalFunctionalDomains[0].sequence_location.location_id - ) - assert ( - fusion.structure[0].elementGenomicStart.location_id - == actual.structure[0].elementGenomicStart.location_id - ) - assert ( - fusion.structure[0].elementGenomicEnd.location_id - == actual.structure[0].elementGenomicEnd.location_id - ) - assert ( - fusion.structure[3].region.location_id == actual.structure[3].region.location_id - ) - - def test__normalized_gene(fusor_instance): """Test that _normalized_gene works correctly.""" # Actual response is tested in test_add_gene_descriptor @@ -474,7 +452,6 @@ def test_add_gene_descriptor(fusor_instance, exhaustive_example, fusion_example) expected_fusion = CategoricalFusion(**exhaustive_example) actual = CategoricalFusion(**fusion_example) fusor_instance.add_translated_sequence_id(actual) - fusor_instance.add_location_id(actual) fusor_instance.add_gene_descriptor(actual) e_gds = set() @@ -807,7 +784,6 @@ def test_templated_sequence_element( 150, "NC_000001.11", "+", - add_location_id=True, seq_id_target_namespace="ga4gh", ) assert tsg.model_dump() == expected diff --git a/tests/test_models.py b/tests/test_models.py index 397989f..728de0d 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -423,9 +423,8 @@ def test_transcript_segment_element(transcript_segments): exonStartOffset="-9", exonEnd="8", exonEndOffset="7", - gene_descriptor={ + gene={ "id": "test:1", - "gene": {"id": "hgnc:1"}, "label": "G1", }, elementGenomicStart={ @@ -453,9 +452,8 @@ def test_transcript_segment_element(transcript_segments): transcript="NM_152263.3", exonStart="1", exonStartOffset="-9", - gene_descriptor={ + gene={ "id": "test:1", - "gene": {"id": "hgnc:1"}, "label": "G1", }, ) @@ -469,9 +467,8 @@ def test_transcript_segment_element(transcript_segments): transcript="NM_152263.3", exonStartOffset="-9", exonEndOffset="7", - gene_descriptor={ + gene={ "id": "test:1", - "gene": {"id": "hgnc:1"}, "label": "G1", }, elementGenomicStart={ @@ -587,7 +584,7 @@ def test_gene_element(gene_examples): # test enum validation with pytest.raises(ValidationError) as exc_info: - assert GeneElement(type="UnknownGeneElement", gene_descriptor=gene_examples[0]) + assert GeneElement(type="UnknownGeneElement", gene=gene_examples[0]) msg = "Input should be " check_validation_error(exc_info, msg) @@ -809,13 +806,13 @@ def test_fusion_element_count( ) check_validation_error(exc_info, uq_gene_error_msg) - # use alternate gene descriptor structure + # use alternate gene structure with pytest.raises(ValidationError) as exc_info: assert AssayedFusion( type="AssayedFusion", structure=[ - {"type": "GeneElement", "gene_descriptor": gene_examples[6]}, - {"type": "GeneElement", "gene_descriptor": gene_examples[6]}, + {"type": "GeneElement", "gene": gene_examples[6]}, + {"type": "GeneElement", "gene": gene_examples[6]}, ], causativeEvent={ "type": "CausativeEvent", @@ -833,7 +830,7 @@ def test_fusion_element_count( assert AssayedFusion( type="AssayedFusion", structure=[ - {"type": "GeneElement", "gene_descriptor": gene_examples[6]}, + {"type": "GeneElement", "gene": gene_examples[6]}, ], regulatoryElement={ "type": "RegulatoryElement", diff --git a/tests/test_nomenclature.py b/tests/test_nomenclature.py index e1216b6..ef87b99 100644 --- a/tests/test_nomenclature.py +++ b/tests/test_nomenclature.py @@ -40,7 +40,7 @@ def reg_example(): "type": "Assay", "assayName": "a", "assayId": "a:b", - "methodUti": "a:b", + "methodUri": "a:b", "fusionDetection": "observed", }, ) From 9af8c720eced7484745b4b88f6eb85b9f2156771 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Wed, 24 Jul 2024 12:13:15 -0400 Subject: [PATCH 61/82] update tests and examples with new models --- tests/conftest.py | 82 +++++++++++++++++++++++------------------------ 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index cd16436..425b416 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -172,16 +172,16 @@ def braf_gene_descriptor(): def alk_gene_descriptor(): """Create test fixture for ALK gene descriptor params""" return { - "id": "normalize.gene.hgnc:1097", + "id": "normalize.gene.hgnc:427", "type": "Gene", - "label": "BRAF", + "label": "ALK", "description": None, - "alternativeLabels": ["BRAF1", "B-raf", "B-RAF1", "BRAF-1", "NS7", "RAFB1"], + "alternativeLabels": ["NBLST3", "CD246", "ALK1"], "extensions": [ {"name": "symbol_status", "value": "approved", "description": None}, { "name": "approved_name", - "value": "B-Raf proto-oncogene, serine/threonine kinase", + "value": "ALK receptor tyrosine kinase", "description": None, }, {"name": "strand", "value": "-", "description": None}, @@ -189,15 +189,15 @@ def alk_gene_descriptor(): "name": "ensembl_locations", "value": [ { - "id": "ga4gh:SL.fUv91vYrVHBMg-B_QW7UpOQj50g_49hb", + "id": "ga4gh:SL.V-yTsF-F4eHxeDHeU5KZIF3ZOzE2vUnG", "type": "SequenceLocation", - "digest": "fUv91vYrVHBMg-B_QW7UpOQj50g_49hb", + "digest": "V-yTsF-F4eHxeDHeU5KZIF3ZOzE2vUnG", "sequenceReference": { "type": "SequenceReference", - "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + "refgetAccession": "SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g", }, - "start": 140719326, - "end": 140924929, + "start": 29192773, + "end": 29921586, } ], "description": None, @@ -206,15 +206,15 @@ def alk_gene_descriptor(): "name": "ncbi_locations", "value": [ { - "id": "ga4gh:SL.0nPwKHYNnTmJ06G-gSmz8BEhB_NTp-0B", + "id": "ga4gh:SL.V-yTsF-F4eHxeDHeU5KZIF3ZOzE2vUnG", "type": "SequenceLocation", - "digest": "0nPwKHYNnTmJ06G-gSmz8BEhB_NTp-0B", + "digest": "V-yTsF-F4eHxeDHeU5KZIF3ZOzE2vUnG", "sequenceReference": { "type": "SequenceReference", - "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + "refgetAccession": "SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g", }, - "start": 140713327, - "end": 140924929, + "start": 29192773, + "end": 29921586, } ], "description": None, @@ -233,7 +233,7 @@ def alk_gene_descriptor(): "label": None, "system": "ensembl", "version": None, - "code": "ENSG00000157764", + "code": "ENSG00000171094", }, "relation": "relatedMatch", }, @@ -242,115 +242,115 @@ def alk_gene_descriptor(): "label": None, "system": "ncbigene", "version": None, - "code": "673", + "code": "238", }, "relation": "relatedMatch", }, { "coding": { "label": None, - "system": "cosmic", + "system": "orphanet", "version": None, - "code": "BRAF", + "code": "160020", }, "relation": "relatedMatch", }, { "coding": { "label": None, - "system": "ena.embl", + "system": "hcdmdb", "version": None, - "code": "M95712", + "code": "CD246", }, "relation": "relatedMatch", }, { "coding": { "label": None, - "system": "omim", + "system": "ucsc", "version": None, - "code": "164757", + "code": "uc002rmy.4", }, "relation": "relatedMatch", }, { "coding": { "label": None, - "system": "iuphar", + "system": "refseq", "version": None, - "code": "1943", + "code": "NM_004304", }, "relation": "relatedMatch", }, { "coding": { "label": None, - "system": "ucsc", + "system": "ccds", "version": None, - "code": "uc003vwc.5", + "code": "CCDS33172", }, "relation": "relatedMatch", }, { "coding": { "label": None, - "system": "vega", + "system": "omim", "version": None, - "code": "OTTHUMG00000157457", + "code": "105590", }, "relation": "relatedMatch", }, { "coding": { "label": None, - "system": "ccds", + "system": "ena.embl", "version": None, - "code": "CCDS87555", + "code": "D45915", }, "relation": "relatedMatch", }, { "coding": { "label": None, - "system": "uniprot", + "system": "vega", "version": None, - "code": "P15056", + "code": "OTTHUMG00000152034", }, "relation": "relatedMatch", }, { "coding": { "label": None, - "system": "refseq", + "system": "uniprot", "version": None, - "code": "NM_004333", + "code": "Q9UM73", }, "relation": "relatedMatch", }, { "coding": { "label": None, - "system": "pubmed", + "system": "iuphar", "version": None, - "code": "1565476", + "code": "1839", }, "relation": "relatedMatch", }, { "coding": { "label": None, - "system": "orphanet", + "system": "pubmed", "version": None, - "code": "119066", + "code": "8122112", }, "relation": "relatedMatch", }, { "coding": { "label": None, - "system": "pubmed", + "system": "cosmic", "version": None, - "code": "2284096", + "code": "ALK", }, "relation": "relatedMatch", }, @@ -359,7 +359,7 @@ def alk_gene_descriptor(): "label": None, "system": "ccds", "version": None, - "code": "CCDS5863", + "code": "CCDS86828", }, "relation": "relatedMatch", }, From 871c3d906cf0e678df54bc6bcc0c451880971cd1 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Wed, 24 Jul 2024 12:42:11 -0400 Subject: [PATCH 62/82] update tests and examples with new models --- src/fusor/examples/bcr_abl1.json | 6 +- src/fusor/examples/bcr_abl1_expanded.json | 14 +- src/fusor/fusor.py | 5 +- src/fusor/nomenclature.py | 4 +- tests/conftest.py | 673 ++++++++++++---------- tests/test_fusor.py | 8 +- tests/test_models.py | 2 - tests/test_nomenclature.py | 12 +- 8 files changed, 408 insertions(+), 316 deletions(-) diff --git a/src/fusor/examples/bcr_abl1.json b/src/fusor/examples/bcr_abl1.json index 18b3c5b..2fddf59 100644 --- a/src/fusor/examples/bcr_abl1.json +++ b/src/fusor/examples/bcr_abl1.json @@ -14,7 +14,7 @@ "type": "SequenceLocation", "description": null, "xrefs": null, - "alternate_labels": null, + "alternativeLabels": null, "extensions": null, "sequenceReference": { "id": "refseq:NC_000022.11", @@ -49,7 +49,7 @@ "id": "ga4gh:SL.tZYgaEJP2-d4Guv-n5gyhqOc07qH9xr6", "description": null, "xrefs": null, - "alternate_labels": null, + "alternativeLabels": null, "extensions": null, "type": "SequenceLocation", "sequenceReference": { @@ -78,7 +78,7 @@ "id": "ga4gh:SL.VQe2sf2aYArPcvjygq38JvFxRuDniE15", "description": null, "xrefs": null, - "alternate_labels": null, + "alternativeLabels": null, "extensions": null, "type": "SequenceLocation", "sequenceReference": { diff --git a/src/fusor/examples/bcr_abl1_expanded.json b/src/fusor/examples/bcr_abl1_expanded.json index d063d22..b00b115 100644 --- a/src/fusor/examples/bcr_abl1_expanded.json +++ b/src/fusor/examples/bcr_abl1_expanded.json @@ -14,7 +14,7 @@ "label": "BCR", "description": null, "xrefs": null, - "alternate_labels": null, + "alternativeLabels": null, "extensions": null, "id": "hgnc:1014" }, @@ -24,7 +24,7 @@ "type": "SequenceLocation", "description": null, "xrefs": null, - "alternate_labels": null, + "alternativeLabels": null, "extensions": null, "sequenceReference": { "id": "refseq:NC_000022.11", @@ -43,7 +43,7 @@ "label": null, "description": null, "xrefs": null, - "alternate_labels": null, + "alternativeLabels": null, "extensions": null, "sequence": "ACTAAAGCG" } @@ -60,7 +60,7 @@ "label": "ABL1", "description": null, "xrefs": null, - "alternate_labels": null, + "alternativeLabels": null, "extensions": null, "id": "hgnc:76" }, @@ -68,7 +68,7 @@ "id": "ga4gh:SL.tZYgaEJP2-d4Guv-n5gyhqOc07qH9xr6", "description": null, "xrefs": null, - "alternate_labels": null, + "alternativeLabels": null, "extensions": null, "type": "SequenceLocation", "sequenceReference": { @@ -92,7 +92,7 @@ "label": "ABL1", "description": null, "xrefs": null, - "alternate_labels": null, + "alternativeLabels": null, "extensions": null, "id": "hgnc:76" }, @@ -102,7 +102,7 @@ "id": "ga4gh:SL.VQe2sf2aYArPcvjygq38JvFxRuDniE15", "description": null, "xrefs": null, - "alternate_labels": null, + "alternativeLabels": null, "extensions": null, "type": "SequenceLocation", "sequenceReference": { diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index e6b7ce8..f102df7 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -416,8 +416,7 @@ def functional_domain( :param sequence_id: protein sequence on which provided coordinates are located :param start: start position on sequence :param end: end position on sequence - :param use_minimal_gene: ``True`` if minimal gene object (``id``, - ``gene_id``, ``label``) will be used. ``False`` if gene-normalizer's gene + :param use_minimal_gene: ``True`` if minimal gene object (``id``, ``label``) will be used. ``False`` if gene-normalizer's gene object will be used :param seq_id_target_namespace: If want to use digest for ``sequence_id``, set this to the namespace you want the digest for. Otherwise, leave as ``None``. @@ -679,7 +678,7 @@ def generate_nomenclature(self, fusion: Fusion) -> str: elif isinstance(element, UnknownGeneElement): parts.append("?") elif isinstance(element, LinkerElement): - parts.append(element.linkerSequence.sequence) + parts.append(element.linkerSequence.sequence.root) elif isinstance(element, TranscriptSegmentElement): if not any( [gene == element.gene.label for gene in element_genes] # noqa: C419 diff --git a/src/fusor/nomenclature.py b/src/fusor/nomenclature.py index 4939bf8..bf8bc58 100644 --- a/src/fusor/nomenclature.py +++ b/src/fusor/nomenclature.py @@ -1,7 +1,7 @@ """Provide helper methods for fusion nomenclature generation.""" from biocommons.seqrepo.seqrepo import SeqRepo -from ga4gh.vrs.models import SequenceLocation +from ga4gh.vrs.models import SequenceReference from fusor.exceptions import IDTranslationException from fusor.models import ( @@ -97,7 +97,7 @@ def templated_seq_nomenclature(element: TemplatedSequenceElement, sr: SeqRepo) - region = element.region if region: sequence_reference = element.region.sequenceReference - if isinstance(sequence_reference, SequenceLocation): + if isinstance(sequence_reference, SequenceReference): sequence_id = str(sequence_reference.id) refseq_id = str(translate_identifier(sr, sequence_id, "refseq")) start = region.start diff --git a/tests/conftest.py b/tests/conftest.py index 425b416..e3689b5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -46,11 +46,11 @@ def fusor_instance(): @pytest.fixture(scope="session") -def braf_gene_descriptor(): - """Create gene descriptor params for BRAF.""" +def braf_gene(): + """Create gene params for BRAF.""" return { "type": "Gene", - "id": "normalize.gene.hgnc:1097", + "id": "hgnc:1097", "label": "BRAF", "mappings": [ { @@ -169,10 +169,10 @@ def braf_gene_descriptor(): @pytest.fixture(scope="session") -def alk_gene_descriptor(): - """Create test fixture for ALK gene descriptor params""" +def alk_gene(): + """Create test fixture for ALK gene params""" return { - "id": "normalize.gene.hgnc:427", + "id": "hgnc:427", "type": "Gene", "label": "ALK", "description": None, @@ -367,39 +367,293 @@ def alk_gene_descriptor(): } +@pytest.fixture(scope="session") +def tpm3_gene(): + """Create test fixture for TPM3 gene""" + return { + "id": "hgnc:12012", + "type": "Gene", + "label": "TPM3", + "description": None, + "alternativeLabels": [ + "TM3", + "NEM1~withdrawn", + "TM30", + "TM5", + "TRK", + "HEL-S-82p", + "NEM1", + "OK/SW-cl.5", + "TM30nm", + "hscp30", + "FLJ35371", + "TPMsk3", + "HEL-189", + "CFTD", + "TPM3nu", + "TM-5", + "CAPM1", + ], + "extensions": [ + {"name": "symbol_status", "value": "approved", "description": None}, + {"name": "approved_name", "value": "tropomyosin 3", "description": None}, + { + "name": "previous_symbols", + "value": ["FLJ35371", "NEM1", "NEM1~withdrawn"], + "description": None, + }, + {"name": "strand", "value": "-", "description": None}, + { + "name": "ensembl_locations", + "value": [ + { + "id": "ga4gh:SL.cgdnkG0tZq9SpwTHMWMG4sjT9JGXQ-Ap", + "type": "SequenceLocation", + "digest": "cgdnkG0tZq9SpwTHMWMG4sjT9JGXQ-Ap", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 154155307, + "end": 154194648, + } + ], + "description": None, + }, + { + "name": "ncbi_locations", + "value": [ + { + "id": "ga4gh:SL.aVsAgF9lwnjLgy-DXECiDgavt5F0OsYR", + "type": "SequenceLocation", + "digest": "aVsAgF9lwnjLgy-DXECiDgavt5F0OsYR", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + }, + "start": 154155307, + "end": 154192100, + } + ], + "description": None, + }, + { + "name": "hgnc_locus_type", + "value": "gene with protein product", + "description": None, + }, + {"name": "ncbi_gene_type", "value": "protein-coding", "description": None}, + {"name": "ensembl_biotype", "value": "protein_coding", "description": None}, + ], + "mappings": [ + { + "coding": { + "label": None, + "system": "ensembl", + "version": None, + "code": "ENSG00000143549", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "ncbigene", + "version": None, + "code": "7170", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "ccds", + "version": None, + "code": "CCDS41403", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "ucsc", + "version": None, + "code": "uc001fec.3", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "pubmed", + "version": None, + "code": "25369766", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "ccds", + "version": None, + "code": "CCDS41401", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "ccds", + "version": None, + "code": "CCDS60275", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "uniprot", + "version": None, + "code": "P06753", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "cosmic", + "version": None, + "code": "TPM3", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "ccds", + "version": None, + "code": "CCDS60274", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "ena.embl", + "version": None, + "code": "BC008425", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "ccds", + "version": None, + "code": "CCDS41402", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "ccds", + "version": None, + "code": "CCDS1060", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "omim", + "version": None, + "code": "191030", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "orphanet", + "version": None, + "code": "120227", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "ccds", + "version": None, + "code": "CCDS41400", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "refseq", + "version": None, + "code": "NM_152263", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "vega", + "version": None, + "code": "OTTHUMG00000035853", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "pubmed", + "version": None, + "code": "1829807", + }, + "relation": "relatedMatch", + }, + { + "coding": { + "label": None, + "system": "ccds", + "version": None, + "code": "CCDS72922", + }, + "relation": "relatedMatch", + }, + ], + } + + @pytest.fixture(scope="module") -def exhaustive_example(alk_gene_descriptor, braf_gene_descriptor): +def exhaustive_example(alk_gene, braf_gene, tpm3_gene): """Create test fixture for a fake fusion exemplifying most major field types, in 'expanded' form (ie properties augmented by VICC descriptors) """ return { "type": "CategoricalFusion", - "critical_functional_domains": [ + "criticalFunctionalDomains": [ { "type": "FunctionalDomain", - "_id": "interpro:IPR020635", + "id": "interpro:IPR020635", "label": "Tyrosine-protein kinase, catalytic domain", "status": "lost", - "associatedGene": alk_gene_descriptor, - "sequence_location": { - "id": "fusor.location_descriptor:NP_004295.2", - "type": "LocationDescriptor", - "label": None, - "description": None, - "xrefs": None, - "alternate_labels": None, - "extensions": None, - "location_id": "ga4gh:VSL.hQKhk6ZOOYZAmShXrzhfb6H3j65ovsKu", - "location": { - "id": None, - "type": "SequenceLocation", - "sequence_id": "refseq:NP_004295.2", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 1116}, - "end": {"type": "Number", "value": 1383}, - }, + "associatedGene": alk_gene, + "sequenceLocation": { + "id": "ga4gh:SL.aYx-iUOFEw7GVZb4fwrQLkQQahpiIAVp", + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NP_004295.2", + "refgetAccession": "SQ.q9CnK-HKWh9eqhOi8FlzR7M0pCmUrWPs", + "type": "SequenceReference", }, + "start": 1116, + "end": 1383, }, } ], @@ -407,245 +661,85 @@ def exhaustive_example(alk_gene_descriptor, braf_gene_descriptor): { "type": "TranscriptSegmentElement", "transcript": "refseq:NM_152263.3", - "exon_start": 1, - "exon_start_offset": 0, - "exon_end": 8, - "exon_end_offset": 0, - "gene_descriptor": { - "id": "normalize.gene:TPM3", - "type": "Gene", - "label": "TPM3", - "description": None, - "xrefs": ["ensembl:ENSG00000143549", "ncbigene:7170"], - "alternate_labels": [ - "TM-5", - "TM5", - "NEM1~withdrawn", - "OK/SW-cl.5", - "TM30nm", - "TPMsk3", - "HEL-S-82p", - "TRK", - "CAPM1", - "TPM3nu", - "FLJ35371", - "TM30", - "TM3", - "CFTD", - "NEM1", - "hscp30", - "HEL-189", - ], - "extensions": [ - { - "type": "Extension", - "name": "symbol_status", - "value": "approved", - }, - { - "type": "Extension", - "name": "approved_name", - "value": "tropomyosin 3", - }, - { - "type": "Extension", - "name": "hgnc_locations", - "value": [ - { - "_id": "ga4gh:VCL.rmJvYV5JccRSEoMVxe5BmuHs9S2VZ4uR", - "type": "ChromosomeLocation", - "species_id": "taxonomy:9606", - "chr": "1", - "interval": { - "end": "q21.3", - "start": "q21.3", - "type": "CytobandInterval", - }, - } - ], - }, - { - "type": "Extension", - "name": "ensembl_locations", - "value": [ - { - "_id": "ga4gh:VSL._ASa2-iBSDZSpC3JlpwJxzv4OY5M-5Ct", - "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", - "interval": { - "start": {"type": "Number", "value": 154155307}, - "end": {"type": "Number", "value": 154194648}, - "type": "SequenceInterval", - }, - } - ], - }, - { - "type": "Extension", - "name": "ncbi_locations", - "value": [ - { - "_id": "ga4gh:VCL.rmJvYV5JccRSEoMVxe5BmuHs9S2VZ4uR", - "type": "ChromosomeLocation", - "species_id": "taxonomy:9606", - "chr": "1", - "interval": { - "end": "q21.3", - "start": "q21.3", - "type": "CytobandInterval", - }, - }, - { - "_id": "ga4gh:VSL.sGJqQhhTg3BYlndAP7nFzN7KoKID1yP_", - "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", - "interval": { - "start": {"type": "Number", "value": 154155307}, - "end": {"type": "Number", "value": 154192100}, - "type": "SequenceInterval", - }, - }, - ], - }, - { - "type": "Extension", - "name": "associated_with", - "value": [ - "vega:OTTHUMG00000035853", - "ccds:CCDS41400", - "ccds:CCDS1060", - "ccds:CCDS41402", - "ccds:CCDS41401", - "pubmed:25369766", - "cosmic:TPM3", - "refseq:NM_152263", - "orphanet:120227", - "uniprot:P06753", - "ccds:CCDS72922", - "ccds:CCDS60274", - "ucsc:uc001fec.3", - "omim:191030", - "ccds:CCDS41403", - "ena.embl:BC008425", - "ccds:CCDS60275", - "pubmed:1829807", - ], - }, - { - "type": "Extension", - "name": "previous_symbols", - "value": ["NEM1", "NEM1~withdrawn", "FLJ35371"], - }, - { - "type": "Extension", - "name": "hgnc_locus_type", - "value": "gene with protein product", - }, - { - "type": "Extension", - "name": "ncbi_gene_type", - "value": "protein-coding", - }, - { - "type": "Extension", - "name": "ensembl_biotype", - "value": "protein_coding", - }, - {"type": "Extension", "name": "strand", "value": "-"}, - ], - "gene_id": "hgnc:12012", - "gene": None, - }, + "exonStart": 1, + "exonStartOffset": 0, + "exonEnd": 8, + "exonEndOffset": 0, + "gene": tpm3_gene, "elementGenomicStart": { - "id": "fusor.location_descriptor:NC_000001.11", - "type": "LocationDescriptor", - "label": "NC_000001.11", + "id": "ga4gh:SL.2K1vML0ofuYrYncrzzXUQOISRFJldZrO", "description": None, "xrefs": None, - "alternate_labels": None, + "alternativeLabels": None, "extensions": None, - "location_id": "ga4gh:VSL.n7i6VMRAuSgAjwVopxhWAJdlPJMfk7KR", - "location": { - "id": None, - "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 154192135}, - "end": {"type": "Number", "value": 154192136}, - }, + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceReference", }, + "start": 154192135, + "end": 154192136, }, "elementGenomicEnd": { - "id": "fusor.location_descriptor:NC_000001.11", - "type": "LocationDescriptor", - "label": "NC_000001.11", + "id": "ga4gh:SL.rtR6x2NnJEpROlxiT_DY9C-spf6ijYQi", "description": None, "xrefs": None, - "alternate_labels": None, + "alternativeLabels": None, "extensions": None, - "location_id": "ga4gh:VSL.wQ4TpNbsTPq_A-eQTL44gbP3f4fnp0vx", - "location": { - "id": None, - "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 154170399}, - "end": {"type": "Number", "value": 154170400}, - }, + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceReference", }, + "start": 154170399, + "end": 154170400, }, }, { "type": "GeneElement", - "gene_descriptor": alk_gene_descriptor, + "gene": alk_gene, }, { "type": "LinkerSequenceElement", - "linker_sequence": { + "linkerSequence": { "id": "fusor.sequence:ACGT", "type": "LiteralSequenceExpression", "label": None, "description": None, "xrefs": None, - "alternate_labels": None, + "alternativeLabels": None, "extensions": None, "sequence_id": None, "sequence": "ACGT", - "residue_type": "SO:0000348", }, }, { "type": "TemplatedSequenceElement", "region": { - "id": "fusor.location_descriptor:NC_000023.11", - "type": "LocationDescriptor", - "label": None, + "id": "ga4gh:SL.gb3ew2XQ-Doi1AtvlmajeZO7fS1eDPg_", "description": None, "xrefs": None, - "alternate_labels": None, + "alternativeLabels": None, "extensions": None, - "location_id": "ga4gh:VSL.zd12pX_ju2gLq9a9UOYgM8AtbkuhnyUu", - "location": { - "id": None, - "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 44908820}, - "end": {"type": "Number", "value": 44908822}, - }, + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000023.11", + "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + "type": "SequenceReference", }, + "start": 44908820, + "end": 44908822, }, "strand": "+", }, {"type": "MultiplePossibleGenesElement"}, ], - "regulatory_element": { + "regulatoryElement": { "type": "RegulatoryElement", - "regulatory_class": "promoter", - "associatedGene": braf_gene_descriptor, + "regulatoryClass": "promoter", + "associatedGene": braf_gene, }, } @@ -655,11 +749,11 @@ def fusion_example(): """Create test fixture for a fake fusion without additional property expansion.""" return { "type": "CategoricalFusion", - "reading_frame_preserved": True, - "critical_functional_domains": [ + "readingFramePreserved": True, + "criticalFunctionalDomains": [ { "type": "FunctionalDomain", - "_id": "interpro:IPR020635", + "id": "interpro:IPR020635", "label": "Tyrosine-protein kinase, catalytic domain", "status": "lost", "associatedGene": { @@ -668,18 +762,20 @@ def fusion_example(): "label": "ALK", "gene_id": "hgnc:427", }, - "sequence_location": { - "id": "fusor.location_descriptor:NP_004295.2", - "type": "LocationDescriptor", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NP_004295.2", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 1116}, - "end": {"type": "Number", "value": 1383}, - }, + "sequenceLocation": { + "id": "ga4gh:SL.aYx-iUOFEw7GVZb4fwrQLkQQahpiIAVp", + "description": None, + "xrefs": None, + "alternativeLabels": None, + "extensions": None, + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NP_004295.2", + "refgetAccession": "SQ.q9CnK-HKWh9eqhOi8FlzR7M0pCmUrWPs", + "type": "SequenceReference", }, + "start": 1116, + "end": 1383, }, } ], @@ -687,91 +783,90 @@ def fusion_example(): { "type": "TranscriptSegmentElement", "transcript": "refseq:NM_152263.3", - "exon_start": 1, - "exon_start_offset": 0, - "exon_end": 8, - "exon_end_offset": 0, - "gene_descriptor": { - "id": "normalize.gene:TPM3", + "exonStart": 1, + "exonStartOffset": 0, + "exonEnd": 8, + "exonEndOffset": 0, + "gene": { "type": "Gene", "label": "TPM3", - "gene_id": "hgnc:12012", + "id": "hgnc:12012", }, "elementGenomicStart": { - "id": "fusor.location_descriptor:NC_000001.11", - "type": "LocationDescriptor", - "label": "NC_000001.11", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NC_000001.11", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 154192135}, - "end": {"type": "Number", "value": 154192136}, - }, + "id": "ga4gh:SL.2K1vML0ofuYrYncrzzXUQOISRFJldZrO", + "description": None, + "xrefs": None, + "alternativeLabels": None, + "extensions": None, + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceReference", }, + "start": 154192135, + "end": 154192136, }, "elementGenomicEnd": { - "id": "fusor.location_descriptor:NC_000001.11", - "type": "LocationDescriptor", - "label": "NC_000001.11", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NC_000001.11", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 154170399}, - "end": {"type": "Number", "value": 154170400}, - }, + "id": "ga4gh:SL.rtR6x2NnJEpROlxiT_DY9C-spf6ijYQi", + "description": None, + "xrefs": None, + "alternativeLabels": None, + "extensions": None, + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceReference", }, + "start": 154170399, + "end": 154170400, }, }, { "type": "GeneElement", - "gene_descriptor": { - "id": "normalize.gene:ALK", + "gene": { "type": "Gene", "label": "ALK", - "gene_id": "hgnc:427", + "id": "hgnc:427", }, }, { "type": "LinkerSequenceElement", - "linker_sequence": { + "linkerSequence": { "id": "fusor.sequence:ACGT", "type": "LiteralSequenceExpression", "sequence": "ACGT", - "residue_type": "SO:0000348", }, }, { "type": "TemplatedSequenceElement", "region": { - "id": "fusor.location_descriptor:NC_000023.11", - "type": "LocationDescriptor", - "location_id": "ga4gh:VSL.q0Hnb9gpYDyUuTix4Fesy5ungdnc4dWm", - "location": { - "type": "SequenceLocation", - "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 44908820}, - "end": {"type": "Number", "value": 44908822}, - }, + "id": "ga4gh:SL.gb3ew2XQ-Doi1AtvlmajeZO7fS1eDPg_", + "description": None, + "xrefs": None, + "alternativeLabels": None, + "extensions": None, + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000023.11", + "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", + "type": "SequenceReference", }, + "start": 44908820, + "end": 44908822, }, "strand": "+", }, {"type": "MultiplePossibleGenesElement"}, ], - "regulatory_element": { + "regulatoryElement": { "type": "RegulatoryElement", - "regulatory_class": "promoter", + "regulatoryClass": "promoter", "associatedGene": { - "id": "gene:BRAF", "type": "Gene", "label": "BRAF", - "gene_id": "hgnc:1097", + "id": "hgnc:1097", }, }, } diff --git a/tests/test_fusor.py b/tests/test_fusor.py index b702932..b232dbd 100644 --- a/tests/test_fusor.py +++ b/tests/test_fusor.py @@ -328,12 +328,12 @@ def compare_gene_descriptor(actual: dict, expected: dict): assert set(actual["xrefs"]) == set(expected["xrefs"]), "xrefs" else: assert actual["xrefs"] == expected["xrefs"] - if expected["alternate_labels"]: - assert set(actual["alternate_labels"]) == set( - expected["alternate_labels"] + if expected["alternativeLabels"]: + assert set(actual["alternativeLabels"]) == set( + expected["alternativeLabels"] ), "alt labels" else: - assert actual["alternate_labels"] == expected["alternate_labels"] + assert actual["alternativeLabels"] == expected["alternativeLabels"] assert "extensions" in actual if expected["extensions"]: assert len(actual["extensions"]) == len( diff --git a/tests/test_models.py b/tests/test_models.py index 728de0d..20f20f9 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -730,7 +730,6 @@ def test_fusion( "id": "a:b", "type": "LiteralSequenceExpression", "sequence": "AC", - "residue_type": "SO:0000348", }, }, { @@ -739,7 +738,6 @@ def test_fusion( "id": "a:b", "type": "LiteralSequenceExpression", "sequence": "AC", - "residue_type": "SO:0000348", }, }, ], diff --git a/tests/test_nomenclature.py b/tests/test_nomenclature.py index ef87b99..875ad68 100644 --- a/tests/test_nomenclature.py +++ b/tests/test_nomenclature.py @@ -154,8 +154,8 @@ def tx_seg_example(): "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", "type": "SequenceReference", }, - "start": {"type": "Number", "value": 154192135}, - "end": {"type": "Number", "value": 154192136}, + "start": 154192135, + "end": 154192136, }, elementGenomicEnd={ "id": "ga4gh:SL.rtR6x2NnJEpROlxiT_DY9C-spf6ijYQi", @@ -165,8 +165,8 @@ def tx_seg_example(): "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", "type": "SequenceReference", }, - "start": {"type": "Number", "value": 154170399}, - "end": {"type": "Number", "value": 154170400}, + "start": 154170399, + "end": 154170400, }, ) @@ -192,8 +192,8 @@ def junction_example(): "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", "type": "SequenceReference", }, - "start": {"type": "Number", "value": 154170399}, - "end": {"type": "Number", "value": 154170400}, + "start": 154170399, + "end": 154170400, }, ) From 01ef72203843f76f71cce2b24c762c7cf9f1ad36 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Wed, 24 Jul 2024 13:16:42 -0400 Subject: [PATCH 63/82] refactor: moving around logic to make more readable --- src/fusor/fusor.py | 97 ++-------------------------------------------- 1 file changed, 4 insertions(+), 93 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index f102df7..e96de43 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -23,7 +23,6 @@ from fusor.exceptions import FUSORParametersException, IDTranslationException from fusor.models import ( - AdditionalFields, Assay, AssayedFusion, AssayedFusionElements, @@ -544,40 +543,6 @@ def _sequence_location( return sequence_location - def add_additional_fields( - self, - fusion: Fusion, - add_all: bool = True, - fields: list[AdditionalFields] | None = None, - target_namespace: str = "ga4gh", - ) -> Fusion: - """Add additional fields to Fusion object. - - Possible fields are shown in ``AdditionalFields`` - - :param fusion: A valid Fusion object - :param add_all: ``True`` if all additional fields will be added in fusion - object. ``False`` if only select fields will be provided. If set to - ``True``, will always take precedence over ``fields``. - :param fields: Select fields that will be set. Must be a subset of - ``AdditionalFields`` - :param target_namespace: The namespace of identifiers to return for - ``sequence_id``. Default is ``ga4gh`` - :return: Updated fusion with specified fields set - """ - if add_all: - self.add_translated_sequence_id(fusion, target_namespace) - else: - if fields: - for field in fields: - if field == AdditionalFields.SEQUENCE_ID.value: - self.add_translated_sequence_id( - fusion, target_namespace=target_namespace - ) - else: - _logger.warning("Invalid field: %s", field) - return fusion - @staticmethod def _location_id(location: dict) -> CURIE: """Return GA4GH digest for location @@ -587,62 +552,8 @@ def _location_id(location: dict) -> CURIE: """ return ga4gh_identify(models.Location(**location)) - def add_translated_sequence_id( - self, fusion: Fusion, target_namespace: str = "ga4gh" - ) -> Fusion: - """Translate sequence_ids in fusion object. - - :param fusion: A valid Fusion object - :param target_namespace: ID namespace to translate sequence IDs to - :return: Updated fusion with ``sequence_id`` fields set - """ - for element in fusion.structure: - if isinstance(element, TemplatedSequenceElement): - location = element.region.location - if location.type == SequenceLocation: - try: - new_id = translate_identifier( - self.seqrepo, location.sequence_id, target_namespace - ) - except IDTranslationException: - pass - else: - element.region.location.sequence_id = new_id - elif isinstance(element, TranscriptSegmentElement): - for loc_descr in [ - element.elementGenomicStart, - element.elementGenomicEnd, - ]: - if loc_descr: - location = loc_descr.location - if location.type == SequenceLocation: - try: - new_id = translate_identifier( - self.seqrepo, location.sequence_id, target_namespace - ) - except IDTranslationException: - continue - loc_descr.location.sequence_id = new_id - if fusion.type == "CategoricalFusion" and fusion.criticalFunctionalDomains: - for domain in fusion.criticalFunctionalDomains: - if ( - domain.sequence_location - and domain.sequence_location.location - and (domain.sequence_location.location.type == "SequenceLocation") - ): - try: - new_id = translate_identifier( - self.seqrepo, - domain.sequence_location.location.sequence_id, - target_namespace, - ) - except IDTranslationException: - continue - domain.sequence_location.location.sequence_id = new_id - return fusion - def _normalized_gene( - self, query: str, use_minimal_gene: bool + self, query: str, use_minimal_gene: bool | None = None ) -> tuple[Gene | None, str | None]: """Return gene from normalized response. @@ -654,9 +565,9 @@ def _normalized_gene( gene_norm_resp = self.gene_normalizer.normalize(query) if gene_norm_resp.match_type: gene = gene_norm_resp.gene - if not use_minimal_gene: - return gene, None - return Gene(id=gene_norm_resp.normalized_id, label=gene.label), None + if use_minimal_gene: + return Gene(id=gene_norm_resp.normalized_id, label=gene.label), None + return gene, None return None, f"gene-normalizer unable to normalize {query}" def generate_nomenclature(self, fusion: Fusion) -> str: From b8115990233d79e304982422b2b62791d3c4f62c Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Wed, 24 Jul 2024 13:57:47 -0400 Subject: [PATCH 64/82] model updates --- src/fusor/fusor.py | 6 +- tests/test_fusor.py | 372 +++++++++++--------------------------------- 2 files changed, 95 insertions(+), 283 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index e96de43..b78261b 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -126,7 +126,7 @@ def fusion(self, fusion_type: FusionType | None = None, **kwargs) -> Fusion: # try to infer from provided attributes categorical_attributes = any( [ - "criticalFunctionalDomains" in kwargs, + "critical_functional_domains" in kwargs, self._contains_element_type( kwargs, StructuralElementType.MULTIPLE_POSSIBLE_GENES_ELEMENT ), @@ -134,7 +134,7 @@ def fusion(self, fusion_type: FusionType | None = None, **kwargs) -> Fusion: ) assayed_attributes = any( [ - "causativeEvent" in kwargs, + "causative_event" in kwargs, "assay" in kwargs, self._contains_element_type( kwargs, StructuralElementType.UNKNOWN_GENE_ELEMENT @@ -208,7 +208,7 @@ def assayed_fusion( regulatoryElement=regulatory_element, causativeEvent=causative_event, assay=assay, - reading_frame_preserved=reading_frame_preserved, + readingFramePreserved=reading_frame_preserved, ) except ValidationError as e: raise FUSORParametersException(str(e)) from e diff --git a/tests/test_fusor.py b/tests/test_fusor.py index b232dbd..7e879d6 100644 --- a/tests/test_fusor.py +++ b/tests/test_fusor.py @@ -23,15 +23,15 @@ @pytest.fixture(scope="module") -def braf_gene_descr_min(): - """Create minimal gene descriptor for BRAF""" +def braf_gene_obj_min(): + """Create minimal gene object for BRAF""" return Gene(label="BRAF", id="hgnc:1097") @pytest.fixture(scope="module") -def braf_gene_descr(braf_gene_descriptor): - """Create gene descriptor object for braf""" - return Gene(**braf_gene_descriptor) +def braf_gene_obj(braf_gene): + """Create gene object for braf""" + return Gene(**braf_gene) @pytest.fixture(scope="module") @@ -49,8 +49,8 @@ def linker_element(): @pytest.fixture(scope="module") -def location_descriptor_braf_domain(): - """Create location descriptor fixture for BRAF catalytic domain""" +def sequence_location_braf_domain(): + """Create sequence location fixture for BRAF catalytic domain""" params = { "id": "ga4gh:SL.Lm-hzZHlA8FU_cYaOtAIbMLdf4Kk-SF8", "type": "SequenceLocation", @@ -66,84 +66,65 @@ def location_descriptor_braf_domain(): @pytest.fixture(scope="module") -def location_descriptor_braf_domain_seq_id(): - """Create location descriptor fixture for BRAF catalytic domain""" - params = { - "id": "ga4gh:SL.Lm-hzZHlA8FU_cYaOtAIbMLdf4Kk-SF8", - "type": "SequenceLocation", - "sequenceReference": { - "id": "refseq:NP_004324.2", - "refgetAccession": "SQ.cQvw4UsHHRRlogxbWCB8W-mKD4AraM9y", - "type": "SequenceReference", - }, - "start": 458, - "end": 712, - } - return SequenceLocation(**params) - - -@pytest.fixture(scope="module") -def functional_domain_min(braf_gene_descr_min, location_descriptor_braf_domain): +def functional_domain_min(braf_gene_obj_min, sequence_location_braf_domain): """Create functional domain test fixture.""" params = { "status": "preserved", "label": "Serine-threonine/tyrosine-protein kinase, catalytic domain", "id": "interpro:IPR001245", - "associatedGene": braf_gene_descr_min, - "sequenceLocation": location_descriptor_braf_domain, + "associatedGene": braf_gene_obj_min, + "sequenceLocation": sequence_location_braf_domain, } return FunctionalDomain(**params) @pytest.fixture(scope="module") -def functional_domain(braf_gene_descr, location_descriptor_braf_domain): +def functional_domain(braf_gene_obj, sequence_location_braf_domain): """Create functional domain test fixture.""" params = { "status": "preserved", "label": "Serine-threonine/tyrosine-protein kinase, catalytic domain", "id": "interpro:IPR001245", - "associatedGene": braf_gene_descr, - "sequenceLocation": location_descriptor_braf_domain, + "associatedGene": braf_gene_obj, + "sequenceLocation": sequence_location_braf_domain, } return FunctionalDomain(**params) @pytest.fixture(scope="module") -def functional_domain_seq_id( - braf_gene_descr_min, location_descriptor_braf_domain_seq_id -): +def functional_domain_seq_id(braf_gene_obj_min, sequence_location_braf_domain): """Create functional domain test fixture.""" params = { "status": "preserved", "label": "Serine-threonine/tyrosine-protein kinase, catalytic domain", "id": "interpro:IPR001245", - "associatedGene": braf_gene_descr_min, - "sequenceLocation": location_descriptor_braf_domain_seq_id, + "associatedGene": braf_gene_obj_min, + "sequenceLocation": sequence_location_braf_domain, } return FunctionalDomain(**params) @pytest.fixture(scope="module") -def regulatory_element(braf_gene_descr): +def regulatory_element(braf_gene_obj): """Create regulatory element test fixture.""" params = { "type": "RegulatoryElement", "regulatoryClass": "promoter", - "associatedGene": braf_gene_descr, + "associatedGene": braf_gene_obj, } return RegulatoryElement(**params) @pytest.fixture(scope="module") -def regulatory_element_min(braf_gene_descr_min): - """Create regulatory element test fixture with minimal gene descriptor.""" - params = {"regulatoryClass": "promoter", "associatedGene": braf_gene_descr_min} +def regulatory_element_min(braf_gene_obj_min): + """Create regulatory element test fixture with minimal gene object.""" + params = {"regulatoryClass": "promoter", "associatedGene": braf_gene_obj_min} return RegulatoryElement(**params) @pytest.fixture(scope="module") -def location_descriptor_tpm3(): - """Create location descriptor test fixture.""" +def sequence_location_tpm3(): + """Create sequence location for TPM3 test fixture.""" params = { "id": "ga4gh:SL.0cMJgKuY32ate6k95oLua6vv8JAJ4PzO", "type": "SequenceLocation", @@ -203,17 +184,10 @@ def templated_sequence_element_custom_id(): params = { "type": "TemplatedSequenceElement", "region": { - "id": "fusor.location_descriptor:custom_ID__1", - "type": "LocationDescriptor", - "location": { - "type": "SequenceLocation", - "sequence_id": "sequence.id:custom_ID__1", - "interval": { - "type": "SequenceInterval", - "start": {"type": "Number", "value": 200}, - "end": {"type": "Number", "value": 300}, - }, - }, + "id": "custom_ID__1", + "type": "SequenceLocation", + "start": 200, + "end": 300, }, "strand": "+", } @@ -225,44 +199,37 @@ def transcript_segment_element(): """Create transcript segment element test fixture""" params = { "type": "TranscriptSegmentElement", - "exon_end": 8, - "exon_end_offset": 0, - "exon_start": 1, - "exon_start_offset": 0, - "gene_descriptor": { - "gene_id": "hgnc:12012", - "id": "normalize.gene:TPM3", + "exonEnd": 8, + "exonEndOffset": 0, + "exonStart": 1, + "exonStartOffset": 0, + "gene": { + "id": "hgnc:12012", "label": "TPM3", "type": "Gene", }, "transcript": "refseq:NM_152263.3", "elementGenomicEnd": { - "id": "fusor.location_descriptor:NC_000001.11", - "label": "NC_000001.11", - "location": { - "interval": { - "end": {"type": "Number", "value": 154170400}, - "start": {"type": "Number", "value": 154170399}, - "type": "SequenceInterval", - }, - "sequence_id": "refseq:NC_000001.11", - "type": "SequenceLocation", + "id": "ga4gh:SL.rtR6x2NnJEpROlxiT_DY9C-spf6ijYQi", + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceReference", }, - "type": "LocationDescriptor", + "start": 154170399, + "end": 154170400, }, "elementGenomicStart": { - "id": "fusor.location_descriptor:NC_000001.11", - "label": "NC_000001.11", - "location": { - "interval": { - "end": {"type": "Number", "value": 154192136}, - "start": {"type": "Number", "value": 154192135}, - "type": "SequenceInterval", - }, - "sequence_id": "refseq:NC_000001.11", - "type": "SequenceLocation", + "id": "ga4gh:SL.2K1vML0ofuYrYncrzzXUQOISRFJldZrO", + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceReference", }, - "type": "LocationDescriptor", + "start": 154192135, + "end": 154192136, }, } return TranscriptSegmentElement(**params) @@ -273,31 +240,27 @@ def mane_transcript_segment_element(): """Create transcript segment element test fixture""" params = { "type": "TranscriptSegmentElement", - "exon_end": None, - "exon_end_offset": None, - "exon_start": 2, - "exon_start_offset": 0, - "gene_descriptor": { - "gene_id": "hgnc:12761", - "id": "normalize.gene:WEE1", + "exonEnd": None, + "exonEndOffset": None, + "exonStart": 2, + "exonStartOffset": 0, + "gene": { + "id": "hgnc:12761", "label": "WEE1", "type": "Gene", }, "transcript": "refseq:NM_003390.4", "elementGenomicEnd": None, "elementGenomicStart": { - "id": "fusor.location_descriptor:NC_000011.10", - "label": "NC_000011.10", - "location": { - "interval": { - "end": {"type": "Number", "value": 9575887}, - "start": {"type": "Number", "value": 9575886}, - "type": "SequenceInterval", - }, - "sequence_id": "refseq:NC_000011.10", - "type": "SequenceLocation", + "id": "ga4gh:SL.ge9FDGyBXkKhEMR6RUMFg3u13j85WmMd", + "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000011.10", + "refgetAccession": "SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1", + "type": "SequenceReference", }, - "type": "LocationDescriptor", + "start": 9575887, + "end": 9575886, }, } return TranscriptSegmentElement(**params) @@ -312,17 +275,16 @@ def fusion_ensg_sequence_id(templated_sequence_element_ensg): templated_sequence_element_ensg, {"type": "MultiplePossibleGenesElement"}, ], - "reading_frame_preserved": True, - "regulatory_element": None, + "readingFramePreserved": True, + "regulatoryElement": None, } return CategoricalFusion(**params) -def compare_gene_descriptor(actual: dict, expected: dict): - """Test that actual and expected gene descriptors match.""" +def compare_gene_obj(actual: dict, expected: dict): + """Test that actual and expected gene objects match.""" assert actual["id"] == expected["id"] assert actual["type"] == expected["type"] - assert actual["gene_id"] == expected["gene_id"] assert actual["label"] == expected["label"] if expected["xrefs"]: assert set(actual["xrefs"]) == set(expected["xrefs"]), "xrefs" @@ -359,84 +321,9 @@ def compare_gene_descriptor(actual: dict, expected: dict): ), "number of correct extensions" -def test_add_additional_fields(fusor_instance, fusion_example, fusion_ensg_sequence_id): - """Test that add_additional_fields method works correctly.""" - fusion = CategoricalFusion(**fusion_example) - - expected_fusion = copy.deepcopy(fusion) - expected_fusion.criticalFunctionalDomains[ - 0 - ].sequence_location.location_id = "ga4gh:VSL.2CWYzSpOJfZq7KW4VIUKeP5SJtepRar0" - expected_fusion.criticalFunctionalDomains[ - 0 - ].sequence_location.location.sequence_id = ( - "ga4gh:SQ.q9CnK-HKWh9eqhOi8FlzR7M0pCmUrWPs" - ) - expected_fusion.structure[ - 0 - ].elementGenomicStart.location_id = "ga4gh:VSL.H0IOyJ-DB4jTbbSBjQFvuPvMrZHAWSrW" - expected_fusion.structure[ - 0 - ].elementGenomicStart.location.sequence_id = ( - "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO" - ) - expected_fusion.structure[ - 0 - ].elementGenomicEnd.location_id = "ga4gh:VSL.aarSLdMOQ8LoooPB2EoSth41yG_qRmDq" - expected_fusion.structure[ - 0 - ].elementGenomicEnd.location.sequence_id = ( - "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO" - ) - expected_fusion.structure[ - 3 - ].region.location_id = "ga4gh:VSL.zd12pX_ju2gLq9a9UOYgM8AtbkuhnyUu" - expected_fusion.structure[ - 3 - ].region.location.sequence_id = "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP" - - actual_fusion = fusor_instance.add_additional_fields(fusion) - assert actual_fusion.model_dump() == expected_fusion.model_dump() - - # test handling of unrecognized sequence IDs - expected_fusion = copy.deepcopy(fusion_ensg_sequence_id) - fusion = fusor_instance.add_additional_fields(fusion_ensg_sequence_id) - ts_reg = fusion.structure[0].region - assert ts_reg.location.sequence_id == "ensembl:ENSG00000157764" - assert ts_reg.location_id == "ga4gh:VSL.dUll0TA05efQf0TsmcP03mtdGcpP9jPH" - - -def test_add_translated_sequence_id(fusor_instance, fusion_example): - """Test that add_translated_sequence_id method works correctly.""" - fusion = CategoricalFusion(**fusion_example) - - expected_fusion = copy.deepcopy(fusion) - expected_fusion.criticalFunctionalDomains[ - 0 - ].sequence_location.location.sequence_id = ( - "ga4gh:SQ.q9CnK-HKWh9eqhOi8FlzR7M0pCmUrWPs" - ) - expected_fusion.structure[ - 0 - ].elementGenomicStart.location.sequence_id = ( - "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO" - ) - expected_fusion.structure[ - 0 - ].elementGenomicEnd.location.sequence_id = ( - "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO" - ) - expected_fusion.structure[ - 3 - ].region.location.sequence_id = "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP" - - actual_fusion = fusor_instance.add_translated_sequence_id(fusion) - assert actual_fusion.model_dump() == expected_fusion.model_dump() - - def test__normalized_gene(fusor_instance): """Test that _normalized_gene works correctly.""" - # Actual response is tested in test_add_gene_descriptor + # TODO: test actual response resp = fusor_instance._normalized_gene("BRAF") assert resp[0] assert resp[1] is None @@ -447,41 +334,6 @@ def test__normalized_gene(fusor_instance): assert resp[1] == "gene-normalizer unable to normalize B R A F" -def test_add_gene_descriptor(fusor_instance, exhaustive_example, fusion_example): - """Test that add_gene_descriptor method works correctly.""" - expected_fusion = CategoricalFusion(**exhaustive_example) - actual = CategoricalFusion(**fusion_example) - fusor_instance.add_translated_sequence_id(actual) - fusor_instance.add_gene_descriptor(actual) - - e_gds = set() - t_gds = set() - for e_field in [ - expected_fusion.criticalFunctionalDomains, - expected_fusion.structure, - ]: - for t_field in [actual.criticalFunctionalDomains, actual.structure]: - for e_obj in e_field: - for t_obj in t_field: - if "gene_descriptor" in e_obj.model_fields: - e_gd = e_obj.gene_descriptor.label - e_gds.add(e_gd) - if "gene_descriptor" in t_obj.model_fields: - t_gd = t_obj.gene_descriptor.label - t_gds.add(t_gd) - if e_gd == t_gd: - compare_gene_descriptor( - t_obj.gene_descriptor.model_dump(), - e_obj.gene_descriptor.model_dump(), - ) - assert t_gds == e_gds - - compare_gene_descriptor( - actual.regulatory_element.associatedGene.model_dump(), - expected_fusion.regulatory_element.associatedGene.model_dump(), - ) - - def test_fusion( fusor_instance, linker_element, @@ -497,10 +349,10 @@ def test_fusion( linker_element, UnknownGeneElement(), ], - causativeEvent={ + causative_event={ "type": "CausativeEvent", "eventType": "rearrangement", - "event_description": "chr2:g.pter_8,247,756::chr11:g.15,825,273_cen_qter (der11) and chr11:g.pter_15,825,272::chr2:g.8,247,757_cen_qter (der2)", + "eventDescription": "chr2:g.pter_8,247,756::chr11:g.15,825,273_cen_qter (der11) and chr11:g.pter_15,825,272::chr2:g.8,247,757_cen_qter (der2)", }, assay={ "type": "Assay", @@ -516,7 +368,7 @@ def test_fusion( transcript_segment_element, MultiplePossibleGenesElement(), ], - criticalFunctionalDomains=[functional_domain], + critical_functional_domains=[functional_domain], ) assert isinstance(f, CategoricalFusion) @@ -527,8 +379,8 @@ def test_fusion( transcript_segment_element, UnknownGeneElement(), ], - causativeEvent="rearrangement", - criticalFunctionalDomains=[functional_domain], + causative_event="rearrangement", + critical_functional_domains=[functional_domain], ) assert str(excinfo.value) == "Received conflicting attributes" @@ -550,7 +402,7 @@ def test_fusion( linker_element, UnknownGeneElement(), ], - causativeEvent={ + causative_event={ "type": "CausativeEvent", "eventType": "rearrangement", }, @@ -569,7 +421,7 @@ def test_fusion( transcript_segment_element, MultiplePossibleGenesElement(), ], - criticalFunctionalDomains=[functional_domain], + critical_functional_domains=[functional_domain], ) assert isinstance(f, CategoricalFusion) @@ -614,7 +466,6 @@ async def test_transcript_segment_element( end=154170399, chromosome="NC_000001.11", tx_to_genomic_coords=False, - residue_mode="inter-residue", ) assert tsg[0] assert tsg[1] is None @@ -695,8 +546,8 @@ async def test_transcript_segment_element( assert tsg[1] is None assert tsg[0].model_dump() == expected.model_dump() - expected.exon_end = None - expected.exon_end_offset = None + expected.exonEnd = None + expected.exonEndOffset = None expected.elementGenomicEnd = None # Transcript Input @@ -734,25 +585,21 @@ async def test_transcript_segment_element( assert tsg[0].model_dump() == mane_transcript_segment_element.model_dump() -def test_gene_element(fusor_instance, braf_gene_descr_min, braf_gene_descr): +def test_gene_element(fusor_instance, braf_gene_obj_min, braf_gene_obj): """Test that gene_element works correctly.""" - gc = fusor_instance.gene_element("BRAF", use_minimal_gene_descr=True) + gc = fusor_instance.gene_element("BRAF", use_minimal_gene=True) assert gc[0] assert gc[1] is None assert isinstance(gc[0], GeneElement) - compare_gene_descriptor( - gc[0].gene_descriptor.model_dump(), braf_gene_descr_min.model_dump() - ) + compare_gene_obj(gc[0].gene.model_dump(), braf_gene_obj_min.model_dump()) - gc = fusor_instance.gene_element("BRAF", use_minimal_gene_descr=False) + gc = fusor_instance.gene_element("BRAF", use_minimal_gene=False) assert gc[0] assert gc[1] is None assert isinstance(gc[0], GeneElement) - compare_gene_descriptor( - gc[0].gene_descriptor.model_dump(), braf_gene_descr.model_dump() - ) + compare_gene_obj(gc[0].gene.model_dump(), braf_gene_obj.model_dump()) - gc = fusor_instance.gene_element("BRA F", use_minimal_gene_descr=True) + gc = fusor_instance.gene_element("BRA F", use_minimal_gene=True) assert gc[0] is None assert gc[1] == "gene-normalizer unable to normalize BRA F" @@ -853,7 +700,7 @@ def compare_domains(actual, expected): assert actual.keys() == expected.keys() for key in expected: if key == "associatedGene": - compare_gene_descriptor(actual[key], expected[key]) + compare_gene_obj(actual[key], expected[key]) elif key == "sequence_location": act_ld = actual["sequence_location"] exp_ld = expected["sequence_location"] @@ -882,7 +729,7 @@ def compare_domains(actual, expected): "NP_004324.2", 458, 712, - use_minimal_gene_descr=False, + use_minimal_gene=False, ) compare_domains(cd, functional_domain) @@ -894,7 +741,7 @@ def compare_domains(actual, expected): "NP_004324.2", 458, 712, - use_minimal_gene_descr=True, + use_minimal_gene=True, ) compare_domains(cd, functional_domain_min) @@ -907,7 +754,7 @@ def compare_domains(actual, expected): 458, 712, seq_id_target_namespace="ga4gh", - use_minimal_gene_descr=True, + use_minimal_gene=True, ) compare_domains(cd, functional_domain_seq_id) @@ -920,7 +767,7 @@ def compare_domains(actual, expected): 458, 712, seq_id_target_namespace="ga4gh", - use_minimal_gene_descr=True, + use_minimal_gene=True, ) assert cd[0] is None assert "Input should be 'lost' or 'preserved'" in cd[1] @@ -935,7 +782,7 @@ def compare_domains(actual, expected): 458, 712, seq_id_target_namespace="ga4gh", - use_minimal_gene_descr=True, + use_minimal_gene=True, ) assert cd[0] is None assert "Sequence_id must be a protein accession." in cd[1] @@ -951,7 +798,7 @@ def compare_domains(actual, expected): 458, 712, seq_id_target_namespace="ga4gh", - use_minimal_gene_descr=True, + use_minimal_gene=True, ) assert cd[0] is None assert f"Accession, {accession}, not found in SeqRepo" in cd[1] @@ -966,7 +813,7 @@ def compare_domains(actual, expected): 458, 712000, seq_id_target_namespace="ga4gh", - use_minimal_gene_descr=True, + use_minimal_gene=True, ) assert cd[0] is None assert ( @@ -986,45 +833,10 @@ def compare_re(actual, expected): expected = expected.model_dump() assert actual.keys() == expected.keys() assert actual["type"] == expected["type"] - compare_gene_descriptor(actual["associatedGene"], expected["associatedGene"]) + compare_gene_obj(actual["associatedGene"], expected["associatedGene"]) re = fusor_instance.regulatory_element(RegulatoryClass.PROMOTER, "BRAF") compare_re(re, regulatory_element_min) re = fusor_instance.regulatory_element(RegulatoryClass.PROMOTER, "BRAF", False) compare_re(re, regulatory_element) - - -def test__location_descriptor(fusor_instance, location_descriptor_tpm3): - """Test that _location_descriptor method works correctly.""" - ld = fusor_instance._location_descriptor(154170398, 154170399, "NM_152263.3") - assert ld.model_dump() == location_descriptor_tpm3.model_dump() - - expected = copy.deepcopy(location_descriptor_tpm3) - expected.location.sequence_id = "ga4gh:SQ.ijXOSP3XSsuLWZhXQ7_TJ5JXu4RJO6VT" - ld = fusor_instance._location_descriptor( - 154170398, 154170399, "NM_152263.3", seq_id_target_namespace="ga4gh" - ) - assert ld.model_dump() == expected.model_dump() - - expected.id = "ga4gh:VSL._1bRdL4I6EtpBvVK5RUaXb0NN3k0gpqa" - ld = fusor_instance._location_descriptor( - 154170398, - 154170399, - "NM_152263.3", - seq_id_target_namespace="ga4gh", - use_location_id=True, - ) - assert ld.model_dump() == expected.model_dump() - - expected.location.sequence_id = "refseq:NM_152263.3" - expected.id = "fusor.location_descriptor:refseq%3ANM_152263.3" - ld = fusor_instance._location_descriptor(154170398, 154170399, "refseq:NM_152263.3") - assert ld.model_dump() == expected.model_dump() - - expected.id = "fusor.location_descriptor:example_label" - expected.label = "example_label" - ld = fusor_instance._location_descriptor( - 154170398, 154170399, "refseq:NM_152263.3", label="example_label" - ) - assert ld.model_dump() == expected.model_dump() From 9c083b982a3b547180f3ae9fed8491b6a1ec8627 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Wed, 24 Jul 2024 14:47:04 -0400 Subject: [PATCH 65/82] model updates --- src/fusor/fusor.py | 5 +- tests/conftest.py | 203 ++++++++++++++++++++++++++++++-------------- tests/test_fusor.py | 21 ++--- 3 files changed, 154 insertions(+), 75 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 9afbbcd..1c1f18f 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -564,8 +564,11 @@ def _normalized_gene( gene_norm_resp = self.gene_normalizer.normalize(query) if gene_norm_resp.match_type: gene = gene_norm_resp.gene + # remove normalize.gene from id (ex; 'normalize.gene.hgnc:1097' -> 'hgnc:1097') + gene_id = gene_norm_resp.normalized_id.replace("normalize.gene.", "") + gene.id = gene_id if use_minimal_gene: - return Gene(id=gene_norm_resp.normalized_id, label=gene.label), None + return Gene(id=gene_id, label=gene.label), None return gene, None return None, f"gene-normalizer unable to normalize {query}" diff --git a/tests/conftest.py b/tests/conftest.py index e5b6937..7a8d720 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -49,121 +49,197 @@ def fusor_instance(): def braf_gene(): """Create gene params for BRAF.""" return { - "type": "Gene", "id": "hgnc:1097", + "type": "Gene", "label": "BRAF", - "mappings": [ + "description": None, + "alternativeLabels": ["NS7", "BRAF1", "RAFB1", "B-RAF1", "BRAF-1", "B-raf"], + "extensions": [ + {"name": "symbol_status", "value": "approved", "description": None}, { - "coding": {"code": "673", "system": "ncbigene"}, - "relation": "relatedMatch", + "name": "approved_name", + "value": "B-Raf proto-oncogene, serine/threonine kinase", + "description": None, }, + {"name": "strand", "value": "-", "description": None}, { - "coding": {"code": "ENSG00000157764", "system": "ensembl"}, - "relation": "relatedMatch", + "name": "ensembl_locations", + "value": [ + { + "id": "ga4gh:SL.fUv91vYrVHBMg-B_QW7UpOQj50g_49hb", + "type": "SequenceLocation", + "digest": "fUv91vYrVHBMg-B_QW7UpOQj50g_49hb", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + }, + "start": 140719326, + "end": 140924929, + } + ], + "description": None, }, { - "coding": {"code": "CCDS5863", "system": "ccds"}, - "relation": "relatedMatch", + "name": "ncbi_locations", + "value": [ + { + "id": "ga4gh:SL.0nPwKHYNnTmJ06G-gSmz8BEhB_NTp-0B", + "type": "SequenceLocation", + "digest": "0nPwKHYNnTmJ06G-gSmz8BEhB_NTp-0B", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + }, + "start": 140713327, + "end": 140924929, + } + ], + "description": None, }, { - "coding": {"code": "1943", "system": "iuphar"}, - "relation": "relatedMatch", + "name": "hgnc_locus_type", + "value": "gene with protein product", + "description": None, }, + {"name": "ncbi_gene_type", "value": "protein-coding", "description": None}, + {"name": "ensembl_biotype", "value": "protein_coding", "description": None}, + ], + "mappings": [ { - "coding": {"code": "119066", "system": "orphanet"}, + "coding": { + "label": None, + "system": "ensembl", + "version": None, + "code": "ENSG00000157764", + }, "relation": "relatedMatch", }, { - "coding": {"code": "BRAF", "system": "cosmic"}, + "coding": { + "label": None, + "system": "ncbigene", + "version": None, + "code": "673", + }, "relation": "relatedMatch", }, { - "coding": {"code": "2284096", "system": "pubmed"}, + "coding": { + "label": None, + "system": "cosmic", + "version": None, + "code": "BRAF", + }, "relation": "relatedMatch", }, { - "coding": {"code": "uc003vwc.5", "system": "ucsc"}, + "coding": { + "label": None, + "system": "ena.embl", + "version": None, + "code": "M95712", + }, "relation": "relatedMatch", }, { - "coding": {"code": "164757", "system": "omim"}, + "coding": { + "label": None, + "system": "omim", + "version": None, + "code": "164757", + }, "relation": "relatedMatch", }, { - "coding": {"code": "NM_004333", "system": "refseq"}, + "coding": { + "label": None, + "system": "iuphar", + "version": None, + "code": "1943", + }, "relation": "relatedMatch", }, { - "coding": {"code": "CCDS87555", "system": "ccds"}, + "coding": { + "label": None, + "system": "ucsc", + "version": None, + "code": "uc003vwc.5", + }, "relation": "relatedMatch", }, { - "coding": {"code": "P15056", "system": "uniprot"}, + "coding": { + "label": None, + "system": "vega", + "version": None, + "code": "OTTHUMG00000157457", + }, "relation": "relatedMatch", }, { - "coding": {"code": "M95712", "system": "ena.embl"}, + "coding": { + "label": None, + "system": "ccds", + "version": None, + "code": "CCDS87555", + }, "relation": "relatedMatch", }, { - "coding": {"code": "OTTHUMG00000157457", "system": "vega"}, + "coding": { + "label": None, + "system": "uniprot", + "version": None, + "code": "P15056", + }, "relation": "relatedMatch", }, { - "coding": {"code": "1565476", "system": "pubmed"}, + "coding": { + "label": None, + "system": "refseq", + "version": None, + "code": "NM_004333", + }, "relation": "relatedMatch", }, { - "coding": {"code": "CCDS94219", "system": "ccds"}, + "coding": { + "label": None, + "system": "pubmed", + "version": None, + "code": "1565476", + }, "relation": "relatedMatch", }, { - "coding": {"code": "CCDS94218", "system": "ccds"}, + "coding": { + "label": None, + "system": "orphanet", + "version": None, + "code": "119066", + }, "relation": "relatedMatch", }, - ], - "alternativeLabels": ["BRAF1", "BRAF-1", "RAFB1", "NS7", "B-RAF1", "B-raf"], - "extensions": [ - { - "name": "approved_name", - "value": "B-Raf proto-oncogene, serine/threonine kinase", - }, { - "name": "ensembl_locations", - "value": [ - { - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", - }, - "start": 140719326, - "end": 140924929, - } - ], - }, - { - "name": "ncbi_locations", - "value": [ - { - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", - }, - "start": 140713327, - "end": 140924929, - } - ], + "coding": { + "label": None, + "system": "pubmed", + "version": None, + "code": "2284096", + }, + "relation": "relatedMatch", }, - {"name": "ncbi_gene_type", "value": "protein-coding"}, { - "name": "hgnc_locus_type", - "value": "gene with protein product", + "coding": { + "label": None, + "system": "ccds", + "version": None, + "code": "CCDS5863", + }, + "relation": "relatedMatch", }, - {"name": "ensembl_biotype", "value": "protein_coding"}, - {"name": "strand", "value": "-"}, - {"name": "symbol_status", "value": "approved"}, ], } @@ -757,10 +833,9 @@ def fusion_example(): "label": "Tyrosine-protein kinase, catalytic domain", "status": "lost", "associatedGene": { - "id": "normalize.gene:hgnc%3A427", "type": "Gene", "label": "ALK", - "gene_id": "hgnc:427", + "id": "hgnc:427", }, "sequenceLocation": { "id": "ga4gh:SL.aYx-iUOFEw7GVZb4fwrQLkQQahpiIAVp", diff --git a/tests/test_fusor.py b/tests/test_fusor.py index 734e8f6..dd5ad23 100644 --- a/tests/test_fusor.py +++ b/tests/test_fusor.py @@ -3,9 +3,9 @@ import copy import pytest +from cool_seq_tool.schemas import Strand from ga4gh.core.domain_models import Gene from ga4gh.vrs.models import SequenceLocation -from cool_seq_tool.schemas import Strand from fusor.exceptions import FUSORParametersException from fusor.models import ( @@ -147,6 +147,7 @@ def templated_sequence_element(): "type": "TemplatedSequenceElement", "region": { "id": "ga4gh:SL.U7-HtnKxK9kKI1ZINiDM_m4I6O-p4Dc9", + "digest": "U7-HtnKxK9kKI1ZINiDM_m4I6O-p4Dc9", "type": "SequenceLocation", "sequenceReference": { "id": "refseq:NC_000001.11", @@ -287,10 +288,10 @@ def compare_gene_obj(actual: dict, expected: dict): assert actual["id"] == expected["id"] assert actual["type"] == expected["type"] assert actual["label"] == expected["label"] - if expected["xrefs"]: - assert set(actual["xrefs"]) == set(expected["xrefs"]), "xrefs" + if expected.get("xrefs"): + assert set(actual.get("xrefs")) == set(expected["xrefs"]), "xrefs" else: - assert actual["xrefs"] == expected["xrefs"] + assert actual.get("xrefs") == expected.get("xrefs") if expected["alternativeLabels"]: assert set(actual["alternativeLabels"]) == set( expected["alternativeLabels"] @@ -315,7 +316,7 @@ def compare_gene_obj(actual: dict, expected: dict): ), f"{expected_ext['value']} value" else: assert actual_ext["value"] == expected_ext["value"] - assert actual_ext["type"] == expected_ext["type"] + assert actual_ext.get("type") == expected_ext.get("type") n_ext_correct += 1 assert n_ext_correct == len( expected["extensions"] @@ -441,6 +442,7 @@ async def test_transcript_segment_element( ): """Test that transcript_segment_element method works correctly""" # Transcript Input + # TODO: this test is now off by one after updating cool-seq-tool - need Jeremy's help in determining if the issue lies in fusor or CST tsg = await fusor_instance.transcript_segment_element( transcript="NM_152263.3", exon_start=1, exon_end=8, tx_to_genomic_coords=True ) @@ -485,11 +487,11 @@ async def test_transcript_segment_element( assert tsg[0].model_dump() == transcript_segment_element.model_dump() expected = copy.deepcopy(transcript_segment_element) - expected.elementGenomicStart.location.sequence_id = ( + expected.elementGenomicStart.sequenceReference.refgetAccession = ( "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO" ) - expected.elementGenomicEnd.location.sequence_id = ( - expected.elementGenomicStart.location.sequence_id + expected.elementGenomicEnd.sequenceReference.refgetAccession = ( + expected.elementGenomicStart.sequenceReference.refgetAccession ) # Transcript Input @@ -623,10 +625,9 @@ def test_templated_sequence_element( assert tsg.model_dump() == templated_sequence_element.model_dump() expected = copy.deepcopy(templated_sequence_element.model_dump()) - expected["region"]["location"]["sequence_id"] = ( + expected["region"]["sequenceReference"]["refgetAccession"] = ( "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO" ) - expected["region"]["location_id"] = "ga4gh:VSL.bL1N-PQfp4dGlEz6PEd34fGxdxo82Zkb" tsg = fusor_instance.templated_sequence_element( 100, 150, From db4d6cb16b3111221e4a017231a035dc73d11052 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Wed, 24 Jul 2024 15:29:42 -0400 Subject: [PATCH 66/82] updating models and tests --- src/fusor/fusor.py | 13 +++++++---- tests/test_fusor.py | 55 ++++++++++++++++++++++++++++----------------- 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 1c1f18f..4fafc7b 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -313,7 +313,7 @@ async def transcript_segment_element( def gene_element( self, gene: str, use_minimal_gene: bool = True - ) -> tuple[Gene | None, str | None]: + ) -> tuple[GeneElement | None, str | None]: """Create gene element :param str gene: Gene @@ -322,7 +322,10 @@ def gene_element( gene-normalizer's gene object will be used :return: GeneElement, warning """ - return self._normalized_gene(gene, use_minimal_gene=use_minimal_gene) + gene_resp = self._normalized_gene(gene, use_minimal_gene=use_minimal_gene) + if gene_resp[0]: + return GeneElement(gene=gene_resp[0]), None + return None, gene_resp[1] def templated_sequence_element( self, @@ -370,7 +373,9 @@ def linker_element( try: upper_seq = sequence.upper() seq = SequenceString(upper_seq) - linker_sequence = LiteralSequenceExpression(sequence=seq) + linker_sequence = LiteralSequenceExpression( + sequence=seq, id=f"fusor.sequence:{sequence}" + ) return LinkerElement(linkerSequence=linker_sequence), None except ValidationError as e: msg = str(e) @@ -453,7 +458,7 @@ def functional_domain( label=name, status=status, associatedGene=gene_descr, - sequence_location=loc_descr, + sequenceLocation=loc_descr, ), None, ) diff --git a/tests/test_fusor.py b/tests/test_fusor.py index dd5ad23..2741508 100644 --- a/tests/test_fusor.py +++ b/tests/test_fusor.py @@ -40,7 +40,7 @@ def linker_element(): """Create linker element test fixture.""" params = { "linkerSequence": { - "id": "fusor.sequence:ACT", + "id": "fusor.sequence:act", "sequence": "ACT", "type": "LiteralSequenceExpression", }, @@ -66,6 +66,23 @@ def sequence_location_braf_domain(): return SequenceLocation(**params) +@pytest.fixture(scope="module") +def sequence_location_braf_ref_id_ga4gh(): + """Create sequence location fixture for BRAF catalytic domain""" + params = { + "id": "ga4gh:SL.Lm-hzZHlA8FU_cYaOtAIbMLdf4Kk-SF8", + "type": "SequenceLocation", + "sequenceReference": { + "id": "ga4gh:SQ.cQvw4UsHHRRlogxbWCB8W-mKD4AraM9y", + "refgetAccession": "SQ.cQvw4UsHHRRlogxbWCB8W-mKD4AraM9y", + "type": "SequenceReference", + }, + "start": 458, + "end": 712, + } + return SequenceLocation(**params) + + @pytest.fixture(scope="module") def functional_domain_min(braf_gene_obj_min, sequence_location_braf_domain): """Create functional domain test fixture.""" @@ -93,14 +110,14 @@ def functional_domain(braf_gene_obj, sequence_location_braf_domain): @pytest.fixture(scope="module") -def functional_domain_seq_id(braf_gene_obj_min, sequence_location_braf_domain): +def functional_domain_seq_id(braf_gene_obj_min, sequence_location_braf_ref_id_ga4gh): """Create functional domain test fixture.""" params = { "status": "preserved", "label": "Serine-threonine/tyrosine-protein kinase, catalytic domain", "id": "interpro:IPR001245", "associatedGene": braf_gene_obj_min, - "sequenceLocation": sequence_location_braf_domain, + "sequenceLocation": sequence_location_braf_ref_id_ga4gh, } return FunctionalDomain(**params) @@ -626,14 +643,13 @@ def test_templated_sequence_element( expected = copy.deepcopy(templated_sequence_element.model_dump()) expected["region"]["sequenceReference"]["refgetAccession"] = ( - "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO" + "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO" ) tsg = fusor_instance.templated_sequence_element( 100, 150, "NC_000001.11", Strand.POSITIVE, - seq_id_target_namespace="ga4gh", ) assert tsg.model_dump() == expected @@ -707,23 +723,21 @@ def compare_domains(actual, expected): for key in expected: if key == "associatedGene": compare_gene_obj(actual[key], expected[key]) - elif key == "sequence_location": - act_ld = actual["sequence_location"] - exp_ld = expected["sequence_location"] - assert act_ld["id"] == exp_ld["id"] - assert act_ld["type"] == exp_ld["type"] - assert act_ld["location"]["type"] == exp_ld["location"]["type"] + elif key == "sequenceLocation": + act_sl = actual["sequenceLocation"] + exp_sl = expected["sequenceLocation"] + assert act_sl["id"] == exp_sl["id"] + assert act_sl["type"] == exp_sl["type"] + assert ( + act_sl["sequenceReference"]["type"] + == exp_sl["sequenceReference"]["type"] + ) assert ( - act_ld["location"]["sequence_id"] - == exp_ld["location"]["sequence_id"] + act_sl["sequenceReference"]["id"] + == exp_sl["sequenceReference"]["id"] ) - act_int = act_ld["location"]["interval"] - exp_int = exp_ld["location"]["interval"] - assert act_int["type"] == exp_int["type"] - assert act_int["start"]["type"] == exp_int["start"]["type"] - assert act_int["start"]["value"] == exp_int["start"]["value"] - assert act_int["end"]["type"] == exp_int["end"]["type"] - assert act_int["end"]["value"] == exp_int["end"]["value"] + assert exp_sl.get("start") == act_sl.get("start") + assert exp_sl.get("end") == act_sl.get("end") else: assert actual[key] == expected[key] @@ -822,6 +836,7 @@ def compare_domains(actual, expected): use_minimal_gene=True, ) assert cd[0] is None + # TODO: this is now off by one after updating seqrepo (response is now: End inter-residue coordinate (712000) is out of index on NP_004324.2) assert ( "End inter-residue coordinate (711999) is out of index on " "NP_004324.2" in cd[1] From 6c3b66314ad13ddb93cbef1a72ef2d0fc4740d33 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Wed, 24 Jul 2024 17:45:05 -0400 Subject: [PATCH 67/82] fix name --- src/fusor/models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index 52fa737..432e9bd 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -206,7 +206,6 @@ class LinkerElement(BaseStructuralElement, extra="forbid"): linkerSequence: LiteralSequenceExpression model_config = ConfigDict( - arbitrary_types_allowed=True, json_schema_extra={ "example": { "type": "LinkerSequenceElement", @@ -511,7 +510,7 @@ def structure_ends(cls, values): """ elements = values.structure if isinstance(elements[0], TranscriptSegmentElement): - if elements[0].exonEnd is None and not values["regulatory_element"]: + if elements[0].exonEnd is None and not values["regulatoryElement"]: msg = "5' TranscriptSegmentElement fusion partner must contain ending exon position" raise ValueError(msg) elif isinstance(elements[0], LinkerElement): From c78ee2a76ae63017deeea9f3c8f8d18138042707 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 25 Jul 2024 10:39:55 -0400 Subject: [PATCH 68/82] pin gene normalizer version where CURIE is still defined --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1617409..e689665 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ dependencies = [ "pydantic == 2.*", "ga4gh.vrs ~=2.0.0a10", "biocommons.seqrepo", - "gene-normalizer ~=0.4.1", + "gene-normalizer ==0.4.0", "cool-seq-tool ~=0.5.1", ] dynamic=["version"] From fb0579309c617e86256e099dd5bc5160a6196e83 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 25 Jul 2024 11:15:57 -0400 Subject: [PATCH 69/82] updating json schema examples for models, removing labael from sequencelocation for now, as it will be added later --- src/fusor/models.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index 432e9bd..7ce09cd 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -84,8 +84,7 @@ class FunctionalDomain(BaseModel): "end": 781, "type": "SequenceLocation", "sequenceReference": { - "id": "NC_000022.11", - "label": "GRCh38:chr22", + "id": "refseq:NC_000022.11", "type": "SequenceReference", "refgetAccession": "SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", "residueAlphabet": "na", @@ -173,8 +172,8 @@ def check_exons(cls, values): "elementGenomicStart": { "id": "ga4gh:SL.2K1vML0ofuYrYncrzzXUQOISRFJldZrO", "type": "SequenceLocation", - "label": "NC_000001.11", "sequenceReference": { + "id": "refseq:NC_000001.11", "type": "SequenceReference", "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", }, @@ -184,8 +183,8 @@ def check_exons(cls, values): "elementGenomicEnd": { "id": "ga4gh:SL.rtR6x2NnJEpROlxiT_DY9C-spf6ijYQi", "type": "SequenceLocation", - "label": "NC_000001.11", "sequenceReference": { + "id": "refseq:NC_000001.11", "type": "SequenceReference", "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", }, From 2572534155033314069d4badcddad94d8076d17f Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 25 Jul 2024 11:22:02 -0400 Subject: [PATCH 70/82] remove sequencelocation label --- src/fusor/models.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index 7ce09cd..2f4a9d2 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -243,7 +243,6 @@ class TemplatedSequenceElement(BaseStructuralElement): "id": "refseq:NC_000012.12", "refgetAccession": "SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl", }, - "label": "chr12:44908821-44908822(+)", }, "strand": 1, } From 62745b88811b2573ede02e2806270fe692dcee89 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 25 Jul 2024 11:26:22 -0400 Subject: [PATCH 71/82] fix ruff errors --- src/fusor/nomenclature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fusor/nomenclature.py b/src/fusor/nomenclature.py index aadd526..c1bf295 100644 --- a/src/fusor/nomenclature.py +++ b/src/fusor/nomenclature.py @@ -1,8 +1,8 @@ """Provide helper methods for fusion nomenclature generation.""" from biocommons.seqrepo.seqrepo import SeqRepo -from ga4gh.vrs.models import SequenceReference from cool_seq_tool.schemas import Strand +from ga4gh.vrs.models import SequenceReference from fusor.exceptions import IDTranslationException from fusor.models import ( From 87321357c0438792f44f4fc59e1d9c3c83ac42d7 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 25 Jul 2024 13:09:44 -0400 Subject: [PATCH 72/82] pinning pydantic version to stop validation error in tests --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e689665..fbd3eef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,8 +24,9 @@ classifiers = [ requires-python = ">=3.10" description = "Computable object representation and validation for gene fusions" license = {file = "LICENSE"} +# pydantic is pinned to 2.4.2 for now, since there is a change in later versions that causes a validation error for test_fusion in test_fusor.py dependencies = [ - "pydantic == 2.*", + "pydantic ==2.4.2", "ga4gh.vrs ~=2.0.0a10", "biocommons.seqrepo", "gene-normalizer ==0.4.0", From acab073f94e899f26115b70edfdbfc37933c7e01 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 25 Jul 2024 13:27:10 -0400 Subject: [PATCH 73/82] test: updating test to fail with unexpected sequence id provided --- tests/test_fusor.py | 74 ++++++--------------------------------------- 1 file changed, 10 insertions(+), 64 deletions(-) diff --git a/tests/test_fusor.py b/tests/test_fusor.py index 2741508..0fabb8a 100644 --- a/tests/test_fusor.py +++ b/tests/test_fusor.py @@ -7,7 +7,7 @@ from ga4gh.core.domain_models import Gene from ga4gh.vrs.models import SequenceLocation -from fusor.exceptions import FUSORParametersException +from fusor.exceptions import FUSORParametersException, IDTranslationException from fusor.models import ( AssayedFusion, CategoricalFusion, @@ -179,40 +179,6 @@ def templated_sequence_element(): return TemplatedSequenceElement(**params) -@pytest.fixture() -def templated_sequence_element_ensg(): - """Create test fixture using non-seqrepo-recognized sequence ID""" - params = { - "type": "TemplatedSequenceElement", - "region": { - "id": "ENSG00000157764", - "type": "SequenceLocation", - "start": 140719328, - "end": 140719400, - }, - "strand": -1, - } - return TemplatedSequenceElement(**params) - - -@pytest.fixture(scope="module") -def templated_sequence_element_custom_id(): - """Create test fixture using custom (ie unable to coerce namespace) - sequence identifier. - """ - params = { - "type": "TemplatedSequenceElement", - "region": { - "id": "custom_ID__1", - "type": "SequenceLocation", - "start": 200, - "end": 300, - }, - "strand": 1, - } - return TemplatedSequenceElement(**params) - - @pytest.fixture(scope="module") def transcript_segment_element(): """Create transcript segment element test fixture""" @@ -627,8 +593,6 @@ def test_gene_element(fusor_instance, braf_gene_obj_min, braf_gene_obj): def test_templated_sequence_element( fusor_instance, templated_sequence_element, - templated_sequence_element_ensg, - templated_sequence_element_custom_id, ): """Test that templated sequence element works correctly""" tsg = fusor_instance.templated_sequence_element( @@ -653,35 +617,17 @@ def test_templated_sequence_element( ) assert tsg.model_dump() == expected - tsg = fusor_instance.templated_sequence_element( - 140719329, 140719400, "ENSG00000157764", Strand.NEGATIVE - ) - assert tsg.model_dump() == templated_sequence_element_ensg.model_dump() - - # test untranslateable sequence ID - # adds "ensembl" namespace but unable to translate to ga4gh digest ID - expected = copy.deepcopy(templated_sequence_element_ensg.model_dump()) - tsg = fusor_instance.templated_sequence_element( - 140719329, - 140719400, - "ENSG00000157764", - Strand.NEGATIVE, - seq_id_target_namespace="ga4gh", - ) - assert tsg.model_dump() == expected - # test in-house/bespoke sequence ID # can't coerce namespace or translate to ga4gh ID - expected = copy.deepcopy(templated_sequence_element_custom_id.model_dump()) - tsg = fusor_instance.templated_sequence_element( - 200, - 300, - "custom_ID__1", - Strand.POSITIVE, - residue_mode="inter-residue", - seq_id_target_namespace="ga4gh", - ) - assert tsg.model_dump() == expected + with pytest.raises(IDTranslationException): + fusor_instance.templated_sequence_element( + 200, + 300, + "custom_ID__1", + Strand.POSITIVE, + residue_mode="inter-residue", + seq_id_target_namespace="ga4gh", + ) def test_linker_element(fusor_instance, linker_element): From a0a4529cdcc101bde4e1257e6cecfcaa6b87ec0e Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Thu, 25 Jul 2024 13:44:22 -0400 Subject: [PATCH 74/82] Update test fixtures for correct use of start and end" --- tests/test_fusor.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/test_fusor.py b/tests/test_fusor.py index 0fabb8a..73d1efe 100644 --- a/tests/test_fusor.py +++ b/tests/test_fusor.py @@ -203,7 +203,6 @@ def transcript_segment_element(): "type": "SequenceReference", }, "start": 154170399, - "end": 154170400, }, "elementGenomicStart": { "id": "ga4gh:SL.2K1vML0ofuYrYncrzzXUQOISRFJldZrO", @@ -213,8 +212,7 @@ def transcript_segment_element(): "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", "type": "SequenceReference", }, - "start": 154192135, - "end": 154192136, + "end": 154192135, }, } return TranscriptSegmentElement(**params) @@ -245,7 +243,6 @@ def mane_transcript_segment_element(): "type": "SequenceReference", }, "start": 9575887, - "end": 9575886, }, } return TranscriptSegmentElement(**params) From f1b82a0c9ca18a62e04405b92a87ed6734dd821f Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 25 Jul 2024 14:33:48 -0400 Subject: [PATCH 75/82] update comment --- src/fusor/fusor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 4fafc7b..808021e 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -264,7 +264,7 @@ async def transcript_segment_element( kwargs["start"] = start - 1 if start is not None else None kwargs["residue_mode"] = "inter-residue" chromosome = kwargs.get("chromosome") - # if chromosome is a string, assume it's an accession, fix it for the kwargs since CST expects + # if chromosome is a string, assume it's an accession, fix it for the kwargs since CST expects this as alt_ac if type(chromosome) is str: kwargs["alt_ac"] = chromosome data = await self.cool_seq_tool.ex_g_coords_mapper.genomic_to_transcript_exon_coordinates( From e9a70b100be4418996855d45435e32743d864356 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Thu, 25 Jul 2024 14:39:54 -0400 Subject: [PATCH 76/82] Revert "Update test fixtures for correct use of start and end"" This reverts commit a0a4529cdcc101bde4e1257e6cecfcaa6b87ec0e. --- tests/test_fusor.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_fusor.py b/tests/test_fusor.py index 73d1efe..0fabb8a 100644 --- a/tests/test_fusor.py +++ b/tests/test_fusor.py @@ -203,6 +203,7 @@ def transcript_segment_element(): "type": "SequenceReference", }, "start": 154170399, + "end": 154170400, }, "elementGenomicStart": { "id": "ga4gh:SL.2K1vML0ofuYrYncrzzXUQOISRFJldZrO", @@ -212,7 +213,8 @@ def transcript_segment_element(): "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", "type": "SequenceReference", }, - "end": 154192135, + "start": 154192135, + "end": 154192136, }, } return TranscriptSegmentElement(**params) @@ -243,6 +245,7 @@ def mane_transcript_segment_element(): "type": "SequenceReference", }, "start": 9575887, + "end": 9575886, }, } return TranscriptSegmentElement(**params) From dd3f7c16352ca0f2bb53e505339eaa0263b38a90 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 29 Jul 2024 09:41:00 -0400 Subject: [PATCH 77/82] Update src/fusor/models.py Co-authored-by: Kori Kuzma --- src/fusor/models.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index 2f4a9d2..1efaa3c 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -443,10 +443,7 @@ def _fetch_gene_id( :param alt_field: the field to fetch the gene from, if it is not called "gene" (ex: associatedGene instead) :return: gene ID if gene is defined """ - if alt_field: - gene_info = cls._access_object_attr(obj, alt_field) - else: - gene_info = cls._access_object_attr(obj, "gene") + gene_info = cls._access_object_attr(obj, alt_field if alt_field else "gene") if gene_info: gene_id = cls._access_object_attr(gene_info, "id") if gene_id: From 774f5590a6a47b606ce464682bde9a6121f23c3e Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 29 Jul 2024 09:41:44 -0400 Subject: [PATCH 78/82] Update src/fusor/fusor.py Co-authored-by: Kori Kuzma --- src/fusor/fusor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 808021e..3886adc 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -571,9 +571,9 @@ def _normalized_gene( gene = gene_norm_resp.gene # remove normalize.gene from id (ex; 'normalize.gene.hgnc:1097' -> 'hgnc:1097') gene_id = gene_norm_resp.normalized_id.replace("normalize.gene.", "") - gene.id = gene_id if use_minimal_gene: return Gene(id=gene_id, label=gene.label), None + gene.id = gene_id return gene, None return None, f"gene-normalizer unable to normalize {query}" From 8fd8d3fe602fb2ae1e56df3a0372b3188a8717c1 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 29 Jul 2024 09:42:50 -0400 Subject: [PATCH 79/82] Update src/fusor/fusor.py Co-authored-by: Kori Kuzma --- src/fusor/fusor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 3886adc..f8cb1da 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -569,8 +569,7 @@ def _normalized_gene( gene_norm_resp = self.gene_normalizer.normalize(query) if gene_norm_resp.match_type: gene = gene_norm_resp.gene - # remove normalize.gene from id (ex; 'normalize.gene.hgnc:1097' -> 'hgnc:1097') - gene_id = gene_norm_resp.normalized_id.replace("normalize.gene.", "") + gene_id = gene_norm_resp.normalized_id if use_minimal_gene: return Gene(id=gene_id, label=gene.label), None gene.id = gene_id From c664b7781c20696505476fed4d0401f7e0d434b0 Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Mon, 29 Jul 2024 09:49:26 -0400 Subject: [PATCH 80/82] fix: example data --- src/fusor/models.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/fusor/models.py b/src/fusor/models.py index 2f4a9d2..d8ed1f6 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -87,7 +87,6 @@ class FunctionalDomain(BaseModel): "id": "refseq:NC_000022.11", "type": "SequenceReference", "refgetAccession": "SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", - "residueAlphabet": "na", }, }, } @@ -694,24 +693,31 @@ class CategoricalFusion(AbstractFusion): "label": "TPM3", }, "elementGenomicStart": { - "id": "TPM3:exon1", + "id": "ga4gh:SL.2K1vML0ofuYrYncrzzXUQOISRFJldZrO", "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceReference", + }, "start": 154192135, "end": 154192136, }, "elementGenomicEnd": { - "id": "TPM3:exon8", + "id": "ga4gh:SL.m5_Spfzt1v4sfVw9u4kmuYn7dM7gyNeb", "type": "SequenceLocation", + "sequenceReference": { + "id": "refseq:NC_000001.11", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", + "type": "SequenceReference", + }, "start": 154170398, "end": 154170399, }, }, { "type": "GeneElement", - "gene": { - "id": "hgnc:427", - "label": "ALK", - }, + "gene": {"id": "hgnc:427", "label": "ALK", "type": "Gene"}, }, ], "regulatoryElement": { From 2880e184ef59904fec2b9647fd26d696cef7bdfc Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Tue, 30 Jul 2024 14:39:19 -0400 Subject: [PATCH 81/82] update fusion constructor to accept the body of a valid fusion (same casing as the fusion classes) --- pyproject.toml | 1 + src/fusor/fusor.py | 40 ++++++++++++++++++++-------------------- tests/test_fusor.py | 12 ++++++------ 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fbd3eef..c73862a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -160,6 +160,7 @@ ignore = [ # SLF001 - private-member-access "tests/*" = ["ANN001", "ANN2", "ANN102", "S101", "B011", "INP001", "SLF001"] "src/fusor/models.py" = ["ANN201", "N803", "N805", "N815", "ANN001", "ANN2", "ANN102"] +"src/fusor/fusor.py" = ["N803", "N805"] [tool.ruff.format] docstring-code-format = true diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index f8cb1da..17db051 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -125,7 +125,7 @@ def fusion(self, fusion_type: FusionType | None = None, **kwargs) -> Fusion: # try to infer from provided attributes categorical_attributes = any( [ - "critical_functional_domains" in kwargs, + "criticalFunctionalDomains" in kwargs, self._contains_element_type( kwargs, StructuralElementType.MULTIPLE_POSSIBLE_GENES_ELEMENT ), @@ -133,7 +133,7 @@ def fusion(self, fusion_type: FusionType | None = None, **kwargs) -> Fusion: ) assayed_attributes = any( [ - "causative_event" in kwargs, + "causativeEvent" in kwargs, "assay" in kwargs, self._contains_element_type( kwargs, StructuralElementType.UNKNOWN_GENE_ELEMENT @@ -159,15 +159,15 @@ def fusion(self, fusion_type: FusionType | None = None, **kwargs) -> Fusion: @staticmethod def categorical_fusion( structure: CategoricalFusionElements, - regulatory_element: RegulatoryElement | None = None, - critical_functional_domains: list[FunctionalDomain] | None = None, - reading_frame_preserved: bool | None = None, + regulatoryElement: RegulatoryElement | None = None, + criticalFunctionalDomains: list[FunctionalDomain] | None = None, + readingFramePreserved: bool | None = None, ) -> CategoricalFusion: """Construct a categorical fusion object :param structure: elements constituting the fusion - :param regulatory_element: affected regulatory element - :param critical_functional_domains: lost or preserved functional domains - :param reading_frame_preserved: ``True`` if reading frame is preserved. + :param regulatoryElement: affected regulatory element + :param criticalFunctionalDomains: lost or preserved functional domains + :param readingFramePreserved: ``True`` if reading frame is preserved. ``False`` otherwise :return: CategoricalFusion if construction successful :raise: FUSORParametersException if given incorrect fusion properties @@ -175,9 +175,9 @@ def categorical_fusion( try: fusion = CategoricalFusion( structure=structure, - criticalFunctionalDomains=critical_functional_domains, - readingFramePreserved=reading_frame_preserved, - regulatoryElement=regulatory_element, + criticalFunctionalDomains=criticalFunctionalDomains, + readingFramePreserved=readingFramePreserved, + regulatoryElement=regulatoryElement, ) except ValidationError as e: raise FUSORParametersException(str(e)) from e @@ -186,17 +186,17 @@ def categorical_fusion( @staticmethod def assayed_fusion( structure: AssayedFusionElements, - causative_event: CausativeEvent | None = None, + causativeEvent: CausativeEvent | None = None, assay: Assay | None = None, - regulatory_element: RegulatoryElement | None = None, - reading_frame_preserved: bool | None = None, + regulatoryElement: RegulatoryElement | None = None, + readingFramePreserved: bool | None = None, ) -> AssayedFusion: """Construct an assayed fusion object :param structure: elements constituting the fusion - :param causative_event: event causing the fusion + :param causativeEvent: event causing the fusion :param assay: how knowledge of the fusion was obtained - :param regulatory_element: affected regulatory elements - :param reading_frame_preserved: ``True`` if reading frame is preserved. + :param regulatoryElement: affected regulatory elements + :param readingFramePreserved: ``True`` if reading frame is preserved. ``False`` otherwise :return: Tuple containing optional AssayedFusion if construction successful, and any relevant validation warnings @@ -204,10 +204,10 @@ def assayed_fusion( try: fusion = AssayedFusion( structure=structure, - regulatoryElement=regulatory_element, - causativeEvent=causative_event, + regulatoryElement=regulatoryElement, + causativeEvent=causativeEvent, assay=assay, - readingFramePreserved=reading_frame_preserved, + readingFramePreserved=readingFramePreserved, ) except ValidationError as e: raise FUSORParametersException(str(e)) from e diff --git a/tests/test_fusor.py b/tests/test_fusor.py index 0fabb8a..f37984a 100644 --- a/tests/test_fusor.py +++ b/tests/test_fusor.py @@ -334,7 +334,7 @@ def test_fusion( linker_element, UnknownGeneElement(), ], - causative_event={ + causativeEvent={ "type": "CausativeEvent", "eventType": "rearrangement", "eventDescription": "chr2:g.pter_8,247,756::chr11:g.15,825,273_cen_qter (der11) and chr11:g.pter_15,825,272::chr2:g.8,247,757_cen_qter (der2)", @@ -353,7 +353,7 @@ def test_fusion( transcript_segment_element, MultiplePossibleGenesElement(), ], - critical_functional_domains=[functional_domain], + criticalFunctionalDomains=[functional_domain], ) assert isinstance(f, CategoricalFusion) @@ -364,8 +364,8 @@ def test_fusion( transcript_segment_element, UnknownGeneElement(), ], - causative_event="rearrangement", - critical_functional_domains=[functional_domain], + causativeEvent="rearrangement", + criticalFunctionalDomains=[functional_domain], ) assert str(excinfo.value) == "Received conflicting attributes" @@ -387,7 +387,7 @@ def test_fusion( linker_element, UnknownGeneElement(), ], - causative_event={ + causativeEvent={ "type": "CausativeEvent", "eventType": "rearrangement", }, @@ -406,7 +406,7 @@ def test_fusion( transcript_segment_element, MultiplePossibleGenesElement(), ], - critical_functional_domains=[functional_domain], + criticalFunctionalDomains=[functional_domain], ) assert isinstance(f, CategoricalFusion) From b9bff0068e7d0db4848b18a0758a6c51b992a09e Mon Sep 17 00:00:00 2001 From: Katie Stahl Date: Tue, 30 Jul 2024 14:56:23 -0400 Subject: [PATCH 82/82] Revert "update fusion constructor to accept the body of a valid fusion (same casing as the fusion classes)" This reverts commit 2880e184ef59904fec2b9647fd26d696cef7bdfc. --- pyproject.toml | 1 - src/fusor/fusor.py | 40 ++++++++++++++++++++-------------------- tests/test_fusor.py | 12 ++++++------ 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c73862a..fbd3eef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -160,7 +160,6 @@ ignore = [ # SLF001 - private-member-access "tests/*" = ["ANN001", "ANN2", "ANN102", "S101", "B011", "INP001", "SLF001"] "src/fusor/models.py" = ["ANN201", "N803", "N805", "N815", "ANN001", "ANN2", "ANN102"] -"src/fusor/fusor.py" = ["N803", "N805"] [tool.ruff.format] docstring-code-format = true diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 17db051..f8cb1da 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -125,7 +125,7 @@ def fusion(self, fusion_type: FusionType | None = None, **kwargs) -> Fusion: # try to infer from provided attributes categorical_attributes = any( [ - "criticalFunctionalDomains" in kwargs, + "critical_functional_domains" in kwargs, self._contains_element_type( kwargs, StructuralElementType.MULTIPLE_POSSIBLE_GENES_ELEMENT ), @@ -133,7 +133,7 @@ def fusion(self, fusion_type: FusionType | None = None, **kwargs) -> Fusion: ) assayed_attributes = any( [ - "causativeEvent" in kwargs, + "causative_event" in kwargs, "assay" in kwargs, self._contains_element_type( kwargs, StructuralElementType.UNKNOWN_GENE_ELEMENT @@ -159,15 +159,15 @@ def fusion(self, fusion_type: FusionType | None = None, **kwargs) -> Fusion: @staticmethod def categorical_fusion( structure: CategoricalFusionElements, - regulatoryElement: RegulatoryElement | None = None, - criticalFunctionalDomains: list[FunctionalDomain] | None = None, - readingFramePreserved: bool | None = None, + regulatory_element: RegulatoryElement | None = None, + critical_functional_domains: list[FunctionalDomain] | None = None, + reading_frame_preserved: bool | None = None, ) -> CategoricalFusion: """Construct a categorical fusion object :param structure: elements constituting the fusion - :param regulatoryElement: affected regulatory element - :param criticalFunctionalDomains: lost or preserved functional domains - :param readingFramePreserved: ``True`` if reading frame is preserved. + :param regulatory_element: affected regulatory element + :param critical_functional_domains: lost or preserved functional domains + :param reading_frame_preserved: ``True`` if reading frame is preserved. ``False`` otherwise :return: CategoricalFusion if construction successful :raise: FUSORParametersException if given incorrect fusion properties @@ -175,9 +175,9 @@ def categorical_fusion( try: fusion = CategoricalFusion( structure=structure, - criticalFunctionalDomains=criticalFunctionalDomains, - readingFramePreserved=readingFramePreserved, - regulatoryElement=regulatoryElement, + criticalFunctionalDomains=critical_functional_domains, + readingFramePreserved=reading_frame_preserved, + regulatoryElement=regulatory_element, ) except ValidationError as e: raise FUSORParametersException(str(e)) from e @@ -186,17 +186,17 @@ def categorical_fusion( @staticmethod def assayed_fusion( structure: AssayedFusionElements, - causativeEvent: CausativeEvent | None = None, + causative_event: CausativeEvent | None = None, assay: Assay | None = None, - regulatoryElement: RegulatoryElement | None = None, - readingFramePreserved: bool | None = None, + regulatory_element: RegulatoryElement | None = None, + reading_frame_preserved: bool | None = None, ) -> AssayedFusion: """Construct an assayed fusion object :param structure: elements constituting the fusion - :param causativeEvent: event causing the fusion + :param causative_event: event causing the fusion :param assay: how knowledge of the fusion was obtained - :param regulatoryElement: affected regulatory elements - :param readingFramePreserved: ``True`` if reading frame is preserved. + :param regulatory_element: affected regulatory elements + :param reading_frame_preserved: ``True`` if reading frame is preserved. ``False`` otherwise :return: Tuple containing optional AssayedFusion if construction successful, and any relevant validation warnings @@ -204,10 +204,10 @@ def assayed_fusion( try: fusion = AssayedFusion( structure=structure, - regulatoryElement=regulatoryElement, - causativeEvent=causativeEvent, + regulatoryElement=regulatory_element, + causativeEvent=causative_event, assay=assay, - readingFramePreserved=readingFramePreserved, + readingFramePreserved=reading_frame_preserved, ) except ValidationError as e: raise FUSORParametersException(str(e)) from e diff --git a/tests/test_fusor.py b/tests/test_fusor.py index f37984a..0fabb8a 100644 --- a/tests/test_fusor.py +++ b/tests/test_fusor.py @@ -334,7 +334,7 @@ def test_fusion( linker_element, UnknownGeneElement(), ], - causativeEvent={ + causative_event={ "type": "CausativeEvent", "eventType": "rearrangement", "eventDescription": "chr2:g.pter_8,247,756::chr11:g.15,825,273_cen_qter (der11) and chr11:g.pter_15,825,272::chr2:g.8,247,757_cen_qter (der2)", @@ -353,7 +353,7 @@ def test_fusion( transcript_segment_element, MultiplePossibleGenesElement(), ], - criticalFunctionalDomains=[functional_domain], + critical_functional_domains=[functional_domain], ) assert isinstance(f, CategoricalFusion) @@ -364,8 +364,8 @@ def test_fusion( transcript_segment_element, UnknownGeneElement(), ], - causativeEvent="rearrangement", - criticalFunctionalDomains=[functional_domain], + causative_event="rearrangement", + critical_functional_domains=[functional_domain], ) assert str(excinfo.value) == "Received conflicting attributes" @@ -387,7 +387,7 @@ def test_fusion( linker_element, UnknownGeneElement(), ], - causativeEvent={ + causative_event={ "type": "CausativeEvent", "eventType": "rearrangement", }, @@ -406,7 +406,7 @@ def test_fusion( transcript_segment_element, MultiplePossibleGenesElement(), ], - criticalFunctionalDomains=[functional_domain], + critical_functional_domains=[functional_domain], ) assert isinstance(f, CategoricalFusion)