diff --git a/src/fusor/examples/alk.json b/src/fusor/examples/alk.json index d536e6b..d8b8f00 100644 --- a/src/fusor/examples/alk.json +++ b/src/fusor/examples/alk.json @@ -38,10 +38,9 @@ "type": "MultiplePossibleGenesElement" }, { - "type": "GeneElement", - "gene_descriptor": { + "gene": { "id": "normalize.gene:ALK", - "type": "GeneDescriptor", + "type": "Gene", "label": "ALK", "gene_id": "hgnc:427" } diff --git a/src/fusor/examples/bcr_abl1.json b/src/fusor/examples/bcr_abl1.json index b7a1b8d..3716300 100644 --- a/src/fusor/examples/bcr_abl1.json +++ b/src/fusor/examples/bcr_abl1.json @@ -1,111 +1,131 @@ { "type": "CategoricalFusion", - "structural_elements": [ - { - "type": "TranscriptSegmentElement", - "transcript": "refseq:NM_004327.3", - "gene_descriptor": { - "type": "GeneDescriptor", - "id": "normalize.gene:BCR", - "gene_id": "hgnc:1014", - "label": "BCR" + "structure": { + "type": "Adjacency", + "adjoinedSequences": [{ + "type": "SequenceLocation", + "sequenceReference": { + "id": "GRCh38:chr22", + "type": "SequenceReference", + "refgetAccession": "SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", + "residueAlphabet": "na" }, - "element_genomic_end": { - "id": "fusor.location_descriptor:NC_000022.11", - "type": "LocationDescriptor", - "label": "NC_000022.11", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NC_000022.11", - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 23253980 - }, - "end": { - "type": "Number", - "value": 23253981 + "end": 23290413, + "extensions": [ + { + "name": "NM_004327.4:e._14", + "description": "VICC exon representation of the aligned transcript boundary.", + "value": { + "exon_end": 14, + "exon_end_offset": 0, + "sequenceReference":{ + "type": "SequenceReference", + "id": "NM_004327.4", + "refgetAccession": "SQ.kpytJsXw3BwLC3oBSjHQS1kwxs4WO3I3", + "residueAlphabet": "na" } } - } - }, - "exon_end": 2, - "exon_end_offset": 182 - }, - { - "type": "LinkerSequenceElement", - "linker_sequence": { - "id": "sequence:ACTAAAGCG", - "type": "SequenceDescriptor", - "sequence": "ACTAAAGCG", - "residue_type": "SO:0000348" - } - }, - { - "type": "TranscriptSegmentElement", - "transcript": "refseq:NM_005157.5", - "exon_start": 2, - "exon_start_offset": -173, - "gene_descriptor": { - "id": "normalize.gene:ABL1", - "type": "GeneDescriptor", - "label": "ABL1", - "gene_id": "hgnc:76" - }, - "element_genomic_start": { - "id": "fusor.location_descriptor:NC_000009.12", - "type": "LocationDescriptor", - "label": "NC_000009.12", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NC_000009.12", - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 130854064 + }, + { + "name": "NM_004327.4:c._2782", + "description": "Transcript SequenceLocation of the aligned transcript boundary.", + "value": { + "type": "SequenceLocation", + "sequenceReference": { + "id": "NM_004327.4", + "type": "SequenceReference", + "refgetAccession": "SQ.kpytJsXw3BwLC3oBSjHQS1kwxs4WO3I3", + "residueAlphabet": "na" }, - "end": { - "type": "Number", - "value": 130854065 - } + "end": 3234 + } + }, + { + "name": "gene", + "description": "The gene concept (BCR) associated with this fusion partner.", + "value": { + "code": "hgnc:1014", + "system": "https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/", + "label": "BCR" } } - } + ]}, + { + "type": "SequenceLocation", + "sequenceReference": { + "id": "GRCh38:chr9", + "type": "SequenceReference", + "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI", + "residueAlphabet": "na" + }, + "start": 130854064, + "extensions": [ + { + "name": "NM_005157.6:e.2_", + "description": "VICC exon representation of the aligned transcript boundary.", + "value": { + "exon_start": 2, + "exon_start_offset": 0, + "sequenceReference":{ + "id": "NM_005157.6", + "type": "SequenceReference", + "refgetAccession": "SQ.w8Qg3x-PQ2akJrJQeGEN-_eBUMo1H1CL", + "residueAlphabet": "na" + } + } + }, + { + "name": "NM_005157.6:c.80_", + "description": "Transcript SequenceLocation of the aligned transcript boundary.", + "value": { + "type": "SequenceLocation", + "sequenceReference": { + "id": "NM_005157.6", + "type": "SequenceReference", + "refgetAccession": "SQ.w8Qg3x-PQ2akJrJQeGEN-_eBUMo1H1CL", + "residueAlphabet": "na" + }, + "end": 273 + } + }, + { + "name": "gene", + "description": "The gene concept (ABL1) associated with this fusion partner.", + "value": { + "code": "hgnc:76", + "system": "https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/", + "label": "ABL1" + } + } + ] + }], + "linker": { + "type": "LiteralSequenceExpression", + "sequence": "CCCGTC" } - ], - "r_frame_preserved": true, - "critical_functional_domains": [ + }, + "readingFramePreserved": true, + "criticalFunctionalDomains": [ { "type": "FunctionalDomain", "status": "preserved", - "associated_gene": { - "id": "normalize.gene:hgnc%3A76", - "type": "GeneDescriptor", - "label": "ABL1", - "gene_id": "hgnc:76" + "gene": { + "code": "hgnc:76", + "system": "https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/", + "label": "ABL1" }, - "_id": "interpro:IPR000980", + "id": "interpro:IPR000980", "label": "SH2 domain", - "sequence_location": { - "id": "fusor.location_descriptor:NP_005148.2", - "type": "LocationDescriptor", - "location": { - "type": "SequenceLocation", - "sequence_id": "refseq:NP_005148.2", - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 127 - }, - "end": { - "type": "Number", - "value": 202 - } - } - } + "sequenceLocation": { + "type": "SequenceLocation", + "sequenceReference": { + "id": "GRCh38:chr22", + "type": "SequenceReference", + "refgetAccession": "SQ.7B7SHsmchAR0dFcDCuSFjJAo7tX87krQ", + "residueAlphabet": "na" + }, + "start": 127, + "end": 202 } } ] diff --git a/src/fusor/examples/ewsr1.json b/src/fusor/examples/ewsr1.json index 2b5b17b..96b15d4 100644 --- a/src/fusor/examples/ewsr1.json +++ b/src/fusor/examples/ewsr1.json @@ -2,9 +2,8 @@ "type": "AssayedFusion", "structural_elements": [ { - "type": "GeneElement", - "gene_descriptor": { - "type": "GeneDescriptor", + "gene": { + "type": "Gene", "id": "normalize.gene:EWSR1", "label": "EWSR1", "gene_id": "hgnc:3508" diff --git a/src/fusor/examples/ewsr1_elements_only.json b/src/fusor/examples/ewsr1_elements_only.json index 103a5f8..4e890b1 100644 --- a/src/fusor/examples/ewsr1_elements_only.json +++ b/src/fusor/examples/ewsr1_elements_only.json @@ -2,9 +2,8 @@ "type": "AssayedFusion", "structural_elements": [ { - "type": "GeneElement", - "gene_descriptor": { - "type": "GeneDescriptor", + "gene": { + "type": "Gene", "id": "normalize.gene:EWSR1", "label": "EWSR1", "gene_id": "hgnc:3508" diff --git a/src/fusor/examples/ewsr1_no_assay.json b/src/fusor/examples/ewsr1_no_assay.json index 691e6db..d54cd07 100644 --- a/src/fusor/examples/ewsr1_no_assay.json +++ b/src/fusor/examples/ewsr1_no_assay.json @@ -2,9 +2,8 @@ "type": "AssayedFusion", "structural_elements": [ { - "type": "GeneElement", - "gene_descriptor": { - "type": "GeneDescriptor", + "gene": { + "type": "Gene", "id": "normalize.gene:EWSR1", "label": "EWSR1", "gene_id": "hgnc:3508" diff --git a/src/fusor/examples/ewsr1_no_causative_event.json b/src/fusor/examples/ewsr1_no_causative_event.json index dfa2748..015dc93 100644 --- a/src/fusor/examples/ewsr1_no_causative_event.json +++ b/src/fusor/examples/ewsr1_no_causative_event.json @@ -2,9 +2,8 @@ "type": "AssayedFusion", "structural_elements": [ { - "type": "GeneElement", - "gene_descriptor": { - "type": "GeneDescriptor", + "gene": { + "type": "Gene", "id": "normalize.gene:EWSR1", "label": "EWSR1", "gene_id": "hgnc:3508" diff --git a/src/fusor/examples/igh_myc.json b/src/fusor/examples/igh_myc.json index ac49180..dc5a8f0 100644 --- a/src/fusor/examples/igh_myc.json +++ b/src/fusor/examples/igh_myc.json @@ -13,9 +13,8 @@ }, "structural_elements": [ { - "type": "GeneElement", - "gene_descriptor": { - "type": "GeneDescriptor", + "gene": { + "type": "Gene", "label": "MYC", "gene_id": "hgnc:7553", "id": "normalize.gene:MYC" diff --git a/src/fusor/fusor.py b/src/fusor/fusor.py index 5bff793..87f8d81 100644 --- a/src/fusor/fusor.py +++ b/src/fusor/fusor.py @@ -2,7 +2,6 @@ import logging import re -from urllib.parse import quote from biocommons.seqrepo import SeqRepo from bioutils.accessions import coerce_namespace @@ -10,7 +9,7 @@ from cool_seq_tool.schemas import ResidueMode from ga4gh.core import ga4gh_identify from ga4gh.vrs import models -from ga4gh.vrs.models import SequenceLocation +from ga4gh.vrs.models import Adjacency, SequenceLocation from gene.database import AbstractDatabase as GeneDatabase from gene.database import create_db from gene.query import QueryHandler @@ -32,7 +31,7 @@ FunctionalDomain, Fusion, FusionType, - GeneElement, + Gene, LinkerElement, MultiplePossibleGenesElement, RegulatoryClass, @@ -225,7 +224,7 @@ async def transcript_segment_element( use_minimal_gene_descr: bool = True, seq_id_target_namespace: str | None = None, **kwargs, - ) -> tuple[TranscriptSegmentElement | None, list[str] | None]: + ) -> tuple[Adjacency | None, list[str] | None]: """Create transcript segment element :param bool tx_to_genomic_coords: `True` if going from transcript @@ -285,12 +284,13 @@ async def transcript_segment_element( genomic_data = data.genomic_data genomic_data.transcript = coerce_namespace(genomic_data.transcript) - normalized_gene_response = self._normalized_gene_descriptor( + normalized_gene_response = self._normalized_gene( genomic_data.gene, use_minimal_gene_descr=use_minimal_gene_descr ) if not normalized_gene_response[0] and normalized_gene_response[1]: return None, [normalized_gene_response[1]] + adjacency_obj = Adjacency() return ( TranscriptSegmentElement( transcript=genomic_data.transcript, @@ -298,7 +298,7 @@ async def transcript_segment_element( exon_start_offset=genomic_data.exon_start_offset, exon_end=genomic_data.exon_end, exon_end_offset=genomic_data.exon_end_offset, - gene_descriptor=normalized_gene_response[0], + gene=normalized_gene_response[0], element_genomic_start=self._location_descriptor( genomic_data.start, genomic_data.start + 1, @@ -330,14 +330,16 @@ def gene_element( :param bool use_minimal_gene_descr: `True` if minimal gene descriptor (`id`, `gene_id`, `label`) will be used. `False` if gene-normalizer's gene descriptor will be used - :return: GeneElement, warning + :return: Gene, warning """ - gene_descr, warning = self._normalized_gene_descriptor( + gene_descr, warning = self._normalized_gene( gene, use_minimal_gene_descr=use_minimal_gene_descr ) if not gene_descr: return None, warning - return GeneElement(gene_descriptor=gene_descr), None + return Gene( + id=gene_descr.id, gene_id=gene_descr.gene_id, label=gene_descr.label + ), None def templated_sequence_element( self, @@ -474,7 +476,7 @@ def functional_domain( if not seq: return None, warning - gene_descr, warning = self._normalized_gene_descriptor( + gene_descr, warning = self._normalized_gene( gene, use_minimal_gene_descr=use_minimal_gene_descr ) if not gene_descr: @@ -513,7 +515,7 @@ def regulatory_element( :return: Tuple with RegulatoryElement instance and None value for warnings if successful, or a None value and warning message if unsuccessful """ - gene_descr, warning = self._normalized_gene_descriptor( + gene_descr, warning = self._normalized_gene( gene, use_minimal_gene_descr=use_minimal_gene_descr ) if not gene_descr: @@ -547,7 +549,6 @@ def _location_descriptor( set this to the namespace you want the digest for. Otherwise, leave as `None`. """ - try: sequence_id = coerce_namespace(sequence_id) except ValueError: @@ -708,11 +709,11 @@ def add_translated_sequence_id( domain.sequence_location.location.sequence_id = new_id return fusion - def add_gene_descriptor(self, fusion: Fusion) -> Fusion: - """Add additional fields to `gene_descriptor` in fusion object + def add_gene_fields(self, fusion: Fusion) -> Fusion: + """Add additional fields to `gene` in fusion object :param Fusion fusion: A valid Fusion object - :return: Updated fusion with additional fields set in `gene_descriptor` + :return: Updated fusion with additional fields set in `gene` """ properties = [fusion.structural_elements] if fusion.type == FusionType.CATEGORICAL_FUSION: @@ -720,40 +721,40 @@ def add_gene_descriptor(self, fusion: Fusion) -> Fusion: for prop in properties: for obj in prop: - if "gene_descriptor" in obj.model_fields: - label = obj.gene_descriptor.label - norm_gene_descr, _ = self._normalized_gene_descriptor( + if "gene" in obj.model_fields: + label = obj.gene.label + norm_gene_descr, _ = self._normalized_gene( label, use_minimal_gene_descr=False ) if norm_gene_descr: - obj.gene_descriptor = norm_gene_descr + obj.gene = norm_gene_descr if fusion.regulatory_element and fusion.regulatory_element.associated_gene: reg_el = fusion.regulatory_element label = reg_el.associated_gene.label - norm_gene_descr, _ = self._normalized_gene_descriptor( + norm_gene_descr, _ = self._normalized_gene( label, use_minimal_gene_descr=False ) if norm_gene_descr: reg_el.associated_gene = norm_gene_descr return fusion - def _normalized_gene_descriptor( + def _normalized_gene( self, query: str, use_minimal_gene_descr: bool = True - ) -> tuple[GeneDescriptor | None, str | None]: - """Return gene descriptor from normalized response. + ) -> tuple[Gene | None, str | None]: + """Return gene from normalized response. :param str query: Gene query :param bool use_minimal_gene_descr: `True` if minimal gene descriptor (`id`, `gene_id`, `label`) will be used. `False` if gene-normalizer's gene descriptor will be used - :return: Tuple with gene descriptor and None value for warnings if + :return: Tuple with gene and None value for warnings if successful, and None value with warning string if unsuccessful """ gene_norm_resp = self.gene_normalizer.normalize(query) if gene_norm_resp.match_type: gene_descr = gene_norm_resp.gene if use_minimal_gene_descr: - gene_descr = GeneDescriptor( + gene_descr = Gene( id=gene_descr.id, gene_id=gene_norm_resp.normalized_id, label=gene_descr.label, @@ -782,14 +783,14 @@ def generate_nomenclature(self, fusion: Fusion) -> str: parts.append(element.linker_sequence.sequence) elif isinstance(element, TranscriptSegmentElement): if not any( - [gene == element.gene_descriptor.label for gene in element_genes] # noqa: C419 + [gene == element.gene.label for gene in element_genes] # noqa: C419 ): parts.append(tx_segment_nomenclature(element)) elif isinstance(element, TemplatedSequenceElement): parts.append(templated_seq_nomenclature(element, self.seqrepo)) - elif isinstance(element, GeneElement): + elif isinstance(element, Gene): if not any( - [gene == element.gene_descriptor.label for gene in element_genes] # noqa: C419 + [gene == element.gene.label for gene in element_genes] # noqa: C419 ): parts.append(gene_nomenclature(element)) else: diff --git a/src/fusor/models.py b/src/fusor/models.py index d5c4fa1..dd20c69 100644 --- a/src/fusor/models.py +++ b/src/fusor/models.py @@ -27,7 +27,7 @@ class FUSORTypes(str, Enum): TRANSCRIPT_SEGMENT_ELEMENT = "TranscriptSegmentElement" TEMPLATED_SEQUENCE_ELEMENT = "TemplatedSequenceElement" LINKER_SEQUENCE_ELEMENT = "LinkerSequenceElement" - GENE_ELEMENT = "GeneElement" + GENE = "Gene" UNKNOWN_GENE_ELEMENT = "UnknownGeneElement" MULTIPLE_POSSIBLE_GENES_ELEMENT = "MultiplePossibleGenesElement" REGULATORY_ELEMENT = "RegulatoryElement" @@ -41,7 +41,6 @@ class AdditionalFields(str, Enum): SEQUENCE_ID = "sequence_id" LOCATION_ID = "location_id" - GENE_DESCRIPTOR = "gene_descriptor" class DomainStatus(str, Enum): @@ -51,12 +50,49 @@ class DomainStatus(str, Enum): PRESERVED = "preserved" +class StructuralElementType(str, Enum): + """Define possible structural element type values.""" + + TRANSCRIPT_SEGMENT_ELEMENT = FUSORTypes.TRANSCRIPT_SEGMENT_ELEMENT.value + TEMPLATED_SEQUENCE_ELEMENT = FUSORTypes.TEMPLATED_SEQUENCE_ELEMENT.value + LINKER_SEQUENCE_ELEMENT = FUSORTypes.LINKER_SEQUENCE_ELEMENT.value + GENE_ELEMENT = FUSORTypes.GENE_ELEMENT.value + UNKNOWN_GENE_ELEMENT = FUSORTypes.UNKNOWN_GENE_ELEMENT.value + MULTIPLE_POSSIBLE_GENES_ELEMENT = FUSORTypes.MULTIPLE_POSSIBLE_GENES_ELEMENT.value + + +class BaseStructuralElement(ABC, BaseModel): + """Define base structural element class.""" + + type: StructuralElementType + + +class Gene(BaseStructuralElement): + """Define Gene Element class.""" + + type: Literal[FUSORTypes.GENE] = FUSORTypes.GENE + id: str + gene_id: str + label: str + + model_config = ConfigDict( + json_schema_extra={ + "example": { + "type": "Gene", + "id": "gene:BRAF", + "gene_id": "hgnc:1097", + "label": "BRAF", + } + }, + ) + + class FunctionalDomain(BaseModel): """Define FunctionalDomain class""" type: Literal[FUSORTypes.FUNCTIONAL_DOMAIN] = FUSORTypes.FUNCTIONAL_DOMAIN status: DomainStatus - associated_gene: GeneDescriptor + associated_gene: Gene id: CURIE | None = Field(None, alias="_id") label: StrictStr | None = None sequence_location: LocationDescriptor | None = None @@ -94,23 +130,6 @@ class FunctionalDomain(BaseModel): ) -class StructuralElementType(str, Enum): - """Define possible structural element type values.""" - - TRANSCRIPT_SEGMENT_ELEMENT = FUSORTypes.TRANSCRIPT_SEGMENT_ELEMENT.value - TEMPLATED_SEQUENCE_ELEMENT = FUSORTypes.TEMPLATED_SEQUENCE_ELEMENT.value - LINKER_SEQUENCE_ELEMENT = FUSORTypes.LINKER_SEQUENCE_ELEMENT.value - GENE_ELEMENT = FUSORTypes.GENE_ELEMENT.value - UNKNOWN_GENE_ELEMENT = FUSORTypes.UNKNOWN_GENE_ELEMENT.value - MULTIPLE_POSSIBLE_GENES_ELEMENT = FUSORTypes.MULTIPLE_POSSIBLE_GENES_ELEMENT.value - - -class BaseStructuralElement(ABC, BaseModel): - """Define base structural element class.""" - - type: StructuralElementType - - class TranscriptSegmentElement(BaseStructuralElement): """Define TranscriptSegment class""" @@ -122,7 +141,7 @@ class TranscriptSegmentElement(BaseStructuralElement): exon_start_offset: StrictInt | None = 0 exon_end: StrictInt | None = None exon_end_offset: StrictInt | None = 0 - gene_descriptor: GeneDescriptor + gene: Gene element_genomic_start: LocationDescriptor | None = None element_genomic_end: LocationDescriptor | None = None @@ -164,9 +183,9 @@ def check_exons(cls, values): "exon_start_offset": 0, "exon_end": 8, "exon_end_offset": 0, - "gene_descriptor": { + "gene": { "id": "normalize.gene:TPM3", - "type": "GeneDescriptor", + "type": "Gene", "label": "TPM3", "gene_id": "hgnc:12012", }, @@ -285,27 +304,6 @@ class TemplatedSequenceElement(BaseStructuralElement): ) -class GeneElement(BaseStructuralElement): - """Define Gene Element class.""" - - type: Literal[FUSORTypes.GENE_ELEMENT] = FUSORTypes.GENE_ELEMENT - gene_descriptor: GeneDescriptor - - model_config = ConfigDict( - json_schema_extra={ - "example": { - "type": "GeneElement", - "gene_descriptor": { - "id": "gene:BRAF", - "gene_id": "hgnc:1097", - "label": "BRAF", - "type": "GeneDescriptor", - }, - } - }, - ) - - class UnknownGeneElement(BaseStructuralElement): """Define UnknownGene class. This is primarily intended to represent a partner in the result of a fusion partner-agnostic assay, which identifies @@ -381,7 +379,7 @@ class RegulatoryElement(BaseModel): type: Literal[FUSORTypes.REGULATORY_ELEMENT] = FUSORTypes.REGULATORY_ELEMENT regulatory_class: RegulatoryClass feature_id: str | None = None - associated_gene: GeneDescriptor | None = None + associated_gene: Gene | None = None feature_location: LocationDescriptor | None = None _get_ref_id_val = field_validator("feature_id")(return_value) @@ -472,23 +470,23 @@ def _access_object_attr( def _fetch_gene_id( cls, obj: dict | BaseModel, - gene_descriptor_field: str, + gene_field: str, ) -> str | None: """Get gene ID if element includes a gene annotation. :param obj: element to fetch gene from. Might not contain a gene (e.g. it's a TemplatedSequenceElement) so we have to use safe checks to fetch. - :param gene_descriptor_field: name of gene_descriptor field + :param gene_field: name of gene field :return: gene ID if gene is defined """ - gene_descriptor = cls._access_object_attr(obj, gene_descriptor_field) - if gene_descriptor: - gene_value = cls._access_object_attr(gene_descriptor, "gene") + gene = cls._access_object_attr(obj, gene_field) + if gene: + gene_value = cls._access_object_attr(gene, "gene") if gene_value: gene_id = cls._access_object_attr(gene_value, "gene_id") if gene_id: return gene_id - gene_id = cls._access_object_attr(gene_descriptor, "gene_id") + gene_id = cls._access_object_attr(gene, "gene_id") if gene_id: return gene_id return None @@ -526,15 +524,15 @@ def enforce_element_quantities(cls, values): gene_ids = [] if reg_element: gene_id = cls._fetch_gene_id( - obj=reg_element, gene_descriptor_field="associated_gene" + # TODO: verify this with Jeremy/Alex + obj=reg_element, + gene_field="gene", ) if gene_id: gene_ids.append(gene_id) for element in structural_elements: - gene_id = cls._fetch_gene_id( - obj=element, gene_descriptor_field="gene_descriptor" - ) + gene_id = cls._fetch_gene_id(obj=element, gene_field="gene") if gene_id: gene_ids.append(gene_id) @@ -607,7 +605,7 @@ class Assay(BaseModelForbidExtra): AssayedFusionElements = list[ TranscriptSegmentElement - | GeneElement + | Gene | TemplatedSequenceElement | LinkerElement | UnknownGeneElement @@ -675,13 +673,10 @@ class AssayedFusion(AbstractFusion): }, "structural_elements": [ { - "type": "GeneElement", - "gene_descriptor": { - "id": "gene:EWSR1", - "gene_id": "hgnc:3058", - "label": "EWSR1", - "type": "GeneDescriptor", - }, + "type": "Gene", + "id": "gene:EWSR1", + "gene_id": "hgnc:3058", + "label": "EWSR1", }, {"type": "UnknownGeneElement"}, ], @@ -691,8 +686,9 @@ class AssayedFusion(AbstractFusion): CategoricalFusionElements = list[ + # TODO: TsxSegmentElement -> VRS Adjacency TranscriptSegmentElement - | GeneElement + | Gene | TemplatedSequenceElement | LinkerElement | MultiplePossibleGenesElement @@ -709,7 +705,7 @@ class CategoricalFusion(AbstractFusion): type: Literal[FUSORTypes.CATEGORICAL_FUSION] = FUSORTypes.CATEGORICAL_FUSION r_frame_preserved: StrictBool | None = None critical_functional_domains: list[FunctionalDomain] | None = None - structural_elements: CategoricalFusionElements + structure: CategoricalFusionElements model_config = ConfigDict( json_schema_extra={ @@ -730,54 +726,26 @@ class CategoricalFusion(AbstractFusion): }, } ], - "structural_elements": [ + # TODO: update this example + "structure": [ { - "type": "TranscriptSegmentElement", + "type": "Adjacency", "transcript": "refseq:NM_152263.3", "exon_start": 1, "exon_start_offset": 0, "exon_end": 8, "exon_end_offset": 0, - "gene_descriptor": { + "gene": { "id": "gene:TPM3", "gene_id": "hgnc:12012", - "type": "GeneDescriptor", + "type": "Gene", "label": "TPM3", }, - "element_genomic_start": { - "id": "TPM3:exon1", - "type": "LocationDescriptor", - "location_id": "ga4gh:VSL.vyyyExx4enSZdWZr3z67-T8uVKH50uLi", - "location": { - "sequence_id": "ga4gh:SQ.ijXOSP3XSsuLWZhXQ7_TJ5JXu4RJO6VT", - "type": "SequenceLocation", - "interval": { - "start": {"type": "Number", "value": 154192135}, - "end": {"type": "Number", "value": 154192136}, - "type": "SequenceInterval", - }, - }, - }, - "element_genomic_end": { - "id": "TPM3:exon8", - "type": "LocationDescriptor", - "location_id": "ga4gh:VSL._1bRdL4I6EtpBvVK5RUaXb0NN3k0gpqa", - "location": { - "sequence_id": "ga4gh:SQ.ijXOSP3XSsuLWZhXQ7_TJ5JXu4RJO6VT", - "type": "SequenceLocation", - "interval": { - "start": {"type": "Number", "value": 154170398}, - "end": {"type": "Number", "value": 154170399}, - "type": "SequenceInterval", - }, - }, - }, }, { - "type": "GeneElement", - "gene_descriptor": { + "gene": { "id": "gene:ALK", - "type": "GeneDescriptor", + "type": "Gene", "gene_id": "hgnc:427", "label": "ALK", }, diff --git a/src/fusor/nomenclature.py b/src/fusor/nomenclature.py index 4634e3a..836fc60 100644 --- a/src/fusor/nomenclature.py +++ b/src/fusor/nomenclature.py @@ -1,10 +1,11 @@ """Provide helper methods for fusion nomenclature generation.""" from biocommons.seqrepo.seqrepo import SeqRepo +from ga4gh.vrs.models import SequenceLocation from fusor.exceptions import IDTranslationException from fusor.models import ( - GeneElement, + Gene, RegulatoryClass, RegulatoryElement, TemplatedSequenceElement, @@ -67,7 +68,7 @@ def tx_segment_nomenclature(element: TranscriptSegmentElement) -> str: if ":" in transcript: transcript = transcript.split(":")[1] - prefix = f"{transcript}({element.gene_descriptor.label})" + prefix = f"{transcript}({element.gene.label})" start = element.exon_start if element.exon_start else "" if element.exon_start_offset: if element.exon_start_offset > 0: @@ -97,7 +98,7 @@ def templated_seq_nomenclature(element: TemplatedSequenceElement, sr: SeqRepo) - if element.region and element.region.location: location = element.region.location if isinstance(location, SequenceLocation): - sequence_id = str(location.sequence_id) + sequence_id = str(location.sequenceReference.id) refseq_id = str(translate_identifier(sr, sequence_id, "refseq")) start = location.interval.start.value end = location.interval.end.value @@ -112,19 +113,19 @@ def templated_seq_nomenclature(element: TemplatedSequenceElement, sr: SeqRepo) - raise ValueError -def gene_nomenclature(element: GeneElement) -> str: +def gene_nomenclature(element: Gene) -> str: """Return fusion nomenclature for gene element. - :param GeneElement element: a gene element object + :param Gene element: a gene element object :return: element nomenclature representation :raises ValueError: if unable to retrieve gene ID """ - if element.gene_descriptor.gene_id: - gene_id = gene_id = element.gene_descriptor.gene_id + if element.gene.gene_id: + gene_id = gene_id = element.gene.gene_id - if element.gene_descriptor.gene_id: - gene_id = element.gene_descriptor.gene_id - elif element.gene_descriptor.gene and element.gene_descriptor.gene.gene_id: - gene_id = element.gene_descriptor.gene.gene_id + if element.gene.gene_id: + gene_id = element.gene.gene_id + elif element.gene.gene and element.gene.gene.gene_id: + gene_id = element.gene.gene.gene_id else: raise ValueError - return f"{element.gene_descriptor.label}({gene_id})" + return f"{element.gene.label}({gene_id})" diff --git a/src/fusor/tools.py b/src/fusor/tools.py index 0c4873e..aadd88a 100644 --- a/src/fusor/tools.py +++ b/src/fusor/tools.py @@ -11,7 +11,7 @@ def translate_identifier( seqrepo: SeqRepo, ac: str, target_namespace: str = "ga4gh" -) -> CURIE: +) -> str: """Return `target_namespace` identifier for accession provided. :param str ac: Identifier accession diff --git a/tests/conftest.py b/tests/conftest.py index a7dfcee..b3671af 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -147,7 +147,7 @@ def alk_gene_descriptor(): """Create test fixture for ALK gene descriptor params""" return { "id": "normalize.gene:ALK", - "type": "GeneDescriptor", + "type": "Gene", "label": "ALK", "description": None, "xrefs": ["ensembl:ENSG00000171094", "ncbigene:238"], @@ -495,8 +495,7 @@ def exhaustive_example(alk_gene_descriptor, braf_gene_descriptor): }, }, { - "type": "GeneElement", - "gene_descriptor": alk_gene_descriptor, + "gene": alk_gene_descriptor, }, { "type": "LinkerSequenceElement", @@ -624,10 +623,9 @@ def fusion_example(): }, }, { - "type": "GeneElement", - "gene_descriptor": { + "gene": { "id": "normalize.gene:ALK", - "type": "GeneDescriptor", + "type": "Gene", "label": "ALK", "gene_id": "hgnc:427", }, diff --git a/tests/test_fusor.py b/tests/test_fusor.py index da57f19..a7b4c8b 100644 --- a/tests/test_fusor.py +++ b/tests/test_fusor.py @@ -9,7 +9,7 @@ AssayedFusion, CategoricalFusion, FunctionalDomain, - GeneElement, + Gene, LinkerElement, MultiplePossibleGenesElement, RegulatoryClass, @@ -475,15 +475,15 @@ def test_add_location_id(fusor_instance, fusion_example, exhaustive_example): ) -def test__normalized_gene_descriptor(fusor_instance): - """Test that _normalized_gene_descriptor works correctly.""" +def test__normalized_gene(fusor_instance): + """Test that _normalized_gene works correctly.""" # Actual response is tested in test_add_gene_descriptor - resp = fusor_instance._normalized_gene_descriptor("BRAF") + resp = fusor_instance._normalized_gene("BRAF") assert resp[0] assert resp[1] is None - assert isinstance(resp[0], GeneDescriptor) + assert isinstance(resp[0], Gene) - resp = fusor_instance._normalized_gene_descriptor("B R A F") + resp = fusor_instance._normalized_gene("B R A F") assert resp[0] is None assert resp[1] == "gene-normalizer unable to normalize B R A F" @@ -505,16 +505,16 @@ def test_add_gene_descriptor(fusor_instance, exhaustive_example, fusion_example) for t_field in [actual.critical_functional_domains, actual.structural_elements]: for e_obj in e_field: for t_obj in t_field: - if "gene_descriptor" in e_obj.model_fields: - e_gd = e_obj.gene_descriptor.label + if "gene" in e_obj.model_fields: + e_gd = e_obj.gene.label e_gds.add(e_gd) - if "gene_descriptor" in t_obj.model_fields: - t_gd = t_obj.gene_descriptor.label + if "gene" in t_obj.model_fields: + t_gd = t_obj.gene.label t_gds.add(t_gd) if e_gd == t_gd: compare_gene_descriptor( - t_obj.gene_descriptor.model_dump(), - e_obj.gene_descriptor.model_dump(), + t_obj.gene.model_dump(), + e_obj.gene.model_dump(), ) assert t_gds == e_gds @@ -781,7 +781,7 @@ def test_gene_element(fusor_instance, braf_gene_descr_min, braf_gene_descr): gc = fusor_instance.gene_element("BRAF", use_minimal_gene_descr=True) assert gc[0] assert gc[1] is None - assert isinstance(gc[0], GeneElement) + assert isinstance(gc[0], Gene) compare_gene_descriptor( gc[0].gene_descriptor.model_dump(), braf_gene_descr_min.model_dump() ) @@ -789,7 +789,7 @@ def test_gene_element(fusor_instance, braf_gene_descr_min, braf_gene_descr): gc = fusor_instance.gene_element("BRAF", use_minimal_gene_descr=False) assert gc[0] assert gc[1] is None - assert isinstance(gc[0], GeneElement) + assert isinstance(gc[0], Gene) compare_gene_descriptor( gc[0].gene_descriptor.model_dump(), braf_gene_descr.model_dump() ) diff --git a/tests/test_models.py b/tests/test_models.py index c77c031..c3aee2e 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -13,7 +13,7 @@ CausativeEvent, EventType, FunctionalDomain, - GeneElement, + Gene, LinkerElement, MultiplePossibleGenesElement, RegulatoryElement, @@ -40,7 +40,7 @@ def gene_descriptors(): # alternate structure { "id": "normalize.gene:BRAF", - "type": "GeneDescriptor", + "type": "Gene", "label": "BRAF", "gene_id": "hgnc:1097", }, @@ -217,8 +217,7 @@ def gene_elements(gene_descriptors): """Provide possible gene element input data.""" return [ { - "type": "GeneElement", - "gene_descriptor": gene_descriptors[1], + "gene": gene_descriptors[1], }, {"type": "GeneElement", "gene_descriptor": gene_descriptors[0]}, {"type"}, @@ -613,7 +612,7 @@ def test_gene_element(gene_descriptors): # test enum validation with pytest.raises(ValidationError) as exc_info: - assert GeneElement( + assert Gene( type="UnknownGeneElement", gene_descriptor=gene_descriptors[0] ) msg = "Input should be "