From 3353ce0c7a975cbce6576378a40a832a2b2dbde8 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Mon, 25 Nov 2024 10:23:45 -0500 Subject: [PATCH] Reformat variable names and add filtering checks --- src/fusor/translator.py | 59 +++++++++++++++----- tests/test_translators.py | 112 ++++++++++++++++++++++++++++++-------- 2 files changed, 133 insertions(+), 38 deletions(-) diff --git a/src/fusor/translator.py b/src/fusor/translator.py index 1addad2..e0a0c89 100644 --- a/src/fusor/translator.py +++ b/src/fusor/translator.py @@ -461,44 +461,75 @@ async def from_arriba( ) async def from_cicero( - self, cicero_row: pl.DataFrame, rb: Assembly - ) -> AssayedFusion: + self, + gene_a: str, + gene_b: str, + chr_a: str, + chr_b: str, + pos_a: int, + pos_b: int, + sv_ort: str, + event_type: str, + rb: Assembly, + ) -> AssayedFusion | str: """Parse CICERO output to create AssayedFusion object - :param cicero_row: A row of CICERO output + :param gene_a: The gene symbol for the 5' partner + :param gene_b: The gene symbol for the 3' partner + :param chr_a: The chromosome for the 5' partner + :param chr_b: The chromosome for the 3' partner + :param pos_a: The genomic breakpoint for the 5' partner + :param pos_b: The genomic breakpoint for the 3' partner + :param sv_ort: Whether the mapping orientation of assembled contig has + confident biological meaning + :param event_type: The structural variation event that created the called fusion :param rb: The reference build used to call the fusion :return: An AssayedFusion object, if construction is successful """ - gene1 = cicero_row.get_column("geneA").item() - gene2 = cicero_row.get_column("geneB").item() - gene_5prime = self._get_gene_element(gene1, "cicero")[0].gene.label - gene_3prime = self._get_gene_element(gene2, "cicero")[0].gene.label + # Check if gene symbols have valid formatting. CICERO can output two or more + # gene symbols for `gene_a` or `gene_b`, which are separated by a comma. 
As + # there is not a precise way to resolve this ambiguity, we do not process + # these events + if "," in gene_a or "," in gene_b: + msg = "Ambiguous gene symbols are reported by CICERO for at least one of the fusion partners" + _logger.warning(msg) + return msg + + # Check CICERO annotation regarding the confidence that the called fusion + # has biological meaning + if sv_ort != ">": + msg = "CICERO annotation indicates that this event does not have confident biological meaning" + _logger.warning(msg) + return msg + + gene_5prime = self._get_gene_element(gene_a, "cicero")[0].gene.label + gene_3prime = self._get_gene_element(gene_b, "cicero")[0].gene.label tr_5prime = await self.fusor.transcript_segment_element( tx_to_genomic_coords=False, - genomic_ac=self._get_genomic_ac(cicero_row.get_column("chrA").item(), rb), - seg_end_genomic=int(cicero_row.get_column("posA").item()), + genomic_ac=self._get_genomic_ac(chr_a, rb), + seg_end_genomic=pos_a, gene=gene_5prime, get_nearest_transcript_junction=True, ) tr_3prime = await self.fusor.transcript_segment_element( tx_to_genomic_coords=False, - genomic_ac=self._get_genomic_ac(cicero_row.get_column("chrB").item(), rb), - seg_start_genomic=int(cicero_row.get_column("posB").item()), + genomic_ac=self._get_genomic_ac(chr_b, rb), + seg_start_genomic=pos_b, gene=gene_3prime, get_nearest_transcript_junction=True, ) - if cicero_row.get_column("type").item() == "read_through": + if event_type == "read_through": ce = CausativeEvent( eventType=EventType("read-through"), - eventDescription=cicero_row.get_column("type").item(), + eventDescription=event_type, ) else: ce = CausativeEvent( eventType=EventType("rearrangement"), - eventDescription=cicero_row.get_column("type").item(), + eventDescription=event_type, ) return self._format_fusion( gene_5prime, diff --git a/tests/test_translators.py b/tests/test_translators.py index 842e31f..d7e5cee 100644 --- a/tests/test_translators.py +++ b/tests/test_translators.py @@ -368,39 +368,103 @@ 
async def test_cicero( ): """Test CICERO translator""" # Test exonic breakpoint - cicero_data = pl.DataFrame( - { - "geneA": "TPM3", - "geneB": "PDGFRB", - "chrA": "1", - "posA": "154170465", - "chrB": "5", - "posB": "150126612", - "type": "CTX", - } - ) + gene_a = "TPM3" + gene_b = "PDGFRB" + chr_a = "1" + chr_b = "5" + pos_a = 154170465 + pos_b = 150126612 + sv_ort = ">" + event_type = "CTX" + cicero_fusor = await translator_instance.from_cicero( - cicero_data, Assembly.GRCH38.value + gene_a, + gene_b, + chr_a, + chr_b, + pos_a, + pos_b, + sv_ort, + event_type, + Assembly.GRCH38.value, ) assert cicero_fusor.structure == fusion_data_example.structure # Test non-exonic breakpoint - cicero_data_nonexonic = pl.DataFrame( - { - "geneA": "TPM3", - "geneB": "PDGFRB", - "chrA": "1", - "posA": "154173078", - "chrB": "5", - "posB": "150127173", - "type": "CTX", - } - ) + gene_a = "TPM3" + gene_b = "PDGFRB" + chr_a = "1" + chr_b = "5" + pos_a = 154173078 + pos_b = 150127173 + sv_ort = ">" + event_type = "CTX" + cicero_fusor_nonexonic = await translator_instance.from_cicero( - cicero_data_nonexonic, Assembly.GRCH38.value + gene_a, + gene_b, + chr_a, + chr_b, + pos_a, + pos_b, + sv_ort, + event_type, + Assembly.GRCH38.value, ) assert cicero_fusor_nonexonic.structure == fusion_data_example_nonexonic.structure + # Test case where the called fusion does not have confident biological meaning + gene_a = "TPM3" + gene_b = "PDGFRB" + chr_a = "1" + chr_b = "5" + pos_a = 154173078 + pos_b = 150127173 + sv_ort = "?" 
+ event_type = "CTX" + + non_confident_bio = await translator_instance.from_cicero( + gene_a, + gene_b, + chr_a, + chr_b, + pos_a, + pos_b, + sv_ort, + event_type, + Assembly.GRCH38.value, + ) + assert ( + non_confident_bio + == "CICERO annotation indicates that this event does not have confident biological meaning" + ) + + # Test case where multiple gene symbols are reported for a fusion partner + gene_a = "TPM3" + gene_b = "PDGFRB,PDGFRB-FGFR4,FGFR4" + chr_a = "1" + chr_b = "5" + pos_a = 154173078 + pos_b = 150127173 + sv_ort = "?" + event_type = "CTX" + + multiple_genes_fusion_partner = await translator_instance.from_cicero( + gene_a, + gene_b, + chr_a, + chr_b, + pos_a, + pos_b, + sv_ort, + event_type, + Assembly.GRCH38.value, + ) + assert ( + multiple_genes_fusion_partner + == "Ambiguous gene symbols are reported by CICERO for at least one of the fusion partners" + ) + @pytest.mark.asyncio() async def test_enfusion(