From 3353ce0c7a975cbce6576378a40a832a2b2dbde8 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Mon, 25 Nov 2024 10:23:45 -0500 Subject: [PATCH 1/5] Reformat variable names and add filtering checks --- src/fusor/translator.py | 59 +++++++++++++++----- tests/test_translators.py | 112 ++++++++++++++++++++++++++++++-------- 2 files changed, 133 insertions(+), 38 deletions(-) diff --git a/src/fusor/translator.py b/src/fusor/translator.py index 1addad2..e0a0c89 100644 --- a/src/fusor/translator.py +++ b/src/fusor/translator.py @@ -461,44 +461,75 @@ async def from_arriba( ) async def from_cicero( - self, cicero_row: pl.DataFrame, rb: Assembly - ) -> AssayedFusion: + self, + gene_a: str, + gene_b: str, + chr_a: str, + chr_b: str, + pos_a: int, + pos_b: int, + sv_ort: str, + event_type: str, + rb: Assembly, + ) -> AssayedFusion | str: """Parse CICERO output to create AssayedFusion object - :param cicero_row: A row of CICERO output + :param geneA: The gene symbol for the 5' partner + :param geneB: The gene symbol for the 3' partner + :param chrA: The chromosome for the 5' partner + :param chrB: The chromosome for the 3' partner + :param posA: The genomic breakpoint for the 5' partner + :param posB: The genomic breakpoint for the 3' partner + :param sv_ort: Whether the mapping orientation of assembled contig has + confident biological meaning + :param event_type: The structural variation event that created the called fusion :param rb: The reference build used to call the fusion :return: An AssayedFusion object, if construction is successful """ - gene1 = cicero_row.get_column("geneA").item() - gene2 = cicero_row.get_column("geneB").item() - gene_5prime = self._get_gene_element(gene1, "cicero")[0].gene.label - gene_3prime = self._get_gene_element(gene2, "cicero")[0].gene.label + # Check if gene symbols have valid formatting. CICERO can output two or more + # gene symbols for `gene_a` or `gene_b`, which are separated by a column. As + # there is not a precise way to resolve this ambiguity, we do not process + # these events + if "," in gene_a or "," in gene_b: + msg = "Ambiguous gene symbols are reported by CICERO for at least one of the fusion partners" + _logger.warning(msg) + return msg + + # Check CICERO annotation regarding the confidence that the called fusion + # has biological meaning + if sv_ort != ">": + msg = "CICERO annotation indicates that this event does not have confident biological meaning" + _logger.warning(msg) + return msg + + gene_5prime = self._get_gene_element(gene_a, "cicero")[0].gene.label + gene_3prime = self._get_gene_element(gene_b, "cicero")[0].gene.label tr_5prime = await self.fusor.transcript_segment_element( tx_to_genomic_coords=False, - genomic_ac=self._get_genomic_ac(cicero_row.get_column("chrA").item(), rb), - seg_end_genomic=int(cicero_row.get_column("posA").item()), + genomic_ac=self._get_genomic_ac(chr_a, rb), + seg_end_genomic=pos_a, gene=gene_5prime, get_nearest_transcript_junction=True, ) tr_3prime = await self.fusor.transcript_segment_element( tx_to_genomic_coords=False, - genomic_ac=self._get_genomic_ac(cicero_row.get_column("chrB").item(), rb), - seg_start_genomic=int(cicero_row.get_column("posB").item()), + genomic_ac=self._get_genomic_ac(chr_b, rb), + seg_start_genomic=pos_b, gene=gene_3prime, get_nearest_transcript_junction=True, ) - if cicero_row.get_column("type").item() == "read_through": + if event_type == "read_through": ce = CausativeEvent( eventType=EventType("read-through"), - eventDescription=cicero_row.get_column("type").item(), + eventDescription=event_type, ) else: ce = CausativeEvent( eventType=EventType("rearrangement"), - eventDescription=cicero_row.get_column("type").item(), + eventDescription=event_type, ) return self._format_fusion( gene_5prime, diff --git a/tests/test_translators.py b/tests/test_translators.py index 842e31f..d7e5cee 100644 --- a/tests/test_translators.py +++ b/tests/test_translators.py @@ -368,39 +368,103 @@ async def test_cicero( ): """Test CICERO translator""" # Test exonic breakpoint - cicero_data = pl.DataFrame( - { - "geneA": "TPM3", - "geneB": "PDGFRB", - "chrA": "1", - "posA": "154170465", - "chrB": "5", - "posB": "150126612", - "type": "CTX", - } - ) + gene_a = "TPM3" + gene_b = "PDGFRB" + chr_a = "1" + chr_b = "5" + pos_a = 154170465 + pos_b = 150126612 + sv_ort = ">" + event_type = "CTX" + cicero_fusor = await translator_instance.from_cicero( - cicero_data, Assembly.GRCH38.value + gene_a, + gene_b, + chr_a, + chr_b, + pos_a, + pos_b, + sv_ort, + event_type, + Assembly.GRCH38.value, ) assert cicero_fusor.structure == fusion_data_example.structure # Test non-exonic breakpoint - cicero_data_nonexonic = pl.DataFrame( - { - "geneA": "TPM3", - "geneB": "PDGFRB", - "chrA": "1", - "posA": "154173078", - "chrB": "5", - "posB": "150127173", - "type": "CTX", - } - ) + gene_a = "TPM3" + gene_b = "PDGFRB" + chr_a = "1" + chr_b = "5" + pos_a = 154173078 + pos_b = 150127173 + sv_ort = ">" + event_type = "CTX" + cicero_fusor_nonexonic = await translator_instance.from_cicero( - cicero_data_nonexonic, Assembly.GRCH38.value + gene_a, + gene_b, + chr_a, + chr_b, + pos_a, + pos_b, + sv_ort, + event_type, + Assembly.GRCH38.value, ) assert cicero_fusor_nonexonic.structure == fusion_data_example_nonexonic.structure + # Test case where the called fusion does not have confident biological meaning + gene_a = "TPM3" + gene_b = "PDGFRB" + chr_a = "1" + chr_b = "5" + pos_a = 154173078 + pos_b = 150127173 + sv_ort = "?" + event_type = "CTX" + + non_confident_bio = await translator_instance.from_cicero( + gene_a, + gene_b, + chr_a, + chr_b, + pos_a, + pos_b, + sv_ort, + event_type, + Assembly.GRCH38.value, + ) + assert ( + non_confident_bio + == "CICERO annotation indicates that this event does not have confident biological meaning" + ) + + # Test case where multiple gene symbols are reported for a fusion partner + gene_a = "TPM3" + gene_b = "PDGFRB,PDGFRB-FGFR4,FGFR4" + chr_a = "1" + chr_b = "5" + pos_a = 154173078 + pos_b = 150127173 + sv_ort = "?" + event_type = "CTX" + + multiple_genes_fusion_partner = await translator_instance.from_cicero( + gene_a, + gene_b, + chr_a, + chr_b, + pos_a, + pos_b, + sv_ort, + event_type, + Assembly.GRCH38.value, + ) + assert ( + multiple_genes_fusion_partner + == "Ambiguous gene symbols are reported by CICERO for at least one of the fusion partners" + ) + @pytest.mark.asyncio() async def test_enfusion( From 8a403ac587e851cf617c3fd5d71ea7347d5d7731 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Mon, 25 Nov 2024 11:04:28 -0500 Subject: [PATCH 2/5] Update variable names and docstrings --- src/fusor/translator.py | 38 ++++++++-------- tests/test_translators.py | 96 +++++++++++++++++++-------------------- 2 files changed, 67 insertions(+), 67 deletions(-) diff --git a/src/fusor/translator.py b/src/fusor/translator.py index e0a0c89..4be7d60 100644 --- a/src/fusor/translator.py +++ b/src/fusor/translator.py @@ -462,24 +462,24 @@ async def from_arriba( async def from_cicero( self, - gene_a: str, - gene_b: str, - chr_a: str, - chr_b: str, - pos_a: int, - pos_b: int, + gene_5prime: str, + gene_3prime: str, + chr_5prime: str, + chr_3prime: str, + pos_5prime: int, + pos_3prime: int, sv_ort: str, event_type: str, rb: Assembly, ) -> AssayedFusion | str: """Parse CICERO output to create AssayedFusion object - :param geneA: The gene symbol for the 5' partner - :param geneB: The gene symbol for the 3' partner - :param chrA: The chromosome for the 5' partner - :param chrB: The chromosome for the 3' partner - :param posA: The genomic breakpoint for the 5' partner - :param posB: The genomic breakpoint for the 3' partner + :param gene_5prime: The gene symbol for the 5' partner + :param gene_3prime: The gene symbol for the 3' partner + :param chr_5prime: The chromosome for the 5' partner + :param chr_3prime: The chromosome for the 3' partner + :param pos_5prime: The genomic breakpoint for the 5' partner + :param pos_3prime: The genomic breakpoint for the 3' partner :param sv_ort: Whether the mapping orientation of assembled contig has confident biological meaning :param event_type: The structural variation event that created the called fusion @@ -490,7 +490,7 @@ async def from_cicero( # gene symbols for `gene_a` or `gene_b`, which are separated by a column. As # there is not a precise way to resolve this ambiguity, we do not process # these events - if "," in gene_a or "," in gene_b: + if "," in gene_5prime or "," in gene_3prime: msg = "Ambiguous gene symbols are reported by CICERO for at least one of the fusion partners" _logger.warning(msg) return msg @@ -502,21 +502,21 @@ async def from_cicero( _logger.warning(msg) return msg - gene_5prime = self._get_gene_element(gene_a, "cicero")[0].gene.label - gene_3prime = self._get_gene_element(gene_b, "cicero")[0].gene.label + gene_5prime = self._get_gene_element(gene_5prime, "cicero")[0].gene.label + gene_3prime = self._get_gene_element(gene_3prime, "cicero")[0].gene.label tr_5prime = await self.fusor.transcript_segment_element( tx_to_genomic_coords=False, - genomic_ac=self._get_genomic_ac(chr_a, rb), - seg_end_genomic=pos_a, + genomic_ac=self._get_genomic_ac(chr_5prime, rb), + seg_end_genomic=pos_5prime, gene=gene_5prime, get_nearest_transcript_junction=True, ) tr_3prime = await self.fusor.transcript_segment_element( tx_to_genomic_coords=False, - genomic_ac=self._get_genomic_ac(chr_b, rb), - seg_start_genomic=pos_b, + genomic_ac=self._get_genomic_ac(chr_3prime, rb), + seg_start_genomic=pos_3prime, gene=gene_3prime, get_nearest_transcript_junction=True, ) diff --git a/tests/test_translators.py b/tests/test_translators.py index d7e5cee..604db87 100644 --- a/tests/test_translators.py +++ b/tests/test_translators.py @@ -368,22 +368,22 @@ async def test_cicero( ): """Test CICERO translator""" # Test exonic breakpoint - gene_a = "TPM3" - gene_b = "PDGFRB" - chr_a = "1" - chr_b = "5" - pos_a = 154170465 - pos_b = 150126612 + gene_5prime = "TPM3" + gene_3prime = "PDGFRB" + chr_5prime = "1" + chr_3prime = "5" + pos_5prime = 154170465 + pos_3prime = 150126612 sv_ort = ">" event_type = "CTX" cicero_fusor = await translator_instance.from_cicero( - gene_a, - gene_b, - chr_a, - chr_b, - pos_a, - pos_b, + gene_5prime, + gene_3prime, + chr_5prime, + chr_3prime, + pos_5prime, + pos_3prime, sv_ort, event_type, Assembly.GRCH38.value, @@ -391,22 +391,22 @@ async def test_cicero( assert cicero_fusor.structure == fusion_data_example.structure # Test non-exonic breakpoint - gene_a = "TPM3" - gene_b = "PDGFRB" - chr_a = "1" - chr_b = "5" - pos_a = 154173078 - pos_b = 150127173 + gene_5prime = "TPM3" + gene_3prime = "PDGFRB" + chr_5prime = "1" + chr_3prime = "5" + pos_5prime = 154173078 + pos_3prime = 150127173 sv_ort = ">" event_type = "CTX" cicero_fusor_nonexonic = await translator_instance.from_cicero( - gene_a, - gene_b, - chr_a, - chr_b, - pos_a, - pos_b, + gene_5prime, + gene_3prime, + chr_5prime, + chr_3prime, + pos_5prime, + pos_3prime, sv_ort, event_type, Assembly.GRCH38.value, @@ -414,22 +414,22 @@ async def test_cicero( assert cicero_fusor_nonexonic.structure == fusion_data_example_nonexonic.structure # Test case where the called fusion does not have confident biological meaning - gene_a = "TPM3" - gene_b = "PDGFRB" - chr_a = "1" - chr_b = "5" - pos_a = 154173078 - pos_b = 150127173 + gene_5prime = "TPM3" + gene_3prime = "PDGFRB" + chr_5prime = "1" + chr_3prime = "5" + pos_5prime = 154173078 + pos_3prime = 150127173 sv_ort = "?" event_type = "CTX" non_confident_bio = await translator_instance.from_cicero( - gene_a, - gene_b, - chr_a, - chr_b, - pos_a, - pos_b, + gene_5prime, + gene_3prime, + chr_5prime, + chr_3prime, + pos_5prime, + pos_3prime, sv_ort, event_type, Assembly.GRCH38.value, @@ -440,22 +440,22 @@ async def test_cicero( ) # Test case where multiple gene symbols are reported for a fusion partner - gene_a = "TPM3" - gene_b = "PDGFRB,PDGFRB-FGFR4,FGFR4" - chr_a = "1" - chr_b = "5" - pos_a = 154173078 - pos_b = 150127173 + gene_5prime = "TPM3" + gene_3prime = "PDGFRB,PDGFRB-FGFR4,FGFR4" + chr_5prime = "1" + chr_3prime = "5" + pos_5prime = 154173078 + pos_3prime = 150127173 sv_ort = "?" event_type = "CTX" multiple_genes_fusion_partner = await translator_instance.from_cicero( - gene_a, - gene_b, - chr_a, - chr_b, - pos_a, - pos_b, + gene_5prime, + gene_3prime, + chr_5prime, + chr_3prime, + pos_5prime, + pos_3prime, sv_ort, event_type, Assembly.GRCH38.value, From 84bb78a72759462e7534c570e545b88fdafa6d76 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Mon, 25 Nov 2024 11:10:43 -0500 Subject: [PATCH 3/5] Update name references --- src/fusor/translator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fusor/translator.py b/src/fusor/translator.py index 4be7d60..4ecf10a 100644 --- a/src/fusor/translator.py +++ b/src/fusor/translator.py @@ -487,7 +487,7 @@ async def from_cicero( :return: An AssayedFusion object, if construction is successful """ # Check if gene symbols have valid formatting. CICERO can output two or more - # gene symbols for `gene_a` or `gene_b`, which are separated by a column. As + # gene symbols for `gene_5prime` or `gene_3prime`, which are separated by a column. As # there is not a precise way to resolve this ambiguity, we do not process # these events if "," in gene_5prime or "," in gene_3prime: From 8e4c9bd3e0c24ccfbe9200187ba78777326c92da Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Mon, 25 Nov 2024 11:11:55 -0500 Subject: [PATCH 4/5] Fix typo with comma --- src/fusor/translator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/fusor/translator.py b/src/fusor/translator.py index 4ecf10a..c1b297e 100644 --- a/src/fusor/translator.py +++ b/src/fusor/translator.py @@ -487,7 +487,7 @@ async def from_cicero( :return: An AssayedFusion object, if construction is successful """ # Check if gene symbols have valid formatting. CICERO can output two or more - # gene symbols for `gene_5prime` or `gene_3prime`, which are separated by a column. As + # gene symbols for `gene_5prime` or `gene_3prime`, which are separated by a comma. As # there is not a precise way to resolve this ambiguity, we do not process # these events if "," in gene_5prime or "," in gene_3prime: From d29bfc730c3c050b20f89e6952d8454ce895f8d0 Mon Sep 17 00:00:00 2001 From: Jeremy Arbesfeld Date: Mon, 25 Nov 2024 11:53:47 -0500 Subject: [PATCH 5/5] Include description of structural variation --- src/fusor/translator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fusor/translator.py b/src/fusor/translator.py index c1b297e..4e49f18 100644 --- a/src/fusor/translator.py +++ b/src/fusor/translator.py @@ -480,8 +480,8 @@ async def from_cicero( :param chr_3prime: The chromosome for the 3' partner :param pos_5prime: The genomic breakpoint for the 5' partner :param pos_3prime: The genomic breakpoint for the 3' partner - :param sv_ort: Whether the mapping orientation of assembled contig has - confident biological meaning + :param sv_ort: Whether the mapping orientation of assembled contig (driven by + structural variation) has confident biological meaning :param event_type: The structural variation event that created the called fusion :param rb: The reference build used to call the fusion :return: An AssayedFusion object, if construction is successful