feat!: change arriba fusion detection algorithm input parameters (#204)

cancervariants · Nov 25, 2024 · 1b067f0 · 1b067f0
1 parent 9369a18
commit 1b067f0
Show file tree

Hide file tree

Showing 2 changed files with 96 additions and 37 deletions.
diff --git a/src/fusor/translator.py b/src/fusor/translator.py
@@ -374,53 +374,88 @@ async def from_fusion_map(
         )
 
     async def from_arriba(
-        self, arriba_row: pl.DataFrame, rb: Assembly
+        self,
+        gene1: str,
+        gene2: str,
+        strand1: str,
+        strand2: str,
+        breakpoint1: str,
+        breakpoint2: str,
+        event: str,
+        confidence: str,
+        direction1: str,
+        direction2: str,
+        rf: str,
+        rb: Assembly,
     ) -> AssayedFusion:
         """Parse Arriba output to create AssayedFusion object
 
-        :param arriba_row: A row of Arriba output
+        :param gene1: The 5' gene fusion partner
+        :param gene2: The 3' gene fusion partner
+        :param strand1: The strand information for the 5' gene fusion partner
+        :param strand2: The strand information for the 3' gene fusion partner
+        :param breakpoint1: The chromosome and breakpoint for gene1
+        :param breakpoint2: The chromosome and breakpoint for gene2
+        :param event: An inference about the type of fusion event
+        :param confidence: A metric describing the confidence of the fusion prediction
+        :param direction1: A description that indicates if the transcript segment
+            starts or ends at breakpoint1
+        :param direction2: A description that indicates if the transcript segment
+            starts or ends at breakpoint2
+        :param rf: Describes whether the reading frame is preserved for the fusion
+        :param rb: The reference build used to call the fusion
         :return: An AssayedFusion object, if construction is successful
         """
-        gene1 = arriba_row.get_column("#gene1").item()
-        gene2 = arriba_row.get_column("gene2").item()
-
         # Arriba reports two gene symbols if a breakpoint occurs in an intergenic
         # space. We select the gene symbol with the smallest distance from the
         # breakpoint.
         gene_5prime = self._get_gene_element(gene1, "arriba")[0].gene.label
         gene_3prime = self._get_gene_element(gene2, "arriba")[0].gene.label
 
-        breakpoint1 = arriba_row.get_column("breakpoint1").item().split(":")
-        breakpoint2 = arriba_row.get_column("breakpoint2").item().split(":")
+        strand1 = strand1.split("/")[1]  # Determine strand that is transcribed
+        strand2 = strand2.split("/")[1]  # Determine strand that is transcribed
+        if strand1 == "-":
+            gene1_seg_start = direction1 == "upstream"
+        else:
+            gene1_seg_start = direction1 == "downstream"
+        if strand2 == "-":
+            gene2_seg_start = direction2 == "upstream"
+        else:
+            gene2_seg_start = direction2 == "downstream"
+
+        breakpoint1 = breakpoint1.split(":")
+        breakpoint2 = breakpoint2.split(":")
 
         tr_5prime = await self.fusor.transcript_segment_element(
             tx_to_genomic_coords=False,
             genomic_ac=self._get_genomic_ac(breakpoint1[0], rb),
-            seg_end_genomic=int(breakpoint1[1]),
+            seg_start_genomic=int(breakpoint1[1]) if gene1_seg_start else None,
+            seg_end_genomic=int(breakpoint1[1]) if not gene1_seg_start else None,
             gene=gene_5prime,
             get_nearest_transcript_junction=True,
         )
 
         tr_3prime = await self.fusor.transcript_segment_element(
             tx_to_genomic_coords=False,
             genomic_ac=self._get_genomic_ac(breakpoint2[0], rb),
-            seg_start_genomic=int(breakpoint2[1]),
+            seg_start_genomic=int(breakpoint2[1]) if gene2_seg_start else None,
+            seg_end_genomic=int(breakpoint2[1]) if not gene2_seg_start else None,
             gene=gene_3prime,
             get_nearest_transcript_junction=True,
         )
 
         ce = (
             CausativeEvent(
                 eventType=EventType("read-through"),
-                eventDescription=arriba_row.get_column("confidence").item(),
+                eventDescription=confidence,
             )
-            if "read_through" in arriba_row["type"]
+            if "read_through" in event
             else CausativeEvent(
                 eventType=EventType("rearrangement"),
-                eventDescription=arriba_row.get_column("confidence").item(),
+                eventDescription=confidence,
             )
         )
-        rf = bool(arriba_row.get_column("reading_frame").item() == "in-frame")
+        rf = bool(rf == "in-frame") if rf != "." else None
         return self._format_fusion(
             gene_5prime, gene_3prime, tr_5prime, tr_3prime, ce, rf
         )

diff --git a/tests/test_translators.py b/tests/test_translators.py
@@ -304,36 +304,60 @@ async def test_arriba(
 ):
     """Test Arriba translator"""
     # Test exonic breakpoint
-    arriba_data = pl.DataFrame(
-        {
-            "#gene1": "TPM3",
-            "gene2": "PDGFRB",
-            "breakpoint1": "1:154170465",
-            "breakpoint2": "5:150126612",
-            "type": ".",
-            "confidence": "high",
-            "reading_frame": "in-frame",
-        }
-    )
+    gene1 = "TPM3"
+    gene2 = "PDGFRB"
+    strand1 = "-/-"
+    strand2 = "-/-"
+    breakpoint1 = "1:154170465"
+    breakpoint2 = "5:150126612"
+    event = "translocation"
+    confidence = "high"
+    direction1 = "downstream"
+    direction2 = "upstream"
+    rf = "in-frame"
+
     arriba_fusor = await translator_instance.from_arriba(
-        arriba_data, Assembly.GRCH38.value
+        gene1,
+        gene2,
+        strand1,
+        strand2,
+        breakpoint1,
+        breakpoint2,
+        event,
+        confidence,
+        direction1,
+        direction2,
+        rf,
+        Assembly.GRCH38.value,
     )
     assert arriba_fusor.structure == fusion_data_example.structure
 
     # Test non-exonic breakpoint
-    arriba_data_nonexonic = pl.DataFrame(
-        {
-            "#gene1": "TPM3",
-            "gene2": "PDGFRB",
-            "breakpoint1": "1:154173078",
-            "breakpoint2": "5:150127173",
-            "type": ".",
-            "confidence": "high",
-            "reading_frame": "in-frame",
-        }
-    )
+    gene1 = "TPM3"
+    gene2 = "PDGFRB"
+    strand1 = "-/-"
+    strand2 = "-/-"
+    breakpoint1 = "1:154173078"
+    breakpoint2 = "5:150127173"
+    event = "translocation"
+    confidence = "high"
+    direction1 = "downstream"
+    direction2 = "upstream"
+    rf = "in-frame"
+
     arriba_fusor_nonexonic = await translator_instance.from_arriba(
-        arriba_data_nonexonic, Assembly.GRCH38.value
+        gene1,
+        gene2,
+        strand1,
+        strand2,
+        breakpoint1,
+        breakpoint2,
+        event,
+        confidence,
+        direction1,
+        direction2,
+        rf,
+        Assembly.GRCH38.value,
     )
     assert arriba_fusor_nonexonic.structure == fusion_data_example_nonexonic.structure