Skip to content

Commit

Permalink
feat!: change arriba fusion detection algorithm input parameters (#204)
Browse files Browse the repository at this point in the history
  • Loading branch information
jarbesfeld authored Nov 25, 2024
1 parent 9369a18 commit 1b067f0
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 37 deletions.
61 changes: 48 additions & 13 deletions src/fusor/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,53 +374,88 @@ async def from_fusion_map(
)

async def from_arriba(
    self,
    gene1: str,
    gene2: str,
    strand1: str,
    strand2: str,
    breakpoint1: str,
    breakpoint2: str,
    event: str,
    confidence: str,
    direction1: str,
    direction2: str,
    rf: str,
    rb: Assembly,
) -> AssayedFusion:
    """Parse Arriba output to create AssayedFusion object

    :param gene1: The 5' gene fusion partner
    :param gene2: The 3' gene fusion partner
    :param strand1: The strand information for the 5' gene fusion partner,
        as two ``/``-separated values; the value after the slash is taken as
        the transcribed strand
    :param strand2: The strand information for the 3' gene fusion partner,
        in the same format as ``strand1``
    :param breakpoint1: The chromosome and breakpoint for gene1, separated by
        a colon (e.g. ``"1:154170465"``)
    :param breakpoint2: The chromosome and breakpoint for gene2, separated by
        a colon (e.g. ``"5:150126612"``)
    :param event: An inference about the type of fusion event. A value
        containing ``read_through`` yields a ``read-through`` causative
        event; anything else is reported as a ``rearrangement``
    :param confidence: A metric describing the confidence of the fusion
        prediction; stored as the causative event description
    :param direction1: A description that indicates if the transcript segment
        starts or ends at breakpoint1 (``"upstream"`` or ``"downstream"``)
    :param direction2: A description that indicates if the transcript segment
        starts or ends at breakpoint2 (``"upstream"`` or ``"downstream"``)
    :param rf: A description if the reading frame is preserved for the fusion.
        ``"in-frame"`` maps to ``True``, ``"."`` maps to ``None`` and any
        other value maps to ``False``
    :param rb: The reference build used to call the fusion
    :return: An AssayedFusion object, if construction is successful
    :raise IndexError: if a strand value has no ``/`` or a breakpoint value
        has no ``:``
    :raise ValueError: if the position part of a breakpoint is not an integer
    """
    # Arriba reports two gene symbols if a breakpoint occurs in an intergenic
    # space. We select the gene symbol with the smallest distance from the
    # breakpoint.
    gene_5prime = self._get_gene_element(gene1, "arriba")[0].gene.label
    gene_3prime = self._get_gene_element(gene2, "arriba")[0].gene.label

    # Determine the strand that is transcribed (second "/"-separated value).
    transcribed_strand1 = strand1.split("/")[1]
    transcribed_strand2 = strand2.split("/")[1]

    # The breakpoint is treated as the start of the transcript segment when
    # the direction is "upstream" on the "-" strand, or "downstream"
    # otherwise; in every other case it is treated as the segment end.
    gene1_seg_start = direction1 == (
        "upstream" if transcribed_strand1 == "-" else "downstream"
    )
    gene2_seg_start = direction2 == (
        "upstream" if transcribed_strand2 == "-" else "downstream"
    )

    # Breakpoints are "<chromosome>:<position>".
    bp1_fields = breakpoint1.split(":")
    bp2_fields = breakpoint2.split(":")
    bp1_position = int(bp1_fields[1])
    bp2_position = int(bp2_fields[1])

    # Exactly one of seg_start_genomic / seg_end_genomic is supplied per
    # partner; the other is passed as None.
    tr_5prime = await self.fusor.transcript_segment_element(
        tx_to_genomic_coords=False,
        genomic_ac=self._get_genomic_ac(bp1_fields[0], rb),
        seg_start_genomic=bp1_position if gene1_seg_start else None,
        seg_end_genomic=None if gene1_seg_start else bp1_position,
        gene=gene_5prime,
        get_nearest_transcript_junction=True,
    )

    tr_3prime = await self.fusor.transcript_segment_element(
        tx_to_genomic_coords=False,
        genomic_ac=self._get_genomic_ac(bp2_fields[0], rb),
        seg_start_genomic=bp2_position if gene2_seg_start else None,
        seg_end_genomic=None if gene2_seg_start else bp2_position,
        gene=gene_3prime,
        get_nearest_transcript_junction=True,
    )

    event_type = "read-through" if "read_through" in event else "rearrangement"
    ce = CausativeEvent(
        eventType=EventType(event_type),
        eventDescription=confidence,
    )

    # "." means Arriba made no reading-frame call, so report None rather
    # than False.
    reading_frame_preserved = None if rf == "." else rf == "in-frame"
    return self._format_fusion(
        gene_5prime, gene_3prime, tr_5prime, tr_3prime, ce, reading_frame_preserved
    )
Expand Down
72 changes: 48 additions & 24 deletions tests/test_translators.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,36 +304,60 @@ async def test_arriba(
):
"""Test Arriba translator"""
# Test exonic breakpoint
arriba_data = pl.DataFrame(
{
"#gene1": "TPM3",
"gene2": "PDGFRB",
"breakpoint1": "1:154170465",
"breakpoint2": "5:150126612",
"type": ".",
"confidence": "high",
"reading_frame": "in-frame",
}
)
gene1 = "TPM3"
gene2 = "PDGFRB"
strand1 = "-/-"
strand2 = "-/-"
breakpoint1 = "1:154170465"
breakpoint2 = "5:150126612"
event = "translocation"
confidence = "high"
direction1 = "dowstream"
direction2 = "upstream"
rf = "in-frame"

arriba_fusor = await translator_instance.from_arriba(
arriba_data, Assembly.GRCH38.value
gene1,
gene2,
strand1,
strand2,
breakpoint1,
breakpoint2,
event,
confidence,
direction1,
direction2,
rf,
Assembly.GRCH38.value,
)
assert arriba_fusor.structure == fusion_data_example.structure

# Test non-exonic breakpoint
arriba_data_nonexonic = pl.DataFrame(
{
"#gene1": "TPM3",
"gene2": "PDGFRB",
"breakpoint1": "1:154173078",
"breakpoint2": "5:150127173",
"type": ".",
"confidence": "high",
"reading_frame": "in-frame",
}
)
gene1 = "TPM3"
gene2 = "PDGFRB"
strand1 = "-/-"
strand2 = "-/-"
breakpoint1 = "1:154173078"
breakpoint2 = "5:150127173"
event = "translocation"
confidence = "high"
direction1 = "dowstream"
direction2 = "upstream"
rf = "in-frame"

arriba_fusor_nonexonic = await translator_instance.from_arriba(
arriba_data_nonexonic, Assembly.GRCH38.value
gene1,
gene2,
strand1,
strand2,
breakpoint1,
breakpoint2,
event,
confidence,
direction1,
direction2,
rf,
Assembly.GRCH38.value,
)
assert arriba_fusor_nonexonic.structure == fusion_data_example_nonexonic.structure

Expand Down

0 comments on commit 1b067f0

Please sign in to comment.