Skip to content

Commit

Permalink
fix(coloc): handle cases when the bayes factors are null (opentargets…
Browse files Browse the repository at this point in the history
…#556)

* fix(coloc): fillna doesn't fill nested data

* test(coloc): added test_coloc_no_logbf (semantic)

* revert(ecaviar): revert accidental changes
  • Loading branch information
ireneisdoomed authored Mar 22, 2024
1 parent 232b1e0 commit 8f9d268
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 9 deletions.
18 changes: 9 additions & 9 deletions src/gentropy/method/colocalisation.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,24 +159,24 @@ def colocalise(
posteriors = f.udf(Coloc._get_posteriors, VectorUDT())
return Colocalisation(
_df=(
overlapping_signals.df
overlapping_signals.df.select("*", "statistics.*")
# Before summing log_BF columns nulls need to be filled with 0:
.fillna(0, subset=["statistics.left_logBF", "statistics.right_logBF"])
.fillna(0, subset=["left_logBF", "right_logBF"])
# Sum of log_BFs for each pair of signals
.withColumn(
"sum_log_bf",
f.col("statistics.left_logBF") + f.col("statistics.right_logBF"),
f.col("left_logBF") + f.col("right_logBF"),
)
# Group by overlapping peak and generating dense vectors of log_BF:
.groupBy("chromosome", "leftStudyLocusId", "rightStudyLocusId")
.agg(
f.count("*").alias("numberColocalisingVariants"),
fml.array_to_vector(
f.collect_list(f.col("statistics.left_logBF"))
).alias("left_logBF"),
fml.array_to_vector(
f.collect_list(f.col("statistics.right_logBF"))
).alias("right_logBF"),
fml.array_to_vector(f.collect_list(f.col("left_logBF"))).alias(
"left_logBF"
),
fml.array_to_vector(f.collect_list(f.col("right_logBF"))).alias(
"right_logBF"
),
fml.array_to_vector(f.collect_list(f.col("sum_log_bf"))).alias(
"sum_log_bf"
),
Expand Down
52 changes: 52 additions & 0 deletions tests/gentropy/method/test_colocalisation_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from gentropy.method.colocalisation import Coloc, ECaviar
from pandas.testing import assert_frame_equal
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType, LongType, StringType, StructField, StructType


def test_coloc(mock_study_locus_overlap: StudyLocusOverlap) -> None:
Expand Down Expand Up @@ -104,6 +105,57 @@ def test_coloc_semantic(
)


def test_coloc_no_logbf(
    spark: SparkSession,
    minimum_expected_h0: float = 0.99,
    maximum_expected_h4: float = 1e-5,
) -> None:
    """Check COLOC posteriors for an overlap whose logBF statistics are null.

    A single overlapping tag variant is built with both ``left_logBF`` and
    ``right_logBF`` set to ``None``. The resulting ``h0`` must exceed
    ``minimum_expected_h0`` and ``h4`` must stay below ``maximum_expected_h4``.
    """
    # Both Bayes factor fields are nullable doubles nested under `statistics`.
    statistics_schema = StructType(
        [
            StructField("left_logBF", DoubleType(), True),
            StructField("right_logBF", DoubleType(), True),
        ]
    )
    overlap_schema = StructType(
        [
            StructField("leftStudyLocusId", LongType(), False),
            StructField("rightStudyLocusId", LongType(), False),
            StructField("chromosome", StringType(), False),
            StructField("tagVariantId", StringType(), False),
            StructField("statistics", statistics_schema),
        ]
    )
    # One overlapping variant with no logBF on either side.
    overlap_rows = [
        {
            "leftStudyLocusId": 1,
            "rightStudyLocusId": 2,
            "chromosome": "1",
            "tagVariantId": "snp",
            "statistics": {
                "left_logBF": None,
                "right_logBF": None,
            },
        }
    ]
    observed_overlap = StudyLocusOverlap(
        spark.createDataFrame(overlap_rows, schema=overlap_schema),
        StudyLocusOverlap.get_schema(),
    )
    observed_coloc_df = Coloc.colocalise(observed_overlap).df

    observed_h0 = observed_coloc_df.select("h0").collect()[0]["h0"]
    assert (
        observed_h0 > minimum_expected_h0
    ), "COLOC should return a high h0 (no association) when the input data has irrelevant logBF."

    observed_h4 = observed_coloc_df.select("h4").collect()[0]["h4"]
    assert (
        observed_h4 < maximum_expected_h4
    ), "COLOC should return a low h4 (traits are associated) when the input data has irrelevant logBF."


def test_ecaviar(mock_study_locus_overlap: StudyLocusOverlap) -> None:
    """Check that eCAVIAR colocalisation returns a Colocalisation instance."""
    ecaviar_result = ECaviar.colocalise(mock_study_locus_overlap)
    assert isinstance(ecaviar_result, Colocalisation)

0 comments on commit 8f9d268

Please sign in to comment.