fix(coloc): handle cases when the bayes factors are null (opentargets…

…#556) * fix(coloc): fillna doesn't fill nested data * test(coloc): added test_coloc_no_logbf (semantic) * revert(ecaviar): revert accidental changes
thehyve · Mar 22, 2024 · 8f9d268 · 8f9d268
1 parent 232b1e0
commit 8f9d268
Show file tree

Hide file tree

Showing 2 changed files with 61 additions and 9 deletions.
diff --git a/src/gentropy/method/colocalisation.py b/src/gentropy/method/colocalisation.py
@@ -159,24 +159,24 @@ def colocalise(
         posteriors = f.udf(Coloc._get_posteriors, VectorUDT())
         return Colocalisation(
             _df=(
-                overlapping_signals.df
+                overlapping_signals.df.select("*", "statistics.*")
                 # Before summing log_BF columns nulls need to be filled with 0:
-                .fillna(0, subset=["statistics.left_logBF", "statistics.right_logBF"])
+                .fillna(0, subset=["left_logBF", "right_logBF"])
                 # Sum of log_BFs for each pair of signals
                 .withColumn(
                     "sum_log_bf",
-                    f.col("statistics.left_logBF") + f.col("statistics.right_logBF"),
+                    f.col("left_logBF") + f.col("right_logBF"),
                 )
                 # Group by overlapping peak and generating dense vectors of log_BF:
                 .groupBy("chromosome", "leftStudyLocusId", "rightStudyLocusId")
                 .agg(
                     f.count("*").alias("numberColocalisingVariants"),
-                    fml.array_to_vector(
-                        f.collect_list(f.col("statistics.left_logBF"))
-                    ).alias("left_logBF"),
-                    fml.array_to_vector(
-                        f.collect_list(f.col("statistics.right_logBF"))
-                    ).alias("right_logBF"),
+                    fml.array_to_vector(f.collect_list(f.col("left_logBF"))).alias(
+                        "left_logBF"
+                    ),
+                    fml.array_to_vector(f.collect_list(f.col("right_logBF"))).alias(
+                        "right_logBF"
+                    ),
                     fml.array_to_vector(f.collect_list(f.col("sum_log_bf"))).alias(
                         "sum_log_bf"
                     ),

diff --git a/tests/gentropy/method/test_colocalisation_method.py b/tests/gentropy/method/test_colocalisation_method.py
@@ -10,6 +10,7 @@
 from gentropy.method.colocalisation import Coloc, ECaviar
 from pandas.testing import assert_frame_equal
 from pyspark.sql import SparkSession
+from pyspark.sql.types import DoubleType, LongType, StringType, StructField, StructType
 
 
 def test_coloc(mock_study_locus_overlap: StudyLocusOverlap) -> None:
@@ -104,6 +105,57 @@ def test_coloc_semantic(
     )
 
 
+def test_coloc_no_logbf(
+    spark: SparkSession,
+    minimum_expected_h0: float = 0.99,
+    maximum_expected_h4: float = 1e-5,
+) -> None:
+    """Test that COLOC yields h0 above and h4 below the given thresholds when both logBFs are null."""
+    observed_overlap = StudyLocusOverlap(
+        (
+            spark.createDataFrame(
+                [
+                    {
+                        "leftStudyLocusId": 1,
+                        "rightStudyLocusId": 2,
+                        "chromosome": "1",
+                        "tagVariantId": "snp",
+                        "statistics": {
+                            "left_logBF": None,
+                            "right_logBF": None,
+                        },  # both logBFs are null: this is the case under test
+                    }
+                ],
+                schema=StructType(
+                    [
+                        StructField("leftStudyLocusId", LongType(), False),
+                        StructField("rightStudyLocusId", LongType(), False),
+                        StructField("chromosome", StringType(), False),
+                        StructField("tagVariantId", StringType(), False),
+                        StructField(
+                            "statistics",
+                            StructType(
+                                [
+                                    StructField("left_logBF", DoubleType(), True),
+                                    StructField("right_logBF", DoubleType(), True),
+                                ]
+                            ),
+                        ),
+                    ]
+                ),
+            )
+        ),
+        StudyLocusOverlap.get_schema(),
+    )
+    observed_coloc_df = Coloc.colocalise(observed_overlap).df
+    assert (
+        observed_coloc_df.select("h0").collect()[0]["h0"] > minimum_expected_h0
+    ), "COLOC should return a high h0 (no association) when the input data has irrelevant logBF."
+    assert (
+        observed_coloc_df.select("h4").collect()[0]["h4"] < maximum_expected_h4
+    ), "COLOC should return a low h4 (traits are associated) when the input data has irrelevant logBF."
+
+
 def test_ecaviar(mock_study_locus_overlap: StudyLocusOverlap) -> None:
     """Test eCAVIAR."""
     assert isinstance(ECaviar.colocalise(mock_study_locus_overlap), Colocalisation)