From 8f9d2680edb25545f71d9df518989f6e817ac233 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Irene=20L=C3=B3pez?= <45119610+ireneisdoomed@users.noreply.github.com> Date: Fri, 22 Mar 2024 18:01:05 +0000 Subject: [PATCH] fix(coloc): handle cases when the bayes factors are null (#556) * fix(coloc): fillna doesnt fill nested data * test(coloc): added test_coloc_no_logbf (semantic) * revert(ecaviar): revert accidental changes --- src/gentropy/method/colocalisation.py | 18 +++---- .../method/test_colocalisation_method.py | 52 +++++++++++++++++++ 2 files changed, 61 insertions(+), 9 deletions(-) diff --git a/src/gentropy/method/colocalisation.py b/src/gentropy/method/colocalisation.py index eaf5b9de6..18d97fdf8 100644 --- a/src/gentropy/method/colocalisation.py +++ b/src/gentropy/method/colocalisation.py @@ -159,24 +159,24 @@ def colocalise( posteriors = f.udf(Coloc._get_posteriors, VectorUDT()) return Colocalisation( _df=( - overlapping_signals.df + overlapping_signals.df.select("*", "statistics.*") # Before summing log_BF columns nulls need to be filled with 0: - .fillna(0, subset=["statistics.left_logBF", "statistics.right_logBF"]) + .fillna(0, subset=["left_logBF", "right_logBF"]) # Sum of log_BFs for each pair of signals .withColumn( "sum_log_bf", - f.col("statistics.left_logBF") + f.col("statistics.right_logBF"), + f.col("left_logBF") + f.col("right_logBF"), ) # Group by overlapping peak and generating dense vectors of log_BF: .groupBy("chromosome", "leftStudyLocusId", "rightStudyLocusId") .agg( f.count("*").alias("numberColocalisingVariants"), - fml.array_to_vector( - f.collect_list(f.col("statistics.left_logBF")) - ).alias("left_logBF"), - fml.array_to_vector( - f.collect_list(f.col("statistics.right_logBF")) - ).alias("right_logBF"), + fml.array_to_vector(f.collect_list(f.col("left_logBF"))).alias( + "left_logBF" + ), + fml.array_to_vector(f.collect_list(f.col("right_logBF"))).alias( + "right_logBF" + ), fml.array_to_vector(f.collect_list(f.col("sum_log_bf"))).alias( "sum_log_bf" ), diff --git a/tests/gentropy/method/test_colocalisation_method.py b/tests/gentropy/method/test_colocalisation_method.py index 37613354c..e58b0e562 100644 --- a/tests/gentropy/method/test_colocalisation_method.py +++ b/tests/gentropy/method/test_colocalisation_method.py @@ -10,6 +10,7 @@ from gentropy.method.colocalisation import Coloc, ECaviar from pandas.testing import assert_frame_equal from pyspark.sql import SparkSession +from pyspark.sql.types import DoubleType, LongType, StringType, StructField, StructType def test_coloc(mock_study_locus_overlap: StudyLocusOverlap) -> None: @@ -104,6 +105,57 @@ def test_coloc_semantic( ) +def test_coloc_no_logbf( + spark: SparkSession, + minimum_expected_h0: float = 0.99, + maximum_expected_h4: float = 1e-5, +) -> None: + """Test COLOC output when the input data has irrelevant logBF.""" + observed_overlap = StudyLocusOverlap( + ( + spark.createDataFrame( + [ + { + "leftStudyLocusId": 1, + "rightStudyLocusId": 2, + "chromosome": "1", + "tagVariantId": "snp", + "statistics": { + "left_logBF": None, + "right_logBF": None, + }, # irrelevant for COLOC + } + ], + schema=StructType( + [ + StructField("leftStudyLocusId", LongType(), False), + StructField("rightStudyLocusId", LongType(), False), + StructField("chromosome", StringType(), False), + StructField("tagVariantId", StringType(), False), + StructField( + "statistics", + StructType( + [ + StructField("left_logBF", DoubleType(), True), + StructField("right_logBF", DoubleType(), True), + ] + ), + ), + ] + ), + ) + ), + StudyLocusOverlap.get_schema(), + ) + observed_coloc_df = Coloc.colocalise(observed_overlap).df + assert ( + observed_coloc_df.select("h0").collect()[0]["h0"] > minimum_expected_h0 + ), "COLOC should return a high h0 (no association) when the input data has irrelevant logBF." + assert ( + observed_coloc_df.select("h4").collect()[0]["h4"] < maximum_expected_h4 + ), "COLOC should return a low h4 (traits are associated) when the input data has irrelevant logBF." + + def test_ecaviar(mock_study_locus_overlap: StudyLocusOverlap) -> None: """Test eCAVIAR.""" assert isinstance(ECaviar.colocalise(mock_study_locus_overlap), Colocalisation)