From e7edebf3cd5b1495b7a31f40ef34e4da2268c318 Mon Sep 17 00:00:00 2001
From: Daniel Weindl <dweindl@users.noreply.github.com>
Date: Wed, 18 Dec 2024 23:19:52 +0100
Subject: [PATCH] petab.calculate: compare all common columns (#347)

For computing residuals, log-likelihoods, etc. from measurement and
simulation tables, we need to match the corresponding rows. Previously,
this was done using a subset of the PEtab measurement table columns and
checking whether all values in these columns match.

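This change instead matches rows on the full set of columns common to
both tables, not only the known measurement columns. With that, the
same functions can be used for PEtab v2 measurement/simulation tables.
In addition, a ValueError is now raised if a measurement matches
multiple simulation rows that are not all identical.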
---
 petab/v1/calculate.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/petab/v1/calculate.py b/petab/v1/calculate.py
index 32930807..4c129b88 100644
--- a/petab/v1/calculate.py
+++ b/petab/v1/calculate.py
@@ -106,10 +106,7 @@ def calculate_residuals_for_table(
     )
     residual_df[RESIDUAL] = residual_df[RESIDUAL].astype("float64")
     # matching columns
-    compared_cols = set(MEASUREMENT_DF_COLS)
-    compared_cols -= {MEASUREMENT}
-    compared_cols &= set(measurement_df.columns)
-    compared_cols &= set(simulation_df.columns)
+    compared_cols = set(measurement_df.columns) & set(simulation_df.columns)
 
     # compute noise formulas for observables
     noise_formulas = get_symbolic_noise_formulas(observable_df)
@@ -127,6 +124,16 @@ def calculate_residuals_for_table(
             raise ValueError(
                 f"Could not find simulation for measurement {row}."
             )
+        # if we have multiple matches, check that the rows are all identical
+        elif (
+            mask.sum() > 1
+            and simulation_df.loc[mask].drop_duplicates().shape[0] > 1
+        ):
+            raise ValueError(
+                f"Multiple different simulations found for measurement "
+                f"{row}:\n{simulation_df.loc[mask]}"
+            )
+
         simulation = simulation_df.loc[mask][SIMULATION].iloc[0]
         if scale:
             # apply scaling
@@ -343,10 +350,7 @@ def calculate_llh_for_table(
     llhs = []
 
     # matching columns
-    compared_cols = set(MEASUREMENT_DF_COLS)
-    compared_cols -= {MEASUREMENT}
-    compared_cols &= set(measurement_df.columns)
-    compared_cols &= set(simulation_df.columns)
+    compared_cols = set(measurement_df.columns) & set(simulation_df.columns)
 
     # compute noise formulas for observables
     noise_formulas = get_symbolic_noise_formulas(observable_df)