From d6f071ba42d223f5818c9c7e3016e2f7e85e051c Mon Sep 17 00:00:00 2001
From: Daniel Weindl
Date: Mon, 16 Dec 2024 22:32:56 +0100
Subject: [PATCH] petab1->2: create experiment df

---
 petab/v1/calculate.py    |   7 +++
 petab/v1/problem.py      |   8 +--
 petab/v2/C.py            |  14 +----
 petab/v2/__init__.py     |   5 +-
 petab/v2/lint.py         | 129 ++++++++++++++++++++++++++-------
 petab/v2/petab1to2.py    | 102 ++++++++++++++++++++++++++++---
 petab/v2/problem.py      |  27 +++-----
 tests/v2/test_problem.py |   4 +-
 8 files changed, 204 insertions(+), 92 deletions(-)

diff --git a/petab/v1/calculate.py b/petab/v1/calculate.py
index 3cc86f73..32930807 100644
--- a/petab/v1/calculate.py
+++ b/petab/v1/calculate.py
@@ -97,6 +97,9 @@ def calculate_residuals_for_table(
     Calculate residuals for a single measurement table.
     For the arguments, see `calculate_residuals`.
     """
+    # below, we rely on a unique index
+    measurement_df = measurement_df.reset_index(drop=True)
+
     # create residual df as copy of measurement df, change column
     residual_df = measurement_df.copy(deep=True).rename(
         columns={MEASUREMENT: RESIDUAL}
@@ -120,6 +123,10 @@ def calculate_residuals_for_table(
             for col in compared_cols
         ]
         mask = reduce(lambda x, y: x & y, masks)
+        if mask.sum() == 0:
+            raise ValueError(
+                f"Could not find simulation for measurement {row}."
+            )
         simulation = simulation_df.loc[mask][SIMULATION].iloc[0]
         if scale:
             # apply scaling
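The lookup above pairs each measurement row with its simulation row via an
AND-reduction over the shared identifier columns; the new ValueError replaces
the opaque IndexError that `.iloc[0]` would otherwise raise on an empty
selection. A minimal, self-contained pandas sketch of that matching logic,
using simplified stand-in tables rather than the petab API:

    from functools import reduce

    import pandas as pd

    measurement_df = pd.DataFrame(
        {"observableId": ["obs1", "obs1"], "time": [1.0, 2.0],
         "measurement": [0.8, 1.4]}
    )
    simulation_df = pd.DataFrame(
        {"observableId": ["obs1", "obs1"], "time": [1.0, 2.0],
         "simulation": [0.9, 1.2]}
    )

    compared_cols = ["observableId", "time"]
    for _, row in measurement_df.iterrows():
        # one boolean mask per identifier column, AND-ed together
        masks = [simulation_df[col] == row[col] for col in compared_cols]
        mask = reduce(lambda x, y: x & y, masks)
        if mask.sum() == 0:
            # the new, explicit error path
            raise ValueError(f"Could not find simulation for measurement {row}.")
        residual = (
            row["measurement"] - simulation_df.loc[mask, "simulation"].iloc[0]
        )
        print(residual)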
diff --git a/petab/v1/problem.py b/petab/v1/problem.py
index ea300258..6be96c68 100644
--- a/petab/v1/problem.py
+++ b/petab/v1/problem.py
@@ -1149,8 +1149,8 @@ def add_measurement(
         sim_cond_id: str,
         time: float,
         measurement: float,
-        observable_parameters: Sequence[str] = None,
-        noise_parameters: Sequence[str] = None,
+        observable_parameters: Sequence[str | float] = None,
+        noise_parameters: Sequence[str | float] = None,
         preeq_cond_id: str = None,
     ):
         """Add a measurement to the problem.
@@ -1172,11 +1172,11 @@ def add_measurement(
         }
         if observable_parameters is not None:
             record[OBSERVABLE_PARAMETERS] = [
-                PARAMETER_SEPARATOR.join(observable_parameters)
+                PARAMETER_SEPARATOR.join(map(str, observable_parameters))
             ]
         if noise_parameters is not None:
             record[NOISE_PARAMETERS] = [
-                PARAMETER_SEPARATOR.join(noise_parameters)
+                PARAMETER_SEPARATOR.join(map(str, noise_parameters))
             ]
         if preeq_cond_id is not None:
             record[PREEQUILIBRATION_CONDITION_ID] = [preeq_cond_id]
diff --git a/petab/v2/C.py b/petab/v2/C.py
index cb095c68..a0e07917 100644
--- a/petab/v2/C.py
+++ b/petab/v2/C.py
@@ -13,14 +13,6 @@
 #: Experiment ID column in the measurement table
 EXPERIMENT_ID = "experimentId"
 
-# TODO: remove
-#: Preequilibration condition ID column in the measurement table
-PREEQUILIBRATION_CONDITION_ID = "preequilibrationConditionId"
-
-# TODO: remove
-#: Simulation condition ID column in the measurement table
-SIMULATION_CONDITION_ID = "simulationConditionId"
-
 #: Measurement value column in the measurement table
 MEASUREMENT = "measurement"
 
@@ -45,17 +37,13 @@
 #: Mandatory columns of measurement table
 MEASUREMENT_DF_REQUIRED_COLS = [
     OBSERVABLE_ID,
-    # TODO: add
-    # EXPERIMENT_ID,
-    SIMULATION_CONDITION_ID,
+    EXPERIMENT_ID,
     MEASUREMENT,
     TIME,
 ]
 
 #: Optional columns of measurement table
 MEASUREMENT_DF_OPTIONAL_COLS = [
-    # TODO: remove
-    PREEQUILIBRATION_CONDITION_ID,
     OBSERVABLE_PARAMETERS,
     NOISE_PARAMETERS,
     DATASET_ID,
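With `experimentId` now required and the v1 condition columns gone from the
measurement table, a minimal v2 measurement table looks as follows (a sketch
with made-up IDs):

    import pandas as pd

    # satisfies the updated MEASUREMENT_DF_REQUIRED_COLS:
    # observableId, experimentId, time, measurement
    measurement_df = pd.DataFrame(
        {
            "observableId": ["obs_a", "obs_a"],
            "experimentId": ["experiment_1", "experiment_1"],
            "time": [0.0, 10.0],
            "measurement": [0.1, 0.25],
        }
    )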
diff --git a/petab/v2/__init__.py b/petab/v2/__init__.py
index 0525d66c..adeb0e84 100644
--- a/petab/v2/__init__.py
+++ b/petab/v2/__init__.py
@@ -27,7 +27,10 @@
 
 # import after v1
 from ..version import __version__  # noqa: F401, E402
-from . import models  # noqa: F401, E402
+from . import (  # noqa: F401, E402
+    C,  # noqa: F401, E402
+    models,  # noqa: F401, E402
+)
 from .conditions import *  # noqa: F403, F401, E402
 from .experiments import (  # noqa: F401, E402
     get_experiment_df,
diff --git a/petab/v2/lint.py b/petab/v2/lint.py
index 76f3cdb6..2473c74d 100644
--- a/petab/v2/lint.py
+++ b/petab/v2/lint.py
@@ -15,6 +15,9 @@
 from .. import v2
 from ..v1.lint import (
     _check_df,
+    assert_measured_observables_defined,
+    assert_measurements_not_null,
+    assert_measurements_numeric,
     assert_model_parameters_in_condition_or_parameter_table,
     assert_no_leading_trailing_whitespace,
     assert_parameter_bounds_are_numeric,
@@ -23,13 +26,16 @@
     assert_parameter_prior_parameters_are_valid,
     assert_parameter_prior_type_is_valid,
     assert_parameter_scale_is_valid,
+    assert_unique_observable_ids,
     assert_unique_parameter_ids,
     check_ids,
-    check_measurement_df,
     check_observable_df,
     check_parameter_bounds,
 )
-from ..v1.measurements import split_parameter_replacement_list
+from ..v1.measurements import (
+    assert_overrides_match_parameter_count,
+    split_parameter_replacement_list,
+)
 from ..v1.observables import get_output_parameters, get_placeholders
 from ..v1.visualize.lint import validate_visualization_df
 from ..v2.C import *
@@ -102,6 +108,23 @@ class ValidationError(ValidationIssue):
     level: ValidationIssueSeverity = field(
         default=ValidationIssueSeverity.ERROR, init=False
     )
+    task: str | None = None
+
+    def __post_init__(self):
+        if self.task is None:
+            self.task = self._get_task_name()
+
+    def _get_task_name(self):
+        """Get the name of the ValidationTask that raised this error."""
+        import inspect
+
+        # walk up the stack until we find the ValidationTask.run method
+        for frame_info in inspect.stack():
+            frame = frame_info.frame
+            if "self" in frame.f_locals:
+                task = frame.f_locals["self"]
+                if isinstance(task, ValidationTask):
+                    return task.__class__.__name__
 
 
 class ValidationResultList(list[ValidationIssue]):
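The `_get_task_name` helper above lets a ValidationError record which
validation task created it without every task passing its name explicitly.
A self-contained sketch of the same stack-walking pattern, with illustrative
class names that are not part of petab:

    import inspect


    class Task:
        def run(self):
            return find_calling_task()


    def find_calling_task():
        # walk up the call stack and return the class name of the first
        # enclosing method whose `self` is a Task instance
        for frame_info in inspect.stack():
            caller = frame_info.frame.f_locals.get("self")
            if isinstance(caller, Task):
                return type(caller).__name__


    class CheckSomething(Task):
        pass


    print(CheckSomething().run())  # prints: CheckSomething

The trade-off: no extra plumbing through the call chain, at the cost of
relying on runtime frame introspection.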
@@ -237,8 +260,51 @@ def run(self, problem: Problem) -> ValidationIssue | None:
         if problem.measurement_df is None:
             return
 
+        df = problem.measurement_df
         try:
-            check_measurement_df(problem.measurement_df, problem.observable_df)
+            _check_df(df, MEASUREMENT_DF_REQUIRED_COLS, "measurement")
+
+            for column_name in MEASUREMENT_DF_REQUIRED_COLS:
+                if not np.issubdtype(df[column_name].dtype, np.number):
+                    assert_no_leading_trailing_whitespace(
+                        df[column_name].values, column_name
+                    )
+
+            for column_name in MEASUREMENT_DF_OPTIONAL_COLS:
+                if column_name in df and not np.issubdtype(
+                    df[column_name].dtype, np.number
+                ):
+                    assert_no_leading_trailing_whitespace(
+                        df[column_name].values, column_name
+                    )
+
+            if problem.observable_df is not None:
+                assert_measured_observables_defined(df, problem.observable_df)
+                assert_overrides_match_parameter_count(
+                    df, problem.observable_df
+                )
+
+                if OBSERVABLE_TRANSFORMATION in problem.observable_df:
+                    # Check for positivity of measurements in case of
+                    # log-transformation
+                    assert_unique_observable_ids(problem.observable_df)
+                    # without the uniqueness check above, `trafo` below
+                    # could be a pandas Series instead of a scalar
+                    for measurement, obs_id in zip(
+                        df[MEASUREMENT], df[OBSERVABLE_ID], strict=True
+                    ):
+                        trafo = problem.observable_df.loc[
+                            obs_id, OBSERVABLE_TRANSFORMATION
+                        ]
+                        if measurement <= 0.0 and trafo in [LOG, LOG10]:
+                            raise ValueError(
+                                "Measurements with observable "
+                                f"transformation {trafo} must be "
+                                f"positive, but {measurement} <= 0."
+                            )
+
+            assert_measurements_not_null(df)
+            assert_measurements_numeric(df)
         except AssertionError as e:
             return ValidationError(str(e))
 
@@ -247,46 +313,20 @@ def run(self, problem: Problem) -> ValidationIssue | None:
         # condition table should be an error if the measurement table refers
         # to conditions
 
-        # check that measured experiments/conditions exist
-        # TODO: fully switch to experiment table and remove this:
-        if SIMULATION_CONDITION_ID in problem.measurement_df:
-            if problem.condition_df is None:
-                return
-            used_conditions = set(
-                problem.measurement_df[SIMULATION_CONDITION_ID].dropna().values
-            )
-            if PREEQUILIBRATION_CONDITION_ID in problem.measurement_df:
-                used_conditions |= set(
-                    problem.measurement_df[PREEQUILIBRATION_CONDITION_ID]
-                    .dropna()
-                    .values
-                )
-            available_conditions = set(
-                problem.condition_df[CONDITION_ID].unique()
-            )
-            if missing_conditions := (used_conditions - available_conditions):
-                return ValidationError(
-                    "Measurement table references conditions that "
-                    "are not specified in the condition table: "
-                    + str(missing_conditions)
-                )
-        elif EXPERIMENT_ID in problem.measurement_df:
-            if problem.experiment_df is None:
-                return
-            used_experiments = set(
-                problem.measurement_df[EXPERIMENT_ID].values
-            )
-            available_experiments = set(
-                problem.condition_df[CONDITION_ID].unique()
-            )
-            if missing_experiments := (
-                used_experiments - available_experiments
-            ):
-                raise AssertionError(
-                    "Measurement table references experiments that "
-                    "are not specified in the experiments table: "
-                    + str(missing_experiments)
-                )
+        # check that measured experiments exist
+        if problem.experiment_df is None:
+            return
+
+        used_experiments = set(problem.measurement_df[EXPERIMENT_ID].values)
+        available_experiments = set(
+            problem.experiment_df[EXPERIMENT_ID].unique()
+        )
+        if missing_experiments := (used_experiments - available_experiments):
+            return ValidationError(
+                "Measurement table references experiments that "
+                "are not specified in the experiments table: "
+                + str(missing_experiments)
+            )
 
 
 class CheckConditionTable(ValidationTask):
@@ -486,7 +526,7 @@ def run(self, problem: Problem) -> ValidationIssue | None:
             )
 
         required_conditions = problem.experiment_df[CONDITION_ID].unique()
-        existing_conditions = problem.condition_df.index
+        existing_conditions = problem.condition_df[CONDITION_ID].unique()
 
         missing_conditions = set(required_conditions) - set(
             existing_conditions
@@ -771,7 +811,8 @@ def append_overrides(overrides):
         )
 
     # parameters that are overridden via the condition table are not allowed
-    parameter_ids -= set(problem.condition_df[TARGET_ID].unique())
+    if problem.condition_df is not None:
+        parameter_ids -= set(problem.condition_df[TARGET_ID].unique())
 
     return parameter_ids
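From user code, these checks are consumed through `v2.lint_problem`, as the
converter below does; a sketch, with a placeholder YAML path:

    from petab import v2

    validation_issues = v2.lint_problem("path/to/problem_v2.yaml")
    for issue in validation_issues:
        # each issue carries a severity level and, for errors, the name of
        # the ValidationTask that produced it (see `task` above)
        print(issue)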
= f"{preeq_cond_id}_" + exp_id = f"experiment_{preeq_cond_id}{sim_cond_id}" + if exp_id in experiments: # noqa: B023 + i = 1 + while f"{exp_id}_{i}" in experiments: # noqa: B023 + i += 1 + exp_id = f"{exp_id}_{i}" + return exp_id + + measured_experiments = ( + petab_problem.get_simulation_conditions_from_measurement_df() + ) + for ( + _, + row, + ) in measured_experiments.iterrows(): + # generate a new experiment for each simulation / pre-eq condition + # combination + sim_cond_id = row[v1.C.SIMULATION_CONDITION_ID] + preeq_cond_id = row.get(v1.C.PREEQUILIBRATION_CONDITION_ID, "") + exp_id = create_experiment_id(sim_cond_id, preeq_cond_id) + if preeq_cond_id: + experiments.append( + { + v2.C.EXPERIMENT_ID: exp_id, + v2.C.CONDITION_ID: preeq_cond_id, + v2.C.TIME: float("-inf"), + } + ) + experiments.append( + { + v2.C.EXPERIMENT_ID: exp_id, + v2.C.CONDITION_ID: sim_cond_id, + v2.C.TIME: 0, + } + ) + if experiments: + exp_table_path = output_dir / "experiments.tsv" + if exp_table_path.exists(): + raise ValueError( + f"Experiment table file {exp_table_path} already exists." + ) + problem_config[v2.C.EXPERIMENT_FILES] = [exp_table_path.name] + v2.write_experiment_df( + v2.get_experiment_df(pd.DataFrame(experiments)), exp_table_path + ) + for measurement_file in problem_config.get(v2.C.MEASUREMENT_FILES, []): measurement_df = v1.get_measurement_df( get_src_path(measurement_file) ) + # if there is already an experiment ID column, we rename it + if v2.C.EXPERIMENT_ID in measurement_df.columns: + measurement_df.rename( + columns={v2.C.EXPERIMENT_ID: f"experiment_id_{uuid4()}"}, + inplace=True, + ) + # add pre-eq condition id if not present or convert to string + # for simplicity + if v1.C.PREEQUILIBRATION_CONDITION_ID in measurement_df.columns: + measurement_df[ + v1.C.PREEQUILIBRATION_CONDITION_ID + ] = measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID].astype( + str + ) + else: + measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID] = "" + if ( petab_problem.condition_df is not None and len( @@ -110,20 +181,33 @@ def petab1to2(yaml_config: Path | str, output_dir: Path | str = None): ) == 0 ): - # can't have "empty" conditions with no overrides in v2 - # TODO: this needs to be done condition wise - measurement_df[v2.C.SIMULATION_CONDITION_ID] = np.nan + # we can't have "empty" conditions with no overrides in v2, + # therefore, we drop the respective condition ID completely + # TODO: or can we? + # TODO: this needs to be checked condition-wise, not globally + measurement_df[v1.C.SIMULATION_CONDITION_ID] = "" if ( v1.C.PREEQUILIBRATION_CONDITION_ID in measurement_df.columns ): - measurement_df[v2.C.PREEQUILIBRATION_CONDITION_ID] = np.nan + measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID] = "" + # condition IDs to experiment IDs + measurement_df.insert( + 0, + v2.C.EXPERIMENT_ID, + measurement_df.apply( + lambda row: create_experiment_id( + row[v1.C.SIMULATION_CONDITION_ID], + row.get(v1.C.PREEQUILIBRATION_CONDITION_ID, ""), + ), + axis=1, + ), + ) + del measurement_df[v1.C.SIMULATION_CONDITION_ID] + del measurement_df[v1.C.PREEQUILIBRATION_CONDITION_ID] v2.write_measurement_df( measurement_df, get_dest_path(measurement_file) ) - # TODO: Measurements: preequilibration to experiments/timecourses once - # finalized - ... 
@@ -189,7 +273,7 @@ def v1v2_condition_df(
     """Convert condition table from petab v1 to v2."""
     condition_df = condition_df.copy().reset_index()
     with suppress(KeyError):
-        # TODO: are condition names still supported in v2?
+        # conditionName was dropped in PEtab v2
         condition_df.drop(columns=[v2.C.CONDITION_NAME], inplace=True)
 
     condition_df = condition_df.melt(
diff --git a/petab/v2/problem.py b/petab/v2/problem.py
index 1df2c677..f8dad754 100644
--- a/petab/v2/problem.py
+++ b/petab/v2/problem.py
@@ -5,7 +5,6 @@
 import os
 import tempfile
 import traceback
-import warnings
 from collections.abc import Sequence
 from math import nan
 from numbers import Number
@@ -92,12 +91,6 @@ def __init__(
             ValidationTask
         ] = default_validation_tasks.copy()
         self.config = config
-        if self.experiment_df is not None:
-            warnings.warn(
-                "The experiment table is not yet supported and "
-                "will be ignored.",
-                stacklevel=2,
-            )
 
     def __str__(self):
         model = f"with model ({self.model})" if self.model else "without model"
@@ -908,47 +901,43 @@ def add_parameter(
     def add_measurement(
         self,
         obs_id: str,
-        sim_cond_id: str,
+        experiment_id: str,
         time: float,
         measurement: float,
-        observable_parameters: Sequence[str] = None,
-        noise_parameters: Sequence[str] = None,
-        preeq_cond_id: str = None,
+        observable_parameters: Sequence[str | float] = None,
+        noise_parameters: Sequence[str | float] = None,
     ):
         """Add a measurement to the problem.
 
         Arguments:
             obs_id: The observable ID
-            sim_cond_id: The simulation condition ID
+            experiment_id: The experiment ID
             time: The measurement time
             measurement: The measurement value
             observable_parameters: The observable parameters
             noise_parameters: The noise parameters
-            preeq_cond_id: The pre-equilibration condition ID
         """
         record = {
             OBSERVABLE_ID: [obs_id],
-            SIMULATION_CONDITION_ID: [sim_cond_id],
+            EXPERIMENT_ID: [experiment_id],
             TIME: [time],
             MEASUREMENT: [measurement],
         }
         if observable_parameters is not None:
             record[OBSERVABLE_PARAMETERS] = [
-                PARAMETER_SEPARATOR.join(observable_parameters)
+                PARAMETER_SEPARATOR.join(map(str, observable_parameters))
             ]
         if noise_parameters is not None:
             record[NOISE_PARAMETERS] = [
-                PARAMETER_SEPARATOR.join(noise_parameters)
+                PARAMETER_SEPARATOR.join(map(str, noise_parameters))
            ]
-        if preeq_cond_id is not None:
-            record[PREEQUILIBRATION_CONDITION_ID] = [preeq_cond_id]
         tmp_df = pd.DataFrame(record)
         self.measurement_df = (
             pd.concat([self.measurement_df, tmp_df])
             if self.measurement_df is not None
             else tmp_df
-        )
+        ).reset_index(drop=True)
 
     def add_mapping(self, petab_id: str, model_id: str):
         """Add a mapping table entry to the problem.
diff --git a/tests/v2/test_problem.py b/tests/v2/test_problem.py
index ba210af0..dadc3a7c 100644
--- a/tests/v2/test_problem.py
+++ b/tests/v2/test_problem.py
@@ -30,7 +30,7 @@ def test_load_remote():
     """Test loading remote files"""
     yaml_url = (
         "https://raw.githubusercontent.com/PEtab-dev/petab_test_suite"
-        "/update_v2/petabtests/cases/v2.0.0/sbml/0001/_0001.yaml"
+        "/update_v2/petabtests/cases/v2.0.0/sbml/0010/_0010.yaml"
     )
 
     petab_problem = Problem.from_yaml(yaml_url)
@@ -83,7 +83,7 @@ def test_problem_from_yaml_multiple_files():
                 problem.experiment_df, Path(tmpdir, f"experiments{i}.tsv")
             )
 
-            problem.add_measurement(f"observable{i}", f"condition{i}", 1, 1)
+            problem.add_measurement(f"observable{i}", f"experiment{i}", 1, 1)
             petab.write_measurement_df(
                 problem.measurement_df, Path(tmpdir, f"measurements{i}.tsv")
             )
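An end-to-end sketch of using the converter added in this patch; the input
YAML path and output directory are placeholders:

    from petab.v2.petab1to2 import petab1to2

    # reads the v1 problem, writes the v2 files (including the newly
    # created experiments.tsv) to the output directory, and lints the
    # resulting problem
    petab1to2("path/to/v1_problem.yaml", output_dir="path/to/v2_problem")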