From e143205f1f9073b197502ca0e09dad74feebf0a7 Mon Sep 17 00:00:00 2001
From: Ben Schroeter <benjschroeter@gmail.com>
Date: Fri, 6 Oct 2023 11:22:26 +1100
Subject: [PATCH 1/6] Config validation (Fixes #143). I have also implemented
 the data installation part of #151.

---
 .conda/benchcab-dev.yaml              |   2 +-
 .conda/meta.yaml                      |   1 +
 benchcab/config.py                    | 213 ++++++----------
 benchcab/data/config-schema.json      |  89 +++++++
 benchcab/data/config-schema.yml       |  67 ++++++
 benchcab/data/test/config-invalid.yml |  19 ++
 benchcab/data/test/config-valid.yml   |  35 +++
 benchcab/utils/__init__.py            |  50 ++++
 setup.cfg                             |   5 +-
 tests/test_config.py                  | 334 ++------------------------
 tests/test_utils.py                   |  23 ++
 11 files changed, 389 insertions(+), 449 deletions(-)
 create mode 100644 benchcab/data/config-schema.json
 create mode 100644 benchcab/data/config-schema.yml
 create mode 100644 benchcab/data/test/config-invalid.yml
 create mode 100644 benchcab/data/test/config-valid.yml
 create mode 100644 tests/test_utils.py

diff --git a/.conda/benchcab-dev.yaml b/.conda/benchcab-dev.yaml
index 6be15d1e..2652f852 100644
--- a/.conda/benchcab-dev.yaml
+++ b/.conda/benchcab-dev.yaml
@@ -6,7 +6,7 @@ dependencies:
   - python=3.9
   - f90nml
   - netcdf4
-  - numpy
   - pytest-cov
   - pyyaml
   - flatdict
+  - cerberus>=1.3.5
\ No newline at end of file
diff --git a/.conda/meta.yaml b/.conda/meta.yaml
index 0ba08259..6745ab42 100644
--- a/.conda/meta.yaml
+++ b/.conda/meta.yaml
@@ -26,3 +26,4 @@ requirements:
         - PyYAML
         - f90nml
         - flatdict
+        - cerberus >= 1.3.5
diff --git a/benchcab/config.py b/benchcab/config.py
index eccf1605..ab8b3a17 100644
--- a/benchcab/config.py
+++ b/benchcab/config.py
@@ -1,153 +1,92 @@
 """A module containing all *_config() functions."""
-
 from pathlib import Path
 import yaml
-
 from benchcab import internal
+from cerberus import Validator
+import benchcab.utils as bu
+
+
+class ConfigValidationException(Exception):
+
+    def __init__(self, validator: Validator):
+        """Config validation exception.
+
+        Parameters
+        ----------
+        validator: cerberus.Validator
+            A validation object that has been used and has the errors attribute.
+        """
+
+        # Nicely format the errors.
+        errors = [f'{k} = {v}' for k, v in validator.errors.items()]
+
+        # Assemble the error message and 
+        msg = '\n\nThe following errors were raised when validating the config file.\n'
+        msg += '\n'.join(errors) + '\n'
 
+        # Raise to super.
+        super().__init__(msg)
 
-def check_config(config: dict):
-    """Performs input validation on config file.
 
-    If the config is invalid, an exception is raised. Otherwise, do nothing.
+def validate_config(config: dict) -> bool:
+    """Validate the configuration dictionary.
+
+    Parameters
+    ----------
+    config : dict
+        Dictionary of configuration loaded from the yaml file.
+
+    Returns
+    -------
+    bool
+        True if valid, exception raised otherwise.
+
+    Raises
+    ------
+    ConfigValidationException
+        Raised when the configuration file fails validation.
     """
+    
+    # Load the schema
+    schema = bu.load_package_data('config-schema.yml')
+
+    # Create a validator
+    v = Validator(schema)
 
-    if any(key not in config for key in internal.CONFIG_REQUIRED_KEYS):
-        raise ValueError(
-            "Keys are missing from the config file: "
-            + ", ".join(
-                key for key in internal.CONFIG_REQUIRED_KEYS if key not in config
-            )
-        )
-
-    if not isinstance(config["project"], str):
-        raise TypeError("The 'project' key must be a string.")
-
-    if not isinstance(config["modules"], list):
-        raise TypeError("The 'modules' key must be a list.")
-
-    if not isinstance(config["experiment"], str):
-        raise TypeError("The 'experiment' key must be a string.")
-
-    # the "science_configurations" key is optional
-    if "science_configurations" in config:
-        if not isinstance(config["science_configurations"], list):
-            raise TypeError("The 'science_configurations' key must be a list.")
-        if config["science_configurations"] == []:
-            raise ValueError("The 'science_configurations' key cannot be empty.")
-        if not all(
-            isinstance(value, dict) for value in config["science_configurations"]
-        ):
-            raise TypeError(
-                "Science config settings must be specified using a dictionary "
-                "that is compatible with the f90nml python package."
-            )
-
-    # the "fluxsite" key is optional
-    if "fluxsite" in config:
-        if not isinstance(config["fluxsite"], dict):
-            raise TypeError("The 'fluxsite' key must be a dictionary.")
-        # the "pbs" key is optional
-        if "pbs" in config["fluxsite"]:
-            if not isinstance(config["fluxsite"]["pbs"], dict):
-                raise TypeError("The 'pbs' key must be a dictionary.")
-            # the "ncpus" key is optional
-            if "ncpus" in config["fluxsite"]["pbs"] and not isinstance(
-                config["fluxsite"]["pbs"]["ncpus"], int
-            ):
-                raise TypeError("The 'ncpus' key must be an integer.")
-            # the "mem" key is optional
-            if "mem" in config["fluxsite"]["pbs"] and not isinstance(
-                config["fluxsite"]["pbs"]["mem"], str
-            ):
-                raise TypeError("The 'mem' key must be a string.")
-            # the "walltime" key is optional
-            if "walltime" in config["fluxsite"]["pbs"] and not isinstance(
-                config["fluxsite"]["pbs"]["walltime"], str
-            ):
-                raise TypeError("The 'walltime' key must be a string.")
-            # the "storage" key is optional
-            if "storage" in config["fluxsite"]["pbs"]:
-                if not isinstance(config["fluxsite"]["pbs"]["storage"], list) or any(
-                    not isinstance(val, str)
-                    for val in config["fluxsite"]["pbs"]["storage"]
-                ):
-                    raise TypeError("The 'storage' key must be a list of strings.")
-        # the "multiprocessing" key is optional
-        if "multiprocessing" in config["fluxsite"] and not isinstance(
-            config["fluxsite"]["multiprocessing"], bool
-        ):
-            raise TypeError("The 'multiprocessing' key must be a boolean.")
-
-    valid_experiments = (
-        list(internal.MEORG_EXPERIMENTS) + internal.MEORG_EXPERIMENTS["five-site-test"]
-    )
-    if config["experiment"] not in valid_experiments:
-        raise ValueError(
-            "The 'experiment' key is invalid.\n"
-            "Valid experiments are: " + ", ".join(valid_experiments)
-        )
-
-    if not isinstance(config["realisations"], list):
-        raise TypeError("The 'realisations' key must be a list.")
-
-    if config["realisations"] == []:
-        raise ValueError("The 'realisations' key cannot be empty.")
-
-    for branch_id, branch_config in enumerate(config["realisations"]):
-        if not isinstance(branch_config, dict):
-            raise TypeError(f"Realisation '{branch_id}' must be a dictionary object.")
-        if "path" not in branch_config:
-            raise ValueError(
-                f"Realisation '{branch_id}' must specify the `path` field."
-            )
-        if not isinstance(branch_config["path"], str):
-            raise TypeError(
-                f"The 'path' field in realisation '{branch_id}' must be a string."
-            )
-        # the "name" key is optional
-        if "name" in branch_config and not isinstance(branch_config["name"], str):
-            raise TypeError(
-                f"The 'name' field in realisation '{branch_id}' must be a string."
-            )
-        # the "revision" key is optional
-        if "revision" in branch_config and not isinstance(
-            branch_config["revision"], int
-        ):
-            raise TypeError(
-                f"The 'revision' field in realisation '{branch_id}' must be an "
-                "integer."
-            )
-        # the "patch" key is optional
-        if "patch" in branch_config and not isinstance(branch_config["patch"], dict):
-            raise TypeError(
-                f"The 'patch' field in realisation '{branch_id}' must be a "
-                "dictionary that is compatible with the f90nml python package."
-            )
-        # the "patch_remove" key is optional
-        if "patch_remove" in branch_config and not isinstance(
-            branch_config["patch_remove"], dict
-        ):
-            raise TypeError(
-                f"The 'patch_remove' field in realisation '{branch_id}' must be a "
-                "dictionary that is compatible with the f90nml python package."
-            )
-        # the "build_script" key is optional
-        if "build_script" in branch_config and not isinstance(
-            branch_config["build_script"], str
-        ):
-            raise TypeError(
-                f"The 'build_script' field in realisation '{branch_id}' must be a "
-                "string."
-            )
+    # Validate
+    is_valid = v.validate(config)
+    
+    # Valid
+    if is_valid:
+        return True
+
+    # Invalid
+    raise ConfigValidationException(v)
 
 
 def read_config(config_path: str) -> dict:
-    """Reads the config file and returns a dictionary containing the configurations."""
+    """Reads the config file and returns a dictionary containing the configurations.
+
+    Parameters
+    ----------
+    config_path : str
+        Path to the configuration file.
+
+    Returns
+    -------
+    dict
+        Configuration dict.
+    
+    Raises
+    ------
+    ConfigValidationError
+        Raised when the configuration file fails validation.
+    """
 
+    # Load the configuration file.
     with open(Path(config_path), "r", encoding="utf-8") as file:
         config = yaml.safe_load(file)
 
-    check_config(config)
-
-    return config
+    # Validate and return.
+    validate_config(config)
+    return config
\ No newline at end of file
diff --git a/benchcab/data/config-schema.json b/benchcab/data/config-schema.json
new file mode 100644
index 00000000..1c4b468f
--- /dev/null
+++ b/benchcab/data/config-schema.json
@@ -0,0 +1,89 @@
+{
+    "project": {
+        "type": "integer"
+    },
+    "modules": {
+        "type": "list",
+        "schema": {
+            "type": "string"
+        }
+    },
+    "experiment": {
+        "type": "string",
+        "allowed": [
+            "five-site-test",
+            "forty-two-site-test",
+            "AU-Tum",
+            "AU-How",
+            "FI-Hyy",
+            "US-Var",
+            "US-Whs"
+        ]
+    },
+    "science_configurations": {
+        "type": "list",
+        "schema": {
+            "type": "dict"
+        }
+    },
+    "realisations": {
+        "type": "list",
+        "required": true,
+        "schema": {
+            "type": "dict",
+            "schema": {
+                "path": {
+                    "type": "string"
+                },
+                "name": {
+                    "type": "string",
+                    "required": false
+                },
+                "build_script": {
+                    "type": "string",
+                    "required": false
+                },
+                "revision": {
+                    "type": "string",
+                    "required": false
+                },
+                "patch": {
+                    "type": "dict",
+                    "required": false
+                }
+            }
+        }
+    },
+    "fluxsite": {
+        "type": "dict",
+        "required": false,
+        "schema": {
+            "multiprocessing": {
+                "type": "boolean"
+            },
+            "pbs": {
+                "type": "dict",
+                "schema": {
+                    "ncpus": {
+                        "type": "integer",
+                        "required": false
+                    },
+                    "mem": {
+                        "type": "integer",
+                        "required": false
+                    },
+                    "walltime": {
+                        "type": "string",
+                        "required": false
+                    },
+                    "storage": {
+                        "type": "list",
+                        "schema": {
+                            "type": "string"
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/benchcab/data/config-schema.yml b/benchcab/data/config-schema.yml
new file mode 100644
index 00000000..37ff37d5
--- /dev/null
+++ b/benchcab/data/config-schema.yml
@@ -0,0 +1,67 @@
+project:
+  type: "string"
+
+modules:
+  type: "list"
+  schema:
+    type: "string"
+
+experiment:
+  type: "string"
+  allowed: [
+    "five-site-test",
+    "forty-two-site-test",
+    "AU-Tum",
+    "AU-How",
+    "FI-Hyy",
+    "US-Var",
+    "US-Whs"
+  ]
+
+science_configuration:
+  type: "list"
+  schema: 
+    type: "dict"
+
+realisations:
+  type: "list"
+  required: true
+  schema:
+    type: "dict"
+    schema:
+      path:
+        type: "string"
+      name:
+        type: "string"
+        required: false
+      build_script:
+        type: "string"
+        required: false
+      revision:
+        type: "string"
+        required: false
+      patch:
+        type: "dict"
+        required: false
+
+fluxsite:
+  type: "dict"
+  required: false
+  schema:
+    multiprocessing:
+      type: "boolean"
+    pbs:
+      type: "dict"
+      schema:
+        ncpus:
+          type: "integer"
+          required: false
+        mem:
+          type: "integer"
+          required: false
+        walltime:
+          type: "integer"
+          required: false
+        storage:
+          type: "integer"
+          required: false
\ No newline at end of file
diff --git a/benchcab/data/test/config-invalid.yml b/benchcab/data/test/config-invalid.yml
new file mode 100644
index 00000000..0e2f6e6b
--- /dev/null
+++ b/benchcab/data/test/config-invalid.yml
@@ -0,0 +1,19 @@
+# A sample configuration that should fail validation.
+project: w97
+
+experiment:  NON EXISTENT EXPERIMENT!!!
+
+realisations: [
+  {
+    path: "trunk",
+  },
+  {
+    path: "branches/Users/ccc561/v3.0-YP-changes",
+  }
+]
+
+modules: [
+  intel-compiler/2021.1.1,
+  netcdf/4.7.4,
+  openmpi/4.1.0
+]
\ No newline at end of file
diff --git a/benchcab/data/test/config-valid.yml b/benchcab/data/test/config-valid.yml
new file mode 100644
index 00000000..902a3249
--- /dev/null
+++ b/benchcab/data/test/config-valid.yml
@@ -0,0 +1,35 @@
+# Example configuration file for running the CABLE benchmarking.
+# 
+# Note, optional keys are available in this config file. See
+# https://benchcab.readthedocs.io/en/stable/user_guide/config_options/
+# for documentation on all the available keys.
+# 
+# This file is in YAML format. You can get information on the syntax here:
+# https://yaml.org/spec/1.2.2/#chapter-2-language-overview
+# Quick tips:
+# You need a space after the ":"
+#
+# It uses the same syntax as Python for:
+#     - lists (aka sequences)
+#     - dictionaries (aka mappings with key/value pairs)
+#
+# Strings can be given with or without double or single quotes.
+
+project: w97
+
+experiment: five-site-test
+
+realisations: [
+  {
+    path: "trunk",
+  },
+  {
+    path: "branches/Users/ccc561/v3.0-YP-changes",
+  }
+]
+
+modules: [
+  intel-compiler/2021.1.1,
+  netcdf/4.7.4,
+  openmpi/4.1.0
+]
\ No newline at end of file
diff --git a/benchcab/utils/__init__.py b/benchcab/utils/__init__.py
index e69de29b..d8b84f32 100644
--- a/benchcab/utils/__init__.py
+++ b/benchcab/utils/__init__.py
@@ -0,0 +1,50 @@
+"""Top-level utilities."""
+import pkgutil
+import json
+import yaml
+import os
+import importlib
+from pathlib import Path
+
+
+# List of one-argument decoding functions.
+PACKAGE_DATA_DECODERS = dict(
+    json=json.loads,
+    yml=yaml.safe_load
+)
+
+
+def get_installed_root() -> Path:
+    """Get the installed root of the benchcab installation.
+
+    Returns
+    -------
+    Path
+        Path to the installed root.
+    """
+    return Path(importlib.resources.files('benchcab'))
+
+
+
+def load_package_data(filename: str) -> dict:
+    """Load data out of the installed package data directory.
+
+    Parameters
+    ----------
+    filename : str
+        Filename of the file to load out of the data directory.
+    """
+    # Work out the encoding of requested file.
+    ext = filename.split('.')[-1]
+
+    # Alias yaml and yml.
+    ext = ext if ext != 'yaml' else 'yml'
+
+    # Make sure it is one of the supported types.
+    assert ext in PACKAGE_DATA_DECODERS.keys()
+
+    # Extract from the installations data directory.
+    raw = pkgutil.get_data('benchcab', os.path.join('data', filename)).decode('utf-8')
+
+    # Decode and return.
+    return PACKAGE_DATA_DECODERS[ext](raw)
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index 7b5ebde0..2a878959 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,7 +1,7 @@
 [metadata]
 name=benchcab
 summary= Software to run a benchmarking suite for CABLE LSM
-version=2.0.0
+# version=2.0.0
 description=To benchmark CABLE simulations
 url=https://github.com/CABLE-LSM/benchcab
 license=Apache 2.0
@@ -21,3 +21,6 @@ console_scripts =
 
 [tool:pytest]
 addopts = --doctest-modules --doctest-glob='*.rst' --ignore setup.py --ignore conftest.py --ignore docs/conf.py
+
+[options]
+include_package_data = True
\ No newline at end of file
diff --git a/tests/test_config.py b/tests/test_config.py
index 19ae4b54..b76de27c 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -1,321 +1,35 @@
 """`pytest` tests for config.py"""
-
-import os
 import pytest
-import yaml
-
-from tests.common import TMP_DIR
-from tests.common import get_mock_config
-from benchcab.config import check_config, read_config
-from benchcab import internal
-
-
-def test_check_config():
-    """Tests for `check_config()`."""
-
-    # Success case: test barebones config is valid
-    config = get_mock_config()
-    check_config(config)
-
-    # Success case: branch configuration with missing name key
-    config = get_mock_config()
-    config["realisations"][0].pop("name")
-    check_config(config)
-
-    # Success case: branch configuration with missing revision key
-    config = get_mock_config()
-    config["realisations"][0].pop("revision")
-    check_config(config)
-
-    # Success case: branch configuration with missing patch key
-    config = get_mock_config()
-    config["realisations"][0].pop("patch")
-    check_config(config)
-
-    # Success case: branch configuration with missing patch_remove key
-    config = get_mock_config()
-    config["realisations"][0].pop("patch_remove")
-    check_config(config)
-
-    # Success case: test config when realisations contains more than two keys
-    config = get_mock_config()
-    config["realisations"].append({"path": "path/to/my_new_branch"})
-    assert len(config["realisations"]) > 2
-    check_config(config)
-
-    # Success case: test config when realisations contains less than two keys
-    config = get_mock_config()
-    config["realisations"].pop()
-    assert len(config["realisations"]) < 2
-    check_config(config)
-
-    # Success case: test experiment with site id from the
-    # five-site-test is valid
-    config = get_mock_config()
-    config["experiment"] = "AU-Tum"
-    check_config(config)
-
-    # Success case: test config without science_configurations is valid
-    config = get_mock_config()
-    config.pop("science_configurations")
-    check_config(config)
-
-    # Success case: test config without fluxsite key is valid
-    config = get_mock_config()
-    config.pop("fluxsite")
-    check_config(config)
-
-    # Success case: test config without multiprocessing key is valid
-    config = get_mock_config()
-    config["fluxsite"].pop("multiprocessing")
-    check_config(config)
-
-    # Success case: test config without pbs key is valid
-    config = get_mock_config()
-    config["fluxsite"].pop("pbs")
-    check_config(config)
-
-    # Success case: test config without ncpus key is valid
-    config = get_mock_config()
-    config["fluxsite"]["pbs"].pop("ncpus")
-    check_config(config)
-
-    # Success case: test config without mem key is valid
-    config = get_mock_config()
-    config["fluxsite"]["pbs"].pop("mem")
-    check_config(config)
-
-    # Success case: test config without walltime key is valid
-    config = get_mock_config()
-    config["fluxsite"]["pbs"].pop("walltime")
-    check_config(config)
-
-    # Success case: test config without storage key is valid
-    config = get_mock_config()
-    config["fluxsite"]["pbs"].pop("storage")
-    check_config(config)
-
-    # Failure case: test missing required keys raises an exception
-    with pytest.raises(
-        ValueError,
-        match="Keys are missing from the config file: project, experiment",
-    ):
-        config = get_mock_config()
-        config.pop("project")
-        config.pop("experiment")
-        check_config(config)
-
-    # Failure case: test config with empty realisations key raises an exception
-    with pytest.raises(ValueError, match="The 'realisations' key cannot be empty."):
-        config = get_mock_config()
-        config["realisations"] = []
-        check_config(config)
-
-    # Failure case: test config with invalid experiment key raises an exception
-    with pytest.raises(
-        ValueError,
-        match="The 'experiment' key is invalid.\n"
-        "Valid experiments are: "
-        + ", ".join(
-            list(internal.MEORG_EXPERIMENTS)
-            + internal.MEORG_EXPERIMENTS["five-site-test"]
-        ),
-    ):
-        config = get_mock_config()
-        config["experiment"] = "foo"
-        check_config(config)
-
-    # Failure case: test config with invalid experiment key (not a subset of
-    # five-site-test) raises an exception
-    with pytest.raises(
-        ValueError,
-        match="The 'experiment' key is invalid.\n"
-        "Valid experiments are: "
-        + ", ".join(
-            list(internal.MEORG_EXPERIMENTS)
-            + internal.MEORG_EXPERIMENTS["five-site-test"]
-        ),
-    ):
-        config = get_mock_config()
-        config["experiment"] = "CH-Dav"
-        check_config(config)
-
-    # Failure case: 'path' key is missing in branch configuration
-    with pytest.raises(
-        ValueError, match="Realisation '1' must specify the `path` field."
-    ):
-        config = get_mock_config()
-        config["realisations"][1].pop("path")
-        check_config(config)
-
-    # Failure case: test config with empty science_configurations key
-    # raises an exception
-    with pytest.raises(
-        ValueError, match="The 'science_configurations' key cannot be empty."
-    ):
-        config = get_mock_config()
-        config["science_configurations"] = []
-        check_config(config)
-
-    # Failure case: project key is not a string
-    with pytest.raises(TypeError, match="The 'project' key must be a string."):
-        config = get_mock_config()
-        config["project"] = 123
-        check_config(config)
-
-    # Failure case: realisations key is not a list
-    with pytest.raises(TypeError, match="The 'realisations' key must be a list."):
-        config = get_mock_config()
-        config["realisations"] = {"foo": "bar"}
-        check_config(config)
-
-    # Failure case: realisations key is not a list of dict
-    with pytest.raises(TypeError, match="Realisation '0' must be a dictionary object."):
-        config = get_mock_config()
-        config["realisations"] = ["foo"]
-        check_config(config)
-
-    # Failure case: type of name is not a string
-    with pytest.raises(
-        TypeError, match="The 'name' field in realisation '1' must be a string."
-    ):
-        config = get_mock_config()
-        config["realisations"][1]["name"] = 1234
-        check_config(config)
-
-    # Failure case: type of path is not a string
-    with pytest.raises(
-        TypeError, match="The 'path' field in realisation '1' must be a string."
-    ):
-        config = get_mock_config()
-        config["realisations"][1]["path"] = 1234
-        check_config(config)
-
-    # Failure case: type of revision key is not an integer
-    with pytest.raises(
-        TypeError, match="The 'revision' field in realisation '1' must be an integer."
-    ):
-        config = get_mock_config()
-        config["realisations"][1]["revision"] = "-1"
-        check_config(config)
-
-    # Failure case: type of patch key is not a dictionary
-    with pytest.raises(
-        TypeError,
-        match="The 'patch' field in realisation '1' must be a dictionary that is "
-        "compatible with the f90nml python package.",
-    ):
-        config = get_mock_config()
-        config["realisations"][1]["patch"] = r"cable_user%ENABLE_SOME_FEATURE = .FALSE."
-        check_config(config)
-
-    # Failure case: type of patch_remove key is not a dictionary
-    with pytest.raises(
-        TypeError,
-        match="The 'patch_remove' field in realisation '1' must be a dictionary that is "
-        "compatible with the f90nml python package.",
-    ):
-        config = get_mock_config()
-        config["realisations"][1]["patch_remove"] = r"cable_user%ENABLE_SOME_FEATURE"
-        check_config(config)
-
-    # Failure case: type of build_script key is not a string
-    with pytest.raises(
-        TypeError, match="The 'build_script' field in realisation '1' must be a string."
-    ):
-        config = get_mock_config()
-        config["realisations"][1]["build_script"] = ["echo", "hello"]
-        check_config(config)
-
-    # Failure case: modules key is not a list
-    with pytest.raises(TypeError, match="The 'modules' key must be a list."):
-        config = get_mock_config()
-        config["modules"] = "netcdf"
-        check_config(config)
-
-    # Failure case: experiment key is not a string
-    with pytest.raises(TypeError, match="The 'experiment' key must be a string."):
-        config = get_mock_config()
-        config["experiment"] = 0
-        check_config(config)
-
-    # Failure case: type of config["science_configurations"] is not a list
-    with pytest.raises(
-        TypeError, match="The 'science_configurations' key must be a list."
-    ):
-        config = get_mock_config()
-        config["science_configurations"] = r"cable_user%GS_SWITCH = 'medlyn'"
-        check_config(config)
-
-    # Failure case: type of config["science_configurations"] is not a list of dict
-    with pytest.raises(
-        TypeError,
-        match="Science config settings must be specified using a dictionary "
-        "that is compatible with the f90nml python package.",
-    ):
-        config = get_mock_config()
-        config["science_configurations"] = [r"cable_user%GS_SWITCH = 'medlyn'"]
-        check_config(config)
-
-    # Failure case: type of config["fluxsite"] is not a dict
-    with pytest.raises(TypeError, match="The 'fluxsite' key must be a dictionary."):
-        config = get_mock_config()
-        config["fluxsite"] = ["ncpus: 16\nmem: 64GB\n"]
-        check_config(config)
-
-    # Failure case: type of config["pbs"] is not a dict
-    with pytest.raises(TypeError, match="The 'pbs' key must be a dictionary."):
-        config = get_mock_config()
-        config["fluxsite"]["pbs"] = "-l ncpus=16"
-        check_config(config)
-
-    # Failure case: type of config["pbs"]["ncpus"] is not an int
-    with pytest.raises(TypeError, match="The 'ncpus' key must be an integer."):
-        config = get_mock_config()
-        config["fluxsite"]["pbs"]["ncpus"] = "16"
-        check_config(config)
-
-    # Failure case: type of config["pbs"]["mem"] is not a string
-    with pytest.raises(TypeError, match="The 'mem' key must be a string."):
-        config = get_mock_config()
-        config["fluxsite"]["pbs"]["mem"] = 64
-        check_config(config)
+import benchcab.utils as bu
+import benchcab.config as bc
 
-    # Failure case: type of config["pbs"]["walltime"] is not a string
-    with pytest.raises(TypeError, match="The 'walltime' key must be a string."):
-        config = get_mock_config()
-        config["fluxsite"]["pbs"]["walltime"] = 60
-        check_config(config)
 
-    # Failure case: type of config["pbs"]["storage"] is not a list
-    with pytest.raises(TypeError, match="The 'storage' key must be a list of strings."):
-        config = get_mock_config()
-        config["fluxsite"]["pbs"]["storage"] = "gdata/foo+gdata/bar"
-        check_config(config)
+def test_read_config_pass():
+    """Test read_config() passes as expected."""
+    existent_path = bu.get_installed_root() / 'data' / 'test' / 'config-valid.yml'
+    
+    # Test for a path that exists
+    config = bc.read_config(existent_path)
+    assert config
 
-    # Failure case: type of config["pbs"]["storage"] is not a list of strings
-    with pytest.raises(TypeError, match="The 'storage' key must be a list of strings."):
-        config = get_mock_config()
-        config["fluxsite"]["pbs"]["storage"] = [1, 2, 3]
-        check_config(config)
 
-    # Failure case: type of config["multiprocessing"] is not a bool
-    with pytest.raises(TypeError, match="The 'multiprocessing' key must be a boolean."):
-        config = get_mock_config()
-        config["fluxsite"]["multiprocessing"] = 1
-        check_config(config)
+def test_read_config_fail():
+    """Test that read_config() fails as expected."""
+    nonexistent_path = bu.get_installed_root() / 'data' / 'test' / 'config-missing.yml'
 
+    # Test for a path that does not exist.
+    with pytest.raises(FileNotFoundError):
+        config = bc.read_config(nonexistent_path)
 
-def test_read_config():
-    """Tests for `read_config()`."""
 
-    # Success case: write config to file, then read config from file
-    config = get_mock_config()
-    filename = TMP_DIR / "config-barebones.yaml"
+def test_validate_config_valid():
+    """Test validate_config() for a valid confiog file."""
+    valid_config = bu.load_package_data('test/config-valid.yml')
+    assert bc.validate_config(valid_config)
 
-    with open(filename, "w", encoding="utf-8") as file:
-        yaml.dump(config, file)
 
-    res = read_config(filename)
-    os.remove(filename)
-    assert config == res
+def test_validate_config_invalid():
+    """Test validate_config() for an invalid config file."""
+    invalid_config = bu.load_package_data('test/config-invalid.yml')
+    with pytest.raises(bc.ConfigValidationException):
+        bc.validate_config(invalid_config)
\ No newline at end of file
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 00000000..af8d9554
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,23 @@
+"""Tests for utilities."""
+import pytest
+import benchcab.utils as bu
+
+
+def test_get_installed_root():
+    """Test get_installed_root()."""
+
+    # Test it actually returns something. We should be able to mock this.
+    assert bu.get_installed_root()
+
+
+def test_load_package_data_pass():
+    """Test load_package_data() passes as expected."""
+    
+    assert isinstance(bu.load_package_data('config-schema.yml'), dict)
+
+
+def test_load_package_data_fail():
+    """Test load_package_data() fails as expected."""
+    
+    with pytest.raises(FileNotFoundError):
+        missing = bu.load_package_data('config-missing.yml'), dict()
\ No newline at end of file

From d0379a800739a9473b499253f9ae7b3d4b0dbe6c Mon Sep 17 00:00:00 2001
From: Ben Schroeter <benjschroeter@gmail.com>
Date: Fri, 6 Oct 2023 11:23:54 +1100
Subject: [PATCH 2/6] Removing JSON version of schema, YML is cleaner.

---
 benchcab/data/config-schema.json | 89 --------------------------------
 1 file changed, 89 deletions(-)
 delete mode 100644 benchcab/data/config-schema.json

diff --git a/benchcab/data/config-schema.json b/benchcab/data/config-schema.json
deleted file mode 100644
index 1c4b468f..00000000
--- a/benchcab/data/config-schema.json
+++ /dev/null
@@ -1,89 +0,0 @@
-{
-    "project": {
-        "type": "integer"
-    },
-    "modules": {
-        "type": "list",
-        "schema": {
-            "type": "string"
-        }
-    },
-    "experiment": {
-        "type": "string",
-        "allowed": [
-            "five-site-test",
-            "forty-two-site-test",
-            "AU-Tum",
-            "AU-How",
-            "FI-Hyy",
-            "US-Var",
-            "US-Whs"
-        ]
-    },
-    "science_configurations": {
-        "type": "list",
-        "schema": {
-            "type": "dict"
-        }
-    },
-    "realisations": {
-        "type": "list",
-        "required": true,
-        "schema": {
-            "type": "dict",
-            "schema": {
-                "path": {
-                    "type": "string"
-                },
-                "name": {
-                    "type": "string",
-                    "required": false
-                },
-                "build_script": {
-                    "type": "string",
-                    "required": false
-                },
-                "revision": {
-                    "type": "string",
-                    "required": false
-                },
-                "patch": {
-                    "type": "dict",
-                    "required": false
-                }
-            }
-        }
-    },
-    "fluxsite": {
-        "type": "dict",
-        "required": false,
-        "schema": {
-            "multiprocessing": {
-                "type": "boolean"
-            },
-            "pbs": {
-                "type": "dict",
-                "schema": {
-                    "ncpus": {
-                        "type": "integer",
-                        "required": false
-                    },
-                    "mem": {
-                        "type": "integer",
-                        "required": false
-                    },
-                    "walltime": {
-                        "type": "string",
-                        "required": false
-                    },
-                    "storage": {
-                        "type": "list",
-                        "schema": {
-                            "type": "string"
-                        }
-                    }
-                }
-            }
-        }
-    }
-}

From d71de8168991fbb72e76909e044ace62140e2117 Mon Sep 17 00:00:00 2001
From: Ben Schroeter <benjschroeter@gmail.com>
Date: Mon, 9 Oct 2023 09:53:10 +1100
Subject: [PATCH 3/6] Pinned cerberus version in meta now that it has been
 published. Fixes #143.

---
 .conda/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.conda/meta.yaml b/.conda/meta.yaml
index 6745ab42..21cd03c7 100644
--- a/.conda/meta.yaml
+++ b/.conda/meta.yaml
@@ -26,4 +26,4 @@ requirements:
         - PyYAML
         - f90nml
         - flatdict
-        - cerberus >= 1.3.5
+        - cerberus >=1.3.5

From 0b36493f81fbd75c89829c8dc4d65ac6d7968609 Mon Sep 17 00:00:00 2001
From: Ben Schroeter <benjschroeter@gmail.com>
Date: Wed, 18 Oct 2023 12:24:02 +1100
Subject: [PATCH 4/6] Added fixes for PR. Fixes #143

---
 benchcab/data/config-schema.yml | 20 +++++++++++++++-----
 benchcab/utils/__init__.py      |  3 ---
 setup.cfg                       |  2 +-
 tests/test_config.py            |  2 +-
 tests/test_utils.py             |  2 +-
 5 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/benchcab/data/config-schema.yml b/benchcab/data/config-schema.yml
index 37ff37d5..a9325ac5 100644
--- a/benchcab/data/config-schema.yml
+++ b/benchcab/data/config-schema.yml
@@ -18,7 +18,7 @@ experiment:
     "US-Whs"
   ]
 
-science_configuration:
+science_configurations:
   type: "list"
   schema: 
     type: "dict"
@@ -43,6 +43,9 @@ realisations:
       patch:
         type: "dict"
         required: false
+      patch_remove:
+        type: "dict"
+        required: false
 
 fluxsite:
   type: "dict"
@@ -50,6 +53,7 @@ fluxsite:
   schema:
     multiprocessing:
       type: "boolean"
+      required: false
     pbs:
       type: "dict"
       schema:
@@ -57,11 +61,17 @@ fluxsite:
           type: "integer"
           required: false
         mem:
-          type: "integer"
+          type: "string"
+          regex: "^[0-9]+(?i)(mb|gb)$"
           required: false
         walltime:
-          type: "integer"
+          type: "string"
+          regex: "^[0-4][0-9]:[0-5][0-9]:[0-5][0-9]$"
           required: false
         storage:
-          type: "integer"
-          required: false
\ No newline at end of file
+          type: list
+          required: false
+          schema:
+            type: "string"
+            required: false
+            
\ No newline at end of file
diff --git a/benchcab/utils/__init__.py b/benchcab/utils/__init__.py
index 01510edb..59095572 100644
--- a/benchcab/utils/__init__.py
+++ b/benchcab/utils/__init__.py
@@ -39,9 +39,6 @@ def load_package_data(filename: str) -> dict:
     # Alias yaml and yml.
     ext = ext if ext != 'yaml' else 'yml'
 
-    # Make sure it is one of the supported types.
-    assert ext in PACKAGE_DATA_DECODERS.keys()
-
     # Extract from the installations data directory.
     raw = pkgutil.get_data('benchcab', os.path.join('data', filename)).decode('utf-8')
 
diff --git a/setup.cfg b/setup.cfg
index 2a878959..13bc5ab3 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,7 +1,7 @@
 [metadata]
 name=benchcab
 summary= Software to run a benchmarking suite for CABLE LSM
-# version=2.0.0
+version=2.0.0
 description=To benchmark CABLE simulations
 url=https://github.com/CABLE-LSM/benchcab
 license=Apache 2.0
diff --git a/tests/test_config.py b/tests/test_config.py
index 95076507..b6e0d8a8 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -23,7 +23,7 @@ def test_read_config_fail():
 
 
 def test_validate_config_valid():
-    """Test validate_config() for a valid confiog file."""
+    """Test validate_config() for a valid config file."""
     valid_config = bu.load_package_data('test/config-valid.yml')
     assert bc.validate_config(valid_config)
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index af8d9554..848161db 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -20,4 +20,4 @@ def test_load_package_data_fail():
     """Test load_package_data() fails as expected."""
     
     with pytest.raises(FileNotFoundError):
-        missing = bu.load_package_data('config-missing.yml'), dict()
\ No newline at end of file
+        missing = bu.load_package_data('config-missing.yml')
\ No newline at end of file

From 4368e222be0c05caa0b97c3d305b29ccb8358c6a Mon Sep 17 00:00:00 2001
From: Ben Schroeter <benjschroeter@gmail.com>
Date: Tue, 24 Oct 2023 12:06:53 +1100
Subject: [PATCH 5/6] Added integration tests. Fixes #143.

---
 benchcab/data/test/integration.sh | 38 +++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 benchcab/data/test/integration.sh

diff --git a/benchcab/data/test/integration.sh b/benchcab/data/test/integration.sh
new file mode 100644
index 00000000..f261217f
--- /dev/null
+++ b/benchcab/data/test/integration.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+set -ex
+
+TEST_DIR=/scratch/rp23/$USER/benchcab/integration
+EXAMPLE_REPO="git@github.com:CABLE-LSM/bench_example.git"
+
+# Remove the test work space, then recreate
+rm -rf $TEST_DIR
+mkdir -p $TEST_DIR
+
+# Clone the example repo
+git clone $EXAMPLE_REPO $TEST_DIR
+cd $TEST_DIR
+git reset --hard 6287539e96fc8ef36dc578201fbf9847314147fb
+
+cat > config.yaml << EOL
+project: rp23
+
+experiment: AU-Tum
+
+realisations:
+  - path: trunk
+  - path: branches/Users/sb8430/test-branch
+
+modules: [
+  intel-compiler/2021.1.1,
+  netcdf/4.7.4,
+  openmpi/4.1.0
+]
+
+fluxsite:
+    pbs:
+        storage:
+            - scratch/rp23
+EOL
+
+benchcab run -v
\ No newline at end of file

From b8a92f935fad7b479acd0385e1505a9891bb45dd Mon Sep 17 00:00:00 2001
From: Ben Schroeter <benjschroeter@gmail.com>
Date: Tue, 24 Oct 2023 12:51:26 +1100
Subject: [PATCH 6/6] Apply suggestions from code review

Co-authored-by: Sean Bryan <39685865+SeanBryan51@users.noreply.github.com>
---
 benchcab/data/test/integration.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchcab/data/test/integration.sh b/benchcab/data/test/integration.sh
index f261217f..f3440cbe 100644
--- a/benchcab/data/test/integration.sh
+++ b/benchcab/data/test/integration.sh
@@ -2,7 +2,7 @@
 
 set -ex
 
-TEST_DIR=/scratch/rp23/$USER/benchcab/integration
+TEST_DIR=/scratch/$PROJECT/$USER/benchcab/integration
 EXAMPLE_REPO="git@github.com:CABLE-LSM/bench_example.git"
 
 # Remove the test work space, then recreate
@@ -15,7 +15,7 @@ cd $TEST_DIR
 git reset --hard 6287539e96fc8ef36dc578201fbf9847314147fb
 
 cat > config.yaml << EOL
-project: rp23
+project: $PROJECT
 
 experiment: AU-Tum
 
@@ -32,7 +32,7 @@ modules: [
 fluxsite:
     pbs:
         storage:
-            - scratch/rp23
+            - scratch/$PROJECT
 EOL
 
 benchcab run -v
\ No newline at end of file