
Add command anemoi-training train ... #3

Closed
wants to merge 14 commits into from
24 changes: 24 additions & 0 deletions docs/cli/introduction.rst
@@ -0,0 +1,24 @@
Introduction
============

When you install the `anemoi-training` package, this also installs a
command line tool called ``anemoi-training`` which can be used to train
models.

The tool provides help with the ``--help`` option:

.. code-block:: bash

% anemoi-training --help

The commands are:

.. toctree::
:maxdepth: 1

train

.. argparse::
:module: anemoi.training.__main__
:func: create_parser
:prog: anemoi-training
:nosubcommands:
49 changes: 49 additions & 0 deletions docs/cli/train.rst
@@ -0,0 +1,49 @@
#######
train
#######

Use this command to train a model:

.. code:: bash

% anemoi-training train config.yaml

The command will read the default configuration and override it with the
values in the provided configuration file. The configuration file should
be a YAML file with the structure defined in the `Configuration`
section. The file `config.yaml` typically describes the model to be
trained, the dataset to be used, and the training hyperparameters:

.. literalinclude:: train.yaml
:language: yaml

You can provide more than one configuration file, in which case the
values will be merged in the order they are provided. A typical usage
would be to split the training configuration into model description,
training hyperparameters and runtime options:

.. code:: bash

% anemoi-training train model.yaml hyperparameters.yaml slurm.yaml
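
The merge is a deep merge in the style of OmegaConf: later files win on
conflicting keys, while unrelated keys from every file survive. A minimal
pure-Python sketch of that behaviour (illustrative only, not the tool's
actual code path; the key names are taken from the example configuration):

.. code:: python

   def deep_merge(base: dict, override: dict) -> dict:
       """Recursively merge two config dicts; keys in `override` win."""
       merged = dict(base)
       for key, value in override.items():
           if key in merged and isinstance(merged[key], dict) and isinstance(value, dict):
               merged[key] = deep_merge(merged[key], value)
           else:
               merged[key] = value
       return merged

   model = {"model": {"num_channels": 128}, "training": {"max_epochs": 3}}
   hyper = {"training": {"max_epochs": 10}}
   print(deep_merge(model, hyper))
   # {'model': {'num_channels': 128}, 'training': {'max_epochs': 10}}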

Furthermore, you can also provide values directly on the command line,
which will override any values in the configuration files:

.. code:: bash

% anemoi-training train config.yaml tracker.mlflow.tracking_uri=http://localhost:5000
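
Each dotted key expands into a nested configuration entry before being
merged, along the lines of OmegaConf's dotlist parsing. A small sketch of
the expansion (illustrative only):

.. code:: python

   def expand_dotkey(assignment: str) -> dict:
       """Expand 'a.b.c=value' into {'a': {'b': {'c': 'value'}}}."""
       dotkey, value = assignment.split("=", 1)
       nested = value
       for part in reversed(dotkey.split(".")):
           nested = {part: nested}
       return nested

   print(expand_dotkey("tracker.mlflow.tracking_uri=http://localhost:5000"))
   # {'tracker': {'mlflow': {'tracking_uri': 'http://localhost:5000'}}}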

If the file `~/.config/anemoi/train.yaml` exists, it will be loaded
after the defaults and before any other configuration file. This allows
you to provide values such as passwords or other sensitive information
that you do not want to store in a git repository.
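
For example, a user-level file holding a tracker token might look like
this (the key names below are illustrative, mirroring the example
configuration):

.. code:: yaml

   token:
     mlflow: "my-secret-token"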

********************
Command line usage
********************

.. argparse::
:module: anemoi.training.__main__
:func: create_parser
:prog: anemoi-training
:path: train
2 changes: 2 additions & 0 deletions docs/cli/train.yaml
Is this "just an example"?

@@ -0,0 +1,2 @@
training:
max_epochs: 10
13 changes: 13 additions & 0 deletions docs/index.rst
@@ -30,6 +30,19 @@ of the *Anemoi* packages.

installing

**Command line tool**

- :doc:`cli/introduction`
- :doc:`cli/train`

.. toctree::
:maxdepth: 1
:hidden:
:caption: Command line tool

cli/introduction
cli/train

*****************
Anemoi packages
*****************
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -51,7 +51,7 @@ dynamic = [
]
dependencies = [
"anemoi-datasets[data]>=0.1",
"anemoi-models @ git+https://github.com/ecmwf/anemoi-models.git",
"anemoi-models",
"anemoi-utils[provenance]>=0.1.3",
"einops>=0.6.1",
"hydra-core>=1.3",
168 changes: 168 additions & 0 deletions src/anemoi/training/commands/train.py
@@ -0,0 +1,168 @@
#!/usr/bin/env python
# (C) Copyright 2024 ECMWF.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.
#


import json
import logging
import os
import re

import hydra
from anemoi.utils.config import config_path
from hydra.errors import ConfigCompositionException
from omegaconf import OmegaConf

from . import Command

LOGGER = logging.getLogger(__name__)

# https://hydra.cc/docs/advanced/override_grammar/basic/

override_regex = re.compile(
r"""
^
(
(~|\+|\+\+)? # optional prefix
(\w+)([/@:\.]\w+)* # key
= # assignment
(.*) # value
)
| # or
(~ # ~ prefix
(\w+)([/@:\.]\w+)* # key
)
$
""",
re.VERBOSE,
)


def apply_delete_override(cfg, dotkey, value, value_given):

any_value = object()

if not value_given:
assert value is None
value = any_value

current = OmegaConf.select(cfg, dotkey, throw_on_missing=False)
if value not in (any_value, current):
raise ConfigCompositionException(
f"Key '{dotkey}' with value '{current}' does not match the value '{value}' in the override"
)

try:
# Allow 'del'
OmegaConf.set_struct(cfg, False)

if "." in dotkey:
parent, key = dotkey.rsplit(".", 1)
subtree = OmegaConf.select(cfg, parent)
del subtree[key]
else:
# Top level key
del cfg[dotkey]

finally:
OmegaConf.set_struct(cfg, True)


def apply_add_override_force(cfg, dotkey, value):
OmegaConf.update(cfg, dotkey, value, merge=True, force_add=True)


def apply_add_override(cfg, dotkey, value):
current = OmegaConf.select(cfg, dotkey, throw_on_missing=False)
if current is not None:
raise ConfigCompositionException(f"Cannot add key '{dotkey}' because it already exists, use '++' to force add")

OmegaConf.update(cfg, dotkey, value, merge=True, force_add=True)


def apply_assign_override(cfg, dotkey, value):
OmegaConf.update(cfg, dotkey, value, merge=True)


def parse_override(override, n):
dotkey = override[n:]
parsed = OmegaConf.from_dotlist([dotkey])
dotkey = dotkey.split("=")[0]
value = OmegaConf.select(parsed, dotkey)
return dotkey, value


def apply_override(cfg, override):
if override.startswith("~"):
return apply_delete_override(cfg, *parse_override(override, 1), value_given="=" in override)

if override.startswith("++"):
return apply_add_override_force(cfg, *parse_override(override, 2))

if override.startswith("+"):
return apply_add_override(cfg, *parse_override(override, 1))

return apply_assign_override(cfg, *parse_override(override, 0))


class Train(Command):

def add_arguments(self, command_parser):
command_parser.add_argument(
"config",
nargs="*",
type=str,
help="A list of yaml files to load or a list of overrides to apply",
)

def run(self, args):

configs = []
overrides = []

for config in args.config:
if override_regex.match(config):
overrides.append(config)
elif config.endswith(".yaml") or config.endswith(".yml"):
configs.append(config)
else:
raise ValueError(f"Invalid config '{config}'. It must be a yaml file or an override")

hydra.initialize(config_path="../config", version_base=None)

cfg = hydra.compose(config_name="config")

# Add user config
user_config = config_path("train.yaml")

if os.path.exists(user_config):
This should use Pathlib

LOGGER.info(f"Loading config {user_config}")
cfg = OmegaConf.merge(cfg, OmegaConf.load(user_config))

# Add extra config files specified in the command line

for config in configs:
LOGGER.info(f"Loading config {config}")
cfg = OmegaConf.merge(cfg, OmegaConf.load(config))

# We need to reapply the overrides.
# OmegaConf does not implement the prefix logic; this is done by hydra.
for override in overrides:
LOGGER.info(f"Applying override {override}")
apply_override(cfg, override)

# Resolve the config
OmegaConf.resolve(cfg)

print(json.dumps(OmegaConf.to_container(cfg), indent=4))
This should probably use the logger if we want this as an output.


# AIFSTrainer(cfg).train()


command = Train
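As a quick sanity check on the override grammar, the same pattern
(copied here so the snippet is self-contained) distinguishes overrides
from plain file arguments; the candidate values are illustrative:

```python
import re

# Same grammar as override_regex in train.py: an optional ~ / + / ++
# prefix, a dotted (or /, @, : separated) key, and an optional =value.
override_regex = re.compile(
    r"""
    ^
    (
        (~|\+|\+\+)?        # optional prefix
        (\w+)([/@:\.]\w+)*  # key
        =                   # assignment
        (.*)                # value
    )
    |                       # or
    (~                      # ~ prefix
        (\w+)([/@:\.]\w+)*  # key
    )
    $
    """,
    re.VERBOSE,
)

for candidate in [
    "training.max_epochs=10",     # plain assignment
    "+model.hidden_size=256",     # add (fails if the key exists)
    "++model.hidden_size=256",    # force add
    "~dataloader.limit_batches",  # delete, no value check
    "config.yaml",                # not an override: treated as a file
]:
    print(candidate, "->", bool(override_regex.match(candidate)))
```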
Empty file.
16 changes: 16 additions & 0 deletions src/anemoi/training/config/config.yaml
@@ -0,0 +1,16 @@
defaults:
- _self_

model:
num_channels: 128

dataloader:
limit_batches:
training: 100
validation: 100

training:
max_epochs: 3

token:
I know this is just an early example, but fyi the token will not be stored in the user config (it will be in its own config file and generated by code with the anemoi-training mlflow login cmd, not input by the user)

mlflow: 8