This repository has been archived by the owner on Dec 20, 2024. It is now read-only.

Release 0.3.2 #187

Merged
merged 24 commits
Dec 19, 2024
Commits
6b45f8d
[Changelog] Update to 0.3.1 (#172)
github-actions[bot] Nov 28, 2024
7e363fa
full shuffle of the dataset (#153)
ssmmnn11 Nov 29, 2024
460b604
Feature: Option to set / change area weighting outside of graph-creat…
havardhhaugen Dec 2, 2024
bb30beb
Update sanity checks for training data consistency (#120)
JesperDramsch Dec 3, 2024
9d5aa2f
[pre-commit.ci] pre-commit autoupdate (#177)
pre-commit-ci[bot] Dec 3, 2024
65e9267
fix/remapper-without-imputer (#178)
sahahner Dec 3, 2024
5c4ac3f
[PROFILER] dont crash if an env var isnt found (#180)
cathalobrien Dec 4, 2024
891405e
Feature/transfer-learning (#166)
icedoom888 Dec 6, 2024
e002b8c
fix: allow None as graph save_path setting. (#181)
fprill Dec 6, 2024
2179a59
fix: remove saving of metadata for training ckpt (#190)
pinnstorm Dec 6, 2024
da26cb7
Fixes to callback plots (#182)
OpheliaMiralles Dec 10, 2024
2df18a7
[bugfix] loading only the weights of the checkpoint
icedoom888 Dec 10, 2024
7e4a5f7
Support masking of unconnected nodes (LAM) (#171)
JPXKQX Dec 13, 2024
881fa27
161 documentation for anemoi training broken (#197)
anaprietonem Dec 13, 2024
a2d8e6d
Feature: MSE metrics inside/outside regional domain for stretched gri…
jswijnands Dec 13, 2024
318c14c
Remove excess metadata from variables (#201)
HCookie Dec 13, 2024
00c20a4
Merge branch 'main' into develop
gmertes Dec 13, 2024
c99069e
Store numpy arrays in checkpoints (#174)
b8raoult Dec 13, 2024
90978df
Fix/stretched grid check (#204)
jswijnands Dec 17, 2024
15312f9
Fix/stretched grid check (#204)
jswijnands Dec 17, 2024
6dd537e
Fix 'all' validation metrics (#202)
HCookie Dec 18, 2024
7887b95
Fix/183 not saving last checkpoint when max steps area reached (#191)
anaprietonem Dec 18, 2024
38b75fa
fix(config): Default configs for Stretched & Limited Area Graphs (#173)
JPXKQX Dec 18, 2024
638d1b4
fix: set anemoi-models=0.4.1 as the minimum required version (#209)
JPXKQX Dec 19, 2024
5 changes: 2 additions & 3 deletions .pre-commit-config.yaml
@@ -40,7 +40,7 @@ repos:
- --force-single-line-imports
- --profile black
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.7.2
rev: v0.8.1
hooks:
- id: ruff
args:
@@ -64,7 +64,7 @@ repos:
hooks:
- id: pyproject-fmt
- repo: https://github.com/jshwi/docsig # Check docstrings against function sig
rev: v0.64.0
rev: v0.65.0
hooks:
- id: docsig
args:
@@ -74,6 +74,5 @@ repos:
- --check-protected # Check protected methods
- --check-class # Check class docstrings
- --disable=E113 # Disable empty docstrings
- --summary # Print a summary
ci:
autoupdate_schedule: monthly
49 changes: 39 additions & 10 deletions CHANGELOG.md
@@ -8,11 +8,41 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
Please add your functional changes to the appropriate section in the PR.
Keep it human-readable, your future self will thank you!

## [Unreleased](https://github.com/ecmwf/anemoi-training/compare/0.3.0...HEAD)
## [Unreleased](https://github.com/ecmwf/anemoi-training/compare/0.3.1...HEAD)
### Fixed
- Update `n_pixel` used by datashader to better adapt across resolutions #152
- Do not update the NaN-weight mask for the loss function when using a remapper without an imputer [#178](https://github.com/ecmwf/anemoi-training/pull/178)
- Don't crash when using the profiler if certain env vars aren't set [#180](https://github.com/ecmwf/anemoi-training/pull/180)
- Remove saving of metadata to training checkpoint [#190](https://github.com/ecmwf/anemoi-training/pull/190)
- Fixes to callback plots [#182](https://github.com/ecmwf/anemoi-training/pull/182) (power spectrum large numpy array error + precip cmap for cases where precip is prognostic)
- GraphTrainableParameters callback will log a warning when no trainable parameters are specified [#173](https://github.com/ecmwf/anemoi-training/pull/173)
- Fixes to checkpoint saving - ensure the last checkpoint is saved when using max_steps [#191](https://github.com/ecmwf/anemoi-training/pull/191)
- Identify stretched grid models based on the graph rather than the configuration file [#204](https://github.com/ecmwf/anemoi-training/pull/204)

### Added
- Introduce config variable `transfer_learning` (bool): set to True when loading a checkpoint in a transfer-learning setting.
- **Transfer learning**: new functionality enabled. You can now load checkpoints from different models and different training runs [#166](https://github.com/ecmwf/anemoi-training/pull/166).
- Effective batch size: `(config.dataloader.batch_size["training"] * config.hardware.num_gpus_per_node * config.hardware.num_nodes) // config.hardware.num_gpus_per_model`.
Used for experiment reproducibility across different computing configurations.
- Added a check for the variable sorting on pre-trained/finetuned models [#120](https://github.com/ecmwf/anemoi-training/pull/120)
- Added default configuration files for stretched grid and limited area model experiments [#173](https://github.com/ecmwf/anemoi-training/pull/173)
- Added new metrics for stretched grid models to track losses inside/outside the regional domain [#199](https://github.com/ecmwf/anemoi-training/pull/199)
- Add supporting arrays (numpy) to checkpoint [#174](https://github.com/ecmwf/anemoi-training/pull/174)
- Support for masking out unconnected nodes in LAM [#171](https://github.com/ecmwf/anemoi-training/pull/171)
- Improved validation metrics, allow 'all' to be scaled [#202](https://github.com/ecmwf/anemoi-training/pull/202)
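
The effective-batch-size formula quoted above can be sketched as a small helper (parameter names mirror the config keys; this is an illustrative sketch, not part of the codebase):

```python
def effective_batch_size(batch_size_training: int,
                         num_gpus_per_node: int,
                         num_nodes: int,
                         num_gpus_per_model: int) -> int:
    """Effective batch size as described in the changelog entry above."""
    # Total GPUs divided by the GPUs a single model shard spans.
    return (batch_size_training * num_gpus_per_node * num_nodes) // num_gpus_per_model

# e.g. per-GPU batch size 2, 4 GPUs per node, 2 nodes, model sharded over 2 GPUs
effective_batch_size(2, 4, 2, 2)  # -> 8
```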

### Changed

### Removed
- Removed the resolution config entry [#120](https://github.com/ecmwf/anemoi-training/pull/120)

## [0.3.1 - AIFS v0.3 Compatibility](https://github.com/ecmwf/anemoi-training/compare/0.3.0...0.3.1) - 2024-11-28

### Changed
- Perform full shuffle of training dataset [#153](https://github.com/ecmwf/anemoi-training/pull/153)

### Fixed

- Update `n_pixel` used by datashader to better adapt across resolutions [#152](https://github.com/ecmwf/anemoi-training/pull/152)
- Fixed bug in power spectra plotting for the n320 resolution.
- Allow histogram and spectrum plot for one variable [#165](https://github.com/ecmwf/anemoi-training/pull/165)

@@ -21,22 +51,17 @@ Keep it human-readable, your future self will thank you!
- Add reader groups to reduce CPU memory usage and increase dataloader throughput [#76](https://github.com/ecmwf/anemoi-training/pull/76)
- Bump `anemoi-graphs` version to 0.4.1 [#159](https://github.com/ecmwf/anemoi-training/pull/159)

### Changed
## [0.3.0 - Loss & Callback Refactors](https://github.com/ecmwf/anemoi-training/compare/0.2.2...0.3.0) - 2024-11-14

### Changed
- Increase the default MlFlow HTTP max retries [#111](https://github.com/ecmwf/anemoi-training/pull/111)
## [0.3.0 - Loss & Callback Refactors](https://github.com/ecmwf/anemoi-training/compare/0.2.2...0.3.0) - 2024-11-14

### Fixed

- Rename loss_scaling to variable_loss_scaling [#138](https://github.com/ecmwf/anemoi-training/pull/138)
- Refactored callbacks. [#60](https://github.com/ecmwf/anemoi-training/pulls/60)
- Updated docs [#115](https://github.com/ecmwf/anemoi-training/pull/115)
- Fix enabling LearningRateMonitor [#119](https://github.com/ecmwf/anemoi-training/pull/119)

- Refactored rollout [#87](https://github.com/ecmwf/anemoi-training/pulls/87)
- Enable longer validation rollout than training

- Expand iterables in logging [#91](https://github.com/ecmwf/anemoi-training/pull/91)
- Save entire config in mlflow

@@ -45,23 +70,27 @@ Keep it human-readable, your future self will thank you!

- Included more loss functions and allowed configuration [#70](https://github.com/ecmwf/anemoi-training/pull/70)
- Include option to use datashader and optimised asynchronous callbacks [#102](https://github.com/ecmwf/anemoi-training/pull/102)
- Fix that applies the metric_ranges in the post-processed variable space [#116](https://github.com/ecmwf/anemoi-training/pull/116)
- Fix that applies the metric_ranges in the post-processed variable space [#116](https://github.com/ecmwf/anemoi-training/pull/116)
- Allow updates to scalars [#137](https://github.com/ecmwf/anemoi-training/pulls/137)
- Add without subsetting in ScaleTensor

- Sub-hour datasets [#63](https://github.com/ecmwf/anemoi-training/pull/63)
- Add synchronisation workflow [#92](https://github.com/ecmwf/anemoi-training/pull/92)
- Feat: Anemoi Profiler compatible with mlflow and using Pytorch (Kineto) Profiler for memory report [#38](https://github.com/ecmwf/anemoi-training/pull/38/)
- Feat: Save a gif for longer rollouts in validation [#65](https://github.com/ecmwf/anemoi-training/pull/65)
- New limited area config file added, limited_area.yaml. [#134](https://github.com/ecmwf/anemoi-training/pull/134/)
- New stretched grid config added, stretched_grid.yaml [#133](https://github.com/ecmwf/anemoi-training/pull/133)
- Functionality to change the weight attribute of nodes in the graph at the start of training without re-generating the graph. [#136](https://github.com/ecmwf/anemoi-training/pull/136)
- Custom System monitor for Nvidia and AMD GPUs [#147](https://github.com/ecmwf/anemoi-training/pull/147)


### Changed

- Renamed frequency keys in callbacks configuration. [#118](https://github.com/ecmwf/anemoi-training/pull/118)
- Modified training configuration to support max_steps and tied lr iterations to max_steps by default [#67](https://github.com/ecmwf/anemoi-training/pull/67)
- Merged node & edge trainable feature callbacks into one. [#135](https://github.com/ecmwf/anemoi-training/pull/135)
- Increase the default MlFlow HTTP max retries [#111](https://github.com/ecmwf/anemoi-training/pull/111)

### Removed

## [0.2.2 - Maintenance: pin python <3.13](https://github.com/ecmwf/anemoi-training/compare/0.2.1...0.2.2) - 2024-10-28

3 changes: 3 additions & 0 deletions README.md
@@ -1,5 +1,8 @@
# anemoi-training

[![Documentation Status](https://readthedocs.org/projects/anemoi-training/badge/?version=latest)](https://anemoi-training.readthedocs.io/en/latest/?badge=latest)


**DISCLAIMER**
This project is **BETA** and will be **Experimental** for the foreseeable future.
Interfaces and functionality are likely to change, and the project itself may be scrapped.
2 changes: 1 addition & 1 deletion docs/conf.py
@@ -42,7 +42,7 @@

author = "Anemoi contributors"

year = datetime.datetime.now(tz="UTC").year
year = datetime.datetime.now(tz=datetime.timezone.utc).year
years = "2024" if year == 2024 else f"2024-{year}"

copyright = f"{years}, Anemoi contributors" # noqa: A001
27 changes: 26 additions & 1 deletion docs/modules/losses.rst
@@ -73,11 +73,36 @@ Currently, the following scalars are available for use:
********************

Validation metrics as defined in the config file at
``config.training.validation_metrics`` follow the same initialise
``config.training.validation_metrics`` follow the same initialisation
behaviour as the loss function, but can be a list. In this case all
losses are calculated and logged as a dictionary with the corresponding
name.

Scaling Validation Losses
=========================

By default, validation metrics are **not** scaled by scalars across the
variable dimension, but they are scaled by all other scalars. If you
want to scale a validation metric by the variable weights, it must be
added to ``config.training.scale_validation_metrics``.

These metrics are then kept in the normalised, preprocessed space, so
the indexing of scalars aligns with the indexing of the tensors.

By default, only ``all`` is kept in the normalised space and scaled.

.. code:: yaml

   # List of validation metrics to keep in normalised space, and scalars to be applied.
   # Use '*' to reference all metrics, or a list of metric names.
   # Unlike above, variable scaling is possible because these metrics are
   # calculated in the same way as the training loss, within the internal
   # model space.
   scale_validation_metrics:
     scalars_to_apply: ['variable']
     metrics:
       - 'all'
       # - "*"

***********************
Custom Loss Functions
***********************
22 changes: 22 additions & 0 deletions docs/user-guide/training.rst
@@ -183,6 +183,28 @@ levels nearer to the surface). By default anemoi-training uses a ReLU
Pressure Level scaler with a minimum weighting of 0.2 (i.e. no pressure
level has a weighting less than 0.2).

The loss is also scaled by assigning a weight to each node on the output
grid. These weights are calculated during graph-creation and stored as
an attribute in the graph object. The configuration option
``config.training.node_loss_weights`` is used to specify the node
attribute used as weights in the loss function. By default
anemoi-training uses area weighting, where each node is weighted
according to the size of the geographical area it represents.

It is also possible to rescale the weight of a subset of nodes after
they are loaded from the graph. For instance, for a stretched grid setup
we can rescale the weight of nodes in the limited area such that their
sum equals 0.25 of the sum of all node weights with the following config
setup:

.. code:: yaml

   node_loss_weights:
     _target_: anemoi.training.losses.nodeweights.ReweightedGraphNodeAttribute
     target_nodes: data
     scaled_attribute: cutout
     weight_frac_of_total: 0.25
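
The rescaling behaviour described above can be sketched in plain Python (a hypothetical ``reweight_nodes`` helper, not the actual ``ReweightedGraphNodeAttribute`` implementation): weights inside the mask are scaled so their sum becomes ``weight_frac_of_total`` of the rescaled total, while weights outside the mask are left unchanged.

```python
def reweight_nodes(weights, mask, frac):
    """Scale weights[mask] so they sum to `frac` of the rescaled total.

    `weights` is a list of floats, `mask` a list of bools marking the
    limited-area nodes. Outside-mask weights are left unchanged.
    """
    w_out = sum(w for w, inside in zip(weights, mask) if not inside)
    w_in = sum(w for w, inside in zip(weights, mask) if inside)
    # The new inside sum s must satisfy s = frac * (s + w_out).
    target = frac * w_out / (1.0 - frac)
    scale = target / w_in
    return [w * scale if inside else w for w, inside in zip(weights, mask)]

w = reweight_nodes([1.0] * 8, [True] * 4 + [False] * 4, 0.25)
# the sum of the first four (masked) weights is now 25% of the total
```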

***************
Learning rate
***************
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -42,7 +42,7 @@ dynamic = [ "version" ]
dependencies = [
"anemoi-datasets>=0.5.2",
"anemoi-graphs>=0.4.1",
"anemoi-models>=0.3",
"anemoi-models>=0.4.1",
"anemoi-utils[provenance]>=0.4.4",
"datashader>=0.16.3",
"einops>=0.6.1",
3 changes: 1 addition & 2 deletions src/anemoi/training/config/data/zarr.yaml
@@ -1,5 +1,4 @@
format: zarr
resolution: o96
# Time frequency requested from dataset
frequency: 6h
# Time step of model (must be multiple of frequency)
@@ -82,5 +81,5 @@ processors:
# _convert_: all
# config: ${data.remapper}

# Values set in the code
# Values set in the code
num_features: null # number of features in the forecast state
6 changes: 6 additions & 0 deletions src/anemoi/training/config/dataloader/native_grid.yaml
@@ -40,6 +40,12 @@ limit_batches:
test: 20
predict: 20

# set a custom mask for grid points.
# Useful for LAM (dropping unconnected nodes from forcing dataset)
grid_indices:
  _target_: anemoi.training.data.grid_indices.FullGrid
  nodes_name: ${graph.data}

# ============
# Dataloader definitions
# These follow the anemoi-datasets patterns
1 change: 1 addition & 0 deletions src/anemoi/training/config/diagnostics/plot/detailed.yaml
@@ -44,6 +44,7 @@ callbacks:

- _target_: anemoi.training.diagnostics.callbacks.plot.PlotSpectrum
# every_n_batches: 100 # Override for batch frequency
# min_delta: 0.01 # Minimum distance between two consecutive points
sample_idx: ${diagnostics.plot.sample_idx}
parameters:
- z_500
19 changes: 16 additions & 3 deletions src/anemoi/training/config/graph/limited_area.yaml
@@ -17,7 +17,7 @@ nodes:
_target_: anemoi.graphs.nodes.LimitedAreaTriNodes # options: ZarrDatasetNodes, NPZFileNodes, TriNodes
resolution: 5 # grid resolution for npz (o32, o48, ...)
reference_node_name: ${graph.data}
mask_attr_name: cutout
mask_attr_name: cutout_mask

edges:
# Encoder configuration
@@ -26,6 +26,9 @@ edges:
edge_builders:
- _target_: anemoi.graphs.edges.CutOffEdges # options: KNNEdges, CutOffEdges
cutoff_factor: 0.6 # only for cutoff method
- _target_: anemoi.graphs.edges.CutOffEdges # connects only boundary nodes
cutoff_factor: 1.5 # only for cutoff method
source_mask_attr_name: boundary_mask
attributes: ${graph.attributes.edges}
# Processor configuration
- source_name: ${graph.hidden}
@@ -39,18 +42,28 @@ edges:
target_name: ${graph.data}
edge_builders:
- _target_: anemoi.graphs.edges.KNNEdges # options: KNNEdges, CutOffEdges
target_mask_attr_name: cutout
target_mask_attr_name: cutout_mask
num_nearest_neighbours: 3 # only for knn method
attributes: ${graph.attributes.edges}

post_processors:
  - _target_: anemoi.graphs.processors.RemoveUnconnectedNodes
    nodes_name: data
    ignore: cutout_mask # optional
    save_mask_indices_to_attr: indices_connected_nodes # optional

attributes:
nodes:
# Attributes for data nodes
area_weight:
_target_: anemoi.graphs.nodes.attributes.AreaWeights # options: Area, Uniform
norm: unit-max # options: l1, l2, unit-max, unit-sum, unit-std
cutout:
cutout_mask:
_target_: anemoi.graphs.nodes.attributes.CutOutMask
boundary_mask:
_target_: anemoi.graphs.nodes.attributes.BooleanNot
masks:
_target_: anemoi.graphs.nodes.attributes.CutOutMask
edges:
edge_length:
_target_: anemoi.graphs.edges.attributes.EdgeLength
18 changes: 8 additions & 10 deletions src/anemoi/training/config/graph/stretched_grid.yaml
@@ -11,12 +11,7 @@ nodes:
node_builder:
_target_: anemoi.graphs.nodes.ZarrDatasetNodes
dataset: ${dataloader.training.dataset}
attributes:
area_weight:
_target_: anemoi.graphs.nodes.attributes.AreaWeights
norm: unit-max
cutout:
_target_: anemoi.graphs.nodes.attributes.CutOutMask
attributes: ${graph.attributes.nodes}
hidden:
node_builder:
_target_: anemoi.graphs.nodes.StretchedTriNodes
@@ -25,10 +20,6 @@ nodes:
reference_node_name: ${graph.data}
mask_attr_name: cutout
margin_radius_km: 11
attributes:
area_weights:
_target_: anemoi.graphs.nodes.attributes.AreaWeights
norm: unit-max

edges:
# Encoder
@@ -54,6 +45,13 @@ edges:
attributes: ${graph.attributes.edges}

attributes:
nodes:
# Attributes for data nodes
area_weight:
_target_: anemoi.graphs.nodes.attributes.AreaWeights
norm: unit-max
cutout:
_target_: anemoi.graphs.nodes.attributes.CutOutMask
edges:
edge_length:
_target_: anemoi.graphs.edges.attributes.EdgeLength
36 changes: 36 additions & 0 deletions src/anemoi/training/config/lam.yaml
@@ -0,0 +1,36 @@
defaults:
- data: zarr
- dataloader: native_grid
- diagnostics: evaluation
- hardware: example
- graph: limited_area
- model: graphtransformer
- training: default
- _self_


### This file is for local experimentation.
## When you commit your changes, assign the new features and keywords
## to the correct defaults.
# For example to change from default GPU count:
# hardware:
# num_gpus_per_node: 1

dataloader:
  dataset:
    cutout:
      - dataset: ${hardware.paths.data}/${hardware.files.dataset}
        thinning: ???
      - dataset: ${hardware.paths.data}/${hardware.files.forcing_dataset}
    adjust: all
    min_distance_km: 0
  grid_indices:
    _target_: anemoi.training.data.grid_indices.MaskedGrid
    nodes_name: data
    node_attribute_name: indices_connected_nodes
model:
  output_mask: cutout_mask # it must be a node attribute of the output nodes
hardware:
  files:
    dataset: ???
    forcing_dataset: ???
2 changes: 0 additions & 2 deletions src/anemoi/training/config/model/gnn.yaml
@@ -45,8 +45,6 @@ attributes:
- edge_dirs
nodes: []

node_loss_weight: area_weight

# Bounding configuration
bounding: #These are applied in order

2 changes: 0 additions & 2 deletions src/anemoi/training/config/model/graphtransformer.yaml
@@ -50,8 +50,6 @@ attributes:
- edge_dirs
nodes: []

node_loss_weight: area_weight

# Bounding configuration
bounding: #These are applied in order
