Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Chore fill anemoi models #1

Merged
merged 32 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
c9844bc
refactor: Initial Distributed Implementation
theissenhelen May 14, 2024
6865d77
refactor: adjust imports
theissenhelen May 14, 2024
a88b09a
refactor: layers
theissenhelen May 15, 2024
82c0e47
refactor: Initial model implementation
theissenhelen May 15, 2024
57f46dd
refactor: Initial data implementation
theissenhelen May 15, 2024
f71fa30
refactor: Initial Implementation
theissenhelen May 15, 2024
c5f1cab
test: data module, data indices and normalizer
theissenhelen May 15, 2024
dafc4c0
fix: move data module to training
theissenhelen May 17, 2024
855b7a2
refactor: clean up init
theissenhelen May 17, 2024
8163b41
fix: logger
theissenhelen May 17, 2024
11bdd6b
fix: add pytest and hypothesis
theissenhelen May 17, 2024
c10d192
fix: missing init
theissenhelen May 20, 2024
eab9386
refactor: move seeding to anemoi-utils
theissenhelen May 21, 2024
343c619
refactor: move ddp strategy to training
theissenhelen May 21, 2024
4c1cf97
test: encoder_processor_decoder
theissenhelen May 21, 2024
7a6ac91
refactor: add dependencies
theissenhelen May 21, 2024
d696181
refactor: move normaizer to preprocessing
theissenhelen May 22, 2024
0b2d795
chore: move khop functions
JesperDramsch May 27, 2024
464438e
refactor: split distributed files, add licenses
JesperDramsch May 27, 2024
6a9dbb3
fix: chunking in processor
JesperDramsch May 27, 2024
8d90f28
refactor: add forward preprocessing refactor
JesperDramsch May 27, 2024
5f256c5
chore: update tests and dependencies
JesperDramsch May 27, 2024
8d9c562
chore: move anemoi interface to interface
JesperDramsch May 27, 2024
3934f66
fix: rm src
JesperDramsch May 28, 2024
fb68d6f
test: move test_models
JesperDramsch May 28, 2024
f7f9a22
fix: argument order and explicit types
theissenhelen May 28, 2024
931bb73
fix: duplicate lines in toml
theissenhelen May 28, 2024
b4d8e52
fix: import statement
theissenhelen May 29, 2024
544885e
chore: add licences
theissenhelen May 29, 2024
95fa5ab
fix: predict step remove ensemble dimension
theissenhelen May 29, 2024
64e35dd
refactor: DotConfig in anemoi-utils
JesperDramsch May 29, 2024
6584f0e
fix: isort fails
theissenhelen May 30, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 21 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
#!/usr/bin/env python
# (C) Copyright 2024 ECMWF.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
Expand All @@ -10,12 +9,13 @@
# https://packaging.python.org/en/latest/guides/writing-pyproject-toml/

[build-system]
requires = ["setuptools>=60", "setuptools-scm>=8.0"]
requires = ["setuptools>=61", "setuptools-scm>=8.0"]
build-backend = "setuptools.build_meta"

[project]
description = "A package to hold various functions to support training of ML models."
name = "anemoi-models"

readme = "README.md"
dynamic = ["version"]
license = { file = "LICENSE" }
requires-python = ">=3.9"
Expand All @@ -39,19 +39,34 @@ classifiers = [
"Operating System :: OS Independent",
]

dependencies = []
dependencies = [
"torch==2.3",
"torch-geometric==2.4",
"einops==0.6.1",
"hydra-core==1.3",
"anemoi-datasets==0.2.1",
"anemoi-utils==0.1.9",
]

[project.optional-dependencies]


docs = [
# For building the documentation
"sphinx", "sphinx_rtd_theme", "nbsphinx", "pandoc", "sphinx_argparse"
]

all = []

dev = []
tests = ["pytest", "hypothesis"]

dev = [
"sphinx",
"sphinx_rtd_theme",
"nbsphinx",
"pandoc",
"pytest",
"hypothesis",
]

[project.urls]
Homepage = "https://github.com/ecmwf/anemoi-models/"
Expand Down
74 changes: 74 additions & 0 deletions src/anemoi/models/data_indices/collection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# (C) Copyright 2024 ECMWF.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.
#

import operator

import yaml
from omegaconf import OmegaConf

from anemoi.models.data_indices.index import BaseIndex
from anemoi.models.data_indices.index import DataIndex
from anemoi.models.data_indices.index import ModelIndex
from anemoi.models.data_indices.tensor import BaseTensorIndex
from anemoi.models.data_indices.tensor import InputTensorIndex
from anemoi.models.data_indices.tensor import OutputTensorIndex


class IndexCollection:
"""Collection of data and model indices."""

def __init__(self, config, name_to_index) -> None:
self.config = OmegaConf.to_container(config, resolve=True)

self.forcing = [] if config.data.forcing is None else OmegaConf.to_container(config.data.forcing, resolve=True)
self.diagnostic = (
[] if config.data.diagnostic is None else OmegaConf.to_container(config.data.diagnostic, resolve=True)
)

assert set(self.diagnostic).isdisjoint(self.forcing), (
f"Diagnostic and forcing variables overlap: {set(self.diagnostic).intersection(self.forcing)}. ",
"Please drop them at a dataset-level to exclude them from the training data.",
)
self.name_to_index = dict(sorted(name_to_index.items(), key=operator.itemgetter(1)))
name_to_index_model_input = {
name: i for i, name in enumerate(key for key in self.name_to_index if key not in self.diagnostic)
}
name_to_index_model_output = {
name: i for i, name in enumerate(key for key in self.name_to_index if key not in self.forcing)
}

self.data = DataIndex(self.diagnostic, self.forcing, self.name_to_index)
self.model = ModelIndex(self.diagnostic, self.forcing, name_to_index_model_input, name_to_index_model_output)

def __repr__(self) -> str:
return f"IndexCollection(config={self.config}, name_to_index={self.name_to_index})"

def __eq__(self, other):
if not isinstance(other, IndexCollection):
# don't attempt to compare against unrelated types
return NotImplemented

return self.model == other.model and self.data == other.data

def __getitem__(self, key):
return getattr(self, key)

def todict(self):
return {
"data": self.data.todict(),
"model": self.model.todict(),
}

@staticmethod
def representer(dumper, data):
return dumper.represent_scalar(f"!{data.__class__.__name__}", repr(data))


for cls in [BaseTensorIndex, InputTensorIndex, OutputTensorIndex, BaseIndex, DataIndex, ModelIndex, IndexCollection]:
yaml.add_representer(cls, cls.representer)
93 changes: 93 additions & 0 deletions src/anemoi/models/data_indices/index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# (C) Copyright 2024 ECMWF.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.
#

from anemoi.models.data_indices.tensor import InputTensorIndex
from anemoi.models.data_indices.tensor import OutputTensorIndex


class BaseIndex:
"""Base class for data and model indices."""

def __init__(self) -> None:
self.input = NotImplementedError
self.output = NotImplementedError

def __eq__(self, other):
if not isinstance(other, BaseIndex):
# don't attempt to compare against unrelated types
return NotImplemented

return self.input == other.input and self.output == other.output

def __repr__(self) -> str:
return f"{self.__class__.__name__}(input={self.input}, output={self.output})"

def __getitem__(self, key):
return getattr(self, key)

def todict(self):
return {
"input": self.input.todict(),
"output": self.output.todict(),
}

@staticmethod
def representer(dumper, data):
return dumper.represent_scalar(f"!{data.__class__.__name__}", repr(data))


class DataIndex(BaseIndex):
"""Indexing for data variables."""

def __init__(self, diagnostic, forcing, name_to_index) -> None:
self._diagnostic = diagnostic
self._forcing = forcing
self._name_to_index = name_to_index
self.input = InputTensorIndex(
includes=forcing,
excludes=diagnostic,
name_to_index=name_to_index,
)

self.output = OutputTensorIndex(
includes=diagnostic,
excludes=forcing,
name_to_index=name_to_index,
)

def __repr__(self) -> str:
return f"{self.__class__.__name__}(diagnostic={self._input}, forcing={self._output}, name_to_index={self._name_to_index})"


class ModelIndex(BaseIndex):
"""Indexing for model variables."""

def __init__(self, diagnostic, forcing, name_to_index_model_input, name_to_index_model_output) -> None:
self._diagnostic = diagnostic
self._forcing = forcing
self._name_to_index_model_input = name_to_index_model_input
self._name_to_index_model_output = name_to_index_model_output
self.input = InputTensorIndex(
includes=forcing,
excludes=[],
name_to_index=name_to_index_model_input,
)

self.output = OutputTensorIndex(
includes=diagnostic,
excludes=[],
name_to_index=name_to_index_model_output,
)

def __repr__(self) -> str:
return (
f"{self.__class__.__name__}(diagnostic={self._input}, forcing={self._output}, "
f"name_to_index_model_input={self._name_to_index_model_input}, "
f"name_to_index_model_output={self._name_to_index_model_output})"
)
114 changes: 114 additions & 0 deletions src/anemoi/models/data_indices/tensor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# (C) Copyright 2024 ECMWF.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.
#

import torch


class BaseTensorIndex:
"""Indexing for variables in index as Tensor."""

def __init__(self, *, includes: list[str], excludes: list[str], name_to_index: dict[str, int]) -> None:
"""Initialize indexing tensors from includes and excludes using name_to_index.

Parameters
----------
includes : list[str]
Variables to include in the indexing that are exclusive to this indexing.
e.g. Forcing variables for the input indexing, diagnostic variables for the output indexing
excludes : list[str]
Variables to exclude from the indexing.
e.g. Diagnostic variables for the input indexing, forcing variables for the output indexing
name_to_index : dict[str, int]
Dictionary mapping variable names to their index in the Tensor.
"""
self.includes = includes
self.excludes = excludes
self.name_to_index = name_to_index

assert set(self.excludes).issubset(
self.name_to_index.keys(),
), f"Data indexing has invalid entries {[var for var in self.excludes if var not in self.name_to_index]}, not in dataset."
assert set(self.includes).issubset(
self.name_to_index.keys(),
), f"Data indexing has invalid entries {[var for var in self.includes if var not in self.name_to_index]}, not in dataset."

self.full = self._build_idx_from_excludes()
self._only = self._build_idx_from_includes()
self._removed = self._build_idx_from_includes(self.excludes)
self.prognostic = self._build_idx_prognostic()
self.diagnostic = NotImplementedError
self.forcing = NotImplementedError

def __len__(self) -> int:
return len(self.full)

def __repr__(self) -> str:
return f"{self.__class__.__name__}(includes={self.includes}, excludes={self.excludes}, name_to_index={self.name_to_index})"

def __eq__(self, other):
if not isinstance(other, BaseTensorIndex):
# don't attempt to compare against unrelated types
return NotImplemented

return (
torch.allclose(self.full, other.full)
and torch.allclose(self._only, other._only)
and torch.allclose(self._removed, other._removed)
and torch.allclose(self.prognostic, other.prognostic)
and torch.allclose(self.diagnostic, other.diagnostic)
and torch.allclose(self.forcing, other.forcing)
and self.includes == other.includes
and self.excludes == other.excludes
)

def __getitem__(self, key):
return getattr(self, key)

def todict(self):
return {
"full": self.full,
"prognostic": self.prognostic,
"diagnostic": self.diagnostic,
"forcing": self.forcing,
}

@staticmethod
def representer(dumper, data):
return dumper.represent_scalar(f"!{data.__class__.__name__}", repr(data))

def _build_idx_from_excludes(self, excludes=None) -> "torch.Tensor[int]":
if excludes is None:
excludes = self.excludes
return torch.Tensor(sorted(i for name, i in self.name_to_index.items() if name not in excludes)).to(torch.int)

def _build_idx_from_includes(self, includes=None) -> "torch.Tensor[int]":
if includes is None:
includes = self.includes
return torch.Tensor(sorted(self.name_to_index[name] for name in includes)).to(torch.int)

def _build_idx_prognostic(self) -> "torch.Tensor[int]":
return self._build_idx_from_excludes(self.includes + self.excludes)


class InputTensorIndex(BaseTensorIndex):
"""Indexing for input variables."""

def __init__(self, *, includes: list[str], excludes: list[str], name_to_index: dict[str, int]) -> None:
super().__init__(includes=includes, excludes=excludes, name_to_index=name_to_index)
self.forcing = self._only
self.diagnostic = self._removed


class OutputTensorIndex(BaseTensorIndex):
"""Indexing for output variables."""

def __init__(self, *, includes: list[str], excludes: list[str], name_to_index: dict[str, int]) -> None:
super().__init__(includes=includes, excludes=excludes, name_to_index=name_to_index)
self.forcing = self._removed
self.diagnostic = self._only
Loading
Loading