From 67998b836345336dae4514786c027d3ac1d10347 Mon Sep 17 00:00:00 2001
From: Leif Denby <leif@denby.eu>
Date: Fri, 27 Sep 2024 17:37:16 +0200
Subject: [PATCH] read projection from datastore config extra section

---
 README.md                                     | 42 --------------
 neural_lam/datastore/mdp.py                   | 55 ++++++++++++-------
 neural_lam/datastore/plot_example.py          |  6 +-
 pyproject.toml                                |  2 +-
 .../datastore_examples/mdp/danra.example.yaml | 16 ++++--
 5 files changed, 51 insertions(+), 70 deletions(-)

diff --git a/README.md b/README.md
index e4a1989b..f23637c9 100644
--- a/README.md
+++ b/README.md
@@ -258,48 +258,6 @@ Except for training and pre-processing scripts all the source code can be found
 Model classes, including abstract base classes, are located in `neural_lam/models`.
 Notebooks for visualization and analysis are located in `docs`.
 
-
-## Format of data directory
-It is possible to store multiple datasets in the `data` directory.
-Each dataset contains a set of files with static features and a set of samples.
-The samples are split into different sub-directories for training, validation and testing.
-The directory structure is shown with examples below.
-Script names within parenthesis denote the script used to generate the file.
-```
-data
-├── dataset1
-│   ├── samples                             - Directory with data samples
-│   │   ├── train                           - Training data
-│   │   │   ├── nwp_2022040100_mbr000.npy  - A time series sample
-│   │   │   ├── nwp_2022040100_mbr001.npy
-│   │   │   ├── ...
-│   │   │   ├── nwp_2022043012_mbr001.npy
-│   │   │   ├── nwp_toa_downwelling_shortwave_flux_2022040100.npy   - Solar flux forcing
-│   │   │   ├── nwp_toa_downwelling_shortwave_flux_2022040112.npy
-│   │   │   ├── ...
-│   │   │   ├── nwp_toa_downwelling_shortwave_flux_2022043012.npy
-│   │   │   ├── wtr_2022040100.npy          - Open water features for one sample
-│   │   │   ├── wtr_2022040112.npy
-│   │   │   ├── ...
-│   │   │   └── wtr_202204012.npy
-│   │   ├── val                             - Validation data
-│   │   └── test                            - Test data
-│   └── static                              - Directory with graph information and static features
-│       ├── nwp_xy.npy                      - Coordinates of grid nodes (part of dataset)
-│       ├── surface_geopotential.npy        - Geopotential at surface of grid nodes (part of dataset)
-│       ├── border_mask.npy                 - Mask with True for grid nodes that are part of border (part of dataset)
-│       ├── grid_features.pt                - Static features of grid nodes (neural_lam.create_grid_features)
-│       ├── parameter_mean.pt               - Means of state parameters (neural_lam.create_parameter_weights)
-│       ├── parameter_std.pt                - Std.-dev. of state parameters (neural_lam.create_parameter_weights)
-│       ├── diff_mean.pt                    - Means of one-step differences (neural_lam.create_parameter_weights)
-│       ├── diff_std.pt                     - Std.-dev. of one-step differences (neural_lam.create_parameter_weights)
-│       ├── flux_stats.pt                   - Mean and std.-dev. of solar flux forcing (neural_lam.create_parameter_weights)
-│       └── parameter_weights.npy           - Loss weights for different state parameters (neural_lam.create_parameter_weights)
-├── dataset2
-├── ...
-└── datasetN
-```
-
 ## Format of graph directory
 The `graphs` directory contains generated graph structures that can be used by different graph-based models.
 The structure is shown with examples below:
diff --git a/neural_lam/datastore/mdp.py b/neural_lam/datastore/mdp.py
index 18a8df26..7384396d 100644
--- a/neural_lam/datastore/mdp.py
+++ b/neural_lam/datastore/mdp.py
@@ -333,7 +333,17 @@ def boundary_mask(self) -> xr.DataArray:
 
     @property
     def coords_projection(self) -> ccrs.Projection:
-        """Return the projection of the coordinates.
+        """
+        Return the projection of the coordinates.
+
+        NOTE: currently this expects the projection information to be in the
+        `extra` section of the configuration file, with a `projection` key
+        containing a `class_name` and `kwargs` for constructing the
+        `cartopy.crs.Projection` object. This is a temporary solution until
+        the projection information can be parsed from the produced dataset
+        itself. `mllam-data-prep` ignores the contents of the `extra` section
+        of the config file, which is why we need to check that the necessary
+        parts are there.
 
         Returns
         -------
@@ -341,26 +351,33 @@ def coords_projection(self) -> ccrs.Projection:
             The projection of the coordinates.
 
         """
-        # XXX: this should move to config
-        kwargs = {
-            "LoVInDegrees": 25.0,
-            "LaDInDegrees": 56.7,
-            "Latin1InDegrees": 56.7,
-            "Latin2InDegrees": 56.7,
-        }
-
-        lon_0 = kwargs["LoVInDegrees"]  # Latitude of first standard parallel
-        lat_0 = kwargs["LaDInDegrees"]  # Latitude of second standard parallel
-        lat_1 = kwargs["Latin1InDegrees"]  # Origin latitude
-        lat_2 = kwargs["Latin2InDegrees"]  # Origin longitude
+        if "projection" not in self._config.extra:
+            raise ValueError(
+                "projection information not found in the configuration file "
+                f"({self._config_path}). Please add the projection information"
+                " to the `extra` section of the config, by adding a "
+                "`projection` key with the class name and kwargs of the "
+                "projection."
+            )
 
-        crs = ccrs.LambertConformal(
-            central_longitude=lon_0,
-            central_latitude=lat_0,
-            standard_parallels=(lat_1, lat_2),
-        )
+        projection_info = self._config.extra["projection"]
+        if "class_name" not in projection_info:
+            raise ValueError(
+                "class_name not found in the projection information. Please "
+                "add the class name of the projection to the `projection` key "
+                "in the `extra` section of the config."
+            )
+        if "kwargs" not in projection_info:
+            raise ValueError(
+                "kwargs not found in the projection information. Please add "
+                "the keyword arguments of the projection to the `projection` "
+                "key in the `extra` section of the config."
+            )
 
-        return crs
+        class_name = projection_info["class_name"]
+        ProjectionClass = getattr(ccrs, class_name)
+        kwargs = projection_info["kwargs"]
+        return ProjectionClass(**kwargs)
 
     @property
     def grid_shape_state(self):
diff --git a/neural_lam/datastore/plot_example.py b/neural_lam/datastore/plot_example.py
index 53bc6d5e..b68d33af 100644
--- a/neural_lam/datastore/plot_example.py
+++ b/neural_lam/datastore/plot_example.py
@@ -119,10 +119,8 @@ def _parse_dict(arg_str):
         nargs="+",
         default=[],
         type=_parse_dict,
-        help=(
-            "Selections to apply to the dataarray, for example "
-            '`time="1990-09-03T0:00" would select this single timestep',
-        ),
+        help="Selections to apply to the dataarray, for example "
+        "`time='1990-09-03T0:00'` would select this single timestep",
     )
     args = parser.parse_args()
 
diff --git a/pyproject.toml b/pyproject.toml
index fc3fbf9e..15d59be2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,7 @@ dependencies = [
     "torch-geometric==2.3.1",
     "parse>=1.20.2",
     "dataclass-wizard>=0.22.3",
-    "mllam-data-prep[dask-distributed]>=0.3.0",
+    "mllam-data-prep @ git+https://github.com/leifdenby/mllam-data-prep/@feat/extra-section-in-config",
 ]
 requires-python = ">=3.9"
 
diff --git a/tests/datastore_examples/mdp/danra.example.yaml b/tests/datastore_examples/mdp/danra.example.yaml
index 73aa0dfa..0801f832 100644
--- a/tests/datastore_examples/mdp/danra.example.yaml
+++ b/tests/datastore_examples/mdp/danra.example.yaml
@@ -1,4 +1,4 @@
-schema_version: v0.2.0
+schema_version: v0.2.0+dev
 dataset_version: v0.1.0
 
 output:
@@ -49,7 +49,7 @@ inputs:
       state_feature:
         method: stack_variables_by_var_name
         dims: [altitude]
-        name_format: f"{var_name}{altitude}m"
+        name_format: "{var_name}{altitude}m"
       grid_index:
         method: stack
         dims: [x, y]
@@ -70,7 +70,7 @@ inputs:
         dims: [x, y]
       forcing_feature:
         method: stack_variables_by_var_name
-        name_format: f"{var_name}"
+        name_format: "{var_name}"
     target_output_variable: forcing
 
   danra_lsm:
@@ -84,5 +84,13 @@ inputs:
         dims: [x, y]
       static_feature:
         method: stack_variables_by_var_name
-        name_format: f"{var_name}"
+        name_format: "{var_name}"
     target_output_variable: static
+
+extra:
+  projection:
+    class_name: LambertConformal
+    kwargs:
+      central_longitude: 25.0
+      central_latitude: 56.7
+      standard_parallels: [56.7, 56.7]