Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changes to support subsetting datasets by constraining coordinate values. #494

Open
wants to merge 11 commits into
base: develop
Choose a base branch
from
1 change: 1 addition & 0 deletions coast/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from ._utils.process_data import Process_data
from .data.opendap import OpendapInfo
from .data.copernicus import Copernicus, Product
from ._utils.coordinates import Coordinates2D, Coordinates3D, Coordinates4D, Coordinates

# Set default for logging level when coast is imported
import logging
Expand Down
32 changes: 32 additions & 0 deletions coast/_utils/coordinates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from dataclasses import dataclass
from numpy import number
from numbers import Number
from typing import Union, Optional


Numeric = Optional[Union[Number, number]]


@dataclass
class Coordinates2D:
"""Represent a point in one-to-two-dimensional space with optional X and Y coordinates."""

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coordinates2D is an example of an object where the user needs to know the ordering convention: Coordinates2D(45, 50) is obviously a different place to Coordinates2D(50,45), but which is which.

Convention would have x preceding y, but also that latitude proceeds longitude...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As the parameters are named "x" and "y", what would be you preferred way of indicating that?

x: Numeric
y: Numeric


@dataclass
class Coordinates3D(Coordinates2D):
"""Represent a point in one-to-three-dimensional space with optional X, Y, and Z coordinates."""

z: Numeric


@dataclass
class Coordinates4D(Coordinates3D):
"""Represent a point in one-to-four-dimensional spacetime with optional X, Y, Z, and T coordinates."""

t: Numeric # TODO Should this be a datetime or is it likely to be something like a Unix timestamp?

MCazaly marked this conversation as resolved.
Show resolved Hide resolved

Coordinates = Union[Coordinates2D, Coordinates3D, Coordinates4D]
161 changes: 161 additions & 0 deletions coast/data/coast.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from dask.distributed import Client
import copy
from .._utils.logging_util import get_slug, debug, info, warn, warning
from .._utils.coordinates import Coordinates, Coordinates3D, Coordinates4D, Numeric
from .opendap import OpendapInfo


Expand Down Expand Up @@ -467,5 +468,165 @@ def plot_cartopy(self, var: str, plot_var: array, params, time_counter: int = 0)
info("Displaying plot!")
plt.show()

def set_constraint(self, start: Coordinates, end: Coordinates, drop: bool = True) -> None:
"""Constrain the underlying dataset to values within an arbitrarily sized orthotope.
MCazaly marked this conversation as resolved.
Show resolved Hide resolved

Args:
start: The start coordinates of the shape to define.
end: The end coordinates of the shape to define.
drop: Whether values should be dropped from the constrained dataset (if False, they will be NaNed).
"""
self.dataset = self.constrain(start, end, drop=drop)

def constrain(self, start: Coordinates, end: Coordinates, drop: bool = True) -> xr.Dataset:
"""Return the underlying dataset with values constrained to an arbitrarily sized orthotope.

Args:
start: The start coordinates of the shape to define.
end: The end coordinates of the shape to define.
drop: Whether values should be dropped from the constrained dataset (if False, they will be NaNed).

Returns:
The underlying dataset with values constrained to within the defined selection.
"""
return constrain(self.dataset, start, end, drop=drop)

@property
def x_dim(self) -> xr.DataArray:
"""Return the X coordinate array of the underlying dataset."""
return x_dim(self.dataset)

@property
def y_dim(self) -> xr.DataArray:
"""Return the Y coordinate array of the underlying dataset."""
return y_dim(self.dataset)

@property
def z_dim(self) -> xr.DataArray:
"""Return the Z coordinate array of the underlying dataset."""
return z_dim(self.dataset)

@property
def t_dim(self) -> xr.DataArray:
"""Return the T[ime] coordinate array of the underlying dataset."""
return t_dim(self.dataset)

def get_coord(self, dim: str) -> xr.DataArray:
"""Get the coordinate array for a dimension from the underlying dataset.

Args:
dim: The name of the dimension (i.e. "x", "y", "z", or "t").

Returns:
The corresponding coordinate array from the underlying dataset.
"""
return get_coord(self.dataset, dim)

def plot_movie(self):
raise NotImplementedError


def create_constraint(start: Numeric, end: Numeric, dim: xr.DataArray) -> np.typing.NDArray[bool]:
"""Create a mask to exclude coordinates that do not fall within a range of two arbitrary values.

Args:
start: The start of the range of values to constrain within.
end: The end of the range of values ot constrain within.
dim: The coordinate array to constrain values from.

Returns:
A mask that can be applied to dim to exclude unwanted values.
"""
return np.logical_and(dim >= start, dim <= end)
MCazaly marked this conversation as resolved.
Show resolved Hide resolved


def get_coord(dataset: xr.Dataset, dim: str) -> xr.DataArray:
"""Get the coordinate array for a dimension in a dataset.

Args:
dataset: The dataset to interrogate.
dim: The name of the dimension (i.e. "x", "y", "z", or "t").

Returns:
The corresponding coordinate array from the provided dataset.
"""
# TODO Really not a fan of this, is there an easier way to get the mapping?
return dataset[list(dataset[f"{dim.lower()}_dim"].coords)[0]]


def x_dim(dataset: xr.Dataset) -> xr.DataArray:
"""Get the X coordinate array for a dimension in a dataset.

Args:
dataset: The dataset to interrogate.

Returns:
The corresponding coordinate array from the provided dataset.
"""
return get_coord(dataset, "x")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be clear, a coordinate longitude can be 2 dimensional. E.g. if you want a box that is 1000 km x 1000 km, then near the north pole it would vary with latitude and longitude.

Or is the terminology for "coordinates" referring to indices in an array, which are 1-dimensional?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or is the terminology for "coordinates" referring to indices in an array, which are 1-dimensional?

That's right, I was basing the name off xr.Dataset.coords, what's the most correct term to use here?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm. If I load your copernicus example and inspect the coords I get the following:

import coast

fn_config_t_grid = "./config/example_cmems_grid_t.json"
database = coast.Copernicus(username, password, "nrt")
forecast = database.get_product("global-analysis-forecast-phy-001-024")
nemo_t = coast.Gridded(fn_data=forecast, config=fn_config_t_grid)

start = coast.Coordinates2D(50, -10)
end = coast.Coordinates2D(60, 5)
nemo_t.set_constraint(start, end)

nemo_t.dataset.coords
Out[10]: 
Coordinates:
    longitude  (x_dim) float32 50.0 50.08 50.17 50.25 ... 59.75 59.83 59.92 60.0
    latitude   (y_dim) float32 -10.0 -9.917 -9.833 -9.75 ... 4.833 4.917 5.0
  * z_dim      (z_dim) float32 0.494 1.541 2.646 ... 5.275e+03 5.728e+03
    time       (t_dim) datetime64[ns] 2020-01-01T12:00:00 ... 2022-07-10T12:0...

So the horizontal coords are have names latitude and longitude and their content are floats not index integers.

Indices are generally referred to as j and i, which are a nightmare for grepping. So we are shifting towards j_ind and i_ind notation. j_ind is the index for the nominally northward, or y-direction. i_ind is the index for the nomillay eastward, or x-direction.
So, from the above example:
longitude[i_ind=0] = 50.0

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However, from data in the example files:

import coast
import unit_testing.unit_test_files as files
nemo_t = coast.Gridded(files.fn_nemo_dat, files.fn_nemo_dom, config=files.fn_config_t_grid)
#start = coast.Coordinates2D(50, -10)
#end = coast.Coordinates2D(60, 5)
#nemo_t.set_constraint(start, end)

# Set some indices values
j_ind = 0
i_ind = 0

nemo_t.dataset.latitude[j_ind, i_ind].values  # notice the pain in the but `j` before `i` expected ordering.

Gives a floating point latitude, which will vary with both indices.

Out[26]: array(40.066406, dtype=float32)

In summary indices are more basic than coordinates because coordinates are a function of indices.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.



def y_dim(dataset: xr.Dataset) -> xr.DataArray:
"""Get the Y coordinate array for a dimension in a dataset.

Args:
dataset: The dataset to interrogate.

Returns:
The corresponding coordinate array from the provided dataset.
"""
return get_coord(dataset, "y")


def z_dim(dataset: xr.Dataset) -> xr.DataArray:
"""Get the Z coordinate array for a dimension in a dataset.

Args:
dataset: The dataset to interrogate.

Returns:
The corresponding coordinate array from the provided dataset.
"""
return get_coord(dataset, "z")


def t_dim(dataset: xr.Dataset) -> xr.DataArray:
"""Get the T[ime] coordinate array for a dimension in a dataset.

Args:
dataset: The dataset to interrogate.

Returns:
The corresponding coordinate array from the provided dataset.
"""
return get_coord(dataset, "t")


def constrain(dataset: xr.Dataset, start: Coordinates, end: Coordinates, drop: bool = True) -> xr.Dataset:
"""Constrain values within a dataset to an arbitrarily sized orthotope.

Args:
dataset: The dataset to constrain values from.
start: The start coordinates of the shape to define.
end: The end coordinates of the shape to define.
drop: Whether values should be dropped from the constrained dataset (if False, they will be NaNed).

Returns:
The provided dataset with values constrained to within the defined selection.
"""
assert type(start) == type(end), "Coordinates must be of the same dimensionality!"

constrained = dataset
if (x_start := start.x is not None) and (x_end := end.x is not None):
assert x_start == x_end, "Tried to constrain on X with a missing paired value!"
constrained = constrained.where(create_constraint(start.x, end.x, x_dim(constrained)), drop=drop)
if (y_start := start.y is not None) and (y_end := end.y is not None):
assert y_start == y_end, "Tried to constrain on Y with a missing paired value!"
constrained = constrained.where(create_constraint(start.y, end.y, y_dim(constrained)), drop=drop)
if isinstance(start, Coordinates3D) and (z_start := start.z is not None) and (z_end := end.z is not None):
assert z_start == z_end, "Tried to constrain on Z with a missing paired value!"
constrained = constrained.where(create_constraint(start.z, end.y, z_dim(constrained)), drop=drop)
if isinstance(start, Coordinates4D) and (t_start := start.t is not None) and (t_end := end.t is not None):
assert t_start == t_end, "Tried to constrain on Z with a missing paired value!"
constrained = constrained.where(create_constraint(start.t, end.t, t_dim(constrained)), drop=drop)
return constrained