ENH: add zonal_stats (#52)

* example * assign the pixels ids based on their polygons then group them using their ids * fix importing xagg and run an example * xagg using openEO data , it needs to implement parallel processing to make it faster * clean up * spatial_aggregation * clean up * rioxarray * packages * deps * deps * deps * fix the notes * zonal stats in pytest * handle long lines * handle long lines * fix pre-commit * updated files * support lat&lon names * fix the dimension names on the fly * testpy * dask support just axis as integer * Dimension Mapping * updated files * [pre-commit.ci] pre-commit autoupdate (#53) * MAINT: test on Python 3.12, update actions, use ruff format (#54) * DOC: fix formatting * MAINT: add PYthon 3.12, use ruff formatter * ignore conflicting rules * Update xvec/accessor.py Co-authored-by: Martin Fleischmann <[email protected]> * Update xvec/accessor.py Co-authored-by: Martin Fleischmann <[email protected]> * Update xvec/accessor.py Co-authored-by: Martin Fleischmann <[email protected]> * Update xvec/accessor.py Co-authored-by: Martin Fleischmann <[email protected]> * clean uo * clean accessor * remove dask option * clean * internal funcs * pytest * clean up * example * assign the pixels ids based on their polygons then group them using their ids * fix importing xagg and run an example * xagg using openEO data , it needs to implement parallel processing to make it faster * clean up * spatial_aggregation * clean up * rioxarray * packages * deps * deps * deps * fix the notes * zonal stats in pytest * handle long lines * handle long lines * fix pre-commit * updated files * support lat&lon names * fix the dimension names on the fly * testpy * dask support just axis as integer * Dimension Mapping * updated files * clean uo * clean accessor * remove dask option * clean * internal funcs * pytest * clean up * Update xvec/accessor.py Co-authored-by: Martin Fleischmann <[email protected]> * accessor * dimension-agnostic * clean up * update pytest * refactor * refactor structure * geodatasets * pyogrio for IO * api * include vectorized rasterize-based method * rasterio link * fix docstring * fmt * fix for DataArray * testing * stat -> stats * fix keyword --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Martin Fleischmann <[email protected]>
xarray-contrib · Dec 14, 2023 · 18ee579 · 18ee579
1 parent 718c414
commit 18ee579
Show file tree

Hide file tree

Showing 12 changed files with 584 additions and 3 deletions.
diff --git a/.gitignore b/.gitignore
@@ -144,3 +144,5 @@ doc/source/generated
 .ruff_cache
 doc/source/cube.joblib.compressed
 doc/source/cube.pickle
+
+cache/
diff --git a/ci/310.yaml b/ci/310.yaml
@@ -6,10 +6,17 @@ dependencies:
   # required
   - shapely >=2
   - xarray
+  - rioxarray
+  - joblib
+  - rasterio
+  - tqdm
   - pyproj
   # testing
   - pytest
   - pytest-cov
   - pytest-xdist
   - pytest-reportlog
   - geopandas-base
+  - geodatasets
+  - pyogrio
+
diff --git a/ci/311.yaml b/ci/311.yaml
@@ -6,10 +6,17 @@ dependencies:
   # required
   - shapely >=2
   - xarray
+  - rioxarray
+  - joblib
+  - rasterio
+  - tqdm
   - pyproj
   # testing
   - pytest
   - pytest-cov
   - pytest-xdist
   - pytest-reportlog
   - geopandas-base
+  - geodatasets
+  - pyogrio
+
diff --git a/ci/312.yaml b/ci/312.yaml
@@ -6,10 +6,17 @@ dependencies:
   # required
   - shapely >=2
   - xarray
+  - rioxarray
+  - joblib
+  - rasterio
+  - tqdm
   - pyproj
   # testing
   - pytest
   - pytest-cov
   - pytest-xdist
   - pytest-reportlog
   - geopandas-base
+  - geodatasets
+  - pyogrio
+
diff --git a/ci/39.yaml b/ci/39.yaml
@@ -6,10 +6,16 @@ dependencies:
   # required
   - shapely >=2
   - xarray
+  - rioxarray
+  - joblib
+  - rasterio
+  - tqdm
   - pyproj
   # testing
   - pytest
   - pytest-cov
   - pytest-xdist
   - pytest-reportlog
   - geopandas-base
+  - geodatasets
+  - pyogrio
diff --git a/ci/dev.yaml b/ci/dev.yaml
@@ -7,12 +7,18 @@ dependencies:
   - cython
   - geos
   - proj
+  - rioxarray
+  - joblib
+  - rasterio
+  - tqdm
   # testing
   - pytest
   - pytest-cov
   - pytest-xdist
   - pytest-reportlog
   - geopandas-base
+  - geodatasets
+  - pyogrio
   - pip
   - pip:
       - git+https://github.com/shapely/shapely.git@main

diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -55,6 +55,7 @@ Methods
     Dataset.xvec.to_geodataframe
     Dataset.xvec.to_geopandas
     Dataset.xvec.extract_points
+    Dataset.xvec.zonal_stats
 
 
 DataArray.xvec
@@ -89,4 +90,5 @@ Methods
     DataArray.xvec.query
     DataArray.xvec.to_geodataframe
     DataArray.xvec.to_geopandas
-    DataArray.xvec.extract_points
+    DataArray.xvec.extract_points
+    DataArray.xvec.zonal_stats
diff --git a/doc/source/conf.py b/doc/source/conf.py
@@ -44,6 +44,8 @@
     "xarray": ("https://docs.xarray.dev/en/latest/", None),
     "geopandas": ("https://geopandas.org/en/latest", None),
     "pandas": ("https://pandas.pydata.org/docs", None),
+    "rasterio": ("https://rasterio.readthedocs.io/en/latest/", None),
+
 }
 
 # -- Options for HTML output -------------------------------------------------

diff --git a/xvec/accessor.py b/xvec/accessor.py
@@ -11,6 +11,7 @@
 from pyproj import CRS, Transformer
 
 from .index import GeometryIndex
+from .zonal import _zonal_stats_iterative, _zonal_stats_rasterize
 
 
 @xr.register_dataarray_accessor("xvec")
@@ -918,6 +919,119 @@ def to_geodataframe(
         )
         return df
 
+    def zonal_stats(
+        self,
+        polygons: Sequence[shapely.Geometry],
+        x_coords: Hashable,
+        y_coords: Hashable,
+        stats: str = "mean",
+        name: Hashable = "geometry",
+        index: bool = None,
+        method: str = "rasterize",
+        all_touched: bool = False,
+        n_jobs: int = -1,
+        **kwargs,
+    ):
+        """Extract the values from a dataset indexed by a set of geometries
+
+        The CRS of the raster and that of polygons need to be equal.
+        Xvec does not verify their equality.
+
+        Parameters
+        ----------
+        polygons : Sequence[shapely.Geometry]
+            An arrray-like (1-D) of shapely geometries, like a numpy array or
+            :class:`geopandas.GeoSeries`.
+        x_coords : Hashable
+            name of the coordinates containing ``x`` coordinates (i.e. the first value
+            in the coordinate pair encoding the vertex of the polygon)
+        y_coords : Hashable
+            name of the coordinates containing ``y`` coordinates (i.e. the second value
+            in the coordinate pair encoding the vertex of the polygon)
+        stats : string
+            Spatial aggregation statistic method, by default "mean". It supports the
+            following statistcs: ['mean', 'median', 'min', 'max', 'sum']
+        name : Hashable, optional
+            Name of the dimension that will hold the ``polygons``, by default "geometry"
+        index : bool, optional
+            If `polygons` is a GeoSeries, ``index=True`` will attach its index as another
+            coordinate to the geometry dimension in the resulting object. If
+            ``index=None``, the index will be stored if the `polygons.index` is a named
+            or non-default index. If ``index=False``, it will never be stored. This is
+            useful as an attribute link between the resulting array and the GeoPandas
+            object from which the polygons are sourced.
+        method : str, optional
+            The method of data extraction. The default is ``"rasterize"``, which uses
+            :func:`rasterio.features.rasterize` and is faster, but can lead to loss
+            of information in case of small polygons. Other option is ``"iterate"``, which
+            iterates over polygons and uses :func:`rasterio.features.geometry_mask`.
+        all_touched : bool, optional
+            If True, all pixels touched by geometries will be considered. If False, only
+            pixels whose center is within the polygon or that are selected by
+            Bresenham’s line algorithm will be considered.
+        n_jobs : int, optional
+            Number of parallel threads to use. It is recommended to set this to the
+            number of physical cores of the CPU. ``-1`` uses all available cores. Applies
+            only if ``method="iterate"``.
+        **kwargs : optional
+            Keyword arguments to be passed to the aggregation function
+            (e.g., ``Dataset.mean(**kwargs)``).
+
+        Returns
+        -------
+        Dataset
+            A subset of the original object with N-1 dimensions indexed by
+            the the GeometryIndex.
+
+        """
+        # TODO: allow multiple stats at the same time (concat along a new axis),
+        # TODO: possibly as a list of tuples to include names?
+        # TODO: allow callable in stat (via .reduce())
+        if method == "rasterize":
+            result = _zonal_stats_rasterize(
+                self,
+                polygons=polygons,
+                x_coords=x_coords,
+                y_coords=y_coords,
+                stats=stats,
+                name=name,
+                all_touched=all_touched,
+                **kwargs,
+            )
+        elif method == "iterate":
+            result = _zonal_stats_iterative(
+                self,
+                polygons=polygons,
+                x_coords=x_coords,
+                y_coords=y_coords,
+                stats=stats,
+                name=name,
+                all_touched=all_touched,
+                n_jobs=n_jobs,
+                **kwargs,
+            )
+        else:
+            raise ValueError(
+                f"method '{method}' is not supported. Allowed options are 'rasterize' "
+                "and 'iterate'."
+            )
+
+        # save the index as a data variable
+        if isinstance(polygons, pd.Series):
+            if index is None:
+                if polygons.index.name is not None or not polygons.index.equals(
+                    pd.RangeIndex(0, len(polygons))
+                ):
+                    index = True
+            if index:
+                index_name = polygons.index.name if polygons.index.name else "index"
+                result = result.assign_coords({index_name: (name, polygons.index)})
+
+        # standardize the shape - each method comes with a different one
+        return result.transpose(
+            name, *tuple(d for d in self._obj.dims if d not in [x_coords, y_coords])
+        )
+
     def extract_points(
         self,
         points: Sequence[shapely.Geometry],
@@ -965,7 +1079,7 @@ def extract_points(
             ``index=None``, the index will be stored if the `points.index` is a named
             or non-default index. If ``index=False``, it will never be stored. This is
             useful as an attribute link between the resulting array and the GeoPandas
-            object from which the points are sourced from.
+            object from which the points are sourced.
 
         Returns
         -------

diff --git a/xvec/tests/test_accessor.py b/xvec/tests/test_accessor.py
@@ -7,7 +7,7 @@
 from geopandas.testing import assert_geodataframe_equal
 from pandas.testing import assert_frame_equal
 
-import xvec  # noqa
+import xvec  # noqa: F401
 from xvec import GeometryIndex