Commit 81b85b8
Merge pull request #95 from Deltares/94-bug-memory-error-when-loading-large-raster-datasets-clipped-to-a-geometry

94 bug memory error when loading large raster datasets clipped to a geometry
DirkEilander authored Jul 12, 2023
2 parents 6e5f078 + 16f655a commit 81b85b8
Showing 5 changed files with 157 additions and 89 deletions.
6 changes: 6 additions & 0 deletions docs/changelog.rst
@@ -13,6 +13,7 @@ Bugfix
- bugfix in `SfincsModel.write_forcing` to ensure all NetCDF files are written instead of only the first one. PR #86
- bugfix in `SfincsModel.read_config` & `SfincsInput.read` for relative paths in inp file. PR #88
- bugfix in `SfincsModel.setup_subgrid` to ensure that a 'big geotiff' will be written by default when 'write_dep_tif' or 'write_man_tif' are True
- fix memory issues caused by rasterizing the model region and reprojecting before clipping of rasters. PR #94
- bugfix in `Sfincs.read_forcing` when combining attributes from the locations stored in the gis folder with the actual forcing locations. PR #99
- bugfix in `SfincsModel.setup_discharge_from_grid` when snapping based on upstream area in case a src points is outside of the uparea grid domain. PR #99

@@ -21,10 +22,15 @@ New
- `SfincsModel.setup_cn_infiltration_with_kr` to setup three layers related to the curve number (maximum and effective infiltration capacity; smax and seff) and recovery rate (kr). PR #87
- `SfincsModel.setup_drainage_structures` to setup drainage structures (pumps, culverts) from a geodataframe. PR #90
- Added `SfincsModel.setup_wind_forcing`, `SfincsModel.setup_wind_forcing_from_grid` and `SfincsModel.setup_pressure_forcing_from_grid` methods to easily add wind and pressure forcing. PR #92
- `SfincsModel.read_config` allows using a template input file from a directory other than the model root. PR #102
- Added the option to use landuse/landcover data combined with a reclass table to `SfincsModel.setup_constant_infiltration`. PR #103
- Enabled providing locations only (without timeseries) for `SfincsModel.setup_waterlevel_forcing` and `SfincsModel.setup_discharge_forcing`. PR #104
- New optional buffer argument in `SfincsModel.setup_discharge_forcing` to select gauges around the boundary only. PR #104

Changed
-------
- `SfincsModel.setup_mask_active` argument `reset_mask` now defaults to True. PR #94

v1.0 (17 April 2023)
====================

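The recurring change in this diff replaces `geom=self.region` — a polygon that had to be rasterized and reprojected before the raster could be clipped — with `bbox=self.mask.raster.transform_bounds(4326)`, a plain bounding box expressed in the CRS of the source data. A minimal sketch of the new pattern on a dummy mask grid follows; the catalog call is commented out because it needs a real data catalog, and the source name `merit_hydro` is only an assumed catalog entry.

import numpy as np
import xarray as xr
import hydromt  # registers the .raster accessor used below

# Dummy mask grid in a projected CRS, standing in for SfincsModel.mask.
da_mask = xr.DataArray(
    np.ones((10, 10), dtype="u1"),
    coords={"y": 5.0e6 - np.arange(10) * 100.0, "x": 5.0e5 + np.arange(10) * 100.0},
    dims=("y", "x"),
)
da_mask.raster.set_crs(32631)  # UTM zone 31N

# The fix: express the mask bounds in the CRS of the data source (EPSG:4326)
# and clip with bbox=..., instead of rasterizing/reprojecting a region polygon.
bbox = da_mask.raster.transform_bounds(4326)  # (xmin, ymin, xmax, ymax)
print(bbox)

# With a real data catalog the clipped read then becomes:
# from hydromt import DataCatalog
# cat = DataCatalog("data_catalog.yml")
# da = cat.get_rasterdataset("merit_hydro", bbox=bbox, buffer=5)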
4 changes: 2 additions & 2 deletions hydromt_sfincs/regulargrid.py
@@ -144,7 +144,7 @@ def create_mask_active(
         drop_area: float = 0,
         connectivity: int = 8,
         all_touched: bool = True,
-        reset_mask: bool = False,
+        reset_mask: bool = True,
         logger: logging.Logger = logger,
     ) -> xr.DataArray:
         """Create an integer mask with inactive (msk=0) and active (msk=1) cells, optionally bounded
@@ -179,7 +179,7 @@ def create_mask_active(
             include (or exclude) geometries. If False, include a cell only if its center is
             within one of the shapes, or if it is selected by Bresenham's line algorithm.
         reset_mask: bool, optional
-            If True, reset existing mask layer. If False (default) updating existing mask.
+            If True (default), reset the existing mask layer. If False, update the existing mask.

         Returns
         -------
123 changes: 79 additions & 44 deletions hydromt_sfincs/sfincs.py
@@ -344,14 +344,15 @@ def setup_dep(
         )

         # check if nan data is present in the bed levels
-        if not np.isnan(da_dep).any():
+        if np.isnan(da_dep).any():
             self.logger.warning(
                 f"Interpolate data at {int(np.sum(np.isnan(da_dep.values)))} cells"
             )
+            # TODO add extrapolate=True option when available in hydromt core
             da_dep = da_dep.raster.interpolate_na(method="rio_idw")

         self.set_grid(da_dep, name="dep")
-        # FIXME this shouldn't be necessary, since da_dep should already have the crs
+        # FIXME this shouldn't be necessary, since da_dep should already have a crs
         if self.crs is not None and self.grid.raster.crs is None:
             self.grid.set_crs(self.crs)

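The first hunk above fixes an inverted condition: the interpolation branch previously ran only when no NaNs were present. A small runnable sketch of the corrected logic on dummy data, assuming hydromt's `raster.interpolate_na` as used in the hunk:

import numpy as np
import xarray as xr
import hydromt  # registers the .raster accessor

# Dummy dep grid with one NaN hole, standing in for the merged topobathy.
da_dep = xr.DataArray(
    np.array([[1.0, np.nan], [2.0, 3.0]]),
    coords={"y": [1.0, 0.0], "x": [0.0, 1.0]},
    dims=("y", "x"),
)
da_dep.raster.set_crs(32631)
da_dep.raster.set_nodata(np.nan)

# Fixed condition: interpolate when NaNs ARE present (old code had `not ...`).
if np.isnan(da_dep).any():
    print(f"Interpolate data at {int(np.sum(np.isnan(da_dep.values)))} cells")
    da_dep = da_dep.raster.interpolate_na(method="rio_idw")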
@@ -374,7 +375,7 @@ def setup_mask_active(
         drop_area: float = 0.0,
         connectivity: int = 8,
         all_touched: bool = True,
-        reset_mask: bool = False,
+        reset_mask: bool = True,
     ):
         """Setup active model cells.
@@ -418,7 +419,7 @@ def setup_mask_active(
             include (or exclude) geometries. If False, include a cell only if its center is
             within one of the shapes, or if it is selected by Bresenham's line algorithm.
         reset_mask: bool, optional
-            If True, reset existing mask layer. If False (default) updating existing mask.
+            If True (default), reset the existing mask layer. If False, update the existing mask.
         """
         # read geometries
         gdf_mask, gdf_include, gdf_exclude = None, None, None
@@ -481,9 +482,15 @@ def setup_mask_active(
         if "indexfile" not in self.config:
             self.config.update({"indexfile": "sfincs.ind"})
         # update region
-        self.logger.info("Derive region geometry based on active cells.")
-        region = da_mask.where(da_mask <= 1, 1).raster.vectorize()
-        self.set_geoms(region, "region")
+        if np.any(da_mask >= 1):
+            self.logger.info("Derive region geometry based on active cells.")
+            # make mask with ones and zeros only -> vectorize ones
+            region = da_mask.where(da_mask <= 1, 1).raster.vectorize()
+            if region.empty:
+                raise ValueError("No region found.")
+            self.set_geoms(region, "region")
+        else:
+            self.logger.warning("No active cells found.")

     def setup_mask_bounds(
         self,
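The region-derivation hunk above now guards against empty masks. A dummy-data sketch of the clamp-then-vectorize step, assuming hydromt's `raster.vectorize` (which polygonizes contiguous cells per value, skipping nodata):

import numpy as np
import xarray as xr
import hydromt  # registers the .raster accessor

# Dummy msk grid: 0 = inactive, 1 = active, 2/3 = boundary cells.
da_mask = xr.DataArray(
    np.array([[0, 1], [2, 3]], dtype="u1"),
    coords={"y": [1.0, 0.0], "x": [0.0, 1.0]},
    dims=("y", "x"),
)
da_mask.raster.set_crs(32631)
da_mask.raster.set_nodata(0)

if np.any(da_mask >= 1):
    # clamp boundary codes (2, 3) to 1, then polygonize the active cells
    region = da_mask.where(da_mask <= 1, 1).raster.vectorize()
    if region.empty:
        raise ValueError("No region found.")
    print(region)  # GeoDataFrame with the model region polygon(s)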
@@ -517,11 +524,12 @@ def setup_mask_bounds(
         btype: {'waterlevel', 'outflow'}
             Boundary type
         include_mask, exclude_mask: str, Path, gpd.GeoDataFrame, optional
-            Path or data source name for geometries with areas to include/exclude from the model boundary.
+            Path or data source name for geometries with areas to include/exclude from
+            the model boundary.
         zmin, zmax : float, optional
             Minimum and maximum elevation thresholds for boundary cells.
-            Note that when include and exclude areas are used, the elevation range is only applied
-            on cells within the include area and outside the exclude area.
+            Note that when include and exclude areas are used, the elevation range is
+            only applied on cells within the include area and outside the exclude area.
         reset_bounds: bool, optional
             If True, reset existing boundary cells of the selected boundary
             type (`btype`) before setting new boundary cells, by default False.
@@ -536,7 +544,7 @@ def setup_mask_bounds(

         # get include / exclude geometries
         gdf_include, gdf_exclude = None, None
-        bbox = self.region.to_crs(4326).total_bounds
+        bbox = self.mask.raster.transform_bounds(4326)
         if include_mask is not None:
             if not isinstance(include_mask, gpd.GeoDataFrame) and str(
                 include_mask
@@ -609,9 +617,15 @@ def setup_subgrid(
         ----------
         datasets_dep : List[dict]
             List of dictionaries with topobathy data.
-            Each should minimally contain a data catalog source name, data file path, or xarray raster object ('elevtn')
-            Optional merge arguments include 'zmin', 'zmax', 'mask', 'offset', 'reproj_method', and 'merge_method'.
-            e.g.: [{'elevtn': merit_hydro, 'zmin': 0.01}, {'elevtn': gebco, 'offset': 0, 'merge_method': 'first', reproj_method: 'bilinear'}]
+            Each should minimally contain a data catalog source name, data file path,
+            or xarray raster object ('elevtn').
+            Optional merge arguments include 'zmin', 'zmax', 'mask', 'offset',
+            'reproj_method', and 'merge_method'. e.g.:
+            [
+                {'elevtn': merit_hydro, 'zmin': 0.01},
+                {'elevtn': gebco, 'offset': 0, 'merge_method': 'first', 'reproj_method': 'bilinear'}
+            ]
             For a complete overview of all merge options, see :py:func:`~hydromt.workflows.merge_multi_dataarrays`
         datasets_rgh : List[dict], optional
             List of dictionaries with Manning's n datasets. Each dictionary should at least contain one of the following:
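As a usage illustration of the reflowed `datasets_dep` docstring above, a hypothetical list of topobathy datasets; the source names `merit_hydro` and `gebco` are assumed entries in the user's data catalog:

# Hypothetical datasets_dep list following the docstring; merge keys as documented.
datasets_dep = [
    {"elevtn": "merit_hydro", "zmin": 0.01},
    {"elevtn": "gebco", "offset": 0, "merge_method": "first", "reproj_method": "bilinear"},
]
# sf.setup_subgrid(datasets_dep=datasets_dep)  # sf: an initialized SfincsModel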
@@ -762,7 +776,7 @@ def setup_river_inflow(
         if hydrography is not None:
             ds = self.data_catalog.get_rasterdataset(
                 hydrography,
-                geom=self.region,
+                bbox=self.mask.raster.transform_bounds(4326),
                 variables=["uparea", "flwdir"],
                 buffer=5,
             )
@@ -881,7 +895,7 @@ def setup_river_outflow(
         if hydrography is not None:
             ds = self.data_catalog.get_rasterdataset(
                 hydrography,
-                geom=self.region,
+                bbox=self.mask.raster.transform_bounds(4326),
                 variables=["uparea", "flwdir"],
                 buffer=5,
             )
@@ -1040,7 +1054,9 @@ def setup_cn_infiltration(self, cn, antecedent_moisture="avg", reproj_method="med"):
             By default 'med'. For more information see :py:meth:`hydromt.raster.RasterDataArray.reproject_like`
         """
         # get data
-        da_org = self.data_catalog.get_rasterdataset(cn, geom=self.region, buffer=10)
+        da_org = self.data_catalog.get_rasterdataset(
+            cn, bbox=self.mask.raster.transform_bounds(4326), buffer=10
+        )
         # read variable
         v = "cn"
         if antecedent_moisture:
@@ -1089,10 +1105,14 @@ def setup_cn_infiltration_with_kr(

         # Read the datafiles
         da_landuse = self.data_catalog.get_rasterdataset(
-            lulc, geom=self.region, buffer=10
+            lulc, bbox=self.mask.raster.transform_bounds(4326), buffer=10
         )
-        da_HSG = self.data_catalog.get_rasterdataset(hsg, geom=self.region, buffer=10)
-        da_Ksat = self.data_catalog.get_rasterdataset(ksat, geom=self.region, buffer=10)
+        da_HSG = self.data_catalog.get_rasterdataset(
+            hsg, bbox=self.mask.raster.transform_bounds(4326), buffer=10
+        )
+        da_Ksat = self.data_catalog.get_rasterdataset(
+            ksat, bbox=self.mask.raster.transform_bounds(4326), buffer=10
+        )
         df_map = self.data_catalog.get_dataframe(reclass_table, index_col=0)

         # Define outputs
@@ -1655,7 +1675,9 @@ def setup_waterlevel_forcing(
             df_ts += offset
         else:
             da_offset = self.data_catalog.get_rasterdataset(
-                offset, geom=self.region, buffer=5
+                offset,
+                bbox=self.mask.raster.transform_bounds(4326),
+                buffer=5,
             )
             offset_pnts = da_offset.raster.sample(gdf_locs)
             df_offset = offset_pnts.to_pandas().reindex(df_ts.columns).fillna(0)
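A sketch of the offset-sampling step above with dummy data: `raster.sample` returns the raster values at point geometries, which are then aligned to the forcing columns (the gauge ids 1 and 2 are made up):

import geopandas as gpd
import numpy as np
import xarray as xr
import hydromt  # registers the .raster accessor
from shapely.geometry import Point

# Dummy vertical-offset grid and two gauge locations.
da_offset = xr.DataArray(
    np.array([[0.1, 0.2], [0.3, 0.4]]),
    coords={"y": [1.0, 0.0], "x": [0.0, 1.0]},
    dims=("y", "x"),
)
da_offset.raster.set_crs(4326)
gdf_locs = gpd.GeoDataFrame(
    index=[1, 2], geometry=[Point(0.0, 1.0), Point(1.0, 0.0)], crs=4326
)

# Sample at the gauge points, align to the forcing columns, default to 0.
offset_pnts = da_offset.raster.sample(gdf_locs)
df_offset = offset_pnts.to_pandas().reindex([1, 2]).fillna(0)
print(df_offset)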
@@ -1861,15 +1883,18 @@ def setup_discharge_forcing_from_grid(
         # read data
         ds = self.data_catalog.get_rasterdataset(
             discharge,
-            geom=self.region,
+            bbox=self.mask.raster.transform_bounds(4326),
             buffer=2,
             time_tuple=self.get_model_time(),  # model time
             variables=["discharge"],
             single_var_as_array=False,
         )
         if uparea is not None and "uparea" in gdf.columns:
             da_upa = self.data_catalog.get_rasterdataset(
-                uparea, geom=self.region, buffer=2, variables=["uparea"]
+                uparea,
+                bbox=self.mask.raster.transform_bounds(4326),
+                buffer=2,
+                variables=["uparea"],
             )
             # make sure ds and da_upa align
             ds["uparea"] = da_upa.raster.reproject_like(ds, method="nearest")
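A sketch of the alignment step above: `reproject_like` resamples one grid onto another grid's transform and shape so cells match one-to-one (dummy WGS84 grids, nearest-neighbour resampling as in the hunk):

import numpy as np
import xarray as xr
import hydromt  # registers the .raster accessor

def make_grid(data, n):
    # helper: n x n grid over a ~1 degree extent in WGS84
    da = xr.DataArray(
        data,
        coords={"y": np.linspace(1.0, 0.0, n), "x": np.linspace(0.0, 1.0, n)},
        dims=("y", "x"),
    )
    da.raster.set_crs(4326)
    da.raster.set_nodata(-9999.0)
    return da

da_q = make_grid(np.zeros((4, 4)), 4)    # stands in for the discharge grid
da_upa = make_grid(np.ones((8, 8)), 8)   # stands in for the uparea grid

# Snap the uparea grid onto the (coarser) discharge grid so cells align.
da_upa_aligned = da_upa.raster.reproject_like(da_q, method="nearest")
assert da_upa_aligned.shape == da_q.shape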
@@ -1929,7 +1954,7 @@ def setup_precip_forcing_from_grid(
         # get data for model domain and config time range
         precip = self.data_catalog.get_rasterdataset(
             precip,
-            geom=self.region,
+            bbox=self.mask.raster.transform_bounds(4326),
             buffer=2,
             time_tuple=self.get_model_time(),
             variables=["precip"],
@@ -3189,17 +3214,21 @@ def get_model_time(self):

     ## helper method
     def _parse_datasets_dep(self, datasets_dep, res):
-        """Parse filenames or paths of Datasets in list of dictionaries datasets_dep into xr.DataArray and gdf.GeoDataFrames:
+        """Parse filenames or paths of Datasets in list of dictionaries datasets_dep
+        into xr.DataArray and gdf.GeoDataFrames:

         * "elevtn" is parsed into da (xr.DataArray)
         * "offset" is parsed into da_offset (xr.DataArray)
         * "mask" is parsed into gdf (gpd.GeoDataFrame)

         Parameters
         ----------
         datasets_dep : List[dict]
-            List of dictionaries with topobathy data, each containing a dataset name or Path (dep) and optional merge arguments.
+            List of dictionaries with topobathy data, each containing a dataset name or
+            Path (dep) and optional merge arguments.
         res : float
-            Resolution of the model grid in meters. Used to obtain the correct zoom level of the depth datasets.
+            Resolution of the model grid in meters. Used to obtain the correct zoom
+            level of the depth datasets.
         """
         parse_keys = ["elevtn", "offset", "mask", "da"]
         copy_keys = ["zmin", "zmax", "reproj_method", "merge_method"]
@@ -3212,18 +3241,17 @@ def _parse_datasets_dep(self, datasets_dep, res):
                 try:
                     da_elv = self.data_catalog.get_rasterdataset(
                         dataset.get("elevtn", dataset.get("da")),
-                        geom=self.mask.raster.box,
+                        bbox=self.mask.raster.transform_bounds(4326),
                         buffer=10,
                         variables=["elevtn"],
                         zoom_level=(res, "meter"),
                     )
-                    dd.update({"da": da_elv})
-                except:
+                # TODO remove ValueError after fix in hydromt core
+                except (IndexError, ValueError):
                     data_name = dataset.get("elevtn")
-                    self.logger.warning(
-                        f"{data_name} not used; probably because all the data is outside of the mask."
-                    )
+                    self.logger.warning(f"No data in domain for {data_name}, skipped.")
                     continue
+                dd.update({"da": da_elv})
             else:
                 raise ValueError(
                     "No 'elevtn' (topobathy) dataset provided in datasets_dep."
@@ -3234,16 +3262,16 @@ def _parse_datasets_dep(self, datasets_dep, res):
             if "offset" in dataset and not isinstance(dataset["offset"], (float, int)):
                 da_offset = self.data_catalog.get_rasterdataset(
                     dataset.get("offset"),
-                    geom=self.mask.raster.box,
-                    buffer=20,
+                    bbox=self.mask.raster.transform_bounds(4326),
+                    buffer=10,
                 )
                 dd.update({"offset": da_offset})

             # read geodataframes describing valid areas
             if "mask" in dataset:
                 gdf_valid = self.data_catalog.get_geodataframe(
                     path_or_key=dataset.get("mask"),
-                    geom=self.mask.raster.box,
+                    bbox=self.mask.raster.transform_bounds(4326),
                 )
                 dd.update({"gdf_valid": gdf_valid})

@@ -3258,17 +3286,21 @@ def _parse_datasets_dep(self, datasets_dep, res):
         return datasets_out

     def _parse_datasets_rgh(self, datasets_rgh):
-        """Parse filenames or paths of Datasets in list of dictionaries datasets_rgh into xr.DataArrays and gdf.GeoDataFrames:
+        """Parse filenames or paths of Datasets in list of dictionaries datasets_rgh
+        into xr.DataArrays and gdf.GeoDataFrames:

         * "manning" is parsed into da (xr.DataArray)
-        * "lulc" is parsed into da (xr.DataArray) using reclassify table in "reclass_table"
+        * "lulc" is parsed into da (xr.DataArray) using reclass table in "reclass_table"
         * "mask" is parsed into gdf_valid (gpd.GeoDataFrame)

         Parameters
         ----------
         datasets_rgh : List[dict], optional
-            List of dictionaries with Manning's n datasets. Each dictionary should at least contain one of the following:
+            List of dictionaries with Manning's n datasets. Each dictionary should at
+            least contain one of the following:
             * (1) manning: filename (or Path) of gridded data with manning values
-            * (2) lulc (and reclass_table) :a combination of a filename of gridded landuse/landcover and a reclassify table.
+            * (2) lulc (and reclass_table): a combination of a filename of gridded
+              landuse/landcover and a reclassify table.
             In addition, optional merge arguments can be provided e.g.: merge_method, mask
         """
         parse_keys = ["manning", "lulc", "reclass_table", "mask", "da"]
@@ -3281,7 +3313,7 @@ def _parse_datasets_rgh(self, datasets_rgh):
             if "manning" in dataset or "da" in dataset:
                 da_man = self.data_catalog.get_rasterdataset(
                     dataset.get("manning", dataset.get("da")),
-                    geom=self.mask.raster.box,
+                    bbox=self.mask.raster.transform_bounds(4326),
                     buffer=10,
                 )
                 dd.update({"da": da_man})
@@ -3291,12 +3323,15 @@ def _parse_datasets_rgh(self, datasets_rgh):
                 reclass_table = dataset.get("reclass_table", None)
                 if reclass_table is None and isinstance(lulc, str):
                     reclass_table = join(DATADIR, "lulc", f"{lulc}_mapping.csv")
-                if not os.path.isfile(reclass_table) and isinstance(lulc, str):
+                if reclass_table is None:
                     raise IOError(
                         f"Manning roughness mapping file not found: {reclass_table}"
                     )
                 da_lulc = self.data_catalog.get_rasterdataset(
-                    lulc, geom=self.mask.raster.box, buffer=10, variables=["lulc"]
+                    lulc,
+                    bbox=self.mask.raster.transform_bounds(4326),
+                    buffer=10,
+                    variables=["lulc"],
                 )
                 df_map = self.data_catalog.get_dataframe(reclass_table, index_col=0)
                 # reclassify
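A sketch of the reclassification that follows these lines: landuse codes are mapped to Manning's n through a table indexed by lulc class. It assumes hydromt's `raster.reclassify`, which returns a Dataset with one variable per table column; the codes and roughness values below are made up:

import numpy as np
import pandas as pd
import xarray as xr
import hydromt  # registers the .raster accessor

# Dummy landuse grid; codes stand in for real land cover classes.
da_lulc = xr.DataArray(
    np.array([[10, 20], [20, 30]]),
    coords={"y": [1.0, 0.0], "x": [0.0, 1.0]},
    dims=("y", "x"),
)
da_lulc.raster.set_crs(4326)

# Reclass table: index = lulc code, column "N" = Manning's n (illustrative).
df_map = pd.DataFrame({"N": [0.10, 0.04, 0.025]}, index=[10, 20, 30])

ds_man = da_lulc.raster.reclassify(df_map)  # Dataset with variable "N"
da_man = ds_man["N"]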
@@ -3309,7 +3344,7 @@ def _parse_datasets_rgh(self, datasets_rgh):
             if "mask" in dataset:
                 gdf_valid = self.data_catalog.get_geodataframe(
                     path_or_key=dataset.get("mask"),
-                    geom=self.mask.raster.box,
+                    bbox=self.mask.raster.transform_bounds(4326),
                 )
                 dd.update({"gdf_valid": gdf_valid})

