diff --git a/.github/actions/python_build/action.yml b/.github/actions/python_build/action.yml index 17e0c53f6..002cd8499 100644 --- a/.github/actions/python_build/action.yml +++ b/.github/actions/python_build/action.yml @@ -16,6 +16,11 @@ runs: pip install build wheel pyspark==${{ matrix.spark }} numpy==${{ matrix.numpy }} pip install --no-build-isolation --no-cache-dir --force-reinstall gdal==${{ matrix.gdal }} pip install . + - name: Give Python interpreter write access to checkpointing / raster write location + shell: bash + run: | + sudo mkdir -p /mnt/mosaic_tmp + sudo chmod -R 777 /mnt/mosaic_tmp - name: Test and build python package shell: bash run: | diff --git a/.github/workflows/build_main.yml b/.github/workflows/build_main.yml index 8a0660111..2495106ba 100644 --- a/.github/workflows/build_main.yml +++ b/.github/workflows/build_main.yml @@ -11,7 +11,7 @@ on: - "**" jobs: build: - runs-on: ubuntu-22.04 + runs-on: larger env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} strategy: diff --git a/.github/workflows/build_python.yml b/.github/workflows/build_python.yml index 1b766a2e3..cd1e62250 100644 --- a/.github/workflows/build_python.yml +++ b/.github/workflows/build_python.yml @@ -7,7 +7,7 @@ on: jobs: build: - runs-on: ubuntu-22.04 + runs-on: larger env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} strategy: diff --git a/.github/workflows/build_r.yml b/.github/workflows/build_r.yml index 4c6ad732f..16423758d 100644 --- a/.github/workflows/build_r.yml +++ b/.github/workflows/build_r.yml @@ -8,7 +8,7 @@ on: jobs: build: - runs-on: ubuntu-22.04 + runs-on: larger env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} strategy: diff --git a/.github/workflows/build_scala.yml b/.github/workflows/build_scala.yml index b804b491e..e27c58f35 100644 --- a/.github/workflows/build_scala.yml +++ b/.github/workflows/build_scala.yml @@ -6,7 +6,7 @@ on: jobs: build: - runs-on: ubuntu-22.04 + runs-on: larger env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} strategy: diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a9ff6687..8edf1cde5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,57 @@ ## v0.4.3 [DBR 13.3 LTS] + +This is the final mainline release of Mosaic. Future development will be focused on the planned spatial-utils library, which will be a successor to Mosaic and will include new features and improvements. The first release of spatial-utils is expected in the coming months. + +We will continue to maintain Mosaic for the foreseeable future, including bug fixes and security updates. However, we recommend that users start transitioning to spatial-utils as soon as possible to take advantage of the new features and improvements that will be available in that library. + +This release includes a number of enhancements and fixes, detailed below. + +### Raster checkpointing is enabled by default +Fuse-based checkpointing for raster operations is now enabled by default and managed through: +- spark configs `spark.databricks.labs.mosaic.raster.use.checkpoint` and `spark.databricks.labs.mosaic.raster.checkpoint`. +- python: `mos.enable_gdal(spark, with_checkpoint_path=path)`. +- scala: `MosaicGDAL.enableGDALWithCheckpoint(spark, path)`. + +This feature is designed to improve performance and reduce memory usage for raster operations by writing intermediate data to a fuse directory. This is particularly useful for large rasters or when working with many rasters in a single operation. 
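+
+For illustration, a minimal sketch of enabling checkpointing from Python (the checkpoint path shown is an assumption; any fuse-accessible directory works):
+
+```python
+import mosaic as mos
+
+mos.enable_mosaic(spark, dbutils)
+# write intermediate tiles to a fuse directory instead of carrying them in memory
+mos.enable_gdal(spark, with_checkpoint_path="/dbfs/tmp/mosaic/checkpoint")
+```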
+
+We plan further enhancements to this feature (including automatic cleanup of checkpoint locations) as part of the first release of spatial-utils.
+
+### Enhancements and fixes to the raster processing APIs
+ - Added `RST_Write`, a function that permits writing each raster 'tile' in a DataFrame to a specified location (e.g. fuse directory) using the appropriate GDAL driver and tile data / path. This is useful for formalizing the path when writing a Lakehouse table and allows removal of interim checkpointed data.
+ - Python bindings added for `RST_Avg`, `RST_Max`, `RST_Median`, `RST_Min`, and `RST_PixelCount`.
+ - `RST_PixelCount` now supports optional 'countNoData' and 'countMask' parameters (both default to false) to optionally include pixels in the count where the mask is 0.0 or where the pixel value matches the noData value configured in the tile.
+ - `RST_Clip` now exposes the GDAL Warp option `CUTLINE_ALL_TOUCHED`, which determines whether a given pixel is included when the clipping geometry crosses the centre point of the pixel (false) or any part of the pixel (true). The default is true but this is now configurable.
+ - Within clipping operations such as `RST_Clip` we now correctly set the CRS in the generated Shapefile Feature Layer used for clipping. This means that the CRS of the input geometry will be respected when clipping rasters.
+ - Added two new functions for getting and upcasting the datatype of a raster band: `RST_Type` and `RST_UpdateType`. Use these to ensure that the datatype of a raster is appropriate for the operations being performed, e.g. upcasting the types of integer-typed input rasters before performing raster algebra like NDVI calculations where the result needs to be a float.
+ - The logic underpinning `RST_MemSize` (and related operations) has been updated to fall back to estimating based on the raster dimensions and data types of each band if the raster is held in-memory.
+ - `RST_To_Overlapping_Tiles` is renamed `RST_ToOverlappingTiles`. The original expression remains but is marked as deprecated.
+ - `RST_WorldToRasterCoordY` now returns the correct `y` value (it was previously returning `x`).
+ - Docs added for expression `RST_SetSRID`.
+ - Docs updated for `RST_FromContent` to capture the optional 'driver' parameter.
+
+### Dependency management
+Updates to and pinning of Python language and dependency versions:
 - Pyspark requirement removed from python setup.cfg as it is supplied by DBR
 - Python version limited to "<3.11,>=3.10" for DBR
-- iPython dependency limited to "<8.11,>=7.4.2" for both DBR and keplergl-jupyter
-- Expanded support for fuse-based checkpointing (persisted raster storage), managed through:
-  - spark config 'spark.databricks.labs.mosaic.raster.use.checkpoint' in addition to 'spark.databricks.labs.mosaic.raster.checkpoint'.
-  - python: `mos.enable_gdal(spark, with_checkpoint_path=path)`.
-  - scala: `MosaicGDAL.enableGDALWithCheckpoint(spark, path)`.
+- iPython dependency limited to "<8.11,>=7.4.2" for both DBR and keplergl-jupyter
+- numpy now limited to "<2.0,>=1.21.5" to match DBR minimum
+
+### Surface mesh APIs
+A set of experimental APIs for creating and working with surface meshes (i.e. triangulated irregular networks) has been added to Mosaic. Users can now generate a conforming Delaunay triangulation over point data (optionally including 'break' lines as hard constraints), interpolate elevation over a regular grid and rasterize the results to produce terrain models. The new expressions are:
+ - `ST_Triangulate` performs a conforming Delaunay triangulation using a set of mass points and break lines.
+ - `ST_InterpolateElevation` computes the interpolated elevations of a grid of points.
+ - `RST_DTMFromGeoms` burns the interpolated elevations into a raster.
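+
+For illustration, a minimal sketch of the triangulation step from Python (the binding name and argument order are assumed to mirror the R and SQL examples elsewhere in this release; column names are illustrative):
+
+```python
+from pyspark.sql import functions as F
+import mosaic as mos
+
+points_df = (
+    spark.createDataFrame(
+        [["POINT Z (3 2 1)"], ["POINT Z (2 1 0)"], ["POINT Z (1 3 3)"], ["POINT Z (0 2 2)"]],
+        ["wkt"],
+    )
+    .groupBy()
+    .agg(F.collect_list("wkt").alias("masspoints"))
+    .withColumn("breaklines", F.array(F.lit("LINESTRING EMPTY")))
+)
+
+# args: mass points, break lines, merge tolerance, snap tolerance, split point finder
+tri_df = points_df.withColumn(
+    "triangles",
+    mos.st_triangulate("masspoints", "breaklines", F.lit(0.0), F.lit(0.01), F.lit("NONENCROACHING")),
+)
+```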
+
+### British National Grid
+Two fixes have been made to the British National Grid indexing system:
+- Corrected a typo in the grid letter array used to perform lookups.
+- Updated the logic used for identifying quadrants when these are specified in a grid reference.
+
+### Documentation
+A few updates to our documentation and examples library:
+- An example walkthrough has been added for arbitrary GDAL Warp and Transform operations using a pyspark UDF (see the section "API Documentation / Rasterio + GDAL UDFs")
+- The Python "Quickstart Notebook" has been updated to use the `MosaicAnalyzer` class (added after `MosaicFrame` was deprecated)
+
 ## v0.4.2 [DBR 13.3 LTS]
 - Geopandas now fixed to "<0.14.4,>=0.14" due to conflict with minimum numpy version in geopandas 0.14.4.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index ea82645e9..8af086612 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -102,7 +102,7 @@ The python bindings can be tested using [unittest](https://docs.python.org/3/lib
 - Move to the `python/` directory and install the project and its dependencies:
   `pip install . && pip install pyspark==<project_spark_version>`
   (where 'project_spark_version' corresponds to the version of Spark
-  used for the target Databricks Runtime, e.g. `3.2.1`.
+  used for the target Databricks Runtime, e.g. `3.4.1` for DBR 13.3 LTS.)
 - Run the tests using `unittest`: `python -m unittest`

 The project wheel file can be built with [build](https://pypa-build.readthedocs.io/en/stable/).
diff --git a/R/.gitignore b/R/.gitignore
index f5b37c1ec..a5dfb7e50 100644
--- a/R/.gitignore
+++ b/R/.gitignore
@@ -1,3 +1,9 @@
 **/.Rhistory
 **/*.tar.gz
+**/*.Rproj
 /sparklyr-mosaic/metastore_db/
+/sparklyr-mosaic/mosaic_checkpoint/
+/sparklyr-mosaic/mosaic_tmp/
+/sparkr-mosaic/metastore_db/
+/sparkr-mosaic/mosaic_checkpoint/
+/sparkr-mosaic/mosaic_tmp/
\ No newline at end of file
diff --git a/R/sparkR-mosaic/SparkR.Rproj b/R/sparkR-mosaic/SparkR.Rproj
deleted file mode 100644
index 8e3c2ebc9..000000000
--- a/R/sparkR-mosaic/SparkR.Rproj
+++ /dev/null
@@ -1,13 +0,0 @@
-Version: 1.0
-
-RestoreWorkspace: Default
-SaveWorkspace: Default
-AlwaysSaveHistory: Default
-
-EnableCodeIndexing: Yes
-UseSpacesForTab: Yes
-NumSpacesForTab: 2
-Encoding: UTF-8
-
-RnwWeave: Sweave
-LaTeX: pdfLaTeX
diff --git a/R/sparkR-mosaic/sparkrMosaic/DESCRIPTION b/R/sparkR-mosaic/sparkrMosaic/DESCRIPTION
index 2ea7718f9..c740cc011 100644
--- a/R/sparkR-mosaic/sparkrMosaic/DESCRIPTION
+++ b/R/sparkR-mosaic/sparkrMosaic/DESCRIPTION
@@ -8,7 +8,7 @@ Description: This package extends SparkR to bring the Databricks Mosaic for geos
 License: Databricks
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 Collate:
     'enableGDAL.R'
     'enableMosaic.R'
diff --git a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data/sd46_dtm_breakline.zip b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data/sd46_dtm_breakline.zip
new file mode 100644
index 000000000..4f7f3d57a
Binary files /dev/null and b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data/sd46_dtm_breakline.zip differ
diff --git a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data/sd46_dtm_point.zip b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data/sd46_dtm_point.zip
new file mode 100644
index 000000000..825fff818
Binary files /dev/null and
b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data/sd46_dtm_point.zip differ diff --git a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testRasterFunctions.R b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testRasterFunctions.R index 6e23454dc..13e78ac24 100644 --- a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testRasterFunctions.R +++ b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testRasterFunctions.R @@ -8,6 +8,7 @@ generate_singleband_raster_df <- function() { test_that("mosaic can read single-band GeoTiff", { sdf <- generate_singleband_raster_df() + row <- first(sdf) expect_equal(row$length, 1067862L) expect_equal(row$x_size, 2400) @@ -34,12 +35,15 @@ test_that("scalar raster functions behave as intended", { sdf <- withColumn(sdf, "rst_scaley", rst_scaley(column("tile"))) sdf <- withColumn(sdf, "rst_srid", rst_srid(column("tile"))) sdf <- withColumn(sdf, "rst_summary", rst_summary(column("tile"))) + sdf <- withColumn(sdf, "rst_type", rst_type(column("tile"))) + sdf <- withColumn(sdf, "rst_updatetype", rst_updatetype(column("tile"), lit("Float32"))) sdf <- withColumn(sdf, "rst_upperleftx", rst_upperleftx(column("tile"))) sdf <- withColumn(sdf, "rst_upperlefty", rst_upperlefty(column("tile"))) sdf <- withColumn(sdf, "rst_width", rst_width(column("tile"))) sdf <- withColumn(sdf, "rst_worldtorastercoordx", rst_worldtorastercoordx(column("tile"), lit(0.0), lit(0.0))) sdf <- withColumn(sdf, "rst_worldtorastercoordy", rst_worldtorastercoordy(column("tile"), lit(0.0), lit(0.0))) sdf <- withColumn(sdf, "rst_worldtorastercoord", rst_worldtorastercoord(column("tile"), lit(0.0), lit(0.0))) + sdf <- withColumn(sdf, "rst_write", rst_write(column("tile"), lit("/mnt/mosaic_tmp/write-raster"))) expect_no_error(write.df(sdf, source = "noop", mode = "overwrite")) }) @@ -64,7 +68,7 @@ test_that("raster flatmap functions behave as intended", { expect_equal(nrow(tessellate_sdf), 63) overlap_sdf <- generate_singleband_raster_df() - overlap_sdf <- withColumn(overlap_sdf, "rst_to_overlapping_tiles", rst_to_overlapping_tiles(column("tile"), lit(200L), lit(200L), lit(10L))) + overlap_sdf <- withColumn(overlap_sdf, "rst_tooverlappingtiles", rst_tooverlappingtiles(column("tile"), lit(200L), lit(200L), lit(10L))) expect_no_error(write.df(overlap_sdf, source = "noop", mode = "overwrite")) expect_equal(nrow(overlap_sdf), 87) @@ -73,7 +77,7 @@ test_that("raster flatmap functions behave as intended", { test_that("raster aggregation functions behave as intended", { collection_sdf <- generate_singleband_raster_df() collection_sdf <- withColumn(collection_sdf, "extent", st_astext(rst_boundingbox(column("tile")))) - collection_sdf <- withColumn(collection_sdf, "tile", rst_to_overlapping_tiles(column("tile"), lit(200L), lit(200L), lit(10L))) + collection_sdf <- withColumn(collection_sdf, "tile", rst_tooverlappingtiles(column("tile"), lit(200L), lit(200L), lit(10L))) merge_sdf <- summarize( groupBy(collection_sdf, "path"), @@ -124,7 +128,7 @@ test_that("the tessellate-join-clip-merge flow works on NetCDF files", { raster_sdf <- withColumn(raster_sdf, "timestep", element_at(rst_metadata(column("tile")), "NC_GLOBAL#GDAL_MOSAIC_BAND_INDEX")) raster_sdf <- where(raster_sdf, "timestep = 21") raster_sdf <- withColumn(raster_sdf, "tile", rst_setsrid(column("tile"), lit(4326L))) - raster_sdf <- withColumn(raster_sdf, "tile", rst_to_overlapping_tiles(column("tile"), lit(20L), lit(20L), lit(10L))) + raster_sdf <- withColumn(raster_sdf, "tile", rst_tooverlappingtiles(column("tile"), lit(20L), lit(20L), lit(10L))) raster_sdf <- 
withColumn(raster_sdf, "tile", rst_tessellate(column("tile"), lit(target_resolution))) clipped_sdf <- join(raster_sdf, census_sdf, raster_sdf$tile.index_id == census_sdf$index_id) @@ -137,4 +141,33 @@ test_that("the tessellate-join-clip-merge flow works on NetCDF files", { expect_equal(nrow(merged_precipitation), 1) +}) + +test_that("a terrain model can be produced from point geometries", { + +sdf <- createDataFrame( + data.frame( + wkt = c( + "POINT Z (3 2 1)", + "POINT Z (2 1 0)", + "POINT Z (1 3 3)", + "POINT Z (0 2 2)" + ) + ) +) + +sdf <- agg(groupBy(sdf), masspoints = collect_list(column("wkt"))) +sdf <- withColumn(sdf, "breaklines", expr("array('LINESTRING EMPTY')")) +sdf <- withColumn(sdf, "splitPointFinder", lit("NONENCROACHING")) +sdf <- withColumn(sdf, "origin", st_geomfromwkt(lit("POINT (0.6 1.8)"))) +sdf <- withColumn(sdf, "xWidth", lit(12L)) +sdf <- withColumn(sdf, "yWidth", lit(6L)) +sdf <- withColumn(sdf, "xSize", lit(0.1)) +sdf <- withColumn(sdf, "ySize", lit(0.1)) +sdf <- withColumn(sdf, "noData", lit(-9999.0)) +sdf <- withColumn(sdf, "tile", rst_dtmfromgeoms( +column("masspoints"), column("breaklines"), lit(0.0), lit(0.01), column("splitPointFinder"), +column("origin"), column("xWidth"), column("yWidth"), column("xSize"), column("ySize"), column("noData") +)) +expect_equal(SparkR::count(sdf), 1) }) \ No newline at end of file diff --git a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testVectorFunctions.R b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testVectorFunctions.R index 154a4cb0f..07744b2bf 100644 --- a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testVectorFunctions.R +++ b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testVectorFunctions.R @@ -93,4 +93,47 @@ test_that("aggregate vector functions behave as intended", { expect_true(first(sdf.intersection)$comparison_intersects) expect_true(first(sdf.intersection)$comparison_intersection) -}) \ No newline at end of file +}) + +test_that("triangulation / interpolation functions behave as intended", { +sdf <- createDataFrame( + data.frame( + wkt = c( + "POINT Z (3 2 1)", + "POINT Z (2 1 0)", + "POINT Z (1 3 3)", + "POINT Z (0 2 2)" + ) + ) +) + +sdf <- agg(groupBy(sdf), masspoints = collect_list(column("wkt"))) +sdf <- withColumn(sdf, "breaklines", expr("array('LINESTRING EMPTY')")) +triangulation_sdf <- withColumn(sdf, "triangles", st_triangulate(column("masspoints"), column("breaklines"), lit(0.0), lit(0.01), lit("NONENCROACHING"))) +cache(triangulation_sdf) +expect_equal(SparkR::count(triangulation_sdf), 2) +expected <- c("POLYGON Z((0 2 2, 2 1 0, 1 3 3, 0 2 2))", "POLYGON Z((1 3 3, 2 1 0, 3 2 1, 1 3 3))") +expect_contains(expected, first(triangulation_sdf)$triangles) + +interpolation_sdf <- sdf +interpolation_sdf <- withColumn(interpolation_sdf, "origin", st_geomfromwkt(lit("POINT (0.55 1.75)"))) +interpolation_sdf <- withColumn(interpolation_sdf, "xWidth", lit(12L)) +interpolation_sdf <- withColumn(interpolation_sdf, "yWidth", lit(6L)) +interpolation_sdf <- withColumn(interpolation_sdf, "xSize", lit(0.1)) +interpolation_sdf <- withColumn(interpolation_sdf, "ySize", lit(0.1)) +interpolation_sdf <- withColumn(interpolation_sdf, "interpolated", st_interpolateelevation( + column("masspoints"), + column("breaklines"), + lit(0.01), + lit(0.01), + lit("NONENCROACHING"), + column("origin"), + column("xWidth"), + column("yWidth"), + column("xSize"), + column("ySize") +)) +cache(interpolation_sdf) +expect_equal(SparkR::count(interpolation_sdf), 6 * 12) +expect_contains(collect(interpolation_sdf)$interpolated, "POINT Z(1.6 1.8 
1.2)") +}) diff --git a/R/sparklyr-mosaic/sparklyrMosaic/DESCRIPTION b/R/sparklyr-mosaic/sparklyrMosaic/DESCRIPTION index 4ce2f2be1..7fa449710 100644 --- a/R/sparklyr-mosaic/sparklyrMosaic/DESCRIPTION +++ b/R/sparklyr-mosaic/sparklyrMosaic/DESCRIPTION @@ -8,7 +8,7 @@ Description: This package extends sparklyr to bring the Databricks Mosaic for ge License: Databricks Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Collate: 'enableGDAL.R' 'enableMosaic.R' diff --git a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data/sd46_dtm_breakline.zip b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data/sd46_dtm_breakline.zip new file mode 100644 index 000000000..4f7f3d57a Binary files /dev/null and b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data/sd46_dtm_breakline.zip differ diff --git a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data/sd46_dtm_point.zip b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data/sd46_dtm_point.zip new file mode 100644 index 000000000..825fff818 Binary files /dev/null and b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data/sd46_dtm_point.zip differ diff --git a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testRasterFunctions.R b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testRasterFunctions.R index 3cf016fa7..d67888725 100644 --- a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testRasterFunctions.R +++ b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testRasterFunctions.R @@ -29,9 +29,9 @@ test_that("scalar raster functions behave as intended", { mutate(rst_boundingbox = rst_boundingbox(tile)) %>% mutate(rst_boundingbox = st_buffer(rst_boundingbox, -0.001)) %>% mutate(rst_clip = rst_clip(tile, rst_boundingbox)) %>% - mutate(rst_combineavg = rst_combineavg(array(tile, rst_clip))) %>% - mutate(rst_frombands = rst_frombands(array(tile, tile))) %>% mutate(rst_fromfile = rst_fromfile(path, -1L)) %>% + mutate(rst_combineavg = rst_combineavg(array(rst_fromfile, rst_clip))) %>% + mutate(rst_frombands = rst_frombands(array(tile, tile))) %>% mutate(rst_georeference = rst_georeference(tile)) %>% mutate(rst_getnodata = rst_getnodata(tile)) %>% mutate(rst_subdatasets = rst_subdatasets(tile)) %>% @@ -63,12 +63,15 @@ test_that("scalar raster functions behave as intended", { mutate(rst_scaley = rst_scaley(tile)) %>% mutate(rst_srid = rst_srid(tile)) %>% mutate(rst_summary = rst_summary(tile)) %>% + mutate(rst_type = rst_type(tile)) %>% + mutate(rst_updatetype = rst_updatetype(tile, "Float32")) %>% mutate(rst_upperleftx = rst_upperleftx(tile)) %>% mutate(rst_upperlefty = rst_upperlefty(tile)) %>% mutate(rst_width = rst_width(tile)) %>% mutate(rst_worldtorastercoordx = rst_worldtorastercoordx(tile, as.double(0.0), as.double(0.0))) %>% mutate(rst_worldtorastercoordy = rst_worldtorastercoordy(tile, as.double(0.0), as.double(0.0))) %>% - mutate(rst_worldtorastercoord = rst_worldtorastercoord(tile, as.double(0.0), as.double(0.0))) + mutate(rst_worldtorastercoord = rst_worldtorastercoord(tile, as.double(0.0), as.double(0.0))) %>% + mutate(rst_write = rst_write(tile, "/mnt/mosaic_tmp/write-raster")) expect_no_error(spark_write_source(sdf, "noop", mode = "overwrite")) }) @@ -93,7 +96,7 @@ test_that("raster flatmap functions behave as intended", { expect_equal(sdf_nrow(tessellate_sdf), 63) overlap_sdf <- generate_singleband_raster_df() %>% - mutate(rst_to_overlapping_tiles = rst_to_overlapping_tiles(tile, 200L, 200L, 10L)) + mutate(rst_tooverlappingtiles = rst_tooverlappingtiles(tile, 200L, 200L, 10L)) 
expect_no_error(spark_write_source(overlap_sdf, "noop", mode = "overwrite")) expect_equal(sdf_nrow(overlap_sdf), 87) @@ -103,7 +106,7 @@ test_that("raster flatmap functions behave as intended", { test_that("raster aggregation functions behave as intended", { collection_sdf <- generate_singleband_raster_df() %>% mutate(extent = st_astext(rst_boundingbox(tile))) %>% - mutate(tile = rst_to_overlapping_tiles(tile, 200L, 200L, 10L)) + mutate(tile = rst_tooverlappingtiles(tile, 200L, 200L, 10L)) merge_sdf <- collection_sdf %>% group_by(path) %>% @@ -165,7 +168,7 @@ test_that("the tessellate-join-clip-merge flow works on NetCDF files", { indexed_raster_sdf <- sdf_sql(sc, "SELECT tile, element_at(rst_metadata(tile), 'NC_GLOBAL#GDAL_MOSAIC_BAND_INDEX') as timestep FROM raster") %>% filter(timestep == 21L) %>% mutate(tile = rst_setsrid(tile, 4326L)) %>% - mutate(tile = rst_to_overlapping_tiles(tile, 20L, 20L, 10L)) %>% + mutate(tile = rst_tooverlappingtiles(tile, 20L, 20L, 10L)) %>% mutate(tile = rst_tessellate(tile, target_resolution)) clipped_sdf <- indexed_raster_sdf %>% @@ -173,9 +176,48 @@ test_that("the tessellate-join-clip-merge flow works on NetCDF files", { inner_join(census_sdf, by = "index_id") %>% mutate(tile = rst_clip(tile, wkb)) + merged_precipitation <- clipped_sdf %>% group_by(region_keys, timestep) %>% summarise(tile = rst_merge_agg(tile)) expect_equal(sdf_nrow(merged_precipitation), 1) +}) + +test_that ("a terrain model can be produced from point geometries", { + + sdf <- sdf_copy_to(sc, data.frame( + wkt = c( + "POINT Z (3 2 1)", + "POINT Z (2 1 0)", + "POINT Z (1 3 3)", + "POINT Z (0 2 2)" + ) + ) + ) %>% + group_by() %>% + summarise(masspoints = collect_list("wkt")) %>% + mutate( + breaklines = array("LINESTRING EMPTY"), + origin = st_geomfromwkt("POINT (0.6 1.8)"), + xWidth = 12L, + yWidth = 6L, + xSize = as.double(0.1), + ySize = as.double(0.1), + tile = rst_dtmfromgeoms( + masspoints, + breaklines, + as.double(0.0), + as.double(0.01), + "NONENCROACHING", + origin, + xWidth, + yWidth, + xSize, + ySize, + as.double(-9999.0) + ) + ) + expect_equal(sdf_nrow(sdf), 1) + }) \ No newline at end of file diff --git a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testVectorFunctions.R b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testVectorFunctions.R index 7aa5addda..c8baaf572 100644 --- a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testVectorFunctions.R +++ b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testVectorFunctions.R @@ -1,14 +1,12 @@ options(warn = -1) test_that("scalar vector functions behave as intended", { - sdf_raw <- sdf_copy_to( sc, data.frame( wkt = "POLYGON ((2 1, 1 2, 2 3, 2 1))", - point_wkt = "POINT (1 1)" + point_wkt = "POINT (1 1)") ) - ) sdf <- sdf_raw %>% mutate( st_area = st_area(wkt), @@ -24,7 +22,12 @@ test_that("scalar vector functions behave as intended", { st_rotate = st_rotate(wkt, 1L), st_centroid = st_centroid(wkt), st_numpoints = st_numpoints(wkt), - st_haversine = st_haversine(as.double(0.0), as.double(90.0), as.double(0.0), as.double(0.0)), + st_haversine = st_haversine( + as.double(0.0), + as.double(90.0), + as.double(0.0), + as.double(0.0) + ), st_isvalid = st_isvalid(wkt), st_hasvalidcoordinates = st_hasvalidcoordinates(wkt, "EPSG:2192", "bounds"), st_intersects = st_intersects(wkt, wkt), @@ -75,33 +78,28 @@ test_that("scalar vector functions behave as intended", { }) test_that("aggregate vector functions behave as intended", { - inputGJ <- read_file("data/boroughs.geojson") sdf <- sdf_sql(sc, "SELECT id as location_id FROM 
range(1)") %>% mutate(geometry = st_geomfromgeojson(inputGJ)) expect_equal(sdf_nrow(sdf), 1) sdf.l <- sdf %>% - select( - left_id = location_id, - left_geom = geometry - ) %>% + select(left_id = location_id, left_geom = geometry) %>% mutate(left_index = mosaic_explode(left_geom, 11L)) sdf.r <- sdf %>% - select( - right_id = location_id, - right_geom = geometry - ) %>% + select(right_id = location_id, right_geom = geometry) %>% mutate(right_geom = st_translate( right_geom, st_area(right_geom) * runif(n()) * 0.1, - st_area(right_geom) * runif(n()) * 0.1) - ) %>% + st_area(right_geom) * runif(n()) * 0.1 + )) %>% mutate(right_index = mosaic_explode(right_geom, 11L)) sdf.intersection <- sdf.l %>% - inner_join(sdf.r, by = c("left_index" = "right_index"), keep = TRUE) %>% + inner_join(sdf.r, + by = c("left_index" = "right_index"), + keep = TRUE) %>% group_by(left_id, right_id) %>% summarise( agg_intersects = st_intersects_agg(left_index, right_index), @@ -124,3 +122,47 @@ test_that("aggregate vector functions behave as intended", { }) + +test_that ("triangulation and interpolation functions behave as intended", { + sdf <- sdf_copy_to(sc, data.frame( + wkt = c("POINT Z (3 2 1)", "POINT Z (2 1 0)", "POINT Z (1 3 3)", "POINT Z (0 2 2)") + )) + + sdf <- sdf %>% + group_by() %>% + summarise(masspoints = collect_list(wkt)) %>% + mutate(breaklines = array("LINESTRING EMPTY")) + + triangulation_sdf <- sdf %>% + mutate(triangles = st_triangulate(masspoints, breaklines, as.double(0.00), as.double(0.01), "NONENCROACHING")) + + expect_equal(sdf_nrow(triangulation_sdf), 2) + + expected <- c("POLYGON Z((0 2 2, 2 1 0, 1 3 3, 0 2 2))", + "POLYGON Z((1 3 3, 2 1 0, 3 2 1, 1 3 3))") + expect_contains(expected, sdf_collect(triangulation_sdf)$triangles[0]) + + interpolation_sdf <- sdf %>% + mutate( + origin = st_geomfromwkt("POINT (0.55 1.75)"), + xWidth = 12L, + yWidth = 6L, + xSize = as.double(0.1), + ySize = as.double(0.1), + interpolated = st_interpolateelevation( + masspoints, + breaklines, + as.double(0.01), + as.double(0.01), + "NONENCROACHING", + origin, + xWidth, + yWidth, + xSize, + ySize + ) + ) + expect_equal(sdf_nrow(interpolation_sdf), 6 * 12) + expect_contains(sdf_collect(interpolation_sdf)$interpolated, + "POINT Z(1.6 1.8 1.2)") +}) diff --git a/docs/docs-requirements.txt b/docs/docs-requirements.txt index 969601087..60f5c67fa 100644 --- a/docs/docs-requirements.txt +++ b/docs/docs-requirements.txt @@ -1,4 +1,4 @@ -setuptools==68.1.2 +setuptools>=70.0.0 Sphinx==6.1.3 sphinx-material==0.0.36 nbsphinx>=0.8.8 diff --git a/docs/source/api/api.rst b/docs/source/api/api.rst index 5d4401fac..6503f91db 100644 --- a/docs/source/api/api.rst +++ b/docs/source/api/api.rst @@ -13,4 +13,4 @@ API Documentation spatial-predicates spatial-aggregations raster-functions - rasterio-udfs \ No newline at end of file + rasterio-gdal-udfs \ No newline at end of file diff --git a/docs/source/api/raster-functions.rst b/docs/source/api/raster-functions.rst index 94daa5efa..0b0ec5fcf 100644 --- a/docs/source/api/raster-functions.rst +++ b/docs/source/api/raster-functions.rst @@ -2,6 +2,7 @@ Raster functions ================= +##### Intro ##### Raster functions are available in mosaic if you have installed the optional dependency `GDAL`. @@ -25,23 +26,15 @@ e.g. :code:`spark.read.format("gdal")` * The Mosaic raster tile schema changed in v0.4.1 to the following: :code:`>`. All APIs that use tiles now follow this schema. 
Functions
#########

rst_avg
*******

.. function:: rst_avg(tile)

    Returns an array containing mean values for each band.
-   The python bindings are available through sql,
-   e.g. :code:`selectExpr("rst_avg(tile)")`

-   :param tile: A column containing the raster tile.
+   :param tile: A column containing the raster tile.
    :type tile: Column (RasterTileType)
    :rtype: Column: ArrayType(DoubleType)

    :example:

.. tabs::
   .. code-tab:: python

-      df.selectExpr("rst_avg(tile)"").limit(1).display()
+      df.select(mos.rst_avg("tile")).limit(1).display()
       +---------------+
       | rst_avg(tile) |
       +---------------+
@@ -96,7 +87,7 @@ rst_bandmetadata

    Extract the metadata describing the raster band.
    Metadata is returned as a map of key value pairs.

-   :param tile: A column containing the raster tile.
+   :param tile: A column containing the raster tile.
    :type tile: Column (RasterTileType)
    :param band: The band number to extract metadata for.
    :type band: Column (IntegerType)
@@ -197,7 +188,7 @@ rst_boundingbox

rst_clip
********

-.. function:: rst_clip(tile, geometry)
+.. function:: rst_clip(tile, geometry, cutline_all_touched)

    Clips :code:`tile` with :code:`geometry`, provided in a supported encoding (WKB, WKT or GeoJSON).

    :param tile: A column containing the raster tile.
    :type tile: Column (RasterTileType)
    :param geometry: A column containing the geometry to clip the raster to.
    :type geometry: Column (GeometryType)
+   :param cutline_all_touched: A column to specify pixel boundary behavior.
+   :type cutline_all_touched: Column (BooleanType)
    :rtype: Column: RasterTileType

.. note::
-   Notes
-
-   :code:`geometry` is expected to be:
-     - in the same coordinate reference system as the raster.
-     - a polygon or a multipolygon.
-
-   The output raster tiles will have:
-     - the same extent as the input geometry.
-     - the same number of bands as the input raster.
-     - the same pixel data type as the input raster.
-     - the same pixel size as the input raster.
-     - the same coordinate reference system as the input raster.
+    **Notes**
+    Geometry input
+      The :code:`geometry` parameter is expected to be a polygon or a multipolygon.
+
+    Cutline handling
+      The :code:`cutline_all_touched` parameter:
+
+      - Optional: default is true. This is a GDAL warp config for the operation.
+      - If set to true, pixels touching the geometry are included in the clip,
+        regardless of being half-in or half-out; this is important for tessellation behaviors.
+      - If set to false, only pixels that are at least half-in are included in the clip.
+      - More information can be found `here `__
+
+      The actual GDAL command employed to perform the clipping operation is as follows:
+      :code:`"gdalwarp -wo CUTLINE_ALL_TOUCHED=<TRUE|FALSE> -cutline <geometry> -crop_to_cutline"`
+
+    Output
+      Output raster tiles will have:
+
+      - the same extent as the input geometry.
+      - the same number of bands as the input raster.
+      - the same pixel data type as the input raster.
+      - the same pixel size as the input raster.
+      - the same coordinate reference system as the input raster.

..

    :example:

@@ -482,6 +487,185 @@ rst_derivedband

    | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "NetCDF" } |
    +----------------------------------------------------------------------------------------------------------------+

+rst_dtmfromgeoms
+****************
+
+.. function:: rst_dtmfromgeoms(pointsArray, linesArray, mergeTolerance, snapTolerance, splitPointFinder, origin, xWidth, yWidth, xSize, ySize, noData)
+
+    Generate a raster with interpolated elevations across a grid of points described by:
+
+    - :code:`origin`: a point geometry describing the bottom-left corner of the grid,
+    - :code:`xWidth` and :code:`yWidth`: the number of points in the grid in x and y directions,
+    - :code:`xSize` and :code:`ySize`: the space between grid points in the x and y directions.
+
+    :note: To generate a grid from a "top-left" :code:`origin`, use a negative value for :code:`ySize`.
+
+    The underlying algorithm first creates a surface mesh by triangulating :code:`pointsArray`
+    (including :code:`linesArray` as a set of constraint lines), then determines where each point
+    in the grid would lie on the surface mesh. Finally, it interpolates the
+    elevation of that point based on the surrounding triangle's vertices.
+
+    As with :code:`st_triangulate`, there are two 'tolerance' parameters for the algorithm:
+
+    - :code:`mergeTolerance` sets the point merging tolerance of the triangulation algorithm, i.e. before the initial
+      triangulation is performed, nearby points in :code:`pointsArray` can be merged in order to speed up the triangulation
+      process. A value of zero means all points are considered for triangulation.
+    - :code:`snapTolerance` sets the tolerance for post-processing the results of the triangulation, i.e. matching
+      the vertices of the output triangles to input points / lines. This is necessary as the algorithm often returns null
+      height / Z values. Setting this to a large value may result in incorrect Z values being assigned to the
+      output triangle vertices (especially when :code:`linesArray` contains very densely spaced segments).
+      Setting this value to zero may result in the output triangle vertices being assigned a null Z value.
+      Both tolerance parameters are expressed in the same units as the projection of the input point geometries.
+
+    Additionally, you have control over the algorithm used to find split points on the constraint lines. The recommended
+    default option here is the "NONENCROACHING" algorithm. You can also use the "MIDPOINT" algorithm if you find the
+    constraint fitting process fails to converge. For full details of these options see the JTS reference
+    `here `__.
+
+    The :code:`noData` value of the output raster can be set using the :code:`noData` parameter.
+
+    Unlike :code:`st_interpolateelevation`, which returns one row per point of the grid, this expression
+    rasterizes the interpolated grid and returns a single tile row per input group of geometries.
+
+    :param pointsArray: Array of geometries representing the points to be triangulated
+    :type pointsArray: Column (ArrayType(Geometry))
+    :param linesArray: Array of geometries representing the lines to be used as constraints
+    :type linesArray: Column (ArrayType(Geometry))
+    :param mergeTolerance: A tolerance used to coalesce points in close proximity to each other before performing triangulation.
+    :type mergeTolerance: Column (DoubleType)
+    :param snapTolerance: A snapping tolerance used to relate created points to their corresponding lines for elevation interpolation.
+    :type snapTolerance: Column (DoubleType)
+    :param splitPointFinder: Algorithm used for finding split points on constraint lines. Options are "NONENCROACHING" and "MIDPOINT".
+    :type splitPointFinder: Column (StringType)
+    :param origin: A point geometry describing the bottom-left corner of the grid.
+    :type origin: Column (Geometry)
+    :param xWidth: The number of points in the grid in x direction.
+    :type xWidth: Column (IntegerType)
+    :param yWidth: The number of points in the grid in y direction.
+    :type yWidth: Column (IntegerType)
+    :param xSize: The spacing between each point on the grid's x-axis.
+    :type xSize: Column (DoubleType)
+    :param ySize: The spacing between each point on the grid's y-axis.
+    :type ySize: Column (DoubleType)
+    :param noData: The no-data value of the output raster.
+    :type noData: Column (DoubleType)
+    :rtype: Column (RasterTileType)
+
+    :example:
+
+.. tabs::
+    .. 
code-tab:: py + + df = ( + spark.createDataFrame( + [ + ["POINT Z (2 1 0)"], + ["POINT Z (3 2 1)"], + ["POINT Z (1 3 3)"], + ["POINT Z (0 2 2)"], + ], + ["wkt"], + ) + .groupBy() + .agg(collect_list("wkt").alias("masspoints")) + .withColumn("breaklines", array(lit("LINESTRING EMPTY"))) + .withColumn("origin", st_geomfromwkt(lit("POINT (0.6 1.8)"))) + .withColumn("xWidth", lit(12)) + .withColumn("yWidth", lit(6)) + .withColumn("xSize", lit(0.1)) + .withColumn("ySize", lit(0.1)) + ) + df.select( + rst_dtmfromgeoms( + "masspoints", "breaklines", lit(0.0), lit(0.01), + "origin", "xWidth", "yWidth", "xSize", "ySize", + split_point_finder="NONENCROACHING", no_data_value=-9999.0 + ) + ).show(truncate=False) + +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + |rst_dtmfromgeoms(masspoints, breaklines, 0.0, 0.01, origin, xWidth, yWidth, xSize, ySize) | + +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + |{NULL, /dbfs/tmp/mosaic/raster/checkpoint/raster_d4ab419f_9829_4004_99a3_aaa597a69938.GTiff, {path -> /dbfs/tmp/mosaic/raster/checkpoint/raster_d4ab419f_9829_4004_99a3_aaa597a69938.GTiff, last_error -> , all_parents -> , driver -> GTiff, parentPath -> /tmp/mosaic_tmp/mosaic5678582907307109410/raster_d4ab419f_9829_4004_99a3_aaa597a69938.GTiff, last_command -> gdal_rasterize ATTRIBUTE=VALUES -of GTiff -co TILED=YES -co COMPRESS=DEFLATE}}| + +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + + .. 
code-tab:: scala

+        val df = Seq(
+          Seq(
+            "POINT Z (2 1 0)", "POINT Z (3 2 1)",
+            "POINT Z (1 3 3)", "POINT Z (0 2 2)"
+          )
+        )
+        .toDF("masspoints")
+        .withColumn("breaklines", array().cast(ArrayType(StringType)))
+        .withColumn("origin", st_geomfromwkt(lit("POINT (0.6 1.8)")))
+        .withColumn("xWidth", lit(12))
+        .withColumn("yWidth", lit(6))
+        .withColumn("xSize", lit(0.1))
+        .withColumn("ySize", lit(0.1))
+
+        df.select(
+          rst_dtmfromgeoms(
+            $"masspoints", $"breaklines",
+            lit(0.0), lit(0.01), lit("NONENCROACHING"),
+            $"origin", $"xWidth", $"yWidth",
+            $"xSize", $"ySize", lit(-9999.0)
+          )
+        ).show(1, false)
+        +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+        |rst_dtmfromgeoms(masspoints, breaklines, 0.0, 0.01, origin, xWidth, yWidth, xSize, ySize) |
+        +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+        |{NULL, /dbfs/tmp/mosaic/raster/checkpoint/raster_d4ab419f_9829_4004_99a3_aaa597a69938.GTiff, {path -> /dbfs/tmp/mosaic/raster/checkpoint/raster_d4ab419f_9829_4004_99a3_aaa597a69938.GTiff, last_error -> , all_parents -> , driver -> GTiff, parentPath -> /tmp/mosaic_tmp/mosaic5678582907307109410/raster_d4ab419f_9829_4004_99a3_aaa597a69938.GTiff, last_command -> gdal_rasterize ATTRIBUTE=VALUES -of GTiff -co TILED=YES -co COMPRESS=DEFLATE}}|
+        +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

    .. 
code-tab:: sql + + SELECT + RST_DTMFROMGEOMS( + ARRAY( + "POINT Z (2 1 0)", + "POINT Z (3 2 1)", + "POINT Z (1 3 3)", + "POINT Z (0 2 2)" + ), + ARRAY("LINESTRING EMPTY"), + DOUBLE(0.0), DOUBLE(0.01), "NONENCROACHING", + "POINT (0.6 1.8)", 12, 6, + DOUBLE(0.1), DOUBLE(0.1), DOUBLE(-9999.0) + ) AS tile + +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + |rst_dtmfromgeoms(masspoints, breaklines, 0.0, 0.01, origin, xWidth, yWidth, xSize, ySize) | + +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + |{NULL, /dbfs/tmp/mosaic/raster/checkpoint/raster_d4ab419f_9829_4004_99a3_aaa597a69938.GTiff, {path -> /dbfs/tmp/mosaic/raster/checkpoint/raster_d4ab419f_9829_4004_99a3_aaa597a69938.GTiff, last_error -> , all_parents -> , driver -> GTiff, parentPath -> /tmp/mosaic_tmp/mosaic5678582907307109410/raster_d4ab419f_9829_4004_99a3_aaa597a69938.GTiff, last_command -> gdal_rasterize ATTRIBUTE=VALUES -of GTiff -co TILED=YES -co COMPRESS=DEFLATE}}| + +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + + .. 
code-tab:: r R + + sdf <- createDataFrame( + data.frame( + points = c( + "POINT Z (3 2 1)", "POINT Z (2 1 0)", + "POINT Z (1 3 3)", "POINT Z (0 2 2)" + ) + ) + ) + sdf <- agg(groupBy(sdf), masspoints = collect_list(column("points"))) + sdf <- withColumn(sdf, "breaklines", expr("array('LINESTRING EMPTY')")) + sdf <- select(sdf, rst_dtmfromgeoms( + column("masspoints"), column("breaklines"), + lit(0.0), lit(0.01), lit("NONENCROACHING"), + lit("POINT (0.6 1.8)"), lit(12L), lit(6L), + lit(0.1), lit(0.1), lit(-9999.0) + ) + ) + showDF(sdf, n=1, truncate=F) + +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + |rst_dtmfromgeoms(masspoints, breaklines, 0.0, 0.01, POINT (0.6 1.8), 12, 6, 0.1, 0.1) | + +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + |{NULL, /dbfs/tmp/mosaic/raster/checkpoint/raster_ab03a97f_9bc3_410c_80e1_adf6f75f46e2.GTiff, {path -> /dbfs/tmp/mosaic/raster/checkpoint/raster_ab03a97f_9bc3_410c_80e1_adf6f75f46e2.GTiff, last_error -> , all_parents -> , driver -> GTiff, parentPath -> /tmp/mosaic_tmp/mosaic8840676907961488874/raster_ab03a97f_9bc3_410c_80e1_adf6f75f46e2.GTiff, last_command -> gdal_rasterize ATTRIBUTE=VALUES -of GTiff -co TILED=YES -co COMPRESS=DEFLATE}}| + +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ + + rst_filter ********** @@ -597,13 +781,15 @@ rst_fromcontent :param raster_bin: A column containing the raster data. :type raster_bin: Column (BinaryType) + :param driver: GDAL driver to use to open the raster. + :type driver: Column(StringType) :param size_in_MB: Optional parameter to specify the size of the raster tile in MB. Default is not to split the input. :type size_in_MB: Column (IntegerType) :rtype: Column: RasterTileType .. note:: - Notes + **Notes** - The input raster must be a byte array in a BinaryType column. - The driver required to read the raster must be one supplied with GDAL. - If the size_in_MB parameter is specified, the raster will be split into tiles of the specified size. @@ -1182,8 +1368,6 @@ rst_max .. function:: rst_max(tile) Returns an array containing maximum values for each band. - The python bindings are available through sql, - e.g. :code:`selectExpr("rst_max(tile)")` :param tile: A column containing the raster tile. :type tile: Column (RasterTileType) @@ -1194,7 +1378,7 @@ rst_max .. tabs:: .. 
code-tab:: python

-      df.selectExpr("rst_max(tile)"").limit(1).display()
+      df.select(mos.rst_max("tile")).limit(1).display()
       +---------------+
       | rst_max(tile) |
       +---------------+
@@ -1225,8 +1409,6 @@ rst_median

.. function:: rst_median(tile)

    Returns an array containing median values for each band.
-   The python bindings are available through sql,
-   e.g. :code:`selectExpr("rst_median(tile)")`

    :param tile: A column containing the raster tile.
    :type tile: Column (RasterTileType)
@@ -1237,7 +1419,7 @@ rst_median

.. tabs::
   .. code-tab:: python

-      df.selectExpr("rst_median(tile)"").limit(1).display()
+      df.select(mos.rst_median("tile")).limit(1).display()
       +------------------+
       | rst_median(tile) |
       +------------------+
@@ -1445,8 +1627,6 @@ rst_min

.. function:: rst_min(tile)

    Returns an array containing minimum values for each band.
-   The python bindings are available through sql,
-   e.g. :code:`selectExpr("rst_min(tile)")`

    :param tile: A column containing the raster tile.
    :type tile: Column (RasterTileType)
@@ -1457,7 +1637,7 @@ rst_min

.. tabs::
   .. code-tab:: python

-      df.selectExpr("rst_min(tile)"").limit(1).display()
+      df.select(mos.rst_min("tile")).limit(1).display()
       +---------------+
       | rst_min(tile) |
       +---------------+
@@ -1582,18 +1762,36 @@ rst_numbands

    +---------------------+

rst_pixelcount
-***************
+**************

-.. function:: rst_pixelcount(tile)
+.. function:: rst_pixelcount(tile, count_nodata, count_all)

-    Returns an array containing valid pixel count values for each band.
-    The python bindings are available through sql,
-    e.g. :code:`selectExpr("rst_pixelcount(tile)")`
+    Returns an array containing pixel count values for each band; the default excludes mask and nodata pixels.

    :param tile: A column containing the raster tile.
    :type tile: Column (RasterTileType)
+   :param count_nodata: A column to specify whether to count nodata pixels.
+   :type count_nodata: Column (BooleanType)
+   :param count_all: A column to specify whether to count all pixels.
+   :type count_all: Column (BooleanType)
    :rtype: Column: ArrayType(LongType)

+.. note::
+
+    Notes:
+
+    If the pixel value is noData or the mask value is 0.0, the pixel is not counted by default.
+
+    :code:`count_nodata`
+      - This is an optional param.
+      - If specified as true, the noData (not mask) pixels are included in the count (default is false).
+
+    :code:`count_all`
+      - This is an optional param; as a positional arg, :code:`count_nodata` must also be passed
+        (its value is ignored in that case).
+      - If specified as true, simply returns bandX * bandY in the count (default is false).
+..

    :example:

.. tabs::

@@ -2463,7 +2661,7 @@ rst_separatebands

    +--------------------------------------------------------------------------------------------------------------------------------+

rst_setnodata
-**********************
+*************

.. function:: rst_setnodata(tile, nodata)

@@ -2515,6 +2713,49 @@ rst_setnodata

    | {index_id: 593308294097928192, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } |
    +------------------------------------------------------------------------------------------------------------------+

+rst_setsrid
+***********
+
+.. function:: rst_setsrid(tile, srid)
+
+    Set the SRID of the raster tile as an EPSG code.
+
+    :param tile: A column containing the raster tile.
+    :type tile: Column (RasterTileType)
+    :param srid: The SRID to set.
+    :type srid: Column (IntegerType)
+    :rtype: Column: (RasterTileType)
+
+    :example:
+
+.. tabs::
+    .. 
code-tab:: py + + df.select(mos.rst_setsrid('tile', F.lit(9122))).display() + +------------------------------------------------------------------------------------------------------------------+ + | rst_setsrid(tile, 9122) | + +------------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, tile: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | + +------------------------------------------------------------------------------------------------------------------+ + + .. code-tab:: scala + + df.select(rst_setsrid(col("tile"), lit(9122))).show + +------------------------------------------------------------------------------------------------------------------+ + | rst_setsrid(tile, 9122) | + +------------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, tile: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | + +------------------------------------------------------------------------------------------------------------------+ + + .. code-tab:: sql + + SELECT rst_setsrid(tile, 9122) FROM table + +------------------------------------------------------------------------------------------------------------------+ + | rst_setsrid(tile, 9122) | + +------------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, tile: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | + +------------------------------------------------------------------------------------------------------------------+ + rst_skewx ********* @@ -3008,6 +3249,99 @@ rst_tryopen | true | +------------------------------------------------------------------------------------------------------------------+ + +rst_type +******** + +.. function:: rst_type(tile) + + Returns the data type of the raster's bands. + + :param tile: A column containing the raster tile. + :type tile: Column (RasterTileType) + :rtype: Column: StringType + + :example: + +.. tabs:: + .. code-tab:: py + + df.select(mos.rst_type('tile')).display() + +------------------------------------------------------------------------------------------------------------------+ + | rst_type(tile) | + +------------------------------------------------------------------------------------------------------------------+ + | [Int16] | + +------------------------------------------------------------------------------------------------------------------+ + + .. code-tab:: scala + + df.select(rst_type(col("tile"))).show + +------------------------------------------------------------------------------------------------------------------+ + | rst_type(tile) | + +------------------------------------------------------------------------------------------------------------------+ + | [Int16] | + +------------------------------------------------------------------------------------------------------------------+ + + .. 
code-tab:: sql + + SELECT rst_type(tile) FROM table + +------------------------------------------------------------------------------------------------------------------+ + | rst_type(tile) | + +------------------------------------------------------------------------------------------------------------------+ + | [Int16] | + +------------------------------------------------------------------------------------------------------------------+ + + +rst_updatetype +************** + +.. function:: rst_updatetype(tile, newType) + + Translates the raster to a new data type. + + :param tile: A column containing the raster tile. + :type tile: Column (RasterTileType) + :param newType: Data type to translate the raster to. + :type newType: Column (StringType) + :rtype: Column: (RasterTileType) + + :example: + +.. tabs:: + .. code-tab:: py + + df.select(mos.rst_updatetype('tile', lit('Float32'))).display() + +----------------------------------------------------------------------------------------------------+ + | rst_updatetype(tile,Float32) | + +----------------------------------------------------------------------------------------------------+ + | {"index_id":null,"raster":"SUkqAAg...= (truncated)","metadata":{"path":"... .tif","last_error":"", | + | "all_parents":"no_path","driver":"GTiff","parentPath":"no_path", | + | "last_command":"gdaltranslate -ot Float32 -of GTiff -co TILED=YES -co COMPRESS=DEFLATE"}} | + +----------------------------------------------------------------------------------------------------+ + + .. code-tab:: scala + + df.select(rst_updatetype(col("tile"), lit("Float32"))).show + +----------------------------------------------------------------------------------------------------+ + | rst_updatetype(tile,Float32) | + +----------------------------------------------------------------------------------------------------+ + | {"index_id":null,"raster":"SUkqAAg...= (truncated)","metadata":{"path":"... .tif","last_error":"", | + | "all_parents":"no_path","driver":"GTiff","parentPath":"no_path", | + | "last_command":"gdaltranslate -ot Float32 -of GTiff -co TILED=YES -co COMPRESS=DEFLATE"}} | + +----------------------------------------------------------------------------------------------------+ + + .. code-tab:: sql + + SELECT rst_updatetype(tile, 'Float32') FROM table + +----------------------------------------------------------------------------------------------------+ + | rst_updatetype(tile,Float32) | + +----------------------------------------------------------------------------------------------------+ + | {"index_id":null,"raster":"SUkqAAg...= (truncated)","metadata":{"path":"... .tif","last_error":"", | + | "all_parents":"no_path","driver":"GTiff","parentPath":"no_path", | + | "last_command":"gdaltranslate -ot Float32 -of GTiff -co TILED=YES -co COMPRESS=DEFLATE"}} | + +----------------------------------------------------------------------------------------------------+ + + rst_upperleftx ********************** @@ -3271,3 +3605,60 @@ rst_worldtorastercoordy +------------------------------------------------------------------------------------------------------------------+ | 997 | +------------------------------------------------------------------------------------------------------------------+ + +rst_write +********* + +.. function:: rst_write(input, dir) + + Writes raster tiles from the input column to a specified directory. + + :param input: A column containing the raster tile. + :type input: Column + :param dir: The directory, e.g. 
fuse, to write the tile's raster to as a file.
+    :type dir: Column(StringType)
+    :rtype: Column: RasterTileType
+
+.. note::
+  **Notes**
+    - Use :code:`RST_Write` to save a 'tile' column to a specified directory (e.g. fuse) location using its
+      already populated GDAL driver and tile information.
+    - Useful for formalizing the tile 'path' when writing a Lakehouse table. An example might be to turn on checkpointing
+      for internal data pipeline phase operations in which multiple interim tiles are populated, but at the end of the phase
+      use this function to set the final path to be used in the phase's persisted table. Then, you are free to delete
+      the interim tiles that accumulated in the configured checkpointing directory.
+..
+
+    :example:
+
+.. tabs::
+    .. code-tab:: py
+
+        df.select(mos.rst_write("tile", F.lit("/dbfs/tmp/mosaic/output")).alias("tile")).limit(1).display()
+        +------------------------------------------------------------------------+
+        | tile                                                                   |
+        +------------------------------------------------------------------------+
+        | {"index_id":null,"tile":"","metadata":{                                |
+        | "parentPath":"no_path","driver":"GTiff","path":"...","last_error":""}} |
+        +------------------------------------------------------------------------+
+
+    .. code-tab:: scala
+
+        df.select(rst_write(col("tile"), lit("/dbfs/tmp/mosaic/output")).as("tile")).limit(1).show
+        +------------------------------------------------------------------------+
+        | tile                                                                   |
+        +------------------------------------------------------------------------+
+        | {"index_id":null,"tile":"","metadata":{                                |
+        | "parentPath":"no_path","driver":"GTiff","path":"...","last_error":""}} |
+        +------------------------------------------------------------------------+
+
+    .. code-tab:: sql
+
+        SELECT rst_write(tile, '/dbfs/tmp/mosaic/output') as tile FROM table LIMIT 1
+        +------------------------------------------------------------------------+
+        | tile                                                                   |
+        +------------------------------------------------------------------------+
+        | {"index_id":null,"tile":"","metadata":{                                |
+        | "parentPath":"no_path","driver":"GTiff","path":"...","last_error":""}} |
+        +------------------------------------------------------------------------+
+
diff --git a/docs/source/api/rasterio-udfs.rst b/docs/source/api/rasterio-gdal-udfs.rst
similarity index 87%
rename from docs/source/api/rasterio-udfs.rst
rename to docs/source/api/rasterio-gdal-udfs.rst
index 46cf46f77..e82c181da 100644
--- a/docs/source/api/rasterio-udfs.rst
+++ b/docs/source/api/rasterio-gdal-udfs.rst
@@ -1,5 +1,5 @@
 =====================
-Rasterio UDFs
+Rasterio + GDAL UDFs
 =====================
 
@@ -12,7 +12,8 @@ It is a great library for working with raster data in Python and it is a popular
 Rasterio UDFs provide a way to use Rasterio Python API in Spark for distributed processing of raster data.
 The data structures used by Mosaic are compatible with Rasterio and can be used interchangeably.
 In this section we will show how to use Rasterio UDFs to process raster data in Mosaic + Spark.
-We assume that you have a basic understanding of Rasterio and GDAL.
+We assume that you have a basic understanding of Rasterio and GDAL. We also provide an example which directly calls GDAL
+Translate and Warp.
 
 Please note that we advise users to set these configurations to ensure proper distribution. 
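Most of the examples on this page follow a common pattern: raster content arrives in a binary column, is
materialized to a scratch file, processed with the Rasterio or GDAL APIs, and read back as bytes. As a minimal
orientation sketch of that pattern only (the function name, driver, and creation options below are illustrative,
not a prescribed recipe):

.. code-block:: python

    from pyspark.sql.functions import udf

    @udf("binary")
    def recompress_raster(raster_bin):
        # bytes in -> temp file -> GDAL Translate -> bytes out
        import tempfile
        import uuid
        from osgeo import gdal

        with tempfile.TemporaryDirectory() as tmp_dir:
            fn_in = f"{tmp_dir}/{uuid.uuid4().hex}.tif"
            fn_out = f"{tmp_dir}/{uuid.uuid4().hex}.tif"
            with open(fn_in, "wb") as f:
                f.write(raster_bin)
            # illustrative options; substitute whatever Translate / Warp flags you need
            gdal.Translate(fn_out, fn_in, options="-of GTiff -co COMPRESS=DEFLATE")
            with open(fn_out, "rb") as f:
                return f.read()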
@@ -22,7 +23,7 @@ Please note that we advise the users to set these configuration to ensure proper
 
     spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "1024")
     spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "true")
     spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "false")
-    spark.conf.set("spark.sql.shuffle.partitions", "400")
+    spark.conf.set("spark.sql.shuffle.partitions", "400") # may need to be higher, depending on your data volumes
 
 
 Rasterio raster plotting
@@ -238,7 +239,7 @@ Next we will define a function that will write a given raster file to disk. A "g
 not want to have a file context manager open when you go to write out its contents as the context manager will
 not yet have been flushed. Another "gotcha" might be that the raster dataset does not have CRS included; if this
 arises, we recommend adjusting the function to specify the CRS and set it on the dst variable, more at
-`rasterio.crs `_. We would also point out that notional
+`rasterio.crs `__. We would also point out that the notional
 "file_id" param can be constructed as a repeatable name from other field(s) in your dataframe / table or be random,
 depending on your needs.
@@ -355,3 +356,77 @@ Finally we will apply the function to the DataFrame.
 | /dbfs/path/to/output/dir/3215.tif         |
 | ...                                       |
 +-------------------------------------------+
+
+
+UDF example for generating Google Maps compatible tiles
+#######################################################
+
+As an option, Delta tables can be used as the basis for serving pre-generated tiles. Here is an example UDF that applies
+a few GDAL operations to each band, writing Google Maps compatible tiles transformed into EPSG:3857 (Web Mercator). Note:
+the 'quadbin' column shown in this example was generated separately using CARTO's `quadbin `__
+package. You can replace those calls with whatever you need to do. The output structure looks something like the following:
+
+.. figure:: ../images/rasterio/quadbin.png
+   :figclass: doc-figure
+
+The UDF example sets the raster extent, block size, and interpolation. It specifies the source SRID as 4326;
+additionally, the output type and nodata values are specified. COG overviews are not generated,
+nor is an ALPHA band, but they could be. Again, you would modify this example to suit your needs.
+
+.. code-block:: python
+
+    from pyspark.sql.functions import udf
+
+    @udf("binary")
+    def transform_raw_raster(raster):
+        import tempfile
+        import uuid
+        from osgeo import gdal
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            fn1 = f"{tmp_dir}/{uuid.uuid4().hex}.tif"
+            fn2 = f"{tmp_dir}/{uuid.uuid4().hex}.tif"
+            fn3 = f"{tmp_dir}/{uuid.uuid4().hex}.tif"
+            fn4 = f"{tmp_dir}/{uuid.uuid4().hex}.tif"
+
+            # materialize the binary column value as a file GDAL can read
+            with open(fn1, "wb") as f:
+                f.write(raster)
+
+            # set the global extent, nodata value, and output type
+            gdal.Translate(fn2, fn1, options="-of GTiff -a_ullr -180 90 180 -90 -a_nodata -32767 -ot Int16")
+            # resample to the target resolution with cubic spline interpolation
+            gdal.Warp(fn3, fn2, options="-tr 0.125 -0.125 -r cubicspline")
+            # write a COG with Google Maps compatible tiling (no overviews, no alpha band)
+            gdal.Warp(fn4, fn3, options="-of COG -co BLOCKSIZE=1024 -co TILING_SCHEME=GoogleMapsCompatible -co COMPRESS=DEFLATE -co OVERVIEWS=NONE -co ADD_ALPHA=NO -co RESAMPLING=cubicspline -s_srs EPSG:4326")
+
+            # return the processed raster as bytes
+            with open(fn4, "rb") as f:
+                res = f.read()
+            return res
+
+Example of calling the UDF (the original data was NetCDF). If you have more than one band, this assumes the
+:code:`transform_raw_raster` UDF is called after the :code:`rst_separatebands` function (or you could modify
+the UDF to operate on multiple bands).
+
+.. 
code-block:: python + + base_table = ( + df + .select( + "path", + "metadata", + "tile" + ) + .withColumn("subdatasets", mos.rst_subdatasets("tile")) + .where(F.array_contains(F.map_values("subdatasets"), "sfcWind")) + .withColumn("tile", mos.rst_getsubdataset("tile", F.lit("sfcWind"))) + .withColumn("tile", mos.rst_separatebands("tile")) + .repartition(sc.defaultParallelism) + .withColumn( + "tile", + F.col("tile") + .withField("raster", transform_raw_raster("tile.raster")) + .withField( + "metadata", + F.map_concat("tile.metadata", F.create_map(F.lit("driver"), F.lit("GTiff"))) + ) + ) + .withColumn("srid", mos.rst_srid("tile")) + .withColumn("srid", F.when(F.col("srid") == F.lit(0), F.lit(4326)).otherwise(F.col("srid"))) + .withColumn("timestep", F.element_at(mos.rst_metadata("tile"), "NC_GLOBAL#GDAL_MOSAIC_BAND_INDEX")) + .withColumn("tile", mos.rst_transform("tile", F.lit(3857))) + .repartition(sc.defaultParallelism, "timestep") + ) diff --git a/docs/source/api/spatial-functions.rst b/docs/source/api/spatial-functions.rst index 9e2dfbc55..44b5945fd 100644 --- a/docs/source/api/spatial-functions.rst +++ b/docs/source/api/spatial-functions.rst @@ -878,6 +878,189 @@ st_haversine .. note:: Results of this function are always expressed in km, while the input lat/lng pairs are expected to be in degrees. The radius used (in km) is 6371.0088. +st_interpolateelevation +*********************** + +.. function:: st_interpolateelevation(pointsArray, linesArray, mergeTolerance, snapTolerance, splitPointFinder, origin, xWidth, yWidth, xSize, ySize) + + Compute interpolated elevations across a grid of points described by: + + - :code:`origin`: a point geometry describing the bottom-left corner of the grid, + - :code:`xWidth` and :code:`yWidth`: the number of points in the grid in x and y directions, + - :code:`xSize` and :code:`ySize`: the space between grid points in the x and y directions. + + :note: To generate a grid from a "top-left" :code:`origin`, use a negative value for :code:`ySize`. + + The underlying algorithm first creates a surface mesh by triangulating :code:`pointsArray` + (including :code:`linesArray` as a set of constraint lines) then determines where each point + in the grid would lie on the surface mesh. Finally, it interpolates the + elevation of that point based on the surrounding triangle's vertices. + + As with :code:`st_triangulate`, there are two 'tolerance' parameters for the algorithm: + + - :code:`mergeTolerance` sets the point merging tolerance of the triangulation algorithm, i.e. before the initial + triangulation is performed, nearby points in :code:`pointsArray` can be merged in order to speed up the triangulation + process. A value of zero means all points are considered for triangulation. + - :code:`snapTolerance` sets the tolerance for post-processing the results of the triangulation, i.e. matching + the vertices of the output triangles to input points / lines. This is necessary as the algorithm often returns null + height / Z values. Setting this to a large value may result in the incorrect Z values being assigned to the + output triangle vertices (especially when :code:`linesArray` contains very densely spaced segments). + Setting this value to zero may result in the output triangle vertices being assigned a null Z value. + Both tolerance parameters are expressed in the same units as the projection of the input point geometries. + + Additionally, you have control over the algorithm used to find split points on the constraint lines. 
The recommended
+    default option here is the "NONENCROACHING" algorithm. You can also use the "MIDPOINT" algorithm if you find the
+    constraint fitting process fails to converge. For full details of these options see the JTS reference
+    `here `__.
+
+    This is a generator expression and the resulting DataFrame will contain one row per point of the grid.
+
+    :param pointsArray: Array of geometries representing the points to be triangulated
+    :type pointsArray: Column (ArrayType(Geometry))
+    :param linesArray: Array of geometries representing the lines to be used as constraints
+    :type linesArray: Column (ArrayType(Geometry))
+    :param mergeTolerance: A tolerance used to coalesce points in close proximity to each other before performing triangulation.
+    :type mergeTolerance: Column (DoubleType)
+    :param snapTolerance: A snapping tolerance used to relate created points to their corresponding lines for elevation interpolation.
+    :type snapTolerance: Column (DoubleType)
+    :param splitPointFinder: Algorithm used for finding split points on constraint lines. Options are "NONENCROACHING" and "MIDPOINT".
+    :type splitPointFinder: Column (StringType)
+    :param origin: A point geometry describing the bottom-left corner of the grid.
+    :type origin: Column (Geometry)
+    :param xWidth: The number of points in the grid in x direction.
+    :type xWidth: Column (IntegerType)
+    :param yWidth: The number of points in the grid in y direction.
+    :type yWidth: Column (IntegerType)
+    :param xSize: The spacing between each point on the grid's x-axis.
+    :type xSize: Column (DoubleType)
+    :param ySize: The spacing between each point on the grid's y-axis.
+    :type ySize: Column (DoubleType)
+    :rtype: Column (Geometry)
+
+    :example:
+
+.. tabs::
+    .. code-tab:: py
+
+        df = (
+            spark.createDataFrame(
+                [
+                    ["POINT Z (2 1 0)"],
+                    ["POINT Z (3 2 1)"],
+                    ["POINT Z (1 3 3)"],
+                    ["POINT Z (0 2 2)"],
+                ],
+                ["wkt"],
+            )
+            .groupBy()
+            .agg(collect_list("wkt").alias("masspoints"))
+            .withColumn("breaklines", array(lit("LINESTRING EMPTY")))
+            .withColumn("origin", st_geomfromwkt(lit("POINT (0.6 1.8)")))
+            .withColumn("xWidth", lit(12))
+            .withColumn("yWidth", lit(6))
+            .withColumn("xSize", lit(0.1))
+            .withColumn("ySize", lit(0.1))
+        )
+        df.select(
+            st_interpolateelevation(
+                "masspoints", "breaklines", lit(0.0), lit(0.01),
+                "origin", "xWidth", "yWidth", "xSize", "ySize",
+                split_point_finder="NONENCROACHING"
+            )
+        ).show(4, truncate=False)
+        +--------------------------------------------------+
+        |geom                                              |
+        +--------------------------------------------------+
+        |POINT Z(1.4 2.1 1.6666666666666665)               |
+        |POINT Z(1.5 2 1.5)                                |
+        |POINT Z(1.4 1.9000000000000001 1.4000000000000001)|
+        |POINT Z(0.9 2 1.7)                                |
+        +--------------------------------------------------+
+
+    .. 
code-tab:: scala + + val df = Seq( + Seq( + "POINT Z (2 1 0)", "POINT Z (3 2 1)", + "POINT Z (1 3 3)", "POINT Z (0 2 2)" + ) + ) + .toDF("masspoints") + .withColumn("breaklines", array().cast(ArrayType(StringType))) + .withColumn("origin", st_geomfromwkt(lit("POINT (0.6 1.8)"))) + .withColumn("xWidth", lit(12)) + .withColumn("yWidth", lit(6)) + .withColumn("xSize", lit(0.1)) + .withColumn("ySize", lit(0.1)) + + df.select( + st_interpolateelevation( + $"masspoints", $"breaklines", + lit(0.0), lit(0.01), lit("NONENCROACHING"), + $"origin", $"xWidth", $"yWidth", $"xSize", $"ySize" + ) + ).show(4, false) + +--------------------------------------------------+ + |geom | + +--------------------------------------------------+ + |POINT Z(1.4 2.1 1.6666666666666665) | + |POINT Z(1.5 2 1.5) | + |POINT Z(1.4 1.9000000000000001 1.4000000000000001)| + |POINT Z(0.9 2 1.7) | + +--------------------------------------------------+ + + .. code-tab:: sql + + SELECT + ST_INTERPOLATEELEVATION( + ARRAY( + "POINT Z (2 1 0)", + "POINT Z (3 2 1)", + "POINT Z (1 3 3)", + "POINT Z (0 2 2)" + ), + ARRAY("LINESTRING EMPTY"), + DOUBLE(0.0), DOUBLE(0.01), "NONENCROACHING", + "POINT (0.6 1.8)", 12, 6, DOUBLE(0.1), DOUBLE(0.1) + ) + +--------------------------------------------------+ + |geom | + +--------------------------------------------------+ + |POINT Z(1.4 2.1 1.6666666666666665) | + |POINT Z(1.5 2 1.5) | + |POINT Z(1.4 1.9000000000000001 1.4000000000000001)| + |POINT Z(0.9 2 1.7) | + +--------------------------------------------------+ + + .. code-tab:: r R + + sdf <- createDataFrame( + data.frame( + points = c( + "POINT Z (3 2 1)", "POINT Z (2 1 0)", + "POINT Z (1 3 3)", "POINT Z (0 2 2)" + ) + ) + ) + sdf <- agg(groupBy(sdf), masspoints = collect_list(column("points"))) + sdf <- withColumn(sdf, "breaklines", expr("array('LINESTRING EMPTY')")) + sdf <- select(sdf, st_interpolateelevation( + column("masspoints"), column("breaklines"), + lit(0.0), lit(0.01), lit("NONENCROACHING"), + lit("POINT (0.6 1.8)"), lit(12L), lit(6L), lit(0.1), lit(0.1) + ) + ) + showDF(sdf, n=4, truncate=F) + +--------------------------------------------------+ + |geom | + +--------------------------------------------------+ + |POINT Z(1.4 2.1 1.6666666666666665) | + |POINT Z(1.5 2 1.5) | + |POINT Z(1.4 1.9000000000000001 1.4000000000000001)| + |POINT Z(0.9 2 1.7) | + +--------------------------------------------------+ + + st_intersection *************** @@ -1580,6 +1763,142 @@ st_transform by specifying the :code:`srcSRID` and :code:`dstSRID`. +st_triangulate +************** + +.. function:: st_triangulate(pointsArray, linesArray, mergeTolerance, snapTolerance, splitPointFinder) + + Performs a conforming Delaunay triangulation using the points in :code:`pointsArray` including :code:`linesArray` as constraint / break lines. + + There are two 'tolerance' parameters for the algorithm. + + - :code:`mergeTolerance` sets the point merging tolerance of the triangulation algorithm, i.e. before the initial + triangulation is performed, nearby points in :code:`pointsArray` can be merged in order to speed up the triangulation + process. A value of zero means all points are considered for triangulation. + - :code:`snapTolerance` sets the tolerance for post-processing the results of the triangulation, i.e. matching + the vertices of the output triangles to input points / lines. This is necessary as the algorithm often returns null + height / Z values. 
Setting this to a large value may result in the incorrect Z values being assigned to the
+    output triangle vertices (especially when :code:`linesArray` contains very densely spaced segments).
+    Setting this value to zero may result in the output triangle vertices being assigned a null Z value.
+    Both tolerance parameters are expressed in the same units as the projection of the input point geometries.
+
+    Additionally, you have control over the algorithm used to find split points on the constraint lines. The recommended
+    default option here is the "NONENCROACHING" algorithm. You can also use the "MIDPOINT" algorithm if you find the
+    constraint fitting process fails to converge. For full details of these options see the JTS reference
+    `here `__.
+
+    This is a generator expression and the resulting DataFrame will contain one row per triangle returned by the algorithm.
+
+    :param pointsArray: Array of geometries representing the points to be triangulated
+    :type pointsArray: Column (ArrayType(Geometry))
+    :param linesArray: Array of geometries representing the lines to be used as constraints
+    :type linesArray: Column (ArrayType(Geometry))
+    :param mergeTolerance: A tolerance used to coalesce points in close proximity to each other before performing triangulation.
+    :type mergeTolerance: Column (DoubleType)
+    :param snapTolerance: A snapping tolerance used to relate created points to their corresponding lines for elevation interpolation.
+    :type snapTolerance: Column (DoubleType)
+    :param splitPointFinder: Algorithm used for finding split points on constraint lines. Options are "NONENCROACHING" and "MIDPOINT".
+    :type splitPointFinder: Column (StringType)
+    :rtype: Column (Geometry)
+
+    :example:
+
+.. tabs::
+    .. code-tab:: py
+
+        df = (
+            spark.createDataFrame(
+                [
+                    ["POINT Z (2 1 0)"],
+                    ["POINT Z (3 2 1)"],
+                    ["POINT Z (1 3 3)"],
+                    ["POINT Z (0 2 2)"],
+                ],
+                ["wkt"],
+            )
+            .groupBy()
+            .agg(collect_list("wkt").alias("masspoints"))
+            .withColumn("breaklines", array(lit("LINESTRING EMPTY")))
+            .withColumn("triangles", st_triangulate("masspoints", "breaklines", lit(0.0), lit(0.01), "NONENCROACHING"))
+        )
+        df.show(2, False)
+        +---------------------------------------+
+        |triangles                              |
+        +---------------------------------------+
+        |POLYGON Z((0 2 2, 2 1 0, 1 3 3, 0 2 2))|
+        |POLYGON Z((1 3 3, 2 1 0, 3 2 1, 1 3 3))|
+        +---------------------------------------+
+
+    .. code-tab:: scala
+
+        val df = Seq(
+            Seq(
+                "POINT Z (2 1 0)", "POINT Z (3 2 1)",
+                "POINT Z (1 3 3)", "POINT Z (0 2 2)"
+            )
+        )
+        .toDF("masspoints")
+        .withColumn("breaklines", array().cast(ArrayType(StringType)))
+        .withColumn("triangles",
+            st_triangulate(
+                $"masspoints", $"breaklines",
+                lit(0.0), lit(0.01), lit("NONENCROACHING")
+            )
+        )
+
+        df.select(st_astext($"triangles")).show(2, false)
+        +------------------------------+
+        |st_astext(triangles)          |
+        +------------------------------+
+        |POLYGON ((0 2, 2 1, 1 3, 0 2))|
+        |POLYGON ((1 3, 2 1, 3 2, 1 3))|
+        +------------------------------+
+
+    .. code-tab:: sql
+
+        SELECT
+            ST_TRIANGULATE(
+                ARRAY(
+                    "POINT Z (2 1 0)",
+                    "POINT Z (3 2 1)",
+                    "POINT Z (1 3 3)",
+                    "POINT Z (0 2 2)"
+                ),
+                ARRAY("LINESTRING EMPTY"),
+                DOUBLE(0.0), DOUBLE(0.01),
+                "NONENCROACHING"
+            )
+        +---------------------------------------+
+        |triangles                              |
+        +---------------------------------------+
+        |POLYGON Z((0 2 2, 2 1 0, 1 3 3, 0 2 2))|
+        |POLYGON Z((1 3 3, 2 1 0, 3 2 1, 1 3 3))|
+        +---------------------------------------+
+
+    .. 
code-tab:: r R
+
+        sdf <- createDataFrame(
+            data.frame(
+                points = c(
+                    "POINT Z (3 2 1)", "POINT Z (2 1 0)",
+                    "POINT Z (1 3 3)", "POINT Z (0 2 2)"
+                )
+            )
+        )
+        sdf <- agg(groupBy(sdf), masspoints = collect_list(column("points")))
+        sdf <- withColumn(sdf, "breaklines", expr("array('LINESTRING EMPTY')"))
+        result <- select(sdf, st_triangulate(
+                column("masspoints"), column("breaklines"),
+                lit(0.0), lit(0.01), lit("NONENCROACHING")
+            )
+        )
+        showDF(result, truncate=F)
+        +---------------------------------------+
+        |triangles                              |
+        +---------------------------------------+
+        |POLYGON Z((0 2 2, 2 1 0, 1 3 3, 0 2 2))|
+        |POLYGON Z((1 3 3, 2 1 0, 3 2 1, 1 3 3))|
+        +---------------------------------------+
+
 st_translate
 ************
 
diff --git a/docs/source/api/spatial-indexing.rst b/docs/source/api/spatial-indexing.rst
index 4521dd38c..eb95a77b4 100644
--- a/docs/source/api/spatial-indexing.rst
+++ b/docs/source/api/spatial-indexing.rst
@@ -850,7 +850,7 @@ grid_cellkringexplode
 
 grid_cell_intersection
-**************
+**********************
 
 .. function:: grid_cell_intersection(left_chip, right_chip)
@@ -906,7 +906,7 @@ grid_cell_intersection
 +--------------------------------------------------------+
 
 grid_cell_union
-**************
+***************
 
 .. function:: grid_cell_union(left_chip, right_chip)
diff --git a/docs/source/api/vector-format-readers.rst b/docs/source/api/vector-format-readers.rst
index f6821427f..8df63fb1a 100644
--- a/docs/source/api/vector-format-readers.rst
+++ b/docs/source/api/vector-format-readers.rst
@@ -2,9 +2,9 @@
 Vector Format Readers
 =====================
 
-
+#####
 Intro
-################
+#####
 
 Mosaic provides spark readers for vector files supported by GDAL OGR drivers.
 Only the drivers that are built by default are supported.
 Here are some common useful file formats:
@@ -35,7 +35,7 @@ Additionally, for convenience, Mosaic provides specific readers for Shapefile an
 * :code:`spark.read.format("shapefile")` reader for Shapefiles natively in Spark.
 
 spark.read.format("ogr")
-*************************
+************************
 A base Spark SQL data source for reading GDAL vector data sources.
 The output of the reader is a DataFrame with inferred schema.
 The schema is inferred from both features and fields in the vector file.
@@ -55,7 +55,8 @@ The reader supports the following options:
 
     * layerNumber - number of the layer to read (IntegerType), zero-indexed
 
-.. function:: spark.read.format("ogr").load(path)
+.. function:: load(path)
+    :module: spark.read.format("ogr")
 
     Loads a vector file and returns the result as a :class:`DataFrame`.
 
@@ -128,7 +129,8 @@ and parsed into expected types on execution. The reader supports the following o
 
    * layerNumber - number of the layer to read (IntegerType), zero-indexed [pass as String]
 
-.. function:: mos.read().format("multi_read_ogr").load(path)
+.. function:: load(path)
+    :module: mos.read().format("multi_read_ogr")
 
    Loads a vector file and returns the result as a :class:`DataFrame`.
 
@@ -175,7 +177,7 @@ and parsed into expected types on execution. The reader supports the following o
 
 spark.read.format("geo_db")
-*****************************
+***************************
 Mosaic provides a reader for GeoDB files natively in Spark.
 The output of the reader is a DataFrame with inferred schema.
 Only 1 file per task is read. For parallel reading of large files use the multi_read_ogr reader. 
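For orientation, a minimal sketch of invoking this reader (the path and option values are illustrative):

.. code-block:: python

    df = (
        spark.read.format("geo_db")
        .option("layerNumber", "0")     # zero-indexed layer to read
        .option("vsizip", "true")       # set when the GeoDB files are zipped
        .load("dbfs:/path/to/geodb_files/")
    )
    df.printSchema()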
@@ -186,7 +188,8 @@ The reader supports the following options: * layerNumber - number of the layer to read (IntegerType), zero-indexed * vsizip - if the vector files are zipped files, set this to true (BooleanType) -.. function:: spark.read.format("geo_db").load(path) +.. function:: load(path) + :module: spark.read.format("geo_db") Loads a GeoDB file and returns the result as a :class:`DataFrame`. @@ -234,7 +237,7 @@ The reader supports the following options: spark.read.format("shapefile") -******************************** +****************************** Mosaic provides a reader for Shapefiles natively in Spark. The output of the reader is a DataFrame with inferred schema. Only 1 file per task is read. For parallel reading of large files use the multi_read_ogr reader. @@ -245,7 +248,8 @@ The reader supports the following options: * layerNumber - number of the layer to read (IntegerType), zero-indexed * vsizip - if the vector files are zipped files, set this to true (BooleanType) -.. function:: spark.read.format("shapefile").load(path) +.. function:: load(path) + :module: spark.read.format("shapefile") Loads a Shapefile and returns the result as a :class:`DataFrame`. @@ -291,6 +295,7 @@ The reader supports the following options: These must be supplied as a :code:`String`. Also, you can supply function signature values as :code:`String`. +################ Vector File UDFs ################ diff --git a/docs/source/conf.py b/docs/source/conf.py index e81dd3385..e01d5e4d0 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -52,6 +52,7 @@ napoleon_use_admonition_for_notes = True sphinx_tabs_disable_tab_closing = True todo_include_todos = True +suppress_warnings = ["autosectionlabel.*"] # -- Options for HTML output ------------------------------------------------- @@ -64,27 +65,27 @@ html_theme_options = { # Set the name of the project to appear in the navigation. - 'nav_title': f'Mosaic {release}', + # 'nav_title': f'Mosaic {release}', # Specify a base_url used to generate sitemap.xml. If not # specified, then no sitemap will be built. 
# 'base_url': 'https://project.github.io/project', # Set the color and the accent color - 'color_primary': 'green', - 'color_accent': 'green', + # 'color_primary': 'green', + # 'color_accent': 'green', # Set the repo location to get a badge with stats - 'repo_url': 'https://github.com/databrickslabs/mosaic/', - 'repo_name': 'Mosaic', + # 'repo_url': 'https://github.com/databrickslabs/mosaic/', + # 'repo_name': 'Mosaic', - 'globaltoc_depth': 3, + # 'globaltoc_depth': 3, 'globaltoc_collapse': False, 'globaltoc_includehidden': True, - 'heroes': {'index': 'Simple, scalable geospatial analytics on Databricks', - 'examples/index': 'examples and tutorials to get started with ' - 'Mosaic'}, - "version_dropdown": True, + # 'heroes': {'index': 'Simple, scalable geospatial analytics on Databricks', + # 'examples/index': 'examples and tutorials to get started with ' + # 'Mosaic'}, + # "version_dropdown": True, # "version_json": "../versions-v2.json", } diff --git a/docs/source/images/rasterio/quadbin.png b/docs/source/images/rasterio/quadbin.png new file mode 100644 index 000000000..066136feb Binary files /dev/null and b/docs/source/images/rasterio/quadbin.png differ diff --git a/docs/source/usage/automatic-sql-registration.rst b/docs/source/usage/automatic-sql-registration.rst index 56cd1b219..48c19dff7 100644 --- a/docs/source/usage/automatic-sql-registration.rst +++ b/docs/source/usage/automatic-sql-registration.rst @@ -12,7 +12,7 @@ with a geospatial middleware component such as [Geoserver](https://geoserver.org .. warning:: Mosaic 0.4.x SQL bindings for DBR 13 can register with Assigned clusters (as Spark Expressions), but not Shared Access due - to `Unity Catalog `_ API changes, more `here `_. + to `Unity Catalog `__ API changes, more `here `__. Pre-requisites ************** @@ -20,13 +20,13 @@ Pre-requisites In order to use Mosaic, you must have access to a Databricks cluster running Databricks Runtime 13. If you have cluster creation permissions in your Databricks workspace, you can create a cluster using the instructions -`here `_. +`here `__. You will also need "Can Manage" permissions on this cluster in order to attach init script to your cluster. A workspace administrator will be able to grant these permissions and more information about cluster permissions can be found in our documentation -`here `_. +`here `__. Installation ************ @@ -59,9 +59,9 @@ To install Mosaic on your Databricks cluster, take the following steps: EOF -#. Configure the init script for the cluster following the instructions `here `_. +#. Configure the init script for the cluster following the instructions `here `__. -#. Add the following spark configuration values for your cluster following the instructions `here `_. +#. Add the following spark configuration values for your cluster following the instructions `here `__. .. code-block:: bash diff --git a/docs/source/usage/install-gdal.rst b/docs/source/usage/install-gdal.rst index 12d1217d0..bd71d20dc 100644 --- a/docs/source/usage/install-gdal.rst +++ b/docs/source/usage/install-gdal.rst @@ -8,17 +8,17 @@ In order to use Mosaic 0.4 series, you must have access to a Databricks cluster Databricks Runtime 13.3 LTS. If you have cluster creation permissions in your Databricks workspace, you can create a cluster using the instructions -`here `_. +`here `__. You will also need "Can Manage" permissions on this cluster in order to attach the Mosaic library to your cluster. 
A workspace administrator will be able to grant these permissions and more information about cluster permissions can be found in our documentation -`here `_. +`here `__. .. warning:: These instructions assume an Assigned cluster is being used (vs a Shared Access cluster), - more on access modes `here `_. + more on access modes `here `__. GDAL Installation #################### @@ -131,7 +131,7 @@ GDAL is configured as follows in `MosaicGDAL " * - GDAL_PAM_PROXY_DIR diff --git a/docs/source/usage/installation.rst b/docs/source/usage/installation.rst index cdeeba4d0..777a471a3 100644 --- a/docs/source/usage/installation.rst +++ b/docs/source/usage/installation.rst @@ -16,49 +16,49 @@ Mosaic 0.4.x series only supports DBR 13.x DBRs. If running on a different DBR i DEPRECATION ERROR: Mosaic v0.4.x series only supports Databricks Runtime 13. You can specify :code:`%pip install 'databricks-mosaic<0.4,>=0.3'` for DBR < 13. -Mosaic 0.4.x series issues an ERROR on standard, non-Photon clusters `ADB `_ | -`AWS `_ | -`GCP `_: +Mosaic 0.4.x series issues an ERROR on standard, non-Photon clusters `ADB `__ | +`AWS `__ | +`GCP `__: DEPRECATION ERROR: Please use a Databricks Photon-enabled Runtime for performance benefits or Runtime ML for spatial AI benefits; Mosaic 0.4.x series restricts executing this cluster. As of Mosaic 0.4.0 / DBR 13.3 LTS (subject to change in follow-on releases): -* `Assigned Clusters `_ +* `Assigned Clusters `__ * Mosaic Python, SQL, R, and Scala APIs. -* `Shared Access Clusters `_ - * Mosaic Scala API (JVM) with Admin `allowlisting `_. +* `Shared Access Clusters `__ + * Mosaic Scala API (JVM) with Admin `allowlisting `__. * Mosaic Python bindings (to Mosaic Scala APIs) are blocked by Py4J Security on Shared Access Clusters. - * Mosaic SQL expressions cannot yet be registered due to `Unity Catalog `_. - API changes, more `here `_. + * Mosaic SQL expressions cannot yet be registered due to `Unity Catalog `__. + API changes, more `here `__. .. note:: Mosaic is a custom JVM library that extends spark, which has the following implications in DBR 13.3 LTS: - * `Unity Catalog `_ enforces process isolation which is difficult + * `Unity Catalog `__ enforces process isolation which is difficult to accomplish with custom JVM libraries; as such only built-in (aka platform provided) JVM APIs can be invoked from other supported languages in Shared Access Clusters. - * Clusters can read `Volumes `_ via relevant + * Clusters can read `Volumes `__ via relevant built-in (aka platform provided) readers and writers or via custom python calls which do not involve any custom JVM code. If you have cluster creation permissions in your Databricks workspace, you can create a cluster using the instructions -`here `_. +`here `__. You will also need "Can Manage" permissions on this cluster in order to attach the Mosaic library to your cluster. A workspace administrator will be able to grant these permissions and more information about cluster permissions can be found in our documentation -`here `_. +`here `__. Package installation #################### Installation from PyPI ********************** -Python users can install the library directly from `PyPI `_ -using the instructions `here `_ +Python users can install the library directly from `PyPI `__ +using the instructions `here `__ or from within a Databricks notebook using the :code:`%pip` magic command, e.g. .. code-block:: bash @@ -72,11 +72,11 @@ if you need to install Mosaic 0.3 series for DBR 12.2 LTS, e.g. 
%pip install "databricks-mosaic<0.4,>=0.3" -For Mosaic versions < 0.4 please use the `0.3 docs `_. +For Mosaic versions < 0.4 please use the `0.3 docs `__. Installation from release artifacts *********************************** -Alternatively, you can access the latest release artifacts `here `_ +Alternatively, you can access the latest release artifacts `here `__ and manually attach the appropriate library to your cluster. Which artifact you choose to attach will depend on the language API you intend to use. @@ -85,13 +85,13 @@ Which artifact you choose to attach will depend on the language API you intend t * For Scala users, take the Scala JAR (packaged with all necessary dependencies). * For R users, download the Scala JAR and the R bindings library [see the sparkR readme](R/sparkR-mosaic/README.md). -Instructions for how to attach libraries to a Databricks cluster can be found `here `_. +Instructions for how to attach libraries to a Databricks cluster can be found `here `__. Automated SQL registration ************************** If you would like to use Mosaic's functions in pure SQL (in a SQL notebook, from a business intelligence tool, or via a middleware layer such as Geoserver, perhaps) then you can configure -"Automatic SQL Registration" using the instructions `here `_. +"Automatic SQL Registration" using the instructions `here `__. Enabling the Mosaic functions ############################# @@ -184,4 +184,4 @@ register the Mosaic SQL functions in your SparkSession from a Scala notebook cel .. warning:: Mosaic 0.4.x SQL bindings for DBR 13 can register with Assigned clusters (as Spark Expressions), but not Shared Access due - to `Unity Catalog `_ API changes, more `here `_. + to `Unity Catalog `__ API changes, more `here `__. diff --git a/docs/source/usage/raster-checkpointing.rst b/docs/source/usage/raster-checkpointing.rst new file mode 100644 index 000000000..5d35ca5ad --- /dev/null +++ b/docs/source/usage/raster-checkpointing.rst @@ -0,0 +1,47 @@ +=================================== +Checkpointing for raster operations +=================================== + +Mosaic offers the ability to checkpoint raster operations to disk. This is useful when working with large rasters and +complex operations that may require multiple stages of computation. Checkpointing can be used to save intermediate results +to the cloud object store, which can be loaded back into memory at a later stage. +This can help to reduce the amount of memory required to store intermediate results, and can also help to improve +performance by reducing the amount of data that needs to be transferred between nodes during wide transformations. + +Checkpointing is enabled by setting the :code:`spark.databricks.labs.mosaic.raster.use.checkpoint` configuration to :code:`true`. +By default, checkpointing is disabled. When checkpointing is enabled, Mosaic will save intermediate results to the checkpoint directory +specified by the :code:`spark.databricks.labs.mosaic.raster.checkpoint` configuration. +The checkpoint directory must be a valid DBFS path (by default it is set to :code:`/dbfs/tmp/mosaic/raster/checkpoint`). + +There are also a number of helper functions that can be used to manage checkpointing in Mosaic. + +The simplest way to enable checkpointing is to specify a checkpoint directory when calling :code:`enable_gdal()` from the Python interface... + + .. 
code-block:: python
+
+        import mosaic as mos
+
+        mos.enable_mosaic(spark, dbutils)
+        mos.enable_gdal(spark, with_checkpoint_path="/dbfs/tmp/mosaic/raster/checkpoint")
+
+
+... or directly from Scala using :code:`MosaicGDAL.enableGDALWithCheckpoint()`:
+
+    .. code-block:: scala
+
+        import com.databricks.labs.mosaic.functions.MosaicContext
+        import com.databricks.labs.mosaic.gdal.MosaicGDAL
+        import com.databricks.labs.mosaic.H3
+        import com.databricks.labs.mosaic.JTS
+
+        val mosaicContext = MosaicContext.build(H3, JTS)
+        import mosaicContext.functions._
+
+        MosaicGDAL.enableGDALWithCheckpoint(spark, "/dbfs/tmp/mosaic/raster/checkpoint")
+
+Checkpointing can be modified within the Python interface using the following functions:
+
+    * :code:`update_checkpoint_path(spark: SparkSession, path: str)`
+    * :code:`set_checkpoint_on(spark: SparkSession)`
+    * :code:`set_checkpoint_off(spark: SparkSession)`
+
diff --git a/docs/source/usage/usage.rst b/docs/source/usage/usage.rst
index d7c8c3e10..f829ac043 100644
--- a/docs/source/usage/usage.rst
+++ b/docs/source/usage/usage.rst
@@ -11,4 +11,5 @@ Usage
    quickstart
    kepler
    automatic-sql-registration
+   raster-checkpointing
diff --git a/notebooks/examples/python/Quickstart/QuickstartNotebook.ipynb b/notebooks/examples/python/Quickstart/QuickstartNotebook.ipynb
index d45e3b986..f2614f071 100644
--- a/notebooks/examples/python/Quickstart/QuickstartNotebook.ipynb
+++ b/notebooks/examples/python/Quickstart/QuickstartNotebook.ipynb
@@ -12,6 +12,7 @@
    }
   },
   "source": [
+    "%md\n",
    "# Mosaic Quickstart\n",
    "\n",
    "> Perform a point-in-polygon spatial join between NYC Taxi trips and zones. __Note: this does not get into performance tweaks that are available for scaled joins.__\n",
@@ -29,13 +30,10 @@
    " * Already installed with Mosaic, use `%%mosaic_kepler` magic [[Mosaic Docs](https://databrickslabs.github.io/mosaic/usage/kepler.html)]\n",
    " * Import with `from keplergl import KeplerGl` to use directly\n",
    "\n",
-   "If you have trouble with Volume access:\n",
-   "\n",
-   "* For Mosaic 0.3 series (< DBR 13) - you can copy resources to DBFS as a workaround\n",
-   "* For Mosaic 0.4 series (DBR 13.3 LTS) - you will need to either copy resources to DBFS or setup for Unity Catalog + Shared Access which will involve your workspace admin. Instructions, as updated, will be [here](https://databrickslabs.github.io/mosaic/usage/install-gdal.html).\n",
+   "If you have trouble reading source datasets from a Unity Catalog Volume, the easiest workaround is to copy resources to an accessible location in DBFS. 
Note: 'Shared' access mode clusters are not supported at all.\n", "\n", "--- \n", - " __Last Update__ 28 NOV 2023 [Mosaic 0.3.12]" + " __Last Update__ 10 JUN 2024 [Mosaic 0.4.2]" ] }, { @@ -57,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -72,22 +70,21 @@ }, "outputs": [ { - "output_type": "stream", "name": "stdout", "output_type": "stream", "text": [ - "Python interpreter will be restarted.\nPython interpreter will be restarted.\n" + "Python interpreter will be restarted.\n", + "Python interpreter will be restarted.\n" ] } ], "source": [ - "%pip install \"databricks-mosaic<0.4,>=0.3\" --quiet # <- Mosaic 0.3 series\n", - "# %pip install \"databricks-mosaic<0.5,>=0.4\" --quiet # <- Mosaic 0.4 series (as available)" + "%pip install \"databricks-mosaic<0.5,>=0.4\" --quiet # <- Mosaic 0.4 series" ] }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -109,8 +106,7 @@ "spark.conf.set(\"spark.sql.shuffle.partitions\", 1_024) # <-- default is 200\n", "\n", "# -- import databricks + spark functions\n", - "from pyspark.sql import functions as F\n", - "from pyspark.sql.functions import col, udf\n", + "from pyspark.sql.functions import col, udf, lit, to_json, explode, array\n", "from pyspark.sql.types import *\n", "\n", "# -- setup mosaic\n", @@ -145,7 +141,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -160,7 +156,6 @@ }, "outputs": [ { - "output_type": "stream", "name": "stdout", "output_type": "stream", "text": [ @@ -194,7 +189,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -209,7 +204,6 @@ }, "outputs": [ { - "output_type": "stream", "name": "stdout", "output_type": "stream", "text": [ @@ -228,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": 0, + "execution_count": null, "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": { @@ -243,7 +237,6 @@ }, "outputs": [ { - "output_type": "stream", "name": "stdout", "output_type": "stream", "text": [ @@ -251,7 +244,6 @@ ] }, { - "output_type": "display_data", "data": { "text/html": [ "