Merge pull request #517 from mjohns-databricks/mjohns-4.0-docs-1

Mjohns 0.4.0 docs 1
databrickslabs · Jan 24, 2024 · d89cc16 · d89cc16
2 parents 696e852 + de74df2
commit d89cc16
Show file tree

Hide file tree

Showing 68 changed files with 1,446 additions and 1,377 deletions.
diff --git a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testVectorFunctions.R b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testVectorFunctions.R
@@ -78,8 +78,8 @@ test_that("aggregate vector functions behave as intended", {
   sdf.intersection <- join(sdf.l, sdf.r, sdf.l$left_index == sdf.r$right_index, "inner")
   sdf.intersection <- summarize(
     groupBy(sdf.intersection, sdf.intersection$left_id, sdf.intersection$right_id),
-    agg_intersects = st_intersects_aggregate(column("left_index"), column("right_index")),
-    agg_intersection = st_intersection_aggregate(column("left_index"), column("right_index")),
+    agg_intersects = st_intersects_agg(column("left_index"), column("right_index")),
+    agg_intersection = st_intersection_agg(column("left_index"), column("right_index")),
     left_geom = first(column("left_geom")),
     right_geom = first(column("right_geom"))
   )

diff --git a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testVectorFunctions.R b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testVectorFunctions.R
@@ -92,8 +92,8 @@ test_that("aggregate vector functions behave as intended", {
     inner_join(sdf.r, by = c("left_index" = "right_index"), keep = TRUE) %>%
     dplyr::group_by(left_id, right_id) %>%
     dplyr::summarise(
-      agg_intersects = st_intersects_aggregate(left_index, right_index),
-      agg_intersection = st_intersection_aggregate(left_index, right_index),
+      agg_intersects = st_intersects_agg(left_index, right_index),
+      agg_intersection = st_intersection_agg(left_index, right_index),
       left_geom = max(left_geom, 1),
       right_geom = max(right_geom, 1)
     ) %>%

diff --git a/docs/source/api/raster-functions.rst b/docs/source/api/raster-functions.rst
diff --git a/docs/source/api/rasterio-udfs.rst b/docs/source/api/rasterio-udfs.rst
@@ -119,14 +119,14 @@ Finally we will apply the function to the DataFrame.
 .. code-block:: python
 
     df.select(compute_band_mean("tile.raster")).show()
-    +---------------------------+
-    | compute_band_mean(raster) |
-    +---------------------------+
-    |         0.0111000000000000|
-    |         0.0021000000000000|
-    |         0.3001000000000000|
-    | ...                       |
-    +---------------------------+
+    +----------------------------+
+    | compute_band_mean(raster)  |
+    +----------------------------+
+    |         0.0111000000000000 |
+    |         0.0021000000000000 |
+    |         0.3001000000000000 |
+    | ...                        |
+    +----------------------------+
 
 
 UDF example for computing NDVI
@@ -234,43 +234,125 @@ Firstly we will create a spark DataFrame from a directory of raster files.
     +-----------------------------------------------------------+------------------------------+-----------+---------------------+-------+-------+-----------+----------------------+-------------+-------+---------------------------------------------------------------------------------------------------------------+
 
 
-Next we will define a function that will write a given raster file to disk.
+Next we will define a function that will write a given raster file to disk. A "gotcha" to keep in mind is that you do
+not want to have a file context manager open when you go to write out its context as the context manager will not yet
+have been flushed. Another "gotcha" might be that the raster dataset does not have CRS included; if this arises, we
+recommend adjusting the function to specify the CRS and set it on the dst variable, more at
+`rasterio.crs <https://rasterio.readthedocs.io/en/stable/api/rasterio.crs.html>`_. We would also point out that notional
+"file_id" param can be constructed as a repeatable name from other field(s) in your dataframe / table or be random,
+depending on your needs.
 
 .. code-block:: python
 
-    import numpy as np
-    import rasterio
-    from rasterio.io import MemoryFile
-    from io import BytesIO
-    from pyspark.sql.functions import udf
-    from pathlib import Path
+    @udf("string")
+    def write_raster(raster, driver, file_id, fuse_dir):
+        from io import BytesIO
+        from pathlib import Path
+        from pyspark.sql.functions import udf
+        from rasterio.io import MemoryFile
+        import numpy as np
+        import rasterio
+        import shutil
+        import tempfile
+
+        # - [1] populate the initial profile
+        # # profile is needed in order to georeference the image
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            profile = None
+            data_arr = None
+            with MemoryFile(BytesIO(raster)) as memfile:
+                with memfile.open() as dataset:
+                    profile = dataset.profile
+                    data_arr = dataset.read()
+            # here you can update profile using .update method
+            # example https://rasterio.readthedocs.io/en/latest/topics/writing.html
+            # - [2] get the correct extension
+            extensions_map = rasterio.drivers.raster_driver_extensions()
+            driver_map = {v: k for k, v in extensions_map.items()}
+            extension = driver_map[driver] #e.g. GTiff
+            file_name = f"{file_id}.{extension}"
+            # - [3] write local raster
+            # - this is showing a single band [1]
+            #   being written
+            tmp_path = f"{tmp_dir}/{file_name}"
+            with rasterio.open(
+              tmp_path,
+              "w",
+              **profile
+            ) as dst:
+                dst.write(data_arr) # <- adjust as needed
+            # - [4] copy to fuse path
+            Path(fuse_dir).mkdir(parents=True, exist_ok=True)
+            fuse_path = f"{fuse_dir}/{file_name}"
+            if not os.path.exists(fuse_path):
+                shutil.copyfile(tmp_path, fuse_path)
+        return fuse_path
+
+Finally we will apply the function to the DataFrame.
+
+.. code-block:: python
+
+    df.select(
+      write_raster(
+        "tile.raster",
+        lit("GTiff").alias("driver"),
+        "uuid",
+        lit("/dbfs/path/to/output/dir").alias("fuse_dir")
+      )
+    ).display()
+    +----------------------------------------------+
+    | write_raster(raster, driver, uuid, fuse_dir) |
+    +----------------------------------------------+
+    | /dbfs/path/to/output/dir/1234.tif            |
+    | /dbfs/path/to/output/dir/4545.tif            |
+    | /dbfs/path/to/output/dir/3215.tif            |
+    | ...                                          |
+    +----------------------------------------------+
+
+Sometimes you don't need to be quite as fancy. Consider when you simply want to specify to write out raster contents,
+assuming you specify the extension in the file_name. This is just writing binary column to file, nothing further. Again,
+we use a notional "uuid" column as part of "file_name" param, which would have the same considerations as mentioned
+above.
+
+.. code-block:: python
 
     @udf("string")
-    def write_raster(raster, file_id, parent_dir):
-      with MemoryFile(BytesIO(raster)) as memfile:
-        with memfile.open() as dataset:
-          Path(outputpath).mkdir(parents=True, exist_ok=True)
-          extensions_map = rasterio.drivers.raster_driver_extensions()
-          driver_map = {v: k for k, v in extensions_map.items()}
-          extension = driver_map[dataset.driver]
-          path = f"{parent_dir}/{file_id}.{extension}"
-          # If you want to write the raster to a different format
-          # you can update the profile here. Note that the extension
-          # should match the driver format
-          with rasterio.open(path, "w", **dataset.profile) as dst:
-            dst.write(dataset.read())
-          return path
+    def write_binary(raster_bin, file_name, fuse_dir):
+        from pathlib import Path
+        import os
+        import shutil
+        import tempfile
+
+        Path(fuse_dir).mkdir(parents=True, exist_ok=True)
+        fuse_path = f"{fuse_dir}/{file_name}"
+        if not os.path.exists(fuse_path):
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                tmp_path = f"{tmp_dir}/{file_name}"
+                # - write within the tmp_dir context
+                # - flush the writer before copy
+                tmp_file = open(tmp_path, "wb")
+                tmp_file.write(raster_bin)  # <- write entire binary content
+                tmp_file.close()
+                # - copy local to fuse
+                shutil.copyfile(tmp_path, fuse_path)
+        return fuse_path
 
 Finally we will apply the function to the DataFrame.
 
 .. code-block:: python
 
-    df.select(write_raster("tile.raster", "uuid", lit("dbfs:/path/to/output/dir"))).show()
-    +-------------------------------------+
-    | write_raster(raster, output, output)|
-    +-------------------------------------+
-    | dbfs:/path/to/output/dir/1234.tif   |
-    | dbfs:/path/to/output/dir/4545.tif   |
-    | dbfs:/path/to/output/dir/3215.tif   |
-    | ...                                 |
-    +-------------------------------------+
+    df.select(
+      write_binary(
+        "tile.raster",
+        F.concat("uuid", F.lit(".tif")).alias("file_name"),
+        F.lit("/dbfs/path/to/output/dir").alias("fuse_dir")
+      )
+    ).display()
+    +-------------------------------------------+
+    | write_binary(raster, file_name, fuse_dir) |
+    +-------------------------------------------+
+    | /dbfs/path/to/output/dir/1234.tif         |
+    | /dbfs/path/to/output/dir/4545.tif         |
+    | /dbfs/path/to/output/dir/3215.tif         |
+    | ...                                       |
+    +-------------------------------------------+