diff --git a/.github/actions/python_build/action.yml b/.github/actions/python_build/action.yml index 3eab1778d..97d9b3af2 100644 --- a/.github/actions/python_build/action.yml +++ b/.github/actions/python_build/action.yml @@ -12,6 +12,8 @@ runs: run: | cd python pip install build wheel pyspark==${{ matrix.spark }} numpy==${{ matrix.numpy }} + pip install numpy==${{ matrix.numpy }} + pip install --no-build-isolation --no-cache-dir --force-reinstall gdal==${{ matrix.gdal }} pip install . - name: Test and build python package shell: bash diff --git a/.github/actions/r_build/action.yml b/.github/actions/r_build/action.yml index e970f2fdb..c9fa2f231 100644 --- a/.github/actions/r_build/action.yml +++ b/.github/actions/r_build/action.yml @@ -23,8 +23,8 @@ runs: name: Download and unpack Spark shell: bash run: | - wget -P /usr/spark-download/raw https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop2.7.tgz - tar zxvf /usr/spark-download/raw/spark-3.2.1-bin-hadoop2.7.tgz -C /usr/spark-download/unzipped + wget -P /usr/spark-download/raw https://archive.apache.org/dist/spark/spark-${{ matrix.spark }}/spark-${{ matrix.spark }}-bin-hadoop3.tgz + tar zxvf /usr/spark-download/raw/spark-${{ matrix.spark }}-bin-hadoop3.tgz -C /usr/spark-download/unzipped - name: Create R environment shell: bash run: | @@ -50,16 +50,25 @@ runs: run: | cd R Rscript --vanilla generate_docs.R + env: + SPARK_HOME: /usr/spark-download/unzipped/spark-${{ matrix.spark }}-bin-hadoop3 - name: Build R package shell: bash run: | cd R Rscript --vanilla build_r_package.R - - name: Test R package + env: + SPARK_HOME: /usr/spark-download/unzipped/spark-${{ matrix.spark }}-bin-hadoop3 + - name: Test SparkR package shell: bash run: | cd R/sparkR-mosaic Rscript --vanilla tests.R + - name: Test sparklyr package + shell: bash + run: | + cd R/sparklyr-mosaic + Rscript --vanilla tests.R - name: Copy R artifacts to GH Actions run shell: bash run: | diff --git a/.github/actions/scala_build/action.yml 
b/.github/actions/scala_build/action.yml index 5e12ead9a..2d999f0ca 100644 --- a/.github/actions/scala_build/action.yml +++ b/.github/actions/scala_build/action.yml @@ -23,6 +23,8 @@ runs: pip install databricks-mosaic-gdal==${{ matrix.gdal }} sudo tar -xf /opt/hostedtoolcache/Python/${{ matrix.python }}/x64/lib/python3.9/site-packages/databricks-mosaic-gdal/resources/gdal-${{ matrix.gdal }}-filetree.tar.xz -C / sudo tar -xhf /opt/hostedtoolcache/Python/${{ matrix.python }}/x64/lib/python3.9/site-packages/databricks-mosaic-gdal/resources/gdal-${{ matrix.gdal }}-symlinks.tar.xz -C / + pip install numpy==${{ matrix.numpy }} + pip install gdal==${{ matrix.gdal }} - name: Test and build the scala JAR - skip tests is false if: inputs.skip_tests == 'false' shell: bash diff --git a/LICENSE b/LICENSE index 21db58bb9..dc30b4656 100644 --- a/LICENSE +++ b/LICENSE @@ -1,25 +1,69 @@ -DB license + Databricks License + Copyright (2022) Databricks, Inc. -Copyright (2022) Databricks, Inc. + Definitions. + + Agreement: The agreement between Databricks, Inc., and you governing + the use of the Databricks Services, as that term is defined in + the Master Cloud Services Agreement (MCSA) located at + www.databricks.com/legal/mcsa. + + Licensed Materials: The source code, object code, data, and/or other + works to which this license applies. -Definitions. + Scope of Use. You may not use the Licensed Materials except in + connection with your use of the Databricks Services pursuant to + the Agreement. Your use of the Licensed Materials must comply at all + times with any restrictions applicable to the Databricks Services, + generally, and must be used in accordance with any applicable + documentation. You may view, use, copy, modify, publish, and/or + distribute the Licensed Materials solely for the purposes of using + the Licensed Materials within or connecting to the Databricks Services. 
+ If you do not agree to these terms, you may not view, use, copy, + modify, publish, and/or distribute the Licensed Materials. + + Redistribution. You may redistribute and sublicense the Licensed + Materials so long as all use is in compliance with these terms. + In addition: + + - You must give any other recipients a copy of this License; + - You must cause any modified files to carry prominent notices + stating that you changed the files; + - You must retain, in any derivative works that you distribute, + all copyright, patent, trademark, and attribution notices, + excluding those notices that do not pertain to any part of + the derivative works; and + - If a "NOTICE" text file is provided as part of its + distribution, then any derivative works that you distribute + must include a readable copy of the attribution notices + contained within such NOTICE file, excluding those notices + that do not pertain to any part of the derivative works. -Agreement: The agreement between Databricks, Inc., and you governing the use of the Databricks Services, which shall be, with respect to Databricks, the Databricks Terms of Service located at www.databricks.com/termsofservice, and with respect to Databricks Community Edition, the Community Edition Terms of Service located at www.databricks.com/ce-termsofuse, in each case unless you have entered into a separate written agreement with Databricks governing the use of the applicable Databricks Services. + You may add your own copyright statement to your modifications and may + provide additional license terms and conditions for use, reproduction, + or distribution of your modifications, or for any such derivative works + as a whole, provided your use, reproduction, and distribution of + the Licensed Materials otherwise complies with the conditions stated + in this License. -Software: The source code and object code to which this license applies. + Termination. 
This license terminates automatically upon your breach of + these terms or upon the termination of your Agreement. Additionally, + Databricks may terminate this license at any time on notice. Upon + termination, you must permanently delete the Licensed Materials and + all copies thereof. -Scope of Use. You may not use this Software except in connection with your use of the Databricks Services pursuant to the Agreement. Your use of the Software must comply at all times with any restrictions applicable to the Databricks Services, generally, and must be used in accordance with any applicable documentation. You may view, use, copy, modify, publish, and/or distribute the Software solely for the purposes of using the code within or connecting to the Databricks Services. If you do not agree to these terms, you may not view, use, copy, modify, publish, and/or distribute the Software. + DISCLAIMER; LIMITATION OF LIABILITY. -Redistribution. You may redistribute and sublicense the Software so long as all use is in compliance with these terms. In addition: - -You must give any other recipients a copy of this License; -You must cause any modified files to carry prominent notices stating that you changed the files; -You must retain, in the source code form of any derivative works that you distribute, all copyright, patent, trademark, and attribution notices from the source code form, excluding those notices that do not pertain to any part of the derivative works; and -If the source code form includes a "NOTICE" text file as part of its distribution, then any derivative works that you distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the derivative works. 
-You may add your own copyright statement to your modifications and may provide additional license terms and conditions for use, reproduction, or distribution of your modifications, or for any such derivative works as a whole, provided your use, reproduction, and distribution of the Software otherwise complies with the conditions stated in this License. - -Termination. This license terminates automatically upon your breach of these terms or upon the termination of your Agreement. Additionally, Databricks may terminate this license at any time on notice. Upon termination, you must permanently delete the Software and all copies thereof. - -DISCLAIMER; LIMITATION OF LIABILITY. - -THE SOFTWARE IS PROVIDED “AS-IS” AND WITH ALL FAULTS. DATABRICKS, ON BEHALF OF ITSELF AND ITS LICENSORS, SPECIFICALLY DISCLAIMS ALL WARRANTIES RELATING TO THE SOURCE CODE, EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, IMPLIED WARRANTIES, CONDITIONS AND OTHER TERMS OF MERCHANTABILITY, SATISFACTORY QUALITY OR FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. DATABRICKS AND ITS LICENSORS TOTAL AGGREGATE LIABILITY RELATING TO OR ARISING OUT OF YOUR USE OF OR DATABRICKS’ PROVISIONING OF THE SOURCE CODE SHALL BE LIMITED TO ONE THOUSAND ($1,000) DOLLARS. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + THE LICENSED MATERIALS ARE PROVIDED “AS-IS” AND WITH ALL FAULTS. + DATABRICKS, ON BEHALF OF ITSELF AND ITS LICENSORS, SPECIFICALLY + DISCLAIMS ALL WARRANTIES RELATING TO THE LICENSED MATERIALS, EXPRESS + AND IMPLIED, INCLUDING, WITHOUT LIMITATION, IMPLIED WARRANTIES, + CONDITIONS AND OTHER TERMS OF MERCHANTABILITY, SATISFACTORY QUALITY OR + FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. 
DATABRICKS AND + ITS LICENSORS TOTAL AGGREGATE LIABILITY RELATING TO OR ARISING OUT OF + YOUR USE OF OR DATABRICKS’ PROVISIONING OF THE LICENSED MATERIALS SHALL + BE LIMITED TO ONE THOUSAND ($1,000) DOLLARS. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + ARISING FROM, OUT OF OR IN CONNECTION WITH THE LICENSED MATERIALS OR + THE USE OR OTHER DEALINGS IN THE LICENSED MATERIALS. diff --git a/R/.gitignore b/R/.gitignore index eb4bec116..f5b37c1ec 100644 --- a/R/.gitignore +++ b/R/.gitignore @@ -1,2 +1,3 @@ **/.Rhistory **/*.tar.gz +/sparklyr-mosaic/metastore_db/ diff --git a/R/build_r_package.R b/R/build_r_package.R index ab82b99b1..a114736d1 100644 --- a/R/build_r_package.R +++ b/R/build_r_package.R @@ -1,13 +1,6 @@ -spark_location <- "/usr/spark-download/unzipped/spark-3.2.1-bin-hadoop2.7" -Sys.setenv(SPARK_HOME = spark_location) - +spark_location <- Sys.getenv("SPARK_HOME") library(SparkR, lib.loc = c(file.path(spark_location, "R", "lib"))) - - library(pkgbuild) -library(sparklyr) - - build_mosaic_bindings <- function(){ ## build package diff --git a/R/generate_R_bindings.R b/R/generate_R_bindings.R index 96a1d8286..093d68e95 100644 --- a/R/generate_R_bindings.R +++ b/R/generate_R_bindings.R @@ -8,14 +8,14 @@ library(methods) parser <- function(x){ #split on left bracket to get name - splitted = strsplit(x, "(", fixed=T)[[1]] + splitted <- strsplit(x, "(", fixed=T)[[1]] # extract function name - function_name = splitted[1] + function_name <- splitted[1] # remove the trailing bracket - args = gsub( ")", '',splitted[2], fixed=T) - args = strsplit(args, ", ", fixed=T)[[1]] - args = lapply(args, function(x){strsplit(x, ": ", fixed=T)}[[1]]) - output = list( + args <- gsub( ")", '',splitted[2], fixed=T) + args <- strsplit(args, ", ", fixed=T)[[1]] + args <- lapply(args, function(x){strsplit(x, ": ", fixed=T)}[[1]]) + output <- list( "function_name" = 
function_name ,"args"=args ) @@ -24,8 +24,8 @@ parser <- function(x){ ############################################################ build_generic <- function(input){ - function_name = input$function_name - args = lapply(input$args, function(x){x[1]}) + function_name <- input$function_name + args <- lapply(input$args, function(x){x[1]}) paste0( '#\' @rdname ', function_name, ' setGeneric( @@ -35,21 +35,9 @@ build_generic <- function(input){ ') } - -build_generic2 <- function(input){ - function_name = input$function_name - args = lapply(input$args, function(x){x[1]}) - paste0( - '#\' @rdname ', function_name, ' - setGeneric( - name="',function_name,'" - ,def=function(',paste0(args, collapse=','), ') {standardGeneric("',function_name, '")} - ) - ') -} ############################################################ build_column_specifiers <- function(input){ - args = lapply(input$args, function(x){x[1]}) + args <- lapply(input$args, function(x){x[1]}) build_column_specifier <- function(arg){ return(paste0(arg, '@jc')) } @@ -62,29 +50,32 @@ build_column_specifiers <- function(input){ } ############################################################ build_method<-function(input){ - function_name = input$function_name - arg_names = lapply(input$args, function(x){c(x[1])}) + function_name <- input$function_name + arg_names <- lapply(input$args, function(x){c(x[1])}) #this handles converting non-Column arguments to their R equivalents argument_parser <- function(x){ if(x[2] == 'Int'){ - x[2] = "numeric" + x[2] <- "numeric" } else if(x[2] == 'String'){ - x[2] = "character" + x[2] <- "character" } else if(x[2] == 'Double'){ - x[2] = "numeric" + x[2] <- "numeric" + } + else if(x[2] == 'Boolean') { + x[2] <- "logical" } x } # convert scala type to R types - args = lapply(input$args, argument_parser) + args <- lapply(input$args, argument_parser) # take a copy for building the docs - param_args = args + param_args <- args # wrap the strings in speech marks - args = lapply(args, 
function(x){c(x[1], paste0("'", x[2], "'"))}) + args <- lapply(args, function(x){c(x[1], paste0("'", x[2], "'"))}) # collapse down to a single string - args = lapply(args, function(x){paste0(x, collapse= ' = ')}) + args <- lapply(args, function(x){paste0(x, collapse= ' = ')}) column_specifiers <- build_column_specifiers(input) docstring <- paste0( c(paste0(c("#'", function_name), collapse=" "), @@ -116,48 +107,62 @@ build_method<-function(input){ ############################################################ get_function_names <- function(scala_file_path){ #scala_file_path = "~/Documents/mosaic/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala" - scala_file_object = file(scala_file_path) + scala_file_object <- file(scala_file_path) - scala_file = readLines(scala_file_object) + scala_file <- readLines(scala_file_object) closeAllConnections() # find where the methods start - start_string = " object functions extends Serializable {" - start_index = grep(start_string, scala_file, fixed=T) + 1 + start_string <- " object functions extends Serializable {" + start_index <- grep(start_string, scala_file, fixed=T) + 1 # find the methods end - will be the next curly bracket # need to find where the matching end brace for the start string is located. 
# counter starts at 1 as the start string includes the opening brace - brace_counter = 1 + brace_counter <- 1 for(i in start_index : length(scala_file)){ # split the string into characters - returns a list so unlist it line_characters <- unlist(strsplit(scala_file[i], '')) # count the number of brace opens - n_opens = sum(grepl("{", line_characters, fixed=T)) + n_opens <- sum(grepl("{", line_characters, fixed=T)) # count the number of brace closes - n_closes = sum(grepl("}", line_characters, fixed=T)) + n_closes <- sum(grepl("}", line_characters, fixed=T)) # update the counter brace_counter <- brace_counter + n_opens - n_closes if (brace_counter == 0) break } - methods_to_bind = scala_file[start_index:i] + methods_to_bind <- scala_file[start_index:i] # remove any line that doesn't start with def - def_mask = grepl('\\s+def .*', methods_to_bind) - methods_to_bind = methods_to_bind[def_mask] + def_mask <- grepl('\\s+def .*', methods_to_bind) + methods_to_bind <- methods_to_bind[def_mask] # parse the string to get just the function_name(input:type...) pattern - methods_to_bind = unlist(lapply(methods_to_bind, function(x){ + methods_to_bind <- unlist(lapply(methods_to_bind, function(x){ substr(x , regexpr("def ", x, fixed=T)[1]+4 # get the starting point to account for whitespace , regexpr("): ", x, fixed=T)[1] # get the end point of where the return is. 
) } )) - sort(methods_to_bind, T) + sort_methods_by_argcount(methods_to_bind) +} + +############################################################ +sort_methods_by_argcount <- function(methods) { + # Extract each method name (the text before the opening bracket) and count the commas in each signature as a proxy for its argument count + method_names <- sapply(strsplit(methods, "\\("), function(x) x[1]) + argcount <- sapply(strsplit(methods, ","), function(x) length(x) - 1) + + # Use the order function to sort first alphabetically by method name and then by the comma count + order_indices <- order(method_names, argcount) + + # Return the sorted list + methods_sorted <- methods[order_indices] + return(methods_sorted) } ############################################################ build_sparklyr_mosaic_function <- function(input){ - function_name = input$function_name + function_name <- input$function_name paste0( "#' ", function_name, "\n\n", @@ -191,7 +196,7 @@ main <- function(scala_file_path){ ########################## ########################## # build sparkr functions - function_data = get_function_names(scala_file_path) + function_data <- get_function_names(scala_file_path) parsed <- lapply(function_data, parser) @@ -223,9 +228,9 @@ main <- function(scala_file_path){ # supplementary files sparkr_supplementary_files <- c("sparklyr-mosaic/enableMosaic.R", "sparklyr-mosaic/sparkFunctions.R") copy_supplementary_file(sparkr_supplementary_files, "sparklyr-mosaic/sparklyrMosaic/R/") - } + args <- commandArgs(trailingOnly = T) if (length(args) != 1){ stop("Please provide the MosaicContext.scala file path to generate_sparkr_functions.R") diff --git a/R/generate_docs.R b/R/generate_docs.R index 06b23e6fa..4b5fe19b3 100644 --- a/R/generate_docs.R +++ b/R/generate_docs.R @@ -1,6 +1,4 @@ -spark_location <- "/usr/spark-download/unzipped/spark-3.2.1-bin-hadoop2.7" -Sys.setenv(SPARK_HOME = spark_location) - +spark_location <- Sys.getenv("SPARK_HOME") library(SparkR, lib.loc = c(file.path(spark_location, "R", "lib"))) library(roxygen2) diff --git 
a/R/install_deps.R b/R/install_deps.R index d05207329..10a35e7d6 100644 --- a/R/install_deps.R +++ b/R/install_deps.R @@ -1,5 +1,3 @@ options(repos = c(CRAN = "https://packagemanager.posit.co/cran/__linux__/focal/latest")) -install.packages("pkgbuild") -install.packages("roxygen2") -install.packages("sparklyr") \ No newline at end of file +install.packages(c("pkgbuild", "testthat", "roxygen2", "sparklyr")) \ No newline at end of file diff --git a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data.R b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data.R new file mode 100644 index 000000000..1ad46c114 --- /dev/null +++ b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/data.R @@ -0,0 +1,228 @@ +inputGJ = '{ + "type":"Feature", + "properties":{ + "shape_area":"0.0000607235737749", + "objectid":"24", + "shape_leng":"0.0469999619287", + "location_id":"24", + "zone":"Bloomingdale", + "borough":"Manhattan" + }, + "geometry":{ + "type":"MultiPolygon", + "coordinates":[ + [ + [ + [ + -73.95953658899997, + 40.7987185259999 + ], + [ + -73.96004456499999, + 40.79804123499991 + ], + [ + -73.96147779999993, + 40.79865415599994 + ], + [ + -73.96286980099991, + 40.799239676999946 + ], + [ + -73.96571144299992, + 40.80043806999988 + ], + [ + -73.96775900399992, + 40.80130351599994 + ], + [ + -73.96787379699998, + 40.80135169799993 + ], + [ + -73.96798415999996, + 40.80139826599985 + ], + [ + -73.96858360799983, + 40.80163546999991 + ], + [ + -73.97004742199995, + 40.80226500999991 + ], + [ + -73.97021594799989, + 40.80233585099994 + ], + [ + -73.97027400499987, + 40.802359035999885 + ], + [ + -73.97032589799987, + 40.80238456099995 + ], + [ + -73.97150381000002, + 40.80283773599996 + ], + [ + -73.97250022199995, + 40.80321661299997 + ], + [ + -73.97257779799996, + 40.803247183999915 + ], + [ + -73.9726574479999, + 40.803276513999926 + ], + [ + -73.97279907800002, + 40.803329158999894 + ], + [ + -73.97287179090726, + 40.803356187573904 + ], + [ + -73.97255429829633, + 40.80379863465013 + ], 
+ [ + -73.97217237241792, + 40.80431907695909 + ], + [ + -73.97205095521862, + 40.80453037600999 + ], + [ + -73.9719529243853, + 40.804619709051586 + ], + [ + -73.97181849483539, + 40.80471754320449 + ], + [ + -73.97179980150743, + 40.80478561548601 + ], + [ + -73.97171200727101, + 40.80493169127652 + ], + [ + -73.9716204956932, + 40.80504229439937 + ], + [ + -73.97161113391445, + 40.80509618747434 + ], + [ + -73.97144210961531, + 40.80533442683979 + ], + [ + -73.9712908330033, + 40.805528707533185 + ], + [ + -73.97110765876137, + 40.805790139589654 + ], + [ + -73.97098767999995, + 40.805742902999974 + ], + [ + -73.97084148399995, + 40.80568534399987 + ], + [ + -73.97076362299994, + 40.80565737299987 + ], + [ + -73.97069326499995, + 40.80563026799993 + ], + [ + -73.96978722000001, + 40.80530049699997 + ], + [ + -73.96860707399988, + 40.804870949999895 + ], + [ + -73.96855913399985, + 40.80485358399993 + ], + [ + -73.96849506699984, + 40.80483226999993 + ], + [ + -73.96804849299988, + 40.804683699999906 + ], + [ + -73.96680327299997, + 40.804207560999906 + ], + [ + -73.96670106899995, + 40.80416847599993 + ], + [ + -73.96659731599986, + 40.804122764999946 + ], + [ + -73.96386126299988, + 40.80297203799993 + ], + [ + -73.96107793999975, + 40.801800357999866 + ], + [ + -73.95964685399987, + 40.80115642299993 + ], + [ + -73.95848111500001, + 40.800670477999894 + ], + [ + -73.95817297099987, + 40.800582540999876 + ], + [ + -73.95833304999988, + 40.80036505399989 + ], + [ + -73.95861879299989, + 40.79997702599996 + ], + [ + -73.95907669099992, + 40.79935223299986 + ], + [ + -73.95953658899997, + 40.7987185259999 + ] + ] + ] + ] + } +}' \ No newline at end of file diff --git a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testVectorFunctions.R b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testVectorFunctions.R new file mode 100644 index 000000000..ad7032134 --- /dev/null +++ b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testVectorFunctions.R @@ -0,0 +1,96 @@ 
+source("data.R") + +test_that("scalar vector functions behave as intended", { + sdf <- SparkR::createDataFrame( + data.frame( + wkt = "POLYGON ((0 0, 0 2, 1 2, 1 0, 0 0))", + point_wkt = "POINT (1 1)" + ) + ) + + sdf <- withColumn(sdf, "st_area", st_area(column("wkt"))) + sdf <- withColumn(sdf, "st_length", st_length(column("wkt"))) + sdf <- withColumn(sdf, "st_perimeter", st_perimeter(column("wkt"))) + sdf <- withColumn(sdf, "st_convexhull", st_convexhull(column("wkt"))) + sdf <- withColumn(sdf, "st_dump", st_dump(column("wkt"))) + sdf <- withColumn(sdf, "st_translate", st_translate(column("wkt"), lit(1), lit(1))) + sdf <- withColumn(sdf, "st_scale", st_scale(column("wkt"), lit(1), lit(1))) + sdf <- withColumn(sdf, "st_rotate", st_rotate(column("wkt"), lit(1))) + sdf <- withColumn(sdf, "st_centroid", st_centroid(column("wkt"))) + sdf <- withColumn(sdf, "st_length", st_length(column("wkt"))) + sdf <- withColumn(sdf, "st_isvalid", st_isvalid(column("wkt"))) + sdf <- withColumn(sdf, "st_intersects", st_intersects(column("wkt"), column("wkt"))) + sdf <- withColumn(sdf, "st_intersection", st_intersection(column("wkt"), column("wkt"))) + sdf <- withColumn(sdf, "st_geometrytype", st_geometrytype(column("wkt"))) + sdf <- withColumn(sdf, "st_isvalid", st_isvalid(column("wkt"))) + sdf <- withColumn(sdf, "st_xmin", st_xmin(column("wkt"))) + sdf <- withColumn(sdf, "st_xmax", st_xmax(column("wkt"))) + sdf <- withColumn(sdf, "st_ymin", st_ymin(column("wkt"))) + sdf <- withColumn(sdf, "st_ymax", st_ymax(column("wkt"))) + sdf <- withColumn(sdf, "st_zmin", st_zmin(column("wkt"))) + sdf <- withColumn(sdf, "st_zmax", st_zmax(column("wkt"))) + sdf <- withColumn(sdf, "flatten_polygons", flatten_polygons(column("wkt"))) + + # SRID + sdf <- withColumn(sdf, "geom_with_srid", st_setsrid(st_geomfromwkt(column("wkt")), lit(4326L))) + sdf <- withColumn(sdf, "srid_check", st_srid(column("geom_with_srid"))) + sdf <- withColumn(sdf, "transformed_geom", st_transform(column("geom_with_srid"), 
lit(3857L))) + + # Grid functions + sdf <- withColumn(sdf, "grid_longlatascellid", grid_longlatascellid(lit(1), lit(1), lit(1L))) + sdf <- withColumn(sdf, "grid_pointascellid", grid_pointascellid(column("point_wkt"), lit(1L))) + sdf <- withColumn(sdf, "grid_boundaryaswkb", grid_boundaryaswkb(column("grid_pointascellid"))) + sdf <- withColumn(sdf, "grid_polyfill", grid_polyfill(column("wkt"), lit(1L))) + sdf <- withColumn(sdf, "grid_tessellateexplode", grid_tessellateexplode(column("wkt"), lit(1L))) + sdf <- withColumn(sdf, "grid_tessellate", grid_tessellate(column("wkt"), lit(1L))) + + # Deprecated + sdf <- withColumn(sdf, "point_index_lonlat", point_index_lonlat(lit(1), lit(1), lit(1L))) + sdf <- withColumn(sdf, "point_index_geom", point_index_geom(column("point_wkt"), lit(1L))) + sdf <- withColumn(sdf, "index_geometry", index_geometry(column("point_index_geom"))) + sdf <- withColumn(sdf, "polyfill", polyfill(column("wkt"), lit(1L))) + sdf <- withColumn(sdf, "mosaic_explode", mosaic_explode(column("wkt"), lit(1L))) + sdf <- withColumn(sdf, "mosaicfill", mosaicfill(column("wkt"), lit(1L))) + + expect_no_error(SparkR::write.df(sdf, source = "noop", mode = "overwrite")) + expect_equal(nrow(sdf), 1) + +}) + +test_that("aggregate vector functions behave as intended", { + + sdf <- SparkR::sql("SELECT id as location_id FROM range(1)") + sdf <- withColumn(sdf, "geometry", st_geomfromgeojson(lit(inputGJ))) + expect_equal(nrow(sdf), 1) + + sdf.l <- select(sdf, alias(column("location_id"), "left_id"), alias(column("geometry"), "left_geom")) + sdf.l <- withColumn(sdf.l, "left_index", mosaic_explode(column("left_geom"), lit(11L))) + + sdf.r <- select(sdf, alias(column("location_id"), "right_id"), alias(column("geometry"), "right_geom")) + sdf.r <- withColumn(sdf.r, "right_geom", st_translate( + column("right_geom"), + st_area(column("right_geom")) * runif(1) * 0.1, + st_area(column("right_geom")) * runif(1) * 0.1 + ) + ) + sdf.r <- withColumn(sdf.r, "right_index", 
mosaic_explode(column("right_geom"), lit(11L))) + + sdf.intersection <- join(sdf.l, sdf.r, sdf.l$left_index == sdf.r$right_index, "inner") + sdf.intersection <- summarize( + groupBy(sdf.intersection, sdf.intersection$left_id, sdf.intersection$right_id), + agg_intersects = st_intersects_aggregate(column("left_index"), column("right_index")), + agg_intersection = st_intersection_aggregate(column("left_index"), column("right_index")), + left_geom = first(column("left_geom")), + right_geom = first(column("right_geom")) + ) + sdf.intersection <- withColumn(sdf.intersection, "flat_intersects", st_intersects(column("left_geom"), column("right_geom"))) + sdf.intersection <- withColumn(sdf.intersection, "comparison_intersects", column("agg_intersects") == column("flat_intersects")) + sdf.intersection <- withColumn(sdf.intersection, "agg_area", st_area(column("agg_intersection"))) + sdf.intersection <- withColumn(sdf.intersection, "flat_intersection", st_intersection(column("left_geom"), column("right_geom"))) + sdf.intersection <- withColumn(sdf.intersection, "flat_area", st_area(column("flat_intersection"))) + sdf.intersection <- withColumn(sdf.intersection, "comparison_intersection", abs(column("agg_area") - column("flat_area")) <= lit(1e-3)) + + expect_true(first(sdf.intersection)$comparison_intersects) + expect_true(first(sdf.intersection)$comparison_intersection) + +}) \ No newline at end of file diff --git a/R/sparkR-mosaic/tests.R b/R/sparkR-mosaic/tests.R index f4071c3d2..aba14c82b 100644 --- a/R/sparkR-mosaic/tests.R +++ b/R/sparkR-mosaic/tests.R @@ -1,4 +1,6 @@ -spark_location <- "/usr/spark-download/unzipped/spark-3.2.1-bin-hadoop2.7" +library(testthat) + +spark_location <- "/usr/spark-download/unzipped/spark-3.3.2-bin-hadoop3" Sys.setenv(SPARK_HOME = spark_location) library(SparkR, lib.loc = c(file.path(spark_location, "R", "lib"))) .libPaths(c(file.path(spark_location, "R", "lib"), .libPaths())) @@ -11,74 +13,18 @@ install.packages(package_file, repos=NULL) 
library(sparkrMosaic) # find the mosaic jar in staging -staging_dir = "/home/runner/work/mosaic/mosaic/staging/" +staging_dir <- "/home/runner/work/mosaic/mosaic/staging/" mosaic_jar <- list.files(staging_dir) mosaic_jar <- mosaic_jar[grep("jar-with-dependencies.jar", mosaic_jar, fixed=T)] print("Looking for mosaic jar in") -mosaic_jar_path = paste0(staging_dir, mosaic_jar) +mosaic_jar_path <- paste0(staging_dir, mosaic_jar) print(mosaic_jar_path) -spark = sparkR.session( +spark <- sparkR.session( master = "local[*]" ,sparkJars = mosaic_jar_path ) enableMosaic() -sdf <- SparkR::createDataFrame( - data.frame( - wkt = "POLYGON ((0 0, 0 2, 1 2, 1 0, 0 0))", - point_wkt = "POINT (1 1)" - ) -) - -sdf <- withColumn(sdf, "st_area", st_area(column("wkt"))) -sdf <- withColumn(sdf, "st_length", st_length(column("wkt"))) -sdf <- withColumn(sdf, "st_perimeter", st_perimeter(column("wkt"))) -sdf <- withColumn(sdf, "st_convexhull", st_convexhull(column("wkt"))) -sdf <- withColumn(sdf, "st_dump", st_dump(column("wkt"))) -sdf <- withColumn(sdf, "st_translate", st_translate(column("wkt"), lit(1), lit(1))) -sdf <- withColumn(sdf, "st_scale", st_scale(column("wkt"), lit(1), lit(1))) -sdf <- withColumn(sdf, "st_rotate", st_rotate(column("wkt"), lit(1))) -sdf <- withColumn(sdf, "st_centroid", st_centroid(column("wkt"))) -#sdf <- withColumn(sdf, "st_centroid3D", st_centroid3D(column("wkt"))) -sdf <- withColumn(sdf, "st_length", st_length(column("wkt"))) -sdf <- withColumn(sdf, "st_isvalid", st_isvalid(column("wkt"))) -sdf <- withColumn(sdf, "st_intersects", st_intersects(column("wkt"), column("wkt"))) -#sdf <- withColumn(sdf, "st_intersection", st_intersection(column("wkt"), column("wkt"))) -sdf <- withColumn(sdf, "st_geometrytype", st_geometrytype(column("wkt"))) -sdf <- withColumn(sdf, "st_isvalid", st_isvalid(column("wkt"))) -sdf <- withColumn(sdf, "st_xmin", st_xmin(column("wkt"))) -sdf <- withColumn(sdf, "st_xmax", st_xmax(column("wkt"))) -sdf <- withColumn(sdf, "st_ymin", 
st_ymin(column("wkt"))) -sdf <- withColumn(sdf, "st_ymax", st_ymax(column("wkt"))) -sdf <- withColumn(sdf, "st_zmin", st_zmin(column("wkt"))) -sdf <- withColumn(sdf, "st_zmax", st_zmax(column("wkt"))) -sdf <- withColumn(sdf, "flatten_polygons", flatten_polygons(column("wkt"))) - -# SRID -sdf <- withColumn(sdf, "geom_with_srid", st_setsrid(st_geomfromwkt(column("wkt")), lit(4326L))) -sdf <- withColumn(sdf, "srid_check", st_srid(column("geom_with_srid"))) -sdf <- withColumn(sdf, "transformed_geom", st_transform(column("geom_with_srid"), lit(3857L))) - -# Grid functions -sdf <- withColumn(sdf, "grid_longlatascellid", grid_longlatascellid(lit(1), lit(1), lit(1L))) -sdf <- withColumn(sdf, "grid_pointascellid", grid_pointascellid(column("point_wkt"), lit(1L))) -sdf <- withColumn(sdf, "grid_boundaryaswkb", grid_boundaryaswkb(column("grid_pointascellid"))) -sdf <- withColumn(sdf, "grid_polyfill", grid_polyfill(column("wkt"), lit(1L))) -sdf <- withColumn(sdf, "grid_tessellateexplode", grid_tessellateexplode(column("wkt"), lit(1L))) -sdf <- withColumn(sdf, "grid_tessellate", grid_tessellate(column("wkt"), lit(1L))) - -# Deprecated -sdf <- withColumn(sdf, "point_index_lonlat", point_index_lonlat(lit(1), lit(1), lit(1L))) -sdf <- withColumn(sdf, "point_index_geom", point_index_geom(column("point_wkt"), lit(1L))) -sdf <- withColumn(sdf, "index_geometry", index_geometry(column("point_index_geom"))) -sdf <- withColumn(sdf, "polyfill", polyfill(column("wkt"), lit(1L))) -sdf <- withColumn(sdf, "mosaic_explode", mosaic_explode(column("wkt"), lit(1L))) -sdf <- withColumn(sdf, "mosaicfill", mosaicfill(column("wkt"), lit(1L))) - -if (nrow(SparkR::collect(sdf)) == 1.0){ - q(save="no", status=0) -} else q(save="no", status=1) - - +testthat::test_local(path="./sparkrMosaic") \ No newline at end of file diff --git a/R/sparklyr-mosaic/enableMosaic.R b/R/sparklyr-mosaic/enableMosaic.R index c0baedecd..3deb15194 100644 --- a/R/sparklyr-mosaic/enableMosaic.R +++ 
b/R/sparklyr-mosaic/enableMosaic.R @@ -19,14 +19,11 @@ enableMosaic <- function( sc ,geometryAPI="JTS" ,indexSystem="H3" - ,rasterAPI="GDAL" ){ geometry_api <- sparklyr::invoke_static(sc, class="com.databricks.labs.mosaic.core.geometry.api.GeometryAPI", method="apply", geometryAPI) - index_system_id <- sparklyr::invoke_static(sc, class="com.databricks.labs.mosaic.core.index.IndexSystemID", method="apply", indexSystem) - raster_api <- sparklyr::invoke_static(sc, class="com.databricks.labs.mosaic.core.raster.api.RasterAPI", method="apply", rasterAPI) - indexing_system <- sparklyr::invoke_static(sc, class="com.databricks.labs.mosaic.core.index.IndexSystemID", method="getIndexSystem", index_system_id) - mosaic_context <- sparklyr::invoke_new(sc, class="com.databricks.labs.mosaic.functions.MosaicContext", indexing_system, geometry_api, raster_api) + indexing_system <- sparklyr::invoke_static(sc, class="com.databricks.labs.mosaic.core.index.IndexSystemFactory", method="getIndexSystem", indexSystem) + mosaic_context <- sparklyr::invoke_new(sc, class="com.databricks.labs.mosaic.functions.MosaicContext", indexing_system, geometry_api) functions <<- sparklyr::invoke(mosaic_context, "functions") sparklyr::invoke(mosaic_context, "register") diff --git a/R/sparklyr-mosaic/sparklyr-mosaic-tests.R b/R/sparklyr-mosaic/sparklyr-mosaic-tests.R deleted file mode 100644 index b77559f45..000000000 --- a/R/sparklyr-mosaic/sparklyr-mosaic-tests.R +++ /dev/null @@ -1,44 +0,0 @@ -sdf <- sparklyr::sdf_copy_to(sc, - data.frame( - wkt = "POLYGON ((0 0, 0 2, 1 2, 1 0, 0 0))", - point_wkt = "POINT (1 1)" - ) -) - -sdf <- mutate(sdf, "st_area" = st_area(wkt)) -sdf <- mutate(sdf, "st_length" = st_length(wkt)) -sdf <- mutate(sdf, "st_perimeter" = st_perimeter(wkt)) -sdf <- mutate(sdf, "st_convexhull" = st_convexhull(wkt)) -sdf <- mutate(sdf, "st_dump" = st_dump(wkt)) -sdf <- mutate(sdf, "st_translate" = st_translate(wkt, 1L, 1L)) -sdf <- mutate(sdf, "st_scale" = st_scale(wkt, 1L, 1L)) -sdf <- 
mutate(sdf, "st_rotate" = st_rotate(wkt, 1L)) -sdf <- mutate(sdf, "st_centroid2D" = st_centroid2D(wkt)) -sdf <- mutate(sdf, "st_centroid3D" = st_centroid3D(wkt)) -sdf <- mutate(sdf, "st_length" = st_length(wkt)) -sdf <- mutate(sdf, "st_isvalid" = st_isvalid(wkt)) -sdf <- mutate(sdf, "st_intersects" = st_intersects(wkt, wkt)) -sdf <- mutate(sdf, "st_intersection" = st_intersection(wkt, wkt)) -sdf <- mutate(sdf, "st_geometrytype" = st_geometrytype(wkt)) -sdf <- mutate(sdf, "st_isvalid" = st_isvalid(wkt)) -sdf <- mutate(sdf, "st_xmin" = st_xmin(wkt)) -sdf <- mutate(sdf, "st_xmax" = st_xmax(wkt)) -sdf <- mutate(sdf, "st_ymin" = st_ymin(wkt)) -sdf <- mutate(sdf, "st_ymax" = st_ymax(wkt)) -sdf <- mutate(sdf, "st_zmin" = st_zmin(wkt)) -sdf <- mutate(sdf, "st_zmax" = st_zmax(wkt)) -sdf <- mutate(sdf, "flatten_polygons" = flatten_polygons(wkt)) -sdf <- mutate(sdf, "point_index_lonlat" = point_index_lonlat(as.double(1L), as.double(1L), 1L)) # issue with type. needs a double but native R type not converting properly so requires explicit setting -sdf <- mutate(sdf, "point_index_geom" = point_index_geom(point_wkt, as.integer(1L))) -sdf <- mutate(sdf, "index_geometry" = index_geometry(point_index_geom)) -sdf <- mutate(sdf, "polyfill" = polyfill(wkt, as.integer(1L))) # requires a long and passing a double or an integer causes a cast exception - can't convert to Long. 
-sdf <- mutate(sdf, "mosaic_explode" = mosaic_explode(wkt, as.integer(1L))) -sdf <- mutate(sdf, "mosaicfill" = mosaicfill(wkt, 1L)) -sdf <- mutate(sdf, "geom_with_srid" = st_setsrid(st_geomfromwkt(wkt), 4326L)) -sdf <- mutate(sdf, "srid_check" = st_srid(geom_with_srid)) -sdf <- mutate(sdf, "transformed_geom" = st_transform(geom_with_srid, 3857L)) - - - - -sdf %>% sparklyr::collect() %>% as.data.frame() \ No newline at end of file diff --git a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data.R b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data.R new file mode 100644 index 000000000..1ad46c114 --- /dev/null +++ b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/data.R @@ -0,0 +1,228 @@ +inputGJ = '{ + "type":"Feature", + "properties":{ + "shape_area":"0.0000607235737749", + "objectid":"24", + "shape_leng":"0.0469999619287", + "location_id":"24", + "zone":"Bloomingdale", + "borough":"Manhattan" + }, + "geometry":{ + "type":"MultiPolygon", + "coordinates":[ + [ + [ + [ + -73.95953658899997, + 40.7987185259999 + ], + [ + -73.96004456499999, + 40.79804123499991 + ], + [ + -73.96147779999993, + 40.79865415599994 + ], + [ + -73.96286980099991, + 40.799239676999946 + ], + [ + -73.96571144299992, + 40.80043806999988 + ], + [ + -73.96775900399992, + 40.80130351599994 + ], + [ + -73.96787379699998, + 40.80135169799993 + ], + [ + -73.96798415999996, + 40.80139826599985 + ], + [ + -73.96858360799983, + 40.80163546999991 + ], + [ + -73.97004742199995, + 40.80226500999991 + ], + [ + -73.97021594799989, + 40.80233585099994 + ], + [ + -73.97027400499987, + 40.802359035999885 + ], + [ + -73.97032589799987, + 40.80238456099995 + ], + [ + -73.97150381000002, + 40.80283773599996 + ], + [ + -73.97250022199995, + 40.80321661299997 + ], + [ + -73.97257779799996, + 40.803247183999915 + ], + [ + -73.9726574479999, + 40.803276513999926 + ], + [ + -73.97279907800002, + 40.803329158999894 + ], + [ + -73.97287179090726, + 40.803356187573904 + ], + [ + -73.97255429829633, + 
40.80379863465013 + ], + [ + -73.97217237241792, + 40.80431907695909 + ], + [ + -73.97205095521862, + 40.80453037600999 + ], + [ + -73.9719529243853, + 40.804619709051586 + ], + [ + -73.97181849483539, + 40.80471754320449 + ], + [ + -73.97179980150743, + 40.80478561548601 + ], + [ + -73.97171200727101, + 40.80493169127652 + ], + [ + -73.9716204956932, + 40.80504229439937 + ], + [ + -73.97161113391445, + 40.80509618747434 + ], + [ + -73.97144210961531, + 40.80533442683979 + ], + [ + -73.9712908330033, + 40.805528707533185 + ], + [ + -73.97110765876137, + 40.805790139589654 + ], + [ + -73.97098767999995, + 40.805742902999974 + ], + [ + -73.97084148399995, + 40.80568534399987 + ], + [ + -73.97076362299994, + 40.80565737299987 + ], + [ + -73.97069326499995, + 40.80563026799993 + ], + [ + -73.96978722000001, + 40.80530049699997 + ], + [ + -73.96860707399988, + 40.804870949999895 + ], + [ + -73.96855913399985, + 40.80485358399993 + ], + [ + -73.96849506699984, + 40.80483226999993 + ], + [ + -73.96804849299988, + 40.804683699999906 + ], + [ + -73.96680327299997, + 40.804207560999906 + ], + [ + -73.96670106899995, + 40.80416847599993 + ], + [ + -73.96659731599986, + 40.804122764999946 + ], + [ + -73.96386126299988, + 40.80297203799993 + ], + [ + -73.96107793999975, + 40.801800357999866 + ], + [ + -73.95964685399987, + 40.80115642299993 + ], + [ + -73.95848111500001, + 40.800670477999894 + ], + [ + -73.95817297099987, + 40.800582540999876 + ], + [ + -73.95833304999988, + 40.80036505399989 + ], + [ + -73.95861879299989, + 40.79997702599996 + ], + [ + -73.95907669099992, + 40.79935223299986 + ], + [ + -73.95953658899997, + 40.7987185259999 + ] + ] + ] + ] + } +}' \ No newline at end of file diff --git a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testVectorFunctions.R b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testVectorFunctions.R new file mode 100644 index 000000000..f91a0ff8b --- /dev/null +++ 
b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testVectorFunctions.R @@ -0,0 +1,114 @@ +source("data.R") + +test_that("scalar vector functions behave as intended", { + + sdf <- sdf_copy_to( + sc, + data.frame( + wkt = "POLYGON ((0 0, 0 2, 1 2, 1 0, 0 0))", + point_wkt = "POINT (1 1)" + ) + ) + + sdf <- mutate(sdf, "st_area" = st_area(wkt)) + sdf <- mutate(sdf, "st_length" = st_length(wkt)) + sdf <- mutate(sdf, "st_perimeter" = st_perimeter(wkt)) + sdf <- mutate(sdf, "st_buffer" = st_buffer(wkt, as.double(1.1))) + sdf <- mutate(sdf, "st_bufferloop" = st_bufferloop(wkt, as.double(1.1), as.double(1.2))) + sdf <- mutate(sdf, "st_convexhull" = st_convexhull(wkt)) + sdf <- mutate(sdf, "st_dump" = st_dump(wkt)) + sdf <- mutate(sdf, "st_translate" = st_translate(wkt, 1L, 1L)) + sdf <- mutate(sdf, "st_scale" = st_scale(wkt, 1L, 1L)) + sdf <- mutate(sdf, "st_rotate" = st_rotate(wkt, 1L)) + sdf <- mutate(sdf, "st_centroid" = st_centroid(wkt)) + sdf <- mutate(sdf, "st_numpoints" = st_numpoints(wkt)) + sdf <- mutate(sdf, "st_haversine" = st_haversine(as.double(0.0), as.double(90.0), as.double(0.0), as.double(0.0))) + sdf <- mutate(sdf, "st_isvalid" = st_isvalid(wkt)) + sdf <- mutate(sdf, "st_hasvalidcoordinates" = st_hasvalidcoordinates(wkt, "EPSG:2192", "bounds")) + sdf <- mutate(sdf, "st_intersects" = st_intersects(wkt, wkt)) + sdf <- mutate(sdf, "st_intersection" = st_intersection(wkt, wkt)) + sdf <- mutate(sdf, "st_envelope" = st_envelope(wkt)) + sdf <- mutate(sdf, "st_simplify" = st_simplify(wkt, as.double(0.001))) + sdf <- mutate(sdf, "st_difference" = st_difference(wkt, wkt)) + sdf <- mutate(sdf, "st_union" = st_union(wkt, wkt)) + sdf <- mutate(sdf, "st_unaryunion" = st_unaryunion(wkt)) + sdf <- mutate(sdf, "st_geometrytype" = st_geometrytype(wkt)) + sdf <- mutate(sdf, "st_xmin" = st_xmin(wkt)) + sdf <- mutate(sdf, "st_xmax" = st_xmax(wkt)) + sdf <- mutate(sdf, "st_ymin" = st_ymin(wkt)) + sdf <- mutate(sdf, "st_ymax" = st_ymax(wkt)) + sdf <- mutate(sdf, "st_zmin" = 
st_zmin(wkt)) + sdf <- mutate(sdf, "st_zmax" = st_zmax(wkt)) + sdf <- mutate(sdf, "flatten_polygons" = flatten_polygons(wkt)) + + # SRID functions + + sdf <- mutate(sdf, "geom_with_srid" = st_setsrid(st_geomfromwkt(wkt), 4326L)) + sdf <- mutate(sdf, "srid_check" = st_srid(geom_with_srid)) + sdf <- mutate(sdf, "transformed_geom" = st_transform(geom_with_srid, 3857L)) + + # Grid functions + + sdf <- mutate(sdf, "grid_longlatascellid" = grid_longlatascellid(as.double(1L), as.double(1L), 1L)) + sdf <- mutate(sdf, "grid_pointascellid" = grid_pointascellid(point_wkt, 1L)) + sdf <- mutate(sdf, "grid_boundaryaswkb" = grid_boundaryaswkb(grid_longlatascellid)) + sdf <- mutate(sdf, "grid_polyfill" = grid_polyfill(wkt, 1L)) + sdf <- mutate(sdf, "grid_tessellateexplode" = grid_tessellateexplode(wkt, 1L)) + sdf <- mutate(sdf, "grid_tessellateexplode_no_core_chips" = grid_tessellateexplode(wkt, 1L, FALSE)) + sdf <- mutate(sdf, "grid_tessellate" = grid_tessellate(wkt, 1L)) + sdf <- mutate(sdf, "grid_cellarea" = grid_cellarea(grid_longlatascellid)) + + expect_no_error(spark_write_source(sdf, "noop", mode = "overwrite")) + expect_equal(sdf_nrow(sdf), 1) + +}) + +test_that("aggregate vector functions behave as intended", { + + sdf <- sdf_sql(sc, "SELECT id as location_id FROM range(1)") %>% + mutate(geometry = st_geomfromgeojson(inputGJ)) + expect_equal(sdf_nrow(sdf), 1) + + sdf.l <- sdf %>% + select( + left_id = location_id, + left_geom = geometry + ) %>% + mutate(left_index = mosaic_explode(left_geom, 11L)) + + sdf.r <- sdf %>% + select( + right_id = location_id, + right_geom = geometry + ) %>% + mutate(right_geom = st_translate( + right_geom, + st_area(right_geom) * runif(n()) * 0.1, + st_area(right_geom) * runif(n()) * 0.1) + ) %>% + mutate(right_index = mosaic_explode(right_geom, 11L)) + + sdf.intersection <- sdf.l %>% + inner_join(sdf.r, by = c("left_index" = "right_index"), keep = TRUE) %>% + dplyr::group_by(left_id, right_id) %>% + dplyr::summarise( + agg_intersects = 
st_intersects_aggregate(left_index, right_index), + agg_intersection = st_intersection_aggregate(left_index, right_index), + left_geom = max(left_geom, 1), + right_geom = max(right_geom, 1) + ) %>% + mutate( + flat_intersects = st_intersects(left_geom, right_geom), + comparison_intersects = agg_intersects == flat_intersects, + agg_area = st_area(agg_intersection), + flat_intersection = st_intersection(left_geom, right_geom), + flat_area = st_area(flat_intersection), + comparison_intersection = abs(agg_area - flat_area) <= 1e-3 + ) + + expect_no_error(spark_write_source(sdf.intersection, "noop", mode = "overwrite")) + expect_true(sdf.intersection %>% head(1) %>% sdf_collect %>% .$comparison_intersects) + expect_true(sdf.intersection %>% head(1) %>% sdf_collect %>% .$comparison_intersection) + + +}) \ No newline at end of file diff --git a/R/sparklyr-mosaic/tests.R b/R/sparklyr-mosaic/tests.R new file mode 100644 index 000000000..cb4f533a7 --- /dev/null +++ b/R/sparklyr-mosaic/tests.R @@ -0,0 +1,29 @@ +library(testthat) + +if(length(getOption("repos")) < 1) { + options(repos = c( + CRAN = "https://cloud.r-project.org" + )) +} + +install.packages("sparklyr", repos="") +library(sparklyr) + +spark_home_set("/usr/spark-download/unzipped/spark-3.3.2-bin-hadoop3") +install.packages("sparklyrMosaic_0.3.12.tar.gz", repos = NULL) +library(sparklyrMosaic) + +# find the mosaic jar in staging +staging_dir <- "/home/runner/work/mosaic/mosaic/staging/" +mosaic_jar <- list.files(staging_dir) +mosaic_jar <- mosaic_jar[grep("jar-with-dependencies.jar", mosaic_jar, fixed=T)] +mosaic_jar_path <- paste0(staging_dir, mosaic_jar) +print(paste("Looking for mosaic jar in", mosaic_jar_path)) + +config <- sparklyr::spark_config() +config$`sparklyr.jars.default` <- c(mosaic_jar_path) + +sc <- spark_connect(master="local[*]", config=config) +enableMosaic(sc) + +testthat::test_local(path="./sparklyrMosaic") \ No newline at end of file diff --git a/docs/source/api/raster-functions.rst 
b/docs/source/api/raster-functions.rst index 2cf0e90fa..19a8cc42c 100644 --- a/docs/source/api/raster-functions.rst +++ b/docs/source/api/raster-functions.rst @@ -58,7 +58,7 @@ rst_bandmetadata val df = spark.read .format("gdal").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_bandmetadata(col("tile"), lit(1)).limit(1).show(false) + df.select(rst_bandmetadata(col("tile"), lit(1))).limit(1).show(false) +--------------------------------------------------------------------------------------+ | rst_bandmetadata(tile, 1) | +--------------------------------------------------------------------------------------+ @@ -246,18 +246,18 @@ rst_combineavg .. code-tab:: sql - CREATE TABLE IF NOT EXISTS TABLE coral_netcdf - USING gdal - OPTIONS (extension "nc", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - WITH grouped as ( - SELECT collect_list(tile) as tile FROM coral_netcdf - ) - SELECT rst_combineavg(tile) FROM grouped LIMIT 1 - +----------------------------------------------------------------------------------------------------------------+ - | rst_combineavg(tile) | - +----------------------------------------------------------------------------------------------------------------+ - | {index_id: 593308294097928191, raster: [00 01 10 ... 
00], parentPath: "dbfs:/path_to_file", driver: "NetCDF" } | - +----------------------------------------------------------------------------------------------------------------+ + CREATE TABLE IF NOT EXISTS TABLE coral_netcdf + USING gdal + OPTIONS (extension "nc", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") + WITH grouped as ( + SELECT collect_list(tile) as tile FROM coral_netcdf + ) + SELECT rst_combineavg(tile) FROM grouped LIMIT 1 + +----------------------------------------------------------------------------------------------------------------+ + | rst_combineavg(tile) | + +----------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "NetCDF" } | + +----------------------------------------------------------------------------------------------------------------+ rst_combineavgagg ***************** @@ -305,17 +305,17 @@ rst_combineavgagg .. code-tab:: sql - CREATE TABLE IF NOT EXISTS TABLE coral_netcdf - USING gdal - OPTIONS (extension "nc", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - SELECT rst_combineavgagg(tile) - FROM coral_netcdf - GROUP BY 1 - +----------------------------------------------------------------------------------------------------------------+ - | rst_combineavgagg(tile) | - +----------------------------------------------------------------------------------------------------------------+ - | {index_id: 593308294097928191, raster: [00 01 10 ... 
00], parentPath: "dbfs:/path_to_file", driver: "NetCDF" } | - +----------------------------------------------------------------------------------------------------------------+ + CREATE TABLE IF NOT EXISTS TABLE coral_netcdf + USING gdal + OPTIONS (extension "nc", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") + SELECT rst_combineavgagg(tile) + FROM coral_netcdf + GROUP BY 1 + +----------------------------------------------------------------------------------------------------------------+ + | rst_combineavgagg(tile) | + +----------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "NetCDF" } | + +----------------------------------------------------------------------------------------------------------------+ rst_frombands ************** @@ -365,18 +365,18 @@ rst_frombands .. code-tab:: sql - CREATE TABLE IF NOT EXISTS TABLE coral_netcdf - USING gdal - OPTIONS (extension "nc", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - WITH grouped as ( - SELECT collect_list(tile) as tile FROM coral_netcdf - ) - SELECT rst_frombands(tile) FROM grouped LIMIT 1 - +----------------------------------------------------------------------------------------------------------------+ - | rst_frombands(tile) | - +----------------------------------------------------------------------------------------------------------------+ - | {index_id: 593308294097928191, raster: [00 01 10 ... 
00], parentPath: "dbfs:/path_to_file", driver: "NetCDF" } | - +----------------------------------------------------------------------------------------------------------------+ + CREATE TABLE IF NOT EXISTS TABLE coral_netcdf + USING gdal + OPTIONS (extension "nc", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") + WITH grouped as ( + SELECT collect_list(tile) as tile FROM coral_netcdf + ) + SELECT rst_frombands(tile) FROM grouped LIMIT 1 + +----------------------------------------------------------------------------------------------------------------+ + | rst_frombands(tile) | + +----------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "NetCDF" } | + +----------------------------------------------------------------------------------------------------------------+ rst_fromfile ************ @@ -425,13 +425,13 @@ rst_fromfile .. 
code-tab:: sql - CREATE TABLE IF NOT EXISTS TABLE coral_netcdf - USING binaryFile - OPTIONS (path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - SELECT rst_fromfile(path) FROM coral_netcdf LIMIT 1 - +----------------------------------------------------------------------------------------------------------------+ - | rst_fromfile(path) | - +----------------------------------------------------------------------------------------------------------------+ + CREATE TABLE IF NOT EXISTS TABLE coral_netcdf + USING binaryFile + OPTIONS (path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") + SELECT rst_fromfile(path) FROM coral_netcdf LIMIT 1 + +----------------------------------------------------------------------------------------------------------------+ + | rst_fromfile(path) | + +----------------------------------------------------------------------------------------------------------------+ rst_georeference **************** @@ -694,15 +694,15 @@ rst_initnodata .. code-tab:: sql - CREATE TABLE IF NOT EXISTS TABLE coral_netcdf - USING gdal - OPTIONS (extensions "nc", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - SELECT rst_initnodata(path) FROM coral_netcdf LIMIT 1 - +----------------------------------------------------------------------------------------------------------------+ - | rst_initnodata(path) | - +----------------------------------------------------------------------------------------------------------------+ - | {index_id: 593308294097928191, raster: [00 01 10 ... 
00], parentPath: "dbfs:/path_to_file", driver: "NetCDF" } | - +----------------------------------------------------------------------------------------------------------------+ + CREATE TABLE IF NOT EXISTS TABLE coral_netcdf + USING gdal + OPTIONS (extensions "nc", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") + SELECT rst_initnodata(path) FROM coral_netcdf LIMIT 1 + +----------------------------------------------------------------------------------------------------------------+ + | rst_initnodata(path) | + +----------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "NetCDF" } | + +----------------------------------------------------------------------------------------------------------------+ rst_isempty ************* @@ -859,18 +859,18 @@ rst_merge .. code-tab:: sql - CREATE TABLE IF NOT EXISTS TABLE coral_netcdf - USING gdal - OPTIONS (extension "nc", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - WITH grouped as ( - SELECT collect_list(tile) as tile FROM coral_netcdf - ) - SELECT rst_merge(tile) FROM grouped LIMIT 1 - +----------------------------------------------------------------------------------------------------------------+ - | rst_merge(tile) | - +----------------------------------------------------------------------------------------------------------------+ - | {index_id: 593308294097928191, raster: [00 01 10 ... 
00], parentPath: "dbfs:/path_to_file", driver: "NetCDF" } | - +----------------------------------------------------------------------------------------------------------------+ + CREATE TABLE IF NOT EXISTS TABLE coral_netcdf + USING gdal + OPTIONS (extension "nc", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") + WITH grouped as ( + SELECT collect_list(tile) as tile FROM coral_netcdf + ) + SELECT rst_merge(tile) FROM grouped LIMIT 1 + +----------------------------------------------------------------------------------------------------------------+ + | rst_merge(tile) | + +----------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "NetCDF" } | + +----------------------------------------------------------------------------------------------------------------+ rst_mergeagg ************ @@ -924,15 +924,15 @@ rst_mergeagg .. code-tab:: sql - CREATE TABLE IF NOT EXISTS TABLE coral_netcdf - USING gdal - OPTIONS (extension "nc", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - SELECT rst_mergeagg(tile) FROM coral_netcdf LIMIT 1 - +----------------------------------------------------------------------------------------------------------------+ - | rst_mergeagg(tile) | - +----------------------------------------------------------------------------------------------------------------+ - | {index_id: 593308294097928191, raster: [00 01 10 ... 
00], parentPath: "dbfs:/path_to_file", driver: "NetCDF" } | - +----------------------------------------------------------------------------------------------------------------+ + CREATE TABLE IF NOT EXISTS TABLE coral_netcdf + USING gdal + OPTIONS (extension "nc", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") + SELECT rst_mergeagg(tile) FROM coral_netcdf LIMIT 1 + +----------------------------------------------------------------------------------------------------------------+ + | rst_mergeagg(tile) | + +----------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "NetCDF" } | + +----------------------------------------------------------------------------------------------------------------+ rst_metadata ************* @@ -1036,38 +1036,38 @@ rst_ndvi .. tabs:: .. code-tab:: py - df = spark.read.format("binaryFile").option("extensions", "nc")\ - .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_ndvi("path", 1, 2)).limit(1).display() - +----------------------------------------------------------------------------------------------------------------+ - | rst_ndvi(path, 1, 2) | - +----------------------------------------------------------------------------------------------------------------+ - | {index_id: 593308294097928191, raster: [00 01 10 ... 
00], parentPath: "dbfs:/path_to_file", driver: "NetCDF" } | - +----------------------------------------------------------------------------------------------------------------+ + df = spark.read.format("binaryFile").option("extensions", "nc")\ + .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") + df.select(mos.rst_ndvi("path", 1, 2)).limit(1).display() + +----------------------------------------------------------------------------------------------------------------+ + | rst_ndvi(path, 1, 2) | + +----------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "NetCDF" } | + +----------------------------------------------------------------------------------------------------------------+ .. code-tab:: scala - val df = spark.read - .format("binaryFile").option("extensions", "nc") - .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_ndvi(col("path"), lit(1), lit(2))).limit(1).show(false) - +----------------------------------------------------------------------------------------------------------------+ - | rst_ndvi(path, 1, 2) | - +----------------------------------------------------------------------------------------------------------------+ - | {index_id: 593308294097928191, raster: [00 01 10 ... 
00], parentPath: "dbfs:/path_to_file", driver: "NetCDF" } | - +----------------------------------------------------------------------------------------------------------------+ + val df = spark.read + .format("binaryFile").option("extensions", "nc") + .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") + df.select(rst_ndvi(col("path"), lit(1), lit(2))).limit(1).show(false) + +----------------------------------------------------------------------------------------------------------------+ + | rst_ndvi(path, 1, 2) | + +----------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "NetCDF" } | + +----------------------------------------------------------------------------------------------------------------+ .. code-tab:: sql - CREATE TABLE IF NOT EXISTS TABLE coral_netcdf - USING gdal - OPTIONS (extensions "nc", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - SELECT rst_ndvi(path, 1, 2) FROM coral_netcdf LIMIT 1 - +----------------------------------------------------------------------------------------------------------------+ - | rst_ndvi(path, 1, 2) | - +----------------------------------------------------------------------------------------------------------------+ - | {index_id: 593308294097928191, raster: [00 01 10 ... 
00], parentPath: "dbfs:/path_to_file", driver: "NetCDF" } | - +----------------------------------------------------------------------------------------------------------------+ + CREATE TABLE IF NOT EXISTS TABLE coral_netcdf + USING gdal + OPTIONS (extensions "nc", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") + SELECT rst_ndvi(path, 1, 2) FROM coral_netcdf LIMIT 1 + +----------------------------------------------------------------------------------------------------------------+ + | rst_ndvi(path, 1, 2) | + +----------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "NetCDF" } | + +----------------------------------------------------------------------------------------------------------------+ rst_numbands ************* @@ -1237,7 +1237,7 @@ rst_rastertogridavg :param tile: A column containing the raster tile. For < 0.3.11 string representing the path to a raster file or byte array. :type col: Column (RasterTileType) - :param raster: A resolution of the grid index system. + :param resolution: A resolution of the grid index system. 
:type col: Column (IntegerType) :rtype: Column: ArrayType(ArrayType(StructType(LongType|StringType, DoubleType))) @@ -1248,7 +1248,7 @@ rst_rastertogridavg df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_rastertogridavg('path', F.lit(3)).show() + df.select(mos.rst_rastertogridavg('path', F.lit(3))).show() +--------------------------------------------------------------------------------------------------------------------+ | rst_rastertogridavg(path, 3) | +--------------------------------------------------------------------------------------------------------------------+ @@ -1266,7 +1266,7 @@ rst_rastertogridavg val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_rastertogridavg(col("path"), lit(3)).show() + df.select(rst_rastertogridavg(col("path"), lit(3))).show() +--------------------------------------------------------------------------------------------------------------------+ | rst_rastertogridavg(path, 3) | +--------------------------------------------------------------------------------------------------------------------+ @@ -1314,7 +1314,7 @@ rst_rastertogridcount :param tile: A column containing the raster tile. For < 0.3.11 string representing the path to a raster file or byte array. :type col: Column (RasterTileType) - :param raster: A resolution of the grid index system. + :param resolution: A resolution of the grid index system. 
:type col: Column (IntegerType) :rtype: Column: ArrayType(ArrayType(StructType(LongType|StringType, DoubleType))) @@ -1325,7 +1325,7 @@ rst_rastertogridcount df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_rastertogridcount('path', F.lit(3)).show() + df.select(mos.rst_rastertogridcount('path', F.lit(3))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_rastertogridcount(path, 3) | +------------------------------------------------------------------------------------------------------------------+ @@ -1343,7 +1343,7 @@ rst_rastertogridcount val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_rastertogridcount(col("path"), lit(3)).show() + df.select(rst_rastertogridcount(col("path"), lit(3))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_rastertogridcount(path, 3) | +------------------------------------------------------------------------------------------------------------------+ @@ -1391,7 +1391,7 @@ rst_rastertogridmax :param tile: A column containing the raster tile. For < 0.3.11 string representing the path to a raster file or byte array. :type col: Column (RasterTileType) - :param raster: A resolution of the grid index system. + :param resolution: A resolution of the grid index system. 
:type col: Column (IntegerType) :rtype: Column: ArrayType(ArrayType(StructType(LongType|StringType, DoubleType))) @@ -1402,7 +1402,7 @@ rst_rastertogridmax df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_rastertogridmax('path', F.lit(3)).show() + df.select(mos.rst_rastertogridmax('path', F.lit(3))).show() +--------------------------------------------------------------------------------------------------------------------+ | rst_rastertogridmax(path, 3) | +--------------------------------------------------------------------------------------------------------------------+ @@ -1420,7 +1420,7 @@ rst_rastertogridmax val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_rastertogridmax(col("path"), lit(3)).show() + df.select(rst_rastertogridmax(col("path"), lit(3))).show() +--------------------------------------------------------------------------------------------------------------------+ | rst_rastertogridmax(path, 3) | +--------------------------------------------------------------------------------------------------------------------+ @@ -1468,7 +1468,7 @@ rst_rastertogridmedian :param tile: A column containing the raster tile. For < 0.3.11 string representing the path to a raster file or byte array. :type col: Column (RasterTileType) - :param raster: A resolution of the grid index system. + :param resolution: A resolution of the grid index system. 
:type col: Column (IntegerType) :rtype: Column: ArrayType(ArrayType(StructType(LongType|StringType, DoubleType))) @@ -1479,7 +1479,7 @@ rst_rastertogridmedian df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_rastertogridmedian('path', F.lit(3)).show() + df.select(mos.rst_rastertogridmedian('path', F.lit(3))).show() +--------------------------------------------------------------------------------------------------------------------+ | rst_rastertogridmedian(path, 3) | +--------------------------------------------------------------------------------------------------------------------+ @@ -1497,7 +1497,7 @@ rst_rastertogridmedian val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_rastertogridmedian(col("path"), lit(3)).show() + df.select(rst_rastertogridmedian(col("path"), lit(3))).show() +--------------------------------------------------------------------------------------------------------------------+ | rst_rastertogridmedian(path, 3) | +--------------------------------------------------------------------------------------------------------------------+ @@ -1545,7 +1545,7 @@ rst_rastertogridmin :param tile: A column containing the raster tile. For < 0.3.11 string representing the path to a raster file or byte array. :type col: Column (RasterTileType) - :param raster: A resolution of the grid index system. + :param resolution: A resolution of the grid index system. 
:type col: Column (IntegerType) :rtype: Column: ArrayType(ArrayType(StructType(LongType|StringType, DoubleType))) @@ -1556,7 +1556,7 @@ rst_rastertogridmin df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_rastertogridmin('path', F.lit(3)).show() + df.select(mos.rst_rastertogridmin('path', F.lit(3))).show() +--------------------------------------------------------------------------------------------------------------------+ | rst_rastertogridmin(path, 3) | +--------------------------------------------------------------------------------------------------------------------+ @@ -1574,7 +1574,7 @@ rst_rastertogridmin val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_rastertogridmin(col("path"), lit(3)).show() + df.select(rst_rastertogridmin(col("path"), lit(3))).show() +--------------------------------------------------------------------------------------------------------------------+ | rst_rastertogridmin(path, 3) | +--------------------------------------------------------------------------------------------------------------------+ @@ -1634,7 +1634,7 @@ rst_rastertoworldcoord df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_rastertoworldcoord('path', F.lit(3), F.lit(3)).show() + df.select(mos.rst_rastertoworldcoord('path', F.lit(3), F.lit(3))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_rastertoworldcoord(path, 3, 3) | +------------------------------------------------------------------------------------------------------------------+ @@ -1646,7 +1646,7 @@ rst_rastertoworldcoord val df = spark.read 
.format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_rastertoworldcoord(col("path"), lit(3), lit(3)).show() + df.select(rst_rastertoworldcoord(col("path"), lit(3), lit(3))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_rastertoworldcoord(path, 3, 3) | +------------------------------------------------------------------------------------------------------------------+ @@ -1688,7 +1688,7 @@ rst_rastertoworldcoordx df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_rastertoworldcoordx('path', F.lit(3), F.lit(3)).show() + df.select(mos.rst_rastertoworldcoordx('path', F.lit(3), F.lit(3))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_rastertoworldcoordx(path, 3, 3) | +------------------------------------------------------------------------------------------------------------------+ @@ -1700,7 +1700,7 @@ rst_rastertoworldcoordx val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_rastertoworldcoordx(col("path"), lit(3), lit(3)).show() + df.select(rst_rastertoworldcoordx(col("path"), lit(3), lit(3))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_rastertoworldcoordx(path, 3, 3) | +------------------------------------------------------------------------------------------------------------------+ @@ -1742,7 +1742,7 @@ rst_rastertoworldcoordy df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - 
df.select(mos.rst_rastertoworldcoordy('path', F.lit(3), F.lit(3)).show() + df.select(mos.rst_rastertoworldcoordy('path', F.lit(3), F.lit(3))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_rastertoworldcoordy(path, 3, 3) | +------------------------------------------------------------------------------------------------------------------+ @@ -1754,7 +1754,7 @@ rst_rastertoworldcoordy val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_rastertoworldcoordy(col("path"), lit(3), lit(3)).show() + df.select(rst_rastertoworldcoordy(col("path"), lit(3), lit(3))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_rastertoworldcoordy(path, 3, 3) | +------------------------------------------------------------------------------------------------------------------+ @@ -1798,7 +1798,7 @@ rst_retile df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_retile('path', F.lit(300), F.lit(300)).show() + df.select(mos.rst_retile('path', F.lit(300), F.lit(300))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_retile(path, 300, 300) | +------------------------------------------------------------------------------------------------------------------+ @@ -1811,7 +1811,7 @@ rst_retile val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_retile(col("path"), lit(300), lit(300)).show() + df.select(rst_retile(col("path"), lit(300), lit(300))).show() 
+------------------------------------------------------------------------------------------------------------------+ | rst_retile(path, 300, 300) | +------------------------------------------------------------------------------------------------------------------+ @@ -1865,7 +1865,7 @@ rst_rotation val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_rotation(col("path")).show() + df.select(rst_rotation(col("path"))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_rotation(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -1904,7 +1904,7 @@ rst_scalex df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_scalex('path').show() + df.select(mos.rst_scalex('path')).show() +------------------------------------------------------------------------------------------------------------------+ | rst_scalex(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -1916,7 +1916,7 @@ rst_scalex val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_scalex(col("path")).show() + df.select(rst_scalex(col("path"))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_scalex(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -1953,7 +1953,7 @@ rst_scaley df = spark.read.format("binaryFile").option("extensions", "nc")\ 
.load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_scaley('path').show() + df.select(mos.rst_scaley('path')).show() +------------------------------------------------------------------------------------------------------------------+ | rst_scaley(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -1965,7 +1965,7 @@ rst_scaley val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_scaley(col("path")).show() + df.select(rst_scaley(col("path"))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_scaley(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -2008,7 +2008,7 @@ rst_setnodata df = spark.read.format("binaryFile").option("extensions", "tif")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") - df.select(mos.rst_setnodata('path', F.lit(0)).show() + df.select(mos.rst_setnodata('path', F.lit(0))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_setnodata(path, 0) | +------------------------------------------------------------------------------------------------------------------+ @@ -2021,7 +2021,7 @@ rst_setnodata val df = spark.read .format("binaryFile").option("extensions", "tif") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") - df.select(rst_setnodata(col("path"), lit(0)).show() + df.select(rst_setnodata(col("path"), lit(0))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_setnodata(path, 0) | 
+------------------------------------------------------------------------------------------------------------------+ @@ -2031,16 +2031,16 @@ rst_setnodata .. code-tab:: sql - CREATE TABLE IF NOT EXISTS TABLE coral_tif - USING gdal - OPTIONS (extensions "tif", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") - SELECT rst_setnodata(path, 0) - +------------------------------------------------------------------------------------------------------------------+ - | rst_setnodata(path, 0) | - +------------------------------------------------------------------------------------------------------------------+ - | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | - | {index_id: 593308294097928192, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | - +------------------------------------------------------------------------------------------------------------------+ + CREATE TABLE IF NOT EXISTS TABLE coral_tif + USING gdal + OPTIONS (extensions "tif", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") + SELECT rst_setnodata(path, 0) + +------------------------------------------------------------------------------------------------------------------+ + | rst_setnodata(path, 0) | + +------------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | + | {index_id: 593308294097928192, raster: [00 01 10 ... 
00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | + +------------------------------------------------------------------------------------------------------------------+ rst_skewx ********************** @@ -2060,7 +2060,7 @@ rst_skewx df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_skewx('path').show() + df.select(mos.rst_skewx('path')).show() +------------------------------------------------------------------------------------------------------------------+ | rst_skewx(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -2072,7 +2072,7 @@ rst_skewx val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_skewx(col("path")).show() + df.select(rst_skewx(col("path"))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_skewx(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -2109,7 +2109,7 @@ rst_skewy df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_skewy('path').show() + df.select(mos.rst_skewy('path')).show() +------------------------------------------------------------------------------------------------------------------+ | rst_skewy(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -2121,7 +2121,7 @@ rst_skewy val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_skewy(col("path")).show() + 
df.select(rst_skewy(col("path"))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_skewy(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -2161,7 +2161,7 @@ rst_srid df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_srid('path').show() + df.select(mos.rst_srid('path')).show() +------------------------------------------------------------------------------------------------------------------+ | rst_srid(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -2173,7 +2173,7 @@ rst_srid val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_srid(col("path")).show() + df.select(rst_srid(col("path"))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_srid(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -2212,7 +2212,7 @@ rst_subdatasets df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_subdatasets('path').show() + df.select(mos.rst_subdatasets('path')).show() +--------------------------------------------------------------------------------------------------------------------+ | rst_subdatasets(path) | +--------------------------------------------------------------------------------------------------------------------+ @@ -2227,7 +2227,7 @@ rst_subdatasets val df = spark.read .format("binaryFile").option("extensions", "nc") 
.load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_subdatasets(col("path")).show() + df.select(rst_subdatasets(col("path"))).show() +--------------------------------------------------------------------------------------------------------------------+ | rst_subdatasets(path) | +--------------------------------------------------------------------------------------------------------------------+ @@ -2277,7 +2277,7 @@ rst_subdivide df = spark.read.format("binaryFile").option("extensions", "tif")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") - df.select(mos.rst_subdivide('path', F.lit(10)).show() + df.select(mos.rst_subdivide('path', F.lit(10))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_subdivide(path, 10) | +------------------------------------------------------------------------------------------------------------------+ @@ -2290,7 +2290,7 @@ rst_subdivide val df = spark.read .format("binaryFile").option("extensions", "tif") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") - df.select(rst_subdivide(col("path"), lit(10)).show() + df.select(rst_subdivide(col("path"), lit(10))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_subdivide(path, 10) | +------------------------------------------------------------------------------------------------------------------+ @@ -2300,16 +2300,16 @@ rst_subdivide .. 
code-tab:: sql - CREATE TABLE IF NOT EXISTS TABLE coral_tif - USING gdal - OPTIONS (extensions "tif", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") - SELECT rst_subdivide(path, 10) - +------------------------------------------------------------------------------------------------------------------+ - | rst_subdivide(path, 10) | - +------------------------------------------------------------------------------------------------------------------+ - | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | - | {index_id: 593308294097928192, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | - +------------------------------------------------------------------------------------------------------------------+ + CREATE TABLE IF NOT EXISTS TABLE coral_tif + USING gdal + OPTIONS (extensions "tif", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") + SELECT rst_subdivide(path, 10) + +------------------------------------------------------------------------------------------------------------------+ + | rst_subdivide(path, 10) | + +------------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | + | {index_id: 593308294097928192, raster: [00 01 10 ... 
00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | + +------------------------------------------------------------------------------------------------------------------+ rst_summary ********************** @@ -2332,7 +2332,7 @@ rst_summary df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_summary('path').show() + df.select(mos.rst_summary('path')).show() +------------------------------------------------------------------------------------------------------------------+ | rst_summary(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -2348,7 +2348,7 @@ rst_summary val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_summary(col("path")).show() + df.select(rst_summary(col("path"))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_summary(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -2397,7 +2397,7 @@ rst_tessellate df = spark.read.format("binaryFile").option("extensions", "tif")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") - df.select(mos.rst_tessellate('path', F.lit(10)).show() + df.select(mos.rst_tessellate('path', F.lit(10))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_tessellate(path, 10) | +------------------------------------------------------------------------------------------------------------------+ @@ -2407,29 +2407,29 @@ rst_tessellate .. 
code-tab:: scala - val df = spark.read - .format("binaryFile").option("extensions", "tif") - .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") - df.select(rst_tessellate(col("path"), lit(10)).show() - +------------------------------------------------------------------------------------------------------------------+ - | rst_tessellate(path, 10) | - +------------------------------------------------------------------------------------------------------------------+ - | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | - | {index_id: 593308294097928192, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | - +------------------------------------------------------------------------------------------------------------------+ + val df = spark.read + .format("binaryFile").option("extensions", "tif") + .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") + df.select(rst_tessellate(col("path"), lit(10))).show() + +------------------------------------------------------------------------------------------------------------------+ + | rst_tessellate(path, 10) | + +------------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | + | {index_id: 593308294097928192, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | + +------------------------------------------------------------------------------------------------------------------+ .. 
code-tab:: sql - CREATE TABLE IF NOT EXISTS TABLE coral_tif - USING gdal - OPTIONS (extensions "tif", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") - SELECT rst_tessellate(path, 10) - +------------------------------------------------------------------------------------------------------------------+ - | rst_tessellate(path, 10) | - +------------------------------------------------------------------------------------------------------------------+ - | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | - | {index_id: 593308294097928192, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | - +------------------------------------------------------------------------------------------------------------------+ + CREATE TABLE IF NOT EXISTS TABLE coral_tif + USING gdal + OPTIONS (extensions "tif", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") + SELECT rst_tessellate(path, 10) + +------------------------------------------------------------------------------------------------------------------+ + | rst_tessellate(path, 10) | + +------------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | + | {index_id: 593308294097928192, raster: [00 01 10 ... 
00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | + +------------------------------------------------------------------------------------------------------------------+ rst_tooverlappingtiles ********************** @@ -2459,7 +2459,7 @@ rst_tooverlappingtiles df = spark.read.format("binaryFile").option("extensions", "tif")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") - df.select(mos.rst_tooverlappingtiles('path', F.lit(10), F.lit(10), F.lit(10)).show() + df.select(mos.rst_tooverlappingtiles('path', F.lit(10), F.lit(10), F.lit(10))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_tooverlappingtiles(path, 10, 10, 10) | +------------------------------------------------------------------------------------------------------------------+ @@ -2469,29 +2469,29 @@ rst_tooverlappingtiles .. code-tab:: scala - val df = spark.read - .format("binaryFile").option("extensions", "tif") - .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif - df.select(rst_tooverlappingtiles(col("path"), lit(10), lit(10), lit(10)).show() - +------------------------------------------------------------------------------------------------------------------+ - | rst_tooverlappingtiles(path, 10, 10, 10) | - +------------------------------------------------------------------------------------------------------------------+ - | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | - | {index_id: 593308294097928192, raster: [00 01 10 ... 
00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | - +------------------------------------------------------------------------------------------------------------------+ + val df = spark.read + .format("binaryFile").option("extensions", "tif") + .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") + df.select(rst_tooverlappingtiles(col("path"), lit(10), lit(10), lit(10))).show() + +------------------------------------------------------------------------------------------------------------------+ + | rst_tooverlappingtiles(path, 10, 10, 10) | + +------------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | + | {index_id: 593308294097928192, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | + +------------------------------------------------------------------------------------------------------------------+ .. code-tab:: sql - CREATE TABLE IF NOT EXISTS TABLE coral_tif - USING gdal - OPTIONS (extensions "tif", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") - SELECT rst_tooverlappingtiles(path, 10, 10, 10) - +------------------------------------------------------------------------------------------------------------------+ - | rst_tooverlappingtiles(path, 10, 10, 10) | - +------------------------------------------------------------------------------------------------------------------+ - | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | - | {index_id: 593308294097928192, raster: [00 01 10 ...
00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | - +------------------------------------------------------------------------------------------------------------------+ + CREATE TABLE IF NOT EXISTS TABLE coral_tif + USING gdal + OPTIONS (extensions "tif", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") + SELECT rst_tooverlappingtiles(path, 10, 10, 10) + +------------------------------------------------------------------------------------------------------------------+ + | rst_tooverlappingtiles(path, 10, 10, 10) | + +------------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | + | {index_id: 593308294097928192, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | + +------------------------------------------------------------------------------------------------------------------+ rst_tryopen ********************** @@ -2511,7 +2511,7 @@ rst_tryopen df = spark.read.format("binaryFile").option("extensions", "tif")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") - df.select(mos.rst_tryopen('path').show() + df.select(mos.rst_tryopen('path')).show() +------------------------------------------------------------------------------------------------------------------+ | rst_tryopen(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -2520,27 +2520,27 @@ rst_tryopen .. 
code-tab:: scala - val df = spark.read - .format("binaryFile").option("extensions", "tif") - .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif - df.select(rst_tryopen(col("path")).show() - +------------------------------------------------------------------------------------------------------------------+ - | rst_tryopen(path) | - +------------------------------------------------------------------------------------------------------------------+ - | true | - +------------------------------------------------------------------------------------------------------------------+ + val df = spark.read + .format("binaryFile").option("extensions", "tif") + .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") + df.select(rst_tryopen(col("path"))).show() + +------------------------------------------------------------------------------------------------------------------+ + | rst_tryopen(path) | + +------------------------------------------------------------------------------------------------------------------+ + | true | + +------------------------------------------------------------------------------------------------------------------+ ..
code-tab:: sql - CREATE TABLE IF NOT EXISTS TABLE coral_tif - USING gdal - OPTIONS (extensions "tif", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") - SELECT rst_tryopen(path) - +------------------------------------------------------------------------------------------------------------------+ - | rst_tryopen(path) | - +------------------------------------------------------------------------------------------------------------------+ - | true | - +------------------------------------------------------------------------------------------------------------------+ + CREATE TABLE IF NOT EXISTS TABLE coral_tif + USING gdal + OPTIONS (extensions "tif", path "dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/tif") + SELECT rst_tryopen(path) + +------------------------------------------------------------------------------------------------------------------+ + | rst_tryopen(path) | + +------------------------------------------------------------------------------------------------------------------+ + | true | + +------------------------------------------------------------------------------------------------------------------+ rst_upperleftx ********************** @@ -2561,7 +2561,7 @@ rst_upperleftx df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_upperleftx('path').show() + df.select(mos.rst_upperleftx('path')).show() +------------------------------------------------------------------------------------------------------------------+ | rst_upperleftx(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -2573,7 +2573,7 @@ rst_upperleftx val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_upperleftx(col("path")).show() + 
df.select(rst_upperleftx(col("path"))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_upperleftx(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -2611,7 +2611,7 @@ rst_upperlefty df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_upperlefty('path').show() + df.select(mos.rst_upperlefty('path')).show() +------------------------------------------------------------------------------------------------------------------+ | rst_upperlefty(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -2623,7 +2623,7 @@ rst_upperlefty val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_upperlefty(col("path")).show() + df.select(rst_upperlefty(col("path"))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_upperlefty(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -2661,7 +2661,7 @@ rst_width df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_width('path').show() + df.select(mos.rst_width('path')).show() +------------------------------------------------------------------------------------------------------------------+ | rst_width(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -2673,7 +2673,7 @@ rst_width val df = spark.read .format("binaryFile").option("extensions", "nc") 
.load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_width(col("path")).show() + df.select(rst_width(col("path"))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_width(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -2717,7 +2717,7 @@ rst_worldtorastercoord df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_worldtorastercoord('path', F.lit(-160.1), F.lit(40.0)).show() + df.select(mos.rst_worldtorastercoord('path', F.lit(-160.1), F.lit(40.0))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_worldtorastercoord(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -2729,7 +2729,7 @@ rst_worldtorastercoord val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_worldtorastercoord(col("path"), lit(-160.1), lit(40.0)).show() + df.select(rst_worldtorastercoord(col("path"), lit(-160.1), lit(40.0))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_worldtorastercoord(path) | +------------------------------------------------------------------------------------------------------------------+ @@ -2775,7 +2775,7 @@ rst_worldtorastercoordx df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_worldtorastercoord('path', F.lit(-160.1), F.lit(40.0)).show() + df.select(mos.rst_worldtorastercoord('path', 
F.lit(-160.1), F.lit(40.0))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_worldtorastercoordx(path, -160.1, 40.0) | +------------------------------------------------------------------------------------------------------------------+ @@ -2787,7 +2787,7 @@ rst_worldtorastercoordx val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_worldtorastercoordx(col("path"), lit(-160.1), lit(40.0)).show() + df.select(rst_worldtorastercoordx(col("path"), lit(-160.1), lit(40.0))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_worldtorastercoordx(path, -160.1, 40.0) | +------------------------------------------------------------------------------------------------------------------+ @@ -2833,7 +2833,7 @@ rst_worldtorastercoordy df = spark.read.format("binaryFile").option("extensions", "nc")\ .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(mos.rst_worldtorastercoordy('path', F.lit(-160.1), F.lit(40.0)).show() + df.select(mos.rst_worldtorastercoordy('path', F.lit(-160.1), F.lit(40.0))).show() +------------------------------------------------------------------------------------------------------------------+ | rst_worldtorastercoordy(path, -160.1, 40.0) | +------------------------------------------------------------------------------------------------------------------+ @@ -2845,7 +2845,7 @@ rst_worldtorastercoordy val df = spark.read .format("binaryFile").option("extensions", "nc") .load("dbfs:/FileStore/geospatial/mosaic/sample_raster_data/binary/netcdf-coral") - df.select(rst_worldtorastercoordy(col("path"), lit(-160.1), lit(40.0)).show() + df.select(rst_worldtorastercoordy(col("path"), lit(-160.1), lit(40.0))).show() 
+------------------------------------------------------------------------------------------------------------------+ | rst_worldtorastercoordy(path, -160.1, 40.0) | +------------------------------------------------------------------------------------------------------------------+ diff --git a/docs/source/usage/kepler.ipynb b/docs/source/usage/kepler.ipynb index 73c4de17d..967b46266 100644 --- a/docs/source/usage/kepler.ipynb +++ b/docs/source/usage/kepler.ipynb @@ -3,8 +3,11 @@ { "cell_type": "markdown", <<<<<<< HEAD +<<<<<<< HEAD ======= <<<<<<< HEAD +======= +>>>>>>> labs-main "source": [ "# Kepler visualizations" ], @@ -20,7 +23,10 @@ { "cell_type": "markdown", ======= +<<<<<<< HEAD >>>>>>> databrickslabs-main +======= +>>>>>>> labs-main "metadata": { "application/vnd.databricks.v1+cell": { "cellMetadata": {}, @@ -46,9 +52,13 @@ } }, <<<<<<< HEAD +<<<<<<< HEAD ======= >>>>>>> efd95270 (Documentation updates) >>>>>>> databrickslabs-main +======= +>>>>>>> efd95270 (Documentation updates) +>>>>>>> labs-main "source": [ "You can use the `%%mosaic_kepler` magic function to visualise data using [Kepler.gl](https://kepler.gl/).\n", "\n", @@ -58,7 +68,11 @@ "\n", "2) `column_name`: The column that needs to be plotted, can be either a geometry column (`WKT`, `WKB` or Mosaic internal format) or a column containing a spatial grid index ID\n", "\n", +<<<<<<< HEAD + "3) `feature_type`: The type of data to be plotted. Valid values are `geometry` (if SRID=4326), `geometry()` (where `` is the SRID used by the geometry column) and `h3`\n", +======= "3) `feature_type`: The type of data to be plotted. Valid values are `geometry` and `h3`\n", +>>>>>>> efd95270 (Documentation updates) "\n", "4) `limit`: The maximum number of objects to plot. The default limit is `1000`\n", "\n", @@ -70,8 +84,11 @@ "\n", "This magic function is only available in python. 
It can be used from notebooks with other default languages by storing the intermediate result in a temporary view, and then adding a python cell that uses the `mosaic_kepler` with the temporary view created from another language." <<<<<<< HEAD +<<<<<<< HEAD ======= <<<<<<< HEAD +======= +>>>>>>> labs-main ], "metadata": { "application/vnd.databricks.v1+cell": { @@ -135,7 +152,10 @@ { "cell_type": "code", ======= +<<<<<<< HEAD >>>>>>> databrickslabs-main +======= +>>>>>>> labs-main ] }, { @@ -184,16 +204,23 @@ }, "outputs": [], <<<<<<< HEAD +<<<<<<< HEAD ======= >>>>>>> efd95270 (Documentation updates) >>>>>>> databrickslabs-main +======= +>>>>>>> efd95270 (Documentation updates) +>>>>>>> labs-main "source": [ "from pyspark.sql.functions import *\n", "import mosaic as mos\n", "mos.enable_mosaic(spark, dbutils)" <<<<<<< HEAD +<<<<<<< HEAD ======= <<<<<<< HEAD +======= +>>>>>>> labs-main ], "metadata": { "application/vnd.databricks.v1+cell": { @@ -243,7 +270,10 @@ { "cell_type": "code", ======= +<<<<<<< HEAD >>>>>>> databrickslabs-main +======= +>>>>>>> labs-main ] }, { @@ -275,9 +305,13 @@ }, "outputs": [], <<<<<<< HEAD +<<<<<<< HEAD ======= >>>>>>> efd95270 (Documentation updates) >>>>>>> databrickslabs-main +======= +>>>>>>> efd95270 (Documentation updates) +>>>>>>> labs-main "source": [ "import requests\n", "\n", @@ -285,8 +319,11 @@ "with open('/dbfs/tmp/nyc_taxi_zones.geojson', 'wb') as f:\n", " f.write(req.content)" <<<<<<< HEAD +<<<<<<< HEAD ======= <<<<<<< HEAD +======= +>>>>>>> labs-main ], "metadata": { "application/vnd.databricks.v1+cell": { @@ -322,7 +359,10 @@ { "cell_type": "code", ======= +<<<<<<< HEAD >>>>>>> databrickslabs-main +======= +>>>>>>> labs-main ] }, { @@ -339,9 +379,13 @@ }, "outputs": [], <<<<<<< HEAD +<<<<<<< HEAD ======= >>>>>>> efd95270 (Documentation updates) >>>>>>> databrickslabs-main +======= +>>>>>>> efd95270 (Documentation updates) +>>>>>>> labs-main "source": [ "neighbourhoods = (\n", " spark.read\n", @@ -368,8 +412,11 @@ "\n", 
"neighbourhoods.show()" <<<<<<< HEAD +<<<<<<< HEAD ======= <<<<<<< HEAD +======= +>>>>>>> labs-main ], "metadata": { "application/vnd.databricks.v1+cell": { @@ -889,7 +936,10 @@ { "cell_type": "code", ======= +<<<<<<< HEAD >>>>>>> databrickslabs-main +======= +>>>>>>> labs-main ] }, { @@ -1145,9 +1195,13 @@ }, "outputs": [], <<<<<<< HEAD +<<<<<<< HEAD ======= >>>>>>> efd95270 (Documentation updates) >>>>>>> databrickslabs-main +======= +>>>>>>> efd95270 (Documentation updates) +>>>>>>> labs-main "source": [ "neighbourhood_chips = (neighbourhoods\n", " .limit(1)\n", @@ -1157,8 +1211,11 @@ "\n", "neighbourhood_chips.show()" <<<<<<< HEAD +<<<<<<< HEAD ======= <<<<<<< HEAD +======= +>>>>>>> labs-main ], "metadata": { "application/vnd.databricks.v1+cell": { @@ -1387,7 +1444,10 @@ } } ======= +<<<<<<< HEAD >>>>>>> databrickslabs-main +======= +>>>>>>> labs-main ] }, { @@ -1471,16 +1531,23 @@ "![mosaic kepler map example h3 chips](../images/kepler-3.png)" ] <<<<<<< HEAD +<<<<<<< HEAD ======= >>>>>>> efd95270 (Documentation updates) >>>>>>> databrickslabs-main +======= +>>>>>>> efd95270 (Documentation updates) +>>>>>>> labs-main } ], "metadata": { "application/vnd.databricks.v1+notebook": { <<<<<<< HEAD +<<<<<<< HEAD ======= <<<<<<< HEAD +======= +>>>>>>> labs-main "notebookName": "kepler", "dashboards": [], "notebookMetadata": { @@ -1490,7 +1557,10 @@ "widgets": {}, "notebookOrigID": 2874007245243191 ======= +<<<<<<< HEAD >>>>>>> databrickslabs-main +======= +>>>>>>> labs-main "dashboards": [], "language": "python", "notebookMetadata": { @@ -1500,9 +1570,13 @@ "notebookOrigID": 2666786534675682, "widgets": {} <<<<<<< HEAD +<<<<<<< HEAD ======= >>>>>>> efd95270 (Documentation updates) >>>>>>> databrickslabs-main +======= +>>>>>>> efd95270 (Documentation updates) +>>>>>>> labs-main } }, "nbformat": 4, diff --git a/mosaic-0.3.12-jar-with-dependencies.jar b/mosaic-0.3.12-jar-with-dependencies.jar new file mode 100644 index 000000000..8fcbdf0a3 Binary files /dev/null and 
b/mosaic-0.3.12-jar-with-dependencies.jar differ diff --git a/notebooks/examples/python/BritishNationalGrid.py b/notebooks/examples/python/BritishNationalGrid.py deleted file mode 100644 index bd359f906..000000000 --- a/notebooks/examples/python/BritishNationalGrid.py +++ /dev/null @@ -1,393 +0,0 @@ -# Databricks notebook source -# MAGIC %md -# MAGIC ## Install Mosaic -# MAGIC Mosaic framework is available via pip install and it comes with bindings for Python, SQL, Scala and R.
-# MAGIC The wheel file coming with pip installation is registering any necessary jars for other language support. - -# COMMAND ---------- - -# MAGIC %pip install databricks-mosaic - -# COMMAND ---------- - -# MAGIC %md ## Download demo data -# MAGIC -# MAGIC Run this only once - -# COMMAND ---------- - -# MAGIC %run ../../data/DownloadLondonPostcodeZones - -# COMMAND ---------- - -# MAGIC %run ../../data/DownloadUPRNsData - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Setup London Postcode zones -# MAGIC In order to setup the data please run the notebook available at "../../data/DownloadLondonPostcodeZones".
-# MAGIC DownloadLondonPostcodeZones notebook will make sure we have London Postcode shapes available in our environment. - -# COMMAND ---------- - -user_name = dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get() - -raw_path = f"dbfs:/tmp/mosaic/{user_name}" -raw_postcode_zones_path = f"{raw_path}/postcodes" - -print(f"The raw data is stored in {raw_path}") - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Enable Mosaic in the notebook -# MAGIC To get started, you'll need to attach the wheel to your cluster and import instances as in the cell below.
-# MAGIC The defautl grid index system is set to H3. In oreder to use British National Grid you'd need to set the configuration parameter.
- -# COMMAND ---------- - -from pyspark.sql.functions import * -import mosaic as mos -spark.conf.set("spark.databricks.labs.mosaic.index.system", "BNG") -mos.enable_mosaic(spark, dbutils) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Read polygons from GeoJson - -# COMMAND ---------- - -postcodes = ( - spark.read - .option("multiline", "true") - .format("json") - .load(raw_postcode_zones_path) - .select("type", explode(col("features")).alias("feature")) - .select("type", col("feature.properties").alias("properties"), to_json(col("feature.geometry")).alias("json_geometry")) - .withColumn("geometry", mos.st_geomfromgeojson("json_geometry")) -) - -display( - postcodes -) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Reproject the geometries to correct SRID -# MAGIC British National Grid expects coordinate of geometries to be provided in EPSG:27700.
-# MAGIC Our geometries are provided in EPSG:4326. So we will need to reproject the geometries.
-# MAGIC Mosaic has the necessary functionality to help us achieve this. - -# COMMAND ---------- - -postcodes = ( - postcodes.select( - "type", "properties", "geometry" - ).withColumn( - "geometry", mos.st_setsrid("geometry", lit(4326)) - ).withColumn( - "geometry", mos.st_transform("geometry", lit(27700)) - ) -) - -postcodes.display() - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Compute some basic geometry attributes - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Mosaic provides a number of functions for extracting the properties of geometries. Here are some that are relevant to Polygon geometries: - -# COMMAND ---------- - -display( - postcodes - .withColumn("calculated_area", mos.st_area(col("geometry"))) - .withColumn("calculated_length", mos.st_length(col("geometry"))) - # Note: The unit of measure of the area and length depends on the CRS used. - # For British National Grid locations it will be square meters and meters - .select("geometry", "calculated_area", "calculated_length") -) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Read points data - -# COMMAND ---------- - -# MAGIC %md -# MAGIC We will load the Unique Property Reference Numbers (UPRNs) data to represent point data.
-# MAGIC In order to setup the data please run the notebook available at "../../data/DownloadUPRNsData".
-# MAGIC DownloadUPRNsData notebook will make sure we have UPRN table with point data available in our environment. -# MAGIC We already loaded some shapes representing polygons that correspond to London postcodes.
- -# COMMAND ---------- - -uprns_table = spark.table("uprns_table") -display(uprns_table) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC The UPRNs table contains Unique Property Reference Numbers and positions provided in EPSG:27700 and EPSG:4326.
-# MAGIC Since we are operating in EPSG:27700 and using BNG as our indexing system, we will use the location data provided via Northings and Eastings coordinates. - -# COMMAND ---------- - -uprns_table = ( - uprns_table - .withColumn("uprn_point", mos.st_point(col("X_COORDINATE"), col("Y_COORDINATE"))) - # we are using WKT here for simpler displaying, use WKB for faster query run time - .withColumn("uprn_point", mos.st_aswkt("uprn_point")) - .where(mos.st_hasvalidcoordinates("uprn_point", lit('EPSG:27700'), lit('reprojected_bounds'))) - .where(mos.st_isvalid(col("uprn_point"))) - .drop("LATITUDE", "LONGITUDE") -) -display(uprns_table) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Next step is optional. Howerver, since we are constructing POINT geometries and ensuring they are valid it is prudent to write out the validated dataset.
-# MAGIC That way we are making sure validation is performed only once at ingestion time and not each time spark runs the queries (due to spark lazy evaluation). - -# COMMAND ---------- - -uprns_table.write.format("delta").mode("overwrite").saveAsTable("uprns_bng_table") - -# COMMAND ---------- - -uprns_table = spark.read.table("uprns_bng_table") - -# COMMAND ---------- - -uprns_table.count() - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Spatial Joins - -# COMMAND ---------- - -# MAGIC %md -# MAGIC We can use Mosaic to perform spatial joins both with and without Mosaic indexing strategies.
-# MAGIC Indexing is very important when handling very different geometries both in size and in shape (ie. number of vertices).
-# MAGIC In the context of Mosaic we are using grid index systems rather than traditional tree based index system.
-# MAGIC The reason for this is the fact grid index systems like BNG and/or H3 are far better suited for distributed massive scale systems.
-# MAGIC Mosaic comes with grid_tessallate expressions that allow the caller to index an arbitrary shape within grid index system of choice.
-# MAGIC One thing to note here is that tessellation is a specialised way of converting a geometry to set of grid index system cells with their local geometries.
-# MAGIC Tesselation is applicable to any shape, Polygon, LineString, Points and their Multi* variants.
- -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### Getting the optimal resolution - -# COMMAND ---------- - -# MAGIC %md -# MAGIC We can use Mosaic functionality to identify how to best index/tessellate our data based on the data inside the specific dataframe.
-# MAGIC Selecting an apropriate tesselation resolution can have a considerable impact on the performance.
- -# COMMAND ---------- - -from mosaic import MosaicFrame - -postcodes_mosaic_frame = MosaicFrame(postcodes, "geometry") -optimal_resolution = postcodes_mosaic_frame.get_optimal_resolution(sample_fraction=0.75) -optimal_resolution_str = postcodes_mosaic_frame.get_optimal_resolution_str(sample_fraction=0.75) - -print(f""" - Optimal resolution code is :{optimal_resolution}. - Optimal resolution name is :{optimal_resolution_str}. -""") - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Not every resolution will yield performance improvements.
-# MAGIC By a rule of thumb it is always better to select more coarse resolution than to select a more fine grained resolution - if not sure select a lower resolution.
-# MAGIC Tessellation is a trade off between decomposition and explosion factor.
-# MAGIC The more fine grained the resolution is the more explosion of rows will impact the preprocessing time. However, it will make data more parallel.
-# MAGIC On the other hand, if the resolution is too coarse we are not addressing localisation related data skews.
-# MAGIC You can think of Mosaic's tesselation as a way to partition an overly complex row into multiple rows that have a balanced amount of computation each. - -# COMMAND ---------- - -display( - postcodes_mosaic_frame.get_resolution_metrics(sample_rows=150) -) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### Indexing/Tessellating using the optimal resolution - -# COMMAND ---------- - -# MAGIC %md -# MAGIC We will use mosaic sql functions to index our points data.
-# MAGIC Here we will use resolution -4 (500m), index resolution depends on the dataset in use.
-# MAGIC There is a second best choice which is 4 (100m).
-# MAGIC The user can pass either numerical resolution or the string label to the grid expressions.
-# MAGIC BNG provides 2 types of hierarchies.
-# MAGIC The standard hierarchy which operates with index resolutions in base 10 (i.e. (6, 1m), (5, 10m), (4, 100m), (3, 1km), (2, 10km), (1, 100km)) and cell ids follow the format of letter pair followed by coordinate bins at the selected resolution (e.g. TQ100100 for (4, 100m)).
-# MAGIC The quad hierachy (or quadrant hierarchy) which operates with index resolutions in base 5 (i.e. (-6, 5m), (-5, 50m), (-4, 500m), (-3, 5km), (-2, 50km), (-1, 500km)) and cell ids follow the format of letter pair followed by coordinate bins at the selected resolution and folowed by quadrant letters (e.g. TQ100100SW for (-4, 500m)). Quadrants correspond to compas directions SW (south west), NW (north west), NE (north east) and SE (south east).
- -# COMMAND ---------- - -uprns_table = spark.read.table("uprns_bng_table") -uprns_table = ( - uprns_table - .withColumn("uprn_bng_500m", mos.grid_pointascellid("uprn_point", lit(optimal_resolution))) - .withColumn("uprn_bng_500m_str", mos.grid_pointascellid("uprn_point", lit(optimal_resolution_str))) - .withColumn("uprn_bng_100m_str", mos.grid_pointascellid("uprn_point", lit("100m"))) -) - -# COMMAND ---------- - -uprns_table.display() - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Mosaic has a builtin wrappers for KeplerGL map plots using mosaic_kepler IPython magics.
-# MAGIC Mosaic magics automatically handle bng grid idex system and CRS conversion for you.
-# MAGIC Given that Kepler Plots are rendered on the browser side we are automatically limiting the row count to 1000.
-# MAGIC The end user can override the number of ploted rows by specifying the desired number. - -# COMMAND ---------- - -count_per_index = uprns_table.groupBy("uprn_bng_500m").count().cache() - -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC count_per_index "uprn_bng_500m" "bng" 50 - -# COMMAND ---------- - -# MAGIC %md -# MAGIC We will use Mosaic to tessellate our postcode geometries using a built in tesselation generator (explode) function . - -# COMMAND ---------- - -postcodes_with_index = (postcodes - - # We break down the original geometry in multiple smaller mosaic chips - # each fully contained in a grid cell - .withColumn("chips", mos.grid_tessellateexplode(col("geometry"), lit(optimal_resolution))) - - # We don't need the original geometry any more, since we have broken it down into - # Smaller mosaic chips. - .drop("json_geometry", "geometry") - ) - -# COMMAND ---------- - -postcodes_with_index.display() - -# COMMAND ---------- - -to_display = postcodes_with_index.select("properties.Name", "chips.index_id", mos.st_aswkt("chips.wkb").alias("geometry")) - -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC to_display "geometry" "geometry(27700)" 200 - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### Performing the spatial join - -# COMMAND ---------- - -# MAGIC %md -# MAGIC We can now do spatial join between our UPRNs and postcodes. - -# COMMAND ---------- - -with_postcodes = ( - uprns_table.join( - postcodes_with_index, - uprns_table["uprn_bng_500m"] == postcodes_with_index["chips.index_id"], - how = "right_outer" # to perserve even emtpy chips - ).where( - # If the borough is a core chip (the chip is fully contained within the geometry), then we do not need - # to perform any intersection, because any point matching the same index will certainly be contained in - # the borough. Otherwise we need to perform an st_contains operation on the chip geometry. 
- col("chips.is_core") | mos.st_contains(col("chips.wkb"), col("uprn_point")) - ).select( - "properties.*", "uprn_point", "UPRN", "chips.index_id", mos.st_aswkt("chips.wkb").alias("index_geometry") - ) -) - -display(with_postcodes) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Visualise the results in Kepler - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Mosaic abstracts interaction with Kepler in python through mosaic_kepler magic.
-# MAGIC Mosaic_kepler magic takes care of conversion between EPSG:27700 and EPSG:4326 so that Kepler can properly render.
-# MAGIC It can handle columns with bng index ids (int and str formats are both supported) and geometries that are provided in EPSG:27700.
-# MAGIC Mosaic will convert all the geometries for proper rendering. - -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC with_postcodes "index_geometry" "geometry(27700)" 5000 - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Using mosaic it takes only a few lines of code to produce BNG based heat map and visualise it in Kepler.
-# MAGIC By default the colors wont be affected by the counts and you'd need to change the options in Kepler UI.
-# MAGIC Navigate to the layer, expland it and for the fill color click on the 3 dots icon, then select count as the field for color scaling. - -# COMMAND ---------- - -properties_per_index = with_postcodes.groupBy("index_id").count() - -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC properties_per_index "index_id" "bng" 6000 - -# COMMAND ---------- - -# MAGIC %md -# MAGIC We can do the same - -# COMMAND ---------- - -properties_per_chip = with_postcodes.groupBy("index_geometry").count() - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Note that if you dont use "right_outer" join some chips may be empty.
-# MAGIC This is due to no UPRNs being located in those exact chips.
- -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC properties_per_chip "index_geometry" "geometry(27700)" 20000 - -# COMMAND ---------- - - diff --git a/notebooks/examples/python/MosaicAndSedona.py b/notebooks/examples/python/MosaicAndSedona.py deleted file mode 100644 index 9426cf0c8..000000000 --- a/notebooks/examples/python/MosaicAndSedona.py +++ /dev/null @@ -1,40 +0,0 @@ -# Databricks notebook source -# MAGIC %md -# MAGIC # Mosaic & Sedona -# MAGIC -# MAGIC You can combine the usage of Mosaic with other geospatial libraries. -# MAGIC -# MAGIC In this example we combine the use of [Sedona](https://sedona.apache.org) and Mosaic. -# MAGIC -# MAGIC ## Setup -# MAGIC -# MAGIC This notebook will run if you have both Mosaic and Sedona installed on your cluster. -# MAGIC -# MAGIC ### Install sedona -# MAGIC -# MAGIC To install Sedona, follow the [official Sedona instructions](https://sedona.apache.org/1.4.0/setup/databricks). - -# COMMAND ---------- - -import pyspark.sql.functions as f -import mosaic as mos -from sedona.register.geo_registrator import SedonaRegistrator - -mos.enable_mosaic(spark, dbutils) # Enable Mosaic -SedonaRegistrator.registerAll(spark) # Register Sedona SQL functions - -# COMMAND ---------- - -df = spark.createDataFrame([{'wkt': 'POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))'}]) -(df - # Mosaic - .withColumn("mosaic_area", mos.st_area('wkt')) - # Sedona - .withColumn("sedona_area", f.expr("ST_Area(ST_GeomFromWKT(wkt))")) - # Sedona function not available in Mosaic - .withColumn("sedona_flipped", f.expr("ST_FlipCoordinates(ST_GeomFromWKT(wkt))")) -).show() - -# COMMAND ---------- - - diff --git a/notebooks/examples/python/NetCDF/CoralBleaching/mosaic_gdal_coral_bleaching.ipynb b/notebooks/examples/python/NetCDF/CoralBleaching/mosaic_gdal_coral_bleaching.ipynb new file mode 100644 index 000000000..585fe3f44 --- /dev/null +++ b/notebooks/examples/python/NetCDF/CoralBleaching/mosaic_gdal_coral_bleaching.ipynb @@ -0,0 +1,918 @@ +{ + "cells": 
[ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a7ef2bb5-0a2b-44db-b08d-d641e53578e2", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Analyze Coral Bleaching with Mosaic + GDAL\n", + "\n", + "> Read multiple NetCDFs using Mosaic and process through several performance-driving data engineering steps before rendering avg coral bleaching worldwide at h3 resolution `3`.\n", + "\n", + "__Notes:__\n", + "\n", + "

\n", + "\n", + "* This notebook was updated for Mosaic [0.3.12](https://github.com/databrickslabs/mosaic/releases/tag/v_0.3.12) on DBR 12.2 LTS\n", + "* [GDAL](https://gdal.org/) supported in [Mosaic](https://databrickslabs.github.io/mosaic/index.html)\n", + " * Install this GDAL [init script](https://github.com/databrickslabs/mosaic/blob/main/modules/python/gdal_package/databricks-mosaic-gdal/resources/scripts/mosaic-gdal-3.4.3-filetree-init.sh) (for DBR 12.2) on your cluster, see [[1](https://docs.databricks.com/en/init-scripts/cluster-scoped.html#use-cluster-scoped-init-scripts) | [2](https://databrickslabs.github.io/mosaic/usage/install-gdal.html)] for more.\n", + "* Recommend using an auto-scaling 2-8 worker cluster, doesn't need to be a large instance type but should use delta (aka disk) caching, more [here](https://docs.databricks.com/en/optimizations/disk-cache.html).\n", + "\n", + "--- \n", + "__Last Update:__ 21 NOV 2023 [Mosaic 0.3.12]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "70d2df7e-e1e5-400c-8065-7ff2d262ebc6", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "bbb5ae0c-2d9f-4434-a76e-54240068a64a", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Python interpreter will be restarted.\nPython interpreter will be restarted.\n" + ] + } + ], + "source": [ + "%pip install \"databricks-mosaic<0.4,>=0.3\" --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + 
"byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d17676ed-e132-4d4b-9873-993394f23068", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "GDAL enabled.\n\nGDAL 3.4.3, released 2022/04/22\n\n\n" + ] + } + ], + "source": [ + "# -- configure AQE for more compute heavy operations\n", + "# - choose option-1 or option-2 below, essential for REPARTITION!\n", + "# spark.conf.set(\"spark.databricks.optimizer.adaptive.enabled\", False) # <- option-1: turn off completely for full control\n", + "spark.conf.set(\"spark.sql.adaptive.coalescePartitions.enabled\", False) # <- option-2: just tweak partition management\n", + "\n", + "# -- import databricks + spark functions\n", + "\n", + "from pyspark.databricks.sql import functions as dbf\n", + "from pyspark.sql import functions as F\n", + "from pyspark.sql.functions import col\n", + "\n", + "# -- setup mosaic\n", + "import mosaic as mos\n", + "\n", + "mos.enable_mosaic(spark, dbutils)\n", + "mos.enable_gdal(spark)\n", + "\n", + "# -- other imports\n", + "import os" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3ec607f0-30f3-495b-b18c-f67eec55bc70", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## NetCDF Coral Bleaching Data\n", + "\n", + "> These files were uploaded from [Mosaic Test Resources](https://github.com/databrickslabs/mosaic/tree/main/src/test/resources/binary/netcdf-coral).\n", + "\n", + "__Hint:__ _Can also use [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/index.html) to move files around, e.g. 
from your local machine._" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4cf39fbb-f181-45f9-98f3-1e46c85cf61d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Download data [1x] into Workspace_\n", + "\n", + "> There are a few ways to do this; we will create a folder in our workspace; your path will look something like `/Workspace/Users/<your_user_email>/<your_folder>`. __Note: Spark cannot directly interact with Workspace files, so we will take an additional step after downloading, more [here](https://docs.databricks.com/en/files/workspace-interact.html#read-data-workspace-files).__ Workspace files are newer to Databricks and we want to make sure you get familiar with them." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "167db92f-9f4e-4706-98bb-4b8a1caee3c3", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "ws_data = \"/Workspace/Users/mjohns@databricks.com/All_Shared/mosaic_raster/NetCDF_Coral/data\"\n", + "\n", + "os.environ['WS_DATA'] = ws_data" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "63b7947d-e36f-4385-bfb3-f97715f789d5", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "total 5.5K\ndrwxrwxrwx 2 root root 4.0K Nov 21 16:31 bak\ndrwxrwxrwx 2 root root 0 Nov 21 16:31 data\n-rwxrwxrwx 1 root root 1.1K Nov 8 12:57 mosaic-gdal-3.4.3-filetree-init.sh\n-rwxrwxrwx 1 root root 0 Jan 1 1970 mosaic-gdal-coral-bleaching\n" + ] + } + ], + "source": [ +
"%sh\n", + "# this is just in the workspace initially\n", + "mkdir -p $WS_DATA\n", + "ls -lh $WS_DATA/.." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ad3aa80d-4aa9-4e68-8503-f78e3c99e068", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "File ‘/Workspace/Users/mjohns@databricks.com/All_Shared/mosaic_raster/NetCDF_Coral/data/ct5km_baa-max-7d_v3.1_20220101.nc’ already there; not retrieving.\n\nFile ‘/Workspace/Users/mjohns@databricks.com/All_Shared/mosaic_raster/NetCDF_Coral/data/ct5km_baa-max-7d_v3.1_20220102.nc’ already there; not retrieving.\n\nFile ‘/Workspace/Users/mjohns@databricks.com/All_Shared/mosaic_raster/NetCDF_Coral/data/ct5km_baa-max-7d_v3.1_20220103.nc’ already there; not retrieving.\n\nFile ‘/Workspace/Users/mjohns@databricks.com/All_Shared/mosaic_raster/NetCDF_Coral/data/ct5km_baa-max-7d_v3.1_20220104.nc’ already there; not retrieving.\n\nFile ‘/Workspace/Users/mjohns@databricks.com/All_Shared/mosaic_raster/NetCDF_Coral/data/ct5km_baa-max-7d_v3.1_20220105.nc’ already there; not retrieving.\n\nFile ‘/Workspace/Users/mjohns@databricks.com/All_Shared/mosaic_raster/NetCDF_Coral/data/ct5km_baa-max-7d_v3.1_20220106.nc’ already there; not retrieving.\n\nFile ‘/Workspace/Users/mjohns@databricks.com/All_Shared/mosaic_raster/NetCDF_Coral/data/ct5km_baa-max-7d_v3.1_20220107.nc’ already there; not retrieving.\n\nFile ‘/Workspace/Users/mjohns@databricks.com/All_Shared/mosaic_raster/NetCDF_Coral/data/ct5km_baa-max-7d_v3.1_20220108.nc’ already there; not retrieving.\n\nFile ‘/Workspace/Users/mjohns@databricks.com/All_Shared/mosaic_raster/NetCDF_Coral/data/ct5km_baa-max-7d_v3.1_20220109.nc’ already there; not retrieving.\n\nFile 
‘/Workspace/Users/mjohns@databricks.com/All_Shared/mosaic_raster/NetCDF_Coral/data/ct5km_baa-max-7d_v3.1_20220110.nc’ already there; not retrieving.\n\n" + ] + } + ], + "source": [ + "%sh \n", + "# download all the nc files used\n", + "# - '-nc' means no clobber here\n", + "wget -P $WS_DATA -nc https://github.com/databrickslabs/mosaic/raw/main/src/test/resources/binary/netcdf-coral/ct5km_baa-max-7d_v3.1_20220101.nc\n", + "wget -P $WS_DATA -nc https://github.com/databrickslabs/mosaic/raw/main/src/test/resources/binary/netcdf-coral/ct5km_baa-max-7d_v3.1_20220102.nc\n", + "wget -P $WS_DATA -nc https://github.com/databrickslabs/mosaic/raw/main/src/test/resources/binary/netcdf-coral/ct5km_baa-max-7d_v3.1_20220103.nc\n", + "wget -P $WS_DATA -nc https://github.com/databrickslabs/mosaic/raw/main/src/test/resources/binary/netcdf-coral/ct5km_baa-max-7d_v3.1_20220104.nc\n", + "wget -P $WS_DATA -nc https://github.com/databrickslabs/mosaic/raw/main/src/test/resources/binary/netcdf-coral/ct5km_baa-max-7d_v3.1_20220105.nc\n", + "wget -P $WS_DATA -nc https://github.com/databrickslabs/mosaic/raw/main/src/test/resources/binary/netcdf-coral/ct5km_baa-max-7d_v3.1_20220106.nc\n", + "wget -P $WS_DATA -nc https://github.com/databrickslabs/mosaic/raw/main/src/test/resources/binary/netcdf-coral/ct5km_baa-max-7d_v3.1_20220107.nc\n", + "wget -P $WS_DATA -nc https://github.com/databrickslabs/mosaic/raw/main/src/test/resources/binary/netcdf-coral/ct5km_baa-max-7d_v3.1_20220108.nc\n", + "wget -P $WS_DATA -nc https://github.com/databrickslabs/mosaic/raw/main/src/test/resources/binary/netcdf-coral/ct5km_baa-max-7d_v3.1_20220109.nc\n", + "wget -P $WS_DATA -nc https://github.com/databrickslabs/mosaic/raw/main/src/test/resources/binary/netcdf-coral/ct5km_baa-max-7d_v3.1_20220110.nc" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": 
"0059a460-0841-4b08-a7eb-449fc02e9827", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_For simplicity (and since we are running DBR 12.2), we are going to copy from the Workspace folder to DBFS, but this is all shifting with Unity Catalog (more [here](https://docs.databricks.com/en/dbfs/unity-catalog.html))._ __Note: [DBFS](https://docs.databricks.com/en/dbfs/dbfs-root.html), and more recent [Volumes](https://docs.databricks.com/en/data-governance/unity-catalog/index.html#volumes), are FUSE mounted to the cluster nodes, looking like a local path.__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0b297023-8bfe-4eb7-8eeb-ef6f151ec587", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "dbfs_data = \"/home/mjohns@databricks.com/datasets/netcdf-coral\"\n", + "dbfs_data_fuse = f\"/dbfs{dbfs_data}\"\n", + "os.environ['DBFS_DATA'] = dbfs_data\n", + "os.environ['DBFS_DATA_FUSE'] = dbfs_data_fuse" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2181742a-9680-4e4c-b22c-fedebb2ae4cd", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "total 7.0M\n-rwxrwxrwx 1 root root 691K Nov 21 16:31 ct5km_baa-max-7d_v3.1_20220101.nc\n-rwxrwxrwx 1 root root 692K Nov 21 16:31 ct5km_baa-max-7d_v3.1_20220102.nc\n-rwxrwxrwx 1 root root 701K Nov 21 16:31 ct5km_baa-max-7d_v3.1_20220103.nc\n-rwxrwxrwx 1 root root 704K Nov 21 16:31 ct5km_baa-max-7d_v3.1_20220104.nc\n-rwxrwxrwx 1 root root 710K Nov 21 16:31 ct5km_baa-max-7d_v3.1_20220105.nc\n-rwxrwxrwx 1 root root 714K Nov 21 16:31 
ct5km_baa-max-7d_v3.1_20220106.nc\n-rwxrwxrwx 1 root root 718K Nov 21 16:31 ct5km_baa-max-7d_v3.1_20220107.nc\n-rwxrwxrwx 1 root root 720K Nov 21 16:31 ct5km_baa-max-7d_v3.1_20220108.nc\n-rwxrwxrwx 1 root root 722K Nov 21 16:31 ct5km_baa-max-7d_v3.1_20220109.nc\n-rwxrwxrwx 1 root root 726K Nov 21 2023 ct5km_baa-max-7d_v3.1_20220110.nc\n" + ] + } + ], + "source": [ + "%sh \n", + "# copy from workspace\n", + "# - for spark / distributed work\n", + "mkdir -p $DBFS_DATA_FUSE\n", + "cp -r $WS_DATA/* $DBFS_DATA_FUSE\n", + "ls -lh $DBFS_DATA_FUSE" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9d0b3a64-e93c-4102-b0df-77a57851497b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Read NetCDFs with Spark\n", + "\n", + "> Uses Mosaic [GDAL readers](https://databrickslabs.github.io/mosaic/api/raster-format-readers.html#raster-format-readers). __Note: starting with Mosaic 0.3.12, the 'tile' column is populated and is used by various `rst_` functions.__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1486e9fc-719b-4151-bb95-73618dd5c654", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 
10\n+--------------------+--------------------+------+-------------------+------+------+---------+--------------------+--------------------+----+--------------------+\n| path| modificationTime|length| uuid|x_size|y_size|bandCount| metadata| subdatasets|srid| tile|\n+--------------------+--------------------+------+-------------------+------+------+---------+--------------------+--------------------+----+--------------------+\n|dbfs:/home/mjohns...|1970-01-20 16:23:...|743047|5240782214809708542| 512| 512| 0|{SUBDATASET_1_DES...|{SUBDATASET_1_DES...| 0|{null, �HDF\\r\\n\u001A\\...|\n+--------------------+--------------------+------+-------------------+------+------+---------+--------------------+--------------------+----+--------------------+\n\n" + ] + } + ], + "source": [ + "df = (\n", + " spark\n", + " .read.format(\"gdal\")\n", + " .option(\"driverName\", \"NetCDF\")\n", + " .load(dbfs_data)\n", + ")\n", + "print(f\"count? {df.count():,}\")\n", + "df.limit(1).show() # <- limiting display for ipynb output only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3ec2909b-f883-4a37-b183-d5c715d3e5ba", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Let's work with the \"bleaching_alert_area\" subdataset.__\n", + "\n", + "> We are using `rst_subdataset` which uses the (new) 'tile' column, more [here](https://databrickslabs.github.io/mosaic/api/raster-functions.html#rst-getsubdataset).\n", + "\n", + "SubDataset 'tile' output looks something like...\n", + "\n", + "```\n", + "{\"index_id\":null,\"raster\":\"Q0RGAQAAAAAAAAAKAAAAAwAAAANsb24AAAAcIAAAAANsYXQAAAAOEAAAAAR0aW1lAAAAAQAAAAwAAAA7AAAAD2Fja25vd2xlZGdlbWVudAAAAAACAAAAHU5PQUEgQ29yYWwgUmVlZiB\n", + "XYXRjaCBQcm9ncmFtAAAAAAAADWNkbV8= 
(truncated)\",\"parentPath\":\"dbfs:/home/mjohns@databricks.com/datasets/netcdf-coral/ct5km_baa-max-7d_v3.1_20220110.nc\",\"driver\":\"netCDF\"}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "16190122-2654-4e0b-b2b8-35c35f9a2efc", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 10\n+--------------------+\n| tile|\n+--------------------+\n|{null, CDF\u0001\u0000\u0000\u0000\u0000\u0000\u0000...|\n+--------------------+\n\n" + ] + } + ], + "source": [ + "df_bleach = (\n", + " df\n", + " .repartition(df.count(), \"tile\")\n", + " .select(\n", + " mos\n", + " .rst_getsubdataset(\"tile\", F.lit(\"bleaching_alert_area\"))\n", + " .alias(\"tile\")\n", + " )\n", + ")\n", + "print(f\"count? {df_bleach.count():,}\")\n", + "df_bleach.limit(1).show() # <- `.display()` is prettier in databricks" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d32b3241-eca9-406c-b751-55749cb62638", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## SubDivide tiles from subdataset column to max of 8MB\n", + "\n", + "> While this is optional for smaller data, we want to demonstrate how you can master tiling at any scale. 
Let's use [rst_subdivide](https://databrickslabs.github.io/mosaic/api/raster-functions.html#rst-subdivide) to ensure we have tiles no larger than 8MB.\n", + "\n", + "SubDivide 'tile' output looks something like...\n", + "\n", + "```\n", + "{\"index_id\":null,\"raster\":\"iUhERg0KGgoAAAAAAAgIAAQAEAAAAAAAAAAAAAAAAAD//////////6WRBAAAAAAA//////////8AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n", + "AAAAAAAAAAAAAT0hEUgINSAMCIgAAAAAAAwUAAAAAAAAA//////////8= (truncated)\",\"parentPath\":\"dbfs:/home/mjohns@databricks.com/datasets/netcdf-coral/\n", + "ct5km_baa-max-7d_v3.1_20220103.nc\",\"driver\":\"netCDF\"}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "149d86d3-57fb-4ae7-97bb-b95997032b3c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 40\n+--------------------+\n| tile|\n+--------------------+\n|{null, �HDF\\r\\n\u001A\\...|\n+--------------------+\n\n" + ] + } + ], + "source": [ + "df_subdivide = (\n", + " df_bleach\n", + " .repartition(df_bleach.count(), \"tile\") # <- repartition important!\n", + " .select(\n", + " mos\n", + " .rst_subdivide(col(\"tile\"), F.lit(8))\n", + " .alias(\"tile\")\n", + " )\n", + ")\n", + "print(f\"count? 
{df_subdivide.count():,}\") # <- go from 10 to 40 tiles\n", + "df_subdivide.limit(1).show() # <- `.display()` is prettier in databricks" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "56b969c9-837d-4d8b-b74e-a8ee9dab8489", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## ReTile tiles from subdataset to 600x600 pixels\n", + "\n", + "> While this is optional for smaller data, we want to demonstrate how you can master tiling at any scale. Let's use [rst_retile](https://databrickslabs.github.io/mosaic/api/raster-functions.html#rst-retile) to ensure we have even data and drive more parallelism.\n", + "\n", + "_ReTile 'tile' output looks something like..._\n", + "\n", + "```\n", + "{\"index_id\":null,\"raster\":\"iUhERg0KGgoAAAAAAAgIAAQAEAAAAAAAAAAAAAAAAAD//////////9t5AQAAAAAA//////////8AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT0hEUgINSAMCIgAAAAAAAwUAAAAAAAAA//////////8= (truncated)\",\"parentPath\":\"dbfs:/home/mjohns@databricks.com/datasets/netcdf-coral/ct5km_baa-max-7d_v3.1_20220102.nc\",\"driver\":\"netCDF\"}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fe221c80-655f-4556-a68c-8c3feb6dcacf", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 
463\n+--------------------+\n| tile|\n+--------------------+\n|{null, �HDF\\r\\n\u001A\\...|\n+--------------------+\n\n" + ] + } + ], + "source": [ + "df_retile = (\n", + " df_subdivide\n", + " .repartition(df_subdivide.count(), \"tile\") # <- repartition important!\n", + " .select(\n", + " mos\n", + " .rst_retile(col(\"tile\"), F.lit(600), F.lit(600))\n", + " .alias(\"tile\")\n", + " )\n", + ")\n", + "print(f\"count? {df_retile.count():,}\") # <- go from 40 to 463 tiles\n", + "df_retile.limit(1).show() # <- `.display()` is prettier in databricks" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "943ab1ab-d881-4cd1-a1da-e7a012c0c1b1", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Render Raster to H3 Results\n", + "\n", + "> Use [rst_rastertogridavg](https://databrickslabs.github.io/mosaic/api/raster-functions.html#rst-rastertogridavg) to tessellate to grid (default is h3) and provide the average measure for the resolution chosen (in this case resolution `3`); also, creates a temp view & renders with Kepler.gl.\n", + "\n", + "Initial structure of a single `grid_avg` row looks something like...\n", + "\n", + "```\n", + "[\n", + " [\n", + " {\"cellID\":\"592144529759404031\",\"measure\":0},{\"cellID\":\"592849935188099071\",\"measure\":0.8013245033112583},{\"cellID\":\"592834816903217151\",\"measure\":0}, {\"cellID\":\"592823203311648767\",\"measure\":0.9726027397260274},{\"cellID\":\"592323818874208255\",\"measure\":0.028328611898016998}, \n", + " {\"cellID\":\"592306295407640575\",\"measure\":1.8468899521531101},{\"cellID\":\"592143224089346047\",\"measure\":1},{\"cellID\":\"592851447016587263\",\"measure\":0.9979123173277662}, \n", + " {\"cellID\":\"592849316712808447\",\"measure\":0.4621676891615542},{\"cellID\":\"592136833178009599\",\"measure\":0.06970509383378017}, \n", + " 
{\"cellID\":\"592314885342232575\",\"measure\":0}, {\"cellID\":\"592832274282577919\",\"measure\":0},{\"cellID\":\"592831861965717503\",\"measure\":0} \n", + " ... (truncated)\n", + " ]\n", + "]\n", + "```\n", + "\n", + "Data ultimately looks something like...\n", + "\n", + "| h3 | measure |\n", + "| --- | ------- |\n", + "| 593176490141548543 | 0 |\n", + "| 593386771740360703 | 2.0113207547169814 |\n", + "| 593308294097928191 | 0 |\n", + "| 593825202001936383 | 0.015432098765432098 |\n", + "| 593163914477305855 | 2.008650519031142 |\n", + "\n", + "__Hint: zoom back out once rendered; also, verify the `.contains()` string is actually in the data. Also, this can take a few minutes to run, recommend a few nodes (min. 3 to say 8) in your cluster to speed up processing__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0350f7f4-76b3-4260-943f-5a6ca4a93e28", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# here is the initial structure\n", + "# - notice the array nesting, which we will handle\n", + "# by exploding 2x\n", + "# display (\n", + "# df_retile\n", + "# .limit(5)\n", + "# .select(\n", + "# mos.rst_rastertogridavg(\"tile\", F.lit(3))\n", + "# .alias(\"grid_avg\")\n", + "# )\n", + "# )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a4e90704-b929-406a-bfb0-13fe98ad919b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Prepare a View for rendering with Kepler + other analysis._\n", + "\n", + "> This generates 241,486 rows (row per cellid at h3 resolution `3`)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "be6c49c3-de9b-4321-8921-d15d788dfd96", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 241486\n" + ] + } + ], + "source": [ + "# create view \"to_display\"\n", + "# - you could also write to Delta Lake \n", + "# at any point to avoid recomputing\n", + "(\n", + " df_retile\n", + " .repartition(df_retile.count(), \"tile\")\n", + " .select(mos.rst_rastertogridavg(\"tile\", F.lit(3)).alias(\"grid_avg\"))\n", + " .select(F.explode(col(\"grid_avg\")).alias(\"grid_avg\")) # <- explode-1 of 2d array\n", + " .select(F.explode(col(\"grid_avg\")).alias(\"grid_avg\")) # <- explode-2 of 2d array\n", + " .select(\n", + " F.col(\"grid_avg\").getItem(\"cellID\").alias(\"h3\"), # <- h3 cellid\n", + " F.col(\"grid_avg\").getItem(\"measure\").alias(\"measure\") # <- coral bleaching\n", + " )\n", + " .createOrReplaceTempView(\"to_display\")\n", + ")\n", + "\n", + "# optional: can work with the view in sql\n", + "# - you would probably want to write to delta lake \n", + "# to avoid recompute\n", + "# print(f\"\"\"count? 
{spark.table(\"to_display\").count():,}\"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5ac256db-8140-4176-90b0-0b09f3817eaa", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "%sql\n", + "-- optional: can work with the view in sql\n", + "-- you would probably want to write to delta lake \n", + "-- to avoid recompute\n", + "-- select * from to_display" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "23fa459c-c73e-4013-938c-c7cf138ace7a", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Render with Kepler.gl via Mosaic magic._\n", + "\n", + "> Hint: zoom out within the map viewport to see all available data rendered." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "31d5098d-c49c-4efe-8f38-8b19456363b5", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c7d7ae57-e9cf-4c3e-a8b4-368f732aeaab", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment 
the following within databricks for actual results._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b1f68ace-3253-4a06-b110-e52463772043", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# \"to_display\" \"h3\" \"h3\" 250_000" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c371f1d0-7e8f-4426-bc35-096922f3a63c", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Databricks Lakehouse can read / write most any data format\n", + "\n", + "> Here are [built-in](https://docs.databricks.com/en/external-data/index.html) formats as well as Mosaic [readers](https://databrickslabs.github.io/mosaic/api/api.html). __Note: best performance with Delta Lake format__, ref [Databricks](https://docs.databricks.com/en/delta/index.html) and [OSS](https://docs.delta.io/latest/index.html) docs for Delta Lake. Beyond built-in formats, Databricks is a platform on which you can install a wide variety of libraries, e.g. 
[1](https://docs.databricks.com/en/libraries/index.html#python-environment-management) | [2](https://docs.databricks.com/en/compute/compatibility.html) | [3](https://docs.databricks.com/en/init-scripts/index.html).\n", + "\n", + "Example of [reading](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameReader.html?highlight=read#pyspark.sql.DataFrameReader) and [writing](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.html?highlight=pyspark%20sql%20dataframe%20writer#pyspark.sql.DataFrameWriter) a Spark DataFrame with Delta Lake format.\n", + "\n", + "```\n", + "# - `write.format(\"delta\")` is default in Databricks\n", + "# - can save to a specified path in the Lakehouse\n", + "# - can save as a table in the Databricks Metastore\n", + "df.write.save(\"<some_path>\")\n", + "df.write.saveAsTable(\"<some_table>\")\n", + "```\n", + "\n", + "Example of loading a Delta Lake Table as a Spark DataFrame.\n", + "\n", + "```\n", + "# - `read.format(\"delta\")` is default in Databricks\n", + "# - can load a specified path in the Lakehouse\n", + "# - can load a table in the Databricks Metastore\n", + "spark.read.load(\"<some_path>\")\n", + "spark.table(\"<some_table>\")\n", + "```\n", + "\n", + "More on [Unity Catalog](https://docs.databricks.com/en/data-governance/unity-catalog/index.html) in Databricks Lakehouse for Governing [Tables](https://docs.databricks.com/en/data-governance/unity-catalog/index.html#tables) and [Volumes](https://docs.databricks.com/en/data-governance/unity-catalog/index.html#volumes)."
+ ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 85549841912349, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "mosaic_gdal_coral_bleaching", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/examples/python/NetCDF/README.md b/notebooks/examples/python/NetCDF/README.md new file mode 100644 index 000000000..37d473bee --- /dev/null +++ b/notebooks/examples/python/NetCDF/README.md @@ -0,0 +1,5 @@ +# NetCDF Examples + +> Some examples loading NetCDF into Databricks. + +__Note: `ipynb` files can be previewed in GitHub and can also be imported into Databricks, more [here](https://docs.databricks.com/en/notebooks/notebook-export-import.html).__ diff --git a/notebooks/examples/python/NetCDF/Raster processing with netCDF + Mosaic.py b/notebooks/examples/python/NetCDF/Raster processing with netCDF + Mosaic.py deleted file mode 100644 index ba606fab5..000000000 --- a/notebooks/examples/python/NetCDF/Raster processing with netCDF + Mosaic.py +++ /dev/null @@ -1,625 +0,0 @@ -# Databricks notebook source -# MAGIC %md # Raster processing with netCDF + Mosaic -# MAGIC -# MAGIC **Prereqs:** -# MAGIC -# MAGIC * Install `netCDF4` from PyPI -# MAGIC * Install `org.locationtech.jts:jts-io:1.19.0` from Maven -# MAGIC * Install `mosaic-0.33` from [Project Mosaic (Release v0.3.3)](https://github.com/databrickslabs/mosaic/releases/tag/v0.3.3) -# MAGIC -# MAGIC **Notes:** -# MAGIC -# MAGIC * Requires [numpy](http://numpy.scipy.org) and netCDF/HDF5 C libraries. 
-# MAGIC * Github site: https://github.com/Unidata/netcdf4-python -# MAGIC * Online docs: http://unidata.github.io/netcdf4-python/ -# MAGIC * Based on Konrad Hinsen's old [Scientific.IO.NetCDF](http://dirac.cnrs-orleans.fr/plone/software/scientificpython/) API, with lots of added netcdf version 4 features. -# MAGIC * Developed by Jeff Whitaker at NOAA, with many contributions from users. - -# COMMAND ---------- - -# MAGIC %md -# MAGIC -# MAGIC ## Part I - Ingest netCDF in Parallel to Spark Dataframe - -# COMMAND ---------- - -import netCDF4 -import numpy as np - -# COMMAND ---------- - -# MAGIC %md -# MAGIC -# MAGIC #### Inspect netcdf file with driver code - -# COMMAND ---------- - -f = netCDF4.Dataset('/dbfs/ml/blogs/geospatial/data/netcdf/tables/SkyWise_CONUS_SurfaceAnalysis_Daily_20190220_000000-f4048.nc') -print(f) - -# COMMAND ---------- - -lat, lon, temp = f.variables['latitude'], f.variables['longitude'], f.variables['maximum_temperature'] -print(lat) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC -# MAGIC #### Get a parameter list - in this case, a list of ~20 files (replace this with thousands of files) - -# COMMAND ---------- - -file_list = [] -date_file_list = dbutils.fs.ls("/ml/blogs/geospatial/data/netcdf/tables") -for fi in date_file_list: - sub_list = dbutils.fs.ls(fi.path) - file_list.append(sub_list[0].path.replace("dbfs:", "/dbfs")) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC -# MAGIC #### Test out Single-threaded transformation - -# COMMAND ---------- - -import pandas as pd -from itertools import product - -def get_temp(filename): - current_file = filename - f = netCDF4.Dataset(current_file) - lat, lon, temp = f.variables['latitude'], f.variables['longitude'], f.variables['maximum_temperature'] - - ydim = len(lon[0,:]) - xdim = len(lat[:,0]) - - new_temp = temp[:].reshape([ydim*xdim, 1]) - prod = product(lat[:,0], lon[0,:]) - df = pd.DataFrame(prod, columns=['lat', 'lon']) - df['temp'] = new_temp - f.close() - return(df) - - -output_df = 
get_temp('/dbfs/ml/blogs/geospatial/data/netcdf/tables/SkyWise_CONUS_SurfaceAnalysis_Daily_20190220_000000-f4048.nc') -output_df.head() - -# COMMAND ---------- - -output_schema = spark.createDataFrame(output_df).schema -output_schema - -# COMMAND ---------- - -# MAGIC %md -# MAGIC -# MAGIC #### Write and Run Pandas UDF to coalesce File Info Into Spark Data Frame - -# COMMAND ---------- - -from pyspark.sql.functions import pandas_udf, col -from pyspark.sql.functions import PandasUDFType -from pyspark.sql.types import * - -sdf = spark.createDataFrame(sc.parallelize(file_list), StringType()) - -@pandas_udf(output_schema, functionType=PandasUDFType.GROUPED_MAP) -def p_get_temp(filename): - current_file = filename.iloc[0]['value'] - print('current_file type is', type(current_file)) - print('current_file is: ', current_file) - f = netCDF4.Dataset(current_file) - lat, lon, temp = f.variables['latitude'], f.variables['longitude'], f.variables['maximum_temperature'] - - ydim = len(lon[0,:]) - xdim = len(lat[:,0]) - - new_temp = temp[:].reshape([ydim*xdim, 1]) - prod = product(lat[:,0], lon[0,:]) - df = pd.DataFrame(prod, columns=['lat', 'lon']) - df['temp'] = new_temp - f.close() - return(df) - -output_df = sdf.groupBy(col("value")).apply(p_get_temp) - -# COMMAND ---------- - -display(output_df) - -# COMMAND ---------- - -output_df.createOrReplaceTempView("rp_weather_netcdf_delta") - -# COMMAND ---------- - -# MAGIC %md -# MAGIC -# MAGIC ## Part II - Add Geospatial Index for Large-Scale Geospatial Analytics - -# COMMAND ---------- - -# MAGIC %sh -# MAGIC -# MAGIC sudo apt-get install -y cmake - -# COMMAND ---------- - -# MAGIC %scala -# MAGIC -# MAGIC // UDFs for comparison with Mosaic -# MAGIC // using JTS + H3 directly -# MAGIC -# MAGIC import com.locationtech.jts.geom.{Coordinate, Geometry, GeometryFactory} -# MAGIC import com.uber.h3core.H3Core; -# MAGIC import com.uber.h3core.util.GeoCoord; -# MAGIC import scala.collection.JavaConversions._ -# MAGIC import 
scala.collection.mutable.ListBuffer -# MAGIC import scala.collection.JavaConverters._ -# MAGIC -# MAGIC object H3 extends Serializable { -# MAGIC private val _instance = H3Core.newInstance(); -# MAGIC def instance() = _instance -# MAGIC } -# MAGIC -# MAGIC def geoToH3 = udf((latitude:Double, longitude:Double, resolution:Int)=>{ -# MAGIC //h3.instance.geoToH3Address(latitude, longitude, resolution); //--> Long.toHexString(long h3) -# MAGIC H3.instance.geoToH3(latitude, longitude, resolution) -# MAGIC }) -# MAGIC -# MAGIC def polygonToH3 = udf((geometry: Geometry, resolution: Int)=>{ -# MAGIC var points: java.util.List[com.uber.h3core.util.GeoCoord] = List(); -# MAGIC var holes: java.util.List[java.util.List[com.uber.h3core.util.GeoCoord]] = List(); -# MAGIC if (geometry.getGeometryType == "Polygon"){ -# MAGIC points = ListBuffer(geometry.getCoordinates().toList.map(coord => new GeoCoord(coord.y,coord.x)) : _*); -# MAGIC } -# MAGIC asScalaBuffer(H3.instance.polyfill(points, holes ,resolution)).toList -# MAGIC }); -# MAGIC -# MAGIC def multiPolygonToH3 = udf((geometry: Geometry, resolution: Int)=>{ -# MAGIC var points: java.util.List[com.uber.h3core.util.GeoCoord] = List(); -# MAGIC var holes: java.util.List[java.util.List[com.uber.h3core.util.GeoCoord]] = List(); -# MAGIC if (geometry.getGeometryType == "MultiPolygon"){ -# MAGIC val numGeometries = geometry.getNumGeometries(); -# MAGIC if (numGeometries > 0){ -# MAGIC points = ListBuffer(geometry.getGeometryN(0).getCoordinates().toList.map(coord => new GeoCoord(coord.y,coord.x)): _*); -# MAGIC } -# MAGIC if (numGeometries >1){ -# MAGIC holes = ListBuffer((1 to (numGeometries-1)).toList.map(n => { -# MAGIC val templist: java.util.List[com.uber.h3core.util.GeoCoord] = ListBuffer(geometry.getGeometryN(n).getCoordinates().toList.map(coord => new GeoCoord(coord.y,coord.x)): _*) -# MAGIC templist -# MAGIC }): _*); -# MAGIC } -# MAGIC } -# MAGIC asScalaBuffer(H3.instance.polyfill(points, holes ,resolution)).toList -# MAGIC 
}); - -# COMMAND ---------- - -# MAGIC %python -# MAGIC -# MAGIC geoToH3 = udf(geo_to_h3) - -# COMMAND ---------- - -# MAGIC %scala -# MAGIC -# MAGIC import org.apache.spark.sql.functions._ -# MAGIC -# MAGIC val res = 8 -# MAGIC val weather_df = spark.sql("select temp, lat, lon from rp_weather_netcdf_delta").withColumn("h3_index", geoToH3(col("lat"), col("lon"), lit(res))) -# MAGIC weather_df.write.mode("overwrite").format("delta").saveAsTable("rp_weather_netcdf_delta_silver2") - -# COMMAND ---------- - -# MAGIC %sql -# MAGIC -# MAGIC OPTIMIZE RP_WEATHER_NETCDF_DELTA_SILVER2 -# MAGIC ZORDER BY H3_INDEX - -# COMMAND ---------- - -# MAGIC %md -# MAGIC -# MAGIC ### For that Home Depot in Antarctica, What is the Temperature? -# MAGIC -# MAGIC - -# COMMAND ---------- - -# MAGIC %md -# MAGIC -# MAGIC As long as we set the resolution large enough, our hexagon will refer to a region where we can report the temperature for Home Depot. - -# COMMAND ---------- - -# MAGIC %sql -# MAGIC -# MAGIC select * from RP_WEATHER_NETCDF_DELTA_SILVER2 - -# COMMAND ---------- - -# MAGIC %scala -# MAGIC -# MAGIC val res = 8 -# MAGIC val home_depot_lat_long = Seq((31, -100.80)) -# MAGIC -# MAGIC val h3_lat_range = home_depot_lat_long.toDF("lat", "lon").withColumn("hd_h3_idx", geoToH3(col("lat"), col("lon"), lit(res))) -# MAGIC -# MAGIC display(h3_lat_range) - -# COMMAND ---------- - -# MAGIC %sql -# MAGIC -# MAGIC select * from RP_WEATHER_NETCDF_DELTA_SILVER2 -# MAGIC where h3_index = 613765671894908927 - -# COMMAND ---------- - -@pandas_udf(output_schema, functionType=PandasUDFType.GROUPED_MAP) -def p_get_raster(filename): - current_file = filename.iloc[0]['value'] - print('current_file type is', type(current_file)) - print('current_file is: ', current_file) - f = netCDF4.Dataset(current_file) - raster_data = netCDF4.raster(f) - lat, lon, temp = f.variables['latitude'], f.variables['longitude'], f.variables['maximum_temperature'] - - ydim = len(lon[0,:]) - xdim = len(lat[:,0]) - - new_temp 
= temp[:].reshape([ydim*xdim, 1]) - prod = product(lat[:,0], lon[0,:]) - df = pd.DataFrame(prod, columns=['lat', 'lon']) - df['raster'] = raster_data - df['temp'] = new_temp - f.close() - return(df) - -output_df = sdf.groupBy(col("value")).apply(p_get_raster) - -# COMMAND ---------- - - - -# COMMAND ---------- - - - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Extract the CRS (coordinate reference system) associated with the dataset identifying where the raster is located in geographic space. - -# COMMAND ---------- - -@udf(returnType=StringType()) -def get_crs(content): - # Read the in-memory tiff file - with MemoryFile(bytes(content)) as memfile: - with memfile.open() as data: - # Use netcdf with the data object - return str(data.crs) - -df_bin.withColumn("crs", get_crs("content")).display() - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Extract masks from images -# MAGIC An image mask identifies the regions of the image where there is valid data to be processed. - -# COMMAND ---------- - -from pyspark.sql.types import ArrayType, StringType - -@udf(returnType=ArrayType(StringType())) -def get_mask_shapes(content): - geometries = [] - - # Read the in-memory tiff file - with MemoryFile(bytes(content)) as memfile: - with memfile.open() as data: - - # Read the dataset's valid data mask as a ndarray. - mask = data.dataset_mask() - - # Extract feature shapes and values from the array. - for geom, val in nc.features.shapes( - mask, transform=data.transform): - - if val > 0: # Only append shapes that have a positive maks value - - # Transform shapes from the dataset's own coordinate - # reference system to CRS84 (EPSG:4326). 
- geom = nc.warp.transform_geom( - data.crs, 'EPSG:4326', geom, precision=6) - - geometries.append(json.dumps(geom)) - - return geometries - - -# COMMAND ---------- - -df_masks = (df_bin - .withColumn("mask_json_shapes", get_mask_shapes("content")) - .withColumn("mask_json", explode("mask_json_shapes")) - # Convert geoJSON to WKB - .withColumn("mask_wkb", mos.st_aswkb(mos.st_geomfromgeojson("mask_json"))) - .drop("content", "mask_json_shapes", "mask_json") - .cache() # Caching while developing, TODO: Remove in prod - ) -df_masks.display() - -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC df_masks "mask_wkb" "geometry" - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Tessellate with Mosaic - -# COMMAND ---------- - -df_chips = (df_masks - # Tessellate with Mosaic - .withColumn("chips", mos.grid_tessellateexplode("mask_wkb", lit(h3_resolution))) - .select("path", "modificationTime", "chips.*") - .withColumn("chip_geojson", mos.st_asgeojson("wkb")) - .cache() # TODO remove for distribution - ) -df_chips.display() - -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC df_chips "wkb" "geometry" 10000 - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Pixels to chips - -# COMMAND ---------- - -import pyspark.sql.functions as F -import numpy as np -from pyspark.sql.types import ArrayType, StringType, DoubleType, StructType, StructField, LongType, IntegerType -import nc.mask - -schema = ArrayType( - StructType([ - StructField("values", ArrayType(DoubleType())), - StructField("nonzero_pixel_count", IntegerType()), - ])) - -@udf(returnType=schema) -def get_shapes_avg(content, chips): - chip_values = [] - - # Read the in-memory tiff file - with MemoryFile(bytes(content)) as memfile: - with memfile.open() as data: - - for chip in chips: - chip_geojson = json.loads(chip) # Chip in GeoJSON format - geom = nc.warp.transform_geom('EPSG:4326', data.crs, chip_geojson, precision=6) # Project chips to the image CRS - out_image, out_transform = nc.mask.mask(data, 
[geom], crop=True, filled=False) # Crop the image on a shape containing the chip - - val = np.average(out_image, axis=(1,2)).tolist() # Aggregated by band - nonzeroes = np.count_nonzero(out_image.mask) # Cound pixels within the chip shape - - chip_values.append({ - "values": val, # Aggregated pixel values by band - "nonzero_pixel_count": nonzeroes # Number of pixels within the mask shape - }) - - return chip_values - -df_chipped = (df_chips - .groupBy("path", "modificationTime") - .agg(F.collect_list(F.struct("chip_geojson", "index_id", "wkb", "is_core")).alias("chips")) # Collecting the list of chips - .join(df_bin, ["path", "modificationTime"]) # Join with the original data files - .withColumn("chip_values", get_shapes_avg(col("content"), F.expr("chips.chip_geojson"))) # Execute UDF to aggregate pixels for each chip - .withColumn("zipped_chip_values", F.arrays_zip("chips", "chip_values")) - .withColumn("zipped_chip_value", F.explode("zipped_chip_values")) # Explode result array in multiple rows - .select( # Select only relevant columns - col("path"), - col("modificationTime"), - F.expr("zipped_chip_value.chips.*"), - F.expr("zipped_chip_value.chip_values.*"), -# col("chip.index_id").cast("long").alias("index_id"), -# col("chip.nonzero_pixel_count").alias("nonzero_pixel_count"), - F.expr("zipped_chip_value.chip_values.values[0]").alias("value_band_0") - ) - .cache() # TODO: Remove in production - ) -df_chipped.display() - -# COMMAND ---------- - -import pyspark.sql.functions as F - -from pyspark.sql.types import ArrayType, StringType, DoubleType, StructType, StructField, LongType, IntegerType - -schema = ArrayType( - StructType([ - StructField("index_id", LongType()), - StructField("values", ArrayType(DoubleType())), - StructField("nonzero_pixel_count", IntegerType()), - ])) - -@udf(returnType=schema) -def get_shapes_avg(content, chips): - chip_values = [] - - # Read the in-memory tiff file - with MemoryFile(bytes(content)) as memfile: - with memfile.open() as 
data: - - for chip in chips: - chip_geojson = json.loads(chip["chip_geojson"]) # Chip in GeoJSON format - geom = nc.warp.transform_geom('EPSG:4326', data.crs, chip_geojson, precision=6) # Project chips to the image CRS - out_image, out_transform = nc.mask.mask(data, shapes, crop=True, filled=False) # Crop the image on a shape containing the chip - - val = np.average(out_image, axis=(1,2)).tolist() # Aggregated by band - nonzeroes = np.count_nonzero(out_image.mask) # Cound pixels within the chip shape - - chip_values.append({ - "index_id": chip["index_id"], # H3 index ID - "values": val, # Aggregated pixel values by band - "nonzero_pixel_count": nonzeroes # Number of pixels within the mask shape - }) - - return chip_values - -df_chipped = (df_chips - .groupBy("path", "modificationTime") - .agg(F.collect_list(F.struct("chip_geojson", "index_id", "wkb", "is_core")).alias("chips")) # Collecting the list of chips - .join(df_bin, ["path", "modificationTime"]) # Join with the original data files - .withColumn("chip_values", get_shapes_avg("content", "chips")) # Execute UDF to aggregate pixels for each chip - .withColumn("chip", F.explode("chip_values")) # Explode result array in multiple rows - .select( # Select only relevant columns - col("path"), - col("modificationTime"), - col("chip.index_id").cast("long").alias("index_id"), - col("chip.nonzero_pixel_count").alias("nonzero_pixel_count"), - F.expr("chip.values[0]").alias("value_band_0") - ) - .cache() # TODO: Remove in production - ) -df_chipped.display() - -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC df_chipped "wkb" "geometry" 10000 - -# COMMAND ---------- - -# MAGIC %md -# MAGIC In the Raster TIFF file the shape is the rectangle that contains image (we can get this from bounds). We need to cut the image with the same technique, but on top of that we need find all the pixels that fall within each cell and run an aggregation for those pixels (min/max/average/median/etc.). 
-# MAGIC In order to do that we need to use functions like masking or rasterize to get the portion of the image that corresponds to each grid cell. -# MAGIC This will generate an aggregated information based on the grid cells. We can store that in a table and visualise it, join it with tesselated vectors etc. - -# COMMAND ---------- - -# MAGIC %md ## Join data - environmental exposure - -# COMMAND ---------- - -# Path to directory of the csv for impact assessment -EXP_FILE = "California_Fire_Perimeters.csv" -exposure_fire = spark.read.format("csv").option("header","true").load(DATA_DIR+"/"+EXP_FILE) - -# COMMAND ---------- - -display(exposure_fire) - -# COMMAND ---------- - -exposure_fire.count() - -# COMMAND ---------- - -exposure_fire_tf = ( - exposure_fire - .drop("ID") - .withColumn("latitude",col("latitude").cast(DoubleType())) - .withColumn("longitude",col("longitude").cast(DoubleType())) - .withColumn("geom", mos.st_astext(mos.st_point(col("longitude"), col("latitude")))) # First we need to creating a new Mosaic Point geometry, and afterwards translate a geometry into its Well-known Text (WKT) representation -) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC We can use Mosaic functionality to identify how to best index our data based on the data inside the specific dataframe.
-# MAGIC Selecting an appropriate indexing resolution can have a considerable impact on the performance.
- -# COMMAND ---------- - -firesWithIndex = (exppsure_fire_tf - .withColumn("index_id", mos.grid_pointascellid(col("geom"), lit(h3_resolution))) -) - -# COMMAND ---------- - -display(firesWithIndex) - -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC firesWithIndex "geom" "geometry" 100000 - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Aggregation - -# COMMAND ---------- - -withFirePerimeter = ( - firesWithIndex.join( - df_chipped, - on="index_id", - how="right") - .where( - # If the borough is a core chip (the chip is fully contained within the geometry), then we do not need - # to perform any intersection, because any point matching the same index will certainly be contained in - # the borough. Otherwise we need to perform an st_contains operation on the chip geometry. - col("is_core") | mos.st_contains(col("wkb"), col("geom"))) - .groupBy(["index_id", "wkb", "value_band_0", "nonzero_pixel_count", "is_core"]) - .agg(F.count("geom").alias("point_count")) - .cache() # TODO: Remove in production -# drop("count") -) - -display(withFirePerimeter) - -# COMMAND ---------- - -withFirePerimeter.count() - -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC withFirePerimeter "wkb" "geometry" 100000 - -# COMMAND ---------- - -# MAGIC %md ### ```ST_Contains``` of Raster and exposure data - -# COMMAND ---------- - -withFirePerimeterExposure = ( - firesWithIndex.join( - df_chipped, - on="index_id", - how="inner") - .where( - # If the borough is a core chip (the chip is fully contained within the geometry), then we do not need - # to perform any intersection, because any point matching the same index will certainly be contained in - # the borough. Otherwise we need to perform an st_contains operation on the chip geometry. 
- col("is_core") | mos.st_contains(col("wkb"), col("geom"))) - .groupBy(["index_id", "wkb", "value_band_0", "nonzero_pixel_count", "is_core"]) - .agg(F.count("geom").alias("point_count")) - .cache() # TODO: Remove in production -# drop("count") -) - -display(withFirePerimeterExposure) - -# COMMAND ---------- - -display(withFirePremeterExposure) - -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC withFirePerimeterExposure "wkb" "geometry" 100000 diff --git a/notebooks/examples/python/NetCDF/Xarray/distributed_slice netcdf_files.ipynb b/notebooks/examples/python/NetCDF/Xarray/distributed_slice netcdf_files.ipynb new file mode 100644 index 000000000..d11b87bdb --- /dev/null +++ b/notebooks/examples/python/NetCDF/Xarray/distributed_slice netcdf_files.ipynb @@ -0,0 +1,4839 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fc0fc4ed-6933-4793-b939-612fd6fcd1f7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Slice NetCDFs [Distributed]\n", + "\n", + "## Overview\n", + "\n", + "> This notebook demonstrates how to open and explore the netCDF file, and slice it using Spark + Xarray. Data is sliced by time, latitude, and longitude attributes.\n", + "\n", + "__Examples:__\n", + "\n", + "

\n", + "\n", + "* Single File: slice example with flattening\n", + "* Distributed: slice examples with / without flattening\n", + "\n", + "## Source Data\n", + "\n", + "The source data is NOAA Global Precipitation [Data](https://downloads.psl.noaa.gov/Datasets/cpc_global_precip/); contains all years since 1979, each ~60MB.\n", + "\n", + "## Prerequisites\n", + "\n", + "Python 3 or later. Python modules: we will add 'h5netcdf', 'xarray', and 'cftime'; also will update 'scipy' version (numpy, pandas, matplotlib already available)\n", + "\n", + "---\n", + "__Last Updated:__ 21 NOV 2023 [Mosaic 0.3.12]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "05d7ca66-92a2-4dfc-bb85-f721a6269466", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c3b90656-5a2f-4e31-a6b2-26dca4b4e77d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "59b09f6b-850f-444b-af01-fc58a33c34da", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Python interpreter will be restarted.\nPython interpreter will be restarted.\nPython interpreter will be restarted.\nPython interpreter will be restarted.\n" + ] + } + ], + "source": [ + "%pip install h5netcdf cftime xarray scipy --quiet\n", + "%pip install databricks-mosaic --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + 
"metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "00873219-7fae-4987-af20-d277f218f5cc", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "GDAL enabled.\n\nGDAL 3.4.3, released 2022/04/22\n\n\n" + ] + }, + { + "output_type": "display_data", + "data": { + "application/vnd.databricks.v1+bamboolib_hint": "{\"pd.DataFrames\": [], \"version\": \"0.0.1\"}", + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# -- configure AQE for more compute heavy operations\n", + "# - choose option-1 or option-2 below, essential for REPARTITION!\n", + "# spark.conf.set(\"spark.databricks.optimizer.adaptive.enabled\", False) # <- option-1: turn off completely for full control\n", + "spark.conf.set(\"spark.sql.adaptive.coalescePartitions.enabled\", False) # <- option-2: just tweak partition management\n", + "\n", + "# -- import databricks + spark functions\n", + "\n", + "from pyspark.databricks.sql import functions as dbf\n", + "from pyspark.sql import functions as F\n", + "from pyspark.sql.functions import col, udf\n", + "from pyspark.sql.types import *\n", + "\n", + "# -- setup mosaic\n", + "import mosaic as mos\n", + "\n", + "mos.enable_mosaic(spark, dbutils)\n", + "mos.enable_gdal(spark)\n", + "\n", + "# -- other imports\n", + "import io\n", + "import json\n", + "import os\n", + "import pandas as pd\n", + "import xarray as xr" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4197a44c-3a73-41d5-8c45-6be19c9c3677", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Data\n", + "\n", + "> Adjust `nc_dir` to your preferred fuse path. 
_For simplicity, we are going to use DBFS, but this is all shifting with Unity Catalog [more [here](https://docs.databricks.com/en/dbfs/unity-catalog.html)]._ __Note: [DBFS](https://docs.databricks.com/en/dbfs/dbfs-root.html), [Workspace Files](https://docs.databricks.com/en/files/workspace.html), and [most recent] [Volumes](https://docs.databricks.com/en/data-governance/unity-catalog/index.html#volumes), are FUSE mounted to the cluster nodes, looking like a local path.__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b3cd8fea-ac3d-4e9f-a5aa-75d20cbae9d6", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "nc_dir = '/home/mjohns@databricks.com/geospatial/netcdf-precip'\n", + "nc_dir_fuse = f'/dbfs{nc_dir}'\n", + "os.makedirs(nc_dir_fuse, exist_ok=True)\n", + "\n", + "nc_sample_path = f'{nc_dir}/precip.2023.nc'\n", + "nc_sample_path_fuse = f'/dbfs{nc_sample_path}'\n", + "\n", + "os.environ['NC_DIR_FUSE'] = nc_dir_fuse\n", + "os.environ['NC_SAMPLE_PATH_FUSE'] = nc_sample_path_fuse\n", + "os.environ['NC_SAMPLE_FILE'] = 'precip.2023.nc'" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0e69191a-0a50-4d8c-9bd7-0484e9d54f55", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[3]: False" + ] + } + ], + "source": [ + "os.path.isfile('test.txt')" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "61ff37b0-1566-467e-bd4f-63a58f99f74a", + 
"showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "def download_url(url:str, out_path:str, debug_level:int = 0):\n", + " \"\"\"\n", + " Download URL to out path\n", + " \"\"\"\n", + " import os\n", + " import requests\n", + "\n", + " if os.path.exists(out_path):\n", + " debug_level > 0 and print(f\"...skipping existing '{out_path}'\")\n", + " else:\n", + " r = requests.get(url) # create HTTP response object\n", + " with open(out_path,'wb') as f:\n", + " f.write(r.content)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "99f0ad90-0d23-4b7d-b650-d156ad37d51b", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "...skipping existing '/dbfs/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2023.nc'\n" + ] + } + ], + "source": [ + "# single year sample\n", + "year = \"2023\"\n", + "download_url(\n", + " f\"https://downloads.psl.noaa.gov/Datasets/cpc_global_precip/precip.{year}.nc\", \n", + " f\"{nc_dir_fuse}/precip.{year}.nc\", \n", + " debug_level=1\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6662e0df-96fb-416f-8e24-d416ee7443e7", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# - you can adjust the range to avoid so many files\n", + "# - reminder: range is not inclusive, so this is through 2022 as-is\n", + "for year in range(1979,2023):\n", + " download_url(f\"https://downloads.psl.noaa.gov/Datasets/cpc_global_precip/precip.{year}.nc\", f\"{nc_dir_fuse}/precip.{year}.nc\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + 
"application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ffe7b25a-6e61-4968-8f74-b6e97203e017", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "total 2.6G\n-rwxrwxrwx 1 root root 61M Nov 21 19:00 precip.1979.nc\n-rwxrwxrwx 1 root root 63M Nov 21 19:00 precip.1980.nc\n-rwxrwxrwx 1 root root 64M Nov 21 19:00 precip.1981.nc\n-rwxrwxrwx 1 root root 62M Nov 21 19:00 precip.1982.nc\n...\n-rwxrwxrwx 1 root root 58M Nov 21 19:02 precip.2019.nc\n-rwxrwxrwx 1 root root 57M Nov 21 19:02 precip.2020.nc\n-rwxrwxrwx 1 root root 58M Nov 21 19:02 precip.2021.nc\n-rwxrwxrwx 1 root root 64M Nov 21 19:02 precip.2022.nc\n-rwxrwxrwx 1 root root 55M Nov 21 19:00 precip.2023.nc\n" + ] + } + ], + "source": [ + "%sh\n", + "# avoid list all files\n", + "ls -lh $NC_DIR_FUSE | head -5\n", + "echo \"...\"\n", + "ls -lh $NC_DIR_FUSE | tail -5" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9579a699-5a41-4e02-a2e0-daf50ca1be7d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Helper Functions\n", + "\n", + "> These helpers are used in a couple of places in the examples; each has a UDF version."
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c42d72ea-f36a-4f1b-a2d5-483a67f7c1d4", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "def from_360(lon):\n", + " \"\"\"\n", + " Standardize from 0:360 to -180:180 degrees.\n", + " - NetCDF does 0:360 for longitude\n", + " - See https://itecnote.com/tecnote/python-change-longitude-from-180-to-180-to-0-to-360/\n", + " \"\"\"\n", + " return ((lon - 180) % 360) - 180\n", + "\n", + "@udf(returnType=DoubleType())\n", + "def from_360_udf(lon):\n", + " return from_360(lon)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e483c91c-c0d2-4e4d-b4d7-539d0e77dc48", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "def from_180(lon):\n", + " \"\"\"\n", + " Standardize from -180:180 to 0:360 degrees.\n", + " - NetCDF does 0:360 for longitude\n", + " - See https://itecnote.com/tecnote/python-change-longitude-from-180-to-180-to-0-to-360/\n", + " \"\"\"\n", + " return lon % 360\n", + "\n", + "@udf(returnType=DoubleType())\n", + "def from_180_udf(lon):\n", + " return from_180(lon)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e4903a70-da0d-4014-bb48-f46f87e8a3f6", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Load Metadata [Spark]\n", + "\n", + "> We start with the Mosaic reader to load our NetCDFs, all 45 in this case. Loading results in various metadata about the NetCDFs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ed8098d3-d22d-4dc5-a55a-a137b2e5d41c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 45\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "

pathmodificationTimelengthuuidx_sizey_sizebandCountmetadatasubdatasetssridtile
dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2023.nc1970-01-20T16:23:13.201+000057443346-7234899442207905050720360323Map(NC_GLOBAL#dataset_title -> CPC GLOBAL PRCP V1.0, precip#long_name -> Daily total of precipitation, time#delta_t -> 0000-00-01 00:00:00, time#long_name -> Time, lat#units -> degrees_north, NETCDF_DIM_time_VALUES -> {}, time#axis -> T, precip#avg_period -> 0000-00-01 00:00:00, NC_GLOBAL#References -> https://www.psl.noaa.gov/data/gridded/data.cpc.globalprecip.html, lat#standard_name -> latitude, lat#actual_range -> {89.75,-89.75}, time#coordinate_defines -> start, NETCDF_DIM_EXTRA -> {time}, DERIVED_SUBDATASET_1_NAME -> DERIVED_SUBDATASET:LOGAMPLITUDE:/vsimem/6835514557054555330.nc, precip#cell_methods -> time: sum, lon#axis -> X, lon#standard_name -> longitude, NC_GLOBAL#title -> CPC GLOBAL PRCP V1.0 RT, precip#actual_range -> {0,776.75}, lon#long_name -> Longitude, lat#axis -> Y, NC_GLOBAL#version -> V1.0, NC_GLOBAL#Source -> ftp://ftp.cpc.ncep.noaa.gov/precip/CPC_UNI_PRCP/, lon#units -> degrees_east, precip#statistic -> Total, time#units -> hours since 1900-01-01 00:00:00, NETCDF_DIM_time_DEF -> {323,6}, lon#actual_range -> {0.25,359.75}, precip#var_desc -> Precipitation, DERIVED_SUBDATASET_1_DESC -> log10 of amplitude of input bands from /vsimem/6835514557054555330.nc, lat#coordinate_defines -> center, precip#valid_range -> {0,1000}, precip#parent_stat -> Other, precip#missing_value -> -9.96921e+36, precip#level_desc -> Surface, lon#coordinate_defines -> center, lat#long_name -> Latitude, time#standard_name -> time, precip#units -> mm, time#avg_period -> 0000-00-01 00:00:00, NC_GLOBAL#Conventions -> CF-1.0, precip#dataset -> CPC Global Precipitation, NC_GLOBAL#history -> Updated 2023-11-20 23:31:01, time#actual_range -> {1085832,1085928})Map()0List(null, iUhERg0KGgoAAAAAAAgIAAQAEAAAAAAAAAAAAAAAAAD//////////xKEbAMAAAAA//////////8AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT0hEUgINbgICIgAAAAAAAwQAAAAAAAAA//////////8= 
(truncated), dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2023.nc, netCDF)
dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2022.nc1970-01-20T16:23:13.349+000066268125-1649003126296939909720360365Map(NC_GLOBAL#dataset_title -> CPC GLOBAL PRCP V1.0, precip#long_name -> Daily total of precipitation, time#delta_t -> 0000-00-01 00:00:00, time#long_name -> Time, lat#units -> degrees_north, NETCDF_DIM_time_VALUES -> {}, time#axis -> T, precip#avg_period -> 0000-00-01 00:00:00, NC_GLOBAL#References -> https://www.psl.noaa.gov/data/gridded/data.cpc.globalprecip.html, lat#standard_name -> latitude, lat#actual_range -> {89.75,-89.75}, time#coordinate_defines -> start, NETCDF_DIM_EXTRA -> {time}, DERIVED_SUBDATASET_1_NAME -> DERIVED_SUBDATASET:LOGAMPLITUDE:/vsimem/-7182182872443146294.nc, precip#cell_methods -> time: sum, lon#axis -> X, lon#standard_name -> longitude, NC_GLOBAL#title -> CPC GLOBAL PRCP V1.0 RT, precip#actual_range -> {0,776.75}, lon#long_name -> Longitude, lat#axis -> Y, NC_GLOBAL#version -> V1.0, NC_GLOBAL#Source -> ftp://ftp.cpc.ncep.noaa.gov/precip/CPC_UNI_PRCP/, lon#units -> degrees_east, precip#statistic -> Total, time#units -> hours since 1900-01-01 00:00:00, NETCDF_DIM_time_DEF -> {365,6}, lon#actual_range -> {0.25,359.75}, precip#var_desc -> Precipitation, DERIVED_SUBDATASET_1_DESC -> log10 of amplitude of input bands from /vsimem/-7182182872443146294.nc, lat#coordinate_defines -> center, precip#valid_range -> {0,1000}, precip#parent_stat -> Other, precip#missing_value -> -9.96921e+36, precip#level_desc -> Surface, lon#coordinate_defines -> center, lat#long_name -> Latitude, time#standard_name -> time, precip#units -> mm, time#avg_period -> 0000-00-01 00:00:00, NC_GLOBAL#Conventions -> CF-1.0, precip#dataset -> CPC Global Precipitation, NC_GLOBAL#history -> Updated 2023-01-02 23:31:13, time#actual_range -> {1078104,1078176})Map()0List(null, 
iUhERg0KGgoAAAAAAAgIAAQAEAAAAAAAAAAAAAAAAAD//////////90r8wMAAAAA//////////8AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT0hEUgINbgICIgAAAAAAAwQAAAAAAAAA//////////8= (truncated), dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2022.nc, netCDF)
dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2021.nc1970-01-20T16:23:13.347+000059910391-6545382777001061517720360365Map(NC_GLOBAL#dataset_title -> CPC GLOBAL PRCP V1.0, precip#long_name -> Daily total of precipitation, time#delta_t -> 0000-00-01 00:00:00, time#long_name -> Time, lat#units -> degrees_north, NETCDF_DIM_time_VALUES -> {}, time#axis -> T, precip#avg_period -> 0000-00-01 00:00:00, NC_GLOBAL#References -> https://www.psl.noaa.gov/data/gridded/data.cpc.globalprecip.html, lat#standard_name -> latitude, lat#actual_range -> {89.75,-89.75}, time#coordinate_defines -> start, NETCDF_DIM_EXTRA -> {time}, DERIVED_SUBDATASET_1_NAME -> DERIVED_SUBDATASET:LOGAMPLITUDE:/vsimem/-6809554218790945837.nc, precip#cell_methods -> time: sum, lon#axis -> X, lon#standard_name -> longitude, NC_GLOBAL#title -> CPC GLOBAL PRCP V1.0 RT, precip#actual_range -> {0,776.75}, lon#long_name -> Longitude, lat#axis -> Y, NC_GLOBAL#version -> V1.0, NC_GLOBAL#Source -> ftp://ftp.cpc.ncep.noaa.gov/precip/CPC_UNI_PRCP/, lon#units -> degrees_east, precip#statistic -> Total, time#units -> hours since 1900-01-01 00:00:00, NETCDF_DIM_time_DEF -> {365,6}, lon#actual_range -> {0.25,359.75}, precip#var_desc -> Precipitation, DERIVED_SUBDATASET_1_DESC -> log10 of amplitude of input bands from /vsimem/-6809554218790945837.nc, lat#coordinate_defines -> center, precip#valid_range -> {0,1000}, precip#parent_stat -> Other, precip#missing_value -> -9.96921e+36, precip#level_desc -> Surface, lon#coordinate_defines -> center, lat#long_name -> Latitude, time#standard_name -> time, precip#units -> mm, time#avg_period -> 0000-00-01 00:00:00, NC_GLOBAL#Conventions -> CF-1.0, precip#dataset -> CPC Global Precipitation, NC_GLOBAL#history -> Updated 2022-01-02 23:30:58, time#actual_range -> {1060680,1069416})Map()0List(null, 
iUhERg0KGgoAAAAAAAgIAAQAEAAAAAAAAAAAAAAAAAD///////////cokgMAAAAA//////////8AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT0hEUgINbgICIgAAAAAAAwQAAAAAAAAA//////////8= (truncated), dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2021.nc, netCDF)
dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2020.nc1970-01-20T16:23:13.345+000059112656-7320144535504418501720360366Map(NC_GLOBAL#dataset_title -> CPC GLOBAL PRCP V1.0, precip#long_name -> Daily total of precipitation, time#delta_t -> 0000-00-01 00:00:00, time#long_name -> Time, lat#units -> degrees_north, NETCDF_DIM_time_VALUES -> {}, time#axis -> T, precip#avg_period -> 0000-00-01 00:00:00, NC_GLOBAL#References -> https://www.psl.noaa.gov/data/gridded/data.cpc.globalprecip.html, lat#standard_name -> latitude, lat#actual_range -> {89.75,-89.75}, time#coordinate_defines -> start, NETCDF_DIM_EXTRA -> {time}, DERIVED_SUBDATASET_1_NAME -> DERIVED_SUBDATASET:LOGAMPLITUDE:/vsimem/-2945555412143531241.nc, precip#cell_methods -> time: sum, lon#axis -> X, lon#standard_name -> longitude, NC_GLOBAL#title -> CPC GLOBAL PRCP V1.0 RT, precip#actual_range -> {0,776.75}, lon#long_name -> Longitude, lat#axis -> Y, NC_GLOBAL#version -> V1.0, NC_GLOBAL#Source -> ftp://ftp.cpc.ncep.noaa.gov/precip/CPC_UNI_PRCP/, lon#units -> degrees_east, precip#statistic -> Total, time#units -> hours since 1900-01-01 00:00:00, NETCDF_DIM_time_DEF -> {366,6}, lon#actual_range -> {0.25,359.75}, precip#var_desc -> Precipitation, DERIVED_SUBDATASET_1_DESC -> log10 of amplitude of input bands from /vsimem/-2945555412143531241.nc, lat#coordinate_defines -> center, precip#valid_range -> {0,1000}, precip#parent_stat -> Other, precip#missing_value -> -9.96921e+36, precip#level_desc -> Surface, lon#coordinate_defines -> center, lat#long_name -> Latitude, time#standard_name -> time, precip#units -> mm, time#avg_period -> 0000-00-01 00:00:00, NC_GLOBAL#Conventions -> CF-1.0, precip#dataset -> CPC Global Precipitation, NC_GLOBAL#history -> Updated 2021-01-02 23:31:03, time#actual_range -> {1051896,1060656})Map()0List(null, 
iUhERg0KGgoAAAAAAAgIAAQAEAAAAAAAAAAAAAAAAAD//////////9D8hQMAAAAA//////////8AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT0hEUgINmQICIgAAAAAAAwQAAAAAAAAA//////////8= (truncated), dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2020.nc, netCDF)
dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2019.nc1970-01-20T16:23:13.341+000059798408-5859169813170941141720360365Map(NC_GLOBAL#dataset_title -> CPC GLOBAL PRCP V1.0, precip#long_name -> Daily total of precipitation, time#delta_t -> 0000-00-01 00:00:00, NC_GLOBAL#dataset -> CPC Global Precipitation, time#long_name -> Time, lat#units -> degrees_north, NETCDF_DIM_time_VALUES -> {}, time#axis -> T, precip#avg_period -> 0000-00-01 00:00:00, NC_GLOBAL#References -> https://www.psl.noaa.gov/data/gridded/data.cpc.globalprecip.html, lat#standard_name -> latitude, lat#actual_range -> {89.75,-89.75}, time#coordinate_defines -> start, NETCDF_DIM_EXTRA -> {time}, DERIVED_SUBDATASET_1_NAME -> DERIVED_SUBDATASET:LOGAMPLITUDE:/vsimem/-8363922573784257297.nc, precip#cell_methods -> time: sum, lon#axis -> X, lon#standard_name -> longitude, NC_GLOBAL#title -> CPC GLOBAL PRCP V1.0 RT, precip#actual_range -> {0,776.75}, lon#long_name -> Longitude, lat#axis -> Y, NC_GLOBAL#version -> V1.0, NC_GLOBAL#Source -> ftp://ftp.cpc.ncep.noaa.gov/precip/CPC_UNI_PRCP/, lon#units -> degrees_east, precip#statistic -> Total, time#units -> hours since 1900-01-01 00:00:00, NETCDF_DIM_time_DEF -> {365,6}, lon#actual_range -> {0.25,359.75}, precip#var_desc -> Precipitation, DERIVED_SUBDATASET_1_DESC -> log10 of amplitude of input bands from /vsimem/-8363922573784257297.nc, lat#coordinate_defines -> center, precip#valid_range -> {0,1000}, precip#parent_stat -> Other, precip#missing_value -> -9.96921e+36, precip#level_desc -> Surface, lon#coordinate_defines -> center, lat#long_name -> Latitude, time#standard_name -> time, precip#units -> mm, time#avg_period -> 0000-00-01 00:00:00, NC_GLOBAL#Conventions -> CF-1.0, precip#dataset -> CPC Global Precip RT, NC_GLOBAL#history -> Updated 2020-01-02 23:31:10, time#actual_range -> {1043136,1051872})Map()0List(null, 
iUhERg0KGgoAAAAAAAgIAAQAEAAAAAAAAAAAAAAAAAD//////////4hzkAMAAAAA//////////8AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT0hEUgINoAICIgAAAAAAAwQAAAAAAAAA//////////8= (truncated), dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2019.nc, netCDF)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2023.nc", + "1970-01-20T16:23:13.201+0000", + 57443346, + -7234899442207905050, + 720, + 360, + 323, + { + "DERIVED_SUBDATASET_1_DESC": "log10 of amplitude of input bands from /vsimem/6835514557054555330.nc", + "DERIVED_SUBDATASET_1_NAME": "DERIVED_SUBDATASET:LOGAMPLITUDE:/vsimem/6835514557054555330.nc", + "NC_GLOBAL#Conventions": "CF-1.0", + "NC_GLOBAL#References": "https://www.psl.noaa.gov/data/gridded/data.cpc.globalprecip.html", + "NC_GLOBAL#Source": "ftp://ftp.cpc.ncep.noaa.gov/precip/CPC_UNI_PRCP/", + "NC_GLOBAL#dataset_title": "CPC GLOBAL PRCP V1.0", + "NC_GLOBAL#history": "Updated 2023-11-20 23:31:01", + "NC_GLOBAL#title": "CPC GLOBAL PRCP V1.0 RT", + "NC_GLOBAL#version": "V1.0", + "NETCDF_DIM_EXTRA": "{time}", + "NETCDF_DIM_time_DEF": "{323,6}", + "NETCDF_DIM_time_VALUES": "{}", + "lat#actual_range": "{89.75,-89.75}", + "lat#axis": "Y", + "lat#coordinate_defines": "center", + "lat#long_name": "Latitude", + "lat#standard_name": "latitude", + "lat#units": "degrees_north", + "lon#actual_range": "{0.25,359.75}", + "lon#axis": "X", + "lon#coordinate_defines": "center", + "lon#long_name": "Longitude", + "lon#standard_name": "longitude", + "lon#units": "degrees_east", + "precip#actual_range": "{0,776.75}", + "precip#avg_period": "0000-00-01 00:00:00", + "precip#cell_methods": "time: sum", + "precip#dataset": "CPC Global Precipitation", + "precip#level_desc": "Surface", + "precip#long_name": "Daily total of precipitation", + "precip#missing_value": "-9.96921e+36", + "precip#parent_stat": "Other", + "precip#statistic": "Total", + "precip#units": "mm", + "precip#valid_range": "{0,1000}", + "precip#var_desc": 
"Precipitation", + "time#actual_range": "{1085832,1085928}", + "time#avg_period": "0000-00-01 00:00:00", + "time#axis": "T", + "time#coordinate_defines": "start", + "time#delta_t": "0000-00-01 00:00:00", + "time#long_name": "Time", + "time#standard_name": "time", + "time#units": "hours since 1900-01-01 00:00:00" + }, + {}, + 0, + [ + null, + "iUhERg0KGgoAAAAAAAgIAAQAEAAAAAAAAAAAAAAAAAD//////////xKEbAMAAAAA//////////8AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT0hEUgINbgICIgAAAAAAAwQAAAAAAAAA//////////8= (truncated)", + "dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2023.nc", + "netCDF" + ] + ], + [ + "dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2022.nc", + "1970-01-20T16:23:13.349+0000", + 66268125, + -1649003126296939909, + 720, + 360, + 365, + { + "DERIVED_SUBDATASET_1_DESC": "log10 of amplitude of input bands from /vsimem/-7182182872443146294.nc", + "DERIVED_SUBDATASET_1_NAME": "DERIVED_SUBDATASET:LOGAMPLITUDE:/vsimem/-7182182872443146294.nc", + "NC_GLOBAL#Conventions": "CF-1.0", + "NC_GLOBAL#References": "https://www.psl.noaa.gov/data/gridded/data.cpc.globalprecip.html", + "NC_GLOBAL#Source": "ftp://ftp.cpc.ncep.noaa.gov/precip/CPC_UNI_PRCP/", + "NC_GLOBAL#dataset_title": "CPC GLOBAL PRCP V1.0", + "NC_GLOBAL#history": "Updated 2023-01-02 23:31:13", + "NC_GLOBAL#title": "CPC GLOBAL PRCP V1.0 RT", + "NC_GLOBAL#version": "V1.0", + "NETCDF_DIM_EXTRA": "{time}", + "NETCDF_DIM_time_DEF": "{365,6}", + "NETCDF_DIM_time_VALUES": "{}", + "lat#actual_range": "{89.75,-89.75}", + "lat#axis": "Y", + "lat#coordinate_defines": "center", + "lat#long_name": "Latitude", + "lat#standard_name": "latitude", + "lat#units": "degrees_north", + "lon#actual_range": "{0.25,359.75}", + "lon#axis": "X", + "lon#coordinate_defines": "center", + "lon#long_name": "Longitude", + "lon#standard_name": "longitude", + "lon#units": "degrees_east", + "precip#actual_range": "{0,776.75}", + "precip#avg_period": "0000-00-01 00:00:00", + 
"precip#cell_methods": "time: sum", + "precip#dataset": "CPC Global Precipitation", + "precip#level_desc": "Surface", + "precip#long_name": "Daily total of precipitation", + "precip#missing_value": "-9.96921e+36", + "precip#parent_stat": "Other", + "precip#statistic": "Total", + "precip#units": "mm", + "precip#valid_range": "{0,1000}", + "precip#var_desc": "Precipitation", + "time#actual_range": "{1078104,1078176}", + "time#avg_period": "0000-00-01 00:00:00", + "time#axis": "T", + "time#coordinate_defines": "start", + "time#delta_t": "0000-00-01 00:00:00", + "time#long_name": "Time", + "time#standard_name": "time", + "time#units": "hours since 1900-01-01 00:00:00" + }, + {}, + 0, + [ + null, + "iUhERg0KGgoAAAAAAAgIAAQAEAAAAAAAAAAAAAAAAAD//////////90r8wMAAAAA//////////8AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT0hEUgINbgICIgAAAAAAAwQAAAAAAAAA//////////8= (truncated)", + "dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2022.nc", + "netCDF" + ] + ], + [ + "dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2021.nc", + "1970-01-20T16:23:13.347+0000", + 59910391, + -6545382777001061517, + 720, + 360, + 365, + { + "DERIVED_SUBDATASET_1_DESC": "log10 of amplitude of input bands from /vsimem/-6809554218790945837.nc", + "DERIVED_SUBDATASET_1_NAME": "DERIVED_SUBDATASET:LOGAMPLITUDE:/vsimem/-6809554218790945837.nc", + "NC_GLOBAL#Conventions": "CF-1.0", + "NC_GLOBAL#References": "https://www.psl.noaa.gov/data/gridded/data.cpc.globalprecip.html", + "NC_GLOBAL#Source": "ftp://ftp.cpc.ncep.noaa.gov/precip/CPC_UNI_PRCP/", + "NC_GLOBAL#dataset_title": "CPC GLOBAL PRCP V1.0", + "NC_GLOBAL#history": "Updated 2022-01-02 23:30:58", + "NC_GLOBAL#title": "CPC GLOBAL PRCP V1.0 RT", + "NC_GLOBAL#version": "V1.0", + "NETCDF_DIM_EXTRA": "{time}", + "NETCDF_DIM_time_DEF": "{365,6}", + "NETCDF_DIM_time_VALUES": "{}", + "lat#actual_range": "{89.75,-89.75}", + "lat#axis": "Y", + "lat#coordinate_defines": "center", + "lat#long_name": "Latitude", + 
"lat#standard_name": "latitude", + "lat#units": "degrees_north", + "lon#actual_range": "{0.25,359.75}", + "lon#axis": "X", + "lon#coordinate_defines": "center", + "lon#long_name": "Longitude", + "lon#standard_name": "longitude", + "lon#units": "degrees_east", + "precip#actual_range": "{0,776.75}", + "precip#avg_period": "0000-00-01 00:00:00", + "precip#cell_methods": "time: sum", + "precip#dataset": "CPC Global Precipitation", + "precip#level_desc": "Surface", + "precip#long_name": "Daily total of precipitation", + "precip#missing_value": "-9.96921e+36", + "precip#parent_stat": "Other", + "precip#statistic": "Total", + "precip#units": "mm", + "precip#valid_range": "{0,1000}", + "precip#var_desc": "Precipitation", + "time#actual_range": "{1060680,1069416}", + "time#avg_period": "0000-00-01 00:00:00", + "time#axis": "T", + "time#coordinate_defines": "start", + "time#delta_t": "0000-00-01 00:00:00", + "time#long_name": "Time", + "time#standard_name": "time", + "time#units": "hours since 1900-01-01 00:00:00" + }, + {}, + 0, + [ + null, + "iUhERg0KGgoAAAAAAAgIAAQAEAAAAAAAAAAAAAAAAAD///////////cokgMAAAAA//////////8AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT0hEUgINbgICIgAAAAAAAwQAAAAAAAAA//////////8= (truncated)", + "dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2021.nc", + "netCDF" + ] + ], + [ + "dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2020.nc", + "1970-01-20T16:23:13.345+0000", + 59112656, + -7320144535504418501, + 720, + 360, + 366, + { + "DERIVED_SUBDATASET_1_DESC": "log10 of amplitude of input bands from /vsimem/-2945555412143531241.nc", + "DERIVED_SUBDATASET_1_NAME": "DERIVED_SUBDATASET:LOGAMPLITUDE:/vsimem/-2945555412143531241.nc", + "NC_GLOBAL#Conventions": "CF-1.0", + "NC_GLOBAL#References": "https://www.psl.noaa.gov/data/gridded/data.cpc.globalprecip.html", + "NC_GLOBAL#Source": "ftp://ftp.cpc.ncep.noaa.gov/precip/CPC_UNI_PRCP/", + "NC_GLOBAL#dataset_title": "CPC GLOBAL PRCP V1.0", + "NC_GLOBAL#history": 
"Updated 2021-01-02 23:31:03", + "NC_GLOBAL#title": "CPC GLOBAL PRCP V1.0 RT", + "NC_GLOBAL#version": "V1.0", + "NETCDF_DIM_EXTRA": "{time}", + "NETCDF_DIM_time_DEF": "{366,6}", + "NETCDF_DIM_time_VALUES": "{}", + "lat#actual_range": "{89.75,-89.75}", + "lat#axis": "Y", + "lat#coordinate_defines": "center", + "lat#long_name": "Latitude", + "lat#standard_name": "latitude", + "lat#units": "degrees_north", + "lon#actual_range": "{0.25,359.75}", + "lon#axis": "X", + "lon#coordinate_defines": "center", + "lon#long_name": "Longitude", + "lon#standard_name": "longitude", + "lon#units": "degrees_east", + "precip#actual_range": "{0,776.75}", + "precip#avg_period": "0000-00-01 00:00:00", + "precip#cell_methods": "time: sum", + "precip#dataset": "CPC Global Precipitation", + "precip#level_desc": "Surface", + "precip#long_name": "Daily total of precipitation", + "precip#missing_value": "-9.96921e+36", + "precip#parent_stat": "Other", + "precip#statistic": "Total", + "precip#units": "mm", + "precip#valid_range": "{0,1000}", + "precip#var_desc": "Precipitation", + "time#actual_range": "{1051896,1060656}", + "time#avg_period": "0000-00-01 00:00:00", + "time#axis": "T", + "time#coordinate_defines": "start", + "time#delta_t": "0000-00-01 00:00:00", + "time#long_name": "Time", + "time#standard_name": "time", + "time#units": "hours since 1900-01-01 00:00:00" + }, + {}, + 0, + [ + null, + "iUhERg0KGgoAAAAAAAgIAAQAEAAAAAAAAAAAAAAAAAD//////////9D8hQMAAAAA//////////8AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT0hEUgINmQICIgAAAAAAAwQAAAAAAAAA//////////8= (truncated)", + "dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2020.nc", + "netCDF" + ] + ], + [ + "dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2019.nc", + "1970-01-20T16:23:13.341+0000", + 59798408, + -5859169813170941141, + 720, + 360, + 365, + { + "DERIVED_SUBDATASET_1_DESC": "log10 of amplitude of input bands from /vsimem/-8363922573784257297.nc", + "DERIVED_SUBDATASET_1_NAME": 
"DERIVED_SUBDATASET:LOGAMPLITUDE:/vsimem/-8363922573784257297.nc", + "NC_GLOBAL#Conventions": "CF-1.0", + "NC_GLOBAL#References": "https://www.psl.noaa.gov/data/gridded/data.cpc.globalprecip.html", + "NC_GLOBAL#Source": "ftp://ftp.cpc.ncep.noaa.gov/precip/CPC_UNI_PRCP/", + "NC_GLOBAL#dataset": "CPC Global Precipitation", + "NC_GLOBAL#dataset_title": "CPC GLOBAL PRCP V1.0", + "NC_GLOBAL#history": "Updated 2020-01-02 23:31:10", + "NC_GLOBAL#title": "CPC GLOBAL PRCP V1.0 RT", + "NC_GLOBAL#version": "V1.0", + "NETCDF_DIM_EXTRA": "{time}", + "NETCDF_DIM_time_DEF": "{365,6}", + "NETCDF_DIM_time_VALUES": "{}", + "lat#actual_range": "{89.75,-89.75}", + "lat#axis": "Y", + "lat#coordinate_defines": "center", + "lat#long_name": "Latitude", + "lat#standard_name": "latitude", + "lat#units": "degrees_north", + "lon#actual_range": "{0.25,359.75}", + "lon#axis": "X", + "lon#coordinate_defines": "center", + "lon#long_name": "Longitude", + "lon#standard_name": "longitude", + "lon#units": "degrees_east", + "precip#actual_range": "{0,776.75}", + "precip#avg_period": "0000-00-01 00:00:00", + "precip#cell_methods": "time: sum", + "precip#dataset": "CPC Global Precip RT", + "precip#level_desc": "Surface", + "precip#long_name": "Daily total of precipitation", + "precip#missing_value": "-9.96921e+36", + "precip#parent_stat": "Other", + "precip#statistic": "Total", + "precip#units": "mm", + "precip#valid_range": "{0,1000}", + "precip#var_desc": "Precipitation", + "time#actual_range": "{1043136,1051872}", + "time#avg_period": "0000-00-01 00:00:00", + "time#axis": "T", + "time#coordinate_defines": "start", + "time#delta_t": "0000-00-01 00:00:00", + "time#long_name": "Time", + "time#standard_name": "time", + "time#units": "hours since 1900-01-01 00:00:00" + }, + {}, + 0, + [ + null, + "iUhERg0KGgoAAAAAAAgIAAQAEAAAAAAAAAAAAAAAAAD//////////4hzkAMAAAAA//////////8AAAAAAAAAAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT0hEUgINoAICIgAAAAAAAwQAAAAAAAAA//////////8= (truncated)", + 
"dbfs:/home/mjohns@databricks.com/geospatial/netcdf-precip/precip.2019.nc", + "netCDF" + ] + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "path", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "modificationTime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "length", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "uuid", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "x_size", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "y_size", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "bandCount", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "metadata", + "type": "{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\",\"valueContainsNull\":true}" + }, + { + "metadata": "{}", + "name": "subdatasets", + "type": "{\"type\":\"map\",\"keyType\":\"string\",\"valueType\":\"string\",\"valueContainsNull\":true}" + }, + { + "metadata": "{}", + "name": "srid", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "tile", + "type": "{\"type\":\"struct\",\"fields\":[{\"name\":\"index_id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"raster\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}},{\"name\":\"parentPath\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"driver\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "df_mos = (\n", + " spark\n", + " .read.format(\"gdal\")\n", + " .option(\"driverName\", \"NetCDF\")\n", + " .load(nc_dir)\n", + ")\n", + "print(f\"count? 
{df_mos.count():,}\")\n", + "df_mos.orderBy(F.desc(\"path\")).limit(5).display() # <- limiting display for ipynb output only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e0a70c99-fc13-4f85-9549-8211c29e0b78", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Slice Example-1: Single File [with Flattening]\n", + "\n", + "> Before we move to distributed with Xarray, let's consider what slicing a single file might look like." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "53df9be9-5016-4966-baa4-273dd005b1da", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Read XArray Dataset_" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "174489ac-fb1b-4af7-b127-31a776e7592c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:  (lat: 360, lon: 720, time: 323)\n",
+       "Coordinates:\n",
+       "  * lat      (lat) float32 89.75 89.25 88.75 88.25 ... -88.75 -89.25 -89.75\n",
+       "  * lon      (lon) float32 0.25 0.75 1.25 1.75 2.25 ... 358.2 358.8 359.2 359.8\n",
+       "  * time     (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-11-19\n",
+       "Data variables:\n",
+       "    precip   (time, lat, lon) float32 ...\n",
+       "Attributes:\n",
+       "    Conventions:    CF-1.0\n",
+       "    version:        V1.0\n",
+       "    title:          CPC GLOBAL PRCP V1.0 RT\n",
+       "    References:     https://www.psl.noaa.gov/data/gridded/data.cpc.globalprec...\n",
+       "    dataset_title:  CPC GLOBAL PRCP V1.0\n",
+       "    Source:         ftp://ftp.cpc.ncep.noaa.gov/precip/CPC_UNI_PRCP/\n",
+       "    history:        Updated 2023-11-20 23:31:01
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
<xarray.Dataset>\nDimensions:  (lat: 360, lon: 720, time: 323)\nCoordinates:\n  * lat      (lat) float32 89.75 89.25 88.75 88.25 ... -88.75 -89.25 -89.75\n  * lon      (lon) float32 0.25 0.75 1.25 1.75 2.25 ... 358.2 358.8 359.2 359.8\n  * time     (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-11-19\nData variables:\n    precip   (time, lat, lon) float32 ...\nAttributes:\n    Conventions:    CF-1.0\n    version:        V1.0\n    title:          CPC GLOBAL PRCP V1.0 RT\n    References:     https://www.psl.noaa.gov/data/gridded/data.cpc.globalprec...\n    dataset_title:  CPC GLOBAL PRCP V1.0\n    Source:         ftp://ftp.cpc.ncep.noaa.gov/precip/CPC_UNI_PRCP/\n    history:        Updated 2023-11-20 23:31:01
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "xds = xr.open_dataset(nc_sample_path_fuse)\n", + "xds" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6b542d80-07d1-4482-ab73-f273d98d15cd", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Slice Dataset_" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8d522d40-a715-428d-bc6e-1c27c0cf896c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:  (lat: 360, lon: 720, time: 31)\n",
+       "Coordinates:\n",
+       "  * lat      (lat) float32 89.75 89.25 88.75 88.25 ... -88.75 -89.25 -89.75\n",
+       "  * lon      (lon) float32 0.25 0.75 1.25 1.75 2.25 ... 358.2 358.8 359.2 359.8\n",
+       "  * time     (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-01-31\n",
+       "Data variables:\n",
+       "    precip   (time, lat, lon) float32 ...\n",
+       "Attributes:\n",
+       "    Conventions:    CF-1.0\n",
+       "    version:        V1.0\n",
+       "    title:          CPC GLOBAL PRCP V1.0 RT\n",
+       "    References:     https://www.psl.noaa.gov/data/gridded/data.cpc.globalprec...\n",
+       "    dataset_title:  CPC GLOBAL PRCP V1.0\n",
+       "    Source:         ftp://ftp.cpc.ncep.noaa.gov/precip/CPC_UNI_PRCP/\n",
+       "    history:        Updated 2023-11-20 23:31:01
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
<xarray.Dataset>\nDimensions:  (lat: 360, lon: 720, time: 31)\nCoordinates:\n  * lat      (lat) float32 89.75 89.25 88.75 88.25 ... -88.75 -89.25 -89.75\n  * lon      (lon) float32 0.25 0.75 1.25 1.75 2.25 ... 358.2 358.8 359.2 359.8\n  * time     (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-01-31\nData variables:\n    precip   (time, lat, lon) float32 ...\nAttributes:\n    Conventions:    CF-1.0\n    version:        V1.0\n    title:          CPC GLOBAL PRCP V1.0 RT\n    References:     https://www.psl.noaa.gov/data/gridded/data.cpc.globalprec...\n    dataset_title:  CPC GLOBAL PRCP V1.0\n    Source:         ftp://ftp.cpc.ncep.noaa.gov/precip/CPC_UNI_PRCP/\n    history:        Updated 2023-11-20 23:31:01
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "xds.sel(time=slice('2023-01-01','2023-01-31'))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e6cd1401-833a-474b-b335-f23ddcf042ba", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:  (lat: 2, lon: 2, time: 323)\n",
+       "Coordinates:\n",
+       "  * lat      (lat) float32 88.75 88.25\n",
+       "  * lon      (lon) float32 0.25 0.75\n",
+       "  * time     (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-11-19\n",
+       "Data variables:\n",
+       "    precip   (time, lat, lon) float32 ...\n",
+       "Attributes:\n",
+       "    Conventions:    CF-1.0\n",
+       "    version:        V1.0\n",
+       "    title:          CPC GLOBAL PRCP V1.0 RT\n",
+       "    References:     https://www.psl.noaa.gov/data/gridded/data.cpc.globalprec...\n",
+       "    dataset_title:  CPC GLOBAL PRCP V1.0\n",
+       "    Source:         ftp://ftp.cpc.ncep.noaa.gov/precip/CPC_UNI_PRCP/\n",
+       "    history:        Updated 2023-11-20 23:31:01
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
<xarray.Dataset>\nDimensions:  (lat: 2, lon: 2, time: 323)\nCoordinates:\n  * lat      (lat) float32 88.75 88.25\n  * lon      (lon) float32 0.25 0.75\n  * time     (time) datetime64[ns] 2023-01-01 2023-01-02 ... 2023-11-19\nData variables:\n    precip   (time, lat, lon) float32 ...\nAttributes:\n    Conventions:    CF-1.0\n    version:        V1.0\n    title:          CPC GLOBAL PRCP V1.0 RT\n    References:     https://www.psl.noaa.gov/data/gridded/data.cpc.globalprec...\n    dataset_title:  CPC GLOBAL PRCP V1.0\n    Source:         ftp://ftp.cpc.ncep.noaa.gov/precip/CPC_UNI_PRCP/\n    history:        Updated 2023-11-20 23:31:01
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "xds.sel(lat=slice(89.0,88.0), lon=slice(0.25,0.75))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "23b64d64-c40b-43d7-b7f8-2a1d6aeac2cb", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Convert to Pandas & Flatten_" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "dd8d636e-8a79-4759-a683-610663099001", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "rows? 83,721,600, cols? 1\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
precip
latlontime
89.750.252023-01-01NaN
2023-01-02NaN
2023-01-03NaN
2023-01-04NaN
2023-01-05NaN
\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
precip
latlontime
89.750.252023-01-01NaN
2023-01-02NaN
2023-01-03NaN
2023-01-04NaN
2023-01-05NaN
\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "if 'time' in xds.dims.keys() and not isinstance(xds.indexes['time'], pd.DatetimeIndex):\n", + " xds['time'] = xds.indexes['time'].to_datetimeindex()\n", + "pdf = xds.to_dataframe() # <- this is the right move (get a multi-index)\n", + "print(f'rows? {pdf.shape[0]:,}, cols? {pdf.shape[1]}')\n", + "pdf.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a1f924cc-2731-4f91-828e-d35e2b675b56", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "rows? 30,052,731, cols? 1\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
precip
latlontime
83.75323.252023-01-010.000000
2023-01-020.000000
2023-01-030.000000
2023-01-040.000000
2023-01-050.022191
\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
precip
latlontime
83.75323.252023-01-010.000000
2023-01-020.000000
2023-01-030.000000
2023-01-040.000000
2023-01-050.022191
\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "pdf.dropna(inplace=True)\n", + "print(f'rows? {pdf.shape[0]:,}, cols? {pdf.shape[1]}')\n", + "pdf.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "47901e3b-b004-402d-8cde-723e16495f6c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "rows? 30,052,731, cols? 4\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
latlontimeprecip
083.75323.252023-01-010.000000
183.75323.252023-01-020.000000
283.75323.252023-01-030.000000
383.75323.252023-01-040.000000
483.75323.252023-01-050.022191
\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
latlontimeprecip
083.75323.252023-01-010.000000
183.75323.252023-01-020.000000
283.75323.252023-01-030.000000
383.75323.252023-01-040.000000
483.75323.252023-01-050.022191
\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "pdf_flat = pdf.reset_index()\n", + "print(f'rows? {pdf_flat.shape[0]:,}, cols? {pdf_flat.shape[1]}')\n", + "pdf_flat.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "74d5ffe4-e2ba-484f-a0fe-c82fc9ba9617", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Get the [spark] schema from the flattened [pandas] sample_\n", + "\n", + "> This will be used in our distributed execution (below)." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d3b0de2e-afbe-4ec0-a823-2a72f6661f27", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 30,052,731\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
latlontimeprecip
83.75323.252023-01-01T00:00:00.000+00000.0
83.75323.252023-01-02T00:00:00.000+00000.0
83.75323.252023-01-03T00:00:00.000+00000.0
83.75323.252023-01-04T00:00:00.000+00000.0
83.75323.252023-01-05T00:00:00.000+00000.022190852
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 83.75, + 323.25, + "2023-01-01T00:00:00.000+0000", + 0.0 + ], + [ + 83.75, + 323.25, + "2023-01-02T00:00:00.000+0000", + 0.0 + ], + [ + 83.75, + 323.25, + "2023-01-03T00:00:00.000+0000", + 0.0 + ], + [ + 83.75, + 323.25, + "2023-01-04T00:00:00.000+0000", + 0.0 + ], + [ + 83.75, + 323.25, + "2023-01-05T00:00:00.000+0000", + 0.022190852 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "lat", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "lon", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "time", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "precip", + "type": "\"float\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "df = spark.createDataFrame(pdf_flat)\n", + "print(f\"count? 
{df.count():,}\")\n", + "df.limit(5).display()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "94614086-173e-46c1-8c2b-15a0e34f09fa", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[18]: StructType([StructField('lat', DoubleType(), True), StructField('lon', DoubleType(), True), StructField('time', TimestampType(), True), StructField('precip', FloatType(), True)])" + ] + } + ], + "source": [ + "df.schema" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6dfd2b8f-de57-49fa-b751-30ae9e69533d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Slice Example-2: Vectorized UDF [with Flattening]\n", + "\n", + "> Use `applyInPandas` UDF to work more directly with the netCDF [outside of Mosaic + GDAL].
__Note: Will enforce grouping by path.__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b486864e-1fa4-40bb-8cc7-1c3039d5464e", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[11]: StructType([StructField('lon', DoubleType(), True), StructField('lat', DoubleType(), True), StructField('time', TimestampType(), True), StructField('precip', FloatType(), True)])" + ] + } + ], + "source": [ + "# idenfified earlier in sample\n", + "flat_schema = (\n", + " StructType(\n", + " [\n", + " StructField('lon', DoubleType(), True), \n", + " StructField('lat', DoubleType(), True), \n", + " StructField('time', TimestampType(), True), \n", + " StructField('precip', FloatType(), True), \n", + " ]\n", + " )\n", + ")\n", + "flat_schema" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "88fc03f6-426e-4d12-a53e-8452226d6c97", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "def slice_flatten_path(key, input_pdf: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " slice the `path` column [optimal w/single path]:\n", + " - based on provided time, lat, lon slices\n", + " - Read with XArray using h5netcdf engine\n", + " - Handles conversion to pandas\n", + " - flattens out multi-dimensions\n", + " - drops na values (much smaller)\n", + " Returns pandas dataframe\n", + " \"\"\"\n", + " import io\n", + " import pandas as pd\n", + " import xarray as xr \n", + "\n", + " # -- iterate over pdf --\n", + " # - this may just be 1 path,\n", + " # depends on groupBy\n", + " # - to further optimize, consider enforcing 1 path\n", 
+ " # and not doing the `pd.concat` call, just returning \n", + " pdf_arr = []\n", + " for index, row in input_pdf.iterrows():\n", + " path_fuse = row['path'].replace(\"dbfs:\",\"/dbfs\")\n", + " xds = xr.open_dataset(path_fuse)\n", + "\n", + " xds_slice = xds\n", + " if 'time_slice' in input_pdf:\n", + " xds_slice = xds_slice.sel(time=slice(*row['time_slice']))\n", + " if 'lat_slice' in input_pdf:\n", + " xds_slice = xds_slice.sel(lat=slice(*row['lat_slice']))\n", + " if 'lon_slice' in input_pdf:\n", + " xds_slice = xds_slice.sel(lon=slice(*row['lon_slice']))\n", + " \n", + " if 'time' in xds_slice.dims.keys() and not isinstance(xds_slice.indexes['time'], pd.DatetimeIndex):\n", + " xds_slice['time'] = xds_slice.indexes['time'].to_datetimeindex()\n", + " pdf = xds_slice.to_dataframe() # <- handle drops in xdf for large files\n", + " pdf.dropna(inplace=True)\n", + " pdf_arr.append(pdf.reset_index())\n", + " \n", + " return pd.concat(pdf_arr)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d1a2abdc-6441-4567-b300-93283b8457b6", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[13]: 277.0" + ] + } + ], + "source": [ + "from_180(-83.0) # <- becomes min" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "af5d2795-cc5b-492b-82f5-7fe6041ee22e", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[14]: 279.1" + ] + } + ], + "source": [ + "from_180(-80.9) # <- becomes max" + ] + }, + { + "cell_type": "code", + "execution_count": 0, 
+ "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1e157bb4-30b7-4bfb-b2f6-082100a5dfb5", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 372\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
lonlattimeprecipyearmonthdaygeom_wkt
277.2528.252023-01-01T00:00:00.000+00008.654497202311POINT (-82.75 28.25)
277.2528.252023-01-02T00:00:00.000+00000.12019344202312POINT (-82.75 28.25)
277.2528.252023-01-03T00:00:00.000+00000.0202313POINT (-82.75 28.25)
277.2528.252023-01-04T00:00:00.000+00000.0202314POINT (-82.75 28.25)
277.2528.252023-01-05T00:00:00.000+00006.124646202315POINT (-82.75 28.25)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 277.25, + 28.25, + "2023-01-01T00:00:00.000+0000", + 8.654497, + 2023, + 1, + 1, + "POINT (-82.75 28.25)" + ], + [ + 277.25, + 28.25, + "2023-01-02T00:00:00.000+0000", + 0.12019344, + 2023, + 1, + 2, + "POINT (-82.75 28.25)" + ], + [ + 277.25, + 28.25, + "2023-01-03T00:00:00.000+0000", + 0.0, + 2023, + 1, + 3, + "POINT (-82.75 28.25)" + ], + [ + 277.25, + 28.25, + "2023-01-04T00:00:00.000+0000", + 0.0, + 2023, + 1, + 4, + "POINT (-82.75 28.25)" + ], + [ + 277.25, + 28.25, + "2023-01-05T00:00:00.000+0000", + 6.124646, + 2023, + 1, + 5, + "POINT (-82.75 28.25)" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "lon", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "lat", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "time", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "precip", + "type": "\"float\"" + }, + { + "metadata": "{}", + "name": "year", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "month", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "day", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "geom_wkt", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "spark.catalog.clearCache() # <- cache for dev, help avoid recomputes\n", + "\n", + "df_path = (\n", + " df_mos\n", + " .repartition(df_mos.count(), 
\"path\") # <- repartition is important!\n", + " .withColumn(\n", + " \"time_slice\", \n", + " F.array([F.lit(x) for x in ['2023-01-01', '2023-01-31']])\n", + " )\n", + " .withColumn(\n", + " \"lat_slice\", \n", + " F.array([F.lit(x) for x in [28.6, 26.9]]) # <- max, min\n", + " )\n", + " .withColumn(\n", + " \"lon_slice\", \n", + " F.array([F.lit(x) for x in [from_180(-83.0), from_180(-80.9)]]) # <- min, max ... convert to 360 \n", + " )\n", + " .groupBy(\"path\")\n", + " .applyInPandas(slice_flatten_path, schema=flat_schema) # <- applyInPandas UDF \n", + " .withColumn(\"year\", F.year(\"time\"))\n", + " .withColumn(\"month\", F.month(\"time\"))\n", + " .withColumn(\"day\", F.dayofmonth(\"time\"))\n", + " .withColumn(\"geom_wkt\", mos.st_astext(mos.st_point(from_360_udf(\"lon\"), \"lat\"))) # <- convert to -180:180\n", + " .cache()\n", + ")\n", + "\n", + "print(f\"count? {df_path.count():,}\")\n", + "display(df_path.limit(5)) # <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cdbbbd15-233a-46e3-be51-2fdc87baa925", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Render average precipitation through the years_\n", + "\n", + "> This is per collected location __[our slice in Florida]__. Note: `precip` units are in millimeters." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "32888e11-e814-4add-b754-5e461bf0f5c9", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "df_kepler = (\n", + " df_path\n", + " .groupBy(\"geom_wkt\")\n", + " .agg(F.avg(\"precip\").alias(\"avg_precip\"))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a1914b8b-2c62-43cb-898c-e13db7628341", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "919763df-97ea-4b61-8ee7-d438d551b14d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "92e43404-515d-4fe9-9ff7-2fca55d2715e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# df_kepler \"geom_wkt\" \"geometry\" 1_000" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b2765745-f1e6-4404-b65f-097341a06f7f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Slice Example-3: Vectorized UDF [without Flatten]\n", + "\n", + "> Use `applyInPandas` UDF to work more directly with the netCDF [outside of Mosaic + GDAL]. This shows two variations on maintaining a nested structure within a Delta Table: [a] Store Slices as NetCDF binary and [b] Store slices as JSON. __Note: Will enforce grouping by path.__" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9e8f1d65-7419-4261-b29e-f7a4b078aff0", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Option-[a]: Return Slice as NetCDF" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "73672011-a624-4391-b321-d431ee74c254", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_This will be binary type in our UDF._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b0684772-86b6-4ae5-bcc6-fa760f052e14", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "nc_slice = xds.sel(time=slice('2023-01-01','2023-01-31'), lat=slice(89.0,88.0), lon=slice(0.25,0.75)).to_netcdf()\n", + "# nc_slice # <- this is binary" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "eb8b536f-8481-4b33-8013-e15b522360d3", +
"showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "nc_slice_schema = StructType([StructField('content', BinaryType(), True)])" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a26c2b89-dbf7-4879-8288-6913ec4227e6", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "def slice_path_nc(key, input_pdf: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " slice the `path` column [optimal w/single path]:\n", + " - based on provided time, lat, lon slices\n", + " - Read with XArray using h5netcdf engine\n", + " - maintains the sliced netcdf as binary\n", + " Returns pandas dataframe\n", + " \"\"\"\n", + " import io\n", + " import pandas as pd\n", + " import xarray as xr \n", + "\n", + " # -- iterate over pdf --\n", + " # - this may just be 1 path,\n", + " # depends on groupBy\n", + " # - to further optimize, consider enforcing 1 path\n", + " # and not doing the `pd.concat` call, just returning \n", + " pdf_arr = []\n", + " for index, row in input_pdf.iterrows():\n", + " path_fuse = row['path'].replace(\"dbfs:\",\"/dbfs\")\n", + " xds = xr.open_dataset(path_fuse)\n", + "\n", + " xds_slice = xds\n", + " if 'time_slice' in input_pdf:\n", + " xds_slice = xds_slice.sel(time=slice(*row['time_slice']))\n", + " if 'lat_slice' in input_pdf:\n", + " xds_slice = xds_slice.sel(lat=slice(*row['lat_slice']))\n", + " if 'lon_slice' in input_pdf:\n", + " xds_slice = xds_slice.sel(lon=slice(*row['lon_slice']))\n", + " \n", + " pdf_arr.append(\n", + " pd.DataFrame([xds_slice.to_netcdf()], columns=['content'])\n", + " )\n", + " \n", + " return pd.concat(pdf_arr)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + 
"nuid": "e81241ca-a489-4a32-a2c7-e536e44282d7", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 45\n+--------------------+\n| content|\n+--------------------+\n|[43 44 46 02 00 0...|\n+--------------------+\n\n" + ] + } + ], + "source": [ + "spark.catalog.clearCache() # <- cache for dev, help avoid recomputes\n", + "\n", + "df_nc_slice = (\n", + " df_mos\n", + " .repartition(df_mos.count(), \"path\") # <- repartition is important!\n", + " .withColumn(\n", + " \"time_slice\", \n", + " F.array([F.lit(x) for x in ['2023-01-01', '2023-01-31']])\n", + " )\n", + " .withColumn(\n", + " \"lat_slice\", \n", + " F.array([F.lit(x) for x in [28.6, 26.9]]) # <- max, min\n", + " )\n", + " .withColumn(\n", + " \"lon_slice\", \n", + " F.array([F.lit(x) for x in [from_180(-83.0), from_180(-80.9)]]) # <- min, max ... convert to 360 \n", + " )\n", + " .groupBy(\"path\")\n", + " .applyInPandas(slice_path_nc, schema=nc_slice_schema) # <- applyInPandas UDF \n", + " .cache()\n", + ")\n", + "\n", + "print(f\"count? {df_nc_slice.count():,}\")\n", + "df_nc_slice.limit(1).show() # <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "73e444db-9eb6-4b22-a358-3ed32c36a455", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Example flattening from the slices_\n", + "\n", + "> Though not explicitly shown here, you could have written out your slices to Delta Lake and then later decided to work with them again as field data.
__Note: the use of BytesIO to load straight from the Delta Lake field data: `xds = xr.open_dataset(io.BytesIO(row['content']))`!__ " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9a9e9457-218c-4910-a1d0-1cd3e3c53eca", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "def explode_content(key, input_pdf: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " Explode the expected `contents` column:\n", + " - Read with XArray using h5netcdf engine\n", + " - Handles conversion to pandas\n", + " - flattens out multi-dimensions\n", + " - drops na values (much smaller)\n", + " Returns pandas dataframe\n", + " \"\"\"\n", + " import io\n", + " import pandas as pd\n", + " import xarray as xr\n", + "\n", + " # -- iterate over pdf --\n", + " # - this may just be 1 path,\n", + " # depends on groupBy\n", + " # - to further optimize, consider enforcing 1 path\n", + " # and not doing the `pd.concat` call, just returning \n", + " pdf_arr = []\n", + "\n", + " for index, row in input_pdf.iterrows():\n", + " xds = xr.open_dataset(io.BytesIO(row['content']))\n", + " if 'time' in xds.dims.keys() and not isinstance(xds.indexes['time'], pd.DatetimeIndex):\n", + " xds['time'] = xds.indexes['time'].to_datetimeindex()\n", + " pdf = xds.to_dataframe()\n", + " pdf.dropna(inplace=True) # <- handle drops in xdf for large files\n", + " pdf_arr.append(pdf.reset_index())\n", + " \n", + " return pd.concat(pdf_arr)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "93396017-084e-46d9-b60a-c5d01d513c40", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": 
"stream", + "text": [ + "count? 372\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
lonlattimeprecipyearmonthdaygeom_wkt
277.2528.252023-01-01T00:00:00.000+00008.654497202311POINT (-82.75 28.25)
277.2528.252023-01-02T00:00:00.000+00000.12019344202312POINT (-82.75 28.25)
277.2528.252023-01-03T00:00:00.000+00000.0202313POINT (-82.75 28.25)
277.2528.252023-01-04T00:00:00.000+00000.0202314POINT (-82.75 28.25)
277.2528.252023-01-05T00:00:00.000+00006.124646202315POINT (-82.75 28.25)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 277.25, + 28.25, + "2023-01-01T00:00:00.000+0000", + 8.654497, + 2023, + 1, + 1, + "POINT (-82.75 28.25)" + ], + [ + 277.25, + 28.25, + "2023-01-02T00:00:00.000+0000", + 0.12019344, + 2023, + 1, + 2, + "POINT (-82.75 28.25)" + ], + [ + 277.25, + 28.25, + "2023-01-03T00:00:00.000+0000", + 0.0, + 2023, + 1, + 3, + "POINT (-82.75 28.25)" + ], + [ + 277.25, + 28.25, + "2023-01-04T00:00:00.000+0000", + 0.0, + 2023, + 1, + 4, + "POINT (-82.75 28.25)" + ], + [ + 277.25, + 28.25, + "2023-01-05T00:00:00.000+0000", + 6.124646, + 2023, + 1, + 5, + "POINT (-82.75 28.25)" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "lon", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "lat", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "time", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "precip", + "type": "\"float\"" + }, + { + "metadata": "{}", + "name": "year", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "month", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "day", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "geom_wkt", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "df_flat_slice = (\n", + " df_nc_slice\n", + " .groupBy(\"content\")\n", + " .applyInPandas(explode_content, schema=flat_schema)\n", + " 
.withColumn(\"year\", F.year(\"time\"))\n", + " .withColumn(\"month\", F.month(\"time\"))\n", + " .withColumn(\"day\", F.dayofmonth(\"time\"))\n", + " .withColumn(\"geom_wkt\", mos.st_astext(mos.st_point(from_360_udf(\"lon\"), \"lat\"))) # <- to -180:180\n", + " .cache()\n", + ")\n", + "\n", + "print(f\"count? {df_flat_slice.count():,}\")\n", + "display(df_flat_slice.limit(5)) # <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9ce1b6f1-a915-4242-87e6-c8878c874a3d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Render average precipitation through the years_\n", + "\n", + "> This is per collected location __[AKA our \"same\" slice in Florida]__. Note: `precip` units are in millimeters." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f9cf3b4e-7c62-4a72-a251-757d8e8e8cb3", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "df_flat_slice_kepler = (\n", + " df_flat_slice\n", + " .groupBy(\"geom_wkt\")\n", + " .agg(F.avg(\"precip\").alias(\"avg_precip\"))\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "7b6e4652-1f4c-45e9-8c6b-dd0624b68be0", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results [essentially same as the screenshot shown in the earlier example]._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8fb07c0d-b42c-4400-9ee3-5270b4dae19a", + "showTitle": false, + 
"title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# df_flat_slice_kepler \"geom_wkt\" \"geometry\" 1_000" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ffe6af84-9f69-4d1b-9a84-3934c1f49bbf", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Option-[b]: Slice as JSON\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4386d7f0-b9a6-4888-a85a-66ffbeaebb73", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Single File Example \n", + "\n", + "> Just to understand before moving to distributed." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7e01a67d-9190-4ba9-901a-0154ab117b7b", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "rows? 372, cols? 1\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
precip
latlontime
28.25277.252023-01-018.654497
2023-01-020.120193
2023-01-030.000000
2023-01-040.000000
2023-01-056.124646
\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
precip
latlontime
28.25277.252023-01-018.654497
2023-01-020.120193
2023-01-030.000000
2023-01-040.000000
2023-01-056.124646
\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "pdf_slice = (\n", + " xds.sel(\n", + " time=slice('2023-01-01','2023-01-31'),\n", + " lat=slice(28.6, 26.9), \n", + " lon=slice(from_180(-83.0), from_180(-80.9))\n", + " ).to_dataframe()\n", + ")\n", + "print(f'rows? {pdf_slice.shape[0]:,}, cols? {pdf_slice.shape[1]}')\n", + "pdf_slice.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "df9986a1-1991-4164-a14a-8c1a433e9bc5", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[38]: [{'columns': ['precip'],\n 'index': [[28.25, 277.25, 1672531200000]],\n 'data': [[8.6544971466]]},\n {'columns': ['precip'],\n 'index': [[28.25, 277.25, 1672617600000]],\n 'data': [[0.1201934367]]},\n {'columns': ['precip'],\n 'index': [[28.25, 277.25, 1672704000000]],\n 'data': [[0.0]]}]" + ] + } + ], + "source": [ + "data_out = pdf_slice.groupby(['lat','lon','time']).apply(lambda x : x.to_json(orient='split')).values.tolist()\n", + "json_data = [json.loads(x) for x in data_out]\n", + "json_data[:3]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "daf7f0f9-dff7-4560-85e7-cc3e028612e7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Get the [spark] schema from the flattened [pandas] sample_\n", + "\n", + "> This will be used in our distributed execution (below)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6a871d20-1cbc-4430-81fc-ee6be65e8dde", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n| nc_json|\n+--------------------+\n|[{[precip], [[8.6...|\n+--------------------+\n\n" + ] + } + ], + "source": [ + "df_slice_json = spark.createDataFrame(pd.DataFrame([[json_data]], columns=['nc_json']))\n", + "df_slice_json.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4e844320-ae01-441b-a26c-0b9fa9a3906e", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[44]: StructType([StructField('nc_json', ArrayType(StructType([StructField('columns', ArrayType(StringType(), True), True), StructField('data', ArrayType(ArrayType(DoubleType(), True), True), True), StructField('index', ArrayType(ArrayType(DoubleType(), True), True), True)]), True), True)])" + ] + } + ], + "source": [ + "df_slice_json.schema" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6d8ff9d7-eaa4-4814-8ea7-012717c4517f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Distributed Example" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "323ab7fd-f855-4678-9ed0-f1f7e27647a9", + "showTitle": false, + 
"title": "" + } + }, + "outputs": [], + "source": [ + "json_schema = (\n", + " StructType([\n", + " StructField(\n", + " 'nc_json', \n", + " ArrayType(\n", + " StructType([\n", + " StructField('columns', ArrayType(StringType(), True), True), \n", + " StructField('data', ArrayType(ArrayType(DoubleType(), True), True), True),\n", + " StructField('index', ArrayType(ArrayType(DoubleType(), True), True), True)\n", + " ]),\n", + " True\n", + " ),\n", + " True\n", + " )\n", + " ])\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fdd5cd72-2cb9-4837-ac50-21fdf4696271", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "def slice_path_json(key, input_pdf: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " slice the `path` column [optimal w/single path]:\n", + " - based on provided time, lat, lon slices\n", + " - Read with XArray using h5netcdf engine\n", + " - drops na values\n", + " - returns slice as json\n", + " Returns pandas dataframe\n", + " \"\"\"\n", + " import io\n", + " import json\n", + " import pandas as pd\n", + " import xarray as xr \n", + "\n", + " # -- iterate over pdf --\n", + " # - this may just be 1 path,\n", + " # depends on groupBy\n", + " # - to further optimize, consider enforcing 1 path\n", + " # and not doing the `pd.concat` call, just returning \n", + " pdf_arr = []\n", + " for index, row in input_pdf.iterrows():\n", + " path_fuse = row['path'].replace(\"dbfs:\",\"/dbfs\")\n", + " xds = xr.open_dataset(path_fuse)\n", + "\n", + " xds_slice = xds\n", + " if 'time_slice' in input_pdf:\n", + " xds_slice = xds_slice.sel(time=slice(*row['time_slice']))\n", + " if 'lat_slice' in input_pdf:\n", + " xds_slice = xds_slice.sel(lat=slice(*row['lat_slice']))\n", + " if 'lon_slice' in input_pdf:\n", + " xds_slice = 
xds_slice.sel(lon=slice(*row['lon_slice']))\n", + " \n", + " if 'time' in xds_slice.dims.keys() and not isinstance(xds_slice.indexes['time'], pd.DatetimeIndex):\n", + " xds_slice['time'] = xds_slice.indexes['time'].to_datetimeindex()\n", + " pdf = xds_slice.to_dataframe() # <- handle drops in xdf for large files\n", + " pdf.dropna(inplace=True)\n", + "\n", + " pdf_arr.append(pdf.groupby(['lat','lon','time']).apply(lambda x : x.to_json(orient='split')))\n", + " \n", + " pdf_list = pd.concat(pdf_arr).values.tolist()\n", + " json_data = [json.loads(x) for x in pdf_list]\n", + " \n", + " return pd.DataFrame([[json_data]], columns=['nc_json'])" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "bd38e3ee-9a03-45ab-8b57-d85ca65c0d61", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 1\n+--------------------+\n| nc_json|\n+--------------------+\n|[{[precip], [[8.6...|\n+--------------------+\n\n" + ] + } + ], + "source": [ + "spark.catalog.clearCache() # <- cache for dev, help avoid recomputes\n", + "\n", + "df_json_slice = (\n", + " df_mos\n", + " .repartition(df_mos.count(), \"path\") # <- repartition is important!\n", + " .withColumn(\n", + " \"time_slice\", \n", + " F.array([F.lit(x) for x in ['2023-01-01', '2023-01-31']])\n", + " )\n", + " .withColumn(\n", + " \"lat_slice\", \n", + " F.array([F.lit(x) for x in [28.6, 26.9]]) # <- max, min\n", + " )\n", + " .withColumn(\n", + " \"lon_slice\", \n", + " F.array([F.lit(x) for x in [from_180(-83.0), from_180(-80.9)]]) # <- min, max ... 
convert to 360 \n", + " )\n", + " .groupBy(\"path\")\n", + " .applyInPandas(slice_path_json, schema=json_schema) # <- applyInPandas UDF\n", + " .filter(F.size(\"nc_json\") > 0)\n", + " .cache()\n", + ")\n", + "\n", + "print(f\"count? {df_json_slice.count():,}\") # <- this is all consolidated into a single json\n", + "df_json_slice.show() # <- not display, too big (see just the one row with results)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ba40df01-eb62-4c55-ad06-94061cacae87", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_We can explode the (consolidated) json with Spark to have a more manageable structure._ " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ba213b31-1ae3-40cc-9c34-6fc240008dde", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 
372\n+--------+----------------+--------------------+\n| columns| data| index|\n+--------+----------------+--------------------+\n|[precip]|[[8.6544971466]]|[[28.25, 277.25, ...|\n|[precip]|[[0.1201934367]]|[[28.25, 277.25, ...|\n|[precip]| [[0.0]]|[[28.25, 277.25, ...|\n|[precip]| [[0.0]]|[[28.25, 277.25, ...|\n|[precip]|[[6.1246461868]]|[[28.25, 277.25, ...|\n|[precip]|[[0.7794950008]]|[[28.25, 277.25, ...|\n|[precip]|[[0.0298616886]]|[[28.25, 277.25, ...|\n|[precip]|[[0.1123939902]]|[[28.25, 277.25, ...|\n|[precip]| [[0.0]]|[[28.25, 277.25, ...|\n|[precip]| [[0.0]]|[[28.25, 277.25, ...|\n|[precip]| [[0.0]]|[[28.25, 277.25, ...|\n|[precip]| [[0.0]]|[[28.25, 277.25, ...|\n|[precip]|[[18.437210083]]|[[28.25, 277.25, ...|\n|[precip]|[[3.1411890984]]|[[28.25, 277.25, ...|\n|[precip]| [[0.0]]|[[28.25, 277.25, ...|\n|[precip]| [[0.0]]|[[28.25, 277.25, ...|\n|[precip]| [[0.0]]|[[28.25, 277.25, ...|\n|[precip]| [[0.0]]|[[28.25, 277.25, ...|\n|[precip]| [[0.0]]|[[28.25, 277.25, ...|\n|[precip]|[[0.4759106636]]|[[28.25, 277.25, ...|\n+--------+----------------+--------------------+\nonly showing top 20 rows\n\n" + ] + } + ], + "source": [ + "df_explode_slice = (\n", + " df_json_slice\n", + " .select(F.explode(\"nc_json\"))\n", + " .select(\"col.*\")\n", + ")\n", + "print(f\"count? {df_explode_slice.count():,}\")\n", + "df_explode_slice.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a9ecc31a-0037-4094-9640-026b2e0c85f9", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_We can further extract a measure from the exploded structure._\n", + "\n", + "> The following shows precipitation, but you could pick another column as needed. 
__Note: There is still some nesting (handled further below).__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2966e6e9-3e33-4415-a51c-f2717263d280", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 372\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
measureindex
8.6544971466List(28.25, 277.25, 1.6725312E12)
0.1201934367List(28.25, 277.25, 1.6726176E12)
0.0List(28.25, 277.25, 1.672704E12)
0.0List(28.25, 277.25, 1.6727904E12)
6.1246461868List(28.25, 277.25, 1.6728768E12)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 8.6544971466, + [ + 28.25, + 277.25, + 1.6725312E12 + ] + ], + [ + 0.1201934367, + [ + 28.25, + 277.25, + 1.6726176E12 + ] + ], + [ + 0.0, + [ + 28.25, + 277.25, + 1.672704E12 + ] + ], + [ + 0.0, + [ + 28.25, + 277.25, + 1.6727904E12 + ] + ], + [ + 6.1246461868, + [ + 28.25, + 277.25, + 1.6728768E12 + ] + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "measure", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "index", + "type": "{\"type\":\"array\",\"elementType\":\"double\",\"containsNull\":true}" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "precip_idx = 1 # <- could be multiple data columns\n", + "\n", + "df_precip_slice = (\n", + " df_explode_slice\n", + " .select(\n", + " F.element_at(\"data\", precip_idx)[0].alias(\"measure\"),\n", + " F.element_at(\"index\", precip_idx).alias(\"index\"),\n", + " )\n", + ")\n", + "\n", + "print(f\"count? 
{df_precip_slice.count():,}\")\n", + "display(df_precip_slice.limit(5)) # <- limiting output for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "8d1b3401-40d6-42a9-8b6b-b15fe17a524b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Here is an example of fully flattening from the precipitation slice._\n", + "\n", + "\n", + "__Notes:__\n", + "\n", + "

\n", + "\n", + "* This standardizes from 0:360 degrees to -180:180, see [here](https://pratiman-91.github.io/2020/08/01/NetCDF-to-GeoTIFF-using-Python.html) for pattern `(lon_360 + 180) % 360 - 180`\n", + "* Also, convert the epoch-millisecond timestamp (stored as a double) to seconds by dividing by `1000` before casting to a timestamp" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "79b77b80-a428-4a78-89c6-8a6315971a80", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 372\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "

timelatlonmeasurelon_360geom_wkt
2023-01-0128.25-82.758.6544971466277.25POINT (-82.75 28.25)
2023-01-0228.25-82.750.1201934367277.25POINT (-82.75 28.25)
2023-01-0328.25-82.750.0277.25POINT (-82.75 28.25)
2023-01-0428.25-82.750.0277.25POINT (-82.75 28.25)
2023-01-0528.25-82.756.1246461868277.25POINT (-82.75 28.25)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "2023-01-01", + 28.25, + -82.75, + 8.6544971466, + 277.25, + "POINT (-82.75 28.25)" + ], + [ + "2023-01-02", + 28.25, + -82.75, + 0.1201934367, + 277.25, + "POINT (-82.75 28.25)" + ], + [ + "2023-01-03", + 28.25, + -82.75, + 0.0, + 277.25, + "POINT (-82.75 28.25)" + ], + [ + "2023-01-04", + 28.25, + -82.75, + 0.0, + 277.25, + "POINT (-82.75 28.25)" + ], + [ + "2023-01-05", + 28.25, + -82.75, + 6.1246461868, + 277.25, + "POINT (-82.75 28.25)" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "time", + "type": "\"date\"" + }, + { + "metadata": "{}", + "name": "lat", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "lon", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "measure", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "lon_360", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "geom_wkt", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "df_precip_slice_flat = (\n", + " df_precip_slice\n", + " .select(\n", + " (F.element_at(\"index\", 3) / 1000.0).cast(\"timestamp\").cast(\"date\").alias(\"time\"),\n", + " F.element_at(\"index\", 1).alias(\"lat\"),\n", + " from_360_udf(F.element_at(\"index\",2)).alias(\"lon\"),\n", + " \"measure\",\n", + " F.element_at(\"index\", 2).alias(\"lon_360\"),\n", + " )\n", + " .select(\"*\", 
mos.st_astext(mos.st_point(\"lon\", \"lat\")).alias(\"geom_wkt\"))\n", + ")\n", + "print(f\"count? {df_precip_slice_flat.count():,}\")\n", + "display(df_precip_slice_flat.limit(5)) # <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2b2a88f5-c99c-4d9e-a9d8-8d736d46aeab", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Render average precipitation through the years_\n", + "\n", + "> This is per collected location __[AKA our \"same\" slice in Florida]__. Note: `precip` units are in millimeters." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "07fa3e84-f2c6-4e2f-9fba-6ae88846c247", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "df_precip_slice_kepler = (\n", + " df_precip_slice_flat\n", + " .groupBy(\"geom_wkt\")\n", + " .agg(F.avg(\"measure\").alias(\"avg_precip\"))\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ac82bc4e-8af0-4e3e-924d-483e891d1558", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results [essentially same as the screenshot shown in the earlier example]._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3ec182af-7be5-470d-a8e1-9d35f674bfd0", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# df_precip_slice_kepler \"geom_wkt\" \"geometry\" 1_000" + ] + }, + { + "cell_type": "markdown", + "metadata": { +
"application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d4ca1918-ad0b-4a44-a2e3-3ad50ecc4419", + "showTitle": false, + "title": "" + } + }, + "source": [ + "#### Databricks Lakehouse can read / write most any data format\n", + "\n", + "> Here are [built-in](https://docs.databricks.com/en/external-data/index.html) formats as well as Mosaic [readers](https://databrickslabs.github.io/mosaic/api/api.html). __Note: best performance with Delta Lake format__, ref [Databricks](https://docs.databricks.com/en/delta/index.html) and [OSS](https://docs.delta.io/latest/index.html) docs for Delta Lake. Beyond built-in formats, Databricks is a platform on which you can install a wide variety of libraries, e.g. [1](https://docs.databricks.com/en/libraries/index.html#python-environment-management) | [2](https://docs.databricks.com/en/compute/compatibility.html) | [3](https://docs.databricks.com/en/init-scripts/index.html).\n", + "\n", + "Example of [reading](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameReader.html?highlight=read#pyspark.sql.DataFrameReader) and [writing](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.html?highlight=pyspark%20sql%20dataframe%20writer#pyspark.sql.DataFrameWriter) a Spark DataFrame with Delta Lake format.\n", + "\n", + "```\n", + "# - `write.format(\"delta\")` is default in Databricks\n", + "# - can save to a specified path in the Lakehouse\n", + "# - can save as a table in the Databricks Metastore\n", + "df.write.save(\"/path/to/delta_table\")\n", + "df.write.saveAsTable(\"my_table\")\n", + "```\n", + "\n", + "Example of loading a Delta Lake Table as a Spark DataFrame.\n", + "\n", + "```\n", + "# - `read.format(\"delta\")` is default in Databricks\n", + "# - can load a specified path in the Lakehouse\n", + "# - can load a table in the Databricks Metastore\n", + "df = spark.read.load(\"/path/to/delta_table\")\n", + "df = spark.table(\"my_table\")\n", + "```\n", + "\n", + "More on
[Unity Catalog](https://docs.databricks.com/en/data-governance/unity-catalog/index.html) in Databricks Lakehouse for Governing [Tables](https://docs.databricks.com/en/data-governance/unity-catalog/index.html#tables) and [Volumes](https://docs.databricks.com/en/data-governance/unity-catalog/index.html#volumes)." + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 85549841921965, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "distributed_slice netcdf_files", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/examples/python/NetCDF/Xarray/single_node_netcdf_files.ipynb b/notebooks/examples/python/NetCDF/Xarray/single_node_netcdf_files.ipynb new file mode 100644 index 000000000..192069872 --- /dev/null +++ b/notebooks/examples/python/NetCDF/Xarray/single_node_netcdf_files.ipynb @@ -0,0 +1,3063 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "08fbb986-c3d0-45c9-9ab7-7fd842c78692", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Single Node: Opening + visualizing a netCDF file (Python)\n", + "\n", + "## Overview\n", + "\n", + "> This notebook demonstrates how to open and explore the netCDF file, visualize the data, and convert it to Pandas as well as Spark DataFrames. This is for users just getting familiar with [netCDF climate and forecast (CF) metadata conventions](http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html). __The example is mostly single node / non-distributed__; until the spark call at the end, it runs only on the cluster driver, which is something like running on a \"laptop in the sky\". 
\n", + "\n", + "## Source Data\n", + "\n", + "The source data is a netCDF file; you can swap in any data you are working with and follow the same basic pattern.\n", + "\n", + "## Prerequisites\n", + "\n", + "Python 3 or later. Python modules: we will add 'netCDF4', 'xarray', 'nc-time-axis', and 'cartopy' ('numpy', 'pandas', and 'matplotlib' are already available).\n", + "\n", + "---\n", + "__Last Update:__ 07 NOV 2023 [Mosaic 0.3.12]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "dbc09cd9-d0b1-48e4-a950-024c387b3a8a", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### 1. Import python modules\n", + "First import the required modules:" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "838d465f-5fcd-40ef-b19e-b9c0e3931027", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Python interpreter will be restarted.\nERROR: pip's dependency resolver does not currently take into account all the packages that are installed.
This behaviour is the source of the following dependency conflicts.\nscipy 1.7.3 requires numpy<1.23.0,>=1.16.5, but you have numpy 1.26.2 which is incompatible.\nPython interpreter will be restarted.\n" + ] + } + ], + "source": [ + "%pip install netCDF4 xarray nc-time-axis cartopy --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "84144f3b-6ef4-457a-be6c-ea67f758b94c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.databricks.v1+bamboolib_hint": "{\"pd.DataFrames\": [], \"version\": \"0.0.1\"}", + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import cartopy.crs as ccrs\n", + "import matplotlib.pyplot as plt\n", + "import netCDF4 as nc\n", + "import numpy as np\n", + "import pandas as pd\n", + "import xarray as xr" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f06b12c2-cd29-451b-b98b-0894e07bba2f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### 2. Read and explore the netCDF file\n", + "\n", + "> Example of reading in the netCDF file -- this is from the [Community Climate\n", + " System Model project](https://www.cesm.ucar.edu/models/ccsm). Printing `ds` displays important information about the dataset, such as *global attributes*, *data dimensions*, and *variable names*. Global attributes in a netCDF file contain information about the data such as data authors, publisher, data contacts, etc.\n", + "\n", + "__DataArray__\n", + "\n", + "xarray.DataArray is an implementation of a labelled, multi-dimensional array for a single variable, such as precipitation, temperature, etc. It has the following key properties:\n", + "\n", + "

\n", + "\n", + "* `values`: a numpy.ndarray holding the array’s values\n", + "* `dims`: dimension names for each axis (e.g., ('lat', 'lon', 'z', 'time'))\n", + "* `coords`: a dict-like container of arrays (coordinates) that label each point (e.g., 1-dim arrays of numbers, DateTime objects, or strings)\n", + "* `attrs`: an OrderedDict to hold arbitrary metadata (attributes)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5d04345d-d214-4cd0-ad1d-4c4842591ba3", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "--2023-11-21 16:12:04-- https://www.unidata.ucar.edu/software/netcdf/examples/sresa1b_ncar_ccsm3-example.nc\nResolving www.unidata.ucar.edu (www.unidata.ucar.edu)... 128.117.149.20\nConnecting to www.unidata.ucar.edu (www.unidata.ucar.edu)|128.117.149.20|:443... connected.\nHTTP request sent, awaiting response... 200 OK\nLength: 2767916 (2.6M) [application/x-netcdf]\nSaving to: ‘sresa1b_ncar_ccsm3-example.nc’\n\n 0K .......... .......... .......... .......... .......... 1% 703K 4s\n 50K .......... .......... .......... .......... .......... 3% 2.03M 2s\n 100K .......... .......... .......... .......... .......... 5% 1.02M 2s\n 150K .......... .......... .......... .......... .......... 7% 1.98M 2s\n 200K .......... .......... .......... .......... .......... 9% 2.05M 2s\n 250K .......... .......... .......... .......... .......... 11% 2.07M 2s\n 300K .......... .......... .......... .......... .......... 12% 2.03M 2s\n 350K .......... .......... .......... .......... .......... 14% 2.02M 2s\n 400K .......... .......... .......... .......... .......... 16% 129M 1s\n 450K .......... .......... .......... .......... .......... 18% 2.05M 1s\n 500K .......... .......... .......... ..........
.......... 20% 229M 1s\n 550K .......... .......... .......... .......... .......... 22% 1.96M 1s\n 600K .......... .......... .......... .......... .......... 24% 90.5M 1s\n 650K .......... .......... .......... .......... .......... 25% 1.94M 1s\n 700K .......... .......... .......... .......... .......... 27% 124M 1s\n 750K .......... .......... .......... .......... .......... 29% 2.07M 1s\n 800K .......... .......... .......... .......... .......... 31% 24.9M 1s\n 850K .......... .......... .......... .......... .......... 33% 124M 1s\n 900K .......... .......... .......... .......... .......... 35% 2.24M 1s\n 950K .......... .......... .......... .......... .......... 36% 19.0M 1s\n 1000K .......... .......... .......... .......... .......... 38% 98.6M 1s\n 1050K .......... .......... .......... .......... .......... 40% 2.28M 1s\n 1100K .......... .......... .......... .......... .......... 42% 14.0M 1s\n 1150K .......... .......... .......... .......... .......... 44% 157M 1s\n 1200K .......... .......... .......... .......... .......... 46% 2.40M 1s\n 1250K .......... .......... .......... .......... .......... 48% 17.6M 0s\n 1300K .......... .......... .......... .......... .......... 49% 81.0M 0s\n 1350K .......... .......... .......... .......... .......... 51% 2.16M 0s\n 1400K .......... .......... .......... .......... .......... 53% 123M 0s\n 1450K .......... .......... .......... .......... .......... 55% 40.4M 0s\n 1500K .......... .......... .......... .......... .......... 57% 2.34M 0s\n 1550K .......... .......... .......... .......... .......... 59% 20.4M 0s\n 1600K .......... .......... .......... .......... .......... 61% 26.4M 0s\n 1650K .......... .......... .......... .......... .......... 62% 127M 0s\n 1700K .......... .......... .......... .......... .......... 64% 2.50M 0s\n 1750K .......... .......... .......... .......... .......... 66% 10.5M 0s\n 1800K .......... .......... .......... .......... .......... 
68% 133M 0s\n 1850K .......... .......... .......... .......... .......... 70% 116M 0s\n 1900K .......... .......... .......... .......... .......... 72% 2.76M 0s\n 1950K .......... .......... .......... .......... .......... 73% 17.8M 0s\n 2000K .......... .......... .......... .......... .......... 75% 12.4M 0s\n 2050K .......... .......... .......... .......... .......... 77% 135M 0s\n 2100K .......... .......... .......... .......... .......... 79% 3.03M 0s\n 2150K .......... .......... .......... .......... .......... 81% 17.0M 0s\n 2200K .......... .......... .......... .......... .......... 83% 15.6M 0s\n 2250K .......... .......... .......... .......... .......... 85% 21.4M 0s\n 2300K .......... .......... .......... .......... .......... 86% 150M 0s\n 2350K .......... .......... .......... .......... .......... 88% 3.02M 0s\n 2400K .......... .......... .......... .......... .......... 90% 18.0M 0s\n 2450K .......... .......... .......... .......... .......... 92% 11.1M 0s\n 2500K .......... .......... .......... .......... .......... 94% 42.3M 0s\n 2550K .......... .......... .......... .......... .......... 96% 172M 0s\n 2600K .......... .......... .......... .......... .......... 98% 3.26M 0s\n 2650K .......... .......... .......... .......... .......... 99% 17.9M 0s\n 2700K ... 
100% 33.4M=0.6s\n\n2023-11-21 16:12:05 (4.36 MB/s) - ‘sresa1b_ncar_ccsm3-example.nc’ saved [2767916/2767916]\n\ntotal 4.0M\ndrwxr-xr-x 2 root root 4.0K Nov 21 16:07 azure\ndrwxr-xr-x 1 root root 4.0K Nov 21 16:07 conf\ndrwxr-xr-x 3 root root 4.0K Nov 21 16:10 eventlogs\n-r-xr-xr-x 1 root root 2.7K Nov 21 16:07 hadoop_accessed_config.lst\ndrwxr-xr-x 2 root root 4.0K Nov 21 16:10 logs\n-r-xr-xr-x 1 root root 1.3M Nov 21 16:07 preload_class.lst\n-rw-r--r-- 1 root root 2.7M Sep 12 2012 sresa1b_ncar_ccsm3-example.nc\n" + ] + } + ], + "source": [ + "%sh \n", + "# - again, this is single node\n", + "# - can just download to the driver and start working with it\n", + "wget https://www.unidata.ucar.edu/software/netcdf/examples/sresa1b_ncar_ccsm3-example.nc\n", + "ls -lh" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2efd02b1-c112-4bed-b8f9-2e8ce415f762", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\nDimensions: (lat: 128, lon: 256, bnds: 2, plev: 17, time: 1)\nCoordinates:\n * lat (lat) float32 -88.93 -87.54 -86.14 -84.74 ... 86.14 87.54 88.93\n * lon (lon) float32 0.0 1.406 2.812 4.219 ... 354.4 355.8 357.2 358.6\n * plev (plev) float64 1e+05 9.25e+04 8.5e+04 7e+04 ... 
3e+03 2e+03 1e+03\n * time (time) object 2000-05-16 12:00:00\nDimensions without coordinates: bnds\nData variables:\n area (lat, lon) float32 ...\n lat_bnds (lat, bnds) float64 ...\n lon_bnds (lon, bnds) float64 ...\n msk_rgn (lat, lon) int32 ...\n pr (time, lat, lon) float32 ...\n tas (time, lat, lon) float32 ...\n time_bnds (time, bnds) object ...\n ua (time, plev, lat, lon) float32 ...\nAttributes: (12/18)\n CVS_Id: $Id$\n creation_date: \n prg_ID: Source file unknown Version unknown Date unknown\n cmd_ln: bds -x 256 -y 128 -m 23 -o /data/zender/data/dst_T85.nc\n history: Tue Oct 25 15:08:51 2005: ncks -O -x -v va -m sresa1...\n table_id: Table A1\n ... ...\n references: Collins, W.D., et al., 2005:\\n The Community Climate...\n acknowledgment: Any use of CCSM data should acknowledge the contrib...\n realization: 1\n experiment_id: 720 ppm stabilization experiment (SRESA1B)\n comment: This simulation was initiated from year 2000 of \\n C...\n model_name_english: NCAR CCSM\n" + ] + } + ], + "source": [ + "ds = xr.open_dataset(\"sresa1b_ncar_ccsm3-example.nc\")\n", + "print(ds)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "549c000f-3860-4a0a-9ba2-169faeac58c8", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray 'tas' (time: 1, lat: 128, lon: 256)>\n",
+       "[32768 values with dtype=float32]\n",
+       "Coordinates:\n",
+       "  * lat      (lat) float32 -88.93 -87.54 -86.14 -84.74 ... 86.14 87.54 88.93\n",
+       "  * lon      (lon) float32 0.0 1.406 2.812 4.219 ... 354.4 355.8 357.2 358.6\n",
+       "  * time     (time) object 2000-05-16 12:00:00\n",
+       "Attributes:\n",
+       "    comment:         Created using NCL code CCSM_atmm_2cf.ncl on\\n machine ea...\n",
+       "    cell_methods:    time: mean (interval: 1 month)\n",
+       "    history:         Added height coordinate\n",
+       "    original_units:  K\n",
+       "    original_name:   TREFHT\n",
+       "    standard_name:   air_temperature\n",
+       "    units:           K\n",
+       "    long_name:       air_temperature\n",
+       "    cell_method:     time: mean
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
<xarray.DataArray 'tas' (time: 1, lat: 128, lon: 256)>\n[32768 values with dtype=float32]\nCoordinates:\n  * lat      (lat) float32 -88.93 -87.54 -86.14 -84.74 ... 86.14 87.54 88.93\n  * lon      (lon) float32 0.0 1.406 2.812 4.219 ... 354.4 355.8 357.2 358.6\n  * time     (time) object 2000-05-16 12:00:00\nAttributes:\n    comment:         Created using NCL code CCSM_atmm_2cf.ncl on\\n machine ea...\n    cell_methods:    time: mean (interval: 1 month)\n    history:         Added height coordinate\n    original_units:  K\n    original_name:   TREFHT\n    standard_name:   air_temperature\n    units:           K\n    long_name:       air_temperature\n    cell_method:     time: mean
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "ds.tas" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "45a3ebde-3845-4d5c-9d02-a52b66ce1210", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### 3. Plot a Variable\n", + "\n", + "> In this case air temperature ('tas') in Kelvins." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4bb705bd-236d-401f-a8db-a14dd1934c90", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[7]: " + ] + }, + { + "output_type": "display_data", + "data": { + "image/png": "\n" + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "\n", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "image" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "# - Example-1: a simple plot\n", + "ax = plt.axes(projection=ccrs.PlateCarree()) # equidistance\n", + "ax.coastlines() \n", + "ds.tas.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c0c48f6c-d63d-418b-a89d-6e8860a185df", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n" + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "\n", + "datasetInfos": [], + "metadata": {}, + 
"removedWidgets": [], + "type": "image" + } + }, + "output_type": "display_data" + } + ], + "source": [ + " # Example-2\n", + " # - pick the center\n", + " # - handle data and plot projection\n", + " p = ds.tas.plot(\n", + " transform=ccrs.PlateCarree(), # data's projection (equidistance)\n", + " col=\"time\", # time\n", + " col_wrap=1, # multiplot settings\n", + " aspect=ds.dims[\"lon\"] / ds.dims[\"lat\"], # for a sensible figsize\n", + " subplot_kws={ # plot projection (conic)\n", + " \"projection\": \n", + " ccrs.LambertConformal(\n", + " central_longitude=-95, central_latitude=45\n", + " )},\n", + ")\n", + "\n", + "# set options\n", + "for ax in p.axs.flat:\n", + " ax.coastlines()\n", + " ax.set_extent([-160, -30, 5, 75])" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3efd3a2a-2653-49cf-a7a2-53643390bd8b", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n" + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "\n", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "image" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "# Example-3: Kelvin to Celsius\n", + "airc = ds.tas - 273.15 \n", + "\n", + "# Figure\n", + "# - can easily adjust this for multiple subplots\n", + "f, ax1 = plt.subplots(1, 1, figsize=(12, 9), sharey=True)\n", + "\n", + "# Selected latitude indices\n", + "# - there are 128 lats, so 64 is around equator\n", + "isel_lats = [32, 64, 96]\n", + "\n", + "# Temperature vs longitude plot \n", + "# - lons are 0..360\n", + "# - we are filtering to just 1 day (in this case that's all there is)\n", + "airc.isel(time=0, lat=isel_lats).plot.line(ax=ax1, hue=\"lat\")\n", + "ax1.set_ylabel(\"°C\")\n", + "\n", + 
"plt.tight_layout()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "384a07a4-8046-4e76-abb2-2e1ad8a1b33a", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.DataArray 'tas' ()>\n",
+       "array(67.66379, dtype=float32)\n",
+       "Coordinates:\n",
+       "    lat      float32 -25.91\n",
+       "    lon      float32 0.0\n",
+       "    time     object 2000-05-16 12:00:00
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
<xarray.DataArray 'tas' ()>\narray(67.66379, dtype=float32)\nCoordinates:\n    lat      float32 -25.91\n    lon      float32 0.0\n    time     object 2000-05-16 12:00:00
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "# convert air temp from kelvins to fahrenheit (for fun)\n", + "airf = (ds.tas * 1.8) - 459.67\n", + "airf[0][45][0] # <- here is a reading at time[0], lat[45], lon[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "0a560e37-48cc-425b-8afb-b72e2fe52d28", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### 4. Output to Pandas and Spark DataFrames\n", + "\n", + "> We see that each variable and coordinate in the Dataset is now a column in the DataFrame, with the exception of indexes which are in the index. To convert the DataFrame to any other convenient representation, use DataFrame methods like `reset_index()`, `stack()` and `unstack()`.\n", + "\n", + "_Note: We can save the Xarray Dataset to various formats, just focusing on Pandas for brevity._" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a92ada6a-438d-427b-984b-38873dc15a10", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__First let's convert time to DateTime Index (from CFTimeIndex)__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6c50c9fd-cac6-4397-bb8b-091ff0105846", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "DatetimeIndex(['2000-05-16 12:00:00'], dtype='datetime64[ns]', freq=None)\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + ":3: RuntimeWarning: Converting a 
CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.\n ds_times = ds.indexes['time'].to_datetimeindex()\n" + ] + } + ], + "source": [ + "from datetime import datetime\n", + "\n", + "ds_times = ds.indexes['time'].to_datetimeindex()\n", + "print(ds_times)\n", + "\n", + "ds['time'] = ds_times" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "812cdcbc-d6f8-4858-a656-b59f9eca945a", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "rows? 1,114,112, cols? 13\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
latlonbndsplevtimearealat_bndslon_bndsmsk_rgnprtastime_bndsua
0-88.9277340.00100000.02000-05-16 12:00:00473460608.0-90.0-0.70312500.000001215.8934942000-05-01 00:00:00NaN
1-88.9277340.0092500.02000-05-16 12:00:00473460608.0-90.0-0.70312500.000001215.8934942000-05-01 00:00:00NaN
2-88.9277340.0085000.02000-05-16 12:00:00473460608.0-90.0-0.70312500.000001215.8934942000-05-01 00:00:00NaN
3-88.9277340.0070000.02000-05-16 12:00:00473460608.0-90.0-0.70312500.000001215.8934942000-05-01 00:00:00NaN
4-88.9277340.0060000.02000-05-16 12:00:00473460608.0-90.0-0.70312500.000001215.8934942000-05-01 00:00:00-0.703753
\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
latlonbndsplevtimearealat_bndslon_bndsmsk_rgnprtastime_bndsua
0-88.9277340.00100000.02000-05-16 12:00:00473460608.0-90.0-0.70312500.000001215.8934942000-05-01 00:00:00NaN
1-88.9277340.0092500.02000-05-16 12:00:00473460608.0-90.0-0.70312500.000001215.8934942000-05-01 00:00:00NaN
2-88.9277340.0085000.02000-05-16 12:00:00473460608.0-90.0-0.70312500.000001215.8934942000-05-01 00:00:00NaN
3-88.9277340.0070000.02000-05-16 12:00:00473460608.0-90.0-0.70312500.000001215.8934942000-05-01 00:00:00NaN
4-88.9277340.0060000.02000-05-16 12:00:00473460608.0-90.0-0.70312500.000001215.8934942000-05-01 00:00:00-0.703753
\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "pdf = ds.to_dataframe().reset_index() # <- pandas\n", + "print(f'rows? {pdf.shape[0]:,}, cols? {pdf.shape[1]}')\n", + "pdf.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "50d35ced-e330-46bb-bd3f-4700a8e97323", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[14]: lat float64\nlon float64\nbnds int64\nplev float64\ntime datetime64[ns]\narea float32\nlat_bnds float64\nlon_bnds float64\nmsk_rgn int32\npr float32\ntas float32\ntime_bnds object\nua float32\ndtype: object" + ] + } + ], + "source": [ + "pdf.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f2e86510-edff-4d96-b34b-a52e21d83d99", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Can convert to a Spark DataFrame.__\n", + "\n", + "> Note: we are dropping \"*_bnds\" and \"msk_rgn\" columns for simplicity." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "370c5f4f-485c-44dc-aa37-0b3ca13b2643", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 1,114,112\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
latlonplevtimeareaprtasua
-88.9277343750.0100000.02000-05-16T12:00:00.000+00004.73460608E81.0915462E-6215.8935null
-88.9277343750.092500.02000-05-16T12:00:00.000+00004.73460608E81.0915462E-6215.8935null
-88.9277343750.085000.02000-05-16T12:00:00.000+00004.73460608E81.0915462E-6215.8935null
-88.9277343750.070000.02000-05-16T12:00:00.000+00004.73460608E81.0915462E-6215.8935null
-88.9277343750.060000.02000-05-16T12:00:00.000+00004.73460608E81.0915462E-6215.8935-0.70375293
-88.9277343750.050000.02000-05-16T12:00:00.000+00004.73460608E81.0915462E-6215.8935-1.4543961
-88.9277343750.040000.02000-05-16T12:00:00.000+00004.73460608E81.0915462E-6215.8935-2.1575398
-88.9277343750.030000.02000-05-16T12:00:00.000+00004.73460608E81.0915462E-6215.8935-2.320977
-88.9277343750.025000.02000-05-16T12:00:00.000+00004.73460608E81.0915462E-6215.8935-1.9883183
-88.9277343750.020000.02000-05-16T12:00:00.000+00004.73460608E81.0915462E-6215.8935-1.5386307
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + -88.927734375, + 0.0, + 100000.0, + "2000-05-16T12:00:00.000+0000", + 4.73460608E8, + 1.0915462E-6, + 215.8935, + null + ], + [ + -88.927734375, + 0.0, + 92500.0, + "2000-05-16T12:00:00.000+0000", + 4.73460608E8, + 1.0915462E-6, + 215.8935, + null + ], + [ + -88.927734375, + 0.0, + 85000.0, + "2000-05-16T12:00:00.000+0000", + 4.73460608E8, + 1.0915462E-6, + 215.8935, + null + ], + [ + -88.927734375, + 0.0, + 70000.0, + "2000-05-16T12:00:00.000+0000", + 4.73460608E8, + 1.0915462E-6, + 215.8935, + null + ], + [ + -88.927734375, + 0.0, + 60000.0, + "2000-05-16T12:00:00.000+0000", + 4.73460608E8, + 1.0915462E-6, + 215.8935, + -0.70375293 + ], + [ + -88.927734375, + 0.0, + 50000.0, + "2000-05-16T12:00:00.000+0000", + 4.73460608E8, + 1.0915462E-6, + 215.8935, + -1.4543961 + ], + [ + -88.927734375, + 0.0, + 40000.0, + "2000-05-16T12:00:00.000+0000", + 4.73460608E8, + 1.0915462E-6, + 215.8935, + -2.1575398 + ], + [ + -88.927734375, + 0.0, + 30000.0, + "2000-05-16T12:00:00.000+0000", + 4.73460608E8, + 1.0915462E-6, + 215.8935, + -2.320977 + ], + [ + -88.927734375, + 0.0, + 25000.0, + "2000-05-16T12:00:00.000+0000", + 4.73460608E8, + 1.0915462E-6, + 215.8935, + -1.9883183 + ], + [ + -88.927734375, + 0.0, + 20000.0, + "2000-05-16T12:00:00.000+0000", + 4.73460608E8, + 1.0915462E-6, + 215.8935, + -1.5386307 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "lat", + "type": "\"double\"" 
+ }, + { + "metadata": "{}", + "name": "lon", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "plev", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "time", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "area", + "type": "\"float\"" + }, + { + "metadata": "{}", + "name": "pr", + "type": "\"float\"" + }, + { + "metadata": "{}", + "name": "tas", + "type": "\"float\"" + }, + { + "metadata": "{}", + "name": "ua", + "type": "\"float\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "df = (\n", + " spark\n", + " .createDataFrame(\n", + " pdf.drop(columns=[\"bnds\", \"time_bnds\", \"lat_bnds\", \"lon_bnds\", \"msk_rgn\"])\n", + " )\n", + " #.distinct() # <- not needed\n", + ")\n", + "print(f\"count? {df.count():,}\")\n", + "df.limit(10).display()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "69db8a1f-dc69-40a0-ad92-9a4e122c75a0", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql import functions as F\n", + "from pyspark.sql.functions import col" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "63304e95-eb36-49a9-9e3a-43d24baa3185", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
lat_degreeplev_avgpr_avgtas_avgua_avg
-8836205.8823529411751.407552143284363E-6217.075331270694730.715708841607833
-8736205.8823529411751.468051382458313E-6219.76243400573731.6284686288521613
-8636205.8823529411751.5245482806180988E-6223.064042150974272.7873747913918647
-8436205.8823529411751.6745170340337623E-6225.779721975326544.2820890932200335
-8336205.8823529411751.8137844463028685E-6227.50417834520345.726601821867347
-8136205.8823529411751.7317814833273104E-6227.75531798601157.4828300487098796
-8036205.8823529411752.092438653955586E-6227.44362866878519.309421336565226
-7936205.8823529411753.095372376971106E-6227.9950541853904710.981656906893834
-7736205.8823529411755.237449556383389E-6229.6767591238021912.499625367135527
-7636205.8823529411757.134318825485764E-6231.3214695453643813.954102117710605
-7436205.8823529411758.911818972112684E-6233.6075807213783315.063077128260037
-7336205.8823529411751.0271408606765675E-5236.791002452373516.332507042848015
-7236205.8823529411751.1347927823646131E-5240.5600757002830517.600392201925374
-7036205.8823529411751.2949680546547881E-5244.9671499729156518.899724136071168
-6936205.8823529411751.5240858808773794E-5249.1242272853851319.85490629388062
-6636205.8823529411752.335175603462858E-5257.027713716030122.567665543444722
-6736205.8823529411751.9055666367506774E-5253.3175278306007421.33403223093976
-6536205.8823529411752.642416576748019E-5260.051078259944924.121534322192588
-6336205.8823529411753.0250296504163998E-5263.9410998821258526.061645990507714
-6236205.8823529411753.363296142566696E-5267.645325303077727.7609099962865
-6036205.8823529411753.658373520920577E-5270.2537860870361329.078597813728265
-5936205.8823529411753.8627924396905655E-5272.071479320526129.962373270187527
-5836205.8823529411753.977384552200647E-5273.323612689971930.45397131307982
-5536205.8823529411753.922717606741344E-5275.2435662746429430.342048638500273
-5636205.8823529411754.0021182847738146E-5274.3404514789581330.584524759324268
-5336205.8823529411753.8147845707214856E-5276.090158581733729.754980638157576
-5236205.8823529411753.6586314010378373E-5276.9588081836700428.917417611693963
-5136205.8823529411753.56162929406878E-5277.869970321655327.919671651674435
-4936205.8823529411753.547756497823684E-5278.820172071456926.817347396863624
-4836205.8823529411753.5915758296312106E-5279.826328396797225.356595777202013
-4536205.8823529411753.8973745198234155E-5281.848015308380122.624973117723833
-4636205.8823529411753.7044281562259584E-5280.8009599447250423.865529830570377
-4436205.8823529411753.937726972225164E-5282.989215731620821.511043656515977
-4236205.8823529411753.800691258071254E-5284.167769193649320.561248390903543
-4136205.8823529411753.761225878129437E-5285.447011113166819.710598713910464
-3936205.8823529411753.737289180705261E-5286.5937551259994518.991576121728247
-3836205.8823529411753.6070513374397706E-5287.539517641067518.40325901697045
-3736205.8823529411753.361257644485249E-5288.425348043441817.92410158707894
-3536205.8823529411753.186514219599701E-5289.186974167823817.534759012915742
-3436205.8823529411753.129685891067879E-5289.8049093484878517.26623534235839
-3236205.8823529411753.1867615610359223E-5290.3188040256517.062277284440455
-3136205.8823529411753.091571664759485E-5290.867923378944416.901856414547666
-3036205.8823529411752.8989225321129908E-5291.5200726985931416.66400538279219
-2836205.8823529411752.7917650087916357E-5292.190698027610816.228677592839944
-2736205.8823529411752.663895256738158E-5292.8712266683578515.591127863293181
-2536205.8823529411752.504328327868066E-5293.586828351020814.647611196294921
-2436205.8823529411752.3760702883679485E-5294.342584013938913.429689861707075
-2336205.8823529411752.2775894214399628E-5295.1134769916534411.923323967396803
-2136205.8823529411752.200307770061638E-5295.817610859870910.232726991354179
-2036205.8823529411752.169744915965087E-5296.45769536495218.451828995326595
-1836205.8823529411752.2563958886465205E-5297.022237658500676.630378416227909
-1736205.8823529411752.3807533069953472E-5297.52763557434084.866297666888033
-1636205.8823529411752.5782454218972674E-5298.021517038345343.1784555008326856
-1436205.8823529411753.0783642704299875E-5298.54498910903931.631938779288713
-1336205.8823529411753.885846786269706E-5299.03565943241120.2651273898080488
-1136205.8823529411754.6388930301947094E-5299.48440730571747-0.8592761338746177
-1036205.8823529411755.611159023176035E-5299.8747184276581-1.7270243766819258
-936205.8823529411756.736855016609145E-5300.064936876297-2.3546500064878466
-736205.8823529411757.888406363054656E-5300.1867402791977-2.7568328289501225
-636205.8823529411758.453085083259815E-5300.21373772621155-2.966069584627585
-436205.8823529411757.22098972909535E-5300.1345340013504-3.0291564527349566
-336205.8823529411756.17253456989264E-5299.9818021059036-2.9979388670644367
-236205.8823529411756.221974441666944E-5299.83054459095-2.9469196503050625
036205.8823529411755.889521054225355E-5299.6397006511688-2.941949666653154
136205.8823529411755.421492519053217E-5299.6047521829605-3.012444794654725
336205.8823529411755.896429902119138E-5299.77970588207245-3.21287120971011
436205.8823529411758.234097119785844E-5299.9592877626419-3.625886762240949
536205.8823529411758.366640793466829E-5299.98875296115875-4.165366051782663
736205.8823529411756.616698960426604E-5299.9502363204956-4.580509089940484
1036205.8823529411754.2954371996362095E-5299.73693549633026-4.771790233790217
836205.8823529411755.5249429811610185E-5299.82291972637177-4.772187882197364
1136205.8823529411753.531368289863557E-5299.5046976804733-4.580618888885045
1236205.8823529411753.0785425306489866E-5299.27507412433624-4.169953896092391
1436205.8823529411752.6882649195368244E-5299.17446398735046-3.549184047331177
1536205.8823529411752.2885766425326167E-5299.21928918361664-2.7441951802861047
1736205.8823529411752.0384059304227802E-5299.26371693611145-1.7883517546554617
1836205.8823529411752.0790263808273264E-5299.3160631656647-0.7174469160352669
1936205.8823529411752.3547091964045168E-5299.147697806358340.43744630606911733
2136205.8823529411752.2832696491790363E-5298.89616751670841.6132969536313366
2236205.8823529411752.178873426439416E-5298.54990839958192.8055290518530933
2436205.8823529411752.0740196643588218E-5298.07691216468813.964794905403176
2536205.8823529411751.774813630320921E-5297.423513650894175.058618271370385
2636205.8823529411751.6223084839307217E-5296.7325634956366.063541283554576
2836205.8823529411751.632987075590908E-5295.838605165481577.008807920183934
2936205.8823529411751.5369384776683278E-5294.664321064949047.868744953782849
3136205.8823529411751.678816827054702E-5293.27915751934058.601543619596798
3236205.8823529411751.7429461815949426E-5291.83599483966839.175114817848021
3336205.8823529411751.8117643652375975E-5290.50454056262979.621874967636325
3536205.8823529411751.955390670117796E-5289.436612486839310.041328847475105
3636205.8823529411752.129450458772386E-5288.681814312934910.42841895321429
3836205.8823529411752.2790912938865093E-5288.1380276679992710.827813545657351
3936205.8823529411752.1596644427357425E-5287.636707305908211.149618975753539
4236205.8823529411752.1708571755651995E-5285.963031888008111.653115103018783
4036205.8823529411752.096218664687788E-5287.0181226730346711.439079338695977
4336205.8823529411752.3629241329370387E-5284.7048951387405411.809871168746234
4536205.8823529411752.5237222078811072E-5283.7178472280502311.85068867350996
4636205.8823529411752.503286689536921E-5282.952270746231111.781795795693562
4736205.8823529411752.563926795229321E-5282.204720973968511.610528457889167
4936205.8823529411752.733612904037841E-5281.443157196044911.357871455883723
5036205.8823529411752.7512512897232E-5280.671113848686211.037044279385107
5236205.8823529411752.7612559133416426E-5280.066786170005810.631961508018351
5336205.8823529411752.7535040423742885E-5279.563749074935910.122838367281862
5436205.8823529411752.7008159129593423E-5279.00816929340369.557985360001865
5636205.8823529411752.52276799308504E-5278.27589380741128.863281032040106
5736205.8823529411752.3354141770681736E-5277.51602780818948.045447692465611
5936205.8823529411752.3222815306311873E-5276.896544337272647.14694831198156
6036205.8823529411752.5113261845177703E-5276.1807048320776.130267364519796
6136205.8823529411752.5752178828497563E-5275.32207179069525.133624380103912
6336205.8823529411752.451439545936296E-5274.540047407150274.231104789147489
6436205.8823529411752.2735587961619785E-5273.78710925579073.4881211109904653
6636205.8823529411752.0525798589687838E-5272.99530339241032.908634777729333
6736205.8823529411751.879029821871825E-5272.164831042289732.503968360230367
6836205.8823529411751.712034589829159E-5271.335730552673342.249409265256777
7036205.8823529411751.4604347343905033E-5270.43958628177642.078202474317823
7136205.8823529411751.1892302612004357E-5269.654697239398961.9389573841849588
7436205.8823529411751.0404651441309198E-5268.723317265510561.7664747506720284
7336205.8823529411751.0299419310744184E-5269.202671408653261.8324613151466245
7536205.8823529411759.97003277625197E-6268.230050444602971.7331573905556625
7736205.8823529411759.39069850725005E-6267.7129188179971.7297824485295452
7836205.8823529411759.353685972879333E-6267.19857645034791.7234866682749332
8036205.8823529411759.348147517407313E-6266.64522719383241.694802076628088
8136205.8823529411758.977633049855882E-6265.928640723228451.6786359874904684
8236205.8823529411758.330678127776991E-6265.47684335708621.708944203726163
8436205.8823529411757.683479836373408E-6265.48405694961551.776776054355016
8536205.8823529411757.2926797098915586E-6265.632208228111271.8246638329583718
8736205.8823529411757.94587144525849E-6265.768729686737061.7291501955064734
8836205.8823529411759.113501825197545E-6265.82334959506991.3286047701541577
8936205.8823529411758.562256367028453E-6265.83681964874270.6457764548329692
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + -88, + 36205.882352941175, + 1.407552143284363E-6, + 217.07533127069473, + 0.715708841607833 + ], + [ + -87, + 36205.882352941175, + 1.468051382458313E-6, + 219.7624340057373, + 1.6284686288521613 + ], + [ + -86, + 36205.882352941175, + 1.5245482806180988E-6, + 223.06404215097427, + 2.7873747913918647 + ], + [ + -84, + 36205.882352941175, + 1.6745170340337623E-6, + 225.77972197532654, + 4.2820890932200335 + ], + [ + -83, + 36205.882352941175, + 1.8137844463028685E-6, + 227.5041783452034, + 5.726601821867347 + ], + [ + -81, + 36205.882352941175, + 1.7317814833273104E-6, + 227.7553179860115, + 7.4828300487098796 + ], + [ + -80, + 36205.882352941175, + 2.092438653955586E-6, + 227.4436286687851, + 9.309421336565226 + ], + [ + -79, + 36205.882352941175, + 3.095372376971106E-6, + 227.99505418539047, + 10.981656906893834 + ], + [ + -77, + 36205.882352941175, + 5.237449556383389E-6, + 229.67675912380219, + 12.499625367135527 + ], + [ + -76, + 36205.882352941175, + 7.134318825485764E-6, + 231.32146954536438, + 13.954102117710605 + ], + [ + -74, + 36205.882352941175, + 8.911818972112684E-6, + 233.60758072137833, + 15.063077128260037 + ], + [ + -73, + 36205.882352941175, + 1.0271408606765675E-5, + 236.7910024523735, + 16.332507042848015 + ], + [ + -72, + 36205.882352941175, + 1.1347927823646131E-5, + 240.56007570028305, + 17.600392201925374 + ], + [ + -70, + 36205.882352941175, + 1.2949680546547881E-5, + 244.96714997291565, + 18.899724136071168 + ], + [ + -69, + 36205.882352941175, + 1.5240858808773794E-5, + 249.12422728538513, + 19.85490629388062 + ], + [ + -66, + 36205.882352941175, + 2.335175603462858E-5, + 257.0277137160301, + 22.567665543444722 + ], + [ + -67, + 36205.882352941175, + 
1.9055666367506774E-5, + 253.31752783060074, + 21.33403223093976 + ], + [ + -65, + 36205.882352941175, + 2.642416576748019E-5, + 260.0510782599449, + 24.121534322192588 + ], + [ + -63, + 36205.882352941175, + 3.0250296504163998E-5, + 263.94109988212585, + 26.061645990507714 + ], + [ + -62, + 36205.882352941175, + 3.363296142566696E-5, + 267.6453253030777, + 27.7609099962865 + ], + [ + -60, + 36205.882352941175, + 3.658373520920577E-5, + 270.25378608703613, + 29.078597813728265 + ], + [ + -59, + 36205.882352941175, + 3.8627924396905655E-5, + 272.0714793205261, + 29.962373270187527 + ], + [ + -58, + 36205.882352941175, + 3.977384552200647E-5, + 273.3236126899719, + 30.45397131307982 + ], + [ + -55, + 36205.882352941175, + 3.922717606741344E-5, + 275.24356627464294, + 30.342048638500273 + ], + [ + -56, + 36205.882352941175, + 4.0021182847738146E-5, + 274.34045147895813, + 30.584524759324268 + ], + [ + -53, + 36205.882352941175, + 3.8147845707214856E-5, + 276.0901585817337, + 29.754980638157576 + ], + [ + -52, + 36205.882352941175, + 3.6586314010378373E-5, + 276.95880818367004, + 28.917417611693963 + ], + [ + -51, + 36205.882352941175, + 3.56162929406878E-5, + 277.8699703216553, + 27.919671651674435 + ], + [ + -49, + 36205.882352941175, + 3.547756497823684E-5, + 278.8201720714569, + 26.817347396863624 + ], + [ + -48, + 36205.882352941175, + 3.5915758296312106E-5, + 279.8263283967972, + 25.356595777202013 + ], + [ + -45, + 36205.882352941175, + 3.8973745198234155E-5, + 281.8480153083801, + 22.624973117723833 + ], + [ + -46, + 36205.882352941175, + 3.7044281562259584E-5, + 280.80095994472504, + 23.865529830570377 + ], + [ + -44, + 36205.882352941175, + 3.937726972225164E-5, + 282.9892157316208, + 21.511043656515977 + ], + [ + -42, + 36205.882352941175, + 3.800691258071254E-5, + 284.1677691936493, + 20.561248390903543 + ], + [ + -41, + 36205.882352941175, + 3.761225878129437E-5, + 285.4470111131668, + 19.710598713910464 + ], + [ + -39, + 36205.882352941175, + 
3.737289180705261E-5, + 286.59375512599945, + 18.991576121728247 + ], + [ + -38, + 36205.882352941175, + 3.6070513374397706E-5, + 287.5395176410675, + 18.40325901697045 + ], + [ + -37, + 36205.882352941175, + 3.361257644485249E-5, + 288.4253480434418, + 17.92410158707894 + ], + [ + -35, + 36205.882352941175, + 3.186514219599701E-5, + 289.1869741678238, + 17.534759012915742 + ], + [ + -34, + 36205.882352941175, + 3.129685891067879E-5, + 289.80490934848785, + 17.26623534235839 + ], + [ + -32, + 36205.882352941175, + 3.1867615610359223E-5, + 290.31880402565, + 17.062277284440455 + ], + [ + -31, + 36205.882352941175, + 3.091571664759485E-5, + 290.8679233789444, + 16.901856414547666 + ], + [ + -30, + 36205.882352941175, + 2.8989225321129908E-5, + 291.52007269859314, + 16.66400538279219 + ], + [ + -28, + 36205.882352941175, + 2.7917650087916357E-5, + 292.1906980276108, + 16.228677592839944 + ], + [ + -27, + 36205.882352941175, + 2.663895256738158E-5, + 292.87122666835785, + 15.591127863293181 + ], + [ + -25, + 36205.882352941175, + 2.504328327868066E-5, + 293.5868283510208, + 14.647611196294921 + ], + [ + -24, + 36205.882352941175, + 2.3760702883679485E-5, + 294.3425840139389, + 13.429689861707075 + ], + [ + -23, + 36205.882352941175, + 2.2775894214399628E-5, + 295.11347699165344, + 11.923323967396803 + ], + [ + -21, + 36205.882352941175, + 2.200307770061638E-5, + 295.8176108598709, + 10.232726991354179 + ], + [ + -20, + 36205.882352941175, + 2.169744915965087E-5, + 296.4576953649521, + 8.451828995326595 + ], + [ + -18, + 36205.882352941175, + 2.2563958886465205E-5, + 297.02223765850067, + 6.630378416227909 + ], + [ + -17, + 36205.882352941175, + 2.3807533069953472E-5, + 297.5276355743408, + 4.866297666888033 + ], + [ + -16, + 36205.882352941175, + 2.5782454218972674E-5, + 298.02151703834534, + 3.1784555008326856 + ], + [ + -14, + 36205.882352941175, + 3.0783642704299875E-5, + 298.5449891090393, + 1.631938779288713 + ], + [ + -13, + 36205.882352941175, + 
3.885846786269706E-5, + 299.0356594324112, + 0.2651273898080488 + ], + [ + -11, + 36205.882352941175, + 4.6388930301947094E-5, + 299.48440730571747, + -0.8592761338746177 + ], + [ + -10, + 36205.882352941175, + 5.611159023176035E-5, + 299.8747184276581, + -1.7270243766819258 + ], + [ + -9, + 36205.882352941175, + 6.736855016609145E-5, + 300.064936876297, + -2.3546500064878466 + ], + [ + -7, + 36205.882352941175, + 7.888406363054656E-5, + 300.1867402791977, + -2.7568328289501225 + ], + [ + -6, + 36205.882352941175, + 8.453085083259815E-5, + 300.21373772621155, + -2.966069584627585 + ], + [ + -4, + 36205.882352941175, + 7.22098972909535E-5, + 300.1345340013504, + -3.0291564527349566 + ], + [ + -3, + 36205.882352941175, + 6.17253456989264E-5, + 299.9818021059036, + -2.9979388670644367 + ], + [ + -2, + 36205.882352941175, + 6.221974441666944E-5, + 299.83054459095, + -2.9469196503050625 + ], + [ + 0, + 36205.882352941175, + 5.889521054225355E-5, + 299.6397006511688, + -2.941949666653154 + ], + [ + 1, + 36205.882352941175, + 5.421492519053217E-5, + 299.6047521829605, + -3.012444794654725 + ], + [ + 3, + 36205.882352941175, + 5.896429902119138E-5, + 299.77970588207245, + -3.21287120971011 + ], + [ + 4, + 36205.882352941175, + 8.234097119785844E-5, + 299.9592877626419, + -3.625886762240949 + ], + [ + 5, + 36205.882352941175, + 8.366640793466829E-5, + 299.98875296115875, + -4.165366051782663 + ], + [ + 7, + 36205.882352941175, + 6.616698960426604E-5, + 299.9502363204956, + -4.580509089940484 + ], + [ + 10, + 36205.882352941175, + 4.2954371996362095E-5, + 299.73693549633026, + -4.771790233790217 + ], + [ + 8, + 36205.882352941175, + 5.5249429811610185E-5, + 299.82291972637177, + -4.772187882197364 + ], + [ + 11, + 36205.882352941175, + 3.531368289863557E-5, + 299.5046976804733, + -4.580618888885045 + ], + [ + 12, + 36205.882352941175, + 3.0785425306489866E-5, + 299.27507412433624, + -4.169953896092391 + ], + [ + 14, + 36205.882352941175, + 2.6882649195368244E-5, + 
299.17446398735046, + -3.549184047331177 + ], + [ + 15, + 36205.882352941175, + 2.2885766425326167E-5, + 299.21928918361664, + -2.7441951802861047 + ], + [ + 17, + 36205.882352941175, + 2.0384059304227802E-5, + 299.26371693611145, + -1.7883517546554617 + ], + [ + 18, + 36205.882352941175, + 2.0790263808273264E-5, + 299.3160631656647, + -0.7174469160352669 + ], + [ + 19, + 36205.882352941175, + 2.3547091964045168E-5, + 299.14769780635834, + 0.43744630606911733 + ], + [ + 21, + 36205.882352941175, + 2.2832696491790363E-5, + 298.8961675167084, + 1.6132969536313366 + ], + [ + 22, + 36205.882352941175, + 2.178873426439416E-5, + 298.5499083995819, + 2.8055290518530933 + ], + [ + 24, + 36205.882352941175, + 2.0740196643588218E-5, + 298.0769121646881, + 3.964794905403176 + ], + [ + 25, + 36205.882352941175, + 1.774813630320921E-5, + 297.42351365089417, + 5.058618271370385 + ], + [ + 26, + 36205.882352941175, + 1.6223084839307217E-5, + 296.732563495636, + 6.063541283554576 + ], + [ + 28, + 36205.882352941175, + 1.632987075590908E-5, + 295.83860516548157, + 7.008807920183934 + ], + [ + 29, + 36205.882352941175, + 1.5369384776683278E-5, + 294.66432106494904, + 7.868744953782849 + ], + [ + 31, + 36205.882352941175, + 1.678816827054702E-5, + 293.2791575193405, + 8.601543619596798 + ], + [ + 32, + 36205.882352941175, + 1.7429461815949426E-5, + 291.8359948396683, + 9.175114817848021 + ], + [ + 33, + 36205.882352941175, + 1.8117643652375975E-5, + 290.5045405626297, + 9.621874967636325 + ], + [ + 35, + 36205.882352941175, + 1.955390670117796E-5, + 289.4366124868393, + 10.041328847475105 + ], + [ + 36, + 36205.882352941175, + 2.129450458772386E-5, + 288.6818143129349, + 10.42841895321429 + ], + [ + 38, + 36205.882352941175, + 2.2790912938865093E-5, + 288.13802766799927, + 10.827813545657351 + ], + [ + 39, + 36205.882352941175, + 2.1596644427357425E-5, + 287.6367073059082, + 11.149618975753539 + ], + [ + 42, + 36205.882352941175, + 2.1708571755651995E-5, + 285.9630318880081, + 
11.653115103018783 + ], + [ + 40, + 36205.882352941175, + 2.096218664687788E-5, + 287.01812267303467, + 11.439079338695977 + ], + [ + 43, + 36205.882352941175, + 2.3629241329370387E-5, + 284.70489513874054, + 11.809871168746234 + ], + [ + 45, + 36205.882352941175, + 2.5237222078811072E-5, + 283.71784722805023, + 11.85068867350996 + ], + [ + 46, + 36205.882352941175, + 2.503286689536921E-5, + 282.9522707462311, + 11.781795795693562 + ], + [ + 47, + 36205.882352941175, + 2.563926795229321E-5, + 282.2047209739685, + 11.610528457889167 + ], + [ + 49, + 36205.882352941175, + 2.733612904037841E-5, + 281.4431571960449, + 11.357871455883723 + ], + [ + 50, + 36205.882352941175, + 2.7512512897232E-5, + 280.6711138486862, + 11.037044279385107 + ], + [ + 52, + 36205.882352941175, + 2.7612559133416426E-5, + 280.0667861700058, + 10.631961508018351 + ], + [ + 53, + 36205.882352941175, + 2.7535040423742885E-5, + 279.5637490749359, + 10.122838367281862 + ], + [ + 54, + 36205.882352941175, + 2.7008159129593423E-5, + 279.0081692934036, + 9.557985360001865 + ], + [ + 56, + 36205.882352941175, + 2.52276799308504E-5, + 278.2758938074112, + 8.863281032040106 + ], + [ + 57, + 36205.882352941175, + 2.3354141770681736E-5, + 277.5160278081894, + 8.045447692465611 + ], + [ + 59, + 36205.882352941175, + 2.3222815306311873E-5, + 276.89654433727264, + 7.14694831198156 + ], + [ + 60, + 36205.882352941175, + 2.5113261845177703E-5, + 276.180704832077, + 6.130267364519796 + ], + [ + 61, + 36205.882352941175, + 2.5752178828497563E-5, + 275.3220717906952, + 5.133624380103912 + ], + [ + 63, + 36205.882352941175, + 2.451439545936296E-5, + 274.54004740715027, + 4.231104789147489 + ], + [ + 64, + 36205.882352941175, + 2.2735587961619785E-5, + 273.7871092557907, + 3.4881211109904653 + ], + [ + 66, + 36205.882352941175, + 2.0525798589687838E-5, + 272.9953033924103, + 2.908634777729333 + ], + [ + 67, + 36205.882352941175, + 1.879029821871825E-5, + 272.16483104228973, + 2.503968360230367 + ], + [ + 68, + 
36205.882352941175, + 1.712034589829159E-5, + 271.33573055267334, + 2.249409265256777 + ], + [ + 70, + 36205.882352941175, + 1.4604347343905033E-5, + 270.4395862817764, + 2.078202474317823 + ], + [ + 71, + 36205.882352941175, + 1.1892302612004357E-5, + 269.65469723939896, + 1.9389573841849588 + ], + [ + 74, + 36205.882352941175, + 1.0404651441309198E-5, + 268.72331726551056, + 1.7664747506720284 + ], + [ + 73, + 36205.882352941175, + 1.0299419310744184E-5, + 269.20267140865326, + 1.8324613151466245 + ], + [ + 75, + 36205.882352941175, + 9.97003277625197E-6, + 268.23005044460297, + 1.7331573905556625 + ], + [ + 77, + 36205.882352941175, + 9.39069850725005E-6, + 267.712918817997, + 1.7297824485295452 + ], + [ + 78, + 36205.882352941175, + 9.353685972879333E-6, + 267.1985764503479, + 1.7234866682749332 + ], + [ + 80, + 36205.882352941175, + 9.348147517407313E-6, + 266.6452271938324, + 1.694802076628088 + ], + [ + 81, + 36205.882352941175, + 8.977633049855882E-6, + 265.92864072322845, + 1.6786359874904684 + ], + [ + 82, + 36205.882352941175, + 8.330678127776991E-6, + 265.4768433570862, + 1.708944203726163 + ], + [ + 84, + 36205.882352941175, + 7.683479836373408E-6, + 265.4840569496155, + 1.776776054355016 + ], + [ + 85, + 36205.882352941175, + 7.2926797098915586E-6, + 265.63220822811127, + 1.8246638329583718 + ], + [ + 87, + 36205.882352941175, + 7.94587144525849E-6, + 265.76872968673706, + 1.7291501955064734 + ], + [ + 88, + 36205.882352941175, + 9.113501825197545E-6, + 265.8233495950699, + 1.3286047701541577 + ], + [ + 89, + 36205.882352941175, + 8.562256367028453E-6, + 265.8368196487427, + 0.6457764548329692 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": 
"lat_degree", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "plev_avg", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pr_avg", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "tas_avg", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "ua_avg", + "type": "\"double\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "Databricks visualization. Run in Databricks to view." + ] + }, + "metadata": { + "application/vnd.databricks.v1.subcommand+json": { + "bindings": {}, + "collapsed": false, + "command": "%python\n__backend_agg_display_orig = display\n__backend_agg_dfs = []\ndef __backend_agg_display_new(df):\n __backend_agg_df_modules = [\"pandas.core.frame\", \"databricks.koalas.frame\", \"pyspark.sql.dataframe\", \"pyspark.pandas.frame\", \"pyspark.sql.connect.dataframe\"]\n if (type(df).__module__ in __backend_agg_df_modules and type(df).__name__ == 'DataFrame') or isinstance(df, list):\n __backend_agg_dfs.append(df)\n\ndisplay = __backend_agg_display_new\n\ndef __backend_agg_user_code_fn():\n import base64\n exec(base64.standard_b64decode(\"ZGlzcGxheSgKICBkZgogICAgLmdyb3VwQnkoRi5leHByKCJjZWlsKGxhdCkgYXMgbGF0X2RlZ3JlZSIpKQogICAgLmFnZygKICAgICAgRi5tZWFuKGNvbCgicGxldiIpKS5hbGlhcygicGxldl9hdmciKSwKICAgICAgRi5tZWFuKGNvbCgicHIiKSkuYWxpYXMoInByX2F2ZyIpLAogICAgICBGLm1lYW4oY29sKCJ0YXMiKSkuYWxpYXMoInRhc19hdmciKSwKICAgICAgRi5tZWFuKGNvbCgidWEiKSkuYWxpYXMoInVhX2F2ZyIpCiAgICApCik=\").decode())\n\ntry:\n # run user code\n __backend_agg_user_code_fn()\n\n #reset display function\n display = __backend_agg_display_orig\n\n if len(__backend_agg_dfs) > 0:\n # create a temp view\n if type(__backend_agg_dfs[0]).__module__ == \"databricks.koalas.frame\":\n # koalas dataframe\n __backend_agg_dfs[0].to_spark().createOrReplaceTempView(\"DatabricksView81cdc06\")\n elif type(__backend_agg_dfs[0]).__module__ == \"pandas.core.frame\" or 
isinstance(__backend_agg_dfs[0], list):\n # pandas dataframe\n spark.createDataFrame(__backend_agg_dfs[0]).createOrReplaceTempView(\"DatabricksView81cdc06\")\n else:\n __backend_agg_dfs[0].createOrReplaceTempView(\"DatabricksView81cdc06\")\n #run backend agg\n display(spark.sql(\"\"\"WITH q AS (select * from DatabricksView81cdc06) SELECT `lat_degree`,`plev_avg`,`pr_avg`,`tas_avg`,`ua_avg` FROM q\"\"\"))\n else:\n displayHTML(\"dataframe no longer exists. If you're using dataframe.display(), use display(dataframe) instead.\")\n\n\nfinally:\n spark.sql(\"drop view if exists DatabricksView81cdc06\")\n display = __backend_agg_display_orig\n del __backend_agg_display_new\n del __backend_agg_display_orig\n del __backend_agg_dfs\n del __backend_agg_user_code_fn\n\n", + "commandTitle": "Visualization 1", + "commandType": "auto", + "commandVersion": 0, + "commentThread": [], + "commentsVisible": false, + "contentSha256Hex": null, + "customPlotOptions": { + "redashChart": [ + { + "key": "type", + "value": "CHART" + }, + { + "key": "options", + "value": { + "alignYAxesAtZero": true, + "coefficient": 1, + "columnConfigurationMap": { + "x": { + "column": "lat_degree", + "id": "column_4bc17458205" + }, + "y": [ + { + "column": "plev_avg", + "id": "column_4bc17458217" + }, + { + "column": "pr_avg", + "id": "column_4bc17458218" + }, + { + "column": "tas_avg", + "id": "column_4bc17458219" + }, + { + "column": "ua_avg", + "id": "column_4bc17458220" + } + ] + }, + "dateTimeFormat": "DD/MM/YYYY HH:mm", + "direction": { + "type": "counterclockwise" + }, + "error_y": { + "type": "data", + "visible": true + }, + "globalSeriesType": "scatter", + "legend": { + "traceorder": "normal" + }, + "missingValuesAsZero": true, + "numberFormat": "0,0[.]00000", + "percentFormat": "0[.]00%", + "series": { + "error_y": { + "type": "data", + "visible": true + }, + "stacking": null + }, + "seriesOptions": { + "plev_avg": { + "type": "scatter", + "yAxis": 0 + }, + "pr_avg": { + "type": "scatter", + 
"yAxis": 0 + }, + "tas_avg": { + "type": "scatter", + "yAxis": 0 + }, + "ua_avg": { + "type": "scatter", + "yAxis": 0 + } + }, + "showDataLabels": false, + "sizemode": "diameter", + "sortX": true, + "sortY": true, + "swappedAxes": false, + "textFormat": "", + "useAggregationsUi": true, + "valuesOptions": {}, + "version": 2, + "xAxis": { + "labels": { + "enabled": true + }, + "type": "-" + }, + "yAxis": [ + { + "type": "logarithmic" + }, + { + "opposite": true, + "type": "-" + } + ] + } + } + ] + }, + "datasetPreviewNameToCmdIdMap": {}, + "diffDeletes": [], + "diffInserts": [], + "displayType": "redashChart", + "error": null, + "errorDetails": null, + "errorSummary": null, + "errorTraceType": null, + "finishTime": 0, + "globalVars": {}, + "guid": "", + "height": "auto", + "hideCommandCode": false, + "hideCommandResult": false, + "iPythonMetadata": null, + "inputWidgets": {}, + "isLockedInExamMode": false, + "latestUser": "a user", + "latestUserId": null, + "listResultMetadata": null, + "metadata": {}, + "nuid": "5a586138-49a5-419a-9517-9cc7c9fbae4e", + "origId": 0, + "parentHierarchy": [], + "pivotAggregation": null, + "pivotColumns": null, + "position": 23.921875, + "resultDbfsErrorMessage": null, + "resultDbfsStatus": "INLINED_IN_TREE", + "results": null, + "showCommandTitle": false, + "startTime": 0, + "state": "input", + "streamStates": {}, + "subcommandOptions": { + "queryPlan": { + "selects": [ + { + "column": "lat_degree", + "type": "column" + }, + { + "column": "plev_avg", + "type": "column" + }, + { + "column": "pr_avg", + "type": "column" + }, + { + "column": "tas_avg", + "type": "column" + }, + { + "column": "ua_avg", + "type": "column" + } + ] + } + }, + "submitTime": 0, + "subtype": "tableResultSubCmd.visualization", + "tableResultIndex": 0, + "useConsistentColors": false, + "version": "CommandV1", + "width": "auto", + "workflows": [], + "xColumns": null, + "yColumns": null + } + }, + "output_type": "display_data" + } + ], + "source": [ + "# You can 
visualize this plot in Databricks\n", + "# - hint: press the '+' button below\n", + "display(\n", + " df\n", + " .groupBy(F.expr(\"ceil(lat) as lat_degree\"))\n", + " .agg(\n", + " F.mean(col(\"plev\")).alias(\"plev_avg\"),\n", + " F.mean(col(\"pr\")).alias(\"pr_avg\"),\n", + " F.mean(col(\"tas\")).alias(\"tas_avg\"),\n", + " F.mean(col(\"ua\")).alias(\"ua_avg\")\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "88fcde55-6ee6-4043-bcad-52226b395d61", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### 5. Databricks Lakehouse can read / write most any data format\n", + "\n", + "> Here are [built-in](https://docs.databricks.com/en/external-data/index.html) formats as well as Mosaic [readers](https://databrickslabs.github.io/mosaic/api/api.html). __Note: best performance with Delta Lake format__, ref [Databricks](https://docs.databricks.com/en/delta/index.html) and [OSS](https://docs.delta.io/latest/index.html) docs for Delta Lake. Beyond built-in formats, Databricks is a platform on which you can install a wide variety of libraries, e.g. 
[1](https://docs.databricks.com/en/libraries/index.html#python-environment-management) | [2](https://docs.databricks.com/en/compute/compatibility.html) | [3](https://docs.databricks.com/en/init-scripts/index.html).\n", + "\n", + "Example of [reading](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameReader.html?highlight=read#pyspark.sql.DataFrameReader) and [writing](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.html?highlight=pyspark%20sql%20dataframe%20writer#pyspark.sql.DataFrameWriter) a Spark DataFrame with Delta Lake format.\n", + "\n", + "```\n", + "# - `write.format(\"delta\")` is default in Databricks\n", + "# - can save to a specified path in the Lakehouse\n", + "# - can save as a table in the Databricks Metastore\n", + "df.write.save(\"\")\n", + "df.write.saveAsTable(\"\")\n", + "```\n", + "\n", + "Example of loading a Delta Lake Table as a Spark DataFrame.\n", + "\n", + "```\n", + "# - `read.format(\"delta\")` is default in Databricks\n", + "# - can load a specified path in the Lakehouse\n", + "# - can load a table in the Databricks Metastore\n", + "spark.read.load(\"\")\n", + "spark.table(\"\")\n", + "```\n", + "\n", + "More on [Unity Catalog](https://docs.databricks.com/en/data-governance/unity-catalog/index.html) in Databricks Lakehouse for Governing [Tables](https://docs.databricks.com/en/data-governance/unity-catalog/index.html#tables) and [Volumes](https://docs.databricks.com/en/data-governance/unity-catalog/index.html#volumes)."
+ ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 2486404516283409, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "single_node_netcdf_files", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/examples/python/NetCDF/load_netcdf_files.py b/notebooks/examples/python/NetCDF/load_netcdf_files.py deleted file mode 100644 index 37d752db7..000000000 --- a/notebooks/examples/python/NetCDF/load_netcdf_files.py +++ /dev/null @@ -1,140 +0,0 @@ -# Databricks notebook source -# MAGIC %md -# MAGIC # Opening and visualizing a netCDF file (Python) - -# COMMAND ---------- - -## Overview - -This notebook demonstrates how to open and explore the netCDF file, visualize the data, and export to a comma-separated file (CSV). This tutorial is intended for the users with novice-level programming skills. However, it is expected that the users familiarize themselves with key aspects of [netCDF climate and forecast (CF) metadata convention](http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html) before starting the tutorials. - -![Volumetric soil moisture at various soil depths](https://raw.githubusercontent.com/ornldaac/netcdf_open_visualize_csv/master/resources/py-nc-visualize.png) - -## Source Data - -The source data is a netCDF file ([soil_moist_20min_Kendall_AZ_n1400.nc](https://daac.ornl.gov/daacdata/eos_land_val/SoilSCAPE/data//soil_moist_20min_Kendall_AZ_n1400.nc)) consisting of volumetric root zone soil moisture data from a location in Kendall, Arizona, USA. This data was collected as a part of SoilSCAPE (Soil moisture Sensing Controller and oPtimal Estimator) project (https://daac.ornl.gov/cgi-bin/dsviewer.pl?ds_id=1339) - -## Prerequisites - -Python 3 or later. 
Python modules: netCDF4, numpy, pandas, matplotlib - -## Tutorial -In this tutorial, we will open and explore the netCDF file, visualize the data, and export to a comma-separated file (CSV). - -### 1. Import python modules -First import the required modules: - -# COMMAND ---------- - -# MAGIC %matplotlib inline -# MAGIC # above generates plots in line within this page -# MAGIC -# MAGIC import pandas as pd # pandas module -# MAGIC import numpy as np # numpy module -# MAGIC import netCDF4 as nc # netcdf module -# MAGIC import matplotlib.pyplot as plt # plot from matplotlib module - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 2. Read and explore the netCDF file -# MAGIC Read in the netCDF file at folder /data/indata/ into 'in_nc'. Printing 'in_nc' displays important information about the data sets, such as *global attributes*, *data dimensions*, and *variable names*. Global attributes in a netCDF file contains information about the data such as data authors, publisher, data contacts, etc. - -# COMMAND ---------- - -in_nc = nc.Dataset("dbfs:/ml/blogs/geospatial/data/netcdf/indata/soil_moist_20min_Kendall_AZ_n1400.nc") # read file -print(in_nc) # print file information - -# COMMAND ---------- - -# MAGIC %md -# MAGIC In the above print output, we can get the variables names and dimension names/sizes. For example, "lat", "lon" variables with geographic coordinates, and "soil moisture" variable with the volumetric soil moisture data. Let us print the location: - -# COMMAND ---------- - -y = in_nc.variables['lat'][:] # read latitutde variable -x = in_nc.variables['lon'][:] # read longitude variable -print("Latitude: %.5f, Longitude: %.5f" % (y,x)) # print latitutde, longitude - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 3. Read in variables and check attributes -# MAGIC In this step we will read in variables we are interested in and print their attributes (e.g. units of measurements, detailed names etc). 
- -# COMMAND ---------- - -soil_moisture = in_nc.variables['soil_moisture'][:] # read soil moisture variable -print(in_nc.variables['soil_moisture']) # print the variable attributes - -# COMMAND ---------- - -depth = in_nc.variables['depth'][:] # read depth variable -print(in_nc.variables['depth']) # print the variable attributes - -# COMMAND ---------- - -time = in_nc.variables['time'][:] # read time variable -print(in_nc.variables['time']) # print the variable attributes - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 4. Convert time values -# MAGIC As you may have noticed, the time units in most netCDFs are relative to a fixed date (e.g. minutes since 2011-01-01 in this case). To convert it to corresponding meaningful date/time values, we will use 'num2date' command: - -# COMMAND ---------- - -time_unit = in_nc.variables["time"].getncattr('units') # first read the 'units' attributes from the variable time -time_cal = in_nc.variables["time"].getncattr('calendar') # read calendar type -local_time = nc.num2date(time, units=time_unit, calendar=time_cal) # convert time -print("Original time %s is now converted as %s" % (time[0], local_time[0])) # check conversion - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 5. Create daily average soil moisture plot -# MAGIC To create soil moisture plots aggregated by day, we will first put the data into a *pandas dataframe*, which let you organize data in a meaningful tabular data structure and does time aggregation easily. - -# COMMAND ---------- - -sm_df = pd.DataFrame(soil_moisture, columns=depth, index=local_time.tolist()) # read into pandas dataframe -print(sm_df[:5]) # print the first 5 rows of dataframe - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Now we will convert the original (~ half-hourly) data to daily using *Pandas's TimeGrouper"*. '1D' means daily, '6M' means six-monthly etc. More aliases are listed [here](http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases). 
Notice that we are using "numpy's nanmean" instead of "mean" to exclude all NaN values. Ignore any run time warning messages. - -# COMMAND ---------- - -sm_df_daily = sm_df.groupby(pd.TimeGrouper('1D')).aggregate(np.nanmean) # convert to daily. -print(sm_df_daily[:5]) # print the first 5 rows - -# COMMAND ---------- - -# MAGIC %md -# MAGIC We will now create plot of daily time series of soil moisture measured at soil depths (5, 15 and 30cm) using python's matplotlib module: - -# COMMAND ---------- - -ylabel_name = in_nc.variables["soil_moisture"].getncattr('long_name') + ' (' + \ - in_nc.variables["soil_moisture"].getncattr('units') + ')' # Label for y-axis -series_name = in_nc.variables["depth"].getncattr('long_name') + ' (' + \ - in_nc.variables["depth"].getncattr('units') + ')' # Legend title -# plot -plt.figure() -sm_df_daily.plot() -plt.legend(title=series_name) -plt.ylabel(ylabel_name) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 6. Output to CSV -# MAGIC We can also convert pandas dataframes (both daily and original) to separate comma-separated-values(CSV) files, to be used for further analysis, etc. 
- -# COMMAND ---------- - -sm_df_daily.to_csv("dbfs:/ml/blogs/geospatial/data/netcdf/outdata/daily_soilscape.csv", index_label="DateTime") # Daily -sm_df.to_csv("dbfs:/ml/blogs/geospatial/data/netcdf/outdata/original_soilscape.csv", index_label="DateTime") # Original diff --git a/notebooks/examples/python/Quickstart/QuickstartNotebook.ipynb b/notebooks/examples/python/Quickstart/QuickstartNotebook.ipynb new file mode 100644 index 000000000..d45e3b986 --- /dev/null +++ b/notebooks/examples/python/Quickstart/QuickstartNotebook.ipynb @@ -0,0 +1,2308 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "db1d4ea7-d138-4740-ac41-74998430b3df", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Mosaic Quickstart\n", + "\n", + "> Perform a point-in-polygon spatial join between NYC Taxi trips and zones. __Note: this does not get into performance tweaks that are available for scaled joins.__\n", + "\n", + "1. To use Databricks Labs [Mosaic](https://databrickslabs.github.io/mosaic/index.html) library for geospatial data engineering, analysis, and visualization functionality:\n", + " * Install with `%pip install databricks-mosaic`\n", + " * Import and use with the following:\n", + " ```\n", + " import mosaic as mos\n", + " mos.enable_mosaic(spark, dbutils)\n", + " ```\n", + "

\n", + "\n", + "2. To use [KeplerGl](https://kepler.gl/) OSS library for map layer rendering:\n", + " * Already installed with Mosaic, use `%%mosaic_kepler` magic [[Mosaic Docs](https://databrickslabs.github.io/mosaic/usage/kepler.html)]\n", + " * Import with `from keplergl import KeplerGl` to use directly\n", + "\n", + "If you have trouble with Volume access:\n", + "\n", + "* For Mosaic 0.3 series (< DBR 13) - you can copy resources to DBFS as a workaround\n", + "* For Mosaic 0.4 series (DBR 13.3 LTS) - you will need to either copy resources to DBFS or set up for Unity Catalog + Shared Access, which will involve your workspace admin. Instructions, as updated, will be [here](https://databrickslabs.github.io/mosaic/usage/install-gdal.html).\n", + "\n", + "--- \n", + " __Last Update__ 28 NOV 2023 [Mosaic 0.3.12]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d6cbd7f0-cfa1-41f9-88dc-dccd355343d4", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Install Mosaic\n", + "\n", + "> The Mosaic framework is available via pip install and comes with bindings for Python, SQL, Scala and R. The wheel file that comes with the pip installation registers any necessary JARs for other language support."
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2fb6d01c-9da4-471a-b765-eb4578600eb1", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Python interpreter will be restarted.\nPython interpreter will be restarted.\n" + ] + } + ], + "source": [ + "%pip install \"databricks-mosaic<0.4,>=0.3\" --quiet # <- Mosaic 0.3 series\n", + "# %pip install \"databricks-mosaic<0.5,>=0.4\" --quiet # <- Mosaic 0.4 series (as available)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3444513e-7349-4159-8dda-c5bff1225a12", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# -- configure AQE for more compute heavy operations\n", + "# - choose option-1 or option-2 below, essential for REPARTITION!\n", + "# spark.conf.set(\"spark.databricks.optimizer.adaptive.enabled\", False) # <- option-1: turn off completely for full control\n", + "spark.conf.set(\"spark.sql.adaptive.coalescePartitions.enabled\", False) # <- option-2: just tweak partition management\n", + "spark.conf.set(\"spark.sql.shuffle.partitions\", 1_024) # <-- default is 200\n", + "\n", + "# -- import databricks + spark functions\n", + "from pyspark.sql import functions as F\n", + "from pyspark.sql.functions import col, udf\n", + "from pyspark.sql.types import *\n", + "\n", + "# -- setup mosaic\n", + "import mosaic as mos\n", + "\n", + "mos.enable_mosaic(spark, dbutils)\n", + "# mos.enable_gdal(spark) # <- not needed for this example\n", + "\n", + "# --other imports\n", + "import os\n", + "import pathlib\n", + "import requests\n", + "import warnings\n", + 
"\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9251d814-2e9f-4287-8fd9-769f0bf40c68", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup Data" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c1f01df5-a33a-4d99-91f3-3dbe066ecc98", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial data stored in '/tmp/mosaic/mjohns@databricks.com'\n" + ] + } + ], + "source": [ + "user_name = dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()\n", + "\n", + "data_dir = f\"/tmp/mosaic/{user_name}\"\n", + "print(f\"Initial data stored in '{data_dir}'\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "293022d3-40c6-4928-946b-7bfe8a6fbb1e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Download NYC Taxi Zones\n", + "\n", + "> Make sure we have New York City Taxi zone shapes available in our environment." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e3283b3f-4b81-459c-b513-2d782e9acc7f", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "ZONE_DIR_FUSE? 
'/dbfs/tmp/mosaic/mjohns@databricks.com/taxi_zones'\n" + ] + } + ], + "source": [ + "zone_dir = f\"{data_dir}/taxi_zones\"\n", + "zone_dir_fuse = f\"/dbfs{zone_dir}\"\n", + "dbutils.fs.mkdirs(zone_dir)\n", + "\n", + "os.environ['ZONE_DIR_FUSE'] = zone_dir_fuse\n", + "print(f\"ZONE_DIR_FUSE? '{zone_dir_fuse}'\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "941175c3-4142-4254-b47d-c8e813dfe95c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "...skipping '/dbfs/tmp/mosaic/mjohns@databricks.com/taxi_zones/nyc_taxi_zones.geojson', already exits.\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "

pathnamesizemodificationTime
dbfs:/tmp/mosaic/mjohns@databricks.com/taxi_zones/nyc_taxi_zones.geojsonnyc_taxi_zones.geojson38924781701183475000
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "dbfs:/tmp/mosaic/mjohns@databricks.com/taxi_zones/nyc_taxi_zones.geojson", + "nyc_taxi_zones.geojson", + 3892478, + 1701183475000 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "path", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "name", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "size", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "modificationTime", + "type": "\"long\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "zone_url = 'https://data.cityofnewyork.us/api/geospatial/d3c5-ddgc?method=export&format=GeoJSON'\n", + "\n", + "zone_fuse_path = pathlib.Path(zone_dir_fuse) / 'nyc_taxi_zones.geojson'\n", + "if not zone_fuse_path.exists():\n", + " req = requests.get(zone_url)\n", + " with open(zone_fuse_path, 'wb') as f:\n", + " f.write(req.content)\n", + "else:\n", + " print(f\"...skipping '{zone_fuse_path}', already exits.\")\n", + "\n", + "display(dbutils.fs.ls(zone_dir))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "56b5edd3-3e43-428b-b13d-752c2a3a18a3", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Initial Taxi Zone from GeoJSON [Polygons]\n", + "\n", + "> With the functionality Mosaic brings we can easily load GeoJSON files. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "172476a3-aa95-45cd-91e8-040d865b37d1", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 263\n+-----------------+--------------------+--------------------+--------------------+\n| type| properties| json_geometry| geometry|\n+-----------------+--------------------+--------------------+--------------------+\n|FeatureCollection|{EWR, 1, 1, 0.000...|{\"coordinates\":[[...|MULTIPOLYGON (((-...|\n|FeatureCollection|{Queens, 2, 2, 0....|{\"coordinates\":[[...|MULTIPOLYGON (((-...|\n|FeatureCollection|{Bronx, 3, 3, 0.0...|{\"coordinates\":[[...|MULTIPOLYGON (((-...|\n+-----------------+--------------------+--------------------+--------------------+\n\n" + ] + } + ], + "source": [ + "neighbourhoods = (\n", + " spark.read\n", + " .option(\"multiline\", \"true\")\n", + " .format(\"json\")\n", + " .load(zone_dir)\n", + " .select(\"type\", explode(col(\"features\")).alias(\"feature\"))\n", + " .select(\"type\", col(\"feature.properties\").alias(\"properties\"), to_json(col(\"feature.geometry\")).alias(\"json_geometry\"))\n", + " .withColumn(\"geometry\", mos.st_aswkt(mos.st_geomfromgeojson(\"json_geometry\")))\n", + ")\n", + "\n", + "print(f\"count? 
{neighbourhoods.count():,}\")\n", + "neighbourhoods.limit(3).show() # <- limiting + show for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3738db02-9bc9-437b-a723-00c438a6fb87", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Compute some basic geometry attributes" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "5a4d0baf-a36f-4e15-828d-991b46b76a02", + "showTitle": false, + "title": "" + } + }, + "source": [ + "Mosaic provides a number of functions for extracting the properties of geometries. Here are some that are relevant to Polygon geometries:" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "31304e6e-07ec-4179-bcc3-1dd94e1eb756", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+--------------------+-------------------+\n| geometry| calculatedArea| calculatedLength|\n+--------------------+--------------------+-------------------+\n|MULTIPOLYGON (((-...|7.823067885002558E-4|0.11635745318867867|\n|MULTIPOLYGON (((-...|0.001422779097814599| 0.8431218810128791|\n|MULTIPOLYGON (((-...|3.144141568206508E-4| 0.0843411059010578|\n+--------------------+--------------------+-------------------+\n\n" + ] + } + ], + "source": [ + "display(\n", + " neighbourhoods\n", + " .withColumn(\"calculatedArea\", mos.st_area(col(\"geometry\")))\n", + " .withColumn(\"calculatedLength\", mos.st_length(col(\"geometry\")))\n", + " # Note: The unit of measure of the area and length depends on the CRS used.\n", + " # For GPS locations it will be square radians and 
radians\n", + " .select(\"geometry\", \"calculatedArea\", \"calculatedLength\")\n", + " .limit(3)\n", + " .show() # <- limiting + show for ipynb only\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "704024a0-8171-4218-a95e-ff8ef6ace37c", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Initial Trips Data [Points]\n", + "\n", + "> We will load some Taxi trips data to represent point data; this data is coming from Databricks public datasets available in your environment. __Note: this is 1.6 billion trips as-is; while it is no problem to process this, to keep this to a quickstart level, we are going to use just 1/10th of 1% or ~1.6 million.__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "909bb39e-22c8-46e2-a537-23f6db32ebc6", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 1,609,513\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
row_idvendor_idpickup_datetimedropoff_datetimepassenger_counttrip_distancepickup_longitudepickup_latituderate_code_iddropoff_longitudedropoff_latitudefare_amountextramta_taxtip_amounttolls_amounttotal_amountpickup_geomdropoff_geom
2129013916481179996CMT2012-09-22T11:15:03.000+00002012-09-22T11:21:08.000+000011.0-73.9712840.7642321-73.95890740.7688856.00.00.50.00.06.5POINT (-73.97128 40.764232)POINT (-73.958907 40.768885)
-187451197385687112CMT2012-09-22T11:57:33.000+00002012-09-22T12:05:04.000+000021.1-73.9681240.7592431-73.96028740.7738727.50.00.50.00.08.0POINT (-73.96812 40.759243)POINT (-73.960287 40.773872)
-5307413836615879790CMT2012-09-22T13:19:36.000+00002012-09-22T13:29:27.000+000011.8-73.96730140.7602731-73.94736340.7756889.00.00.50.00.09.5POINT (-73.967301 40.760273)POINT (-73.947363 40.775688)
-8038135963231314765CMT2012-09-22T15:23:44.000+00002012-09-22T15:27:24.000+000011.0-73.97370840.7662341-73.98404240.7493045.00.00.50.00.05.5POINT (-73.973708 40.766234)POINT (-73.984042 40.749304)
9153652317280220397CMT2012-09-22T15:38:33.000+00002012-09-22T15:44:19.000+000011.0-73.97251740.7540821-73.97818740.744666.00.00.50.00.06.5POINT (-73.972517 40.754082)POINT (-73.978187 40.74466)
7276234971418044547CMT2012-09-22T16:36:46.000+00002012-09-22T16:40:46.000+000010.7-73.9688340.7672831-73.97005740.7630125.00.00.50.00.05.5POINT (-73.96883 40.767283)POINT (-73.970057 40.763012)
-5272651262997334143CMT2012-09-22T18:51:39.000+00002012-09-22T19:01:32.000+000021.5-73.965840.7625051-73.98477540.7568578.50.00.50.00.09.0POINT (-73.9658 40.762505)POINT (-73.984775 40.756857)
6354727495669002224CMT2012-09-22T19:34:39.000+00002012-09-22T19:52:28.000+000013.1-73.96528640.7591021-73.98143440.78625115.00.00.50.00.015.5POINT (-73.965286 40.759102)POINT (-73.981434 40.786251)
6519124484930989871CMT2012-09-22T19:30:54.000+00002012-09-22T19:35:05.000+000020.9-73.96946540.7666111-73.98254240.7726015.00.00.50.00.05.5POINT (-73.969465 40.766611)POINT (-73.982542 40.772601)
-5496030646103797643CMT2012-09-22T19:47:41.000+00002012-09-22T20:06:02.000+000034.7-73.92642540.7657561-73.98676140.74807417.50.00.50.00.018.0POINT (-73.926425 40.765756)POINT (-73.986761 40.748074)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 2129013916481179996, + "CMT", + "2012-09-22T11:15:03.000+0000", + "2012-09-22T11:21:08.000+0000", + 1, + 1.0, + -73.97128, + 40.764232, + 1, + -73.958907, + 40.768885, + 6.0, + 0.0, + 0.5, + 0.0, + 0.0, + 6.5, + "POINT (-73.97128 40.764232)", + "POINT (-73.958907 40.768885)" + ], + [ + -187451197385687112, + "CMT", + "2012-09-22T11:57:33.000+0000", + "2012-09-22T12:05:04.000+0000", + 2, + 1.1, + -73.96812, + 40.759243, + 1, + -73.960287, + 40.773872, + 7.5, + 0.0, + 0.5, + 0.0, + 0.0, + 8.0, + "POINT (-73.96812 40.759243)", + "POINT (-73.960287 40.773872)" + ], + [ + -5307413836615879790, + "CMT", + "2012-09-22T13:19:36.000+0000", + "2012-09-22T13:29:27.000+0000", + 1, + 1.8, + -73.967301, + 40.760273, + 1, + -73.947363, + 40.775688, + 9.0, + 0.0, + 0.5, + 0.0, + 0.0, + 9.5, + "POINT (-73.967301 40.760273)", + "POINT (-73.947363 40.775688)" + ], + [ + -8038135963231314765, + "CMT", + "2012-09-22T15:23:44.000+0000", + "2012-09-22T15:27:24.000+0000", + 1, + 1.0, + -73.973708, + 40.766234, + 1, + -73.984042, + 40.749304, + 5.0, + 0.0, + 0.5, + 0.0, + 0.0, + 5.5, + "POINT (-73.973708 40.766234)", + "POINT (-73.984042 40.749304)" + ], + [ + 9153652317280220397, + "CMT", + "2012-09-22T15:38:33.000+0000", + "2012-09-22T15:44:19.000+0000", + 1, + 1.0, + -73.972517, + 40.754082, + 1, + -73.978187, + 40.74466, + 6.0, + 0.0, + 0.5, + 0.0, + 0.0, + 6.5, + "POINT (-73.972517 40.754082)", + "POINT (-73.978187 40.74466)" + ], + [ + 7276234971418044547, + "CMT", + "2012-09-22T16:36:46.000+0000", + "2012-09-22T16:40:46.000+0000", + 1, + 0.7, + -73.96883, + 40.767283, + 1, + -73.970057, + 40.763012, + 5.0, + 0.0, + 0.5, + 0.0, + 0.0, + 5.5, + "POINT (-73.96883 40.767283)", + "POINT (-73.970057 
40.763012)" + ], + [ + -5272651262997334143, + "CMT", + "2012-09-22T18:51:39.000+0000", + "2012-09-22T19:01:32.000+0000", + 2, + 1.5, + -73.9658, + 40.762505, + 1, + -73.984775, + 40.756857, + 8.5, + 0.0, + 0.5, + 0.0, + 0.0, + 9.0, + "POINT (-73.9658 40.762505)", + "POINT (-73.984775 40.756857)" + ], + [ + 6354727495669002224, + "CMT", + "2012-09-22T19:34:39.000+0000", + "2012-09-22T19:52:28.000+0000", + 1, + 3.1, + -73.965286, + 40.759102, + 1, + -73.981434, + 40.786251, + 15.0, + 0.0, + 0.5, + 0.0, + 0.0, + 15.5, + "POINT (-73.965286 40.759102)", + "POINT (-73.981434 40.786251)" + ], + [ + 6519124484930989871, + "CMT", + "2012-09-22T19:30:54.000+0000", + "2012-09-22T19:35:05.000+0000", + 2, + 0.9, + -73.969465, + 40.766611, + 1, + -73.982542, + 40.772601, + 5.0, + 0.0, + 0.5, + 0.0, + 0.0, + 5.5, + "POINT (-73.969465 40.766611)", + "POINT (-73.982542 40.772601)" + ], + [ + -5496030646103797643, + "CMT", + "2012-09-22T19:47:41.000+0000", + "2012-09-22T20:06:02.000+0000", + 3, + 4.7, + -73.926425, + 40.765756, + 1, + -73.986761, + 40.748074, + 17.5, + 0.0, + 0.5, + 0.0, + 0.0, + 18.0, + "POINT (-73.926425 40.765756)", + "POINT (-73.986761 40.748074)" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "row_id", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "vendor_id", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "pickup_datetime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "dropoff_datetime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "passenger_count", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "trip_distance", + "type": "\"double\"" + }, + { + "metadata": "{}", + 
"name": "pickup_longitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_latitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "rate_code_id", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "dropoff_longitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "dropoff_latitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "fare_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "extra", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "mta_tax", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "tip_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "tolls_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "total_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_geom", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "dropoff_geom", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "trips = (\n", + " spark.table(\"delta.`/databricks-datasets/nyctaxi/tables/nyctaxi_yellow`\")\n", + " # - .1% sample\n", + " .sample(.001)\n", + " .drop(\"vendorId\", \"rateCodeId\", \"store_and_fwd_flag\", \"payment_type\")\n", + " .withColumn(\"pickup_geom\", mos.st_astext(mos.st_point(col(\"pickup_longitude\"), col(\"pickup_latitude\"))))\n", + " .withColumn(\"dropoff_geom\", mos.st_astext(mos.st_point(col(\"dropoff_longitude\"), col(\"dropoff_latitude\"))))\n", + " # - adding a row id\n", + " .selectExpr(\n", + " \"xxhash64(pickup_datetime, dropoff_datetime, pickup_geom, dropoff_geom) as row_id\", \"*\"\n", + " )\n", + ")\n", + "print(f\"count? 
{trips.count():,}\")\n", + "trips.limit(10).display() # <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d905fbe9-5104-4a09-bdee-4b5adba23bcf", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Spatial Joins\n", + "\n", + "> We can use Mosaic to perform spatial joins both with and without Mosaic indexing strategies. Indexing is very important when handling geometries that differ greatly in both size and shape (i.e., number of vertices)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e9ff1fa2-ca0b-4472-8c8a-1b317da11e76", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Getting the optimal resolution\n", + "\n", + "> We can use Mosaic functionality to identify how to best index our data based on the data inside the specific dataframe. Selecting an appropriate indexing resolution can have a considerable impact on performance."
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2bff2755-9a4f-481b-a00f-93e0fd2ebd8a", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Optimal resolution is 9\n" + ] + } + ], + "source": [ + "from mosaic import MosaicFrame\n", + "\n", + "neighbourhoods_mosaic_frame = MosaicFrame(neighbourhoods, \"geometry\")\n", + "optimal_resolution = neighbourhoods_mosaic_frame.get_optimal_resolution(sample_fraction=0.75)\n", + "\n", + "print(f\"Optimal resolution is {optimal_resolution}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "b28464a3-f420-4264-b58b-a7e7d79329ad", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> Not every resolution will yield performance improvements. As a rule of thumb, it is always better to under-index than over-index - if unsure, select a lower resolution. Higher resolutions are needed when we have very imbalanced geometries with respect to their size or with respect to the number of vertices. In such cases, indexing with more indices will considerably increase the parallel nature of the operations. You can think of Mosaic as a way to partition an overly complex row into multiple rows that have a balanced amount of computation each." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6129062a-c951-4e1d-aac7-e146225dc9d8", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
resolutionmean_index_areamean_geometry_areapercentile_25_geometry_areapercentile_50_geometry_areapercentile_75_geometry_area
101.613031989295642E-6209.2813735025822366.88836237217488140.47899017489053277.457697890302
91.129122336468363E-529.897340555608889.55548081312268720.06842816384189739.63681595151253
112.3043288507855827E-71464.9712436152888468.2190572805376983.35402474692811942.206045030451
87.90392229801954E-54.2710130172709331.36505729876610032.86689439030573875.662355037138002
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 10, + 1.613031989295642E-6, + 209.28137350258223, + 66.88836237217488, + 140.47899017489053, + 277.457697890302 + ], + [ + 9, + 1.129122336468363E-5, + 29.89734055560888, + 9.555480813122687, + 20.068428163841897, + 39.63681595151253 + ], + [ + 11, + 2.3043288507855827E-7, + 1464.9712436152888, + 468.2190572805376, + 983.3540247469281, + 1942.206045030451 + ], + [ + 8, + 7.90392229801954E-5, + 4.271013017270933, + 1.3650572987661003, + 2.8668943903057387, + 5.662355037138002 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "resolution", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "mean_index_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "mean_geometry_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "percentile_25_geometry_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "percentile_50_geometry_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "percentile_75_geometry_area", + "type": "\"double\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "display(\n", + " neighbourhoods_mosaic_frame.get_resolution_metrics(sample_rows=150)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "eb868752-f425-4c70-aab9-9a7f0d45f049", + 
"showTitle": false, + "title": "" + } + }, + "source": [ + "### Indexing using the optimal resolution\n", + "\n", + "> We will use Mosaic SQL functions to index our points data. Here we will use resolution 9; the appropriate index resolution depends on the dataset in use." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0ce69d9e-be91-4b8d-bd29-50fa9f569e9f", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
row_idpickup_h3dropoff_h3vendor_idpickup_datetimedropoff_datetimepassenger_counttrip_distancepickup_longitudepickup_latituderate_code_iddropoff_longitudedropoff_latitudefare_amountextramta_taxtip_amounttolls_amounttotal_amountpickup_geomdropoff_geom
2129013916481179996617733123866886143617733122576875519CMT2012-09-22T11:15:03.000+00002012-09-22T11:21:08.000+000011.0-73.9712840.7642321-73.95890740.7688856.00.00.50.00.06.5POINT (-73.97128 40.764232)POINT (-73.958907 40.768885)
-187451197385687112617733123876847615617733122577661951CMT2012-09-22T11:57:33.000+00002012-09-22T12:05:04.000+000021.1-73.9681240.7592431-73.96028740.7738727.50.00.50.00.08.0POINT (-73.96812 40.759243)POINT (-73.960287 40.773872)
-5307413836615879790617733123867148287617733122584739839CMT2012-09-22T13:19:36.000+00002012-09-22T13:29:27.000+000011.8-73.96730140.7602731-73.94736340.7756889.00.00.50.00.09.5POINT (-73.967301 40.760273)POINT (-73.947363 40.775688)
-8038135963231314765617733123878682623617733123812622335CMT2012-09-22T15:23:44.000+00002012-09-22T15:27:24.000+000011.0-73.97370840.7662341-73.98404240.7493045.00.00.50.00.05.5POINT (-73.973708 40.766234)POINT (-73.984042 40.749304)
9153652317280220397617733123869507583617733123807379455CMT2012-09-22T15:38:33.000+00002012-09-22T15:44:19.000+000011.0-73.97251740.7540821-73.97818740.744666.00.00.50.00.06.5POINT (-73.972517 40.754082)POINT (-73.978187 40.74466)
7276234971418044547617733123874750463617733123866886143CMT2012-09-22T16:36:46.000+00002012-09-22T16:40:46.000+000010.7-73.9688340.7672831-73.97005740.7630125.00.00.50.00.05.5POINT (-73.96883 40.767283)POINT (-73.970057 40.763012)
-5272651262997334143617733123875012607617733123873701887CMT2012-09-22T18:51:39.000+00002012-09-22T19:01:32.000+000021.5-73.965840.7625051-73.98477540.7568578.50.00.50.00.09.0POINT (-73.9658 40.762505)POINT (-73.984775 40.756857)
6354727495669002224617733123876847615617733122617507839CMT2012-09-22T19:34:39.000+00002012-09-22T19:52:28.000+000013.1-73.96528640.7591021-73.98143440.78625115.00.00.50.00.015.5POINT (-73.965286 40.759102)POINT (-73.981434 40.786251)
6519124484930989871617733123874750463617733122610954239CMT2012-09-22T19:30:54.000+00002012-09-22T19:35:05.000+000020.9-73.96946540.7666111-73.98254240.7726015.00.00.50.00.05.5POINT (-73.969465 40.766611)POINT (-73.982542 40.772601)
-5496030646103797643617733124358143999617733123811573759CMT2012-09-22T19:47:41.000+00002012-09-22T20:06:02.000+000034.7-73.92642540.7657561-73.98676140.74807417.50.00.50.00.018.0POINT (-73.926425 40.765756)POINT (-73.986761 40.748074)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 2129013916481179996, + 617733123866886143, + 617733122576875519, + "CMT", + "2012-09-22T11:15:03.000+0000", + "2012-09-22T11:21:08.000+0000", + 1, + 1.0, + -73.97128, + 40.764232, + 1, + -73.958907, + 40.768885, + 6.0, + 0.0, + 0.5, + 0.0, + 0.0, + 6.5, + "POINT (-73.97128 40.764232)", + "POINT (-73.958907 40.768885)" + ], + [ + -187451197385687112, + 617733123876847615, + 617733122577661951, + "CMT", + "2012-09-22T11:57:33.000+0000", + "2012-09-22T12:05:04.000+0000", + 2, + 1.1, + -73.96812, + 40.759243, + 1, + -73.960287, + 40.773872, + 7.5, + 0.0, + 0.5, + 0.0, + 0.0, + 8.0, + "POINT (-73.96812 40.759243)", + "POINT (-73.960287 40.773872)" + ], + [ + -5307413836615879790, + 617733123867148287, + 617733122584739839, + "CMT", + "2012-09-22T13:19:36.000+0000", + "2012-09-22T13:29:27.000+0000", + 1, + 1.8, + -73.967301, + 40.760273, + 1, + -73.947363, + 40.775688, + 9.0, + 0.0, + 0.5, + 0.0, + 0.0, + 9.5, + "POINT (-73.967301 40.760273)", + "POINT (-73.947363 40.775688)" + ], + [ + -8038135963231314765, + 617733123878682623, + 617733123812622335, + "CMT", + "2012-09-22T15:23:44.000+0000", + "2012-09-22T15:27:24.000+0000", + 1, + 1.0, + -73.973708, + 40.766234, + 1, + -73.984042, + 40.749304, + 5.0, + 0.0, + 0.5, + 0.0, + 0.0, + 5.5, + "POINT (-73.973708 40.766234)", + "POINT (-73.984042 40.749304)" + ], + [ + 9153652317280220397, + 617733123869507583, + 617733123807379455, + "CMT", + "2012-09-22T15:38:33.000+0000", + "2012-09-22T15:44:19.000+0000", + 1, + 1.0, + -73.972517, + 40.754082, + 1, + -73.978187, + 40.74466, + 6.0, + 0.0, + 0.5, + 0.0, + 0.0, + 6.5, + "POINT (-73.972517 40.754082)", + "POINT (-73.978187 40.74466)" + ], + [ + 7276234971418044547, + 617733123874750463, + 
617733123866886143, + "CMT", + "2012-09-22T16:36:46.000+0000", + "2012-09-22T16:40:46.000+0000", + 1, + 0.7, + -73.96883, + 40.767283, + 1, + -73.970057, + 40.763012, + 5.0, + 0.0, + 0.5, + 0.0, + 0.0, + 5.5, + "POINT (-73.96883 40.767283)", + "POINT (-73.970057 40.763012)" + ], + [ + -5272651262997334143, + 617733123875012607, + 617733123873701887, + "CMT", + "2012-09-22T18:51:39.000+0000", + "2012-09-22T19:01:32.000+0000", + 2, + 1.5, + -73.9658, + 40.762505, + 1, + -73.984775, + 40.756857, + 8.5, + 0.0, + 0.5, + 0.0, + 0.0, + 9.0, + "POINT (-73.9658 40.762505)", + "POINT (-73.984775 40.756857)" + ], + [ + 6354727495669002224, + 617733123876847615, + 617733122617507839, + "CMT", + "2012-09-22T19:34:39.000+0000", + "2012-09-22T19:52:28.000+0000", + 1, + 3.1, + -73.965286, + 40.759102, + 1, + -73.981434, + 40.786251, + 15.0, + 0.0, + 0.5, + 0.0, + 0.0, + 15.5, + "POINT (-73.965286 40.759102)", + "POINT (-73.981434 40.786251)" + ], + [ + 6519124484930989871, + 617733123874750463, + 617733122610954239, + "CMT", + "2012-09-22T19:30:54.000+0000", + "2012-09-22T19:35:05.000+0000", + 2, + 0.9, + -73.969465, + 40.766611, + 1, + -73.982542, + 40.772601, + 5.0, + 0.0, + 0.5, + 0.0, + 0.0, + 5.5, + "POINT (-73.969465 40.766611)", + "POINT (-73.982542 40.772601)" + ], + [ + -5496030646103797643, + 617733124358143999, + 617733123811573759, + "CMT", + "2012-09-22T19:47:41.000+0000", + "2012-09-22T20:06:02.000+0000", + 3, + 4.7, + -73.926425, + 40.765756, + 1, + -73.986761, + 40.748074, + 17.5, + 0.0, + 0.5, + 0.0, + 0.0, + 18.0, + "POINT (-73.926425 40.765756)", + "POINT (-73.986761 40.748074)" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "row_id", + "type": "\"long\"" 
+ }, + { + "metadata": "{}", + "name": "pickup_h3", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "dropoff_h3", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "vendor_id", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "pickup_datetime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "dropoff_datetime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "passenger_count", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "trip_distance", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_longitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_latitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "rate_code_id", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "dropoff_longitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "dropoff_latitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "fare_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "extra", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "mta_tax", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "tip_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "tolls_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "total_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_geom", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "dropoff_geom", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "tripsWithIndex = (\n", + " trips\n", + " .withColumn(\"pickup_h3\", mos.grid_pointascellid(col(\"pickup_geom\"), lit(optimal_resolution)))\n", + " .withColumn(\"dropoff_h3\", mos.grid_pointascellid(col(\"dropoff_geom\"), lit(optimal_resolution)))\n", + " # - beneficial to have index in first 32 table cols\n", + " .selectExpr(\n", + " 
\"row_id\", \"pickup_h3\", \"dropoff_h3\", \"* except(row_id, pickup_h3, dropoff_h3)\"\n", + " )\n", + ")\n", + "display(tripsWithIndex.limit(10))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e519823d-b03b-4984-9a6a-4988aff54648", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> We will also index our neighbourhoods using a built in generator function." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "32321c65-39c0-4d9a-9124-2d3de4ad61a0", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 11,885\n+-----------------+--------------------+--------------------+\n| type| properties| mosaic_index|\n+-----------------+--------------------+--------------------+\n|FeatureCollection|{EWR, 1, 1, 0.000...|{true, 6177331507...|\n|FeatureCollection|{EWR, 1, 1, 0.000...|{true, 6177331508...|\n|FeatureCollection|{EWR, 1, 1, 0.000...|{true, 6177331508...|\n|FeatureCollection|{EWR, 1, 1, 0.000...|{true, 6177331507...|\n|FeatureCollection|{EWR, 1, 1, 0.000...|{true, 6177331508...|\n+-----------------+--------------------+--------------------+\n\n" + ] + } + ], + "source": [ + "neighbourhoodsWithIndex = (\n", + " neighbourhoods\n", + " # We break down the original geometry in multiple smaller mosaic chips, each with its\n", + " # own index\n", + " .withColumn(\n", + " \"mosaic_index\", \n", + " mos.grid_tessellateexplode(col(\"geometry\"), lit(optimal_resolution))\n", + " )\n", + "\n", + " # We don't need the original geometry any more, since we have broken it down into\n", + " # Smaller mosaic chips.\n", + " .drop(\"json_geometry\", \"geometry\")\n", + ")\n", + "\n", + "print(f\"count? 
{neighbourhoodsWithIndex.count():,}\") # <- notice the explode results in more rows\n", + "neighbourhoodsWithIndex.limit(5).show() # <- limiting + show for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a8028556-2f1e-4f84-9ef0-e400815908d1", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Performing the spatial join\n", + "\n", + "> We can now do spatial joins to both pickup and drop off zones based on geolocations in our datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "03a98ce7-18ea-402f-8cb6-396ce0c4b14c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
trip_distancepickup_geompickup_zonedropoff_geompickup_h3dropoff_h3
1.05POINT (-73.95499 40.777475)Upper East Side NorthPOINT (-73.968848 40.785197)617733122574254079617733122579234815
1.5POINT (-73.95782 40.769567)Lenox Hill WestPOINT (-73.977888 40.76625)617733122576875519617733123879206911
1.2POINT (-73.957921 40.776327)Upper East Side NorthPOINT (-73.957664 40.776225)617733122574254079617733122574254079
10.1POINT (-73.951195 40.794134)East Harlem SouthPOINT (-73.986655 40.703102)617733122648440831617733151096045567
0.61POINT (-73.955828 40.768823)Lenox Hill WestPOINT (-73.956868 40.77482)617733122576351231617733122577661951
1.19POINT (-73.95152 40.810228)Central HarlemPOINT (-73.943287 40.799235)617733122635595775617733122641100799
8.17POINT (-73.870643 40.773607)LaGuardia AirportPOINT (-73.955632 40.782498)617733124388552703617733122575040511
1.3POINT (-73.961098 40.80209)Morningside HeightsPOINT (-73.9723 40.78674)617733122645819391617733122560098303
1.66POINT (-73.96123 40.811963)Morningside HeightsPOINT (-73.974852 40.790805)617733122627993599617733122559836159
11.0POINT (-73.862835 40.768945)LaGuardia AirportPOINT (-73.970477 40.757715)617733124072407039617733123866099711
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 1.05, + "POINT (-73.95499 40.777475)", + "Upper East Side North", + "POINT (-73.968848 40.785197)", + 617733122574254079, + 617733122579234815 + ], + [ + 1.5, + "POINT (-73.95782 40.769567)", + "Lenox Hill West", + "POINT (-73.977888 40.76625)", + 617733122576875519, + 617733123879206911 + ], + [ + 1.2, + "POINT (-73.957921 40.776327)", + "Upper East Side North", + "POINT (-73.957664 40.776225)", + 617733122574254079, + 617733122574254079 + ], + [ + 10.1, + "POINT (-73.951195 40.794134)", + "East Harlem South", + "POINT (-73.986655 40.703102)", + 617733122648440831, + 617733151096045567 + ], + [ + 0.61, + "POINT (-73.955828 40.768823)", + "Lenox Hill West", + "POINT (-73.956868 40.77482)", + 617733122576351231, + 617733122577661951 + ], + [ + 1.19, + "POINT (-73.95152 40.810228)", + "Central Harlem", + "POINT (-73.943287 40.799235)", + 617733122635595775, + 617733122641100799 + ], + [ + 8.17, + "POINT (-73.870643 40.773607)", + "LaGuardia Airport", + "POINT (-73.955632 40.782498)", + 617733124388552703, + 617733122575040511 + ], + [ + 1.3, + "POINT (-73.961098 40.80209)", + "Morningside Heights", + "POINT (-73.9723 40.78674)", + 617733122645819391, + 617733122560098303 + ], + [ + 1.66, + "POINT (-73.96123 40.811963)", + "Morningside Heights", + "POINT (-73.974852 40.790805)", + 617733122627993599, + 617733122559836159 + ], + [ + 11.0, + "POINT (-73.862835 40.768945)", + "LaGuardia Airport", + "POINT (-73.970477 40.757715)", + 617733124072407039, + 617733123866099711 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": 
null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "trip_distance", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_geom", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "pickup_zone", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "dropoff_geom", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "pickup_h3", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "dropoff_h3", + "type": "\"long\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "pickupNeighbourhoods = neighbourhoodsWithIndex.select(col(\"properties.zone\").alias(\"pickup_zone\"), col(\"mosaic_index\"))\n", + "\n", + "withPickupZone = (\n", + " tripsWithIndex.join(\n", + " pickupNeighbourhoods,\n", + " tripsWithIndex[\"pickup_h3\"] == pickupNeighbourhoods[\"mosaic_index.index_id\"]\n", + " ).where(\n", + " # If the borough is a core chip (the chip is fully contained within the geometry), then we do not need\n", + " # to perform any intersection, because any point matching the same index will certainly be contained in\n", + " # the borough. Otherwise we need to perform an st_contains operation on the chip geometry.\n", + " col(\"mosaic_index.is_core\") | mos.st_contains(col(\"mosaic_index.wkb\"), col(\"pickup_geom\"))\n", + " ).select(\n", + " \"trip_distance\", \"pickup_geom\", \"pickup_zone\", \"dropoff_geom\", \"pickup_h3\", \"dropoff_h3\"\n", + " )\n", + ")\n", + "\n", + "display(withPickupZone.limit(10)) # <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "db947fdf-b039-4675-838f-0d16fdd4516f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> We can easily perform a similar join for the drop off location. 
__Note: in this case using `withPickupZone` from above as the left side of the join.__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0b2e6289-93a4-42d5-b47d-70ea00325bb5", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
trip_distancepickup_geompickup_zonedropoff_geomdropoff_zonepickup_h3dropoff_h3trip_line
1.05POINT (-73.95499 40.777475)Upper East Side NorthPOINT (-73.968848 40.785197)Central Park617733122574254079617733122579234815LINESTRING (-73.95499 40.777475, -73.968848 40.785197)
1.5POINT (-73.95782 40.769567)Lenox Hill WestPOINT (-73.977888 40.76625)Midtown North617733122576875519617733123879206911LINESTRING (-73.95782 40.769567, -73.977888 40.76625)
1.2POINT (-73.957921 40.776327)Upper East Side NorthPOINT (-73.957664 40.776225)Upper East Side North617733122574254079617733122574254079LINESTRING (-73.957921 40.776327, -73.957664 40.776225)
10.1POINT (-73.951195 40.794134)East Harlem SouthPOINT (-73.986655 40.703102)DUMBO/Vinegar Hill617733122648440831617733151096045567LINESTRING (-73.951195 40.794134, -73.986655 40.703102)
0.61POINT (-73.955828 40.768823)Lenox Hill WestPOINT (-73.956868 40.77482)Yorkville West617733122576351231617733122577661951LINESTRING (-73.955828 40.768823, -73.956868 40.77482)
1.19POINT (-73.95152 40.810228)Central HarlemPOINT (-73.943287 40.799235)East Harlem North617733122635595775617733122641100799LINESTRING (-73.95152 40.810228, -73.943287 40.799235)
8.17POINT (-73.870643 40.773607)LaGuardia AirportPOINT (-73.955632 40.782498)Upper East Side North617733124388552703617733122575040511LINESTRING (-73.870643 40.773607, -73.955632 40.782498)
1.3POINT (-73.961098 40.80209)Morningside HeightsPOINT (-73.9723 40.78674)Upper West Side North617733122645819391617733122560098303LINESTRING (-73.961098 40.80209, -73.9723 40.78674)
1.66POINT (-73.96123 40.811963)Morningside HeightsPOINT (-73.974852 40.790805)Upper West Side North617733122627993599617733122559836159LINESTRING (-73.96123 40.811963, -73.974852 40.790805)
11.0POINT (-73.862835 40.768945)LaGuardia AirportPOINT (-73.970477 40.757715)Midtown East617733124072407039617733123866099711LINESTRING (-73.862835 40.768945, -73.970477 40.757715)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 1.05, + "POINT (-73.95499 40.777475)", + "Upper East Side North", + "POINT (-73.968848 40.785197)", + "Central Park", + 617733122574254079, + 617733122579234815, + "LINESTRING (-73.95499 40.777475, -73.968848 40.785197)" + ], + [ + 1.5, + "POINT (-73.95782 40.769567)", + "Lenox Hill West", + "POINT (-73.977888 40.76625)", + "Midtown North", + 617733122576875519, + 617733123879206911, + "LINESTRING (-73.95782 40.769567, -73.977888 40.76625)" + ], + [ + 1.2, + "POINT (-73.957921 40.776327)", + "Upper East Side North", + "POINT (-73.957664 40.776225)", + "Upper East Side North", + 617733122574254079, + 617733122574254079, + "LINESTRING (-73.957921 40.776327, -73.957664 40.776225)" + ], + [ + 10.1, + "POINT (-73.951195 40.794134)", + "East Harlem South", + "POINT (-73.986655 40.703102)", + "DUMBO/Vinegar Hill", + 617733122648440831, + 617733151096045567, + "LINESTRING (-73.951195 40.794134, -73.986655 40.703102)" + ], + [ + 0.61, + "POINT (-73.955828 40.768823)", + "Lenox Hill West", + "POINT (-73.956868 40.77482)", + "Yorkville West", + 617733122576351231, + 617733122577661951, + "LINESTRING (-73.955828 40.768823, -73.956868 40.77482)" + ], + [ + 1.19, + "POINT (-73.95152 40.810228)", + "Central Harlem", + "POINT (-73.943287 40.799235)", + "East Harlem North", + 617733122635595775, + 617733122641100799, + "LINESTRING (-73.95152 40.810228, -73.943287 40.799235)" + ], + [ + 8.17, + "POINT (-73.870643 40.773607)", + "LaGuardia Airport", + "POINT (-73.955632 40.782498)", + "Upper East Side North", + 617733124388552703, + 617733122575040511, + "LINESTRING (-73.870643 40.773607, -73.955632 40.782498)" + ], + [ + 1.3, + "POINT (-73.961098 40.80209)", + "Morningside Heights", + "POINT 
(-73.9723 40.78674)", + "Upper West Side North", + 617733122645819391, + 617733122560098303, + "LINESTRING (-73.961098 40.80209, -73.9723 40.78674)" + ], + [ + 1.66, + "POINT (-73.96123 40.811963)", + "Morningside Heights", + "POINT (-73.974852 40.790805)", + "Upper West Side North", + 617733122627993599, + 617733122559836159, + "LINESTRING (-73.96123 40.811963, -73.974852 40.790805)" + ], + [ + 11.0, + "POINT (-73.862835 40.768945)", + "LaGuardia Airport", + "POINT (-73.970477 40.757715)", + "Midtown East", + 617733124072407039, + 617733123866099711, + "LINESTRING (-73.862835 40.768945, -73.970477 40.757715)" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "trip_distance", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_geom", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "pickup_zone", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "dropoff_geom", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "dropoff_zone", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "pickup_h3", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "dropoff_h3", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "trip_line", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "dropoffNeighbourhoods = neighbourhoodsWithIndex.select(col(\"properties.zone\").alias(\"dropoff_zone\"), col(\"mosaic_index\"))\n", + "\n", + "withDropoffZone = (\n", + " withPickupZone.join(\n", + " dropoffNeighbourhoods,\n", + " withPickupZone[\"dropoff_h3\"] == dropoffNeighbourhoods[\"mosaic_index.index_id\"]\n", + " ).where(\n", + " 
col(\"mosaic_index.is_core\") | mos.st_contains(col(\"mosaic_index.wkb\"), col(\"dropoff_geom\"))\n", + " ).select(\n", + " \"trip_distance\", \"pickup_geom\", \"pickup_zone\", \"dropoff_geom\", \"dropoff_zone\", \"pickup_h3\", \"dropoff_h3\"\n", + " )\n", + " .withColumn(\"trip_line\", mos.st_astext(mos.st_makeline(array(mos.st_geomfromwkt(col(\"pickup_geom\")), mos.st_geomfromwkt(col(\"dropoff_geom\"))))))\n", + ")\n", + "\n", + "display(withDropoffZone.limit(10)) # <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "30d4507b-7189-455e-9a4e-681d2f4714ac", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Visualise the results in Kepler\n", + "\n", + "> Mosaic abstracts interaction with Kepler in python through the use of the `%%mosaic_kepler` magic. When python is not the notebook language, you can prepend `%python` before the magic to make the switch." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "17c9cbeb-f411-4b2d-94d7-6737aaf1e1c4", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Here is the initial rendering with trip lines._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "81d92d32-c979-4dd8-9e7b-1a19d2507f13", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d331fd71-d383-485e-bb6f-6bcd2302dae7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Here is with trip lines off and some other adjustments._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4c1c94de-97bb-41e3-abd2-955b6ea3effd", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], 
+ "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e1311242-147c-461e-9689-e10e02bd66e8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results. Hint: you can toggle layers on/off and adjust properties._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0a093833-f267-4194-b06e-5575001727d2", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# withDropoffZone \"pickup_h3\" \"h3\" 5000" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c1466346-8537-4e15-9afc-54056129af5a", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Databricks Lakehouse can read / write most any data format\n", + "\n", + "> Here are [built-in](https://docs.databricks.com/en/external-data/index.html) formats as well as Mosaic [readers](https://databrickslabs.github.io/mosaic/api/api.html). __Note: best performance with Delta Lake format__, ref [Databricks](https://docs.databricks.com/en/delta/index.html) and [OSS](https://docs.delta.io/latest/index.html) docs for Delta Lake. Beyond built-in formats, Databricks is a platform on which you can install a wide variety of libraries, e.g. 
[1](https://docs.databricks.com/en/libraries/index.html#python-environment-management) | [2](https://docs.databricks.com/en/compute/compatibility.html) | [3](https://docs.databricks.com/en/init-scripts/index.html).\n", + "\n", + "Example of [reading](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameReader.html?highlight=read#pyspark.sql.DataFrameReader) and [writing](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.html?highlight=pyspark%20sql%20dataframe%20writer#pyspark.sql.DataFrameWriter) a Spark DataFrame with Delta Lake format.\n", + "\n", + "```\n", + "# - `write.format(\"delta\")` is default in Databricks\n", + "# - can save to a specified path in the Lakehouse\n", + "# - can save as a table in the Databricks Metastore\n", + "df.write.save(\"<path>\")\n", + "df.write.saveAsTable(\"<table_name>\")\n", + "```\n", + "\n", + "Example of loading a Delta Lake Table as a Spark DataFrame.\n", + "\n", + "```\n", + "# - `read.format(\"delta\")` is default in Databricks\n", + "# - can load a specified path in the Lakehouse\n", + "# - can load a table in the Databricks Metastore\n", + "spark.read.load(\"<path>\")\n", + "spark.table(\"<table_name>\")\n", + "```\n", + "\n", + "More on [Unity Catalog](https://docs.databricks.com/en/data-governance/unity-catalog/index.html) in Databricks Lakehouse for Governing [Tables](https://docs.databricks.com/en/data-governance/unity-catalog/index.html#tables) and [Volumes](https://docs.databricks.com/en/data-governance/unity-catalog/index.html#volumes)."
+ ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "QuickstartNotebook", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/examples/python/QuickstartNotebook.py b/notebooks/examples/python/QuickstartNotebook.py deleted file mode 100644 index 1205a2ed6..000000000 --- a/notebooks/examples/python/QuickstartNotebook.py +++ /dev/null @@ -1,270 +0,0 @@ -# Databricks notebook source -# MAGIC %md -# MAGIC ## Setup NYC taxi zones -# MAGIC In order to setup the data please run the notebook available at "../../data/DownloadNYCTaxiZones".
-# MAGIC DownloadNYCTaxiZones notebook will make sure we have New York City Taxi zone shapes available in our environment. - -# COMMAND ---------- - -# MAGIC %pip install databricks-mosaic - -# COMMAND ---------- - -user_name = dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get() - -raw_path = f"dbfs:/tmp/mosaic/{user_name}" -raw_taxi_zones_path = f"{raw_path}/taxi_zones" - -print(f"The raw data is stored in {raw_path}") - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Enable Mosaic in the notebook -# MAGIC To get started, you'll need to attach the wheel to your cluster and import instances as in the cell below. - -# COMMAND ---------- - -from pyspark.sql.functions import * -import mosaic as mos -mos.enable_mosaic(spark, dbutils) - -# COMMAND ---------- - -# MAGIC %md ## Read polygons from GeoJson - -# COMMAND ---------- - -# MAGIC %md -# MAGIC With the functionality Mosaic brings we can easily load GeoJSON files using spark.
-# MAGIC In the past this required GeoPandas in python and conversion to spark dataframe.
- -# COMMAND ---------- - -neighbourhoods = ( - spark.read - .option("multiline", "true") - .format("json") - .load(raw_taxi_zones_path) - .select("type", explode(col("features")).alias("feature")) - .select("type", col("feature.properties").alias("properties"), to_json(col("feature.geometry")).alias("json_geometry")) - .withColumn("geometry", mos.st_aswkt(mos.st_geomfromgeojson("json_geometry"))) -) - -display( - neighbourhoods -) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Compute some basic geometry attributes - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Mosaic provides a number of functions for extracting the properties of geometries. Here are some that are relevant to Polygon geometries: - -# COMMAND ---------- - -display( - neighbourhoods - .withColumn("calculatedArea", mos.st_area(col("geometry"))) - .withColumn("calculatedLength", mos.st_length(col("geometry"))) - # Note: The unit of measure of the area and length depends on the CRS used. - # For GPS locations it will be square radians and radians - .select("geometry", "calculatedArea", "calculatedLength") -) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Read points data - -# COMMAND ---------- - -# MAGIC %md -# MAGIC We will load some Taxi trips data to represent point data.
-# MAGIC We already loaded some shapes representing polygons that correspond to NYC neighbourhoods.
- -# COMMAND ---------- - -tripsTable = spark.table("delta.`/databricks-datasets/nyctaxi/tables/nyctaxi_yellow`") - -# COMMAND ---------- - -trips = ( - tripsTable - .drop("vendorId", "rateCodeId", "store_and_fwd_flag", "payment_type") - .withColumn("pickup_geom", mos.st_astext(mos.st_point(col("pickup_longitude"), col("pickup_latitude")))) - .withColumn("dropoff_geom", mos.st_astext(mos.st_point(col("dropoff_longitude"), col("dropoff_latitude")))) -) - -# COMMAND ---------- - -display(trips.select("pickup_geom", "dropoff_geom")) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Spatial Joins - -# COMMAND ---------- - -# MAGIC %md -# MAGIC We can use Mosaic to perform spatial joins both with and without Mosaic indexing strategies.
-# MAGIC Indexing is very important when handling very different geometries both in size and in shape (ie. number of vertices).
- -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### Getting the optimal resolution - -# COMMAND ---------- - -# MAGIC %md -# MAGIC We can use Mosaic functionality to identify how to best index our data based on the data inside the specific dataframe.
-# MAGIC Selecting an appropriate indexing resolution can have a considerable impact on the performance.
- -# COMMAND ---------- - -from mosaic import MosaicFrame - -neighbourhoods_mosaic_frame = MosaicFrame(neighbourhoods, "geometry") -optimal_resolution = neighbourhoods_mosaic_frame.get_optimal_resolution(sample_fraction=0.75) - -print(f"Optimal resolution is {optimal_resolution}") - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Not every resolution will yield performance improvements.
-# MAGIC By a rule of thumb it is always better to under-index than over-index - if not sure select a lower resolution.
-# MAGIC Higher resolutions are needed when we have very imbalanced geometries with respect to their size or with respect to the number of vertices.
-# MAGIC In such case indexing with more indices will considerably increase the parallel nature of the operations.
-# MAGIC You can think of Mosaic as a way to partition an overly complex row into multiple rows that have a balanced amount of computation each. - -# COMMAND ---------- - -display( - neighbourhoods_mosaic_frame.get_resolution_metrics(sample_rows=150) -) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### Indexing using the optimal resolution - -# COMMAND ---------- - -# MAGIC %md -# MAGIC We will use mosaic sql functions to index our points data.
-# MAGIC Here we will use resolution 9, index resolution depends on the dataset in use. - -# COMMAND ---------- - -tripsWithIndex = (trips - .withColumn("pickup_h3", mos.grid_pointascellid(col("pickup_geom"), lit(optimal_resolution))) - .withColumn("dropoff_h3", mos.grid_pointascellid(col("dropoff_geom"), lit(optimal_resolution))) -) - -# COMMAND ---------- - -display(tripsWithIndex) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC We will also index our neighbourhoods using a built in generator function. - -# COMMAND ---------- - -neighbourhoodsWithIndex = (neighbourhoods - - # We break down the original geometry in multiple smaller mosaic chips, each with its - # own index - .withColumn("mosaic_index", mos.grid_tessellateexplode(col("geometry"), lit(optimal_resolution))) - - # We don't need the original geometry any more, since we have broken it down into - # Smaller mosaic chips. - .drop("json_geometry", "geometry") - ) - -# COMMAND ---------- - -display(neighbourhoodsWithIndex) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### Performing the spatial join - -# COMMAND ---------- - -# MAGIC %md -# MAGIC We can now do spatial joins to both pickup and drop off zones based on geolocations in our datasets. - -# COMMAND ---------- - -pickupNeighbourhoods = neighbourhoodsWithIndex.select(col("properties.zone").alias("pickup_zone"), col("mosaic_index")) - -withPickupZone = ( - tripsWithIndex.join( - pickupNeighbourhoods, - tripsWithIndex["pickup_h3"] == pickupNeighbourhoods["mosaic_index.index_id"] - ).where( - # If the borough is a core chip (the chip is fully contained within the geometry), then we do not need - # to perform any intersection, because any point matching the same index will certainly be contained in - # the borough. Otherwise we need to perform an st_contains operation on the chip geometry. 
- col("mosaic_index.is_core") | mos.st_contains(col("mosaic_index.wkb"), col("pickup_geom")) - ).select( - "trip_distance", "pickup_geom", "pickup_zone", "dropoff_geom", "pickup_h3", "dropoff_h3" - ) -) - -display(withPickupZone) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC We can easily perform a similar join for the drop off location. - -# COMMAND ---------- - -dropoffNeighbourhoods = neighbourhoodsWithIndex.select(col("properties.zone").alias("dropoff_zone"), col("mosaic_index")) - -withDropoffZone = ( - withPickupZone.join( - dropoffNeighbourhoods, - withPickupZone["dropoff_h3"] == dropoffNeighbourhoods["mosaic_index.index_id"] - ).where( - col("mosaic_index.is_core") | mos.st_contains(col("mosaic_index.wkb"), col("dropoff_geom")) - ).select( - "trip_distance", "pickup_geom", "pickup_zone", "dropoff_geom", "dropoff_zone", "pickup_h3", "dropoff_h3" - ) - .withColumn("trip_line", mos.st_astext(mos.st_makeline(array(mos.st_geomfromwkt(col("pickup_geom")), mos.st_geomfromwkt(col("dropoff_geom")))))) -) - -display(withDropoffZone) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Visualise the results in Kepler - -# COMMAND ---------- - -# MAGIC %md -# MAGIC For visualisation there simply aren't good options in scala.
-# MAGIC Luckily in our notebooks you can easily switch to python just for UI.
-# MAGIC Mosaic abstracts interaction with Kepler in python. - -# COMMAND ---------- - -# MAGIC %python -# MAGIC %%mosaic_kepler -# MAGIC withDropoffZone "pickup_h3" "h3" 5000 - -# COMMAND ---------- - - diff --git a/notebooks/examples/python/Sedona/MosaicAndSedona.ipynb b/notebooks/examples/python/Sedona/MosaicAndSedona.ipynb new file mode 100644 index 000000000..9407100a6 --- /dev/null +++ b/notebooks/examples/python/Sedona/MosaicAndSedona.ipynb @@ -0,0 +1,808 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "bf98136c-9276-4388-8eef-b567621fe1a4", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Mosaic & Sedona\n", + "\n", + "> You can combine the usage of [Mosaic](https://databrickslabs.github.io/mosaic/index.html) with other geospatial libraries. In this example we combine it with [Sedona](https://sedona.apache.org).\n", + "\n", + "## Setup\n", + "\n", + "This notebook will run if you have both Mosaic and Sedona installed on your cluster.\n", + "\n", + "### Install sedona\n", + "\n", + "To install Sedona, follow the [official Sedona instructions](https://sedona.apache.org/1.5.0/setup/databricks/).\n", + "\n", + "E.g. Add the following maven coordinates to a non-photon cluster [[1](https://docs.databricks.com/en/libraries/package-repositories.html)]. This is showing DBR 12.2 LTS. \n", + "\n", + "```\n", + "org.apache.sedona:sedona-spark-shaded-3.0_2.12:1.5.0\n", + "org.datasyslab:geotools-wrapper:1.5.0-28.2\n", + "```\n", + "\n", + "### Notes\n", + "\n", + "* See instructions for `SedonaContext.create(spark)` [[1](https://sedona.apache.org/1.5.0/tutorial/sql/?h=sedonacontext#initiate-sedonacontext)]. 
\n", + "* Also, notice we are downgrading pandas from default DBR version for Sedona Python bindings\n", + "* And, Sedona identifies that it might have issues if executed on a [Photon](https://www.databricks.com/product/photon) cluster\n", + "\n", + "--- \n", + " __Last Update__ 30 NOV 2023 [Mosaic 0.3.12]" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a0b2b41f-3769-4cc3-b88f-79b60b28654a", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Python interpreter will be restarted.\nPython interpreter will be restarted.\nPython interpreter will be restarted.\nPython interpreter will be restarted.\nPython interpreter will be restarted.\nPython interpreter will be restarted.\n" + ] + } + ], + "source": [ + "%pip install \"pandas<=1.3.5\" \"shapely<= 1.8.4\" \"geopandas<=0.10.2\" --quiet # <- Sedona 1.5 dep versions\n", + "%pip install keplergl==0.3.2 pydeck==0.8.0 --quiet # <- Sedona 1.5 dep versions\n", + "%pip install \"apache-sedona<1.6,>=1.5\" --quiet # <- Sedona 1.5 series\n", + "%pip install \"databricks-mosaic<0.4,>=0.3\" --quiet # <- Mosaic 0.3 series [install last]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "5bdb06ce-0c5b-4e4a-89b9-58e593e964c4", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Verify our Sedona dependency versions_" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f6b0188a-5535-4588-afd7-cdc3c51f76d7", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": 
"display_data", + "data": { + "application/vnd.databricks.v1+bamboolib_hint": "{\"pd.DataFrames\": [], \"version\": \"0.0.1\"}", + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "application/vnd.databricks.v1+h3_hint": "", + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "pandas version? 1.3.5\ngeopandas version? 0.10.2\nshapely version? 1.8.4\nkepler version? 0.3.2\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import shapely\n", + "import geopandas as gpd\n", + "import keplergl\n", + "\n", + "print(f\"pandas version? {pd.__version__}\")\n", + "print(f\"geopandas version? {gpd.__version__}\")\n", + "print(f\"shapely version? {shapely.__version__}\")\n", + "print(f\"kepler version? {keplergl.__version__}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "46dcda8a-cd24-4016-acf9-6ede54978d2f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Main imports_" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "63686169-877d-4d31-8e6d-dbf6595753e8", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import pyspark.sql.functions as F\n", + "\n", + "# -- setup mosaic\n", + "import mosaic as mos\n", + "\n", + "mos.enable_mosaic(spark, dbutils)\n", + "\n", + "# -- setup sedona\n", + "from sedona.spark import *\n", + "\n", + "sedona = SedonaContext.create(spark)\n", + "\n", + "# --other imports\n", + "import warnings\n", + "\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + 
"application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6dd1e21d-7a84-4c5e-b5f6-b02831d846b0", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Setup simple DataFrame_\n", + "\n", + "> Showing blending Mosaic calls (namespaced as `mos.`) with Sedona (sql) calls." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d00cc61f-65f8-4f10-ace9-54eb895c6d7c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+-----------+-----------+--------------------+\n| wkt|mosaic_area|sedona_area| sedona_flipped|\n+--------------------+-----------+-----------+--------------------+\n|POLYGON ((30 10, ...| 550.0| 550.0|POLYGON ((10 30, ...|\n+--------------------+-----------+-----------+--------------------+\n\n" + ] + } + ], + "source": [ + "df = spark.createDataFrame([{'wkt': 'POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))'}])\n", + "(df\n", + " # Mosaic\n", + " .withColumn(\"mosaic_area\", mos.st_area('wkt'))\n", + " # Sedona\n", + " .withColumn(\"sedona_area\", F.expr(\"ST_Area(ST_GeomFromWKT(wkt))\"))\n", + " # Sedona function not available in Mosaic\n", + " .withColumn(\"sedona_flipped\", F.expr(\"ST_FlipCoordinates(ST_GeomFromWKT(wkt))\"))\n", + ").show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "213a41b5-664c-4a06-b3ee-b7d32d9dc2bb", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Mosaic + Kepler\n", + "\n", + "> Mosaic has the ability to render tables / views + dataframes with `%%mosaic_kepler` magic [[1](https://databrickslabs.github.io/mosaic/usage/kepler.html)]." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0d0af2ac-bd49-45b7-8abc-e8de78e77727", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f75ca032-47cc-4807-b282-883703bf4b52", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results. Hint: you can toggle layers on/off and adjust properties._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "61d8d0c3-6b94-4131-a94c-587e7a6aa54d", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# df \"wkt\" \"geometry\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d6e266b4-82ff-40bb-bf9e-793ebf0d73c4", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Sedona\n", + "\n", + "> Converting to a Sedona DataFrame. 
__Note: there are a few ways to do this.__" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "5463d1e5-e12d-413e-a664-3137d01c844f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_[1] Spark DataFrame `df` to Pandas DataFrame `pdf`._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0ebb4450-0a76-4b79-9df6-0be79e3afd97", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
wkt
0POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))
\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n
wkt
0POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))
\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "pdf = df.toPandas()\n", + "pdf" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "26c76d91-1ac8-446c-b394-67c3c0ffe4d9", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_[2] Pandas DataFrame `pdf` to GeoPandas DataFrame `gdf`._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "952779e8-1dde-4059-9c04-bf6b1cab611e", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
geometry
0POLYGON ((30.00000 10.00000, 40.00000 40.00000...
\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n
geometry
0POLYGON ((30.00000 10.00000, 40.00000 40.00000...
\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "gdf = gpd.GeoDataFrame(\n", + " pdf, geometry=gpd.geoseries.from_wkt(pdf['wkt'], crs=\"EPSG:4326\")\n", + ")\n", + "gdf.drop('wkt', axis=1, inplace=True)\n", + "gdf" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3330a57c-907e-4af8-95c0-71d77a6621c2", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_[3] GeoPandas DataFrame `gdf` to Sedona DataFrame `sdf`._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "78a6965c-ec03-4b08-a723-e1efaf4568c2", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "+--------------------+\n| geometry|\n+--------------------+\n|POLYGON ((30 10, ...|\n+--------------------+\n\n" + ] + } + ], + "source": [ + "sdf = sedona.createDataFrame(gdf)\n", + "sdf.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e48b657c-67bc-4bc7-9ec5-470fb6a85c9c", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### SedonaKepler\n", + "\n", + "> Sedona also has some ability to render Kepler [[1](https://sedona.apache.org/1.5.0/api/sql/Visualization_SedonaKepler/)]. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "41cc2bc5-89af-4989-9efd-1e8c7820a107", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d9e1ce9e-0523-41d2-b58b-b0015593b972", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results. Hint: you can toggle layers on/off and adjust properties._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6ca39d58-65b9-47f7-8a18-7cca12e8f079", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# SedonaKepler.create_map(df=sdf, name=\"MySedona\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "09b88855-1895-4805-b31b-4a1f0e7a2c37", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### SedonaPyDeck\n", + "\n", + "> Sedona also has a pydeck renderer [[1](https://sedona.apache.org/1.5.0/api/sql/Visualization_SedonaPyDeck/)]." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e45072f9-d367-461a-bd18-716e500e54fe", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1b0dc412-0690-492b-9ac7-111ba488e861", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results. 
Hint: you __cannot__ toggle layers on/off and adjust properties with `SedonaPyDeck`._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "be450a6b-b780-4610-81a4-284a2fcd86f9", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# SedonaPyDeck.create_geometry_map(sdf)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "17c9df98-ed13-47e8-8865-8038aaf167e4", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Sedona-provided notebooks are available [here](https://github.com/apache/sedona/tree/master/binder); you can explore and potentially execute them on Databricks by following the setup instructions identified in this notebook._" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "MosaicAndSedona", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/examples/python/Sedona/README.md b/notebooks/examples/python/Sedona/README.md new file mode 100644 index 000000000..0d42fed45 --- /dev/null +++ b/notebooks/examples/python/Sedona/README.md @@ -0,0 +1,2 @@ +# Sedona Examples +> Note: `ipynb` files can be previewed in GitHub and can also be imported into Databricks, more [here](https://docs.databricks.com/en/notebooks/notebook-export-import.html). 
diff --git a/notebooks/examples/python/Shapefiles/GeoPandasUDF/shapefiles_geopandas_udf.ipynb b/notebooks/examples/python/Shapefiles/GeoPandasUDF/shapefiles_geopandas_udf.ipynb new file mode 100644 index 000000000..3d8373196 --- /dev/null +++ b/notebooks/examples/python/Shapefiles/GeoPandasUDF/shapefiles_geopandas_udf.ipynb @@ -0,0 +1,2577 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ae9da0c3-a134-462a-a41f-afbb8dd35a98", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# GeoPandas Shapefiles UDF Example\n", + "\n", + "> These are Census address blocks; download Shapefile(s) from https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/ADDR/\n", + "\n", + "__Artifacts Generated__\n", + "

\n", + "\n", + "1. Volume - `..census_data/address_block_shapefiles`\n", + "1. Table - `..shape_address_block`\n", + "\n", + "--- \n", + "__Last Update:__ 22 NOV 2023 [Mosaic 0.3.12]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4bca5d5c-92da-45e1-b8fe-cb37e7a807c2", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup\n", + "

\n", + "\n", + "1. [GeoPandas](https://pypi.org/project/geopandas/) - used for Shapefile reading and rendering \n", + "1. [Contextily](https://pypi.org/project/contextily/) - used to add basemap to GeoPandas, supports WGS84 (4326) and Spherical Mercator (3857)\n", + "1. Import Databricks columnar functions (including H3) for DBR / DBSQL Photon with `from pyspark.databricks.sql.functions import *`\n", + "\n", + "__Note: If you hit `H3_NOT_ENABLED` [[docs](https://docs.databricks.com/error-messages/h3-not-enabled-error-class.html#h3_not_enabled-error-class)]__\n", + "\n", + "> `h3Expression` is disabled or unsupported. Consider enabling Photon or switch to a tier that supports H3 expressions. [[AWS](https://www.databricks.com/product/aws-pricing) | [Azure](https://azure.microsoft.com/en-us/pricing/details/databricks/) | [GCP](https://www.databricks.com/product/gcp-pricing)]\n", + "\n", + "__Note:__ _Recommended to run on DBR 14.1+ for better [Volumes](https://docs.databricks.com/en/sql/language-manual/sql-ref-volumes.html) support._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "57cf168a-2337-413d-8bda-ca1549da216c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Python interpreter will be restarted.\nPython interpreter will be restarted.\n" + ] + } + ], + "source": [ + "%pip install geopandas contextily --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5dbcc88a-f311-408d-824d-ab7553909f3a", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from pyspark.databricks.sql import functions as dbf\n", 
+ "from pyspark.sql import functions as F\n", + "from pyspark.sql.functions import udf, col\n", + "from pyspark.sql.types import *\n", + "\n", + "import contextily as cx\n", + "import fiona\n", + "import geopandas as gpd\n", + "import os\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9dc45265-6f12-42cb-8244-f17f1ad4bde9", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "spark.conf.set(\"spark.sql.shuffle.partitions\", 10_000) # <-- default is 200\n", + "\n", + "# https://spark.apache.org/docs/latest/sql-performance-tuning.html#adaptive-query-execution\n", + "# spark.conf.set(\"spark.databricks.optimizer.adaptive.enabled\", True) # <-- default is true [nuclear option]\n", + "spark.conf.set(\"spark.sql.adaptive.coalescePartitions.enabled\", False) # <-- default is true [softer option]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "0140ec49-a07e-45af-a768-65296c68a465", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Configure Database + Username__\n", + "\n", + "> Note: Adjust this to your own specified [Unity Catalog](https://docs.databricks.com/en/data-governance/unity-catalog/manage-privileges/admin-privileges.html#managing-unity-catalog-metastores) Schema." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "416e37e6-b250-4d0a-afa4-ec2a55c1df62", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[3]: DataFrame[]" + ] + } + ], + "source": [ + "catalog_name = \"mjohns\"\n", + "db_name = \"census\"\n", + "\n", + "sql(f\"use catalog {catalog_name}\")\n", + "sql(f\"use schema {db_name}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fb979e77-e604-43f1-b2d9-9ceca2e804d2", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %sql show tables" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "077c6054-1634-4b80-86e5-2c77d7eeb145", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Setup `ETL_DIR` + `ETL_DIR_FUSE`__\n", + "\n", + "> Note: Adjust this to your own specified [Volume](https://docs.databricks.com/en/ingestion/add-data/upload-to-volume.html#upload-files-to-a-unity-catalog-volume) (under a schema). 
_You must already have set up the Volume path._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0bbff310-7d73-4087-9fc8-fb5887d167e3", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "...ETL_DIR: '/Volumes/mjohns/census/census_data/address_block_shapefiles' (create)\n" + ] + } + ], + "source": [ + "ETL_DIR = f'/Volumes/{catalog_name}/{db_name}/census_data/address_block_shapefiles'\n", + "os.environ['ETL_DIR'] = ETL_DIR\n", + "\n", + "dbutils.fs.mkdirs(ETL_DIR)\n", + "print(f\"...ETL_DIR: '{ETL_DIR}' (create)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d7c307ec-aa2c-4003-ac31-142add84923d", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[0m\u001B[34;42maddress_block_shapefiles\u001B[0m/\r\n" + ] + } + ], + "source": [ + "ls $ETL_DIR/.." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3e10da79-5774-486a-b88c-f420056b6aa7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Get All GA Addresses (Shapefiles)\n", + "

\n", + "\n", + "* Look for pattern https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/ADDRFEAT/tl_rd22_13*.zip (13 is GA number)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ce9f0f90-71d4-4ec0-9628-0466aee9f287", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "state_num = \"13\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2eda22a5-11b7-4f01-ad9e-9f8ee73493c7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Make `address_features` directory.__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "694a2f2f-bcb0-4a47-bcfb-dcdd483e85f1", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[9]: True" + ] + } + ], + "source": [ + "dbutils.fs.mkdirs(f\"{ETL_DIR}/address_features\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "88055f32-6bfa-4c91-9861-4a8ba63cbe5b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Get List of Shapefile ZIPs" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3468e630-c073-44c2-aed7-a0b3c09ebcec", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + 
"/databricks/driver\nFile ‘address_features.txt’ already there; not retrieving.\n" + ] + } + ], + "source": [ + "%sh \n", + "echo \"$PWD\"\n", + "wget -O address_features.txt -nc \"https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/ADDRFEAT/\"" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a778d810-30a9-4cf5-9c45-1a00b42c3210", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "

pathnamesizemodificationTime
dbfs:/Volumes/mjohns/census/census_data/address_block_shapefiles/address_features/address_features/01700668858233
dbfs:/Volumes/mjohns/census/census_data/address_block_shapefiles/address_features.txtaddress_features.txt7741321700668858000
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "dbfs:/Volumes/mjohns/census/census_data/address_block_shapefiles/address_features/", + "address_features/", + 0, + 1700668858233 + ], + [ + "dbfs:/Volumes/mjohns/census/census_data/address_block_shapefiles/address_features.txt", + "address_features.txt", + 774132, + 1700668858000 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "path", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "name", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "size", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "modificationTime", + "type": "\"long\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "dbutils.fs.cp(\"file:/databricks/driver/address_features.txt\", ETL_DIR)\n", + "display(dbutils.fs.ls(ETL_DIR))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f6fffcb8-a376-4419-872e-ad05785c50f9", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Figure out which rows are within the `` tag and extract the filenames.__\n", + "\n", + "> Since this is all in one file being read on one node, get consistent ordered id for `row_num` (not always true)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a030a498-2613-4ff7-b1dd-2922a965a3ea", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "tbl_start_row: 237, tbl_end_row: 3463\n" + ] + } + ], + "source": [ + "tbl_start_row = (\n", + " spark.read.text(f\"{ETL_DIR}/address_features.txt\")\n", + " .withColumn(\"row_num\", F.monotonically_increasing_id())\n", + " .withColumn(\"tbl_start_row\", F.trim(\"value\") == '
')\n", + " .filter(\"tbl_start_row = True\")\n", + " .select(\"row_num\")\n", + ").collect()[0][0]\n", + "\n", + "tbl_end_row = (\n", + " spark.read.text(f\"{ETL_DIR}/address_features.txt\")\n", + " .withColumn(\"row_num\", F.monotonically_increasing_id())\n", + " .withColumn(\"tbl_end_row\", F.trim(\"value\") == '
')\n", + " .filter(\"tbl_end_row = True\")\n", + " .select(\"row_num\")\n", + ").collect()[0][0]\n", + "\n", + "print(f\"tbl_start_row: {tbl_start_row}, tbl_end_row: {tbl_end_row}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5715c3c9-4c68-4412-8fb7-521fe8881bd1", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "len state files? 159\nOut[16]: ['tl_rd22_13001_addrfeat.zip',\n 'tl_rd22_13003_addrfeat.zip',\n 'tl_rd22_13005_addrfeat.zip',\n 'tl_rd22_13007_addrfeat.zip',\n 'tl_rd22_13009_addrfeat.zip']" + ] + } + ], + "source": [ + "state_files = [r[1] for r in (\n", + " spark.read.text(f\"{ETL_DIR}/address_features.txt\")\n", + " .withColumn(\"row_num\", F.monotonically_increasing_id())\n", + " .filter(f\"row_num > {tbl_start_row}\")\n", + " .filter(f\"row_num < {tbl_end_row}\")\n", + " .withColumn(\"href_start\", F.substring_index(\"value\", 'href=\"', -1))\n", + " .withColumn(\"href\", F.substring_index(\"href_start\", '\">', 1))\n", + " .filter(col(\"href\").startswith(f\"tl_rd22_{state_num}\")) \n", + " .select(\"row_num\",\"href\")\n", + ").collect()]\n", + "\n", + "print(f\"len state files? 
{len(state_files):,}\")\n", + "state_files[:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4668ed64-7da6-4a22-8cea-32d4b0693d62", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Download Shapefile ZIPs (159)\n", + "\n", + "> Could do this in parallel, but keeping on just driver for now so as to not overload Census server with requests.\n", + "\n", + "__Note: writing locally to driver, then copying to volume with `dbutils`.__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b618ca99-bfe5-4b15-a9d8-e5c62b23ca1b", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + " 0 --> 'tl_rd22_13001_addrfeat.zip' exists...skipping\n 1 --> 'tl_rd22_13003_addrfeat.zip' exists...skipping\n 2 --> 'tl_rd22_13005_addrfeat.zip' exists...skipping\n 3 --> 'tl_rd22_13007_addrfeat.zip' exists...skipping\n 4 --> 'tl_rd22_13009_addrfeat.zip' exists...skipping\n 5 --> 'tl_rd22_13011_addrfeat.zip' exists...skipping\n 6 --> 'tl_rd22_13013_addrfeat.zip' exists...skipping\n 7 --> 'tl_rd22_13015_addrfeat.zip' exists...skipping\n 8 --> 'tl_rd22_13017_addrfeat.zip' exists...skipping\n 9 --> 'tl_rd22_13019_addrfeat.zip' exists...skipping\n 10 --> 'tl_rd22_13021_addrfeat.zip' exists...skipping\n 11 --> 'tl_rd22_13023_addrfeat.zip' exists...skipping\n 12 --> 'tl_rd22_13025_addrfeat.zip' exists...skipping\n 13 --> 'tl_rd22_13027_addrfeat.zip' exists...skipping\n 14 --> 'tl_rd22_13029_addrfeat.zip' exists...skipping\n 15 --> 'tl_rd22_13031_addrfeat.zip' exists...skipping\n 16 --> 'tl_rd22_13033_addrfeat.zip' exists...skipping\n 17 --> 'tl_rd22_13035_addrfeat.zip' exists...skipping\n 18 --> 
'tl_rd22_13037_addrfeat.zip' exists...skipping\n 19 --> 'tl_rd22_13039_addrfeat.zip' exists...skipping\n 20 --> 'tl_rd22_13043_addrfeat.zip' exists...skipping\n 21 --> 'tl_rd22_13045_addrfeat.zip' exists...skipping\n 22 --> 'tl_rd22_13047_addrfeat.zip' exists...skipping\n 23 --> 'tl_rd22_13049_addrfeat.zip' exists...skipping\n 24 --> 'tl_rd22_13051_addrfeat.zip' exists...skipping\n 25 --> 'tl_rd22_13053_addrfeat.zip' exists...skipping\n 26 --> 'tl_rd22_13055_addrfeat.zip' exists...skipping\n 27 --> 'tl_rd22_13057_addrfeat.zip' exists...skipping\n 28 --> 'tl_rd22_13059_addrfeat.zip' exists...skipping\n 29 --> 'tl_rd22_13061_addrfeat.zip' exists...skipping\n 30 --> 'tl_rd22_13063_addrfeat.zip' exists...skipping\n 31 --> 'tl_rd22_13065_addrfeat.zip' exists...skipping\n 32 --> 'tl_rd22_13067_addrfeat.zip' exists...skipping\n 33 --> 'tl_rd22_13069_addrfeat.zip' exists...skipping\n 34 --> 'tl_rd22_13071_addrfeat.zip' exists...skipping\n 35 --> 'tl_rd22_13073_addrfeat.zip' exists...skipping\n 36 --> 'tl_rd22_13075_addrfeat.zip' exists...skipping\n 37 --> 'tl_rd22_13077_addrfeat.zip' exists...skipping\n 38 --> 'tl_rd22_13079_addrfeat.zip' exists...skipping\n 39 --> 'tl_rd22_13081_addrfeat.zip' exists...skipping\n 40 --> 'tl_rd22_13083_addrfeat.zip' exists...skipping\n 41 --> 'tl_rd22_13085_addrfeat.zip' exists...skipping\n 42 --> 'tl_rd22_13087_addrfeat.zip' exists...skipping\n 43 --> 'tl_rd22_13089_addrfeat.zip' exists...skipping\n 44 --> 'tl_rd22_13091_addrfeat.zip' exists...skipping\n 45 --> 'tl_rd22_13093_addrfeat.zip' exists...skipping\n 46 --> 'tl_rd22_13095_addrfeat.zip' exists...skipping\n 47 --> 'tl_rd22_13097_addrfeat.zip' exists...skipping\n 48 --> 'tl_rd22_13099_addrfeat.zip' exists...skipping\n 49 --> 'tl_rd22_13101_addrfeat.zip' exists...skipping\n 50 --> 'tl_rd22_13103_addrfeat.zip' exists...skipping\n 51 --> 'tl_rd22_13105_addrfeat.zip' exists...skipping\n 52 --> 'tl_rd22_13107_addrfeat.zip' exists...skipping\n 53 --> 'tl_rd22_13109_addrfeat.zip' 
exists...skipping\n 54 --> 'tl_rd22_13111_addrfeat.zip' exists...skipping\n 55 --> 'tl_rd22_13113_addrfeat.zip' exists...skipping\n 56 --> 'tl_rd22_13115_addrfeat.zip' exists...skipping\n 57 --> 'tl_rd22_13117_addrfeat.zip' exists...skipping\n 58 --> 'tl_rd22_13119_addrfeat.zip' exists...skipping\n 59 --> 'tl_rd22_13121_addrfeat.zip' exists...skipping\n 60 --> 'tl_rd22_13123_addrfeat.zip' exists...skipping\n 61 --> 'tl_rd22_13125_addrfeat.zip' exists...skipping\n 62 --> 'tl_rd22_13127_addrfeat.zip' exists...skipping\n 63 --> 'tl_rd22_13129_addrfeat.zip' exists...skipping\n 64 --> 'tl_rd22_13131_addrfeat.zip' exists...skipping\n 65 --> 'tl_rd22_13133_addrfeat.zip' exists...skipping\n 66 --> 'tl_rd22_13135_addrfeat.zip' exists...skipping\n 67 --> 'tl_rd22_13137_addrfeat.zip' exists...skipping\n 68 --> 'tl_rd22_13139_addrfeat.zip' exists...skipping\n 69 --> 'tl_rd22_13141_addrfeat.zip' exists...skipping\n 70 --> 'tl_rd22_13143_addrfeat.zip' exists...skipping\n 71 --> 'tl_rd22_13145_addrfeat.zip' exists...skipping\n 72 --> 'tl_rd22_13147_addrfeat.zip' exists...skipping\n 73 --> 'tl_rd22_13149_addrfeat.zip' exists...skipping\n 74 --> 'tl_rd22_13151_addrfeat.zip' exists...skipping\n 75 --> 'tl_rd22_13153_addrfeat.zip' exists...skipping\n 76 --> 'tl_rd22_13155_addrfeat.zip' exists...skipping\n 77 --> 'tl_rd22_13157_addrfeat.zip' exists...skipping\n 78 --> 'tl_rd22_13159_addrfeat.zip' exists...skipping\n 79 --> 'tl_rd22_13161_addrfeat.zip' exists...skipping\n 80 --> 'tl_rd22_13163_addrfeat.zip' exists...skipping\n 81 --> 'tl_rd22_13165_addrfeat.zip' exists...skipping\n 82 --> 'tl_rd22_13167_addrfeat.zip' exists...skipping\n 83 --> 'tl_rd22_13169_addrfeat.zip' exists...skipping\n 84 --> 'tl_rd22_13171_addrfeat.zip' exists...skipping\n 85 --> 'tl_rd22_13173_addrfeat.zip' exists...skipping\n 86 --> 'tl_rd22_13175_addrfeat.zip' exists...skipping\n 87 --> 'tl_rd22_13177_addrfeat.zip' exists...skipping\n 88 --> 'tl_rd22_13179_addrfeat.zip' exists...skipping\n 89 --> 
'tl_rd22_13181_addrfeat.zip' exists...skipping\n 90 --> 'tl_rd22_13183_addrfeat.zip' exists...skipping\n 91 --> 'tl_rd22_13185_addrfeat.zip' exists...skipping\n 92 --> 'tl_rd22_13187_addrfeat.zip' exists...skipping\n 93 --> 'tl_rd22_13189_addrfeat.zip' exists...skipping\n 94 --> 'tl_rd22_13191_addrfeat.zip' exists...skipping\n 95 --> 'tl_rd22_13193_addrfeat.zip' exists...skipping\n 96 --> 'tl_rd22_13195_addrfeat.zip' exists...skipping\n 97 --> 'tl_rd22_13197_addrfeat.zip' exists...skipping\n 98 --> 'tl_rd22_13199_addrfeat.zip' exists...skipping\n 99 --> 'tl_rd22_13201_addrfeat.zip' exists...skipping\n 100 --> 'tl_rd22_13205_addrfeat.zip' exists...skipping\n 101 --> 'tl_rd22_13207_addrfeat.zip' exists...skipping\n 102 --> 'tl_rd22_13209_addrfeat.zip' exists...skipping\n 103 --> 'tl_rd22_13211_addrfeat.zip' exists...skipping\n 104 --> 'tl_rd22_13213_addrfeat.zip' exists...skipping\n 105 --> 'tl_rd22_13215_addrfeat.zip' exists...skipping\n 106 --> 'tl_rd22_13217_addrfeat.zip' exists...skipping\n 107 --> 'tl_rd22_13219_addrfeat.zip' exists...skipping\n 108 --> 'tl_rd22_13221_addrfeat.zip' exists...skipping\n 109 --> 'tl_rd22_13223_addrfeat.zip' exists...skipping\n 110 --> 'tl_rd22_13225_addrfeat.zip' exists...skipping\n 111 --> 'tl_rd22_13227_addrfeat.zip' exists...skipping\n 112 --> 'tl_rd22_13229_addrfeat.zip' exists...skipping\n 113 --> 'tl_rd22_13231_addrfeat.zip' exists...skipping\n 114 --> 'tl_rd22_13233_addrfeat.zip' exists...skipping\n 115 --> 'tl_rd22_13235_addrfeat.zip' exists...skipping\n 116 --> 'tl_rd22_13237_addrfeat.zip' exists...skipping\n 117 --> 'tl_rd22_13239_addrfeat.zip' exists...skipping\n 118 --> 'tl_rd22_13241_addrfeat.zip' exists...skipping\n 119 --> 'tl_rd22_13243_addrfeat.zip' exists...skipping\n 120 --> 'tl_rd22_13245_addrfeat.zip' exists...skipping\n 121 --> 'tl_rd22_13247_addrfeat.zip' exists...skipping\n 122 --> 'tl_rd22_13249_addrfeat.zip' exists...skipping\n 123 --> 'tl_rd22_13251_addrfeat.zip' exists...skipping\n 124 --> 
'tl_rd22_13253_addrfeat.zip' exists...skipping\n 125 --> 'tl_rd22_13255_addrfeat.zip' exists...skipping\n 126 --> 'tl_rd22_13257_addrfeat.zip' exists...skipping\n 127 --> 'tl_rd22_13259_addrfeat.zip' exists...skipping\n 128 --> 'tl_rd22_13261_addrfeat.zip' exists...skipping\n 129 --> 'tl_rd22_13263_addrfeat.zip' exists...skipping\n 130 --> 'tl_rd22_13265_addrfeat.zip' exists...skipping\n 131 --> 'tl_rd22_13267_addrfeat.zip' exists...skipping\n 132 --> 'tl_rd22_13269_addrfeat.zip' exists...skipping\n 133 --> 'tl_rd22_13271_addrfeat.zip' exists...skipping\n 134 --> 'tl_rd22_13273_addrfeat.zip' exists...skipping\n 135 --> 'tl_rd22_13275_addrfeat.zip' exists...skipping\n 136 --> 'tl_rd22_13277_addrfeat.zip' exists...skipping\n 137 --> 'tl_rd22_13279_addrfeat.zip' exists...skipping\n 138 --> 'tl_rd22_13281_addrfeat.zip' exists...skipping\n 139 --> 'tl_rd22_13283_addrfeat.zip' exists...skipping\n 140 --> 'tl_rd22_13285_addrfeat.zip' exists...skipping\n 141 --> 'tl_rd22_13287_addrfeat.zip' exists...skipping\n 142 --> 'tl_rd22_13289_addrfeat.zip' exists...skipping\n 143 --> 'tl_rd22_13291_addrfeat.zip' exists...skipping\n 144 --> 'tl_rd22_13293_addrfeat.zip' exists...skipping\n 145 --> 'tl_rd22_13295_addrfeat.zip' exists...skipping\n 146 --> 'tl_rd22_13297_addrfeat.zip' exists...skipping\n 147 --> 'tl_rd22_13299_addrfeat.zip' exists...skipping\n 148 --> 'tl_rd22_13301_addrfeat.zip' exists...skipping\n 149 --> 'tl_rd22_13303_addrfeat.zip' exists...skipping\n 150 --> 'tl_rd22_13305_addrfeat.zip' exists...skipping\n 151 --> 'tl_rd22_13307_addrfeat.zip' exists...skipping\n 152 --> 'tl_rd22_13309_addrfeat.zip' exists...skipping\n 153 --> 'tl_rd22_13311_addrfeat.zip' exists...skipping\n 154 --> 'tl_rd22_13313_addrfeat.zip' exists...skipping\n 155 --> 'tl_rd22_13315_addrfeat.zip' exists...skipping\n 156 --> 'tl_rd22_13317_addrfeat.zip' exists...skipping\n 157 --> 'tl_rd22_13319_addrfeat.zip' exists...skipping\n 158 --> 'tl_rd22_13321_addrfeat.zip' exists...skipping\n" + ] + } + 
], + "source": [ + "import pathlib\n", + "import requests\n", + "\n", + "vol_path = pathlib.Path(f\"{ETL_DIR}/address_features\")\n", + "local_path = pathlib.Path(f\"address_features\")\n", + "local_path.mkdir(parents=True, exist_ok=True)\n", + "\n", + "for idx,f in enumerate(state_files):\n", + " idx_str = str(idx).rjust(4)\n", + " \n", + " vol_file = vol_path / f\n", + " if not vol_file.exists():\n", + " local_file = local_path / f \n", + " print(f\"{idx_str} --> '{f}'\")\n", + " req = requests.get(f'https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/ADDRFEAT/{f}')\n", + " with open(local_file, 'wb') as f:\n", + " f.write(req.content)\n", + " else:\n", + " print(f\"{idx_str} --> '{f}' exists...skipping\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "20c476a8-9a3f-4ca9-9d3b-f99914f861a7", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[39]: True" + ] + } + ], + "source": [ + "dbutils.fs.cp(\"file:/databricks/driver/address_features\", f\"{ETL_DIR}/address_features\", recurse=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1563023f-d74b-4e70-82c4-aa9a8c79c8f9", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "total 366M\n-rwxrwxrwx 1 nobody nogroup 1.8M Oct 23 14:53 tl_rd22_13001_addrfeat.zip\n-rwxrwxrwx 1 nobody nogroup 888K Oct 23 14:53 tl_rd22_13003_addrfeat.zip\n-rwxrwxrwx 1 nobody nogroup 814K Oct 23 14:53 tl_rd22_13005_addrfeat.zip\n-rwxrwxrwx 1 nobody nogroup 447K Oct 23 14:53 
tl_rd22_13007_addrfeat.zip\n...\n-rwxrwxrwx 1 nobody nogroup 4.2M Oct 23 14:53 tl_rd22_13313_addrfeat.zip\n-rwxrwxrwx 1 nobody nogroup 966K Oct 23 14:53 tl_rd22_13315_addrfeat.zip\n-rwxrwxrwx 1 nobody nogroup 1.1M Oct 23 14:53 tl_rd22_13317_addrfeat.zip\n-rwxrwxrwx 1 nobody nogroup 1.1M Oct 23 14:53 tl_rd22_13319_addrfeat.zip\n-rwxrwxrwx 1 nobody nogroup 1.9M Oct 23 14:53 tl_rd22_13321_addrfeat.zip\n" + ] + } + ], + "source": [ + "%sh\n", + "# avoid list all files\n", + "ls -lh $ETL_DIR/address_features | head -5\n", + "echo \"...\"\n", + "ls -lh $ETL_DIR/address_features | tail -5" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "db241304-a937-42b5-9613-70ad1b953204", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Test Render with GeoPandas\n", + "\n", + "> Just rendering the first file `tl_rd22_13001_addrfeat.zip` for an example. " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "70934189-e1c5-4d4a-a68d-df318acb7738", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %sh\n", + "# - Can copy locally to driver (but don't have to)\n", + "# mkdir -p $PWD/address_features\n", + "# cp $ETL_DIR/address_features/tl_rd22_13001_addrfeat.zip $PWD/address_features\n", + "# ls -lh $PWD/address_features" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1e353ca2-7d9c-4c89-b492-30adb73f38e9", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Get layer information_\n", + "\n", + "> Fiona is a dependency of GeoPandas." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7f029910-81ed-4bc8-9377-4f28d6239f35", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[22]: ['tl_rd22_13001_addrfeat']" + ] + } + ], + "source": [ + "fiona.listlayers(f\"zip://{ETL_DIR}/address_features/tl_rd22_13001_addrfeat.zip\") # <- 'zip://' is required here" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d821fbc8-e2c4-45d4-a3bd-035aed0e9c00", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "rows? 3,762, cols? 26\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TLIDTFIDLTFIDRARIDLARIDRLINEARIDFULLNAMELFROMHNLTOHNRFROMHN...PARITYRPLUS4LPLUS4RLFROMTYPLTOTYPRFROMTYPRTOTYPOFFSETLOFFSETRgeometry
04465900209153283259208815None40060416405411105646216480Holmesville RdNoneNone10401...ONoneNoneNoneNoneNoneNoneNNLINESTRING (-82.22232 31.64547, -82.22300 31.6...
14466075209150399209150092None400204692323391105646216633Herbert Rentz RdNoneNone599...ONoneNoneNoneNoneNoneNoneNNLINESTRING (-82.26755 31.61842, -82.26755 31.6...
2645523094209151387209151389None400204680414101105646216724Milton HallmanNoneNone1094...ENoneNoneNoneNoneNoneNoneNNLINESTRING (-82.26705 31.88152, -82.26705 31.8...
3645523094209151387209151389None400204680407431105646216724Milton HallmanNoneNone1062...ENoneNoneNoneNoneNoneNoneNNLINESTRING (-82.26705 31.88152, -82.26705 31.8...
4634061234264401804264782725None40039956216671106087813821Heath StNoneNone300...ENoneNoneNoneNoneNoneNoneNNLINESTRING (-82.33666 31.73849, -82.33635 31.7...
\n", + "

5 rows × 26 columns

\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
TLIDTFIDLTFIDRARIDLARIDRLINEARIDFULLNAMELFROMHNLTOHNRFROMHN...PARITYRPLUS4LPLUS4RLFROMTYPLTOTYPRFROMTYPRTOTYPOFFSETLOFFSETRgeometry
04465900209153283259208815None40060416405411105646216480Holmesville RdNoneNone10401...ONoneNoneNoneNoneNoneNoneNNLINESTRING (-82.22232 31.64547, -82.22300 31.6...
14466075209150399209150092None400204692323391105646216633Herbert Rentz RdNoneNone599...ONoneNoneNoneNoneNoneNoneNNLINESTRING (-82.26755 31.61842, -82.26755 31.6...
2645523094209151387209151389None400204680414101105646216724Milton HallmanNoneNone1094...ENoneNoneNoneNoneNoneNoneNNLINESTRING (-82.26705 31.88152, -82.26705 31.8...
3645523094209151387209151389None400204680407431105646216724Milton HallmanNoneNone1062...ENoneNoneNoneNoneNoneNoneNNLINESTRING (-82.26705 31.88152, -82.26705 31.8...
4634061234264401804264782725None40039956216671106087813821Heath StNoneNone300...ENoneNoneNoneNoneNoneNoneNNLINESTRING (-82.33666 31.73849, -82.33635 31.7...
\n

5 rows × 26 columns

\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "#options: driver='shapefile', layer=0; also, 'zip://' is optional\n", + "gdf = gpd.read_file(f\"{ETL_DIR}/address_features/tl_rd22_13001_addrfeat.zip\")\n", + "print(f'rows? {gdf.shape[0]:,}, cols? {gdf.shape[1]}')\n", + "gdf.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "7462661f-ac9a-42d3-b5c0-6901e668a399", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Map Rendering_\n", + "\n", + "> Convert to WGS84 (EPSG=4326) for rendering + this is recommended as baseline for all data layers." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "43c35084-fb20-427a-9de3-28ca808ea7c5", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[26]: \nName: NAD83\nAxis Info [ellipsoidal]:\n- Lat[north]: Geodetic latitude (degree)\n- Lon[east]: Geodetic longitude (degree)\nArea of Use:\n- name: North America - onshore and offshore: Canada - Alberta; British Columbia; Manitoba; New Brunswick; Newfoundland and Labrador; Northwest Territories; Nova Scotia; Nunavut; Ontario; Prince Edward Island; Quebec; Saskatchewan; Yukon. Puerto Rico. 
United States (USA) - Alabama; Alaska; Arizona; Arkansas; California; Colorado; Connecticut; Delaware; Florida; Georgia; Hawaii; Idaho; Illinois; Indiana; Iowa; Kansas; Kentucky; Louisiana; Maine; Maryland; Massachusetts; Michigan; Minnesota; Mississippi; Missouri; Montana; Nebraska; Nevada; New Hampshire; New Jersey; New Mexico; New York; North Carolina; North Dakota; Ohio; Oklahoma; Oregon; Pennsylvania; Rhode Island; South Carolina; South Dakota; Tennessee; Texas; Utah; Vermont; Virginia; Washington; West Virginia; Wisconsin; Wyoming. US Virgin Islands. British Virgin Islands.\n- bounds: (167.65, 14.92, -40.73, 86.45)\nDatum: North American Datum 1983\n- Ellipsoid: GRS 1980\n- Prime Meridian: Greenwich" + ] + } + ], + "source": [ + "gdf.crs" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d01e2ea4-a145-4546-998b-c7fb1b0c2918", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[25]: 'EPSG:4326'" + ] + } + ], + "source": [ + "gdf_4326 = gdf.to_crs(epsg=4326)\n", + "gdf_4326.crs.to_string() # <- will be used with contextily" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c9a4d22a-15fa-4756-80b6-8452b25af5c1", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n" + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "\n", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "image" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "ax = 
gdf_4326.plot(column='ZIPL', cmap=None, legend=True, figsize=(20, 20), alpha=0.5, edgecolor=\"k\")\n", + "cx.add_basemap(ax, zoom='auto', crs=gdf_4326.crs.to_string()) # <- specify crs!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "63ab22d8-14fb-4ee7-9565-9627a269c8c0", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Shapefiles to Delta Lake\n", + "\n", + "> Will use GeoPandas to read the ShapeFiles and write as Delta Table\n", + "\n", + "__Focus on `ADDRFEAT` (Address Feature) for both geometries and address ranges.__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "60090cbe-867a-4a85-a7b8-b45a9d54efde", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[28]: 159" + ] + } + ], + "source": [ + "num_shapefiles = len(dbutils.fs.ls(f\"{ETL_DIR}/address_features\"))\n", + "num_shapefiles" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "262164bc-39ac-4968-bf02-6ddd11a58025", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__[1] Define the UDF function.__\n", + "\n", + "> This will be invoked with [applyInPandas](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.GroupedData.applyInPandas.html?highlight=applyinpandas)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a4b41fae-83da-4ca0-89a6-94792edcf8dc", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[29]: Index(['TLID', 'TFIDL', 'TFIDR', 'ARIDL', 'ARIDR', 'LINEARID', 'FULLNAME',\n 'LFROMHN', 'LTOHN', 'RFROMHN', 'RTOHN', 'ZIPL', 'ZIPR', 'EDGE_MTFCC',\n 'ROAD_MTFCC', 'PARITYL', 'PARITYR', 'PLUS4L', 'PLUS4R', 'LFROMTYP',\n 'LTOTYP', 'RFROMTYP', 'RTOTYP', 'OFFSETL', 'OFFSETR', 'geometry'],\n dtype='object')" + ] + } + ], + "source": [ + "gdf_4326.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "399106db-2a7d-4fcb-b699-53125e57da35", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "def geopandas_read(pdf:pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " Read using geopandas; recommend using `repartition`\n", + " in caller to drive parallelism.\n", + " - 'path' field assumed to be a Volume path,\n", + " which is automatically FUSE mounted\n", + " - layer_num is either field 'layer_num', if present\n", + " or defaults to 0\n", + " - standardizes to CRS=4326\n", + " \"\"\"\n", + " pdf_arr = []\n", + "\n", + " # --- iterate over pdf ---\n", + " for index, row in pdf.iterrows():\n", + " # [1] read 'path' + 'layer_num'\n", + " layer_num = 0\n", + " if 'layer_num' in row:\n", + " layer_num = row['layer_num']\n", + "\n", + " file_path = row['path'].replace('dbfs:','')\n", + "\n", + " gdf = gpd.read_file(file_path, layer=layer_num)\n", + " # [2] set CRS to 4326 (WGS84)\n", + " gdf_4326 = gdf.to_crs(epsg=4326)\n", + "\n", + " # [3] \n", + " gdf_wkt = 
gdf_4326.to_wkt\n", + "\n", + " # [3] convert 'geometry' column to wkt +\n", + " pdf_arr.append(pd.DataFrame(gdf_4326.to_wkt()))\n", + "\n", + " # return as pandas dataframe\n", + " return pd.concat(pdf_arr)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f4083d81-f53e-497b-b117-870fb65174e2", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__[2] We need a schema for our return__ \n", + "\n", + "> Will use the example from above for this; in production, you will want to be more careful defining the return schema." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e12ee0ca-b544-47bf-877d-d63b65a48f16", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[31]: StructType([StructField('TLID', LongType(), True), StructField('TFIDL', LongType(), True), StructField('TFIDR', LongType(), True), StructField('ARIDL', StringType(), True), StructField('ARIDR', StringType(), True), StructField('LINEARID', StringType(), True), StructField('FULLNAME', StringType(), True), StructField('LFROMHN', StringType(), True), StructField('LTOHN', StringType(), True), StructField('RFROMHN', StringType(), True), StructField('RTOHN', StringType(), True), StructField('ZIPL', StringType(), True), StructField('ZIPR', StringType(), True), StructField('EDGE_MTFCC', StringType(), True), StructField('ROAD_MTFCC', StringType(), True), StructField('PARITYL', StringType(), True), StructField('PARITYR', StringType(), True), StructField('PLUS4L', NullType(), True), StructField('PLUS4R', NullType(), True), StructField('LFROMTYP', StringType(), True), StructField('LTOTYP', StringType(), True), StructField('RFROMTYP', 
StringType(), True), StructField('RTOTYP', StringType(), True), StructField('OFFSETL', StringType(), True), StructField('OFFSETR', StringType(), True), StructField('geometry', StringType(), True)])" + ] + } + ], + "source": [ + "spark.createDataFrame(pd.DataFrame(gdf_4326.to_wkt())).schema" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a44d01b2-2049-465e-ba3e-828ac422c8bf", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "layer_schema = StructType([\n", + " StructField('TLID', LongType(), True), \n", + " StructField('TFIDL', LongType(), True), \n", + " StructField('TFIDR', LongType(), True), \n", + " StructField('ARIDL', StringType(), True), \n", + " StructField('ARIDR', StringType(), True), \n", + " StructField('LINEARID', StringType(), True), \n", + " StructField('FULLNAME', StringType(), True), \n", + " StructField('LFROMHN', StringType(), True), \n", + " StructField('LTOHN', StringType(), True), \n", + " StructField('RFROMHN', StringType(), True), \n", + " StructField('RTOHN', StringType(), True), \n", + " StructField('ZIPL', StringType(), True), \n", + " StructField('ZIPR', StringType(), True), \n", + " StructField('EDGE_MTFCC', StringType(), True), \n", + " StructField('ROAD_MTFCC', StringType(), True), \n", + " StructField('PARITYL', StringType(), True), \n", + " StructField('PARITYR', StringType(), True), \n", + " StructField('PLUS4L', StringType(), True), # <- altered from inferred NullType\n", + " StructField('PLUS4R', StringType(), True), # <- altered from inferred NullType\n", + " StructField('LFROMTYP', StringType(), True), \n", + " StructField('LTOTYP', StringType(), True), \n", + " StructField('RFROMTYP', StringType(), True), \n", + " StructField('RTOTYP', StringType(), True), \n", + " StructField('OFFSETL', StringType(), True), \n", + " 
StructField('OFFSETR', StringType(), True), \n", + " StructField('geometry', StringType(), True)\n", + "])\n", + "\n", + "# layer_schema" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "5bd16864-a25a-4b23-bbaf-d16405339b75", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__[3] Define Spark DataFrame with Paths__\n", + "\n", + "> We just need a list of files to process, e.g. from \"address_features\" directory." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "44ffd7c1-1277-4e7e-b773-fa1663dcf959", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 159\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
pathnamesizemodificationTime
dbfs:/Volumes/mjohns/census/census_data/address_block_shapefiles/address_features/tl_rd22_13001_addrfeat.ziptl_rd22_13001_addrfeat.zip18810471698072828000
dbfs:/Volumes/mjohns/census/census_data/address_block_shapefiles/address_features/tl_rd22_13003_addrfeat.ziptl_rd22_13003_addrfeat.zip9088611698072803000
dbfs:/Volumes/mjohns/census/census_data/address_block_shapefiles/address_features/tl_rd22_13005_addrfeat.ziptl_rd22_13005_addrfeat.zip8326591698072825000
dbfs:/Volumes/mjohns/census/census_data/address_block_shapefiles/address_features/tl_rd22_13007_addrfeat.ziptl_rd22_13007_addrfeat.zip4574131698072818000
dbfs:/Volumes/mjohns/census/census_data/address_block_shapefiles/address_features/tl_rd22_13009_addrfeat.ziptl_rd22_13009_addrfeat.zip18128531698072835000
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "dbfs:/Volumes/mjohns/census/census_data/address_block_shapefiles/address_features/tl_rd22_13001_addrfeat.zip", + "tl_rd22_13001_addrfeat.zip", + 1881047, + 1698072828000 + ], + [ + "dbfs:/Volumes/mjohns/census/census_data/address_block_shapefiles/address_features/tl_rd22_13003_addrfeat.zip", + "tl_rd22_13003_addrfeat.zip", + 908861, + 1698072803000 + ], + [ + "dbfs:/Volumes/mjohns/census/census_data/address_block_shapefiles/address_features/tl_rd22_13005_addrfeat.zip", + "tl_rd22_13005_addrfeat.zip", + 832659, + 1698072825000 + ], + [ + "dbfs:/Volumes/mjohns/census/census_data/address_block_shapefiles/address_features/tl_rd22_13007_addrfeat.zip", + "tl_rd22_13007_addrfeat.zip", + 457413, + 1698072818000 + ], + [ + "dbfs:/Volumes/mjohns/census/census_data/address_block_shapefiles/address_features/tl_rd22_13009_addrfeat.zip", + "tl_rd22_13009_addrfeat.zip", + 1812853, + 1698072835000 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "path", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "name", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "size", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "modificationTime", + "type": "\"long\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "df_path = spark.createDataFrame(dbutils.fs.ls(f\"{ETL_DIR}/address_features\"))\n", + "print(f\"count? 
{df_path.count():,}\")\n", + "df_path.limit(5).display() # <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4187e60a-c86f-47ad-9d5a-1a63d7ddfe07", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__[4] Invoke the UDF__\n", + "\n", + "> Group By 'path'; also repartition by 'path' to drive parallelism." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "76552f7f-6713-4f49-85e0-2db7fb884783", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__DRY-RUN:__ _LIMIT 1_" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a5b7cfbd-5bf3-4ea5-b2bc-1379311d16ad", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 11,202\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
TLIDTFIDLTFIDRARIDLARIDRLINEARIDFULLNAMELFROMHNLTOHNRFROMHNRTOHNZIPLZIPREDGE_MTFCCROAD_MTFCCPARITYLPARITYRPLUS4LPLUS4RLFROMTYPLTOTYPRFROMTYPRTOTYPOFFSETLOFFSETRgeometry
64554442626502692026502689540015491037520400403202219911048687443Taylor Chapel Rd11981100110111993155231552S1400S1400EOnullnullnullnullnullnullNNLINESTRING (-82.632431 31.307748, -82.632431 31.307947, -82.632434 31.309355, -82.632449 31.309766, -82.63247 31.310324, -82.632459 31.311564, -82.632472 31.312339, -82.632523 31.312437, -82.632641 31.312605, -82.632924 31.312855, -82.63334 31.313174, -82.633656 31.313395, -82.634004 31.313588, -82.634468 31.313794, -82.63498 31.313971, -82.635194 31.314052, -82.635409 31.31412, -82.635645 31.314158, -82.635803 31.314115, -82.636063 31.313971, -82.636454 31.313769, -82.636746 31.31361, -82.637252 31.31342, -82.637661 31.313302, -82.637956 31.313284, -82.638186 31.313281, -82.638467 31.313405, -82.639292 31.313676, -82.641407 31.314452, -82.641721 31.314547, -82.642019 31.31467, -82.642283 31.314751, -82.64248 31.314777, -82.642776 31.314801, -82.643103 31.314741, -82.644182 31.314431, -82.644541 31.314314, -82.64521 31.314094, -82.646256 31.313798, -82.646669 31.313725)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 645544426, + 265026920, + 265026895, + "40015491037520", + "4004032022199", + "11048687443", + "Taylor Chapel Rd", + "1198", + "1100", + "1101", + "1199", + "31552", + "31552", + "S1400", + "S1400", + "E", + "O", + null, + null, + null, + null, + null, + null, + "N", + "N", + "LINESTRING (-82.632431 31.307748, -82.632431 31.307947, -82.632434 31.309355, -82.632449 31.309766, -82.63247 31.310324, -82.632459 31.311564, -82.632472 31.312339, -82.632523 31.312437, -82.632641 31.312605, -82.632924 31.312855, -82.63334 31.313174, -82.633656 31.313395, -82.634004 31.313588, -82.634468 31.313794, -82.63498 31.313971, -82.635194 31.314052, -82.635409 31.31412, -82.635645 31.314158, -82.635803 31.314115, -82.636063 31.313971, -82.636454 31.313769, -82.636746 31.31361, -82.637252 31.31342, -82.637661 31.313302, -82.637956 31.313284, -82.638186 31.313281, -82.638467 31.313405, -82.639292 31.313676, -82.641407 31.314452, -82.641721 31.314547, -82.642019 31.31467, -82.642283 31.314751, -82.64248 31.314777, -82.642776 31.314801, -82.643103 31.314741, -82.644182 31.314431, -82.644541 31.314314, -82.64521 31.314094, -82.646256 31.313798, -82.646669 31.313725)" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "TLID", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "TFIDL", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "TFIDR", + "type": "\"long\"" + }, + { + 
"metadata": "{}", + "name": "ARIDL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ARIDR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LINEARID", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "FULLNAME", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LFROMHN", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LTOHN", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RFROMHN", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RTOHN", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ZIPL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ZIPR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "EDGE_MTFCC", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ROAD_MTFCC", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PARITYL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PARITYR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PLUS4L", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PLUS4R", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LFROMTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LTOTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RFROMTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RTOTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "OFFSETL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "OFFSETR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geometry", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "spark.catalog.clearCache() # <- cache for dev, help avoid recomputes\n", + "\n", + "DRY_LIMIT = 1\n", + "\n", + "out_df = (\n", + " df_path \n", + " .limit(DRY_LIMIT) # <- NOTE: DRY-RUN\n", + " .repartition(DRY_LIMIT, \"path\") # <-repartition 
\n", + " .groupBy(\"path\") # <- groupby `path`\n", + " .applyInPandas(\n", + " geopandas_read, schema=layer_schema\n", + " )\n", + " .cache()\n", + ")\n", + "\n", + "print(f\"count? {out_df.count():,}\")\n", + "out_df.limit(1).display() # <- limiting for ipynb only" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9befaf23-eaea-465f-92dc-692b1f625835", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[36]: 0" + ] + } + ], + "source": [ + "out_df.filter(col(\"plus4l\").isNotNull()).count()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1a3e2b71-b5e2-4662-bb6e-d6c8a55f4ed8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__ACTUAL:__ _Write All to Delta Lake_\n", + "\n", + "> We are saving as a managed table named 'shape_address_block'." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a30c9972-a7e9-4a80-80d0-b4866fa0bf84", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "sql(f\"drop table if exists {catalog_name}.{db_name}.shape_address_block\")\n", + "\n", + "(\n", + " df_path \n", + " .repartition(num_shapefiles, \"path\") # <-repartition \n", + " .groupBy(\"path\") # <- groupby `path`\n", + " .applyInPandas(\n", + " geopandas_read, schema=layer_schema\n", + " )\n", + " .write\n", + " .mode(\"append\")\n", + " .saveAsTable(f\"{catalog_name}.{db_name}.shape_address_block\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f023c32d-98ec-43aa-96da-8114b6e0b617", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 782,054\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
TLIDTFIDLTFIDRARIDLARIDRLINEARIDFULLNAMELFROMHNLTOHNRFROMHNRTOHNZIPLZIPREDGE_MTFCCROAD_MTFCCPARITYLPARITYRPLUS4LPLUS4RLFROMTYPLTOTYPRFROMTYPRTOTYPOFFSETLOFFSETRgeometry
629684220250114008212667664null4004714389943110453770290Maria Sorrell Rdnullnull638898null30450S1400S1400nullEnullnullnullnullnullnullNNLINESTRING (-81.770631 32.498047, -81.770421 32.498247, -81.770249 32.498447, -81.77015 32.49861, -81.769576 32.499769, -81.769513 32.499879)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 629684220, + 250114008, + 212667664, + null, + "4004714389943", + "110453770290", + "Maria Sorrell Rd", + null, + null, + "638", + "898", + null, + "30450", + "S1400", + "S1400", + null, + "E", + null, + null, + null, + null, + null, + null, + "N", + "N", + "LINESTRING (-81.770631 32.498047, -81.770421 32.498247, -81.770249 32.498447, -81.77015 32.49861, -81.769576 32.499769, -81.769513 32.499879)" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "TLID", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "TFIDL", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "TFIDR", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "ARIDL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ARIDR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LINEARID", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "FULLNAME", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LFROMHN", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LTOHN", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RFROMHN", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RTOHN", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ZIPL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ZIPR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "EDGE_MTFCC", + 
"type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ROAD_MTFCC", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PARITYL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PARITYR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PLUS4L", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PLUS4R", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LFROMTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LTOTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RFROMTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RTOTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "OFFSETL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "OFFSETR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geometry", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "df_address = spark.table(f\"{catalog_name}.{db_name}.shape_address_block\")\n", + "\n", + "print(f\"count? {df_address.count():,}\")\n", + "df_address.limit(1).display()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "fe1152a6-cd99-496c-95fd-82e636e8bff8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__NOTE: WE DID NOT ADD A SPATIAL INDEX, THAT WILL INVOLVE [1] H3 TESSELLATION and [2] Optimizing, e.g. 
via [ZORDER](https://docs.databricks.com/en/delta/data-skipping.html) or (newer for DBR 13.3+) [LIQUID CLUSTERING](https://docs.databricks.com/en/delta/clustering.html).__" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "bba4a72f-c1ab-45b8-b4d2-e464ef2c2365", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Final Sanity Check" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "dede4ee0-595a-43a4-a214-ce3f412c123c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
databasetableNameisTemporary
censusga_address_blockfalse
censusshape_address_blockfalse
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "census", + "ga_address_block", + false + ], + [ + "census", + "shape_address_block", + false + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "database", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "tableName", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "isTemporary", + "type": "\"boolean\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql show tables" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d8f02e22-df0c-4e63-a02b-ec12326fa362", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
TLIDTFIDLTFIDRARIDLARIDRLINEARIDFULLNAMELFROMHNLTOHNRFROMHNRTOHNZIPLZIPREDGE_MTFCCROAD_MTFCCPARITYLPARITYRPLUS4LPLUS4RLFROMTYPLTOTYPRFROMTYPRTOTYPOFFSETLOFFSETRgeometry
629684220250114008212667664null4004714389943110453770290Maria Sorrell Rdnullnull638898null30450S1400S1400nullEnullnullnullnullnullnullNNLINESTRING (-81.770631 32.498047, -81.770421 32.498247, -81.770249 32.498447, -81.77015 32.49861, -81.769576 32.499769, -81.769513 32.499879)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 629684220, + 250114008, + 212667664, + null, + "4004714389943", + "110453770290", + "Maria Sorrell Rd", + null, + null, + "638", + "898", + null, + "30450", + "S1400", + "S1400", + null, + "E", + null, + null, + null, + null, + null, + null, + "N", + "N", + "LINESTRING (-81.770631 32.498047, -81.770421 32.498247, -81.770249 32.498447, -81.77015 32.49861, -81.769576 32.499769, -81.769513 32.499879)" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "TLID", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "TFIDL", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "TFIDR", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "ARIDL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ARIDR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LINEARID", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "FULLNAME", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LFROMHN", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LTOHN", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RFROMHN", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RTOHN", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ZIPL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ZIPR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "EDGE_MTFCC", + 
"type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ROAD_MTFCC", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PARITYL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PARITYR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PLUS4L", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PLUS4R", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LFROMTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LTOTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RFROMTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RTOTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "OFFSETL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "OFFSETR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geometry", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql select * from shape_address_block limit 1" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 85549841987706, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "shapefiles_geopandas_udf", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/examples/python/Shapefiles/MosaicGDAL/mosaic_gdal_shapefiles.ipynb b/notebooks/examples/python/Shapefiles/MosaicGDAL/mosaic_gdal_shapefiles.ipynb new file mode 100644 index 000000000..39443df6c --- /dev/null +++ b/notebooks/examples/python/Shapefiles/MosaicGDAL/mosaic_gdal_shapefiles.ipynb @@ -0,0 +1,2580 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ae9da0c3-a134-462a-a41f-afbb8dd35a98", + "showTitle": false, + 
"title": "" + } + }, + "source": [ + "# Mosaic + GDAL Shapefile Example\n", + "\n", + "> Download Shapefile(s) from https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/ADDRFEAT/\n", + "\n", + "__Artifacts Generated__\n", + "

\n", + "\n", + "1. Volume - `<catalog>.<schema>.census_data/address_block_shapefiles`\n", + "1. Table - `<catalog>.<schema>.ga_address_block`\n", + "\n", + "--- \n", + "__Last Update:__ 22 NOV 2023 [Mosaic 0.3.12]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4bca5d5c-92da-45e1-b8fe-cb37e7a807c2", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup\n", + "

\n", + "\n", + "1. Import Databricks columnar functions (including H3) for DBR / DBSQL Photon with `from pyspark.databricks.sql.functions import *`\n", + "2. To use Databricks Labs [Mosaic](https://databrickslabs.github.io/mosaic/index.html) library for geospatial data engineering, analysis, and visualization functionality:\n", + " * Install with `%pip install databricks-mosaic`\n", + " * Import and use with the following:\n", + " ```\n", + " import mosaic as mos\n", + " mos.enable_mosaic(spark, dbutils)\n", + " ```\n", + "

\n", + "\n", + "3. For Mosaic + GDAL (used for SHP reading)\n", + " * Follow instructions for your version [here](https://databrickslabs.github.io/mosaic/usage/install-gdal.html)\n", + " * then additionally call the following in the notebook \n", + " ```\n", + " mos.enable_gdal(spark)\n", + " ```\n", + "

\n", + "\n", + "4. To use [KeplerGl](https://kepler.gl/) OSS library for map layer rendering:\n", + " * Already installed with Mosaic, use `%%mosaic_kepler` magic [[Mosaic Docs](https://databrickslabs.github.io/mosaic/usage/kepler.html)]\n", + " * Import with `from keplergl import KeplerGl` to use directly\n", + "\n", + "__Notes:__\n", + "\n", + "If you hit `H3_NOT_ENABLED` [[docs](https://docs.databricks.com/error-messages/h3-not-enabled-error-class.html#h3_not_enabled-error-class)]:\n", + "\n", + "> `h3Expression` is disabled or unsupported. Consider enabling Photon or switching to a tier that supports H3 expressions. [[AWS](https://www.databricks.com/product/aws-pricing) | [Azure](https://azure.microsoft.com/en-us/pricing/details/databricks/) | [GCP](https://www.databricks.com/product/gcp-pricing)]\n", + "\n", + "If you have trouble with Volume access:\n", + "\n", + "* For Mosaic 0.3 series (< DBR 13) - you can copy resources to DBFS as a workaround\n", + "* For Mosaic 0.4 series (DBR 13.3 LTS) - you will need to either copy resources to DBFS or set up Unity Catalog + Shared Access, which will involve your workspace admin. Instructions, as updated, will be [here](https://databrickslabs.github.io/mosaic/usage/install-gdal.html)."
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "57cf168a-2337-413d-8bda-ca1549da216c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Python interpreter will be restarted.\nPython interpreter will be restarted.\n" + ] + } + ], + "source": [ + "%pip install \"databricks-mosaic<0.4,>=0.3\" --quiet # <- Mosaic 0.3 series\n", + "# %pip install \"databricks-mosaic<0.5,>=0.4\" --quiet # <- Mosaic 0.4 series (as available)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5dbcc88a-f311-408d-824d-ab7553909f3a", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "GDAL enabled.\n\nGDAL 3.4.3, released 2022/04/22\n\n\n" + ] + } + ], + "source": [ + "# -- configure AQE for more compute heavy operations\n", + "# - choose option-1 or option-2 below, essential for REPARTITION!\n", + "# spark.conf.set(\"spark.databricks.optimizer.adaptive.enabled\", False) # <- option-1: turn off completely for full control\n", + "spark.conf.set(\"spark.sql.adaptive.coalescePartitions.enabled\", False) # <- option-2: just tweak partition management\n", + "spark.conf.set(\"spark.sql.shuffle.partitions\", 10_000) # <-- default is 200\n", + "\n", + "# -- import databricks + spark functions\n", + "from pyspark.databricks.sql import functions as dbf\n", + "from pyspark.sql import functions as F\n", + "from pyspark.sql.functions import udf, col\n", + "from pyspark.sql.types import *\n", + "\n", + "# -- setup mosaic\n", + "import mosaic as mos\n", + "\n", + 
"mos.enable_mosaic(spark, dbutils)\n", + "mos.enable_gdal(spark)\n", + "\n", + "# --other imports\n", + "import os\n", + "import warnings\n", + "\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "0140ec49-a07e-45af-a768-65296c68a465", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Configure Database + Username__\n", + "\n", + "> Note: Adjust this to your own specified [Unity Catalog](https://docs.databricks.com/en/data-governance/unity-catalog/manage-privileges/admin-privileges.html#managing-unity-catalog-metastores) Schema." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "416e37e6-b250-4d0a-afa4-ec2a55c1df62", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[2]: DataFrame[]" + ] + } + ], + "source": [ + "catalog_name = \"mjohns\"\n", + "db_name = \"census\"\n", + "\n", + "sql(f\"use catalog {catalog_name}\")\n", + "sql(f\"use schema {db_name}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fb979e77-e604-43f1-b2d9-9ceca2e804d2", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %sql show tables" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "077c6054-1634-4b80-86e5-2c77d7eeb145", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Setup `ETL_DIR` + `ETL_DIR_FUSE`__\n", + "\n", + "> Note: 
Adjust this to your own specified [Volume](https://docs.databricks.com/en/ingestion/add-data/upload-to-volume.html#upload-files-to-a-unity-catalog-volume) (under a schema)." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0bbff310-7d73-4087-9fc8-fb5887d167e3", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "...ETL_DIR: '/Volumes/mjohns/census/census_data/address_block_shapefiles' (create)\n" + ] + } + ], + "source": [ + "ETL_DIR = f'/Volumes/{catalog_name}/{db_name}/census_data/address_block_shapefiles'\n", + "os.environ['ETL_DIR'] = ETL_DIR\n", + "\n", + "dbutils.fs.mkdirs(ETL_DIR)\n", + "print(f\"...ETL_DIR: '{ETL_DIR}' (create)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d7c307ec-aa2c-4003-ac31-142add84923d", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[0m\u001B[34;42maddress_block_shapefiles\u001B[0m/\r\n" + ] + } + ], + "source": [ + "ls $ETL_DIR/.." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3e10da79-5774-486a-b88c-f420056b6aa7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Get All GA Addresses (Shapefiles)\n", + "

\n", + "\n", + "* Look for pattern https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/ADDRFEAT/tl_rd22_13*.zip (13 is GA number)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ce9f0f90-71d4-4ec0-9628-0466aee9f287", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "state_num = \"13\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2eda22a5-11b7-4f01-ad9e-9f8ee73493c7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Make `address_features` directory.__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "694a2f2f-bcb0-4a47-bcfb-dcdd483e85f1", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[8]: True" + ] + } + ], + "source": [ + "dbutils.fs.mkdirs(f\"{ETL_DIR}/address_features\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "88055f32-6bfa-4c91-9861-4a8ba63cbe5b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Get List of Shapefile ZIPs" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3468e630-c073-44c2-aed7-a0b3c09ebcec", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + 
"/databricks/driver\nFile ‘address_features.txt’ already there; not retrieving.\n" + ] + } + ], + "source": [ + "%sh \n", + "echo \"$PWD\"\n", + "wget -O address_features.txt -nc \"https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/ADDRFEAT/\"" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a778d810-30a9-4cf5-9c45-1a00b42c3210", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "

pathnamesizemodificationTime
dbfs:/Volumes/mjohns/census/census_data/address_block_shapefiles/address_features/address_features/01700675263932
dbfs:/Volumes/mjohns/census/census_data/address_block_shapefiles/address_features.txtaddress_features.txt7741321700675264000
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "dbfs:/Volumes/mjohns/census/census_data/address_block_shapefiles/address_features/", + "address_features/", + 0, + 1700675263932 + ], + [ + "dbfs:/Volumes/mjohns/census/census_data/address_block_shapefiles/address_features.txt", + "address_features.txt", + 774132, + 1700675264000 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "path", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "name", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "size", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "modificationTime", + "type": "\"long\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "dbutils.fs.cp(\"file:/databricks/driver/address_features.txt\", ETL_DIR)\n", + "display(dbutils.fs.ls(ETL_DIR))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f6fffcb8-a376-4419-872e-ad05785c50f9", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Figure out which rows are within the `` tag and extract the filenames.__\n", + "\n", + "> Since this is all in one file being read on one node, get consistent ordered id for `row_num` (not always true)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a030a498-2613-4ff7-b1dd-2922a965a3ea", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "tbl_start_row: 237, tbl_end_row: 3463\n" + ] + } + ], + "source": [ + "tbl_start_row = (\n", + " spark.read.text(f\"{ETL_DIR}/address_features.txt\")\n", + " .withColumn(\"row_num\", F.monotonically_increasing_id())\n", + " .withColumn(\"tbl_start_row\", F.trim(\"value\") == '
')\n", + " .filter(\"tbl_start_row = True\")\n", + " .select(\"row_num\")\n", + ").collect()[0][0]\n", + "\n", + "tbl_end_row = (\n", + " spark.read.text(f\"{ETL_DIR}/address_features.txt\")\n", + " .withColumn(\"row_num\", F.monotonically_increasing_id())\n", + " .withColumn(\"tbl_end_row\", F.trim(\"value\") == '
')\n", + " .filter(\"tbl_end_row = True\")\n", + " .select(\"row_num\")\n", + ").collect()[0][0]\n", + "\n", + "print(f\"tbl_start_row: {tbl_start_row}, tbl_end_row: {tbl_end_row}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5715c3c9-4c68-4412-8fb7-521fe8881bd1", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "len state files? 159\nOut[13]: ['tl_rd22_13001_addrfeat.zip',\n 'tl_rd22_13003_addrfeat.zip',\n 'tl_rd22_13005_addrfeat.zip',\n 'tl_rd22_13007_addrfeat.zip',\n 'tl_rd22_13009_addrfeat.zip']" + ] + } + ], + "source": [ + "state_files = [r[1] for r in (\n", + " spark.read.text(f\"{ETL_DIR}/address_features.txt\")\n", + " .withColumn(\"row_num\", F.monotonically_increasing_id())\n", + " .filter(f\"row_num > {tbl_start_row}\")\n", + " .filter(f\"row_num < {tbl_end_row}\")\n", + " .withColumn(\"href_start\", F.substring_index(\"value\", 'href=\"', -1))\n", + " .withColumn(\"href\", F.substring_index(\"href_start\", '\">', 1))\n", + " .filter(col(\"href\").startswith(f\"tl_rd22_{state_num}\")) \n", + " .select(\"row_num\",\"href\")\n", + ").collect()]\n", + "\n", + "print(f\"len state files? 
{len(state_files):,}\")\n", + "state_files[:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4668ed64-7da6-4a22-8cea-32d4b0693d62", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Download Shapefile ZIPs (159)\n", + "\n", + "> Could do this in parallel, but keeping on just driver for now so as to not overload Census server with requests.\n", + "\n", + "__Note: writing locally to driver, then copying to volume with `dbutils`.__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b618ca99-bfe5-4b15-a9d8-e5c62b23ca1b", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + " 0 --> 'tl_rd22_13001_addrfeat.zip' exists...skipping\n 1 --> 'tl_rd22_13003_addrfeat.zip' exists...skipping\n 2 --> 'tl_rd22_13005_addrfeat.zip' exists...skipping\n 3 --> 'tl_rd22_13007_addrfeat.zip' exists...skipping\n 4 --> 'tl_rd22_13009_addrfeat.zip' exists...skipping\n 5 --> 'tl_rd22_13011_addrfeat.zip' exists...skipping\n 6 --> 'tl_rd22_13013_addrfeat.zip' exists...skipping\n 7 --> 'tl_rd22_13015_addrfeat.zip' exists...skipping\n 8 --> 'tl_rd22_13017_addrfeat.zip' exists...skipping\n 9 --> 'tl_rd22_13019_addrfeat.zip' exists...skipping\n 10 --> 'tl_rd22_13021_addrfeat.zip' exists...skipping\n 11 --> 'tl_rd22_13023_addrfeat.zip' exists...skipping\n 12 --> 'tl_rd22_13025_addrfeat.zip' exists...skipping\n 13 --> 'tl_rd22_13027_addrfeat.zip' exists...skipping\n 14 --> 'tl_rd22_13029_addrfeat.zip' exists...skipping\n 15 --> 'tl_rd22_13031_addrfeat.zip' exists...skipping\n 16 --> 'tl_rd22_13033_addrfeat.zip' exists...skipping\n 17 --> 'tl_rd22_13035_addrfeat.zip' exists...skipping\n 18 --> 
'tl_rd22_13037_addrfeat.zip' exists...skipping\n 19 --> 'tl_rd22_13039_addrfeat.zip' exists...skipping\n 20 --> 'tl_rd22_13043_addrfeat.zip' exists...skipping\n 21 --> 'tl_rd22_13045_addrfeat.zip' exists...skipping\n 22 --> 'tl_rd22_13047_addrfeat.zip' exists...skipping\n 23 --> 'tl_rd22_13049_addrfeat.zip' exists...skipping\n 24 --> 'tl_rd22_13051_addrfeat.zip' exists...skipping\n 25 --> 'tl_rd22_13053_addrfeat.zip' exists...skipping\n 26 --> 'tl_rd22_13055_addrfeat.zip' exists...skipping\n 27 --> 'tl_rd22_13057_addrfeat.zip' exists...skipping\n 28 --> 'tl_rd22_13059_addrfeat.zip' exists...skipping\n 29 --> 'tl_rd22_13061_addrfeat.zip' exists...skipping\n 30 --> 'tl_rd22_13063_addrfeat.zip' exists...skipping\n 31 --> 'tl_rd22_13065_addrfeat.zip' exists...skipping\n 32 --> 'tl_rd22_13067_addrfeat.zip' exists...skipping\n 33 --> 'tl_rd22_13069_addrfeat.zip' exists...skipping\n 34 --> 'tl_rd22_13071_addrfeat.zip' exists...skipping\n 35 --> 'tl_rd22_13073_addrfeat.zip' exists...skipping\n 36 --> 'tl_rd22_13075_addrfeat.zip' exists...skipping\n 37 --> 'tl_rd22_13077_addrfeat.zip' exists...skipping\n 38 --> 'tl_rd22_13079_addrfeat.zip' exists...skipping\n 39 --> 'tl_rd22_13081_addrfeat.zip' exists...skipping\n 40 --> 'tl_rd22_13083_addrfeat.zip' exists...skipping\n 41 --> 'tl_rd22_13085_addrfeat.zip' exists...skipping\n 42 --> 'tl_rd22_13087_addrfeat.zip' exists...skipping\n 43 --> 'tl_rd22_13089_addrfeat.zip' exists...skipping\n 44 --> 'tl_rd22_13091_addrfeat.zip' exists...skipping\n 45 --> 'tl_rd22_13093_addrfeat.zip' exists...skipping\n 46 --> 'tl_rd22_13095_addrfeat.zip' exists...skipping\n 47 --> 'tl_rd22_13097_addrfeat.zip' exists...skipping\n 48 --> 'tl_rd22_13099_addrfeat.zip' exists...skipping\n 49 --> 'tl_rd22_13101_addrfeat.zip' exists...skipping\n 50 --> 'tl_rd22_13103_addrfeat.zip' exists...skipping\n 51 --> 'tl_rd22_13105_addrfeat.zip' exists...skipping\n 52 --> 'tl_rd22_13107_addrfeat.zip' exists...skipping\n 53 --> 'tl_rd22_13109_addrfeat.zip' 
exists...skipping\n 54 --> 'tl_rd22_13111_addrfeat.zip' exists...skipping\n 55 --> 'tl_rd22_13113_addrfeat.zip' exists...skipping\n 56 --> 'tl_rd22_13115_addrfeat.zip' exists...skipping\n 57 --> 'tl_rd22_13117_addrfeat.zip' exists...skipping\n 58 --> 'tl_rd22_13119_addrfeat.zip' exists...skipping\n 59 --> 'tl_rd22_13121_addrfeat.zip' exists...skipping\n 60 --> 'tl_rd22_13123_addrfeat.zip' exists...skipping\n 61 --> 'tl_rd22_13125_addrfeat.zip' exists...skipping\n 62 --> 'tl_rd22_13127_addrfeat.zip' exists...skipping\n 63 --> 'tl_rd22_13129_addrfeat.zip' exists...skipping\n 64 --> 'tl_rd22_13131_addrfeat.zip' exists...skipping\n 65 --> 'tl_rd22_13133_addrfeat.zip' exists...skipping\n 66 --> 'tl_rd22_13135_addrfeat.zip' exists...skipping\n 67 --> 'tl_rd22_13137_addrfeat.zip' exists...skipping\n 68 --> 'tl_rd22_13139_addrfeat.zip' exists...skipping\n 69 --> 'tl_rd22_13141_addrfeat.zip' exists...skipping\n 70 --> 'tl_rd22_13143_addrfeat.zip' exists...skipping\n 71 --> 'tl_rd22_13145_addrfeat.zip' exists...skipping\n 72 --> 'tl_rd22_13147_addrfeat.zip' exists...skipping\n 73 --> 'tl_rd22_13149_addrfeat.zip' exists...skipping\n 74 --> 'tl_rd22_13151_addrfeat.zip' exists...skipping\n 75 --> 'tl_rd22_13153_addrfeat.zip' exists...skipping\n 76 --> 'tl_rd22_13155_addrfeat.zip' exists...skipping\n 77 --> 'tl_rd22_13157_addrfeat.zip' exists...skipping\n 78 --> 'tl_rd22_13159_addrfeat.zip' exists...skipping\n 79 --> 'tl_rd22_13161_addrfeat.zip' exists...skipping\n 80 --> 'tl_rd22_13163_addrfeat.zip' exists...skipping\n 81 --> 'tl_rd22_13165_addrfeat.zip' exists...skipping\n 82 --> 'tl_rd22_13167_addrfeat.zip' exists...skipping\n 83 --> 'tl_rd22_13169_addrfeat.zip' exists...skipping\n 84 --> 'tl_rd22_13171_addrfeat.zip' exists...skipping\n 85 --> 'tl_rd22_13173_addrfeat.zip' exists...skipping\n 86 --> 'tl_rd22_13175_addrfeat.zip' exists...skipping\n 87 --> 'tl_rd22_13177_addrfeat.zip' exists...skipping\n 88 --> 'tl_rd22_13179_addrfeat.zip' exists...skipping\n 89 --> 
'tl_rd22_13181_addrfeat.zip' exists...skipping\n 90 --> 'tl_rd22_13183_addrfeat.zip' exists...skipping\n 91 --> 'tl_rd22_13185_addrfeat.zip' exists...skipping\n 92 --> 'tl_rd22_13187_addrfeat.zip' exists...skipping\n 93 --> 'tl_rd22_13189_addrfeat.zip' exists...skipping\n 94 --> 'tl_rd22_13191_addrfeat.zip' exists...skipping\n 95 --> 'tl_rd22_13193_addrfeat.zip' exists...skipping\n 96 --> 'tl_rd22_13195_addrfeat.zip' exists...skipping\n 97 --> 'tl_rd22_13197_addrfeat.zip' exists...skipping\n 98 --> 'tl_rd22_13199_addrfeat.zip' exists...skipping\n 99 --> 'tl_rd22_13201_addrfeat.zip' exists...skipping\n 100 --> 'tl_rd22_13205_addrfeat.zip' exists...skipping\n 101 --> 'tl_rd22_13207_addrfeat.zip' exists...skipping\n 102 --> 'tl_rd22_13209_addrfeat.zip' exists...skipping\n 103 --> 'tl_rd22_13211_addrfeat.zip' exists...skipping\n 104 --> 'tl_rd22_13213_addrfeat.zip' exists...skipping\n 105 --> 'tl_rd22_13215_addrfeat.zip' exists...skipping\n 106 --> 'tl_rd22_13217_addrfeat.zip' exists...skipping\n 107 --> 'tl_rd22_13219_addrfeat.zip' exists...skipping\n 108 --> 'tl_rd22_13221_addrfeat.zip' exists...skipping\n 109 --> 'tl_rd22_13223_addrfeat.zip' exists...skipping\n 110 --> 'tl_rd22_13225_addrfeat.zip' exists...skipping\n 111 --> 'tl_rd22_13227_addrfeat.zip' exists...skipping\n 112 --> 'tl_rd22_13229_addrfeat.zip' exists...skipping\n 113 --> 'tl_rd22_13231_addrfeat.zip' exists...skipping\n 114 --> 'tl_rd22_13233_addrfeat.zip' exists...skipping\n 115 --> 'tl_rd22_13235_addrfeat.zip' exists...skipping\n 116 --> 'tl_rd22_13237_addrfeat.zip' exists...skipping\n 117 --> 'tl_rd22_13239_addrfeat.zip' exists...skipping\n 118 --> 'tl_rd22_13241_addrfeat.zip' exists...skipping\n 119 --> 'tl_rd22_13243_addrfeat.zip' exists...skipping\n 120 --> 'tl_rd22_13245_addrfeat.zip' exists...skipping\n 121 --> 'tl_rd22_13247_addrfeat.zip' exists...skipping\n 122 --> 'tl_rd22_13249_addrfeat.zip' exists...skipping\n 123 --> 'tl_rd22_13251_addrfeat.zip' exists...skipping\n 124 --> 
'tl_rd22_13253_addrfeat.zip' exists...skipping\n 125 --> 'tl_rd22_13255_addrfeat.zip' exists...skipping\n 126 --> 'tl_rd22_13257_addrfeat.zip' exists...skipping\n 127 --> 'tl_rd22_13259_addrfeat.zip' exists...skipping\n 128 --> 'tl_rd22_13261_addrfeat.zip' exists...skipping\n 129 --> 'tl_rd22_13263_addrfeat.zip' exists...skipping\n 130 --> 'tl_rd22_13265_addrfeat.zip' exists...skipping\n 131 --> 'tl_rd22_13267_addrfeat.zip' exists...skipping\n 132 --> 'tl_rd22_13269_addrfeat.zip' exists...skipping\n 133 --> 'tl_rd22_13271_addrfeat.zip' exists...skipping\n 134 --> 'tl_rd22_13273_addrfeat.zip' exists...skipping\n 135 --> 'tl_rd22_13275_addrfeat.zip' exists...skipping\n 136 --> 'tl_rd22_13277_addrfeat.zip' exists...skipping\n 137 --> 'tl_rd22_13279_addrfeat.zip' exists...skipping\n 138 --> 'tl_rd22_13281_addrfeat.zip' exists...skipping\n 139 --> 'tl_rd22_13283_addrfeat.zip' exists...skipping\n 140 --> 'tl_rd22_13285_addrfeat.zip' exists...skipping\n 141 --> 'tl_rd22_13287_addrfeat.zip' exists...skipping\n 142 --> 'tl_rd22_13289_addrfeat.zip' exists...skipping\n 143 --> 'tl_rd22_13291_addrfeat.zip' exists...skipping\n 144 --> 'tl_rd22_13293_addrfeat.zip' exists...skipping\n 145 --> 'tl_rd22_13295_addrfeat.zip' exists...skipping\n 146 --> 'tl_rd22_13297_addrfeat.zip' exists...skipping\n 147 --> 'tl_rd22_13299_addrfeat.zip' exists...skipping\n 148 --> 'tl_rd22_13301_addrfeat.zip' exists...skipping\n 149 --> 'tl_rd22_13303_addrfeat.zip' exists...skipping\n 150 --> 'tl_rd22_13305_addrfeat.zip' exists...skipping\n 151 --> 'tl_rd22_13307_addrfeat.zip' exists...skipping\n 152 --> 'tl_rd22_13309_addrfeat.zip' exists...skipping\n 153 --> 'tl_rd22_13311_addrfeat.zip' exists...skipping\n 154 --> 'tl_rd22_13313_addrfeat.zip' exists...skipping\n 155 --> 'tl_rd22_13315_addrfeat.zip' exists...skipping\n 156 --> 'tl_rd22_13317_addrfeat.zip' exists...skipping\n 157 --> 'tl_rd22_13319_addrfeat.zip' exists...skipping\n 158 --> 'tl_rd22_13321_addrfeat.zip' exists...skipping\n" + ] + } + 
], + "source": [ + "import pathlib\n", + "import requests\n", + "\n", + "vol_path = pathlib.Path(f\"{ETL_DIR}/address_features\")\n", + "local_path = pathlib.Path(f\"address_features\")\n", + "local_path.mkdir(parents=True, exist_ok=True)\n", + "\n", + "for idx,f in enumerate(state_files):\n", + " idx_str = str(idx).rjust(4)\n", + " \n", + " vol_file = vol_path / f\n", + " if not vol_file.exists():\n", + " local_file = local_path / f \n", + " print(f\"{idx_str} --> '{f}'\")\n", + " req = requests.get(f'https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/ADDRFEAT/{f}')\n", + " with open(local_file, 'wb') as f:\n", + " f.write(req.content)\n", + " else:\n", + " print(f\"{idx_str} --> '{f}' exists...skipping\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "20c476a8-9a3f-4ca9-9d3b-f99914f861a7", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[39]: True" + ] + } + ], + "source": [ + "dbutils.fs.cp(\"file:/databricks/driver/address_features\", f\"{ETL_DIR}/address_features\", recurse=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1563023f-d74b-4e70-82c4-aa9a8c79c8f9", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "total 366M\n-rwxrwxrwx 1 nobody nogroup 1.8M Oct 23 14:53 tl_rd22_13001_addrfeat.zip\n-rwxrwxrwx 1 nobody nogroup 888K Oct 23 14:53 tl_rd22_13003_addrfeat.zip\n-rwxrwxrwx 1 nobody nogroup 814K Oct 23 14:53 tl_rd22_13005_addrfeat.zip\n-rwxrwxrwx 1 nobody nogroup 447K Oct 23 14:53 
tl_rd22_13007_addrfeat.zip\n...\n-rwxrwxrwx 1 nobody nogroup 4.2M Oct 23 14:53 tl_rd22_13313_addrfeat.zip\n-rwxrwxrwx 1 nobody nogroup 966K Oct 23 14:53 tl_rd22_13315_addrfeat.zip\n-rwxrwxrwx 1 nobody nogroup 1.1M Oct 23 14:53 tl_rd22_13317_addrfeat.zip\n-rwxrwxrwx 1 nobody nogroup 1.1M Oct 23 14:53 tl_rd22_13319_addrfeat.zip\n-rwxrwxrwx 1 nobody nogroup 1.9M Oct 23 14:53 tl_rd22_13321_addrfeat.zip\n" + ] + } + ], + "source": [ + "%sh\n", + "# avoid list all files\n", + "ls -lh $ETL_DIR/address_features | head -5\n", + "echo \"...\"\n", + "ls -lh $ETL_DIR/address_features | tail -5" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "8ec7388f-b607-46ba-a5b8-c2f4116ca1fd", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Note:__ The following cells show DBFS-based processing (copying the files from [Volumes](https://docs.databricks.com/en/sql/language-manual/sql-ref-volumes.html) to DBFS for access); you could skip this if you are fully set up with Unity Catalog + Shared Access clusters." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "eaa1c491-bfef-4a07-9295-821969bc1bdd", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[15]: True" + ] + } + ], + "source": [ + "# - change to your preferred DBFS path\n", + "ETL_DBFS_DIR = \"/home/mjohns@databricks.com/datasets/census/address_features\"\n", + "os.environ['ETL_DBFS_DIR'] = ETL_DBFS_DIR\n", + "dbutils.fs.mkdirs(ETL_DBFS_DIR)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "14d5b6e5-eeb7-4e48-9f1f-3b7fd1ebfbf3", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
pathnamesizemodificationTime
dbfs:/home/mjohns@databricks.com/datasets/census/address_features/tl_rd22_13001_addrfeat.ziptl_rd22_13001_addrfeat.zip18810471700675678000
dbfs:/home/mjohns@databricks.com/datasets/census/address_features/tl_rd22_13003_addrfeat.ziptl_rd22_13003_addrfeat.zip9088611700675678000
dbfs:/home/mjohns@databricks.com/datasets/census/address_features/tl_rd22_13005_addrfeat.ziptl_rd22_13005_addrfeat.zip8326591700675679000
dbfs:/home/mjohns@databricks.com/datasets/census/address_features/tl_rd22_13007_addrfeat.ziptl_rd22_13007_addrfeat.zip4574131700675679000
dbfs:/home/mjohns@databricks.com/datasets/census/address_features/tl_rd22_13009_addrfeat.ziptl_rd22_13009_addrfeat.zip18128531700675679000
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "dbfs:/home/mjohns@databricks.com/datasets/census/address_features/tl_rd22_13001_addrfeat.zip", + "tl_rd22_13001_addrfeat.zip", + 1881047, + 1700675678000 + ], + [ + "dbfs:/home/mjohns@databricks.com/datasets/census/address_features/tl_rd22_13003_addrfeat.zip", + "tl_rd22_13003_addrfeat.zip", + 908861, + 1700675678000 + ], + [ + "dbfs:/home/mjohns@databricks.com/datasets/census/address_features/tl_rd22_13005_addrfeat.zip", + "tl_rd22_13005_addrfeat.zip", + 832659, + 1700675679000 + ], + [ + "dbfs:/home/mjohns@databricks.com/datasets/census/address_features/tl_rd22_13007_addrfeat.zip", + "tl_rd22_13007_addrfeat.zip", + 457413, + 1700675679000 + ], + [ + "dbfs:/home/mjohns@databricks.com/datasets/census/address_features/tl_rd22_13009_addrfeat.zip", + "tl_rd22_13009_addrfeat.zip", + 1812853, + 1700675679000 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "path", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "name", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "size", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "modificationTime", + "type": "\"long\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "dbutils.fs.cp(f\"{ETL_DIR}/address_features\", ETL_DBFS_DIR, recurse=True)\n", + "display(dbutils.fs.ls(ETL_DBFS_DIR)[:5]) # <- just showing the first 5 for ipynb" + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "db241304-a937-42b5-9613-70ad1b953204", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Test Render with Kepler\n", + "\n", + "> Just rendering the first file `tl_rd22_13001_addrfeat.zip` for an example. " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "73632755-3450-49d6-8802-89786eba420f", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 3,762, num invalid? 0\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
fullnamelfromhnltohnziplrfromhnrtohnziprgeom_wktis_valid
Holmesville Rd000104011049931563LINESTRING (-82.222325 31.645474,-82.222997 31.645623)true
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "Holmesville Rd", + 0, + 0, + 0, + 10401, + 10499, + 31563, + "LINESTRING (-82.222325 31.645474,-82.222997 31.645623)", + true + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "fullname", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "lfromhn", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "ltohn", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "zipl", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "rfromhn", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "rtohn", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "zipr", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "geom_wkt", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "is_valid", + "type": "\"boolean\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "df_kepler = (\n", + " mos.read()\n", + " .format(\"multi_read_ogr\")\n", + " .option(\"vsizip\", \"true\")\n", + " .option(\"asWKB\", \"false\")\n", + " .load(f\"dbfs:{ETL_DBFS_DIR}/tl_rd22_13001_addrfeat.zip\")\n", + " .withColumn(\"geom\", mos.st_geomfromwkt(\"geom_0\"))\n", + " .withColumn(\"is_valid\", mos.st_isvalid(\"geom\"))\n", + " .selectExpr(\n", + " \"fullname\", \"lfromhn\", \"ltohn\", \"zipl\", \"rfromhn\", \"rtohn\", \"zipr\",\n", + " \"geom_0 as geom_wkt\", \"is_valid\"\n", + " )\n", + ")\n", + 
"print(f\"count? {df_kepler.count():,}, num invalid? {df_kepler.filter('is_valid = False').count():,}\")\n", + "df_kepler.limit(1).display() # <- limiting for ipynb only" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7177408a-3a5b-4a61-8c16-5e0d0485a129", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "197ea9bf-b3ff-41a5-b1eb-a0ef44b40025", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "44f95338-e268-483e-93d9-6c3e70d37107", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# df_kepler \"geom_wkt\" \"geometry\" 10_000 " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "63ab22d8-14fb-4ee7-9565-9627a269c8c0", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Shapefiles to Delta Lake\n", + "\n", + "> Will use Mosaic + GDAL to read the 
ShapeFiles and write as Delta Lake to DBFS\n", + "\n", + "__Focus on `ADDRFEAT` (Address Feature) for both geometries and address ranges.__" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "7f9fcddc-5dee-4597-bc20-1dfa945e222b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Assess `ST_Transform` to 4326 (from 4269)__\n", + "\n", + "> See that 'geom_0_srid' is 4269, so use `st_setsrid` and `st_transform` to standardize to 4326, more [here](https://databrickslabs.github.io/mosaic/usage/grid-indexes-bng.html#coordinate-reference-system). _This just uses one file to demonstrate the transform initially, full data is transformed later._\n", + "\n", + "__Note:__ _This pattern will shift to avoid Mosaic internal geometry in Mosaic 0.4 series._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7954c22b-fb4b-4c4b-9355-ef74d3a1b1e6", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 3,762\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
TLIDTFIDLTFIDRARIDLARIDRLINEARIDFULLNAMELFROMHNLTOHNRFROMHNRTOHNZIPLZIPREDGE_MTFCCROAD_MTFCCPARITYLPARITYRPLUS4LPLUS4RLFROMTYPLTOTYPRFROMTYPRTOTYPOFFSETLOFFSETRgeom_0geom_0_srid
4465900209153283259208815040060416405411105646216480Holmesville Rd001040110499031563S1400S1400ONNLINESTRING (-82.222325 31.645474,-82.222997 31.645623)4269
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 4465900, + 209153283, + 259208815, + 0, + 4006041640541, + 1105646216480, + "Holmesville Rd", + 0, + 0, + 10401, + 10499, + 0, + 31563, + "S1400", + "S1400", + "", + "O", + "", + "", + "", + "", + "", + "", + "N", + "N", + "LINESTRING (-82.222325 31.645474,-82.222997 31.645623)", + "4269" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "TLID", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "TFIDL", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "TFIDR", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "ARIDL", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "ARIDR", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "LINEARID", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "FULLNAME", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LFROMHN", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "LTOHN", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "RFROMHN", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "RTOHN", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "ZIPL", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "ZIPR", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "EDGE_MTFCC", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ROAD_MTFCC", + "type": "\"string\"" + }, + { + "metadata": "{}", + 
"name": "PARITYL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PARITYR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PLUS4L", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PLUS4R", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LFROMTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LTOTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RFROMTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RTOTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "OFFSETL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "OFFSETR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geom_0", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geom_0_srid", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "df_test = (\n", + " mos.read()\n", + " .format(\"multi_read_ogr\")\n", + " .option(\"vsizip\", \"true\")\n", + " .option(\"asWKB\", \"false\")\n", + " .load(f\"dbfs:{ETL_DBFS_DIR}/tl_rd22_13001_addrfeat.zip\")\n", + ")\n", + "print(f\"count? {df_test.count():,}\")\n", + "df_test.limit(1).display() # <- limiting for ipynb only" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "35e70182-7813-4235-b96f-c280b4cf6528", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 3,762\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
TLIDTFIDLTFIDRARIDLARIDRLINEARIDFULLNAMELFROMHNLTOHNRFROMHNRTOHNZIPLZIPREDGE_MTFCCROAD_MTFCCPARITYLPARITYRPLUS4LPLUS4RLFROMTYPLTOTYPRFROMTYPRTOTYPOFFSETLOFFSETRgeom_0geom_0_sridgeom_4269can_coords_from_4269geomis_coords_4326geom_wktis_valid
4465900209153283259208815040060416405411105646216480Holmesville Rd001040110499031563S1400S1400ONNLINESTRING (-82.222325 31.645474,-82.222997 31.645623)4269List(3, 4269, List(List(List(-82.222325, 31.645474), List(-82.222997, 31.645623))), List(List(List())))trueList(3, 4326, List(List(List(-82.222325, 31.645474), List(-82.222997, 31.645623))), List(List(List())))trueLINESTRING (-82.222325 31.645474, -82.222997 31.645623)true
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 4465900, + 209153283, + 259208815, + 0, + 4006041640541, + 1105646216480, + "Holmesville Rd", + 0, + 0, + 10401, + 10499, + 0, + 31563, + "S1400", + "S1400", + "", + "O", + "", + "", + "", + "", + "", + "", + "N", + "N", + "LINESTRING (-82.222325 31.645474,-82.222997 31.645623)", + "4269", + [ + 3, + 4269, + [ + [ + [ + -82.222325, + 31.645474 + ], + [ + -82.222997, + 31.645623 + ] + ] + ], + [ + [ + [] + ] + ] + ], + true, + [ + 3, + 4326, + [ + [ + [ + -82.222325, + 31.645474 + ], + [ + -82.222997, + 31.645623 + ] + ] + ], + [ + [ + [] + ] + ] + ], + true, + "LINESTRING (-82.222325 31.645474, -82.222997 31.645623)", + true + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "TLID", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "TFIDL", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "TFIDR", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "ARIDL", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "ARIDR", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "LINEARID", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "FULLNAME", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LFROMHN", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "LTOHN", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "RFROMHN", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "RTOHN", + 
"type": "\"long\"" + }, + { + "metadata": "{}", + "name": "ZIPL", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "ZIPR", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "EDGE_MTFCC", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ROAD_MTFCC", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PARITYL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PARITYR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PLUS4L", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PLUS4R", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LFROMTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LTOTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RFROMTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RTOTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "OFFSETL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "OFFSETR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geom_0", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geom_0_srid", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geom_4269", + "type": 
"{\"type\":\"struct\",\"fields\":[{\"name\":\"type_id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"srid\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"boundary\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":\"double\",\"containsNull\":true},\"containsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"holes\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":\"double\",\"containsNull\":true},\"containsNull\":true},\"containsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}}]}" + }, + { + "metadata": "{}", + "name": "can_coords_from_4269", + "type": "\"boolean\"" + }, + { + "metadata": "{}", + "name": "geom", + "type": "{\"type\":\"struct\",\"fields\":[{\"name\":\"type_id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"srid\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"boundary\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":\"double\",\"containsNull\":true},\"containsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"holes\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":\"double\",\"containsNull\":true},\"containsNull\":true},\"containsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}}]}" + }, + { + "metadata": "{}", + "name": "is_coords_4326", + "type": "\"boolean\"" + }, + { + "metadata": "{}", + "name": "geom_wkt", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "is_valid", + "type": "\"boolean\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "df_trans_test = 
(\n", + " mos.read()\n", + " .format(\"multi_read_ogr\")\n", + " .option(\"vsizip\", \"true\")\n", + " .option(\"asWKB\", \"false\")\n", + " .load(f\"dbfs:{ETL_DBFS_DIR}/tl_rd22_13001_addrfeat.zip\")\n", + " .withColumn(\"geom_4269\", mos.st_geomfromwkt(\"geom_0\"))\n", + " .withColumn(\"geom_4269\", mos.st_setsrid(\"geom_4269\", F.lit(4269)))\n", + " .withColumn(\"can_coords_from_4269\", mos.st_hasvalidcoordinates(\"geom_4269\", F.lit(\"EPSG:4326\"), F.lit('reprojected_bounds')))\n", + " .withColumn(\"geom\", mos.st_transform(\"geom_4269\", F.lit(4326)))\n", + " .withColumn(\"is_coords_4326\", mos.st_hasvalidcoordinates(\"geom\", F.lit(\"EPSG:4326\"), F.lit('bounds')))\n", + " .withColumn(\"geom_wkt\", mos.st_astext(\"geom\"))\n", + " .withColumn(\"is_valid\", mos.st_isvalid(\"geom_wkt\"))\n", + ")\n", + "print(f\"count? {df_trans_test.count():,}\")\n", + "df_trans_test.limit(1).display() # <- limiting for ipynb only" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "60090cbe-867a-4a85-a7b8-b45a9d54efde", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[63]: 159" + ] + } + ], + "source": [ + "num_shapefiles = len(dbutils.fs.ls(ETL_DBFS_DIR))\n", + "num_shapefiles" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b6b2f61c-2597-44bf-a176-49c7fe9b22c8", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "_df = (\n", + " mos.read()\n", + " .format(\"multi_read_ogr\")\n", + " .option(\"vsizip\", \"true\")\n", + " .option(\"asWKB\", \"false\")\n", + " .load(f\"dbfs:{ETL_DBFS_DIR}/\")\n", + " 
.repartition(num_shapefiles, F.rand())\n", + " .withColumn(\"geom_4269\", mos.st_geomfromwkt(\"geom_0\"))\n", + " .withColumn(\"geom_4269\", mos.st_setsrid(\"geom_4269\", F.lit(4269)))\n", + " .withColumn(\"geom\", mos.st_transform(\"geom_4269\", F.lit(4326)))\n", + " .withColumn(\"geom_wkt\", mos.st_astext(\"geom\"))\n", + " .withColumn(\"is_valid\", mos.st_isvalid(\"geom_wkt\"))\n", + " .selectExpr(\n", + " \"* except(geom_0, geom_4269, geom, geom_wkt, is_valid)\",\n", + " \"geom_wkt\", \"is_valid\"\n", + " )\n", + ")\n", + "\n", + "## -- wait until write to delta, will be faster --\n", + "# print(f\"\"\"count? {_df.count():,}, num invalid? {_df.filter(\"is_valid = False\").count():,}\"\"\")\n", + "# _df.display()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1a3e2b71-b5e2-4662-bb6e-d6c8a55f4ed8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Write to Delta Lake\n", + "\n", + "> We are saving as a managed table named 'ga_address_block'." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a30c9972-a7e9-4a80-80d0-b4866fa0bf84", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 782,054, num invalid? 0\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
TLIDTFIDLTFIDRARIDLARIDRLINEARIDFULLNAMELFROMHNLTOHNRFROMHNRTOHNZIPLZIPREDGE_MTFCCROAD_MTFCCPARITYLPARITYRPLUS4LPLUS4RLFROMTYPLTOTYPRFROMTYPRTOTYPOFFSETLOFFSETRgeom_0_sridgeom_wktis_valid
647339257265438206265438206400101245966240010125263861101019209353Old Forge Dr10001298100112993007630076S1400S1400EOIINN4269LINESTRING (-84.333864 34.03804, -84.333828 34.037969, -84.33378 34.037876, -84.333718 34.037712, -84.333677 34.037543, -84.33365600000002 34.037371, -84.333689 34.035147, -84.333711 34.034936, -84.333764 34.034735, -84.333848 34.03454, -84.33396100000002 34.034355000000005)true
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 647339257, + 265438206, + 265438206, + 4001012459662, + 4001012526386, + 1101019209353, + "Old Forge Dr", + 1000, + 1298, + 1001, + 1299, + 30076, + 30076, + "S1400", + "S1400", + "E", + "O", + "", + "", + "", + "I", + "", + "I", + "N", + "N", + "4269", + "LINESTRING (-84.333864 34.03804, -84.333828 34.037969, -84.33378 34.037876, -84.333718 34.037712, -84.333677 34.037543, -84.33365600000002 34.037371, -84.333689 34.035147, -84.333711 34.034936, -84.333764 34.034735, -84.333848 34.03454, -84.33396100000002 34.034355000000005)", + true + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "TLID", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "TFIDL", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "TFIDR", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "ARIDL", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "ARIDR", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "LINEARID", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "FULLNAME", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LFROMHN", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "LTOHN", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "RFROMHN", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "RTOHN", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "ZIPL", + "type": "\"long\"" + }, 
+ { + "metadata": "{}", + "name": "ZIPR", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "EDGE_MTFCC", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ROAD_MTFCC", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PARITYL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PARITYR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PLUS4L", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PLUS4R", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LFROMTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LTOTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RFROMTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RTOTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "OFFSETL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "OFFSETR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geom_0_srid", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geom_wkt", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "is_valid", + "type": "\"boolean\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "(\n", + " _df\n", + " .write\n", + " .mode(\"overwrite\")\n", + " .option(\"mergeSchema\", \"true\")\n", + " .saveAsTable(f\"{catalog_name}.{db_name}.ga_address_block\")\n", + ")\n", + "\n", + "df_address = spark.table(f\"{catalog_name}.{db_name}.ga_address_block\")\n", + "\n", + "print(f\"count? {df_address.count():,}, num invalid? 
{df_address.filter('is_valid = False').count():,}\")\n", + "df_address.limit(1).display()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "bba4a72f-c1ab-45b8-b4d2-e464ef2c2365", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Final Sanity Check" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "dede4ee0-595a-43a4-a214-ce3f412c123c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
databasetableNameisTemporary
censusga_address_blockfalse
censusshape_address_blockfalse
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "census", + "ga_address_block", + false + ], + [ + "census", + "shape_address_block", + false + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "database", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "tableName", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "isTemporary", + "type": "\"boolean\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql show tables" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d8f02e22-df0c-4e63-a02b-ec12326fa362", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
TLIDTFIDLTFIDRARIDLARIDRLINEARIDFULLNAMELFROMHNLTOHNRFROMHNRTOHNZIPLZIPREDGE_MTFCCROAD_MTFCCPARITYLPARITYRPLUS4LPLUS4RLFROMTYPLTOTYPRFROMTYPRTOTYPOFFSETLOFFSETRgeom_0_sridgeom_wktis_valid
647339257265438206265438206400101245966240010125263861101019209353Old Forge Dr10001298100112993007630076S1400S1400EOIINN4269LINESTRING (-84.333864 34.03804, -84.333828 34.037969, -84.33378 34.037876, -84.333718 34.037712, -84.333677 34.037543, -84.33365600000002 34.037371, -84.333689 34.035147, -84.333711 34.034936, -84.333764 34.034735, -84.333848 34.03454, -84.33396100000002 34.034355000000005)true
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 647339257, + 265438206, + 265438206, + 4001012459662, + 4001012526386, + 1101019209353, + "Old Forge Dr", + 1000, + 1298, + 1001, + 1299, + 30076, + 30076, + "S1400", + "S1400", + "E", + "O", + "", + "", + "", + "I", + "", + "I", + "N", + "N", + "4269", + "LINESTRING (-84.333864 34.03804, -84.333828 34.037969, -84.33378 34.037876, -84.333718 34.037712, -84.333677 34.037543, -84.33365600000002 34.037371, -84.333689 34.035147, -84.333711 34.034936, -84.333764 34.034735, -84.333848 34.03454, -84.33396100000002 34.034355000000005)", + true + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "TLID", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "TFIDL", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "TFIDR", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "ARIDL", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "ARIDR", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "LINEARID", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "FULLNAME", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LFROMHN", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "LTOHN", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "RFROMHN", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "RTOHN", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "ZIPL", + "type": "\"long\"" + }, 
+ { + "metadata": "{}", + "name": "ZIPR", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "EDGE_MTFCC", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "ROAD_MTFCC", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PARITYL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PARITYR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PLUS4L", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "PLUS4R", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LFROMTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "LTOTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RFROMTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "RTOTYP", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "OFFSETL", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "OFFSETR", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geom_0_srid", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geom_wkt", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "is_valid", + "type": "\"boolean\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql select * from ga_address_block limit 1" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 85549841987820, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "mosaic_gdal_shapefiles", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/examples/python/Shapefiles/README.md b/notebooks/examples/python/Shapefiles/README.md new file mode 100644 index 000000000..64b56ed1f --- /dev/null +++ b/notebooks/examples/python/Shapefiles/README.md @@ -0,0 +1,5 @@ +# Shapefile Examples + +> A couple of examples 
loading shapefiles into Databricks. + +__Note: `ipynb` files can be previewed in GitHub and can also be imported into Databricks, more [here](https://docs.databricks.com/en/notebooks/notebook-export-import.html).__ diff --git a/notebooks/examples/python/Ship2ShipTransfers/01. Data Prep.ipynb b/notebooks/examples/python/Ship2ShipTransfers/01. Data Prep.ipynb new file mode 100644 index 000000000..89d8120c7 --- /dev/null +++ b/notebooks/examples/python/Ship2ShipTransfers/01. Data Prep.ipynb @@ -0,0 +1,1129 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "45f7a0b5-6237-4e83-a3e1-364b071901ee", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup\n", + "\n", + "> Generates the table 'harbours_h3' in database 'ship2ship'.\n", + "\n", + "

\n", + "\n", + "1. Import Databricks columnar functions (including H3) for DBR / DBSQL Photon with `from pyspark.databricks.sql.functions import *`\n", + "2. To use Databricks Labs [Mosaic](https://databrickslabs.github.io/mosaic/index.html) library for geospatial data engineering, analysis, and visualization functionality:\n", + " * Install with `%pip install databricks-mosaic`\n", + " * Import and use with the following:\n", + " ```\n", + " import mosaic as mos\n", + " mos.enable_mosaic(spark, dbutils)\n", + " ```\n", + "

\n", + "\n", + "3. To use [KeplerGl](https://kepler.gl/) OSS library for map layer rendering:\n", + " * Already installed with Mosaic, use `%%mosaic_kepler` magic [[Mosaic Docs](https://databrickslabs.github.io/mosaic/usage/kepler.html)]\n", + " * Import with `from keplergl import KeplerGl` to use directly\n", + "\n", + "If you have trouble with Volume access:\n", + "\n", + "* For Mosaic 0.3 series (< DBR 13) - you can copy resources to DBFS as a workaround\n", + "* For Mosaic 0.4 series (DBR 13.3 LTS) - you will need to either copy resources to DBFS or setup for Unity Catalog + Shared Access which will involve your workspace admin. Instructions, as updated, will be [here](https://databrickslabs.github.io/mosaic/usage/install-gdal.html).\n", + "\n", + "---\n", + "__Last Updated:__ 27 NOV 2023 [Mosaic 0.3.12]" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "14677abf-cdba-4606-8453-ac6405a19b77", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Python interpreter will be restarted.\nPython interpreter will be restarted.\n" + ] + } + ], + "source": [ + "%pip install \"databricks-mosaic<0.4,>=0.3\" --quiet # <- Mosaic 0.3 series\n", + "# %pip install \"databricks-mosaic<0.5,>=0.4\" --quiet # <- Mosaic 0.4 series (as available)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "16a02bda-f795-4736-8a0f-7cff3bc07c0c", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# -- configure AQE for more compute heavy operations\n", + "# - choose option-1 or option-2 below, essential for REPARTITION!\n", + "# 
spark.conf.set(\"spark.databricks.optimizer.adaptive.enabled\", False) # <- option-1: turn off completely for full control\n", + "spark.conf.set(\"spark.sql.adaptive.coalescePartitions.enabled\", False) # <- option-2: just tweak partition management\n", + "spark.conf.set(\"spark.sql.shuffle.partitions\", 1_024) # <-- default is 200\n", + "\n", + "# -- import databricks + spark functions\n", + "from pyspark.databricks.sql import functions as dbf\n", + "from pyspark.sql import functions as F\n", + "from pyspark.sql.functions import col, udf\n", + "from pyspark.sql.types import *\n", + "\n", + "# -- setup mosaic\n", + "import mosaic as mos\n", + "\n", + "mos.enable_mosaic(spark, dbutils)\n", + "# mos.enable_gdal(spark) # <- not needed for this example\n", + "\n", + "# --other imports\n", + "import os\n", + "import warnings\n", + "\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6cda4285-edab-47be-9d30-a9c9209e78a0", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Configure Database__\n", + "\n", + "> Note: Adjust this to your own specified [Unity Catalog](https://docs.databricks.com/en/data-governance/unity-catalog/manage-privileges/admin-privileges.html#managing-unity-catalog-metastores) Schema." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e27ffc3d-de44-44c6-a160-99cbb2bf5fe3", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[2]: DataFrame[]" + ] + } + ], + "source": [ + "catalog_name = \"mjohns\"\n", + "sql(f\"use catalog {catalog_name}\")\n", + "\n", + "db_name = \"ship2ship\"\n", + "sql(f\"CREATE DATABASE IF NOT EXISTS {db_name}\")\n", + "sql(f\"use schema {db_name}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "0c0d5c41-f152-4455-afef-72ab921a3d72", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__AIS Data Download: `ETL_DIR` + `ETL_DIR_FUSE`__\n", + "\n", + "> Downloading initial data into a temp location. After the Delta Tables have been created, this location can be removed. You can alter this, of course, to match your preferred location. __Note:__ this is showing DBFS for continuity outside Unity Catalog + Shared Access clusters, but you can easily modify paths to use [Volumes](https://docs.databricks.com/en/sql/language-manual/sql-ref-volumes.html), see more details [here](https://databrickslabs.github.io/mosaic/usage/installation.html) as available." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0edf604e-5e52-49f1-ae4f-baef341ca47d", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "...ETL_DIR: '/tmp/ship2ship', ETL_DIR_FUSE: '/dbfs/tmp/ship2ship' (create)\n" + ] + } + ], + "source": [ + "ETL_DIR = '/tmp/ship2ship'\n", + "ETL_DIR_FUSE = f'/dbfs{ETL_DIR}'\n", + "\n", + "os.environ['ETL_DIR'] = ETL_DIR\n", + "os.environ['ETL_DIR_FUSE'] = ETL_DIR_FUSE\n", + "\n", + "dbutils.fs.mkdirs(ETL_DIR)\n", + "print(f\"...ETL_DIR: '{ETL_DIR}', ETL_DIR_FUSE: '{ETL_DIR_FUSE}' (create)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "305f4172-0e06-4f51-94f9-0903a3e2e5a6", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Archive: AIS_2018_01_31.zip\n inflating: AIS_2018_01_31.csv \ntotal 735M\n-rwxrwxrwx 1 root root 735M Nov 27 19:53 AIS_2018_01_31.csv\n" + ] + } + ], + "source": [ + "%sh\n", + "# see: https://coast.noaa.gov/htdata/CMSP/AISDataHandler/2018/index.html\n", + "# - [1] we download data locally and unzip\n", + "mkdir /ship2ship/\n", + "cd /ship2ship/\n", + "wget -np -r -nH -L --cut-dirs=4 -nc https://coast.noaa.gov/htdata/CMSP/AISDataHandler/2018/AIS_2018_01_31.zip > /dev/null 2>&1\n", + "unzip AIS_2018_01_31.zip\n", + "\n", + "# - [2] then copy to dbfs:// fuse mountpoint (/dbfs)\n", + "mv AIS_2018_01_31.csv $ETL_DIR_FUSE\n", + "ls -lh $ETL_DIR_FUSE" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + 
"cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "029f1fcc-cc00-469b-8d2d-08d171a984d3", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "

MMSIBaseDateTimeLATLONSOGCOGHeadingVesselNameIMOCallSignVesselTypeStatusLengthWidthDraftCargoTranscieverClass
3673536602018-01-31T01:37:52.000+000040.78783-73.919830.196.035.0RED HOOKIMO9501916WDE4476700107164.980A
3673536602018-01-31T01:36:42.000+000040.78783-73.919830.2320.035.0RED HOOKIMO9501916WDE4476700107164.980A
3673536602018-01-31T01:34:11.000+000040.78783-73.919830.2167.035.0RED HOOKIMO9501916WDE4476700107164.980A
3673536602018-01-31T01:31:51.000+000040.78783-73.919830.0170.034.0RED HOOKIMO9501916WDE4476700107164.980A
3673536602018-01-31T01:33:02.000+000040.78783-73.919830.1125.035.0RED HOOKIMO9501916WDE4476700107164.980A
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 367353660, + "2018-01-31T01:37:52.000+0000", + 40.78783, + -73.91983, + 0.1, + 96.0, + 35.0, + "RED HOOK", + "IMO9501916", + "WDE4476", + 70, + 0, + 107, + 16, + 4.9, + 80, + "A" + ], + [ + 367353660, + "2018-01-31T01:36:42.000+0000", + 40.78783, + -73.91983, + 0.2, + 320.0, + 35.0, + "RED HOOK", + "IMO9501916", + "WDE4476", + 70, + 0, + 107, + 16, + 4.9, + 80, + "A" + ], + [ + 367353660, + "2018-01-31T01:34:11.000+0000", + 40.78783, + -73.91983, + 0.2, + 167.0, + 35.0, + "RED HOOK", + "IMO9501916", + "WDE4476", + 70, + 0, + 107, + 16, + 4.9, + 80, + "A" + ], + [ + 367353660, + "2018-01-31T01:31:51.000+0000", + 40.78783, + -73.91983, + 0.0, + 170.0, + 34.0, + "RED HOOK", + "IMO9501916", + "WDE4476", + 70, + 0, + 107, + 16, + 4.9, + 80, + "A" + ], + [ + 367353660, + "2018-01-31T01:33:02.000+0000", + 40.78783, + -73.91983, + 0.1, + 125.0, + 35.0, + "RED HOOK", + "IMO9501916", + "WDE4476", + 70, + 0, + 107, + 16, + 4.9, + 80, + "A" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "MMSI", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "BaseDateTime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "LAT", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "LON", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "SOG", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "COG", + "type": "\"double\"" + }, + { + 
"metadata": "{}", + "name": "Heading", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "VesselName", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "IMO", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "CallSign", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "VesselType", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Status", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Length", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Width", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Draft", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "Cargo", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "TranscieverClass", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "schema = \"\"\"\n", + " MMSI int, \n", + " BaseDateTime timestamp, \n", + " LAT double, \n", + " LON double, \n", + " SOG double, \n", + " COG double, \n", + " Heading double, \n", + " VesselName string, \n", + " IMO string, \n", + " CallSign string, \n", + " VesselType int, \n", + " Status int, \n", + " Length int, \n", + " Width int, \n", + " Draft double, \n", + " Cargo int, \n", + " TranscieverClass string\n", + "\"\"\"\n", + "\n", + "AIS_df = (\n", + " spark.read.csv(ETL_DIR, header=True, schema=schema)\n", + " .filter(\"VesselType = 70\") # <- only select cargos\n", + " .filter(\"Status IS NOT NULL\")\n", + ")\n", + "display(AIS_df.limit(5)) # <- limiting for ipynb only" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c784e3a3-96a7-4bbe-9f11-7ccabc54e58a", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "(AIS_df.write.format(\"delta\").mode(\"overwrite\").saveAsTable(\"AIS\"))" + ] + 
}, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c2196de6-8bbd-4e6a-8bef-c87ef2eefdb1", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
count
521,867
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "521,867" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "count", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql select format_number(count(1), 0) as count from AIS" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "23b47629-80df-42bb-9b94-2c12f7d90dd1", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Harbours\n", + "\n", + "This data can be obtained from [here](https://data-usdot.opendata.arcgis.com/datasets/usdot::ports-major/about), and loaded with the code below.\n", + "\n", + "To avoid detecting overlap close to, or within harbours, in Notebook `03.b Advanced Overlap Detection` we filter out events taking place close to a harbour.\n", + "Various approaches are possible, including filtering out events too close to shore, and can be implemented in a similar fashion.\n", + "\n", + "In this instance we set a buffer of `10 km` around harbours to arbitrarily define an area wherein we do not expect ship-to-ship transfers to take place.\n", + "Since our projection is not in metres, we convert from decimal degrees. 
We take `(0.00001 - 0.000001)` decimal degrees as approximately equal to one metre at the equator.\n", + "Ref: http://wiki.gis.com/wiki/index.php/Decimal_degrees" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d432c9b5-ccf6-4e74-b3d4-6d4ed51f2cfd", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "total 735M\n-rwxrwxrwx 1 root root 735M Nov 27 19:53 AIS_2018_01_31.csv\n-rwxrwxrwx 1 root root 5.9K Nov 27 19:59 harbours.geojson\n" + ] + } + ], + "source": [ + "%sh\n", + "# we download data to dbfs:// mountpoint (/dbfs)\n", + "cd $ETL_DIR_FUSE && \\\n", + " wget -np -r -nH -L -q --cut-dirs=7 -O harbours.geojson -nc \"https://geo.dot.gov/mapping/rest/services/NTAD/Strategic_Ports/MapServer/0/query?outFields=*&where=1%3D1&f=geojson\"\n", + "\n", + "ls -lh $ETL_DIR_FUSE" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a3845378-e1c1-4eaa-8254-42b96a1ec72c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
namegeom
Beaumont, TXPOLYGON ((-93.99840083045797 30.076160457289905, -94.00013015522168 30.058602328308453, -94.00525167253195 30.041718948377046, -94.01356856535074 30.026159136318142, -94.02476122015118 30.012520846983115, -94.03839950948621 30.001328192182676, -94.05395932154511 29.99301129936389, -94.07084270147652 29.987889782053614, -94.08840083045797 29.986160457289905, -94.10595895943942 29.987889782053614, -94.12284233937083 29.99301129936389, -94.13840215142973 30.001328192182676, -94.15204044076476 30.012520846983115, -94.1632330955652 30.026159136318142, -94.17154998838399 30.041718948377046, -94.17667150569426 30.058602328308453, -94.17840083045797 30.076160457289905, -94.17667150569426 30.093718586271358, -94.17154998838399 30.110601966202765, -94.1632330955652 30.126161778261668, -94.15204044076476 30.139800067596695, -94.13840215142973 30.150992722397135, -94.12284233937083 30.15930961521592, -94.10595895943942 30.164431132526197, -94.08840083045797 30.166160457289905, -94.07084270147652 30.164431132526197, -94.05395932154511 30.15930961521592, -94.03839950948621 30.150992722397135, -94.02476122015118 30.139800067596695, -94.01356856535074 30.126161778261668, -94.00525167253195 30.110601966202765, -94.00013015522168 30.093718586271358, -93.99840083045797 30.076160457289905))
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "Beaumont, TX", + "POLYGON ((-93.99840083045797 30.076160457289905, -94.00013015522168 30.058602328308453, -94.00525167253195 30.041718948377046, -94.01356856535074 30.026159136318142, -94.02476122015118 30.012520846983115, -94.03839950948621 30.001328192182676, -94.05395932154511 29.99301129936389, -94.07084270147652 29.987889782053614, -94.08840083045797 29.986160457289905, -94.10595895943942 29.987889782053614, -94.12284233937083 29.99301129936389, -94.13840215142973 30.001328192182676, -94.15204044076476 30.012520846983115, -94.1632330955652 30.026159136318142, -94.17154998838399 30.041718948377046, -94.17667150569426 30.058602328308453, -94.17840083045797 30.076160457289905, -94.17667150569426 30.093718586271358, -94.17154998838399 30.110601966202765, -94.1632330955652 30.126161778261668, -94.15204044076476 30.139800067596695, -94.13840215142973 30.150992722397135, -94.12284233937083 30.15930961521592, -94.10595895943942 30.164431132526197, -94.08840083045797 30.166160457289905, -94.07084270147652 30.164431132526197, -94.05395932154511 30.15930961521592, -94.03839950948621 30.150992722397135, -94.02476122015118 30.139800067596695, -94.01356856535074 30.126161778261668, -94.00525167253195 30.110601966202765, -94.00013015522168 30.093718586271358, -93.99840083045797 30.076160457289905))" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "name", + "type": "\"string\"" + 
}, + { + "metadata": "{}", + "name": "geom", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "one_metre = 0.00001 - 0.000001\n", + "buffer = 10 * 1000 * one_metre\n", + "\n", + "major_ports = (\n", + " spark.read.format(\"json\")\n", + " .option(\"multiline\", \"true\")\n", + " .load(f\"{ETL_DIR}/harbours.geojson\")\n", + " .select(\"type\", F.explode(col(\"features\")).alias(\"feature\"))\n", + " .select(\n", + " \"type\",\n", + " col(\"feature.properties\").alias(\"properties\"),\n", + " F.to_json(col(\"feature.geometry\")).alias(\"json_geometry\"),\n", + " )\n", + " .withColumn(\"geom\", mos.st_aswkt(mos.st_geomfromgeojson(\"json_geometry\")))\n", + " .select(col(\"properties.PORT_NAME\").alias(\"name\"), \"geom\")\n", + " .withColumn(\"geom\", mos.st_buffer(\"geom\", F.lit(buffer)))\n", + ")\n", + "major_ports.limit(1).display() # <- limiting for ipynb only" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "93ae6db4-58d9-47d6-9ed7-5c2ddaa6d5f0", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a2b7ed7a-35cf-43bd-a0bc-0160ee317802", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks 
for actual results._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "408696c6-efc7-4810-8bf3-c9e6bbfcdde0", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# major_ports \"geom\" \"geometry\"" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "055a7491-8ffc-43fa-8675-d62720360d5b", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "(\n", + " major_ports.select(\"name\", mos.grid_tessellateexplode(\"geom\", F.lit(9)).alias(\"mos\"))\n", + " .select(\"name\", col(\"mos.index_id\").alias(\"h3\"))\n", + " .write.mode(\"overwrite\")\n", + " .format(\"delta\")\n", + " .saveAsTable(\"harbours_h3\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "56529998-ff0c-407c-b442-7cbeff0b6027", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
nameh3
Beaumont, TX618197022217338879
Beaumont, TX618196982298836991
Beaumont, TX618197022704664575
Beaumont, TX618196978558566399
Beaumont, TX618197022151540735
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "Beaumont, TX", + 618197022217338879 + ], + [ + "Beaumont, TX", + 618196982298836991 + ], + [ + "Beaumont, TX", + 618197022704664575 + ], + [ + "Beaumont, TX", + 618196978558566399 + ], + [ + "Beaumont, TX", + 618197022151540735 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "name", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "h3", + "type": "\"long\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "harbours_h3 = spark.read.table(\"harbours_h3\")\n", + "display(harbours_h3.limit(5)) # <- limiting for ipynb only" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f056ec35-3ada-4887-8916-306aa30f4ece", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "fa4abb30-81cb-4b9c-9938-0f8b1f38df45", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cb71bd65-7f91-472e-98c7-f62f93d0cea6", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# \"harbours_h3\" \"h3\" \"h3\" 5_000" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 85549842168716, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "01. Data Prep", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/examples/python/Ship2ShipTransfers/01. Data Prep.py b/notebooks/examples/python/Ship2ShipTransfers/01. Data Prep.py deleted file mode 100644 index a0378d68b..000000000 --- a/notebooks/examples/python/Ship2ShipTransfers/01. 
Data Prep.py +++ /dev/null @@ -1,141 +0,0 @@ -# Databricks notebook source -# MAGIC %md ## We First Prep the data and download it - -# COMMAND ---------- - -# MAGIC %pip install databricks_mosaic - -# COMMAND ---------- - -from pyspark.sql.functions import * -import mosaic as mos - -spark.conf.set("spark.databricks.labs.mosaic.geometry.api", "JTS") -spark.conf.set("spark.databricks.labs.mosaic.index.system", "H3") -mos.enable_mosaic(spark, dbutils) - -# COMMAND ---------- - -# MAGIC %md ##AIS Data - -# COMMAND ---------- - -dbutils.fs.mkdirs("/tmp/ship2ship") - -# COMMAND ---------- - -# MAGIC %sh -# MAGIC # see: https://coast.noaa.gov/htdata/CMSP/AISDataHandler/2018/index.html -# MAGIC # we download data to dbfs:// mountpoint (/dbfs) -# MAGIC mkdir /ship2ship/ -# MAGIC cd /ship2ship/ -# MAGIC wget -np -r -nH -L --cut-dirs=4 https://coast.noaa.gov/htdata/CMSP/AISDataHandler/2018/AIS_2018_01_31.zip > /dev/null 2>&1 -# MAGIC unzip AIS_2018_01_31.zip -# MAGIC mv AIS_2018_01_31.csv /dbfs/tmp/ship2ship/ - -# COMMAND ---------- - -schema = """ - MMSI int, - BaseDateTime timestamp, - LAT double, - LON double, - SOG double, - COG double, - Heading double, - VesselName string, - IMO string, - CallSign string, - VesselType int, - Status int, - Length int, - Width int, - Draft double, - Cargo int, - TranscieverClass string -""" - -AIS_df = ( - spark.read.csv("/tmp/ship2ship", header=True, schema=schema) - .filter("VesselType = 70") # Only select cargos - .filter("Status IS NOT NULL") -) -display(AIS_df) - -# COMMAND ---------- - -# MAGIC %sql -# MAGIC CREATE DATABASE IF NOT EXISTS ship2ship - -# COMMAND ---------- - -(AIS_df.write.format("delta").mode("overwrite").saveAsTable("ship2ship.AIS")) - -# COMMAND ---------- - -# MAGIC %md ## Harbours -# MAGIC -# MAGIC This data can be obtained from [here](https://data-usdot.opendata.arcgis.com/datasets/usdot::ports-major/about), and loaded with the code below. 
-# MAGIC -# MAGIC To avoid detecting overlap close to, or within harbours, in Notebook `03.b Advanced Overlap Detection` we filter out events taking place close to a harbour. -# MAGIC Various approaches are possible, including filtering out events too close to shore, and can be implemented in a similar fashion. -# MAGIC -# MAGIC In this instance we set a buffer of `10 km` around harbours to arbitrarily define an area wherein we do not expect ship-to-ship transfers to take place. -# MAGIC Since our projection is not in metres, we convert from decimal degrees. With `(0.00001 - 0.000001)` as being equal to one metre at the equator -# MAGIC Ref: http://wiki.gis.com/wiki/index.php/Decimal_degrees - -# COMMAND ---------- - -# MAGIC %sh -# MAGIC # we download data to dbfs:// mountpoint (/dbfs) -# MAGIC cd /dbfs/tmp/ship2ship/ -# MAGIC # wget -np -r -nH -L -q --cut-dirs=7 -O harbours.geojson "https://geo.dot.gov/mapping/rest/services/NTAD/Ports_Major/MapServer/0/query?outFields=*&where=1%3D1&f=geojson" -# MAGIC wget -np -r -nH -L -q --cut-dirs=7 -O harbours.geojson "https://geo.dot.gov/mapping/rest/services/NTAD/Strategic_Ports/MapServer/0/query?outFields=*&where=1%3D1&f=geojson" - -# COMMAND ---------- - -one_metre = 0.00001 - 0.000001 -buffer = 10 * 1000 * one_metre - -major_ports = ( - spark.read.format("json") - .option("multiline", "true") - .load("/tmp/ship2ship/harbours.geojson") - .select("type", explode(col("features")).alias("feature")) - .select( - "type", - col("feature.properties").alias("properties"), - to_json(col("feature.geometry")).alias("json_geometry"), - ) - .withColumn("geom", mos.st_aswkt(mos.st_geomfromgeojson("json_geometry"))) - .select(col("properties.PORT_NAME").alias("name"), "geom") - .withColumn("geom", mos.st_buffer("geom", lit(buffer))) -) -display(major_ports) - -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC major_ports "geom" "geometry" - -# COMMAND ---------- - -( - major_ports.select("name", mos.grid_tessellateexplode("geom", 
lit(9)).alias("mos")) - .select("name", col("mos.index_id").alias("h3")) - .write.mode("overwrite") - .format("delta") - .saveAsTable("ship2ship.harbours_h3") -) - -# COMMAND ---------- - -harbours_h3 = spark.read.table("ship2ship.harbours_h3") -display(harbours_h3) - -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC "harbours_h3" "h3" "h3" 5_000 - -# COMMAND ---------- diff --git a/notebooks/examples/python/Ship2ShipTransfers/02. Data Ingestion.ipynb b/notebooks/examples/python/Ship2ShipTransfers/02. Data Ingestion.ipynb new file mode 100644 index 000000000..ae2b9610d --- /dev/null +++ b/notebooks/examples/python/Ship2ShipTransfers/02. Data Ingestion.ipynb @@ -0,0 +1,1126 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ab3d9623-51a7-494f-9e5d-125e641ff600", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup\n", + "\n", + "---\n", + "__Last Updated:__ 27 NOV 2023 [Mosaic 0.3.12]" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "62903103-9e6d-408a-ae71-9fa0bd3c5b2b", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Python interpreter will be restarted.\nPython interpreter will be restarted.\n" + ] + } + ], + "source": [ + "%pip install \"databricks-mosaic<0.4,>=0.3\" --quiet # <- Mosaic 0.3 series\n", + "# %pip install \"databricks-mosaic<0.5,>=0.4\" --quiet # <- Mosaic 0.4 series (as available)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a304a53f-6d97-4f18-a80d-35c372c32c7c", + 
"showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# -- configure AQE for more compute heavy operations\n", + "# - choose option-1 or option-2 below, essential for REPARTITION!\n", + "# spark.conf.set(\"spark.databricks.optimizer.adaptive.enabled\", False) # <- option-1: turn off completely for full control\n", + "spark.conf.set(\"spark.sql.adaptive.coalescePartitions.enabled\", False) # <- option-2: just tweak partition management\n", + "spark.conf.set(\"spark.sql.shuffle.partitions\", 1_024) # <-- default is 200\n", + "\n", + "# -- import databricks + spark functions\n", + "from pyspark.databricks.sql import functions as dbf\n", + "from pyspark.sql import functions as F\n", + "from pyspark.sql.functions import col, udf\n", + "from pyspark.sql.types import *\n", + "\n", + "# -- setup mosaic\n", + "import mosaic as mos\n", + "\n", + "mos.enable_mosaic(spark, dbutils)\n", + "# mos.enable_gdal(spark) # <- not needed for this example\n", + "\n", + "# --other imports\n", + "import warnings\n", + "\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c4acd803-e44f-4f00-9611-2c8032818306", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Configure Database__\n", + "\n", + "> Adjust this to settings from the Data Prep notebook." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9298ec9a-0383-4bb5-95f3-046774a2e0e5", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[2]: DataFrame[]" + ] + } + ], + "source": [ + "catalog_name = \"mjohns\"\n", + "db_name = \"ship2ship\"\n", + "\n", + "sql(f\"use catalog {catalog_name}\")\n", + "sql(f\"use schema {db_name}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "84daf849-2e8c-468f-bf27-008e0f7408ac", + "showTitle": false, + "title": "" + } + }, + "source": [ + "\n", + "We begin with loading from a table. Here we use captured `AIS` data.\n", + "\n", + "

\n", + "\n", + "- `MMSI`: unique 9-digit identification code of the ship - numeric\n", + "- `VesselName`: name of the ship - string\n", + "- `CallSign`: unique callsign of the ship - string\n", + "- `BaseDateTime`: timestamp of the AIS message - datetime\n", + "- `LAT`: latitude of the ship (in degree: [-90 ; 90], negative value represents South, 91 indicates ‘not available’) - numeric\n", + "- `LON`: longitude of the ship (in degree: [-180 ; 180], negative value represents West, 181 indicates ‘not available’) - numeric\n", + "- `SOG`: speed over ground, in knots - numeric\n", + "- `Status`: status of the ship - string" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c7ad164b-8168-4de2-a03c-dd2b3c8b9b62", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "

MMSIBaseDateTimeLATLONSOGCOGHeadingVesselNameIMOCallSignVesselTypeStatusLengthWidthDraftCargoTranscieverClass
2283429002018-01-31T21:38:42.000+000033.57885-121.5771715.090.487.0CMA CGM MEDEAIMO9299800FMFR7003494215.0nullA
3054620002018-01-31T00:04:12.000+000045.49781-73.5460.0197.2184.0BBC OREGONIMO9501265V2EK7705138217.5nullA
3054620002018-01-31T00:01:17.000+000045.49782-73.545980.0197.2184.0BBC OREGONIMO9501265V2EK7705138217.5nullA
3054620002018-01-31T00:10:15.000+000045.49784-73.546070.0197.2184.0BBC OREGONIMO9501265V2EK7705138217.5nullA
3054620002018-01-31T00:07:19.000+000045.49781-73.546040.0197.2184.0BBC OREGONIMO9501265V2EK7705138217.5nullA
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 228342900, + "2018-01-31T21:38:42.000+0000", + 33.57885, + -121.57717, + 15.0, + 90.4, + 87.0, + "CMA CGM MEDEA", + "IMO9299800", + "FMFR", + 70, + 0, + 349, + 42, + 15.0, + null, + "A" + ], + [ + 305462000, + "2018-01-31T00:04:12.000+0000", + 45.49781, + -73.546, + 0.0, + 197.2, + 184.0, + "BBC OREGON", + "IMO9501265", + "V2EK7", + 70, + 5, + 138, + 21, + 7.5, + null, + "A" + ], + [ + 305462000, + "2018-01-31T00:01:17.000+0000", + 45.49782, + -73.54598, + 0.0, + 197.2, + 184.0, + "BBC OREGON", + "IMO9501265", + "V2EK7", + 70, + 5, + 138, + 21, + 7.5, + null, + "A" + ], + [ + 305462000, + "2018-01-31T00:10:15.000+0000", + 45.49784, + -73.54607, + 0.0, + 197.2, + 184.0, + "BBC OREGON", + "IMO9501265", + "V2EK7", + 70, + 5, + 138, + 21, + 7.5, + null, + "A" + ], + [ + 305462000, + "2018-01-31T00:07:19.000+0000", + 45.49781, + -73.54604, + 0.0, + 197.2, + 184.0, + "BBC OREGON", + "IMO9501265", + "V2EK7", + 70, + 5, + 138, + 21, + 7.5, + null, + "A" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "MMSI", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "BaseDateTime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "LAT", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "LON", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "SOG", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "COG", + "type": 
"\"double\"" + }, + { + "metadata": "{}", + "name": "Heading", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "VesselName", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "IMO", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "CallSign", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "VesselType", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Status", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Length", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Width", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Draft", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "Cargo", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "TranscieverClass", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "cargos = spark.read.table(\"AIS\")\n", + "display(cargos.limit(5)) # <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "40a1ac33-8f26-42de-b9e2-4e462e349168", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## AIS Data Indexing\n", + "\n", + "> To facilitate downstream analytics it is also possible to create a quick point index leveraging a chosen H3 resolution.\n", + "In this case, resolution `9` has an edge length of ~174 metres." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "261912e6-6d06-49e5-a86c-fcc9135ffcec", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
MMSIBaseDateTimeLATLONSOGCOGHeadingVesselNameIMOCallSignVesselTypeStatusLengthWidthDraftCargoTranscieverClasspoint_geomixsog_kmph
3530960002018-01-31T01:02:43.000+000036.80157-76.289430.0191.075.0SUNNY HOPEIMO94821343EVB97051963212.770AList(1, 0, List(List(List(-76.28943, 36.80157))), List(List()))6177489347739647990.0
3530960002018-01-31T00:41:44.000+000036.80157-76.289480.0191.075.0SUNNY HOPEIMO94821343EVB97051963212.770AList(1, 0, List(List(List(-76.28948, 36.80157))), List(List()))6177489347739647990.0
3530960002018-01-31T00:47:45.000+000036.80155-76.289420.0191.075.0SUNNY HOPEIMO94821343EVB97051963212.770AList(1, 0, List(List(List(-76.28942, 36.80155))), List(List()))6177489347739647990.0
3530960002018-01-31T00:44:45.000+000036.80155-76.289430.0191.075.0SUNNY HOPEIMO94821343EVB97051963212.770AList(1, 0, List(List(List(-76.28943, 36.80155))), List(List()))6177489347739647990.0
3530960002018-01-31T00:38:43.000+000036.80157-76.289470.0191.074.0SUNNY HOPEIMO94821343EVB97051963212.770AList(1, 0, List(List(List(-76.28947, 36.80157))), List(List()))6177489347739647990.0
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 353096000, + "2018-01-31T01:02:43.000+0000", + 36.80157, + -76.28943, + 0.0, + 191.0, + 75.0, + "SUNNY HOPE", + "IMO9482134", + "3EVB9", + 70, + 5, + 196, + 32, + 12.7, + 70, + "A", + [ + 1, + 0, + [ + [ + [ + -76.28943, + 36.80157 + ] + ] + ], + [ + [] + ] + ], + 617748934773964799, + 0.0 + ], + [ + 353096000, + "2018-01-31T00:41:44.000+0000", + 36.80157, + -76.28948, + 0.0, + 191.0, + 75.0, + "SUNNY HOPE", + "IMO9482134", + "3EVB9", + 70, + 5, + 196, + 32, + 12.7, + 70, + "A", + [ + 1, + 0, + [ + [ + [ + -76.28948, + 36.80157 + ] + ] + ], + [ + [] + ] + ], + 617748934773964799, + 0.0 + ], + [ + 353096000, + "2018-01-31T00:47:45.000+0000", + 36.80155, + -76.28942, + 0.0, + 191.0, + 75.0, + "SUNNY HOPE", + "IMO9482134", + "3EVB9", + 70, + 5, + 196, + 32, + 12.7, + 70, + "A", + [ + 1, + 0, + [ + [ + [ + -76.28942, + 36.80155 + ] + ] + ], + [ + [] + ] + ], + 617748934773964799, + 0.0 + ], + [ + 353096000, + "2018-01-31T00:44:45.000+0000", + 36.80155, + -76.28943, + 0.0, + 191.0, + 75.0, + "SUNNY HOPE", + "IMO9482134", + "3EVB9", + 70, + 5, + 196, + 32, + 12.7, + 70, + "A", + [ + 1, + 0, + [ + [ + [ + -76.28943, + 36.80155 + ] + ] + ], + [ + [] + ] + ], + 617748934773964799, + 0.0 + ], + [ + 353096000, + "2018-01-31T00:38:43.000+0000", + 36.80157, + -76.28947, + 0.0, + 191.0, + 74.0, + "SUNNY HOPE", + "IMO9482134", + "3EVB9", + 70, + 5, + 196, + 32, + 12.7, + 70, + "A", + [ + 1, + 0, + [ + [ + [ + -76.28947, + 36.80157 + ] + ] + ], + [ + [] + ] + ], + 617748934773964799, + 0.0 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + 
"pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "MMSI", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "BaseDateTime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "LAT", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "LON", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "SOG", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "COG", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "Heading", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "VesselName", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "IMO", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "CallSign", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "VesselType", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Status", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Length", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Width", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Draft", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "Cargo", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "TranscieverClass", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "point_geom", + "type": 
"{\"type\":\"struct\",\"fields\":[{\"name\":\"type_id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"srid\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"boundary\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":\"double\",\"containsNull\":true},\"containsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"holes\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":\"double\",\"containsNull\":true},\"containsNull\":true},\"containsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}}]}" + }, + { + "metadata": "{}", + "name": "ix", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "sog_kmph", + "type": "\"double\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "cargos_indexed = (\n", + " cargos.withColumn(\"point_geom\", mos.st_point(\"LON\", \"LAT\"))\n", + " .withColumn(\"ix\", mos.grid_pointascellid(\"point_geom\", resolution=F.lit(9)))\n", + " .withColumn(\"sog_kmph\", F.round(col(\"sog\") * 1.852, 2))\n", + ")\n", + "display(cargos_indexed.limit(5)) # <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d2bc9c28-4db9-405b-b8a6-a6916204ccc0", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_We will write the treated output to a new table._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a05f901e-e3d6-4855-854e-fa6ea6492f9a", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "(\n", + " 
cargos_indexed.withColumn(\"point_geom\", mos.st_aswkb(\"point_geom\"))\n", + " .write.mode(\"overwrite\")\n", + " .saveAsTable(\"cargos_indexed\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "006d4a9d-473b-4477-89b9-c4906209f8f3", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_We will optimise our table to colocate data and make querying faster._\n", + "\n", + "> This is showing [ZORDER](https://docs.databricks.com/en/delta/data-skipping.html); for newer runtimes (DBR 13.3 LTS+), can also consider [Liquid Clustering](https://docs.databricks.com/en/delta/clustering.html)." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "136231e0-d292-482a-947d-6765f18e9459", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
pathmetrics
s3://databricks-e2demofieldengwest/b169b504-4c54-49f2-bc3a-adf4b128f36d/tables/935ef93c-fdc0-4db3-b1ba-bf9f32bc26faList(4, 4, List(3318525, 5963269, 4417031.0, 4, 17668124), List(1602001, 7746854, 4757181.75, 4, 19028727), 0, List(minCubeSize(107374182400), List(0, 0), List(4, 19028727), 0, List(4, 19028727), 1, null), 1, 4, 0, false, 0, 0, 1701117514920, 1701117559879, 4, 1, null, List(0, 0), 20, 20, 8027)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "s3://databricks-e2demofieldengwest/b169b504-4c54-49f2-bc3a-adf4b128f36d/tables/935ef93c-fdc0-4db3-b1ba-bf9f32bc26fa", + [ + 4, + 4, + [ + 3318525, + 5963269, + 4417031.0, + 4, + 17668124 + ], + [ + 1602001, + 7746854, + 4757181.75, + 4, + 19028727 + ], + 0, + [ + "minCubeSize(107374182400)", + [ + 0, + 0 + ], + [ + 4, + 19028727 + ], + 0, + [ + 4, + 19028727 + ], + 1, + null + ], + 1, + 4, + 0, + false, + 0, + 0, + 1701117514920, + 1701117559879, + 4, + 1, + null, + [ + 0, + 0 + ], + 20, + 20, + 8027 + ] + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "path", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "metrics", + "type": 
"{\"type\":\"struct\",\"fields\":[{\"name\":\"numFilesAdded\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"numFilesRemoved\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"filesAdded\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"min\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"max\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"avg\",\"type\":\"double\",\"nullable\":false,\"metadata\":{}},{\"name\":\"totalFiles\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"totalSize\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"filesRemoved\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"min\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"max\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"avg\",\"type\":\"double\",\"nullable\":false,\"metadata\":{}},{\"name\":\"totalFiles\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"totalSize\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"partitionsOptimized\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"zOrderStats\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"strategyName\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"inputCubeFiles\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"num\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"size\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"inputOtherFiles\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"num\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"size\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"inputNumCubes\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"merge
dFiles\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"num\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"size\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"numOutputCubes\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"mergedNumCubes\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"numBatches\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"totalConsideredFiles\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"totalFilesSkipped\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"preserveInsertionOrder\",\"type\":\"boolean\",\"nullable\":false,\"metadata\":{}},{\"name\":\"numFilesSkippedToReduceWriteAmplification\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"numBytesSkippedToReduceWriteAmplification\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"startTimeMs\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"endTimeMs\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"totalClusterParallelism\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"totalScheduledTasks\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"autoCompactParallelismStats\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"maxClusterActiveParallelism\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"minClusterActiveParallelism\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"maxSessionActiveParallelism\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"minSessionActiveParallelism\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"deletionVectorStats\",\"type\":{\"type\":\"struct\",\"fields\":[{\"name\":\"numDeletionVectorsRemoved\",\"type\":\"long\",\"nullable\":false,\"met
adata\":{}},{\"name\":\"numDeletionVectorRowsRemoved\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}}]},\"nullable\":true,\"metadata\":{}},{\"name\":\"numTableColumns\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"numTableColumnsWithStats\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}},{\"name\":\"totalTaskExecutionTimeMs\",\"type\":\"long\",\"nullable\":false,\"metadata\":{}}]}" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql OPTIMIZE ship2ship.cargos_indexed ZORDER by (ix, BaseDateTime)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "b6702ef5-b1cc-4611-8b2c-b8cc3d302f57", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Visualisation\n", + "And we can perform a quick visual inspection of the indexed AIS data." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "30322c23-052a-4ac2-b140-fa1e117e31ea", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "b59dd42d-7a9e-4005-b0c2-57350fff2ee4", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual 
results._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4e642c82-4421-45bb-8ca9-9ac27a66d7f5", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# ship2ship.cargos_indexed \"ix\" \"h3\" 10_000" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 85549842153077, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "02. Data Ingestion", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/examples/python/Ship2ShipTransfers/02. Data Ingestion.py b/notebooks/examples/python/Ship2ShipTransfers/02. Data Ingestion.py deleted file mode 100644 index 5f4f3aff7..000000000 --- a/notebooks/examples/python/Ship2ShipTransfers/02. Data Ingestion.py +++ /dev/null @@ -1,83 +0,0 @@ -# Databricks notebook source -# MAGIC %md ## Data Ingestion - -# COMMAND ---------- - -# MAGIC %pip install databricks_mosaic - -# COMMAND ---------- - - -from pyspark.sql.functions import * -import mosaic as mos - -spark.conf.set("spark.databricks.labs.mosaic.geometry.api", "JTS") -spark.conf.set("spark.databricks.labs.mosaic.index.system", "H3") -mos.enable_mosaic(spark, dbutils) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC -# MAGIC We begin with loading from a table. Here we use captured `AIS` data. 
-# MAGIC -# MAGIC - MMSI: unique 9-digit identification code of the ship - numeric -# MAGIC - VesselName: name of the ship - string -# MAGIC - CallSign: unique callsign of the ship - string -# MAGIC - BaseDateTime: timestamp of the AIS message - datetime -# MAGIC - LAT: latitude of the ship (in degree: [-90 ; 90], negative value represents South, 91 indicates ‘not available’) - numeric -# MAGIC - LON: longitude of the ship (in degree: [-180 ; 180], negative value represents West, 181 indicates ‘not available’) - numeric -# MAGIC - SOG: speed over ground, in knots - numeric -# MAGIC - Status: status of the ship - string - -# COMMAND ---------- - -cargos = spark.read.table("ship2ship.AIS") -display(cargos) - -# COMMAND ---------- - -# MAGIC %md ## Data Transformation - -# COMMAND ---------- - -# MAGIC %md ### Indexing -# MAGIC To facilitate downstream analytics it is also possible to create a quick point index leveraging a chosen H3 resolution. -# MAGIC In this case, resolution `9` has an edge length of ~174 metres. - -# COMMAND ---------- - -cargos_indexed = ( - cargos.withColumn("point_geom", mos.st_point("LON", "LAT")) - .withColumn("ix", mos.grid_pointascellid("point_geom", resolution=lit(9))) - .withColumn("sog_kmph", round(col("sog") * 1.852, 2)) -) -display(cargos_indexed) - -# COMMAND ---------- - -# MAGIC %md ## Exporting -# MAGIC and we can write the treated output to a new table. - -# COMMAND ---------- - -( - cargos_indexed.withColumn("point_geom", mos.st_aswkb("point_geom")) - .write.mode("overwrite") - .saveAsTable("ship2ship.cargos_indexed") -) - -# COMMAND ---------- - -# DBTITLE 1,We can optimise our table to colocate data and make querying faster -# MAGIC %sql OPTIMIZE ship2ship.cargos_indexed ZORDER by (ix, BaseDateTime) - -# COMMAND ---------- - -# MAGIC %md ## Visualisation -# MAGIC And we can perform a quick visual inspection of the data. 
- -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC ship2ship.cargos_indexed "ix" "h3" 10_000 diff --git a/notebooks/examples/python/Ship2ShipTransfers/03.a Overlap Detection.ipynb b/notebooks/examples/python/Ship2ShipTransfers/03.a Overlap Detection.ipynb new file mode 100644 index 000000000..6c5b8625b --- /dev/null +++ b/notebooks/examples/python/Ship2ShipTransfers/03.a Overlap Detection.ipynb @@ -0,0 +1,1365 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3e261c46-d2ae-4747-87cf-31876281d0ad", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Overlap Detection\n", + "\n", + "> We now try to detect potentially overlapping pings using a buffer on a particular day.\n", + "\n", + "---\n", + "__Last Updated:__ 27 NOV 2023 [Mosaic 0.3.12]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "5c50bfa2-9a7f-4ab5-9419-7c7bc7134d66", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c382f650-dc2d-434c-8be6-72076126febc", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Python interpreter will be restarted.\nPython interpreter will be restarted.\n" + ] + } + ], + "source": [ + "%pip install \"databricks-mosaic<0.4,>=0.3\" --quiet # <- Mosaic 0.3 series\n", + "# %pip install \"databricks-mosaic<0.5,>=0.4\" --quiet # <- Mosaic 0.4 series (as available)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 
2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "bdba63fe-887d-4c3a-aee7-90968ac14bc0", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# -- configure AQE for more compute heavy operations\n", + "# - choose option-1 or option-2 below, essential for REPARTITION!\n", + "# spark.conf.set(\"spark.databricks.optimizer.adaptive.enabled\", False) # <- option-1: turn off completely for full control\n", + "spark.conf.set(\"spark.sql.adaptive.coalescePartitions.enabled\", False) # <- option-2: just tweak partition management\n", + "spark.conf.set(\"spark.sql.shuffle.partitions\", 1_024) # <-- default is 200\n", + "\n", + "# -- import databricks + spark functions\n", + "from pyspark.databricks.sql import functions as dbf\n", + "from pyspark.sql import functions as F\n", + "from pyspark.sql.functions import col, udf\n", + "from pyspark.sql.types import *\n", + "\n", + "# -- setup mosaic\n", + "import mosaic as mos\n", + "\n", + "mos.enable_mosaic(spark, dbutils)\n", + "# mos.enable_gdal(spark) # <- not needed for this example\n", + "\n", + "# --other imports\n", + "import os\n", + "import warnings\n", + "\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "434a34b8-1a0b-4e96-985d-2c2855b470b5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Configure Database__\n", + "\n", + "> Adjust this to settings from the Data Prep notebook." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c743a069-0cb1-485a-bee3-30851e314b33", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[2]: DataFrame[]" + ] + } + ], + "source": [ + "catalog_name = \"mjohns\"\n", + "db_name = \"ship2ship\"\n", + "\n", + "sql(f\"use catalog {catalog_name}\")\n", + "sql(f\"use schema {db_name}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "37f91db0-c7b5-4d55-954b-9880918dedde", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 521,430\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
MMSIBaseDateTimeLATLONSOGCOGHeadingVesselNameIMOCallSignVesselTypeStatusLengthWidthDraftCargoTranscieverClasspoint_geomixsog_kmph
3693270002018-01-31T18:14:57.000+000039.24078-76.534780.087.0324.0FREEDOMIMO9129706WDB54837051903210.270AAAAAAAHAUyI51eSjg0BDntHhCMP06177436188506849270.0
3693270002018-01-31T17:59:56.000+000039.24078-76.534770.052.0324.0FREEDOMIMO9129706WDB54837051903210.270AAAAAAAHAUyI5q/M4cUBDntHhCMP06177436188506849270.0
3693270002018-01-31T17:56:56.000+000039.24078-76.534780.075.0324.0FREEDOMIMO9129706WDB54837051903210.270AAAAAAAHAUyI51eSjg0BDntHhCMP06177436188506849270.0
3693270002018-01-31T17:53:56.000+000039.24078-76.534770.05.0324.0FREEDOMIMO9129706WDB54837051903210.270AAAAAAAHAUyI5q/M4cUBDntHhCMP06177436188506849270.0
3693270002018-01-31T17:50:56.000+000039.24078-76.534780.080.0324.0FREEDOMIMO9129706WDB54837051903210.270AAAAAAAHAUyI51eSjg0BDntHhCMP06177436188506849270.0
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 369327000, + "2018-01-31T18:14:57.000+0000", + 39.24078, + -76.53478, + 0.0, + 87.0, + 324.0, + "FREEDOM", + "IMO9129706", + "WDB5483", + 70, + 5, + 190, + 32, + 10.2, + 70, + "A", + "AAAAAAHAUyI51eSjg0BDntHhCMP0", + 617743618850684927, + 0.0 + ], + [ + 369327000, + "2018-01-31T17:59:56.000+0000", + 39.24078, + -76.53477, + 0.0, + 52.0, + 324.0, + "FREEDOM", + "IMO9129706", + "WDB5483", + 70, + 5, + 190, + 32, + 10.2, + 70, + "A", + "AAAAAAHAUyI5q/M4cUBDntHhCMP0", + 617743618850684927, + 0.0 + ], + [ + 369327000, + "2018-01-31T17:56:56.000+0000", + 39.24078, + -76.53478, + 0.0, + 75.0, + 324.0, + "FREEDOM", + "IMO9129706", + "WDB5483", + 70, + 5, + 190, + 32, + 10.2, + 70, + "A", + "AAAAAAHAUyI51eSjg0BDntHhCMP0", + 617743618850684927, + 0.0 + ], + [ + 369327000, + "2018-01-31T17:53:56.000+0000", + 39.24078, + -76.53477, + 0.0, + 5.0, + 324.0, + "FREEDOM", + "IMO9129706", + "WDB5483", + 70, + 5, + 190, + 32, + 10.2, + 70, + "A", + "AAAAAAHAUyI5q/M4cUBDntHhCMP0", + 617743618850684927, + 0.0 + ], + [ + 369327000, + "2018-01-31T17:50:56.000+0000", + 39.24078, + -76.53478, + 0.0, + 80.0, + 324.0, + "FREEDOM", + "IMO9129706", + "WDB5483", + 70, + 5, + 190, + 32, + 10.2, + 70, + "A", + "AAAAAAHAUyI51eSjg0BDntHhCMP0", + 617743618850684927, + 0.0 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "MMSI", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "BaseDateTime", + 
"type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "LAT", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "LON", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "SOG", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "COG", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "Heading", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "VesselName", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "IMO", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "CallSign", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "VesselType", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Status", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Length", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Width", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Draft", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "Cargo", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "TranscieverClass", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "point_geom", + "type": "\"binary\"" + }, + { + "metadata": "{}", + "name": "ix", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "sog_kmph", + "type": "\"double\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "cargos_indexed = spark.read.table(\"cargos_indexed\").filter(\n", + " col(\"BaseDateTime\").between(\n", + " \"2018-01-31T00:00:00.000+0000\", \"2018-01-31T23:59:00.000+0000\"\n", + " )\n", + ")\n", + "print(f\"count? 
{cargos_indexed.count():,}\")\n", + "cargos_indexed.limit(5).display() # <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "de54e366-47e8-4ce9-984a-df4dfba430a8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Buffering\n", + "\n", + "

\n", + "\n", + "1. Convert the point into a polygon by buffering it with a certain area to turn this into a circle.\n", + "2. Index the polygon to leverage more performant querying.\n", + "\n", + "> Since our projection is not in metres, we convert from decimal degrees, with `(0.00001 - 0.000001)` as being equal to one metre at the equator. Here we choose a buffer of roughly 100 metres, ref http://wiki.gis.com/wiki/index.php/Decimal_degrees." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3eed8a55-d1fa-4b87-ab3d-0196918813df", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "one_metre = 0.00001 - 0.000001\n", + "buffer = 100 * one_metre\n", + "\n", + "(\n", + " cargos_indexed\n", + " .repartition(sc.defaultParallelism * 20) # <- repartition is important!\n", + " .withColumn(\"buffer_geom\", mos.st_buffer(\"point_geom\", F.lit(buffer)))\n", + " .withColumn(\"ix\", mos.grid_tessellateexplode(\"buffer_geom\", F.lit(9)))\n", + " .write.mode(\"overwrite\")\n", + " .saveAsTable(\"cargos_buffered\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f2fc92da-66c3-4260-b119-e6212d11b567", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_We will optimise our table to colocate data and make querying faster._\n", + "\n", + "> This is showing [ZORDER](https://docs.databricks.com/en/delta/data-skipping.html); for newer runtimes (DBR 13.3 LTS+), you can also consider [Liquid Clustering](https://docs.databricks.com/en/delta/clustering.html)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a61ad409-af82-4909-a6c1-fc94e4209a9b", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "

MMSIBaseDateTimeLATLONSOGCOGHeadingVesselNameIMOCallSignVesselTypeStatusLengthWidthDraftCargoTranscieverClasspoint_geomixsog_kmphbuffer_geom
6360924002018-01-31T10:17:31.000+000033.72458-118.275910.0265.7341.0POLARLIGHTIMO9189873D5BR7705153249.1nullAAAAAAAHAXZGogmqo60BA3L8Jlar4List(false, 617725793643266047, AAAAAAMAAAABAAAACMBdkZnDiwSrQEDcvwmVqvjAXZGaDBOAYkBA3LlIs7gVwF2RmuLjX/hAQNyzwGvlVMBdkZv8QQc2QEDcr6Of48PAXZGfITb9gEBA3LhCQSN4wF2Rmlhq32pAQNzGwctWl8BdkZoME4BiQEDcxMp3ndvAXZE= (truncated))0.0AAAAAAMAAAABAAAAIcBdkZnDiwSrQEDcvwmVqvjAXZGaDBOAYkBA3LlIs7gVwF2RmuLjX/hAQNyzwGvlVMBdkZw/uVTfQEDcrqcreejAXZGeFS2MH0BA3KovG3FfwF2RoFE1kGNAQNymhDMC4cBdkaLd1cYZQEDco8qHGRLAXZE= (truncated)
6360924002018-01-31T10:17:31.000+000033.72458-118.275910.0265.7341.0POLARLIGHTIMO9189873D5BR7705153249.1nullAAAAAAAHAXZGogmqo60BA3L8Jlar4List(false, 617725793603158015, AAAAAAMAAAABAAAAEMBdkZv8QQc2QEDcr6Of48PAXZGcP7lU30BA3K6nK3nowF2RnhUtjB9AQNyqLxtxX8BdkaBRNZBjQEDcpoQzAuHAXZGi3dXGGUBA3KPKhxkSwF2RpaH5r3pAQNyiHOdZ5sBdkaiCaqjrQEDcoYvWYnjAXZE= (truncated))0.0AAAAAAMAAAABAAAAIcBdkZnDiwSrQEDcvwmVqvjAXZGaDBOAYkBA3LlIs7gVwF2RmuLjX/hAQNyzwGvlVMBdkZw/uVTfQEDcrqcreejAXZGeFS2MH0BA3KovG3FfwF2RoFE1kGNAQNymhDMC4cBdkaLd1cYZQEDco8qHGRLAXZE= (truncated)
6360924002018-01-31T10:17:31.000+000033.72458-118.275910.0265.7341.0POLARLIGHTIMO9189873D5BR7705153249.1nullAAAAAAAHAXZGogmqo60BA3L8Jlar4List(false, 617725793643528191, AAAAAAMAAAABAAAAFMBdkbaTxKM9QEDctq7esKjAXZG2+MHRdEBA3LlIs7gVwF2Rt0FKTStAQNy/CZWq+MBdkbb4wdF0QEDcxMp3ndvAXZG2IfHx3kBA3MpSv3CcwF2RtMUb/PdAQNzPa//cCMBdkbLvp8W3QEDc0+QP5JHAXZE= (truncated))0.0AAAAAAMAAAABAAAAIcBdkZnDiwSrQEDcvwmVqvjAXZGaDBOAYkBA3LlIs7gVwF2RmuLjX/hAQNyzwGvlVMBdkZw/uVTfQEDcrqcreejAXZGeFS2MH0BA3KovG3FfwF2RoFE1kGNAQNymhDMC4cBdkaLd1cYZQEDco8qHGRLAXZE= (truncated)
6360175192018-01-31T08:41:24.000+000039.93302-75.132630.072.46.0PORT ORIENTIMO9735103D5LI8705199nullnullnullAAAAAAAHAUsh9Aood/EBD920zCUHIList(false, 617733347188670463, AAAAAAMAAAABAAAACsBSyG+bOfK0QEP3YReHt3bAUshwv9jJ8EBD91zQnxC4wFLIcpVNATBAQ/dYWI8IL8BSyHTRVQV0QEP3VK2mmbHAUsh3XfU7KkBD91Hz+q/iwFLIeiIZJItAQ/dQRlrwtsBSyH0Cih38QEP3T7VJ+UjAUsg= (truncated))0.0AAAAAAMAAAABAAAAIcBSyG5Dqnm8QEP3bTMJQcjAUshujDL1c0BD92dyJ07lwFLIb2MC1QlAQ/dh6d98JMBSyHC/2MnwQEP3XNCfELjAUshylU0BMEBD91hYjwgvwFLIdNFVBXRAQ/dUraaZscBSyHdd9TsqQEP3UfP6r+LAUsg= (truncated)
6360175192018-01-31T08:41:24.000+000039.93302-75.132630.072.46.0PORT ORIENTIMO9735103D5LI8705199nullnullnullAAAAAAAHAUsh9Aood/EBD920zCUHIList(false, 617733347188408319, AAAAAAMAAAABAAAAGsBSyG5Dqnm8QEP3bTMJQcjAUshujDL1c0BD92dyJ07lwFLIb2MC1QlAQ/dh6d98JMBSyG+bOfK0QEP3YReHt3bAUsh+baoyv0BD91NVSkmVwFLIiGG9eiVAQ/dapji51cBSyIlFO3IIQEP3XNCfELjAUsg= (truncated))0.0AAAAAAMAAAABAAAAIcBSyG5Dqnm8QEP3bTMJQcjAUshujDL1c0BD92dyJ07lwFLIb2MC1QlAQ/dh6d98JMBSyHC/2MnwQEP3XNCfELjAUshylU0BMEBD91hYjwgvwFLIdNFVBXRAQ/dUraaZscBSyHdd9TsqQEP3UfP6r+LAUsg= (truncated)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 636092400, + "2018-01-31T10:17:31.000+0000", + 33.72458, + -118.27591, + 0.0, + 265.7, + 341.0, + "POLARLIGHT", + "IMO9189873", + "D5BR7", + 70, + 5, + 153, + 24, + 9.1, + null, + "A", + "AAAAAAHAXZGogmqo60BA3L8Jlar4", + [ + false, + 617725793643266047, + "AAAAAAMAAAABAAAACMBdkZnDiwSrQEDcvwmVqvjAXZGaDBOAYkBA3LlIs7gVwF2RmuLjX/hAQNyzwGvlVMBdkZv8QQc2QEDcr6Of48PAXZGfITb9gEBA3LhCQSN4wF2Rmlhq32pAQNzGwctWl8BdkZoME4BiQEDcxMp3ndvAXZE= (truncated)" + ], + 0.0, + "AAAAAAMAAAABAAAAIcBdkZnDiwSrQEDcvwmVqvjAXZGaDBOAYkBA3LlIs7gVwF2RmuLjX/hAQNyzwGvlVMBdkZw/uVTfQEDcrqcreejAXZGeFS2MH0BA3KovG3FfwF2RoFE1kGNAQNymhDMC4cBdkaLd1cYZQEDco8qHGRLAXZE= (truncated)" + ], + [ + 636092400, + "2018-01-31T10:17:31.000+0000", + 33.72458, + -118.27591, + 0.0, + 265.7, + 341.0, + "POLARLIGHT", + "IMO9189873", + "D5BR7", + 70, + 5, + 153, + 24, + 9.1, + null, + "A", + "AAAAAAHAXZGogmqo60BA3L8Jlar4", + [ + false, + 617725793603158015, + "AAAAAAMAAAABAAAAEMBdkZv8QQc2QEDcr6Of48PAXZGcP7lU30BA3K6nK3nowF2RnhUtjB9AQNyqLxtxX8BdkaBRNZBjQEDcpoQzAuHAXZGi3dXGGUBA3KPKhxkSwF2RpaH5r3pAQNyiHOdZ5sBdkaiCaqjrQEDcoYvWYnjAXZE= (truncated)" + ], + 0.0, + "AAAAAAMAAAABAAAAIcBdkZnDiwSrQEDcvwmVqvjAXZGaDBOAYkBA3LlIs7gVwF2RmuLjX/hAQNyzwGvlVMBdkZw/uVTfQEDcrqcreejAXZGeFS2MH0BA3KovG3FfwF2RoFE1kGNAQNymhDMC4cBdkaLd1cYZQEDco8qHGRLAXZE= (truncated)" + ], + [ + 636092400, + "2018-01-31T10:17:31.000+0000", + 33.72458, + -118.27591, + 0.0, + 265.7, + 341.0, + "POLARLIGHT", + "IMO9189873", + "D5BR7", + 70, + 5, + 153, + 24, + 9.1, + null, + "A", + "AAAAAAHAXZGogmqo60BA3L8Jlar4", + [ + false, + 617725793643528191, + 
"AAAAAAMAAAABAAAAFMBdkbaTxKM9QEDctq7esKjAXZG2+MHRdEBA3LlIs7gVwF2Rt0FKTStAQNy/CZWq+MBdkbb4wdF0QEDcxMp3ndvAXZG2IfHx3kBA3MpSv3CcwF2RtMUb/PdAQNzPa//cCMBdkbLvp8W3QEDc0+QP5JHAXZE= (truncated)" + ], + 0.0, + "AAAAAAMAAAABAAAAIcBdkZnDiwSrQEDcvwmVqvjAXZGaDBOAYkBA3LlIs7gVwF2RmuLjX/hAQNyzwGvlVMBdkZw/uVTfQEDcrqcreejAXZGeFS2MH0BA3KovG3FfwF2RoFE1kGNAQNymhDMC4cBdkaLd1cYZQEDco8qHGRLAXZE= (truncated)" + ], + [ + 636017519, + "2018-01-31T08:41:24.000+0000", + 39.93302, + -75.13263, + 0.0, + 72.4, + 6.0, + "PORT ORIENT", + "IMO9735103", + "D5LI8", + 70, + 5, + 199, + null, + null, + null, + "A", + "AAAAAAHAUsh9Aood/EBD920zCUHI", + [ + false, + 617733347188670463, + "AAAAAAMAAAABAAAACsBSyG+bOfK0QEP3YReHt3bAUshwv9jJ8EBD91zQnxC4wFLIcpVNATBAQ/dYWI8IL8BSyHTRVQV0QEP3VK2mmbHAUsh3XfU7KkBD91Hz+q/iwFLIeiIZJItAQ/dQRlrwtsBSyH0Cih38QEP3T7VJ+UjAUsg= (truncated)" + ], + 0.0, + "AAAAAAMAAAABAAAAIcBSyG5Dqnm8QEP3bTMJQcjAUshujDL1c0BD92dyJ07lwFLIb2MC1QlAQ/dh6d98JMBSyHC/2MnwQEP3XNCfELjAUshylU0BMEBD91hYjwgvwFLIdNFVBXRAQ/dUraaZscBSyHdd9TsqQEP3UfP6r+LAUsg= (truncated)" + ], + [ + 636017519, + "2018-01-31T08:41:24.000+0000", + 39.93302, + -75.13263, + 0.0, + 72.4, + 6.0, + "PORT ORIENT", + "IMO9735103", + "D5LI8", + 70, + 5, + 199, + null, + null, + null, + "A", + "AAAAAAHAUsh9Aood/EBD920zCUHI", + [ + false, + 617733347188408319, + "AAAAAAMAAAABAAAAGsBSyG5Dqnm8QEP3bTMJQcjAUshujDL1c0BD92dyJ07lwFLIb2MC1QlAQ/dh6d98JMBSyG+bOfK0QEP3YReHt3bAUsh+baoyv0BD91NVSkmVwFLIiGG9eiVAQ/dapji51cBSyIlFO3IIQEP3XNCfELjAUsg= (truncated)" + ], + 0.0, + "AAAAAAMAAAABAAAAIcBSyG5Dqnm8QEP3bTMJQcjAUshujDL1c0BD92dyJ07lwFLIb2MC1QlAQ/dh6d98JMBSyHC/2MnwQEP3XNCfELjAUshylU0BMEBD91hYjwgvwFLIdNFVBXRAQ/dUraaZscBSyHdd9TsqQEP3UfP6r+LAUsg= (truncated)" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + 
"removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "MMSI", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "BaseDateTime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "LAT", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "LON", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "SOG", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "COG", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "Heading", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "VesselName", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "IMO", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "CallSign", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "VesselType", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Status", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Length", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Width", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Draft", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "Cargo", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "TranscieverClass", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "point_geom", + "type": "\"binary\"" + }, + { + "metadata": "{}", + "name": "ix", + "type": "{\"type\":\"struct\",\"fields\":[{\"name\":\"is_core\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"index_id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"wkb\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}}]}" + }, + { + "metadata": "{}", + "name": "sog_kmph", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "buffer_geom", + "type": "\"binary\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql\n", + "OPTIMIZE cargos_buffered ZORDER BY (ix.index_id, BaseDateTime);\n", + 
"SELECT * FROM cargos_buffered LIMIT 5;" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f34c9210-ebe2-4852-abd1-a911175cacc6", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Implement Algorithm" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "04bcdefb-3327-4cde-af5e-f8e1d886ac58", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "buffered_events = (\n", + " spark.read.table(\"cargos_buffered\")\n", + " .repartition(sc.defaultParallelism * 20) # <- repartition is important!\n", + ")\n", + "\n", + "def ts_diff(ts1, ts2):\n", + " \"\"\"Output the difference between two timestamps in seconds.\n", + "\n", + " Args:\n", + " ts1 (Timestamp): First Timestamp\n", + " ts2 (Timestamp): Second Timestamp\n", + "\n", + " Returns:\n", + " long: The difference between two timestamps in seconds.\n", + " \"\"\"\n", + " return F.abs(col(ts1).cast(\"long\") - col(ts2).cast(\"long\"))\n", + "\n", + "\n", + "def time_window(sog1, sog2, heading1, heading2, radius):\n", + " \"\"\"Create dynamic time window based on speed, buffer radius and heading.\n", + "\n", + " Args:\n", + " sog1 (double): vessel 1's speed over ground, in knots\n", + " sog2 (double): vessel 2's speed over ground, in knots\n", + " heading1 (double): vessel 1's heading angle in degrees\n", + " heading2 (double): vessel 2's heading angle in degrees\n", + " radius (double): buffer radius in degrees\n", + "\n", + " Returns:\n", + " double: dynamic time window in seconds based on the speed and radius\n", + " \"\"\"\n", + " v_x1 = col(sog1) * F.cos(col(heading1))\n", + " v_y1 = col(sog1) * F.sin(col(heading1))\n", + " v_x2 = col(sog2) * F.cos(col(heading2))\n", + " v_y2 = col(sog2) * 
F.sin(col(heading2))\n", + "\n", + " # compute relative vectors speed based x and y partial speeds\n", + " v_relative = F.sqrt((v_x1 + v_x2) * (v_x1 + v_x2) + (v_y1 + v_y2) * (v_y1 + v_y2))\n", + " # convert to m/s and determine ratio between speed and radius\n", + " return v_relative * F.lit(1000) / F.lit(radius) / F.lit(3600)\n", + "\n", + "\n", + "candidates = (\n", + " buffered_events.alias(\"a\")\n", + " .join(\n", + " buffered_events.alias(\"b\"),\n", + " [\n", + " col(\"a.ix.index_id\")\n", + " == col(\"b.ix.index_id\"), # to only compare across efficient indices\n", + " col(\"a.mmsi\")\n", + " < col(\"b.mmsi\"), # to prevent comparing candidates bidirectionally\n", + " ts_diff(\"a.BaseDateTime\", \"b.BaseDateTime\")\n", + " < time_window(\"a.sog_kmph\", \"b.sog_kmph\", \"a.heading\", \"b.heading\", buffer),\n", + " ],\n", + " )\n", + " .where(\n", + " (\n", + " col(\"a.ix.is_core\") | col(\"b.ix.is_core\")\n", + " ) # if either candidate fully covers an index, no further comparison is needed\n", + " | mos.st_intersects(\n", + " \"a.ix.wkb\", \"b.ix.wkb\"\n", + " ) # limit geospatial querying to cases where indices alone cannot give certainty\n", + " )\n", + " .select(\n", + " col(\"a.vesselName\").alias(\"vessel_1\"),\n", + " col(\"b.vesselName\").alias(\"vessel_2\"),\n", + " col(\"a.BaseDateTime\").alias(\"timestamp_1\"),\n", + " col(\"b.BaseDateTime\").alias(\"timestamp_2\"),\n", + " col(\"a.ix.index_id\").alias(\"ix\"),\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "10113308-de6d-4ac3-96a1-fa9af2d41497", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "(candidates.write.mode(\"overwrite\").saveAsTable(\"overlap_candidates\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + 
"application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b244dd20-28fd-4014-9e93-67ddbb940666", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
vessel_1vessel_2timestamp_1timestamp_2ix
AZURGLOBAL ETERNITY2018-01-31T10:18:40.000+00002018-01-31T07:48:02.000+0000617700141109608447
DANAE CEVER SHINE2018-01-31T10:26:13.000+00002018-01-31T08:44:18.000+0000617710257664688127
UMM SALALDREAM CANARY2018-01-31T03:41:48.000+00002018-01-31T05:24:39.000+0000617710268076785663
ZIM RIO GRANDEAFRICAN RAVEN2018-01-31T05:58:48.000+00002018-01-31T07:16:41.000+0000617711246234353663
ZIM RIO GRANDEAUTO BANNER2018-01-31T05:58:48.000+00002018-01-31T07:27:14.000+0000617711246234353663
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "AZUR", + "GLOBAL ETERNITY", + "2018-01-31T10:18:40.000+0000", + "2018-01-31T07:48:02.000+0000", + 617700141109608447 + ], + [ + "DANAE C", + "EVER SHINE", + "2018-01-31T10:26:13.000+0000", + "2018-01-31T08:44:18.000+0000", + 617710257664688127 + ], + [ + "UMM SALAL", + "DREAM CANARY", + "2018-01-31T03:41:48.000+0000", + "2018-01-31T05:24:39.000+0000", + 617710268076785663 + ], + [ + "ZIM RIO GRANDE", + "AFRICAN RAVEN", + "2018-01-31T05:58:48.000+0000", + "2018-01-31T07:16:41.000+0000", + 617711246234353663 + ], + [ + "ZIM RIO GRANDE", + "AUTO BANNER", + "2018-01-31T05:58:48.000+0000", + "2018-01-31T07:27:14.000+0000", + 617711246234353663 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "vessel_1", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "vessel_2", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "timestamp_1", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "timestamp_2", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "ix", + "type": "\"long\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql\n", + "SELECT * FROM overlap_candidates limit 5" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + 
"inputWidgets": {}, + "nuid": "f869f88b-590c-4ed6-b755-7184deea9692", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
ixcount
6177442821997854711622
6177442822000476151622
6177442821992611831622
617748934640271359893
617748934703972351893
617748934640271359867
617700170145202175624
617743619083993087487
618194245959286783435
618194245957976063435
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 617744282199785471, + 1622 + ], + [ + 617744282200047615, + 1622 + ], + [ + 617744282199261183, + 1622 + ], + [ + 617748934640271359, + 893 + ], + [ + 617748934703972351, + 893 + ], + [ + 617748934640271359, + 867 + ], + [ + 617700170145202175, + 624 + ], + [ + 617743619083993087, + 487 + ], + [ + 618194245959286783, + 435 + ], + [ + 618194245957976063, + 435 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "ix", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "count", + "type": "\"long\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql\n", + "CREATE OR REPLACE TEMPORARY VIEW agg_overlap AS\n", + "SELECT ix, count(*) AS count\n", + "FROM overlap_candidates\n", + "GROUP BY ix, vessel_1, vessel_2\n", + "ORDER BY count DESC;\n", + "\n", + "SELECT * FROM agg_overlap LIMIT 10;" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3bf7af9a-ba25-40a4-820c-a7cc7ddd687c", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Plot Common Overlaps" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c659afdd-f59b-470d-9d58-6587c1b66f0f", + 
"showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4e813232-0e8c-41f0-8a62-ab6ee303c5be", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "80ccfd58-5789-4981-96a0-0169e3d8a108", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# \"agg_overlap\" \"ix\" \"h3\" 10_000" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 85549842153093, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "03.a Overlap Detection", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/examples/python/Ship2ShipTransfers/03.a Overlap Detection.py b/notebooks/examples/python/Ship2ShipTransfers/03.a Overlap Detection.py deleted file mode 100644 index 02864978a..000000000 --- a/notebooks/examples/python/Ship2ShipTransfers/03.a Overlap Detection.py +++ /dev/null @@ -1,167 +0,0 @@ -# Databricks notebook source -# MAGIC %md # Overlap 
Detection -# MAGIC -# MAGIC We now try to detect potentially overlapping pings using a buffer on a particular day. - -# COMMAND ---------- - -# MAGIC %pip install databricks_mosaic - -# COMMAND ---------- - -from pyspark.sql.functions import * -import mosaic as mos - -spark.conf.set("spark.databricks.labs.mosaic.geometry.api", "JTS") -spark.conf.set("spark.databricks.labs.mosaic.index.system", "H3") -mos.enable_mosaic(spark, dbutils) - -# COMMAND ---------- - -cargos_indexed = spark.read.table("ship2ship.cargos_indexed").filter( - col("BaseDateTime").between( - "2018-01-31T00:00:00.000+0000", "2018-01-31T23:59:00.000+0000" - ) -) -display(cargos_indexed) -cargos_indexed.count() - -# COMMAND ---------- - -# MAGIC %md ## Buffering -# MAGIC -# MAGIC 1. Convert the point into a polygon by buffering it with a certain area to turn this into a circle. -# MAGIC 2. Index the polygon to leverage more performant querying. -# MAGIC -# MAGIC ![](http://1fykyq3mdn5r21tpna3wkdyi-wpengine.netdna-ssl.com/wp-content/uploads/2018/06/image14-1.png) -# MAGIC -# MAGIC -# MAGIC -# MAGIC Since our projection is not in metres, we convert from decimal degrees, with `(0.00001 - 0.000001)` as being equal to one metre at the equator. -# MAGIC Here we choose an buffer of roughly 100 metres. -# MAGIC Ref: http://wiki.gis.com/wiki/index.php/Decimal_degrees - -# COMMAND ---------- - -one_metre = 0.00001 - 0.000001 -buffer = 100 * one_metre - -( - cargos_indexed - # We increase parallelism as the default execution plan does not take full advantage of it. 
- .repartition(sc.defaultParallelism * 20) - .withColumn("buffer_geom", mos.st_buffer("point_geom", lit(buffer))) - .withColumn("ix", mos.grid_tessellateexplode("buffer_geom", lit(9))) - .write.mode("overwrite") - .saveAsTable("ship2ship.cargos_buffered") -) - -# COMMAND ---------- - -# DBTITLE 1,We can optimise our table to colocate data and make querying faster -# MAGIC %sql -# MAGIC OPTIMIZE ship2ship.cargos_buffered ZORDER BY (ix.index_id, BaseDateTime); -# MAGIC SELECT * FROM ship2ship.cargos_buffered; - -# COMMAND ---------- - -# MAGIC %md ## Implement Algorithm - -# COMMAND ---------- - -buffered_events = spark.read.table("ship2ship.cargos_buffered").repartition( - sc.defaultParallelism * 20 -) - - -def ts_diff(ts1, ts2): - """Output the difference between two timestamps in seconds. - - Args: - ts1 (Timestamp): First Timestamp - ts2 (Timestamp): Second Timestamp - - Returns: - long: The difference between two timestamps in seconds. - """ - return abs(col(ts1).cast("long") - col(ts2).cast("long")) - - -def time_window(sog1, sog2, heading1, heading2, radius): - """Create dynamic time window based on speed, buffer radius and heading. 
- - Args: - sog1 (double): vessel 1's speed over ground, in knots - sog2 (double): vessel 2's speed over ground, in knots - heading1 (double): vessel 1's heading angle in degrees - heading2 (double): vessel 2's heading angle in degrees - radius (double): buffer radius in degrees - - Returns: - double: dynamic time window in seconds based on the speed and radius - """ - v_x1 = col(sog1) * cos(col(heading1)) - v_y1 = col(sog1) * sin(col(heading1)) - v_x2 = col(sog2) * cos(col(heading2)) - v_y2 = col(sog2) * sin(col(heading2)) - - # compute relative vectors speed based x and y partial speeds - v_relative = sqrt((v_x1 + v_x2) * (v_x1 + v_x2) + (v_y1 + v_y2) * (v_y1 + v_y2)) - # convert to m/s and determine ratio between speed and radius - return v_relative * lit(1000) / lit(radius) / lit(3600) - - -candidates = ( - buffered_events.alias("a") - .join( - buffered_events.alias("b"), - [ - col("a.ix.index_id") - == col("b.ix.index_id"), # to only compare across efficient indices - col("a.mmsi") - < col("b.mmsi"), # to prevent comparing candidates bidirectionally - ts_diff("a.BaseDateTime", "b.BaseDateTime") - < time_window("a.sog_kmph", "b.sog_kmph", "a.heading", "b.heading", buffer), - ], - ) - .where( - ( - col("a.ix.is_core") | col("b.ix.is_core") - ) # if either candidate fully covers an index, no further comparison is needed - | mos.st_intersects( - "a.ix.wkb", "b.ix.wkb" - ) # limit geospatial querying to cases where indices alone cannot give certainty - ) - .select( - col("a.vesselName").alias("vessel_1"), - col("b.vesselName").alias("vessel_2"), - col("a.BaseDateTime").alias("timestamp_1"), - col("b.BaseDateTime").alias("timestamp_2"), - col("a.ix.index_id").alias("ix"), - ) -) - -# COMMAND ---------- - -(candidates.write.mode("overwrite").saveAsTable("ship2ship.overlap_candidates")) - -# COMMAND ---------- - -# MAGIC %sql -# MAGIC SELECT * FROM ship2ship.overlap_candidates - -# COMMAND ---------- - -# MAGIC %sql -# MAGIC CREATE OR REPLACE TEMPORARY VIEW 
agg_overlap AS -# MAGIC SELECT ix, count(*) AS count -# MAGIC FROM ship2ship.overlap_candidates -# MAGIC GROUP BY ix, vessel_1, vessel_2 -# MAGIC ORDER BY count DESC; -# MAGIC SELECT * FROM agg_overlap; - -# COMMAND ---------- - -# DBTITLE 1,Plotting Common Overlaps -# MAGIC %%mosaic_kepler -# MAGIC "agg_overlap" "ix" "h3" 10_000 diff --git a/notebooks/examples/python/Ship2ShipTransfers/03.b Advanced Overlap Detection.ipynb b/notebooks/examples/python/Ship2ShipTransfers/03.b Advanced Overlap Detection.ipynb new file mode 100644 index 000000000..a791ff1b2 --- /dev/null +++ b/notebooks/examples/python/Ship2ShipTransfers/03.b Advanced Overlap Detection.ipynb @@ -0,0 +1,1384 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d9d6eea1-2420-4ed4-898a-f623ba434ea7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Line Aggregation\n", + "\n", + "> Instead of the point-to-point evaluation, we will aggregate the points into lines and compare those lines.\n", + "\n", + "---\n", + "__Last Updated:__ 27 NOV 2023 [Mosaic 0.3.12]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c0464989-7c10-4a28-94fa-14b0b44e0fef", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1d190dd4-df71-488d-9faa-535020d58335", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Python interpreter will be restarted.\nPython interpreter will be 
restarted.\n" + ] + } + ], + "source": [ + "%pip install \"databricks-mosaic<0.4,>=0.3\" --quiet # <- Mosaic 0.3 series\n", + "# %pip install \"databricks-mosaic<0.5,>=0.4\" --quiet # <- Mosaic 0.4 series (as available)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8eb58507-5cb6-4348-bccb-ad8853e90cc7", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# -- configure AQE for more compute heavy operations\n", + "# - choose option-1 or option-2 below, essential for REPARTITION!\n", + "# spark.conf.set(\"spark.databricks.optimizer.adaptive.enabled\", False) # <- option-1: turn off completely for full control\n", + "spark.conf.set(\"spark.sql.adaptive.coalescePartitions.enabled\", False) # <- option-2: just tweak partition management\n", + "spark.conf.set(\"spark.sql.shuffle.partitions\", 1_024) # <-- default is 200\n", + "\n", + "# -- import databricks + spark functions\n", + "from pyspark.databricks.sql import functions as dbf\n", + "from pyspark.sql import functions as F\n", + "from pyspark.sql.functions import col, udf\n", + "from pyspark.sql.types import *\n", + "\n", + "# -- setup mosaic\n", + "import mosaic as mos\n", + "\n", + "mos.enable_mosaic(spark, dbutils)\n", + "# mos.enable_gdal(spark) # <- not needed for this example\n", + "\n", + "# --other imports\n", + "import warnings\n", + "\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "535765d9-808b-451f-a6ff-bcf82db80e28", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Configure Database__\n", + "\n", + "> Adjust this to settings from the Data Prep notebook." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ffaf5dd0-00da-4794-8539-8de2fa45d06d", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[2]: DataFrame[]" + ] + } + ], + "source": [ + "catalog_name = \"mjohns\"\n", + "db_name = \"ship2ship\"\n", + "\n", + "sql(f\"use catalog {catalog_name}\")\n", + "sql(f\"use schema {db_name}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "660a9e5e-2d82-4b64-97f8-8ebaa49baa8c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 521,867\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
MMSIBaseDateTimeLATLONSOGCOGHeadingVesselNameIMOCallSignVesselTypeStatusLengthWidthDraftCargoTranscieverClasspoint_geomixsog_kmph
3675317802018-01-31T03:43:14.000+000029.75981-91.617894.386.488.0POSEIDONIMO8842234WDG3989700217nullnullAAAAAAAHAVueLgn+hoUA9woLofSx86181940945648353277.96
3675317802018-01-31T03:42:05.000+000029.75974-91.619464.492.187.0POSEIDONIMO8842234WDG3989700217nullnullAAAAAAAHAVuelO45LiEA9wn5SFXaK6181940945648353278.15
3675317802018-01-31T03:48:04.000+000029.76002-91.611393.885.597.0POSEIDONIMO8842234WDG3989700217nullnullAAAAAAAHAVuchA4XGfkA9wpCrtE5R6181940945650974717.04
3675317802018-01-31T03:46:56.000+000029.75997-91.612834.083.890.0POSEIDONIMO8842234WDG3989700217nullnullAAAAAAAHAVuc4m1IAfkA9wo1k1/Dt6181940945650974717.41
3675317802018-01-31T03:45:46.000+000029.75983-91.614364.389.287.0POSEIDONIMO8842234WDG3989700217nullnullAAAAAAAHAVudRrJr+HkA9woQ4CIUK6181940945653596157.96
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 367531780, + "2018-01-31T03:43:14.000+0000", + 29.75981, + -91.61789, + 4.3, + 86.4, + 88.0, + "POSEIDON", + "IMO8842234", + "WDG3989", + 70, + 0, + 21, + 7, + null, + null, + "A", + "AAAAAAHAVueLgn+hoUA9woLofSx8", + 618194094564835327, + 7.96 + ], + [ + 367531780, + "2018-01-31T03:42:05.000+0000", + 29.75974, + -91.61946, + 4.4, + 92.1, + 87.0, + "POSEIDON", + "IMO8842234", + "WDG3989", + 70, + 0, + 21, + 7, + null, + null, + "A", + "AAAAAAHAVuelO45LiEA9wn5SFXaK", + 618194094564835327, + 8.15 + ], + [ + 367531780, + "2018-01-31T03:48:04.000+0000", + 29.76002, + -91.61139, + 3.8, + 85.5, + 97.0, + "POSEIDON", + "IMO8842234", + "WDG3989", + 70, + 0, + 21, + 7, + null, + null, + "A", + "AAAAAAHAVuchA4XGfkA9wpCrtE5R", + 618194094565097471, + 7.04 + ], + [ + 367531780, + "2018-01-31T03:46:56.000+0000", + 29.75997, + -91.61283, + 4.0, + 83.8, + 90.0, + "POSEIDON", + "IMO8842234", + "WDG3989", + 70, + 0, + 21, + 7, + null, + null, + "A", + "AAAAAAHAVuc4m1IAfkA9wo1k1/Dt", + 618194094565097471, + 7.41 + ], + [ + 367531780, + "2018-01-31T03:45:46.000+0000", + 29.75983, + -91.61436, + 4.3, + 89.2, + 87.0, + "POSEIDON", + "IMO8842234", + "WDG3989", + 70, + 0, + 21, + 7, + null, + null, + "A", + "AAAAAAHAVudRrJr+HkA9woQ4CIUK", + 618194094565359615, + 7.96 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "MMSI", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": 
"BaseDateTime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "LAT", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "LON", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "SOG", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "COG", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "Heading", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "VesselName", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "IMO", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "CallSign", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "VesselType", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Status", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Length", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Width", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "Draft", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "Cargo", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "TranscieverClass", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "point_geom", + "type": "\"binary\"" + }, + { + "metadata": "{}", + "name": "ix", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "sog_kmph", + "type": "\"double\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "cargos_indexed = spark.read.table(\"cargos_indexed\")\n", + " \n", + "print(f\"count? 
{cargos_indexed.count():,}\")\n", + "cargos_indexed.limit(5).display() # <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c824e4a4-1ec7-4f03-a98d-132160c55d1f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Create Lines\n", + "\n", + "We can `groupBy` across a timewindow to give us aggregated geometries to work with.\n", + "\n", + "When we collect the various points within a timewindow, we want to construct the linestring by the order in which they were generated (timestamp).\n", + "We choose a maximum buffer of 200 metres in this case.\n", + "Since our projection is not in metres, we convert from decimal degrees, taking `(0.00001 - 0.000001)` as equal to one metre at the equator.\n", + "Ref: http://wiki.gis.com/wiki/index.php/Decimal_degrees" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "807361a6-0f28-4966-8d8b-32271a6f8f3d", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 
72,933\n" + ] + } + ], + "source": [ + "spark.catalog.clearCache() # <- cache is useful for dev (avoid recompute)\n", + "lines = (\n", + " cargos_indexed\n", + " .repartition(sc.defaultParallelism * 20) # <- repartition is important!\n", + " .groupBy(\"mmsi\", F.window(\"BaseDateTime\", \"15 minutes\"))\n", + " # We link the points to their respective timestamps in the aggregation\n", + " .agg(F.collect_list(F.struct(col(\"point_geom\"), col(\"BaseDateTime\"))).alias(\"coords\"))\n", + " # And then sort our array of points by the timestamp to form a trajectory\n", + " .withColumn(\n", + " \"coords\",\n", + " F.expr(\n", + " \"\"\"\n", + " array_sort(coords, (left, right) -> \n", + " case \n", + " when left.BaseDateTime < right.BaseDateTime then -1 \n", + " when left.BaseDateTime > right.BaseDateTime then 1 \n", + " else 0 \n", + " end\n", + " )\"\"\"\n", + " ),\n", + " )\n", + " .withColumn(\"line\", mos.st_makeline(col(\"coords.point_geom\")))\n", + " .cache()\n", + ")\n", + "print(f\"count? 
{lines.count():,}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "736c0b76-f14e-4f02-9e9f-bb174f65e01a", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Note here that this decreases the total number of rows across which we are running our comparisons._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ebcba535-1aa3-41e0-b8db-8d40f5b7031d", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------+--------------------+--------------------+--------------------+--------+--------------------+--------------------+--------------------+\n| mmsi| window| coords| line|buffer_r| buffer_geom| buffer| ix|\n+---------+--------------------+--------------------+--------------------+--------+--------------------+--------------------+--------------------+\n|564995000|{2018-01-31 20:00...|[{\u0000\u0000\u0000\u0000\u0001�_�:)�y�@H...|[00 00 00 00 02 0...| 0.00192|[00 00 00 00 03 0...|POLYGON ((-126.04...|{false, 617710246...|\n+---------+--------------------+--------------------+--------------------+--------+--------------------+--------------------+--------------------+\n\n" + ] + } + ], + "source": [ + "\n", + "one_metre = 0.00001 - 0.000001\n", + "buffer = 200 * one_metre\n", + "\n", + "\n", + "def get_buffer(line, buffer=buffer):\n", + " \"\"\"Create buffer as function of number of points in linestring\n", + " The buffer size is inversely proportional to the number of points, providing a larger buffer for slower ships.\n", + " The intuition behind this choice is held in the way AIS positions are emitted. 
The faster the vessel moves the\n", + " more positions it will emit — yielding a smoother trajectory, where slower vessels will yield far fewer positions\n", + " and a harder to reconstruct trajectory which inherently holds more uncertainty.\n", + "\n", + " Args:\n", + " line (geometry): linestring geometry as generated with st_makeline.\n", + "\n", + " Returns:\n", + " double: buffer size in degrees\n", + " \"\"\"\n", + " np = mos.st_numpoints(line)\n", + " max_np = lines.select(F.max(np)).collect()[0][0]\n", + " return F.lit(max_np) * F.lit(buffer) / np\n", + "\n", + "\n", + "cargo_movement = (\n", + " lines.withColumn(\"buffer_r\", get_buffer(\"line\"))\n", + " .withColumn(\"buffer_geom\", mos.st_buffer(\"line\", col(\"buffer_r\")))\n", + " .withColumn(\"buffer\", mos.st_astext(\"buffer_geom\"))\n", + " .withColumn(\"ix\", mos.grid_tessellateexplode(\"buffer_geom\", F.lit(9)))\n", + ")\n", + "\n", + "cargo_movement.createOrReplaceTempView(\"ship_path\") # <- create a temp view\n", + "spark.read.table(\"ship_path\").limit(1).show() # <- limiting + using `show` for ipynb only" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7c8a1d63-1ecf-4377-94fd-519a6f5a7cb0", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "to_plot = spark.read.table(\"ship_path\").select(\"buffer\").limit(3_000).distinct()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1286c751-b4d6-461b-9fcd-4b7349502d81", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Example buffer paths_" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + 
"inputWidgets": {}, + "nuid": "e515bbee-1fef-4d28-b77c-e2def71dcd99", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "390ff6af-5876-4368-afcb-f5cf0a005190", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7992e47b-30d5-44ca-af2f-ad9d75917501", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# to_plot \"buffer\" \"geometry\" 3_000" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ed81a1f8-e1fa-4119-935f-76a34397c3f7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Find All Candidates\n", + "\n", + "We employ a join strategy using Mosaic indices as before, but this time we leverage the buffered ship paths." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c457b6f5-71f7-40c3-95f0-05bbf3b28566", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "candidates_lines = (\n", + " cargo_movement.alias(\"a\")\n", + " .join(\n", + " cargo_movement.alias(\"b\"),\n", + " [\n", + " col(\"a.ix.index_id\")\n", + " == col(\"b.ix.index_id\"), # to only compare across efficient indices\n", + " col(\"a.mmsi\")\n", + " < col(\"b.mmsi\"), # to prevent comparing candidates bidirectionally\n", + " col(\"a.window\")\n", + " == col(\"b.window\"), # to compare across the same time window\n", + " ],\n", + " )\n", + " .where(\n", + " (\n", + " col(\"a.ix.is_core\") | col(\"b.ix.is_core\")\n", + " ) # if either candidate fully covers an index, no further comparison is needed\n", + " | mos.st_intersects(\n", + " \"a.ix.wkb\", \"b.ix.wkb\"\n", + " ) # limit geospatial querying to cases where indices alone cannot give certainty\n", + " )\n", + " .select(\n", + " col(\"a.mmsi\").alias(\"vessel_1\"),\n", + " col(\"b.mmsi\").alias(\"vessel_2\"),\n", + " col(\"a.window\").alias(\"window\"),\n", + " col(\"a.buffer\").alias(\"line_1\"),\n", + " col(\"b.buffer\").alias(\"line_2\"),\n", + " col(\"a.ix.index_id\").alias(\"index\"),\n", + " )\n", + " .drop_duplicates()\n", + ")\n", + "\n", + "(\n", + " candidates_lines.write.mode(\"overwrite\")\n", + " .option(\"overwriteSchema\", \"true\")\n", + " .saveAsTable(\"overlap_candidates_lines\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5e5cd7f7-97f8-47e4-a2a7-f04e4661d483", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": 
"display_data", + "data": { + "text/html": [ + "
vessel_1vessel_2windowline_1line_2index
477700700477999700List(2018-01-31T14:45:00.000+0000, 2018-01-31T15:00:00.000+0000)POLYGON ((-162.5172963924641 54.096237807783645, -162.50149864536905 54.09560112276218, -162.49286297163883 54.095271288001655, -162.49278616875375 54.09526763555918, -162.48225934728822 54.094668385760386, -162.47514253036482 54.09430905138557, -162.47513660892625 54.09430874812911, -162.46454660892627 54.093758748129105, -162.4637491321803 54.09363816915261, -162.46299050251778 54.09336432707218, -162.46229987365115 54.09294774548538, -162.46170378606044 54.092404433388985, -162.4612251470575 54.09175526996214, -162.46088235047074 54.09102520219131, -162.4606885697808 54.09024228617148, -162.4606512518709 54.08943660892625, -162.4607718308474 54.088639132180305, -162.46104567292784 54.08788050251777, -162.46146225451463 54.087189873651134, -162.46200556661103 54.086593786060426, -162.46265473003785 54.08611514705748, -162.46338479780871 54.085772350470734, -162.46416771382852 54.08557856978079, -162.46497339107376 54.08554125187089, -162.4755604304651 54.08609109810936, -162.48268746963518 54.086450948614434, -162.48271383124626 54.086452364440824, -162.4932154468692 54.08705017937192, -162.50181702836116 54.08737871199835, -162.50182568054078 54.08737905158179, -162.5177056805408 54.08801905158179, -162.51786135952378 54.088028283896314, -162.5248813595238 54.08857828389632, -162.52567538980614 54.088719791354094, -162.52642655628375 54.089013487409225, -162.52710599205017 54.089448085487014, -162.5276875867699 54.09000688422706, -162.52814899008402 54.090668409307156, -162.52847247052196 54.09140723868947, -162.52864559691187 54.09219497957517, -162.5286617161037 54.0930013595238, -162.52852020864592 54.09379538980615, -162.52822651259078 54.09454655628376, -162.527791914513 54.09522599205017, -162.52723311577296 54.095807586769894, -162.52657159069287 54.09626899008403, -162.52583276131054 54.096592470521955, -162.52504502042484 54.09676559691187, -162.52423864047623 
54.096781716103685, -162.5172963924641 54.096237807783645))POLYGON ((-162.49169 54.1088, -162.49224338392438 54.10318139872594, -162.49388226946368 54.09777871714789, -162.49654367516567 54.092799577289036, -162.50012532470183 54.08843532470183, -162.50448957728904 54.08485367516569, -162.50946871714788 54.082192269463675, -162.51487139872594 54.08055338392439, -162.52049 54.080000000000005, -162.52610860127405 54.08055338392439, -162.5315112828521 54.082192269463675, -162.53649042271095 54.08485367516569, -162.54085467529816 54.08843532470183, -162.54443632483432 54.092799577289036, -162.5470977305363 54.09777871714789, -162.54873661607562 54.10318139872594, -162.54928999999998 54.1088, -162.54873661607562 54.11441860127407, -162.5470977305363 54.119821282852115, -162.54443632483432 54.12480042271097, -162.54085467529816 54.12916467529818, -162.53649042271095 54.132746324834315, -162.5315112828521 54.13540773053633, -162.52610860127405 54.137046616075615, -162.52049 54.1376, -162.51487139872594 54.137046616075615, -162.50946871714788 54.13540773053633, -162.50448957728904 54.132746324834315, -162.50012532470183 54.12916467529818, -162.49654367516567 54.12480042271097, -162.49388226946368 54.119821282852115, -162.49224338392438 54.11441860127407, -162.49169 54.1088))617219359258181631
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 477700700, + 477999700, + [ + "2018-01-31T14:45:00.000+0000", + "2018-01-31T15:00:00.000+0000" + ], + "POLYGON ((-162.5172963924641 54.096237807783645, -162.50149864536905 54.09560112276218, -162.49286297163883 54.095271288001655, -162.49278616875375 54.09526763555918, -162.48225934728822 54.094668385760386, -162.47514253036482 54.09430905138557, -162.47513660892625 54.09430874812911, -162.46454660892627 54.093758748129105, -162.4637491321803 54.09363816915261, -162.46299050251778 54.09336432707218, -162.46229987365115 54.09294774548538, -162.46170378606044 54.092404433388985, -162.4612251470575 54.09175526996214, -162.46088235047074 54.09102520219131, -162.4606885697808 54.09024228617148, -162.4606512518709 54.08943660892625, -162.4607718308474 54.088639132180305, -162.46104567292784 54.08788050251777, -162.46146225451463 54.087189873651134, -162.46200556661103 54.086593786060426, -162.46265473003785 54.08611514705748, -162.46338479780871 54.085772350470734, -162.46416771382852 54.08557856978079, -162.46497339107376 54.08554125187089, -162.4755604304651 54.08609109810936, -162.48268746963518 54.086450948614434, -162.48271383124626 54.086452364440824, -162.4932154468692 54.08705017937192, -162.50181702836116 54.08737871199835, -162.50182568054078 54.08737905158179, -162.5177056805408 54.08801905158179, -162.51786135952378 54.088028283896314, -162.5248813595238 54.08857828389632, -162.52567538980614 54.088719791354094, -162.52642655628375 54.089013487409225, -162.52710599205017 54.089448085487014, -162.5276875867699 54.09000688422706, -162.52814899008402 54.090668409307156, -162.52847247052196 54.09140723868947, -162.52864559691187 54.09219497957517, -162.5286617161037 
54.0930013595238, -162.52852020864592 54.09379538980615, -162.52822651259078 54.09454655628376, -162.527791914513 54.09522599205017, -162.52723311577296 54.095807586769894, -162.52657159069287 54.09626899008403, -162.52583276131054 54.096592470521955, -162.52504502042484 54.09676559691187, -162.52423864047623 54.096781716103685, -162.5172963924641 54.096237807783645))", + "POLYGON ((-162.49169 54.1088, -162.49224338392438 54.10318139872594, -162.49388226946368 54.09777871714789, -162.49654367516567 54.092799577289036, -162.50012532470183 54.08843532470183, -162.50448957728904 54.08485367516569, -162.50946871714788 54.082192269463675, -162.51487139872594 54.08055338392439, -162.52049 54.080000000000005, -162.52610860127405 54.08055338392439, -162.5315112828521 54.082192269463675, -162.53649042271095 54.08485367516569, -162.54085467529816 54.08843532470183, -162.54443632483432 54.092799577289036, -162.5470977305363 54.09777871714789, -162.54873661607562 54.10318139872594, -162.54928999999998 54.1088, -162.54873661607562 54.11441860127407, -162.5470977305363 54.119821282852115, -162.54443632483432 54.12480042271097, -162.54085467529816 54.12916467529818, -162.53649042271095 54.132746324834315, -162.5315112828521 54.13540773053633, -162.52610860127405 54.137046616075615, -162.52049 54.1376, -162.51487139872594 54.137046616075615, -162.50946871714788 54.13540773053633, -162.50448957728904 54.132746324834315, -162.50012532470183 54.12916467529818, -162.49654367516567 54.12480042271097, -162.49388226946368 54.119821282852115, -162.49224338392438 54.11441860127407, -162.49169 54.1088))", + 617219359258181631 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "vessel_1", 
+ "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "vessel_2", + "type": "\"integer\"" + }, + { + "metadata": "{\"spark.timeWindow\":true}", + "name": "window", + "type": "{\"type\":\"struct\",\"fields\":[{\"name\":\"start\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}},{\"name\":\"end\",\"type\":\"timestamp\",\"nullable\":true,\"metadata\":{}}]}" + }, + { + "metadata": "{}", + "name": "line_1", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "line_2", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "index", + "type": "\"long\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql\n", + "SELECT * FROM overlap_candidates_lines LIMIT 1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4c8a5dff-3c09-40c4-b33c-0a4dc7fd9fbd", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_We can show the most common locations for overlaps happening, as well as some example ship paths during those overlaps._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "887fb194-d654-4add-bc8e-1c34bd44cb7f", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql\n", + "CREATE OR REPLACE TEMPORARY VIEW agg_overlap AS\n", + "SELECT index AS ix, count(*) AS count, FIRST(line_1) AS line_1, FIRST(line_2) AS line_2\n", + "FROM ship2ship.overlap_candidates_lines\n", + "GROUP BY ix\n", + "ORDER BY count DESC" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d7eca312-89d6-4816-9423-51985e63bb4e", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3a071ce6-4735-4fc5-9c78-c0893178c197", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks 
for actual results._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9a44e4fa-60bf-494a-9ce9-277cceeda8c7", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# \"agg_overlap\" \"ix\" \"h3\" 2_000" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "98a4ee66-50c2-4101-a679-614bf598bc2c", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Filtering Out Harbours\n", + "In the data we see many overlaps near harbours. We can reasonably assume that these are overlaps due to being in close proximity of the harbour, not a transfer.\n", + "Therefore, we can filter those out below." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "525d02b4-0522-41c1-bacf-830a4720706a", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "harbours_h3 = spark.read.table(\"harbours_h3\")\n", + "candidates = spark.read.table(\"overlap_candidates_lines\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4b022d20-5cf5-46b1-937a-4828172124ca", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "matches = (\n", + " candidates.join(\n", + " harbours_h3, how=\"leftanti\", on=candidates[\"index\"] == harbours_h3[\"h3\"]\n", + " )\n", + " .groupBy(\"vessel_1\", \"vessel_2\")\n", + " .agg(F.first(\"line_1\").alias(\"line_1\"), F.first(\"line_2\").alias(\"line_2\"))\n", + ")" + ] + 
}, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4364ee70-357f-4832-b36c-f898f7617426", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "(\n", + " matches.write.mode(\"overwrite\")\n", + " .option(\"overwriteSchema\", \"true\")\n", + " .saveAsTable(\"overlap_candidates_lines_filtered\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9ebf35e4-533e-4a50-a840-49e54fbec963", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
format_number(count(1), 0)
3,093
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "3,093" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{\"__autoGeneratedAlias\":\"true\"}", + "name": "format_number(count(1), 0)", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql\n", + "SELECT format_number(COUNT(1),0) FROM overlap_candidates_lines_filtered;" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "45d0c208-ea6d-4b3f-b3c1-8b41d731a868", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ca7ea57c-1d4c-4076-b5c0-668410e971a2", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual 
results._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b7318dba-c0d3-47dc-8b12-a1fc13c7fb00", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# overlap_candidates_lines_filtered \"line_1\" \"geometry\" 2_000" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 85549842153110, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "03.b Advanced Overlap Detection", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/examples/python/Ship2ShipTransfers/03.b Advanced Overlap Detection.py b/notebooks/examples/python/Ship2ShipTransfers/03.b Advanced Overlap Detection.py deleted file mode 100644 index d867ab3c0..000000000 --- a/notebooks/examples/python/Ship2ShipTransfers/03.b Advanced Overlap Detection.py +++ /dev/null @@ -1,217 +0,0 @@ -# Databricks notebook source -# MAGIC %md ## Line Aggregation -# MAGIC -# MAGIC Instead of the point-to-point evaluation, we will instead be aggregating into lines and comparing as such. 
- -# COMMAND ---------- - -# MAGIC %pip install databricks_mosaic - -# COMMAND ---------- - -from pyspark.sql.functions import * -import mosaic as mos - -spark.conf.set("spark.databricks.labs.mosaic.geometry.api", "JTS") -spark.conf.set("spark.databricks.labs.mosaic.index.system", "H3") -mos.enable_mosaic(spark, dbutils) - -# COMMAND ---------- - -cargos_indexed = spark.read.table("ship2ship.cargos_indexed").repartition( - sc.defaultParallelism * 20 -) -display(cargos_indexed) -cargos_indexed.count() - -# COMMAND ---------- - -# MAGIC %md ## Create Lines -# MAGIC -# MAGIC We can `groupBy` across a timewindow to give us aggregated geometries to work with. -# MAGIC -# MAGIC When we collect the various points within a timewindow, we want to construct the linestring by the order in which they were generated (timestamp). -# MAGIC We choose a buffer of a max of 200 metres in this case. -# MAGIC Since our projection is not in metres, we convert from decimal degrees. With `(0.00001 - 0.000001)` as being equal to one metre at the equator -# MAGIC Ref: http://wiki.gis.com/wiki/index.php/Decimal_degrees - -# COMMAND ---------- - -lines = ( - cargos_indexed.repartition(sc.defaultParallelism * 20) - .groupBy("mmsi", window("BaseDateTime", "15 minutes")) - # We link the points to their respective timestamps in the aggregation - .agg(collect_list(struct(col("point_geom"), col("BaseDateTime"))).alias("coords")) - # And then sort our array of points by the timestamp to form a trajectory - .withColumn( - "coords", - expr( - """ - array_sort(coords, (left, right) -> - case - when left.BaseDateTime < right.BaseDateTime then -1 - when left.BaseDateTime > right.BaseDateTime then 1 - else 0 - end - )""" - ), - ) - .withColumn("line", mos.st_makeline(col("coords.point_geom"))) - .cache() -) - -# COMMAND ---------- - -# DBTITLE 1,Note here that this decreases the total number of rows across which we are running our comparisons. 
-lines.count() - -# COMMAND ---------- - - -one_metre = 0.00001 - 0.000001 -buffer = 200 * one_metre - - -def get_buffer(line, buffer=buffer): - """Create buffer as function of number of points in linestring - The buffer size is inversely proportional to the number of points, providing a larger buffer for slower ships. - The intuition behind this choice is held in the way AIS positions are emitted. The faster the vessel moves the - more positions it will emit — yielding a smoother trajectory, where slower vessels will yield far fewer positions - and a harder to reconstruct trajectory which inherently holds more uncertainty. - - Args: - line (geometry): linestring geometry as generated with st_makeline. - - Returns: - double: buffer size in degrees - """ - np = mos.st_numpoints(line) - max_np = lines.select(max(np)).collect()[0][0] - return lit(max_np) * lit(buffer) / np - - -cargo_movement = ( - lines.withColumn("buffer_r", get_buffer("line")) - .withColumn("buffer_geom", mos.st_buffer("line", col("buffer_r"))) - .withColumn("buffer", mos.st_astext("buffer_geom")) - .withColumn("ix", mos.grid_tessellateexplode("buffer_geom", lit(9))) -) - -(cargo_movement.createOrReplaceTempView("ship_path")) - -display(spark.read.table("ship_path")) - -# COMMAND ---------- - -to_plot = spark.read.table("ship_path").select("buffer").limit(3_000).distinct() - -# COMMAND ---------- - -# DBTITLE 1,Example Buffer Paths -# MAGIC %%mosaic_kepler -# MAGIC to_plot "buffer" "geometry" 3_000 - -# COMMAND ---------- - -# MAGIC %md ## Find All Candidates -# MAGIC -# MAGIC We employ a join strategy using Mosaic indices as before, but this time we leverage the buffered ship paths. 
- -# COMMAND ---------- - -candidates_lines = ( - cargo_movement.alias("a") - .join( - cargo_movement.alias("b"), - [ - col("a.ix.index_id") - == col("b.ix.index_id"), # to only compare across efficient indices - col("a.mmsi") - < col("b.mmsi"), # to prevent comparing candidates bidirectionally - col("a.window") - == col("b.window"), # to compare across the same time window - ], - ) - .where( - ( - col("a.ix.is_core") | col("b.ix.is_core") - ) # if either candidate fully covers an index, no further comparison is needed - | mos.st_intersects( - "a.ix.wkb", "b.ix.wkb" - ) # limit geospatial querying to cases where indices alone cannot give certainty - ) - .select( - col("a.mmsi").alias("vessel_1"), - col("b.mmsi").alias("vessel_2"), - col("a.window").alias("window"), - col("a.buffer").alias("line_1"), - col("b.buffer").alias("line_2"), - col("a.ix.index_id").alias("index"), - ) - .drop_duplicates() -) - -( - candidates_lines.write.mode("overwrite") - .option("overwriteSchema", "true") - .saveAsTable("ship2ship.overlap_candidates_lines") -) - -# COMMAND ---------- - -# MAGIC %sql -# MAGIC SELECT * FROM ship2ship.overlap_candidates_lines; - -# COMMAND ---------- - -# DBTITLE 1,We can show the most common locations for overlaps happening, as well some example ship paths during those overlaps. -# MAGIC %sql -# MAGIC CREATE OR REPLACE TEMPORARY VIEW agg_overlap AS -# MAGIC SELECT index AS ix, count(*) AS count, FIRST(line_1) AS line_1, FIRST(line_2) AS line_2 -# MAGIC FROM ship2ship.overlap_candidates_lines -# MAGIC GROUP BY ix -# MAGIC ORDER BY count DESC - -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC "agg_overlap" "ix" "h3" 2_000 - -# COMMAND ---------- - -# MAGIC %md ## Filtering Out Harbours -# MAGIC In the data we see many overlaps near harbours. We can reasonably assume that these are overlaps due to being in close proximity of the harbour, not a transfer. -# MAGIC Therefore, we can filter those out below. 
- -# COMMAND ---------- - -harbours_h3 = spark.read.table("ship2ship.harbours_h3") -candidates = spark.read.table("ship2ship.overlap_candidates_lines") - -# COMMAND ---------- - -matches = ( - candidates.join( - harbours_h3, how="leftanti", on=candidates["index"] == harbours_h3["h3"] - ) - .groupBy("vessel_1", "vessel_2") - .agg(first("line_1").alias("line_1"), first("line_2").alias("line_2")) -) - -# COMMAND ---------- - -( - matches.write.mode("overwrite") - .option("overwriteSchema", "true") - .saveAsTable("ship2ship.overlap_candidates_lines_filtered") -) - -# COMMAND ---------- - -# MAGIC %sql -# MAGIC SELECT COUNT(*) FROM ship2ship.overlap_candidates_lines_filtered; - -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC ship2ship.overlap_candidates_lines_filtered "line_1" "geometry" 2_000 diff --git a/notebooks/examples/python/Ship2ShipTransfers/README.md b/notebooks/examples/python/Ship2ShipTransfers/README.md index 0692209f2..c27ed2758 100644 --- a/notebooks/examples/python/Ship2ShipTransfers/README.md +++ b/notebooks/examples/python/Ship2ShipTransfers/README.md @@ -1,4 +1,7 @@ # Ship2Ship Transfer Detection + +> Note: `ipynb` files can be previewed in GitHub and can also be imported into Databricks, more [here](https://docs.databricks.com/en/notebooks/notebook-export-import.html). + ![Ship Overlap](./images/kepler_output.png) ## Introduction This is an algorithmic implementation to detect Ship to Ship transfers at scale using Databricks. It was presented at the [Data and AI Summit 2022](https://www.youtube.com/watch?v=XQNflqbgP7Q). diff --git a/notebooks/examples/python/SpatialKNN/01. Data Prep.ipynb b/notebooks/examples/python/SpatialKNN/01. Data Prep.ipynb new file mode 100644 index 000000000..0eb5cbbd7 --- /dev/null +++ b/notebooks/examples/python/SpatialKNN/01. 
Data Prep.ipynb @@ -0,0 +1,2273 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2c3f3354-af76-43e5-9fa3-f7202c692e0d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup\n", + "\n", + "> Generates the following in database `mosaic_spatial_knn`: (1) table `building_50k`, (2) table `trip_1m`. These are sufficient samples of the full data for this example. __Note:__ You will need to run the actual Spatial KNN on [Databricks ML Runtime](https://docs.databricks.com/en/release-notes/runtime/index.html); for this notebook, it doesn't matter which runtime you use.\n", + "\n", + "

\n", + "\n", + "1. To use Databricks Labs [Mosaic](https://databrickslabs.github.io/mosaic/index.html) library for geospatial data engineering, analysis, and visualization functionality:\n", + " * Install with `%pip install databricks-mosaic`\n", + " * Import and use with the following:\n", + " ```\n", + " import mosaic as mos\n", + " mos.enable_mosaic(spark, dbutils)\n", + " ```\n", + "

\n", + "\n", + "2. To use [KeplerGl](https://kepler.gl/) OSS library for map layer rendering:\n", + " * Already installed with Mosaic, use `%%mosaic_kepler` magic [[Mosaic Docs](https://databrickslabs.github.io/mosaic/usage/kepler.html)]\n", + " * Import with `from keplergl import KeplerGl` to use directly\n", + "\n", + "If you have trouble with Volume access:\n", + "\n", + "* For Mosaic 0.3 series (< DBR 13) - you can copy resources to DBFS as a workaround\n", + "* For Mosaic 0.4 series (DBR 13.3 LTS) - you will need to either copy resources to DBFS or setup for Unity Catalog + Shared Access which will involve your workspace admin. Instructions, as updated, will be [here](https://databrickslabs.github.io/mosaic/usage/install-gdal.html).\n", + "\n", + "---\n", + "__Last Updated:__ 27 NOV 2023 [Mosaic 0.3.12]" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c41f69e4-a93a-4a8b-91d8-94019843bd02", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Python interpreter will be restarted.\nPython interpreter will be restarted.\n" + ] + } + ], + "source": [ + "%pip install \"databricks-mosaic<0.4,>=0.3\" --quiet # <- Mosaic 0.3 series\n", + "# %pip install \"databricks-mosaic<0.5,>=0.4\" --quiet # <- Mosaic 0.4 series (as available)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2e4ea63d-e7b9-4e3f-bed9-63d51b13e50e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# -- configure AQE for more compute heavy operations\n", + "# - choose option-1 or option-2 below, essential for REPARTITION!\n", + "# 
spark.conf.set(\"spark.databricks.optimizer.adaptive.enabled\", False) # <- option-1: turn off completely for full control\n", + "spark.conf.set(\"spark.sql.adaptive.coalescePartitions.enabled\", False) # <- option-2: just tweak partition management\n", + "spark.conf.set(\"spark.sql.shuffle.partitions\", 1_024) # <-- default is 200\n", + "\n", + "# -- import databricks + spark functions\n", + "from pyspark.sql import functions as F\n", + "from pyspark.sql.functions import col, udf\n", + "from pyspark.sql.types import *\n", + "\n", + "# -- setup mosaic\n", + "import mosaic as mos\n", + "\n", + "mos.enable_mosaic(spark, dbutils)\n", + "# mos.enable_gdal(spark) # <- not needed for this example\n", + "\n", + "# --other imports\n", + "import os\n", + "import warnings\n", + "\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e437d01f-add6-4dac-b9cf-1aa9516057ed", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Setup Data Location__\n", + "\n", + "> You can alter this, of course, to match your preferred location. __Note:__ this is showing DBFS for continuity outside Unity Catalog + Shared Access clusters, but you can easily modify paths to use [Volumes](https://docs.databricks.com/en/sql/language-manual/sql-ref-volumes.html), see more details [here](https://databrickslabs.github.io/mosaic/usage/installation.html) as available." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "88fbb923-8e5a-4319-b00c-191bb2bbd140", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "The raw data will be stored in 'dbfs:/mjohns@databricks.com/geospatial/mosaic/data/spatial_knn'\n" + ] + } + ], + "source": [ + "user_name = dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()\n", + "\n", + "raw_path = f\"dbfs:/{user_name}/geospatial/mosaic/data/spatial_knn\"\n", + "raw_fuse_path = raw_path.replace(\"dbfs:\",\"/dbfs\")\n", + "dbutils.fs.mkdirs(raw_path)\n", + "\n", + "os.environ['RAW_PATH'] = raw_path\n", + "os.environ['RAW_FUSE_PATH'] = raw_fuse_path\n", + "\n", + "print(f\"The raw data will be stored in '{raw_path}'\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "98c85386-f334-4570-8857-cd0caac6047f", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "building_filename = \"nyc_building_footprints.geojson\"\n", + "os.environ['BUILDING_FILENAME'] = building_filename" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "203ef9df-5ac4-4239-b2c3-3af35a3251fb", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Setup Catalog and Schema__\n", + "\n", + "> You will have to adjust for your environment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "59419e55-1588-41cf-a191-d210f6e912f7", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[1]: DataFrame[]" + ] + } + ], + "source": [ + "catalog_name = \"mjohns\"\n", + "sql(f\"USE CATALOG {catalog_name}\")\n", + "\n", + "db_name = \"mosaic_spatial_knn\"\n", + "sql(f\"CREATE DATABASE IF NOT EXISTS {db_name}\")\n", + "sql(f\"USE SCHEMA {db_name}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3eb32bde-8575-4190-b8ce-192803905292", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "

databasetableNameisTemporary
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "database", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "tableName", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "isTemporary", + "type": "\"boolean\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql show tables" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "b82264e3-334c-42c3-a2e4-7ddc9f719d8a", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup NYC Building Data (`Building` Table | 50K)\n", + "\n", + "> While the overall data size is ~1.1M, we are going to just take 50K for purposes of this example." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "5f37b525-cf8b-43f6-9cfc-75839ef4b8a4", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Download Data (789MB)__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cb424979-adae-46a2-b89c-ead1e7497813", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "import requests\n", + "import pathlib\n", + "\n", + "def download_url(data_location, dataset_subpath, url):\n", + " fuse_dir = pathlib.Path(data_location.replace('dbfs:',''))\n", + " if (\n", + " not fuse_dir.name.startswith('/Volumes/') and \n", + " not fuse_dir.name.startswith('/Workspace/')\n", + " ):\n", + " fuse_dir = pathlib.Path(data_location.replace('dbfs:/', '/dbfs/'))\n", + " fuse_dir.mkdir(parents=True, exist_ok=True)\n", + " fuse_path = fuse_dir / dataset_subpath\n", + " if not fuse_path.exists():\n", + " req = requests.get(url)\n", + " with open(fuse_path, 'wb') as f:\n", + " f.write(req.content)\n", + " else:\n", + " print(f\"'{fuse_path}' exists...skipping\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "834611da-2dce-4738-b9f7-38aae360c473", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "'/dbfs/mjohns@databricks.com/geospatial/mosaic/data/spatial_knn/nyc_building_footprints.geojson' exists...skipping\n" + ] + } + ], + "source": [ + "# buildings - data preview = https://data.cityofnewyork.us/Housing-Development/Building-Footprints/nqwf-w8eh\n", + 
"download_url(raw_path, building_filename, \"https://data.cityofnewyork.us/api/geospatial/nqwf-w8eh?method=export&format=GeoJSON\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "145be74e-b1f6-4e78-a434-8076844ee721", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "-rwxrwxrwx 1 root root 836M Nov 27 16:45 \u001B[0m\u001B[01;32m'/dbfs/mjohns@databricks.com/geospatial/mosaic/data/spatial_knn/nyc_building_footprints.geojson'\u001B[0m\u001B[K*\r\n" + ] + } + ], + "source": [ + "ls -lh $RAW_FUSE_PATH/$BUILDING_FILENAME" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e1fc720f-b7b3-4a25-a89b-072c16e7310b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Generate DataFrame__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "362a23a3-f8e4-45fd-8bd3-737767dcfa7c", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "@udf(returnType=StringType())\n", + "def fix_geojson(gj_dict):\n", + " \"\"\"\n", + " This GeoJSON has coordinates nested as a string, \n", + " so standardize here to avoid issues, gets to same as\n", + " expected when `to_json(\"feature.geometry\")` is\n", + " normally called.\n", + " \"\"\"\n", + " import json\n", + " \n", + " r_list = []\n", + " for l in gj_dict['coordinates']:\n", + " if isinstance(l,str):\n", + " r_list.append(json.loads(l))\n", + " else:\n", + " r_list.append(l)\n", + " \n", + " return json.dumps(\n", + " {\n", + " \"type\": gj_dict['type'],\n", + " 
\"coordinates\": r_list\n", + " }\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "06f7583c-0f39-4e53-be99-e477f389e772", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 1,109,072\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
typepropertiesjson_geometry
FeatureCollectionList(2042760052, 2048658, 1935, 61809, 2100, Photogramm, {3DCF27FF-A2D0-49BC-A96A-8A25FEEFB8EE}, 78, 40.72, 2017-08-22T00:00:00.000, Constructed, 2042760052, null, 0.0, 0.0){\"type\": \"MultiPolygon\", \"coordinates\": [[[[-73.85143689311231, 40.85381546242524], [-73.85140609192288, 40.8537759883946], [-73.85145374874513, 40.85375454871117], [-73.8514285730422, 40.853722286769], [-73.85153533063556, 40.85367425956854], [-73.85154239770608, 40.85367108001943], [-73.85157483915371, 40.85371265607289], [-73.85158935911977, 40.853706123670676], [-73.85161245356112, 40.853735721045325], [-73.85159921333282, 40.853741677881], [-73.85160609708538, 40.85375049985463], [-73.8515494571809, 40.85377598115823], [-73.8514909949624, 40.85380228223765], [-73.85148454995291, 40.85379402272926], [-73.85143689311231, 40.85381546242524]]]]}
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "FeatureCollection", + [ + "2042760052", + "2048658", + "1935", + "61809", + "2100", + "Photogramm", + "{3DCF27FF-A2D0-49BC-A96A-8A25FEEFB8EE}", + "78", + "40.72", + "2017-08-22T00:00:00.000", + "Constructed", + "2042760052", + null, + "0.0", + "0.0" + ], + "{\"type\": \"MultiPolygon\", \"coordinates\": [[[[-73.85143689311231, 40.85381546242524], [-73.85140609192288, 40.8537759883946], [-73.85145374874513, 40.85375454871117], [-73.8514285730422, 40.853722286769], [-73.85153533063556, 40.85367425956854], [-73.85154239770608, 40.85367108001943], [-73.85157483915371, 40.85371265607289], [-73.85158935911977, 40.853706123670676], [-73.85161245356112, 40.853735721045325], [-73.85159921333282, 40.853741677881], [-73.85160609708538, 40.85375049985463], [-73.8515494571809, 40.85377598115823], [-73.8514909949624, 40.85380228223765], [-73.85148454995291, 40.85379402272926], [-73.85143689311231, 40.85381546242524]]]]}" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "type", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "properties", + "type": 
"{\"type\":\"struct\",\"fields\":[{\"name\":\"base_bbl\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"bin\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"cnstrct_yr\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"doitt_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"feat_code\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"geomsource\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"globalid\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"groundelev\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"heightroof\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"lstmoddate\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"lststatype\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"mpluto_bbl\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"shape_area\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"shape_len\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}" + }, + { + "metadata": "{}", + "name": "json_geometry", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "spark.catalog.clearCache() # <- cache useful for dev (avoid recomputes)\n", + "\n", + "_df_geojson_raw = (\n", + " spark.read\n", + " .option(\"multiline\", \"true\")\n", + " .format(\"json\")\n", + " .load(f\"{raw_path}/{building_filename}\")\n", + " .select(\"type\", F.explode(col(\"features\")).alias(\"feature\"))\n", + " .repartition(24)\n", + " .select(\n", + " \"type\", \n", + " \"feature.properties\", \n", + " fix_geojson(\"feature.geometry\").alias(\"json_geometry\")\n", + " )\n", + " .cache()\n", + ")\n", + "\n", + "print(f\"count? 
{_df_geojson_raw.count():,}\")\n", + "display(_df_geojson_raw.limit(1))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cc16e0de-2a44-4abd-90fb-eee3e49111ba", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "_df_geojson = (\n", + " _df_geojson_raw\n", + " .withColumn(\"geom\", mos.st_geomfromgeojson(\"json_geometry\"))\n", + " .withColumn(\"geom_wkt\", mos.st_astext(\"geom\"))\n", + " .withColumn(\"is_valid\", mos.st_isvalid(\"geom_wkt\"))\n", + " .select(\"properties.*\", \"geom_wkt\", \"is_valid\")\n", + ")\n", + "\n", + "# print(f\"count? {_df_geojson.count():,}\")\n", + "# display(_df_geojson.limit(1))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "07449a49-1763-4127-b486-ece94d89fc37", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Get Sample of 50K__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4e9d0289-b285-4427-9649-1f8b4469546e", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 50,000\n" + ] + } + ], + "source": [ + "_df_geojson_50k = (\n", + " _df_geojson\n", + " .sample(0.05)\n", + " .limit(50_000)\n", + ")\n", + "\n", + "print(f\"count? 
{_df_geojson_50k.count():,}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "20532979-c8e1-4f79-a637-6f23293091d4", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Write out to Delta Lake__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fd86cab5-f718-410d-a2ff-7d168c5c4c34", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "(\n", + " _df_geojson_50k\n", + " .write\n", + " .format(\"delta\")\n", + " .mode(\"overwrite\")\n", + " .saveAsTable(f\"building_50k\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e5052760-55fc-400c-8cd0-f5d0c204d030", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
count
50,000
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "50,000" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "count", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql select format_number(count(1), 0) as count from building_50k" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1868cd58-c401-41df-86dc-8700bffce9d7", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
base_bblbincnstrct_yrdoitt_idfeat_codegeomsourceglobalidgroundelevheightrooflstmoddatelststatypempluto_bblnameshape_areashape_lengeom_wktis_valid
40320800044574701192512016345110Photogramm{0493656C-85A1-4943-9061-281C9FEB33BA}6415.042017-08-17T00:00:00.000Constructed4032080004null0.00.0MULTIPOLYGON (((-73.8545691983773 40.71240691824138, -73.85450439415116 40.71238320422216, -73.8545268433918 40.712347683085575, -73.85459854980775 40.712373923641536, -73.85457857898825 40.71240552349207, -73.85457167676932 40.712402996941606, -73.8545691983773 40.71240691824138)))true
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "4032080004", + "4574701", + "1925", + "1201634", + "5110", + "Photogramm", + "{0493656C-85A1-4943-9061-281C9FEB33BA}", + "64", + "15.04", + "2017-08-17T00:00:00.000", + "Constructed", + "4032080004", + null, + "0.0", + "0.0", + "MULTIPOLYGON (((-73.8545691983773 40.71240691824138, -73.85450439415116 40.71238320422216, -73.8545268433918 40.712347683085575, -73.85459854980775 40.712373923641536, -73.85457857898825 40.71240552349207, -73.85457167676932 40.712402996941606, -73.8545691983773 40.71240691824138)))", + true + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "base_bbl", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "bin", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "cnstrct_yr", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "doitt_id", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "feat_code", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geomsource", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "globalid", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "groundelev", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "heightroof", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "lstmoddate", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "lststatype", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": 
"mpluto_bbl", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "name", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "shape_area", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "shape_len", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geom_wkt", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "is_valid", + "type": "\"boolean\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql select * from building_50k limit 1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "5815c93b-9e89-458a-9ae3-8623d4ca23e7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup NYC Taxi Data (`taxi_trip` | 1M)\n", + "\n", + "> This data is available as part of `databricks-datasets` for customer. We are just going to take 1M trips for our purposes.\n", + "\n", + "__Will write sample out to Delta Lake__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8025bdfa-d10d-41e5-ba8e-6589e41295cf", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "(\n", + " spark.table(\"delta.`/databricks-datasets/nyctaxi/tables/nyctaxi_yellow`\")\n", + " .sample(0.001)\n", + " .withColumn(\n", + " \"pickup_point\", mos.st_aswkt(mos.st_point(F.col(\"pickup_longitude\"), F.col(\"pickup_latitude\")))\n", + " )\n", + " .withColumn(\n", + " \"dropoff_point\", mos.st_aswkt(mos.st_point(F.col(\"dropoff_longitude\"), F.col(\"dropoff_latitude\")))\n", + " )\n", + " .limit(1_000_000)\n", + " .write\n", + " .format(\"delta\")\n", + " .mode(\"overwrite\")\n", + " .saveAsTable(f\"taxi_trip_1m\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + 
"application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "724edd96-e852-485f-8369-234d3aba8dd4", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
count
1,000,000
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "1,000,000" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "count", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql select format_number(count(1), 0) as count from taxi_trip_1m" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ef9ca8a5-8c77-4b84-8a6a-3d0a3ccae64e", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
vendor_idpickup_datetimedropoff_datetimepassenger_counttrip_distancepickup_longitudepickup_latituderate_code_idstore_and_fwd_flagdropoff_longitudedropoff_latitudepayment_typefare_amountextramta_taxtip_amounttolls_amounttotal_amountpickup_pointdropoff_point
VTS2009-11-29T03:24:00.000+00002009-11-29T03:39:00.000+000015.2-73.98892240.722nullnull-73.95042240.7836CASH14.10.50.50.00.015.1POINT (-73.988922 40.722)POINT (-73.950422 40.7836)
VTS2009-11-15T01:03:00.000+00002009-11-15T01:14:00.000+000012.75-74.00879240.708683nullnull-73.99070840.732917CASH8.90.50.50.00.09.9POINT (-74.008792 40.708683)POINT (-73.990708 40.732917)
VTS2009-11-18T18:44:00.000+00002009-11-18T19:04:00.000+000013.81-74.00937540.712577nullnull-73.98143540.760865CASH12.51.00.50.00.014.0POINT (-74.009375 40.712577)POINT (-73.981435 40.760865)
CMT2009-11-04T22:53:38.000+00002009-11-04T23:04:20.000+000012.9-73.99742440.721479null0-73.97495340.758131Cash9.30.50.50.00.010.3POINT (-73.997424 40.721479)POINT (-73.974953 40.758131)
CMT2009-11-29T00:52:18.000+00002009-11-29T01:05:12.000+000013.9-73.9988140.734645null0-73.98792940.779451Cash11.30.50.50.00.012.3POINT (-73.99881 40.734645)POINT (-73.987929 40.779451)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "VTS", + "2009-11-29T03:24:00.000+0000", + "2009-11-29T03:39:00.000+0000", + 1, + 5.2, + -73.988922, + 40.722, + null, + null, + -73.950422, + 40.7836, + "CASH", + 14.1, + 0.5, + 0.5, + 0.0, + 0.0, + 15.1, + "POINT (-73.988922 40.722)", + "POINT (-73.950422 40.7836)" + ], + [ + "VTS", + "2009-11-15T01:03:00.000+0000", + "2009-11-15T01:14:00.000+0000", + 1, + 2.75, + -74.008792, + 40.708683, + null, + null, + -73.990708, + 40.732917, + "CASH", + 8.9, + 0.5, + 0.5, + 0.0, + 0.0, + 9.9, + "POINT (-74.008792 40.708683)", + "POINT (-73.990708 40.732917)" + ], + [ + "VTS", + "2009-11-18T18:44:00.000+0000", + "2009-11-18T19:04:00.000+0000", + 1, + 3.81, + -74.009375, + 40.712577, + null, + null, + -73.981435, + 40.760865, + "CASH", + 12.5, + 1.0, + 0.5, + 0.0, + 0.0, + 14.0, + "POINT (-74.009375 40.712577)", + "POINT (-73.981435 40.760865)" + ], + [ + "CMT", + "2009-11-04T22:53:38.000+0000", + "2009-11-04T23:04:20.000+0000", + 1, + 2.9, + -73.997424, + 40.721479, + null, + "0", + -73.974953, + 40.758131, + "Cash", + 9.3, + 0.5, + 0.5, + 0.0, + 0.0, + 10.3, + "POINT (-73.997424 40.721479)", + "POINT (-73.974953 40.758131)" + ], + [ + "CMT", + "2009-11-29T00:52:18.000+0000", + "2009-11-29T01:05:12.000+0000", + 1, + 3.9, + -73.99881, + 40.734645, + null, + "0", + -73.987929, + 40.779451, + "Cash", + 11.3, + 0.5, + 0.5, + 0.0, + 0.0, + 12.3, + "POINT (-73.99881 40.734645)", + "POINT (-73.987929 40.779451)" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, 
+ "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "vendor_id", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "pickup_datetime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "dropoff_datetime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "passenger_count", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "trip_distance", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_longitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_latitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "rate_code_id", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "store_and_fwd_flag", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "dropoff_longitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "dropoff_latitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "payment_type", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "fare_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "extra", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "mta_tax", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "tip_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "tolls_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "total_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_point", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "dropoff_point", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql select * from taxi_trip_1m limit 5" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2e3912f0-89af-4454-a8fd-44489e6c3736", + "showTitle": false, + "title": "" + } + }, + 
"source": [ + "## Verify" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "100931ad-f0b2-4e11-a6cc-01a87652c28a", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
databasetableNameisTemporary
mosaic_spatial_knnbuilding_50kfalse
mosaic_spatial_knntaxi_trip_1mfalse
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "mosaic_spatial_knn", + "building_50k", + false + ], + [ + "mosaic_spatial_knn", + "taxi_trip_1m", + false + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "database", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "tableName", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "isTemporary", + "type": "\"boolean\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql show tables from mosaic_spatial_knn" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "da89da17-dbdf-4e4c-9729-b1b557f77e60", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
col_namedata_typecomment
base_bblstringnull
binstringnull
cnstrct_yrstringnull
doitt_idstringnull
feat_codestringnull
geomsourcestringnull
globalidstringnull
groundelevstringnull
heightroofstringnull
lstmoddatestringnull
lststatypestringnull
mpluto_bblstringnull
namestringnull
shape_areastringnull
shape_lenstringnull
geom_wktstringnull
is_validbooleannull
# Detailed Table Information
Catalogmjohns
Databasemosaic_spatial_knn
Tablebuilding_50k
Created TimeMon Nov 27 16:50:40 UTC 2023
Last AccessUNKNOWN
Created BySpark
TypeMANAGED
Locations3://databricks-e2demofieldengwest/b169b504-4c54-49f2-bc3a-adf4b128f36d/tables/e9b1c374-ff02-4adf-8a7b-9be9f53ea1aa
Providerdelta
Ownermjohns@databricks.com
Is_managed_locationtrue
Predictive OptimizationENABLE (inherited from METASTORE unity-catalog-demo)
Table Properties[delta.minReaderVersion=1,delta.minWriterVersion=2]
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "base_bbl", + "string", + null + ], + [ + "bin", + "string", + null + ], + [ + "cnstrct_yr", + "string", + null + ], + [ + "doitt_id", + "string", + null + ], + [ + "feat_code", + "string", + null + ], + [ + "geomsource", + "string", + null + ], + [ + "globalid", + "string", + null + ], + [ + "groundelev", + "string", + null + ], + [ + "heightroof", + "string", + null + ], + [ + "lstmoddate", + "string", + null + ], + [ + "lststatype", + "string", + null + ], + [ + "mpluto_bbl", + "string", + null + ], + [ + "name", + "string", + null + ], + [ + "shape_area", + "string", + null + ], + [ + "shape_len", + "string", + null + ], + [ + "geom_wkt", + "string", + null + ], + [ + "is_valid", + "boolean", + null + ], + [ + "", + "", + "" + ], + [ + "# Detailed Table Information", + "", + "" + ], + [ + "Catalog", + "mjohns", + "" + ], + [ + "Database", + "mosaic_spatial_knn", + "" + ], + [ + "Table", + "building_50k", + "" + ], + [ + "Created Time", + "Mon Nov 27 16:50:40 UTC 2023", + "" + ], + [ + "Last Access", + "UNKNOWN", + "" + ], + [ + "Created By", + "Spark ", + "" + ], + [ + "Type", + "MANAGED", + "" + ], + [ + "Location", + "s3://databricks-e2demofieldengwest/b169b504-4c54-49f2-bc3a-adf4b128f36d/tables/e9b1c374-ff02-4adf-8a7b-9be9f53ea1aa", + "" + ], + [ + "Provider", + "delta", + "" + ], + [ + "Owner", + "mjohns@databricks.com", + "" + ], + [ + "Is_managed_location", + "true", + "" + ], + [ + "Predictive Optimization", + "ENABLE (inherited from METASTORE unity-catalog-demo)", + "" + ], + [ + "Table Properties", + "[delta.minReaderVersion=1,delta.minWriterVersion=2]", + "" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + 
"overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{\"comment\":\"name of the column\"}", + "name": "col_name", + "type": "\"string\"" + }, + { + "metadata": "{\"comment\":\"data type of the column\"}", + "name": "data_type", + "type": "\"string\"" + }, + { + "metadata": "{\"comment\":\"comment of the column\"}", + "name": "comment", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql \n", + "-- notice this is a managed table (see 'Location' col_name)\n", + "describe table extended building_50k" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "20012234-f169-4f80-b613-4418defb56da", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
col_namedata_typecomment
vendor_idstringnull
pickup_datetimetimestampnull
dropoff_datetimetimestampnull
passenger_countintnull
trip_distancedoublenull
pickup_longitudedoublenull
pickup_latitudedoublenull
rate_code_idintnull
store_and_fwd_flagstringnull
dropoff_longitudedoublenull
dropoff_latitudedoublenull
payment_typestringnull
fare_amountdoublenull
extradoublenull
mta_taxdoublenull
tip_amountdoublenull
tolls_amountdoublenull
total_amountdoublenull
pickup_pointstringnull
dropoff_pointstringnull
# Detailed Table Information
Catalogmjohns
Databasemosaic_spatial_knn
Tabletaxi_trip_1m
Created TimeMon Nov 27 16:54:48 UTC 2023
Last AccessUNKNOWN
Created BySpark
TypeMANAGED
Locations3://databricks-e2demofieldengwest/b169b504-4c54-49f2-bc3a-adf4b128f36d/tables/ffbbebe2-8d1d-4c17-973f-5c00fdcb4faf
Providerdelta
Ownermjohns@databricks.com
Is_managed_locationtrue
Predictive OptimizationENABLE (inherited from METASTORE unity-catalog-demo)
Table Properties[delta.minReaderVersion=1,delta.minWriterVersion=2]
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "vendor_id", + "string", + null + ], + [ + "pickup_datetime", + "timestamp", + null + ], + [ + "dropoff_datetime", + "timestamp", + null + ], + [ + "passenger_count", + "int", + null + ], + [ + "trip_distance", + "double", + null + ], + [ + "pickup_longitude", + "double", + null + ], + [ + "pickup_latitude", + "double", + null + ], + [ + "rate_code_id", + "int", + null + ], + [ + "store_and_fwd_flag", + "string", + null + ], + [ + "dropoff_longitude", + "double", + null + ], + [ + "dropoff_latitude", + "double", + null + ], + [ + "payment_type", + "string", + null + ], + [ + "fare_amount", + "double", + null + ], + [ + "extra", + "double", + null + ], + [ + "mta_tax", + "double", + null + ], + [ + "tip_amount", + "double", + null + ], + [ + "tolls_amount", + "double", + null + ], + [ + "total_amount", + "double", + null + ], + [ + "pickup_point", + "string", + null + ], + [ + "dropoff_point", + "string", + null + ], + [ + "", + "", + "" + ], + [ + "# Detailed Table Information", + "", + "" + ], + [ + "Catalog", + "mjohns", + "" + ], + [ + "Database", + "mosaic_spatial_knn", + "" + ], + [ + "Table", + "taxi_trip_1m", + "" + ], + [ + "Created Time", + "Mon Nov 27 16:54:48 UTC 2023", + "" + ], + [ + "Last Access", + "UNKNOWN", + "" + ], + [ + "Created By", + "Spark ", + "" + ], + [ + "Type", + "MANAGED", + "" + ], + [ + "Location", + "s3://databricks-e2demofieldengwest/b169b504-4c54-49f2-bc3a-adf4b128f36d/tables/ffbbebe2-8d1d-4c17-973f-5c00fdcb4faf", + "" + ], + [ + "Provider", + "delta", + "" + ], + [ + "Owner", + "mjohns@databricks.com", + "" + ], + [ + "Is_managed_location", + "true", + "" + ], + [ + "Predictive Optimization", + "ENABLE (inherited from METASTORE 
unity-catalog-demo)", + "" + ], + [ + "Table Properties", + "[delta.minReaderVersion=1,delta.minWriterVersion=2]", + "" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{\"comment\":\"name of the column\"}", + "name": "col_name", + "type": "\"string\"" + }, + { + "metadata": "{\"comment\":\"data type of the column\"}", + "name": "data_type", + "type": "\"string\"" + }, + { + "metadata": "{\"comment\":\"comment of the column\"}", + "name": "comment", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql \n", + "-- notice this is a managed table (see 'Location' col_name)\n", + "describe table extended taxi_trip_1m" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e68d70ce-31d0-45bc-99a2-91b46f8ed7c9", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Optional: Clean up initial GeoJSON\n", + "\n", + "> Now that the building data (sample) is in Delta Lake, we don't need it." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c1b003da-e31f-4a47-91a8-cb473ff566c4", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
pathnamesizemodificationTime
dbfs:/mjohns@databricks.com/geospatial/mosaic/data/spatial_knn/nyc_building_footprints.geojsonnyc_building_footprints.geojson8756735361701103503000
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "dbfs:/mjohns@databricks.com/geospatial/mosaic/data/spatial_knn/nyc_building_footprints.geojson", + "nyc_building_footprints.geojson", + 875673536, + 1701103503000 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "path", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "name", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "size", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "modificationTime", + "type": "\"long\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "display(dbutils.fs.ls(raw_path))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "49f5ac2a-4ed0-48bb-9e84-3d32755ab4c5", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# -- uncomment to remove geojson file --\n", + "# dbutils.fs.rm(f\"{raw_path}/{building_filename}\")" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 85549842133948, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "01. 
Data Prep", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/examples/python/SpatialKNN/01. Data Prep.py b/notebooks/examples/python/SpatialKNN/01. Data Prep.py deleted file mode 100644 index e55be66cc..000000000 --- a/notebooks/examples/python/SpatialKNN/01. Data Prep.py +++ /dev/null @@ -1,257 +0,0 @@ -# Databricks notebook source -# MAGIC %md ## Setup -# MAGIC -# MAGIC > Generates the following in database `mosaic_spatial_knn`: (1) table `building_50k`, (2) table `trip_1m`. These are sufficient samples of the full data for this example. - -# COMMAND ---------- - -# MAGIC %pip install databricks-mosaic --quiet - -# COMMAND ---------- - -import os -from pyspark.sql import functions as F -from pyspark.sql.functions import col, udf -from pyspark.sql.types import * - -import mosaic as mos - -spark.conf.set("spark.databricks.labs.mosaic.geometry.api", "JTS") -mos.enable_mosaic(spark, dbutils) - -spark.conf.set("spark.databricks.optimizer.adaptive.enabled", "false") -spark.conf.set("spark.sql.shuffle.partitions", 512) - -# COMMAND ---------- - -# MAGIC %md __Setup Data Location__ -# MAGIC -# MAGIC > You can alter this, of course, to match your preferred location.
- -# COMMAND ---------- - -user_name = dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get() - -raw_path = f"dbfs:/{user_name}/geospatial/mosaic/data/spatial_knn" -raw_fuse_path = raw_path.replace("dbfs:","/dbfs") -dbutils.fs.mkdirs(raw_path) - -os.environ['RAW_PATH'] = raw_path -os.environ['RAW_FUSE_PATH'] = raw_fuse_path - -print(f"The raw data will be stored in {raw_path}") - -# COMMAND ---------- - -building_filename = "nyc_building_footprints.geojson" -os.environ['BUILDING_FILENAME'] = building_filename - -# COMMAND ---------- - -db_name = "mosaic_spatial_knn" -sql(f"CREATE DATABASE IF NOT EXISTS {db_name}") - -# COMMAND ---------- - -# MAGIC %md ## Setup NYC Building Data (`Building` Table | 50K) -# MAGIC -# MAGIC > While the overall data size is ~1.1M, we are going to just take 50K for purposes of this example. - -# COMMAND ---------- - -# MAGIC %md __Download Data (789MB)__ - -# COMMAND ---------- - -import requests -import pathlib - -def download_url(data_location, dataset_subpath, url): - local_path = pathlib.Path(data_location.replace('dbfs:/', '/dbfs/')) - local_path.mkdir(parents=True, exist_ok=True) - req = requests.get(url) - with open(local_path / dataset_subpath, 'wb') as f: - f.write(req.content) - -# COMMAND ---------- - -# buildings - data preview = https://data.cityofnewyork.us/Housing-Development/Building-Footprints/nqwf-w8eh -download_url(raw_path, building_filename, "https://data.cityofnewyork.us/api/geospatial/nqwf-w8eh?method=export&format=GeoJSON") - -# COMMAND ---------- - -display(dbutils.fs.ls(raw_path)) - -# COMMAND ---------- - -ls -l --block-size=M $RAW_FUSE_PATH/$BUILDING_FILENAME - -# COMMAND ---------- - -# MAGIC %md __Generate DataFrame__ - -# COMMAND ---------- - -@udf(returnType=StringType()) -def fix_geojson(gj_dict): - """ - This GeoJSON has coordinates nested as a string, - so standardize here to avoid issues, gets to same as - expected when `to_json("feature.geometry")` is - normally called. 
- """ - import json - - r_list = [] - for l in gj_dict['coordinates']: - if isinstance(l,str): - r_list.append(json.loads(l)) - else: - r_list.append(l) - - return json.dumps( - { - "type": gj_dict['type'], - "coordinates": r_list - } - ) - -# COMMAND ---------- - -spark.catalog.clearCache() - -_df_geojson_raw = ( - spark.read - .option("multiline", "true") - .format("json") - .load(f"{raw_path}/{building_filename}") - .select("type", F.explode(col("features")).alias("feature")) - .repartition(24) - .select( - "type", - "feature.properties", - fix_geojson("feature.geometry").alias("json_geometry") - ) - .cache() -) - -print(f"count? {_df_geojson_raw.count():,}") -display(_df_geojson_raw.limit(1)) - -# COMMAND ---------- - -_df_geojson = ( - _df_geojson_raw - .withColumn("geom", mos.st_geomfromgeojson("json_geometry")) - .withColumn("geom_wkt", mos.st_astext("geom")) - .withColumn("is_valid", mos.st_isvalid("geom_wkt")) - .select("properties.*", "geom_wkt", "is_valid") -) - -# print(f"count? {_df_geojson.count():,}") -# display(_df_geojson.limit(1)) - -# COMMAND ---------- - -# MAGIC %md __Get Sample of 50K__ - -# COMMAND ---------- - -_df_geojson_50k = ( - _df_geojson - .sample(0.05) - .limit(50_000) -) - -print(f"count? {_df_geojson_50k.count():,}") - -# COMMAND ---------- - -# MAGIC %md __Write out to Delta Lake__ - -# COMMAND ---------- - -( - _df_geojson_50k - .write - .format("delta") - .mode("overwrite") - .saveAsTable(f"{db_name}.building_50k") -) - -# COMMAND ---------- - -# MAGIC %sql select format_number(count(1), 0) as count from mosaic_spatial_knn.building_50k - -# COMMAND ---------- - -# MAGIC %sql select * from mosaic_spatial_knn.building_50k limit 5 - -# COMMAND ---------- - -# MAGIC %md ## Setup NYC Taxi Data (`taxi_trip` | 1M) -# MAGIC -# MAGIC > This data is available as part of `databricks-datasets` for customer. We are just going to take 1M trips for our purposes. 
-# MAGIC -# MAGIC __Will write sample out to Delta Lake__ - -# COMMAND ---------- - -( - spark.table("delta.`/databricks-datasets/nyctaxi/tables/nyctaxi_yellow`") - .sample(0.001) - .withColumn( - "pickup_point", mos.st_aswkt(mos.st_point(F.col("pickup_longitude"), F.col("pickup_latitude"))) - ) - .withColumn( - "dropoff_point", mos.st_aswkt(mos.st_point(F.col("dropoff_longitude"), F.col("dropoff_latitude"))) - ) - .limit(1_000_000) - .write - .format("delta") - .mode("overwrite") - .saveAsTable(f"{db_name}.taxi_trip_1m") -) - -# COMMAND ---------- - -# MAGIC %sql select format_number(count(1), 0) as count from mosaic_spatial_knn.taxi_trip_1m - -# COMMAND ---------- - -# MAGIC %sql select * from mosaic_spatial_knn.taxi_trip_1m limit 5 - -# COMMAND ---------- - -# MAGIC %md ## Verify - -# COMMAND ---------- - -# MAGIC %sql show tables from mosaic_spatial_knn - -# COMMAND ---------- - -# MAGIC %sql -# MAGIC -- notice this is a managed table (see 'Location' col_name) -# MAGIC describe table extended mosaic_spatial_knn.building_50k - -# COMMAND ---------- - -# MAGIC %sql -# MAGIC -- notice this is a managed table (see 'Location' col_name) -# MAGIC describe table extended mosaic_spatial_knn.taxi_trip_1m - -# COMMAND ---------- - -# MAGIC %md ## Optional: Clean up initial GeoJSON -# MAGIC -# MAGIC > Now that the building data (sample) is in Delta Lake, we don't need it. - -# COMMAND ---------- - -display(dbutils.fs.ls(raw_path)) - -# COMMAND ---------- - -# -- uncomment to remove geojson file -- -# dbutils.fs.rm(f"{raw_path}/{building_filename}") diff --git a/notebooks/examples/python/SpatialKNN/02. Spatial KNN.ipynb b/notebooks/examples/python/SpatialKNN/02. Spatial KNN.ipynb new file mode 100644 index 000000000..a92bdf5e3 --- /dev/null +++ b/notebooks/examples/python/SpatialKNN/02. 
Spatial KNN.ipynb @@ -0,0 +1,2213 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "97ceecad-501b-405b-9bd5-63818a6bd442", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Scalable KNN on Databricks with Mosaic\n", + "\n", + "> See [Blog](https://medium.com/@milos.colic/scalable-spatial-nearest-neighbours-with-mosaic-336ce37edbae) | [Mosaic Docs](https://databrickslabs.github.io/mosaic/models/spatial-knn.html) | [SpatialKNN API](https://github.com/databrickslabs/mosaic/blob/main/python/mosaic/models/knn/spatial_knn.py) -- __Note:__ Make sure you run this on Databricks ML Runtime.\n", + "\n", + "

\n", + "\n", + "1. To use Databricks Labs [Mosaic](https://databrickslabs.github.io/mosaic/index.html) library for geospatial data engineering, analysis, and visualization functionality:\n", + " * Install with `%pip install databricks-mosaic`\n", + " * Import and use with the following:\n", + " ```\n", + " import mosaic as mos\n", + " mos.enable_mosaic(spark, dbutils)\n", + " ```\n", + "

\n", + "\n", + "2. To use [KeplerGl](https://kepler.gl/) OSS library for map layer rendering:\n", + " * Already installed with Mosaic, use `%%mosaic_kepler` magic [[Mosaic Docs](https://databrickslabs.github.io/mosaic/usage/kepler.html)]\n", + " * Import with `from keplergl import KeplerGl` to use directly\n", + "\n", + "If you have trouble with Volume access:\n", + "\n", + "* For Mosaic 0.3 series (< DBR 13) - you can copy resources to DBFS as a workaround\n", + "* For Mosaic 0.4 series (DBR 13.3 LTS) - you will need to either copy resources to DBFS or setup for Unity Catalog + Shared Access which will involve your workspace admin. Instructions, as updated, will be [here](https://databrickslabs.github.io/mosaic/usage/install-gdal.html).\n", + "\n", + "---\n", + "__Last Updated:__ 27 NOV 2023 [Mosaic 0.3.12]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2327ed26-e078-4026-b45f-f4232fa2599e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> Usually when asserting the notion of nearest neighbors we bound that notion to the _K_ neighbors, if left unbound the answers produced by the analysis are basically orderings of the whole data assets based on the proximity/distance and the computational costs to produce such outputs can be very prohibitive since they would result in comparing all features across all data assets.\n", + "\n", + "__Optimized Algorithm (Right Side Below)__\n", + "

\n", + "\n", + "1. For each geometry in set L generate a kloop (hollow ring)\n", + "1. Generate match candidates within the kloop\n", + "1. For each match candidate C calculate the distance to the landmark\n", + "1. For each L[i] count the matches; stop if count = k \n", + "1. If count < k, increase the size of the kloop; repeat (s1)\n", + "1. If count > k, remove matches furthest from the L[i]; stop\n", + "1. Optional: early stopping if no new match candidates are found in the kloop of any L geometry for N iterations \n", + "1. Continue with the next kloop up to max iterations\n", + "1. Return C geometries with smallest distance to each L[i]" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b20fb87a-0588-4989-83e2-9d73f089e346", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "\n\n", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%python\n", + "\n", + "displayHTML(f\"\"\"\n", + "\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "bd1269ad-372e-4715-8af2-6630aad4e009", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Install + Enable Mosaic" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b5446ea4-29ac-49c3-968e-71b9f78cc698", +
"showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Python interpreter will be restarted.\nPython interpreter will be restarted.\n" + ] + } + ], + "source": [ + "%pip install \"databricks-mosaic<0.4,>=0.3\" --quiet # <- Mosaic 0.3 series\n", + "# %pip install \"databricks-mosaic<0.5,>=0.4\" --quiet # <- Mosaic 0.4 series (as available)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6e1412c9-1696-4d06-9d52-e5f91a4ded8f", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# -- configure AQE for more compute heavy operations\n", + "# - choose option-1 or option-2 below, essential for REPARTITION!\n", + "# spark.conf.set(\"spark.databricks.optimizer.adaptive.enabled\", False) # <- option-1: turn off completely for full control\n", + "spark.conf.set(\"spark.sql.adaptive.coalescePartitions.enabled\", False) # <- option-2: just tweak partition management\n", + "spark.conf.set(\"spark.sql.shuffle.partitions\", 1_024) # <-- default is 200\n", + "\n", + "# -- import databricks + spark functions\n", + "from pyspark.databricks.sql import functions as dbf\n", + "from pyspark.sql import functions as F\n", + "from pyspark.sql.functions import col, udf\n", + "from pyspark.sql.types import *\n", + "\n", + "# -- setup mosaic\n", + "import mosaic as mos\n", + "\n", + "mos.enable_mosaic(spark, dbutils)\n", + "# mos.enable_gdal(spark) # <- not needed for this example\n", + "\n", + "# --other imports\n", + "import warnings\n", + "\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": 
"13077c0f-eb5f-40c9-a483-5c3292f89a43", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "username? 'mjohns@databricks.com'\n" + ] + } + ], + "source": [ + "user_name = dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()\n", + "print(f\"username? '{user_name}'\")\n", + " \n", + "spark.sparkContext.setCheckpointDir(f\"dbfs:/tmp/mosaic/{user_name}/checkpoints\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1c1399b0-d923-4cc5-9784-4ad0df1a915e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Setup Catalog and Schema__\n", + "\n", + "> These values will mirror the Data Prep notebook." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1c2ea1a6-6214-49f0-bf6f-364ac734a40b", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[4]: DataFrame[]" + ] + } + ], + "source": [ + "catalog_name = \"mjohns\"\n", + "db_name = \"mosaic_spatial_knn\"\n", + "sql(f\"USE CATALOG {catalog_name}\")\n", + "sql(f\"USE SCHEMA {db_name}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cd3d5bce-9450-49d5-93ca-0df6d37923ca", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
databasetableNameisTemporary
mosaic_spatial_knnbuilding_50kfalse
mosaic_spatial_knntaxi_trip_1mfalse
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "mosaic_spatial_knn", + "building_50k", + false + ], + [ + "mosaic_spatial_knn", + "taxi_trip_1m", + false + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "database", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "tableName", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "isTemporary", + "type": "\"boolean\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql show tables" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3542b314-9230-4747-9870-571398bd8d05", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Load Landmark + Candidates Tables\n", + "\n", + "> We will load a handful of datasets we have prepared in our data prep notebook. For this use case we will first manually walk through the approach and then we will apply the model that comes with mosaic."
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9b14f217-ad5a-4fd6-b59a-d8556e444f81", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Bldg POINT count? 1,160\nBldg MULTIPOLYGON count? 48,840\nTrip count? 1,000,000\n" + ] + } + ], + "source": [ + "df_bldg = spark.read.table(\"building_50k\").where(mos.st_geometrytype(F.col(\"geom_wkt\")) == \"POINT\")\n", + "df_bldg_shape = spark.read.table(\"building_50k\").where(mos.st_geometrytype(F.col(\"geom_wkt\")) == \"MULTIPOLYGON\")\n", + "df_trip = spark.read.table(\"taxi_trip_1m\")\n", + "\n", + "# sanity checks on counts (may vary based on your sample)\n", + "print(f\"Bldg POINT count? {df_bldg.count():,}\")\n", + "print(f\"Bldg MULTIPOLYGON count? {df_bldg_shape.count():,}\")\n", + "print(f\"Trip count? {df_trip.count():,}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "bbeb3509-ca83-4a2c-a76a-2ae37551c9ce", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Render with Kepler\n", + "> We will render our building shapes and krings and kdiscs / kloops around the shapes; showing 1% subset of building, you can pan and zoom in the viewport." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7fd2b313-4d49-4089-84a3-04dbfd57938a", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "621dca4f-2c35-4be7-97b7-346e588116cf", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b7e3986e-f42d-4dd7-9c87-116001c79dcd", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# df_bldg_shape \"geom_wkt\" \"geometry\" 500" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "50fd814d-7db7-46ff-b727-4d36d3be8a3e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> In order to find out the nearest neighbors we can create a kring around each of our points of interest. For that purpose mosaic comes with geometry aware kring and kdisc / kloop (hexring) implementations.
These expressions also have their auto-explode versions that we are going to use here. It is much easier to join already exploded cell IDs between 2 datasets. __Note: the gridding system is h3 [[1](https://docs.databricks.com/en/sql/language-manual/sql-ref-h3-geospatial-functions.html)|[2](https://h3geo.org/)] by default; this example uses resolution 9 and does just 1 kring around the geometry.__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "15c88bf4-75c8-4deb-bf18-7f50cade8151", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "with_kring_1 = df_bldg_shape.select(\n", + " \"geom_wkt\",\n", + " mos.grid_geometrykringexplode(\"geom_wkt\", F.lit(9), F.lit(1))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a6fe03e8-6452-4a7e-93c8-55855d9508da", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3a2d3407-172e-4468-81ea-bb470a3f26f1", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results._" + ] + }, + { + "cell_type": "code", + 
"execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2b1061a2-864e-468d-96e8-d7c4bdb1dba1", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# with_kring_1 \"cellId\" \"h3\" 500" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "26ae803f-0bca-4a00-ac36-cdb0fbf5f8e1", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> But what do we do if we don't have enough neighbors in the krings we just ran? We need to keep iterating. Our second iteration and all iterations onward are kdisc / kloop based. This allows us to only compare candidates we absolutely need to compare. __Note: example uses resolution=`9` again and kloop=`2`.__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "53f863b6-a923-4dbf-82f1-c9f01a121126", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "with_kdisc_2 = df_bldg_shape.select(\n", + " \"geom_wkt\",\n", + " mos.grid_geometrykloopexplode(\"geom_wkt\", F.lit(9), F.lit(2))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8e7baf60-1882-41b2-bdc0-fdfa2bae22a3", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], +
"metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ae69fd7e-e092-48f4-a017-876173afece4", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fb4cc584-6ec6-455d-8edb-6e07e31e47e1", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# with_kdisc_2 \"cellId\" \"h3\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "befd0a7f-a478-4e70-9526-d0fda29a0d23", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> This is great, but what about complex shapes that do not require radial catchment areas? What about data like streets or rivers? Mosaic's implementation of geometry aware krings and kloops can be used here as well (not shown).\n", + "\n", + "```\n", + "with_kdisc_3 = streets.select(\n", + " F.col(\"geometry\"),\n", + " mos.grid_geometrykloopexplode(\"geometry\", F.lit(9), F.lit(2))\n", + ")\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2c9b8b43-564c-4897-bd1f-b5df4a7a8985", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Prep for KNN\n", + "\n", + "> There are a lot of things to keep track of if one is to implement a scalable KNN approach.
Luckily, Mosaic comes with an implementation of a Spark transformer that can do all of those steps for us. __Note: The following requires [Databricks Runtime for Machine Learning](https://docs.databricks.com/en/release-notes/runtime/index.html).__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "207c838c-3f71-493a-9ab0-f3bd4464db8a", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "2023/11/27 17:38:37 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.\n2023/11/27 17:38:37 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.ml.\n" + ] + } + ], + "source": [ + "from mosaic.models import SpatialKNN\n", + "import mlflow\n", + "\n", + "mlflow.autolog(disable=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "7230d971-c6aa-4295-876c-4bc01469ac16", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Look at Landmarks (`df_bldg_shape` | ~48K)__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2648390e-b776-4074-aab9-9e823443d19f", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "landmarks (building shapes) count? 48,840\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
base_bblbincnstrct_yrdoitt_idfeat_codegeomsourceglobalidgroundelevheightrooflstmoddatelststatypempluto_bblnameshape_areashape_lengeom_wktis_valid
40320800044574701192512016345110Photogramm{0493656C-85A1-4943-9061-281C9FEB33BA}6415.042017-08-17T00:00:00.000Constructed4032080004null0.00.0MULTIPOLYGON (((-73.8545691983773 40.71240691824138, -73.85450439415116 40.71238320422216, -73.8545268433918 40.712347683085575, -73.85459854980775 40.712373923641536, -73.85457857898825 40.71240552349207, -73.85457167676932 40.712402996941606, -73.8545691983773 40.71240691824138)))true
4034560010408256819307175432100Photogramm{16EF9995-7325-48E6-A47B-83014B157BC6}7637.212017-08-22T00:00:00.000Constructed4034560010null0.00.0MULTIPOLYGON (((-73.90834122368813 40.70186200477241, -73.90835071312429 40.70185270643405, -73.90831293543681 40.70183037695949, -73.90830344599857 40.7018396752948, -73.90822862016675 40.7017954473286, -73.90825754228445 40.70176710604632, -73.90828798097903 40.70173727921914, -73.90836280796816 40.701781507148, -73.90835706146972 40.70178713797922, -73.90835417637156 40.701789965091926, -73.90839195404996 40.70181229455309, -73.90840058564466 40.70180383660631, -73.90848623665563 40.701854463103636, -73.908426875912 40.70191263131447, -73.90834122368813 40.70186200477241)))true
5043130025510545019804306322100Photogramm{93D7D4EA-36F2-4160-9571-EFAA31AF05AD}8425.142017-08-22T00:00:00.000Constructed5043130025null0.00.0MULTIPOLYGON (((-74.1269721146288 40.57417949547929, -74.12694018673571 40.57417522445134, -74.12693909742006 40.574179962418604, -74.12691535940886 40.57417678727848, -74.12679587556198 40.574160806118144, -74.12681595670063 40.57407351313443, -74.12688779788068 40.57408312255846, -74.12699832508223 40.57409790501996, -74.12698118065005 40.574172434344625, -74.12697933467022 40.57418046096693, -74.1269721146288 40.57417949547929)))true
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "4032080004", + "4574701", + "1925", + "1201634", + "5110", + "Photogramm", + "{0493656C-85A1-4943-9061-281C9FEB33BA}", + "64", + "15.04", + "2017-08-17T00:00:00.000", + "Constructed", + "4032080004", + null, + "0.0", + "0.0", + "MULTIPOLYGON (((-73.8545691983773 40.71240691824138, -73.85450439415116 40.71238320422216, -73.8545268433918 40.712347683085575, -73.85459854980775 40.712373923641536, -73.85457857898825 40.71240552349207, -73.85457167676932 40.712402996941606, -73.8545691983773 40.71240691824138)))", + true + ], + [ + "4034560010", + "4082568", + "1930", + "717543", + "2100", + "Photogramm", + "{16EF9995-7325-48E6-A47B-83014B157BC6}", + "76", + "37.21", + "2017-08-22T00:00:00.000", + "Constructed", + "4034560010", + null, + "0.0", + "0.0", + "MULTIPOLYGON (((-73.90834122368813 40.70186200477241, -73.90835071312429 40.70185270643405, -73.90831293543681 40.70183037695949, -73.90830344599857 40.7018396752948, -73.90822862016675 40.7017954473286, -73.90825754228445 40.70176710604632, -73.90828798097903 40.70173727921914, -73.90836280796816 40.701781507148, -73.90835706146972 40.70178713797922, -73.90835417637156 40.701789965091926, -73.90839195404996 40.70181229455309, -73.90840058564466 40.70180383660631, -73.90848623665563 40.701854463103636, -73.908426875912 40.70191263131447, -73.90834122368813 40.70186200477241)))", + true + ], + [ + "5043130025", + "5105450", + "1980", + "430632", + "2100", + "Photogramm", + "{93D7D4EA-36F2-4160-9571-EFAA31AF05AD}", + "84", + "25.14", + "2017-08-22T00:00:00.000", + "Constructed", + "5043130025", + null, + "0.0", + "0.0", + "MULTIPOLYGON (((-74.1269721146288 40.57417949547929, -74.12694018673571 40.57417522445134, -74.12693909742006 
40.574179962418604, -74.12691535940886 40.57417678727848, -74.12679587556198 40.574160806118144, -74.12681595670063 40.57407351313443, -74.12688779788068 40.57408312255846, -74.12699832508223 40.57409790501996, -74.12698118065005 40.574172434344625, -74.12697933467022 40.57418046096693, -74.1269721146288 40.57417949547929)))", + true + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "base_bbl", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "bin", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "cnstrct_yr", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "doitt_id", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "feat_code", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geomsource", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "globalid", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "groundelev", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "heightroof", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "lstmoddate", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "lststatype", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "mpluto_bbl", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "name", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "shape_area", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "shape_len", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geom_wkt", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "is_valid", + "type": "\"boolean\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + 
], + "source": [ + "print(f\"landmarks (building shapes) count? {df_bldg_shape.count():,}\")\n", + "df_bldg_shape.limit(3).display()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6e81d255-ee83-47e1-ab25-62a8d5527243", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Look at Candidates (`df_trip` | 1M)__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4b7780e0-9a31-4658-8e07-41a8dbb74f8e", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\tcandidates (trips) count? 1,000,000\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
vendor_idpickup_datetimedropoff_datetimepassenger_counttrip_distancepickup_longitudepickup_latituderate_code_idstore_and_fwd_flagdropoff_longitudedropoff_latitudepayment_typefare_amountextramta_taxtip_amounttolls_amounttotal_amountpickup_pointdropoff_point
VTS2009-11-29T03:24:00.000+00002009-11-29T03:39:00.000+000015.2-73.98892240.722nullnull-73.95042240.7836CASH14.10.50.50.00.015.1POINT (-73.988922 40.722)POINT (-73.950422 40.7836)
VTS2009-11-15T01:03:00.000+00002009-11-15T01:14:00.000+000012.75-74.00879240.708683nullnull-73.99070840.732917CASH8.90.50.50.00.09.9POINT (-74.008792 40.708683)POINT (-73.990708 40.732917)
VTS2009-11-18T18:44:00.000+00002009-11-18T19:04:00.000+000013.81-74.00937540.712577nullnull-73.98143540.760865CASH12.51.00.50.00.014.0POINT (-74.009375 40.712577)POINT (-73.981435 40.760865)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "VTS", + "2009-11-29T03:24:00.000+0000", + "2009-11-29T03:39:00.000+0000", + 1, + 5.2, + -73.988922, + 40.722, + null, + null, + -73.950422, + 40.7836, + "CASH", + 14.1, + 0.5, + 0.5, + 0.0, + 0.0, + 15.1, + "POINT (-73.988922 40.722)", + "POINT (-73.950422 40.7836)" + ], + [ + "VTS", + "2009-11-15T01:03:00.000+0000", + "2009-11-15T01:14:00.000+0000", + 1, + 2.75, + -74.008792, + 40.708683, + null, + null, + -73.990708, + 40.732917, + "CASH", + 8.9, + 0.5, + 0.5, + 0.0, + 0.0, + 9.9, + "POINT (-74.008792 40.708683)", + "POINT (-73.990708 40.732917)" + ], + [ + "VTS", + "2009-11-18T18:44:00.000+0000", + "2009-11-18T19:04:00.000+0000", + 1, + 3.81, + -74.009375, + 40.712577, + null, + null, + -73.981435, + 40.760865, + "CASH", + 12.5, + 1.0, + 0.5, + 0.0, + 0.0, + 14.0, + "POINT (-74.009375 40.712577)", + "POINT (-73.981435 40.760865)" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "vendor_id", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "pickup_datetime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "dropoff_datetime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "passenger_count", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "trip_distance", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_longitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": 
"pickup_latitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "rate_code_id", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "store_and_fwd_flag", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "dropoff_longitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "dropoff_latitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "payment_type", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "fare_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "extra", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "mta_tax", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "tip_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "tolls_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "total_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_point", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "dropoff_point", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "print(f\"\\tcandidates (trips) count? {df_trip.count():,}\")\n", + "df_trip.limit(3).display()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "45c65747-5925-47d0-8a2f-b5a2dd93ca75", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Run the KNN Transform\n", + "\n", + "> In this example we will compare ~50K building shapes (polygons) to 1M taxi trips (points). Since this approach is defined as an algorithm it can be easily chained. E.g. 
We could, as a follow-on, check using another instance of the knn model which streets are closest to the set of taxi trips that are identified in the first run (not shown).\n", + "\n", + "The transformer has the following parameters, from [here](https://databrickslabs.github.io/mosaic/models/spatial-knn.html):\n", + "\n", + "

\n", + "\n", + "* `candidatesDf`: the dataframe containing the geometries that will be used as candidates for the KNN search\n", + "* `candidatesFeatureCol`: the name of the column that contains the candidates geometries\n", + "* `candidatesRowID`: the name of the column that contains the candidates ids\n", + "* `landmarksFeatureCol`: the name of the column that contains the landmarks geometries\n", + "* `landmarksRowID`: the name of the column that contains the landmarks ids\n", + "* `kNeighbours`: the number of neighbours to return\n", + "* `maxIterations`: the maximum number of iterations to perform\n", + "* `distanceThreshold`: the distance threshold to stop the iterations (in CRS units)\n", + "* `earlyStopIterations`: the number of subsequent iterations upon which to stop if no new neighbours\n", + "* `checkpointTablePrefix`: the prefix of the checkpoint table\n", + "* `indexResolution`: the resolution of the index (grid system specific)\n", + "* `approximate`: whether to stop after max iterations (approximate = true) or to perform the finalisation step (approximate = false) - no default value, the caller must specify this parameter" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6931dae2-6f94-4b57-b3c1-905c2d2de839", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "with mlflow.start_run(): \n", + "\n", + " knn = SpatialKNN()\n", + " knn.setUseTableCheckpoint(True)\n", + " knn.setCheckpointTablePrefix(\"checkpoint_table_knn\")\n", + " knn.model.cleanupCheckpoint\n", + " \n", + " knn.setApproximate(True)\n", + " knn.setKNeighbours(20)\n", + " knn.setIndexResolution(10)\n", + " knn.setMaxIterations(10)\n", + " knn.setEarlyStopIterations(3)\n", + " knn.setDistanceThreshold(1.0)\n", + " \n", + " knn.setLandmarksFeatureCol(\"geom_wkt\")\n", + " 
knn.setLandmarksRowID(\"landmarks_id\")\n", + " \n", + " knn.setCandidatesFeatureCol(\"pickup_point\")\n", + " knn.setCandidatesRowID(\"candidates_id\")\n", + " knn.setCandidatesDf(df_trip.where(\"pickup_point is not null\"))\n", + "\n", + " df_neigh = knn.transform(df_bldg_shape)\n", + " \n", + " mlflow.log_params(knn.getParams())\n", + " mlflow.log_metrics(knn.getMetrics())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f0e5e9a2-1cd1-426d-a6f9-9372a39694f0", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Here is an example of a generated (reproducible) experiment run._\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "abba45f2-0155-428c-9f92-8644bfc0c5d2", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "0b784847-5d8c-423e-ba2c-d2a692672f74", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Generate KNN Transform Result Table `transform_result` (~620K)__\n", + "\n", + "> Write out the results from `df_neigh` to delta lake " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": 
{ + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6afe00db-01ca-4f18-8424-593fc70f9ef2", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "(\n", + " df_neigh\n", + " .write\n", + " .format(\"delta\")\n", + " .mode(\"overwrite\")\n", + " .saveAsTable(f\"transform_result\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "df08f36f-01b4-4316-8cea-f45cfc754936", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "SpatialKNN transform count? 672,085\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "

landmarks_idcandidates_idbase_bblbincnstrct_yrdoitt_idfeat_codegeomsourceglobalidgroundelevheightrooflstmoddatelststatypempluto_bblnameshape_areashape_lengeom_wktis_validiterationmatch_radiusvendor_idpickup_datetimedropoff_datetimepassenger_counttrip_distancepickup_longitudepickup_latituderate_code_idstore_and_fwd_flagdropoff_longitudedropoff_latitudepayment_typefare_amountextramta_taxtip_amounttolls_amounttotal_amountpickup_pointdropoff_pointgeom_wkt_pickup_point_distanceneighbour_number
2711806532039990048204229419265999122100Photogramm{7BD3CEAE-C07F-4813-B459-B63E4C65DCE3}4123.812017-08-22T00:00:00.000Constructed2039990048null0.00.0MULTIPOLYGON (((-73.85008652889444 40.84198940079596, -73.85004043789522 40.84202461954545, -73.84988295321862 40.84214495395065, -73.84982454680676 40.84210086986926, -73.8498946040775 40.84204733842935, -73.8500281224862 40.841945315915964, -73.85008652889444 40.84198940079596)))true2nullVTS2009-05-15T03:59:00.000+00002009-05-15T04:03:00.000+000020.84-73.84823740.843122nullnull-73.83659740.843998CASH4.10.5null0.00.04.6POINT (-73.848237 40.843122)POINT (-73.836597 40.843998)0.00188759413210478051
2717078572039990048204229419265999122100Photogramm{7BD3CEAE-C07F-4813-B459-B63E4C65DCE3}4123.812017-08-22T00:00:00.000Constructed2039990048null0.00.0MULTIPOLYGON (((-73.85008652889444 40.84198940079596, -73.85004043789522 40.84202461954545, -73.84988295321862 40.84214495395065, -73.84982454680676 40.84210086986926, -73.8498946040775 40.84204733842935, -73.8500281224862 40.841945315915964, -73.85008652889444 40.84198940079596)))true2nullVTS2012-10-16T11:36:00.000+00002012-10-16T11:50:00.000+000011.91-73.85021740.8443151null-73.84589340.838313CRD10.50.00.51.00.012.0POINT (-73.850217 40.844315)POINT (-73.845893 40.838313)0.00219560631909489862
2716270882039990048204229419265999122100Photogramm{7BD3CEAE-C07F-4813-B459-B63E4C65DCE3}4123.812017-08-22T00:00:00.000Constructed2039990048null0.00.0MULTIPOLYGON (((-73.85008652889444 40.84198940079596, -73.85004043789522 40.84202461954545, -73.84988295321862 40.84214495395065, -73.84982454680676 40.84210086986926, -73.8498946040775 40.84204733842935, -73.8500281224862 40.841945315915964, -73.85008652889444 40.84198940079596)))true2nullCMT2010-09-06T19:58:23.000+00002010-09-06T20:06:24.000+000022.6-73.84688240.8421231N-73.8431640.843225CRD8.10.50.51.360.010.46POINT (-73.846882 40.842123)POINT (-73.84316 40.843225)0.0029426300230721023
2713255702039990048204229419265999122100Photogramm{7BD3CEAE-C07F-4813-B459-B63E4C65DCE3}4123.812017-08-22T00:00:00.000Constructed2039990048null0.00.0MULTIPOLYGON (((-73.85008652889444 40.84198940079596, -73.85004043789522 40.84202461954545, -73.84988295321862 40.84214495395065, -73.84982454680676 40.84210086986926, -73.8498946040775 40.84204733842935, -73.8500281224862 40.841945315915964, -73.85008652889444 40.84198940079596)))true30.002942630023072102VTS2011-08-09T15:07:00.000+00002011-08-09T15:52:00.000+000025.39-73.8457240.8426371null-73.84111540.84522CRD23.30.00.54.660.028.46POINT (-73.84572 40.842637)POINT (-73.841115 40.84522)0.0041394130025831834
2712198882039990048204229419265999122100Photogramm{7BD3CEAE-C07F-4813-B459-B63E4C65DCE3}4123.812017-08-22T00:00:00.000Constructed2039990048null0.00.0MULTIPOLYGON (((-73.85008652889444 40.84198940079596, -73.85004043789522 40.84202461954545, -73.84988295321862 40.84214495395065, -73.84982454680676 40.84210086986926, -73.8498946040775 40.84204733842935, -73.8500281224862 40.841945315915964, -73.85008652889444 40.84198940079596)))true50.006319279550955254VTS2012-04-06T15:48:00.000+00002012-04-06T16:10:00.000+000054.98-73.85041740.8371681null-73.85041740.837168CSH15.30.00.50.00.015.8POINT (-73.850417 40.837168)POINT (-73.850417 40.837168)0.004793117261415765
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 271, + 180653, + "2039990048", + "2042294", + "1926", + "599912", + "2100", + "Photogramm", + "{7BD3CEAE-C07F-4813-B459-B63E4C65DCE3}", + "41", + "23.81", + "2017-08-22T00:00:00.000", + "Constructed", + "2039990048", + null, + "0.0", + "0.0", + "MULTIPOLYGON (((-73.85008652889444 40.84198940079596, -73.85004043789522 40.84202461954545, -73.84988295321862 40.84214495395065, -73.84982454680676 40.84210086986926, -73.8498946040775 40.84204733842935, -73.8500281224862 40.841945315915964, -73.85008652889444 40.84198940079596)))", + true, + 2, + null, + "VTS", + "2009-05-15T03:59:00.000+0000", + "2009-05-15T04:03:00.000+0000", + 2, + 0.84, + -73.848237, + 40.843122, + null, + null, + -73.836597, + 40.843998, + "CASH", + 4.1, + 0.5, + null, + 0.0, + 0.0, + 4.6, + "POINT (-73.848237 40.843122)", + "POINT (-73.836597 40.843998)", + 0.0018875941321047805, + 1 + ], + [ + 271, + 707857, + "2039990048", + "2042294", + "1926", + "599912", + "2100", + "Photogramm", + "{7BD3CEAE-C07F-4813-B459-B63E4C65DCE3}", + "41", + "23.81", + "2017-08-22T00:00:00.000", + "Constructed", + "2039990048", + null, + "0.0", + "0.0", + "MULTIPOLYGON (((-73.85008652889444 40.84198940079596, -73.85004043789522 40.84202461954545, -73.84988295321862 40.84214495395065, -73.84982454680676 40.84210086986926, -73.8498946040775 40.84204733842935, -73.8500281224862 40.841945315915964, -73.85008652889444 40.84198940079596)))", + true, + 2, + null, + "VTS", + "2012-10-16T11:36:00.000+0000", + "2012-10-16T11:50:00.000+0000", + 1, + 1.91, + -73.850217, + 40.844315, + 1, + null, + -73.845893, + 40.838313, + "CRD", + 10.5, + 0.0, + 0.5, + 1.0, + 0.0, + 12.0, + "POINT (-73.850217 40.844315)", + "POINT (-73.845893 40.838313)", + 
0.0021956063190948986, + 2 + ], + [ + 271, + 627088, + "2039990048", + "2042294", + "1926", + "599912", + "2100", + "Photogramm", + "{7BD3CEAE-C07F-4813-B459-B63E4C65DCE3}", + "41", + "23.81", + "2017-08-22T00:00:00.000", + "Constructed", + "2039990048", + null, + "0.0", + "0.0", + "MULTIPOLYGON (((-73.85008652889444 40.84198940079596, -73.85004043789522 40.84202461954545, -73.84988295321862 40.84214495395065, -73.84982454680676 40.84210086986926, -73.8498946040775 40.84204733842935, -73.8500281224862 40.841945315915964, -73.85008652889444 40.84198940079596)))", + true, + 2, + null, + "CMT", + "2010-09-06T19:58:23.000+0000", + "2010-09-06T20:06:24.000+0000", + 2, + 2.6, + -73.846882, + 40.842123, + 1, + "N", + -73.84316, + 40.843225, + "CRD", + 8.1, + 0.5, + 0.5, + 1.36, + 0.0, + 10.46, + "POINT (-73.846882 40.842123)", + "POINT (-73.84316 40.843225)", + 0.002942630023072102, + 3 + ], + [ + 271, + 325570, + "2039990048", + "2042294", + "1926", + "599912", + "2100", + "Photogramm", + "{7BD3CEAE-C07F-4813-B459-B63E4C65DCE3}", + "41", + "23.81", + "2017-08-22T00:00:00.000", + "Constructed", + "2039990048", + null, + "0.0", + "0.0", + "MULTIPOLYGON (((-73.85008652889444 40.84198940079596, -73.85004043789522 40.84202461954545, -73.84988295321862 40.84214495395065, -73.84982454680676 40.84210086986926, -73.8498946040775 40.84204733842935, -73.8500281224862 40.841945315915964, -73.85008652889444 40.84198940079596)))", + true, + 3, + 0.002942630023072102, + "VTS", + "2011-08-09T15:07:00.000+0000", + "2011-08-09T15:52:00.000+0000", + 2, + 5.39, + -73.84572, + 40.842637, + 1, + null, + -73.841115, + 40.84522, + "CRD", + 23.3, + 0.0, + 0.5, + 4.66, + 0.0, + 28.46, + "POINT (-73.84572 40.842637)", + "POINT (-73.841115 40.84522)", + 0.004139413002583183, + 4 + ], + [ + 271, + 219888, + "2039990048", + "2042294", + "1926", + "599912", + "2100", + "Photogramm", + "{7BD3CEAE-C07F-4813-B459-B63E4C65DCE3}", + "41", + "23.81", + "2017-08-22T00:00:00.000", + "Constructed", + 
"2039990048", + null, + "0.0", + "0.0", + "MULTIPOLYGON (((-73.85008652889444 40.84198940079596, -73.85004043789522 40.84202461954545, -73.84988295321862 40.84214495395065, -73.84982454680676 40.84210086986926, -73.8498946040775 40.84204733842935, -73.8500281224862 40.841945315915964, -73.85008652889444 40.84198940079596)))", + true, + 5, + 0.006319279550955254, + "VTS", + "2012-04-06T15:48:00.000+0000", + "2012-04-06T16:10:00.000+0000", + 5, + 4.98, + -73.850417, + 40.837168, + 1, + null, + -73.850417, + 40.837168, + "CSH", + 15.3, + 0.0, + 0.5, + 0.0, + 0.0, + 15.8, + "POINT (-73.850417 40.837168)", + "POINT (-73.850417 40.837168)", + 0.00479311726141576, + 5 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "landmarks_id", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "candidates_id", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "base_bbl", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "bin", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "cnstrct_yr", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "doitt_id", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "feat_code", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geomsource", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "globalid", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "groundelev", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "heightroof", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "lstmoddate", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "lststatype", + "type": "\"string\"" + }, + { + "metadata": "{}", + 
"name": "mpluto_bbl", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "name", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "shape_area", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "shape_len", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geom_wkt", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "is_valid", + "type": "\"boolean\"" + }, + { + "metadata": "{}", + "name": "iteration", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "match_radius", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "vendor_id", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "pickup_datetime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "dropoff_datetime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "passenger_count", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "trip_distance", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_longitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_latitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "rate_code_id", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "store_and_fwd_flag", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "dropoff_longitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "dropoff_latitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "payment_type", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "fare_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "extra", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "mta_tax", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "tip_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "tolls_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "total_amount", + "type": "\"double\"" + 
}, + { + "metadata": "{}", + "name": "pickup_point", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "dropoff_point", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geom_wkt_pickup_point_distance", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "neighbour_number", + "type": "\"integer\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "df_result = spark.table(f\"transform_result\")\n", + "print(f\"SpatialKNN transform count? {df_result.count():,}\")\n", + "df_result.limit(5).display() # <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "0ce58d33-a998-4246-a222-28a7281f3ab8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Render Transform Results\n", + "\n", + "> Finally we can render our knn sets (from `df_neigh`) in kepler and verify that results make sense." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d8e8557d-1e0d-41e6-9bc8-ad8397ba7207", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "knn_geoms = df_result.select(\"geom_wkt\", \"pickup_point\", \"dropoff_point\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "326cfb02-f91a-4eba-8f20-96ea2a8a6828", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ed99f908-8e82-402b-8fc2-ec85a4565d1e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2389db0c-4e4a-47af-9bb3-88ffab3ded91", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# knn_geoms \"geom_wkt\" \"geometry\" " + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + 
"language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 85549842141315, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "02. Spatial KNN", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/examples/python/SpatialKNN/02. Spatial KNN.py b/notebooks/examples/python/SpatialKNN/02. Spatial KNN.py deleted file mode 100644 index 980ff05bd..000000000 --- a/notebooks/examples/python/SpatialKNN/02. Spatial KNN.py +++ /dev/null @@ -1,242 +0,0 @@ -# Databricks notebook source -# MAGIC %md -# MAGIC # Scalable KNN on Databricks with Mosaic -# MAGIC -# MAGIC > See [[Blog](https://medium.com/@milos.colic/scalable-spatial-nearest-neighbours-with-mosaic-336ce37edbae) | [Mosaic Docs](https://databrickslabs.github.io/mosaic/models/spatial-knn.html) | [SpatialKNN API](https://github.com/databrickslabs/mosaic/blob/main/python/mosaic/models/knn/spatial_knn.py)] -# MAGIC -# MAGIC _Note: Make sure you run this on Databricks ML Runtime._ - -# COMMAND ---------- - -# MAGIC %md -# MAGIC > Usually when asserting the notion of nearest neighbors we bound that notion to the _K_ neighbors, if left unbound the answers produced by the analysis are basically orderings of the whole data assets based on the proximity/distance and the computational costs to produce such outputs can be very prohibitive since they would result in comparing all features across all data assets. -# MAGIC -# MAGIC __Optimized Algorithm (Right Side Below)__ -# MAGIC

-# MAGIC -# MAGIC 1. For each geometry in set L generate a kloop (hollow ring) -# MAGIC 1. Generate match candidates within -# MAGIC 1. For each match candidate C calculate the distance to the landmark -# MAGIC 1. For each L[i] count the matches; stop if count = k -# MAGIC 1. If count < k, increase the size of the kloop; repeat (s1) -# MAGIC 1. If count > k, remove matches furthest from the L[i]; stop -# MAGIC 1. Optional: early stopping if no new match candidates are found in the kloop of any L geometry for N iterations -# MAGIC 1. Continue with the next kloop up to max iterations -# MAGIC 1. Return C geometries with smallest distance to each L[i] - -# COMMAND ---------- - -# MAGIC %python -# MAGIC -# MAGIC displayHTML(f""" -# MAGIC -# MAGIC """) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Install + Enable Mosaic - -# COMMAND ---------- - -# MAGIC %pip install databricks-mosaic --quiet - -# COMMAND ---------- - -from pyspark.sql import functions as F -from pyspark.sql.functions import col, udf - -import mosaic as mos - -spark.conf.set("spark.databricks.labs.mosaic.geometry.api", "JTS") -mos.enable_mosaic(spark, dbutils) - -# COMMAND ---------- - -user_name = dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get() -print(f"username? '{user_name}'") - -spark.sparkContext.setCheckpointDir(f"dbfs:/tmp/mosaic/{user_name}/checkpoints") -spark.conf.set("spark.databricks.optimizer.adaptive.enabled", "false") -spark.conf.set("spark.sql.shuffle.partitions", 512) - -# COMMAND ---------- - -db_name = "mosaic_spatial_knn" -sql(f"use {db_name}") - -# COMMAND ---------- - -# MAGIC %sql show tables - -# COMMAND ---------- - -# MAGIC %md ## Load Landmark + Candidates Tables -# MAGIC -# MAGIC > We will load a handfull of datasets we have prepared in our data prep notebook. For this use case we will first manually walk through the approach and then we will apply the model that comes with mosaic. 
- -# COMMAND ---------- - -df_bldg = spark.read.table("building_50k").where(mos.st_geometrytype(F.col("geom_wkt")) == "Point") -df_bldg_shape = spark.read.table("building_50k").where(mos.st_geometrytype(F.col("geom_wkt")) == "MultiPolygon") -df_trip = spark.read.table("taxi_trip_1m") - -# COMMAND ---------- - -# MAGIC %md ## Render with Kepler -# MAGIC > We will render our building shapes and krings and kdiscs / kloops around the shapes. - -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC df_bldg_shape "geom_wkt" "geometry" 500 - -# COMMAND ---------- - -# MAGIC %md -# MAGIC > In order to find out the nearest neighbors we can create a kring around each of our point of interests. For that purpose mosaic comes with geometry concious kring and kdisc / kloop (hexring) implementations. These expressions also have their auto-explode versions that we are going to use here. It is much easier to join already exploded cell IDs between 2 datasets. - -# COMMAND ---------- - -with_kring_1 = df_bldg_shape.select( - F.col("geom_wkt"), - mos.grid_geometrykringexplode("geom_wkt", F.lit(9), F.lit(1)) -) - -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC with_kring_1 "cellId" "h3" 500 - -# COMMAND ---------- - -# MAGIC %md -# MAGIC > But what do we do if we dont have enough neighbors in the krings we just ran? We need to keep iterating. Our second iteration and all iterations onward are kdisc / kloop based. This allows us to only compare candidates we absolutely need to compare. - -# COMMAND ---------- - -with_kdisc_2 = df_bldg_shape.select( - F.col("geom_wkt"), - mos.grid_geometrykloopexplode("geom_wkt", F.lit(9), F.lit(2)) -) - -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC with_kdisc_2 "cellId" "h3" - -# COMMAND ---------- - -# MAGIC %md -# MAGIC > This is great, but what about complex shapes that are do not require radial catchment areas? What about data like streets or rivers? 
Mosaic's implementation of geometry concious krings and kloops can be used here as well (not shown). -# MAGIC -# MAGIC ``` -# MAGIC with_kdisc_3 = streets.select( -# MAGIC F.col("geometry"), -# MAGIC mos.grid_geometrykloopexplode("geometry", F.lit(9), F.lit(2)) -# MAGIC ) -# MAGIC ``` - -# COMMAND ---------- - -# MAGIC %md ## Prep for KNN -# MAGIC -# MAGIC > There are a lot of things to keep track of if one is to implemet a scalable KNN approach. Luckily Mosaic comes with an implemetation of a spark transformer that can do all of those steps for us. - -# COMMAND ---------- - -from mosaic.models import SpatialKNN -import mlflow -mlflow.autolog(disable=False) - -# COMMAND ---------- - -# MAGIC %md __Look at Landmarks (`df_bldg_shape` | ~48K)__ - -# COMMAND ---------- - -print(f"landmarks (building shapes) count? {df_bldg_shape.count():,}") -df_bldg_shape.limit(3).display() - -# COMMAND ---------- - -# MAGIC %md __Look at Candidates (`df_trip` | 1M)__ - -# COMMAND ---------- - -print(f"\tcandidates (trips) count? {df_trip.count():,}") -df_trip.limit(3).display() - -# COMMAND ---------- - -# MAGIC %md ## Run the KNN Transform -# MAGIC -# MAGIC > In this example we will compare ~50K building shapes (polygons) to 1M taxi trips (points). Since this approach is defined as an algorithm it can be easily chained. E.g. We could, as a follow-on, check using another instance of the knn model which streets are closest to the set of taxi trips that are idetified in the first run (not shown). 
- -# COMMAND ---------- - -with mlflow.start_run(): - - knn = SpatialKNN() - knn.setUseTableCheckpoint(True) - knn.setCheckpointTablePrefix("checkpoint_table_knn") - knn.model.cleanupCheckpoint - - knn.setApproximate(True) - knn.setKNeighbours(20) - knn.setIndexResolution(10) - knn.setMaxIterations(10) - knn.setEarlyStopIterations(3) - knn.setDistanceThreshold(1.0) - - knn.setLandmarksFeatureCol("geom_wkt") - knn.setLandmarksRowID("landmarks_id") - - knn.setCandidatesFeatureCol("pickup_point") - knn.setCandidatesRowID("candidates_id") - knn.setCandidatesDf(df_trip.where("pickup_point is not null")) - - df_neigh = knn.transform(df_bldg_shape) - - mlflow.log_params(knn.getParams()) - mlflow.log_metrics(knn.getMetrics()) - -# COMMAND ---------- - -# MAGIC %md __Generate KNN Transform Result Table `transform_result` (~620K)__ -# MAGIC -# MAGIC > Write out the results from `df_neigh` to delta lake - -# COMMAND ---------- - -( - df_neigh - .write - .format("delta") - .mode("overwrite") - .saveAsTable(f"{db_name}.transform_result") -) - -# COMMAND ---------- - -df_result = spark.table(f"{db_name}.transform_result") -print(f"SpatialKNN transform count? {df_result.count():,}") -df_result.display() - -# COMMAND ---------- - -# MAGIC %md ## Render Transform Results -# MAGIC -# MAGIC > Finally we can render our knn sets (from `df_neigh`) in kepler and verify that results make sense. - -# COMMAND ---------- - -knn_geoms = df_result.select("geom_wkt", "pickup_point", "dropoff_point") - -# COMMAND ---------- - -# MAGIC %%mosaic_kepler -# MAGIC knn_geoms "geom_wkt" "geometry" diff --git a/notebooks/examples/python/SpatialKNN/README.md b/notebooks/examples/python/SpatialKNN/README.md index bfa267138..8a1731385 100644 --- a/notebooks/examples/python/SpatialKNN/README.md +++ b/notebooks/examples/python/SpatialKNN/README.md @@ -2,6 +2,8 @@ ### This is a self-contained example for running Spatial K-Nearest Neighbors in Mosaic. 
+> Note: `ipynb` files can be previewed in GitHub and can also be imported into Databricks, more [here](https://docs.databricks.com/en/notebooks/notebook-export-import.html). + __Notebooks + Tables:__

diff --git a/notebooks/examples/python/TransformBNG/README.md b/notebooks/examples/python/TransformBNG/README.md new file mode 100644 index 000000000..01a488a89 --- /dev/null +++ b/notebooks/examples/python/TransformBNG/README.md @@ -0,0 +1,3 @@ +# Transform + Join British National Grid + +> Note: `ipynb` files can be previewed in GitHub and can also be imported into Databricks, more [here](https://docs.databricks.com/en/notebooks/notebook-export-import.html); also, though this focuses on transforming from EPSG:4326 into BNG Coordinate Reference System, the example is applicable for any [ST_Transform](https://databrickslabs.github.io/mosaic/api/spatial-functions.html#st_transform) pattern. diff --git a/notebooks/examples/python/TransformBNG/transform_join_bng.ipynb b/notebooks/examples/python/TransformBNG/transform_join_bng.ipynb new file mode 100644 index 000000000..8b7a622e9 --- /dev/null +++ b/notebooks/examples/python/TransformBNG/transform_join_bng.ipynb @@ -0,0 +1,5556 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "8b99536c-bb54-4da1-a917-7a8ba43bac82", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# BNG: Transform + Join\n", + "\n", + "> Example of transforming WGS84 (EPSG:4326) into British National Grid (EPSG:27700), performing a spatial point-in-polygon join, and then generating a heat map of the results. More at Mosaic Docs [[BNG](https://databrickslabs.github.io/mosaic/usage/grid-indexes-bng.html) | [ST_Transform](https://databrickslabs.github.io/mosaic/api/spatial-functions.html#st_transform) | [ST_SetSRID](https://databrickslabs.github.io/mosaic/api/spatial-functions.html#st_setsrid)].\n", + "\n", + "1. 
To use Databricks Labs [Mosaic](https://databrickslabs.github.io/mosaic/index.html) library for geospatial data engineering, analysis, and visualization functionality:\n", + " * Install with `%pip install databricks-mosaic`\n", + " * Import and use with the following:\n", + " ```\n", + " import mosaic as mos\n", + " mos.enable_mosaic(spark, dbutils)\n", + " ```\n", + "

\n", + "\n", + "2. To use [KeplerGl](https://kepler.gl/) OSS library for map layer rendering:\n", + " * Already installed with Mosaic, use `%%mosaic_kepler` magic [[Mosaic Docs](https://databrickslabs.github.io/mosaic/usage/kepler.html)]\n", + " * Import with `from keplergl import KeplerGl` to use directly\n", + "\n", + "If you have trouble with Volume access:\n", + "\n", + "* For Mosaic 0.3 series (< DBR 13) - you can copy resources to DBFS as a workaround\n", + "* For Mosaic 0.4 series (DBR 13.3 LTS) - you will need to either copy resources to DBFS or setup for Unity Catalog + Shared Access which will involve your workspace admin. Instructions, as updated, will be [here](https://databrickslabs.github.io/mosaic/usage/install-gdal.html).\n", + "\n", + "--- \n", + " __Last Update__ 28 NOV 2023 [Mosaic 0.3.12]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "939b71aa-e0b2-49b5-859c-c8eac6a55d62", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Install Mosaic\n", + "\n", + "> Mosaic framework is available via pip install and it comes with bindings for Python, SQL, Scala and R. The wheel file coming with pip installation is registering any necessary jars for other language support." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "13cae37b-6613-424b-a6fa-fadfc04b1680", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Python interpreter will be restarted.\nPython interpreter will be restarted.\n" + ] + } + ], + "source": [ + "%pip install \"databricks-mosaic<0.4,>=0.3\" --quiet # <- Mosaic 0.3 series\n", + "# %pip install \"databricks-mosaic<0.5,>=0.4\" --quiet # <- Mosaic 0.4 series (as available)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1e29541a-626f-4d51-b355-41882f0886eb", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Enable Mosaic in the notebook\n", + "\n", + "> To get started, you'll need to attach the wheel to your cluster and import instances as in the cell below. The default grid index system is set to H3. In order to use British National Grid you'll need to set the configuration parameter." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d6a60d91-69af-40fc-8e05-71bca8bd583f", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# -- configure AQE for more compute heavy operations\n", + "# - choose option-1 or option-2 below, essential for REPARTITION!\n", + "# spark.conf.set(\"spark.databricks.optimizer.adaptive.enabled\", False) # <- option-1: turn off completely for full control\n", + "spark.conf.set(\"spark.sql.adaptive.coalescePartitions.enabled\", False) # <- option-2: just tweak partition management\n", + "spark.conf.set(\"spark.sql.shuffle.partitions\", 1_024) # <-- default is 200\n", + "\n", + "# -- import databricks + spark functions\n", + "from pyspark.sql import functions as F\n", + "from pyspark.sql.functions import col, udf\n", + "from pyspark.sql.types import *\n", + "\n", + "# -- setup mosaic\n", + "import mosaic as mos\n", + "\n", + "spark.conf.set(\"spark.databricks.labs.mosaic.index.system\", \"BNG\")\n", + "mos.enable_mosaic(spark, dbutils)\n", + "# mos.enable_gdal(spark) # <- not needed for this example\n", + "\n", + "# --other imports\n", + "import os\n", + "import pathlib\n", + "import requests\n", + "import zipfile\n", + "import warnings\n", + "\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c3eb8396-ee87-4bae-8cc3-4c00179e9c9c", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Setup Catalog + Schema\n", + "\n", + "> You will want to adjust for your environment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "bb84c94d-7e68-43e9-94bc-1219d67cc927", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[47]: DataFrame[]" + ] + } + ], + "source": [ + "catalog_name = \"mjohns\"\n", + "sql(f\"USE CATALOG {catalog_name}\")\n", + "\n", + "db_name = \"london_cycling\"\n", + "sql(f\"CREATE DATABASE IF NOT EXISTS {db_name}\")\n", + "sql(f\"USE SCHEMA {db_name}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "8d6145cb-a2d6-4e79-bc6a-7921d2942775", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup Data\n", + "\n", + "> The download snippets are set up to download only once." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e8b93a88-4d2d-4c71-8672-e120a1dd4c0c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial data stored in '/tmp/mosaic/mjohns@databricks.com'\n" + ] + } + ], + "source": [ + "user_name = dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()\n", + "\n", + "data_dir = f\"/tmp/mosaic/{user_name}\"\n", + "print(f\"Initial data stored in '{data_dir}'\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6727e81c-9d4f-49a0-919d-73d4b7301ef5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Initial London Postcodes [177]\n", + "\n", + "> Make sure we have London Postcode shapes available in our environment." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ff5530b0-5ca0-4742-81f2-8affa6104e39", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "POSTCODES_DIR_FUSE? '/dbfs/tmp/mosaic/mjohns@databricks.com/postcodes'\n" + ] + } + ], + "source": [ + "postcodes_dir = f\"{data_dir}/postcodes\"\n", + "postcodes_dir_fuse = f\"/dbfs{postcodes_dir}\"\n", + "dbutils.fs.mkdirs(postcodes_dir)\n", + "\n", + "os.environ['POSTCODES_DIR_FUSE'] = postcodes_dir_fuse\n", + "print(f\"POSTCODES_DIR_FUSE? 
'{postcodes_dir_fuse}'\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "df0ee902-2731-4572-93a9-2913bd7517c7", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "File ‘/dbfs/tmp/mosaic/mjohns@databricks.com/postcodes/London_Postcode_Zones.geojson’ already there; not retrieving.\n\ntotal 937K\n-rwxrwxrwx 1 root root 937K Nov 28 15:19 London_Postcode_Zones.geojson\n" + ] + } + ], + "source": [ + "%sh \n", + "wget -P $POSTCODES_DIR_FUSE -nc https://raw.githubusercontent.com/databrickslabs/mosaic/main/notebooks/data/London_Postcode_Zones.geojson\n", + "ls -lh $POSTCODES_DIR_FUSE" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d2d365e1-290a-4589-87d0-01eea5e45ea5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Load Postcode Polygons from GeoJSON_" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5d56dff1-c178-40a8-9436-9cbde9de5140", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 177\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "

typepropertiesjson_geometrygeometry
FeatureCollectionList(E2 postcode district, E2){\"coordinates\":[[[-0.04385,51.53179],[-0.04324,51.53155],[-0.04277,51.53153],[-0.04163,51.5311],[-0.0415,51.53073],[-0.0415,51.5307],[-0.04142,51.53057],[-0.04089,51.53015],[-0.04087,51.53013],[-0.04046,51.52965],[-0.04046,51.52958],[-0.04052,51.52941],[-0.04096,51.52886],[-0.04074,51.52868],[-0.04047,51.52738],[-0.04145,51.52711],[-0.04152,51.52714],[-0.04299,51.52695],[-0.04319,51.52677],[-0.04332,51.52673],[-0.04371,51.5267],[-0.04399,51.52655],[-0.04427,51.52646],[-0.0449,51.52659],[-0.04568,51.52638],[-0.04598,51.5265],[-0.04605,51.52651],[-0.04697,51.52632],[-0.04731,51.52601],[-0.04748,51.526],[-0.04798,51.52615],[-0.04847,51.52636],[-0.04881,51.52611],[-0.04903,51.52559],[-0.04903,51.52549],[-0.04973,51.5255],[-0.04994,51.5253],[-0.05028,51.52521],[-0.05061,51.52522],[-0.0508,51.52508],[-0.05175,51.52497],[-0.05203,51.52527],[-0.05272,51.52562],[-0.05292,51.5256],[-0.05272,51.52446],[-0.05297,51.52434],[-0.05318,51.52434],[-0.05362,51.52441],[-0.05375,51.52432],[-0.05381,51.5243],[-0.05478,51.52427],[-0.05486,51.52431],[-0.05537,51.5243],[-0.0556,51.52419],[-0.05646,51.52428],[-0.0565,51.5243],[-0.05685,51.52432],[-0.05701,51.52412],[-0.05803,51.52381],[-0.05844,51.52392],[-0.0585,51.52392],[-0.0588,51.52386],[-0.05955,51.52391],[-0.05965,51.524],[-0.06114,51.52382],[-0.06126,51.52387],[-0.06152,51.52368],[-0.06277,51.52339],[-0.063,51.52324],[-0.06344,51.52324],[-0.06371,51.52332],[-0.06426,51.52333],[-0.06452,51.52321],[-0.06593,51.52312],[-0.06628,51.52297],[-0.06663,51.52309],[-0.06722,51.52323],[-0.06832,51.52308],[-0.06866,51.52297],[-0.06878,51.52284],[-0.06913,51.52281],[-0.0697,51.52274],[-0.07002,51.52294],[-0.07184,51.52302],[-0.0716,51.52335],[-0.07159,51.52355],[-0.07121,51.52386],[-0.07098,51.52393],[-0.0708,51.52436],[-0.07093,51.52452],[-0.07167,51.52459],[-0.07187,51.52454],[-0.07205,51.52435],[-0.07242,51.52414],[-0.07245,51.52412],[-0.07258,51.52409],[-0.0731,51.52414],[-0.07339,51.52448],[-
0.07379,51.52454],[-0.0741,51.52448],[-0.07412,51.52443],[-0.07475,51.52402],[-0.0749,51.52434],[-0.07499,51.52439],[-0.07573,51.52432],[-0.07592,51.52447],[-0.07614,51.52449],[-0.07638,51.52447],[-0.07704,51.52482],[-0.07715,51.52516],[-0.07678,51.52529],[-0.07674,51.5256],[-0.07684,51.52578],[-0.07761,51.52603],[-0.07733,51.52625],[-0.0771,51.52682],[-0.07758,51.52696],[-0.07776,51.52737],[-0.07836,51.52779],[-0.07773,51.5281],[-0.0777,51.52815],[-0.07775,51.52826],[-0.07888,51.52838],[-0.07891,51.52842],[-0.07891,51.52854],[-0.07912,51.52878],[-0.07922,51.52931],[-0.07914,51.52945],[-0.07915,51.52966],[-0.07837,51.52981],[-0.07814,51.53003],[-0.07765,51.53008],[-0.0776,51.53027],[-0.07725,51.53037],[-0.07732,51.53066],[-0.07678,51.53106],[-0.07724,51.53149],[-0.07669,51.53208],[-0.07723,51.53234],[-0.07781,51.53236],[-0.07782,51.53237],[-0.07788,51.53285],[-0.07798,51.53291],[-0.07734,51.53365],[-0.07741,51.53369],[-0.07787,51.53377],[-0.07828,51.53422],[-0.07721,51.5345],[-0.07697,51.53444],[-0.07678,51.53451],[-0.07675,51.53466],[-0.07699,51.53491],[-0.07749,51.53493],[-0.07784,51.53517],[-0.07773,51.53525],[-0.0771,51.53533],[-0.07706,51.53558],[-0.07707,51.53561],[-0.07733,51.53571],[-0.07733,51.53586],[-0.07689,51.53611],[-0.07704,51.53638],[-0.07668,51.53666],[-0.07614,51.53661],[-0.07573,51.53631],[-0.07495,51.53642],[-0.07462,51.53626],[-0.07396,51.5364],[-0.07315,51.53591],[-0.07302,51.53588],[-0.0728,51.536],[-0.07197,51.53602],[-0.07143,51.53581],[-0.07109,51.53597],[-0.07033,51.53598],[-0.07013,51.53583],[-0.0694,51.5358],[-0.06928,51.53588],[-0.06827,51.53599],[-0.06805,51.53582],[-0.06726,51.53579],[-0.06724,51.53579],[-0.06669,51.53573],[-0.06665,51.53574],[-0.06597,51.53571],[-0.06579,51.53564],[-0.06521,51.5356],[-0.06506,51.53568],[-0.06461,51.5357],[-0.06455,51.53566],[-0.06397,51.53544],[-0.06329,51.53555],[-0.06269,51.53536],[-0.06259,51.53535],[-0.06244,51.53545],[-0.06091,51.53522],[-0.06101,51.53499],[-0.06079,51.53463],[-0.05997,51.53429]
,[-0.06003,51.53367],[-0.05817,51.53428],[-0.05786,51.53414],[-0.05728,51.53417],[-0.05721,51.53419],[-0.05713,51.53418],[-0.05642,51.53487],[-0.05644,51.53512],[-0.0564,51.53513],[-0.05523,51.53489],[-0.05461,51.5352],[-0.05428,51.53514],[-0.05411,51.53503],[-0.05396,51.53498],[-0.05236,51.53495],[-0.05164,51.53509],[-0.05117,51.53535],[-0.0508,51.53537],[-0.04984,51.53517],[-0.04914,51.53482],[-0.04913,51.53482],[-0.0476,51.5344],[-0.04652,51.5342],[-0.04631,51.53418],[-0.04556,51.53416],[-0.04521,51.5342],[-0.04562,51.53221],[-0.04508,51.53186],[-0.04406,51.532],[-0.04385,51.53179]]],\"type\":\"Polygon\"}List(5, 4326, List(List(List(-0.04385, 51.53179), List(-0.04324, 51.53155), List(-0.04277, 51.53153), List(-0.04163, 51.5311), List(-0.0415, 51.53073), List(-0.0415, 51.5307), List(-0.04142, 51.53057), List(-0.04089, 51.53015), List(-0.04087, 51.53013), List(-0.04046, 51.52965), List(-0.04046, 51.52958), List(-0.04052, 51.52941), List(-0.04096, 51.52886), List(-0.04074, 51.52868), List(-0.04047, 51.52738), List(-0.04145, 51.52711), List(-0.04152, 51.52714), List(-0.04299, 51.52695), List(-0.04319, 51.52677), List(-0.04332, 51.52673), List(-0.04371, 51.5267), List(-0.04399, 51.52655), List(-0.04427, 51.52646), List(-0.0449, 51.52659), List(-0.04568, 51.52638), List(-0.04598, 51.5265), List(-0.04605, 51.52651), List(-0.04697, 51.52632), List(-0.04731, 51.52601), List(-0.04748, 51.526), List(-0.04798, 51.52615), List(-0.04847, 51.52636), List(-0.04881, 51.52611), List(-0.04903, 51.52559), List(-0.04903, 51.52549), List(-0.04973, 51.5255), List(-0.04994, 51.5253), List(-0.05028, 51.52521), List(-0.05061, 51.52522), List(-0.0508, 51.52508), List(-0.05175, 51.52497), List(-0.05203, 51.52527), List(-0.05272, 51.52562), List(-0.05292, 51.5256), List(-0.05272, 51.52446), List(-0.05297, 51.52434), List(-0.05318, 51.52434), List(-0.05362, 51.52441), List(-0.05375, 51.52432), List(-0.05381, 51.5243), List(-0.05478, 51.52427), List(-0.05486, 51.52431), List(-0.05537, 
51.5243), List(-0.0556, 51.52419), List(-0.05646, 51.52428), List(-0.0565, 51.5243), List(-0.05685, 51.52432), List(-0.05701, 51.52412), List(-0.05803, 51.52381), List(-0.05844, 51.52392), List(-0.0585, 51.52392), List(-0.0588, 51.52386), List(-0.05955, 51.52391), List(-0.05965, 51.524), List(-0.06114, 51.52382), List(-0.06126, 51.52387), List(-0.06152, 51.52368), List(-0.06277, 51.52339), List(-0.063, 51.52324), List(-0.06344, 51.52324), List(-0.06371, 51.52332), List(-0.06426, 51.52333), List(-0.06452, 51.52321), List(-0.06593, 51.52312), List(-0.06628, 51.52297), List(-0.06663, 51.52309), List(-0.06722, 51.52323), List(-0.06832, 51.52308), List(-0.06866, 51.52297), List(-0.06878, 51.52284), List(-0.06913, 51.52281), List(-0.0697, 51.52274), List(-0.07002, 51.52294), List(-0.07184, 51.52302), List(-0.0716, 51.52335), List(-0.07159, 51.52355), List(-0.07121, 51.52386), List(-0.07098, 51.52393), List(-0.0708, 51.52436), List(-0.07093, 51.52452), List(-0.07167, 51.52459), List(-0.07187, 51.52454), List(-0.07205, 51.52435), List(-0.07242, 51.52414), List(-0.07245, 51.52412), List(-0.07258, 51.52409), List(-0.0731, 51.52414), List(-0.07339, 51.52448), List(-0.07379, 51.52454), List(-0.0741, 51.52448), List(-0.07412, 51.52443), List(-0.07475, 51.52402), List(-0.0749, 51.52434), List(-0.07499, 51.52439), List(-0.07573, 51.52432), List(-0.07592, 51.52447), List(-0.07614, 51.52449), List(-0.07638, 51.52447), List(-0.07704, 51.52482), List(-0.07715, 51.52516), List(-0.07678, 51.52529), List(-0.07674, 51.5256), List(-0.07684, 51.52578), List(-0.07761, 51.52603), List(-0.07733, 51.52625), List(-0.0771, 51.52682), List(-0.07758, 51.52696), List(-0.07776, 51.52737), List(-0.07836, 51.52779), List(-0.07773, 51.5281), List(-0.0777, 51.52815), List(-0.07775, 51.52826), List(-0.07888, 51.52838), List(-0.07891, 51.52842), List(-0.07891, 51.52854), List(-0.07912, 51.52878), List(-0.07922, 51.52931), List(-0.07914, 51.52945), List(-0.07915, 51.52966), List(-0.07837, 51.52981), 
List(-0.07814, 51.53003), List(-0.07765, 51.53008), List(-0.0776, 51.53027), List(-0.07725, 51.53037), List(-0.07732, 51.53066), List(-0.07678, 51.53106), List(-0.07724, 51.53149), List(-0.07669, 51.53208), List(-0.07723, 51.53234), List(-0.07781, 51.53236), List(-0.07782, 51.53237), List(-0.07788, 51.53285), List(-0.07798, 51.53291), List(-0.07734, 51.53365), List(-0.07741, 51.53369), List(-0.07787, 51.53377), List(-0.07828, 51.53422), List(-0.07721, 51.5345), List(-0.07697, 51.53444), List(-0.07678, 51.53451), List(-0.07675, 51.53466), List(-0.07699, 51.53491), List(-0.07749, 51.53493), List(-0.07784, 51.53517), List(-0.07773, 51.53525), List(-0.0771, 51.53533), List(-0.07706, 51.53558), List(-0.07707, 51.53561), List(-0.07733, 51.53571), List(-0.07733, 51.53586), List(-0.07689, 51.53611), List(-0.07704, 51.53638), List(-0.07668, 51.53666), List(-0.07614, 51.53661), List(-0.07573, 51.53631), List(-0.07495, 51.53642), List(-0.07462, 51.53626), List(-0.07396, 51.5364), List(-0.07315, 51.53591), List(-0.07302, 51.53588), List(-0.0728, 51.536), List(-0.07197, 51.53602), List(-0.07143, 51.53581), List(-0.07109, 51.53597), List(-0.07033, 51.53598), List(-0.07013, 51.53583), List(-0.0694, 51.5358), List(-0.06928, 51.53588), List(-0.06827, 51.53599), List(-0.06805, 51.53582), List(-0.06726, 51.53579), List(-0.06724, 51.53579), List(-0.06669, 51.53573), List(-0.06665, 51.53574), List(-0.06597, 51.53571), List(-0.06579, 51.53564), List(-0.06521, 51.5356), List(-0.06506, 51.53568), List(-0.06461, 51.5357), List(-0.06455, 51.53566), List(-0.06397, 51.53544), List(-0.06329, 51.53555), List(-0.06269, 51.53536), List(-0.06259, 51.53535), List(-0.06244, 51.53545), List(-0.06091, 51.53522), List(-0.06101, 51.53499), List(-0.06079, 51.53463), List(-0.05997, 51.53429), List(-0.06003, 51.53367), List(-0.05817, 51.53428), List(-0.05786, 51.53414), List(-0.05728, 51.53417), List(-0.05721, 51.53419), List(-0.05713, 51.53418), List(-0.05642, 51.53487), List(-0.05644, 51.53512), 
List(-0.0564, 51.53513), List(-0.05523, 51.53489), List(-0.05461, 51.5352), List(-0.05428, 51.53514), List(-0.05411, 51.53503), List(-0.05396, 51.53498), List(-0.05236, 51.53495), List(-0.05164, 51.53509), List(-0.05117, 51.53535), List(-0.0508, 51.53537), List(-0.04984, 51.53517), List(-0.04914, 51.53482), List(-0.04913, 51.53482), List(-0.0476, 51.5344), List(-0.04652, 51.5342), List(-0.04631, 51.53418), List(-0.04556, 51.53416), List(-0.04521, 51.5342), List(-0.04562, 51.53221), List(-0.04508, 51.53186), List(-0.04406, 51.532), List(-0.04385, 51.53179))), List(List()))
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "FeatureCollection", + [ + "E2 postcode district", + "E2" + ], + "{\"coordinates\":[[[-0.04385,51.53179],[-0.04324,51.53155],[-0.04277,51.53153],[-0.04163,51.5311],[-0.0415,51.53073],[-0.0415,51.5307],[-0.04142,51.53057],[-0.04089,51.53015],[-0.04087,51.53013],[-0.04046,51.52965],[-0.04046,51.52958],[-0.04052,51.52941],[-0.04096,51.52886],[-0.04074,51.52868],[-0.04047,51.52738],[-0.04145,51.52711],[-0.04152,51.52714],[-0.04299,51.52695],[-0.04319,51.52677],[-0.04332,51.52673],[-0.04371,51.5267],[-0.04399,51.52655],[-0.04427,51.52646],[-0.0449,51.52659],[-0.04568,51.52638],[-0.04598,51.5265],[-0.04605,51.52651],[-0.04697,51.52632],[-0.04731,51.52601],[-0.04748,51.526],[-0.04798,51.52615],[-0.04847,51.52636],[-0.04881,51.52611],[-0.04903,51.52559],[-0.04903,51.52549],[-0.04973,51.5255],[-0.04994,51.5253],[-0.05028,51.52521],[-0.05061,51.52522],[-0.0508,51.52508],[-0.05175,51.52497],[-0.05203,51.52527],[-0.05272,51.52562],[-0.05292,51.5256],[-0.05272,51.52446],[-0.05297,51.52434],[-0.05318,51.52434],[-0.05362,51.52441],[-0.05375,51.52432],[-0.05381,51.5243],[-0.05478,51.52427],[-0.05486,51.52431],[-0.05537,51.5243],[-0.0556,51.52419],[-0.05646,51.52428],[-0.0565,51.5243],[-0.05685,51.52432],[-0.05701,51.52412],[-0.05803,51.52381],[-0.05844,51.52392],[-0.0585,51.52392],[-0.0588,51.52386],[-0.05955,51.52391],[-0.05965,51.524],[-0.06114,51.52382],[-0.06126,51.52387],[-0.06152,51.52368],[-0.06277,51.52339],[-0.063,51.52324],[-0.06344,51.52324],[-0.06371,51.52332],[-0.06426,51.52333],[-0.06452,51.52321],[-0.06593,51.52312],[-0.06628,51.52297],[-0.06663,51.52309],[-0.06722,51.52323],[-0.06832,51.52308],[-0.06866,51.52297],[-0.06878,51.52284],[-0.06913,51.52281],[-0.0697,51.52274],[-0.070
02,51.52294],[-0.07184,51.52302],[-0.0716,51.52335],[-0.07159,51.52355],[-0.07121,51.52386],[-0.07098,51.52393],[-0.0708,51.52436],[-0.07093,51.52452],[-0.07167,51.52459],[-0.07187,51.52454],[-0.07205,51.52435],[-0.07242,51.52414],[-0.07245,51.52412],[-0.07258,51.52409],[-0.0731,51.52414],[-0.07339,51.52448],[-0.07379,51.52454],[-0.0741,51.52448],[-0.07412,51.52443],[-0.07475,51.52402],[-0.0749,51.52434],[-0.07499,51.52439],[-0.07573,51.52432],[-0.07592,51.52447],[-0.07614,51.52449],[-0.07638,51.52447],[-0.07704,51.52482],[-0.07715,51.52516],[-0.07678,51.52529],[-0.07674,51.5256],[-0.07684,51.52578],[-0.07761,51.52603],[-0.07733,51.52625],[-0.0771,51.52682],[-0.07758,51.52696],[-0.07776,51.52737],[-0.07836,51.52779],[-0.07773,51.5281],[-0.0777,51.52815],[-0.07775,51.52826],[-0.07888,51.52838],[-0.07891,51.52842],[-0.07891,51.52854],[-0.07912,51.52878],[-0.07922,51.52931],[-0.07914,51.52945],[-0.07915,51.52966],[-0.07837,51.52981],[-0.07814,51.53003],[-0.07765,51.53008],[-0.0776,51.53027],[-0.07725,51.53037],[-0.07732,51.53066],[-0.07678,51.53106],[-0.07724,51.53149],[-0.07669,51.53208],[-0.07723,51.53234],[-0.07781,51.53236],[-0.07782,51.53237],[-0.07788,51.53285],[-0.07798,51.53291],[-0.07734,51.53365],[-0.07741,51.53369],[-0.07787,51.53377],[-0.07828,51.53422],[-0.07721,51.5345],[-0.07697,51.53444],[-0.07678,51.53451],[-0.07675,51.53466],[-0.07699,51.53491],[-0.07749,51.53493],[-0.07784,51.53517],[-0.07773,51.53525],[-0.0771,51.53533],[-0.07706,51.53558],[-0.07707,51.53561],[-0.07733,51.53571],[-0.07733,51.53586],[-0.07689,51.53611],[-0.07704,51.53638],[-0.07668,51.53666],[-0.07614,51.53661],[-0.07573,51.53631],[-0.07495,51.53642],[-0.07462,51.53626],[-0.07396,51.5364],[-0.07315,51.53591],[-0.07302,51.53588],[-0.0728,51.536],[-0.07197,51.53602],[-0.07143,51.53581],[-0.07109,51.53597],[-0.07033,51.53598],[-0.07013,51.53583],[-0.0694,51.5358],[-0.06928,51.53588],[-0.06827,51.53599],[-0.06805,51.53582],[-0.06726,51.53579],[-0.06724,51.53579],[-0.06669,51.53573],[-0.0
6665,51.53574],[-0.06597,51.53571],[-0.06579,51.53564],[-0.06521,51.5356],[-0.06506,51.53568],[-0.06461,51.5357],[-0.06455,51.53566],[-0.06397,51.53544],[-0.06329,51.53555],[-0.06269,51.53536],[-0.06259,51.53535],[-0.06244,51.53545],[-0.06091,51.53522],[-0.06101,51.53499],[-0.06079,51.53463],[-0.05997,51.53429],[-0.06003,51.53367],[-0.05817,51.53428],[-0.05786,51.53414],[-0.05728,51.53417],[-0.05721,51.53419],[-0.05713,51.53418],[-0.05642,51.53487],[-0.05644,51.53512],[-0.0564,51.53513],[-0.05523,51.53489],[-0.05461,51.5352],[-0.05428,51.53514],[-0.05411,51.53503],[-0.05396,51.53498],[-0.05236,51.53495],[-0.05164,51.53509],[-0.05117,51.53535],[-0.0508,51.53537],[-0.04984,51.53517],[-0.04914,51.53482],[-0.04913,51.53482],[-0.0476,51.5344],[-0.04652,51.5342],[-0.04631,51.53418],[-0.04556,51.53416],[-0.04521,51.5342],[-0.04562,51.53221],[-0.04508,51.53186],[-0.04406,51.532],[-0.04385,51.53179]]],\"type\":\"Polygon\"}", + [ + 5, + 4326, + [ + [ + [ + -0.04385, + 51.53179 + ], + [ + -0.04324, + 51.53155 + ], + [ + -0.04277, + 51.53153 + ], + [ + -0.04163, + 51.5311 + ], + [ + -0.0415, + 51.53073 + ], + [ + -0.0415, + 51.5307 + ], + [ + -0.04142, + 51.53057 + ], + [ + -0.04089, + 51.53015 + ], + [ + -0.04087, + 51.53013 + ], + [ + -0.04046, + 51.52965 + ], + [ + -0.04046, + 51.52958 + ], + [ + -0.04052, + 51.52941 + ], + [ + -0.04096, + 51.52886 + ], + [ + -0.04074, + 51.52868 + ], + [ + -0.04047, + 51.52738 + ], + [ + -0.04145, + 51.52711 + ], + [ + -0.04152, + 51.52714 + ], + [ + -0.04299, + 51.52695 + ], + [ + -0.04319, + 51.52677 + ], + [ + -0.04332, + 51.52673 + ], + [ + -0.04371, + 51.5267 + ], + [ + -0.04399, + 51.52655 + ], + [ + -0.04427, + 51.52646 + ], + [ + -0.0449, + 51.52659 + ], + [ + -0.04568, + 51.52638 + ], + [ + -0.04598, + 51.5265 + ], + [ + -0.04605, + 51.52651 + ], + [ + -0.04697, + 51.52632 + ], + [ + -0.04731, + 51.52601 + ], + [ + -0.04748, + 51.526 + ], + [ + -0.04798, + 51.52615 + ], + [ + -0.04847, + 51.52636 + ], + [ + -0.04881, + 51.52611 + 
], + [ + -0.04903, + 51.52559 + ], + [ + -0.04903, + 51.52549 + ], + [ + -0.04973, + 51.5255 + ], + [ + -0.04994, + 51.5253 + ], + [ + -0.05028, + 51.52521 + ], + [ + -0.05061, + 51.52522 + ], + [ + -0.0508, + 51.52508 + ], + [ + -0.05175, + 51.52497 + ], + [ + -0.05203, + 51.52527 + ], + [ + -0.05272, + 51.52562 + ], + [ + -0.05292, + 51.5256 + ], + [ + -0.05272, + 51.52446 + ], + [ + -0.05297, + 51.52434 + ], + [ + -0.05318, + 51.52434 + ], + [ + -0.05362, + 51.52441 + ], + [ + -0.05375, + 51.52432 + ], + [ + -0.05381, + 51.5243 + ], + [ + -0.05478, + 51.52427 + ], + [ + -0.05486, + 51.52431 + ], + [ + -0.05537, + 51.5243 + ], + [ + -0.0556, + 51.52419 + ], + [ + -0.05646, + 51.52428 + ], + [ + -0.0565, + 51.5243 + ], + [ + -0.05685, + 51.52432 + ], + [ + -0.05701, + 51.52412 + ], + [ + -0.05803, + 51.52381 + ], + [ + -0.05844, + 51.52392 + ], + [ + -0.0585, + 51.52392 + ], + [ + -0.0588, + 51.52386 + ], + [ + -0.05955, + 51.52391 + ], + [ + -0.05965, + 51.524 + ], + [ + -0.06114, + 51.52382 + ], + [ + -0.06126, + 51.52387 + ], + [ + -0.06152, + 51.52368 + ], + [ + -0.06277, + 51.52339 + ], + [ + -0.063, + 51.52324 + ], + [ + -0.06344, + 51.52324 + ], + [ + -0.06371, + 51.52332 + ], + [ + -0.06426, + 51.52333 + ], + [ + -0.06452, + 51.52321 + ], + [ + -0.06593, + 51.52312 + ], + [ + -0.06628, + 51.52297 + ], + [ + -0.06663, + 51.52309 + ], + [ + -0.06722, + 51.52323 + ], + [ + -0.06832, + 51.52308 + ], + [ + -0.06866, + 51.52297 + ], + [ + -0.06878, + 51.52284 + ], + [ + -0.06913, + 51.52281 + ], + [ + -0.0697, + 51.52274 + ], + [ + -0.07002, + 51.52294 + ], + [ + -0.07184, + 51.52302 + ], + [ + -0.0716, + 51.52335 + ], + [ + -0.07159, + 51.52355 + ], + [ + -0.07121, + 51.52386 + ], + [ + -0.07098, + 51.52393 + ], + [ + -0.0708, + 51.52436 + ], + [ + -0.07093, + 51.52452 + ], + [ + -0.07167, + 51.52459 + ], + [ + -0.07187, + 51.52454 + ], + [ + -0.07205, + 51.52435 + ], + [ + -0.07242, + 51.52414 + ], + [ + -0.07245, + 51.52412 + ], + [ + -0.07258, + 51.52409 + 
], + [ + -0.0731, + 51.52414 + ], + [ + -0.07339, + 51.52448 + ], + [ + -0.07379, + 51.52454 + ], + [ + -0.0741, + 51.52448 + ], + [ + -0.07412, + 51.52443 + ], + [ + -0.07475, + 51.52402 + ], + [ + -0.0749, + 51.52434 + ], + [ + -0.07499, + 51.52439 + ], + [ + -0.07573, + 51.52432 + ], + [ + -0.07592, + 51.52447 + ], + [ + -0.07614, + 51.52449 + ], + [ + -0.07638, + 51.52447 + ], + [ + -0.07704, + 51.52482 + ], + [ + -0.07715, + 51.52516 + ], + [ + -0.07678, + 51.52529 + ], + [ + -0.07674, + 51.5256 + ], + [ + -0.07684, + 51.52578 + ], + [ + -0.07761, + 51.52603 + ], + [ + -0.07733, + 51.52625 + ], + [ + -0.0771, + 51.52682 + ], + [ + -0.07758, + 51.52696 + ], + [ + -0.07776, + 51.52737 + ], + [ + -0.07836, + 51.52779 + ], + [ + -0.07773, + 51.5281 + ], + [ + -0.0777, + 51.52815 + ], + [ + -0.07775, + 51.52826 + ], + [ + -0.07888, + 51.52838 + ], + [ + -0.07891, + 51.52842 + ], + [ + -0.07891, + 51.52854 + ], + [ + -0.07912, + 51.52878 + ], + [ + -0.07922, + 51.52931 + ], + [ + -0.07914, + 51.52945 + ], + [ + -0.07915, + 51.52966 + ], + [ + -0.07837, + 51.52981 + ], + [ + -0.07814, + 51.53003 + ], + [ + -0.07765, + 51.53008 + ], + [ + -0.0776, + 51.53027 + ], + [ + -0.07725, + 51.53037 + ], + [ + -0.07732, + 51.53066 + ], + [ + -0.07678, + 51.53106 + ], + [ + -0.07724, + 51.53149 + ], + [ + -0.07669, + 51.53208 + ], + [ + -0.07723, + 51.53234 + ], + [ + -0.07781, + 51.53236 + ], + [ + -0.07782, + 51.53237 + ], + [ + -0.07788, + 51.53285 + ], + [ + -0.07798, + 51.53291 + ], + [ + -0.07734, + 51.53365 + ], + [ + -0.07741, + 51.53369 + ], + [ + -0.07787, + 51.53377 + ], + [ + -0.07828, + 51.53422 + ], + [ + -0.07721, + 51.5345 + ], + [ + -0.07697, + 51.53444 + ], + [ + -0.07678, + 51.53451 + ], + [ + -0.07675, + 51.53466 + ], + [ + -0.07699, + 51.53491 + ], + [ + -0.07749, + 51.53493 + ], + [ + -0.07784, + 51.53517 + ], + [ + -0.07773, + 51.53525 + ], + [ + -0.0771, + 51.53533 + ], + [ + -0.07706, + 51.53558 + ], + [ + -0.07707, + 51.53561 + ], + [ + -0.07733, + 
51.53571 + ], + [ + -0.07733, + 51.53586 + ], + [ + -0.07689, + 51.53611 + ], + [ + -0.07704, + 51.53638 + ], + [ + -0.07668, + 51.53666 + ], + [ + -0.07614, + 51.53661 + ], + [ + -0.07573, + 51.53631 + ], + [ + -0.07495, + 51.53642 + ], + [ + -0.07462, + 51.53626 + ], + [ + -0.07396, + 51.5364 + ], + [ + -0.07315, + 51.53591 + ], + [ + -0.07302, + 51.53588 + ], + [ + -0.0728, + 51.536 + ], + [ + -0.07197, + 51.53602 + ], + [ + -0.07143, + 51.53581 + ], + [ + -0.07109, + 51.53597 + ], + [ + -0.07033, + 51.53598 + ], + [ + -0.07013, + 51.53583 + ], + [ + -0.0694, + 51.5358 + ], + [ + -0.06928, + 51.53588 + ], + [ + -0.06827, + 51.53599 + ], + [ + -0.06805, + 51.53582 + ], + [ + -0.06726, + 51.53579 + ], + [ + -0.06724, + 51.53579 + ], + [ + -0.06669, + 51.53573 + ], + [ + -0.06665, + 51.53574 + ], + [ + -0.06597, + 51.53571 + ], + [ + -0.06579, + 51.53564 + ], + [ + -0.06521, + 51.5356 + ], + [ + -0.06506, + 51.53568 + ], + [ + -0.06461, + 51.5357 + ], + [ + -0.06455, + 51.53566 + ], + [ + -0.06397, + 51.53544 + ], + [ + -0.06329, + 51.53555 + ], + [ + -0.06269, + 51.53536 + ], + [ + -0.06259, + 51.53535 + ], + [ + -0.06244, + 51.53545 + ], + [ + -0.06091, + 51.53522 + ], + [ + -0.06101, + 51.53499 + ], + [ + -0.06079, + 51.53463 + ], + [ + -0.05997, + 51.53429 + ], + [ + -0.06003, + 51.53367 + ], + [ + -0.05817, + 51.53428 + ], + [ + -0.05786, + 51.53414 + ], + [ + -0.05728, + 51.53417 + ], + [ + -0.05721, + 51.53419 + ], + [ + -0.05713, + 51.53418 + ], + [ + -0.05642, + 51.53487 + ], + [ + -0.05644, + 51.53512 + ], + [ + -0.0564, + 51.53513 + ], + [ + -0.05523, + 51.53489 + ], + [ + -0.05461, + 51.5352 + ], + [ + -0.05428, + 51.53514 + ], + [ + -0.05411, + 51.53503 + ], + [ + -0.05396, + 51.53498 + ], + [ + -0.05236, + 51.53495 + ], + [ + -0.05164, + 51.53509 + ], + [ + -0.05117, + 51.53535 + ], + [ + -0.0508, + 51.53537 + ], + [ + -0.04984, + 51.53517 + ], + [ + -0.04914, + 51.53482 + ], + [ + -0.04913, + 51.53482 + ], + [ + -0.0476, + 51.5344 + ], + [ + 
-0.04652, + 51.5342 + ], + [ + -0.04631, + 51.53418 + ], + [ + -0.04556, + 51.53416 + ], + [ + -0.04521, + 51.5342 + ], + [ + -0.04562, + 51.53221 + ], + [ + -0.04508, + 51.53186 + ], + [ + -0.04406, + 51.532 + ], + [ + -0.04385, + 51.53179 + ] + ] + ], + [ + [] + ] + ] + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "type", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "properties", + "type": "{\"type\":\"struct\",\"fields\":[{\"name\":\"Description\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"Name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}" + }, + { + "metadata": "{}", + "name": "json_geometry", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geometry", + "type": "{\"type\":\"struct\",\"fields\":[{\"name\":\"type_id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"srid\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"boundary\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":\"double\",\"containsNull\":true},\"containsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"holes\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":\"double\",\"containsNull\":true},\"containsNull\":true},\"containsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}}]}" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "postcodes = (\n", + " spark.read\n", + " .option(\"multiline\", \"true\")\n", + " 
.format(\"json\")\n", + " .load(postcodes_dir)\n", + " .select(\"type\", explode(col(\"features\")).alias(\"feature\"))\n", + " .select(\"type\", col(\"feature.properties\").alias(\"properties\"), to_json(col(\"feature.geometry\")).alias(\"json_geometry\"))\n", + " .withColumn(\"geometry\", mos.st_geomfromgeojson(\"json_geometry\")) \n", + ")\n", + "print(f\"count? {postcodes.count():,}\")\n", + "postcodes.limit(1).display() # <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "b78ff76e-e8f4-4e9b-9e3e-d32ce6c7725f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Initial London Cycling Data [~40M]\n", + "\n", + "> We will set up a temporary location to store our UPRN data and then generate a table." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a554943f-382e-4714-9e04-0fd878cfdf76", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "UPRNS_DIR_FUSE? '/dbfs/tmp/mosaic/mjohns@databricks.com/cycling'\n" + ] + } + ], + "source": [ + "uprns_dir = f\"{data_dir}/cycling\"\n", + "uprns_dir_fuse = f\"/dbfs{uprns_dir}\"\n", + "dbutils.fs.mkdirs(uprns_dir)\n", + "\n", + "os.environ['UPRNS_DIR_FUSE'] = uprns_dir_fuse\n", + "print(f\"UPRNS_DIR_FUSE? 
'{uprns_dir_fuse}'\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d95c4291-732d-428a-b718-930f38e6d53a", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "...skipping '/dbfs/tmp/mosaic/mjohns@databricks.com/cycling/uprns.zip', already exits.\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
pathnamesizemodificationTime
dbfs:/tmp/mosaic/mjohns@databricks.com/cycling/uprns.zipuprns.zip5827705051701185111000
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "dbfs:/tmp/mosaic/mjohns@databricks.com/cycling/uprns.zip", + "uprns.zip", + 582770505, + 1701185111000 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "path", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "name", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "size", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "modificationTime", + "type": "\"long\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "uprns_url = 'https://api.os.uk/downloads/v1/products/OpenUPRN/downloads?area=GB&format=CSV&redirect'\n", + "\n", + "# The DBFS file system is mounted under /dbfs/ directory on Databricks cluster nodes\n", + "uprns_dir_fuse_path = pathlib.Path(uprns_dir_fuse)\n", + "uprns_dir_fuse_path.mkdir(parents=True, exist_ok=True)\n", + "\n", + "uprns_zip_fuse_path = uprns_dir_fuse_path / 'uprns.zip'\n", + "if not uprns_zip_fuse_path.exists():\n", + " req = requests.get(uprns_url)\n", + " with open(uprns_zip_fuse_path, 'wb') as f:\n", + " f.write(req.content)\n", + "else:\n", + " print(f\"...skipping '{uprns_zip_fuse_path}', already exits.\")\n", + "\n", + "display(dbutils.fs.ls(uprns_dir))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + 
"nuid": "933c4aae-2e11-468c-acde-3a375907b713", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "uprns_data_fuse_path = uprns_dir_fuse_path / 'data'\n", + "uprns_data_fuse_path.mkdir(parents=True, exist_ok=True)\n", + "with zipfile.ZipFile(uprns_zip_fuse_path, 'r') as zip_ref:\n", + " zip_ref.extractall(uprns_data_fuse_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "56b674ee-372c-40bf-997e-19b500c9f454", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "total 1.9G\n-rwxrwxrwx 1 root root 192 Nov 28 15:35 licence.txt\n-rwxrwxrwx 1 root root 1.9G Nov 28 15:35 osopenuprn_202310.csv\n-rwxrwxrwx 1 root root 88 Nov 28 15:35 versions.txt\n" + ] + } + ], + "source": [ + "%sh ls -lh $UPRNS_DIR_FUSE/data" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0f8ad464-02f3-4187-8741-6d1c08124166", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "csv_name = 'osopenuprn_202310.csv' # <- adjust to the name ^\n", + "\n", + "# - alter csv columns\n", + "_df = (\n", + " spark\n", + " .read\n", + " .option(\"header\", \"true\")\n", + " .option(\"inferSchema\" , \"true\")\n", + " .csv(f\"{uprns_dir}/data/{csv_name}\")\n", + ")\n", + "columns = [F.col(cn).alias(cn.replace(' ', '')) for cn in _df.columns]\n", + "\n", + "# - write csv to table\n", + "spark.sql(\"drop table if exists uprns\")\n", + "_df.select(*columns).write.format(\"delta\").mode(\"overwrite\").saveAsTable(\"uprns\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + 
"cellMetadata": {}, + "inputWidgets": {}, + "nuid": "68809775-8916-4b5a-8b02-b1d9d0b0e8ee", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> We will load the Unique Property Reference Numbers (UPRNs) data to represent point data. " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "90988947-8918-4ce6-9408-71eb0645ee48", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 40,593,998\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
UPRNX_COORDINATEY_COORDINATELATITUDELONGITUDE
1358260.66172796.551.4526008-2.602075
26352967.0181077.051.5266333-2.6793612
27352967.0181077.051.5266333-2.6793612
30354800.0180469.051.5213173-2.6528615
31354796.0180460.051.521236-2.652918
32353473.0180409.051.5206696-2.671979
33352548.0180308.051.5196842-2.6852966
34352515.0180360.051.5201489-2.6857792
38352462.0180401.051.5205131-2.6865486
41354662.0180364.051.5203621-2.6548369
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 1, + 358260.66, + 172796.5, + 51.4526008, + -2.602075 + ], + [ + 26, + 352967.0, + 181077.0, + 51.5266333, + -2.6793612 + ], + [ + 27, + 352967.0, + 181077.0, + 51.5266333, + -2.6793612 + ], + [ + 30, + 354800.0, + 180469.0, + 51.5213173, + -2.6528615 + ], + [ + 31, + 354796.0, + 180460.0, + 51.521236, + -2.652918 + ], + [ + 32, + 353473.0, + 180409.0, + 51.5206696, + -2.671979 + ], + [ + 33, + 352548.0, + 180308.0, + 51.5196842, + -2.6852966 + ], + [ + 34, + 352515.0, + 180360.0, + 51.5201489, + -2.6857792 + ], + [ + 38, + 352462.0, + 180401.0, + 51.5205131, + -2.6865486 + ], + [ + 41, + 354662.0, + 180364.0, + 51.5203621, + -2.6548369 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "UPRN", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "X_COORDINATE", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "Y_COORDINATE", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "LATITUDE", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "LONGITUDE", + "type": "\"double\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "uprns = spark.read.table(\"uprns\")\n", + "print(f\"count? 
{uprns.count():,}\") # <- faster after table gen\n", + "uprns.limit(10).display() # <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f302b355-f176-4f1e-84ac-1b268b7d5cec", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Reproject Postcode Geometries to BNG SRID\n", + "> British National Grid expects the coordinates of geometries to be provided in EPSG:27700. Our geometries are provided in EPSG:4326. So we will need to reproject the geometries. Mosaic has the necessary functionality to help us achieve this." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "106918d1-cb9f-497b-a54a-db7d4015c226", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
typepropertiesgeometry
FeatureCollectionList(E2 postcode district, E2)List(5, 27700, List(List(List(535781.9380575956, 183244.6883788783), List(535824.9598637373, 183219.13114683155), List(535857.6174945063, 183217.77906512795), List(535937.9652897029, 183172.0778235984), List(535948.0833085249, 183131.17423886806), List(535948.1726131596, 183127.83816223528), List(535954.1083358275, 183113.53036918928), List(535992.1193609729, 183067.80952594848), List(535993.5661128352, 183065.62262108468), List(536023.433274843, 183013.00698265154), List(536023.6417598329, 183005.22280622984), List(536019.9864220938, 182986.20691772352), List(535991.10526591, 182924.22825999785), List(536006.9008830489, 182904.62042294373), List(536029.5004004786, 182760.5587263456), List(535962.3273532258, 182728.7139400427), List(535957.3825413126, 182731.9200414074), List(535855.981967767, 182708.06319533306), List(535842.6443589143, 182687.67569398397), List(535833.745864351, 182682.98643173242), List(535806.7826882608, 182678.92696711444), List(535787.8063862727, 182661.7273213567), List(535768.6516090925, 182651.19990051258), List(535724.5650621236, 182664.48831547052), List(535671.083893747, 182639.69031467568), List(535649.9178289331, 182652.47882477642), List(535645.0325510963, 182653.46117701154), List(535581.7803402226, 182630.62884472188), List(535559.116011542, 182595.52657006297), List(535547.3534337038, 182594.09984491928), List(535512.2253112149, 182609.85480291082), List(535477.6131089614, 182632.3006572011), List(535454.7701589285, 182603.87101454858), List(535441.0515128053, 182545.6387309049), List(535441.3480281356, 182534.51847295318), List(535392.7614843239, 182534.3359901223), List(535378.7871684541, 182511.70721004112), List(535355.4689781135, 182501.07044976193), List(535332.54809557, 182501.57253937522), List(535319.7830267497, 182485.65304688836), List(535254.2094731658, 182471.66562843113), List(535233.8983120153, 182504.50927462179), List(535184.99899094, 182542.15613858588), 
List(535171.1848080601, 182539.5628792516), List(535188.4320349725, 182413.1610924584), List(535171.4449779494, 182399.35527161031), List(535156.8775325917, 182398.96765152615), List(535126.1481960944, 182405.93981478648), List(535117.3964801998, 182395.69169817545), List(535113.2935073929, 182393.35693709343), List(535046.0944043219, 182388.23154424207), List(535040.426642115, 182392.53211760416), List(535005.0780822264, 182390.4797147903), List(534989.4483038574, 182377.82341214077), List(534929.5250849993, 182386.2466646775), List(534926.6912537713, 182388.39701484208), List(534902.353081025, 182389.97622999403), List(534891.8446569596, 182367.44095034344), List(534822.0030447827, 182331.08968924946), List(534793.2369068528, 182342.5672069927), List(534789.0747382878, 182342.45676511177), List(534768.4409085622, 182335.2324451239), List(534716.2662738861, 182339.41255793435), List(534709.0639194129, 182349.23683925637), List(534606.2339552218, 182326.48043027992), List(534597.7622726331, 182331.81999374554), List(534580.2859382916, 182310.2136126347), List(534494.4274051443, 182275.668296282), List(534478.9138032356, 182258.56548488926), List(534448.3907594526, 182257.75755606516), List(534429.425290218, 182266.15809321858), List(534391.2421295254, 182266.26060967514), List(534373.558798522, 182252.43915361603), List(534276.0107687588, 182239.84453922248), List(534252.1718241313, 182222.5224069195), List(534227.5393820464, 182235.22514476796), List(534186.199365603, 182249.71225135983), List(534110.3320511248, 182231.01675352425), List(534087.0688351066, 182218.16183286835), List(534079.1258644154, 182203.4857467148), List(534054.9339733121, 182199.50889328966), List(534015.597704571, 182190.68140581233), List(533992.8122403203, 182212.33640354726), List(533866.3224438406, 182217.90405003884), List(533882.0043343764, 182255.03975232004), List(533882.1118874322, 182277.29861049942), List(533907.563830076, 182312.466308441), List(533923.3136380416, 
182320.67111585021), List(533934.5394068043, 182368.8175582806), List(533925.0523586851, 182386.37225865485), List(533873.5143864275, 182392.80335840082), List(533859.7871893895, 182386.877599178), List(533847.8575499062, 182365.42002740753), List(533822.8062397578, 182341.39119331172), List(533820.7837512142, 182339.11231186916), List(533811.8536204272, 182335.5386665268), List(533775.6351340465, 182340.14873929322), List(533754.5225045574, 182377.4279951505), List(533726.5993517478, 182383.3696325071), List(533705.2706813519, 182376.13139397942), List(533704.0296535669, 182370.53473031026), List(533661.5269331775, 182323.7914324954), List(533650.1852706587, 182359.10259587853), List(533643.7957804046, 182364.49848743435), List(533592.6674969478, 182355.3640262651), List(533579.048749956, 182371.69786662376), List(533563.729136071, 182373.5206389891), List(533547.1391008857, 182370.85886459786), List(533500.3328063814, 182408.57647217682), List(533491.708691396, 182446.18495565542), List(533516.9947808256, 182461.31590327783), List(533518.8633508299, 182495.86176251573), List(533511.4005260145, 182515.695971929), List(533457.2576487551, 182542.0931076025), List(533476.0373819193, 182567.06810922833), List(533490.3257582185, 182630.87308835395), List(533456.6213927829, 182645.56658983114), List(533442.9378613638, 182690.83179383923), List(533400.0925503863, 182736.4438360546), List(533442.8860284169, 182772.06468460686), List(533444.820844416, 182777.67950834992), List(533441.031309065, 182789.82072529983), List(533362.30066867, 182801.1064503934), List(533360.1029890878, 182805.4999354959), List(533359.7526092468, 182818.84430400614), List(533344.485770287, 182845.15060313017), List(533336.0022694117, 182903.90613927244), List(533341.1424542875, 182919.62024957192), List(533339.8357444134, 182942.95468899893), List(533393.4991665736, 182961.05581658782), List(533408.8094940147, 182985.93951877212), List(533442.6499810042, 182992.39254114387), 
List(533445.5628353866, 183013.61224821693), List(533469.5465898013, 183025.3704739589), List(533463.8439242825, 183057.49177497294), List(533500.1287308584, 183102.95741226373), List(533466.967019293, 183149.93616036046), List(533503.3892198745, 183216.54865868646), List(533465.176452015, 183244.47702701658), List(533424.8910481959, 183245.6440381208), List(533424.1682635264, 183246.73784710694), List(533418.6045028507, 183300.00601616694), List(533411.4936098668, 183306.49599349493), List(533453.7188489789, 183389.95262020518), List(533448.7471163005, 183394.2731622412), List(533416.6101809462, 183402.33112426393), List(533386.8603066149, 183451.62554262602), List(533460.2507298898, 183484.71224287833), List(533477.0709651434, 183478.4775450573), List(533490.0435256548, 183486.6081507582), List(533491.6856267123, 183503.3433177545), List(533474.3100794131, 183530.70656481414), List(533439.575167135, 183532.01926295372), List(533414.6004345681, 183558.070208603), List(533421.9954622076, 183567.16690503713), List(533465.4536569463, 183577.21137004573), List(533467.4970158483, 183605.08507556678), List(533466.7158090527, 183608.4029416383), List(533448.3920701249, 183619.04934156616), List(533447.9536899813, 183635.72981729486), List(533477.7375418543, 183664.33265876118), List(533466.545667419, 183694.08407003182), List(533490.6933227498, 183725.8772636903), List(533528.2886847861, 183721.30179077585), List(533557.5998214906, 183688.68865956645), List(533611.3715561538, 183702.3441248663), List(533634.7255277758, 183685.15388438356), List(533680.0872158157, 183701.92715838), List(533737.6963941175, 183648.91685636976), List(533746.7999457252, 183645.81822844158), List(533761.7057262553, 183659.56450243742), List(533819.2086347946, 183663.3052394627), List(533857.2738980403, 183640.93969440804), List(533880.3842868664, 183659.35383981554), List(533933.0619141799, 183661.8558411452), List(533947.3721862542, 183645.5412537885), List(533998.086794612, 
183643.54092136555), List(534006.174193125, 183652.65679096844), List(534075.89609284, 183666.7381784143), List(534091.6525898686, 183648.23655415466), List(534146.5283805125, 183646.3475924168), List(534147.9154105934, 183646.38423628634), List(534186.2350593824, 183640.7199105107), List(534188.979735214, 183641.9052504226), List(534236.2269981015, 183639.81565870665), List(534248.916094217, 183632.36147769052), List(534289.25772802, 183628.97699438152), List(534299.4251987567, 183638.14836380753), List(534330.5746032915, 183641.19794210594), List(534334.8533790117, 183636.85990650317), List(534375.7247760285, 183613.459571335), List(534422.5603115619, 183626.940126969), List(534464.7309192211, 183606.9133092533), List(534471.6955801392, 183605.98493706266), List(534481.8038829465, 183617.38073531695), List(534588.5903195359, 183594.61541820713), List(534582.3329739553, 183568.85492995696), List(534598.6517142366, 183529.22629237536), List(534656.5239630286, 183492.92506529193), List(534654.1909793459, 183423.86893065734), List(534781.3901572204, 183495.12499185512), List(534803.3031006856, 183480.127294405), List(534843.4397878762, 183484.53139197238), List(534848.2354977566, 183486.8843684813), List(534853.813337246, 183485.9196823354), List(534901.0161172879, 183563.95757195458), List(534898.8904922986, 183591.72143490257), List(534901.6350458016, 183592.90716247621), List(534983.4868244318, 183568.37485173013), List(535025.5688460219, 183603.99094667443), List(535048.6325535319, 183597.9274125757), List(535060.7478173448, 183586.008682548), List(535071.2986004304, 183580.72525295737), List(535182.3515840536, 183580.3420830167), List(535231.8708644358, 183597.24007576355), List(535263.6963046396, 183627.02099730785), List(535289.2972956816, 183629.92867538054), List(535356.4680675853, 183609.4624531094), List(535406.0525481519, 183571.83582854533), List(535406.7460762692, 183571.85432246135), List(535514.1023035615, 183527.979870348), List(535589.5978763667, 
183507.73880783212), List(535604.2215525524, 183505.90367014642), List(535656.2962848989, 183505.068944166), List(535680.4512552277, 183510.1655864418), List(535657.927840578, 183288.11259578395), List(535696.4204844927, 183250.19230655016), List(535766.7486843397, 183267.65151389752), List(535781.9380575956, 183244.6883788783), List(535781.9380575956, 183244.6883788783))), List(List()))
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "FeatureCollection", + [ + "E2 postcode district", + "E2" + ], + [ + 5, + 27700, + [ + [ + [ + 535781.9380575956, + 183244.6883788783 + ], + [ + 535824.9598637373, + 183219.13114683155 + ], + [ + 535857.6174945063, + 183217.77906512795 + ], + [ + 535937.9652897029, + 183172.0778235984 + ], + [ + 535948.0833085249, + 183131.17423886806 + ], + [ + 535948.1726131596, + 183127.83816223528 + ], + [ + 535954.1083358275, + 183113.53036918928 + ], + [ + 535992.1193609729, + 183067.80952594848 + ], + [ + 535993.5661128352, + 183065.62262108468 + ], + [ + 536023.433274843, + 183013.00698265154 + ], + [ + 536023.6417598329, + 183005.22280622984 + ], + [ + 536019.9864220938, + 182986.20691772352 + ], + [ + 535991.10526591, + 182924.22825999785 + ], + [ + 536006.9008830489, + 182904.62042294373 + ], + [ + 536029.5004004786, + 182760.5587263456 + ], + [ + 535962.3273532258, + 182728.7139400427 + ], + [ + 535957.3825413126, + 182731.9200414074 + ], + [ + 535855.981967767, + 182708.06319533306 + ], + [ + 535842.6443589143, + 182687.67569398397 + ], + [ + 535833.745864351, + 182682.98643173242 + ], + [ + 535806.7826882608, + 182678.92696711444 + ], + [ + 535787.8063862727, + 182661.7273213567 + ], + [ + 535768.6516090925, + 182651.19990051258 + ], + [ + 535724.5650621236, + 182664.48831547052 + ], + [ + 535671.083893747, + 182639.69031467568 + ], + [ + 535649.9178289331, + 182652.47882477642 + ], + [ + 535645.0325510963, + 182653.46117701154 + ], + [ + 535581.7803402226, + 182630.62884472188 + ], + [ + 535559.116011542, + 182595.52657006297 + ], + [ + 535547.3534337038, + 182594.09984491928 + ], + [ + 535512.2253112149, + 182609.85480291082 + ], + [ + 535477.6131089614, + 182632.3006572011 + ], 
+ [ + 535454.7701589285, + 182603.87101454858 + ], + [ + 535441.0515128053, + 182545.6387309049 + ], + [ + 535441.3480281356, + 182534.51847295318 + ], + [ + 535392.7614843239, + 182534.3359901223 + ], + [ + 535378.7871684541, + 182511.70721004112 + ], + [ + 535355.4689781135, + 182501.07044976193 + ], + [ + 535332.54809557, + 182501.57253937522 + ], + [ + 535319.7830267497, + 182485.65304688836 + ], + [ + 535254.2094731658, + 182471.66562843113 + ], + [ + 535233.8983120153, + 182504.50927462179 + ], + [ + 535184.99899094, + 182542.15613858588 + ], + [ + 535171.1848080601, + 182539.5628792516 + ], + [ + 535188.4320349725, + 182413.1610924584 + ], + [ + 535171.4449779494, + 182399.35527161031 + ], + [ + 535156.8775325917, + 182398.96765152615 + ], + [ + 535126.1481960944, + 182405.93981478648 + ], + [ + 535117.3964801998, + 182395.69169817545 + ], + [ + 535113.2935073929, + 182393.35693709343 + ], + [ + 535046.0944043219, + 182388.23154424207 + ], + [ + 535040.426642115, + 182392.53211760416 + ], + [ + 535005.0780822264, + 182390.4797147903 + ], + [ + 534989.4483038574, + 182377.82341214077 + ], + [ + 534929.5250849993, + 182386.2466646775 + ], + [ + 534926.6912537713, + 182388.39701484208 + ], + [ + 534902.353081025, + 182389.97622999403 + ], + [ + 534891.8446569596, + 182367.44095034344 + ], + [ + 534822.0030447827, + 182331.08968924946 + ], + [ + 534793.2369068528, + 182342.5672069927 + ], + [ + 534789.0747382878, + 182342.45676511177 + ], + [ + 534768.4409085622, + 182335.2324451239 + ], + [ + 534716.2662738861, + 182339.41255793435 + ], + [ + 534709.0639194129, + 182349.23683925637 + ], + [ + 534606.2339552218, + 182326.48043027992 + ], + [ + 534597.7622726331, + 182331.81999374554 + ], + [ + 534580.2859382916, + 182310.2136126347 + ], + [ + 534494.4274051443, + 182275.668296282 + ], + [ + 534478.9138032356, + 182258.56548488926 + ], + [ + 534448.3907594526, + 182257.75755606516 + ], + [ + 534429.425290218, + 182266.15809321858 + ], + [ + 534391.2421295254, + 
182266.26060967514 + ], + [ + 534373.558798522, + 182252.43915361603 + ], + [ + 534276.0107687588, + 182239.84453922248 + ], + [ + 534252.1718241313, + 182222.5224069195 + ], + [ + 534227.5393820464, + 182235.22514476796 + ], + [ + 534186.199365603, + 182249.71225135983 + ], + [ + 534110.3320511248, + 182231.01675352425 + ], + [ + 534087.0688351066, + 182218.16183286835 + ], + [ + 534079.1258644154, + 182203.4857467148 + ], + [ + 534054.9339733121, + 182199.50889328966 + ], + [ + 534015.597704571, + 182190.68140581233 + ], + [ + 533992.8122403203, + 182212.33640354726 + ], + [ + 533866.3224438406, + 182217.90405003884 + ], + [ + 533882.0043343764, + 182255.03975232004 + ], + [ + 533882.1118874322, + 182277.29861049942 + ], + [ + 533907.563830076, + 182312.466308441 + ], + [ + 533923.3136380416, + 182320.67111585021 + ], + [ + 533934.5394068043, + 182368.8175582806 + ], + [ + 533925.0523586851, + 182386.37225865485 + ], + [ + 533873.5143864275, + 182392.80335840082 + ], + [ + 533859.7871893895, + 182386.877599178 + ], + [ + 533847.8575499062, + 182365.42002740753 + ], + [ + 533822.8062397578, + 182341.39119331172 + ], + [ + 533820.7837512142, + 182339.11231186916 + ], + [ + 533811.8536204272, + 182335.5386665268 + ], + [ + 533775.6351340465, + 182340.14873929322 + ], + [ + 533754.5225045574, + 182377.4279951505 + ], + [ + 533726.5993517478, + 182383.3696325071 + ], + [ + 533705.2706813519, + 182376.13139397942 + ], + [ + 533704.0296535669, + 182370.53473031026 + ], + [ + 533661.5269331775, + 182323.7914324954 + ], + [ + 533650.1852706587, + 182359.10259587853 + ], + [ + 533643.7957804046, + 182364.49848743435 + ], + [ + 533592.6674969478, + 182355.3640262651 + ], + [ + 533579.048749956, + 182371.69786662376 + ], + [ + 533563.729136071, + 182373.5206389891 + ], + [ + 533547.1391008857, + 182370.85886459786 + ], + [ + 533500.3328063814, + 182408.57647217682 + ], + [ + 533491.708691396, + 182446.18495565542 + ], + [ + 533516.9947808256, + 182461.31590327783 + ], + [ + 
533518.8633508299, + 182495.86176251573 + ], + [ + 533511.4005260145, + 182515.695971929 + ], + [ + 533457.2576487551, + 182542.0931076025 + ], + [ + 533476.0373819193, + 182567.06810922833 + ], + [ + 533490.3257582185, + 182630.87308835395 + ], + [ + 533456.6213927829, + 182645.56658983114 + ], + [ + 533442.9378613638, + 182690.83179383923 + ], + [ + 533400.0925503863, + 182736.4438360546 + ], + [ + 533442.8860284169, + 182772.06468460686 + ], + [ + 533444.820844416, + 182777.67950834992 + ], + [ + 533441.031309065, + 182789.82072529983 + ], + [ + 533362.30066867, + 182801.1064503934 + ], + [ + 533360.1029890878, + 182805.4999354959 + ], + [ + 533359.7526092468, + 182818.84430400614 + ], + [ + 533344.485770287, + 182845.15060313017 + ], + [ + 533336.0022694117, + 182903.90613927244 + ], + [ + 533341.1424542875, + 182919.62024957192 + ], + [ + 533339.8357444134, + 182942.95468899893 + ], + [ + 533393.4991665736, + 182961.05581658782 + ], + [ + 533408.8094940147, + 182985.93951877212 + ], + [ + 533442.6499810042, + 182992.39254114387 + ], + [ + 533445.5628353866, + 183013.61224821693 + ], + [ + 533469.5465898013, + 183025.3704739589 + ], + [ + 533463.8439242825, + 183057.49177497294 + ], + [ + 533500.1287308584, + 183102.95741226373 + ], + [ + 533466.967019293, + 183149.93616036046 + ], + [ + 533503.3892198745, + 183216.54865868646 + ], + [ + 533465.176452015, + 183244.47702701658 + ], + [ + 533424.8910481959, + 183245.6440381208 + ], + [ + 533424.1682635264, + 183246.73784710694 + ], + [ + 533418.6045028507, + 183300.00601616694 + ], + [ + 533411.4936098668, + 183306.49599349493 + ], + [ + 533453.7188489789, + 183389.95262020518 + ], + [ + 533448.7471163005, + 183394.2731622412 + ], + [ + 533416.6101809462, + 183402.33112426393 + ], + [ + 533386.8603066149, + 183451.62554262602 + ], + [ + 533460.2507298898, + 183484.71224287833 + ], + [ + 533477.0709651434, + 183478.4775450573 + ], + [ + 533490.0435256548, + 183486.6081507582 + ], + [ + 533491.6856267123, + 
183503.3433177545 + ], + [ + 533474.3100794131, + 183530.70656481414 + ], + [ + 533439.575167135, + 183532.01926295372 + ], + [ + 533414.6004345681, + 183558.070208603 + ], + [ + 533421.9954622076, + 183567.16690503713 + ], + [ + 533465.4536569463, + 183577.21137004573 + ], + [ + 533467.4970158483, + 183605.08507556678 + ], + [ + 533466.7158090527, + 183608.4029416383 + ], + [ + 533448.3920701249, + 183619.04934156616 + ], + [ + 533447.9536899813, + 183635.72981729486 + ], + [ + 533477.7375418543, + 183664.33265876118 + ], + [ + 533466.545667419, + 183694.08407003182 + ], + [ + 533490.6933227498, + 183725.8772636903 + ], + [ + 533528.2886847861, + 183721.30179077585 + ], + [ + 533557.5998214906, + 183688.68865956645 + ], + [ + 533611.3715561538, + 183702.3441248663 + ], + [ + 533634.7255277758, + 183685.15388438356 + ], + [ + 533680.0872158157, + 183701.92715838 + ], + [ + 533737.6963941175, + 183648.91685636976 + ], + [ + 533746.7999457252, + 183645.81822844158 + ], + [ + 533761.7057262553, + 183659.56450243742 + ], + [ + 533819.2086347946, + 183663.3052394627 + ], + [ + 533857.2738980403, + 183640.93969440804 + ], + [ + 533880.3842868664, + 183659.35383981554 + ], + [ + 533933.0619141799, + 183661.8558411452 + ], + [ + 533947.3721862542, + 183645.5412537885 + ], + [ + 533998.086794612, + 183643.54092136555 + ], + [ + 534006.174193125, + 183652.65679096844 + ], + [ + 534075.89609284, + 183666.7381784143 + ], + [ + 534091.6525898686, + 183648.23655415466 + ], + [ + 534146.5283805125, + 183646.3475924168 + ], + [ + 534147.9154105934, + 183646.38423628634 + ], + [ + 534186.2350593824, + 183640.7199105107 + ], + [ + 534188.979735214, + 183641.9052504226 + ], + [ + 534236.2269981015, + 183639.81565870665 + ], + [ + 534248.916094217, + 183632.36147769052 + ], + [ + 534289.25772802, + 183628.97699438152 + ], + [ + 534299.4251987567, + 183638.14836380753 + ], + [ + 534330.5746032915, + 183641.19794210594 + ], + [ + 534334.8533790117, + 183636.85990650317 + ], + [ + 
534375.7247760285, + 183613.459571335 + ], + [ + 534422.5603115619, + 183626.940126969 + ], + [ + 534464.7309192211, + 183606.9133092533 + ], + [ + 534471.6955801392, + 183605.98493706266 + ], + [ + 534481.8038829465, + 183617.38073531695 + ], + [ + 534588.5903195359, + 183594.61541820713 + ], + [ + 534582.3329739553, + 183568.85492995696 + ], + [ + 534598.6517142366, + 183529.22629237536 + ], + [ + 534656.5239630286, + 183492.92506529193 + ], + [ + 534654.1909793459, + 183423.86893065734 + ], + [ + 534781.3901572204, + 183495.12499185512 + ], + [ + 534803.3031006856, + 183480.127294405 + ], + [ + 534843.4397878762, + 183484.53139197238 + ], + [ + 534848.2354977566, + 183486.8843684813 + ], + [ + 534853.813337246, + 183485.9196823354 + ], + [ + 534901.0161172879, + 183563.95757195458 + ], + [ + 534898.8904922986, + 183591.72143490257 + ], + [ + 534901.6350458016, + 183592.90716247621 + ], + [ + 534983.4868244318, + 183568.37485173013 + ], + [ + 535025.5688460219, + 183603.99094667443 + ], + [ + 535048.6325535319, + 183597.9274125757 + ], + [ + 535060.7478173448, + 183586.008682548 + ], + [ + 535071.2986004304, + 183580.72525295737 + ], + [ + 535182.3515840536, + 183580.3420830167 + ], + [ + 535231.8708644358, + 183597.24007576355 + ], + [ + 535263.6963046396, + 183627.02099730785 + ], + [ + 535289.2972956816, + 183629.92867538054 + ], + [ + 535356.4680675853, + 183609.4624531094 + ], + [ + 535406.0525481519, + 183571.83582854533 + ], + [ + 535406.7460762692, + 183571.85432246135 + ], + [ + 535514.1023035615, + 183527.979870348 + ], + [ + 535589.5978763667, + 183507.73880783212 + ], + [ + 535604.2215525524, + 183505.90367014642 + ], + [ + 535656.2962848989, + 183505.068944166 + ], + [ + 535680.4512552277, + 183510.1655864418 + ], + [ + 535657.927840578, + 183288.11259578395 + ], + [ + 535696.4204844927, + 183250.19230655016 + ], + [ + 535766.7486843397, + 183267.65151389752 + ], + [ + 535781.9380575956, + 183244.6883788783 + ], + [ + 535781.9380575956, + 
183244.6883788783 + ] + ] + ], + [ + [] + ] + ] + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "type", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "properties", + "type": "{\"type\":\"struct\",\"fields\":[{\"name\":\"Description\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"Name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}" + }, + { + "metadata": "{}", + "name": "geometry", + "type": "{\"type\":\"struct\",\"fields\":[{\"name\":\"type_id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"srid\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"boundary\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":\"double\",\"containsNull\":true},\"containsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"holes\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":\"double\",\"containsNull\":true},\"containsNull\":true},\"containsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}}]}" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "postcodes_bng = (\n", + " postcodes.select(\n", + " \"type\", \"properties\", \"geometry\"\n", + " ).withColumn(\n", + " \"geometry\", mos.st_setsrid(\"geometry\", lit(4326))\n", + " ).withColumn(\n", + " \"geometry\", mos.st_transform(\"geometry\", lit(27700))\n", + " )\n", + ")\n", + "postcodes_bng.limit(1).display() # <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1d0e1078-8088-45dc-8607-74e99ca98c59", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Example: Compute some basic geometry attributes._\n", + "\n", + "> Mosaic provides a number of functions for extracting the properties of geometries. Below are some that are relevant to Polygon geometries:" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "261519da-73b4-4ef3-830e-b74e6eff28ac", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
geometrycalculated_areacalculated_length
List(5, 27700, List(List(List(535781.9380575956, 183244.6883788783), List(535824.9598637373, 183219.13114683155), List(535857.6174945063, 183217.77906512795), List(535937.9652897029, 183172.0778235984), List(535948.0833085249, 183131.17423886806), List(535948.1726131596, 183127.83816223528), List(535954.1083358275, 183113.53036918928), List(535992.1193609729, 183067.80952594848), List(535993.5661128352, 183065.62262108468), List(536023.433274843, 183013.00698265154), List(536023.6417598329, 183005.22280622984), List(536019.9864220938, 182986.20691772352), List(535991.10526591, 182924.22825999785), List(536006.9008830489, 182904.62042294373), List(536029.5004004786, 182760.5587263456), List(535962.3273532258, 182728.7139400427), List(535957.3825413126, 182731.9200414074), List(535855.981967767, 182708.06319533306), List(535842.6443589143, 182687.67569398397), List(535833.745864351, 182682.98643173242), List(535806.7826882608, 182678.92696711444), List(535787.8063862727, 182661.7273213567), List(535768.6516090925, 182651.19990051258), List(535724.5650621236, 182664.48831547052), List(535671.083893747, 182639.69031467568), List(535649.9178289331, 182652.47882477642), List(535645.0325510963, 182653.46117701154), List(535581.7803402226, 182630.62884472188), List(535559.116011542, 182595.52657006297), List(535547.3534337038, 182594.09984491928), List(535512.2253112149, 182609.85480291082), List(535477.6131089614, 182632.3006572011), List(535454.7701589285, 182603.87101454858), List(535441.0515128053, 182545.6387309049), List(535441.3480281356, 182534.51847295318), List(535392.7614843239, 182534.3359901223), List(535378.7871684541, 182511.70721004112), List(535355.4689781135, 182501.07044976193), List(535332.54809557, 182501.57253937522), List(535319.7830267497, 182485.65304688836), List(535254.2094731658, 182471.66562843113), List(535233.8983120153, 182504.50927462179), List(535184.99899094, 182542.15613858588), List(535171.1848080601, 182539.5628792516), 
List(535188.4320349725, 182413.1610924584), List(535171.4449779494, 182399.35527161031), List(535156.8775325917, 182398.96765152615), List(535126.1481960944, 182405.93981478648), List(535117.3964801998, 182395.69169817545), List(535113.2935073929, 182393.35693709343), List(535046.0944043219, 182388.23154424207), List(535040.426642115, 182392.53211760416), List(535005.0780822264, 182390.4797147903), List(534989.4483038574, 182377.82341214077), List(534929.5250849993, 182386.2466646775), List(534926.6912537713, 182388.39701484208), List(534902.353081025, 182389.97622999403), List(534891.8446569596, 182367.44095034344), List(534822.0030447827, 182331.08968924946), List(534793.2369068528, 182342.5672069927), List(534789.0747382878, 182342.45676511177), List(534768.4409085622, 182335.2324451239), List(534716.2662738861, 182339.41255793435), List(534709.0639194129, 182349.23683925637), List(534606.2339552218, 182326.48043027992), List(534597.7622726331, 182331.81999374554), List(534580.2859382916, 182310.2136126347), List(534494.4274051443, 182275.668296282), List(534478.9138032356, 182258.56548488926), List(534448.3907594526, 182257.75755606516), List(534429.425290218, 182266.15809321858), List(534391.2421295254, 182266.26060967514), List(534373.558798522, 182252.43915361603), List(534276.0107687588, 182239.84453922248), List(534252.1718241313, 182222.5224069195), List(534227.5393820464, 182235.22514476796), List(534186.199365603, 182249.71225135983), List(534110.3320511248, 182231.01675352425), List(534087.0688351066, 182218.16183286835), List(534079.1258644154, 182203.4857467148), List(534054.9339733121, 182199.50889328966), List(534015.597704571, 182190.68140581233), List(533992.8122403203, 182212.33640354726), List(533866.3224438406, 182217.90405003884), List(533882.0043343764, 182255.03975232004), List(533882.1118874322, 182277.29861049942), List(533907.563830076, 182312.466308441), List(533923.3136380416, 182320.67111585021), List(533934.5394068043, 
182368.8175582806), List(533925.0523586851, 182386.37225865485), List(533873.5143864275, 182392.80335840082), List(533859.7871893895, 182386.877599178), List(533847.8575499062, 182365.42002740753), List(533822.8062397578, 182341.39119331172), List(533820.7837512142, 182339.11231186916), List(533811.8536204272, 182335.5386665268), List(533775.6351340465, 182340.14873929322), List(533754.5225045574, 182377.4279951505), List(533726.5993517478, 182383.3696325071), List(533705.2706813519, 182376.13139397942), List(533704.0296535669, 182370.53473031026), List(533661.5269331775, 182323.7914324954), List(533650.1852706587, 182359.10259587853), List(533643.7957804046, 182364.49848743435), List(533592.6674969478, 182355.3640262651), List(533579.048749956, 182371.69786662376), List(533563.729136071, 182373.5206389891), List(533547.1391008857, 182370.85886459786), List(533500.3328063814, 182408.57647217682), List(533491.708691396, 182446.18495565542), List(533516.9947808256, 182461.31590327783), List(533518.8633508299, 182495.86176251573), List(533511.4005260145, 182515.695971929), List(533457.2576487551, 182542.0931076025), List(533476.0373819193, 182567.06810922833), List(533490.3257582185, 182630.87308835395), List(533456.6213927829, 182645.56658983114), List(533442.9378613638, 182690.83179383923), List(533400.0925503863, 182736.4438360546), List(533442.8860284169, 182772.06468460686), List(533444.820844416, 182777.67950834992), List(533441.031309065, 182789.82072529983), List(533362.30066867, 182801.1064503934), List(533360.1029890878, 182805.4999354959), List(533359.7526092468, 182818.84430400614), List(533344.485770287, 182845.15060313017), List(533336.0022694117, 182903.90613927244), List(533341.1424542875, 182919.62024957192), List(533339.8357444134, 182942.95468899893), List(533393.4991665736, 182961.05581658782), List(533408.8094940147, 182985.93951877212), List(533442.6499810042, 182992.39254114387), List(533445.5628353866, 183013.61224821693), 
List(533469.5465898013, 183025.3704739589), List(533463.8439242825, 183057.49177497294), List(533500.1287308584, 183102.95741226373), List(533466.967019293, 183149.93616036046), List(533503.3892198745, 183216.54865868646), List(533465.176452015, 183244.47702701658), List(533424.8910481959, 183245.6440381208), List(533424.1682635264, 183246.73784710694), List(533418.6045028507, 183300.00601616694), List(533411.4936098668, 183306.49599349493), List(533453.7188489789, 183389.95262020518), List(533448.7471163005, 183394.2731622412), List(533416.6101809462, 183402.33112426393), List(533386.8603066149, 183451.62554262602), List(533460.2507298898, 183484.71224287833), List(533477.0709651434, 183478.4775450573), List(533490.0435256548, 183486.6081507582), List(533491.6856267123, 183503.3433177545), List(533474.3100794131, 183530.70656481414), List(533439.575167135, 183532.01926295372), List(533414.6004345681, 183558.070208603), List(533421.9954622076, 183567.16690503713), List(533465.4536569463, 183577.21137004573), List(533467.4970158483, 183605.08507556678), List(533466.7158090527, 183608.4029416383), List(533448.3920701249, 183619.04934156616), List(533447.9536899813, 183635.72981729486), List(533477.7375418543, 183664.33265876118), List(533466.545667419, 183694.08407003182), List(533490.6933227498, 183725.8772636903), List(533528.2886847861, 183721.30179077585), List(533557.5998214906, 183688.68865956645), List(533611.3715561538, 183702.3441248663), List(533634.7255277758, 183685.15388438356), List(533680.0872158157, 183701.92715838), List(533737.6963941175, 183648.91685636976), List(533746.7999457252, 183645.81822844158), List(533761.7057262553, 183659.56450243742), List(533819.2086347946, 183663.3052394627), List(533857.2738980403, 183640.93969440804), List(533880.3842868664, 183659.35383981554), List(533933.0619141799, 183661.8558411452), List(533947.3721862542, 183645.5412537885), List(533998.086794612, 183643.54092136555), List(534006.174193125, 
183652.65679096844), List(534075.89609284, 183666.7381784143), List(534091.6525898686, 183648.23655415466), List(534146.5283805125, 183646.3475924168), List(534147.9154105934, 183646.38423628634), List(534186.2350593824, 183640.7199105107), List(534188.979735214, 183641.9052504226), List(534236.2269981015, 183639.81565870665), List(534248.916094217, 183632.36147769052), List(534289.25772802, 183628.97699438152), List(534299.4251987567, 183638.14836380753), List(534330.5746032915, 183641.19794210594), List(534334.8533790117, 183636.85990650317), List(534375.7247760285, 183613.459571335), List(534422.5603115619, 183626.940126969), List(534464.7309192211, 183606.9133092533), List(534471.6955801392, 183605.98493706266), List(534481.8038829465, 183617.38073531695), List(534588.5903195359, 183594.61541820713), List(534582.3329739553, 183568.85492995696), List(534598.6517142366, 183529.22629237536), List(534656.5239630286, 183492.92506529193), List(534654.1909793459, 183423.86893065734), List(534781.3901572204, 183495.12499185512), List(534803.3031006856, 183480.127294405), List(534843.4397878762, 183484.53139197238), List(534848.2354977566, 183486.8843684813), List(534853.813337246, 183485.9196823354), List(534901.0161172879, 183563.95757195458), List(534898.8904922986, 183591.72143490257), List(534901.6350458016, 183592.90716247621), List(534983.4868244318, 183568.37485173013), List(535025.5688460219, 183603.99094667443), List(535048.6325535319, 183597.9274125757), List(535060.7478173448, 183586.008682548), List(535071.2986004304, 183580.72525295737), List(535182.3515840536, 183580.3420830167), List(535231.8708644358, 183597.24007576355), List(535263.6963046396, 183627.02099730785), List(535289.2972956816, 183629.92867538054), List(535356.4680675853, 183609.4624531094), List(535406.0525481519, 183571.83582854533), List(535406.7460762692, 183571.85432246135), List(535514.1023035615, 183527.979870348), List(535589.5978763667, 183507.73880783212), List(535604.2215525524, 
183505.90367014642), List(535656.2962848989, 183505.068944166), List(535680.4512552277, 183510.1655864418), List(535657.927840578, 183288.11259578395), List(535696.4204844927, 183250.19230655016), List(535766.7486843397, 183267.65151389752), List(535781.9380575956, 183244.6883788783), List(535781.9380575956, 183244.6883788783))), List(List()))2912908.09396458048925.18009182342
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + [ + 5, + 27700, + [ + [ + [ + 535781.9380575956, + 183244.6883788783 + ], + [ + 535824.9598637373, + 183219.13114683155 + ], + [ + 535857.6174945063, + 183217.77906512795 + ], + [ + 535937.9652897029, + 183172.0778235984 + ], + [ + 535948.0833085249, + 183131.17423886806 + ], + [ + 535948.1726131596, + 183127.83816223528 + ], + [ + 535954.1083358275, + 183113.53036918928 + ], + [ + 535992.1193609729, + 183067.80952594848 + ], + [ + 535993.5661128352, + 183065.62262108468 + ], + [ + 536023.433274843, + 183013.00698265154 + ], + [ + 536023.6417598329, + 183005.22280622984 + ], + [ + 536019.9864220938, + 182986.20691772352 + ], + [ + 535991.10526591, + 182924.22825999785 + ], + [ + 536006.9008830489, + 182904.62042294373 + ], + [ + 536029.5004004786, + 182760.5587263456 + ], + [ + 535962.3273532258, + 182728.7139400427 + ], + [ + 535957.3825413126, + 182731.9200414074 + ], + [ + 535855.981967767, + 182708.06319533306 + ], + [ + 535842.6443589143, + 182687.67569398397 + ], + [ + 535833.745864351, + 182682.98643173242 + ], + [ + 535806.7826882608, + 182678.92696711444 + ], + [ + 535787.8063862727, + 182661.7273213567 + ], + [ + 535768.6516090925, + 182651.19990051258 + ], + [ + 535724.5650621236, + 182664.48831547052 + ], + [ + 535671.083893747, + 182639.69031467568 + ], + [ + 535649.9178289331, + 182652.47882477642 + ], + [ + 535645.0325510963, + 182653.46117701154 + ], + [ + 535581.7803402226, + 182630.62884472188 + ], + [ + 535559.116011542, + 182595.52657006297 + ], + [ + 535547.3534337038, + 182594.09984491928 + ], + [ + 535512.2253112149, + 182609.85480291082 + ], + [ + 535477.6131089614, + 182632.3006572011 + ], + [ + 535454.7701589285, + 182603.87101454858 + ], + [ + 
535441.0515128053, + 182545.6387309049 + ], + [ + 535441.3480281356, + 182534.51847295318 + ], + [ + 535392.7614843239, + 182534.3359901223 + ], + [ + 535378.7871684541, + 182511.70721004112 + ], + [ + 535355.4689781135, + 182501.07044976193 + ], + [ + 535332.54809557, + 182501.57253937522 + ], + [ + 535319.7830267497, + 182485.65304688836 + ], + [ + 535254.2094731658, + 182471.66562843113 + ], + [ + 535233.8983120153, + 182504.50927462179 + ], + [ + 535184.99899094, + 182542.15613858588 + ], + [ + 535171.1848080601, + 182539.5628792516 + ], + [ + 535188.4320349725, + 182413.1610924584 + ], + [ + 535171.4449779494, + 182399.35527161031 + ], + [ + 535156.8775325917, + 182398.96765152615 + ], + [ + 535126.1481960944, + 182405.93981478648 + ], + [ + 535117.3964801998, + 182395.69169817545 + ], + [ + 535113.2935073929, + 182393.35693709343 + ], + [ + 535046.0944043219, + 182388.23154424207 + ], + [ + 535040.426642115, + 182392.53211760416 + ], + [ + 535005.0780822264, + 182390.4797147903 + ], + [ + 534989.4483038574, + 182377.82341214077 + ], + [ + 534929.5250849993, + 182386.2466646775 + ], + [ + 534926.6912537713, + 182388.39701484208 + ], + [ + 534902.353081025, + 182389.97622999403 + ], + [ + 534891.8446569596, + 182367.44095034344 + ], + [ + 534822.0030447827, + 182331.08968924946 + ], + [ + 534793.2369068528, + 182342.5672069927 + ], + [ + 534789.0747382878, + 182342.45676511177 + ], + [ + 534768.4409085622, + 182335.2324451239 + ], + [ + 534716.2662738861, + 182339.41255793435 + ], + [ + 534709.0639194129, + 182349.23683925637 + ], + [ + 534606.2339552218, + 182326.48043027992 + ], + [ + 534597.7622726331, + 182331.81999374554 + ], + [ + 534580.2859382916, + 182310.2136126347 + ], + [ + 534494.4274051443, + 182275.668296282 + ], + [ + 534478.9138032356, + 182258.56548488926 + ], + [ + 534448.3907594526, + 182257.75755606516 + ], + [ + 534429.425290218, + 182266.15809321858 + ], + [ + 534391.2421295254, + 182266.26060967514 + ], + [ + 534373.558798522, + 
182252.43915361603 + ], + [ + 534276.0107687588, + 182239.84453922248 + ], + [ + 534252.1718241313, + 182222.5224069195 + ], + [ + 534227.5393820464, + 182235.22514476796 + ], + [ + 534186.199365603, + 182249.71225135983 + ], + [ + 534110.3320511248, + 182231.01675352425 + ], + [ + 534087.0688351066, + 182218.16183286835 + ], + [ + 534079.1258644154, + 182203.4857467148 + ], + [ + 534054.9339733121, + 182199.50889328966 + ], + [ + 534015.597704571, + 182190.68140581233 + ], + [ + 533992.8122403203, + 182212.33640354726 + ], + [ + 533866.3224438406, + 182217.90405003884 + ], + [ + 533882.0043343764, + 182255.03975232004 + ], + [ + 533882.1118874322, + 182277.29861049942 + ], + [ + 533907.563830076, + 182312.466308441 + ], + [ + 533923.3136380416, + 182320.67111585021 + ], + [ + 533934.5394068043, + 182368.8175582806 + ], + [ + 533925.0523586851, + 182386.37225865485 + ], + [ + 533873.5143864275, + 182392.80335840082 + ], + [ + 533859.7871893895, + 182386.877599178 + ], + [ + 533847.8575499062, + 182365.42002740753 + ], + [ + 533822.8062397578, + 182341.39119331172 + ], + [ + 533820.7837512142, + 182339.11231186916 + ], + [ + 533811.8536204272, + 182335.5386665268 + ], + [ + 533775.6351340465, + 182340.14873929322 + ], + [ + 533754.5225045574, + 182377.4279951505 + ], + [ + 533726.5993517478, + 182383.3696325071 + ], + [ + 533705.2706813519, + 182376.13139397942 + ], + [ + 533704.0296535669, + 182370.53473031026 + ], + [ + 533661.5269331775, + 182323.7914324954 + ], + [ + 533650.1852706587, + 182359.10259587853 + ], + [ + 533643.7957804046, + 182364.49848743435 + ], + [ + 533592.6674969478, + 182355.3640262651 + ], + [ + 533579.048749956, + 182371.69786662376 + ], + [ + 533563.729136071, + 182373.5206389891 + ], + [ + 533547.1391008857, + 182370.85886459786 + ], + [ + 533500.3328063814, + 182408.57647217682 + ], + [ + 533491.708691396, + 182446.18495565542 + ], + [ + 533516.9947808256, + 182461.31590327783 + ], + [ + 533518.8633508299, + 182495.86176251573 + ], + [ + 
533511.4005260145, + 182515.695971929 + ], + [ + 533457.2576487551, + 182542.0931076025 + ], + [ + 533476.0373819193, + 182567.06810922833 + ], + [ + 533490.3257582185, + 182630.87308835395 + ], + [ + 533456.6213927829, + 182645.56658983114 + ], + [ + 533442.9378613638, + 182690.83179383923 + ], + [ + 533400.0925503863, + 182736.4438360546 + ], + [ + 533442.8860284169, + 182772.06468460686 + ], + [ + 533444.820844416, + 182777.67950834992 + ], + [ + 533441.031309065, + 182789.82072529983 + ], + [ + 533362.30066867, + 182801.1064503934 + ], + [ + 533360.1029890878, + 182805.4999354959 + ], + [ + 533359.7526092468, + 182818.84430400614 + ], + [ + 533344.485770287, + 182845.15060313017 + ], + [ + 533336.0022694117, + 182903.90613927244 + ], + [ + 533341.1424542875, + 182919.62024957192 + ], + [ + 533339.8357444134, + 182942.95468899893 + ], + [ + 533393.4991665736, + 182961.05581658782 + ], + [ + 533408.8094940147, + 182985.93951877212 + ], + [ + 533442.6499810042, + 182992.39254114387 + ], + [ + 533445.5628353866, + 183013.61224821693 + ], + [ + 533469.5465898013, + 183025.3704739589 + ], + [ + 533463.8439242825, + 183057.49177497294 + ], + [ + 533500.1287308584, + 183102.95741226373 + ], + [ + 533466.967019293, + 183149.93616036046 + ], + [ + 533503.3892198745, + 183216.54865868646 + ], + [ + 533465.176452015, + 183244.47702701658 + ], + [ + 533424.8910481959, + 183245.6440381208 + ], + [ + 533424.1682635264, + 183246.73784710694 + ], + [ + 533418.6045028507, + 183300.00601616694 + ], + [ + 533411.4936098668, + 183306.49599349493 + ], + [ + 533453.7188489789, + 183389.95262020518 + ], + [ + 533448.7471163005, + 183394.2731622412 + ], + [ + 533416.6101809462, + 183402.33112426393 + ], + [ + 533386.8603066149, + 183451.62554262602 + ], + [ + 533460.2507298898, + 183484.71224287833 + ], + [ + 533477.0709651434, + 183478.4775450573 + ], + [ + 533490.0435256548, + 183486.6081507582 + ], + [ + 533491.6856267123, + 183503.3433177545 + ], + [ + 533474.3100794131, + 
183530.70656481414 + ], + [ + 533439.575167135, + 183532.01926295372 + ], + [ + 533414.6004345681, + 183558.070208603 + ], + [ + 533421.9954622076, + 183567.16690503713 + ], + [ + 533465.4536569463, + 183577.21137004573 + ], + [ + 533467.4970158483, + 183605.08507556678 + ], + [ + 533466.7158090527, + 183608.4029416383 + ], + [ + 533448.3920701249, + 183619.04934156616 + ], + [ + 533447.9536899813, + 183635.72981729486 + ], + [ + 533477.7375418543, + 183664.33265876118 + ], + [ + 533466.545667419, + 183694.08407003182 + ], + [ + 533490.6933227498, + 183725.8772636903 + ], + [ + 533528.2886847861, + 183721.30179077585 + ], + [ + 533557.5998214906, + 183688.68865956645 + ], + [ + 533611.3715561538, + 183702.3441248663 + ], + [ + 533634.7255277758, + 183685.15388438356 + ], + [ + 533680.0872158157, + 183701.92715838 + ], + [ + 533737.6963941175, + 183648.91685636976 + ], + [ + 533746.7999457252, + 183645.81822844158 + ], + [ + 533761.7057262553, + 183659.56450243742 + ], + [ + 533819.2086347946, + 183663.3052394627 + ], + [ + 533857.2738980403, + 183640.93969440804 + ], + [ + 533880.3842868664, + 183659.35383981554 + ], + [ + 533933.0619141799, + 183661.8558411452 + ], + [ + 533947.3721862542, + 183645.5412537885 + ], + [ + 533998.086794612, + 183643.54092136555 + ], + [ + 534006.174193125, + 183652.65679096844 + ], + [ + 534075.89609284, + 183666.7381784143 + ], + [ + 534091.6525898686, + 183648.23655415466 + ], + [ + 534146.5283805125, + 183646.3475924168 + ], + [ + 534147.9154105934, + 183646.38423628634 + ], + [ + 534186.2350593824, + 183640.7199105107 + ], + [ + 534188.979735214, + 183641.9052504226 + ], + [ + 534236.2269981015, + 183639.81565870665 + ], + [ + 534248.916094217, + 183632.36147769052 + ], + [ + 534289.25772802, + 183628.97699438152 + ], + [ + 534299.4251987567, + 183638.14836380753 + ], + [ + 534330.5746032915, + 183641.19794210594 + ], + [ + 534334.8533790117, + 183636.85990650317 + ], + [ + 534375.7247760285, + 183613.459571335 + ], + [ + 
534422.5603115619, + 183626.940126969 + ], + [ + 534464.7309192211, + 183606.9133092533 + ], + [ + 534471.6955801392, + 183605.98493706266 + ], + [ + 534481.8038829465, + 183617.38073531695 + ], + [ + 534588.5903195359, + 183594.61541820713 + ], + [ + 534582.3329739553, + 183568.85492995696 + ], + [ + 534598.6517142366, + 183529.22629237536 + ], + [ + 534656.5239630286, + 183492.92506529193 + ], + [ + 534654.1909793459, + 183423.86893065734 + ], + [ + 534781.3901572204, + 183495.12499185512 + ], + [ + 534803.3031006856, + 183480.127294405 + ], + [ + 534843.4397878762, + 183484.53139197238 + ], + [ + 534848.2354977566, + 183486.8843684813 + ], + [ + 534853.813337246, + 183485.9196823354 + ], + [ + 534901.0161172879, + 183563.95757195458 + ], + [ + 534898.8904922986, + 183591.72143490257 + ], + [ + 534901.6350458016, + 183592.90716247621 + ], + [ + 534983.4868244318, + 183568.37485173013 + ], + [ + 535025.5688460219, + 183603.99094667443 + ], + [ + 535048.6325535319, + 183597.9274125757 + ], + [ + 535060.7478173448, + 183586.008682548 + ], + [ + 535071.2986004304, + 183580.72525295737 + ], + [ + 535182.3515840536, + 183580.3420830167 + ], + [ + 535231.8708644358, + 183597.24007576355 + ], + [ + 535263.6963046396, + 183627.02099730785 + ], + [ + 535289.2972956816, + 183629.92867538054 + ], + [ + 535356.4680675853, + 183609.4624531094 + ], + [ + 535406.0525481519, + 183571.83582854533 + ], + [ + 535406.7460762692, + 183571.85432246135 + ], + [ + 535514.1023035615, + 183527.979870348 + ], + [ + 535589.5978763667, + 183507.73880783212 + ], + [ + 535604.2215525524, + 183505.90367014642 + ], + [ + 535656.2962848989, + 183505.068944166 + ], + [ + 535680.4512552277, + 183510.1655864418 + ], + [ + 535657.927840578, + 183288.11259578395 + ], + [ + 535696.4204844927, + 183250.19230655016 + ], + [ + 535766.7486843397, + 183267.65151389752 + ], + [ + 535781.9380575956, + 183244.6883788783 + ], + [ + 535781.9380575956, + 183244.6883788783 + ] + ] + ], + [ + [] + ] + ], + 
2912908.0939645804, + 8925.18009182342 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "geometry", + "type": "{\"type\":\"struct\",\"fields\":[{\"name\":\"type_id\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"srid\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"boundary\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":\"double\",\"containsNull\":true},\"containsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}},{\"name\":\"holes\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":{\"type\":\"array\",\"elementType\":\"double\",\"containsNull\":true},\"containsNull\":true},\"containsNull\":true},\"containsNull\":true},\"nullable\":true,\"metadata\":{}}]}" + }, + { + "metadata": "{}", + "name": "calculated_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "calculated_length", + "type": "\"double\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "display(\n", + " postcodes_bng\n", + " .withColumn(\"calculated_area\", mos.st_area(col(\"geometry\")))\n", + " .withColumn(\"calculated_length\", mos.st_length(col(\"geometry\")))\n", + " # Note: The unit of measure of the area and length depends on the CRS used.\n", + " # For British National Grid locations it will be square meters and meters\n", + " .select(\"geometry\", \"calculated_area\", \"calculated_length\")\n", + " .limit(1) # <- limiting for ipynb only\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + 
"application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c965eb2f-3b8b-4f25-8658-f9e6b6907b71", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Reproject UPRN Points to BNG SRID\n", + "\n", + "> The UPRNs table contains Unique Property Reference Numbers and positions provided in EPSG:27700 and EPSG:4326. Since we are operating in EPSG:27700 and using BNG as our indexing system, we will use the location data provided via Northings and Eastings coordinates." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a967ca2f-6205-42fa-8d7e-4e167c846373", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
UPRNX_COORDINATEY_COORDINATEuprn_point
10010266671542744.02176316.22POINT (542744.02 176316.22)
10010266672543142.76174400.84POINT (543142.76 174400.84)
10010266673540615.6175439.19POINT (540615.6 175439.19)
10010266674540614.66175444.2POINT (540614.66 175444.2)
10010266675540613.72175449.22POINT (540613.72 175449.22)
10010266676540615.45175454.73POINT (540615.45 175454.73)
10010266677540614.55175459.75POINT (540614.55 175459.75)
10010266678540613.65175464.78POINT (540613.65 175464.78)
10010266679537778.13177195.59POINT (537778.13 177195.59)
10010266680537778.13177195.59POINT (537778.13 177195.59)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 10010266671, + 542744.02, + 176316.22, + "POINT (542744.02 176316.22)" + ], + [ + 10010266672, + 543142.76, + 174400.84, + "POINT (543142.76 174400.84)" + ], + [ + 10010266673, + 540615.6, + 175439.19, + "POINT (540615.6 175439.19)" + ], + [ + 10010266674, + 540614.66, + 175444.2, + "POINT (540614.66 175444.2)" + ], + [ + 10010266675, + 540613.72, + 175449.22, + "POINT (540613.72 175449.22)" + ], + [ + 10010266676, + 540615.45, + 175454.73, + "POINT (540615.45 175454.73)" + ], + [ + 10010266677, + 540614.55, + 175459.75, + "POINT (540614.55 175459.75)" + ], + [ + 10010266678, + 540613.65, + 175464.78, + "POINT (540613.65 175464.78)" + ], + [ + 10010266679, + 537778.13, + 177195.59, + "POINT (537778.13 177195.59)" + ], + [ + 10010266680, + 537778.13, + 177195.59, + "POINT (537778.13 177195.59)" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "UPRN", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "X_COORDINATE", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "Y_COORDINATE", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "uprn_point", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "_uprns_bng = (\n", + " uprns\n", + " .withColumn(\"uprn_point\", mos.st_point(col(\"X_COORDINATE\"), col(\"Y_COORDINATE\")))\n", + " # we are using WKT here for simpler displaying, use 
WKB for faster query run time\n", + " .withColumn(\"uprn_point\", mos.st_aswkt(\"uprn_point\")) \n", + " .where(mos.st_hasvalidcoordinates(\"uprn_point\", lit('EPSG:27700'), lit('reprojected_bounds')))\n", + " .where(mos.st_isvalid(col(\"uprn_point\")))\n", + " .drop(\"LATITUDE\", \"LONGITUDE\")\n", + ")\n", + "_uprns_bng.limit(10).display() # <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "55b05a95-8893-45a1-bca5-c6dc3fe85e22", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> Next step is optional. However, since we are constructing POINT geometries and ensuring they are valid it is prudent to write out the validated dataset. That way we are making sure validation is performed only once at ingestion time and not each time spark runs the queries (due to spark lazy evaluation). " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "48ba2ff3-5223-46d0-82d6-9c98bf91f07e", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "spark.sql(\"drop table if exists uprns_bng\")\n", + "_uprns_bng.write.format(\"delta\").mode(\"overwrite\").saveAsTable(\"uprns_bng\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2d9c3987-ca29-4f44-a9c1-e0acd64a89ce", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 40,591,073\n" + ] + } + ], + "source": [ + "uprns_bng = spark.read.table(\"uprns_bng\")\n", + "print(f\"count? 
{uprns_bng.count():,}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "7f63e084-67a6-4fb0-90c8-505346ab72f6", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Spatial Joins\n", + "\n", + "> We can use Mosaic to perform spatial joins both with and without Mosaic indexing strategies. Indexing is very important when handling very different geometries both in size and in shape (i.e. number of vertices). In the context of Mosaic we are using grid index systems rather than traditional tree-based index systems. The reason for this is the fact that grid index systems like BNG and/or H3 are far better suited for distributed massive scale systems. Mosaic comes with grid_tessellate expressions that allow the caller to index an arbitrary shape within the grid index system of choice. One thing to note here is that tessellation is a specialised way of converting a geometry to a set of grid index system cells with their local geometries.
\n", + "\n", + "__Tessellation is applicable to any shape, Polygon, LineString, Points and their Multi* variants.__" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9aeb0674-fd9b-4685-864e-4783fbb8ab6d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### [1] Getting the optimal resolution\n", + "\n", + "> We can use Mosaic functionality to identify how to best index/tessellate our data based on the data inside the specific dataframe.
\n", + "Selecting an appropriate tessellation resolution can have a considerable impact on the performance.
" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ca8daa25-4062-45c4-8703-87a07685a4b6", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\n Optimal resolution code is :-4.\n Optimal resolution name is :500m.\n\n" + ] + } + ], + "source": [ + "from mosaic import MosaicFrame\n", + "\n", + "postcodes_mosaic_frame = MosaicFrame(postcodes_bng, \"geometry\")\n", + "optimal_resolution = postcodes_mosaic_frame.get_optimal_resolution(sample_fraction=0.75)\n", + "optimal_resolution_str = postcodes_mosaic_frame.get_optimal_resolution_str(sample_fraction=0.75)\n", + "\n", + "print(f\"\"\"\n", + " Optimal resolution code is :{optimal_resolution}.\n", + " Optimal resolution name is :{optimal_resolution_str}.\n", + "\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "16f26da4-32ef-46bf-8eb3-7f1f470d16b8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "\n", + "> Not every resolution will yield performance improvements. By a rule of thumb it is always better to select more coarse resolution than to select a more fine grained resolution - if not sure select a lower resolution. Tessellation is a trade off between decomposition and explosion factor. The more fine grained the resolution is the more explosion of rows will impact the preprocessing time. However, it will make data more parallel. On the other hand, if the resolution is too coarse we are not addressing localisation related data skews. You can think of Mosaic's tessellation as a way to partition an overly complex row into multiple rows that have a balanced amount of computation each." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "85d9df4f-c1a6-4f9d-8da8-dd905fee243c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
resolutionmean_index_areamean_geometry_areapercentile_25_geometry_areapercentile_50_geometry_areapercentile_75_geometry_area
410000.0412.35439945734294101.33915798543552393.67019398293326579.0929378698623
-4250000.016.494175978293724.053566319417420515.74680775931733123.163717514794495
-52500.01649.4175978293717405.35663194174211574.6807759317332316.3717514794494
31000000.04.123543994573431.01339157985435513.9367019398293335.790929378698624
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 4, + 10000.0, + 412.35439945734294, + 101.33915798543552, + 393.67019398293326, + 579.0929378698623 + ], + [ + -4, + 250000.0, + 16.49417597829372, + 4.0535663194174205, + 15.746807759317331, + 23.163717514794495 + ], + [ + -5, + 2500.0, + 1649.4175978293717, + 405.3566319417421, + 1574.680775931733, + 2316.3717514794494 + ], + [ + 3, + 1000000.0, + 4.12354399457343, + 1.0133915798543551, + 3.936701939829333, + 5.790929378698624 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "resolution", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "mean_index_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "mean_geometry_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "percentile_25_geometry_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "percentile_50_geometry_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "percentile_75_geometry_area", + "type": "\"double\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "display(\n", + " postcodes_mosaic_frame.get_resolution_metrics(sample_rows=150)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4acb267c-c186-4ab3-acf2-e624ccb624ae", + "showTitle": false, + "title": "" + } + }, + 
"source": [ + "### [2] Indexing/Tessellating using the optimal resolution\n", + "\n", + "> We will use mosaic sql functions to index our points data. Here we will use resolution -4 (500m); index resolution depends on the dataset in use. There is a second best choice which is 4 (100m). The user can pass either numerical resolution or the string label to the grid expressions. BNG provides 2 types of hierarchies. The standard hierarchy which operates with index resolutions in base 10 (i.e. (6, 1m), (5, 10m), (4, 100m), (3, 1km), (2, 10km), (1, 100km)) and cell ids follow the format of letter pair followed by coordinate bins at the selected resolution (e.g. TQ100100 for (4, 100m)). The quad hierarchy (or quadrant hierarchy) which operates with index resolutions in base 5 (i.e. (-6, 5m), (-5, 50m), (-4, 500m), (-3, 5km), (-2, 50km), (-1, 500km)) and cell ids follow the format of letter pair followed by coordinate bins at the selected resolution and followed by quadrant letters (e.g. TQ100100SW for (-4, 500m)). Quadrants correspond to compass directions SW (south west), NW (north west), NE (north east) and SE (south east)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c24e60b6-9e35-46c9-b1c9-9bc8807efe79", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Full processing of all the Cycling data can take a little while, so we offer you a full or sample option, depending on your appetite._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "32ba3606-791b-4510-8482-7c4226bd64d8", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 
406,871\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
UPRNX_COORDINATEY_COORDINATEuprn_pointuprn_bng_500muprn_bng_500m_struprn_bng_100m_str
10023609852268073.09213302.63POINT (268073.09 213302.63)SN6813SWSN6813SWSN680133
10023610059225741.11214655.3POINT (225741.11 214655.3)SN2514NESN2514NESN257146
10023610071250735.0200942.0POINT (250735 200942)SN5000NESN5000NESN507009
10023610128250243.0199800.0POINT (250243 199800)SS5099NWSS5099NWSS502998
10023610129251011.0200584.0POINT (251011 200584)SN5100NWSN5100NWSN510005
10023610195250831.0199092.0POINT (250831 199092)SS5099SESS5099SESS508990
10023610218255862.0201157.0POINT (255862 201157)SN5501SESN5501SESN558011
10023610233241079.0207472.0POINT (241079 207472)SN4107SWSN4107SWSN410074
10023610499253442.0200037.0POINT (253442 200037)SN5300SWSN5300SWSN534000
10023610552242093.0219684.0POINT (242093 219684)SN4219NWSN4219NWSN420196
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 10023609852, + 268073.09, + 213302.63, + "POINT (268073.09 213302.63)", + "SN6813SW", + "SN6813SW", + "SN680133" + ], + [ + 10023610059, + 225741.11, + 214655.3, + "POINT (225741.11 214655.3)", + "SN2514NE", + "SN2514NE", + "SN257146" + ], + [ + 10023610071, + 250735.0, + 200942.0, + "POINT (250735 200942)", + "SN5000NE", + "SN5000NE", + "SN507009" + ], + [ + 10023610128, + 250243.0, + 199800.0, + "POINT (250243 199800)", + "SS5099NW", + "SS5099NW", + "SS502998" + ], + [ + 10023610129, + 251011.0, + 200584.0, + "POINT (251011 200584)", + "SN5100NW", + "SN5100NW", + "SN510005" + ], + [ + 10023610195, + 250831.0, + 199092.0, + "POINT (250831 199092)", + "SS5099SE", + "SS5099SE", + "SS508990" + ], + [ + 10023610218, + 255862.0, + 201157.0, + "POINT (255862 201157)", + "SN5501SE", + "SN5501SE", + "SN558011" + ], + [ + 10023610233, + 241079.0, + 207472.0, + "POINT (241079 207472)", + "SN4107SW", + "SN4107SW", + "SN410074" + ], + [ + 10023610499, + 253442.0, + 200037.0, + "POINT (253442 200037)", + "SN5300SW", + "SN5300SW", + "SN534000" + ], + [ + 10023610552, + 242093.0, + 219684.0, + "POINT (242093 219684)", + "SN4219NW", + "SN4219NW", + "SN420196" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "UPRN", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "X_COORDINATE", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "Y_COORDINATE", + "type": 
"\"double\"" + }, + { + "metadata": "{}", + "name": "uprn_point", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "uprn_bng_500m", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "uprn_bng_500m_str", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "uprn_bng_100m_str", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "uprns_opt = (\n", + " uprns_bng\n", + " .withColumn(\"uprn_bng_500m\", mos.grid_pointascellid(\"uprn_point\", F.lit(optimal_resolution)))\n", + " .withColumn(\"uprn_bng_500m_str\", mos.grid_pointascellid(\"uprn_point\", F.lit(optimal_resolution_str)))\n", + " .withColumn(\"uprn_bng_100m_str\", mos.grid_pointascellid(\"uprn_point\", F.lit(\"100m\")))\n", + ")\n", + "# - uncomment to only use a 1% sample\n", + "# using 400K of the 40M\n", + "uprns_opt = uprns_opt.sample(0.01)\n", + "\n", + "print(f\"count? {uprns_opt.count():,}\")\n", + "uprns_opt.limit(10).display() # <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ef8de0cd-6673-4da8-a1df-32c4acfaaed6", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> Mosaic has a builtin wrappers for KeplerGL map plots using mosaic_kepler IPython magics. Mosaic magics automatically handle bng grid idex system and CRS conversion for you. Given that Kepler Plots are rendered on the browser side we are automatically limiting the row count to 1000. The end user can override the number of ploted rows by specifying the desired number." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "87b711fe-adde-493e-8a95-200f49d2df2f", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "spark.catalog.clearCache() # <- cache is useful for dev (avoid recomputes)\n", + "count_per_index = uprns_opt.groupBy(\"uprn_bng_500m\").count().cache()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e483feda-4bc5-43b5-a1f7-1cd50041ccd6", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "dfb141a7-df85-4a92-90a4-032d29064653", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results. 
Hint: you can configure layer properties._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "831bcb6d-6fb8-4ae9-8be0-5c3991e78efd", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# count_per_index \"uprn_bng_500m\" \"bng\" 50" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ff47c4db-1ac0-4359-b769-4dc0331767e6", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> We will use Mosaic to tessellate our postcode geometries using a built in tessellation generator (explode) function ." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7099bac3-78fe-4ce7-b596-aaf8cc41ef50", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
typepropertieschips
FeatureCollectionList(E2 postcode district, E2)List(true, TQ3482NW, AAAAAAMAAAABAAAABkEgS+AAAAAAQQZHIAAAAABBIE/IAAAAAEEGRyAAAAAAQSBPyAAAAABBBlbAAAAAAEEgS+AAAAAAQQZWwAAAAABBIEvgAAAAAEEGRyAAAAAAQSBL4AAAAABBBkcgAAAAAA==)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "FeatureCollection", + [ + "E2 postcode district", + "E2" + ], + [ + true, + "TQ3482NW", + "AAAAAAMAAAABAAAABkEgS+AAAAAAQQZHIAAAAABBIE/IAAAAAEEGRyAAAAAAQSBPyAAAAABBBlbAAAAAAEEgS+AAAAAAQQZWwAAAAABBIEvgAAAAAEEGRyAAAAAAQSBL4AAAAABBBkcgAAAAAA==" + ] + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "type", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "properties", + "type": "{\"type\":\"struct\",\"fields\":[{\"name\":\"Description\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"Name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}" + }, + { + "metadata": "{}", + "name": "chips", + "type": "{\"type\":\"struct\",\"fields\":[{\"name\":\"is_core\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"index_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"wkb\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}}]}" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "postcodes_with_index = (\n", + " postcodes_bng\n", + " # We break down the original geometry in multiple smaller mosaic chips\n", + " # each fully contained in a grid cell\n", + " .withColumn(\"chips\", mos.grid_tessellateexplode(col(\"geometry\"), F.lit(optimal_resolution)))\n", + " # We don't need the original geometry any more, since we have broken it down into\n", + " # Smaller mosaic 
chips.\n", + " .drop(\"json_geometry\", \"geometry\")\n", + ")\n", + "\n", + "postcodes_with_index.limit(1).display() # <- limiting for ipynb only" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "88749047-2354-488c-803e-b41bec0aa0dc", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "to_display = postcodes_with_index.select(\"properties.Name\", \"chips.index_id\", mos.st_aswkt(\"chips.wkb\").alias(\"geometry\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2ea12c68-7eac-4145-81fe-78b165fa8c58", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "99b91323-c21e-4f9c-95fa-3a7bed931ac2", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results. 
Hint: you can configure layer properties._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c2cc06bf-0c7c-49fd-923d-6cec0e70fc0b", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# to_display \"geometry\" \"geometry(27700)\" 200" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "be1e343e-f091-4493-88b5-b7aace1643e5", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### [3] Performing the spatial join\n", + "\n", + "> We can now do spatial join between our UPRNs and postcodes." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a48fb8aa-6abf-4fd9-a900-178eca780f69", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
DescriptionNameuprn_pointUPRNindex_idindex_geometry
E5 postcode districtE5POINT (535126 186727)10008234067TQ3586NWPOLYGON ((535000 186500, 535500 186500, 535500 187000, 535000 187000, 535000 186500, 535000 186500))
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "E5 postcode district", + "E5", + "POINT (535126 186727)", + 10008234067, + "TQ3586NW", + "POLYGON ((535000 186500, 535500 186500, 535500 187000, 535000 187000, 535000 186500, 535000 186500))" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "Description", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "Name", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "uprn_point", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "UPRN", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "index_id", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "index_geometry", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "with_postcodes = (\n", + " uprns_opt.join(\n", + " postcodes_with_index,\n", + " uprns_opt[\"uprn_bng_500m\"] == postcodes_with_index[\"chips.index_id\"],\n", + " how = \"right_outer\" # to perserve even emtpy chips\n", + " ).where(\n", + " # If the borough is a core chip (the chip is fully contained within the geometry), then we do not need\n", + " # to perform any intersection, because any point matching the same index will certainly be contained in\n", + " # the borough. 
Otherwise we need to perform an st_contains operation on the chip geometry.\n", + " col(\"chips.is_core\") | mos.st_contains(col(\"chips.wkb\"), col(\"uprn_point\"))\n", + " ).select(\n", + " \"properties.*\", \"uprn_point\", \"UPRN\", \"chips.index_id\", mos.st_aswkt(\"chips.wkb\").alias(\"index_geometry\")\n", + " )\n", + ")\n", + "\n", + "with_postcodes.limit(1).display() # <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "b06d302c-42a2-45c9-8042-ad52921d26e3", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Visualise the results in Kepler\n", + "\n", + "> We have already used this in the notebook, but want to call out that Mosaic abstracts interaction with Kepler in python through `mosaic_kepler` magic. The magic takes care of conversion between EPSG:27700 and EPSG:4326 so that Kepler can properly render. It can handle columns with bng index ids (int and str formats are both supported) and geometries that are provided in EPSG:27700. Mosaic will convert all the geometries for proper rendering." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "30b30c5e-cfef-4eda-ad13-b9f4f44e6e55", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "7f9cd19a-b587-49ec-bdc9-dc5b6480927c", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results. Hint: you can configure layer properties._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7e6f66d6-a1bd-46a0-aecc-e148ca0807fa", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# with_postcodes \"index_geometry\" \"geometry(27700)\" 5000" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a95edd3d-41f0-40c6-8655-f3af573d73d8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> Using mosaic it takes only a few lines of code to produce BNG based heat map and visualise it in Kepler. 
By default the colors won't be affected by the counts and you'd need to change the options in the Kepler UI. Navigate to the layer, expand it and for the fill color click on the 3 dots icon, then select count as the field for color scaling. " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "20124fe4-538e-4378-bf95-15de644695a4", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "properties_per_index = with_postcodes.groupBy(\"index_id\").count()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ea2d5649-30b0-481c-b2e0-46bdaab19e33", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "bfbed669-9885-4282-b5bf-f910b5539660", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results. 
Hint: you can configure layer properties._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d50a7fe8-f4fd-428d-8b08-b894cc6f45ae", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# properties_per_index \"index_id\" \"bng\" 6000" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "615cff88-d314-4290-90b9-0013214e7e1e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> We can do the same per chip." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "eddf229d-f65e-4d30-b85e-6ad673eacd07", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "properties_per_chip = with_postcodes.groupBy(\"index_geometry\").count()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6693ec87-ce2c-48a6-b32d-8680df5ba34d", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> Note that if you dont use \"right_outer\" join some chips may be empty. This is due to no UPRNs being located in those exact chips." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e6decba5-7443-49fa-8836-6f44b6d18169", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e4201501-8308-4116-bf56-c2f63a2c9528", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results. 
Hint: you can configure layer properties._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f7886a3e-cc59-4a61-a8ad-8e515f038d11", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# properties_per_chip \"index_geometry\" \"geometry(27700)\" 20000" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 85549842242909, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "transform_join_bng", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/examples/python/Validate/README.md b/notebooks/examples/python/Validate/README.md new file mode 100644 index 000000000..a0540a4b1 --- /dev/null +++ b/notebooks/examples/python/Validate/README.md @@ -0,0 +1,5 @@ +# Validate Examples + +> Examples of validating spatial data. 
+ +__Note: `ipynb` files can be previewed in GitHub and can also be imported into Databricks, more [here](https://docs.databricks.com/en/notebooks/notebook-export-import.html).__ diff --git a/notebooks/examples/python/Validate/shapely_validate_udfs.ipynb b/notebooks/examples/python/Validate/shapely_validate_udfs.ipynb new file mode 100644 index 000000000..e0f60a0ab --- /dev/null +++ b/notebooks/examples/python/Validate/shapely_validate_udfs.ipynb @@ -0,0 +1,1673 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "89a84928-6a2b-4aaa-b7b1-c2e011199bfb", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Shapely Validate Example \n", + "\n", + "> Parallel handling of a mixture of valid and invalid geometries using [regular](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.udf.html?highlight=udf#pyspark.sql.functions.udf) and [vectorized pandas](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.pandas_udf.html?highlight=pandas%20udf#pyspark.sql.functions.pandas_udf) UDFs.\n", + "\n", + "__Libraries__\n", + "\n", + "

\n", + "\n", + "* 'databricks-mosaic' (installs geopandas and dependencies as well as keplergl)\n", + "\n", + "--- \n", + " __Last Update__ 22 NOV 2023 [Mosaic 0.3.12]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f9812f64-8eff-4888-8d15-85d60aa3464f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6cc73475-225a-4f32-8ddc-93c564095776", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "97b74931-0c94-41e6-afa3-b7a3236ecce2", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Python interpreter will be restarted.\nPython interpreter will be restarted.\n" + ] + } + ], + "source": [ + "%pip install \"databricks-mosaic<0.4,>=0.3\" --quiet # <- Mosaic 0.3 series\n", + "# %pip install \"databricks-mosaic<0.5,>=0.4\" --quiet # <- Mosaic 0.4 series (as available)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d01ad97d-188a-4ce0-ab7c-28029e77d30b", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# -- configure AQE for more compute heavy operations\n", + "# - choose option-1 or option-2 below, essential for REPARTITION!\n", + "# spark.conf.set(\"spark.databricks.optimizer.adaptive.enabled\", False) # <- option-1: turn off completely for full control\n", 
+ "spark.conf.set(\"spark.sql.adaptive.coalescePartitions.enabled\", False) # <- option-2: just tweak partition management\n", + "spark.conf.set(\"spark.sql.shuffle.partitions\", 10_000) # <-- default is 200\n", + "\n", + "# -- import databricks + spark functions\n", + "from pyspark.databricks.sql import functions as dbf\n", + "from pyspark.sql import functions as F\n", + "from pyspark.sql.functions import udf, col\n", + "from pyspark.sql.types import *\n", + "\n", + "# -- setup mosaic\n", + "import mosaic as mos\n", + "\n", + "mos.enable_mosaic(spark, dbutils)\n", + "# mos.enable_gdal(spark) # <- not needed for this example\n", + "\n", + "# --other imports\n", + "import geopandas as gpd\n", + "import json\n", + "import matplotlib.pyplot as plt\n", + "import shapely\n", + "import warnings\n", + "\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c9620eaf-3776-4ea4-b5b9-f7da9f164a8f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Data\n", + "\n", + "> Generating a dataset with some bad data, adapted from [here](https://github.com/kleunen/boost_geometry_correct).\n", + "\n", + "These are the types of issues that can come up with geometries [[1](https://stackoverflow.com/questions/49902090/dataset-of-invalid-geometries-in-boostgeometry)]...\n", + "\n", + "```\n", + "//Hole Outside Shell\n", + "check(\"POLYGON((0 0, 10 0, 10 10, 0 10, 0 0), (15 15, 15 20, 20 20, 20 15, 15 15))\");\n", + "//Nested Holes\n", + "check(\"POLYGON((0 0, 10 0, 10 10, 0 10, 0 0), (2 2, 2 8, 8 8, 8 2, 2 2), (3 3, 3 7, 7 7, 7 3, 3 3))\");\n", + "//Disconnected Interior\n", + "check(\"POLYGON((0 0, 10 0, 10 10, 0 10, 0 0), (5 0, 10 5, 5 10, 0 5, 5 0))\");\n", + "//Self Intersection\n", + "check(\"POLYGON((0 0, 10 10, 0 10, 10 0, 0 0))\");\n", + "//Ring Self Intersection\n", + "check(\"POLYGON((5 0, 10 0, 10 10, 0 10, 0 0, 5 0, 3 3, 5 
6, 7 3, 5 0))\");\n", + "//Nested Shells\n", + "check(\"MULTIPOLYGON(((0 0, 10 0, 10 10, 0 10, 0 0)),(( 2 2, 8 2, 8 8, 2 8, 2 2)))\");\n", + "//Duplicated Rings\n", + "check(\"MULTIPOLYGON(((0 0, 10 0, 10 10, 0 10, 0 0)),((0 0, 10 0, 10 10, 0 10, 0 0)))\");\n", + "//Too Few Points\n", + "check(\"POLYGON((2 2, 8 2))\");\n", + "//Invalid Coordinate\n", + "check(\"POLYGON((NaN 3, 3 4, 4 4, 4 3, 3 3))\");\n", + "//Ring Not Closed\n", + "check(\"POLYGON((0 0, 0 10, 10 10, 10 0))\");\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0f9928a2-3172-4240-a4b2-e08a5b15467c", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "test_wkts = []" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "253969cc-ecd7-4e59-b5c5-3ab52d52240a", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__[1a] Polygon self-intersection__\n", + "\n", + "> Exterior xy plot with shapely (to see the lines)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d25d6b19-d6da-49bc-a1cc-b5bb43306984", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "test_wkts.append((1, \"\"\"POLYGON ((5 0, 2.5 9, 9.5 3.5, 0.5 3.5, 7.5 9, 5 0))\"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4416db98-d3e2-4548-a761-40745f49c98b", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[107]: []" + ] + }, + { + "output_type": "display_data", + "data": { + "image/png": "\n" + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "\n", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "image" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(*shapely.wkt.loads(test_wkts[0][1]).exterior.xy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9ff48a09-e1e8-4a2a-b125-9e88b9aaabb0", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__[1b] Polygon with hole inside__\n", + "\n", + "> Exterior xy plot with shapely (to see the lines)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "43ba0913-ea2f-48e0-b381-35f6fbe453fb", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "test_wkts.append((2, \"\"\"POLYGON ((55 10, 141 237, 249 23, 21 171, 252 169, 24 89, 266 73, 55 10))\"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3a0264fa-c032-4740-a00f-82592de824bc", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[108]: []" + ] + }, + { + "output_type": "display_data", + "data": { + "image/png": "\n" + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "\n", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "image" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(*shapely.wkt.loads(test_wkts[1][1]).exterior.xy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "79eecc6e-ddc8-4223-97d6-5474b5e34d37", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__[1c] Polygon with multiple intersections at same point__\n", + "\n", + "> Exterior xy plot with shapely (to see the lines)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "08a50a1d-f9c1-420c-b79b-7983d01bf5c2", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "test_wkts.append((3, \"\"\"POLYGON ((0 0, 10 0, 0 10, 10 10, 0 0, 5 0, 5 10, 0 10, 0 5, 10 5, 10 0, 0 0))\"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7eeb16d7-5f2b-44d1-b6b5-36e45d65e9e8", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[109]: []" + ] + }, + { + "output_type": "display_data", + "data": { + "image/png": "\n" + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "\n", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "image" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(*shapely.wkt.loads(test_wkts[2][1]).exterior.xy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ca173a4d-639d-4659-937e-aa3bdb5a8c55", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__[1d] Valid Polygon__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "dc2f019a-d4c1-4a68-94e7-be14e9288ac2", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "test_wkts.append((4, \"\"\"POLYGON (( -84.3641541604937 33.71316821215546, -84.36414611386687 
33.71303657522174, -84.36409515189553 33.71303657522174, -84.36410319852232 33.71317267442025, -84.3641541604937 33.71316821215546 ))\"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2a7735bb-ff89-4357-8b89-f991776e045a", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[115]: []" + ] + }, + { + "output_type": "display_data", + "data": { + "image/png": "\n" + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "\n", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "image" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(*shapely.wkt.loads(test_wkts[3][1]).exterior.xy)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c67147a4-505d-4be1-a36d-68d27643e925", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__[2] Make Spark DataFrame from `test_wkts`__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e65a616e-9abe-4c52-b8f2-06b789188bc8", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 4\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "

row_idgeom_wkt
1POLYGON ((5 0, 2.5 9, 9.5 3.5, 0.5 3.5, 7.5 9, 5 0))
2POLYGON ((55 10, 141 237, 249 23, 21 171, 252 169, 24 89, 266 73, 55 10))
3POLYGON ((0 0, 10 0, 0 10, 10 10, 0 0, 5 0, 5 10, 0 10, 0 5, 10 5, 10 0, 0 0))
4POLYGON (( -84.3641541604937 33.71316821215546, -84.36414611386687 33.71303657522174, -84.36409515189553 33.71303657522174, -84.36410319852232 33.71317267442025, -84.3641541604937 33.71316821215546 ))
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 1, + "POLYGON ((5 0, 2.5 9, 9.5 3.5, 0.5 3.5, 7.5 9, 5 0))" + ], + [ + 2, + "POLYGON ((55 10, 141 237, 249 23, 21 171, 252 169, 24 89, 266 73, 55 10))" + ], + [ + 3, + "POLYGON ((0 0, 10 0, 0 10, 10 10, 0 0, 5 0, 5 10, 0 10, 0 5, 10 5, 10 0, 0 0))" + ], + [ + 4, + "POLYGON (( -84.3641541604937 33.71316821215546, -84.36414611386687 33.71303657522174, -84.36409515189553 33.71303657522174, -84.36410319852232 33.71317267442025, -84.3641541604937 33.71316821215546 ))" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "row_id", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "geom_wkt", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "df = (\n", + " spark\n", + " .createDataFrame(test_wkts, schema=['row_id', 'geom_wkt'])\n", + ")\n", + "print(f\"count? 
{df.count():,}\")\n", + "df.display()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "ea4d2ea0-1224-41f6-ae5b-72f2c34b9a5b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Regular UDF: Test + Fix Validity\n", + "\n", + "> Will use Mosaic to initially test; then only apply UDF to invalids" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "bfdaa108-3c66-404e-ac2a-a6de18cbd5d4", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### UDFs" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "f99ec990-0629-446c-90ff-447d6496f260", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "@udf(returnType=StringType())\n", + "def explain_wkt_validity(geom_wkt:str) -> str:\n", + " \"\"\"\n", + " Add explanation of validity or invalidity\n", + " \"\"\"\n", + " from shapely import wkt\n", + " from shapely.validation import explain_validity\n", + "\n", + " _geom = wkt.loads(geom_wkt)\n", + " return explain_validity(_geom)\n", + "\n", + "\n", + "@udf(returnType=StringType())\n", + "def make_wkt_valid(geom_wkt:str) -> str:\n", + " \"\"\"\n", + " - test for wkt being valid\n", + " - attempts to make valid\n", + " - may have to change type, e.g. 
POLYGON to MULTIPOLYGON\n", + " returns valid wkt\n", + " \"\"\"\n", + " from shapely import wkt \n", + " from shapely.validation import make_valid\n", + "\n", + " _geom = wkt.loads(geom_wkt)\n", + " if _geom.is_valid:\n", + " return geom_wkt\n", + " _geom_fix = make_valid(_geom)\n", + " return _geom_fix.wkt" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "aad1c6ee-183f-4e74-92bf-07e94ff2c81e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Test Validity" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f38c18f1-3248-4c48-ae3f-6872618d168b", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
row_idgeom_wktis_valid
1POLYGON ((5 0, 2.5 9, 9.5 3.5, 0.5 3.5, 7.5 9, 5 0))false
2POLYGON ((55 10, 141 237, 249 23, 21 171, 252 169, 24 89, 266 73, 55 10))false
3POLYGON ((0 0, 10 0, 0 10, 10 10, 0 0, 5 0, 5 10, 0 10, 0 5, 10 5, 10 0, 0 0))false
4POLYGON (( -84.3641541604937 33.71316821215546, -84.36414611386687 33.71303657522174, -84.36409515189553 33.71303657522174, -84.36410319852232 33.71317267442025, -84.3641541604937 33.71316821215546 ))true
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 1, + "POLYGON ((5 0, 2.5 9, 9.5 3.5, 0.5 3.5, 7.5 9, 5 0))", + false + ], + [ + 2, + "POLYGON ((55 10, 141 237, 249 23, 21 171, 252 169, 24 89, 266 73, 55 10))", + false + ], + [ + 3, + "POLYGON ((0 0, 10 0, 0 10, 10 10, 0 0, 5 0, 5 10, 0 10, 0 5, 10 5, 10 0, 0 0))", + false + ], + [ + 4, + "POLYGON (( -84.3641541604937 33.71316821215546, -84.36414611386687 33.71303657522174, -84.36409515189553 33.71303657522174, -84.36410319852232 33.71317267442025, -84.3641541604937 33.71316821215546 ))", + true + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "row_id", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "geom_wkt", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "is_valid", + "type": "\"boolean\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "df_test_valid = (\n", + " df\n", + " .withColumn(\"is_valid\", mos.st_isvalid(\"geom_wkt\"))\n", + ")\n", + "\n", + "df_test_valid.display()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a87a8fd9-1fd1-44be-9704-7995bf7f2bea", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Let's get an explanation for our 3 invalids__" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + 
"inputWidgets": {}, + "nuid": "4f783941-c167-4135-a1a2-8ebe969c66f9", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Recommend `explain_wkt_validity` only to help you understand, not as part of a production pipeline, so we run it separately._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "47ed3341-8321-47e1-a9ec-ef67ff4fd007", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
row_idgeom_wktis_validinvalid_explain
1POLYGON ((5 0, 2.5 9, 9.5 3.5, 0.5 3.5, 7.5 9, 5 0))falseSelf-intersection[4.02777777777778 3.5]
2POLYGON ((55 10, 141 237, 249 23, 21 171, 252 169, 24 89, 266 73, 55 10))falseSelf-intersection[201.596683628707 53.7705737848745]
3POLYGON ((0 0, 10 0, 0 10, 10 10, 0 0, 5 0, 5 10, 0 10, 0 5, 10 5, 10 0, 0 0))falseRing Self-intersection[5 10]
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 1, + "POLYGON ((5 0, 2.5 9, 9.5 3.5, 0.5 3.5, 7.5 9, 5 0))", + false, + "Self-intersection[4.02777777777778 3.5]" + ], + [ + 2, + "POLYGON ((55 10, 141 237, 249 23, 21 171, 252 169, 24 89, 266 73, 55 10))", + false, + "Self-intersection[201.596683628707 53.7705737848745]" + ], + [ + 3, + "POLYGON ((0 0, 10 0, 0 10, 10 10, 0 0, 5 0, 5 10, 0 10, 0 5, 10 5, 10 0, 0 0))", + false, + "Ring Self-intersection[5 10]" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "row_id", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "geom_wkt", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "is_valid", + "type": "\"boolean\"" + }, + { + "metadata": "{}", + "name": "invalid_explain", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "display(\n", + " df_test_valid\n", + " .select(\n", + " \"*\",\n", + " F\n", + " .when(col(\"is_valid\") == False, explain_wkt_validity(\"geom_wkt\"))\n", + " .otherwise(F.lit(None))\n", + " .alias(\"invalid_explain\")\n", + " )\n", + " .filter(\"is_valid = false\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d7b716ae-f63c-47f2-91c7-7ae9f08d8114", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Fix Validity" + ] + }, + { + "cell_type": 
"code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e25db3fb-e1ea-48d8-ba36-65082da5e933", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "count? 4\nnum orig invalid? 3\nnum final invalid? 0\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
row_idis_orig_validgeom_wktis_valid
1falseMULTIPOLYGON (((0.5 3.5, 3.3957654723127035 5.7752442996742674, 4.027777777777778 3.5, 0.5 3.5)), ((2.5 9, 5 7.035714285714286, 3.3957654723127035 5.7752442996742674, 2.5 9)), ((7.5 9, 6.6042345276872965 5.7752442996742674, 5 7.035714285714286, 7.5 9)), ((9.5 3.5, 5.972222222222222 3.5, 6.6042345276872965 5.7752442996742674, 9.5 3.5)), ((5 0, 4.027777777777778 3.5, 5.972222222222222 3.5, 5 0)))true
2falseMULTIPOLYGON (((21 171, 115.68501587180901 170.1802163127982, 97.24514608274922 121.50753675330314, 21 171)), ((141 237, 174.98122638059246 169.66682920882604, 115.68501587180901 170.1802163127982, 141 237)), ((252 169, 186.85374007521938 146.1416631842875, 174.98122638059246 169.66682920882604, 252 169)), ((222.3085097882541 75.88869356771873, 161.30987316587914 79.92166127828898, 104.05263157894737 117.08864265927977, 186.85374007521938 146.1416631842875, 222.3085097882541 75.88869356771873)), ((266 73, 229.29693213749567 62.04126409792525, 222.3085097882541 75.88869356771873, 266 73)), ((249 23, 201.59668362870678 53.77057378487454, 229.29693213749567 62.04126409792525, 249 23)), ((55 10, 83.44063221452673 85.07004084532055, 161.30987316587914 79.92166127828898, 201.59668362870678 53.77057378487454, 55 10)), ((24 89, 94.27070148854622 113.6563864872092, 83.44063221452673 85.07004084532055, 24 89)), ((97.24514608274922 121.50753675330314, 104.05263157894737 117.08864265927977, 94.27070148854622 113.6563864872092, 97.24514608274922 121.50753675330314)))true
3falseGEOMETRYCOLLECTION (MULTIPOLYGON (((10 5, 10 0, 5 0, 0 0, 5 5, 10 5)), ((5 5, 0 5, 0 10, 5 10, 10 10, 5 5))), MULTILINESTRING ((10 0, 5 5), (5 5, 0 10), (5 0, 5 5), (5 5, 5 10)))true
4truePOLYGON (( -84.3641541604937 33.71316821215546, -84.36414611386687 33.71303657522174, -84.36409515189553 33.71303657522174, -84.36410319852232 33.71317267442025, -84.3641541604937 33.71316821215546 ))true
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 1, + false, + "MULTIPOLYGON (((0.5 3.5, 3.3957654723127035 5.7752442996742674, 4.027777777777778 3.5, 0.5 3.5)), ((2.5 9, 5 7.035714285714286, 3.3957654723127035 5.7752442996742674, 2.5 9)), ((7.5 9, 6.6042345276872965 5.7752442996742674, 5 7.035714285714286, 7.5 9)), ((9.5 3.5, 5.972222222222222 3.5, 6.6042345276872965 5.7752442996742674, 9.5 3.5)), ((5 0, 4.027777777777778 3.5, 5.972222222222222 3.5, 5 0)))", + true + ], + [ + 2, + false, + "MULTIPOLYGON (((21 171, 115.68501587180901 170.1802163127982, 97.24514608274922 121.50753675330314, 21 171)), ((141 237, 174.98122638059246 169.66682920882604, 115.68501587180901 170.1802163127982, 141 237)), ((252 169, 186.85374007521938 146.1416631842875, 174.98122638059246 169.66682920882604, 252 169)), ((222.3085097882541 75.88869356771873, 161.30987316587914 79.92166127828898, 104.05263157894737 117.08864265927977, 186.85374007521938 146.1416631842875, 222.3085097882541 75.88869356771873)), ((266 73, 229.29693213749567 62.04126409792525, 222.3085097882541 75.88869356771873, 266 73)), ((249 23, 201.59668362870678 53.77057378487454, 229.29693213749567 62.04126409792525, 249 23)), ((55 10, 83.44063221452673 85.07004084532055, 161.30987316587914 79.92166127828898, 201.59668362870678 53.77057378487454, 55 10)), ((24 89, 94.27070148854622 113.6563864872092, 83.44063221452673 85.07004084532055, 24 89)), ((97.24514608274922 121.50753675330314, 104.05263157894737 117.08864265927977, 94.27070148854622 113.6563864872092, 97.24514608274922 121.50753675330314)))", + true + ], + [ + 3, + false, + "GEOMETRYCOLLECTION (MULTIPOLYGON (((10 5, 10 0, 5 0, 0 0, 5 5, 10 5)), ((5 5, 0 5, 0 10, 5 10, 10 10, 5 5))), MULTILINESTRING ((10 0, 5 5), (5 5, 0 10), 
(5 0, 5 5), (5 5, 5 10)))", + true + ], + [ + 4, + true, + "POLYGON (( -84.3641541604937 33.71316821215546, -84.36414611386687 33.71303657522174, -84.36409515189553 33.71303657522174, -84.36410319852232 33.71317267442025, -84.3641541604937 33.71316821215546 ))", + true + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "row_id", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "is_orig_valid", + "type": "\"boolean\"" + }, + { + "metadata": "{}", + "name": "geom_wkt", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "is_valid", + "type": "\"boolean\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "df_valid = (\n", + " df\n", + " .withColumnRenamed(\"geom_wkt\", \"orig_geom_wkt\")\n", + " .withColumn(\"is_orig_valid\", mos.st_isvalid(\"orig_geom_wkt\"))\n", + " .select(\n", + " \"*\",\n", + " F\n", + " .when(col(\"is_orig_valid\") == False, make_wkt_valid(\"orig_geom_wkt\"))\n", + " .otherwise(col(\"orig_geom_wkt\"))\n", + " .alias(\"geom_wkt\")\n", + " )\n", + " .withColumn(\"is_valid\", mos.st_isvalid(\"geom_wkt\"))\n", + " .drop(\"orig_geom_wkt\")\n", + ")\n", + "\n", + "print(f\"\"\"count? {df_valid.count():,}\"\"\")\n", + "print(f\"\"\"num orig invalid? {df_valid.filter(col(\"is_orig_valid\") == False).count():,}\"\"\")\n", + "print(f\"\"\"num final invalid? 
{df_valid.filter(col(\"is_valid\") == False).count():,}\"\"\")\n", + "display(df_valid)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "705f56e7-ef65-4b66-8eca-46e6cc4e754c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[118]: ['{\"row_id\":1,\"is_orig_valid\":false,\"geom_wkt\":\"MULTIPOLYGON (((0.5 3.5, 3.3957654723127035 5.7752442996742674, 4.027777777777778 3.5, 0.5 3.5)), ((2.5 9, 5 7.035714285714286, 3.3957654723127035 5.7752442996742674, 2.5 9)), ((7.5 9, 6.6042345276872965 5.7752442996742674, 5 7.035714285714286, 7.5 9)), ((9.5 3.5, 5.972222222222222 3.5, 6.6042345276872965 5.7752442996742674, 9.5 3.5)), ((5 0, 4.027777777777778 3.5, 5.972222222222222 3.5, 5 0)))\",\"is_valid\":true}',\n '{\"row_id\":2,\"is_orig_valid\":false,\"geom_wkt\":\"MULTIPOLYGON (((21 171, 115.68501587180901 170.1802163127982, 97.24514608274922 121.50753675330314, 21 171)), ((141 237, 174.98122638059246 169.66682920882604, 115.68501587180901 170.1802163127982, 141 237)), ((252 169, 186.85374007521938 146.1416631842875, 174.98122638059246 169.66682920882604, 252 169)), ((222.3085097882541 75.88869356771873, 161.30987316587914 79.92166127828898, 104.05263157894737 117.08864265927977, 186.85374007521938 146.1416631842875, 222.3085097882541 75.88869356771873)), ((266 73, 229.29693213749567 62.04126409792525, 222.3085097882541 75.88869356771873, 266 73)), ((249 23, 201.59668362870678 53.77057378487454, 229.29693213749567 62.04126409792525, 249 23)), ((55 10, 83.44063221452673 85.07004084532055, 161.30987316587914 79.92166127828898, 201.59668362870678 53.77057378487454, 55 10)), ((24 89, 94.27070148854622 113.6563864872092, 83.44063221452673 85.07004084532055, 24 89)), ((97.24514608274922 121.50753675330314, 
104.05263157894737 117.08864265927977, 94.27070148854622 113.6563864872092, 97.24514608274922 121.50753675330314)))\",\"is_valid\":true}',\n '{\"row_id\":3,\"is_orig_valid\":false,\"geom_wkt\":\"GEOMETRYCOLLECTION (MULTIPOLYGON (((10 5, 10 0, 5 0, 0 0, 5 5, 10 5)), ((5 5, 0 5, 0 10, 5 10, 10 10, 5 5))), MULTILINESTRING ((10 0, 5 5), (5 5, 0 10), (5 0, 5 5), (5 5, 5 10)))\",\"is_valid\":true}',\n '{\"row_id\":4,\"is_orig_valid\":true,\"geom_wkt\":\"POLYGON (( -84.3641541604937 33.71316821215546, -84.36414611386687 33.71303657522174, -84.36409515189553 33.71303657522174, -84.36410319852232 33.71317267442025, -84.3641541604937 33.71316821215546 ))\",\"is_valid\":true}']" + ] + } + ], + "source": [ + "fix_wkts = df_valid.orderBy('row_id').toJSON().collect()\n", + "fix_wkts" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c8bada46-bbc5-4ffe-b4ef-a876ac3022df", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Row 1: Fixed [Self-Intersection]__ \n", + "\n", + "> Using GeoPandas to plot area for fixed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ac875ef1-124b-43e7-82a6-82a46dd5263e", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[125]: " + ] + }, + { + "output_type": "display_data", + "data": { + "image/png": "\n" + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "\n", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "image" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "gpd.GeoSeries(shapely.wkt.loads(json.loads(fix_wkts[0])['geom_wkt'])).plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "76452d6e-53ad-4075-8552-c66b877c8801", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Row 2: Fixed [Self-Intersection]__\n", + "\n", + "> Using GeoPandas to plot area for fixed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4fd70ddc-cef4-405f-8b85-b732691e53ab", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[126]: " + ] + }, + { + "output_type": "display_data", + "data": { + "image/png": "\n" + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "\n", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "image" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "gpd.GeoSeries(shapely.wkt.loads(json.loads(fix_wkts[1])['geom_wkt'])).plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3fadc613-c5b2-464e-a718-c1e76ec1a208", + "showTitle": false, + "title": "" + } + }, + "source": [ + "__Row 3: Fixed [Ring Self-Intersection]__\n", + "\n", + "> Using GeoPandas to plot area for fixed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f9e3d0eb-718a-41cc-bae2-fccf1e0d6cd5", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Out[127]: " + ] + }, + { + "output_type": "display_data", + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPsAAAD4CAYAAAAq5pAIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAANqElEQVR4nO3dXYwd5X3H8e8Pr12wAQfLWwo27qIGkVDUytFRQ4IUtZhKtEljpFYVUYlIRLQXbQmJIkVOb7iNqihNLqpIK4eENpbTylAFRVEaShJFkSK3axsV7E0EIsTYGLwU8RYSzMr/XuxxWK/37Zx55swz8/w+Etpzzg7P/IX57sx52bEiAjPrvouaHsDMRsOxmxXCsZsVwrGbFcKxmxVibJQ727p1a0xMTIxyl2ZFOXTo0IsRMb7U90Ya+8TEBNPT06PcpVlRJP1iue/5NN6sEI7drBCO3awQjt2sEI7drBCrxi7pfkmnJT2x4LEtkh6R9GT/6xX1jmlmVa3lyP514LZFj+0BHo2I64BH+/fNLGOrxh4RPwJeWvTwbuCB/u0HgNtTDRQR/N/rb6Zazsz6hv1QzZURcap/+3ngyuU2lDQJTALs2LFj1YX3PPg4/3HkJO/6ncu4eP26IcezFGZOvQrAu6+6vOFJyrZ543r+8S//gCs2bai0TuVP0EVESFr2ChgRMQVMAfR6vVWvlHHD1Zfxb9Nn+d+Tr1QdzRL572cWn9jZqGy+ZD37PvHeyqHD8K/GvyDpKoD+19OVJ+m7ZsvGVEuZtdq50G/ctjnJesPG/jBwV//2XcC3kkxjZkD60GFtb73tB34CXC/phKS7gc8DfyrpSeDW/n0zS6CO0GENz9kj4iPLfGtX0knMrLbQwZ+gM8tGnaGDYzfLQt2hg2M3a9woQgfHbtaoUYUOjt2sMaMMHRy7WSNGHTo4drORayJ0cOxmI9VU6ODYzUamydDBsZuNRNOhg2M3q10OoYNjN6tVLqGDYzerTU6hg2M3q0VuoYNjN0sux9DBsZsllWvo4NjNksk5dHDsZknkHjo4drPK2hA6OHazStoSOjh2s6G1KXRw7GZDaVvo4NjNBtbG0MGxmw2kraGDYzdbszaHDo7dbE3aHjo4drNVdSF0cOxmK+pK6ODYzZbVpdDBsZstqWuhg2M3u0AXQ4eKsUv6tKSjkp6QtF/SxakGM2tCV0OHCrFL2gZ8EuhFxI3AOuCOVIOZjVqXQ4fqp/FjwCWSxoCNwHPVRzIbva6HDhVij4iTwBeA48Ap4JWI+N7i7SRNSpqWND07Ozv8pGY1KSF0qHYafwWwG7gWuBrYJOnOxdtFxFRE9CKiNz4+PvykZjUoJXSodhp/K/DziJiNiLeAh4D3pxnLrH4lhQ7VYj8O3CRpoyQBu4CZNGOZ1au00KHac/aDwAHgMPB4f62pRHOZ1abE0GH+1fShRcR9wH2JZjGrXamhgz9BZwUpOXRw7FaI0kMHx24FcOjzHLt1mkN/m2O3zn
Lo53Ps1kkO/UKO3TrHoS/NsVunOPTlOXbrDIe+MsduneDQV+fYrfUc+to4dms1h752jt1ay6EPxrFbKzn0wTl2ax2HPhzHbq3i0Ifn2K01HHo1jt1awaFX59gtew49DcduWXPo6Th2y5ZDT8uxW5YcenqO3bLj0Ovh2C0rDr0+jt2y4dDr5dgtCw69fo7dGufQR8OxW6Mc+ug4dmuMQx8tx26NcOijVyl2Se+QdEDSTyXNSHpfqsGsuxx6Myr9/ezAl4HvRsRfSdoAbEwwk3WYQ2/O0LFL2gx8APgYQEScAc6kGcu6yKE3q8pp/LXALPA1SUck7ZW0afFGkiYlTUuanp2drbA7azOH3rwqsY8B7wG+EhE7gV8CexZvFBFTEdGLiN74+HiF3VlbOfQ8VIn9BHAiIg727x9gPn6z33Do+Rg69oh4HnhW0vX9h3YBx5JMZZ3g0PNS9dX4e4B9/VfinwY+Xn0k6wKHnp9KsUfEY0AvzSjWFQ49T/4EnSXl0PPl2C0Zh543x25JOPT8OXarzKG3g2O3Shx6ezh2G5pDbxfHbkNx6O3j2G1gDr2dHLsNxKG3l2O3NXPo7ebYbU0cevs5dluVQ+8Gx24rWneRHHpHOHZb0e+Nb3LoHaGIGNnOer1eTE9Pr7jNmbmzvP7m3IgmsuUcPv4Sk/9yiLMBl188xrqL1PRIRbvvL36f23duW3U7SYciYslfO6968YrkNoxdxJaxDU2PUbSX3zjDPz3yJGf7x4FXf+0fvk07M3e28ho+jbfzvPzGGf5m70GOPvdq06NYYo7dfsOhd5tjN8Chl8Cxm0MvhGMvnEMvh2MvmEMvi2MvlEMvj2MvkEMvk2MvjEMvl2MviEMvm2MvhEM3x14Ah27g2DvPods5jr3DHLot5Ng7yqHbYpVjl7RO0hFJ304xkFXn0G0pKY7s9wIzCdaxBBy6LadS7JK2Ax8E9qYZx6pw6LaSqkf2LwGfBZa9Zo6kSUnTkqZnZ2cr7s6W49BtNUPHLulDwOmIOLTSdhExFRG9iOiNj48PuztbgUO3tahyZL8Z+LCkZ4BvArdI+kaSqWzNHLqt1dCxR8TnImJ7REwAdwDfj4g7k01mq3LoNgi/z95SDt0GleS68RHxQ+CHKday1Tl0G4aP7C3j0G1Yjr1FHLpV4dhbwqFbVY69BRy6peDYM+fQLRXHnjGHbik59kw5dEvNsWfIoVsdHHtmHLrVxbFnxKFbnRx7Jhy61c2xZ8Ch2yg49oY5dBsVx94gh26j5Ngb4tBt1Bx7Axy6NcGxj5hDt6Y49hFy6NYkxz4iDt2a5thHwKFbDhx7zRy65cKx18ihW04ce00cuuXGsdfAoVuOHHtiDt1y5dgTcuiWM8eeiEO33Dn2BBy6tYFjr8ihW1s49gocurXJ0LFLukbSDyQdk3RU0r0pB8udQ7e2qfL3s88Bn4mIw5IuAw5JeiQijiWaLVsO3dpo6CN7RJyKiMP9268BM8C2VIPlyqFbWyV5zi5pAtgJHFzie5OSpiVNz87OpthdYxy6tVnl2CVdCjwIfCoiLqggIqYiohcRvfHx8aq7a4xDt7arFLuk9cyHvi8iHkozUn4cunVBlVfjBXwVmImIL6YbKS8O3bqiypH9ZuCjwC2SHuv/8+eJ5sqCQ7cuGfqtt4j4MaCEs2TFoVvX+BN0S3Do1kWOfRGHbl3l2Bdw6NZljr3PoVvXOXYcupWh+NgdupWi6NgdupWk2NgdupWmyNgdupWouNgdupWqqNgdupWsmNgdupWuiNgdulkBsTt0s3mdjt2hm72ts7E7dLPzdTJ2h252oc7F7tDNltap2B262fI6E7tDN1tZJ2J36Gara33sDt1sbVodu0M3W7vWxu7QzQbTytgdutngWhe7QzcbTqtid+hmw2tN7A7drJpWxO7QzarLPnaHbpZG1rE7dLN0KsUu6TZJP5P0lKQ9qYYCh26W2tCxS1oH/DPwZ8ANwEck3ZBiKIdudr4XX3
+z8hpjFf7dPwKeioinASR9E9gNHKsy0Btn5rhn/xFOvfJrtmzaUGUpq+jVX70FwOWXrG94knLNnT3La7+a479mXuBv/+SdldaqEvs24NkF908A7128kaRJYBJgx44dqy66ccMY/3r3BcuYFeup069z1eaLK69T+wt0ETEVEb2I6I2Pj9e9O7POeedvX8qm36pyXJ5XJfaTwDUL7m/vP2ZmGaoS+/8A10m6VtIG4A7g4TRjmVlqQ58bRMScpL8H/hNYB9wfEUeTTWZmSVV6IhAR3wG+k2gWM6tR1p+gM7N0HLtZIRy7WSEcu1khFBGj25k0C/xiDZtuBV6seZxh5Twb5D1fzrNB3vOtdbbfjYglP7020tjXStJ0RPSanmMpOc8Gec+X82yQ93wpZvNpvFkhHLtZIXKNfarpAVaQ82yQ93w5zwZ5z1d5tiyfs5tZerke2c0sMcduVoisYq/zApZVSbpG0g8kHZN0VNK9Tc+0mKR1ko5I+nbTsywm6R2SDkj6qaQZSe9reqZzJH26/2f6hKT9kqpfFqbaPPdLOi3piQWPbZH0iKQn+1+vGHTdbGKv8wKWicwBn4mIG4CbgL/LbD6Ae4GZpodYxpeB70bEu4A/JJM5JW0DPgn0IuJG5n9d+45mp+LrwG2LHtsDPBoR1wGP9u8PJJvYWXABy4g4A5y7gGUWIuJURBzu336N+f9ZtzU71dskbQc+COxtepbFJG0GPgB8FSAizkTEy40Odb4x4BJJY8BG4Lkmh4mIHwEvLXp4N/BA//YDwO2DrptT7EtdwDKbmBaSNAHsBA42PMpCXwI+C5xteI6lXAvMAl/rP83YK2lT00MBRMRJ4AvAceAU8EpEfK/ZqZZ0ZUSc6t9+Hrhy0AVyir0VJF0KPAh8KiKyuLC9pA8BpyPiUNOzLGMMeA/wlYjYCfySIU5D69B/7rub+R9IVwObJN3Z7FQri/n3ywd+zzyn2LO/gKWk9cyHvi8iHmp6ngVuBj4s6Rnmn/7cIukbzY50nhPAiYg4dyZ0gPn4c3Ar8POImI2It4CHgPc3PNNSXpB0FUD/6+lBF8gp9qwvYClJzD/nnImILzY9z0IR8bmI2B4RE8z/d/t+RGRzdIqI54FnJV3ff2gXFf8ykYSOAzdJ2tj/M95FJi8eLvIwcFf/9l3AtwZdoPrFqBNpwQUsbwY+Cjwu6bH+Y//Qvw6fre4eYF//B/nTwMcbngeAiDgo6QBwmPl3XI7Q8MdmJe0H/hjYKukEcB/weeDfJd3N/K+J//XA6/rjsmZlyOk03sxq5NjNCuHYzQrh2M0K4djNCuHYzQrh2M0K8f/GDxQNkdOWvgAAAABJRU5ErkJggg==\n" + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "\n", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "image" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "gpd.GeoSeries(shapely.wkt.loads(json.loads(fix_wkts[2])['geom_wkt'])).plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "230bf52a-3b2e-4c65-8cf7-5098af27c943", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Option: Vectorized Pandas 
UDF\n", + "\n", + "> If you want to go further with performance, you can use a vectorized pandas UDF\n", + "\n", + "__Note: We are using the Pandas Series [Vectorized UDF](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.pandas_udf.html) variant.__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "77c2b320-f5b1-4a27-90de-820e6e0886a7", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "from pyspark.sql.functions import pandas_udf\n", + "\n", + "@pandas_udf(StringType())\n", + "def vectorized_make_wkt_valid(s:pd.Series) -> pd.Series:\n", + " \"\"\"\n", + " - test for wkt being valid\n", + " - attempts to make valid\n", + " - may have to change type, e.g. POLYGON to MULTIPOLYGON\n", + " returns valid wkt\n", + " \"\"\"\n", + " from shapely import wkt \n", + " from shapely.validation import make_valid\n", + "\n", + " def to_valid(w:str) -> str:\n", + " _geom = wkt.loads(w)\n", + " if _geom.is_valid:\n", + " return w\n", + " _geom_fix = make_valid(_geom)\n", + " return _geom_fix.wkt\n", + "\n", + " return s.apply(to_valid) " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6c63edd3-f87e-4ede-bf74-c1d49c7177e2", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_This variation doesn't show all the interim testing, just the fixing._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8b871326-d013-49c0-863f-51e5c16ddd60", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": 
"stream", + "text": [ + "count? 4\nnum orig invalid? 3\nnum final invalid? 0\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
row_idis_orig_validgeom_wktis_valid
1falseMULTIPOLYGON (((0.5 3.5, 3.3957654723127035 5.7752442996742674, 4.027777777777778 3.5, 0.5 3.5)), ((2.5 9, 5 7.035714285714286, 3.3957654723127035 5.7752442996742674, 2.5 9)), ((7.5 9, 6.6042345276872965 5.7752442996742674, 5 7.035714285714286, 7.5 9)), ((9.5 3.5, 5.972222222222222 3.5, 6.6042345276872965 5.7752442996742674, 9.5 3.5)), ((5 0, 4.027777777777778 3.5, 5.972222222222222 3.5, 5 0)))true
2falseMULTIPOLYGON (((21 171, 115.68501587180901 170.1802163127982, 97.24514608274922 121.50753675330314, 21 171)), ((141 237, 174.98122638059246 169.66682920882604, 115.68501587180901 170.1802163127982, 141 237)), ((252 169, 186.85374007521938 146.1416631842875, 174.98122638059246 169.66682920882604, 252 169)), ((222.3085097882541 75.88869356771873, 161.30987316587914 79.92166127828898, 104.05263157894737 117.08864265927977, 186.85374007521938 146.1416631842875, 222.3085097882541 75.88869356771873)), ((266 73, 229.29693213749567 62.04126409792525, 222.3085097882541 75.88869356771873, 266 73)), ((249 23, 201.59668362870678 53.77057378487454, 229.29693213749567 62.04126409792525, 249 23)), ((55 10, 83.44063221452673 85.07004084532055, 161.30987316587914 79.92166127828898, 201.59668362870678 53.77057378487454, 55 10)), ((24 89, 94.27070148854622 113.6563864872092, 83.44063221452673 85.07004084532055, 24 89)), ((97.24514608274922 121.50753675330314, 104.05263157894737 117.08864265927977, 94.27070148854622 113.6563864872092, 97.24514608274922 121.50753675330314)))true
4truePOLYGON (( -84.3641541604937 33.71316821215546, -84.36414611386687 33.71303657522174, -84.36409515189553 33.71303657522174, -84.36410319852232 33.71317267442025, -84.3641541604937 33.71316821215546 ))true
3falseGEOMETRYCOLLECTION (MULTIPOLYGON (((10 5, 10 0, 5 0, 0 0, 5 5, 10 5)), ((5 5, 0 5, 0 10, 5 10, 10 10, 5 5))), MULTILINESTRING ((10 0, 5 5), (5 5, 0 10), (5 0, 5 5), (5 5, 5 10)))true
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 1, + false, + "MULTIPOLYGON (((0.5 3.5, 3.3957654723127035 5.7752442996742674, 4.027777777777778 3.5, 0.5 3.5)), ((2.5 9, 5 7.035714285714286, 3.3957654723127035 5.7752442996742674, 2.5 9)), ((7.5 9, 6.6042345276872965 5.7752442996742674, 5 7.035714285714286, 7.5 9)), ((9.5 3.5, 5.972222222222222 3.5, 6.6042345276872965 5.7752442996742674, 9.5 3.5)), ((5 0, 4.027777777777778 3.5, 5.972222222222222 3.5, 5 0)))", + true + ], + [ + 2, + false, + "MULTIPOLYGON (((21 171, 115.68501587180901 170.1802163127982, 97.24514608274922 121.50753675330314, 21 171)), ((141 237, 174.98122638059246 169.66682920882604, 115.68501587180901 170.1802163127982, 141 237)), ((252 169, 186.85374007521938 146.1416631842875, 174.98122638059246 169.66682920882604, 252 169)), ((222.3085097882541 75.88869356771873, 161.30987316587914 79.92166127828898, 104.05263157894737 117.08864265927977, 186.85374007521938 146.1416631842875, 222.3085097882541 75.88869356771873)), ((266 73, 229.29693213749567 62.04126409792525, 222.3085097882541 75.88869356771873, 266 73)), ((249 23, 201.59668362870678 53.77057378487454, 229.29693213749567 62.04126409792525, 249 23)), ((55 10, 83.44063221452673 85.07004084532055, 161.30987316587914 79.92166127828898, 201.59668362870678 53.77057378487454, 55 10)), ((24 89, 94.27070148854622 113.6563864872092, 83.44063221452673 85.07004084532055, 24 89)), ((97.24514608274922 121.50753675330314, 104.05263157894737 117.08864265927977, 94.27070148854622 113.6563864872092, 97.24514608274922 121.50753675330314)))", + true + ], + [ + 4, + true, + "POLYGON (( -84.3641541604937 33.71316821215546, -84.36414611386687 33.71303657522174, -84.36409515189553 33.71303657522174, -84.36410319852232 
33.71317267442025, -84.3641541604937 33.71316821215546 ))", + true + ], + [ + 3, + false, + "GEOMETRYCOLLECTION (MULTIPOLYGON (((10 5, 10 0, 5 0, 0 0, 5 5, 10 5)), ((5 5, 0 5, 0 10, 5 10, 10 10, 5 5))), MULTILINESTRING ((10 0, 5 5), (5 5, 0 10), (5 0, 5 5), (5 5, 5 10)))", + true + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "row_id", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "is_orig_valid", + "type": "\"boolean\"" + }, + { + "metadata": "{}", + "name": "geom_wkt", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "is_valid", + "type": "\"boolean\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "df_valid1 = (\n", + " df # <- initial dataframe\n", + " .withColumnRenamed(\"geom_wkt\", \"orig_geom_wkt\")\n", + " .withColumn(\"is_orig_valid\", mos.st_isvalid(\"orig_geom_wkt\"))\n", + " .repartition(sc.defaultParallelism * 8, \"orig_geom_wkt\") # <- useful at scale\n", + " .select(\n", + " \"*\",\n", + " F\n", + " .when(col(\"is_orig_valid\") == False, vectorized_make_wkt_valid(\"orig_geom_wkt\")) # <- Pandas UDF\n", + " .otherwise(col(\"orig_geom_wkt\"))\n", + " .alias(\"geom_wkt\")\n", + " )\n", + " .withColumn(\"is_valid\", mos.st_isvalid(\"geom_wkt\"))\n", + " .drop(\"orig_geom_wkt\")\n", + ")\n", + "\n", + "print(f\"\"\"count? {df_valid1.count():,}\"\"\")\n", + "print(f\"\"\"num orig invalid? {df_valid1.filter(col(\"is_orig_valid\") == False).count():,}\"\"\")\n", + "print(f\"\"\"num final invalid? 
{df_valid1.filter(col(\"is_valid\") == False).count():,}\"\"\")\n", + "display(df_valid1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4ebff4ab-4d1e-4339-9182-e52d427e4071", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> _To further optimize as an automated workflow, you would write to Delta Tables and avoid unnecessary calls to `count` / `display`._\n", + "\n", + "__Notes:__\n", + "\n", + "* At-scale, there are benefits to adding a call like `.repartition(sc.defaultParallelism * 8, \"orig_geom_wkt\")` when coupled with Spark confs to adjust AQE (see top of notebook), as this gives you more control of partitioning since there are compute-heavy (aka UDF) tasks that Spark cannot plan for as well as a \"pure\" data-heavy operation.\n", + "* The focus of this notebook was not on rendering on a map, so we just used matplotlib with both Shapely (for pre-fixed geoms) and GeoPandas (for fixed geoms)\n", + "* The use of the `.when()` conditional allows us to avoid UDF calls except where `is_orig_valid=False`, which saves on unnecessary compute time\n", + "* We avoided the shapely `explain_validity` call, except to initially understand the issues, as that call can be computationally expensive (and is only informational)\n", + "* This is just a subset of validation, but hopefully offers enough breadcrumbs for common issues you may face when processing invalid geometries" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 85549841996182, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "shapely_validate_udfs", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/examples/scala/MosaicAndSedona.ipynb b/notebooks/examples/scala/MosaicAndSedona.ipynb new
file mode 100644 index 000000000..a2f62aff4 --- /dev/null +++ b/notebooks/examples/scala/MosaicAndSedona.ipynb @@ -0,0 +1,1084 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "bf98136c-9276-4388-8eef-b567621fe1a4", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Mosaic & Sedona\n", + "\n", + "> You can combine the usage of [Mosaic](https://databrickslabs.github.io/mosaic/index.html) with other geospatial libraries. In this example we combine it with [Sedona](https://sedona.apache.org).\n", + "\n", + "## Setup\n", + "\n", + "This notebook will run if you have both Mosaic and Sedona installed on your cluster as described below.\n", + "\n", + "### Install Sedona\n", + "\n", + "To install Sedona, follow the [official Sedona instructions](https://sedona.apache.org/1.5.0/setup/databricks/).\n", + "\n", + "E.g. Add the following maven coordinates to a non-photon cluster [[1](https://docs.databricks.com/en/libraries/package-repositories.html)]. This is showing DBR 12.2 LTS. \n", + "\n", + "```\n", + "org.apache.sedona:sedona-spark-shaded-3.0_2.12:1.5.0\n", + "org.datasyslab:geotools-wrapper:1.5.0-28.2\n", + "```\n", + "\n", + "### Install Mosaic\n", + "\n", + "Download Mosaic JAR to your local machine (e.g. from [here](https://github.com/databrickslabs/mosaic/releases/download/v_0.3.12/mosaic-0.3.12-jar-with-dependencies.jar) for 0.3.12) and then UPLOAD to your cluster [[1](https://docs.databricks.com/en/libraries/cluster-libraries.html#install-a-library-on-a-cluster)]. \n", + "\n", + "### Notes\n", + "\n", + "* See instructions for `SedonaContext.create(spark)` [[1](https://sedona.apache.org/1.5.0/tutorial/sql/?h=sedonacontext#initiate-sedonacontext)]. 
\n", + "* And, Sedona identifies that it might have issues if executed on a [Photon](https://www.databricks.com/product/photon) cluster; again this example is showing DBR 12.2 LTS on the Mosaic 0.3 series.\n", + "\n", + "--- \n", + " __Last Update__ 01 DEC 2023 [Mosaic 0.3.12]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "46dcda8a-cd24-4016-acf9-6ede54978d2f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup\n", + "\n", + "> We are installing Mosaic without SQL functions registered (via Scala) and are installing Sedona SQL as normal." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c91dd7bf-319c-489c-9715-6c512f027d64", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
import org.apache.spark.sql.functions._\n", + "import com.databricks.labs.mosaic.functions.MosaicContext\n", + "import com.databricks.labs.mosaic.H3\n", + "import com.databricks.labs.mosaic.JTS\n", + "mosaicContext: com.databricks.labs.mosaic.functions.MosaicContext = com.databricks.labs.mosaic.functions.MosaicContext@64740153\n", + "import mosaicContext.functions._\n", + "import org.apache.sedona.spark.SedonaContext\n", + "sedona: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@15e8a79a\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
import org.apache.spark.sql.functions._\nimport com.databricks.labs.mosaic.functions.MosaicContext\nimport com.databricks.labs.mosaic.H3\nimport com.databricks.labs.mosaic.JTS\nmosaicContext: com.databricks.labs.mosaic.functions.MosaicContext = com.databricks.labs.mosaic.functions.MosaicContext@64740153\nimport mosaicContext.functions._\nimport org.apache.sedona.spark.SedonaContext\nsedona: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@15e8a79a\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "\n", + "// -- spark functions\n", + "import org.apache.spark.sql.functions._\n", + "\n", + "// -- mosaic functions\n", + "import com.databricks.labs.mosaic.functions.MosaicContext\n", + "import com.databricks.labs.mosaic.H3\n", + "import com.databricks.labs.mosaic.JTS\n", + "\n", + "val mosaicContext = MosaicContext.build(H3, JTS)\n", + "import mosaicContext.functions._\n", + "\n", + "// ! don't register SQL functions !\n", + "// - this allows sedona to be the main spatial SQL provider\n", + "//mosaicContext.register()\n", + "\n", + "// -- sedona functions\n", + "import org.apache.sedona.spark.SedonaContext\n", + "val sedona = SedonaContext.create(spark)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a446841d-9ce1-4b0c-97e8-b705ab06caee", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_When we list user functions, we see all the Sedona provided ones._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0394a8a2-dcfd-49c0-a2df-85ecd0272029", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
function
hive_metastore.default.st_geohash
st_3ddistance
st_addpoint
st_affine
st_angle
st_areaspheroid
st_asbinary
st_asewkb
st_asewkt
st_asgeojson
st_asgml
st_askml
st_azimuth
st_boundary
st_boundingdiagonal
st_buffer
st_buildarea
st_centroid
st_closestpoint
st_collect
st_collectionextract
st_concavehull
st_contains
st_convexhull
st_coorddim
st_coveredby
st_covers
st_crosses
st_degrees
st_difference
st_dimension
st_disjoint
st_distance
st_distancesphere
st_distancespheroid
st_dump
st_dumppoints
st_endpoint
st_envelope
st_envelope_aggr
st_equals
st_exteriorring
st_flipcoordinates
st_force3d
st_force_2d
st_frechetdistance
st_geohash
st_geometricmedian
st_geometryn
st_geomfromewkt
st_geomfromgeohash
st_geomfromgeojson
st_geomfromgml
st_geomfromkml
st_geomfromwkb
st_h3celldistance
st_h3cellids
st_h3kring
st_h3togeom
st_hausdorffdistance
st_interiorringn
st_intersection
st_intersection_aggr
st_intersects
st_isclosed
st_iscollection
st_isring
st_issimple
st_isvalid
st_lengthspheroid
st_linefrommultipoint
st_linefromtext
st_lineinterpolatepoint
st_linemerge
st_linestringfromtext
st_linesubstring
st_makeline
st_makepoint
st_makepolygon
st_makevalid
st_minimumboundingcircle
st_minimumboundingradius
st_mlinefromtext
st_mpolyfromtext
st_multi
st_normalize
st_nrings
st_numgeometries
st_numinteriorrings
st_numpoints
st_orderingequals
st_overlaps
st_pointfromtext
st_pointn
st_pointonsurface
st_pointz
st_polygon
st_polygonfromenvelope
st_polygonfromtext
st_reduceprecision
st_removepoint
st_reverse
st_s2cellids
st_setpoint
st_simplifypreservetopology
st_split
st_startpoint
st_subdivide
st_subdivideexplode
st_symdifference
st_touches
st_transform
st_translate
st_union
st_union_aggr
st_voronoipolygons
st_within
st_x
st_y
st_z
st_zmax
st_zmin
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "hive_metastore.default.st_geohash" + ], + [ + "st_3ddistance" + ], + [ + "st_addpoint" + ], + [ + "st_affine" + ], + [ + "st_angle" + ], + [ + "st_areaspheroid" + ], + [ + "st_asbinary" + ], + [ + "st_asewkb" + ], + [ + "st_asewkt" + ], + [ + "st_asgeojson" + ], + [ + "st_asgml" + ], + [ + "st_askml" + ], + [ + "st_azimuth" + ], + [ + "st_boundary" + ], + [ + "st_boundingdiagonal" + ], + [ + "st_buffer" + ], + [ + "st_buildarea" + ], + [ + "st_centroid" + ], + [ + "st_closestpoint" + ], + [ + "st_collect" + ], + [ + "st_collectionextract" + ], + [ + "st_concavehull" + ], + [ + "st_contains" + ], + [ + "st_convexhull" + ], + [ + "st_coorddim" + ], + [ + "st_coveredby" + ], + [ + "st_covers" + ], + [ + "st_crosses" + ], + [ + "st_degrees" + ], + [ + "st_difference" + ], + [ + "st_dimension" + ], + [ + "st_disjoint" + ], + [ + "st_distance" + ], + [ + "st_distancesphere" + ], + [ + "st_distancespheroid" + ], + [ + "st_dump" + ], + [ + "st_dumppoints" + ], + [ + "st_endpoint" + ], + [ + "st_envelope" + ], + [ + "st_envelope_aggr" + ], + [ + "st_equals" + ], + [ + "st_exteriorring" + ], + [ + "st_flipcoordinates" + ], + [ + "st_force3d" + ], + [ + "st_force_2d" + ], + [ + "st_frechetdistance" + ], + [ + "st_geohash" + ], + [ + "st_geometricmedian" + ], + [ + "st_geometryn" + ], + [ + "st_geomfromewkt" + ], + [ + "st_geomfromgeohash" + ], + [ + "st_geomfromgeojson" + ], + [ + "st_geomfromgml" + ], + [ + "st_geomfromkml" + ], + [ + "st_geomfromwkb" + ], + [ + "st_h3celldistance" + ], + [ + "st_h3cellids" + ], + [ + "st_h3kring" + ], + [ + "st_h3togeom" + ], + [ + "st_hausdorffdistance" + ], + [ + "st_interiorringn" + ], + [ + "st_intersection" + ], + [ + "st_intersection_aggr" + ], + 
[ + "st_intersects" + ], + [ + "st_isclosed" + ], + [ + "st_iscollection" + ], + [ + "st_isring" + ], + [ + "st_issimple" + ], + [ + "st_isvalid" + ], + [ + "st_lengthspheroid" + ], + [ + "st_linefrommultipoint" + ], + [ + "st_linefromtext" + ], + [ + "st_lineinterpolatepoint" + ], + [ + "st_linemerge" + ], + [ + "st_linestringfromtext" + ], + [ + "st_linesubstring" + ], + [ + "st_makeline" + ], + [ + "st_makepoint" + ], + [ + "st_makepolygon" + ], + [ + "st_makevalid" + ], + [ + "st_minimumboundingcircle" + ], + [ + "st_minimumboundingradius" + ], + [ + "st_mlinefromtext" + ], + [ + "st_mpolyfromtext" + ], + [ + "st_multi" + ], + [ + "st_normalize" + ], + [ + "st_nrings" + ], + [ + "st_numgeometries" + ], + [ + "st_numinteriorrings" + ], + [ + "st_numpoints" + ], + [ + "st_orderingequals" + ], + [ + "st_overlaps" + ], + [ + "st_pointfromtext" + ], + [ + "st_pointn" + ], + [ + "st_pointonsurface" + ], + [ + "st_pointz" + ], + [ + "st_polygon" + ], + [ + "st_polygonfromenvelope" + ], + [ + "st_polygonfromtext" + ], + [ + "st_reduceprecision" + ], + [ + "st_removepoint" + ], + [ + "st_reverse" + ], + [ + "st_s2cellids" + ], + [ + "st_setpoint" + ], + [ + "st_simplifypreservetopology" + ], + [ + "st_split" + ], + [ + "st_startpoint" + ], + [ + "st_subdivide" + ], + [ + "st_subdivideexplode" + ], + [ + "st_symdifference" + ], + [ + "st_touches" + ], + [ + "st_transform" + ], + [ + "st_translate" + ], + [ + "st_union" + ], + [ + "st_union_aggr" + ], + [ + "st_voronoipolygons" + ], + [ + "st_within" + ], + [ + "st_x" + ], + [ + "st_y" + ], + [ + "st_z" + ], + [ + "st_zmax" + ], + [ + "st_zmin" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "function", + "type": 
"\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql \n", + "show user functions like 'st_*'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1805f461-ecab-4a03-980d-fb403a3a028e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Queries\n", + "\n", + "> Showing how Sedona (registered Spark SQL) and Mosaic (Scala) can co-exist on the same cluster. Not shown here, but this could also be Mosaic Python bindings." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c1e4ac30-daf0-423c-8117-b7c3c4c06e52", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
wkt
POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))" + ] + ], + "datasetInfos": [ + { + "name": "df", + "schema": { + "fields": [ + { + "metadata": {}, + "name": "wkt", + "nullable": true, + "type": "string" + } + ], + "type": "struct" + }, + "tableIdentifier": null, + "typeStr": "org.apache.spark.sql.DataFrame" + } + ], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "wkt", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "\n", + "val df = Seq(\"POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))\").toDF(\"wkt\")\n", + "display(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "fbb24b11-f88d-46fb-a365-773d35923704", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Here is a Scala call to use the Sedona (Spark SQL) functions using `selectExpr`._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6f44c258-6919-43bc-9b52-a9167ce48078", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
sedona_area
550.0
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 550.0 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "sedona_area", + "type": "\"double\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "display(\n", + " df\n", + " .selectExpr(\"ST_Area(ST_GeomFromText(wkt)) AS sedona_area\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3a484604-c4bc-4234-acf0-32994de54554", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Here is a Scala call to the same Mosaic-provided `ST_Area` function._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ccf12e8d-82ff-47d9-ab5e-f64b2c487223", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
mosaic_area
550.0
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 550.0 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "mosaic_area", + "type": "\"double\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "display(\n", + " df\n", + " .select(st_area($\"wkt\").as(\"mosaic_area\"))\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6dd1e21d-7a84-4c5e-b5f6-b02831d846b0", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Mosaic + Sedona_\n", + "\n", + "> Showing blending Mosaic calls (in Scala) with Sedona (Spark SQL) calls, using `expr`." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e0602e02-01ec-45cd-8c17-aa30e0d0d969", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
mosaic_areasedona_areawkt
550.0550.0POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 550.0, + 550.0, + "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "mosaic_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "sedona_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "wkt", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "display(\n", + " df\n", + " .select(\n", + " st_area($\"wkt\").as(\"mosaic_area\"), // <- mosaic (scala)\n", + " expr(\"ST_Area(ST_GeomFromText(wkt)) AS sedona_area\"), // <- sedona (spark sql)\n", + " $\"wkt\"\n", + " )\n", + ")" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": -1, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "MosaicAndSedona", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/examples/scala/MosaicAndSedona.scala b/notebooks/examples/scala/MosaicAndSedona.scala deleted file mode 100644 index 532b96fa5..000000000 --- a/notebooks/examples/scala/MosaicAndSedona.scala +++ /dev/null @@ -1,47 +0,0 @@ -// Databricks notebook source -// MAGIC %md -// MAGIC # Mosaic & Sedona -// MAGIC -// MAGIC You can combine the usage of Mosaic 
with other geospatial libraries. -// MAGIC -// MAGIC In this example we combine the use of [Sedona](https://sedona.apache.org) and Mosaic. -// MAGIC -// MAGIC ## Setup -// MAGIC -// MAGIC This notebook will run if you have both Mosaic and Sedona installed on your cluster. -// MAGIC -// MAGIC ### Install sedona -// MAGIC -// MAGIC To install Sedona, follow the [official Sedona instructions](https://sedona.apache.org/1.4.0/setup/databricks). - -// COMMAND ---------- - -// Register Sedona in the 'default' database -import org.apache.sedona.sql.utils.SedonaSQLRegistrator -SedonaSQLRegistrator.registerAll(spark) - -// Import Mosaic functions -import com.databricks.labs.mosaic.functions.MosaicContext -import com.databricks.labs.mosaic.H3 -import com.databricks.labs.mosaic.JTS - -val mosaicContext = MosaicContext.build(H3, JTS) -import mosaicContext.functions._ -import org.apache.spark.sql.functions._ - -// COMMAND ---------- - -// Example dataset -val df = spark.createDataFrame(Seq(Tuple1("POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"))).toDF("wkt") - -// COMMAND ---------- - -df - .withColumn("mosaic_area", st_area($"wkt")) // Mosaic - .withColumn("sedona_area", expr("ST_Area(ST_GeomFromWKT(wkt))")) // Sedona - .withColumn("sedona_flipped", expr("ST_FlipCoordinates(ST_GeomFromWKT(wkt))")) // Sedona - .show() - -// COMMAND ---------- - - diff --git a/notebooks/examples/scala/QuickstartNotebook.ipynb b/notebooks/examples/scala/QuickstartNotebook.ipynb new file mode 100644 index 000000000..ef9dc87c8 --- /dev/null +++ b/notebooks/examples/scala/QuickstartNotebook.ipynb @@ -0,0 +1,3116 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "db1d4ea7-d138-4740-ac41-74998430b3df", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Mosaic Quickstart\n", + "\n", + "> Perform a point-in-polygon spatial join between NYC Taxi trips and zones. 
__Note: this does not get into performance tweaks that are available for scaled joins.__\n", + "\n", + "### Notes\n", + "\n", + "For \"pure\" scala, download Mosaic JAR to your local machine (e.g. from [here](https://github.com/databrickslabs/mosaic/releases/download/v_0.3.12/mosaic-0.3.12-jar-with-dependencies.jar) for 0.3.12) and then UPLOAD to your cluster [[1](https://docs.databricks.com/en/libraries/cluster-libraries.html#install-a-library-on-a-cluster)]. \n", + "\n", + "If you have trouble with Volume access:\n", + "\n", + "* For Mosaic 0.3 series (< DBR 13) - you can copy resources to DBFS as a workaround\n", + "* For Mosaic 0.4 series (DBR 13.3 LTS) - you will need to either copy resources to DBFS or set up Unity Catalog + Shared Access, which will involve your workspace admin. Instructions, as updated, will be [here](https://databrickslabs.github.io/mosaic/usage/install-gdal.html).\n", + "\n", + "--- \n", + " __Last Update__ 01 DEC 2023 [Mosaic 0.3.12]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d6cbd7f0-cfa1-41f9-88dc-dccd355343d4", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Install Mosaic\n", + "\n", + "> The Mosaic framework is available via pip install and comes with bindings for Python, SQL, Scala and R. The wheel file that comes with the pip installation registers any JARs needed for the other language bindings." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a08085be-479c-453b-883f-800bc7d022fc", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
import org.apache.spark.sql.functions._\n", + "import com.databricks.labs.mosaic.functions.MosaicContext\n", + "import com.databricks.labs.mosaic.H3\n", + "import com.databricks.labs.mosaic.JTS\n", + "mosaicContext: com.databricks.labs.mosaic.functions.MosaicContext = com.databricks.labs.mosaic.functions.MosaicContext@3cc36d4a\n", + "import mosaicContext.functions._\n", + "formatter: java.text.NumberFormat = java.text.DecimalFormat@674dc\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
import org.apache.spark.sql.functions._\nimport com.databricks.labs.mosaic.functions.MosaicContext\nimport com.databricks.labs.mosaic.H3\nimport com.databricks.labs.mosaic.JTS\nmosaicContext: com.databricks.labs.mosaic.functions.MosaicContext = com.databricks.labs.mosaic.functions.MosaicContext@3cc36d4a\nimport mosaicContext.functions._\nformatter: java.text.NumberFormat = java.text.DecimalFormat@674dc\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "\n", + "// -- configure AQE for more compute heavy operations\n", + "// - choose option-1 or option-2 below, essential for REPARTITION!\n", + "// spark.conf.set(\"spark.databricks.optimizer.adaptive.enabled\", false) // <- option-1: turn off completely for full control\n", + "spark.conf.set(\"spark.sql.adaptive.coalescePartitions.enabled\", false) // <- option-2: just tweak partition management\n", + "spark.conf.set(\"spark.sql.shuffle.partitions\", 1024) // <-- default is 200\n", + "\n", + "// -- spark functions\n", + "import org.apache.spark.sql.functions._\n", + "\n", + "// -- mosaic functions\n", + "import com.databricks.labs.mosaic.functions.MosaicContext\n", + "import com.databricks.labs.mosaic.H3\n", + "import com.databricks.labs.mosaic.JTS\n", + "\n", + "val mosaicContext = MosaicContext.build(H3, JTS)\n", + "import mosaicContext.functions._\n", + "\n", + "// register SQL functions\n", + "mosaicContext.register()\n", + "\n", + "// formatter\n", + "val formatter = java.text.NumberFormat.getIntegerInstance" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9251d814-2e9f-4287-8fd9-769f0bf40c68", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup Data" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f6b988fa-a55c-463c-8225-c549a56d377c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
userName: String = mjohns@databricks.com\n", + "dataDir: String = /tmp/mosaic/mjohns@databricks.com\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
userName: String = mjohns@databricks.com\ndataDir: String = /tmp/mosaic/mjohns@databricks.com\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "val userName = dbutils.notebook.getContext.userName.get\n", + "val dataDir = s\"/tmp/mosaic/$userName\" // <- DBFS\n", + "\n", + "spark.conf.set(\"DATA_DIR\", dataDir)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c8bce9ad-77ef-46e1-857b-cfc8bd02016c", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "%python\n", + "\n", + "import os\n", + "import pathlib\n", + "import requests\n", + "import warnings\n", + "\n", + "warnings.simplefilter(\"ignore\")\n", + "data_dir = spark.conf.get(\"DATA_DIR\")\n", + "os.environ['DATA_DIR'] = data_dir" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "293022d3-40c6-4928-946b-7bfe8a6fbb1e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Download NYC Taxi Zones\n", + "\n", + "> Make sure we have New York City Taxi zone shapes available in our environment." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0327a117-5256-4ec3-8c44-34b6a8dfc555", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
zoneDir: String = /tmp/mosaic/mjohns@databricks.com/taxi_zones\n", + "zoneDirFuse: String = /dbfs/tmp/mosaic/mjohns@databricks.com/taxi_zones\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
zoneDir: String = /tmp/mosaic/mjohns@databricks.com/taxi_zones\nzoneDirFuse: String = /dbfs/tmp/mosaic/mjohns@databricks.com/taxi_zones\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "val zoneDir = s\"$dataDir/taxi_zones\" // <- DBFS\n", + "val zoneDirFuse = s\"/dbfs$zoneDir\" // <- FUSE\n", + "dbutils.fs.mkdirs(zoneDir)\n", + "\n", + "spark.conf.set(\"ZONE_DIR\", zoneDir)\n", + "spark.conf.set(\"ZONE_DIR_FUSE\", zoneDirFuse)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e3283b3f-4b81-459c-b513-2d782e9acc7f", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "%python \n", + "zone_dir = spark.conf.get(\"ZONE_DIR\")\n", + "zone_dir_fuse = spark.conf.get(\"ZONE_DIR_FUSE\")\n", + "\n", + "os.environ['ZONE_DIR'] = zone_dir\n", + "os.environ['ZONE_DIR_FUSE'] = zone_dir_fuse" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "941175c3-4142-4254-b47d-c8e813dfe95c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "...skipping '/dbfs/tmp/mosaic/mjohns@databricks.com/taxi_zones/nyc_taxi_zones.geojson', already exits.\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
pathnamesizemodificationTime
dbfs:/tmp/mosaic/mjohns@databricks.com/taxi_zones/nyc_taxi_zones.geojsonnyc_taxi_zones.geojson38924781701183475000
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "dbfs:/tmp/mosaic/mjohns@databricks.com/taxi_zones/nyc_taxi_zones.geojson", + "nyc_taxi_zones.geojson", + 3892478, + 1701183475000 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "path", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "name", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "size", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "modificationTime", + "type": "\"long\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%python\n", + "zone_url = 'https://data.cityofnewyork.us/api/geospatial/d3c5-ddgc?method=export&format=GeoJSON'\n", + "\n", + "zone_fuse_path = pathlib.Path(zone_dir_fuse) / 'nyc_taxi_zones.geojson'\n", + "if not zone_fuse_path.exists():\n", + " req = requests.get(zone_url)\n", + " with open(zone_fuse_path, 'wb') as f:\n", + " f.write(req.content)\n", + "else:\n", + " print(f\"...skipping '{zone_fuse_path}', already exists.\")\n", + "\n", + "display(dbutils.fs.ls(zone_dir))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "56b5edd3-3e43-428b-b13d-752c2a3a18a3", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Initial Taxi Zone from GeoJSON [Polygons]\n", + "\n", + "> With the functionality Mosaic brings we can easily load GeoJSON files. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e3ded8e6-05d0-4b48-a823-5aba45e63347", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
count? 263\n", + "neighbourhoods: org.apache.spark.sql.DataFrame = [type: string, properties: struct<borough: string, location_id: string ... 4 more fields> ... 2 more fields]\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
count? 263\nneighbourhoods: org.apache.spark.sql.DataFrame = [type: string, properties: struct<borough: string, location_id: string ... 4 more fields> ... 2 more fields]\n
", + "datasetInfos": [ + { + "name": "neighbourhoods", + "schema": { + "fields": [ + { + "metadata": {}, + "name": "type", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "properties", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "borough", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "location_id", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "objectid", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "shape_area", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "shape_leng", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "zone", + "nullable": true, + "type": "string" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "json_geometry", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "geometry", + "nullable": true, + "type": "string" + } + ], + "type": "struct" + }, + "tableIdentifier": null, + "typeStr": "org.apache.spark.sql.DataFrame" + } + ], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "val neighbourhoods = (\n", + " spark.read\n", + " .option(\"multiline\", \"true\")\n", + " .format(\"json\")\n", + " .load(zoneDir)\n", + " .select(col(\"type\"), explode(col(\"features\")).alias(\"feature\"))\n", + " .select(col(\"type\"), col(\"feature.properties\").alias(\"properties\"), to_json(col(\"feature.geometry\")).alias(\"json_geometry\"))\n", + " .withColumn(\"geometry\", st_aswkt(st_geomfromgeojson(col(\"json_geometry\"))))\n", + ")\n", + "println(s\"count? 
${formatter.format(neighbourhoods.count)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9da72bc8-a324-4183-ac82-6b87e95ce7d5", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
typepropertiesjson_geometrygeometry
FeatureCollectionList(EWR, 1, 1, 0.0007823067885, 0.116357453189, Newark Airport){\"coordinates\":[[[[-74.18445299999996,40.694995999999904],[-74.18448899999999,40.69509499999987],[-74.18449799999996,40.69518499999987],[-74.18438099999997,40.69587799999989],[-74.18428199999994,40.6962109999999],[-74.18402099999997,40.697074999999884],[-74.18391299999996,40.69750699999986],[-74.18375099999997,40.69779499999988],[-74.18363399999998,40.6983259999999],[-74.18356199999994,40.698451999999875],[-74.18354399999998,40.69855999999988],[-74.18350799999996,40.69870399999992],[-74.18327399999998,40.70008999999988],[-74.18315699999994,40.701214999999884],[-74.18316599999997,40.702384999999886],[-74.18313899999998,40.7026279999999],[-74.18309399999998,40.7028529999999],[-74.18299499999995,40.70315899999985],[-74.18284199999994,40.70346499999989],[-74.18264399999998,40.70373499999988],[-74.18242799999996,40.70395099999992],[-74.18220299999996,40.704139999999896],[-74.18203199999994,40.70425699999987],[-74.18180699999994,40.7043919999999],[-74.18157299999996,40.70449999999988],[-74.18132099999997,40.70460799999991],[-74.18080799999996,40.7047879999999],[-74.179467,40.70534599999992],[-74.17887299999995,40.70554399999987],[-74.17831499999994,40.70572399999987],[-74.17776599999996,40.70589499999988],[-74.17709099999996,40.706092999999896],[-74.17699199999998,40.70613799999988],[-74.17689299999995,40.70619199999988],[-74.17664999999994,40.70641699999988],[-74.17642499999994,40.706695999999916],[-74.17628999999994,40.70689399999988],[-74.17608299999995,40.70710999999989],[-74.17599299999995,40.70719099999991],[-74.17589399999997,40.707262999999905],[-74.17565999999994,40.70737999999988],[-74.17538099999996,40.707469999999915],[-74.17515599999996,40.707514999999894],[-74.17475999999994,40.707595999999924],[-74.17417499999993,40.70766799999991],[-74.17388699999998,40.70773099999992],[-74.17347299999994,40.707748999999865],[-74.17275299999994,40.707802999999906],[-74.17188899999996,40.7079
10999999854],[-74.17163699999998,40.70795599999986],[-74.17133999999999,40.707964999999895],[-74.17120499999999,40.70795599999986],[-74.16994499999998,40.707973999999886],[-74.16888299999994,40.7079379999999],[-74.16681299999993,40.70785699999989],[-74.16442799999999,40.70779399999987],[-74.16401399999995,40.70777599999992],[-74.16233999999997,40.707721999999876],[-74.16081899999995,40.70764099999991],[-74.16057599999993,40.70760499999988],[-74.16033299999998,40.70756899999987],[-74.160063,40.7074879999999],[-74.15938799999998,40.707262999999905],[-74.15904599999999,40.707145999999916],[-74.15891999999997,40.70710999999989],[-74.15827199999995,40.70687599999993],[-74.15459099999998,40.705651999999894],[-74.15409599999998,40.70544499999989],[-74.15401499999997,40.70538199999988],[-74.15387999999996,40.705327999999895],[-74.15376299999997,40.705408999999875],[-74.15323199999995,40.70524699999987],[-74.15317799999997,40.70531899999989],[-74.15306999999996,40.7052829999999],[-74.15359199999995,40.70437399999987],[-74.15386199999995,40.7038429999999],[-74.15513999999996,40.70155699999987],[-74.15544599999998,40.70108899999988],[-74.15575199999995,40.7006659999999],[-74.15600399999994,40.70026099999991],[-74.15635499999996,40.69975699999986],[-74.15745299999998,40.69809199999988],[-74.15754299999998,40.6979389999999],[-74.15758799999998,40.69781299999988],[-74.15762399999994,40.69767799999991],[-74.15829899999994,40.696705999999885],[-74.15951399999994,40.69488799999988],[-74.15958599999993,40.69476199999984],[-74.16014399999995,40.69410499999988],[-74.16057599999993,40.693222999999875],[-74.16262799999998,40.69028899999989],[-74.16279899999995,40.69002799999989],[-74.16290699999996,40.68987499999987],[-74.16292499999997,40.689874999999866],[-74.16295199999996,40.689874999999866],[-74.16306899999995,40.68989299999988],[-74.16309599999994,40.689928999999886],[-74.16322199999996,40.68998299999989],[-74.16331199999996,40.68999199999993],[-74.16341099999994,40.69000099999988]
,[-74.16352799999999,40.69000999999986],[-74.16380699999996,40.69004599999989],[-74.16410399999995,40.690081999999904],[-74.16417599999994,40.690081999999904],[-74.16422999999998,40.69005499999988],[-74.16436499999998,40.69003699999991],[-74.16450899999995,40.68998299999986],[-74.16467099999994,40.68988399999989],[-74.16479699999996,40.689757999999884],[-74.16491399999995,40.689586999999904],[-74.16499499999998,40.689388999999885],[-74.16528299999999,40.68891199999991],[-74.16542699999997,40.6887589999999],[-74.16548099999994,40.68863299999987],[-74.16560699999997,40.68842599999988],[-74.16576899999995,40.68802999999986],[-74.16587699999997,40.68787699999991],[-74.16583199999997,40.68757999999987],[-74.16582299999999,40.68748999999987],[-74.16580499999998,40.687156999999914],[-74.16582299999999,40.68703999999986],[-74.16589499999998,40.6868419999999],[-74.16604799999999,40.68655399999988],[-74.16639899999996,40.686022999999864],[-74.16650699999997,40.68588799999986],[-74.16674099999994,40.685491999999925],[-74.16695699999997,40.68523099999988],[-74.16738899999996,40.684546999999924],[-74.16781199999997,40.6839439999999],[-74.16791099999995,40.68379099999988],[-74.16804599999995,40.68360199999991],[-74.16816299999994,40.683475999999885],[-74.16822599999995,40.68334999999991],[-74.16848699999997,40.68299899999991],[-74.16886499999998,40.68239599999987],[-74.16916199999997,40.68199999999991],[-74.16929699999997,40.68178399999989],[-74.16947699999997,40.68155899999991],[-74.16981899999996,40.681018999999885],[-74.16995399999996,40.680874999999915],[-74.17005299999994,40.68066799999987],[-74.17041299999994,40.6801549999999],[-74.17051199999997,40.67999299999987],[-74.17067399999996,40.679650999999886],[-74.17093499999999,40.679290999999864],[-74.17144799999994,40.67847199999989],[-74.17151999999999,40.678381999999885],[-74.17160999999999,40.678255999999884],[-74.17193399999996,40.67782399999988],[-74.17200599999995,40.67773399999988],[-74.17283399999997,40.67656399999988
],[-74.17314899999997,40.67619499999991],[-74.17322999999999,40.6760779999999],[-74.17329299999994,40.67601499999989],[-74.17358999999993,40.67571799999991],[-74.17423799999995,40.67493499999991],[-74.17437299999995,40.674817999999895],[-74.17484999999994,40.67432299999992],[-74.17500299999995,40.6741699999999],[-74.17538999999995,40.67375599999987],[-74.17604699999998,40.673044999999895],[-74.17630799999995,40.67276599999986],[-74.17641599999996,40.672621999999876],[-74.17663199999998,40.67239699999989],[-74.17678499999994,40.67218099999991],[-74.17697399999997,40.6719379999999],[-74.17709099999996,40.671784999999886],[-74.17734299999995,40.67155999999988],[-74.17754999999994,40.67142499999989],[-74.17778399999997,40.671316999999874],[-74.17802699999999,40.671208999999884],[-74.17862999999994,40.671037999999896],[-74.17888199999999,40.671001999999895],[-74.17912499999994,40.67099299999991],[-74.17933199999999,40.67101099999992],[-74.17979099999997,40.67115499999989],[-74.17997999999994,40.671208999999884],[-74.18010599999997,40.671262999999904],[-74.18030399999998,40.67129899999986],[-74.18133899999998,40.67170399999986],[-74.18213999999996,40.67202799999989],[-74.18384999999995,40.672648999999886],[-74.18437199999994,40.67290999999989],[-74.18458799999996,40.67302699999988],[-74.18492099999997,40.673269999999896],[-74.18503799999996,40.67335999999989],[-74.18513699999994,40.673458999999866],[-74.18547899999999,40.67390899999987],[-74.18594699999994,40.674664999999905],[-74.18670299999997,40.67578999999992],[-74.18733299999997,40.67674399999987],[-74.18767499999996,40.67729299999991],[-74.18795399999995,40.67761699999989],[-74.18819699999995,40.67792299999992],[-74.18852099999998,40.67848099999987],[-74.18877299999997,40.67885899999989],[-74.18905199999995,40.67933599999985],[-74.18935799999997,40.67975899999988],[-74.18949299999997,40.680091999999895],[-74.18969999999996,40.680793999999885],[-74.18977199999995,40.68113599999987],[-74.189781,40.681198999999886],[-7
4.18983499999996,40.68131599999987],[-74.18991599999998,40.68154099999988],[-74.18996999999996,40.6818019999999],[-74.18999699999995,40.6822519999999],[-74.18999699999995,40.68262999999992],[-74.18996999999996,40.68295399999989],[-74.18998799999997,40.68317899999989],[-74.18995199999995,40.683520999999885],[-74.18993399999994,40.68370999999992],[-74.189871,40.684078999999876],[-74.189781,40.68481699999991],[-74.18976299999997,40.68503299999986],[-74.18962799999997,40.686103999999915],[-74.18955599999998,40.68689599999987],[-74.18951999999996,40.6872019999999],[-74.18947499999996,40.68748999999985],[-74.18939399999994,40.68773299999988],[-74.18939399999994,40.68783199999991],[-74.18941199999995,40.687939999999855],[-74.18940299999997,40.68809299999987],[-74.18934899999994,40.68826399999989],[-74.18922299999997,40.68862399999989],[-74.18898899999994,40.68904699999991],[-74.18870099999998,40.689442999999876],[-74.18779199999994,40.690189999999866],[-74.18723399999999,40.69059499999986],[-74.18636999999995,40.69118899999991],[-74.18591099999998,40.69144999999988],[-74.18563199999994,40.69164799999987],[-74.18445299999996,40.694995999999904]]]],\"type\":\"MultiPolygon\"}MULTIPOLYGON (((-74.18445299999996 40.694995999999904, -74.18448899999999 40.69509499999987, -74.18449799999996 40.69518499999987, -74.18438099999997 40.69587799999989, -74.18428199999994 40.6962109999999, -74.18402099999997 40.697074999999884, -74.18391299999996 40.69750699999986, -74.18375099999997 40.69779499999988, -74.18363399999998 40.6983259999999, -74.18356199999994 40.698451999999875, -74.18354399999998 40.69855999999988, -74.18350799999996 40.69870399999992, -74.18327399999998 40.70008999999988, -74.18315699999994 40.701214999999884, -74.18316599999997 40.702384999999886, -74.18313899999998 40.7026279999999, -74.18309399999998 40.7028529999999, -74.18299499999995 40.70315899999985, -74.18284199999994 40.70346499999989, -74.18264399999998 40.70373499999988, -74.18242799999996 40.70395099999992, 
-74.18220299999996 40.704139999999896, -74.18203199999994 40.70425699999987, -74.18180699999994 40.7043919999999, -74.18157299999996 40.70449999999988, -74.18132099999997 40.70460799999991, -74.18080799999996 40.7047879999999, -74.179467 40.70534599999992, -74.17887299999995 40.70554399999987, -74.17831499999994 40.70572399999987, -74.17776599999996 40.70589499999988, -74.17709099999996 40.706092999999896, -74.17699199999998 40.70613799999988, -74.17689299999995 40.70619199999988, -74.17664999999994 40.70641699999988, -74.17642499999994 40.706695999999916, -74.17628999999994 40.70689399999988, -74.17608299999995 40.70710999999989, -74.17599299999995 40.70719099999991, -74.17589399999997 40.707262999999905, -74.17565999999994 40.70737999999988, -74.17538099999996 40.707469999999915, -74.17515599999996 40.707514999999894, -74.17475999999994 40.707595999999924, -74.17417499999993 40.70766799999991, -74.17388699999998 40.70773099999992, -74.17347299999994 40.707748999999865, -74.17275299999994 40.707802999999906, -74.17188899999996 40.707910999999854, -74.17163699999998 40.70795599999986, -74.17133999999999 40.707964999999895, -74.17120499999999 40.70795599999986, -74.16994499999998 40.707973999999886, -74.16888299999994 40.7079379999999, -74.16681299999993 40.70785699999989, -74.16442799999999 40.70779399999987, -74.16401399999995 40.70777599999992, -74.16233999999997 40.707721999999876, -74.16081899999995 40.70764099999991, -74.16057599999993 40.70760499999988, -74.16033299999998 40.70756899999987, -74.160063 40.7074879999999, -74.15938799999998 40.707262999999905, -74.15904599999999 40.707145999999916, -74.15891999999997 40.70710999999989, -74.15827199999995 40.70687599999993, -74.15459099999998 40.705651999999894, -74.15409599999998 40.70544499999989, -74.15401499999997 40.70538199999988, -74.15387999999996 40.705327999999895, -74.15376299999997 40.705408999999875, -74.15323199999995 40.70524699999987, -74.15317799999997 40.70531899999989, -74.15306999999996 
40.7052829999999, -74.15359199999995 40.70437399999987, -74.15386199999995 40.7038429999999, -74.15513999999996 40.70155699999987, -74.15544599999998 40.70108899999988, -74.15575199999995 40.7006659999999, -74.15600399999994 40.70026099999991, -74.15635499999996 40.69975699999986, -74.15745299999998 40.69809199999988, -74.15754299999998 40.6979389999999, -74.15758799999998 40.69781299999988, -74.15762399999994 40.69767799999991, -74.15829899999994 40.696705999999885, -74.15951399999994 40.69488799999988, -74.15958599999993 40.69476199999984, -74.16014399999995 40.69410499999988, -74.16057599999993 40.693222999999875, -74.16262799999998 40.69028899999989, -74.16279899999995 40.69002799999989, -74.16290699999996 40.68987499999987, -74.16292499999997 40.689874999999866, -74.16295199999996 40.689874999999866, -74.16306899999995 40.68989299999988, -74.16309599999994 40.689928999999886, -74.16322199999996 40.68998299999989, -74.16331199999996 40.68999199999993, -74.16341099999994 40.69000099999988, -74.16352799999999 40.69000999999986, -74.16380699999996 40.69004599999989, -74.16410399999995 40.690081999999904, -74.16417599999994 40.690081999999904, -74.16422999999998 40.69005499999988, -74.16436499999998 40.69003699999991, -74.16450899999995 40.68998299999986, -74.16467099999994 40.68988399999989, -74.16479699999996 40.689757999999884, -74.16491399999995 40.689586999999904, -74.16499499999998 40.689388999999885, -74.16528299999999 40.68891199999991, -74.16542699999997 40.6887589999999, -74.16548099999994 40.68863299999987, -74.16560699999997 40.68842599999988, -74.16576899999995 40.68802999999986, -74.16587699999997 40.68787699999991, -74.16583199999997 40.68757999999987, -74.16582299999999 40.68748999999987, -74.16580499999998 40.687156999999914, -74.16582299999999 40.68703999999986, -74.16589499999998 40.6868419999999, -74.16604799999999 40.68655399999988, -74.16639899999996 40.686022999999864, -74.16650699999997 40.68588799999986, -74.16674099999994 
40.685491999999925, -74.16695699999997 40.68523099999988, -74.16738899999996 40.684546999999924, -74.16781199999997 40.6839439999999, -74.16791099999995 40.68379099999988, -74.16804599999995 40.68360199999991, -74.16816299999994 40.683475999999885, -74.16822599999995 40.68334999999991, -74.16848699999997 40.68299899999991, -74.16886499999998 40.68239599999987, -74.16916199999997 40.68199999999991, -74.16929699999997 40.68178399999989, -74.16947699999997 40.68155899999991, -74.16981899999996 40.681018999999885, -74.16995399999996 40.680874999999915, -74.17005299999994 40.68066799999987, -74.17041299999994 40.6801549999999, -74.17051199999997 40.67999299999987, -74.17067399999996 40.679650999999886, -74.17093499999999 40.679290999999864, -74.17144799999994 40.67847199999989, -74.17151999999999 40.678381999999885, -74.17160999999999 40.678255999999884, -74.17193399999996 40.67782399999988, -74.17200599999995 40.67773399999988, -74.17283399999997 40.67656399999988, -74.17314899999997 40.67619499999991, -74.17322999999999 40.6760779999999, -74.17329299999994 40.67601499999989, -74.17358999999993 40.67571799999991, -74.17423799999995 40.67493499999991, -74.17437299999995 40.674817999999895, -74.17484999999994 40.67432299999992, -74.17500299999995 40.6741699999999, -74.17538999999995 40.67375599999987, -74.17604699999998 40.673044999999895, -74.17630799999995 40.67276599999986, -74.17641599999996 40.672621999999876, -74.17663199999998 40.67239699999989, -74.17678499999994 40.67218099999991, -74.17697399999997 40.6719379999999, -74.17709099999996 40.671784999999886, -74.17734299999995 40.67155999999988, -74.17754999999994 40.67142499999989, -74.17778399999997 40.671316999999874, -74.17802699999999 40.671208999999884, -74.17862999999994 40.671037999999896, -74.17888199999999 40.671001999999895, -74.17912499999994 40.67099299999991, -74.17933199999999 40.67101099999992, -74.17979099999997 40.67115499999989, -74.17997999999994 40.671208999999884, -74.18010599999997 
40.671262999999904, -74.18030399999998 40.67129899999986, -74.18133899999998 40.67170399999986, -74.18213999999996 40.67202799999989, -74.18384999999995 40.672648999999886, -74.18437199999994 40.67290999999989, -74.18458799999996 40.67302699999988, -74.18492099999997 40.673269999999896, -74.18503799999996 40.67335999999989, -74.18513699999994 40.673458999999866, -74.18547899999999 40.67390899999987, -74.18594699999994 40.674664999999905, -74.18670299999997 40.67578999999992, -74.18733299999997 40.67674399999987, -74.18767499999996 40.67729299999991, -74.18795399999995 40.67761699999989, -74.18819699999995 40.67792299999992, -74.18852099999998 40.67848099999987, -74.18877299999997 40.67885899999989, -74.18905199999995 40.67933599999985, -74.18935799999997 40.67975899999988, -74.18949299999997 40.680091999999895, -74.18969999999996 40.680793999999885, -74.18977199999995 40.68113599999987, -74.189781 40.681198999999886, -74.18983499999996 40.68131599999987, -74.18991599999998 40.68154099999988, -74.18996999999996 40.6818019999999, -74.18999699999995 40.6822519999999, -74.18999699999995 40.68262999999992, -74.18996999999996 40.68295399999989, -74.18998799999997 40.68317899999989, -74.18995199999995 40.683520999999885, -74.18993399999994 40.68370999999992, -74.189871 40.684078999999876, -74.189781 40.68481699999991, -74.18976299999997 40.68503299999986, -74.18962799999997 40.686103999999915, -74.18955599999998 40.68689599999987, -74.18951999999996 40.6872019999999, -74.18947499999996 40.68748999999985, -74.18939399999994 40.68773299999988, -74.18939399999994 40.68783199999991, -74.18941199999995 40.687939999999855, -74.18940299999997 40.68809299999987, -74.18934899999994 40.68826399999989, -74.18922299999997 40.68862399999989, -74.18898899999994 40.68904699999991, -74.18870099999998 40.689442999999876, -74.18779199999994 40.690189999999866, -74.18723399999999 40.69059499999986, -74.18636999999995 40.69118899999991, -74.18591099999998 40.69144999999988, 
-74.18563199999994 40.69164799999987, -74.18445299999996 40.694995999999904)))
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "FeatureCollection", + [ + "EWR", + "1", + "1", + "0.0007823067885", + "0.116357453189", + "Newark Airport" + ], + "{\"coordinates\":[[[[-74.18445299999996,40.694995999999904],[-74.18448899999999,40.69509499999987],[-74.18449799999996,40.69518499999987],[-74.18438099999997,40.69587799999989],[-74.18428199999994,40.6962109999999],[-74.18402099999997,40.697074999999884],[-74.18391299999996,40.69750699999986],[-74.18375099999997,40.69779499999988],[-74.18363399999998,40.6983259999999],[-74.18356199999994,40.698451999999875],[-74.18354399999998,40.69855999999988],[-74.18350799999996,40.69870399999992],[-74.18327399999998,40.70008999999988],[-74.18315699999994,40.701214999999884],[-74.18316599999997,40.702384999999886],[-74.18313899999998,40.7026279999999],[-74.18309399999998,40.7028529999999],[-74.18299499999995,40.70315899999985],[-74.18284199999994,40.70346499999989],[-74.18264399999998,40.70373499999988],[-74.18242799999996,40.70395099999992],[-74.18220299999996,40.704139999999896],[-74.18203199999994,40.70425699999987],[-74.18180699999994,40.7043919999999],[-74.18157299999996,40.70449999999988],[-74.18132099999997,40.70460799999991],[-74.18080799999996,40.7047879999999],[-74.179467,40.70534599999992],[-74.17887299999995,40.70554399999987],[-74.17831499999994,40.70572399999987],[-74.17776599999996,40.70589499999988],[-74.17709099999996,40.706092999999896],[-74.17699199999998,40.70613799999988],[-74.17689299999995,40.70619199999988],[-74.17664999999994,40.70641699999988],[-74.17642499999994,40.706695999999916],[-74.17628999999994,40.70689399999988],[-74.17608299999995,40.70710999999989],[-74.17599299999995,40.70719099999991],[-74.17589399999997,40.707262999999905],[-74.17565999999
994,40.70737999999988],[-74.17538099999996,40.707469999999915],[-74.17515599999996,40.707514999999894],[-74.17475999999994,40.707595999999924],[-74.17417499999993,40.70766799999991],[-74.17388699999998,40.70773099999992],[-74.17347299999994,40.707748999999865],[-74.17275299999994,40.707802999999906],[-74.17188899999996,40.707910999999854],[-74.17163699999998,40.70795599999986],[-74.17133999999999,40.707964999999895],[-74.17120499999999,40.70795599999986],[-74.16994499999998,40.707973999999886],[-74.16888299999994,40.7079379999999],[-74.16681299999993,40.70785699999989],[-74.16442799999999,40.70779399999987],[-74.16401399999995,40.70777599999992],[-74.16233999999997,40.707721999999876],[-74.16081899999995,40.70764099999991],[-74.16057599999993,40.70760499999988],[-74.16033299999998,40.70756899999987],[-74.160063,40.7074879999999],[-74.15938799999998,40.707262999999905],[-74.15904599999999,40.707145999999916],[-74.15891999999997,40.70710999999989],[-74.15827199999995,40.70687599999993],[-74.15459099999998,40.705651999999894],[-74.15409599999998,40.70544499999989],[-74.15401499999997,40.70538199999988],[-74.15387999999996,40.705327999999895],[-74.15376299999997,40.705408999999875],[-74.15323199999995,40.70524699999987],[-74.15317799999997,40.70531899999989],[-74.15306999999996,40.7052829999999],[-74.15359199999995,40.70437399999987],[-74.15386199999995,40.7038429999999],[-74.15513999999996,40.70155699999987],[-74.15544599999998,40.70108899999988],[-74.15575199999995,40.7006659999999],[-74.15600399999994,40.70026099999991],[-74.15635499999996,40.69975699999986],[-74.15745299999998,40.69809199999988],[-74.15754299999998,40.6979389999999],[-74.15758799999998,40.69781299999988],[-74.15762399999994,40.69767799999991],[-74.15829899999994,40.696705999999885],[-74.15951399999994,40.69488799999988],[-74.15958599999993,40.69476199999984],[-74.16014399999995,40.69410499999988],[-74.16057599999993,40.693222999999875],[-74.16262799999998,40.69028899999989],[-74.16279899999995,40.69
002799999989],[-74.16290699999996,40.68987499999987],[-74.16292499999997,40.689874999999866],[-74.16295199999996,40.689874999999866],[-74.16306899999995,40.68989299999988],[-74.16309599999994,40.689928999999886],[-74.16322199999996,40.68998299999989],[-74.16331199999996,40.68999199999993],[-74.16341099999994,40.69000099999988],[-74.16352799999999,40.69000999999986],[-74.16380699999996,40.69004599999989],[-74.16410399999995,40.690081999999904],[-74.16417599999994,40.690081999999904],[-74.16422999999998,40.69005499999988],[-74.16436499999998,40.69003699999991],[-74.16450899999995,40.68998299999986],[-74.16467099999994,40.68988399999989],[-74.16479699999996,40.689757999999884],[-74.16491399999995,40.689586999999904],[-74.16499499999998,40.689388999999885],[-74.16528299999999,40.68891199999991],[-74.16542699999997,40.6887589999999],[-74.16548099999994,40.68863299999987],[-74.16560699999997,40.68842599999988],[-74.16576899999995,40.68802999999986],[-74.16587699999997,40.68787699999991],[-74.16583199999997,40.68757999999987],[-74.16582299999999,40.68748999999987],[-74.16580499999998,40.687156999999914],[-74.16582299999999,40.68703999999986],[-74.16589499999998,40.6868419999999],[-74.16604799999999,40.68655399999988],[-74.16639899999996,40.686022999999864],[-74.16650699999997,40.68588799999986],[-74.16674099999994,40.685491999999925],[-74.16695699999997,40.68523099999988],[-74.16738899999996,40.684546999999924],[-74.16781199999997,40.6839439999999],[-74.16791099999995,40.68379099999988],[-74.16804599999995,40.68360199999991],[-74.16816299999994,40.683475999999885],[-74.16822599999995,40.68334999999991],[-74.16848699999997,40.68299899999991],[-74.16886499999998,40.68239599999987],[-74.16916199999997,40.68199999999991],[-74.16929699999997,40.68178399999989],[-74.16947699999997,40.68155899999991],[-74.16981899999996,40.681018999999885],[-74.16995399999996,40.680874999999915],[-74.17005299999994,40.68066799999987],[-74.17041299999994,40.6801549999999],[-74.17051199999997,40.67
999299999987],[-74.17067399999996,40.679650999999886],[-74.17093499999999,40.679290999999864],[-74.17144799999994,40.67847199999989],[-74.17151999999999,40.678381999999885],[-74.17160999999999,40.678255999999884],[-74.17193399999996,40.67782399999988],[-74.17200599999995,40.67773399999988],[-74.17283399999997,40.67656399999988],[-74.17314899999997,40.67619499999991],[-74.17322999999999,40.6760779999999],[-74.17329299999994,40.67601499999989],[-74.17358999999993,40.67571799999991],[-74.17423799999995,40.67493499999991],[-74.17437299999995,40.674817999999895],[-74.17484999999994,40.67432299999992],[-74.17500299999995,40.6741699999999],[-74.17538999999995,40.67375599999987],[-74.17604699999998,40.673044999999895],[-74.17630799999995,40.67276599999986],[-74.17641599999996,40.672621999999876],[-74.17663199999998,40.67239699999989],[-74.17678499999994,40.67218099999991],[-74.17697399999997,40.6719379999999],[-74.17709099999996,40.671784999999886],[-74.17734299999995,40.67155999999988],[-74.17754999999994,40.67142499999989],[-74.17778399999997,40.671316999999874],[-74.17802699999999,40.671208999999884],[-74.17862999999994,40.671037999999896],[-74.17888199999999,40.671001999999895],[-74.17912499999994,40.67099299999991],[-74.17933199999999,40.67101099999992],[-74.17979099999997,40.67115499999989],[-74.17997999999994,40.671208999999884],[-74.18010599999997,40.671262999999904],[-74.18030399999998,40.67129899999986],[-74.18133899999998,40.67170399999986],[-74.18213999999996,40.67202799999989],[-74.18384999999995,40.672648999999886],[-74.18437199999994,40.67290999999989],[-74.18458799999996,40.67302699999988],[-74.18492099999997,40.673269999999896],[-74.18503799999996,40.67335999999989],[-74.18513699999994,40.673458999999866],[-74.18547899999999,40.67390899999987],[-74.18594699999994,40.674664999999905],[-74.18670299999997,40.67578999999992],[-74.18733299999997,40.67674399999987],[-74.18767499999996,40.67729299999991],[-74.18795399999995,40.67761699999989],[-74.18819699999995,4
0.67792299999992],[-74.18852099999998,40.67848099999987],[-74.18877299999997,40.67885899999989],[-74.18905199999995,40.67933599999985],[-74.18935799999997,40.67975899999988],[-74.18949299999997,40.680091999999895],[-74.18969999999996,40.680793999999885],[-74.18977199999995,40.68113599999987],[-74.189781,40.681198999999886],[-74.18983499999996,40.68131599999987],[-74.18991599999998,40.68154099999988],[-74.18996999999996,40.6818019999999],[-74.18999699999995,40.6822519999999],[-74.18999699999995,40.68262999999992],[-74.18996999999996,40.68295399999989],[-74.18998799999997,40.68317899999989],[-74.18995199999995,40.683520999999885],[-74.18993399999994,40.68370999999992],[-74.189871,40.684078999999876],[-74.189781,40.68481699999991],[-74.18976299999997,40.68503299999986],[-74.18962799999997,40.686103999999915],[-74.18955599999998,40.68689599999987],[-74.18951999999996,40.6872019999999],[-74.18947499999996,40.68748999999985],[-74.18939399999994,40.68773299999988],[-74.18939399999994,40.68783199999991],[-74.18941199999995,40.687939999999855],[-74.18940299999997,40.68809299999987],[-74.18934899999994,40.68826399999989],[-74.18922299999997,40.68862399999989],[-74.18898899999994,40.68904699999991],[-74.18870099999998,40.689442999999876],[-74.18779199999994,40.690189999999866],[-74.18723399999999,40.69059499999986],[-74.18636999999995,40.69118899999991],[-74.18591099999998,40.69144999999988],[-74.18563199999994,40.69164799999987],[-74.18445299999996,40.694995999999904]]]],\"type\":\"MultiPolygon\"}", + "MULTIPOLYGON (((-74.18445299999996 40.694995999999904, -74.18448899999999 40.69509499999987, -74.18449799999996 40.69518499999987, -74.18438099999997 40.69587799999989, -74.18428199999994 40.6962109999999, -74.18402099999997 40.697074999999884, -74.18391299999996 40.69750699999986, -74.18375099999997 40.69779499999988, -74.18363399999998 40.6983259999999, -74.18356199999994 40.698451999999875, -74.18354399999998 40.69855999999988, -74.18350799999996 40.69870399999992, 
-74.18327399999998 40.70008999999988, -74.18315699999994 40.701214999999884, -74.18316599999997 40.702384999999886, -74.18313899999998 40.7026279999999, -74.18309399999998 40.7028529999999, -74.18299499999995 40.70315899999985, -74.18284199999994 40.70346499999989, -74.18264399999998 40.70373499999988, -74.18242799999996 40.70395099999992, -74.18220299999996 40.704139999999896, -74.18203199999994 40.70425699999987, -74.18180699999994 40.7043919999999, -74.18157299999996 40.70449999999988, -74.18132099999997 40.70460799999991, -74.18080799999996 40.7047879999999, -74.179467 40.70534599999992, -74.17887299999995 40.70554399999987, -74.17831499999994 40.70572399999987, -74.17776599999996 40.70589499999988, -74.17709099999996 40.706092999999896, -74.17699199999998 40.70613799999988, -74.17689299999995 40.70619199999988, -74.17664999999994 40.70641699999988, -74.17642499999994 40.706695999999916, -74.17628999999994 40.70689399999988, -74.17608299999995 40.70710999999989, -74.17599299999995 40.70719099999991, -74.17589399999997 40.707262999999905, -74.17565999999994 40.70737999999988, -74.17538099999996 40.707469999999915, -74.17515599999996 40.707514999999894, -74.17475999999994 40.707595999999924, -74.17417499999993 40.70766799999991, -74.17388699999998 40.70773099999992, -74.17347299999994 40.707748999999865, -74.17275299999994 40.707802999999906, -74.17188899999996 40.707910999999854, -74.17163699999998 40.70795599999986, -74.17133999999999 40.707964999999895, -74.17120499999999 40.70795599999986, -74.16994499999998 40.707973999999886, -74.16888299999994 40.7079379999999, -74.16681299999993 40.70785699999989, -74.16442799999999 40.70779399999987, -74.16401399999995 40.70777599999992, -74.16233999999997 40.707721999999876, -74.16081899999995 40.70764099999991, -74.16057599999993 40.70760499999988, -74.16033299999998 40.70756899999987, -74.160063 40.7074879999999, -74.15938799999998 40.707262999999905, -74.15904599999999 40.707145999999916, -74.15891999999997 
40.70710999999989, -74.15827199999995 40.70687599999993, -74.15459099999998 40.705651999999894, -74.15409599999998 40.70544499999989, -74.15401499999997 40.70538199999988, -74.15387999999996 40.705327999999895, -74.15376299999997 40.705408999999875, -74.15323199999995 40.70524699999987, -74.15317799999997 40.70531899999989, -74.15306999999996 40.7052829999999, -74.15359199999995 40.70437399999987, -74.15386199999995 40.7038429999999, -74.15513999999996 40.70155699999987, -74.15544599999998 40.70108899999988, -74.15575199999995 40.7006659999999, -74.15600399999994 40.70026099999991, -74.15635499999996 40.69975699999986, -74.15745299999998 40.69809199999988, -74.15754299999998 40.6979389999999, -74.15758799999998 40.69781299999988, -74.15762399999994 40.69767799999991, -74.15829899999994 40.696705999999885, -74.15951399999994 40.69488799999988, -74.15958599999993 40.69476199999984, -74.16014399999995 40.69410499999988, -74.16057599999993 40.693222999999875, -74.16262799999998 40.69028899999989, -74.16279899999995 40.69002799999989, -74.16290699999996 40.68987499999987, -74.16292499999997 40.689874999999866, -74.16295199999996 40.689874999999866, -74.16306899999995 40.68989299999988, -74.16309599999994 40.689928999999886, -74.16322199999996 40.68998299999989, -74.16331199999996 40.68999199999993, -74.16341099999994 40.69000099999988, -74.16352799999999 40.69000999999986, -74.16380699999996 40.69004599999989, -74.16410399999995 40.690081999999904, -74.16417599999994 40.690081999999904, -74.16422999999998 40.69005499999988, -74.16436499999998 40.69003699999991, -74.16450899999995 40.68998299999986, -74.16467099999994 40.68988399999989, -74.16479699999996 40.689757999999884, -74.16491399999995 40.689586999999904, -74.16499499999998 40.689388999999885, -74.16528299999999 40.68891199999991, -74.16542699999997 40.6887589999999, -74.16548099999994 40.68863299999987, -74.16560699999997 40.68842599999988, -74.16576899999995 40.68802999999986, -74.16587699999997 
40.68787699999991, -74.16583199999997 40.68757999999987, -74.16582299999999 40.68748999999987, -74.16580499999998 40.687156999999914, -74.16582299999999 40.68703999999986, -74.16589499999998 40.6868419999999, -74.16604799999999 40.68655399999988, -74.16639899999996 40.686022999999864, -74.16650699999997 40.68588799999986, -74.16674099999994 40.685491999999925, -74.16695699999997 40.68523099999988, -74.16738899999996 40.684546999999924, -74.16781199999997 40.6839439999999, -74.16791099999995 40.68379099999988, -74.16804599999995 40.68360199999991, -74.16816299999994 40.683475999999885, -74.16822599999995 40.68334999999991, -74.16848699999997 40.68299899999991, -74.16886499999998 40.68239599999987, -74.16916199999997 40.68199999999991, -74.16929699999997 40.68178399999989, -74.16947699999997 40.68155899999991, -74.16981899999996 40.681018999999885, -74.16995399999996 40.680874999999915, -74.17005299999994 40.68066799999987, -74.17041299999994 40.6801549999999, -74.17051199999997 40.67999299999987, -74.17067399999996 40.679650999999886, -74.17093499999999 40.679290999999864, -74.17144799999994 40.67847199999989, -74.17151999999999 40.678381999999885, -74.17160999999999 40.678255999999884, -74.17193399999996 40.67782399999988, -74.17200599999995 40.67773399999988, -74.17283399999997 40.67656399999988, -74.17314899999997 40.67619499999991, -74.17322999999999 40.6760779999999, -74.17329299999994 40.67601499999989, -74.17358999999993 40.67571799999991, -74.17423799999995 40.67493499999991, -74.17437299999995 40.674817999999895, -74.17484999999994 40.67432299999992, -74.17500299999995 40.6741699999999, -74.17538999999995 40.67375599999987, -74.17604699999998 40.673044999999895, -74.17630799999995 40.67276599999986, -74.17641599999996 40.672621999999876, -74.17663199999998 40.67239699999989, -74.17678499999994 40.67218099999991, -74.17697399999997 40.6719379999999, -74.17709099999996 40.671784999999886, -74.17734299999995 40.67155999999988, -74.17754999999994 
40.67142499999989, -74.17778399999997 40.671316999999874, -74.17802699999999 40.671208999999884, -74.17862999999994 40.671037999999896, -74.17888199999999 40.671001999999895, -74.17912499999994 40.67099299999991, -74.17933199999999 40.67101099999992, -74.17979099999997 40.67115499999989, -74.17997999999994 40.671208999999884, -74.18010599999997 40.671262999999904, -74.18030399999998 40.67129899999986, -74.18133899999998 40.67170399999986, -74.18213999999996 40.67202799999989, -74.18384999999995 40.672648999999886, -74.18437199999994 40.67290999999989, -74.18458799999996 40.67302699999988, -74.18492099999997 40.673269999999896, -74.18503799999996 40.67335999999989, -74.18513699999994 40.673458999999866, -74.18547899999999 40.67390899999987, -74.18594699999994 40.674664999999905, -74.18670299999997 40.67578999999992, -74.18733299999997 40.67674399999987, -74.18767499999996 40.67729299999991, -74.18795399999995 40.67761699999989, -74.18819699999995 40.67792299999992, -74.18852099999998 40.67848099999987, -74.18877299999997 40.67885899999989, -74.18905199999995 40.67933599999985, -74.18935799999997 40.67975899999988, -74.18949299999997 40.680091999999895, -74.18969999999996 40.680793999999885, -74.18977199999995 40.68113599999987, -74.189781 40.681198999999886, -74.18983499999996 40.68131599999987, -74.18991599999998 40.68154099999988, -74.18996999999996 40.6818019999999, -74.18999699999995 40.6822519999999, -74.18999699999995 40.68262999999992, -74.18996999999996 40.68295399999989, -74.18998799999997 40.68317899999989, -74.18995199999995 40.683520999999885, -74.18993399999994 40.68370999999992, -74.189871 40.684078999999876, -74.189781 40.68481699999991, -74.18976299999997 40.68503299999986, -74.18962799999997 40.686103999999915, -74.18955599999998 40.68689599999987, -74.18951999999996 40.6872019999999, -74.18947499999996 40.68748999999985, -74.18939399999994 40.68773299999988, -74.18939399999994 40.68783199999991, -74.18941199999995 40.687939999999855, 
-74.18940299999997 40.68809299999987, -74.18934899999994 40.68826399999989, -74.18922299999997 40.68862399999989, -74.18898899999994 40.68904699999991, -74.18870099999998 40.689442999999876, -74.18779199999994 40.690189999999866, -74.18723399999999 40.69059499999986, -74.18636999999995 40.69118899999991, -74.18591099999998 40.69144999999988, -74.18563199999994 40.69164799999987, -74.18445299999996 40.694995999999904)))" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "type", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "properties", + "type": "{\"type\":\"struct\",\"fields\":[{\"name\":\"borough\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"location_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"objectid\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"shape_area\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"shape_leng\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"zone\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}" + }, + { + "metadata": "{}", + "name": "json_geometry", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geometry", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "display(neighbourhoods.limit(1)) // <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3738db02-9bc9-437b-a723-00c438a6fb87", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Compute some basic geometry 
attributes\n", + "\n", + "> Mosaic provides a number of functions for extracting the properties of geometries. Here are some that are relevant to Polygon geometries:" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0f8bb69f-8e1e-4626-b37e-fd885c773fe7", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
geometrycalculatedAreacalculatedLength
MULTIPOLYGON (((-74.18445299999996 40.694995999999904, -74.18448899999999 40.69509499999987, -74.18449799999996 40.69518499999987, -74.18438099999997 40.69587799999989, -74.18428199999994 40.6962109999999, -74.18402099999997 40.697074999999884, -74.18391299999996 40.69750699999986, -74.18375099999997 40.69779499999988, -74.18363399999998 40.6983259999999, -74.18356199999994 40.698451999999875, -74.18354399999998 40.69855999999988, -74.18350799999996 40.69870399999992, -74.18327399999998 40.70008999999988, -74.18315699999994 40.701214999999884, -74.18316599999997 40.702384999999886, -74.18313899999998 40.7026279999999, -74.18309399999998 40.7028529999999, -74.18299499999995 40.70315899999985, -74.18284199999994 40.70346499999989, -74.18264399999998 40.70373499999988, -74.18242799999996 40.70395099999992, -74.18220299999996 40.704139999999896, -74.18203199999994 40.70425699999987, -74.18180699999994 40.7043919999999, -74.18157299999996 40.70449999999988, -74.18132099999997 40.70460799999991, -74.18080799999996 40.7047879999999, -74.179467 40.70534599999992, -74.17887299999995 40.70554399999987, -74.17831499999994 40.70572399999987, -74.17776599999996 40.70589499999988, -74.17709099999996 40.706092999999896, -74.17699199999998 40.70613799999988, -74.17689299999995 40.70619199999988, -74.17664999999994 40.70641699999988, -74.17642499999994 40.706695999999916, -74.17628999999994 40.70689399999988, -74.17608299999995 40.70710999999989, -74.17599299999995 40.70719099999991, -74.17589399999997 40.707262999999905, -74.17565999999994 40.70737999999988, -74.17538099999996 40.707469999999915, -74.17515599999996 40.707514999999894, -74.17475999999994 40.707595999999924, -74.17417499999993 40.70766799999991, -74.17388699999998 40.70773099999992, -74.17347299999994 40.707748999999865, -74.17275299999994 40.707802999999906, -74.17188899999996 40.707910999999854, -74.17163699999998 40.70795599999986, -74.17133999999999 40.707964999999895, -74.17120499999999 40.70795599999986, 
-74.16994499999998 40.707973999999886, -74.16888299999994 40.7079379999999, -74.16681299999993 40.70785699999989, -74.16442799999999 40.70779399999987, -74.16401399999995 40.70777599999992, -74.16233999999997 40.707721999999876, -74.16081899999995 40.70764099999991, -74.16057599999993 40.70760499999988, -74.16033299999998 40.70756899999987, -74.160063 40.7074879999999, -74.15938799999998 40.707262999999905, -74.15904599999999 40.707145999999916, -74.15891999999997 40.70710999999989, -74.15827199999995 40.70687599999993, -74.15459099999998 40.705651999999894, -74.15409599999998 40.70544499999989, -74.15401499999997 40.70538199999988, -74.15387999999996 40.705327999999895, -74.15376299999997 40.705408999999875, -74.15323199999995 40.70524699999987, -74.15317799999997 40.70531899999989, -74.15306999999996 40.7052829999999, -74.15359199999995 40.70437399999987, -74.15386199999995 40.7038429999999, -74.15513999999996 40.70155699999987, -74.15544599999998 40.70108899999988, -74.15575199999995 40.7006659999999, -74.15600399999994 40.70026099999991, -74.15635499999996 40.69975699999986, -74.15745299999998 40.69809199999988, -74.15754299999998 40.6979389999999, -74.15758799999998 40.69781299999988, -74.15762399999994 40.69767799999991, -74.15829899999994 40.696705999999885, -74.15951399999994 40.69488799999988, -74.15958599999993 40.69476199999984, -74.16014399999995 40.69410499999988, -74.16057599999993 40.693222999999875, -74.16262799999998 40.69028899999989, -74.16279899999995 40.69002799999989, -74.16290699999996 40.68987499999987, -74.16292499999997 40.689874999999866, -74.16295199999996 40.689874999999866, -74.16306899999995 40.68989299999988, -74.16309599999994 40.689928999999886, -74.16322199999996 40.68998299999989, -74.16331199999996 40.68999199999993, -74.16341099999994 40.69000099999988, -74.16352799999999 40.69000999999986, -74.16380699999996 40.69004599999989, -74.16410399999995 40.690081999999904, -74.16417599999994 40.690081999999904, -74.16422999999998 
40.69005499999988, -74.16436499999998 40.69003699999991, -74.16450899999995 40.68998299999986, -74.16467099999994 40.68988399999989, -74.16479699999996 40.689757999999884, -74.16491399999995 40.689586999999904, -74.16499499999998 40.689388999999885, -74.16528299999999 40.68891199999991, -74.16542699999997 40.6887589999999, -74.16548099999994 40.68863299999987, -74.16560699999997 40.68842599999988, -74.16576899999995 40.68802999999986, -74.16587699999997 40.68787699999991, -74.16583199999997 40.68757999999987, -74.16582299999999 40.68748999999987, -74.16580499999998 40.687156999999914, -74.16582299999999 40.68703999999986, -74.16589499999998 40.6868419999999, -74.16604799999999 40.68655399999988, -74.16639899999996 40.686022999999864, -74.16650699999997 40.68588799999986, -74.16674099999994 40.685491999999925, -74.16695699999997 40.68523099999988, -74.16738899999996 40.684546999999924, -74.16781199999997 40.6839439999999, -74.16791099999995 40.68379099999988, -74.16804599999995 40.68360199999991, -74.16816299999994 40.683475999999885, -74.16822599999995 40.68334999999991, -74.16848699999997 40.68299899999991, -74.16886499999998 40.68239599999987, -74.16916199999997 40.68199999999991, -74.16929699999997 40.68178399999989, -74.16947699999997 40.68155899999991, -74.16981899999996 40.681018999999885, -74.16995399999996 40.680874999999915, -74.17005299999994 40.68066799999987, -74.17041299999994 40.6801549999999, -74.17051199999997 40.67999299999987, -74.17067399999996 40.679650999999886, -74.17093499999999 40.679290999999864, -74.17144799999994 40.67847199999989, -74.17151999999999 40.678381999999885, -74.17160999999999 40.678255999999884, -74.17193399999996 40.67782399999988, -74.17200599999995 40.67773399999988, -74.17283399999997 40.67656399999988, -74.17314899999997 40.67619499999991, -74.17322999999999 40.6760779999999, -74.17329299999994 40.67601499999989, -74.17358999999993 40.67571799999991, -74.17423799999995 40.67493499999991, -74.17437299999995 
40.674817999999895, -74.17484999999994 40.67432299999992, -74.17500299999995 40.6741699999999, -74.17538999999995 40.67375599999987, -74.17604699999998 40.673044999999895, -74.17630799999995 40.67276599999986, -74.17641599999996 40.672621999999876, -74.17663199999998 40.67239699999989, -74.17678499999994 40.67218099999991, -74.17697399999997 40.6719379999999, -74.17709099999996 40.671784999999886, -74.17734299999995 40.67155999999988, -74.17754999999994 40.67142499999989, -74.17778399999997 40.671316999999874, -74.17802699999999 40.671208999999884, -74.17862999999994 40.671037999999896, -74.17888199999999 40.671001999999895, -74.17912499999994 40.67099299999991, -74.17933199999999 40.67101099999992, -74.17979099999997 40.67115499999989, -74.17997999999994 40.671208999999884, -74.18010599999997 40.671262999999904, -74.18030399999998 40.67129899999986, -74.18133899999998 40.67170399999986, -74.18213999999996 40.67202799999989, -74.18384999999995 40.672648999999886, -74.18437199999994 40.67290999999989, -74.18458799999996 40.67302699999988, -74.18492099999997 40.673269999999896, -74.18503799999996 40.67335999999989, -74.18513699999994 40.673458999999866, -74.18547899999999 40.67390899999987, -74.18594699999994 40.674664999999905, -74.18670299999997 40.67578999999992, -74.18733299999997 40.67674399999987, -74.18767499999996 40.67729299999991, -74.18795399999995 40.67761699999989, -74.18819699999995 40.67792299999992, -74.18852099999998 40.67848099999987, -74.18877299999997 40.67885899999989, -74.18905199999995 40.67933599999985, -74.18935799999997 40.67975899999988, -74.18949299999997 40.680091999999895, -74.18969999999996 40.680793999999885, -74.18977199999995 40.68113599999987, -74.189781 40.681198999999886, -74.18983499999996 40.68131599999987, -74.18991599999998 40.68154099999988, -74.18996999999996 40.6818019999999, -74.18999699999995 40.6822519999999, -74.18999699999995 40.68262999999992, -74.18996999999996 40.68295399999989, -74.18998799999997 40.68317899999989, 
-74.18995199999995 40.683520999999885, -74.18993399999994 40.68370999999992, -74.189871 40.684078999999876, -74.189781 40.68481699999991, -74.18976299999997 40.68503299999986, -74.18962799999997 40.686103999999915, -74.18955599999998 40.68689599999987, -74.18951999999996 40.6872019999999, -74.18947499999996 40.68748999999985, -74.18939399999994 40.68773299999988, -74.18939399999994 40.68783199999991, -74.18941199999995 40.687939999999855, -74.18940299999997 40.68809299999987, -74.18934899999994 40.68826399999989, -74.18922299999997 40.68862399999989, -74.18898899999994 40.68904699999991, -74.18870099999998 40.689442999999876, -74.18779199999994 40.690189999999866, -74.18723399999999 40.69059499999986, -74.18636999999995 40.69118899999991, -74.18591099999998 40.69144999999988, -74.18563199999994 40.69164799999987, -74.18445299999996 40.694995999999904)))7.823067885002558E-40.11635745318867867
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "MULTIPOLYGON (((-74.18445299999996 40.694995999999904, -74.18448899999999 40.69509499999987, -74.18449799999996 40.69518499999987, -74.18438099999997 40.69587799999989, -74.18428199999994 40.6962109999999, -74.18402099999997 40.697074999999884, -74.18391299999996 40.69750699999986, -74.18375099999997 40.69779499999988, -74.18363399999998 40.6983259999999, -74.18356199999994 40.698451999999875, -74.18354399999998 40.69855999999988, -74.18350799999996 40.69870399999992, -74.18327399999998 40.70008999999988, -74.18315699999994 40.701214999999884, -74.18316599999997 40.702384999999886, -74.18313899999998 40.7026279999999, -74.18309399999998 40.7028529999999, -74.18299499999995 40.70315899999985, -74.18284199999994 40.70346499999989, -74.18264399999998 40.70373499999988, -74.18242799999996 40.70395099999992, -74.18220299999996 40.704139999999896, -74.18203199999994 40.70425699999987, -74.18180699999994 40.7043919999999, -74.18157299999996 40.70449999999988, -74.18132099999997 40.70460799999991, -74.18080799999996 40.7047879999999, -74.179467 40.70534599999992, -74.17887299999995 40.70554399999987, -74.17831499999994 40.70572399999987, -74.17776599999996 40.70589499999988, -74.17709099999996 40.706092999999896, -74.17699199999998 40.70613799999988, -74.17689299999995 40.70619199999988, -74.17664999999994 40.70641699999988, -74.17642499999994 40.706695999999916, -74.17628999999994 40.70689399999988, -74.17608299999995 40.70710999999989, -74.17599299999995 40.70719099999991, -74.17589399999997 40.707262999999905, -74.17565999999994 40.70737999999988, -74.17538099999996 40.707469999999915, -74.17515599999996 40.707514999999894, -74.17475999999994 40.707595999999924, -74.17417499999993 
40.70766799999991, -74.17388699999998 40.70773099999992, -74.17347299999994 40.707748999999865, -74.17275299999994 40.707802999999906, -74.17188899999996 40.707910999999854, -74.17163699999998 40.70795599999986, -74.17133999999999 40.707964999999895, -74.17120499999999 40.70795599999986, -74.16994499999998 40.707973999999886, -74.16888299999994 40.7079379999999, -74.16681299999993 40.70785699999989, -74.16442799999999 40.70779399999987, -74.16401399999995 40.70777599999992, -74.16233999999997 40.707721999999876, -74.16081899999995 40.70764099999991, -74.16057599999993 40.70760499999988, -74.16033299999998 40.70756899999987, -74.160063 40.7074879999999, -74.15938799999998 40.707262999999905, -74.15904599999999 40.707145999999916, -74.15891999999997 40.70710999999989, -74.15827199999995 40.70687599999993, -74.15459099999998 40.705651999999894, -74.15409599999998 40.70544499999989, -74.15401499999997 40.70538199999988, -74.15387999999996 40.705327999999895, -74.15376299999997 40.705408999999875, -74.15323199999995 40.70524699999987, -74.15317799999997 40.70531899999989, -74.15306999999996 40.7052829999999, -74.15359199999995 40.70437399999987, -74.15386199999995 40.7038429999999, -74.15513999999996 40.70155699999987, -74.15544599999998 40.70108899999988, -74.15575199999995 40.7006659999999, -74.15600399999994 40.70026099999991, -74.15635499999996 40.69975699999986, -74.15745299999998 40.69809199999988, -74.15754299999998 40.6979389999999, -74.15758799999998 40.69781299999988, -74.15762399999994 40.69767799999991, -74.15829899999994 40.696705999999885, -74.15951399999994 40.69488799999988, -74.15958599999993 40.69476199999984, -74.16014399999995 40.69410499999988, -74.16057599999993 40.693222999999875, -74.16262799999998 40.69028899999989, -74.16279899999995 40.69002799999989, -74.16290699999996 40.68987499999987, -74.16292499999997 40.689874999999866, -74.16295199999996 40.689874999999866, -74.16306899999995 40.68989299999988, -74.16309599999994 40.689928999999886, 
-74.16322199999996 40.68998299999989, -74.16331199999996 40.68999199999993, -74.16341099999994 40.69000099999988, -74.16352799999999 40.69000999999986, -74.16380699999996 40.69004599999989, -74.16410399999995 40.690081999999904, -74.16417599999994 40.690081999999904, -74.16422999999998 40.69005499999988, -74.16436499999998 40.69003699999991, -74.16450899999995 40.68998299999986, -74.16467099999994 40.68988399999989, -74.16479699999996 40.689757999999884, -74.16491399999995 40.689586999999904, -74.16499499999998 40.689388999999885, -74.16528299999999 40.68891199999991, -74.16542699999997 40.6887589999999, -74.16548099999994 40.68863299999987, -74.16560699999997 40.68842599999988, -74.16576899999995 40.68802999999986, -74.16587699999997 40.68787699999991, -74.16583199999997 40.68757999999987, -74.16582299999999 40.68748999999987, -74.16580499999998 40.687156999999914, -74.16582299999999 40.68703999999986, -74.16589499999998 40.6868419999999, -74.16604799999999 40.68655399999988, -74.16639899999996 40.686022999999864, -74.16650699999997 40.68588799999986, -74.16674099999994 40.685491999999925, -74.16695699999997 40.68523099999988, -74.16738899999996 40.684546999999924, -74.16781199999997 40.6839439999999, -74.16791099999995 40.68379099999988, -74.16804599999995 40.68360199999991, -74.16816299999994 40.683475999999885, -74.16822599999995 40.68334999999991, -74.16848699999997 40.68299899999991, -74.16886499999998 40.68239599999987, -74.16916199999997 40.68199999999991, -74.16929699999997 40.68178399999989, -74.16947699999997 40.68155899999991, -74.16981899999996 40.681018999999885, -74.16995399999996 40.680874999999915, -74.17005299999994 40.68066799999987, -74.17041299999994 40.6801549999999, -74.17051199999997 40.67999299999987, -74.17067399999996 40.679650999999886, -74.17093499999999 40.679290999999864, -74.17144799999994 40.67847199999989, -74.17151999999999 40.678381999999885, -74.17160999999999 40.678255999999884, -74.17193399999996 40.67782399999988, 
-74.17200599999995 40.67773399999988, -74.17283399999997 40.67656399999988, -74.17314899999997 40.67619499999991, -74.17322999999999 40.6760779999999, -74.17329299999994 40.67601499999989, -74.17358999999993 40.67571799999991, -74.17423799999995 40.67493499999991, -74.17437299999995 40.674817999999895, -74.17484999999994 40.67432299999992, -74.17500299999995 40.6741699999999, -74.17538999999995 40.67375599999987, -74.17604699999998 40.673044999999895, -74.17630799999995 40.67276599999986, -74.17641599999996 40.672621999999876, -74.17663199999998 40.67239699999989, -74.17678499999994 40.67218099999991, -74.17697399999997 40.6719379999999, -74.17709099999996 40.671784999999886, -74.17734299999995 40.67155999999988, -74.17754999999994 40.67142499999989, -74.17778399999997 40.671316999999874, -74.17802699999999 40.671208999999884, -74.17862999999994 40.671037999999896, -74.17888199999999 40.671001999999895, -74.17912499999994 40.67099299999991, -74.17933199999999 40.67101099999992, -74.17979099999997 40.67115499999989, -74.17997999999994 40.671208999999884, -74.18010599999997 40.671262999999904, -74.18030399999998 40.67129899999986, -74.18133899999998 40.67170399999986, -74.18213999999996 40.67202799999989, -74.18384999999995 40.672648999999886, -74.18437199999994 40.67290999999989, -74.18458799999996 40.67302699999988, -74.18492099999997 40.673269999999896, -74.18503799999996 40.67335999999989, -74.18513699999994 40.673458999999866, -74.18547899999999 40.67390899999987, -74.18594699999994 40.674664999999905, -74.18670299999997 40.67578999999992, -74.18733299999997 40.67674399999987, -74.18767499999996 40.67729299999991, -74.18795399999995 40.67761699999989, -74.18819699999995 40.67792299999992, -74.18852099999998 40.67848099999987, -74.18877299999997 40.67885899999989, -74.18905199999995 40.67933599999985, -74.18935799999997 40.67975899999988, -74.18949299999997 40.680091999999895, -74.18969999999996 40.680793999999885, -74.18977199999995 40.68113599999987, -74.189781 
40.681198999999886, -74.18983499999996 40.68131599999987, -74.18991599999998 40.68154099999988, -74.18996999999996 40.6818019999999, -74.18999699999995 40.6822519999999, -74.18999699999995 40.68262999999992, -74.18996999999996 40.68295399999989, -74.18998799999997 40.68317899999989, -74.18995199999995 40.683520999999885, -74.18993399999994 40.68370999999992, -74.189871 40.684078999999876, -74.189781 40.68481699999991, -74.18976299999997 40.68503299999986, -74.18962799999997 40.686103999999915, -74.18955599999998 40.68689599999987, -74.18951999999996 40.6872019999999, -74.18947499999996 40.68748999999985, -74.18939399999994 40.68773299999988, -74.18939399999994 40.68783199999991, -74.18941199999995 40.687939999999855, -74.18940299999997 40.68809299999987, -74.18934899999994 40.68826399999989, -74.18922299999997 40.68862399999989, -74.18898899999994 40.68904699999991, -74.18870099999998 40.689442999999876, -74.18779199999994 40.690189999999866, -74.18723399999999 40.69059499999986, -74.18636999999995 40.69118899999991, -74.18591099999998 40.69144999999988, -74.18563199999994 40.69164799999987, -74.18445299999996 40.694995999999904)))", + 7.823067885002558E-4, + 0.11635745318867867 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "geometry", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "calculatedArea", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "calculatedLength", + "type": "\"double\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "display(\n", + " neighbourhoods\n", + " .withColumn(\"calculatedArea\", st_area(col(\"geometry\")))\n", + " 
.withColumn(\"calculatedLength\", st_length(col(\"geometry\")))\n", + " // Note: The unit of measure of the area and length depends on the CRS used.\n", + " // For GPS locations (longitude/latitude) it will be square degrees and degrees\n", + " .select(\"geometry\", \"calculatedArea\", \"calculatedLength\")\n", + " .limit(1) // <- limiting for ipynb only\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "704024a0-8171-4218-a95e-ff8ef6ace37c", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Initial Trips Data [Points]\n", + "\n", + "> We will load some Taxi trips data to represent point data; this data is coming from Databricks public datasets available in your environment. __Note: this is 1.6 billion trips as-is; while it is no problem to process this, to keep this to a quickstart level, we are going to use just 1/10th of 1% or ~1.6 million.__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7ebf19ba-2e86-4a17-8e84-7e3c521251f3", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
count? 1,608,670\n", + "trips: org.apache.spark.sql.DataFrame = [row_id: bigint, vendor_id: string ... 17 more fields]\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
count? 1,608,670\ntrips: org.apache.spark.sql.DataFrame = [row_id: bigint, vendor_id: string ... 17 more fields]\n
", + "datasetInfos": [ + { + "name": "trips", + "schema": { + "fields": [ + { + "metadata": {}, + "name": "row_id", + "nullable": false, + "type": "long" + }, + { + "metadata": {}, + "name": "vendor_id", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "pickup_datetime", + "nullable": true, + "type": "timestamp" + }, + { + "metadata": {}, + "name": "dropoff_datetime", + "nullable": true, + "type": "timestamp" + }, + { + "metadata": {}, + "name": "passenger_count", + "nullable": true, + "type": "integer" + }, + { + "metadata": {}, + "name": "trip_distance", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "pickup_longitude", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "pickup_latitude", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "rate_code_id", + "nullable": true, + "type": "integer" + }, + { + "metadata": {}, + "name": "dropoff_longitude", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "dropoff_latitude", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "fare_amount", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "extra", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "mta_tax", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "tip_amount", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "tolls_amount", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "total_amount", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "pickup_geom", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "dropoff_geom", + "nullable": true, + "type": "string" + } + ], + "type": "struct" + }, + "tableIdentifier": null, + "typeStr": "org.apache.spark.sql.DataFrame" + } + ], + "metadata": {}, + "removedWidgets": [], + "type": "html" 
+ } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "\n", + "val trips = spark.table(\"delta.`/databricks-datasets/nyctaxi/tables/nyctaxi_yellow`\")\n", + " .sample(0.001)\n", + " .drop(\"vendorId\", \"rateCodeId\", \"store_and_fwd_flag\", \"payment_type\")\n", + " .withColumn(\"pickup_geom\", st_astext(st_point($\"pickup_longitude\", $\"pickup_latitude\")))\n", + " .withColumn(\"dropoff_geom\", st_astext(st_point($\"dropoff_longitude\", $\"dropoff_latitude\")))\n", + " .selectExpr(\"xxhash64(pickup_datetime, dropoff_datetime, pickup_geom, dropoff_geom) as row_id\",\"*\")\n", + "\n", + "println(s\"count? ${formatter.format(trips.count)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d905fbe9-5104-4a09-bdee-4b5adba23bcf", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Spatial Joins\n", + "\n", + "> We can use Mosaic to perform spatial joins both with and without Mosaic indexing strategies. Indexing is very important when handling geometries that differ greatly in both size and shape (i.e., number of vertices)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e9ff1fa2-ca0b-4472-8c8a-1b317da11e76", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Getting the optimal resolution\n", + "\n", + "> We can use Mosaic functionality to identify how to best index our data based on the data inside the specific dataframe. Selecting an appropriate indexing resolution can have a considerable impact on the performance."
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2bff2755-9a4f-481b-a00f-93e0fd2ebd8a", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
Optimal resolution is 9\n", + "import com.databricks.labs.mosaic.sql.MosaicFrame\n", + "mosaicFrame: com.databricks.labs.mosaic.sql.MosaicFrame = [type: string, properties: struct<borough: string, location_id: string ... 4 more fields> ... 2 more fields]\n", + "optimalResolution: Int = 9\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
Optimal resolution is 9\nimport com.databricks.labs.mosaic.sql.MosaicFrame\nmosaicFrame: com.databricks.labs.mosaic.sql.MosaicFrame = [type: string, properties: struct<borough: string, location_id: string ... 4 more fields> ... 2 more fields]\noptimalResolution: Int = 9\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "\n", + "import com.databricks.labs.mosaic.sql.MosaicFrame\n", + "\n", + "val mosaicFrame = MosaicFrame(neighbourhoods)\n", + " .setGeometryColumn(\"geometry\")\n", + "\n", + "val optimalResolution = mosaicFrame.getOptimalResolution(0.75)\n", + "\n", + "println(s\"Optimal resolution is $optimalResolution\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "b28464a3-f420-4264-b58b-a7e7d79329ad", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> Not every resolution will yield performance improvements. As a rule of thumb, it is always better to under-index than over-index; if unsure, select a lower resolution. Higher resolutions are needed when we have very imbalanced geometries with respect to their size or with respect to the number of vertices. In such cases, indexing with more indices will considerably increase the parallel nature of the operations. You can think of Mosaic as a way to partition an overly complex row into multiple rows that have a balanced amount of computation each." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b733893f-7f52-4021-9dd6-23932494ec53", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
resolutionmean_index_areamean_geometry_areapercentile_25_geometry_areapercentile_50_geometry_areapercentile_75_geometry_area
112.3050887040151685E-71393.4792000326781461.6662802245658991.44601168828591961.150753088525
101.6135627796508305E-6199.0683724106790565.9522976726467141.6350842375098280.16427404166126
91.1294934384716961E-528.4383516881839049.42176104192583420.23359254991216740.02348569578239
87.906505133983356E-54.0625954310322991.34595717012354022.8904945528469825.717603885925354
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 11, + 2.3050887040151685E-7, + 1393.4792000326781, + 461.6662802245658, + 991.4460116882859, + 1961.150753088525 + ], + [ + 10, + 1.6135627796508305E-6, + 199.06837241067905, + 65.9522976726467, + 141.6350842375098, + 280.16427404166126 + ], + [ + 9, + 1.1294934384716961E-5, + 28.438351688183904, + 9.421761041925834, + 20.233592549912167, + 40.02348569578239 + ], + [ + 8, + 7.906505133983356E-5, + 4.062595431032299, + 1.3459571701235402, + 2.890494552846982, + 5.717603885925354 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "resolution", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "mean_index_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "mean_geometry_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "percentile_25_geometry_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "percentile_50_geometry_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "percentile_75_geometry_area", + "type": "\"double\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "display(\n", + " mosaicFrame.analyzer.getResolutionMetrics()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "eb868752-f425-4c70-aab9-9a7f0d45f049", + 
"showTitle": false, + "title": "" + } + }, + "source": [ + "### Indexing using the optimal resolution\n", + "\n", + "> We will use Mosaic SQL functions to index our points data. Here we will use resolution 9; the appropriate index resolution depends on the dataset in use." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4c31c1a3-c547-4111-a162-07730752a7aa", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
row_idpickup_h3dropoff_h3vendor_idpickup_datetimedropoff_datetimepassenger_counttrip_distancepickup_longitudepickup_latituderate_code_iddropoff_longitudedropoff_latitudefare_amountextramta_taxtip_amounttolls_amounttotal_amountpickup_geomdropoff_geomtrip_line
4790840662229985948617733123877109759617733122619080703CMT2012-09-22T13:02:28.000+00002012-09-22T13:16:43.000+000012.9-73.96409540.7566721-73.97676640.78042512.50.00.50.00.013.0POINT (-73.964095 40.756672)POINT (-73.976766 40.780425)LINESTRING (-73.964095 40.756672, -73.976766 40.780425)
4640209734139842635617733123876847615617733122610954239CMT2012-09-22T13:17:08.000+00002012-09-22T13:32:01.000+000021.7-73.96863240.7591831-73.98261240.77173811.00.00.50.00.011.5POINT (-73.968632 40.759183)POINT (-73.982612 40.771738)LINESTRING (-73.968632 40.759183, -73.982612 40.771738)
-7307102645695879559617733123877109759617733123838050303CMT2012-09-22T13:32:33.000+00002012-09-22T13:36:22.000+000010.8-73.96470840.7555541-73.9572240.7663425.00.00.50.00.05.5POINT (-73.964708 40.755554)POINT (-73.95722 40.766342)LINESTRING (-73.964708 40.755554, -73.95722 40.766342)
-4628045955618738638617733123877371903617733123836477439CMT2012-09-22T14:38:57.000+00002012-09-22T14:42:54.000+000010.6-73.96238540.7605221-73.95416340.7638624.50.00.50.00.05.0POINT (-73.962385 40.760522)POINT (-73.954163 40.763862)LINESTRING (-73.962385 40.760522, -73.954163 40.763862)
-3903290589377512186617733123877371903617733123866361855CMT2012-09-22T14:30:56.000+00002012-09-22T14:40:47.000+000011.0-73.96130840.760421-73.97554640.7609978.00.00.50.00.08.5POINT (-73.961308 40.76042)POINT (-73.975546 40.760997)LINESTRING (-73.961308 40.76042, -73.975546 40.760997)
6507070034880946577617733123866099711617733123804495871CMT2012-09-22T15:04:36.000+00002012-09-22T15:23:52.000+000012.4-73.96950140.7573321-74.00021440.74790514.00.00.50.00.014.5POINT (-73.969501 40.757332)POINT (-74.000214 40.747905)LINESTRING (-73.969501 40.757332, -74.000214 40.747905)
9035681533352024775617733123878682623617733151079792639CMT2012-09-22T16:52:27.000+00002012-09-22T17:10:59.000+000013.2-73.97224240.7652421-73.99759540.72420214.50.00.50.00.015.0POINT (-73.972242 40.765242)POINT (-73.997595 40.724202)LINESTRING (-73.972242 40.765242, -73.997595 40.724202)
2778164646035401016617733123869507583617733136115826687CMT2012-09-22T18:02:32.000+00002012-09-22T18:33:28.000+0000416.6-73.97173740.7562952-73.79028440.64686352.00.00.50.04.857.3POINT (-73.971737 40.756295)POINT (-73.790284 40.646863)LINESTRING (-73.971737 40.756295, -73.790284 40.646863)
2017582055297242140617733123865837567617733151078481919CMT2012-09-22T17:34:00.000+00002012-09-22T18:17:31.000+000013.4-73.97130640.7601271-73.9998640.72371227.00.00.50.00.027.5POINT (-73.971306 40.760127)POINT (-73.99986 40.723712)LINESTRING (-73.971306 40.760127, -73.99986 40.723712)
7268476285856530227617733123869507583617733136116088831CMT2012-09-22T18:37:37.000+00002012-09-22T19:20:11.000+0000114.6-73.9733340.7546972-73.79068640.64426352.00.00.50.00.052.5POINT (-73.97333 40.754697)POINT (-73.790686 40.644263)LINESTRING (-73.97333 40.754697, -73.790686 40.644263)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 4790840662229985948, + 617733123877109759, + 617733122619080703, + "CMT", + "2012-09-22T13:02:28.000+0000", + "2012-09-22T13:16:43.000+0000", + 1, + 2.9, + -73.964095, + 40.756672, + 1, + -73.976766, + 40.780425, + 12.5, + 0.0, + 0.5, + 0.0, + 0.0, + 13.0, + "POINT (-73.964095 40.756672)", + "POINT (-73.976766 40.780425)", + "LINESTRING (-73.964095 40.756672, -73.976766 40.780425)" + ], + [ + 4640209734139842635, + 617733123876847615, + 617733122610954239, + "CMT", + "2012-09-22T13:17:08.000+0000", + "2012-09-22T13:32:01.000+0000", + 2, + 1.7, + -73.968632, + 40.759183, + 1, + -73.982612, + 40.771738, + 11.0, + 0.0, + 0.5, + 0.0, + 0.0, + 11.5, + "POINT (-73.968632 40.759183)", + "POINT (-73.982612 40.771738)", + "LINESTRING (-73.968632 40.759183, -73.982612 40.771738)" + ], + [ + -7307102645695879559, + 617733123877109759, + 617733123838050303, + "CMT", + "2012-09-22T13:32:33.000+0000", + "2012-09-22T13:36:22.000+0000", + 1, + 0.8, + -73.964708, + 40.755554, + 1, + -73.95722, + 40.766342, + 5.0, + 0.0, + 0.5, + 0.0, + 0.0, + 5.5, + "POINT (-73.964708 40.755554)", + "POINT (-73.95722 40.766342)", + "LINESTRING (-73.964708 40.755554, -73.95722 40.766342)" + ], + [ + -4628045955618738638, + 617733123877371903, + 617733123836477439, + "CMT", + "2012-09-22T14:38:57.000+0000", + "2012-09-22T14:42:54.000+0000", + 1, + 0.6, + -73.962385, + 40.760522, + 1, + -73.954163, + 40.763862, + 4.5, + 0.0, + 0.5, + 0.0, + 0.0, + 5.0, + "POINT (-73.962385 40.760522)", + "POINT (-73.954163 40.763862)", + "LINESTRING (-73.962385 40.760522, -73.954163 40.763862)" + ], + [ + -3903290589377512186, + 617733123877371903, + 617733123866361855, + "CMT", + "2012-09-22T14:30:56.000+0000", + 
"2012-09-22T14:40:47.000+0000", + 1, + 1.0, + -73.961308, + 40.76042, + 1, + -73.975546, + 40.760997, + 8.0, + 0.0, + 0.5, + 0.0, + 0.0, + 8.5, + "POINT (-73.961308 40.76042)", + "POINT (-73.975546 40.760997)", + "LINESTRING (-73.961308 40.76042, -73.975546 40.760997)" + ], + [ + 6507070034880946577, + 617733123866099711, + 617733123804495871, + "CMT", + "2012-09-22T15:04:36.000+0000", + "2012-09-22T15:23:52.000+0000", + 1, + 2.4, + -73.969501, + 40.757332, + 1, + -74.000214, + 40.747905, + 14.0, + 0.0, + 0.5, + 0.0, + 0.0, + 14.5, + "POINT (-73.969501 40.757332)", + "POINT (-74.000214 40.747905)", + "LINESTRING (-73.969501 40.757332, -74.000214 40.747905)" + ], + [ + 9035681533352024775, + 617733123878682623, + 617733151079792639, + "CMT", + "2012-09-22T16:52:27.000+0000", + "2012-09-22T17:10:59.000+0000", + 1, + 3.2, + -73.972242, + 40.765242, + 1, + -73.997595, + 40.724202, + 14.5, + 0.0, + 0.5, + 0.0, + 0.0, + 15.0, + "POINT (-73.972242 40.765242)", + "POINT (-73.997595 40.724202)", + "LINESTRING (-73.972242 40.765242, -73.997595 40.724202)" + ], + [ + 2778164646035401016, + 617733123869507583, + 617733136115826687, + "CMT", + "2012-09-22T18:02:32.000+0000", + "2012-09-22T18:33:28.000+0000", + 4, + 16.6, + -73.971737, + 40.756295, + 2, + -73.790284, + 40.646863, + 52.0, + 0.0, + 0.5, + 0.0, + 4.8, + 57.3, + "POINT (-73.971737 40.756295)", + "POINT (-73.790284 40.646863)", + "LINESTRING (-73.971737 40.756295, -73.790284 40.646863)" + ], + [ + 2017582055297242140, + 617733123865837567, + 617733151078481919, + "CMT", + "2012-09-22T17:34:00.000+0000", + "2012-09-22T18:17:31.000+0000", + 1, + 3.4, + -73.971306, + 40.760127, + 1, + -73.99986, + 40.723712, + 27.0, + 0.0, + 0.5, + 0.0, + 0.0, + 27.5, + "POINT (-73.971306 40.760127)", + "POINT (-73.99986 40.723712)", + "LINESTRING (-73.971306 40.760127, -73.99986 40.723712)" + ], + [ + 7268476285856530227, + 617733123869507583, + 617733136116088831, + "CMT", + "2012-09-22T18:37:37.000+0000", + 
"2012-09-22T19:20:11.000+0000", + 1, + 14.6, + -73.97333, + 40.754697, + 2, + -73.790686, + 40.644263, + 52.0, + 0.0, + 0.5, + 0.0, + 0.0, + 52.5, + "POINT (-73.97333 40.754697)", + "POINT (-73.790686 40.644263)", + "LINESTRING (-73.97333 40.754697, -73.790686 40.644263)" + ] + ], + "datasetInfos": [ + { + "name": "tripsWithIndex", + "schema": { + "fields": [ + { + "metadata": {}, + "name": "row_id", + "nullable": false, + "type": "long" + }, + { + "metadata": {}, + "name": "pickup_h3", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "dropoff_h3", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "vendor_id", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "pickup_datetime", + "nullable": true, + "type": "timestamp" + }, + { + "metadata": {}, + "name": "dropoff_datetime", + "nullable": true, + "type": "timestamp" + }, + { + "metadata": {}, + "name": "passenger_count", + "nullable": true, + "type": "integer" + }, + { + "metadata": {}, + "name": "trip_distance", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "pickup_longitude", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "pickup_latitude", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "rate_code_id", + "nullable": true, + "type": "integer" + }, + { + "metadata": {}, + "name": "dropoff_longitude", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "dropoff_latitude", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "fare_amount", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "extra", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "mta_tax", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "tip_amount", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "tolls_amount", + "nullable": true, + 
"type": "double" + }, + { + "metadata": {}, + "name": "total_amount", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "pickup_geom", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "dropoff_geom", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "trip_line", + "nullable": true, + "type": "string" + } + ], + "type": "struct" + }, + "tableIdentifier": null, + "typeStr": "org.apache.spark.sql.DataFrame" + } + ], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "row_id", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "pickup_h3", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "dropoff_h3", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "vendor_id", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "pickup_datetime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "dropoff_datetime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "passenger_count", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "trip_distance", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_longitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_latitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "rate_code_id", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "dropoff_longitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "dropoff_latitude", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "fare_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "extra", + "type": "\"double\"" + }, + { + "metadata": "{}", + 
"name": "mta_tax", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "tip_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "tolls_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "total_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_geom", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "dropoff_geom", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "trip_line", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "val tripsWithIndex = trips\n", + " .withColumn(\"pickup_h3\", grid_pointascellid(col(\"pickup_geom\"), lit(optimalResolution)))\n", + " .withColumn(\"dropoff_h3\", grid_pointascellid(col(\"dropoff_geom\"), lit(optimalResolution)))\n", + " .withColumn(\"trip_line\", st_makeline(array(\"pickup_geom\", \"dropoff_geom\")))\n", + ".selectExpr(\n", + " \"row_id\", \"pickup_h3\", \"dropoff_h3\",\n", + " \"* except(row_id, pickup_h3, dropoff_h3)\"\n", + ")\n", + "display(tripsWithIndex.limit(10)) // <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e519823d-b03b-4984-9a6a-4988aff54648", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> We will also index our neighbourhoods using a built in generator function." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ac398701-f9f4-4ecf-a1c6-82a7c7535dc5", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
count? 11,885\n", + "neighbourhoodsWithIndex: org.apache.spark.sql.DataFrame = [type: string, properties: struct<borough: string, location_id: string ... 4 more fields> ... 1 more field]\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
count? 11,885\nneighbourhoodsWithIndex: org.apache.spark.sql.DataFrame = [type: string, properties: struct<borough: string, location_id: string ... 4 more fields> ... 1 more field]\n
", + "datasetInfos": [ + { + "name": "neighbourhoodsWithIndex", + "schema": { + "fields": [ + { + "metadata": {}, + "name": "type", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "properties", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "borough", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "location_id", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "objectid", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "shape_area", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "shape_leng", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "zone", + "nullable": true, + "type": "string" + } + ], + "type": "struct" + } + }, + { + "metadata": {}, + "name": "mosaic_index", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "is_core", + "nullable": true, + "type": "boolean" + }, + { + "metadata": {}, + "name": "index_id", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "wkb", + "nullable": true, + "type": "binary" + } + ], + "type": "struct" + } + } + ], + "type": "struct" + }, + "tableIdentifier": null, + "typeStr": "org.apache.spark.sql.DataFrame" + } + ], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "val neighbourhoodsWithIndex = neighbourhoods\n", + " // We break down the original geometry in multiple smaller mosaic chips, each with its\n", + " // own index\n", + " .withColumn(\"mosaic_index\", grid_tessellateexplode(col(\"geometry\"), lit(optimalResolution)))\n", + " // We don't need the original geometry any more, since we have broken it down into\n", + " // Smaller mosaic chips.\n", + " .drop(\"json_geometry\", \"geometry\")\n", + "\n", + "println(s\"count? 
${formatter.format(neighbourhoodsWithIndex.count)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "67f7a24b-8a6e-493b-9551-0d08ff757434", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
typepropertiesmosaic_index
FeatureCollectionList(EWR, 1, 1, 0.0007823067885, 0.116357453189, Newark Airport)List(true, 617733150781997055, AAAAAAMAAAABAAAACMBSi+u3pmJPQERXt9Zja+DAUowOEmsLgUBEV5j/c7NdwFKMDNGKYo5ARFdfWe/7QsBSi+k2kMI9QERXRItV/dvAUovG3BrqLkBEV2Nhr37nwFKLyBxP4SdARFedBzkzSsBSi+u3pmJPQERXt9Zja+DAUos= (truncated))
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "FeatureCollection", + [ + "EWR", + "1", + "1", + "0.0007823067885", + "0.116357453189", + "Newark Airport" + ], + [ + true, + 617733150781997055, + "AAAAAAMAAAABAAAACMBSi+u3pmJPQERXt9Zja+DAUowOEmsLgUBEV5j/c7NdwFKMDNGKYo5ARFdfWe/7QsBSi+k2kMI9QERXRItV/dvAUovG3BrqLkBEV2Nhr37nwFKLyBxP4SdARFedBzkzSsBSi+u3pmJPQERXt9Zja+DAUos= (truncated)" + ] + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "type", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "properties", + "type": "{\"type\":\"struct\",\"fields\":[{\"name\":\"borough\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"location_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"objectid\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"shape_area\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"shape_leng\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"zone\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}" + }, + { + "metadata": "{}", + "name": "mosaic_index", + "type": "{\"type\":\"struct\",\"fields\":[{\"name\":\"is_core\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"index_id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"wkb\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}}]}" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], 
+ "source": [ + "%scala\n", + "display(neighbourhoodsWithIndex.limit(1)) // <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a8028556-2f1e-4f84-9ef0-e400815908d1", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Performing the spatial join\n", + "\n", + "> We can now do spatial joins to both pickup and drop off zones based on geolocations in our datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "76b26cfb-6ebd-4b25-8bef-c718d88448a2", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
trip_distancepickup_geomdropoff_geompickup_h3dropoff_h3pickup_zonetrip_line
0.6POINT (-73.964261 40.792384)POINT (-73.955786 40.787794)617733122568486911617733122587885567Upper West Side NorthLINESTRING (-73.964261 40.792384, -73.955786 40.787794)
0.4POINT (-73.950633 40.785838)POINT (-73.948693 40.781558)617733122582380543617733122582642687East Harlem SouthLINESTRING (-73.950633 40.785838, -73.948693 40.781558)
1.0POINT (-73.953601 40.790798)POINT (-73.966704 40.788905)617733122648178687617733122560622591East Harlem SouthLINESTRING (-73.953601 40.790798, -73.966704 40.788905)
1.3POINT (-73.862603 40.769665)POINT (-73.862603 40.769665)617733124072407039617733124072407039LaGuardia AirportLINESTRING (-73.862603 40.769665, -73.862603 40.769665)
1.15POINT (-73.949988 40.780458)POINT (-73.957972 40.766948)617733122586050559617733123838050303Yorkville WestLINESTRING (-73.949988 40.780458, -73.957972 40.766948)
2.01POINT (-73.959122 40.77728)POINT (-73.969067 40.753577)617733122574778367617733123867934719Upper East Side NorthLINESTRING (-73.959122 40.77728, -73.969067 40.753577)
1.56POINT (-73.945795 40.77369)POINT (-73.96541 40.765965)617733122584739839617733123874226175Yorkville EastLINESTRING (-73.945795 40.77369, -73.96541 40.765965)
8.2POINT (-73.870503 40.773508)POINT (-73.974395 40.749617)617733124388552703617733123868721151LaGuardia AirportLINESTRING (-73.870503 40.773508, -73.974395 40.749617)
2.94POINT (-73.955023 40.769212)POINT (-73.982877 40.750892)617733122576351231617733123808690175Lenox Hill WestLINESTRING (-73.955023 40.769212, -73.982877 40.750892)
0.8POINT (-73.958105 40.77897)POINT (-73.950742 40.775888)617733122573991935617733122585264127Upper East Side NorthLINESTRING (-73.958105 40.77897, -73.950742 40.775888)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 0.6, + "POINT (-73.964261 40.792384)", + "POINT (-73.955786 40.787794)", + 617733122568486911, + 617733122587885567, + "Upper West Side North", + "LINESTRING (-73.964261 40.792384, -73.955786 40.787794)" + ], + [ + 0.4, + "POINT (-73.950633 40.785838)", + "POINT (-73.948693 40.781558)", + 617733122582380543, + 617733122582642687, + "East Harlem South", + "LINESTRING (-73.950633 40.785838, -73.948693 40.781558)" + ], + [ + 1.0, + "POINT (-73.953601 40.790798)", + "POINT (-73.966704 40.788905)", + 617733122648178687, + 617733122560622591, + "East Harlem South", + "LINESTRING (-73.953601 40.790798, -73.966704 40.788905)" + ], + [ + 1.3, + "POINT (-73.862603 40.769665)", + "POINT (-73.862603 40.769665)", + 617733124072407039, + 617733124072407039, + "LaGuardia Airport", + "LINESTRING (-73.862603 40.769665, -73.862603 40.769665)" + ], + [ + 1.15, + "POINT (-73.949988 40.780458)", + "POINT (-73.957972 40.766948)", + 617733122586050559, + 617733123838050303, + "Yorkville West", + "LINESTRING (-73.949988 40.780458, -73.957972 40.766948)" + ], + [ + 2.01, + "POINT (-73.959122 40.77728)", + "POINT (-73.969067 40.753577)", + 617733122574778367, + 617733123867934719, + "Upper East Side North", + "LINESTRING (-73.959122 40.77728, -73.969067 40.753577)" + ], + [ + 1.56, + "POINT (-73.945795 40.77369)", + "POINT (-73.96541 40.765965)", + 617733122584739839, + 617733123874226175, + "Yorkville East", + "LINESTRING (-73.945795 40.77369, -73.96541 40.765965)" + ], + [ + 8.2, + "POINT (-73.870503 40.773508)", + "POINT (-73.974395 40.749617)", + 617733124388552703, + 617733123868721151, + "LaGuardia Airport", + "LINESTRING (-73.870503 40.773508, -73.974395 40.749617)" + ], + [ + 2.94, + "POINT 
(-73.955023 40.769212)", + "POINT (-73.982877 40.750892)", + 617733122576351231, + 617733123808690175, + "Lenox Hill West", + "LINESTRING (-73.955023 40.769212, -73.982877 40.750892)" + ], + [ + 0.8, + "POINT (-73.958105 40.77897)", + "POINT (-73.950742 40.775888)", + 617733122573991935, + 617733122585264127, + "Upper East Side North", + "LINESTRING (-73.958105 40.77897, -73.950742 40.775888)" + ] + ], + "datasetInfos": [ + { + "name": "pickupNeighbourhoods", + "schema": { + "fields": [ + { + "metadata": {}, + "name": "pickup_zone", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "mosaic_index", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "is_core", + "nullable": true, + "type": "boolean" + }, + { + "metadata": {}, + "name": "index_id", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "wkb", + "nullable": true, + "type": "binary" + } + ], + "type": "struct" + } + } + ], + "type": "struct" + }, + "tableIdentifier": null, + "typeStr": "org.apache.spark.sql.DataFrame" + }, + { + "name": "withPickupZone", + "schema": { + "fields": [ + { + "metadata": {}, + "name": "trip_distance", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "pickup_geom", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "dropoff_geom", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "pickup_h3", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "dropoff_h3", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "pickup_zone", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "trip_line", + "nullable": true, + "type": "string" + } + ], + "type": "struct" + }, + "tableIdentifier": null, + "typeStr": "org.apache.spark.sql.DataFrame" + } + ], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + 
"customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "trip_distance", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_geom", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "dropoff_geom", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "pickup_h3", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "dropoff_h3", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "pickup_zone", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "trip_line", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "val pickupNeighbourhoods = neighbourhoodsWithIndex.select(col(\"properties.zone\").alias(\"pickup_zone\"), col(\"mosaic_index\"))\n", + "\n", + "val withPickupZone = \n", + " tripsWithIndex.join(\n", + " pickupNeighbourhoods,\n", + " tripsWithIndex.col(\"pickup_h3\") === pickupNeighbourhoods.col(\"mosaic_index.index_id\")\n", + " ).where(\n", + " // If the borough is a core chip (the chip is fully contained within the geometry), then we do not need\n", + " // to perform any intersection, because any point matching the same index will certainly be contained in\n", + " // the borough. 
Otherwise we need to perform an st_contains operation on the chip geometry.\n", + " col(\"mosaic_index.is_core\") || st_contains(col(\"mosaic_index.wkb\"), col(\"pickup_geom\"))\n", + " ).select(\n", + " \"trip_distance\", \"pickup_geom\", \"dropoff_geom\", \"pickup_h3\", \"dropoff_h3\", \"pickup_zone\", \"trip_line\"\n", + " )\n", + "\n", + "display(withPickupZone.limit(10)) // <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "db947fdf-b039-4675-838f-0d16fdd4516f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> We can easily perform a similar join for the drop off location. __Note: in this case using `withPickupZone` from above as the left side of the join.__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8edd850b-84f1-4c37-bef8-aac1836dd779", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
trip_distancepickup_geompickup_zonedropoff_geomdropoff_zonepickup_h3dropoff_h3trip_line
0.6POINT (-73.964261 40.792384)Upper West Side NorthPOINT (-73.955786 40.787794)Upper East Side North617733122568486911617733122587885567LINESTRING (-73.964261 40.792384, -73.955786 40.787794)
0.4POINT (-73.950633 40.785838)East Harlem SouthPOINT (-73.948693 40.781558)Yorkville West617733122582380543617733122582642687LINESTRING (-73.950633 40.785838, -73.948693 40.781558)
1.0POINT (-73.953601 40.790798)East Harlem SouthPOINT (-73.966704 40.788905)Upper West Side North617733122648178687617733122560622591LINESTRING (-73.953601 40.790798, -73.966704 40.788905)
1.3POINT (-73.862603 40.769665)LaGuardia AirportPOINT (-73.862603 40.769665)LaGuardia Airport617733124072407039617733124072407039LINESTRING (-73.862603 40.769665, -73.862603 40.769665)
1.15POINT (-73.949988 40.780458)Yorkville WestPOINT (-73.957972 40.766948)Lenox Hill West617733122586050559617733123838050303LINESTRING (-73.949988 40.780458, -73.957972 40.766948)
2.01POINT (-73.959122 40.77728)Upper East Side NorthPOINT (-73.969067 40.753577)UN/Turtle Bay South617733122574778367617733123867934719LINESTRING (-73.959122 40.77728, -73.969067 40.753577)
1.56POINT (-73.945795 40.77369)Yorkville EastPOINT (-73.96541 40.765965)Upper East Side South617733122584739839617733123874226175LINESTRING (-73.945795 40.77369, -73.96541 40.765965)
8.2POINT (-73.870503 40.773508)LaGuardia AirportPOINT (-73.974395 40.749617)UN/Turtle Bay South617733124388552703617733123868721151LINESTRING (-73.870503 40.773508, -73.974395 40.749617)
2.94POINT (-73.955023 40.769212)Lenox Hill WestPOINT (-73.982877 40.750892)Midtown South617733122576351231617733123808690175LINESTRING (-73.955023 40.769212, -73.982877 40.750892)
0.8POINT (-73.958105 40.77897)Upper East Side NorthPOINT (-73.950742 40.775888)Yorkville West617733122573991935617733122585264127LINESTRING (-73.958105 40.77897, -73.950742 40.775888)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 0.6, + "POINT (-73.964261 40.792384)", + "Upper West Side North", + "POINT (-73.955786 40.787794)", + "Upper East Side North", + 617733122568486911, + 617733122587885567, + "LINESTRING (-73.964261 40.792384, -73.955786 40.787794)" + ], + [ + 0.4, + "POINT (-73.950633 40.785838)", + "East Harlem South", + "POINT (-73.948693 40.781558)", + "Yorkville West", + 617733122582380543, + 617733122582642687, + "LINESTRING (-73.950633 40.785838, -73.948693 40.781558)" + ], + [ + 1.0, + "POINT (-73.953601 40.790798)", + "East Harlem South", + "POINT (-73.966704 40.788905)", + "Upper West Side North", + 617733122648178687, + 617733122560622591, + "LINESTRING (-73.953601 40.790798, -73.966704 40.788905)" + ], + [ + 1.3, + "POINT (-73.862603 40.769665)", + "LaGuardia Airport", + "POINT (-73.862603 40.769665)", + "LaGuardia Airport", + 617733124072407039, + 617733124072407039, + "LINESTRING (-73.862603 40.769665, -73.862603 40.769665)" + ], + [ + 1.15, + "POINT (-73.949988 40.780458)", + "Yorkville West", + "POINT (-73.957972 40.766948)", + "Lenox Hill West", + 617733122586050559, + 617733123838050303, + "LINESTRING (-73.949988 40.780458, -73.957972 40.766948)" + ], + [ + 2.01, + "POINT (-73.959122 40.77728)", + "Upper East Side North", + "POINT (-73.969067 40.753577)", + "UN/Turtle Bay South", + 617733122574778367, + 617733123867934719, + "LINESTRING (-73.959122 40.77728, -73.969067 40.753577)" + ], + [ + 1.56, + "POINT (-73.945795 40.77369)", + "Yorkville East", + "POINT (-73.96541 40.765965)", + "Upper East Side South", + 617733122584739839, + 617733123874226175, + "LINESTRING (-73.945795 40.77369, -73.96541 40.765965)" + ], + [ + 8.2, + "POINT (-73.870503 40.773508)", + "LaGuardia 
Airport", + "POINT (-73.974395 40.749617)", + "UN/Turtle Bay South", + 617733124388552703, + 617733123868721151, + "LINESTRING (-73.870503 40.773508, -73.974395 40.749617)" + ], + [ + 2.94, + "POINT (-73.955023 40.769212)", + "Lenox Hill West", + "POINT (-73.982877 40.750892)", + "Midtown South", + 617733122576351231, + 617733123808690175, + "LINESTRING (-73.955023 40.769212, -73.982877 40.750892)" + ], + [ + 0.8, + "POINT (-73.958105 40.77897)", + "Upper East Side North", + "POINT (-73.950742 40.775888)", + "Yorkville West", + 617733122573991935, + 617733122585264127, + "LINESTRING (-73.958105 40.77897, -73.950742 40.775888)" + ] + ], + "datasetInfos": [ + { + "name": "dropoffNeighbourhoods", + "schema": { + "fields": [ + { + "metadata": {}, + "name": "dropoff_zone", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "mosaic_index", + "nullable": true, + "type": { + "fields": [ + { + "metadata": {}, + "name": "is_core", + "nullable": true, + "type": "boolean" + }, + { + "metadata": {}, + "name": "index_id", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "wkb", + "nullable": true, + "type": "binary" + } + ], + "type": "struct" + } + } + ], + "type": "struct" + }, + "tableIdentifier": null, + "typeStr": "org.apache.spark.sql.DataFrame" + }, + { + "name": "withDropoffZone", + "schema": { + "fields": [ + { + "metadata": {}, + "name": "trip_distance", + "nullable": true, + "type": "double" + }, + { + "metadata": {}, + "name": "pickup_geom", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "pickup_zone", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "dropoff_geom", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "dropoff_zone", + "nullable": true, + "type": "string" + }, + { + "metadata": {}, + "name": "pickup_h3", + "nullable": true, + "type": "long" + }, + { + "metadata": {}, + "name": "dropoff_h3", + "nullable": true, + "type": 
"long" + }, + { + "metadata": {}, + "name": "trip_line", + "nullable": true, + "type": "string" + } + ], + "type": "struct" + }, + "tableIdentifier": null, + "typeStr": "org.apache.spark.sql.DataFrame" + } + ], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "trip_distance", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_geom", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "pickup_zone", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "dropoff_geom", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "dropoff_zone", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "pickup_h3", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "dropoff_h3", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "trip_line", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "val dropoffNeighbourhoods = neighbourhoodsWithIndex.select(col(\"properties.zone\").alias(\"dropoff_zone\"), col(\"mosaic_index\"))\n", + "\n", + "val withDropoffZone = \n", + " withPickupZone.join(\n", + " dropoffNeighbourhoods,\n", + " withPickupZone.col(\"dropoff_h3\") === dropoffNeighbourhoods.col(\"mosaic_index.index_id\")\n", + " ).where(\n", + " col(\"mosaic_index.is_core\") || st_contains(col(\"mosaic_index.wkb\"), col(\"dropoff_geom\"))\n", + " ).select(\n", + " \"trip_distance\", \"pickup_geom\", \"pickup_zone\", \"dropoff_geom\", \"dropoff_zone\", \"pickup_h3\", \"dropoff_h3\"\n", + " )\n", + " .withColumn(\"trip_line\", st_astext(st_makeline(array(st_geomfromwkt(col(\"pickup_geom\")), st_geomfromwkt(col(\"dropoff_geom\"))))))\n", + "\n", + 
"display(withDropoffZone.limit(10)) // <- limiting for ipynb only" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "30d4507b-7189-455e-9a4e-681d2f4714ac", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Visualise the results in Kepler\n", + "\n", + "> Mosaic abstracts interaction with Kepler in python through the use of the `%%mosaic_kepler` magic. When python is not the notebook language, you can prepend `%python` before the magic to make the switch." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "17c9cbeb-f411-4b2d-94d7-6737aaf1e1c4", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Here is the initial rendering with trip lines._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "81d92d32-c979-4dd8-9e7b-1a19d2507f13", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d331fd71-d383-485e-bb6f-6bcd2302dae7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Here is with trip lines off and some other adjustments._" + ] + }, + { + "cell_type": "code", + 
"execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4c1c94de-97bb-41e3-abd2-955b6ea3effd", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e1311242-147c-461e-9689-e10e02bd66e8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results. 
Hint: you can toggle layers on/off and adjust properties._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0a093833-f267-4194-b06e-5575001727d2", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# withDropoffZone \"pickup_h3\" \"h3\" 5000" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c1466346-8537-4e15-9afc-54056129af5a", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Databricks Lakehouse can read / write most any data format\n", + "\n", + "> Here are [built-in](https://docs.databricks.com/en/external-data/index.html) formats as well as Mosaic [readers](https://databrickslabs.github.io/mosaic/api/api.html). __Note: best performance with Delta Lake format__, ref [Databricks](https://docs.databricks.com/en/delta/index.html) and [OSS](https://docs.delta.io/latest/index.html) docs for Delta Lake. Beyond built-in formats, Databricks is a platform on which you can install a wide variety of libraries, e.g. 
[1](https://docs.databricks.com/en/libraries/index.html#python-environment-management) | [2](https://docs.databricks.com/en/compute/compatibility.html) | [3](https://docs.databricks.com/en/init-scripts/index.html).\n", + "\n", + "Example of [reading](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameReader.html?highlight=read#pyspark.sql.DataFrameReader) and [writing](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.html?highlight=pyspark%20sql%20dataframe%20writer#pyspark.sql.DataFrameWriter) a Spark DataFrame with Delta Lake format.\n", + "\n", + "```\n", + "# - `write.format(\"delta\")` is default in Databricks\n", + "# - can save to a specified path in the Lakehouse\n", + "# - can save as a table in the Databricks Metastore\n", + "df.write.save(\"\")\n", + "df.write.saveAsTable(\"\")\n", + "```\n", + "\n", + "Example of loading a Delta Lake Table as a Spark DataFrame.\n", + "\n", + "```\n", + "# - `read.format(\"delta\")` is default in Databricks\n", + "# - can load a specified path in the Lakehouse\n", + "# - can load a table in the Databricks Metastore\n", + "spark.read.load(\"\")\n", + "spark.table(\"\")\n", + "```\n", + "\n", + "More on [Unity Catalog](https://docs.databricks.com/en/data-governance/unity-catalog/index.html) in Databricks Lakehouse for Governing [Tables](https://docs.databricks.com/en/data-governance/unity-catalog/index.html#tables) and [Volumes](https://docs.databricks.com/en/data-governance/unity-catalog/index.html#volumes)."
+ ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 1148550101154077, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "QuickstartNotebook", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/examples/scala/QuickstartNotebook.scala b/notebooks/examples/scala/QuickstartNotebook.scala deleted file mode 100644 index e1b1db134..000000000 --- a/notebooks/examples/scala/QuickstartNotebook.scala +++ /dev/null @@ -1,267 +0,0 @@ -// Databricks notebook source -// MAGIC %md -// MAGIC ## Setup NYC taxi zones -// MAGIC In order to setup the data please run the notebook available at "../../data/DownloadNYCTaxiZones".
-// MAGIC DownloadNYCTaxiZones notebook will make sure we have New York City Taxi zone shapes available in our environment. - -// COMMAND ---------- - -val user_name = dbutils.notebook.getContext.userName.get - -val raw_path = s"dbfs:/tmp/mosaic/$user_name" -val raw_taxi_zones_path = s"$raw_path/taxi_zones" - -print(s"The raw data is stored in $raw_path") - -// COMMAND ---------- - -// MAGIC %md -// MAGIC ## Enable Mosaic in the notebook -// MAGIC Mosaic requires Databricks Runtime (DBR) version 10.0 or higher (11.2 with photon or higher is recommended).
-// MAGIC To get started, you'll need to attach the JAR to your cluster and import instances as in the cell below. - -// COMMAND ---------- - -import com.databricks.labs.mosaic.functions.MosaicContext -import com.databricks.labs.mosaic.H3 -import com.databricks.labs.mosaic.JTS - -val mosaicContext = MosaicContext.build(H3, JTS) -import mosaicContext.functions._ -import org.apache.spark.sql.functions._ - -// COMMAND ---------- - -// MAGIC %md ## Read polygons from GeoJson - -// COMMAND ---------- - -// MAGIC %md -// MAGIC With the functionality Mosaic brings we can easily load GeoJSON files using spark.
-// MAGIC In the past this required GeoPandas in python and conversion to spark dataframe.
- -// COMMAND ---------- - -val neighbourhoods = ( - spark.read - .option("multiline", "true") - .format("json") - .load(raw_taxi_zones_path) - .select(col("type"), explode(col("features")).alias("feature")) - .select(col("type"), col("feature.properties").alias("properties"), to_json(col("feature.geometry")).alias("json_geometry")) - .withColumn("geometry", st_aswkt(st_geomfromgeojson(col("json_geometry")))) -) - -display( - neighbourhoods -) - -// COMMAND ---------- - -// MAGIC %md -// MAGIC ## Compute some basic geometry attributes - -// COMMAND ---------- - -// MAGIC %md -// MAGIC Mosaic provides a number of functions for extracting the properties of geometries. Here are some that are relevant to Polygon geometries: - -// COMMAND ---------- - -display( - neighbourhoods - .withColumn("calculatedArea", st_area(col("geometry"))) - .withColumn("calculatedLength", st_length(col("geometry"))) - // Note: The unit of measure of the area and length depends on the CRS used. - // For GPS locations it will be square radians and radians - .select("geometry", "calculatedArea", "calculatedLength") -) - -// COMMAND ---------- - -// MAGIC %md -// MAGIC ## Read points data - -// COMMAND ---------- - -// MAGIC %md -// MAGIC We will load some Taxi trips data to represent point data.
-// MAGIC We already loaded some shapes representing polygons that correspond to NYC neighbourhoods.
- -// COMMAND ---------- - -val tripsTable = spark.table("delta.`/databricks-datasets/nyctaxi/tables/nyctaxi_yellow`") - -// COMMAND ---------- - -val trips = tripsTable - .drop("vendorId", "rateCodeId", "store_and_fwd_flag", "payment_type") - .withColumn("pickup_geom", st_astext(st_point(col("pickup_longitude"), col("pickup_latitude")))) - .withColumn("dropoff_geom", st_astext(st_point(col("dropoff_longitude"), col("dropoff_latitude")))) - -// COMMAND ---------- - -display(trips.select("pickup_geom", "dropoff_geom")) - -// COMMAND ---------- - -// MAGIC %md -// MAGIC ## Spatial Joins - -// COMMAND ---------- - -// MAGIC %md -// MAGIC We can use Mosaic to perform spatial joins both with and without Mosaic indexing strategies.
-// MAGIC Indexing is very important when handling very different geometries both in size and in shape (ie. number of vertices).
- -// COMMAND ---------- - -// MAGIC %md -// MAGIC ### Getting the optimal resolution - -// COMMAND ---------- - -// MAGIC %md -// MAGIC We can use Mosaic functionality to identify how to best index our data based on the data inside the specific dataframe.
-// MAGIC Selecting an apropriate indexing resolution can have a considerable impact on the performance.
- -// COMMAND ---------- - -import com.databricks.labs.mosaic.sql.MosaicFrame - -val mosaicFrame = MosaicFrame(neighbourhoods) - .setGeometryColumn("geometry") - -val optimalResolution = mosaicFrame.getOptimalResolution(0.75) - -println(s"Optimal resolution is $optimalResolution") - -// COMMAND ---------- - -// MAGIC %md -// MAGIC Not every resolution will yield performance improvements.
-// MAGIC By a rule of thumb it is always better to under-index than over-index - if not sure select a lower resolution.
-// MAGIC Higher resolutions are needed when we have very imbalanced geometries with respect to their size or with respect to the number of vertices.
-// MAGIC In such case indexing with more indices will considerably increase the parallel nature of the operations.
-// MAGIC You can think of Mosaic as a way to partition an overly complex row into multiple rows that have a balanced amount of computation each. - -// COMMAND ---------- - -display( - mosaicFrame.analyzer.getResolutionMetrics() -) - -// COMMAND ---------- - -// MAGIC %md -// MAGIC ### Indexing using the optimal resolution - -// COMMAND ---------- - -// MAGIC %md -// MAGIC We will use mosaic sql functions to index our points data.
-// MAGIC Here we will use resolution 9, index resolution depends on the dataset in use. - -// COMMAND ---------- - -val tripsWithIndex = trips - .withColumn("pickup_h3", grid_pointascellid(col("pickup_geom"), lit(optimalResolution))) - .withColumn("dropoff_h3", grid_pointascellid(col("dropoff_geom"), lit(optimalResolution))) - -display(tripsWithIndex) - -// COMMAND ---------- - -// MAGIC %md -// MAGIC We will also index our neighbourhoods using a built in generator function. - -// COMMAND ---------- - -val neighbourhoodsWithIndex = neighbourhoods - // We break down the original geometry in multiple smaller mosaic chips, each with its - // own index - .withColumn("mosaic_index", grid_tessellateexplode(col("geometry"), lit(optimalResolution))) - // We don't need the original geometry any more, since we have broken it down into - // Smaller mosaic chips. - .drop("json_geometry", "geometry") - -display(neighbourhoodsWithIndex) - -// COMMAND ---------- - -// MAGIC %md -// MAGIC ### Performing the spatial join - -// COMMAND ---------- - -// MAGIC %md -// MAGIC We can now do spatial joins to both pickup and drop off zones based on geolocations in our datasets. - -// COMMAND ---------- - -val pickupNeighbourhoods = neighbourhoodsWithIndex.select(col("properties.zone").alias("pickup_zone"), col("mosaic_index")) - -val withPickupZone = - tripsWithIndex.join( - pickupNeighbourhoods, - tripsWithIndex.col("pickup_h3") === pickupNeighbourhoods.col("mosaic_index.index_id") - ).where( - // If the borough is a core chip (the chip is fully contained within the geometry), then we do not need - // to perform any intersection, because any point matching the same index will certainly be contained in - // the borough. Otherwise we need to perform an st_contains operation on the chip geometry. 
- col("mosaic_index.is_core") || st_contains(col("mosaic_index.wkb"), col("pickup_geom")) - ).select( - "trip_distance", "pickup_geom", "pickup_zone", "dropoff_geom", "pickup_h3", "dropoff_h3" - ) - -display(withPickupZone) - -// COMMAND ---------- - -// MAGIC %md -// MAGIC We can easily perform a similar join for the drop off location. - -// COMMAND ---------- - -val dropoffNeighbourhoods = neighbourhoodsWithIndex.select(col("properties.zone").alias("dropoff_zone"), col("mosaic_index")) - -val withDropoffZone = - withPickupZone.join( - dropoffNeighbourhoods, - withPickupZone.col("dropoff_h3") === dropoffNeighbourhoods.col("mosaic_index.index_id") - ).where( - col("mosaic_index.is_core") || st_contains(col("mosaic_index.wkb"), col("dropoff_geom")) - ).select( - "trip_distance", "pickup_geom", "pickup_zone", "dropoff_geom", "dropoff_zone", "pickup_h3", "dropoff_h3" - ) - .withColumn("trip_line", st_astext(st_makeline(array(st_geomfromwkt(col("pickup_geom")), st_geomfromwkt(col("dropoff_geom")))))) - -display(withDropoffZone) - -// COMMAND ---------- - -withDropoffZone.createOrReplaceTempView("withDropoffZone") - -// COMMAND ---------- - -// MAGIC %md -// MAGIC ## Visualise the results in Kepler - -// COMMAND ---------- - -// MAGIC %md -// MAGIC For visualisation there simply aren't good options in scala.
-// MAGIC Luckily in our notebooks you can easily switch to python just for UI.
-// MAGIC Mosaic abstracts interaction with Kepler in python. - -// COMMAND ---------- - -// MAGIC %python -// MAGIC from mosaic import enable_mosaic -// MAGIC enable_mosaic(spark, dbutils) - -// COMMAND ---------- - -// MAGIC %python -// MAGIC %%mosaic_kepler -// MAGIC "withDropoffZone" "pickup_h3" "h3" 5000 diff --git a/notebooks/examples/scala/README.md b/notebooks/examples/scala/README.md new file mode 100644 index 000000000..d40a00d29 --- /dev/null +++ b/notebooks/examples/scala/README.md @@ -0,0 +1,3 @@ +# Scala Examples + +> __Note: `ipynb` files can be previewed in GitHub and can also be imported into Databricks, more [here](https://docs.databricks.com/en/notebooks/notebook-export-import.html).__ diff --git a/notebooks/examples/sql/MosaicAndSedona.ipynb b/notebooks/examples/sql/MosaicAndSedona.ipynb new file mode 100644 index 000000000..eb933065b --- /dev/null +++ b/notebooks/examples/sql/MosaicAndSedona.ipynb @@ -0,0 +1,1534 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "bf98136c-9276-4388-8eef-b567621fe1a4", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Mosaic & Sedona\n", + "\n", + "> You can combine the usage of [Mosaic](https://databrickslabs.github.io/mosaic/index.html) with other geospatial libraries. In this example we combine it with [Sedona](https://sedona.apache.org).\n", + "\n", + "## Setup\n", + "\n", + "This notebook will run if you have both Mosaic and Sedona installed on your cluster as described below.\n", + "\n", + "### Install Sedona\n", + "\n", + "To install Sedona, follow the [official Sedona instructions](https://sedona.apache.org/1.5.0/setup/databricks/).\n", + "\n", + "E.g. Add the following maven coordinates to a non-photon cluster [[1](https://docs.databricks.com/en/libraries/package-repositories.html)]. This is showing DBR 12.2 LTS. 
\n", + "\n", + "```\n", + "org.apache.sedona:sedona-spark-shaded-3.0_2.12:1.5.0\n", + "org.datasyslab:geotools-wrapper:1.5.0-28.2\n", + "```\n", + "\n", + "### Install Mosaic\n", + "\n", + "Download Mosaic JAR to your local machine (e.g. from [here](https://github.com/databrickslabs/mosaic/releases/download/v_0.3.12/mosaic-0.3.12-jar-with-dependencies.jar) for 0.3.12) and then UPLOAD to your cluster [[1](https://docs.databricks.com/en/libraries/cluster-libraries.html#install-a-library-on-a-cluster)]. \n", + "\n", + "### Notes\n", + "\n", + "* This is for [SPARK SQL](https://www.databricks.com/glossary/what-is-spark-sql#:~:text=Spark%20SQL%20is%20a%20Spark,on%20existing%20deployments%20and%20data.) which is different from [DBSQL](https://www.databricks.com/product/databricks-sql); __The best way to combine is to not register mosaic SQL functions since Sedona is primarily SQL.__\n", + "* See instructions for `SedonaContext.create(spark)` [[1](https://sedona.apache.org/1.5.0/tutorial/sql/?h=sedonacontext#initiate-sedonacontext)]. \n", + "* And, Sedona identifies that it might have issues if executed on a [Photon](https://www.databricks.com/product/photon) cluster; again this example is showing DBR 12.2 LTS on the Mosaic 0.3 series.\n", + "\n", + "--- \n", + " __Last Update__ 01 DEC 2023 [Mosaic 0.3.12]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "27dd2429-1135-457b-912f-931e7aaa447e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Prior to Setup\n", + "\n", + "> Notice that even in DBR 12.2 LTS, Databricks initially has gated functions, meaning they will not execute on the runtime but are there. However, we will see that after registering functions, e.g. from Sedona, those then become available (in DBR)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "80dcd1b7-5f05-47a9-a5e5-f8361811cec4", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
function
st_area
st_astext
st_geogfromtext
st_geogfromwkt
st_geometrytype
st_geomfromtext
st_geomfromwkt
st_isempty
st_length
st_ndims
st_npoints
st_point
st_setsrid
st_srid
st_xmax
st_xmin
st_ymax
st_ymin
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "st_area" + ], + [ + "st_astext" + ], + [ + "st_geogfromtext" + ], + [ + "st_geogfromwkt" + ], + [ + "st_geometrytype" + ], + [ + "st_geomfromtext" + ], + [ + "st_geomfromwkt" + ], + [ + "st_isempty" + ], + [ + "st_length" + ], + [ + "st_ndims" + ], + [ + "st_npoints" + ], + [ + "st_point" + ], + [ + "st_setsrid" + ], + [ + "st_srid" + ], + [ + "st_xmax" + ], + [ + "st_xmin" + ], + [ + "st_ymax" + ], + [ + "st_ymin" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "function", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql \n", + "-- before we do anything\n", + "-- have gated product functions\n", + "show system functions like 'st_*'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "b7cac536-773b-47a4-90ce-5a2c77bdca8e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_The following exception will be thrown if you attempt to execute the gated functions:_\n", + "\n", + "```\n", + "AnalysisException: [DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve \"st_area(POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10)))\" due to data type mismatch: parameter 1 requires (\"GEOMETRY\" or \"GEOGRAPHY\") type, however, \"POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))\" is of \"STRING\" type.; line 1 
pos 7;\n", + "'Project [unresolvedalias(st_area(POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))), None)]\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "85cd6a7a-dd6d-4cf6-8f65-0ebf640c2ab2", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "%sql \n", + "-- assumes you are in DBR 12.2 LTS\n", + "-- so this will not execute\n", + "-- uncomment to verify\n", + "-- select st_area('POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))')" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a4a24590-8542-4a71-a1c9-03690da5316e", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
function_desc
Function: st_area
Class: com.databricks.sql.catalyst.expressions.st.ST_Area
Usage: st_area(geo) - Returns the area of the input GEOGRAPHY or GEOMETRY value.
Extended Usage:\n", + " Examples:\n", + " \n", + " Since: 3.3.0\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "Function: st_area" + ], + [ + "Class: com.databricks.sql.catalyst.expressions.st.ST_Area" + ], + [ + "Usage: st_area(geo) - Returns the area of the input GEOGRAPHY or GEOMETRY value." + ], + [ + "Extended Usage:\n Examples:\n \n Since: 3.3.0\n" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "function_desc", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql \n", + "-- notice, e.g. these are initially gated product functions\n", + "describe function extended st_area" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "46dcda8a-cd24-4016-acf9-6ede54978d2f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup\n", + "\n", + "> We are installing Mosaic without SQL functions registered (via Scala) and are installing Sedona SQL as normal." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c91dd7bf-319c-489c-9715-6c512f027d64", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "
import org.apache.spark.sql.functions._\n", + "import com.databricks.labs.mosaic.functions.MosaicContext\n", + "import com.databricks.labs.mosaic.H3\n", + "import com.databricks.labs.mosaic.JTS\n", + "mosaicContext: com.databricks.labs.mosaic.functions.MosaicContext = com.databricks.labs.mosaic.functions.MosaicContext@70e1f779\n", + "import mosaicContext.functions._\n", + "import org.apache.sedona.spark.SedonaContext\n", + "sedona: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@6daae4a9\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "
import org.apache.spark.sql.functions._\nimport com.databricks.labs.mosaic.functions.MosaicContext\nimport com.databricks.labs.mosaic.H3\nimport com.databricks.labs.mosaic.JTS\nmosaicContext: com.databricks.labs.mosaic.functions.MosaicContext = com.databricks.labs.mosaic.functions.MosaicContext@70e1f779\nimport mosaicContext.functions._\nimport org.apache.sedona.spark.SedonaContext\nsedona: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@6daae4a9\n
", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "type": "html" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "\n", + "// -- spark functions\n", + "import org.apache.spark.sql.functions._\n", + "\n", + "// -- mosaic functions\n", + "import com.databricks.labs.mosaic.functions.MosaicContext\n", + "import com.databricks.labs.mosaic.H3\n", + "import com.databricks.labs.mosaic.JTS\n", + "\n", + "val mosaicContext = MosaicContext.build(H3, JTS)\n", + "import mosaicContext.functions._\n", + "\n", + "// ! don't register SQL functions !\n", + "// - this allows sedona to be the main spatial SQL provider\n", + "//mosaicContext.register()\n", + "\n", + "// -- sedona functions\n", + "import org.apache.sedona.spark.SedonaContext\n", + "val sedona = SedonaContext.create(spark)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a446841d-9ce1-4b0c-97e8-b705ab06caee", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Now when we list user functions, we see all the Sedona provided ones._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0394a8a2-dcfd-49c0-a2df-85ecd0272029", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
function
hive_metastore.default.st_geohash
st_3ddistance
st_addpoint
st_affine
st_angle
st_area
st_areaspheroid
st_asbinary
st_asewkb
st_asewkt
st_asgeojson
st_asgml
st_askml
st_astext
st_azimuth
st_boundary
st_boundingdiagonal
st_buffer
st_buildarea
st_centroid
st_closestpoint
st_collect
st_collectionextract
st_concavehull
st_contains
st_convexhull
st_coorddim
st_coveredby
st_covers
st_crosses
st_degrees
st_difference
st_dimension
st_disjoint
st_distance
st_distancesphere
st_distancespheroid
st_dump
st_dumppoints
st_endpoint
st_envelope
st_envelope_aggr
st_equals
st_exteriorring
st_flipcoordinates
st_force3d
st_force_2d
st_frechetdistance
st_geogfromtext
st_geogfromwkt
st_geohash
st_geometricmedian
st_geometryn
st_geometrytype
st_geomfromewkt
st_geomfromgeohash
st_geomfromgeojson
st_geomfromgml
st_geomfromkml
st_geomfromtext
st_geomfromwkb
st_geomfromwkt
st_h3celldistance
st_h3cellids
st_h3kring
st_h3togeom
st_hausdorffdistance
st_interiorringn
st_intersection
st_intersection_aggr
st_intersects
st_isclosed
st_iscollection
st_isempty
st_isring
st_issimple
st_isvalid
st_length
st_lengthspheroid
st_linefrommultipoint
st_linefromtext
st_lineinterpolatepoint
st_linemerge
st_linestringfromtext
st_linesubstring
st_makeline
st_makepoint
st_makepolygon
st_makevalid
st_minimumboundingcircle
st_minimumboundingradius
st_mlinefromtext
st_mpolyfromtext
st_multi
st_ndims
st_normalize
st_npoints
st_nrings
st_numgeometries
st_numinteriorrings
st_numpoints
st_orderingequals
st_overlaps
st_point
st_pointfromtext
st_pointn
st_pointonsurface
st_pointz
st_polygon
st_polygonfromenvelope
st_polygonfromtext
st_reduceprecision
st_removepoint
st_reverse
st_s2cellids
st_setpoint
st_setsrid
st_simplifypreservetopology
st_split
st_srid
st_startpoint
st_subdivide
st_subdivideexplode
st_symdifference
st_touches
st_transform
st_translate
st_union
st_union_aggr
st_voronoipolygons
st_within
st_x
st_xmax
st_xmin
st_y
st_ymax
st_ymin
st_z
st_zmax
st_zmin
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "hive_metastore.default.st_geohash" + ], + [ + "st_3ddistance" + ], + [ + "st_addpoint" + ], + [ + "st_affine" + ], + [ + "st_angle" + ], + [ + "st_area" + ], + [ + "st_areaspheroid" + ], + [ + "st_asbinary" + ], + [ + "st_asewkb" + ], + [ + "st_asewkt" + ], + [ + "st_asgeojson" + ], + [ + "st_asgml" + ], + [ + "st_askml" + ], + [ + "st_astext" + ], + [ + "st_azimuth" + ], + [ + "st_boundary" + ], + [ + "st_boundingdiagonal" + ], + [ + "st_buffer" + ], + [ + "st_buildarea" + ], + [ + "st_centroid" + ], + [ + "st_closestpoint" + ], + [ + "st_collect" + ], + [ + "st_collectionextract" + ], + [ + "st_concavehull" + ], + [ + "st_contains" + ], + [ + "st_convexhull" + ], + [ + "st_coorddim" + ], + [ + "st_coveredby" + ], + [ + "st_covers" + ], + [ + "st_crosses" + ], + [ + "st_degrees" + ], + [ + "st_difference" + ], + [ + "st_dimension" + ], + [ + "st_disjoint" + ], + [ + "st_distance" + ], + [ + "st_distancesphere" + ], + [ + "st_distancespheroid" + ], + [ + "st_dump" + ], + [ + "st_dumppoints" + ], + [ + "st_endpoint" + ], + [ + "st_envelope" + ], + [ + "st_envelope_aggr" + ], + [ + "st_equals" + ], + [ + "st_exteriorring" + ], + [ + "st_flipcoordinates" + ], + [ + "st_force3d" + ], + [ + "st_force_2d" + ], + [ + "st_frechetdistance" + ], + [ + "st_geogfromtext" + ], + [ + "st_geogfromwkt" + ], + [ + "st_geohash" + ], + [ + "st_geometricmedian" + ], + [ + "st_geometryn" + ], + [ + "st_geometrytype" + ], + [ + "st_geomfromewkt" + ], + [ + "st_geomfromgeohash" + ], + [ + "st_geomfromgeojson" + ], + [ + "st_geomfromgml" + ], + [ + "st_geomfromkml" + ], + [ + "st_geomfromtext" + ], + [ + "st_geomfromwkb" + ], + [ + "st_geomfromwkt" + ], + [ + "st_h3celldistance" + ], + [ + 
"st_h3cellids" + ], + [ + "st_h3kring" + ], + [ + "st_h3togeom" + ], + [ + "st_hausdorffdistance" + ], + [ + "st_interiorringn" + ], + [ + "st_intersection" + ], + [ + "st_intersection_aggr" + ], + [ + "st_intersects" + ], + [ + "st_isclosed" + ], + [ + "st_iscollection" + ], + [ + "st_isempty" + ], + [ + "st_isring" + ], + [ + "st_issimple" + ], + [ + "st_isvalid" + ], + [ + "st_length" + ], + [ + "st_lengthspheroid" + ], + [ + "st_linefrommultipoint" + ], + [ + "st_linefromtext" + ], + [ + "st_lineinterpolatepoint" + ], + [ + "st_linemerge" + ], + [ + "st_linestringfromtext" + ], + [ + "st_linesubstring" + ], + [ + "st_makeline" + ], + [ + "st_makepoint" + ], + [ + "st_makepolygon" + ], + [ + "st_makevalid" + ], + [ + "st_minimumboundingcircle" + ], + [ + "st_minimumboundingradius" + ], + [ + "st_mlinefromtext" + ], + [ + "st_mpolyfromtext" + ], + [ + "st_multi" + ], + [ + "st_ndims" + ], + [ + "st_normalize" + ], + [ + "st_npoints" + ], + [ + "st_nrings" + ], + [ + "st_numgeometries" + ], + [ + "st_numinteriorrings" + ], + [ + "st_numpoints" + ], + [ + "st_orderingequals" + ], + [ + "st_overlaps" + ], + [ + "st_point" + ], + [ + "st_pointfromtext" + ], + [ + "st_pointn" + ], + [ + "st_pointonsurface" + ], + [ + "st_pointz" + ], + [ + "st_polygon" + ], + [ + "st_polygonfromenvelope" + ], + [ + "st_polygonfromtext" + ], + [ + "st_reduceprecision" + ], + [ + "st_removepoint" + ], + [ + "st_reverse" + ], + [ + "st_s2cellids" + ], + [ + "st_setpoint" + ], + [ + "st_setsrid" + ], + [ + "st_simplifypreservetopology" + ], + [ + "st_split" + ], + [ + "st_srid" + ], + [ + "st_startpoint" + ], + [ + "st_subdivide" + ], + [ + "st_subdivideexplode" + ], + [ + "st_symdifference" + ], + [ + "st_touches" + ], + [ + "st_transform" + ], + [ + "st_translate" + ], + [ + "st_union" + ], + [ + "st_union_aggr" + ], + [ + "st_voronoipolygons" + ], + [ + "st_within" + ], + [ + "st_x" + ], + [ + "st_xmax" + ], + [ + "st_xmin" + ], + [ + "st_y" + ], + [ + "st_ymax" + ], + [ + "st_ymin" + 
], + [ + "st_z" + ], + [ + "st_zmax" + ], + [ + "st_zmin" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "function", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql \n", + "show user functions like 'st_*'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c87fd220-d78e-402a-9452-e15191128a1b", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Notice that the prior system registered functions have been replaced, e.g. `ST_Area`._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7d5f73c2-d7c1-4e61-bf15-41d51a1d3829", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
function_desc
Function: ST_Area
Class: org.apache.spark.sql.sedona_sql.expressions.ST_Area
Usage: N/A.
Extended Usage:\n", + " No example/argument for ST_Area.\n", + "
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "Function: ST_Area" + ], + [ + "Class: org.apache.spark.sql.sedona_sql.expressions.ST_Area" + ], + [ + "Usage: N/A." + ], + [ + "Extended Usage:\n No example/argument for ST_Area.\n" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "function_desc", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql \n", + "-- notice, e.g. the provided functions are now available\n", + "describe function extended st_area" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1805f461-ecab-4a03-980d-fb403a3a028e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Queries\n", + "\n", + "> Showing how Sedona (registered Spark SQL) and Mosaic (Scala) can co-exist on the same cluster. Not shown here, but there could also be Mosaic Python bindings." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c1e4ac30-daf0-423c-8117-b7c3c4c06e52", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
wkt
POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "wkt", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql\n", + "CREATE OR REPLACE TEMPORARY VIEW sample AS (\n", + " SELECT 'POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))' AS wkt\n", + ");\n", + "\n", + "SELECT * FROM sample" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "fbb24b11-f88d-46fb-a365-773d35923704", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Here is a Spark SQL call to use the Sedona functions._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6f44c258-6919-43bc-9b52-a9167ce48078", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
sedona_area
550.0
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 550.0 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "sedona_area", + "type": "\"double\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql\n", + "SELECT ST_Area(ST_GeomFromText(wkt)) AS sedona_area FROM sample" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3a484604-c4bc-4234-acf0-32994de54554", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Here is a Scala call to the same Mosaic-provided `ST_Area` function._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ccf12e8d-82ff-47d9-ab5e-f64b2c487223", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
mosaic_area
550.0
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 550.0 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "mosaic_area", + "type": "\"double\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "// verify scala functions registered\n", + "display(\n", + " spark\n", + " .table(\"sample\")\n", + " .select(st_area($\"wkt\").as(\"mosaic_area\"))\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6dd1e21d-7a84-4c5e-b5f6-b02831d846b0", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Mosaic + Sedona_\n", + "\n", + "> Showing blending Mosaic calls (in Scala) with Sedona (Spark SQL) calls." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e0602e02-01ec-45cd-8c17-aa30e0d0d969", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
mosaic_areasedona_areawkt
550.0550.0POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 550.0, + 550.0, + "POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "mosaic_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "sedona_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "wkt", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%scala\n", + "display(\n", + " spark.table(\"sample\")\n", + " .select(\n", + " st_area($\"wkt\").as(\"mosaic_area\"), // <- mosaic (scala)\n", + " expr(\"ST_Area(ST_GeomFromText(wkt)) AS sedona_area\"), // <- sedona (spark sql)\n", + " $\"wkt\"\n", + " )\n", + ")" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 1148550101113066, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "MosaicAndSedona", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/examples/sql/MosaicAndSedona.sql b/notebooks/examples/sql/MosaicAndSedona.sql deleted file mode 100644 index 40b33167b..000000000 --- a/notebooks/examples/sql/MosaicAndSedona.sql +++ /dev/null @@ -1,56 +0,0 @@ --- Databricks notebook source --- MAGIC %md --- MAGIC # Mosaic & Sedona --- MAGIC --- MAGIC You can 
combine the usage of Mosaic with other geospatial libraries. --- MAGIC --- MAGIC In this example we combine the use of [Sedona](https://sedona.apache.org) and Mosaic. --- MAGIC --- MAGIC ## Setup --- MAGIC --- MAGIC This notebook will run if you have both Mosaic and Sedona installed on your cluster. --- MAGIC --- MAGIC ### Install sedona --- MAGIC --- MAGIC To install Sedona, follow the [official Sedona instructions](https://sedona.apache.org/1.4.0/setup/databricks). - --- COMMAND ---------- - --- MAGIC %python --- MAGIC # import pyspark.sql.functions as f --- MAGIC # import mosaic as mos --- MAGIC # from sedona.register.geo_registrator import SedonaRegistrator --- MAGIC --- MAGIC # mos.enable_mosaic(spark, dbutils) # Enable Mosaic --- MAGIC # SedonaRegistrator.registerAll(spark) # Register Sedona SQL functions - --- COMMAND ---------- - --- MAGIC %scala --- MAGIC // Register Sedona in the 'default' database --- MAGIC import org.apache.sedona.sql.utils.SedonaSQLRegistrator --- MAGIC SedonaSQLRegistrator.registerAll(spark) --- MAGIC --- MAGIC // Import Mosaic functions --- MAGIC import com.databricks.labs.mosaic.functions.MosaicContext --- MAGIC import com.databricks.labs.mosaic.H3 --- MAGIC import com.databricks.labs.mosaic.JTS --- MAGIC --- MAGIC val mosaicContext = MosaicContext.build(H3, JTS) --- MAGIC import mosaicContext.functions._ --- MAGIC import org.apache.spark.sql.functions._ - --- COMMAND ---------- - --- MAGIC %scala --- MAGIC // Example dataset --- MAGIC spark.createDataFrame(Seq(Tuple1("POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"))).toDF("wkt").createOrReplaceTempView("sample") - --- COMMAND ---------- - -SELECT - mosaic.ST_Area(wkt) as mosaic_area, -- Mosaic - ST_Area(ST_GeomFromWKT(wkt)) as sedona_area, -- Sedona - ST_FlipCoordinates(ST_GeomFromWKT(wkt)) as sedona_flipped, -- Sedona - wkt -FROM sample diff --git a/notebooks/examples/sql/QuickstartNotebook.ipynb b/notebooks/examples/sql/QuickstartNotebook.ipynb new file mode 100644 index 
000000000..a174d3d2f --- /dev/null +++ b/notebooks/examples/sql/QuickstartNotebook.ipynb @@ -0,0 +1,2316 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "db1d4ea7-d138-4740-ac41-74998430b3df", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Mosaic Quickstart\n", + "\n", + "> Perform a point-in-polygon spatial join between NYC Taxi trips and zones. __Note: this does not get into performance tweaks that are available for scaled joins.__\n", + "\n", + "1. To use Databricks Labs [Mosaic](https://databrickslabs.github.io/mosaic/index.html) library for geospatial data engineering, analysis, and visualization functionality:\n", + " * Install with `%pip install databricks-mosaic`\n", + " * Import and use with the following:\n", + " ```\n", + " import mosaic as mos\n", + " mos.enable_mosaic(spark, dbutils)\n", + " ```\n", + "

\n", + "\n", + "2. To use [KeplerGl](https://kepler.gl/) OSS library for map layer rendering:\n", + " * Already installed with Mosaic, use `%%mosaic_kepler` magic [[Mosaic Docs](https://databrickslabs.github.io/mosaic/usage/kepler.html)]\n", + " * Import with `from keplergl import KeplerGl` to use directly\n", + "\n", + "If you have trouble with Volume access:\n", + "\n", + "* For Mosaic 0.3 series (< DBR 13) - you can copy resources to DBFS as a workaround\n", + "* For Mosaic 0.4 series (DBR 13.3 LTS) - you will need to either copy resources to DBFS or setup for Unity Catalog + Shared Access which will involve your workspace admin. Instructions, as updated, will be [here](https://databrickslabs.github.io/mosaic/usage/install-gdal.html).\n", + "\n", + "--- \n", + " __Last Update__ 01 DEC 2023 [Mosaic 0.3.12]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d6cbd7f0-cfa1-41f9-88dc-dccd355343d4", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Install Mosaic\n", + "\n", + "> Mosaic framework is available via pip install and it comes with bindings for Python, SQL, Scala and R. The wheel file coming with pip installation is registering any necessary jars for other language support." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2fb6d01c-9da4-471a-b765-eb4578600eb1", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Python interpreter will be restarted.\nPython interpreter will be restarted.\n" + ] + } + ], + "source": [ + "%pip install \"databricks-mosaic<0.4,>=0.3\" --quiet # <- Mosaic 0.3 series\n", + "# %pip install \"databricks-mosaic<0.5,>=0.4\" --quiet # <- Mosaic 0.4 series (as available)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3444513e-7349-4159-8dda-c5bff1225a12", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# -- configure AQE for more compute heavy operations\n", + "# - choose option-1 or option-2 below, essential for REPARTITION!\n", + "# spark.conf.set(\"spark.databricks.optimizer.adaptive.enabled\", False) # <- option-1: turn off completely for full control\n", + "spark.conf.set(\"spark.sql.adaptive.coalescePartitions.enabled\", False) # <- option-2: just tweak partition management\n", + "spark.conf.set(\"spark.sql.shuffle.partitions\", 1_024) # <-- default is 200\n", + "\n", + "# -- import databricks + spark functions\n", + "from pyspark.sql import functions as F\n", + "from pyspark.sql.functions import col, udf\n", + "from pyspark.sql.types import *\n", + "\n", + "# -- setup mosaic\n", + "import mosaic as mos\n", + "\n", + "mos.enable_mosaic(spark, dbutils)\n", + "# mos.enable_gdal(spark) # <- not needed for this example\n", + "\n", + "# --other imports\n", + "import os\n", + "import pathlib\n", + "import requests\n", + "import warnings\n", + 
"\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9251d814-2e9f-4287-8fd9-769f0bf40c68", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Setup Data" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c1f01df5-a33a-4d99-91f3-3dbe066ecc98", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial data stored in '/tmp/mosaic/mjohns@databricks.com'\n" + ] + } + ], + "source": [ + "user_name = dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get()\n", + "\n", + "data_dir = f\"/tmp/mosaic/{user_name}\" # <- DBFS\n", + "print(f\"Initial data stored in '{data_dir}'\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "124695f4-6e1d-462c-b0a8-433fa7f7803b", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "none on /local_disk0/.wsfs type fuse (rw,nosuid,nodev,relatime,user_id=0,group_id=0,default_permissions)\nworkspace on /Workspace type fuse (rw,nosuid,nodev,relatime,user_id=0,group_id=0,default_permissions,allow_other)\n/: on /dbfs type fuse (rw,nosuid,nodev,relatime,user_id=0,group_id=0,default_permissions,allow_other)\n/: on /Volumes type fuse (rw,nosuid,nodev,relatime,user_id=0,group_id=0,default_permissions,allow_other)\n" + ] + } + ], + "source": [ + "%sh mount -l -t fuse" + ] + }, + { + "cell_type": "markdown", + "metadata": { + 
"application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "293022d3-40c6-4928-946b-7bfe8a6fbb1e", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Download NYC Taxi Zones\n", + "\n", + "> Make sure we have New York City Taxi zone shapes available in our environment." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e3283b3f-4b81-459c-b513-2d782e9acc7f", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "ZONE_DIR_FUSE? '/dbfs/tmp/mosaic/mjohns@databricks.com/taxi_zones'\n" + ] + } + ], + "source": [ + "zone_dir = f\"{data_dir}/taxi_zones\" # <- DBFS\n", + "zone_dir_fuse = f\"/dbfs{zone_dir}\" # <- FUSE\n", + "dbutils.fs.mkdirs(zone_dir)\n", + "\n", + "os.environ['ZONE_DIR_FUSE'] = zone_dir_fuse\n", + "print(f\"ZONE_DIR_FUSE? '{zone_dir_fuse}'\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "941175c3-4142-4254-b47d-c8e813dfe95c", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "...skipping '/dbfs/tmp/mosaic/mjohns@databricks.com/taxi_zones/nyc_taxi_zones.geojson', already exits.\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "

pathnamesizemodificationTime
dbfs:/tmp/mosaic/mjohns@databricks.com/taxi_zones/nyc_taxi_zones.geojsonnyc_taxi_zones.geojson38924781701183475000
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "dbfs:/tmp/mosaic/mjohns@databricks.com/taxi_zones/nyc_taxi_zones.geojson", + "nyc_taxi_zones.geojson", + 3892478, + 1701183475000 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "path", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "name", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "size", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "modificationTime", + "type": "\"long\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "zone_url = 'https://data.cityofnewyork.us/api/geospatial/d3c5-ddgc?method=export&format=GeoJSON'\n", + "\n", + "zone_fuse_path = pathlib.Path(zone_dir_fuse) / 'nyc_taxi_zones.geojson'\n", + "if not zone_fuse_path.exists():\n", + " req = requests.get(zone_url)\n", + " with open(zone_fuse_path, 'wb') as f:\n", + " f.write(req.content)\n", + "else:\n", + " print(f\"...skipping '{zone_fuse_path}', already exits.\")\n", + "\n", + "display(dbutils.fs.ls(zone_dir))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "56b5edd3-3e43-428b-b13d-752c2a3a18a3", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Initial Taxi Zone from GeoJSON [Polygons]\n", + "\n", + "> With the functionality Mosaic brings we can easily load GeoJSON files. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "172476a3-aa95-45cd-91e8-040d865b37d1", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "%python \n", + "# Note: Here we are using python for convenience since our\n", + "# data is on DBFS; after we create a temp view, can pick up \n", + "# in Spark SQL\n", + "(\n", + " spark.read\n", + " .option(\"multiline\", \"true\")\n", + " .format(\"json\")\n", + " .load(zone_dir)\n", + " .select(\"type\", F.explode(col(\"features\")).alias(\"feature\"))\n", + " .select(\"type\", col(\"feature.properties\").alias(\"properties\"), F.to_json(col(\"feature.geometry\")).alias(\"json_geometry\"))\n", + " .withColumn(\"geometry\", mos.st_aswkt(mos.st_geomfromgeojson(\"json_geometry\")))\n", + ").createOrReplaceTempView(\"neighbourhoods\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8927e98f-5978-4408-ac88-909a87fcf602", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
count
263
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "263" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "count", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql select format_number(count(1), 0) as count from neighbourhoods" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7b63951b-7335-4160-94a0-ec94ea995fcf", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
typepropertiesjson_geometrygeometry
FeatureCollectionList(EWR, 1, 1, 0.0007823067885, 0.116357453189, Newark Airport){\"coordinates\":[[[[-74.18445299999996,40.694995999999904],[-74.18448899999999,40.69509499999987],[-74.18449799999996,40.69518499999987],[-74.18438099999997,40.69587799999989],[-74.18428199999994,40.6962109999999],[-74.18402099999997,40.697074999999884],[-74.18391299999996,40.69750699999986],[-74.18375099999997,40.69779499999988],[-74.18363399999998,40.6983259999999],[-74.18356199999994,40.698451999999875],[-74.18354399999998,40.69855999999988],[-74.18350799999996,40.69870399999992],[-74.18327399999998,40.70008999999988],[-74.18315699999994,40.701214999999884],[-74.18316599999997,40.702384999999886],[-74.18313899999998,40.7026279999999],[-74.18309399999998,40.7028529999999],[-74.18299499999995,40.70315899999985],[-74.18284199999994,40.70346499999989],[-74.18264399999998,40.70373499999988],[-74.18242799999996,40.70395099999992],[-74.18220299999996,40.704139999999896],[-74.18203199999994,40.70425699999987],[-74.18180699999994,40.7043919999999],[-74.18157299999996,40.70449999999988],[-74.18132099999997,40.70460799999991],[-74.18080799999996,40.7047879999999],[-74.179467,40.70534599999992],[-74.17887299999995,40.70554399999987],[-74.17831499999994,40.70572399999987],[-74.17776599999996,40.70589499999988],[-74.17709099999996,40.706092999999896],[-74.17699199999998,40.70613799999988],[-74.17689299999995,40.70619199999988],[-74.17664999999994,40.70641699999988],[-74.17642499999994,40.706695999999916],[-74.17628999999994,40.70689399999988],[-74.17608299999995,40.70710999999989],[-74.17599299999995,40.70719099999991],[-74.17589399999997,40.707262999999905],[-74.17565999999994,40.70737999999988],[-74.17538099999996,40.707469999999915],[-74.17515599999996,40.707514999999894],[-74.17475999999994,40.707595999999924],[-74.17417499999993,40.70766799999991],[-74.17388699999998,40.70773099999992],[-74.17347299999994,40.707748999999865],[-74.17275299999994,40.707802999999906],[-74.17188899999996,40.7079
10999999854],[-74.17163699999998,40.70795599999986],[-74.17133999999999,40.707964999999895],[-74.17120499999999,40.70795599999986],[-74.16994499999998,40.707973999999886],[-74.16888299999994,40.7079379999999],[-74.16681299999993,40.70785699999989],[-74.16442799999999,40.70779399999987],[-74.16401399999995,40.70777599999992],[-74.16233999999997,40.707721999999876],[-74.16081899999995,40.70764099999991],[-74.16057599999993,40.70760499999988],[-74.16033299999998,40.70756899999987],[-74.160063,40.7074879999999],[-74.15938799999998,40.707262999999905],[-74.15904599999999,40.707145999999916],[-74.15891999999997,40.70710999999989],[-74.15827199999995,40.70687599999993],[-74.15459099999998,40.705651999999894],[-74.15409599999998,40.70544499999989],[-74.15401499999997,40.70538199999988],[-74.15387999999996,40.705327999999895],[-74.15376299999997,40.705408999999875],[-74.15323199999995,40.70524699999987],[-74.15317799999997,40.70531899999989],[-74.15306999999996,40.7052829999999],[-74.15359199999995,40.70437399999987],[-74.15386199999995,40.7038429999999],[-74.15513999999996,40.70155699999987],[-74.15544599999998,40.70108899999988],[-74.15575199999995,40.7006659999999],[-74.15600399999994,40.70026099999991],[-74.15635499999996,40.69975699999986],[-74.15745299999998,40.69809199999988],[-74.15754299999998,40.6979389999999],[-74.15758799999998,40.69781299999988],[-74.15762399999994,40.69767799999991],[-74.15829899999994,40.696705999999885],[-74.15951399999994,40.69488799999988],[-74.15958599999993,40.69476199999984],[-74.16014399999995,40.69410499999988],[-74.16057599999993,40.693222999999875],[-74.16262799999998,40.69028899999989],[-74.16279899999995,40.69002799999989],[-74.16290699999996,40.68987499999987],[-74.16292499999997,40.689874999999866],[-74.16295199999996,40.689874999999866],[-74.16306899999995,40.68989299999988],[-74.16309599999994,40.689928999999886],[-74.16322199999996,40.68998299999989],[-74.16331199999996,40.68999199999993],[-74.16341099999994,40.69000099999988]
,[-74.16352799999999,40.69000999999986],[-74.16380699999996,40.69004599999989],[-74.16410399999995,40.690081999999904],[-74.16417599999994,40.690081999999904],[-74.16422999999998,40.69005499999988],[-74.16436499999998,40.69003699999991],[-74.16450899999995,40.68998299999986],[-74.16467099999994,40.68988399999989],[-74.16479699999996,40.689757999999884],[-74.16491399999995,40.689586999999904],[-74.16499499999998,40.689388999999885],[-74.16528299999999,40.68891199999991],[-74.16542699999997,40.6887589999999],[-74.16548099999994,40.68863299999987],[-74.16560699999997,40.68842599999988],[-74.16576899999995,40.68802999999986],[-74.16587699999997,40.68787699999991],[-74.16583199999997,40.68757999999987],[-74.16582299999999,40.68748999999987],[-74.16580499999998,40.687156999999914],[-74.16582299999999,40.68703999999986],[-74.16589499999998,40.6868419999999],[-74.16604799999999,40.68655399999988],[-74.16639899999996,40.686022999999864],[-74.16650699999997,40.68588799999986],[-74.16674099999994,40.685491999999925],[-74.16695699999997,40.68523099999988],[-74.16738899999996,40.684546999999924],[-74.16781199999997,40.6839439999999],[-74.16791099999995,40.68379099999988],[-74.16804599999995,40.68360199999991],[-74.16816299999994,40.683475999999885],[-74.16822599999995,40.68334999999991],[-74.16848699999997,40.68299899999991],[-74.16886499999998,40.68239599999987],[-74.16916199999997,40.68199999999991],[-74.16929699999997,40.68178399999989],[-74.16947699999997,40.68155899999991],[-74.16981899999996,40.681018999999885],[-74.16995399999996,40.680874999999915],[-74.17005299999994,40.68066799999987],[-74.17041299999994,40.6801549999999],[-74.17051199999997,40.67999299999987],[-74.17067399999996,40.679650999999886],[-74.17093499999999,40.679290999999864],[-74.17144799999994,40.67847199999989],[-74.17151999999999,40.678381999999885],[-74.17160999999999,40.678255999999884],[-74.17193399999996,40.67782399999988],[-74.17200599999995,40.67773399999988],[-74.17283399999997,40.67656399999988
],[-74.17314899999997,40.67619499999991],[-74.17322999999999,40.6760779999999],[-74.17329299999994,40.67601499999989],[-74.17358999999993,40.67571799999991],[-74.17423799999995,40.67493499999991],[-74.17437299999995,40.674817999999895],[-74.17484999999994,40.67432299999992],[-74.17500299999995,40.6741699999999],[-74.17538999999995,40.67375599999987],[-74.17604699999998,40.673044999999895],[-74.17630799999995,40.67276599999986],[-74.17641599999996,40.672621999999876],[-74.17663199999998,40.67239699999989],[-74.17678499999994,40.67218099999991],[-74.17697399999997,40.6719379999999],[-74.17709099999996,40.671784999999886],[-74.17734299999995,40.67155999999988],[-74.17754999999994,40.67142499999989],[-74.17778399999997,40.671316999999874],[-74.17802699999999,40.671208999999884],[-74.17862999999994,40.671037999999896],[-74.17888199999999,40.671001999999895],[-74.17912499999994,40.67099299999991],[-74.17933199999999,40.67101099999992],[-74.17979099999997,40.67115499999989],[-74.17997999999994,40.671208999999884],[-74.18010599999997,40.671262999999904],[-74.18030399999998,40.67129899999986],[-74.18133899999998,40.67170399999986],[-74.18213999999996,40.67202799999989],[-74.18384999999995,40.672648999999886],[-74.18437199999994,40.67290999999989],[-74.18458799999996,40.67302699999988],[-74.18492099999997,40.673269999999896],[-74.18503799999996,40.67335999999989],[-74.18513699999994,40.673458999999866],[-74.18547899999999,40.67390899999987],[-74.18594699999994,40.674664999999905],[-74.18670299999997,40.67578999999992],[-74.18733299999997,40.67674399999987],[-74.18767499999996,40.67729299999991],[-74.18795399999995,40.67761699999989],[-74.18819699999995,40.67792299999992],[-74.18852099999998,40.67848099999987],[-74.18877299999997,40.67885899999989],[-74.18905199999995,40.67933599999985],[-74.18935799999997,40.67975899999988],[-74.18949299999997,40.680091999999895],[-74.18969999999996,40.680793999999885],[-74.18977199999995,40.68113599999987],[-74.189781,40.681198999999886],[-7
4.18983499999996,40.68131599999987],[-74.18991599999998,40.68154099999988],[-74.18996999999996,40.6818019999999],[-74.18999699999995,40.6822519999999],[-74.18999699999995,40.68262999999992],[-74.18996999999996,40.68295399999989],[-74.18998799999997,40.68317899999989],[-74.18995199999995,40.683520999999885],[-74.18993399999994,40.68370999999992],[-74.189871,40.684078999999876],[-74.189781,40.68481699999991],[-74.18976299999997,40.68503299999986],[-74.18962799999997,40.686103999999915],[-74.18955599999998,40.68689599999987],[-74.18951999999996,40.6872019999999],[-74.18947499999996,40.68748999999985],[-74.18939399999994,40.68773299999988],[-74.18939399999994,40.68783199999991],[-74.18941199999995,40.687939999999855],[-74.18940299999997,40.68809299999987],[-74.18934899999994,40.68826399999989],[-74.18922299999997,40.68862399999989],[-74.18898899999994,40.68904699999991],[-74.18870099999998,40.689442999999876],[-74.18779199999994,40.690189999999866],[-74.18723399999999,40.69059499999986],[-74.18636999999995,40.69118899999991],[-74.18591099999998,40.69144999999988],[-74.18563199999994,40.69164799999987],[-74.18445299999996,40.694995999999904]]]],\"type\":\"MultiPolygon\"}MULTIPOLYGON (((-74.18445299999996 40.694995999999904, -74.18448899999999 40.69509499999987, -74.18449799999996 40.69518499999987, -74.18438099999997 40.69587799999989, -74.18428199999994 40.6962109999999, -74.18402099999997 40.697074999999884, -74.18391299999996 40.69750699999986, -74.18375099999997 40.69779499999988, -74.18363399999998 40.6983259999999, -74.18356199999994 40.698451999999875, -74.18354399999998 40.69855999999988, -74.18350799999996 40.69870399999992, -74.18327399999998 40.70008999999988, -74.18315699999994 40.701214999999884, -74.18316599999997 40.702384999999886, -74.18313899999998 40.7026279999999, -74.18309399999998 40.7028529999999, -74.18299499999995 40.70315899999985, -74.18284199999994 40.70346499999989, -74.18264399999998 40.70373499999988, -74.18242799999996 40.70395099999992, 
-74.18220299999996 40.704139999999896, -74.18203199999994 40.70425699999987, -74.18180699999994 40.7043919999999, -74.18157299999996 40.70449999999988, -74.18132099999997 40.70460799999991, -74.18080799999996 40.7047879999999, -74.179467 40.70534599999992, -74.17887299999995 40.70554399999987, -74.17831499999994 40.70572399999987, -74.17776599999996 40.70589499999988, -74.17709099999996 40.706092999999896, -74.17699199999998 40.70613799999988, -74.17689299999995 40.70619199999988, -74.17664999999994 40.70641699999988, -74.17642499999994 40.706695999999916, -74.17628999999994 40.70689399999988, -74.17608299999995 40.70710999999989, -74.17599299999995 40.70719099999991, -74.17589399999997 40.707262999999905, -74.17565999999994 40.70737999999988, -74.17538099999996 40.707469999999915, -74.17515599999996 40.707514999999894, -74.17475999999994 40.707595999999924, -74.17417499999993 40.70766799999991, -74.17388699999998 40.70773099999992, -74.17347299999994 40.707748999999865, -74.17275299999994 40.707802999999906, -74.17188899999996 40.707910999999854, -74.17163699999998 40.70795599999986, -74.17133999999999 40.707964999999895, -74.17120499999999 40.70795599999986, -74.16994499999998 40.707973999999886, -74.16888299999994 40.7079379999999, -74.16681299999993 40.70785699999989, -74.16442799999999 40.70779399999987, -74.16401399999995 40.70777599999992, -74.16233999999997 40.707721999999876, -74.16081899999995 40.70764099999991, -74.16057599999993 40.70760499999988, -74.16033299999998 40.70756899999987, -74.160063 40.7074879999999, -74.15938799999998 40.707262999999905, -74.15904599999999 40.707145999999916, -74.15891999999997 40.70710999999989, -74.15827199999995 40.70687599999993, -74.15459099999998 40.705651999999894, -74.15409599999998 40.70544499999989, -74.15401499999997 40.70538199999988, -74.15387999999996 40.705327999999895, -74.15376299999997 40.705408999999875, -74.15323199999995 40.70524699999987, -74.15317799999997 40.70531899999989, -74.15306999999996 
40.7052829999999, -74.15359199999995 40.70437399999987, -74.15386199999995 40.7038429999999, -74.15513999999996 40.70155699999987, -74.15544599999998 40.70108899999988, -74.15575199999995 40.7006659999999, -74.15600399999994 40.70026099999991, -74.15635499999996 40.69975699999986, -74.15745299999998 40.69809199999988, -74.15754299999998 40.6979389999999, -74.15758799999998 40.69781299999988, -74.15762399999994 40.69767799999991, -74.15829899999994 40.696705999999885, -74.15951399999994 40.69488799999988, -74.15958599999993 40.69476199999984, -74.16014399999995 40.69410499999988, -74.16057599999993 40.693222999999875, -74.16262799999998 40.69028899999989, -74.16279899999995 40.69002799999989, -74.16290699999996 40.68987499999987, -74.16292499999997 40.689874999999866, -74.16295199999996 40.689874999999866, -74.16306899999995 40.68989299999988, -74.16309599999994 40.689928999999886, -74.16322199999996 40.68998299999989, -74.16331199999996 40.68999199999993, -74.16341099999994 40.69000099999988, -74.16352799999999 40.69000999999986, -74.16380699999996 40.69004599999989, -74.16410399999995 40.690081999999904, -74.16417599999994 40.690081999999904, -74.16422999999998 40.69005499999988, -74.16436499999998 40.69003699999991, -74.16450899999995 40.68998299999986, -74.16467099999994 40.68988399999989, -74.16479699999996 40.689757999999884, -74.16491399999995 40.689586999999904, -74.16499499999998 40.689388999999885, -74.16528299999999 40.68891199999991, -74.16542699999997 40.6887589999999, -74.16548099999994 40.68863299999987, -74.16560699999997 40.68842599999988, -74.16576899999995 40.68802999999986, -74.16587699999997 40.68787699999991, -74.16583199999997 40.68757999999987, -74.16582299999999 40.68748999999987, -74.16580499999998 40.687156999999914, -74.16582299999999 40.68703999999986, -74.16589499999998 40.6868419999999, -74.16604799999999 40.68655399999988, -74.16639899999996 40.686022999999864, -74.16650699999997 40.68588799999986, -74.16674099999994 
40.685491999999925, -74.16695699999997 40.68523099999988, -74.16738899999996 40.684546999999924, -74.16781199999997 40.6839439999999, -74.16791099999995 40.68379099999988, -74.16804599999995 40.68360199999991, -74.16816299999994 40.683475999999885, -74.16822599999995 40.68334999999991, -74.16848699999997 40.68299899999991, -74.16886499999998 40.68239599999987, -74.16916199999997 40.68199999999991, -74.16929699999997 40.68178399999989, -74.16947699999997 40.68155899999991, -74.16981899999996 40.681018999999885, -74.16995399999996 40.680874999999915, -74.17005299999994 40.68066799999987, -74.17041299999994 40.6801549999999, -74.17051199999997 40.67999299999987, -74.17067399999996 40.679650999999886, -74.17093499999999 40.679290999999864, -74.17144799999994 40.67847199999989, -74.17151999999999 40.678381999999885, -74.17160999999999 40.678255999999884, -74.17193399999996 40.67782399999988, -74.17200599999995 40.67773399999988, -74.17283399999997 40.67656399999988, -74.17314899999997 40.67619499999991, -74.17322999999999 40.6760779999999, -74.17329299999994 40.67601499999989, -74.17358999999993 40.67571799999991, -74.17423799999995 40.67493499999991, -74.17437299999995 40.674817999999895, -74.17484999999994 40.67432299999992, -74.17500299999995 40.6741699999999, -74.17538999999995 40.67375599999987, -74.17604699999998 40.673044999999895, -74.17630799999995 40.67276599999986, -74.17641599999996 40.672621999999876, -74.17663199999998 40.67239699999989, -74.17678499999994 40.67218099999991, -74.17697399999997 40.6719379999999, -74.17709099999996 40.671784999999886, -74.17734299999995 40.67155999999988, -74.17754999999994 40.67142499999989, -74.17778399999997 40.671316999999874, -74.17802699999999 40.671208999999884, -74.17862999999994 40.671037999999896, -74.17888199999999 40.671001999999895, -74.17912499999994 40.67099299999991, -74.17933199999999 40.67101099999992, -74.17979099999997 40.67115499999989, -74.17997999999994 40.671208999999884, -74.18010599999997 
40.671262999999904, -74.18030399999998 40.67129899999986, -74.18133899999998 40.67170399999986, -74.18213999999996 40.67202799999989, -74.18384999999995 40.672648999999886, -74.18437199999994 40.67290999999989, -74.18458799999996 40.67302699999988, -74.18492099999997 40.673269999999896, -74.18503799999996 40.67335999999989, -74.18513699999994 40.673458999999866, -74.18547899999999 40.67390899999987, -74.18594699999994 40.674664999999905, -74.18670299999997 40.67578999999992, -74.18733299999997 40.67674399999987, -74.18767499999996 40.67729299999991, -74.18795399999995 40.67761699999989, -74.18819699999995 40.67792299999992, -74.18852099999998 40.67848099999987, -74.18877299999997 40.67885899999989, -74.18905199999995 40.67933599999985, -74.18935799999997 40.67975899999988, -74.18949299999997 40.680091999999895, -74.18969999999996 40.680793999999885, -74.18977199999995 40.68113599999987, -74.189781 40.681198999999886, -74.18983499999996 40.68131599999987, -74.18991599999998 40.68154099999988, -74.18996999999996 40.6818019999999, -74.18999699999995 40.6822519999999, -74.18999699999995 40.68262999999992, -74.18996999999996 40.68295399999989, -74.18998799999997 40.68317899999989, -74.18995199999995 40.683520999999885, -74.18993399999994 40.68370999999992, -74.189871 40.684078999999876, -74.189781 40.68481699999991, -74.18976299999997 40.68503299999986, -74.18962799999997 40.686103999999915, -74.18955599999998 40.68689599999987, -74.18951999999996 40.6872019999999, -74.18947499999996 40.68748999999985, -74.18939399999994 40.68773299999988, -74.18939399999994 40.68783199999991, -74.18941199999995 40.687939999999855, -74.18940299999997 40.68809299999987, -74.18934899999994 40.68826399999989, -74.18922299999997 40.68862399999989, -74.18898899999994 40.68904699999991, -74.18870099999998 40.689442999999876, -74.18779199999994 40.690189999999866, -74.18723399999999 40.69059499999986, -74.18636999999995 40.69118899999991, -74.18591099999998 40.69144999999988, 
-74.18563199999994 40.69164799999987, -74.18445299999996 40.694995999999904)))
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "FeatureCollection", + [ + "EWR", + "1", + "1", + "0.0007823067885", + "0.116357453189", + "Newark Airport" + ], + "{\"coordinates\":[[[[-74.18445299999996,40.694995999999904],[-74.18448899999999,40.69509499999987],[-74.18449799999996,40.69518499999987],[-74.18438099999997,40.69587799999989],[-74.18428199999994,40.6962109999999],[-74.18402099999997,40.697074999999884],[-74.18391299999996,40.69750699999986],[-74.18375099999997,40.69779499999988],[-74.18363399999998,40.6983259999999],[-74.18356199999994,40.698451999999875],[-74.18354399999998,40.69855999999988],[-74.18350799999996,40.69870399999992],[-74.18327399999998,40.70008999999988],[-74.18315699999994,40.701214999999884],[-74.18316599999997,40.702384999999886],[-74.18313899999998,40.7026279999999],[-74.18309399999998,40.7028529999999],[-74.18299499999995,40.70315899999985],[-74.18284199999994,40.70346499999989],[-74.18264399999998,40.70373499999988],[-74.18242799999996,40.70395099999992],[-74.18220299999996,40.704139999999896],[-74.18203199999994,40.70425699999987],[-74.18180699999994,40.7043919999999],[-74.18157299999996,40.70449999999988],[-74.18132099999997,40.70460799999991],[-74.18080799999996,40.7047879999999],[-74.179467,40.70534599999992],[-74.17887299999995,40.70554399999987],[-74.17831499999994,40.70572399999987],[-74.17776599999996,40.70589499999988],[-74.17709099999996,40.706092999999896],[-74.17699199999998,40.70613799999988],[-74.17689299999995,40.70619199999988],[-74.17664999999994,40.70641699999988],[-74.17642499999994,40.706695999999916],[-74.17628999999994,40.70689399999988],[-74.17608299999995,40.70710999999989],[-74.17599299999995,40.70719099999991],[-74.17589399999997,40.707262999999905],[-74.17565999999
994,40.70737999999988],[-74.17538099999996,40.707469999999915],[-74.17515599999996,40.707514999999894],[-74.17475999999994,40.707595999999924],[-74.17417499999993,40.70766799999991],[-74.17388699999998,40.70773099999992],[-74.17347299999994,40.707748999999865],[-74.17275299999994,40.707802999999906],[-74.17188899999996,40.707910999999854],[-74.17163699999998,40.70795599999986],[-74.17133999999999,40.707964999999895],[-74.17120499999999,40.70795599999986],[-74.16994499999998,40.707973999999886],[-74.16888299999994,40.7079379999999],[-74.16681299999993,40.70785699999989],[-74.16442799999999,40.70779399999987],[-74.16401399999995,40.70777599999992],[-74.16233999999997,40.707721999999876],[-74.16081899999995,40.70764099999991],[-74.16057599999993,40.70760499999988],[-74.16033299999998,40.70756899999987],[-74.160063,40.7074879999999],[-74.15938799999998,40.707262999999905],[-74.15904599999999,40.707145999999916],[-74.15891999999997,40.70710999999989],[-74.15827199999995,40.70687599999993],[-74.15459099999998,40.705651999999894],[-74.15409599999998,40.70544499999989],[-74.15401499999997,40.70538199999988],[-74.15387999999996,40.705327999999895],[-74.15376299999997,40.705408999999875],[-74.15323199999995,40.70524699999987],[-74.15317799999997,40.70531899999989],[-74.15306999999996,40.7052829999999],[-74.15359199999995,40.70437399999987],[-74.15386199999995,40.7038429999999],[-74.15513999999996,40.70155699999987],[-74.15544599999998,40.70108899999988],[-74.15575199999995,40.7006659999999],[-74.15600399999994,40.70026099999991],[-74.15635499999996,40.69975699999986],[-74.15745299999998,40.69809199999988],[-74.15754299999998,40.6979389999999],[-74.15758799999998,40.69781299999988],[-74.15762399999994,40.69767799999991],[-74.15829899999994,40.696705999999885],[-74.15951399999994,40.69488799999988],[-74.15958599999993,40.69476199999984],[-74.16014399999995,40.69410499999988],[-74.16057599999993,40.693222999999875],[-74.16262799999998,40.69028899999989],[-74.16279899999995,40.69
002799999989],[-74.16290699999996,40.68987499999987],[-74.16292499999997,40.689874999999866],[-74.16295199999996,40.689874999999866],[-74.16306899999995,40.68989299999988],[-74.16309599999994,40.689928999999886],[-74.16322199999996,40.68998299999989],[-74.16331199999996,40.68999199999993],[-74.16341099999994,40.69000099999988],[-74.16352799999999,40.69000999999986],[-74.16380699999996,40.69004599999989],[-74.16410399999995,40.690081999999904],[-74.16417599999994,40.690081999999904],[-74.16422999999998,40.69005499999988],[-74.16436499999998,40.69003699999991],[-74.16450899999995,40.68998299999986],[-74.16467099999994,40.68988399999989],[-74.16479699999996,40.689757999999884],[-74.16491399999995,40.689586999999904],[-74.16499499999998,40.689388999999885],[-74.16528299999999,40.68891199999991],[-74.16542699999997,40.6887589999999],[-74.16548099999994,40.68863299999987],[-74.16560699999997,40.68842599999988],[-74.16576899999995,40.68802999999986],[-74.16587699999997,40.68787699999991],[-74.16583199999997,40.68757999999987],[-74.16582299999999,40.68748999999987],[-74.16580499999998,40.687156999999914],[-74.16582299999999,40.68703999999986],[-74.16589499999998,40.6868419999999],[-74.16604799999999,40.68655399999988],[-74.16639899999996,40.686022999999864],[-74.16650699999997,40.68588799999986],[-74.16674099999994,40.685491999999925],[-74.16695699999997,40.68523099999988],[-74.16738899999996,40.684546999999924],[-74.16781199999997,40.6839439999999],[-74.16791099999995,40.68379099999988],[-74.16804599999995,40.68360199999991],[-74.16816299999994,40.683475999999885],[-74.16822599999995,40.68334999999991],[-74.16848699999997,40.68299899999991],[-74.16886499999998,40.68239599999987],[-74.16916199999997,40.68199999999991],[-74.16929699999997,40.68178399999989],[-74.16947699999997,40.68155899999991],[-74.16981899999996,40.681018999999885],[-74.16995399999996,40.680874999999915],[-74.17005299999994,40.68066799999987],[-74.17041299999994,40.6801549999999],[-74.17051199999997,40.67
999299999987],[-74.17067399999996,40.679650999999886],[-74.17093499999999,40.679290999999864],[-74.17144799999994,40.67847199999989],[-74.17151999999999,40.678381999999885],[-74.17160999999999,40.678255999999884],[-74.17193399999996,40.67782399999988],[-74.17200599999995,40.67773399999988],[-74.17283399999997,40.67656399999988],[-74.17314899999997,40.67619499999991],[-74.17322999999999,40.6760779999999],[-74.17329299999994,40.67601499999989],[-74.17358999999993,40.67571799999991],[-74.17423799999995,40.67493499999991],[-74.17437299999995,40.674817999999895],[-74.17484999999994,40.67432299999992],[-74.17500299999995,40.6741699999999],[-74.17538999999995,40.67375599999987],[-74.17604699999998,40.673044999999895],[-74.17630799999995,40.67276599999986],[-74.17641599999996,40.672621999999876],[-74.17663199999998,40.67239699999989],[-74.17678499999994,40.67218099999991],[-74.17697399999997,40.6719379999999],[-74.17709099999996,40.671784999999886],[-74.17734299999995,40.67155999999988],[-74.17754999999994,40.67142499999989],[-74.17778399999997,40.671316999999874],[-74.17802699999999,40.671208999999884],[-74.17862999999994,40.671037999999896],[-74.17888199999999,40.671001999999895],[-74.17912499999994,40.67099299999991],[-74.17933199999999,40.67101099999992],[-74.17979099999997,40.67115499999989],[-74.17997999999994,40.671208999999884],[-74.18010599999997,40.671262999999904],[-74.18030399999998,40.67129899999986],[-74.18133899999998,40.67170399999986],[-74.18213999999996,40.67202799999989],[-74.18384999999995,40.672648999999886],[-74.18437199999994,40.67290999999989],[-74.18458799999996,40.67302699999988],[-74.18492099999997,40.673269999999896],[-74.18503799999996,40.67335999999989],[-74.18513699999994,40.673458999999866],[-74.18547899999999,40.67390899999987],[-74.18594699999994,40.674664999999905],[-74.18670299999997,40.67578999999992],[-74.18733299999997,40.67674399999987],[-74.18767499999996,40.67729299999991],[-74.18795399999995,40.67761699999989],[-74.18819699999995,4
0.67792299999992],[-74.18852099999998,40.67848099999987],[-74.18877299999997,40.67885899999989],[-74.18905199999995,40.67933599999985],[-74.18935799999997,40.67975899999988],[-74.18949299999997,40.680091999999895],[-74.18969999999996,40.680793999999885],[-74.18977199999995,40.68113599999987],[-74.189781,40.681198999999886],[-74.18983499999996,40.68131599999987],[-74.18991599999998,40.68154099999988],[-74.18996999999996,40.6818019999999],[-74.18999699999995,40.6822519999999],[-74.18999699999995,40.68262999999992],[-74.18996999999996,40.68295399999989],[-74.18998799999997,40.68317899999989],[-74.18995199999995,40.683520999999885],[-74.18993399999994,40.68370999999992],[-74.189871,40.684078999999876],[-74.189781,40.68481699999991],[-74.18976299999997,40.68503299999986],[-74.18962799999997,40.686103999999915],[-74.18955599999998,40.68689599999987],[-74.18951999999996,40.6872019999999],[-74.18947499999996,40.68748999999985],[-74.18939399999994,40.68773299999988],[-74.18939399999994,40.68783199999991],[-74.18941199999995,40.687939999999855],[-74.18940299999997,40.68809299999987],[-74.18934899999994,40.68826399999989],[-74.18922299999997,40.68862399999989],[-74.18898899999994,40.68904699999991],[-74.18870099999998,40.689442999999876],[-74.18779199999994,40.690189999999866],[-74.18723399999999,40.69059499999986],[-74.18636999999995,40.69118899999991],[-74.18591099999998,40.69144999999988],[-74.18563199999994,40.69164799999987],[-74.18445299999996,40.694995999999904]]]],\"type\":\"MultiPolygon\"}", + "MULTIPOLYGON (((-74.18445299999996 40.694995999999904, -74.18448899999999 40.69509499999987, -74.18449799999996 40.69518499999987, -74.18438099999997 40.69587799999989, -74.18428199999994 40.6962109999999, -74.18402099999997 40.697074999999884, -74.18391299999996 40.69750699999986, -74.18375099999997 40.69779499999988, -74.18363399999998 40.6983259999999, -74.18356199999994 40.698451999999875, -74.18354399999998 40.69855999999988, -74.18350799999996 40.69870399999992, 
-74.18327399999998 40.70008999999988, -74.18315699999994 40.701214999999884, -74.18316599999997 40.702384999999886, -74.18313899999998 40.7026279999999, -74.18309399999998 40.7028529999999, -74.18299499999995 40.70315899999985, -74.18284199999994 40.70346499999989, -74.18264399999998 40.70373499999988, -74.18242799999996 40.70395099999992, -74.18220299999996 40.704139999999896, -74.18203199999994 40.70425699999987, -74.18180699999994 40.7043919999999, -74.18157299999996 40.70449999999988, -74.18132099999997 40.70460799999991, -74.18080799999996 40.7047879999999, -74.179467 40.70534599999992, -74.17887299999995 40.70554399999987, -74.17831499999994 40.70572399999987, -74.17776599999996 40.70589499999988, -74.17709099999996 40.706092999999896, -74.17699199999998 40.70613799999988, -74.17689299999995 40.70619199999988, -74.17664999999994 40.70641699999988, -74.17642499999994 40.706695999999916, -74.17628999999994 40.70689399999988, -74.17608299999995 40.70710999999989, -74.17599299999995 40.70719099999991, -74.17589399999997 40.707262999999905, -74.17565999999994 40.70737999999988, -74.17538099999996 40.707469999999915, -74.17515599999996 40.707514999999894, -74.17475999999994 40.707595999999924, -74.17417499999993 40.70766799999991, -74.17388699999998 40.70773099999992, -74.17347299999994 40.707748999999865, -74.17275299999994 40.707802999999906, -74.17188899999996 40.707910999999854, -74.17163699999998 40.70795599999986, -74.17133999999999 40.707964999999895, -74.17120499999999 40.70795599999986, -74.16994499999998 40.707973999999886, -74.16888299999994 40.7079379999999, -74.16681299999993 40.70785699999989, -74.16442799999999 40.70779399999987, -74.16401399999995 40.70777599999992, -74.16233999999997 40.707721999999876, -74.16081899999995 40.70764099999991, -74.16057599999993 40.70760499999988, -74.16033299999998 40.70756899999987, -74.160063 40.7074879999999, -74.15938799999998 40.707262999999905, -74.15904599999999 40.707145999999916, -74.15891999999997 
40.70710999999989, -74.15827199999995 40.70687599999993, -74.15459099999998 40.705651999999894, -74.15409599999998 40.70544499999989, -74.15401499999997 40.70538199999988, -74.15387999999996 40.705327999999895, -74.15376299999997 40.705408999999875, -74.15323199999995 40.70524699999987, -74.15317799999997 40.70531899999989, -74.15306999999996 40.7052829999999, -74.15359199999995 40.70437399999987, -74.15386199999995 40.7038429999999, -74.15513999999996 40.70155699999987, -74.15544599999998 40.70108899999988, -74.15575199999995 40.7006659999999, -74.15600399999994 40.70026099999991, -74.15635499999996 40.69975699999986, -74.15745299999998 40.69809199999988, -74.15754299999998 40.6979389999999, -74.15758799999998 40.69781299999988, -74.15762399999994 40.69767799999991, -74.15829899999994 40.696705999999885, -74.15951399999994 40.69488799999988, -74.15958599999993 40.69476199999984, -74.16014399999995 40.69410499999988, -74.16057599999993 40.693222999999875, -74.16262799999998 40.69028899999989, -74.16279899999995 40.69002799999989, -74.16290699999996 40.68987499999987, -74.16292499999997 40.689874999999866, -74.16295199999996 40.689874999999866, -74.16306899999995 40.68989299999988, -74.16309599999994 40.689928999999886, -74.16322199999996 40.68998299999989, -74.16331199999996 40.68999199999993, -74.16341099999994 40.69000099999988, -74.16352799999999 40.69000999999986, -74.16380699999996 40.69004599999989, -74.16410399999995 40.690081999999904, -74.16417599999994 40.690081999999904, -74.16422999999998 40.69005499999988, -74.16436499999998 40.69003699999991, -74.16450899999995 40.68998299999986, -74.16467099999994 40.68988399999989, -74.16479699999996 40.689757999999884, -74.16491399999995 40.689586999999904, -74.16499499999998 40.689388999999885, -74.16528299999999 40.68891199999991, -74.16542699999997 40.6887589999999, -74.16548099999994 40.68863299999987, -74.16560699999997 40.68842599999988, -74.16576899999995 40.68802999999986, -74.16587699999997 
40.68787699999991, -74.16583199999997 40.68757999999987, -74.16582299999999 40.68748999999987, -74.16580499999998 40.687156999999914, -74.16582299999999 40.68703999999986, -74.16589499999998 40.6868419999999, -74.16604799999999 40.68655399999988, -74.16639899999996 40.686022999999864, -74.16650699999997 40.68588799999986, -74.16674099999994 40.685491999999925, -74.16695699999997 40.68523099999988, -74.16738899999996 40.684546999999924, -74.16781199999997 40.6839439999999, -74.16791099999995 40.68379099999988, -74.16804599999995 40.68360199999991, -74.16816299999994 40.683475999999885, -74.16822599999995 40.68334999999991, -74.16848699999997 40.68299899999991, -74.16886499999998 40.68239599999987, -74.16916199999997 40.68199999999991, -74.16929699999997 40.68178399999989, -74.16947699999997 40.68155899999991, -74.16981899999996 40.681018999999885, -74.16995399999996 40.680874999999915, -74.17005299999994 40.68066799999987, -74.17041299999994 40.6801549999999, -74.17051199999997 40.67999299999987, -74.17067399999996 40.679650999999886, -74.17093499999999 40.679290999999864, -74.17144799999994 40.67847199999989, -74.17151999999999 40.678381999999885, -74.17160999999999 40.678255999999884, -74.17193399999996 40.67782399999988, -74.17200599999995 40.67773399999988, -74.17283399999997 40.67656399999988, -74.17314899999997 40.67619499999991, -74.17322999999999 40.6760779999999, -74.17329299999994 40.67601499999989, -74.17358999999993 40.67571799999991, -74.17423799999995 40.67493499999991, -74.17437299999995 40.674817999999895, -74.17484999999994 40.67432299999992, -74.17500299999995 40.6741699999999, -74.17538999999995 40.67375599999987, -74.17604699999998 40.673044999999895, -74.17630799999995 40.67276599999986, -74.17641599999996 40.672621999999876, -74.17663199999998 40.67239699999989, -74.17678499999994 40.67218099999991, -74.17697399999997 40.6719379999999, -74.17709099999996 40.671784999999886, -74.17734299999995 40.67155999999988, -74.17754999999994 
40.67142499999989, -74.17778399999997 40.671316999999874, -74.17802699999999 40.671208999999884, -74.17862999999994 40.671037999999896, -74.17888199999999 40.671001999999895, -74.17912499999994 40.67099299999991, -74.17933199999999 40.67101099999992, -74.17979099999997 40.67115499999989, -74.17997999999994 40.671208999999884, -74.18010599999997 40.671262999999904, -74.18030399999998 40.67129899999986, -74.18133899999998 40.67170399999986, -74.18213999999996 40.67202799999989, -74.18384999999995 40.672648999999886, -74.18437199999994 40.67290999999989, -74.18458799999996 40.67302699999988, -74.18492099999997 40.673269999999896, -74.18503799999996 40.67335999999989, -74.18513699999994 40.673458999999866, -74.18547899999999 40.67390899999987, -74.18594699999994 40.674664999999905, -74.18670299999997 40.67578999999992, -74.18733299999997 40.67674399999987, -74.18767499999996 40.67729299999991, -74.18795399999995 40.67761699999989, -74.18819699999995 40.67792299999992, -74.18852099999998 40.67848099999987, -74.18877299999997 40.67885899999989, -74.18905199999995 40.67933599999985, -74.18935799999997 40.67975899999988, -74.18949299999997 40.680091999999895, -74.18969999999996 40.680793999999885, -74.18977199999995 40.68113599999987, -74.189781 40.681198999999886, -74.18983499999996 40.68131599999987, -74.18991599999998 40.68154099999988, -74.18996999999996 40.6818019999999, -74.18999699999995 40.6822519999999, -74.18999699999995 40.68262999999992, -74.18996999999996 40.68295399999989, -74.18998799999997 40.68317899999989, -74.18995199999995 40.683520999999885, -74.18993399999994 40.68370999999992, -74.189871 40.684078999999876, -74.189781 40.68481699999991, -74.18976299999997 40.68503299999986, -74.18962799999997 40.686103999999915, -74.18955599999998 40.68689599999987, -74.18951999999996 40.6872019999999, -74.18947499999996 40.68748999999985, -74.18939399999994 40.68773299999988, -74.18939399999994 40.68783199999991, -74.18941199999995 40.687939999999855, 
-74.18940299999997 40.68809299999987, -74.18934899999994 40.68826399999989, -74.18922299999997 40.68862399999989, -74.18898899999994 40.68904699999991, -74.18870099999998 40.689442999999876, -74.18779199999994 40.690189999999866, -74.18723399999999 40.69059499999986, -74.18636999999995 40.69118899999991, -74.18591099999998 40.69144999999988, -74.18563199999994 40.69164799999987, -74.18445299999996 40.694995999999904)))" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "type", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "properties", + "type": "{\"type\":\"struct\",\"fields\":[{\"name\":\"borough\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"location_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"objectid\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"shape_area\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"shape_leng\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"zone\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}" + }, + { + "metadata": "{}", + "name": "json_geometry", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "geometry", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql \n", + "-- limiting for ipynb only\n", + "select * from neighbourhoods limit 1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3738db02-9bc9-437b-a723-00c438a6fb87", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Compute some basic 
geometry attributes\n", + "\n", + "> Mosaic provides a number of functions for extracting the properties of geometries. Here are some that are relevant to Polygon geometries:" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c7103a81-672b-4427-ab98-89e8cec0c094", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
geometrycalculatedAreacalculatedLength
MULTIPOLYGON (((-74.18445299999996 40.694995999999904, -74.18448899999999 40.69509499999987, -74.18449799999996 40.69518499999987, -74.18438099999997 40.69587799999989, -74.18428199999994 40.6962109999999, -74.18402099999997 40.697074999999884, -74.18391299999996 40.69750699999986, -74.18375099999997 40.69779499999988, -74.18363399999998 40.6983259999999, -74.18356199999994 40.698451999999875, -74.18354399999998 40.69855999999988, -74.18350799999996 40.69870399999992, -74.18327399999998 40.70008999999988, -74.18315699999994 40.701214999999884, -74.18316599999997 40.702384999999886, -74.18313899999998 40.7026279999999, -74.18309399999998 40.7028529999999, -74.18299499999995 40.70315899999985, -74.18284199999994 40.70346499999989, -74.18264399999998 40.70373499999988, -74.18242799999996 40.70395099999992, -74.18220299999996 40.704139999999896, -74.18203199999994 40.70425699999987, -74.18180699999994 40.7043919999999, -74.18157299999996 40.70449999999988, -74.18132099999997 40.70460799999991, -74.18080799999996 40.7047879999999, -74.179467 40.70534599999992, -74.17887299999995 40.70554399999987, -74.17831499999994 40.70572399999987, -74.17776599999996 40.70589499999988, -74.17709099999996 40.706092999999896, -74.17699199999998 40.70613799999988, -74.17689299999995 40.70619199999988, -74.17664999999994 40.70641699999988, -74.17642499999994 40.706695999999916, -74.17628999999994 40.70689399999988, -74.17608299999995 40.70710999999989, -74.17599299999995 40.70719099999991, -74.17589399999997 40.707262999999905, -74.17565999999994 40.70737999999988, -74.17538099999996 40.707469999999915, -74.17515599999996 40.707514999999894, -74.17475999999994 40.707595999999924, -74.17417499999993 40.70766799999991, -74.17388699999998 40.70773099999992, -74.17347299999994 40.707748999999865, -74.17275299999994 40.707802999999906, -74.17188899999996 40.707910999999854, -74.17163699999998 40.70795599999986, -74.17133999999999 40.707964999999895, -74.17120499999999 40.70795599999986, 
-74.16994499999998 40.707973999999886, -74.16888299999994 40.7079379999999, -74.16681299999993 40.70785699999989, -74.16442799999999 40.70779399999987, -74.16401399999995 40.70777599999992, -74.16233999999997 40.707721999999876, -74.16081899999995 40.70764099999991, -74.16057599999993 40.70760499999988, -74.16033299999998 40.70756899999987, -74.160063 40.7074879999999, -74.15938799999998 40.707262999999905, -74.15904599999999 40.707145999999916, -74.15891999999997 40.70710999999989, -74.15827199999995 40.70687599999993, -74.15459099999998 40.705651999999894, -74.15409599999998 40.70544499999989, -74.15401499999997 40.70538199999988, -74.15387999999996 40.705327999999895, -74.15376299999997 40.705408999999875, -74.15323199999995 40.70524699999987, -74.15317799999997 40.70531899999989, -74.15306999999996 40.7052829999999, -74.15359199999995 40.70437399999987, -74.15386199999995 40.7038429999999, -74.15513999999996 40.70155699999987, -74.15544599999998 40.70108899999988, -74.15575199999995 40.7006659999999, -74.15600399999994 40.70026099999991, -74.15635499999996 40.69975699999986, -74.15745299999998 40.69809199999988, -74.15754299999998 40.6979389999999, -74.15758799999998 40.69781299999988, -74.15762399999994 40.69767799999991, -74.15829899999994 40.696705999999885, -74.15951399999994 40.69488799999988, -74.15958599999993 40.69476199999984, -74.16014399999995 40.69410499999988, -74.16057599999993 40.693222999999875, -74.16262799999998 40.69028899999989, -74.16279899999995 40.69002799999989, -74.16290699999996 40.68987499999987, -74.16292499999997 40.689874999999866, -74.16295199999996 40.689874999999866, -74.16306899999995 40.68989299999988, -74.16309599999994 40.689928999999886, -74.16322199999996 40.68998299999989, -74.16331199999996 40.68999199999993, -74.16341099999994 40.69000099999988, -74.16352799999999 40.69000999999986, -74.16380699999996 40.69004599999989, -74.16410399999995 40.690081999999904, -74.16417599999994 40.690081999999904, -74.16422999999998 
40.69005499999988, -74.16436499999998 40.69003699999991, -74.16450899999995 40.68998299999986, -74.16467099999994 40.68988399999989, -74.16479699999996 40.689757999999884, -74.16491399999995 40.689586999999904, -74.16499499999998 40.689388999999885, -74.16528299999999 40.68891199999991, -74.16542699999997 40.6887589999999, -74.16548099999994 40.68863299999987, -74.16560699999997 40.68842599999988, -74.16576899999995 40.68802999999986, -74.16587699999997 40.68787699999991, -74.16583199999997 40.68757999999987, -74.16582299999999 40.68748999999987, -74.16580499999998 40.687156999999914, -74.16582299999999 40.68703999999986, -74.16589499999998 40.6868419999999, -74.16604799999999 40.68655399999988, -74.16639899999996 40.686022999999864, -74.16650699999997 40.68588799999986, -74.16674099999994 40.685491999999925, -74.16695699999997 40.68523099999988, -74.16738899999996 40.684546999999924, -74.16781199999997 40.6839439999999, -74.16791099999995 40.68379099999988, -74.16804599999995 40.68360199999991, -74.16816299999994 40.683475999999885, -74.16822599999995 40.68334999999991, -74.16848699999997 40.68299899999991, -74.16886499999998 40.68239599999987, -74.16916199999997 40.68199999999991, -74.16929699999997 40.68178399999989, -74.16947699999997 40.68155899999991, -74.16981899999996 40.681018999999885, -74.16995399999996 40.680874999999915, -74.17005299999994 40.68066799999987, -74.17041299999994 40.6801549999999, -74.17051199999997 40.67999299999987, -74.17067399999996 40.679650999999886, -74.17093499999999 40.679290999999864, -74.17144799999994 40.67847199999989, -74.17151999999999 40.678381999999885, -74.17160999999999 40.678255999999884, -74.17193399999996 40.67782399999988, -74.17200599999995 40.67773399999988, -74.17283399999997 40.67656399999988, -74.17314899999997 40.67619499999991, -74.17322999999999 40.6760779999999, -74.17329299999994 40.67601499999989, -74.17358999999993 40.67571799999991, -74.17423799999995 40.67493499999991, -74.17437299999995 
40.674817999999895, -74.17484999999994 40.67432299999992, -74.17500299999995 40.6741699999999, -74.17538999999995 40.67375599999987, -74.17604699999998 40.673044999999895, -74.17630799999995 40.67276599999986, -74.17641599999996 40.672621999999876, -74.17663199999998 40.67239699999989, -74.17678499999994 40.67218099999991, -74.17697399999997 40.6719379999999, -74.17709099999996 40.671784999999886, -74.17734299999995 40.67155999999988, -74.17754999999994 40.67142499999989, -74.17778399999997 40.671316999999874, -74.17802699999999 40.671208999999884, -74.17862999999994 40.671037999999896, -74.17888199999999 40.671001999999895, -74.17912499999994 40.67099299999991, -74.17933199999999 40.67101099999992, -74.17979099999997 40.67115499999989, -74.17997999999994 40.671208999999884, -74.18010599999997 40.671262999999904, -74.18030399999998 40.67129899999986, -74.18133899999998 40.67170399999986, -74.18213999999996 40.67202799999989, -74.18384999999995 40.672648999999886, -74.18437199999994 40.67290999999989, -74.18458799999996 40.67302699999988, -74.18492099999997 40.673269999999896, -74.18503799999996 40.67335999999989, -74.18513699999994 40.673458999999866, -74.18547899999999 40.67390899999987, -74.18594699999994 40.674664999999905, -74.18670299999997 40.67578999999992, -74.18733299999997 40.67674399999987, -74.18767499999996 40.67729299999991, -74.18795399999995 40.67761699999989, -74.18819699999995 40.67792299999992, -74.18852099999998 40.67848099999987, -74.18877299999997 40.67885899999989, -74.18905199999995 40.67933599999985, -74.18935799999997 40.67975899999988, -74.18949299999997 40.680091999999895, -74.18969999999996 40.680793999999885, -74.18977199999995 40.68113599999987, -74.189781 40.681198999999886, -74.18983499999996 40.68131599999987, -74.18991599999998 40.68154099999988, -74.18996999999996 40.6818019999999, -74.18999699999995 40.6822519999999, -74.18999699999995 40.68262999999992, -74.18996999999996 40.68295399999989, -74.18998799999997 40.68317899999989, 
-74.18995199999995 40.683520999999885, -74.18993399999994 40.68370999999992, -74.189871 40.684078999999876, -74.189781 40.68481699999991, -74.18976299999997 40.68503299999986, -74.18962799999997 40.686103999999915, -74.18955599999998 40.68689599999987, -74.18951999999996 40.6872019999999, -74.18947499999996 40.68748999999985, -74.18939399999994 40.68773299999988, -74.18939399999994 40.68783199999991, -74.18941199999995 40.687939999999855, -74.18940299999997 40.68809299999987, -74.18934899999994 40.68826399999989, -74.18922299999997 40.68862399999989, -74.18898899999994 40.68904699999991, -74.18870099999998 40.689442999999876, -74.18779199999994 40.690189999999866, -74.18723399999999 40.69059499999986, -74.18636999999995 40.69118899999991, -74.18591099999998 40.69144999999988, -74.18563199999994 40.69164799999987, -74.18445299999996 40.694995999999904)))7.823067885002558E-40.11635745318867867
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "MULTIPOLYGON (((-74.18445299999996 40.694995999999904, -74.18448899999999 40.69509499999987, -74.18449799999996 40.69518499999987, -74.18438099999997 40.69587799999989, -74.18428199999994 40.6962109999999, -74.18402099999997 40.697074999999884, -74.18391299999996 40.69750699999986, -74.18375099999997 40.69779499999988, -74.18363399999998 40.6983259999999, -74.18356199999994 40.698451999999875, -74.18354399999998 40.69855999999988, -74.18350799999996 40.69870399999992, -74.18327399999998 40.70008999999988, -74.18315699999994 40.701214999999884, -74.18316599999997 40.702384999999886, -74.18313899999998 40.7026279999999, -74.18309399999998 40.7028529999999, -74.18299499999995 40.70315899999985, -74.18284199999994 40.70346499999989, -74.18264399999998 40.70373499999988, -74.18242799999996 40.70395099999992, -74.18220299999996 40.704139999999896, -74.18203199999994 40.70425699999987, -74.18180699999994 40.7043919999999, -74.18157299999996 40.70449999999988, -74.18132099999997 40.70460799999991, -74.18080799999996 40.7047879999999, -74.179467 40.70534599999992, -74.17887299999995 40.70554399999987, -74.17831499999994 40.70572399999987, -74.17776599999996 40.70589499999988, -74.17709099999996 40.706092999999896, -74.17699199999998 40.70613799999988, -74.17689299999995 40.70619199999988, -74.17664999999994 40.70641699999988, -74.17642499999994 40.706695999999916, -74.17628999999994 40.70689399999988, -74.17608299999995 40.70710999999989, -74.17599299999995 40.70719099999991, -74.17589399999997 40.707262999999905, -74.17565999999994 40.70737999999988, -74.17538099999996 40.707469999999915, -74.17515599999996 40.707514999999894, -74.17475999999994 40.707595999999924, -74.17417499999993 
40.70766799999991, -74.17388699999998 40.70773099999992, -74.17347299999994 40.707748999999865, -74.17275299999994 40.707802999999906, -74.17188899999996 40.707910999999854, -74.17163699999998 40.70795599999986, -74.17133999999999 40.707964999999895, -74.17120499999999 40.70795599999986, -74.16994499999998 40.707973999999886, -74.16888299999994 40.7079379999999, -74.16681299999993 40.70785699999989, -74.16442799999999 40.70779399999987, -74.16401399999995 40.70777599999992, -74.16233999999997 40.707721999999876, -74.16081899999995 40.70764099999991, -74.16057599999993 40.70760499999988, -74.16033299999998 40.70756899999987, -74.160063 40.7074879999999, -74.15938799999998 40.707262999999905, -74.15904599999999 40.707145999999916, -74.15891999999997 40.70710999999989, -74.15827199999995 40.70687599999993, -74.15459099999998 40.705651999999894, -74.15409599999998 40.70544499999989, -74.15401499999997 40.70538199999988, -74.15387999999996 40.705327999999895, -74.15376299999997 40.705408999999875, -74.15323199999995 40.70524699999987, -74.15317799999997 40.70531899999989, -74.15306999999996 40.7052829999999, -74.15359199999995 40.70437399999987, -74.15386199999995 40.7038429999999, -74.15513999999996 40.70155699999987, -74.15544599999998 40.70108899999988, -74.15575199999995 40.7006659999999, -74.15600399999994 40.70026099999991, -74.15635499999996 40.69975699999986, -74.15745299999998 40.69809199999988, -74.15754299999998 40.6979389999999, -74.15758799999998 40.69781299999988, -74.15762399999994 40.69767799999991, -74.15829899999994 40.696705999999885, -74.15951399999994 40.69488799999988, -74.15958599999993 40.69476199999984, -74.16014399999995 40.69410499999988, -74.16057599999993 40.693222999999875, -74.16262799999998 40.69028899999989, -74.16279899999995 40.69002799999989, -74.16290699999996 40.68987499999987, -74.16292499999997 40.689874999999866, -74.16295199999996 40.689874999999866, -74.16306899999995 40.68989299999988, -74.16309599999994 40.689928999999886, 
-74.16322199999996 40.68998299999989, -74.16331199999996 40.68999199999993, -74.16341099999994 40.69000099999988, -74.16352799999999 40.69000999999986, -74.16380699999996 40.69004599999989, -74.16410399999995 40.690081999999904, -74.16417599999994 40.690081999999904, -74.16422999999998 40.69005499999988, -74.16436499999998 40.69003699999991, -74.16450899999995 40.68998299999986, -74.16467099999994 40.68988399999989, -74.16479699999996 40.689757999999884, -74.16491399999995 40.689586999999904, -74.16499499999998 40.689388999999885, -74.16528299999999 40.68891199999991, -74.16542699999997 40.6887589999999, -74.16548099999994 40.68863299999987, -74.16560699999997 40.68842599999988, -74.16576899999995 40.68802999999986, -74.16587699999997 40.68787699999991, -74.16583199999997 40.68757999999987, -74.16582299999999 40.68748999999987, -74.16580499999998 40.687156999999914, -74.16582299999999 40.68703999999986, -74.16589499999998 40.6868419999999, -74.16604799999999 40.68655399999988, -74.16639899999996 40.686022999999864, -74.16650699999997 40.68588799999986, -74.16674099999994 40.685491999999925, -74.16695699999997 40.68523099999988, -74.16738899999996 40.684546999999924, -74.16781199999997 40.6839439999999, -74.16791099999995 40.68379099999988, -74.16804599999995 40.68360199999991, -74.16816299999994 40.683475999999885, -74.16822599999995 40.68334999999991, -74.16848699999997 40.68299899999991, -74.16886499999998 40.68239599999987, -74.16916199999997 40.68199999999991, -74.16929699999997 40.68178399999989, -74.16947699999997 40.68155899999991, -74.16981899999996 40.681018999999885, -74.16995399999996 40.680874999999915, -74.17005299999994 40.68066799999987, -74.17041299999994 40.6801549999999, -74.17051199999997 40.67999299999987, -74.17067399999996 40.679650999999886, -74.17093499999999 40.679290999999864, -74.17144799999994 40.67847199999989, -74.17151999999999 40.678381999999885, -74.17160999999999 40.678255999999884, -74.17193399999996 40.67782399999988, 
-74.17200599999995 40.67773399999988, -74.17283399999997 40.67656399999988, -74.17314899999997 40.67619499999991, -74.17322999999999 40.6760779999999, -74.17329299999994 40.67601499999989, -74.17358999999993 40.67571799999991, -74.17423799999995 40.67493499999991, -74.17437299999995 40.674817999999895, -74.17484999999994 40.67432299999992, -74.17500299999995 40.6741699999999, -74.17538999999995 40.67375599999987, -74.17604699999998 40.673044999999895, -74.17630799999995 40.67276599999986, -74.17641599999996 40.672621999999876, -74.17663199999998 40.67239699999989, -74.17678499999994 40.67218099999991, -74.17697399999997 40.6719379999999, -74.17709099999996 40.671784999999886, -74.17734299999995 40.67155999999988, -74.17754999999994 40.67142499999989, -74.17778399999997 40.671316999999874, -74.17802699999999 40.671208999999884, -74.17862999999994 40.671037999999896, -74.17888199999999 40.671001999999895, -74.17912499999994 40.67099299999991, -74.17933199999999 40.67101099999992, -74.17979099999997 40.67115499999989, -74.17997999999994 40.671208999999884, -74.18010599999997 40.671262999999904, -74.18030399999998 40.67129899999986, -74.18133899999998 40.67170399999986, -74.18213999999996 40.67202799999989, -74.18384999999995 40.672648999999886, -74.18437199999994 40.67290999999989, -74.18458799999996 40.67302699999988, -74.18492099999997 40.673269999999896, -74.18503799999996 40.67335999999989, -74.18513699999994 40.673458999999866, -74.18547899999999 40.67390899999987, -74.18594699999994 40.674664999999905, -74.18670299999997 40.67578999999992, -74.18733299999997 40.67674399999987, -74.18767499999996 40.67729299999991, -74.18795399999995 40.67761699999989, -74.18819699999995 40.67792299999992, -74.18852099999998 40.67848099999987, -74.18877299999997 40.67885899999989, -74.18905199999995 40.67933599999985, -74.18935799999997 40.67975899999988, -74.18949299999997 40.680091999999895, -74.18969999999996 40.680793999999885, -74.18977199999995 40.68113599999987, -74.189781 
40.681198999999886, -74.18983499999996 40.68131599999987, -74.18991599999998 40.68154099999988, -74.18996999999996 40.6818019999999, -74.18999699999995 40.6822519999999, -74.18999699999995 40.68262999999992, -74.18996999999996 40.68295399999989, -74.18998799999997 40.68317899999989, -74.18995199999995 40.683520999999885, -74.18993399999994 40.68370999999992, -74.189871 40.684078999999876, -74.189781 40.68481699999991, -74.18976299999997 40.68503299999986, -74.18962799999997 40.686103999999915, -74.18955599999998 40.68689599999987, -74.18951999999996 40.6872019999999, -74.18947499999996 40.68748999999985, -74.18939399999994 40.68773299999988, -74.18939399999994 40.68783199999991, -74.18941199999995 40.687939999999855, -74.18940299999997 40.68809299999987, -74.18934899999994 40.68826399999989, -74.18922299999997 40.68862399999989, -74.18898899999994 40.68904699999991, -74.18870099999998 40.689442999999876, -74.18779199999994 40.690189999999866, -74.18723399999999 40.69059499999986, -74.18636999999995 40.69118899999991, -74.18591099999998 40.69144999999988, -74.18563199999994 40.69164799999987, -74.18445299999996 40.694995999999904)))", + 7.823067885002558E-4, + 0.11635745318867867 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "geometry", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "calculatedArea", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "calculatedLength", + "type": "\"double\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql\n", + "-- limiting for ipynb only\n", + "select \n", + " geometry, \n", + " st_area(geometry) as calculatedArea, \n", + " st_length(geometry) as 
calculatedLength \n", + "from neighbourhoods \n", + "limit 1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "704024a0-8171-4218-a95e-ff8ef6ace37c", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Initial Trips Data [Points]\n", + "\n", + "> We will load some Taxi trips data to represent point data; this data is coming from Databricks public datasets available in your environment. __Note: this is 1.6 billion trips as-is; while it is no problem to process this, to keep this to a quickstart level, we are going to use just 1/10th of 1% or ~1.6 million.__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "983adfbd-0bb4-43e7-941d-55fb8844b306", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
count
1,611,203
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "1,611,203" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "count", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql \n", + "create or replace temp view trips as (\n", + " select \n", + " xxhash64(pickup_datetime, dropoff_datetime, pickup_geom, dropoff_geom) as row_id, *\n", + " from (\n", + " select \n", + " trip_distance,\n", + " pickup_datetime,\n", + " dropoff_datetime,\n", + " st_astext(st_point(pickup_longitude, pickup_latitude)) as pickup_geom,\n", + " st_astext(st_point(dropoff_longitude, dropoff_latitude)) as dropoff_geom,\n", + " total_amount\n", + " from delta.`/databricks-datasets/nyctaxi/tables/nyctaxi_yellow`\n", + " tablesample (0.1 percent) repeatable (123)\n", + " )\n", + ");\n", + "\n", + "select format_number(count(1),0) as count from trips;" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d905fbe9-5104-4a09-bdee-4b5adba23bcf", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Spatial Joins\n", + "\n", + "> We can use Mosaic to perform spatial joins both with and without Mosaic indexing strategies. Indexing is very important when handling very different geometries both in size and in shape (ie. number of vertices)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e9ff1fa2-ca0b-4472-8c8a-1b317da11e76", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Getting the optimal resolution\n", + "\n", + "> We can use Mosaic functionality to identify how to best index our data based on the data inside the specific dataframe. Selecting an appropriate indexing resolution can have a considerable impact on the performance." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "2bff2755-9a4f-481b-a00f-93e0fd2ebd8a", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Optimal resolution is 9\n" + ] + } + ], + "source": [ + "from mosaic import MosaicFrame\n", + "\n", + "neighbourhoods_mosaic_frame = MosaicFrame(spark.table(\"neighbourhoods\"), \"geometry\")\n", + "optimal_resolution = neighbourhoods_mosaic_frame.get_optimal_resolution(sample_fraction=0.75)\n", + "\n", + "print(f\"Optimal resolution is {optimal_resolution}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "b28464a3-f420-4264-b58b-a7e7d79329ad", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> Not every resolution will yield performance improvements. As a rule of thumb, it is better to under-index than over-index - if unsure, select a lower resolution. Higher resolutions are needed when we have very imbalanced geometries with respect to their size or with respect to the number of vertices. In such cases, indexing with more indices will considerably increase the parallel nature of the operations. 
You can think of Mosaic as a way to partition an overly complex row into multiple rows that have a balanced amount of computation each." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6129062a-c951-4e1d-aac7-e146225dc9d8", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
resolutionmean_index_areamean_geometry_areapercentile_25_geometry_areapercentile_50_geometry_areapercentile_75_geometry_area
101.613031989295642E-6209.2813735025822366.88836237217488140.47899017489053277.457697890302
91.129122336468363E-529.897340555608889.55548081312268720.06842816384189739.63681595151253
112.3043288507855827E-71464.9712436152888468.2190572805376983.35402474692811942.206045030451
87.90392229801954E-54.2710130172709331.36505729876610032.86689439030573875.662355037138002
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 10, + 1.613031989295642E-6, + 209.28137350258223, + 66.88836237217488, + 140.47899017489053, + 277.457697890302 + ], + [ + 9, + 1.129122336468363E-5, + 29.89734055560888, + 9.555480813122687, + 20.068428163841897, + 39.63681595151253 + ], + [ + 11, + 2.3043288507855827E-7, + 1464.9712436152888, + 468.2190572805376, + 983.3540247469281, + 1942.206045030451 + ], + [ + 8, + 7.90392229801954E-5, + 4.271013017270933, + 1.3650572987661003, + 2.8668943903057387, + 5.662355037138002 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "resolution", + "type": "\"integer\"" + }, + { + "metadata": "{}", + "name": "mean_index_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "mean_geometry_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "percentile_25_geometry_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "percentile_50_geometry_area", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "percentile_75_geometry_area", + "type": "\"double\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "display(\n", + " neighbourhoods_mosaic_frame.get_resolution_metrics(sample_rows=150)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "eb868752-f425-4c70-aab9-9a7f0d45f049", + 
"showTitle": false, + "title": "" + } + }, + "source": [ + "### Indexing using the optimal resolution\n", + "\n", + "> We will use mosaic sql functions to index our points data. Here we will use resolution 9, index resolution depends on the dataset in use." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e0e73687-eda2-46de-8d86-e38375d38b58", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
row_idpickup_h3dropoff_h3trip_distancepickup_datetimedropoff_datetimepickup_geomdropoff_geomtotal_amounttrip_line
89727808757135120736177331225768755196177331238747504631.072009-09-29T10:59:00.000+00002009-09-29T11:07:00.000+0000POINT (-73.956758 40.769978)POINT (-73.97026 40.765078)6.1LINESTRING (-73.956758 40.769978, -73.97026 40.765078)
45956017075756627936177331243885527036177331238658375678.82009-09-15T10:31:04.000+00002009-09-15T11:06:37.000+0000POINT (-73.872786 40.774201)POINT (-73.974646 40.760039)26.1LINESTRING (-73.872786 40.774201, -73.974646 40.760039)
31377993212758462316177331226439843836177331226521108471.12009-09-17T07:02:38.000+00002009-09-17T07:07:17.000+0000POINT (-73.94527 40.786908)POINT (-73.938264 40.797143)5.3LINESTRING (-73.94527 40.786908, -73.938264 40.797143)
-444650668213766046177331225870991356177331225750405111.42009-09-01T11:52:04.000+00002009-09-01T11:59:17.000+0000POINT (-73.967334 40.788187)POINT (-73.955356 40.78257)6.1LINESTRING (-73.967334 40.788187, -73.955356 40.78257)
-23003782924435447556177331225760890876177331238658375671.52009-09-28T09:09:40.000+00002009-09-28T09:17:04.000+0000POINT (-73.956012 40.772426)POINT (-73.972197 40.759981)6.9LINESTRING (-73.956012 40.772426, -73.972197 40.759981)
-44992069449022164136177331225760890876177331243576197114.32009-09-05T01:47:07.000+00002009-09-05T01:58:06.000+0000POINT (-73.956822 40.770955)POINT (-73.922656 40.767439)11.7LINESTRING (-73.956822 40.770955, -73.922656 40.767439)
56578353006779902296177331225742540796177331238380503031.12009-09-18T09:07:45.000+00002009-09-18T09:18:32.000+0000POINT (-73.955658 40.776628)POINT (-73.958469 40.766472)7.3LINESTRING (-73.955658 40.776628, -73.958469 40.766472)
82566615274726672976177331225852641276177331238726533113.12009-09-19T19:53:31.000+00002009-09-19T20:08:56.000+0000POINT (-73.948588 40.773806)POINT (-73.985448 40.756029)10.9LINESTRING (-73.948588 40.773806, -73.985448 40.756029)
91917781604722246046177331225742540796177331510879191033.862009-09-28T20:11:00.000+00002009-09-28T20:29:00.000+0000POINT (-73.95882 40.777945)POINT (-73.992457 40.7305)15.4LINESTRING (-73.95882 40.777945, -73.992457 40.7305)
-25326786790545572946177331225829048316177331509728378873.172009-09-19T11:58:00.000+00002009-09-19T12:15:00.000+0000POINT (-73.954202 40.7845)POINT (-73.987672 40.757067)11.3LINESTRING (-73.954202 40.7845, -73.987672 40.757067)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 8972780875713512073, + 617733122576875519, + 617733123874750463, + 1.07, + "2009-09-29T10:59:00.000+0000", + "2009-09-29T11:07:00.000+0000", + "POINT (-73.956758 40.769978)", + "POINT (-73.97026 40.765078)", + 6.1, + "LINESTRING (-73.956758 40.769978, -73.97026 40.765078)" + ], + [ + 4595601707575662793, + 617733124388552703, + 617733123865837567, + 8.8, + "2009-09-15T10:31:04.000+0000", + "2009-09-15T11:06:37.000+0000", + "POINT (-73.872786 40.774201)", + "POINT (-73.974646 40.760039)", + 26.1, + "LINESTRING (-73.872786 40.774201, -73.974646 40.760039)" + ], + [ + 3137799321275846231, + 617733122643984383, + 617733122652110847, + 1.1, + "2009-09-17T07:02:38.000+0000", + "2009-09-17T07:07:17.000+0000", + "POINT (-73.94527 40.786908)", + "POINT (-73.938264 40.797143)", + 5.3, + "LINESTRING (-73.94527 40.786908, -73.938264 40.797143)" + ], + [ + -44465066821376604, + 617733122587099135, + 617733122575040511, + 1.4, + "2009-09-01T11:52:04.000+0000", + "2009-09-01T11:59:17.000+0000", + "POINT (-73.967334 40.788187)", + "POINT (-73.955356 40.78257)", + 6.1, + "LINESTRING (-73.967334 40.788187, -73.955356 40.78257)" + ], + [ + -2300378292443544755, + 617733122576089087, + 617733123865837567, + 1.5, + "2009-09-28T09:09:40.000+0000", + "2009-09-28T09:17:04.000+0000", + "POINT (-73.956012 40.772426)", + "POINT (-73.972197 40.759981)", + 6.9, + "LINESTRING (-73.956012 40.772426, -73.972197 40.759981)" + ], + [ + -4499206944902216413, + 617733122576089087, + 617733124357619711, + 4.3, + "2009-09-05T01:47:07.000+0000", + "2009-09-05T01:58:06.000+0000", + "POINT (-73.956822 40.770955)", + "POINT (-73.922656 40.767439)", + 11.7, + "LINESTRING (-73.956822 40.770955, -73.922656 40.767439)" + 
], + [ + 5657835300677990229, + 617733122574254079, + 617733123838050303, + 1.1, + "2009-09-18T09:07:45.000+0000", + "2009-09-18T09:18:32.000+0000", + "POINT (-73.955658 40.776628)", + "POINT (-73.958469 40.766472)", + 7.3, + "LINESTRING (-73.955658 40.776628, -73.958469 40.766472)" + ], + [ + 8256661527472667297, + 617733122585264127, + 617733123872653311, + 3.1, + "2009-09-19T19:53:31.000+0000", + "2009-09-19T20:08:56.000+0000", + "POINT (-73.948588 40.773806)", + "POINT (-73.985448 40.756029)", + 10.9, + "LINESTRING (-73.948588 40.773806, -73.985448 40.756029)" + ], + [ + 9191778160472224604, + 617733122574254079, + 617733151087919103, + 3.86, + "2009-09-28T20:11:00.000+0000", + "2009-09-28T20:29:00.000+0000", + "POINT (-73.95882 40.777945)", + "POINT (-73.992457 40.7305)", + 15.4, + "LINESTRING (-73.95882 40.777945, -73.992457 40.7305)" + ], + [ + -2532678679054557294, + 617733122582904831, + 617733150972837887, + 3.17, + "2009-09-19T11:58:00.000+0000", + "2009-09-19T12:15:00.000+0000", + "POINT (-73.954202 40.7845)", + "POINT (-73.987672 40.757067)", + 11.3, + "LINESTRING (-73.954202 40.7845, -73.987672 40.757067)" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "row_id", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "pickup_h3", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "dropoff_h3", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "trip_distance", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_datetime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "dropoff_datetime", + "type": "\"timestamp\"" + }, + { + "metadata": "{}", + "name": "pickup_geom", + "type": 
"\"string\"" + }, + { + "metadata": "{}", + "name": "dropoff_geom", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "total_amount", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "trip_line", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql\n", + "create or replace temp view tripsWithIndex as (\n", + " select \n", + " row_id, pickup_h3, dropoff_h3,\n", + " * except(row_id, pickup_h3, dropoff_h3)\n", + " from (\n", + " select \n", + " *,\n", + " grid_pointascellid(pickup_geom, 9) as pickup_h3,\n", + " grid_pointascellid(dropoff_geom, 9) as dropoff_h3,\n", + " st_makeline(array(pickup_geom, dropoff_geom)) as trip_line\n", + " from trips\n", + " )\n", + ");\n", + "\n", + "select * from tripsWithIndex limit 10;" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e519823d-b03b-4984-9a6a-4988aff54648", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> We will also index our neighbourhoods using a built in generator function." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1c2ea737-1d6b-4399-b339-4394b1c7b286", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
count
11,885
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "11,885" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "count", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql\n", + "-- [1] We break down the original geometry in multiple smaller mosaic chips,\n", + "-- each with its own index.\n", + "-- [2] We don't need the original geometry any more, since we have broken\n", + "-- it down into smaller mosaic chips.\n", + "create or replace temp view neighbourhoodsWithIndex as (\n", + " select \n", + " * except(geometry, json_geometry),\n", + " grid_tessellateexplode(geometry, 9) as mosaic_index\n", + " from neighbourhoods\n", + ");\n", + "\n", + "-- notice the explode results in more rows\n", + "select format_number(count(1),0) as count from neighbourhoodsWithIndex" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "662e27dc-26cd-4681-bd13-bfc1e56440b4", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
typepropertiesmosaic_index
FeatureCollectionList(EWR, 1, 1, 0.0007823067885, 0.116357453189, Newark Airport)List(true, 617733150781997055, AAAAAAMAAAABAAAACMBSi+u3pmJPQERXt9Zja+DAUowOEmsLgUBEV5j/c7NdwFKMDNGKYo5ARFdfWe/7QsBSi+k2kMI9QERXRItV/dvAUovG3BrqLkBEV2Nhr37nwFKLyBxP4SdARFedBzkzSsBSi+u3pmJPQERXt9Zja+DAUos= (truncated))
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "FeatureCollection", + [ + "EWR", + "1", + "1", + "0.0007823067885", + "0.116357453189", + "Newark Airport" + ], + [ + true, + 617733150781997055, + "AAAAAAMAAAABAAAACMBSi+u3pmJPQERXt9Zja+DAUowOEmsLgUBEV5j/c7NdwFKMDNGKYo5ARFdfWe/7QsBSi+k2kMI9QERXRItV/dvAUovG3BrqLkBEV2Nhr37nwFKLyBxP4SdARFedBzkzSsBSi+u3pmJPQERXt9Zja+DAUos= (truncated)" + ] + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "type", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "properties", + "type": "{\"type\":\"struct\",\"fields\":[{\"name\":\"borough\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"location_id\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"objectid\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"shape_area\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"shape_leng\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"zone\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}" + }, + { + "metadata": "{}", + "name": "mosaic_index", + "type": "{\"type\":\"struct\",\"fields\":[{\"name\":\"is_core\",\"type\":\"boolean\",\"nullable\":true,\"metadata\":{}},{\"name\":\"index_id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"wkb\",\"type\":\"binary\",\"nullable\":true,\"metadata\":{}}]}" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], 
+ "source": [ + "%sql \n", + "-- limiting for ipynb only\n", + "select * from neighbourhoodsWithIndex limit 1;" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a8028556-2f1e-4f84-9ef0-e400815908d1", + "showTitle": false, + "title": "" + } + }, + "source": [ + "### Performing the spatial join\n", + "\n", + "> We can now do spatial joins to both pickup and drop off zones based on geolocations in our datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "43a8f2f9-36e1-49a6-820a-7f240be56b60", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
trip_distancepickup_geomdropoff_geompickup_h3dropoff_h3pickup_zonetrip_line
1.07POINT (-73.956758 40.769978)POINT (-73.97026 40.765078)617733122576875519617733123874750463Lenox Hill WestLINESTRING (-73.956758 40.769978, -73.97026 40.765078)
8.8POINT (-73.872786 40.774201)POINT (-73.974646 40.760039)617733124388552703617733123865837567LaGuardia AirportLINESTRING (-73.872786 40.774201, -73.974646 40.760039)
1.1POINT (-73.94527 40.786908)POINT (-73.938264 40.797143)617733122643984383617733122652110847East Harlem SouthLINESTRING (-73.94527 40.786908, -73.938264 40.797143)
1.4POINT (-73.967334 40.788187)POINT (-73.955356 40.78257)617733122587099135617733122575040511Upper West Side NorthLINESTRING (-73.967334 40.788187, -73.955356 40.78257)
1.5POINT (-73.956012 40.772426)POINT (-73.972197 40.759981)617733122576089087617733123865837567Lenox Hill WestLINESTRING (-73.956012 40.772426, -73.972197 40.759981)
4.3POINT (-73.956822 40.770955)POINT (-73.922656 40.767439)617733122576089087617733124357619711Lenox Hill WestLINESTRING (-73.956822 40.770955, -73.922656 40.767439)
1.1POINT (-73.955658 40.776628)POINT (-73.958469 40.766472)617733122574254079617733123838050303Upper East Side NorthLINESTRING (-73.955658 40.776628, -73.958469 40.766472)
3.1POINT (-73.948588 40.773806)POINT (-73.985448 40.756029)617733122585264127617733123872653311Yorkville EastLINESTRING (-73.948588 40.773806, -73.985448 40.756029)
3.86POINT (-73.95882 40.777945)POINT (-73.992457 40.7305)617733122574254079617733151087919103Upper East Side NorthLINESTRING (-73.95882 40.777945, -73.992457 40.7305)
3.17POINT (-73.954202 40.7845)POINT (-73.987672 40.757067)617733122582904831617733150972837887Upper East Side NorthLINESTRING (-73.954202 40.7845, -73.987672 40.757067)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 1.07, + "POINT (-73.956758 40.769978)", + "POINT (-73.97026 40.765078)", + 617733122576875519, + 617733123874750463, + "Lenox Hill West", + "LINESTRING (-73.956758 40.769978, -73.97026 40.765078)" + ], + [ + 8.8, + "POINT (-73.872786 40.774201)", + "POINT (-73.974646 40.760039)", + 617733124388552703, + 617733123865837567, + "LaGuardia Airport", + "LINESTRING (-73.872786 40.774201, -73.974646 40.760039)" + ], + [ + 1.1, + "POINT (-73.94527 40.786908)", + "POINT (-73.938264 40.797143)", + 617733122643984383, + 617733122652110847, + "East Harlem South", + "LINESTRING (-73.94527 40.786908, -73.938264 40.797143)" + ], + [ + 1.4, + "POINT (-73.967334 40.788187)", + "POINT (-73.955356 40.78257)", + 617733122587099135, + 617733122575040511, + "Upper West Side North", + "LINESTRING (-73.967334 40.788187, -73.955356 40.78257)" + ], + [ + 1.5, + "POINT (-73.956012 40.772426)", + "POINT (-73.972197 40.759981)", + 617733122576089087, + 617733123865837567, + "Lenox Hill West", + "LINESTRING (-73.956012 40.772426, -73.972197 40.759981)" + ], + [ + 4.3, + "POINT (-73.956822 40.770955)", + "POINT (-73.922656 40.767439)", + 617733122576089087, + 617733124357619711, + "Lenox Hill West", + "LINESTRING (-73.956822 40.770955, -73.922656 40.767439)" + ], + [ + 1.1, + "POINT (-73.955658 40.776628)", + "POINT (-73.958469 40.766472)", + 617733122574254079, + 617733123838050303, + "Upper East Side North", + "LINESTRING (-73.955658 40.776628, -73.958469 40.766472)" + ], + [ + 3.1, + "POINT (-73.948588 40.773806)", + "POINT (-73.985448 40.756029)", + 617733122585264127, + 617733123872653311, + "Yorkville East", + "LINESTRING (-73.948588 40.773806, -73.985448 40.756029)" + ], + [ + 3.86, + "POINT 
(-73.95882 40.777945)", + "POINT (-73.992457 40.7305)", + 617733122574254079, + 617733151087919103, + "Upper East Side North", + "LINESTRING (-73.95882 40.777945, -73.992457 40.7305)" + ], + [ + 3.17, + "POINT (-73.954202 40.7845)", + "POINT (-73.987672 40.757067)", + 617733122582904831, + 617733150972837887, + "Upper East Side North", + "LINESTRING (-73.954202 40.7845, -73.987672 40.757067)" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "trip_distance", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_geom", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "dropoff_geom", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "pickup_h3", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "dropoff_h3", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "pickup_zone", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "trip_line", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql\n", + "\n", + "create or replace temp view withPickupZone as (\n", + " select \n", + " trip_distance, pickup_geom, dropoff_geom, pickup_h3, dropoff_h3, pickup_zone, trip_line\n", + " from tripsWithIndex\n", + " join (\n", + " select \n", + " properties.zone as pickup_zone,\n", + " mosaic_index\n", + " from neighbourhoodsWithIndex\n", + " )\n", + " on mosaic_index.index_id == pickup_h3\n", + " where mosaic_index.is_core or st_contains(mosaic_index.wkb, pickup_geom)\n", + ");\n", + "\n", + "-- limiting for ipynb only\n", + "select * from withPickupZone limit 10;" + ] + }, + { + "cell_type": "markdown", + "metadata": { + 
"application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "db947fdf-b039-4675-838f-0d16fdd4516f", + "showTitle": false, + "title": "" + } + }, + "source": [ + "> We can easily perform a similar join for the drop off location. __Note: in this case using `withPickupZone` from above as the left sid of the join.__" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "implicitDf": true, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c6f658f2-15f0-4fd9-8df8-531e83480178", + "showTitle": false, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
trip_distancepickup_geomdropoff_geompickup_h3dropoff_h3pickup_zonedropoff_zonetrip_line
1.07POINT (-73.956758 40.769978)POINT (-73.97026 40.765078)617733122576875519617733123874750463Lenox Hill WestUpper East Side SouthLINESTRING (-73.956758 40.769978, -73.97026 40.765078)
8.8POINT (-73.872786 40.774201)POINT (-73.974646 40.760039)617733124388552703617733123865837567LaGuardia AirportMidtown CenterLINESTRING (-73.872786 40.774201, -73.974646 40.760039)
1.1POINT (-73.94527 40.786908)POINT (-73.938264 40.797143)617733122643984383617733122652110847East Harlem SouthEast Harlem NorthLINESTRING (-73.94527 40.786908, -73.938264 40.797143)
1.4POINT (-73.967334 40.788187)POINT (-73.955356 40.78257)617733122587099135617733122575040511Upper West Side NorthUpper East Side NorthLINESTRING (-73.967334 40.788187, -73.955356 40.78257)
1.5POINT (-73.956012 40.772426)POINT (-73.972197 40.759981)617733122576089087617733123865837567Lenox Hill WestMidtown CenterLINESTRING (-73.956012 40.772426, -73.972197 40.759981)
4.3POINT (-73.956822 40.770955)POINT (-73.922656 40.767439)617733122576089087617733124357619711Lenox Hill WestOld AstoriaLINESTRING (-73.956822 40.770955, -73.922656 40.767439)
1.1POINT (-73.955658 40.776628)POINT (-73.958469 40.766472)617733122574254079617733123838050303Upper East Side NorthLenox Hill WestLINESTRING (-73.955658 40.776628, -73.958469 40.766472)
3.1POINT (-73.948588 40.773806)POINT (-73.985448 40.756029)617733122585264127617733123872653311Yorkville EastTimes Sq/Theatre DistrictLINESTRING (-73.948588 40.773806, -73.985448 40.756029)
3.86POINT (-73.95882 40.777945)POINT (-73.992457 40.7305)617733122574254079617733151087919103Upper East Side NorthGreenwich Village NorthLINESTRING (-73.95882 40.777945, -73.992457 40.7305)
3.17POINT (-73.954202 40.7845)POINT (-73.987672 40.757067)617733122582904831617733150972837887Upper East Side NorthTimes Sq/Theatre DistrictLINESTRING (-73.954202 40.7845, -73.987672 40.757067)
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + 1.07, + "POINT (-73.956758 40.769978)", + "POINT (-73.97026 40.765078)", + 617733122576875519, + 617733123874750463, + "Lenox Hill West", + "Upper East Side South", + "LINESTRING (-73.956758 40.769978, -73.97026 40.765078)" + ], + [ + 8.8, + "POINT (-73.872786 40.774201)", + "POINT (-73.974646 40.760039)", + 617733124388552703, + 617733123865837567, + "LaGuardia Airport", + "Midtown Center", + "LINESTRING (-73.872786 40.774201, -73.974646 40.760039)" + ], + [ + 1.1, + "POINT (-73.94527 40.786908)", + "POINT (-73.938264 40.797143)", + 617733122643984383, + 617733122652110847, + "East Harlem South", + "East Harlem North", + "LINESTRING (-73.94527 40.786908, -73.938264 40.797143)" + ], + [ + 1.4, + "POINT (-73.967334 40.788187)", + "POINT (-73.955356 40.78257)", + 617733122587099135, + 617733122575040511, + "Upper West Side North", + "Upper East Side North", + "LINESTRING (-73.967334 40.788187, -73.955356 40.78257)" + ], + [ + 1.5, + "POINT (-73.956012 40.772426)", + "POINT (-73.972197 40.759981)", + 617733122576089087, + 617733123865837567, + "Lenox Hill West", + "Midtown Center", + "LINESTRING (-73.956012 40.772426, -73.972197 40.759981)" + ], + [ + 4.3, + "POINT (-73.956822 40.770955)", + "POINT (-73.922656 40.767439)", + 617733122576089087, + 617733124357619711, + "Lenox Hill West", + "Old Astoria", + "LINESTRING (-73.956822 40.770955, -73.922656 40.767439)" + ], + [ + 1.1, + "POINT (-73.955658 40.776628)", + "POINT (-73.958469 40.766472)", + 617733122574254079, + 617733123838050303, + "Upper East Side North", + "Lenox Hill West", + "LINESTRING (-73.955658 40.776628, -73.958469 40.766472)" + ], + [ + 3.1, + "POINT (-73.948588 40.773806)", + "POINT (-73.985448 40.756029)", + 
617733122585264127, + 617733123872653311, + "Yorkville East", + "Times Sq/Theatre District", + "LINESTRING (-73.948588 40.773806, -73.985448 40.756029)" + ], + [ + 3.86, + "POINT (-73.95882 40.777945)", + "POINT (-73.992457 40.7305)", + 617733122574254079, + 617733151087919103, + "Upper East Side North", + "Greenwich Village North", + "LINESTRING (-73.95882 40.777945, -73.992457 40.7305)" + ], + [ + 3.17, + "POINT (-73.954202 40.7845)", + "POINT (-73.987672 40.757067)", + 617733122582904831, + 617733150972837887, + "Upper East Side North", + "Times Sq/Theatre District", + "LINESTRING (-73.954202 40.7845, -73.987672 40.757067)" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "trip_distance", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "pickup_geom", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "dropoff_geom", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "pickup_h3", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "dropoff_h3", + "type": "\"long\"" + }, + { + "metadata": "{}", + "name": "pickup_zone", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "dropoff_zone", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "trip_line", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "%sql\n", + "create or replace temp view withDropoffZone as (\n", + " select \n", + " trip_distance, pickup_geom, dropoff_geom, pickup_h3, dropoff_h3, pickup_zone, dropoff_zone, trip_line\n", + " from withPickupZone\n", + " join (\n", + " select \n", + " properties.zone as dropoff_zone,\n", + " mosaic_index\n", + " from 
neighbourhoodsWithIndex\n", + " )\n", + " on mosaic_index.index_id == dropoff_h3\n", + " where mosaic_index.is_core or st_contains(mosaic_index.wkb, dropoff_geom)\n", + ");\n", + "\n", + "-- limiting for ipynb only\n", + "select * from withDropoffZone limit 10" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "30d4507b-7189-455e-9a4e-681d2f4714ac", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Visualise the results in Kepler\n", + "\n", + "> Mosaic abstracts interaction with Kepler in python through the use of the `%%mosaic_kepler` magic. When python is not the notebook language, you can prepend `%python` before the magic to make the switch." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "17c9cbeb-f411-4b2d-94d7-6737aaf1e1c4", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Here is the initial rendering with trip lines._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "81d92d32-c979-4dd8-9e7b-1a19d2507f13", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": 
"d331fd71-d383-485e-bb6f-6bcd2302dae7", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Here is with trip lines off and some other adjustments._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4c1c94de-97bb-41e3-abd2-955b6ea3effd", + "showTitle": false, + "title": "" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "data": "", + "datasetInfos": [], + "metadata": {}, + "removedWidgets": [], + "textData": null, + "type": "htmlSandbox" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "displayHTML(\"\"\"\"\"\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e1311242-147c-461e-9689-e10e02bd66e8", + "showTitle": false, + "title": "" + } + }, + "source": [ + "_Uncomment the following within databricks for actual results. 
Hint: you can toggle layers on/off and adjust properties._" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0a093833-f267-4194-b06e-5575001727d2", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %%mosaic_kepler\n", + "# withDropoffZone \"pickup_h3\" \"h3\" 5000" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "c1466346-8537-4e15-9afc-54056129af5a", + "showTitle": false, + "title": "" + } + }, + "source": [ + "## Databricks Lakehouse can read / write most any data format\n", + "\n", + "> Here are [built-in](https://docs.databricks.com/en/external-data/index.html) formats as well as Mosaic [readers](https://databrickslabs.github.io/mosaic/api/api.html). __Note: best performance with Delta Lake format__, ref [Databricks](https://docs.databricks.com/en/delta/index.html) and [OSS](https://docs.delta.io/latest/index.html) docs for Delta Lake. Beyond built-in formats, Databricks is a platform on which you can install a wide variety of libraries, e.g. 
[1](https://docs.databricks.com/en/libraries/index.html#python-environment-management) | [2](https://docs.databricks.com/en/compute/compatibility.html) | [3](https://docs.databricks.com/en/init-scripts/index.html).\n", + "\n", + "Example of [reading](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameReader.html?highlight=read#pyspark.sql.DataFrameReader) and [writing](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrameWriter.html?highlight=pyspark%20sql%20dataframe%20writer#pyspark.sql.DataFrameWriter) a Spark DataFrame with Delta Lake format.\n", + "\n", + "```\n", + "# - `write.format(\"delta\")` is default in Databricks\n", + "# - can save to a specified path in the Lakehouse\n", + "# - can save as a table in the Databricks Metastore\n", + "df.write.save(\"\")\n", + "df.write.saveAsTable(\"\")\n", + "```\n", + "\n", + "Example of loading a Delta Lake Table as a Spark DataFrame.\n", + "\n", + "```\n", + "# - `read.format(\"delta\")` is default in Databricks\n", + "# - can load a specified path in the Lakehouse\n", + "# - can load a table in the Databricks Metastore\n", + "df = spark.read.load(\"\")\n", + "df = spark.table(\"\")\n", + "```\n", + "\n", + "More on [Unity Catalog](https://docs.databricks.com/en/data-governance/unity-catalog/index.html) in Databricks Lakehouse for Governing [Tables](https://docs.databricks.com/en/data-governance/unity-catalog/index.html#tables) and [Volumes](https://docs.databricks.com/en/data-governance/unity-catalog/index.html#volumes)."
+ ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": 1148550101132091, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 2 + }, + "notebookName": "QuickstartNotebook", + "widgets": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/examples/sql/QuickstartNotebook.sql b/notebooks/examples/sql/QuickstartNotebook.sql deleted file mode 100644 index 01e1ed996..000000000 --- a/notebooks/examples/sql/QuickstartNotebook.sql +++ /dev/null @@ -1,272 +0,0 @@ --- Databricks notebook source --- MAGIC %md --- MAGIC ## Setup NYC taxi zones --- MAGIC In order to setup the data please run the notebook available at "../../data/DownloadNYCTaxiZones".
--- MAGIC DownloadNYCTaxiZones notebook will make sure we have New York City Taxi zone shapes available in our environment. - --- COMMAND ---------- - --- MAGIC %python --- MAGIC user_name = dbutils.notebook.entry_point.getDbutils().notebook().getContext().userName().get() --- MAGIC --- MAGIC raw_path = f"dbfs:/tmp/mosaic/{user_name}" --- MAGIC raw_taxi_zones_path = f"{raw_path}/taxi_zones" --- MAGIC --- MAGIC print(f"The raw data is stored in {raw_path}") - --- COMMAND ---------- - --- MAGIC %md --- MAGIC ## Enable Mosaic in the notebook --- MAGIC To get started, you'll need to attach the JAR to your cluster and import instances as in the cell below.
- --- COMMAND ---------- - --- MAGIC %python --- MAGIC import mosaic as mos --- MAGIC mos.enable_mosaic(spark, dbutils) - --- COMMAND ---------- - --- MAGIC %md ## Read polygons from GeoJson - --- COMMAND ---------- - --- MAGIC %md --- MAGIC With the functionallity Mosaic brings we can easily load GeoJSON files using spark.
--- MAGIC In the past this required GeoPandas in python and conversion to spark dataframe.
--- MAGIC Scala and SQL were hard to demo.
- --- COMMAND ---------- - --- MAGIC %python --- MAGIC # Note: Here we are using python as a proxy to programmatically --- MAGIC # pass the location of our data source for taxi zones. --- MAGIC spark.sql(f"drop table if exists taxi_zones;") --- MAGIC spark.sql( --- MAGIC f""" --- MAGIC create table if not exists taxi_zones --- MAGIC using json --- MAGIC options (multiline = true) --- MAGIC location "{raw_taxi_zones_path}"; --- MAGIC """ --- MAGIC ) - --- COMMAND ---------- - -create or replace temp view neighbourhoods as ( - select - type, - feature.properties as properties, - st_astext(st_geomfromgeojson(to_json(feature.geometry))) as geometry - from ( - select - type, - explode(features) as feature - from taxi_zones - ) -); -select * from neighbourhoods; - --- COMMAND ---------- - --- MAGIC %md --- MAGIC ## Compute some basic geometry attributes - --- COMMAND ---------- - --- MAGIC %md --- MAGIC Mosaic provides a number of functions for extracting the properties of geometries. Here are some that are relevant to Polygon geometries: - --- COMMAND ---------- - -select geometry, st_area(geometry), st_length(geometry) from neighbourhoods; - --- COMMAND ---------- - --- MAGIC %md --- MAGIC ## Read points data - --- COMMAND ---------- - --- MAGIC %md --- MAGIC We will load some Taxi trips data to represent point data.
--- MAGIC We already loaded some shapes representing polygons that correspond to NYC neighbourhoods.
- --- COMMAND ---------- - -create table if not exists nyctaxi_yellow -using delta -location "/databricks-datasets/nyctaxi/tables/nyctaxi_yellow" - --- COMMAND ---------- - -create or replace temp view trips as ( - select - trip_distance, - pickup_datetime, - dropoff_datetime, - st_astext(st_point(pickup_longitude, pickup_latitude)) as pickup_geom, - st_astext(st_point(dropoff_longitude, dropoff_latitude)) as dropoff_geom, - total_amount - from nyctaxi_yellow -) - --- COMMAND ---------- - -select * from trips - --- COMMAND ---------- - --- MAGIC %md --- MAGIC ## Spatial Joins - --- COMMAND ---------- - --- MAGIC %md --- MAGIC We can use Mosaic to perform spatial joins both with and without Mosaic indexing strategies.
--- MAGIC Indexing is very important when handling very different geometries both in size and in shape (ie. number of vertices).
- --- COMMAND ---------- - --- MAGIC %md --- MAGIC ### Getting the optimal resolution - --- COMMAND ---------- - --- MAGIC %md --- MAGIC We can use Mosaic functionality to identify how to best index our data based on the data inside the specific dataframe.
--- MAGIC Selecting an apropriate indexing resolution can have a considerable impact on the performance.
- --- COMMAND ---------- - --- MAGIC %python --- MAGIC from mosaic import MosaicFrame --- MAGIC --- MAGIC neighbourhoods_mosaic_frame = MosaicFrame(spark.read.table("neighbourhoods"), "geometry") --- MAGIC optimal_resolution = neighbourhoods_mosaic_frame.get_optimal_resolution(sample_fraction=0.75) --- MAGIC --- MAGIC print(f"Optimal resolution is {optimal_resolution}") - --- COMMAND ---------- - --- MAGIC %md --- MAGIC It is worth noting that not each resolution will yield performance improvements.
--- MAGIC By a rule of thumb it is always better to under index than over index - if not sure select a lower resolution.
--- MAGIC Higher resolutions are needed when we have very imbalanced geometries with respect to their size or with respect to the number of vertices.
--- MAGIC In such case indexing with more indices will considerably increase the parallel nature of the operations.
--- MAGIC You can think of Mosaic as a way to partition an overly complex row into multiple rows that have a balanced amount of computation each. - --- COMMAND ---------- - --- MAGIC %python --- MAGIC display( --- MAGIC neighbourhoods_mosaic_frame.get_resolution_metrics(sample_rows=150) --- MAGIC ) - --- COMMAND ---------- - --- MAGIC %md --- MAGIC ### Indexing using the optimal resolution - --- COMMAND ---------- - --- MAGIC %md --- MAGIC We will use mosaic sql functions to index our points data.
--- MAGIC Here we will use resolution 9, index resolution depends on the dataset in use. - --- COMMAND ---------- - -create or replace temp view tripsWithIndex as ( - select - *, - grid_pointascellid(pickup_geom, 9) as pickup_h3, - grid_pointascellid(dropoff_geom, 9) as dropoff_h3, - st_makeline(array(pickup_geom, dropoff_geom)) as trip_line - from trips -) - --- COMMAND ---------- - --- MAGIC %md --- MAGIC We will also index our neighbourhoods using a built in generator function. - --- COMMAND ---------- - -create or replace temp view neighbourhoodsWithIndex as ( - select - *, - grid_tessellateexplode(geometry, 9) as mosaic_index - from neighbourhoods -) - --- COMMAND ---------- - --- MAGIC %md --- MAGIC ### Performing the spatial join - --- COMMAND ---------- - --- MAGIC %md --- MAGIC We can now do spatial joins to both pickup and drop off zones based on geolocations in our datasets. - --- COMMAND ---------- - -create or replace temp view withPickupZone as ( - select - trip_distance, pickup_geom, dropoff_geom, pickup_h3, dropoff_h3, pickup_zone, trip_line - from tripsWithIndex - join ( - select - properties.zone as pickup_zone, - mosaic_index - from neighbourhoodsWithIndex - ) - on mosaic_index.index_id == pickup_h3 - where mosaic_index.is_core or st_contains(mosaic_index.wkb, pickup_geom) -); -select * from withPickupZone; - --- COMMAND ---------- - --- MAGIC %md --- MAGIC We can easily perform a similar join for the drop off location. 
- --- COMMAND ---------- - -create or replace temp view withDropoffZone as ( - select - trip_distance, pickup_geom, dropoff_geom, pickup_h3, dropoff_h3, pickup_zone, dropoff_zone, trip_line - from withPickupZone - join ( - select - properties.zone as dropoff_zone, - mosaic_index - from neighbourhoodsWithIndex - ) - on mosaic_index.index_id == dropoff_h3 - where mosaic_index.is_core or st_contains(mosaic_index.wkb, dropoff_geom) -); -select * from withDropoffZone; - --- COMMAND ---------- - --- MAGIC %md --- MAGIC ## Visualise the results in Kepler - --- COMMAND ---------- - --- MAGIC %md --- MAGIC For visualisation there simply aren't good options in scala.
--- MAGIC Luckily in our notebooks you can easily switch to python just for UI.
--- MAGIC Mosaic abstracts interaction with Kepler in python. - --- COMMAND ---------- - --- MAGIC %python --- MAGIC %%mosaic_kepler --- MAGIC "withDropoffZone" "pickup_h3" "h3" 5000 - --- COMMAND ---------- - - diff --git a/notebooks/examples/sql/README.md b/notebooks/examples/sql/README.md new file mode 100644 index 000000000..92885f6cf --- /dev/null +++ b/notebooks/examples/sql/README.md @@ -0,0 +1,5 @@ +# Spark SQL Examples + +> A couple of examples focusing on Spark SQL in Databricks DBR Clusters. + +__Note: `ipynb` files can be previewed in GitHub and can also be imported into Databricks, more [here](https://docs.databricks.com/en/notebooks/notebook-export-import.html).__ diff --git a/python/mosaic/api/aggregators.py b/python/mosaic/api/aggregators.py index b4e9bf7e1..e221d06ba 100644 --- a/python/mosaic/api/aggregators.py +++ b/python/mosaic/api/aggregators.py @@ -16,6 +16,7 @@ "grid_cell_intersection_agg", "rst_merge_agg", "rst_combineavg_agg", + "rst_derivedband_agg", "st_intersection_agg", "st_intersects_agg", ] @@ -209,3 +210,23 @@ def rst_combineavg_agg(raster: ColumnOrName) -> Column: return config.mosaic_context.invoke_function( "rst_combineavg_agg", pyspark_to_java_column(raster) ) + + +def rst_derivedband_agg(raster: ColumnOrName, pythonFunc: ColumnOrName, funcName: ColumnOrName) -> Column: + """ + Returns the raster representing the aggregation of rasters using provided python function. + + Parameters + ---------- + raster: Column + pythonFunc: Column + funcName: Column + + Returns + ------- + Column + The resulting raster. 
+ """ + return config.mosaic_context.invoke_function( + "rst_derivedband_agg", pyspark_to_java_column(raster), pyspark_to_java_column(pythonFunc), pyspark_to_java_column(funcName) + ) diff --git a/python/mosaic/api/raster.py b/python/mosaic/api/raster.py index e84d41ad4..ffc29cebe 100644 --- a/python/mosaic/api/raster.py +++ b/python/mosaic/api/raster.py @@ -13,6 +13,7 @@ "rst_boundingbox", "rst_clip", "rst_combineavg", + "rst_derivedband", "rst_frombands", "rst_fromfile", "rst_georeference", @@ -145,6 +146,34 @@ def rst_combineavg(rasters: ColumnOrName) -> Column: ) +def rst_derivedband(raster: ColumnOrName, pythonFunc: ColumnOrName, funcName: ColumnOrName) -> Column: + """ + Creates a new band by applying the given python function to the input rasters. + The result is a raster tile. + + Parameters + ---------- + raster : Column (StringType) + Path to the raster file. + pythonFunc : Column (StringType) + The python function to apply to the bands. + funcName : Column (StringType) + The name of the function. + + Returns + ------- + Column (StringType) + The path to the new raster. + + """ + return config.mosaic_context.invoke_function( + "rst_derivedband", + pyspark_to_java_column(raster), + pyspark_to_java_column(pythonFunc), + pyspark_to_java_column(funcName), + ) + + def rst_georeference(raster: ColumnOrName) -> Column: """ Returns GeoTransform of the raster as a GT array of doubles. 
diff --git a/python/setup.cfg b/python/setup.cfg index 2bc75b1be..1d34b6c9a 100644 --- a/python/setup.cfg +++ b/python/setup.cfg @@ -24,7 +24,6 @@ setup_requires = install_requires = keplergl==0.3.2 h3==3.7.3 - gdal[numpy]==3.4.3 [options.package_data] mosaic = diff --git a/python/test/test_raster_functions.py b/python/test/test_raster_functions.py index 077bd06df..cfbcb39ee 100644 --- a/python/test/test_raster_functions.py +++ b/python/test/test_raster_functions.py @@ -1,7 +1,3 @@ -import logging -import random -import unittest - from pyspark.sql.functions import abs, col, first, lit, sqrt, array from .context import api @@ -124,7 +120,7 @@ def test_raster_flatmap_functions(self): ) overlap_result.write.format("noop").mode("overwrite").save() - self.assertEqual(overlap_result.count(), 86) + self.assertEqual(overlap_result.count(), 87) def test_raster_aggregator_functions(self): collection = ( diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/api/FormatLookup.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/api/FormatLookup.scala new file mode 100644 index 000000000..e3aeb5296 --- /dev/null +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/api/FormatLookup.scala @@ -0,0 +1,137 @@ +package com.databricks.labs.mosaic.core.raster.api + +object FormatLookup { + + // ShortDriverName -> FormatExtension + val formats: Map[String, String] = Map( + "AAIGrid" -> "asc", + "ACE2" -> "ace2", + "ADRG" -> "gen", + "AIG" -> "aig", + "AIRSAR" -> "airsar", + "ARCGEN" -> "gen", + "ARG" -> "arg", + "BLX" -> "blx", + "BMP" -> "bmp", + "BT" -> "bt", + "CAD" -> "dwg", + "CEOS" -> "ceos", + "COASP" -> "coasp", + "COSAR" -> "cosar", + "CPG" -> "cpg", + "CSW" -> "csw", + "CTG" -> "ctg", + "DB2ODBC" -> "db2", + "DERIVED" -> "derived", + "DGN" -> "dgn", + "DIMAP" -> "dim", + "DIPEx" -> "dipex", + "DOQ1" -> "doq1", + "DOQ2" -> "doq2", + "DTED" -> "dt0", + "DXF" -> "dxf", + "ECRGTOC" -> "toc", + "ECRGTP" -> "ecrgtp", + "EEDA" -> "eeda", + "EIR" -> "eir", 
+ "ELAS" -> "elas", + "ENVI" -> "hdr", + "ERS" -> "ers", + "ESAT" -> "esat", + "ESRI Shapefile" -> "shp", + "ESRI" -> "ers", + "FAST" -> "fst", + "FIT" -> "fit", + "FITS" -> "fits", + "GFF" -> "gff", + "GIF" -> "gif", + "GLOBE" -> "globe", + "GMT" -> "gmt", + "GNM" -> "gnm", + "GRASSASCIIGrid" -> "asc", + "GRASS" -> "grass", + "GRIB" -> "grb", + "GTiff" -> "tif", + "GXF" -> "gxf", + "HDF4" -> "hdf4", + "HDF5" -> "hdf5", + "HF2" -> "hf2", + "HFA" -> "img", + "HTTP" -> "http", + "IDRISI" -> "rst", + "ILWIS" -> "mpr", + "INGR" -> "grd", + "IRIS" -> "ppm", + "ISIS2" -> "cub", + "ISIS3" -> "cub", + "JDEM" -> "mem", + "JPEG2000" -> "jp2", + "JPEG" -> "jpg", + "JP2OpenJPEG" -> "jp2", + "KMLSUPEROVERLAY" -> "kml", + "LAN" -> "lan", + "LCP" -> "lcp", + "L1B" -> "l1b", + "MBTiles" -> "mbtiles", + "MEM" -> "mem", + "MFF" -> "mff", + "MG4Lidar" -> "mg4l", + "MRF" -> "mrf", + "MSGN" -> "msgn", + "NDF" -> "ndf", + "NITF" -> "ntf", + "NTv2" -> "gsb", + "ODBC" -> "odbc", + "OGR_GMT" -> "gmt", + "OGR_PDS" -> "pds", + "OGR_SDTS" -> "sdts", + "OGR_VRT" -> "vrt", + "OGR" -> "shp", + "OpenAir" -> "oar", + "PCIDSK" -> "pix", + "PCRaster" -> "map", + "PDF" -> "pdf", + "PDS" -> "pds", + "PGDUMP" -> "pgdump", + "PGeo" -> "mdb", + "PLMOSAIC" -> "mosaic", + "PNG" -> "png", + "PostgreSQL" -> "pg", + "R" -> "r", + "RDA" -> "rda", + "RIK" -> "rik", + "RMF" -> "rmf", + "ROI_PAC" -> "rsc", + "RPFTOC" -> "toc", + "RS2" -> "rs2", + "RST" -> "rst", + "SAGA" -> "sdat", + "SAR_CEOS" -> "ceos", + "SAR_SG" -> "sgm", + "SDTS" -> "sdts", + "SEGUKOOA" -> "dat", + "SEGY" -> "segy", + "Sentinel2" -> "jp2", + "SRTMHGT" -> "hgt", + "SQLite" -> "sqlite", + "SUA" -> "sua", + "SVG" -> "svg", + "TIGER" -> "tiger", + "TIL" -> "til", + "TSX" -> "tsx", + "USGSDEM" -> "dem", + "VDV" -> "vdv", + "VICAR" -> "vicar", + "VFK" -> "vfk", + "VRT" -> "vrt", + "WCS" -> "wcs", + "WFS" -> "wfs", + "WMS" -> "wms", + "XLS" -> "xls", + "XLSX" -> "xlsx", + "XPlane" -> "bin", + "netCDF" -> "nc", + "Zarr" -> "zarr" + ) + +} diff --git 
a/src/main/scala/com/databricks/labs/mosaic/core/raster/api/GDAL.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/api/GDAL.scala index cfb3e4fd0..44937d6af 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/api/GDAL.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/api/GDAL.scala @@ -9,6 +9,8 @@ import org.apache.spark.unsafe.types.UTF8String import org.gdal.gdal.gdal import org.gdal.gdalconst.gdalconstConstants._ +import java.util.UUID + /** * GDAL Raster API. It uses [[MosaicRasterGDAL]] as the * [[com.databricks.labs.mosaic.core.raster.io.RasterReader]]. @@ -66,8 +68,9 @@ object GDAL { def getExtension(driverShortName: String): String = { val driver = gdal.GetDriverByName(driverShortName) val result = driver.GetMetadataItem("DMD_EXTENSION") + val toReturn = if (result == null) FormatLookup.formats(driverShortName) else result driver.delete() - result + toReturn } /** @@ -84,7 +87,7 @@ object GDAL { * Returns a Raster object. */ def readRaster( - inputRaster: => Any, + inputRaster: Any, parentPath: String, shortDriverName: String, inputDT: DataType @@ -117,13 +120,14 @@ object GDAL { * @return * Returns the paths of the written rasters. */ - def writeRasters(generatedRasters: => Seq[MosaicRasterGDAL], checkpointPath: String, rasterDT: DataType): Seq[Any] = { + def writeRasters(generatedRasters: Seq[MosaicRasterGDAL], checkpointPath: String, rasterDT: DataType): Seq[Any] = { generatedRasters.map(raster => if (raster != null) { rasterDT match { case StringType => + val uuid = UUID.randomUUID().toString val extension = GDAL.getExtension(raster.getDriversShortName) - val writePath = s"$checkpointPath/${raster.uuid}.$extension" + val writePath = s"$checkpointPath/$uuid.$extension" val outPath = raster.writeToPath(writePath) RasterCleaner.dispose(raster) UTF8String.fromString(outPath) @@ -159,7 +163,7 @@ object GDAL { * @return * Returns a Raster object. 
*/ - def raster(content: => Array[Byte], parentPath: String, driverShortName: String): MosaicRasterGDAL = + def raster(content: Array[Byte], parentPath: String, driverShortName: String): MosaicRasterGDAL = MosaicRasterGDAL.readRaster(content, parentPath, driverShortName) /** diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterBandGDAL.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterBandGDAL.scala index 48eef2f07..3fa45f8e5 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterBandGDAL.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterBandGDAL.scala @@ -7,7 +7,7 @@ import scala.collection.JavaConverters.dictionaryAsScalaMapConverter import scala.util._ /** GDAL implementation of the MosaicRasterBand trait. */ -class MosaicRasterBandGDAL(band: => Band, id: Int) { +case class MosaicRasterBandGDAL(band: Band, id: Int) { def getBand: Band = band diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterGDAL.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterGDAL.scala index 42653c6bd..67178e172 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterGDAL.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterGDAL.scala @@ -9,7 +9,6 @@ import com.databricks.labs.mosaic.core.raster.io.{RasterCleaner, RasterReader, R import com.databricks.labs.mosaic.core.raster.operator.clip.RasterClipByVector import com.databricks.labs.mosaic.core.types.model.GeometryTypeEnum.POLYGON import com.databricks.labs.mosaic.utils.PathUtils -import org.apache.orc.util.Murmur3 import org.gdal.gdal.gdal.GDALInfo import org.gdal.gdal.{Dataset, InfoOptions, gdal} import org.gdal.gdalconst.gdalconstConstants._ @@ -18,17 +17,15 @@ import org.gdal.osr.SpatialReference import org.locationtech.proj4j.CRSFactory import java.nio.file.{Files, Paths, StandardCopyOption} 
-import java.util.{Locale, UUID, Vector => JVector} +import java.util.{Locale, Vector => JVector} import scala.collection.JavaConverters.dictionaryAsScalaMapConverter import scala.util.Try /** GDAL implementation of the MosaicRaster trait. */ //noinspection DuplicatedCode -class MosaicRasterGDAL( - _uuid: Long, - raster: => Dataset, +case class MosaicRasterGDAL( + raster: Dataset, path: String, - isTemp: Boolean, parentPath: String, driverShortName: String, memSize: Long @@ -100,7 +97,7 @@ class MosaicRasterGDAL( def pixelDiagSize: Double = math.sqrt(pixelXSize * pixelXSize + pixelYSize * pixelYSize) /** @return Returns file extension. */ - def getRasterFileExtension: String = getRaster.GetDriver().GetMetadataItem("DMD_EXTENSION") + def getRasterFileExtension: String = GDAL.getExtension(driverShortName) /** @return Returns the raster's bands as a Seq. */ def getBands: Seq[MosaicRasterBandGDAL] = (1 to numBands).map(getBand) @@ -148,7 +145,6 @@ class MosaicRasterGDAL( .getOrElse(Map.empty[String, String]) ) .getOrElse(Map.empty[String, String]) - } /** @@ -167,7 +163,7 @@ class MosaicRasterGDAL( val pieces = path.split(":") Seq( key -> pieces.last, - s"${pieces.last}_vsimem" -> path, + s"${pieces.last}_tmp" -> path, pieces.last -> s"${pieces.head}:$parentPath:${pieces.last}" ) } else Seq(key -> subdatasetsMap(key)) @@ -208,7 +204,7 @@ class MosaicRasterGDAL( */ def getBand(bandId: Int): MosaicRasterBandGDAL = { if (bandId > 0 && numBands >= bandId) { - new MosaicRasterBandGDAL(raster.GetRasterBand(bandId), bandId) + MosaicRasterBandGDAL(raster.GetRasterBand(bandId), bandId) } else { throw new ArrayIndexOutOfBoundsException() } @@ -270,7 +266,7 @@ class MosaicRasterGDAL( * @return * Returns a Seq of the results of the function. 
*/ - def transformBands[T](f: => MosaicRasterBandGDAL => T): Seq[T] = for (i <- 1 to numBands) yield f(getBand(i)) + def transformBands[T](f: MosaicRasterBandGDAL => T): Seq[T] = for (i <- 1 to numBands) yield f(getBand(i)) /** * @return @@ -348,28 +344,13 @@ class MosaicRasterGDAL( * bytes. */ def cleanUp(): Unit = { - val isInMem = path.contains("/vsimem/") val isSubdataset = PathUtils.isSubdataset(path) val filePath = if (isSubdataset) PathUtils.fromSubdatasetPath(path) else path val pamFilePath = s"$filePath.aux.xml" - if (isInMem) { - // Delete the raster from the virtual file system - // Note that Unlink is not the same as Delete - // Unlink may leave PAM residuals - Try(gdal.GetDriverByName(driverShortName).Delete(path)) - Try(gdal.GetDriverByName(driverShortName).Delete(filePath)) - Try(gdal.Unlink(path)) - Try(gdal.Unlink(filePath)) - Try(gdal.Unlink(pamFilePath)) - } - if (isTemp) { - Try(gdal.GetDriverByName(driverShortName).Delete(path)) - Try(Files.deleteIfExists(Paths.get(path))) - Try(Files.deleteIfExists(Paths.get(filePath))) - Try(Files.deleteIfExists(Paths.get(pamFilePath))) - val tmpParent = Paths.get(path).getParent - if (tmpParent != null) Try(Files.deleteIfExists(tmpParent)) - } + Try(gdal.GetDriverByName(driverShortName).Delete(path)) + Try(Files.deleteIfExists(Paths.get(path))) + Try(Files.deleteIfExists(Paths.get(filePath))) + Try(Files.deleteIfExists(Paths.get(pamFilePath))) } /** @@ -381,15 +362,8 @@ class MosaicRasterGDAL( */ def getMemSize: Long = { if (memSize == -1) { - if (PathUtils.isInMemory(path)) { - val tempPath = PathUtils.createTmpFilePath(this.uuid.toString, GDAL.getExtension(driverShortName)) - writeToPath(tempPath) - val size = Files.size(Paths.get(tempPath)) - Files.delete(Paths.get(tempPath)) - size - } else { - Files.size(Paths.get(path)) - } + val toRead = if (path.startsWith("/vsizip/")) path.replace("/vsizip/", "") else path + Files.size(Paths.get(toRead)) } else { memSize } @@ -406,15 +380,10 @@ class 
MosaicRasterGDAL( * A boolean indicating if the write was successful. */ def writeToPath(path: String, dispose: Boolean = true): String = { - val isInMem = PathUtils.isInMemory(getPath) - if (isInMem) { - val driver = raster.GetDriver() - val ds = driver.CreateCopy(path, this.flushCache().getRaster) - ds.FlushCache() - ds.delete() - } else { - Files.copy(Paths.get(getPath), Paths.get(path), StandardCopyOption.REPLACE_EXISTING).toString - } + val driver = raster.GetDriver() + val ds = driver.CreateCopy(path, this.flushCache().getRaster) + ds.FlushCache() + ds.delete() if (dispose) RasterCleaner.dispose(this) path } @@ -426,19 +395,18 @@ class MosaicRasterGDAL( * A byte array containing the raster data. */ def writeToBytes(dispose: Boolean = true): Array[Byte] = { - if (PathUtils.isInMemory(path)) { - // Create a temporary directory to store the raster - // This is needed because Files cannot read from /vsimem/ directly - val path = PathUtils.createTmpFilePath(uuid.toString, GDAL.getExtension(driverShortName)) - writeToPath(path, dispose) - val byteArray = Files.readAllBytes(Paths.get(path)) - Files.delete(Paths.get(path)) - byteArray - } else { - val byteArray = Files.readAllBytes(Paths.get(path)) - if (dispose) RasterCleaner.dispose(this) - byteArray - } + val isSubdataset = PathUtils.isSubdataset(path) + val readPath = + if (isSubdataset) { + val tmpPath = PathUtils.createTmpFilePath(getRasterFileExtension) + writeToPath(tmpPath, dispose = false) + } else { + path + } + val byteArray = Files.readAllBytes(Paths.get(readPath)) + if (dispose) RasterCleaner.dispose(this) + Files.deleteIfExists(Paths.get(readPath)) + byteArray } /** @@ -460,15 +428,9 @@ class MosaicRasterGDAL( * usable again. */ def refresh(): MosaicRasterGDAL = { - new MosaicRasterGDAL(uuid, openRaster(path), path, isTemp, parentPath, driverShortName, memSize) + MosaicRasterGDAL(openRaster(path), path, parentPath, driverShortName, memSize) } - /** - * @return - * Returns the raster's UUID. 
- */ - def uuid: Long = _uuid - /** * @return * Returns the raster's size. @@ -511,12 +473,16 @@ class MosaicRasterGDAL( .getOrElse(Map.empty[String, String]) .values .find(_.toUpperCase(Locale.ROOT).endsWith(subsetName.toUpperCase(Locale.ROOT))) - .getOrElse(throw new Exception(s"Subdataset $subsetName not found")) + .getOrElse(throw new Exception(s""" + |Subdataset $subsetName not found! + |Available subdatasets: + | ${subdatasets.keys.filterNot(_.startsWith("SUBDATASET_")).mkString(", ")} + """.stripMargin)) val ds = openRaster(path) // Avoid costly IO to compute MEM size here // It will be available when the raster is serialized for next operation // If value is needed then it will be computed when getMemSize is called - MosaicRasterGDAL(ds, path, isTemp = false, parentPath, driverShortName, -1) + MosaicRasterGDAL(ds, path, parentPath, driverShortName, -1) } } @@ -554,7 +520,7 @@ object MosaicRasterGDAL extends RasterReader { */ def identifyDriver(parentPath: String): String = { val isSubdataset = PathUtils.isSubdataset(parentPath) - val path = PathUtils.getCleanPath(parentPath, parentPath.endsWith(".zip")) + val path = PathUtils.getCleanPath(parentPath) val readPath = if (isSubdataset) PathUtils.getSubdatasetPath(path) else PathUtils.getZipPath(path) @@ -563,58 +529,6 @@ object MosaicRasterGDAL extends RasterReader { driverShortName } - /** - * Creates a MosaicRaster object from a GDAL raster object. - * @param dataset - * The GDAL raster object. - * @param path - * The path to the raster file in vsimem or in temp dir. - * @param isTemp - * A boolean indicating if the raster is temporary. - * @param parentPath - * The path to the file of the raster on disk. - * @param driverShortName - * The driver short name of the raster. - * @param memSize - * The size of the raster in memory. - * @return - * A MosaicRaster object. 
- */ - def apply( - dataset: => Dataset, - path: String, - isTemp: Boolean, - parentPath: String, - driverShortName: String, - memSize: Long - ): MosaicRasterGDAL = { - val uuid = Murmur3.hash64(path.getBytes()) - val raster = new MosaicRasterGDAL(uuid, dataset, path, isTemp, parentPath, driverShortName, memSize) - raster - } - - /** - * Creates a MosaicRaster object from a file system path. - * @param path - * The path to the raster file. - * @param isTemp - * A boolean indicating if the raster is temporary. - * @param parentPath - * The path to the file of the raster on disk. - * @param driverShortName - * The driver short name of the raster. - * @param memSize - * The size of the raster in memory. - * @return - * A MosaicRaster object. - */ - def apply(path: String, isTemp: Boolean, parentPath: String, driverShortName: String, memSize: Long): MosaicRasterGDAL = { - val uuid = Murmur3.hash64(path.getBytes()) - val dataset = openRaster(path, Some(driverShortName)) - val raster = new MosaicRasterGDAL(uuid, dataset, path, isTemp, parentPath, driverShortName, memSize) - raster - } - /** * Reads a raster from a file system path. Reads a subdataset if the path * is to a subdataset. 
@@ -629,9 +543,7 @@ object MosaicRasterGDAL extends RasterReader { */ override def readRaster(inPath: String, parentPath: String): MosaicRasterGDAL = { val isSubdataset = PathUtils.isSubdataset(inPath) - val localCopy = PathUtils.copyToTmp(inPath) - val path = PathUtils.getCleanPath(localCopy, localCopy.endsWith(".zip")) - val uuid = Murmur3.hash64(path.getBytes()) + val path = PathUtils.getCleanPath(inPath) val readPath = if (isSubdataset) PathUtils.getSubdatasetPath(path) else PathUtils.getZipPath(path) @@ -642,7 +554,7 @@ object MosaicRasterGDAL extends RasterReader { // It will be available when the raster is serialized for next operation // If value is needed then it will be computed when getMemSize is called // We cannot just use memSize value of the parent due to the fact that the raster could be a subdataset - val raster = new MosaicRasterGDAL(uuid, dataset, path, true, parentPath, driverShortName, -1) + val raster = MosaicRasterGDAL(dataset, path, parentPath, driverShortName, -1) raster } @@ -655,30 +567,26 @@ object MosaicRasterGDAL extends RasterReader { * @return * A MosaicRaster object. 
*/ - override def readRaster(contentBytes: => Array[Byte], parentPath: String, driverShortName: String): MosaicRasterGDAL = { + override def readRaster(contentBytes: Array[Byte], parentPath: String, driverShortName: String): MosaicRasterGDAL = { if (Option(contentBytes).isEmpty || contentBytes.isEmpty) { - new MosaicRasterGDAL(-1L, null, "", false, parentPath, "", -1) + MosaicRasterGDAL(null, "", parentPath, "", -1) } else { // This is a temp UUID for purposes of reading the raster through GDAL from memory // The stable UUID is kept in metadata of the raster - val uuid = Murmur3.hash64(UUID.randomUUID().toString.getBytes()) val extension = GDAL.getExtension(driverShortName) - val virtualPath = s"/vsimem/$uuid.$extension" - gdal.FileFromMemBuffer(virtualPath, contentBytes) - // Try reading as a virtual file, if that fails, read as a zipped virtual file - val dataset = Option( - openRaster(virtualPath, Some(driverShortName)) - ).getOrElse({ - // Unlink the previous virtual file - gdal.Unlink(virtualPath) - // Create a virtual zip file - val virtualZipPath = s"/vsimem/$uuid.zip" - val zippedPath = s"/vsizip/$virtualZipPath" - gdal.FileFromMemBuffer(virtualZipPath, contentBytes) - openRaster(zippedPath, Some(driverShortName)) - }) - val raster = new MosaicRasterGDAL(uuid, dataset, virtualPath, false, parentPath, driverShortName, contentBytes.length) - raster + val tmpPath = PathUtils.createTmpFilePath(extension) + Files.write(Paths.get(tmpPath), contentBytes) + // Try reading as a tmp file, if that fails, rename as a zipped file + val dataset = openRaster(tmpPath, Some(driverShortName)) + if (dataset == null) { + val zippedPath = PathUtils.createTmpFilePath("zip") + Files.move(Paths.get(tmpPath), Paths.get(zippedPath), StandardCopyOption.REPLACE_EXISTING) + val readPath = PathUtils.getZipPath(zippedPath) + val ds = openRaster(readPath, Some(driverShortName)) + MosaicRasterGDAL(ds, readPath, parentPath, driverShortName, contentBytes.length) + } else { + 
MosaicRasterGDAL(dataset, tmpPath, parentPath, driverShortName, contentBytes.length) + } } } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/io/RasterCleaner.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/io/RasterCleaner.scala index 9a8be672a..b83bd2b8c 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/io/RasterCleaner.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/io/RasterCleaner.scala @@ -30,7 +30,7 @@ object RasterCleaner { * @param ds * The dataset to destroy. */ - def destroy(ds: => Dataset): Unit = { + def destroy(ds: Dataset): Unit = { if (ds != null) { try { ds.FlushCache() @@ -49,7 +49,7 @@ object RasterCleaner { * @param raster * The raster to destroy and clean up. */ - def dispose(raster: => Any): Unit = { + def dispose(raster: Any): Unit = { raster match { case r: MosaicRasterGDAL => r.destroy() diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/io/RasterReader.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/io/RasterReader.scala index 65ef016cc..b207789ae 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/io/RasterReader.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/io/RasterReader.scala @@ -42,7 +42,7 @@ trait RasterReader extends Logging { * @return * A MosaicRaster object. */ - def readRaster(contentBytes: => Array[Byte], parentPath: String, driverShortName: String): MosaicRasterGDAL + def readRaster(contentBytes: Array[Byte], parentPath: String, driverShortName: String): MosaicRasterGDAL /** * Reads a raster band from a file system path. 
Reads a subdataset band if diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/CombineAVG.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/CombineAVG.scala index e41f82a09..caab0f299 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/CombineAVG.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/CombineAVG.scala @@ -18,7 +18,7 @@ object CombineAVG { * @return * A new raster with average of input rasters. */ - def compute(rasters: => Seq[MosaicRasterGDAL]): MosaicRasterGDAL = { + def compute(rasters: Seq[MosaicRasterGDAL]): MosaicRasterGDAL = { val pythonFunc = """ |import numpy as np diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/NDVI.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/NDVI.scala index 1b60baba8..e3ab94d98 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/NDVI.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/NDVI.scala @@ -20,14 +20,11 @@ object NDVI { * @return * MosaicRasterGDAL with NDVI computed. 
*/ - def compute(raster: => MosaicRasterGDAL, redIndex: Int, nirIndex: Int): MosaicRasterGDAL = { - val tmpPath = PathUtils.createTmpFilePath(raster.uuid.toString, GDAL.getExtension(raster.getDriversShortName)) - raster.writeToPath(tmpPath) - val tmpRaster = MosaicRasterGDAL(tmpPath, isTemp=true, raster.getParentPath, raster.getDriversShortName, raster.getMemSize) - val ndviPath = PathUtils.createTmpFilePath(raster.uuid.toString + "NDVI", GDAL.getExtension(raster.getDriversShortName)) + def compute(raster: MosaicRasterGDAL, redIndex: Int, nirIndex: Int): MosaicRasterGDAL = { + val ndviPath = PathUtils.createTmpFilePath(GDAL.getExtension(raster.getDriversShortName)) // noinspection ScalaStyle val gdalCalcCommand = - s"""gdal_calc -A ${tmpRaster.getPath} --A_band=$redIndex -B ${tmpRaster.getPath} --B_band=$nirIndex --outfile=$ndviPath --calc="(B-A)/(B+A)"""" + s"""gdal_calc -A ${raster.getPath} --A_band=$redIndex -B ${raster.getPath} --B_band=$nirIndex --outfile=$ndviPath --calc="(B-A)/(B+A)"""" GDALCalc.executeCalc(gdalCalcCommand, ndviPath) } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/RasterClipByVector.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/RasterClipByVector.scala index 2c40ab81c..6daabc25c 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/RasterClipByVector.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/RasterClipByVector.scala @@ -35,18 +35,17 @@ object RasterClipByVector { * @return * A clipped raster. 
*/ - def clip(raster: => MosaicRasterGDAL, geometry: MosaicGeometry, geomCRS: SpatialReference, geometryAPI: GeometryAPI): MosaicRasterGDAL = { + def clip(raster: MosaicRasterGDAL, geometry: MosaicGeometry, geomCRS: SpatialReference, geometryAPI: GeometryAPI): MosaicRasterGDAL = { val rasterCRS = raster.getSpatialReference val outShortName = raster.getDriversShortName val geomSrcCRS = if (geomCRS == null ) rasterCRS else geomCRS - val resultFileName = PathUtils.createTmpFilePath(raster.uuid.toString, GDAL.getExtension(outShortName)) + val resultFileName = PathUtils.createTmpFilePath(GDAL.getExtension(outShortName)) val shapeFileName = VectorClipper.generateClipper(geometry, geomSrcCRS, rasterCRS, geometryAPI) val result = GDALWarp.executeWarp( resultFileName, - isTemp = true, Seq(raster), command = s"gdalwarp -wo CUTLINE_ALL_TOUCHED=TRUE -of $outShortName -cutline $shapeFileName -crop_to_cutline -co COMPRESS=DEFLATE -dstalpha" ) diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/VectorClipper.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/VectorClipper.scala index 7c7ea58f2..ae03b2d01 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/VectorClipper.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/VectorClipper.scala @@ -2,6 +2,7 @@ package com.databricks.labs.mosaic.core.raster.operator.clip import com.databricks.labs.mosaic.core.geometry.MosaicGeometry import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI +import com.databricks.labs.mosaic.utils.PathUtils import org.gdal.gdal.gdal import org.gdal.ogr.ogrConstants.OFTInteger import org.gdal.ogr.{DataSource, Feature, ogr} @@ -21,8 +22,7 @@ object VectorClipper { * The shapefile name. 
*/ private def getShapefileName: String = { - val uuid = java.util.UUID.randomUUID() - val shapeFileName = s"/vsimem/${uuid.toString}.shp" + val shapeFileName = PathUtils.createTmpFilePath(".shp") shapeFileName } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALBuildVRT.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALBuildVRT.scala index c3b57d5f3..389defad6 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALBuildVRT.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALBuildVRT.scala @@ -11,8 +11,6 @@ object GDALBuildVRT { * * @param outputPath * The output path of the VRT file. - * @param isTemp - * Whether the output is a temp file. * @param rasters * The rasters to build the VRT from. * @param command @@ -20,14 +18,22 @@ object GDALBuildVRT { * @return * A MosaicRaster object. */ - def executeVRT(outputPath: String, isTemp: Boolean, rasters: => Seq[MosaicRasterGDAL], command: String): MosaicRasterGDAL = { + def executeVRT(outputPath: String, rasters: Seq[MosaicRasterGDAL], command: String): MosaicRasterGDAL = { require(command.startsWith("gdalbuildvrt"), "Not a valid GDAL Build VRT command.") val vrtOptionsVec = OperatorOptions.parseOptions(command) val vrtOptions = new BuildVRTOptions(vrtOptionsVec) val result = gdal.BuildVRT(outputPath, rasters.map(_.getRaster).toArray, vrtOptions) + if (result == null) { + throw new Exception( + s""" + |Build VRT failed. + |Command: $command + |Error: ${gdal.GetLastErrorMsg} + |""".stripMargin) + } // TODO: Figure out multiple parents, should this be an array? 
// VRT files are just meta files, mem size doesnt make much sense so we keep -1 - MosaicRasterGDAL(result, outputPath, isTemp, rasters.head.getParentPath, "VRT", -1).flushCache() + MosaicRasterGDAL(result, outputPath, rasters.head.getParentPath, "VRT", -1).flushCache() } } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALCalc.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALCalc.scala index eca7f16d6..97a273d13 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALCalc.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALCalc.scala @@ -2,11 +2,22 @@ package com.databricks.labs.mosaic.core.raster.operator.gdal import com.databricks.labs.mosaic.core.raster.api.GDAL import com.databricks.labs.mosaic.core.raster.gdal.MosaicRasterGDAL +import com.databricks.labs.mosaic.utils.SysUtils /** GDALCalc is a helper object for executing GDAL Calc commands. */ object GDALCalc { - val gdal_calc = "/usr/lib/python3/dist-packages/osgeo_utils/gdal_calc.py" + val gdal_calc: String = { + val calcPath = SysUtils.runCommand("find / -iname gdal_calc.py")._1.split("\n").headOption.getOrElse("") + if (calcPath.isEmpty) { + throw new RuntimeException("Could not find gdal_calc.py.") + } + if (calcPath == "ERROR") { + "/usr/lib/python3/dist-packages/osgeo_utils/gdal_calc.py" + } else { + calcPath + } + } /** * Executes the GDAL Calc command. @@ -19,9 +30,17 @@ object GDALCalc { */ def executeCalc(gdalCalcCommand: String, resultPath: String): MosaicRasterGDAL = { require(gdalCalcCommand.startsWith("gdal_calc"), "Not a valid GDAL Calc command.") - import sys.process._ val toRun = gdalCalcCommand.replace("gdal_calc", gdal_calc) - s"python3 $toRun".!! 
+ val commandRes = SysUtils.runCommand(s"python3 $toRun") + if (commandRes._1 == "ERROR") { + throw new RuntimeException(s""" + |GDAL Calc command failed: + |STDOUT: + |${commandRes._2} + |STDERR: + |${commandRes._3} + |""".stripMargin) + } val result = GDAL.raster(resultPath, resultPath) result } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALTranslate.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALTranslate.scala index 0b44e006c..bf266cfbf 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALTranslate.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALTranslate.scala @@ -13,8 +13,6 @@ object GDALTranslate { * * @param outputPath * The output path of the translated file. - * @param isTemp - * Whether the output is a temp file. * @param raster * The raster to translate. * @param command @@ -22,13 +20,21 @@ object GDALTranslate { * @return * A MosaicRaster object. */ - def executeTranslate(outputPath: String, isTemp: Boolean, raster: => MosaicRasterGDAL, command: String): MosaicRasterGDAL = { + def executeTranslate(outputPath: String, raster: MosaicRasterGDAL, command: String): MosaicRasterGDAL = { require(command.startsWith("gdal_translate"), "Not a valid GDAL Translate command.") val translateOptionsVec = OperatorOptions.parseOptions(command) val translateOptions = new TranslateOptions(translateOptionsVec) val result = gdal.Translate(outputPath, raster.getRaster, translateOptions) + if (result == null) { + throw new Exception( + s""" + |Translate failed. 
+ |Command: $command + |Error: ${gdal.GetLastErrorMsg} + |""".stripMargin) + } val size = Files.size(Paths.get(outputPath)) - MosaicRasterGDAL(result, outputPath, isTemp, raster.getParentPath, raster.getDriversShortName, size).flushCache() + raster.copy(raster = result, path = outputPath, memSize = size).flushCache() } } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALWarp.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALWarp.scala index e8393d728..2b13a957b 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALWarp.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALWarp.scala @@ -13,8 +13,6 @@ object GDALWarp { * * @param outputPath * The output path of the warped file. - * @param isTemp - * Whether the output is a temp file. * @param rasters * The rasters to warp. * @param command @@ -22,7 +20,7 @@ object GDALWarp { * @return * A MosaicRaster object. */ - def executeWarp(outputPath: String, isTemp: Boolean, rasters: => Seq[MosaicRasterGDAL], command: String): MosaicRasterGDAL = { + def executeWarp(outputPath: String, rasters: Seq[MosaicRasterGDAL], command: String): MosaicRasterGDAL = { require(command.startsWith("gdalwarp"), "Not a valid GDAL Warp command.") // Test: gdal.ParseCommandLine(command) val warpOptionsVec = OperatorOptions.parseOptions(command) @@ -30,15 +28,21 @@ object GDALWarp { val result = gdal.Warp(outputPath, rasters.map(_.getRaster).toArray, warpOptions) // TODO: Figure out multiple parents, should this be an array? // Format will always be the same as the first raster + if (result == null) { + throw new Exception(s""" + |Warp failed. 
+ |Command: $command + |Error: ${gdal.GetLastErrorMsg} + |""".stripMargin) + } val size = Files.size(Paths.get(outputPath)) - MosaicRasterGDAL( - result, - outputPath, - isTemp, - rasters.head.getParentPath, - rasters.head.getDriversShortName, - size - ).flushCache() + rasters.head + .copy( + raster = result, + path = outputPath, + memSize = size + ) + .flushCache() } } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/merge/MergeBands.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/merge/MergeBands.scala index 2bd605445..6333c50c8 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/merge/MergeBands.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/merge/MergeBands.scala @@ -18,23 +18,20 @@ object MergeBands { * @return * A MosaicRaster object. */ - def merge(rasters: => Seq[MosaicRasterGDAL], resampling: String): MosaicRasterGDAL = { - val rasterUUID = java.util.UUID.randomUUID.toString + def merge(rasters: Seq[MosaicRasterGDAL], resampling: String): MosaicRasterGDAL = { val outShortName = rasters.head.getRaster.GetDriver.getShortName - val vrtPath = PathUtils.createTmpFilePath(rasterUUID, "vrt") - val rasterPath = PathUtils.createTmpFilePath(rasterUUID, "tif") + val vrtPath = PathUtils.createTmpFilePath("vrt") + val rasterPath = PathUtils.createTmpFilePath("tif") val vrtRaster = GDALBuildVRT.executeVRT( vrtPath, - isTemp = true, rasters, command = s"gdalbuildvrt -separate -resolution highest" ) val result = GDALTranslate.executeTranslate( rasterPath, - isTemp = true, vrtRaster, command = s"gdal_translate -r $resampling -of $outShortName -co COMPRESS=DEFLATE" ) @@ -57,23 +54,20 @@ object MergeBands { * @return * A MosaicRaster object. 
*/ - def merge(rasters: => Seq[MosaicRasterGDAL], pixel: (Double, Double), resampling: String): MosaicRasterGDAL = { - val rasterUUID = java.util.UUID.randomUUID.toString + def merge(rasters: Seq[MosaicRasterGDAL], pixel: (Double, Double), resampling: String): MosaicRasterGDAL = { val outShortName = rasters.head.getRaster.GetDriver.getShortName - val vrtPath = PathUtils.createTmpFilePath(rasterUUID, "vrt") - val rasterPath = PathUtils.createTmpFilePath(rasterUUID, "tif") + val vrtPath = PathUtils.createTmpFilePath("vrt") + val rasterPath = PathUtils.createTmpFilePath("tif") val vrtRaster = GDALBuildVRT.executeVRT( vrtPath, - isTemp = true, rasters, command = s"gdalbuildvrt -separate -resolution user -tr ${pixel._1} ${pixel._2}" ) val result = GDALTranslate.executeTranslate( rasterPath, - isTemp = true, vrtRaster, command = s"gdalwarp -r $resampling -of $outShortName -co COMPRESS=DEFLATE -overwrite" ) diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/merge/MergeRasters.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/merge/MergeRasters.scala index 08adf2053..694d9940a 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/merge/MergeRasters.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/merge/MergeRasters.scala @@ -16,23 +16,20 @@ object MergeRasters { * @return * A MosaicRaster object. 
*/ - def merge(rasters: => Seq[MosaicRasterGDAL]): MosaicRasterGDAL = { - val rasterUUID = java.util.UUID.randomUUID.toString + def merge(rasters: Seq[MosaicRasterGDAL]): MosaicRasterGDAL = { val outShortName = rasters.head.getRaster.GetDriver.getShortName - val vrtPath = PathUtils.createTmpFilePath(rasterUUID, "vrt") - val rasterPath = PathUtils.createTmpFilePath(rasterUUID, "tif") + val vrtPath = PathUtils.createTmpFilePath("vrt") + val rasterPath = PathUtils.createTmpFilePath("tif") val vrtRaster = GDALBuildVRT.executeVRT( vrtPath, - isTemp = true, rasters, command = s"gdalbuildvrt -resolution highest" ) val result = GDALTranslate.executeTranslate( rasterPath, - isTemp = true, vrtRaster, command = s"gdal_translate -r bilinear -of $outShortName -co COMPRESS=DEFLATE" ) diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/pixel/PixelCombineRasters.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/pixel/PixelCombineRasters.scala index 4462d01a3..5bf49fb96 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/pixel/PixelCombineRasters.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/pixel/PixelCombineRasters.scala @@ -19,26 +19,24 @@ object PixelCombineRasters { * @return * A MosaicRaster object. 
*/ - def combine(rasters: => Seq[MosaicRasterGDAL], pythonFunc: String, pythonFuncName: String): MosaicRasterGDAL = { - val rasterUUID = java.util.UUID.randomUUID.toString + def combine(rasters: Seq[MosaicRasterGDAL], pythonFunc: String, pythonFuncName: String): MosaicRasterGDAL = { val outShortName = rasters.head.getRaster.GetDriver.getShortName - val vrtPath = PathUtils.createTmpFilePath(rasterUUID, "vrt") - val rasterPath = PathUtils.createTmpFilePath(rasterUUID, "tif") + val vrtPath = PathUtils.createTmpFilePath("vrt") + val rasterPath = PathUtils.createTmpFilePath("tif") val vrtRaster = GDALBuildVRT.executeVRT( vrtPath, - isTemp = true, rasters, command = s"gdalbuildvrt -resolution highest" ) + vrtRaster.destroy() addPixelFunction(vrtPath, pythonFunc, pythonFuncName) val result = GDALTranslate.executeTranslate( rasterPath, - isTemp = true, - vrtRaster, + vrtRaster.refresh(), command = s"gdal_translate -r bilinear -of $outShortName -co COMPRESS=DEFLATE" ) diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/proj/RasterProject.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/proj/RasterProject.scala index a091c4495..efd7c8c67 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/proj/RasterProject.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/proj/RasterProject.scala @@ -25,10 +25,10 @@ object RasterProject { * @return * A projected raster. 
*/ - def project(raster: => MosaicRasterGDAL, destCRS: SpatialReference): MosaicRasterGDAL = { + def project(raster: MosaicRasterGDAL, destCRS: SpatialReference): MosaicRasterGDAL = { val outShortName = raster.getDriversShortName - val resultFileName = PathUtils.createTmpFilePath(raster.uuid.toString, GDAL.getExtension(outShortName)) + val resultFileName = PathUtils.createTmpFilePath(GDAL.getExtension(outShortName)) // Note that Null is the right value here val authName = destCRS.GetAuthorityName(null) @@ -36,7 +36,6 @@ object RasterProject { val result = GDALWarp.executeWarp( resultFileName, - isTemp = true, Seq(raster), command = s"gdalwarp -of $outShortName -t_srs $authName:$authCode -r cubic -overwrite -co COMPRESS=DEFLATE" ) diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/BalancedSubdivision.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/BalancedSubdivision.scala index 17cb39885..75e59c1fa 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/BalancedSubdivision.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/BalancedSubdivision.scala @@ -19,7 +19,7 @@ object BalancedSubdivision { * @return * The number of splits. */ - def getNumSplits(raster: => MosaicRasterGDAL, destSize: Int): Int = { + def getNumSplits(raster: MosaicRasterGDAL, destSize: Int): Int = { val size = raster.getMemSize val n = size.toDouble / (destSize * 1000 * 1000) val nInt = Math.ceil(n).toInt @@ -76,7 +76,7 @@ object BalancedSubdivision { * A sequence of MosaicRaster objects. 
*/ def splitRaster( - tile: => MosaicRasterTile, + tile: MosaicRasterTile, sizeInMb: Int ): Seq[MosaicRasterTile] = { val numSplits = getNumSplits(tile.getRaster, sizeInMb) diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/OverlappingTiles.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/OverlappingTiles.scala index 5897347ab..c1498ea05 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/OverlappingTiles.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/OverlappingTiles.scala @@ -28,7 +28,7 @@ object OverlappingTiles { * A sequence of MosaicRasterTile objects. */ def reTile( - tile: => MosaicRasterTile, + tile: MosaicRasterTile, tileWidth: Int, tileHeight: Int, overlapPercentage: Int @@ -46,14 +46,12 @@ object OverlappingTiles { val width = Math.min(tileWidth, xSize - i) val height = Math.min(tileHeight, ySize - j) - val uuid = java.util.UUID.randomUUID.toString val fileExtension = GDAL.getExtension(tile.getDriver) - val rasterPath = PathUtils.createTmpFilePath(uuid, fileExtension) + val rasterPath = PathUtils.createTmpFilePath(fileExtension) val shortName = raster.getRaster.GetDriver.getShortName val result = GDALTranslate.executeTranslate( rasterPath, - isTemp = true, raster, command = s"gdal_translate -of $shortName -srcwin $xOff $yOff $width $height" ) @@ -70,7 +68,7 @@ object OverlappingTiles { val (_, valid) = tiles.flatten.partition(_._1) - valid.map(t => new MosaicRasterTile(null, t._2, raster.getParentPath, raster.getDriversShortName)) + valid.map(t => MosaicRasterTile(null, t._2, raster.getParentPath, raster.getDriversShortName)) } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/RasterTessellate.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/RasterTessellate.scala index 8c5ce4f32..d186de0a5 100644 --- 
a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/RasterTessellate.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/RasterTessellate.scala @@ -27,7 +27,7 @@ object RasterTessellate { * @return * A sequence of MosaicRasterTile objects. */ - def tessellate(raster: => MosaicRasterGDAL, resolution: Int, indexSystem: IndexSystem, geometryAPI: GeometryAPI): Seq[MosaicRasterTile] = { + def tessellate(raster: MosaicRasterGDAL, resolution: Int, indexSystem: IndexSystem, geometryAPI: GeometryAPI): Seq[MosaicRasterTile] = { val indexSR = indexSystem.osrSpatialRef val bbox = raster.bbox(geometryAPI, indexSR) val cells = Mosaic.mosaicFill(bbox, resolution, keepCoreGeom = false, indexSystem, geometryAPI) @@ -38,13 +38,12 @@ object RasterTessellate { val cellID = cell.cellIdAsLong(indexSystem) val isValidCell = indexSystem.isValid(cellID) if (!isValidCell) { - (false, new MosaicRasterTile(cell.index, null, "", "")) + (false, MosaicRasterTile(cell.index, null, "", "")) } else { val cellRaster = tmpRaster.getRasterForCell(cellID, indexSystem, geometryAPI) val isValidRaster = cellRaster.getBandStats.values.map(_("mean")).sum > 0 && !cellRaster.isEmpty ( - isValidRaster, - new MosaicRasterTile(cell.index, cellRaster, raster.getParentPath, raster.getDriversShortName) + isValidRaster, MosaicRasterTile(cell.index, cellRaster, raster.getParentPath, raster.getDriversShortName) ) } }) diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/ReTile.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/ReTile.scala index e03712467..edaab4720 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/ReTile.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/retile/ReTile.scala @@ -1,7 +1,7 @@ package com.databricks.labs.mosaic.core.raster.operator.retile import com.databricks.labs.mosaic.core.raster.io.RasterCleaner.dispose -import 
com.databricks.labs.mosaic.core.raster.operator.gdal.GDALTranslate +import com.databricks.labs.mosaic.core.raster.operator.gdal.{GDALBuildVRT, GDALTranslate} import com.databricks.labs.mosaic.core.types.model.MosaicRasterTile import com.databricks.labs.mosaic.utils.PathUtils @@ -22,7 +22,7 @@ object ReTile { * A sequence of MosaicRasterTile objects. */ def reTile( - tile: => MosaicRasterTile, + tile: MosaicRasterTile, tileWidth: Int, tileHeight: Int ): Seq[MosaicRasterTile] = { @@ -37,14 +37,12 @@ object ReTile { val xOffset = if (xMin + tileWidth + 1 > xR) xR - xMin else tileWidth + 1 val yOffset = if (yMin + tileHeight + 1 > yR) yR - yMin else tileHeight + 1 - val rasterUUID = java.util.UUID.randomUUID.toString val fileExtension = raster.getRasterFileExtension - val rasterPath = PathUtils.createTmpFilePath(rasterUUID, fileExtension) + val rasterPath = PathUtils.createTmpFilePath(fileExtension) val shortDriver = raster.getDriversShortName val result = GDALTranslate.executeTranslate( rasterPath, - isTemp = true, raster, command = s"gdal_translate -of $shortDriver -srcwin $xMin $yMin $xOffset $yOffset -co COMPRESS=DEFLATE" ) @@ -59,7 +57,7 @@ object ReTile { val (_, valid) = tiles.partition(_._1) - valid.map(t => new MosaicRasterTile(null, t._2, raster.getParentPath, raster.getDriversShortName)) + valid.map(t => MosaicRasterTile(null, t._2, raster.getParentPath, raster.getDriversShortName)) } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/types/model/MosaicRasterTile.scala b/src/main/scala/com/databricks/labs/mosaic/core/types/model/MosaicRasterTile.scala index 48435d7da..e7a8e9218 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/types/model/MosaicRasterTile.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/types/model/MosaicRasterTile.scala @@ -19,9 +19,9 @@ import org.apache.spark.unsafe.types.UTF8String * @param driver * Driver used to read the raster. 
*/ -class MosaicRasterTile( +case class MosaicRasterTile( index: Either[Long, String], - raster: => MosaicRasterGDAL, + raster: MosaicRasterGDAL, parentPath: String, driver: String ) { @@ -55,13 +55,13 @@ class MosaicRasterTile( (indexSystem.getCellIdDataType, index) match { case (_: LongType, Left(_)) => this case (_: StringType, Right(_)) => this - case (_: LongType, Right(value)) => new MosaicRasterTile( + case (_: LongType, Right(value)) => MosaicRasterTile( index = Left(indexSystem.parse(value)), raster = raster, parentPath = parentPath, driver = driver ) - case (_: StringType, Left(value)) => new MosaicRasterTile( + case (_: StringType, Left(value)) => MosaicRasterTile( index = Right(indexSystem.format(value)), raster = raster, parentPath = parentPath, @@ -162,12 +162,12 @@ object MosaicRasterTile { // noinspection TypeCheckCanBeMatch if (Option(index).isDefined) { if (index.isInstanceOf[Long]) { - new MosaicRasterTile(Left(index.asInstanceOf[Long]), raster, parentPath, driver) + MosaicRasterTile(Left(index.asInstanceOf[Long]), raster, parentPath, driver) } else { - new MosaicRasterTile(Right(index.asInstanceOf[UTF8String].toString), raster, parentPath, driver) + MosaicRasterTile(Right(index.asInstanceOf[UTF8String].toString), raster, parentPath, driver) } } else { - new MosaicRasterTile(null, raster, parentPath, driver) + MosaicRasterTile(null, raster, parentPath, driver) } } diff --git a/src/main/scala/com/databricks/labs/mosaic/datasource/OGRFileFormat.scala b/src/main/scala/com/databricks/labs/mosaic/datasource/OGRFileFormat.scala index b1b6b78b4..166151678 100644 --- a/src/main/scala/com/databricks/labs/mosaic/datasource/OGRFileFormat.scala +++ b/src/main/scala/com/databricks/labs/mosaic/datasource/OGRFileFormat.scala @@ -366,8 +366,8 @@ object OGRFileFormat extends Serializable { * @return * the data source */ - def getDataSource(driverName: String, path: String, useZipPath: Boolean): org.gdal.ogr.DataSource = { - val cleanPath = 
PathUtils.getCleanPath(path, useZipPath) + def getDataSource(driverName: String, path: String): org.gdal.ogr.DataSource = { + val cleanPath = PathUtils.getCleanPath(path) // 0 is for no update driver if (driverName.nonEmpty) { ogr.GetDriverByName(driverName).Open(cleanPath, 0) @@ -398,10 +398,9 @@ object OGRFileFormat extends Serializable { val layerN = options.getOrElse("layerNumber", "0").toInt val layerName = options.getOrElse("layerName", "") val inferenceLimit = options.getOrElse("inferenceLimit", "200").toInt - val useZipPath = options.getOrElse("vsizip", "false").toBoolean val asWKB = options.getOrElse("asWKB", "false").toBoolean - val dataset = getDataSource(driverName, path, useZipPath) + val dataset = getDataSource(driverName, path) val resolvedLayerName = if (layerName.isEmpty) dataset.GetLayer(layerN).GetName() else layerName val layer = dataset.GetLayer(resolvedLayerName) layer.ResetReading() @@ -454,10 +453,9 @@ object OGRFileFormat extends Serializable { val layerN = options.getOrElse("layerNumber", "0").toInt val layerName = options.getOrElse("layerName", "") - val useZipPath = options.getOrElse("vsizip", "false").toBoolean val asWKB = options.getOrElse("asWKB", "false").toBoolean val path = file.filePath - val dataset = getDataSource(driverName, path, useZipPath) + val dataset = getDataSource(driverName, path) val resolvedLayerName = if (layerName.isEmpty) dataset.GetLayer(layerN).GetName() else layerName val layer = dataset.GetLayerByName(resolvedLayerName) layer.ResetReading() diff --git a/src/main/scala/com/databricks/labs/mosaic/datasource/gdal/ReTileOnRead.scala b/src/main/scala/com/databricks/labs/mosaic/datasource/gdal/ReTileOnRead.scala index acd53e535..285df2191 100644 --- a/src/main/scala/com/databricks/labs/mosaic/datasource/gdal/ReTileOnRead.scala +++ b/src/main/scala/com/databricks/labs/mosaic/datasource/gdal/ReTileOnRead.scala @@ -8,11 +8,14 @@ import com.databricks.labs.mosaic.core.types.RasterTileType import 
com.databricks.labs.mosaic.core.types.model.MosaicRasterTile import com.databricks.labs.mosaic.datasource.Utils import com.databricks.labs.mosaic.datasource.gdal.GDALFileFormat._ +import com.databricks.labs.mosaic.utils.PathUtils import org.apache.hadoop.fs.{FileStatus, FileSystem} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types._ +import java.nio.file.{Files, Paths} + /** An object defining the retiling read strategy for the GDAL file format. */ object ReTileOnRead extends ReadStrategy { @@ -81,7 +84,8 @@ object ReTileOnRead extends ReadStrategy { val uuid = getUUID(status) val sizeInMB = options.getOrElse("sizeInMB", "16").toInt - val tiles = localSubdivide(inPath, sizeInMB) + val tmpPath = PathUtils.copyToTmp(inPath) + val tiles = localSubdivide(tmpPath, inPath, sizeInMB) val rows = tiles.map(tile => { val trimmedSchema = StructType(requiredSchema.filter(field => field.name != TILE)) @@ -104,6 +108,8 @@ object ReTileOnRead extends ReadStrategy { row }) + Files.deleteIfExists(Paths.get(tmpPath)) + rows.iterator } @@ -117,9 +123,10 @@ object ReTileOnRead extends ReadStrategy { * @return * A tuple of the raster and the tiles. 
*/ - def localSubdivide(inPath: String, sizeInMB: Int): Seq[MosaicRasterTile] = { - val raster = MosaicRasterGDAL.readRaster(inPath, inPath) - val inTile = new MosaicRasterTile(null, raster, inPath, raster.getDriversShortName) + def localSubdivide(inPath: String, parentPath: String, sizeInMB: Int): Seq[MosaicRasterTile] = { + val cleanPath = PathUtils.getCleanPath(inPath) + val raster = MosaicRasterGDAL.readRaster(cleanPath, parentPath) + val inTile = new MosaicRasterTile(null, raster, parentPath, raster.getDriversShortName) val tiles = BalancedSubdivision.splitRaster(inTile, sizeInMB) RasterCleaner.dispose(raster) RasterCleaner.dispose(inTile) diff --git a/src/main/scala/com/databricks/labs/mosaic/datasource/gdal/ReadInMemory.scala b/src/main/scala/com/databricks/labs/mosaic/datasource/gdal/ReadInMemory.scala index 381804ff9..540afa8a7 100644 --- a/src/main/scala/com/databricks/labs/mosaic/datasource/gdal/ReadInMemory.scala +++ b/src/main/scala/com/databricks/labs/mosaic/datasource/gdal/ReadInMemory.scala @@ -6,6 +6,7 @@ import com.databricks.labs.mosaic.core.raster.io.RasterCleaner import com.databricks.labs.mosaic.core.types.RasterTileType import com.databricks.labs.mosaic.datasource.Utils import com.databricks.labs.mosaic.datasource.gdal.GDALFileFormat._ +import com.databricks.labs.mosaic.utils.PathUtils import org.apache.hadoop.fs.{FileStatus, FileSystem} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow diff --git a/src/main/scala/com/databricks/labs/mosaic/datasource/multiread/OGRMultiReadDataFrameReader.scala b/src/main/scala/com/databricks/labs/mosaic/datasource/multiread/OGRMultiReadDataFrameReader.scala index a3ce92180..4947b9134 100644 --- a/src/main/scala/com/databricks/labs/mosaic/datasource/multiread/OGRMultiReadDataFrameReader.scala +++ b/src/main/scala/com/databricks/labs/mosaic/datasource/multiread/OGRMultiReadDataFrameReader.scala @@ -36,9 +36,8 @@ class OGRMultiReadDataFrameReader(sparkSession: 
SparkSession) extends MosaicData val layerNumber = config("layerNumber").toInt val layerName = config("layerName") val chunkSize = config("chunkSize").toInt - val vsizip = config("vsizip").toBoolean - val ds = OGRFileFormat.getDataSource(driverName, headPath, vsizip) + val ds = OGRFileFormat.getDataSource(driverName, headPath) val layer = OGRFileFormat.getLayer(ds, layerNumber, layerName) val partitionCount = 1 + (layer.GetFeatureCount / chunkSize) diff --git a/src/main/scala/com/databricks/labs/mosaic/datasource/multiread/RasterAsGridReader.scala b/src/main/scala/com/databricks/labs/mosaic/datasource/multiread/RasterAsGridReader.scala index 4cb39066a..c1f805afa 100644 --- a/src/main/scala/com/databricks/labs/mosaic/datasource/multiread/RasterAsGridReader.scala +++ b/src/main/scala/com/databricks/labs/mosaic/datasource/multiread/RasterAsGridReader.scala @@ -19,12 +19,11 @@ class RasterAsGridReader(sparkSession: SparkSession) extends MosaicDataFrameRead private val mc = MosaicContext.context() import mc.functions._ - val vsizipPathColF: Column => Column = - (path: Column) => - when( - path.endsWith(".zip"), - concat(lit("/vsizip/"), path) - ).otherwise(path) + def getNPartitions(config: Map[String, String]): Int = { + val shufflePartitions = sparkSession.conf.get("spark.sql.shuffle.partitions") + val nPartitions = config.getOrElse("nPartitions", shufflePartitions).toInt + nPartitions + } override def load(path: String): DataFrame = load(Seq(path): _*) @@ -32,12 +31,14 @@ class RasterAsGridReader(sparkSession: SparkSession) extends MosaicDataFrameRead val config = getConfig val resolution = config("resolution").toInt + val nPartitions = getNPartitions(config) val pathsDf = sparkSession.read .format("gdal") .option("extensions", config("extensions")) .option("raster_storage", "in-memory") .load(paths: _*) + .repartition(nPartitions) val rasterToGridCombiner = getRasterToGridFunc(config("combiner")) @@ -61,6 +62,7 @@ class RasterAsGridReader(sparkSession: SparkSession) 
extends MosaicDataFrameRead col("band_id"), explode(col("grid_measures")).alias("grid_measures") ) + .repartition(nPartitions) .select( col("band_id"), col("grid_measures").getItem("cellID").alias("cell_id"), @@ -87,12 +89,15 @@ class RasterAsGridReader(sparkSession: SparkSession) extends MosaicDataFrameRead private def retileRaster(rasterDf: DataFrame, config: Map[String, String]) = { val retile = config("retile").toBoolean val tileSize = config("tileSize").toInt + val nPartitions = getNPartitions(config) if (retile) { - rasterDf.withColumn( - "tile", - rst_retile(col("tile"), lit(tileSize), lit(tileSize)) - ) + rasterDf + .withColumn( + "tile", + rst_retile(col("tile"), lit(tileSize), lit(tileSize)) + ) + .repartition(nPartitions) } else { rasterDf } @@ -141,6 +146,7 @@ class RasterAsGridReader(sparkSession: SparkSession) extends MosaicDataFrameRead */ private def kRingResample(rasterDf: DataFrame, config: Map[String, String]) = { val k = config("kRingInterpolate").toInt + val nPartitions = getNPartitions(config) def weighted_sum(measureCol: String, weightCol: String) = { sum(col(measureCol) * col(weightCol)) / sum(col(weightCol)) @@ -150,6 +156,7 @@ class RasterAsGridReader(sparkSession: SparkSession) extends MosaicDataFrameRead rasterDf .withColumn("origin_cell_id", col("cell_id")) .withColumn("cell_id", explode(grid_cellkring(col("origin_cell_id"), k))) + .repartition(nPartitions) .withColumn("weight", lit(k + 1) - grid_distance(col("origin_cell_id"), col("cell_id"))) .groupBy("band_id", "cell_id") .agg(weighted_sum("measure", "weight")) diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_BandMetaData.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_BandMetaData.scala index cf9bd60ba..241d913bc 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_BandMetaData.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_BandMetaData.scala @@ -40,7 +40,7 @@ case class 
RST_BandMetaData(raster: Expression, band: Expression, expressionConf * @return * The band metadata of the band as a map type result. */ - override def bandTransform(raster: => MosaicRasterTile, band: MosaicRasterBandGDAL): Any = { + override def bandTransform(raster: MosaicRasterTile, band: MosaicRasterBandGDAL): Any = { buildMapString(band.metadata) } } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_BoundingBox.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_BoundingBox.scala index 397d3ee8e..dad890b74 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_BoundingBox.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_BoundingBox.scala @@ -28,7 +28,7 @@ case class RST_BoundingBox( * @return * The bounding box of the raster as a WKB polygon. */ - override def rasterTransform(tile: => MosaicRasterTile): Any = { + override def rasterTransform(tile: MosaicRasterTile): Any = { val raster = tile.getRaster val gt = raster.getRaster.GetGeoTransform() val (originX, originY) = GDAL.toWorldCoord(gt, 0, 0) diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Clip.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Clip.scala index 7bad9b5d1..5e1f5b4ac 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Clip.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Clip.scala @@ -38,16 +38,11 @@ case class RST_Clip( * @return * The clipped raster. 
*/ - override def rasterTransform(tile: => MosaicRasterTile, arg1: Any): Any = { + override def rasterTransform(tile: MosaicRasterTile, arg1: Any): Any = { val geometry = geometryAPI.geometry(arg1, geometryExpr.dataType) val geomCRS = geometry.getSpatialReferenceOSR val clipped = RasterClipByVector.clip(tile.getRaster, geometry, geomCRS, geometryAPI) - new MosaicRasterTile( - tile.getIndex, - clipped, - tile.getParentPath, - tile.getDriver - ) + tile.copy(raster = clipped) } } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_CombineAvg.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_CombineAvg.scala index adb4974ee..1d923fdc1 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_CombineAvg.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_CombineAvg.scala @@ -24,9 +24,9 @@ case class RST_CombineAvg( with CodegenFallback { /** Combines the rasters using average of pixels. */ - override def rasterTransform(tiles: => Seq[MosaicRasterTile]): Any = { + override def rasterTransform(tiles: Seq[MosaicRasterTile]): Any = { val index = if (tiles.map(_.getIndex).groupBy(identity).size == 1) tiles.head.getIndex else null - new MosaicRasterTile( + MosaicRasterTile( index, CombineAVG.compute(tiles.map(_.getRaster)), tiles.head.getParentPath, @@ -39,7 +39,7 @@ case class RST_CombineAvg( /** Expression info required for the expression registration for spark SQL. 
*/ object RST_CombineAvg extends WithExpressionInfo { - override def name: String = "rst_combine_avg" + override def name: String = "rst_combineavg" override def usage: String = """ diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_CombineAvgAgg.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_CombineAvgAgg.scala index 767275953..c24680977 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_CombineAvgAgg.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_CombineAvgAgg.scala @@ -65,8 +65,6 @@ case class RST_CombineAvgAgg( if (buffer.isEmpty) { null - } else if (buffer.size == 1) { - buffer.head } else { // Do do move the expression @@ -79,7 +77,7 @@ case class RST_CombineAvgAgg( val parentPath = tiles.head.getParentPath val driver = tiles.head.getDriver - val result = new MosaicRasterTile(idx, combined, parentPath, driver) + val result = MosaicRasterTile(idx, combined, parentPath, driver) .formatCellId(IndexSystemFactory.getIndexSystem(expressionConfig.getIndexSystem)) .serialize(BinaryType, expressionConfig.getRasterCheckpoint) diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBand.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBand.scala new file mode 100644 index 000000000..c1d9ea15a --- /dev/null +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBand.scala @@ -0,0 +1,76 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.raster.operator.pixel.PixelCombineRasters +import com.databricks.labs.mosaic.core.types.RasterTileType +import com.databricks.labs.mosaic.core.types.model.MosaicRasterTile +import com.databricks.labs.mosaic.expressions.base.{GenericExpressionFactory, WithExpressionInfo} +import com.databricks.labs.mosaic.expressions.raster.base.RasterArray2ArgExpression +import 
com.databricks.labs.mosaic.functions.MosaicExpressionConfig +import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.{Expression, NullIntolerant} +import org.apache.spark.unsafe.types.UTF8String + +/** Expression for combining rasters using a provided python function. */ +case class RST_DerivedBand( + rastersExpr: Expression, + pythonFuncExpr: Expression, + funcNameExpr: Expression, + expressionConfig: MosaicExpressionConfig +) extends RasterArray2ArgExpression[RST_DerivedBand]( + rastersExpr, + pythonFuncExpr, + funcNameExpr, + RasterTileType(expressionConfig.getCellIdType), + returnsRaster = true, + expressionConfig = expressionConfig + ) + with NullIntolerant + with CodegenFallback { + + /** Combines the rasters using the provided python function. */ + override def rasterTransform(tiles: Seq[MosaicRasterTile], arg1: Any, arg2: Any): Any = { + val pythonFunc = arg1.asInstanceOf[UTF8String].toString + val funcName = arg2.asInstanceOf[UTF8String].toString + val index = if (tiles.map(_.getIndex).groupBy(identity).size == 1) tiles.head.getIndex else null + val result = PixelCombineRasters.combine(tiles.map(_.getRaster), pythonFunc, funcName) + MosaicRasterTile( + index, + result, + tiles.head.getParentPath, + tiles.head.getDriver + ) + } + +} + +/** Expression info required for the expression registration for spark SQL. */ +object RST_DerivedBand extends WithExpressionInfo { + + override def name: String = "rst_derivedband" + + override def usage: String = + """ + |_FUNC_(expr1) - Returns a raster that is a result of combining an array of rasters using provided python function.
+ |""".stripMargin + + override def example: String = + """ + | Examples: + | > SELECT _FUNC_( + | array(raster_tile_1, raster_tile_2, raster_tile_3), + | 'def average(in_ar, out_ar, xoff, yoff, xsize, ysize, raster_xsize, raster_ysize, buf_radius, gt, **kwargs): + | out_ar[:] = np.sum(in_ar, axis=0) / len(in_ar) + | ', + | 'average' + | ); + | {index_id, raster, parent_path, driver} + | {index_id, raster, parent_path, driver} + | ... + | """.stripMargin + + override def builder(expressionConfig: MosaicExpressionConfig): FunctionBuilder = { + GenericExpressionFactory.getBaseBuilder[RST_DerivedBand](3, expressionConfig) + } + +} diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBandAgg.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBandAgg.scala new file mode 100644 index 000000000..aa85362c6 --- /dev/null +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBandAgg.scala @@ -0,0 +1,148 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.index.IndexSystemFactory +import com.databricks.labs.mosaic.core.raster.api.GDAL +import com.databricks.labs.mosaic.core.raster.io.RasterCleaner +import com.databricks.labs.mosaic.core.raster.operator.pixel.PixelCombineRasters +import com.databricks.labs.mosaic.core.types.RasterTileType +import com.databricks.labs.mosaic.core.types.model.MosaicRasterTile +import com.databricks.labs.mosaic.expressions.raster.base.RasterExpressionSerialization +import com.databricks.labs.mosaic.functions.MosaicExpressionConfig +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.aggregate.{ImperativeAggregate, TypedImperativeAggregate} +import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionInfo, UnsafeProjection, UnsafeRow} +import org.apache.spark.sql.catalyst.trees.TernaryLike +import org.apache.spark.sql.catalyst.util.GenericArrayData +import 
org.apache.spark.sql.types.{ArrayType, BinaryType, DataType} +import org.apache.spark.unsafe.types.UTF8String + +import scala.collection.mutable.ArrayBuffer + +/** + * Returns a new raster that is a result of combining an array of rasters using + * a provided python function. + */ +//noinspection DuplicatedCode +case class RST_DerivedBandAgg( + rasterExpr: Expression, + pythonFuncExpr: Expression, + funcNameExpr: Expression, + expressionConfig: MosaicExpressionConfig, + mutableAggBufferOffset: Int = 0, + inputAggBufferOffset: Int = 0 +) extends TypedImperativeAggregate[ArrayBuffer[Any]] + with TernaryLike[Expression] + with RasterExpressionSerialization { + + GDAL.enable() + + override lazy val deterministic: Boolean = true + override val nullable: Boolean = false + override val dataType: DataType = RasterTileType(expressionConfig.getCellIdType) + override def prettyName: String = "rst_combine_avg_agg" + + private lazy val projection = UnsafeProjection.create(Array[DataType](ArrayType(elementType = dataType, containsNull = false))) + private lazy val row = new UnsafeRow(1) + + override def first: Expression = rasterExpr + override def second: Expression = pythonFuncExpr + override def third: Expression = funcNameExpr + + def update(buffer: ArrayBuffer[Any], input: InternalRow): ArrayBuffer[Any] = { + val value = first.eval(input) + buffer += InternalRow.copyValue(value) + buffer + } + + def merge(buffer: ArrayBuffer[Any], input: ArrayBuffer[Any]): ArrayBuffer[Any] = { + buffer ++= input + } + + override def createAggregationBuffer(): ArrayBuffer[Any] = ArrayBuffer.empty + + override def withNewInputAggBufferOffset(newInputAggBufferOffset: Int): ImperativeAggregate = + copy(inputAggBufferOffset = newInputAggBufferOffset) + + override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): ImperativeAggregate = + copy(mutableAggBufferOffset = newMutableAggBufferOffset) + + override def eval(buffer: ArrayBuffer[Any]): Any = { + GDAL.enable() + + if (buffer.isEmpty)
{ + null + } else { + + // This works for Literals only + val pythonFunc = pythonFuncExpr.eval(null).asInstanceOf[UTF8String].toString + val funcName = funcNameExpr.eval(null).asInstanceOf[UTF8String].toString + + // Do not move the expression + val tiles = buffer.map(row => MosaicRasterTile.deserialize(row.asInstanceOf[InternalRow], expressionConfig.getCellIdType)) + + // If merging multiple index rasters, the index value is dropped + val idx = if (tiles.map(_.getIndex).groupBy(identity).size == 1) tiles.head.getIndex else null + + val combined = PixelCombineRasters.combine(tiles.map(_.getRaster), pythonFunc, funcName) + // TODO: should parent path be an array? + val parentPath = tiles.head.getParentPath + val driver = tiles.head.getDriver + + val result = MosaicRasterTile(idx, combined, parentPath, driver) + .formatCellId(IndexSystemFactory.getIndexSystem(expressionConfig.getIndexSystem)) + .serialize(BinaryType, expressionConfig.getRasterCheckpoint) + + tiles.foreach(RasterCleaner.dispose(_)) + RasterCleaner.dispose(result) + + result + } + } + + override def serialize(obj: ArrayBuffer[Any]): Array[Byte] = { + val array = new GenericArrayData(obj.toArray) + projection.apply(InternalRow.apply(array)).getBytes + } + + override def deserialize(bytes: Array[Byte]): ArrayBuffer[Any] = { + val buffer = createAggregationBuffer() + row.pointTo(bytes, bytes.length) + row.getArray(0).foreach(dataType, (_, x: Any) => buffer += x) + buffer + } + + override protected def withNewChildrenInternal(newFirst: Expression, newSecond: Expression, newThird: Expression): RST_DerivedBandAgg = + copy(rasterExpr = newFirst, pythonFuncExpr = newSecond, funcNameExpr = newThird) + +} + +/** Expression info required for the expression registration for spark SQL.
*/ +object RST_DerivedBandAgg { + + def registryExpressionInfo(db: Option[String]): ExpressionInfo = + new ExpressionInfo( + classOf[RST_DerivedBandAgg].getCanonicalName, + db.orNull, + "rst_derived_band_agg", + """ + | _FUNC_(tiles)) - Combines rasters into a single raster using provided python function. + """.stripMargin, + "", + """ + | Examples: + | > SELECT _FUNC_(raster_tile, + | 'def average(in_ar, out_ar, xoff, yoff, xsize, ysize, raster_xsize, raster_ysize, buf_radius, gt, **kwargs): + | out_ar[:] = np.sum(in_ar, axis=0) / len(in_ar) + | ', + | 'average' + | ); + | {index_id, raster, parent_path, driver} + | """.stripMargin, + "", + "agg_funcs", + "1.0", + "", + "built-in" + ) + +} diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_FromBands.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_FromBands.scala index fdb4bfdf0..e29238176 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_FromBands.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_FromBands.scala @@ -30,9 +30,9 @@ case class RST_FromBands( * @return * The stacked and resampled raster. 
*/ - override def rasterTransform(rasters: => Seq[MosaicRasterTile]): Any = { + override def rasterTransform(rasters: Seq[MosaicRasterTile]): Any = { val raster = MergeBands.merge(rasters.map(_.getRaster), "bilinear") - new MosaicRasterTile(rasters.head.getIndex, raster, rasters.head.getParentPath, rasters.head.getDriver) + rasters.head.copy(raster = raster) } } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_FromFile.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_FromFile.scala index fa69cfcfa..5d13f49bb 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_FromFile.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_FromFile.scala @@ -10,6 +10,7 @@ import com.databricks.labs.mosaic.core.types.model.MosaicRasterTile import com.databricks.labs.mosaic.datasource.gdal.ReTileOnRead import com.databricks.labs.mosaic.expressions.base.{GenericExpressionFactory, WithExpressionInfo} import com.databricks.labs.mosaic.functions.MosaicExpressionConfig +import com.databricks.labs.mosaic.utils.PathUtils import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback @@ -17,6 +18,8 @@ import org.apache.spark.sql.catalyst.expressions.{CollectionGenerator, Expressio import org.apache.spark.sql.types.{DataType, IntegerType, StructField, StructType} import org.apache.spark.unsafe.types.UTF8String +import java.nio.file.{Files, Paths, StandardCopyOption} + /** * The raster for construction of a raster tile. This should be the first * expression in the expression tree for a raster tile. 
@@ -59,18 +62,24 @@ case class RST_FromFile( override def eval(input: InternalRow): TraversableOnce[InternalRow] = { GDAL.enable() val path = rasterPathExpr.eval(input).asInstanceOf[UTF8String].toString + val driver = MosaicRasterGDAL.identifyDriver(path) + val tmpPath = PathUtils.createTmpFilePath(GDAL.getExtension(driver)) + val readPath = PathUtils.getCleanPath(path) + Files.copy(Paths.get(readPath), Paths.get(tmpPath), StandardCopyOption.REPLACE_EXISTING) val targetSize = sizeInMB.eval(input).asInstanceOf[Int] if (targetSize <= 0) { - val raster = MosaicRasterGDAL.readRaster(path, path) - val tile = new MosaicRasterTile(null, raster, path, raster.getDriversShortName) + val raster = MosaicRasterGDAL.readRaster(tmpPath, path) + val tile = MosaicRasterTile(null, raster, path, raster.getDriversShortName) val row = tile.formatCellId(indexSystem).serialize() RasterCleaner.dispose(raster) RasterCleaner.dispose(tile) + Files.deleteIfExists(Paths.get(tmpPath)) Seq(InternalRow.fromSeq(Seq(row))) } else { - val tiles = ReTileOnRead.localSubdivide(path, targetSize) + val tiles = ReTileOnRead.localSubdivide(tmpPath, path, targetSize) val rows = tiles.map(_.formatCellId(indexSystem).serialize()) tiles.foreach(RasterCleaner.dispose(_)) + Files.deleteIfExists(Paths.get(tmpPath)) rows.map(row => InternalRow.fromSeq(Seq(row))) } } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_GeoReference.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_GeoReference.scala index 72a33e41c..fef2a4c32 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_GeoReference.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_GeoReference.scala @@ -16,7 +16,7 @@ case class RST_GeoReference(raster: Expression, expressionConfig: MosaicExpressi with CodegenFallback { /** Returns the georeference of the raster. 
*/ - override def rasterTransform(tile: => MosaicRasterTile): Any = { + override def rasterTransform(tile: MosaicRasterTile): Any = { val raster = tile.getRaster val geoTransform = raster.getRaster.GetGeoTransform() buildMapDouble( diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetNoData.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetNoData.scala index 25c1a0442..8f10b89cb 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetNoData.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetNoData.scala @@ -31,7 +31,7 @@ case class RST_GetNoData( * @return * The no data value of the raster. */ - override def rasterTransform(tile: => MosaicRasterTile): Any = { + override def rasterTransform(tile: MosaicRasterTile): Any = { ArrayData.toArrayData(tile.getRaster.getBands.map(_.noDataValue)) } @@ -40,7 +40,7 @@ case class RST_GetNoData( /** Expression info required for the expression registration for spark SQL. */ object RST_GetNoData extends WithExpressionInfo { - override def name: String = "rst_get_no_data" + override def name: String = "rst_getnodata" override def usage: String = """ diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetSubdataset.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetSubdataset.scala index 1449bf6f3..3c8af03d8 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetSubdataset.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetSubdataset.scala @@ -23,10 +23,10 @@ case class RST_GetSubdataset(raster: Expression, subsetName: Expression, express with CodegenFallback { /** Returns the subdatasets of the raster. 
*/ - override def rasterTransform(tile: => MosaicRasterTile, arg1: Any): Any = { + override def rasterTransform(tile: MosaicRasterTile, arg1: Any): Any = { val subsetName = arg1.asInstanceOf[UTF8String].toString val subdataset = tile.getRaster.getSubdataset(subsetName) - new MosaicRasterTile(tile.getIndex, subdataset, tile.getParentPath, tile.getDriver) + tile.copy(raster = subdataset) } } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Height.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Height.scala index 02a6da249..ceb638f29 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Height.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Height.scala @@ -16,7 +16,7 @@ case class RST_Height(raster: Expression, expressionConfig: MosaicExpressionConf with CodegenFallback { /** Returns the width of the raster. */ - override def rasterTransform(tile: => MosaicRasterTile): Any = tile.getRaster.ySize + override def rasterTransform(tile: MosaicRasterTile): Any = tile.getRaster.ySize } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_InitNoData.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_InitNoData.scala index 604bba92a..ba5831424 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_InitNoData.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_InitNoData.scala @@ -33,25 +33,19 @@ case class RST_InitNoData( * @return * The raster with initialized no data values. 
*/ - override def rasterTransform(tile: => MosaicRasterTile): Any = { + override def rasterTransform(tile: MosaicRasterTile): Any = { val noDataValues = tile.getRaster.getBands.map(_.noDataValue).mkString(" ") val dstNoDataValues = tile.getRaster.getBands .map(_.getBand.getDataType) .map(GDAL.getNoDataConstant) .mkString(" ") - val resultPath = PathUtils.createTmpFilePath(tile.getRaster.uuid.toString, GDAL.getExtension(tile.getDriver)) + val resultPath = PathUtils.createTmpFilePath(GDAL.getExtension(tile.getDriver)) val result = GDALWarp.executeWarp( resultPath, - isTemp = true, Seq(tile.getRaster), command = s"""gdalwarp -of ${tile.getDriver} -dstnodata "$dstNoDataValues" -srcnodata "$noDataValues"""" ) - new MosaicRasterTile( - tile.getIndex, - result, - tile.getParentPath, - tile.getDriver - ) + tile.copy(raster = result) } } @@ -59,7 +53,7 @@ case class RST_InitNoData( /** Expression info required for the expression registration for spark SQL. */ object RST_InitNoData extends WithExpressionInfo { - override def name: String = "rst_init_no_data" + override def name: String = "rst_initnodata" override def usage: String = """ diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_IsEmpty.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_IsEmpty.scala index b54b63b55..d4fac7209 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_IsEmpty.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_IsEmpty.scala @@ -16,7 +16,7 @@ case class RST_IsEmpty(raster: Expression, expressionConfig: MosaicExpressionCon with CodegenFallback { /** Returns true if the raster is empty. 
*/ - override def rasterTransform(tile: => MosaicRasterTile): Any = { + override def rasterTransform(tile: MosaicRasterTile): Any = { val raster = tile.getRaster (raster.ySize == 0 && raster.xSize == 0) || raster.isEmpty } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_MapAlgebra.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_MapAlgebra.scala new file mode 100644 index 000000000..69461ebd4 --- /dev/null +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_MapAlgebra.scala @@ -0,0 +1,122 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.raster.api.GDAL +import com.databricks.labs.mosaic.core.raster.operator.gdal.GDALCalc +import com.databricks.labs.mosaic.core.types.RasterTileType +import com.databricks.labs.mosaic.core.types.model.MosaicRasterTile +import com.databricks.labs.mosaic.expressions.base.{GenericExpressionFactory, WithExpressionInfo} +import com.databricks.labs.mosaic.expressions.raster.base.RasterArray1ArgExpression +import com.databricks.labs.mosaic.functions.MosaicExpressionConfig +import com.databricks.labs.mosaic.utils.PathUtils +import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.{Expression, NullIntolerant} +import org.apache.spark.unsafe.types.UTF8String + +/** The expression for performing map algebra on rasters. */ +case class RST_MapAlgebra( + rastersExpr: Expression, + jsonSpecExpr: Expression, + expressionConfig: MosaicExpressionConfig +) extends RasterArray1ArgExpression[RST_MapAlgebra]( + rastersExpr, + jsonSpecExpr, + RasterTileType(expressionConfig.getCellIdType), + returnsRaster = true, + expressionConfig = expressionConfig + ) + with NullIntolerant + with CodegenFallback { + + /** + * Performs map algebra on the rasters. + * @param tiles + * The rasters to be used.
+ * @param arg1 + * The red band index. + * @return + * The raster contains NDVI index. + */ + override def rasterTransform(tiles: Seq[MosaicRasterTile], arg1: Any): Any = { + val jsonSpec = arg1.asInstanceOf[UTF8String].toString + val extension = GDAL.getExtension(tiles.head.getDriver) + val resultPath = PathUtils.createTmpFilePath(extension) + val command = parseSpec(jsonSpec, resultPath, tiles) + val result = GDALCalc.executeCalc(command, resultPath) + val index = if (tiles.map(_.getIndex).groupBy(identity).size == 1) tiles.head.getIndex else null + MosaicRasterTile( + index, + result, + resultPath, + tiles.head.getDriver + ) + } + + def parseSpec(jsonSpec: String, resultPath: String, tiles: Seq[MosaicRasterTile]): String = { + import org.json4s._ + import org.json4s.jackson.JsonMethods._ + implicit val formats: DefaultFormats.type = org.json4s.DefaultFormats + + val AZRasters = ('A' to 'Z').toList.map(l => s"${l}_index") + val AZBands = ('A' to 'Z').toList.map(l => s"${l}_band") + val json = parse(jsonSpec) + + val namedRasters = AZRasters + .map(raster => (raster, (json \ raster).toOption)) + .filter(_._2.isDefined) + .map(raster => (raster._1, raster._2.get.extract[Int])) + .map { case (raster, index) => (raster, tiles(index).getRaster.getPath) } + + val paramRasters = (if (namedRasters.isEmpty) { + tiles.zipWithIndex.map { case (tile, index) => (s"${('A' + index).toChar}", tile.getRaster.getPath) } + } else { + namedRasters + }) + .map(raster => s" -${raster._1.split("_").head} ${raster._2}") + .mkString + + val namedBands = AZBands + .map(band => (band, (json \ band).toOption)) + .filter(_._2.isDefined) + .map(band => (band._1, band._2.get.extract[Int])) + .map(band => s" --${band._1}=${band._2}") + .mkString + + val calc = (json \ "calc").toOption + .map(_.extract[String]) + .getOrElse( + throw new IllegalArgumentException("Calc parameter is required") + ) + val extraOptions = (json \ "extra_options").toOption.map(_.extract[String]).getOrElse("") + + 
"gdal_calc" + paramRasters + + namedBands + + s" --outfile=$resultPath" + + s" --calc=$calc" + s" $extraOptions" + } + +} + +/** Expression info required for the expression registration for spark SQL. */ +object RST_MapAlgebra extends WithExpressionInfo { + + override def name: String = "rst_mapalgebra" + + override def usage: String = + """ + |_FUNC_(expr1, expr2) - Performs map algebra on the rasters. + |""".stripMargin + + override def example: String = + """ + | Examples: + | > SELECT _FUNC_(raster_tiles, "{calc: 'A+B', A_index: 0, B_index: 1}"); + | {index_id, raster, parent_path, driver} + | ... + | """.stripMargin + + override def builder(expressionConfig: MosaicExpressionConfig): FunctionBuilder = { + GenericExpressionFactory.getBaseBuilder[RST_MapAlgebra](2, expressionConfig) + } + +} diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_MemSize.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_MemSize.scala index eeffa8814..804c4f195 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_MemSize.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_MemSize.scala @@ -16,7 +16,7 @@ case class RST_MemSize(raster: Expression, expressionConfig: MosaicExpressionCon with CodegenFallback { /** Returns the memory size of the raster in bytes. */ - override def rasterTransform(tile: => MosaicRasterTile): Any = tile.getRaster.getMemSize + override def rasterTransform(tile: MosaicRasterTile): Any = tile.getRaster.getMemSize } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Merge.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Merge.scala index 54870aa65..1dd52295e 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Merge.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Merge.scala @@ -30,15 +30,10 @@ case class RST_Merge( * @return * The merged raster. 
*/ - override def rasterTransform(tiles: => Seq[MosaicRasterTile]): Any = { + override def rasterTransform(tiles: Seq[MosaicRasterTile]): Any = { val index = if (tiles.map(_.getIndex).groupBy(identity).size == 1) tiles.head.getIndex else null val raster = MergeRasters.merge(tiles.map(_.getRaster)) - new MosaicRasterTile( - index, - raster, - tiles.head.getParentPath, - tiles.head.getDriver - ) + tiles.head.copy(raster = raster, index = index) } } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_MergeAgg.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_MergeAgg.scala index 552feb0b5..3b59618b9 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_MergeAgg.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_MergeAgg.scala @@ -79,7 +79,7 @@ case class RST_MergeAgg( val parentPath = tiles.head.getParentPath val driver = tiles.head.getDriver - val result = new MosaicRasterTile(idx, merged, parentPath, driver) + val result = MosaicRasterTile(idx, merged, parentPath, driver) .formatCellId(IndexSystemFactory.getIndexSystem(expressionConfig.getIndexSystem)) .serialize(BinaryType, expressionConfig.getRasterCheckpoint) diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_MetaData.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_MetaData.scala index 1e62808f7..8a96ff0d1 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_MetaData.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_MetaData.scala @@ -16,7 +16,7 @@ case class RST_MetaData(raster: Expression, expressionConfig: MosaicExpressionCo with CodegenFallback { /** Returns the metadata of the raster. 
*/ - override def rasterTransform(tile: => MosaicRasterTile): Any = buildMapString(tile.getRaster.metadata) + override def rasterTransform(tile: MosaicRasterTile): Any = buildMapString(tile.getRaster.metadata) } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_NDVI.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_NDVI.scala index 636b079ac..0c1b3be38 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_NDVI.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_NDVI.scala @@ -38,11 +38,11 @@ case class RST_NDVI( * @return * The raster contains NDVI index. */ - override def rasterTransform(tile: => MosaicRasterTile, arg1: Any, arg2: Any): Any = { + override def rasterTransform(tile: MosaicRasterTile, arg1: Any, arg2: Any): Any = { val redInd = arg1.asInstanceOf[Int] val nirInd = arg2.asInstanceOf[Int] val result = NDVI.compute(tile.getRaster, redInd, nirInd) - new MosaicRasterTile(tile.getIndex, result, tile.getParentPath, tile.getDriver) + tile.copy(raster = result) } } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_NumBands.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_NumBands.scala index b4694821d..f5dd09551 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_NumBands.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_NumBands.scala @@ -16,7 +16,7 @@ case class RST_NumBands(raster: Expression, expressionConfig: MosaicExpressionCo with CodegenFallback { /** Returns the number of bands in the raster. 
*/ - override def rasterTransform(tile: => MosaicRasterTile): Any = tile.getRaster.numBands + override def rasterTransform(tile: MosaicRasterTile): Any = tile.getRaster.numBands } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_PixelHeight.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_PixelHeight.scala index 1aad1085c..c7cd790c6 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_PixelHeight.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_PixelHeight.scala @@ -17,7 +17,7 @@ case class RST_PixelHeight(raster: Expression, expressionConfig: MosaicExpressio with CodegenFallback { /** Returns the pixel height of the raster. */ - override def rasterTransform(tile: => MosaicRasterTile): Any = { + override def rasterTransform(tile: MosaicRasterTile): Any = { val raster = tile.getRaster val scaleY = raster.getRaster.GetGeoTransform()(5) val skewX = raster.getRaster.GetGeoTransform()(2) diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_PixelWidth.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_PixelWidth.scala index b623c303e..4a5f37916 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_PixelWidth.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_PixelWidth.scala @@ -17,7 +17,7 @@ case class RST_PixelWidth(raster: Expression, expressionConfig: MosaicExpression with CodegenFallback { /** Returns the pixel width of the raster. 
*/ - override def rasterTransform(tile: => MosaicRasterTile): Any = { + override def rasterTransform(tile: MosaicRasterTile): Any = { val raster = tile.getRaster val scaleX = raster.getRaster.GetGeoTransform()(1) val skewY = raster.getRaster.GetGeoTransform()(4) diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_RasterToWorldCoord.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_RasterToWorldCoord.scala index 11e734a67..09bbb8b77 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_RasterToWorldCoord.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_RasterToWorldCoord.scala @@ -27,7 +27,7 @@ case class RST_RasterToWorldCoord( * GeoTransform. This ensures the projection of the raster is respected. * The output is a WKT point. */ - override def rasterTransform(tile: => MosaicRasterTile, arg1: Any, arg2: Any): Any = { + override def rasterTransform(tile: MosaicRasterTile, arg1: Any, arg2: Any): Any = { val x = arg1.asInstanceOf[Int] val y = arg2.asInstanceOf[Int] val gt = tile.getRaster.getRaster.GetGeoTransform() diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_RasterToWorldCoordX.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_RasterToWorldCoordX.scala index 0ef8a7def..3f2158500 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_RasterToWorldCoordX.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_RasterToWorldCoordX.scala @@ -25,7 +25,7 @@ case class RST_RasterToWorldCoordX( * Returns the world coordinates of the raster x pixel by applying * GeoTransform. This ensures the projection of the raster is respected. 
*/ - override def rasterTransform(tile: => MosaicRasterTile, arg1: Any, arg2: Any): Any = { + override def rasterTransform(tile: MosaicRasterTile, arg1: Any, arg2: Any): Any = { val x = arg1.asInstanceOf[Int] val y = arg2.asInstanceOf[Int] val gt = tile.getRaster.getRaster.GetGeoTransform() diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_RasterToWorldCoordY.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_RasterToWorldCoordY.scala index 2e6703b3c..15e7cf87d 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_RasterToWorldCoordY.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_RasterToWorldCoordY.scala @@ -25,7 +25,7 @@ case class RST_RasterToWorldCoordY( * Returns the world coordinates of the raster y pixel by applying * GeoTransform. This ensures the projection of the raster is respected. */ - override def rasterTransform(tile: => MosaicRasterTile, arg1: Any, arg2: Any): Any = { + override def rasterTransform(tile: MosaicRasterTile, arg1: Any, arg2: Any): Any = { val x = arg1.asInstanceOf[Int] val y = arg2.asInstanceOf[Int] val gt = tile.getRaster.getRaster.GetGeoTransform() diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ReTile.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ReTile.scala index 1d5fdedec..4465866dc 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ReTile.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ReTile.scala @@ -26,7 +26,7 @@ case class RST_ReTile( * Returns a set of new rasters with the specified tile size (tileWidth x * tileHeight). 
*/ - override def rasterGenerator(tile: => MosaicRasterTile): Seq[MosaicRasterTile] = { + override def rasterGenerator(tile: MosaicRasterTile): Seq[MosaicRasterTile] = { val tileWidthValue = tileWidthExpr.eval().asInstanceOf[Int] val tileHeightValue = tileHeightExpr.eval().asInstanceOf[Int] ReTile.reTile(tile, tileWidthValue, tileHeightValue) diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Rotation.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Rotation.scala index b54506882..c3cd097c7 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Rotation.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Rotation.scala @@ -16,7 +16,7 @@ case class RST_Rotation(raster: Expression, expressionConfig: MosaicExpressionCo with CodegenFallback { /** Returns the rotation angle of the raster. */ - override def rasterTransform(tile: => MosaicRasterTile): Any = { + override def rasterTransform(tile: MosaicRasterTile): Any = { val gt = tile.getRaster.getRaster.GetGeoTransform() // arctan of y_skew and x_scale math.atan(gt(4) / gt(1)) diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SRID.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SRID.scala index 293227d37..c8bce06b7 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SRID.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SRID.scala @@ -19,7 +19,7 @@ case class RST_SRID(raster: Expression, expressionConfig: MosaicExpressionConfig with CodegenFallback { /** Returns the SRID of the raster. 
*/ - override def rasterTransform(tile: => MosaicRasterTile): Any = { + override def rasterTransform(tile: MosaicRasterTile): Any = { // Reference: https://gis.stackexchange.com/questions/267321/extracting-epsg-from-a-raster-using-gdal-bindings-in-python val proj = new SpatialReference(tile.getRaster.getRaster.GetProjection()) Try(proj.AutoIdentifyEPSG()) diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ScaleX.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ScaleX.scala index 4deaca6fd..c16891871 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ScaleX.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ScaleX.scala @@ -16,7 +16,7 @@ case class RST_ScaleX(raster: Expression, expressionConfig: MosaicExpressionConf with CodegenFallback { /** Returns the scale x of the raster. */ - override def rasterTransform(tile: => MosaicRasterTile): Any = { + override def rasterTransform(tile: MosaicRasterTile): Any = { tile.getRaster.getRaster.GetGeoTransform()(1) } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ScaleY.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ScaleY.scala index 5875bbf7a..3b0779763 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ScaleY.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ScaleY.scala @@ -16,7 +16,7 @@ case class RST_ScaleY(raster: Expression, expressionConfig: MosaicExpressionConf with CodegenFallback { /** Returns the scale y of the raster. 
*/ - override def rasterTransform(tile: => MosaicRasterTile): Any = { + override def rasterTransform(tile: MosaicRasterTile): Any = { tile.getRaster.getRaster.GetGeoTransform()(5) } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SetNoData.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SetNoData.scala index 268091a10..a089ba4f6 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SetNoData.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SetNoData.scala @@ -11,6 +11,7 @@ import com.databricks.labs.mosaic.utils.PathUtils import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.expressions.{Expression, NullIntolerant} +import org.apache.spark.sql.catalyst.util.ArrayData /** Returns a raster with the specified no data values. */ case class RST_SetNoData( @@ -36,26 +37,22 @@ case class RST_SetNoData( * @return * The raster with the specified no data values. 
*/ - override def rasterTransform(tile: => MosaicRasterTile, arg1: Any): Any = { + override def rasterTransform(tile: MosaicRasterTile, arg1: Any): Any = { val noDataValues = tile.getRaster.getBands.map(_.noDataValue).mkString(" ") val dstNoDataValues = (arg1 match { - case doubles: Array[Double] => doubles case d: Double => Array.fill[Double](tile.getRaster.numBands)(d) - case _ => throw new IllegalArgumentException("No data values must be an array of doubles or a double") + case i: Int => Array.fill[Double](tile.getRaster.numBands)(i.toDouble) + case l: Long => Array.fill[Double](tile.getRaster.numBands)(l.toDouble) + case arrayData: ArrayData => arrayData.array.map(_.toString.toDouble) // Trick to convert SQL decimal to double + case _ => throw new IllegalArgumentException("No data values must be an array of numerical or a numerical value.") }).mkString(" ") - val resultPath = PathUtils.createTmpFilePath(tile.getRaster.uuid.toString, GDAL.getExtension(tile.getDriver)) + val resultPath = PathUtils.createTmpFilePath(GDAL.getExtension(tile.getDriver)) val result = GDALWarp.executeWarp( resultPath, - isTemp = true, Seq(tile.getRaster), command = s"""gdalwarp -of ${tile.getDriver} -dstnodata "$dstNoDataValues" -srcnodata "$noDataValues"""" ) - new MosaicRasterTile( - tile.getIndex, - result, - tile.getParentPath, - tile.getDriver - ) + tile.copy(raster = result) } } @@ -63,7 +60,7 @@ case class RST_SetNoData( /** Expression info required for the expression registration for spark SQL. 
*/ object RST_SetNoData extends WithExpressionInfo { - override def name: String = "rst_set_no_data" + override def name: String = "rst_setnodata" override def usage: String = """ diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SkewX.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SkewX.scala index 697b758da..ee3d0c4dd 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SkewX.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SkewX.scala @@ -16,7 +16,7 @@ case class RST_SkewX(raster: Expression, expressionConfig: MosaicExpressionConfi with CodegenFallback { /** Returns the skew x of the raster. */ - override def rasterTransform(tile: => MosaicRasterTile): Any = { + override def rasterTransform(tile: MosaicRasterTile): Any = { tile.getRaster.getRaster.GetGeoTransform()(2) } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SkewY.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SkewY.scala index 1fe4893c5..ff9903687 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SkewY.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_SkewY.scala @@ -16,7 +16,7 @@ case class RST_SkewY(raster: Expression, expressionConfig: MosaicExpressionConfi with CodegenFallback { /** Returns the skew y of the raster. 
*/ - override def rasterTransform(tile: => MosaicRasterTile): Any = { + override def rasterTransform(tile: MosaicRasterTile): Any = { tile.getRaster.getRaster.GetGeoTransform()(4) } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Subdatasets.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Subdatasets.scala index c90a967c3..8c58e7f74 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Subdatasets.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Subdatasets.scala @@ -21,7 +21,7 @@ case class RST_Subdatasets(raster: Expression, expressionConfig: MosaicExpressio with CodegenFallback { /** Returns the subdatasets of the raster. */ - override def rasterTransform(tile: => MosaicRasterTile): Any = buildMapString(tile.getRaster.subdatasets) + override def rasterTransform(tile: MosaicRasterTile): Any = buildMapString(tile.getRaster.subdatasets) } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Subdivide.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Subdivide.scala index f0756d6fc..9692ccf2d 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Subdivide.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Subdivide.scala @@ -19,7 +19,7 @@ case class RST_Subdivide( with CodegenFallback { /** Returns a set of new rasters with the specified tile size (In MB). 
*/ - override def rasterGenerator(tile: => MosaicRasterTile): Seq[MosaicRasterTile] = { + override def rasterGenerator(tile: MosaicRasterTile): Seq[MosaicRasterTile] = { val targetSize = sizeInMB.eval().asInstanceOf[Int] BalancedSubdivision.splitRaster(tile, targetSize) } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Summary.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Summary.scala index bcc296afa..ea75617c1 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Summary.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Summary.scala @@ -21,7 +21,7 @@ case class RST_Summary(raster: Expression, expressionConfig: MosaicExpressionCon with CodegenFallback { /** Returns the summary info the raster. */ - override def rasterTransform(tile: => MosaicRasterTile): Any = { + override def rasterTransform(tile: MosaicRasterTile): Any = { val raster = tile.getRaster val vector = new JVector[String]() // For other flags check the way gdalinfo.py script is called, InfoOptions expects a collection of same flags. diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Tessellate.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Tessellate.scala index bb22cdc5b..e2fa3cd22 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Tessellate.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Tessellate.scala @@ -25,7 +25,7 @@ case class RST_Tessellate( * Returns a set of new rasters which are the result of the tessellation of * the input raster. 
*/ - override def rasterGenerator(tile: => MosaicRasterTile, resolution: Int): Seq[MosaicRasterTile] = { + override def rasterGenerator(tile: MosaicRasterTile, resolution: Int): Seq[MosaicRasterTile] = { RasterTessellate.tessellate( tile.getRaster, resolution, diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ToOverlappingTiles.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ToOverlappingTiles.scala index 287d2389d..5866e00aa 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ToOverlappingTiles.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ToOverlappingTiles.scala @@ -27,7 +27,7 @@ case class RST_ToOverlappingTiles( * Returns a set of new rasters which are the result of a rolling window * over the input raster. */ - override def rasterGenerator(tile: => MosaicRasterTile): Seq[MosaicRasterTile] = { + override def rasterGenerator(tile: MosaicRasterTile): Seq[MosaicRasterTile] = { val tileWidthValue = tileWidthExpr.eval().asInstanceOf[Int] val tileHeightValue = tileHeightExpr.eval().asInstanceOf[Int] val overlapValue = overlapExpr.eval().asInstanceOf[Int] diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_TryOpen.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_TryOpen.scala index f526cdb2a..b364d39da 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_TryOpen.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_TryOpen.scala @@ -16,7 +16,7 @@ case class RST_TryOpen(raster: Expression, expressionConfig: MosaicExpressionCon with CodegenFallback { /** Returns true if the raster can be opened. 
*/ - override def rasterTransform(tile: => MosaicRasterTile): Any = { + override def rasterTransform(tile: MosaicRasterTile): Any = { Option(tile.getRaster.getRaster).isDefined } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_UpperLeftX.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_UpperLeftX.scala index 7a53e488a..4f050bc7e 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_UpperLeftX.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_UpperLeftX.scala @@ -16,7 +16,7 @@ case class RST_UpperLeftX(raster: Expression, expressionConfig: MosaicExpression with CodegenFallback { /** Returns the upper left x of the raster. */ - override def rasterTransform(tile: => MosaicRasterTile): Any = { + override def rasterTransform(tile: MosaicRasterTile): Any = { tile.getRaster.getRaster.GetGeoTransform()(0) } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_UpperLeftY.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_UpperLeftY.scala index 8e6525bab..0e052e3ae 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_UpperLeftY.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_UpperLeftY.scala @@ -16,7 +16,7 @@ case class RST_UpperLeftY(raster: Expression, expressionConfig: MosaicExpression with CodegenFallback { /** Returns the upper left y of the raster. 
*/ - override def rasterTransform(tile: => MosaicRasterTile): Any = { + override def rasterTransform(tile: MosaicRasterTile): Any = { tile.getRaster.getRaster.GetGeoTransform()(3) } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Width.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Width.scala index a8a9a280d..4bd56686a 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Width.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Width.scala @@ -16,7 +16,7 @@ case class RST_Width(raster: Expression, expressionConfig: MosaicExpressionConfi with CodegenFallback { /** Returns the width of the raster. */ - override def rasterTransform(tile: => MosaicRasterTile): Any = tile.getRaster.xSize + override def rasterTransform(tile: MosaicRasterTile): Any = tile.getRaster.xSize } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoord.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoord.scala index 6f1774bf3..e5cf95180 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoord.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoord.scala @@ -25,7 +25,7 @@ case class RST_WorldToRasterCoord( * Returns the x and y of the raster by applying GeoTransform as a tuple of * Integers. This will ensure projection of the raster is respected. 
*/ - override def rasterTransform(tile: => MosaicRasterTile, arg1: Any, arg2: Any): Any = { + override def rasterTransform(tile: MosaicRasterTile, arg1: Any, arg2: Any): Any = { val xGeo = arg1.asInstanceOf[Double] val yGeo = arg2.asInstanceOf[Double] val gt = tile.getRaster.getRaster.GetGeoTransform() diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoordX.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoordX.scala index 7f6c3d65d..2b6f6aa0c 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoordX.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoordX.scala @@ -25,7 +25,7 @@ case class RST_WorldToRasterCoordX( * Returns the x coordinate of the raster by applying GeoTransform. This * will ensure projection of the raster is respected. */ - override def rasterTransform(tile: => MosaicRasterTile, arg1: Any, arg2: Any): Any = { + override def rasterTransform(tile: MosaicRasterTile, arg1: Any, arg2: Any): Any = { val xGeo = arg1.asInstanceOf[Double] val gt = tile.getRaster.getRaster.GetGeoTransform() GDAL.fromWorldCoord(gt, xGeo, 0)._1 diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoordY.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoordY.scala index 16b2f2831..23540c7c7 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoordY.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoordY.scala @@ -25,7 +25,7 @@ case class RST_WorldToRasterCoordY( * Returns the y coordinate of the raster by applying GeoTransform. This * will ensure projection of the raster is respected. 
*/ - override def rasterTransform(tile: => MosaicRasterTile, arg1: Any, arg2: Any): Any = { + override def rasterTransform(tile: MosaicRasterTile, arg1: Any, arg2: Any): Any = { val xGeo = arg1.asInstanceOf[Double] val gt = tile.getRaster.getRaster.GetGeoTransform() GDAL.fromWorldCoord(gt, xGeo, 0)._2 diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/Raster1ArgExpression.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/Raster1ArgExpression.scala index df8bd761e..082a98d88 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/Raster1ArgExpression.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/Raster1ArgExpression.scala @@ -59,7 +59,7 @@ abstract class Raster1ArgExpression[T <: Expression: ClassTag]( * @return * A result of the expression. */ - def rasterTransform(raster: => MosaicRasterTile, arg1: Any): Any + def rasterTransform(raster: MosaicRasterTile, arg1: Any): Any /** * Evaluation of the expression. 
It evaluates the raster path and the loads diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/Raster2ArgExpression.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/Raster2ArgExpression.scala index 1db525c82..5c01a88e2 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/Raster2ArgExpression.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/Raster2ArgExpression.scala @@ -1,7 +1,6 @@ package com.databricks.labs.mosaic.expressions.raster.base import com.databricks.labs.mosaic.core.raster.api.GDAL -import com.databricks.labs.mosaic.core.raster.gdal.MosaicRasterGDAL import com.databricks.labs.mosaic.core.raster.io.RasterCleaner import com.databricks.labs.mosaic.core.types.model.MosaicRasterTile import com.databricks.labs.mosaic.expressions.base.GenericExpressionFactory @@ -66,7 +65,7 @@ abstract class Raster2ArgExpression[T <: Expression: ClassTag]( * @return * A result of the expression. */ - def rasterTransform(raster: => MosaicRasterTile, arg1: Any, arg2: Any): Any + def rasterTransform(raster: MosaicRasterTile, arg1: Any, arg2: Any): Any /** * Evaluation of the expression. 
It evaluates the raster path and the loads diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterArray1ArgExpression.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterArray1ArgExpression.scala new file mode 100644 index 000000000..2f06f8cc1 --- /dev/null +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterArray1ArgExpression.scala @@ -0,0 +1,89 @@ +package com.databricks.labs.mosaic.expressions.raster.base + +import com.databricks.labs.mosaic.core.raster.api.GDAL +import com.databricks.labs.mosaic.core.raster.io.RasterCleaner +import com.databricks.labs.mosaic.core.types.model.MosaicRasterTile +import com.databricks.labs.mosaic.expressions.base.GenericExpressionFactory +import com.databricks.labs.mosaic.functions.MosaicExpressionConfig +import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, Expression, NullIntolerant} +import org.apache.spark.sql.types.{ArrayType, DataType} + +import scala.reflect.ClassTag + +/** + * Base class for all raster array expressions that take one argument. It provides + * the boilerplate code needed to create a function builder for a given + * expression. It minimises amount of code needed to create a new expression. + * + * @param rastersExpr + * The rasters expression. It is an array column containing rasters as either + * paths or as content byte arrays. + * @param outputType + * The output type of the result. + * @param expressionConfig + * Additional arguments for the expression (expressionConfigs). + * @tparam T + * The type of the extending class.
+ */ +abstract class RasterArray1ArgExpression[T <: Expression: ClassTag]( + rastersExpr: Expression, + arg1Expr: Expression, + outputType: DataType, + returnsRaster: Boolean, + expressionConfig: MosaicExpressionConfig +) extends BinaryExpression + with NullIntolerant + with Serializable + with RasterExpressionSerialization { + + GDAL.enable() + + /** Output Data Type */ + override def dataType: DataType = if (returnsRaster) rastersExpr.dataType.asInstanceOf[ArrayType].elementType else outputType + + override def left: Expression = rastersExpr + + override def right: Expression = arg1Expr + + /** + * The function to be overridden by the extending class. It is called when + * the expression is evaluated. It provides the rasters to the expression. + * It abstracts spark serialization from the caller. + * @param rasters + * The sequence of rasters to be used. + * @param arg1 + * The first argument to the expression. + * @return + * A result of the expression. + */ + def rasterTransform(rasters: Seq[MosaicRasterTile], arg1: Any): Any + + /** + * Evaluation of the expression. It evaluates the raster path and then loads + * the raster from the path. It handles the clean up of the raster before + * returning the results. + * @param input + * The InternalRow of the expression. It contains an array containing + * raster tiles. It may be used for other argument expressions so it is + * passed to rasterTransform. + * + * @return + * The result of the expression.
+ */ + override def nullSafeEval(input: Any, arg1: Any): Any = { + GDAL.enable() + val tiles = RasterArrayUtils.getTiles(input, rastersExpr, expressionConfig) + val result = rasterTransform(tiles, arg1) + val serialized = serialize(result, returnsRaster, dataType, expressionConfig) + tiles.foreach(t => RasterCleaner.dispose(t)) + serialized + } + + override def makeCopy(newArgs: Array[AnyRef]): Expression = GenericExpressionFactory.makeCopyImpl[T](this, newArgs, 2, expressionConfig) + + override def withNewChildrenInternal( + newFirst: Expression, + newSecond: Expression + ): Expression = makeCopy(Array(newFirst, newSecond)) + +} diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterArray2ArgExpression.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterArray2ArgExpression.scala new file mode 100644 index 000000000..1e7fb60a6 --- /dev/null +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterArray2ArgExpression.scala @@ -0,0 +1,95 @@ +package com.databricks.labs.mosaic.expressions.raster.base + +import com.databricks.labs.mosaic.core.raster.api.GDAL +import com.databricks.labs.mosaic.core.raster.io.RasterCleaner +import com.databricks.labs.mosaic.core.types.model.MosaicRasterTile +import com.databricks.labs.mosaic.expressions.base.GenericExpressionFactory +import com.databricks.labs.mosaic.functions.MosaicExpressionConfig +import org.apache.spark.sql.catalyst.expressions.{Expression, NullIntolerant, TernaryExpression} +import org.apache.spark.sql.types.{ArrayType, DataType} + +import scala.reflect.ClassTag + +/** + * Base class for all raster expressions that take two arguments. It provides + * the boilerplate code needed to create a function builder for a given + * expression. It minimises amount of code needed to create a new expression. + * + * @param rastersExpr + * The rasters expression. It is an array column containing rasters as either + * paths or as content byte arrays. 
+ * @param outputType + * The output type of the result. + * @param expressionConfig + * Additional arguments for the expression (expressionConfigs). + * @tparam T + * The type of the extending class. + */ +abstract class RasterArray2ArgExpression[T <: Expression: ClassTag]( + rastersExpr: Expression, + arg1Expr: Expression, + arg2Expr: Expression, + outputType: DataType, + returnsRaster: Boolean, + expressionConfig: MosaicExpressionConfig +) extends TernaryExpression + with NullIntolerant + with Serializable + with RasterExpressionSerialization { + + GDAL.enable() + + /** Output Data Type */ + override def dataType: DataType = if (returnsRaster) rastersExpr.dataType.asInstanceOf[ArrayType].elementType else outputType + + override def first: Expression = rastersExpr + + override def second: Expression = arg1Expr + + override def third: Expression = arg2Expr + + /** + * The function to be overridden by the extending class. It is called when + * the expression is evaluated. It provides the rasters to the expression. + * It abstracts spark serialization from the caller. + * @param rasters + * The sequence of rasters to be used. + * @param arg1 + * The first argument to the expression. + * @param arg2 + * The second argument to the expression. + * @return + * A result of the expression. + */ + def rasterTransform(rasters: Seq[MosaicRasterTile], arg1: Any, arg2: Any): Any + + /** + * Evaluation of the expression. It evaluates the raster path and then loads + * the raster from the path. It handles the clean up of the raster before + * returning the results. + * @param input + * The InternalRow of the expression. It contains an array containing + * raster tiles. It may be used for other argument expressions so it is + * passed to rasterTransform. + * + * @return + * The result of the expression.
+ */ + override def nullSafeEval(input: Any, arg1: Any, arg2: Any): Any = { + GDAL.enable() + val tiles = RasterArrayUtils.getTiles(input, rastersExpr, expressionConfig) + val result = rasterTransform(tiles, arg1, arg2) + val serialized = serialize(result, returnsRaster, dataType, expressionConfig) + tiles.foreach(t => RasterCleaner.dispose(t)) + serialized + } + + override def makeCopy(newArgs: Array[AnyRef]): Expression = GenericExpressionFactory.makeCopyImpl[T](this, newArgs, 3, expressionConfig) + + override def withNewChildrenInternal( + newFirst: Expression, + newSecond: Expression, + newThird: Expression + ): Expression = makeCopy(Array(newFirst, newSecond, newThird)) + +} diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterArrayExpression.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterArrayExpression.scala index 928b994b6..8daa0678b 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterArrayExpression.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterArrayExpression.scala @@ -5,9 +5,7 @@ import com.databricks.labs.mosaic.core.raster.io.RasterCleaner import com.databricks.labs.mosaic.core.types.model.MosaicRasterTile import com.databricks.labs.mosaic.expressions.base.GenericExpressionFactory import com.databricks.labs.mosaic.functions.MosaicExpressionConfig -import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Expression, NullIntolerant, UnaryExpression} -import org.apache.spark.sql.catalyst.util.ArrayData import org.apache.spark.sql.types.{ArrayType, DataType} import scala.reflect.ClassTag @@ -53,30 +51,23 @@ abstract class RasterArrayExpression[T <: Expression: ClassTag]( * @return * A result of the expression. */ - def rasterTransform(rasters: => Seq[MosaicRasterTile]): Any + def rasterTransform(rasters: Seq[MosaicRasterTile]): Any /** * Evaluation of the expression. 
It evaluates the raster path and the loads * the raster from the path. It handles the clean up of the raster before * returning the results. * @param input - * The input to the expression. It is an array containing paths to raster - * files or byte arrays containing the raster files contents. + * The InternalRow of the expression. It contains an array containing + * raster tiles. It may be used for other argument expressions so it is + * passed to rasterTransform. * * @return * The result of the expression. */ override def nullSafeEval(input: Any): Any = { GDAL.enable() - val rasterDT = rastersExpr.dataType.asInstanceOf[ArrayType].elementType - val arrayData = input.asInstanceOf[ArrayData] - val n = arrayData.numElements() - val tiles = (0 until n) - .map(i => - MosaicRasterTile - .deserialize(arrayData.get(i, rasterDT).asInstanceOf[InternalRow], expressionConfig.getCellIdType) - ) - + val tiles = RasterArrayUtils.getTiles(input, rastersExpr, expressionConfig) val result = rasterTransform(tiles) val serialized = serialize(result, returnsRaster, dataType, expressionConfig) tiles.foreach(t => RasterCleaner.dispose(t)) diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterArrayUtils.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterArrayUtils.scala new file mode 100644 index 000000000..0dd19346c --- /dev/null +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterArrayUtils.scala @@ -0,0 +1,24 @@ +package com.databricks.labs.mosaic.expressions.raster.base + +import com.databricks.labs.mosaic.core.types.model.MosaicRasterTile +import com.databricks.labs.mosaic.functions.MosaicExpressionConfig +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.util.ArrayData +import org.apache.spark.sql.types.ArrayType + +object RasterArrayUtils { + + def getTiles(input: Any, rastersExpr: Expression, 
expressionConfig: MosaicExpressionConfig): Seq[MosaicRasterTile] = { + val rasterDT = rastersExpr.dataType.asInstanceOf[ArrayType].elementType + val arrayData = input.asInstanceOf[ArrayData] + val n = arrayData.numElements() + val tiles = (0 until n) + .map(i => + MosaicRasterTile + .deserialize(arrayData.get(i, rasterDT).asInstanceOf[InternalRow], expressionConfig.getCellIdType) + ) + tiles + } + +} diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterBandExpression.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterBandExpression.scala index 1efcfb553..008f0df09 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterBandExpression.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterBandExpression.scala @@ -61,7 +61,7 @@ abstract class RasterBandExpression[T <: Expression: ClassTag]( * @return * The result of the expression. */ - def bandTransform(raster: => MosaicRasterTile, band: MosaicRasterBandGDAL): Any + def bandTransform(raster: MosaicRasterTile, band: MosaicRasterBandGDAL): Any /** * Evaluation of the expression. It evaluates the raster path and the loads diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterExpression.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterExpression.scala index 2207424a5..31751d0e1 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterExpression.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterExpression.scala @@ -61,7 +61,7 @@ abstract class RasterExpression[T <: Expression: ClassTag]( * @return * The result of the expression. */ - def rasterTransform(raster: => MosaicRasterTile): Any + def rasterTransform(raster: MosaicRasterTile): Any /** * Evaluation of the expression. 
It evaluates the raster path and the loads diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterExpressionSerialization.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterExpressionSerialization.scala index 0087314f9..a9bf17917 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterExpressionSerialization.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterExpressionSerialization.scala @@ -28,7 +28,7 @@ trait RasterExpressionSerialization { * The serialized result of the expression. */ def serialize( - data: => Any, + data: Any, returnsRaster: Boolean, outputDataType: DataType, expressionConfig: MosaicExpressionConfig diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterGeneratorExpression.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterGeneratorExpression.scala index 8cb68c49d..4ad7d126e 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterGeneratorExpression.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterGeneratorExpression.scala @@ -70,7 +70,7 @@ abstract class RasterGeneratorExpression[T <: Expression: ClassTag]( * @return * Sequence of generated new rasters to be written. 
*/ - def rasterGenerator(raster: => MosaicRasterTile): Seq[MosaicRasterTile] + def rasterGenerator(raster: MosaicRasterTile): Seq[MosaicRasterTile] override def eval(input: InternalRow): TraversableOnce[InternalRow] = { GDAL.enable() diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterGridExpression.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterGridExpression.scala index b1e83de1b..26fcf0aa2 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterGridExpression.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterGridExpression.scala @@ -55,7 +55,7 @@ trait RasterGridExpression { * band. */ def griddedPixels( - raster: => MosaicRasterGDAL, + raster: MosaicRasterGDAL, indexSystem: IndexSystem, resolution: Int ): Seq[Map[Long, Seq[Double]]] = { diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterTessellateGeneratorExpression.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterTessellateGeneratorExpression.scala index 8e02543c6..a31f001cd 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterTessellateGeneratorExpression.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterTessellateGeneratorExpression.scala @@ -68,7 +68,7 @@ abstract class RasterTessellateGeneratorExpression[T <: Expression: ClassTag]( * @return * Sequence of generated new rasters to be written. 
*/ - def rasterGenerator(raster: => MosaicRasterTile, resolution: Int): Seq[MosaicRasterTile] + def rasterGenerator(raster: MosaicRasterTile, resolution: Int): Seq[MosaicRasterTile] override def eval(input: InternalRow): TraversableOnce[InternalRow] = { GDAL.enable() diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterToGridExpression.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterToGridExpression.scala index 4d39c35cb..98b9bafc6 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterToGridExpression.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/base/RasterToGridExpression.scala @@ -56,7 +56,7 @@ abstract class RasterToGridExpression[T <: Expression: ClassTag, P]( * @return * Sequence of (cellId, measure) of each band of the raster. */ - override def rasterTransform(tile: => MosaicRasterTile, arg1: Any): Any = { + override def rasterTransform(tile: MosaicRasterTile, arg1: Any): Any = { GDAL.enable() val resolution = arg1.asInstanceOf[Int] val transformed = griddedPixels(tile.getRaster, indexSystem, resolution) diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/util/OGRReadeWithOffset.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/util/OGRReadeWithOffset.scala index ebb3f1c99..99494ca96 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/util/OGRReadeWithOffset.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/util/OGRReadeWithOffset.scala @@ -21,7 +21,6 @@ case class OGRReadeWithOffset(pathExpr: Expression, chunkIndexExpr: Expression, val layerNumber: Int = config("layerNumber").toInt val layerName: String = config("layerName") val chunkSize: Int = config("chunkSize").toInt - val vsizip: Boolean = config("vsizip").toBoolean val asWKB: Boolean = config("asWKB").toBoolean override def collectionType: DataType = schema @@ -35,7 +34,7 @@ case class OGRReadeWithOffset(pathExpr: 
Expression, chunkIndexExpr: Expression, val chunkIndex = chunkIndexExpr.eval(input).asInstanceOf[Int] OGRFileFormat.enableOGRDrivers() - val ds = OGRFileFormat.getDataSource(driverName, path, vsizip) + val ds = OGRFileFormat.getDataSource(driverName, path) val layer = OGRFileFormat.getLayer(ds, layerNumber, layerName) val start = chunkIndex * chunkSize diff --git a/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala b/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala index 88c53d6fb..2caa02251 100644 --- a/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala +++ b/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala @@ -21,6 +21,7 @@ import org.apache.spark.sql.catalyst.expressions.{Expression, Literal} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{LongType, StringType} +import java.nio.file.Files import scala.reflect.runtime.universe //noinspection DuplicatedCode @@ -257,6 +258,7 @@ class MosaicContext(indexSystem: IndexSystem, geometryAPI: GeometryAPI) extends mosaicRegistry.registerExpression[RST_BoundingBox](expressionConfig) mosaicRegistry.registerExpression[RST_Clip](expressionConfig) mosaicRegistry.registerExpression[RST_CombineAvg](expressionConfig) + mosaicRegistry.registerExpression[RST_DerivedBand](expressionConfig) mosaicRegistry.registerExpression[RST_GeoReference](expressionConfig) mosaicRegistry.registerExpression[RST_GetNoData](expressionConfig) mosaicRegistry.registerExpression[RST_GetSubdataset](expressionConfig) @@ -267,6 +269,7 @@ class MosaicContext(indexSystem: IndexSystem, geometryAPI: GeometryAPI) extends mosaicRegistry.registerExpression[RST_Merge](expressionConfig) mosaicRegistry.registerExpression[RST_FromBands](expressionConfig) mosaicRegistry.registerExpression[RST_MetaData](expressionConfig) + mosaicRegistry.registerExpression[RST_MapAlgebra](expressionConfig) mosaicRegistry.registerExpression[RST_NDVI](expressionConfig) 
mosaicRegistry.registerExpression[RST_NumBands](expressionConfig) mosaicRegistry.registerExpression[RST_PixelWidth](expressionConfig) @@ -337,6 +340,11 @@ class MosaicContext(indexSystem: IndexSystem, geometryAPI: GeometryAPI) extends RST_CombineAvgAgg.registryExpressionInfo(database), (exprs: Seq[Expression]) => RST_CombineAvgAgg(exprs(0), expressionConfig) ) + registry.registerFunction( + FunctionIdentifier("rst_derivedband_agg", database), + RST_DerivedBandAgg.registryExpressionInfo(database), + (exprs: Seq[Expression]) => RST_DerivedBandAgg(exprs(0), exprs(1), exprs(2), expressionConfig) + ) /** IndexSystem and GeometryAPI Specific methods */ registry.registerFunction( @@ -628,6 +636,8 @@ class MosaicContext(indexSystem: IndexSystem, geometryAPI: GeometryAPI) extends def rst_boundingbox(raster: Column): Column = ColumnAdapter(RST_BoundingBox(raster.expr, expressionConfig)) def rst_clip(raster: Column, geometry: Column): Column = ColumnAdapter(RST_Clip(raster.expr, geometry.expr, expressionConfig)) def rst_combineavg(rasterArray: Column): Column = ColumnAdapter(RST_CombineAvg(rasterArray.expr, expressionConfig)) + def rst_derivedband(raster: Column, pythonFunc: Column, funcName: Column): Column = + ColumnAdapter(RST_DerivedBand(raster.expr, pythonFunc.expr, funcName.expr, expressionConfig)) def rst_georeference(raster: Column): Column = ColumnAdapter(RST_GeoReference(raster.expr, expressionConfig)) def rst_getnodata(raster: Column): Column = ColumnAdapter(RST_GetNoData(raster.expr, expressionConfig)) def rst_getsubdataset(raster: Column, subdatasetName: Column): Column = @@ -641,6 +651,8 @@ class MosaicContext(indexSystem: IndexSystem, geometryAPI: GeometryAPI) extends def rst_frombands(bandsArray: Column): Column = ColumnAdapter(RST_FromBands(bandsArray.expr, expressionConfig)) def rst_merge(rasterArray: Column): Column = ColumnAdapter(RST_Merge(rasterArray.expr, expressionConfig)) def rst_metadata(raster: Column): Column = 
ColumnAdapter(RST_MetaData(raster.expr, expressionConfig)) + def rst_mapalgebra(rasterArray: Column, jsonSpec: Column): Column = + ColumnAdapter(RST_MapAlgebra(rasterArray.expr, jsonSpec.expr, expressionConfig)) def rst_ndvi(raster: Column, band1: Column, band2: Column): Column = ColumnAdapter(RST_NDVI(raster.expr, band1.expr, band2.expr, expressionConfig)) def rst_ndvi(raster: Column, band1: Int, band2: Int): Column = @@ -738,6 +750,10 @@ class MosaicContext(indexSystem: IndexSystem, geometryAPI: GeometryAPI) extends ColumnAdapter(RST_MergeAgg(raster.expr, expressionConfig).toAggregateExpression(isDistinct = false)) def rst_combineavg_agg(raster: Column): Column = ColumnAdapter(RST_CombineAvgAgg(raster.expr, expressionConfig).toAggregateExpression(isDistinct = false)) + def rst_derivedband_agg(raster: Column, pythonFunc: Column, funcName: Column): Column = + ColumnAdapter( + RST_DerivedBandAgg(raster.expr, pythonFunc.expr, funcName.expr, expressionConfig).toAggregateExpression(isDistinct = false) + ) /** IndexSystem Specific */ @@ -947,6 +963,8 @@ class MosaicContext(indexSystem: IndexSystem, geometryAPI: GeometryAPI) extends object MosaicContext extends Logging { + val tmpDir: String = Files.createTempDirectory("mosaic").toAbsolutePath.toString + private var instance: Option[MosaicContext] = None def build(indexSystem: IndexSystem, geometryAPI: GeometryAPI): MosaicContext = { diff --git a/src/main/scala/com/databricks/labs/mosaic/gdal/MosaicGDAL.scala b/src/main/scala/com/databricks/labs/mosaic/gdal/MosaicGDAL.scala index 4b26bc472..f5849056d 100644 --- a/src/main/scala/com/databricks/labs/mosaic/gdal/MosaicGDAL.scala +++ b/src/main/scala/com/databricks/labs/mosaic/gdal/MosaicGDAL.scala @@ -1,5 +1,6 @@ package com.databricks.labs.mosaic.gdal +import com.databricks.labs.mosaic.functions.MosaicContext import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.gdal.gdal.gdal @@ -42,8 +43,8 @@ object MosaicGDAL extends Logging { /** 
Configures the GDAL environment. */ def configureGDAL(): Unit = { - val CPL_TMPDIR = Files.createTempDirectory("mosaic-gdal-tmp").toAbsolutePath.toString - val GDAL_PAM_PROXY_DIR = Files.createTempDirectory("mosaic-gdal-tmp").toAbsolutePath.toString + val CPL_TMPDIR = MosaicContext.tmpDir + val GDAL_PAM_PROXY_DIR = MosaicContext.tmpDir gdal.SetConfigOption("GDAL_VRT_ENABLE_PYTHON", "YES") gdal.SetConfigOption("GDAL_DISABLE_READDIR_ON_OPEN", "EMPTY_DIR") gdal.SetConfigOption("CPL_TMPDIR", CPL_TMPDIR) diff --git a/src/main/scala/com/databricks/labs/mosaic/utils/PathUtils.scala b/src/main/scala/com/databricks/labs/mosaic/utils/PathUtils.scala index 718f892f2..f3fb9d7b9 100644 --- a/src/main/scala/com/databricks/labs/mosaic/utils/PathUtils.scala +++ b/src/main/scala/com/databricks/labs/mosaic/utils/PathUtils.scala @@ -1,31 +1,16 @@ package com.databricks.labs.mosaic.utils +import com.databricks.labs.mosaic.core.raster.api.GDAL +import com.databricks.labs.mosaic.core.raster.gdal.MosaicRasterGDAL +import com.databricks.labs.mosaic.functions.MosaicContext + import java.nio.file.{Files, Paths} -import java.util.UUID object PathUtils { - def getFormatExtension(rawPath: String): String = { - val path: String = resolvePath(rawPath) - val fileName = path.split("/").last - val extension = fileName.split("\\.").last - extension - } - - private def resolvePath(rawPath: String): String = { - val path = - if (isSubdataset(rawPath)) { - val _ :: filePath :: _ :: Nil = rawPath.split(":").toList - filePath - } else { - rawPath - } - path - } - - def getCleanPath(path: String, useZipPath: Boolean): String = { + def getCleanPath(path: String): String = { val cleanPath = path.replace("file:/", "/").replace("dbfs:/", "/dbfs/") - if (useZipPath && cleanPath.endsWith(".zip")) { + if (cleanPath.endsWith(".zip") || cleanPath.contains(".zip:")) { getZipPath(cleanPath) } else { cleanPath @@ -36,10 +21,6 @@ object PathUtils { path.split(":").length == 3 } - def isInMemory(path: String): Boolean 
= { - path.startsWith("/vsimem/") || path.contains("/vsimem/") - } - def getSubdatasetPath(path: String): String = { // Subdatasets are paths with a colon in them. // We need to check for this condition and handle it. @@ -60,37 +41,9 @@ object PathUtils { readPath } - def copyToTmp(rawPath: String): String = { - try { - val path: String = resolvePath(rawPath) - - val fileName = path.split("/").last - val extension = getFormatExtension(path) - - val inPath = getCleanPath(path, useZipPath = extension == "zip") - - val randomID = UUID.randomUUID().toString - val tmpDir = Files.createTempDirectory(s"mosaic_local_$randomID").toFile.getAbsolutePath - - val outPath = s"$tmpDir/$fileName" - - Files.createDirectories(Paths.get(tmpDir)) - Files.copy(Paths.get(inPath), Paths.get(outPath)) - - if (isSubdataset(rawPath)) { - val format :: _ :: subdataset :: Nil = rawPath.split(":").toList - getSubdatasetPath(s"$format:$outPath:$subdataset") - } else { - outPath - } - } catch { - case _: Throwable => rawPath - } - } - - def createTmpFilePath(uuid: String, extension: String): String = { - val randomID = UUID.randomUUID() - val tmpDir = Files.createTempDirectory(s"mosaic_tmp_$randomID").toFile.getAbsolutePath + def createTmpFilePath(extension: String): String = { + val tmpDir = MosaicContext.tmpDir + val uuid = java.util.UUID.randomUUID.toString val outPath = s"$tmpDir/raster_${uuid.replace("-", "_")}.$extension" Files.createDirectories(Paths.get(outPath).getParent) outPath @@ -98,7 +51,20 @@ object PathUtils { def fromSubdatasetPath(path: String): String = { val _ :: filePath :: _ :: Nil = path.split(":").toList - filePath + var result = filePath + if (filePath.startsWith("\"")) result = result.drop(1) + if (filePath.endsWith("\"")) result = result.dropRight(1) + result + } + + def copyToTmp(inPath: String): String = { + val cleanPath = getCleanPath(inPath) + val copyFromPath = inPath.replace("file:/", "/").replace("dbfs:/", "/dbfs/") + val driver = 
MosaicRasterGDAL.identifyDriver(cleanPath) + val extension = if (inPath.endsWith(".zip")) "zip" else GDAL.getExtension(driver) + val tmpPath = createTmpFilePath(extension) + Files.copy(Paths.get(copyFromPath), Paths.get(tmpPath)) + tmpPath } } diff --git a/src/main/scala/com/databricks/labs/mosaic/utils/SysUtils.scala b/src/main/scala/com/databricks/labs/mosaic/utils/SysUtils.scala new file mode 100644 index 000000000..85fa12785 --- /dev/null +++ b/src/main/scala/com/databricks/labs/mosaic/utils/SysUtils.scala @@ -0,0 +1,26 @@ +package com.databricks.labs.mosaic.utils + +import java.io.{ByteArrayOutputStream, PrintWriter} + +object SysUtils { + + import sys.process._ + + def runCommand(cmd: String): (String, String, String) = { + val stdoutStream = new ByteArrayOutputStream + val stderrStream = new ByteArrayOutputStream + val stdoutWriter = new PrintWriter(stdoutStream) + val stderrWriter = new PrintWriter(stderrStream) + val exitValue = try { + //noinspection ScalaStyle + cmd.!!(ProcessLogger(stdoutWriter.println, stderrWriter.println)) + } catch { + case _: Exception => "ERROR" + } finally { + stdoutWriter.close() + stderrWriter.close() + } + (exitValue, stdoutStream.toString, stderrStream.toString) + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/core/raster/TestRasterBandGDAL.scala b/src/test/scala/com/databricks/labs/mosaic/core/raster/TestRasterBandGDAL.scala index 34d743ac1..15eef2009 100644 --- a/src/test/scala/com/databricks/labs/mosaic/core/raster/TestRasterBandGDAL.scala +++ b/src/test/scala/com/databricks/labs/mosaic/core/raster/TestRasterBandGDAL.scala @@ -30,7 +30,7 @@ class TestRasterBandGDAL extends SharedSparkSessionGDAL { val testValues = testBand.values(1000, 1000, 100, 50) testValues.length shouldBe 5000 - testRaster.cleanUp() + testRaster.getRaster.delete() } test("Read band metadata and pixel data from a GRIdded Binary file.") { @@ -49,7 +49,7 @@ class TestRasterBandGDAL extends SharedSparkSessionGDAL { val testValues = 
testBand.values(1, 1, 4, 5) testValues.length shouldBe 20 - testRaster.cleanUp() + testRaster.getRaster.delete() } test("Read band metadata and pixel data from a NetCDF file.") { @@ -74,8 +74,8 @@ class TestRasterBandGDAL extends SharedSparkSessionGDAL { noException should be thrownBy testBand.values testValues.length shouldBe 1000 - testRaster.cleanUp() - superRaster.cleanUp() + testRaster.getRaster.delete() + superRaster.getRaster.delete() } } diff --git a/src/test/scala/com/databricks/labs/mosaic/core/raster/TestRasterGDAL.scala b/src/test/scala/com/databricks/labs/mosaic/core/raster/TestRasterGDAL.scala index a0055f9d9..e39279843 100644 --- a/src/test/scala/com/databricks/labs/mosaic/core/raster/TestRasterGDAL.scala +++ b/src/test/scala/com/databricks/labs/mosaic/core/raster/TestRasterGDAL.scala @@ -46,7 +46,8 @@ class TestRasterGDAL extends SharedSparkSessionGDAL { noException should be thrownBy testRaster.spatialRef an[Exception] should be thrownBy testRaster.getBand(-1) an[Exception] should be thrownBy testRaster.getBand(Int.MaxValue) - testRaster.cleanUp() + + testRaster.getRaster.delete() } test("Read raster metadata from a GRIdded Binary file.") { @@ -62,7 +63,8 @@ class TestRasterGDAL extends SharedSparkSessionGDAL { testRaster.proj4String shouldBe "+proj=longlat +R=6371229 +no_defs" testRaster.SRID shouldBe 0 testRaster.extent shouldBe Seq(-0.375, -0.375, 10.125, 10.125) - testRaster.cleanUp() + + testRaster.getRaster.delete() } test("Read raster metadata from a NetCDF file.") { @@ -86,16 +88,16 @@ class TestRasterGDAL extends SharedSparkSessionGDAL { testRaster.SRID shouldBe 0 testRaster.extent shouldBe Seq(-180.00000610436345, -89.99999847369712, 180.00000610436345, 89.99999847369712) - testRaster.cleanUp() - superRaster.cleanUp() + testRaster.getRaster.delete() + superRaster.getRaster.delete() } test("Raster pixel and extent sizes are correct.") { assume(System.getProperty("os.name") == "Linux") val testRaster = MosaicRasterGDAL.readRaster( - 
filePath("/modis/MCD43A4.A2018185.h10v07.006.2018194033728_B01.TIF"), - filePath("/modis/MCD43A4.A2018185.h10v07.006.2018194033728_B01.TIF") + filePath("/modis/MCD43A4.A2018185.h10v07.006.2018194033728_B01.TIF"), + filePath("/modis/MCD43A4.A2018185.h10v07.006.2018194033728_B01.TIF") ) testRaster.pixelXSize - 463.312716527 < 0.0000001 shouldBe true @@ -110,7 +112,7 @@ class TestRasterGDAL extends SharedSparkSessionGDAL { testRaster.xMin - -8895604.157333 < 0.0000001 shouldBe true testRaster.yMin - 2223901.039333 < 0.0000001 shouldBe true - testRaster.cleanUp() + testRaster.getRaster.delete() } } diff --git a/src/test/scala/com/databricks/labs/mosaic/datasource/OGRFileFormatTest.scala b/src/test/scala/com/databricks/labs/mosaic/datasource/OGRFileFormatTest.scala index 001642880..6ed735d1f 100644 --- a/src/test/scala/com/databricks/labs/mosaic/datasource/OGRFileFormatTest.scala +++ b/src/test/scala/com/databricks/labs/mosaic/datasource/OGRFileFormatTest.scala @@ -1,10 +1,9 @@ package com.databricks.labs.mosaic.datasource -import com.databricks.labs.mosaic.{H3, JTS} -import com.databricks.labs.mosaic.core.raster.api.GDAL import com.databricks.labs.mosaic.expressions.util.OGRReadeWithOffset import com.databricks.labs.mosaic.functions.MosaicContext import com.databricks.labs.mosaic.utils.PathUtils +import com.databricks.labs.mosaic.{H3, JTS} import org.apache.spark.sql.QueryTest import org.apache.spark.sql.functions.{col, lit} import org.apache.spark.sql.test.SharedSparkSessionGDAL @@ -81,7 +80,7 @@ class OGRFileFormatTest extends QueryTest with SharedSparkSessionGDAL { noException should be thrownBy OGRFileFormat.enableOGRDrivers(force = true) - val path = PathUtils.getCleanPath(getClass.getResource("/binary/geodb/bridges.gdb.zip").getPath, useZipPath = true) + val path = PathUtils.getCleanPath(getClass.getResource("/binary/geodb/bridges.gdb.zip").getPath) val ds = ogr.Open(path, 0) noException should be thrownBy OGRFileFormat.getLayer(ds, 0, "layer2") diff --git 
a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_CombineAvgBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_CombineAvgBehaviors.scala new file mode 100644 index 000000000..b0f1225d2 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_CombineAvgBehaviors.scala @@ -0,0 +1,52 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI +import com.databricks.labs.mosaic.core.index.IndexSystem +import com.databricks.labs.mosaic.functions.MosaicContext +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.functions.collect_set +import org.scalatest.matchers.should.Matchers._ + +trait RST_CombineAvgBehaviors extends QueryTest { + + // noinspection MapGetGet + def behaviors(indexSystem: IndexSystem, geometryAPI: GeometryAPI): Unit = { + val mc = MosaicContext.build(indexSystem, geometryAPI) + mc.register() + val sc = spark + import mc.functions._ + import sc.implicits._ + + val rastersInMemory = spark.read + .format("gdal") + .option("raster_storage", "in-memory") + .load("src/test/resources/modis") + + val gridTiles = rastersInMemory.union(rastersInMemory) + .withColumn("tiles", rst_tessellate($"tile", 2)) + .select("path", "tiles") + .groupBy("path") + .agg( + rst_combineavg(collect_set($"tiles")).as("tiles") + ) + .select("tiles") + + rastersInMemory.union(rastersInMemory) + .createOrReplaceTempView("source") + + noException should be thrownBy spark.sql(""" + |select rst_combineavg(collect_set(tiles)) as tiles + |from ( + | select path, rst_tessellate(tile, 2) as tiles + | from source + |) + |group by path + |""".stripMargin).take(1) + + val result = gridTiles.collect() + + result.length should be(rastersInMemory.count()) + + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_CombineAvgTest.scala 
b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_CombineAvgTest.scala new file mode 100644 index 000000000..f6430df30 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_CombineAvgTest.scala @@ -0,0 +1,32 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.JTS +import com.databricks.labs.mosaic.core.index.H3IndexSystem +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSessionGDAL + +import scala.util.Try + +class RST_CombineAvgTest extends QueryTest with SharedSparkSessionGDAL with RST_CombineAvgBehaviors { + + private val noCodegen = + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false", + SQLConf.CODEGEN_FACTORY_MODE.key -> CodegenObjectFactoryMode.NO_CODEGEN.toString + ) _ + + // Hotfix for SharedSparkSession afterAll cleanup. + override def afterAll(): Unit = Try(super.afterAll()) + + // These tests are not index system nor geometry API specific. + // Only testing one pairing is sufficient. 
+ test("Testing RST_CombineAvg with manual GDAL registration (H3, JTS).") { + noCodegen { + assume(System.getProperty("os.name") == "Linux") + behaviors(H3IndexSystem, JTS) + } + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBandAggBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBandAggBehaviors.scala new file mode 100644 index 000000000..c3668bd83 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBandAggBehaviors.scala @@ -0,0 +1,73 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI +import com.databricks.labs.mosaic.core.index.IndexSystem +import com.databricks.labs.mosaic.functions.MosaicContext +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.functions.lit +import org.scalatest.matchers.should.Matchers._ + +trait RST_DerivedBandAggBehaviors extends QueryTest { + + // noinspection MapGetGet + def behaviors(indexSystem: IndexSystem, geometryAPI: GeometryAPI): Unit = { + val mc = MosaicContext.build(indexSystem, geometryAPI) + mc.register() + val sc = spark + import mc.functions._ + import sc.implicits._ + + val rastersInMemory = spark.read + .format("gdal") + .option("raster_storage", "in-memory") + .load("src/test/resources/modis") + + val funcName = "multiply" + + // Example code from: https://gdal.org/drivers/raster/vrt.html#vrt-that-multiplies-the-values-of-the-source-file-by-a-factor-of-1-5 + val pyFuncCode = """ + |import numpy as np + |def multiply(in_ar, out_ar, xoff, yoff, xsize, ysize, raster_xsize,raster_ysize, buf_radius, gt, **kwargs): + | factor = 1.5 + | out_ar[:] = np.round_(np.clip(in_ar[0] * factor,0,255)) + |""".stripMargin + + val gridTiles = rastersInMemory.union(rastersInMemory) + .withColumn("tiles", rst_tessellate($"tile", 2)) + .select("path", "tiles") + .groupBy("path") + .agg( + rst_derivedband_agg($"tiles", 
lit(pyFuncCode), lit(funcName)).as("tiles") + ) + .select("tiles") + + rastersInMemory.union(rastersInMemory) + .createOrReplaceTempView("source") + + // Do not indent the code in the SQL statement + // It will be wrongly interpreted in python as broken + noException should be thrownBy spark.sql(""" + |select rst_derivedband_agg( + | tiles, + |" + |import numpy as np + |def multiply(in_ar, out_ar, xoff, yoff, xsize, ysize, raster_xsize,raster_ysize, buf_radius, gt, **kwargs): + | factor = 1.2 + | out_ar[:] = np.round_(np.clip(in_ar[0] * factor,0,255)) + |", + | "multiply" + |) as tiles + |from ( + | select path, rst_tessellate(tile, 2) as tiles + | from source + |) + |group by path + |""".stripMargin).take(1) + + val result = gridTiles.collect() + + result.length should be(rastersInMemory.count()) + + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBandAggTest.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBandAggTest.scala new file mode 100644 index 000000000..0ed21a397 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBandAggTest.scala @@ -0,0 +1,32 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.JTS +import com.databricks.labs.mosaic.core.index.H3IndexSystem +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSessionGDAL + +import scala.util.Try + +class RST_DerivedBandAggTest extends QueryTest with SharedSparkSessionGDAL with RST_DerivedBandAggBehaviors { + + private val noCodegen = + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false", + SQLConf.CODEGEN_FACTORY_MODE.key -> CodegenObjectFactoryMode.NO_CODEGEN.toString + ) _ + + // Hotfix for SharedSparkSession afterAll cleanup. 
+ override def afterAll(): Unit = Try(super.afterAll()) + + // These tests are not index system nor geometry API specific. + // Only testing one pairing is sufficient. + test("Testing RST_DerivedBandAggTest with manual GDAL registration (H3, JTS).") { + noCodegen { + assume(System.getProperty("os.name") == "Linux") + behaviors(H3IndexSystem, JTS) + } + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBandBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBandBehaviors.scala new file mode 100644 index 000000000..bd2ded02a --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBandBehaviors.scala @@ -0,0 +1,75 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI +import com.databricks.labs.mosaic.core.index.IndexSystem +import com.databricks.labs.mosaic.functions.MosaicContext +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.functions.{collect_set, lit} +import org.scalatest.matchers.should.Matchers._ + +trait RST_DerivedBandBehaviors extends QueryTest { + + // noinspection MapGetGet + def behaviors(indexSystem: IndexSystem, geometryAPI: GeometryAPI): Unit = { + val mc = MosaicContext.build(indexSystem, geometryAPI) + mc.register() + val sc = spark + import mc.functions._ + import sc.implicits._ + + val rastersInMemory = spark.read + .format("gdal") + .option("raster_storage", "in-memory") + .load("src/test/resources/modis") + + val funcName = "multiply" + + // Example code from: https://gdal.org/drivers/raster/vrt.html#vrt-that-multiplies-the-values-of-the-source-file-by-a-factor-of-1-5 + val pyFuncCode = + """ + |import numpy as np + |def multiply(in_ar, out_ar, xoff, yoff, xsize, ysize, raster_xsize,raster_ysize, buf_radius, gt, **kwargs): + | factor = 1.5 + | out_ar[:] = np.round_(np.clip(in_ar[0] * factor,0,255)) + |""".stripMargin + + val gridTiles = 
rastersInMemory.union(rastersInMemory) + .withColumn("tiles", rst_tessellate($"tile", 2)) + .select("path", "tiles") + .groupBy("path") + .agg( + rst_derivedband(collect_set($"tiles"), lit(pyFuncCode), lit(funcName)).as("tiles") + ) + .select("tiles") + + rastersInMemory.union(rastersInMemory) + .createOrReplaceTempView("source") + + // Do not indent the code in the SQL statement + // It will be wrongly interpreted in python as broken + noException should be thrownBy spark.sql( + """ + |select rst_derivedband( + | collect_set(tiles), + |" + |import numpy as np + |def multiply(in_ar, out_ar, xoff, yoff, xsize, ysize, raster_xsize,raster_ysize, buf_radius, gt, **kwargs): + | factor = 1.2 + | out_ar[:] = np.round_(np.clip(in_ar[0] * factor,0,255)) + |", + | "multiply" + |) as tiles + |from ( + | select path, rst_tessellate(tile, 2) as tiles + | from source + |) + |group by path + |""".stripMargin).take(1) + + val result = gridTiles.collect() + + result.length should be(rastersInMemory.count()) + + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBandTest.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBandTest.scala new file mode 100644 index 000000000..9960ef4d7 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_DerivedBandTest.scala @@ -0,0 +1,32 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.JTS +import com.databricks.labs.mosaic.core.index.H3IndexSystem +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSessionGDAL + +import scala.util.Try + +class RST_DerivedBandTest extends QueryTest with SharedSparkSessionGDAL with RST_DerivedBandBehaviors { + + private val noCodegen = + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false", + 
SQLConf.CODEGEN_FACTORY_MODE.key -> CodegenObjectFactoryMode.NO_CODEGEN.toString + ) _ + + // Hotfix for SharedSparkSession afterAll cleanup. + override def afterAll(): Unit = Try(super.afterAll()) + + // These tests are not index system nor geometry API specific. + // Only testing one pairing is sufficient. + test("Testing RST_DerivedBandTest with manual GDAL registration (H3, JTS).") { + noCodegen { + assume(System.getProperty("os.name") == "Linux") + behaviors(H3IndexSystem, JTS) + } + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_FromBandsBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_FromBandsBehaviors.scala new file mode 100644 index 000000000..3a7f7f4a2 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_FromBandsBehaviors.scala @@ -0,0 +1,56 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI +import com.databricks.labs.mosaic.core.index.IndexSystem +import com.databricks.labs.mosaic.functions.MosaicContext +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.functions.array +import org.scalatest.matchers.should.Matchers._ + +trait RST_FromBandsBehaviors extends QueryTest { + + // noinspection MapGetGet + def behaviors(indexSystem: IndexSystem, geometryAPI: GeometryAPI): Unit = { + val mc = MosaicContext.build(indexSystem, geometryAPI) + mc.register() + val sc = spark + import mc.functions._ + import sc.implicits._ + + val rastersInMemory = spark.read + .format("binaryFile") + .load("src/test/resources/modis") + + val gridTiles = rastersInMemory + .withColumn("tile", rst_fromfile($"path")) + .withColumn("bbox", rst_boundingbox($"tile")) + .withColumn("stacked", rst_frombands(array($"tile", $"tile", $"tile"))) + .withColumn("bbox2", rst_boundingbox($"stacked")) + .withColumn("result", st_area($"bbox") === st_area($"bbox2")) + .select("result") + .as[Boolean] + 
.collect() + + gridTiles.forall(identity) should be(true) + + rastersInMemory.createOrReplaceTempView("source") + + val gridTilesSQL = spark + .sql(""" + |with subquery as ( + | select rst_fromfile(path) as tile from source + |), + |subquery2 as ( + | select rst_frombands(array(tile, tile, tile)) as stacked, tile from subquery + |) + |select st_area(rst_boundingbox(tile)) == st_area(rst_boundingbox(stacked)) as result + |from subquery2 + |""".stripMargin) + .as[Boolean] + .collect() + + gridTilesSQL.forall(identity) should be(true) + + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_FromBandsTest.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_FromBandsTest.scala new file mode 100644 index 000000000..b60888125 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_FromBandsTest.scala @@ -0,0 +1,32 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.JTS +import com.databricks.labs.mosaic.core.index.H3IndexSystem +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSessionGDAL + +import scala.util.Try + +class RST_FromBandsTest extends QueryTest with SharedSparkSessionGDAL with RST_FromBandsBehaviors { + + private val noCodegen = + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false", + SQLConf.CODEGEN_FACTORY_MODE.key -> CodegenObjectFactoryMode.NO_CODEGEN.toString + ) _ + + // Hotfix for SharedSparkSession afterAll cleanup. + override def afterAll(): Unit = Try(super.afterAll()) + + // These tests are not index system nor geometry API specific. + // Only testing one pairing is sufficient. 
+ test("Testing RST_FromBands with manual GDAL registration (H3, JTS).") { + noCodegen { + assume(System.getProperty("os.name") == "Linux") + behaviors(H3IndexSystem, JTS) + } + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetNoDataBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetNoDataBehaviors.scala new file mode 100644 index 000000000..b1154f55e --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetNoDataBehaviors.scala @@ -0,0 +1,49 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI +import com.databricks.labs.mosaic.core.index.IndexSystem +import com.databricks.labs.mosaic.functions.MosaicContext +import org.apache.spark.sql.QueryTest +import org.scalatest.matchers.should.Matchers._ + +trait RST_GetNoDataBehaviors extends QueryTest { + + //noinspection MapGetGet + def behaviors(indexSystem: IndexSystem, geometryAPI: GeometryAPI): Unit = { + val mc = MosaicContext.build(indexSystem, geometryAPI) + mc.register() + val sc = spark + import mc.functions._ + import sc.implicits._ + + val rastersInMemory = spark.read + .format("gdal") + .option("raster_storage", "in-memory") + .load("src/test/resources/modis/") + + val noDataVals = rastersInMemory + .withColumn("no_data", rst_getnodata($"tile")) + .select("no_data") + + rastersInMemory + .createOrReplaceTempView("source") + + noException should be thrownBy spark.sql(""" + |select rst_getnodata(tile) from source + |""".stripMargin) + + noException should be thrownBy rastersInMemory + .withColumn("no_data", rst_getnodata($"tile")) + .select("no_data") + + val result = noDataVals.as[Seq[Double]].collect() + + result.forall(_.forall(_ == 32767.0)) should be(true) + + an[Exception] should be thrownBy spark.sql(""" + |select rst_getnodata() from source + |""".stripMargin) + + } + +} diff --git 
a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetNoDataTest.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetNoDataTest.scala new file mode 100644 index 000000000..ce29c5870 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetNoDataTest.scala @@ -0,0 +1,32 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.JTS +import com.databricks.labs.mosaic.core.index.H3IndexSystem +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSessionGDAL + +import scala.util.Try + +class RST_GetNoDataTest extends QueryTest with SharedSparkSessionGDAL with RST_GetNoDataBehaviors { + + private val noCodegen = + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false", + SQLConf.CODEGEN_FACTORY_MODE.key -> CodegenObjectFactoryMode.NO_CODEGEN.toString + ) _ + + // Hotfix for SharedSparkSession afterAll cleanup. + override def afterAll(): Unit = Try(super.afterAll()) + + // These tests are not index system nor geometry API specific. + // Only testing one pairing is sufficient. 
+ test("Testing RST_GetNoData with manual GDAL registration (H3, JTS).") { + noCodegen { + assume(System.getProperty("os.name") == "Linux") + behaviors(H3IndexSystem, JTS) + } + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetSubdatasetBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetSubdatasetBehaviors.scala new file mode 100644 index 000000000..cc572e475 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetSubdatasetBehaviors.scala @@ -0,0 +1,51 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI +import com.databricks.labs.mosaic.core.index.IndexSystem +import com.databricks.labs.mosaic.functions.MosaicContext +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.functions.lit +import org.scalatest.matchers.should.Matchers._ + +trait RST_GetSubdatasetBehaviors extends QueryTest { + + //noinspection MapGetGet + def behaviors(indexSystem: IndexSystem, geometryAPI: GeometryAPI): Unit = { + val mc = MosaicContext.build(indexSystem, geometryAPI) + mc.register() + val sc = spark + import mc.functions._ + import sc.implicits._ + + val rastersInMemory = spark.read + .format("gdal") + .option("raster_storage", "in-memory") + .load("src/test/resources/binary/netcdf-coral") + + val geoReferenceDf = rastersInMemory + .withColumn("subdataset", rst_getsubdataset($"tile", lit("bleaching_alert_area"))) + .select(rst_georeference($"subdataset")) + + rastersInMemory + .createOrReplaceTempView("source") + + noException should be thrownBy spark.sql(""" + |select rst_georeference(rst_getsubdataset(tile, "bleaching_alert_area")) from source + |""".stripMargin) + + val result = geoReferenceDf.as[Map[String, Double]].take(1).head + + result.get("upperLeftX").get != 0.0 shouldBe true + result.get("upperLeftY").get != 0.0 shouldBe true + result.get("scaleX").get != 0.0 shouldBe true + 
result.get("scaleY").get != 0.0 shouldBe true + result.get("skewX").get != 0.0 shouldBe false + result.get("skewY").get != 0.0 shouldBe false + + an[Exception] should be thrownBy spark.sql(""" + |select rst_getsubdataset() from source + |""".stripMargin) + + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetSubdatasetTest.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetSubdatasetTest.scala new file mode 100644 index 000000000..019e4f226 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_GetSubdatasetTest.scala @@ -0,0 +1,32 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.JTS +import com.databricks.labs.mosaic.core.index.H3IndexSystem +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSessionGDAL + +import scala.util.Try + +class RST_GetSubdatasetTest extends QueryTest with SharedSparkSessionGDAL with RST_GetSubdatasetBehaviors { + + private val noCodegen = + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false", + SQLConf.CODEGEN_FACTORY_MODE.key -> CodegenObjectFactoryMode.NO_CODEGEN.toString + ) _ + + // Hotfix for SharedSparkSession afterAll cleanup. + override def afterAll(): Unit = Try(super.afterAll()) + + // These tests are not index system nor geometry API specific. + // Only testing one pairing is sufficient. 
+ test("Testing RST_GetSubdataset with manual GDAL registration (H3, JTS).") { + noCodegen { + assume(System.getProperty("os.name") == "Linux") + behaviors(H3IndexSystem, JTS) + } + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_InitNoDataBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_InitNoDataBehaviors.scala new file mode 100644 index 000000000..cb00638e1 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_InitNoDataBehaviors.scala @@ -0,0 +1,53 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI +import com.databricks.labs.mosaic.core.index.IndexSystem +import com.databricks.labs.mosaic.functions.MosaicContext +import org.apache.spark.sql.QueryTest +import org.scalatest.matchers.should.Matchers._ + +trait RST_InitNoDataBehaviors extends QueryTest { + + //noinspection MapGetGet + def behaviors(indexSystem: IndexSystem, geometryAPI: GeometryAPI): Unit = { + val mc = MosaicContext.build(indexSystem, geometryAPI) + mc.register() + val sc = spark + import mc.functions._ + import sc.implicits._ + + val rastersInMemory = spark.read + .format("gdal") + .option("raster_storage", "in-memory") + .load("src/test/resources/modis/") + + val noDataVals = rastersInMemory + .withColumn("tile", rst_initnodata($"tile")) + .withColumn("no_data", rst_getnodata($"tile")) + .select("no_data") + + rastersInMemory + .createOrReplaceTempView("source") + + noException should be thrownBy spark.sql( + """ + |select rst_getnodata(rst_initnodata(tile)) from source + |""".stripMargin) + + noException should be thrownBy rastersInMemory + .withColumn("tile", rst_initnodata($"tile")) + .withColumn("no_data", rst_getnodata($"tile")) + .select("no_data") + + val result = noDataVals.as[Seq[Double]].collect() + + result.forall(_.forall(_ == -32768.0)) should be(true) + + an[Exception] should be thrownBy spark.sql( + """ + 
|select rst_initnodata() from source + |""".stripMargin) + + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_InitNoDataTest.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_InitNoDataTest.scala new file mode 100644 index 000000000..f861e8fef --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_InitNoDataTest.scala @@ -0,0 +1,32 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.JTS +import com.databricks.labs.mosaic.core.index.H3IndexSystem +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSessionGDAL + +import scala.util.Try + +class RST_InitNoDataTest extends QueryTest with SharedSparkSessionGDAL with RST_InitNoDataBehaviors { + + private val noCodegen = + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false", + SQLConf.CODEGEN_FACTORY_MODE.key -> CodegenObjectFactoryMode.NO_CODEGEN.toString + ) _ + + // Hotfix for SharedSparkSession afterAll cleanup. + override def afterAll(): Unit = Try(super.afterAll()) + + // These tests are not index system nor geometry API specific. + // Only testing one pairing is sufficient. 
+ test("Testing RST_InitNoData with manual GDAL registration (H3, JTS).") { + noCodegen { + assume(System.getProperty("os.name") == "Linux") + behaviors(H3IndexSystem, JTS) + } + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_MapAlgebraBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_MapAlgebraBehaviors.scala new file mode 100644 index 000000000..fd15f8102 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_MapAlgebraBehaviors.scala @@ -0,0 +1,79 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI +import com.databricks.labs.mosaic.core.index.IndexSystem +import com.databricks.labs.mosaic.functions.MosaicContext +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.functions.{array, lit} +import org.scalatest.matchers.should.Matchers._ + +trait RST_MapAlgebraBehaviors extends QueryTest { + + // noinspection MapGetGet + def behaviors(indexSystem: IndexSystem, geometryAPI: GeometryAPI): Unit = { + val mc = MosaicContext.build(indexSystem, geometryAPI) + mc.register() + val sc = spark + import mc.functions._ + import sc.implicits._ + + val rastersInMemory = spark.read + .format("gdal") + .option("raster_storage", "in-memory") + .load("src/test/resources/modis") + + val gridTiles = rastersInMemory + .withColumn("tiles", array($"tile", $"tile", $"tile")) + .withColumn("map_algebra", rst_mapalgebra($"tiles", lit("""{"calc": "A+B/C", "A_index": 0, "B_index": 1, "C_index": 2}"""))) + .select("tiles") + + rastersInMemory + .createOrReplaceTempView("source") + + noException should be thrownBy spark.sql( + """ + |select rst_mapalgebra(tiles, '{"calc": "A+B/C", "A_index": 0, "B_index": 1, "C_index": 2}') + | as tiles + |from ( + | select array(tile, tile, tile) as tiles + | from source + |) + |""".stripMargin).take(1) + + noException should be thrownBy spark.sql( + """ + |select 
rst_mapalgebra(tiles, '{"calc": "A+B/C"}') + | as tiles + |from ( + | select array(tile, tile, tile) as tiles + | from source + |) + |""".stripMargin).take(1) + + noException should be thrownBy spark.sql( + """ + |select rst_mapalgebra(tiles, '{"calc": "A+B/C", "A_index": 0, "B_index": 1, "C_index": 1}') + | as tiles + |from ( + | select array(tile, tile, tile) as tiles + | from source + |) + |""".stripMargin).take(1) + + noException should be thrownBy spark.sql( + """ + |select rst_mapalgebra(tiles, '{"calc": "A+B/C", "A_index": 0, "B_index": 1, "C_index": 2, "A_band": 1, "B_band": 1, "C_band": 1}') + | as tiles + |from ( + | select array(tile, tile, tile) as tiles + | from source + |) + |""".stripMargin).take(1) + + val result = gridTiles.collect() + + result.length should be(rastersInMemory.count()) + + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_MapAlgebraTest.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_MapAlgebraTest.scala new file mode 100644 index 000000000..d7be403a3 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_MapAlgebraTest.scala @@ -0,0 +1,32 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.JTS +import com.databricks.labs.mosaic.core.index.H3IndexSystem +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSessionGDAL + +import scala.util.Try + +class RST_MapAlgebraTest extends QueryTest with SharedSparkSessionGDAL with RST_MapAlgebraBehaviors { + + private val noCodegen = + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false", + SQLConf.CODEGEN_FACTORY_MODE.key -> CodegenObjectFactoryMode.NO_CODEGEN.toString + ) _ + + // Hotfix for SharedSparkSession afterAll cleanup. 
+ override def afterAll(): Unit = Try(super.afterAll()) + + // These tests are not index system nor geometry API specific. + // Only testing one pairing is sufficient. + test("Testing RST_MapAlgebra with manual GDAL registration (H3, JTS).") { + noCodegen { + assume(System.getProperty("os.name") == "Linux") + behaviors(H3IndexSystem, JTS) + } + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_NDVIBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_NDVIBehaviors.scala new file mode 100644 index 000000000..b433ccd79 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_NDVIBehaviors.scala @@ -0,0 +1,44 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI +import com.databricks.labs.mosaic.core.index.IndexSystem +import com.databricks.labs.mosaic.functions.MosaicContext +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.functions.{array, lit} +import org.scalatest.matchers.should.Matchers._ + +trait RST_NDVIBehaviors extends QueryTest { + + // noinspection MapGetGet + def behaviors(indexSystem: IndexSystem, geometryAPI: GeometryAPI): Unit = { + val mc = MosaicContext.build(indexSystem, geometryAPI) + mc.register() + val sc = spark + import mc.functions._ + import sc.implicits._ + + val rastersInMemory = spark.read + .format("gdal") + .option("raster_storage", "in-memory") + .load("src/test/resources/modis") + + val gridTiles = rastersInMemory + .withColumn("ndvi", rst_ndvi($"tile", lit(1), lit(1))) + .select("ndvi") + + rastersInMemory + .createOrReplaceTempView("source") + + noException should be thrownBy spark.sql( + """ + |select rst_ndvi(tile, 1, 1) + | from source + |""".stripMargin).take(1) + + val result = gridTiles.collect() + + result.length should be(rastersInMemory.count()) + + } + +} diff --git 
a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_NDVITest.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_NDVITest.scala new file mode 100644 index 000000000..881ccad1e --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_NDVITest.scala @@ -0,0 +1,32 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.JTS +import com.databricks.labs.mosaic.core.index.H3IndexSystem +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSessionGDAL + +import scala.util.Try + +class RST_NDVITest extends QueryTest with SharedSparkSessionGDAL with RST_NDVIBehaviors { + + private val noCodegen = + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false", + SQLConf.CODEGEN_FACTORY_MODE.key -> CodegenObjectFactoryMode.NO_CODEGEN.toString + ) _ + + // Hotfix for SharedSparkSession afterAll cleanup. + override def afterAll(): Unit = Try(super.afterAll()) + + // These tests are not index system nor geometry API specific. + // Only testing one pairing is sufficient. 
+ test("Testing RST_NDVITest with manual GDAL registration (H3, JTS).") { + noCodegen { + assume(System.getProperty("os.name") == "Linux") + behaviors(H3IndexSystem, JTS) + } + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_SetNoDataBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_SetNoDataBehaviors.scala new file mode 100644 index 000000000..c28403817 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_SetNoDataBehaviors.scala @@ -0,0 +1,50 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI +import com.databricks.labs.mosaic.core.index.IndexSystem +import com.databricks.labs.mosaic.functions.MosaicContext +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.functions.lit +import org.scalatest.matchers.should.Matchers._ + +trait RST_SetNoDataBehaviors extends QueryTest { + + // noinspection MapGetGet + def behaviors(indexSystem: IndexSystem, geometryAPI: GeometryAPI): Unit = { + val mc = MosaicContext.build(indexSystem, geometryAPI) + mc.register() + val sc = spark + import mc.functions._ + import sc.implicits._ + + val rastersInMemory = spark.read + .format("gdal") + .option("raster_storage", "in-memory") + .load("src/test/resources/modis") + + val gridTiles = rastersInMemory + .withColumn("tile", rst_setnodata($"tile", lit(1))) + .select("tile") + + rastersInMemory + .createOrReplaceTempView("source") + + noException should be thrownBy spark.sql( + """ + |select rst_setnodata(tile, 1) + | from source + |""".stripMargin).take(1) + + noException should be thrownBy spark.sql( + """ + |select rst_setnodata(tile, array(1.0)) + | from source + |""".stripMargin).take(1) + + val result = gridTiles.collect() + + result.length should be(rastersInMemory.count()) + + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_SetNoDataTest.scala 
b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_SetNoDataTest.scala new file mode 100644 index 000000000..28a0a0726 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_SetNoDataTest.scala @@ -0,0 +1,32 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.JTS +import com.databricks.labs.mosaic.core.index.H3IndexSystem +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSessionGDAL + +import scala.util.Try + +class RST_SetNoDataTest extends QueryTest with SharedSparkSessionGDAL with RST_SetNoDataBehaviors { + + private val noCodegen = + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false", + SQLConf.CODEGEN_FACTORY_MODE.key -> CodegenObjectFactoryMode.NO_CODEGEN.toString + ) _ + + // Hotfix for SharedSparkSession afterAll cleanup. + override def afterAll(): Unit = Try(super.afterAll()) + + // These tests are not index system nor geometry API specific. + // Only testing one pairing is sufficient. 
+ test("Testing RST_SetNoData with manual GDAL registration (H3, JTS).") { + noCodegen { + assume(System.getProperty("os.name") == "Linux") + behaviors(H3IndexSystem, JTS) + } + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_ToOverlappingTilesBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_ToOverlappingTilesBehaviors.scala new file mode 100644 index 000000000..d51f26891 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_ToOverlappingTilesBehaviors.scala @@ -0,0 +1,45 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI +import com.databricks.labs.mosaic.core.index.IndexSystem +import com.databricks.labs.mosaic.functions.MosaicContext +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.functions.lit +import org.scalatest.matchers.should.Matchers._ + +trait RST_ToOverlappingTilesBehaviors extends QueryTest { + + // noinspection MapGetGet + def behaviors(indexSystem: IndexSystem, geometryAPI: GeometryAPI): Unit = { + val mc = MosaicContext.build(indexSystem, geometryAPI) + mc.register() + val sc = spark + import mc.functions._ + import sc.implicits._ + + val rastersInMemory = spark.read + .format("gdal") + .option("raster_storage", "in-memory") + .load("src/test/resources/modis") + + val gridTiles = rastersInMemory + .withColumn("tile", rst_to_overlapping_tiles($"tile", lit(500), lit(500), lit(10))) + .select("tile") + + rastersInMemory + .createOrReplaceTempView("source") + + noException should be thrownBy spark.sql( + """ + |select rst_to_overlapping_tiles(tile, 500, 500, 10) + | from source + |""".stripMargin).take(1) + + + val result = gridTiles.collect() + + result.length > rastersInMemory.count() should be(true) + + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_ToOverlappingTilesTest.scala 
b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_ToOverlappingTilesTest.scala new file mode 100644 index 000000000..397f77330 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_ToOverlappingTilesTest.scala @@ -0,0 +1,32 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.JTS +import com.databricks.labs.mosaic.core.index.H3IndexSystem +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSessionGDAL + +import scala.util.Try + +class RST_ToOverlappingTilesTest extends QueryTest with SharedSparkSessionGDAL with RST_ToOverlappingTilesBehaviors { + + private val noCodegen = + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false", + SQLConf.CODEGEN_FACTORY_MODE.key -> CodegenObjectFactoryMode.NO_CODEGEN.toString + ) _ + + // Hotfix for SharedSparkSession afterAll cleanup. + override def afterAll(): Unit = Try(super.afterAll()) + + // These tests are not index system nor geometry API specific. + // Only testing one pairing is sufficient. 
+ test("Testing RST_ToOverlappingTilesTest with manual GDAL registration (H3, JTS).") { + noCodegen { + assume(System.getProperty("os.name") == "Linux") + behaviors(H3IndexSystem, JTS) + } + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_TryOpenBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_TryOpenBehaviors.scala new file mode 100644 index 000000000..3e5669614 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_TryOpenBehaviors.scala @@ -0,0 +1,44 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI +import com.databricks.labs.mosaic.core.index.IndexSystem +import com.databricks.labs.mosaic.functions.MosaicContext +import org.apache.spark.sql.QueryTest +import org.scalatest.matchers.should.Matchers._ + +trait RST_TryOpenBehaviors extends QueryTest { + + // noinspection MapGetGet + def behaviors(indexSystem: IndexSystem, geometryAPI: GeometryAPI): Unit = { + val mc = MosaicContext.build(indexSystem, geometryAPI) + mc.register() + val sc = spark + import mc.functions._ + import sc.implicits._ + + val rastersInMemory = spark.read + .format("gdal") + .option("raster_storage", "in-memory") + .load("src/test/resources/modis") + + val gridTiles = rastersInMemory + .withColumn("tile", rst_tryopen($"tile")) + .select("tile") + + rastersInMemory + .createOrReplaceTempView("source") + + noException should be thrownBy spark.sql( + """ + |select rst_tryopen(tile) + | from source + |""".stripMargin).take(1) + + + val result = gridTiles.collect() + + result.length == rastersInMemory.count() should be(true) + + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_TryOpenTest.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_TryOpenTest.scala new file mode 100644 index 000000000..ea24694db --- /dev/null +++ 
b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_TryOpenTest.scala @@ -0,0 +1,32 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.JTS +import com.databricks.labs.mosaic.core.index.H3IndexSystem +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSessionGDAL + +import scala.util.Try + +class RST_TryOpenTest extends QueryTest with SharedSparkSessionGDAL with RST_TryOpenBehaviors { + + private val noCodegen = + withSQLConf( + SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false", + SQLConf.CODEGEN_FACTORY_MODE.key -> CodegenObjectFactoryMode.NO_CODEGEN.toString + ) _ + + // Hotfix for SharedSparkSession afterAll cleanup. + override def afterAll(): Unit = Try(super.afterAll()) + + // These tests are not index system nor geometry API specific. + // Only testing one pairing is sufficient. 
+ test("Testing RST_TryOpen with manual GDAL registration (H3, JTS).") { + noCodegen { + assume(System.getProperty("os.name") == "Linux") + behaviors(H3IndexSystem, JTS) + } + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/models/knn/SpatialKNNBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/models/knn/SpatialKNNBehaviors.scala index c142672ee..bc20f5ecb 100644 --- a/src/test/scala/com/databricks/labs/mosaic/models/knn/SpatialKNNBehaviors.scala +++ b/src/test/scala/com/databricks/labs/mosaic/models/knn/SpatialKNNBehaviors.scala @@ -28,7 +28,7 @@ trait SpatialKNNBehaviors extends MosaicSpatialQueryTest { val boroughs: DataFrame = getBoroughs(mc) - val tempLocation = Files.createTempDirectory("mosaic").toAbsolutePath.toString + val tempLocation = MosaicContext.tmpDir spark.sparkContext.setCheckpointDir(tempLocation) spark.sparkContext.setLogLevel("ERROR") @@ -93,7 +93,7 @@ trait SpatialKNNBehaviors extends MosaicSpatialQueryTest { val boroughs: DataFrame = getBoroughs(mc) - val tempLocation = Files.createTempDirectory("mosaic").toAbsolutePath.toString + val tempLocation = MosaicContext.tmpDir spark.sparkContext.setCheckpointDir(tempLocation) spark.sparkContext.setLogLevel("ERROR") diff --git a/src/test/scala/org/apache/spark/sql/test/SharedSparkSessionGDAL.scala b/src/test/scala/org/apache/spark/sql/test/SharedSparkSessionGDAL.scala index a666e0578..91e93240f 100644 --- a/src/test/scala/org/apache/spark/sql/test/SharedSparkSessionGDAL.scala +++ b/src/test/scala/org/apache/spark/sql/test/SharedSparkSessionGDAL.scala @@ -23,8 +23,7 @@ trait SharedSparkSessionGDAL extends SharedSparkSession { val session = new TestSparkSession(conf) session.sparkContext.setLogLevel("FATAL") Try { - val tempPath = Files.createTempDirectory("mosaic-gdal") - MosaicGDAL.prepareEnvironment(session, tempPath.toAbsolutePath.toString) + MosaicGDAL.prepareEnvironment(session, "/tmp") MosaicGDAL.enableGDAL(session) } session