From d4af23e6eb3d0659ffb6f8b1ff7a8401677ae22a Mon Sep 17 00:00:00 2001 From: zhaokuo03 Date: Sat, 23 Mar 2024 11:01:41 +0800 Subject: [PATCH] fix --- .../backendsapi/velox/VeloxBackend.scala | 1 + .../apache/spark/sql/expression/UDFResolver.scala | 12 +++++++++++- docs/get-started/Velox.md | 6 ++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/VeloxBackend.scala b/backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/VeloxBackend.scala index 8cccc2b150619..0ff2bd0d705df 100644 --- a/backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/VeloxBackend.scala +++ b/backends-velox/src/main/scala/io/glutenproject/backendsapi/velox/VeloxBackend.scala @@ -65,6 +65,7 @@ object BackendSettings extends BackendSettingsApi { val SHUFFLE_SUPPORTED_CODEC = Set("lz4", "zstd") val GLUTEN_VELOX_UDF_LIB_PATHS = getBackendConfigPrefix() + ".udfLibraryPaths" + val GLUTEN_VELOX_DRIVER_UDF_LIB_PATHS = getBackendConfigPrefix() + ".driver.udfLibraryPaths" val MAXIMUM_BATCH_SIZE: Int = 32768 diff --git a/backends-velox/src/main/scala/org/apache/spark/sql/expression/UDFResolver.scala b/backends-velox/src/main/scala/org/apache/spark/sql/expression/UDFResolver.scala index 62d87997cdab7..5cf7dfa71ab1a 100644 --- a/backends-velox/src/main/scala/org/apache/spark/sql/expression/UDFResolver.scala +++ b/backends-velox/src/main/scala/org/apache/spark/sql/expression/UDFResolver.scala @@ -84,6 +84,10 @@ object UDFResolver extends Logging { private val LIB_EXTENSION = ".so" + private lazy val isDriver: Boolean = + "driver".equals(SparkEnv.get.executorId) + + def registerUDF(name: String, bytes: Array[Byte]): Unit = { registerUDF(name, TypeConverter.from(bytes)) } @@ -114,7 +118,13 @@ object UDFResolver extends Logging { def resolveUdfConf(conf: java.util.Map[String, String]): Unit = { val sparkConf = SparkEnv.get.conf - val udfLibPaths = sparkConf.getOption(BackendSettings.GLUTEN_VELOX_UDF_LIB_PATHS) 
+ val udfLibPaths = if (isDriver) { + sparkConf + .getOption(BackendSettings.GLUTEN_VELOX_DRIVER_UDF_LIB_PATHS) + .orElse(sparkConf.getOption(BackendSettings.GLUTEN_VELOX_UDF_LIB_PATHS)) + } else { + sparkConf.getOption(BackendSettings.GLUTEN_VELOX_UDF_LIB_PATHS) + } udfLibPaths match { case Some(paths) => diff --git a/docs/get-started/Velox.md b/docs/get-started/Velox.md index 876a7a753c776..8a900131091a7 100644 --- a/docs/get-started/Velox.md +++ b/docs/get-started/Velox.md @@ -435,16 +435,22 @@ target_link_libraries(myudf PRIVATE ${VELOX_LIBRARY}) Gluten loads the UDF libraries at runtime. You can upload UDF libraries via `--files` or `--archives`, and configure the libray paths using the provided Spark configuration, which accepts comma separated list of library paths. +Note that when running in Yarn client mode, the uploaded files are not reachable on the driver side. Users should copy those files to a location reachable by the driver and set `spark.gluten.sql.columnar.backend.velox.driver.udfLibraryPaths`. This configuration is also useful when the `udfLibraryPaths` differs between the driver side and the executor side. + - Use `--files` ```shell --files /path/to/gluten/cpp/build/velox/udf/examples/libmyudf.so --conf spark.gluten.sql.columnar.backend.velox.udfLibraryPaths=libmyudf.so +# Needed for Yarn client mode +--conf spark.gluten.sql.columnar.backend.velox.driver.udfLibraryPaths=file:///path/to/libmyudf.so ``` - Use `--archives` ```shell --archives /path/to/udf_archives.zip#udf_archives --conf spark.gluten.sql.columnar.backend.velox.udfLibraryPaths=udf_archives +# Needed for Yarn client mode +--conf spark.gluten.sql.columnar.backend.velox.driver.udfLibraryPaths=file:///path/to/udf_archives.zip ``` - Specify URI