Commit

for dataproc and databricks use ipython startup file to enable no-import [skip ci] (#833)

The startup file imports spark_rapids_ml.install.
Note that in both cases the default notebooks run as root.

Also use more parallelism for the notebooks and example instances,
leveraging stage-level scheduling.

Signed-off-by: Erik Ordentlich <[email protected]>
eordentlich authored Feb 4, 2025
1 parent e909a46 commit 3411490
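The "no-import" mode referenced above works by importing spark_rapids_ml.install at interpreter startup; this commit only changes where that import is triggered from. The package's actual mechanics are not shown in this commit, but the general "import to activate" technique can be sketched as a module-level side effect that registers a meta path finder. Everything below is illustrative: the alias name `accelerated_json` and the redirect target `json` are stand-ins, not anything spark-rapids-ml actually does.

```python
# Illustrative sketch only: shows how importing a module can transparently
# redirect later imports, the general technique behind "no import change".
# The alias "accelerated_json" -> "json" is a hypothetical example.
import importlib
import importlib.abc
import importlib.util
import sys


class _AliasLoader(importlib.abc.Loader):
    def __init__(self, real_name):
        self.real_name = real_name

    def exec_module(self, module):
        # populate the alias module with the real module's namespace
        real = importlib.import_module(self.real_name)
        module.__dict__.update(real.__dict__)


class _AliasFinder(importlib.abc.MetaPathFinder):
    def __init__(self, aliases):
        self.aliases = aliases

    def find_spec(self, fullname, path=None, target=None):
        if fullname in self.aliases:
            return importlib.util.spec_from_loader(
                fullname, _AliasLoader(self.aliases[fullname])
            )
        return None


# running this line at startup (e.g. from an IPython startup file)
# "activates" the redirect for the rest of the session
sys.meta_path.insert(0, _AliasFinder({"accelerated_json": "json"}))

accelerated_json = importlib.import_module("accelerated_json")
print(accelerated_json.loads('{"gpu": true}'))
```

Because the finder is installed as a side effect of one import, a single line in a startup file is enough to change how subsequent imports resolve, which is why the init scripts below only need to write one file.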
Showing 5 changed files with 7 additions and 5 deletions.
notebooks/aws-emr/init-configurations.json (1 addition, 1 deletion)

@@ -50,7 +50,7 @@
 "spark.executor.resource.gpu.amount":"1",
 "spark.executor.cores":"8",
 "spark.task.cpus":"1",
-"spark.task.resource.gpu.amount":"1",
+"spark.task.resource.gpu.amount":"0.125",
 "spark.rapids.memory.pinnedPool.size":"2G",
 "spark.executor.memoryOverhead":"2G",
 "spark.sql.files.maxPartitionBytes":"256m",
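The EMR change drops spark.task.resource.gpu.amount from 1 to 0.125. With spark.executor.cores=8 and one GPU per executor (both visible in the hunk above), each task now claims 1/8 of the GPU, so all 8 CPU slots can run concurrently instead of serializing on the GPU. A minimal sketch of the per-resource slot arithmetic (a simplification, not Spark's actual scheduler code):

```python
import math

def concurrent_tasks(executor_cores, task_cpus, executor_gpus, task_gpu_amount):
    """Upper bound on simultaneously running tasks per executor: each
    resource yields some number of slots, and the scarcest resource wins.
    A simplification, not Spark's actual scheduler code."""
    cpu_slots = executor_cores // task_cpus
    gpu_slots = math.floor(executor_gpus / task_gpu_amount)
    return min(cpu_slots, gpu_slots)

# before this commit: each task claimed the whole GPU, so tasks ran one at a time
print(concurrent_tasks(8, 1, 1, 1.0))    # 1
# after: 0.125 of the GPU per task lets all 8 cores work concurrently
print(concurrent_tasks(8, 1, 1, 0.125))  # 8
```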
notebooks/databricks/README.md (1 addition, 1 deletion)

@@ -24,7 +24,7 @@ If you already have a Databricks account, you can run the example notebooks on a
 - **Spark**
 - **Spark config**
   ```
-  spark.task.resource.gpu.amount 1
+  spark.task.resource.gpu.amount 0.125
   spark.databricks.delta.preview.enabled true
   spark.python.worker.reuse true
   spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-24.10.1.jar:/databricks/spark/python
notebooks/databricks/init-pip-cuda-11.8.sh (2 additions, 1 deletion)

@@ -47,7 +47,8 @@ ln -s /usr/local/cuda-11.8 /usr/local/cuda
 # set up no-import-change for cluster if enabled
 if [[ $SPARK_RAPIDS_ML_NO_IMPORT_ENABLED == 1 ]]; then
   echo "enabling no import change in cluster" 1>&2
-  sed -i /databricks/python_shell/dbruntime/monkey_patches.py -e '1 s/\(.*\)/import spark_rapids_ml.install\n\1/g'
+  mkdir -p /root/.ipython/profile_default/startup
+  echo "import spark_rapids_ml.install" >/root/.ipython/profile_default/startup/00-spark-rapids-ml.py
fi
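Instead of sed-patching a runtime file, the init script now drops a one-line file into /root/.ipython/profile_default/startup/ (the default notebooks run as root, per the commit message). IPython executes every *.py file in that directory at kernel startup, in lexicographic order, which is why the file gets a 00- prefix. A minimal emulation of that behavior (not IPython's actual code), using a temporary directory in place of the real profile:

```python
import pathlib
import tempfile

def run_startup_files(startup_dir, namespace):
    """Emulate IPython's profile startup handling: exec each *.py in sorted
    (lexicographic) order, sharing a single namespace."""
    for path in sorted(pathlib.Path(startup_dir).glob("*.py")):
        exec(compile(path.read_text(), str(path), "exec"), namespace)

with tempfile.TemporaryDirectory() as d:
    startup = pathlib.Path(d)
    # stand-in for: echo "import spark_rapids_ml.install" > .../00-spark-rapids-ml.py
    (startup / "00-activate.py").write_text("activated = True\n")
    # later files see the effects of earlier ones
    (startup / "10-other.py").write_text("ran_after_activation = activated\n")
    ns = {}
    run_startup_files(startup, ns)
    print(ns["activated"], ns["ran_after_activation"])  # True True
```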
notebooks/dataproc/README.md (1 addition, 1 deletion)

@@ -52,7 +52,7 @@ If you already have a Dataproc account, you can run the example notebooks on a D
 --metadata rapids-version=${RAPIDS_VERSION} \
 --metadata spark-rapids-ml-no-import-enabled=0 \
 --properties spark:spark.executor.resource.gpu.amount=1,\
-spark:spark.task.resource.gpu.amount=1,\
+spark:spark.task.resource.gpu.amount=0.0625,\
 spark:spark.executorEnv.CUPY_CACHE_DIR=/tmp/.cupy,\
 spark:spark.locality.wait=0,\
 spark:spark.sql.execution.arrow.pyspark.enabled=true,\
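On Dataproc the fraction is 0.0625 (1/16) rather than 0.125, allowing up to 16 tasks to share each GPU; this would match a 16-core executor, though spark.executor.cores is outside the shown hunk, so the core count is an assumption here. The arithmetic itself is exact, since 0.0625 is representable in binary floating point:

```python
# 0.0625 of one GPU per task -> 16 task slots per GPU
tasks_per_gpu = int(1 / 0.0625)
print(tasks_per_gpu)  # 16
```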
notebooks/dataproc/spark_rapids_ml.sh (2 additions, 1 deletion)

@@ -30,5 +30,6 @@ pip install spark-rapids-ml
 no_import_change=$(/usr/share/google/get_metadata_value attributes/spark-rapids-ml-no-import-enabled)
 if [[ $no_import_change == 1 ]]; then
   echo "enabling no import change in cluster" 1>&2
-  sed -i /usr/lib/spark/python/pyspark/shell.py -e '1 s/\(.*\)/import spark_rapids_ml.install\n\1/g'
+  mkdir -p /root/.ipython/profile_default/startup
+  echo "import spark_rapids_ml.install" >/root/.ipython/profile_default/startup/00-spark-rapids-ml.py
 fi
