release 24.12 [skip ci] #822

Merged · 34 commits · Jan 13, 2025
Commits
52cbcd5
Merge pull request #744 from NVIDIA/branch-24.10
nvauto Sep 25, 2024
7f1bd49
Merge pull request #747 from NVIDIA/branch-24.10
nvauto Sep 29, 2024
1019b1f
Merge pull request #750 from NVIDIA/branch-24.10
nvauto Oct 8, 2024
e4587f4
Merge pull request #753 from NVIDIA/branch-24.10
nvauto Oct 9, 2024
f5c2abe
Merge pull request #754 from NVIDIA/branch-24.10
nvauto Oct 11, 2024
2b3fa8c
Merge pull request #755 from NVIDIA/branch-24.10
nvauto Oct 12, 2024
4a0ba48
Merge pull request #761 from NVIDIA/branch-24.10
nvauto Oct 16, 2024
954c74c
Merge pull request #764 from NVIDIA/branch-24.10
nvauto Oct 18, 2024
c6c5816
Merge pull request #765 from NVIDIA/branch-24.10
nvauto Oct 18, 2024
09e3b2f
Merge pull request #766 from NVIDIA/branch-24.10
nvauto Oct 18, 2024
f960bbb
Merge pull request #768 from NVIDIA/branch-24.10
nvauto Oct 25, 2024
07725e2
Merge pull request #770 from NVIDIA/branch-24.10
nvauto Oct 30, 2024
ebc7d72
Merge pull request #774 from NVIDIA/branch-24.10
nvauto Nov 5, 2024
fd0c6c3
Merge pull request #775 from NVIDIA/branch-24.10
nvauto Nov 5, 2024
6a77b73
Merge pull request #776 from NVIDIA/branch-24.10
nvauto Nov 6, 2024
e5cdca3
Merge pull request #778 from NVIDIA/branch-24.10
nvauto Nov 9, 2024
3421199
Merge pull request #780 from NVIDIA/branch-24.10
nvauto Nov 11, 2024
8ee4e21
Merge pull request #783 from NVIDIA/branch-24.10
nvauto Nov 11, 2024
b2e518d
doc patches and notebook for no import change (#785)
eordentlich Nov 13, 2024
c952163
add header and enable header check (#788)
YanxuanLiu Nov 26, 2024
fe8c355
update etl plugin to 24.10 to pull in some benchmark related fixes (#…
eordentlich Nov 27, 2024
fdd5494
Revise docstring of ApproximateNearestNeighbors to be more accurate (…
lijinf2 Dec 3, 2024
557ddf0
add cagra and ivfpq to ann notebook (#795)
lijinf2 Dec 6, 2024
06d0eb4
update own package versions to 24.12 and pull in 24.12 nightly for ci…
eordentlich Dec 8, 2024
acc220b
fix rmm managed memory resource initialization to resolve some interm…
eordentlich Dec 9, 2024
949e7aa
Add a test case for sparse logistic regression with standardization p…
lijinf2 Dec 9, 2024
52ea397
Fix ci failure that occurs when testing corner case of ivfpq + refine…
lijinf2 Dec 11, 2024
6cbc036
Enable the no-data-modification standardization of cuml (#802)
lijinf2 Dec 12, 2024
97938b9
Fix a bug in copy() of LogisticRegression that does not infer the pen…
lijinf2 Dec 23, 2024
4c2e232
update rapids dependencies (#809)
eordentlich Dec 24, 2024
1f949ef
add a user friendly exception for rf when some workers get fewer labe…
eordentlich Dec 25, 2024
9c5df37
Update UMAP notebook, test precomputed_knn, write data to parquet (#813)
rishic3 Dec 31, 2024
0b68de4
Provide schemas to createDataFrame for backwards compatibility (#816)
rishic3 Jan 2, 2025
6766789
patch dataproc bm scripts and instructions [skip ci] (#819)
eordentlich Jan 12, 2025
50 changes: 50 additions & 0 deletions .github/workflows/license-header-check.yml
@@ -0,0 +1,50 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# A workflow to check copyright/license header
name: license header check

on:
  pull_request:
    types: [opened, synchronize, reopened]

jobs:
  license-header-check:
    runs-on: ubuntu-latest
    if: "!contains(github.event.pull_request.title, '[bot]')"
    steps:
      - name: Get checkout depth
        run: |
          echo "PR_FETCH_DEPTH=$(( ${{ github.event.pull_request.commits }} + 10 ))" >> $GITHUB_ENV

      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: ${{ env.PR_FETCH_DEPTH }}

      - name: license-header-check
        uses: NVIDIA/spark-rapids-common/license-header-check@main
        with:
          included_file_patterns: |
            *.sh,
            *.py,
            *.toml,
            *.cfg,
            *Dockerfile*,
            *Jenkinsfile*,
            *.yml,
            *.txt,
            *.xml
          excluded_file_patterns: |
            thirdparty/*
4 changes: 2 additions & 2 deletions ci/Dockerfile
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -37,6 +37,6 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86
&& conda config --set solver libmamba

# install cuML
-ARG CUML_VER=24.10
+ARG CUML_VER=24.12
RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$CUML_VER cuvs=$CUML_VER python=3.10 cuda-version=11.8 numpy~=1.0 \
&& conda clean --all -f -y
14 changes: 14 additions & 0 deletions ci/lint_python.py
@@ -1,3 +1,17 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, Tuple

import argparse
4 changes: 2 additions & 2 deletions docker/Dockerfile.pip
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@ ARG CUDA_VERSION=11.8.0
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04

ARG PYSPARK_VERSION=3.3.1
-ARG RAPIDS_VERSION=24.10.0
+ARG RAPIDS_VERSION=24.12.0
ARG ARCH=amd64
#ARG ARCH=arm64
# Install packages to build spark-rapids-ml
4 changes: 2 additions & 2 deletions docker/Dockerfile.python
@@ -1,5 +1,5 @@
#
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
ARG CUDA_VERSION=11.8.0
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04

-ARG CUML_VERSION=24.10
+ARG CUML_VERSION=24.12

# Install packages to build spark-rapids-ml
RUN apt update -y \
14 changes: 14 additions & 0 deletions docs/requirements.txt
@@ -1,2 +1,16 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

numpydoc
pydata-sphinx-theme
4 changes: 2 additions & 2 deletions docs/site/FAQ.md
@@ -9,8 +9,8 @@ nav_order: 4

### What versions of Apache Spark are supported?

-Apache Spark version 3.2.1 or higher.
+Apache Spark version 3.3.1 or higher.

### What versions of Python are supported

-Python 3.8 or higher.
+Python 3.10 or higher.
16 changes: 15 additions & 1 deletion docs/source/conf.py
@@ -1,3 +1,17 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
@@ -9,7 +23,7 @@
project = 'spark-rapids-ml'
copyright = '2024, NVIDIA'
author = 'NVIDIA'
-release = '24.10.0'
+release = '24.12.0'

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
17 changes: 17 additions & 0 deletions notebooks/README.md
@@ -33,6 +33,23 @@ To run notebooks using Spark local mode on a server with one or more NVIDIA GPUs
Then, browse to the `127.0.0.1` URL printed by the command in step 4. Note that a tunnel is also opened to the Spark UI server on port 4040. Once a notebook is opened, you can view it by browsing to http://127.0.0.1:4040 in another tab or window.
8. **OPTIONAL**: If you have multiple GPUs in your server, replace the `CUDA_VISIBLE_DEVICES` setting in step 4 with a comma-separated list of the corresponding indices. For example, for two GPUs use `CUDA_VISIBLE_DEVICES=0,1`.

## No import change
In these notebooks, the GPU-accelerated implementations of algorithms in Spark MLlib are enabled via import statements from the `spark_rapids_ml` package. Alternatively, acceleration can be enabled by executing the following import statement at the start of a notebook:
```
import spark_rapids_ml.install
```
After executing a cell with this command, all subsequent imports and accesses of supported accelerated classes from `pyspark.ml` automatically redirect to their counterparts in `spark_rapids_ml`. Unaccelerated classes import from `pyspark.ml` as usual. Thus, this single import statement enables all supported acceleration in an existing `pyspark` notebook with no further import or code changes. Directly importing from `spark_rapids_ml` also still works (and is needed for non-MLlib algorithms like UMAP).

For a full example, see the notebook [kmeans-no-import-change.ipynb](kmeans-no-import-change.ipynb); a minimal sketch follows below.

*Note*: As of this release, methods and attributes that remain unsupported on accelerated classes and objects will still raise exceptions in this mode.
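A minimal sketch of this mode, assuming a GPU-enabled cluster with `spark_rapids_ml` installed; the DataFrame `train_df` and its `features` column are hypothetical stand-ins:
```
import spark_rapids_ml.install  # run before any pyspark.ml imports

from pyspark.ml.clustering import KMeans  # now resolves to the accelerated class

# train_df is a hypothetical DataFrame with a "features" vector column.
kmeans = KMeans(k=8, featuresCol="features")
model = kmeans.fit(train_df)  # fit executes on the GPU via spark_rapids_ml
```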

## Running notebooks on Databricks
See [these instructions](databricks/README.md) for running the notebooks in a Databricks Spark cluster.

## Running notebooks on Google Dataproc
See [these instructions](dataproc/README.md) for running the notebooks in a Dataproc Spark cluster.

## Running notebooks on AWS EMR
See [these instructions](aws-emr/README.md) for running the notebooks in an AWS-EMR cluster.

75 changes: 72 additions & 3 deletions notebooks/approx-nearest-neighbors.ipynb
Expand Up @@ -138,7 +138,8 @@
"id": "204b7e72-737e-4a8d-81ce-5a275cb7446a",
"metadata": {},
"source": [
"## Spark RAPIDS ML (GPU)"
"## Spark RAPIDS ML (GPU)\n",
"The ApproximateNearestNeighbors class of Spark Rapids ML uses the ivfflat algorithm by default."
]
},
{
@@ -318,9 +319,9 @@
"id": "8cd56670-7633-4fe6-ab75-0fd680c63baa",
"metadata": {},
"source": [
"# PySpark\n",
"## PySpark\n",
"\n",
"PySpark does not have an exact kNN implementation, but it does have an LSH-based Approximate Nearest Neighbors implementation, shown here to illustrate the similarity between the APIs. However, the algorithms are very different, so their results are only roughly comparable, and it would require elaborate tuning of parameters to produce similar results."
"PySpark has an LSH-based Approximate Nearest Neighbors implementation, shown here to illustrate the similarity between the APIs. However, the algorithms are very different, so their results are only roughly comparable, and it would require elaborate tuning of parameters to produce similar results."
]
},
{
@@ -440,6 +441,74 @@
"# saves the LSH hashes for the input rows\n",
"model.write().overwrite().save(\"/tmp/ann_model\")"
]
},
{
"cell_type": "markdown",
"id": "b1398af2",
"metadata": {},
"source": [
"## Spark Rapids ML (GPU CAGRA algorithm) \n",
"CAGRA is a cutting-edge graph-based algorithm available in cuVS, and is now integrated into the ApproximateNearestNeighbors class of Spark Rapids ML. Cagra currently supports sqeuclidean distance metric only, and the metric must be set before using the main APIs."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e0bef26",
"metadata": {},
"outputs": [],
"source": [
"knn = ApproximateNearestNeighbors(k=2, algorithm='cagra', metric='sqeuclidean', algoParams={\"build_algo\" : \"nn_descent\"})\n",
"knn.setInputCol(\"features\")\n",
"knn_model = knn.fit(item_df)\n",
"item_id_df, query_id_df, neighbor_df = knn_model.kneighbors(query_df)\n",
"neighbor_df.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b22ac85",
"metadata": {},
"outputs": [],
"source": [
"result_df = knn_model.approxSimilarityJoin(query_df)\n",
"result_df.show()"
]
},
{
"cell_type": "markdown",
"id": "87fb3f48",
"metadata": {},
"source": [
"## Spark Rapids ML (GPU IVFPQ algorithm)\n",
"The IVFPQ algorithm combines the power of Inverted File Indexing with Product Quantization to deliver fast and memory-efficient approximate nearest neighbor search. It is now integrated into Spark Rapids ML."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d40b73ef",
"metadata": {},
"outputs": [],
"source": [
"knn = ApproximateNearestNeighbors(k=2, algorithm='ivfpq', algoParams={\"M\": 2, \"n_bits\": 8})\n",
"knn.setInputCol(\"features\")\n",
"knn_model = knn.fit(item_df)\n",
"item_id_df, query_id_df, neighbor_df = knn_model.kneighbors(query_df)\n",
"neighbor_df.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "11224698",
"metadata": {},
"outputs": [],
"source": [
"result_df = knn_model.approxSimilarityJoin(query_df)\n",
"result_df.show()"
]
}
],
"metadata": {
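To make the ivfflat default noted in this notebook concrete, here is a minimal sketch, assuming the same `item_df` and `query_df` with a `features` vector column that the notebook's earlier setup cells create:
```
from spark_rapids_ml.knn import ApproximateNearestNeighbors

# No `algorithm` argument: the model builds the default ivfflat index.
knn = ApproximateNearestNeighbors(k=2)
knn.setInputCol("features")
knn_model = knn.fit(item_df)

# kneighbors returns (item_id_df, query_id_df, neighbor_df), as in the
# CAGRA and IVFPQ cells above.
item_id_df, query_id_df, neighbor_df = knn_model.kneighbors(query_df)
neighbor_df.show()
```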
16 changes: 15 additions & 1 deletion notebooks/aws-emr/init-bootstrap-action.sh
@@ -1,4 +1,18 @@
#!/bin/bash
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


set -ex

@@ -13,7 +27,7 @@ sudo bash -c "wget https://www.python.org/ftp/python/3.10.9/Python-3.10.9.tgz && \
tar xzf Python-3.10.9.tgz && cd Python-3.10.9 && \
./configure --enable-optimizations && make altinstall"

-RAPIDS_VERSION=24.10.0
+RAPIDS_VERSION=24.12.0

sudo /usr/local/bin/pip3.10 install --upgrade pip

2 changes: 1 addition & 1 deletion notebooks/databricks/README.md
@@ -51,7 +51,7 @@ If you already have a Databricks account, you can run the example notebooks on a
spark.task.resource.gpu.amount 1
spark.databricks.delta.preview.enabled true
spark.python.worker.reuse true
-spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-24.08.1.jar:/databricks/spark/python
+spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-24.10.1.jar:/databricks/spark/python
spark.sql.execution.arrow.maxRecordsPerBatch 100000
spark.rapids.memory.gpu.minAllocFraction 0.0001
spark.plugins com.nvidia.spark.SQLPlugin
18 changes: 16 additions & 2 deletions notebooks/databricks/init-pip-cuda-11.8.sh
@@ -1,11 +1,25 @@
#!/bin/bash
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# set portion of path below after /dbfs/ to dbfs zip file location
SPARK_RAPIDS_ML_ZIP=/dbfs/path/to/zip/file
# IMPORTANT: specify RAPIDS_VERSION fully 23.10.0 and not 23.10
# also in general, RAPIDS_VERSION (python) fields should omit any leading 0 in month/minor field (i.e. 23.8.0 and not 23.08.0)
# while SPARK_RAPIDS_VERSION (jar) should have leading 0 in month/minor (e.g. 23.08.2 and not 23.8.2)
-RAPIDS_VERSION=24.10.0
-SPARK_RAPIDS_VERSION=24.08.1
+RAPIDS_VERSION=24.12.0
+SPARK_RAPIDS_VERSION=24.10.1

curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar

9 changes: 4 additions & 5 deletions notebooks/dataproc/README.md
@@ -28,11 +28,10 @@ If you already have a Dataproc account, you can run the example notebooks on a D
```
- Create a cluster with at least two single-gpu workers. **Note**: in addition to the initialization script from above, this also uses the standard [initialization actions](https://github.com/GoogleCloudDataproc/initialization-actions) for installing the GPU drivers and RAPIDS:
```
-export CUDA_VERSION=11.8
-export RAPIDS_VERSION=24.10.0
+export RAPIDS_VERSION=24.12.0

gcloud dataproc clusters create $USER-spark-rapids-ml \
---image-version=2.1-ubuntu \
+--image-version=2.2-ubuntu22 \
--region ${COMPUTE_REGION} \
--master-machine-type n1-standard-16 \
--master-accelerator type=nvidia-tesla-t4,count=1 \
Expand All @@ -42,11 +41,11 @@ If you already have a Dataproc account, you can run the example notebooks on a D
--worker-machine-type n1-standard-16 \
--num-worker-local-ssds 4 \
--worker-local-ssd-interface=NVME \
---initialization-actions gs://goog-dataproc-initialization-actions-us-central1/gpu/install_gpu_driver.sh,gs://${GCS_BUCKET}/spark_rapids.sh,gs://${GCS_BUCKET}/spark_rapids_ml.sh \
+--initialization-actions gs://${GCS_BUCKET}/spark-rapids.sh,gs://${GCS_BUCKET}/spark_rapids_ml.sh \
--initialization-action-timeout=20m \
--optional-components=JUPYTER \
--metadata gpu-driver-provider="NVIDIA" \
--metadata rapids-runtime=SPARK \
---metadata cuda-version=${CUDA_VERSION} \
--metadata rapids-version=${RAPIDS_VERSION} \
--bucket ${GCS_BUCKET} \
--enable-component-gateway \