
Spark Jupyter getting started docker compose #295

Merged (16 commits) on Oct 15, 2024
2 changes: 1 addition & 1 deletion .github/workflows/check-md-link.yml
@@ -41,5 +41,5 @@ jobs:
with:
use-quiet-mode: 'yes'
config-file: '.github/workflows/check-md-link-config.json'
folder-path: 'regtests, regtests/client/python/docs, regtests/client/python, .github, build-logic, polaris-core, polaris-service, extension, spec, k8, notebooks'
Contributor Author:

This PR moved notebooks/ from the top-level directory into the getting-started/ directory.

folder-path: 'regtests, regtests/client/python/docs, regtests/client/python, .github, build-logic, polaris-core, polaris-service, extension, spec, k8, getting-started'
file-path: 'CHAT_BYLAWS.md, CODE_OF_CONDUCT.md, CONTRIBUTING.md, README.md SECURITY.md'
4 changes: 2 additions & 2 deletions .gitignore
@@ -26,8 +26,8 @@ regtests/output/
/polaris-venv/
/pyproject.toml

# Notebooks
notebooks/.ipynb_checkpoints/
# Notebook Checkpoints
**/.ipynb_checkpoints/

# Metastore
metastore_db/
45 changes: 45 additions & 0 deletions getting-started/spark/README.md
Contributor:

I'm a bit conflicted about this doc. It feels like it doesn't really teach the reader anything about Polaris, although it does give you a really fast way to get bootstrapped.

Contributor Author:

Yes, I'll admit this README is a filler for now, as a way to get Spark & Polaris up and running quickly.

@@ -0,0 +1,45 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->

# Getting Started with Apache Spark and Apache Polaris

This getting started guide provides a `docker-compose` file to set up [Apache Spark](https://spark.apache.org/) with Apache Polaris. Apache Polaris is configured as an Iceberg REST Catalog in Spark.
Contributor:

There is another way to try Spark with Polaris without Docker; it's not a blocker, we can add it later.

A Jupyter notebook is used to run PySpark.

## Run the `docker-compose` file
To start the services defined in the `docker-compose` file, run this command from the repo's root directory:
```
docker-compose -f getting-started/spark/docker-compose.yml up
```

This will spin up two container services:
* The `polaris` service, which runs Apache Polaris with an in-memory metastore
* The `jupyter` service, which runs a Jupyter notebook server with PySpark

## Access the Jupyter notebook interface
In the `jupyter` container log, look for the URL to access the Jupyter notebook interface. The URL should be in the format `http://127.0.0.1:8888/lab?token=<token>`.

Open that URL in a browser.
Navigate to [`notebooks/SparkPolaris.ipynb`](http://127.0.0.1:8888/lab/tree/notebooks/SparkPolaris.ipynb) <!-- markdown-link-check-disable-line -->

## Change the Polaris credential
The Polaris service creates a new root credential on every startup. Find this credential in the `polaris` service log and set the `polaris_credential` variable in the first cell of the Jupyter notebook.
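The first cell expects the credential in `client_id:client_secret` form; a minimal sketch of that cell (the full version appears in the notebook diff below, and the value shown is only a placeholder):

```python
# (CHANGE ME): paste the root principal credentials from the `polaris` service log.
# The value below is only a placeholder and changes on every service restart.
polaris_credential = '35df9f8a34199df0:101b9d35700032416210ad2d39b1b4e3'  # pragma: allowlist secret

client_id, client_secret = polaris_credential.split(":")
```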

## Run the Jupyter notebook
You can now run all cells in the notebook or write your own code!
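As a sketch of what "your own code" might look like, assuming the `spark` session built earlier in the notebook with the catalog registered under the name `polaris` (the namespace and table names below are only examples):

```python
# Create a nested namespace and an Iceberg table through the Polaris catalog,
# then write and read back a couple of rows.
spark.sql("CREATE NAMESPACE IF NOT EXISTS polaris.COLLADO_TEST")
spark.sql("CREATE NAMESPACE IF NOT EXISTS polaris.COLLADO_TEST.PUBLIC")
spark.sql("""
  CREATE TABLE IF NOT EXISTS polaris.COLLADO_TEST.PUBLIC.TEST_TABLE (id BIGINT, data STRING)
  USING iceberg
""")
spark.sql("INSERT INTO polaris.COLLADO_TEST.PUBLIC.TEST_TABLE VALUES (1, 'some data'), (2, 'more data')")
spark.sql("SELECT * FROM polaris.COLLADO_TEST.PUBLIC.TEST_TABLE").show()
```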
@@ -20,7 +20,7 @@
services:
polaris:
build:
context: .
context: ../../
network: host
ports:
- "8181:8181"
@@ -37,8 +37,8 @@ services:
retries: 5
jupyter:
build:
context: .
dockerfile: ./notebooks/Dockerfile
context: ../../ # this is necessary to expose `regtests/` dir to notebooks/Dockerfile
dockerfile: ./getting-started/spark/notebooks/Dockerfile
network: host
ports:
- "8888:8888"
@@ -57,4 +57,4 @@ volumes:
driver_opts:
o: bind
type: none
device: ./notebooks
device: ./notebooks/
File renamed without changes.
@@ -21,8 +21,11 @@
"from polaris.catalog.api_client import ApiClient as CatalogApiClient\n",
"from polaris.catalog.api_client import Configuration as CatalogApiClientConfiguration\n",
"\n",
"client_id = 'b3b6497353b33ea7'\n",
"client_secret = '623a67ee71d75825238e3e269df5cdac' # pragma: allowlist secret\n",
"# (CHANGE ME): This credential changes on every Polaris service restart\n",
"# In the Polaris log, look for the `realm: default-realm root principal credentials:` string\n",
"polaris_credential = '35df9f8a34199df0:101b9d35700032416210ad2d39b1b4e3' # pragma: allowlist secret\n",
"\n",
"client_id, client_secret = polaris_credential.split(\":\")\n",
"client = CatalogApiClient(CatalogApiClientConfiguration(username=client_id,\n",
" password=client_secret,\n",
" host='http://polaris:8181/api/catalog'))\n",
@@ -42,8 +45,7 @@
"source": [
"# Create our first catalog\n",
"\n",
"* Creates a catalog named `polaris_catalog` that writes to a specified location in S3.\n",
"* An AWS IAM role is specified - this role is assumed whenever we read or write data in the catalog"
"* Creates a catalog named `polaris_catalog` that writes to a specified location in the Local Filesystem."
]
},
{
@@ -59,11 +61,9 @@
" host='http://polaris:8181/api/management/v1'))\n",
"root_client = PolarisDefaultApi(client)\n",
"\n",
"storage_conf = AwsStorageConfigInfo(storage_type=\"S3\",\n",
" allowed_locations=[\"s3://datalake-storage-team/polaris_test/\"],\n",
" role_arn=\"arn:aws:iam::631484165566:role/datalake-storage-integration-role\")\n",
"storage_conf = FileStorageConfigInfo(storage_type=\"FILE\", allowed_locations=[\"file:///tmp\"])\n",
"catalog_name = 'polaris_demo'\n",
"catalog = Catalog(name=catalog_name, type='INTERNAL', properties={\"default-base-location\": \"s3://datalake-storage-team/polaris_test/polaris_catalog\"},\n",
"catalog = Catalog(name=catalog_name, type='INTERNAL', properties={\"default-base-location\": \"file:///tmp/polaris/\"},\n",
" storage_config_info=storage_conf)\n",
"catalog.storage_config_info = storage_conf\n",
"root_client.create_catalog(create_catalog_request=CreateCatalogRequest(catalog=catalog))\n",
@@ -272,7 +272,7 @@
" .config(\"spark.sql.catalog.polaris.credential\", f\"{engineer_principal.credentials.client_id}:{engineer_principal.credentials.client_secret}\")\n",
"\n",
" # Set the warehouse to the name of the catalog we created\n",
" .config(\"spark.sql.catalog.polaris.warehouse\", 'polaris_demo')\n",
" .config(\"spark.sql.catalog.polaris.warehouse\", catalog_name)\n",
"\n",
" # Scope set to PRINCIPAL_ROLE:ALL\n",
" .config(\"spark.sql.catalog.polaris.scope\", 'PRINCIPAL_ROLE:ALL')\n",
@@ -454,7 +454,7 @@
" return codecs.decode(\"1F\", \"hex\").decode(\"UTF-8\").join(namespace)\n",
"\n",
"# Call loadTable\n",
"tbl_meta = collado_client.load_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
"tbl_meta = collado_client.load_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
"display(JSON(tbl_meta.to_dict(), expanded=True))"
]
},
@@ -604,7 +604,7 @@
},
"outputs": [],
"source": [
"tbl_meta = pm_client.load_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
"tbl_meta = pm_client.load_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
"display(JSON(tbl_meta.to_dict(), expanded=True))"
]
},
@@ -632,7 +632,7 @@
},
"outputs": [],
"source": [
"pm_client.drop_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
"pm_client.drop_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
]
},
{
@@ -775,7 +775,7 @@
"# The ops_client fails to do any real damage even though the engineer normally has DROP_TABLE privileges\n",
"ops_client = IcebergCatalogAPI(CatalogApiClient(CatalogApiClientConfiguration(access_token=ops_token.access_token,\n",
" host='http://polaris:8181/api/catalog')))\n",
"ops_client.drop_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
"ops_client.drop_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
]
}
],
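To make the expected outcome of that last cell explicit, a hedged sketch of how the rejected drop could be demonstrated (the generated client is expected to raise an API error here, since the ops principal role was not granted DROP_TABLE):

```python
# Sketch only: the drop is expected to be rejected with an authorization error,
# leaving TEST_TABLE intact. Uses ops_client, catalog_name, and format_namespace
# from the cells above.
try:
    ops_client.drop_table(prefix=catalog_name,
                          namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']),
                          table='TEST_TABLE')
except Exception as e:  # the generated client surfaces this as an ApiException-style error
    print(f"Drop rejected as expected: {e}")
```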