From 89c8164317c910f302a852b04f1dc0d7db95ff72 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Sun, 15 Sep 2024 11:58:37 -0700
Subject: [PATCH 01/16] move to getting-started/

---
 .../spark/docker-compose-jupyter.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
 rename docker-compose-jupyter.yml => getting-started/spark/docker-compose-jupyter.yml (95%)

diff --git a/docker-compose-jupyter.yml b/getting-started/spark/docker-compose-jupyter.yml
similarity index 95%
rename from docker-compose-jupyter.yml
rename to getting-started/spark/docker-compose-jupyter.yml
index 97a6d1cec..ab97e1f83 100644
--- a/docker-compose-jupyter.yml
+++ b/getting-started/spark/docker-compose-jupyter.yml
@@ -20,7 +20,7 @@ services:
   polaris:
     build:
-      context: .
+      context: ../../
       network: host
     ports:
       - "8181:8181"
@@ -37,7 +37,7 @@ services:
       retries: 5
   jupyter:
     build:
-      context: .
+      context: ../../
       dockerfile: ./notebooks/Dockerfile
       network: host
     ports:
       - "8888:8888"
@@ -57,4 +57,4 @@ volumes:
     driver_opts:
       o: bind
       type: none
-      device: ./notebooks
+      device: ../../notebooks

From 35b8820afcdf7c46be36c5fec96fe9428a278212 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Sun, 15 Sep 2024 12:28:12 -0700
Subject: [PATCH 02/16] move ./notebooks

---
 getting-started/spark/docker-compose-jupyter.yml          | 6 +++---
 {notebooks => getting-started/spark/notebooks}/Dockerfile | 0
 .../spark/notebooks}/SparkPolaris.ipynb                   | 0
 3 files changed, 3 insertions(+), 3 deletions(-)
 rename {notebooks => getting-started/spark/notebooks}/Dockerfile (100%)
 rename {notebooks => getting-started/spark/notebooks}/SparkPolaris.ipynb (100%)

diff --git a/getting-started/spark/docker-compose-jupyter.yml b/getting-started/spark/docker-compose-jupyter.yml
index ab97e1f83..4bda0320c 100644
--- a/getting-started/spark/docker-compose-jupyter.yml
+++ b/getting-started/spark/docker-compose-jupyter.yml
@@ -37,8 +37,8 @@ services:
       retries: 5
   jupyter:
     build:
-      context: ../../
-      dockerfile: ./notebooks/Dockerfile
+      context: ../../ # this is necessary to expose `regtests/` dir to notebooks/Dockerfile
+      dockerfile: ./getting-started/spark/notebooks/Dockerfile
       network: host
     ports:
       - "8888:8888"
@@ -57,4 +57,4 @@ volumes:
     driver_opts:
       o: bind
       type: none
-      device: ../../notebooks
+      device: ./notebooks/
diff --git a/notebooks/Dockerfile b/getting-started/spark/notebooks/Dockerfile
similarity index 100%
rename from notebooks/Dockerfile
rename to getting-started/spark/notebooks/Dockerfile
diff --git a/notebooks/SparkPolaris.ipynb b/getting-started/spark/notebooks/SparkPolaris.ipynb
similarity index 100%
rename from notebooks/SparkPolaris.ipynb
rename to getting-started/spark/notebooks/SparkPolaris.ipynb

From af516b94f6ebaaea6bd57f0bf5c507898410a4d0 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Mon, 16 Sep 2024 09:57:42 -0700
Subject: [PATCH 03/16] rename

---
 .../spark/{docker-compose-jupyter.yml => docker-compose.yml} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename getting-started/spark/{docker-compose-jupyter.yml => docker-compose.yml} (100%)

diff --git a/getting-started/spark/docker-compose-jupyter.yml b/getting-started/spark/docker-compose.yml
similarity index 100%
rename from getting-started/spark/docker-compose-jupyter.yml
rename to getting-started/spark/docker-compose.yml
From 143ee1d60d8f01f3399018525993d95b0a8701cf Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Tue, 24 Sep 2024 19:08:25 -0700
Subject: [PATCH 04/16] take from run_spark_sql.sh

---
 .../spark/create-polaris-catalog.sh      | 55 +++++++++++++++++++
 getting-started/spark/docker-compose.yml | 10 ++++
 2 files changed, 65 insertions(+)
 create mode 100644 getting-started/spark/create-polaris-catalog.sh

diff --git a/getting-started/spark/create-polaris-catalog.sh b/getting-started/spark/create-polaris-catalog.sh
new file mode 100644
index 000000000..1e2a8e8fa
--- /dev/null
+++ b/getting-started/spark/create-polaris-catalog.sh
@@ -0,0 +1,55 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+SPARK_BEARER_TOKEN="${REGTEST_ROOT_BEARER_TOKEN:-principal:root;realm:default-realm}"
+
+# create a catalog backed by the local filesystem
+curl -X POST -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" \
+  -H 'Accept: application/json' \
+  -H 'Content-Type: application/json' \
+  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs \
+  -d '{
+        "catalog": {
+          "name": "manual_spark",
+          "type": "INTERNAL",
+          "readOnly": false,
+          "properties": {
+            "default-base-location": "file:///tmp/polaris/"
+          },
+          "storageConfigInfo": {
+            "storageType": "FILE",
+            "allowedLocations": [
+              "file:///tmp"
+            ]
+          }
+        }
+      }'
+
+# Add TABLE_WRITE_DATA to the catalog's catalog_admin role since by default it can only manage access and metadata
+curl -i -X PUT -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
+  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs/manual_spark/catalog-roles/catalog_admin/grants \
+  -d '{"type": "catalog", "privilege": "TABLE_WRITE_DATA"}' > /dev/stderr
+
+# Assign the catalog_admin to the service_admin.
+curl -i -X PUT -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
+  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/principal-roles/service_admin/catalog-roles/manual_spark \
+  -d '{"name": "catalog_admin"}' > /dev/stderr
+
+curl -X GET -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
+  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs/manual_spark
diff --git a/getting-started/spark/docker-compose.yml b/getting-started/spark/docker-compose.yml
index 4bda0320c..a38b94e0f 100644
--- a/getting-started/spark/docker-compose.yml
+++ b/getting-started/spark/docker-compose.yml
@@ -50,6 +50,16 @@ services:
     environment:
       AWS_REGION: us-west-2
       POLARIS_HOST: polaris
     volumes:
       - notebooks:/home/jovyan/notebooks
+  create-polaris-catalog:
+    image: curlimages/curl
+    depends_on:
+      polaris:
+        condition: service_healthy
+    environment:
+      POLARIS_HOST: polaris
+    volumes:
+      - ./create-polaris-catalog.sh:/create-polaris-catalog.sh
+    command: ["/bin/sh", "/create-polaris-catalog.sh"]
 
 volumes:
   notebooks:
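The same bootstrap can be exercised outside the `curlimages/curl` container. Below is a minimal Python sketch of the three Management API calls the script issues; the endpoints, payload, and default bearer token are taken verbatim from `create-polaris-catalog.sh` above, while the `requests` dependency and the host-side invocation are assumptions.

```python
import os

import requests

# Defaults mirror create-polaris-catalog.sh; POLARIS_HOST is set by docker-compose.
token = os.getenv("REGTEST_ROOT_BEARER_TOKEN", "principal:root;realm:default-realm")
host = os.getenv("POLARIS_HOST", "localhost")
base = f"http://{host}:8181/api/management/v1"
headers = {"Authorization": f"Bearer {token}"}

# 1. Create a catalog backed by the local filesystem.
requests.post(f"{base}/catalogs", headers=headers, json={
    "catalog": {
        "name": "manual_spark",
        "type": "INTERNAL",
        "readOnly": False,
        "properties": {"default-base-location": "file:///tmp/polaris/"},
        "storageConfigInfo": {"storageType": "FILE", "allowedLocations": ["file:///tmp"]},
    }
}).raise_for_status()

# 2. Grant TABLE_WRITE_DATA to the catalog's catalog_admin role.
requests.put(
    f"{base}/catalogs/manual_spark/catalog-roles/catalog_admin/grants",
    headers=headers,
    json={"type": "catalog", "privilege": "TABLE_WRITE_DATA"},
).raise_for_status()

# 3. Assign catalog_admin to the service_admin principal role.
requests.put(
    f"{base}/principal-roles/service_admin/catalog-roles/manual_spark",
    headers=headers,
    json={"name": "catalog_admin"},
).raise_for_status()

# Confirm the catalog exists, as the script's final GET does.
print(requests.get(f"{base}/catalogs/manual_spark", headers=headers).json())
```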
From c94c6541ba1ac7de20ddc98be079ac8ae8852b71 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Wed, 25 Sep 2024 08:59:00 -0700
Subject: [PATCH 05/16] use `polaris_demo` as catalog name

---
 getting-started/spark/create-polaris-catalog.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/getting-started/spark/create-polaris-catalog.sh b/getting-started/spark/create-polaris-catalog.sh
index 1e2a8e8fa..2145b9310 100644
--- a/getting-started/spark/create-polaris-catalog.sh
+++ b/getting-started/spark/create-polaris-catalog.sh
@@ -26,7 +26,7 @@ curl -X POST -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" \
   http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs \
   -d '{
         "catalog": {
-          "name": "manual_spark",
+          "name": "polaris_demo",
           "type": "INTERNAL",
           "readOnly": false,
           "properties": {
@@ -43,13 +43,13 @@ curl -X POST -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" \
 
 # Add TABLE_WRITE_DATA to the catalog's catalog_admin role since by default it can only manage access and metadata
 curl -i -X PUT -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
-  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs/manual_spark/catalog-roles/catalog_admin/grants \
+  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs/polaris_demo/catalog-roles/catalog_admin/grants \
   -d '{"type": "catalog", "privilege": "TABLE_WRITE_DATA"}' > /dev/stderr
 
 # Assign the catalog_admin to the service_admin.
 curl -i -X PUT -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
-  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/principal-roles/service_admin/catalog-roles/manual_spark \
+  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/principal-roles/service_admin/catalog-roles/polaris_demo \
   -d '{"name": "catalog_admin"}' > /dev/stderr
 
 curl -X GET -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
-  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs/manual_spark
+  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs/polaris_demo
From 8acb1b5149dca687e8788b03a1cdc2f5638dc9f5 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Wed, 25 Sep 2024 09:12:04 -0700
Subject: [PATCH 06/16] use env var POLARIS_CATALOG_NAME

---
 getting-started/spark/create-polaris-catalog.sh | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/getting-started/spark/create-polaris-catalog.sh b/getting-started/spark/create-polaris-catalog.sh
index 2145b9310..055ab069e 100644
--- a/getting-started/spark/create-polaris-catalog.sh
+++ b/getting-started/spark/create-polaris-catalog.sh
@@ -18,6 +18,7 @@
 #
 
 SPARK_BEARER_TOKEN="${REGTEST_ROOT_BEARER_TOKEN:-principal:root;realm:default-realm}"
+POLARIS_CATALOG_NAME="${POLARIS_CATALOG_NAME:-polaris_demo}"
 
 # create a catalog backed by the local filesystem
 curl -X POST -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" \
@@ -26,7 +27,7 @@ curl -X POST -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" \
   http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs \
   -d '{
         "catalog": {
-          "name": "polaris_demo",
+          "name": "'"${POLARIS_CATALOG_NAME}"'",
           "type": "INTERNAL",
           "readOnly": false,
           "properties": {
@@ -43,13 +44,13 @@ curl -X POST -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" \
 
 # Add TABLE_WRITE_DATA to the catalog's catalog_admin role since by default it can only manage access and metadata
 curl -i -X PUT -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
-  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs/polaris_demo/catalog-roles/catalog_admin/grants \
+  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs/${POLARIS_CATALOG_NAME}/catalog-roles/catalog_admin/grants \
   -d '{"type": "catalog", "privilege": "TABLE_WRITE_DATA"}' > /dev/stderr
 
 # Assign the catalog_admin to the service_admin.
 curl -i -X PUT -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
-  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/principal-roles/service_admin/catalog-roles/polaris_demo \
+  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/principal-roles/service_admin/catalog-roles/${POLARIS_CATALOG_NAME} \
   -d '{"name": "catalog_admin"}' > /dev/stderr
 
 curl -X GET -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
-  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs/polaris_demo
+  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs/${POLARIS_CATALOG_NAME}
From ccafb2d5c7f55963706f4b7130f04bf7f8b76a03 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Wed, 25 Sep 2024 09:17:25 -0700
Subject: [PATCH 07/16] ignore notebook checkpoints

---
 .gitignore | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3855dceb5..62beb3bcc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,8 +26,8 @@ regtests/output/
 /polaris-venv/
 /pyproject.toml
 
-# Notebooks
-notebooks/.ipynb_checkpoints/
+# Notebook Checkpoints
+**/.ipynb_checkpoints/
 
 # Metastore
 metastore_db/

From be13e1eec2cebabd9cae401d240452300e9cc964 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Wed, 25 Sep 2024 09:38:42 -0700
Subject: [PATCH 08/16] fix notebook to work locally

---
 .../spark/notebooks/SparkPolaris.ipynb | 36 +++++++++----------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/getting-started/spark/notebooks/SparkPolaris.ipynb b/getting-started/spark/notebooks/SparkPolaris.ipynb
index 6510c670f..d74a2d930 100644
--- a/getting-started/spark/notebooks/SparkPolaris.ipynb
+++ b/getting-started/spark/notebooks/SparkPolaris.ipynb
@@ -21,8 +21,11 @@
     "from polaris.catalog.api_client import ApiClient as CatalogApiClient\n",
     "from polaris.catalog.api_client import Configuration as CatalogApiClientConfiguration\n",
     "\n",
-    "client_id = 'b3b6497353b33ea7'\n",
-    "client_secret = '623a67ee71d75825238e3e269df5cdac' # pragma: allowlist secret\n",
+    "# (CHANGE ME): This credential changes on every Polaris service restart\n",
+    "# In the Polaris log, look for the `realm: default-realm root principal credentials:` string\n",
+    "polaris_credential = '35df9f8a34199df0:101b9d35700032416210ad2d39b1b4e3' # pragma: allowlist secret\n",
+    "\n",
+    "client_id, client_secret = polaris_credential.split(\":\")\n",
     "client = CatalogApiClient(CatalogApiClientConfiguration(username=client_id,\n",
     "                                                        password=client_secret,\n",
     "                                                        host='http://polaris:8181/api/catalog'))\n",
@@ -40,10 +43,9 @@
    "id": "4c21f4a1-4129-4dd8-9a6c-fa6eeabfa56e",
    "metadata": {},
    "source": [
-    "# Create our first catalog\n",
+    "# Load the catalog\n",
     "\n",
-    "* Creates a catalog named `polaris_catalog` that writes to a specified location in S3.\n",
-    "* An AWS IAM role is specified - this role is assumed whenever we read or write data in the catalog"
+    "* A catalog using Local Filesystem is created by `create-polaris-catalog.sh`, with associated access grants"
    ]
   },
   {
@@ -54,21 +56,15 @@
    "outputs": [],
    "source": [
     "from polaris.management import *\n",
+    "import os\n",
     "\n",
     "client = ApiClient(Configuration(access_token=token.access_token,\n",
     "                                 host='http://polaris:8181/api/management/v1'))\n",
     "root_client = PolarisDefaultApi(client)\n",
     "\n",
-    "storage_conf = AwsStorageConfigInfo(storage_type=\"S3\",\n",
-    "                                    allowed_locations=[\"s3://datalake-storage-team/polaris_test/\"],\n",
-    "                                    role_arn=\"arn:aws:iam::631484165566:role/datalake-storage-integration-role\")\n",
-    "catalog_name = 'polaris_demo'\n",
-    "catalog = Catalog(name=catalog_name, type='INTERNAL', properties={\"default-base-location\": \"s3://datalake-storage-team/polaris_test/polaris_catalog\"},\n",
-    "                  storage_config_info=storage_conf)\n",
-    "catalog.storage_config_info = storage_conf\n",
-    "root_client.create_catalog(create_catalog_request=CreateCatalogRequest(catalog=catalog))\n",
-    "resp = root_client.get_catalog(catalog_name=catalog.name)\n",
-    "resp"
+    "catalog_name = os.getenv(\"POLARIS_CATALOG_NAME\", \"polaris_demo\")\n",
+    "catalog = root_client.get_catalog(catalog_name)\n",
+    "catalog"
    ]
   },
  {
@@ -272,7 +268,7 @@
     "  .config(\"spark.sql.catalog.polaris.credential\", f\"{engineer_principal.credentials.client_id}:{engineer_principal.credentials.client_secret}\")\n",
     "\n",
     "  # Set the warehouse to the name of the catalog we created\n",
-    "  .config(\"spark.sql.catalog.polaris.warehouse\", 'polaris_demo')\n",
+    "  .config(\"spark.sql.catalog.polaris.warehouse\", catalog_name)\n",
     "\n",
     "  # Scope set to PRINCIPAL_ROLE:ALL\n",
     "  .config(\"spark.sql.catalog.polaris.scope\", 'PRINCIPAL_ROLE:ALL')\n",
@@ -454,7 +450,7 @@
     "    return codecs.decode(\"1F\", \"hex\").decode(\"UTF-8\").join(namespace)\n",
     "\n",
     "# Call loadTable\n",
-    "tbl_meta = collado_client.load_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
+    "tbl_meta = collado_client.load_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
     "display(JSON(tbl_meta.to_dict(), expanded=True))"
    ]
   },
@@ -604,7 +600,7 @@
    },
    "outputs": [],
    "source": [
-    "tbl_meta = pm_client.load_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
+    "tbl_meta = pm_client.load_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE', x_iceberg_access_delegation='true')\n",
     "display(JSON(tbl_meta.to_dict(), expanded=True))"
    ]
   },
@@ -632,7 +628,7 @@
    "outputs": [],
    "source": [
-    "pm_client.drop_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
+    "pm_client.drop_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
    ]
   },
   {
@@ -775,7 +771,7 @@
     "# The ops_client fails to do any real damage even though the engineer normally has DROP_TABLE privileges\n",
     "ops_client = IcebergCatalogAPI(CatalogApiClient(CatalogApiClientConfiguration(access_token=ops_token.access_token,\n",
     "                                                                             host='http://polaris:8181/api/catalog')))\n",
-    "ops_client.drop_table(prefix='polaris_demo', namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
+    "ops_client.drop_table(prefix=catalog_name, namespace=format_namespace(['COLLADO_TEST', 'PUBLIC']), table='TEST_TABLE')"
    ]
   }
  ],
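For readers following the notebook change by hand: once `polaris_credential` is split into a client id and secret, the notebook exchanges it for the bearer token (`token.access_token`) used against the Management API. A rough sketch of that exchange with plain `requests` is below; the OAuth2 token path follows the Iceberg REST convention and is an assumption here, as is the `requests` dependency, while the scope value matches the one the notebook passes to Spark.

```python
import requests

# (CHANGE ME) root credential printed in the Polaris log on startup
polaris_credential = '35df9f8a34199df0:101b9d35700032416210ad2d39b1b4e3'  # pragma: allowlist secret
client_id, client_secret = polaris_credential.split(":")

# Assumed token endpoint: the standard Iceberg REST OAuth2 path under /api/catalog.
resp = requests.post(
    "http://polaris:8181/api/catalog/v1/oauth/tokens",
    data={
        "grant_type": "client_credentials",
        "client_id": client_id,
        "client_secret": client_secret,
        "scope": "PRINCIPAL_ROLE:ALL",  # same scope the notebook configures for Spark
    },
)
resp.raise_for_status()
access_token = resp.json()["access_token"]
```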
From bec6dffc77ca276ed7e8592ed265a5ff2e0da49b Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Wed, 25 Sep 2024 09:48:07 -0700
Subject: [PATCH 09/16] add POLARIS_CATALOG_NAME env var to docker

---
 getting-started/spark/docker-compose.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/getting-started/spark/docker-compose.yml b/getting-started/spark/docker-compose.yml
index a38b94e0f..e52dbc020 100644
--- a/getting-started/spark/docker-compose.yml
+++ b/getting-started/spark/docker-compose.yml
@@ -48,6 +48,7 @@ services:
     environment:
       AWS_REGION: us-west-2
       POLARIS_HOST: polaris
+      POLARIS_CATALOG_NAME: polaris_demo # this is used both for `jupyter` and `create-polaris-catalog`, and must be kept in sync
     volumes:
       - notebooks:/home/jovyan/notebooks
   create-polaris-catalog:
@@ -57,6 +58,7 @@ services:
       condition: service_healthy
     environment:
       POLARIS_HOST: polaris
+      POLARIS_CATALOG_NAME: polaris_demo # this is used both for `jupyter` and `create-polaris-catalog`, and must be kept in sync
     volumes:
       - ./create-polaris-catalog.sh:/create-polaris-catalog.sh
     command: ["/bin/sh", "/create-polaris-catalog.sh"]

From aa69a5dccf06567d75896a090490f3ab685a3907 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Wed, 25 Sep 2024 10:09:13 -0700
Subject: [PATCH 10/16] readme

---
 getting-started/spark/README.md | 51 +++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 getting-started/spark/README.md

diff --git a/getting-started/spark/README.md b/getting-started/spark/README.md
new file mode 100644
index 000000000..7a4c84fbd
--- /dev/null
+++ b/getting-started/spark/README.md
@@ -0,0 +1,51 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# Getting Started with Spark and Apache Polaris
+
+This getting started guide provides a `docker-compose` file to set up [Spark](https://spark.apache.org/) with Apache Polaris. Apache Polaris is configured as an Iceberg REST Catalog in Spark.
+A Jupyter notebook is used to run PySpark.
+
+## Run the `docker-compose` file
+To start the `docker-compose` file, run this command from the repo's root directory:
+```
+docker-compose -f getting-started/spark/docker-compose.yml up
+```
+
+This will spin up 3 containers services
+* The `polaris` service for running Apache Polaris
+* The `jupyter` service for running Jupyter notebook with PySpark
+* The `create-polaris-catalog` service to run setup script and create local catalog in Polaris
+
+## Access the Jupyter notebook interface
+In the Jupyter notebook container log, look for the URL to access Jupyter notebook. The url should be in the format, `http://127.0.0.1:8888/lab?token=<token>`.
+
+Open the Jupyter notebook in a browser.
+Navigate to [`notebooks/SparkPolaris.ipynb`](http://127.0.0.1:8888/lab/tree/notebooks/SparkPolaris.ipynb)
+
+## Change the Polaris credential
+The Polaris service will create a new root credential on startup. Find this credential in the Polaris service log and change the `polaris_credential` variable in the first cell of the Jupyter notebook.
+
+## Run the Jupyter notebook
+You can now run all cells in the notebook or write your own code!
+
+## Note
+The Polaris catalog setup script uses the credential `principal:root;realm:default-realm`. This credential is used so users do not need to fetch credentials from Apache Polaris' console output.
+
+An example catalog is created in Apache Polaris using the `curl` command. See `create-polaris-catalog.sh` for details.

From 7c5f659ab1d663242da43eb30472a7d012f56cf9 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Wed, 25 Sep 2024 10:12:45 -0700
Subject: [PATCH 11/16] disable link check for this line

---
 getting-started/spark/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/getting-started/spark/README.md b/getting-started/spark/README.md
index 7a4c84fbd..96834d74c 100644
--- a/getting-started/spark/README.md
+++ b/getting-started/spark/README.md
@@ -37,7 +37,7 @@
 In the Jupyter notebook container log, look for the URL to access Jupyter notebook. The url should be in the format, `http://127.0.0.1:8888/lab?token=<token>`.
 
 Open the Jupyter notebook in a browser.
-Navigate to [`notebooks/SparkPolaris.ipynb`](http://127.0.0.1:8888/lab/tree/notebooks/SparkPolaris.ipynb)
+Navigate to [`notebooks/SparkPolaris.ipynb`](http://127.0.0.1:8888/lab/tree/notebooks/SparkPolaris.ipynb) <!-- markdown-link-check-disable-line -->
 
 ## Change the Polaris credential
 The Polaris service will create a new root credential on startup. Find this credential in the Polaris service log and change the `polaris_credential` variable in the first cell of the Jupyter notebook.
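Once the stack described in this README is up, the setup can be probed from the host. The hypothetical smoke test below mirrors the script's final GET, using the published 8181 port and the same default root bearer token; the `requests` package is assumed.

```python
import requests

# Catalog name and bearer token match the defaults in create-polaris-catalog.sh.
resp = requests.get(
    "http://localhost:8181/api/management/v1/catalogs/polaris_demo",
    headers={"Authorization": "Bearer principal:root;realm:default-realm"},
)
print(resp.status_code, resp.json())
```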
From da0cc1692c61104275d2c41776fcf5be64b5fe64 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Thu, 26 Sep 2024 12:06:29 -0700
Subject: [PATCH 12/16] typo

---
 getting-started/spark/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/getting-started/spark/README.md b/getting-started/spark/README.md
index 96834d74c..eafb03981 100644
--- a/getting-started/spark/README.md
+++ b/getting-started/spark/README.md
@@ -17,9 +17,9 @@ under the License.
 -->
 
-# Getting Started with Spark and Apache Polaris
+# Getting Started with Apache Spark and Apache Polaris
 
-This getting started guide provides a `docker-compose` file to set up [Spark](https://spark.apache.org/) with Apache Polaris. Apache Polaris is configured as an Iceberg REST Catalog in Spark.
+This getting started guide provides a `docker-compose` file to set up [Apache Spark](https://spark.apache.org/) with Apache Polaris. Apache Polaris is configured as an Iceberg REST Catalog in Spark.
 A Jupyter notebook is used to run PySpark.
@@ -28,13 +28,13 @@ To start the `docker-compose` file, run this command from the repo's root direct
 docker-compose -f getting-started/spark/docker-compose.yml up
 ```
 
-This will spin up 3 containers services
+This will spin up 3 container services
 * The `polaris` service for running Apache Polaris
 * The `jupyter` service for running Jupyter notebook with PySpark
 * The `create-polaris-catalog` service to run setup script and create local catalog in Polaris
 
 ## Access the Jupyter notebook interface
-In the Jupyter notebook container log, look for the URL to access Jupyter notebook. The url should be in the format, `http://127.0.0.1:8888/lab?token=<token>`.
+In the Jupyter notebook container log, look for the URL to access the Jupyter notebook. The url should be in the format, `http://127.0.0.1:8888/lab?token=<token>`.
 
 Open the Jupyter notebook in a browser.
 Navigate to [`notebooks/SparkPolaris.ipynb`](http://127.0.0.1:8888/lab/tree/notebooks/SparkPolaris.ipynb) <!-- markdown-link-check-disable-line -->

From d44b948e53200bea8bdc0ddf7c9c0aa13517cee4 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Tue, 15 Oct 2024 08:00:55 -0700
Subject: [PATCH 13/16] notebooks dir moved to getting-started/

---
 .github/workflows/check-md-link.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/check-md-link.yml b/.github/workflows/check-md-link.yml
index 6cdb4195e..00a163593 100644
--- a/.github/workflows/check-md-link.yml
+++ b/.github/workflows/check-md-link.yml
@@ -41,5 +41,5 @@ jobs:
     with:
       use-quiet-mode: 'yes'
       config-file: '.github/workflows/check-md-link-config.json'
-      folder-path: 'regtests, regtests/client/python/docs, regtests/client/python, .github, build-logic, polaris-core, polaris-service, extension, spec, k8, notebooks'
+      folder-path: 'regtests, regtests/client/python/docs, regtests/client/python, .github, build-logic, polaris-core, polaris-service, extension, spec, k8, getting-started'
       file-path: 'CHAT_BYLAWS.md, CODE_OF_CONDUCT.md, CONTRIBUTING.md, README.md SECURITY.md'

From 797fabbfc94afcecb0fc5ecfb89fae4cb1cae294 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Tue, 15 Oct 2024 12:24:26 -0700
Subject: [PATCH 14/16] address PR feedback

---
 getting-started/spark/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/getting-started/spark/README.md b/getting-started/spark/README.md
index eafb03981..d53d87bb1 100644
--- a/getting-started/spark/README.md
+++ b/getting-started/spark/README.md
@@ -29,9 +29,9 @@ docker-compose -f getting-started/spark/docker-compose.yml up
 ```
 
 This will spin up 3 container services
-* The `polaris` service for running Apache Polaris
+* The `polaris` service for running Apache Polaris using an in-memory metastore
 * The `jupyter` service for running Jupyter notebook with PySpark
-* The `create-polaris-catalog` service to run setup script and create local catalog in Polaris
+* The `create-polaris-catalog` service to run setup script and create a catalog in Polaris backed by the local file system
From c2c3437ead70cb865b737f38c3ffa849ccd77179 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Tue, 15 Oct 2024 14:37:57 -0700
Subject: [PATCH 15/16] remove script, move to jupyter notebook

---
 .../spark/create-polaris-catalog.sh       | 56 -------------------
 getting-started/spark/docker-compose.yml  | 12 ----
 .../spark/notebooks/SparkPolaris.ipynb    | 16 ++++--
 3 files changed, 10 insertions(+), 74 deletions(-)
 delete mode 100644 getting-started/spark/create-polaris-catalog.sh

diff --git a/getting-started/spark/create-polaris-catalog.sh b/getting-started/spark/create-polaris-catalog.sh
deleted file mode 100644
index 055ab069e..000000000
--- a/getting-started/spark/create-polaris-catalog.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-
-SPARK_BEARER_TOKEN="${REGTEST_ROOT_BEARER_TOKEN:-principal:root;realm:default-realm}"
-POLARIS_CATALOG_NAME="${POLARIS_CATALOG_NAME:-polaris_demo}"
-
-# create a catalog backed by the local filesystem
-curl -X POST -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" \
-  -H 'Accept: application/json' \
-  -H 'Content-Type: application/json' \
-  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs \
-  -d '{
-        "catalog": {
-          "name": "'"${POLARIS_CATALOG_NAME}"'",
-          "type": "INTERNAL",
-          "readOnly": false,
-          "properties": {
-            "default-base-location": "file:///tmp/polaris/"
-          },
-          "storageConfigInfo": {
-            "storageType": "FILE",
-            "allowedLocations": [
-              "file:///tmp"
-            ]
-          }
-        }
-      }'
-
-# Add TABLE_WRITE_DATA to the catalog's catalog_admin role since by default it can only manage access and metadata
-curl -i -X PUT -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
-  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs/${POLARIS_CATALOG_NAME}/catalog-roles/catalog_admin/grants \
-  -d '{"type": "catalog", "privilege": "TABLE_WRITE_DATA"}' > /dev/stderr
-
-# Assign the catalog_admin to the service_admin.
-curl -i -X PUT -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
-  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/principal-roles/service_admin/catalog-roles/${POLARIS_CATALOG_NAME} \
-  -d '{"name": "catalog_admin"}' > /dev/stderr
-
-curl -X GET -H "Authorization: Bearer ${SPARK_BEARER_TOKEN}" -H 'Accept: application/json' -H 'Content-Type: application/json' \
-  http://${POLARIS_HOST:-localhost}:8181/api/management/v1/catalogs/${POLARIS_CATALOG_NAME}
diff --git a/getting-started/spark/docker-compose.yml b/getting-started/spark/docker-compose.yml
index e52dbc020..4bda0320c 100644
--- a/getting-started/spark/docker-compose.yml
+++ b/getting-started/spark/docker-compose.yml
@@ -48,20 +48,8 @@ services:
     environment:
       AWS_REGION: us-west-2
       POLARIS_HOST: polaris
-      POLARIS_CATALOG_NAME: polaris_demo # this is used both for `jupyter` and `create-polaris-catalog`, and must be kept in sync
     volumes:
       - notebooks:/home/jovyan/notebooks
-  create-polaris-catalog:
-    image: curlimages/curl
-    depends_on:
-      polaris:
-        condition: service_healthy
-    environment:
-      POLARIS_HOST: polaris
-      POLARIS_CATALOG_NAME: polaris_demo # this is used both for `jupyter` and `create-polaris-catalog`, and must be kept in sync
-    volumes:
-      - ./create-polaris-catalog.sh:/create-polaris-catalog.sh
-    command: ["/bin/sh", "/create-polaris-catalog.sh"]
 
 volumes:
   notebooks:
diff --git a/getting-started/spark/notebooks/SparkPolaris.ipynb b/getting-started/spark/notebooks/SparkPolaris.ipynb
index d74a2d930..deb74e928 100644
--- a/getting-started/spark/notebooks/SparkPolaris.ipynb
+++ b/getting-started/spark/notebooks/SparkPolaris.ipynb
@@ -43,9 +43,9 @@
    "id": "4c21f4a1-4129-4dd8-9a6c-fa6eeabfa56e",
    "metadata": {},
    "source": [
-    "# Load the catalog\n",
+    "# Create our first catalog\n",
     "\n",
-    "* A catalog using Local Filesystem is created by `create-polaris-catalog.sh`, with associated access grants"
+    "* Creates a catalog named `polaris_catalog` that writes to a specified location in the Local Filesystem."
    ]
   },
   {
@@ -56,15 +56,19 @@
    "outputs": [],
    "source": [
     "from polaris.management import *\n",
-    "import os\n",
     "\n",
     "client = ApiClient(Configuration(access_token=token.access_token,\n",
     "                                 host='http://polaris:8181/api/management/v1'))\n",
     "root_client = PolarisDefaultApi(client)\n",
     "\n",
-    "catalog_name = os.getenv(\"POLARIS_CATALOG_NAME\", \"polaris_demo\")\n",
-    "catalog = root_client.get_catalog(catalog_name)\n",
-    "catalog"
+    "storage_conf = FileStorageConfigInfo(storage_type=\"FILE\", allowed_locations=[\"file:///tmp\"])\n",
+    "catalog_name = 'polaris_demo'\n",
+    "catalog = Catalog(name=catalog_name, type='INTERNAL', properties={\"default-base-location\": \"file:///tmp/polaris/\"},\n",
+    "                  storage_config_info=storage_conf)\n",
+    "catalog.storage_config_info = storage_conf\n",
+    "root_client.create_catalog(create_catalog_request=CreateCatalogRequest(catalog=catalog))\n",
+    "resp = root_client.get_catalog(catalog_name=catalog.name)\n",
+    "resp"
    ]
   },
   {
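With catalog creation now living in the notebook, the created catalog is consumed from Spark through the REST catalog settings shown in the patch-08 diff earlier. A condensed sketch of that wiring follows: only the `credential`, `warehouse`, and `scope` keys are taken from the diff; the catalog class, `uri`, placeholder credential, and the availability of the Iceberg runtime jars are assumptions.

```python
from pyspark.sql import SparkSession

catalog_name = "polaris_demo"
client_id, client_secret = "<client-id>", "<client-secret>"  # from the Polaris log

spark = (
    SparkSession.builder
    # Iceberg REST catalog pointing at Polaris (class and uri keys are illustrative)
    .config("spark.sql.catalog.polaris", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.polaris.catalog-impl", "org.apache.iceberg.rest.RESTCatalog")
    .config("spark.sql.catalog.polaris.uri", "http://polaris:8181/api/catalog")
    # The three keys below appear in the notebook diff above
    .config("spark.sql.catalog.polaris.credential", f"{client_id}:{client_secret}")
    .config("spark.sql.catalog.polaris.warehouse", catalog_name)
    .config("spark.sql.catalog.polaris.scope", "PRINCIPAL_ROLE:ALL")
    .getOrCreate()
)

# Quick check that the catalog is reachable.
spark.sql("SHOW NAMESPACES IN polaris").show()
```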
From 708522a5000638b326760a1dfe639e1a9406e845 Mon Sep 17 00:00:00 2001
From: Kevin Liu
Date: Tue, 15 Oct 2024 14:40:51 -0700
Subject: [PATCH 16/16] update README

---
 getting-started/spark/README.md | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/getting-started/spark/README.md b/getting-started/spark/README.md
index d53d87bb1..55e4f9d94 100644
--- a/getting-started/spark/README.md
+++ b/getting-started/spark/README.md
@@ -28,10 +28,9 @@ To start the `docker-compose` file, run this command from the repo's root direct
 docker-compose -f getting-started/spark/docker-compose.yml up
 ```
 
-This will spin up 3 container services
+This will spin up 2 container services
 * The `polaris` service for running Apache Polaris using an in-memory metastore
 * The `jupyter` service for running Jupyter notebook with PySpark
-* The `create-polaris-catalog` service to run setup script and create a catalog in Polaris backed by the local file system
 
 ## Access the Jupyter notebook interface
 In the Jupyter notebook container log, look for the URL to access the Jupyter notebook. The url should be in the format, `http://127.0.0.1:8888/lab?token=<token>`.
@@ -44,8 +43,3 @@ The Polaris service will create a new root credential on startup. Find this cre
 
 ## Run the Jupyter notebook
 You can now run all cells in the notebook or write your own code!
-
-## Note
-The Polaris catalog setup script uses the credential `principal:root;realm:default-realm`. This credential is used so users do not need to fetch credentials from Apache Polaris' console output.
-
-An example catalog is created in Apache Polaris using the `curl` command. See `create-polaris-catalog.sh` for details.