diff --git a/demos/data-lakehouse-iceberg-trino-spark/create-nifi-ingestion-job.yaml b/demos/data-lakehouse-iceberg-trino-spark/create-nifi-ingestion-job.yaml index 59968ee2..e86947df 100644 --- a/demos/data-lakehouse-iceberg-trino-spark/create-nifi-ingestion-job.yaml +++ b/demos/data-lakehouse-iceberg-trino-spark/create-nifi-ingestion-job.yaml @@ -10,7 +10,14 @@ spec: initContainers: - name: wait-for-kafka image: oci.stackable.tech/sdp/tools:1.0.0-stackable0.0.0-dev - command: ["bash", "-c", "echo 'Waiting for all kafka brokers to be ready' && kubectl wait --for=condition=ready --timeout=30m pod -l app.kubernetes.io/instance=kafka -l app.kubernetes.io/name=kafka"] + command: + - bash + - -euo + - pipefail + - -c + - | + echo 'Waiting for all kafka brokers to be ready' + kubectl wait --for=condition=ready --timeout=30m pod -l app.kubernetes.io/instance=kafka,app.kubernetes.io/name=kafka containers: - name: create-nifi-ingestion-job image: oci.stackable.tech/sdp/testing-tools:0.2.0-stackable0.0.0-dev @@ -19,7 +26,8 @@ spec: - -euo - pipefail - -c - - python -u /tmp/script/script.py + - | + python -u /tmp/script/script.py volumeMounts: - name: script mountPath: /tmp/script @@ -53,8 +61,8 @@ data: import requests import urllib3 - # As of 2022-08-29 we cant use "https://nifi:8443" here because
- #
- # The request contained an invalid host header [nifi:8443] in the request [/nifi-api]. Check for request manipulation or third-party intercept.
- #
- ENDPOINT = f"https://nifi-node-default-0.nifi-node-default.{os.environ['NAMESPACE']}.svc.cluster.local:8443" # For local testing / developing replace it, afterwards change back to f"https://nifi-node-default-0.nifi-node-default.{os.environ['NAMESPACE']}.svc.cluster.local:8443" + # As of 2022-08-29 we can't use "https://nifi-node:8443" here because
+ #
+ # The request contained an invalid host header [nifi:8443] in the request [/nifi-api]. Check for request manipulation or third-party intercept.
+ #
+ ENDPOINT = f"https://nifi-node-default-0.nifi-node-default-headless.{os.environ['NAMESPACE']}.svc.cluster.local:8443" # For local testing / developing replace it, afterwards change back to f"https://nifi-node-default-0.nifi-node-default-headless.{os.environ['NAMESPACE']}.svc.cluster.local:8443" USERNAME = "admin" PASSWORD = open("/nifi-admin-credentials-secret/admin").read() diff --git a/demos/data-lakehouse-iceberg-trino-spark/create-spark-ingestion-job.yaml b/demos/data-lakehouse-iceberg-trino-spark/create-spark-ingestion-job.yaml index 739ceeed..10271f25 100644 --- a/demos/data-lakehouse-iceberg-trino-spark/create-spark-ingestion-job.yaml +++ b/demos/data-lakehouse-iceberg-trino-spark/create-spark-ingestion-job.yaml @@ -13,11 +13,26 @@ spec: initContainers: - name: wait-for-kafka image: oci.stackable.tech/sdp/tools:1.0.0-stackable0.0.0-dev - command: ["bash", "-c", "echo 'Waiting for all kafka brokers to be ready' && kubectl wait --for=condition=ready --timeout=30m pod -l app.kubernetes.io/name=kafka -l app.kubernetes.io/instance=kafka"] + command: + - bash + - -euo + - pipefail + - -c + - | + echo 'Waiting for all minio instances to be ready' + kubectl wait --for=condition=ready --timeout=30m pod -l app=minio,release=minio,stackable.tech/vendor=Stackable + echo 'Waiting for all kafka brokers to be ready' + kubectl wait --for=condition=ready --timeout=30m pod -l app.kubernetes.io/name=kafka,app.kubernetes.io/instance=kafka containers: - name: create-spark-ingestion-job image: oci.stackable.tech/sdp/tools:1.0.0-stackable0.0.0-dev - command: ["bash", "-c", "echo 'Submitting Spark job' && kubectl apply -f /tmp/manifest/spark-ingestion-job.yaml"] + command: + - bash + - -euo + - pipefail + - -c + - | + echo 'Submitting Spark job' && kubectl apply -f /tmp/manifest/spark-ingestion-job.yaml volumeMounts: - name: manifest mountPath: /tmp/manifest @@ -56,7 +71,7 @@ data: spark.sql.extensions: org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions 
spark.sql.catalog.lakehouse: org.apache.iceberg.spark.SparkCatalog spark.sql.catalog.lakehouse.type: hive - spark.sql.catalog.lakehouse.uri: thrift://hive-iceberg:9083 + spark.sql.catalog.lakehouse.uri: thrift://hive-iceberg-metastore:9083 # Every merge into statements creates 8 files. # Paralleling is enough for the demo, might need to be increased (or omitted entirely) when merge larger data volumes spark.sql.shuffle.partitions: "8" diff --git a/demos/data-lakehouse-iceberg-trino-spark/create-trino-tables.yaml b/demos/data-lakehouse-iceberg-trino-spark/create-trino-tables.yaml index 5a0f5ae9..a1ce4d7f 100644 --- a/demos/data-lakehouse-iceberg-trino-spark/create-trino-tables.yaml +++ b/demos/data-lakehouse-iceberg-trino-spark/create-trino-tables.yaml @@ -10,7 +10,14 @@ spec: initContainers: - name: wait-for-testdata image: oci.stackable.tech/sdp/tools:1.0.0-stackable0.0.0-dev - command: ["bash", "-c", "echo 'Waiting for job load-test-data to finish' && kubectl wait --for=condition=complete --timeout=30m job/load-test-data"] + command: + - bash + - -euo + - pipefail + - -c + - | + echo 'Waiting for job load-test-data to finish' + kubectl wait --for=condition=complete --timeout=30m job/load-test-data containers: - name: create-tables-in-trino image: oci.stackable.tech/sdp/testing-tools:0.2.0-stackable0.0.0-dev diff --git a/demos/data-lakehouse-iceberg-trino-spark/serviceaccount.yaml b/demos/data-lakehouse-iceberg-trino-spark/serviceaccount.yaml index cb3ea626..53e2ccae 100644 --- a/demos/data-lakehouse-iceberg-trino-spark/serviceaccount.yaml +++ b/demos/data-lakehouse-iceberg-trino-spark/serviceaccount.yaml @@ -31,6 +31,14 @@ rules: - get - list - watch + - apiGroups: + - apps + resources: + - statefulsets + verbs: + - get + - list + - watch - apiGroups: - batch resources: diff --git a/demos/data-lakehouse-iceberg-trino-spark/setup-superset.yaml b/demos/data-lakehouse-iceberg-trino-spark/setup-superset.yaml index 187f94ee..4a14c4f5 100644 --- 
a/demos/data-lakehouse-iceberg-trino-spark/setup-superset.yaml +++ b/demos/data-lakehouse-iceberg-trino-spark/setup-superset.yaml @@ -9,7 +9,14 @@ spec: containers: - name: setup-superset image: oci.stackable.tech/sdp/testing-tools:0.2.0-stackable0.0.0-dev - command: ["bash", "-c", "curl -o superset-assets.zip https://raw.githubusercontent.com/stackabletech/demos/main/demos/data-lakehouse-iceberg-trino-spark/superset-assets.zip && python -u /tmp/script/script.py"] + command: + - bash + - -euo + - pipefail + - -c + - | + curl -o superset-assets.zip https://raw.githubusercontent.com/stackabletech/demos/main/demos/data-lakehouse-iceberg-trino-spark/superset-assets.zip + python -u /tmp/script/script.py volumeMounts: - name: script mountPath: /tmp/script @@ -39,7 +46,7 @@ data: import logging import requests - base_url = "http://superset-node-default:8088" # For local testing / developing replace it, afterwards change back to http://superset-node-default:8088 + base_url = "http://superset-node:8088" # For local testing / developing replace it, afterwards change back to http://superset-node:8088 superset_username = open("/superset-credentials/adminUser.username").read() superset_password = open("/superset-credentials/adminUser.password").read() trino_username = "admin" diff --git a/demos/data-lakehouse-iceberg-trino-spark/superset-assets.zip b/demos/data-lakehouse-iceberg-trino-spark/superset-assets.zip index 4f36c971..ebe610be 100644 Binary files a/demos/data-lakehouse-iceberg-trino-spark/superset-assets.zip and b/demos/data-lakehouse-iceberg-trino-spark/superset-assets.zip differ diff --git a/stacks/data-lakehouse-iceberg-trino-spark/nifi.yaml b/stacks/data-lakehouse-iceberg-trino-spark/nifi.yaml index 58a4c2db..6c067d79 100644 --- a/stacks/data-lakehouse-iceberg-trino-spark/nifi.yaml +++ b/stacks/data-lakehouse-iceberg-trino-spark/nifi.yaml @@ -9,18 +9,19 @@ spec: clusterConfig: authentication: - authenticationClass: nifi-admin-credentials - listenerClass: 
external-unstable sensitiveProperties: keySecret: nifi-sensitive-property-key autoGenerate: true nodes: + roleConfig: + listenerClass: external-unstable config: resources: cpu: min: "2" max: "4" memory: - limit: '6Gi' + limit: "6Gi" storage: contentRepo: capacity: "10Gi" @@ -51,4 +52,4 @@ kind: Secret metadata: name: nifi-admin-credentials-secret stringData: - admin: {{ nifiAdminPassword }} + admin: "{{ nifiAdminPassword }}" diff --git a/stacks/data-lakehouse-iceberg-trino-spark/trino.yaml b/stacks/data-lakehouse-iceberg-trino-spark/trino.yaml index 0716844e..2848e35b 100644 --- a/stacks/data-lakehouse-iceberg-trino-spark/trino.yaml +++ b/stacks/data-lakehouse-iceberg-trino-spark/trino.yaml @@ -7,7 +7,6 @@ spec: image: productVersion: "476" clusterConfig: - listenerClass: external-unstable catalogLabelSelector: matchLabels: trino: trino @@ -18,6 +17,8 @@ spec: configMapName: opa package: trino coordinators: + roleConfig: + listenerClass: external-unstable config: queryMaxMemory: 10TB resources: @@ -25,7 +26,7 @@ spec: min: "1" max: "4" memory: - limit: '6Gi' + limit: "6Gi" roleGroups: default: replicas: 1 @@ -37,7 +38,7 @@ spec: min: "2" max: "6" memory: - limit: '20Gi' + limit: "20Gi" roleGroups: default: replicas: 4