# compose-controller-spark-sql-single.yaml
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This docker-compose configuration is for bringing up a pipeline controller
# along with a single-process Spark environment with a JDBC endpoint.
# Environment variables:
#
# PIPELINE_CONFIG: The directory that contains pipeline configurations, namely
# the application.yaml and flink-conf.yaml files.
#
# DWH_ROOT: The directory where Parquet files are written. This is shared
# between all containers; the pipeline writes to it and the Spark containers
# read from it.
#
# Note that if local paths are used, they should start with `./` or `../`. Also
# the mounted files should be readable by the containers, e.g., world-readable.
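#
# A minimal usage sketch (the paths below are illustrative assumptions, not
# taken from this repo's layout; adjust them to your environment):
#   export PIPELINE_CONFIG=./config
#   export DWH_ROOT=./dwh
#   docker-compose -f compose-controller-spark-sql-single.yaml up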
#
# NOTES ON SPARK:
# This is a very simple single-process Spark configuration for running SQL
# queries against the Parquet files generated by the pipeline. It exposes an
# endpoint on port 10001 which can be used for JDBC connections from any SQL
# client.
#
# For a more complete configuration that shows the different pieces needed for
# a cluster environment, please see `compose-controller-spark-sql.yaml`.
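#
# As an illustrative sketch (not part of the original config; adjust the host
# and credentials to your setup), a SQL client could connect with a JDBC URL
# like `jdbc:hive2://localhost:10001`, for example from the host machine:
#   beeline -u jdbc:hive2://localhost:10001 -e 'SHOW TABLES;'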
#
# NOTES ON METASTORE:
# This configuration uses the default embedded Derby database as the Metastore
# for the thriftserver. Example config lines are provided (but commented out)
# that show how to use an external DB instead.
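#
# A hedged sketch of what such an external-DB setup involves (the property
# names are standard Hive settings; the values are placeholders, not taken
# from this repo's hive-site_example.xml): hive-site.xml would set
# javax.jdo.option.ConnectionURL (e.g.
# jdbc:postgresql://my-metastore-db:5432/metastore),
# javax.jdo.option.ConnectionDriverName (e.g. org.postgresql.Driver), and the
# javax.jdo.option.ConnectionUserName/ConnectionPassword credentials.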
#
# OTHER CONFIGS:
# If you want to change Spark's default configs, you can mount your config
# files to /opt/bitnami/spark/conf/
# https://spark.apache.org/docs/latest/configuration.html
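#
# For example (an illustrative, commented-out sketch; the local file name is an
# assumption), a custom spark-defaults.conf could be mounted under the `spark`
# service's `volumes:` section:
#   - ./spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf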
# Version 2.4 supports the healthcheck feature.
version: '2.4'
services:
  pipeline-controller:
    # To force a build, use the `--build` option of `docker-compose up`.
    build:
      context: ..
    # To use a released image instead, replace the above two lines with this:
    # image: us-docker.pkg.dev/cloud-build-fhir/fhir-analytics/main:latest
    container_name: pipeline-controller
    volumes:
      - ${PIPELINE_CONFIG}:/app/config:ro
      - ${DWH_ROOT}:/dwh
    environment:
      - JAVA_OPTS=$JAVA_OPTS
      # This is set in e2e tests but left unset in the default config.
      - FHIRDATA_SINKFHIRSERVERURL=$FHIRDATA_SINKFHIRSERVERURL
    ports:
      - '8090:8080'
    networks:
      - cloudbuild
      - default
    depends_on:
      spark:
        condition: service_healthy

  spark:
    image: docker.io/bitnami/spark:3.3
    container_name: spark-thriftserver
    command:
      - sbin/start-thriftserver.sh
      - --driver-memory
      # You may need to increase this if your queries/data are too large.
      - 5g
    environment:
      - HIVE_SERVER2_THRIFT_PORT=10000
    ports:
      - '10001:10000'
      - '4041:4040'
    volumes:
      - ${DWH_ROOT}:/dwh
      # NON-EMBEDDED METASTORE CONFIG:
      # If you want to persist the Metastore data, e.g., table and view
      # definitions, you can use an external database by adjusting hive-site.xml:
      #- ./hive-site_example.xml:/opt/bitnami/spark/conf/hive-site.xml
      # Note: to use an external DB, you also need to provide its JDBC driver jar:
      #- ./postgresql-42.6.0.jar:/opt/bitnami/spark/jars/postgresql-42.6.0.jar
      - spark_vol_single:/opt/bitnami/spark
    networks:
      - cloudbuild
      - default
    healthcheck:
      test: beeline help || exit 1
      interval: 10s
      retries: 10
      start_period: 5s
      timeout: 60s

volumes:
  spark_vol_single:

networks:
  cloudbuild:
    external: true
    name: cloudbuild # Needed for continuous integration.