# compose-controller-spark-sql-single.yaml
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This docker-compose configuration is for bringing up a pipeline controller
# along with a single-process Spark environment with a JDBC endpoint.
# Environment variables:
#
# PIPELINE_CONFIG: The directory that contains pipeline configurations, namely
# the application.yaml and flink-conf.yaml files.
#
# DWH_ROOT: The directory where Parquet files are written. This is shared
# between all containers; the pipeline writes to it and the Spark containers
# read from it.
#
# Note that if local paths are used, they should start with `./` or `../`. Also
# the mounted files should be readable by the containers, e.g., world-readable.
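#
# A minimal usage sketch (the paths below are illustrative assumptions, not
# taken from this repo's layout; adjust them to your environment):
#   export PIPELINE_CONFIG=./config
#   export DWH_ROOT=./dwh
#   docker-compose -f compose-controller-spark-sql-single.yaml up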
#
# NOTES ON SPARK:
# This is a very simple single-process Spark configuration for running SQL
# queries against the Parquet files generated by the pipeline. It exposes an
# endpoint on port 10001 which can be used for JDBC connections from any SQL
# client.
#
# For a more complete configuration that shows the different pieces needed for
# a cluster environment, please see `compose-controller-spark-sql.yaml`.
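#
# As an illustrative sketch (not part of the original config; adjust the host
# and credentials to your setup), a SQL client could connect with a JDBC URL
# like `jdbc:hive2://localhost:10001`, for example from the host machine:
#   beeline -u jdbc:hive2://localhost:10001 -e 'SHOW TABLES;'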
#
# NOTES ON METASTORE:
# This configuration uses the default embedded Derby database as the Metastore
# for the thriftserver. Example config lines are provided (but commented out)
# that show how to use an external DB instead.
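#
# A hedged sketch of what such an external-DB setup involves (the property
# names are standard Hive settings; the values are placeholders, not taken
# from this repo's hive-site_example.xml): hive-site.xml would set
# javax.jdo.option.ConnectionURL (e.g.
# jdbc:postgresql://my-metastore-db:5432/metastore),
# javax.jdo.option.ConnectionDriverName (e.g. org.postgresql.Driver), and the
# javax.jdo.option.ConnectionUserName/ConnectionPassword credentials.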
#
# OTHER CONFIGS:
# If you want to change Spark's default configs, you can mount your config
# files to /opt/bitnami/spark/conf/
# https://spark.apache.org/docs/latest/configuration.html
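#
# For example (an illustrative, commented-out sketch; the local file name is an
# assumption), a custom spark-defaults.conf could be mounted under the `spark`
# service's `volumes:` section:
#   - ./spark-defaults.conf:/opt/bitnami/spark/conf/spark-defaults.conf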
# Version 2.4 supports the healthcheck feature.
version: '2.4'
services:
  pipeline-controller:
    # To force a build, use the `--build` option of `docker-compose up`.
    build:
      context: ..
    # To use a released image instead, replace the above two lines with this:
    # image: us-docker.pkg.dev/cloud-build-fhir/fhir-analytics/main:latest
    container_name: pipeline-controller
    volumes:
      - ${PIPELINE_CONFIG}:/app/config:ro
      - ${DWH_ROOT}:/dwh
    environment:
      - JAVA_OPTS=$JAVA_OPTS
      # This is set in e2e tests but left unset in the default config.
      - FHIRDATA_SINKFHIRSERVERURL=$FHIRDATA_SINKFHIRSERVERURL
    ports:
      - '8090:8080'
    networks:
      - cloudbuild
      - default
    depends_on:
      spark:
        condition: service_healthy

  spark:
    image: docker.io/bitnami/spark:3.3
    container_name: spark-thriftserver
    command:
      - sbin/start-thriftserver.sh
      - --driver-memory
      # You may need to increase this if your queries/data are too large.
      - 5g
    environment:
      - HIVE_SERVER2_THRIFT_PORT=10000
    ports:
      - '10001:10000'
      - '4041:4040'
    volumes:
      - ${DWH_ROOT}:/dwh
      # NON-EMBEDDED METASTORE CONFIG:
      # If you want to persist the Metastore data, e.g., table and view
      # definitions, you can use an external database by adjusting hive-site.xml:
      #- ./hive-site_example.xml:/opt/bitnami/spark/conf/hive-site.xml
      # Note: to use an external DB, you also need to provide its JDBC driver jar:
      #- ./postgresql-42.6.0.jar:/opt/bitnami/spark/jars/postgresql-42.6.0.jar
      - spark_vol_single:/opt/bitnami/spark
    networks:
      - cloudbuild
      - default
    healthcheck:
      test: beeline help || exit 1
      interval: 10s
      retries: 10
      start_period: 5s
      timeout: 60s

volumes:
  spark_vol_single:

networks:
  cloudbuild:
    external: true
    name: cloudbuild # Needed for continuous integration.