Skip to content

Commit

Permalink
Airflow (#5)
Browse files Browse the repository at this point in the history
* [feat] add airflow

* [fix] move dependencies inside services

* minor changes + fixes

* minor update

* init: add S3 Bucket MinIO

* init: add HashiCorp vault

* feat: add Apache Spark + Airflow
  • Loading branch information
keivanipchihagh authored Jun 7, 2023
1 parent 0de3523 commit d3bcb49
Show file tree
Hide file tree
Showing 15 changed files with 307 additions and 0 deletions.
10 changes: 10 additions & 0 deletions apache-airflow/.env.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Environment template for the Airflow stack; build.sh passes `.env` to docker-compose.
HOSTNAME=airflow
# Postgres
# These values are interpolated into AIRFLOW__DATABASE__SQL_ALCHEMY_CONN in docker-compose.yml.
POSTGRES_DB=airflow
POSTGRES_PORT=5432
# Credentials are left blank in the template.
POSTGRES_USER=
POSTGRES_PASSWORD=
# Airflow
# UID the containers run as (compose sets user "${AIRFLOW_UID:-0}:0").
AIRFLOW_UID=0
# Host ports published for the webserver (container port 8080) and the scheduler (container port 8793).
AIRFLOW_WEBSERVER_PORT=8085
AIRFLOW_SCHEDULER_PORT=8793
31 changes: 31 additions & 0 deletions apache-airflow/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
FROM apache/airflow:2.6.0rc5-python3.10

# System packages have to be installed as root.
USER root

# Install OpenJDK-11, ant and curl.
# `apt-get update` and `apt-get install` share one layer so a cached, stale
# package index is never reused; `-y` keeps the build non-interactive; the
# package lists are dropped in the same layer so they do not bloat the image.
RUN apt-get update \
    && apt-get install -y \
        ant \
        curl \
        openjdk-11-jdk \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Set JAVA_HOME (ENV already exports it to every later RUN step and to the
# running container, so no separate `export` is needed).
# NOTE(review): the path is amd64-specific — confirm the image is never built for arm64.
ENV JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64/

# Directory that holds the JDBC driver JARs downloaded below.
RUN mkdir -p /opt/airflow/spark/jars

# Trino JDBC driver
# RUN curl https://repo1.maven.org/maven2/io/trino/trino-jdbc/396/trino-jdbc-396.jar \
# --output /opt/airflow/spark/jars/trino-jdbc-396.jar

# PostgreSQL JDBC driver
# -f fails the build on an HTTP error instead of saving the error page as a
# .jar, -L follows redirects, -sS hides the progress bar but keeps error output.
RUN curl -fsSL https://jdbc.postgresql.org/download/postgresql-42.5.0.jar \
    --output /opt/airflow/spark/jars/postgresql-42.5.0.jar

# Must use 'airflow' user to install PIP packages
USER airflow

# Install requirements
# NOTE(review): the cache mount targets /root/.cache while pip runs as
# 'airflow' — confirm the cache is actually being hit.
COPY requirements.txt /
RUN --mount=type=cache,target=/root/.cache \
    pip install -r /requirements.txt
3 changes: 3 additions & 0 deletions apache-airflow/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Build the Airflow image and start the stack in the background.

# Stop at the first failing command, unset variable or failed pipeline stage.
set -euo pipefail

# Resolve docker-compose.yml and .env relative to this script rather than the
# caller's working directory.
cd "$(dirname "${BASH_SOURCE[0]}")"

# The Dockerfile uses `RUN --mount=type=cache`, which needs BuildKit.
export DOCKER_BUILDKIT=1
docker-compose -f docker-compose.yml --env-file .env up -d --build
94 changes: 94 additions & 0 deletions apache-airflow/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
version: '3.9'

# Settings shared by every Airflow container: image build, run-as user,
# log rotation, metadata-DB connection, bind mounts and networks.
x-common: &common
  build:
    context: .
    dockerfile: Dockerfile
  user: "${AIRFLOW_UID:-0}:0"
  logging:
    driver: "json-file"
    options:
      max-size: "100m"
      max-file: "3"
  # deploy:
  #   resources:
  #     limits:
  #       cpus: '0.5'
  #       memory: 1g
  environment:
    AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: "postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:${POSTGRES_PORT:-5432}/${POSTGRES_DB:-airflow}"

  env_file:
    - .env
  volumes:
    - ./volume/airflow/dags:/opt/airflow/dags
    - ./volume/airflow/logs:/opt/airflow/logs
    - ./volume/airflow/plugins:/opt/airflow/plugins
    # NOTE(review): mounting the Docker socket gives the container control
    # over the host's Docker daemon — confirm this is required.
    - /var/run/docker.sock:/var/run/docker.sock
  networks:
    - private
    - nginx
    - public

# Makes a service wait until the one-shot `airflow-init` container has
# exited successfully.
x-depends-on: &depends-on
  depends_on:
    airflow-init:
      condition: service_completed_successfully


services:
  # Airflow Scheduler
  airflow-scheduler:
    # A mapping may contain only one `<<` key, so both fragments are merged
    # through a single sequence instead of two duplicate `<<` entries.
    <<: [*common, *depends-on]
    container_name: airflow-scheduler
    healthcheck:
      # `$$` escapes Compose interpolation so HOSTNAME is expanded by the
      # shell inside the container.
      test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"']
      interval: 10s
      timeout: 10s
      retries: 5
    command: scheduler
    restart: on-failure
    ports:
      - "${AIRFLOW_SCHEDULER_PORT}:8793"

  # Airflow Webserver
  airflow-webserver:
    <<: [*common, *depends-on]
    container_name: airflow-webserver
    restart: always
    command: webserver
    ports:
      - "${AIRFLOW_WEBSERVER_PORT:-8080}:8080"
    healthcheck:
      test: ["CMD", "curl", "--fail", "http://localhost:8080/health"]
      interval: 10s
      timeout: 10s
      retries: 3

  # Airflow Initializer
  airflow-init:
    <<: *common
    container_name: airflow-init
    entrypoint: /bin/bash
    # NOTE(review): no volume in this file is mounted at /sources, so the
    # directories created below exist only inside this container — confirm
    # whether a bind mount is missing.
    command:
      - -c
      - |
        mkdir -p /sources/logs /sources/dags /sources/plugins
        chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins}
        exec /entrypoint airflow version

networks:
  private:
    name: private
    external: true
  nginx:
    name: nginx
    internal: true
  public:
    name: public
    external: true
3 changes: 3 additions & 0 deletions apache-airflow/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Airflow Provider Packages
# Installed by the Airflow Dockerfile via `pip install -r /requirements.txt`.
# Each package is pinned to an exact version.
apache-airflow-providers-apache-spark==4.0.0
apache-airflow-providers-cncf-kubernetes==5.0.0
14 changes: 14 additions & 0 deletions apache-spark/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
FROM bitnami/spark:3.3

# Install curl (package installation needs root; drop back to the
# unprivileged UID 1001 afterwards).
USER root
RUN install_packages curl
USER 1001

# Both downloads use -f so an HTTP error fails the build instead of being
# saved as a .jar, -L to follow redirects, and -sS to hide the progress bar
# while still printing errors.

# Trino JDBC driver
RUN curl -fsSL https://repo1.maven.org/maven2/io/trino/trino-jdbc/396/trino-jdbc-396.jar \
    --output /opt/bitnami/spark/jars/trino-jdbc-396.jar

# PostgreSQL JDBC driver
RUN curl -fsSL https://jdbc.postgresql.org/download/postgresql-42.5.0.jar \
    --output /opt/bitnami/spark/jars/postgresql-42.5.0.jar
3 changes: 3 additions & 0 deletions apache-spark/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/bin/bash
# Build the Spark image and start the master and worker in the background.

# Stop at the first failing command, unset variable or failed pipeline stage.
set -euo pipefail

# Resolve docker-compose.yml and ../.env relative to this script rather than
# the caller's working directory.
cd "$(dirname "${BASH_SOURCE[0]}")"

export DOCKER_BUILDKIT=1
# NOTE(review): this reads the .env one directory up, whereas the other
# stacks' build scripts read their own .env — confirm that is intended.
docker-compose -f docker-compose.yml --env-file ../.env up -d --build
45 changes: 45 additions & 0 deletions apache-spark/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
version: '3.9'

services:
  # Spark master: publishes the web UI and the cluster port on the host.
  spark-master:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: spark-master
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=yes
      - SPARK_SSL_ENABLED=no
    # Port mappings are quoted so YAML never reads `host:container` as a
    # number.
    ports:
      - "8080:8080"  # Master UI
      - "7077:7077"  # Master
    networks:
      - services
      - spark

  # Spark worker: limited to 1 core and 1G of memory; publishes no ports.
  spark-worker:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: spark-worker
    environment:
      - SPARK_MODE=worker
      - SPARK_WORKER_MEMORY=1G
      - SPARK_WORKER_CORES=1
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=yes
      - SPARK_SSL_ENABLED=no
    networks:
      - spark
      - services

networks:
  # Pre-existing network; must be created outside this file.
  services:
    name: services
    external: true
  # Internal network shared by the master and worker.
  spark:
    name: spark
    internal: true
2 changes: 2 additions & 0 deletions minIO/.env.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Environment template for the MinIO stack; build.sh passes `.env` to docker-compose.
# Root credentials handed to the container as MINIO_ROOT_USER / MINIO_ROOT_PASSWORD.
# docker-compose.yml also reads MINIO_API_PORT and MINIO_CONSOLE_PORT for the published host ports.
MINIO_ROOT_USER=
MINIO_ROOT_PASSWORD=
2 changes: 2 additions & 0 deletions minIO/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
# Start the MinIO stack in the background.

# Stop at the first failing command, unset variable or failed pipeline stage.
set -euo pipefail

# Resolve docker-compose.yml and .env relative to this script rather than the
# caller's working directory.
cd "$(dirname "${BASH_SOURCE[0]}")"

docker-compose --env-file .env up -d --build
37 changes: 37 additions & 0 deletions minIO/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
version: '3.9'

services:
  # MinIO object storage: API on 9000, web console on 9001.
  minio:
    container_name: minio
    image: minio/minio
    restart: always
    ports:
      # `:-` supplies the default when the variable is unset or empty; the
      # previous `${MINIO_API_PORT:9000}` was not valid interpolation syntax.
      - "${MINIO_API_PORT:-9000}:9000"  # API port
      - "${MINIO_CONSOLE_PORT:-9001}:9001"  # Console port
    environment:
      MINIO_ROOT_USER: ${MINIO_ROOT_USER}
      MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD}
    healthcheck:
      # NOTE(review): this assumes `curl` exists in the unpinned minio/minio
      # image — confirm for the tag actually pulled.
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 30s
      timeout: 10s
      retries: 3
    command: server --console-address ":9001" /data
    volumes:
      - ./volume/minio:/data
    logging:
      driver: "json-file"
      options:
        max-size: "100m"
        max-file: "3"
    networks:
      - private
      - nginx

# NOTE(review): both networks are internal, yet host ports are published
# above — confirm the ports are reachable as intended.
networks:
  private:
    name: private
    internal: true
  nginx:
    name: nginx
    internal: true
3 changes: 3 additions & 0 deletions vault/.env.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Environment template for the Vault stack; build.sh passes `.env` to docker-compose.
# Domain whose /etc/letsencrypt/live/<DOMAIN>/ certificate files are mounted into the container.
DOMAIN=
# NOTE(review): EMAIL is not referenced by this stack's docker-compose.yml — confirm where it is consumed.
EMAIL=
# Host port published for Vault (container port 8200).
VAULT_PORT=8200
2 changes: 2 additions & 0 deletions vault/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
# Start the Vault stack in the background.

# Stop at the first failing command, unset variable or failed pipeline stage.
set -euo pipefail

# Resolve docker-compose.yml and .env relative to this script rather than the
# caller's working directory.
cd "$(dirname "${BASH_SOURCE[0]}")"

docker-compose --env-file .env up -d --build
42 changes: 42 additions & 0 deletions vault/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
version: '3.7'

# Log rotation settings shared through a YAML anchor.
x-common: &common
  logging:
    driver: "json-file"
    options:
      max-size: "100m"
      max-file: "3"

services:

  # Vault
  vault:
    <<: *common
    image: hashicorp/vault:1.13.0-rc1
    container_name: vault
    restart: always
    # The config path is absolute so it matches the bind-mount target below
    # regardless of the container's working directory.
    entrypoint: vault server -config /vault/config/config.hcl
    ports:
      - "${VAULT_PORT}:8200"
    cap_add:
      - IPC_LOCK
    volumes:
      - ./volume/logs:/vault/logs
      - ./volume/file:/vault/file
      - ./volume/config:/vault/config
      - ./volume/policies:/vault/policies
      # Server configuration, mounted on top of the config directory above.
      - ./vault-config.hcl:/vault/config/config.hcl
      # Let's Encrypt certificate and key for ${DOMAIN}.
      - /etc/letsencrypt/live/${DOMAIN}/fullchain.pem:/certs/fullchain.pem
      - /etc/letsencrypt/live/${DOMAIN}/privkey.pem:/certs/privkey.pem
    networks:
      - nginx
      - private

networks:
  nginx:
    name: nginx
    internal: true
  private:
    name: private
    internal: true
16 changes: 16 additions & 0 deletions vault/vault-config.hcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Vault server configuration, mounted into the container at
# /vault/config/config.hcl by docker-compose.yml.

# Keep Vault's data on the local filesystem under /vault/file.
storage "file" {
  path = "/vault/file"
}

# TCP listener bound to every interface on port 8200.
listener "tcp" {
  address       = "0.0.0.0:8200"
  # NOTE(review): TLS is disabled here although certificate paths are set
  # right below; presumably those paths are unused while tls_disable is
  # "true" — confirm which behavior is intended.
  tls_disable   = "true"
  tls_cert_file = "/certs/fullchain.pem"
  tls_key_file  = "/certs/privkey.pem"
}

# Enable the web UI.
ui                = true
# NOTE(review): api_addr is set to the wildcard address — confirm whether a
# routable hostname is expected here.
api_addr          = "http://0.0.0.0:8200"
# Lease lifetimes: 48 hours by default, capped at 168 hours.
default_lease_ttl = "48h"
max_lease_ttl     = "168h"
cluster_name      = "Primary"

0 comments on commit d3bcb49

Please sign in to comment.