From 2b56c4c6783522a2ed251e0e1c96a55dd378290a Mon Sep 17 00:00:00 2001 From: beshiniii Date: Sat, 19 Jun 2021 12:07:13 +0530 Subject: [PATCH 1/3] Add apache airflow to the crawlerx server --- airflow_docker/docker-compose.yaml | 147 +++++++++++++++++++++++++++ airflow_docker/logs/scheduler/latest | 1 + 2 files changed, 148 insertions(+) create mode 100644 airflow_docker/docker-compose.yaml create mode 120000 airflow_docker/logs/scheduler/latest diff --git a/airflow_docker/docker-compose.yaml b/airflow_docker/docker-compose.yaml new file mode 100644 index 0000000..6b1e87a --- /dev/null +++ b/airflow_docker/docker-compose.yaml @@ -0,0 +1,147 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. +# +# WARNING: This configuration is for local development. Do not use it in a production deployment. +# +# This configuration supports basic configuration using environment variables or an .env file +# The following variables are supported: +# +# AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 
+# Default: apache/airflow:2.1.0 +# AIRFLOW_UID - User ID in Airflow containers +# Default: 50000 +# AIRFLOW_GID - Group ID in Airflow containers +# Default: 50000 +# _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account. +# Default: airflow +# _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account. +# Default: airflow +# +# Feel free to modify this file to suit your needs. +--- +version: '3' +x-airflow-common: + &airflow-common + image: ${AIRFLOW_IMAGE_NAME:-apache/airflow:2.1.0} + environment: + &airflow-common-env + AIRFLOW__CORE__EXECUTOR: CeleryExecutor + AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow + AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow + AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 + AIRFLOW__CORE__FERNET_KEY: '' + AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' + AIRFLOW__CORE__LOAD_EXAMPLES: 'false' + AIRFLOW__API__AUTH_BACKEND: 'airflow.api.auth.backend.basic_auth' + volumes: + - ./dags:/opt/airflow/dags + - ./logs:/opt/airflow/logs + - ./plugins:/opt/airflow/plugins + user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}" + depends_on: + redis: + condition: service_healthy + postgres: + condition: service_healthy + +services: + postgres: + image: postgres:13 + environment: + POSTGRES_USER: airflow + POSTGRES_PASSWORD: airflow + POSTGRES_DB: airflow + volumes: + - postgres-db-volume:/var/lib/postgresql/data + healthcheck: + test: ["CMD", "pg_isready", "-U", "airflow"] + interval: 5s + retries: 5 + restart: always + + redis: + image: redis:latest + ports: + - 6379:6379 + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 30s + retries: 50 + restart: always + + airflow-webserver: + <<: *airflow-common + command: webserver + ports: + - 8080:8080 + healthcheck: + test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] + interval: 10s + timeout: 10s + retries: 5 + restart: always + + 
airflow-scheduler: + <<: *airflow-common + command: scheduler + healthcheck: + test: ["CMD-SHELL", 'airflow jobs check --job-type SchedulerJob --hostname "$${HOSTNAME}"'] + interval: 10s + timeout: 10s + retries: 5 + restart: always + + airflow-worker: + <<: *airflow-common + command: celery worker + healthcheck: + test: + - "CMD-SHELL" + - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' + interval: 10s + timeout: 10s + retries: 5 + restart: always + + airflow-init: + <<: *airflow-common + command: version + environment: + <<: *airflow-common-env + _AIRFLOW_DB_UPGRADE: 'true' + _AIRFLOW_WWW_USER_CREATE: 'true' + _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} + _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} + + flower: + <<: *airflow-common + command: celery flower + ports: + - 5555:5555 + healthcheck: + test: ["CMD", "curl", "--fail", "http://localhost:5555/"] + interval: 10s + timeout: 10s + retries: 5 + restart: always + +volumes: + postgres-db-volume: diff --git a/airflow_docker/logs/scheduler/latest b/airflow_docker/logs/scheduler/latest new file mode 120000 index 0000000..dbd4d2e --- /dev/null +++ b/airflow_docker/logs/scheduler/latest @@ -0,0 +1 @@ +/opt/airflow/logs/scheduler/2021-06-19 \ No newline at end of file From 184b7872551320104c4459f70fa8f9c9fd5a070e Mon Sep 17 00:00:00 2001 From: beshiniii Date: Thu, 24 Jun 2021 17:04:08 +0530 Subject: [PATCH 2/3] initialize rabbitmq as the broker for airflow --- airflow_docker/docker-compose.yaml | 31 ++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/airflow_docker/docker-compose.yaml b/airflow_docker/docker-compose.yaml index 6b1e87a..389bcc1 100644 --- a/airflow_docker/docker-compose.yaml +++ b/airflow_docker/docker-compose.yaml @@ -16,7 +16,7 @@ # under the License. # -# Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 
+# Basic Airflow cluster configuration for CeleryExecutor with RabbitMQ and PostgreSQL. # # WARNING: This configuration is for local development. Do not use it in a production deployment. # @@ -45,7 +45,7 @@ x-airflow-common: AIRFLOW__CORE__EXECUTOR: CeleryExecutor AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow - AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 + AIRFLOW__CELERY__BROKER_URL: amqp://guest:guest@rabbitmq:5672/ AIRFLOW__CORE__FERNET_KEY: '' AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' AIRFLOW__CORE__LOAD_EXAMPLES: 'false' @@ -56,10 +56,8 @@ x-airflow-common: - ./plugins:/opt/airflow/plugins user: "${AIRFLOW_UID:-50000}:${AIRFLOW_GID:-50000}" depends_on: - redis: - condition: service_healthy - postgres: - condition: service_healthy + - rabbitmq + - postgres services: postgres: @@ -76,16 +74,21 @@ services: retries: 5 restart: always - redis: - image: redis:latest + rabbitmq: + image: "rabbitmq:3-management" + hostname: "rabbit" ports: - - 6379:6379 + - 15672:15672 + - 5672:5672 + labels: + NAME: "rabbitmq" + volumes: + - ./rabbitmq-isolated.conf:/etc/rabbitmq/rabbitmq.config healthcheck: - test: ["CMD", "redis-cli", "ping"] - interval: 5s - timeout: 30s - retries: 50 - restart: always + test: ["CMD", "rabbitmq-diagnostics", "-q", "ping"] # probe with the broker's own CLI; does not rely on curl being present in the image + interval: 20s + retries: 5 + restart: always airflow-webserver: <<: *airflow-common From b1d66b11277617076a668ecd93abe765d9f4c752 Mon Sep 17 00:00:00 2001 From: beshiniii Date: Thu, 24 Jun 2021 17:05:39 +0530 Subject: [PATCH 3/3] ignore airflow logs --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 12e9a0f..d834af5 100644 --- a/.gitignore +++ b/.gitignore @@ -134,3 +134,6 @@ dmypy.json #node modules **/node_modules + +#airflow logs +airflow_docker/logs