From a1b95df3b89975d1afa203d42ce631c2ec4912cd Mon Sep 17 00:00:00 2001 From: pablodanswer Date: Sat, 9 Nov 2024 13:26:51 -0800 Subject: [PATCH] Robustify cloud deployment + include initial KEDA configuration (#3094) * robustify cloud deployment + include initial KEDA configuration * ensure .github changes are passed * raise exits --- ...er-build-push-backend-container-on-tag.yml | 100 +++++++++--------- ...-build-push-cloud-web-container-on-tag.yml | 54 +++++----- ...ild-push-model-server-container-on-tag.yml | 84 +++++++-------- .../background/celery/apps/app_base.py | 5 - .../danswer/background/celery/apps/beat.py | 6 ++ .../danswer/background/celery/apps/heavy.py | 5 + .../background/celery/apps/indexing.py | 5 + .../danswer/background/celery/apps/light.py | 4 + .../danswer/background/celery/apps/primary.py | 7 +- .../keda/celery-worker-auth.yaml | 13 +++ .../celery-worker-indexing-scaledobject.yaml | 46 ++++++++ .../celery-worker-light-scaledobject.yaml | 63 +++++++++++ .../celery-worker-primary-scaledobject.yaml | 76 +++++++++++++ .../keda/keda-redis-secret.yaml | 9 ++ deployment/cloud_kubernetes/workers/beat.yaml | 2 +- .../workers/heavy_worker.yaml | 2 +- .../workers/indexing_worker.yaml | 4 +- .../workers/light_worker.yaml | 4 +- .../cloud_kubernetes/workers/primary.yaml | 6 +- 19 files changed, 362 insertions(+), 133 deletions(-) create mode 100644 deployment/cloud_kubernetes/keda/celery-worker-auth.yaml create mode 100644 deployment/cloud_kubernetes/keda/celery-worker-indexing-scaledobject.yaml create mode 100644 deployment/cloud_kubernetes/keda/celery-worker-light-scaledobject.yaml create mode 100644 deployment/cloud_kubernetes/keda/celery-worker-primary-scaledobject.yaml create mode 100644 deployment/cloud_kubernetes/keda/keda-redis-secret.yaml diff --git a/.github/workflows/docker-build-push-backend-container-on-tag.yml b/.github/workflows/docker-build-push-backend-container-on-tag.yml index ef07e051db3..ef33750c271 100644 --- a/.github/workflows/docker-build-push-backend-container-on-tag.yml +++ b/.github/workflows/docker-build-push-backend-container-on-tag.yml @@ -3,61 +3,61 @@ name: Build and Push Backend Image on Tag on: push: tags: - - '*' + - "*" env: - REGISTRY_IMAGE: danswer/danswer-backend + REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'danswer/danswer-backend-cloud' || 'danswer/danswer-backend' }} LATEST_TAG: ${{ contains(github.ref_name, 'latest') }} - + jobs: build-and-push: - # TODO: investigate a matrix build like the web container + # TODO: investigate a matrix build like the web container # See https://runs-on.com/runners/linux/ - runs-on: [runs-on,runner=8cpu-linux-x64,"run-id=${{ github.run_id }}"] + runs-on: [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}"] steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_TOKEN }} - - - name: Install build-essential - run: | - sudo apt-get update - sudo apt-get install -y build-essential - - - name: Backend Image Docker Build and Push - uses: docker/build-push-action@v5 - with: - context: ./backend - file: ./backend/Dockerfile - platforms: linux/amd64,linux/arm64 - push: true - tags: | - ${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }} - ${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }} - build-args: | - DANSWER_VERSION=${{ github.ref_name }} - 
- # trivy has their own rate limiting issues causing this action to flake - # we worked around it by hardcoding to different db repos in env - # can re-enable when they figure it out - # https://github.com/aquasecurity/trivy/discussions/7538 - # https://github.com/aquasecurity/trivy-action/issues/389 - - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@master - env: - TRIVY_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-db:2' - TRIVY_JAVA_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-java-db:1' - with: - # To run locally: trivy image --severity HIGH,CRITICAL danswer/danswer-backend - image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }} - severity: 'CRITICAL,HIGH' - trivyignores: ./backend/.trivyignore + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_TOKEN }} + + - name: Install build-essential + run: | + sudo apt-get update + sudo apt-get install -y build-essential + + - name: Backend Image Docker Build and Push + uses: docker/build-push-action@v5 + with: + context: ./backend + file: ./backend/Dockerfile + platforms: linux/amd64,linux/arm64 + push: true + tags: | + ${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }} + ${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }} + build-args: | + DANSWER_VERSION=${{ github.ref_name }} + + # trivy has their own rate limiting issues causing this action to flake + # we worked around it by hardcoding to different db repos in env + # can re-enable when they figure it out + # https://github.com/aquasecurity/trivy/discussions/7538 + # https://github.com/aquasecurity/trivy-action/issues/389 + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + env: + TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2" + TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1" + with: + # To run locally: trivy image --severity HIGH,CRITICAL danswer/danswer-backend + image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }} + severity: "CRITICAL,HIGH" + trivyignores: ./backend/.trivyignore diff --git a/.github/workflows/docker-build-push-cloud-web-container-on-tag.yml b/.github/workflows/docker-build-push-cloud-web-container-on-tag.yml index 871c96841ad..6cebb4b6c8f 100644 --- a/.github/workflows/docker-build-push-cloud-web-container-on-tag.yml +++ b/.github/workflows/docker-build-push-cloud-web-container-on-tag.yml @@ -4,12 +4,12 @@ name: Build and Push Cloud Web Image on Tag on: push: tags: - - '*' + - "*" env: - REGISTRY_IMAGE: danswer/danswer-cloud-web-server + REGISTRY_IMAGE: danswer/danswer-web-server-cloud LATEST_TAG: ${{ contains(github.ref_name, 'latest') }} - + jobs: build: runs-on: @@ -28,11 +28,11 @@ jobs: - name: Prepare run: | platform=${{ matrix.platform }} - echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV - + echo "PLATFORM_PAIR=${platform//\//-}" >> $GITHUB_ENV + - name: Checkout uses: actions/checkout@v4 - + - name: Docker meta id: meta uses: docker/metadata-action@v5 @@ -41,16 +41,16 @@ jobs: tags: | type=raw,value=${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }} type=raw,value=${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }} - + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - + - name: Login to Docker Hub uses: docker/login-action@v3 with: 
username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_TOKEN }} - + - name: Build and push by digest id: build uses: docker/build-push-action@v5 @@ -65,17 +65,17 @@ jobs: NEXT_PUBLIC_POSTHOG_KEY=${{ secrets.POSTHOG_KEY }} NEXT_PUBLIC_POSTHOG_HOST=${{ secrets.POSTHOG_HOST }} NEXT_PUBLIC_SENTRY_DSN=${{ secrets.SENTRY_DSN }} - # needed due to weird interactions with the builds for different platforms + # needed due to weird interactions with the builds for different platforms no-cache: true labels: ${{ steps.meta.outputs.labels }} outputs: type=image,name=${{ env.REGISTRY_IMAGE }},push-by-digest=true,name-canonical=true,push=true - + - name: Export digest run: | mkdir -p /tmp/digests digest="${{ steps.build.outputs.digest }}" - touch "/tmp/digests/${digest#sha256:}" - + touch "/tmp/digests/${digest#sha256:}" + - name: Upload digest uses: actions/upload-artifact@v4 with: @@ -95,42 +95,42 @@ jobs: path: /tmp/digests pattern: digests-* merge-multiple: true - + - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - + - name: Docker meta id: meta uses: docker/metadata-action@v5 with: images: ${{ env.REGISTRY_IMAGE }} - + - name: Login to Docker Hub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_TOKEN }} - + - name: Create manifest list and push working-directory: /tmp/digests run: | docker buildx imagetools create $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \ - $(printf '${{ env.REGISTRY_IMAGE }}@sha256:%s ' *) - + $(printf '${{ env.REGISTRY_IMAGE }}@sha256:%s ' *) + - name: Inspect image run: | docker buildx imagetools inspect ${{ env.REGISTRY_IMAGE }}:${{ steps.meta.outputs.version }} - # trivy has their own rate limiting issues causing this action to flake - # we worked around it by hardcoding to different db repos in env - # can re-enable when they figure it out - # https://github.com/aquasecurity/trivy/discussions/7538 - # https://github.com/aquasecurity/trivy-action/issues/389 + # trivy has their own rate limiting issues causing this action to flake + # we worked around it by hardcoding to different db repos in env + # can re-enable when they figure it out + # https://github.com/aquasecurity/trivy/discussions/7538 + # https://github.com/aquasecurity/trivy-action/issues/389 - name: Run Trivy vulnerability scanner uses: aquasecurity/trivy-action@master env: - TRIVY_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-db:2' - TRIVY_JAVA_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-java-db:1' + TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2" + TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1" with: image-ref: docker.io/${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }} - severity: 'CRITICAL,HIGH' + severity: "CRITICAL,HIGH" diff --git a/.github/workflows/docker-build-push-model-server-container-on-tag.yml b/.github/workflows/docker-build-push-model-server-container-on-tag.yml index c05d233d1e9..3e0445ab04a 100644 --- a/.github/workflows/docker-build-push-model-server-container-on-tag.yml +++ b/.github/workflows/docker-build-push-model-server-container-on-tag.yml @@ -3,53 +3,53 @@ name: Build and Push Model Server Image on Tag on: push: tags: - - '*' + - "*" env: - REGISTRY_IMAGE: danswer/danswer-model-server + REGISTRY_IMAGE: ${{ contains(github.ref_name, 'cloud') && 'danswer/danswer-model-server-cloud' || 'danswer/danswer-model-server' }} LATEST_TAG: ${{ contains(github.ref_name, 'latest') }} - + jobs: build-and-push: # See 
https://runs-on.com/runners/linux/ - runs-on: [runs-on,runner=8cpu-linux-x64,"run-id=${{ github.run_id }}"] + runs-on: [runs-on, runner=8cpu-linux-x64, "run-id=${{ github.run_id }}"] steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Login to Docker Hub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_TOKEN }} - - - name: Model Server Image Docker Build and Push - uses: docker/build-push-action@v5 - with: - context: ./backend - file: ./backend/Dockerfile.model_server - platforms: linux/amd64,linux/arm64 - push: true - tags: | - ${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }} - ${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }} - build-args: | - DANSWER_VERSION=${{ github.ref_name }} - - # trivy has their own rate limiting issues causing this action to flake - # we worked around it by hardcoding to different db repos in env - # can re-enable when they figure it out - # https://github.com/aquasecurity/trivy/discussions/7538 - # https://github.com/aquasecurity/trivy-action/issues/389 - - name: Run Trivy vulnerability scanner - uses: aquasecurity/trivy-action@master - env: - TRIVY_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-db:2' - TRIVY_JAVA_DB_REPOSITORY: 'public.ecr.aws/aquasecurity/trivy-java-db:1' - with: - image-ref: docker.io/danswer/danswer-model-server:${{ github.ref_name }} - severity: 'CRITICAL,HIGH' + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_TOKEN }} + + - name: Model Server Image Docker Build and Push + uses: docker/build-push-action@v5 + with: + context: ./backend + file: ./backend/Dockerfile.model_server + platforms: linux/amd64,linux/arm64 + push: true + tags: | + ${{ env.REGISTRY_IMAGE }}:${{ github.ref_name }} + ${{ env.LATEST_TAG == 'true' && format('{0}:latest', env.REGISTRY_IMAGE) || '' }} + build-args: | + DANSWER_VERSION=${{ github.ref_name }} + + # trivy has their own rate limiting issues causing this action to flake + # we worked around it by hardcoding to different db repos in env + # can re-enable when they figure it out + # https://github.com/aquasecurity/trivy/discussions/7538 + # https://github.com/aquasecurity/trivy-action/issues/389 + - name: Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + env: + TRIVY_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-db:2" + TRIVY_JAVA_DB_REPOSITORY: "public.ecr.aws/aquasecurity/trivy-java-db:1" + with: + image-ref: docker.io/danswer/danswer-model-server:${{ github.ref_name }} + severity: "CRITICAL,HIGH" diff --git a/backend/danswer/background/celery/apps/app_base.py b/backend/danswer/background/celery/apps/app_base.py index 7d3e7644ed9..79e2e9739ae 100644 --- a/backend/danswer/background/celery/apps/app_base.py +++ b/backend/danswer/background/celery/apps/app_base.py @@ -31,7 +31,6 @@ from danswer.utils.logger import ColoredFormatter from danswer.utils.logger import PlainFormatter from danswer.utils.logger import setup_logger -from shared_configs.configs import MULTI_TENANT from shared_configs.configs import SENTRY_DSN @@ -274,10 +273,6 @@ def wait_for_vespa(sender: Any, **kwargs: Any) -> None: def on_secondary_worker_init(sender: Any, **kwargs: Any) -> None: logger.info("Running as a 
secondary celery worker.") - # Exit early if multi-tenant since primary worker check not needed - if MULTI_TENANT: - return - # Set up variables for waiting on primary worker WAIT_INTERVAL = 5 WAIT_LIMIT = 60 diff --git a/backend/danswer/background/celery/apps/beat.py b/backend/danswer/background/celery/apps/beat.py index 979cf07cbb1..8842343ffae 100644 --- a/backend/danswer/background/celery/apps/beat.py +++ b/backend/danswer/background/celery/apps/beat.py @@ -12,6 +12,7 @@ from danswer.db.engine import SqlEngine from danswer.utils.logger import setup_logger from danswer.utils.variable_functionality import fetch_versioned_implementation +from shared_configs.configs import MULTI_TENANT logger = setup_logger(__name__) @@ -143,6 +144,11 @@ def on_beat_init(sender: Any, **kwargs: Any) -> None: # Celery beat shouldn't touch the db at all. But just setting a low minimum here. SqlEngine.set_app_name(POSTGRES_CELERY_BEAT_APP_NAME) SqlEngine.init_engine(pool_size=2, max_overflow=0) + + # Startup checks are not needed in multi-tenant case + if MULTI_TENANT: + return + app_base.wait_for_redis(sender, **kwargs) diff --git a/backend/danswer/background/celery/apps/heavy.py b/backend/danswer/background/celery/apps/heavy.py index c124a6fc246..3f8263267e0 100644 --- a/backend/danswer/background/celery/apps/heavy.py +++ b/backend/danswer/background/celery/apps/heavy.py @@ -13,6 +13,7 @@ from danswer.configs.constants import POSTGRES_CELERY_WORKER_HEAVY_APP_NAME from danswer.db.engine import SqlEngine from danswer.utils.logger import setup_logger +from shared_configs.configs import MULTI_TENANT logger = setup_logger() @@ -60,6 +61,10 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None: SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_HEAVY_APP_NAME) SqlEngine.init_engine(pool_size=4, max_overflow=12) + # Startup checks are not needed in multi-tenant case + if MULTI_TENANT: + return + app_base.wait_for_redis(sender, **kwargs) app_base.wait_for_db(sender, **kwargs) app_base.wait_for_vespa(sender, **kwargs) diff --git a/backend/danswer/background/celery/apps/indexing.py b/backend/danswer/background/celery/apps/indexing.py index 1c4cff425bc..01ec79e5c7b 100644 --- a/backend/danswer/background/celery/apps/indexing.py +++ b/backend/danswer/background/celery/apps/indexing.py @@ -13,6 +13,7 @@ from danswer.configs.constants import POSTGRES_CELERY_WORKER_INDEXING_APP_NAME from danswer.db.engine import SqlEngine from danswer.utils.logger import setup_logger +from shared_configs.configs import MULTI_TENANT logger = setup_logger() @@ -60,6 +61,10 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None: SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_INDEXING_APP_NAME) SqlEngine.init_engine(pool_size=8, max_overflow=0) + # Startup checks are not needed in multi-tenant case + if MULTI_TENANT: + return + app_base.wait_for_redis(sender, **kwargs) app_base.wait_for_db(sender, **kwargs) app_base.wait_for_vespa(sender, **kwargs) diff --git a/backend/danswer/background/celery/apps/light.py b/backend/danswer/background/celery/apps/light.py index fce19ed17c5..354257e9a98 100644 --- a/backend/danswer/background/celery/apps/light.py +++ b/backend/danswer/background/celery/apps/light.py @@ -13,6 +13,7 @@ from danswer.configs.constants import POSTGRES_CELERY_WORKER_LIGHT_APP_NAME from danswer.db.engine import SqlEngine from danswer.utils.logger import setup_logger +from shared_configs.configs import MULTI_TENANT logger = setup_logger() @@ -59,6 +60,9 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None: 
SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_LIGHT_APP_NAME) SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=8) + # Startup checks are not needed in multi-tenant case + if MULTI_TENANT: + return app_base.wait_for_redis(sender, **kwargs) app_base.wait_for_db(sender, **kwargs) diff --git a/backend/danswer/background/celery/apps/primary.py b/backend/danswer/background/celery/apps/primary.py index 5a5ffbb62c2..14d2b006bb8 100644 --- a/backend/danswer/background/celery/apps/primary.py +++ b/backend/danswer/background/celery/apps/primary.py @@ -75,15 +75,16 @@ def on_worker_init(sender: Any, **kwargs: Any) -> None: SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_PRIMARY_APP_NAME) SqlEngine.init_engine(pool_size=8, max_overflow=0) + # Startup checks are not needed in multi-tenant case + if MULTI_TENANT: + return + app_base.wait_for_redis(sender, **kwargs) app_base.wait_for_db(sender, **kwargs) app_base.wait_for_vespa(sender, **kwargs) logger.info("Running as the primary celery worker.") - if MULTI_TENANT: - return - # This is singleton work that should be done on startup exactly once # by the primary worker. This is unnecessary in the multi tenant scenario r = get_redis_client(tenant_id=None) diff --git a/deployment/cloud_kubernetes/keda/celery-worker-auth.yaml b/deployment/cloud_kubernetes/keda/celery-worker-auth.yaml new file mode 100644 index 00000000000..dcd6f92b5b6 --- /dev/null +++ b/deployment/cloud_kubernetes/keda/celery-worker-auth.yaml @@ -0,0 +1,13 @@ +apiVersion: keda.sh/v1alpha1 +kind: TriggerAuthentication +metadata: + name: celery-worker-auth + namespace: danswer +spec: + secretTargetRef: + - parameter: host + name: keda-redis-secret + key: host + - parameter: password + name: keda-redis-secret + key: password diff --git a/deployment/cloud_kubernetes/keda/celery-worker-indexing-scaledobject.yaml b/deployment/cloud_kubernetes/keda/celery-worker-indexing-scaledobject.yaml new file mode 100644 index 00000000000..d3246555214 --- /dev/null +++ b/deployment/cloud_kubernetes/keda/celery-worker-indexing-scaledobject.yaml @@ -0,0 +1,46 @@ +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: celery-worker-indexing-scaledobject + namespace: danswer + labels: + app: celery-worker-indexing +spec: + scaleTargetRef: + name: celery-worker-indexing + minReplicaCount: 1 + maxReplicaCount: 10 + triggers: + - type: redis + metadata: + sslEnabled: "true" + host: "{host}" + port: "6379" + enableTLS: "true" + listName: connector_indexing + listLength: "1" + databaseIndex: "15" + authenticationRef: + name: celery-worker-auth + - type: redis + metadata: + sslEnabled: "true" + host: "{host}" + port: "6379" + enableTLS: "true" + listName: connector_indexing:2 + listLength: "1" + databaseIndex: "15" + authenticationRef: + name: celery-worker-auth + - type: redis + metadata: + sslEnabled: "true" + host: "{host}" + port: "6379" + enableTLS: "true" + listName: connector_indexing:3 + listLength: "1" + databaseIndex: "15" + authenticationRef: + name: celery-worker-auth diff --git a/deployment/cloud_kubernetes/keda/celery-worker-light-scaledobject.yaml b/deployment/cloud_kubernetes/keda/celery-worker-light-scaledobject.yaml new file mode 100644 index 00000000000..bb972683ab5 --- /dev/null +++ b/deployment/cloud_kubernetes/keda/celery-worker-light-scaledobject.yaml @@ -0,0 +1,63 @@ +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: celery-worker-light-scaledobject + namespace: danswer + labels: + app: celery-worker-light +spec: + scaleTargetRef: + name: 
celery-worker-light + minReplicaCount: 1 + maxReplicaCount: 20 + triggers: + - type: redis + metadata: + host: "{host}" + port: "6379" + enableTLS: "true" + listName: vespa_metadata_sync + listLength: "1" + databaseIndex: "15" + authenticationRef: + name: celery-worker-auth + - type: redis + metadata: + host: "{host}" + port: "6379" + enableTLS: "true" + listName: vespa_metadata_sync:2 + listLength: "1" + databaseIndex: "15" + authenticationRef: + name: celery-worker-auth + - type: redis + metadata: + host: "{host}" + port: "6379" + enableTLS: "true" + listName: vespa_metadata_sync:3 + listLength: "1" + databaseIndex: "15" + authenticationRef: + name: celery-worker-auth + - type: redis + metadata: + host: "{host}" + port: "6379" + enableTLS: "true" + listName: connector_deletion + listLength: "1" + databaseIndex: "15" + authenticationRef: + name: celery-worker-auth + - type: redis + metadata: + host: "{host}" + port: "6379" + enableTLS: "true" + listName: connector_deletion:2 + listLength: "1" + databaseIndex: "15" + authenticationRef: + name: celery-worker-auth diff --git a/deployment/cloud_kubernetes/keda/celery-worker-primary-scaledobject.yaml b/deployment/cloud_kubernetes/keda/celery-worker-primary-scaledobject.yaml new file mode 100644 index 00000000000..3de3119cba9 --- /dev/null +++ b/deployment/cloud_kubernetes/keda/celery-worker-primary-scaledobject.yaml @@ -0,0 +1,76 @@ +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: celery-worker-primary-scaledobject + namespace: danswer + labels: + app: celery-worker-primary +spec: + scaleTargetRef: + name: celery-worker-primary + pollingInterval: 15 # Check every 15 seconds + cooldownPeriod: 30 # Wait 30 seconds before scaling down + minReplicaCount: 1 + maxReplicaCount: 1 + triggers: + - type: redis + metadata: + host: "{host}" + port: "6379" + enableTLS: "true" + listName: celery + listLength: "1" + databaseIndex: "15" + authenticationRef: + name: celery-worker-auth + + - type: redis + metadata: + host: "{host}" + port: "6379" + enableTLS: "true" + listName: celery:1 + listLength: "1" + databaseIndex: "15" + authenticationRef: + name: celery-worker-auth + - type: redis + metadata: + host: "{host}" + port: "6379" + enableTLS: "true" + listName: celery:2 + listLength: "1" + databaseIndex: "15" + authenticationRef: + name: celery-worker-auth + - type: redis + metadata: + host: "{host}" + port: "6379" + enableTLS: "true" + listName: celery:3 + listLength: "1" + databaseIndex: "15" + authenticationRef: + name: celery-worker-auth + - type: redis + metadata: + host: "{host}" + port: "6379" + enableTLS: "true" + listName: periodic_tasks + listLength: "1" + databaseIndex: "15" + authenticationRef: + name: celery-worker-auth + - type: redis + metadata: + host: "{host}" + port: "6379" + enableTLS: "true" + listName: periodic_tasks:2 + listLength: "1" + databaseIndex: "15" + authenticationRef: + name: celery-worker-auth diff --git a/deployment/cloud_kubernetes/keda/keda-redis-secret.yaml b/deployment/cloud_kubernetes/keda/keda-redis-secret.yaml new file mode 100644 index 00000000000..a9a986f038b --- /dev/null +++ b/deployment/cloud_kubernetes/keda/keda-redis-secret.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: Secret +metadata: + name: keda-redis-secret + namespace: danswer +type: Opaque +data: + host: { { base64-encoded-hostname } } + password: { { base64-encoded-password } } diff --git a/deployment/cloud_kubernetes/workers/beat.yaml b/deployment/cloud_kubernetes/workers/beat.yaml index 563dbf10435..7f6bf980673 100644 --- 
a/deployment/cloud_kubernetes/workers/beat.yaml +++ b/deployment/cloud_kubernetes/workers/beat.yaml @@ -14,7 +14,7 @@ spec: spec: containers: - name: celery-beat - image: danswer/danswer-backend:v0.11.0-cloud.beta.8 + image: danswer/danswer-backend-cloud:v0.12.0-cloud.beta.2 imagePullPolicy: Always command: [ diff --git a/deployment/cloud_kubernetes/workers/heavy_worker.yaml b/deployment/cloud_kubernetes/workers/heavy_worker.yaml index d8da6a3d3ae..7488b0e9a39 100644 --- a/deployment/cloud_kubernetes/workers/heavy_worker.yaml +++ b/deployment/cloud_kubernetes/workers/heavy_worker.yaml @@ -14,7 +14,7 @@ spec: spec: containers: - name: celery-worker-heavy - image: danswer/danswer-backend:v0.11.0-cloud.beta.8 + image: danswer/danswer-backend-cloud:v0.12.0-cloud.beta.2 imagePullPolicy: Always command: [ diff --git a/deployment/cloud_kubernetes/workers/indexing_worker.yaml b/deployment/cloud_kubernetes/workers/indexing_worker.yaml index 98158f62ef8..9368c2ba8b9 100644 --- a/deployment/cloud_kubernetes/workers/indexing_worker.yaml +++ b/deployment/cloud_kubernetes/workers/indexing_worker.yaml @@ -14,7 +14,7 @@ spec: spec: containers: - name: celery-worker-indexing - image: danswer/danswer-backend:v0.11.0-cloud.beta.8 + image: danswer/danswer-backend-cloud:v0.12.0-cloud.beta.2 imagePullPolicy: Always command: [ @@ -26,6 +26,8 @@ spec: "--hostname=indexing@%n", "-Q", "connector_indexing", + "--prefetch-multiplier=1", + "--concurrency=10", ] env: - name: REDIS_PASSWORD diff --git a/deployment/cloud_kubernetes/workers/light_worker.yaml b/deployment/cloud_kubernetes/workers/light_worker.yaml index 2df3b50ea53..b16c24a9402 100644 --- a/deployment/cloud_kubernetes/workers/light_worker.yaml +++ b/deployment/cloud_kubernetes/workers/light_worker.yaml @@ -14,7 +14,7 @@ spec: spec: containers: - name: celery-worker-light - image: danswer/danswer-backend:v0.11.0-cloud.beta.8 + image: danswer/danswer-backend-cloud:v0.12.0-cloud.beta.2 imagePullPolicy: Always command: [ @@ -26,6 +26,8 @@ spec: "--hostname=light@%n", "-Q", "vespa_metadata_sync,connector_deletion", + "--prefetch-multiplier=1", + "--concurrency=10", ] env: - name: REDIS_PASSWORD diff --git a/deployment/cloud_kubernetes/workers/primary.yaml b/deployment/cloud_kubernetes/workers/primary.yaml index 32e34b5cdfc..7736cebf418 100644 --- a/deployment/cloud_kubernetes/workers/primary.yaml +++ b/deployment/cloud_kubernetes/workers/primary.yaml @@ -14,7 +14,7 @@ spec: spec: containers: - name: celery-worker-primary - image: danswer/danswer-backend:v0.11.0-cloud.beta.8 + image: danswer/danswer-backend-cloud:v0.12.0-cloud.beta.2 imagePullPolicy: Always command: [ @@ -25,7 +25,9 @@ spec: "--loglevel=INFO", "--hostname=primary@%n", "-Q", - "celery,periodic_tasks,vespa_metadata_sync", + "celery,periodic_tasks", + "--prefetch-multiplier=1", + "--concurrency=10", ] env: - name: REDIS_PASSWORD
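
The keda-redis-secret.yaml added above leaves base64 placeholders for the Redis host and password consumed by the celery-worker-auth TriggerAuthentication. A minimal sketch of producing those values, assuming Python is available locally; the hostname and password shown are hypothetical:

import base64

# Hypothetical values; replace with the actual Redis endpoint and password
# referenced by the TriggerAuthentication in celery-worker-auth.yaml.
host = "redis.internal.example.com"
password = "example-password"

print(base64.b64encode(host.encode()).decode())      # value for the `host` key
print(base64.b64encode(password.encode()).decode())  # value for the `password` key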
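
The new ScaledObjects scale each worker deployment on the length of its Celery queues in Redis database 15. A rough way to sanity-check those list lengths against a live broker, assuming the redis-py client and TLS on port 6379 as configured in the triggers; the connection details below are placeholders:

import redis

# Placeholder connection details; db=15 mirrors the databaseIndex used by the ScaledObjects.
r = redis.Redis(
    host="redis.internal.example.com",
    port=6379,
    password="example-password",
    ssl=True,
    db=15,
)

# Base queue names referenced by the primary, light, and indexing ScaledObjects above.
for queue in [
    "celery",
    "periodic_tasks",
    "vespa_metadata_sync",
    "connector_deletion",
    "connector_indexing",
]:
    print(queue, r.llen(queue))

The suffixed list names the triggers also watch (celery:1, connector_indexing:2, and so on) appear to be per-priority sub-queues and can be inspected the same way.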