diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index e1c9d60b0f..a1d79b8db1 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -148,6 +148,8 @@ def __init__(
             minutes=int(os.environ.get("PAUSED_CRAWL_LIMIT_MINUTES", "10080"))
         )
 
+        self.crawl_queue_limit_scale = int(os.environ.get("CRAWL_QUEUE_LIMIT_SCALE", "0"))
+
         self.router = APIRouter(
             prefix="/crawlconfigs",
             tags=["crawlconfigs"],
@@ -1179,6 +1181,8 @@ async def run_now_internal(
         if crawlconfig.proxyId and not self.can_org_use_proxy(org, crawlconfig.proxyId):
             raise HTTPException(status_code=404, detail="proxy_not_found")
 
+        await self.check_if_too_many_waiting_crawls(org)
+
         profile_filename = await self.get_profile_filename(crawlconfig.profileid, org)
         storage_filename = (
             crawlconfig.crawlFilenameTemplate or self.default_filename_template
@@ -1220,6 +1224,21 @@ async def run_now_internal(
             print(traceback.format_exc())
             raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")
 
+    async def check_if_too_many_waiting_crawls(self, org: Organization):
+        """If max concurrent crawls and a queue limit scale are set, limit the number
+        of queued crawls to scale x the concurrent limit; raise 429 if at the limit"""
+        max_concur = org.quotas.maxConcurrentCrawls
+        if not max_concur or not self.crawl_queue_limit_scale:
+            return
+
+        num_waiting = await self.crawls.count_documents(
+            {"oid": org.id, "state": "waiting_org_limit"}
+        )
+        if num_waiting < max_concur * self.crawl_queue_limit_scale:
+            return
+
+        raise HTTPException(status_code=429, detail="slow_down_too_many_crawls_queued")
+
     async def set_config_current_crawl_info(
         self, cid: UUID, crawl_id: str, crawl_start: datetime, user: User
     ):
diff --git a/backend/test_nightly/test_concurrent_crawl_limit.py b/backend/test_nightly/test_concurrent_crawl_limit.py
index 141ec97948..109922b774 100644
--- a/backend/test_nightly/test_concurrent_crawl_limit.py
+++ b/backend/test_nightly/test_concurrent_crawl_limit.py
@@ -20,11 +20,11 @@ def test_set_concurrent_crawl_limit(org_with_quotas, admin_auth_headers):
 
 def test_run_two_only_one_concurrent(org_with_quotas, admin_auth_headers):
     global crawl_id_a
-    crawl_id_a = run_crawl(org_with_quotas, admin_auth_headers)
+    crawl_id_a, _ = run_crawl(org_with_quotas, admin_auth_headers)
     time.sleep(1)
 
     global crawl_id_b
-    crawl_id_b = run_crawl(org_with_quotas, admin_auth_headers)
+    crawl_id_b, _ = run_crawl(org_with_quotas, admin_auth_headers)
 
     while get_crawl_status(org_with_quotas, crawl_id_a, admin_auth_headers) in (
         "starting",
@@ -49,6 +49,18 @@ def test_run_two_only_one_concurrent(org_with_quotas, admin_auth_headers):
         == "waiting_org_limit"
     )
 
+    crawl_id, res = run_crawl(org_with_quotas, admin_auth_headers)
+    assert not crawl_id
+    assert res["errorDetail"] == "slow_down_too_many_crawls_queued"
+
+    config_id = res["id"]
+
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{org_with_quotas}/crawlconfigs/{config_id}/run",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 429
+
 
 def test_cancel_and_run_other(org_with_quotas, admin_auth_headers):
     r = requests.post(
@@ -101,6 +113,6 @@ def run_crawl(org_id, headers):
         headers=headers,
         json=crawl_data,
     )
+    r.raise_for_status()
     data = r.json()
-
-    return data["run_now_job"]
+    return data["run_now_job"], data
diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml
index f87c5643f6..3ec5c7f0bd 100644
--- a/chart/templates/configmap.yaml
+++ b/chart/templates/configmap.yaml
@@ -40,6 +40,8 @@ data:
 
   MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"
 
+  CRAWL_QUEUE_LIMIT_SCALE: "{{ .Values.crawl_queue_limit_scale | default 0 }}"
+
   IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"
 
   RERUN_FROM_MIGRATION: "{{ .Values.rerun_from_migration }}"
diff --git a/chart/test/test-nightly-addons.yaml b/chart/test/test-nightly-addons.yaml
index 02de7ff1f6..7ea9fcd640 100644
--- a/chart/test/test-nightly-addons.yaml
+++ b/chart/test/test-nightly-addons.yaml
@@ -10,6 +10,9 @@ cleanup_job_cron_schedule: "* * * * *"
 # Clean up files > 1 minute old in testing
 cleanup_files_after_minutes: 1
 
+# limit queued crawls to 1x the org concurrent crawl limit
+crawl_queue_limit_scale: 1
+
 # enable to allow access to minio directly
 minio_local_access_port: 30090
 
diff --git a/chart/values.yaml b/chart/values.yaml
index 0eb3a7cf68..60380ce0fa 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -33,6 +33,12 @@ crawler_extract_full_text: to-warc
 # if set, each workflow can have a lower limit, but not higher
 max_pages_per_crawl: 50000
 
+# limit how many crawls can be queued at once, calculated as a
+# scale multiplier applied to the org concurrent crawl limit
+# if 0, the crawl queue is not limited
+# if >0, max queued crawls = scale x concurrent crawl limit
+# crawl_queue_limit_scale: 0
+
 # default template for generate wacz files
 # supports following interpolated vars:
 # @ts - current timestamp
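
Illustrative configuration sketch (not part of the patch): the queue cap is the product of the org's maxConcurrentCrawls quota and the crawl_queue_limit_scale chart value. The override below is a hypothetical example with made-up numbers, shown only to make the arithmetic concrete.

# hypothetical Helm values override (example numbers only)
# with an org quota of maxConcurrentCrawls: 2, up to 2 x 3 = 6 crawls may sit
# in the waiting_org_limit state; the next run request is rejected with
# HTTP 429 and detail slow_down_too_many_crawls_queued
crawl_queue_limit_scale: 3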