diff --git a/backend/btrixcloud/operator.py b/backend/btrixcloud/operator.py
index 49b8331766..af85f9458a 100644
--- a/backend/btrixcloud/operator.py
+++ b/backend/btrixcloud/operator.py
@@ -649,9 +649,9 @@ async def set_state(self, state, status, crawl_id, allowed_from, **kwargs):
         from starting to running:
          - starting -> running
 
-        from running to complete or partial_complete:
+        from running to complete or complete[:stopReason]:
+         - running -> complete[:stopReason]
          - running -> complete
-         - running -> partial_complete
 
         from starting or running to waiting for capacity (pods pending) and back:
          - starting -> waiting_capacity
@@ -659,8 +659,8 @@ async def set_state(self, state, status, crawl_id, allowed_from, **kwargs):
          - waiting_capacity -> running
 
         from any state to canceled or failed:
-         - not complete or partial_complete -> canceled
-         - not complete or partial_complete -> failed
+         - not complete[:stopReason] -> canceled
+         - not complete[:stopReason] -> failed
         """
         if not allowed_from or status.state in allowed_from:
             res = await self.crawl_ops.update_crawl_state_if_allowed(
@@ -1280,15 +1280,14 @@ async def add_file_to_crawl(self, cc_data, crawl, redis):
 
         return True
 
-    async def is_crawl_stopping(self, crawl: CrawlSpec, status: CrawlStatus) -> None:
+    async def is_crawl_stopping(
+        self, crawl: CrawlSpec, status: CrawlStatus
+    ) -> Optional[str]:
         """check if crawl is stopping and set reason"""
-
         # if user requested stop, then enter stopping phase
         if crawl.stopping:
             print("Graceful Stop: User requested stop")
-            status.stopping = True
-            status.stopReason = "user-stop"
-            return
+            return "user-stop"
 
         # check timeout if timeout time exceeds elapsed time
         if crawl.timeout:
@@ -1300,28 +1299,21 @@
             print(
                 f"Graceful Stop: Crawl running time exceeded {crawl.timeout} second timeout"
             )
-            status.stopping = True
-            status.stopReason = "time-limit"
-            return
+            return "time-limit"
 
         # crawl size limit
         if crawl.max_crawl_size and status.size > crawl.max_crawl_size:
             print(f"Graceful Stop: Maximum crawl size {crawl.max_crawl_size} hit")
-            status.stopping = True
-            status.stopReason = "size-limit"
-            return
+            return "size-limit"
 
         # check exec time quotas and stop if reached limit
         if await self.org_ops.exec_mins_quota_reached(crawl.oid):
-            status.stopping = True
-            status.stopReason = "exec-time-quota"
-            return
+            return "exec-time-quota"
 
         if self.max_pages_per_crawl and status.pagesFound >= self.max_pages_per_crawl:
-            # will stop on its own
-            status.stopping = False
-            status.stopReason = "page-limit"
-            return
+            return "page-limit"
+
+        return None
 
     async def get_redis_crawl_stats(self, redis: Redis, crawl_id: str):
         """get page stats"""
@@ -1368,7 +1360,9 @@ async def update_crawl_state(
             pod_info = status.podStatus[key]
             pod_info.used.storage = value
 
-        await self.is_crawl_stopping(crawl, status)
+        if not status.stopReason:
+            status.stopReason = await self.is_crawl_stopping(crawl, status)
+        status.stopping = status.stopReason is not None
 
         # mark crawl as stopping
         if status.stopping:
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 365e03d44c..d16d5f9d33 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -24,3 +24,6 @@ backoff>=2.2.1
 python-slugify>=8.0.1
 mypy_boto3_s3
 types_aiobotocore_s3
+types-redis
+types-python-slugify
+types-pyYAML
diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py
index a50ab55c84..a2e4b42319 100644
--- a/backend/test/test_crawlconfigs.py
+++ b/backend/test/test_crawlconfigs.py
@@ -2,7 +2,7 @@
 
 import requests
 
-from .conftest import API_PREFIX
+from .conftest import API_PREFIX, SUCCESSFUL_STATES
 
 cid = None
 
@@ -361,7 +361,7 @@ def test_incremental_workflow_total_size_and_last_crawl_stats(
             headers=crawler_auth_headers,
         )
         data = r.json()
-        if data["state"] == "complete":
+        if data["state"] in SUCCESSFUL_STATES:
             break
         time.sleep(5)
 
diff --git a/backend/test/test_webhooks.py b/backend/test/test_webhooks.py
index 17ded4d71b..4e74d7142d 100644
--- a/backend/test/test_webhooks.py
+++ b/backend/test/test_webhooks.py
@@ -4,7 +4,7 @@
 
 import requests
 
-from .conftest import API_PREFIX
+from .conftest import API_PREFIX, SUCCESSFUL_STATES
 from .utils import read_in_chunks
 
 _webhook_event_id = None
@@ -191,7 +191,7 @@ def test_webhooks_sent(
             headers=admin_auth_headers,
         )
         data = r.json()
-        if data["state"] == "complete":
+        if data["state"] in SUCCESSFUL_STATES:
             break
         time.sleep(5)
 
diff --git a/backend/test/test_workflow_auto_add_to_collection.py b/backend/test/test_workflow_auto_add_to_collection.py
index bf3a16bcd7..1fd6e186e8 100644
--- a/backend/test/test_workflow_auto_add_to_collection.py
+++ b/backend/test/test_workflow_auto_add_to_collection.py
@@ -1,7 +1,7 @@
 import requests
 import time
 
-from .conftest import API_PREFIX
+from .conftest import API_PREFIX, SUCCESSFUL_STATES
 
 
 def test_workflow_crawl_auto_added_to_collection(
@@ -50,7 +50,7 @@ def test_workflow_crawl_auto_added_subsequent_runs(
             headers=crawler_auth_headers,
         )
         data = r.json()
-        if data["state"] == "complete":
+        if data["state"] in SUCCESSFUL_STATES:
             break
         time.sleep(5)
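For context on the `operator.py` refactor, here is a minimal, self-contained sketch of the pattern this diff introduces: the stop check returns an `Optional[str]` reason instead of mutating status, and the caller derives both `stopReason` and `stopping` from the return value. The `CrawlSpec`/`CrawlStatus` dataclasses, the synchronous signature, and the `elapsed` parameter below are simplified stand-ins for illustration, not the actual btrixcloud models.

```python
# Sketch only: simplified stand-ins for the real CrawlSpec/CrawlStatus
# models, with a plain function in place of the async method.
from dataclasses import dataclass
from typing import Optional


@dataclass
class CrawlSpec:
    stopping: bool = False   # user requested stop
    timeout: int = 0         # max running time in seconds (0 = no limit)
    max_crawl_size: int = 0  # max size in bytes (0 = no limit)


@dataclass
class CrawlStatus:
    size: int = 0
    stopping: bool = False
    stopReason: Optional[str] = None


def is_crawl_stopping(
    crawl: CrawlSpec, status: CrawlStatus, elapsed: int
) -> Optional[str]:
    """Return a stop reason, or None if the crawl should keep running."""
    if crawl.stopping:
        return "user-stop"
    if crawl.timeout and elapsed > crawl.timeout:
        return "time-limit"
    if crawl.max_crawl_size and status.size > crawl.max_crawl_size:
        return "size-limit"
    return None


# Caller, mirroring update_crawl_state: only set stopReason if it is
# not already set (first reason wins), then derive the boolean flag.
crawl = CrawlSpec(timeout=60)
status = CrawlStatus()
if not status.stopReason:
    status.stopReason = is_crawl_stopping(crawl, status, elapsed=120)
status.stopping = status.stopReason is not None
assert status.stopReason == "time-limit" and status.stopping
```

Keeping the check side-effect-free means an already-recorded `stopReason` is never overwritten by a later check, and `stopping` can no longer drift out of sync with `stopReason`, since it is always computed from it in one place.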