From 58195ea09325de15d69ab557b4beae6fc8014ebe Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 22 Oct 2025 15:31:00 -0400 Subject: [PATCH 1/2] Add additional endpoints and options for crawl tagCounts - Add onlySuccessful and crawlType args to all-crawls tagCounts endpoint - Add tagCounts endpoints for crawls and uploads - Add tests --- backend/btrixcloud/basecrawls.py | 37 ++++++++--- backend/btrixcloud/crawlconfigs.py | 4 +- backend/btrixcloud/crawls.py | 15 +++++ backend/btrixcloud/models.py | 10 +-- backend/btrixcloud/uploads.py | 14 +++++ backend/test/conftest.py | 62 +++++++++++++++++++ backend/test/test_uploads.py | 98 ++++++++++++++++++++++++++++++ 7 files changed, 224 insertions(+), 16 deletions(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index 4a368ef2f0..d94717786d 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -25,7 +25,7 @@ from .models import ( SUCCESSFUL_STATES, - CrawlConfigTags, + TagsResponse, CrawlFile, CrawlFileOut, BaseCrawl, @@ -984,13 +984,22 @@ async def get_org_last_crawl_finished(self, oid: UUID) -> Optional[datetime]: return last_crawl_finished - async def get_all_crawls_tag_counts(self, org: Organization): - """get distinct tags from all archived items for this org""" + async def get_all_crawls_tag_counts( + self, + org: Organization, + only_successful: bool = True, + type_: Optional[str] = None, + ): + """get distinct tags from archived items for this org""" + match_query: Dict[str, Any] = {"oid": org.id} + if only_successful: + match_query["state"] = {"$in": SUCCESSFUL_STATES} + if type_ in ("crawl", "upload"): + match_query["type"] = type_ + tags = await self.crawls.aggregate( [ - # Match only against the states of archived items that might be - # displayed in the frontend - {"$match": {"oid": org.id, "state": {"$in": SUCCESSFUL_STATES}}}, + {"$match": match_query}, {"$unwind": "$tags"}, {"$group": {"_id": "$tags", "count": {"$sum": 1}}}, {"$project": {"tag": "$_id", "count": "$count", "_id": 0}}, @@ -1094,10 +1103,20 @@ async def get_all_crawls_search_values( @app.get( "/orgs/{oid}/all-crawls/tagCounts", tags=["all-crawls"], - response_model=CrawlConfigTags, + response_model=TagsResponse, ) - async def get_all_crawls_tag_counts(org: Organization = Depends(org_viewer_dep)): - return {"tags": await ops.get_all_crawls_tag_counts(org)} + async def get_all_crawls_tag_counts( + org: Organization = Depends(org_viewer_dep), + onlySuccessful: bool = True, + crawlType: Optional[str] = None, + ): + if crawlType and crawlType not in ("crawl", "upload"): + raise HTTPException(status_code=400, detail="invalid_crawl_type") + + tags = await ops.get_all_crawls_tag_counts( + org, only_successful=onlySuccessful, type_=crawlType + ) + return {"tags": tags} @app.get( "/orgs/{oid}/all-crawls/{crawl_id}", diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index e1c9d60b0f..72dc8f6e0d 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -26,7 +26,7 @@ ConfigRevision, CrawlConfig, CrawlConfigOut, - CrawlConfigTags, + TagsResponse, CrawlOut, CrawlOutWithResources, UpdateCrawlConfig, @@ -1622,7 +1622,7 @@ async def get_crawl_config_tags(org: Organization = Depends(org_viewer_dep)): """ return await ops.get_crawl_config_tags(org) - @router.get("/tagCounts", response_model=CrawlConfigTags) + @router.get("/tagCounts", response_model=TagsResponse) async def get_crawl_config_tag_counts(org: Organization = Depends(org_viewer_dep)): 
return {"tags": await ops.get_crawl_config_tag_counts(org)} diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index c6c8780f06..a55597799d 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -78,6 +78,7 @@ CrawlQueueResponse, MatchCrawlQueueResponse, CrawlLogLine, + TagsResponse, ) @@ -1355,6 +1356,20 @@ async def delete_crawls( deleted=count, storageQuotaReached=quota_reached ) + @app.get( + "/orgs/{oid}/crawls/tagCounts", + tags=["crawls"], + response_model=TagsResponse, + ) + async def get_crawls_tag_counts( + org: Organization = Depends(org_viewer_dep), + onlySuccessful: bool = True, + ): + tags = await ops.get_all_crawls_tag_counts( + org, only_successful=onlySuccessful, type_="crawl" + ) + return {"tags": tags} + @app.get("/orgs/all/crawls/stats", tags=["crawls"], response_model=bytes) async def get_all_orgs_crawl_stats( user: User = Depends(user_dep), diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 7652197a72..5f2b4f9da3 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -603,18 +603,18 @@ class CrawlConfigAddedResponse(BaseModel): # ============================================================================ -class CrawlConfigTagCount(BaseModel): - """Response model for crawlconfig tag count""" +class TagCount(BaseModel): + """Response model for crawlconfig/crawl tag count""" tag: str count: int # ============================================================================ -class CrawlConfigTags(BaseModel): - """Response model for crawlconfig tags""" +class TagsResponse(BaseModel): + """Response model for crawlconfig/crawl tags""" - tags: List[CrawlConfigTagCount] + tags: List[TagCount] # ============================================================================ diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py index 23c0f1257b..d874bf603a 100644 --- a/backend/btrixcloud/uploads.py +++ b/backend/btrixcloud/uploads.py @@ -28,6 +28,7 @@ AddedResponseIdQuota, FilePreparer, MIN_UPLOAD_PART_SIZE, + TagsResponse, ) from .pagination import paginated_format, DEFAULT_PAGE_SIZE from .utils import dt_now @@ -362,6 +363,19 @@ async def list_uploads( ) return paginated_format(uploads, total, page, pageSize) + @app.get( + "/orgs/{oid}/uploads/tagCounts", + tags=["uploads"], + response_model=TagsResponse, + ) + async def get_uploads_tag_counts( + org: Organization = Depends(org_viewer_dep), + ): + tags = await ops.get_all_crawls_tag_counts( + org, only_successful=False, type_="upload" + ) + return {"tags": tags} + @app.get( "/orgs/{oid}/uploads/{crawlid}", tags=["uploads"], diff --git a/backend/test/conftest.py b/backend/test/conftest.py index 5f212d0d40..cafa24c7d9 100644 --- a/backend/test/conftest.py +++ b/backend/test/conftest.py @@ -31,6 +31,8 @@ NON_DEFAULT_ORG_NAME = "Non-default org" NON_DEFAULT_ORG_SLUG = "non-default-org" +RUNNING_STATES = ["running", "pending-wait", "generate-wacz", "uploading-wacz"] + FAILED_STATES = ["canceled", "failed", "skipped_quota_reached"] SUCCESSFUL_STATES = ["complete", "stopped_by_user", "stopped_quota_reached"] @@ -266,6 +268,7 @@ def qa_crawl_id(crawler_auth_headers, default_org_id): "runNow": True, "name": "Crawler User Crawl for Testing QA", "description": "crawler test crawl for qa", + "tags": ["qa", "wr-test-1"], "config": {"seeds": [{"url": "https://old.webrecorder.net/"}], "limit": 1}, "crawlerChannel": "test", } @@ -295,6 +298,7 @@ def wr_specs_crawl_id(crawler_auth_headers, default_org_id): crawl_data = { 
"runNow": True, "name": "Webrecorder Specs sample crawl", + "tags": ["wr-test-1"], "config": {"seeds": [{"url": "https://specs.webrecorder.net/"}], "limit": 1}, } r = requests.post( @@ -358,6 +362,7 @@ def auto_add_crawl_id(crawler_auth_headers, default_org_id, auto_add_collection_ "runNow": True, "name": "Auto Add", "description": "For testing auto-adding new workflow crawls to collections", + "tags": ["wr-test-1"], "autoAddCollections": [auto_add_collection_id], "config": { "seeds": [{"url": "https://old.webrecorder.net/"}], @@ -399,6 +404,7 @@ def all_crawls_crawl_id(crawler_auth_headers, default_org_id): "runNow": True, "name": "All Crawls Test Crawl", "description": "Lorem ipsum", + "tags": ["all-crawls", "wr-test-2"], "config": { "seeds": [{"url": "https://old.webrecorder.net/"}], "exclude": "community", @@ -458,6 +464,7 @@ def all_crawls_delete_crawl_ids(admin_auth_headers, default_org_id): "runNow": True, "name": "All Crawls Delete Test Workflow", "description": "Lorem ipsum", + "tags": ["wr-test-1", "to-delete"], "config": { "seeds": [{"url": "https://old.webrecorder.net/"}], "exclude": "community", @@ -520,6 +527,7 @@ def custom_behaviors_crawl_id(admin_auth_headers, default_org_id): crawl_data = { "runNow": True, "name": "Custom Behavior Logs", + "tags": ["behaviors", "wr-test-1"], "config": { "seeds": [{"url": "https://specs.webrecorder.net/"}], "customBehaviors": [ @@ -551,6 +559,59 @@ def custom_behaviors_crawl_id(admin_auth_headers, default_org_id): return crawl_id +@pytest.fixture(scope="session") +def canceled_crawl_id(admin_auth_headers, default_org_id): + crawl_data = { + "runNow": True, + "name": "Canceled crawl", + "tags": ["canceled"], + "config": { + "seeds": [{"url": "https://old.webrecorder.net/"}], + "limit": 5, + }, + "browserWindows": 1, + } + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", + headers=admin_auth_headers, + json=crawl_data, + ) + data = r.json() + + crawl_id = data["run_now_job"] + + # Cancel crawl after it's started + while True: + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json", + headers=admin_auth_headers, + ) + data = r.json() + if data["state"] in RUNNING_STATES: + break + time.sleep(5) + + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/cancel", + headers=admin_auth_headers, + ) + data = r.json() + assert data["success"] == True + + # Wait until crawl finishes + while True: + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json", + headers=admin_auth_headers, + ) + data = r.json() + if data["state"] in FINISHED_STATES: + break + time.sleep(5) + + return crawl_id + + @pytest.fixture(scope="session") def url_list_config_id(crawler_auth_headers, default_org_id): # Start crawl. 
@@ -558,6 +619,7 @@ def url_list_config_id(crawler_auth_headers, default_org_id): "runNow": False, "name": "URL List config", "description": "Contains 3 seeds", + "tags": ["wr-test-1", "seed-list"], "config": { "seeds": [ {"url": "https://old.webrecorder.net"}, diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py index c388d75860..cc23dbb11c 100644 --- a/backend/test/test_uploads.py +++ b/backend/test/test_uploads.py @@ -1065,6 +1065,104 @@ def test_clear_all_presigned_urls( assert r.json()["success"] +def test_all_crawls_tag_counts(crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/tagCounts", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json() == { + "tags": [ + {"tag": "wr-test-1", "count": 3}, + {"tag": "wr-test-2", "count": 2}, + {"tag": "all-crawls", "count": 1}, + {"tag": "behaviors", "count": 1}, + {"tag": "four", "count": 1}, + {"tag": "qa", "count": 1}, + {"tag": "three", "count": 1}, + {"tag": "wr-test-1-updated-again", "count": 1}, + {"tag": "wr-test-2-updated-again", "count": 1}, + ] + } + + +def test_all_crawls_tag_counts_including_failed( + crawler_auth_headers, default_org_id, canceled_crawl_id +): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/tagCounts?onlySuccessful=false", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json() == { + "tags": [ + {"tag": "wr-test-1", "count": 3}, + {"tag": "wr-test-2", "count": 2}, + {"tag": "all-crawls", "count": 1}, + {"tag": "behaviors", "count": 1}, + {"tag": "canceled", "count": 1}, + {"tag": "four", "count": 1}, + {"tag": "qa", "count": 1}, + {"tag": "three", "count": 1}, + {"tag": "wr-test-1-updated-again", "count": 1}, + {"tag": "wr-test-2-updated-again", "count": 1}, + ] + } + + +def test_crawls_tag_counts(crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/tagCounts", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json() == { + "tags": [ + {"tag": "wr-test-1", "count": 3}, + {"tag": "wr-test-2", "count": 2}, + {"tag": "all-crawls", "count": 1}, + {"tag": "behaviors", "count": 1}, + {"tag": "qa", "count": 1}, + ] + } + + +def test_crawls_tag_counts_including_failed( + crawler_auth_headers, default_org_id, canceled_crawl_id +): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/tagCounts?onlySuccessful=false", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json() == { + "tags": [ + {"tag": "wr-test-1", "count": 3}, + {"tag": "wr-test-2", "count": 2}, + {"tag": "all-crawls", "count": 1}, + {"tag": "behaviors", "count": 1}, + {"tag": "canceled", "count": 1}, + {"tag": "qa", "count": 1}, + ] + } + + +def test_uploads_tag_counts(crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/uploads/tagCounts", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json() == { + "tags": [ + {"tag": "four", "count": 1}, + {"tag": "three", "count": 1}, + {"tag": "wr-test-1-updated-again", "count": 1}, + {"tag": "wr-test-2-updated-again", "count": 1}, + ] + } + + def test_delete_form_upload_and_crawls_from_all_crawls( admin_auth_headers, crawler_auth_headers, From d6897fb3c0816b9e18914a0a16caec4c773221d4 Mon Sep 17 00:00:00 2001 From: sua yoo Date: Tue, 28 Oct 2025 13:46:37 -0700 Subject: [PATCH 2/2] task: Filter tags by item type (#2934) Shows only tags belonging to selected item type in the 
archived item tags filter.
---
 .../archived-items/archived-item-tag-filter.ts     | 17 ++++++++++++++---
 frontend/src/pages/org/archived-items.ts           |  1 +
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/frontend/src/features/archived-items/archived-item-tag-filter.ts b/frontend/src/features/archived-items/archived-item-tag-filter.ts
index 1cb1b2afba..0f2652b989 100644
--- a/frontend/src/features/archived-items/archived-item-tag-filter.ts
+++ b/frontend/src/features/archived-items/archived-item-tag-filter.ts
@@ -17,10 +17,12 @@ import {
   state,
 } from "lit/decorators.js";
 import { repeat } from "lit/directives/repeat.js";
+import queryString from "query-string";
 import { isFocusable } from "tabbable";
 
 import { BtrixElement } from "@/classes/BtrixElement";
 import type { BtrixChangeEvent } from "@/events/btrix-change";
+import type { ArchivedItem } from "@/types/crawler";
 import { type WorkflowTag, type WorkflowTags } from "@/types/workflow";
 import { stopProp } from "@/utils/events";
 import { isNotEqual } from "@/utils/is-not-equal";
@@ -44,6 +46,9 @@ export class ArchivedItemTagFilter extends BtrixElement {
   @property({ type: Array })
   tags?: string[];
 
+  @property({ type: String })
+  itemType?: ArchivedItem["type"];
+
   @state()
   private searchString = "";
 
@@ -93,9 +98,15 @@ export class ArchivedItemTagFilter extends BtrixElement {
   }
 
   private readonly orgTagsTask = new Task(this, {
-    task: async () => {
+    task: async ([itemType], { signal }) => {
+      const query = queryString.stringify({
+        onlySuccessful: true,
+        crawlType: itemType,
+      });
+
       const { tags } = await this.api.fetch<WorkflowTags>(
-        `/orgs/${this.orgId}/all-crawls/tagCounts`,
+        `/orgs/${this.orgId}/all-crawls/tagCounts?${query}`,
+        { signal },
       );
 
       this.fuse.setCollection(tags);
@@ -103,7 +114,7 @@ export class ArchivedItemTagFilter extends BtrixElement {
       // Match fuse shape
       return tags.map((item) => ({ item }));
     },
-    args: () => [] as const,
+    args: () => [this.itemType] as const,
   });
 
   render() {
diff --git a/frontend/src/pages/org/archived-items.ts b/frontend/src/pages/org/archived-items.ts
index c273895958..de7efd49bc 100644
--- a/frontend/src/pages/org/archived-items.ts
+++ b/frontend/src/pages/org/archived-items.ts
@@ -641,6 +641,7 @@ export class CrawlsList extends BtrixElement {
         <btrix-archived-item-tag-filter
+          .itemType=${this.itemType}
           .tags=${this.filterByTags.value}
           @btrix-change=${(e) => {
             this.filterByTags.setValue(e.detail.value?.tags);
            this.filterByTagsType.setValue(e.detail.value?.type || "or");
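
---

Reviewer note: below is a quick way to exercise the new tagCounts surface end to end. This is a minimal sketch, not part of the patch; the base URL, org ID, and bearer token are placeholders (mirroring the test suite's `API_PREFIX` convention), while the endpoint paths, the `onlySuccessful`/`crawlType` query parameters, and the `TagsResponse` shape all come from the diff above.

```python
import requests

API_PREFIX = "http://localhost:30870/api"  # assumed local deployment; adjust as needed
HEADERS = {"Authorization": "Bearer <access-token>"}  # placeholder token
ORG_ID = "<org-uuid>"  # placeholder org id


def tag_counts(scope: str, **params) -> list[dict]:
    """Fetch tag counts for one of: all-crawls, crawls, uploads."""
    r = requests.get(
        f"{API_PREFIX}/orgs/{ORG_ID}/{scope}/tagCounts",
        headers=HEADERS,
        params=params,
    )
    r.raise_for_status()
    # Response model is TagsResponse: {"tags": [{"tag": "...", "count": N}, ...]}
    return r.json()["tags"]


# Default: tags from successfully finished items only
print(tag_counts("all-crawls"))

# Include canceled/failed items, restricted to crawls -- the same filtering
# the frontend tag filter requests via its crawlType query parameter
print(tag_counts("all-crawls", onlySuccessful="false", crawlType="crawl"))

# Type-specific endpoints added by the first commit; an unknown crawlType
# on the all-crawls endpoint returns 400 invalid_crawl_type
print(tag_counts("crawls", onlySuccessful="false"))
print(tag_counts("uploads"))
```

One design note worth confirming: the uploads endpoint hard-codes `only_successful=False` on the backend, which reads as intentional since uploads never pass through crawl states, so `onlySuccessful` would filter out every upload tag.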
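On the frontend side, the second commit keys the filter's `orgTagsTask` on `itemType`, so switching the archived-items view between crawls and uploads re-runs the task with `crawlType` set accordingly (e.g. `onlySuccessful=true&crawlType=upload`) and aborts any stale in-flight request via the `signal` the Task runner provides. The upshot, assuming my reading of the Task wiring is right, is that the tag dropdown only ever offers tags that can actually match items in the current view.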