From 58195ea09325de15d69ab557b4beae6fc8014ebe Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 22 Oct 2025 15:31:00 -0400 Subject: [PATCH 1/2] Add additional endpoints and options for crawl tagCounts - Add onlySuccessful and crawlType args to all-crawls tagCounts endpoint - Add tagCounts endpoints for crawls and uploads - Add tests --- backend/btrixcloud/basecrawls.py | 37 ++++++++--- backend/btrixcloud/crawlconfigs.py | 4 +- backend/btrixcloud/crawls.py | 15 +++++ backend/btrixcloud/models.py | 10 +-- backend/btrixcloud/uploads.py | 14 +++++ backend/test/conftest.py | 62 +++++++++++++++++++ backend/test/test_uploads.py | 98 ++++++++++++++++++++++++++++++ 7 files changed, 224 insertions(+), 16 deletions(-) diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index 4a368ef2f0..d94717786d 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -25,7 +25,7 @@ from .models import ( SUCCESSFUL_STATES, - CrawlConfigTags, + TagsResponse, CrawlFile, CrawlFileOut, BaseCrawl, @@ -984,13 +984,22 @@ async def get_org_last_crawl_finished(self, oid: UUID) -> Optional[datetime]: return last_crawl_finished - async def get_all_crawls_tag_counts(self, org: Organization): - """get distinct tags from all archived items for this org""" + async def get_all_crawls_tag_counts( + self, + org: Organization, + only_successful: bool = True, + type_: Optional[str] = None, + ): + """get distinct tags from archived items for this org""" + match_query: Dict[str, Any] = {"oid": org.id} + if only_successful: + match_query["state"] = {"$in": SUCCESSFUL_STATES} + if type_ in ("crawl", "upload"): + match_query["type"] = type_ + tags = await self.crawls.aggregate( [ - # Match only against the states of archived items that might be - # displayed in the frontend - {"$match": {"oid": org.id, "state": {"$in": SUCCESSFUL_STATES}}}, + {"$match": match_query}, {"$unwind": "$tags"}, {"$group": {"_id": "$tags", "count": {"$sum": 1}}}, {"$project": {"tag": "$_id", "count": "$count", "_id": 0}}, @@ -1094,10 +1103,20 @@ async def get_all_crawls_search_values( @app.get( "/orgs/{oid}/all-crawls/tagCounts", tags=["all-crawls"], - response_model=CrawlConfigTags, + response_model=TagsResponse, ) - async def get_all_crawls_tag_counts(org: Organization = Depends(org_viewer_dep)): - return {"tags": await ops.get_all_crawls_tag_counts(org)} + async def get_all_crawls_tag_counts( + org: Organization = Depends(org_viewer_dep), + onlySuccessful: bool = True, + crawlType: Optional[str] = None, + ): + if crawlType and crawlType not in ("crawl", "upload"): + raise HTTPException(status_code=400, detail="invalid_crawl_type") + + tags = await ops.get_all_crawls_tag_counts( + org, only_successful=onlySuccessful, type_=crawlType + ) + return {"tags": tags} @app.get( "/orgs/{oid}/all-crawls/{crawl_id}", diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index e1c9d60b0f..72dc8f6e0d 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -26,7 +26,7 @@ ConfigRevision, CrawlConfig, CrawlConfigOut, - CrawlConfigTags, + TagsResponse, CrawlOut, CrawlOutWithResources, UpdateCrawlConfig, @@ -1622,7 +1622,7 @@ async def get_crawl_config_tags(org: Organization = Depends(org_viewer_dep)): """ return await ops.get_crawl_config_tags(org) - @router.get("/tagCounts", response_model=CrawlConfigTags) + @router.get("/tagCounts", response_model=TagsResponse) async def get_crawl_config_tag_counts(org: Organization = Depends(org_viewer_dep)): 
return {"tags": await ops.get_crawl_config_tag_counts(org)} diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index c6c8780f06..a55597799d 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -78,6 +78,7 @@ CrawlQueueResponse, MatchCrawlQueueResponse, CrawlLogLine, + TagsResponse, ) @@ -1355,6 +1356,20 @@ async def delete_crawls( deleted=count, storageQuotaReached=quota_reached ) + @app.get( + "/orgs/{oid}/crawls/tagCounts", + tags=["crawls"], + response_model=TagsResponse, + ) + async def get_crawls_tag_counts( + org: Organization = Depends(org_viewer_dep), + onlySuccessful: bool = True, + ): + tags = await ops.get_all_crawls_tag_counts( + org, only_successful=onlySuccessful, type_="crawl" + ) + return {"tags": tags} + @app.get("/orgs/all/crawls/stats", tags=["crawls"], response_model=bytes) async def get_all_orgs_crawl_stats( user: User = Depends(user_dep), diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 7652197a72..5f2b4f9da3 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -603,18 +603,18 @@ class CrawlConfigAddedResponse(BaseModel): # ============================================================================ -class CrawlConfigTagCount(BaseModel): - """Response model for crawlconfig tag count""" +class TagCount(BaseModel): + """Response model for crawlconfig/crawl tag count""" tag: str count: int # ============================================================================ -class CrawlConfigTags(BaseModel): - """Response model for crawlconfig tags""" +class TagsResponse(BaseModel): + """Response model for crawlconfig/crawl tags""" - tags: List[CrawlConfigTagCount] + tags: List[TagCount] # ============================================================================ diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py index 23c0f1257b..d874bf603a 100644 --- a/backend/btrixcloud/uploads.py +++ b/backend/btrixcloud/uploads.py @@ -28,6 +28,7 @@ AddedResponseIdQuota, FilePreparer, MIN_UPLOAD_PART_SIZE, + TagsResponse, ) from .pagination import paginated_format, DEFAULT_PAGE_SIZE from .utils import dt_now @@ -362,6 +363,19 @@ async def list_uploads( ) return paginated_format(uploads, total, page, pageSize) + @app.get( + "/orgs/{oid}/uploads/tagCounts", + tags=["uploads"], + response_model=TagsResponse, + ) + async def get_uploads_tag_counts( + org: Organization = Depends(org_viewer_dep), + ): + tags = await ops.get_all_crawls_tag_counts( + org, only_successful=False, type_="upload" + ) + return {"tags": tags} + @app.get( "/orgs/{oid}/uploads/{crawlid}", tags=["uploads"], diff --git a/backend/test/conftest.py b/backend/test/conftest.py index 5f212d0d40..cafa24c7d9 100644 --- a/backend/test/conftest.py +++ b/backend/test/conftest.py @@ -31,6 +31,8 @@ NON_DEFAULT_ORG_NAME = "Non-default org" NON_DEFAULT_ORG_SLUG = "non-default-org" +RUNNING_STATES = ["running", "pending-wait", "generate-wacz", "uploading-wacz"] + FAILED_STATES = ["canceled", "failed", "skipped_quota_reached"] SUCCESSFUL_STATES = ["complete", "stopped_by_user", "stopped_quota_reached"] @@ -266,6 +268,7 @@ def qa_crawl_id(crawler_auth_headers, default_org_id): "runNow": True, "name": "Crawler User Crawl for Testing QA", "description": "crawler test crawl for qa", + "tags": ["qa", "wr-test-1"], "config": {"seeds": [{"url": "https://old.webrecorder.net/"}], "limit": 1}, "crawlerChannel": "test", } @@ -295,6 +298,7 @@ def wr_specs_crawl_id(crawler_auth_headers, default_org_id): crawl_data = { 
"runNow": True, "name": "Webrecorder Specs sample crawl", + "tags": ["wr-test-1"], "config": {"seeds": [{"url": "https://specs.webrecorder.net/"}], "limit": 1}, } r = requests.post( @@ -358,6 +362,7 @@ def auto_add_crawl_id(crawler_auth_headers, default_org_id, auto_add_collection_ "runNow": True, "name": "Auto Add", "description": "For testing auto-adding new workflow crawls to collections", + "tags": ["wr-test-1"], "autoAddCollections": [auto_add_collection_id], "config": { "seeds": [{"url": "https://old.webrecorder.net/"}], @@ -399,6 +404,7 @@ def all_crawls_crawl_id(crawler_auth_headers, default_org_id): "runNow": True, "name": "All Crawls Test Crawl", "description": "Lorem ipsum", + "tags": ["all-crawls", "wr-test-2"], "config": { "seeds": [{"url": "https://old.webrecorder.net/"}], "exclude": "community", @@ -458,6 +464,7 @@ def all_crawls_delete_crawl_ids(admin_auth_headers, default_org_id): "runNow": True, "name": "All Crawls Delete Test Workflow", "description": "Lorem ipsum", + "tags": ["wr-test-1", "to-delete"], "config": { "seeds": [{"url": "https://old.webrecorder.net/"}], "exclude": "community", @@ -520,6 +527,7 @@ def custom_behaviors_crawl_id(admin_auth_headers, default_org_id): crawl_data = { "runNow": True, "name": "Custom Behavior Logs", + "tags": ["behaviors", "wr-test-1"], "config": { "seeds": [{"url": "https://specs.webrecorder.net/"}], "customBehaviors": [ @@ -551,6 +559,59 @@ def custom_behaviors_crawl_id(admin_auth_headers, default_org_id): return crawl_id +@pytest.fixture(scope="session") +def canceled_crawl_id(admin_auth_headers, default_org_id): + crawl_data = { + "runNow": True, + "name": "Canceled crawl", + "tags": ["canceled"], + "config": { + "seeds": [{"url": "https://old.webrecorder.net/"}], + "limit": 5, + }, + "browserWindows": 1, + } + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/", + headers=admin_auth_headers, + json=crawl_data, + ) + data = r.json() + + crawl_id = data["run_now_job"] + + # Cancel crawl after it's started + while True: + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json", + headers=admin_auth_headers, + ) + data = r.json() + if data["state"] in RUNNING_STATES: + break + time.sleep(5) + + r = requests.post( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/cancel", + headers=admin_auth_headers, + ) + data = r.json() + assert data["success"] == True + + # Wait until crawl finishes + while True: + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json", + headers=admin_auth_headers, + ) + data = r.json() + if data["state"] in FINISHED_STATES: + break + time.sleep(5) + + return crawl_id + + @pytest.fixture(scope="session") def url_list_config_id(crawler_auth_headers, default_org_id): # Start crawl. 
@@ -558,6 +619,7 @@ def url_list_config_id(crawler_auth_headers, default_org_id): "runNow": False, "name": "URL List config", "description": "Contains 3 seeds", + "tags": ["wr-test-1", "seed-list"], "config": { "seeds": [ {"url": "https://old.webrecorder.net"}, diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py index c388d75860..cc23dbb11c 100644 --- a/backend/test/test_uploads.py +++ b/backend/test/test_uploads.py @@ -1065,6 +1065,104 @@ def test_clear_all_presigned_urls( assert r.json()["success"] +def test_all_crawls_tag_counts(crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/tagCounts", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json() == { + "tags": [ + {"tag": "wr-test-1", "count": 3}, + {"tag": "wr-test-2", "count": 2}, + {"tag": "all-crawls", "count": 1}, + {"tag": "behaviors", "count": 1}, + {"tag": "four", "count": 1}, + {"tag": "qa", "count": 1}, + {"tag": "three", "count": 1}, + {"tag": "wr-test-1-updated-again", "count": 1}, + {"tag": "wr-test-2-updated-again", "count": 1}, + ] + } + + +def test_all_crawls_tag_counts_including_failed( + crawler_auth_headers, default_org_id, canceled_crawl_id +): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/tagCounts?onlySuccessful=false", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json() == { + "tags": [ + {"tag": "wr-test-1", "count": 3}, + {"tag": "wr-test-2", "count": 2}, + {"tag": "all-crawls", "count": 1}, + {"tag": "behaviors", "count": 1}, + {"tag": "canceled", "count": 1}, + {"tag": "four", "count": 1}, + {"tag": "qa", "count": 1}, + {"tag": "three", "count": 1}, + {"tag": "wr-test-1-updated-again", "count": 1}, + {"tag": "wr-test-2-updated-again", "count": 1}, + ] + } + + +def test_crawls_tag_counts(crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/tagCounts", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json() == { + "tags": [ + {"tag": "wr-test-1", "count": 3}, + {"tag": "wr-test-2", "count": 2}, + {"tag": "all-crawls", "count": 1}, + {"tag": "behaviors", "count": 1}, + {"tag": "qa", "count": 1}, + ] + } + + +def test_crawls_tag_counts_including_failed( + crawler_auth_headers, default_org_id, canceled_crawl_id +): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/crawls/tagCounts?onlySuccessful=false", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json() == { + "tags": [ + {"tag": "wr-test-1", "count": 3}, + {"tag": "wr-test-2", "count": 2}, + {"tag": "all-crawls", "count": 1}, + {"tag": "behaviors", "count": 1}, + {"tag": "canceled", "count": 1}, + {"tag": "qa", "count": 1}, + ] + } + + +def test_uploads_tag_counts(crawler_auth_headers, default_org_id): + r = requests.get( + f"{API_PREFIX}/orgs/{default_org_id}/uploads/tagCounts", + headers=crawler_auth_headers, + ) + assert r.status_code == 200 + assert r.json() == { + "tags": [ + {"tag": "four", "count": 1}, + {"tag": "three", "count": 1}, + {"tag": "wr-test-1-updated-again", "count": 1}, + {"tag": "wr-test-2-updated-again", "count": 1}, + ] + } + + def test_delete_form_upload_and_crawls_from_all_crawls( admin_auth_headers, crawler_auth_headers, From d6897fb3c0816b9e18914a0a16caec4c773221d4 Mon Sep 17 00:00:00 2001 From: sua yoo Date: Tue, 28 Oct 2025 13:46:37 -0700 Subject: [PATCH 2/2] task: Filter tags by item type (#2934) Shows only tags belonging to selected item type in the 
archived item tags filter.
---
 .../archived-items/archived-item-tag-filter.ts     | 17 ++++++++++++++---
 frontend/src/pages/org/archived-items.ts           |  1 +
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/frontend/src/features/archived-items/archived-item-tag-filter.ts b/frontend/src/features/archived-items/archived-item-tag-filter.ts
index 1cb1b2afba..0f2652b989 100644
--- a/frontend/src/features/archived-items/archived-item-tag-filter.ts
+++ b/frontend/src/features/archived-items/archived-item-tag-filter.ts
@@ -17,10 +17,12 @@ import {
   state,
 } from "lit/decorators.js";
 import { repeat } from "lit/directives/repeat.js";
+import queryString from "query-string";
 import { isFocusable } from "tabbable";
 
 import { BtrixElement } from "@/classes/BtrixElement";
 import type { BtrixChangeEvent } from "@/events/btrix-change";
+import type { ArchivedItem } from "@/types/crawler";
 import { type WorkflowTag, type WorkflowTags } from "@/types/workflow";
 import { stopProp } from "@/utils/events";
 import { isNotEqual } from "@/utils/is-not-equal";
@@ -44,6 +46,9 @@ export class ArchivedItemTagFilter extends BtrixElement {
   @property({ type: Array })
   tags?: string[];
 
+  @property({ type: String })
+  itemType?: ArchivedItem["type"];
+
   @state()
   private searchString = "";
 
@@ -93,9 +98,15 @@ export class ArchivedItemTagFilter extends BtrixElement {
   }
 
   private readonly orgTagsTask = new Task(this, {
-    task: async () => {
+    task: async ([itemType], { signal }) => {
+      const query = queryString.stringify({
+        onlySuccessful: true,
+        crawlType: itemType,
+      });
+
       const { tags } = await this.api.fetch<WorkflowTags>(
-        `/orgs/${this.orgId}/all-crawls/tagCounts`,
+        `/orgs/${this.orgId}/all-crawls/tagCounts?${query}`,
+        { signal },
       );
 
       this.fuse.setCollection(tags);
@@ -103,7 +114,7 @@ export class ArchivedItemTagFilter extends BtrixElement {
       // Match fuse shape
       return tags.map((item) => ({ item }));
     },
-    args: () => [] as const,
+    args: () => [this.itemType] as const,
   });
 
   render() {
diff --git a/frontend/src/pages/org/archived-items.ts b/frontend/src/pages/org/archived-items.ts
index c273895958..de7efd49bc 100644
--- a/frontend/src/pages/org/archived-items.ts
+++ b/frontend/src/pages/org/archived-items.ts
@@ -641,6 +641,7 @@ export class CrawlsList extends BtrixElement {
         <btrix-archived-item-tag-filter
+          .itemType=${this.itemType}
           .tags=${this.filterByTags.value}
           @btrix-change=${(e) => {
             this.filterByTags.setValue(e.detail.value?.tags);
            this.filterByTagsType.setValue(e.detail.value?.type || "or");
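
---

Reviewer note: below is a quick way to exercise the new tagCounts surface end to end. This is a minimal sketch, not part of the patch; the base URL, org ID, and bearer token are placeholders (mirroring the test suite's `API_PREFIX` convention), while the endpoint paths, the `onlySuccessful`/`crawlType` query parameters, and the `TagsResponse` shape all come from the diff above.

```python
import requests

API_PREFIX = "http://localhost:30870/api"  # assumed local deployment; adjust as needed
HEADERS = {"Authorization": "Bearer <access-token>"}  # placeholder token
ORG_ID = "<org-uuid>"  # placeholder org id


def tag_counts(scope: str, **params) -> list[dict]:
    """Fetch tag counts for one of: all-crawls, crawls, uploads."""
    r = requests.get(
        f"{API_PREFIX}/orgs/{ORG_ID}/{scope}/tagCounts",
        headers=HEADERS,
        params=params,
    )
    r.raise_for_status()
    # Response model is TagsResponse: {"tags": [{"tag": "...", "count": N}, ...]}
    return r.json()["tags"]


# Default: tags from successfully finished items only
print(tag_counts("all-crawls"))

# Include canceled/failed items, restricted to crawls -- the same filtering
# the frontend tag filter requests via its crawlType query parameter
print(tag_counts("all-crawls", onlySuccessful="false", crawlType="crawl"))

# Type-specific endpoints added by the first commit; an unknown crawlType
# on the all-crawls endpoint returns 400 invalid_crawl_type
print(tag_counts("crawls", onlySuccessful="false"))
print(tag_counts("uploads"))
```

One design note worth confirming: the uploads endpoint hard-codes `only_successful=False` on the backend, which reads as intentional since uploads never pass through crawl states, so `onlySuccessful` would filter out every upload tag.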
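On the frontend side, the second commit keys the filter's `orgTagsTask` on `itemType`, so switching the archived-items view between crawls and uploads re-runs the task with `crawlType` set accordingly (e.g. `onlySuccessful=true&crawlType=upload`) and aborts any stale in-flight request via the `signal` the Task runner provides. The upshot, assuming my reading of the Task wiring is right, is that the tag dropdown only ever offers tags that can actually match items in the current view.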