Add collection page list/search endpoint (#2354)
Fixes #2353

Adds a new endpoint to list pages in a collection, with filtering
available on `url` (exact match), `urlPrefix`, `ts`, `isSeed`, and
`depth`, along with additional sort options and accompanying tests.

These same filters and sort options have also been added to the crawl
pages endpoint.

Also fixes an issue where a false `isSeed` value was only added at
serialization time rather than stored in the database, which prevented
filtering on that field from working as expected.
tw4l authored Feb 11, 2025
1 parent 001839a commit 98a45b0
Showing 6 changed files with 423 additions and 23 deletions.
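
For reference, a minimal sketch of calling the new collection pages endpoint with the filters described above (the crawl pages endpoint at `/orgs/{oid}/crawls/{crawl_id}/pages` accepts the same query parameters). The base URL, token, and IDs are placeholders, not part of this change:

```python
# Sketch only: list pages in a collection, filtered and sorted via the new
# query parameters. Assumes a reachable deployment and a valid bearer token.
import requests

API_BASE = "http://localhost:30870/api"  # deployment-specific placeholder
TOKEN = "REPLACE_ME"                     # org viewer (or higher) auth token
ORG_ID = "00000000-0000-0000-0000-000000000000"   # placeholder org id
COLL_ID = "11111111-1111-1111-1111-111111111111"  # placeholder collection id

resp = requests.get(
    f"{API_BASE}/orgs/{ORG_ID}/collections/{COLL_ID}/pages",
    headers={"Authorization": f"Bearer {TOKEN}"},
    params={
        "urlPrefix": "https://webrecorder.net/",  # case-insensitive prefix match
        "isSeed": "true",                         # seed pages only
        "depth": 0,
        "sortBy": "ts",
        "sortDirection": -1,
        "pageSize": 25,
        "page": 1,
    },
    timeout=30,
)
resp.raise_for_status()
data = resp.json()
print(f"{data['total']} matching pages")
for item in data["items"]:
    print(item.get("ts"), item.get("url"))
```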
9 changes: 8 additions & 1 deletion backend/btrixcloud/main.py
@@ -248,7 +248,14 @@ def main() -> None:
upload_ops = init_uploads_api(*base_crawl_init)

page_ops = init_pages_api(
app, mdb, crawls, org_ops, storage_ops, background_job_ops, current_active_user
app,
mdb,
crawls,
org_ops,
storage_ops,
background_job_ops,
coll_ops,
current_active_user,
)

base_crawl_ops.set_page_ops(page_ops)
4 changes: 3 additions & 1 deletion backend/btrixcloud/ops.py
@@ -89,7 +89,9 @@ def init_ops() -> Tuple[

upload_ops = UploadOps(*base_crawl_init)

page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops)
page_ops = PageOps(
mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops
)

base_crawl_ops.set_page_ops(page_ops)
crawl_ops.set_page_ops(page_ops)
202 changes: 193 additions & 9 deletions backend/btrixcloud/pages.py
@@ -1,8 +1,12 @@
"""crawl pages"""

# pylint: disable=too-many-lines

import asyncio
import os
import re
import traceback
import urllib.parse
from datetime import datetime
from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
from uuid import UUID, uuid4
@@ -37,11 +41,12 @@

if TYPE_CHECKING:
from .background_jobs import BackgroundJobOps
from .colls import CollectionOps
from .crawls import CrawlOps
from .orgs import OrgOps
from .storages import StorageOps
else:
CrawlOps = StorageOps = OrgOps = BackgroundJobOps = object
CrawlOps = StorageOps = OrgOps = BackgroundJobOps = CollectionOps = object


# ============================================================================
@@ -53,14 +58,18 @@ class PageOps:
org_ops: OrgOps
storage_ops: StorageOps
background_job_ops: BackgroundJobOps
coll_ops: CollectionOps

def __init__(self, mdb, crawl_ops, org_ops, storage_ops, background_job_ops):
def __init__(
self, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops
):
self.pages = mdb["pages"]
self.crawls = mdb["crawls"]
self.crawl_ops = crawl_ops
self.org_ops = org_ops
self.storage_ops = storage_ops
self.background_job_ops = background_job_ops
self.coll_ops = coll_ops

async def init_index(self):
"""init index for pages db collection"""
@@ -82,6 +91,9 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):
if not page_dict.get("url"):
continue

if not page_dict.get("isSeed"):
page_dict["isSeed"] = False

if len(pages_buffer) > batch_size:
await self._add_pages_to_db(crawl_id, pages_buffer)
pages_buffer = []
@@ -210,9 +222,8 @@ async def add_page_to_db(
):
"""Add page to database"""
page = self._get_page_from_dict(page_dict, crawl_id, oid)
page_to_insert = page.to_dict(
exclude_unset=True, exclude_none=True, exclude_defaults=True
)

page_to_insert = page.to_dict(exclude_unset=True, exclude_none=True)

try:
await self.pages.insert_one(page_to_insert)
@@ -492,6 +503,11 @@ async def list_pages(
self,
crawl_id: str,
org: Optional[Organization] = None,
url: Optional[str] = None,
url_prefix: Optional[str] = None,
ts: Optional[datetime] = None,
is_seed: Optional[bool] = None,
depth: Optional[int] = None,
qa_run_id: Optional[str] = None,
qa_filter_by: Optional[str] = None,
qa_gte: Optional[float] = None,
@@ -518,6 +534,23 @@
if org:
query["oid"] = org.id

if url_prefix:
url_prefix = urllib.parse.unquote(url_prefix)
regex_pattern = f"^{re.escape(url_prefix)}"
query["url"] = {"$regex": regex_pattern, "$options": "i"}

elif url:
query["url"] = urllib.parse.unquote(url)

if ts:
query["ts"] = ts

if is_seed in (True, False):
query["isSeed"] = is_seed

if isinstance(depth, int):
query["depth"] = depth

if reviewed:
query["$or"] = [
{"approved": {"$ne": None}},
@@ -562,7 +595,18 @@ async def list_pages(
# Sorting options to add:
# - automated heuristics like screenshot_comparison (dict keyed by QA run id)
# - Ensure notes sorting works okay with notes in list
sort_fields = ("url", "title", "notes", "approved")
sort_fields = (
"url",
"title",
"notes",
"approved",
"ts",
"status",
"mime",
"filename",
"depth",
"isSeed",
)
qa_sort_fields = ("screenshotMatch", "textMatch")
if sort_by not in sort_fields and sort_by not in qa_sort_fields:
raise HTTPException(status_code=400, detail="invalid_sort_by")
@@ -613,6 +657,101 @@ async def list_pages(

return [PageOut.from_dict(data) for data in items], total

async def list_collection_pages(
self,
coll_id: UUID,
org: Optional[Organization] = None,
url: Optional[str] = None,
url_prefix: Optional[str] = None,
ts: Optional[datetime] = None,
is_seed: Optional[bool] = None,
depth: Optional[int] = None,
page_size: int = DEFAULT_PAGE_SIZE,
page: int = 1,
sort_by: Optional[str] = None,
sort_direction: Optional[int] = -1,
) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]:
"""List all pages in collection, with optional filtering"""
# pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements
# Zero-index page for query
page = page - 1
skip = page_size * page

crawl_ids = await self.coll_ops.get_collection_crawl_ids(coll_id)

query: dict[str, object] = {
"crawl_id": {"$in": crawl_ids},
}
if org:
query["oid"] = org.id

if url_prefix:
url_prefix = urllib.parse.unquote(url_prefix)
regex_pattern = f"^{re.escape(url_prefix)}"
query["url"] = {"$regex": regex_pattern, "$options": "i"}

elif url:
query["url"] = urllib.parse.unquote(url)

if ts:
query["ts"] = ts

if is_seed in (True, False):
query["isSeed"] = is_seed

if isinstance(depth, int):
query["depth"] = depth

aggregate = [{"$match": query}]

if sort_by:
# Sorting options to add:
# - automated heuristics like screenshot_comparison (dict keyed by QA run id)
# - Ensure notes sorting works okay with notes in list
sort_fields = (
"url",
"crawl_id",
"ts",
"status",
"mime",
"filename",
"depth",
"isSeed",
)
if sort_by not in sort_fields:
raise HTTPException(status_code=400, detail="invalid_sort_by")
if sort_direction not in (1, -1):
raise HTTPException(status_code=400, detail="invalid_sort_direction")

aggregate.extend([{"$sort": {sort_by: sort_direction}}])

aggregate.extend(
[
{
"$facet": {
"items": [
{"$skip": skip},
{"$limit": page_size},
],
"total": [{"$count": "count"}],
}
},
]
)

# Get total
cursor = self.pages.aggregate(aggregate)
results = await cursor.to_list(length=1)
result = results[0]
items = result["items"]

try:
total = int(result["total"][0]["count"])
except (IndexError, ValueError):
total = 0

return [PageOut.from_dict(data) for data in items], total

async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
"""Delete existing pages for crawl and re-add from WACZs."""
await self.delete_crawl_pages(crawl_id, oid)
@@ -738,13 +877,14 @@ async def set_archived_item_page_counts(self, crawl_id: str):
# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
def init_pages_api(
app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, user_dep
app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops, user_dep
):
"""init pages API"""
# pylint: disable=invalid-name

ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops)
ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops)

org_viewer_dep = org_ops.org_viewer_dep
org_crawl_dep = org_ops.org_crawl_dep

@app.post(
@@ -913,9 +1053,14 @@ async def delete_page_notes(
tags=["pages", "all-crawls"],
response_model=PaginatedPageOutResponse,
)
async def get_pages_list(
async def get_crawl_pages_list(
crawl_id: str,
org: Organization = Depends(org_crawl_dep),
url: Optional[str] = None,
urlPrefix: Optional[str] = None,
ts: Optional[datetime] = None,
isSeed: Optional[bool] = None,
depth: Optional[int] = None,
reviewed: Optional[bool] = None,
approved: Optional[str] = None,
hasNotes: Optional[bool] = None,
@@ -932,6 +1077,11 @@ async def get_pages_list(
pages, total = await ops.list_pages(
crawl_id=crawl_id,
org=org,
url=url,
url_prefix=urlPrefix,
ts=ts,
is_seed=isSeed,
depth=depth,
reviewed=reviewed,
approved=formatted_approved,
has_notes=hasNotes,
@@ -942,6 +1092,40 @@
)
return paginated_format(pages, total, page, pageSize)

@app.get(
"/orgs/{oid}/collections/{coll_id}/pages",
tags=["pages", "collections"],
response_model=PaginatedPageOutResponse,
)
async def get_collection_pages_list(
coll_id: UUID,
org: Organization = Depends(org_viewer_dep),
url: Optional[str] = None,
urlPrefix: Optional[str] = None,
ts: Optional[datetime] = None,
isSeed: Optional[bool] = None,
depth: Optional[int] = None,
pageSize: int = DEFAULT_PAGE_SIZE,
page: int = 1,
sortBy: Optional[str] = None,
sortDirection: Optional[int] = -1,
):
"""Retrieve paginated list of pages in collection"""
pages, total = await ops.list_collection_pages(
coll_id=coll_id,
org=org,
url=url,
url_prefix=urlPrefix,
ts=ts,
is_seed=isSeed,
depth=depth,
page_size=pageSize,
page=page,
sort_by=sortBy,
sort_direction=sortDirection,
)
return paginated_format(pages, total, page, pageSize)

@app.get(
"/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/pages",
tags=["pages", "qa"],
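
To make the new `list_collection_pages` logic above easier to follow, here is a standalone sketch of the query and `$facet` pagination shape it builds. The inputs are hypothetical and the snippet only constructs the pipeline; it is not the production code path:

```python
# Illustrative only: mirrors how list_collection_pages assembles its MongoDB
# pipeline from the filter parameters. Inputs below are made up.
import re
import urllib.parse

crawl_ids = ["crawl-a", "crawl-b"]  # from coll_ops.get_collection_crawl_ids(coll_id)
url_prefix = urllib.parse.unquote("https%3A//webrecorder.net/blog/")
is_seed = False                     # filterable now that False is stored in the db
page_size, page = 25, 1
skip = page_size * (page - 1)       # page is zero-indexed for the query

query = {
    "crawl_id": {"$in": crawl_ids},
    # case-insensitive "starts with" match on the decoded prefix
    "url": {"$regex": f"^{re.escape(url_prefix)}", "$options": "i"},
    "isSeed": is_seed,
}

pipeline = [
    {"$match": query},
    {"$sort": {"ts": -1}},          # one of the allowed sort fields
    {
        "$facet": {
            "items": [{"$skip": skip}, {"$limit": page_size}],
            "total": [{"$count": "count"}],
        }
    },
]
print(pipeline)  # the real endpoint passes this to self.pages.aggregate(...)
```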
2 changes: 1 addition & 1 deletion backend/test/conftest.py
@@ -232,7 +232,7 @@ def crawler_crawl_id(crawler_auth_headers, default_org_id):
"name": "Crawler User Test Crawl",
"description": "crawler test crawl",
"tags": ["wr-test-2"],
"config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 1},
"config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 3},
"crawlerChannel": "test",
}
r = requests.post(