Add collection page list/search endpoint (#2354)
Fixes #2353

Adds a new endpoint to list pages in a collection, with filtering
available on `url` (exact match), `urlPrefix`, `ts`, `isSeed`, and
`depth`, along with additional sort options and accompanying tests.

These same filters and sort options have also been added to the crawl
pages endpoint.

Also fixes an issue where a false `isSeed` value was only added at
serialization time rather than stored in the database, which prevented
filtering on that field from working as expected.
tw4l authored Feb 11, 2025
1 parent 001839a commit 98a45b0
Showing 6 changed files with 423 additions and 23 deletions.
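
For reference, a minimal sketch of calling the new collection pages endpoint with the filters described above (the crawl pages endpoint at `/orgs/{oid}/crawls/{crawl_id}/pages` accepts the same query parameters). The base URL, token, and IDs are placeholders, not part of this change:

```python
# Sketch only: list pages in a collection, filtered and sorted via the new
# query parameters. Assumes a reachable deployment and a valid bearer token.
import requests

API_BASE = "http://localhost:30870/api"  # deployment-specific placeholder
TOKEN = "REPLACE_ME"                     # org viewer (or higher) auth token
ORG_ID = "00000000-0000-0000-0000-000000000000"   # placeholder org id
COLL_ID = "11111111-1111-1111-1111-111111111111"  # placeholder collection id

resp = requests.get(
    f"{API_BASE}/orgs/{ORG_ID}/collections/{COLL_ID}/pages",
    headers={"Authorization": f"Bearer {TOKEN}"},
    params={
        "urlPrefix": "https://webrecorder.net/",  # case-insensitive prefix match
        "isSeed": "true",                         # seed pages only
        "depth": 0,
        "sortBy": "ts",
        "sortDirection": -1,
        "pageSize": 25,
        "page": 1,
    },
    timeout=30,
)
resp.raise_for_status()
data = resp.json()
print(f"{data['total']} matching pages")
for item in data["items"]:
    print(item.get("ts"), item.get("url"))
```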
9 changes: 8 additions & 1 deletion backend/btrixcloud/main.py
@@ -248,7 +248,14 @@ def main() -> None:
upload_ops = init_uploads_api(*base_crawl_init)

page_ops = init_pages_api(
app, mdb, crawls, org_ops, storage_ops, background_job_ops, current_active_user
app,
mdb,
crawls,
org_ops,
storage_ops,
background_job_ops,
coll_ops,
current_active_user,
)

base_crawl_ops.set_page_ops(page_ops)
4 changes: 3 additions & 1 deletion backend/btrixcloud/ops.py
@@ -89,7 +89,9 @@ def init_ops() -> Tuple[

upload_ops = UploadOps(*base_crawl_init)

page_ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops)
page_ops = PageOps(
mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops
)

base_crawl_ops.set_page_ops(page_ops)
crawl_ops.set_page_ops(page_ops)
202 changes: 193 additions & 9 deletions backend/btrixcloud/pages.py
@@ -1,8 +1,12 @@
"""crawl pages"""

# pylint: disable=too-many-lines

import asyncio
import os
import re
import traceback
import urllib.parse
from datetime import datetime
from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
from uuid import UUID, uuid4
@@ -37,11 +41,12 @@

if TYPE_CHECKING:
from .background_jobs import BackgroundJobOps
from .colls import CollectionOps
from .crawls import CrawlOps
from .orgs import OrgOps
from .storages import StorageOps
else:
CrawlOps = StorageOps = OrgOps = BackgroundJobOps = object
CrawlOps = StorageOps = OrgOps = BackgroundJobOps = CollectionOps = object


# ============================================================================
@@ -53,14 +58,18 @@ class PageOps:
org_ops: OrgOps
storage_ops: StorageOps
background_job_ops: BackgroundJobOps
coll_ops: CollectionOps

def __init__(self, mdb, crawl_ops, org_ops, storage_ops, background_job_ops):
def __init__(
self, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops
):
self.pages = mdb["pages"]
self.crawls = mdb["crawls"]
self.crawl_ops = crawl_ops
self.org_ops = org_ops
self.storage_ops = storage_ops
self.background_job_ops = background_job_ops
self.coll_ops = coll_ops

async def init_index(self):
"""init index for pages db collection"""
@@ -82,6 +91,9 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100):
if not page_dict.get("url"):
continue

if not page_dict.get("isSeed"):
page_dict["isSeed"] = False

if len(pages_buffer) > batch_size:
await self._add_pages_to_db(crawl_id, pages_buffer)
pages_buffer = []
@@ -210,9 +222,8 @@ async def add_page_to_db(
):
"""Add page to database"""
page = self._get_page_from_dict(page_dict, crawl_id, oid)
page_to_insert = page.to_dict(
exclude_unset=True, exclude_none=True, exclude_defaults=True
)

page_to_insert = page.to_dict(exclude_unset=True, exclude_none=True)

try:
await self.pages.insert_one(page_to_insert)
@@ -492,6 +503,11 @@ async def list_pages(
self,
crawl_id: str,
org: Optional[Organization] = None,
url: Optional[str] = None,
url_prefix: Optional[str] = None,
ts: Optional[datetime] = None,
is_seed: Optional[bool] = None,
depth: Optional[int] = None,
qa_run_id: Optional[str] = None,
qa_filter_by: Optional[str] = None,
qa_gte: Optional[float] = None,
@@ -518,6 +534,23 @@
if org:
query["oid"] = org.id

if url_prefix:
url_prefix = urllib.parse.unquote(url_prefix)
regex_pattern = f"^{re.escape(url_prefix)}"
query["url"] = {"$regex": regex_pattern, "$options": "i"}

elif url:
query["url"] = urllib.parse.unquote(url)

if ts:
query["ts"] = ts

if is_seed in (True, False):
query["isSeed"] = is_seed

if isinstance(depth, int):
query["depth"] = depth

if reviewed:
query["$or"] = [
{"approved": {"$ne": None}},
@@ -562,7 +595,18 @@ async def list_pages(
# Sorting options to add:
# - automated heuristics like screenshot_comparison (dict keyed by QA run id)
# - Ensure notes sorting works okay with notes in list
sort_fields = ("url", "title", "notes", "approved")
sort_fields = (
"url",
"title",
"notes",
"approved",
"ts",
"status",
"mime",
"filename",
"depth",
"isSeed",
)
qa_sort_fields = ("screenshotMatch", "textMatch")
if sort_by not in sort_fields and sort_by not in qa_sort_fields:
raise HTTPException(status_code=400, detail="invalid_sort_by")
@@ -613,6 +657,101 @@ async def list_pages(

return [PageOut.from_dict(data) for data in items], total

async def list_collection_pages(
self,
coll_id: UUID,
org: Optional[Organization] = None,
url: Optional[str] = None,
url_prefix: Optional[str] = None,
ts: Optional[datetime] = None,
is_seed: Optional[bool] = None,
depth: Optional[int] = None,
page_size: int = DEFAULT_PAGE_SIZE,
page: int = 1,
sort_by: Optional[str] = None,
sort_direction: Optional[int] = -1,
) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]:
"""List all pages in collection, with optional filtering"""
# pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements
# Zero-index page for query
page = page - 1
skip = page_size * page

crawl_ids = await self.coll_ops.get_collection_crawl_ids(coll_id)

query: dict[str, object] = {
"crawl_id": {"$in": crawl_ids},
}
if org:
query["oid"] = org.id

if url_prefix:
url_prefix = urllib.parse.unquote(url_prefix)
regex_pattern = f"^{re.escape(url_prefix)}"
query["url"] = {"$regex": regex_pattern, "$options": "i"}

elif url:
query["url"] = urllib.parse.unquote(url)

if ts:
query["ts"] = ts

if is_seed in (True, False):
query["isSeed"] = is_seed

if isinstance(depth, int):
query["depth"] = depth

aggregate = [{"$match": query}]

if sort_by:
# Sorting options to add:
# - automated heuristics like screenshot_comparison (dict keyed by QA run id)
# - Ensure notes sorting works okay with notes in list
sort_fields = (
"url",
"crawl_id",
"ts",
"status",
"mime",
"filename",
"depth",
"isSeed",
)
if sort_by not in sort_fields:
raise HTTPException(status_code=400, detail="invalid_sort_by")
if sort_direction not in (1, -1):
raise HTTPException(status_code=400, detail="invalid_sort_direction")

aggregate.extend([{"$sort": {sort_by: sort_direction}}])

aggregate.extend(
[
{
"$facet": {
"items": [
{"$skip": skip},
{"$limit": page_size},
],
"total": [{"$count": "count"}],
}
},
]
)

# Get total
cursor = self.pages.aggregate(aggregate)
results = await cursor.to_list(length=1)
result = results[0]
items = result["items"]

try:
total = int(result["total"][0]["count"])
except (IndexError, ValueError):
total = 0

return [PageOut.from_dict(data) for data in items], total

async def re_add_crawl_pages(self, crawl_id: str, oid: UUID):
"""Delete existing pages for crawl and re-add from WACZs."""
await self.delete_crawl_pages(crawl_id, oid)
@@ -738,13 +877,14 @@ async def set_archived_item_page_counts(self, crawl_id: str):
# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
def init_pages_api(
app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, user_dep
app, mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops, user_dep
):
"""init pages API"""
# pylint: disable=invalid-name

ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops)
ops = PageOps(mdb, crawl_ops, org_ops, storage_ops, background_job_ops, coll_ops)

org_viewer_dep = org_ops.org_viewer_dep
org_crawl_dep = org_ops.org_crawl_dep

@app.post(
@@ -913,9 +1053,14 @@ async def delete_page_notes(
tags=["pages", "all-crawls"],
response_model=PaginatedPageOutResponse,
)
async def get_pages_list(
async def get_crawl_pages_list(
crawl_id: str,
org: Organization = Depends(org_crawl_dep),
url: Optional[str] = None,
urlPrefix: Optional[str] = None,
ts: Optional[datetime] = None,
isSeed: Optional[bool] = None,
depth: Optional[int] = None,
reviewed: Optional[bool] = None,
approved: Optional[str] = None,
hasNotes: Optional[bool] = None,
@@ -932,6 +1077,11 @@ async def get_pages_list(
pages, total = await ops.list_pages(
crawl_id=crawl_id,
org=org,
url=url,
url_prefix=urlPrefix,
ts=ts,
is_seed=isSeed,
depth=depth,
reviewed=reviewed,
approved=formatted_approved,
has_notes=hasNotes,
@@ -942,6 +1092,40 @@
)
return paginated_format(pages, total, page, pageSize)

@app.get(
"/orgs/{oid}/collections/{coll_id}/pages",
tags=["pages", "collections"],
response_model=PaginatedPageOutResponse,
)
async def get_collection_pages_list(
coll_id: UUID,
org: Organization = Depends(org_viewer_dep),
url: Optional[str] = None,
urlPrefix: Optional[str] = None,
ts: Optional[datetime] = None,
isSeed: Optional[bool] = None,
depth: Optional[int] = None,
pageSize: int = DEFAULT_PAGE_SIZE,
page: int = 1,
sortBy: Optional[str] = None,
sortDirection: Optional[int] = -1,
):
"""Retrieve paginated list of pages in collection"""
pages, total = await ops.list_collection_pages(
coll_id=coll_id,
org=org,
url=url,
url_prefix=urlPrefix,
ts=ts,
is_seed=isSeed,
depth=depth,
page_size=pageSize,
page=page,
sort_by=sortBy,
sort_direction=sortDirection,
)
return paginated_format(pages, total, page, pageSize)

@app.get(
"/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/pages",
tags=["pages", "qa"],
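
To make the new `list_collection_pages` logic above easier to follow, here is a standalone sketch of the query and `$facet` pagination shape it builds. The inputs are hypothetical and the snippet only constructs the pipeline; it is not the production code path:

```python
# Illustrative only: mirrors how list_collection_pages assembles its MongoDB
# pipeline from the filter parameters. Inputs below are made up.
import re
import urllib.parse

crawl_ids = ["crawl-a", "crawl-b"]  # from coll_ops.get_collection_crawl_ids(coll_id)
url_prefix = urllib.parse.unquote("https%3A//webrecorder.net/blog/")
is_seed = False                     # filterable now that False is stored in the db
page_size, page = 25, 1
skip = page_size * (page - 1)       # page is zero-indexed for the query

query = {
    "crawl_id": {"$in": crawl_ids},
    # case-insensitive "starts with" match on the decoded prefix
    "url": {"$regex": f"^{re.escape(url_prefix)}", "$options": "i"},
    "isSeed": is_seed,
}

pipeline = [
    {"$match": query},
    {"$sort": {"ts": -1}},          # one of the allowed sort fields
    {
        "$facet": {
            "items": [{"$skip": skip}, {"$limit": page_size}],
            "total": [{"$count": "count"}],
        }
    },
]
print(pipeline)  # the real endpoint passes this to self.pages.aggregate(...)
```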
2 changes: 1 addition & 1 deletion backend/test/conftest.py
@@ -232,7 +232,7 @@ def crawler_crawl_id(crawler_auth_headers, default_org_id):
"name": "Crawler User Test Crawl",
"description": "crawler test crawl",
"tags": ["wr-test-2"],
"config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 1},
"config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 3},
"crawlerChannel": "test",
}
r = requests.post(