diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py
index ee852bf336..7fe555d363 100644
--- a/backend/btrixcloud/basecrawls.py
+++ b/backend/btrixcloud/basecrawls.py
@@ -171,14 +171,15 @@ async def get_crawl_out(
         res["collections"] = await self.colls.get_collection_names(coll_ids)
 
         if res.get("version", 1) == 2:
-            res["initialPages"] = await self.page_ops.list_replay_query_pages(
+            res["initialPages"], _ = await self.page_ops.list_pages(
                 crawl_ids=[crawlid], is_seed=True, page_size=25
             )
 
            oid = res.get("oid")
            if oid:
                res["pagesQueryUrl"] = (
-                    get_origin(headers) + f"/api/orgs/{oid}/crawls/{crawlid}/pages"
+                    get_origin(headers)
+                    + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
                )
 
         crawl = CrawlOutWithResources.from_dict(res)
diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py
index 0e263056cc..8d0f385d55 100644
--- a/backend/btrixcloud/colls.py
+++ b/backend/btrixcloud/colls.py
@@ -42,7 +42,6 @@
     OrgPublicCollections,
     PublicOrgDetails,
     CollAccessType,
-    PageOut,
     UpdateCollHomeUrl,
     User,
     ImageFile,
@@ -346,8 +345,7 @@ async def get_collection_out(
             await self.get_collection_crawl_resources(coll_id)
         )
 
-        initial_pages: List[PageOut] = await self.page_ops.list_replay_query_pages(
-            coll_id,
+        initial_pages, _ = await self.page_ops.list_pages(
             crawl_ids=crawl_ids,
             page_size=25,
         )
diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py
index 353f87bd70..9d4b190e33 100644
--- a/backend/btrixcloud/crawlmanager.py
+++ b/backend/btrixcloud/crawlmanager.py
@@ -198,6 +198,7 @@ async def _run_bg_job_with_ops_classes(
             "job_type": job_type,
             "backend_image": os.environ.get("BACKEND_IMAGE", ""),
             "pull_policy": os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""),
+            "larger_resources": True,
             **kwargs,
         }
         if oid:
diff --git a/backend/btrixcloud/operator/bgjobs.py b/backend/btrixcloud/operator/bgjobs.py
index dc7cb63d77..e9c1a3f28c 100644
--- a/backend/btrixcloud/operator/bgjobs.py
+++ b/backend/btrixcloud/operator/bgjobs.py
@@ -38,7 +38,12 @@ async def finalize_background_job(self, data: MCDecoratorSyncData) -> dict:
         job_id: str = labels.get("job_id") or metadata.get("name")
 
         status = data.object["status"]
-        success = status.get("succeeded") == 1
+        spec = data.object["spec"]
+        success = status.get("succeeded") == spec.get("parallelism")
+        if not success:
+            print(
+                f"Succeeded: {status.get('succeeded')}, Num Pods: {spec.get('parallelism')}"
+            )
         completion_time = status.get("completionTime")
 
         finalized = True
diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py
index fc293508d4..4a7361ea18 100644
--- a/backend/btrixcloud/pages.py
+++ b/backend/btrixcloud/pages.py
@@ -3,7 +3,6 @@
 # pylint: disable=too-many-lines
 
 import asyncio
-import re
 import traceback
 import urllib.parse
 from datetime import datetime
@@ -495,7 +494,10 @@ async def delete_page_notes(
 
     async def list_pages(
         self,
-        crawl_id: str,
+        coll_id: Optional[UUID] = None,
+        crawl_ids: Optional[List[str]] = None,
+        public_or_unlisted_only=False,
+        # pylint: disable=unused-argument
         org: Optional[Organization] = None,
         search: Optional[str] = None,
         url: Optional[str] = None,
@@ -516,6 +518,7 @@ async def list_pages(
         page: int = 1,
         sort_by: Optional[str] = None,
         sort_direction: Optional[int] = -1,
+        include_total=False,
     ) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]:
         """List all pages in crawl"""
         # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements
@@ -523,26 +526,45 @@ async def list_pages(
         page = page - 1
         skip = page_size * page
 
+        # Crawl or Collection Selection
+        if coll_id:
+            if crawl_ids:
+                # both coll_id and crawl_ids, error
+                raise HTTPException(
+                    status_code=400,
+                    detail="only one of crawl_ids or coll_id can be provided",
+                )
+
+            crawl_ids = await self.coll_ops.get_collection_crawl_ids(
+                coll_id, public_or_unlisted_only
+            )
+        elif not crawl_ids:
+            # neither coll_id nor crawl_ids, error
+            raise HTTPException(
+                status_code=400, detail="either crawl_ids or coll_id must be provided"
+            )
+
         query: dict[str, object] = {
-            "crawl_id": crawl_id,
+            "crawl_id": {"$in": crawl_ids},
         }
-        if org:
-            query["oid"] = org.id
+        # if org:
+        #    query["oid"] = org.id
 
+        # Text Search
+        is_text_search = False
         if search:
-            search_regex = re.escape(urllib.parse.unquote(search))
-            query["$or"] = [
-                {"url": {"$regex": search_regex, "$options": "i"}},
-                {"title": {"$regex": search_regex, "$options": "i"}},
-            ]
-
-        if url_prefix:
-            url_prefix = urllib.parse.unquote(url_prefix)
-            regex_pattern = f"^{re.escape(url_prefix)}"
-            query["url"] = {"$regex": regex_pattern, "$options": "i"}
+            search = urllib.parse.unquote(search)
+            if search.startswith("http:") or search.startswith("https:"):
+                query["url"] = {"$gte": search}
+            else:
+                query["$text"] = {"$search": search}
+                is_text_search = True
 
-        elif url:
+        # URL Filtering
+        if url:
             query["url"] = urllib.parse.unquote(url)
+        elif url_prefix:
+            query["url"] = {"$gte": urllib.parse.unquote(url_prefix)}
 
         if ts:
             query["ts"] = ts
@@ -553,6 +575,7 @@ async def list_pages(
         if isinstance(depth, int):
             query["depth"] = depth
 
+        # QA Settings
         if reviewed:
             query["$or"] = [
                 {"approved": {"$ne": None}},
@@ -591,8 +614,14 @@ async def list_pages(
 
             query[f"qa.{qa_run_id}.{qa_filter_by}"] = range_filter
 
-        aggregate = [{"$match": query}]
+        aggregate: List[Dict[str, Union[int, object]]] = [{"$match": query}]
+
+        # Extra QA Set
+        if qa_run_id:
+            aggregate.extend([{"$set": {"qa": f"$qa.{qa_run_id}"}}])
+            # aggregate.extend([{"$project": {"qa": f"$qa.{qa_run_id}"}}])
 
+        # Sorting
         if sort_by:
             # Sorting options to add:
             # - automated heuristics like screenshot_comparison (dict keyed by QA run id)
@@ -625,33 +654,52 @@ async def list_pages(
 
             aggregate.extend([{"$sort": {sort_by: sort_direction}}])
 
-        if qa_run_id:
-            aggregate.extend([{"$set": {"qa": f"$qa.{qa_run_id}"}}])
-            # aggregate.extend([{"$project": {"qa": f"$qa.{qa_run_id}"}}])
+        # default sort with search
+        elif search or url_prefix:
+            if is_text_search:
+                aggregate.extend(
+                    [
+                        {"$sort": {"score": {"$meta": "textScore"}}},
+                    ]
+                )
+            else:
+                aggregate.extend([{"$sort": {"url": 1}}])
+        else:
+            # default sort: seeds first, then by timestamp
+            aggregate.extend([{"$sort": {"isSeed": -1, "ts": 1}}])
 
-        aggregate.extend(
-            [
-                {
-                    "$facet": {
-                        "items": [
-                            {"$skip": skip},
-                            {"$limit": page_size},
-                        ],
-                        "total": [{"$count": "count"}],
+        if include_total:
+            aggregate.extend(
+                [
+                    {
+                        "$facet": {
+                            "items": [
+                                {"$skip": skip},
+                                {"$limit": page_size},
+                            ],
+                            "total": [{"$count": "count"}],
+                        }
                     }
-                },
-            ]
-        )
+                ]
+            )
 
-        # Get total
-        cursor = self.pages.aggregate(aggregate)
-        results = await cursor.to_list(length=1)
-        result = results[0]
-        items = result["items"]
+            cursor = self.pages.aggregate(aggregate)
+            results = await cursor.to_list(length=1)
+            result = results[0]
+            items = result["items"]
 
-        try:
-            total = int(result["total"][0]["count"])
-        except (IndexError, ValueError):
+            try:
+                total = int(result["total"][0]["count"])
+            except (IndexError, ValueError, KeyError):
+                total = 0
+
+        else:
+            if skip:
+                aggregate.extend([{"$skip": skip}])
+
+            aggregate.extend([{"$limit": page_size}])
+            cursor = self.pages.aggregate(aggregate)
+            items = await cursor.to_list(page_size)
             total = 0
 
         if qa_run_id:
@@ -667,35 +715,18 @@ async def list_page_url_counts(
     ) -> List[PageUrlCount]:
         """List all page URLs in collection sorted desc by snapshot count unless prefix is specified"""
-        # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements
-        # Zero-index page for query
-
         crawl_ids = await self.coll_ops.get_collection_crawl_ids(coll_id)
 
-        match_query: dict[str, object] = {"crawl_id": {"$in": crawl_ids}}
-        sort_query: dict[str, int] = {"isSeed": -1, "ts": 1}
-
-        if url_prefix:
-            url_prefix = urllib.parse.unquote(url_prefix)
-            # regex_pattern = f"^{re.escape(url_prefix)}"
-            # match_query["url"] = {"$regex": regex_pattern, "$options": "i"}
-            match_query["url"] = {"$gte": url_prefix}
-            sort_query = {"url": 1}
-
-        aggregate: List[Dict[str, Union[int, object]]] = [
-            {"$match": match_query},
-            {"$sort": sort_query},
-        ]
-
-        aggregate.append({"$limit": page_size * len(crawl_ids)})
-
-        cursor = self.pages.aggregate(aggregate)
-        results = await cursor.to_list(length=page_size * len(crawl_ids))
+        pages, _ = await self.list_pages(
+            crawl_ids=crawl_ids,
+            url_prefix=url_prefix,
+            page_size=page_size * len(crawl_ids),
+        )
 
         url_counts: dict[str, PageUrlCount] = {}
 
-        for result in results:
-            url = result.get("url")
+        for page in pages:
+            url = page.url
             count = url_counts.get(url)
             if not count:
                 # if already at max pages, this would add a new page, so we're done
@@ -705,125 +736,15 @@
                     url_counts[url] = count
                 count.snapshots.append(
                     PageIdTimestamp(
-                        pageId=result.get("_id"),
-                        ts=result.get("ts"),
-                        status=result.get("status", 200),
+                        pageId=page.id,
+                        ts=page.ts,
+                        status=page.status or 200,
                     )
                 )
                 count.count += 1
 
         return list(url_counts.values())
 
-    async def list_replay_query_pages(
-        self,
-        coll_id: Optional[UUID] = None,
-        crawl_ids: Optional[List[str]] = None,
-        org: Optional[Organization] = None,
-        search: Optional[str] = None,
-        url: Optional[str] = None,
-        url_prefix: Optional[str] = None,
-        ts: Optional[datetime] = None,
-        is_seed: Optional[bool] = None,
-        depth: Optional[int] = None,
-        page_size: int = DEFAULT_PAGE_SIZE,
-        page: int = 1,
-        sort_by: Optional[str] = None,
-        sort_direction: Optional[int] = -1,
-        public_or_unlisted_only=False,
-    ) -> List[PageOut]:
-        """Query pages in collection, with filtering sorting.
-        No total returned for optimization"""
-        # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements
-        # Zero-index page for query
-        page = page - 1
-        skip = page_size * page
-
-        if crawl_ids is None and coll_id is None:
-            raise HTTPException(
-                status_code=400, detail="either crawl_ids or coll_id must be provided"
-            )
-
-        if coll_id and crawl_ids is None:
-            crawl_ids = await self.coll_ops.get_collection_crawl_ids(
-                coll_id, public_or_unlisted_only
-            )
-
-        query: dict[str, object] = {
-            "crawl_id": {"$in": crawl_ids},
-        }
-        if org:
-            query["oid"] = org.id
-
-        is_text_search = False
-        if search:
-            search = urllib.parse.unquote(search)
-            if search.startswith("http:") or search.startswith("https:"):
-                query["url"] = {"$gte": search}
-            else:
-                query["$text"] = {"$search": search}
-                is_text_search = True
-
-        elif url_prefix:
-            url_prefix = urllib.parse.unquote(url_prefix)
-            regex_pattern = f"^{re.escape(url_prefix)}"
-            query["url"] = {"$regex": regex_pattern, "$options": "i"}
-
-        elif url:
-            query["url"] = urllib.parse.unquote(url)
-
-        if ts:
-            query["ts"] = ts
-
-        if is_seed in (True, False):
-            query["isSeed"] = is_seed
-
-        if isinstance(depth, int):
-            query["depth"] = depth
-
-        aggregate: list[dict[str, object]] = [{"$match": query}]
-
-        if sort_by:
-            # Sorting options to add:
-            # - automated heuristics like screenshot_comparison (dict keyed by QA run id)
-            # - Ensure notes sorting works okay with notes in list
-            sort_fields = (
-                "url",
-                "crawl_id",
-                "ts",
-                "status",
-                "mime",
-                "filename",
-                "depth",
-                "isSeed",
-            )
-            if sort_by not in sort_fields:
-                raise HTTPException(status_code=400, detail="invalid_sort_by")
-            if sort_direction not in (1, -1):
-                raise HTTPException(status_code=400, detail="invalid_sort_direction")
-
-            aggregate.extend([{"$sort": {sort_by: sort_direction}}])
-        elif search:
-            if is_text_search:
-                aggregate.extend(
-                    [
-                        {"$sort": {"score": {"$meta": "textScore"}}},
-                    ]
-                )
-            else:
-                aggregate.extend([{"$sort": {"url": 1}}])
-        else:
-            # default sort: seeds first, then by timestamp
-            aggregate.extend([{"$sort": {"isSeed": -1, "ts": 1}}])
-
-        if skip:
-            aggregate.append({"$skip": skip})
-        aggregate.append({"$limit": page_size})
-
-        cursor = self.pages.aggregate(aggregate)
-
-        results = await cursor.to_list(length=page_size)
-
-        return [PageOut.from_dict(data) for data in results]
-
     async def re_add_crawl_pages(self, crawl_id: str, oid: Optional[UUID] = None):
         """Delete existing pages for crawl and re-add from WACZs."""
@@ -1006,8 +927,7 @@ async def get_unique_page_count(self, crawl_ids: List[str]) -> int:
 
     async def set_archived_item_page_counts(self, crawl_id: str):
         """Store archived item page and unique page counts in crawl document"""
-        _, page_count = await self.list_pages(crawl_id)
-
+        page_count = await self.pages.count_documents({"crawl_id": crawl_id})
         unique_page_count = await self.get_unique_page_count([crawl_id])
 
         await self.crawls.find_one_and_update(
@@ -1031,6 +951,7 @@ async def process_finished_crawls():
                 match_query,
                 {"$set": {"isMigrating": True}},
                 sort=[("finished", -1)],
+                projection={"_id": 1, "pageCount": 1, "stats": 1, "state": 1},
             )
             if next_crawl is None:
                 print("No more finished crawls to migrate")
@@ -1046,6 +967,13 @@ async def process_finished_crawls():
             if has_page_no_filename:
                 print("Re-importing pages to migrate to v2")
                 await self.re_add_crawl_pages(crawl_id)
+            elif (
+                next_crawl.get("pageCount") == 0
+                and next_crawl.get("stats", {}).get("done", 0) > 0
+                and next_crawl.get("state") not in ["canceled", "failed"]
+            ):
+                print("Pages likely missing, importing pages to migrate to v2")
+                await self.re_add_crawl_pages(crawl_id)
             else:
                 print("Pages already have filename, set to v2")
@@ -1291,7 +1219,7 @@ async def get_crawl_pages_list(
             formatted_approved = str_list_to_bools(approved.split(","))
 
         pages, total = await ops.list_pages(
-            crawl_id=crawl_id,
+            crawl_ids=[crawl_id],
             org=org,
             search=search,
             url=url,
@@ -1306,9 +1234,41 @@ async def get_crawl_pages_list(
             page=page,
             sort_by=sortBy,
             sort_direction=sortDirection,
+            include_total=True,
         )
         return paginated_format(pages, total, page, pageSize)
 
+    @app.get(
+        "/orgs/{oid}/crawls/{crawl_id}/pagesSearch",
+        tags=["pages", "crawls"],
+        response_model=PageOutItemsResponse,
+    )
+    async def get_search_pages_list(
+        crawl_id: str,
+        org: Organization = Depends(org_crawl_dep),
+        search: Optional[str] = None,
+        url: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        isSeed: Optional[bool] = None,
+        depth: Optional[int] = None,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+    ):
+        """Retrieve paginated list of pages"""
+        pages, _ = await ops.list_pages(
+            crawl_ids=[crawl_id],
+            search=search,
+            url=url,
+            ts=ts,
+            is_seed=isSeed,
+            depth=depth,
+            org=org,
+            page_size=pageSize,
+            page=page,
+            include_total=False,
+        )
+        return {"items": pages}
+
     @app.get(
         "/orgs/{oid}/collections/{coll_id}/public/pages",
         tags=["pages", "collections"],
@@ -1320,7 +1280,6 @@ async def get_public_collection_pages_list(
         org: Organization = Depends(org_public),
         search: Optional[str] = None,
         url: Optional[str] = None,
-        urlPrefix: Optional[str] = None,
         ts: Optional[datetime] = None,
         isSeed: Optional[bool] = None,
         depth: Optional[int] = None,
@@ -1330,12 +1289,11 @@ async def get_public_collection_pages_list(
         sortDirection: Optional[int] = -1,
     ):
         """Retrieve paginated list of pages in collection"""
-        pages = await ops.list_replay_query_pages(
+        pages, _ = await ops.list_pages(
             coll_id=coll_id,
             org=org,
             search=search,
             url=url,
-            url_prefix=urlPrefix,
             ts=ts,
             is_seed=isSeed,
             depth=depth,
@@ -1377,7 +1335,6 @@ async def get_collection_pages_list(
         org: Organization = Depends(org_viewer_dep),
         search: Optional[str] = None,
         url: Optional[str] = None,
-        urlPrefix: Optional[str] = None,
         ts: Optional[datetime] = None,
         isSeed: Optional[bool] = None,
         depth: Optional[int] = None,
@@ -1387,12 +1344,11 @@ async def get_collection_pages_list(
         sortDirection: Optional[int] = -1,
     ):
         """Retrieve paginated list of pages in collection"""
-        pages = await ops.list_replay_query_pages(
+        pages, _ = await ops.list_pages(
             coll_id=coll_id,
             org=org,
             search=search,
             url=url,
-            url_prefix=urlPrefix,
             ts=ts,
             is_seed=isSeed,
             depth=depth,
@@ -1433,7 +1389,7 @@ async def get_pages_list_with_qa(
             formatted_approved = str_list_to_bools(approved.split(","))
 
         pages, total = await ops.list_pages(
-            crawl_id=crawl_id,
+            crawl_ids=[crawl_id],
             org=org,
             qa_run_id=qa_run_id,
             qa_filter_by=filterQABy,
diff --git a/backend/btrixcloud/version.py b/backend/btrixcloud/version.py
index e75c8a158f..055122d2f5 100644
--- a/backend/btrixcloud/version.py
+++ b/backend/btrixcloud/version.py
@@ -1,3 +1,3 @@
 """current version"""
 
-__version__ = "1.14.0-beta.6"
+__version__ = "1.14.0-beta.7"
diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index d63bb0bfb4..ef3835719f 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -186,7 +186,7 @@ def test_wait_for_complete(admin_auth_headers, default_org_id):
 
     assert len(data["initialPages"]) == 1
     assert data["pagesQueryUrl"].endswith(
-        f"/orgs/{default_org_id}/crawls/{admin_crawl_id}/pages"
+        f"/orgs/{default_org_id}/crawls/{admin_crawl_id}/pagesSearch"
    )
 
     # ensure filename matches specified pattern
diff --git a/chart/Chart.yaml b/chart/Chart.yaml
index 77582c2821..4588974246 100644
--- a/chart/Chart.yaml
+++ b/chart/Chart.yaml
@@ -5,7 +5,7 @@ type: application
 icon: https://webrecorder.net/assets/icon.png
 
 # Browsertrix and Chart Version
-version: v1.14.0-beta.6
+version: v1.14.0-beta.7
 
 dependencies:
   - name: btrix-admin-logging
diff --git a/chart/app-templates/background_job.yaml b/chart/app-templates/background_job.yaml
index b26c723b94..5702c5c8e9 100644
--- a/chart/app-templates/background_job.yaml
+++ b/chart/app-templates/background_job.yaml
@@ -65,9 +65,18 @@ spec:
           command: ["python3", "-m", "btrixcloud.main_bg"]
 
           resources:
+{% if larger_resources %}
            limits:
-              memory: "500Mi"
+              memory: "1200Mi"
 
            requests:
-              memory: "250Mi"
+              memory: "500Mi"
              cpu: "200m"
+{% else %}
+            limits:
+              memory: "200Mi"
+
+            requests:
+              memory: "200Mi"
+              cpu: "50m"
+{% endif %}
diff --git a/chart/values.yaml b/chart/values.yaml
index f5cbd34fc1..97ceb00207 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -103,7 +103,7 @@ replica_deletion_delay_days: 0
 
 # API Image
 # =========================================
-backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.0-beta.6"
+backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.0-beta.7"
 backend_pull_policy: "Always"
 
 backend_password_secret: "PASSWORD!"
@@ -161,7 +161,7 @@ backend_avg_memory_threshold: 95
 
 # Nginx Image
 # =========================================
-frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.0-beta.6"
+frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.0-beta.7"
 frontend_pull_policy: "Always"
 
 frontend_cpu: "10m"
diff --git a/frontend/package.json b/frontend/package.json
index dcd5d0031a..57a74d8bb1 100644
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -1,6 +1,6 @@
 {
   "name": "browsertrix-frontend",
-  "version": "1.14.0-beta.6",
+  "version": "1.14.0-beta.7",
   "main": "index.ts",
   "license": "AGPL-3.0-or-later",
   "dependencies": {
diff --git a/frontend/src/pages/org/archived-item-detail/ui/qa.ts b/frontend/src/pages/org/archived-item-detail/ui/qa.ts
index 079938aa6f..0b3b0eaa69 100644
--- a/frontend/src/pages/org/archived-item-detail/ui/qa.ts
+++ b/frontend/src/pages/org/archived-item-detail/ui/qa.ts
@@ -703,7 +703,7 @@ export class ArchivedItemDetailQA extends BtrixElement {
                 class="label-same-line"
                 label=${msg("Sort by:")}
                 size="small"
-                value=${this.qaRunId ? "approved.-1" : "url.1"}
+                value=${this.qaRunId ? "approved.-1" : ".1"}
                 pill
                 @sl-change=${(e: SlChangeEvent) => {
                   const { value } = e.target as SlSelect;
@@ -717,6 +717,7 @@ export class ArchivedItemDetailQA extends BtrixElement {
                  });
                }}
              >
+                ${msg("Crawl Order")}
                 ${msg("Title")}
                 ${msg("URL")}