Consolidate list page endpoints + better QA sorting + optimize pages fix (#2417)

- consolidate list_pages() and list_replay_query_pages() into a single
  list_pages()
- to keep backwards compatibility, add <crawl>/pagesSearch, which does not
  include page totals, and keep <crawl>/pages with page totals (slower); a
  sketch of the split follows this list
- qa frontend: default to a 'Crawl Order' sort, to better show pages in the
  QA view and get better results
- bgjob: account for parallelism in bgjobs; log when the succeeded count
  does not match parallelism
- optimize pages job: also cover crawls that have no pages recorded but do
  have pages listed in done stats
- bgjobs: give custom op jobs more memory
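The call sites in the diffs below show the consolidated shape: list_pages() returns a (items, total) tuple, and callers that do not need the total discard it. A minimal runnable sketch of that split; the function body and the include_total flag are assumptions for illustration, only the tuple return and keyword arguments come from the diffs:

import asyncio
from typing import Any, Dict, List, Optional, Tuple

# Hypothetical stand-in for PageOps.list_pages(); only the (items, total)
# tuple return and the keyword arguments mirror the diffs below.
async def list_pages(
    crawl_ids: List[str],
    is_seed: Optional[bool] = None,
    page_size: int = 25,
    include_total: bool = True,
) -> Tuple[List[Dict[str, Any]], int]:
    items = [{"crawl": cid, "isSeed": True} for cid in crawl_ids][:page_size]
    # Counting all matching documents is the slow part; a /pagesSearch-style
    # caller passes include_total=False and ignores the second value.
    total = len(items) if include_total else 0
    return items, total

async def main() -> None:
    # <crawl>/pages keeps the (slower) total for backwards compatibility
    pages, total = await list_pages(["crawl-1"], is_seed=True)
    # <crawl>/pagesSearch-style callers discard it, as in the updated code
    initial_pages, _ = await list_pages(["crawl-1"], include_total=False)
    print(total, initial_pages)

asyncio.run(main())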
ikreymer authored Feb 21, 2025
1 parent 06f6d9d commit 8a507f0
Showing 13 changed files with 174 additions and 203 deletions.
backend/btrixcloud/basecrawls.py (5 changes: 3 additions & 2 deletions)
@@ -171,14 +171,15 @@ async def get_crawl_out(
             res["collections"] = await self.colls.get_collection_names(coll_ids)
 
         if res.get("version", 1) == 2:
-            res["initialPages"] = await self.page_ops.list_replay_query_pages(
+            res["initialPages"], _ = await self.page_ops.list_pages(
                 crawl_ids=[crawlid], is_seed=True, page_size=25
             )
 
             oid = res.get("oid")
             if oid:
                 res["pagesQueryUrl"] = (
-                    get_origin(headers) + f"/api/orgs/{oid}/crawls/{crawlid}/pages"
+                    get_origin(headers)
+                    + f"/api/orgs/{oid}/crawls/{crawlid}/pagesSearch"
                 )
 
         crawl = CrawlOutWithResources.from_dict(res)
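With pagesQueryUrl now pointing at /pagesSearch, replay clients get the cheaper endpoint by default. A hedged sketch, not the actual btrixcloud routes, of how the two endpoints can coexist; the handler names, response shape, and stub query are assumptions:

from typing import Any, Dict, List, Tuple

from fastapi import FastAPI

app = FastAPI()

async def query_pages(include_total: bool) -> Tuple[List[Dict[str, Any]], int]:
    # Stub query; the real implementation would query the database.
    items = [{"url": "https://example.com/"}]
    return items, (len(items) if include_total else 0)

@app.get("/api/orgs/{oid}/crawls/{crawl_id}/pages")
async def pages(oid: str, crawl_id: str) -> Dict[str, Any]:
    # Backwards-compatible: includes the page total (slower)
    items, total = await query_pages(include_total=True)
    return {"items": items, "total": total}

@app.get("/api/orgs/{oid}/crawls/{crawl_id}/pagesSearch")
async def pages_search(oid: str, crawl_id: str) -> Dict[str, Any]:
    # New endpoint: skips the expensive count entirely
    items, _ = await query_pages(include_total=False)
    return {"items": items}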
backend/btrixcloud/colls.py (4 changes: 1 addition & 3 deletions)
@@ -42,7 +42,6 @@
     OrgPublicCollections,
     PublicOrgDetails,
     CollAccessType,
-    PageOut,
     UpdateCollHomeUrl,
     User,
     ImageFile,
@@ -346,8 +345,7 @@ async def get_collection_out(
                 await self.get_collection_crawl_resources(coll_id)
             )
 
-            initial_pages: List[PageOut] = await self.page_ops.list_replay_query_pages(
-                coll_id,
+            initial_pages, _ = await self.page_ops.list_pages(
                 crawl_ids=crawl_ids,
                 page_size=25,
             )
backend/btrixcloud/crawlmanager.py (1 change: 1 addition & 0 deletions)
@@ -198,6 +198,7 @@ async def _run_bg_job_with_ops_classes(
             "job_type": job_type,
             "backend_image": os.environ.get("BACKEND_IMAGE", ""),
             "pull_policy": os.environ.get("BACKEND_IMAGE_PULL_POLICY", ""),
+            "larger_resources": True,
             **kwargs,
         }
         if oid:
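The larger_resources flag is passed into the background-job template parameters. A hypothetical sketch of how such a flag could select bigger resource requests when the job spec is rendered; the helper and the values are assumptions, not the actual chart:

from typing import Any, Dict

def bg_job_resources(params: Dict[str, Any]) -> Dict[str, str]:
    # Hypothetical: pick larger memory/cpu requests for custom op jobs
    if params.get("larger_resources"):
        return {"memory": "768Mi", "cpu": "800m"}
    return {"memory": "256Mi", "cpu": "200m"}

params = {"job_type": "optimize-pages", "larger_resources": True}
print(bg_job_resources(params))  # {'memory': '768Mi', 'cpu': '800m'}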
backend/btrixcloud/operator/bgjobs.py (7 changes: 6 additions & 1 deletion)
@@ -38,7 +38,12 @@ async def finalize_background_job(self, data: MCDecoratorSyncData) -> dict:
         job_id: str = labels.get("job_id") or metadata.get("name")
 
         status = data.object["status"]
-        success = status.get("succeeded") == 1
+        spec = data.object["spec"]
+        success = status.get("succeeded") == spec.get("parallelism")
+        if not success:
+            print(
+                f"Succeeded: {status.get('succeeded')}, Num Pods: {spec.get('parallelism')}"
+            )
         completion_time = status.get("completionTime")
 
         finalized = True
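On a Kubernetes Job, status.succeeded counts pods that completed successfully, so for a job running with parallelism greater than one the old succeeded == 1 check declared success as soon as the first pod finished. A minimal self-contained version of the same check, with an explicit default since Kubernetes sets spec.parallelism to 1 when it is unset:

from typing import Any, Dict

def job_succeeded(job: Dict[str, Any]) -> bool:
    """A parallel Job is only successful once every pod has succeeded."""
    status = job.get("status", {})
    spec = job.get("spec", {})
    # Kubernetes defaults spec.parallelism to 1 when it is not set
    return status.get("succeeded") == spec.get("parallelism", 1)

# A 2-pod job with one finished pod is not yet successful...
job = {"spec": {"parallelism": 2}, "status": {"succeeded": 1}}
assert not job_succeeded(job)
# ...but is once both pods have completed
job["status"]["succeeded"] = 2
assert job_succeeded(job)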