
feat: Update collection sorting, metadata, stats #2327

Merged Jan 23, 2025 · 22 commits · Changes from 20 commits shown
11 changes: 10 additions & 1 deletion backend/btrixcloud/basecrawls.py
@@ -49,7 +49,7 @@
 
 
 # ============================================================================
-# pylint: disable=too-many-instance-attributes, too-many-public-methods, too-many-lines
+# pylint: disable=too-many-instance-attributes, too-many-public-methods, too-many-lines, too-many-branches
 class BaseCrawlOps:
     """operations that apply to all crawls"""
 
@@ -300,6 +300,7 @@ async def delete_crawls(
     ) -> tuple[int, dict[UUID, dict[str, int]], bool]:
         """Delete a list of crawls by id for given org"""
         cids_to_update: dict[UUID, dict[str, int]] = {}
+        collection_ids_to_update = set()
 
         size = 0
 
@@ -325,6 +326,10 @@
 
             await self.page_ops.delete_crawl_pages(crawl_id, org.id)
 
+            if crawl.collectionIds:
+                for coll_id in crawl.collectionIds:
+                    collection_ids_to_update.add(coll_id)
+
             if type_ == "crawl":
                 await self.delete_all_crawl_qa_files(crawl_id, org)
 
@@ -361,6 +366,10 @@ async def delete_crawls(
 
         await self.orgs.set_last_crawl_finished(org.id)
 
+        if collection_ids_to_update:
+            for coll_id in collection_ids_to_update:
+                await self.colls.update_collection_counts_and_tags(coll_id)
+
         quota_reached = self.orgs.storage_quota_reached(org)
 
         return res.deleted_count, cids_to_update, quota_reached
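The deletion flow above gathers the collection IDs of every deleted crawl first, then recalculates each affected collection exactly once after all deletions complete, rather than once per crawl. A minimal sketch of that collect-then-update pattern (crawls and delete_one_crawl are hypothetical stand-ins; update_collection_counts_and_tags is the real method from colls.py):

    from uuid import UUID

    async def delete_and_refresh(crawls, colls) -> None:
        affected: set[UUID] = set()

        for crawl in crawls:
            # record collection memberships before the crawl is gone
            if crawl.collectionIds:
                affected.update(crawl.collectionIds)
            await delete_one_crawl(crawl)  # hypothetical deletion helper

        # each collection's cached stats are refreshed exactly once
        for coll_id in affected:
            await colls.update_collection_counts_and_tags(coll_id)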
39 changes: 33 additions & 6 deletions backend/btrixcloud/colls.py
@@ -396,7 +396,7 @@ async def list_collections(
         page = page - 1
         skip = page * page_size
 
-        match_query: dict[str, object] = {"oid": org.id}
+        match_query: Dict[str, Union[str, UUID, int, object]] = {"oid": org.id}
 
         if name:
             match_query["name"] = name
@@ -409,15 +409,33 @@
         elif access:
             match_query["access"] = access
 
-        aggregate = [{"$match": match_query}]
+        aggregate: List[Dict[str, Union[str, UUID, int, object]]] = [
+            {"$match": match_query}
+        ]
 
         if sort_by:
-            if sort_by not in ("modified", "name", "description", "totalSize"):
+            if sort_by not in (
+                "created",
+                "modified",
+                "dateLatest",
+                "name",
+                "crawlCount",
+                "pageCount",
+                "totalSize",
+                "description",
+                "caption",
+            ):
                 raise HTTPException(status_code=400, detail="invalid_sort_by")
             if sort_direction not in (1, -1):
                 raise HTTPException(status_code=400, detail="invalid_sort_direction")
 
-            aggregate.extend([{"$sort": {sort_by: sort_direction}}])
+            sort_query = {sort_by: sort_direction}
+
+            # add secondary sort keys:
+            if sort_by == "dateLatest":
+                sort_query["dateEarliest"] = sort_direction
+
+            aggregate.extend([{"$sort": sort_query}])
 
         aggregate.extend(
             [
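With dateLatest selected, the $sort stage built above carries two keys: dateEarliest is appended as a secondary key so collections sharing the same latest date order deterministically. Python dicts preserve insertion order, so the compound sort reaches MongoDB with the keys in the intended order. An illustrative stage for a descending sort (org_id stands in for the real request value):

    # pipeline produced for sort_by="dateLatest", sort_direction=-1
    sort_query = {"dateLatest": -1, "dateEarliest": -1}
    aggregate = [
        {"$match": {"oid": org_id}},
        {"$sort": sort_query},  # latest date first, earliest date as tiebreaker
    ]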
@@ -564,11 +582,14 @@ async def recalculate_org_collection_counts_tags(self, org: Organization):
 
     async def update_collection_counts_and_tags(self, collection_id: UUID):
         """Set current crawl info in config when crawl begins"""
+        # pylint: disable=too-many-locals
         crawl_count = 0
         page_count = 0
         total_size = 0
         tags = []
 
+        crawl_ids = []
+
         coll = await self.get_collection(collection_id)
         org = await self.orgs.get_org_by_id(coll.oid)
 
@@ -582,25 +603,30 @@ async def update_collection_counts_and_tags(self, collection_id: UUID):
                     total_size += file.size
 
             try:
-                _, crawl_pages = await self.page_ops.list_pages(
+                _, crawl_page_count = await self.page_ops.list_pages(
                     crawl.id, org, page_size=1_000_000
                 )
-                page_count += crawl_pages
+                page_count += crawl_page_count
             # pylint: disable=broad-exception-caught
             except Exception:
                 pass
 
             if crawl.tags:
                 tags.extend(crawl.tags)
 
+            crawl_ids.append(crawl.id)
+
         sorted_tags = [tag for tag, count in Counter(tags).most_common()]
 
+        unique_page_count = await self.page_ops.get_unique_page_count(crawl_ids)
+
         await self.collections.find_one_and_update(
             {"_id": collection_id},
             {
                 "$set": {
                     "crawlCount": crawl_count,
                     "pageCount": page_count,
+                    "uniquePageCount": unique_page_count,
                     "totalSize": total_size,
                     "tags": sorted_tags,
                 }
@@ -618,6 +644,7 @@ async def recalculate_org_collection_dates(self, org: Organization):
 
     async def update_collection_dates(self, coll_id: UUID):
         """Update collection earliest and latest dates from page timestamps"""
+        # pylint: disable=too-many-locals
         coll = await self.get_collection(coll_id)
         crawl_ids = await self.get_collection_crawl_ids(coll_id)
 
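One detail worth noting in update_collection_counts_and_tags above: a collection's tags are stored ordered by how often they occur across its crawls, via Counter.most_common(). A quick self-contained illustration:

    from collections import Counter

    tags = ["news", "blog", "news", "archive", "news", "blog"]
    sorted_tags = [tag for tag, count in Counter(tags).most_common()]
    print(sorted_tags)  # ['news', 'blog', 'archive'] -- most frequent first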
11 changes: 7 additions & 4 deletions backend/btrixcloud/db.py
@@ -17,7 +17,7 @@
 from .migrations import BaseMigration
 
 
-CURR_DB_VERSION = "0040"
+CURR_DB_VERSION = "0041"
 
 
 # ============================================================================
@@ -96,7 +96,7 @@ async def update_and_prepare_db(
     await ping_db(mdb)
     print("Database setup started", flush=True)
     if await run_db_migrations(
-        mdb, user_manager, page_ops, org_ops, background_job_ops
+        mdb, user_manager, page_ops, org_ops, background_job_ops, coll_ops
     ):
         await drop_indexes(mdb)
 
@@ -117,8 +117,10 @@
 
 
 # ============================================================================
-# pylint: disable=too-many-locals
-async def run_db_migrations(mdb, user_manager, page_ops, org_ops, background_job_ops):
+# pylint: disable=too-many-locals, too-many-arguments
+async def run_db_migrations(
+    mdb, user_manager, page_ops, org_ops, background_job_ops, coll_ops
+):
     """Run database migrations."""
 
     # if first run, just set version and exit
@@ -155,6 +157,7 @@ async def run_db_migrations(mdb, user_manager, page_ops, org_ops, background_job
             page_ops=page_ops,
             org_ops=org_ops,
             background_job_ops=background_job_ops,
+            coll_ops=coll_ops,
         )
         if await migration.run():
             migrations_run = True
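Threading coll_ops through run_db_migrations is backward compatible because every migration constructor accepts **kwargs and keeps only the ops it uses, as migration 0041 does below; older migrations simply ignore the new argument. A sketch of the pattern, using a hypothetical pre-0041 migration:

    from btrixcloud.migrations import BaseMigration

    class Migration(BaseMigration):  # hypothetical earlier migration
        def __init__(self, mdb, **kwargs):
            super().__init__(mdb, migration_version="0039")
            # coll_ops now arrives in kwargs but is simply ignored here
            self.page_ops = kwargs.get("page_ops")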
2 changes: 1 addition & 1 deletion backend/btrixcloud/main.py
@@ -255,7 +255,7 @@ def main() -> None:
     crawls.set_page_ops(page_ops)
     upload_ops.set_page_ops(page_ops)
 
-    org_ops.set_ops(base_crawl_ops, profiles, coll_ops, background_job_ops)
+    org_ops.set_ops(base_crawl_ops, profiles, coll_ops, background_job_ops, page_ops)
 
     user_manager.set_ops(org_ops, crawl_config_ops, base_crawl_ops)
 
@@ -31,13 +31,13 @@ async def migrate_up(self):
             )
             return
 
-        async for crawl_raw in crawls_mdb.find({"pageCount": None}):
+        async for crawl_raw in crawls_mdb.find({}):
             crawl_id = crawl_raw["_id"]
             try:
-                await self.page_ops.set_archived_item_page_count(crawl_id)
+                await self.page_ops.set_archived_item_page_counts(crawl_id)
             # pylint: disable=broad-exception-caught
             except Exception as err:
                 print(
-                    f"Error saving pageCount for archived item {crawl_id}: {err}",
+                    f"Error saving page counts for archived item {crawl_id}: {err}",
                     flush=True,
                 )
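The query change above is the crux of this edit to the earlier page-count migration (its file name is truncated in this diff): filtering on {"pageCount": None} would skip every crawl that already had pageCount set, but uniquePageCount is brand new, so no existing document has it and every crawl must be revisited:

    # before: only crawls missing pageCount were recomputed
    cursor = crawls_mdb.find({"pageCount": None})

    # after: every crawl is recomputed, since the new uniquePageCount
    # field does not exist on any document yet
    cursor = crawls_mdb.find({})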
43 changes: 43 additions & 0 deletions backend/btrixcloud/migrations/migration_0041_pages_snapshots.py
@@ -0,0 +1,43 @@
+"""
+Migration 0041 - Rationalize page counts
+"""
+
+from btrixcloud.migrations import BaseMigration
+
+
+MIGRATION_VERSION = "0041"
+
+
+class Migration(BaseMigration):
+    """Migration class."""
+
+    # pylint: disable=unused-argument
+    def __init__(self, mdb, **kwargs):
+        super().__init__(mdb, migration_version=MIGRATION_VERSION)
+
+        self.coll_ops = kwargs.get("coll_ops")
+
+    async def migrate_up(self):
+        """Perform migration up.
+
+        Recalculate collections to get new page and unique page counts
+        """
+        colls_mdb = self.mdb["collections"]
+
+        if self.coll_ops is None:
+            print(
+                "Unable to set collection page counts, missing coll_ops",
+                flush=True,
+            )
+            return
+
+        async for coll in colls_mdb.find({}):
+            coll_id = coll["_id"]
+            try:
+                await self.coll_ops.update_collection_counts_and_tags(coll_id)
+            # pylint: disable=broad-exception-caught
+            except Exception as err:
+                print(
+                    f"Unable to update page counts for collection {coll_id}: {err}",
+                    flush=True,
+                )
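Migration 0041 delegates the real work to update_collection_counts_and_tags, which in turn relies on page_ops.get_unique_page_count. That helper is not shown in this diff; a hedged sketch of what a deduplicating count might look like, assuming pages are deduplicated by URL (the actual key may differ):

    async def get_unique_page_count(pages_mdb, crawl_ids) -> int:
        pipeline = [
            {"$match": {"crawl_id": {"$in": crawl_ids}}},
            {"$group": {"_id": "$url"}},  # one bucket per distinct URL
            {"$count": "count"},
        ]
        res = await pages_mdb.aggregate(pipeline).to_list(1)
        return res[0]["count"] if res else 0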
8 changes: 8 additions & 0 deletions backend/btrixcloud/models.py
@@ -798,6 +798,7 @@ class BaseCrawl(CoreCrawlable, BaseMongoModel):
     reviewStatus: ReviewStatus = None
 
     pageCount: Optional[int] = 0
+    uniquePageCount: Optional[int] = 0
 
     filePageCount: Optional[int] = 0
     errorPageCount: Optional[int] = 0
@@ -875,6 +876,7 @@ class CrawlOut(BaseMongoModel):
     lastQAStarted: Optional[datetime] = None
 
     pageCount: Optional[int] = 0
+    uniquePageCount: Optional[int] = 0
     filePageCount: Optional[int] = 0
     errorPageCount: Optional[int] = 0
 
@@ -1250,6 +1252,7 @@ class Collection(BaseMongoModel):
 
     crawlCount: Optional[int] = 0
     pageCount: Optional[int] = 0
+    uniquePageCount: Optional[int] = 0
     totalSize: Optional[int] = 0
 
     dateEarliest: Optional[datetime] = None
@@ -1303,6 +1306,7 @@ class CollOut(BaseMongoModel):
 
     crawlCount: Optional[int] = 0
     pageCount: Optional[int] = 0
+    uniquePageCount: Optional[int] = 0
     totalSize: Optional[int] = 0
 
     dateEarliest: Optional[datetime] = None
@@ -1339,6 +1343,7 @@ class PublicCollOut(BaseMongoModel):
 
     crawlCount: Optional[int] = 0
     pageCount: Optional[int] = 0
+    uniquePageCount: Optional[int] = 0
     totalSize: Optional[int] = 0
 
     dateEarliest: Optional[datetime] = None
@@ -1919,6 +1924,9 @@ class OrgMetrics(BaseModel):
     pageCount: int
     crawlPageCount: int
     uploadPageCount: int
+    uniquePageCount: int
+    crawlUniquePageCount: int
+    uploadUniquePageCount: int
     profileCount: int
     workflowsRunningCount: int
     maxConcurrentCrawls: int
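All of the new uniquePageCount fields default to 0, so documents written before migration 0041 runs still validate and serialize cleanly. A trimmed-down demonstration with a hypothetical model (not from the codebase):

    from typing import Optional
    from pydantic import BaseModel

    class CollOutDemo(BaseModel):
        pageCount: Optional[int] = 0
        uniquePageCount: Optional[int] = 0

    print(CollOutDemo().uniquePageCount)                    # 0 for legacy documents
    print(CollOutDemo(uniquePageCount=42).uniquePageCount)  # 42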
2 changes: 1 addition & 1 deletion backend/btrixcloud/operator/crawls.py
@@ -1534,7 +1534,7 @@ async def do_crawl_finished_tasks(
         )
 
         if state in SUCCESSFUL_STATES and crawl.oid:
-            await self.page_ops.set_archived_item_page_count(crawl.id)
+            await self.page_ops.set_archived_item_page_counts(crawl.id)
             await self.org_ops.inc_org_bytes_stored(
                 crawl.oid, status.filesAddedSize, "crawl"
             )
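The rename from set_archived_item_page_count to set_archived_item_page_counts reflects that the helper now persists both totals when a crawl finishes successfully. Its implementation is not part of this diff; a hedged sketch of the assumed shape (self.pages and self.crawls are placeholder attribute names; get_unique_page_count is the real helper used in colls.py):

    async def set_archived_item_page_counts(self, crawl_id: str):
        page_count = await self.pages.count_documents({"crawl_id": crawl_id})
        unique_page_count = await self.get_unique_page_count([crawl_id])
        await self.crawls.find_one_and_update(
            {"_id": crawl_id},
            {"$set": {"pageCount": page_count, "uniquePageCount": unique_page_count}},
        )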
2 changes: 1 addition & 1 deletion backend/btrixcloud/ops.py
@@ -97,7 +97,7 @@ def init_ops() -> Tuple[
 
     background_job_ops.set_ops(crawl_ops, profile_ops)
 
-    org_ops.set_ops(base_crawl_ops, profile_ops, coll_ops, background_job_ops)
+    org_ops.set_ops(base_crawl_ops, profile_ops, coll_ops, background_job_ops, page_ops)
 
     user_manager.set_ops(org_ops, crawl_config_ops, base_crawl_ops)
 