diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py index 99b065975a..fb8a07cd9e 100644 --- a/backend/btrixcloud/basecrawls.py +++ b/backend/btrixcloud/basecrawls.py @@ -49,7 +49,7 @@ # ============================================================================ -# pylint: disable=too-many-instance-attributes, too-many-public-methods, too-many-lines +# pylint: disable=too-many-instance-attributes, too-many-public-methods, too-many-lines, too-many-branches class BaseCrawlOps: """operations that apply to all crawls""" @@ -300,6 +300,7 @@ async def delete_crawls( ) -> tuple[int, dict[UUID, dict[str, int]], bool]: """Delete a list of crawls by id for given org""" cids_to_update: dict[UUID, dict[str, int]] = {} + collection_ids_to_update = set() size = 0 @@ -325,6 +326,10 @@ async def delete_crawls( await self.page_ops.delete_crawl_pages(crawl_id, org.id) + if crawl.collectionIds: + for coll_id in crawl.collectionIds: + collection_ids_to_update.add(coll_id) + if type_ == "crawl": await self.delete_all_crawl_qa_files(crawl_id, org) @@ -361,6 +366,10 @@ async def delete_crawls( await self.orgs.set_last_crawl_finished(org.id) + if collection_ids_to_update: + for coll_id in collection_ids_to_update: + await self.colls.update_collection_counts_and_tags(coll_id) + quota_reached = self.orgs.storage_quota_reached(org) return res.deleted_count, cids_to_update, quota_reached diff --git a/backend/btrixcloud/colls.py b/backend/btrixcloud/colls.py index 4c1efe719c..3553884160 100644 --- a/backend/btrixcloud/colls.py +++ b/backend/btrixcloud/colls.py @@ -396,7 +396,7 @@ async def list_collections( page = page - 1 skip = page * page_size - match_query: dict[str, object] = {"oid": org.id} + match_query: Dict[str, Union[str, UUID, int, object]] = {"oid": org.id} if name: match_query["name"] = name @@ -409,15 +409,33 @@ async def list_collections( elif access: match_query["access"] = access - aggregate = [{"$match": match_query}] + aggregate: List[Dict[str, Union[str, UUID, int, object]]] = [ + {"$match": match_query} + ] if sort_by: - if sort_by not in ("modified", "name", "description", "totalSize"): + if sort_by not in ( + "created", + "modified", + "dateLatest", + "name", + "crawlCount", + "pageCount", + "totalSize", + "description", + "caption", + ): raise HTTPException(status_code=400, detail="invalid_sort_by") if sort_direction not in (1, -1): raise HTTPException(status_code=400, detail="invalid_sort_direction") - aggregate.extend([{"$sort": {sort_by: sort_direction}}]) + sort_query = {sort_by: sort_direction} + + # add secondary sort keys: + if sort_by == "dateLatest": + sort_query["dateEarliest"] = sort_direction + + aggregate.extend([{"$sort": sort_query}]) aggregate.extend( [ @@ -564,11 +582,14 @@ async def recalculate_org_collection_counts_tags(self, org: Organization): async def update_collection_counts_and_tags(self, collection_id: UUID): """Set current crawl info in config when crawl begins""" + # pylint: disable=too-many-locals crawl_count = 0 page_count = 0 total_size = 0 tags = [] + crawl_ids = [] + coll = await self.get_collection(collection_id) org = await self.orgs.get_org_by_id(coll.oid) @@ -582,10 +603,10 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): total_size += file.size try: - _, crawl_pages = await self.page_ops.list_pages( + _, crawl_page_count = await self.page_ops.list_pages( crawl.id, org, page_size=1_000_000 ) - page_count += crawl_pages + page_count += crawl_page_count # pylint: disable=broad-exception-caught except Exception: pass @@ -593,14 +614,19 @@ async def update_collection_counts_and_tags(self, collection_id: UUID): if crawl.tags: tags.extend(crawl.tags) + crawl_ids.append(crawl.id) + sorted_tags = [tag for tag, count in Counter(tags).most_common()] + unique_page_count = await self.page_ops.get_unique_page_count(crawl_ids) + await self.collections.find_one_and_update( {"_id": collection_id}, { "$set": { "crawlCount": crawl_count, "pageCount": page_count, + "uniquePageCount": unique_page_count, "totalSize": total_size, "tags": sorted_tags, } @@ -618,6 +644,7 @@ async def recalculate_org_collection_dates(self, org: Organization): async def update_collection_dates(self, coll_id: UUID): """Update collection earliest and latest dates from page timestamps""" + # pylint: disable=too-many-locals coll = await self.get_collection(coll_id) crawl_ids = await self.get_collection_crawl_ids(coll_id) diff --git a/backend/btrixcloud/db.py b/backend/btrixcloud/db.py index bd889be25a..a16964626f 100644 --- a/backend/btrixcloud/db.py +++ b/backend/btrixcloud/db.py @@ -17,7 +17,7 @@ from .migrations import BaseMigration -CURR_DB_VERSION = "0040" +CURR_DB_VERSION = "0041" # ============================================================================ @@ -96,7 +96,7 @@ async def update_and_prepare_db( await ping_db(mdb) print("Database setup started", flush=True) if await run_db_migrations( - mdb, user_manager, page_ops, org_ops, background_job_ops + mdb, user_manager, page_ops, org_ops, background_job_ops, coll_ops ): await drop_indexes(mdb) @@ -117,8 +117,10 @@ async def update_and_prepare_db( # ============================================================================ -# pylint: disable=too-many-locals -async def run_db_migrations(mdb, user_manager, page_ops, org_ops, background_job_ops): +# pylint: disable=too-many-locals, too-many-arguments +async def run_db_migrations( + mdb, user_manager, page_ops, org_ops, background_job_ops, coll_ops +): """Run database migrations.""" # if first run, just set version and exit @@ -155,6 +157,7 @@ async def run_db_migrations(mdb, user_manager, page_ops, org_ops, background_job page_ops=page_ops, org_ops=org_ops, background_job_ops=background_job_ops, + coll_ops=coll_ops, ) if await migration.run(): migrations_run = True diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py index 927a03dcb8..507db08f02 100644 --- a/backend/btrixcloud/main.py +++ b/backend/btrixcloud/main.py @@ -255,7 +255,7 @@ def main() -> None: crawls.set_page_ops(page_ops) upload_ops.set_page_ops(page_ops) - org_ops.set_ops(base_crawl_ops, profiles, coll_ops, background_job_ops) + org_ops.set_ops(base_crawl_ops, profiles, coll_ops, background_job_ops, page_ops) user_manager.set_ops(org_ops, crawl_config_ops, base_crawl_ops) diff --git a/backend/btrixcloud/migrations/migration_0040_archived_item_page_count.py b/backend/btrixcloud/migrations/migration_0040_archived_item_page_count.py index 2f72fc39bd..5ed031b700 100644 --- a/backend/btrixcloud/migrations/migration_0040_archived_item_page_count.py +++ b/backend/btrixcloud/migrations/migration_0040_archived_item_page_count.py @@ -31,13 +31,13 @@ async def migrate_up(self): ) return - async for crawl_raw in crawls_mdb.find({"pageCount": None}): + async for crawl_raw in crawls_mdb.find({}): crawl_id = crawl_raw["_id"] try: - await self.page_ops.set_archived_item_page_count(crawl_id) + await self.page_ops.set_archived_item_page_counts(crawl_id) # pylint: disable=broad-exception-caught except Exception as err: print( - f"Error saving pageCount for archived item {crawl_id}: {err}", + f"Error saving page counts for archived item {crawl_id}: {err}", flush=True, ) diff --git a/backend/btrixcloud/migrations/migration_0041_pages_snapshots.py b/backend/btrixcloud/migrations/migration_0041_pages_snapshots.py new file mode 100644 index 0000000000..75d1dc2227 --- /dev/null +++ b/backend/btrixcloud/migrations/migration_0041_pages_snapshots.py @@ -0,0 +1,43 @@ +""" +Migration 0041 - Rationalize page counts +""" + +from btrixcloud.migrations import BaseMigration + + +MIGRATION_VERSION = "0041" + + +class Migration(BaseMigration): + """Migration class.""" + + # pylint: disable=unused-argument + def __init__(self, mdb, **kwargs): + super().__init__(mdb, migration_version=MIGRATION_VERSION) + + self.coll_ops = kwargs.get("coll_ops") + + async def migrate_up(self): + """Perform migration up. + + Recalculate collections to get new page and unique page counts + """ + colls_mdb = self.mdb["collections"] + + if self.coll_ops is None: + print( + "Unable to set collection page counts, missing coll_ops", + flush=True, + ) + return + + async for coll in colls_mdb.find({}): + coll_id = coll["_id"] + try: + await self.coll_ops.update_collection_counts_and_tags(coll_id) + # pylint: disable=broad-exception-caught + except Exception as err: + print( + f"Unable to update page counts for collection {coll_id}: {err}", + flush=True, + ) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 9bcd7557c3..e0e57f200a 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -798,6 +798,7 @@ class BaseCrawl(CoreCrawlable, BaseMongoModel): reviewStatus: ReviewStatus = None pageCount: Optional[int] = 0 + uniquePageCount: Optional[int] = 0 filePageCount: Optional[int] = 0 errorPageCount: Optional[int] = 0 @@ -875,6 +876,7 @@ class CrawlOut(BaseMongoModel): lastQAStarted: Optional[datetime] = None pageCount: Optional[int] = 0 + uniquePageCount: Optional[int] = 0 filePageCount: Optional[int] = 0 errorPageCount: Optional[int] = 0 @@ -1250,6 +1252,7 @@ class Collection(BaseMongoModel): crawlCount: Optional[int] = 0 pageCount: Optional[int] = 0 + uniquePageCount: Optional[int] = 0 totalSize: Optional[int] = 0 dateEarliest: Optional[datetime] = None @@ -1303,6 +1306,7 @@ class CollOut(BaseMongoModel): crawlCount: Optional[int] = 0 pageCount: Optional[int] = 0 + uniquePageCount: Optional[int] = 0 totalSize: Optional[int] = 0 dateEarliest: Optional[datetime] = None @@ -1339,6 +1343,7 @@ class PublicCollOut(BaseMongoModel): crawlCount: Optional[int] = 0 pageCount: Optional[int] = 0 + uniquePageCount: Optional[int] = 0 totalSize: Optional[int] = 0 dateEarliest: Optional[datetime] = None @@ -1919,6 +1924,9 @@ class OrgMetrics(BaseModel): pageCount: int crawlPageCount: int uploadPageCount: int + uniquePageCount: int + crawlUniquePageCount: int + uploadUniquePageCount: int profileCount: int workflowsRunningCount: int maxConcurrentCrawls: int diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index d2b84da5b4..86c326b7a8 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1534,7 +1534,7 @@ async def do_crawl_finished_tasks( ) if state in SUCCESSFUL_STATES and crawl.oid: - await self.page_ops.set_archived_item_page_count(crawl.id) + await self.page_ops.set_archived_item_page_counts(crawl.id) await self.org_ops.inc_org_bytes_stored( crawl.oid, status.filesAddedSize, "crawl" ) diff --git a/backend/btrixcloud/ops.py b/backend/btrixcloud/ops.py index bee24d00c5..bcdb493db5 100644 --- a/backend/btrixcloud/ops.py +++ b/backend/btrixcloud/ops.py @@ -97,7 +97,7 @@ def init_ops() -> Tuple[ background_job_ops.set_ops(crawl_ops, profile_ops) - org_ops.set_ops(base_crawl_ops, profile_ops, coll_ops, background_job_ops) + org_ops.set_ops(base_crawl_ops, profile_ops, coll_ops, background_job_ops, page_ops) user_manager.set_ops(org_ops, crawl_config_ops, base_crawl_ops) diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index 3486cc29a4..7c9f7558c0 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -96,9 +96,10 @@ from .profiles import ProfileOps from .users import UserManager from .background_jobs import BackgroundJobOps + from .pages import PageOps else: InviteOps = BaseCrawlOps = ProfileOps = CollectionOps = object - BackgroundJobOps = UserManager = object + BackgroundJobOps = UserManager = PageOps = object DEFAULT_ORG = os.environ.get("DEFAULT_ORG", "My Organization") @@ -110,7 +111,7 @@ # ============================================================================ -# pylint: disable=too-many-public-methods, too-many-instance-attributes, too-many-locals +# pylint: disable=too-many-public-methods, too-many-instance-attributes, too-many-locals, too-many-arguments class OrgOps: """Organization API operations""" @@ -156,13 +157,15 @@ def set_ops( profile_ops: ProfileOps, coll_ops: CollectionOps, background_job_ops: BackgroundJobOps, + page_ops: PageOps, ) -> None: - """Set base crawl ops""" + """Set additional ops classes""" # pylint: disable=attribute-defined-outside-init self.base_crawl_ops = base_crawl_ops self.profile_ops = profile_ops self.coll_ops = coll_ops self.background_job_ops = background_job_ops + self.page_ops = page_ops def set_default_primary_storage(self, storage: StorageRef): """set default primary storage""" @@ -944,6 +947,9 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]: crawl_page_count = 0 upload_page_count = 0 + crawl_ids = [] + upload_ids = [] + async for item_data in self.crawls_db.find({"oid": org.id}): item = BaseCrawl.from_dict(item_data) if item.state not in SUCCESSFUL_STATES: @@ -952,12 +958,22 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]: if item.type == "crawl": crawl_count += 1 crawl_page_count += item.pageCount or 0 + crawl_ids.append(item.id) if item.type == "upload": upload_count += 1 upload_page_count += item.pageCount or 0 + upload_ids.append(item.id) if item.pageCount: page_count += item.pageCount + all_archived_item_ids = crawl_ids + upload_ids + + unique_page_count = await self.page_ops.get_unique_page_count( + all_archived_item_ids + ) + crawl_unique_page_count = await self.page_ops.get_unique_page_count(crawl_ids) + upload_unique_page_count = await self.page_ops.get_unique_page_count(upload_ids) + profile_count = await self.profiles_db.count_documents({"oid": org.id}) workflows_running_count = await self.crawls_db.count_documents( {"oid": org.id, "state": {"$in": RUNNING_STATES}} @@ -982,6 +998,9 @@ async def get_org_metrics(self, org: Organization) -> dict[str, int]: "pageCount": page_count, "crawlPageCount": crawl_page_count, "uploadPageCount": upload_page_count, + "uniquePageCount": unique_page_count, + "crawlUniquePageCount": crawl_unique_page_count, + "uploadUniquePageCount": upload_unique_page_count, "profileCount": profile_count, "workflowsRunningCount": workflows_running_count, "maxConcurrentCrawls": max_concurrent_crawls, @@ -1311,11 +1330,15 @@ async def import_org( await self.pages_db.insert_one(PageWithAllQA.from_dict(page).to_dict()) # collections - for collection in org_data.get("collections", []): - collection = json_stream.to_standard_types(collection) - if not collection.get("slug"): - collection["slug"] = slug_from_name(collection["name"]) - await self.colls_db.insert_one(Collection.from_dict(collection).to_dict()) + for coll_raw in org_data.get("collections", []): + coll_raw = json_stream.to_standard_types(coll_raw) + + if not coll_raw.get("slug"): + coll_raw["slug"] = slug_from_name(coll_raw["name"]) + + collection = Collection.from_dict(coll_raw) + await self.colls_db.insert_one(collection.to_dict()) + await self.coll_ops.update_collection_counts_and_tags(collection.id) async def delete_org_and_data( self, org: Organization, user_manager: UserManager diff --git a/backend/btrixcloud/pages.py b/backend/btrixcloud/pages.py index f796d94a49..4149a3e9d3 100644 --- a/backend/btrixcloud/pages.py +++ b/backend/btrixcloud/pages.py @@ -92,7 +92,7 @@ async def add_crawl_pages_to_db_from_wacz(self, crawl_id: str, batch_size=100): if pages_buffer: await self._add_pages_to_db(crawl_id, pages_buffer) - await self.set_archived_item_page_count(crawl_id) + await self.set_archived_item_page_counts(crawl_id) print(f"Added pages for crawl {crawl_id} to db", flush=True) # pylint: disable=broad-exception-caught, raise-missing-from @@ -663,12 +663,22 @@ def get_crawl_type_from_pages_route(self, request: Request): return crawl_type - async def set_archived_item_page_count(self, crawl_id: str): - """Store archived item page count in crawl document""" + async def get_unique_page_count(self, crawl_ids: List[str]) -> int: + """Get count of unique page URLs across list of archived items""" + unique_pages = await self.pages.distinct( + "url", {"crawl_id": {"$in": crawl_ids}} + ) + return len(unique_pages) or 0 + + async def set_archived_item_page_counts(self, crawl_id: str): + """Store archived item page and unique page counts in crawl document""" _, page_count = await self.list_pages(crawl_id) + unique_page_count = await self.get_unique_page_count([crawl_id]) + await self.crawls.find_one_and_update( - {"_id": crawl_id}, {"$set": {"pageCount": page_count}} + {"_id": crawl_id}, + {"$set": {"uniquePageCount": unique_page_count, "pageCount": page_count}}, ) diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py index 751b554ffe..e219134b95 100644 --- a/backend/test/test_collections.py +++ b/backend/test/test_collections.py @@ -82,6 +82,7 @@ def test_create_collection( assert data["caption"] == CAPTION assert data["crawlCount"] == 1 assert data["pageCount"] > 0 + assert data["uniquePageCount"] > 0 assert data["totalSize"] > 0 modified = data["modified"] assert modified @@ -181,6 +182,7 @@ def test_update_collection( assert data["caption"] == UPDATED_CAPTION assert data["crawlCount"] == 1 assert data["pageCount"] > 0 + assert data["uniquePageCount"] > 0 assert data["totalSize"] > 0 global modified modified = data["modified"] @@ -270,6 +272,7 @@ def test_add_remove_crawl_from_collection( assert data["id"] == _coll_id assert data["crawlCount"] == 2 assert data["pageCount"] > 0 + assert data["uniquePageCount"] > 0 assert data["totalSize"] > 0 assert data["modified"] >= modified assert data["tags"] == ["wr-test-2", "wr-test-1"] @@ -294,6 +297,7 @@ def test_add_remove_crawl_from_collection( assert data["id"] == _coll_id assert data["crawlCount"] == 0 assert data["pageCount"] == 0 + assert data["uniquePageCount"] == 0 assert data["totalSize"] == 0 assert data["modified"] >= modified assert data.get("tags", []) == [] @@ -324,6 +328,7 @@ def test_add_remove_crawl_from_collection( assert data["id"] == _coll_id assert data["crawlCount"] == 2 assert data["pageCount"] > 0 + assert data["uniquePageCount"] > 0 assert data["totalSize"] > 0 assert data["modified"] >= modified assert data["tags"] == ["wr-test-2", "wr-test-1"] @@ -346,6 +351,7 @@ def test_get_collection(crawler_auth_headers, default_org_id): assert data["caption"] == UPDATED_CAPTION assert data["crawlCount"] == 2 assert data["pageCount"] > 0 + assert data["uniquePageCount"] > 0 assert data["totalSize"] > 0 assert data["modified"] >= modified assert data["tags"] == ["wr-test-2", "wr-test-1"] @@ -369,6 +375,7 @@ def test_get_collection_replay(crawler_auth_headers, default_org_id): assert data["caption"] == UPDATED_CAPTION assert data["crawlCount"] == 2 assert data["pageCount"] > 0 + assert data["uniquePageCount"] > 0 assert data["totalSize"] > 0 assert data["modified"] >= modified assert data["tags"] == ["wr-test-2", "wr-test-1"] @@ -488,6 +495,7 @@ def test_add_upload_to_collection(crawler_auth_headers, default_org_id): assert data["id"] == _coll_id assert data["crawlCount"] == 3 assert data["pageCount"] > 0 + assert data["uniquePageCount"] > 0 assert data["totalSize"] > 0 assert data["modified"] assert data["tags"] == ["wr-test-2", "wr-test-1"] @@ -548,6 +556,7 @@ def test_list_collections( assert first_coll["caption"] == UPDATED_CAPTION assert first_coll["crawlCount"] == 3 assert first_coll["pageCount"] > 0 + assert first_coll["uniquePageCount"] > 0 assert first_coll["totalSize"] > 0 assert first_coll["modified"] assert first_coll["tags"] == ["wr-test-2", "wr-test-1"] @@ -564,6 +573,7 @@ def test_list_collections( assert second_coll.get("description") is None assert second_coll["crawlCount"] == 1 assert second_coll["pageCount"] > 0 + assert second_coll["uniquePageCount"] > 0 assert second_coll["totalSize"] > 0 assert second_coll["modified"] assert second_coll["tags"] == ["wr-test-2"] @@ -584,6 +594,7 @@ def test_remove_upload_from_collection(crawler_auth_headers, default_org_id): assert data["id"] == _coll_id assert data["crawlCount"] == 2 assert data["pageCount"] > 0 + assert data["uniquePageCount"] > 0 assert data["totalSize"] > 0 assert data["modified"] >= modified assert data.get("tags") == ["wr-test-2", "wr-test-1"] @@ -914,6 +925,7 @@ def test_list_public_collections( assert collection["dateLatest"] assert collection["crawlCount"] > 0 assert collection["pageCount"] > 0 + assert collection["uniquePageCount"] > 0 assert collection["totalSize"] > 0 # Test non-existing slug - it should return a 404 but not reveal @@ -1072,13 +1084,7 @@ def test_list_public_colls_home_url_thumbnail(): # Check we get expected data for each public collection # and nothing we don't expect non_public_fields = ( - "oid", - "modified", - "crawlCount", - "pageCount", - "totalSize", "tags", - "access", "homeUrlPageId", ) non_public_image_fields = ("originalFilename", "userid", "userName", "created") @@ -1100,9 +1106,10 @@ def test_list_public_colls_home_url_thumbnail(): assert coll["dateLatest"] assert coll["crawlCount"] > 0 assert coll["pageCount"] > 0 + assert coll["uniquePageCount"] > 0 assert coll["totalSize"] > 0 - for field in NON_PUBLIC_COLL_FIELDS: + for field in non_public_fields: assert field not in coll if coll["id"] == _public_coll_id: @@ -1122,7 +1129,7 @@ def test_list_public_colls_home_url_thumbnail(): assert thumbnail["size"] assert thumbnail["mime"] - for field in NON_PUBLIC_IMAGE_FIELDS: + for field in non_public_image_fields: assert field not in thumbnail if coll["id"] == _second_public_coll_id: @@ -1150,6 +1157,7 @@ def test_get_public_collection(default_org_id): assert coll["dateLatest"] assert coll["crawlCount"] > 0 assert coll["pageCount"] > 0 + assert coll["uniquePageCount"] > 0 assert coll["totalSize"] > 0 for field in NON_PUBLIC_COLL_FIELDS: @@ -1229,6 +1237,7 @@ def test_get_public_collection_unlisted(crawler_auth_headers, default_org_id): assert coll["dateLatest"] assert coll["crawlCount"] > 0 assert coll["pageCount"] > 0 + assert coll["uniquePageCount"] > 0 assert coll["totalSize"] > 0 assert coll["defaultThumbnailName"] == "orange-default.avif" assert coll["allowPublicDownload"] @@ -1270,6 +1279,7 @@ def test_get_public_collection_unlisted_org_profile_disabled( assert coll["dateLatest"] assert coll["crawlCount"] > 0 assert coll["pageCount"] > 0 + assert coll["uniquePageCount"] > 0 assert coll["totalSize"] > 0 assert coll["defaultThumbnailName"] == "orange-default.avif" assert coll["allowPublicDownload"] diff --git a/backend/test/test_org.py b/backend/test/test_org.py index 57c0b8fcce..a5e3a4cf48 100644 --- a/backend/test/test_org.py +++ b/backend/test/test_org.py @@ -569,6 +569,9 @@ def test_org_metrics(crawler_auth_headers, default_org_id): assert data["uploadCount"] >= 0 assert data["archivedItemCount"] == data["crawlCount"] + data["uploadCount"] assert data["pageCount"] > 0 + assert data["uniquePageCount"] > 0 + assert data["crawlUniquePageCount"] > 0 + assert data["uploadUniquePageCount"] >= 0 assert data["profileCount"] >= 0 assert data["workflowsRunningCount"] >= 0 assert data["workflowsQueuedCount"] >= 0 diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py index 26f5574681..20318a1638 100644 --- a/backend/test/test_run_crawl.py +++ b/backend/test/test_run_crawl.py @@ -877,13 +877,15 @@ def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_ ) assert r.status_code == 403 - # Check that pageCount was stored on crawl + # Check that pageCount and uniquePageCount were stored on crawl r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}", headers=crawler_auth_headers, ) assert r.status_code == 200 - assert r.json()["pageCount"] > 0 + data = r.json() + assert data["pageCount"] > 0 + assert data["uniquePageCount"] > 0 def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id): diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py index 5a55b36c5c..ab3e816561 100644 --- a/backend/test/test_uploads.py +++ b/backend/test/test_uploads.py @@ -274,13 +274,15 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id): assert page.get("modified") is None assert page.get("approved") is None - # Check that pageCount was stored on upload + # Check that pageCount and uniquePageCount stored on upload r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}", headers=admin_auth_headers, ) assert r.status_code == 200 - assert r.json()["pageCount"] > 0 + data = r.json() + assert data["pageCount"] > 0 + assert data["uniquePageCount"] > 0 def test_replace_upload( diff --git a/frontend/src/features/collections/collection-metadata-dialog.ts b/frontend/src/features/collections/collection-metadata-dialog.ts index fcbb6b7da4..160ce1e468 100644 --- a/frontend/src/features/collections/collection-metadata-dialog.ts +++ b/frontend/src/features/collections/collection-metadata-dialog.ts @@ -135,12 +135,13 @@ export class CollectionMetadataDialog extends BtrixElement { ${msg( - "Write a short description that summarizes this collection. If the collection is public, this description will be visible next to the collection name.", + "Write a short description that summarizes this collection. If the collection is shareable, this will appear next to the collection name.", )} ${this.collection ? nothing : msg( - "You can write a longer description in the 'About' section after creating the collection.", + html`You can add a longer description in the “About” + section after creating the collection.`, )} ${msg("Name")}
  • ${msg("Summary")}
  • ${msg("About")}
  • -
  • ${msg("Collection Period")}
  • -
  • ${msg("Total Pages")}
  • +
  • ${metadata.dateLatest}
  • +
  • ${metadata.uniquePageCount}
  • +
  • ${metadata.pageCount}
  • ${msg("Collection Size")}
  • diff --git a/frontend/src/features/collections/select-collection-start-page.ts b/frontend/src/features/collections/select-collection-start-page.ts index 2515fcf6af..f6bd33d290 100644 --- a/frontend/src/features/collections/select-collection-start-page.ts +++ b/frontend/src/features/collections/select-collection-start-page.ts @@ -143,10 +143,10 @@ export class SelectCollectionStartPage extends BtrixElement {
    ${this.renderPageSearch()} TemplateResult | string; + }) { + return html` + + ${when( + collection, + render, + () => html``, + )} + + `; + }; +} + +export function metadataColumn(collection?: Collection | PublicCollection) { + const metadataItem = metadataItemWithCollection(collection); + + return html` + + ${metadataItem({ + label: metadata.dateLatest, + render: (col) => html` + + ${monthYearDateRange(col.dateEarliest, col.dateLatest)} + + `, + })} + ${metadataItem({ + label: metadata.uniquePageCount, + render: (col) => + `${localize.number(col.uniquePageCount)} ${pluralOf("pages", col.uniquePageCount)}`, + })} + ${metadataItem({ + label: metadata.pageCount, + render: (col) => + `${localize.number(col.pageCount)} ${pluralOf("pages", col.pageCount)}`, + })} + + `; +} diff --git a/frontend/src/pages/collections/collection.ts b/frontend/src/pages/collections/collection.ts index b7d6b24314..5d51379e83 100644 --- a/frontend/src/pages/collections/collection.ts +++ b/frontend/src/pages/collections/collection.ts @@ -1,4 +1,4 @@ -import { localized, msg, str } from "@lit/localize"; +import { localized, msg } from "@lit/localize"; import { Task, TaskStatus } from "@lit/task"; import { html, type TemplateResult } from "lit"; import { customElement, property } from "lit/decorators.js"; @@ -6,6 +6,7 @@ import { ifDefined } from "lit/directives/if-defined.js"; import { when } from "lit/directives/when.js"; import { BtrixElement } from "@/classes/BtrixElement"; +import { metadataColumn } from "@/layouts/collections/metadataColumn"; import { page } from "@/layouts/page"; import { RouteNamespace } from "@/routes"; import type { PublicCollection } from "@/types/collection"; @@ -211,39 +212,8 @@ export class Collection extends BtrixElement { `; } - // TODO Consolidate with collection-detail.ts private renderAbout(collection: PublicCollection) { - const dateRange = () => { - if (!collection.dateEarliest || !collection.dateLatest) { - return msg("n/a"); - } - const format: Intl.DateTimeFormatOptions = { - month: "long", - year: "numeric", - }; - const dateEarliest = this.localize.date(collection.dateEarliest, format); - const dateLatest = this.localize.date(collection.dateLatest, format); - - if (dateEarliest === dateLatest) return dateLatest; - - return msg(str`${dateEarliest} to ${dateLatest}`, { - desc: "Date range formatted to show full month name and year", - }); - }; - - const metadata = html` - - - ${dateRange()} - - - ${this.localize.number(collection.pageCount)} - - - ${this.localize.bytes(collection.totalSize)} - - - `; + const metadata = metadataColumn(collection); if (collection.description) { return html` diff --git a/frontend/src/pages/org/archived-item-qa/archived-item-qa.ts b/frontend/src/pages/org/archived-item-qa/archived-item-qa.ts index 5d5ea682e3..a2192d309b 100644 --- a/frontend/src/pages/org/archived-item-qa/archived-item-qa.ts +++ b/frontend/src/pages/org/archived-item-qa/archived-item-qa.ts @@ -38,6 +38,7 @@ import type { } from "@/types/api"; import type { ArchivedItem, ArchivedItemPageComment } from "@/types/crawler"; import type { ArchivedItemQAPage, QARun } from "@/types/qa"; +import { SortDirection as APISortDirection } from "@/types/utils"; import { isActive, isSuccessfullyFinished, @@ -553,7 +554,8 @@ export class ArchivedItemQA extends BtrixElement { .pages=${this.pages} .orderBy=${{ field: this.sortPagesBy.sortBy, - direction: (this.sortPagesBy.sortDirection === -1 + direction: (this.sortPagesBy.sortDirection === + APISortDirection.Descending ? "desc" : "asc") as SortDirection, }} diff --git a/frontend/src/pages/org/collection-detail.ts b/frontend/src/pages/org/collection-detail.ts index 9073a47284..5678d2c8b5 100644 --- a/frontend/src/pages/org/collection-detail.ts +++ b/frontend/src/pages/org/collection-detail.ts @@ -14,13 +14,21 @@ import type { MarkdownEditor } from "@/components/ui/markdown-editor"; import type { PageChangeEvent } from "@/components/ui/pagination"; import { SelectCollectionAccess } from "@/features/collections/select-collection-access"; import type { ShareCollection } from "@/features/collections/share-collection"; +import { + metadataColumn, + metadataItemWithCollection, +} from "@/layouts/collections/metadataColumn"; import { pageHeader, pageNav, type Breadcrumb } from "@/layouts/pageHeader"; import type { APIPaginatedList, APIPaginationQuery, APISortQuery, } from "@/types/api"; -import { CollectionAccess, type Collection } from "@/types/collection"; +import { + CollectionAccess, + type Collection, + type PublicCollection, +} from "@/types/collection"; import type { ArchivedItem, Crawl, Upload } from "@/types/crawler"; import type { CrawlState } from "@/types/crawlState"; import { pluralOf } from "@/utils/pluralize"; @@ -43,7 +51,7 @@ export class CollectionDetail extends BtrixElement { collectionId!: string; @property({ type: String }) - collectionTab: Tab = Tab.Replay; + collectionTab: Tab | null = Tab.Replay; @state() private collection?: Collection; @@ -105,6 +113,9 @@ export class CollectionDetail extends BtrixElement { void this.fetchCollection(); void this.fetchArchivedItems({ page: 1 }); } + if (changedProperties.has("collectionTab") && this.collectionTab === null) { + this.collectionTab = Tab.Replay; + } } protected async updated( @@ -472,11 +483,6 @@ export class CollectionDetail extends BtrixElement { (col) => `${this.localize.number(col.crawlCount)} ${pluralOf("items", col.crawlCount)}`, )} - ${this.renderDetailItem(msg("Total Size"), (col) => - this.localize.bytes(col.totalSize || 0, { - unitDisplay: "narrow", - }), - )} ${this.renderDetailItem( msg("Total Pages"), (col) => @@ -495,12 +501,13 @@ export class CollectionDetail extends BtrixElement { year="numeric" hour="numeric" minute="numeric" + time-zone-name="short" >`, ) : nothing, )} ${this.renderDetailItem( - msg("Last Updated"), + msg("Last Modified"), (col) => html``, )} @@ -517,67 +525,58 @@ export class CollectionDetail extends BtrixElement { private renderDetailItem( label: string | TemplateResult, - renderContent: (collection: Collection) => TemplateResult | string, + renderContent: (collection: PublicCollection) => TemplateResult | string, ) { - return html` - - ${when( - this.collection, - () => renderContent(this.collection!), - () => html``, - )} - - `; + return metadataItemWithCollection(this.collection)({ + label, + render: renderContent, + }); } - // TODO Consolidate with collection.ts private renderAbout() { - const dateRange = (collection: Collection) => { - if (!collection.dateEarliest || !collection.dateLatest) { - return msg("n/a"); - } - const format: Intl.DateTimeFormatOptions = { - month: "long", - year: "numeric", - }; - const dateEarliest = this.localize.date(collection.dateEarliest, format); - const dateLatest = this.localize.date(collection.dateLatest, format); - - if (dateEarliest === dateLatest) return dateLatest; - - return msg(str`${dateEarliest} to ${dateLatest}`, { - desc: "Date range formatted to show full month name and year", - }); - }; - const skeleton = html``; - - const metadata = html` - - - ${this.collection ? dateRange(this.collection) : skeleton} - - - `; + const metadata = metadataColumn(this.collection); return html`
    -
    -

    - ${msg("Description")} -

    +
    +
    +

    + ${msg("About This Collection")} +

    + +
    +

    + ${msg( + html`Describe your collection in long-form rich text (e.g. + bold and italicized text.)`, + )} +

    +

    + ${msg( + html`If this collection is shareable, this will appear in + the “About This Collection” section of the shared + collection.`, + )} +

    +
    + +
    +
    ${when( this.collection?.description && !this.isEditingDescription, () => html` - (this.isEditingDescription = true)} - > - - ${msg("Edit Description")} - + + (this.isEditingDescription = true)} + > + + `, )}
    @@ -602,7 +601,7 @@ export class CollectionDetail extends BtrixElement { ` : html`
    -

    +

    ${msg("No description provided.")}

    = { - modified: { - label: msg("Last Updated"), - defaultDirection: "desc", - }, name: { label: msg("Name"), - defaultDirection: "asc", + defaultDirection: SortDirection.Ascending, + }, + dateLatest: { + label: metadata.dateLatest, + defaultDirection: SortDirection.Descending, + }, + crawlCount: { + label: msg("Archived Items"), + defaultDirection: SortDirection.Descending, + }, + pageCount: { + label: msg("Total Pages"), + defaultDirection: SortDirection.Descending, }, totalSize: { label: msg("Size"), - defaultDirection: "desc", + defaultDirection: SortDirection.Descending, + }, + modified: { + label: msg("Last Modified"), + defaultDirection: SortDirection.Descending, }, }; const MIN_SEARCH_LENGTH = 2; @@ -269,7 +288,7 @@ export class CollectionsList extends BtrixElement { @click=${() => { this.orderBy = { ...this.orderBy, - direction: this.orderBy.direction === "asc" ? "desc" : "asc", + direction: -1 * this.orderBy.direction, }; }} > @@ -363,24 +382,24 @@ export class CollectionsList extends BtrixElement { return html` ${msg("Collection Access")} - ${msg("Name")} - ${msg("Archived Items")} + ${msg(html`Name & Collection Period`)} - ${msg("Total Size")} - - - ${msg("Total Pages")} + ${msg("Archived Items")} + ${msg("Total Pages")} + ${msg("Size")} - ${msg("Last Updated")} + ${msg("Last Modified")} ${msg("Row Actions")} @@ -514,30 +533,31 @@ export class CollectionsList extends BtrixElement { href=${`${this.navigate.orgBasePath}/collections/view/${col.id}`} @click=${this.navigate.link} > - ${col.name} +
    ${col.name}
    +
    + ${monthYearDateRange(col.dateEarliest, col.dateLatest)} +
    ${this.localize.number(col.crawlCount, { notation: "compact" })} ${pluralOf("items", col.crawlCount)} + + ${this.localize.number(col.pageCount, { notation: "compact" })} + ${pluralOf("pages", col.pageCount)} + ${this.localize.bytes(col.totalSize || 0, { unitDisplay: "narrow", })} - - ${this.localize.number(col.pageCount, { notation: "compact" })} - ${pluralOf("pages", col.pageCount)} - @@ -783,7 +803,7 @@ export class CollectionsList extends BtrixElement { this.collections?.pageSize || INITIAL_PAGE_SIZE, sortBy: this.orderBy.field, - sortDirection: this.orderBy.direction === "desc" ? -1 : 1, + sortDirection: this.orderBy.direction, }, { arrayFormat: "comma", diff --git a/frontend/src/pages/org/dashboard.ts b/frontend/src/pages/org/dashboard.ts index 56d17f8b39..2d3b06f762 100644 --- a/frontend/src/pages/org/dashboard.ts +++ b/frontend/src/pages/org/dashboard.ts @@ -1,10 +1,11 @@ import { localized, msg } from "@lit/localize"; import { Task } from "@lit/task"; import type { SlSelectEvent } from "@shoelace-style/shoelace"; -import { html, type PropertyValues, type TemplateResult } from "lit"; +import { html, nothing, type PropertyValues, type TemplateResult } from "lit"; import { customElement, property, state } from "lit/decorators.js"; import { ifDefined } from "lit/directives/if-defined.js"; import { when } from "lit/directives/when.js"; +import queryString from "query-string"; import type { SelectNewDialogEvent } from "."; @@ -13,8 +14,9 @@ import { ClipboardController } from "@/controllers/clipboard"; import { pageHeading } from "@/layouts/page"; import { pageHeader } from "@/layouts/pageHeader"; import { RouteNamespace } from "@/routes"; -import type { PublicCollection } from "@/types/collection"; -import type { PublicOrgCollections } from "@/types/org"; +import type { APIPaginatedList, APISortQuery } from "@/types/api"; +import { CollectionAccess, type Collection } from "@/types/collection"; +import { SortDirection } from "@/types/utils"; import { humanizeExecutionSeconds } from "@/utils/executionTimeFormatter"; import { tw } from "@/utils/tailwind"; @@ -56,16 +58,13 @@ export class Dashboard extends BtrixElement { }; private readonly publicCollections = new Task(this, { - task: async ([slug, metrics]) => { - if (!slug) throw new Error("slug required"); + task: async ([orgId]) => { + if (!orgId) throw new Error("orgId required"); - if (!metrics) return undefined; - if (!metrics.publicCollectionsCount) return []; - - const collections = await this.fetchCollections({ slug }); + const collections = await this.getPublicCollections({ orgId }); return collections; }, - args: () => [this.orgSlugState, this.metrics] as const, + args: () => [this.orgId] as const, }); willUpdate(changedProperties: PropertyValues & Map) { @@ -334,15 +333,17 @@ export class Dashboard extends BtrixElement { ${msg("Copy Link to Profile")} ` - : html` - - - - ${msg("Update Org Visibility")} - - `, + : this.appState.isAdmin + ? html` + + + + ${msg("Update Org Profile")} + + ` + : nothing, )} @@ -368,29 +369,16 @@ export class Dashboard extends BtrixElement { let button: TemplateResult; if (this.metrics.collectionsCount) { - if (this.org.enablePublicProfile) { - button = html` - { - this.navigate.to(`${this.navigate.orgBasePath}/collections`); - }} - > - - ${msg("Manage Collections")} - - `; - } else { - button = html` - { - this.navigate.to(`${this.navigate.orgBasePath}/settings`); - }} - > - - ${msg("Update Org Visibility")} - - `; - } + button = html` + { + this.navigate.to(`${this.navigate.orgBasePath}/collections`); + }} + > + + ${msg("Manage Collections")} + + `; } else { button = html` { - const resp = await fetch(`/api/public/orgs/${slug}/collections`, { - headers: { "Content-Type": "application/json" }, - }); - - switch (resp.status) { - case 200: - return ((await resp.json()) as PublicOrgCollections).collections; - case 404: - return []; - default: - throw resp.status; - } + private async getPublicCollections({ orgId }: { orgId: string }) { + const params: APISortQuery & { + access: CollectionAccess; + } = { + sortBy: "dateLatest", + sortDirection: SortDirection.Descending, + access: CollectionAccess.Public, + }; + const query = queryString.stringify(params); + + const data = await this.api.fetch>( + `/orgs/${orgId}/collections?${query}`, + ); + + return data.items; } } diff --git a/frontend/src/pages/org/profile.ts b/frontend/src/pages/org/profile.ts index fdcbb73a10..7dc9936fb0 100644 --- a/frontend/src/pages/org/profile.ts +++ b/frontend/src/pages/org/profile.ts @@ -3,10 +3,14 @@ import { Task } from "@lit/task"; import { html, nothing } from "lit"; import { customElement, property, state } from "lit/decorators.js"; import { when } from "lit/directives/when.js"; +import queryString from "query-string"; import { BtrixElement } from "@/classes/BtrixElement"; import { page, pageHeading } from "@/layouts/page"; +import type { APIPaginatedList, APISortQuery } from "@/types/api"; +import { CollectionAccess, type Collection } from "@/types/collection"; import type { OrgData, PublicOrgCollections } from "@/types/org"; +import { SortDirection } from "@/types/utils"; @localized() @customElement("btrix-org-profile") @@ -242,7 +246,13 @@ export class OrgProfile extends BtrixElement { }: { slug: string; }): Promise { - const resp = await fetch(`/api/public/orgs/${slug}/collections`, { + const params: APISortQuery = { + sortBy: "dateLatest", + sortDirection: SortDirection.Descending, + }; + const query = queryString.stringify(params); + + const resp = await fetch(`/api/public/orgs/${slug}/collections?${query}`, { headers: { "Content-Type": "application/json" }, }); @@ -277,6 +287,9 @@ export class OrgProfile extends BtrixElement { } const org = await this.api.fetch(`/orgs/${userOrg.id}`); + const collections = await this.getUserPublicCollections({ + orgId: this.orgId, + }); return { org: { @@ -285,10 +298,27 @@ export class OrgProfile extends BtrixElement { url: org.publicUrl || "", verified: false, // TODO }, - collections: [], // TODO + collections, }; } catch { return null; } } + + private async getUserPublicCollections({ orgId }: { orgId: string }) { + const params: APISortQuery & { + access: CollectionAccess; + } = { + sortBy: "dateLatest", + sortDirection: SortDirection.Descending, + access: CollectionAccess.Public, + }; + const query = queryString.stringify(params); + + const data = await this.api.fetch>( + `/orgs/${orgId}/collections?${query}`, + ); + + return data.items; + } } diff --git a/frontend/src/strings/collections/metadata.ts b/frontend/src/strings/collections/metadata.ts new file mode 100644 index 0000000000..1f7d906ec6 --- /dev/null +++ b/frontend/src/strings/collections/metadata.ts @@ -0,0 +1,7 @@ +import { msg } from "@lit/localize"; + +export const metadata = { + dateLatest: msg("Collection Period"), + uniquePageCount: msg("Unique Pages in Collection"), + pageCount: msg("Total Pages Crawled"), +}; diff --git a/frontend/src/strings/ui.ts b/frontend/src/strings/ui.ts index 95c560639a..5a426397b6 100644 --- a/frontend/src/strings/ui.ts +++ b/frontend/src/strings/ui.ts @@ -1,6 +1,9 @@ import { msg } from "@lit/localize"; import { html, type TemplateResult } from "lit"; +export const noData = "--"; +export const notApplicable = msg("n/a"); + // TODO Refactor all generic confirmation messages to use utility export const deleteConfirmation = (name: string | TemplateResult) => msg(html` diff --git a/frontend/src/strings/utils.ts b/frontend/src/strings/utils.ts new file mode 100644 index 0000000000..46ad5e7a5c --- /dev/null +++ b/frontend/src/strings/utils.ts @@ -0,0 +1,25 @@ +import { msg, str } from "@lit/localize"; + +import { noData } from "@/strings/ui"; +import localize from "@/utils/localize"; + +export const monthYearDateRange = ( + startDate?: string | null, + endDate?: string | null, +): string => { + if (!startDate || !endDate) { + return noData; + } + const format: Intl.DateTimeFormatOptions = { + month: "long", + year: "numeric", + }; + const startMonthYear = localize.date(startDate, format); + const endMonthYear = localize.date(endDate, format); + + if (startMonthYear === endMonthYear) return endMonthYear; + + return msg(str`${startMonthYear} to ${endMonthYear}`, { + desc: "Date range formatted to show full month name and year", + }); +}; diff --git a/frontend/src/types/api.ts b/frontend/src/types/api.ts index b016944aad..173d406562 100644 --- a/frontend/src/types/api.ts +++ b/frontend/src/types/api.ts @@ -30,7 +30,7 @@ export type APIPaginationQuery = { pageSize?: number; }; -export type APISortQuery = { - sortBy?: string; +export type APISortQuery> = { + sortBy?: keyof T; sortDirection?: SortDirection; }; diff --git a/frontend/src/types/collection.ts b/frontend/src/types/collection.ts index 1c0887bd52..4ec6f8fe6c 100644 --- a/frontend/src/types/collection.ts +++ b/frontend/src/types/collection.ts @@ -11,6 +11,8 @@ export const publicCollectionSchema = z.object({ slug: z.string(), oid: z.string(), name: z.string(), + created: z.string().datetime(), + modified: z.string().datetime(), caption: z.string().nullable(), description: z.string().nullable(), resources: z.array(z.string()), @@ -24,6 +26,7 @@ export const publicCollectionSchema = z.object({ .nullable(), defaultThumbnailName: z.string().nullable(), crawlCount: z.number(), + uniquePageCount: z.number(), pageCount: z.number(), totalSize: z.number(), allowPublicDownload: z.boolean(), @@ -34,9 +37,6 @@ export const publicCollectionSchema = z.object({ export type PublicCollection = z.infer; export const collectionSchema = publicCollectionSchema.extend({ - id: z.string(), - created: z.string().datetime(), - modified: z.string().datetime(), tags: z.array(z.string()), access: z.nativeEnum(CollectionAccess), }); diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts index 95edaba426..4beb8c7ef0 100644 --- a/frontend/src/types/crawler.ts +++ b/frontend/src/types/crawler.ts @@ -165,6 +165,7 @@ type ArchivedItemBase = { lastQAState: CrawlState | null; lastQAStarted: string | null; pageCount?: number; + uniquePageCount?: number; filePageCount?: number; errorPageCount?: number; }; diff --git a/frontend/src/types/utils.ts b/frontend/src/types/utils.ts index d65cc73589..3d15b69317 100644 --- a/frontend/src/types/utils.ts +++ b/frontend/src/types/utils.ts @@ -22,5 +22,7 @@ export type Range = Exclude< Enumerate >; -/** 1 or -1, but will accept any number for easier typing where this is used **/ -export type SortDirection = -1 | 1 | (number & {}); +export enum SortDirection { + Descending = -1, + Ascending = 1, +}