refactor: store collections outside of tags index
pomegranited committed Sep 10, 2024
1 parent 436822e commit d517d80
Showing 5 changed files with 102 additions and 47 deletions.
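In short, this commit moves a block's collection keys out of the nested tags object and into a top-level collections field on each search document, updating the index configuration, indexing helpers, event handlers, and tests to match. For illustration only, a rough before/after sketch of a single indexed document; the id and field values below are made up, and real documents carry many more fields:

# Before this commit: collection keys were indexed as a sub-field of "tags".
doc_before = {
    "id": "lbabcd1234",  # hypothetical Meilisearch primary key
    "tags": {
        "collections": ["COL_A", "COL_B"],
        "taxonomy": ["Difficulty"],
        "level0": ["Difficulty > Hard"],
    },
}

# After this commit: "collections" is a sibling of "tags", holding collection.key strings.
doc_after = {
    "id": "lbabcd1234",
    "tags": {
        "taxonomy": ["Difficulty"],
        "level0": ["Difficulty > Hard"],
    },
    "collections": ["COL_A", "COL_B"],
}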
15 changes: 13 additions & 2 deletions openedx/core/djangoapps/content/search/api.py
@@ -34,6 +34,7 @@
searchable_doc_for_course_block,
searchable_doc_for_collection,
searchable_doc_for_library_block,
searchable_doc_collections,
searchable_doc_tags,
)

@@ -317,12 +318,12 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
Fields.context_key,
Fields.org,
Fields.tags,
Fields.tags + "." + Fields.tags_collections,
Fields.tags + "." + Fields.tags_taxonomy,
Fields.tags + "." + Fields.tags_level0,
Fields.tags + "." + Fields.tags_level1,
Fields.tags + "." + Fields.tags_level2,
Fields.tags + "." + Fields.tags_level3,
Fields.collections,
Fields.type,
Fields.access_id,
Fields.last_published,
@@ -336,11 +337,11 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
Fields.content,
Fields.tags,
Fields.description,
Fields.collections,
# If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they
# are searchable only if at least one document in the index has a value. If we didn't list them here and,
# say, there were no tags.level3 tags in the index, the client would get an error if trying to search for
# these sub-fields: "Attribute `tags.level3` is not searchable."
Fields.tags + "." + Fields.tags_collections,
Fields.tags + "." + Fields.tags_taxonomy,
Fields.tags + "." + Fields.tags_level0,
Fields.tags + "." + Fields.tags_level1,
@@ -377,6 +378,7 @@ def index_library(lib_key: str) -> list:
doc = {}
doc.update(searchable_doc_for_library_block(metadata))
doc.update(searchable_doc_tags(metadata.usage_key))
doc.update(searchable_doc_collections(metadata.usage_key))
docs.append(doc)
except Exception as err: # pylint: disable=broad-except
status_cb(f"Error indexing library component {component}: {err}")
@@ -573,6 +575,15 @@ def upsert_block_tags_index_docs(usage_key: UsageKey):
_update_index_docs([doc])


def upsert_block_collections_index_docs(usage_key: UsageKey):
"""
Updates the collections data in documents for the given Course/Library block
"""
doc = {Fields.id: meili_id_from_opaque_key(usage_key)}
doc.update(searchable_doc_collections(usage_key))
_update_index_docs([doc])


def _get_user_orgs(request: Request) -> list[str]:
"""
Get the org.short_names for the organizations that the requesting user has OrgStaffRole or OrgInstructorRole.
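Since Fields.collections now appears in these attribute lists on its own, a search client filters on the top-level field rather than on the old tags.collections sub-field. The snippet below is a hedged sketch using the Meilisearch Python client; the server URL, API key, index name, query, and collection key are all assumptions for illustration, not values taken from this codebase:

import meilisearch

client = meilisearch.Client("http://localhost:7700", "masterKey")  # assumed dev values
index = client.index("studio_content")  # hypothetical index name

# Before this commit, a client would have filtered on the nested sub-field:
#   index.search("matter", {"filter": 'tags.collections = "COL_A"'})
# After this commit, the filter targets the top-level field instead:
results = index.search("matter", {"filter": 'collections = "COL_A"'})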
53 changes: 43 additions & 10 deletions openedx/core/djangoapps/content/search/documents.py
@@ -54,7 +54,8 @@ class Fields:
tags_level1 = "level1"
tags_level2 = "level2"
tags_level3 = "level3"
tags_collections = "collections" # subfield of tags, i.e. tags.collections
# List of collection.key strings this object belongs to.
collections = "collections"
# The "content" field is a dictionary of arbitrary data, depending on the block_type.
# It comes from each XBlock's index_dictionary() method (if present) plus some processing.
# Text (html) blocks have an "html_content" key in here, capa has "capa_content" and "problem_types", and so on.
@@ -167,11 +168,10 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict:
See the comments above on "Field.tags" for an explanation of the format.
e.g. for something tagged "Difficulty: Hard" and "Location: Vancouver",
and in Collections 3 and 4, this would return:
e.g. for something tagged "Difficulty: Hard" and "Location: Vancouver" this
would return:
{
"tags": {
"collections": ["Col_key1", "Col_key2"],
"taxonomy": ["Location", "Difficulty"],
"level0": ["Location > North America", "Difficulty > Hard"],
"level1": ["Location > North America > Canada"],
@@ -186,16 +186,21 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict:
strings in a particular format that the frontend knows how to render to
support hierarchical refinement by tag.
"""
result = {}

# Note that we could improve performance for indexing many components from the same library/course,
# if we used get_all_object_tags() to load all the tags for the library in a single query rather than loading the
# tags for each component separately.
all_tags = tagging_api.get_object_tags(str(object_id)).all()
if not all_tags:
# Clear out tags in the index when unselecting all tags for the block, otherwise
# it would remain the last value if a cleared Fields.tags field is not included
return {Fields.tags: {}}
result = {
Fields.tags_taxonomy: [],
Fields.tags_level0: [],
# ... other levels added as needed
}
for obj_tag in all_tags:
# Add the taxonomy name:
if Fields.tags_taxonomy not in result:
result[Fields.tags_taxonomy] = []
if obj_tag.taxonomy.name not in result[Fields.tags_taxonomy]:
result[Fields.tags_taxonomy].append(obj_tag.taxonomy.name)
# Taxonomy name plus each level of tags, in a list: # e.g. ["Location", "North America", "Canada", "Vancouver"]
@@ -219,7 +224,22 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict:
if len(parts) == level + 2:
break # We have all the levels for this tag now (e.g. parts=["Difficulty", "Hard"] -> need level0 only)

return {Fields.tags: result}


def _collections_for_content_object(object_id: UsageKey | LearningContextKey) -> dict:
"""
Given an XBlock, course, library, etc., get the collections for its index doc.
e.g. for something in Collections "COL_A" and "COL_B", this would return:
{
"collections": ["COL_A", "COL_B"],
}
Returns an empty dict if the object is not in any collections.
"""
# Gather the collections associated with this object
result = {}
collections = []
try:
component = lib_api.get_component_from_usage_key(object_id)
@@ -231,9 +251,9 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict:
log.warning(f"No component found for {object_id}")

if collections:
result[Fields.tags_collections] = list(collections)
result[Fields.collections] = list(collections)

return {Fields.tags: result}
return result


def searchable_doc_for_library_block(xblock_metadata: lib_api.LibraryXBlockMetadata) -> dict:
@@ -278,6 +298,19 @@ def searchable_doc_tags(usage_key: UsageKey) -> dict:
return doc


def searchable_doc_collections(usage_key: UsageKey) -> dict:
"""
Generate a dictionary document suitable for ingestion into a search engine
like Meilisearch or Elasticsearch, with the collections data for the given content object.
"""
doc = {
Fields.id: meili_id_from_opaque_key(usage_key),
}
doc.update(_collections_for_content_object(usage_key))

return doc


def searchable_doc_for_course_block(block) -> dict:
"""
Generate a dictionary document suitable for ingestion into a search engine
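searchable_doc_collections() mirrors searchable_doc_tags(): each returns a small partial document keyed by the Meilisearch id, which the indexer merges into the full block document (as index_library() does above) or sends on its own as a partial update. A rough sketch of such a partial document, with a made-up id and collection keys:

# Partial document produced for a block that belongs to collections COL_A and COL_B
# (illustrative values; a block in no collections yields just the "id" field).
partial_doc = {
    "id": "lbabcd1234",
    "collections": ["COL_A", "COL_B"],
}
# Because Meilisearch's update-documents call merges the supplied fields into the stored
# document, sending only "id" and "collections" refreshes the collections list without
# touching tags, content, or the other indexed fields.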
7 changes: 6 additions & 1 deletion openedx/core/djangoapps/content/search/handlers.py
@@ -163,4 +163,9 @@ def content_object_associations_changed_handler(**kwargs) -> None:
log.error("Received invalid content object id")
return

upsert_block_tags_index_docs(usage_key)
# This event's changes may contain both "tags" and "collections", but this will happen rarely, if ever.
# So we allow a potential double "upsert" here.
if "tags" in content_object.changes:
upsert_block_tags_index_docs(usage_key)
elif "collections" in content_object.changes:
upsert_block_collections_index_docs(usage_key)
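For context, a rough sketch of the two payload shapes this handler distinguishes; the ContentObject stand-in below is an assumption that only mirrors the two attributes used above (object_id and changes), not the real event data class from openedx-events:

from collections import namedtuple

# Hypothetical stand-in for the event payload; the real class is defined in openedx-events.
ContentObject = namedtuple("ContentObject", ["object_id", "changes"])

# A tagging change leads to upsert_block_tags_index_docs(usage_key) ...
tag_change = ContentObject(object_id="lb:DemoX:lib1:problem:p1", changes=["tags"])

# ... while a collections change leads to upsert_block_collections_index_docs(usage_key).
collection_change = ContentObject(object_id="lb:DemoX:lib1:problem:p1", changes=["collections"])

# If both ever arrive in a single event, both partial documents are upserted, which is the
# "double upsert" the comment above allows.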
72 changes: 39 additions & 33 deletions openedx/core/djangoapps/content/search/tests/test_api.py
@@ -175,7 +175,7 @@ def setUp(self):
tagging_api.add_tag_to_taxonomy(self.taxonomyB, "three")
tagging_api.add_tag_to_taxonomy(self.taxonomyB, "four")

# Create collections:
# Create a collection:
self.learning_package = authoring_api.get_learning_package_by_key(self.library.key)
with freeze_time(created_date):
self.collection = authoring_api.create_collection(
@@ -379,6 +379,40 @@ def test_index_library_block_tags(self, mock_meilisearch):
"""
Test indexing a Library Block with tags.
"""

# Tag XBlock (these internally call `upsert_block_tags_index_docs`)
tagging_api.tag_object(str(self.problem1.usage_key), self.taxonomyA, ["one", "two"])
tagging_api.tag_object(str(self.problem1.usage_key), self.taxonomyB, ["three", "four"])

# Build expected docs with tags at each stage
doc_problem_with_tags1 = {
"id": self.doc_problem1["id"],
"tags": {
'taxonomy': ['A'],
'level0': ['A > one', 'A > two']
}
}
doc_problem_with_tags2 = {
"id": self.doc_problem1["id"],
"tags": {
'taxonomy': ['A', 'B'],
'level0': ['A > one', 'A > two', 'B > four', 'B > three']
}
}

mock_meilisearch.return_value.index.return_value.update_documents.assert_has_calls(
[
call([doc_problem_with_tags1]),
call([doc_problem_with_tags2]),
],
any_order=True,
)

@override_settings(MEILISEARCH_ENABLED=True)
def test_index_library_block_collections(self, mock_meilisearch):
"""
Test indexing a Library Block that is in two collections.
"""
collection1 = authoring_api.create_collection(
learning_package_id=self.library.learning_package.id,
key="COL1",
@@ -395,11 +429,7 @@ def test_index_library_block_tags(self, mock_meilisearch):
description="Second Collection",
)

# Tag XBlock (these internally call `upsert_block_tags_index_docs`)
tagging_api.tag_object(str(self.problem1.usage_key), self.taxonomyA, ["one", "two"])
tagging_api.tag_object(str(self.problem1.usage_key), self.taxonomyB, ["three", "four"])

# Add Problem1 to both Collections (these internally call `upsert_block_tags_index_docs`)
# Add Problem1 to both Collections (these internally call `upsert_block_collections_index_docs`)
# (adding in reverse order to test sorting of collection keys)
for collection in (collection2, collection1):
library_api.update_library_collection_components(
@@ -410,42 +440,18 @@ def test_index_library_block_tags(self, mock_meilisearch):
],
)

# Build expected docs with tags at each stage
doc_problem_with_tags1 = {
"id": self.doc_problem1["id"],
"tags": {
'taxonomy': ['A'],
'level0': ['A > one', 'A > two']
}
}
doc_problem_with_tags2 = {
"id": self.doc_problem1["id"],
"tags": {
'taxonomy': ['A', 'B'],
'level0': ['A > one', 'A > two', 'B > four', 'B > three']
}
}
# Build expected docs with collections at each stage
doc_problem_with_collection2 = {
"id": self.doc_problem1["id"],
"tags": {
'collections': [collection2.key],
'taxonomy': ['A', 'B'],
'level0': ['A > one', 'A > two', 'B > four', 'B > three']
}
"collections": [collection2.key],
}
doc_problem_with_collection1 = {
"id": self.doc_problem1["id"],
"tags": {
'collections': [collection1.key, collection2.key],
'taxonomy': ['A', 'B'],
'level0': ['A > one', 'A > two', 'B > four', 'B > three']
}
"collections": [collection1.key, collection2.key],
}

mock_meilisearch.return_value.index.return_value.update_documents.assert_has_calls(
[
call([doc_problem_with_tags1]),
call([doc_problem_with_tags2]),
call([doc_problem_with_collection2]),
call([doc_problem_with_collection1]),
],
@@ -282,8 +282,8 @@ def test_html_library_block(self):
"content": {
"html_content": "",
},
"collections": ["TOY_COLLECTION"],
"tags": {
"collections": ["TOY_COLLECTION"],
"taxonomy": ["Difficulty"],
"level0": ["Difficulty > Normal"],
},
