refactor: store collections outside of tags index
pomegranited committed Sep 10, 2024
1 parent 436822e commit d517d80
Showing 5 changed files with 102 additions and 47 deletions.
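In short, this commit moves a block's collection keys out of the nested tags object and into a top-level collections field on each search document, updating the index configuration, indexing helpers, event handlers, and tests to match. For illustration only, a rough before/after sketch of a single indexed document; the id and field values below are made up, and real documents carry many more fields:

# Before this commit: collection keys were indexed as a sub-field of "tags".
doc_before = {
    "id": "lbabcd1234",  # hypothetical Meilisearch primary key
    "tags": {
        "collections": ["COL_A", "COL_B"],
        "taxonomy": ["Difficulty"],
        "level0": ["Difficulty > Hard"],
    },
}

# After this commit: "collections" is a sibling of "tags", holding collection.key strings.
doc_after = {
    "id": "lbabcd1234",
    "tags": {
        "taxonomy": ["Difficulty"],
        "level0": ["Difficulty > Hard"],
    },
    "collections": ["COL_A", "COL_B"],
}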
15 changes: 13 additions & 2 deletions openedx/core/djangoapps/content/search/api.py
@@ -34,6 +34,7 @@
searchable_doc_for_course_block,
searchable_doc_for_collection,
searchable_doc_for_library_block,
searchable_doc_collections,
searchable_doc_tags,
)

@@ -317,12 +318,12 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
Fields.context_key,
Fields.org,
Fields.tags,
Fields.tags + "." + Fields.tags_collections,
Fields.tags + "." + Fields.tags_taxonomy,
Fields.tags + "." + Fields.tags_level0,
Fields.tags + "." + Fields.tags_level1,
Fields.tags + "." + Fields.tags_level2,
Fields.tags + "." + Fields.tags_level3,
Fields.collections,
Fields.type,
Fields.access_id,
Fields.last_published,
@@ -336,11 +337,11 @@ def rebuild_index(status_cb: Callable[[str], None] | None = None) -> None:
Fields.content,
Fields.tags,
Fields.description,
Fields.collections,
# If we don't list the following sub-fields _explicitly_, they're only sometimes searchable - that is, they
# are searchable only if at least one document in the index has a value. If we didn't list them here and,
# say, there were no tags.level3 tags in the index, the client would get an error if trying to search for
# these sub-fields: "Attribute `tags.level3` is not searchable."
Fields.tags + "." + Fields.tags_collections,
Fields.tags + "." + Fields.tags_taxonomy,
Fields.tags + "." + Fields.tags_level0,
Fields.tags + "." + Fields.tags_level1,
@@ -377,6 +378,7 @@ def index_library(lib_key: str) -> list:
doc = {}
doc.update(searchable_doc_for_library_block(metadata))
doc.update(searchable_doc_tags(metadata.usage_key))
doc.update(searchable_doc_collections(metadata.usage_key))
docs.append(doc)
except Exception as err: # pylint: disable=broad-except
status_cb(f"Error indexing library component {component}: {err}")
@@ -573,6 +575,15 @@ def upsert_block_tags_index_docs(usage_key: UsageKey):
_update_index_docs([doc])


def upsert_block_collections_index_docs(usage_key: UsageKey):
"""
Updates the collections data in documents for the given Course/Library block
"""
doc = {Fields.id: meili_id_from_opaque_key(usage_key)}
doc.update(searchable_doc_collections(usage_key))
_update_index_docs([doc])


def _get_user_orgs(request: Request) -> list[str]:
"""
Get the org.short_names for the organizations that the requesting user has OrgStaffRole or OrgInstructorRole.
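Since Fields.collections now appears in these attribute lists on its own, a search client filters on the top-level field rather than on the old tags.collections sub-field. The snippet below is a hedged sketch using the Meilisearch Python client; the server URL, API key, index name, query, and collection key are all assumptions for illustration, not values taken from this codebase:

import meilisearch

client = meilisearch.Client("http://localhost:7700", "masterKey")  # assumed dev values
index = client.index("studio_content")  # hypothetical index name

# Before this commit, a client would have filtered on the nested sub-field:
#   index.search("matter", {"filter": 'tags.collections = "COL_A"'})
# After this commit, the filter targets the top-level field instead:
results = index.search("matter", {"filter": 'collections = "COL_A"'})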
53 changes: 43 additions & 10 deletions openedx/core/djangoapps/content/search/documents.py
@@ -54,7 +54,8 @@ class Fields:
tags_level1 = "level1"
tags_level2 = "level2"
tags_level3 = "level3"
tags_collections = "collections" # subfield of tags, i.e. tags.collections
# List of collection.key strings this object belongs to.
collections = "collections"
# The "content" field is a dictionary of arbitrary data, depending on the block_type.
# It comes from each XBlock's index_dictionary() method (if present) plus some processing.
# Text (html) blocks have an "html_content" key in here, capa has "capa_content" and "problem_types", and so on.
@@ -167,11 +168,10 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict:
See the comments above on "Field.tags" for an explanation of the format.
e.g. for something tagged "Difficulty: Hard" and "Location: Vancouver",
and in Collections 3 and 4, this would return:
e.g. for something tagged "Difficulty: Hard" and "Location: Vancouver" this
would return:
{
"tags": {
"collections": ["Col_key1", "Col_key2"],
"taxonomy": ["Location", "Difficulty"],
"level0": ["Location > North America", "Difficulty > Hard"],
"level1": ["Location > North America > Canada"],
@@ -186,16 +186,21 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict:
strings in a particular format that the frontend knows how to render to
support hierarchical refinement by tag.
"""
result = {}

# Note that we could improve performance for indexing many components from the same library/course,
# if we used get_all_object_tags() to load all the tags for the library in a single query rather than loading the
# tags for each component separately.
all_tags = tagging_api.get_object_tags(str(object_id)).all()
if not all_tags:
# Clear out tags in the index when unselecting all tags for the block, otherwise
# it would remain the last value if a cleared Fields.tags field is not included
return {Fields.tags: {}}
result = {
Fields.tags_taxonomy: [],
Fields.tags_level0: [],
# ... other levels added as needed
}
for obj_tag in all_tags:
# Add the taxonomy name:
if Fields.tags_taxonomy not in result:
result[Fields.tags_taxonomy] = []
if obj_tag.taxonomy.name not in result[Fields.tags_taxonomy]:
result[Fields.tags_taxonomy].append(obj_tag.taxonomy.name)
# Taxonomy name plus each level of tags, in a list: # e.g. ["Location", "North America", "Canada", "Vancouver"]
@@ -219,7 +224,22 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict:
if len(parts) == level + 2:
break # We have all the levels for this tag now (e.g. parts=["Difficulty", "Hard"] -> need level0 only)

return {Fields.tags: result}


def _collections_for_content_object(object_id: UsageKey | LearningContextKey) -> dict:
"""
Given an XBlock, course, library, etc., get the collections for its index doc.
e.g. for something in Collections "COL_A" and "COL_B", this would return:
{
"collections": ["COL_A", "COL_B"],
}
Returns an empty dict if the object is not in any collections.
"""
# Gather the collections associated with this object
result = {}
collections = []
try:
component = lib_api.get_component_from_usage_key(object_id)
@@ -231,9 +251,9 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict:
log.warning(f"No component found for {object_id}")

if collections:
result[Fields.tags_collections] = list(collections)
result[Fields.collections] = list(collections)

return {Fields.tags: result}
return result


def searchable_doc_for_library_block(xblock_metadata: lib_api.LibraryXBlockMetadata) -> dict:
@@ -278,6 +298,19 @@ def searchable_doc_tags(usage_key: UsageKey) -> dict:
return doc


def searchable_doc_collections(usage_key: UsageKey) -> dict:
"""
Generate a dictionary document suitable for ingestion into a search engine
like Meilisearch or Elasticsearch, with the collections data for the given content object.
"""
doc = {
Fields.id: meili_id_from_opaque_key(usage_key),
}
doc.update(_collections_for_content_object(usage_key))

return doc


def searchable_doc_for_course_block(block) -> dict:
"""
Generate a dictionary document suitable for ingestion into a search engine
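searchable_doc_collections() mirrors searchable_doc_tags(): each returns a small partial document keyed by the Meilisearch id, which the indexer merges into the full block document (as index_library() does above) or sends on its own as a partial update. A rough sketch of such a partial document, with a made-up id and collection keys:

# Partial document produced for a block that belongs to collections COL_A and COL_B
# (illustrative values; a block in no collections yields just the "id" field).
partial_doc = {
    "id": "lbabcd1234",
    "collections": ["COL_A", "COL_B"],
}
# Because Meilisearch's update-documents call merges the supplied fields into the stored
# document, sending only "id" and "collections" refreshes the collections list without
# touching tags, content, or the other indexed fields.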
7 changes: 6 additions & 1 deletion openedx/core/djangoapps/content/search/handlers.py
@@ -163,4 +163,9 @@ def content_object_associations_changed_handler(**kwargs) -> None:
log.error("Received invalid content object id")
return

upsert_block_tags_index_docs(usage_key)
# This event's changes may contain both "tags" and "collections", but this will happen rarely, if ever.
# So we allow a potential double "upsert" here.
if "tags" in content_object.changes:
upsert_block_tags_index_docs(usage_key)
elif "collections" in content_object.changes:
upsert_block_collections_index_docs(usage_key)
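For context, a rough sketch of the two payload shapes this handler distinguishes; the ContentObject stand-in below is an assumption that only mirrors the two attributes used above (object_id and changes), not the real event data class from openedx-events:

from collections import namedtuple

# Hypothetical stand-in for the event payload; the real class is defined in openedx-events.
ContentObject = namedtuple("ContentObject", ["object_id", "changes"])

# A tagging change leads to upsert_block_tags_index_docs(usage_key) ...
tag_change = ContentObject(object_id="lb:DemoX:lib1:problem:p1", changes=["tags"])

# ... while a collections change leads to upsert_block_collections_index_docs(usage_key).
collection_change = ContentObject(object_id="lb:DemoX:lib1:problem:p1", changes=["collections"])

# If both ever arrive in a single event, both partial documents are upserted, which is the
# "double upsert" the comment above allows.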
72 changes: 39 additions & 33 deletions openedx/core/djangoapps/content/search/tests/test_api.py
@@ -175,7 +175,7 @@ def setUp(self):
tagging_api.add_tag_to_taxonomy(self.taxonomyB, "three")
tagging_api.add_tag_to_taxonomy(self.taxonomyB, "four")

# Create collections:
# Create a collection:
self.learning_package = authoring_api.get_learning_package_by_key(self.library.key)
with freeze_time(created_date):
self.collection = authoring_api.create_collection(
@@ -379,6 +379,40 @@ def test_index_library_block_tags(self, mock_meilisearch):
"""
Test indexing a Library Block with tags.
"""

# Tag XBlock (these internally call `upsert_block_tags_index_docs`)
tagging_api.tag_object(str(self.problem1.usage_key), self.taxonomyA, ["one", "two"])
tagging_api.tag_object(str(self.problem1.usage_key), self.taxonomyB, ["three", "four"])

# Build expected docs with tags at each stage
doc_problem_with_tags1 = {
"id": self.doc_problem1["id"],
"tags": {
'taxonomy': ['A'],
'level0': ['A > one', 'A > two']
}
}
doc_problem_with_tags2 = {
"id": self.doc_problem1["id"],
"tags": {
'taxonomy': ['A', 'B'],
'level0': ['A > one', 'A > two', 'B > four', 'B > three']
}
}

mock_meilisearch.return_value.index.return_value.update_documents.assert_has_calls(
[
call([doc_problem_with_tags1]),
call([doc_problem_with_tags2]),
],
any_order=True,
)

@override_settings(MEILISEARCH_ENABLED=True)
def test_index_library_block_collections(self, mock_meilisearch):
"""
Test indexing a Library Block that is in two collections.
"""
collection1 = authoring_api.create_collection(
learning_package_id=self.library.learning_package.id,
key="COL1",
@@ -395,11 +429,7 @@ def test_index_library_block_tags(self, mock_meilisearch):
description="Second Collection",
)

# Tag XBlock (these internally call `upsert_block_tags_index_docs`)
tagging_api.tag_object(str(self.problem1.usage_key), self.taxonomyA, ["one", "two"])
tagging_api.tag_object(str(self.problem1.usage_key), self.taxonomyB, ["three", "four"])

# Add Problem1 to both Collections (these internally call `upsert_block_tags_index_docs`)
# Add Problem1 to both Collections (these internally call `upsert_block_collections_index_docs`)
# (adding in reverse order to test sorting of collection keys)
for collection in (collection2, collection1):
library_api.update_library_collection_components(
@@ -410,42 +440,18 @@ def test_index_library_block_tags(self, mock_meilisearch):
],
)

# Build expected docs with tags at each stage
doc_problem_with_tags1 = {
"id": self.doc_problem1["id"],
"tags": {
'taxonomy': ['A'],
'level0': ['A > one', 'A > two']
}
}
doc_problem_with_tags2 = {
"id": self.doc_problem1["id"],
"tags": {
'taxonomy': ['A', 'B'],
'level0': ['A > one', 'A > two', 'B > four', 'B > three']
}
}
# Build expected docs with collections at each stage
doc_problem_with_collection2 = {
"id": self.doc_problem1["id"],
"tags": {
'collections': [collection2.key],
'taxonomy': ['A', 'B'],
'level0': ['A > one', 'A > two', 'B > four', 'B > three']
}
"collections": [collection2.key],
}
doc_problem_with_collection1 = {
"id": self.doc_problem1["id"],
"tags": {
'collections': [collection1.key, collection2.key],
'taxonomy': ['A', 'B'],
'level0': ['A > one', 'A > two', 'B > four', 'B > three']
}
"collections": [collection1.key, collection2.key],
}

mock_meilisearch.return_value.index.return_value.update_documents.assert_has_calls(
[
call([doc_problem_with_tags1]),
call([doc_problem_with_tags2]),
call([doc_problem_with_collection2]),
call([doc_problem_with_collection1]),
],
@@ -282,8 +282,8 @@ def test_html_library_block(self):
"content": {
"html_content": "",
},
"collections": ["TOY_COLLECTION"],
"tags": {
"collections": ["TOY_COLLECTION"],
"taxonomy": ["Difficulty"],
"level0": ["Difficulty > Normal"],
},
