Assign topics to videos and playlists (#584)

mitodl · Mar 8, 2024 · 8defe2d
1 parent f3fd6dd
commit 8defe2d
Show file tree

Hide file tree

Showing 13 changed files with 281 additions and 12 deletions.
diff --git a/learning_resources/admin.py b/learning_resources/admin.py
@@ -105,6 +105,7 @@ class VideoPlaylistInline(TabularInline):
     model = models.VideoPlaylist
     extra = 0
     show_change_link = True
+    fields = ("channel",)
 
 
 class ProgramInline(TabularInline):

diff --git a/learning_resources/etl/loaders.py b/learning_resources/etl/loaders.py
@@ -17,6 +17,7 @@
 )
 from learning_resources.etl.deduplication import get_most_relevant_run
 from learning_resources.etl.exceptions import ExtractException
+from learning_resources.etl.utils import most_common_topics
 from learning_resources.models import (
     ContentFile,
     Course,
@@ -45,6 +46,7 @@
     resource_run_upserted_actions,
     resource_unpublished_actions,
     resource_upserted_actions,
+    similar_topics_action,
 )
 
 log = logging.getLogger()
@@ -722,6 +724,8 @@ def load_video(video_data: dict) -> LearningResource:
             learning_resource=learning_resource, defaults=video_fields
         )
         load_image(learning_resource, image_data)
+        if not topics_data:
+            topics_data = similar_topics_action(learning_resource)
         load_topics(learning_resource, topics_data)
         load_offered_by(learning_resource, offered_by_data)
 
@@ -776,6 +780,7 @@ def load_playlist(video_channel: VideoChannel, playlist_data: dict) -> LearningR
         )
         load_offered_by(playlist_resource, offered_bys_data)
         video_resources = load_videos(videos_data)
+        load_topics(playlist_resource, most_common_topics(video_resources))
         playlist_resource.resources.clear()
         for idx, video in enumerate(video_resources):
             playlist_resource.resources.add(
@@ -856,9 +861,10 @@ def load_video_channels(video_channels_data: iter) -> list[VideoChannel]:
         list of VideoChannel: the loaded video channels
     """
     video_channels = []
-
+    channel_ids = []
     for video_channel_data in video_channels_data:
         channel_id = video_channel_data["channel_id"]
+        channel_ids.append(channel_id)
         try:
             video_channel = load_video_channel(video_channel_data)
         except ExtractException:
@@ -874,7 +880,6 @@ def load_video_channels(video_channels_data: iter) -> list[VideoChannel]:
         else:
             video_channels.append(video_channel)
 
-    channel_ids = [video_channel.channel_id for video_channel in video_channels]
     VideoChannel.objects.exclude(channel_id__in=channel_ids).update(published=False)
 
     # Unpublish any video playlists not included in published channels

diff --git a/learning_resources/etl/loaders_test.py b/learning_resources/etl/loaders_test.py
@@ -936,6 +936,11 @@ def test_load_video(mocker, mock_upsert_tasks, video_exists, is_published, pass_
         VideoFactory.create() if video_exists else VideoFactory.build()
     ).learning_resource
     offered_by = LearningResourceOfferorFactory.create()
+    expected_topics = [{"name": "Biology"}, {"name": "Chemistry"}]
+    mock_similar_topics_action = mocker.patch(
+        "learning_resources.etl.loaders.similar_topics_action",
+        return_value=expected_topics,
+    )
 
     assert Video.objects.count() == (1 if video_exists else 0)
 
@@ -954,6 +959,8 @@ def test_load_video(mocker, mock_upsert_tasks, video_exists, is_published, pass_
         "published": is_published,
         "video": {"duration": video_resource.video.duration},
     }
+    if pass_topics:
+        props["topics"] = expected_topics
 
     result = load_video(props)
     assert Video.objects.count() == 1
@@ -962,6 +969,11 @@ def test_load_video(mocker, mock_upsert_tasks, video_exists, is_published, pass_
     assert isinstance(result, LearningResource)
     assert result.published == is_published
 
+    assert mock_similar_topics_action.call_count == (0 if pass_topics else 1)
+    assert list(result.topics.values_list("name", flat=True).order_by("name")) == [
+        topic["name"] for topic in expected_topics
+    ]
+
     for key, value in props.items():
         assert getattr(result, key) == value, f"Property {key} should equal {value}"
 
@@ -986,22 +998,25 @@ def test_load_videos():
     assert Video.objects.count() == len(video_resources)
 
 
-def test_load_playlist():
+def test_load_playlist(mocker):
     """Test load_playlist"""
+    expected_topics = [{"name": "Biology"}, {"name": "Physics"}]
+    mock_most_common_topics = mocker.patch(
+        "learning_resources.etl.loaders.most_common_topics",
+        return_value=expected_topics,
+    )
     channel = VideoChannelFactory.create(playlists=None)
     playlist = VideoPlaylistFactory.build().learning_resource
     assert VideoPlaylist.objects.count() == 0
     assert Video.objects.count() == 0
-    videos_resources = [
-        video.learning_resource for video in VideoFactory.build_batch(5)
-    ]
+    video_resources = [video.learning_resource for video in VideoFactory.build_batch(5)]
     videos_data = [
         {
             **model_to_dict(video, exclude=non_transformable_attributes),
             "platform": PlatformType.youtube.name,
             "offered_by": {"code": LearningResourceOfferorFactory.create().code},
         }
-        for video in videos_resources
+        for video in video_resources
     ]
 
     props = {
@@ -1015,9 +1030,13 @@ def test_load_playlist():
     result = load_playlist(channel, props)
 
     assert isinstance(result, LearningResource)
+    mock_most_common_topics.assert_called_once()
 
-    assert result.resources.count() == len(videos_resources)
+    assert result.resources.count() == len(video_resources)
     assert result.video_playlist.channel == channel
+    assert list(result.topics.values_list("name", flat=True).order_by("name")) == [
+        topic["name"] for topic in expected_topics
+    ]
 
 
 def test_load_playlists_unpublish(mocker):

diff --git a/learning_resources/etl/utils.py b/learning_resources/etl/utils.py
@@ -7,6 +7,7 @@
 import os
 import re
 import uuid
+from collections import Counter
 from collections.abc import Generator
 from datetime import datetime
 from hashlib import md5
@@ -38,6 +39,7 @@
 from learning_resources.models import (
     ContentFile,
     Course,
+    LearningResource,
     LearningResourceRun,
 )
 
@@ -627,3 +629,23 @@ def update_course_numbers_json(course: Course):
         is_ocw=is_ocw,
     )
     course.save()
+
+
+def most_common_topics(
+    resources: list[LearningResource], max_topics: int = settings.OPEN_VIDEO_MAX_TOPICS
+) -> list[dict]:
+    """
+    Return the topics that occur most often across the given resources
+
+    Args:
+        resources (list[LearningResource]): resources whose topics are tallied
+        max_topics (int): upper bound on the number of topics returned
+
+    Returns:
+        list of dict: one {"name": <topic name>} dict per topic, most frequent first
+    """
+    tally = Counter()
+    for resource in resources:
+        # Each resource contributes one count per topic assigned to it
+        tally.update(topic.name for topic in resource.topics.all())
+    return [{"name": name} for name, _count in tally.most_common(max_topics)]
diff --git a/learning_resources/etl/utils_test.py b/learning_resources/etl/utils_test.py
@@ -2,6 +2,7 @@
 
 import datetime
 import pathlib
+from random import randrange
 from subprocess import check_call
 from tempfile import TemporaryDirectory
 from unittest.mock import ANY
@@ -16,7 +17,12 @@
     PlatformType,
 )
 from learning_resources.etl import utils
-from learning_resources.factories import ContentFileFactory, LearningResourceRunFactory
+from learning_resources.factories import (
+    ContentFileFactory,
+    LearningResourceFactory,
+    LearningResourceRunFactory,
+    LearningResourceTopicFactory,
+)
 
 pytestmark = pytest.mark.django_db
 
@@ -326,3 +332,24 @@ def test_extract_valid_department_from_id(readable_id, is_ocw, dept_ids):
     assert (
         utils.extract_valid_department_from_id(readable_id, is_ocw=is_ocw) == dept_ids
     )
+
+
+def test_most_common_topics():
+    """Test that most_common_topics returns only the most frequently used topics"""
+    max_topics = 4
+    common_topics = LearningResourceTopicFactory.create_batch(max_topics)
+    uncommon_topics = LearningResourceTopicFactory.create_batch(3)
+    resources = []
+    # Each common topic is attached to 2 or 3 resources ...
+    for topic in common_topics:
+        resources.extend(
+            LearningResourceFactory.create_batch(randrange(2, 4), topics=[topic])  # noqa: S311
+        )
+    # ... while each uncommon topic is attached to exactly one resource
+    resources.extend(
+        [LearningResourceFactory.create(topics=[topic]) for topic in uncommon_topics]
+    )
+    # Sort both sides so the comparison does not depend on the order in which
+    # the factory happened to generate the topic names
+    assert sorted(
+        [
+            topic["name"]
+            for topic in utils.most_common_topics(resources, max_topics=max_topics)
+        ]
+    ) == sorted(topic.name for topic in common_topics)
diff --git a/learning_resources/hooks.py b/learning_resources/hooks.py
@@ -23,6 +23,10 @@ def resource_upserted(self, resource):
     def resource_unpublished(self, resource):
         """Trigger actions after a learning resource is unpublished"""
 
+    @hookspec
+    def resource_similar_topics(self, resource) -> list[dict]:
+        """
+        Get similar topics for a learning resource
+
+        Args:
+            resource: the learning resource to find similar topics for
+
+        Returns:
+            list of dict: the similar topics; presumably {"name": <topic name>}
+                dicts as passed on to load_topics - confirm against implementations
+        """
+
     @hookspec
     def bulk_resources_unpublished(self, resource_ids, resource_type):
         """Trigger actions after multiple learning resources are unpublished"""

diff --git a/learning_resources/models.py b/learning_resources/models.py
@@ -112,15 +112,13 @@ class LearningResource(TimestampedModel):
         "children__child",
         "children__child__runs",
         "children__child__runs__instructors",
-        "children__child__course",
-        "children__child__program",
-        "children__child__learning_path",
         "children__child__departments",
         "children__child__platform",
         "children__child__topics",
         "children__child__image",
         "children__child__offered_by",
         "children__child__content_tags",
+        *[f"children__child__{item.name}" for item in LearningResourceType],
     ]
 
     related_selects = [

diff --git a/learning_resources/utils.py b/learning_resources/utils.py
@@ -331,6 +331,17 @@ def resource_unpublished_actions(resource: LearningResource):
     hook.resource_unpublished(resource=resource)
 
 
+def similar_topics_action(resource: LearningResource) -> list[dict]:
+    """
+    Trigger plugin to get similar topics for a resource
+
+    Args:
+        resource (LearningResource): the resource to find similar topics for
+
+    Returns:
+        list of dict: the first result returned by the resource_similar_topics
+            hook, or an empty list if the hook returned no results
+    """
+    pm = get_plugin_manager()
+    hook = pm.hook
+    topics = hook.resource_similar_topics(resource=resource)
+    # The hook call returns a list of results (presumably one entry per
+    # registered plugin implementation - confirm against the pluggy docs),
+    # so the topics themselves are the first element of that list
+    return topics[0] if topics else []
+
+
 def resource_delete_actions(resource: LearningResource):
     """
     Trigger plugin to handle learning resource deletion

diff --git a/learning_resources/utils_test.py b/learning_resources/utils_test.py
@@ -212,6 +212,18 @@ def test_resource_upserted_actions(mock_plugin_manager, fixture_resource):
     )
 
 
+def test_similar_topics_action(mock_plugin_manager, fixture_resource):
+    """
+    similar_topics_action should trigger plugin hook's resource_similar_topics function
+    """
+    mock_topics = [{"name": "Biology"}, {"name": "Chemistry"}]
+    # similar_topics_action unwraps the first element of the hook result,
+    # so the mocked hook has to return the topics wrapped in an outer list
+    mock_plugin_manager.hook.resource_similar_topics.return_value = [mock_topics]
+    assert utils.similar_topics_action(fixture_resource) == mock_topics
+    mock_plugin_manager.hook.resource_similar_topics.assert_called_once_with(
+        resource=fixture_resource
+    )
+
+
 def test_resource_unpublished_actions(mock_plugin_manager, fixture_resource):
     """
     resource_unpublished_actions function should trigger plugin hook's resource_unpublished function

diff --git a/learning_resources_search/api.py b/learning_resources_search/api.py
@@ -1,8 +1,10 @@
 """API for general search-related functionality"""
 
 import re
+from collections import Counter
 
 from opensearch_dsl import Search
+from opensearch_dsl.query import MoreLikeThis
 
 from learning_resources.constants import LEARNING_RESOURCE_SORTBY_OPTIONS
 from learning_resources_search.connection import get_default_alias_name
@@ -11,6 +13,7 @@
     COURSE_QUERY_FIELDS,
     COURSE_TYPE,
     DEPARTMENT_QUERY_FIELDS,
+    LEARNING_RESOURCE,
     LEARNING_RESOURCE_QUERY_FIELDS,
     LEARNING_RESOURCE_SEARCH_FILTERS,
     LEARNING_RESOURCE_TYPES,
@@ -515,3 +518,49 @@ def execute_learn_search(search_params):
         search = search.extra(aggs=aggregation_clauses)
 
     return search.execute().to_dict()
+
+
+def get_similar_topics(
+    value_doc: dict, num_topics: int, min_term_freq: int, min_doc_freq: int
+) -> list[str]:
+    """
+    Find the topics most often assigned to courses that resemble the given text
+
+    Args:
+        value_doc (dict):
+            a document representing the data fields we want to search with
+        num_topics (int):
+            maximum number of topics to return
+        min_term_freq (int):
+            minimum times a term needs to show up in input
+        min_doc_freq (int):
+            minimum times a term needs to show up in docs
+
+    Returns:
+        list of str:
+            topic names, the most frequent among the matching hits first
+    """
+    course_indexes = relevant_indexes([COURSE_TYPE], [], endpoint=LEARNING_RESOURCE)
+    similar_courses = MoreLikeThis(
+        like=[{"doc": value_doc, "fields": list(value_doc.keys())}],
+        fields=[
+            "course.course_numbers.value",
+            "title",
+            "description",
+            "full_description",
+        ],
+        min_term_freq=min_term_freq,
+        min_doc_freq=min_doc_freq,
+    )
+    # Only the topics field is requested because nothing else is read from the hits
+    search = (
+        Search(index=",".join(course_indexes))
+        .filter("term", resource_type=COURSE_TYPE)
+        .query(similar_courses)
+        .source(includes="topics")
+    )
+
+    topic_counts = Counter()
+    for hit in search.execute().hits:
+        topic_counts.update(topic.to_dict()["name"] for topic in hit.topics)
+
+    return [name for name, _count in topic_counts.most_common(num_topics)]