Skip to content

Commit

Permalink
Assign topics to videos and playlists (#584)
Browse files Browse the repository at this point in the history
  • Loading branch information
mbertrand authored Mar 8, 2024
1 parent f3fd6dd commit 8defe2d
Show file tree
Hide file tree
Showing 13 changed files with 281 additions and 12 deletions.
1 change: 1 addition & 0 deletions learning_resources/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ class VideoPlaylistInline(TabularInline):
model = models.VideoPlaylist
extra = 0
show_change_link = True
fields = ("channel",)


class ProgramInline(TabularInline):
Expand Down
9 changes: 7 additions & 2 deletions learning_resources/etl/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
)
from learning_resources.etl.deduplication import get_most_relevant_run
from learning_resources.etl.exceptions import ExtractException
from learning_resources.etl.utils import most_common_topics
from learning_resources.models import (
ContentFile,
Course,
Expand Down Expand Up @@ -45,6 +46,7 @@
resource_run_upserted_actions,
resource_unpublished_actions,
resource_upserted_actions,
similar_topics_action,
)

log = logging.getLogger()
Expand Down Expand Up @@ -722,6 +724,8 @@ def load_video(video_data: dict) -> LearningResource:
learning_resource=learning_resource, defaults=video_fields
)
load_image(learning_resource, image_data)
if not topics_data:
topics_data = similar_topics_action(learning_resource)
load_topics(learning_resource, topics_data)
load_offered_by(learning_resource, offered_by_data)

Expand Down Expand Up @@ -776,6 +780,7 @@ def load_playlist(video_channel: VideoChannel, playlist_data: dict) -> LearningR
)
load_offered_by(playlist_resource, offered_bys_data)
video_resources = load_videos(videos_data)
load_topics(playlist_resource, most_common_topics(video_resources))
playlist_resource.resources.clear()
for idx, video in enumerate(video_resources):
playlist_resource.resources.add(
Expand Down Expand Up @@ -856,9 +861,10 @@ def load_video_channels(video_channels_data: iter) -> list[VideoChannel]:
list of VideoChannel: the loaded video channels
"""
video_channels = []

channel_ids = []
for video_channel_data in video_channels_data:
channel_id = video_channel_data["channel_id"]
channel_ids.append(channel_id)
try:
video_channel = load_video_channel(video_channel_data)
except ExtractException:
Expand All @@ -874,7 +880,6 @@ def load_video_channels(video_channels_data: iter) -> list[VideoChannel]:
else:
video_channels.append(video_channel)

channel_ids = [video_channel.channel_id for video_channel in video_channels]
VideoChannel.objects.exclude(channel_id__in=channel_ids).update(published=False)

# Unpublish any video playlists not included in published channels
Expand Down
31 changes: 25 additions & 6 deletions learning_resources/etl/loaders_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -936,6 +936,11 @@ def test_load_video(mocker, mock_upsert_tasks, video_exists, is_published, pass_
VideoFactory.create() if video_exists else VideoFactory.build()
).learning_resource
offered_by = LearningResourceOfferorFactory.create()
expected_topics = [{"name": "Biology"}, {"name": "Chemistry"}]
mock_similar_topics_action = mocker.patch(
"learning_resources.etl.loaders.similar_topics_action",
return_value=expected_topics,
)

assert Video.objects.count() == (1 if video_exists else 0)

Expand All @@ -954,6 +959,8 @@ def test_load_video(mocker, mock_upsert_tasks, video_exists, is_published, pass_
"published": is_published,
"video": {"duration": video_resource.video.duration},
}
if pass_topics:
props["topics"] = expected_topics

result = load_video(props)
assert Video.objects.count() == 1
Expand All @@ -962,6 +969,11 @@ def test_load_video(mocker, mock_upsert_tasks, video_exists, is_published, pass_
assert isinstance(result, LearningResource)
assert result.published == is_published

assert mock_similar_topics_action.call_count == (0 if pass_topics else 1)
assert list(result.topics.values_list("name", flat=True).order_by("name")) == [
topic["name"] for topic in expected_topics
]

for key, value in props.items():
assert getattr(result, key) == value, f"Property {key} should equal {value}"

Expand All @@ -986,22 +998,25 @@ def test_load_videos():
assert Video.objects.count() == len(video_resources)


def test_load_playlist():
def test_load_playlist(mocker):
"""Test load_playlist"""
expected_topics = [{"name": "Biology"}, {"name": "Physics"}]
mock_most_common_topics = mocker.patch(
"learning_resources.etl.loaders.most_common_topics",
return_value=expected_topics,
)
channel = VideoChannelFactory.create(playlists=None)
playlist = VideoPlaylistFactory.build().learning_resource
assert VideoPlaylist.objects.count() == 0
assert Video.objects.count() == 0
videos_resources = [
video.learning_resource for video in VideoFactory.build_batch(5)
]
video_resources = [video.learning_resource for video in VideoFactory.build_batch(5)]
videos_data = [
{
**model_to_dict(video, exclude=non_transformable_attributes),
"platform": PlatformType.youtube.name,
"offered_by": {"code": LearningResourceOfferorFactory.create().code},
}
for video in videos_resources
for video in video_resources
]

props = {
Expand All @@ -1015,9 +1030,13 @@ def test_load_playlist():
result = load_playlist(channel, props)

assert isinstance(result, LearningResource)
mock_most_common_topics.assert_called_once()

assert result.resources.count() == len(videos_resources)
assert result.resources.count() == len(video_resources)
assert result.video_playlist.channel == channel
assert list(result.topics.values_list("name", flat=True).order_by("name")) == [
topic["name"] for topic in expected_topics
]


def test_load_playlists_unpublish(mocker):
Expand Down
22 changes: 22 additions & 0 deletions learning_resources/etl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import os
import re
import uuid
from collections import Counter
from collections.abc import Generator
from datetime import datetime
from hashlib import md5
Expand Down Expand Up @@ -38,6 +39,7 @@
from learning_resources.models import (
ContentFile,
Course,
LearningResource,
LearningResourceRun,
)

Expand Down Expand Up @@ -627,3 +629,23 @@ def update_course_numbers_json(course: Course):
is_ocw=is_ocw,
)
course.save()


def most_common_topics(
    resources: list[LearningResource], max_topics: int | None = None
) -> list[dict]:
    """
    Get the most common topics from a list of resources

    Args:
        resources (list[LearningResource]): resources to get topics from
        max_topics (int): The maximum number of topics to return. When omitted,
            settings.OPEN_VIDEO_MAX_TOPICS is read at call time.

    Returns:
        list of dict: The most common topic names, most frequent first, each
            wrapped as {"name": <topic name>}
    """
    if max_topics is None:
        # Resolve the default lazily: a default of
        # settings.OPEN_VIDEO_MAX_TOPICS in the signature is evaluated once at
        # import time, so later settings overrides would be silently ignored.
        max_topics = settings.OPEN_VIDEO_MAX_TOPICS
    counter = Counter(
        topic.name for resource in resources for topic in resource.topics.all()
    )
    # most_common() yields (name, count) pairs ordered by descending count
    return [{"name": name} for name, _count in counter.most_common(max_topics)]
29 changes: 28 additions & 1 deletion learning_resources/etl/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import datetime
import pathlib
from random import randrange
from subprocess import check_call
from tempfile import TemporaryDirectory
from unittest.mock import ANY
Expand All @@ -16,7 +17,12 @@
PlatformType,
)
from learning_resources.etl import utils
from learning_resources.factories import ContentFileFactory, LearningResourceRunFactory
from learning_resources.factories import (
ContentFileFactory,
LearningResourceFactory,
LearningResourceRunFactory,
LearningResourceTopicFactory,
)

pytestmark = pytest.mark.django_db

Expand Down Expand Up @@ -326,3 +332,24 @@ def test_extract_valid_department_from_id(readable_id, is_ocw, dept_ids):
assert (
utils.extract_valid_department_from_id(readable_id, is_ocw=is_ocw) == dept_ids
)


def test_most_common_topics():
    """Test that most_common_topics returns the correct topics"""
    max_topics = 4
    common_topics = LearningResourceTopicFactory.create_batch(max_topics)
    uncommon_topics = LearningResourceTopicFactory.create_batch(3)
    resources = []
    for topic in common_topics:
        # Every common topic is attached to 2 or 3 resources, so it always
        # outranks the uncommon topics, which get exactly one resource each.
        resources.extend(
            LearningResourceFactory.create_batch(randrange(2, 4), topics=[topic])  # noqa: S311
        )
    resources.extend(
        [LearningResourceFactory.create(topics=[topic]) for topic in uncommon_topics]
    )
    returned_names = sorted(
        topic["name"]
        for topic in utils.most_common_topics(resources, max_topics=max_topics)
    )
    # Sort both sides: the comparison must not depend on the order in which
    # the factory happens to generate topic names.
    assert returned_names == sorted(topic.name for topic in common_topics)
4 changes: 4 additions & 0 deletions learning_resources/hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ def resource_upserted(self, resource):
def resource_unpublished(self, resource):
"""Trigger actions after a learning resource is unpublished"""

@hookspec
def resource_similar_topics(self, resource) -> list[dict]:
    """
    Hook specification: find topics similar to a learning resource

    Args:
        resource: the learning resource to find similar topics for

    Returns:
        list of dict: the similar topics
            (NOTE(review): presumably dicts of the form {"name": ...};
            confirm against the plugin implementation)
    """

@hookspec
def bulk_resources_unpublished(self, resource_ids, resource_type):
"""Trigger actions after multiple learning resources are unpublished"""
Expand Down
4 changes: 1 addition & 3 deletions learning_resources/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,15 +112,13 @@ class LearningResource(TimestampedModel):
"children__child",
"children__child__runs",
"children__child__runs__instructors",
"children__child__course",
"children__child__program",
"children__child__learning_path",
"children__child__departments",
"children__child__platform",
"children__child__topics",
"children__child__image",
"children__child__offered_by",
"children__child__content_tags",
*[f"children__child__{item.name}" for item in LearningResourceType],
]

related_selects = [
Expand Down
11 changes: 11 additions & 0 deletions learning_resources/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,17 @@ def resource_unpublished_actions(resource: LearningResource):
hook.resource_unpublished(resource=resource)


def similar_topics_action(resource: LearningResource) -> list[dict]:
    """
    Trigger plugin to get similar topics for a resource

    Args:
        resource (LearningResource): the resource to find similar topics for

    Returns:
        list of dict: the topics from the first plugin result, or an empty
            list when the hook call produced no results
    """
    pm = get_plugin_manager()
    hook = pm.hook
    topics = hook.resource_similar_topics(resource=resource)
    # A hook call gathers one result per registered implementation into an
    # outer list (presumably pluggy's standard behavior - confirm), so a single
    # plugin's topic list arrives wrapped. Only the first result is used.
    return topics[0] if topics else []


def resource_delete_actions(resource: LearningResource):
"""
Trigger plugin to handle learning resource deletion
Expand Down
12 changes: 12 additions & 0 deletions learning_resources/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,18 @@ def test_resource_upserted_actions(mock_plugin_manager, fixture_resource):
)


def test_similar_topics_action(mock_plugin_manager, fixture_resource) -> None:
    """
    similar_topics_action should trigger plugin hook's resource_similar_topics function
    """
    mock_topics = [{"name": "Biology"}, {"name": "Chemistry"}]
    # The hook result is wrapped in an outer list; similar_topics_action is
    # expected to unwrap it and return the first element.
    mock_plugin_manager.hook.resource_similar_topics.return_value = [mock_topics]
    assert utils.similar_topics_action(fixture_resource) == mock_topics
    mock_plugin_manager.hook.resource_similar_topics.assert_called_once_with(
        resource=fixture_resource
    )


def test_resource_unpublished_actions(mock_plugin_manager, fixture_resource):
"""
resource_unpublished_actions function should trigger plugin hook's resource_unpublished function
Expand Down
49 changes: 49 additions & 0 deletions learning_resources_search/api.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
"""API for general search-related functionality"""

import re
from collections import Counter

from opensearch_dsl import Search
from opensearch_dsl.query import MoreLikeThis

from learning_resources.constants import LEARNING_RESOURCE_SORTBY_OPTIONS
from learning_resources_search.connection import get_default_alias_name
Expand All @@ -11,6 +13,7 @@
COURSE_QUERY_FIELDS,
COURSE_TYPE,
DEPARTMENT_QUERY_FIELDS,
LEARNING_RESOURCE,
LEARNING_RESOURCE_QUERY_FIELDS,
LEARNING_RESOURCE_SEARCH_FILTERS,
LEARNING_RESOURCE_TYPES,
Expand Down Expand Up @@ -515,3 +518,49 @@ def execute_learn_search(search_params):
search = search.extra(aggs=aggregation_clauses)

return search.execute().to_dict()


def get_similar_topics(
    value_doc: dict, num_topics: int, min_term_freq: int, min_doc_freq: int
) -> list[str]:
    """
    Find the topics that occur most often on courses resembling a document

    Args:
        value_doc (dict):
            a document representing the data fields we want to search with
        num_topics (int):
            number of topics to return
        min_term_freq (int):
            minimum times a term needs to show up in input
        min_doc_freq (int):
            minimum times a term needs to show up in docs

    Returns:
        list of str:
            topic names, most frequent among the matching courses first
    """
    index_names = ",".join(
        relevant_indexes([COURSE_TYPE], [], endpoint=LEARNING_RESOURCE)
    )
    # Compare the supplied document against the text fields of indexed courses
    similarity_query = MoreLikeThis(
        like=[{"doc": value_doc, "fields": list(value_doc.keys())}],
        fields=[
            "course.course_numbers.value",
            "title",
            "description",
            "full_description",
        ],
        min_term_freq=min_term_freq,
        min_doc_freq=min_doc_freq,
    )
    # Only the topics of each matching course are requested in the response
    response = (
        Search(index=index_names)
        .filter("term", resource_type=COURSE_TYPE)
        .query(similarity_query)
        .source(includes="topics")
        .execute()
    )

    # Tally topic names across every hit
    topic_counts = Counter()
    for hit in response.hits:
        for topic in hit.topics:
            topic_counts[topic.to_dict()["name"]] += 1

    return [name for name, _count in topic_counts.most_common(num_topics)]
Loading

0 comments on commit 8defe2d

Please sign in to comment.