
Commit a99710e

remove unpublished resources with duplicate readable_ids (#2478)
* adding task to remove duplicates
* adding recurring task
* adding test
* adding generate_embeddings task after removal
* fixing test
1 parent 54733a2 commit a99710e

File tree: 3 files changed (+50, -1 lines changed)


learning_resources/tasks.py

Lines changed: 30 additions & 1 deletion
@@ -9,7 +9,7 @@
 import boto3
 import celery
 from django.conf import settings
-from django.db.models import Q
+from django.db.models import Count, Q
 from django.utils import timezone
 
 from learning_resources.content_summarizer import ContentSummarizer
@@ -35,6 +35,7 @@
     load_course_blocklist,
     resource_unpublished_actions,
 )
+from learning_resources_search.constants import COURSE_TYPE
 from learning_resources_search.exceptions import RetryError
 from main.celery import app
 from main.constants import ISOFORMAT
@@ -43,6 +44,34 @@
 log = logging.getLogger(__name__)
 
 
+@app.task(bind=True)
+def remove_duplicate_resources(self):
+    """Remove duplicate unpublished resources"""
+    from vector_search.tasks import generate_embeddings
+
+    duplicates = (
+        LearningResource.objects.values("readable_id")
+        .annotate(count_id=Count("id"))
+        .filter(count_id__gt=1)
+    )
+    embed_tasks = []
+    for duplicate in duplicates:
+        unpublished_resources = LearningResource.objects.filter(
+            readable_id=duplicate["readable_id"],
+            published=False,
+        ).values_list("id", flat=True)
+        published_resources = LearningResource.objects.filter(
+            readable_id=duplicate["readable_id"],
+            published=True,
+        ).values_list("id", flat=True)
+        # delete the unpublished duplicates, then re-embed the remaining published copies
+        LearningResource.objects.filter(id__in=unpublished_resources).delete()
+        embed_tasks.append(
+            generate_embeddings.si(published_resources, COURSE_TYPE, overwrite=True)
+        )
+    self.replace(celery.chain(*embed_tasks))
+
+
 @app.task
 def update_next_start_date_and_prices():
     """Update expired next start dates and prices"""

learning_resources/tasks_test.py

Lines changed: 16 additions & 0 deletions
@@ -23,6 +23,7 @@
     get_youtube_data,
     get_youtube_transcripts,
     marketing_page_for_resources,
+    remove_duplicate_resources,
     scrape_marketing_pages,
     sync_canvas_courses,
     update_next_start_date_and_prices,
@@ -676,3 +677,18 @@ def test_sync_canvas_courses(settings, mocker, django_assert_num_queries, canvas
         assert mock_ingest_course.call_count == 1
     else:
         assert mock_ingest_course.call_count == 2
+
+
+def test_remove_duplicate_resources(mocker, mocked_celery):
+    """
+    Test that remove_duplicate_resources removes duplicate unpublished resources
+    while keeping the published resource.
+    """
+    duplicate_id = "duplicate_id"
+
+    LearningResourceFactory.create_batch(3, readable_id=duplicate_id, published=False)
+    LearningResourceFactory.create(readable_id=duplicate_id)
+    assert LearningResource.objects.filter(readable_id=duplicate_id).count() == 4
+    with pytest.raises(mocked_celery.replace_exception_class):
+        remove_duplicate_resources()
+    assert LearningResource.objects.filter(readable_id=duplicate_id).count() == 1
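
The pytest.raises(mocked_celery.replace_exception_class) wrapper is there because self.replace() never returns normally: the repo's mocked_celery fixture (defined elsewhere, not part of this diff) intercepts the replace call so the test can still assert on database state afterwards. A hypothetical sketch of what such a fixture could look like, purely to illustrate the pattern; the names and patch target are assumptions, not the repo's actual fixture:

import pytest


class TaskReplaced(Exception):
    """Sentinel raised instead of actually replacing the task with a chain."""


@pytest.fixture
def mocked_celery(mocker):
    # Hypothetical: make any call to Task.replace raise the sentinel so a test
    # can catch it and then inspect side effects that happened before the call.
    mock = mocker.patch("celery.app.task.Task.replace", side_effect=TaskReplaced)
    mock.replace_exception_class = TaskReplaced
    return mock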

main/settings_celery.py

Lines changed: 4 additions & 0 deletions
@@ -161,6 +161,10 @@
             "SCRAPE_MARKETING_PAGES_SCHEDULE_SECONDS", 60 * 60 * 12
         ),  # default is every 12 hours
     },
+    "remove-duplicate-courses-every-6-hours": {
+        "task": "learning_resources.tasks.remove_duplicate_resources",
+        "schedule": crontab(minute=0, hour=9),  # 5:00am EST
+    },
 }
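
One small mismatch worth noting: the schedule key is named remove-duplicate-courses-every-6-hours, but crontab(minute=0, hour=9) fires once per day at 09:00 in the app's configured timezone (the "5:00am EST" of the comment), not every six hours. If a real six-hour cadence were ever wanted, Celery's crontab can express it directly; a sketch for comparison only, not part of this commit:

from celery.schedules import crontab

# Fires at minute 0 of hours 0, 6, 12 and 18, i.e. every six hours.
every_six_hours = crontab(minute=0, hour="*/6")

# The committed schedule, by contrast, runs once a day at 09:00.
daily_at_nine = crontab(minute=0, hour=9)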
