From 2c44db962f11dadeae6865af47fb60b50f91b8f7 Mon Sep 17 00:00:00 2001 From: gabina Date: Wed, 18 Dec 2024 20:15:02 -0300 Subject: [PATCH] Stop creating article course timeslices for the entire universe of timeslices. Instead of that, we only create ac timeslices on demand, based on ingested revisions. This is to avoid using a huge amount of disk space for article course timeslices table, when most rows are actually empty. --- app/models/article_course_timeslice.rb | 16 +++++---- app/models/course_data/articles_courses.rb | 4 +-- lib/timeslice_manager.rb | 2 -- spec/lib/timeslice_manager_spec.rb | 6 ++-- spec/models/article_course_timeslice_spec.rb | 36 ++++++++++++++++++-- 5 files changed, 47 insertions(+), 17 deletions(-) diff --git a/app/models/article_course_timeslice.rb b/app/models/article_course_timeslice.rb index a3ad2eb101..2dcdead86f 100644 --- a/app/models/article_course_timeslice.rb +++ b/app/models/article_course_timeslice.rb @@ -51,17 +51,21 @@ def self.search_by_course_and_user(course, user_id) def self.update_article_course_timeslices(course, article_id, revisions) rev_start = revisions[:start] rev_end = revisions[:end] - # Article course timeslices to update - article_course_timeslices = ArticleCourseTimeslice.for_course_and_article(course, - article_id) - .for_revisions_between(rev_start, rev_end) - article_course_timeslices.each do |timeslice| + # Timeslice dates to update are determined based on course wiki timeslices + timeslices = course.course_wiki_timeslices.for_revisions_between(rev_start, rev_end) + timeslices.each do |timeslice| # Group revisions that belong to the timeslice revisions_in_timeslice = revisions[:revisions].select do |revision| timeslice.start <= revision.date && revision.date < timeslice.end end + # Get or create article course timeslice based on course, article_id, + # timeslice.start and timeslice.end + ac_timeslice = ArticleCourseTimeslice.find_or_create_by(course:, + article_id:, + start: timeslice.start, + end: timeslice.end) # Update cache for ArticleCourseTimeslice - timeslice.update_cache_from_revisions revisions_in_timeslice + ac_timeslice.update_cache_from_revisions revisions_in_timeslice end end diff --git a/app/models/course_data/articles_courses.rb b/app/models/course_data/articles_courses.rb index 82cf4412de..231de310b4 100644 --- a/app/models/course_data/articles_courses.rb +++ b/app/models/course_data/articles_courses.rb @@ -223,7 +223,7 @@ def self.get_first_revisions(revisions, new_article_ids) min_dates end - def self.maybe_insert_new_records(course, new_records) + def self.maybe_insert_new_records(_course, new_records) return if new_records.empty? # Do this in batches to avoid running the MySQL server out of memory new_records.each_slice(5000) do |new_record_slice| @@ -231,8 +231,6 @@ def self.maybe_insert_new_records(course, new_records) insert_all new_record_slice # rubocop:enable Rails/SkipsModelValidations end - - TimesliceManager.new(course).create_timeslices_for_new_article_course_records(new_records) end def self.destroy_invalid_records(course, valid_article_ids) diff --git a/lib/timeslice_manager.rb b/lib/timeslice_manager.rb index 0b1731e030..914238895b 100644 --- a/lib/timeslice_manager.rb +++ b/lib/timeslice_manager.rb @@ -112,7 +112,6 @@ def create_timeslices_for_new_course_wiki_records(wikis) def create_timeslices_for_new_course_start_date courses_wikis = @course.courses_wikis # order matters - create_empty_article_course_timeslices(start_dates_backward, @course.articles_courses) create_empty_course_user_wiki_timeslices(start_dates_backward) create_empty_course_wiki_timeslices(start_dates_backward, courses_wikis, needs_update: true) end @@ -122,7 +121,6 @@ def create_timeslices_for_new_course_start_date def create_timeslices_up_to_new_course_end_date courses_wikis = @course.courses_wikis # order matters - create_empty_article_course_timeslices(start_dates_from_old_end, @course.articles_courses) create_empty_course_user_wiki_timeslices(start_dates_from_old_end) create_empty_course_wiki_timeslices(start_dates_from_old_end, courses_wikis, needs_update: true) end diff --git a/spec/lib/timeslice_manager_spec.rb b/spec/lib/timeslice_manager_spec.rb index 053858e213..046f72142a 100644 --- a/spec/lib/timeslice_manager_spec.rb +++ b/spec/lib/timeslice_manager_spec.rb @@ -113,11 +113,10 @@ timeslice_manager.create_timeslices_for_new_course_start_date course.reload - # Create timeslices for the period between 2023-12-20 and 2024-01-01 + # Create course and user timeslices for the period between 2023-12-20 and 2024-01-01 expect(course.course_wiki_timeslices.size).to eq(369) expect(course.course_wiki_timeslices.where(needs_update: true).size).to eq(36) expect(course.course_user_wiki_timeslices.size).to eq(738) - expect(course.article_course_timeslices.size).to eq(369) end end end @@ -142,11 +141,10 @@ timeslice_manager.create_timeslices_up_to_new_course_end_date course.reload - # Create timeslices for the period between 2024-04-20 and 2024-04-30 + # Create course and user timeslices for the period between 2024-04-20 and 2024-04-30 expect(course.course_wiki_timeslices.size).to eq(363) expect(course.course_wiki_timeslices.where(needs_update: true).size).to eq(30) expect(course.course_user_wiki_timeslices.size).to eq(726) - expect(course.article_course_timeslices.size).to eq(363) end end end diff --git a/spec/models/article_course_timeslice_spec.rb b/spec/models/article_course_timeslice_spec.rb index 55169b6316..adf4297180 100644 --- a/spec/models/article_course_timeslice_spec.rb +++ b/spec/models/article_course_timeslice_spec.rb @@ -28,6 +28,7 @@ let(:user) { create(:user) } let(:start) { 1.month.ago.beginning_of_day } let(:course) { create(:course, start:, end: 1.month.from_now.beginning_of_day) } + let(:wiki) { Wiki.get_or_create(language: 'en', project: 'wikipedia') } let(:article_course) { create(:articles_course, article:, course:) } let(:revision1) do build(:revision, article:, @@ -89,14 +90,43 @@ revisions << build(:revision, article:, user_id: 3, date: start + 50.hours) revisions << build(:revision, article:, user_id: 7, date: start + 51.hours) + create(:course_wiki_timeslice, course:, wiki:, start:, end: start + 1.day) + create(:course_wiki_timeslice, course:, wiki:, start: start + 1.day, + end: start + 2.days) + create(:course_wiki_timeslice, course:, wiki:, start: start + 2.days, + end: start + 3.days) + end + + it 'creates the right article timeslices based on the revisions' do + article_course_timeslice_0 = described_class.find_by(course:, article:, start:) + article_course_timeslice_1 = described_class.find_by(course:, article:, start: start + 1.day) + article_course_timeslice_2 = described_class.find_by(course:, article:, start: start + 2.days) + + expect(article_course_timeslice_0).to be_nil + expect(article_course_timeslice_1).to be_nil + expect(article_course_timeslice_2).to be_nil + + start_period = start.strftime('%Y%m%d%H%M%S') + end_period = (start + 55.hours).strftime('%Y%m%d%H%M%S') + revision_data = { start: start_period, end: end_period, revisions: } + described_class.update_article_course_timeslices(course, article.id, revision_data) + + article_course_timeslice_0 = described_class.find_by(course:, article:, start:) + article_course_timeslice_1 = described_class.find_by(course:, article:, start: start + 1.day) + article_course_timeslice_2 = described_class.find_by(course:, article:, start: start + 2.days) + + expect(article_course_timeslice_0.user_ids).to eq([25, 1]) + expect(article_course_timeslice_1.user_ids).to eq([1]) + expect(article_course_timeslice_2.user_ids).to eq([3, 7]) + end + + it 'updates the right article timeslices based on the revisions' do create(:article_course_timeslice, course:, article:, start:, end: start + 1.day) create(:article_course_timeslice, course:, article:, start: start + 1.day, end: start + 2.days) create(:article_course_timeslice, course:, article:, start: start + 2.days, end: start + 3.days) - end - it 'updates the right article timeslices based on the revisions' do article_course_timeslice_0 = described_class.find_by(course:, article:, start:) article_course_timeslice_1 = described_class.find_by(course:, article:, start: start + 1.day) article_course_timeslice_2 = described_class.find_by(course:, article:, start: start + 2.days) @@ -110,6 +140,8 @@ revision_data = { start: start_period, end: end_period, revisions: } described_class.update_article_course_timeslices(course, article.id, revision_data) + # No new article course timeslices were created + expect(described_class.where(course:, article:).count).to eq(3) article_course_timeslice_0 = described_class.find_by(course:, article:, start:) article_course_timeslice_1 = described_class.find_by(course:, article:, start: start + 1.day) article_course_timeslice_2 = described_class.find_by(course:, article:, start: start + 2.days)