Skip to content

Commit

Permalink
[Data rearchitecture] Stop creating complete universe of article cour…
Browse files Browse the repository at this point in the history
…se timeslices (#6069)

* Stop creating article course timeslices for the entire universe of timeslices. Instead of that, we only create ac timeslices on demand, based on ingested revisions. This is to avoid using a huge amount of disk space for article course timeslices table, when most rows are actually empty.

* Remove calls to create_timeslices_for_new_article_course_records in specs
  • Loading branch information
gabina authored Dec 19, 2024
1 parent da0da8f commit 8c4862a
Show file tree
Hide file tree
Showing 12 changed files with 150 additions and 198 deletions.
16 changes: 10 additions & 6 deletions app/models/article_course_timeslice.rb
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,21 @@ def self.search_by_course_and_user(course, user_id)
def self.update_article_course_timeslices(course, article_id, revisions)
rev_start = revisions[:start]
rev_end = revisions[:end]
# Article course timeslices to update
article_course_timeslices = ArticleCourseTimeslice.for_course_and_article(course,
article_id)
.for_revisions_between(rev_start, rev_end)
article_course_timeslices.each do |timeslice|
# Timeslice dates to update are determined based on course wiki timeslices
timeslices = course.course_wiki_timeslices.for_revisions_between(rev_start, rev_end)
timeslices.each do |timeslice|
# Group revisions that belong to the timeslice
revisions_in_timeslice = revisions[:revisions].select do |revision|
timeslice.start <= revision.date && revision.date < timeslice.end
end
# Get or create article course timeslice based on course, article_id,
# timeslice.start and timeslice.end
ac_timeslice = ArticleCourseTimeslice.find_or_create_by(course:,
article_id:,
start: timeslice.start,
end: timeslice.end)
# Update cache for ArticleCourseTimeslice
timeslice.update_cache_from_revisions revisions_in_timeslice
ac_timeslice.update_cache_from_revisions revisions_in_timeslice
end
end

Expand Down
6 changes: 2 additions & 4 deletions app/models/course_data/articles_courses.rb
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ def self.update_from_course_revisions(course, revisions)
{ article_id: id, course_id: course.id, first_revision: first_revisions[id] }
end

maybe_insert_new_records(course, new_records)
maybe_insert_new_records(new_records)
end

# Given an array of revisions and an array of article ids,
Expand All @@ -223,16 +223,14 @@ def self.get_first_revisions(revisions, new_article_ids)
min_dates
end

def self.maybe_insert_new_records(course, new_records)
def self.maybe_insert_new_records(new_records)
return if new_records.empty?
# Do this in batches to avoid running the MySQL server out of memory
new_records.each_slice(5000) do |new_record_slice|
# rubocop:disable Rails/SkipsModelValidations
insert_all new_record_slice
# rubocop:enable Rails/SkipsModelValidations
end

TimesliceManager.new(course).create_timeslices_for_new_article_course_records(new_records)
end

def self.destroy_invalid_records(course, valid_article_ids)
Expand Down
9 changes: 0 additions & 9 deletions lib/timeslice_manager.rb
Original file line number Diff line number Diff line change
Expand Up @@ -85,13 +85,6 @@ def delete_course_user_wiki_timeslices_after_end_date
end
end

# Creates article course timeslices records for new articles courses
# Takes an array like the following:
# [{:article_id=>115, :course_id=>72},..., {:article_id=>116, :course_id=>72}]
def create_timeslices_for_new_article_course_records(articles_courses)
create_empty_article_course_timeslices(start_dates, articles_courses)
end

# Creates course user timeslices records for every course wiki for new course users
# Takes an array of CoursesUsers records
def create_timeslices_for_new_course_user_records(courses_users)
Expand All @@ -112,7 +105,6 @@ def create_timeslices_for_new_course_wiki_records(wikis)
def create_timeslices_for_new_course_start_date
courses_wikis = @course.courses_wikis
# order matters
create_empty_article_course_timeslices(start_dates_backward, @course.articles_courses)
create_empty_course_user_wiki_timeslices(start_dates_backward)
create_empty_course_wiki_timeslices(start_dates_backward, courses_wikis, needs_update: true)
end
Expand All @@ -122,7 +114,6 @@ def create_timeslices_for_new_course_start_date
def create_timeslices_up_to_new_course_end_date
courses_wikis = @course.courses_wikis
# order matters
create_empty_article_course_timeslices(start_dates_from_old_end, @course.articles_courses)
create_empty_course_user_wiki_timeslices(start_dates_from_old_end)
create_empty_course_wiki_timeslices(start_dates_from_old_end, courses_wikis, needs_update: true)
end
Expand Down
105 changes: 38 additions & 67 deletions spec/lib/articles_courses_cleaner_timeslice_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
describe ArticlesCoursesCleanerTimeslice do
let(:enwiki) { Wiki.get_or_create(project: 'wikipedia', language: 'en') }
let(:wikidata) { Wiki.get_or_create(project: 'wikidata', language: nil) }
let(:course) { create(:course, start: '2024-01-01', end: '2024-04-20') }
let(:start) { '2024-01-01'.to_datetime }
let(:course) { create(:course, start:, end: '2024-04-20') }
let(:manager) { TimesliceManager.new(course) }
let(:article1) { create(:article, wiki: enwiki) }
let(:article2) { create(:article, wiki: wikidata) }
Expand Down Expand Up @@ -34,19 +35,17 @@
create(:articles_course, course:, article: article1, user_ids: [1, 45])
create(:articles_course, course:, article: article3, user_ids: [47, 45, 5])
create(:articles_course, course:, article: article2, user_ids: [455])
manager.create_timeslices_for_new_article_course_records(
[{ article_id: article1.id, course_id: course.id },
{ article_id: article2.id, course_id: course.id },
{ article_id: article3.id, course_id: course.id }]
)
create(:article_course_timeslice, course:, article: article1)
create(:article_course_timeslice, course:, article: article2)
create(:article_course_timeslice, course:, article: article3)
end

it 'removes ArticlesCourses and their timeslices that were edited only by removed users' do
expect(course.article_course_timeslices.size).to eq(333)
expect(course.article_course_timeslices.size).to eq(3)
expect(course.articles_courses.size).to eq(3)
# User ids 5, 45 and 47 were deleted
described_class.clean_articles_courses_for_user_ids(course, [5, 45, 47])
expect(course.article_course_timeslices.size).to eq(222)
expect(course.article_course_timeslices.size).to eq(2)
expect(course.articles_courses.size).to eq(2)
expect(course.articles_courses.first.user_ids).to eq([1, 45])
expect(course.articles_courses.second.user_ids).to eq([455])
Expand All @@ -59,41 +58,27 @@
create(:articles_course, course:, article: article1, user_ids: [1])
create(:articles_course, course:, article: article2, user_ids: [47])
create(:articles_course, course:, article: article3, user_ids: [455])
manager.create_timeslices_for_new_article_course_records(
[{ article_id: article1.id, course_id: course.id },
{ article_id: article2.id, course_id: course.id },
{ article_id: article3.id, course_id: course.id }]
)
# Course start date is updated
course.update(start: '2024-01-10')

# Article 1 was only edited before the new start date
timeslice = ArticleCourseTimeslice.where(course:)
.where(article_id: article1.id)
.where(start: '2024-01-02'.to_datetime)
.first
timeslice.update(user_ids: [1])
create(:article_course_timeslice, course:, article: article1, start:,
end: start + 1.day, user_ids: [1])

# Article 2 was edited before and after the new start date
timeslice = ArticleCourseTimeslice.where(course:)
.where(article_id: article2.id)
.where(start: '2024-01-02'.to_datetime)
.first
timeslice.update(user_ids: [47])
timeslice = ArticleCourseTimeslice.where(course:)
.where(article_id: article2.id)
.where(start: '2024-02-15'.to_datetime)
.first
timeslice.update(user_ids: [47])
create(:article_course_timeslice, course:, article: article2, start:,
end: start + 1.day, user_ids: [47])
create(:article_course_timeslice, course:, article: article2, start: start + 10.days,
end: start + 11.days, user_ids: [47])

# Article 3 was only edited after the new start date
timeslice = ArticleCourseTimeslice.where(course:)
.where(article_id: article3.id)
.where(start: '2024-02-18'.to_datetime)
.first
timeslice.update(user_ids: [455])
create(:article_course_timeslice, course:, article: article3, start: start + 10.days,
end: start + 11.days, user_ids: [455])

# Course start date is updated
course.update(start: '2024-01-10')
end

it 'removes ArticlesCourses and timeslices that do not belong to the course anymore' do
expect(course.article_course_timeslices.size).to eq(333)
expect(course.article_course_timeslices.size).to eq(4)
expect(course.articles_courses.size).to eq(3)
# Clean articles courses
described_class.clean_articles_courses_prior_to_course_start(course)
Expand All @@ -102,7 +87,7 @@
expect(course.article_course_timeslices.where(article_id: article1.id).size).to eq(0)
# Timeslices prior to the new course start date were deleted
expect(course.article_course_timeslices.where('end <= ?', course.start).size).to eq(0)
expect(course.article_course_timeslices.size).to eq(204)
expect(course.article_course_timeslices.size).to eq(2)
# Article 1 was deleted
expect(course.articles_courses.size).to eq(2)
expect(course.articles_courses.first.article_id).to eq(article2.id)
Expand All @@ -116,41 +101,27 @@
create(:articles_course, course:, article: article1, user_ids: [1])
create(:articles_course, course:, article: article2, user_ids: [47])
create(:articles_course, course:, article: article3, user_ids: [455])
manager.create_timeslices_for_new_article_course_records(
[{ article_id: article1.id, course_id: course.id },
{ article_id: article2.id, course_id: course.id },
{ article_id: article3.id, course_id: course.id }]
)
# Course end date is updated
course.update(end: '2024-04-10')

# Article 1 was only edited after the new end date
timeslice = ArticleCourseTimeslice.where(course:)
.where(article_id: article1.id)
.where(start: '2024-04-15'.to_datetime)
.first
timeslice.update(user_ids: [1])
create(:article_course_timeslice, course:, article: article1, start: '2024-04-11',
end: '2024-04-12', user_ids: [1])

# Article 2 was edited before and after the new end date
timeslice = ArticleCourseTimeslice.where(course:)
.where(article_id: article2.id)
.where(start: '2024-01-02'.to_datetime)
.first
timeslice.update(user_ids: [47])
timeslice = ArticleCourseTimeslice.where(course:)
.where(article_id: article2.id)
.where(start: '2024-04-16'.to_datetime)
.first
timeslice.update(user_ids: [47])
# Article 3 was only edited after the new end date
timeslice = ArticleCourseTimeslice.where(course:)
.where(article_id: article3.id)
.where(start: '2024-02-18'.to_datetime)
.first
timeslice.update(user_ids: [455])
create(:article_course_timeslice, course:, article: article2, start:,
end: start + 1.day, user_ids: [47])
create(:article_course_timeslice, course:, article: article2, start: '2024-04-11',
end: '2024-04-12', user_ids: [47])

# Article 3 was only edited before the new end date
create(:article_course_timeslice, course:, article: article3, start:,
end: start + 1.day, user_ids: [455])

# Course end date is updated
course.update(end: '2024-04-10')
end

it 'removes ArticlesCourses and timeslices that do not belong to the course anymore' do
expect(course.article_course_timeslices.size).to eq(333)
expect(course.article_course_timeslices.size).to eq(4)
expect(course.articles_courses.size).to eq(3)
# Clean articles courses
described_class.clean_articles_courses_after_course_end(course)
Expand All @@ -159,7 +130,7 @@
expect(course.article_course_timeslices.where(article_id: article1.id).size).to eq(0)
# Timeslices after the new course end date were deleted
expect(course.article_course_timeslices.where('start > ?', course.end).size).to eq(0)
expect(course.article_course_timeslices.size).to eq(202)
expect(course.article_course_timeslices.size).to eq(2)
# Article 1 was deleted
expect(course.articles_courses.size).to eq(2)
expect(course.articles_courses.first.article_id).to eq(article2.id)
Expand Down
Loading

0 comments on commit 8c4862a

Please sign in to comment.