[Data rearchitecture] Implement UpdateWikidataStatsTimeslice (#6059)
* Add stats field to the course_wiki_timeslices table to store partial stats. For now, this will only work for Wikidata.

* Add UpdateWikidataStatsTimeslice class to import Wikidata stats through the wikidata-diff-analyzer gem, build partial stats for timeslices, and create/update course_stats rows for the course. Add basic specs for it.

* Use UpdateWikidataStatsTimeslice class.

* Fix format_course_stats when the 'other updates' field does not exist

* Do not include deleted revisions
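
Taken together, these changes split Wikidata stats into per-timeslice partials that are later summed into the course-wide CourseStat row. Below is a minimal, hedged sketch of that flow using the methods introduced in this commit; course, wikidata_revisions and timeslice are assumed to already exist.

# Sketch only: course, wikidata_revisions and timeslice are assumed to exist.
wikidata = Wiki.get_or_create(language: nil, project: 'wikidata')
updater = UpdateWikidataStatsTimeslice.new(course)

# 1. While fetching revisions for a wikidata wiki, each revision's summary field
#    gets filled with serialized diff stats from the wikidata-diff-analyzer gem.
revisions = updater.update_revisions_with_stats(wikidata_revisions)

# 2. Each CourseWikiTimeslice stores the partial stats built from its own revisions.
timeslice.stats = updater.build_stats_from_revisions(revisions)

# 3. At the end of a course update, the partial stats from every wikidata timeslice
#    are summed and written to the course's CourseStat row.
partial_stats = CourseWikiTimeslice.for_course_and_wiki(course, wikidata).pluck(:stats)
updater.update_wikidata_statistics(partial_stats)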
gabina authored Dec 19, 2024
1 parent 65bfd73 commit da0da8f
Showing 12 changed files with 238 additions and 4 deletions.
5 changes: 4 additions & 1 deletion app/helpers/course_helper.rb
@@ -32,7 +32,10 @@ def course_i18n(message_key, course = nil)
def format_course_stats(course_stats)
course_stats.each do |wiki_ns_key, _wiki_ns_stats|
if wiki_ns_key == 'www.wikidata.org'
course_stats[wiki_ns_key]['other updates'] += course_stats[wiki_ns_key]['unknown']
# Not all course stats have the 'other updates' field
if course_stats[wiki_ns_key]['other updates']
course_stats[wiki_ns_key]['other updates'] += course_stats[wiki_ns_key]['unknown']
end
course_stats[wiki_ns_key].reject! { |k, _v| k == 'unknown' }
end
# convert stats to human readable values
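
For context, partial stats built by UpdateWikidataStatsTimeslice contain only the keys from STATS_CLASSIFICATION plus 'total revisions', so the 'other updates' key can be absent. A hypothetical hash showing why the guard above is needed:

# Hypothetical wikidata stats hash with no 'other updates' key; the old
# unconditional += would raise NoMethodError (undefined method '+' for nil).
course_stats = {
  'www.wikidata.org' => { 'claims created' => 10, 'total revisions' => 4 }
}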
2 changes: 1 addition & 1 deletion app/models/course_data/articles_courses.rb
@@ -211,7 +211,7 @@ def self.update_from_course_revisions(course, revisions)
# Given an array of revisions and an array of article ids,
# it returns a hash with the min revision datetime for every article id.
def self.get_first_revisions(revisions, new_article_ids)
# This is the only way I found to get an always-greater value
# TODO: find a better way to ensure an always-greater value
max_time = Time.utc(9999, 12, 31)
min_dates = Hash.new(max_time)

9 changes: 9 additions & 0 deletions app/models/course_wiki_timeslice.rb
@@ -20,11 +20,14 @@
# wiki_id :integer not null
# last_mw_rev_datetime :datetime
# needs_update :boolean default(FALSE)
# stats :text(65535)
#
class CourseWikiTimeslice < ApplicationRecord
belongs_to :course
belongs_to :wiki

serialize :stats, Hash

scope :for_course_and_wiki, ->(course, wiki) { where(course:, wiki:) }
# Returns the timeslice to which a datetime belongs (it should be a single timeslice)
scope :for_datetime, ->(datetime) { where('start <= ? AND end > ?', datetime, datetime) }
@@ -96,6 +99,7 @@ def update_cache_from_revisions(revisions)
update_upload_count
update_uploads_in_use_count
update_upload_usages_count
update_stats
self.needs_update = false
save
end
@@ -151,4 +155,9 @@ def update_upload_usages_count
# TODO: count only uploads whose updated_at falls within the timeslice range
self.upload_usages_count = course.uploads_in_use.sum(:usage_count)
end

def update_stats
return unless wiki.project == 'wikidata'
self.stats = UpdateWikidataStatsTimeslice.new(course).build_stats_from_revisions(@revisions)
end
end
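
Because stats is declared with serialize :stats, Hash, reading the column yields a Hash (an empty one when the database value is NULL), so the partial stats round-trip cleanly. A small illustrative read with hypothetical values; course and wikidata_wiki are assumed to exist:

# Illustrative only: course and wikidata_wiki are assumed to exist.
timeslice = CourseWikiTimeslice.for_course_and_wiki(course, wikidata_wiki).first
timeslice.stats                    # => {} by default; stays empty for non-wikidata wikis
timeslice.stats['claims created']  # => 3 (hypothetical) after update_cache_from_revisions runs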
7 changes: 5 additions & 2 deletions app/services/update_course_stats_timeslice.rb
@@ -32,7 +32,7 @@ def initialize(course)
update_article_status if should_update_article_status?
update_average_pageviews
update_caches
# update_wikidata_stats if wikidata
update_wikidata_stats if wikidata
# This needs to happen after `update_caches` because it relies on ArticlesCourses#new_article
# to calculate new article stats for each namespace.
update_wiki_namespace_stats
@@ -85,7 +85,10 @@ def update_caches
end

def update_wikidata_stats
UpdateWikidataStatsWorker.new.perform(@course)
wikidata = Wiki.get_or_create(language: nil, project: 'wikidata')
timeslices = CourseWikiTimeslice.for_course_and_wiki(@course, wikidata)
stats = timeslices.pluck(:stats)
UpdateWikidataStatsTimeslice.new(@course).update_wikidata_statistics(stats)
@debugger.log_update_progress :wikidata_stats_updated
end

12 changes: 12 additions & 0 deletions app/services/update_course_wiki_timeslices.rb
@@ -17,6 +17,7 @@ def initialize(course)
@course = course
@timeslice_manager = TimesliceManager.new(@course)
@debugger = UpdateDebugger.new(@course)
@wikidata_stats_updater = UpdateWikidataStatsTimeslice.new(@course) if wikidata
end

def run(all_time:)
@@ -88,6 +89,13 @@ def fetch_data(wiki, timeslice_start, timeslice_end)
timeslice_start.strftime('%Y%m%d%H%M%S'),
timeslice_end.strftime('%Y%m%d%H%M%S'),
update_service: self)

# Only for the wikidata project, fetch wikidata stats for these revisions
if wiki.project == 'wikidata' && @revisions.present?
wikidata_revisions = @revisions[wiki][:revisions].reject(&:deleted)
@revisions[wiki][:revisions] =
@wikidata_stats_updater.update_revisions_with_stats(wikidata_revisions)
end
# TODO: replace the logic on ArticlesCourses.update_from_course to remove all
# the ArticlesCourses that do not correspond to course revisions.
# That may happen if the course dates changed, so some revisions are no
@@ -144,6 +152,10 @@ def update_course_user_wiki_timeslices_for_wiki(wiki, revisions)
end
end

def wikidata
@course.wikis.find { |wiki| wiki.project == 'wikidata' }
end

def log_error(error)
Sentry.capture_message "#{@course.title} update timeslices error: #{error}",
level: 'error'
153 changes: 153 additions & 0 deletions app/services/update_wikidata_stats_timeslice.rb
@@ -0,0 +1,153 @@
# frozen_string_literal: true
require_dependency "#{Rails.root}/lib/wikidata_summary_parser"
require_dependency "#{Rails.root}/lib/importers/wikidata_summary_importer"
# require the installed wikidata-diff-analyzer gem
require 'wikidata-diff-analyzer'

class UpdateWikidataStatsTimeslice
# This hash contains the keys of the wikidata-diff-analyzer output hash
# and maps them to the values used in the UI and CourseStat Hash
STATS_CLASSIFICATION = {
# UI section: General
'merge_to' => 'merged to',
'added_sitelinks' => 'interwiki links added',
# UI section: Claims
'added_claims' => 'claims created',
'removed_claims' => 'claims removed',
'changed_claims' => 'claims changed',
# UI section: Items
'clear_item' => 'items cleared',
'create_item' => 'items created',
# UI section: Labels
'added_labels' => 'labels added',
'removed_labels' => 'labels removed',
'changed_labels' => 'labels changed',
# UI section: Descriptions
'added_descriptions' => 'descriptions added',
'removed_descriptions' => 'descriptions removed',
'changed_descriptions' => 'descriptions changed',
# UI section: Aliases
'added_aliases' => 'aliases added',
'removed_aliases' => 'aliases removed',
'changed_aliases' => 'aliases changed',
# UI section: Others
'added_references' => 'references added',
'added_qualifiers' => 'qualifiers added',
'redirect' => 'redirects created',
'undo' => 'reverts performed',
'restore' => 'restorations performed',
# UI section: Not added yet
'removed_references' => 'references removed',
'changed_references' => 'references changed',
'removed_qualifiers' => 'qualifiers removed',
'changed_qualifiers' => 'qualifiers changed',
'removed_sitelinks' => 'interwiki links removed',
'changed_sitelinks' => 'interwiki links updated',
'merge_from' => 'merged from',
'added_lemmas' => 'lemmas added',
'removed_lemmas' => 'lemmas removed',
'changed_lemmas' => 'lemmas changed',
'added_forms' => 'forms added',
'removed_forms' => 'forms removed',
'changed_forms' => 'forms changed',
'added_senses' => 'senses added',
'removed_senses' => 'senses removed',
'changed_senses' => 'senses changed',
'create_property' => 'properties created',
'create_lexeme' => 'lexeme items created',
'added_representations' => 'representations added',
'removed_representations' => 'representations removed',
'changed_representations' => 'representations changed',
'added_glosses' => 'glosses added',
'removed_glosses' => 'glosses removed',
'changed_glosses' => 'glosses changed',
'added_formclaims' => 'form claims added',
'removed_formclaims' => 'form claims removed',
'changed_formclaims' => 'form claims changed',
'added_senseclaims' => 'sense claims added',
'removed_senseclaims' => 'sense claims removed',
'changed_senseclaims' => 'sense claims changed'
}.freeze

def initialize(course)
@course = course
end

# Given an array of revisions, updates the summary field of each one with the
# Wikidata stats. The wikidata-diff-analyzer gem is used to fetch the stats.
# Returns the updated array.
def update_revisions_with_stats(revisions)
revision_ids = revisions.pluck(:mw_rev_id)
analyzed_revisions = WikidataDiffAnalyzer.analyze(revision_ids)[:diffs]
revisions.each do |revision|
rev_id = revision.mw_rev_id
individual_stat = analyzed_revisions[rev_id]
serialized_stat = individual_stat.to_json
revision.summary = serialized_stat
end
revisions
end

# Given an array of revisions, it builds the stats for those revisions
def build_stats_from_revisions(revisions)
stats = {}
STATS_CLASSIFICATION.each_key do |key|
stats[STATS_CLASSIFICATION[key]] = 0
end

# Sum up stats after deserializing the stats stored on each revision object
revisions.each do |revision|
# Deserialize the summary field to get the stats
deserialized_stat = summary(revision)
next if deserialized_stat.nil?
# Add each field of the deserialized stat to the running totals in the stats hash
deserialized_stat.each do |key, value|
stats[STATS_CLASSIFICATION[key]] += value
end
end
stats['total revisions'] = revisions.count
stats
end

# Given an array of individual stats, creates or updates the CourseStat row for the course.
def update_wikidata_statistics(individual_stats)
stats = sum_up_stats individual_stats
crs_stat = CourseStat.find_by(course_id: @course.id) || CourseStat.create(course_id: @course.id)

# Update the stats_hash in the CourseStat model and save it
crs_stat.stats_hash[wikidata.domain] = stats
crs_stat.save
end

private

def sum_up_stats(individual_stats)
total_stats = {}
STATS_CLASSIFICATION.each_key do |key|
total_stats[STATS_CLASSIFICATION[key]] = 0
end
# Add total revisions
total_stats['total revisions'] = 0

# Iterate over each individual stat and sum up the values
individual_stats.each do |hash|
hash.each do |key, value|
total_stats[key] += value
end
end
total_stats
end

def wikidata
Wiki.get_or_create(language: nil, project: 'wikidata')
end

# Parses the serialized stats saved in the summary field. If parsing fails, it returns nil,
# meaning the field contains a regular edit summary rather than serialized stats.
def summary(revision)
summary = revision.summary
JSON.parse(summary) if summary.present? && summary.start_with?('{', '[')
rescue JSON::ParserError
nil # Return nil if parsing fails (i.e., not diff_stats)
end
end
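
After update_revisions_with_stats runs, a revision's summary field no longer holds an ordinary edit summary but a JSON-serialized stats hash, which the private summary method parses back into a Hash. A hypothetical, abridged example:

# Hypothetical, abridged contents of the summary field after
# update_revisions_with_stats; real output contains every analyzer key.
revision.summary
# => "{\"added_claims\":2,\"removed_claims\":0,\"changed_claims\":1}"
JSON.parse(revision.summary)
# => { "added_claims" => 2, "removed_claims" => 0, "changed_claims" => 1 }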
@@ -0,0 +1,5 @@
class AddStatsToCourseWikiTimeslice < ActiveRecord::Migration[7.0]
def change
add_column :course_wiki_timeslices, :stats, :text
end
end
1 change: 1 addition & 0 deletions db/schema.rb
@@ -264,6 +264,7 @@
t.integer "wiki_id", null: false
t.datetime "last_mw_rev_datetime"
t.boolean "needs_update", default: false
t.text "stats"
t.index ["course_id", "wiki_id", "start", "end"], name: "course_wiki_timeslice_by_course_wiki_start_and_end", unique: true
end

1 change: 1 addition & 0 deletions spec/factories/course_wiki_timeslices.rb
@@ -20,6 +20,7 @@
# wiki_id :integer not null
# last_mw_rev_datetime :datetime
# needs_update :boolean default(FALSE)
# stats :text(65535)
#

FactoryBot.define do
1 change: 1 addition & 0 deletions spec/models/course_wiki_timeslice_spec.rb
@@ -20,6 +20,7 @@
# wiki_id :integer not null
# last_mw_rev_datetime :datetime
# needs_update :boolean default(FALSE)
# stats :text(65535)
#
require 'rails_helper'

2 changes: 2 additions & 0 deletions spec/services/update_course_wiki_timeslices_spec.rb
@@ -111,6 +111,7 @@
expect(timeslice.uploads_in_use_count).to eq(0)
expect(timeslice.upload_usages_count).to eq(0)
expect(timeslice.last_mw_rev_datetime).to eq('20181129180841'.to_datetime)
expect(timeslice.stats).to be_empty

# For wikidata
timeslice = course.course_wiki_timeslices.where(wiki: wikidata,
@@ -122,6 +123,7 @@
expect(timeslice.uploads_in_use_count).to eq(0)
expect(timeslice.upload_usages_count).to eq(0)
expect(timeslice.last_mw_rev_datetime).to eq('20181124045740'.to_datetime)
expect(timeslice.stats['references removed']).to eq(2)
end

it 'rolls back the updates if something goes wrong' do
44 changes: 44 additions & 0 deletions spec/services/update_wikidata_stats_timeslice_spec.rb
@@ -0,0 +1,44 @@
# frozen_string_literal: true

require 'rails_helper'

require "#{Rails.root}/lib/importers/revision_importer"
require "#{Rails.root}/app/services/update_wikidata_stats"

describe UpdateWikidataStatsTimeslice do
describe 'update_wikidata_statistics' do
let(:wikidata) { Wiki.get_or_create(language: nil, project: 'wikidata') }
let(:course) do
create(:course, start: Date.new(2022, 1, 5), end: Date.new(2022, 1, 7),
home_wiki: wikidata)
end
let(:revision1) { create(:revision, wiki: wikidata, mw_rev_id: 1556860240) }
let(:revision2) { create(:revision, wiki: wikidata, mw_rev_id: 99682036) }
let(:revisions) { [revision1, revision2] }
let(:updater) { described_class.new(course) }

before do
stub_wiki_validation
end

it 'imports wikidata', :vcr do
revisions.each do |rev|
expect(rev.summary).to be_nil
end
updater.update_revisions_with_stats(revisions)
revisions.each do |rev|
expect(rev.summary).not_to be_nil
end
end

it 'creates record in CourseStat table', :vcr do
expect(CourseStat.count).to eq(0)
updater.update_revisions_with_stats(revisions)
partial_stats = updater.build_stats_from_revisions(revisions)
updater.update_wikidata_statistics([partial_stats])
expect(CourseStat.count).to eq(1)
expect(CourseStat.last.stats_hash).not_to be_nil
expect(CourseStat.last.course_id).to eq(Course.last.id)
end
end
end
