-
Notifications
You must be signed in to change notification settings - Fork 641
/
revision_importer.rb
169 lines (142 loc) · 5.39 KB
/
revision_importer.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# frozen_string_literal: true
require_dependency "#{Rails.root}/lib/replica"
require_dependency "#{Rails.root}/lib/duplicate_article_deleter"
require_dependency "#{Rails.root}/lib/importers/article_importer"
require_dependency "#{Rails.root}/app/helpers/encoding_helper"
#= Imports and updates revisions from Wikipedia into the dashboard database
class RevisionImporter
  include EncodingHelper

  # @param wiki [Wiki] the wiki to fetch revisions from
  # @param course [Course] the course whose students' revisions get imported
  # @param update_service [Object, nil] optional service passed through to Replica
  def initialize(wiki, course, update_service: nil)
    @wiki = wiki
    @course = course
    @update_service = update_service
  end

  # Imports revisions made by the course's students.
  # @param all_time [Boolean] when true, fetch every revision from the course
  #   start onward; when false, fetch only revisions expected to be new since
  #   the previous update.
  def import_revisions_for_course(all_time:)
    if all_time
      import_revisions(all_revisions_for_course)
    else
      import_revisions(new_revisions_for_course)
    end
  end

  ###########
  # Helpers #
  ###########
  private

  def all_revisions_for_course
    get_revisions(@course.students, course_start_date, end_of_update_period)
  end

  def new_revisions_for_course
    results = []
    # Users with no revisions are considered "new". For them, we search for
    # revisions starting from the beginning of the course, in case they were
    # just added to the course.
    @new_users = users_with_no_revisions
    results += revisions_from_new_users unless @new_users.empty?
    # For users who already have revisions during the course, we assume that
    # previous updates imported their revisions prior to the latest revisions.
    # We only need to import revisions made after the most recently imported one.
    @old_users = @course.students - @new_users
    results += revisions_from_old_users unless @old_users.empty?
    results
  end

  def revisions_from_new_users
    get_revisions(@new_users, course_start_date, end_of_update_period)
  end

  def revisions_from_old_users
    latest_rev = latest_revision_of_course
    # Fall back to the course start when no revision has been imported yet.
    start = latest_rev.blank? ? course_start_date : latest_rev.date.strftime('%Y%m%d%H%M%S')
    get_revisions(@old_users, start, end_of_update_period)
  end

  # Use revision data fetched from Replica to add new Revisions as well as
  # new Articles where appropriate. Processing in slices keeps memory usage
  # bounded for large courses.
  def import_revisions(data)
    data.each_slice(10000) do |sub_data|
      import_revisions_slice(sub_data)
    end
  end

  # Get revisions made by a set of users between two dates.
  # We limit the number of usernames per query in order to avoid
  # hitting the memory limit of the Replica endpoint.
  MAX_USERNAMES = 10
  def get_revisions(users, start, end_date)
    Utils.chunk_requests(users, MAX_USERNAMES) do |block|
      Replica.new(@wiki, @update_service).get_revisions block, start, end_date
    end
  end

  def course_start_date
    @course.start.strftime('%Y%m%d')
  end

  # pull all revisions until present, so that we have any after-the-end revisions
  # included for calculating retention when a past course gets updated.
  def end_of_update_period
    2.days.from_now.strftime('%Y%m%d')
  end

  def users_with_no_revisions
    @course.users.role('student')
           .joins(:courses_users)
           .where(courses_users: { revision_count: 0 })
  end

  def latest_revision_of_course
    @course.recent_revisions.where(wiki_id: @wiki.id).order('date DESC').first
  end

  def import_revisions_slice(sub_data)
    # Extract all article data from the slice. Outputs a hash with article attrs.
    articles = sub_data_to_article_attributes(sub_data)
    # We rely on the unique index here, mw_page_id and wiki_id
    Article.import articles, on_duplicate_key_update: [:title, :namespace]
    @articles = Article.where(wiki_id: @wiki.id, mw_page_id: articles.map { |a| a['mw_page_id'] })
    # Prep: get a user dictionary for all users referred to by revisions.
    users = user_dict_from_sub_data(sub_data)
    # Now get all the revisions.
    # We need a slightly different article dictionary format here:
    # mw_page_id => internal Article id.
    article_dict = @articles.each_with_object({}) { |a, memo| memo[a.mw_page_id] = a.id }
    revisions = sub_data_to_revision_attributes(sub_data, users, article_dict)
    Revision.import revisions, on_duplicate_key_ignore: true
    DuplicateArticleDeleter.new(@wiki).resolve_duplicates(@articles)
  end

  # Replica returns 'true'/'false' strings; convert them to real booleans.
  # Any other value yields nil.
  def string_to_boolean(string)
    case string
    when 'false'
      false
    when 'true'
      true
    end
  end

  def sub_data_to_article_attributes(sub_data)
    sub_data.map do |_a_id, article_data|
      {
        'mw_page_id' => article_data['article']['mw_page_id'],
        'wiki_id' => @wiki.id,
        # Strip 4-byte unicode (e.g. emoji) that the database can't store.
        'title' => sanitize_4_byte_string(article_data['article']['title']),
        'namespace' => article_data['article']['namespace']
      }
    end
  end

  def user_dict_from_sub_data(sub_data)
    users = sub_data.flat_map do |_a_id, article_data|
      article_data['revisions'].map { |rev_data| rev_data['username'] }
    end
    users.uniq!
    # Returns e.g. {"Nalumc"=>4, "Twkpassmore"=>3}
    User.where(username: users).pluck(:username, :id).to_h
  end

  def sub_data_to_revision_attributes(sub_data, users, articles)
    sub_data.flat_map do |_a_id, article_data|
      article_data['revisions'].map do |rev_data|
        mw_page_id = rev_data['mw_page_id'].to_i
        {
          mw_rev_id: rev_data['mw_rev_id'],
          date: rev_data['date'],
          characters: rev_data['characters'],
          article_id: articles[mw_page_id],
          mw_page_id:,
          user_id: users[rev_data['username']],
          new_article: string_to_boolean(rev_data['new_article']),
          system: string_to_boolean(rev_data['system']),
          wiki_id: rev_data['wiki_id']
        }
      end
    end
  end
end