Skip to content

Commit

Permalink
Merge pull request #233 from edgi-govdata-archiving/61-import-from-sh…
Browse files Browse the repository at this point in the history
…eets

Add rake task for importing annotations
  • Loading branch information
Mr0grog authored Feb 28, 2018
2 parents adbfe2d + 3de03ad commit b586f2f
Show file tree
Hide file tree
Showing 4 changed files with 219 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -53,3 +53,7 @@ TOKEN_PRIVATE_KEY='MIIEogIBAAKCAQEAufNrDQRl6Gj1yuga0DVHeJ4fi+lNWtn4S8XRU8/nBwm9v

# In production, set up Sentry.io for error tracking
# SENTRY_DSN=

# Set these if you are running rake tasks to import data from Google Sheets
# GOOGLE_CLIENT_ID=XYZ
# GOOGLE_CLIENT_SECRET=XYZ
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ gem 'oj', '~> 3.4'
gem 'sentry-raven'
gem 'readthis'
gem 'hiredis'
gem 'google-api-client'

# See https://github.com/rails/execjs#readme for more supported runtimes
# gem 'therubyracer', platforms: :ruby
Expand Down
39 changes: 39 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ GEM
crack (0.4.3)
safe_yaml (~> 1.0.0)
crass (1.0.3)
declarative (0.0.10)
declarative-option (0.1.0)
devise (4.4.1)
bcrypt (~> 3.0)
orm_adapter (~> 0.1)
Expand All @@ -82,10 +84,26 @@ GEM
ffi (1.9.21)
globalid (0.4.1)
activesupport (>= 4.2.0)
google-api-client (0.19.8)
addressable (~> 2.5, >= 2.5.1)
googleauth (>= 0.5, < 0.7.0)
httpclient (>= 2.8.1, < 3.0)
mime-types (~> 3.0)
representable (~> 3.0)
retriable (>= 2.0, < 4.0)
googleauth (0.6.2)
faraday (~> 0.12)
jwt (>= 1.4, < 3.0)
logging (~> 2.0)
memoist (~> 0.12)
multi_json (~> 1.11)
os (~> 0.9)
signet (~> 0.7)
hashdiff (0.3.7)
hiredis (0.6.1)
httparty (0.16.0)
multi_xml (>= 0.5.2)
httpclient (2.8.3)
i18n (0.9.5)
concurrent-ruby (~> 1.0)
jmespath (1.3.1)
Expand All @@ -95,12 +113,20 @@ GEM
rb-fsevent (~> 0.9, >= 0.9.4)
rb-inotify (~> 0.9, >= 0.9.7)
ruby_dep (~> 1.2)
little-plugger (1.1.4)
logging (2.2.2)
little-plugger (~> 1.1)
multi_json (~> 1.10)
loofah (2.2.0)
crass (~> 1.0.2)
nokogiri (>= 1.5.9)
mail (2.7.0)
mini_mime (>= 0.1.1)
memoist (0.16.0)
method_source (0.9.0)
mime-types (3.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2016.0521)
mini_mime (1.0.0)
mini_portile2 (2.3.0)
minitest (5.11.3)
Expand All @@ -115,6 +141,7 @@ GEM
mini_portile2 (~> 2.3.0)
oj (3.4.0)
orm_adapter (0.5.0)
os (0.9.6)
parallel (1.12.1)
parser (2.5.0.0)
ast (~> 2.4.0)
Expand Down Expand Up @@ -173,6 +200,10 @@ GEM
redis (4.0.1)
redis-namespace (1.6.0)
redis (>= 3.0.4)
representable (3.0.4)
declarative (< 0.1.0)
declarative-option (< 0.2.0)
uber (< 0.2.0)
responders (2.4.0)
actionpack (>= 4.2.0, < 5.3)
railties (>= 4.2.0, < 5.3)
Expand All @@ -182,6 +213,7 @@ GEM
redis-namespace (~> 1.3)
sinatra (>= 0.9.2)
vegas (~> 0.1.2)
retriable (3.1.1)
rubocop (0.52.1)
parallel (~> 1.10)
parser (>= 2.4.0.2, < 3.0)
Expand All @@ -205,6 +237,11 @@ GEM
tilt (>= 1.1, < 3)
sentry-raven (2.7.2)
faraday (>= 0.7.6, < 1.0)
signet (0.8.1)
addressable (~> 2.3)
faraday (~> 0.9)
jwt (>= 1.5, < 3.0)
multi_json (~> 1.10)
sinatra (2.0.1)
mustermann (~> 1.0)
rack (~> 2.0)
Expand All @@ -227,6 +264,7 @@ GEM
tilt (2.0.8)
tzinfo (1.2.5)
thread_safe (~> 0.1)
uber (0.1.0)
uglifier (4.1.6)
execjs (>= 0.3.0, < 3)
unicode-display_width (1.3.0)
Expand Down Expand Up @@ -255,6 +293,7 @@ DEPENDENCIES
byebug
devise
dotenv-rails
google-api-client
hiredis
httparty
jwt (~> 2.1)
Expand Down
175 changes: 175 additions & 0 deletions lib/tasks/import_from_sheets.rake
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
require 'google/apis/sheets_v4'
require 'googleauth'
require 'googleauth/stores/file_token_store'

# Marker stored under the `_importer` key of every annotation this task creates.
IMPORT_TYPE = 'rake_task_v1'.freeze
# Passed as `base_url` when requesting the authorization URL and when
# exchanging the pasted code for credentials.
OOB_URI = 'urn:ietf:wg:oauth:2.0:oob'.freeze
# Application name set on the Sheets service's client options.
APPLICATION_NAME = 'Web Monitoring DB Importer'.freeze


# Imports analyst annotations from a Google spreadsheet.
#
# Task arguments:
#   sheet_id   - ID of the spreadsheet to read.
#   user_email - E-mail of the User the annotations are created as; the lookup
#                raises if no user has that e-mail.
#   tabs       - Optional comma-separated list of tab titles. When omitted,
#                every tab in the spreadsheet is imported.
#   start_row  - Optional first row to read (defaults to 7).
#   end_row    - Optional last row to read (defaults to open-ended).
#
# Set the VERBOSE environment variable to log every annotation created.
# A summary of created/skipped/errored rows is always printed, even when the
# import aborts part-way through.
desc 'Create annotations from data in analysts’ Google sheets -- only sheet ID & user e-mail are required.'
task :import_annotations_from_sheet, [:sheet_id, :user_email, :tabs, :start_row, :end_row] => [:environment] do |_t, args|
  verbose = ENV['VERBOSE']
  sheet_id = args[:sheet_id]
  start_row = args.fetch(:start_row, 7).to_i
  end_row = args[:end_row] || ''
  client = sheets_client

  user = User.find_by!(email: args[:user_email])

  tab_count = 0
  annotated_count = 0
  skipped_count = 0
  error_count = 0

  tabs =
    if args[:tabs]
      args[:tabs].split(',').collect(&:strip)
    else
      client.get_spreadsheet(sheet_id).sheets.collect do |sheet|
        sheet.properties.title
      end
    end

  begin
    tabs.each do |tab_title|
      puts "Importing spreadsheet tab '#{tab_title}'"

      # `values` comes back nil (not an empty array) when the requested range
      # holds no data, so fall back to an empty list instead of crashing.
      rows = client.get_spreadsheet_values(
        sheet_id,
        "#{tab_title}!A#{start_row}:AL#{end_row}"
      ).values || []

      rows.each_with_index do |row, index|
        change = nil

        # Column 9 is latest-to-base
        begin
          change = change_for_version_url(row[9])
        rescue StandardError => error
          puts "Row #{start_row + index}: #{error.message}"
          error_count += 1
          next
        end

        # No change is returned when the row has no change URL; count the row
        # as skipped so the summary reflects it.
        unless change
          skipped_count += 1
          next
        end

        change.annotate(annotation_data_for_row(row), user)
        annotated_count += 1

        puts "Annotated '#{change.version.page.url}' change '#{change.api_id}'" if verbose
      end

      tab_count += 1
    end
  ensure
    puts ''
    puts 'RESULTS:'
    puts '--------'
    puts "Created #{annotated_count} annotations"
    puts "Skipped #{skipped_count} rows"
    puts "Errored #{error_count} rows"
    puts "In #{tab_count} spreadsheet tabs"
    puts ''
  end
end

# Resolves a change URL from the spreadsheet into a Change record.
#
# Two URL shapes are understood:
#   * Versionista URLs ending in `<newer id>:<older id>`, looked up through
#     the `version_id` stored in each version's source metadata.
#   * monitoring.envirodatagov.org page URLs ending in `<older id>..<newer id>`.
#
# Returns nil for a blank URL; raises StandardError for any other format.
# Version lookups raise when a referenced version cannot be found.
def change_for_version_url(url)
  return nil unless url.present?

  versionista_lookup = "source_type = 'versionista' AND source_metadata->>'version_id' = ?"

  if (ids = /versionista\.com\/\d+\/\d+\/(\d+):(\d+)/.match(url))
    # Versionista lists the newer version first.
    newer = Version.find_by!(versionista_lookup, ids[1])
    older = Version.find_by!(versionista_lookup, ids[2])
  elsif (ids = /monitoring\.envirodatagov\.org\/page\/[^\/]+\/([^\/.]+)\.\.([^\/.]+)/.match(url))
    # Our own URLs list the older version first.
    older = Version.find(ids[1])
    newer = Version.find(ids[2])
  else
    raise StandardError, "Unknown change URL format: '#{url}'"
  end

  Change.between(from: older, to: newer, create: true)
end

# Builds the annotation hash for one spreadsheet row.
#
# Starting at column index 17, the row is read as 18 consecutive checkbox
# columns followed by one free-text notes column. A checkbox is true when its
# cell has any content; the notes cell is copied as-is (nil when missing).
# The result is tagged with IMPORT_TYPE under the `_importer` key.
def annotation_data_for_row(row)
  first_column = 17

  # fields from UI project, in spreadsheet column order
  checkbox_fields = %w[
    indiv_1 indiv_2 indiv_3 indiv_4 indiv_5 indiv_6
    repeat_7 repeat_8 repeat_9 repeat_10 repeat_11 repeat_12
    sig_1 sig_2 sig_3 sig_4 sig_5 sig_6
  ]

  annotation = { _importer: IMPORT_TYPE }

  checkbox_fields.each_with_index do |name, offset|
    annotation[name] = row[first_column + offset].present?
  end

  # The notes column sits immediately after the last checkbox column.
  annotation['notes'] = row[first_column + checkbox_fields.length]

  annotation
end

# Returns a Google Sheets service configured with this importer's application
# name and authorized with the credentials from `authorize_google`.
def sheets_client
  Google::Apis::SheetsV4::SheetsService.new.tap do |sheets|
    sheets.client_options.application_name = APPLICATION_NAME
    sheets.authorization = authorize_google
  end
end

# Obtains read-only Google Sheets credentials for the importer.
#
# Requires the GOOGLE_CLIENT_ID and GOOGLE_CLIENT_SECRET environment
# variables. When the token store holds no credentials, prints an
# authorization URL and reads the resulting code from standard input.
#
# Returns the credentials object produced by the Google authorizer.
# Raises RuntimeError when either environment variable is missing, or when no
# authorization code can be read (e.g. the task runs without a terminal).
def authorize_google
  unless ENV['GOOGLE_CLIENT_ID'] && ENV['GOOGLE_CLIENT_SECRET']
    raise 'You must have both `GOOGLE_CLIENT_ID` and `GOOGLE_CLIENT_SECRET` environment variables set.'
  end

  client_id = Google::Auth::ClientId.new(
    ENV['GOOGLE_CLIENT_ID'],
    ENV['GOOGLE_CLIENT_SECRET']
  )
  scope = Google::Apis::SheetsV4::AUTH_SPREADSHEETS_READONLY
  # NOTE(review): the token store is backed by a brand-new temp file, so
  # stored credentials presumably do not carry over between runs -- confirm
  # that this is intentional.
  token_store = Google::Auth::Stores::FileTokenStore.new(file: Tempfile.new)
  authorizer = Google::Auth::UserAuthorizer.new(client_id, scope, token_store)

  user_id = 'default'
  credentials = authorizer.get_credentials(user_id)
  return credentials unless credentials.nil?

  url = authorizer.get_authorization_url(base_url: OOB_URI)
  puts 'Open the following URL in your browser and enter the ' \
       'resulting code after authorization:'
  puts url

  # `gets` returns nil at end of input (closed or redirected stdin); treat
  # that, and a blank line, as a missing code instead of crashing on nil.
  code = $stdin.gets.to_s.strip
  raise 'No authorization code was entered; cannot authorize with Google.' if code.empty?

  authorizer.get_and_store_credentials_from_code(
    user_id: user_id,
    code: code,
    base_url: OOB_URI
  )
end

0 comments on commit b586f2f

Please sign in to comment.