Skip to content

Commit

Permalink
Make backups in CleanInconsistencies migration
Browse files Browse the repository at this point in the history
  • Loading branch information
danielkza committed Jul 28, 2016
1 parent a57e571 commit a8777c6
Showing 1 changed file with 74 additions and 44 deletions.
118 changes: 74 additions & 44 deletions db/migrate/20160720185407_clean_inconsistencies.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,34 @@
require 'fileutils'

class CleanInconsistencies < ActiveRecord::Migration
def self.up
def backup_dir
return @backup_dir if @backup_dir
@backup_dir = Rails.root.join('db', 'backup', Time.now.strftime('%Y-%m-%d_%H-%M-%S'))
FileUtils.mkdir_p(@backup_dir)
end

def backup(name, query, header: true)
say_with_time("backup(#{name.inspect}, #{query.inspect})") do
copy_query = "COPY (#{query}) TO STDOUT WITH DELIMITER ',' CSV #{header ? 'HEADER' : ''}"

File.open(backup_dir.join("#{name}.csv"), 'a') do |f|
connection.raw_connection.copy_data(copy_query) do
while line = connection.raw_connection.get_copy_data
f.write(line)
end
end
end
end
end

def backup_and_delete_missing(table, exists_query)
backup(table, "SELECT * FROM \"#{table}\" WHERE NOT EXISTS(#{exists_query})")
execute "DELETE FROM \"#{table}\" WHERE NOT EXISTS(#{exists_query})"
end

def up
say "WARNING: destructive migration necessary. Deleted data will be backed up to #{backup_dir}"

# Unset project reference for repositories with non-existing projects
execute <<-SQL
UPDATE repositories AS r
Expand All @@ -10,61 +39,62 @@ def self.up
SQL

# Delete processings with non-existing repositories
execute <<-SQL
DELETE FROM processings AS p
WHERE NOT EXISTS(
SELECT 1 FROM repositories AS r WHERE r.id = p.repository_id
)
SQL
backup_and_delete_missing("processings",
"SELECT 1 FROM repositories AS r WHERE r.id = processings.repository_id")

# Delete process times with non-existing processings
execute <<-SQL
DELETE FROM process_times AS t
WHERE NOT EXISTS (
SELECT 1 FROM processings AS p WHERE p.id = t.processing_id
)
SQL
backup_and_delete_missing("process_times",
"SELECT 1 FROM processings AS p WHERE p.id = process_times.processing_id")

# Delete module results with non-existing processings
execute <<-SQL
DELETE FROM module_results AS m
WHERE NOT EXISTS (
SELECT 1 FROM processings AS p WHERE p.id = m.processing_id
)
SQL
backup_and_delete_missing("module_results",
"SELECT 1 FROM processings AS p WHERE p.id = module_results.processing_id")

# Delete kalibro modules with non-existing module results
backup_and_delete_missing("kalibro_modules",
"SELECT 1 FROM module_results AS m WHERE m.id = kalibro_modules.module_result_id")

# Fix up metric results type, even before backing up so the backup is cleaner
execute <<-SQL
DELETE FROM kalibro_modules AS k
WHERE NOT EXISTS (
SELECT 1 FROM module_results AS m WHERE m.id = k.module_result_id
)
UPDATE metric_results SET "type" = 'TreeMetricResult' WHERE "type" = 'MetricResult'
SQL

# Delete metric results with non-existing module results
execute <<-SQL
DELETE FROM metric_results AS met
WHERE NOT EXISTS (
SELECT 1 FROM module_results AS mod WHERE mod.id = met.module_result_id
)
SQL
backup_and_delete_missing("metric_results",
"SELECT 1 FROM module_results AS m WHERE m.id = metric_results.module_result_id")

# Delete duplicate metric_results. Group them by (module_result, metric_configuration),
# then delete all but the one with the highest ID
# The double wrapping on the inner query is necessary because window functions
# cannot be used in WHERE in PostgreSQL.
execute <<-SQL
DELETE FROM metric_results
WHERE id IN (
SELECT t.id FROM (
SELECT id, ROW_NUMBER() OVER (PARTITION BY module_result_id, metric_configuration_id, "type"
ORDER BY id DESC) AS rnum
FROM metric_results
WHERE "type" = 'TreeMetricResult'
) AS t
WHERE t.rnum > 1
)
# Delete duplicate metric_results. Group them by (module_result_id, metric_configuration_id),
# then delete all but the one with the highest ID. The double wrapping on the inner query is
# necessary because window functions cannot be used in WHERE in PostgreSQL.
repeated_metric_result_query = exec_query <<-SQL
SELECT t.id FROM (
SELECT metric_results.*, ROW_NUMBER() OVER (
PARTITION BY module_result_id, metric_configuration_id, "type"
ORDER BY id DESC) AS rnum
FROM metric_results
WHERE "type" = 'TreeMetricResult'
) AS t
WHERE t.rnum > 1
SQL

unless repeated_metric_result_query.empty?
repeated_metric_result_ids = repeated_metric_result_query.rows.flat_map(&:first).join(',')

# Replace default messages with custom ones to avoid flooding the screen with the huge query
say_with_time('backup("metric_results", "SELECT * metric_results WHERE id IN (...)")') do
suppress_messages do
backup('metric_results',
"SELECT * FROM metric_results WHERE id IN (#{repeated_metric_result_ids})",
header: false)
end
end

say_with_time('execute("DELETE FROM metric_results WHERE id IN (...)")') do
suppress_messages do
execute "DELETE FROM metric_results WHERE id IN (#{repeated_metric_result_ids})"
end
end
end
end

def self.down
Expand Down

0 comments on commit a8777c6

Please sign in to comment.