From 9f0105594f61024963934048d5a905a777e43281 Mon Sep 17 00:00:00 2001 From: Aaron Elkiss Date: Tue, 23 Apr 2024 10:58:10 -0400 Subject: [PATCH] DEV-1086: dedupe & group delete results WIP: Integrated into output email, not yet tested on real data. --- bin/notify.rb | 43 +++++++++++++++++++++---------- docker-compose.yml | 1 + lib/datasets/dedupe_delete_log.rb | 30 +++++++++++++-------- spec/dedupe_deletes_spec.rb | 35 +++++++++++++------------ 4 files changed, 68 insertions(+), 41 deletions(-) diff --git a/bin/notify.rb b/bin/notify.rb index a081352..e39b339 100755 --- a/bin/notify.rb +++ b/bin/notify.rb @@ -5,8 +5,22 @@ # send notifications to users about recent deletes require 'net/smtp' +require 'dedupe_delete_log' -deletes = ARGF.readlines.map { |l| l.strip.split("\t") } +def main + dataset_emails = [ + ['ht_text_pd','dataset-pd','pd'], + ['ht_text_pd_open_access','dataset-pd-oa','pd_open'], + ['ht_text_pd_world','dataset-pd-world','pd_world'], + ['ht_text_pd_world_open_access','dataset-pd-world-oa','pd_world_open'] + ] + + deletes = DedupeDeleteLog.new(ARGV).compile_results + + dataset_emails.each do |subset_full_name,email,subset_short_name| + email(subset_full_name,"#{email}@hathitrust.org",deletes[subset_short_name]) + end +end def email(set_name,recipient,data) (data.count < 1) and return @@ -32,23 +46,24 @@ def email(set_name,recipient,data) ===BEGIN ID LIST=== DOC data.each do |item| - message+="#{item[1]}\n" + message+="#{item}\n" end message+="===END ID LIST===\n" puts "sending message with #{data.count} deletes to #{recipient}" - Net::SMTP.start(ENV['SMTP_HOST'] || 'localhost') do |smtp| - smtp.send_message message, 'support@hathitrust.org', recipient - end + send_or_preview(message,recipient) end -dataset_emails = [ - ['ht_text_pd','dataset-pd','pd'], - ['ht_text_pd_open_access','dataset-pd-oa','pd_open'], - ['ht_text_pd_world','dataset-pd-world','pd_world'], - ['ht_text_pd_world_open_access','dataset-pd-world-oa','pd_world_open'] -] - -dataset_emails.each do |subset_full_name,email,subset_short_name| - email(subset_full_name,"#{email}@hathitrust.org",deletes.select { |i| i[0] == subset_short_name }.sort) +def send_or_preview(message,recipient) + if ENV['PREVIEW_EMAIL'] + puts "To: support@hathitrust.org, #{recipient}" + puts + puts message + else + Net::SMTP.start(ENV['SMTP_HOST'] || 'localhost') do |smtp| + smtp.send_message message, 'support@hathitrust.org', recipient + end + end end + +main if __FILE__ == $0 diff --git a/docker-compose.yml b/docker-compose.yml index 0fba686..57777e8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,6 +18,7 @@ x-common-service: &common-service - ./example/datasets:/tmp/datasets environment: REDIS_URL: redis://redis/ + PREVIEW_EMAIL: true services: diff --git a/lib/datasets/dedupe_delete_log.rb b/lib/datasets/dedupe_delete_log.rb index 18455ba..070a619 100644 --- a/lib/datasets/dedupe_delete_log.rb +++ b/lib/datasets/dedupe_delete_log.rb @@ -1,26 +1,34 @@ +require "datasets" +require_relative "../../config/hathitrust_config" + module Datasets class DedupeDeleteLog attr_reader :profile attr_reader :files - def initialize(profile:, files:) - @profile = profile + def initialize(files) @files = files - @path_resolver = Datasets.config.dest_path_resolver[profile] end def compile_results - Tempfile.create("dedupe-deletes") do |f| - filename = f.path - f.close - system("sort #{files.join(" ")} | uniq > #{filename}") - f = File.open(filename) - yield f.map(&:strip).select { |id| not_in_dataset(id) } - end + # Files should be small, so fine to read them all into memory + files.flat_map { |f| File.readlines(f) } + .map(&:strip) + .sort + .uniq + .map(&:split) + .select { |profile, id| not_in_dataset(profile, id) } + .group_by { |profile, id| profile } + # transform "pd" => [ ["pd", "id1"], ["pd", "id2" ] ] to + # "pd" => ["id1", "id2"] + .transform_values do |deletes| + deletes.map { |profile, id| id }.sort + end end - def not_in_dataset(id) + def not_in_dataset(profile, id) (namespace, id) = id.split(".", 2) + path_resolver = Datasets.config.dest_path_resolver[profile.to_sym] volume = Volume.new(namespace: namespace, id: id, access_profile: :none, right: :none) !File.exist?(path_resolver.path(volume)) end diff --git a/spec/dedupe_deletes_spec.rb b/spec/dedupe_deletes_spec.rb index d5a1211..c4b54a5 100644 --- a/spec/dedupe_deletes_spec.rb +++ b/spec/dedupe_deletes_spec.rb @@ -5,22 +5,23 @@ module Datasets RSpec.describe DedupeDeleteLog do # needs the dataset paths there include_context "integration" do - let(:profile) { :pd } - - it "takes a profile and an array of files as input" do - expect(DedupeDeleteLog.new(profile: profile, files: ["foo", "bar"])).not_to be_nil + it "takes an array of files as input" do + expect(DedupeDeleteLog.new(["foo", "bar"])).not_to be_nil end - it "outputs each item at most once" do + it "outputs each item/profile at most once, collated by profile, sorted by id" do files = Array.new(2) { Tempfile.create("dedupe-deletes") } begin - files[0].puts("test.id1", "test.id2") - files[1].puts("test.id3", "test.id2") + files[0].puts("pd\ttest.id2", "pd\ttest.id1") + files[1].puts("pd\ttest.id3", "pd\ttest.id2", "pd_open\ttest.id1") files.map(&:close) - DedupeDeleteLog.new(profile: profile, files: files.map(&:path)).compile_results do |results| - expect(results).to contain_exactly("test.id1", "test.id2", "test.id3") - end + results = DedupeDeleteLog.new(files.map(&:path)).compile_results + + expect(results).to eq( + {"pd" => ["test.id1", "test.id2", "test.id3"], + "pd_open" => ["test.id1"]} + ) ensure files.map { |f| File.unlink(f) } end @@ -28,19 +29,21 @@ module Datasets it "only outputs deletes that aren't present in the current dataset" do Tempfile.create("dedupe-deletes") do |f| - f.puts("test.still_there", "test.not_there") + f.puts("pd\ttest.still_there", "pd\ttest.not_there") f.close - volume = Volume.new(namespace: "test", id: "still_there", access_profile: :open, right: :pd) - writer = Datasets.config.volume_writer[profile] - src_path_resolver = Datasets.config.src_path_resolver[profile] + volume = Volume.new(namespace: "test", id: "still_there", + access_profile: :open, right: :pd) + + writer = Datasets.config.volume_writer[:pd] + src_path_resolver = Datasets.config.src_path_resolver[:pd] src_path = src_path_resolver.path(volume) src_path.parent.mkpath FileUtils.touch(src_path) writer.save(volume, src_path) - DedupeDeleteLog.new(profile: profile, files: [f.path]).compile_results do |results| - expect(results).to contain_exactly("test.not_there") + DedupeDeleteLog.new([f.path]).compile_results do |results| + expect(results).to eq({"pd" => ["test.not_there"]}) end end end