Skip to content

Commit

Permalink
DEV-1086: dedupe & group delete results
Browse files Browse the repository at this point in the history
WIP: Integrated into output email, not yet tested on real data.
  • Loading branch information
aelkiss committed Apr 24, 2024
1 parent b1788e8 commit 9f01055
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 41 deletions.
43 changes: 29 additions & 14 deletions bin/notify.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,22 @@
# send notifications to users about recent deletes

require 'net/smtp'
require 'dedupe_delete_log'

deletes = ARGF.readlines.map { |l| l.strip.split("\t") }
# Entry point: compile the delete logs named on the command line and send one
# notification per dataset subset to that subset's mailing list.
def main
  # Each row: [full dataset name, mailing-list local part, short profile key].
  dataset_emails = [
    ['ht_text_pd','dataset-pd','pd'],
    ['ht_text_pd_open_access','dataset-pd-oa','pd_open'],
    ['ht_text_pd_world','dataset-pd-world','pd_world'],
    ['ht_text_pd_world_open_access','dataset-pd-world-oa','pd_world_open']
  ]

  # Hash of short profile key => list of deleted ids.
  # NOTE(review): the class is declared inside `module Datasets` in
  # lib/datasets/dedupe_delete_log.rb — confirm the bare constant resolves here.
  deletes = DedupeDeleteLog.new(ARGV).compile_results

  dataset_emails.each do |subset_full_name, list_name, subset_short_name|
    # A subset with no deletes has no key in the hash; pass an empty list
    # rather than nil so email can count it and return without sending.
    subset_deletes = deletes.fetch(subset_short_name, [])
    email(subset_full_name, "#{list_name}@hathitrust.org", subset_deletes)
  end
end

def email(set_name,recipient,data)
(data.count < 1) and return
Expand All @@ -32,23 +46,24 @@ def email(set_name,recipient,data)
===BEGIN ID LIST===
DOC
data.each do |item|
message+="#{item[1]}\n"
message+="#{item}\n"
end
message+="===END ID LIST===\n"

puts "sending message with #{data.count} deletes to #{recipient}"
Net::SMTP.start(ENV['SMTP_HOST'] || 'localhost') do |smtp|
smtp.send_message message, '[email protected]', recipient
end
send_or_preview(message,recipient)
end

dataset_emails = [
['ht_text_pd','dataset-pd','pd'],
['ht_text_pd_open_access','dataset-pd-oa','pd_open'],
['ht_text_pd_world','dataset-pd-world','pd_world'],
['ht_text_pd_world_open_access','dataset-pd-world-oa','pd_world_open']
]

dataset_emails.each do |subset_full_name,email,subset_short_name|
email(subset_full_name,"#{email}@hathitrust.org",deletes.select { |i| i[0] == subset_short_name }.sort)
# Deliver +message+ to +recipient+ over SMTP, or, when the PREVIEW_EMAIL
# environment variable is set to any value, print the message to standard
# output instead of sending it.
def send_or_preview(message,recipient)
  unless ENV['PREVIEW_EMAIL']
    # SMTP_HOST overrides the default of a local mail server.
    smtp_host = ENV['SMTP_HOST'] || 'localhost'
    return Net::SMTP.start(smtp_host) { |connection| connection.send_message(message, '[email protected]', recipient) }
  end

  # Preview mode: header line, blank separator line, then the message itself.
  puts "To: [email protected], #{recipient}", '', message
end

main if __FILE__ == $0
1 change: 1 addition & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ x-common-service: &common-service
- ./example/datasets:/tmp/datasets
environment:
REDIS_URL: redis://redis/
PREVIEW_EMAIL: true

services:

Expand Down
30 changes: 19 additions & 11 deletions lib/datasets/dedupe_delete_log.rb
Original file line number Diff line number Diff line change
@@ -1,26 +1,34 @@
require "datasets"
require_relative "../../config/hathitrust_config"

module Datasets
class DedupeDeleteLog
attr_reader :profile
attr_reader :files

def initialize(profile:, files:)
@profile = profile
def initialize(files)
@files = files
@path_resolver = Datasets.config.dest_path_resolver[profile]
end

def compile_results
Tempfile.create("dedupe-deletes") do |f|
filename = f.path
f.close
system("sort #{files.join(" ")} | uniq > #{filename}")
f = File.open(filename)
yield f.map(&:strip).select { |id| not_in_dataset(id) }
end
# Files should be small, so fine to read them all into memory
files.flat_map { |f| File.readlines(f) }
.map(&:strip)
.sort
.uniq
.map(&:split)
.select { |profile, id| not_in_dataset(profile, id) }
.group_by { |profile, id| profile }
# transform "pd" => [ ["pd", "id1"], ["pd", "id2" ] ] to
# "pd" => ["id1", "id2"]
.transform_values do |deletes|
deletes.map { |profile, id| id }.sort
end
end

def not_in_dataset(id)
def not_in_dataset(profile, id)
(namespace, id) = id.split(".", 2)
path_resolver = Datasets.config.dest_path_resolver[profile.to_sym]
volume = Volume.new(namespace: namespace, id: id, access_profile: :none, right: :none)
!File.exist?(path_resolver.path(volume))
end
Expand Down
35 changes: 19 additions & 16 deletions spec/dedupe_deletes_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,42 +5,45 @@ module Datasets
RSpec.describe DedupeDeleteLog do
# needs the dataset paths there
include_context "integration" do
let(:profile) { :pd }

it "takes a profile and an array of files as input" do
expect(DedupeDeleteLog.new(profile: profile, files: ["foo", "bar"])).not_to be_nil
it "takes an array of files as input" do
expect(DedupeDeleteLog.new(["foo", "bar"])).not_to be_nil
end

it "outputs each item at most once" do
it "outputs each item/profile at most once, collated by profile, sorted by id" do
files = Array.new(2) { Tempfile.create("dedupe-deletes") }
begin
files[0].puts("test.id1", "test.id2")
files[1].puts("test.id3", "test.id2")
files[0].puts("pd\ttest.id2", "pd\ttest.id1")
files[1].puts("pd\ttest.id3", "pd\ttest.id2", "pd_open\ttest.id1")
files.map(&:close)

DedupeDeleteLog.new(profile: profile, files: files.map(&:path)).compile_results do |results|
expect(results).to contain_exactly("test.id1", "test.id2", "test.id3")
end
results = DedupeDeleteLog.new(files.map(&:path)).compile_results

expect(results).to eq(
{"pd" => ["test.id1", "test.id2", "test.id3"],
"pd_open" => ["test.id1"]}
)
ensure
files.map { |f| File.unlink(f) }
end
end

it "only outputs deletes that aren't present in the current dataset" do
Tempfile.create("dedupe-deletes") do |f|
f.puts("test.still_there", "test.not_there")
f.puts("pd\ttest.still_there", "pd\ttest.not_there")
f.close

volume = Volume.new(namespace: "test", id: "still_there", access_profile: :open, right: :pd)
writer = Datasets.config.volume_writer[profile]
src_path_resolver = Datasets.config.src_path_resolver[profile]
volume = Volume.new(namespace: "test", id: "still_there",
access_profile: :open, right: :pd)

writer = Datasets.config.volume_writer[:pd]
src_path_resolver = Datasets.config.src_path_resolver[:pd]
src_path = src_path_resolver.path(volume)
src_path.parent.mkpath
FileUtils.touch(src_path)
writer.save(volume, src_path)

DedupeDeleteLog.new(profile: profile, files: [f.path]).compile_results do |results|
expect(results).to contain_exactly("test.not_there")
DedupeDeleteLog.new([f.path]).compile_results do |results|
expect(results).to eq({"pd" => ["test.not_there"]})
end
end
end
Expand Down

0 comments on commit 9f01055

Please sign in to comment.