From ad8b6c1f2af1ac138ab84cd2c27c8524c730631f Mon Sep 17 00:00:00 2001 From: "Brian \"Moses\" Hall" Date: Thu, 30 Nov 2023 16:25:31 -0500 Subject: [PATCH] DEV-613 and DEV-614 Public Domain Day Collections (#118) * DEV-613 and DEV-614 Public Domain Day Collections - Add lib/CollectionBuilder.pm for running mb scripts to create collections. - Rename two `pdd_collection_report` scripts to `pdd_collection`. - The PDD tickets that are just about report generation can be merged with collection creation. - In 2024 these scripts can probably be run by cron. - Use cpanminus --- Dockerfile | 3 +- bin/pdd_collection.pl | 123 ++++++++++++++++++ bin/pdd_collection_report.pl | 100 -------------- ...rldwide.pl => pdd_collection_worldwide.pl} | 53 +++++--- cgi/CRMS.pm | 2 +- lib/CRMS/CollectionBuilder.pm | 75 +++++++++++ lib/CRMS/RightsPredictor.pm | 5 + t/lib/CRMS/CollectionBuilder.t | 56 ++++++++ 8 files changed, 299 insertions(+), 118 deletions(-) create mode 100755 bin/pdd_collection.pl delete mode 100755 bin/pdd_collection_report.pl rename bin/{pdd_collection_report_worldwide.pl => pdd_collection_worldwide.pl} (58%) create mode 100644 lib/CRMS/CollectionBuilder.pm create mode 100644 t/lib/CRMS/CollectionBuilder.t diff --git a/Dockerfile b/Dockerfile index 30291c4a..14e4f47f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -134,6 +134,7 @@ RUN apt-get install -y \ autoconf \ bison \ build-essential \ + cpanminus \ git \ libdevel-cover-perl \ libffi-dev \ @@ -151,7 +152,7 @@ RUN apt-get install -y \ zip \ zlib1g-dev -RUN cpan \ +RUN cpanm --notest \ Devel::Cover::Report::Coveralls \ MARC::Record::MiJ \ OAuth::Lite \ diff --git a/bin/pdd_collection.pl b/bin/pdd_collection.pl new file mode 100755 index 00000000..733faf8b --- /dev/null +++ b/bin/pdd_collection.pl @@ -0,0 +1,123 @@ +#!/usr/bin/perl + +use strict; +use warnings; +use utf8; + +BEGIN { + die "SDRROOT environment variable not set" unless defined $ENV{'SDRROOT'}; + use lib $ENV{'SDRROOT'} . 
'/crms/cgi'; + use lib $ENV{'SDRROOT'} . '/crms/lib'; +} +$ENV{'SDRDATAROOT'} = '/sdr1' unless defined $ENV{'SDRDATAROOT'}; + +use Capture::Tiny; +use Getopt::Long qw(:config no_ignore_case bundling); + +use CRMS; +use CRMS::CollectionBuilder; + +binmode(STDOUT, ':encoding(UTF-8)'); +my $usage = < \$help, + 'v+' => \$verbose, + 'V:s' => \$visibility, + 'y:s' => \$year); + +if ($help) { print $usage. "\n"; exit(0); } + +my $crms = CRMS->new( + verbose => $verbose, + instance => 'production' +); + +$verbose = 0 unless defined $verbose; +print "Verbosity $verbose\n" if $verbose; +$year = $crms->GetTheYear() + 1 unless $year; +my $target_year = $year - 96; +print "Using copyright year $target_year from $year\n" if $verbose; + +# First get a hash of all HTIDs with rights attribute {nobody, pd-pvt, supp} +# This so we need not JOIN with CONCAT(rights_current.namespace,".",rights_current.id) +# which really slows things down and is a PITA. +# There should be between 10k and 20k of these excludes. +my $excludes = {}; +my $sql = <<'SQL'; +SELECT CONCAT(rc.namespace,".",rc.id),attr.name FROM rights_current rc +INNER JOIN attributes attr ON rc.attr=attr.id +WHERE attr.name IN ('nobody','pd-pvt','supp') +SQL + +my $ref = $crms->SelectAllSDR($sql); +my $n = scalar @{$ref}; +print "$n results for {nobody, pd-pvt, supp}\n" if $verbose; +foreach my $row (@$ref) { + $excludes->{$row->[0]} = $row->[1]; +} + +# Now get everything from our local bib rights database that has a "date used" of +# YEAR - 96. Print these out in order, excluding anything in the rights exclusion list. +$sql = <<'SQL'; +SELECT id FROM bib_rights_bri +WHERE date_used=? +ORDER BY id +SQL + +my $outfile = $ENV{'SDRROOT'} . 
"/crms/prep/pdd_collection_$year.txt"; +open(my $fh, '>:encoding(UTF-8)', $outfile) or die "Could not open '$outfile' $!"; +$ref = $crms->SelectAll($sql, $target_year); +foreach my $row (@$ref) { + my $htid = $row->[0]; + my $attr = $excludes->{$htid}; + next if defined $attr; + print $fh "$htid\n"; +} +close $fh; + +my $title = "$target_year Publications"; +my $cb = CRMS::CollectionBuilder->new; +my $cmd = $cb->create_collection_cmd( + 'title' => $title, + 'description' => "Volumes published in $target_year for the purpose of sharing items that became public domain in the U.S. in $year", + 'file' => $outfile +); +print `$cmd`; +$sql = 'SELECT MColl_ID FROM mb_collection WHERE owner_name="HathiTrust" AND collname=?'; +my $coll_id = $crms->SimpleSqlGetSDR($sql, $title); +$cmd = $cb->set_visibility_cmd('coll_id' => $coll_id, 'visibility' => $visibility); +print `$cmd`; + +print "Warning: $_\n" for @{$crms->GetErrors()}; diff --git a/bin/pdd_collection_report.pl b/bin/pdd_collection_report.pl deleted file mode 100755 index 8f562c21..00000000 --- a/bin/pdd_collection_report.pl +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/perl - -use strict; -use warnings; -use utf8; -use v5.10; - -BEGIN { - die "SDRROOT environment variable not set" unless defined $ENV{'SDRROOT'}; - use lib $ENV{'SDRROOT'} . '/crms/cgi'; -} - -use Getopt::Long qw(:config no_ignore_case bundling); -use Term::ANSIColor qw(:constants colored); - -use CRMS; - -$Term::ANSIColor::AUTORESET = 1; -binmode(STDOUT, ':encoding(UTF-8)'); -my $usage = < \$help, - 'v+' => \$verbose, - 'y:s' => \$year); - -if ($help) { print $usage. 
"\n"; exit(0); } - -my $crms = CRMS->new( - verbose => $verbose, - instance => 'production' -); - -$verbose = 0 unless defined $verbose; -print "Verbosity $verbose\n" if $verbose; -$year = $crms->GetTheYear() unless $year; -my $target_date = $year - 96; -print "Using copyright date $target_date from $year\n" if $verbose; - -# First get a hash of all HTIDs with rights attribute {nobody, pd-pvt, supp} -# This so we need not JOIN with CONCAT(rights_current.namespace,".",rights_current.id) -# which really slows things down and is a PITA. -# There should be between 10k and 20k of these excludes. -my $excludes = {}; -my $sql = <<'SQL'; -SELECT CONCAT(rc.namespace,".",rc.id),attr.name FROM rights_current rc -INNER JOIN attributes attr ON rc.attr=attr.id -WHERE attr.name IN ('nobody','pd-pvt','supp') -SQL - -my $ref = $crms->SelectAllSDR($sql); -my $n = scalar @{$ref}; -print "$n results for {nobody, pd-pvt, supp}\n" if $verbose; -foreach my $row (@$ref) { - $excludes->{$row->[0]} = $row->[1]; -} - -# Now get everything from our local bib rights database that has a "date used" of -# YEAR - 96. Print these out in order, excluding anything in the rights exclusion list. -$sql = <<'SQL'; -SELECT id FROM bib_rights_bri -WHERE date_used=? 
-ORDER BY id -SQL - -$ref = $crms->SelectAll($sql, $target_date); -foreach my $row (@$ref) { - my $htid = $row->[0]; - my $attr = $excludes->{$htid}; - if (defined $attr) { - print RED "Skipping $htid ($attr)\n" if $verbose; - } else { - say $htid; - } -} - -print "Warning: $_\n" for @{$crms->GetErrors()}; diff --git a/bin/pdd_collection_report_worldwide.pl b/bin/pdd_collection_worldwide.pl similarity index 58% rename from bin/pdd_collection_report_worldwide.pl rename to bin/pdd_collection_worldwide.pl index aad1ea22..6b2f932c 100755 --- a/bin/pdd_collection_report_worldwide.pl +++ b/bin/pdd_collection_worldwide.pl @@ -3,27 +3,32 @@ use strict; use warnings; use utf8; -use v5.10; BEGIN { die "SDRROOT environment variable not set" unless defined $ENV{'SDRROOT'}; use lib $ENV{'SDRROOT'} . '/crms/cgi'; + use lib $ENV{'SDRROOT'} . '/crms/lib'; } use Getopt::Long qw(:config no_ignore_case bundling); -use Term::ANSIColor qw(:constants colored); use CRMS; +use CRMS::CollectionBuilder; -$Term::ANSIColor::AUTORESET = 1; binmode(STDOUT, ':encoding(UTF-8)'); my $usage = < \$help, 'v+' => \$verbose, + 'V:s' => \$visibility, 'y:s' => \$year); if ($help) { print $usage. "\n"; exit(0); } @@ -59,7 +67,7 @@ END $verbose = 0 unless defined $verbose; print "Verbosity $verbose\n" if $verbose; -$year = $crms->GetTheYear() unless $year; +$year = $crms->GetTheYear() + 1 unless $year; my $target_date = $year - 126; print "Using copyright date $target_date from $year\n" if $verbose; @@ -93,15 +101,28 @@ END ORDER BY id SQL +my $outfile = $ENV{'SDRROOT'} . 
"/crms/prep/pdd_collection_worldwide_$year.txt"; +open(my $fh, '>:encoding(UTF-8)', $outfile) or die "Could not open '$outfile' $!"; $ref = $crms->SelectAll($sql, $target_date); foreach my $row (@$ref) { my $htid = $row->[0]; my $attr = $excludes->{$htid}; - if (defined $attr) { - print RED "Skipping $htid ($attr)\n" if $verbose; - } else { - say $htid; - } + next if defined $attr; + print $fh "$htid\n"; } +close $fh; + +my $title = "Newly Opened Worldwide January $year"; +my $cb = CRMS::CollectionBuilder->new; +my $cmd = $cb->create_collection_cmd( + 'title' => $title, + 'description' => "Volumes newly opened to the world as of January 1, $year based on bibliographic data", + 'file' => $outfile +); +print `$cmd`; +$sql = 'SELECT MColl_ID FROM mb_collection WHERE owner_name="HathiTrust" AND collname=?'; +my $coll_id = $crms->SimpleSqlGetSDR($sql, $title); +$cmd = $cb->set_visibility_cmd('coll_id' => $coll_id, 'visibility' => $visibility); +print `$cmd`; print "Warning: $_\n" for @{$crms->GetErrors()}; diff --git a/cgi/CRMS.pm b/cgi/CRMS.pm index f4a46522..d360635f 100755 --- a/cgi/CRMS.pm +++ b/cgi/CRMS.pm @@ -65,7 +65,7 @@ sub new return $self; } -our $VERSION = '8.5.20'; +our $VERSION = '8.5.21'; sub Version { return $VERSION; diff --git a/lib/CRMS/CollectionBuilder.pm b/lib/CRMS/CollectionBuilder.pm new file mode 100644 index 00000000..6676fc30 --- /dev/null +++ b/lib/CRMS/CollectionBuilder.pm @@ -0,0 +1,75 @@ +package CRMS::CollectionBuilder; + +# Routines for interfacing with the mb batch_collection.pl script. +# Eventually we'll go through an API, +# This is used for creating yearly collections in advance of public domain day rollover. +use strict; +use warnings; +use utf8; + +my $BATCH_COLLECTION_PATH = $ENV{'SDRROOT'} . 
'/mb/scripts/batch-collection.pl'; +my $BATCH_COLLECTION_OWNER = 'hathitrust@gmail.com'; +my $BATCH_COLLECTION_OWNER_NAME = 'HathiTrust'; + +my $VISIBILITIES = { + 'public' => 1, + 'private' => 1, + 'draft' => 1 +}; + +sub new { + my ($class, %args) = @_; + my $self = bless {}, $class; + my $who = `whoami`; + chomp $who; + $self->{whoami} = $who; + return $self; +} + +# Returns a shell command that will create a public domain day collection. +# It is up to the caller to run the command. +sub create_collection_cmd { + my $self = shift; + my %args = @_; + + my $title = $args{title}; + my $description = $args{description}; + my $file = $args{file}; + die 'missing required parameter "title"' unless $title; + die 'missing required parameter "description"' unless $description; + die 'missing required parameter "file"' unless $file; + + my $cmd = <{whoami} $BATCH_COLLECTION_PATH +-t "$title" +-d "$description" +-o $BATCH_COLLECTION_OWNER +-O $BATCH_COLLECTION_OWNER_NAME +-f $file +2>&1 +CMD + $cmd =~ s/\n/ /g; + return $cmd; +} + +# Returns a shell command that will set the collection's visibility. +# It is up to the caller to run the command. 
+sub set_visibility_cmd { + my $self = shift; + my %args = @_; + + my $coll_id = $args{coll_id}; + my $visibility = $args{visibility} || 'private'; + die 'missing required parameter "coll_id"' unless $coll_id; + die "unknown visibility parameter '$visibility'" unless $VISIBILITIES->{$visibility}; + + my $cmd = <{whoami} $BATCH_COLLECTION_PATH +-u $coll_id +-s $visibility +2>&1 +CMD + $cmd =~ s/\n/ /g; + return $cmd; +} + diff --git a/lib/CRMS/RightsPredictor.pm b/lib/CRMS/RightsPredictor.pm index ee2c8f32..ae4a00d0 100644 --- a/lib/CRMS/RightsPredictor.pm +++ b/lib/CRMS/RightsPredictor.pm @@ -262,6 +262,11 @@ sub validate_for_rights { my $self = shift; my $prediction = shift; + if (!defined $prediction->{pub_year}) { + push @{$prediction->{desc}}, "undefined pub year"; + $prediction->{error} = 1; + return; + } if ($prediction->{pub_year} !~ m/$VALID_PUB_DATE/) { push @{$prediction->{desc}}, "unsupported pub date format '$prediction->{pub_year}'"; diff --git a/t/lib/CRMS/CollectionBuilder.t b/t/lib/CRMS/CollectionBuilder.t new file mode 100644 index 00000000..d0afdc4a --- /dev/null +++ b/t/lib/CRMS/CollectionBuilder.t @@ -0,0 +1,56 @@ +#!/usr/bin/perl + +use strict; +use warnings; + +use Test::Exception; +use Test::More; + +use lib $ENV{'SDRROOT'} . 
'/crms/lib'; +use CRMS::CollectionBuilder; + +subtest '::new' => sub { + my $cb = CRMS::CollectionBuilder->new; + isa_ok($cb, 'CRMS::CollectionBuilder'); + ok(defined $cb->{whoami}); +}; + +subtest '::create_collection_cmd' => sub { + my $cb = CRMS::CollectionBuilder->new; + subtest 'with all required parameters' => sub { + my $cmd = $cb->create_collection_cmd(title => 'Test Title', description => 'Test Description', file => '/path/to/file.txt'); + ok(defined $cmd); + }; + + subtest 'missing title' => sub { + dies_ok { $cb->create_collection_cmd(description => 'Test Description', file => '/path/to/file.txt'); } + }; + + subtest 'missing description' => sub { + dies_ok { $cb->create_collection_cmd(title => 'Test Title', file => '/path/to/file.txt'); } + }; + + subtest 'missing file' => sub { + dies_ok { $cb->create_collection_cmd(title => 'Test Title', description => 'Test Description'); } + }; +}; + +subtest '::set_visibility_cmd' => sub { + my $cb = CRMS::CollectionBuilder->new; + subtest 'with all required parameters' => sub { + my $cmd = $cb->set_visibility_cmd(coll_id => '00000000'); + ok(defined $cmd); + }; + + subtest 'missing coll_id' => sub { + dies_ok { $cb->set_visibility_cmd } + }; + + subtest 'bogus visibility' => sub { + dies_ok { $cb->set_visibility_cmd(coll_id => '00000000', visibility => 'out of phase with the prime material plane'); } + }; +}; + +done_testing(); + +1;