Skip to content

Commit

Permalink
DEV-613 and DEV-614 Public Domain Day Collections
Browse files Browse the repository at this point in the history
- Add CollectionBuilder.pm for running mb scripts to create collections.
- Rename two `pdd_collection_report` scripts to `pdd_collection`.
- The PDD tickets that are just about report generation can be merged with collection creation.
- In 2024 these scripts can probably be run by cron.
  • Loading branch information
moseshll committed Nov 22, 2023
1 parent be33647 commit ae05d6d
Show file tree
Hide file tree
Showing 6 changed files with 299 additions and 116 deletions.
123 changes: 123 additions & 0 deletions bin/pdd_collection.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#!/usr/bin/perl

use strict;
use warnings;
use utf8;

# Make the CRMS libraries under SDRROOT loadable before any CRMS module is
# compiled. A `use lib` statement is executed as soon as it is parsed, so when
# written inside this BEGIN block it would run *before* the `die` guard and,
# with SDRROOT unset, emit "uninitialized value" warnings and add bogus
# '/crms/...' entries to @INC. Calling lib->import at BEGIN run time keeps the
# guard first. The two separate calls preserve the original @INC ordering
# (each call unshifts, so '/crms/lib' ends up ahead of '/crms/cgi').
BEGIN {
  die "SDRROOT environment variable not set" unless defined $ENV{'SDRROOT'};
  require lib;
  lib->import($ENV{'SDRROOT'} . '/crms/cgi');
  lib->import($ENV{'SDRROOT'} . '/crms/lib');
}
# Fall back to the default data root when the environment does not provide one.
$ENV{'SDRDATAROOT'} = '/sdr1' unless defined $ENV{'SDRDATAROOT'};

use Capture::Tiny;
use Getopt::Long qw(:config no_ignore_case bundling);

use CRMS;
use CRMS::CollectionBuilder;

# Emit UTF-8 on STDOUT.
binmode(STDOUT, ':encoding(UTF-8)');
# Help text printed for -h. The heredoc is interpolating so that $0 expands to
# the script name. The synopsis line lists every option accepted by GetOptions,
# including -V, which was previously documented below but missing from it.
my $usage = <<END;
USAGE: $0 [-hv] [-y YEAR] [-V VISIBILITY]
Creates a collection for the upcoming public domain rollover on January 1 of YEAR.
Intended to be run by a cron job in November of YEAR - 1. (So the default -y YEAR
value used internally is current year plus one.)
First creates a text file of HTIDs with a publication/copyright date of YEAR - 96,
minus any with "permanently closed" rights attributes {pd-pvt, nobody, supp}.
These are written to SDRROOT/crms/prep/pdd_collection_YEAR.txt
Then, uses mb/scripts/batch-collection.pl to create a collection based on the report.
Assembles the list of the copyright dates using crms.bib_rights_bri which is kept
up-to-date by a nightly script. This could be done with the ht.hf (HathiFiles metadata)
table, but since there are some discrepancies between crms.bib_rights_bri.date_used and
ht.hf.bib_date_used I am using the former because I understand it better.
Note: the Collection Builder component of this script runs long and should be invoked with nohup.
-h Print this help message.
-v Emit verbose debugging information. May be repeated.
-V VISIBILITY Set collection to VISIBILITY (in {public, private, draft}). Default "private".
-y YEAR Use some other value for YEAR other than the current year plus one.
END


# Command-line state filled in by GetOptions below.
my $help;        # -h / -?: print usage and exit
my $verbose;     # -v: incremented once per occurrence
my $visibility;  # -V: collection visibility
my $year;        # -y: rollover year override

Getopt::Long::Configure('bundling');
die 'Terminating' unless GetOptions(
  'h|?' => \$help,
  'v+' => \$verbose,
  'V:s' => \$visibility,
  'y:s' => \$year);

if ($help) { print $usage. "\n"; exit(0); }

# Both -V and -y are declared with optional values (':s'), so a bare flag
# yields an empty string. Treat empty like absent, apply the documented
# default visibility, and reject bad values now rather than after the
# long-running collection build has already happened.
$visibility = 'private' unless (defined $visibility && length $visibility);
die "Invalid visibility '$visibility' (expected one of: public, private, draft)\n"
  unless grep { $visibility eq $_ } qw(public private draft);
die "Invalid year '$year' (expected a number)\n"
  if (defined $year && length $year && $year !~ /\A\d+\z/);

# Open a CRMS handle against the production instance, passing through the
# verbosity exactly as parsed from the command line (possibly undef).
my $crms = CRMS->new(instance => 'production', verbose => $verbose);

# Normalize verbosity to a number for the checks in the rest of the script.
if (!defined $verbose) {
  $verbose = 0;
}
if ($verbose) {
  print "Verbosity $verbose\n";
}

# Without an explicit -y the rollover year is next year; the publication year
# of interest is 96 years before the rollover.
if (!$year) {
  $year = $crms->GetTheYear() + 1;
}
my $target_year = $year - 96;
if ($verbose) {
  print "Using copyright year $target_year from $year\n";
}

# Build a lookup of every HTID whose current rights attribute is one of
# {nobody, pd-pvt, supp}, keyed by "namespace.id" with the attribute name as
# the value. Doing this up front avoids a slow JOIN on
# CONCAT(rights_current.namespace,".",rights_current.id) in the main query.
# Expect somewhere between 10k and 20k entries.
my $sql = <<'SQL';
SELECT CONCAT(rc.namespace,".",rc.id),attr.name FROM rights_current rc
INNER JOIN attributes attr ON rc.attr=attr.id
WHERE attr.name IN ('nobody','pd-pvt','supp')
SQL

my $ref = $crms->SelectAllSDR($sql);
my $exclude_count = scalar @{$ref};
if ($verbose) {
  print "$exclude_count results for {nobody, pd-pvt, supp}\n";
}
my $excludes = {};
# Each row is [htid, attribute name].
$excludes->{ $_->[0] } = $_->[1] for @{$ref};

# Select every id in the local bib rights table whose "date used" equals
# YEAR - 96, in id order, and write one HTID per line to the report file,
# leaving out anything present in the rights exclusion lookup.
$sql = <<'SQL';
SELECT id FROM bib_rights_bri
WHERE date_used=?
ORDER BY id
SQL

my $outfile = $ENV{'SDRROOT'} . "/crms/prep/pdd_collection_$year.txt";
open(my $fh, '>:encoding(UTF-8)', $outfile) or die "Could not open '$outfile' $!";
$ref = $crms->SelectAll($sql, $target_year);
foreach my $row (@$ref) {
  my $htid = $row->[0];
  # Excluded HTIDs map to their (always defined) attribute name.
  next if defined $excludes->{$htid};
  print $fh "$htid\n";
}
# Buffered write errors (disk full, quota) only surface at close. This file is
# the input to the collection build, so a silently truncated list must not
# be allowed through.
close($fh) or die "Could not close '$outfile' $!";

# Create the collection from the report file, then look up the new
# collection's id by title and apply the requested visibility.
# NOTE(review): both commands are built by CRMS::CollectionBuilder and run
# through the shell via backticks; quoting is assumed to be handled there.
my $title = "$target_year Publications";
my $cb = CRMS::CollectionBuilder->new;
my $cmd = $cb->create_collection_cmd(
  'title' => $title,
  'description' => "Volumes published in $target_year for the purpose of sharing items that became public domain in the U.S. in $year",
  'file' => $outfile
);
print `$cmd`;
# $? holds the wait status of the backtick command; report failures instead
# of ignoring them.
warn "Collection creation command returned non-zero status $?\n" if $? != 0;

$sql = 'SELECT MColl_ID FROM mb_collection WHERE owner_name="HathiTrust" AND collname=?';
my $coll_id = $crms->SimpleSqlGetSDR($sql, $title);
if (defined $coll_id) {
  $cmd = $cb->set_visibility_cmd('coll_id' => $coll_id, 'visibility' => $visibility);
  print `$cmd`;
  warn "Set-visibility command returned non-zero status $?\n" if $? != 0;
}
else {
  # Presumably the lookup yields undef when no row matches (TODO confirm);
  # do not run a visibility command against an undefined collection id.
  warn "No collection titled '$title' found; visibility not set\n";
}

# Surface any errors CRMS accumulated along the way.
print "Warning: $_\n" for @{$crms->GetErrors()};
100 changes: 0 additions & 100 deletions bin/pdd_collection_report.pl

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -3,51 +3,59 @@
use strict;
use warnings;
use utf8;
use v5.10;

BEGIN {
die "SDRROOT environment variable not set" unless defined $ENV{'SDRROOT'};
use lib $ENV{'SDRROOT'} . '/crms/cgi';
use lib $ENV{'SDRROOT'} . '/crms/lib';
}

use Getopt::Long qw(:config no_ignore_case bundling);
use Term::ANSIColor qw(:constants colored);

use CRMS;
use CRMS::CollectionBuilder;

$Term::ANSIColor::AUTORESET = 1;
binmode(STDOUT, ':encoding(UTF-8)');
my $usage = <<END;
USAGE: $0 [-hv] [-y YEAR]
USAGE: $0 [-hv] [-y YEAR] [-V VISIBILITY]
Reports to STDOUT all non-US HTIDs with a publication/copyright date of the current YEAR - 126,
Creates a collection for the upcoming public domain rollover on January 1 of YEAR.
Intended to be run by a cron job in November of YEAR - 1. (So the default -y YEAR
value used internally is current year plus one.)
First creates a text file of non-US HTIDs HTIDs with a publication/copyright date of the current YEAR - 126,
minus any with "permanently closed" rights attributes {pd-pvt, nobody, supp}.
This is similar to the logic in pdd_collection_report.pl except it uses an earlier cutoff
Then, uses mb/scripts/batch-collection.pl to create a collection based on the report.
This is similar to the logic in pdd_collection.pl except it uses an earlier cutoff
date, and it uses the 008-derived value in bib_rights_bi.pub_place to exclude US publications.
Uses the crms.bib_rights_bi and crms.bib_rights_bri tables which is kept up-to-date by a nightly script.
This could be done with the ht.hf (HathiFiles metadata) table, but since there are
some discrepancies between crms.bib_rights_bri.date_used and ht.hf.bib_date_used
I am using the former because I understand it better.
NOTE: this script should take no more than five minutes to run.
Note: the Collection Builder component of this script runs long and should be invoked with nohup.
-h Print this help message.
-v Emit verbose debugging information. May be repeated.
-y YEAR Use this YEAR instead of the current one.
-h Print this help message.
-v Emit verbose debugging information. May be repeated.
-V VISIBILITY Set collection to VISIBILITY (in {public, private, draft}). Default "private".
-y YEAR Use this YEAR instead of the current one.
END


my $help;
my $instance;
my $verbose;
my $visibility;
my $year;

Getopt::Long::Configure('bundling');
die 'Terminating' unless GetOptions(
'h|?' => \$help,
'v+' => \$verbose,
'V:s' => \$visibility,
'y:s' => \$year);

if ($help) { print $usage. "\n"; exit(0); }
Expand All @@ -59,7 +67,7 @@ END

$verbose = 0 unless defined $verbose;
print "Verbosity $verbose\n" if $verbose;
$year = $crms->GetTheYear() unless $year;
$year = $crms->GetTheYear() + 1 unless $year;
my $target_date = $year - 126;
print "Using copyright date $target_date from $year\n" if $verbose;

Expand Down Expand Up @@ -93,15 +101,28 @@ END
ORDER BY id
SQL

my $outfile = $ENV{'SDRROOT'} . "/crms/prep/pdd_collection_worldwide_$year.txt";
open(my $fh, '>:encoding(UTF-8)', $outfile) or die "Could not open '$outfile' $!";
$ref = $crms->SelectAll($sql, $target_date);
foreach my $row (@$ref) {
my $htid = $row->[0];
my $attr = $excludes->{$htid};
if (defined $attr) {
print RED "Skipping $htid ($attr)\n" if $verbose;
} else {
say $htid;
}
next if defined $attr;
print $fh "$htid\n";
}
close $fh;

my $title = "Newly Opened Worldwide January $year";
my $cb = CRMS::CollectionBuilder->new;
my $cmd = $cb->create_collection_cmd(
'title' => $title,
'description' => "Volumes newly opened to the world as of January 1, $year based on bibliographic data",
'file' => $outfile
);
print `$cmd`;
$sql = 'SELECT MColl_ID FROM mb_collection WHERE owner_name="HathiTrust" AND collname=?';
my $coll_id = $crms->SimpleSqlGetSDR($sql, $title);
$cmd = $cb->set_visibility_cmd('coll_id' => $coll_id, 'visibility' => $visibility);
print `$cmd`;

print "Warning: $_\n" for @{$crms->GetErrors()};
Loading

0 comments on commit ae05d6d

Please sign in to comment.