Skip to content

Commit

Permalink
Merge pull request #116 from hathitrust/DEV-612_worldwide_collection_…
Browse files Browse the repository at this point in the history
…report

DEV-612 Produce a report on 1898 items for worldwide
  • Loading branch information
moseshll authored Nov 17, 2023
2 parents 37819b7 + b72d22d commit ec1c6a5
Showing 1 changed file with 107 additions and 0 deletions.
107 changes: 107 additions & 0 deletions bin/pdd_collection_report_worldwide.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/usr/bin/perl

use strict;
use warnings;
use utf8;
use v5.10;

# Fail fast when SDRROOT is missing, then add the CRMS library directory to @INC.
# The guard sits alone in the BEGIN block and `use lib` follows it: a `use`
# statement executes as soon as it is parsed, so nesting it inside the BEGIN
# block would run it before the `die` guard and concatenate an undefined value.
BEGIN {
  die "SDRROOT environment variable not set" unless defined $ENV{'SDRROOT'};
}
use lib $ENV{'SDRROOT'} . '/crms/cgi';

use Getopt::Long qw(:config no_ignore_case bundling);
use Term::ANSIColor qw(:constants colored);

use CRMS;

# Term::ANSIColor setting: have color constants such as RED reset the terminal
# attributes automatically at the end of each print that uses them.
$Term::ANSIColor::AUTORESET = 1;
# Encode everything written to STDOUT as UTF-8.
binmode(STDOUT, ':encoding(UTF-8)');
# Help text shown for -h. The heredoc is unquoted, so $0 interpolates to the
# name this script was invoked as.
my $usage = <<END;
USAGE: $0 [-hv] [-y YEAR]
Reports to STDOUT all non-US HTIDs with a publication/copyright date of the current YEAR - 126,
minus any with "permanently closed" rights attributes {pd-pvt, nobody, supp}.
This is similar to the logic in pdd_collection_report.pl except it uses an earlier cutoff
date, and it uses the 008-derived value in bib_rights_bi.pub_place to exclude US publications.
Uses the crms.bib_rights_bi and crms.bib_rights_bri tables which is kept up-to-date by a nightly script.
This could be done with the ht.hf (HathiFiles metadata) table, but since there are
some discrepancies between crms.bib_rights_bri.date_used and ht.hf.bib_date_used
I am using the former because I understand it better.
NOTE: this script should take no more than five minutes to run.
-h Print this help message.
-v Emit verbose debugging information. May be repeated.
-y YEAR Use this YEAR instead of the current one.
END


# Command-line option targets (see $usage for the user-facing description).
my $help;     # -h or -?: print usage and exit
my $verbose;  # -v: counter, incremented per occurrence; undef when absent
my $year;     # -y YEAR: value is optional in the spec, so may be undef or ''

# Bundling and case sensitivity are already configured by the
# `use Getopt::Long qw(:config no_ignore_case bundling)` import, so no
# additional Configure call is needed here.
die 'Terminating' unless GetOptions(
  'h|?' => \$help,
  'v+'  => \$verbose,
  'y:s' => \$year);

# Help takes precedence over everything else and exits successfully.
if ($help) { print $usage . "\n"; exit(0); }

# Reject a non-numeric -y value before doing any work: it would otherwise be
# coerced in the subtraction below and silently produce an empty report.
# An empty or zero value is not an error; it falls back to the current year.
if ($year && $year !~ m/\A[0-9]+\z/) {
  die "Invalid YEAR '$year': expected digits only (e.g. 2024)\n";
}

# Create the CRMS object, always against the production instance.
# $verbose is passed as given on the command line (undef when -v is absent).
my $crms = CRMS->new(
  verbose => $verbose,
  instance => 'production'
);

# Normalize an absent -v to 0 for the boolean/numeric uses that follow.
$verbose //= 0;
print "Verbosity $verbose\n" if $verbose;

# Default to the year reported by CRMS, then derive the date to report on.
$year = $crms->GetTheYear() unless $year;
my $target_date = $year - 126;
print "Using copyright date $target_date from $year\n" if $verbose;

# First build a hash of all HTIDs with rights attribute {nobody, pd-pvt, supp},
# keyed by HTID ("namespace.id") with the attribute name as the value.
# This is so we need not JOIN with CONCAT(rights_current.namespace,".",rights_current.id)
# which really slows things down and is a PITA.
# There should be between 10k and 20k of these excludes.
my $excludes = {};
my $sql = <<'SQL';
SELECT CONCAT(rc.namespace,".",rc.id),attr.name FROM rights_current rc
INNER JOIN attributes attr ON rc.attr=attr.id
WHERE attr.name IN ('nobody','pd-pvt','supp')
SQL

# Treat a missing result as an empty list: dereferencing undef in rvalue
# context is fatal under `use strict`, which would end the run before the
# errors collected by $crms are printed at the end of the script.
# NOTE(review): presumably SelectAllSDR returns undef when the query fails --
# confirm against CRMS.pm.
my $ref = $crms->SelectAllSDR($sql) // [];
my $n = scalar @{$ref};
print "$n results for {nobody, pd-pvt, supp}\n" if $verbose;
foreach my $row (@{$ref}) {
  my ($htid, $attr_name) = @{$row};
  $excludes->{$htid} = $attr_name;
}

# Query the local bib rights tables for every item whose "date used" equals
# the target date (YEAR - 126) and whose pub_place does not have 'u' as its
# third character (the non-US test), ordered by id.
$sql = <<'SQL';
SELECT bri.id,bi.pub_place FROM bib_rights_bi bi
INNER JOIN bib_rights_bri bri
ON bi.bib_key=bri.bib_key
WHERE bri.date_used=?
AND substr(bi.pub_place,3,1)!='u'
ORDER BY id
SQL

$ref = $crms->SelectAll($sql, $target_date);

# Emit each HTID on its own line unless it appears in the rights exclusion
# hash; excluded items are only mentioned (in red) when running verbosely.
foreach my $record (@$ref) {
  my $htid = $record->[0];
  my $excluded_attr = $excludes->{$htid};
  unless (defined $excluded_attr) {
    say $htid;
    next;
  }
  print RED "Skipping $htid ($excluded_attr)\n" if $verbose;
}

# Finally, report whatever the CRMS object returns from GetErrors.
foreach my $error (@{$crms->GetErrors()}) {
  print "Warning: $error\n";
}

0 comments on commit ec1c6a5

Please sign in to comment.