From 1ed91afeae1e0b85b9e9a740a6423bd235d03e06 Mon Sep 17 00:00:00 2001 From: Martin Warin <mwarin@umich.edu> Date: Wed, 18 Sep 2024 12:04:56 -0400 Subject: [PATCH] Adding metrics to IA::Download and moving some metrics out of Stage::Imageremediate --- lib/HTFeed/METS.pm | 998 ++++++++++-------- lib/HTFeed/PackageType/IA/Download.pm | 17 +- lib/HTFeed/PackageType/IA/ImageRemediate.pm | 108 +- .../PackageType/Simple/ImageRemediate.pm | 19 +- lib/HTFeed/Stage/ImageRemediate.pm | 52 +- t/ia.t | 293 ++--- t/local_ingest.t | 428 ++++---- 7 files changed, 1026 insertions(+), 889 deletions(-) diff --git a/lib/HTFeed/METS.pm b/lib/HTFeed/METS.pm index 8ba8a9b9..d9f33757 100644 --- a/lib/HTFeed/METS.pm +++ b/lib/HTFeed/METS.pm @@ -22,38 +22,37 @@ use base qw(HTFeed::Stage); # TODO: remove after uplift # Everything else should be covered by digitization? my %agent_mapping = ( - 'Ca-MvGOO' => 'google', - 'CaSfIA' => 'archive', - 'MiU' => 'umich', - 'MnU' => 'umn', - 'GEU' => 'emory', - 'GEU-S' => 'emory', - 'GEU-T' => 'emory', - 'TxCM' => 'tamu', - 'DeU' => 'udel', - 'IU' => 'illinois', - 'Internet Archive' => 'archive', - 'UM' => 'umich' + 'Ca-MvGOO' => 'google', + 'CaSfIA' => 'archive', + 'DeU' => 'udel', + 'GEU' => 'emory', + 'GEU-S' => 'emory', + 'GEU-T' => 'emory', + 'IU' => 'illinois', + 'Internet Archive' => 'archive', + 'MiU' => 'umich', + 'MnU' => 'umn', + 'TxCM' => 'tamu', + 'UM' => 'umich' ); sub new { my $class = shift; my $self = $class->SUPER::new( - is_uplift => 0, - @_, - - # files => [], - # dir => undef, - # mets_name => undef, - # mets_xml => undef, + is_uplift => 0, + @_, + # files => [], + # dir => undef, + # mets_name => undef, + # mets_xml => undef, ); $self->{outfile} = $self->{volume}->get_mets_path(); # by default use volume "get_pagedata" to apply pagedata - $self->{pagedata} = sub { $self->{volume}->get_page_data(@_); }; - $self->{premis} = new PREMIS; + $self->{pagedata} = sub { $self->{volume}->get_page_data(@_); }; + $self->{premis} = new PREMIS; $self->{old_event_types} = {}; - $self->{profile} = get_config('mets_profile'); + $self->{profile} = get_config('mets_profile'); $self->{required_events} = ["capture","message digest calculation","fixity check","validation","ingestion"]; return $self; @@ -61,8 +60,10 @@ sub new { sub run { my $self = shift; - my $mets = new METS( objid => $self->{volume}->get_identifier(), - profile => $self->{profile} ); + my $mets = new METS( + objid => $self->{volume}->get_identifier(), + profile => $self->{profile} + ); $self->{'mets'} = $mets; $self->{'amdsecs'} = []; @@ -79,19 +80,20 @@ sub run { $self->_save_mets(); $self->_validate_mets(); $self->_set_done(); - } sub stage_info { - return { success_state => 'metsed', failure_state => 'punted' }; + return { + success_state => 'metsed', + failure_state => 'punted' + }; } sub _add_schemas { my $self = shift; my $mets = $self->{mets}; - $mets->add_schema( "PREMIS", NS_PREMIS, SCHEMA_PREMIS ); - + $mets->add_schema("PREMIS", NS_PREMIS, SCHEMA_PREMIS); } sub _add_header { @@ -100,24 +102,28 @@ sub _add_header { my $header; - if($self->{is_uplift}) { - my $volume = $self->{volume}; - my $xc = $volume->get_repository_mets_xpc(); + if ($self->{is_uplift}) { + my $volume = $self->{volume}; + my $xc = $volume->get_repository_mets_xpc(); my $createdate = $xc->findvalue('//mets:metsHdr/@CREATEDATE'); - if(not defined $createdate or !$createdate) { - $self->setError('BadValue',field=>'//metsHdr/@CREATEDATE', - detail=>"can't get METS creation time", - file=>$volume->get_repository_mets_path()); + + if 
(not defined $createdate or !$createdate) { + $self->setError( + 'BadValue', + field => '//metsHdr/@CREATEDATE', + detail => "can't get METS creation time", + file => $volume->get_repository_mets_path() + ); } # time stamp w/o timezone in METS creation date - if($createdate =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$/) { - $createdate = $self->convert_tz($createdate,'America/Detroit'); + if ($createdate =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$/) { + $createdate = $self->convert_tz($createdate, 'America/Detroit'); } $header = new METS::Header( - createdate => $createdate, - lastmoddate => _get_createdate(), + createdate => $createdate, + lastmoddate => _get_createdate(), recordstatus => 'REV', - id => 'HDR1', + id => 'HDR1', ); } else { $header = new METS::Header( @@ -139,12 +145,14 @@ sub _add_header { sub _add_dmdsecs { my $self = shift; + my $volume = $self->{volume}; my $mets = $self->{mets}; + my $dmdsec = new METS::MetadataSection( + 'dmdSec', + 'id' => $self->_get_subsec_id("DMD") + ); - my $dmdsec = - new METS::MetadataSection( 'dmdSec', - 'id' => $self->_get_subsec_id("DMD") ); $dmdsec->set_md_ref( mdtype => 'MARC', loctype => 'OTHER', @@ -152,136 +160,155 @@ sub _add_dmdsecs { xptr => $volume->get_identifier() ); $mets->add_dmd_sec($dmdsec); - } # add reading order techMD if it is present sub _add_techmds { - my $self = shift; + my $self = shift; + my $volume = $self->{volume}; - my $xc = $volume->get_source_mets_xpc(); + my $xc = $volume->get_source_mets_xpc(); - my $reading_order = new METS::MetadataSection( 'techMD', - id => $self->_get_subsec_id('TMD')); + my $reading_order = new METS::MetadataSection( + 'techMD', + id => $self->_get_subsec_id('TMD') + ); my @mdwraps = $xc->findnodes('//mets:mdWrap[@LABEL="reading order"]'); - if(@mdwraps == 1) { + if (@mdwraps == 1) { my $mdwrap = $mdwraps[0]; - - my $mets = $self->{mets}; - $mets->add_schema( "gbs", "http://books.google.com/gbs"); + my $mets = $self->{mets}; + $mets->add_schema("gbs", "http://books.google.com/gbs"); $reading_order->set_mdwrap($mdwrap); - push(@{ $self->{amd_mdsecs} },$reading_order); - } elsif(@mdwraps > 1) { + push(@{ $self->{amd_mdsecs} }, $reading_order); + } elsif (@mdwraps > 1) { my $count = scalar(@mdwraps); - $self->set_error("BadField",field=>"reading order",detail=>"Found $count reading order techMDs, expected 1"); + $self->set_error( + "BadField", + field => "reading order", + detail => "Found $count reading order techMDs, expected 1" + ); } } # generate info from feed_zephir_items and ht_collections table, or throw error if it's missing. sub _add_sourcemd { + # Why is this sub embedded? 
//mw2024 sub element_ht { - my $name = shift; + my $name = shift; my %attributes = @_; + my $element = XML::LibXML::Element->new($name); - $element->setNamespace(NS_HT,'HT'); - while (my ($attr,$val) = each %attributes) { - $element->setAttribute($attr,$val); + $element->setNamespace(NS_HT, 'HT'); + while (my ($attr, $val) = each %attributes) { + $element->setAttribute($attr, $val); } return $element; } my $self = shift; - my ($content_providers,$responsible_entity,$digitization_agents) = $self->{volume}->get_sources(); + my ($content_providers, $responsible_entity, $digitization_agents) = $self->{volume}->get_sources(); my $format = 'digitized'; - $format = 'borndigital' if not defined $digitization_agents or $digitization_agents eq ''; + $format = 'borndigital' if not defined $digitization_agents or $digitization_agents eq ''; - my $sources = element_ht("sources", format => $format); - - my $sourcemd = METS::MetadataSection->new( 'sourceMD', - id => $self->_get_subsec_id('SMD')); + my $sources = element_ht("sources", format => $format); + my $sourcemd = METS::MetadataSection->new( + 'sourceMD', + id => $self->_get_subsec_id('SMD') + ); - $self->_format_source_element($sources,'contentProvider', $content_providers); - $self->_format_source_element($sources,'digitizationAgent', $digitization_agents) if $digitization_agents; + $self->_format_source_element($sources, 'contentProvider', $content_providers); + $self->_format_source_element($sources, 'digitizationAgent', $digitization_agents) if $digitization_agents; # add responsible entity # FIXME: how to add 2nd responsible entity? - my $responsible_entity_element = element_ht('responsibleEntity',sequence => '1'); + my $responsible_entity_element = element_ht('responsibleEntity', sequence => '1'); $responsible_entity_element->appendText($responsible_entity); $sources->appendChild($responsible_entity_element); - $sourcemd->set_data($sources, mdtype => 'OTHER', othermdtype => 'HT'); - push(@{ $self->{amd_mdsecs} },$sourcemd); + $sourcemd->set_data( + $sources, + mdtype => 'OTHER', + othermdtype => 'HT' + ); + push(@{ $self->{amd_mdsecs} }, $sourcemd); } sub _format_source_element { - my $self = shift; - my $source_element = shift; - my $element_name = shift; - my $source_agentids = shift; - - # make sure one content provider is selected for display - $source_agentids = "$source_agentids*" if $source_agentids !~ /\*/; - foreach my $agentid (split(';',$source_agentids)) { - my $sequence = 0; - $sequence++; - my $display = 'no'; - if($agentid =~ /\*$/) { - $display = 'yes'; - $agentid =~ s/\*$//; - } - - # add element - my $element = undef; - if($element_name eq 'contentProvider') { - $element = element_ht($element_name, sequence => $sequence, display => $display); - } elsif ($element_name eq 'digitizationAgent') { - # order doesn't matter for digitization source - $element = element_ht($element_name, display => $display); - } else { - die("Unexpected source element $element_name"); + my $self = shift; + my $source_element = shift; + my $element_name = shift; + my $source_agentids = shift; + + # make sure one content provider is selected for display + $source_agentids = "$source_agentids*" if $source_agentids !~ /\*/; + + foreach my $agentid (split(';', $source_agentids)) { + my $sequence = 0; + $sequence++; + my $display = 'no'; + if ($agentid =~ /\*$/) { + $display = 'yes'; + $agentid =~ s/\*$//; + } + + # add element + my $element = undef; + if ($element_name eq 'contentProvider') { + $element = element_ht( + $element_name, + sequence => 
$sequence, + display => $display + ); + } elsif ($element_name eq 'digitizationAgent') { + # order doesn't matter for digitization source + $element = element_ht($element_name, display => $display); + } else { + die("Unexpected source element $element_name"); + } + + $element->appendText($agentid); + $source_element->appendChild($element); } - $element->appendText($agentid); - $source_element->appendChild($element); - } } sub _update_event_date { - my $self = shift; - - my $event = shift; - my $xc = shift; + my $self = shift; + my $event = shift; + my $xc = shift; my $eventinfo = shift; - my $date = $eventinfo->{date}; + my $date = $eventinfo->{date}; my $volume = $self->{volume}; - if($date =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$/) { + if ($date =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$/) { my $from_tz = $volume->get_nspkg()->get('default_timezone'); - if(not defined $from_tz or $from_tz eq '') { - $self->set_error("BadField",field=>"eventDate", - actual => $date, + if (not defined $from_tz or $from_tz eq '') { + $self->set_error( + "BadField", + field => "eventDate", + actual => $date, detail => "Missing time zone for event date"); } - if(defined $from_tz) { - $date = $self->convert_tz($date,$from_tz); - my $eventdateTimeNode = ($xc->findnodes('./premis:eventDateTime',$event))[0]; + if (defined $from_tz) { + $date = $self->convert_tz($date, $from_tz); + my $eventdateTimeNode = ($xc->findnodes('./premis:eventDateTime', $event))[0]; $eventdateTimeNode->removeChildNodes(); $eventdateTimeNode->appendText($date); } - } elsif($date =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}[+-]\d{2}:\d{2}$/) { + } elsif ($date =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}[+-]\d{2}:\d{2}$/) { # Date::Manip 5 will parse using the offset to the equivalent time in # the default time zone, then convert from default TZ to UTC # Date::Manip 6 will use the included time zone information - $date = $self->convert_tz($date,''); - my $eventdateTimeNode = ($xc->findnodes('./premis:eventDateTime',$event))[0]; + $date = $self->convert_tz($date, ''); + my $eventdateTimeNode = ($xc->findnodes('./premis:eventDateTime', $event))[0]; $eventdateTimeNode->removeChildNodes(); $eventdateTimeNode->appendText($date); } @@ -292,97 +319,97 @@ sub _update_event_date { # extract existing PREMIS events from object currently in repos sub _extract_old_premis { my $self = shift; - my $volume = $self->{volume}; - my $mets_in_repos = $volume->get_repository_mets_path(); - my $old_events = {}; + my $volume = $self->{volume}; + my $mets_in_repos = $volume->get_repository_mets_path(); + my $old_events = {}; my $need_uplift_event = 0; - if ( defined $mets_in_repos ) { - - my ( $mets_in_rep_valid, $val_results ) = - $self->validate_xml($mets_in_repos); + if (defined $mets_in_repos) { + my ($mets_in_rep_valid, $val_results) = $self->validate_xml($mets_in_repos); if ($mets_in_rep_valid) { # create map of event types to event details -- for use in updating old event details my %event_map = (); - my $nspkg = $volume->get_nspkg(); - foreach my $eventconfig ( (@{ $nspkg->get('source_premis_events_extract') }, - @{ $nspkg->{packagetype}->get('premis_events') }, # underlying original events - @{ $nspkg->get('premis_events') }) ) { # overridden events - my $eventconfig_info = $nspkg->get_event_configuration($eventconfig); - my $eventconfig_type = $eventconfig_info->{type}; + my $nspkg = $volume->get_nspkg(); + + foreach my $eventconfig ( + @{ $nspkg->get('source_premis_events_extract') }, + @{ $nspkg->{packagetype}->get('premis_events') }, # underlying original 
events + @{ $nspkg->get('premis_events') } + ) { # overridden events + my $eventconfig_info = $nspkg->get_event_configuration($eventconfig); + my $eventconfig_type = $eventconfig_info->{type}; $event_map{$eventconfig_type} = $eventconfig_info->{detail}; } my $xc = $volume->get_repository_mets_xpc(); - $self->migrate_agent_identifiers($xc); - foreach my $event ( $xc->findnodes('//premis:event') ) { - - my $eventinfo = { - eventtype => $xc->findvalue( "./premis:eventType", $event ) , - eventid => $xc->findvalue( "./premis:eventIdentifier/premis:eventIdentifierValue", $event ), - eventidtype => $xc->findvalue(" ./premis:eventIdentifier/premis:eventIdentifierType", $event), - date => $xc->findvalue( "./premis:eventDateTime", $event ), + foreach my $event ($xc->findnodes('//premis:event')) { + my $eventinfo = { + eventtype => $xc->findvalue("./premis:eventType", $event), + eventid => $xc->findvalue("./premis:eventIdentifier/premis:eventIdentifierValue", $event), + eventidtype => $xc->findvalue("./premis:eventIdentifier/premis:eventIdentifierType", $event), + date => $xc->findvalue("./premis:eventDateTime", $event) }; foreach my $field (qw(eventtype eventid date)) { - $self->set_error( - "MissingField", - field => "$field", - node => $event->toString() - ) unless defined $eventinfo->{$field} and $eventinfo->{$field}; + unless (defined $eventinfo->{$field} and $eventinfo->{$field}) { + $self->set_error( + "MissingField", + field => "$field", + node => $event->toString() + ); + } } # migrate obsolete events my $migrate_events = $nspkg->get('migrate_events'); my $new_event_tags = $migrate_events->{$eventinfo->{eventtype}}; - if(defined $new_event_tags) { + + if (defined $new_event_tags) { my $old_event_type = $eventinfo->{eventtype}; - $new_event_tags = [$new_event_tags] unless ref($new_event_tags); - foreach my $new_event_tag (@$new_event_tags) { - my $new_event = $event->cloneNode(1); + $new_event_tags = [$new_event_tags] unless ref($new_event_tags); + foreach my $new_event_tag (@$new_event_tags) { + my $new_event = $event->cloneNode(1); my $new_eventinfo = $nspkg->get_event_configuration($new_event_tag); # update eventType,eventDetail - my $eventtype_node = ($xc->findnodes("./premis:eventType",$new_event))[0]; + my $eventtype_node = ($xc->findnodes("./premis:eventType", $new_event))[0]; $eventtype_node->removeChildNodes(); $eventtype_node->appendText($new_eventinfo->{type}); $eventinfo->{eventtype} = $new_eventinfo->{type}; - my $eventdetail_node = ($xc->findnodes("./premis:eventDetail",$new_event))[0]; + my $eventdetail_node = ($xc->findnodes("./premis:eventDetail", $new_event))[0]; $eventdetail_node->removeChildNodes(); $eventdetail_node->appendText($new_eventinfo->{detail}); # update eventDate - my $new_date = $self->_update_event_date($new_event,$xc,$eventinfo); + my $new_date = $self->_update_event_date($new_event, $xc, $eventinfo); # create new event UUID - my $uuid = $volume->make_premis_uuid($new_eventinfo->{type},$new_date); - my $eventidval_node = ($xc->findnodes("./premis:eventIdentifier/premis:eventIdentifierValue",$new_event))[0]; + my $uuid = $volume->make_premis_uuid($new_eventinfo->{type}, $new_date); + my $eventidval_node = ($xc->findnodes("./premis:eventIdentifier/premis:eventIdentifierValue", $new_event))[0]; $eventidval_node->removeChildNodes(); $eventidval_node->appendText($uuid); - my $eventidtype_node = ($xc->findnodes("./premis:eventIdentifier/premis:eventIdentifierType",$new_event))[0]; + my $eventidtype_node = 
($xc->findnodes("./premis:eventIdentifier/premis:eventIdentifierType", $new_event))[0]; $eventidtype_node->removeChildNodes(); $eventidtype_node->appendText('UUID'); - + $old_events->{$uuid} = $new_event; $self->{old_event_types}->{$new_eventinfo->{type}} = $event; $need_uplift_event = 1; get_logger()->info("Migrated $old_event_type event to $new_eventinfo->{type}"); } } else { - # update eventDetail - my $eventdetail_node = ($xc->findnodes("./premis:eventDetail",$event))[0]; - my $newtext = $event_map{$eventinfo->{eventtype}}; - if(defined $eventdetail_node) { + my $eventdetail_node = ($xc->findnodes("./premis:eventDetail", $event))[0]; + my $newtext = $event_map{$eventinfo->{eventtype}}; + if (defined $eventdetail_node) { my $text = $eventdetail_node->textContent(); - if(defined $newtext - and $newtext ne $text) { + if (defined $newtext and $newtext ne $text) { $eventdetail_node->removeChildNodes(); $eventdetail_node->appendText($event_map{$eventinfo->{eventtype}}); $need_uplift_event = 1; @@ -390,41 +417,52 @@ sub _extract_old_premis { } } else { # eventDetail node may be missing in some cases e.g. audio manual quality inspection :( - if(not defined $newtext) { - $self->set_error("BadField",field => 'eventDetail', detail => "Missing eventDetail for $eventinfo->{eventtype}"); + if (not defined $newtext) { + $self->set_error( + "BadField", + field => 'eventDetail', + detail => "Missing eventDetail for $eventinfo->{eventtype}" + ); } - my $eventDateTime = ($xc->findnodes("./premis:eventDateTime",$event))[0]; - if(not defined $eventDateTime) { - $self->set_error("BadField",field => 'eventDateTime', detail => "Missing eventDateTime for $eventinfo->{eventtype}"); + my $eventDateTime = ($xc->findnodes("./premis:eventDateTime", $event))[0]; + if (not defined $eventDateTime) { + $self->set_error( + "BadField", + field => 'eventDateTime', + detail => "Missing eventDateTime for $eventinfo->{eventtype}" + ); } - $eventDateTime->parentNode()->insertAfter(PREMIS::createElement( "eventDetail", $newtext ), - $eventDateTime); + $eventDateTime->parentNode()->insertAfter( + PREMIS::createElement("eventDetail", $newtext), + $eventDateTime + ); } # update eventDate - my $event_date = $self->_update_event_date($event,$xc,$eventinfo); + my $event_date = $self->_update_event_date($event, $xc, $eventinfo); # update event UUID - my $uuid = $volume->make_premis_uuid($eventinfo->{eventtype},$event_date); + my $uuid = $volume->make_premis_uuid($eventinfo->{eventtype}, $event_date); my $update_eventid = 0; - if($eventinfo->{eventidtype} ne 'UUID') { + + if ($eventinfo->{eventidtype} ne 'UUID') { get_logger()->info("Updating old event ID type $eventinfo->{eventidtype} to UUID for $eventinfo->{eventtype}/$eventinfo->{date}"); $need_uplift_event = 1; - $update_eventid = 1; - } elsif($eventinfo->{eventid} ne $uuid) { + $update_eventid = 1; + } elsif ($eventinfo->{eventid} ne $uuid) { # UUID may change if it was originally computed incorrectly # or if the time zone is now included in the date # calculation. 
get_logger()->warn("Warning: calculated UUID for $eventinfo->{eventtype} on $eventinfo->{date} did not match saved UUID; updating."); $need_uplift_event = 1; - $update_eventid = 1; + $update_eventid = 1; } - if($update_eventid) { - my $eventidval_node = ($xc->findnodes("./premis:eventIdentifier/premis:eventIdentifierValue",$event))[0]; + if ($update_eventid) { + my $eventidval_node = ($xc->findnodes("./premis:eventIdentifier/premis:eventIdentifierValue", $event))[0]; $eventidval_node->removeChildNodes(); $eventidval_node->appendText($uuid); - my $eventidtype_node = ($xc->findnodes("./premis:eventIdentifier/premis:eventIdentifierType",$event))[0]; + my $eventidtype_node = ($xc->findnodes("./premis:eventIdentifier/premis:eventIdentifierType", $event))[0]; $eventidtype_node->removeChildNodes(); $eventidtype_node->appendText('UUID'); } @@ -432,31 +470,20 @@ sub _extract_old_premis { $self->{old_event_types}->{$eventinfo->{eventtype}} = $event; $old_events->{$uuid} = $event; } - } } else { - $self->set_error( - "BadFile", - file => $mets_in_repos, - detail => $val_results - ); + $self->set_error( + "BadFile", + file => $mets_in_repos, + detail => $val_results + ); } - -# # at a minimum there should be capture, message digest calculation, -# # fixity check, validation and ingestion. -# if($volume->get_packagetype() ne 'audio') { -# foreach my $required_event_type ("capture","message digest calculation","fixity check","validation","ingestion") { -# $self->set_error("BadField",detail=>"Could not extract old PREMIS event", -# field=>"premis event $required_event_type",file=>$mets_in_repos) -# if not defined $self->{old_event_types}->{$required_event_type}; -# } -# } - - if($need_uplift_event) { + + if ($need_uplift_event) { $volume->record_premis_event('premis_migration'); } - return $old_events; + return $old_events; } } @@ -470,67 +497,75 @@ sub _add_premis_events { EVENTCODE: foreach my $eventcode ( @{$events} ) { # query database for: datetime, outcome my $eventconfig = $nspkg->get_event_configuration($eventcode); - my ( $eventid, $datetime, $outcome,$custom ) = - $volume->get_event_info($eventcode); - if(defined $custom) { + my ($eventid, $datetime, $outcome, $custom) = $volume->get_event_info($eventcode); + if (defined $custom) { $premis->add_event($custom); - } elsif(defined $eventid) { + } elsif (defined $eventid) { $eventconfig->{eventid} = $eventid; - $eventconfig->{date} = $datetime; - if(defined $outcome) { + $eventconfig->{date} = $datetime; + if (defined $outcome) { $eventconfig->{outcomes} = [$outcome]; } $self->add_premis_event($eventconfig); } elsif (not defined $eventconfig->{optional} or !$eventconfig->{optional}) { - $self->set_error("MissingField",field=>"premis_$eventcode",detail=>"No PREMIS event recorded with config ID $eventcode"); + $self->set_error( + "MissingField", + field => "premis_$eventcode", + detail => "No PREMIS event recorded with config ID $eventcode" + ); } } - } sub _get_event_type { - my $event = shift; - - if (blessed($event) and $event->isa("PREMIS::Event") and defined $event->{event_type}) { - return $event->{event_type}; - } elsif (blessed($event) and $event->isa("XML::LibXML::Element") ) { - my $xc = XML::LibXML::XPathContext->new($event); - register_namespaces($xc); - return $xc->findvalue( './premis:eventType', $event ); - } else { - return undef; - } + my $event = shift; + if (blessed($event) and $event->isa("PREMIS::Event") and defined $event->{event_type}) { + return $event->{event_type}; + } elsif (blessed($event) and 
$event->isa("XML::LibXML::Element")) { + my $xc = XML::LibXML::XPathContext->new($event); + register_namespaces($xc); + return $xc->findvalue('./premis:eventType', $event); + } else { + return undef; + } } sub _check_premis { - my $self = shift; - my $volume = $self->{volume}; - - my %included_event_types = map { (_get_event_type($_),1) } values( %{$self->{included_events}} ); - # at a minimum there should be capture, message digest calculation, - # fixity check, validation and ingestion. - if($volume->get_packagetype() ne 'audio') { - foreach my $required_event_type (@{$self->{required_events}}) { - $self->set_error("BadField",detail=>"Missing required PREMIS event type", - field=>"premis event $required_event_type") - if not defined $included_event_types{$required_event_type}; - } - } + my $self = shift; + my $volume = $self->{volume}; + my %included_event_types = map { + (_get_event_type($_), 1) + } values(%{$self->{included_events}}); + # at a minimum there should be capture, message digest calculation, + # fixity check, validation and ingestion. + if ($volume->get_packagetype() ne 'audio') { + foreach my $required_event_type (@{$self->{required_events}}) { + if (not defined $included_event_types{$required_event_type}) { + $self->set_error( + "BadField", + detail => "Missing required PREMIS event type", + field => "premis event $required_event_type" + ); + } + } + } } sub add_premis_event { - my $self = shift; + my $self = shift; my $eventconfig = shift; - my $volume = $self->{volume}; - my $premis = $self->{premis}; + + my $volume = $self->{volume}; + my $premis = $self->{premis}; my $included_events = $self->{included_events}; - foreach my $field ('executor','executor_type','detail','type','date','eventid') { - if(not defined $eventconfig->{$field}) { - $self->set_error("MissingField", - field => $field, + foreach my $field ('executor', 'executor_type', 'detail', 'type', 'date', 'eventid') { + if (not defined $eventconfig->{$field}) { + $self->set_error( + "MissingField", + field => $field, actual => $eventconfig ); return; @@ -541,26 +576,39 @@ sub add_premis_event { my $eventid = $eventconfig->{'eventid'}; if (defined $included_events->{$eventid}) { return; - } + } + + my $event = new PREMIS::Event( + $eventconfig->{'eventid'}, + 'UUID', + $eventconfig->{'type'}, + $eventconfig->{'date'}, + $eventconfig->{'detail'} + ); - my $event = new PREMIS::Event( $eventconfig->{'eventid'}, 'UUID', - $eventconfig->{'type'}, $eventconfig->{'date'}, - $eventconfig->{'detail'}); foreach my $outcome (@{ $eventconfig->{'outcomes'} }) { $event->add_outcome($outcome); } -# query namespace/packagetype for software tools to record for this event type + # query namespace/packagetype for software tools to record for this event type $event->add_linking_agent( - new PREMIS::LinkingAgent( $eventconfig->{'executor_type'}, - $eventconfig->{'executor'}, - 'Executor' ) ); + new PREMIS::LinkingAgent( + $eventconfig->{'executor_type'}, + $eventconfig->{'executor'}, + 'Executor' + ) + ); my @agents = (); my $tools_config = $eventconfig->{'tools'}; + foreach my $agent (@$tools_config) { $event->add_linking_agent( - new PREMIS::LinkingAgent( 'tool', get_tool_version($agent), 'software') + new PREMIS::LinkingAgent( + 'tool', + get_tool_version($agent), + 'software' + ) ); } $included_events->{$eventid} = $event; @@ -574,46 +622,48 @@ sub add_premis_event { sub _add_source_mets_events { my $self = shift; + my $volume = $self->{volume}; my $premis = $self->{premis}; - - my $xc = $volume->get_source_mets_xpc(); + my $xc 
= $volume->get_source_mets_xpc(); $self->migrate_agent_identifiers($xc); my $src_premis_events = {}; - foreach my $src_event ( $xc->findnodes('//premis:event') ) { - + foreach my $src_event ($xc->findnodes('//premis:event')) { # src event will be an XML node # do we want to keep this kind of event? - my $event_type = $xc->findvalue( './premis:eventType', $src_event ); - $src_premis_events->{$event_type} = [] - if not defined $src_premis_events->{$event_type}; - push( @{ $src_premis_events->{$event_type} }, $src_event ); + my $event_type = $xc->findvalue('./premis:eventType', $src_event); + if (not defined $src_premis_events->{$event_type}) { + $src_premis_events->{$event_type} = [] + } + push(@{$src_premis_events->{$event_type}}, $src_event); } - foreach my $eventcode ( - @{ $volume->get_nspkg()->get('source_premis_events_extract') } ) - { + my $eventcodes = $volume->get_nspkg()->get('source_premis_events_extract'); + foreach my $eventcode (@{$eventcodes}) { my $eventconfig = $volume->get_nspkg()->get_event_configuration($eventcode); - my $eventtype = $eventconfig->{type}; - - if(not defined $src_premis_events->{$eventtype}) { - $self->set_error("MissingField", - field => "premis $eventtype", - file => $volume->get_source_mets_file(), - detail => "Missing required PREMIS event in source METS") - unless (defined $eventconfig->{optional} and $eventconfig->{optional}); + my $eventtype = $eventconfig->{type}; + + unless (defined $src_premis_events->{$eventtype}) { + unless (defined $eventconfig->{optional} and $eventconfig->{optional}) { + $self->set_error( + "MissingField", + field => "premis $eventtype", + file => $volume->get_source_mets_file(), + detail => "Missing required PREMIS event in source METS" + ); + } } next unless defined $src_premis_events->{$eventtype}; - foreach my $src_event ( @{ $src_premis_events->{$eventtype} } ) { - my $eventid = $xc->findvalue( "./premis:eventIdentifier[premis:eventIdentifierType='UUID']/premis:eventIdentifierValue", + + foreach my $src_event (@{$src_premis_events->{$eventtype}}) { + my $eventid = $xc->findvalue( + "./premis:eventIdentifier[premis:eventIdentifierType='UUID']/premis:eventIdentifierValue", $src_event ); - # overwrite already-included event w/ updated information if needed $self->{included_events}{$eventid} = $src_event; $premis->add_event($src_event); - } } } @@ -625,56 +675,66 @@ sub _add_premis { # map from UUID to event - events that have already been added $self->{included_events} = {}; - my $premis = $self->{premis}; my $old_events = $self->_extract_old_premis(); if ($old_events) { - while ( my ( $eventid, $event ) = each(%$old_events) ) { + while (my ($eventid, $event) = each(%$old_events)) { $self->{included_events}{$eventid} = $event; $premis->add_event($event); } } # don't re-add source METS events if this is an uplift - if(!$self->{is_uplift}) { + if (!$self->{is_uplift}) { $self->_add_source_mets_events(); } # create PREMIS object my $premis_object = - new PREMIS::Object( 'HathiTrust', $volume->get_identifier() ); - $premis_object->add_significant_property( 'file count', - $volume->get_file_count() ); + new PREMIS::Object('HathiTrust', $volume->get_identifier()); + $premis_object->add_significant_property( + 'file count', + $volume->get_file_count() + ); if ($volume->get_file_groups()->{image}) { - $premis_object->add_significant_property( 'page count', - $volume->get_page_count() ); + $premis_object->add_significant_property( + 'page count', + $volume->get_page_count() + ); } $premis->add_object($premis_object); # last chance 
to record, even though it's not done yet $volume->record_premis_event('ingestion'); - $self->_add_premis_events( $nspkg->get('premis_events') ); + $self->_add_premis_events($nspkg->get('premis_events')); - my $digiprovMD = - new METS::MetadataSection( 'digiprovMD', 'id' => 'premis1' ); - $digiprovMD->set_xml_node( $premis->to_node(), mdtype => 'PREMIS' ); - - push( @{ $self->{amd_mdsecs} }, $digiprovMD ); + my $digiprovMD = new METS::MetadataSection( + 'digiprovMD', + 'id' => 'premis1' + ); + $digiprovMD->set_xml_node( + $premis->to_node(), + mdtype => 'PREMIS' + ); + push(@{$self->{amd_mdsecs}}, $digiprovMD); } sub _add_amdsecs { my $self = shift; - $self->{'mets'} - ->add_amd_sec( $self->_get_subsec_id("AMD"), @{ $self->{amd_mdsecs} } ); + $self->{'mets'}->add_amd_sec( + $self->_get_subsec_id("AMD"), + @{$self->{amd_mdsecs}} + ); } sub _get_subsec_id { my $self = shift; my $subsec_type = shift; + $self->{counts} = {} if not exists $self->{counts}; $self->{counts}{$subsec_type} = 0 if not exists $self->{counts}{$subsec_type}; @@ -683,6 +743,7 @@ sub _get_subsec_id { sub _add_zip_fg { my $self = shift; + my $mets = $self->{mets}; my $volume = $self->{volume}; @@ -691,8 +752,16 @@ sub _add_zip_fg { id => $self->_get_subsec_id("FG"), use => 'zip archive' ); - my ($zip_path,$zip_name) = ($volume->get_zip_directory(), $volume->get_zip_filename()); - $zip_filegroup->add_file( $zip_name, path => $zip_path, prefix => 'ZIP' ); + + my $zip_path = $volume->get_zip_directory(); + my $zip_name = $volume->get_zip_filename(); + + $zip_filegroup->add_file( + $zip_name, + path => $zip_path, + prefix => 'ZIP' + ); + $mets->add_filegroup($zip_filegroup); } @@ -704,14 +773,16 @@ sub _add_srcmets_fg { # Add source METS if it is present my $src_mets_file = $self->{volume}->get_source_mets_file(); - if($src_mets_file) { + if ($src_mets_file) { my $mets_filegroup = new METS::FileGroup( id => $self->_get_subsec_id("FG"), use => 'source METS' ); - $mets_filegroup->add_file( $src_mets_file, - path => $volume->get_staging_directory(), - prefix => 'METS' ); + $mets_filegroup->add_file( + $src_mets_file, + path => $volume->get_staging_directory(), + prefix => 'METS' + ); $mets->add_filegroup($mets_filegroup); } } @@ -724,16 +795,18 @@ sub _add_content_fgs { # then add the actual content files my $filegroups = $volume->get_file_groups(); $self->{filegroups} = {}; - while ( my ( $filegroup_name, $filegroup ) = each(%$filegroups) ) { + while (my ($filegroup_name, $filegroup) = each(%$filegroups)) { # ignore empty file groups next unless @{$filegroup->get_filenames()}; my $mets_filegroup = new METS::FileGroup( id => $self->_get_subsec_id("FG"), use => $filegroup->get_use() ); - $mets_filegroup->add_files( $filegroup->get_filenames(), + $mets_filegroup->add_files( + $filegroup->get_filenames(), prefix => $filegroup->get_prefix(), - path => $volume->get_staging_directory() ); + path => $volume->get_staging_directory() + ); $self->{filegroups}{$filegroup_name} = $mets_filegroup; $mets->add_filegroup($mets_filegroup); @@ -747,31 +820,32 @@ sub _add_filesecs { $self->_add_zip_fg(); $self->_add_srcmets_fg(); $self->_add_content_fgs(); - } # Basic structMap with optional page labels. 
sub _add_struct_map { my $self = shift; - my $mets = $self->{mets}; - my $volume = $self->{volume}; - my $get_pagedata = $self->{pagedata}; - my $struct_map = new METS::StructMap( id => 'SM1', type => 'physical' ); - my $voldiv = new METS::StructMap::Div( type => 'volume' ); + my $mets = $self->{mets}; + my $volume = $self->{volume}; + my $get_pagedata = $self->{pagedata}; + my $struct_map = new METS::StructMap(id => 'SM1', type => 'physical'); + my $voldiv = new METS::StructMap::Div(type => 'volume'); $struct_map->add_div($voldiv); + my $order = 1; my $file_groups_by_page = $volume->get_structmap_file_groups_by_page(); - foreach my $seqnum ( sort( keys(%$file_groups_by_page) ) ) { + + foreach my $seqnum (sort keys %$file_groups_by_page) { my $pagefiles = $file_groups_by_page->{$seqnum}; my $pagediv_ids = []; my $pagedata; my @pagedata; - while ( my ( $filegroup_name, $files ) = each(%$pagefiles) ) { + + while (my ($filegroup_name, $files) = each %$pagefiles) { foreach my $file (@$files) { - my $fileid = - $self->{filegroups}{$filegroup_name}->get_file_id($file); - if ( not defined $fileid ) { + my $fileid = $self->{filegroups}{$filegroup_name}->get_file_id($file); + if (not defined $fileid) { $self->set_error( "MissingField", field => "fileid", @@ -782,32 +856,30 @@ sub _add_struct_map { next; } - if(defined $get_pagedata) { + if (defined $get_pagedata) { # try to find page number & page tags for this page - if ( not defined $pagedata ) { + if (not defined $pagedata) { $pagedata = &$get_pagedata($file); @pagedata = %$pagedata if defined $pagedata; - } - else { + } else { my $other_pagedata = &$get_pagedata($file); - while ( my ( $key, $val ) = each(%$pagedata) ) { + while (my ($key, $val) = each %$pagedata) { my $val1 = $other_pagedata->{$key}; - $self->set_error( - "NotEqualValues", - actual => "other=$val ,$fileid=$val1", - detail => - "Mismatched page data for different files in pagefiles" - ) - unless ( not defined $val and not defined $val1 ) - or ( $val eq $val1 ); + unless ( (not defined $val and not defined $val1) or ($val eq $val1) ) { + $self->set_error( + "NotEqualValues", + actual => "other=$val ,$fileid=$val1", + detail => "Mismatched page data for different files in pagefiles" + ); + } } - } } - push( @$pagediv_ids, $fileid ); + push(@$pagediv_ids, $fileid); } } + $voldiv->add_file_div( $pagediv_ids, order => $order++, @@ -816,17 +888,15 @@ sub _add_struct_map { ); } $mets->add_struct_map($struct_map); - } sub _save_mets { my $self = shift; - my $mets = $self->{mets}; + my $mets = $self->{mets}; my $mets_path = $self->{outfile}; - open( my $metsxml, ">", "$mets_path" ) - or die("Can't open METS xml $mets_path for writing: $!"); + open(my $metsxml, ">", "$mets_path") or die("Can't open METS xml $mets_path for writing: $!"); print $metsxml $mets->to_node()->toString(1); close($metsxml); } @@ -835,11 +905,12 @@ sub _validate_mets { my $self = shift; my $mets_path = $self->{outfile}; - croak("File $mets_path does not exist. Cannot validate.") - unless -e $mets_path; + unless (-e $mets_path) { + croak("File $mets_path does not exist. 
Cannot validate.") + } - my ( $mets_valid, $val_results ) = $self->validate_xml($mets_path); - if ( !$mets_valid ) { + my ($mets_valid, $val_results) = $self->validate_xml($mets_path); + if (!$mets_valid) { $self->set_error( "BadFile", file => $mets_path, @@ -849,27 +920,29 @@ sub _validate_mets { # TODO: set failure creating METS file return; } - } sub validate_xml { - my $self = shift; - my $use_caching = $self->{volume}->get_nspkg()->get('use_schema_caching'); + my $self = shift; + my $filename = shift; + + my $use_caching = $self->{volume}->get_nspkg()->get('use_schema_caching'); my $schema_cache = get_config('xerces_cache'); - my $xerces = get_config('xerces'); + my $xerces = get_config('xerces'); - $xerces .= " $schema_cache" if($use_caching); + $xerces .= " $schema_cache" if $use_caching; - my $filename = shift; my $validation_cmd = "$xerces '$filename' 2>&1"; my $val_results = `$validation_cmd`; - if ( ($use_caching and $val_results !~ /\Q$filename\E OK/) or - (!$use_caching and $val_results =~ /Error/) or - $? ) { - wantarray ? return ( 0, $val_results ) : return (0); - } - else { - wantarray ? return ( 1, undef ) : return (0); + + if ( + ($use_caching and $val_results !~ /\Q$filename\E OK/) or + (!$use_caching and $val_results =~ /Error/) or + $? + ) { + wantarray ? return (0, $val_results) : return (0); + } else { + wantarray ? return (1, undef) : return (0); } } @@ -883,9 +956,12 @@ sub _get_createdate { my $ts = sprintf( "%d-%02d-%02dT%02d:%02d:%02dZ", - ( 1900 + $gmtime_obj->year() ), ( 1 + $gmtime_obj->mon() ), - $gmtime_obj->mday(), $gmtime_obj->hour(), - $gmtime_obj->min(), $gmtime_obj->sec() + (1900 + $gmtime_obj->year()), + (1 + $gmtime_obj->mon()), + $gmtime_obj->mday(), + $gmtime_obj->hour(), + $gmtime_obj->min(), + $gmtime_obj->sec() ); return $ts; @@ -900,6 +976,7 @@ sub clean_always { # do cleaning that is appropriate after failure sub clean_failure { my $self = shift; + $self->{volume}->clean_mets(); } @@ -907,7 +984,7 @@ sub clean_failure { # do not match the regular expression for the leader in the MARC schema sub _remediate_marc { my $self = shift; - my $xc = shift; + my $xc = shift; foreach my $fakeleader ($xc->findnodes('.//marc:controlfield[@tag="LDR"]')) { $fakeleader->removeAttribute('tag'); @@ -918,120 +995,122 @@ sub _remediate_marc { my @controlfields = (); foreach my $controlfield ($xc->findnodes('.//marc:controlfield')) { $controlfield->parentNode()->removeChild($controlfield); - if($controlfield->getAttribute('tag') =~ /^\d{2}[A-Z0-9]$/) { - push(@controlfields,$controlfield); + if ($controlfield->getAttribute('tag') =~ /^\d{2}[A-Z0-9]$/) { + push(@controlfields, $controlfield); } } foreach my $datafield ($xc->findnodes('.//marc:datafield')) { - if($datafield->getAttribute('tag') =~ /^[A-Z]{3}$/) { + if ($datafield->getAttribute('tag') =~ /^[A-Z]{3}$/) { $datafield->parentNode()->removeChild($datafield); } } my @leaders = $xc->findnodes(".//marc:leader"); - if(@leaders != 1) { - $self->set_error("BadField",field=>"marc:leader",detail=>"Zero or more than one leader found"); + if (@leaders != 1) { + $self->set_error( + "BadField", + field => "marc:leader", + detail => "Zero or more than one leader found" + ); return; } my $leader = $leaders[0]; - - my $value = $leader->findvalue("."); - + my $value = $leader->findvalue("."); $value =~ s/\^/ /g; if ($value !~ /^ - [\d ]{5} # 00-04: Record length - [\dA-Za-z ]{1} # 05: Record status - [\dA-Za-z]{1} # 06: Type of record - [\dA-Za-z ]{3} # 07: Bibliographic level - # 08: Type of control - # 09: Character - 
(2| ) # 10: Indicator count - (2| ) # 11: Subfield code count - [\d ]{5} # 12: Base address of data - [\dA-Za-z ]{3} # 17: Encoding level - # 18: Descriptive cataloging form - # 19: Multipart resource record level - (4500| ) # 20: Length of the length-of-field portion - # 21: Length of the starting-character-position portion - # 22: Length of the implementation-defined portion - # 23: Undefined - $/x) { + [\d ]{5} # 00-04: Record length + [\dA-Za-z ]{1} # 05: Record status + [\dA-Za-z]{1} # 06: Type of record + [\dA-Za-z ]{3} # 07: Bibliographic level + # 08: Type of control + # 09: Character + (2| ) # 10: Indicator count + (2| ) # 11: Subfield code count + [\d ]{5} # 12: Base address of data + [\dA-Za-z ]{3} # 17: Encoding level + # 18: Descriptive cataloging form + # 19: Multipart resource record level + (4500| ) # 20: Length of the length-of-field portion + # 21: Length of the starting-character-position portion + # 22: Length of the implementation-defined portion + # 23: Undefined + $/x) { # fix up material with record status of 'a' and no record type - if(substr($value,5,2) eq 'a ') { - substr($value,5,2) = ' a'; + if (substr($value, 5, 2) eq 'a ') { + substr($value, 5, 2) = ' a'; } # 00-04: Record length - default to empty - if(substr($value,0,5) !~ /^[\d ]{5}$/) { - substr($value,0,5) = ' '; + if (substr($value, 0, 5) !~ /^[\d ]{5}$/) { + substr($value, 0, 5) = ' '; } # 05: Record status - if(substr($value,5,1) !~ /^[\dA-Za-z ]$/) { - substr($value,5,1) = ' '; + if (substr($value, 5, 1) !~ /^[\dA-Za-z ]$/) { + substr($value, 5, 1) = ' '; } # 06: Type of record - if(substr($value,6,1) !~ /^[\dA-Za-z]$/) { + if (substr($value, 6, 1) !~ /^[\dA-Za-z]$/) { get_logger()->warn("Invalid value found for record type, can't remediate"); } # 07: Bibliographic level - if(substr($value,7,1) !~ /^[\dA-Za-z ]$/) { - substr($value,7,1) = ' '; + if (substr($value, 7, 1) !~ /^[\dA-Za-z ]$/) { + substr($value, 7, 1) = ' '; } # 08: Type of control - if(substr($value,8,1) !~ /^[\dA-Za-z ]$/) { - substr($value,8,1) = ' '; + if (substr($value, 8, 1) !~ /^[\dA-Za-z ]$/) { + substr($value, 8, 1) = ' '; } # 09: Character coding scheme - if(substr($value,9,1) ne 'a') { + if (substr($value, 9, 1) ne 'a') { get_logger()->warn("Non-Unicode MARC-XML found"); } # 10: Indicator count - if(substr($value,10,1) !~ /^(2| )$/) { - substr($value,10,1) = ' '; + if (substr($value, 10, 1) !~ /^(2| )$/) { + substr($value, 10, 1) = ' '; } # 11: Subfield code count - if(substr($value,11,1) !~ /^(2| )$/) { - substr($value,11,1) = ' '; + if (substr($value, 11, 1) !~ /^(2| )$/) { + substr($value, 11, 1) = ' '; } # 12-16: Base address of data - if(substr($value,12,5) !~ /^[\d ]{5}$/) { - substr($value,12,5) = ' '; + if (substr($value, 12, 5) !~ /^[\d ]{5}$/) { + substr($value, 12, 5) = ' '; } # 17: Encoding level - if(substr($value,17,1) !~ /^[\dA-Za-z ]$/) { - substr($value,17,1) = 'u'; # unknown + if (substr($value, 17, 1) !~ /^[\dA-Za-z ]$/) { + substr($value, 17, 1) = 'u'; # unknown } # 18: Descriptive cataloging form - if(substr($value,18,1) !~ /^[\dA-Za-z ]$/) { - substr($value,18,1) = 'u'; # unknown + if (substr($value, 18, 1) !~ /^[\dA-Za-z ]$/) { + substr($value, 18, 1) = 'u'; # unknown } # 19: Multipart resource record level - if(substr($value,19,1) !~ /^[\dA-Za-z ]$/) { - substr($value,19,1) = ' '; + if (substr($value, 19, 1) !~ /^[\dA-Za-z ]$/) { + substr($value, 19, 1) = ' '; } # 20: Length of the length-of-field portion # 21: Length of the start-character-position portion # 22: Length of the implementatino-defined 
portion # 23: Undefined - if(substr($value,20,4) !~ /^(4500| )/) { + if (substr($value, 20, 4) !~ /^(4500| )/) { # default to unspecified - substr($value,20,4) = ' '; + substr($value, 20, 4) = ' '; } } @@ -1040,7 +1119,7 @@ sub _remediate_marc { # reinsert control fields in the correct place while (my $controlfield = pop @controlfields) { - $leader->parentNode()->insertAfter($controlfield,$leader); + $leader->parentNode()->insertAfter($controlfield, $leader); } foreach my $datafield ($xc->findnodes('.//marc:datafield')) { @@ -1051,19 +1130,18 @@ sub _remediate_marc { # clean ind1, ind2; move i{1,2} -> ind{1,2} 'ind1' => 'ind1', 'ind2' => 'ind2', - 'i1' => 'ind1', - 'i2' => 'ind2', + 'i1' => 'ind1', + 'i2' => 'ind2', }; - while (my ($old,$new) = each (%$attrs_to_move)) { - if($datafield->hasAttribute($old)) { - + while (my ($old, $new) = each (%$attrs_to_move)) { + if ($datafield->hasAttribute($old)) { my $attrval = $datafield->getAttribute($old); # default to empty if value is invalid - if($attrval !~ /^[\da-z ]{1}$/) { + if ($attrval !~ /^[\da-z ]{1}$/) { $attrval = " "; } $datafield->removeAttribute($old); - $datafield->setAttribute($new,$attrval); + $datafield->setAttribute($new, $attrval); } } } @@ -1072,18 +1150,17 @@ sub _remediate_marc { # remove empty data fields $datafield->parentNode()->removeChild($datafield); } - - } sub convert_tz { - my $self = shift; - my $date = shift; + my $self = shift; + my $date = shift; my $from_tz = shift; - die("No from_tz specified") unless defined $from_tz; + die("No from_tz specified") unless defined $from_tz; die("Missing Date::Manip::VERSION") unless defined $Date::Manip::VERSION; - if($Date::Manip::VERSION < 6.00) { + + if ($Date::Manip::VERSION < 6.00) { # version 5 functional interface, doesn't track timezone my $parsed = ParseDate($date); $self->set_error("BadValue",actual=>"$date",field=>"date",detail=>"Can't parse date") unless defined $parsed; @@ -1099,7 +1176,7 @@ sub convert_tz { $dm_date->convert('UTC'); $self->set_error("BadValue",actual=>"$date $from_tz",field=>"date",detail=>"Can't convert to UTC: " . $dm_date->err()) if $dm_date->err(); - + my $res = $dm_date->printf('%OZ'); $self->set_error("BadValue",actual=>"$date $from_tz",field=>"date",detail=>"Can't convert to UTC: " . 
$dm_date->err()) if not defined $res or !$res;
@@ -1113,62 +1190,71 @@ sub is_uplift {
 }
 
 sub agent_type {
-    my $self = shift;
-    my $agentid = shift;
+    my $self    = shift;
+    my $agentid = shift;
 
-    return "HathiTrust Institution ID";
+    return "HathiTrust Institution ID";
 }
 
-# map MARC21 agent codes to HathiTrust Institution IDs
+# map MARC21 agent codes to HathiTrust Institution IDs
 sub migrate_agent_identifiers {
-    my $self = shift;
-    my $xc = shift;
-    my $volume = $self->{volume};
-
-    # migrate agent IDs
-    #
-    foreach my $agent ( $xc->findnodes('//premis:linkingAgentIdentifier') ) {
-        my $agent_type = ($xc->findnodes('./premis:linkingAgentIdentifierType',$agent))[0];
-        my $agent_value = ($xc->findnodes('./premis:linkingAgentIdentifierValue',$agent))[0];
-
-        my $agent_type_text = $agent_type->textContent();
-        my $agent_value_text = $agent_value->textContent();
-        my $new_agent_value = undef;
-        # TODO: remove after uplift
-        if($agent_type_text eq 'MARC21 Code' or $agent_type_text eq 'AgentID') {
-            $new_agent_value = $agent_mapping{$agent_value_text};
-            if(not defined $new_agent_value) {
-                $self->set_error("BadValue",field=>'linkingAgentIdentifierValue',
-                    actual => $agent_value_text,
-                    detail => "Don't know what the HT institution ID is for obsolete agent identifier");
-            }
-        } elsif($agent_type_text eq 'HathiTrust AgentID') {
-            if($agent_value_text eq 'UNKNOWN' and $volume->{namespace} = 'mdp') {
-                # best guess
-                $new_agent_value = 'umich';
-            } else {
-                $self->set_error("BadValue",field=>'linkingAgentIdentifierValue',
-                    actual => $agent_value_text,
-                    detail => 'Unexpected HathiTrust AgentID');
-            }
-        } elsif($agent_type_text eq 'HathiTrust Institution ID' or $agent_type_text eq 'tool') {
-            # do nothing
-        } else {
-            my $mets_in_repos = $volume->get_repository_mets_path();
-            $self->set_error("BadValue",field => 'linkingAgentIdentifierType',
-                actual => $agent_type_text,
-                expected => 'tool, MARC21 Code, or HathiTrust Institution ID',
-                file => $mets_in_repos)
-        }
+    my $self = shift;
+    my $xc   = shift;
+
+    my $volume = $self->{volume};
 
-        if(defined $new_agent_value) {
-            $agent_type->removeChildNodes();
-            $agent_type->appendText("HathiTrust Institution ID");
-            $agent_value->removeChildNodes();
-            $agent_value->appendText($new_agent_value);
+    # migrate agent IDs
+    foreach my $agent ($xc->findnodes('//premis:linkingAgentIdentifier')) {
+        my $agent_type  = ($xc->findnodes('./premis:linkingAgentIdentifierType', $agent))[0];
+        my $agent_value = ($xc->findnodes('./premis:linkingAgentIdentifierValue', $agent))[0];
+
+        my $agent_type_text  = $agent_type->textContent();
+        my $agent_value_text = $agent_value->textContent();
+        my $new_agent_value  = undef;
+        # TODO: remove after uplift
+        if ($agent_type_text eq 'MARC21 Code' or $agent_type_text eq 'AgentID') {
+            $new_agent_value = $agent_mapping{$agent_value_text};
+            if (not defined $new_agent_value) {
+                $self->set_error(
+                    "BadValue",
+                    field  => 'linkingAgentIdentifierValue',
+                    actual => $agent_value_text,
+                    detail => "Don't know what the HT institution ID is for obsolete agent identifier"
+                );
+            }
+        } elsif ($agent_type_text eq 'HathiTrust AgentID') {
+            if ($agent_value_text eq 'UNKNOWN' and $volume->{namespace} eq 'mdp') {
+                # best guess
+                $new_agent_value = 'umich';
+            } else {
+                $self->set_error(
+                    "BadValue",
+                    field  => 'linkingAgentIdentifierValue',
+                    actual => $agent_value_text,
+                    detail => 'Unexpected HathiTrust AgentID'
+                );
+            }
+        } elsif ($agent_type_text eq 'HathiTrust Institution ID' or $agent_type_text eq 'tool') {
+            # do nothing
+        } else {
+            my $mets_in_repos = $volume->get_repository_mets_path();
+            $self->set_error(
+                "BadValue",
+                field    => 'linkingAgentIdentifierType',
+                actual   => $agent_type_text,
+                expected => 'tool, MARC21 Code, or HathiTrust Institution ID',
+                file     => $mets_in_repos
+            );
+        }
+
+        if (defined $new_agent_value) {
+            $agent_type->removeChildNodes();
+            $agent_type->appendText("HathiTrust Institution ID");
+            $agent_value->removeChildNodes();
+            $agent_value->appendText($new_agent_value);
+        }
     }
 }
 
 1;
@@ -1183,7 +1269,7 @@ HTFeed::METS - Main class for creating METS XML
 
 A series of stages to generate a METS XML document for a Feed package.
 
-=head1 DESCRIPTION 
+=head1 DESCRIPTION
 
 METS.pm provides the main methods for generating a METS XML document. These
 methods (documented below) can be subclassed for various special cases, such
 as SourceMETS and PackageType::METS.
@@ -1206,7 +1292,7 @@ header
 
 dmdsecs
 
-techmds 
+techmds
 
 filesecs
 
@@ -1226,7 +1312,7 @@ C<$version = perl_mod_version($module);>
 
 =item stage_info()
 
-Return status on completion of METS stage (success/failure) 
+Return status on completion of METS stage (success/failure)
 
 =item add_premis_event()
 
diff --git a/lib/HTFeed/PackageType/IA/Download.pm b/lib/HTFeed/PackageType/IA/Download.pm
index f6f75206..31ad7f2e 100644
--- a/lib/HTFeed/PackageType/IA/Download.pm
+++ b/lib/HTFeed/PackageType/IA/Download.pm
@@ -17,12 +17,13 @@ sub links {
 }
 
 package HTFeed::PackageType::IA::Download;
-use Encode qw(decode);
 
-use warnings;
 use strict;
+use warnings;
 
 use base qw(HTFeed::Stage::Download);
+
+use Encode qw(decode);
 use File::Pairtree qw(id2ppath s2ppchars);
 use File::Path qw(make_path);
 use HTFeed::Config qw(get_config);
@@ -45,6 +46,9 @@ sub run {
     $self->{pt_path} = $pt_path;
 
     my @noncore_missing = ();
+    my $labels = {name => 'ia'};
+    my $start_time = $self->{job_metrics}->time;
+
     foreach my $suffix (@$core_package_items) {
         $self->download(suffix => $suffix);
     }
@@ -127,6 +131,13 @@ sub run {
     );
 
     $self->_set_done();
+    my $end_time = $self->{job_metrics}->time;
+    my $delta_time = $end_time - $start_time;
+    my $downloaded_size = $self->{job_metrics}->dir_size($pt_path);
+    $self->{job_metrics}->add("ingest_download_seconds_total", $delta_time, $labels);
+    $self->{job_metrics}->add("ingest_download_bytes_r_total", $downloaded_size, $labels);
+    $self->{job_metrics}->inc("ingest_download_items_total", $labels);
+
     return $self->succeeded();
 }
 
diff --git a/lib/HTFeed/PackageType/IA/ImageRemediate.pm b/lib/HTFeed/PackageType/IA/ImageRemediate.pm
index b58eada3..6db3e957 100644
--- a/lib/HTFeed/PackageType/IA/ImageRemediate.pm
+++ b/lib/HTFeed/PackageType/IA/ImageRemediate.pm
@@ -1,57 +1,74 @@
 package HTFeed::PackageType::IA::ImageRemediate;
 
-use warnings;
 use strict;
+use warnings;
+
 use base qw(HTFeed::Stage::ImageRemediate);
+
 use Carp;
+use File::Basename qw(basename);
 use Log::Log4perl qw(get_logger);
 use POSIX qw(strftime);
-use File::Basename qw(basename);
 
 sub run {
-    my $self   = shift;
-    my $volume = $self->{volume};
+    my $self = shift;
+
+    my $volume        = $self->{volume};
     my $preingest_dir = $volume->get_preingest_directory();
-    my $stage_path = $volume->get_staging_directory();
-    my $objid = $volume->get_objid();
-    my $scandata_xpc = $volume->get_scandata_xpc();
-
-    my $resolution = 
$volume->get_db_resolution(); - $resolution = $scandata_xpc->findvalue("//scribe:bookData/scribe:dpi | //bookData/dpi") if not defined $resolution or !$resolution; - $resolution = $volume->get_meta_xpc()->findvalue("//ppi") if not defined $resolution or !$resolution; - + my $stage_path = $volume->get_staging_directory(); + my $objid = $volume->get_objid(); + my $scandata_xpc = $volume->get_scandata_xpc(); + my $resolution = $volume->get_db_resolution(); + my $labels = {packagetype => 'ia'}; + my $start_time = $self->{job_metrics}->time; + + # Fall back to getting resolution from scandata or meta + if (not defined $resolution or !$resolution) { + $resolution = $scandata_xpc->findvalue("//scribe:bookData/scribe:dpi | //bookData/dpi") + } + if (not defined $resolution or !$resolution) { + $resolution = $volume->get_meta_xpc()->findvalue("//ppi"); + } + # decompress any lossless JPEG2000 images my @jp2 = glob("$preingest_dir/*.jp2"); if(@jp2) { - $self->expand_lossless_jpeg2000($volume,$preingest_dir,[map { basename($_) } @jp2]); + $self->expand_lossless_jpeg2000( + $volume, + $preingest_dir, + [map { basename($_) } @jp2] + ); } #remediate TIFFs (incl. expanded JPEG2000 images) my @tiffs = map { basename($_) } glob("$preingest_dir/*.tif"); - $self->remediate_tiffs($volume,$preingest_dir,\@tiffs, + $self->remediate_tiffs( + $volume, + $preingest_dir, + \@tiffs, # return extra fields to set that depend on the file sub { my $file = shift; my $set_if_undefined_fields = {}; - my $force_fields = {'IFD0:DocumentName' => join('/',$volume->get_objid(),$file) }; - if ( my $capture_time = $self->get_capture_time($file) ) { + my $force_fields = {'IFD0:DocumentName' => join('/',$volume->get_objid(),$file) }; + if (my $capture_time = $self->get_capture_time($file)) { $set_if_undefined_fields->{'XMP-tiff:DateTime'} = $capture_time; } $set_if_undefined_fields->{'Resolution'} = $resolution if defined $resolution and $resolution; - return ( $force_fields, $set_if_undefined_fields, $file); + return ($force_fields, $set_if_undefined_fields, $file); } ) if @tiffs; - opendir( my $dirh, "$preingest_dir" ) - or croak("Can't opendir $preingest_dir: $!"); + opendir(my $dirh, "$preingest_dir") or croak("Can't opendir $preingest_dir: $!"); - while ( my $file = readdir($dirh) ) { + while (my $file = readdir($dirh)) { next unless $file =~ /(\d{4})\.jp2$/; - my $seqnum = $1; - my $new_filename = sprintf("%08d.jp2",$seqnum); + + my $seqnum = $1; + my $new_filename = sprintf("%08d.jp2",$seqnum); my $jp2_submitted = "$preingest_dir/$file"; my $jp2_remediated = "$stage_path/$new_filename"; @@ -62,22 +79,33 @@ sub run { my $set_if_undefined_fields = {}; - if ( my $capture_time = $self->get_capture_time($file) ) { + if (my $capture_time = $self->get_capture_time($file)) { $set_if_undefined_fields->{'XMP-tiff:DateTime'} = $capture_time; } $set_if_undefined_fields->{'Resolution'} = $resolution if defined $resolution and $resolution; $self->remediate_image( - $jp2_submitted, $jp2_remediated, - $set_always_fields, $set_if_undefined_fields + $jp2_submitted, + $jp2_remediated, + $set_always_fields, + $set_if_undefined_fields ); } closedir($dirh); + $volume->record_premis_event('image_header_modification'); $volume->record_premis_event('file_rename'); + # Record metrics + my $end_time = $self->{job_metrics}->time; + my $delta_time = $end_time - $start_time; + my $page_count = $volume->get_page_count(); + $self->{job_metrics}->add("ingest_imageremediate_seconds_total", $delta_time, $labels); + 
$self->{job_metrics}->add("ingest_imageremediate_images_total", $page_count, $labels); + $self->{job_metrics}->inc("ingest_imageremediate_items_total", $labels); + $self->_set_done(); return $self->succeeded(); } @@ -85,41 +113,39 @@ sub run { sub get_capture_time { my $self = shift; my $image_file = shift; - my $volume = $self->{volume}; - my $xpc = $volume->get_scandata_xpc(); - my $preingest_dir = $volume->get_preingest_directory(); + my $volume = $self->{volume}; + my $xpc = $volume->get_scandata_xpc(); + my $preingest_dir = $volume->get_preingest_directory(); my $gmtTimeStampRE = qr/^(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})$/; # Get the time of creation from scandata.xml - my $leafNum = int( $image_file =~ /_(\d{4}).jp2/ ); + my $leafNum = int($image_file =~ /_(\d{4}).jp2/); # A couple places this might appear, and it might be with or without a namespace.. - my $gmtTimeStamp = - $xpc->findvalue(qq(//scribe:pageData/scribe:page[\@leafNum='$leafNum']/scribe:gmtTimeStamp | //pageData/page[\@leafNum='$leafNum']/gmtTimeStamp)); + my $gmtTimeStamp = $xpc->findvalue( + qq(//scribe:pageData/scribe:page[\@leafNum='$leafNum']/scribe:gmtTimeStamp | //pageData/page[\@leafNum='$leafNum']/gmtTimeStamp) + ); # TODO: Start or end time stamp? Or do we want to get it from the file? - if( not defined $gmtTimeStamp or $gmtTimeStamp eq '' or $gmtTimeStamp !~ $gmtTimeStampRE) { - $gmtTimeStamp = $xpc->findvalue('//scribe:scanLog/scribe:scanEvent/scribe:endTimeStamp | //scanLog/scanEvent/endTimeStamp'); + if (not defined $gmtTimeStamp or $gmtTimeStamp eq '' or $gmtTimeStamp !~ $gmtTimeStampRE) { + $gmtTimeStamp = $xpc->findvalue( + '//scribe:scanLog/scribe:scanEvent/scribe:endTimeStamp | //scanLog/scanEvent/endTimeStamp' + ); } - if( not defined $gmtTimeStamp or $gmtTimeStamp eq '' or $gmtTimeStamp !~ $gmtTimeStampRE) { + if (not defined $gmtTimeStamp or $gmtTimeStamp eq '' or $gmtTimeStamp !~ $gmtTimeStampRE) { my $meta_xpc = $self->{volume}->get_meta_xpc(); $gmtTimeStamp = $meta_xpc->findvalue('//scandate'); } # use file time stamp if all else fails - if( not defined $gmtTimeStamp or $gmtTimeStamp eq '' or $gmtTimeStamp !~ $gmtTimeStampRE) { + if (not defined $gmtTimeStamp or $gmtTimeStamp eq '' or $gmtTimeStamp !~ $gmtTimeStampRE) { $gmtTimeStamp = strftime("%Y%m%d%H%M%S",gmtime((stat("$preingest_dir/$image_file"))[9])); } # Format is YYYYMMDDHHmmss - if ( defined $gmtTimeStamp - and $gmtTimeStamp =~ $gmtTimeStampRE ) - { + if (defined $gmtTimeStamp and $gmtTimeStamp =~ $gmtTimeStampRE) { return ("$1:$2:$3 $4:$5:$6+00:00"); } - } 1; - -__END__ diff --git a/lib/HTFeed/PackageType/Simple/ImageRemediate.pm b/lib/HTFeed/PackageType/Simple/ImageRemediate.pm index 005c19c4..c559db8a 100644 --- a/lib/HTFeed/PackageType/Simple/ImageRemediate.pm +++ b/lib/HTFeed/PackageType/Simple/ImageRemediate.pm @@ -34,11 +34,17 @@ sub run { my $volume = $self->{volume}; my $preingest_dir = $volume->get_preingest_directory(); my $staging_dir = $volume->get_staging_directory(); + my $labels = {packagetype => 'simple'}; + my $start_time = $self->{job_metrics}->time; # decompress any lossless JPEG2000 images my @jp2 = glob("$preingest_dir/*.jp2"); if (@jp2) { - $self->expand_lossless_jpeg2000($volume, $preingest_dir, [map { basename($_) } @jp2]); + $self->expand_lossless_jpeg2000( + $volume, + $preingest_dir, + [map { basename($_) } @jp2] + ); } #remediate TIFFs @@ -84,7 +90,7 @@ sub run { # force override resolution if it is provided in meta.yml $self->set_from_meta_yml('contone_resolution_dpi', $force_fields, 'Resolution'); - 
$self->remediate_image( $jp2_submitted, $jp2_remediated, $force_fields, $set_if_undefined ); + $self->remediate_image($jp2_submitted, $jp2_remediated, $force_fields, $set_if_undefined); } $volume->record_premis_event('image_header_modification'); @@ -98,8 +104,15 @@ sub run { move($file, $staging_dir); } $fetch->fix_line_endings($staging_dir); - $self->_set_done(); + my $page_count = $volume->get_page_count(); + my $end_time = $self->{job_metrics}->time; + my $delta_time = $end_time - $start_time; + $self->{job_metrics}->add("ingest_imageremediate_seconds_total", $delta_time, $labels); + $self->{job_metrics}->add("ingest_imageremediate_images_total", $page_count, $labels); + $self->{job_metrics}->inc("ingest_imageremediate_items_total", $labels); + + $self->_set_done(); return $self->succeeded(); } diff --git a/lib/HTFeed/Stage/ImageRemediate.pm b/lib/HTFeed/Stage/ImageRemediate.pm index b56da58b..f88886a7 100644 --- a/lib/HTFeed/Stage/ImageRemediate.pm +++ b/lib/HTFeed/Stage/ImageRemediate.pm @@ -19,8 +19,6 @@ use List::Util qw(max min); use Log::Log4perl qw(get_logger); use POSIX qw(ceil); -use Data::Dumper qw(Dumper); - =head1 NAME HTFeed::Stage::ImageRemediate - Image file processing @@ -217,7 +215,6 @@ sub _remediate_tiff { my $force_headers = shift || {}; my $set_if_undefined_headers = shift; - my $start_time = $self->{job_metrics}->time; my $infile_size = -s $infile; my $bad = 0; @@ -380,11 +377,7 @@ sub _remediate_tiff { $self->{newFields} ); - my $end_time = $self->{job_metrics}->time; - my $delta_time = $end_time - $start_time; my $labels = {format => 'tiff'}; - $self->{job_metrics}->add("ingest_imageremediate_seconds_total", $delta_time, $labels); - $self->{job_metrics}->inc("ingest_imageremediate_images_total", $labels); $self->{job_metrics}->add("ingest_imageremediate_bytes_r_total", $infile_size, $labels); $self->{job_metrics}->add("ingest_imageremediate_bytes_w_total", -s $outfile, $labels); @@ -415,7 +408,6 @@ sub repair_tiff_exiftool { my $outfile = shift; my $fields = shift; - my $start_time = $self->{job_metrics}->time; my $infile_size = -s $infile; # fix the DateTime @@ -445,13 +437,10 @@ sub repair_tiff_exiftool { ); return 0; } - my $end_time = $self->{job_metrics}->time; - my $delta_time = $end_time - $start_time; + my $labels = {format => 'tiff'}; - $self->{job_metrics}->inc("ingest_imageremediate_images_total", $labels); $self->{job_metrics}->add("ingest_imageremediate_bytes_r_total", $infile_size, $labels); $self->{job_metrics}->add("ingest_imageremediate_bytes_w_total", -s $outfile, $labels); - $self->{job_metrics}->add("ingest_imageremediate_seconds_total", $delta_time, $labels); return $write_return; } @@ -461,7 +450,6 @@ sub repair_tiff_imagemagick { my $infile = shift; my $outfile = shift; - my $start_time = $self->{job_metrics}->time; # try running IM on the TIFF file get_logger()->trace( "TIFF_REPAIR: attempting to repair $infile to $outfile\n" @@ -472,19 +460,14 @@ sub repair_tiff_imagemagick { # convert returns 0 on success, 1 on failure my $compress_ok = HTFeed::Image::Magick::compress($infile, $outfile, '-compress' => 'Group4'); - my $end_time = $self->{job_metrics}->time; - my $delta_time = $end_time - $start_time; my $labels = {format => 'tiff', tool => 'imagemagick'}; $self->{job_metrics}->add("ingest_imageremediate_bytes_r_total", -s $infile, $labels); $self->{job_metrics}->add("ingest_imageremediate_bytes_w_total", -s $outfile, $labels); - $self->{job_metrics}->add("ingest_imageremediate_seconds_total", $delta_time, $labels); - 
$self->{job_metrics}->inc("ingest_imageremediate_images_total", $labels); croak("failed repairing $infile\n") unless $compress_ok; # Some metadata may be lost when imagemagick compresses infile to outfile. # Here we are putting Artist back, or we'll crash at a later stage, # due to missing ImageProducer (which depends on Artist). - $start_time = $self->{job_metrics}->time; my $out_exif = Image::ExifTool->new; my $out_meta = $out_exif->ImageInfo($outfile); if (defined $in_meta->{'Artist'} && !defined $out_meta->{'Artist'}) { @@ -496,13 +479,9 @@ sub repair_tiff_imagemagick { } } - $end_time = $self->{job_metrics}->time; - $delta_time = $end_time - $start_time; $labels = {format => 'tiff', tool => 'exiftool'}; $self->{job_metrics}->add("ingest_imageremediate_bytes_r_total", -s $infile, $labels); $self->{job_metrics}->add("ingest_imageremediate_bytes_w_total", -s $outfile, $labels); - $self->{job_metrics}->add("ingest_imageremediate_seconds_total", $delta_time, $labels); - $self->{job_metrics}->inc("ingest_imageremediate_images_total", $labels); return $compress_ok; } @@ -514,7 +493,6 @@ sub _remediate_jpeg2000 { my $force_headers = shift || {}; my $set_if_undefined_headers = shift; - my $start_time = $self->{job_metrics}->time; my $infile_size = -s $infile; $self->{newFields} = $force_headers; $self->{oldFields} = $self->get_exiftool_fields($infile); @@ -627,13 +605,9 @@ sub _remediate_jpeg2000 { } my $ret_val = $self->update_tags($exifTool, $outfile, $infile); - my $end_time = $self->{job_metrics}->time; - my $delta_time = $end_time - $start_time; my $labels = {format => 'jpeg2000'}; - $self->{job_metrics}->inc("ingest_imageremediate_images_total", $labels); $self->{job_metrics}->add("ingest_imageremediate_bytes_r_total", $infile_size, $labels); $self->{job_metrics}->add("ingest_imageremediate_bytes_w_total", -s $outfile, $labels); - $self->{job_metrics}->add("ingest_imageremediate_seconds_total", $delta_time, $labels); return $ret_val; } @@ -793,7 +767,6 @@ sub expand_lossless_jpeg2000 { my $jpeg2000 = $file; my $jpeg2000_remediated = $file; my $tiff = $file; - my $start_time = $self->{job_metrics}->time; $tiff =~ s/\.jp2$/.tif/; $jpeg2000_remediated =~ s/\.jp2$/.remediated.jp2/; @@ -804,9 +777,6 @@ sub expand_lossless_jpeg2000 { HTFeed::Image::Grok::decompress("$path/$jpeg2000", "$path/$tiff"); $self->{job_metrics}->add("ingest_imageremediate_bytes_r_total", -s "$path/$jpeg2000", $labels); $self->{job_metrics}->add("ingest_imageremediate_bytes_w_total", -s "$path/$tiff", $labels); - my $delta_time = $self->{job_metrics}->time - $start_time; - $self->{job_metrics}->add("ingest_imageremediate_seconds_total", $delta_time, $labels); - $self->{job_metrics}->inc("ingest_imageremediate_images_total", $labels); # try to compress the TIFF -> JPEG2000 get_logger()->trace("Compressing $path/$tiff to $path/$jpeg2000"); @@ -817,7 +787,6 @@ sub expand_lossless_jpeg2000 { } # Single quality level with reqested PSNR of 32dB. 
See DEV-10 - $start_time = $self->{job_metrics}->time; my $grk_compress_success = HTFeed::Image::Grok::compress( "$path/$tiff", "$path/$jpeg2000_remediated" @@ -836,11 +805,7 @@ sub expand_lossless_jpeg2000 { }; $self->{job_metrics}->add("ingest_imageremediate_bytes_r_total", -s "$path/$tiff", $labels); $self->{job_metrics}->add("ingest_imageremediate_bytes_w_total", -s "$path/$jpeg2000_remediated", $labels); - $delta_time = $self->{job_metrics}->time - $start_time; - $self->{job_metrics}->add("ingest_imageremediate_seconds_total", $delta_time, $labels); - $self->{job_metrics}->inc("ingest_imageremediate_images_total", $labels); - $start_time = $self->{job_metrics}->time; # copy all headers from the original jpeg2000 # grk_compress loses info from IFD0 headers, which are sometimes present in JPEG2000 images my $exiftool = new Image::ExifTool; @@ -850,9 +815,6 @@ sub expand_lossless_jpeg2000 { $labels = {tool => 'exiftool'}; $self->{job_metrics}->add("ingest_imageremediate_bytes_r_total", -s "$path/$tiff", $labels); $self->{job_metrics}->add("ingest_imageremediate_bytes_w_total", -s "$path/$jpeg2000_remediated", $labels); - $delta_time = $self->{job_metrics}->time - $start_time; - $self->{job_metrics}->add("ingest_imageremediate_seconds_total", $delta_time, $labels); - $self->{job_metrics}->inc("ingest_imageremediate_images_total", $labels); # gotta do metrics first or we can't get file sizes rename("$path/$jpeg2000_remediated", "$path/$jpeg2000"); @@ -880,7 +842,6 @@ sub expand_other_file_formats { my $outname = $parts[0]; my $ext = $parts[2]; my $outfile = "$path/$outname.tif"; - my $start_time = $self->{job_metrics}->time; my $compress_ok = HTFeed::Image::Magick::compress( $infile, @@ -896,12 +857,8 @@ sub expand_other_file_formats { tool => 'imagemagick', converted => $ext."->tiff" }; - my $end_time = $self->{job_metrics}->time; - my $delta_time = $end_time - $start_time; - $self->{job_metrics}->add("ingest_imageremediate_seconds_total", $delta_time, $labels); $self->{job_metrics}->add("ingest_imageremediate_bytes_r_total", $infile_size, $labels); $self->{job_metrics}->add("ingest_imageremediate_bytes_w_total", -s $outfile, $labels); - $self->{job_metrics}->inc("ingest_imageremediate_images_total", $labels); } else { $self->set_error( "OperationFailed", @@ -1019,7 +976,6 @@ sub remediate_tiffs { '/jhove:jhove/jhove:repInfo/jhove:messages/jhove:message[@severity="error"]' ); - my $start_time = $self->{job_metrics}->time; my $stage_path = $volume->get_staging_directory(); my $objid = $volume->get_objid(); @@ -1084,10 +1040,7 @@ sub remediate_tiffs { "-m TIFF-hul" ); - my $end_time = $self->{job_metrics}->time; - my $delta_time = $end_time - $start_time; my $labels = {format => "tiff", tool => 'jhove'}; - $self->{job_metrics}->add("ingest_imageremediate_seconds_total", $delta_time, $labels); $self->{job_metrics}->inc("ingest_imageremediate_items_total", $labels); } @@ -1165,7 +1118,6 @@ sub convert_tiff_to_jpeg2000 { -s "$infile.unc.tif", $labels ); - $self->{job_metrics}->inc("ingest_imageremediate_images_total", $labels); if (!$magick_compress_success) { $self->set_error( @@ -1201,8 +1153,6 @@ sub convert_tiff_to_jpeg2000 { $labels = {converted => "tiff->jpeg2000", tool => "grk_compress"}; $self->{job_metrics}->add("ingest_imageremediate_bytes_r_total", -s "$infile.unc.tif", $labels); $self->{job_metrics}->add("ingest_imageremediate_bytes_w_total", -s $outfile, $labels); - $self->{job_metrics}->inc("ingest_imageremediate_images_total", $labels); - # then set new metadata fields - the 
rest will automatically be # set from the JP2 foreach $field (qw(XResolution YResolution ResolutionUnit Artist Make Model)) { diff --git a/t/ia.t b/t/ia.t index 0e800732..fbec7bbd 100644 --- a/t/ia.t +++ b/t/ia.t @@ -1,154 +1,203 @@ use FindBin; use lib "$FindBin::Bin/lib"; -use Test::Spec; -use HTFeed::Test::Support qw(load_db_fixtures); -use HTFeed::Test::SpecSupport qw(mock_zephir); +use strict; +use warnings; + use HTFeed::Config qw(set_config); +use HTFeed::Test::SpecSupport qw(mock_zephir); +use HTFeed::Test::Support qw(load_db_fixtures); +use Test::Spec; +context "mock download" => sub { + describe "HTFeed::PackageType::IA::Download" => sub { + use HTFeed::JobMetrics; + it "increments jobmetrics (even as a mock download)" => sub { + my $jm = HTFeed::JobMetrics->new; + my $objid = 'ark:/13960/t7kq2zj36'; + my $ia_id = 'ark+=13960=t7kq2zj36'; + my $volume = HTFeed::Volume->new( + namespace => 'test', + objid => $objid, + packagetype => 'ia' + ); + my $download_items = "ingest_download_items_total"; + + $jm->clear; + $volume->{ia_id} = $ia_id; + my $downloader = HTFeed::PackageType::IA::Download->new(volume => $volume); + my @mock_files = ( + 'djvu.xml', + 'files.xml', + 'jp2.zip', + 'meta.xml', + 'scandata.xml', + 'scanfactors.xml', + ); + # Create mock files in the download dir to skip the actual download from IA + foreach my $mock_file (@mock_files) { + my $mock_path = join( + "", + $volume->get_download_directory(), + "/", + $ia_id, + "_", + $mock_file + ); + system("touch $mock_path"); + } + + # Check that the $download_items metric increments upon successful download + ok($jm->get_value($download_items) == 0); + $downloader->run(); + ok($jm->get_value($download_items) == 1); + }; + }; +}; context "with volume & temporary ingest/preingest/zipfile dirs" => sub { - my $volume; - my $objid; - my $pt_objid; - - my $tmpdir; - - my $tmpdirs; - - before all => sub { - load_db_fixtures; - $tmpdirs = HTFeed::Test::TempDirs->new(); - $objid = 'ark:/13960/t7kq2zj36'; - $pt_objid = 'ark+=13960=t7kq2zj36'; - }; - - before each => sub { - $tmpdirs->setup_example; - set_config($tmpdirs->test_home . "/fixtures",'staging','download'); - - $volume = HTFeed::Volume->new(namespace => 'test', - objid => $objid, - packagetype => 'ia'); - $volume->{ia_id} = 'ark+=13960=t7kq2zj36'; - }; - - after each => sub { - $tmpdirs->cleanup_example; - }; - - after all => sub { - $tmpdirs->cleanup; - }; - - describe "HTFeed::PackageType::IA::VerifyManifest" => sub { - my $stage; - - before each => sub { - HTFeed::PackageType::IA::Unpack->new(volume => $volume)->run(); - $stage = HTFeed::PackageType::IA::VerifyManifest->new(volume => $volume); + my $volume; + my $objid; + my $pt_objid; + my $tmpdir; + my $tmpdirs; + + before all => sub { + load_db_fixtures; + $tmpdirs = HTFeed::Test::TempDirs->new(); + $objid = 'ark:/13960/t7kq2zj36'; + $pt_objid = 'ark+=13960=t7kq2zj36'; }; - it "succeeds" => sub { - $stage->run(); - ok($stage->succeeded()); + before each => sub { + $tmpdirs->setup_example; + set_config($tmpdirs->test_home . 
'/fixtures', 'staging', 'download'); + + $volume = HTFeed::Volume->new( + namespace => 'test', + objid => $objid, + packagetype => 'ia' + ); + $volume->{ia_id} = 'ark+=13960=t7kq2zj36'; }; after each => sub { - $stage->clean(); + $tmpdirs->cleanup_example; }; - }; - describe "HTFeed::PackageType::IA::Unpack" => sub { - my $stage; - - before each => sub { - $stage = HTFeed::PackageType::IA::Unpack->new(volume => $volume); + after all => sub { + $tmpdirs->cleanup; }; - it "succeeds" => sub { - $stage->run(); - ok($stage->succeeded()); - }; + describe "HTFeed::PackageType::IA::VerifyManifest" => sub { + my $stage; - it "extracts the zip" => sub { - $stage->run(); + before each => sub { + HTFeed::PackageType::IA::Unpack->new(volume => $volume)->run(); + $stage = HTFeed::PackageType::IA::VerifyManifest->new(volume => $volume); + }; - my $ia_id = $volume->get_ia_id(); - ok(-e "$tmpdirs->{preingest}/$pt_objid/${ia_id}_0001.jp2"); - }; + it "succeeds" => sub { + $stage->run(); + ok($stage->succeeded()); + }; - after each => sub { - $stage->clean(); - }; - }; - - share my %vars; - shared_examples_for "mets with reading order" => sub { - it "succeeds" => sub { - my $stage = $vars{stage}; - $stage->run(); - ok($stage->succeeded()); + after each => sub { + $stage->clean(); + }; }; - it "generates the METS xml" => sub { - $vars{stage}->run(); - ok(-e $vars{mets_xml}); - }; + describe "HTFeed::PackageType::IA::Unpack" => sub { + my $stage; - context "with a mets xml" => sub { - - before each => sub { - $vars{stage}->run; - }; - - it "writes scanningOrder, readingOrder, and coverTag" => sub { - my $xc = $volume->_parse_xpc($vars{mets_xml}); - ok($xc->findnodes('/METS:mets/METS:amdSec/METS:techMD/METS:mdWrap/METS:xmlData/gbs:scanningOrder')->size() == 1); - is($xc->findvalue('/METS:mets/METS:amdSec/METS:techMD/METS:mdWrap/METS:xmlData/gbs:scanningOrder'), 'right-to-left'); - ok($xc->findnodes('/METS:mets/METS:amdSec/METS:techMD/METS:mdWrap/METS:xmlData/gbs:readingOrder')->size() == 1); - is($xc->findvalue('/METS:mets/METS:amdSec/METS:techMD/METS:mdWrap/METS:xmlData/gbs:readingOrder'), 'right-to-left'); - ok($xc->findnodes('/METS:mets/METS:amdSec/METS:techMD/METS:mdWrap/METS:xmlData/gbs:coverTag')->size() == 1); - is($xc->findvalue('/METS:mets/METS:amdSec/METS:techMD/METS:mdWrap/METS:xmlData/gbs:coverTag'), 'follows-reading-order'); - }; - }; - }; + before each => sub { + $stage = HTFeed::PackageType::IA::Unpack->new(volume => $volume); + }; - describe "HTFeed::PackageType::IA::SourceMETS" => sub { + it "succeeds" => sub { + $stage->run(); + ok($stage->succeeded()); + }; - before each => sub { - $volume->record_premis_event('package_inspection'); - HTFeed::PackageType::IA::VerifyManifest->new(volume => $volume)->run(); - HTFeed::PackageType::IA::Unpack->new(volume => $volume)->run(); - HTFeed::PackageType::IA::DeleteCheck->new(volume => $volume)->run(); - HTFeed::PackageType::IA::OCRSplit->new(volume => $volume)->run(); - HTFeed::PackageType::IA::ImageRemediate->new(volume => $volume)->run(); - mock_zephir(); - $vars{stage} = HTFeed::PackageType::IA::SourceMETS->new(volume => $volume); - $vars{mets_xml} = "$tmpdirs->{ingest}/$pt_objid/IA_$pt_objid.xml" + it "extracts the zip" => sub { + $stage->run(); + + my $ia_id = $volume->get_ia_id(); + ok(-e "$tmpdirs->{preingest}/$pt_objid/${ia_id}_0001.jp2"); + }; + + after each => sub { + $stage->clean(); + }; }; - it_should_behave_like "mets with reading order"; - }; + share my %vars; + shared_examples_for "mets with reading order" => sub { + it "succeeds" => sub 
{ + my $stage = $vars{stage}; + $stage->run(); + ok($stage->succeeded()); + }; + + it "generates the METS xml" => sub { + $vars{stage}->run(); + ok(-e $vars{mets_xml}); + }; + + context "with a mets xml" => sub { + + before each => sub { + $vars{stage}->run; + }; + + it "writes scanningOrder, readingOrder, and coverTag" => sub { + my $xc = $volume->_parse_xpc($vars{mets_xml}); + my $xpath_prefix = '/METS:mets/METS:amdSec/METS:techMD/METS:mdWrap/METS:xmlData'; + ok($xc->findnodes("$xpath_prefix/gbs:scanningOrder")->size() == 1); + is($xc->findvalue("$xpath_prefix/gbs:scanningOrder"), 'right-to-left'); + ok($xc->findnodes("$xpath_prefix/gbs:readingOrder")->size() == 1); + is($xc->findvalue("$xpath_prefix/gbs:readingOrder"), 'right-to-left'); + ok($xc->findnodes("$xpath_prefix/gbs:coverTag")->size() == 1); + is($xc->findvalue("$xpath_prefix/gbs:coverTag"), 'follows-reading-order'); + }; + }; + }; - describe "HTFeed::PackageType::IA::METS" => sub { - before each => sub { - $volume->record_premis_event('package_inspection'); - HTFeed::PackageType::IA::VerifyManifest->new(volume => $volume)->run(); - HTFeed::PackageType::IA::Unpack->new(volume => $volume)->run(); - HTFeed::PackageType::IA::DeleteCheck->new(volume => $volume)->run(); - HTFeed::PackageType::IA::OCRSplit->new(volume => $volume)->run(); - HTFeed::PackageType::IA::ImageRemediate->new(volume => $volume)->run(); - mock_zephir(); - HTFeed::PackageType::IA::SourceMETS->new(volume => $volume)->run(); - HTFeed::VolumeValidator->new(volume => $volume)->run(); - HTFeed::Stage::Pack->new(volume => $volume)->run(); - $vars{stage} = HTFeed::METS->new(volume => $volume); - $vars{mets_xml} = "$tmpdirs->{ingest}/$pt_objid.mets.xml" + describe "HTFeed::PackageType::IA::SourceMETS" => sub { + + before each => sub { + $volume->record_premis_event('package_inspection'); + HTFeed::PackageType::IA::VerifyManifest->new(volume => $volume)->run(); + HTFeed::PackageType::IA::Unpack->new(volume => $volume)->run(); + HTFeed::PackageType::IA::DeleteCheck->new(volume => $volume)->run(); + HTFeed::PackageType::IA::OCRSplit->new(volume => $volume)->run(); + HTFeed::PackageType::IA::ImageRemediate->new(volume => $volume)->run(); + mock_zephir(); + $vars{stage} = HTFeed::PackageType::IA::SourceMETS->new(volume => $volume); + $vars{mets_xml} = "$tmpdirs->{ingest}/$pt_objid/IA_$pt_objid.xml" + }; + + it_should_behave_like "mets with reading order"; }; - it_should_behave_like "mets with reading order"; - }; + describe "HTFeed::PackageType::IA::METS" => sub { + before each => sub { + $volume->record_premis_event('package_inspection'); + HTFeed::PackageType::IA::VerifyManifest->new(volume => $volume)->run(); + HTFeed::PackageType::IA::Unpack->new(volume => $volume)->run(); + HTFeed::PackageType::IA::DeleteCheck->new(volume => $volume)->run(); + HTFeed::PackageType::IA::OCRSplit->new(volume => $volume)->run(); + HTFeed::PackageType::IA::ImageRemediate->new(volume => $volume)->run(); + mock_zephir(); + HTFeed::PackageType::IA::SourceMETS->new(volume => $volume)->run(); + HTFeed::VolumeValidator->new(volume => $volume)->run(); + HTFeed::Stage::Pack->new(volume => $volume)->run(); + $vars{stage} = HTFeed::METS->new(volume => $volume); + $vars{mets_xml} = "$tmpdirs->{ingest}/$pt_objid.mets.xml" + }; + + it_should_behave_like "mets with reading order"; + }; }; runtests unless caller; diff --git a/t/local_ingest.t b/t/local_ingest.t index 81a3fa26..4b59be49 100644 --- a/t/local_ingest.t +++ b/t/local_ingest.t @@ -1,272 +1,272 @@ use FindBin; use lib "$FindBin::Bin/lib"; -use 
Test::Spec; -use HTFeed::Test::SpecSupport qw(mock_zephir); -use HTFeed::Test::Support qw(load_db_fixtures); +use File::Path qw(remove_tree); use HTFeed::Config qw(set_config); use HTFeed::PackageType::Simple::Unpack; use HTFeed::PackageType::Simple::VerifyManifest; -use File::Path qw(remove_tree); +use HTFeed::Test::SpecSupport qw(mock_zephir); +use HTFeed::Test::Support qw(load_db_fixtures); +use Test::Spec; sub unpacked_volume { - my $objid = shift; - my $volume = HTFeed::Volume->new( - namespace => 'test', - objid => $objid, - packagetype => 'simple'); + my $objid = shift; + my $volume = HTFeed::Volume->new( + namespace => 'test', + objid => $objid, + packagetype => 'simple' + ); - HTFeed::PackageType::Simple::Unpack->new(volume => $volume)->run(); + HTFeed::PackageType::Simple::Unpack->new(volume => $volume)->run(); - return $volume; + return $volume; } sub unpack_and_verify { - my $objid = shift; - my $volume = unpacked_volume($objid); - my $stage = HTFeed::PackageType::Simple::VerifyManifest->new(volume => $volume); - $stage->run; - return $stage; + my $objid = shift; + my $volume = unpacked_volume($objid); + my $stage = HTFeed::PackageType::Simple::VerifyManifest->new(volume => $volume); + $stage->run; + return $stage; } describe "HTFeed::PackageType::Simple" => sub { - my $tmpdirs; - my $testlog; - - before all => sub { - load_db_fixtures; - $tmpdirs = HTFeed::Test::TempDirs->new(); - $testlog = HTFeed::Test::Logger->new(); - set_config(0,'stop_on_error'); - }; - - before each => sub { - $tmpdirs->setup_example; - $testlog->reset; - set_config($tmpdirs->test_home . "/fixtures/simple",'staging','fetch'); - }; - - after each => sub { - $tmpdirs->cleanup_example; - }; - - after all => sub { - $tmpdirs->cleanup; - }; - - describe "checksum.md5" => sub { - it "reports a relevant error when checksum.md5 is missing" => sub { - eval { unpack_and_verify("no_checksum"); }; - printf STDERR "EVAL STATUS: $@\n"; - ok($testlog->matches(qr(Missing file.*checksum.md5))); - }; + my $tmpdirs; + my $testlog; - it "reports relevant errors when checksum.md5 is empty" => sub { - unpack_and_verify("empty_checksum"); - ok($testlog->matches(qr(present in package but not in checksum file))); + before all => sub { + load_db_fixtures; + $tmpdirs = HTFeed::Test::TempDirs->new(); + $testlog = HTFeed::Test::Logger->new(); + set_config(0, 'stop_on_error'); }; - it "reports the specific files missing from checksum.md5" => sub { - unpack_and_verify("missing_meta_yml_checksum"); - ok($testlog->matches(qr(file: meta\.yml.*present in package but not in checksum file))); + before each => sub { + $tmpdirs->setup_example; + $testlog->reset; + set_config($tmpdirs->test_home . 
"/fixtures/simple", 'staging', 'fetch'); }; - }; - describe "thumbs.db" => sub { - - it "ignores Thumbs.db when it is in the checksum file but not the package" => sub { - ok(unpack_and_verify("thumbs_in_checksum")->succeeded()); + after each => sub { + $tmpdirs->cleanup_example; }; - it "ignores Thumbs.db when it is in the package but not the checksum file" => sub { - ok(unpack_and_verify("thumbs_in_pkg")->succeeded()); + after all => sub { + $tmpdirs->cleanup; }; - it "ignores Thumbs.db when it is in the checksum file and the package, but the checksum is wrong" => sub { - ok(unpack_and_verify("thumbs_bad_checksum")->succeeded()); + describe "checksum.md5" => sub { + it "reports a relevant error when checksum.md5 is missing" => sub { + eval { unpack_and_verify("no_checksum"); }; + printf STDERR "EVAL STATUS: $@\n"; + ok($testlog->matches(qr(Missing file.*checksum.md5))); + }; + + it "reports relevant errors when checksum.md5 is empty" => sub { + unpack_and_verify("empty_checksum"); + ok($testlog->matches(qr(present in package but not in checksum file))); + }; + + it "reports the specific files missing from checksum.md5" => sub { + unpack_and_verify("missing_meta_yml_checksum"); + ok($testlog->matches(qr(file: meta\.yml.*present in package but not in checksum file))); + }; }; - it "ignores Thumbs.db when it is in both the checksum file and the package" => sub { - ok(unpack_and_verify("thumbs_in_pkg_and_checksum")->succeeded()); - }; - }; + describe "thumbs.db" => sub { + it "ignores Thumbs.db when it is in the checksum file but not the package" => sub { + ok(unpack_and_verify("thumbs_in_checksum")->succeeded()); + }; - describe "meta.yml" => sub { - - before all => sub { - mock_zephir(); - }; + it "ignores Thumbs.db when it is in the package but not the checksum file" => sub { + ok(unpack_and_verify("thumbs_in_pkg")->succeeded()); + }; - it "reports a relevant error when meta.yml is missing" => sub { - my $volume = unpacked_volume("no_meta_yml"); - eval { HTFeed::PackageType::Simple::ImageRemediate->new(volume => $volume)->run(); }; + it "ignores Thumbs.db when it is in the checksum file and the package, but the checksum is wrong" => sub { + ok(unpack_and_verify("thumbs_bad_checksum")->succeeded()); + }; - ok($testlog->matches(qr(Missing file.*meta\.yml))); + it "ignores Thumbs.db when it is in both the checksum file and the package" => sub { + ok(unpack_and_verify("thumbs_in_pkg_and_checksum")->succeeded()); + }; }; - it "reports a relevant error when meta.yml is malformed" => sub { - my $volume = unpacked_volume("bad_meta_yml"); - eval { HTFeed::PackageType::Simple::SourceMETS->new(volume => $volume)->run(); }; - ok($testlog->matches(qr(File validation failed.*meta\.yml)s)); - } - }; + describe "meta.yml" => sub { + before all => sub { + mock_zephir(); + }; - describe "HTFeed::PackageType::Simple::ImageRemediate" => sub { - it "compresses tif to a valid jpeg2000" => sub { - my $volume = unpacked_volume("rgb_tif"); - my $remediate = HTFeed::PackageType::Simple::ImageRemediate->new(volume => $volume); - $remediate->run(); + it "reports a relevant error when meta.yml is missing" => sub { + my $volume = unpacked_volume("no_meta_yml"); + eval { HTFeed::PackageType::Simple::ImageRemediate->new(volume => $volume)->run(); }; - ok(-e "$tmpdirs->{ingest}/rgb_tif/00000001.jp2"); - ok($remediate->succeeded()); + ok($testlog->matches(qr(Missing file.*meta\.yml))); + }; - HTFeed::PackageType::Simple::SourceMETS->new(volume => $volume)->run(); + it "reports a relevant error when meta.yml is malformed" 
=> sub { + my $volume = unpacked_volume("bad_meta_yml"); + eval { HTFeed::PackageType::Simple::SourceMETS->new(volume => $volume)->run(); }; + ok($testlog->matches(qr(File validation failed.*meta\.yml)s)); + } + }; - my $validate = HTFeed::VolumeValidator->new(volume => $volume); - $validate->run(); - ok($validate->succeeded()); + describe "HTFeed::PackageType::Simple::ImageRemediate" => sub { + it "compresses tif to a valid jpeg2000" => sub { + my $volume = unpacked_volume("rgb_tif"); + my $remediate = HTFeed::PackageType::Simple::ImageRemediate->new(volume => $volume); + $remediate->run(); + + ok(-e "$tmpdirs->{ingest}/rgb_tif/00000001.jp2"); + ok($remediate->succeeded()); + + HTFeed::PackageType::Simple::SourceMETS->new(volume => $volume)->run(); + + my $validate = HTFeed::VolumeValidator->new(volume => $volume); + $validate->run(); + ok($validate->succeeded()); + }; + + it "preserves XMP values when compressing tif" => sub { + my $volume = unpacked_volume("rgb_tif"); + my $remediate = HTFeed::PackageType::Simple::ImageRemediate->new(volume => $volume); + $remediate->run(); + + my $exiftool = Image::ExifTool->new(); + $exiftool->ExtractInfo("$tmpdirs->{ingest}/rgb_tif/00000001.jp2"); + is($exiftool->GetValue("XMP-tiff:Make"), "Test scanner make"); + }; + + it "recompresses lossless jpeg2000 to a valid jpeg2000" => sub { + my $volume = unpacked_volume("lossless_jp2"); + + HTFeed::PackageType::Simple::ImageRemediate->new(volume => $volume)->run(); + HTFeed::PackageType::Simple::SourceMETS->new(volume => $volume)->run(); + + my $validate = HTFeed::VolumeValidator->new(volume => $volume); + $validate->run();; + ok($validate->succeeded()); + }; + + it "preserves the XMP when recompressing a lossless JPEG2000" => sub { + # jp2 has artist & resolution fields in XMP; should preserve those + my $volume = unpacked_volume("lossless_jp2_with_xmp"); + HTFeed::PackageType::Simple::ImageRemediate->new(volume => $volume)->run(); + HTFeed::PackageType::Simple::SourceMETS->new(volume => $volume)->run(); + + my $validate = HTFeed::VolumeValidator->new(volume => $volume); + $validate->run(); + ok($validate->succeeded()); + + my $exiftool = Image::ExifTool->new(); + $exiftool->ExtractInfo("$tmpdirs->{ingest}/lossless_jp2_with_xmp/00000001.jp2"); + is($exiftool->GetValue("XMP-tiff:Make"), "Test scanner make"); + }; + + it "does not lose artist when compressing a bitonal tiff" => sub { + my $volume = unpacked_volume("bitonal_tiff"); + HTFeed::PackageType::Simple::ImageRemediate->new(volume => $volume)->run(); + HTFeed::PackageType::Simple::SourceMETS->new(volume => $volume)->run(); + my $validate = HTFeed::VolumeValidator->new(volume => $volume); + $validate->run(); + ok($validate->succeeded()); + }; }; +}; - it "preserves XMP values when compressing tif" => sub { - my $volume = unpacked_volume("rgb_tif"); - my $remediate = HTFeed::PackageType::Simple::ImageRemediate->new(volume => $volume); - $remediate->run(); +describe "HTFeed::PackageType::Simple::Download" => sub { + use HTFeed::PackageType::Simple::Download; + my $tmpdirs; + my $testlog; + my $save_rclone; - my $exiftool = Image::ExifTool->new(); - $exiftool->ExtractInfo("$tmpdirs->{ingest}/rgb_tif/00000001.jp2"); - is($exiftool->GetValue("XMP-tiff:Make"),"Test scanner make"); + before all => sub { + load_db_fixtures; + $tmpdirs = HTFeed::Test::TempDirs->new(); + $testlog = HTFeed::Test::Logger->new(); + set_config(0, 'stop_on_error'); + set_config(1, 'use_dropbox'); + set_config($tmpdirs->test_home . 
"/fixtures/rclone_config.conf", 'rclone_config_path'); + set_config("$FindBin::Bin/bin/rclone_stub.pl", 'rclone'); }; - it "recompresses lossless jpeg2000 to a valid jpeg2000" => sub { - my $volume = unpacked_volume("lossless_jp2"); + before each => sub { + $tmpdirs->setup_example; + $testlog->reset; + }; - HTFeed::PackageType::Simple::ImageRemediate->new(volume => $volume)->run(); - HTFeed::PackageType::Simple::SourceMETS->new(volume => $volume)->run(); + after each => sub { + $tmpdirs->cleanup_example; + }; - my $validate = HTFeed::VolumeValidator->new(volume => $volume); - $validate->run();; - ok($validate->succeeded()); + after all => sub { + $tmpdirs->cleanup; + set_config(0, 'use_dropbox'); }; - it "preserves the XMP when recompressing a lossless JPEG2000" => sub { - # jp2 has artist & resolution fields in XMP; should preserve those - my $volume = unpacked_volume("lossless_jp2_with_xmp"); - HTFeed::PackageType::Simple::ImageRemediate->new(volume => $volume)->run(); + describe "download stage" => sub { + it "downloads the file" => sub { + my $volume = HTFeed::Volume->new( + namespace => 'test', + objid => 'test_objid', + packagetype => 'simple' + ); + my $download = $volume->get_sip_location(); + my $stage = HTFeed::PackageType::Simple::Download->new(volume => $volume); + $stage->run(); + ok($stage->succeeded() && -f $download); + }; + }; +}; - HTFeed::PackageType::Simple::SourceMETS->new(volume => $volume)->run(); +describe "HTFeed::PackageType::Simple::Volume" => sub { + use HTFeed::PackageType::Simple::Download; + my $tmpdirs; + my $testlog; + my $fetchdir; - my $validate = HTFeed::VolumeValidator->new(volume => $volume); - $validate->run(); - ok($validate->succeeded()); + before all => sub { + load_db_fixtures; + $tmpdirs = HTFeed::Test::TempDirs->new(); + $testlog = HTFeed::Test::Logger->new(); + set_config(0, 'stop_on_error'); + set_config(1, 'use_dropbox'); + set_config($tmpdirs->test_home . "/fixtures/rclone_config.conf", 'rclone_config_path'); + set_config("$FindBin::Bin/bin/rclone_stub.pl", 'rclone'); + }; - my $exiftool = Image::ExifTool->new(); - $exiftool->ExtractInfo("$tmpdirs->{ingest}/lossless_jp2_with_xmp/00000001.jp2"); - is($exiftool->GetValue("XMP-tiff:Make"),"Test scanner make"); + before each => sub { + $tmpdirs->setup_example; + $testlog->reset; + $fetchdir = $tmpdirs->dir_for("fetch"); + set_config($fetchdir, 'staging', 'fetch'); + mkdir("$fetchdir/test"); + system("touch", "$fetchdir/test/test_objid.zip"); + system("touch", "$fetchdir/test/test_objid.xml"); }; - it "does not lose artist when compressing a bitonal tiff" => sub { - my $volume = unpacked_volume("bitonal_tiff"); - HTFeed::PackageType::Simple::ImageRemediate->new(volume => $volume)->run(); - HTFeed::PackageType::Simple::SourceMETS->new(volume => $volume)->run(); - my $validate = HTFeed::VolumeValidator->new(volume => $volume); - $validate->run(); - ok($validate->succeeded()); + after each => sub { + $tmpdirs->cleanup_example; + remove_tree($fetchdir); }; - }; -}; -describe "HTFeed::PackageType::Simple::Download" => sub { - use HTFeed::PackageType::Simple::Download; - my $tmpdirs; - my $testlog; - my $save_rclone; - - before all => sub { - load_db_fixtures; - $tmpdirs = HTFeed::Test::TempDirs->new(); - $testlog = HTFeed::Test::Logger->new(); - set_config(0,'stop_on_error'); - set_config(1,'use_dropbox'); - set_config($tmpdirs->test_home . 
"/fixtures/rclone_config.conf", 'rclone_config_path'); - set_config("$FindBin::Bin/bin/rclone_stub.pl", 'rclone'); - }; - - before each => sub { - $tmpdirs->setup_example; - $testlog->reset; - }; - - after each => sub { - $tmpdirs->cleanup_example; - }; - - after all => sub { - $tmpdirs->cleanup; - set_config(0,'use_dropbox'); - }; - - describe "download stage" => sub { - it "downloads the file" => sub { - my $volume = HTFeed::Volume->new( - namespace => 'test', - objid => 'test_objid', - packagetype => 'simple'); - my $download = $volume->get_sip_location(); - my $stage = HTFeed::PackageType::Simple::Download->new(volume => $volume); - $stage->run(); - ok($stage->succeeded() && -f $download); + after all => sub { + $tmpdirs->cleanup; + set_config(0, 'use_dropbox'); }; - }; -}; -describe "HTFeed::PackageType::Simple::Volume" => sub { - use HTFeed::PackageType::Simple::Download; - my $tmpdirs; - my $testlog; - my $fetchdir; - - before all => sub { - load_db_fixtures; - $tmpdirs = HTFeed::Test::TempDirs->new(); - $testlog = HTFeed::Test::Logger->new(); - set_config(0,'stop_on_error'); - set_config(1,'use_dropbox'); - set_config($tmpdirs->test_home . "/fixtures/rclone_config.conf", 'rclone_config_path'); - set_config("$FindBin::Bin/bin/rclone_stub.pl", 'rclone'); - }; - - before each => sub { - $tmpdirs->setup_example; - $testlog->reset; - $fetchdir = $tmpdirs->dir_for("fetch"); - set_config($fetchdir,'staging','fetch'); - mkdir("$fetchdir/test"); - system("touch","$fetchdir/test/test_objid.zip"); - system("touch","$fetchdir/test/test_objid.xml"); - }; - - after each => sub { - $tmpdirs->cleanup_example; - remove_tree($fetchdir); - }; - - after all => sub { - $tmpdirs->cleanup; - set_config(0,'use_dropbox'); - }; - - describe "#clean_sip_success" => sub { - it "calls rclone to remove SIP from Dropbox" => sub { - my $volume = HTFeed::Volume->new( - namespace => 'test', - objid => 'test_objid', - packagetype => 'simple'); - eval { - $volume->clean_sip_success(); - }; - ok($testlog->matches(qr(running.+?rclone.+?delete)i) && !$@); + describe "#clean_sip_success" => sub { + it "calls rclone to remove SIP from Dropbox" => sub { + my $volume = HTFeed::Volume->new( + namespace => 'test', + objid => 'test_objid', + packagetype => 'simple' + ); + eval { + $volume->clean_sip_success(); + }; + ok($testlog->matches(qr(running.+?rclone.+?delete)i) && !$@); + }; }; - }; }; runtests unless caller;