Skip to content

Commit

Permalink
Add data object size and verify checksum support (#223)
Browse files Browse the repository at this point in the history
Add DataObject::size method.

Add DataObject::is_consistent_size method.

Preemptively fix missing data object checksums on overwrite.

Co-authored-by: Keith James <[email protected]>
  • Loading branch information
kjsanger and Keith James authored Oct 19, 2021
1 parent c512c7e commit 7fe6182
Show file tree
Hide file tree
Showing 8 changed files with 264 additions and 49 deletions.
7 changes: 4 additions & 3 deletions .github/workflows/run-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ jobs:
- perl: "5.22.4"
irods: "4.2.7"
server_image: "wsinpg/ub-16.04-irods-4.2.7:latest"
baton: "2.1.0"
baton: "3.2.0"
experimental: false
- perl: "5.22.4"
irods: "4.2.10"
server_image: "wsinpg/ub-18.04-irods-4.2.10:latest"
baton: "3.1.0"
baton: "3.2.0"
experimental: false

services:
Expand Down Expand Up @@ -85,7 +85,8 @@ jobs:
"irods_user_name": "irods",
"irods_zone_name": "testZone",
"irods_home": "/testZone/home/irods",
"irods_default_resource": "replResc"
"irods_default_resource": "replResc",
"irods_default_hash_scheme": "MD5"
}
EOF
Expand Down
72 changes: 60 additions & 12 deletions lib/WTSI/NPG/iRODS.pm
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,9 @@ our $COLLECTION_PATH = 'COLLECTION';
our $STAGING_RAND_MAX = 1024 * 1024 * 1024;
our $STAGING_MAX_TRIES = 2;

our $CALC_CHECKSUM = 1;
our $SKIP_CHECKSUM = 0;
our $SKIP_CHECKSUM = 0;
our $CALC_CHECKSUM = 1;
our $VERIFY_CHECKSUM = 2;

has 'strict_baton_version' =>
(is => 'ro',
Expand Down Expand Up @@ -1366,11 +1367,19 @@ sub read_object {
Arg [1] : Path of file to add to iRODs.
Arg [2] : iRODS data object path.
Arg [3] : Checksum action, either $WTSI::NPG::iRODS::CALC_CHECKSUM
(calculate a checksum on the server side) or
Arg [3] : Checksum action, one of
$WTSI::NPG::iRODS::CALC_CHECKSUM (calculate a checksum on the
server side)
$WTSI::NPG::iRODS::VERIFY_CHECKSUM (calculate a checksum on
the server side and validate it against a checksum calculated
on the client side)
$WTSI::NPG::iRODS::SKIP_CHECKSUM (skip calculation of a
checksum on the server side). Defaults to
$WTSI::NPG::iRODS::SKIP_CHECKSUM
checksum on the server side)
Defaults to $WTSI::NPG::iRODS::SKIP_CHECKSUM.
Example : $irods->add_object('lorem.txt', '/my/path/lorem.txt')
Description: Add a file to iRODS.
Expand All @@ -1393,7 +1402,9 @@ sub add_object {

if (defined $checksum_action) {
($checksum_action =~ m{^\d$}msx and
any { $checksum_action == $_ } ($CALC_CHECKSUM, $SKIP_CHECKSUM)) or
any { $checksum_action == $_ } ($SKIP_CHECKSUM,
$CALC_CHECKSUM,
$VERIFY_CHECKSUM)) or
$self->logconfess("Invalid checksum action '$checksum_action'");
}
else {
Expand All @@ -1417,11 +1428,19 @@ sub add_object {
Arg [1] : Path of file to add to iRODs.
Arg [2] : iRODS data object path.
Arg [3] : Checksum action, either $WTSI::NPG::iRODS::CALC_CHECKSUM
(calculate a checksum on the server side) or
Arg [3] : Checksum action, one of
$WTSI::NPG::iRODS::CALC_CHECKSUM (calculate a checksum on the
server side)
$WTSI::NPG::iRODS::VERIFY_CHECKSUM (calculate a checksum on
the server side and validate it against a checksum calculated
on the client side)
$WTSI::NPG::iRODS::SKIP_CHECKSUM (skip calculation of a
checksum on the server side). Defaults to
$WTSI::NPG::iRODS::SKIP_CHECKSUM
checksum on the server side)
Defaults to $WTSI::NPG::iRODS::SKIP_CHECKSUM.
Example : $irods->replace_object('lorem.txt', '/my/path/lorem.txt')
Description: Replace a file in iRODS.
Expand All @@ -1444,7 +1463,9 @@ sub replace_object {

if (defined $checksum_action) {
($checksum_action =~ m{^\d$}msx and
any { $checksum_action == $_ } ($CALC_CHECKSUM, $SKIP_CHECKSUM)) or
any { $checksum_action == $_ } ($SKIP_CHECKSUM,
$CALC_CHECKSUM,
$VERIFY_CHECKSUM)) or
$self->logconfess("Invalid checksum action '$checksum_action'");
}
else {
Expand Down Expand Up @@ -2059,6 +2080,33 @@ sub checksum {
return $self->baton_client->list_object_checksum($object);
}

=head2 size
Arg [1] : iRODS data object path.
Example : $size = $irods->size('/my/path/lorem.txt')
Description: Return the size in bytes of an iRODS data object. The size
returned is the iRODS catalog value, which may be different
from the actual size on disk.
Returntype : Int
=cut

sub size {
    my ($self, $object) = @_;

    # Validate the argument up front: it must be defined and non-empty.
    if (not defined $object) {
        $self->logconfess('A defined object argument is required');
    }

    if ($object eq q{}) {
        $self->logconfess('A non-empty object argument is required');
    }

    # Pass the path through ensure_object_path before querying, then return
    # whatever size the baton client lists for that data object.
    my $object_path = $self->ensure_object_path($object);

    return $self->baton_client->list_object_size($object_path);
}


=head2 collection_checksums
Arg [1] : iRODS collection path.
Expand Down
48 changes: 45 additions & 3 deletions lib/WTSI/NPG/iRODS/BatonClient.pm
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,17 @@ sub put_object {
my ($file_name, $directory, $suffix) = fileparse($local_path);
my ($data_object, $collection) = fileparse($remote_path);

my $checksum_args = {};
if ($checksum == $WTSI::NPG::iRODS::SKIP_CHECKSUM) {
$checksum_args->{checksum} = JSON::false;
} elsif ($checksum == $WTSI::NPG::iRODS::CALC_CHECKSUM) {
$checksum_args->{checksum} = JSON::true;
} elsif ($checksum == $WTSI::NPG::iRODS::VERIFY_CHECKSUM) {
$checksum_args->{verify} = JSON::true;
}

my $spec = {operation => 'put',
arguments => $self->_map_json_args({checksum => $checksum}),
arguments => $self->_map_json_args($checksum_args),
target => {collection => $collection,
data_object => $data_object,
directory => $directory,
Expand Down Expand Up @@ -260,7 +269,8 @@ sub list_collection_checksums {
Example : my $checksum = $irods->list_object_checksum('/path/to/object')
Description: Return the checksum of the data object. This method uses
the same iRODS API as the 'ichksum' client program.
the same iRODS API as the 'ichksum' client program. Return undef
if no checksum has been calculated.
Returntype : Str
=cut
Expand Down Expand Up @@ -303,7 +313,7 @@ sub calculate_object_checksum {
"received '$object'");

my $spec = {operation => 'checksum',
arguments => {},
arguments => {checksum => JSON::true},
target => {collection => $collection,
data_object => $data_name}};
my $response = $self->communicate($spec);
Expand All @@ -324,6 +334,38 @@ sub calculate_object_checksum {
return $checksum;
}

=head2 list_object_size
Arg [1] : iRODS data object path.
Example : my $size = $irods->list_object_size('/path/to/object')
Description: Return the size of the data object. This method returns the
value from the iRODS catalog, not the size on disk.
Returntype : Int
=cut

sub list_object_size {
    my ($self, $object) = @_;

    my $listing = $self->_list_path($object, {size => 1});

    # No error in the response: return the size field from the listing.
    exists $listing->{error} or return $listing->{size};

    # A non-existent item is tolerated and yields undef; any other error
    # code is handed to report_error.
    my $is_missing = $listing->{error}->{code} == $ITEM_DOES_NOT_EXIST;
    $is_missing or $self->report_error($listing);

    # Deliberately return an undefined scalar (not an empty list).
    my $size;
    return $size;
}

=head2 list_object_replicates
Arg [1] : iRODS data object path.
Expand Down
82 changes: 81 additions & 1 deletion lib/WTSI/NPG/iRODS/DataObject.pm
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ use WTSI::NPG::iRODS::Types qw(ArrayRefOfReplicate);

our $VERSION = '';

# MD5 checksum of zero-length input. is_consistent_size compares a data
# object's checksum against this value to decide whether the checksum agrees
# with the reported size.
our $EMPTY_FILE_CHECKSUM = q[d41d8cd98f00b204e9800998ecf8427e];

with 'WTSI::NPG::iRODS::Path';

has 'data_object' =>
Expand All @@ -36,6 +38,18 @@ has 'checksum' =>
clearer => 'clear_checksum',
documentation => 'The checksum of the data object.');

# Read-only, lazily built integer attribute holding the data object's size.
# The value is obtained on first access by _build_size; has_size is the
# predicate and clear_size the clearer declared for it.
has 'size' =>
(is => 'ro',
isa => 'Int',
lazy => 1,
builder => '_build_size',
predicate => 'has_size',
clearer => 'clear_size',
documentation => 'The size in bytes of the data object in the catalog. ' .
'This is the value that iRODS reports for the whole ' .
'data object. Each replicate, if any, also has its own ' .
'size value.');

# TODO: Add a check so that a DataObject cannot be built from a path
# that is in fact a collection.
around BUILDARGS => sub {
Expand Down Expand Up @@ -64,6 +78,13 @@ sub _build_checksum {
return $self->irods->checksum($self->str);
}

# Builder for the lazy 'size' attribute: asks the iRODS handle for the size
# of the data object at this object's string path.
sub _build_size {
    my $self = shift;

    my $irods = $self->irods;

    return $irods->size($self->str);
}

=head2 replicates
Arg [1] : None.
Expand Down Expand Up @@ -136,7 +157,7 @@ sub get_metadata {
Example : $path->is_present && print $path->str
Description: Return true if the data object file exists in iRODS.
Returntype : WTSI::NPG::iRODS::DataObject
Returntype : Bool
=cut

Expand All @@ -146,6 +167,65 @@ sub is_present {
return $self->irods->list_object($self->str);
}

=head2 is_consistent_size
Arg [1] : None
Example : $path->is_consistent_size && print $path->str
Description: Return true if the data object in iRODS is internally
consistent. This is defined as:
1. If the file is zero length, it has the checksum of an
empty file.
2. If the file is not zero length, it does not have the checksum
of an empty file.
This method looks for data object size and checksum consistency.
It checks the values that iRODS reports for the whole data
object; it does not check individual replicates.
If the data object is absent, this method returns true as there
can be no conflict where neither value exists.
If the data object has no checksum, this method returns true as
there is no evidence to dispute its reported size.
In iRODS <= 4.2.8 it is possible for a data object to get into a
bad state where it has zero length, but still reports as not
stale and having the checksum of the full-length file.
We can trigger this behaviour in iRODS by having more than one
client upload to a single path. iRODS does not support any
form of locking and so allows uncoordinated writes to the
filesystem. It does recognise this as a failure, but does not
clean up the damaged file.
Returntype : Bool
=cut

sub is_consistent_size {
    my ($self) = @_;

    # An absent data object has neither a size nor a checksum to disagree.
    $self->is_present or return 1;

    # Guard for a missing checksum. NOTE(review): the original author states
    # that this return is redundant because the checksum accessor raises an
    # exception when iRODS holds no checksum (isa constraint on the checksum
    # attribute, not visible here) - confirm before relying on either path.
    my $checksum = $self->checksum;
    $checksum or return 1;

    my $has_empty_checksum = $checksum eq $EMPTY_FILE_CHECKSUM;

    # Zero-length objects must carry the empty-file checksum; objects with a
    # non-zero size must not.
    return $self->size == 0 ? $has_empty_checksum : !$has_empty_checksum;
}

=head2 absolute
Arg [1] : None
Expand Down
Loading

0 comments on commit 7fe6182

Please sign in to comment.