-
Notifications
You must be signed in to change notification settings - Fork 1
/
searchForNoChecksumDatastreams.php
131 lines (106 loc) · 5.24 KB
/
searchForNoChecksumDatastreams.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env drush
#<?php
// Both positional arguments are mandatory. They are taken off the drush
// argument list in order, and the script stops with a usage hint as soon as
// one of them turns out to be missing.

// First argument: the collection to inspect (e.g. islandora:collection_name_here).
$collection = drush_shift();
if (empty($collection)) {
  drush_print("***Error: please provide the name of the collection as the first argument");
  drush_print("Example: drush php-script searchForNoChecksumDatastreams.php islandora:collection_name_here FULL_TEXT");
  return;
}

// Second argument: the datastream to check on every member (e.g. FULL_TEXT).
$dslabel = drush_shift();
if (empty($dslabel)) {
  drush_print("***ERROR: please provide the name of the datastream label as the second argument");
  drush_print("Example: drush php-script searchForNoChecksumDatastreams.php islandora:collection_name_here FULL_TEXT");
  return;
}
// Load every PHP file shipped in the Tuque library directory.
// glob() returns FALSE on error and may return an empty array when nothing
// matches, so only iterate when it actually produced a list of files.
$tuqueFiles = glob("/var/www/drupal/htdocs/sites/all/libraries/tuque/*.php");
if (is_array($tuqueFiles)) {
  foreach ($tuqueFiles as $filename) {
    require_once ($filename);
  }
}

// Fail with a readable message instead of a "class not found" fatal error
// when the Tuque classes used below are not available. The check is done on
// the classes rather than on the glob() result so the script keeps working
// when Tuque has already been loaded from somewhere else.
foreach (array('RepositoryConnection', 'FedoraApi', 'FedoraRepository', 'SimpleCache') as $requiredClass) {
  if (! class_exists($requiredClass)) {
    drush_print("***Error: Tuque class $requiredClass is not available. Check that Tuque is installed in /var/www/drupal/htdocs/sites/all/libraries/tuque");
    drush_print("***No processing was completed. Exiting.");
    return;
  }
}

// Repository connection parameters.
// NOTE(review): the Fedora URL and the default admin credentials are
// hard-coded here; consider reading them from site configuration instead.
$url = 'localhost:8080/fedora';
$username = 'fedoraAdmin';
$password = 'fedoraAdmin';

// Set up the connection and repository objects used by the rest of the script.
$connection = new RepositoryConnection($url, $username, $password);
$api = new FedoraApi($connection);
$repository = new FedoraRepository($api, new SimpleCache());
$api_m = $repository->api->m; // Fedora management API
$api_a = $repository->api->a; // presumably the Fedora access API - kept available for code further down the script
// Resource-index query: select every subject that declares an
// isMemberOfCollection relationship pointing at the requested collection.
$sparqlQuery = <<<SPARQL
SELECT ?s
FROM <#ri>
WHERE {
?s <info:fedora/fedora-system:def/relations-external#isMemberOfCollection>
<info:fedora/$collection> .
}
SPARQL;

// Run the query, telling the user when it starts and when it has finished.
drush_print("\nQuerying repository for all objects in the $collection collection...");
$allPDFObjects = $repository->ri->sparqlQuery($sparqlQuery);
drush_print("Query complete\n");

// An empty result set means there is nothing to process (most likely a wrong
// collection name), so stop here.
$totalNumObjects = count($allPDFObjects);
if ($totalNumObjects < 1) {
  drush_print("***Error: no objects found in the given collection. Check the collection name.");
  drush_print("***No processing was completed. Exiting.");
  return;
}

drush_print("There are $totalNumObjects objects to be processed");
// Result accumulators for the scan:
//   $noChecksumDatastreams   - versions other than "<datastream>.0" that lack a checksum
//   $objZeroChecksumsMissing - "<datastream>.0" versions that lack a checksum
//   $numberOfChecksums       - total number of datastream versions inspected
//   $objectsSkipped          - PIDs whose datastream history could not be retrieved
$noChecksumDatastreams = array();
$objZeroChecksumsMissing = array();
$numberOfChecksums = 0;
$objectsSkipped = array();

drush_print("\nBeginning main processing loop\n");

// 1-based position of the current record, shown to the user.
$realCount = 0;
foreach ($allPDFObjects as $theObject) {
  $realCount++;
  drush_print("Processing record $realCount of $totalNumObjects");

  // grab the PID value from the query result row
  $objectPID = $theObject['s']['value'];

  /****************** ONLY THE SPECIFIED DATASTREAM *****************/
  // A failed history request (for instance an object that does not carry
  // this datastream at all) must not abort the scan of the remaining
  // objects: report it, remember the PID and move on.
  try {
    // NOTE(review): the history is reversed, presumably so versions are
    // visited oldest first - confirm the order returned by Fedora.
    $allDSforObject = array_reverse($api_m->getDatastreamHistory($objectPID, $dslabel));
  }
  catch (RepositoryException $e) {
    $objectsSkipped[] = $objectPID;
    drush_print("***Warning: could not retrieve the $dslabel datastream history for $objectPID (" . $e->getMessage() . "), skipping");
    continue;
  }

  foreach ($allDSforObject as $objectDS) {
    $numberOfChecksums++;

    // A version counts as "missing a checksum" when the field is absent,
    // empty, or holds the literal value 'none'.
    if (! empty($objectDS['dsChecksum']) && $objectDS['dsChecksum'] !== 'none') {
      continue;
    }

    if ((string) $objectDS['dsVersionID'] === $dslabel . '.0') {
      // this is the originally ingested version, expected to be missing a checksum
      $objZeroChecksumsMissing[] = $objectDS;
    }
    else {
      // any version other than <datastream>.0 is not expected to be missing a checksum
      $noChecksumDatastreams[] = $objectDS;
    }
    drush_print("$objectPID is missing a checksum on the " . $objectDS['dsVersionID'] . " datastream");
  }
  /*******************************************************************/

  /************* ALL DATASTREAMS *******************/
  // Alternative mode, disabled: check every datastream of the object
  // instead of only the one named on the command line.
  /*
  $dsLabels = array_keys($api_a->listDatastreams($objectPID));
  $dsLabels = array_unique($dsLabels);
  // print_r($dsLabels);
  foreach($dsLabels as $theDSLabelkey => $theDSLabelvalue) {
  $allObjectDSs = array_reverse($api_m->getDatastreamHistory($objectPID, $theDSLabelvalue));
  foreach ($allObjectDSs as $objectDS) {
  $numberOfChecksums++;
  if (empty($objectDS['dsChecksum']) || $objectDS['dsChecksum'] == 'none') {
  $noChecksumDatastreams[] = $objectDS;
  // print_r($objectDS);
  drush_print("$objectPID is missing a checksum on the ".$objectDS['dsVersionID']." datastream");
  }
  }
  }
  */
  /***************************************************/
}

// Make skipped objects visible so the totals are not mistaken for a complete scan.
if (count($objectsSkipped) > 0) {
  drush_print("***Warning: " . count($objectsSkipped) . " object(s) were skipped because their $dslabel datastream history could not be retrieved");
}
// Size of each result list gathered by the main loop.
$objZeroNumberChecksumsMissing = count($objZeroChecksumsMissing);
$ttlWithNoChecksum = count($noChecksumDatastreams);

drush_print("\nMain processing loop complete\n");

// Report the ".0" versions separately from all other versions.
drush_print("There are {$objZeroNumberChecksumsMissing} {$dslabel}.0 datastreams without a checksum");
drush_print("There are {$ttlWithNoChecksum} out of {$numberOfChecksums} datastreams with missing checksums on the {$dslabel} datastream, not including the {$dslabel}.0 datastream");

echo "\n\nAll operations complete\n";