From 1d2d77630e9b1a3929114757e86cfa700d61a3fa Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Thu, 14 Nov 2024 19:19:28 -0500 Subject: [PATCH 01/11] preliminary #10977 --- .../datasetutility/AddReplaceFileHelper.java | 27 ++++------ .../datasetutility/OptionalFileParams.java | 22 +++++++- .../impl/CreateNewDataFilesCommand.java | 6 +++ .../dataverse/globus/GlobusServiceBean.java | 52 ++++++++++++++++--- .../dataverse/ingest/IngestServiceBean.java | 25 ++++++--- 5 files changed, 101 insertions(+), 31 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java b/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java index a470f08f736..3943e3ad7d8 100644 --- a/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java +++ b/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java @@ -138,7 +138,8 @@ public class AddReplaceFileHelper{ private String newStorageIdentifier; // step 30 private String newCheckSum; // step 30 private ChecksumType newCheckSumType; //step 30 - + private Long suppliedFileSize = null; + // -- Optional private DataFile fileToReplace; // step 25 @@ -610,11 +611,14 @@ private boolean runAddReplacePhase1(Dataset owner, return false; } - if(optionalFileParams != null) { - if(optionalFileParams.hasCheckSum()) { - newCheckSum = optionalFileParams.getCheckSum(); - newCheckSumType = optionalFileParams.getCheckSumType(); - } + if (optionalFileParams != null) { + if (optionalFileParams.hasCheckSum()) { + newCheckSum = optionalFileParams.getCheckSum(); + newCheckSumType = optionalFileParams.getCheckSumType(); + } + if (optionalFileParams.hasFileSize()) { + suppliedFileSize = optionalFileParams.getFileSize(); + } } msgt("step_030_createNewFilesViaIngest"); @@ -1204,20 +1208,11 @@ private boolean step_030_createNewFilesViaIngest(){ clone = workingVersion.cloneDatasetVersion(); } try { - /*CreateDataFileResult result = FileUtil.createDataFiles(workingVersion, - this.newFileInputStream, - this.newFileName, - this.newFileContentType, - this.newStorageIdentifier, - this.newCheckSum, - this.newCheckSumType, - this.systemConfig);*/ - UploadSessionQuotaLimit quota = null; if (systemConfig.isStorageQuotasEnforced()) { quota = fileService.getUploadSessionQuotaLimit(dataset); } - Command cmd = new CreateNewDataFilesCommand(dvRequest, workingVersion, newFileInputStream, newFileName, newFileContentType, newStorageIdentifier, quota, newCheckSum, newCheckSumType); + Command cmd = new CreateNewDataFilesCommand(dvRequest, workingVersion, newFileInputStream, newFileName, newFileContentType, newStorageIdentifier, quota, newCheckSum, newCheckSumType, suppliedFileSize); CreateDataFileResult createDataFilesResult = commandEngine.submit(cmd); initialFileList = createDataFilesResult.getDataFiles(); diff --git a/src/main/java/edu/harvard/iq/dataverse/datasetutility/OptionalFileParams.java b/src/main/java/edu/harvard/iq/dataverse/datasetutility/OptionalFileParams.java index 959dbc4e262..c1be6424a84 100644 --- a/src/main/java/edu/harvard/iq/dataverse/datasetutility/OptionalFileParams.java +++ b/src/main/java/edu/harvard/iq/dataverse/datasetutility/OptionalFileParams.java @@ -76,6 +76,8 @@ public class OptionalFileParams { public static final String MIME_TYPE_ATTR_NAME = "mimeType"; private String checkSumValue; private ChecksumType checkSumType; + public static final String FILE_SIZE_ATTR_NAME = "fileSize"; + private Long fileSize; public static final String LEGACY_CHECKSUM_ATTR_NAME = "md5Hash"; public static final String CHECKSUM_OBJECT_NAME = "checksum"; public static final String CHECKSUM_OBJECT_TYPE = "@type"; @@ -268,6 +270,18 @@ public String getCheckSum() { public ChecksumType getCheckSumType() { return checkSumType; } + + public boolean hasFileSize() { + return fileSize != null; + } + + public Long getFileSize() { + return fileSize; + } + + public void setFileSize(long fileSize) { + this.fileSize = fileSize; + } /** * Set tags @@ -416,7 +430,13 @@ else if ((jsonObj.has(CHECKSUM_OBJECT_NAME)) && (!jsonObj.get(CHECKSUM_OBJECT_NA this.checkSumType = ChecksumType.fromString(((JsonObject) jsonObj.get(CHECKSUM_OBJECT_NAME)).get(CHECKSUM_OBJECT_TYPE).getAsString()); } - + // ------------------------------- + // get file size as a Long, if supplied + // ------------------------------- + if ((jsonObj.has(FILE_SIZE_ATTR_NAME)) && (!jsonObj.get(FILE_SIZE_ATTR_NAME).isJsonNull())){ + + this.fileSize = jsonObj.get(FILE_SIZE_ATTR_NAME).getAsLong(); + } // ------------------------------- // get tags // ------------------------------- diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java index 76939751899..172c92dc1fd 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java @@ -93,6 +93,10 @@ public CreateNewDataFilesCommand(DataverseRequest aRequest, DatasetVersion versi this(aRequest, version, inputStream, fileName, suppliedContentType, newStorageIdentifier, quota, newCheckSum, newCheckSumType, null, null); } + public CreateNewDataFilesCommand(DataverseRequest aRequest, DatasetVersion version, InputStream inputStream, String fileName, String suppliedContentType, String newStorageIdentifier, UploadSessionQuotaLimit quota, String newCheckSum, DataFile.ChecksumType newCheckSumType, Long newFileSize) { + this(aRequest, version, inputStream, fileName, suppliedContentType, newStorageIdentifier, quota, newCheckSum, newCheckSumType, newFileSize, null); + } + // This version of the command must be used when files are created in the // context of creating a brand new dataset (from the Add Dataset page): @@ -636,6 +640,7 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException createIngestFailureReport(datafile, warningMessage); datafile.SetIngestProblem(); } + logger.info("datafile size: " + datafile.getFilesize()); if (datafile.getFilesize() < 0) { datafile.setFilesize(fileSize); } @@ -654,6 +659,7 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException quota.setTotalUsageInBytes(quota.getTotalUsageInBytes() + fileSize); } + logger.info("datafile size (again): " + datafile.getFilesize()); return CreateDataFileResult.success(fileName, finalType, datafiles); } diff --git a/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java index ac3c81622fc..3573b8e05df 100644 --- a/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java @@ -284,6 +284,48 @@ private int makeDir(GlobusEndpoint endpoint, String dir) { return result.status; } + private Map lookupFileSizes(GlobusEndpoint endpoint, String dir) { + Map ret = new HashMap<>(); + + MakeRequestResponse result; + + try { + URL url = new URL( + "https://transfer.api.globusonline.org/v0.10/operation/endpoint/" + endpoint.getId() + + "/ls?path=" + dir); + result = makeRequest(url, "Bearer", endpoint.getClientToken(), "GET", null); + + switch (result.status) { + case 200: + logger.fine("Looked up directory " + dir + " successfully."); + break; + default: + logger.warning("Status " + result.status + " received when looking up dir " + dir); + logger.fine("Response: " + result.jsonResponse); + } + } catch (MalformedURLException ex) { + // Misconfiguration + logger.warning("Failed to create dir on " + endpoint.getId()); + return null; + } + + JsonObject listObject = JsonUtil.getJsonObject(result.jsonResponse); + JsonArray dataArray = listObject.getJsonArray("DATA"); + + if (dataArray != null && !dataArray.isEmpty()) { + for (int i = 0; i < dataArray.size(); i++) { + String dataType = dataArray.getJsonObject(i).getString("DATA_TYPE", null); + if (dataType != null && dataType.equals("file")) { + String fileName = dataArray.getJsonObject(i).getString("name"); + long fileSize = dataArray.getJsonObject(i).getJsonNumber("size").longValueExact(); + ret.put(fileName, fileSize); + } + } + } + + return ret; + } + private int requestPermission(GlobusEndpoint endpoint, Dataset dataset, Permissions permissions) { Gson gson = new GsonBuilder().create(); MakeRequestResponse result = null; @@ -972,12 +1014,6 @@ private void processUploadedFiles(JsonArray filesJsonArray, Dataset dataset, Aut fileJsonObject = path.apply(fileJsonObject); addFilesJsonData.add(fileJsonObject); countSuccess++; - // } else { - // globusLogger.info(fileName - // + " will be skipped from adding to dataset by second API due to missing - // values "); - // countError++; - // } } else { myLogger.info(fileName + " will be skipped from adding to dataset in the final AddReplaceFileHelper.addFiles() call. "); @@ -1211,7 +1247,7 @@ private GlobusTaskState globusStatusCheck(GlobusEndpoint endpoint, String taskId return task; } - public JsonObject calculateMissingMetadataFields(List inputList, Logger globusLogger) + private JsonObject calculateMissingMetadataFields(List inputList, Logger globusLogger) throws InterruptedException, ExecutionException, IOException { List> hashvalueCompletableFutures = inputList.stream() @@ -1230,7 +1266,7 @@ public JsonObject calculateMissingMetadataFields(List inputList, Logger }); JsonArrayBuilder filesObject = (JsonArrayBuilder) completableFuture.get(); - + JsonObject output = Json.createObjectBuilder().add("files", filesObject).build(); return output; diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java index b42fd950528..fad02c76c78 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java @@ -344,10 +344,20 @@ public List saveAndAddFilesToDataset(DatasetVersion version, try { StorageIO dataAccess = DataAccess.getStorageIO(dataFile); //Populate metadata - dataAccess.open(DataAccessOption.READ_ACCESS); - // (the .open() above makes a remote call to check if - // the file exists and obtains its size) - confirmedFileSize = dataAccess.getSize(); + + // There are direct upload sub-cases where the file size + // is already known at this point. For example, direct uploads + // to S3 that go through the jsf dataset page. Or the Globus + // uploads, where the file sizes are looked up in bulk on + // the completion of the remote upload task. + if (dataFile.getFilesize() > 0) { + confirmedFileSize = dataFile.getFilesize(); + } else { + dataAccess.open(DataAccessOption.READ_ACCESS); + // (the .open() above makes a remote call to check if + // the file exists and obtains its size) + confirmedFileSize = dataAccess.getSize(); + } // For directly-uploaded files, we will perform the file size // limit and quota checks here. Perform them *again*, in @@ -362,13 +372,16 @@ public List saveAndAddFilesToDataset(DatasetVersion version, if (fileSizeLimit == null || confirmedFileSize < fileSizeLimit) { //set file size - logger.fine("Setting file size: " + confirmedFileSize); - dataFile.setFilesize(confirmedFileSize); + if (dataFile.getFilesize() < 1) { + logger.fine("Setting file size: " + confirmedFileSize); + dataFile.setFilesize(confirmedFileSize); + } if (dataAccess instanceof S3AccessIO) { ((S3AccessIO) dataAccess).removeTempTag(); } savedSuccess = true; + logger.info("directly uploaded file successfully saved. file size: "+dataFile.getFilesize()); } } catch (IOException ioex) { logger.warning("Failed to get file size, storage id, or failed to remove the temp tag on the saved S3 object" + dataFile.getStorageIdentifier() + " (" From eef0d22ff277152409870484e781069ebf10cab5 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Mon, 18 Nov 2024 17:37:34 -0500 Subject: [PATCH 02/11] more incremental changes #10977 --- .../dataaccess/GlobusOverlayAccessIO.java | 2 +- .../dataverse/globus/GlobusServiceBean.java | 26 ++++++++++++++++--- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/dataaccess/GlobusOverlayAccessIO.java b/src/main/java/edu/harvard/iq/dataverse/dataaccess/GlobusOverlayAccessIO.java index 3bf2107e52b..d0da66c38e0 100644 --- a/src/main/java/edu/harvard/iq/dataverse/dataaccess/GlobusOverlayAccessIO.java +++ b/src/main/java/edu/harvard/iq/dataverse/dataaccess/GlobusOverlayAccessIO.java @@ -215,7 +215,7 @@ public long retrieveSizeFromMedia() { JsonArray dataArray = responseJson.getJsonArray("DATA"); if (dataArray != null && dataArray.size() != 0) { //File found - return (long) responseJson.getJsonArray("DATA").getJsonObject(0).getJsonNumber("size").longValueExact(); + return (long) dataArray.getJsonObject(0).getJsonNumber("size").longValueExact(); } } else { logger.warning("Response from " + get.getURI().toString() + " was " diff --git a/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java index 3573b8e05df..013fefd1e34 100644 --- a/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java @@ -74,6 +74,7 @@ import edu.harvard.iq.dataverse.util.URLTokenUtil; import edu.harvard.iq.dataverse.util.UrlSignerUtil; import edu.harvard.iq.dataverse.util.json.JsonUtil; +import jakarta.json.JsonNumber; import jakarta.json.JsonReader; import jakarta.persistence.EntityManager; import jakarta.persistence.PersistenceContext; @@ -980,9 +981,16 @@ private void processUploadedFiles(JsonArray filesJsonArray, Dataset dataset, Aut inputList.add(fileId + "IDsplit" + fullPath + "IDsplit" + fileName); } + + // Look up the sizes of all the files in the dataset folder, to avoid + // looking them up one by one later: + // @todo: we should only be doing this if this is a managed store, probably? + GlobusEndpoint endpoint = getGlobusEndpoint(dataset); + Map fileSizeMap = lookupFileSizes(endpoint, endpoint.getBasePath()); // calculateMissingMetadataFields: checksum, mimetype JsonObject newfilesJsonObject = calculateMissingMetadataFields(inputList, myLogger); + JsonArray newfilesJsonArray = newfilesJsonObject.getJsonArray("files"); logger.fine("Size: " + newfilesJsonArray.size()); logger.fine("Val: " + JsonUtil.prettyPrint(newfilesJsonArray.getJsonObject(0))); @@ -1006,13 +1014,23 @@ private void processUploadedFiles(JsonArray filesJsonArray, Dataset dataset, Aut if (newfileJsonObject != null) { logger.fine("List Size: " + newfileJsonObject.size()); // if (!newfileJsonObject.get(0).getString("hash").equalsIgnoreCase("null")) { - JsonPatch path = Json.createPatchBuilder() + JsonPatch patch = Json.createPatchBuilder() .add("/md5Hash", newfileJsonObject.get(0).getString("hash")).build(); - fileJsonObject = path.apply(fileJsonObject); - path = Json.createPatchBuilder() + fileJsonObject = patch.apply(fileJsonObject); + patch = Json.createPatchBuilder() .add("/mimeType", newfileJsonObject.get(0).getString("mime")).build(); - fileJsonObject = path.apply(fileJsonObject); + fileJsonObject = patch.apply(fileJsonObject); addFilesJsonData.add(fileJsonObject); + // If we already know the size of this file on the Globus end, + // we'll pass it to /addFiles, to avoid looking up file sizes + // one by one: + if (fileSizeMap != null && fileSizeMap.get(fileId) != null) { + Long uploadedFileSize = fileSizeMap.get(fileId); + myLogger.fine("Found size for file " + fileId + ": " + uploadedFileSize + " bytes"); + patch = Json.createPatchBuilder() + .add("/fileSize", Json.createValue(uploadedFileSize)).build(); + fileJsonObject = patch.apply(fileJsonObject); + } countSuccess++; } else { myLogger.info(fileName From 62937872c5cb7b408ce90cd79f26f6cf45921721 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Tue, 19 Nov 2024 16:37:05 -0500 Subject: [PATCH 03/11] a quick bug fix; changed verbose logging to .fine. #10977 --- .../datasetutility/OptionalFileParams.java | 6 ++++++ .../impl/CreateNewDataFilesCommand.java | 2 -- .../iq/dataverse/globus/GlobusServiceBean.java | 18 ++++++++++++------ 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/datasetutility/OptionalFileParams.java b/src/main/java/edu/harvard/iq/dataverse/datasetutility/OptionalFileParams.java index c1be6424a84..54844160163 100644 --- a/src/main/java/edu/harvard/iq/dataverse/datasetutility/OptionalFileParams.java +++ b/src/main/java/edu/harvard/iq/dataverse/datasetutility/OptionalFileParams.java @@ -39,6 +39,12 @@ * - Provenance related information * * @author rmp553 + * @todo (?) We may want to consider renaming this class to DataFileParams or + * DataFileInfo... it was originally created to encode some bits of info - + * the file "tags" specifically, that didn't fit in elsewhere in the normal + * workflow; but it's been expanded to cover pretty much everything else associated + * with DataFiles and it's not really "optional" anymore when, for example, used + * in the direct upload workflow. (?) */ public class OptionalFileParams { diff --git a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java index 172c92dc1fd..e9a2025b112 100644 --- a/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java +++ b/src/main/java/edu/harvard/iq/dataverse/engine/command/impl/CreateNewDataFilesCommand.java @@ -640,7 +640,6 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException createIngestFailureReport(datafile, warningMessage); datafile.SetIngestProblem(); } - logger.info("datafile size: " + datafile.getFilesize()); if (datafile.getFilesize() < 0) { datafile.setFilesize(fileSize); } @@ -659,7 +658,6 @@ public CreateDataFileResult execute(CommandContext ctxt) throws CommandException quota.setTotalUsageInBytes(quota.getTotalUsageInBytes() + fileSize); } - logger.info("datafile size (again): " + datafile.getFilesize()); return CreateDataFileResult.success(fileName, finalType, datafiles); } diff --git a/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java index 013fefd1e34..3d1c5a1044d 100644 --- a/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java @@ -285,12 +285,11 @@ private int makeDir(GlobusEndpoint endpoint, String dir) { return result.status; } - private Map lookupFileSizes(GlobusEndpoint endpoint, String dir) { - Map ret = new HashMap<>(); - + private Map lookupFileSizes(GlobusEndpoint endpoint, String dir) { MakeRequestResponse result; try { + logger.fine("Attempting to look up the contents of the Globus folder "+dir); URL url = new URL( "https://transfer.api.globusonline.org/v0.10/operation/endpoint/" + endpoint.getId() + "/ls?path=" + dir); @@ -303,13 +302,16 @@ private Map lookupFileSizes(GlobusEndpoint endpoint, String dir) { default: logger.warning("Status " + result.status + " received when looking up dir " + dir); logger.fine("Response: " + result.jsonResponse); + return null; } } catch (MalformedURLException ex) { // Misconfiguration - logger.warning("Failed to create dir on " + endpoint.getId()); + logger.warning("Failed to list the contents of the directory "+ dir + " on endpoint " + endpoint.getId()); return null; } + Map ret = new HashMap<>(); + JsonObject listObject = JsonUtil.getJsonObject(result.jsonResponse); JsonArray dataArray = listObject.getJsonArray("DATA"); @@ -317,6 +319,8 @@ private Map lookupFileSizes(GlobusEndpoint endpoint, String dir) { for (int i = 0; i < dataArray.size(); i++) { String dataType = dataArray.getJsonObject(i).getString("DATA_TYPE", null); if (dataType != null && dataType.equals("file")) { + // is it safe to assume that any entry with a valid "DATA_TYPE": "file" + // will also have valid "name" and "size" entries? String fileName = dataArray.getJsonObject(i).getString("name"); long fileSize = dataArray.getJsonObject(i).getJsonNumber("size").longValueExact(); ret.put(fileName, fileSize); @@ -1020,17 +1024,19 @@ private void processUploadedFiles(JsonArray filesJsonArray, Dataset dataset, Aut patch = Json.createPatchBuilder() .add("/mimeType", newfileJsonObject.get(0).getString("mime")).build(); fileJsonObject = patch.apply(fileJsonObject); - addFilesJsonData.add(fileJsonObject); // If we already know the size of this file on the Globus end, // we'll pass it to /addFiles, to avoid looking up file sizes // one by one: if (fileSizeMap != null && fileSizeMap.get(fileId) != null) { Long uploadedFileSize = fileSizeMap.get(fileId); - myLogger.fine("Found size for file " + fileId + ": " + uploadedFileSize + " bytes"); + myLogger.info("Found size for file " + fileId + ": " + uploadedFileSize + " bytes"); patch = Json.createPatchBuilder() .add("/fileSize", Json.createValue(uploadedFileSize)).build(); fileJsonObject = patch.apply(fileJsonObject); + } else { + logger.warning("No file size entry found for file "+fileId); } + addFilesJsonData.add(fileJsonObject); countSuccess++; } else { myLogger.info(fileName From a1f057287ee1297e8ddf5ab9c0f8ec94dbcf640f Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Wed, 20 Nov 2024 10:42:30 -0500 Subject: [PATCH 04/11] added a configurable batch size limit for when to apply the single file size lookup method for the entire batch. #10977 --- .../iq/dataverse/globus/GlobusServiceBean.java | 16 ++++++++++------ .../dataverse/settings/SettingsServiceBean.java | 6 ++++++ .../harvard/iq/dataverse/util/SystemConfig.java | 6 ++++++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java index 3d1c5a1044d..5c9a2f1d946 100644 --- a/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java @@ -986,11 +986,15 @@ private void processUploadedFiles(JsonArray filesJsonArray, Dataset dataset, Aut inputList.add(fileId + "IDsplit" + fullPath + "IDsplit" + fileName); } - // Look up the sizes of all the files in the dataset folder, to avoid - // looking them up one by one later: - // @todo: we should only be doing this if this is a managed store, probably? - GlobusEndpoint endpoint = getGlobusEndpoint(dataset); - Map fileSizeMap = lookupFileSizes(endpoint, endpoint.getBasePath()); + Map fileSizeMap = null; + + if (filesJsonArray.size() >= systemConfig.getGlobusBatchLookupSize()) { + // Look up the sizes of all the files in the dataset folder, to avoid + // looking them up one by one later: + // @todo: we should only be doing this if this is a managed store, probably (?) + GlobusEndpoint endpoint = getGlobusEndpoint(dataset); + fileSizeMap = lookupFileSizes(endpoint, endpoint.getBasePath()); + } // calculateMissingMetadataFields: checksum, mimetype JsonObject newfilesJsonObject = calculateMissingMetadataFields(inputList, myLogger); @@ -1034,7 +1038,7 @@ private void processUploadedFiles(JsonArray filesJsonArray, Dataset dataset, Aut .add("/fileSize", Json.createValue(uploadedFileSize)).build(); fileJsonObject = patch.apply(fileJsonObject); } else { - logger.warning("No file size entry found for file "+fileId); + logger.fine("No file size entry found for file "+fileId); } addFilesJsonData.add(fileJsonObject); countSuccess++; diff --git a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java index 8ed96690e84..b5eb483c2c8 100644 --- a/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/settings/SettingsServiceBean.java @@ -539,6 +539,12 @@ Whether Harvesting (OAI) service is enabled * */ GlobusSingleFileTransfer, + /** Lower limit of the number of files in a Globus upload task where + * the batch mode should be utilized in looking up the file information + * on the remote end node (file sizes, primarily), instead of individual + * lookups. + */ + GlobusBatchLookupSize, /** * Optional external executables to run on the metadata for dataverses * and datasets being published; as an extra validation step, to diff --git a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java index 434b3bd8f8f..e769cacfdb1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java +++ b/src/main/java/edu/harvard/iq/dataverse/util/SystemConfig.java @@ -78,6 +78,7 @@ public class SystemConfig { public static final long defaultZipDownloadLimit = 104857600L; // 100MB private static final int defaultMultipleUploadFilesLimit = 1000; private static final int defaultLoginSessionTimeout = 480; // = 8 hours + private static final int defaultGlobusBatchLookupSize = 50; private String buildNumber = null; @@ -954,6 +955,11 @@ public boolean isGlobusFileDownload() { return (isGlobusDownload() && settingsService.isTrueForKey(SettingsServiceBean.Key.GlobusSingleFileTransfer, false)); } + public int getGlobusBatchLookupSize() { + String batchSizeOption = settingsService.getValueForKey(SettingsServiceBean.Key.GlobusBatchLookupSize); + return getIntLimitFromStringOrDefault(batchSizeOption, defaultGlobusBatchLookupSize); + } + private Boolean getMethodAvailable(String method, boolean upload) { String methods = settingsService.getValueForKey( upload ? SettingsServiceBean.Key.UploadMethods : SettingsServiceBean.Key.DownloadMethods); From 536c1bfe9466242797ce5a037936560d5cd0e197 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Wed, 20 Nov 2024 12:08:21 -0500 Subject: [PATCH 05/11] release note #10977 --- doc/release-notes/10977-globus-filesize-lookup.md | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 doc/release-notes/10977-globus-filesize-lookup.md diff --git a/doc/release-notes/10977-globus-filesize-lookup.md b/doc/release-notes/10977-globus-filesize-lookup.md new file mode 100644 index 00000000000..49fd10d9ffe --- /dev/null +++ b/doc/release-notes/10977-globus-filesize-lookup.md @@ -0,0 +1,6 @@ +## A new Globus optimization setting + +An optimization has been added for the Globus upload workflow, with a corresponding new database setting: `:GlobusBatchLookupSize` + + +See the [Database Settings](https://guides.dataverse.org/en/6.5/installation/config.html#GlobusBatchLookupSize) section of the Guides for more information. \ No newline at end of file From 617b13ae40fbed89a14816aa0c07cb10b228db6d Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Wed, 20 Nov 2024 12:11:25 -0500 Subject: [PATCH 06/11] configuration guide entry #10977 --- doc/sphinx-guides/source/installation/config.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/sphinx-guides/source/installation/config.rst b/doc/sphinx-guides/source/installation/config.rst index e3965e3cd7c..30a36da9499 100644 --- a/doc/sphinx-guides/source/installation/config.rst +++ b/doc/sphinx-guides/source/installation/config.rst @@ -4849,6 +4849,13 @@ The URL where the `dataverse-globus Date: Mon, 25 Nov 2024 17:30:30 -0500 Subject: [PATCH 07/11] Per review feedback, made it impossible to supply the file sizes via the /addFiles API (i.e., we don't want to trust the users of the direct s3 upload api when it comes to file sizes). #10977 --- .../datasetutility/AddReplaceFileHelper.java | 48 ++++++++++++------- .../dataverse/globus/GlobusServiceBean.java | 2 +- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java b/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java index 3943e3ad7d8..6b98848021c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java +++ b/src/main/java/edu/harvard/iq/dataverse/datasetutility/AddReplaceFileHelper.java @@ -136,10 +136,7 @@ public class AddReplaceFileHelper{ private String newFileName; // step 30 private String newFileContentType; // step 30 private String newStorageIdentifier; // step 30 - private String newCheckSum; // step 30 - private ChecksumType newCheckSumType; //step 30 - private Long suppliedFileSize = null; - + // -- Optional private DataFile fileToReplace; // step 25 @@ -147,6 +144,7 @@ public class AddReplaceFileHelper{ private DatasetVersion clone; List initialFileList; List finalFileList; + private boolean trustSuppliedFileSizes; // ----------------------------------- // Ingested files @@ -611,18 +609,9 @@ private boolean runAddReplacePhase1(Dataset owner, return false; } - if (optionalFileParams != null) { - if (optionalFileParams.hasCheckSum()) { - newCheckSum = optionalFileParams.getCheckSum(); - newCheckSumType = optionalFileParams.getCheckSumType(); - } - if (optionalFileParams.hasFileSize()) { - suppliedFileSize = optionalFileParams.getFileSize(); - } - } msgt("step_030_createNewFilesViaIngest"); - if (!this.step_030_createNewFilesViaIngest()){ + if (!this.step_030_createNewFilesViaIngest(optionalFileParams)){ return false; } @@ -1195,7 +1184,7 @@ private boolean step_007_auto_isReplacementInLatestVersion(DataFile existingFile } - private boolean step_030_createNewFilesViaIngest(){ + private boolean step_030_createNewFilesViaIngest(OptionalFileParams optionalFileParams){ if (this.hasError()){ return false; @@ -1207,6 +1196,22 @@ private boolean step_030_createNewFilesViaIngest(){ //Don't repeatedly update the clone (losing changes) in multifile case clone = workingVersion.cloneDatasetVersion(); } + + Long suppliedFileSize = null; + String newCheckSum = null; + ChecksumType newCheckSumType = null; + + + if (optionalFileParams != null) { + if (optionalFileParams.hasCheckSum()) { + newCheckSum = optionalFileParams.getCheckSum(); + newCheckSumType = optionalFileParams.getCheckSumType(); + } + if (trustSuppliedFileSizes && optionalFileParams.hasFileSize()) { + suppliedFileSize = optionalFileParams.getFileSize(); + } + } + try { UploadSessionQuotaLimit quota = null; if (systemConfig.isStorageQuotasEnforced()) { @@ -2028,9 +2033,15 @@ public void setDuplicateFileWarning(String duplicateFileWarning) { * @param jsonData - an array of jsonData entries (one per file) using the single add file jsonData format * @param dataset * @param authUser + * @param trustSuppliedSizes - whether to accept the fileSize values passed + * in jsonData (we don't want to trust the users of the S3 direct + * upload API with that information - we will verify the status of + * the files in the S3 bucket and confirm the sizes in the process. + * we do want GlobusService to be able to pass the file sizes, since + * they are obtained and verified via a Globus API lookup). * @return */ - public Response addFiles(String jsonData, Dataset dataset, User authUser) { + public Response addFiles(String jsonData, Dataset dataset, User authUser, boolean trustSuppliedFileSizes) { msgt("(addFilesToDataset) jsonData: " + jsonData.toString()); JsonArrayBuilder jarr = Json.createArrayBuilder(); @@ -2039,6 +2050,7 @@ public Response addFiles(String jsonData, Dataset dataset, User authUser) { int totalNumberofFiles = 0; int successNumberofFiles = 0; + this.trustSuppliedFileSizes = trustSuppliedFileSizes; // ----------------------------------------------------------- // Read jsonData and Parse files information from jsondata : // ----------------------------------------------------------- @@ -2171,6 +2183,10 @@ public Response addFiles(String jsonData, Dataset dataset, User authUser) { .add("data", Json.createObjectBuilder().add("Files", jarr).add("Result", result)).build() ).build(); } + public Response addFiles(String jsonData, Dataset dataset, User authUser) { + return addFiles(jsonData, dataset, authUser, false); + } + /** * Replace multiple files with prepositioned replacements as listed in the * jsonData. Works with direct upload, Globus, and other out-of-band methods. diff --git a/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java index 5c9a2f1d946..58992805dc8 100644 --- a/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/globus/GlobusServiceBean.java @@ -1093,7 +1093,7 @@ private void processUploadedFiles(JsonArray filesJsonArray, Dataset dataset, Aut // The old code had 2 sec. of sleep, so ... Thread.sleep(2000); - Response addFilesResponse = addFileHelper.addFiles(newjsonData, dataset, authUser); + Response addFilesResponse = addFileHelper.addFiles(newjsonData, dataset, authUser, true); if (addFilesResponse == null) { logger.info("null response from addFiles call"); From 3cd9a82d381f543cd3cb9b3a5560cb44bee1bbee Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Mon, 25 Nov 2024 17:39:46 -0500 Subject: [PATCH 08/11] We do support 0-size files! #10977 --- .../java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java index fad02c76c78..52b7d4f1861 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java @@ -350,7 +350,7 @@ public List saveAndAddFilesToDataset(DatasetVersion version, // to S3 that go through the jsf dataset page. Or the Globus // uploads, where the file sizes are looked up in bulk on // the completion of the remote upload task. - if (dataFile.getFilesize() > 0) { + if (dataFile.getFilesize() >= 0) { confirmedFileSize = dataFile.getFilesize(); } else { dataAccess.open(DataAccessOption.READ_ACCESS); From 644a52491665d4e0f43a655f6b9094a2c41848b0 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Mon, 25 Nov 2024 18:33:37 -0500 Subject: [PATCH 09/11] added a bunch of globus-related entries that were missing from the bundle, per #11030 (these are used by the notification page, apparently??) --- src/main/java/propertyFiles/Bundle.properties | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/main/java/propertyFiles/Bundle.properties b/src/main/java/propertyFiles/Bundle.properties index 012b389ce32..2b74e24ea29 100644 --- a/src/main/java/propertyFiles/Bundle.properties +++ b/src/main/java/propertyFiles/Bundle.properties @@ -307,7 +307,13 @@ notification.typeDescription.WORKFLOW_FAILURE=External workflow run has failed notification.typeDescription.STATUSUPDATED=Status of dataset has been updated notification.typeDescription.DATASETCREATED=Dataset was created by user notification.typeDescription.DATASETMENTIONED=Dataset was referenced in remote system - +notification.typeDescription.GLOBUSUPLOADCOMPLETED=Globus upload is completed +notification.typeDescription.GLOBUSUPLOADCOMPLETEDWITHERRORS=Globus upload completed with errors +notification.typeDescription.GLOBUSDOWNLOADCOMPLETED=Globus download is completed +notification.typeDescription.GLOBUSDOWNLOADCOMPLETEDWITHERRORS=Globus download completed with errors +notification.typeDescription.GLOBUSUPLOADLOCALFAILURE=Globus upload failed, internal error +notification.typeDescription.GLOBUSUPLOADREMOTEFAILURE=Globus upload failed, remote transfer error +notification.typeDescription.REQUESTEDFILEACCESS=File access requested groupAndRoles.manageTips=Here is where you can access and manage all the groups you belong to, and the roles you have been assigned. user.message.signup.label=Create Account user.message.signup.tip=Why have a Dataverse account? To create your own dataverse and customize it, add datasets, or request access to restricted files. From 28e25eae93de311a2eb4e5e4971d5463fafb9193 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Mon, 25 Nov 2024 18:52:35 -0500 Subject: [PATCH 10/11] one more missing notification entry in the bundle #10977 --- src/main/java/edu/harvard/iq/dataverse/MailServiceBean.java | 2 +- src/main/java/propertyFiles/Bundle.properties | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/MailServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/MailServiceBean.java index 2995c0c5f47..c67a0293847 100644 --- a/src/main/java/edu/harvard/iq/dataverse/MailServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/MailServiceBean.java @@ -283,7 +283,7 @@ public Boolean sendNotificationEmail(UserNotification notification, String comme if (objectOfNotification != null){ String messageText = getMessageTextBasedOnNotification(notification, objectOfNotification, comment, requestor); String subjectText = MailUtil.getSubjectTextBasedOnNotification(notification, objectOfNotification); - if (!(messageText.isEmpty() || subjectText.isEmpty())){ + if (!(StringUtils.isEmpty(messageText) || StringUtils.isEmpty(subjectText))){ retval = sendSystemEmail(emailAddress, subjectText, messageText, isHtmlContent); } else { logger.warning("Skipping " + notification.getType() + " notification, because couldn't get valid message"); diff --git a/src/main/java/propertyFiles/Bundle.properties b/src/main/java/propertyFiles/Bundle.properties index 2b74e24ea29..02bc19f86cf 100644 --- a/src/main/java/propertyFiles/Bundle.properties +++ b/src/main/java/propertyFiles/Bundle.properties @@ -843,7 +843,8 @@ notification.email.datasetWasMentioned=Hello {0},

The {1} has just been notification.email.datasetWasMentioned.subject={0}: A Dataset Relationship has been reported! notification.email.globus.uploadCompleted.subject={0}: Files uploaded successfully via Globus and verified notification.email.globus.downloadCompleted.subject={0}: Files downloaded successfully via Globus -notification.email.globus.uploadCompletedWithErrors.subject={0}: Uploaded files via Globus with errors +notification.email.globus.downloadCompletedWithErrors.subject={0}: Globus download task completed, errors encountered +notification.email.globus.uploadCompletedWithErrors.subject={0}: Globus upload task completed with errors notification.email.globus.uploadFailedRemotely.subject={0}: Failed to upload files via Globus notification.email.globus.uploadFailedLocally.subject={0}: Failed to add files uploaded via Globus to dataset # dataverse.xhtml From 1325cee6cc7aa707481db68590fa16b65a7ad2ef Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Tue, 26 Nov 2024 09:47:26 -0500 Subject: [PATCH 11/11] This should make the filesize setting logic less confusing potentially #10977 --- .../java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java index 52b7d4f1861..71c498a4d0b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/ingest/IngestServiceBean.java @@ -372,7 +372,7 @@ public List saveAndAddFilesToDataset(DatasetVersion version, if (fileSizeLimit == null || confirmedFileSize < fileSizeLimit) { //set file size - if (dataFile.getFilesize() < 1) { + if (dataFile.getFilesize() < 0) { logger.fine("Setting file size: " + confirmedFileSize); dataFile.setFilesize(confirmedFileSize); }