
Implemented delete-metadata, wip #9

Merged · 4 commits · Oct 25, 2024
Changes from all commits
2 changes: 1 addition & 1 deletion pom.xml
@@ -80,7 +80,7 @@
<dependency>
<groupId>nl.knaw.dans</groupId>
<artifactId>dans-dataverse-client-lib</artifactId>
-<version>0.33.0</version>
+<version>0.33.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
2 changes: 2 additions & 0 deletions src/main/java/nl/knaw/dans/dvcli/DdDataverseCli.java
@@ -37,6 +37,7 @@
import nl.knaw.dans.dvcli.command.collection.roleassignment.CollectionRoleAssignmentRemove;
import nl.knaw.dans.dvcli.command.dataset.DatasetCmd;
import nl.knaw.dans.dvcli.command.dataset.DatasetDeleteDraft;
+import nl.knaw.dans.dvcli.command.dataset.DatasetDeleteMetadata;
import nl.knaw.dans.dvcli.command.dataset.DatasetGetFiles;
import nl.knaw.dans.dvcli.command.dataset.DatasetGetLatestVersion;
import nl.knaw.dans.dvcli.command.dataset.DatasetGetVersion;
@@ -95,6 +96,7 @@ public void configureCommandLine(CommandLine commandLine, DdDataverseCliConfig c
.addSubcommand(new DatasetGetLatestVersion())
.addSubcommand(new DatasetGetVersion())
.addSubcommand(new DatasetPublish())
+.addSubcommand(new DatasetDeleteMetadata())
.addSubcommand(new CommandLine(new DatasetRoleAssignment())
.addSubcommand(new DatasetRoleAssignmentList())
.addSubcommand(new DatasetRoleAssignmentAdd())
49 changes: 37 additions & 12 deletions src/main/java/nl/knaw/dans/dvcli/action/BatchProcessor.java
@@ -19,7 +19,9 @@
import lombok.NonNull;
import lombok.extern.slf4j.Slf4j;

-import java.util.List;
+import java.util.Collection;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.stream.Stream;

/**
* Processes a batch of labeled items by applying an action to each item. The labels are used for reporting. Typically, the label is the ID of the item. After each action, the processor waits for a
@@ -31,12 +33,17 @@
*/
@Builder
@Slf4j
public class BatchProcessor<I, R> {
/**
-* The labeled items to process.
+* The labeled items to process. The String is the label, <code>I</code> is the item.
*/
@NonNull
-private final List<Pair<String, I>> labeledItems;
+private final Stream<Pair<String, I>> labeledItems;

+/**
+* The number of items to process. If the labeled items are a collection, this number is the size of the collection. Otherwise, it is null.
+*/
+private final Long numberOfItems;

/**
* The action to apply to each item.
@@ -56,15 +63,33 @@ public class BatchProcessor<I, R> {
@Builder.Default
private final long delay = 1000;

+public static class BatchProcessorBuilder<I, R> {
+    public BatchProcessorBuilder<I, R> labeledItems(Collection<Pair<String, I>> items) {
+        this.labeledItems = items.stream();
+        this.numberOfItems = (long) items.size();
+        return this;
+    }
+
+    public BatchProcessorBuilder<I, R> labeledItems(Stream<Pair<String, I>> items) {
+        this.labeledItems = items;
+        return this;
+    }
+}

public void process() {
log.info("Starting batch processing");
int i = 0;
for (var labeledItem : labeledItems) {
delayIfNeeded(i);
log.info("Processing item {} of {}", ++i, labeledItems.size());
callAction(labeledItem.getFirst(), labeledItem.getSecond());
log.info("Starting batch processing of " + (numberOfItems == null ? "?" : numberOfItems + " items"));
AtomicInteger i = new AtomicInteger(0);
try {
labeledItems.forEach(labeledItem -> {
int index = i.incrementAndGet();
delayIfNeeded(index);
log.info("Processing item {} of {}: {}", index, numberOfItems == null ? "?" : numberOfItems, labeledItem.getFirst());
callAction(labeledItem.getFirst(), labeledItem.getSecond());
});
} finally {
labeledItems.close();
}
log.info("Finished batch processing of {} items", labeledItems.size());
log.info("Finished batch processing of " + (numberOfItems == null ? "?" : numberOfItems + " items"));
}

private void callAction(String label, I item) {
@@ -78,7 +103,7 @@ private void callAction(String label, I item) {
}

private void delayIfNeeded(int i) {
-if (delay > 0 && i > 0) {
+if (delay > 0 && i > 1) {
log.debug("Sleeping for {} ms", delay);
try {
Thread.sleep(delay);
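Usage sketch (not part of the diff): how the reworked builder can be driven. The labels, items, and action below are hypothetical; labeledItems, action, report, and delay are the builder properties shown above, and ConsoleReport/Pair come from nl.knaw.dans.dvcli.action.

import nl.knaw.dans.dvcli.action.BatchProcessor;
import nl.knaw.dans.dvcli.action.ConsoleReport;
import nl.knaw.dans.dvcli.action.Pair;

import java.util.List;

public class BatchProcessorExample {
    public static void main(String[] args) {
        // Passing a Collection lets the builder record numberOfItems (here 2),
        // so progress is logged as "1 of 2"; a Stream would log "1 of ?".
        BatchProcessor.<String, String> builder()
            .labeledItems(List.of(
                new Pair<>("doi:10.5072/AAA", "item-1"),   // hypothetical label/item pairs
                new Pair<>("doi:10.5072/BBB", "item-2")))
            .action(item -> "processed " + item)           // the action may throw; its result is reported per item
            .report(new ConsoleReport<>())
            .delay(500)                                    // ms between items; the first item is not delayed
            .build()
            .process();
    }
}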
src/main/java/nl/knaw/dans/dvcli/command/AbstractSubcommandContainer.java
@@ -29,6 +29,10 @@
import java.io.IOException;
import java.util.List;

+/**
+ *
+ * @param <T>
+ */
public abstract class AbstractSubcommandContainer<T> extends AbstractCmd {
private static final long DEFAULT_DELAY = 1000;

114 changes: 114 additions & 0 deletions src/main/java/nl/knaw/dans/dvcli/command/dataset/DatasetDeleteMetadata.java
@@ -0,0 +1,114 @@
/*
* Copyright (C) 2024 DANS - Data Archiving and Networked Services ([email protected])
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nl.knaw.dans.dvcli.command.dataset;

import lombok.Value;
import nl.knaw.dans.dvcli.action.Pair;
import nl.knaw.dans.dvcli.action.ThrowingFunction;
import nl.knaw.dans.dvcli.command.AbstractCmd;
import nl.knaw.dans.dvcli.inputparsers.FieldValuesParamsFileParser;
import nl.knaw.dans.dvcli.inputparsers.FieldValuesParser;
import nl.knaw.dans.lib.dataverse.DatasetApi;
import nl.knaw.dans.lib.dataverse.model.dataset.FieldList;
import nl.knaw.dans.lib.dataverse.model.dataset.MetadataField;
import picocli.CommandLine.ArgGroup;
import picocli.CommandLine.Command;
import picocli.CommandLine.Option;
import picocli.CommandLine.ParentCommand;

import java.nio.file.Path;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Set;
import java.util.stream.Stream;

@Command(name = "delete-metadata",
mixinStandardHelpOptions = true,
description = """
Delete metadata fields from a dataset. The fields to delete can be specified as command line options or in a CSV file. The dataset will be in draft state after the operation.
""")
public class DatasetDeleteMetadata extends AbstractCmd {
@ParentCommand
private DatasetCmd datasetCmd;

static class FieldValueOrParameterFile {
@Option(names = { "-f",
"--field-value" }, description = "Field name and value to delete. If the field is a compound field, multiple field-values specified together will be treated as a single compound field. "
+ "If you need to delete multiple values from the same field, you have to call this command multiple times. "
+ "The format is: field-name=field-value. For example, to delete a field named 'alternativeTitle' with value 'Some title', use --field-value 'alternativeTitle=Some title'. "
+ "For compound fields, the field name must be prefixed with the field name of the parent field e.g., 'author.authorName' for the subfield 'authorName' of the compound field 'author'. "
+ "If the field is repeatable, you must add an asterisk (*) at the end of the field name.")
private List<String> fieldValues;

@Option(names = { "-p", "--parameters-file" }, description = "Path to a CSV file containing the field names and values to delete. The file must have a header row with the field names. "
+ "Each subsequent row must contain the field values. There must be a column 'PID' containing the dataset persistent identifier. The other column headers must match field names in "
+ "the dataset metadata. Compound fields must be specified as 'parentField.childField'. If you need to delete multiple fields from one dataset, use multiple rows in the CSV file.")
private Path parametersFile;
}

@ArgGroup(multiplicity = "1")
private FieldValueOrParameterFile fieldValueOrParameterFile;

private static class DeleteMetadataAction implements ThrowingFunction<DeleteMetadataParams, String, Exception> {
@Override
public String apply(DeleteMetadataParams deleteMetadataParams) throws Exception {
var fieldList = new FieldList(deleteMetadataParams.fieldValues.stream().toList());
deleteMetadataParams.api.deleteMetadata(fieldList, Collections.emptyMap());
return "Delete metadata";
}
}

@Value
private static class DeleteMetadataParams {
DatasetApi api;
Set<MetadataField> fieldValues;
}

@Override
public void doCall() throws Exception {
datasetCmd.<DeleteMetadataParams> paramsBatchProcessorBuilder()
.labeledItems(getLabeledItems())
.action(new DeleteMetadataAction())
.build()
.process();
}

private Stream<Pair<String, DeleteMetadataParams>> getLabeledItems() {
try {
if (fieldValueOrParameterFile.fieldValues != null) {
var keyValues = new HashMap<String, String>();
for (var fieldValue : fieldValueOrParameterFile.fieldValues) {
var split = fieldValue.split("=", 2);
keyValues.put(split[0], split[1]);
}
return datasetCmd.getItems().stream()
.map(p -> new Pair<>(p.getFirst(), new FieldValuesParser(keyValues).parse()))
.map(p -> new Pair<>(p.getFirst(), new DeleteMetadataParams(datasetCmd.getDataverseClient().dataset(p.getFirst()), p.getSecond())));

}
else if (fieldValueOrParameterFile.parametersFile != null) {
return new FieldValuesParamsFileParser(fieldValueOrParameterFile.parametersFile)
.parse()
.map(p -> new Pair<>(p.getFirst(), new DeleteMetadataParams(datasetCmd.getDataverseClient().dataset(p.getFirst()), p.getSecond())));
}
}
catch (Exception e) {
throw new RuntimeException("Error parsing field values or parameter file.", e);
}
throw new IllegalArgumentException("No field values or parameter file specified.");
}
}
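Usage sketch (not part of the diff): the executable name dd-dataverse, the argument order, and the PIDs below are assumptions for illustration; the option names and CSV layout follow the descriptions above.

# Delete one field value from a single dataset:
dd-dataverse dataset doi:10.5072/EXAMPLE delete-metadata --field-value 'alternativeTitle=Some title'

# Delete fields from multiple datasets with a parameters file:
dd-dataverse dataset delete-metadata --parameters-file fields.csv

where fields.csv might look like:

PID,alternativeTitle
doi:10.5072/AAA,Some title
doi:10.5072/BBB,Another title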
src/main/java/nl/knaw/dans/dvcli/command/dataset/DatasetValidateFiles.java
@@ -15,10 +15,10 @@
*/
package nl.knaw.dans.dvcli.command.dataset;

+import nl.knaw.dans.dvcli.action.BatchProcessor;
import nl.knaw.dans.dvcli.action.ConsoleReport;
import nl.knaw.dans.dvcli.action.Pair;
import nl.knaw.dans.dvcli.action.SingleIdOrIdsFile;
-import nl.knaw.dans.dvcli.action.ThrowingFunction;
import nl.knaw.dans.dvcli.command.AbstractCmd;
import nl.knaw.dans.lib.dataverse.AdminApi;
import nl.knaw.dans.lib.dataverse.DataverseException;
@@ -30,35 +30,31 @@

@Command(name = "validate-files",
mixinStandardHelpOptions = true,
description = "Make sure that all files are correctly stored in object storage.")
description = "Validate the fixity checksums of the files in a dataset.")
public class DatasetValidateFiles extends AbstractCmd {
@ParentCommand
private DatasetCmd datasetCmd;

-protected List<Pair<String, IdParam>> getIds() throws IOException {
+protected List<Pair<String, String>> getIds() throws IOException {
List<String> pids = new SingleIdOrIdsFile(datasetCmd.getTargets(), SingleIdOrIdsFile.DEFAULT_TARGET_PLACEHOLDER).getPids().toList();
-return pids.stream().map(p -> new Pair<>(p, new IdParam(datasetCmd.getDataverseClient().admin(), p))).toList();
+// The label is the same as the id. Since the BatchProcessor expects labeled items, we create a list of pairs with the same id as label.
+return pids.stream().map(p -> new Pair<>(p, p)).toList();
}

-protected record IdParam(AdminApi admin, String id) {
-}
-
-private static class ValidateFilesAction implements ThrowingFunction<IdParam, String, Exception> {
-    @Override
-    public String apply(IdParam idParam) throws IOException, DataverseException {
-        var r = idParam.admin().validateDatasetFiles(idParam.id);
-        return r.getBodyAsString();
-    }
-}

@Override
public void doCall() throws IOException, DataverseException {
-datasetCmd.<IdParam> paramsBatchProcessorBuilder()
+// Not using the helper method on datasetCmd because we need to call the admin endpoint and not the dataset endpoint.
+BatchProcessor.<String, String> builder()
    .labeledItems(getIds())
-    .action(new ValidateFilesAction())
+    .action(pid -> {
+        var r = datasetCmd.getDataverseClient().admin().validateDatasetFiles(pid);
+        return r.getBodyAsString();
+    })
.report(new ConsoleReport<>())
.build()
.process();
}

}
45 changes: 45 additions & 0 deletions src/main/java/nl/knaw/dans/dvcli/inputparsers/CsvStream.java
@@ -0,0 +1,45 @@
/*
* Copyright (C) 2024 DANS - Data Archiving and Networked Services ([email protected])
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nl.knaw.dans.dvcli.inputparsers;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

public class CsvStream {
private final Path csvFile;

public CsvStream(Path csvFile) {
this.csvFile = csvFile;
}

public Stream<CSVRecord> stream() throws IOException {
CSVParser parser = CSVParser.parse(csvFile, StandardCharsets.UTF_8, CSVFormat.DEFAULT.builder().setSkipHeaderRecord(true).build());
return StreamSupport.stream(parser.spliterator(), false).onClose(() -> {
try {
parser.close();
} catch (IOException e) {
e.printStackTrace();
}
});
}
}
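Usage sketch (not part of the diff; the file name is hypothetical, and the example assumes it runs in the same package as CsvStream). Closing the stream, e.g. with try-with-resources, fires the onClose() hook and thereby closes the underlying CSVParser:

import org.apache.commons.csv.CSVRecord;

import java.io.IOException;
import java.nio.file.Path;
import java.util.stream.Stream;

public class CsvStreamExample {
    public static void main(String[] args) throws IOException {
        try (Stream<CSVRecord> records = new CsvStream(Path.of("fields.csv")).stream()) {
            records.forEach(record -> System.out.println(record.get(0))); // print the first column of each row
        }
    }
}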