Skip to content

Commit

Permalink
BXC-4652 implement index --from-csv (#104)
Browse files Browse the repository at this point in the history
* implement index --from-csv

* fix javadoc, csvFile path, removed unused code, readjust logic, parse generic csv when indexing, fix/add tests, dmrecord field

* move force flag to CdmIndexOptions, remove ExportObjectsService from CdmIndexService, remove csv-specific methods, simplify code, rename assertCsvImportExists

* remove conditional, add MIGRATION_FIELDS to exportFields in indexAllFromCsv

* text primary key not null

* text primary key not null if indexing from csv file

* text primary key not null, fix test
  • Loading branch information
krwong authored Aug 15, 2024
1 parent afe7b91 commit 1f75569
Show file tree
Hide file tree
Showing 12 changed files with 368 additions and 57 deletions.
30 changes: 22 additions & 8 deletions src/main/java/edu/unc/lib/boxc/migration/cdm/CdmIndexCommand.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,47 +4,61 @@
import static org.slf4j.LoggerFactory.getLogger;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.Callable;

import edu.unc.lib.boxc.migration.cdm.exceptions.MigrationException;
import edu.unc.lib.boxc.migration.cdm.model.CdmFieldInfo;
import edu.unc.lib.boxc.migration.cdm.options.CdmIndexOptions;
import org.slf4j.Logger;

import edu.unc.lib.boxc.migration.cdm.exceptions.StateAlreadyExistsException;
import edu.unc.lib.boxc.migration.cdm.model.MigrationProject;
import edu.unc.lib.boxc.migration.cdm.services.CdmFieldService;
import edu.unc.lib.boxc.migration.cdm.services.CdmIndexService;
import edu.unc.lib.boxc.migration.cdm.services.MigrationProjectFactory;
import picocli.CommandLine.Mixin;
import picocli.CommandLine.Command;
import picocli.CommandLine.Option;
import picocli.CommandLine.ParentCommand;

/**
* @author bbpennel
*/
@Command(name = "index",
description = "Index the exported CDM records for this project. Must be run after a complete export.")
description = "Populate the index of object records for this project. Must be run after " +
"exporting source metadata or providing a CSV file.")
public class CdmIndexCommand implements Callable<Integer> {
private static final Logger log = getLogger(CdmIndexCommand.class);
@ParentCommand
private CLIMain parentCommand;

@Option(names = { "-f", "--force"},
description = "Overwrite index if one already exists")
private boolean force;

private CdmFieldService fieldService;
private CdmIndexService indexService;
private MigrationProject project;

@Mixin
private CdmIndexOptions options;

@Override
public Integer call() throws Exception {
long start = System.nanoTime();

try {
initialize();

indexService.createDatabase(force);
indexService.indexAll();
// if user provides csv, check that it exists
if (options.getCsvFile() != null) {
if (Files.exists(options.getCsvFile())) {
CdmFieldInfo csvExportFields = fieldService.retrieveFieldsFromCsv(options.getCsvFile());
fieldService.persistFieldsToProject(project, csvExportFields);
} else {
throw new MigrationException("No csv file exists in " + options.getCsvFile());
}
}

indexService.createDatabase(options);
indexService.index(options);
// Display any warning messages to user
if (!indexService.getIndexingWarnings().isEmpty()) {
indexService.getIndexingWarnings().forEach(msg -> outputLogger.info(msg));
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
package edu.unc.lib.boxc.migration.cdm;

import edu.unc.lib.boxc.migration.cdm.exceptions.InvalidProjectStateException;
import edu.unc.lib.boxc.migration.cdm.model.MigrationProject;
import edu.unc.lib.boxc.migration.cdm.services.ExportObjectsService;
import edu.unc.lib.boxc.migration.cdm.services.MigrationProjectFactory;
import org.slf4j.Logger;
import picocli.CommandLine.Command;
import picocli.CommandLine.ParentCommand;

import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.util.concurrent.Callable;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
* @author krwong
*/
public class ExportObjectsInfo {
public static final String RECORD_ID = "record_id";
public static final String RECORD_ID = CdmFieldInfo.CDM_ID;
public static final String FILE_PATH = "file_path";
public static final String FILENAME = "filename";
public static final String[] CSV_HEADERS = new String[] {RECORD_ID, FILE_PATH, FILENAME};
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package edu.unc.lib.boxc.migration.cdm.options;

import picocli.CommandLine.Option;

import java.nio.file.Path;

/**
* Options for indexing object records
* @author krwong
*/
public class CdmIndexOptions {
    /**
     * CSV file given with {@code -c} / {@code --from-csv}. The field has no initializer,
     * so it stays {@code null} until a value is assigned.
     */
    @Option(
            names = {"-c", "--from-csv"},
            description = "Export objects CSV file used as source for populating sqlite database.")
    private Path csvFile;

    /** Flag given with {@code -f} / {@code --force}; defaults to {@code false}. */
    @Option(
            names = {"-f", "--force"},
            description = "Overwrite index if one already exists")
    private boolean force;

    /**
     * @param csvFile path of the CSV file to index from
     */
    public void setCsvFile(Path csvFile) {
        this.csvFile = csvFile;
    }

    /**
     * @return path of the CSV file to index from, or {@code null} if none was assigned
     */
    public Path getCsvFile() {
        return this.csvFile;
    }

    /**
     * @param force whether an existing index should be overwritten
     */
    public void setForce(boolean force) {
        this.force = force;
    }

    /**
     * @return whether an existing index should be overwritten
     */
    public boolean getForce() {
        return this.force;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
public class CdmFieldService {
private CloseableHttpClient httpClient;
private String cdmBaseUri;
private MigrationProject project;

private static final String CDM_NICK_FIELD = "nick";
private static final String CDM_NAME_FIELD = "name";
Expand Down Expand Up @@ -69,7 +70,6 @@ public CdmFieldService() {

/**
* Get the URL for retrieving field info for the given collection
* @param cdmBaseUri
* @param collectionId
* @return
*/
Expand Down Expand Up @@ -129,7 +129,7 @@ public CdmFieldInfo retrieveFieldsForCollection(String collectionId) throws IOEx
}

/**
* Persist the field information out to the project project
* Persist the field information out to the project
* @param project
* @param fieldInfo
* @throws IOException
Expand Down Expand Up @@ -234,6 +234,34 @@ private void validateFieldName(String field, String headerField, int line, Set<S
existing.add(field);
}

/**
 * Build field information for the project from the header row of a CSV file.
 * Every header produces one field entry whose nick name, export name and description
 * are all the header text, and which is not skipped on export.
 *
 * @param exportedObjectsPath path of the CSV file whose first record holds the field names
 * @return field info containing one entry per CSV header, in header order
 * @throws IOException declared for API compatibility; read and parse failures are
 *         reported as MigrationException instead
 * @throws MigrationException if the file cannot be read or parsed, or has no header record
 */
public CdmFieldInfo retrieveFieldsFromCsv(Path exportedObjectsPath) throws IOException {
    List<String> headers = null;

    try (
        Reader reader = Files.newBufferedReader(exportedObjectsPath);
        CSVParser parser = new CSVParser(reader, CSVFormat.DEFAULT
                .withTrim());
    ) {
        // Only the first record is needed; getRecords() would load the whole file into memory
        var records = parser.iterator();
        if (records.hasNext()) {
            headers = records.next().toList();
        }
    } catch (Exception e) {
        throw new MigrationException("Failed to parse exported objects path " + exportedObjectsPath, e);
    }

    // An empty file has no header record; report that directly rather than as a parse failure
    if (headers == null) {
        throw new MigrationException("No header row found in csv file " + exportedObjectsPath);
    }

    CdmFieldInfo fieldInfo = new CdmFieldInfo();
    for (String header : headers) {
        CdmFieldEntry fieldEntry = new CdmFieldEntry();
        fieldEntry.setNickName(header);
        fieldEntry.setExportAs(header);
        fieldEntry.setDescription(header);
        fieldEntry.setSkipExport(false);
        fieldInfo.getFields().add(fieldEntry);
    }
    return fieldInfo;
}

private String booleanToString(boolean bool) {
return bool ? "y" : "n";
}
Expand All @@ -245,4 +273,8 @@ public void setHttpClient(CloseableHttpClient httpClient) {
/**
* Assign the CDM base URI held by this service.
* @param cdmBaseUri base URI value to store
*/
public void setCdmBaseUri(String cdmBaseUri) {
this.cdmBaseUri = cdmBaseUri;
}

/**
* Assign the migration project held by this service.
* @param project migration project to store
*/
public void setProject(MigrationProject project) {
this.project = project;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@
import edu.unc.lib.boxc.migration.cdm.exceptions.StateAlreadyExistsException;
import edu.unc.lib.boxc.migration.cdm.model.CdmFieldInfo;
import edu.unc.lib.boxc.migration.cdm.model.MigrationProject;
import edu.unc.lib.boxc.migration.cdm.options.CdmIndexOptions;
import edu.unc.lib.boxc.migration.cdm.util.ProjectPropertiesSerialization;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.lang3.StringUtils;
import org.jdom2.Document;
import org.jdom2.Element;
Expand All @@ -16,6 +20,7 @@

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.Reader;
import java.nio.file.Files;
import java.nio.file.NoSuchFileException;
import java.sql.Connection;
Expand Down Expand Up @@ -59,6 +64,14 @@ public class CdmIndexService {
private String recordInsertSqlTemplate;
private List<String> indexingWarnings = new ArrayList<>();

/**
 * Populate the index, choosing the source from the supplied options: when a CSV file
 * is set the records come from that file, otherwise the exported CDM records are used.
 * @param options indexing options
 * @throws Exception if indexing fails
 */
public void index(CdmIndexOptions options) throws Exception {
    boolean noCsvProvided = options.getCsvFile() == null;
    if (noCsvProvided) {
        indexAll();
        return;
    }
    indexAllFromCsv(options);
}

/**
* Indexes all exported CDM records for this project
* @throws IOException
Expand Down Expand Up @@ -292,15 +305,16 @@ private void indexObject(Connection conn, List<String> exportFieldValues)

/**
* Create the index database with all cdm and migration fields
* @param force
* @param options
* @throws IOException
*/
public void createDatabase(boolean force) throws IOException {
ensureDatabaseState(force);
public void createDatabase(CdmIndexOptions options) throws IOException {
ensureDatabaseState(options.getForce());

CdmFieldInfo fieldInfo = fieldService.loadFieldsFromProject(project);
List<String> exportFields = new ArrayList<>(fieldInfo.listAllExportFields());
List<String> exportFields = fieldInfo.listAllExportFields();
exportFields.addAll(MIGRATION_FIELDS);

StringBuilder queryBuilder = new StringBuilder("CREATE TABLE " + TB_NAME + " (\n");
for (int i = 0; i < exportFields.size(); i++) {
String field = exportFields.get(i);
Expand All @@ -327,14 +341,57 @@ public void createDatabase(boolean force) throws IOException {

private String indexFieldType(String exportField) {
if (CdmFieldInfo.CDM_ID.equals(exportField)) {
return "INT PRIMARY KEY NOT NULL";
return "TEXT PRIMARY KEY NOT NULL";
} else if (CHILD_ORDER_FIELD.equals(exportField)) {
return "INT";
} else {
return "TEXT";
}
}

/**
 * Index every record of the user supplied CSV file into the project's index database.
 * The first CSV record is treated as the header row and is not indexed; every other
 * record with a non-empty first column is inserted using its values in column order.
 * The project's indexed date is updated once all records have been processed.
 *
 * @param options indexing options, which must carry the path of the CSV file
 * @throws IOException if the project properties cannot be written
 * @throws InvalidProjectStateException if the CSV file does not exist
 * @throws MigrationException if the CSV file cannot be read or the database cannot be updated
 */
public void indexAllFromCsv(CdmIndexOptions options) throws IOException {
    assertCsvImportExists(options);

    CdmFieldInfo fieldInfo = fieldService.loadFieldsFromProject(project);
    // Append to a copy, in case listAllExportFields() hands back a shared or unmodifiable list
    List<String> exportFields = new ArrayList<>(fieldInfo.listAllExportFields());
    exportFields.addAll(MIGRATION_FIELDS);
    recordInsertSqlTemplate = makeInsertTemplate(exportFields);

    try (
        var conn = openDbConnection();
        Reader reader = Files.newBufferedReader(options.getCsvFile());
        // withFirstRecordAsHeader() alone consumes the header row. The former
        // withHeader(String.valueOf(exportFields)) call replaced the header with a single
        // bogus column named after the list's toString() output.
        CSVParser csvParser = new CSVParser(reader, CSVFormat.DEFAULT
                .withFirstRecordAsHeader()
                .withTrim());
    ) {
        for (CSVRecord csvRecord : csvParser) {
            // Rows without a value in the first column are skipped
            if (!csvRecord.get(0).isEmpty()) {
                List<String> fieldValues = csvRecord.toList();
                indexObject(conn, fieldValues);
            }
        }
    } catch (IOException e) {
        throw new MigrationException("Failed to read export files", e);
    } catch (SQLException e) {
        throw new MigrationException("Failed to update database", e);
    }

    project.getProjectProperties().setIndexedDate(Instant.now());
    ProjectPropertiesSerialization.write(project);
}

/**
 * Verify that the CSV file named in the options is present.
 * @param options indexing options carrying the CSV path
 * @throws InvalidProjectStateException if the file is determined not to exist
 */
private void assertCsvImportExists(CdmIndexOptions options) {
    var csvPath = options.getCsvFile();
    boolean csvMissing = Files.notExists(csvPath);
    if (csvMissing) {
        throw new InvalidProjectStateException("User provided csv must exist prior to indexing");
    }
}

private void ensureDatabaseState(boolean force) {
if (Files.exists(project.getIndexPath())) {
if (force) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,25 @@ public void indexWithWarningsTest() throws Exception {
assertOutputContains("CPD file referenced by object 604 in desc.all was not found");
}

@Test
public void indexFromCsvTest() throws Exception {
    // Same CSV is copied into the project and passed on the command line
    String csvSource = "src/test/resources/files/exported_objects.csv";

    initProject();
    Files.createDirectories(project.getExportPath());

    Files.copy(Paths.get(csvSource), project.getExportObjectsPath());
    setExportedDate();

    String projectDir = project.getProjectPath().toString();
    executeExpectSuccess(new String[] {"-w", projectDir, "index", "-c", csvSource});

    // Indexing from the CSV must produce both the index and the fields file
    assertTrue(Files.exists(project.getIndexPath()));
    assertTrue(Files.exists(project.getFieldsPath()));
    assertDateIndexedPresent();
}

private void setExportedDate() throws Exception {
project.getProjectProperties().setExportedDate(Instant.now());
ProjectPropertiesSerialization.write(project);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.Optional;

import edu.unc.lib.boxc.migration.cdm.model.ExportObjectsInfo;
import edu.unc.lib.boxc.migration.cdm.test.BxcEnvironmentHelper;
import edu.unc.lib.boxc.migration.cdm.test.CdmEnvironmentHelper;
import org.apache.commons.io.FileUtils;
Expand Down Expand Up @@ -63,6 +65,7 @@ public void setup() throws Exception {
service = new CdmFieldService();
service.setHttpClient(httpClient);
service.setCdmBaseUri(CDM_BASE_URL);
service.setProject(project);

when(httpClient.execute(any(HttpGet.class))).thenReturn(httpResp);
when(httpResp.getEntity()).thenReturn(respEntity);
Expand Down Expand Up @@ -342,6 +345,25 @@ public void retrieveValidateAndReloadRoundTripTest() throws Exception {
"BLANK", fieldsLoaded);
}

@Test
public void retrieveCdmFieldsFromCsvTest() throws Exception {
    Path csvSource = Paths.get("src/test/resources/files/exported_objects.csv");
    Files.copy(csvSource, project.getExportObjectsPath());

    List<CdmFieldEntry> fields = service.retrieveFieldsFromCsv(csvSource).getFields();

    // Each header is expected as nick name, export name and description, with no CDM attributes
    String[] expectedHeaders = {
            ExportObjectsInfo.RECORD_ID, ExportObjectsInfo.FILE_PATH, ExportObjectsInfo.FILENAME};
    for (String header : expectedHeaders) {
        assertHasFieldWithValue(header, header, header,
                false, null, null, null,
                null, null, fields);
    }
}

private void assertHasFieldWithValue(String nick, String expectedExport, String expectedDesc,
boolean expectedSkip, String expectedCdmRequired, String expectedCdmSearchable, String expectedCdmHidden,
String expectedCdmVocab, String expectedCdmDcMapping, List<CdmFieldEntry> fields) {
Expand Down
Loading

0 comments on commit 1f75569

Please sign in to comment.