Skip to content

Commit

Permalink
WIP - Checkpoint commit for Index generation for IUCN and ID lookups …
Browse files Browse the repository at this point in the history
…for WoRMS #2

Switch to using opencsv
#1321
  • Loading branch information
djtfmartin committed Jun 27, 2024
1 parent 0896d9b commit 0b6cd7d
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 51 deletions.
12 changes: 6 additions & 6 deletions matching-ws/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ ENV APP_ARTIFACT=matching-ws

# Set environment variables
ARG CLB_DATASET_ID=""
ARG CLB_IUCN_DATASET_ID=""
ARG CLB_IDENTIFIER_DATASET_IDS=""
ARG CLB_URL=""
ARG CLB_USER=""
ARG CLB_PASSWORD=""
Expand All @@ -73,16 +75,14 @@ COPY --from=builder /app/backend/dataset.json /opt/gbif/$APP_ARTIFACT/dataset.js
# Export from ChecklistBank and build the indexes (mode BUILD_INDEX)
RUN if [ -n "$CLB_DATASET_ID" ]; then \
java -jar app.jar \
--mode=EXPORT_CSV \
--mode=BUILD_INDEX \
--export.path=/data/$APP_ARTIFACT/exports \
--clb.dataset.id=$CLB_DATASET_ID \
--clb.identifier.dataset.ids=$CLB_IDENTIFIER_DATASET_IDS \
--clb.iucn.dataset.id=$CLB_IUCN_DATASET_ID \
--clb.url=$CLB_URL \
--clb.user=$CLB_USER \
--clb.password=$CLB_PASSWORD && \
java -jar app.jar \
--mode=INDEX_CSV \
--export.path=/data/$APP_ARTIFACT/exports/$CLB_DATASET_ID \
--index.path=/data/$APP_ARTIFACT/index; \
--clb.password=$CLB_PASSWORD; \
fi

RUN chown -R $USER:$USER /opt/gbif/$APP_ARTIFACT
Expand Down
2 changes: 1 addition & 1 deletion matching-ws/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ docker buildx build \
Locally running docker image
```bash
docker pull docker.gbif.org/matching-ws:1.0-SNAPSHOT-3LXRC
docker run -d --platform linux/arm64 -p 8080:8080 --name matching-ws-xcol docker.gbif.org/matching-ws:1.0-SNAPSHOT-3LXRC
docker run -d --platform linux/arm64 -p 8080:8080 --name matching-ws-3LXRC docker.gbif.org/matching-ws:1.0-SNAPSHOT-3LXRC
```

### Usage
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,21 @@ void init() {
}
}

/**
 * Re-opens the main lucene index from disk.
 *
 * The index is looked up at the path returned by {@code getMainIndexPath()}. When nothing
 * exists at that path a warning is logged and nothing else happens. An {@link IOException}
 * raised while opening the directory is logged as a warning rather than propagated.
 */
protected void reinit() {
  final String mainIndexPath = getMainIndexPath();
  final File mainIndexLocation = new File(mainIndexPath);

  // Guard clause: there is nothing on disk to load.
  if (!mainIndexLocation.exists()) {
    LOG.warn("Lucene index not found at {}", mainIndexPath);
    return;
  }

  LOG.info("Loading lucene index from {}", mainIndexPath);
  try {
    final Path mainIndexDir = Path.of(mainIndexPath);
    initWithDir(new MMapDirectory(mainIndexDir));
  } catch (IOException e) {
    LOG.warn("Cannot open lucene index. Index not available", e);
  }
}

/**
 * Resolves the location of the main index.
 *
 * @return the configured index path with {@code "/main"} appended
 */
private @NotNull String getMainIndexPath() {
  final String mainIndexPath = indexPath + "/main";
  return mainIndexPath;
}
Expand Down Expand Up @@ -335,9 +350,9 @@ public Optional<Document> getByUsageKey(String usageKey, boolean allowExternalID
Document identifierDoc = identifierSearcher.storedFields().document(identifierDocs.scoreDocs[0].doc);
final String joinID = identifierDoc.get(FIELD_JOIN_ID);
Query getByIDQuery = new TermQuery(new Term(FIELD_ID, joinID));
TopDocs docs2 = getSearcher().search(getByIDQuery, 3);
if (docs2.totalHits.value > 0) {
return Optional.of(getSearcher().storedFields().document(docs2.scoreDocs[0].doc));
docs = getSearcher().search(getByIDQuery, 3);
if (docs.totalHits.value > 0) {
return Optional.of(getSearcher().storedFields().document(docs.scoreDocs[0].doc));
} else {
LOG.warn("Cannot find usage {} in main lucene index after finding it in identifier index for {}", usageKey, datasetKey);
return Optional.empty();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@
public class IndexingApplication implements ApplicationRunner {

final IndexingService indexingService;
final DatasetIndex datasetIndex;

public IndexingApplication(IndexingService indexingService) {
public IndexingApplication(IndexingService indexingService, DatasetIndex datasetIndex) {
this.indexingService = indexingService;
this.datasetIndex = datasetIndex;
}

@Override
Expand All @@ -28,42 +30,70 @@ public void run(ApplicationArguments args) throws Exception {
}

String mode = args.getOptionValues("mode").get(0);
List<String> datasetIds = args.getOptionValues("clb.dataset.id");
List<String> datasetIds = args.getOptionValues(Main.CLB_DATASET_ID);

if (Main.ExecutionMode.EXPORT_CSV.name().equals(mode)) {
if (Main.ExecutionMode.BUILD_INDEX.name().equals(mode)) {

List<String> indexPath = args.getOptionValues(Main.INDEX_PATH);
List<String> exportPath = args.getOptionValues(Main.EXPORT_PATH);

// build main index
final String datasetId = datasetIds.get(0);
indexingService.writeCLBToFile(datasetId);
indexingService.createMainIndexFromFile(exportPath.get(0) + "/" + datasetId, indexPath.get(0) );
datasetIndex.reinit();

// build iucn index
List<String> iucnDatasetId = args.getOptionValues(Main.CLB_IUCN_DATASET_ID);
if (iucnDatasetId != null && !iucnDatasetId.isEmpty()) {
indexingService.indexIUCN(iucnDatasetId.get(0));
}

// build identifier index
List<String> identifierDatasetIds = args.getOptionValues(Main.CLB_IDENTIFIER_DATASET_IDS);
if (identifierDatasetIds != null && !identifierDatasetIds.isEmpty()) {
for (String id : identifierDatasetIds) {
String[] ids = id.split(",");
for (String i : ids) {
indexingService.indexIdentifiers(i);
}
}
}
System.out.println("Indexing completed");
} else if (Main.ExecutionMode.EXPORT_CSV.name().equals(mode)) {
if (datasetIds == null || datasetIds.isEmpty()) {
System.err.println("Missing required parameter --clb.dataset.id");
System.err.println("Missing required parameter --" + Main.CLB_DATASET_ID);
return;
}
indexingService.writeCLBToFile(datasetIds.get(0));
} else if (Main.ExecutionMode.INDEX_CSV.name().equals(mode)) {

List<String> indexPath = args.getOptionValues("index.path");
List<String> exportPath = args.getOptionValues("export.path");
List<String> indexPath = args.getOptionValues(Main.INDEX_PATH);
List<String> exportPath = args.getOptionValues(Main.EXPORT_PATH);
if (indexPath == null || indexPath.isEmpty()) {
System.err.println("Missing required parameter --index.path");
System.err.println("Missing required parameter --" + Main.INDEX_PATH);
return;
}
if (exportPath == null || exportPath.isEmpty()) {
System.err.println("Missing required parameter --export.path");
System.err.println("Missing required parameter --" + Main.EXPORT_PATH);
return;
}
indexingService.indexFile(exportPath.get(0), indexPath.get(0));
indexingService.createMainIndexFromFile(exportPath.get(0), indexPath.get(0));
} else if (Main.ExecutionMode.INDEX_DB.name().equals(mode)) {
if (datasetIds == null || datasetIds.isEmpty()) {
System.err.println("Missing required parameter --clb.dataset.id");
System.err.println("Missing required parameter --" + Main.CLB_DATASET_ID);
return;
}
indexingService.runDatasetIndexing(Integer.parseInt(datasetIds.get(0)));
} else if (Main.ExecutionMode.INDEX_IUCN_CSV.name().equals(mode)) {
if (datasetIds == null || datasetIds.isEmpty()) {
System.err.println("Missing required parameter --clb.dataset.id");
System.err.println("Missing required parameter --" + Main.CLB_DATASET_ID);
return;
}
indexingService.indexIUCN(datasetIds.get(0));
} else if (Main.ExecutionMode.INDEX_IDENTIFIER_CSV.name().equals(mode)) {
if (datasetIds == null || datasetIds.isEmpty()) {
System.err.println("Missing required parameter --clb.dataset.id");
System.err.println("Missing required parameter --" + Main.CLB_DATASET_ID);
return;
}
indexingService.indexIdentifiers(datasetIds.get(0));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
import java.util.regex.Pattern;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.opencsv.CSVWriter;
import com.opencsv.CSVWriterBuilder;
import com.opencsv.ICSVWriter;
import com.opencsv.bean.CsvToBean;
import com.opencsv.bean.CsvToBeanBuilder;
import com.opencsv.bean.StatefulBeanToCsv;
Expand Down Expand Up @@ -155,6 +156,12 @@ private Integer lookupAttempt(SqlSessionFactory factory, ReleaseAttempt release
}
}

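/**
* Writes an export of the name usages in a ChecklistBank dataset to a CSV file.
*
* @param datasetKeyInput the key of the dataset to export; also used as the name of the
*        sub-directory of the export path that the CSV file is written to
* @throws Exception if the export fails
*/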
@Transactional
public void writeCLBToFile(@NotNull final String datasetKeyInput) throws Exception {

Expand Down Expand Up @@ -185,16 +192,17 @@ public void writeCLBToFile(@NotNull final String datasetKeyInput) throws Excepti
}

final Integer validDatasetKey = datasetKey.get();
final String directory = exportPath + "/" + datasetKeyInput;
final String fileName = directory + "/" + "index.csv";

LOG.info("Writing dataset to file...");
LOG.info("Writing dataset to file {}", fileName);
final AtomicInteger counter = new AtomicInteger(0);
final String fileName = exportPath + "/" + datasetKeyInput + "/" + "index.csv";
FileUtils.forceMkdir(new File(exportPath + "/" + datasetKeyInput));

FileUtils.forceMkdir(new File(directory));
try (SqlSession session = factory.openSession(false);
final CSVWriter writer = new CSVWriter(new FileWriter(fileName))) {
final ICSVWriter writer = new CSVWriterBuilder(new FileWriter(fileName)).withSeparator('$').build()) {
StatefulBeanToCsv<NameUsage> sbc = new StatefulBeanToCsvBuilder<NameUsage>(writer)
.withQuotechar('\'')
.withSeparator('$')
.build();
// Create index writer
consume(
Expand All @@ -212,7 +220,7 @@ public void writeCLBToFile(@NotNull final String datasetKeyInput) throws Excepti
}

// write metadata file in JSON format
LOG.info("Records written to file {}: {}", fileName, counter.get());
LOG.info("ChecklistBank export written to file {}: {}", fileName, counter.get());
}

@Transactional
Expand All @@ -222,14 +230,15 @@ public void indexIdentifiers(String datasetKey) throws Exception {
writeJoinIndex( tempIndexPath + "/" + datasetKey, indexPath + "/identifiers/" + datasetKey, false);
}

/**
 * Builds the ancillary index for an IUCN dataset in three steps: export the dataset to a
 * CSV file, index that export into the temporary index location, and finally write the
 * join index under {@code <indexPath>/ancillary/<datasetKey>}.
 *
 * @param datasetKey key of the dataset; used as the sub-directory name in every step
 * @throws Exception propagated from any of the three steps
 */
@Transactional
public void indexIUCN(String datasetKey) throws Exception {
  // Step 1: export the dataset to CSV.
  writeCLBIUCNToFile(datasetKey);

  // Step 2: index the export into the temporary location.
  final String csvExportDir = exportPath + "/" + datasetKey;
  final String tempIndexDir = tempIndexPath + "/" + datasetKey;
  indexFile(csvExportDir, tempIndexDir);

  // Step 3: write the join index from the temporary index.
  final String ancillaryIndexDir = indexPath + "/ancillary/" + datasetKey;
  writeJoinIndex(tempIndexDir, ancillaryIndexDir, true);
}

@Transactional
public String writeCLBIUCNToFile(@NotNull final String datasetKeyInput) throws Exception {
public void writeCLBIUCNToFile(@NotNull final String datasetKeyInput) throws Exception {

// I am seeing better results with this MyBatis Pooling DataSource for Cursor queries
// (parallelism) as opposed to the spring managed DataSource
Expand Down Expand Up @@ -264,12 +273,11 @@ public String writeCLBIUCNToFile(@NotNull final String datasetKeyInput) throws E
final String fileName = exportPath + "/" + datasetKeyInput + "/" + "index.csv";
FileUtils.forceMkdir(new File(exportPath + "/" + datasetKeyInput));
try (SqlSession session = factory.openSession(false);
final CSVWriter writer = new CSVWriter(new FileWriter(fileName))) {
final ICSVWriter writer = new CSVWriterBuilder(new FileWriter(fileName)).withSeparator('$').build()) {

final ObjectMapper objectMapper = new ObjectMapper();
final StatefulBeanToCsv<NameUsage> sbc = new StatefulBeanToCsvBuilder<NameUsage>(writer)
.withQuotechar('\'')
.withSeparator('$')
.build();

// Create index writer
Expand All @@ -294,8 +302,7 @@ public String writeCLBIUCNToFile(@NotNull final String datasetKeyInput) throws E
}

// write metadata file in JSON format
LOG.info("Records written to file {}: {}", fileName, counter.get());
return fileName;
LOG.info("ChecklistBank IUCN export written to file {}: {}", fileName, counter.get());
}

public static Directory newMemoryIndex(Iterable<NameUsage> usages) throws IOException {
Expand All @@ -309,7 +316,7 @@ public static Directory newMemoryIndex(Iterable<NameUsage> usages) throws IOExce
for (NameUsage u : usages) {
if (u != null && u.getId() != null) {
writer.addDocument(toDoc(u));
counter++;
counter ++;
}
}
writer.close();
Expand Down Expand Up @@ -392,7 +399,7 @@ public void writeJoinIndex(String tempIndexPath, String ancillaryIndexPath, bool

LOG.info("Ancillary index written: {} documents.", counter.get());
} catch (Exception e) {
LOG.error("Error writing documents to CSV: {}", e.getMessage());
LOG.error("Error writing documents to ancillary index: {}", e.getMessage(), e);
}
}

Expand Down Expand Up @@ -426,7 +433,11 @@ public Map<String, String> loadHierarchy(IndexSearcher searcher, String id) {
}

@Transactional
public void indexFile(String exportPath, String indexPath) throws Exception {
public void createMainIndexFromFile(String exportPath, String indexPath) throws Exception {
  // The main index is always written to the "main" sub-directory of the supplied index path.
  final String mainIndexPath = indexPath + "/main";
  indexFile(exportPath, mainIndexPath);
}

private void indexFile(String exportPath, String indexPath) throws Exception {

// Create index directory
Path indexDirectory = initialiseIndexDirectory(indexPath);
Expand Down Expand Up @@ -540,7 +551,7 @@ protected static Document toDoc(NameUsage nameUsage) {
cultivar or strain information. Infrageneric names are represented without a
leading genus. Unicode characters are replaced by their matching ASCII characters."
*/
Rank rank = Rank.valueOf(nameUsage.getRank());
Rank rank = Rank.valueOf(nameUsage.getRank());

Optional<String> optCanonical = Optional.empty();
try {
Expand Down
35 changes: 22 additions & 13 deletions matching-ws/src/main/java/life/catalogue/matching/Main.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,24 @@

import org.springframework.boot.SpringApplication;

import java.util.ArrayList;
import java.util.List;

/**
* Main application class for the matching-ws module.
*/
@Parameters(separators = "=")
public class Main {

public static final String CLB_DATASET_ID = "clb.dataset.id";
public static final String CLB_IDENTIFIER_DATASET_IDS = "clb.identifier.dataset.ids";
public static final String CLB_IUCN_DATASET_ID = "clb.iucn.dataset.id";
public static final String EXPORT_PATH = "export.path";
public static final String INDEX_PATH = "index.path";
public static final String V_1_ENABLED = "v1.enabled";

@Parameter(names = {"--mode"}, order = 1, description = "The mode to use, Defaults to WEB_APP, which will run the web services and will attempt to read the index" +
" from the --index.path ", converter = ExecutionModeConverter.class)
" from the --" + INDEX_PATH + " ", converter = ExecutionModeConverter.class)
private ExecutionMode mode = ExecutionMode.WEB_APP;

@Parameter(names = {"--clb.url"}, description = "ChecklistBank JDBC URL")
Expand All @@ -26,25 +36,23 @@ public class Main {
@Parameter(names = {"--clb.password"}, description = "ChecklistBank database password")
private String clbPassword;

@Parameter(names = {"--clb.dataset.id"}, description = "ChecklistBank dataset ID to create an index for or to export to CSV. " +
@Parameter(names = {"--" + CLB_DATASET_ID}, description = "ChecklistBank dataset ID to create an index for or to export to CSV. " +
"Required for INDEX_DB and EXPORT_CSV modes")
private String datasetId;

@Parameter(names = {"--clb.status.dataset.ids"}, description = "ChecklistBank dataset ID to create an index for or to export to CSV. " +
"Required for INDEX_DB and EXPORT_CSV modes")
private String statusDatasetIds;
@Parameter(names = {"--" + CLB_IUCN_DATASET_ID}, description = "ChecklistBank dataset ID for the IUCN checklist.")
private String statusDatasetId;

@Parameter(names = {"--clb.identifier.dataset.ids"}, description = "ChecklistBank dataset ID to create an index for or to export to CSV. " +
"Required for INDEX_DB and EXPORT_CSV modes")
private String identifierDatasetIds;
@Parameter(names = {"--" + CLB_IDENTIFIER_DATASET_IDS}, description = "ChecklistBank dataset IDs to index for identifier matching.", arity = 1)
private List<String> identifierDatasetIds = new ArrayList<>();

@Parameter(names = {"--index.path"}, description = "File system path to the pre-generated lucene index")
@Parameter(names = {"--" + INDEX_PATH}, description = "File system path to the pre-generated lucene index")
private String indexPath = "/data/matching-ws/index";

@Parameter(names = {"--export.path"}, description = "File system path to write exports from ChecklistBank to")
@Parameter(names = {"--" + EXPORT_PATH}, description = "File system path to write exports from ChecklistBank to")
private String exportPath = "/data/matching-ws/export";

@Parameter(names = {"--v1.enabled"}, description = "Enable v1 support for the web service", arity = 1)
@Parameter(names = {"--" + V_1_ENABLED}, description = "Enable v1 support for the web service", arity = 1)
private boolean v1Enabled = false;

@Parameter(names = {"--server.port"}, description = "Enable v1 support for the web service", arity = 1)
Expand All @@ -71,7 +79,7 @@ public static void main(String[] args) throws Exception {

if ((app.mode == ExecutionMode.INDEX_DB
|| app.mode == ExecutionMode.EXPORT_CSV) && app.datasetId == null) {
System.err.println("Missing required parameter for mode " + app.mode + " --clb.dataset.id");
System.err.println("Missing required parameter for mode " + app.mode + " --" + CLB_DATASET_ID);
commander.usage();
return;
}
Expand All @@ -82,7 +90,7 @@ public static void main(String[] args) throws Exception {

SpringApplication springApplication;
switch (app.mode) {
case EXPORT_CSV, INDEX_CSV, INDEX_DB, INDEX_IUCN_CSV, INDEX_IDENTIFIER_CSV:
case BUILD_INDEX,EXPORT_CSV, INDEX_CSV, INDEX_DB, INDEX_IUCN_CSV, INDEX_IDENTIFIER_CSV:
springApplication = new SpringApplication(IndexingApplication.class);
springApplication.setAdditionalProfiles("indexing");
springApplication.run(args).close();
Expand All @@ -97,6 +105,7 @@ public static void main(String[] args) throws Exception {
}

enum ExecutionMode {
BUILD_INDEX,
EXPORT_CSV,
INDEX_IUCN_CSV,
INDEX_IDENTIFIER_CSV,
Expand Down

0 comments on commit 0b6cd7d

Please sign in to comment.