diff --git a/matching-ws/Dockerfile b/matching-ws/Dockerfile index 814e269d9..43ba26f83 100644 --- a/matching-ws/Dockerfile +++ b/matching-ws/Dockerfile @@ -53,6 +53,8 @@ ENV APP_ARTIFACT=matching-ws # Set environment variables ARG CLB_DATASET_ID="" +ARG CLB_IUCN_DATASET_ID="" +ARG CLB_IDENTIFIER_DATASET_IDS="" ARG CLB_URL="" ARG CLB_USER="" ARG CLB_PASSWORD="" @@ -73,16 +75,14 @@ COPY --from=builder /app/backend/dataset.json /opt/gbif/$APP_ARTIFACT/dataset.js # CSV export from checklistbank RUN if [ -n "$CLB_DATASET_ID" ]; then \ java -jar app.jar \ - --mode=EXPORT_CSV \ + --mode=BUILD_INDEX \ --export.path=/data/$APP_ARTIFACT/exports \ --clb.dataset.id=$CLB_DATASET_ID \ + --clb.identifier.dataset.ids=$CLB_IDENTIFIER_DATASET_IDS \ + --clb.iucn.dataset.id=$CLB_IUCN_DATASET_ID \ --clb.url=$CLB_URL \ --clb.user=$CLB_USER \ - --clb.password=$CLB_PASSWORD && \ - java -jar app.jar \ - --mode=INDEX_CSV \ - --export.path=/data/$APP_ARTIFACT/exports/$CLB_DATASET_ID \ - --index.path=/data/$APP_ARTIFACT/index; \ + --clb.password=$CLB_PASSWORD; \ fi RUN chown -R $USER:$USER /opt/gbif/$APP_ARTIFACT diff --git a/matching-ws/README.md b/matching-ws/README.md index abccdee26..eb0f90b3d 100644 --- a/matching-ws/README.md +++ b/matching-ws/README.md @@ -16,7 +16,7 @@ docker buildx build \ Locally running docker image ```bash docker pull docker.gbif.org/matching-ws:1.0-SNAPSHOT-3LXRC -docker run -d --platform linux/arm64 -p 8080:8080 --name matching-ws-xcol docker.gbif.org/matching-ws:1.0-SNAPSHOT-3LXRC +docker run -d --platform linux/arm64 -p 8080:8080 --name matching-ws-3LXRC matching-ws:1.0-SNAPSHOT-3LXRC docker.gbif.org ``` ### Usage diff --git a/matching-ws/src/main/java/life/catalogue/matching/DatasetIndex.java b/matching-ws/src/main/java/life/catalogue/matching/DatasetIndex.java index ac24be245..ff19e3b5b 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/DatasetIndex.java +++ b/matching-ws/src/main/java/life/catalogue/matching/DatasetIndex.java @@ -116,6 +116,21 @@ void init() { } } + protected void reinit() { + + final String mainIndexPath = getMainIndexPath(); + if (new File(mainIndexPath).exists()) { + LOG.info("Loading lucene index from {}", mainIndexPath); + try { + initWithDir(new MMapDirectory(Path.of(mainIndexPath))); + } catch (IOException e) { + LOG.warn("Cannot open lucene index. Index not available", e); + } + } else { + LOG.warn("Lucene index not found at {}", mainIndexPath); + } + } + private @NotNull String getMainIndexPath() { return indexPath + "/main"; } @@ -335,9 +350,9 @@ public Optional getByUsageKey(String usageKey, boolean allowExternalID Document identifierDoc = identifierSearcher.storedFields().document(identifierDocs.scoreDocs[0].doc); final String joinID = identifierDoc.get(FIELD_JOIN_ID); Query getByIDQuery = new TermQuery(new Term(FIELD_ID, joinID)); - TopDocs docs2 = getSearcher().search(getByIDQuery, 3); - if (docs2.totalHits.value > 0) { - return Optional.of(getSearcher().storedFields().document(docs2.scoreDocs[0].doc)); + docs = getSearcher().search(getByIDQuery, 3); + if (docs.totalHits.value > 0) { + return Optional.of(getSearcher().storedFields().document(docs.scoreDocs[0].doc)); } else { LOG.warn("Cannot find usage {} in main lucene index after finding it in identifier index for {}", usageKey, datasetKey); return Optional.empty(); diff --git a/matching-ws/src/main/java/life/catalogue/matching/IndexingApplication.java b/matching-ws/src/main/java/life/catalogue/matching/IndexingApplication.java index e8a86826b..654e88bba 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/IndexingApplication.java +++ b/matching-ws/src/main/java/life/catalogue/matching/IndexingApplication.java @@ -13,9 +13,11 @@ public class IndexingApplication implements ApplicationRunner { final IndexingService indexingService; + final DatasetIndex datasetIndex; - public IndexingApplication(IndexingService indexingService) { + public IndexingApplication(IndexingService indexingService, DatasetIndex datasetIndex) { this.indexingService = indexingService; + this.datasetIndex = datasetIndex; } @Override @@ -28,42 +30,70 @@ public void run(ApplicationArguments args) throws Exception { } String mode = args.getOptionValues("mode").get(0); - List datasetIds = args.getOptionValues("clb.dataset.id"); + List datasetIds = args.getOptionValues(Main.CLB_DATASET_ID); - if (Main.ExecutionMode.EXPORT_CSV.name().equals(mode)) { + if (Main.ExecutionMode.BUILD_INDEX.name().equals(mode)) { + + List indexPath = args.getOptionValues(Main.INDEX_PATH); + List exportPath = args.getOptionValues(Main.EXPORT_PATH); + + // build main index + final String datasetId = datasetIds.get(0); + indexingService.writeCLBToFile(datasetId); + indexingService.createMainIndexFromFile(exportPath.get(0) + "/" + datasetId, indexPath.get(0) ); + datasetIndex.reinit(); + + // build iucn index + List iucnDatasetId = args.getOptionValues(Main.CLB_IUCN_DATASET_ID); + if (iucnDatasetId != null && !iucnDatasetId.isEmpty()) { + indexingService.indexIUCN(iucnDatasetId.get(0)); + } + + // build identifier index + List identifierDatasetIds = args.getOptionValues(Main.CLB_IDENTIFIER_DATASET_IDS); + if (identifierDatasetIds != null && !identifierDatasetIds.isEmpty()) { + for (String id : identifierDatasetIds) { + String[] ids = id.split(","); + for (String i : ids) { + indexingService.indexIdentifiers(i); + } + } + } + System.out.println("Indexing completed"); + } else if (Main.ExecutionMode.EXPORT_CSV.name().equals(mode)) { if (datasetIds == null || datasetIds.isEmpty()) { - System.err.println("Missing required parameter --clb.dataset.id"); + System.err.println("Missing required parameter --" + Main.CLB_DATASET_ID); return; } indexingService.writeCLBToFile(datasetIds.get(0)); } else if (Main.ExecutionMode.INDEX_CSV.name().equals(mode)) { - List indexPath = args.getOptionValues("index.path"); - List exportPath = args.getOptionValues("export.path"); + List indexPath = args.getOptionValues(Main.INDEX_PATH); + List exportPath = args.getOptionValues(Main.EXPORT_PATH); if (indexPath == null || indexPath.isEmpty()) { - System.err.println("Missing required parameter --index.path"); + System.err.println("Missing required parameter --" + Main.INDEX_PATH); return; } if (exportPath == null || exportPath.isEmpty()) { - System.err.println("Missing required parameter --export.path"); + System.err.println("Missing required parameter --" + Main.EXPORT_PATH); return; } - indexingService.indexFile(exportPath.get(0), indexPath.get(0)); + indexingService.createMainIndexFromFile(exportPath.get(0), indexPath.get(0)); } else if (Main.ExecutionMode.INDEX_DB.name().equals(mode)) { if (datasetIds == null || datasetIds.isEmpty()) { - System.err.println("Missing required parameter --clb.dataset.id"); + System.err.println("Missing required parameter --" + Main.CLB_DATASET_ID); return; } indexingService.runDatasetIndexing(Integer.parseInt(datasetIds.get(0))); } else if (Main.ExecutionMode.INDEX_IUCN_CSV.name().equals(mode)) { if (datasetIds == null || datasetIds.isEmpty()) { - System.err.println("Missing required parameter --clb.dataset.id"); + System.err.println("Missing required parameter --" + Main.CLB_DATASET_ID); return; } indexingService.indexIUCN(datasetIds.get(0)); } else if (Main.ExecutionMode.INDEX_IDENTIFIER_CSV.name().equals(mode)) { if (datasetIds == null || datasetIds.isEmpty()) { - System.err.println("Missing required parameter --clb.dataset.id"); + System.err.println("Missing required parameter --" + Main.CLB_DATASET_ID); return; } indexingService.indexIdentifiers(datasetIds.get(0)); diff --git a/matching-ws/src/main/java/life/catalogue/matching/IndexingService.java b/matching-ws/src/main/java/life/catalogue/matching/IndexingService.java index 072c10ddd..423ec1d9e 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/IndexingService.java +++ b/matching-ws/src/main/java/life/catalogue/matching/IndexingService.java @@ -13,7 +13,8 @@ import java.util.regex.Pattern; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; -import com.opencsv.CSVWriter; +import com.opencsv.CSVWriterBuilder; +import com.opencsv.ICSVWriter; import com.opencsv.bean.CsvToBean; import com.opencsv.bean.CsvToBeanBuilder; import com.opencsv.bean.StatefulBeanToCsv; @@ -155,6 +156,12 @@ private Integer lookupAttempt(SqlSessionFactory factory, ReleaseAttempt release } } + /** + * Writes an export of the name usages in a checklist bank dataset to a CSV file. + * + * @param datasetKeyInput + * @throws Exception + */ @Transactional public void writeCLBToFile(@NotNull final String datasetKeyInput) throws Exception { @@ -185,16 +192,17 @@ public void writeCLBToFile(@NotNull final String datasetKeyInput) throws Excepti } final Integer validDatasetKey = datasetKey.get(); + final String directory = exportPath + "/" + datasetKeyInput; + final String fileName = directory + "/" + "index.csv"; - LOG.info("Writing dataset to file..."); + LOG.info("Writing dataset to file {}", fileName); final AtomicInteger counter = new AtomicInteger(0); - final String fileName = exportPath + "/" + datasetKeyInput + "/" + "index.csv"; - FileUtils.forceMkdir(new File(exportPath + "/" + datasetKeyInput)); + + FileUtils.forceMkdir(new File(directory)); try (SqlSession session = factory.openSession(false); - final CSVWriter writer = new CSVWriter(new FileWriter(fileName))) { + final ICSVWriter writer = new CSVWriterBuilder(new FileWriter(fileName)).withSeparator('$').build()) { StatefulBeanToCsv sbc = new StatefulBeanToCsvBuilder(writer) .withQuotechar('\'') - .withSeparator('$') .build(); // Create index writer consume( @@ -212,7 +220,7 @@ public void writeCLBToFile(@NotNull final String datasetKeyInput) throws Excepti } // write metadata file in JSON format - LOG.info("Records written to file {}: {}", fileName, counter.get()); + LOG.info("ChecklistBank export written to file {}: {}", fileName, counter.get()); } @Transactional @@ -222,6 +230,7 @@ public void indexIdentifiers(String datasetKey) throws Exception { writeJoinIndex( tempIndexPath + "/" + datasetKey, indexPath + "/identifiers/" + datasetKey, false); } + @Transactional public void indexIUCN(String datasetKey) throws Exception { writeCLBIUCNToFile(datasetKey); indexFile(exportPath + "/" + datasetKey, tempIndexPath + "/" + datasetKey); @@ -229,7 +238,7 @@ public void indexIUCN(String datasetKey) throws Exception { } @Transactional - public String writeCLBIUCNToFile(@NotNull final String datasetKeyInput) throws Exception { + public void writeCLBIUCNToFile(@NotNull final String datasetKeyInput) throws Exception { // I am seeing better results with this MyBatis Pooling DataSource for Cursor queries // (parallelism) as opposed to the spring managed DataSource @@ -264,12 +273,11 @@ public String writeCLBIUCNToFile(@NotNull final String datasetKeyInput) throws E final String fileName = exportPath + "/" + datasetKeyInput + "/" + "index.csv"; FileUtils.forceMkdir(new File(exportPath + "/" + datasetKeyInput)); try (SqlSession session = factory.openSession(false); - final CSVWriter writer = new CSVWriter(new FileWriter(fileName))) { + final ICSVWriter writer = new CSVWriterBuilder(new FileWriter(fileName)).withSeparator('$').build()) { final ObjectMapper objectMapper = new ObjectMapper(); final StatefulBeanToCsv sbc = new StatefulBeanToCsvBuilder(writer) .withQuotechar('\'') - .withSeparator('$') .build(); // Create index writer @@ -294,8 +302,7 @@ public String writeCLBIUCNToFile(@NotNull final String datasetKeyInput) throws E } // write metadata file in JSON format - LOG.info("Records written to file {}: {}", fileName, counter.get()); - return fileName; + LOG.info("ChecklistBank IUCN export written to file {}: {}", fileName, counter.get()); } public static Directory newMemoryIndex(Iterable usages) throws IOException { @@ -309,7 +316,7 @@ public static Directory newMemoryIndex(Iterable usages) throws IOExce for (NameUsage u : usages) { if (u != null && u.getId() != null) { writer.addDocument(toDoc(u)); - counter++; + counter ++; } } writer.close(); @@ -392,7 +399,7 @@ public void writeJoinIndex(String tempIndexPath, String ancillaryIndexPath, bool LOG.info("Ancillary index written: {} documents.", counter.get()); } catch (Exception e) { - LOG.error("Error writing documents to CSV: {}", e.getMessage()); + LOG.error("Error writing documents to ancillary index: {}", e.getMessage(), e); } } @@ -426,7 +433,11 @@ public Map loadHierarchy(IndexSearcher searcher, String id) { } @Transactional - public void indexFile(String exportPath, String indexPath) throws Exception { + public void createMainIndexFromFile(String exportPath, String indexPath) throws Exception { + indexFile(exportPath, indexPath + "/main"); + } + + private void indexFile(String exportPath, String indexPath) throws Exception { // Create index directory Path indexDirectory = initialiseIndexDirectory(indexPath); @@ -540,7 +551,7 @@ protected static Document toDoc(NameUsage nameUsage) { cultivar or strain information. Infrageneric names are represented without a leading genus. Unicode characters are replaced by their matching ASCII characters." */ - Rank rank = Rank.valueOf(nameUsage.getRank()); + Rank rank = Rank.valueOf(nameUsage.getRank()); Optional optCanonical = Optional.empty(); try { diff --git a/matching-ws/src/main/java/life/catalogue/matching/Main.java b/matching-ws/src/main/java/life/catalogue/matching/Main.java index 9b89b1856..20637c08c 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/Main.java +++ b/matching-ws/src/main/java/life/catalogue/matching/Main.java @@ -7,14 +7,24 @@ import org.springframework.boot.SpringApplication; +import java.util.ArrayList; +import java.util.List; + /** * Main application class for the matching-ws module. */ @Parameters(separators = "=") public class Main { + public static final String CLB_DATASET_ID = "clb.dataset.id"; + public static final String CLB_IDENTIFIER_DATASET_IDS = "clb.identifier.dataset.ids"; + public static final String CLB_IUCN_DATASET_ID = "clb.iucn.dataset.id"; + public static final String EXPORT_PATH = "export.path"; + public static final String INDEX_PATH = "index.path"; + public static final String V_1_ENABLED = "v1.enabled"; + @Parameter(names = {"--mode"}, order = 1, description = "The mode to use, Defaults to WEB_APP, which will run the web services and will attempt to read the index" + - " from the --index.path ", converter = ExecutionModeConverter.class) + " from the --" + INDEX_PATH + " ", converter = ExecutionModeConverter.class) private ExecutionMode mode = ExecutionMode.WEB_APP; @Parameter(names = {"--clb.url"}, description = "ChecklistBank JDBC URL") @@ -26,25 +36,23 @@ public class Main { @Parameter(names = {"--clb.password"}, description = "ChecklistBank database password") private String clbPassword; - @Parameter(names = {"--clb.dataset.id"}, description = "ChecklistBank dataset ID to create an index for or to export to CSV. " + + @Parameter(names = {"--" + CLB_DATASET_ID}, description = "ChecklistBank dataset ID to create an index for or to export to CSV. " + "Required for INDEX_DB and EXPORT_CSV modes") private String datasetId; - @Parameter(names = {"--clb.status.dataset.ids"}, description = "ChecklistBank dataset ID to create an index for or to export to CSV. " + - "Required for INDEX_DB and EXPORT_CSV modes") - private String statusDatasetIds; + @Parameter(names = {"--" + CLB_IUCN_DATASET_ID}, description = "ChecklistBank dataset ID for the IUCN checklist.") + private String statusDatasetId; - @Parameter(names = {"--clb.identifier.dataset.ids"}, description = "ChecklistBank dataset ID to create an index for or to export to CSV. " + - "Required for INDEX_DB and EXPORT_CSV modes") - private String identifierDatasetIds; + @Parameter(names = {"--" + CLB_IDENTIFIER_DATASET_IDS}, description = "ChecklistBank dataset IDs to index for identifier matching.", arity = 1) + private List identifierDatasetIds = new ArrayList<>(); - @Parameter(names = {"--index.path"}, description = "File system path to the pre-generated lucene index") + @Parameter(names = {"--" + INDEX_PATH}, description = "File system path to the pre-generated lucene index") private String indexPath = "/data/matching-ws/index"; - @Parameter(names = {"--export.path"}, description = "File system path to write exports from ChecklistBank to") + @Parameter(names = {"--" + EXPORT_PATH}, description = "File system path to write exports from ChecklistBank to") private String exportPath = "/data/matching-ws/export"; - @Parameter(names = {"--v1.enabled"}, description = "Enable v1 support for the web service", arity = 1) + @Parameter(names = {"--" + V_1_ENABLED}, description = "Enable v1 support for the web service", arity = 1) private boolean v1Enabled = false; @Parameter(names = {"--server.port"}, description = "Enable v1 support for the web service", arity = 1) @@ -71,7 +79,7 @@ public static void main(String[] args) throws Exception { if ((app.mode == ExecutionMode.INDEX_DB || app.mode == ExecutionMode.EXPORT_CSV) && app.datasetId == null) { - System.err.println("Missing required parameter for mode " + app.mode + " --clb.dataset.id"); + System.err.println("Missing required parameter for mode " + app.mode + " --" + CLB_DATASET_ID); commander.usage(); return; } @@ -82,7 +90,7 @@ public static void main(String[] args) throws Exception { SpringApplication springApplication; switch (app.mode) { - case EXPORT_CSV, INDEX_CSV, INDEX_DB, INDEX_IUCN_CSV, INDEX_IDENTIFIER_CSV: + case BUILD_INDEX,EXPORT_CSV, INDEX_CSV, INDEX_DB, INDEX_IUCN_CSV, INDEX_IDENTIFIER_CSV: springApplication = new SpringApplication(IndexingApplication.class); springApplication.setAdditionalProfiles("indexing"); springApplication.run(args).close(); @@ -97,6 +105,7 @@ public static void main(String[] args) throws Exception { } enum ExecutionMode { + BUILD_INDEX, EXPORT_CSV, INDEX_IUCN_CSV, INDEX_IDENTIFIER_CSV,