From ae3ae8fb09df77dfd533d1915f432d53438cb89a Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Thu, 30 May 2024 20:43:40 +0100 Subject: [PATCH] WIP - Checkpoint commit for Index generation for IUCN and ID lookups for WoRMS Switch to using opencsv https://github.com/CatalogueOfLife/backend/issues/1321 --- matching-ws/pom.xml | 4 +- .../life/catalogue/matching/DatasetIndex.java | 143 +++++++- .../catalogue/matching/IndexConstants.java | 3 + .../matching/IndexingApplication.java | 12 + .../catalogue/matching/IndexingMapper.java | 2 + .../catalogue/matching/IndexingService.java | 333 ++++++++++++++---- .../java/life/catalogue/matching/Main.java | 17 +- .../catalogue/matching/MatchController.java | 14 +- .../life/catalogue/matching/NameNRank.java | 2 +- .../life/catalogue/matching/NameUsage.java | 32 +- .../catalogue/matching/NameUsageMatch.java | 2 + .../java/life/catalogue/matching/Status.java | 12 + .../catalogue/matching/IndexingMapper.xml | 37 +- .../catalogue/matching/DatasetIndexTest.java | 11 +- 14 files changed, 501 insertions(+), 123 deletions(-) create mode 100644 matching-ws/src/main/java/life/catalogue/matching/Status.java diff --git a/matching-ws/pom.xml b/matching-ws/pom.xml index 02d50e0c5..fdd885fa6 100644 --- a/matching-ws/pom.xml +++ b/matching-ws/pom.xml @@ -251,9 +251,9 @@ 2.17.0 - net.sf.opencsv + com.opencsv opencsv - 2.3 + 5.9 compile diff --git a/matching-ws/src/main/java/life/catalogue/matching/DatasetIndex.java b/matching-ws/src/main/java/life/catalogue/matching/DatasetIndex.java index ec8f0c3b5..ac24be245 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/DatasetIndex.java +++ b/matching-ws/src/main/java/life/catalogue/matching/DatasetIndex.java @@ -1,7 +1,7 @@ package life.catalogue.matching; import static life.catalogue.matching.IndexConstants.*; -import static life.catalogue.matching.IndexingService.analyzer; +import static life.catalogue.matching.IndexingService.scientificNameAnalyzer; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; @@ -29,6 +29,7 @@ import org.apache.lucene.store.MMapDirectory; import org.gbif.nameparser.api.Rank; +import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Value; @@ -43,6 +44,8 @@ public class DatasetIndex { private static Logger LOG = LoggerFactory.getLogger(DatasetIndex.class); private IndexSearcher searcher; + private Map identifierSearchers = new HashMap<>(); + private Map ancillarySearchers = new HashMap<>(); @Value("${index.path:/data/matching-ws/index}") String indexPath; @@ -53,21 +56,73 @@ public class DatasetIndex { /** Attempts to read the index from disk if it exists. */ @PostConstruct void init() { - if (new File(indexPath).exists()) { - LOG.info("Loading lucene index from {}", indexPath); + + final String mainIndexPath = getMainIndexPath(); + + if (new File(mainIndexPath).exists()) { + LOG.info("Loading lucene index from {}", mainIndexPath); try { - initWithDir(new MMapDirectory(Path.of(indexPath))); + initWithDir(new MMapDirectory(Path.of(mainIndexPath))); } catch (IOException e) { LOG.warn("Cannot open lucene index. 
Index not available", e); } + + // load identifier indexes + this.identifierSearchers = new HashMap<>(); + if (Path.of(indexPath + "/identifiers").toFile().exists()) { + try (DirectoryStream stream = Files.newDirectoryStream(Path.of(indexPath + "/identifiers"))) { + for (Path entry : stream) { + if (Files.isDirectory(entry)) { + try { + Directory identifierDir = new MMapDirectory(entry); + DirectoryReader reader = DirectoryReader.open(identifierDir); + identifierSearchers.put(entry.toFile().getName(), new IndexSearcher(reader)); + } catch (IOException e) { + LOG.warn("Cannot open identifiers lucene index {}", entry, e); + } + } + } + } catch (IOException e) { + LOG.error("Cannot read identifiers index directory", e); + } + } else { + LOG.info("Identifiers indexes not found at {}", indexPath + "/identifiers"); + } + + // load ancillary indexes + this.ancillarySearchers = new HashMap<>(); + if (Path.of(indexPath + "/ancillary").toFile().exists()) { + try (DirectoryStream stream = Files.newDirectoryStream(Path.of(indexPath + "/ancillary"))) { + for (Path entry : stream) { + if (Files.isDirectory(entry)) { + try { + Directory ancillaryDir = new MMapDirectory(entry); + DirectoryReader reader = DirectoryReader.open(ancillaryDir); + ancillarySearchers.put(entry.toFile().getName(), new IndexSearcher(reader)); + } catch (IOException e) { + LOG.warn("Cannot open ancillary lucene index {}", entry, e); + } + } + } + } catch (IOException e) { + LOG.error("Cannot read ancillary index directory", e); + } + } else { + LOG.info("Ancillary indexes not found at {}", indexPath + "/ancillary"); + } + } else { - LOG.warn("Lucene index not found at {}", indexPath); + LOG.warn("Lucene index not found at {}", mainIndexPath); } } - void initWithDir(Directory indexDir) { + private @NotNull String getMainIndexPath() { + return indexPath + "/main"; + } + + void initWithDir(Directory indexDirectory) { try { - DirectoryReader reader = DirectoryReader.open(indexDir); + DirectoryReader reader = DirectoryReader.open(indexDirectory); this.searcher = new IndexSearcher(reader); } catch (IOException e) { LOG.warn("Cannot open lucene index. Index not available", e); @@ -83,7 +138,7 @@ public IndexMetadata getIndexMetadata(){ IndexMetadata metadata = new IndexMetadata(); // get size on disk - Path directoryPath = Path.of(indexPath); + Path directoryPath = Path.of(getMainIndexPath()); try { BasicFileAttributes attributes = Files.readAttributes(directoryPath, BasicFileAttributes.class); Instant creationTime = attributes.creationTime().toInstant(); @@ -225,7 +280,7 @@ private IndexSearcher getSearcher() { * @return */ public NameUsageMatch matchByUsageKey(String usageKey) { - Optional docOpt = getByUsageId(usageKey); + Optional docOpt = getByUsageKey(usageKey, true); if (docOpt.isPresent()) { Document doc = docOpt.get(); NameUsageMatch match = fromDoc(doc); @@ -245,13 +300,51 @@ public NameUsageMatch matchByUsageKey(String usageKey) { } } - public Optional getByUsageId(String usageKey) { - Query query = new TermQuery(new Term(FIELD_ID, usageKey)); + public static String escapeQueryChars(String s) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + // These are the special characters that need to be escaped + if (c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || + c == ':' || c == '^' || c == '[' || c == ']' || c == '\"' || c == '{' || + c == '}' || c == '~' || c == '*' || c == '?' 
|| c == '|' || c == '&' || + c == '/' || Character.isWhitespace(c)) { + sb.append('\\'); + } + sb.append(c); + } + return sb.toString(); + } + + public Optional getByUsageKey(String usageKey, boolean allowExternalIDs) { + Query query = new TermQuery(new Term(FIELD_ID, escapeQueryChars(usageKey))); try { TopDocs docs = getSearcher().search(query, 3); if (docs.totalHits.value > 0) { return Optional.of(getSearcher().storedFields().document(docs.scoreDocs[0].doc)); - } else { + } else if (allowExternalIDs) { + + // if join indexes are present, add them to the match + if (identifierSearchers != null){ + for (String datasetKey: identifierSearchers.keySet()){ + IndexSearcher identifierSearcher = identifierSearchers.get(datasetKey); + Query identifierQuery = new TermQuery(new Term(FIELD_ID, usageKey)); + LOG.info("Searching for identifier {} in dataset {}", usageKey, datasetKey); + TopDocs identifierDocs = identifierSearcher.search(identifierQuery, 3); + if (identifierDocs.totalHits.value > 0) { + Document identifierDoc = identifierSearcher.storedFields().document(identifierDocs.scoreDocs[0].doc); + final String joinID = identifierDoc.get(FIELD_JOIN_ID); + Query getByIDQuery = new TermQuery(new Term(FIELD_ID, joinID)); + TopDocs docs2 = getSearcher().search(getByIDQuery, 3); + if (docs2.totalHits.value > 0) { + return Optional.of(getSearcher().storedFields().document(docs2.scoreDocs[0].doc)); + } else { + LOG.warn("Cannot find usage {} in main lucene index after finding it in identifier index for {}", usageKey, datasetKey); + return Optional.empty(); + } + } + } + } return Optional.empty(); } } catch (IOException e) { @@ -274,7 +367,7 @@ public List loadHigherTaxa(String parentID) { List higherTaxa = new ArrayList<>(); while (parentID != null) { - Optional docOpt = getByUsageId(parentID); + Optional docOpt = getByUsageKey(parentID, false); if (docOpt.isEmpty()) { break; } @@ -313,7 +406,7 @@ private NameUsageMatch fromDoc(Document doc) { if (doc.get(FIELD_ACCEPTED_ID) != null) { synonym = true; - Optional accDocOpt = getByUsageId(doc.get(FIELD_ACCEPTED_ID)); + Optional accDocOpt = getByUsageKey(doc.get(FIELD_ACCEPTED_ID), false); if (accDocOpt.isPresent()) { Document accDoc = accDocOpt.get(); u.setAcceptedUsage( @@ -355,6 +448,26 @@ private NameUsageMatch fromDoc(Document doc) { } u.setSynonym(synonym); + // if join indexes are present, add them to the match + for (String datasetKey: ancillarySearchers.keySet()){ + IndexSearcher ancillarySearcher = ancillarySearchers.get(datasetKey); + Query query = new TermQuery(new Term(FIELD_JOIN_ID, doc.get(FIELD_ID) )); + try { + TopDocs docs = ancillarySearcher.search(query, 3); + if (docs.totalHits.value > 0) { + Document ancillaryDoc = ancillarySearcher.storedFields().document(docs.scoreDocs[0].doc); + String status = ancillaryDoc.get(FIELD_CATEGORY); + Status ancillaryStatus = new Status(); + ancillaryStatus.setCategory(status); + ancillaryStatus.setDatasetKey(datasetKey); + ancillaryStatus.setDatasetTitle(""); + u.getAdditionalStatus().add(ancillaryStatus); + } + } catch (IOException e) { + LOG.error("Cannot load usage {} from lucene index", doc.get(FIELD_ID), e); + } + } + String status = doc.get(FIELD_STATUS); u.getDiagnostics().setStatus(TaxonomicStatus.valueOf(status)); @@ -363,7 +476,7 @@ private NameUsageMatch fromDoc(Document doc) { public List matchByName(String name, boolean fuzzySearch, int maxMatches) { // use the same lucene analyzer to normalize input - final String analyzedName = LuceneUtils.analyzeString(analyzer, name).get(0); + final 
String analyzedName = LuceneUtils.analyzeString(scientificNameAnalyzer, name).get(0); LOG.debug( "Analyzed {} query \"{}\" becomes >>{}<<", fuzzySearch ? "fuzzy" : "straight", diff --git a/matching-ws/src/main/java/life/catalogue/matching/IndexConstants.java b/matching-ws/src/main/java/life/catalogue/matching/IndexConstants.java index 7ef6cdf53..0743d838f 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/IndexConstants.java +++ b/matching-ws/src/main/java/life/catalogue/matching/IndexConstants.java @@ -8,4 +8,7 @@ public class IndexConstants { static final String FIELD_RANK = "rank"; static final String FIELD_STATUS = "status"; static final String FIELD_PARENT_ID = "parentId"; + + static final String FIELD_CATEGORY = "category"; + static final String FIELD_JOIN_ID = "joinId"; } diff --git a/matching-ws/src/main/java/life/catalogue/matching/IndexingApplication.java b/matching-ws/src/main/java/life/catalogue/matching/IndexingApplication.java index b97bf2a97..e8a86826b 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/IndexingApplication.java +++ b/matching-ws/src/main/java/life/catalogue/matching/IndexingApplication.java @@ -55,6 +55,18 @@ public void run(ApplicationArguments args) throws Exception { return; } indexingService.runDatasetIndexing(Integer.parseInt(datasetIds.get(0))); + } else if (Main.ExecutionMode.INDEX_IUCN_CSV.name().equals(mode)) { + if (datasetIds == null || datasetIds.isEmpty()) { + System.err.println("Missing required parameter --clb.dataset.id"); + return; + } + indexingService.indexIUCN(datasetIds.get(0)); + } else if (Main.ExecutionMode.INDEX_IDENTIFIER_CSV.name().equals(mode)) { + if (datasetIds == null || datasetIds.isEmpty()) { + System.err.println("Missing required parameter --clb.dataset.id"); + return; + } + indexingService.indexIdentifiers(datasetIds.get(0)); } else { System.err.println("Unrecognized mode: " + mode); } diff --git a/matching-ws/src/main/java/life/catalogue/matching/IndexingMapper.java b/matching-ws/src/main/java/life/catalogue/matching/IndexingMapper.java index cf5abbfcb..b3754119a 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/IndexingMapper.java +++ b/matching-ws/src/main/java/life/catalogue/matching/IndexingMapper.java @@ -6,4 +6,6 @@ public interface IndexingMapper { Cursor getAllForDataset(@Param("datasetKey") int datasetKey); + + Cursor getAllWithExtensionForDataset(@Param("datasetKey") int datasetKey); } diff --git a/matching-ws/src/main/java/life/catalogue/matching/IndexingService.java b/matching-ws/src/main/java/life/catalogue/matching/IndexingService.java index bf08637c3..072c10ddd 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/IndexingService.java +++ b/matching-ws/src/main/java/life/catalogue/matching/IndexingService.java @@ -2,22 +2,22 @@ import static life.catalogue.matching.IndexConstants.*; -import au.com.bytecode.opencsv.CSVReader; -import java.io.File; -import java.io.FileReader; -import java.io.FileWriter; -import java.io.IOException; +import java.io.*; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.HashMap; -import java.util.Map; -import java.util.Optional; +import java.util.*; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Consumer; import java.util.function.Supplier; import java.util.regex.Matcher; import java.util.regex.Pattern; -import au.com.bytecode.opencsv.CSVWriter; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.opencsv.CSVWriter; +import 
com.opencsv.bean.CsvToBean; +import com.opencsv.bean.CsvToBeanBuilder; +import com.opencsv.bean.StatefulBeanToCsv; +import com.opencsv.bean.StatefulBeanToCsvBuilder; import life.catalogue.api.model.ReleaseAttempt; import life.catalogue.api.vocab.DatasetOrigin; import life.catalogue.api.vocab.TaxonomicStatus; @@ -28,11 +28,12 @@ import org.apache.ibatis.session.SqlSession; import org.apache.ibatis.session.SqlSessionFactory; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.KeywordAnalyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.*; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.*; +import org.apache.lucene.search.*; import org.apache.lucene.store.ByteBuffersDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; @@ -45,6 +46,7 @@ import org.mybatis.spring.SqlSessionFactoryBean; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; @@ -52,6 +54,11 @@ /** * Service to index a dataset from the Checklist Bank. + * + * /data/matching-ws/export/ - CSV exports from the Checklist Bank + * /data/matching-ws/index/main - Main lucene index + * /data/matching-ws/index/identifiers - Lucene indexes for IDs lookups + * /data/matching-ws/index/ancillary - Lucene indexes for status values e.g. IUCN */ @Service public class IndexingService { @@ -64,6 +71,9 @@ public class IndexingService { @Value("${export.path:/tmp/matching-export}") String exportPath; + @Value("${temp.path:/tmp/matching-tmp}") + String tempIndexPath; + @Value("${clb.url}") String clbUrl; @@ -76,15 +86,18 @@ public class IndexingService { @Value("${clb.driver}") String clDriver; + @Autowired protected MatchingService matchingService; + private static final String REL_PATTERN_STR = "(\\d+)(?:LX?RC?|R(\\d+))"; private static final Pattern REL_PATTERN = Pattern.compile("^" + REL_PATTERN_STR + "$"); - protected static final ScientificNameAnalyzer analyzer = new ScientificNameAnalyzer(); + protected static final ScientificNameAnalyzer scientificNameAnalyzer = new ScientificNameAnalyzer(); protected static IndexWriterConfig getIndexWriterConfig() { Map analyzerPerField = new HashMap<>(); analyzerPerField.put(FIELD_SCIENTIFIC_NAME, new StandardAnalyzer()); - PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(analyzer, analyzerPerField); + analyzerPerField.put(FIELD_CANONICAL_NAME, scientificNameAnalyzer); + PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper( new KeywordAnalyzer(), analyzerPerField); return new IndexWriterConfig(aWrapper); } @@ -147,7 +160,7 @@ public void writeCLBToFile(@NotNull final String datasetKeyInput) throws Excepti // I am seeing better results with this MyBatis Pooling DataSource for Cursor queries // (parallelism) as opposed to the spring managed DataSource - PooledDataSource dataSource = new PooledDataSource(clDriver, clbUrl, clbUser, clPassword); + final PooledDataSource dataSource = new PooledDataSource(clDriver, clbUrl, clbUser, clPassword); // Create a session factory SqlSessionFactoryBean sessionFactoryBean = new SqlSessionFactoryBean(); 
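// Note: the CSV export below moves from the opencsv 2.x writeNext(String[]) call to the
// 5.x bean API; the column layout now comes from the @CsvBindByName annotations on
// NameUsage, which keeps this writer and the CsvToBean reader in indexFile in sync.
// A minimal sketch of the round trip (the plain FileWriter and path are illustrative only):
//
//   Writer out = new FileWriter("/tmp/example.csv");
//   StatefulBeanToCsv<NameUsage> csv = new StatefulBeanToCsvBuilder<NameUsage>(out)
//       .withSeparator('$')
//       .withQuotechar('\'')
//       .build();
//   csv.write(nameUsage); // header row from the annotations, then one line per bean
//   out.close();
//
//   for (NameUsage u : new CsvToBeanBuilder<NameUsage>(new FileReader("/tmp/example.csv"))
//       .withType(NameUsage.class).withSeparator('$').build()) {
//     // headers are matched back to the same @CsvBindByName annotations
//   }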
sessionFactoryBean.setDataSource(dataSource); @@ -160,7 +173,7 @@ public void writeCLBToFile(@NotNull final String datasetKeyInput) throws Excepti Optional datasetKey = Optional.empty(); try { datasetKey = Optional.of(Integer.parseInt(datasetKeyInput)); - } catch (NumberFormatException e) { + } catch (NumberFormatException ignored) { } if (datasetKey.isEmpty()) { @@ -178,26 +191,17 @@ public void writeCLBToFile(@NotNull final String datasetKeyInput) throws Excepti final String fileName = exportPath + "/" + datasetKeyInput + "/" + "index.csv"; FileUtils.forceMkdir(new File(exportPath + "/" + datasetKeyInput)); try (SqlSession session = factory.openSession(false); - final CSVWriter writer = new CSVWriter(new FileWriter(fileName), '$')) { + final CSVWriter writer = new CSVWriter(new FileWriter(fileName))) { + StatefulBeanToCsv sbc = new StatefulBeanToCsvBuilder(writer) + .withQuotechar('\'') + .withSeparator('$') + .build(); // Create index writer consume( () -> session.getMapper(IndexingMapper.class).getAllForDataset(validDatasetKey), name -> { try { - writer.writeNext( - new String[] { - name.id, - name.parentId, - name.scientificName, - name.authorship, - name.rank, - name.status, - name.nomenclaturalCode, - name.sourceId, - name.sourceDatasetKey, - name.parentSourceId, - name.parentSourceDatasetKey - }); + sbc.write(name); } catch (Exception e) { throw new RuntimeException(e); } @@ -211,10 +215,92 @@ public void writeCLBToFile(@NotNull final String datasetKeyInput) throws Excepti LOG.info("Records written to file {}: {}", fileName, counter.get()); } + @Transactional + public void indexIdentifiers(String datasetKey) throws Exception { + writeCLBToFile(datasetKey); + indexFile(exportPath + "/" + datasetKey, tempIndexPath + "/" + datasetKey); + writeJoinIndex( tempIndexPath + "/" + datasetKey, indexPath + "/identifiers/" + datasetKey, false); + } + + public void indexIUCN(String datasetKey) throws Exception { + writeCLBIUCNToFile(datasetKey); + indexFile(exportPath + "/" + datasetKey, tempIndexPath + "/" + datasetKey); + writeJoinIndex( tempIndexPath + "/" + datasetKey, indexPath + "/ancillary/" + datasetKey, true); + } + + @Transactional + public String writeCLBIUCNToFile(@NotNull final String datasetKeyInput) throws Exception { + + // I am seeing better results with this MyBatis Pooling DataSource for Cursor queries + // (parallelism) as opposed to the spring managed DataSource + PooledDataSource dataSource = new PooledDataSource(clDriver, clbUrl, clbUser, clPassword); + // Create a session factory + SqlSessionFactoryBean sessionFactoryBean = new SqlSessionFactoryBean(); + sessionFactoryBean.setDataSource(dataSource); + SqlSessionFactory factory = sessionFactoryBean.getObject(); + assert factory != null; + factory.getConfiguration().addMapper(IndexingMapper.class); + factory.getConfiguration().addMapper(DatasetMapper.class); + + // resolve the magic keys... 
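+ // Plain integer inputs are used directly as dataset keys; anything matching
+ // REL_PATTERN above (e.g. "3LR" for the latest release of project 3, or "3R42"
+ // for release attempt 42) is resolved to a concrete dataset key via lookupDatasetKey.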
+ Optional datasetKey = Optional.empty();
+ try {
+ datasetKey = Optional.of(Integer.parseInt(datasetKeyInput));
+ } catch (NumberFormatException ignored) {
+ }
+
+ if (datasetKey.isEmpty()) {
+ datasetKey = lookupDatasetKey(factory, datasetKeyInput);
+ }
+
+ if (datasetKey.isEmpty()) {
+ throw new IllegalArgumentException("Invalid dataset key: " + datasetKeyInput);
+ }
+
+ final Integer validDatasetKey = datasetKey.get();
+
+ LOG.info("Writing dataset to file...");
+ final AtomicInteger counter = new AtomicInteger(0);
+ final String fileName = exportPath + "/" + datasetKeyInput + "/" + "index.csv";
+ FileUtils.forceMkdir(new File(exportPath + "/" + datasetKeyInput));
+ try (SqlSession session = factory.openSession(false);
+ final CSVWriter writer = new CSVWriter(new FileWriter(fileName))) {
+
+ final ObjectMapper objectMapper = new ObjectMapper();
+ final StatefulBeanToCsv<NameUsage> sbc = new StatefulBeanToCsvBuilder<NameUsage>(writer)
+ .withQuotechar('\'')
+ .withSeparator('$')
+ .build();
+
+ // stream the name usages, enriched with their IUCN category, to CSV
+ consume(
+ () -> session.getMapper(IndexingMapper.class).getAllWithExtensionForDataset(validDatasetKey),
+ nameUsage -> {
+ try {
+ if (StringUtils.isNotBlank(nameUsage.getExtension())) {
+ // parse the verbatim extension JSON and extract the threat status
+ JsonNode node = objectMapper.readTree(nameUsage.getExtension());
+ nameUsage.setCategory(node.path("iucn:threatStatus").asText());
+ }
+ sbc.write(nameUsage);
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ counter.incrementAndGet();
+ });
+ } finally {
+ dataSource.forceCloseAll();
+ }
+
+ // TODO: write metadata file in JSON format
+ LOG.info("Records written to file {}: {}", fileName, counter.get());
+ return fileName;
+ }
+
public static Directory newMemoryIndex(Iterable usages) throws IOException { LOG.info("Start building a new RAM index"); Directory dir = new ByteBuffersDirectory(); - IndexWriter writer = getIndexWriter(dir); // creates initial index segments @@ -235,6 +321,110 @@ private static IndexWriter getIndexWriter(Directory dir) throws IOException { return new IndexWriter(dir, getIndexWriterConfig()); }
+ public void writeJoinIndex(String tempIndexPath, String ancillaryIndexPath, boolean acceptedOnly) {
+
+ try {
+ // Load temp index directory
+ Directory tempDirectory = FSDirectory.open(Paths.get(tempIndexPath));
+ IndexReader tempReader = DirectoryReader.open(tempDirectory);
+ IndexSearcher searcher = new IndexSearcher(tempReader);
+
+ // Create ancillary index
+ Path indexDirectory = initialiseIndexDirectory(ancillaryIndexPath);
+ Directory ancillaryDirectory = FSDirectory.open(indexDirectory);
+ IndexWriterConfig config = getIndexWriterConfig();
+ config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
+ IndexWriter ancillaryIndexWriter = new IndexWriter(ancillaryDirectory, config);
+
+ // Construct a simple query to get all documents
+ TopDocs results = searcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE);
+ ScoreDoc[] hits = results.scoreDocs;
+
+ AtomicInteger counter = new AtomicInteger(0);
+
+ // Write document data
+ for (ScoreDoc hit : hits) {
+
+ counter.incrementAndGet();
+ Document doc = searcher.storedFields().document(hit.doc);
+ Map<String, String> hierarchy = loadHierarchy(searcher, doc.get(FIELD_ID));
+
+ String status = doc.get(FIELD_STATUS);
+ if (status != null &&
+ acceptedOnly &&
+ !status.equals(TaxonomicStatus.ACCEPTED.name())) {
+ // skip synonyms, otherwise we would index them twice
+ continue;
+ }
+ String scientificName = doc.get(FIELD_SCIENTIFIC_NAME);
+ Classification classification = new Classification();
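+ // Build the Linnean classification from the stored parent hierarchy so this record
+ // can be matched by name and classification to a usage in the main index; the
+ // matched usage key is stored as the joinId used for lookups at query time.
+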
classification.setKingdom(hierarchy.getOrDefault(Rank.KINGDOM.name(), "")); + classification.setPhylum(hierarchy.getOrDefault(Rank.PHYLUM.name(), "")); + classification.setClazz(hierarchy.getOrDefault(Rank.CLASS.name(), "")); + classification.setOrder(hierarchy.getOrDefault(Rank.ORDER.name(), "")); + classification.setFamily(hierarchy.getOrDefault(Rank.FAMILY.name(), "")); + classification.setGenus(hierarchy.getOrDefault(Rank.GENUS.name(), "")); + classification.setSpecies(hierarchy.getOrDefault(Rank.SPECIES.name(), "")); + + if (counter.get() % 100000 == 0) { + LOG.info("Indexed: {} taxa", counter.get()); + } + + // match to main dataset + NameUsageMatch nameUsageMatch = matchingService.match(scientificName, classification, true); + if (nameUsageMatch.getUsage() != null) { + doc.add(new StringField(FIELD_JOIN_ID, + nameUsageMatch.getAcceptedUsage() != null ? nameUsageMatch.getAcceptedUsage().getKey() : + nameUsageMatch.getUsage().getKey(), Field.Store.YES)); + ancillaryIndexWriter.addDocument(doc); + } + } + + // close temp + tempReader.close(); + tempDirectory.close(); + + // close ancillary + ancillaryIndexWriter.commit(); + ancillaryIndexWriter.forceMerge(1); + ancillaryIndexWriter.close(); + ancillaryDirectory.close(); + + LOG.info("Ancillary index written: {} documents.", counter.get()); + } catch (Exception e) { + LOG.error("Error writing documents to CSV: {}", e.getMessage()); + } + } + + public Optional getById(IndexSearcher searcher, String id) { + Query query = new TermQuery(new Term(FIELD_ID, id)); + try { + TopDocs docs = searcher.search(query, 3); + if (docs.totalHits.value > 0) { + return Optional.of(searcher.storedFields().document(docs.scoreDocs[0].doc)); + } else { + return Optional.empty(); + } + } catch (IOException e) { + LOG.error("Cannot load usage {} from lucene index", id, e); + } + return Optional.empty(); + } + + public Map loadHierarchy(IndexSearcher searcher, String id) { + Map classification = new HashMap<>(); + while (id != null) { + Optional docOpt = getById(searcher, id); + if (docOpt.isEmpty()) { + break; + } + Document doc = docOpt.get(); + classification.put(doc.get(FIELD_RANK), doc.get(FIELD_CANONICAL_NAME)); + id = doc.get(FIELD_PARENT_ID); + } + return classification; + } + @Transactional public void indexFile(String exportPath, String indexPath) throws Exception { @@ -251,38 +441,26 @@ public void indexFile(String exportPath, String indexPath) throws Exception { final AtomicInteger counter = new AtomicInteger(0); final String filePath = exportPath + "/index.csv"; - try (CSVReader reader = new CSVReader(new FileReader(filePath), '$', '"'); - IndexWriter indexWriter = new IndexWriter(directory, config)) { + try (Reader reader = new FileReader(filePath); + IndexWriter indexWriter = new IndexWriter(directory, config)) { - String[] row = reader.readNext(); - while (row != null) { - if (row.length != 11) { - LOG.warn("Skipping row with invalid number of columns: {}", String.join(",", row)); - row = reader.readNext(); - continue; - } + CsvToBean csvReader = new CsvToBeanBuilder(reader) + .withType(NameUsage.class) + .withSeparator('$') + .withIgnoreLeadingWhiteSpace(true) + .withIgnoreEmptyLine(true) + .build(); + + Iterator iterator = csvReader.iterator(); + + while (iterator.hasNext()) { if (counter.get() % 100000 == 0) { LOG.info("Indexed: {} taxa", counter.get()); } - - NameUsage nameUsage = - NameUsage.builder() - .id(row[0]) - .parentId(row[1]) - .scientificName(row[2]) - .authorship(row[3]) - .rank(row[4]) - .status(row[5]) - 
.nomenclaturalCode(row[6]) - .sourceId(row[7]) - .sourceDatasetKey(row[8]) - .parentSourceId(row[9]) - .parentSourceDatasetKey(row[10]) - .build(); + NameUsage nameUsage = iterator.next(); Document doc = toDoc(nameUsage); indexWriter.addDocument(doc); counter.incrementAndGet(); - row = reader.readNext(); } LOG.info("Final index commit"); indexWriter.commit(); @@ -362,59 +540,64 @@ protected static Document toDoc(NameUsage nameUsage) { cultivar or strain information. Infrageneric names are represented without a leading genus. Unicode characters are replaced by their matching ASCII characters." */ - Rank rank = Rank.valueOf(nameUsage.rank); + Rank rank = Rank.valueOf(nameUsage.getRank()); Optional optCanonical = Optional.empty(); try { NomCode nomCode = null; - if (!StringUtils.isEmpty(nameUsage.nomenclaturalCode)) { - nomCode = NomCode.valueOf(nameUsage.nomenclaturalCode); + if (!StringUtils.isEmpty(nameUsage.getNomenclaturalCode())) { + nomCode = NomCode.valueOf(nameUsage.getNomenclaturalCode()); } - ParsedName pn = NameParsers.INSTANCE.parse(nameUsage.scientificName, rank, nomCode); + ParsedName pn = NameParsers.INSTANCE.parse(nameUsage.getScientificName(), rank, nomCode); // canonicalMinimal will construct the name without the hybrid marker and authorship String canonical = NameFormatter.canonicalMinimal(pn); optCanonical = Optional.ofNullable(canonical); } catch (UnparsableNameException | InterruptedException e) { // do nothing - LOG.debug("Unable to parse name to create canonical: {}", nameUsage.scientificName); + LOG.debug("Unable to parse name to create canonical: {}", nameUsage.getScientificName()); } - final String canonical = optCanonical.orElse(nameUsage.scientificName); + final String canonical = optCanonical.orElse(nameUsage.getScientificName()); // use custom precision step as we do not need range queries and prefer to save memory usage // instead - doc.add(new StringField(FIELD_ID, nameUsage.id, Field.Store.YES)); + doc.add(new StringField(FIELD_ID, nameUsage.getId(), Field.Store.YES)); // we only store accepted key, no need to index it // If the name is a synonym, then parentId name usage points // to the accepted name - if (nameUsage.status != null - && nameUsage.status.equals(TaxonomicStatus.SYNONYM.name()) - && nameUsage.parentId != null) { - doc.add(new StringField(FIELD_ACCEPTED_ID, nameUsage.parentId, Field.Store.YES)); + if (StringUtils.isNotBlank(nameUsage.getStatus()) + && nameUsage.getStatus().equals(TaxonomicStatus.SYNONYM.name()) + && nameUsage.getParentId() != null) { + doc.add(new StringField(FIELD_ACCEPTED_ID, nameUsage.getParentId(), Field.Store.YES)); } // analyzed name field - this is what we search upon doc.add(new TextField(FIELD_CANONICAL_NAME, canonical, Field.Store.YES)); // store full name and classification only to return a full match object for hits - String nameComplete = nameUsage.scientificName; - if (StringUtils.isNotBlank(nameUsage.authorship)) { - nameComplete += " " + nameUsage.authorship; + String nameComplete = nameUsage.getScientificName(); + if (StringUtils.isNotBlank(nameUsage.getAuthorship())) { + nameComplete += " " + nameUsage.getAuthorship(); } doc.add(new TextField(FIELD_SCIENTIFIC_NAME, nameComplete, Field.Store.YES)); // this lucene index is not persistent, so not risk in changing ordinal numbers - doc.add(new StringField(FIELD_RANK, nameUsage.rank, Field.Store.YES)); + doc.add(new StringField(FIELD_RANK, nameUsage.getRank(), Field.Store.YES)); - if (nameUsage.parentId != null && !nameUsage.parentId.equals(nameUsage.id)) { - 
doc.add(new StringField(FIELD_PARENT_ID, nameUsage.parentId, Field.Store.YES));
+ if (StringUtils.isNotBlank(nameUsage.getParentId()) && !nameUsage.getParentId().equals(nameUsage.getId())) {
+ doc.add(new StringField(FIELD_PARENT_ID, nameUsage.getParentId(), Field.Store.YES));
}
- if (nameUsage.status != null) {
- doc.add(new StringField(FIELD_STATUS, nameUsage.status, Field.Store.YES));
+ if (StringUtils.isNotBlank(nameUsage.getStatus())) {
+ doc.add(new StringField(FIELD_STATUS, nameUsage.getStatus(), Field.Store.YES));
}
+
+ if (StringUtils.isNotBlank(nameUsage.getCategory())) {
+ doc.add(new StringField(FIELD_CATEGORY, nameUsage.getCategory(), Field.Store.YES));
+ }
+
return doc; }
diff --git a/matching-ws/src/main/java/life/catalogue/matching/Main.java b/matching-ws/src/main/java/life/catalogue/matching/Main.java index 3b7e19672..9b89b1856 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/Main.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/Main.java
@@ -30,6 +30,14 @@ public class Main {
"Required for INDEX_DB and EXPORT_CSV modes") private String datasetId;
+ @Parameter(names = {"--clb.status.dataset.ids"}, description = "ChecklistBank dataset IDs of datasets providing status values (e.g. IUCN threat status) to index for ancillary status joins") private String statusDatasetIds;
+
+ @Parameter(names = {"--clb.identifier.dataset.ids"}, description = "ChecklistBank dataset IDs of datasets providing external identifiers (e.g. WoRMS) to index for ID lookups") private String identifierDatasetIds;
+
@Parameter(names = {"--index.path"}, description = "File system path to the pre-generated lucene index") private String indexPath = "/data/matching-ws/index";
@@ -61,7 +69,8 @@ public static void main(String[] args) throws Exception { commander.usage(); }
- if ((app.mode == ExecutionMode.INDEX_DB || app.mode == ExecutionMode.EXPORT_CSV) && app.datasetId == null) {
+ if ((app.mode == ExecutionMode.INDEX_DB
+ || app.mode == ExecutionMode.EXPORT_CSV) && app.datasetId == null) {
System.err.println("Missing required parameter for mode " + app.mode + " --clb.dataset.id"); commander.usage(); return;
@@ -73,10 +82,10 @@
SpringApplication springApplication; switch (app.mode) {
- case EXPORT_CSV, INDEX_CSV, INDEX_DB:
+ case EXPORT_CSV, INDEX_CSV, INDEX_DB, INDEX_IUCN_CSV, INDEX_IDENTIFIER_CSV:
springApplication = new SpringApplication(IndexingApplication.class); springApplication.setAdditionalProfiles("indexing");
- springApplication.run( args).close();
+ springApplication.run(args).close();
break; case WEB_APP: SpringApplication webApp = new SpringApplication(MatchingApplication.class);
@@ -89,6 +98,8 @@
enum ExecutionMode { EXPORT_CSV,
+ INDEX_IUCN_CSV,
+ INDEX_IDENTIFIER_CSV,
INDEX_CSV, INDEX_DB, WEB_APP
diff --git a/matching-ws/src/main/java/life/catalogue/matching/MatchController.java b/matching-ws/src/main/java/life/catalogue/matching/MatchController.java index 0c393efe8..48c920b42 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/MatchController.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/MatchController.java
@@ -69,14 +69,6 @@ private boolean isTraceRequested(String traceRequested) { (traceRequested.equalsIgnoreCase("true") || traceRequested.equalsIgnoreCase("on")); }
- private boolean getTraceParameter(HttpServletRequest request) { String parameter = request.getParameter("trace"); if (parameter == null) { -
return false; - } - return !"false".equalsIgnoreCase(parameter); - } - @Autowired public MatchController(ErrorAttributes errorAttributes) { Assert.notNull(errorAttributes, "ErrorAttributes must not be null"); @@ -138,6 +130,7 @@ public NameUsageMatch matchOldPaths( HttpServletRequest response) { return matchV2( usageKey, + null,null,null, scientificName2, scientificName, authorship, authorship2, removeNulls(genericName), @@ -215,6 +208,9 @@ public NameUsageMatch matchOldPaths( produces = "application/json") public NameUsageMatch matchV2( @RequestParam(value = "usageKey", required = false) String usageKey, + @RequestParam(value = "taxonID", required = false) String taxonID, + @RequestParam(value = "taxonConceptID", required = false) String taxonConceptID, + @RequestParam(value = "scientificNameID", required = false) String scientificNameID, @RequestParam(value = "name", required = false) String scientificName2, @RequestParam(value = "scientificName", required = false) String scientificName, @RequestParam(value = "authorship", required = false) String authorship2, @@ -231,7 +227,7 @@ public NameUsageMatch matchV2( // ugly, i know, but jackson/spring isn't working with @JsonProperty classification.setClazz(response.getParameter("class")); return matchingService.match( - removeNulls(usageKey), + first(removeNulls(usageKey), removeNulls(taxonID), removeNulls(taxonConceptID), removeNulls(scientificNameID)), first(removeNulls(scientificName), removeNulls(scientificName2)), first(removeNulls(authorship), removeNulls(authorship2)), removeNulls(genericName), diff --git a/matching-ws/src/main/java/life/catalogue/matching/NameNRank.java b/matching-ws/src/main/java/life/catalogue/matching/NameNRank.java index cd9247e83..26ce91366 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/NameNRank.java +++ b/matching-ws/src/main/java/life/catalogue/matching/NameNRank.java @@ -203,7 +203,7 @@ protected static boolean isSimpleBinomial(String name) { } private static void warnIfMissing(String name, @Nullable String epithet, String part) { - if (exists(epithet) && !name.toLowerCase().contains(epithet.toLowerCase())) { + if (exists(epithet) && name != null && !name.toLowerCase().contains(epithet.toLowerCase())) { LOG.warn("ScientificName >{}< missing {}: {}", name, part, epithet); } } diff --git a/matching-ws/src/main/java/life/catalogue/matching/NameUsage.java b/matching-ws/src/main/java/life/catalogue/matching/NameUsage.java index 6231109aa..903a9c861 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/NameUsage.java +++ b/matching-ws/src/main/java/life/catalogue/matching/NameUsage.java @@ -1,23 +1,41 @@ package life.catalogue.matching; -import lombok.Builder; -import lombok.Data; -import lombok.EqualsAndHashCode; +import com.opencsv.bean.CsvBindByName; + +import lombok.*; /** A simple class to represent a name usage ready to be indexed. 
*/ @Data @EqualsAndHashCode @Builder +@NoArgsConstructor +@AllArgsConstructor public class NameUsage { + + @CsvBindByName(column = "id") String id; + + @CsvBindByName(column = "parentId") String parentId; + + @CsvBindByName(column = "scientificName") String scientificName; + + @CsvBindByName(column = "authorship") String authorship; + + @CsvBindByName(column = "status") String status; + + @CsvBindByName(column = "rank") String rank; + + @CsvBindByName(column = "nomenclaturalCode") String nomenclaturalCode; - String sourceId; - String sourceDatasetKey; - String parentSourceId; - String parentSourceDatasetKey; + + @CsvBindByName(column = "category") + String category; + + @CsvBindByName(column = "extension") + private String extension; } diff --git a/matching-ws/src/main/java/life/catalogue/matching/NameUsageMatch.java b/matching-ws/src/main/java/life/catalogue/matching/NameUsageMatch.java index 2e567c711..a1c279b18 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/NameUsageMatch.java +++ b/matching-ws/src/main/java/life/catalogue/matching/NameUsageMatch.java @@ -32,6 +32,8 @@ public class NameUsageMatch implements LinneanClassification { List alternatives = new ArrayList<>(); @Schema(description = "Diagnostics for a name match including the type of match and confidence level") Diagnostics diagnostics = new Diagnostics(); + @Schema(description = "Status information from external sources like IUCN Red List") + List additionalStatus = new ArrayList<>(); private String nameFor(Rank rank) { return getClassification().stream() diff --git a/matching-ws/src/main/java/life/catalogue/matching/Status.java b/matching-ws/src/main/java/life/catalogue/matching/Status.java new file mode 100644 index 000000000..0bef13645 --- /dev/null +++ b/matching-ws/src/main/java/life/catalogue/matching/Status.java @@ -0,0 +1,12 @@ +package life.catalogue.matching; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import lombok.Data; + +@Data +@JsonIgnoreProperties(ignoreUnknown = true) +public class Status { + private String datasetKey; + private String datasetTitle; + private String category; +} \ No newline at end of file diff --git a/matching-ws/src/main/resources/life/catalogue/matching/IndexingMapper.xml b/matching-ws/src/main/resources/life/catalogue/matching/IndexingMapper.xml index 36bef0554..82d2893e3 100644 --- a/matching-ws/src/main/resources/life/catalogue/matching/IndexingMapper.xml +++ b/matching-ws/src/main/resources/life/catalogue/matching/IndexingMapper.xml @@ -16,19 +16,40 @@ n.rank as rank, nu.status as status, n.code as nomenclaturalCode, - vs.source_id as sourceId, - vs.source_dataset_key as sourceDatasetKey, - pvs.source_id as parentSourceId, - pvs.source_dataset_key as parentSourceDatasetKey + '' as extension, + '' as category FROM name_usage nu INNER JOIN name n on n.id = nu.name_id AND n.dataset_key=#{datasetKey} - LEFT JOIN - verbatim_source vs on vs.id = nu.id AND vs.dataset_key=#{datasetKey} - LEFT JOIN - verbatim_source pvs on pvs.id = nu.parent_id AND pvs.dataset_key=#{datasetKey} WHERE nu.dataset_key = #{datasetKey} + + + diff --git a/matching-ws/src/test/java/life/catalogue/matching/DatasetIndexTest.java b/matching-ws/src/test/java/life/catalogue/matching/DatasetIndexTest.java index 6b9b8cfa9..72609af83 100644 --- a/matching-ws/src/test/java/life/catalogue/matching/DatasetIndexTest.java +++ b/matching-ws/src/test/java/life/catalogue/matching/DatasetIndexTest.java @@ -15,13 +15,17 @@ import static org.junit.jupiter.api.Assertions.*; -import 
au.com.bytecode.opencsv.CSVReader; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.google.common.io.Resources; import java.io.InputStream; import java.io.InputStreamReader; import java.util.List; + +import com.opencsv.CSVParserBuilder; +import com.opencsv.CSVReader; +import com.opencsv.CSVReaderBuilder; + import life.catalogue.api.vocab.TaxonomicStatus; import org.apache.commons.lang3.StringUtils; import org.gbif.nameparser.api.Rank; @@ -42,8 +46,9 @@ public static void buildMatcher() throws Exception { public static List readTestNames() throws Exception { List usages = Lists.newArrayList(); // 1 2 Acanthophora Hulst, 1896 Geometridae Lepidoptera Insecta Arthropoda Animalia GENUS - try (InputStream testFile = Resources.getResource("testNames.txt").openStream()) { - CSVReader reader = new CSVReader(new InputStreamReader(testFile), '\t', '"'); + try (InputStream testFile = Resources.getResource("testNames.txt").openStream(); + CSVReader reader = new CSVReaderBuilder(new InputStreamReader(testFile)) + .withCSVParser(new CSVParserBuilder().withSeparator('\t').build()).build()) { String[] row = reader.readNext(); while (row != null) { NameUsage n = NameUsage.builder().build();
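
For reference, the read path the new indexes enable, as a minimal self-contained sketch: an external identifier is first resolved in a per-dataset identifier index, the usage is then fetched from the main index via the joinId, and ancillary status records are joined back by the same key. It assumes the on-disk layout documented in IndexingService (/data/matching-ws/index/{main,identifiers,ancillary}); the subdirectory names "worms" and "iucn", the example LSID, and the "id" and "scientificName" field literals are illustrative, while "joinId" and "category" come from IndexConstants.

import java.nio.file.Path;
import java.util.Optional;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.MMapDirectory;

public class LookupSketch {

  public static void main(String[] args) throws Exception {
    try (DirectoryReader main = open("/data/matching-ws/index/main");
         DirectoryReader worms = open("/data/matching-ws/index/identifiers/worms");
         DirectoryReader iucn = open("/data/matching-ws/index/ancillary/iucn")) {

      IndexSearcher mainSearcher = new IndexSearcher(main);

      // step 1: resolve the external identifier in the per-dataset identifier index
      Optional<Document> idDoc = firstHit(new IndexSearcher(worms),
          new Term("id", "urn:lsid:marinespecies.org:taxname:126436"));
      if (idDoc.isEmpty()) return;

      // step 2: follow the joinId to the usage in the main index
      Optional<Document> usage = firstHit(mainSearcher, new Term("id", idDoc.get().get("joinId")));
      if (usage.isEmpty()) return;

      // step 3: join any ancillary status records back by the usage key
      Optional<Document> status = firstHit(new IndexSearcher(iucn),
          new Term("joinId", usage.get().get("id")));

      System.out.println(usage.get().get("scientificName")
          + " -> " + status.map(d -> d.get("category")).orElse("no status"));
    }
  }

  static DirectoryReader open(String path) throws Exception {
    return DirectoryReader.open(new MMapDirectory(Path.of(path)));
  }

  static Optional<Document> firstHit(IndexSearcher searcher, Term term) throws Exception {
    TopDocs docs = searcher.search(new TermQuery(term), 1);
    return docs.totalHits.value > 0
        ? Optional.of(searcher.storedFields().document(docs.scoreDocs[0].doc))
        : Optional.empty();
  }
}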