diff --git a/matching-ws/pom.xml b/matching-ws/pom.xml
index 02d50e0c5..fdd885fa6 100644
--- a/matching-ws/pom.xml
+++ b/matching-ws/pom.xml
@@ -251,9 +251,9 @@
2.17.0
- net.sf.opencsv
+ com.opencsv
opencsv
- 2.3
+ 5.9
compile
diff --git a/matching-ws/src/main/java/life/catalogue/matching/DatasetIndex.java b/matching-ws/src/main/java/life/catalogue/matching/DatasetIndex.java
index ec8f0c3b5..ac24be245 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/DatasetIndex.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/DatasetIndex.java
@@ -1,7 +1,7 @@
package life.catalogue.matching;
import static life.catalogue.matching.IndexConstants.*;
-import static life.catalogue.matching.IndexingService.analyzer;
+import static life.catalogue.matching.IndexingService.scientificNameAnalyzer;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
@@ -29,6 +29,7 @@
import org.apache.lucene.store.MMapDirectory;
import org.gbif.nameparser.api.Rank;
+import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
@@ -43,6 +44,8 @@ public class DatasetIndex {
private static Logger LOG = LoggerFactory.getLogger(DatasetIndex.class);
private IndexSearcher searcher;
+ private Map identifierSearchers = new HashMap<>();
+ private Map ancillarySearchers = new HashMap<>();
@Value("${index.path:/data/matching-ws/index}")
String indexPath;
@@ -53,21 +56,73 @@ public class DatasetIndex {
/** Attempts to read the index from disk if it exists. */
@PostConstruct
void init() {
- if (new File(indexPath).exists()) {
- LOG.info("Loading lucene index from {}", indexPath);
+
+ final String mainIndexPath = getMainIndexPath();
+
+ if (new File(mainIndexPath).exists()) {
+ LOG.info("Loading lucene index from {}", mainIndexPath);
try {
- initWithDir(new MMapDirectory(Path.of(indexPath)));
+ initWithDir(new MMapDirectory(Path.of(mainIndexPath)));
} catch (IOException e) {
LOG.warn("Cannot open lucene index. Index not available", e);
}
+
+ // load identifier indexes
+ this.identifierSearchers = new HashMap<>();
+ if (Path.of(indexPath + "/identifiers").toFile().exists()) {
+ try (DirectoryStream stream = Files.newDirectoryStream(Path.of(indexPath + "/identifiers"))) {
+ for (Path entry : stream) {
+ if (Files.isDirectory(entry)) {
+ try {
+ Directory identifierDir = new MMapDirectory(entry);
+ DirectoryReader reader = DirectoryReader.open(identifierDir);
+ identifierSearchers.put(entry.toFile().getName(), new IndexSearcher(reader));
+ } catch (IOException e) {
+ LOG.warn("Cannot open identifiers lucene index {}", entry, e);
+ }
+ }
+ }
+ } catch (IOException e) {
+ LOG.error("Cannot read identifiers index directory", e);
+ }
+ } else {
+        LOG.info("Identifier indexes not found at {}", indexPath + "/identifiers");
+ }
+
+ // load ancillary indexes
+ this.ancillarySearchers = new HashMap<>();
+ if (Path.of(indexPath + "/ancillary").toFile().exists()) {
+ try (DirectoryStream stream = Files.newDirectoryStream(Path.of(indexPath + "/ancillary"))) {
+ for (Path entry : stream) {
+ if (Files.isDirectory(entry)) {
+ try {
+ Directory ancillaryDir = new MMapDirectory(entry);
+ DirectoryReader reader = DirectoryReader.open(ancillaryDir);
+ ancillarySearchers.put(entry.toFile().getName(), new IndexSearcher(reader));
+ } catch (IOException e) {
+ LOG.warn("Cannot open ancillary lucene index {}", entry, e);
+ }
+ }
+ }
+ } catch (IOException e) {
+ LOG.error("Cannot read ancillary index directory", e);
+ }
+ } else {
+ LOG.info("Ancillary indexes not found at {}", indexPath + "/ancillary");
+ }
+
} else {
- LOG.warn("Lucene index not found at {}", indexPath);
+ LOG.warn("Lucene index not found at {}", mainIndexPath);
}
}
- void initWithDir(Directory indexDir) {
+ private @NotNull String getMainIndexPath() {
+ return indexPath + "/main";
+ }
+
+ void initWithDir(Directory indexDirectory) {
try {
- DirectoryReader reader = DirectoryReader.open(indexDir);
+ DirectoryReader reader = DirectoryReader.open(indexDirectory);
this.searcher = new IndexSearcher(reader);
} catch (IOException e) {
LOG.warn("Cannot open lucene index. Index not available", e);
@@ -83,7 +138,7 @@ public IndexMetadata getIndexMetadata(){
IndexMetadata metadata = new IndexMetadata();
// get size on disk
- Path directoryPath = Path.of(indexPath);
+ Path directoryPath = Path.of(getMainIndexPath());
try {
BasicFileAttributes attributes = Files.readAttributes(directoryPath, BasicFileAttributes.class);
Instant creationTime = attributes.creationTime().toInstant();
@@ -225,7 +280,7 @@ private IndexSearcher getSearcher() {
* @return
*/
public NameUsageMatch matchByUsageKey(String usageKey) {
- Optional docOpt = getByUsageId(usageKey);
+ Optional docOpt = getByUsageKey(usageKey, true);
if (docOpt.isPresent()) {
Document doc = docOpt.get();
NameUsageMatch match = fromDoc(doc);
@@ -245,13 +300,51 @@ public NameUsageMatch matchByUsageKey(String usageKey) {
}
}
- public Optional getByUsageId(String usageKey) {
- Query query = new TermQuery(new Term(FIELD_ID, usageKey));
+ public static String escapeQueryChars(String s) {
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < s.length(); i++) {
+ char c = s.charAt(i);
+ // These are the special characters that need to be escaped
+ if (c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' ||
+ c == ':' || c == '^' || c == '[' || c == ']' || c == '\"' || c == '{' ||
+ c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&' ||
+ c == '/' || Character.isWhitespace(c)) {
+ sb.append('\\');
+ }
+ sb.append(c);
+ }
+ return sb.toString();
+ }
+
+ public Optional getByUsageKey(String usageKey, boolean allowExternalIDs) {
+ Query query = new TermQuery(new Term(FIELD_ID, escapeQueryChars(usageKey)));
try {
TopDocs docs = getSearcher().search(query, 3);
if (docs.totalHits.value > 0) {
return Optional.of(getSearcher().storedFields().document(docs.scoreDocs[0].doc));
- } else {
+ } else if (allowExternalIDs) {
+
+ // if join indexes are present, add them to the match
+ if (identifierSearchers != null){
+ for (String datasetKey: identifierSearchers.keySet()){
+ IndexSearcher identifierSearcher = identifierSearchers.get(datasetKey);
+ Query identifierQuery = new TermQuery(new Term(FIELD_ID, usageKey));
+ LOG.info("Searching for identifier {} in dataset {}", usageKey, datasetKey);
+ TopDocs identifierDocs = identifierSearcher.search(identifierQuery, 3);
+ if (identifierDocs.totalHits.value > 0) {
+ Document identifierDoc = identifierSearcher.storedFields().document(identifierDocs.scoreDocs[0].doc);
+ final String joinID = identifierDoc.get(FIELD_JOIN_ID);
+ Query getByIDQuery = new TermQuery(new Term(FIELD_ID, joinID));
+ TopDocs docs2 = getSearcher().search(getByIDQuery, 3);
+ if (docs2.totalHits.value > 0) {
+ return Optional.of(getSearcher().storedFields().document(docs2.scoreDocs[0].doc));
+ } else {
+ LOG.warn("Cannot find usage {} in main lucene index after finding it in identifier index for {}", usageKey, datasetKey);
+ return Optional.empty();
+ }
+ }
+ }
+ }
return Optional.empty();
}
} catch (IOException e) {
@@ -274,7 +367,7 @@ public List loadHigherTaxa(String parentID) {
List higherTaxa = new ArrayList<>();
while (parentID != null) {
- Optional docOpt = getByUsageId(parentID);
+ Optional docOpt = getByUsageKey(parentID, false);
if (docOpt.isEmpty()) {
break;
}
@@ -313,7 +406,7 @@ private NameUsageMatch fromDoc(Document doc) {
if (doc.get(FIELD_ACCEPTED_ID) != null) {
synonym = true;
- Optional accDocOpt = getByUsageId(doc.get(FIELD_ACCEPTED_ID));
+ Optional accDocOpt = getByUsageKey(doc.get(FIELD_ACCEPTED_ID), false);
if (accDocOpt.isPresent()) {
Document accDoc = accDocOpt.get();
u.setAcceptedUsage(
@@ -355,6 +448,26 @@ private NameUsageMatch fromDoc(Document doc) {
}
u.setSynonym(synonym);
+ // if join indexes are present, add them to the match
+ for (String datasetKey: ancillarySearchers.keySet()){
+ IndexSearcher ancillarySearcher = ancillarySearchers.get(datasetKey);
+ Query query = new TermQuery(new Term(FIELD_JOIN_ID, doc.get(FIELD_ID) ));
+ try {
+ TopDocs docs = ancillarySearcher.search(query, 3);
+ if (docs.totalHits.value > 0) {
+ Document ancillaryDoc = ancillarySearcher.storedFields().document(docs.scoreDocs[0].doc);
+ String status = ancillaryDoc.get(FIELD_CATEGORY);
+ Status ancillaryStatus = new Status();
+ ancillaryStatus.setCategory(status);
+ ancillaryStatus.setDatasetKey(datasetKey);
+ ancillaryStatus.setDatasetTitle("");
+ u.getAdditionalStatus().add(ancillaryStatus);
+ }
+ } catch (IOException e) {
+ LOG.error("Cannot load usage {} from lucene index", doc.get(FIELD_ID), e);
+ }
+ }
+
String status = doc.get(FIELD_STATUS);
u.getDiagnostics().setStatus(TaxonomicStatus.valueOf(status));
@@ -363,7 +476,7 @@ private NameUsageMatch fromDoc(Document doc) {
public List matchByName(String name, boolean fuzzySearch, int maxMatches) {
// use the same lucene analyzer to normalize input
- final String analyzedName = LuceneUtils.analyzeString(analyzer, name).get(0);
+ final String analyzedName = LuceneUtils.analyzeString(scientificNameAnalyzer, name).get(0);
LOG.debug(
"Analyzed {} query \"{}\" becomes >>{}<<",
fuzzySearch ? "fuzzy" : "straight",
diff --git a/matching-ws/src/main/java/life/catalogue/matching/IndexConstants.java b/matching-ws/src/main/java/life/catalogue/matching/IndexConstants.java
index 7ef6cdf53..0743d838f 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/IndexConstants.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/IndexConstants.java
@@ -8,4 +8,7 @@ public class IndexConstants {
static final String FIELD_RANK = "rank";
static final String FIELD_STATUS = "status";
static final String FIELD_PARENT_ID = "parentId";
+
+ static final String FIELD_CATEGORY = "category";
+ static final String FIELD_JOIN_ID = "joinId";
}
diff --git a/matching-ws/src/main/java/life/catalogue/matching/IndexingApplication.java b/matching-ws/src/main/java/life/catalogue/matching/IndexingApplication.java
index b97bf2a97..e8a86826b 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/IndexingApplication.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/IndexingApplication.java
@@ -55,6 +55,18 @@ public void run(ApplicationArguments args) throws Exception {
return;
}
indexingService.runDatasetIndexing(Integer.parseInt(datasetIds.get(0)));
+ } else if (Main.ExecutionMode.INDEX_IUCN_CSV.name().equals(mode)) {
+ if (datasetIds == null || datasetIds.isEmpty()) {
+ System.err.println("Missing required parameter --clb.dataset.id");
+ return;
+ }
+ indexingService.indexIUCN(datasetIds.get(0));
+ } else if (Main.ExecutionMode.INDEX_IDENTIFIER_CSV.name().equals(mode)) {
+ if (datasetIds == null || datasetIds.isEmpty()) {
+ System.err.println("Missing required parameter --clb.dataset.id");
+ return;
+ }
+ indexingService.indexIdentifiers(datasetIds.get(0));
} else {
System.err.println("Unrecognized mode: " + mode);
}
diff --git a/matching-ws/src/main/java/life/catalogue/matching/IndexingMapper.java b/matching-ws/src/main/java/life/catalogue/matching/IndexingMapper.java
index cf5abbfcb..b3754119a 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/IndexingMapper.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/IndexingMapper.java
@@ -6,4 +6,6 @@
public interface IndexingMapper {
Cursor getAllForDataset(@Param("datasetKey") int datasetKey);
+
+ Cursor getAllWithExtensionForDataset(@Param("datasetKey") int datasetKey);
}
diff --git a/matching-ws/src/main/java/life/catalogue/matching/IndexingService.java b/matching-ws/src/main/java/life/catalogue/matching/IndexingService.java
index bf08637c3..072c10ddd 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/IndexingService.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/IndexingService.java
@@ -2,22 +2,22 @@
import static life.catalogue.matching.IndexConstants.*;
-import au.com.bytecode.opencsv.CSVReader;
-import java.io.File;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.io.IOException;
+import java.io.*;
import java.nio.file.Path;
import java.nio.file.Paths;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Optional;
+import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Consumer;
import java.util.function.Supplier;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-import au.com.bytecode.opencsv.CSVWriter;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.opencsv.CSVWriter;
+import com.opencsv.bean.CsvToBean;
+import com.opencsv.bean.CsvToBeanBuilder;
+import com.opencsv.bean.StatefulBeanToCsv;
+import com.opencsv.bean.StatefulBeanToCsvBuilder;
import life.catalogue.api.model.ReleaseAttempt;
import life.catalogue.api.vocab.DatasetOrigin;
import life.catalogue.api.vocab.TaxonomicStatus;
@@ -28,11 +28,12 @@
import org.apache.ibatis.session.SqlSession;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.*;
+import org.apache.lucene.search.*;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
@@ -45,6 +46,7 @@
import org.mybatis.spring.SqlSessionFactoryBean;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
@@ -52,6 +54,11 @@
/**
* Service to index a dataset from the Checklist Bank.
+ *
+ * /data/matching-ws/export/ - CSV exports from the Checklist Bank
+ * /data/matching-ws/index/main - Main lucene index
+ * /data/matching-ws/index/identifiers - Lucene indexes for IDs lookups
+ * /data/matching-ws/index/ancillary - Lucene indexes for status values e.g. IUCN
*/
@Service
public class IndexingService {
@@ -64,6 +71,9 @@ public class IndexingService {
@Value("${export.path:/tmp/matching-export}")
String exportPath;
+ @Value("${temp.path:/tmp/matching-tmp}")
+ String tempIndexPath;
+
@Value("${clb.url}")
String clbUrl;
@@ -76,15 +86,18 @@ public class IndexingService {
@Value("${clb.driver}")
String clDriver;
+ @Autowired protected MatchingService matchingService;
+
private static final String REL_PATTERN_STR = "(\\d+)(?:LX?RC?|R(\\d+))";
private static final Pattern REL_PATTERN = Pattern.compile("^" + REL_PATTERN_STR + "$");
- protected static final ScientificNameAnalyzer analyzer = new ScientificNameAnalyzer();
+ protected static final ScientificNameAnalyzer scientificNameAnalyzer = new ScientificNameAnalyzer();
protected static IndexWriterConfig getIndexWriterConfig() {
Map analyzerPerField = new HashMap<>();
analyzerPerField.put(FIELD_SCIENTIFIC_NAME, new StandardAnalyzer());
- PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(analyzer, analyzerPerField);
+ analyzerPerField.put(FIELD_CANONICAL_NAME, scientificNameAnalyzer);
+ PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper( new KeywordAnalyzer(), analyzerPerField);
return new IndexWriterConfig(aWrapper);
}
@@ -147,7 +160,7 @@ public void writeCLBToFile(@NotNull final String datasetKeyInput) throws Excepti
// I am seeing better results with this MyBatis Pooling DataSource for Cursor queries
// (parallelism) as opposed to the spring managed DataSource
- PooledDataSource dataSource = new PooledDataSource(clDriver, clbUrl, clbUser, clPassword);
+ final PooledDataSource dataSource = new PooledDataSource(clDriver, clbUrl, clbUser, clPassword);
// Create a session factory
SqlSessionFactoryBean sessionFactoryBean = new SqlSessionFactoryBean();
sessionFactoryBean.setDataSource(dataSource);
@@ -160,7 +173,7 @@ public void writeCLBToFile(@NotNull final String datasetKeyInput) throws Excepti
Optional datasetKey = Optional.empty();
try {
datasetKey = Optional.of(Integer.parseInt(datasetKeyInput));
- } catch (NumberFormatException e) {
+ } catch (NumberFormatException ignored) {
}
if (datasetKey.isEmpty()) {
@@ -178,26 +191,17 @@ public void writeCLBToFile(@NotNull final String datasetKeyInput) throws Excepti
final String fileName = exportPath + "/" + datasetKeyInput + "/" + "index.csv";
FileUtils.forceMkdir(new File(exportPath + "/" + datasetKeyInput));
try (SqlSession session = factory.openSession(false);
- final CSVWriter writer = new CSVWriter(new FileWriter(fileName), '$')) {
+ final CSVWriter writer = new CSVWriter(new FileWriter(fileName))) {
+ StatefulBeanToCsv sbc = new StatefulBeanToCsvBuilder(writer)
+ .withQuotechar('\'')
+ .withSeparator('$')
+ .build();
// Create index writer
consume(
() -> session.getMapper(IndexingMapper.class).getAllForDataset(validDatasetKey),
name -> {
try {
- writer.writeNext(
- new String[] {
- name.id,
- name.parentId,
- name.scientificName,
- name.authorship,
- name.rank,
- name.status,
- name.nomenclaturalCode,
- name.sourceId,
- name.sourceDatasetKey,
- name.parentSourceId,
- name.parentSourceDatasetKey
- });
+ sbc.write(name);
} catch (Exception e) {
throw new RuntimeException(e);
}
@@ -211,10 +215,92 @@ public void writeCLBToFile(@NotNull final String datasetKeyInput) throws Excepti
LOG.info("Records written to file {}: {}", fileName, counter.get());
}
+ @Transactional
+ public void indexIdentifiers(String datasetKey) throws Exception {
+ writeCLBToFile(datasetKey);
+ indexFile(exportPath + "/" + datasetKey, tempIndexPath + "/" + datasetKey);
+ writeJoinIndex( tempIndexPath + "/" + datasetKey, indexPath + "/identifiers/" + datasetKey, false);
+ }
+
+ public void indexIUCN(String datasetKey) throws Exception {
+ writeCLBIUCNToFile(datasetKey);
+ indexFile(exportPath + "/" + datasetKey, tempIndexPath + "/" + datasetKey);
+ writeJoinIndex( tempIndexPath + "/" + datasetKey, indexPath + "/ancillary/" + datasetKey, true);
+ }
+
+ @Transactional
+ public String writeCLBIUCNToFile(@NotNull final String datasetKeyInput) throws Exception {
+
+ // I am seeing better results with this MyBatis Pooling DataSource for Cursor queries
+ // (parallelism) as opposed to the spring managed DataSource
+ PooledDataSource dataSource = new PooledDataSource(clDriver, clbUrl, clbUser, clPassword);
+ // Create a session factory
+ SqlSessionFactoryBean sessionFactoryBean = new SqlSessionFactoryBean();
+ sessionFactoryBean.setDataSource(dataSource);
+ SqlSessionFactory factory = sessionFactoryBean.getObject();
+ assert factory != null;
+ factory.getConfiguration().addMapper(IndexingMapper.class);
+ factory.getConfiguration().addMapper(DatasetMapper.class);
+
+ // resolve the magic keys...
+ Optional datasetKey = Optional.empty();
+ try {
+ datasetKey = Optional.of(Integer.parseInt(datasetKeyInput));
+    } catch (NumberFormatException ignored) {
+ }
+
+ if (datasetKey.isEmpty()) {
+ datasetKey = lookupDatasetKey(factory, datasetKeyInput);
+ }
+
+ if (datasetKey.isEmpty()) {
+ throw new IllegalArgumentException("Invalid dataset key: " + datasetKeyInput);
+ }
+
+ final Integer validDatasetKey = datasetKey.get();
+
+ LOG.info("Writing dataset to file...");
+ final AtomicInteger counter = new AtomicInteger(0);
+ final String fileName = exportPath + "/" + datasetKeyInput + "/" + "index.csv";
+ FileUtils.forceMkdir(new File(exportPath + "/" + datasetKeyInput));
+ try (SqlSession session = factory.openSession(false);
+ final CSVWriter writer = new CSVWriter(new FileWriter(fileName))) {
+
+ final ObjectMapper objectMapper = new ObjectMapper();
+ final StatefulBeanToCsv sbc = new StatefulBeanToCsvBuilder(writer)
+ .withQuotechar('\'')
+ .withSeparator('$')
+ .build();
+
+ // Create index writer
+ consume(
+ () -> session.getMapper(IndexingMapper.class).getAllWithExtensionForDataset(validDatasetKey),
+ nameUsage -> {
+ try {
+ if (StringUtils.isNotBlank(nameUsage.getExtension())){
+ // parse it
+ JsonNode node = objectMapper.readTree(nameUsage.getExtension());
+ nameUsage.setCategory(node.path("iucn:threatStatus").asText());
+ }
+ sbc.write(nameUsage);
+
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ counter.incrementAndGet();
+ });
+ } finally {
+ dataSource.forceCloseAll();
+ }
+
+ // write metadata file in JSON format
+ LOG.info("Records written to file {}: {}", fileName, counter.get());
+ return fileName;
+ }
+
public static Directory newMemoryIndex(Iterable usages) throws IOException {
LOG.info("Start building a new RAM index");
Directory dir = new ByteBuffersDirectory();
-
IndexWriter writer = getIndexWriter(dir);
// creates initial index segments
@@ -235,6 +321,110 @@ private static IndexWriter getIndexWriter(Directory dir) throws IOException {
return new IndexWriter(dir, getIndexWriterConfig());
}
+ public void writeJoinIndex(String tempIndexPath, String ancillaryIndexPath, boolean acceptedOnly) {
+
+ try {
+ // Load temp index directory
+ Directory tempDirectory = FSDirectory.open(Paths.get(tempIndexPath));
+ IndexReader tempReader = DirectoryReader.open(tempDirectory);
+ IndexSearcher searcher = new IndexSearcher(tempReader);
+
+ // Create ancillary index
+ Path indexDirectory = initialiseIndexDirectory(ancillaryIndexPath);
+ Directory ancillaryDirectory = FSDirectory.open(indexDirectory);
+ IndexWriterConfig config = getIndexWriterConfig();
+ config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
+      IndexWriter ancillaryIndexWriter = new IndexWriter(ancillaryDirectory, config);
+
+ // Construct a simple query to get all documents
+ TopDocs results = searcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE);
+ ScoreDoc[] hits = results.scoreDocs;
+
+ AtomicInteger counter = new AtomicInteger(0);
+
+ // Write document data
+ for (ScoreDoc hit : hits) {
+
+ counter.incrementAndGet();
+ Document doc = searcher.storedFields().document(hit.doc);
+ Map hierarchy = loadHierarchy(searcher, doc.get(FIELD_ID));
+
+ String status = doc.get(FIELD_STATUS);
+ if (status != null &&
+ acceptedOnly &&
+ !status.equals(TaxonomicStatus.ACCEPTED.name())) {
+ // skip synonyms, otherwise we would index them twice
+ continue;
+ }
+ String scientificName = doc.get(FIELD_SCIENTIFIC_NAME);
+ Classification classification = new Classification();
+ classification.setKingdom(hierarchy.getOrDefault(Rank.KINGDOM.name(), ""));
+ classification.setPhylum(hierarchy.getOrDefault(Rank.PHYLUM.name(), ""));
+ classification.setClazz(hierarchy.getOrDefault(Rank.CLASS.name(), ""));
+ classification.setOrder(hierarchy.getOrDefault(Rank.ORDER.name(), ""));
+ classification.setFamily(hierarchy.getOrDefault(Rank.FAMILY.name(), ""));
+ classification.setGenus(hierarchy.getOrDefault(Rank.GENUS.name(), ""));
+ classification.setSpecies(hierarchy.getOrDefault(Rank.SPECIES.name(), ""));
+
+ if (counter.get() % 100000 == 0) {
+ LOG.info("Indexed: {} taxa", counter.get());
+ }
+
+ // match to main dataset
+ NameUsageMatch nameUsageMatch = matchingService.match(scientificName, classification, true);
+ if (nameUsageMatch.getUsage() != null) {
+ doc.add(new StringField(FIELD_JOIN_ID,
+ nameUsageMatch.getAcceptedUsage() != null ? nameUsageMatch.getAcceptedUsage().getKey() :
+ nameUsageMatch.getUsage().getKey(), Field.Store.YES));
+ ancillaryIndexWriter.addDocument(doc);
+ }
+ }
+
+ // close temp
+ tempReader.close();
+ tempDirectory.close();
+
+ // close ancillary
+ ancillaryIndexWriter.commit();
+ ancillaryIndexWriter.forceMerge(1);
+ ancillaryIndexWriter.close();
+ ancillaryDirectory.close();
+
+ LOG.info("Ancillary index written: {} documents.", counter.get());
+ } catch (Exception e) {
+      LOG.error("Error writing join index to " + ancillaryIndexPath, e);
+ }
+ }
+
+ public Optional getById(IndexSearcher searcher, String id) {
+ Query query = new TermQuery(new Term(FIELD_ID, id));
+ try {
+ TopDocs docs = searcher.search(query, 3);
+ if (docs.totalHits.value > 0) {
+ return Optional.of(searcher.storedFields().document(docs.scoreDocs[0].doc));
+ } else {
+ return Optional.empty();
+ }
+ } catch (IOException e) {
+ LOG.error("Cannot load usage {} from lucene index", id, e);
+ }
+ return Optional.empty();
+ }
+
+ public Map loadHierarchy(IndexSearcher searcher, String id) {
+ Map classification = new HashMap<>();
+ while (id != null) {
+ Optional docOpt = getById(searcher, id);
+ if (docOpt.isEmpty()) {
+ break;
+ }
+ Document doc = docOpt.get();
+ classification.put(doc.get(FIELD_RANK), doc.get(FIELD_CANONICAL_NAME));
+ id = doc.get(FIELD_PARENT_ID);
+ }
+ return classification;
+ }
+
@Transactional
public void indexFile(String exportPath, String indexPath) throws Exception {
@@ -251,38 +441,26 @@ public void indexFile(String exportPath, String indexPath) throws Exception {
final AtomicInteger counter = new AtomicInteger(0);
final String filePath = exportPath + "/index.csv";
- try (CSVReader reader = new CSVReader(new FileReader(filePath), '$', '"');
- IndexWriter indexWriter = new IndexWriter(directory, config)) {
+ try (Reader reader = new FileReader(filePath);
+ IndexWriter indexWriter = new IndexWriter(directory, config)) {
- String[] row = reader.readNext();
- while (row != null) {
- if (row.length != 11) {
- LOG.warn("Skipping row with invalid number of columns: {}", String.join(",", row));
- row = reader.readNext();
- continue;
- }
+ CsvToBean csvReader = new CsvToBeanBuilder(reader)
+ .withType(NameUsage.class)
+ .withSeparator('$')
+ .withIgnoreLeadingWhiteSpace(true)
+ .withIgnoreEmptyLine(true)
+ .build();
+
+ Iterator iterator = csvReader.iterator();
+
+ while (iterator.hasNext()) {
if (counter.get() % 100000 == 0) {
LOG.info("Indexed: {} taxa", counter.get());
}
-
- NameUsage nameUsage =
- NameUsage.builder()
- .id(row[0])
- .parentId(row[1])
- .scientificName(row[2])
- .authorship(row[3])
- .rank(row[4])
- .status(row[5])
- .nomenclaturalCode(row[6])
- .sourceId(row[7])
- .sourceDatasetKey(row[8])
- .parentSourceId(row[9])
- .parentSourceDatasetKey(row[10])
- .build();
+ NameUsage nameUsage = iterator.next();
Document doc = toDoc(nameUsage);
indexWriter.addDocument(doc);
counter.incrementAndGet();
- row = reader.readNext();
}
LOG.info("Final index commit");
indexWriter.commit();
@@ -362,59 +540,64 @@ protected static Document toDoc(NameUsage nameUsage) {
cultivar or strain information. Infrageneric names are represented without a
leading genus. Unicode characters are replaced by their matching ASCII characters."
*/
- Rank rank = Rank.valueOf(nameUsage.rank);
+ Rank rank = Rank.valueOf(nameUsage.getRank());
Optional optCanonical = Optional.empty();
try {
NomCode nomCode = null;
- if (!StringUtils.isEmpty(nameUsage.nomenclaturalCode)) {
- nomCode = NomCode.valueOf(nameUsage.nomenclaturalCode);
+ if (!StringUtils.isEmpty(nameUsage.getNomenclaturalCode())) {
+ nomCode = NomCode.valueOf(nameUsage.getNomenclaturalCode());
}
- ParsedName pn = NameParsers.INSTANCE.parse(nameUsage.scientificName, rank, nomCode);
+ ParsedName pn = NameParsers.INSTANCE.parse(nameUsage.getScientificName(), rank, nomCode);
// canonicalMinimal will construct the name without the hybrid marker and authorship
String canonical = NameFormatter.canonicalMinimal(pn);
optCanonical = Optional.ofNullable(canonical);
} catch (UnparsableNameException | InterruptedException e) {
// do nothing
- LOG.debug("Unable to parse name to create canonical: {}", nameUsage.scientificName);
+ LOG.debug("Unable to parse name to create canonical: {}", nameUsage.getScientificName());
}
- final String canonical = optCanonical.orElse(nameUsage.scientificName);
+ final String canonical = optCanonical.orElse(nameUsage.getScientificName());
// use custom precision step as we do not need range queries and prefer to save memory usage
// instead
- doc.add(new StringField(FIELD_ID, nameUsage.id, Field.Store.YES));
+ doc.add(new StringField(FIELD_ID, nameUsage.getId(), Field.Store.YES));
// we only store accepted key, no need to index it
// If the name is a synonym, then parentId name usage points
// to the accepted name
- if (nameUsage.status != null
- && nameUsage.status.equals(TaxonomicStatus.SYNONYM.name())
- && nameUsage.parentId != null) {
- doc.add(new StringField(FIELD_ACCEPTED_ID, nameUsage.parentId, Field.Store.YES));
+ if (StringUtils.isNotBlank(nameUsage.getStatus())
+ && nameUsage.getStatus().equals(TaxonomicStatus.SYNONYM.name())
+ && nameUsage.getParentId() != null) {
+ doc.add(new StringField(FIELD_ACCEPTED_ID, nameUsage.getParentId(), Field.Store.YES));
}
// analyzed name field - this is what we search upon
doc.add(new TextField(FIELD_CANONICAL_NAME, canonical, Field.Store.YES));
// store full name and classification only to return a full match object for hits
- String nameComplete = nameUsage.scientificName;
- if (StringUtils.isNotBlank(nameUsage.authorship)) {
- nameComplete += " " + nameUsage.authorship;
+ String nameComplete = nameUsage.getScientificName();
+ if (StringUtils.isNotBlank(nameUsage.getAuthorship())) {
+ nameComplete += " " + nameUsage.getAuthorship();
}
doc.add(new TextField(FIELD_SCIENTIFIC_NAME, nameComplete, Field.Store.YES));
// this lucene index is not persistent, so not risk in changing ordinal numbers
- doc.add(new StringField(FIELD_RANK, nameUsage.rank, Field.Store.YES));
+ doc.add(new StringField(FIELD_RANK, nameUsage.getRank(), Field.Store.YES));
- if (nameUsage.parentId != null && !nameUsage.parentId.equals(nameUsage.id)) {
- doc.add(new StringField(FIELD_PARENT_ID, nameUsage.parentId, Field.Store.YES));
+ if (StringUtils.isNotBlank(nameUsage.getParentId()) && !nameUsage.getParentId().equals(nameUsage.getId())) {
+ doc.add(new StringField(FIELD_PARENT_ID, nameUsage.getParentId(), Field.Store.YES));
}
- if (nameUsage.status != null) {
- doc.add(new StringField(FIELD_STATUS, nameUsage.status, Field.Store.YES));
+ if (StringUtils.isNotBlank(nameUsage.getStatus())) {
+ doc.add(new StringField(FIELD_STATUS, nameUsage.getStatus(), Field.Store.YES));
}
+
+ if (StringUtils.isNotBlank(nameUsage.getCategory())) {
+ doc.add(new StringField(FIELD_CATEGORY, nameUsage.getCategory(), Field.Store.YES));
+ }
+
return doc;
}
diff --git a/matching-ws/src/main/java/life/catalogue/matching/Main.java b/matching-ws/src/main/java/life/catalogue/matching/Main.java
index 3b7e19672..9b89b1856 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/Main.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/Main.java
@@ -30,6 +30,14 @@ public class Main {
"Required for INDEX_DB and EXPORT_CSV modes")
private String datasetId;
+  @Parameter(names = {"--clb.status.dataset.ids"}, description = "ChecklistBank dataset IDs of ancillary status datasets (e.g. IUCN) to index. " +
+    "Required for INDEX_IUCN_CSV mode")
+ private String statusDatasetIds;
+
+  @Parameter(names = {"--clb.identifier.dataset.ids"}, description = "ChecklistBank dataset IDs of identifier datasets used for external ID lookups. " +
+    "Required for INDEX_IDENTIFIER_CSV mode")
+ private String identifierDatasetIds;
+
@Parameter(names = {"--index.path"}, description = "File system path to the pre-generated lucene index")
private String indexPath = "/data/matching-ws/index";
@@ -61,7 +69,8 @@ public static void main(String[] args) throws Exception {
commander.usage();
}
- if ((app.mode == ExecutionMode.INDEX_DB || app.mode == ExecutionMode.EXPORT_CSV) && app.datasetId == null) {
+ if ((app.mode == ExecutionMode.INDEX_DB
+ || app.mode == ExecutionMode.EXPORT_CSV) && app.datasetId == null) {
System.err.println("Missing required parameter for mode " + app.mode + " --clb.dataset.id");
commander.usage();
return;
@@ -73,10 +82,10 @@ public static void main(String[] args) throws Exception {
SpringApplication springApplication;
switch (app.mode) {
- case EXPORT_CSV, INDEX_CSV, INDEX_DB:
+ case EXPORT_CSV, INDEX_CSV, INDEX_DB, INDEX_IUCN_CSV, INDEX_IDENTIFIER_CSV:
springApplication = new SpringApplication(IndexingApplication.class);
springApplication.setAdditionalProfiles("indexing");
- springApplication.run( args).close();
+ springApplication.run(args).close();
break;
case WEB_APP:
SpringApplication webApp = new SpringApplication(MatchingApplication.class);
@@ -89,6 +98,8 @@ public static void main(String[] args) throws Exception {
enum ExecutionMode {
EXPORT_CSV,
+ INDEX_IUCN_CSV,
+ INDEX_IDENTIFIER_CSV,
INDEX_CSV,
INDEX_DB,
WEB_APP
diff --git a/matching-ws/src/main/java/life/catalogue/matching/MatchController.java b/matching-ws/src/main/java/life/catalogue/matching/MatchController.java
index 0c393efe8..48c920b42 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/MatchController.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/MatchController.java
@@ -69,14 +69,6 @@ private boolean isTraceRequested(String traceRequested) {
(traceRequested.equalsIgnoreCase("true") || traceRequested.equalsIgnoreCase("on"));
}
- private boolean getTraceParameter(HttpServletRequest request) {
- String parameter = request.getParameter("trace");
- if (parameter == null) {
- return false;
- }
- return !"false".equalsIgnoreCase(parameter);
- }
-
@Autowired
public MatchController(ErrorAttributes errorAttributes) {
Assert.notNull(errorAttributes, "ErrorAttributes must not be null");
@@ -138,6 +130,7 @@ public NameUsageMatch matchOldPaths(
HttpServletRequest response) {
return matchV2(
usageKey,
+ null,null,null,
scientificName2, scientificName,
authorship, authorship2,
removeNulls(genericName),
@@ -215,6 +208,9 @@ public NameUsageMatch matchOldPaths(
produces = "application/json")
public NameUsageMatch matchV2(
@RequestParam(value = "usageKey", required = false) String usageKey,
+ @RequestParam(value = "taxonID", required = false) String taxonID,
+ @RequestParam(value = "taxonConceptID", required = false) String taxonConceptID,
+ @RequestParam(value = "scientificNameID", required = false) String scientificNameID,
@RequestParam(value = "name", required = false) String scientificName2,
@RequestParam(value = "scientificName", required = false) String scientificName,
@RequestParam(value = "authorship", required = false) String authorship2,
@@ -231,7 +227,7 @@ public NameUsageMatch matchV2(
// ugly, i know, but jackson/spring isn't working with @JsonProperty
classification.setClazz(response.getParameter("class"));
return matchingService.match(
- removeNulls(usageKey),
+ first(removeNulls(usageKey), removeNulls(taxonID), removeNulls(taxonConceptID), removeNulls(scientificNameID)),
first(removeNulls(scientificName), removeNulls(scientificName2)),
first(removeNulls(authorship), removeNulls(authorship2)),
removeNulls(genericName),
diff --git a/matching-ws/src/main/java/life/catalogue/matching/NameNRank.java b/matching-ws/src/main/java/life/catalogue/matching/NameNRank.java
index cd9247e83..26ce91366 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/NameNRank.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/NameNRank.java
@@ -203,7 +203,7 @@ protected static boolean isSimpleBinomial(String name) {
}
private static void warnIfMissing(String name, @Nullable String epithet, String part) {
- if (exists(epithet) && !name.toLowerCase().contains(epithet.toLowerCase())) {
+ if (exists(epithet) && name != null && !name.toLowerCase().contains(epithet.toLowerCase())) {
LOG.warn("ScientificName >{}< missing {}: {}", name, part, epithet);
}
}
diff --git a/matching-ws/src/main/java/life/catalogue/matching/NameUsage.java b/matching-ws/src/main/java/life/catalogue/matching/NameUsage.java
index 6231109aa..903a9c861 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/NameUsage.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/NameUsage.java
@@ -1,23 +1,41 @@
package life.catalogue.matching;
-import lombok.Builder;
-import lombok.Data;
-import lombok.EqualsAndHashCode;
+import com.opencsv.bean.CsvBindByName;
+
+import lombok.*;
/** A simple class to represent a name usage ready to be indexed. */
@Data
@EqualsAndHashCode
@Builder
+@NoArgsConstructor
+@AllArgsConstructor
public class NameUsage {
+
+ @CsvBindByName(column = "id")
String id;
+
+ @CsvBindByName(column = "parentId")
String parentId;
+
+ @CsvBindByName(column = "scientificName")
String scientificName;
+
+ @CsvBindByName(column = "authorship")
String authorship;
+
+ @CsvBindByName(column = "status")
String status;
+
+ @CsvBindByName(column = "rank")
String rank;
+
+ @CsvBindByName(column = "nomenclaturalCode")
String nomenclaturalCode;
- String sourceId;
- String sourceDatasetKey;
- String parentSourceId;
- String parentSourceDatasetKey;
+
+ @CsvBindByName(column = "category")
+ String category;
+
+ @CsvBindByName(column = "extension")
+ private String extension;
}
diff --git a/matching-ws/src/main/java/life/catalogue/matching/NameUsageMatch.java b/matching-ws/src/main/java/life/catalogue/matching/NameUsageMatch.java
index 2e567c711..a1c279b18 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/NameUsageMatch.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/NameUsageMatch.java
@@ -32,6 +32,8 @@ public class NameUsageMatch implements LinneanClassification {
List alternatives = new ArrayList<>();
@Schema(description = "Diagnostics for a name match including the type of match and confidence level")
Diagnostics diagnostics = new Diagnostics();
+ @Schema(description = "Status information from external sources like IUCN Red List")
+ List additionalStatus = new ArrayList<>();
private String nameFor(Rank rank) {
return getClassification().stream()
diff --git a/matching-ws/src/main/java/life/catalogue/matching/Status.java b/matching-ws/src/main/java/life/catalogue/matching/Status.java
new file mode 100644
index 000000000..0bef13645
--- /dev/null
+++ b/matching-ws/src/main/java/life/catalogue/matching/Status.java
@@ -0,0 +1,12 @@
+package life.catalogue.matching;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import lombok.Data;
+
+@Data
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class Status {
+ private String datasetKey;
+ private String datasetTitle;
+ private String category;
+}
\ No newline at end of file
diff --git a/matching-ws/src/main/resources/life/catalogue/matching/IndexingMapper.xml b/matching-ws/src/main/resources/life/catalogue/matching/IndexingMapper.xml
index 36bef0554..82d2893e3 100644
--- a/matching-ws/src/main/resources/life/catalogue/matching/IndexingMapper.xml
+++ b/matching-ws/src/main/resources/life/catalogue/matching/IndexingMapper.xml
@@ -16,19 +16,40 @@
n.rank as rank,
nu.status as status,
n.code as nomenclaturalCode,
- vs.source_id as sourceId,
- vs.source_dataset_key as sourceDatasetKey,
- pvs.source_id as parentSourceId,
- pvs.source_dataset_key as parentSourceDatasetKey
+ '' as extension,
+ '' as category
FROM
name_usage nu
INNER JOIN
name n on n.id = nu.name_id AND n.dataset_key=#{datasetKey}
- LEFT JOIN
- verbatim_source vs on vs.id = nu.id AND vs.dataset_key=#{datasetKey}
- LEFT JOIN
- verbatim_source pvs on pvs.id = nu.parent_id AND pvs.dataset_key=#{datasetKey}
WHERE
nu.dataset_key = #{datasetKey}
+
+
+
diff --git a/matching-ws/src/test/java/life/catalogue/matching/DatasetIndexTest.java b/matching-ws/src/test/java/life/catalogue/matching/DatasetIndexTest.java
index 6b9b8cfa9..72609af83 100644
--- a/matching-ws/src/test/java/life/catalogue/matching/DatasetIndexTest.java
+++ b/matching-ws/src/test/java/life/catalogue/matching/DatasetIndexTest.java
@@ -15,13 +15,17 @@
import static org.junit.jupiter.api.Assertions.*;
-import au.com.bytecode.opencsv.CSVReader;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.List;
+
+import com.opencsv.CSVParserBuilder;
+import com.opencsv.CSVReader;
+import com.opencsv.CSVReaderBuilder;
+
import life.catalogue.api.vocab.TaxonomicStatus;
import org.apache.commons.lang3.StringUtils;
import org.gbif.nameparser.api.Rank;
@@ -42,8 +46,9 @@ public static void buildMatcher() throws Exception {
public static List readTestNames() throws Exception {
List usages = Lists.newArrayList();
// 1 2 Acanthophora Hulst, 1896 Geometridae Lepidoptera Insecta Arthropoda Animalia GENUS
- try (InputStream testFile = Resources.getResource("testNames.txt").openStream()) {
- CSVReader reader = new CSVReader(new InputStreamReader(testFile), '\t', '"');
+ try (InputStream testFile = Resources.getResource("testNames.txt").openStream();
+ CSVReader reader = new CSVReaderBuilder(new InputStreamReader(testFile))
+ .withCSVParser(new CSVParserBuilder().withSeparator('\t').build()).build()) {
String[] row = reader.readNext();
while (row != null) {
NameUsage n = NameUsage.builder().build();