From ae3ae8fb09df77dfd533d1915f432d53438cb89a Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Thu, 30 May 2024 20:43:40 +0100 Subject: [PATCH] WIP - Checkpoint commit for Index generation for IUCN and ID lookups for WoRMS Switch to using opencsv https://github.com/CatalogueOfLife/backend/issues/1321 --- matching-ws/pom.xml | 4 +- .../life/catalogue/matching/DatasetIndex.java | 143 +++++++- .../catalogue/matching/IndexConstants.java | 3 + .../matching/IndexingApplication.java | 12 + .../catalogue/matching/IndexingMapper.java | 2 + .../catalogue/matching/IndexingService.java | 333 ++++++++++++++---- .../java/life/catalogue/matching/Main.java | 17 +- .../catalogue/matching/MatchController.java | 14 +- .../life/catalogue/matching/NameNRank.java | 2 +- .../life/catalogue/matching/NameUsage.java | 32 +- .../catalogue/matching/NameUsageMatch.java | 2 + .../java/life/catalogue/matching/Status.java | 12 + .../catalogue/matching/IndexingMapper.xml | 37 +- .../catalogue/matching/DatasetIndexTest.java | 11 +- 14 files changed, 501 insertions(+), 123 deletions(-) create mode 100644 matching-ws/src/main/java/life/catalogue/matching/Status.java diff --git a/matching-ws/pom.xml b/matching-ws/pom.xml index 02d50e0c5..fdd885fa6 100644 --- a/matching-ws/pom.xml +++ b/matching-ws/pom.xml @@ -251,9 +251,9 @@ 2.17.0 - net.sf.opencsv + com.opencsv opencsv - 2.3 + 5.9 compile diff --git a/matching-ws/src/main/java/life/catalogue/matching/DatasetIndex.java b/matching-ws/src/main/java/life/catalogue/matching/DatasetIndex.java index ec8f0c3b5..ac24be245 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/DatasetIndex.java +++ b/matching-ws/src/main/java/life/catalogue/matching/DatasetIndex.java @@ -1,7 +1,7 @@ package life.catalogue.matching; import static life.catalogue.matching.IndexConstants.*; -import static life.catalogue.matching.IndexingService.analyzer; +import static life.catalogue.matching.IndexingService.scientificNameAnalyzer; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; @@ -29,6 +29,7 @@ import org.apache.lucene.store.MMapDirectory; import org.gbif.nameparser.api.Rank; +import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Value; @@ -43,6 +44,8 @@ public class DatasetIndex { private static Logger LOG = LoggerFactory.getLogger(DatasetIndex.class); private IndexSearcher searcher; + private Map identifierSearchers = new HashMap<>(); + private Map ancillarySearchers = new HashMap<>(); @Value("${index.path:/data/matching-ws/index}") String indexPath; @@ -53,21 +56,73 @@ public class DatasetIndex { /** Attempts to read the index from disk if it exists. */ @PostConstruct void init() { - if (new File(indexPath).exists()) { - LOG.info("Loading lucene index from {}", indexPath); + + final String mainIndexPath = getMainIndexPath(); + + if (new File(mainIndexPath).exists()) { + LOG.info("Loading lucene index from {}", mainIndexPath); try { - initWithDir(new MMapDirectory(Path.of(indexPath))); + initWithDir(new MMapDirectory(Path.of(mainIndexPath))); } catch (IOException e) { LOG.warn("Cannot open lucene index. 
Index not available", e); } + + // load identifier indexes + this.identifierSearchers = new HashMap<>(); + if (Path.of(indexPath + "/identifiers").toFile().exists()) { + try (DirectoryStream stream = Files.newDirectoryStream(Path.of(indexPath + "/identifiers"))) { + for (Path entry : stream) { + if (Files.isDirectory(entry)) { + try { + Directory identifierDir = new MMapDirectory(entry); + DirectoryReader reader = DirectoryReader.open(identifierDir); + identifierSearchers.put(entry.toFile().getName(), new IndexSearcher(reader)); + } catch (IOException e) { + LOG.warn("Cannot open identifiers lucene index {}", entry, e); + } + } + } + } catch (IOException e) { + LOG.error("Cannot read identifiers index directory", e); + } + } else { + LOG.info("Identifiers indexes not found at {}", indexPath + "/identifiers"); + } + + // load ancillary indexes + this.ancillarySearchers = new HashMap<>(); + if (Path.of(indexPath + "/ancillary").toFile().exists()) { + try (DirectoryStream stream = Files.newDirectoryStream(Path.of(indexPath + "/ancillary"))) { + for (Path entry : stream) { + if (Files.isDirectory(entry)) { + try { + Directory ancillaryDir = new MMapDirectory(entry); + DirectoryReader reader = DirectoryReader.open(ancillaryDir); + ancillarySearchers.put(entry.toFile().getName(), new IndexSearcher(reader)); + } catch (IOException e) { + LOG.warn("Cannot open ancillary lucene index {}", entry, e); + } + } + } + } catch (IOException e) { + LOG.error("Cannot read ancillary index directory", e); + } + } else { + LOG.info("Ancillary indexes not found at {}", indexPath + "/ancillary"); + } + } else { - LOG.warn("Lucene index not found at {}", indexPath); + LOG.warn("Lucene index not found at {}", mainIndexPath); } } - void initWithDir(Directory indexDir) { + private @NotNull String getMainIndexPath() { + return indexPath + "/main"; + } + + void initWithDir(Directory indexDirectory) { try { - DirectoryReader reader = DirectoryReader.open(indexDir); + DirectoryReader reader = DirectoryReader.open(indexDirectory); this.searcher = new IndexSearcher(reader); } catch (IOException e) { LOG.warn("Cannot open lucene index. Index not available", e); @@ -83,7 +138,7 @@ public IndexMetadata getIndexMetadata(){ IndexMetadata metadata = new IndexMetadata(); // get size on disk - Path directoryPath = Path.of(indexPath); + Path directoryPath = Path.of(getMainIndexPath()); try { BasicFileAttributes attributes = Files.readAttributes(directoryPath, BasicFileAttributes.class); Instant creationTime = attributes.creationTime().toInstant(); @@ -225,7 +280,7 @@ private IndexSearcher getSearcher() { * @return */ public NameUsageMatch matchByUsageKey(String usageKey) { - Optional docOpt = getByUsageId(usageKey); + Optional docOpt = getByUsageKey(usageKey, true); if (docOpt.isPresent()) { Document doc = docOpt.get(); NameUsageMatch match = fromDoc(doc); @@ -245,13 +300,51 @@ public NameUsageMatch matchByUsageKey(String usageKey) { } } - public Optional getByUsageId(String usageKey) { - Query query = new TermQuery(new Term(FIELD_ID, usageKey)); + public static String escapeQueryChars(String s) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + // These are the special characters that need to be escaped + if (c == '\\' || c == '+' || c == '-' || c == '!' || c == '(' || c == ')' || + c == ':' || c == '^' || c == '[' || c == ']' || c == '\"' || c == '{' || + c == '}' || c == '~' || c == '*' || c == '?' 
|| c == '|' || c == '&' || + c == '/' || Character.isWhitespace(c)) { + sb.append('\\'); + } + sb.append(c); + } + return sb.toString(); + } + + public Optional getByUsageKey(String usageKey, boolean allowExternalIDs) { + Query query = new TermQuery(new Term(FIELD_ID, escapeQueryChars(usageKey))); try { TopDocs docs = getSearcher().search(query, 3); if (docs.totalHits.value > 0) { return Optional.of(getSearcher().storedFields().document(docs.scoreDocs[0].doc)); - } else { + } else if (allowExternalIDs) { + + // if join indexes are present, add them to the match + if (identifierSearchers != null){ + for (String datasetKey: identifierSearchers.keySet()){ + IndexSearcher identifierSearcher = identifierSearchers.get(datasetKey); + Query identifierQuery = new TermQuery(new Term(FIELD_ID, usageKey)); + LOG.info("Searching for identifier {} in dataset {}", usageKey, datasetKey); + TopDocs identifierDocs = identifierSearcher.search(identifierQuery, 3); + if (identifierDocs.totalHits.value > 0) { + Document identifierDoc = identifierSearcher.storedFields().document(identifierDocs.scoreDocs[0].doc); + final String joinID = identifierDoc.get(FIELD_JOIN_ID); + Query getByIDQuery = new TermQuery(new Term(FIELD_ID, joinID)); + TopDocs docs2 = getSearcher().search(getByIDQuery, 3); + if (docs2.totalHits.value > 0) { + return Optional.of(getSearcher().storedFields().document(docs2.scoreDocs[0].doc)); + } else { + LOG.warn("Cannot find usage {} in main lucene index after finding it in identifier index for {}", usageKey, datasetKey); + return Optional.empty(); + } + } + } + } return Optional.empty(); } } catch (IOException e) { @@ -274,7 +367,7 @@ public List loadHigherTaxa(String parentID) { List higherTaxa = new ArrayList<>(); while (parentID != null) { - Optional docOpt = getByUsageId(parentID); + Optional docOpt = getByUsageKey(parentID, false); if (docOpt.isEmpty()) { break; } @@ -313,7 +406,7 @@ private NameUsageMatch fromDoc(Document doc) { if (doc.get(FIELD_ACCEPTED_ID) != null) { synonym = true; - Optional accDocOpt = getByUsageId(doc.get(FIELD_ACCEPTED_ID)); + Optional accDocOpt = getByUsageKey(doc.get(FIELD_ACCEPTED_ID), false); if (accDocOpt.isPresent()) { Document accDoc = accDocOpt.get(); u.setAcceptedUsage( @@ -355,6 +448,26 @@ private NameUsageMatch fromDoc(Document doc) { } u.setSynonym(synonym); + // if join indexes are present, add them to the match + for (String datasetKey: ancillarySearchers.keySet()){ + IndexSearcher ancillarySearcher = ancillarySearchers.get(datasetKey); + Query query = new TermQuery(new Term(FIELD_JOIN_ID, doc.get(FIELD_ID) )); + try { + TopDocs docs = ancillarySearcher.search(query, 3); + if (docs.totalHits.value > 0) { + Document ancillaryDoc = ancillarySearcher.storedFields().document(docs.scoreDocs[0].doc); + String status = ancillaryDoc.get(FIELD_CATEGORY); + Status ancillaryStatus = new Status(); + ancillaryStatus.setCategory(status); + ancillaryStatus.setDatasetKey(datasetKey); + ancillaryStatus.setDatasetTitle(""); + u.getAdditionalStatus().add(ancillaryStatus); + } + } catch (IOException e) { + LOG.error("Cannot load usage {} from lucene index", doc.get(FIELD_ID), e); + } + } + String status = doc.get(FIELD_STATUS); u.getDiagnostics().setStatus(TaxonomicStatus.valueOf(status)); @@ -363,7 +476,7 @@ private NameUsageMatch fromDoc(Document doc) { public List matchByName(String name, boolean fuzzySearch, int maxMatches) { // use the same lucene analyzer to normalize input - final String analyzedName = LuceneUtils.analyzeString(analyzer, name).get(0); + final 
String analyzedName = LuceneUtils.analyzeString(scientificNameAnalyzer, name).get(0); LOG.debug( "Analyzed {} query \"{}\" becomes >>{}<<", fuzzySearch ? "fuzzy" : "straight", diff --git a/matching-ws/src/main/java/life/catalogue/matching/IndexConstants.java b/matching-ws/src/main/java/life/catalogue/matching/IndexConstants.java index 7ef6cdf53..0743d838f 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/IndexConstants.java +++ b/matching-ws/src/main/java/life/catalogue/matching/IndexConstants.java @@ -8,4 +8,7 @@ public class IndexConstants { static final String FIELD_RANK = "rank"; static final String FIELD_STATUS = "status"; static final String FIELD_PARENT_ID = "parentId"; + + static final String FIELD_CATEGORY = "category"; + static final String FIELD_JOIN_ID = "joinId"; } diff --git a/matching-ws/src/main/java/life/catalogue/matching/IndexingApplication.java b/matching-ws/src/main/java/life/catalogue/matching/IndexingApplication.java index b97bf2a97..e8a86826b 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/IndexingApplication.java +++ b/matching-ws/src/main/java/life/catalogue/matching/IndexingApplication.java @@ -55,6 +55,18 @@ public void run(ApplicationArguments args) throws Exception { return; } indexingService.runDatasetIndexing(Integer.parseInt(datasetIds.get(0))); + } else if (Main.ExecutionMode.INDEX_IUCN_CSV.name().equals(mode)) { + if (datasetIds == null || datasetIds.isEmpty()) { + System.err.println("Missing required parameter --clb.dataset.id"); + return; + } + indexingService.indexIUCN(datasetIds.get(0)); + } else if (Main.ExecutionMode.INDEX_IDENTIFIER_CSV.name().equals(mode)) { + if (datasetIds == null || datasetIds.isEmpty()) { + System.err.println("Missing required parameter --clb.dataset.id"); + return; + } + indexingService.indexIdentifiers(datasetIds.get(0)); } else { System.err.println("Unrecognized mode: " + mode); } diff --git a/matching-ws/src/main/java/life/catalogue/matching/IndexingMapper.java b/matching-ws/src/main/java/life/catalogue/matching/IndexingMapper.java index cf5abbfcb..b3754119a 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/IndexingMapper.java +++ b/matching-ws/src/main/java/life/catalogue/matching/IndexingMapper.java @@ -6,4 +6,6 @@ public interface IndexingMapper { Cursor getAllForDataset(@Param("datasetKey") int datasetKey); + + Cursor getAllWithExtensionForDataset(@Param("datasetKey") int datasetKey); } diff --git a/matching-ws/src/main/java/life/catalogue/matching/IndexingService.java b/matching-ws/src/main/java/life/catalogue/matching/IndexingService.java index bf08637c3..072c10ddd 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/IndexingService.java +++ b/matching-ws/src/main/java/life/catalogue/matching/IndexingService.java @@ -2,22 +2,22 @@ import static life.catalogue.matching.IndexConstants.*; -import au.com.bytecode.opencsv.CSVReader; -import java.io.File; -import java.io.FileReader; -import java.io.FileWriter; -import java.io.IOException; +import java.io.*; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.HashMap; -import java.util.Map; -import java.util.Optional; +import java.util.*; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Consumer; import java.util.function.Supplier; import java.util.regex.Matcher; import java.util.regex.Pattern; -import au.com.bytecode.opencsv.CSVWriter; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.opencsv.CSVWriter; +import 
com.opencsv.bean.CsvToBean; +import com.opencsv.bean.CsvToBeanBuilder; +import com.opencsv.bean.StatefulBeanToCsv; +import com.opencsv.bean.StatefulBeanToCsvBuilder; import life.catalogue.api.model.ReleaseAttempt; import life.catalogue.api.vocab.DatasetOrigin; import life.catalogue.api.vocab.TaxonomicStatus; @@ -28,11 +28,12 @@ import org.apache.ibatis.session.SqlSession; import org.apache.ibatis.session.SqlSessionFactory; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.KeywordAnalyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.*; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.*; +import org.apache.lucene.search.*; import org.apache.lucene.store.ByteBuffersDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; @@ -45,6 +46,7 @@ import org.mybatis.spring.SqlSessionFactoryBean; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; @@ -52,6 +54,11 @@ /** * Service to index a dataset from the Checklist Bank. + * + * /data/matching-ws/export/ - CSV exports from the Checklist Bank + * /data/matching-ws/index/main - Main lucene index + * /data/matching-ws/index/identifiers - Lucene indexes for IDs lookups + * /data/matching-ws/index/ancillary - Lucene indexes for status values e.g. IUCN */ @Service public class IndexingService { @@ -64,6 +71,9 @@ public class IndexingService { @Value("${export.path:/tmp/matching-export}") String exportPath; + @Value("${temp.path:/tmp/matching-tmp}") + String tempIndexPath; + @Value("${clb.url}") String clbUrl; @@ -76,15 +86,18 @@ public class IndexingService { @Value("${clb.driver}") String clDriver; + @Autowired protected MatchingService matchingService; + private static final String REL_PATTERN_STR = "(\\d+)(?:LX?RC?|R(\\d+))"; private static final Pattern REL_PATTERN = Pattern.compile("^" + REL_PATTERN_STR + "$"); - protected static final ScientificNameAnalyzer analyzer = new ScientificNameAnalyzer(); + protected static final ScientificNameAnalyzer scientificNameAnalyzer = new ScientificNameAnalyzer(); protected static IndexWriterConfig getIndexWriterConfig() { Map analyzerPerField = new HashMap<>(); analyzerPerField.put(FIELD_SCIENTIFIC_NAME, new StandardAnalyzer()); - PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(analyzer, analyzerPerField); + analyzerPerField.put(FIELD_CANONICAL_NAME, scientificNameAnalyzer); + PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper( new KeywordAnalyzer(), analyzerPerField); return new IndexWriterConfig(aWrapper); } @@ -147,7 +160,7 @@ public void writeCLBToFile(@NotNull final String datasetKeyInput) throws Excepti // I am seeing better results with this MyBatis Pooling DataSource for Cursor queries // (parallelism) as opposed to the spring managed DataSource - PooledDataSource dataSource = new PooledDataSource(clDriver, clbUrl, clbUser, clPassword); + final PooledDataSource dataSource = new PooledDataSource(clDriver, clbUrl, clbUser, clPassword); // Create a session factory SqlSessionFactoryBean sessionFactoryBean = new SqlSessionFactoryBean(); 
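// Note: the CSV export below moves from the opencsv 2.x writeNext(String[]) call to the
// 5.x bean API; the column layout now comes from the @CsvBindByName annotations on
// NameUsage, which keeps this writer and the CsvToBean reader in indexFile in sync.
// A minimal sketch of the round trip (the plain FileWriter and path are illustrative only):
//
//   Writer out = new FileWriter("/tmp/example.csv");
//   StatefulBeanToCsv<NameUsage> csv = new StatefulBeanToCsvBuilder<NameUsage>(out)
//       .withSeparator('$')
//       .withQuotechar('\'')
//       .build();
//   csv.write(nameUsage); // header row from the annotations, then one line per bean
//   out.close();
//
//   for (NameUsage u : new CsvToBeanBuilder<NameUsage>(new FileReader("/tmp/example.csv"))
//       .withType(NameUsage.class).withSeparator('$').build()) {
//     // headers are matched back to the same @CsvBindByName annotations
//   }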
sessionFactoryBean.setDataSource(dataSource); @@ -160,7 +173,7 @@ public void writeCLBToFile(@NotNull final String datasetKeyInput) throws Excepti Optional datasetKey = Optional.empty(); try { datasetKey = Optional.of(Integer.parseInt(datasetKeyInput)); - } catch (NumberFormatException e) { + } catch (NumberFormatException ignored) { } if (datasetKey.isEmpty()) { @@ -178,26 +191,17 @@ public void writeCLBToFile(@NotNull final String datasetKeyInput) throws Excepti final String fileName = exportPath + "/" + datasetKeyInput + "/" + "index.csv"; FileUtils.forceMkdir(new File(exportPath + "/" + datasetKeyInput)); try (SqlSession session = factory.openSession(false); - final CSVWriter writer = new CSVWriter(new FileWriter(fileName), '$')) { + final CSVWriter writer = new CSVWriter(new FileWriter(fileName))) { + StatefulBeanToCsv sbc = new StatefulBeanToCsvBuilder(writer) + .withQuotechar('\'') + .withSeparator('$') + .build(); // Create index writer consume( () -> session.getMapper(IndexingMapper.class).getAllForDataset(validDatasetKey), name -> { try { - writer.writeNext( - new String[] { - name.id, - name.parentId, - name.scientificName, - name.authorship, - name.rank, - name.status, - name.nomenclaturalCode, - name.sourceId, - name.sourceDatasetKey, - name.parentSourceId, - name.parentSourceDatasetKey - }); + sbc.write(name); } catch (Exception e) { throw new RuntimeException(e); } @@ -211,10 +215,92 @@ public void writeCLBToFile(@NotNull final String datasetKeyInput) throws Excepti LOG.info("Records written to file {}: {}", fileName, counter.get()); } + @Transactional + public void indexIdentifiers(String datasetKey) throws Exception { + writeCLBToFile(datasetKey); + indexFile(exportPath + "/" + datasetKey, tempIndexPath + "/" + datasetKey); + writeJoinIndex( tempIndexPath + "/" + datasetKey, indexPath + "/identifiers/" + datasetKey, false); + } + + public void indexIUCN(String datasetKey) throws Exception { + writeCLBIUCNToFile(datasetKey); + indexFile(exportPath + "/" + datasetKey, tempIndexPath + "/" + datasetKey); + writeJoinIndex( tempIndexPath + "/" + datasetKey, indexPath + "/ancillary/" + datasetKey, true); + } + + @Transactional + public String writeCLBIUCNToFile(@NotNull final String datasetKeyInput) throws Exception { + + // I am seeing better results with this MyBatis Pooling DataSource for Cursor queries + // (parallelism) as opposed to the spring managed DataSource + PooledDataSource dataSource = new PooledDataSource(clDriver, clbUrl, clbUser, clPassword); + // Create a session factory + SqlSessionFactoryBean sessionFactoryBean = new SqlSessionFactoryBean(); + sessionFactoryBean.setDataSource(dataSource); + SqlSessionFactory factory = sessionFactoryBean.getObject(); + assert factory != null; + factory.getConfiguration().addMapper(IndexingMapper.class); + factory.getConfiguration().addMapper(DatasetMapper.class); + + // resolve the magic keys... 
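+ // Plain integer inputs are used directly as dataset keys; anything matching
+ // REL_PATTERN above (e.g. "3LR" for the latest release of project 3, or "3R42"
+ // for release attempt 42) is resolved to a concrete dataset key via lookupDatasetKey.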
+ Optional datasetKey = Optional.empty();
+ try {
+ datasetKey = Optional.of(Integer.parseInt(datasetKeyInput));
+ } catch (NumberFormatException ignored) {
+ }
+
+ if (datasetKey.isEmpty()) {
+ datasetKey = lookupDatasetKey(factory, datasetKeyInput);
+ }
+
+ if (datasetKey.isEmpty()) {
+ throw new IllegalArgumentException("Invalid dataset key: " + datasetKeyInput);
+ }
+
+ final Integer validDatasetKey = datasetKey.get();
+
+ LOG.info("Writing dataset to file...");
+ final AtomicInteger counter = new AtomicInteger(0);
+ final String fileName = exportPath + "/" + datasetKeyInput + "/" + "index.csv";
+ FileUtils.forceMkdir(new File(exportPath + "/" + datasetKeyInput));
+ try (SqlSession session = factory.openSession(false);
+ final CSVWriter writer = new CSVWriter(new FileWriter(fileName))) {
+
+ final ObjectMapper objectMapper = new ObjectMapper();
+ final StatefulBeanToCsv<NameUsage> sbc = new StatefulBeanToCsvBuilder<NameUsage>(writer)
+ .withQuotechar('\'')
+ .withSeparator('$')
+ .build();
+
+ // stream the name usages, enriched with their IUCN category, to CSV
+ consume(
+ () -> session.getMapper(IndexingMapper.class).getAllWithExtensionForDataset(validDatasetKey),
+ nameUsage -> {
+ try {
+ if (StringUtils.isNotBlank(nameUsage.getExtension())) {
+ // parse the verbatim extension JSON and extract the threat status
+ JsonNode node = objectMapper.readTree(nameUsage.getExtension());
+ nameUsage.setCategory(node.path("iucn:threatStatus").asText());
+ }
+ sbc.write(nameUsage);
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ counter.incrementAndGet();
+ });
+ } finally {
+ dataSource.forceCloseAll();
+ }
+
+ // TODO: write metadata file in JSON format
+ LOG.info("Records written to file {}: {}", fileName, counter.get());
+ return fileName;
+ }
+
public static Directory newMemoryIndex(Iterable usages) throws IOException { LOG.info("Start building a new RAM index"); Directory dir = new ByteBuffersDirectory(); - IndexWriter writer = getIndexWriter(dir); // creates initial index segments @@ -235,6 +321,110 @@ private static IndexWriter getIndexWriter(Directory dir) throws IOException { return new IndexWriter(dir, getIndexWriterConfig()); }
+ public void writeJoinIndex(String tempIndexPath, String ancillaryIndexPath, boolean acceptedOnly) {
+
+ try {
+ // Load temp index directory
+ Directory tempDirectory = FSDirectory.open(Paths.get(tempIndexPath));
+ IndexReader tempReader = DirectoryReader.open(tempDirectory);
+ IndexSearcher searcher = new IndexSearcher(tempReader);
+
+ // Create ancillary index
+ Path indexDirectory = initialiseIndexDirectory(ancillaryIndexPath);
+ Directory ancillaryDirectory = FSDirectory.open(indexDirectory);
+ IndexWriterConfig config = getIndexWriterConfig();
+ config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
+ IndexWriter ancillaryIndexWriter = new IndexWriter(ancillaryDirectory, config);
+
+ // Construct a simple query to get all documents
+ TopDocs results = searcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE);
+ ScoreDoc[] hits = results.scoreDocs;
+
+ AtomicInteger counter = new AtomicInteger(0);
+
+ // Write document data
+ for (ScoreDoc hit : hits) {
+
+ counter.incrementAndGet();
+ Document doc = searcher.storedFields().document(hit.doc);
+ Map<String, String> hierarchy = loadHierarchy(searcher, doc.get(FIELD_ID));
+
+ String status = doc.get(FIELD_STATUS);
+ if (status != null &&
+ acceptedOnly &&
+ !status.equals(TaxonomicStatus.ACCEPTED.name())) {
+ // skip synonyms, otherwise we would index them twice
+ continue;
+ }
+ String scientificName = doc.get(FIELD_SCIENTIFIC_NAME);
+ Classification classification = new Classification();
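+ // Build the Linnean classification from the stored parent hierarchy so this record
+ // can be matched by name and classification to a usage in the main index; the
+ // matched usage key is stored as the joinId used for lookups at query time.
+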
classification.setKingdom(hierarchy.getOrDefault(Rank.KINGDOM.name(), "")); + classification.setPhylum(hierarchy.getOrDefault(Rank.PHYLUM.name(), "")); + classification.setClazz(hierarchy.getOrDefault(Rank.CLASS.name(), "")); + classification.setOrder(hierarchy.getOrDefault(Rank.ORDER.name(), "")); + classification.setFamily(hierarchy.getOrDefault(Rank.FAMILY.name(), "")); + classification.setGenus(hierarchy.getOrDefault(Rank.GENUS.name(), "")); + classification.setSpecies(hierarchy.getOrDefault(Rank.SPECIES.name(), "")); + + if (counter.get() % 100000 == 0) { + LOG.info("Indexed: {} taxa", counter.get()); + } + + // match to main dataset + NameUsageMatch nameUsageMatch = matchingService.match(scientificName, classification, true); + if (nameUsageMatch.getUsage() != null) { + doc.add(new StringField(FIELD_JOIN_ID, + nameUsageMatch.getAcceptedUsage() != null ? nameUsageMatch.getAcceptedUsage().getKey() : + nameUsageMatch.getUsage().getKey(), Field.Store.YES)); + ancillaryIndexWriter.addDocument(doc); + } + } + + // close temp + tempReader.close(); + tempDirectory.close(); + + // close ancillary + ancillaryIndexWriter.commit(); + ancillaryIndexWriter.forceMerge(1); + ancillaryIndexWriter.close(); + ancillaryDirectory.close(); + + LOG.info("Ancillary index written: {} documents.", counter.get()); + } catch (Exception e) { + LOG.error("Error writing documents to CSV: {}", e.getMessage()); + } + } + + public Optional getById(IndexSearcher searcher, String id) { + Query query = new TermQuery(new Term(FIELD_ID, id)); + try { + TopDocs docs = searcher.search(query, 3); + if (docs.totalHits.value > 0) { + return Optional.of(searcher.storedFields().document(docs.scoreDocs[0].doc)); + } else { + return Optional.empty(); + } + } catch (IOException e) { + LOG.error("Cannot load usage {} from lucene index", id, e); + } + return Optional.empty(); + } + + public Map loadHierarchy(IndexSearcher searcher, String id) { + Map classification = new HashMap<>(); + while (id != null) { + Optional docOpt = getById(searcher, id); + if (docOpt.isEmpty()) { + break; + } + Document doc = docOpt.get(); + classification.put(doc.get(FIELD_RANK), doc.get(FIELD_CANONICAL_NAME)); + id = doc.get(FIELD_PARENT_ID); + } + return classification; + } + @Transactional public void indexFile(String exportPath, String indexPath) throws Exception { @@ -251,38 +441,26 @@ public void indexFile(String exportPath, String indexPath) throws Exception { final AtomicInteger counter = new AtomicInteger(0); final String filePath = exportPath + "/index.csv"; - try (CSVReader reader = new CSVReader(new FileReader(filePath), '$', '"'); - IndexWriter indexWriter = new IndexWriter(directory, config)) { + try (Reader reader = new FileReader(filePath); + IndexWriter indexWriter = new IndexWriter(directory, config)) { - String[] row = reader.readNext(); - while (row != null) { - if (row.length != 11) { - LOG.warn("Skipping row with invalid number of columns: {}", String.join(",", row)); - row = reader.readNext(); - continue; - } + CsvToBean csvReader = new CsvToBeanBuilder(reader) + .withType(NameUsage.class) + .withSeparator('$') + .withIgnoreLeadingWhiteSpace(true) + .withIgnoreEmptyLine(true) + .build(); + + Iterator iterator = csvReader.iterator(); + + while (iterator.hasNext()) { if (counter.get() % 100000 == 0) { LOG.info("Indexed: {} taxa", counter.get()); } - - NameUsage nameUsage = - NameUsage.builder() - .id(row[0]) - .parentId(row[1]) - .scientificName(row[2]) - .authorship(row[3]) - .rank(row[4]) - .status(row[5]) - 
.nomenclaturalCode(row[6]) - .sourceId(row[7]) - .sourceDatasetKey(row[8]) - .parentSourceId(row[9]) - .parentSourceDatasetKey(row[10]) - .build(); + NameUsage nameUsage = iterator.next(); Document doc = toDoc(nameUsage); indexWriter.addDocument(doc); counter.incrementAndGet(); - row = reader.readNext(); } LOG.info("Final index commit"); indexWriter.commit(); @@ -362,59 +540,64 @@ protected static Document toDoc(NameUsage nameUsage) { cultivar or strain information. Infrageneric names are represented without a leading genus. Unicode characters are replaced by their matching ASCII characters." */ - Rank rank = Rank.valueOf(nameUsage.rank); + Rank rank = Rank.valueOf(nameUsage.getRank()); Optional optCanonical = Optional.empty(); try { NomCode nomCode = null; - if (!StringUtils.isEmpty(nameUsage.nomenclaturalCode)) { - nomCode = NomCode.valueOf(nameUsage.nomenclaturalCode); + if (!StringUtils.isEmpty(nameUsage.getNomenclaturalCode())) { + nomCode = NomCode.valueOf(nameUsage.getNomenclaturalCode()); } - ParsedName pn = NameParsers.INSTANCE.parse(nameUsage.scientificName, rank, nomCode); + ParsedName pn = NameParsers.INSTANCE.parse(nameUsage.getScientificName(), rank, nomCode); // canonicalMinimal will construct the name without the hybrid marker and authorship String canonical = NameFormatter.canonicalMinimal(pn); optCanonical = Optional.ofNullable(canonical); } catch (UnparsableNameException | InterruptedException e) { // do nothing - LOG.debug("Unable to parse name to create canonical: {}", nameUsage.scientificName); + LOG.debug("Unable to parse name to create canonical: {}", nameUsage.getScientificName()); } - final String canonical = optCanonical.orElse(nameUsage.scientificName); + final String canonical = optCanonical.orElse(nameUsage.getScientificName()); // use custom precision step as we do not need range queries and prefer to save memory usage // instead - doc.add(new StringField(FIELD_ID, nameUsage.id, Field.Store.YES)); + doc.add(new StringField(FIELD_ID, nameUsage.getId(), Field.Store.YES)); // we only store accepted key, no need to index it // If the name is a synonym, then parentId name usage points // to the accepted name - if (nameUsage.status != null - && nameUsage.status.equals(TaxonomicStatus.SYNONYM.name()) - && nameUsage.parentId != null) { - doc.add(new StringField(FIELD_ACCEPTED_ID, nameUsage.parentId, Field.Store.YES)); + if (StringUtils.isNotBlank(nameUsage.getStatus()) + && nameUsage.getStatus().equals(TaxonomicStatus.SYNONYM.name()) + && nameUsage.getParentId() != null) { + doc.add(new StringField(FIELD_ACCEPTED_ID, nameUsage.getParentId(), Field.Store.YES)); } // analyzed name field - this is what we search upon doc.add(new TextField(FIELD_CANONICAL_NAME, canonical, Field.Store.YES)); // store full name and classification only to return a full match object for hits - String nameComplete = nameUsage.scientificName; - if (StringUtils.isNotBlank(nameUsage.authorship)) { - nameComplete += " " + nameUsage.authorship; + String nameComplete = nameUsage.getScientificName(); + if (StringUtils.isNotBlank(nameUsage.getAuthorship())) { + nameComplete += " " + nameUsage.getAuthorship(); } doc.add(new TextField(FIELD_SCIENTIFIC_NAME, nameComplete, Field.Store.YES)); // this lucene index is not persistent, so not risk in changing ordinal numbers - doc.add(new StringField(FIELD_RANK, nameUsage.rank, Field.Store.YES)); + doc.add(new StringField(FIELD_RANK, nameUsage.getRank(), Field.Store.YES)); - if (nameUsage.parentId != null && !nameUsage.parentId.equals(nameUsage.id)) { - 
doc.add(new StringField(FIELD_PARENT_ID, nameUsage.parentId, Field.Store.YES));
+ if (StringUtils.isNotBlank(nameUsage.getParentId()) && !nameUsage.getParentId().equals(nameUsage.getId())) {
+ doc.add(new StringField(FIELD_PARENT_ID, nameUsage.getParentId(), Field.Store.YES));
}
- if (nameUsage.status != null) {
- doc.add(new StringField(FIELD_STATUS, nameUsage.status, Field.Store.YES));
+ if (StringUtils.isNotBlank(nameUsage.getStatus())) {
+ doc.add(new StringField(FIELD_STATUS, nameUsage.getStatus(), Field.Store.YES));
}
+
+ if (StringUtils.isNotBlank(nameUsage.getCategory())) {
+ doc.add(new StringField(FIELD_CATEGORY, nameUsage.getCategory(), Field.Store.YES));
+ }
+
return doc; }
diff --git a/matching-ws/src/main/java/life/catalogue/matching/Main.java b/matching-ws/src/main/java/life/catalogue/matching/Main.java index 3b7e19672..9b89b1856 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/Main.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/Main.java
@@ -30,6 +30,14 @@ public class Main {
"Required for INDEX_DB and EXPORT_CSV modes") private String datasetId;
+ @Parameter(names = {"--clb.status.dataset.ids"}, description = "ChecklistBank dataset IDs of datasets providing status values (e.g. IUCN threat status) to index for ancillary status joins") private String statusDatasetIds;
+
+ @Parameter(names = {"--clb.identifier.dataset.ids"}, description = "ChecklistBank dataset IDs of datasets providing external identifiers (e.g. WoRMS) to index for ID lookups") private String identifierDatasetIds;
+
@Parameter(names = {"--index.path"}, description = "File system path to the pre-generated lucene index") private String indexPath = "/data/matching-ws/index";
@@ -61,7 +69,8 @@ public static void main(String[] args) throws Exception { commander.usage(); }
- if ((app.mode == ExecutionMode.INDEX_DB || app.mode == ExecutionMode.EXPORT_CSV) && app.datasetId == null) {
+ if ((app.mode == ExecutionMode.INDEX_DB
+ || app.mode == ExecutionMode.EXPORT_CSV) && app.datasetId == null) {
System.err.println("Missing required parameter for mode " + app.mode + " --clb.dataset.id"); commander.usage(); return;
@@ -73,10 +82,10 @@
SpringApplication springApplication; switch (app.mode) {
- case EXPORT_CSV, INDEX_CSV, INDEX_DB:
+ case EXPORT_CSV, INDEX_CSV, INDEX_DB, INDEX_IUCN_CSV, INDEX_IDENTIFIER_CSV:
springApplication = new SpringApplication(IndexingApplication.class); springApplication.setAdditionalProfiles("indexing");
- springApplication.run( args).close();
+ springApplication.run(args).close();
break; case WEB_APP: SpringApplication webApp = new SpringApplication(MatchingApplication.class);
@@ -89,6 +98,8 @@
enum ExecutionMode { EXPORT_CSV,
+ INDEX_IUCN_CSV,
+ INDEX_IDENTIFIER_CSV,
INDEX_CSV, INDEX_DB, WEB_APP
diff --git a/matching-ws/src/main/java/life/catalogue/matching/MatchController.java b/matching-ws/src/main/java/life/catalogue/matching/MatchController.java index 0c393efe8..48c920b42 100644
--- a/matching-ws/src/main/java/life/catalogue/matching/MatchController.java
+++ b/matching-ws/src/main/java/life/catalogue/matching/MatchController.java
@@ -69,14 +69,6 @@ private boolean isTraceRequested(String traceRequested) { (traceRequested.equalsIgnoreCase("true") || traceRequested.equalsIgnoreCase("on")); }
- private boolean getTraceParameter(HttpServletRequest request) { String parameter = request.getParameter("trace"); if (parameter == null) { -
return false; - } - return !"false".equalsIgnoreCase(parameter); - } - @Autowired public MatchController(ErrorAttributes errorAttributes) { Assert.notNull(errorAttributes, "ErrorAttributes must not be null"); @@ -138,6 +130,7 @@ public NameUsageMatch matchOldPaths( HttpServletRequest response) { return matchV2( usageKey, + null,null,null, scientificName2, scientificName, authorship, authorship2, removeNulls(genericName), @@ -215,6 +208,9 @@ public NameUsageMatch matchOldPaths( produces = "application/json") public NameUsageMatch matchV2( @RequestParam(value = "usageKey", required = false) String usageKey, + @RequestParam(value = "taxonID", required = false) String taxonID, + @RequestParam(value = "taxonConceptID", required = false) String taxonConceptID, + @RequestParam(value = "scientificNameID", required = false) String scientificNameID, @RequestParam(value = "name", required = false) String scientificName2, @RequestParam(value = "scientificName", required = false) String scientificName, @RequestParam(value = "authorship", required = false) String authorship2, @@ -231,7 +227,7 @@ public NameUsageMatch matchV2( // ugly, i know, but jackson/spring isn't working with @JsonProperty classification.setClazz(response.getParameter("class")); return matchingService.match( - removeNulls(usageKey), + first(removeNulls(usageKey), removeNulls(taxonID), removeNulls(taxonConceptID), removeNulls(scientificNameID)), first(removeNulls(scientificName), removeNulls(scientificName2)), first(removeNulls(authorship), removeNulls(authorship2)), removeNulls(genericName), diff --git a/matching-ws/src/main/java/life/catalogue/matching/NameNRank.java b/matching-ws/src/main/java/life/catalogue/matching/NameNRank.java index cd9247e83..26ce91366 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/NameNRank.java +++ b/matching-ws/src/main/java/life/catalogue/matching/NameNRank.java @@ -203,7 +203,7 @@ protected static boolean isSimpleBinomial(String name) { } private static void warnIfMissing(String name, @Nullable String epithet, String part) { - if (exists(epithet) && !name.toLowerCase().contains(epithet.toLowerCase())) { + if (exists(epithet) && name != null && !name.toLowerCase().contains(epithet.toLowerCase())) { LOG.warn("ScientificName >{}< missing {}: {}", name, part, epithet); } } diff --git a/matching-ws/src/main/java/life/catalogue/matching/NameUsage.java b/matching-ws/src/main/java/life/catalogue/matching/NameUsage.java index 6231109aa..903a9c861 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/NameUsage.java +++ b/matching-ws/src/main/java/life/catalogue/matching/NameUsage.java @@ -1,23 +1,41 @@ package life.catalogue.matching; -import lombok.Builder; -import lombok.Data; -import lombok.EqualsAndHashCode; +import com.opencsv.bean.CsvBindByName; + +import lombok.*; /** A simple class to represent a name usage ready to be indexed. 
*/ @Data @EqualsAndHashCode @Builder +@NoArgsConstructor +@AllArgsConstructor public class NameUsage { + + @CsvBindByName(column = "id") String id; + + @CsvBindByName(column = "parentId") String parentId; + + @CsvBindByName(column = "scientificName") String scientificName; + + @CsvBindByName(column = "authorship") String authorship; + + @CsvBindByName(column = "status") String status; + + @CsvBindByName(column = "rank") String rank; + + @CsvBindByName(column = "nomenclaturalCode") String nomenclaturalCode; - String sourceId; - String sourceDatasetKey; - String parentSourceId; - String parentSourceDatasetKey; + + @CsvBindByName(column = "category") + String category; + + @CsvBindByName(column = "extension") + private String extension; } diff --git a/matching-ws/src/main/java/life/catalogue/matching/NameUsageMatch.java b/matching-ws/src/main/java/life/catalogue/matching/NameUsageMatch.java index 2e567c711..a1c279b18 100644 --- a/matching-ws/src/main/java/life/catalogue/matching/NameUsageMatch.java +++ b/matching-ws/src/main/java/life/catalogue/matching/NameUsageMatch.java @@ -32,6 +32,8 @@ public class NameUsageMatch implements LinneanClassification { List alternatives = new ArrayList<>(); @Schema(description = "Diagnostics for a name match including the type of match and confidence level") Diagnostics diagnostics = new Diagnostics(); + @Schema(description = "Status information from external sources like IUCN Red List") + List additionalStatus = new ArrayList<>(); private String nameFor(Rank rank) { return getClassification().stream() diff --git a/matching-ws/src/main/java/life/catalogue/matching/Status.java b/matching-ws/src/main/java/life/catalogue/matching/Status.java new file mode 100644 index 000000000..0bef13645 --- /dev/null +++ b/matching-ws/src/main/java/life/catalogue/matching/Status.java @@ -0,0 +1,12 @@ +package life.catalogue.matching; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import lombok.Data; + +@Data +@JsonIgnoreProperties(ignoreUnknown = true) +public class Status { + private String datasetKey; + private String datasetTitle; + private String category; +} \ No newline at end of file diff --git a/matching-ws/src/main/resources/life/catalogue/matching/IndexingMapper.xml b/matching-ws/src/main/resources/life/catalogue/matching/IndexingMapper.xml index 36bef0554..82d2893e3 100644 --- a/matching-ws/src/main/resources/life/catalogue/matching/IndexingMapper.xml +++ b/matching-ws/src/main/resources/life/catalogue/matching/IndexingMapper.xml @@ -16,19 +16,40 @@ n.rank as rank, nu.status as status, n.code as nomenclaturalCode, - vs.source_id as sourceId, - vs.source_dataset_key as sourceDatasetKey, - pvs.source_id as parentSourceId, - pvs.source_dataset_key as parentSourceDatasetKey + '' as extension, + '' as category FROM name_usage nu INNER JOIN name n on n.id = nu.name_id AND n.dataset_key=#{datasetKey} - LEFT JOIN - verbatim_source vs on vs.id = nu.id AND vs.dataset_key=#{datasetKey} - LEFT JOIN - verbatim_source pvs on pvs.id = nu.parent_id AND pvs.dataset_key=#{datasetKey} WHERE nu.dataset_key = #{datasetKey} + + + diff --git a/matching-ws/src/test/java/life/catalogue/matching/DatasetIndexTest.java b/matching-ws/src/test/java/life/catalogue/matching/DatasetIndexTest.java index 6b9b8cfa9..72609af83 100644 --- a/matching-ws/src/test/java/life/catalogue/matching/DatasetIndexTest.java +++ b/matching-ws/src/test/java/life/catalogue/matching/DatasetIndexTest.java @@ -15,13 +15,17 @@ import static org.junit.jupiter.api.Assertions.*; -import 
au.com.bytecode.opencsv.CSVReader; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.google.common.io.Resources; import java.io.InputStream; import java.io.InputStreamReader; import java.util.List; + +import com.opencsv.CSVParserBuilder; +import com.opencsv.CSVReader; +import com.opencsv.CSVReaderBuilder; + import life.catalogue.api.vocab.TaxonomicStatus; import org.apache.commons.lang3.StringUtils; import org.gbif.nameparser.api.Rank; @@ -42,8 +46,9 @@ public static void buildMatcher() throws Exception { public static List readTestNames() throws Exception { List usages = Lists.newArrayList(); // 1 2 Acanthophora Hulst, 1896 Geometridae Lepidoptera Insecta Arthropoda Animalia GENUS - try (InputStream testFile = Resources.getResource("testNames.txt").openStream()) { - CSVReader reader = new CSVReader(new InputStreamReader(testFile), '\t', '"'); + try (InputStream testFile = Resources.getResource("testNames.txt").openStream(); + CSVReader reader = new CSVReaderBuilder(new InputStreamReader(testFile)) + .withCSVParser(new CSVParserBuilder().withSeparator('\t').build()).build()) { String[] row = reader.readNext(); while (row != null) { NameUsage n = NameUsage.builder().build();
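
For reference, the read path the new indexes enable, as a minimal self-contained sketch: an external identifier is first resolved in a per-dataset identifier index, the usage is then fetched from the main index via the joinId, and ancillary status records are joined back by the same key. It assumes the on-disk layout documented in IndexingService (/data/matching-ws/index/{main,identifiers,ancillary}); the subdirectory names "worms" and "iucn", the example LSID, and the "id" and "scientificName" field literals are illustrative, while "joinId" and "category" come from IndexConstants.

import java.nio.file.Path;
import java.util.Optional;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.MMapDirectory;

public class LookupSketch {

  public static void main(String[] args) throws Exception {
    try (DirectoryReader main = open("/data/matching-ws/index/main");
         DirectoryReader worms = open("/data/matching-ws/index/identifiers/worms");
         DirectoryReader iucn = open("/data/matching-ws/index/ancillary/iucn")) {

      IndexSearcher mainSearcher = new IndexSearcher(main);

      // step 1: resolve the external identifier in the per-dataset identifier index
      Optional<Document> idDoc = firstHit(new IndexSearcher(worms),
          new Term("id", "urn:lsid:marinespecies.org:taxname:126436"));
      if (idDoc.isEmpty()) return;

      // step 2: follow the joinId to the usage in the main index
      Optional<Document> usage = firstHit(mainSearcher, new Term("id", idDoc.get().get("joinId")));
      if (usage.isEmpty()) return;

      // step 3: join any ancillary status records back by the usage key
      Optional<Document> status = firstHit(new IndexSearcher(iucn),
          new Term("joinId", usage.get().get("id")));

      System.out.println(usage.get().get("scientificName")
          + " -> " + status.map(d -> d.get("category")).orElse("no status"));
    }
  }

  static DirectoryReader open(String path) throws Exception {
    return DirectoryReader.open(new MMapDirectory(Path.of(path)));
  }

  static Optional<Document> firstHit(IndexSearcher searcher, Term term) throws Exception {
    TopDocs docs = searcher.search(new TermQuery(term), 1);
    return docs.totalHits.value > 0
        ? Optional.of(searcher.storedFields().document(docs.scoreDocs[0].doc))
        : Optional.empty();
  }
}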