Include atomised name to avoid pipelines and other services reparsing the name #1350
CatalogueOfLife · Sep 6, 2024 · c4b2f64 · c4b2f64
1 parent dd75be9
commit c4b2f64
Show file tree

Hide file tree

Showing 5 changed files with 58 additions and 85 deletions.
diff --git a/matching-ws/src/main/java/life/catalogue/matching/index/DatasetIndex.java b/matching-ws/src/main/java/life/catalogue/matching/index/DatasetIndex.java
@@ -524,12 +524,36 @@ public List<ExternalID> lookupIdentifier(@NotNull String identifier)  {
    * @return List of ExternalID
    */
   public List<ExternalID> lookupIdentifier(@NotNull String datasetID, @NotNull String identifier)  {
+    return lookupIdentifier(datasetID, identifier, identifierSearchers);
+  }
+
+  /**
+   * Matches an external ID against the ancillary indexes. Intended for debug purposes
+   * only, to quickly check if ids are present and joined to main index or not.
+   *
+   * @param datasetID the datasetKey to match
+   * @param identifier the identifier to match
+   * @return List of ExternalID
+   */
+  public List<ExternalID> lookupAncillary(@NotNull String datasetID, @NotNull String identifier)  {
+    return lookupIdentifier(datasetID, identifier, ancillarySearchers);
+  }
+
+  /**
+   * Matches an external ID against the supplied searchers. Intended for debug purposes
+   * only, to quickly check if ids are present and joined to main index or not.
+   * @param datasetID the datasetKey to match
+   * @param identifier the identifier to match
+   * @param searchers the index searchers to query, keyed by dataset
+   * @return List of ExternalID
+   */
+  public List<ExternalID> lookupIdentifier(@NotNull String datasetID, @NotNull String identifier,  Map<Dataset, IndexSearcher> searchers)  {
     List<ExternalID> results = new ArrayList<>();
 
     try {
       // if join indexes are present, add them to the match
-      if (identifierSearchers != null && !identifierSearchers.isEmpty()) {
-        for (Dataset dataset : identifierSearchers.keySet()) {
+      if (searchers != null && !searchers.isEmpty()) {
+        for (Dataset dataset : searchers.keySet()) {
 
           // use the prefix mapping
           if (dataset.getKey().toString().equals(datasetID) || (dataset.getGbifKey() != null && dataset.getGbifKey().equals(datasetID))) {
@@ -540,12 +564,12 @@ public List<ExternalID> lookupIdentifier(@NotNull String datasetID, @NotNull Str
             }
 
             // find the index and search it
-            IndexSearcher identifierSearcher = identifierSearchers.get(dataset);
+            IndexSearcher searcher = searchers.get(dataset);
             Query identifierQuery = new TermQuery(new Term(FIELD_ID, identifier));
-            TopDocs identifierDocs = identifierSearcher.search(identifierQuery, 3);
+            TopDocs identifierDocs = searcher.search(identifierQuery, 3);
 
             if (identifierDocs.totalHits.value > 0) {
-              Document identifierDoc = identifierSearcher.storedFields().
+              Document identifierDoc = searcher.storedFields().
                 document(identifierDocs.scoreDocs[0].doc);
 
               results.add(toExternalID(identifierDoc, dataset));
@@ -560,6 +584,7 @@ public List<ExternalID> lookupIdentifier(@NotNull String datasetID, @NotNull Str
     return results;
   }
 
+
   private static ExternalID toExternalID(Document doc, Dataset dataset) {
     return ExternalID.builder()
       .id(doc.get(FIELD_ID))
@@ -774,7 +799,9 @@ private NameUsageMatch fromDoc(Document doc) {
     // if ancillary join indexes are present, add them to the match
     for (Dataset dataset: ancillarySearchers.keySet()){
       IndexSearcher ancillarySearcher = ancillarySearchers.get(dataset);
-      Query query = new TermQuery(new Term(FIELD_JOIN_ID, doc.get(FIELD_ID) ));
+      Query query = new TermQuery(
+        new Term(FIELD_JOIN_ID, doc.get(FIELD_ID))
+      );
       try {
         TopDocs docs = ancillarySearcher.search(query, 3);
         if (docs.totalHits.value > 0) {
@@ -814,6 +841,7 @@ private static NameUsageMatch.Usage constructUsage(Document doc) {
     NameUsageMatch.Usage.UsageBuilder b = NameUsageMatch.Usage.builder()
         .key(doc.get(FIELD_ID))
         .name(doc.get(FIELD_SCIENTIFIC_NAME))
+        .authorship(doc.get(FIELD_AUTHORSHIP))
         .rank(Rank.valueOf(doc.get(FIELD_RANK)))
         .canonicalName(doc.get(FIELD_CANONICAL_NAME))
         .code(getCode(doc));
@@ -867,8 +895,7 @@ private static NameUsageMatch.Usage constructUsage(Document doc) {
         }
     }
 
-    NameUsageMatch.Usage usage = b.build();
-    return usage;
+    return  b.build();
   }
 
   private static NomCode getCode(Document doc) {

diff --git a/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageMatch.java b/matching-ws/src/main/java/life/catalogue/matching/model/NameUsageMatch.java
@@ -330,7 +330,8 @@ public static class Usage implements Serializable {
     private String key;
     @Schema(description = "The name usage")
     private String name;
-    @JsonIgnore private String canonicalName;
+    private String canonicalName;
+    private String authorship;
     @JsonIgnore private String parentID;
     @Schema(description = "The taxonomic rank for the name usage")
     private Rank rank;

diff --git a/matching-ws/src/main/java/life/catalogue/matching/service/IndexingService.java b/matching-ws/src/main/java/life/catalogue/matching/service/IndexingService.java
@@ -679,7 +679,7 @@ public void run() {
     }
 
     private boolean isAccepted(String status) {
-      return status != null && !status.equals(TaxonomicStatus.ACCEPTED.name());
+      return status != null && status.equals(TaxonomicStatus.ACCEPTED.name());
     }
   }
 
@@ -845,24 +845,35 @@ private static void finishIndex(IndexWriter indexWriter) throws IOException {
     return Paths.get(indexPath);
   }
 
+  /**
+   * Generate the lucene document for a name usage
+   * @param nameUsage the name usage to convert
+   * @return lucene document
+   */
   protected static Document toDoc(NameUsage nameUsage) {
 
     Document doc = new Document();
     /*
      Porting notes: The canonical name *sensu stricto* with nothing else but three name parts at
      most (genus, species, infraspecific). No rank or hybrid markers and no authorship,
      cultivar or strain information. Infrageneric names are represented without a
-     leading genus. Unicode characters are replaced by their matching ASCII characters."
+     leading genus. Unicode characters are replaced by their matching ASCII characters.
     */
-     Rank rank = Rank.valueOf(nameUsage.getRank());
+    Rank rank = Rank.valueOf(nameUsage.getRank());
 
     Optional<String> optCanonical = Optional.empty();
     try {
       NomCode nomCode = null;
       if (!StringUtils.isEmpty(nameUsage.getNomenclaturalCode())) {
         nomCode = NomCode.valueOf(nameUsage.getNomenclaturalCode());
       }
-      ParsedName pn = NameParsers.INSTANCE.parse(nameUsage.getScientificName(), rank, nomCode);
+
+      ParsedName pn = null;
+      if (StringUtils.isBlank(nameUsage.getAuthorship())) {
+        pn = NameParsers.INSTANCE.parse(nameUsage.getScientificName(), rank, nomCode);
+      } else{
+        pn = NameParsers.INSTANCE.parse(nameUsage.getScientificName() + " " + nameUsage.getAuthorship(), rank, nomCode);
+      }
 
       StoredParsedName storedParsedName = new StoredParsedName();
       storedParsedName.setAbbreviated(pn.isAbbreviated());
@@ -945,7 +956,9 @@ protected static Document toDoc(NameUsage nameUsage) {
     String nameComplete = nameUsage.getScientificName();
     if (StringUtils.isNotBlank(nameUsage.getAuthorship())) {
       nameComplete += " " + nameUsage.getAuthorship();
+      doc.add(new TextField(FIELD_AUTHORSHIP, nameUsage.getAuthorship(), Field.Store.YES));
     }
+
     doc.add(new TextField(FIELD_SCIENTIFIC_NAME, nameComplete, Field.Store.YES));
 
     // this lucene index is not persistent, so no risk in changing ordinal numbers
@@ -969,75 +982,4 @@ protected static Document toDoc(NameUsage nameUsage) {
 
     return doc;
   }
-
-//  /**
-//   * Converts a {@link org.gbif.nameparser.api.ParsedName} into {@link
-//   * org.gbif.pipelines.io.avro.ParsedName}.
-//   */
-//  private static ParsedName toParsedNameAvro(org.gbif.nameparser.api.ParsedName pn) {
-//    ParsedName.Builder builder =
-//      ParsedName.newBuilder()
-//        .setAbbreviated(pn.isAbbreviated())
-//        .setAutonym(pn.isAutonym())
-//        .setBinomial(pn.isBinomial())
-//        .setCandidatus(pn.isCandidatus())
-//        .setCultivarEpithet(pn.getCultivarEpithet())
-//        .setDoubtful(pn.isDoubtful())
-//        .setGenus(pn.getGenus())
-//        .setUninomial(pn.getUninomial())
-//        .setUnparsed(pn.getUnparsed())
-//        .setTrinomial(pn.isTrinomial())
-//        .setIncomplete(pn.isIncomplete())
-//        .setIndetermined(pn.isIndetermined())
-//        .setTerminalEpithet(pn.getTerminalEpithet())
-//        .setInfragenericEpithet(pn.getInfragenericEpithet())
-//        .setInfraspecificEpithet(pn.getInfraspecificEpithet())
-//        .setExtinct(pn.isExtinct())
-//        .setPublishedIn(pn.getPublishedIn())
-//        .setSanctioningAuthor(pn.getSanctioningAuthor())
-//        .setSpecificEpithet(pn.getSpecificEpithet())
-//        .setPhrase(pn.getPhrase())
-//        .setPhraseName(pn.isPhraseName())
-//        .setVoucher(pn.getVoucher())
-//        .setNominatingParty(pn.getNominatingParty())
-//        .setNomenclaturalNote(pn.getNomenclaturalNote());
-//
-//    // Nullable fields
-//    Optional.ofNullable(pn.getWarnings())
-//      .ifPresent(w -> builder.setWarnings(new ArrayList<>(pn.getWarnings())));
-//    Optional.ofNullable(pn.getBasionymAuthorship())
-//      .ifPresent(authorship -> builder.setBasionymAuthorship(toAuthorshipAvro(authorship)));
-//    Optional.ofNullable(pn.getCombinationAuthorship())
-//      .ifPresent(authorship -> builder.setCombinationAuthorship(toAuthorshipAvro(authorship)));
-//    Optional.ofNullable(pn.getCode())
-//      .ifPresent(code -> builder.setCode(NomCode.valueOf(code.name())));
-//    Optional.ofNullable(pn.getType())
-//      .ifPresent(type -> builder.setType(NameType.valueOf(type.name())));
-//    Optional.ofNullable(pn.getNotho())
-//      .ifPresent(notho -> builder.setNotho(NamePart.valueOf(notho.name())));
-//    Optional.ofNullable(pn.getRank())
-//      .ifPresent(rank -> builder.setRank(NameRank.valueOf(rank.name())));
-//    Optional.ofNullable(pn.getState())
-//      .ifPresent(state -> builder.setState(State.valueOf(state.name())));
-//    Optional.ofNullable(pn.getEpithetQualifier())
-//      .map(
-//        eq ->
-//          eq.entrySet().stream()
-//            .collect(Collectors.toMap(e -> e.getKey().name(), Map.Entry::getValue)))
-//      .ifPresent(builder::setEpithetQualifier);
-//    return builder.build();
-//  }
-//
-//
-//   * Converts a {@link org.gbif.nameparser.api.Authorship} into {@link
-//   * org.gbif.pipelines.io.avro.Authorship}.
-//    */
-//  private static Authorship toAuthorshipAvro(org.gbif.nameparser.api.Authorship authorship) {
-//    return Authorship.newBuilder()
-//      .setEmpty(authorship.isEmpty())
-//      .setYear(authorship.getYear())
-//      .setAuthors(authorship.getAuthors())
-//      .setExAuthors(authorship.getExAuthors())
-//      .build();
-//  }
 }
diff --git a/matching-ws/src/main/java/life/catalogue/matching/service/MatchingService.java b/matching-ws/src/main/java/life/catalogue/matching/service/MatchingService.java
@@ -262,7 +262,9 @@ public List<ExternalID> lookupJoins(String identifier){
    * @return the list of matches
    */
   public List<ExternalID> matchID(String datasetID, String identifier){
-    return datasetIndex.lookupIdentifier(datasetID, identifier);
+    List<ExternalID> ids = datasetIndex.lookupIdentifier(datasetID, identifier);
+    List<ExternalID> ancillary = datasetIndex.lookupAncillary(datasetID, identifier);
+    return ImmutableList.<ExternalID>builder().addAll(ids).addAll(ancillary).build();
   }
 
   public NameUsageMatch match(

diff --git a/matching-ws/src/main/java/life/catalogue/matching/util/IndexConstants.java b/matching-ws/src/main/java/life/catalogue/matching/util/IndexConstants.java
@@ -8,6 +8,7 @@ public class IndexConstants {
   public static final String FIELD_ACCEPTED_ID = "accid";
   public static final String FIELD_CANONICAL_NAME = "canonical";
   public static final String FIELD_SCIENTIFIC_NAME = "sciname";
+  public static final String FIELD_AUTHORSHIP = "authorship";
   public static final String FIELD_RANK = "rank";
   public static final String FIELD_STATUS = "status";
   public static final String FIELD_PARENT_ID = "parentId";