Skip to content

Commit

Permalink
Include atomised name to avoid pipelines and other services reparsing…
Browse files Browse the repository at this point in the history
… the name

#1350
  • Loading branch information
djtfmartin committed Sep 6, 2024
1 parent dd75be9 commit c4b2f64
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 85 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -524,12 +524,36 @@ public List<ExternalID> lookupIdentifier(@NotNull String identifier) {
* @return List of ExternalID
*/
public List<ExternalID> lookupIdentifier(@NotNull String datasetID, @NotNull String identifier) {
  // Delegate to the searcher-parameterised overload, passing the identifier searchers.
  final List<ExternalID> matches = lookupIdentifier(datasetID, identifier, identifierSearchers);
  return matches;
}

/**
* Matches an external ID against the ancillary indexes. Intended for debug purposes only,
* to quickly check if ids are present and joined to main index or not.
*
* @param datasetID the datasetKey to match
* @param identifier the identifier to match
* @return List of ExternalID
*/
public List<ExternalID> lookupAncillary(@NotNull String datasetID, @NotNull String identifier) {
  // Delegate to the searcher-parameterised lookup, passing the ancillary searchers.
  final List<ExternalID> matches = lookupIdentifier(datasetID, identifier, ancillarySearchers);
  return matches;
}

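/**
* Matches an external ID against the supplied per-dataset indexes. Intended for debug
* purposes only, to quickly check if ids are present and joined to main index or not.
*
* @param datasetID the datasetKey to match
* @param identifier the identifier to match
* @param searchers the index searchers to query, keyed by dataset; may be null or empty
* @return List of ExternalID
*/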
public List<ExternalID> lookupIdentifier(@NotNull String datasetID, @NotNull String identifier, Map<Dataset, IndexSearcher> searchers) {
List<ExternalID> results = new ArrayList<>();

try {
// if join indexes are present, add them to the match
if (identifierSearchers != null && !identifierSearchers.isEmpty()) {
for (Dataset dataset : identifierSearchers.keySet()) {
if (searchers != null && !searchers.isEmpty()) {
for (Dataset dataset : searchers.keySet()) {

// use the prefix mapping
if (dataset.getKey().toString().equals(datasetID) || (dataset.getGbifKey() != null && dataset.getGbifKey().equals(datasetID))) {
Expand All @@ -540,12 +564,12 @@ public List<ExternalID> lookupIdentifier(@NotNull String datasetID, @NotNull Str
}

// find the index and search it
IndexSearcher identifierSearcher = identifierSearchers.get(dataset);
IndexSearcher searcher = searchers.get(dataset);
Query identifierQuery = new TermQuery(new Term(FIELD_ID, identifier));
TopDocs identifierDocs = identifierSearcher.search(identifierQuery, 3);
TopDocs identifierDocs = searcher.search(identifierQuery, 3);

if (identifierDocs.totalHits.value > 0) {
Document identifierDoc = identifierSearcher.storedFields().
Document identifierDoc = searcher.storedFields().
document(identifierDocs.scoreDocs[0].doc);

results.add(toExternalID(identifierDoc, dataset));
Expand All @@ -560,6 +584,7 @@ public List<ExternalID> lookupIdentifier(@NotNull String datasetID, @NotNull Str
return results;
}


private static ExternalID toExternalID(Document doc, Dataset dataset) {
return ExternalID.builder()
.id(doc.get(FIELD_ID))
Expand Down Expand Up @@ -774,7 +799,9 @@ private NameUsageMatch fromDoc(Document doc) {
// if ancillary join indexes are present, add them to the match
for (Dataset dataset: ancillarySearchers.keySet()){
IndexSearcher ancillarySearcher = ancillarySearchers.get(dataset);
Query query = new TermQuery(new Term(FIELD_JOIN_ID, doc.get(FIELD_ID) ));
Query query = new TermQuery(
new Term(FIELD_JOIN_ID, doc.get(FIELD_ID))
);
try {
TopDocs docs = ancillarySearcher.search(query, 3);
if (docs.totalHits.value > 0) {
Expand Down Expand Up @@ -814,6 +841,7 @@ private static NameUsageMatch.Usage constructUsage(Document doc) {
NameUsageMatch.Usage.UsageBuilder b = NameUsageMatch.Usage.builder()
.key(doc.get(FIELD_ID))
.name(doc.get(FIELD_SCIENTIFIC_NAME))
.authorship(doc.get(FIELD_AUTHORSHIP))
.rank(Rank.valueOf(doc.get(FIELD_RANK)))
.canonicalName(doc.get(FIELD_CANONICAL_NAME))
.code(getCode(doc));
Expand Down Expand Up @@ -867,8 +895,7 @@ private static NameUsageMatch.Usage constructUsage(Document doc) {
}
}

NameUsageMatch.Usage usage = b.build();
return usage;
return b.build();
}

private static NomCode getCode(Document doc) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,8 @@ public static class Usage implements Serializable {
private String key;
@Schema(description = "The name usage")
private String name;
@JsonIgnore private String canonicalName;
private String canonicalName;
private String authorship;
@JsonIgnore private String parentID;
@Schema(description = "The taxonomic rank for the name usage")
private Rank rank;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -679,7 +679,7 @@ public void run() {
}

/**
 * Tests whether a taxonomic status string denotes an accepted usage.
 *
 * @param status the status name to test, may be null
 * @return true only when status is non-null and equals the name of TaxonomicStatus.ACCEPTED
 */
private boolean isAccepted(String status) {
  // Constant-first equals also covers a null status, so no separate null check is needed.
  return TaxonomicStatus.ACCEPTED.name().equals(status);
}
}

Expand Down Expand Up @@ -845,24 +845,35 @@ private static void finishIndex(IndexWriter indexWriter) throws IOException {
return Paths.get(indexPath);
}

/**
* Generates the Lucene document for a name usage.
*
* @param nameUsage the name usage to convert
* @return the Lucene document built from the name usage
*/
protected static Document toDoc(NameUsage nameUsage) {

Document doc = new Document();
/*
Porting notes: The canonical name *sensu stricto* with nothing else but three name parts at
most (genus, species, infraspecific). No rank or hybrid markers and no authorship,
cultivar or strain information. Infrageneric names are represented without a
leading genus. Unicode characters are replaced by their matching ASCII characters."
leading genus. Unicode characters are replaced by their matching ASCII characters.
*/
Rank rank = Rank.valueOf(nameUsage.getRank());
Rank rank = Rank.valueOf(nameUsage.getRank());

Optional<String> optCanonical = Optional.empty();
try {
NomCode nomCode = null;
if (!StringUtils.isEmpty(nameUsage.getNomenclaturalCode())) {
nomCode = NomCode.valueOf(nameUsage.getNomenclaturalCode());
}
ParsedName pn = NameParsers.INSTANCE.parse(nameUsage.getScientificName(), rank, nomCode);

ParsedName pn = null;
if (StringUtils.isBlank(nameUsage.getAuthorship())) {
pn = NameParsers.INSTANCE.parse(nameUsage.getScientificName(), rank, nomCode);
} else{
pn = NameParsers.INSTANCE.parse(nameUsage.getScientificName() + " " + nameUsage.getAuthorship(), rank, nomCode);
}

StoredParsedName storedParsedName = new StoredParsedName();
storedParsedName.setAbbreviated(pn.isAbbreviated());
Expand Down Expand Up @@ -945,7 +956,9 @@ protected static Document toDoc(NameUsage nameUsage) {
String nameComplete = nameUsage.getScientificName();
if (StringUtils.isNotBlank(nameUsage.getAuthorship())) {
nameComplete += " " + nameUsage.getAuthorship();
doc.add(new TextField(FIELD_AUTHORSHIP, nameUsage.getAuthorship(), Field.Store.YES));
}

doc.add(new TextField(FIELD_SCIENTIFIC_NAME, nameComplete, Field.Store.YES));

// this lucene index is not persistent, so not risk in changing ordinal numbers
Expand All @@ -969,75 +982,4 @@ protected static Document toDoc(NameUsage nameUsage) {

return doc;
}

// /**
// * Converts a {@link org.gbif.nameparser.api.ParsedName} into {@link
// * org.gbif.pipelines.io.avro.ParsedName}.
// */
// private static ParsedName toParsedNameAvro(org.gbif.nameparser.api.ParsedName pn) {
// ParsedName.Builder builder =
// ParsedName.newBuilder()
// .setAbbreviated(pn.isAbbreviated())
// .setAutonym(pn.isAutonym())
// .setBinomial(pn.isBinomial())
// .setCandidatus(pn.isCandidatus())
// .setCultivarEpithet(pn.getCultivarEpithet())
// .setDoubtful(pn.isDoubtful())
// .setGenus(pn.getGenus())
// .setUninomial(pn.getUninomial())
// .setUnparsed(pn.getUnparsed())
// .setTrinomial(pn.isTrinomial())
// .setIncomplete(pn.isIncomplete())
// .setIndetermined(pn.isIndetermined())
// .setTerminalEpithet(pn.getTerminalEpithet())
// .setInfragenericEpithet(pn.getInfragenericEpithet())
// .setInfraspecificEpithet(pn.getInfraspecificEpithet())
// .setExtinct(pn.isExtinct())
// .setPublishedIn(pn.getPublishedIn())
// .setSanctioningAuthor(pn.getSanctioningAuthor())
// .setSpecificEpithet(pn.getSpecificEpithet())
// .setPhrase(pn.getPhrase())
// .setPhraseName(pn.isPhraseName())
// .setVoucher(pn.getVoucher())
// .setNominatingParty(pn.getNominatingParty())
// .setNomenclaturalNote(pn.getNomenclaturalNote());
//
// // Nullable fields
// Optional.ofNullable(pn.getWarnings())
// .ifPresent(w -> builder.setWarnings(new ArrayList<>(pn.getWarnings())));
// Optional.ofNullable(pn.getBasionymAuthorship())
// .ifPresent(authorship -> builder.setBasionymAuthorship(toAuthorshipAvro(authorship)));
// Optional.ofNullable(pn.getCombinationAuthorship())
// .ifPresent(authorship -> builder.setCombinationAuthorship(toAuthorshipAvro(authorship)));
// Optional.ofNullable(pn.getCode())
// .ifPresent(code -> builder.setCode(NomCode.valueOf(code.name())));
// Optional.ofNullable(pn.getType())
// .ifPresent(type -> builder.setType(NameType.valueOf(type.name())));
// Optional.ofNullable(pn.getNotho())
// .ifPresent(notho -> builder.setNotho(NamePart.valueOf(notho.name())));
// Optional.ofNullable(pn.getRank())
// .ifPresent(rank -> builder.setRank(NameRank.valueOf(rank.name())));
// Optional.ofNullable(pn.getState())
// .ifPresent(state -> builder.setState(State.valueOf(state.name())));
// Optional.ofNullable(pn.getEpithetQualifier())
// .map(
// eq ->
// eq.entrySet().stream()
// .collect(Collectors.toMap(e -> e.getKey().name(), Map.Entry::getValue)))
// .ifPresent(builder::setEpithetQualifier);
// return builder.build();
// }
//
//
// * Converts a {@link org.gbif.nameparser.api.Authorship} into {@link
// * org.gbif.pipelines.io.avro.Authorship}.
// */
// private static Authorship toAuthorshipAvro(org.gbif.nameparser.api.Authorship authorship) {
// return Authorship.newBuilder()
// .setEmpty(authorship.isEmpty())
// .setYear(authorship.getYear())
// .setAuthors(authorship.getAuthors())
// .setExAuthors(authorship.getExAuthors())
// .build();
// }
}
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,9 @@ public List<ExternalID> lookupJoins(String identifier){
* @return the list of matches
*/
public List<ExternalID> matchID(String datasetID, String identifier){
  // Query the identifier indexes and the ancillary indexes separately, then return both
  // result sets as a single immutable list, identifier matches first.
  List<ExternalID> ids = datasetIndex.lookupIdentifier(datasetID, identifier);
  List<ExternalID> ancillary = datasetIndex.lookupAncillary(datasetID, identifier);
  return ImmutableList.<ExternalID>builder().addAll(ids).addAll(ancillary).build();
}

public NameUsageMatch match(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ public class IndexConstants {
public static final String FIELD_ACCEPTED_ID = "accid";
// Read back as the usage's canonical name when a usage is built from a document.
public static final String FIELD_CANONICAL_NAME = "canonical";
// Stored text field holding the scientific name, with the authorship appended when one is present.
public static final String FIELD_SCIENTIFIC_NAME = "sciname";
// Stored text field holding the authorship on its own; only written when the authorship is not blank.
public static final String FIELD_AUTHORSHIP = "authorship";
// Value is parsed with Rank.valueOf when a usage is built from a document.
public static final String FIELD_RANK = "rank";
public static final String FIELD_STATUS = "status";
public static final String FIELD_PARENT_ID = "parentId";
Expand Down

0 comments on commit c4b2f64

Please sign in to comment.