Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ported matching API #1336

Merged
merged 1 commit into from
Jul 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
9 changes: 8 additions & 1 deletion api/src/main/java/life/catalogue/api/vocab/MatchType.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ public enum MatchType {
* A name which is not supported in the names index and can never be matched or added.
* For example placeholder names.
*/
UNSUPPORTED;
UNSUPPORTED,

/**
* The matching algorithm was unable to match a scientific name with sufficient confidence,
* and matched a higher rank instead.
*/
HIGHERRANK
;

}
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,19 @@ public static AuthorshipNormalizer createWithoutAuthormap() {

private static AuthorshipNormalizer createWithAuthormap() {
Map<String, String> map = new HashMap<>();
Resources.tabRows(AUTHOR_MAP_FILENAME).forEach(row -> {
var value = row[0];
for (int i = 1; i < row.length; i++) {
map.put(row[i], value);
try {
Resources.tabRows(AUTHOR_MAP_FILENAME).forEach(row -> {
var value = row[0];
for (int i = 1; i < row.length; i++) {
map.put(row[i], value);
}
});
} catch (Exception e) {
LOG.warn("Failed to load author abbreviation map from {}", AUTHOR_MAP_FILENAME);
if (LOG.isDebugEnabled()){
LOG.debug("Failed to load author abbreviation map from {}", AUTHOR_MAP_FILENAME, e);
}
});
}
return new AuthorshipNormalizer(map);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import org.gbif.nameparser.api.Authorship;

import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import javax.annotation.Nullable;

Expand Down Expand Up @@ -56,6 +58,45 @@ public Equality compare(@Nullable Authorship a1, @Nullable Authorship a2) {
}
return result;
}

/**
 * Compares the author teams and years of two authorships, evaluating the author teams first.
 * <p>
 * This was ported over from gbif/checklistbank.
 * {@link #compare(Authorship, Authorship)} compares years first,
 * which leads to very different results compared to the current GBIF API.
 * <p>
 * If the author teams are EQUAL that result is returned as is. Otherwise a conclusive
 * (i.e. not UNKNOWN) year comparison may override the author result, because comparing authors is very difficult:
 * <ul>
 *   <li>years DIFFERENT, or one of the authorships has no authors: the year result is returned</li>
 *   <li>years EQUAL and both have authors: the year result is only returned if the two author
 *       lists share at least one upper case character, otherwise the author result is kept</li>
 * </ul>
 * According to the original documentation a small difference of 2 years is still accepted as EQUAL;
 * that tolerance lives in {@code YearComparator}.
 *
 * @param a1 first authorship, may be null
 * @param a2 second authorship, may be null
 * @return the author team comparison result, possibly overridden by the year comparison.
 *         If either authorship is null the author team result is returned without consulting the years.
 */
public Equality compareAuthorsFirst(@Nullable Authorship a1, @Nullable Authorship a2) {
  // compare the author teams first - the year is only consulted when they do not match
  Equality result = compareAuthorteam(a1, a2, minCommonSubstring, MIN_AUTHOR_LENGTH_WITHOUT_LOOKUP);
  if (result == Equality.EQUAL || a1 == null || a2 == null) {
    // nothing to override, or no year and authors available to look at
    return result;
  }

  // if authors are not the same we allow a conclusive year comparison to override it
  Equality yresult = new YearComparator(a1.getYear(), a2.getYear()).compare();
  if (yresult == Equality.UNKNOWN) {
    return result;
  }
  if (yresult == Equality.DIFFERENT || a1.getAuthors().isEmpty() || a2.getAuthors().isEmpty()) {
    return yresult;
  }

  // year EQUAL, i.e. very close by
  // also make sure we have at least one capital char overlap between the 2 authorships
  Set<Character> upper2 = upperCaseCharsOf(a2);
  boolean sharesCapital = upperCaseCharsOf(a1).stream().anyMatch(upper2::contains);
  return sharesCapital ? yresult : result;
}

/**
 * Collects the distinct upper case characters found in all author names of the given authorship.
 */
private static Set<Character> upperCaseCharsOf(Authorship authorship) {
  return String.join("; ", authorship.getAuthors()).chars()
      .filter(Character::isUpperCase)
      .mapToObj(c -> (char) c)
      .collect(Collectors.toSet());
}

/**
* Does a comparison of recombination and basionym authorship using the author compare method once for the recombination authorship and once for the basionym.
Expand Down
108 changes: 108 additions & 0 deletions matching-ws/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Builder stage: compiles the backend inside the official Maven 3.8.5 / OpenJDK 11 image.
# Only the artifacts copied with `COPY --from=builder` end up in the final image.
FROM maven:3.8.5-openjdk-11 AS builder

# Set the working directory in the container
WORKDIR /app

# Build arguments (all default to empty) and the ChecklistBank API base URL
# used below to download the dataset metadata.
ARG CLB_DATASET_ID=""
ARG CLB_URL=""
ARG CLB_USER=""
ARG CLB_PASSWORD=""
ENV CLB_API_URL="https://api.checklistbank.org"
ARG GIT_BRANCH=""

# Local builds - run from root of backend
#COPY .. /app/backend
#WORKDIR /app/backend

# Fail the build early when no GIT_BRANCH build argument was supplied
RUN if [ -z "$GIT_BRANCH" ]; then \
echo "Error: GIT_BRANCH is not set." >&2; \
exit 1; \
fi

# Clone the backend repository and check out the requested branch
RUN rm -Rf backend
RUN git clone https://github.com/CatalogueOfLife/backend.git
WORKDIR /app/backend
RUN git checkout $GIT_BRANCH

# Build all the CLB modules, skipping their tests
RUN mvn clean install package -DskipTests

# Switch to the matching-ws module
WORKDIR /app/backend/matching-ws

# Rebuild only matching-ws, this time with its tests enabled -
# full backend tests require additional services (e.g. ES)
RUN mvn clean install package

# Store the GitHub API commit record of the checked out HEAD as git.json
# NOTE(review): curl is called without --fail, so an HTTP error response
# (e.g. API rate limiting) would be saved as git.json without failing the build.
RUN curl -o /app/backend/git.json -H "Accept: application/vnd.github+json" "https://api.github.com/repos/catalogueoflife/backend/commits/$(git rev-parse HEAD)"

# When a dataset is requested, CLB_URL, CLB_USER and CLB_PASSWORD are all required
# NOTE(review): this check only runs after both Maven builds; it could fail much earlier.
RUN if [ -n "$CLB_DATASET_ID" ]; then \
if [ -z "$CLB_URL" ] || [ -z "$CLB_USER" ] || [ -z "$CLB_PASSWORD" ]; then \
echo "Error: CLB_URL, CLB_USER, and CLB_PASSWORD must be set when CLB_DATASET_ID is set." >&2; \
exit 1; \
fi \
fi

# Cache a copy of the dataset metadata from checklistbank for tracking;
# without a dataset id an empty JSON object is written instead
RUN if [ -n "$CLB_DATASET_ID" ]; then \
curl -o /app/backend/dataset.json $CLB_API_URL/dataset/$CLB_DATASET_ID.json; \
else \
echo "{}" > /app/backend/dataset.json; \
fi

# Runtime stage: plain OpenJDK 11 image that only receives the artifacts built in the builder stage
FROM openjdk:11

# Runtime defaults; SERVER_PORT, JVM_OPTIONS and V1_ENABLED are read by the CMD below
ARG DEBIAN_FRONTEND=noninteractive
ENV SERVER_PORT=8080
ENV JVM_OPTIONS="-Xmx2g -Xms2g"
ENV USER=matching
ENV APP_ARTIFACT=matching-ws
ENV V1_ENABLED="true"

# Build arguments consumed by the optional index build below (all default to empty)
ARG CLB_DATASET_ID=""
ARG CLB_IUCN_DATASET_ID=""
ARG CLB_IDENTIFIER_DATASET_IDS=""
ARG CLB_URL=""
ARG CLB_USER=""
ARG CLB_PASSWORD=""

# Create the data directory and an unprivileged service user/group (uid/gid 1000)
RUN mkdir -p /data/$APP_ARTIFACT && \
    groupadd -r $USER -g 1000 && useradd -r -g $USER -u 1000 -m $USER && \
    chown -R $USER:$USER /data/$APP_ARTIFACT

# Set the working directory in the container
WORKDIR /opt/gbif/$APP_ARTIFACT

# Copy the executable JAR file and the tracking metadata from the builder image to the new image
COPY --from=builder /app/backend/matching-ws/target/matching-ws-*-SNAPSHOT-exec.jar /opt/gbif/$APP_ARTIFACT/app.jar
COPY --from=builder /app/backend/git.json /opt/gbif/$APP_ARTIFACT/git.json
COPY --from=builder /app/backend/dataset.json /opt/gbif/$APP_ARTIFACT/dataset.json

# When a dataset id is given, run the application in BUILD_INDEX mode at image build time,
# writing the ChecklistBank export and the index below /data/$APP_ARTIFACT.
# NOTE(review): CLB_PASSWORD is passed as a build argument and on the command line,
# so it can show up in the image history - consider a BuildKit secret mount instead.
RUN if [ -n "$CLB_DATASET_ID" ]; then \
      java -jar app.jar \
        --mode=BUILD_INDEX \
        --index.path=/data/$APP_ARTIFACT/index \
        --export.path=/data/$APP_ARTIFACT/exports \
        --clb.dataset.id=$CLB_DATASET_ID \
        --clb.identifier.dataset.ids=$CLB_IDENTIFIER_DATASET_IDS \
        --clb.iucn.dataset.id=$CLB_IUCN_DATASET_ID \
        --clb.url=$CLB_URL \
        --clb.user=$CLB_USER \
        --clb.password=$CLB_PASSWORD; \
    fi

# Hand both the application directory and the data directory to the service user.
# The index above is built as root after the first chown, so /data has to be chowned again;
# otherwise the files would stay root-owned for the non-root runtime user.
RUN chown -R $USER:$USER /opt/gbif/$APP_ARTIFACT /data/$APP_ARTIFACT

USER $USER
EXPOSE $SERVER_PORT

# Shell form is kept so the environment variables get expanded at container start.
# `exec` replaces the shell with the JVM so java runs as PID 1 and receives
# the SIGTERM sent by `docker stop`, allowing a graceful shutdown.
CMD exec java $JVM_OPTIONS -jar app.jar --server.port=$SERVER_PORT --v1.enabled=$V1_ENABLED --working.dir=/opt/gbif/$APP_ARTIFACT
108 changes: 108 additions & 0 deletions matching-ws/Dockerfile-local
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Builder stage for local builds: compiles the backend from the local checkout
# inside the official Maven 3.8.5 / OpenJDK 11 image instead of cloning it from GitHub.
FROM maven:3.8.5-openjdk-11 AS builder

# Set the working directory in the container
WORKDIR /app

# Build arguments (all default to empty) and the ChecklistBank API base URL
# used below to download the dataset metadata.
# GIT_BRANCH is not read by this local variant; it is only kept so both Dockerfiles
# accept the same build arguments.
ARG CLB_DATASET_ID=""
ARG CLB_URL=""
ARG CLB_USER=""
ARG CLB_PASSWORD=""
ENV CLB_API_URL="https://api.checklistbank.org"
ARG GIT_BRANCH=""

# Local builds - run from the root of the backend checkout, e.g.
#   docker build -f matching-ws/Dockerfile-local .
# COPY sources must lie inside the build context, so the context root itself is copied
# (a `..` source would point outside of the context).
COPY . /app/backend
WORKDIR /app/backend

# Build all the CLB modules, skipping their tests
RUN mvn clean install package -DskipTests

# Switch to the matching-ws module
WORKDIR /app/backend/matching-ws

# Rebuild only matching-ws, this time with its tests enabled -
# full backend tests require additional services (e.g. ES)
RUN mvn clean install package

# Store the GitHub API commit record of the current HEAD as git.json
# NOTE(review): `git rev-parse HEAD` needs the .git directory to be part of the build context
# (i.e. not excluded via .dockerignore), and the commit must exist on GitHub - confirm.
RUN curl -o /app/backend/git.json -H "Accept: application/vnd.github+json" "https://api.github.com/repos/catalogueoflife/backend/commits/$(git rev-parse HEAD)"

# When a dataset is requested, CLB_URL, CLB_USER and CLB_PASSWORD are all required
RUN if [ -n "$CLB_DATASET_ID" ]; then \
      if [ -z "$CLB_URL" ] || [ -z "$CLB_USER" ] || [ -z "$CLB_PASSWORD" ]; then \
        echo "Error: CLB_URL, CLB_USER, and CLB_PASSWORD must be set when CLB_DATASET_ID is set." >&2; \
        exit 1; \
      fi \
    fi

# Cache a copy of the dataset metadata from checklistbank for tracking;
# without a dataset id an empty JSON object is written instead
RUN if [ -n "$CLB_DATASET_ID" ]; then \
      curl -o /app/backend/dataset.json $CLB_API_URL/dataset/$CLB_DATASET_ID.json; \
    else \
      echo "{}" > /app/backend/dataset.json; \
    fi

# Runtime stage: plain OpenJDK 11 image that only receives the artifacts built in the builder stage
FROM openjdk:11

# Runtime defaults; SERVER_PORT, JVM_OPTIONS and V1_ENABLED are read by the CMD below
ARG DEBIAN_FRONTEND=noninteractive
ENV SERVER_PORT=8080
ENV JVM_OPTIONS="-Xmx2g -Xms2g"
ENV USER=matching
ENV APP_ARTIFACT=matching-ws
ENV V1_ENABLED="true"

# Build arguments consumed by the optional index build below (all default to empty)
ARG CLB_DATASET_ID=""
ARG CLB_IUCN_DATASET_ID=""
ARG CLB_IDENTIFIER_DATASET_IDS=""
ARG CLB_URL=""
ARG CLB_USER=""
ARG CLB_PASSWORD=""

# Create the data directory and an unprivileged service user/group (uid/gid 1000)
RUN mkdir -p /data/$APP_ARTIFACT && \
    groupadd -r $USER -g 1000 && useradd -r -g $USER -u 1000 -m $USER && \
    chown -R $USER:$USER /data/$APP_ARTIFACT

# Set the working directory in the container
WORKDIR /opt/gbif/$APP_ARTIFACT

# Copy the executable JAR file and the tracking metadata from the builder image to the new image
COPY --from=builder /app/backend/matching-ws/target/matching-ws-*-SNAPSHOT-exec.jar /opt/gbif/$APP_ARTIFACT/app.jar
COPY --from=builder /app/backend/git.json /opt/gbif/$APP_ARTIFACT/git.json
COPY --from=builder /app/backend/dataset.json /opt/gbif/$APP_ARTIFACT/dataset.json

# When a dataset id is given, run the application in BUILD_INDEX mode at image build time,
# writing the ChecklistBank export and the index below /data/$APP_ARTIFACT.
# NOTE(review): CLB_PASSWORD is passed as a build argument and on the command line,
# so it can show up in the image history - consider a BuildKit secret mount instead.
RUN if [ -n "$CLB_DATASET_ID" ]; then \
      java -jar app.jar \
        --mode=BUILD_INDEX \
        --index.path=/data/$APP_ARTIFACT/index \
        --export.path=/data/$APP_ARTIFACT/exports \
        --clb.dataset.id=$CLB_DATASET_ID \
        --clb.identifier.dataset.ids=$CLB_IDENTIFIER_DATASET_IDS \
        --clb.iucn.dataset.id=$CLB_IUCN_DATASET_ID \
        --clb.url=$CLB_URL \
        --clb.user=$CLB_USER \
        --clb.password=$CLB_PASSWORD; \
    fi

# Hand both the application directory and the data directory to the service user.
# The index above is built as root after the first chown, so /data has to be chowned again;
# otherwise the files would stay root-owned for the non-root runtime user.
RUN chown -R $USER:$USER /opt/gbif/$APP_ARTIFACT /data/$APP_ARTIFACT

USER $USER
EXPOSE $SERVER_PORT

# Shell form is kept so the environment variables get expanded at container start.
# `exec` replaces the shell with the JVM so java runs as PID 1 and receives
# the SIGTERM sent by `docker stop`, allowing a graceful shutdown.
CMD exec java $JVM_OPTIONS -jar app.jar --server.port=$SERVER_PORT --v1.enabled=$V1_ENABLED --working.dir=/opt/gbif/$APP_ARTIFACT
27 changes: 27 additions & 0 deletions matching-ws/JenkinsFile
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Builds and pushes one matching-ws Docker image per ChecklistBank dataset id.
// Reads the comma separated CLB_DATASET_ID parameter and the CLB_URL, CLB_USER and CLB_PASSWORD
// variables; an optional GIT_BRANCH parameter selects the backend branch to build.
pipeline {
    agent any

    stages {
        stage('Download Dockerfile') {
            steps {
                script {
                    // Branch to build; falls back to the previously hard-coded 'matching-ws' branch
                    def gitBranch = params.GIT_BRANCH ?: 'matching-ws'
                    // Download Dockerfile from URL; -f fails the stage on HTTP errors
                    // instead of saving the error page as the Dockerfile
                    sh "curl -fsSL -o Dockerfile https://raw.githubusercontent.com/CatalogueOfLife/backend/${gitBranch}/matching-ws/Dockerfile"
                }
            }
        }

        stage('Build Docker Image') {
            steps {
                script {
                    // The Dockerfile aborts when GIT_BRANCH is empty, so it must always be passed on
                    def gitBranch = params.GIT_BRANCH ?: 'matching-ws'
                    def datasetIds = params.CLB_DATASET_ID.split(',')
                    for (def rawDatasetID in datasetIds) {
                        // tolerate whitespace around the commas, which would break the image tag
                        def datasetID = rawDatasetID.trim()
                        // Run Docker build with parameters
                        // NOTE(review): the credentials are interpolated into the shell command by Groovy
                        // and may therefore appear in the build log - consider withCredentials.
                        sh "docker build --platform linux/amd64 --build-arg GIT_BRANCH=${gitBranch} --build-arg CLB_DATASET_ID=${datasetID} --build-arg CLB_URL=${CLB_URL} --build-arg CLB_USER=${CLB_USER} --build-arg CLB_PASSWORD=${CLB_PASSWORD} . -t docker.gbif.org/matching-ws/${datasetID}:1.0-SNAPSHOT"
                        sh "docker push docker.gbif.org/matching-ws/${datasetID}:1.0-SNAPSHOT"
                    }
                }
            }
        }
    }
}
Loading