Skip to content

Commit

Permalink
Work for issues:
Browse files Browse the repository at this point in the history
  • Loading branch information
djtfmartin committed Jun 28, 2024
1 parent 96b87ff commit d38182b
Show file tree
Hide file tree
Showing 387 changed files with 1,170,694 additions and 7 deletions.
9 changes: 8 additions & 1 deletion api/src/main/java/life/catalogue/api/vocab/MatchType.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ public enum MatchType {
* A name which is not supported in the names index and can never be matched or added.
* For example placeholder names.
*/
UNSUPPORTED;
UNSUPPORTED,

/**
* The matching algorithm was unable to match a scientific name with sufficient confidence,
* and matched a higher rank instead.
*/
HIGHERRANK
;

}
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,19 @@ public static AuthorshipNormalizer createWithoutAuthormap() {

private static AuthorshipNormalizer createWithAuthormap() {
Map<String, String> map = new HashMap<>();
Resources.tabRows(AUTHOR_MAP_FILENAME).forEach(row -> {
var value = row[0];
for (int i = 1; i < row.length; i++) {
map.put(row[i], value);
try {
Resources.tabRows(AUTHOR_MAP_FILENAME).forEach(row -> {
var value = row[0];
for (int i = 1; i < row.length; i++) {
map.put(row[i], value);
}
});
} catch (Exception e) {
LOG.warn("Failed to load author abbreviation map from {}", AUTHOR_MAP_FILENAME);
if (LOG.isDebugEnabled()){
LOG.debug("Failed to load author abbreviation map from {}", AUTHOR_MAP_FILENAME, e);
}
});
}
return new AuthorshipNormalizer(map);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import org.gbif.nameparser.api.Authorship;

import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import javax.annotation.Nullable;

Expand Down Expand Up @@ -56,6 +58,45 @@ public Equality compare(@Nullable Authorship a1, @Nullable Authorship a2) {
}
return result;
}

/**
 * Compares two authorships by looking at the author teams first and only falling back to the years
 * when the author teams are not considered equal.
 * <p>
 * This was ported over from gbif/checklistbank. {@link #compare(Authorship, Authorship)} compares
 * years first, which leads to very different results compared to the current GBIF API.
 * <p>
 * Outcome:
 * <ul>
 *   <li>If the author teams compare as EQUAL, or either authorship is {@code null}, the author team
 *       result is returned unchanged.</li>
 *   <li>Otherwise the years are compared. An UNKNOWN year result keeps the author team result.</li>
 *   <li>A DIFFERENT year result always wins.</li>
 *   <li>An EQUAL year result wins if one side has no authors at all, or if both author lists share
 *       at least one upper case character.</li>
 * </ul>
 * What counts as an EQUAL year is decided by {@code YearComparator}; the original port notes that a
 * small difference of 2 years is accepted.
 *
 * @param a1 first authorship, may be null
 * @param a2 second authorship, may be null
 * @return the combined equality of author team and year
 */
public Equality compareAuthorsFirst(@Nullable Authorship a1, @Nullable Authorship a2) {
  // compare the author teams first
  Equality result = compareAuthorteam(a1, a2, minCommonSubstring, MIN_AUTHOR_LENGTH_WITHOUT_LOOKUP);
  // without both authorships there are no years or author lists to inspect (previously an NPE)
  if (result == Equality.EQUAL || a1 == null || a2 == null) {
    return result;
  }

  // if authors are not the same we allow a year comparison to override it as author comparison is very difficult
  Equality yresult = new YearComparator(a1.getYear(), a2.getYear()).compare();
  if (yresult == Equality.UNKNOWN) {
    return result;
  }
  if (yresult == Equality.DIFFERENT || a1.getAuthors().isEmpty() || a2.getAuthors().isEmpty()) {
    return yresult;
  }

  // year EQUAL, i.e. very close by:
  // also make sure we have at least one capital char overlap between the 2 authorships
  Set<Character> shared = upperCaseChars(a1.getAuthors());
  shared.retainAll(upperCaseChars(a2.getAuthors()));
  return shared.isEmpty() ? result : yresult;
}

/**
 * Collects the distinct upper case characters found in the given author names.
 */
private static Set<Character> upperCaseChars(Iterable<? extends CharSequence> authors) {
  return String.join("; ", authors).chars()
      .filter(Character::isUpperCase)
      .mapToObj(c -> (char) c)
      .collect(Collectors.toSet());
}

/**
* Does a comparison of recombination and basionym authorship using the author compare method once for the recombination authorship and once for the basionym.
Expand Down
108 changes: 108 additions & 0 deletions matching-ws/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Two-stage build of the matching-ws image.
#   Stage 1 (builder): clones the CatalogueOfLife backend at GIT_BRANCH and builds the executable JAR.
#   Stage 2: copies the JAR into a JDK image and, if CLB_DATASET_ID is given, builds the index at image build time.
#
# Build args: GIT_BRANCH (required), CLB_DATASET_ID, CLB_IUCN_DATASET_ID, CLB_IDENTIFIER_DATASET_IDS,
#             CLB_URL, CLB_USER, CLB_PASSWORD (the last three are required when CLB_DATASET_ID is set).
#
# NOTE(security): values passed with --build-arg are recorded in the image history (`docker history`).
# Consider a BuildKit secret mount for CLB_PASSWORD.

# Use an official Maven runtime as a parent image
FROM maven:3.8.5-openjdk-11 AS builder

# Set the working directory in the container
WORKDIR /app

ENV CLB_API_URL="https://api.checklistbank.org"
ARG GIT_BRANCH=""

# Local builds: see matching-ws/Dockerfile-local, which copies a local checkout instead of cloning.

# Validate the arguments needed for the clone
RUN if [ -z "$GIT_BRANCH" ]; then \
      echo "Error: GIT_BRANCH is not set." >&2; \
      exit 1; \
    fi

# Clone the backend repository
RUN rm -Rf backend
RUN git clone https://github.com/CatalogueOfLife/backend.git
WORKDIR /app/backend
RUN git checkout "$GIT_BRANCH"

# Build all the CLB modules ("install" already includes the "package" phase)
RUN mvn clean install -DskipTests

# Build the matching-ws module and create an exec file
WORKDIR /app/backend/matching-ws

# Run tests - full backend tests require additional services (e.g. ES)
RUN mvn clean install

# Store git commit id and log.
# --fail stops curl from saving an HTTP error body as git.json; fall back to an empty document instead.
RUN curl -fsSL -o /app/backend/git.json \
      -H "Accept: application/vnd.github+json" \
      "https://api.github.com/repos/catalogueoflife/backend/commits/$(git rev-parse HEAD)" \
    || { echo "Warning: could not fetch commit metadata, writing empty git.json" >&2; \
         echo "{}" > /app/backend/git.json; }

# Dataset arguments are declared only after the Maven build: every RUN that follows an ARG sees it as an
# environment variable, so declaring them earlier would invalidate the cached build layers for each
# dataset and expose the credentials to the clone/build steps.
ARG CLB_DATASET_ID=""
ARG CLB_URL=""
ARG CLB_USER=""
ARG CLB_PASSWORD=""

# Validate the dataset arguments
RUN if [ -n "$CLB_DATASET_ID" ]; then \
      if [ -z "$CLB_URL" ] || [ -z "$CLB_USER" ] || [ -z "$CLB_PASSWORD" ]; then \
        echo "Error: CLB_URL, CLB_USER, and CLB_PASSWORD must be set when CLB_DATASET_ID is set." >&2; \
        exit 1; \
      fi \
    fi

# Cache a copy of the dataset metadata from checklistbank for tracking
RUN if [ -n "$CLB_DATASET_ID" ]; then \
      curl -fsSL -o /app/backend/dataset.json "$CLB_API_URL/dataset/$CLB_DATASET_ID.json" \
      || { echo "Warning: could not fetch dataset metadata, writing empty dataset.json" >&2; \
           echo "{}" > /app/backend/dataset.json; }; \
    else \
      echo "{}" > /app/backend/dataset.json; \
    fi

# Runtime image
# NOTE(review): the official "openjdk" image is deprecated upstream - consider a maintained JDK 11 base image.
FROM openjdk:11

# Set environment variables
ARG DEBIAN_FRONTEND=noninteractive
ENV SERVER_PORT=8080
ENV JVM_OPTIONS="-Xmx2g -Xms2g"
ENV V1_ENABLED="false"
ENV USER=matching
ENV APP_ARTIFACT=matching-ws

# Build arguments used for the optional index build
ARG CLB_DATASET_ID=""
ARG CLB_IUCN_DATASET_ID=""
ARG CLB_IDENTIFIER_DATASET_IDS=""
ARG CLB_URL=""
ARG CLB_USER=""
ARG CLB_PASSWORD=""

# Directories and perms
RUN mkdir -p /data/$APP_ARTIFACT && \
    groupadd -r $USER -g 1000 && useradd -r -g $USER -u 1000 -m $USER && \
    chown -R $USER:$USER /data/$APP_ARTIFACT

# Set the working directory in the container
WORKDIR /opt/gbif/$APP_ARTIFACT

# Copy the executable JAR file from the builder image to the new image
COPY --from=builder /app/backend/matching-ws/target/matching-ws-*-SNAPSHOT-exec.jar /opt/gbif/$APP_ARTIFACT/app.jar
COPY --from=builder /app/backend/git.json /opt/gbif/$APP_ARTIFACT/git.json
COPY --from=builder /app/backend/dataset.json /opt/gbif/$APP_ARTIFACT/dataset.json

# CSV export from checklistbank and index build.
# This runs as root, so the generated files are handed over to the service user in the same layer
# (a separate chown layer would duplicate the index data in the image).
RUN if [ -n "$CLB_DATASET_ID" ]; then \
      java -jar app.jar \
        --mode=BUILD_INDEX \
        --index.path=/data/$APP_ARTIFACT/index \
        --export.path=/data/$APP_ARTIFACT/exports \
        --clb.dataset.id="$CLB_DATASET_ID" \
        --clb.identifier.dataset.ids="$CLB_IDENTIFIER_DATASET_IDS" \
        --clb.iucn.dataset.id="$CLB_IUCN_DATASET_ID" \
        --clb.url="$CLB_URL" \
        --clb.user="$CLB_USER" \
        --clb.password="$CLB_PASSWORD" && \
      chown -R $USER:$USER /data/$APP_ARTIFACT; \
    fi

RUN chown -R $USER:$USER /opt/gbif/$APP_ARTIFACT

USER $USER
EXPOSE $SERVER_PORT

# Shell form is needed to expand the variables; "exec" replaces the shell so the JVM receives SIGTERM directly.
CMD exec java $JVM_OPTIONS -jar app.jar --server.port=$SERVER_PORT --v1.enabled=$V1_ENABLED --working.dir=/opt/gbif/$APP_ARTIFACT
108 changes: 108 additions & 0 deletions matching-ws/Dockerfile-local
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Two-stage build of the matching-ws image from a LOCAL checkout of the backend repository.
# Run from the root of the backend checkout, e.g.
#   docker build -f matching-ws/Dockerfile-local .
#   Stage 1 (builder): copies the build context and builds the executable JAR.
#   Stage 2: copies the JAR into a JDK image and, if CLB_DATASET_ID is given, builds the index at image build time.
#
# Build args: CLB_DATASET_ID, CLB_IUCN_DATASET_ID, CLB_IDENTIFIER_DATASET_IDS,
#             CLB_URL, CLB_USER, CLB_PASSWORD (the last three are required when CLB_DATASET_ID is set).
#
# NOTE(security): values passed with --build-arg are recorded in the image history (`docker history`).
# Consider a BuildKit secret mount for CLB_PASSWORD.

# Use an official Maven runtime as a parent image
FROM maven:3.8.5-openjdk-11 AS builder

# Set the working directory in the container
WORKDIR /app

ENV CLB_API_URL="https://api.checklistbank.org"
# Accepted for parity with matching-ws/Dockerfile; not used here because nothing is cloned.
ARG GIT_BRANCH=""

# Local builds - run from root of backend.
# COPY sources must be inside the build context, so copy "." (the context root), never "..".
COPY . /app/backend
WORKDIR /app/backend

# To build a remote branch instead of the local checkout, use matching-ws/Dockerfile.

# Build all the CLB modules ("install" already includes the "package" phase)
RUN mvn clean install -DskipTests

# Build the matching-ws module and create an exec file
WORKDIR /app/backend/matching-ws

# Run tests - full backend tests require additional services (e.g. ES)
RUN mvn clean install

# Store git commit id and log.
# NOTE(review): "git rev-parse HEAD" needs the .git directory in the build context - confirm it is not
# excluded by a .dockerignore.
# --fail stops curl from saving an HTTP error body as git.json; fall back to an empty document instead.
RUN curl -fsSL -o /app/backend/git.json \
      -H "Accept: application/vnd.github+json" \
      "https://api.github.com/repos/catalogueoflife/backend/commits/$(git rev-parse HEAD)" \
    || { echo "Warning: could not fetch commit metadata, writing empty git.json" >&2; \
         echo "{}" > /app/backend/git.json; }

# Dataset arguments are declared only after the Maven build: every RUN that follows an ARG sees it as an
# environment variable, so declaring them earlier would invalidate the cached build layers for each
# dataset and expose the credentials to the build steps.
ARG CLB_DATASET_ID=""
ARG CLB_URL=""
ARG CLB_USER=""
ARG CLB_PASSWORD=""

# Validate the dataset arguments
RUN if [ -n "$CLB_DATASET_ID" ]; then \
      if [ -z "$CLB_URL" ] || [ -z "$CLB_USER" ] || [ -z "$CLB_PASSWORD" ]; then \
        echo "Error: CLB_URL, CLB_USER, and CLB_PASSWORD must be set when CLB_DATASET_ID is set." >&2; \
        exit 1; \
      fi \
    fi

# Cache a copy of the dataset metadata from checklistbank for tracking
RUN if [ -n "$CLB_DATASET_ID" ]; then \
      curl -fsSL -o /app/backend/dataset.json "$CLB_API_URL/dataset/$CLB_DATASET_ID.json" \
      || { echo "Warning: could not fetch dataset metadata, writing empty dataset.json" >&2; \
           echo "{}" > /app/backend/dataset.json; }; \
    else \
      echo "{}" > /app/backend/dataset.json; \
    fi

# Runtime image
# NOTE(review): the official "openjdk" image is deprecated upstream - consider a maintained JDK 11 base image.
FROM openjdk:11

# Set environment variables
ARG DEBIAN_FRONTEND=noninteractive
ENV SERVER_PORT=8080
ENV JVM_OPTIONS="-Xmx2g -Xms2g"
ENV V1_ENABLED="false"
ENV USER=matching
ENV APP_ARTIFACT=matching-ws

# Build arguments used for the optional index build
ARG CLB_DATASET_ID=""
ARG CLB_IUCN_DATASET_ID=""
ARG CLB_IDENTIFIER_DATASET_IDS=""
ARG CLB_URL=""
ARG CLB_USER=""
ARG CLB_PASSWORD=""

# Directories and perms
RUN mkdir -p /data/$APP_ARTIFACT && \
    groupadd -r $USER -g 1000 && useradd -r -g $USER -u 1000 -m $USER && \
    chown -R $USER:$USER /data/$APP_ARTIFACT

# Set the working directory in the container
WORKDIR /opt/gbif/$APP_ARTIFACT

# Copy the executable JAR file from the builder image to the new image
COPY --from=builder /app/backend/matching-ws/target/matching-ws-*-SNAPSHOT-exec.jar /opt/gbif/$APP_ARTIFACT/app.jar
COPY --from=builder /app/backend/git.json /opt/gbif/$APP_ARTIFACT/git.json
COPY --from=builder /app/backend/dataset.json /opt/gbif/$APP_ARTIFACT/dataset.json

# CSV export from checklistbank and index build.
# This runs as root, so the generated files are handed over to the service user in the same layer
# (a separate chown layer would duplicate the index data in the image).
RUN if [ -n "$CLB_DATASET_ID" ]; then \
      java -jar app.jar \
        --mode=BUILD_INDEX \
        --index.path=/data/$APP_ARTIFACT/index \
        --export.path=/data/$APP_ARTIFACT/exports \
        --clb.dataset.id="$CLB_DATASET_ID" \
        --clb.identifier.dataset.ids="$CLB_IDENTIFIER_DATASET_IDS" \
        --clb.iucn.dataset.id="$CLB_IUCN_DATASET_ID" \
        --clb.url="$CLB_URL" \
        --clb.user="$CLB_USER" \
        --clb.password="$CLB_PASSWORD" && \
      chown -R $USER:$USER /data/$APP_ARTIFACT; \
    fi

RUN chown -R $USER:$USER /opt/gbif/$APP_ARTIFACT

USER $USER
EXPOSE $SERVER_PORT

# Shell form is needed to expand the variables; "exec" replaces the shell so the JVM receives SIGTERM directly.
CMD exec java $JVM_OPTIONS -jar app.jar --server.port=$SERVER_PORT --v1.enabled=$V1_ENABLED --working.dir=/opt/gbif/$APP_ARTIFACT
27 changes: 27 additions & 0 deletions matching-ws/JenkinsFile
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Builds and pushes one matching-ws Docker image per dataset id.
//
// Inputs read by this pipeline:
//   params.CLB_DATASET_ID          - comma separated list of ChecklistBank dataset ids
//   params.GIT_BRANCH              - optional backend branch to build, defaults to 'matching-ws'
//   CLB_URL, CLB_USER, CLB_PASSWORD - read from the build environment by the shell steps
//     (NOTE(review): assumes these are job parameters or bound credentials, which Jenkins exposes
//      as environment variables - confirm against the job configuration)
pipeline {
    agent any

    stages {
        stage('Download Dockerfile') {
            steps {
                script {
                    // Download Dockerfile from URL; -f fails the step on an HTTP error instead of
                    // saving the error page as "Dockerfile"
                    sh "curl -fsSL -o Dockerfile https://raw.githubusercontent.com/CatalogueOfLife/backend/matching-ws/matching-ws/Dockerfile"
                }
            }
        }

        stage('Build Docker Image') {
            steps {
                script {
                    // The Dockerfile aborts when GIT_BRANCH is empty, so always pass a branch
                    def backendBranch = params.GIT_BRANCH ?: 'matching-ws'
                    for (def rawId in params.CLB_DATASET_ID.split(',')) {
                        def datasetID = rawId.trim()
                        if (!datasetID) {
                            // tolerate stray commas / whitespace in the parameter
                            continue
                        }
                        def image = "docker.gbif.org/matching-ws/${datasetID}:1.0-SNAPSHOT"
                        withEnv(["DATASET_ID=${datasetID}", "BACKEND_BRANCH=${backendBranch}", "IMAGE=${image}"]) {
                            // Single-quoted on purpose: the shell expands the variables, so the password
                            // is never interpolated into the Groovy string (and the logged command line)
                            sh 'docker build --platform linux/amd64 --build-arg GIT_BRANCH="$BACKEND_BRANCH" --build-arg CLB_DATASET_ID="$DATASET_ID" --build-arg CLB_URL="$CLB_URL" --build-arg CLB_USER="$CLB_USER" --build-arg CLB_PASSWORD="$CLB_PASSWORD" . -t "$IMAGE"'
                            sh 'docker push "$IMAGE"'
                        }
                    }
                }
            }
        }
    }
}
Loading

0 comments on commit d38182b

Please sign in to comment.