diff --git a/database/009-create-mention-table.sql b/database/009-create-mention-table.sql
index 77de89d3f..2ea751246 100644
--- a/database/009-create-mention-table.sql
+++ b/database/009-create-mention-table.sql
@@ -40,12 +40,15 @@ CREATE TABLE mention (
 	page VARCHAR(50),
 	image_url VARCHAR(500) CHECK (image_url ~ '^https?://'),
 	mention_type mention_type NOT NULL,
+	external_id VARCHAR(500), -- identifier of this record at its source (e.g. the OpenAlex work URL), may be NULL
 	source VARCHAR(50) NOT NULL,
 	version VARCHAR(100),
 	note VARCHAR(500),
 	scraped_at TIMESTAMPTZ,
+	citations_scraped_at TIMESTAMPTZ, -- when the citations of this mention were last scraped, NULL if never
 	created_at TIMESTAMPTZ NOT NULL,
-	updated_at TIMESTAMPTZ NOT NULL
+	updated_at TIMESTAMPTZ NOT NULL,
+	UNIQUE(external_id, source) -- conflict target for upserting mentions without a DOI; rows with a NULL external_id are not restricted
 );
 
 CREATE FUNCTION sanitise_insert_mention() RETURNS TRIGGER LANGUAGE plpgsql AS
@@ -81,6 +84,22 @@ CREATE TABLE mention_for_software (
 );
 
 
+-- Mentions that act as reference paper of a software page; these are the papers whose citations are scraped.
+CREATE TABLE reference_paper_for_software (
+	mention UUID REFERENCES mention (id),
+	software UUID REFERENCES software (id),
+	PRIMARY KEY (mention, software)
+);
+
+
+-- Links a mention to the mentions citing it: the row (mention, citation) means "citation" cites "mention".
+CREATE TABLE citation_for_mention (
+	mention UUID REFERENCES mention (id),
+	citation UUID REFERENCES mention (id),
+	PRIMARY KEY (mention, citation)
+);
+
+
 CREATE FUNCTION search_mentions_for_software(software_id UUID, search_text VARCHAR) RETURNS SETOF mention STABLE LANGUAGE plpgsql AS
 $$
 BEGIN
diff --git a/database/020-row-level-security.sql b/database/020-row-level-security.sql
index 37cc0baa7..163bbf6a8 100644
--- a/database/020-row-level-security.sql
+++ b/database/020-row-level-security.sql
@@ -446,17 +446,9 @@ CREATE POLICY admin_all_rights ON research_domain_for_project TO rsd_admin
 
 
 -- mentions
--- TODO: not sure what to do here,
--- should a mention only be visible if you can see at least one software or project for which it relates?
 ALTER TABLE mention ENABLE ROW LEVEL SECURITY;
 
 CREATE POLICY anyone_can_read ON mention FOR SELECT TO rsd_web_anon, rsd_user
-	USING (id IN (SELECT mention FROM mention_for_software)
-		OR id IN (SELECT mention FROM output_for_project)
-		OR id IN (SELECT mention FROM impact_for_project)
-		OR id IN (SELECT mention_id FROM release_version));
-
-CREATE POLICY maintainer_can_read ON mention FOR SELECT TO rsd_user
 	USING (TRUE);
 
 CREATE POLICY maintainer_can_delete ON mention FOR DELETE TO rsd_user
@@ -484,6 +476,30 @@ CREATE POLICY admin_all_rights ON mention_for_software TO rsd_admin
 	WITH CHECK (TRUE);
 
 
+ALTER TABLE reference_paper_for_software ENABLE ROW LEVEL SECURITY;
+
+CREATE POLICY anyone_can_read ON reference_paper_for_software FOR SELECT TO rsd_web_anon, rsd_user
+	USING (software IN (SELECT id FROM software));
+
+CREATE POLICY maintainer_all_rights ON reference_paper_for_software TO rsd_user
+	USING (software IN (SELECT * FROM software_of_current_maintainer()))
+	WITH CHECK (software IN (SELECT * FROM software_of_current_maintainer()));
+
+CREATE POLICY admin_all_rights ON reference_paper_for_software TO rsd_admin
+	USING (TRUE)
+	WITH CHECK (TRUE);
+
+
+ALTER TABLE citation_for_mention ENABLE ROW LEVEL SECURITY;
+
+CREATE POLICY anyone_can_read ON citation_for_mention FOR SELECT TO rsd_web_anon, rsd_user
+	USING (mention IN (SELECT id FROM mention));
+
+CREATE POLICY admin_all_rights ON citation_for_mention TO rsd_admin
+	USING (TRUE)
+	WITH CHECK (TRUE);
+
+
 ALTER TABLE output_for_project ENABLE ROW LEVEL SECURITY;
 
 CREATE POLICY anyone_can_read ON output_for_project FOR SELECT TO rsd_web_anon, rsd_user
diff --git a/database/109-mention-views.sql b/database/109-mention-views.sql
new file mode 100644
index 000000000..590784a31
--- /dev/null
+++ b/database/109-mention-views.sql
@@ -0,0 +1,23 @@
+-- SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center)
+-- SPDX-FileCopyrightText: 2023 Netherlands eScience Center
+--
+-- SPDX-License-Identifier: Apache-2.0
+
+-- Lists the mentions that are registered as reference paper for at least one software page,
+-- together with the moment their citations were last scraped (NULL if never).
+-- Papers without a DOI are left out: the citation scraper looks works up by DOI.
+CREATE FUNCTION reference_papers_to_scrape()
+RETURNS TABLE (
+	id UUID,
+	doi CITEXT,
+	citations_scraped_at TIMESTAMPTZ
+)
+LANGUAGE sql STABLE AS
+$$
+	SELECT mention.id, mention.doi, mention.citations_scraped_at
+	FROM mention
+	WHERE mention.doi IS NOT NULL
+		AND mention.id IN (
+			SELECT reference_paper_for_software.mention FROM reference_paper_for_software
+		)
+$$;
diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/CitationData.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/CitationData.java
new file mode 100644
index 000000000..9957e1ef2
--- /dev/null
+++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/CitationData.java
@@ -0,0 +1,17 @@
+// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center)
+// SPDX-FileCopyrightText: 2023 Netherlands eScience Center
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package nl.esciencecenter.rsd.scraper.doi;
+
+import java.util.UUID;
+
+/**
+ * A reference paper whose citations are scraped: the id of its mention row and its DOI.
+ */
+public class CitationData {
+
+	public UUID id;
+	public String doi;
+}
diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/CrossrefMention.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/CrossrefMention.java
index 3a7f115c5..0abadd302 100644
--- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/CrossrefMention.java
+++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/CrossrefMention.java
@@ -23,10 +23,10 @@
 
 public class CrossrefMention implements Mention {
 
-	private static final Map<String, MentionType> crossrefTypeMap;
+	static final Map<String, MentionType> crossrefTypeMap;
 
 	static {
-//		https://api.crossref.org/types
+		// https://api.crossref.org/types
 		crossrefTypeMap = new HashMap<>();
 
 		crossrefTypeMap.put("book-section", MentionType.bookSection);
@@ -97,7 +97,7 @@ public MentionRecord mentionData() {
 		try {
 			result.publicationYear = Utils.integerOrNull(workJson.getAsJsonObject("published").getAsJsonArray("date-parts").get(0).getAsJsonArray().get(0));
 		} catch (RuntimeException e) {
-//			year not found, we leave it at null, nothing to do
+			// year not found, we leave it at null, nothing to do
 		}
 		if (workJson.getAsJsonArray("container-title").size() > 0) {
 			JsonArray journalTitles = workJson.getAsJsonArray("container-title");
diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataciteMentionRepository.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataciteMentionRepository.java
index ef1d59cf4..60dde05ce 100644
--- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataciteMentionRepository.java
+++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataciteMentionRepository.java
@@ -21,7 +21,6 @@
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeSet;
-import java.util.UUID;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
@@ -65,7 +64,7 @@ public class DataciteMentionRepository implements MentionRepository {
 	private static final Pattern URL_TREE_TAG_PATTERN = Pattern.compile("/tree/([^/]+)$");
 
 	static {
-//		https://schema.datacite.org/meta/kernel-4.4/
+		// https://schema.datacite.org/meta/kernel-4.4/
 		dataciteTypeMap = new HashMap<>();
 		dataciteTypeMap.put("Audiovisual", MentionType.presentation);
 		dataciteTypeMap.put("Book", MentionType.book);
@@ -92,7 +91,7 @@ public class DataciteMentionRepository implements MentionRepository {
 		dataciteTypeMap.put("Software", MentionType.computerProgram);
 		dataciteTypeMap.put("Sound", MentionType.other);
 		dataciteTypeMap.put("Standard", MentionType.other);
-//		dataciteTypeMap.put("Text", MentionType.other);
+		// dataciteTypeMap.put("Text", MentionType.other);
 		dataciteTypeMap.put("Workflow", MentionType.other);
 		dataciteTypeMap.put("Other", MentionType.other);
 
@@ -105,7 +104,7 @@ public class DataciteMentionRepository implements MentionRepository {
 		dataciteTextTypeMap.put("Report", MentionType.report);
 	}
 
-    // "10.5281/zenodo.1408128","10.1186/s12859-018-2165-7"
+	// "10.5281/zenodo.1408128","10.1186/s12859-018-2165-7"
 	static String joinCollection(Collection<String> dois) {
 		return dois.stream()
 				.collect(Collectors.joining("\",\"", "\"", "\""));
@@ -118,8 +117,8 @@ static Collection<MentionRecord> jsonStringToUniqueMentions(String json) {
 		Set<String> usedDois = new TreeSet<>(String.CASE_INSENSITIVE_ORDER);
 		for (JsonElement work : worksJson) {
 			try {
-//				Sometimes, DataCite gives back two of the same results for one DOI, e.g. for 10.4122/1.1000000817,
-//				so we need to only add it once, otherwise we cannot POST it to the backend
+				// Sometimes, DataCite gives back two of the same results for one DOI, e.g. for 10.4122/1.1000000817,
+				// so we need to only add it once, otherwise we cannot POST it to the backend
 				MentionRecord parsedMention = parseWork(work.getAsJsonObject());
 				if (usedDois.contains(parsedMention.doi)) continue;
 
@@ -212,7 +211,7 @@ public Collection<MentionRecord> mentionData(Collection<String> dois) {
 	}
 
 	@Override
-	public Map<String, UUID> save(Collection<MentionRecord> mentions) {
+	public void save(Collection<MentionRecord> mentions) {
 		throw new UnsupportedOperationException();
 	}
 
diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainCitations.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainCitations.java
new file mode 100644
index 000000000..b603888a2
--- /dev/null
+++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainCitations.java
@@ -0,0 +1,56 @@
+// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center)
+// SPDX-FileCopyrightText: 2023 Netherlands eScience Center
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package nl.esciencecenter.rsd.scraper.doi;
+
+import nl.esciencecenter.rsd.scraper.Config;
+import nl.esciencecenter.rsd.scraper.Utils;
+
+import java.time.ZonedDateTime;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.UUID;
+
+/**
+ * Entry point of the citation scraper: for the reference papers whose citations were scraped least recently,
+ * fetch the citing works from OpenAlex, store them as mentions and link them to the reference paper.
+ */
+public class MainCitations {
+
+	public static void main(String[] args) {
+		System.out.println("Start scraping citations");
+		try {
+			String backendUrl = Config.backendBaseUrl();
+			PostgrestCitationRepository localCitationRepository = new PostgrestCitationRepository(backendUrl);
+
+			Collection<CitationData> doisToScrape = localCitationRepository.leastRecentlyScrapedCitations(5);
+			OpenAlexCitations openAlexCitations = new OpenAlexCitations();
+			MentionRepository localMentionRepository = new PostgrestMentionRepository(backendUrl);
+			String email = Config.crossrefContactEmail().orElse(null);
+			ZonedDateTime now = ZonedDateTime.now();
+
+			for (CitationData citationData : doisToScrape) {
+				// handle failures per reference paper, so that one bad paper does not block the rest of the batch
+				try {
+					Collection<MentionRecord> citingMentions = openAlexCitations.citations(citationData.doi, email);
+					localMentionRepository.save(citingMentions);
+
+					Collection<UUID> citingMentionIds = new ArrayList<>();
+					for (MentionRecord citingMention : citingMentions) {
+						citingMentionIds.add(citingMention.id);
+					}
+
+					localCitationRepository.saveCitations(backendUrl, citationData.id, citingMentionIds, now);
+				} catch (RuntimeException e) {
+					Utils.saveExceptionInDatabase("Citation scraper", null, null, e);
+				}
+			}
+		} catch (RuntimeException e) {
+			Utils.saveExceptionInDatabase("Citation scraper", null, null, e);
+		}
+
+		System.out.println("Done scraping citations");
+	}
+}
diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainReleases.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainReleases.java
index d9c50affc..2e3ead245 100644
--- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainReleases.java
+++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainReleases.java
@@ -9,6 +9,7 @@
 
 import java.util.Collection;
 import java.util.Map;
+import java.util.TreeMap;
 import java.util.UUID;
 
 /*
@@ -34,7 +35,11 @@ public static void main(String[] args) {
 		Collection<MentionRecord> allMentions = scrapedReleasesPerConceptDoi.values().stream()
 				.flatMap(Collection::stream)
 				.toList();
-		Map<String, UUID> doiToId = localMentionRepository.save(allMentions);
+		localMentionRepository.save(allMentions);
+		Map<String, UUID> doiToId = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
+		for (MentionRecord mention : allMentions) {
+			doiToId.put(mention.doi, mention.id);
+		}
 		releaseRepository.saveReleaseContent(releasesToScrape, scrapedReleasesPerConceptDoi, doiToId);
 
 		System.out.println("Done scraping releases");
diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MentionRecord.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MentionRecord.java
index 28e39e14b..421e4e0f5 100644
--- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MentionRecord.java
+++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MentionRecord.java
@@ -23,7 +23,30 @@ public class MentionRecord {
 	String page;
 	URI imageUrl;
 	MentionType mentionType;
+	String externalId;
 	String source;
 	Instant scrapedAt;
 	String version;
+
+	@Override
+	public String toString() {
+		return "MentionRecord{" +
+				"id=" + id +
+				", doi='" + doi + '\'' +
+				", url=" + url +
+				", title='" + title + '\'' +
+				", authors='" + authors + '\'' +
+				", publisher='" + publisher + '\'' +
+				", publicationYear=" + publicationYear +
+				", doiRegistrationDate=" + doiRegistrationDate +
+				", journal='" + journal + '\'' +
+				", page='" + page + '\'' +
+				", imageUrl=" + imageUrl +
+				", mentionType=" + mentionType +
+				", externalId='" + externalId + '\'' +
+				", source='" + source + '\'' +
+				", scrapedAt=" + scrapedAt +
+				", version='" + version + '\'' +
+				'}';
+	}
 }
diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MentionRepository.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MentionRepository.java
index 4c7af7529..5a1abee81 100644
--- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MentionRepository.java
+++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MentionRepository.java
@@ -6,8 +6,6 @@
 package nl.esciencecenter.rsd.scraper.doi;
 
 import java.util.Collection;
-import java.util.Map;
-import java.util.UUID;
 
 public interface MentionRepository {
 
@@ -15,5 +13,5 @@ public interface MentionRepository {
 
 	Collection<MentionRecord> mentionData(Collection<String> dois);
 
-	Map<String, UUID> save(Collection<MentionRecord> mentions);
+	void save(Collection<MentionRecord> mentions);
 }
diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitations.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitations.java
new file mode 100644
index 000000000..1efb3b049
--- /dev/null
+++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitations.java
@@ -0,0 +1,184 @@
+// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center)
+// SPDX-FileCopyrightText: 2023 Netherlands eScience Center
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package nl.esciencecenter.rsd.scraper.doi;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonParser;
+import nl.esciencecenter.rsd.scraper.Utils;
+
+import java.net.URI;
+import java.net.http.HttpResponse;
+import java.time.Instant;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Optional;
+
+/**
+ * Looks up a work at OpenAlex by its DOI and converts the works citing it to mention records.
+ */
+public class OpenAlexCitations {
+
+	static final String DOI_FILTER_URL_UNFORMATTED = "https://api.openalex.org/works?filter=doi:%s";
+
+	public Collection<MentionRecord> citations(String doi, String email) {
+
+		String doiUrlEncoded = Utils.urlEncode(doi);
+		String worksUri = DOI_FILTER_URL_UNFORMATTED.formatted(doiUrlEncoded);
+
+		Optional<String> optionalCitationsUri = citationsUri(worksUri, email);
+		if (optionalCitationsUri.isEmpty()) {
+			return Collections.emptyList();
+		}
+
+		return scrapeCitations(optionalCitationsUri.get(), email);
+	}
+
+	static Optional<String> citationsUri(String worksUri, String email) {
+		HttpResponse<String> response;
+		if (email == null || email.isBlank()) {
+			response = Utils.getAsHttpResponse(worksUri);
+		} else {
+			response = Utils.getAsHttpResponse(worksUri, "User-Agent", "mailto:" + email);
+		}
+
+		JsonObject tree = JsonParser.parseString(response.body()).getAsJsonObject();
+
+		int count = tree
+				.getAsJsonObject("meta")
+				.getAsJsonPrimitive("count")
+				.getAsInt();
+
+		// only a single, unambiguous match tells us which work's citations to fetch
+		if (count != 1) {
+			return Optional.empty();
+		}
+
+		String citationsUri = tree
+				.getAsJsonArray("results")
+				.get(0)
+				.getAsJsonObject()
+				.getAsJsonPrimitive("cited_by_api_url")
+				.getAsString();
+
+		return Optional.of(citationsUri);
+	}
+
+	// we use cursor paging as that will always work
+	// https://docs.openalex.org/how-to-use-the-api/get-lists-of-entities/paging#cursor-paging
+	static Collection<MentionRecord> scrapeCitations(String citationsUri, String email) {
+		final int perPage = 200;
+		String cursor = "*";
+
+		Collection<MentionRecord> citations = new ArrayList<>();
+		while (cursor != null) {
+			HttpResponse<String> response;
+			// the cursor is an opaque string that may contain characters that are not safe in a query string
+			String citationsUriWithCursor = citationsUri + "&per-page=" + perPage + "&cursor=" + Utils.urlEncode(cursor);
+			if (email == null || email.isBlank()) {
+				response = Utils.getAsHttpResponse(citationsUriWithCursor);
+			} else {
+				response = Utils.getAsHttpResponse(citationsUriWithCursor, "User-Agent", "mailto:" + email);
+			}
+			JsonObject tree = JsonParser.parseString(response.body()).getAsJsonObject();
+
+			cursor = Utils.stringOrNull(tree
+					.getAsJsonObject("meta")
+					.get("next_cursor")
+			);
+
+			JsonArray citationsArray = tree
+					.getAsJsonArray("results");
+
+			Instant now = Instant.now();
+			for (JsonElement citation : citationsArray) {
+				MentionRecord citationAsMention = parseCitationAsMention(citation, now);
+				citations.add(citationAsMention);
+			}
+		}
+
+		return citations;
+	}
+
+	static MentionRecord parseCitationAsMention(JsonElement element, Instant scrapedAt) {
+		JsonObject citationObject = element.getAsJsonObject();
+
+		MentionRecord mention = new MentionRecord();
+
+		String doiUrl = Utils.stringOrNull(citationObject.get("doi"));
+		String doi = doiUrl;
+		if (doi != null) {
+			doi = doi.replace("https://doi.org/", "");
+		}
+		mention.doi = doi;
+
+		if (doiUrl != null) {
+			mention.url = URI.create(doiUrl);
+		} else {
+			JsonArray locations = citationObject.getAsJsonArray("locations");
+			mention.url = extractUrlFromLocation(locations);
+		}
+
+		mention.title = Utils.stringOrNull(citationObject.get("title"));
+
+		JsonArray authorsArray = citationObject.getAsJsonArray("authorships");
+		Collection<String> authors = new ArrayList<>();
+		for (JsonElement jsonElement : authorsArray) {
+			// be lenient when raw_author_name is absent or null: skip that author instead of failing the whole work
+			String authorName = Utils.stringOrNull(jsonElement.getAsJsonObject().get("raw_author_name"));
+			if (authorName != null) {
+				authors.add(authorName);
+			}
+		}
+		mention.authors = String.join(", ", authors);
+
+		mention.publisher = null;
+
+		mention.publicationYear = Utils.integerOrNull(citationObject.get("publication_year"));
+
+		mention.doiRegistrationDate = null;
+
+		mention.journal = null;
+
+		mention.page = null;
+
+		mention.imageUrl = null;
+
+		String crossrefType = Utils.stringOrNull(citationObject.get("type_crossref"));
+		mention.mentionType = CrossrefMention.crossrefTypeMap.getOrDefault(crossrefType, MentionType.other);
+
+		mention.externalId = citationObject
+				.getAsJsonObject("ids")
+				.getAsJsonPrimitive("openalex")
+				.getAsString();
+
+		mention.source = "OpenAlex";
+
+		mention.scrapedAt = scrapedAt;
+
+		mention.version = null;
+
+		return mention;
+	}
+
+	static URI extractUrlFromLocation(JsonArray locations) {
+		for (JsonElement location : locations) {
+			JsonObject locationObject = location.getAsJsonObject();
+			String landingPageUrl = Utils.stringOrNull(locationObject.get("landing_page_url"));
+			if (landingPageUrl != null) {
+				return URI.create(landingPageUrl);
+			}
+
+			String pdfUrl = Utils.stringOrNull(locationObject.get("pdf_url"));
+			if (pdfUrl != null) {
+				return URI.create(pdfUrl);
+			}
+		}
+
+		return null;
+	}
+}
diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestCitationRepository.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestCitationRepository.java
new file mode 100644
index 000000000..0d22f750e
--- /dev/null
+++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestCitationRepository.java
@@ -0,0 +1,79 @@
+// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center)
+// SPDX-FileCopyrightText: 2023 Netherlands eScience Center
+//
+// SPDX-License-Identifier: Apache-2.0
+
+package nl.esciencecenter.rsd.scraper.doi;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import com.google.gson.JsonParser;
+import nl.esciencecenter.rsd.scraper.Utils;
+
+import java.time.ZonedDateTime;
+import java.time.format.DateTimeFormatter;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Objects;
+import java.util.UUID;
+
+/**
+ * Reads the reference papers to scrape from, and writes the scraped citations to, the PostgREST backend.
+ */
+public class PostgrestCitationRepository {
+
+	private final String backendUrl;
+
+	public PostgrestCitationRepository(String backendUrl) {
+		this.backendUrl = Objects.requireNonNull(backendUrl);
+	}
+
+	public Collection<CitationData> leastRecentlyScrapedCitations(int limit) {
+		String oneHourAgoFilter = Utils.atLeastOneHourAgoFilter("citations_scraped_at");
+		String uri = backendUrl + "/rpc/reference_papers_to_scrape?order=citations_scraped_at.asc.nullsfirst&limit=" + limit + "&" + oneHourAgoFilter;
+		String data = Utils.getAsAdmin(uri);
+		return parseJson(data);
+	}
+
+	public void saveCitations(String backendUrl, UUID idCitedMention, Collection<UUID> citingMentions, ZonedDateTime scrapedAt) {
+		String jsonPatch = "{\"citations_scraped_at\": \"%s\"}".formatted(scrapedAt.format(DateTimeFormatter.ISO_OFFSET_DATE_TIME));
+		Utils.patchAsAdmin(backendUrl + "/mention?id=eq." + idCitedMention.toString(), jsonPatch);
+
+		JsonArray jsonArray = new JsonArray();
+
+		for (UUID citingMention : citingMentions) {
+			JsonObject jsonObject = new JsonObject();
+			jsonObject.addProperty("mention", idCitedMention.toString());
+			jsonObject.addProperty("citation", citingMention.toString());
+			jsonArray.add(jsonObject);
+		}
+
+		String uri = backendUrl + "/citation_for_mention";
+
+		Utils.postAsAdmin(uri, jsonArray.toString(), "Prefer", "resolution=merge-duplicates");
+	}
+
+	static Collection<CitationData> parseJson(String data) {
+		JsonArray array = JsonParser.parseString(data).getAsJsonArray();
+		Collection<CitationData> result = new ArrayList<>();
+
+		for (JsonElement jsonElement : array) {
+			JsonObject jsonObject = jsonElement.getAsJsonObject();
+			UUID id = UUID.fromString(jsonObject.getAsJsonPrimitive("id").getAsString());
+			String doi = Utils.stringOrNull(jsonObject.get("doi"));
+			// without a DOI the paper cannot be looked up at OpenAlex, so there is nothing to scrape
+			if (doi == null) {
+				continue;
+			}
+
+			CitationData entry = new CitationData();
+			entry.id = id;
+			entry.doi = doi;
+
+			result.add(entry);
+		}
+
+		return result;
+	}
+}
diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestMentionRepository.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestMentionRepository.java
index d34732dcf..c38ae1281 100644
--- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestMentionRepository.java
+++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestMentionRepository.java
@@ -6,23 +6,20 @@
 package nl.esciencecenter.rsd.scraper.doi;
 
 import com.google.gson.FieldNamingPolicy;
+import com.google.gson.Gson;
 import com.google.gson.GsonBuilder;
 import com.google.gson.JsonArray;
 import com.google.gson.JsonDeserializer;
-import com.google.gson.JsonElement;
 import com.google.gson.JsonParser;
 import com.google.gson.JsonPrimitive;
 import com.google.gson.JsonSerializer;
 import com.google.gson.reflect.TypeToken;
-import nl.esciencecenter.rsd.scraper.Config;
 import nl.esciencecenter.rsd.scraper.Utils;
 
 import java.net.URI;
 import java.time.Instant;
 import java.time.ZonedDateTime;
 import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;
 import java.util.Objects;
 import java.util.UUID;
 
@@ -64,23 +61,28 @@ public Collection<MentionRecord> mentionData(Collection<String> dois) {
 	}
 
 	@Override
-	public Map<String, UUID> save(Collection<MentionRecord> mentions) {
-		String scrapedMentionsJson = new GsonBuilder()
+	public void save(Collection<MentionRecord> mentions) {
+		Gson gson = new GsonBuilder()
 				.serializeNulls()
 				.setFieldNamingPolicy(FieldNamingPolicy.LOWER_CASE_WITH_UNDERSCORES)
 				.registerTypeAdapter(Instant.class, (JsonSerializer<Instant>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
 				.registerTypeAdapter(ZonedDateTime.class, (JsonSerializer<ZonedDateTime>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
-				.create().toJson(mentions);
-		String response = Utils.postAsAdmin(Config.backendBaseUrl() + "/mention?on_conflict=doi&select=doi,id", scrapedMentionsJson, "Prefer", "resolution=merge-duplicates,return=representation");
+				.create();
+		for (MentionRecord mention : mentions) {
+			String scrapedMentionJson = gson.toJson(mention);
+			String onConflictFilter;
+			if (mention.doi != null) {
+				onConflictFilter = "doi";
+			} else {
+				onConflictFilter = "external_id,source";
+			}
 
-		JsonArray responseAsArray = JsonParser.parseString(response).getAsJsonArray();
-		Map<String, UUID> doiToId = new HashMap<>();
-		for (JsonElement jsonElement : responseAsArray) {
-			String doi = jsonElement.getAsJsonObject().getAsJsonPrimitive("doi").getAsString();
-			UUID id = UUID.fromString(jsonElement.getAsJsonObject().getAsJsonPrimitive("id").getAsString());
-			doiToId.put(doi, id);
-		}
+			String uri = "%s/mention?on_conflict=%s&select=id".formatted(backendUrl, onConflictFilter);
+			String response = Utils.postAsAdmin(uri, scrapedMentionJson, "Prefer", "resolution=merge-duplicates,return=representation");
 
-		return doiToId;
+			JsonArray responseAsArray = JsonParser.parseString(response).getAsJsonArray();
+			UUID id = UUID.fromString(responseAsArray.get(0).getAsJsonObject().getAsJsonPrimitive("id").getAsString());
+			mention.id = id;
+		}
 	}
 }
diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestReleaseRepository.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestReleaseRepository.java
index 69118127f..f9b798185 100644
--- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestReleaseRepository.java
+++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestReleaseRepository.java
@@ -78,7 +78,8 @@ Collection<ReleaseData> parseJson(String data) {
 		Gson gson = new GsonBuilder()
 				.setFieldNamingPolicy(FieldNamingPolicy.LOWER_CASE_WITH_UNDERSCORES)
 				.create();
-		TypeToken<Collection<ReleaseData>> typeToken = new TypeToken<Collection<ReleaseData>>() {};
+		TypeToken<Collection<ReleaseData>> typeToken = new TypeToken<>() {
+		};
 		return gson.fromJson(data, typeToken.getType());
 	}
 }