From 4175b82776eb50fe66791bee1d24750125793214 Mon Sep 17 00:00:00 2001 From: Ewan Cahen Date: Fri, 13 Sep 2024 16:07:23 +0200 Subject: [PATCH] feat: allow harvesting citations of OpenAlex reference papers --- database/011-create-mention-table.sql | 5 +- database/104-software-views.sql | 10 +- docker-compose.yml | 6 +- .../docs/01-users/05-adding-software.md | 8 +- .../docs/01-users/07-adding-projects.md | 10 +- .../docs/03-rsd-instance/03-administration.md | 2 +- .../admin/mentions/MentionsOverview.tsx | 2 +- .../components/mention/EditMentionModal.tsx | 11 +- frontend/components/mention/config.ts | 12 +- frontend/package.json | 2 +- frontend/package.json.license | 2 +- frontend/types/Mention.ts | 4 +- .../nl/esciencecenter/rsd/scraper/Utils.java | 17 +- .../rsd/scraper/doi/CitationData.java | 41 ++-- .../rsd/scraper/doi/CrossrefMention.java | 68 +++--- .../doi/DataCiteReleaseRepository.java | 25 ++- .../doi/DataciteMentionRepository.java | 95 ++++---- .../esciencecenter/rsd/scraper/doi/Doi.java | 63 ++++++ .../scraper/doi/ExternalMentionRecord.java | 26 +++ .../rsd/scraper/doi/MainCitations.java | 61 ++--- .../rsd/scraper/doi/MainMentions.java | 126 +++++++---- .../rsd/scraper/doi/MainReleases.java | 46 ++-- .../rsd/scraper/doi/Mention.java | 15 -- .../rsd/scraper/doi/MentionRecord.java | 52 ----- .../rsd/scraper/doi/MentionRepository.java | 17 -- .../rsd/scraper/doi/OpenAlexCitations.java | 140 +++++++----- .../rsd/scraper/doi/OpenalexId.java | 72 ++++++ .../doi/PostgrestCitationRepository.java | 45 ++-- .../doi/PostgrestMentionRepository.java | 211 +++++++++--------- .../doi/PostgrestReleaseRepository.java | 35 +-- .../rsd/scraper/doi/ReleaseData.java | 8 +- .../rsd/scraper/doi/RsdMentionIds.java | 15 ++ .../rsd/scraper/doi/RsdMentionRecord.java | 16 ++ .../doi/DataciteMentionRepositoryTest.java | 14 +- .../rsd/scraper/doi/DoiTest.java | 54 +++++ .../rsd/scraper/doi/MainMentionsTest.java | 6 +- .../rsd/scraper/doi/OpenalexIdTest.java | 49 ++++ 37 files changed, 849 insertions(+), 542 deletions(-) create mode 100644 scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/Doi.java create mode 100644 scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/ExternalMentionRecord.java delete mode 100644 scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/Mention.java delete mode 100644 scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MentionRecord.java delete mode 100644 scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MentionRepository.java create mode 100644 scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenalexId.java create mode 100644 scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/RsdMentionIds.java create mode 100644 scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/RsdMentionRecord.java create mode 100644 scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/DoiTest.java create mode 100644 scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/OpenalexIdTest.java diff --git a/database/011-create-mention-table.sql b/database/011-create-mention-table.sql index 817b96ea6..146effeef 100644 --- a/database/011-create-mention-table.sql +++ b/database/011-create-mention-table.sql @@ -31,6 +31,7 @@ CREATE TABLE mention ( id UUID DEFAULT gen_random_uuid() PRIMARY KEY, doi CITEXT UNIQUE CHECK (doi ~ '^10(\.\w+)+/\S+$' AND LENGTH(doi) <= 255), doi_registration_date TIMESTAMPTZ, + openalex_id CITEXT UNIQUE CHECK (openalex_id ~ '^https://openalex\.org/[WwAaSsIiCcPpFf]\d{3,13}$'), url VARCHAR(500) CHECK (url ~ '^https?://'), title VARCHAR(3000) 
NOT NULL, authors VARCHAR(50000), @@ -40,15 +41,13 @@ CREATE TABLE mention ( page VARCHAR(50), image_url VARCHAR(500) CHECK (image_url ~ '^https?://'), mention_type mention_type NOT NULL, - external_id VARCHAR(500), source VARCHAR(50) NOT NULL, version VARCHAR(100), note VARCHAR(500), scraped_at TIMESTAMPTZ, citations_scraped_at TIMESTAMPTZ, created_at TIMESTAMPTZ NOT NULL, - updated_at TIMESTAMPTZ NOT NULL, - UNIQUE(external_id, source) + updated_at TIMESTAMPTZ NOT NULL ); CREATE FUNCTION sanitise_insert_mention() RETURNS TRIGGER LANGUAGE plpgsql AS diff --git a/database/104-software-views.sql b/database/104-software-views.sql index ffaee20b3..6380a0b33 100644 --- a/database/104-software-views.sql +++ b/database/104-software-views.sql @@ -56,18 +56,20 @@ CREATE FUNCTION reference_papers_to_scrape() RETURNS TABLE ( id UUID, doi CITEXT, + openalex_id CITEXT, citations_scraped_at TIMESTAMPTZ, - known_dois CITEXT[] + known_citing_dois CITEXT[] ) LANGUAGE sql STABLE AS $$ - SELECT mention.id, mention.doi, mention.citations_scraped_at, ARRAY_REMOVE(ARRAY_AGG(citation.doi), NULL) + SELECT mention.id, mention.doi, mention.openalex_id, mention.citations_scraped_at, ARRAY_REMOVE(ARRAY_AGG(citation.doi), NULL) FROM mention LEFT JOIN citation_for_mention ON mention.id = citation_for_mention.mention LEFT JOIN mention AS citation ON citation_for_mention.citation = citation.id WHERE - -- ONLY items with DOI - mention.doi IS NOT NULL AND ( + -- ONLY items with DOI or OpenAlex id + (mention.doi IS NOT NULL OR mention.openalex_id IS NOT NULL) + AND ( mention.id IN ( SELECT mention FROM reference_paper_for_software ) diff --git a/docker-compose.yml b/docker-compose.yml index 5d6d5dd17..dfe8b64ac 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -16,7 +16,7 @@ version: "3.0" services: database: build: ./database - image: rsd/database:2.4.1 + image: rsd/database:2.5.0 ports: # enable connection from outside (development mode) - "5432:5432" @@ -110,7 +110,7 @@ services: # dockerfile to use for build dockerfile: Dockerfile # update version number to correspond to frontend/package.json - image: rsd/frontend:2.19.0 + image: rsd/frontend:2.19.1 environment: # it uses values from .env file - POSTGREST_URL @@ -158,7 +158,7 @@ services: scrapers: build: ./scrapers - image: rsd/scrapers:1.8.1 + image: rsd/scrapers:1.9.0 environment: # it uses values from .env file - POSTGREST_URL diff --git a/documentation/docs/01-users/05-adding-software.md b/documentation/docs/01-users/05-adding-software.md index 02ace22ad..35c289c4e 100644 --- a/documentation/docs/01-users/05-adding-software.md +++ b/documentation/docs/01-users/05-adding-software.md @@ -75,7 +75,7 @@ When using a Document URL to point to a remote Markdown file on the GitHub, you ### Logo -The software logo is shown on the software page and in the software card (see example below). **You can upload an image up to 2MB of size**. Widely used image formats like jpg, jpeg, png, svg etc. are supported. Use the **svg** format, if possible, because it scales better than other formats. +The software logo is shown on the software page and in the software card (see example below). **You can upload an image up to 2MB of size**. Widely used image formats like JPG, JPEG, PNG, SVG etc. are supported. Use the **svg** format, if possible, because it scales better than other formats. ![image](img/software-logo-card.webp) @@ -164,14 +164,14 @@ This section allows you to add mentions to your software page. 
You can use this ### Reference papers -Use the *Search* box on the right hand side to find papers by DOI or title. All the relevant data about the publication will be retrieved automatically. A background scraper will use [OpenAlex](https://openalex.org/) to collect all citations of the reference papers. +Use the *Search* box on the right hand side to find papers by DOI or title. All the relevant data about the publication will be retrieved automatically. A background scraper will use [OpenAlex](https://openalex.org/) to collect all citations of reference papers that have a DOI or an OpenAlex ID. ### Citations -All the results RSD scraper was able to find on [OpenAlex](https://openalex.org/) citing provided reference papers. It can take a few minutes before the citations are harvested. +These are the citations of the reference papers that the RSD scraper was able to find on [OpenAlex](https://openalex.org/). It can take a few minutes before the citations are harvested. :::warning -You cannot edit this section. All entries are automatically generated by the RSD scraper service. The found mentions are displayed in the mentions section of the software page. +You cannot edit the content of this section. All entries are automatically harvested and generated by the RSD scraper. The mentions found are displayed in the mentions section of the software page. ::: ### Related output diff --git a/documentation/docs/01-users/07-adding-projects.md b/documentation/docs/01-users/07-adding-projects.md index bf0eff51a..41b887e62 100644 --- a/documentation/docs/01-users/07-adding-projects.md +++ b/documentation/docs/01-users/07-adding-projects.md @@ -8,7 +8,7 @@ After signing in, use the **"+"** button next to your avatar icon on the top rig ![image](img/new-project.gif) -The RSD will automatically generate a *slug* for your project based on the project name you have provided. This slug will become part of the URL on which your project page can found. +The RSD will automatically generate a *slug* for your project based on the project name you have provided. This slug will become part of the URL on which your project page can be found. There is a small chance the generated slug is already in use by another project. If this is the case, an error will be shown, and you will need to change the slug manually to resolve this conflict. Once you click **"save"**, the RSD will initialize a new empty project page. This page will not be public yet to give you some time to provide additional information. Next, you can add additional information in the edit sections explained below. @@ -106,16 +106,16 @@ You can import up to 50 publications by providing a list of DOIs, one per line. If the output has no DOI you can create new mention item manually. Each manually added item should at least have a **Title**, **Type** and **URL**. All other fields are optional. The **Note** field can be used to add a note to this item, and will not be shown on the project page. :::warning -Please check if the information is complete and correct. A manual item can not be edited after it has been saved! +Please check if the information is complete and correct. A manual item cannot be edited after it has been saved! You can, however, delete an item and create a new one. ::: ### Citations -Here we list all the citations of your output that the RSD was able to find automatically by using the DOIs of your output and OpenAlex. On the project page these citations are shown in the impact section together with the items you added manually.
+Here, we list all the citations of your output (that has a DOI or OpenAlex ID) that the RSD was able to find automatically on OpenAlex. On the project page, these citations are shown in the impact section together with the items you added manually. :::warning -You cannot edit this section. All entries are automatically generated by the RSD scraper service. Found publications are displayed in the impact section of the project page. +You cannot edit the content of this section. All entries are automatically harvested and generated by the RSD scraper. The publications found are displayed in the impact section of the project page. ::: ### Impact @@ -135,7 +135,7 @@ You can import up to 50 publications by providing a list of DOIs, one per line. If the publication has no DOI you can create a new item manually. Each manually added item should at least have a **Title**, **Type** and **URL**. All other fields are optional. The **Note** field can be used to add a note to this item, and will not be shown on the project page. :::warning -Please check if the information is complete and correct. A manual item can not be edited after it has been saved! You can, however, delete an item and create a new one. +Please check if the information is complete and correct. A manual item cannot be edited after it has been saved! You can, however, delete an item and create a new one. ::: ## Related projects diff --git a/documentation/docs/03-rsd-instance/03-administration.md b/documentation/docs/03-rsd-instance/03-administration.md index fd9afbfb8..912f4ddb2 100644 --- a/documentation/docs/03-rsd-instance/03-administration.md +++ b/documentation/docs/03-rsd-instance/03-administration.md @@ -187,7 +187,7 @@ fill the `provenance_iri` column. Further read [Linked Data](https://en.wikipedi ## Mentions -In this section, admins can search for mentions and edit them. If you enter a DOI or UUID, we search on that field only. Otherwise, we search on title, authors, journal, URL, note and external ID (like an OpenAlex ID). +In this section, admins can search for mentions and edit them. If you enter a DOI or UUID, we search on that field only. Otherwise, we search on title, authors, journal, URL, note and OpenAlex ID. :::warning Edit mentions with care: they might be referenced to in multiple places. If you want to fully change a mention attached to e.g. a software page, you should delete it there and create a new one instead of editing it. 
diff --git a/frontend/components/admin/mentions/MentionsOverview.tsx b/frontend/components/admin/mentions/MentionsOverview.tsx index d34e92bf3..43055110e 100644 --- a/frontend/components/admin/mentions/MentionsOverview.tsx +++ b/frontend/components/admin/mentions/MentionsOverview.tsx @@ -52,7 +52,7 @@ export default function MentionsOverview() { if (searchTypeTerm.type === 'doi') { return `doi=eq.${termEscaped}` } - return `or=(title.ilike.*${termEscaped}*,authors.ilike.*${termEscaped}*,journal.ilike.*${termEscaped}*,url.ilike.*${termEscaped}*,note.ilike.*${termEscaped}*,external_id.ilike.*${termEscaped}*)` + return `or=(title.ilike.*${termEscaped}*,authors.ilike.*${termEscaped}*,journal.ilike.*${termEscaped}*,url.ilike.*${termEscaped}*,note.ilike.*${termEscaped}*,openalex_id.ilike.*${termEscaped}*)` } function sanitiseSearch(search: string): string | undefined { diff --git a/frontend/components/mention/EditMentionModal.tsx b/frontend/components/mention/EditMentionModal.tsx index 9370a5a82..c4d61d493 100644 --- a/frontend/components/mention/EditMentionModal.tsx +++ b/frontend/components/mention/EditMentionModal.tsx @@ -291,14 +291,13 @@ export default function EditMentionModal({open, onCancel, onSubmit, item, pos, t
diff --git a/frontend/components/mention/config.ts b/frontend/components/mention/config.ts index 65fec1038..514c9cf45 100644 --- a/frontend/components/mention/config.ts +++ b/frontend/components/mention/config.ts @@ -135,14 +135,14 @@ export const mentionModal = { } } }, - external_id: { - label: 'External ID', - help: 'An ID used by e.g. OpenAlex', + openalex_id: { + label: 'OpenAlex ID', + help: 'The OpenAlex ID', validation: { required: false, - maxLength: { - value: 500, - message: 'Maximum length is 500' + pattern: { + value: /^https:\/\/openalex\.org\/[WwAaSsIiCcPpFf]\d{3,13}$/, + message: 'e.g. https://openalex.org/W3160330321' } } }, diff --git a/frontend/package.json b/frontend/package.json index f68a136e1..5e675d60b 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "rsd-frontend", - "version": "2.19.0", + "version": "2.19.1", "private": true, "scripts": { "dev": "next dev", diff --git a/frontend/package.json.license b/frontend/package.json.license index 7bc0d9869..eb63f95f2 100644 --- a/frontend/package.json.license +++ b/frontend/package.json.license @@ -3,7 +3,7 @@ SPDX-FileCopyrightText: 2021 - 2023 dv4all SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center SPDX-FileCopyrightText: 2022 Jesús García Gonzalez (Netherlands eScience Center) SPDX-FileCopyrightText: 2023 - 2024 Dusan Mijatovic (Netherlands eScience Center) -SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) +SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) SPDX-License-Identifier: Apache-2.0 SPDX-License-Identifier: CC-BY-4.0 diff --git a/frontend/types/Mention.ts b/frontend/types/Mention.ts index 0b1751c79..63779e630 100644 --- a/frontend/types/Mention.ts +++ b/frontend/types/Mention.ts @@ -29,10 +29,10 @@ export type MentionItemProps = { mention_type: MentionTypeKeys | null source: string note?: string | null - external_id?: string | null + openalex_id?: string | null } -export const mentionColumns ='id,doi,url,title,authors,publisher,publication_year,journal,page,image_url,mention_type,source,note' +export const mentionColumns ='id,doi,openalex_id,url,title,authors,publisher,publication_year,journal,page,image_url,mention_type,source,note' export type MentionByType = { [key in MentionTypeKeys]?: MentionItemProps[] diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/Utils.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/Utils.java index 60c9342ae..27f249ea2 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/Utils.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/Utils.java @@ -127,7 +127,7 @@ public static String getAsAdmin(String uri) { Thread.currentThread().interrupt(); throw new RuntimeException(e); } catch (IOException e) { - LOGGER.warn("An error occurred sending a request to {}", uri, e); + LOGGER.error("An error occurred sending a request to {}", uri, e); throw new RuntimeException(e); } @@ -188,7 +188,9 @@ public static String postAsAdmin(String uri, String json, String... 
extraHeaders .timeout(DEFAULT_TIMEOUT) .header("Content-Type", "application/json") .header("Authorization", "Bearer " + jwtString); - if (extraHeaders != null && extraHeaders.length > 0) builder.headers(extraHeaders); + if (extraHeaders != null && extraHeaders.length > 0) { + builder.headers(extraHeaders); + } HttpRequest request = builder.build(); HttpResponse response; @@ -276,15 +278,18 @@ static String createPatchUri(String baseuri, String tableName, String primaryKey return "%s/%s?%s=eq.%s".formatted(baseuri, tableName, primaryKeyName, primaryKey); } - public static String patchAsAdmin(String uri, String json) { + public static String patchAsAdmin(String uri, String json, String... extraHeaders) { String jwtString = adminJwt(); - HttpRequest request = HttpRequest.newBuilder() + HttpRequest.Builder builder = HttpRequest.newBuilder() .method("PATCH", HttpRequest.BodyPublishers.ofString(json)) .uri(URI.create(uri)) .timeout(Duration.ofSeconds(30)) .header("Content-Type", "application/json") - .header("Authorization", "Bearer " + jwtString) - .build(); + .header("Authorization", "Bearer " + jwtString); + if (extraHeaders != null && extraHeaders.length > 0) { + builder.headers(extraHeaders); + } + HttpRequest request = builder.build(); HttpResponse response; try (HttpClient client = HttpClient.newHttpClient()) { response = client.send(request, HttpResponse.BodyHandlers.ofString()); diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/CitationData.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/CitationData.java index 94bd1ae79..3af97d419 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/CitationData.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/CitationData.java @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2023 Netherlands eScience Center +// SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center // // SPDX-License-Identifier: Apache-2.0 @@ -8,31 +8,20 @@ import java.util.Collection; import java.util.UUID; -/** +/** * Container class for Citation information retrieved from the database. */ -public class CitationData { +public record CitationData( + // UUID of this entry in the database + UUID id, - // UUID of this entry in the database - public final UUID id; - - // DOI of this entry. - public final String doi; - - // List of known DOIs citing this entry. - public final Collection knownDois; - - /** - * Create a CitationData and initialize with data provided. - * - * @param id of this entry in the database - * @param doi of this entry - * @param knownDois list of known DOIs citing this entry - */ - public CitationData(UUID id, String doi, Collection knownDois) { - super(); - this.id = id; - this.doi = doi; - this.knownDois = knownDois; - } + // DOI of this entry. + Doi doi, + + // OpenAlex ID of this entry + OpenalexId openalexId, + + // List of known DOIs citing this entry. 
+ Collection knownDois +) { } diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/CrossrefMention.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/CrossrefMention.java index 5dc6fe905..67e0a51bb 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/CrossrefMention.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/CrossrefMention.java @@ -16,14 +16,13 @@ import java.io.IOException; import java.net.URI; -import java.time.Instant; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.Map; import java.util.Objects; -public class CrossrefMention implements Mention { +public class CrossrefMention { static final Map crossrefTypeMap; @@ -62,26 +61,24 @@ public class CrossrefMention implements Mention { crossrefTypeMap.put("standard-series", MentionType.other); } - private final String doi; + private final Doi doi; - public CrossrefMention(String doi) { + public CrossrefMention(Doi doi) { this.doi = Objects.requireNonNull(doi); } - @Override - public MentionRecord mentionData() throws IOException, InterruptedException, RsdResponseException { - StringBuilder url = new StringBuilder("https://api.crossref.org/works/" + Utils.urlEncode(doi)); - Config.crossrefContactEmail().ifPresent(email -> url.append("?mailto=").append(email)); - String responseJson = Utils.get(url.toString()); + public ExternalMentionRecord mentionData() throws IOException, InterruptedException, RsdResponseException { + StringBuilder crossrefUrlBuilder = new StringBuilder("https://api.crossref.org/works/" + Utils.urlEncode(doi.toString())); + Config.crossrefContactEmail().ifPresent(email -> crossrefUrlBuilder.append("?mailto=").append(email)); + String responseJson = Utils.get(crossrefUrlBuilder.toString()); JsonObject jsonTree = JsonParser.parseString(responseJson).getAsJsonObject(); - MentionRecord result = new MentionRecord(); JsonObject workJson = jsonTree.getAsJsonObject("message"); - result.doi = doi; - result.url = URI.create("https://doi.org/" + Utils.urlEncode(result.doi)); - result.title = workJson.getAsJsonArray("title").get(0).getAsString(); + URI mentionUrl = URI.create("https://doi.org/" + Utils.urlEncode(this.doi.toString())); + String title = workJson.getAsJsonArray("title").get(0).getAsString(); - Collection authors = new ArrayList<>(); + Collection authorsBuilder = new ArrayList<>(); + String authors = null; Iterable authorsJson = (Iterable) workJson.getAsJsonArray("author"); if (authorsJson != null) { for (JsonObject authorJson : authorsJson) { @@ -89,36 +86,51 @@ public MentionRecord mentionData() throws IOException, InterruptedException, Rsd String familyName = Utils.stringOrNull(authorJson.get("family")); String name = Utils.stringOrNull(authorJson.get("name")); if (givenName != null && familyName != null) { - authors.add(givenName + " " + familyName); + authorsBuilder.add(givenName + " " + familyName); } else if (name != null) { - authors.add(name); + authorsBuilder.add(name); } else if (givenName != null) { - authors.add(givenName); + authorsBuilder.add(givenName); } else if (familyName != null) { - authors.add(familyName); + authorsBuilder.add(familyName); } } - result.authors = String.join(", ", authors); + authors = String.join(", ", authorsBuilder); } - result.publisher = Utils.stringOrNull(workJson.get("publisher")); + String publisher = Utils.stringOrNull(workJson.get("publisher")); + Integer publicationYear = null; try { - result.publicationYear = 
Utils.integerOrNull(workJson.getAsJsonObject("published").getAsJsonArray("date-parts").get(0).getAsJsonArray().get(0)); + publicationYear = Utils.integerOrNull(workJson.getAsJsonObject("published").getAsJsonArray("date-parts").get(0).getAsJsonArray().get(0)); } catch (RuntimeException e) { // year not found, we leave it at null, nothing to do } + String journal = null; if (!workJson.getAsJsonArray("container-title").isEmpty()) { JsonArray journalTitles = workJson.getAsJsonArray("container-title"); - result.journal = journalTitles.get(0).getAsString(); + StringBuilder journalBuilder = new StringBuilder(journalTitles.get(0).getAsString()); for (int i = 1; i < journalTitles.size(); i++) { - result.journal += ", " + journalTitles.get(i).getAsString(); + journalBuilder.append(", ").append(journalTitles.get(i).getAsString()); } + journal = journalBuilder.toString(); } - result.page = Utils.stringOrNull(workJson.get("page")); - result.mentionType = crossrefTypeMap.getOrDefault(Utils.stringOrNull(workJson.get("type")), MentionType.other); - result.source = "Crossref"; - result.scrapedAt = Instant.now(); + String page = Utils.stringOrNull(workJson.get("page")); + MentionType mentionType = crossrefTypeMap.getOrDefault(Utils.stringOrNull(workJson.get("type")), MentionType.other); - return result; + return new ExternalMentionRecord( + this.doi, + null, + null, + mentionUrl, + title, + authors, + publisher, + publicationYear, + journal, + page, + mentionType, + "Crossref", + null + ); } } diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataCiteReleaseRepository.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataCiteReleaseRepository.java index a5f64ba65..b50a46791 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataCiteReleaseRepository.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataCiteReleaseRepository.java @@ -16,13 +16,13 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; +import java.util.HashMap; import java.util.Map; -import java.util.TreeMap; public class DataCiteReleaseRepository { private static final Logger LOGGER = LoggerFactory.getLogger(DataCiteReleaseRepository.class); - + private static final String QUERY_UNFORMATTED = """ query { works(ids: [%s], first: 10000) { @@ -39,35 +39,36 @@ public class DataCiteReleaseRepository { } """; - public Map> getVersionedDois(Collection conceptDois) { + public Map> getVersionedDois(Collection conceptDois) { if (conceptDois.isEmpty()) { return Collections.emptyMap(); } - String query = QUERY_UNFORMATTED.formatted(DataciteMentionRepository.joinCollection(conceptDois)); + String query = QUERY_UNFORMATTED.formatted(DataciteMentionRepository.joinDoisForGraphqlQuery(conceptDois)); JsonObject body = new JsonObject(); body.addProperty("query", query); String responseJson = Utils.post("https://api.datacite.org/graphql", body.toString(), "Content-Type", "application/json"); return parseJson(responseJson); } - Map> parseJson(String json) { + Map> parseJson(String json) { DataciteMentionRepository dataciteMentionRepository = new DataciteMentionRepository(); JsonObject root = JsonParser.parseString(json).getAsJsonObject(); JsonArray worksJson = root.getAsJsonObject("data").getAsJsonObject("works").getAsJsonArray("nodes"); - Map> releasesPerConceptDoi = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); + Map> releasesPerConceptDoi = new HashMap<>(); for (JsonElement work : worksJson) { try { JsonObject workObject = work.getAsJsonObject(); - String 
conceptDoi = workObject.getAsJsonPrimitive("doi").getAsString(); + String conceptDoiString = workObject.getAsJsonPrimitive("doi").getAsString(); + Doi conceptDoi = Doi.fromString(conceptDoiString); Integer versionOfCount = Utils.integerOrNull(workObject.get("versionOfCount")); if (versionOfCount == null || versionOfCount.intValue() != 0) { - LOGGER.debug("{} is not a concept DOI", conceptDoi); + LOGGER.debug("{} is not a concept DOI", conceptDoiString); continue; } - Collection versionDois = new ArrayList<>(); + Collection versionDois = new ArrayList<>(); JsonArray relatedIdentifiers = workObject.getAsJsonArray("relatedIdentifiers"); for (JsonElement relatedIdentifier : relatedIdentifiers) { JsonObject relatedIdentifierObject = relatedIdentifier.getAsJsonObject(); @@ -78,13 +79,13 @@ Map> parseJson(String json) { if (relatedIdentifierType == null || !relatedIdentifierType.equals("DOI")) continue; String relatedIdentifierDoi = relatedIdentifierObject.getAsJsonPrimitive("relatedIdentifier").getAsString(); - versionDois.add(relatedIdentifierDoi); + versionDois.add(Doi.fromString(relatedIdentifierDoi)); } - Collection versionedMentions = dataciteMentionRepository.mentionData(versionDois); + Collection versionedMentions = dataciteMentionRepository.mentionData(versionDois); releasesPerConceptDoi.put(conceptDoi, versionedMentions); } catch (RuntimeException e) { - LOGGER.warn("Failed to scrape a DataCite mention with data {}, ", work, e); + LOGGER.error("Failed to scrape a DataCite mention with data {}, ", work, e); } } return releasesPerConceptDoi; diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataciteMentionRepository.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataciteMentionRepository.java index 4ff06b198..cd200411a 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataciteMentionRepository.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/DataciteMentionRepository.java @@ -14,23 +14,22 @@ import org.slf4j.LoggerFactory; import java.net.URI; -import java.time.Instant; import java.time.ZonedDateTime; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.Map; import java.util.Set; -import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; -public class DataciteMentionRepository implements MentionRepository { +public class DataciteMentionRepository { private static final Logger LOGGER = LoggerFactory.getLogger(DataciteMentionRepository.class); - + private static final String QUERY_UNFORMATTED = """ query { works(ids: [%s], first: 10000) { @@ -111,69 +110,70 @@ public class DataciteMentionRepository implements MentionRepository { } // "10.5281/zenodo.1408128","10.1186/s12859-018-2165-7" - static String joinCollection(Collection dois) { + static String joinDoisForGraphqlQuery(Collection dois) { return dois.stream() + .map(Doi::toString) .collect(Collectors.joining("\",\"", "\"", "\"")); } - static Collection jsonStringToUniqueMentions(String json) { + static Collection jsonStringToUniqueMentions(String json) { JsonObject root = JsonParser.parseString(json).getAsJsonObject(); JsonArray worksJson = root.getAsJsonObject("data").getAsJsonObject("works").getAsJsonArray("nodes"); - Collection mentions = new ArrayList<>(); - Set usedDois = new TreeSet<>(String.CASE_INSENSITIVE_ORDER); + Collection mentions = new ArrayList<>(); + Set usedDois = new 
HashSet<>(); for (JsonElement work : worksJson) { try { // Sometimes, DataCite gives back two of the same results for one DOI, e.g. for 10.4122/1.1000000817, // so we need to only add it once, otherwise we cannot POST it to the backend - MentionRecord parsedMention = parseWork(work.getAsJsonObject()); - if (usedDois.contains(parsedMention.doi)) continue; + ExternalMentionRecord parsedMention = parseWork(work.getAsJsonObject()); + if (usedDois.contains(parsedMention.doi())) continue; - usedDois.add(parsedMention.doi); + usedDois.add(parsedMention.doi()); mentions.add(parsedMention); } catch (RuntimeException e) { - // TODO: fix exeption type - LOGGER.warn("Failed to scrape a DataCite mention with data {} ", work, e); + // TODO: fix exception type + LOGGER.error("Failed to scrape a DataCite mention with data {} ", work, e); } } return mentions; } - static MentionRecord parseWork(JsonObject work) { - MentionRecord result = new MentionRecord(); - result.doi = work.get("doi").getAsString(); - result.url = URI.create("https://doi.org/" + Utils.urlEncode(result.doi)); - result.title = work.getAsJsonArray("titles").get(0).getAsJsonObject().get("title").getAsString(); + static ExternalMentionRecord parseWork(JsonObject work) { + Doi doi = Doi.fromString(work.get("doi").getAsString()); + URI url = URI.create("https://doi.org/" + Utils.urlEncode(doi.toString())); + String title = work.getAsJsonArray("titles").get(0).getAsJsonObject().get("title").getAsString(); - Collection authors = new ArrayList<>(); + Collection authorsBuilder = new ArrayList<>(); Iterable creators = (Iterable) work.getAsJsonArray("creators"); for (JsonObject creator : creators) { - addAuthor(authors, creator); + addAuthor(authorsBuilder, creator); } Iterable contributors = (Iterable) work.getAsJsonArray("contributors"); for (JsonObject contributor : contributors) { - addAuthor(authors, contributor); + addAuthor(authorsBuilder, contributor); } - result.authors = String.join(", ", authors); + String authors = String.join(", ", authorsBuilder); - result.publisher = Utils.stringOrNull(work.getAsJsonObject("publisher").get("name")); - result.publicationYear = Utils.integerOrNull(work.get("publicationYear")); + String publisher = Utils.stringOrNull(work.getAsJsonObject("publisher").get("name")); + Integer publicationYear = Utils.integerOrNull(work.get("publicationYear")); String doiRegistrationDateString = Utils.stringOrNull(work.get("registered")); + ZonedDateTime doiRegistrationDate = null; if (doiRegistrationDateString != null) { - result.doiRegistrationDate = ZonedDateTime.parse(doiRegistrationDateString); + doiRegistrationDate = ZonedDateTime.parse(doiRegistrationDateString); } + MentionType mentionType; String dataciteResourceTypeGeneral = Utils.stringOrNull(work.getAsJsonObject("types").get("resourceTypeGeneral")); if (dataciteResourceTypeGeneral != null && dataciteResourceTypeGeneral.equals("Text")) { String dataciteResourceType = Utils.stringOrNull(work.getAsJsonObject("types").get("resourceType")); if (dataciteResourceType != null) dataciteResourceType = dataciteResourceType.strip(); - result.mentionType = dataciteTextTypeMap.getOrDefault(dataciteResourceType, MentionType.other); + mentionType = dataciteTextTypeMap.getOrDefault(dataciteResourceType, MentionType.other); } else { - result.mentionType = dataciteTypeMap.getOrDefault(dataciteResourceTypeGeneral, MentionType.other); + mentionType = dataciteTypeMap.getOrDefault(dataciteResourceTypeGeneral, MentionType.other); } - result.source = "DataCite"; - result.version = 
Utils.stringOrNull(work.get("version")); + String version = Utils.stringOrNull(work.get("version")); // if the version is null, we can often get the version from a linked Git URL which ends in "/tree/{tag}" - if (result.version == null) { + if (version == null) { JsonArray relatedIdentifiers = work.getAsJsonArray("relatedIdentifiers"); for (JsonElement relatedIdentifier : relatedIdentifiers) { String relatedIdentifierString = Utils.stringOrNull(relatedIdentifier.getAsJsonObject().get("relatedIdentifier")); @@ -181,15 +181,28 @@ static MentionRecord parseWork(JsonObject work) { if (relatedIdentifierString != null && relatedIdentifierType != null && relatedIdentifierType.equals("URL")) { Matcher tagMatcher = URL_TREE_TAG_PATTERN.matcher(relatedIdentifierString); if (tagMatcher.find()) { - result.version = tagMatcher.group(1); + version = tagMatcher.group(1); break; } } } } - result.scrapedAt = Instant.now(); - return result; + return new ExternalMentionRecord( + doi, + doiRegistrationDate, + null, + url, + title, + authors, + publisher, + publicationYear, + null, + null, + mentionType, + "DataCite", + version + ); } static void addAuthor(Collection authors, JsonObject author) { @@ -201,26 +214,14 @@ static void addAuthor(Collection authors, JsonObject author) { else authors.add(givenName + " " + familyName); } - @Override - public Collection leastRecentlyScrapedMentions(int limit) { - throw new UnsupportedOperationException(); - } - - @Override - public Collection mentionData(Collection dois) { + public Collection mentionData(Collection dois) { if (dois.isEmpty()) { return Collections.emptyList(); } JsonObject body = new JsonObject(); - body.addProperty("query", QUERY_UNFORMATTED.formatted(joinCollection(dois))); + body.addProperty("query", QUERY_UNFORMATTED.formatted(joinDoisForGraphqlQuery(dois))); String responseJson = Utils.post("https://api.datacite.org/graphql", body.toString(), "Content-Type", "application/json"); return jsonStringToUniqueMentions(responseJson); } - - @Override - public void save(Collection mentions) { - throw new UnsupportedOperationException(); - } - } diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/Doi.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/Doi.java new file mode 100644 index 000000000..034603f32 --- /dev/null +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/Doi.java @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2024 Netherlands eScience Center +// +// SPDX-License-Identifier: Apache-2.0 + +package nl.esciencecenter.rsd.scraper.doi; + +import nl.esciencecenter.rsd.scraper.Utils; + +import java.util.Locale; +import java.util.regex.Pattern; + +public class Doi { + + private static final Pattern DOI_PATTERN = Pattern.compile("^10(\\.\\w+)+/\\S+$"); + + private final String doiString; + + private Doi(String doiString) { + this.doiString = doiString.toLowerCase(Locale.ROOT); + } + + public static boolean isValid(String doiToTest) { + return doiToTest != null && doiToTest.length() <= 255 && DOI_PATTERN.asPredicate().test(doiToTest); + } + + public static Doi fromString(String doi) { + if (isValid(doi)) { + return new Doi(doi); + } else { + throw new IllegalArgumentException(); + } + } + + public String toUrlEncodedString() { + return Utils.urlEncode(doiString); + } + + @Override + public String toString() { + return doiString; + } + + @Override + public int hashCode() { + return doiString.hashCode(); + } + + @Override + public boolean 
equals(Object other) { + if (other == null) { + return false; + } + if (this == other) { + return true; + } + if (other instanceof Doi otherDoi) { + return doiString.equals(otherDoi.doiString); + } + + return false; + } +} diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/ExternalMentionRecord.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/ExternalMentionRecord.java new file mode 100644 index 000000000..89973a23b --- /dev/null +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/ExternalMentionRecord.java @@ -0,0 +1,26 @@ +// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2024 Netherlands eScience Center +// +// SPDX-License-Identifier: Apache-2.0 + +package nl.esciencecenter.rsd.scraper.doi; + +import java.net.URI; +import java.time.ZonedDateTime; + +public record ExternalMentionRecord( + Doi doi, + ZonedDateTime doiRegistrationDate, + OpenalexId openalexId, + URI url, + String title, + String authors, + String publisher, + Integer publicationYear, + String journal, + String page, + MentionType mentionType, + String source, + String version +) { +} diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainCitations.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainCitations.java index 93843d6b5..050d9069d 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainCitations.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainCitations.java @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2023 Netherlands eScience Center +// SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center // // SPDX-License-Identifier: Apache-2.0 @@ -7,80 +7,87 @@ import nl.esciencecenter.rsd.scraper.Config; import nl.esciencecenter.rsd.scraper.Utils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; -import java.time.ZonedDateTime; +import java.time.Instant; import java.util.ArrayList; import java.util.Collection; import java.util.UUID; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - /* - * Main entry point for citation scraper. + * Main entry point for citation scraper. 
*/ public class MainCitations { - + private static final Logger LOGGER = LoggerFactory.getLogger(MainCitations.class); - + public static void main(String[] args) { - + LOGGER.info("Start scraping citations"); long start = System.currentTimeMillis(); try { - // Connect to the database to retrieve the - + // Connect to the database to retrieve the reference papers to scrape + String backendUrl = Config.backendBaseUrl(); PostgrestCitationRepository localCitationRepository = new PostgrestCitationRepository(backendUrl); Collection referencePapersToScrape = localCitationRepository.leastRecentlyScrapedCitations(5); OpenAlexCitations openAlexCitations = new OpenAlexCitations(); - MentionRepository localMentionRepository = new PostgrestMentionRepository(backendUrl); + PostgrestMentionRepository localMentionRepository = new PostgrestMentionRepository(backendUrl); String email = Config.crossrefContactEmail().orElse(null); - ZonedDateTime now = ZonedDateTime.now(); + Instant now = Instant.now(); for (CitationData citationData : referencePapersToScrape) { long t1 = System.currentTimeMillis(); - LOGGER.info("Scraping for {}", citationData.doi); + LOGGER.info("Scraping for DOI {}, OpenAlex ID {}", citationData.doi(), citationData.openalexId()); - Collection citingMentions = openAlexCitations.citations(citationData.doi, email, citationData.id); + Collection citingMentions = openAlexCitations.citations(citationData.openalexId(), citationData.doi(), email, citationData.id()); // we don't update mentions that have a DOI in the database with OpenAlex data, as they can already be - // scraped through Crossref of DataCite + // scraped through Crossref or DataCite long t2 = System.currentTimeMillis(); - citingMentions.removeIf(mention -> mention.doi != null && citationData.knownDois.contains(mention.doi)); - localMentionRepository.save(citingMentions); + citingMentions.removeIf(mention -> mention.doi() != null && citationData.knownDois().contains(mention.doi())); + Collection savedIds = new ArrayList<>(citingMentions.size()); + for (ExternalMentionRecord citingMention : citingMentions) { + try { + RsdMentionIds ids = localMentionRepository.createOrUpdateMentionWithOpenalexId(citingMention, now); + savedIds.add(ids); + } catch (Exception e) { + LOGGER.error("Unable to save mention with OpenAlex ID {}", citingMention.openalexId()); + Utils.saveExceptionInDatabase("Citation scraper", "mention", null, e); + } + } Collection citingMentionIds = new ArrayList<>(); - for (MentionRecord citingMention : citingMentions) { - citingMentionIds.add(citingMention.id); + for (RsdMentionIds ids : savedIds) { + citingMentionIds.add(ids.id()); } long t3 = System.currentTimeMillis(); - localCitationRepository.saveCitations(backendUrl, citationData.id, citingMentionIds, now); + localCitationRepository.saveCitations(backendUrl, citationData.id(), citingMentionIds, now); long t4 = System.currentTimeMillis(); - LOGGER.info("Scraping for {} done. OpenAlex: {} ms. Saving mentions {} ms. Saving citations {} ms. Total {} ms.", citationData.doi, (t2-t1), (t3-t2), (t4-t3), (t4-t1)); + LOGGER.info("Scraping for {} done. OpenAlex: {} ms. Saving mentions {} ms. Saving citations {} ms. 
Total {} ms.", citationData.doi(), (t2 - t1), (t3 - t2), (t4 - t3), (t4 - t1)); } } catch (IOException | InterruptedException e) { Utils.saveExceptionInDatabase("Citation scraper", null, null, e); - - if (e instanceof InterruptedException) { + + if (e instanceof InterruptedException) { Thread.currentThread().interrupt(); - } + } } long time = System.currentTimeMillis() - start; - LOGGER.info("Done scraping citations ({} ms.)", time); } } diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainMentions.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainMentions.java index 2e85bd5c2..7b996e0ad 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainMentions.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainMentions.java @@ -15,36 +15,35 @@ import org.slf4j.LoggerFactory; import java.time.Instant; -import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; +import java.util.List; import java.util.Map; -import java.util.TreeMap; import java.util.stream.Collectors; public class MainMentions { private static final Logger LOGGER = LoggerFactory.getLogger(MainMentions.class); - + public static void main(String[] args) { - + LOGGER.info("Start scraping mentions"); - + long t1 = System.currentTimeMillis(); - - MentionRepository localMentionRepository = new PostgrestMentionRepository(Config.backendBaseUrl()); - Collection mentionsToScrape = localMentionRepository.leastRecentlyScrapedMentions(Config.maxRequestsDoi()); + + PostgrestMentionRepository localMentionRepository = new PostgrestMentionRepository(Config.backendBaseUrl()); + Collection mentionsToScrape = localMentionRepository.leastRecentlyScrapedMentions(Config.maxRequestsDoi()); // we will remove successfully scraped mentions from here, // we use this to set scrapedAt even for failed mentions, - // to put them back at the scraping order - Map mentionsFailedToScrape = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); - for (MentionRecord mentionRecord : mentionsToScrape) { - mentionsFailedToScrape.put(mentionRecord.doi, mentionRecord); + // to put them back at the scraping queue + Map mentionsFailedToScrape = new HashMap<>(); + for (RsdMentionIds mentionIds : mentionsToScrape) { + mentionsFailedToScrape.put(mentionIds.doi(), mentionIds); } String doisJoined = mentionsToScrape.stream() - .map(mention -> mention.doi) - .map(Utils::urlEncode) + .map(RsdMentionIds::doi) + .map(Doi::toUrlEncodedString) .collect(Collectors.joining(",")); String jsonSources = null; try { @@ -54,75 +53,110 @@ public static void main(String[] args) { System.exit(1); } - Map doiToSource = parseJsonSources(jsonSources); + Map doiToSource = parseJsonDoiSources(jsonSources); - Collection scrapedMentions = new ArrayList<>(); - Collection dataciteDois = doiToSource.entrySet() + Instant now = Instant.now(); + + // DATACITE + Collection dataciteDois = doiToSource.entrySet() .stream() .filter(doiSourceEntry -> doiSourceEntry.getValue().equals("DataCite")) .map(Map.Entry::getKey) + .map(Doi::fromString) .toList(); + Collection scrapedDataciteMentions = List.of(); try { - scrapedMentions.addAll(new DataciteMentionRepository().mentionData(dataciteDois)); + scrapedDataciteMentions = new DataciteMentionRepository().mentionData(dataciteDois); } catch (RuntimeException e) { Utils.saveExceptionInDatabase("DataCite mention scraper", "mention", null, e); } - for (MentionRecord scrapedMention : scrapedMentions) { - mentionsFailedToScrape.remove(scrapedMention.doi); + for (ExternalMentionRecord 
scrapedMention : scrapedDataciteMentions) { + Doi doi = scrapedMention.doi(); + RsdMentionIds ids = mentionsFailedToScrape.get(doi); + try { + RsdMentionRecord mentionToUpdate = new RsdMentionRecord(ids.id(), scrapedMention, now); + localMentionRepository.updateMention(mentionToUpdate, false); + mentionsFailedToScrape.remove(doi); + } catch (Exception e) { + LOGGER.error("Failed to update a DataCite mention with DOI {}", scrapedMention.doi()); + Utils.saveExceptionInDatabase("Mention scraper", "mention", ids.id(), e); + } + } + // END DATACITE - Collection crossrefDois = doiToSource.entrySet() + // CROSSREF + Collection crossrefDois = doiToSource.entrySet() .stream() .filter(doiSourceEntry -> doiSourceEntry.getValue().equals("Crossref")) .map(Map.Entry::getKey) + .map(Doi::fromString) .toList(); - for (String crossrefDoi : crossrefDois) { + for (Doi crossrefDoi : crossrefDois) { + ExternalMentionRecord scrapedMention; try { - MentionRecord scrapedMention = new CrossrefMention(crossrefDoi).mentionData(); - scrapedMentions.add(scrapedMention); - mentionsFailedToScrape.remove(scrapedMention.doi); + scrapedMention = new CrossrefMention(crossrefDoi).mentionData(); } catch (Exception e) { + LOGGER.error("Failed to scrape a Crossref mention with DOI {}", crossrefDoi); RuntimeException exceptionWithMessage = new RuntimeException("Failed to scrape a Crossref mention with DOI " + crossrefDoi, e); - Utils.saveExceptionInDatabase("Crossref mention scraper", "mention", null, exceptionWithMessage); + Utils.saveExceptionInDatabase("Crossref mention scraper", "mention", mentionsFailedToScrape.get(crossrefDoi).id(), exceptionWithMessage); + continue; + } + Doi doi = scrapedMention.doi(); + RsdMentionIds ids = mentionsFailedToScrape.get(doi); + RsdMentionRecord mentionToUpdate = new RsdMentionRecord(ids.id(), scrapedMention, now); + try { + localMentionRepository.updateMention(mentionToUpdate, false); + mentionsFailedToScrape.remove(doi); + } catch (Exception e) { + RuntimeException exceptionWithMessage = new RuntimeException("Failed to update a Crossref mention with DOI " + crossrefDoi, e); + Utils.saveExceptionInDatabase("Crossref mention scraper", "mention", ids.id(), exceptionWithMessage); } } + // END CROSSREF + // OPENALEX (for European Publication Office DOIs) String email = Config.crossrefContactEmail().orElse(null); - Collection europeanPublicationsOfficeDois = doiToSource.entrySet() + Collection scrapedOpenalexMentions = List.of(); + Collection europeanPublicationsOfficeDois = doiToSource.entrySet() .stream() .filter(doiSourceEntry -> doiSourceEntry.getValue().equals("OP")) .map(Map.Entry::getKey) + .map(Doi::fromString) .toList(); try { - Collection openalexMentions = new OpenAlexCitations().mentionData(europeanPublicationsOfficeDois, email); - for (MentionRecord openalexMention : openalexMentions) { - mentionsFailedToScrape.remove(openalexMention.doi); - scrapedMentions.add(openalexMention); - } + scrapedOpenalexMentions = new OpenAlexCitations().mentionData(europeanPublicationsOfficeDois, email); } catch (Exception e) { - Utils.saveExceptionInDatabase("DataCite mention scraper", "mention", null, e); + Utils.saveExceptionInDatabase("OpenAlex mention scraper", "mention", null, e); } - - Instant now = Instant.now(); - for (MentionRecord mention : mentionsFailedToScrape.values()) { - mention.scrapedAt = now; - LOGGER.info("Failed to scrape mention with DOI {}", mention.doi); + for (ExternalMentionRecord scrapedMention : scrapedOpenalexMentions) { + Doi doi = scrapedMention.doi(); + RsdMentionIds 
ids = mentionsFailedToScrape.get(doi); + RsdMentionRecord mentionToUpdate = new RsdMentionRecord(ids.id(), scrapedMention, now); + try { + localMentionRepository.updateMention(mentionToUpdate, true); + mentionsFailedToScrape.remove(doi); + } catch (Exception e) { + LOGGER.error("Failed to update an OpenAlex mention with DOI {}", scrapedMention.doi()); + Utils.saveExceptionInDatabase("Mention scraper", "mention", ids.id(), e); + } } - scrapedMentions.addAll(mentionsFailedToScrape.values()); + // END OPENALEX - - try { - localMentionRepository.save(scrapedMentions); - } catch (RuntimeException e) { - Utils.saveExceptionInDatabase("Mention scraper", "mention", null, e); + for (RsdMentionIds ids : mentionsFailedToScrape.values()) { + LOGGER.error("Failed to scrape mention with DOI {}", ids.doi()); + try { + localMentionRepository.saveScrapedAt(ids, now); + } catch (RuntimeException e) { + Utils.saveExceptionInDatabase("Mention scraper", "mention", ids.id(), e); + } } long time = System.currentTimeMillis() - t1; - LOGGER.info("Done scraping mentions ({} ms.)", time); } - static Map parseJsonSources(String jsonSources) { + static Map parseJsonDoiSources(String jsonSources) { JsonArray sourcesArray = JsonParser.parseString(jsonSources).getAsJsonArray(); Map result = new HashMap<>(); for (JsonElement jsonElement : sourcesArray) { diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainReleases.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainReleases.java index 49400429f..98cb8869d 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainReleases.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MainReleases.java @@ -1,20 +1,21 @@ -// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2023 Netherlands eScience Center +// SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center // // SPDX-License-Identifier: Apache-2.0 package nl.esciencecenter.rsd.scraper.doi; import nl.esciencecenter.rsd.scraper.Config; +import nl.esciencecenter.rsd.scraper.Utils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.time.Instant; import java.util.Collection; +import java.util.HashMap; import java.util.Map; -import java.util.TreeMap; import java.util.UUID; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; /* * 1. Get the least recently scraped releases from software with a concept DOI. We also check for existing releases that already exist as a mention in the database, so we don't have to (TODO) recreate them later. * 2. For each release check if it's a concept DOI on DataCite and get all the versioned DOIs. 
@@ -23,37 +24,42 @@ public class MainReleases { private static final Logger LOGGER = LoggerFactory.getLogger(MainReleases.class); - + public static void main(String[] args) { - + LOGGER.info("Start scraping releases"); - + long t1 = System.currentTimeMillis(); - + PostgrestReleaseRepository releaseRepository = new PostgrestReleaseRepository(Config.backendBaseUrl()); Collection releasesToScrape = releaseRepository.leastRecentlyScrapedReleases(Config.maxRequestsDoi()); - Collection conceptDoisToScrape = releasesToScrape.stream() + Collection conceptDoisToScrape = releasesToScrape.stream() .map(releaseData -> releaseData.conceptDoi) .toList(); - Map> scrapedReleasesPerConceptDoi = new DataCiteReleaseRepository().getVersionedDois(conceptDoisToScrape); + Map> scrapedReleasesPerConceptDoi = new DataCiteReleaseRepository().getVersionedDois(conceptDoisToScrape); - MentionRepository localMentionRepository = new PostgrestMentionRepository(Config.backendBaseUrl()); - Collection allMentions = scrapedReleasesPerConceptDoi.values().stream() + Instant now = Instant.now(); + PostgrestMentionRepository localMentionRepository = new PostgrestMentionRepository(Config.backendBaseUrl()); + Collection allMentions = scrapedReleasesPerConceptDoi.values().stream() .flatMap(Collection::stream) .toList(); - localMentionRepository.save(allMentions); - Map doiToId = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); - for (MentionRecord mention : allMentions) { - doiToId.put(mention.doi, mention.id); + Map doiToId = new HashMap<>(); + for (ExternalMentionRecord mention : allMentions) { + try { + RsdMentionIds ids = localMentionRepository.createMentionIfNotExistsOnDoiAndGetIds(mention, now); + doiToId.put(mention.doi(), ids.id()); + } catch (Exception e) { + LOGGER.error("Unable to save mention with DOI {}", mention.doi()); + Utils.saveExceptionInDatabase("Releases scraper", "mention", null, e); + } } releaseRepository.saveReleaseContent(releasesToScrape, scrapedReleasesPerConceptDoi, doiToId); - + long time = System.currentTimeMillis() - t1; - LOGGER.info("Done scraping releases ({} ms.)", time); } } diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/Mention.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/Mention.java deleted file mode 100644 index 654201b09..000000000 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/Mention.java +++ /dev/null @@ -1,15 +0,0 @@ -// SPDX-FileCopyrightText: 2022 - 2023 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2022 - 2023 Netherlands eScience Center -// -// SPDX-License-Identifier: Apache-2.0 - -package nl.esciencecenter.rsd.scraper.doi; - -import java.io.IOException; - -import nl.esciencecenter.rsd.scraper.RsdResponseException; - -public interface Mention { - - MentionRecord mentionData() throws IOException, InterruptedException, RsdResponseException; -} diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MentionRecord.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MentionRecord.java deleted file mode 100644 index 421e4e0f5..000000000 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MentionRecord.java +++ /dev/null @@ -1,52 +0,0 @@ -// SPDX-FileCopyrightText: 2022 - 2023 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2022 - 2023 Netherlands eScience Center -// -// SPDX-License-Identifier: Apache-2.0 - -package nl.esciencecenter.rsd.scraper.doi; - -import java.net.URI; -import java.time.Instant; -import java.time.ZonedDateTime; -import 
java.util.UUID; - -public class MentionRecord { - UUID id; - String doi; - URI url; - String title; - String authors; - String publisher; - Integer publicationYear; - ZonedDateTime doiRegistrationDate; - String journal; - String page; - URI imageUrl; - MentionType mentionType; - String externalId; - String source; - Instant scrapedAt; - String version; - - @Override - public String toString() { - return "MentionRecord{" + - "id=" + id + - ", doi='" + doi + '\'' + - ", url=" + url + - ", title='" + title + '\'' + - ", authors='" + authors + '\'' + - ", publisher='" + publisher + '\'' + - ", publicationYear=" + publicationYear + - ", doiRegistrationDate=" + doiRegistrationDate + - ", journal='" + journal + '\'' + - ", page='" + page + '\'' + - ", imageUrl=" + imageUrl + - ", mentionType=" + mentionType + - ", externalId='" + externalId + '\'' + - ", source='" + source + '\'' + - ", scrapedAt=" + scrapedAt + - ", version='" + version + '\'' + - '}'; - } -} diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MentionRepository.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MentionRepository.java deleted file mode 100644 index 5a1abee81..000000000 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/MentionRepository.java +++ /dev/null @@ -1,17 +0,0 @@ -// SPDX-FileCopyrightText: 2022 - 2023 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2022 - 2023 Netherlands eScience Center -// -// SPDX-License-Identifier: Apache-2.0 - -package nl.esciencecenter.rsd.scraper.doi; - -import java.util.Collection; - -public interface MentionRepository { - - Collection leastRecentlyScrapedMentions(int limit); - - Collection mentionData(Collection dois); - - void save(Collection mentions); -} diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitations.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitations.java index 2958dac86..1e997b8b8 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitations.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenAlexCitations.java @@ -10,11 +10,12 @@ import com.google.gson.JsonObject; import com.google.gson.JsonParser; import nl.esciencecenter.rsd.scraper.Utils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.net.URI; import java.net.http.HttpResponse; -import java.time.Instant; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -27,10 +28,17 @@ class OpenAlexCitations { - static final String DOI_FILTER_URL_UNFORMATTED = "https://api.openalex.org/works?filter=doi:%s"; + private static final Logger LOGGER = LoggerFactory.getLogger(OpenAlexCitations.class); - public Collection mentionData(Collection dataciteDois, String email) throws IOException, InterruptedException { - String filter = dataciteDois.stream().filter(Objects::nonNull).collect(Collectors.joining("|")); + static final String DOI_FILTER_URL_UNFORMATTED = "https://api.openalex.org/works?filter=doi:%s"; + static final String OPENALEX_ID_URL_UNFORMATTED = "https://api.openalex.org/works?filter=ids.openalex:%s"; + + public Collection mentionData(Collection dataciteDois, String email) throws IOException, InterruptedException { + String filter = dataciteDois + .stream() + .filter(Objects::nonNull) + .map(Doi::toString) + .collect(Collectors.joining("|")); String worksUri = DOI_FILTER_URL_UNFORMATTED.formatted(Utils.urlEncode(filter)) + "&per-page=200"; HttpResponse 
response; @@ -44,12 +52,11 @@ public Collection mentionData(Collection dataciteDois, St JsonArray citationsArray = tree .getAsJsonArray("results"); - Collection mentions = new ArrayList<>(); - Instant now = Instant.now(); + Collection mentions = new ArrayList<>(); for (JsonElement citation : citationsArray) { - MentionRecord citationAsMention; + ExternalMentionRecord citationAsMention; try { - citationAsMention = parseCitationAsMention(citation, now); + citationAsMention = parseCitationAsMention(citation); } catch (RuntimeException e) { Utils.saveExceptionInDatabase("OpenAlex mention scraper", "mention", null, e); continue; @@ -60,10 +67,15 @@ public Collection mentionData(Collection dataciteDois, St return mentions; } - public Collection citations(String doi, String email, UUID id) throws IOException, InterruptedException { + public Collection citations(OpenalexId openalexId, Doi doi, String email, UUID id) throws IOException, InterruptedException { + // This shouldn't happen, but let's check it to prevent unexpected exceptions: + if (doi == null && openalexId == null) { + return Collections.emptyList(); + } - String doiUrlEncoded = Utils.urlEncode(doi); - String worksUri = DOI_FILTER_URL_UNFORMATTED.formatted(doiUrlEncoded); + String worksUri = openalexId != null + ? OPENALEX_ID_URL_UNFORMATTED.formatted(openalexId.toUrlEncodedString()) + : DOI_FILTER_URL_UNFORMATTED.formatted(doi.toUrlEncodedString()); Optional optionalCitationsUri = citationsUri(worksUri, email); if (optionalCitationsUri.isEmpty()) { @@ -88,27 +100,38 @@ static Optional citationsUri(String worksUri, String email) throws IOExc .getAsJsonPrimitive("count") .getAsInt(); - if (count == 0 || count > 1) { + if (count < 1) { + LOGGER.warn("No results found for {}: {}", worksUri, count); return Optional.empty(); } - String citationsUri = tree - .getAsJsonArray("results") - .get(0) - .getAsJsonObject() - .getAsJsonPrimitive("cited_by_api_url") - .getAsString(); + if (count > 1) { + LOGGER.warn("More than 1 result found for {}: {}, taking the first", worksUri, count); + } + + String citationsUri = null; + try { + citationsUri = tree + .getAsJsonArray("results") + .get(0) + .getAsJsonObject() + .getAsJsonPrimitive("cited_by_api_url") + .getAsString(); + } catch (RuntimeException e) { + LOGGER.error("Exception parsing cited_by_api_url for %s".formatted(worksUri), e); + Utils.saveExceptionInDatabase("OpenAlex citations scraper", null, null, e); + } - return Optional.of(citationsUri); + return Optional.ofNullable(citationsUri); } // we use cursor paging as that will always work // https://docs.openalex.org/how-to-use-the-api/get-lists-of-entities/paging#cursor-paging - static Collection scrapeCitations(String citationsUri, String email, UUID id) throws IOException, InterruptedException { + static Collection scrapeCitations(String citationsUri, String email, UUID id) throws IOException, InterruptedException { final int perPage = 200; String cursor = "*"; - Collection citations = new ArrayList<>(); + Collection citations = new ArrayList<>(); while (cursor != null) { HttpResponse response; String citationsUriWithCursor = citationsUri + "&per-page=" + perPage + "&cursor=" + cursor; @@ -127,11 +150,10 @@ static Collection scrapeCitations(String citationsUri, String ema JsonArray citationsArray = tree .getAsJsonArray("results"); - Instant now = Instant.now(); for (JsonElement citation : citationsArray) { - MentionRecord citationAsMention; + ExternalMentionRecord citationAsMention; try { - citationAsMention = 
parseCitationAsMention(citation, now); + citationAsMention = parseCitationAsMention(citation); } catch (RuntimeException e) { Utils.saveExceptionInDatabase("Citation scraper", "mention", id, e); continue; @@ -143,70 +165,68 @@ static Collection scrapeCitations(String citationsUri, String ema return citations; } - static MentionRecord parseCitationAsMention(JsonElement element, Instant scrapedAt) { + static ExternalMentionRecord parseCitationAsMention(JsonElement element) { JsonObject citationObject = element.getAsJsonObject(); - MentionRecord mention = new MentionRecord(); - String doiUrl = Utils.stringOrNull(citationObject.get("doi")); - String doi = doiUrl; - if (doi != null) { - doi = doi.replace("https://doi.org/", ""); + String doiString = doiUrl; + if (doiString != null) { + doiString = doiString.replace("https://doi.org/", ""); } - mention.doi = doi; + Doi doi = doiString == null ? null : Doi.fromString(doiString); + URI url; if (doiUrl != null) { - mention.url = URI.create(doiUrl); + url = URI.create(doiUrl); } else { JsonArray locations = citationObject.getAsJsonArray("locations"); - mention.url = extractUrlFromLocation(locations); + url = extractUrlFromLocation(locations); } - mention.title = Utils.stringOrNull(citationObject.get("title")); - if (mention.title == null) { + String title = Utils.stringOrNull(citationObject.get("title")); + if (title == null) { String openAlexId = citationObject.getAsJsonPrimitive("id").getAsString(); - String message = "The title of the mention with DOI %s and OpenAlex ID %s is null".formatted(doi, openAlexId); + String message = "The title of the mention with DOI %s and OpenAlex ID %s is null".formatted(doiString, openAlexId); throw new RuntimeException(message); } JsonArray authorsArray = citationObject.getAsJsonArray("authorships"); - mention.authors = StreamSupport.stream(authorsArray.spliterator(), false) + String authors = StreamSupport.stream(authorsArray.spliterator(), false) .map(JsonElement::getAsJsonObject) .map(jo -> jo.get("raw_author_name")) .filter(Predicate.not(JsonElement::isJsonNull)) .map(JsonElement::getAsString) .collect(Collectors.joining(", ")); - if (mention.authors.isBlank()) { - mention.authors = null; + if (authors.isBlank()) { + authors = null; } - mention.publisher = null; - - mention.publicationYear = Utils.integerOrNull(citationObject.get("publication_year")); - - mention.doiRegistrationDate = null; - - mention.journal = null; - - mention.page = null; - - mention.imageUrl = null; + Integer publicationYear = Utils.integerOrNull(citationObject.get("publication_year")); String crossrefType = Utils.stringOrNull(citationObject.get("type_crossref")); - mention.mentionType = CrossrefMention.crossrefTypeMap.getOrDefault(crossrefType, MentionType.other); + MentionType mentionType = CrossrefMention.crossrefTypeMap.getOrDefault(crossrefType, MentionType.other); - mention.externalId = citationObject + String openalexIdString = citationObject .getAsJsonObject("ids") .getAsJsonPrimitive("openalex") .getAsString(); - - mention.source = "OpenAlex"; - - mention.scrapedAt = scrapedAt; - - mention.version = null; - - return mention; + OpenalexId openalexId = OpenalexId.fromString(openalexIdString); + + return new ExternalMentionRecord( + doi, + null, + openalexId, + url, + title, + authors, + null, + publicationYear, + null, + null, + mentionType, + "OpenAlex", + null + ); } static URI extractUrlFromLocation(JsonArray locations) { diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenalexId.java 
b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenalexId.java new file mode 100644 index 000000000..def269a88 --- /dev/null +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/OpenalexId.java @@ -0,0 +1,72 @@ +// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2024 Netherlands eScience Center +// +// SPDX-License-Identifier: Apache-2.0 + +package nl.esciencecenter.rsd.scraper.doi; + +import nl.esciencecenter.rsd.scraper.Utils; + +import java.util.Locale; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +// https://docs.openalex.org/how-to-use-the-api/get-single-entities#the-openalex-id +public class OpenalexId { + + private static final Pattern OPENALEX_PATTERN = Pattern.compile("^https://openalex\\.org/([WwAaSsIiCcPpFf]\\d{3,13})$"); + private static final String OPENALEX_ID_BASE = "https://openalex.org/"; + + private final String openalexKey; + + private OpenalexId(String openalexString) { + this.openalexKey = openalexString.toUpperCase(Locale.ROOT); + } + + public static boolean isValid(String idToTest) { + return idToTest != null && OPENALEX_PATTERN.asPredicate().test(idToTest); + } + + public static OpenalexId fromString(String id) { + if (id == null) { + throw new IllegalArgumentException("The ID cannot be null"); + } + Matcher matcher = OPENALEX_PATTERN.matcher(id); + + if (!matcher.find()) { + throw new IllegalArgumentException("This is an invalid OpenAlex ID"); + } + + String key = matcher.group(1); + return new OpenalexId(key); + } + + public String toUrlEncodedString() { + return Utils.urlEncode(toString()); + } + + @Override + public String toString() { + return OPENALEX_ID_BASE + openalexKey; + } + + @Override + public int hashCode() { + return openalexKey.hashCode(); + } + + @Override + public boolean equals(Object other) { + if (other == null) { + return false; + } + if (this == other) { + return true; + } + if (other instanceof OpenalexId otherOpenalexId) { + return openalexKey.equals(otherOpenalexId.openalexKey); + } + + return false; + } +} diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestCitationRepository.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestCitationRepository.java index 34a5ff60b..cad4b1ba0 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestCitationRepository.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestCitationRepository.java @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2023 Netherlands eScience Center +// SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center // // SPDX-License-Identifier: Apache-2.0 @@ -11,32 +11,30 @@ import com.google.gson.JsonParser; import nl.esciencecenter.rsd.scraper.Utils; -import java.time.ZonedDateTime; -import java.time.format.DateTimeFormatter; +import java.time.Instant; import java.util.ArrayList; import java.util.Collection; -import java.util.Objects; -import java.util.TreeSet; import java.util.HashSet; +import java.util.Objects; import java.util.UUID; /** - * This class provides access to the citation related tables via the Postgrest API. + * This class provides access to the citation related tables via the Postgrest API. */ public class PostgrestCitationRepository { - // The base URL of the backend. + // The base URL of the backend. 
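As a quick illustration of the OpenalexId value class introduced above: the work key is normalised to upper case on construction, so equality and hashing ignore case, isValid only accepts the full https://openalex.org/... form, and toUrlEncodedString() yields the encoded form used in query strings elsewhere in this patch. A usage sketch follows; the package line reflects the file paths in this diff, and the exact encoded output depends on Utils.urlEncode, which is not shown in this excerpt.

package nl.esciencecenter.rsd.scraper.doi;

class OpenalexIdUsageSketch {

	public static void main(String[] args) {
		// The key is stored upper-cased, so IDs differing only in case are equal.
		OpenalexId id = OpenalexId.fromString("https://openalex.org/w3160330321");
		System.out.println(id); // https://openalex.org/W3160330321
		System.out.println(id.equals(OpenalexId.fromString("https://openalex.org/W3160330321"))); // true

		// Bare work keys are rejected; the full URL form is required.
		System.out.println(OpenalexId.isValid("W3160330321")); // false

		// Encoded form used when the ID is put into OpenAlex or PostgREST query strings.
		System.out.println(id.toUrlEncodedString());
	}
}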
private final String backendUrl; - + public PostgrestCitationRepository(String backendUrl) { this.backendUrl = Objects.requireNonNull(backendUrl); } /** * Retrieve the least recently scraped reference papers from the database. - * + * * @param limit the maximum number of references to return - * @return A collection of citation data representing these reference papers. + * @return A collection of citation data representing these reference papers. */ public Collection leastRecentlyScrapedCitations(int limit) { String oneHourAgoFilter = Utils.atLeastOneHourAgoFilter("citations_scraped_at"); @@ -45,23 +43,23 @@ public Collection leastRecentlyScrapedCitations(int limit) { return parseJson(data); } - public void saveCitations(String backendUrl, UUID idCitedMention, Collection citingMentions, ZonedDateTime scrapedAt) { - String jsonPatch = "{\"citations_scraped_at\": \"%s\"}".formatted(scrapedAt.format(DateTimeFormatter.ISO_OFFSET_DATE_TIME)); + public void saveCitations(String backendUrl, UUID idCitedMention, Collection citingMentions, Instant scrapedAt) { + String jsonPatch = "{\"citations_scraped_at\": \"%s\"}".formatted(scrapedAt.toString()); Utils.patchAsAdmin(backendUrl + "/mention?id=eq." + idCitedMention.toString(), jsonPatch); JsonArray jsonArray = new JsonArray(); - // We sometimes encouter duplicate citations which may lead to the operation to fail. + // We sometimes encounter duplicate citations which may lead to the operation to fail. HashSet seen = new HashSet<>(); for (UUID citingMention : citingMentions) { if (citingMention != null) { - String citationID = citingMention.toString(); + String citationID = citingMention.toString(); if (!seen.contains(citationID)) { seen.add(citationID); - JsonObject jsonObject = new JsonObject(); + JsonObject jsonObject = new JsonObject(); jsonObject.addProperty("mention", idCitedMention.toString()); jsonObject.addProperty("citation", citationID); jsonArray.add(jsonObject); @@ -75,23 +73,26 @@ public void saveCitations(String backendUrl, UUID idCitedMention, Collection parseJson(String data) { - + JsonArray array = JsonParser.parseString(data).getAsJsonArray(); Collection result = new ArrayList<>(); for (JsonElement jsonElement : array) { JsonObject jsonObject = jsonElement.getAsJsonObject(); UUID id = UUID.fromString(jsonObject.getAsJsonPrimitive("id").getAsString()); - String doi = jsonObject.getAsJsonPrimitive("doi").getAsString(); + String doiString = Utils.stringOrNull(jsonObject.get("doi")); + Doi doi = doiString == null ? null : Doi.fromString(doiString); + String openalexIdString = Utils.stringOrNull(jsonObject.get("openalex_id")); + OpenalexId openalexId = openalexIdString == null ? 
null : OpenalexId.fromString(openalexIdString); - Collection knownDois = new TreeSet<>(String.CASE_INSENSITIVE_ORDER); - JsonArray doisArray = jsonObject.getAsJsonArray("known_dois"); + Collection knownDois = new HashSet<>(); + JsonArray doisArray = jsonObject.getAsJsonArray("known_citing_dois"); for (JsonElement element : doisArray) { - knownDois.add(element.getAsString()); + knownDois.add(Doi.fromString(element.getAsString())); } - result.add(new CitationData(id, doi, knownDois)); + result.add(new CitationData(id, doi, openalexId, knownDois)); } return result; diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestMentionRepository.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestMentionRepository.java index 768731c81..1b67a7068 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestMentionRepository.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestMentionRepository.java @@ -5,29 +5,21 @@ package nl.esciencecenter.rsd.scraper.doi; -import com.google.gson.FieldNamingPolicy; -import com.google.gson.Gson; -import com.google.gson.GsonBuilder; import com.google.gson.JsonArray; -import com.google.gson.JsonDeserializer; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; import com.google.gson.JsonParser; -import com.google.gson.JsonPrimitive; -import com.google.gson.JsonSerializer; -import com.google.gson.reflect.TypeToken; import nl.esciencecenter.rsd.scraper.Utils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.net.URI; import java.time.Instant; import java.time.ZonedDateTime; +import java.util.ArrayList; import java.util.Collection; import java.util.Objects; import java.util.UUID; -public class PostgrestMentionRepository implements MentionRepository { - - private static final Logger LOGGER = LoggerFactory.getLogger(PostgrestMentionRepository.class); +public class PostgrestMentionRepository { private final String backendUrl; @@ -35,102 +27,119 @@ public PostgrestMentionRepository(String backendUrl) { this.backendUrl = Objects.requireNonNull(backendUrl); } - static Collection parseJson(String data) { - return new GsonBuilder() - .setFieldNamingPolicy(FieldNamingPolicy.LOWER_CASE_WITH_UNDERSCORES) - .registerTypeAdapter(Instant.class, (JsonDeserializer) (json, typeOfT, context) -> Instant.parse(json.getAsString())) - .registerTypeAdapter(ZonedDateTime.class, (JsonDeserializer) (json, typeOfT, context) -> ZonedDateTime.parse(json.getAsString())) - .registerTypeAdapter(URI.class, (JsonDeserializer) (json, typeOfT, context) -> { - try { - return URI.create(json.getAsString()); - } catch (IllegalArgumentException e) { - LOGGER.warn("Could not create a URI of {} ", json.getAsString()); - return null; - } - }) - .create() - .fromJson(data, new TypeToken>() { - }.getType()); + static Collection parseMultipleRsdIds(String json) { + Collection result = new ArrayList<>(); + + JsonArray rootArray = JsonParser.parseString(json).getAsJsonArray(); + for (JsonElement jsonElement : rootArray) { + + JsonObject arrayEntry = jsonElement.getAsJsonObject(); + UUID id = UUID.fromString(arrayEntry.getAsJsonPrimitive("id").getAsString()); + String doiString = Utils.stringOrNull(arrayEntry.get("doi")); + Doi doi = doiString == null ? null : Doi.fromString(doiString); + String openalexIdString = Utils.stringOrNull(arrayEntry.get("openalex_id")); + OpenalexId openalexId = openalexIdString == null ? 
null : OpenalexId.fromString(openalexIdString); + + result.add(new RsdMentionIds(id, doi, openalexId)); + } + + return result; } - @Override - public Collection leastRecentlyScrapedMentions(int limit) { - String data = Utils.getAsAdmin(backendUrl + "/mention?doi=not.is.null&order=scraped_at.asc.nullsfirst&limit=" + limit); - return parseJson(data); + static RsdMentionIds parseSingleRsdIds(String json) { + JsonObject root = JsonParser.parseString(json).getAsJsonArray().get(0).getAsJsonObject(); + + UUID id = UUID.fromString(root.getAsJsonPrimitive("id").getAsString()); + String doiString = Utils.stringOrNull(root.get("doi")); + Doi doi = doiString == null ? null : Doi.fromString(doiString); + String openalexIdString = Utils.stringOrNull(root.get("openalex_id")); + OpenalexId openalexId = openalexIdString == null ? null : OpenalexId.fromString(openalexIdString); + + return new RsdMentionIds(id, doi, openalexId); + } + + public Collection leastRecentlyScrapedMentions(int limit) { + String data = Utils.getAsAdmin(backendUrl + "/mention?doi=not.is.null&order=scraped_at.asc.nullsfirst&select=id,doi,openalex_id&limit=" + limit); + return parseMultipleRsdIds(data); } - @Override - public Collection mentionData(Collection dois) { - throw new UnsupportedOperationException(); + public void saveScrapedAt(RsdMentionIds ids, Instant scrapedAt) { + JsonObject root = new JsonObject(); + root.addProperty("scraped_at", scrapedAt.toString()); + Utils.patchAsAdmin(backendUrl + "/mention?select=id,doi,openalex_id&id=eq." + ids.id(), root.toString(), "Prefer", "return=representation"); } - @Override - public void save(Collection mentions) { - Gson gson = new GsonBuilder() - .serializeNulls() - .setFieldNamingPolicy(FieldNamingPolicy.LOWER_CASE_WITH_UNDERSCORES) - .registerTypeAdapter(Instant.class, (JsonSerializer) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString())) - .registerTypeAdapter(ZonedDateTime.class, (JsonSerializer) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString())) - .create(); - - LOGGER.info("Will save {} mentions", mentions.size()); - - for (MentionRecord mention : mentions) { - String scrapedMentionJson = gson.toJson(mention); - String onConflictFilter; - - if (mention.doi != null) { - onConflictFilter = "doi"; - } else { - onConflictFilter = "external_id,source"; - } - - String uri = "%s/mention?on_conflict=%s&select=id".formatted(backendUrl, onConflictFilter); - String response; - - try { - LOGGER.debug("Saving mention: {} / {} / {}", mention.doi, mention.externalId, mention.source); - response = Utils.postAsAdmin(uri, scrapedMentionJson, "Prefer", "resolution=merge-duplicates,return=representation"); - - JsonArray responseAsArray = JsonParser.parseString(response).getAsJsonArray(); - // Used in MainCitations, do not remove - mention.id = UUID.fromString(responseAsArray.get(0).getAsJsonObject().getAsJsonPrimitive("id").getAsString()); - - } catch (RuntimeException e) { - - LOGGER.warn("Failed to save mention: {} / {} / {}", mention.doi, mention.externalId, mention.source, e); - - String metadataMessage = "Failed to save mention: DOI %s, external ID %s, source %s".formatted(mention.doi, mention.externalId, mention.source); - RuntimeException exceptionWithMetadata = new RuntimeException(metadataMessage, e); - if (mention.doi == null) { - Utils.saveExceptionInDatabase("Mention scraper", "mention", null, exceptionWithMetadata); - } else { - // We will try to update the scraped_at field, so that it goes back into the queue for being scraped - // Note that this operation in 
itself may also fail. - try { - String existingMentionResponse = Utils.getAsAdmin("%s/mention?doi=eq.%s&select=id".formatted(backendUrl, mention.doi)); - JsonArray array = JsonParser.parseString(existingMentionResponse).getAsJsonArray(); - if (array.size() == 1) { - String id = array.get(0).getAsJsonObject().getAsJsonPrimitive("id").getAsString(); - Utils.saveErrorMessageInDatabase(null, - "mention", - null, - id, - "id", - ZonedDateTime.now(), - "scraped_at"); - - Utils.saveExceptionInDatabase("Mention scraper", "mention", UUID.fromString(id), exceptionWithMetadata); - } else { - Utils.saveExceptionInDatabase("Mention scraper", "mention", null, exceptionWithMetadata); - } - } catch (Exception e2) { - LOGGER.warn("Failed to save exception in database", e2); - } - } - - } + public RsdMentionIds updateMention(RsdMentionRecord mention, boolean updateOpenAlexId) { + JsonObject root = createJsonFromMentionData(mention.content(), updateOpenAlexId); + root.addProperty("scraped_at", mention.scrapedAt().toString()); + String response = Utils.patchAsAdmin(backendUrl + "/mention?select=id,doi,openalex_id&id=eq." + mention.id(), root.toString(), "Prefer", "return=representation"); + return parseSingleRsdIds(response); + } + + public RsdMentionIds createMentionIfNotExistsOnDoiAndGetIds(ExternalMentionRecord mention, Instant scrapedAt) { + Doi doi = mention.doi(); + Objects.requireNonNull(doi); + Collection mentionsWithDoi = parseMultipleRsdIds(Utils.getAsAdmin(backendUrl + "/mention?select=id,doi,openalex_id&doi=eq." + doi.toUrlEncodedString())); + if (mentionsWithDoi.size() == 1) { + return mentionsWithDoi.iterator().next(); + } + + return createNewMention(mention, scrapedAt, false); + } + + public RsdMentionIds createOrUpdateMentionWithOpenalexId(ExternalMentionRecord mention, Instant scrapedAt) { + OpenalexId openalexId = Objects.requireNonNull(mention.openalexId()); + Doi doi = mention.doi(); + + String query = "/mention?select=id,doi,openalex_id"; + if (mention.doi() != null) { + query += "&or=(openalex_id.eq.%s,doi.eq.%s)".formatted(openalexId.toUrlEncodedString(), doi.toUrlEncodedString()); + } else { + query += "&openalex_id=eq.%s".formatted(openalexId.toUrlEncodedString()); + } + String existingMentionsResponse = Utils.getAsAdmin(backendUrl + query); + Collection existingIds = parseMultipleRsdIds(existingMentionsResponse); + + if (existingIds.size() > 1) { + throw new RuntimeException("Multiple entries with DOI %s or OpenAlex id %s exist, they should be merged".formatted(mention.doi(), openalexId)); + } + if (existingIds.size() == 1) { + UUID id = existingIds.iterator().next().id(); + return updateMention(new RsdMentionRecord(id, mention, scrapedAt), true); + } + + return createNewMention(mention, scrapedAt, true); + } + + private RsdMentionIds createNewMention(ExternalMentionRecord mention, Instant scrapedAt, boolean setOpenAlexId) { + JsonObject root = createJsonFromMentionData(mention, setOpenAlexId); + root.addProperty("scraped_at", scrapedAt.toString()); + String response = Utils.postAsAdmin(backendUrl + "/mention?select=id,doi,openalex_id", root.toString(), "Prefer", "return=representation"); + return parseSingleRsdIds(response); + } + static JsonObject createJsonFromMentionData(ExternalMentionRecord mention, boolean setOpenAlexId) { + JsonObject root = new JsonObject(); + Doi doi = mention.doi(); + root.addProperty("doi", doi == null ? 
null : mention.doi().toString()); + ZonedDateTime doiRegistrationDate = mention.doiRegistrationDate(); + root.addProperty("doi_registration_date", doiRegistrationDate == null ? null : doiRegistrationDate.toString()); + if (setOpenAlexId) { + root.addProperty("openalex_id", mention.openalexId().toString()); } + URI url = mention.url(); + root.addProperty("url", url == null ? null : mention.url().toString()); + root.addProperty("title", mention.title()); + root.addProperty("authors", mention.authors()); + root.addProperty("publisher", mention.publisher()); + root.addProperty("publication_year", mention.publicationYear()); + root.addProperty("journal", mention.journal()); + root.addProperty("page", mention.page()); + root.addProperty("mention_type", mention.mentionType().name()); + root.addProperty("source", mention.source()); + root.addProperty("version", mention.version()); + + return root; } } diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestReleaseRepository.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestReleaseRepository.java index 66e075036..51e13c3a8 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestReleaseRepository.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/PostgrestReleaseRepository.java @@ -9,6 +9,7 @@ import com.google.gson.Gson; import com.google.gson.GsonBuilder; import com.google.gson.JsonArray; +import com.google.gson.JsonDeserializer; import com.google.gson.JsonObject; import com.google.gson.reflect.TypeToken; import nl.esciencecenter.rsd.scraper.Config; @@ -17,9 +18,9 @@ import java.time.Instant; import java.util.ArrayList; import java.util.Collection; +import java.util.HashMap; import java.util.Map; import java.util.Objects; -import java.util.TreeMap; import java.util.UUID; public class PostgrestReleaseRepository { @@ -35,7 +36,7 @@ public Collection leastRecentlyScrapedReleases(int limit) { return parseJson(data); } - public void saveReleaseContent(Collection releaseData, Map> conceptDoiToDois, Map versionDoiToMentionId) { + public void saveReleaseContent(Collection releaseData, Map> conceptDoiToDois, Map versionDoiToMentionId) { // First update the releases_scraped_at column. JsonArray releasesBody = new JsonArray(); Instant now = Instant.now(); @@ -49,31 +50,35 @@ public void saveReleaseContent(Collection releaseData, Map> conceptDoiToSoftwareIds = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); + // For each scraped or existing version as a mention, we need to know its id in the mention table and the ids (plural, because multiple software entries can have the same concept DOI) of the software to which it belongs. 
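To make the comment above concrete: the loops that follow emit one coupling row per combination of software entry and versioned DOI, storing the software id under release_id and the mention-table id of the versioned DOI under mention_id. A sketch with hypothetical UUIDs (two software entries sharing one concept DOI that has two versioned DOIs gives four rows):

// Illustration only; the ids are made up, the property names come from the code below.
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;

import java.util.List;
import java.util.UUID;

class ReleaseCouplingSketch {

	public static void main(String[] args) {
		List<UUID> softwareIds = List.of(UUID.randomUUID(), UUID.randomUUID());       // entries sharing one concept DOI
		List<UUID> versionMentionIds = List.of(UUID.randomUUID(), UUID.randomUUID()); // mentions of its versioned DOIs

		JsonArray coupling = new JsonArray();
		for (UUID softwareId : softwareIds) {
			for (UUID mentionId : versionMentionIds) {
				JsonObject couple = new JsonObject();
				couple.addProperty("release_id", softwareId.toString());
				couple.addProperty("mention_id", mentionId.toString());
				coupling.add(couple);
			}
		}
		System.out.println(coupling); // four {release_id, mention_id} rows
	}
}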
+ Map> conceptDoiToSoftwareIds = new HashMap<>(); for (ReleaseData release : releaseData) { Collection softwareIds = conceptDoiToSoftwareIds.computeIfAbsent(release.conceptDoi, k -> new ArrayList<>()); softwareIds.add(release.softwareId); } - Map versionDoiToConceptDoi = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); - for (Map.Entry> conceptDoiToDoisEntry : conceptDoiToDois.entrySet()) { - String conceptDoi = conceptDoiToDoisEntry.getKey(); - Collection versionDois = conceptDoiToDoisEntry.getValue(); - for (MentionRecord version : versionDois) { - versionDoiToConceptDoi.put(version.doi, conceptDoi); + Map versionDoiToConceptDoi = new HashMap<>(); + for (Map.Entry> conceptDoiToDoisEntry : conceptDoiToDois.entrySet()) { + Doi conceptDoi = conceptDoiToDoisEntry.getKey(); + Collection versionDois = conceptDoiToDoisEntry.getValue(); + for (ExternalMentionRecord version : versionDois) { + versionDoiToConceptDoi.put(version.doi(), conceptDoi); } } JsonArray coupling = new JsonArray(); - for (Map.Entry entry : versionDoiToConceptDoi.entrySet()) { - String versionDoi = entry.getKey(); - String conceptDoi = entry.getValue(); + for (Map.Entry entry : versionDoiToConceptDoi.entrySet()) { + Doi versionDoi = entry.getKey(); + Doi conceptDoi = entry.getValue(); Collection softwareIds = conceptDoiToSoftwareIds.get(conceptDoi); for (UUID softwareId : softwareIds) { JsonObject couple = new JsonObject(); + UUID mentionId = versionDoiToMentionId.get(versionDoi); + if (mentionId == null) { + continue; + } couple.addProperty("release_id", softwareId.toString()); - couple.addProperty("mention_id", versionDoiToMentionId.get(versionDoi).toString()); + couple.addProperty("mention_id", mentionId.toString()); coupling.add(couple); } } @@ -82,8 +87,10 @@ public void saveReleaseContent(Collection releaseData, Map parseJson(String data) { + JsonDeserializer doiDeserializer = (json, type, context) -> Doi.fromString(json.getAsJsonPrimitive().getAsString()); Gson gson = new GsonBuilder() .setFieldNamingPolicy(FieldNamingPolicy.LOWER_CASE_WITH_UNDERSCORES) + .registerTypeAdapter(Doi.class, doiDeserializer) .create(); TypeToken> typeToken = new TypeToken<>() { }; diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/ReleaseData.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/ReleaseData.java index 45106375c..55807beb2 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/ReleaseData.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/ReleaseData.java @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2023 Netherlands eScience Center +// SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center // // SPDX-License-Identifier: Apache-2.0 @@ -12,8 +12,8 @@ public class ReleaseData { public UUID softwareId; public String slug; - public String conceptDoi; - public Collection versionedDois; + public Doi conceptDoi; + public Collection versionedDois; @Override public String toString() { diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/RsdMentionIds.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/RsdMentionIds.java new file mode 100644 index 000000000..81d529eb5 --- /dev/null +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/RsdMentionIds.java @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2024 
Netherlands eScience Center +// +// SPDX-License-Identifier: Apache-2.0 + +package nl.esciencecenter.rsd.scraper.doi; + +import java.util.UUID; + +public record RsdMentionIds( + UUID id, + Doi doi, + OpenalexId openalexId +) { +} diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/RsdMentionRecord.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/RsdMentionRecord.java new file mode 100644 index 000000000..237e96152 --- /dev/null +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/doi/RsdMentionRecord.java @@ -0,0 +1,16 @@ +// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2024 Netherlands eScience Center +// +// SPDX-License-Identifier: Apache-2.0 + +package nl.esciencecenter.rsd.scraper.doi; + +import java.time.Instant; +import java.util.UUID; + +public record RsdMentionRecord( + UUID id, + ExternalMentionRecord content, + Instant scrapedAt +) { +} diff --git a/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/DataciteMentionRepositoryTest.java b/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/DataciteMentionRepositoryTest.java index f0119a486..8441b4702 100644 --- a/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/DataciteMentionRepositoryTest.java +++ b/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/DataciteMentionRepositoryTest.java @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2022 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2022 Netherlands eScience Center +// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center // // SPDX-License-Identifier: Apache-2.0 @@ -15,10 +15,14 @@ public class DataciteMentionRepositoryTest { @Test public void givenCollectionOfStrings_whenJoining_thenCorrectStringReturned() { - Collection strings = List.of("abc", "def", "ghij"); + Doi doi1 = Doi.fromString("10.000/1"); + Doi doi2 = Doi.fromString("10.2/2"); + Doi doi3 = Doi.fromString("10.3/abc-def"); + Collection strings = List.of(doi1, doi2, doi3); - String joinedString = DataciteMentionRepository.joinCollection(strings); + String joinedString = DataciteMentionRepository.joinDoisForGraphqlQuery(strings); - Assertions.assertEquals("\"abc\",\"def\",\"ghij\"", joinedString); + String expected = "\"%s\",\"%s\",\"%s\"".formatted(doi1, doi2, doi3); + Assertions.assertEquals(expected, joinedString); } } diff --git a/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/DoiTest.java b/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/DoiTest.java new file mode 100644 index 000000000..58ff894e4 --- /dev/null +++ b/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/DoiTest.java @@ -0,0 +1,54 @@ +// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2024 Netherlands eScience Center +// +// SPDX-License-Identifier: Apache-2.0 + +package nl.esciencecenter.rsd.scraper.doi; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +class DoiTest { + + @ParameterizedTest + @ValueSource(strings = { + "10.2533/chimia.2024.525", + "10.1017/9781108881425", + "10.3390/photonics11070630", + "10.1093/gigascience/giad048", + "10.1007/978-3-030-83508-8_2", + "10.22541/essoar.171500959.99365288/v1", + "10.1016/j.eswa.2023.120561", + }) + void givenValidDoi_whenInstanceCreated_thenNoExceptionThrown(String 
validDoi) { + Doi doi = Assertions.assertDoesNotThrow(() -> Doi.fromString(validDoi)); + Assertions.assertNotNull(doi); + } + + @ParameterizedTest + @ValueSource(strings = { + "10.2533", + "10.2533/", + "https://doi.org/10.2533/chimia.2024.525", + "10.3390/photonics 11070630", + "10.3390/photonics11070630 ", + "11.1016/j.eswa.2023.120561", + "", + }) + void givenInValidDoi_whenCreatingInstance_thenExceptionThrown(String invalidDoi) { + Assertions.assertThrows(RuntimeException.class, () -> Doi.fromString(invalidDoi)); + } + + @Test + void givenTwoValidDoisThatOnlyDifferInCase_whenComparing_thenTheyAreEqual() { + String upperCaseDoi = "10.2533/chimia.2024.525"; + String lowerCaseDoi = "10.2533/CHIMIA.2024.525"; + + Doi doi1 = Doi.fromString(upperCaseDoi); + Doi doi2 = Doi.fromString(lowerCaseDoi); + + Assertions.assertEquals(doi1, doi2); + } +} diff --git a/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/MainMentionsTest.java b/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/MainMentionsTest.java index d60dfc854..f9acd80cb 100644 --- a/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/MainMentionsTest.java +++ b/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/MainMentionsTest.java @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2022 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2022 Netherlands eScience Center +// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center // // SPDX-License-Identifier: Apache-2.0 @@ -43,7 +43,7 @@ void givenValidDoiSourceData_whenParsing_thenMapReturned() { } ]"""; - Map doiToSource = MainMentions.parseJsonSources(validDoiSourceData); + Map doiToSource = MainMentions.parseJsonDoiSources(validDoiSourceData); Assertions.assertEquals(6, doiToSource.size()); Assertions.assertEquals("EIDR", doiToSource.get("10.5240/B1FA-0EEC-C316-3316-3A73-L")); Assertions.assertEquals("Invalid DOI", doiToSource.get("notADoi")); diff --git a/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/OpenalexIdTest.java b/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/OpenalexIdTest.java new file mode 100644 index 000000000..0b20e9a4d --- /dev/null +++ b/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/doi/OpenalexIdTest.java @@ -0,0 +1,49 @@ +// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2024 Netherlands eScience Center +// +// SPDX-License-Identifier: Apache-2.0 + +package nl.esciencecenter.rsd.scraper.doi; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +class OpenalexIdTest { + + @ParameterizedTest + @ValueSource(strings = { + "https://openalex.org/W3160330321", + "https://openalex.org/w3160330321", + "https://openalex.org/W152867311", + }) + void givenValidOpenalexId_whenInstanceCreated_thenNoExceptionThrown(String validId) { + OpenalexId openalexId = Assertions.assertDoesNotThrow(() -> OpenalexId.fromString(validId)); + Assertions.assertNotNull(openalexId); + } + + @ParameterizedTest + @ValueSource(strings = { + "http://openalex.org/W3160330321", + "https://openalex.org/3160330321", + "https://openalex.org/W3160330321/", + "https://openalex.org/works/W3160330321", + "W3160330321", + "", + }) + void givenInValidOpenalexId_whenCreatingInstance_thenExceptionThrown(String invalidId) { + Assertions.assertThrows(RuntimeException.class, () 
-> OpenalexId.fromString(invalidId)); + } + + @Test + void givenTwoValidIdsThatOnlyDifferInCase_whenComparing_thenTheyAreEqual() { + String upperCaseId = "https://openalex.org/W3160330321"; + String lowerCaseId = "https://openalex.org/w3160330321"; + + OpenalexId openalexId1 = OpenalexId.fromString(upperCaseId); + OpenalexId openalexId2 = OpenalexId.fromString(lowerCaseId); + + Assertions.assertEquals(openalexId1, openalexId2); + } +}
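The DoiTest and OpenalexIdTest cases above pin down how the two new identifier types behave. For completeness, a hedged usage sketch of Doi, inferred from those tests and from call sites such as doi.toUrlEncodedString() in PostgrestMentionRepository; Doi.java itself is added by this patch but is not part of this excerpt, so details may differ.

package nl.esciencecenter.rsd.scraper.doi;

class DoiUsageSketch {

	public static void main(String[] args) {
		// Equality ignores case, as asserted in DoiTest.
		Doi a = Doi.fromString("10.2533/chimia.2024.525");
		Doi b = Doi.fromString("10.2533/CHIMIA.2024.525");
		System.out.println(a.equals(b)); // true

		// Encoded form used in PostgREST filters and the OpenAlex DOI filter.
		System.out.println(a.toUrlEncodedString());

		// Resolver URLs and other malformed input fail fast with an exception.
		try {
			Doi.fromString("https://doi.org/10.2533/chimia.2024.525");
		} catch (RuntimeException e) {
			System.out.println("pass the bare DOI, not the resolver URL");
		}
	}
}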