Skip to content

Commit

Permalink
Merge pull request #1306 from research-software-directory/1291-scrape…
Browse files Browse the repository at this point in the history
…-openalex-citations

feat: allow harvesting citations of OpenAlex reference papers
  • Loading branch information
ewan-escience authored Oct 2, 2024
2 parents 6193b6f + 4175b82 commit ee47d07
Show file tree
Hide file tree
Showing 37 changed files with 849 additions and 542 deletions.
5 changes: 2 additions & 3 deletions database/011-create-mention-table.sql
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ CREATE TABLE mention (
id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
doi CITEXT UNIQUE CHECK (doi ~ '^10(\.\w+)+/\S+$' AND LENGTH(doi) <= 255),
doi_registration_date TIMESTAMPTZ,
openalex_id CITEXT UNIQUE CHECK (openalex_id ~ '^https://openalex\.org/[WwAaSsIiCcPpFf]\d{3,13}$'),
url VARCHAR(500) CHECK (url ~ '^https?://'),
title VARCHAR(3000) NOT NULL,
authors VARCHAR(50000),
Expand All @@ -40,15 +41,13 @@ CREATE TABLE mention (
page VARCHAR(50),
image_url VARCHAR(500) CHECK (image_url ~ '^https?://'),
mention_type mention_type NOT NULL,
external_id VARCHAR(500),
source VARCHAR(50) NOT NULL,
version VARCHAR(100),
note VARCHAR(500),
scraped_at TIMESTAMPTZ,
citations_scraped_at TIMESTAMPTZ,
created_at TIMESTAMPTZ NOT NULL,
updated_at TIMESTAMPTZ NOT NULL,
UNIQUE(external_id, source)
updated_at TIMESTAMPTZ NOT NULL
);

CREATE FUNCTION sanitise_insert_mention() RETURNS TRIGGER LANGUAGE plpgsql AS
Expand Down
10 changes: 6 additions & 4 deletions database/104-software-views.sql
Original file line number Diff line number Diff line change
Expand Up @@ -56,18 +56,20 @@ CREATE FUNCTION reference_papers_to_scrape()
RETURNS TABLE (
id UUID,
doi CITEXT,
openalex_id CITEXT,
citations_scraped_at TIMESTAMPTZ,
known_dois CITEXT[]
known_citing_dois CITEXT[]
)
LANGUAGE sql STABLE AS
$$
SELECT mention.id, mention.doi, mention.citations_scraped_at, ARRAY_REMOVE(ARRAY_AGG(citation.doi), NULL)
SELECT mention.id, mention.doi, mention.openalex_id, mention.citations_scraped_at, ARRAY_REMOVE(ARRAY_AGG(citation.doi), NULL)
FROM mention
LEFT JOIN citation_for_mention ON mention.id = citation_for_mention.mention
LEFT JOIN mention AS citation ON citation_for_mention.citation = citation.id
WHERE
-- ONLY items with DOI
mention.doi IS NOT NULL AND (
-- ONLY items with DOI or OpenAlex id
(mention.doi IS NOT NULL OR mention.openalex_id IS NOT NULL)
AND (
mention.id IN (
SELECT mention FROM reference_paper_for_software
)
Expand Down
6 changes: 3 additions & 3 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ version: "3.0"
services:
database:
build: ./database
image: rsd/database:2.4.1
image: rsd/database:2.5.0
ports:
# enable connection from outside (development mode)
- "5432:5432"
Expand Down Expand Up @@ -110,7 +110,7 @@ services:
# dockerfile to use for build
dockerfile: Dockerfile
# update version number to correspond to frontend/package.json
image: rsd/frontend:2.19.0
image: rsd/frontend:2.19.1
environment:
# it uses values from .env file
- POSTGREST_URL
Expand Down Expand Up @@ -158,7 +158,7 @@ services:

scrapers:
build: ./scrapers
image: rsd/scrapers:1.8.1
image: rsd/scrapers:1.9.0
environment:
# it uses values from .env file
- POSTGREST_URL
Expand Down
8 changes: 4 additions & 4 deletions documentation/docs/01-users/05-adding-software.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ When using a Document URL to point to a remote Markdown file on the GitHub, you

### Logo

The software logo is shown on the software page and in the software card (see example below). **You can upload an image up to 2MB of size**. Widely used image formats like jpg, jpeg, png, svg etc. are supported. Use the **svg** format, if possible, because it scales better than other formats.
The software logo is shown on the software page and in the software card (see example below). **You can upload an image up to 2MB in size**. Widely used image formats like JPG, JPEG, PNG, SVG etc. are supported. Use the **SVG** format, if possible, because it scales better than other formats.

![image](img/software-logo-card.webp)

Expand Down Expand Up @@ -164,14 +164,14 @@ This section allows you to add mentions to your software page. You can use this

### Reference papers

Use the *Search* box on the right hand side to find papers by DOI or title. All the relevant data about the publication will be retrieved automatically. A background scraper will use [OpenAlex](https://openalex.org/) to collect all citations of the reference papers.
Use the *Search* box on the right-hand side to find papers by DOI or title. All the relevant data about the publication will be retrieved automatically. A background scraper will use [OpenAlex](https://openalex.org/) to collect all citations of reference papers that have a DOI or an OpenAlex ID.

### Citations

All the results RSD scraper was able to find on [OpenAlex](https://openalex.org/) citing provided reference papers. It can take a few minutes before the citations are harvested.
These are the citations of the reference papers that the RSD scraper was able to find on [OpenAlex](https://openalex.org/). It can take a few minutes before the citations are harvested.

:::warning
You cannot edit this section. All entries are automatically generated by the RSD scraper service. The found mentions are displayed in the mentions section of the software page.
You cannot edit the content of this section. All entries are automatically harvested and generated by the RSD scraper. The mentions found are displayed in the mentions section of the software page.
:::

### Related output
Expand Down
10 changes: 5 additions & 5 deletions documentation/docs/01-users/07-adding-projects.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ After signing in, use the **"+"** button next to your avatar icon on the top rig

![image](img/new-project.gif)

The RSD will automatically generate a *slug* for your project based on the project name you have provided. This slug will become part of the URL on which your project page can found.
The RSD will automatically generate a *slug* for your project based on the project name you have provided. This slug will become part of the URL on which your project page can be found.
There is a small chance the generated slug is already in use by another project. If this is the case, an error will be shown, and you will need to change the slug manually to resolve this conflict.

Once you click **"save"**, the RSD will initialize a new empty project page. This page will not be public yet to give you some time to provide additional information. Next, you can add additional information in the edit sections explained below.
Expand Down Expand Up @@ -106,16 +106,16 @@ You can import up to 50 publications by providing a list of DOIs, one per line.
If the output has no DOI you can create a new mention item manually. Each manually added item should at least have a **Title**, **Type** and **URL**. All other fields are optional. The **Note** field can be used to add a note to this item, and will not be shown on the project page.

:::warning
Please check if the information is complete and correct. A manual item can not be edited after it has been saved!
Please check if the information is complete and correct. A manual item cannot be edited after it has been saved!
You can, however, delete an item and create a new one.
:::

### Citations

Here we list all the citations of your output that the RSD was able to find automatically by using the DOIs of your output and OpenAlex. On the project page these citations are shown in the impact section together with the items you added manually.
Here, we list all the citations of your output (that has a DOI or OpenAlex ID) that the RSD was able to find automatically on OpenAlex. On the project page, these citations are shown in the impact section together with the items you added manually.

:::warning
You cannot edit this section. All entries are automatically generated by the RSD scraper service. Found publications are displayed in the impact section of the project page.
You cannot edit the content of this section. All entries are automatically harvested and generated by the RSD scraper. The publications found are displayed in the impact section of the project page.
:::

### Impact
Expand All @@ -135,7 +135,7 @@ You can import up to 50 publications by providing a list of DOIs, one per line.
If the publication has no DOI you can create a new item manually. Each manually added item should at least have a **Title**, **Type** and **URL**. All other fields are optional. The **Note** field can be used to add a note to this item, and will not be shown on the project page.

:::warning
Please check if the information is complete and correct. A manual item can not be edited after it has been saved! You can, however, delete an item and create a new one.
Please check if the information is complete and correct. A manual item cannot be edited after it has been saved! You can, however, delete an item and create a new one.
:::

## Related projects
Expand Down
2 changes: 1 addition & 1 deletion documentation/docs/03-rsd-instance/03-administration.md
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ fill the `provenance_iri` column. Further read [Linked Data](https://en.wikipedi

## Mentions

In this section, admins can search for mentions and edit them. If you enter a DOI or UUID, we search on that field only. Otherwise, we search on title, authors, journal, URL, note and external ID (like an OpenAlex ID).
In this section, admins can search for mentions and edit them. If you enter a DOI or UUID, we search on that field only. Otherwise, we search on title, authors, journal, URL, note and OpenAlex ID.

:::warning
Edit mentions with care: they might be referenced in multiple places. If you want to fully change a mention attached to e.g. a software page, you should delete it there and create a new one instead of editing it.
Expand Down
2 changes: 1 addition & 1 deletion frontend/components/admin/mentions/MentionsOverview.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ export default function MentionsOverview() {
if (searchTypeTerm.type === 'doi') {
return `doi=eq.${termEscaped}`
}
return `or=(title.ilike.*${termEscaped}*,authors.ilike.*${termEscaped}*,journal.ilike.*${termEscaped}*,url.ilike.*${termEscaped}*,note.ilike.*${termEscaped}*,external_id.ilike.*${termEscaped}*)`
return `or=(title.ilike.*${termEscaped}*,authors.ilike.*${termEscaped}*,journal.ilike.*${termEscaped}*,url.ilike.*${termEscaped}*,note.ilike.*${termEscaped}*,openalex_id.ilike.*${termEscaped}*)`
}

function sanitiseSearch(search: string): string | undefined {
Expand Down
11 changes: 5 additions & 6 deletions frontend/components/mention/EditMentionModal.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -291,14 +291,13 @@ export default function EditMentionModal({open, onCancel, onSubmit, item, pos, t
<ControlledTextField
control={control}
options={{
name: 'external_id',
label: config.external_id.label,
name: 'openalex_id',
label: config.openalex_id.label,
useNull: true,
defaultValue: formData?.external_id,
helperTextMessage: config.external_id.help,
helperTextCnt: `${formData?.external_id?.length || 0}/${config.external_id.validation.maxLength.value}`,
defaultValue: formData?.openalex_id,
helperTextMessage: config.openalex_id.help,
}}
rules={config.external_id.validation}
rules={config.openalex_id.validation}
/>
<div className="py-2"></div>
</>
Expand Down
12 changes: 6 additions & 6 deletions frontend/components/mention/config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -135,14 +135,14 @@ export const mentionModal = {
}
}
},
external_id: {
label: 'External ID',
help: 'An ID used by e.g. OpenAlex',
openalex_id: {
label: 'OpenAlex ID',
help: 'The OpenAlex ID',
validation: {
required: false,
maxLength: {
value: 500,
message: 'Maximum length is 500'
pattern: {
value: /^https:\/\/openalex\.org\/[WwAaSsIiCcPpFf]\d{3,13}$/,
message: 'e.g. https://openalex.org/W3160330321'
}
}
},
Expand Down
2 changes: 1 addition & 1 deletion frontend/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "rsd-frontend",
"version": "2.19.0",
"version": "2.19.1",
"private": true,
"scripts": {
"dev": "next dev",
Expand Down
2 changes: 1 addition & 1 deletion frontend/package.json.license
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ SPDX-FileCopyrightText: 2021 - 2023 dv4all
SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center
SPDX-FileCopyrightText: 2022 Jesús García Gonzalez (Netherlands eScience Center) <[email protected]>
SPDX-FileCopyrightText: 2023 - 2024 Dusan Mijatovic (Netherlands eScience Center)
SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]>
SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>

SPDX-License-Identifier: Apache-2.0
SPDX-License-Identifier: CC-BY-4.0
4 changes: 2 additions & 2 deletions frontend/types/Mention.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@ export type MentionItemProps = {
mention_type: MentionTypeKeys | null
source: string
note?: string | null
external_id?: string | null
openalex_id?: string | null
}

export const mentionColumns ='id,doi,url,title,authors,publisher,publication_year,journal,page,image_url,mention_type,source,note'
export const mentionColumns ='id,doi,openalex_id,url,title,authors,publisher,publication_year,journal,page,image_url,mention_type,source,note'

export type MentionByType = {
[key in MentionTypeKeys]?: MentionItemProps[]
Expand Down
17 changes: 11 additions & 6 deletions scrapers/src/main/java/nl/esciencecenter/rsd/scraper/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ public static String getAsAdmin(String uri) {
Thread.currentThread().interrupt();
throw new RuntimeException(e);
} catch (IOException e) {
LOGGER.warn("An error occurred sending a request to {}", uri, e);
LOGGER.error("An error occurred sending a request to {}", uri, e);
throw new RuntimeException(e);
}

Expand Down Expand Up @@ -188,7 +188,9 @@ public static String postAsAdmin(String uri, String json, String... extraHeaders
.timeout(DEFAULT_TIMEOUT)
.header("Content-Type", "application/json")
.header("Authorization", "Bearer " + jwtString);
if (extraHeaders != null && extraHeaders.length > 0) builder.headers(extraHeaders);
if (extraHeaders != null && extraHeaders.length > 0) {
builder.headers(extraHeaders);
}
HttpRequest request = builder.build();
HttpResponse<String> response;

Expand Down Expand Up @@ -276,15 +278,18 @@ static String createPatchUri(String baseuri, String tableName, String primaryKey
return "%s/%s?%s=eq.%s".formatted(baseuri, tableName, primaryKeyName, primaryKey);
}

public static String patchAsAdmin(String uri, String json) {
public static String patchAsAdmin(String uri, String json, String... extraHeaders) {
String jwtString = adminJwt();
HttpRequest request = HttpRequest.newBuilder()
HttpRequest.Builder builder = HttpRequest.newBuilder()
.method("PATCH", HttpRequest.BodyPublishers.ofString(json))
.uri(URI.create(uri))
.timeout(Duration.ofSeconds(30))
.header("Content-Type", "application/json")
.header("Authorization", "Bearer " + jwtString)
.build();
.header("Authorization", "Bearer " + jwtString);
if (extraHeaders != null && extraHeaders.length > 0) {
builder.headers(extraHeaders);
}
HttpRequest request = builder.build();
HttpResponse<String> response;
try (HttpClient client = HttpClient.newHttpClient()) {
response = client.send(request, HttpResponse.BodyHandlers.ofString());
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2023 Netherlands eScience Center
// SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -8,31 +8,20 @@
import java.util.Collection;
import java.util.UUID;

/**
/**
* Container class for Citation information retrieved from the database.
*/
public class CitationData {
public record CitationData(
// UUID of this entry in the database
UUID id,

// UUID of this entry in the database
public final UUID id;

// DOI of this entry.
public final String doi;

// List of known DOIs citing this entry.
public final Collection<String> knownDois;

/**
* Create a CitationData and initialize with data provided.
*
* @param id of this entry in the database
* @param doi of this entry
* @param knownDois list of known DOIs citing this entry
*/
public CitationData(UUID id, String doi, Collection<String> knownDois) {
super();
this.id = id;
this.doi = doi;
this.knownDois = knownDois;
}
// DOI of this entry.
Doi doi,

// OpenAlex ID of this entry
OpenalexId openalexId,

// List of known DOIs citing this entry.
Collection<Doi> knownDois
) {
}
Loading

0 comments on commit ee47d07

Please sign in to comment.