Skip to content

Commit

Permalink
Merge pull request #1137 from research-software-directory/scraper-fixes
Browse files (browse the repository at this point in the history)
Scraper fixes
  • Loading branch information
ewan-escience authored Mar 1, 2024
2 parents 40db234 + 1335b0d commit e82e831
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 29 deletions.
2 changes: 2 additions & 0 deletions deployment/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ services:
restart: unless-stopped

codemeta:
container_name: codemeta
image: ghcr.io/research-software-directory/rsd-saas/codemeta:latest
expose:
- "8000"
Expand All @@ -175,6 +176,7 @@ services:
restart: unless-stopped

swagger:
container_name: swagger
image: swaggerapi/swagger-ui:v4.15.0
expose:
- "8080"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// SPDX-FileCopyrightText: 2023 Dusan Mijatovic (dv4all)
// SPDX-FileCopyrightText: 2023 dv4all
// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2024 Netherlands eScience Center
//
// SPDX-License-Identifier: Apache-2.0

Expand Down Expand Up @@ -133,7 +135,7 @@ export async function validateInputList(doiList: string[], mentions: MentionItem
mentionResultPerDoi.set(doi, {
doi,
status: 'valid',
source: 'Crossref',
source: 'DataCite',
include: true,
mention
})
Expand Down
10 changes: 7 additions & 3 deletions frontend/types/Datacite.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
// SPDX-FileCopyrightText: 2022 Dusan Mijatovic (dv4all)
// SPDX-FileCopyrightText: 2022 dv4all
// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2024 Netherlands eScience Center
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -24,8 +26,8 @@ type DoiAttributes = {
]
referenceCount: number
citationCount: number
// and many more but we are initially interesed in these
// see exapleResponse for more or response from
// and many more, but we are initially interested in these
// see exampleResponse for more or response from
// https://api.datacite.org/dois/10.5281/zenodo.1051064
// documentation https://support.datacite.org/reference/get_dois-id
}
Expand Down Expand Up @@ -92,7 +94,9 @@ const exampleWork = {
'title': 'CellProfiler and KNIME: Open-Source Tools for High-Content Screening'
}
],
'publisher': 'Springer Science and Business Media LLC',
'publisher': {
'name': 'Springer Science and Business Media LLC'
},
'publicationYear': 2019,
'creators': [
{
Expand Down
20 changes: 13 additions & 7 deletions frontend/utils/getDataCite.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// SPDX-FileCopyrightText: 2022 - 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2023 Netherlands eScience Center
// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center
// SPDX-FileCopyrightText: 2022 Dusan Mijatovic (dv4all)
// SPDX-FileCopyrightText: 2022 dv4all
//
Expand Down Expand Up @@ -27,7 +27,9 @@ function graphQLDoiQuery(doi:string) {
descriptions(first:1){
description
},
publisher,
publisher {
name
},
publicationYear,
creators{
givenName,
Expand Down Expand Up @@ -66,7 +68,9 @@ function graphQLDoisQuery(dois: string[]) {
descriptions(first:1){
description
},
publisher,
publisher {
name
},
publicationYear,
creators{
givenName,
Expand Down Expand Up @@ -118,7 +122,9 @@ function gqlWorksByTitleQuery(title: string) {
descriptions(first:1){
description
},
publisher,
publisher {
name
},
publicationYear,
creators{
givenName,
Expand Down Expand Up @@ -172,7 +178,7 @@ export function dataCiteGraphQLItemToMentionItem(item: WorkResponse) {
url: makeDoiRedirectUrl(item.doi),
title: item.titles[0].title,
authors: extractAuthors(item),
publisher: item.publisher,
publisher: item.publisher.name,
publication_year: item.publicationYear,
journal: null,
page: null,
Expand Down Expand Up @@ -252,7 +258,7 @@ export async function getDataciteItemsByDoiGraphQL(dois: string[]) {

export async function getDataciteItemsByTitleGraphQL(title: string) {
try {
const query = gqlWorksByTitleQuery(title)
const query = gqlWorksByTitleQuery(title.replace(':', '\\\\:'))
const url = 'https://api.datacite.org/graphql'

const resp = await fetch(url, {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// SPDX-FileCopyrightText: 2022 - 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2023 Netherlands eScience Center
// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -10,6 +10,8 @@
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import nl.esciencecenter.rsd.scraper.Utils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.net.URI;
import java.time.Instant;
Expand All @@ -25,9 +27,6 @@
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class DataciteMentionRepository implements MentionRepository {

private static final Logger LOGGER = LoggerFactory.getLogger(DataciteMentionRepository.class);
Expand All @@ -49,7 +48,9 @@ public class DataciteMentionRepository implements MentionRepository {
titles(first: 1) {
title
}
publisher
publisher {
name
}
publicationYear
registered
creators {
Expand Down Expand Up @@ -154,7 +155,7 @@ static MentionRecord parseWork(JsonObject work) {
}
result.authors = String.join(", ", authors);

result.publisher = Utils.stringOrNull(work.get("publisher"));
result.publisher = Utils.stringOrNull(work.getAsJsonObject("publisher").get("name"));
result.publicationYear = Utils.integerOrNull(work.get("publicationYear"));
String doiRegistrationDateString = Utils.stringOrNull(work.get("registered"));
if (doiRegistrationDateString != null) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2023 Netherlands eScience Center
// SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -20,6 +20,9 @@
import java.util.Collections;
import java.util.Optional;
import java.util.UUID;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

public class OpenAlexCitations {

Expand Down Expand Up @@ -135,16 +138,15 @@ static MentionRecord parseCitationAsMention(JsonElement element, Instant scraped
}

JsonArray authorsArray = citationObject.getAsJsonArray("authorships");
Collection<String> authors = new ArrayList<>();
for (JsonElement jsonElement : authorsArray) {
authors.add(
jsonElement
.getAsJsonObject()
.getAsJsonPrimitive("raw_author_name")
.getAsString()
);
mention.authors = StreamSupport.stream(authorsArray.spliterator(), false)
.map(JsonElement::getAsJsonObject)
.map(jo -> jo.get("raw_author_name"))
.filter(Predicate.not(JsonElement::isJsonNull))
.map(JsonElement::getAsString)
.collect(Collectors.joining(", "));
if (mention.authors.isBlank()) {
mention.authors = null;
}
mention.authors = String.join(", ", authors);

mention.publisher = null;

Expand Down

0 comments on commit e82e831

Please sign in to comment.