-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1137 from research-software-directory/scraper-fixes
Scraper fixes
- Loading branch information
Showing 6 changed files with 46 additions and 29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,7 @@ | ||
// SPDX-FileCopyrightText: 2023 Dusan Mijatovic (dv4all) | ||
// SPDX-FileCopyrightText: 2023 dv4all | ||
// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]> | ||
// SPDX-FileCopyrightText: 2024 Netherlands eScience Center | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
|
@@ -133,7 +135,7 @@ export async function validateInputList(doiList: string[], mentions: MentionItem | |
mentionResultPerDoi.set(doi, { | ||
doi, | ||
status: 'valid', | ||
source: 'Crossref', | ||
source: 'DataCite', | ||
include: true, | ||
mention | ||
}) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,7 @@ | ||
// SPDX-FileCopyrightText: 2022 Dusan Mijatovic (dv4all) | ||
// SPDX-FileCopyrightText: 2022 dv4all | ||
// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]> | ||
// SPDX-FileCopyrightText: 2024 Netherlands eScience Center | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
|
@@ -24,8 +26,8 @@ type DoiAttributes = { | |
] | ||
referenceCount: number | ||
citationCount: number | ||
// and many more but we are initially interesed in these | ||
// see exapleResponse for more or response from | ||
// and many more, but we are initially interested in these | ||
// see exampleResponse for more or response from | ||
// https://api.datacite.org/dois/10.5281/zenodo.1051064 | ||
// documentation https://support.datacite.org/reference/get_dois-id | ||
} | ||
|
@@ -92,7 +94,9 @@ const exampleWork = { | |
'title': 'CellProfiler and KNIME: Open-Source Tools for High-Content Screening' | ||
} | ||
], | ||
'publisher': 'Springer Science and Business Media LLC', | ||
'publisher': { | ||
'name': 'Springer Science and Business Media LLC' | ||
}, | ||
'publicationYear': 2019, | ||
'creators': [ | ||
{ | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
// SPDX-FileCopyrightText: 2022 - 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]> | ||
// SPDX-FileCopyrightText: 2022 - 2023 Netherlands eScience Center | ||
// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]> | ||
// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center | ||
// SPDX-FileCopyrightText: 2022 Dusan Mijatovic (dv4all) | ||
// SPDX-FileCopyrightText: 2022 dv4all | ||
// | ||
|
@@ -27,7 +27,9 @@ function graphQLDoiQuery(doi:string) { | |
descriptions(first:1){ | ||
description | ||
}, | ||
publisher, | ||
publisher { | ||
name | ||
}, | ||
publicationYear, | ||
creators{ | ||
givenName, | ||
|
@@ -66,7 +68,9 @@ function graphQLDoisQuery(dois: string[]) { | |
descriptions(first:1){ | ||
description | ||
}, | ||
publisher, | ||
publisher { | ||
name | ||
}, | ||
publicationYear, | ||
creators{ | ||
givenName, | ||
|
@@ -118,7 +122,9 @@ function gqlWorksByTitleQuery(title: string) { | |
descriptions(first:1){ | ||
description | ||
}, | ||
publisher, | ||
publisher { | ||
name | ||
}, | ||
publicationYear, | ||
creators{ | ||
givenName, | ||
|
@@ -172,7 +178,7 @@ export function dataCiteGraphQLItemToMentionItem(item: WorkResponse) { | |
url: makeDoiRedirectUrl(item.doi), | ||
title: item.titles[0].title, | ||
authors: extractAuthors(item), | ||
publisher: item.publisher, | ||
publisher: item.publisher.name, | ||
publication_year: item.publicationYear, | ||
journal: null, | ||
page: null, | ||
|
@@ -252,7 +258,7 @@ export async function getDataciteItemsByDoiGraphQL(dois: string[]) { | |
|
||
export async function getDataciteItemsByTitleGraphQL(title: string) { | ||
try { | ||
const query = gqlWorksByTitleQuery(title) | ||
const query = gqlWorksByTitleQuery(title.replace(':', '\\\\:')) | ||
const url = 'https://api.datacite.org/graphql' | ||
|
||
const resp = await fetch(url, { | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
// SPDX-FileCopyrightText: 2022 - 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]> | ||
// SPDX-FileCopyrightText: 2022 - 2023 Netherlands eScience Center | ||
// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]> | ||
// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
|
@@ -10,6 +10,8 @@ | |
import com.google.gson.JsonObject; | ||
import com.google.gson.JsonParser; | ||
import nl.esciencecenter.rsd.scraper.Utils; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.net.URI; | ||
import java.time.Instant; | ||
|
@@ -25,9 +27,6 @@ | |
import java.util.regex.Pattern; | ||
import java.util.stream.Collectors; | ||
|
||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
public class DataciteMentionRepository implements MentionRepository { | ||
|
||
private static final Logger LOGGER = LoggerFactory.getLogger(DataciteMentionRepository.class); | ||
|
@@ -49,7 +48,9 @@ public class DataciteMentionRepository implements MentionRepository { | |
titles(first: 1) { | ||
title | ||
} | ||
publisher | ||
publisher { | ||
name | ||
} | ||
publicationYear | ||
registered | ||
creators { | ||
|
@@ -154,7 +155,7 @@ static MentionRecord parseWork(JsonObject work) { | |
} | ||
result.authors = String.join(", ", authors); | ||
|
||
result.publisher = Utils.stringOrNull(work.get("publisher")); | ||
result.publisher = Utils.stringOrNull(work.getAsJsonObject("publisher").get("name")); | ||
result.publicationYear = Utils.integerOrNull(work.get("publicationYear")); | ||
String doiRegistrationDateString = Utils.stringOrNull(work.get("registered")); | ||
if (doiRegistrationDateString != null) { | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]> | ||
// SPDX-FileCopyrightText: 2023 Netherlands eScience Center | ||
// SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]> | ||
// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center | ||
// | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
|
@@ -20,6 +20,9 @@ | |
import java.util.Collections; | ||
import java.util.Optional; | ||
import java.util.UUID; | ||
import java.util.function.Predicate; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.StreamSupport; | ||
|
||
public class OpenAlexCitations { | ||
|
||
|
@@ -135,16 +138,15 @@ static MentionRecord parseCitationAsMention(JsonElement element, Instant scraped | |
} | ||
|
||
JsonArray authorsArray = citationObject.getAsJsonArray("authorships"); | ||
Collection<String> authors = new ArrayList<>(); | ||
for (JsonElement jsonElement : authorsArray) { | ||
authors.add( | ||
jsonElement | ||
.getAsJsonObject() | ||
.getAsJsonPrimitive("raw_author_name") | ||
.getAsString() | ||
); | ||
mention.authors = StreamSupport.stream(authorsArray.spliterator(), false) | ||
.map(JsonElement::getAsJsonObject) | ||
.map(jo -> jo.get("raw_author_name")) | ||
.filter(Predicate.not(JsonElement::isJsonNull)) | ||
.map(JsonElement::getAsString) | ||
.collect(Collectors.joining(", ")); | ||
if (mention.authors.isBlank()) { | ||
mention.authors = null; | ||
} | ||
mention.authors = String.join(", ", authors); | ||
|
||
mention.publisher = null; | ||
|
||
|