Skip to content

Commit

Permalink
feat: add OpenAlex citation scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
ewan-escience committed Oct 11, 2023
1 parent 76bfc6e commit bd8911b
Show file tree
Hide file tree
Showing 14 changed files with 434 additions and 40 deletions.
19 changes: 18 additions & 1 deletion database/009-create-mention-table.sql
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,15 @@ CREATE TABLE mention (
page VARCHAR(50),
image_url VARCHAR(500) CHECK (image_url ~ '^https?://'),
mention_type mention_type NOT NULL,
external_id VARCHAR(500),
source VARCHAR(50) NOT NULL,
version VARCHAR(100),
note VARCHAR(500),
scraped_at TIMESTAMPTZ,
citations_scraped_at TIMESTAMPTZ,
created_at TIMESTAMPTZ NOT NULL,
updated_at TIMESTAMPTZ NOT NULL
updated_at TIMESTAMPTZ NOT NULL,
UNIQUE(external_id, source)
);

CREATE FUNCTION sanitise_insert_mention() RETURNS TRIGGER LANGUAGE plpgsql AS
Expand Down Expand Up @@ -81,6 +84,20 @@ CREATE TABLE mention_for_software (
);


-- Link table between software and the mentions registered as its reference papers.
-- Both columns are foreign keys (to mention.id and software.id) and together form
-- the primary key, so each (mention, software) pair can be stored at most once.
CREATE TABLE reference_paper_for_software (
mention UUID REFERENCES mention (id),
software UUID REFERENCES software (id),
PRIMARY KEY (mention, software)
);


-- Link table relating two rows of the mention table to each other.
-- Both columns reference mention.id and together form the primary key,
-- so each (mention, citation) pair can be stored at most once.
-- NOTE(review): presumably "mention" is the cited work (e.g. a reference paper) and
-- "citation" is the work citing it; confirm against the code that fills this table.
CREATE TABLE citation_for_mention (
mention UUID REFERENCES mention (id),
citation UUID REFERENCES mention (id),
PRIMARY KEY (mention, citation)
);


CREATE FUNCTION search_mentions_for_software(software_id UUID, search_text VARCHAR) RETURNS SETOF mention STABLE LANGUAGE plpgsql AS
$$
BEGIN
Expand Down
32 changes: 24 additions & 8 deletions database/020-row-level-security.sql
Original file line number Diff line number Diff line change
Expand Up @@ -446,17 +446,9 @@ CREATE POLICY admin_all_rights ON research_domain_for_project TO rsd_admin


-- mentions
-- TODO: not sure what to do here,
-- should a mention only be visible if you can see at least one software or project for which it relates?
ALTER TABLE mention ENABLE ROW LEVEL SECURITY;

CREATE POLICY anyone_can_read ON mention FOR SELECT TO rsd_web_anon, rsd_user
USING (id IN (SELECT mention FROM mention_for_software)
OR id IN (SELECT mention FROM output_for_project)
OR id IN (SELECT mention FROM impact_for_project)
OR id IN (SELECT mention_id FROM release_version));

CREATE POLICY maintainer_can_read ON mention FOR SELECT TO rsd_user
USING (TRUE);

CREATE POLICY maintainer_can_delete ON mention FOR DELETE TO rsd_user
Expand Down Expand Up @@ -484,6 +476,30 @@ CREATE POLICY admin_all_rights ON mention_for_software TO rsd_admin
WITH CHECK (TRUE);


-- Row level security for reference_paper_for_software:
-- once enabled, rows are only accessible through the policies defined below.
ALTER TABLE reference_paper_for_software ENABLE ROW LEVEL SECURITY;

-- Read access for anonymous and logged-in users: a row is visible when its software id
-- is returned by a SELECT on the software table.
-- NOTE(review): presumably the software table's own policies limit which ids that
-- subquery returns for the current role; confirm.
CREATE POLICY anyone_can_read ON reference_paper_for_software FOR SELECT TO rsd_web_anon, rsd_user
USING (software IN (SELECT id FROM software));

-- No FOR clause, so this policy applies to all commands: rsd_user may read and modify
-- rows whose software id is returned by software_of_current_maintainer().
CREATE POLICY maintainer_all_rights ON reference_paper_for_software TO rsd_user
USING (software IN (SELECT * FROM software_of_current_maintainer()))
WITH CHECK (software IN (SELECT * FROM software_of_current_maintainer()));

-- rsd_admin is unrestricted for all commands on this table.
CREATE POLICY admin_all_rights ON reference_paper_for_software TO rsd_admin
USING (TRUE)
WITH CHECK (TRUE);


-- Row level security for citation_for_mention:
-- once enabled, rows are only accessible through the policies defined below.
ALTER TABLE citation_for_mention ENABLE ROW LEVEL SECURITY;

-- Read access for anonymous and logged-in users: a row is visible when the id in its
-- "mention" column is returned by a SELECT on the mention table.
CREATE POLICY anyone_can_read ON citation_for_mention FOR SELECT TO rsd_web_anon, rsd_user
USING (mention IN (SELECT id FROM mention));

-- rsd_admin is unrestricted for all commands on this table.
-- NOTE(review): no write policy for rsd_user is defined in this section, so writes
-- appear to be limited to rsd_admin; confirm this is intended.
CREATE POLICY admin_all_rights ON citation_for_mention TO rsd_admin
USING (TRUE)
WITH CHECK (TRUE);


ALTER TABLE output_for_project ENABLE ROW LEVEL SECURITY;

CREATE POLICY anyone_can_read ON output_for_project FOR SELECT TO rsd_web_anon, rsd_user
Expand Down
19 changes: 19 additions & 0 deletions database/109-mention-views.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
-- SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]>
-- SPDX-FileCopyrightText: 2023 Netherlands eScience Center
--
-- SPDX-License-Identifier: Apache-2.0

-- Returns id, doi and citations_scraped_at of every mention that occurs in
-- reference_paper_for_software, i.e. every mention registered as a reference paper.
CREATE FUNCTION reference_papers_to_scrape()
RETURNS TABLE (
	id UUID,
	doi CITEXT,
	citations_scraped_at TIMESTAMPTZ
)
LANGUAGE sql STABLE AS
$$
	SELECT m.id, m.doi, m.citations_scraped_at
	FROM mention AS m
	WHERE EXISTS (
		-- a mention qualifies as soon as one link row points at it
		SELECT 1
		FROM reference_paper_for_software AS rp
		WHERE rp.mention = m.id
	)
$$;
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2023 Netherlands eScience Center
//
// SPDX-License-Identifier: Apache-2.0

package nl.esciencecenter.rsd.scraper.doi;

import java.util.UUID;

/**
 * Plain data holder pairing the id of a mention with its DOI.
 * The citation scraper reads {@code doi} to look up citing works and
 * {@code id} to store the citations found for that mention.
 */
public class CitationData {

// id of the mention whose citations are scraped
public UUID id;
// DOI of that mention, used as the lookup key for citing works
public String doi;
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@

public class CrossrefMention implements Mention {

private static final Map<String, MentionType> crossrefTypeMap;
static final Map<String, MentionType> crossrefTypeMap;

static {
// https://api.crossref.org/types
// https://api.crossref.org/types
crossrefTypeMap = new HashMap<>();

crossrefTypeMap.put("book-section", MentionType.bookSection);
Expand Down Expand Up @@ -97,7 +97,7 @@ public MentionRecord mentionData() {
try {
result.publicationYear = Utils.integerOrNull(workJson.getAsJsonObject("published").getAsJsonArray("date-parts").get(0).getAsJsonArray().get(0));
} catch (RuntimeException e) {
// year not found, we leave it at null, nothing to do
// year not found, we leave it at null, nothing to do
}
if (workJson.getAsJsonArray("container-title").size() > 0) {
JsonArray journalTitles = workJson.getAsJsonArray("container-title");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
Expand Down Expand Up @@ -65,7 +64,7 @@ public class DataciteMentionRepository implements MentionRepository {
private static final Pattern URL_TREE_TAG_PATTERN = Pattern.compile("/tree/([^/]+)$");

static {
// https://schema.datacite.org/meta/kernel-4.4/
// https://schema.datacite.org/meta/kernel-4.4/
dataciteTypeMap = new HashMap<>();
dataciteTypeMap.put("Audiovisual", MentionType.presentation);
dataciteTypeMap.put("Book", MentionType.book);
Expand All @@ -92,7 +91,7 @@ public class DataciteMentionRepository implements MentionRepository {
dataciteTypeMap.put("Software", MentionType.computerProgram);
dataciteTypeMap.put("Sound", MentionType.other);
dataciteTypeMap.put("Standard", MentionType.other);
// dataciteTypeMap.put("Text", MentionType.other);
// dataciteTypeMap.put("Text", MentionType.other);
dataciteTypeMap.put("Workflow", MentionType.other);
dataciteTypeMap.put("Other", MentionType.other);

Expand All @@ -105,7 +104,7 @@ public class DataciteMentionRepository implements MentionRepository {
dataciteTextTypeMap.put("Report", MentionType.report);
}

// "10.5281/zenodo.1408128","10.1186/s12859-018-2165-7"
// "10.5281/zenodo.1408128","10.1186/s12859-018-2165-7"
static String joinCollection(Collection<String> dois) {
return dois.stream()
.collect(Collectors.joining("\",\"", "\"", "\""));
Expand All @@ -118,8 +117,8 @@ static Collection<MentionRecord> jsonStringToUniqueMentions(String json) {
Set<String> usedDois = new TreeSet<>(String.CASE_INSENSITIVE_ORDER);
for (JsonElement work : worksJson) {
try {
// Sometimes, DataCite gives back two of the same results for one DOI, e.g. for 10.4122/1.1000000817,
// so we need to only add it once, otherwise we cannot POST it to the backend
// Sometimes, DataCite gives back two of the same results for one DOI, e.g. for 10.4122/1.1000000817,
// so we need to only add it once, otherwise we cannot POST it to the backend
MentionRecord parsedMention = parseWork(work.getAsJsonObject());
if (usedDois.contains(parsedMention.doi)) continue;

Expand Down Expand Up @@ -212,7 +211,7 @@ public Collection<MentionRecord> mentionData(Collection<String> dois) {
}

@Override
public Map<String, UUID> save(Collection<MentionRecord> mentions) {
public void save(Collection<MentionRecord> mentions) {
throw new UnsupportedOperationException();
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2023 Netherlands eScience Center
//
// SPDX-License-Identifier: Apache-2.0

package nl.esciencecenter.rsd.scraper.doi;

import nl.esciencecenter.rsd.scraper.Config;
import nl.esciencecenter.rsd.scraper.Utils;

import java.time.ZonedDateTime;
import java.util.ArrayList;
import java.util.Collection;
import java.util.UUID;

/**
 * Entry point of the citation scraper.
 *
 * <p>Fetches a limited batch of reference papers from the backend, asks OpenAlex for the
 * works citing each of them, saves those citing works as mentions and finally stores the
 * citation links for the reference paper.
 *
 * <p>Failures are recorded through {@link Utils#saveExceptionInDatabase} instead of being
 * propagated. A failure for one reference paper does not prevent the remaining papers of
 * the batch from being scraped.
 */
public class MainCitations {

	// number of reference papers handled per run
	private static final int CITATION_SCRAPE_LIMIT = 5;

	/**
	 * Runs one scraping batch.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		System.out.println("Start scraping citations");
		try {
			String backendUrl = Config.backendBaseUrl();
			PostgrestCitationRepository localCitationRepository = new PostgrestCitationRepository(backendUrl);

			Collection<CitationData> doisToScrape = localCitationRepository.leastRecentlyScrapedCitations(CITATION_SCRAPE_LIMIT);
			OpenAlexCitations openAlexCitations = new OpenAlexCitations();
			MentionRepository localMentionRepository = new PostgrestMentionRepository(backendUrl);
			String email = Config.crossrefContactEmail().orElse(null);
			// one timestamp for the whole batch, so all papers of this run share the same scrape time
			ZonedDateTime now = ZonedDateTime.now();

			for (CitationData citationData : doisToScrape) {
				// Isolate failures per reference paper: a single bad DOI or failed request
				// must not abort scraping of the other papers in this batch.
				try {
					Collection<MentionRecord> citingMentions = openAlexCitations.citations(citationData.doi, email);
					localMentionRepository.save(citingMentions);

					// NOTE(review): relies on save(...) having filled in the id of each record; confirm.
					Collection<UUID> citingMentionIds = new ArrayList<>();
					for (MentionRecord citingMention : citingMentions) {
						citingMentionIds.add(citingMention.id);
					}

					localCitationRepository.saveCitations(backendUrl, citationData.id, citingMentionIds, now);
				} catch (RuntimeException e) {
					Utils.saveExceptionInDatabase("Citation scraper", null, null, e);
				}
			}
		} catch (RuntimeException e) {
			// failures during setup or while fetching the batch itself
			Utils.saveExceptionInDatabase("Citation scraper", null, null, e);
		}

		System.out.println("Done scraping citations");
	}
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import java.util.Collection;
import java.util.Map;
import java.util.TreeMap;
import java.util.UUID;

/*
Expand All @@ -34,7 +35,11 @@ public static void main(String[] args) {
Collection<MentionRecord> allMentions = scrapedReleasesPerConceptDoi.values().stream()
.flatMap(Collection::stream)
.toList();
Map<String, UUID> doiToId = localMentionRepository.save(allMentions);
localMentionRepository.save(allMentions);
Map<String, UUID> doiToId = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
for (MentionRecord mention : allMentions) {
doiToId.put(mention.doi, mention.id);
}

releaseRepository.saveReleaseContent(releasesToScrape, scrapedReleasesPerConceptDoi, doiToId);
System.out.println("Done scraping releases");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,30 @@ public class MentionRecord {
String page;
URI imageUrl;
MentionType mentionType;
String externalId;
String source;
Instant scrapedAt;
String version;

/**
 * Builds a single-line textual dump of every field of this record.
 * String-valued fields are wrapped in single quotes, all other fields are printed bare.
 *
 * @return the text {@code MentionRecord{...}} listing all fields in declaration order
 */
@Override
public String toString() {
	StringBuilder text = new StringBuilder("MentionRecord{");
	text.append("id=").append(id);
	text.append(", doi='").append(doi).append('\'');
	text.append(", url=").append(url);
	text.append(", title='").append(title).append('\'');
	text.append(", authors='").append(authors).append('\'');
	text.append(", publisher='").append(publisher).append('\'');
	text.append(", publicationYear=").append(publicationYear);
	text.append(", doiRegistrationDate=").append(doiRegistrationDate);
	text.append(", journal='").append(journal).append('\'');
	text.append(", page='").append(page).append('\'');
	text.append(", imageUrl=").append(imageUrl);
	text.append(", mentionType=").append(mentionType);
	text.append(", externalId='").append(externalId).append('\'');
	text.append(", source='").append(source).append('\'');
	text.append(", scrapedAt=").append(scrapedAt);
	text.append(", version='").append(version).append('\'');
	text.append('}');
	return text.toString();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,12 @@
package nl.esciencecenter.rsd.scraper.doi;

import java.util.Collection;
import java.util.Map;
import java.util.UUID;

public interface MentionRepository {

Collection<MentionRecord> leastRecentlyScrapedMentions(int limit);

Collection<MentionRecord> mentionData(Collection<String> dois);

Map<String, UUID> save(Collection<MentionRecord> mentions);
void save(Collection<MentionRecord> mentions);
}
Loading

0 comments on commit bd8911b

Please sign in to comment.