From 5442ca6539268d13f2633af87fe088df062efe06 Mon Sep 17 00:00:00 2001 From: Ewan Cahen Date: Wed, 6 Nov 2024 14:48:44 +0100 Subject: [PATCH] feat: also scrape archived status of GitHub repos --- .../005-create-relations-for-software.sql | 1 + .../rsd/scraper/git/BasicGitData.java | 16 +++++---- .../rsd/scraper/git/GithubScraper.java | 34 ++++++++++++------- .../rsd/scraper/git/GitlabScraper.java | 25 ++++++++------ .../rsd/scraper/git/PostgrestConnector.java | 34 ++++++++++++------- .../rsd/scraper/git/GithubScraperIT.java | 2 +- .../rsd/scraper/git/GitlabScraperIT.java | 4 +-- 7 files changed, 71 insertions(+), 45 deletions(-) diff --git a/database/005-create-relations-for-software.sql b/database/005-create-relations-for-software.sql index 7ee9eb074..ce5ee2722 100644 --- a/database/005-create-relations-for-software.sql +++ b/database/005-create-relations-for-software.sql @@ -20,6 +20,7 @@ CREATE TABLE repository_url ( software UUID REFERENCES software (id) PRIMARY KEY, url VARCHAR(200) NOT NULL CHECK (url ~ '^https?://'), code_platform platform_type NOT NULL DEFAULT 'other', + archived BOOLEAN, license VARCHAR(200), star_count BIGINT, fork_count INTEGER, diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/BasicGitData.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/BasicGitData.java index d27e01af7..2c93ed97e 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/BasicGitData.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/BasicGitData.java @@ -1,13 +1,15 @@ -// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2023 Netherlands eScience Center +// SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center // // SPDX-License-Identifier: Apache-2.0 package nl.esciencecenter.rsd.scraper.git; -public class BasicGitData { - public String license; - public Long starCount; - public Integer forkCount; - public Integer openIssueCount; +public record BasicGitData( + Boolean archived, + String license, + Long starCount, + Integer forkCount, + Integer openIssueCount +) { } diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/GithubScraper.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/GithubScraper.java index 57c64e8a1..a15a45e1e 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/GithubScraper.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/GithubScraper.java @@ -69,11 +69,11 @@ public BasicGitData basicData() throws IOException, InterruptedException, RsdRes return switch (response.statusCode()) { case 200 -> parseBasicData(response.body()); case 404 -> - throw new RsdResponseException(404, response.uri(), response.body(), "Not found, is the repository URL correct?"); + throw new RsdResponseException(404, response.uri(), response.body(), "Not found, is the repository URL correct?"); case 403 -> - throw new RsdRateLimitException(403, response.uri(), response.body(), "Rate limit for GitHub probably reached"); + throw new RsdRateLimitException(403, response.uri(), response.body(), "Rate limit for GitHub probably reached"); default -> - throw new RsdResponseException(response.statusCode(), response.uri(), response.body(), "Unexpected response"); + throw new RsdResponseException(response.statusCode(), response.uri(), response.body(), "Unexpected response"); }; } @@ -86,12 +86,12 @@ public String languages() throws IOException, InterruptedException, RsdResponseE HttpResponse response = getAsHttpResponse(BASE_API_URL + "/repos/" + organisation + "/" + repo + "/languages"); return switch (response.statusCode()) { case 404 -> - throw new RsdResponseException(404, response.uri(), response.body(), "Not found, is the repository URL correct?"); + throw new RsdResponseException(404, response.uri(), response.body(), "Not found, is the repository URL correct?"); case 403 -> - throw new RsdRateLimitException(403, response.uri(), response.body(), "Rate limit for GitHub probably reached"); + throw new RsdRateLimitException(403, response.uri(), response.body(), "Rate limit for GitHub probably reached"); case 200 -> response.body(); default -> - throw new RsdResponseException(response.statusCode(), response.uri(), response.body(), "Unexpected response"); + throw new RsdResponseException(response.statusCode(), response.uri(), response.body(), "Unexpected response"); }; } @@ -194,16 +194,24 @@ static CommitsPerWeek parseCommits(String json) { } static BasicGitData parseBasicData(String json) { - BasicGitData result = new BasicGitData(); JsonObject jsonObject = JsonParser.parseString(json).getAsJsonObject(); + Boolean archived = jsonObject.getAsJsonPrimitive("archived").getAsBoolean(); JsonElement jsonLicense = jsonObject.get("license"); - result.license = jsonLicense.isJsonNull() ? null : jsonLicense.getAsJsonObject().getAsJsonPrimitive("spdx_id").getAsString(); - result.starCount = jsonObject.getAsJsonPrimitive("stargazers_count").getAsLong(); - result.forkCount = jsonObject.getAsJsonPrimitive("forks_count").getAsInt(); - result.openIssueCount = jsonObject.getAsJsonPrimitive("open_issues_count").getAsInt(); - - return result; + String license = jsonLicense.isJsonNull() ? null : jsonLicense.getAsJsonObject() + .getAsJsonPrimitive("spdx_id") + .getAsString(); + Long starCount = jsonObject.getAsJsonPrimitive("stargazers_count").getAsLong(); + Integer forkCount = jsonObject.getAsJsonPrimitive("forks_count").getAsInt(); + Integer openIssueCount = jsonObject.getAsJsonPrimitive("open_issues_count").getAsInt(); + + return new BasicGitData( + archived, + license, + starCount, + forkCount, + openIssueCount + ); } // return an object with the URL of the last page and the number of the last page respectively diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/GitlabScraper.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/GitlabScraper.java index ebe813cf9..9388ff98b 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/GitlabScraper.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/GitlabScraper.java @@ -88,10 +88,10 @@ public CommitsPerWeek contributions() throws IOException, InterruptedException, boolean done = false; while (!done) { HttpRequest request = HttpRequest.newBuilder().GET() - .uri(URI.create(apiUri + "/projects/" + Utils.urlEncode(projectPath) - + "/repository/commits?per_page=100&order=default&page=" + page)) - .timeout(Duration.ofSeconds(30)) - .build(); + .uri(URI.create(apiUri + "/projects/" + Utils.urlEncode(projectPath) + + "/repository/commits?per_page=100&order=default&page=" + page)) + .timeout(Duration.ofSeconds(30)) + .build(); HttpResponse response; try (HttpClient client = HttpClient.newBuilder().followRedirects(HttpClient.Redirect.NORMAL).build()) { response = client.send(request, HttpResponse.BodyHandlers.ofString()); @@ -138,15 +138,20 @@ static void parseCommitPage(String json, CommitsPerWeek commitsToFill) { } static BasicGitData parseBasicData(String json) { - BasicGitData result = new BasicGitData(); JsonObject jsonObject = JsonParser.parseString(json).getAsJsonObject(); JsonElement jsonLicense = jsonObject.get("license"); - result.license = jsonLicense.isJsonNull() ? null : jsonLicense.getAsJsonObject().get("name").getAsString(); - result.starCount = jsonObject.getAsJsonPrimitive("star_count").getAsLong(); - result.forkCount = jsonObject.getAsJsonPrimitive("forks_count").getAsInt(); - - return result; + String license = jsonLicense.isJsonNull() ? null : jsonLicense.getAsJsonObject().get("name").getAsString(); + Long starCount = jsonObject.getAsJsonPrimitive("star_count").getAsLong(); + Integer forkCount = jsonObject.getAsJsonPrimitive("forks_count").getAsInt(); + + return new BasicGitData( + null, + license, + starCount, + forkCount, + null + ); } } diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/PostgrestConnector.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/PostgrestConnector.java index f50da1d39..0890fd348 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/PostgrestConnector.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/PostgrestConnector.java @@ -82,17 +82,20 @@ static Collection parseBasicJsonData(String data) { } public void saveLanguagesData(LanguagesData languagesData) { - String json = String.format("{\"languages_last_error\": null, \"languages\": %s, \"languages_scraped_at\": \"%s\"}", languagesData.languages(), languagesData.languagesScrapedAt().format(DateTimeFormatter.ISO_OFFSET_DATE_TIME)); + String json = String.format("{\"languages_last_error\": null, \"languages\": %s, \"languages_scraped_at\": \"%s\"}", languagesData.languages(), languagesData.languagesScrapedAt() + .format(DateTimeFormatter.ISO_OFFSET_DATE_TIME)); Utils.patchAsAdmin(backendUrl + "?software=eq." + languagesData.basicData().software().toString(), json); } public void saveCommitData(CommitData commitData) { String json; if (commitData.commitHistory() == null) { - json = String.format("{\"commit_history_last_error\": null, \"commit_history_scraped_at\": \"%s\"}", commitData.commitHistoryScrapedAt().format(DateTimeFormatter.ISO_OFFSET_DATE_TIME)); + json = String.format("{\"commit_history_last_error\": null, \"commit_history_scraped_at\": \"%s\"}", commitData.commitHistoryScrapedAt() + .format(DateTimeFormatter.ISO_OFFSET_DATE_TIME)); } else { commitData.commitHistory().addMissingZeros(); - json = String.format("{\"commit_history_last_error\": null, \"commit_history\": %s, \"commit_history_scraped_at\": \"%s\"}", commitData.commitHistory().toJson(), commitData.commitHistoryScrapedAt().format(DateTimeFormatter.ISO_OFFSET_DATE_TIME)); + json = String.format("{\"commit_history_last_error\": null, \"commit_history\": %s, \"commit_history_scraped_at\": \"%s\"}", commitData.commitHistory() + .toJson(), commitData.commitHistoryScrapedAt().format(DateTimeFormatter.ISO_OFFSET_DATE_TIME)); } Utils.patchAsAdmin(backendUrl + "?software=eq." + commitData.basicData().software().toString(), json); } @@ -100,23 +103,30 @@ public void saveCommitData(CommitData commitData) { public void saveBasicData(BasicGitDatabaseData basicData) { JsonObject jsonObject = new JsonObject(); jsonObject.add("basic_data_last_error", JsonNull.INSTANCE); - jsonObject.addProperty("basic_data_scraped_at", basicData.dataScrapedAt().format(DateTimeFormatter.ISO_OFFSET_DATE_TIME)); - jsonObject.addProperty("license", basicData.statsData().license); - jsonObject.addProperty("star_count", basicData.statsData().starCount); - jsonObject.addProperty("fork_count", basicData.statsData().forkCount); - jsonObject.addProperty("open_issue_count", basicData.statsData().openIssueCount); - - Utils.patchAsAdmin(backendUrl + "?software=eq." + basicData.basicData().software().toString(), jsonObject.toString()); + jsonObject.addProperty("basic_data_scraped_at", basicData.dataScrapedAt() + .format(DateTimeFormatter.ISO_OFFSET_DATE_TIME)); + jsonObject.addProperty("archived", basicData.statsData().archived()); + jsonObject.addProperty("license", basicData.statsData().license()); + jsonObject.addProperty("star_count", basicData.statsData().starCount()); + jsonObject.addProperty("fork_count", basicData.statsData().forkCount()); + jsonObject.addProperty("open_issue_count", basicData.statsData().openIssueCount()); + + Utils.patchAsAdmin(backendUrl + "?software=eq." + basicData.basicData() + .software() + .toString(), jsonObject.toString()); } public void saveContributorCount(ContributorDatabaseData contributorData) { JsonObject jsonObject = new JsonObject(); jsonObject.add("contributor_count_last_error", JsonNull.INSTANCE); - jsonObject.addProperty("contributor_count_scraped_at", contributorData.dataScrapedAt().format(DateTimeFormatter.ISO_OFFSET_DATE_TIME)); + jsonObject.addProperty("contributor_count_scraped_at", contributorData.dataScrapedAt() + .format(DateTimeFormatter.ISO_OFFSET_DATE_TIME)); if (contributorData.contributorCount() != null) { jsonObject.addProperty("contributor_count", contributorData.contributorCount()); } - Utils.patchAsAdmin(backendUrl + "?software=eq." + contributorData.basicData().software().toString(), jsonObject.toString()); + Utils.patchAsAdmin(backendUrl + "?software=eq." + contributorData.basicData() + .software() + .toString(), jsonObject.toString()); } } diff --git a/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/git/GithubScraperIT.java b/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/git/GithubScraperIT.java index 0e0266bce..114b12e92 100644 --- a/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/git/GithubScraperIT.java +++ b/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/git/GithubScraperIT.java @@ -34,7 +34,7 @@ void languages() { @Disabled @Test void license() { - String license = Assertions.assertDoesNotThrow(() -> githubScraper.basicData().license); + String license = Assertions.assertDoesNotThrow(() -> githubScraper.basicData().license()); Assertions.assertEquals("Apache-2.0", license); } diff --git a/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/git/GitlabScraperIT.java b/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/git/GitlabScraperIT.java index 780720124..a4a49932e 100644 --- a/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/git/GitlabScraperIT.java +++ b/scrapers/src/test/java/nl/esciencecenter/rsd/scraper/git/GitlabScraperIT.java @@ -29,7 +29,7 @@ void languages() { @Disabled @Test void license() { - String license = Assertions.assertDoesNotThrow(() -> scraper.basicData().license); + String license = Assertions.assertDoesNotThrow(() -> scraper.basicData().license()); Assertions.assertEquals("MIT License", license); } @@ -38,7 +38,7 @@ void license() { void licenseDoesNotExist() { // unlicensed projects should return null // we need to find a suitable project or create a mocked interface - String license = Assertions.assertDoesNotThrow(() -> scraper.basicData().license); + String license = Assertions.assertDoesNotThrow(() -> scraper.basicData().license()); Assertions.assertNull(license); }