Skip to content

Commit

Permalink
feat: also scrape archived status of GitHub repos
Browse files Browse the repository at this point in the history
  • Loading branch information
ewan-escience committed Nov 7, 2024
1 parent 3b720f7 commit 5442ca6
Show file tree
Hide file tree
Showing 7 changed files with 71 additions and 45 deletions.
1 change: 1 addition & 0 deletions database/005-create-relations-for-software.sql
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ CREATE TABLE repository_url (
software UUID REFERENCES software (id) PRIMARY KEY,
url VARCHAR(200) NOT NULL CHECK (url ~ '^https?://'),
code_platform platform_type NOT NULL DEFAULT 'other',
archived BOOLEAN,
license VARCHAR(200),
star_count BIGINT,
fork_count INTEGER,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2023 Netherlands eScience Center
// SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center
//
// SPDX-License-Identifier: Apache-2.0

package nl.esciencecenter.rsd.scraper.git;

/**
 * Immutable holder for the basic statistics scraped from a repository hosting platform.
 *
 * <p>All components are boxed types so that a value a platform parser does not supply
 * can be represented as {@code null} (the GitLab parser passes {@code null} for
 * {@code archived} and {@code openIssueCount}).
 *
 * @param archived       whether the repository is archived, or {@code null} when not scraped
 * @param license        license identifier or name as reported by the platform, or {@code null}
 * @param starCount      number of stars
 * @param forkCount      number of forks
 * @param openIssueCount number of open issues, or {@code null} when not scraped
 */
public record BasicGitData(
    Boolean archived,
    String license,
    Long starCount,
    Integer forkCount,
    Integer openIssueCount
) {
}
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,11 @@ public BasicGitData basicData() throws IOException, InterruptedException, RsdRes
return switch (response.statusCode()) {
case 200 -> parseBasicData(response.body());
case 404 ->
throw new RsdResponseException(404, response.uri(), response.body(), "Not found, is the repository URL correct?");
throw new RsdResponseException(404, response.uri(), response.body(), "Not found, is the repository URL correct?");
case 403 ->
throw new RsdRateLimitException(403, response.uri(), response.body(), "Rate limit for GitHub probably reached");
throw new RsdRateLimitException(403, response.uri(), response.body(), "Rate limit for GitHub probably reached");
default ->
throw new RsdResponseException(response.statusCode(), response.uri(), response.body(), "Unexpected response");
throw new RsdResponseException(response.statusCode(), response.uri(), response.body(), "Unexpected response");
};
}

Expand All @@ -86,12 +86,12 @@ public String languages() throws IOException, InterruptedException, RsdResponseE
HttpResponse<String> response = getAsHttpResponse(BASE_API_URL + "/repos/" + organisation + "/" + repo + "/languages");
return switch (response.statusCode()) {
case 404 ->
throw new RsdResponseException(404, response.uri(), response.body(), "Not found, is the repository URL correct?");
throw new RsdResponseException(404, response.uri(), response.body(), "Not found, is the repository URL correct?");
case 403 ->
throw new RsdRateLimitException(403, response.uri(), response.body(), "Rate limit for GitHub probably reached");
throw new RsdRateLimitException(403, response.uri(), response.body(), "Rate limit for GitHub probably reached");
case 200 -> response.body();
default ->
throw new RsdResponseException(response.statusCode(), response.uri(), response.body(), "Unexpected response");
throw new RsdResponseException(response.statusCode(), response.uri(), response.body(), "Unexpected response");
};
}

Expand Down Expand Up @@ -194,16 +194,24 @@ static CommitsPerWeek parseCommits(String json) {
}

/**
 * Parses the JSON body of a GitHub repository response into a {@link BasicGitData}.
 *
 * <p>Reads the members {@code archived}, {@code license.spdx_id}, {@code stargazers_count},
 * {@code forks_count} and {@code open_issues_count}. The license is {@code null} when the
 * {@code license} member is absent or JSON null, or when its {@code spdx_id} is absent or
 * JSON null.
 *
 * @param json the response body, expected to be a JSON object
 * @return the parsed statistics
 */
static BasicGitData parseBasicData(String json) {
    JsonObject jsonObject = JsonParser.parseString(json).getAsJsonObject();

    Boolean archived = jsonObject.getAsJsonPrimitive("archived").getAsBoolean();

    // "license" may be missing or null for unlicensed repositories, and "spdx_id" itself
    // may be null; none of these cases should abort parsing of the remaining statistics
    String license = null;
    JsonElement jsonLicense = jsonObject.get("license");
    if (jsonLicense != null && !jsonLicense.isJsonNull()) {
        JsonElement spdxId = jsonLicense.getAsJsonObject().get("spdx_id");
        if (spdxId != null && !spdxId.isJsonNull()) {
            license = spdxId.getAsString();
        }
    }

    Long starCount = jsonObject.getAsJsonPrimitive("stargazers_count").getAsLong();
    Integer forkCount = jsonObject.getAsJsonPrimitive("forks_count").getAsInt();
    Integer openIssueCount = jsonObject.getAsJsonPrimitive("open_issues_count").getAsInt();

    return new BasicGitData(
        archived,
        license,
        starCount,
        forkCount,
        openIssueCount
    );
}

// return an object with the URL of the last page and the number of the last page respectively
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,10 @@ public CommitsPerWeek contributions() throws IOException, InterruptedException,
boolean done = false;
while (!done) {
HttpRequest request = HttpRequest.newBuilder().GET()
.uri(URI.create(apiUri + "/projects/" + Utils.urlEncode(projectPath)
+ "/repository/commits?per_page=100&order=default&page=" + page))
.timeout(Duration.ofSeconds(30))
.build();
.uri(URI.create(apiUri + "/projects/" + Utils.urlEncode(projectPath)
+ "/repository/commits?per_page=100&order=default&page=" + page))
.timeout(Duration.ofSeconds(30))
.build();
HttpResponse<String> response;
try (HttpClient client = HttpClient.newBuilder().followRedirects(HttpClient.Redirect.NORMAL).build()) {
response = client.send(request, HttpResponse.BodyHandlers.ofString());
Expand Down Expand Up @@ -138,15 +138,20 @@ static void parseCommitPage(String json, CommitsPerWeek commitsToFill) {
}

/**
 * Parses the JSON body of a GitLab project response into a {@link BasicGitData}.
 *
 * <p>Reads the members {@code license.name}, {@code star_count} and {@code forks_count}.
 * The archived status and the open issue count are not read here and are always
 * {@code null} in the result.
 *
 * @param json the response body, expected to be a JSON object
 * @return the parsed statistics
 */
static BasicGitData parseBasicData(String json) {
    JsonObject jsonObject = JsonParser.parseString(json).getAsJsonObject();

    // JsonObject.get returns null for an absent member, so check that before isJsonNull()
    JsonElement jsonLicense = jsonObject.get("license");
    boolean hasLicense = jsonLicense != null && !jsonLicense.isJsonNull();
    String license = hasLicense ? jsonLicense.getAsJsonObject().get("name").getAsString() : null;

    Long starCount = jsonObject.getAsJsonPrimitive("star_count").getAsLong();
    Integer forkCount = jsonObject.getAsJsonPrimitive("forks_count").getAsInt();

    return new BasicGitData(
        null,
        license,
        starCount,
        forkCount,
        null
    );
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -82,41 +82,51 @@ static Collection<BasicRepositoryData> parseBasicJsonData(String data) {
}

/**
 * Stores scraped language data for one software entry and clears {@code languages_last_error}.
 *
 * <p>The value of {@code languagesData.languages()} is inserted into the payload unquoted,
 * so it has to be valid JSON itself.
 *
 * @param languagesData the languages, their scrape timestamp and the software they belong to
 */
public void saveLanguagesData(LanguagesData languagesData) {
    String scrapedAt = languagesData.languagesScrapedAt().format(DateTimeFormatter.ISO_OFFSET_DATE_TIME);
    String json = String.format(
        "{\"languages_last_error\": null, \"languages\": %s, \"languages_scraped_at\": \"%s\"}",
        languagesData.languages(),
        scrapedAt
    );
    Utils.patchAsAdmin(backendUrl + "?software=eq." + languagesData.basicData().software().toString(), json);
}

/**
 * Stores the commit history scrape result for one software entry and clears
 * {@code commit_history_last_error}.
 *
 * <p>When no commit history is available only the scrape timestamp is updated; otherwise
 * the history is completed with {@code addMissingZeros()} and sent along.
 *
 * @param commitData the commit history (may be {@code null}), its scrape timestamp and the
 *                   software it belongs to
 */
public void saveCommitData(CommitData commitData) {
    String scrapedAt = commitData.commitHistoryScrapedAt().format(DateTimeFormatter.ISO_OFFSET_DATE_TIME);
    String json;
    if (commitData.commitHistory() == null) {
        json = String.format(
            "{\"commit_history_last_error\": null, \"commit_history_scraped_at\": \"%s\"}",
            scrapedAt
        );
    } else {
        commitData.commitHistory().addMissingZeros();
        json = String.format(
            "{\"commit_history_last_error\": null, \"commit_history\": %s, \"commit_history_scraped_at\": \"%s\"}",
            commitData.commitHistory().toJson(),
            scrapedAt
        );
    }
    Utils.patchAsAdmin(backendUrl + "?software=eq." + commitData.basicData().software().toString(), json);
}

/**
 * Stores the basic repository statistics for one software entry and clears
 * {@code basic_data_last_error}.
 *
 * <p>Every statistic is always added to the payload; a {@code null} component of the
 * statistics record is passed to {@code addProperty} as {@code null}.
 *
 * @param basicData the scraped statistics, their scrape timestamp and the software they belong to
 */
public void saveBasicData(BasicGitDatabaseData basicData) {
    BasicGitData stats = basicData.statsData();

    JsonObject jsonObject = new JsonObject();
    jsonObject.add("basic_data_last_error", JsonNull.INSTANCE);
    jsonObject.addProperty("basic_data_scraped_at", basicData.dataScrapedAt().format(DateTimeFormatter.ISO_OFFSET_DATE_TIME));
    jsonObject.addProperty("archived", stats.archived());
    jsonObject.addProperty("license", stats.license());
    jsonObject.addProperty("star_count", stats.starCount());
    jsonObject.addProperty("fork_count", stats.forkCount());
    jsonObject.addProperty("open_issue_count", stats.openIssueCount());

    // send exactly one PATCH request for this software entry
    Utils.patchAsAdmin(backendUrl + "?software=eq." + basicData.basicData().software().toString(), jsonObject.toString());
}

/**
 * Stores the contributor count scrape result for one software entry and clears
 * {@code contributor_count_last_error}.
 *
 * <p>The {@code contributor_count} member is only included in the payload when a count
 * is present; the scrape timestamp is always included.
 *
 * @param contributorData the contributor count (may be {@code null}), its scrape timestamp
 *                        and the software it belongs to
 */
public void saveContributorCount(ContributorDatabaseData contributorData) {
    JsonObject jsonObject = new JsonObject();
    jsonObject.add("contributor_count_last_error", JsonNull.INSTANCE);
    jsonObject.addProperty("contributor_count_scraped_at", contributorData.dataScrapedAt().format(DateTimeFormatter.ISO_OFFSET_DATE_TIME));
    if (contributorData.contributorCount() != null) {
        jsonObject.addProperty("contributor_count", contributorData.contributorCount());
    }

    // send exactly one PATCH request for this software entry
    Utils.patchAsAdmin(backendUrl + "?software=eq." + contributorData.basicData().software().toString(), jsonObject.toString());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ void languages() {
// Checks that the license scraped through githubScraper is reported as "Apache-2.0".
@Disabled
@Test
void license() {
    // license is read through the record accessor of the scraped basic data
    String license = Assertions.assertDoesNotThrow(() -> githubScraper.basicData().license());
    Assertions.assertEquals("Apache-2.0", license);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ void languages() {
// Checks that the license scraped through scraper is reported as "MIT License".
@Disabled
@Test
void license() {
    // license is read through the record accessor of the scraped basic data
    String license = Assertions.assertDoesNotThrow(() -> scraper.basicData().license());
    Assertions.assertEquals("MIT License", license);
}

Expand All @@ -38,7 +38,7 @@ void license() {
void licenseDoesNotExist() {
    // unlicensed projects should return null
    // we need to find a suitable project or create a mocked interface
    String license = Assertions.assertDoesNotThrow(() -> scraper.basicData().license());
    Assertions.assertNull(license);
}

Expand Down

0 comments on commit 5442ca6

Please sign in to comment.