From fc68c4da93365f79fc35c2478040d57903d827ca Mon Sep 17 00:00:00 2001 From: Ewan Cahen Date: Wed, 27 Mar 2024 12:59:39 +0100 Subject: [PATCH 1/2] feat: allow admins to disable scraping a git repo --- .../004-create-relations-for-software.sql | 5 +- .../software/edit/editSoftwareConfig.tsx | 6 ++ .../edit/links/AutosaveRepositoryUrl.tsx | 74 ++++++++++++++----- .../edit/links/EditSoftwareMetadataForm.tsx | 2 + .../edit/services/SoftwareRepoServices.tsx | 37 ++++++---- .../edit/services/apiSoftwareServices.tsx | 18 +++-- frontend/types/SoftwareTypes.ts | 6 +- frontend/utils/editSoftware.ts | 7 +- .../rsd/scraper/git/GithubScraper.java | 7 +- .../rsd/scraper/git/GitlabScraper.java | 8 +- .../rsd/scraper/git/PostgrestConnector.java | 13 ++-- 11 files changed, 122 insertions(+), 61 deletions(-) diff --git a/database/004-create-relations-for-software.sql b/database/004-create-relations-for-software.sql index 5cb09796d..c1b5c3b7c 100644 --- a/database/004-create-relations-for-software.sql +++ b/database/004-create-relations-for-software.sql @@ -1,4 +1,4 @@ --- SPDX-FileCopyrightText: 2021 - 2023 Ewan Cahen (Netherlands eScience Center) +-- SPDX-FileCopyrightText: 2021 - 2024 Ewan Cahen (Netherlands eScience Center) -- SPDX-FileCopyrightText: 2021 - 2024 Netherlands eScience Center -- SPDX-FileCopyrightText: 2022 - 2023 Dusan Mijatovic (dv4all) -- SPDX-FileCopyrightText: 2022 - 2024 dv4all @@ -34,7 +34,8 @@ CREATE TABLE repository_url ( commit_history_scraped_at TIMESTAMPTZ, contributor_count INTEGER, contributor_count_last_error VARCHAR(500), - contributor_count_scraped_at TIMESTAMPTZ + contributor_count_scraped_at TIMESTAMPTZ, + scraping_disabled_reason VARCHAR(200) ); diff --git a/frontend/components/software/edit/editSoftwareConfig.tsx b/frontend/components/software/edit/editSoftwareConfig.tsx index 4a5422ee3..7951f080a 100644 --- a/frontend/components/software/edit/editSoftwareConfig.tsx +++ b/frontend/components/software/edit/editSoftwareConfig.tsx @@ -76,6 +76,12 
@@ export const softwareInformation = { {label: 'Other', value: 'other'}, ] }, + repository_disabled_scraping_reason: { + label: 'Reason why scraping is disabled', + validation: { + maxLength: {value: 200, message: 'Maximum length is 200'} + } + }, // field for markdown description: { label: (brand_name: string) => `What ${brand_name} can do for you`, diff --git a/frontend/components/software/edit/links/AutosaveRepositoryUrl.tsx b/frontend/components/software/edit/links/AutosaveRepositoryUrl.tsx index 888b66323..485a14b55 100644 --- a/frontend/components/software/edit/links/AutosaveRepositoryUrl.tsx +++ b/frontend/components/software/edit/links/AutosaveRepositoryUrl.tsx @@ -1,10 +1,10 @@ // SPDX-FileCopyrightText: 2022 - 2023 dv4all // SPDX-FileCopyrightText: 2022 Dusan Mijatovic (dv4all) // SPDX-FileCopyrightText: 2023 - 2024 Dusan Mijatovic (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) // SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center // SPDX-FileCopyrightText: 2023 Christian Meeßen (GFZ) // SPDX-FileCopyrightText: 2023 Dusan Mijatovic (dv4all) (dv4all) -// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) // SPDX-FileCopyrightText: 2023 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences // // SPDX-License-Identifier: Apache-2.0 @@ -59,14 +59,14 @@ async function suggestPlatform(repositoryUrl: string | null) { } export default function AutosaveRepositoryUrl() { - const {token} = useSession() + const {token, user} = useSession() const {showErrorMessage} = useSnackbar() const {control, watch, resetField} = useFormContext() const {fieldState: {error: urlError}, field: {value: repository_url}} = useController({ control, name: 'repository_url' }) - const [id, repository_platform] = watch(['id', 'repository_platform']) + const [id, repository_platform, scraping_disabled_reason] = watch(['id', 'repository_platform', 'scraping_disabled_reason']) const 
[platform, setPlatform] = useState<{ id: CodePlatform | null disabled: boolean @@ -128,6 +128,24 @@ export default function AutosaveRepositoryUrl() { } }, [urlError, repository_url, platform.id]) + async function saveScrapingDisabledReason({value}: {value: string | null}) { /* PATCH scraping_disabled_reason on the repository_url row of this software; failures are reported via showErrorMessage */ + try { + const resp = await fetch(`/api/v1/repository_url?software=eq.${id}`, { + method: 'PATCH', + body: JSON.stringify({scraping_disabled_reason: value || null}), /* store '' as null: the scrapers select rows with scraping_disabled_reason=is.null */ + headers: { + 'Content-Type': 'application/json', + Authorization: `Bearer ${token}` + } + }) + if (!resp.ok) { + showErrorMessage(`Failed to save the disabling reason with status code ${resp.status} and body ${await resp.text()}`) /* resp.body is a ReadableStream and would stringify to {}; read the response text instead */ + } + } catch (e) { + showErrorMessage(`Failed to save the disabling reason with an unknown error: ${e}`) + } + } + async function saveRepositoryInfo({name, value}: OnSaveProps) { // complete record for upsert const data: RepositoryUrl = { @@ -149,7 +167,8 @@ export default function AutosaveRepositoryUrl() { commit_history_scraped_at: null, contributor_count: null, contributor_count_last_error: null, - contributor_count_scraped_at: null + contributor_count_scraped_at: null, + scraping_disabled_reason: scraping_disabled_reason } if (name === 'repository_url') { data.url = value @@ -203,24 +222,43 @@ export default function AutosaveRepositoryUrl() { // console.log('id...', id) // console.log('repository_url...', repository_url) // console.log('platform...', platform) + // console.log('scraping_disabled_reason...', scraping_disabled_reason) // console.log('urlError...', urlError) // console.log('options...', options) // console.groupEnd() return ( -
- - saveRepositoryInfo({name: 'repository_platform', value: platform})} - /> -
+ <> +
+ + saveRepositoryInfo({name: 'repository_platform', value: platform})} + /> +
+ {(user?.role === 'rsd_admin') + ? + : null} + ) } diff --git a/frontend/components/software/edit/links/EditSoftwareMetadataForm.tsx b/frontend/components/software/edit/links/EditSoftwareMetadataForm.tsx index 4ea9c3867..c5d38722d 100644 --- a/frontend/components/software/edit/links/EditSoftwareMetadataForm.tsx +++ b/frontend/components/software/edit/links/EditSoftwareMetadataForm.tsx @@ -1,4 +1,5 @@ // SPDX-FileCopyrightText: 2024 Dusan Mijatovic (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) // SPDX-FileCopyrightText: 2024 Netherlands eScience Center // // SPDX-License-Identifier: Apache-2.0 @@ -13,6 +14,7 @@ type EditSoftwareMetadataFormProps={ get_started_url: string | null repository_url: string | null, repository_platform: CodePlatform | null + scraping_disabled_reason: string | null concept_doi: string | null, licenses: AutocompleteOption[] keywords: KeywordForSoftware[] diff --git a/frontend/components/software/edit/services/SoftwareRepoServices.tsx b/frontend/components/software/edit/services/SoftwareRepoServices.tsx index 632fa6b3f..ce315b92a 100644 --- a/frontend/components/software/edit/services/SoftwareRepoServices.tsx +++ b/frontend/components/software/edit/services/SoftwareRepoServices.tsx @@ -1,5 +1,6 @@ // SPDX-FileCopyrightText: 2023 - 2024 Dusan Mijatovic (Netherlands eScience Center) // SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center +// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) // // SPDX-License-Identifier: Apache-2.0 @@ -17,21 +18,27 @@ export default function SoftwareRepoServices() { if (loading) return return ( - - {repoServiceList.map(service=>{ - const props = { - title: service.name, - desc: service.desc, - scraped_at: services ? services[service.props.scraped_at] : null, - last_error: services ? services[service.props.last_error] : null, - url: services ? services[service.props.url] : null, - platform: services ? 
services['code_platform'] : null - } - return ( - - ) - })} - + <> + {services?.scraping_disabled_reason + ? The harvesters for this repo were disabled by the admins for the following reason: {services?.scraping_disabled_reason} + : null} + + {repoServiceList.map(service=>{ + const props = { + title: service.name, + desc: service.desc, + scraped_at: services ? services[service.props.scraped_at] : null, + last_error: services ? services[service.props.last_error] : null, + url: services ? services[service.props.url] : null, + platform: services ? services['code_platform'] : null + } + return ( + + ) + })} + + + ) } diff --git a/frontend/components/software/edit/services/apiSoftwareServices.tsx b/frontend/components/software/edit/services/apiSoftwareServices.tsx index 600e75312..f1819fa9f 100644 --- a/frontend/components/software/edit/services/apiSoftwareServices.tsx +++ b/frontend/components/software/edit/services/apiSoftwareServices.tsx @@ -1,5 +1,6 @@ // SPDX-FileCopyrightText: 2023 - 2024 Dusan Mijatovic (Netherlands eScience Center) // SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center +// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) // // SPDX-License-Identifier: Apache-2.0 @@ -13,14 +14,15 @@ import useSoftwareContext from '../useSoftwareContext' export type SoftwareServices = { software:string, - url:string, + url:string, code_platform: CodePlatform, - basic_data_scraped_at: string|null, - basic_data_last_error: string|null, - languages_scraped_at: string|null, - languages_last_error: string|null, - commit_history_scraped_at: string|null, - commit_history_last_error: string|null + basic_data_scraped_at: string|null, + basic_data_last_error: string|null, + languages_scraped_at: string|null, + languages_last_error: string|null, + commit_history_scraped_at: string|null, + commit_history_last_error: string|null, + scraping_disabled_reason: string|null, } export type PackageManagerService = { @@ -35,7 +37,7 @@ export type 
PackageManagerService = { async function getSoftwareServices(id:string,token:string){ try{ - const select='select=software,url,code_platform,basic_data_scraped_at,basic_data_last_error,languages_scraped_at,languages_last_error,commit_history_scraped_at,commit_history_last_error' + const select='select=software,url,code_platform,basic_data_scraped_at,basic_data_last_error,languages_scraped_at,languages_last_error,commit_history_scraped_at,commit_history_last_error,scraping_disabled_reason' const query = `${select}&software=eq.${id}` const url = `${getBaseUrl()}/repository_url?${query}` diff --git a/frontend/types/SoftwareTypes.ts b/frontend/types/SoftwareTypes.ts index 56cd6bbdd..76546c867 100644 --- a/frontend/types/SoftwareTypes.ts +++ b/frontend/types/SoftwareTypes.ts @@ -1,8 +1,8 @@ // SPDX-FileCopyrightText: 2022 - 2023 Christian Meeßen (GFZ) // SPDX-FileCopyrightText: 2022 - 2023 Dusan Mijatovic (dv4all) -// SPDX-FileCopyrightText: 2022 - 2023 Ewan Cahen (Netherlands eScience Center) // SPDX-FileCopyrightText: 2022 - 2023 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences // SPDX-FileCopyrightText: 2022 - 2023 dv4all +// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) // SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center // SPDX-FileCopyrightText: 2023 - 2024 Dusan Mijatovic (Netherlands eScience Center) // SPDX-FileCopyrightText: 2023 Dusan Mijatovic (dv4all) (dv4all) @@ -42,7 +42,8 @@ export type RepositoryUrl = { commit_history_scraped_at?: string | null, contributor_count?: number | null, contributor_count_last_error?: string | null, - contributor_count_scraped_at?: string | null + contributor_count_scraped_at?: string | null, + scraping_disabled_reason: string | null } export type NewSoftwareItem = { @@ -67,6 +68,7 @@ export type SoftwareTableItem = NewSoftwareItem & { export type SoftwareItem = SoftwareTableItem & { repository_url: string | null, repository_platform: CodePlatform | null + 
scraping_disabled_reason: string | null } export type SoftwareItemFromDB = SoftwareTableItem & { diff --git a/frontend/utils/editSoftware.ts b/frontend/utils/editSoftware.ts index 7f8619172..7c19e4bbe 100644 --- a/frontend/utils/editSoftware.ts +++ b/frontend/utils/editSoftware.ts @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: 2022 - 2023 Dusan Mijatovic (dv4all) // SPDX-FileCopyrightText: 2022 - 2023 dv4all -// SPDX-FileCopyrightText: 2022 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2022 Netherlands eScience Center +// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center // // SPDX-License-Identifier: Apache-2.0 @@ -53,7 +53,7 @@ export async function getSoftwareToEdit({slug, token}: { slug: string, token: string }) { try { // GET - const select = '*,repository_url!left(url,code_platform)' + const select = '*,repository_url!left(url,code_platform,scraping_disabled_reason)' const url = `${getBaseUrl()}/software?select=${select}&slug=eq.${slug}` const resp = await fetch(url, { method: 'GET', @@ -71,6 +71,7 @@ export async function getSoftwareToEdit({slug, token}: software.repository_url = null software.repository_platform = null } + software.scraping_disabled_reason = data[0]?.repository_url?.scraping_disabled_reason ?? null /* the optional chain yields undefined when there is no repository_url row; normalise to null like the two fields set just above */ return software } } catch (e: any) { diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/GithubScraper.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/GithubScraper.java index ee50b6ae3..81433017f 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/GithubScraper.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/GithubScraper.java @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2022 - 2023 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2022 - 2023 Netherlands eScience Center +// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center)
+// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center // SPDX-FileCopyrightText: 2022 Christian Meeßen (GFZ) // SPDX-FileCopyrightText: 2022 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences // @@ -55,7 +55,7 @@ public static Optional create(String url) { * Example URL: https://api.github.com/repos/research-software-directory/RSD-as-a-service */ @Override - public BasicGitData basicData() throws IOException, InterruptedException, RsdResponseException { + public BasicGitData basicData() throws IOException, InterruptedException, RsdResponseException { Optional apiCredentials = Config.apiCredentialsGithub(); HttpResponse response; if (apiCredentials.isPresent()) { @@ -144,6 +144,7 @@ public CommitsPerWeek contributions() throws IOException, InterruptedException, } } + // Example URL: https://api.github.com/repos/research-software-directory/RSD-as-a-service/contributors?per_page=1 @Override public Integer contributorCount() throws IOException, InterruptedException, RsdResponseException { // we request one contributor per page and just extract the number of pages from the headers diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/GitlabScraper.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/GitlabScraper.java index 212a9220c..80f7c2571 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/GitlabScraper.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/GitlabScraper.java @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2022 - 2023 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2022 - 2023 Netherlands eScience Center +// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center // SPDX-FileCopyrightText: 2022 Christian Meeßen (GFZ) // SPDX-FileCopyrightText: 2022 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences // @@ -43,6 +43,7 @@ public 
GitlabScraper(String gitLabApiUrl, String projectPath) { * returned. If the license could not be detected, returns "Other". API endpoint: * https://docs.gitlab.com/ee/api/projects.html#get-single-project NOTE: A GraphQL request here * might be more efficient since less data would be sent. + * Example URL: https://gitlab.com/api/v4/projects/gitlab-org%2Fgitlab-shell?license=True * * @return The basic data */ @@ -55,6 +56,8 @@ public BasicGitData basicData() throws IOException, InterruptedException, RsdRes /** * Returns the languages used in a project with percentage values. Uses the API Endpoint * https://docs.gitlab.com/ee/api/projects.html#languages GET /projects/:id/languages + *

+ * Example URL: https://gitlab.com/api/v4/projects/gitlab-org%2Fgitlab-shell/languages * * @return A JSON as a String */ @@ -104,6 +107,7 @@ public CommitsPerWeek contributions() throws IOException, InterruptedException, return commits; } + // Example URL: https://gitlab.com/api/v4/projects/gitlab-org%2Fgitlab-shell/repository/contributors @Override public Integer contributorCount() throws IOException, InterruptedException, RsdResponseException { HttpResponse httpResponse = Utils.getAsHttpResponse(apiUri + "/projects/" + Utils.urlEncode(projectPath) + "/repository/contributors"); diff --git a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/PostgrestConnector.java b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/PostgrestConnector.java index 1b80644f6..f50da1d39 100644 --- a/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/PostgrestConnector.java +++ b/scrapers/src/main/java/nl/esciencecenter/rsd/scraper/git/PostgrestConnector.java @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: 2022 - 2023 Ewan Cahen (Netherlands eScience Center) -// SPDX-FileCopyrightText: 2022 - 2023 Netherlands eScience Center +// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) +// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center // // SPDX-License-Identifier: Apache-2.0 @@ -21,11 +21,12 @@ public class PostgrestConnector { private final String backendUrl; - private final CodePlatformProvider codePlatform; + private final String filter; public PostgrestConnector(String backendUrl, CodePlatformProvider codePlatform) { this.backendUrl = Objects.requireNonNull(backendUrl); - this.codePlatform = Objects.requireNonNull(codePlatform); + Objects.requireNonNull(codePlatform); + this.filter = "scraping_disabled_reason=is.null&code_platform=eq." 
+ codePlatform.name().toLowerCase(); } /** @@ -35,7 +36,6 @@ public PostgrestConnector(String backendUrl, CodePlatformProvider codePlatform) * @return The data corresponding to the git repositories of which the programming languages data were scraped the longest time ago */ public Collection languagesData(int limit) { - String filter = "code_platform=eq." + codePlatform.name().toLowerCase(); String data = Utils.getAsAdmin(backendUrl + "?" + filter + "&select=software,url&order=languages_scraped_at.asc.nullsfirst&limit=" + limit + "&" + Utils.atLeastOneHourAgoFilter("languages_scraped_at")); return parseBasicJsonData(data); } @@ -47,7 +47,6 @@ public Collection languagesData(int limit) { * @return The data corresponding to the git repositories of which the commit data were scraped the longest time ago */ public Collection commitData(int limit) { - String filter = "code_platform=eq." + codePlatform.name().toLowerCase(); String data = Utils.getAsAdmin(backendUrl + "?" + filter + "&select=software,url&order=commit_history_scraped_at.asc.nullsfirst&limit=" + limit + "&" + Utils.atLeastOneHourAgoFilter("commit_history_scraped_at")); return parseBasicJsonData(data); } @@ -59,13 +58,11 @@ public Collection commitData(int limit) { * @return The data corresponding to the git repositories of which the basic data were scraped the longest time ago */ public Collection statsData(int limit) { - String filter = "code_platform=eq." + codePlatform.name().toLowerCase(); String data = Utils.getAsAdmin(backendUrl + "?" + filter + "&select=software,url&order=basic_data_scraped_at.asc.nullsfirst&limit=" + limit + "&" + Utils.atLeastOneHourAgoFilter("basic_data_scraped_at")); return parseBasicJsonData(data); } public Collection contributorData(int limit) { - String filter = "code_platform=eq." + codePlatform.name().toLowerCase(); String data = Utils.getAsAdmin(backendUrl + "?" 
+ filter + "&select=software,url&order=contributor_count_scraped_at.asc.nullsfirst&limit=" + limit + "&" + Utils.atLeastOneHourAgoFilter("contributor_count_scraped_at")); return parseBasicJsonData(data); } From 65ab5ed034cf7d4723d676d733187e9832bf55ae Mon Sep 17 00:00:00 2001 From: Ewan Cahen Date: Thu, 28 Mar 2024 17:51:11 +0100 Subject: [PATCH 2/2] build: bump version numbers --- docker-compose.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 0727e18f6..b5e57dca6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -16,7 +16,7 @@ version: "3.0" services: database: build: ./database - image: rsd/database:2.2.2 + image: rsd/database:2.3.2 ports: # enable connection from outside (development mode) - "5432:5432" @@ -110,7 +110,7 @@ services: # dockerfile to use for build dockerfile: Dockerfile # update version number to correspond to frontend/package.json - image: rsd/frontend:2.8.2 + image: rsd/frontend:2.8.3 environment: # it uses values from .env file - POSTGREST_URL @@ -157,7 +157,7 @@ services: scrapers: build: ./scrapers - image: rsd/scrapers:1.7.0 + image: rsd/scrapers:1.8.0 environment: # it uses values from .env file - POSTGREST_URL