Skip to content

Commit

Permalink
Merge pull request #1153 from research-software-directory/1117-disabl…
Browse files Browse the repository at this point in the history
…e-scraping

1117 disable scraping
  • Loading branch information
ewan-escience authored Mar 29, 2024
2 parents 17e80dd + 65ab5ed commit 8459fe1
Show file tree
Hide file tree
Showing 12 changed files with 125 additions and 64 deletions.
5 changes: 3 additions & 2 deletions database/004-create-relations-for-software.sql
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-- SPDX-FileCopyrightText: 2021 - 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]>
-- SPDX-FileCopyrightText: 2021 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
-- SPDX-FileCopyrightText: 2021 - 2024 Netherlands eScience Center
-- SPDX-FileCopyrightText: 2022 - 2023 Dusan Mijatovic (dv4all)
-- SPDX-FileCopyrightText: 2022 - 2024 dv4all
Expand Down Expand Up @@ -34,7 +34,8 @@ CREATE TABLE repository_url (
commit_history_scraped_at TIMESTAMPTZ,
contributor_count INTEGER,
contributor_count_last_error VARCHAR(500),
contributor_count_scraped_at TIMESTAMPTZ
contributor_count_scraped_at TIMESTAMPTZ,
scraping_disabled_reason VARCHAR(200)
);


Expand Down
6 changes: 3 additions & 3 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ version: "3.0"
services:
database:
build: ./database
image: rsd/database:2.2.2
image: rsd/database:2.3.2
ports:
# enable connection from outside (development mode)
- "5432:5432"
Expand Down Expand Up @@ -110,7 +110,7 @@ services:
# dockerfile to use for build
dockerfile: Dockerfile
# update version number to correspond to frontend/package.json
image: rsd/frontend:2.8.2
image: rsd/frontend:2.8.3
environment:
# it uses values from .env file
- POSTGREST_URL
Expand Down Expand Up @@ -157,7 +157,7 @@ services:

scrapers:
build: ./scrapers
image: rsd/scrapers:1.7.0
image: rsd/scrapers:1.8.0
environment:
# it uses values from .env file
- POSTGREST_URL
Expand Down
6 changes: 6 additions & 0 deletions frontend/components/software/edit/editSoftwareConfig.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,12 @@ export const softwareInformation = {
{label: 'Other', value: 'other'},
]
},
// Label and validation for the text field holding the reason why scraping
// of a repository is disabled.
// The 200 character limit matches the scraping_disabled_reason VARCHAR(200)
// column of the repository_url table.
repository_disabled_scraping_reason: {
label: 'Reason why scraping is disabled',
validation: {
maxLength: {value: 200, message: 'Maximum length is 200'}
}
},
// field for markdown
description: {
label: (brand_name: string) => `What ${brand_name} can do for you`,
Expand Down
74 changes: 56 additions & 18 deletions frontend/components/software/edit/links/AutosaveRepositoryUrl.tsx
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
// SPDX-FileCopyrightText: 2022 - 2023 dv4all
// SPDX-FileCopyrightText: 2022 Dusan Mijatovic (dv4all)
// SPDX-FileCopyrightText: 2023 - 2024 Dusan Mijatovic (Netherlands eScience Center)
// SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center
// SPDX-FileCopyrightText: 2023 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2023 Dusan Mijatovic (dv4all) (dv4all)
// SPDX-FileCopyrightText: 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2023 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
//
// SPDX-License-Identifier: Apache-2.0
Expand Down Expand Up @@ -59,14 +59,14 @@ async function suggestPlatform(repositoryUrl: string | null) {
}

export default function AutosaveRepositoryUrl() {
const {token} = useSession()
const {token, user} = useSession()
const {showErrorMessage} = useSnackbar()
const {control, watch, resetField} = useFormContext<EditSoftwareItem>()
const {fieldState: {error: urlError}, field: {value: repository_url}} = useController({
control,
name: 'repository_url'
})
const [id, repository_platform] = watch(['id', 'repository_platform'])
const [id, repository_platform, scraping_disabled_reason] = watch(['id', 'repository_platform', 'scraping_disabled_reason'])
const [platform, setPlatform] = useState<{
id: CodePlatform | null
disabled: boolean
Expand Down Expand Up @@ -128,6 +128,24 @@ export default function AutosaveRepositoryUrl() {
}
}, [urlError, repository_url, platform.id])

/**
 * Save the reason why scraping is disabled for the repository of this software.
 *
 * Sends a PATCH request to the repository_url endpoint, filtered on the software id
 * taken from the form, with the new value of scraping_disabled_reason.
 * Failures are not thrown; they are reported to the user via showErrorMessage.
 *
 * @param value the new reason, or null to clear it
 */
async function saveScrapingDisabledReason({value}: {value: string | null}): Promise<void> {
  try {
    const resp = await fetch(`/api/v1/repository_url?software=eq.${id}`, {
      method: 'PATCH',
      body: JSON.stringify({scraping_disabled_reason: value}),
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${token}`
      }
    })
    if (!resp.ok) {
      // resp.body is a ReadableStream, JSON.stringify on it always yields "{}".
      // Read the body as text so the actual server message is shown.
      // If reading fails, the surrounding catch reports the error.
      const responseBody = await resp.text()
      showErrorMessage(`Failed to save the disabling reason with status code ${resp.status} and body ${responseBody}`)
    }
  } catch (e) {
    showErrorMessage(`Failed to save the disabling reason with an unknown error: ${e}`)
  }
}

async function saveRepositoryInfo({name, value}: OnSaveProps<EditSoftwareItem>) {
// complete record for upsert
const data: RepositoryUrl = {
Expand All @@ -149,7 +167,8 @@ export default function AutosaveRepositoryUrl() {
commit_history_scraped_at: null,
contributor_count: null,
contributor_count_last_error: null,
contributor_count_scraped_at: null
contributor_count_scraped_at: null,
scraping_disabled_reason: scraping_disabled_reason
}
if (name === 'repository_url') {
data.url = value
Expand Down Expand Up @@ -203,24 +222,43 @@ export default function AutosaveRepositoryUrl() {
// console.log('id...', id)
// console.log('repository_url...', repository_url)
// console.log('platform...', platform)
// console.log('scraping_disabled_reason...', scraping_disabled_reason)
// console.log('urlError...', urlError)
// console.log('options...', options)
// console.groupEnd()

return (
<div className="flex gap-4 items-baseline">
<AutosaveControlledTextField
options={options}
control={control}
rules={config.repository_url.validation}
onSaveField={saveRepositoryInfo}
/>
<AutosaveRepositoryPlatform
value={platform.id}
disabled={platform.disabled}
helperText={platform.helperText}
onChange={(platform) => saveRepositoryInfo({name: 'repository_platform', value: platform})}
/>
</div>
<>
<div className="flex gap-4 items-baseline">
<AutosaveControlledTextField
options={options}
control={control}
rules={config.repository_url.validation}
onSaveField={saveRepositoryInfo}
/>
<AutosaveRepositoryPlatform
value={platform.id}
disabled={platform.disabled}
helperText={platform.helperText}
onChange={(platform) => saveRepositoryInfo({name: 'repository_platform', value: platform})}
/>
</div>
{/* Text field for the scraping disabled reason, rendered only for rsd_admin users */}
{(user?.role === 'rsd_admin')
  ? <AutosaveControlledTextField
    options={{
      name: 'scraping_disabled_reason',
      label: config.repository_disabled_scraping_reason.label,
      useNull: true,
      defaultValue: scraping_disabled_reason,
      // NOTE(review): this reuses the help text of the repository url field;
      // a dedicated help text for this field is probably intended - confirm.
      helperTextMessage: config.repository_url.help(repository_url),
      // count the characters of this field (max 200), not of the repository url
      helperTextCnt: `${scraping_disabled_reason?.length || 0}/200`,
      disabled: user?.role !== 'rsd_admin',
    }}
    control={control}
    rules={config.repository_disabled_scraping_reason.validation}
    onSaveField={saveScrapingDisabledReason}
  />
  : null}
</>
)
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
// SPDX-FileCopyrightText: 2024 Dusan Mijatovic (Netherlands eScience Center)
// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2024 Netherlands eScience Center
//
// SPDX-License-Identifier: Apache-2.0
Expand All @@ -13,6 +14,7 @@ type EditSoftwareMetadataFormProps={
get_started_url: string | null
repository_url: string | null,
repository_platform: CodePlatform | null
scraping_disabled_reason: string | null
concept_doi: string | null,
licenses: AutocompleteOption<License>[]
keywords: KeywordForSoftware[]
Expand Down
37 changes: 22 additions & 15 deletions frontend/components/software/edit/services/SoftwareRepoServices.tsx
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// SPDX-FileCopyrightText: 2023 - 2024 Dusan Mijatovic (Netherlands eScience Center)
// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center
// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -17,21 +18,27 @@ export default function SoftwareRepoServices() {
if (loading) return <ContentLoader />

return (
<List>
{repoServiceList.map(service=>{
const props = {
title: service.name,
desc: service.desc,
scraped_at: services ? services[service.props.scraped_at] : null,
last_error: services ? services[service.props.last_error] : null,
url: services ? services[service.props.url] : null,
platform: services ? services['code_platform'] : null
}
return (
<ServiceInfoListItem key={service.name} {...props} />
)
})}
</List>
<>
{services?.scraping_disabled_reason
? <span style={{color: 'red'}}>The harvesters for this repo were disabled by the admins for the following reason: {services?.scraping_disabled_reason}</span>
: null}
<List>
{repoServiceList.map(service=>{
const props = {
title: service.name,
desc: service.desc,
scraped_at: services ? services[service.props.scraped_at] : null,
last_error: services ? services[service.props.last_error] : null,
url: services ? services[service.props.url] : null,
platform: services ? services['code_platform'] : null
}
return (
<ServiceInfoListItem key={service.name} {...props} />
)
})}
</List>
</>

)

}
18 changes: 10 additions & 8 deletions frontend/components/software/edit/services/apiSoftwareServices.tsx
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
// SPDX-FileCopyrightText: 2023 - 2024 Dusan Mijatovic (Netherlands eScience Center)
// SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center
// SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
//
// SPDX-License-Identifier: Apache-2.0

Expand All @@ -13,14 +14,15 @@ import useSoftwareContext from '../useSoftwareContext'

export type SoftwareServices = {
software:string,
url:string,
url:string,
code_platform: CodePlatform,
basic_data_scraped_at: string|null,
basic_data_last_error: string|null,
languages_scraped_at: string|null,
languages_last_error: string|null,
commit_history_scraped_at: string|null,
commit_history_last_error: string|null
basic_data_scraped_at: string|null,
basic_data_last_error: string|null,
languages_scraped_at: string|null,
languages_last_error: string|null,
commit_history_scraped_at: string|null,
commit_history_last_error: string|null,
scraping_disabled_reason: string|null,
}

export type PackageManagerService = {
Expand All @@ -35,7 +37,7 @@ export type PackageManagerService = {

async function getSoftwareServices(id:string,token:string){
try{
const select='select=software,url,code_platform,basic_data_scraped_at,basic_data_last_error,languages_scraped_at,languages_last_error,commit_history_scraped_at,commit_history_last_error'
const select='select=software,url,code_platform,basic_data_scraped_at,basic_data_last_error,languages_scraped_at,languages_last_error,commit_history_scraped_at,commit_history_last_error,scraping_disabled_reason'
const query = `${select}&software=eq.${id}`
const url = `${getBaseUrl()}/repository_url?${query}`

Expand Down
6 changes: 4 additions & 2 deletions frontend/types/SoftwareTypes.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
// SPDX-FileCopyrightText: 2022 - 2023 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2023 Dusan Mijatovic (dv4all)
// SPDX-FileCopyrightText: 2022 - 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2023 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
// SPDX-FileCopyrightText: 2022 - 2023 dv4all
// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center
// SPDX-FileCopyrightText: 2023 - 2024 Dusan Mijatovic (Netherlands eScience Center)
// SPDX-FileCopyrightText: 2023 Dusan Mijatovic (dv4all) (dv4all)
Expand Down Expand Up @@ -42,7 +42,8 @@ export type RepositoryUrl = {
commit_history_scraped_at?: string | null,
contributor_count?: number | null,
contributor_count_last_error?: string | null,
contributor_count_scraped_at?: string | null
contributor_count_scraped_at?: string | null,
scraping_disabled_reason: string | null
}

export type NewSoftwareItem = {
Expand All @@ -67,6 +68,7 @@ export type SoftwareTableItem = NewSoftwareItem & {
export type SoftwareItem = SoftwareTableItem & {
repository_url: string | null,
repository_platform: CodePlatform | null
scraping_disabled_reason: string | null
}

export type SoftwareItemFromDB = SoftwareTableItem & {
Expand Down
7 changes: 4 additions & 3 deletions frontend/utils/editSoftware.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// SPDX-FileCopyrightText: 2022 - 2023 Dusan Mijatovic (dv4all)
// SPDX-FileCopyrightText: 2022 - 2023 dv4all
// SPDX-FileCopyrightText: 2022 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2022 Netherlands eScience Center
// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center
//
// SPDX-License-Identifier: Apache-2.0

Expand Down Expand Up @@ -53,7 +53,7 @@ export async function getSoftwareToEdit({slug, token}:
{ slug: string, token: string }) {
try {
// GET
const select = '*,repository_url!left(url,code_platform)'
const select = '*,repository_url!left(url,code_platform,scraping_disabled_reason)'
const url = `${getBaseUrl()}/software?select=${select}&slug=eq.${slug}`
const resp = await fetch(url, {
method: 'GET',
Expand All @@ -71,6 +71,7 @@ export async function getSoftwareToEdit({slug, token}:
software.repository_url = null
software.repository_platform = null
}
software.scraping_disabled_reason = data[0]?.repository_url?.scraping_disabled_reason
return software
}
} catch (e: any) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// SPDX-FileCopyrightText: 2022 - 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2023 Netherlands eScience Center
// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center
// SPDX-FileCopyrightText: 2022 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2022 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
//
Expand Down Expand Up @@ -55,7 +55,7 @@ public static Optional<GithubScraper> create(String url) {
* Example URL: https://api.github.com/repos/research-software-directory/RSD-as-a-service
*/
@Override
public BasicGitData basicData() throws IOException, InterruptedException, RsdResponseException {
public BasicGitData basicData() throws IOException, InterruptedException, RsdResponseException {
Optional<String> apiCredentials = Config.apiCredentialsGithub();
HttpResponse<String> response;
if (apiCredentials.isPresent()) {
Expand Down Expand Up @@ -144,6 +144,7 @@ public CommitsPerWeek contributions() throws IOException, InterruptedException,
}
}

// Example URL: https://api.github.com/repos/research-software-directory/RSD-as-a-service/contributors?per_page=1
@Override
public Integer contributorCount() throws IOException, InterruptedException, RsdResponseException {
// we request one contributor per page and just extract the number of pages from the headers
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// SPDX-FileCopyrightText: 2022 - 2023 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2023 Netherlands eScience Center
// SPDX-FileCopyrightText: 2022 - 2024 Ewan Cahen (Netherlands eScience Center) <[email protected]>
// SPDX-FileCopyrightText: 2022 - 2024 Netherlands eScience Center
// SPDX-FileCopyrightText: 2022 Christian Meeßen (GFZ) <[email protected]>
// SPDX-FileCopyrightText: 2022 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences
//
Expand Down Expand Up @@ -43,6 +43,7 @@ public GitlabScraper(String gitLabApiUrl, String projectPath) {
* returned. If the license could not be detected, returns "Other". API endpoint:
* https://docs.gitlab.com/ee/api/projects.html#get-single-project NOTE: A GraphQL request here
* might be more efficient since less data would be sent.
* Example URL: https://gitlab.com/api/v4/projects/gitlab-org%2Fgitlab-shell?license=True
*
* @return The basic data
*/
Expand All @@ -55,6 +56,8 @@ public BasicGitData basicData() throws IOException, InterruptedException, RsdRes
/**
* Returns the languages used in a project with percentage values. Uses the API Endpoint
* https://docs.gitlab.com/ee/api/projects.html#languages GET /projects/:id/languages
* <p>
* Example URL: https://gitlab.com/api/v4/projects/gitlab-org%2Fgitlab-shell/languages
*
* @return A JSON as a String
*/
Expand Down Expand Up @@ -104,6 +107,7 @@ public CommitsPerWeek contributions() throws IOException, InterruptedException,
return commits;
}

// Example URL: https://gitlab.com/api/v4/projects/gitlab-org%2Fgitlab-shell/repository/contributors
@Override
public Integer contributorCount() throws IOException, InterruptedException, RsdResponseException {
HttpResponse<String> httpResponse = Utils.getAsHttpResponse(apiUri + "/projects/" + Utils.urlEncode(projectPath) + "/repository/contributors");
Expand Down
Loading

0 comments on commit 8459fe1

Please sign in to comment.