From 928c67b04836887ba8c40d6de5f7a1a157047189 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 8 Aug 2024 11:21:15 +0200 Subject: [PATCH 01/68] use ubuntu:22.04 image and add conda installation for fixing libstdc++ issue --- Dockerfile | 43 +++++++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index c2a84df3..98d7eb80 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,20 +1,35 @@ # Pull base image -FROM --platform=linux/x86_64 condaforge/miniforge3:23.3.1-1 +FROM --platform=linux/amd64 ubuntu:22.04 + +ARG MINIFORGE_NAME=Miniforge3 +ARG MINIFORGE_VERSION=24.3.0-0 +ARG TARGETPLATFORM + +ENV CONDA_DIR=/opt/conda +ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 +ENV PATH=${CONDA_DIR}/bin:${PATH} + +RUN \ + ## Install apt dependencies + apt-get update && \ + apt-get install --no-install-recommends --yes \ + wget bzip2 unzip ca-certificates \ + git && \ + ## Download and install Miniforge + wget --no-hsts --quiet https://github.com/conda-forge/miniforge/releases/download/${MINIFORGE_VERSION}/${MINIFORGE_NAME}-${MINIFORGE_VERSION}-Linux-x86_64.sh -O /tmp/miniforge.sh && \ + /bin/bash /tmp/miniforge.sh -b -p ${CONDA_DIR} && \ + rm /tmp/miniforge.sh && \ + echo ". ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate base" >> /etc/skel/.bashrc && \ + echo ". ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate base" >> ~/.bashrc -# Add files -ADD ./tutorials /home/deeprank2/tutorials ADD ./env/deeprank2.yml /home/deeprank2 RUN \ - # Install dependencies and package - apt update -y && \ - apt install unzip -y && \ - ## GCC - apt install -y gcc && \ ## Create the environment and install the dependencies mamba env create -f /home/deeprank2/deeprank2.yml && \ + conda install -n deeprank2 conda-forge::gcc && \ ## Activate the environment and install pip packages - /opt/conda/bin/conda run -n deeprank2 pip install deeprank2 && \ + conda run -n deeprank2 pip install deeprank2 && \ ## Activate the environment automatically when entering the container echo "source activate deeprank2" >~/.bashrc && \ # Get the data for running the tutorials @@ -22,7 +37,15 @@ RUN \ if [ -d "/home/deeprank2/tutorials/data_processed" ]; then rm -Rf /home/deeprank2/tutorials/data_processed; fi && \ wget https://zenodo.org/records/8349335/files/data_raw.zip && \ unzip data_raw.zip -d data_raw && \ - mv data_raw /home/deeprank2/tutorials + mv data_raw /home/deeprank2/tutorials && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* && \ + conda clean --tarballs --index-cache --packages --yes && \ + find ${CONDA_DIR} -follow -type f -name '*.a' -delete && \ + find ${CONDA_DIR} -follow -type f -name '*.pyc' -delete && \ + conda clean --force-pkgs-dirs --all --yes + +ADD ./tutorials /home/deeprank2/tutorials ENV PATH /opt/conda/envs/deeprank2/bin:$PATH From 7b25a1441dc6465b89953746b0a85e4ecb6a7981 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Wed, 4 Sep 2024 13:45:10 +0200 Subject: [PATCH 02/68] add action for releasing new docker image --- .github/workflows/publish_docker_image.yml | 59 ++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 .github/workflows/publish_docker_image.yml diff --git a/.github/workflows/publish_docker_image.yml b/.github/workflows/publish_docker_image.yml new file mode 100644 index 00000000..364587e5 --- /dev/null +++ b/.github/workflows/publish_docker_image.yml @@ -0,0 +1,59 @@ +# +name: Create and publish a Docker image + +# Configures this workflow to run every time a change is pushed to the branch called `release`. 
+on: + push: + branches: ["529_add_docker_testing_action_gcroci2"] + +# Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds. +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu. +jobs: + build-and-push-image: + runs-on: ubuntu-latest + # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job. + permissions: + contents: read + packages: write + attestations: write + id-token: write + # + steps: + - name: Checkout repository + uses: actions/checkout@v4 + # Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here. + - name: Log in to the Container registry + uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels. + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages. + # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository. + # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step. + - name: Build and push Docker image + id: push + uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 + with: + context: . + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + + # This step generates an artifact attestation for the image, which is an unforgeable statement about where and how it was built. It increases supply chain security for people who consume the image. For more information, see "[AUTOTITLE](/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds)." 
+ - name: Generate artifact attestation + uses: actions/attest-build-provenance@v1 + with: + subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}} + subject-digest: ${{ steps.push.outputs.digest }} + push-to-registry: true From ded33a70e6433c1954c7ca31d267e9ab19632b48 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Wed, 4 Sep 2024 13:46:11 +0200 Subject: [PATCH 03/68] add todo --- .github/workflows/publish_docker_image.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish_docker_image.yml b/.github/workflows/publish_docker_image.yml index 364587e5..3a7bb7f4 100644 --- a/.github/workflows/publish_docker_image.yml +++ b/.github/workflows/publish_docker_image.yml @@ -4,7 +4,7 @@ name: Create and publish a Docker image # Configures this workflow to run every time a change is pushed to the branch called `release`. on: push: - branches: ["529_add_docker_testing_action_gcroci2"] + branches: ["529_add_docker_testing_action_gcroci2"] #TODO: Change this to be dependant on the release action # Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds. env: From d8165c1c8fd9f4552f45cebf9752276d1706e8d8 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Wed, 4 Sep 2024 14:10:55 +0200 Subject: [PATCH 04/68] add logs --- .github/workflows/publish_docker_image.yml | 10 ++++++++++ Dockerfile | 6 +++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/workflows/publish_docker_image.yml b/.github/workflows/publish_docker_image.yml index 3a7bb7f4..2917630c 100644 --- a/.github/workflows/publish_docker_image.yml +++ b/.github/workflows/publish_docker_image.yml @@ -46,9 +46,19 @@ jobs: uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 with: context: . + file: ./Dockerfile push: true tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} + # Output verbose logs + build-args: BUILDKIT_INLINE_CACHE=1 + outputs: type=docker,dest=/tmp/myimage.tar + + - name: Output Docker build logs + run: | + echo "Docker build logs" + docker load < /tmp/myimage.tar + docker history ${{ steps.meta.outputs.tags }} # This step generates an artifact attestation for the image, which is an unforgeable statement about where and how it was built. It increases supply chain security for people who consume the image. For more information, see "[AUTOTITLE](/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds)." - name: Generate artifact attestation diff --git a/Dockerfile b/Dockerfile index 98d7eb80..5909ac8d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,7 +22,11 @@ RUN \ echo ". ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate base" >> /etc/skel/.bashrc && \ echo ". 
${CONDA_DIR}/etc/profile.d/conda.sh && conda activate base" >> ~/.bashrc -ADD ./env/deeprank2.yml /home/deeprank2 +RUN ls -la /home + +ADD ./env/deeprank2.yml /home/deeprank2/ + +RUN ls -la /home/deeprank2 RUN \ ## Create the environment and install the dependencies From 385a74887200accaae65605f85c77e69912c5eb0 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Wed, 4 Sep 2024 14:12:21 +0200 Subject: [PATCH 05/68] remove unsupported commands --- .github/workflows/publish_docker_image.yml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.github/workflows/publish_docker_image.yml b/.github/workflows/publish_docker_image.yml index 2917630c..fc783d50 100644 --- a/.github/workflows/publish_docker_image.yml +++ b/.github/workflows/publish_docker_image.yml @@ -50,15 +50,6 @@ jobs: push: true tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} - # Output verbose logs - build-args: BUILDKIT_INLINE_CACHE=1 - outputs: type=docker,dest=/tmp/myimage.tar - - - name: Output Docker build logs - run: | - echo "Docker build logs" - docker load < /tmp/myimage.tar - docker history ${{ steps.meta.outputs.tags }} # This step generates an artifact attestation for the image, which is an unforgeable statement about where and how it was built. It increases supply chain security for people who consume the image. For more information, see "[AUTOTITLE](/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds)." - name: Generate artifact attestation From ed3b17d55821424b295e1280b2dae946a09c48c8 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Wed, 4 Sep 2024 14:44:23 +0200 Subject: [PATCH 06/68] specify platform for building the image in the action --- .github/workflows/publish_docker_image.yml | 1 + Dockerfile | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/publish_docker_image.yml b/.github/workflows/publish_docker_image.yml index fc783d50..06231866 100644 --- a/.github/workflows/publish_docker_image.yml +++ b/.github/workflows/publish_docker_image.yml @@ -46,6 +46,7 @@ jobs: uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 with: context: . + platforms: linux/amd64 file: ./Dockerfile push: true tags: ${{ steps.meta.outputs.tags }} diff --git a/Dockerfile b/Dockerfile index 5909ac8d..b26ada57 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,12 +22,8 @@ RUN \ echo ". ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate base" >> /etc/skel/.bashrc && \ echo ". ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate base" >> ~/.bashrc -RUN ls -la /home - ADD ./env/deeprank2.yml /home/deeprank2/ -RUN ls -la /home/deeprank2 - RUN \ ## Create the environment and install the dependencies mamba env create -f /home/deeprank2/deeprank2.yml && \ From 56d4f77935b91ec23bfe87635a68976b0fe779ec Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Wed, 4 Sep 2024 15:00:21 +0200 Subject: [PATCH 07/68] add tags and multiarch build --- .github/workflows/publish_docker_image.yml | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/publish_docker_image.yml b/.github/workflows/publish_docker_image.yml index 06231866..c835cdca 100644 --- a/.github/workflows/publish_docker_image.yml +++ b/.github/workflows/publish_docker_image.yml @@ -25,7 +25,13 @@ jobs: steps: - name: Checkout repository uses: actions/checkout@v4 - # Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. 
Once published, the packages are scoped to the account defined here. + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Log in to the Container registry uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 with: @@ -38,12 +44,20 @@ jobs: uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 with: images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=raw,value=latest,enable={{is_default_branch}} + type=ref,event=branch + type=ref,event=pr + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=semver,pattern={{major}} + type=sha # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages. # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository. # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step. - name: Build and push Docker image id: push - uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 + uses: docker/build-push-action@v5 with: context: . platforms: linux/amd64 From c464425f2459ee1f82ac2a81fe18db8b2142dc9f Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Wed, 4 Sep 2024 15:57:24 +0200 Subject: [PATCH 08/68] add debug runs --- .github/workflows/publish_docker_image.yml | 61 +++++++--------------- 1 file changed, 19 insertions(+), 42 deletions(-) diff --git a/.github/workflows/publish_docker_image.yml b/.github/workflows/publish_docker_image.yml index c835cdca..70b341cd 100644 --- a/.github/workflows/publish_docker_image.yml +++ b/.github/workflows/publish_docker_image.yml @@ -1,75 +1,52 @@ -# -name: Create and publish a Docker image +name: Create and publish a Docker image with manifest -# Configures this workflow to run every time a change is pushed to the branch called `release`. on: push: - branches: ["529_add_docker_testing_action_gcroci2"] #TODO: Change this to be dependant on the release action + branches: ["529_add_docker_testing_action_gcroci2"] -# Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds. env: REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository }} -# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu. jobs: build-and-push-image: runs-on: ubuntu-latest - # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job. permissions: contents: read packages: write - attestations: write - id-token: write - # steps: - name: Checkout repository uses: actions/checkout@v4 - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Log in to the Container registry - uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 + uses: docker/login-action@v3 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. 
The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels. - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - tags: | - type=raw,value=latest,enable={{is_default_branch}} - type=ref,event=branch - type=ref,event=pr - type=semver,pattern={{version}} - type=semver,pattern={{major}}.{{minor}} - type=semver,pattern={{major}} - type=sha - # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages. - # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository. - # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step. + - name: Build and push Docker image - id: push uses: docker/build-push-action@v5 with: context: . - platforms: linux/amd64 file: ./Dockerfile push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} + tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest-amd64 + platforms: linux/amd64 - # This step generates an artifact attestation for the image, which is an unforgeable statement about where and how it was built. It increases supply chain security for people who consume the image. For more information, see "[AUTOTITLE](/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds)." - - name: Generate artifact attestation - uses: actions/attest-build-provenance@v1 - with: - subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}} - subject-digest: ${{ steps.push.outputs.digest }} - push-to-registry: true + - name: Create and push manifest + run: | + docker manifest create ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest \ + ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest-amd64 + docker manifest annotate ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest \ + ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest-amd64 --os linux --arch amd64 + docker manifest push ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest + + - name: Check pushed image and manifest + run: | + docker manifest inspect ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest + docker pull ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest + docker image inspect ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest From 5b93d05ea2af76d4e5d3ec989b45a7d055c7827e Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Wed, 4 Sep 2024 16:01:24 +0200 Subject: [PATCH 09/68] restore the inital file --- .github/workflows/publish_docker_image.yml | 57 ++++++++++++---------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/.github/workflows/publish_docker_image.yml b/.github/workflows/publish_docker_image.yml index 70b341cd..0cc2bf66 100644 --- a/.github/workflows/publish_docker_image.yml +++ b/.github/workflows/publish_docker_image.yml @@ -1,52 +1,59 @@ -name: Create and publish a Docker image with manifest +# +name: Create and publish a Docker image +# Configures this workflow to run every time a change is pushed to the branch called `release`. 
on: push: - branches: ["529_add_docker_testing_action_gcroci2"] + branches: ["release"] +# Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds. env: REGISTRY: ghcr.io IMAGE_NAME: ${{ github.repository }} +# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu. jobs: build-and-push-image: runs-on: ubuntu-latest + # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job. permissions: contents: read packages: write + attestations: write + id-token: write + # steps: - name: Checkout repository uses: actions/checkout@v4 - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - + # Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here. - name: Log in to the Container registry - uses: docker/login-action@v3 + uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 with: registry: ${{ env.REGISTRY }} username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} - + # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels. + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages. + # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository. + # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step. - name: Build and push Docker image - uses: docker/build-push-action@v5 + id: push + uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 with: context: . - file: ./Dockerfile push: true - tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest-amd64 - platforms: linux/amd64 - - - name: Create and push manifest - run: | - docker manifest create ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest \ - ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest-amd64 - docker manifest annotate ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest \ - ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest-amd64 --os linux --arch amd64 - docker manifest push ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} - - name: Check pushed image and manifest - run: | - docker manifest inspect ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest - docker pull ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest - docker image inspect ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest + # This step generates an artifact attestation for the image, which is an unforgeable statement about where and how it was built. 
It increases supply chain security for people who consume the image. For more information, see "[AUTOTITLE](/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds)." + - name: Generate artifact attestation + uses: actions/attest-build-provenance@v1 + with: + subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}} + subject-digest: ${{ steps.push.outputs.digest }} + push-to-registry: true From d33106f6d3519604417a0781f215865a7cc0d12a Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Wed, 4 Sep 2024 16:52:26 +0200 Subject: [PATCH 10/68] point to the current branch --- .github/workflows/publish_docker_image.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish_docker_image.yml b/.github/workflows/publish_docker_image.yml index 0cc2bf66..364587e5 100644 --- a/.github/workflows/publish_docker_image.yml +++ b/.github/workflows/publish_docker_image.yml @@ -4,7 +4,7 @@ name: Create and publish a Docker image # Configures this workflow to run every time a change is pushed to the branch called `release`. on: push: - branches: ["release"] + branches: ["529_add_docker_testing_action_gcroci2"] # Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds. env: From 03014a873bfc17dc21c886868cf160cf6cc0d554 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Wed, 4 Sep 2024 17:24:27 +0200 Subject: [PATCH 11/68] try rsd workflow --- .github/workflows/_ghcr.yml | 70 ++++++++++++++ .github/workflows/publish_docker_image.yml | 103 ++++++++++++--------- 2 files changed, 128 insertions(+), 45 deletions(-) create mode 100644 .github/workflows/_ghcr.yml diff --git a/.github/workflows/_ghcr.yml b/.github/workflows/_ghcr.yml new file mode 100644 index 00000000..abdc910f --- /dev/null +++ b/.github/workflows/_ghcr.yml @@ -0,0 +1,70 @@ +# SPDX-FileCopyrightText: 2022 Dusan Mijatovic (dv4all) +# SPDX-FileCopyrightText: 2022 dv4all +# SPDX-FileCopyrightText: 2024 Ewan Cahen (Netherlands eScience Center) +# SPDX-FileCopyrightText: 2024 Netherlands eScience Center +# +# SPDX-License-Identifier: Apache-2.0 + +name: reusable ghcr.io module + +on: + workflow_call: + inputs: + ghcr_user: + required: true + description: User for logging to ghcr.io (use github.actor) + type: string + base_image_name: + required: true + description: Base image name incl. 
ghcr.io + type: string + image_tag: + required: true + description: Image tag (version) + type: string + dockerfile: + required: true + description: Location and name of docker file + type: string + docker_context: + required: true + description: Docker context for the build command + type: string + secrets: + token: + required: true + outputs: + image_created: + description: Full image name after upload to ghcr.io + value: ${{jobs.build_and_push.outputs.image_build}} + image_uploaded: + description: Confirmation that image is uploaded to ghcr.io + value: ${{jobs.build_and_push.outputs.image_pushed}} + +jobs: + build_and_push: + name: build and push image + runs-on: ubuntu-22.04 + outputs: + image_build: ${{steps.build_image.outputs.image_build}} + image_pushed: ${{steps.build_image.outputs.image_pushed}} + steps: + - name: checkout + # https://github.com/actions/checkout + uses: actions/checkout@v4 + - name: build + id: build_image + run: | + IMAGE_TAG_VERSION=${{inputs.base_image_name}}:${{inputs.image_tag}} + IMAGE_TAG_LASTEST=${{inputs.base_image_name}}:latest + echo image_tag_version $IMAGE_TAG_VERSION + docker build -t $IMAGE_TAG_VERSION -t $IMAGE_TAG_LASTEST -f ${{inputs.dockerfile}} ${{inputs.docker_context}} + echo "{image_build}={$IMAGE_TAG_VERSIONvalue}" >> $GITHUB_OUTPUT + - name: push to ghcr.io + id: push_image + run: | + echo login + echo "${{secrets.token}}" | docker login https://ghcr.io -u ${{inputs.ghcr_user}} --password-stdin + echo push auth image with all tags + docker push ${{inputs.base_image_name}} --all-tags + echo "{image_build}={$IMAGE_TAG_VERSIONvalue}" >> $GITHUB_OUTPUT diff --git a/.github/workflows/publish_docker_image.yml b/.github/workflows/publish_docker_image.yml index 364587e5..daa9694b 100644 --- a/.github/workflows/publish_docker_image.yml +++ b/.github/workflows/publish_docker_image.yml @@ -1,4 +1,13 @@ +# SPDX-FileCopyrightText: 2022 Christian Meeßen (GFZ) +# SPDX-FileCopyrightText: 2022 Dusan Mijatovic (dv4all) +# SPDX-FileCopyrightText: 2022 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences +# SPDX-FileCopyrightText: 2022 dv4all +# SPDX-FileCopyrightText: 2023 - 2024 Dusan Mijatovic (Netherlands eScience Center) +# SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) +# SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center # +# SPDX-License-Identifier: Apache-2.0 + name: Create and publish a Docker image # Configures this workflow to run every time a change is pushed to the branch called `release`. @@ -6,54 +15,58 @@ on: push: branches: ["529_add_docker_testing_action_gcroci2"] -# Defines two custom environment variables for the workflow. These are used for the Container registry domain, and a name for the Docker image that this workflow builds. -env: - REGISTRY: ghcr.io - IMAGE_NAME: ${{ github.repository }} - -# There is a single job in this workflow. It's configured to run on the latest available version of Ubuntu. jobs: - build-and-push-image: - runs-on: ubuntu-latest - # Sets the permissions granted to the `GITHUB_TOKEN` for the actions in this job. 
- permissions: - contents: read - packages: write - attestations: write - id-token: write - # + release_tag: + name: conventional changelog + runs-on: ubuntu-22.04 + outputs: + changelog: ${{steps.changelog.outputs.clean_changelog}} + tag: ${{steps.changelog.outputs.tag}} + skipped: ${{steps.changelog.outputs.skipped}} steps: - - name: Checkout repository + - name: checkout all history + # https://github.com/actions/checkout uses: actions/checkout@v4 - # Uses the `docker/login-action` action to log in to the Container registry registry using the account and password that will publish the packages. Once published, the packages are scoped to the account defined here. - - name: Log in to the Container registry - uses: docker/login-action@65b78e6e13532edd9afa3aa52ac7964289d1a9c1 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - # This step uses [docker/metadata-action](https://github.com/docker/metadata-action#about) to extract tags and labels that will be applied to the specified image. The `id` "meta" allows the output of this step to be referenced in a subsequent step. The `images` value provides the base name for the tags and labels. - - name: Extract metadata (tags, labels) for Docker - id: meta - uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - # This step uses the `docker/build-push-action` action to build the image, based on your repository's `Dockerfile`. If the build succeeds, it pushes the image to GitHub Packages. - # It uses the `context` parameter to define the build's context as the set of files located in the specified path. For more information, see "[Usage](https://github.com/docker/build-push-action#usage)" in the README of the `docker/build-push-action` repository. - # It uses the `tags` and `labels` parameters to tag and label the image with the output from the "meta" step. - - name: Build and push Docker image - id: push - uses: docker/build-push-action@f2a1d5e99d037542a71f64918e516c093c6f3fc4 - with: - context: . - push: true - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} + # checkout whole history + fetch-depth: 0 - # This step generates an artifact attestation for the image, which is an unforgeable statement about where and how it was built. It increases supply chain security for people who consume the image. For more information, see "[AUTOTITLE](/actions/security-guides/using-artifact-attestations-to-establish-provenance-for-builds)." 
- - name: Generate artifact attestation - uses: actions/attest-build-provenance@v1 + - name: calculate new version and create changelog content + id: changelog + # https://github.com/TriPSs/conventional-changelog-action + uses: TriPSs/conventional-changelog-action@v5 with: - subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}} - subject-digest: ${{ steps.push.outputs.digest }} - push-to-registry: true + # you can also create separate token to trace action + github-token: "${{secrets.GITHUB_TOKEN}}" + # do not create changelog file, the content is used at next step for relase body + output-file: false + # do not create additional commit, just tag current commit with the version + skip-commit: true + # do not pull - we already checked out the selection we want to use for versioning in previous step + skip-git-pull: true + # skip tag push - it will not push but it will tag + git-push: false + + log_release_tag: + needs: release_tag + name: log version output + runs-on: ubuntu-22.04 + steps: + - name: info + run: | + echo skipped=${{needs.release_tag.outputs.skipped}} + echo tag=${{needs.release_tag.outputs.tag}} + auth: + # it needs to be checked on string value + if: needs.release_tag.outputs.skipped == 'false' + needs: release_tag + name: docker_image_deeprank2 + uses: ./.github/workflows/_ghcr.yml + with: + ghcr_user: ${{github.actor}} + base_image_name: ghcr.io/deeprank/deeprank2 + image_tag: ${{needs.release_tag.outputs.tag}} + dockerfile: ./Dockerfile + docker_context: . + secrets: + token: ${{secrets.GITHUB_TOKEN}} From 4547855a921c2b37ca579cf33a170ef8a962a35e Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Wed, 4 Sep 2024 17:25:52 +0200 Subject: [PATCH 12/68] remove skip --- .github/workflows/publish_docker_image.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/publish_docker_image.yml b/.github/workflows/publish_docker_image.yml index daa9694b..f91640a5 100644 --- a/.github/workflows/publish_docker_image.yml +++ b/.github/workflows/publish_docker_image.yml @@ -56,9 +56,9 @@ jobs: run: | echo skipped=${{needs.release_tag.outputs.skipped}} echo tag=${{needs.release_tag.outputs.tag}} - auth: - # it needs to be checked on string value - if: needs.release_tag.outputs.skipped == 'false' + docker_image_deeprank2: + # # it needs to be checked on string value + # if: needs.release_tag.outputs.skipped == 'false' needs: release_tag name: docker_image_deeprank2 uses: ./.github/workflows/_ghcr.yml From f01cfc379e892cdf36addd2e8905fac4cdf9418c Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Wed, 4 Sep 2024 17:27:49 +0200 Subject: [PATCH 13/68] fix base_image_name --- .github/workflows/publish_docker_image.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish_docker_image.yml b/.github/workflows/publish_docker_image.yml index f91640a5..c638bda0 100644 --- a/.github/workflows/publish_docker_image.yml +++ b/.github/workflows/publish_docker_image.yml @@ -64,7 +64,7 @@ jobs: uses: ./.github/workflows/_ghcr.yml with: ghcr_user: ${{github.actor}} - base_image_name: ghcr.io/deeprank/deeprank2 + base_image_name: ghcr.io/DeepRank/deeprank2 image_tag: ${{needs.release_tag.outputs.tag}} dockerfile: ./Dockerfile docker_context: . 
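A note on the reusable `_ghcr.yml` workflow introduced in PATCH 11 and wired up in the patches above: `$GITHUB_OUTPUT` expects plain `name=value` lines, so `echo "{image_build}={$IMAGE_TAG_VERSIONvalue}" >> $GITHUB_OUTPUT` does not actually set the `image_build`/`image_pushed` step outputs that the job outputs (and the `image_created`/`image_uploaded` workflow outputs) read, and the push step reuses `$IMAGE_TAG_VERSION` from a different step's shell, where it is undefined. A minimal corrected sketch of the two steps, keeping the same step ids and inputs and assuming `image_pushed` is meant to be reported by the push step, could look like this:

      - name: build
        id: build_image
        run: |
          IMAGE_TAG_VERSION=${{inputs.base_image_name}}:${{inputs.image_tag}}
          IMAGE_TAG_LATEST=${{inputs.base_image_name}}:latest
          docker build -t $IMAGE_TAG_VERSION -t $IMAGE_TAG_LATEST -f ${{inputs.dockerfile}} ${{inputs.docker_context}}
          # write the step output as name=value, the format $GITHUB_OUTPUT expects
          echo "image_build=$IMAGE_TAG_VERSION" >> "$GITHUB_OUTPUT"
      - name: push to ghcr.io
        id: push_image
        run: |
          # recompute the tag here: each run block is a separate shell, so variables do not carry over between steps
          IMAGE_TAG_VERSION=${{inputs.base_image_name}}:${{inputs.image_tag}}
          echo "${{secrets.token}}" | docker login https://ghcr.io -u ${{inputs.ghcr_user}} --password-stdin
          docker push ${{inputs.base_image_name}} --all-tags
          echo "image_pushed=$IMAGE_TAG_VERSION" >> "$GITHUB_OUTPUT"

With that change, the job-level `image_pushed` output would also need to read from `steps.push_image` rather than `steps.build_image`.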
From 92e4be6e38af89af9d0df40cedbe10fb2b6c1aab Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Wed, 4 Sep 2024 17:29:23 +0200 Subject: [PATCH 14/68] use latest --- .github/workflows/publish_docker_image.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish_docker_image.yml b/.github/workflows/publish_docker_image.yml index c638bda0..5b65fa4b 100644 --- a/.github/workflows/publish_docker_image.yml +++ b/.github/workflows/publish_docker_image.yml @@ -65,7 +65,7 @@ jobs: with: ghcr_user: ${{github.actor}} base_image_name: ghcr.io/DeepRank/deeprank2 - image_tag: ${{needs.release_tag.outputs.tag}} + image_tag: latest dockerfile: ./Dockerfile docker_context: . secrets: From ff9d1a9e5af259025265228f4150d1fa4ffac21e Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Wed, 4 Sep 2024 17:30:24 +0200 Subject: [PATCH 15/68] lowercase --- .github/workflows/publish_docker_image.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish_docker_image.yml b/.github/workflows/publish_docker_image.yml index 5b65fa4b..553b40b7 100644 --- a/.github/workflows/publish_docker_image.yml +++ b/.github/workflows/publish_docker_image.yml @@ -64,7 +64,7 @@ jobs: uses: ./.github/workflows/_ghcr.yml with: ghcr_user: ${{github.actor}} - base_image_name: ghcr.io/DeepRank/deeprank2 + base_image_name: ghcr.io/deeprank/deeprank2 image_tag: latest dockerfile: ./Dockerfile docker_context: . From 394c13242afac2add0eb9c6658287a542e234716 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 12:03:12 +0200 Subject: [PATCH 16/68] move ghcr to actions/ folder --- .../_ghcr.yml => actions/install-python-and-package/ghcr.yml} | 0 .github/workflows/publish_docker_image.yml | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename .github/{workflows/_ghcr.yml => actions/install-python-and-package/ghcr.yml} (100%) diff --git a/.github/workflows/_ghcr.yml b/.github/actions/install-python-and-package/ghcr.yml similarity index 100% rename from .github/workflows/_ghcr.yml rename to .github/actions/install-python-and-package/ghcr.yml diff --git a/.github/workflows/publish_docker_image.yml b/.github/workflows/publish_docker_image.yml index 553b40b7..7abce280 100644 --- a/.github/workflows/publish_docker_image.yml +++ b/.github/workflows/publish_docker_image.yml @@ -61,7 +61,7 @@ jobs: # if: needs.release_tag.outputs.skipped == 'false' needs: release_tag name: docker_image_deeprank2 - uses: ./.github/workflows/_ghcr.yml + uses: ./.github/actions/ghcr.yml with: ghcr_user: ${{github.actor}} base_image_name: ghcr.io/deeprank/deeprank2 From 0051d34b221204017aa85890ea19c25863762802 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 12:06:21 +0200 Subject: [PATCH 17/68] fix folders structure --- .github/actions/{install-python-and-package => }/ghcr.yml | 0 .../action.yml => install-python-and-package.yml} | 0 .github/workflows/build-latest-release.yml | 2 +- .github/workflows/build-repo-frozen-env.yml | 2 +- .github/workflows/build-repo.yml | 2 +- .github/workflows/coveralls.yml | 2 +- .github/workflows/notebooks.yml | 2 +- .github/workflows/release.yml | 2 +- 8 files changed, 6 insertions(+), 6 deletions(-) rename .github/actions/{install-python-and-package => }/ghcr.yml (100%) rename .github/actions/{install-python-and-package/action.yml => install-python-and-package.yml} (100%) diff --git a/.github/actions/install-python-and-package/ghcr.yml b/.github/actions/ghcr.yml similarity index 100% rename from .github/actions/install-python-and-package/ghcr.yml rename 
to .github/actions/ghcr.yml diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package.yml similarity index 100% rename from .github/actions/install-python-and-package/action.yml rename to .github/actions/install-python-and-package.yml diff --git a/.github/workflows/build-latest-release.yml b/.github/workflows/build-latest-release.yml index 9daa9936..bc1c8d40 100644 --- a/.github/workflows/build-latest-release.yml +++ b/.github/workflows/build-latest-release.yml @@ -25,7 +25,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package + - uses: ./.github/actions/install-python-and-package.yml with: python-version: ${{ matrix.python-version }} pkg-installation-type: "latest" diff --git a/.github/workflows/build-repo-frozen-env.yml b/.github/workflows/build-repo-frozen-env.yml index 47f29dce..d3c71dbb 100644 --- a/.github/workflows/build-repo-frozen-env.yml +++ b/.github/workflows/build-repo-frozen-env.yml @@ -47,7 +47,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package + - uses: ./.github/actions/install-python-and-package.yml with: python-version: ${{ matrix.python-version }} extras-require: test, publishing diff --git a/.github/workflows/build-repo.yml b/.github/workflows/build-repo.yml index 7af4cc7a..20654ae5 100644 --- a/.github/workflows/build-repo.yml +++ b/.github/workflows/build-repo.yml @@ -47,7 +47,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package + - uses: ./.github/actions/install-python-and-package.yml with: python-version: ${{ matrix.python-version }} extras-require: test, publishing diff --git a/.github/workflows/coveralls.yml b/.github/workflows/coveralls.yml index eb4feff2..437f76ac 100644 --- a/.github/workflows/coveralls.yml +++ b/.github/workflows/coveralls.yml @@ -46,7 +46,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package + - uses: ./.github/actions/install-python-and-package.yml with: python-version: ${{ matrix.python-version }} extras-require: test diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml index b1e59ea9..1546c06f 100644 --- a/.github/workflows/notebooks.yml +++ b/.github/workflows/notebooks.yml @@ -45,7 +45,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package + - uses: ./.github/actions/install-python-and-package.yml with: python-version: ${{ matrix.python-version }} extras-require: test, notebooks diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0f85061b..fae4c93b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -24,7 +24,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package + - uses: ./.github/actions/install-python-and-package.yml with: python-version: ${{ matrix.python-version }} extras-require: publishing From 0bb5a1bf7bccbe511e831c896d1247bc6fc6fca5 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 12:11:22 +0200 Subject: [PATCH 18/68] move actions to workflows --- .github/{actions/ghcr.yml => workflows/ _ghcr.yml} | 0 .../_install-python-and-package.yml} | 0 .github/workflows/build-latest-release.yml | 2 +- .github/workflows/build-repo-frozen-env.yml | 2 +- .github/workflows/build-repo.yml | 2 +- .github/workflows/coveralls.yml | 2 +- .github/workflows/notebooks.yml | 2 +- .github/workflows/publish_docker_image.yml 
| 2 +- .github/workflows/release.yml | 2 +- 9 files changed, 7 insertions(+), 7 deletions(-) rename .github/{actions/ghcr.yml => workflows/ _ghcr.yml} (100%) rename .github/{actions/install-python-and-package.yml => workflows/_install-python-and-package.yml} (100%) diff --git a/.github/actions/ghcr.yml b/.github/workflows/ _ghcr.yml similarity index 100% rename from .github/actions/ghcr.yml rename to .github/workflows/ _ghcr.yml diff --git a/.github/actions/install-python-and-package.yml b/.github/workflows/_install-python-and-package.yml similarity index 100% rename from .github/actions/install-python-and-package.yml rename to .github/workflows/_install-python-and-package.yml diff --git a/.github/workflows/build-latest-release.yml b/.github/workflows/build-latest-release.yml index bc1c8d40..ecd8e4af 100644 --- a/.github/workflows/build-latest-release.yml +++ b/.github/workflows/build-latest-release.yml @@ -25,7 +25,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package.yml + - uses: ./.github/workflows/_install-python-and-package.yml with: python-version: ${{ matrix.python-version }} pkg-installation-type: "latest" diff --git a/.github/workflows/build-repo-frozen-env.yml b/.github/workflows/build-repo-frozen-env.yml index d3c71dbb..dc1bc3c3 100644 --- a/.github/workflows/build-repo-frozen-env.yml +++ b/.github/workflows/build-repo-frozen-env.yml @@ -47,7 +47,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package.yml + - uses: ./.github/workflows/_install-python-and-package.yml with: python-version: ${{ matrix.python-version }} extras-require: test, publishing diff --git a/.github/workflows/build-repo.yml b/.github/workflows/build-repo.yml index 20654ae5..2b4e2015 100644 --- a/.github/workflows/build-repo.yml +++ b/.github/workflows/build-repo.yml @@ -47,7 +47,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package.yml + - uses: ./.github/workflows/_install-python-and-package.yml with: python-version: ${{ matrix.python-version }} extras-require: test, publishing diff --git a/.github/workflows/coveralls.yml b/.github/workflows/coveralls.yml index 437f76ac..2fd4a1a0 100644 --- a/.github/workflows/coveralls.yml +++ b/.github/workflows/coveralls.yml @@ -46,7 +46,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package.yml + - uses: ./.github/workflows/_install-python-and-package.yml with: python-version: ${{ matrix.python-version }} extras-require: test diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml index 1546c06f..75e1ea39 100644 --- a/.github/workflows/notebooks.yml +++ b/.github/workflows/notebooks.yml @@ -45,7 +45,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package.yml + - uses: ./.github/workflows/_install-python-and-package.yml with: python-version: ${{ matrix.python-version }} extras-require: test, notebooks diff --git a/.github/workflows/publish_docker_image.yml b/.github/workflows/publish_docker_image.yml index 7abce280..553b40b7 100644 --- a/.github/workflows/publish_docker_image.yml +++ b/.github/workflows/publish_docker_image.yml @@ -61,7 +61,7 @@ jobs: # if: needs.release_tag.outputs.skipped == 'false' needs: release_tag name: docker_image_deeprank2 - uses: ./.github/actions/ghcr.yml + uses: ./.github/workflows/_ghcr.yml with: ghcr_user: ${{github.actor}} base_image_name: ghcr.io/deeprank/deeprank2 diff --git 
a/.github/workflows/release.yml b/.github/workflows/release.yml index fae4c93b..ca949fa1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -24,7 +24,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package.yml + - uses: ./.github/workflows/_install-python-and-package.yml with: python-version: ${{ matrix.python-version }} extras-require: publishing From 56b5e878da133dad423f6181963bb29c2b7076b6 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 12:15:13 +0200 Subject: [PATCH 19/68] fix action error --- .../actions/install-python-and-package.yml | 84 +++++++++++++++++++ .../workflows/_install-python-and-package.yml | 81 ------------------ .github/workflows/build-latest-release.yml | 2 +- .github/workflows/build-repo-frozen-env.yml | 2 +- .github/workflows/build-repo.yml | 2 +- .github/workflows/coveralls.yml | 2 +- .github/workflows/notebooks.yml | 2 +- .github/workflows/release.yml | 2 +- 8 files changed, 90 insertions(+), 87 deletions(-) create mode 100644 .github/actions/install-python-and-package.yml delete mode 100644 .github/workflows/_install-python-and-package.yml diff --git a/.github/actions/install-python-and-package.yml b/.github/actions/install-python-and-package.yml new file mode 100644 index 00000000..b95bdc7f --- /dev/null +++ b/.github/actions/install-python-and-package.yml @@ -0,0 +1,84 @@ +name: "Install Python and DeepRank2" + +description: "Installs Python, updates pip and installs DeepRank2 together with its dependencies." + +on: + workflow_call: + + inputs: + python-version: + required: false + description: "The Python version to use. Specify major and minor version, e.g. '3.10'." + default: "3.10" + + extras-require: + required: false + description: "The extras dependencies packages to be installed, for instance 'dev' or 'dev,publishing,notebooks'." + default: "test" + + pkg-installation-type: + required: false + description: "The package installation type to install, the latest released version on PyPI ('latest') or the GitHub repository one ('repository')." 
+ default: "repository" + + runs: + using: "composite" + + steps: + - name: Cancel Previous Runs and Set up Python + uses: styfle/cancel-workflow-action@0.4.0 + with: + access_token: ${{ github.token }} + + - uses: actions/checkout@v3 + + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + miniforge-variant: Mambaforge + channels: conda-forge + python-version: ${{ inputs.python-version }} + activate-environment: deeprank2 + environment-file: env/deeprank2.yml + use-mamba: true + if: ${{ inputs.pkg-installation-type != 'frozen' }} + + - name: Setup miniconda with frozen dependencies + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + miniforge-variant: Mambaforge + channels: conda-forge + python-version: ${{ inputs.python-version }} + activate-environment: deeprank2 + environment-file: env/deeprank2_frozen.yml + use-mamba: true + if: ${{ inputs.pkg-installation-type == 'frozen' }} + + - run: | + conda --version + conda env list + shell: bash -l {0} + + - name: Python info + shell: bash -l {0} + run: | + which python3 + python3 --version + + - name: Install the GitHub repository version of the package + shell: bash -l {0} + if: ${{ inputs.pkg-installation-type == 'repository' || inputs.pkg-installation-type == 'frozen' }} + run: | + conda activate deeprank2 + pip install .'[${{ inputs.extras-require }}]' + + - name: Install the latest released PyPI version of the package + shell: bash -l {0} + if: ${{ inputs.pkg-installation-type == 'latest' }} + run: | + conda activate deeprank2 + pip install pytest + rm -r deeprank2 + pip install deeprank2 diff --git a/.github/workflows/_install-python-and-package.yml b/.github/workflows/_install-python-and-package.yml deleted file mode 100644 index 539cfe42..00000000 --- a/.github/workflows/_install-python-and-package.yml +++ /dev/null @@ -1,81 +0,0 @@ -name: "Install Python and DeepRank2" - -description: "Installs Python, updates pip and installs DeepRank2 together with its dependencies." - -inputs: - python-version: - required: false - description: "The Python version to use. Specify major and minor version, e.g. '3.10'." - default: "3.10" - - extras-require: - required: false - description: "The extras dependencies packages to be installed, for instance 'dev' or 'dev,publishing,notebooks'." - default: "test" - - pkg-installation-type: - required: false - description: "The package installation type to install, the latest released version on PyPI ('latest') or the GitHub repository one ('repository')." 
- default: "repository" - -runs: - using: "composite" - - steps: - - name: Cancel Previous Runs and Set up Python - uses: styfle/cancel-workflow-action@0.4.0 - with: - access_token: ${{ github.token }} - - - uses: actions/checkout@v3 - - - name: Setup miniconda - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - miniforge-variant: Mambaforge - channels: conda-forge - python-version: ${{ inputs.python-version }} - activate-environment: deeprank2 - environment-file: env/deeprank2.yml - use-mamba: true - if: ${{ inputs.pkg-installation-type != 'frozen' }} - - - name: Setup miniconda with frozen dependencies - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - miniforge-variant: Mambaforge - channels: conda-forge - python-version: ${{ inputs.python-version }} - activate-environment: deeprank2 - environment-file: env/deeprank2_frozen.yml - use-mamba: true - if: ${{ inputs.pkg-installation-type == 'frozen' }} - - - run: | - conda --version - conda env list - shell: bash -l {0} - - - name: Python info - shell: bash -l {0} - run: | - which python3 - python3 --version - - - name: Install the GitHub repository version of the package - shell: bash -l {0} - if: ${{ inputs.pkg-installation-type == 'repository' || inputs.pkg-installation-type == 'frozen' }} - run: | - conda activate deeprank2 - pip install .'[${{ inputs.extras-require }}]' - - - name: Install the latest released PyPI version of the package - shell: bash -l {0} - if: ${{ inputs.pkg-installation-type == 'latest' }} - run: | - conda activate deeprank2 - pip install pytest - rm -r deeprank2 - pip install deeprank2 diff --git a/.github/workflows/build-latest-release.yml b/.github/workflows/build-latest-release.yml index ecd8e4af..bc1c8d40 100644 --- a/.github/workflows/build-latest-release.yml +++ b/.github/workflows/build-latest-release.yml @@ -25,7 +25,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/workflows/_install-python-and-package.yml + - uses: ./.github/actions/install-python-and-package.yml with: python-version: ${{ matrix.python-version }} pkg-installation-type: "latest" diff --git a/.github/workflows/build-repo-frozen-env.yml b/.github/workflows/build-repo-frozen-env.yml index dc1bc3c3..d3c71dbb 100644 --- a/.github/workflows/build-repo-frozen-env.yml +++ b/.github/workflows/build-repo-frozen-env.yml @@ -47,7 +47,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/workflows/_install-python-and-package.yml + - uses: ./.github/actions/install-python-and-package.yml with: python-version: ${{ matrix.python-version }} extras-require: test, publishing diff --git a/.github/workflows/build-repo.yml b/.github/workflows/build-repo.yml index 2b4e2015..20654ae5 100644 --- a/.github/workflows/build-repo.yml +++ b/.github/workflows/build-repo.yml @@ -47,7 +47,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/workflows/_install-python-and-package.yml + - uses: ./.github/actions/install-python-and-package.yml with: python-version: ${{ matrix.python-version }} extras-require: test, publishing diff --git a/.github/workflows/coveralls.yml b/.github/workflows/coveralls.yml index 2fd4a1a0..437f76ac 100644 --- a/.github/workflows/coveralls.yml +++ b/.github/workflows/coveralls.yml @@ -46,7 +46,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/workflows/_install-python-and-package.yml + - uses: ./.github/actions/install-python-and-package.yml with: python-version: ${{ matrix.python-version }} extras-require: test diff --git 
a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml index 75e1ea39..1546c06f 100644 --- a/.github/workflows/notebooks.yml +++ b/.github/workflows/notebooks.yml @@ -45,7 +45,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/workflows/_install-python-and-package.yml + - uses: ./.github/actions/install-python-and-package.yml with: python-version: ${{ matrix.python-version }} extras-require: test, notebooks diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ca949fa1..fae4c93b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -24,7 +24,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/workflows/_install-python-and-package.yml + - uses: ./.github/actions/install-python-and-package.yml with: python-version: ${{ matrix.python-version }} extras-require: publishing From 77c4f0194643eda02314d50e574b8af5cb1ea775 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 12:16:51 +0200 Subject: [PATCH 20/68] restore old folder --- .../action.yml} | 0 .github/workflows/build-latest-release.yml | 2 +- .github/workflows/build-repo-frozen-env.yml | 2 +- .github/workflows/build-repo.yml | 2 +- .github/workflows/coveralls.yml | 2 +- .github/workflows/notebooks.yml | 2 +- .github/workflows/release.yml | 2 +- 7 files changed, 6 insertions(+), 6 deletions(-) rename .github/actions/{install-python-and-package.yml => install-python-and-package/action.yml} (100%) diff --git a/.github/actions/install-python-and-package.yml b/.github/actions/install-python-and-package/action.yml similarity index 100% rename from .github/actions/install-python-and-package.yml rename to .github/actions/install-python-and-package/action.yml diff --git a/.github/workflows/build-latest-release.yml b/.github/workflows/build-latest-release.yml index bc1c8d40..417174d9 100644 --- a/.github/workflows/build-latest-release.yml +++ b/.github/workflows/build-latest-release.yml @@ -25,7 +25,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package.yml + - uses: ./.github/actions/install-python-and-package/action.yml with: python-version: ${{ matrix.python-version }} pkg-installation-type: "latest" diff --git a/.github/workflows/build-repo-frozen-env.yml b/.github/workflows/build-repo-frozen-env.yml index d3c71dbb..7fb126c8 100644 --- a/.github/workflows/build-repo-frozen-env.yml +++ b/.github/workflows/build-repo-frozen-env.yml @@ -47,7 +47,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package.yml + - uses: ./.github/actions/install-python-and-package/action.yml with: python-version: ${{ matrix.python-version }} extras-require: test, publishing diff --git a/.github/workflows/build-repo.yml b/.github/workflows/build-repo.yml index 20654ae5..b5395395 100644 --- a/.github/workflows/build-repo.yml +++ b/.github/workflows/build-repo.yml @@ -47,7 +47,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package.yml + - uses: ./.github/actions/install-python-and-package/action.yml with: python-version: ${{ matrix.python-version }} extras-require: test, publishing diff --git a/.github/workflows/coveralls.yml b/.github/workflows/coveralls.yml index 437f76ac..e3a0f120 100644 --- a/.github/workflows/coveralls.yml +++ b/.github/workflows/coveralls.yml @@ -46,7 +46,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package.yml + - uses: 
./.github/actions/install-python-and-package/action.yml with: python-version: ${{ matrix.python-version }} extras-require: test diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml index 1546c06f..fce2b216 100644 --- a/.github/workflows/notebooks.yml +++ b/.github/workflows/notebooks.yml @@ -45,7 +45,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package.yml + - uses: ./.github/actions/install-python-and-package/action.yml with: python-version: ${{ matrix.python-version }} extras-require: test, notebooks diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index fae4c93b..af65141d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -24,7 +24,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package.yml + - uses: ./.github/actions/install-python-and-package/action.yml with: python-version: ${{ matrix.python-version }} extras-require: publishing From 24eea9047c120cc277a36ccc09d0bb0c66b3d77f Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 12:17:44 +0200 Subject: [PATCH 21/68] reindent yml content --- .../install-python-and-package/action.yml | 135 +++++++++--------- 1 file changed, 66 insertions(+), 69 deletions(-) diff --git a/.github/actions/install-python-and-package/action.yml b/.github/actions/install-python-and-package/action.yml index b95bdc7f..539cfe42 100644 --- a/.github/actions/install-python-and-package/action.yml +++ b/.github/actions/install-python-and-package/action.yml @@ -2,83 +2,80 @@ name: "Install Python and DeepRank2" description: "Installs Python, updates pip and installs DeepRank2 together with its dependencies." -on: - workflow_call: +inputs: + python-version: + required: false + description: "The Python version to use. Specify major and minor version, e.g. '3.10'." + default: "3.10" - inputs: - python-version: - required: false - description: "The Python version to use. Specify major and minor version, e.g. '3.10'." - default: "3.10" + extras-require: + required: false + description: "The extras dependencies packages to be installed, for instance 'dev' or 'dev,publishing,notebooks'." + default: "test" - extras-require: - required: false - description: "The extras dependencies packages to be installed, for instance 'dev' or 'dev,publishing,notebooks'." - default: "test" + pkg-installation-type: + required: false + description: "The package installation type to install, the latest released version on PyPI ('latest') or the GitHub repository one ('repository')." + default: "repository" - pkg-installation-type: - required: false - description: "The package installation type to install, the latest released version on PyPI ('latest') or the GitHub repository one ('repository')." 
- default: "repository" +runs: + using: "composite" - runs: - using: "composite" + steps: + - name: Cancel Previous Runs and Set up Python + uses: styfle/cancel-workflow-action@0.4.0 + with: + access_token: ${{ github.token }} - steps: - - name: Cancel Previous Runs and Set up Python - uses: styfle/cancel-workflow-action@0.4.0 - with: - access_token: ${{ github.token }} + - uses: actions/checkout@v3 - - uses: actions/checkout@v3 + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + miniforge-variant: Mambaforge + channels: conda-forge + python-version: ${{ inputs.python-version }} + activate-environment: deeprank2 + environment-file: env/deeprank2.yml + use-mamba: true + if: ${{ inputs.pkg-installation-type != 'frozen' }} - - name: Setup miniconda - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - miniforge-variant: Mambaforge - channels: conda-forge - python-version: ${{ inputs.python-version }} - activate-environment: deeprank2 - environment-file: env/deeprank2.yml - use-mamba: true - if: ${{ inputs.pkg-installation-type != 'frozen' }} + - name: Setup miniconda with frozen dependencies + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + miniforge-variant: Mambaforge + channels: conda-forge + python-version: ${{ inputs.python-version }} + activate-environment: deeprank2 + environment-file: env/deeprank2_frozen.yml + use-mamba: true + if: ${{ inputs.pkg-installation-type == 'frozen' }} - - name: Setup miniconda with frozen dependencies - uses: conda-incubator/setup-miniconda@v2 - with: - auto-update-conda: true - miniforge-variant: Mambaforge - channels: conda-forge - python-version: ${{ inputs.python-version }} - activate-environment: deeprank2 - environment-file: env/deeprank2_frozen.yml - use-mamba: true - if: ${{ inputs.pkg-installation-type == 'frozen' }} + - run: | + conda --version + conda env list + shell: bash -l {0} - - run: | - conda --version - conda env list - shell: bash -l {0} + - name: Python info + shell: bash -l {0} + run: | + which python3 + python3 --version - - name: Python info - shell: bash -l {0} - run: | - which python3 - python3 --version + - name: Install the GitHub repository version of the package + shell: bash -l {0} + if: ${{ inputs.pkg-installation-type == 'repository' || inputs.pkg-installation-type == 'frozen' }} + run: | + conda activate deeprank2 + pip install .'[${{ inputs.extras-require }}]' - - name: Install the GitHub repository version of the package - shell: bash -l {0} - if: ${{ inputs.pkg-installation-type == 'repository' || inputs.pkg-installation-type == 'frozen' }} - run: | - conda activate deeprank2 - pip install .'[${{ inputs.extras-require }}]' - - - name: Install the latest released PyPI version of the package - shell: bash -l {0} - if: ${{ inputs.pkg-installation-type == 'latest' }} - run: | - conda activate deeprank2 - pip install pytest - rm -r deeprank2 - pip install deeprank2 + - name: Install the latest released PyPI version of the package + shell: bash -l {0} + if: ${{ inputs.pkg-installation-type == 'latest' }} + run: | + conda activate deeprank2 + pip install pytest + rm -r deeprank2 + pip install deeprank2 From 6218944c3e8746f3c9b79b0ea7ba9d10c46c954c Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 12:19:51 +0200 Subject: [PATCH 22/68] fix path --- .github/workflows/build-latest-release.yml | 2 +- .github/workflows/build-repo-frozen-env.yml | 2 +- .github/workflows/build-repo.yml | 2 +- 
.github/workflows/coveralls.yml | 2 +- .github/workflows/notebooks.yml | 2 +- .github/workflows/release.yml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-latest-release.yml b/.github/workflows/build-latest-release.yml index 417174d9..9daa9936 100644 --- a/.github/workflows/build-latest-release.yml +++ b/.github/workflows/build-latest-release.yml @@ -25,7 +25,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package/action.yml + - uses: ./.github/actions/install-python-and-package with: python-version: ${{ matrix.python-version }} pkg-installation-type: "latest" diff --git a/.github/workflows/build-repo-frozen-env.yml b/.github/workflows/build-repo-frozen-env.yml index 7fb126c8..47f29dce 100644 --- a/.github/workflows/build-repo-frozen-env.yml +++ b/.github/workflows/build-repo-frozen-env.yml @@ -47,7 +47,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package/action.yml + - uses: ./.github/actions/install-python-and-package with: python-version: ${{ matrix.python-version }} extras-require: test, publishing diff --git a/.github/workflows/build-repo.yml b/.github/workflows/build-repo.yml index b5395395..7af4cc7a 100644 --- a/.github/workflows/build-repo.yml +++ b/.github/workflows/build-repo.yml @@ -47,7 +47,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package/action.yml + - uses: ./.github/actions/install-python-and-package with: python-version: ${{ matrix.python-version }} extras-require: test, publishing diff --git a/.github/workflows/coveralls.yml b/.github/workflows/coveralls.yml index e3a0f120..eb4feff2 100644 --- a/.github/workflows/coveralls.yml +++ b/.github/workflows/coveralls.yml @@ -46,7 +46,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package/action.yml + - uses: ./.github/actions/install-python-and-package with: python-version: ${{ matrix.python-version }} extras-require: test diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml index fce2b216..b1e59ea9 100644 --- a/.github/workflows/notebooks.yml +++ b/.github/workflows/notebooks.yml @@ -45,7 +45,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package/action.yml + - uses: ./.github/actions/install-python-and-package with: python-version: ${{ matrix.python-version }} extras-require: test, notebooks diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index af65141d..0f85061b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -24,7 +24,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: ./.github/actions/install-python-and-package/action.yml + - uses: ./.github/actions/install-python-and-package with: python-version: ${{ matrix.python-version }} extras-require: publishing From 45fa50afa84303b87716747a447cca938e9ea674 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 12:24:31 +0200 Subject: [PATCH 23/68] fix _ghcr link --- .github/workflows/{ _ghcr.yml => _ghcr.yml} | 0 .github/workflows/ghcr.yml | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{ _ghcr.yml => _ghcr.yml} (100%) create mode 100644 .github/workflows/ghcr.yml diff --git a/.github/workflows/ _ghcr.yml b/.github/workflows/_ghcr.yml similarity index 100% rename from .github/workflows/ _ghcr.yml rename to .github/workflows/_ghcr.yml diff --git a/.github/workflows/ghcr.yml 
b/.github/workflows/ghcr.yml new file mode 100644 index 00000000..e69de29b From a647d756f98a761e864b80262b8e9df212e8f3fd Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 12:35:26 +0200 Subject: [PATCH 24/68] add tag based on software version --- .github/workflows/publish_docker_image.yml | 53 ++++++---------------- 1 file changed, 13 insertions(+), 40 deletions(-) diff --git a/.github/workflows/publish_docker_image.yml b/.github/workflows/publish_docker_image.yml index 553b40b7..d3962f68 100644 --- a/.github/workflows/publish_docker_image.yml +++ b/.github/workflows/publish_docker_image.yml @@ -16,56 +16,29 @@ on: branches: ["529_add_docker_testing_action_gcroci2"] jobs: - release_tag: - name: conventional changelog - runs-on: ubuntu-22.04 + read_version: + name: Read version from TOML + runs-on: ubuntu-latest outputs: - changelog: ${{steps.changelog.outputs.clean_changelog}} - tag: ${{steps.changelog.outputs.tag}} - skipped: ${{steps.changelog.outputs.skipped}} + version: ${{ steps.get_version.outputs.VERSION }} steps: - - name: checkout all history - # https://github.com/actions/checkout + - name: Checkout code uses: actions/checkout@v4 - with: - # checkout whole history - fetch-depth: 0 - - name: calculate new version and create changelog content - id: changelog - # https://github.com/TriPSs/conventional-changelog-action - uses: TriPSs/conventional-changelog-action@v5 - with: - # you can also create separate token to trace action - github-token: "${{secrets.GITHUB_TOKEN}}" - # do not create changelog file, the content is used at next step for relase body - output-file: false - # do not create additional commit, just tag current commit with the version - skip-commit: true - # do not pull - we already checked out the selection we want to use for versioning in previous step - skip-git-pull: true - # skip tag push - it will not push but it will tag - git-push: false - - log_release_tag: - needs: release_tag - name: log version output - runs-on: ubuntu-22.04 - steps: - - name: info + - name: Read version from TOML + id: get_version run: | - echo skipped=${{needs.release_tag.outputs.skipped}} - echo tag=${{needs.release_tag.outputs.tag}} + VERSION=$(grep '^version =' pyproject.toml | awk -F '"' '{print $2}') + echo "VERSION=$VERSION" >> $GITHUB_OUTPUT + docker_image_deeprank2: - # # it needs to be checked on string value - # if: needs.release_tag.outputs.skipped == 'false' - needs: release_tag + needs: read_version name: docker_image_deeprank2 uses: ./.github/workflows/_ghcr.yml with: ghcr_user: ${{github.actor}} - base_image_name: ghcr.io/deeprank/deeprank2 - image_tag: latest + base_image_name: ghcr.io/${{ github.repository }} + image_tag: ${{ needs.read_version.outputs.version }} dockerfile: ./Dockerfile docker_context: . 
secrets: From a1ec7e59ff1e13a58b8fff6730bae7728b5bbc94 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 12:37:39 +0200 Subject: [PATCH 25/68] lowercase the organization --- .github/workflows/publish_docker_image.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/publish_docker_image.yml b/.github/workflows/publish_docker_image.yml index d3962f68..24a29898 100644 --- a/.github/workflows/publish_docker_image.yml +++ b/.github/workflows/publish_docker_image.yml @@ -21,6 +21,7 @@ jobs: runs-on: ubuntu-latest outputs: version: ${{ steps.get_version.outputs.VERSION }} + repo_lowercase: ${{ steps.repo_lowercase.outputs.REPO_LOWERCASE }} steps: - name: Checkout code uses: actions/checkout@v4 @@ -31,13 +32,19 @@ jobs: VERSION=$(grep '^version =' pyproject.toml | awk -F '"' '{print $2}') echo "VERSION=$VERSION" >> $GITHUB_OUTPUT + - name: Convert repository name to lowercase + id: repo_lowercase + run: | + REPO_LOWERCASE=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') + echo "REPO_LOWERCASE=$REPO_LOWERCASE" >> $GITHUB_OUTPUT + docker_image_deeprank2: needs: read_version name: docker_image_deeprank2 uses: ./.github/workflows/_ghcr.yml with: ghcr_user: ${{github.actor}} - base_image_name: ghcr.io/${{ github.repository }} + base_image_name: ghcr.io/${{ needs.read_version.outputs.repo_lowercase }} image_tag: ${{ needs.read_version.outputs.version }} dockerfile: ./Dockerfile docker_context: . From bcb67b2a2d08da08bccab62dd5b073d4e4d54cd7 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 12:46:19 +0200 Subject: [PATCH 26/68] try to increase patch version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4172c9c1..592edf19 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "deeprank2" -version = "3.0.5" +version = "3.0.6" description = "DeepRank2 is an open-source deep learning framework for data mining of protein-protein interfaces or single-residue missense variants." readme = "README.md" requires-python = ">=3.10" From 2e607170aaa3013515aa529d1d228fc239e6dcb9 Mon Sep 17 00:00:00 2001 From: DaniBodor Date: Mon, 5 Aug 2024 11:21:01 +0200 Subject: [PATCH 27/68] ci: update linter --- deeprank2/dataset.py | 4 ++-- pyproject.toml | 9 +++++---- tests/test_dataset.py | 2 +- tests/utils/test_graph.py | 2 +- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/deeprank2/dataset.py b/deeprank2/dataset.py index 18796ac2..144e8d0a 100644 --- a/deeprank2/dataset.py +++ b/deeprank2/dataset.py @@ -112,7 +112,7 @@ def _check_and_inherit_train( # noqa: C901 for key in data["features_transform"].values(): if key["transform"] is None: continue - key["transform"] = eval(key["transform"]) # noqa: S307, PGH001 + key["transform"] = eval(key["transform"]) # noqa: S307 except pickle.UnpicklingError as e: msg = "The path provided to `train_source` is not a valid DeepRank2 pre-trained model." 
raise ValueError(msg) from e @@ -277,7 +277,7 @@ def _filter_targets(self, grp: h5py.Group) -> bool: for operator_string in [">", "<", "==", "<=", ">=", "!="]: operation = operation.replace(operator_string, f"{target_value}" + operator_string) - if not eval(operation): # noqa: S307, PGH001 + if not eval(operation): # noqa: S307 return False elif target_condition is not None: diff --git a/pyproject.toml b/pyproject.toml index 4172c9c1..4172d7cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,8 +53,8 @@ dependencies = [ "python-louvain >= 0.16, < 1.0", "tqdm >= 4.66.4, < 5.0", "freesasa >= 2.1.1, < 3.0", - "biopython >= 1.83, < 2.0" - ] + "biopython >= 1.83, < 2.0", +] [project.optional-dependencies] # development dependency groups @@ -66,7 +66,7 @@ test = [ "pytest-cov >= 4.1.0, < 5.0", "pytest-runner >= 6.0.0, < 7.0", "coveralls >= 3.3.1, < 4.0", - "ruff == 0.5.1" + "ruff == 0.6.3", ] publishing = ["build", "twine", "wheel"] notebooks = ["nbmake"] @@ -88,7 +88,7 @@ include = ["deeprank2*"] [tool.pytest.ini_options] # pytest options: -ra: show summary info for all test outcomes -addopts = "-ra" +addopts = "-ra" [tool.ruff] output-format = "concise" @@ -148,3 +148,4 @@ isort.known-first-party = ["deeprank2"] ] "docs/*" = ["ALL"] "tests/perf/*" = ["T201"] # Use of print statements +"*.ipynb" = ["T201", "E402", "D103"] diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 932e7d3c..3d242425 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -1201,7 +1201,7 @@ def test_inherit_info_pretrained_model_graphdataset(self) -> None: for key in data["features_transform"].values(): if key["transform"] is None: continue - key["transform"] = eval(key["transform"]) # noqa: S307, PGH001 + key["transform"] = eval(key["transform"]) # noqa: S307 dataset_test_vars = vars(dataset_test) for param in dataset_test.inherited_params: diff --git a/tests/utils/test_graph.py b/tests/utils/test_graph.py index 39bd2c9c..5ca7d427 100644 --- a/tests/utils/test_graph.py +++ b/tests/utils/test_graph.py @@ -27,7 +27,7 @@ target_value = 1.0 -@pytest.fixture() +@pytest.fixture def graph() -> Graph: """Build a simple graph of two nodes and one edge in between them.""" # load the structure From 9fd9c11930a97006c555b4f932edb1bb8dd1357d Mon Sep 17 00:00:00 2001 From: DaniBodor Date: Mon, 5 Aug 2024 12:37:34 +0200 Subject: [PATCH 28/68] ci: lint _generate_testdata.ipynb --- tests/data/hdf5/_generate_testdata.ipynb | 25 ++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tests/data/hdf5/_generate_testdata.ipynb b/tests/data/hdf5/_generate_testdata.ipynb index b2fc2677..76289783 100644 --- a/tests/data/hdf5/_generate_testdata.ipynb +++ b/tests/data/hdf5/_generate_testdata.ipynb @@ -15,11 +15,8 @@ "PATH_TEST = ROOT / \"tests\"\n", "import glob\n", "import os\n", - "import re\n", - "import sys\n", "\n", "import h5py\n", - "import numpy as np\n", "import pandas as pd\n", "\n", "from deeprank2.dataset import save_hdf5_keys\n", @@ -79,7 +76,7 @@ " chain_ids=[chain_id1, chain_id2],\n", " targets=targets,\n", " pssm_paths={chain_id1: pssm_path1, chain_id2: pssm_path2},\n", - " )\n", + " ),\n", " )\n", "\n", " # Generate graphs and save them in hdf5 files\n", @@ -128,8 +125,8 @@ "csv_data = pd.read_csv(csv_file_path)\n", "csv_data.cluster = csv_data.cluster.fillna(-1)\n", "pdb_ids_csv = [pdb_file.split(\"/\")[-1].split(\".\")[0].replace(\"-\", \"_\") for pdb_file in pdb_files]\n", - "clusters = [csv_data[pdb_id == csv_data.ID].cluster.values[0] for pdb_id in pdb_ids_csv]\n", 
- "bas = [csv_data[pdb_id == csv_data.ID].measurement_value.values[0] for pdb_id in pdb_ids_csv]\n", + "clusters = [csv_data[pdb_id == csv_data.ID].cluster.to_numpy()[0] for pdb_id in pdb_ids_csv]\n", + "bas = [csv_data[pdb_id == csv_data.ID].measurement_value.to_numpy()[0] for pdb_id in pdb_ids_csv]\n", "\n", "queries = QueryCollection()\n", "print(f\"Adding {len(pdb_files)} queries to the query collection ...\")\n", @@ -147,7 +144,7 @@ " \"cluster\": clusters[i],\n", " },\n", " pssm_paths={\"M\": pssm_m[i], \"P\": pssm_p[i]},\n", - " )\n", + " ),\n", " )\n", "print(\"Queries created and ready to be processed.\\n\")\n", "\n", @@ -183,7 +180,7 @@ "test_ids = []\n", "\n", "with h5py.File(hdf5_path, \"r\") as hdf5:\n", - " for key in hdf5.keys():\n", + " for key in hdf5:\n", " feature_value = float(hdf5[key][target][feature][()])\n", " if feature_value in train_clusters:\n", " train_ids.append(key)\n", @@ -192,7 +189,7 @@ " elif feature_value in test_clusters:\n", " test_ids.append(key)\n", "\n", - " if feature_value in clusters.keys():\n", + " if feature_value in clusters:\n", " clusters[int(feature_value)] += 1\n", " else:\n", " clusters[int(feature_value)] = 1\n", @@ -278,8 +275,12 @@ " targets = compute_ppi_scores(pdb_path, ref_path)\n", " queries.add(\n", " ProteinProteinInterfaceQuery(\n", - " pdb_path=pdb_path, resolution=\"atom\", chain_ids=[chain_id1, chain_id2], targets=targets, pssm_paths={chain_id1: pssm_path1, chain_id2: pssm_path2}\n", - " )\n", + " pdb_path=pdb_path,\n", + " resolution=\"atom\",\n", + " chain_ids=[chain_id1, chain_id2],\n", + " targets=targets,\n", + " pssm_paths={chain_id1: pssm_path1, chain_id2: pssm_path2},\n", + " ),\n", " )\n", "\n", "# Generate graphs and save them in hdf5 files\n", @@ -303,7 +304,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.14" }, "orig_nbformat": 4, "vscode": { From 7ee75afa9254cbf69ed9d4a36ff419255439c927 Mon Sep 17 00:00:00 2001 From: DaniBodor Date: Mon, 5 Aug 2024 13:17:16 +0200 Subject: [PATCH 29/68] ci: fix linting for tutorial notebooks --- tutorials/data_generation_ppi.ipynb | 56 +- tutorials/data_generation_srv.ipynb | 59 +- tutorials/training.ipynb | 1539 ++++++++++++++------------- 3 files changed, 828 insertions(+), 826 deletions(-) diff --git a/tutorials/data_generation_ppi.ipynb b/tutorials/data_generation_ppi.ipynb index 2d1d9650..88c6c482 100644 --- a/tutorials/data_generation_ppi.ipynb +++ b/tutorials/data_generation_ppi.ipynb @@ -64,17 +64,18 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", - "import pandas as pd\n", "import glob\n", + "import os\n", + "\n", "import h5py\n", "import matplotlib.image as img\n", "import matplotlib.pyplot as plt\n", - "from deeprank2.query import QueryCollection\n", - "from deeprank2.query import ProteinProteinInterfaceQuery, ProteinProteinInterfaceQuery\n", + "import pandas as pd\n", + "\n", + "from deeprank2.dataset import GraphDataset\n", "from deeprank2.features import components, contact\n", - "from deeprank2.utils.grid import GridSettings, MapMethod\n", - "from deeprank2.dataset import GraphDataset" + "from deeprank2.query import ProteinProteinInterfaceQuery, QueryCollection\n", + "from deeprank2.utils.grid import GridSettings, MapMethod" ] }, { @@ -131,14 +132,15 @@ "metadata": {}, "outputs": [], "source": [ - "def get_pdb_files_and_target_data(data_path):\n", + "def get_pdb_files_and_target_data(data_path: str) -> tuple[list[str], list[float]]:\n", " csv_data = 
pd.read_csv(os.path.join(data_path, \"BA_values.csv\"))\n", " pdb_files = glob.glob(os.path.join(data_path, \"pdb\", \"*.pdb\"))\n", " pdb_files.sort()\n", " pdb_ids_csv = [pdb_file.split(\"/\")[-1].split(\".\")[0] for pdb_file in pdb_files]\n", " csv_data_indexed = csv_data.set_index(\"ID\")\n", " csv_data_indexed = csv_data_indexed.loc[pdb_ids_csv]\n", - " bas = csv_data_indexed.measurement_value.values.tolist()\n", + " bas = csv_data_indexed.measurement_value.tolist()\n", + "\n", " return pdb_files, bas\n", "\n", "\n", @@ -192,9 +194,9 @@ "\n", "influence_radius = 8 # max distance in Å between two interacting residues/atoms of two proteins\n", "max_edge_length = 8\n", + "binary_target_value = 500\n", "\n", "print(f\"Adding {len(pdb_files)} queries to the query collection ...\")\n", - "count = 0\n", "for i in range(len(pdb_files)):\n", " queries.add(\n", " ProteinProteinInterfaceQuery(\n", @@ -204,16 +206,15 @@ " influence_radius=influence_radius,\n", " max_edge_length=max_edge_length,\n", " targets={\n", - " \"binary\": int(float(bas[i]) <= 500), # binary target value\n", + " \"binary\": int(float(bas[i]) <= binary_target_value),\n", " \"BA\": bas[i], # continuous target value\n", " },\n", - " )\n", + " ),\n", " )\n", - " count += 1\n", - " if count % 20 == 0:\n", - " print(f\"{count} queries added to the collection.\")\n", + " if i + 1 % 20 == 0:\n", + " print(f\"{i+1} queries added to the collection.\")\n", "\n", - "print(\"Queries ready to be processed.\\n\")" + "print(f\"{i+1} queries ready to be processed.\\n\")" ] }, { @@ -340,8 +341,8 @@ "source": [ "processed_data = glob.glob(os.path.join(processed_data_path, \"residue\", \"*.hdf5\"))\n", "dataset = GraphDataset(processed_data, target=\"binary\")\n", - "df = dataset.hdf5_to_pandas()\n", - "df.head()" + "dataset_df = dataset.hdf5_to_pandas()\n", + "dataset_df.head()" ] }, { @@ -358,7 +359,7 @@ "metadata": {}, "outputs": [], "source": [ - "fname = os.path.join(processed_data_path, \"residue\", \"_\".join([\"res_mass\", \"distance\", \"electrostatic\"]))\n", + "fname = os.path.join(processed_data_path, \"residue\", \"res_mass_distance_electrostatic\")\n", "dataset.save_hist(features=[\"res_mass\", \"distance\", \"electrostatic\"], fname=fname)\n", "\n", "im = img.imread(fname + \".png\")\n", @@ -429,9 +430,9 @@ "\n", "influence_radius = 5 # max distance in Å between two interacting residues/atoms of two proteins\n", "max_edge_length = 5\n", + "binary_target_value = 500\n", "\n", "print(f\"Adding {len(pdb_files)} queries to the query collection ...\")\n", - "count = 0\n", "for i in range(len(pdb_files)):\n", " queries.add(\n", " ProteinProteinInterfaceQuery(\n", @@ -441,16 +442,15 @@ " influence_radius=influence_radius,\n", " max_edge_length=max_edge_length,\n", " targets={\n", - " \"binary\": int(float(bas[i]) <= 500), # binary target value\n", + " \"binary\": int(float(bas[i]) <= binary_target_value),\n", " \"BA\": bas[i], # continuous target value\n", " },\n", - " )\n", + " ),\n", " )\n", - " count += 1\n", - " if count % 20 == 0:\n", - " print(f\"{count} queries added to the collection.\")\n", + " if i + 1 % 20 == 0:\n", + " print(f\"{i+1} queries added to the collection.\")\n", "\n", - "print(\"Queries ready to be processed.\\n\")" + "print(f\"{i+1} queries ready to be processed.\\n\")" ] }, { @@ -495,8 +495,8 @@ "source": [ "processed_data = glob.glob(os.path.join(processed_data_path, \"atomic\", \"*.hdf5\"))\n", "dataset = GraphDataset(processed_data, target=\"binary\")\n", - "df = dataset.hdf5_to_pandas()\n", - "df.head()" + 
"dataset_df = dataset.hdf5_to_pandas()\n", + "dataset_df.head()" ] }, { @@ -540,7 +540,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.14" }, "orig_nbformat": 4 }, diff --git a/tutorials/data_generation_srv.ipynb b/tutorials/data_generation_srv.ipynb index 1a68f31a..83247e5d 100644 --- a/tutorials/data_generation_srv.ipynb +++ b/tutorials/data_generation_srv.ipynb @@ -64,18 +64,19 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", - "import pandas as pd\n", "import glob\n", + "import os\n", + "\n", "import h5py\n", "import matplotlib.image as img\n", "import matplotlib.pyplot as plt\n", - "from deeprank2.query import QueryCollection\n", - "from deeprank2.query import SingleResidueVariantQuery, SingleResidueVariantQuery\n", + "import pandas as pd\n", + "\n", + "from deeprank2.dataset import GraphDataset\n", "from deeprank2.domain.aminoacidlist import amino_acids_by_code\n", "from deeprank2.features import components, contact\n", - "from deeprank2.utils.grid import GridSettings, MapMethod\n", - "from deeprank2.dataset import GraphDataset" + "from deeprank2.query import QueryCollection, SingleResidueVariantQuery\n", + "from deeprank2.utils.grid import GridSettings, MapMethod" ] }, { @@ -132,19 +133,20 @@ "metadata": {}, "outputs": [], "source": [ - "def get_pdb_files_and_target_data(data_path):\n", + "def get_pdb_files_and_target_data(data_path: str) -> tuple[list[str], list[int], list[str], list[str], list[float]]:\n", " csv_data = pd.read_csv(os.path.join(data_path, \"srv_target_values.csv\"))\n", " pdb_files = glob.glob(os.path.join(data_path, \"pdb\", \"*.ent\"))\n", " pdb_files.sort()\n", " pdb_file_names = [os.path.basename(pdb_file) for pdb_file in pdb_files]\n", " csv_data_indexed = csv_data.set_index(\"pdb_file\")\n", " csv_data_indexed = csv_data_indexed.loc[pdb_file_names]\n", - " res_numbers = csv_data_indexed.res_number.values.tolist()\n", - " res_wildtypes = csv_data_indexed.res_wildtype.values.tolist()\n", - " res_variants = csv_data_indexed.res_variant.values.tolist()\n", - " targets = csv_data_indexed.target.values.tolist()\n", - " pdb_names = csv_data_indexed.index.values.tolist()\n", + " res_numbers = csv_data_indexed.res_number.tolist()\n", + " res_wildtypes = csv_data_indexed.res_wildtype.tolist()\n", + " res_variants = csv_data_indexed.res_variant.tolist()\n", + " targets = csv_data_indexed.target.tolist()\n", + " pdb_names = csv_data_indexed.index.tolist()\n", " pdb_files = [data_path + \"/pdb/\" + pdb_name for pdb_name in pdb_names]\n", + "\n", " return pdb_files, res_numbers, res_wildtypes, res_variants, targets\n", "\n", "\n", @@ -204,7 +206,6 @@ "max_edge_length = 4.5 # ??\n", "\n", "print(f\"Adding {len(pdb_files)} queries to the query collection ...\")\n", - "count = 0\n", "for i in range(len(pdb_files)):\n", " queries.add(\n", " SingleResidueVariantQuery(\n", @@ -218,13 +219,12 @@ " targets={\"binary\": targets[i]},\n", " influence_radius=influence_radius,\n", " max_edge_length=max_edge_length,\n", - " )\n", + " ),\n", " )\n", - " count += 1\n", - " if count % 20 == 0:\n", - " print(f\"{count} queries added to the collection.\")\n", + " if i + 1 % 20 == 0:\n", + " print(f\"{i+1} queries added to the collection.\")\n", "\n", - "print(f\"Queries ready to be processed.\\n\")" + "print(f\"{i+1} queries ready to be processed.\\n\")" ] }, { @@ -358,8 +358,8 @@ "source": [ "processed_data = glob.glob(os.path.join(processed_data_path, \"residue\", \"*.hdf5\"))\n", "dataset = 
GraphDataset(processed_data, target=\"binary\")\n", - "df = dataset.hdf5_to_pandas()\n", - "df.head()" + "dataset_df = dataset.hdf5_to_pandas()\n", + "dataset_df.head()" ] }, { @@ -376,7 +376,8 @@ "metadata": {}, "outputs": [], "source": [ - "fname = os.path.join(processed_data_path, \"residue\", \"_\".join([\"res_mass\", \"distance\", \"electrostatic\"]))\n", + "fname = os.path.join(processed_data_path, \"residue\", \"res_mass_distance_electrostatic\")\n", + "\n", "dataset.save_hist(features=[\"res_mass\", \"distance\", \"electrostatic\"], fname=fname)\n", "\n", "im = img.imread(fname + \".png\")\n", @@ -450,7 +451,6 @@ "max_edge_length = 4.5 # ??\n", "\n", "print(f\"Adding {len(pdb_files)} queries to the query collection ...\")\n", - "count = 0\n", "for i in range(len(pdb_files)):\n", " queries.add(\n", " SingleResidueVariantQuery(\n", @@ -464,13 +464,12 @@ " targets={\"binary\": targets[i]},\n", " influence_radius=influence_radius,\n", " max_edge_length=max_edge_length,\n", - " )\n", + " ),\n", " )\n", - " count += 1\n", - " if count % 20 == 0:\n", - " print(f\"{count} queries added to the collection.\")\n", + " if i + 1 % 20 == 0:\n", + " print(f\"{i+1} queries added to the collection.\")\n", "\n", - "print(\"Queries ready to be processed.\\n\")" + "print(f\"{i+1} queries ready to be processed.\\n\")" ] }, { @@ -515,8 +514,8 @@ "source": [ "processed_data = glob.glob(os.path.join(processed_data_path, \"atomic\", \"*.hdf5\"))\n", "dataset = GraphDataset(processed_data, target=\"binary\")\n", - "df = dataset.hdf5_to_pandas()\n", - "df.head()" + "dataset_df = dataset.hdf5_to_pandas()\n", + "dataset_df.head()" ] }, { @@ -565,7 +564,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.14" }, "orig_nbformat": 4 }, diff --git a/tutorials/training.ipynb b/tutorials/training.ipynb index 499c3f8e..073a4062 100644 --- a/tutorials/training.ipynb +++ b/tutorials/training.ipynb @@ -1,770 +1,773 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Training Neural Networks\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introduction\n", - "\n", - "\n", - "\n", - "This tutorial will demonstrate the use of DeepRank2 for training graph neural networks (GNNs) and convolutional neural networks (CNNs) using protein-protein interface (PPI) or single-residue variant (SRV) data for classification and regression predictive tasks.\n", - "\n", - "This tutorial assumes that the PPI data of interest have already been generated and saved as [HDF5 files](https://en.wikipedia.org/wiki/Hierarchical_Data_Format), with the data structure that DeepRank2 expects. This data can be generated using the [data_generation_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_ppi.ipynb) tutorial or downloaded from Zenodo at [this record address](https://zenodo.org/record/8349335). 
For more details on the data structure, please refer to the other tutorial, which also contains a detailed description of how the data is generated from PDB files.\n", - "\n", - "This tutorial assumes also a basic knowledge of the [PyTorch](https://pytorch.org/) framework, on top of which the machine learning pipeline of DeepRank2 has been developed, for which many online tutorials exist.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Input data\n", - "\n", - "If you have previously run `data_generation_ppi.ipynb` or `data_generation_srv.ipynb` notebook, then their output can be directly used as input for this tutorial.\n", - "\n", - "Alternatively, preprocessed HDF5 files can be downloaded directly from Zenodo at [this record address](https://zenodo.org/record/8349335). To download the data used in this tutorial, please visit the link and download `data_processed.zip`. Unzip it, and save the `data_processed/` folder in the same directory as this notebook. The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", - "\n", - "Note that the datasets contain only ~100 data points each, which is not enough to develop an impactful predictive model, and the scope of their use is indeed only demonstrative and informative for the users.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Utilities\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Libraries\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The libraries needed for this tutorial:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import logging\n", - "import glob\n", - "import os\n", - "import h5py\n", - "import pandas as pd\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import roc_curve, auc, precision_score, recall_score, accuracy_score, f1_score\n", - "import plotly.express as px\n", - "import torch\n", - "import numpy as np\n", - "\n", - "np.seterr(divide=\"ignore\")\n", - "np.seterr(invalid=\"ignore\")\n", - "import pandas as pd\n", - "\n", - "logging.basicConfig(level=logging.INFO)\n", - "from deeprank2.dataset import GraphDataset, GridDataset\n", - "from deeprank2.trainer import Trainer\n", - "from deeprank2.neuralnets.gnn.vanilla_gnn import VanillaNetwork\n", - "from deeprank2.neuralnets.cnn.model3d import CnnClassification\n", - "from deeprank2.utils.exporters import HDF5OutputExporter\n", - "import warnings\n", - "\n", - "warnings.filterwarnings(\"ignore\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Paths and sets\n", - "\n", - "The paths for reading the processed data:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data_type = \"ppi\"\n", - "level = \"residue\"\n", - "processed_data_path = os.path.join(\"data_processed\", data_type, level)\n", - "input_data_path = glob.glob(os.path.join(processed_data_path, \"*.hdf5\"))\n", - "output_path = os.path.join(\"data_processed\", data_type, level) # for saving predictions results" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `data_type` can be either \"ppi\" or \"srv\", 
depending on which application the user is most interested in. The `level` can be either \"residue\" or \"atomic\", and refers to the structural resolution, where each node either represents a single residue or a single atom from the molecular structure.\n", - "\n", - "In this tutorial, we will use PPI residue-level data by default, but the same code can be applied to SRV or/and atomic-level data with no changes, apart from setting `data_type` and `level` parameters in the cell above.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A Pandas DataFrame containing data points' IDs and the binary target values can be defined:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_dict = {}\n", - "df_dict[\"entry\"] = []\n", - "df_dict[\"target\"] = []\n", - "for fname in input_data_path:\n", - " with h5py.File(fname, \"r\") as hdf5:\n", - " for mol in hdf5.keys():\n", - " target_value = float(hdf5[mol][\"target_values\"][\"binary\"][()])\n", - " df_dict[\"entry\"].append(mol)\n", - " df_dict[\"target\"].append(target_value)\n", - "\n", - "df = pd.DataFrame(data=df_dict)\n", - "df.head()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As explained in [data_generation_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_ppi.ipynb), for each data point there are two targets: \"BA\" and \"binary\". The first represents the strength of the interaction between two molecules that bind reversibly (interact) in nM, while the second represents its binary mapping, being 0 (BA > 500 nM) a not-binding complex and 1 (BA <= 500 nM) binding one.\n", - "\n", - "For SRVs, each data point has a single target, \"binary\", which is 0 if the SRV is considered benign, and 1 if it is pathogenic, as explained in [data_generation_srv.ipynb](https://github.com/DeepRank/deeprank-core/blob/main/tutorials/data_generation_srv.ipynb).\n", - "\n", - "The pandas DataFrame `df` is used only to split data points into training, validation and test sets according to the \"binary\" target - using target stratification to keep the proportion of 0s and 1s constant among the different sets. 
Training and validation sets will be used during the training for updating the network weights, while the test set will be held out as an independent test and will be used later for the model evaluation.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_train, df_test = train_test_split(df, test_size=0.1, stratify=df.target, random_state=42)\n", - "df_train, df_valid = train_test_split(df_train, test_size=0.2, stratify=df_train.target, random_state=42)\n", - "\n", - "print(f\"Data statistics:\\n\")\n", - "print(f\"Total samples: {len(df)}\\n\")\n", - "print(f\"Training set: {len(df_train)} samples, {round(100*len(df_train)/len(df))}%\")\n", - "print(f\"\\t- Class 0: {len(df_train[df_train.target == 0])} samples, {round(100*len(df_train[df_train.target == 0])/len(df_train))}%\")\n", - "print(f\"\\t- Class 1: {len(df_train[df_train.target == 1])} samples, {round(100*len(df_train[df_train.target == 1])/len(df_train))}%\")\n", - "print(f\"Validation set: {len(df_valid)} samples, {round(100*len(df_valid)/len(df))}%\")\n", - "print(f\"\\t- Class 0: {len(df_valid[df_valid.target == 0])} samples, {round(100*len(df_valid[df_valid.target == 0])/len(df_valid))}%\")\n", - "print(f\"\\t- Class 1: {len(df_valid[df_valid.target == 1])} samples, {round(100*len(df_valid[df_valid.target == 1])/len(df_valid))}%\")\n", - "print(f\"Testing set: {len(df_test)} samples, {round(100*len(df_test)/len(df))}%\")\n", - "print(f\"\\t- Class 0: {len(df_test[df_test.target == 0])} samples, {round(100*len(df_test[df_test.target == 0])/len(df_test))}%\")\n", - "print(f\"\\t- Class 1: {len(df_test[df_test.target == 1])} samples, {round(100*len(df_test[df_test.target == 1])/len(df_test))}%\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Classification example\n", - "\n", - "A GNN and a CNN can be trained for a classification predictive task, which consists in predicting the \"binary\" target values.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### GNN\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### GraphDataset\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For training GNNs the user can create `GraphDataset` instances. This class inherits from `DeeprankDataset` class, which in turns inherits from `Dataset` [PyTorch geometric class](https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/data/dataset.html), a base class for creating graph datasets.\n", - "\n", - "A few notes about `GraphDataset` parameters:\n", - "\n", - "- By default, all features contained in the HDF5 files are used, but the user can specify `node_features` and `edge_features` in `GraphDataset` if not all of them are needed. See the [docs](https://deeprank2.readthedocs.io/en/latest/features.html) for more details about all the possible pre-implemented features.\n", - "- For regression, `task` should be set to `regress` and the `target` to `BA`, which is a continuous variable and therefore suitable for regression tasks.\n", - "- For the `GraphDataset` class it is possible to define a dictionary to indicate which transformations to apply to the features, being the transformations lambda functions and/or standardization.\n", - " - If the `standardize` key is `True`, standardization is applied after transformation. 
Standardization consists in applying the following formula on each feature's value: ${x' = \\frac{x - \\mu}{\\sigma}}$, being ${\\mu}$ the mean and ${\\sigma}$ the standard deviation. Standardization is a scaling method where the values are centered around mean with a unit standard deviation.\n", - " - The transformation to apply can be speficied as a lambda function as a value of the key `transform`, which defaults to `None`.\n", - " - Since in the provided example standardization is applied, the training features' means and standard deviations need to be used for scaling validation and test sets. For doing so, `train_source` parameter is used. When `train_source` parameter is set, it will be used to scale the validation/testing sets. You need to pass `features_transform` to the training dataset only, since in other cases it will be ignored and only the one of `train_source` will be considered.\n", - " - Note that transformations have not currently been implemented for the `GridDataset` class.\n", - " - In the example below a logarithmic transformation and then the standardization are applied to all the features. It is also possible to use specific features as keys for indicating that transformation and/or standardization need to be apply to few features only.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "target = \"binary\"\n", - "task = \"classif\"\n", - "node_features = [\"res_type\"]\n", - "edge_features = [\"distance\"]\n", - "features_transform = {\"all\": {\"transform\": lambda x: np.cbrt(x), \"standardize\": True}}\n", - "\n", - "print(\"Loading training data...\")\n", - "dataset_train = GraphDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_train.entry), # selects only data points with ids in df_train.entry\n", - " node_features=node_features,\n", - " edge_features=edge_features,\n", - " features_transform=features_transform,\n", - " target=target,\n", - " task=task,\n", - ")\n", - "print(\"\\nLoading validation data...\")\n", - "dataset_val = GraphDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_valid.entry), # selects only data points with ids in df_valid.entry\n", - " train_source=dataset_train,\n", - ")\n", - "print(\"\\nLoading test data...\")\n", - "dataset_test = GraphDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_test.entry), # selects only data points with ids in df_test.entry\n", - " train_source=dataset_train,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Trainer\n", - "\n", - "The class `Trainer` implements training, validation and testing of PyTorch-based neural networks.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A few notes about `Trainer` parameters:\n", - "\n", - "- `neuralnet` can be any neural network class that inherits from `torch.nn.Module`, and it shouldn't be specific to regression or classification in terms of output shape. The `Trainer` class takes care of formatting the output shape according to the task. This tutorial uses a simple network, `VanillaNetwork` (implemented in `deeprank2.neuralnets.gnn.vanilla_gnn`). 
All GNN architectures already implemented in the pakcage can be found [here](https://github.com/DeepRank/deeprank-core/tree/main/deeprank2/neuralnets/gnn) and can be used for training or as a basis for implementing new ones.\n", - "- `class_weights` is used for classification tasks only and assigns class weights based on the training dataset content to account for any potential inbalance between the classes. In this case the dataset is balanced (50% 0 and 50% 1), so it is not necessary to use it. It defaults to False.\n", - "- `cuda` and `ngpu` are used for indicating whether to use CUDA and how many GPUs. By default, CUDA is not used and `ngpu` is 0.\n", - "- The user can specify a deeprank2 exporter or a custom one in `output_exporters` parameter, together with the path where to save the results. Exporters are used for storing predictions information collected later on during training and testing. Later the results saved by `HDF5OutputExporter` will be read and evaluated.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Training\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "trainer = Trainer(\n", - " neuralnet=VanillaNetwork,\n", - " dataset_train=dataset_train,\n", - " dataset_val=dataset_val,\n", - " dataset_test=dataset_test,\n", - " output_exporters=[HDF5OutputExporter(os.path.join(output_path, f\"gnn_{task}\"))],\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The default optimizer is `torch.optim.Adam`. It is possible to specify optimizer's parameters or to use another PyTorch optimizer object:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "optimizer = torch.optim.SGD\n", - "lr = 1e-3\n", - "weight_decay = 0.001\n", - "\n", - "trainer.configure_optimizers(optimizer, lr, weight_decay)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The default loss function for classification is `torch.nn.CrossEntropyLoss` and for regression it is `torch.nn.MSELoss`. It is also possible to set some other PyTorch loss functions by using `Trainer.set_lossfunction` method, although not all are currently implemented.\n", - "\n", - "Then the model can be trained using the `train()` method of the `Trainer` class.\n", - "\n", - "A few notes about `train()` method parameters:\n", - "\n", - "- `earlystop_patience`, `earlystop_maxgap` and `min_epoch` are used for controlling early stopping logic. `earlystop_patience` indicates the number of epochs after which the training ends if the validation loss does not improve. `earlystop_maxgap` indicated the maximum difference allowed between validation and training loss, and `min_epoch` is the minimum number of epochs to be reached before evaluating `maxgap`.\n", - "- If `validate` is set to `True`, validation is performed on an independent dataset, which has been called `dataset_val` few cells above. If set to `False`, validation is performed on the training dataset itself (not recommended).\n", - "- `num_workers` can be set for indicating how many subprocesses to use for data loading. 
The default is 0 and it means that the data will be loaded in the main process.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "epochs = 20\n", - "batch_size = 8\n", - "earlystop_patience = 5\n", - "earlystop_maxgap = 0.1\n", - "min_epoch = 10\n", - "\n", - "trainer.train(\n", - " nepoch=epochs,\n", - " batch_size=batch_size,\n", - " earlystop_patience=earlystop_patience,\n", - " earlystop_maxgap=earlystop_maxgap,\n", - " min_epoch=min_epoch,\n", - " validate=True,\n", - " filename=os.path.join(output_path, f\"gnn_{task}\", \"model.pth.tar\"),\n", - ")\n", - "\n", - "epoch = trainer.epoch_saved_model\n", - "print(f\"Model saved at epoch {epoch}\")\n", - "pytorch_total_params = sum(p.numel() for p in trainer.model.parameters())\n", - "print(f\"Total # of parameters: {pytorch_total_params}\")\n", - "pytorch_trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)\n", - "print(f\"Total # of trainable parameters: {pytorch_trainable_params}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Testing\n", - "\n", - "And the trained model can be tested on `dataset_test`:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "trainer.test()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Results visualization\n", - "\n", - "Finally, the results saved by `HDF5OutputExporter` can be inspected, which can be found in the `data/ppi/gnn_classif/` folder in the form of an HDF5 file, `output_exporter.hdf5`. Note that the folder contains the saved pre-trained model as well.\n", - "\n", - "`output_exporter.hdf5` contains [HDF5 Groups](https://docs.h5py.org/en/stable/high/group.html) which refer to each phase, e.g. training and testing if both are run, only one of them otherwise. Training phase includes validation results as well. This HDF5 file can be read as a Pandas Dataframe:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output_train = pd.read_hdf(os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"training\")\n", - "output_test = pd.read_hdf(os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\")\n", - "output_train.head()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results.\n", - "\n", - "For classification tasks, the `output` column contains a list of probabilities that each class occurs, and each list sums to 1 (for more details, please see documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html)). Note that the order of the classes in the list depends on the `classes` attribute of the DeeprankDataset instances. 
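A minimal sketch of how the `output` probabilities can be turned into hard labels, assuming the `output_test` dataframe loaded above and the default class order `[0, 1]` (this sketch is only an illustration and is not one of the original notebook cells):

```python
# Minimal illustrative sketch (not one of the original notebook cells):
# convert the per-class softmax probabilities in the "output" column into hard labels.
probs = np.array(output_test["output"].tolist())  # shape: (n_entries, n_classes)
p_class1 = probs[:, 1]                            # probability of class 1, given the default class order [0, 1]
y_pred = (p_class1 > 0.5).astype(int)             # hard labels at a 0.5 threshold
```

The metrics cells further below apply the same conversion before computing precision, recall, accuracy and F1.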
For classification tasks, if `classes` is not specified (as in this example case), it is defaulted to [0, 1].\n", - "\n", - "The loss across the epochs can be plotted for the training and the validation sets:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig = px.line(output_train, x=\"epoch\", y=\"loss\", color=\"phase\", markers=True)\n", - "\n", - "fig.add_vline(x=trainer.epoch_saved_model, line_width=3, line_dash=\"dash\", line_color=\"green\")\n", - "\n", - "fig.update_layout(\n", - " xaxis_title=\"Epoch #\",\n", - " yaxis_title=\"Loss\",\n", - " title=\"Loss vs epochs - GNN training\",\n", - " width=700,\n", - " height=400,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And now a few metrics of interest for classification tasks can be printed out: the [area under the ROC curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve) (AUC), and for a threshold of 0.5 the [precision, recall, accuracy and f1 score](https://en.wikipedia.org/wiki/Precision_and_recall#Definition).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "skip-execution" - ] - }, - "outputs": [], - "source": [ - "threshold = 0.5\n", - "df = pd.concat([output_train, output_test])\n", - "df_plot = df[(df.epoch == trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == \"testing\"))]\n", - "\n", - "for idx, set in enumerate([\"training\", \"validation\", \"testing\"]):\n", - " df_plot_phase = df_plot[(df_plot.phase == set)]\n", - " y_true = df_plot_phase.target\n", - " y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]\n", - "\n", - " print(f\"\\nMetrics for {set}:\")\n", - " fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)\n", - " auc_score = auc(fpr_roc, tpr_roc)\n", - " print(f\"AUC: {round(auc_score, 1)}\")\n", - " print(f\"Considering a threshold of {threshold}\")\n", - " y_pred = (y_score > threshold) * 1\n", - " print(f\"- Precision: {round(precision_score(y_true, y_pred), 1)}\")\n", - " print(f\"- Recall: {round(recall_score(y_true, y_pred), 1)}\")\n", - " print(f\"- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}\")\n", - " print(f\"- F1: {round(f1_score(y_true, y_pred), 1)}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that the poor performance of this network is due to the small number of datapoints used in this tutorial. For a more reliable network we suggest using a number of data points on the order of at least tens of thousands.\n", - "\n", - "The same exercise can be repeated but using grids instead of graphs and CNNs instead of GNNs.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### CNN\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### GridDataset\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For training CNNs the user can create `GridDataset` instances.\n", - "\n", - "A few notes about `GridDataset` parameters:\n", - "\n", - "- By default, all features contained in the HDF5 files are used, but the user can specify `features` in `GridDataset` if not all of them are needed. 
Since grids features are derived from node and edge features mapped from graphs to grid, the easiest way to see which features are available is to look at the HDF5 file, as explained in detail in `data_generation_ppi.ipynb` and `data_generation_srv.ipynb`, section \"Other tools\".\n", - "- As is the case for a `GraphDataset`, `task` can be assigned to `regress` and `target` to `BA` to perform a regression task. As mentioned previously, we do not provide sample data to perform a regression task for SRVs.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "target = \"binary\"\n", - "task = \"classif\"\n", - "\n", - "print(\"Loading training data...\")\n", - "dataset_train = GridDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_train.entry), # selects only data points with ids in df_train.entry\n", - " target=target,\n", - " task=task,\n", - ")\n", - "print(\"\\nLoading validation data...\")\n", - "dataset_val = GridDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_valid.entry), # selects only data points with ids in df_valid.entry\n", - " train_source=dataset_train,\n", - ")\n", - "print(\"\\nLoading test data...\")\n", - "dataset_test = GridDataset(\n", - " hdf5_path=input_data_path,\n", - " subset=list(df_test.entry), # selects only data points with ids in df_test.entry\n", - " train_source=dataset_train,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Trainer\n", - "\n", - "As for graphs, the class `Trainer` is used for training, validation and testing of the PyTorch-based CNN.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- Also in this case, `neuralnet` can be any neural network class that inherits from `torch.nn.Module`, and it shouldn't be specific to regression or classification in terms of output shape. This tutorial uses `CnnClassification` (implemented in `deeprank2.neuralnets.cnn.model3d`). 
All CNN architectures already implemented in the pakcage can be found [here](https://github.com/DeepRank/deeprank2/tree/main/deeprank2/neuralnets/cnn) and can be used for training or as a basis for implementing new ones.\n", - "- The rest of the `Trainer` parameters can be used as explained already for graphs.\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Training\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "optimizer = torch.optim.SGD\n", - "lr = 1e-3\n", - "weight_decay = 0.001\n", - "epochs = 20\n", - "batch_size = 8\n", - "earlystop_patience = 5\n", - "earlystop_maxgap = 0.1\n", - "min_epoch = 10\n", - "\n", - "trainer = Trainer(\n", - " neuralnet=CnnClassification,\n", - " dataset_train=dataset_train,\n", - " dataset_val=dataset_val,\n", - " dataset_test=dataset_test,\n", - " output_exporters=[HDF5OutputExporter(os.path.join(output_path, f\"cnn_{task}\"))],\n", - ")\n", - "\n", - "trainer.configure_optimizers(optimizer, lr, weight_decay)\n", - "\n", - "trainer.train(\n", - " nepoch=epochs,\n", - " batch_size=batch_size,\n", - " earlystop_patience=earlystop_patience,\n", - " earlystop_maxgap=earlystop_maxgap,\n", - " min_epoch=min_epoch,\n", - " validate=True,\n", - " filename=os.path.join(output_path, f\"cnn_{task}\", \"model.pth.tar\"),\n", - ")\n", - "\n", - "epoch = trainer.epoch_saved_model\n", - "print(f\"Model saved at epoch {epoch}\")\n", - "pytorch_total_params = sum(p.numel() for p in trainer.model.parameters())\n", - "print(f\"Total # of parameters: {pytorch_total_params}\")\n", - "pytorch_trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)\n", - "print(f\"Total # of trainable parameters: {pytorch_trainable_params}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Testing\n", - "\n", - "And the trained model can be tested on `dataset_test`:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "trainer.test()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Results visualization\n", - "\n", - "As for GNNs, the results saved by `HDF5OutputExporter` can be inspected, and are saved in the `data/ppi/cnn_classif/` or `data/srv/cnn_classif/` folder in the form of an HDF5 file, `output_exporter.hdf5`, together with the saved pre-trained model.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "output_train = pd.read_hdf(os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"training\")\n", - "output_test = pd.read_hdf(os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\")\n", - "output_train.head()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Also in this case, the loss across the epochs can be plotted for the training and the validation sets:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fig = px.line(output_train, x=\"epoch\", y=\"loss\", color=\"phase\", markers=True)\n", - "\n", - "fig.add_vline(x=trainer.epoch_saved_model, line_width=3, line_dash=\"dash\", line_color=\"green\")\n", - "\n", - "fig.update_layout(\n", - " xaxis_title=\"Epoch #\",\n", - " yaxis_title=\"Loss\",\n", - " title=\"Loss vs 
epochs - CNN training\",\n", - " width=700,\n", - " height=400,\n", - ")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And some metrics of interest for classification tasks:\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "threshold = 0.5\n", - "df = pd.concat([output_train, output_test])\n", - "df_plot = df[(df.epoch == trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == \"testing\"))]\n", - "\n", - "for idx, set in enumerate([\"training\", \"validation\", \"testing\"]):\n", - " df_plot_phase = df_plot[(df_plot.phase == set)]\n", - " y_true = df_plot_phase.target\n", - " y_score = np.array(df_plot_phase.output.values.tolist())[:, 1]\n", - "\n", - " print(f\"\\nMetrics for {set}:\")\n", - " fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)\n", - " auc_score = auc(fpr_roc, tpr_roc)\n", - " print(f\"AUC: {round(auc_score, 1)}\")\n", - " print(f\"Considering a threshold of {threshold}\")\n", - " y_pred = (y_score > threshold) * 1\n", - " print(f\"- Precision: {round(precision_score(y_true, y_pred), 1)}\")\n", - " print(f\"- Recall: {round(recall_score(y_true, y_pred), 1)}\")\n", - " print(f\"- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}\")\n", - " print(f\"- F1: {round(f1_score(y_true, y_pred), 1)}\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It's important to note that the dataset used in this analysis is not sufficiently large to provide conclusive and reliable insights. Depending on your specific application, you might find regression, classification, GNNs, and/or CNNs to be valuable options. Feel free to choose the approach that best aligns with your particular problem!\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "deeprank2", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training Neural Networks\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "\n", + "\n", + "\n", + "This tutorial will demonstrate the use of DeepRank2 for training graph neural networks (GNNs) and convolutional neural networks (CNNs) using protein-protein interface (PPI) or single-residue variant (SRV) data for classification and regression predictive tasks.\n", + "\n", + "This tutorial assumes that the PPI data of interest have already been generated and saved as [HDF5 files](https://en.wikipedia.org/wiki/Hierarchical_Data_Format), with the data structure that DeepRank2 expects. This data can be generated using the [data_generation_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_ppi.ipynb) tutorial or downloaded from Zenodo at [this record address](https://zenodo.org/record/8349335). 
For more details on the data structure, please refer to the other tutorial, which also contains a detailed description of how the data is generated from PDB files.\n", + "\n", + "This tutorial assumes also a basic knowledge of the [PyTorch](https://pytorch.org/) framework, on top of which the machine learning pipeline of DeepRank2 has been developed, for which many online tutorials exist.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Input data\n", + "\n", + "If you have previously run `data_generation_ppi.ipynb` or `data_generation_srv.ipynb` notebook, then their output can be directly used as input for this tutorial.\n", + "\n", + "Alternatively, preprocessed HDF5 files can be downloaded directly from Zenodo at [this record address](https://zenodo.org/record/8349335). To download the data used in this tutorial, please visit the link and download `data_processed.zip`. Unzip it, and save the `data_processed/` folder in the same directory as this notebook. The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", + "\n", + "Note that the datasets contain only ~100 data points each, which is not enough to develop an impactful predictive model, and the scope of their use is indeed only demonstrative and informative for the users.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Utilities\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Libraries\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The libraries needed for this tutorial:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "import logging\n", + "import os\n", + "import warnings\n", + "\n", + "import h5py\n", + "import numpy as np\n", + "import pandas as pd\n", + "import plotly.express as px\n", + "import torch\n", + "from sklearn.metrics import accuracy_score, auc, f1_score, precision_score, recall_score, roc_curve\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "from deeprank2.dataset import GraphDataset, GridDataset\n", + "from deeprank2.neuralnets.cnn.model3d import CnnClassification\n", + "from deeprank2.neuralnets.gnn.vanilla_gnn import VanillaNetwork\n", + "from deeprank2.trainer import Trainer\n", + "from deeprank2.utils.exporters import HDF5OutputExporter\n", + "\n", + "np.seterr(divide=\"ignore\")\n", + "np.seterr(invalid=\"ignore\")\n", + "\n", + "logging.basicConfig(level=logging.INFO)\n", + "\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "# ruff: noqa: PD901" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Paths and sets\n", + "\n", + "The paths for reading the processed data:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_type = \"ppi\"\n", + "level = \"residue\"\n", + "processed_data_path = os.path.join(\"data_processed\", data_type, level)\n", + "input_data_path = glob.glob(os.path.join(processed_data_path, \"*.hdf5\"))\n", + "output_path = os.path.join(\"data_processed\", data_type, level) # for saving predictions results" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `data_type` can be either 
\"ppi\" or \"srv\", depending on which application the user is most interested in. The `level` can be either \"residue\" or \"atomic\", and refers to the structural resolution, where each node either represents a single residue or a single atom from the molecular structure.\n", + "\n", + "In this tutorial, we will use PPI residue-level data by default, but the same code can be applied to SRV or/and atomic-level data with no changes, apart from setting `data_type` and `level` parameters in the cell above.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A Pandas DataFrame containing data points' IDs and the binary target values can be defined:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_dict = {}\n", + "df_dict[\"entry\"] = []\n", + "df_dict[\"target\"] = []\n", + "for fname in input_data_path:\n", + " with h5py.File(fname, \"r\") as hdf5:\n", + " for mol in hdf5:\n", + " target_value = float(hdf5[mol][\"target_values\"][\"binary\"][()])\n", + " df_dict[\"entry\"].append(mol)\n", + " df_dict[\"target\"].append(target_value)\n", + "\n", + "df = pd.DataFrame(data=df_dict)\n", + "df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As explained in [data_generation_ppi.ipynb](https://github.com/DeepRank/deeprank2/blob/main/tutorials/data_generation_ppi.ipynb), for each data point there are two targets: \"BA\" and \"binary\". The first represents the strength of the interaction between two molecules that bind reversibly (interact) in nM, while the second represents its binary mapping, being 0 (BA > 500 nM) a not-binding complex and 1 (BA <= 500 nM) binding one.\n", + "\n", + "For SRVs, each data point has a single target, \"binary\", which is 0 if the SRV is considered benign, and 1 if it is pathogenic, as explained in [data_generation_srv.ipynb](https://github.com/DeepRank/deeprank-core/blob/main/tutorials/data_generation_srv.ipynb).\n", + "\n", + "The pandas DataFrame `df` is used only to split data points into training, validation and test sets according to the \"binary\" target - using target stratification to keep the proportion of 0s and 1s constant among the different sets. 
Training and validation sets will be used during the training for updating the network weights, while the test set will be held out as an independent test and will be used later for the model evaluation.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_train, df_test = train_test_split(df, test_size=0.1, stratify=df.target, random_state=42)\n", + "df_train, df_valid = train_test_split(df_train, test_size=0.2, stratify=df_train.target, random_state=42)\n", + "\n", + "print(\"Data statistics:\\n\")\n", + "print(f\"Total samples: {len(df)}\\n\")\n", + "print(f\"Training set: {len(df_train)} samples, {round(100*len(df_train)/len(df))}%\")\n", + "print(f\"\\t- Class 0: {len(df_train[df_train.target == 0])} samples, {round(100*len(df_train[df_train.target == 0])/len(df_train))}%\")\n", + "print(f\"\\t- Class 1: {len(df_train[df_train.target == 1])} samples, {round(100*len(df_train[df_train.target == 1])/len(df_train))}%\")\n", + "print(f\"Validation set: {len(df_valid)} samples, {round(100*len(df_valid)/len(df))}%\")\n", + "print(f\"\\t- Class 0: {len(df_valid[df_valid.target == 0])} samples, {round(100*len(df_valid[df_valid.target == 0])/len(df_valid))}%\")\n", + "print(f\"\\t- Class 1: {len(df_valid[df_valid.target == 1])} samples, {round(100*len(df_valid[df_valid.target == 1])/len(df_valid))}%\")\n", + "print(f\"Testing set: {len(df_test)} samples, {round(100*len(df_test)/len(df))}%\")\n", + "print(f\"\\t- Class 0: {len(df_test[df_test.target == 0])} samples, {round(100*len(df_test[df_test.target == 0])/len(df_test))}%\")\n", + "print(f\"\\t- Class 1: {len(df_test[df_test.target == 1])} samples, {round(100*len(df_test[df_test.target == 1])/len(df_test))}%\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Classification example\n", + "\n", + "A GNN and a CNN can be trained for a classification predictive task, which consists in predicting the \"binary\" target values.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### GNN\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### GraphDataset\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For training GNNs the user can create `GraphDataset` instances. This class inherits from `DeeprankDataset` class, which in turns inherits from `Dataset` [PyTorch geometric class](https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/data/dataset.html), a base class for creating graph datasets.\n", + "\n", + "A few notes about `GraphDataset` parameters:\n", + "\n", + "- By default, all features contained in the HDF5 files are used, but the user can specify `node_features` and `edge_features` in `GraphDataset` if not all of them are needed. See the [docs](https://deeprank2.readthedocs.io/en/latest/features.html) for more details about all the possible pre-implemented features.\n", + "- For regression, `task` should be set to `regress` and the `target` to `BA`, which is a continuous variable and therefore suitable for regression tasks.\n", + "- For the `GraphDataset` class it is possible to define a dictionary to indicate which transformations to apply to the features, being the transformations lambda functions and/or standardization.\n", + " - If the `standardize` key is `True`, standardization is applied after transformation. 
Standardization consists in applying the following formula on each feature's value: ${x' = \\frac{x - \\mu}{\\sigma}}$, being ${\\mu}$ the mean and ${\\sigma}$ the standard deviation. Standardization is a scaling method where the values are centered around mean with a unit standard deviation.\n", + " - The transformation to apply can be speficied as a lambda function as a value of the key `transform`, which defaults to `None`.\n", + " - Since in the provided example standardization is applied, the training features' means and standard deviations need to be used for scaling validation and test sets. For doing so, `train_source` parameter is used. When `train_source` parameter is set, it will be used to scale the validation/testing sets. You need to pass `features_transform` to the training dataset only, since in other cases it will be ignored and only the one of `train_source` will be considered.\n", + " - Note that transformations have not currently been implemented for the `GridDataset` class.\n", + " - In the example below a logarithmic transformation and then the standardization are applied to all the features. It is also possible to use specific features as keys for indicating that transformation and/or standardization need to be apply to few features only.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "target = \"binary\"\n", + "task = \"classif\"\n", + "node_features = [\"res_type\"]\n", + "edge_features = [\"distance\"]\n", + "features_transform = {\"all\": {\"transform\": lambda x: np.cbrt(x), \"standardize\": True}}\n", + "\n", + "print(\"Loading training data...\")\n", + "dataset_train = GraphDataset(\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_train.entry), # selects only data points with ids in df_train.entry\n", + " node_features=node_features,\n", + " edge_features=edge_features,\n", + " features_transform=features_transform,\n", + " target=target,\n", + " task=task,\n", + ")\n", + "print(\"\\nLoading validation data...\")\n", + "dataset_val = GraphDataset(\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_valid.entry), # selects only data points with ids in df_valid.entry\n", + " train_source=dataset_train,\n", + ")\n", + "print(\"\\nLoading test data...\")\n", + "dataset_test = GraphDataset(\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_test.entry), # selects only data points with ids in df_test.entry\n", + " train_source=dataset_train,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Trainer\n", + "\n", + "The class `Trainer` implements training, validation and testing of PyTorch-based neural networks.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A few notes about `Trainer` parameters:\n", + "\n", + "- `neuralnet` can be any neural network class that inherits from `torch.nn.Module`, and it shouldn't be specific to regression or classification in terms of output shape. The `Trainer` class takes care of formatting the output shape according to the task. This tutorial uses a simple network, `VanillaNetwork` (implemented in `deeprank2.neuralnets.gnn.vanilla_gnn`). 
All GNN architectures already implemented in the pakcage can be found [here](https://github.com/DeepRank/deeprank-core/tree/main/deeprank2/neuralnets/gnn) and can be used for training or as a basis for implementing new ones.\n", + "- `class_weights` is used for classification tasks only and assigns class weights based on the training dataset content to account for any potential inbalance between the classes. In this case the dataset is balanced (50% 0 and 50% 1), so it is not necessary to use it. It defaults to False.\n", + "- `cuda` and `ngpu` are used for indicating whether to use CUDA and how many GPUs. By default, CUDA is not used and `ngpu` is 0.\n", + "- The user can specify a deeprank2 exporter or a custom one in `output_exporters` parameter, together with the path where to save the results. Exporters are used for storing predictions information collected later on during training and testing. Later the results saved by `HDF5OutputExporter` will be read and evaluated.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Training\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trainer = Trainer(\n", + " neuralnet=VanillaNetwork,\n", + " dataset_train=dataset_train,\n", + " dataset_val=dataset_val,\n", + " dataset_test=dataset_test,\n", + " output_exporters=[HDF5OutputExporter(os.path.join(output_path, f\"gnn_{task}\"))],\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The default optimizer is `torch.optim.Adam`. It is possible to specify optimizer's parameters or to use another PyTorch optimizer object:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "optimizer = torch.optim.SGD\n", + "lr = 1e-3\n", + "weight_decay = 0.001\n", + "\n", + "trainer.configure_optimizers(optimizer, lr, weight_decay)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The default loss function for classification is `torch.nn.CrossEntropyLoss` and for regression it is `torch.nn.MSELoss`. It is also possible to set some other PyTorch loss functions by using `Trainer.set_lossfunction` method, although not all are currently implemented.\n", + "\n", + "Then the model can be trained using the `train()` method of the `Trainer` class.\n", + "\n", + "A few notes about `train()` method parameters:\n", + "\n", + "- `earlystop_patience`, `earlystop_maxgap` and `min_epoch` are used for controlling early stopping logic. `earlystop_patience` indicates the number of epochs after which the training ends if the validation loss does not improve. `earlystop_maxgap` indicated the maximum difference allowed between validation and training loss, and `min_epoch` is the minimum number of epochs to be reached before evaluating `maxgap`.\n", + "- If `validate` is set to `True`, validation is performed on an independent dataset, which has been called `dataset_val` few cells above. If set to `False`, validation is performed on the training dataset itself (not recommended).\n", + "- `num_workers` can be set for indicating how many subprocesses to use for data loading. 
The default is 0 and it means that the data will be loaded in the main process.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "epochs = 20\n", + "batch_size = 8\n", + "earlystop_patience = 5\n", + "earlystop_maxgap = 0.1\n", + "min_epoch = 10\n", + "\n", + "trainer.train(\n", + " nepoch=epochs,\n", + " batch_size=batch_size,\n", + " earlystop_patience=earlystop_patience,\n", + " earlystop_maxgap=earlystop_maxgap,\n", + " min_epoch=min_epoch,\n", + " validate=True,\n", + " filename=os.path.join(output_path, f\"gnn_{task}\", \"model.pth.tar\"),\n", + ")\n", + "\n", + "epoch = trainer.epoch_saved_model\n", + "print(f\"Model saved at epoch {epoch}\")\n", + "pytorch_total_params = sum(p.numel() for p in trainer.model.parameters())\n", + "print(f\"Total # of parameters: {pytorch_total_params}\")\n", + "pytorch_trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)\n", + "print(f\"Total # of trainable parameters: {pytorch_trainable_params}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Testing\n", + "\n", + "And the trained model can be tested on `dataset_test`:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trainer.test()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Results visualization\n", + "\n", + "Finally, the results saved by `HDF5OutputExporter` can be inspected, which can be found in the `data/ppi/gnn_classif/` folder in the form of an HDF5 file, `output_exporter.hdf5`. Note that the folder contains the saved pre-trained model as well.\n", + "\n", + "`output_exporter.hdf5` contains [HDF5 Groups](https://docs.h5py.org/en/stable/high/group.html) which refer to each phase, e.g. training and testing if both are run, only one of them otherwise. Training phase includes validation results as well. This HDF5 file can be read as a Pandas Dataframe:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_train = pd.read_hdf(os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"training\")\n", + "output_test = pd.read_hdf(os.path.join(output_path, f\"gnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\")\n", + "output_train.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataframes contain `phase`, `epoch`, `entry`, `output`, `target`, and `loss` columns, and can be easily used to visualize the results.\n", + "\n", + "For classification tasks, the `output` column contains a list of probabilities that each class occurs, and each list sums to 1 (for more details, please see documentation on the [softmax function](https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html)). Note that the order of the classes in the list depends on the `classes` attribute of the DeeprankDataset instances. 
For classification tasks, if `classes` is not specified (as in this example case), it is defaulted to [0, 1].\n", + "\n", + "The loss across the epochs can be plotted for the training and the validation sets:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = px.line(output_train, x=\"epoch\", y=\"loss\", color=\"phase\", markers=True)\n", + "\n", + "fig.add_vline(x=trainer.epoch_saved_model, line_width=3, line_dash=\"dash\", line_color=\"green\")\n", + "\n", + "fig.update_layout(\n", + " xaxis_title=\"Epoch #\",\n", + " yaxis_title=\"Loss\",\n", + " title=\"Loss vs epochs - GNN training\",\n", + " width=700,\n", + " height=400,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now a few metrics of interest for classification tasks can be printed out: the [area under the ROC curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve) (AUC), and for a threshold of 0.5 the [precision, recall, accuracy and f1 score](https://en.wikipedia.org/wiki/Precision_and_recall#Definition).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "skip-execution" + ] + }, + "outputs": [], + "source": [ + "threshold = 0.5\n", + "df = pd.concat([output_train, output_test])\n", + "df_plot = df[(df.epoch == trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == \"testing\"))]\n", + "\n", + "for dataset in [\"training\", \"validation\", \"testing\"]:\n", + " df_plot_phase = df_plot[(df_plot.phase == dataset)]\n", + " y_true = df_plot_phase.target\n", + " y_score = np.array(df_plot_phase.output.tolist())[:, 1]\n", + "\n", + " print(f\"\\nMetrics for {dataset}:\")\n", + " fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)\n", + " auc_score = auc(fpr_roc, tpr_roc)\n", + " print(f\"AUC: {round(auc_score, 1)}\")\n", + " print(f\"Considering a threshold of {threshold}\")\n", + " y_pred = (y_score > threshold) * 1\n", + " print(f\"- Precision: {round(precision_score(y_true, y_pred), 1)}\")\n", + " print(f\"- Recall: {round(recall_score(y_true, y_pred), 1)}\")\n", + " print(f\"- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}\")\n", + " print(f\"- F1: {round(f1_score(y_true, y_pred), 1)}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the poor performance of this network is due to the small number of datapoints used in this tutorial. For a more reliable network we suggest using a number of data points on the order of at least tens of thousands.\n", + "\n", + "The same exercise can be repeated but using grids instead of graphs and CNNs instead of GNNs.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### CNN\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### GridDataset\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For training CNNs the user can create `GridDataset` instances.\n", + "\n", + "A few notes about `GridDataset` parameters:\n", + "\n", + "- By default, all features contained in the HDF5 files are used, but the user can specify `features` in `GridDataset` if not all of them are needed. 
Since grids features are derived from node and edge features mapped from graphs to grid, the easiest way to see which features are available is to look at the HDF5 file, as explained in detail in `data_generation_ppi.ipynb` and `data_generation_srv.ipynb`, section \"Other tools\".\n", + "- As is the case for a `GraphDataset`, `task` can be assigned to `regress` and `target` to `BA` to perform a regression task. As mentioned previously, we do not provide sample data to perform a regression task for SRVs.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "target = \"binary\"\n", + "task = \"classif\"\n", + "\n", + "print(\"Loading training data...\")\n", + "dataset_train = GridDataset(\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_train.entry), # selects only data points with ids in df_train.entry\n", + " target=target,\n", + " task=task,\n", + ")\n", + "print(\"\\nLoading validation data...\")\n", + "dataset_val = GridDataset(\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_valid.entry), # selects only data points with ids in df_valid.entry\n", + " train_source=dataset_train,\n", + ")\n", + "print(\"\\nLoading test data...\")\n", + "dataset_test = GridDataset(\n", + " hdf5_path=input_data_path,\n", + " subset=list(df_test.entry), # selects only data points with ids in df_test.entry\n", + " train_source=dataset_train,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Trainer\n", + "\n", + "As for graphs, the class `Trainer` is used for training, validation and testing of the PyTorch-based CNN.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Also in this case, `neuralnet` can be any neural network class that inherits from `torch.nn.Module`, and it shouldn't be specific to regression or classification in terms of output shape. This tutorial uses `CnnClassification` (implemented in `deeprank2.neuralnets.cnn.model3d`). 
All CNN architectures already implemented in the pakcage can be found [here](https://github.com/DeepRank/deeprank2/tree/main/deeprank2/neuralnets/cnn) and can be used for training or as a basis for implementing new ones.\n", + "- The rest of the `Trainer` parameters can be used as explained already for graphs.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Training\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "optimizer = torch.optim.SGD\n", + "lr = 1e-3\n", + "weight_decay = 0.001\n", + "epochs = 20\n", + "batch_size = 8\n", + "earlystop_patience = 5\n", + "earlystop_maxgap = 0.1\n", + "min_epoch = 10\n", + "\n", + "trainer = Trainer(\n", + " neuralnet=CnnClassification,\n", + " dataset_train=dataset_train,\n", + " dataset_val=dataset_val,\n", + " dataset_test=dataset_test,\n", + " output_exporters=[HDF5OutputExporter(os.path.join(output_path, f\"cnn_{task}\"))],\n", + ")\n", + "\n", + "trainer.configure_optimizers(optimizer, lr, weight_decay)\n", + "\n", + "trainer.train(\n", + " nepoch=epochs,\n", + " batch_size=batch_size,\n", + " earlystop_patience=earlystop_patience,\n", + " earlystop_maxgap=earlystop_maxgap,\n", + " min_epoch=min_epoch,\n", + " validate=True,\n", + " filename=os.path.join(output_path, f\"cnn_{task}\", \"model.pth.tar\"),\n", + ")\n", + "\n", + "epoch = trainer.epoch_saved_model\n", + "print(f\"Model saved at epoch {epoch}\")\n", + "pytorch_total_params = sum(p.numel() for p in trainer.model.parameters())\n", + "print(f\"Total # of parameters: {pytorch_total_params}\")\n", + "pytorch_trainable_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)\n", + "print(f\"Total # of trainable parameters: {pytorch_trainable_params}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Testing\n", + "\n", + "And the trained model can be tested on `dataset_test`:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "trainer.test()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Results visualization\n", + "\n", + "As for GNNs, the results saved by `HDF5OutputExporter` can be inspected, and are saved in the `data/ppi/cnn_classif/` or `data/srv/cnn_classif/` folder in the form of an HDF5 file, `output_exporter.hdf5`, together with the saved pre-trained model.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_train = pd.read_hdf(os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"training\")\n", + "output_test = pd.read_hdf(os.path.join(output_path, f\"cnn_{task}\", \"output_exporter.hdf5\"), key=\"testing\")\n", + "output_train.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Also in this case, the loss across the epochs can be plotted for the training and the validation sets:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = px.line(output_train, x=\"epoch\", y=\"loss\", color=\"phase\", markers=True)\n", + "\n", + "fig.add_vline(x=trainer.epoch_saved_model, line_width=3, line_dash=\"dash\", line_color=\"green\")\n", + "\n", + "fig.update_layout(\n", + " xaxis_title=\"Epoch #\",\n", + " yaxis_title=\"Loss\",\n", + " title=\"Loss vs 
epochs - CNN training\",\n", + " width=700,\n", + " height=400,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And some metrics of interest for classification tasks:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "threshold = 0.5\n", + "df = pd.concat([output_train, output_test])\n", + "df_plot = df[(df.epoch == trainer.epoch_saved_model) | ((df.epoch == trainer.epoch_saved_model) & (df.phase == \"testing\"))]\n", + "\n", + "for dataset in [\"training\", \"validation\", \"testing\"]:\n", + " df_plot_phase = df_plot[(df_plot.phase == dataset)]\n", + " y_true = df_plot_phase.target\n", + " y_score = np.array(df_plot_phase.output.tolist())[:, 1]\n", + "\n", + " print(f\"\\nMetrics for {dataset}:\")\n", + " fpr_roc, tpr_roc, thr_roc = roc_curve(y_true, y_score)\n", + " auc_score = auc(fpr_roc, tpr_roc)\n", + " print(f\"AUC: {round(auc_score, 1)}\")\n", + " print(f\"Considering a threshold of {threshold}\")\n", + " y_pred = (y_score > threshold) * 1\n", + " print(f\"- Precision: {round(precision_score(y_true, y_pred), 1)}\")\n", + " print(f\"- Recall: {round(recall_score(y_true, y_pred), 1)}\")\n", + " print(f\"- Accuracy: {round(accuracy_score(y_true, y_pred), 1)}\")\n", + " print(f\"- F1: {round(f1_score(y_true, y_pred), 1)}\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's important to note that the dataset used in this analysis is not sufficiently large to provide conclusive and reliable insights. Depending on your specific application, you might find regression, classification, GNNs, and/or CNNs to be valuable options. Feel free to choose the approach that best aligns with your particular problem!\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "deeprank2", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 } From 4d8830138373ff4f93be174dd3bc6ee229642cc8 Mon Sep 17 00:00:00 2001 From: DaniBodor Date: Wed, 4 Sep 2024 13:36:30 +0200 Subject: [PATCH 30/68] tutorials: error message when folder exists --- tutorials/data_generation_ppi.ipynb | 16 +++++++++++++--- tutorials/data_generation_srv.ipynb | 22 ++++++++++++++++------ 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/tutorials/data_generation_ppi.ipynb b/tutorials/data_generation_ppi.ipynb index 88c6c482..82e2c67f 100644 --- a/tutorials/data_generation_ppi.ipynb +++ b/tutorials/data_generation_ppi.ipynb @@ -64,8 +64,10 @@ "metadata": {}, "outputs": [], "source": [ + "import contextlib\n", "import glob\n", "import os\n", + "from pathlib import Path\n", "\n", "import h5py\n", "import matplotlib.image as img\n", @@ -102,8 +104,15 @@ "source": [ "data_path = os.path.join(\"data_raw\", \"ppi\")\n", "processed_data_path = os.path.join(\"data_processed\", \"ppi\")\n", - "os.makedirs(os.path.join(processed_data_path, \"residue\"))\n", - "os.makedirs(os.path.join(processed_data_path, \"atomic\"))\n", + "residue_data_path = os.path.join(processed_data_path, \"residue\")\n", + "atomic_data_path = os.path.join(processed_data_path, \"atomic\")\n", + "\n", + "for output_path in [residue_data_path, atomic_data_path]:\n", + " 
os.makedirs(output_path, exist_ok=True)\n", + " if any(Path(output_path).iterdir()):\n", + " msg = f\"Please store any required data from `./{output_path}` and delete the folder.\\nThen re-run this cell to continue.\"\n", + " raise FileExistsError(msg)\n", + "\n", "# Flag limit_data as True if you are running on a machine with limited memory (e.g., Docker container)\n", "limit_data = True" ] @@ -137,7 +146,8 @@ " pdb_files = glob.glob(os.path.join(data_path, \"pdb\", \"*.pdb\"))\n", " pdb_files.sort()\n", " pdb_ids_csv = [pdb_file.split(\"/\")[-1].split(\".\")[0] for pdb_file in pdb_files]\n", - " csv_data_indexed = csv_data.set_index(\"ID\")\n", + " with contextlib.suppress(KeyError):\n", + " csv_data_indexed = csv_data.set_index(\"ID\")\n", " csv_data_indexed = csv_data_indexed.loc[pdb_ids_csv]\n", " bas = csv_data_indexed.measurement_value.tolist()\n", "\n", diff --git a/tutorials/data_generation_srv.ipynb b/tutorials/data_generation_srv.ipynb index 83247e5d..d040a41f 100644 --- a/tutorials/data_generation_srv.ipynb +++ b/tutorials/data_generation_srv.ipynb @@ -60,12 +60,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ + "import contextlib\n", "import glob\n", "import os\n", + "from pathlib import Path\n", "\n", "import h5py\n", "import matplotlib.image as img\n", @@ -97,16 +99,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "data_path = os.path.join(\"data_raw\", \"srv\")\n", "processed_data_path = os.path.join(\"data_processed\", \"srv\")\n", - "os.makedirs(os.path.join(processed_data_path, \"residue\"))\n", - "os.makedirs(os.path.join(processed_data_path, \"atomic\"))\n", + "residue_data_path = os.path.join(processed_data_path, \"residue\")\n", + "atomic_data_path = os.path.join(processed_data_path, \"atomic\")\n", + "\n", + "for output_path in [residue_data_path, atomic_data_path]:\n", + " os.makedirs(output_path, exist_ok=True)\n", + " if any(Path(output_path).iterdir()):\n", + " msg = f\"Please store any required data from `./{output_path}` and delete the folder.\\nThen re-run this cell to continue.\"\n", + " raise FileExistsError(msg)\n", + "\n", "# Flag limit_data as True if you are running on a machine with limited memory (e.g., Docker container)\n", - "limit_data = True" + "limit_data = False" ] }, { @@ -139,7 +148,8 @@ " pdb_files.sort()\n", " pdb_file_names = [os.path.basename(pdb_file) for pdb_file in pdb_files]\n", " csv_data_indexed = csv_data.set_index(\"pdb_file\")\n", - " csv_data_indexed = csv_data_indexed.loc[pdb_file_names]\n", + " with contextlib.suppress(KeyError):\n", + " csv_data_indexed = csv_data_indexed.loc[pdb_file_names]\n", " res_numbers = csv_data_indexed.res_number.tolist()\n", " res_wildtypes = csv_data_indexed.res_wildtype.tolist()\n", " res_variants = csv_data_indexed.res_variant.tolist()\n", From 008ef413e5e1dddfe22cfa51a8f831dc2ba4b904 Mon Sep 17 00:00:00 2001 From: DaniBodor Date: Wed, 4 Sep 2024 15:24:16 +0200 Subject: [PATCH 31/68] docs: improve error messages for failed queries --- deeprank2/query.py | 57 ++++++++++++++++++----------- tutorials/data_generation_srv.ipynb | 4 +- 2 files changed, 38 insertions(+), 23 deletions(-) diff --git a/deeprank2/query.py b/deeprank2/query.py index 89f171a4..b85222d1 100644 --- a/deeprank2/query.py +++ b/deeprank2/query.py @@ -22,7 +22,7 @@ import deeprank2.features from deeprank2.domain.aminoacidlist import convert_aa_nomenclature from deeprank2.features 
import components, conservation, contact -from deeprank2.molstruct.residue import Residue, SingleResidueVariant +from deeprank2.molstruct.residue import SingleResidueVariant from deeprank2.utils.buildgraph import get_contact_atoms, get_structure, get_surrounding_residues from deeprank2.utils.graph import Graph from deeprank2.utils.grid import Augmentation, GridSettings, MapMethod @@ -265,12 +265,11 @@ def _build_helper(self) -> Graph: structure = self._load_structure() # find the variant residue and its surroundings - variant_residue: Residue = None for residue in structure.get_chain(self.variant_chain_id).residues: if residue.number == self.variant_residue_number and residue.insertion_code == self.insertion_code: variant_residue = residue break - if variant_residue is None: + else: # if break is not reached msg = f"Residue not found in {self.pdb_path}: {self.variant_chain_id} {self.residue_id}" raise ValueError(msg) self.variant = SingleResidueVariant(variant_residue, self.variant_amino_acid) @@ -354,19 +353,12 @@ def _build_helper(self) -> Graph: raise ValueError(msg) # build the graph - if self.resolution == "atom": - graph = Graph.build_graph( - contact_atoms, - self.get_query_id(), - self.max_edge_length, - ) - elif self.resolution == "residue": - residues_selected = list({atom.residue for atom in contact_atoms}) - graph = Graph.build_graph( - residues_selected, - self.get_query_id(), - self.max_edge_length, - ) + nodes = contact_atoms if self.resolution == "atom" else list({atom.residue for atom in contact_atoms}) + graph = Graph.build_graph( + nodes=nodes, + graph_id=self.get_query_id(), + max_edge_length=self.max_edge_length, + ) graph.center = np.mean([atom.position for atom in contact_atoms], axis=0) structure = contact_atoms[0].residue.chain.model @@ -453,7 +445,7 @@ def __iter__(self) -> Iterator[Query]: def __len__(self) -> int: return len(self._queries) - def _process_one_query(self, query: Query) -> None: + def _process_one_query(self, query: Query, log_error_traceback: bool = False) -> None: """Only one process may access an hdf5 file at a time.""" try: output_path = f"{self._prefix}-{os.getpid()}.hdf5" @@ -479,10 +471,12 @@ def _process_one_query(self, query: Query) -> None: except (ValueError, AttributeError, KeyError, TimeoutError) as e: _log.warning( - f"\nGraph/Query with ID {query.get_query_id()} ran into an Exception ({e.__class__.__name__}: {e})," - " and it has not been written to the hdf5 file. More details below:", + f"Graph/Query with ID {query.get_query_id()} ran into an Exception and was not written to the hdf5 file.\n" + f"Exception found: {e.__class__.__name__}: {e}.\n" + "You may proceed with your analysis, but this query will be ignored.\n", ) - _log.exception(e) + if log_error_traceback: + _log.exception(f"----Full error traceback:----\n{e}") def process( self, @@ -493,6 +487,7 @@ def process( grid_settings: GridSettings | None = None, grid_map_method: MapMethod | None = None, grid_augmentation_count: int = 0, + log_error_traceback: bool = False, ) -> list[str]: """Render queries into graphs (and optionally grids). @@ -510,6 +505,8 @@ def process( grid_settings: If valid together with `grid_map_method`, the grid data will be stored as well. Defaults to None. grid_map_method: If valid together with `grid_settings`, the grid data will be stored as well. Defaults to None. grid_augmentation_count: Number of grid data augmentations (must be >= 0). Defaults to 0. + log_error_traceback: if True, logs full error message in case query fails. 
Otherwise only the error message is logged. + Defaults to false. Returns: The list of paths of the generated HDF5 files. @@ -536,7 +533,7 @@ def process( self._grid_augmentation_count = grid_augmentation_count _log.info(f"Creating pool function to process {len(self)} queries...") - pool_function = partial(self._process_one_query) + pool_function = partial(self._process_one_query, log_error_traceback=log_error_traceback) with Pool(self._cpu_count) as pool: _log.info("Starting pooling...\n") pool.map(pool_function, self.queries) @@ -551,6 +548,24 @@ def process( os.remove(output_path) return glob(f"{prefix}.hdf5") + n_processed = 0 + for hdf5file in output_paths: + with h5py.File(hdf5file, "r") as hdf5: + # List of all graphs in hdf5, each graph representing + # a SRV and its sourrouding environment + n_processed += len(list(hdf5.keys())) + + if not n_processed: + msg = "No queries have been processed." + raise ValueError(msg) + if n_processed != len(self.queries): + _log.warning( + f"Not all queries have been processed. You can proceed with the analysis of {n_processed}/{len(self.queries)} queries.\n" + "Set `log_error_traceback` to True for advanced troubleshooting.", + ) + else: + _log.info(f"{n_processed} queries have been processed.") + return output_paths def _set_feature_modules(self, feature_modules: list[ModuleType, str] | ModuleType | str) -> list[str]: diff --git a/tutorials/data_generation_srv.ipynb b/tutorials/data_generation_srv.ipynb index d040a41f..a2eef6a1 100644 --- a/tutorials/data_generation_srv.ipynb +++ b/tutorials/data_generation_srv.ipynb @@ -60,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -99,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ From 1682a7a15ac83b5a29874dcd817af9638c262f27 Mon Sep 17 00:00:00 2001 From: DaniBodor Date: Thu, 5 Sep 2024 13:59:18 +0200 Subject: [PATCH 32/68] tutorial: update csv filename for srv tutorial --- tutorials/data_generation_srv.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/data_generation_srv.ipynb b/tutorials/data_generation_srv.ipynb index a2eef6a1..4562b8d7 100644 --- a/tutorials/data_generation_srv.ipynb +++ b/tutorials/data_generation_srv.ipynb @@ -143,7 +143,7 @@ "outputs": [], "source": [ "def get_pdb_files_and_target_data(data_path: str) -> tuple[list[str], list[int], list[str], list[str], list[float]]:\n", - " csv_data = pd.read_csv(os.path.join(data_path, \"srv_target_values.csv\"))\n", + " csv_data = pd.read_csv(os.path.join(data_path, \"srv_target_values_curated.csv\"))\n", " pdb_files = glob.glob(os.path.join(data_path, \"pdb\", \"*.ent\"))\n", " pdb_files.sort()\n", " pdb_file_names = [os.path.basename(pdb_file) for pdb_file in pdb_files]\n", From 82ea4bc795e65db6f01900b4eb3582f75675e08c Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 15:07:22 +0200 Subject: [PATCH 33/68] move docker upload to release.yml --- .github/workflows/ghcr.yml | 0 .github/workflows/publish_docker_image.yml | 52 ---------------------- .github/workflows/release.yml | 40 ++++++++++++++++- pyproject.toml | 2 +- 4 files changed, 39 insertions(+), 55 deletions(-) delete mode 100644 .github/workflows/ghcr.yml delete mode 100644 .github/workflows/publish_docker_image.yml diff --git a/.github/workflows/ghcr.yml b/.github/workflows/ghcr.yml deleted file mode 100644 index e69de29b..00000000 diff --git 
a/.github/workflows/publish_docker_image.yml b/.github/workflows/publish_docker_image.yml deleted file mode 100644 index 24a29898..00000000 --- a/.github/workflows/publish_docker_image.yml +++ /dev/null @@ -1,52 +0,0 @@ -# SPDX-FileCopyrightText: 2022 Christian Meeßen (GFZ) -# SPDX-FileCopyrightText: 2022 Dusan Mijatovic (dv4all) -# SPDX-FileCopyrightText: 2022 Helmholtz Centre Potsdam - GFZ German Research Centre for Geosciences -# SPDX-FileCopyrightText: 2022 dv4all -# SPDX-FileCopyrightText: 2023 - 2024 Dusan Mijatovic (Netherlands eScience Center) -# SPDX-FileCopyrightText: 2023 - 2024 Ewan Cahen (Netherlands eScience Center) -# SPDX-FileCopyrightText: 2023 - 2024 Netherlands eScience Center -# -# SPDX-License-Identifier: Apache-2.0 - -name: Create and publish a Docker image - -# Configures this workflow to run every time a change is pushed to the branch called `release`. -on: - push: - branches: ["529_add_docker_testing_action_gcroci2"] - -jobs: - read_version: - name: Read version from TOML - runs-on: ubuntu-latest - outputs: - version: ${{ steps.get_version.outputs.VERSION }} - repo_lowercase: ${{ steps.repo_lowercase.outputs.REPO_LOWERCASE }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - name: Read version from TOML - id: get_version - run: | - VERSION=$(grep '^version =' pyproject.toml | awk -F '"' '{print $2}') - echo "VERSION=$VERSION" >> $GITHUB_OUTPUT - - - name: Convert repository name to lowercase - id: repo_lowercase - run: | - REPO_LOWERCASE=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') - echo "REPO_LOWERCASE=$REPO_LOWERCASE" >> $GITHUB_OUTPUT - - docker_image_deeprank2: - needs: read_version - name: docker_image_deeprank2 - uses: ./.github/workflows/_ghcr.yml - with: - ghcr_user: ${{github.actor}} - base_image_name: ghcr.io/${{ needs.read_version.outputs.repo_lowercase }} - image_tag: ${{ needs.read_version.outputs.version }} - dockerfile: ./Dockerfile - docker_context: . 
- secrets: - token: ${{secrets.GITHUB_TOKEN}} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0f85061b..b3c1c6e1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -36,7 +36,7 @@ jobs: path: dist/* upload_test_pypi: - needs: [build] + needs: build runs-on: ubuntu-latest if: github.event_name == 'workflow_dispatch' steps: @@ -51,7 +51,7 @@ jobs: repository_url: https://test.pypi.org/legacy/ upload_pypi: - needs: [build] + needs: build runs-on: ubuntu-latest if: github.event_name == 'release' && github.event.action == 'published' steps: @@ -63,3 +63,39 @@ jobs: with: user: __token__ password: ${{ secrets.PYPI_TOKEN_DEEPRANK2 }} + + read_version: + needs: upload_pypi + name: Read version from TOML + runs-on: ubuntu-latest + outputs: + version: ${{ steps.get_version.outputs.VERSION }} + repo_lowercase: ${{ steps.repo_lowercase.outputs.REPO_LOWERCASE }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Read version from TOML + id: get_version + run: | + VERSION=$(grep '^version =' pyproject.toml | awk -F '"' '{print $2}') + echo "VERSION=$VERSION" >> $GITHUB_OUTPUT + + - name: Convert repository name to lowercase + id: repo_lowercase + run: | + REPO_LOWERCASE=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') + echo "REPO_LOWERCASE=$REPO_LOWERCASE" >> $GITHUB_OUTPUT + + upload_docker_image: + needs: read_version + name: Upload Docker image to ghcr.io + uses: ./.github/workflows/_ghcr.yml + with: + ghcr_user: ${{github.actor}} + base_image_name: ghcr.io/${{ needs.read_version.outputs.repo_lowercase }} + image_tag: ${{ needs.read_version.outputs.version }} + dockerfile: ./Dockerfile + docker_context: . + secrets: + token: ${{secrets.GITHUB_TOKEN}} diff --git a/pyproject.toml b/pyproject.toml index 592edf19..4172c9c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "deeprank2" -version = "3.0.6" +version = "3.0.5" description = "DeepRank2 is an open-source deep learning framework for data mining of protein-protein interfaces or single-residue missense variants." 
readme = "README.md" requires-python = ">=3.10" From 389548a7a05ad6676886fa310245b78fd89b1517 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 15:11:14 +0200 Subject: [PATCH 34/68] fix ubuntu to 22.04 --- .github/workflows/build-latest-release.yml | 2 +- .github/workflows/build-repo-frozen-env.yml | 2 +- .github/workflows/build-repo.yml | 2 +- .github/workflows/cffconvert.yml | 2 +- .github/workflows/coveralls.yml | 2 +- .github/workflows/draft-pdf.yml | 2 +- .github/workflows/fair-software.yml | 2 +- .github/workflows/linting.yml | 2 +- .github/workflows/markdown-link-check.yml | 2 +- .github/workflows/notebooks.yml | 2 +- .github/workflows/release.yml | 8 ++++---- .github/workflows/stale_issue_pr.yml | 2 +- 12 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/build-latest-release.yml b/.github/workflows/build-latest-release.yml index 9daa9936..28d37f6a 100644 --- a/.github/workflows/build-latest-release.yml +++ b/.github/workflows/build-latest-release.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] + os: ["ubuntu-22.04"] python-version: ["3.10"] # ["3.10", "3.11"] # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell diff --git a/.github/workflows/build-repo-frozen-env.yml b/.github/workflows/build-repo-frozen-env.yml index 47f29dce..47ab259e 100644 --- a/.github/workflows/build-repo-frozen-env.yml +++ b/.github/workflows/build-repo-frozen-env.yml @@ -36,7 +36,7 @@ jobs: strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] + os: ["ubuntu-22.04"] python-version: ["3.10"] # ["3.10", "3.11"] # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell diff --git a/.github/workflows/build-repo.yml b/.github/workflows/build-repo.yml index 7af4cc7a..e777c5d9 100644 --- a/.github/workflows/build-repo.yml +++ b/.github/workflows/build-repo.yml @@ -36,7 +36,7 @@ jobs: strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] + os: ["ubuntu-22.04"] python-version: ["3.10"] # ["3.10", "3.11"] # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell diff --git a/.github/workflows/cffconvert.yml b/.github/workflows/cffconvert.yml index 6851c52d..e63baf7f 100644 --- a/.github/workflows/cffconvert.yml +++ b/.github/workflows/cffconvert.yml @@ -8,7 +8,7 @@ on: jobs: validate: name: "validate" - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Check out a copy of the repository uses: actions/checkout@v3 diff --git a/.github/workflows/coveralls.yml b/.github/workflows/coveralls.yml index eb4feff2..0193c441 100644 --- a/.github/workflows/coveralls.yml +++ b/.github/workflows/coveralls.yml @@ -36,7 +36,7 @@ jobs: strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] + os: ["ubuntu-22.04"] python-version: ["3.10"] # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell diff --git a/.github/workflows/draft-pdf.yml b/.github/workflows/draft-pdf.yml index 8b5159f7..d044846b 100644 --- a/.github/workflows/draft-pdf.yml +++ b/.github/workflows/draft-pdf.yml @@ -5,7 +5,7 @@ on: jobs: paper: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 name: Paper Draft steps: - name: Checkout diff --git a/.github/workflows/fair-software.yml b/.github/workflows/fair-software.yml index f20d3c84..f336c6ac 100644 --- a/.github/workflows/fair-software.yml +++ b/.github/workflows/fair-software.yml @@ -11,7 +11,7 @@ jobs: verify: if: github.event.pull_request.draft == false name: "fair-software" - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: 
- uses: fair-software/howfairis-github-action@0.2.1 name: Measure compliance with fair-software.eu recommendations diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 4dd13612..030e4cbc 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -36,7 +36,7 @@ jobs: strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] + os: ["ubuntu-22.04"] python-version: ["3.10"] # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell diff --git a/.github/workflows/markdown-link-check.yml b/.github/workflows/markdown-link-check.yml index 016e3812..a988faf8 100644 --- a/.github/workflows/markdown-link-check.yml +++ b/.github/workflows/markdown-link-check.yml @@ -23,7 +23,7 @@ jobs: markdown-link-check: if: github.event.pull_request.draft == false name: Check markdown links - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 - uses: gaurav-nelson/github-action-markdown-link-check@v1 diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml index b1e59ea9..93433352 100644 --- a/.github/workflows/notebooks.yml +++ b/.github/workflows/notebooks.yml @@ -34,7 +34,7 @@ jobs: strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] + os: ["ubuntu-22.04"] python-version: ["3.10"] # ["3.10", "3.11"] # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b3c1c6e1..dfc13ef1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,7 +14,7 @@ jobs: strategy: fail-fast: false matrix: - os: ["ubuntu-latest"] + os: ["ubuntu-22.04"] python-version: ["3.10"] # https://github.com/marketplace/actions/setup-miniconda#use-a-default-shell @@ -37,7 +37,7 @@ jobs: upload_test_pypi: needs: build - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 if: github.event_name == 'workflow_dispatch' steps: - uses: actions/download-artifact@v3 @@ -52,7 +52,7 @@ jobs: upload_pypi: needs: build - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 if: github.event_name == 'release' && github.event.action == 'published' steps: - uses: actions/download-artifact@v3 @@ -67,7 +67,7 @@ jobs: read_version: needs: upload_pypi name: Read version from TOML - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 outputs: version: ${{ steps.get_version.outputs.VERSION }} repo_lowercase: ${{ steps.repo_lowercase.outputs.REPO_LOWERCASE }} diff --git a/.github/workflows/stale_issue_pr.yml b/.github/workflows/stale_issue_pr.yml index d184c9c9..74ef3f5a 100644 --- a/.github/workflows/stale_issue_pr.yml +++ b/.github/workflows/stale_issue_pr.yml @@ -5,7 +5,7 @@ on: jobs: close-issues: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 permissions: issues: write pull-requests: write From 3f91ab01a88bdca45f21ec11a93db47423fefc5f Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 15:27:52 +0200 Subject: [PATCH 35/68] add test for docker image --- .github/workflows/build-latest-release.yml | 2 +- .github/workflows/release.yml | 2 +- .github/workflows/test-docker-image.yml | 40 ++++++++++++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/test-docker-image.yml diff --git a/.github/workflows/build-latest-release.yml b/.github/workflows/build-latest-release.yml index 28d37f6a..9689e5cd 100644 --- a/.github/workflows/build-latest-release.yml +++ b/.github/workflows/build-latest-release.yml @@ -3,7 +3,7 @@ name: build (latest release) # Only trigger, when the release 
workflow succeeded on: workflow_run: - workflows: ["Build and upload to PyPI"] + workflows: ["Build and upload to PyPI and ghcr.io"] types: - completed diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index dfc13ef1..cce554ce 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -1,4 +1,4 @@ -name: Build and upload to PyPI +name: Build and upload to PyPI and ghcr.io on: workflow_dispatch: diff --git a/.github/workflows/test-docker-image.yml b/.github/workflows/test-docker-image.yml new file mode 100644 index 00000000..16a3179b --- /dev/null +++ b/.github/workflows/test-docker-image.yml @@ -0,0 +1,40 @@ +name: Test Latest Docker Image (Dev) + +on: + push: + branches: + - 529_add_docker_testing_action_gcroci2 + +jobs: + test_latest_docker_image: + runs-on: ubuntu-22.04 + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Pull latest Docker image + run: | + REPO_LOWERCASE=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') + docker pull ghcr.io/$REPO_LOWERCASE:latest + + - name: Run tests in Docker container + run: | + REPO_LOWERCASE=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') + docker run --name test_container -d ghcr.io/$REPO_LOWERCASE:latest tail -f /dev/null + docker cp tests test_container:/app/tests + docker exec test_container pip install pytest + docker exec test_container pytest /app/tests + docker stop test_container + docker rm test_container + + - name: Output test results + if: failure() + run: | + echo "Tests failed. Please check the test output above for more details." 
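The steps of this workflow can also be reproduced by hand to debug the published image before relying on CI. A minimal sketch, assuming the image has been published as ghcr.io/deeprank/deeprank2:latest (the lowercased repository name used above) and that the repository's tests/ folder is available in the current working directory; the container name and the /tests path inside the container are arbitrary placeholders:

    # pull the published image and start an idle container
    docker pull ghcr.io/deeprank/deeprank2:latest
    docker run --name test_container -d ghcr.io/deeprank/deeprank2:latest tail -f /dev/null
    # copy the local test suite into the container, install pytest, and run the tests
    docker cp tests test_container:/tests
    docker exec test_container pip install pytest
    docker exec test_container pytest /tests -v
    # clean up
    docker stop test_container && docker rm test_container
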
From 65b123cfc5e910616ab71a98bd2d065d07d77d13 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 15:32:57 +0200 Subject: [PATCH 36/68] debug docker container running --- .github/workflows/test-docker-image.yml | 28 +++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test-docker-image.yml b/.github/workflows/test-docker-image.yml index 16a3179b..99b27aa5 100644 --- a/.github/workflows/test-docker-image.yml +++ b/.github/workflows/test-docker-image.yml @@ -1,9 +1,9 @@ -name: Test Latest Docker Image (Dev) +name: test latest docker image (dev) # TODO: change on: push: branches: - - 529_add_docker_testing_action_gcroci2 + - 529_add_docker_testing_action_gcroci2 # TODO: change jobs: test_latest_docker_image: @@ -28,9 +28,29 @@ jobs: run: | REPO_LOWERCASE=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') docker run --name test_container -d ghcr.io/$REPO_LOWERCASE:latest tail -f /dev/null - docker cp tests test_container:/app/tests + # Inspect container filesystem + echo "Container filesystem structure:" + docker exec test_container ls -R / + + # Determine the appropriate directory for tests + if docker exec test_container [ -d "/app" ]; then + TEST_DIR="/app/tests" + elif docker exec test_container [ -d "/usr/src/app" ]; then + TEST_DIR="/usr/src/app/tests" + else + TEST_DIR="/tests" + fi + + echo "Using test directory: $TEST_DIR" + + # Copy tests to the container + docker cp tests test_container:$TEST_DIR + + # Install pytest and run tests docker exec test_container pip install pytest - docker exec test_container pytest /app/tests + docker exec -e PYTHONPATH=$TEST_DIR test_container pytest $TEST_DIR + + # Clean up docker stop test_container docker rm test_container From 22745b61fa390af514be38271857d86635ed281d Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 15:41:58 +0200 Subject: [PATCH 37/68] copy tests folder recursively --- .github/workflows/test-docker-image.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-docker-image.yml b/.github/workflows/test-docker-image.yml index 99b27aa5..1994abf7 100644 --- a/.github/workflows/test-docker-image.yml +++ b/.github/workflows/test-docker-image.yml @@ -28,9 +28,10 @@ jobs: run: | REPO_LOWERCASE=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') docker run --name test_container -d ghcr.io/$REPO_LOWERCASE:latest tail -f /dev/null + # Inspect container filesystem - echo "Container filesystem structure:" - docker exec test_container ls -R / + echo "Container filesystem structure before copying tests:" + docker exec test_container ls / # Determine the appropriate directory for tests if docker exec test_container [ -d "/app" ]; then @@ -44,7 +45,10 @@ jobs: echo "Using test directory: $TEST_DIR" # Copy tests to the container - docker cp tests test_container:$TEST_DIR + docker cp -r tests/. 
test_container:$TEST_DIR + + echo "Container filesystem structure after copying tests:" + docker exec test_container ls $TEST_DIR # Install pytest and run tests docker exec test_container pip install pytest From de8c6266ab0e7aa129a8d4a652442ac13b67ff17 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 15:46:58 +0200 Subject: [PATCH 38/68] print copied tests folder --- .github/workflows/test-docker-image.yml | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test-docker-image.yml b/.github/workflows/test-docker-image.yml index 1994abf7..1b80e43c 100644 --- a/.github/workflows/test-docker-image.yml +++ b/.github/workflows/test-docker-image.yml @@ -29,10 +29,6 @@ jobs: REPO_LOWERCASE=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') docker run --name test_container -d ghcr.io/$REPO_LOWERCASE:latest tail -f /dev/null - # Inspect container filesystem - echo "Container filesystem structure before copying tests:" - docker exec test_container ls / - # Determine the appropriate directory for tests if docker exec test_container [ -d "/app" ]; then TEST_DIR="/app/tests" @@ -45,14 +41,17 @@ jobs: echo "Using test directory: $TEST_DIR" # Copy tests to the container - docker cp -r tests/. test_container:$TEST_DIR + docker cp tests test_container:$TEST_DIR - echo "Container filesystem structure after copying tests:" - docker exec test_container ls $TEST_DIR + # Verify the tests were copied correctly + echo "Contents of $TEST_DIR:" + docker exec test_container ls -la $TEST_DIR + echo "Contents of $TEST_DIR/data/:" + docker exec test_container ls -la $TEST_DIR/data || echo "data directory not found" # Install pytest and run tests docker exec test_container pip install pytest - docker exec -e PYTHONPATH=$TEST_DIR test_container pytest $TEST_DIR + docker exec -e PYTHONPATH=$TEST_DIR test_container pytest $TEST_DIR -v # Clean up docker stop test_container From f61f7c1942bd43fa0e9d2f19a1f2eebf331dffa7 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 15:55:42 +0200 Subject: [PATCH 39/68] try to copy the whole repo --- .github/workflows/test-docker-image.yml | 40 ++++++++++++++----------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/.github/workflows/test-docker-image.yml b/.github/workflows/test-docker-image.yml index 1b80e43c..1691db92 100644 --- a/.github/workflows/test-docker-image.yml +++ b/.github/workflows/test-docker-image.yml @@ -29,29 +29,35 @@ jobs: REPO_LOWERCASE=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') docker run --name test_container -d ghcr.io/$REPO_LOWERCASE:latest tail -f /dev/null - # Determine the appropriate directory for tests - if docker exec test_container [ -d "/app" ]; then - TEST_DIR="/app/tests" - elif docker exec test_container [ -d "/usr/src/app" ]; then - TEST_DIR="/usr/src/app/tests" - else - TEST_DIR="/tests" - fi + WORK_DIR="/" - echo "Using test directory: $TEST_DIR" + echo "Using working directory: $WORK_DIR" - # Copy tests to the container - docker cp tests test_container:$TEST_DIR + # Copy entire project directory to the container + docker cp . 
test_container:$WORK_DIR # Verify the tests were copied correctly - echo "Contents of $TEST_DIR:" - docker exec test_container ls -la $TEST_DIR - echo "Contents of $TEST_DIR/data/:" - docker exec test_container ls -la $TEST_DIR/data || echo "data directory not found" + echo "Contents of $WORK_DIR/tests:" + docker exec test_container ls -la $WORK_DIR/tests + echo "Contents of $WORK_DIR/tests/data:" + docker exec test_container ls -la $WORK_DIR/tests/data - # Install pytest and run tests + # Print debugging information + echo "Current working directory:" + docker exec -w $WORK_DIR test_container pwd + echo "Full path of test directory:" + docker exec test_container readlink -f $WORK_DIR/tests + + # Print content of a test file + echo "Content of a test file that accesses data (first 20 lines):" + docker exec test_container head -n 20 $WORK_DIR/tests/test_*.py + + # Install pytest docker exec test_container pip install pytest - docker exec -e PYTHONPATH=$TEST_DIR test_container pytest $TEST_DIR -v + + # Run pytest from the working directory + echo "Running pytest from the working directory:" + docker exec -w $WORK_DIR test_container pytest tests -v # Clean up docker stop test_container From 243721aab7772a7a6b31a06e9ea0f5a2b05d4e22 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 16:01:01 +0200 Subject: [PATCH 40/68] try to fix work dire --- .github/workflows/test-docker-image.yml | 32 ++++++++++++++----------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/.github/workflows/test-docker-image.yml b/.github/workflows/test-docker-image.yml index 1691db92..edcc4d26 100644 --- a/.github/workflows/test-docker-image.yml +++ b/.github/workflows/test-docker-image.yml @@ -30,34 +30,38 @@ jobs: docker run --name test_container -d ghcr.io/$REPO_LOWERCASE:latest tail -f /dev/null WORK_DIR="/" - echo "Using working directory: $WORK_DIR" # Copy entire project directory to the container docker cp . 
test_container:$WORK_DIR - # Verify the tests were copied correctly - echo "Contents of $WORK_DIR/tests:" - docker exec test_container ls -la $WORK_DIR/tests - echo "Contents of $WORK_DIR/tests/data:" - docker exec test_container ls -la $WORK_DIR/tests/data + # Verify the directory structure + echo "Contents of root directory:" + docker exec test_container ls -la $WORK_DIR + echo "Contents of tests directory (if it exists):" + docker exec test_container ls -la $WORK_DIR/tests || echo "No tests directory found" + echo "Contents of data directory (if it exists):" + docker exec test_container ls -la $WORK_DIR/data || echo "No data directory found" # Print debugging information echo "Current working directory:" docker exec -w $WORK_DIR test_container pwd - echo "Full path of test directory:" - docker exec test_container readlink -f $WORK_DIR/tests - # Print content of a test file - echo "Content of a test file that accesses data (first 20 lines):" - docker exec test_container head -n 20 $WORK_DIR/tests/test_*.py + # Find and print content of a test file + echo "Content of a test file (first 20 lines):" + docker exec test_container find $WORK_DIR -name "test_*.py" -type f | head -n 1 | xargs -I {} docker exec test_container head -n 20 {} # Install pytest docker exec test_container pip install pytest - # Run pytest from the working directory - echo "Running pytest from the working directory:" - docker exec -w $WORK_DIR test_container pytest tests -v + # Find the directory containing test files and run pytest + TEST_DIR=$(docker exec test_container find $WORK_DIR -name "test_*.py" -type f | head -n 1 | xargs dirname) + if [ -n "$TEST_DIR" ]; then + echo "Running pytest in directory: $TEST_DIR" + docker exec -w $TEST_DIR test_container pytest -v + else + echo "No test files found" + fi # Clean up docker stop test_container From 0660c5a71cf087a89b581b36565182c64758127d Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 16:06:08 +0200 Subject: [PATCH 41/68] try to copy tests only again --- .github/workflows/test-docker-image.yml | 39 +++++++++++-------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/.github/workflows/test-docker-image.yml b/.github/workflows/test-docker-image.yml index edcc4d26..bbfe6855 100644 --- a/.github/workflows/test-docker-image.yml +++ b/.github/workflows/test-docker-image.yml @@ -32,36 +32,31 @@ jobs: WORK_DIR="/" echo "Using working directory: $WORK_DIR" - # Copy entire project directory to the container - docker cp . 
test_container:$WORK_DIR + REPO_LOWERCASE=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') + docker run --name test_container -d ghcr.io/$REPO_LOWERCASE:latest tail -f /dev/null - # Verify the directory structure - echo "Contents of root directory:" - docker exec test_container ls -la $WORK_DIR - echo "Contents of tests directory (if it exists):" - docker exec test_container ls -la $WORK_DIR/tests || echo "No tests directory found" - echo "Contents of data directory (if it exists):" - docker exec test_container ls -la $WORK_DIR/data || echo "No data directory found" + TEST_DIR="/tests" + echo "Tests directory: $TEST_DIR" - # Print debugging information - echo "Current working directory:" - docker exec -w $WORK_DIR test_container pwd + # Copy only the tests folder to the container + docker cp tests test_container:$TEST_DIR + + # Verify the directory structure + echo "Contents of tests directory:" + docker exec test_container ls -la $TEST_DIR + echo "Contents of tests/data directory:" + docker exec test_container ls -la $TEST_DIR/data - # Find and print content of a test file + # Print content of a test file echo "Content of a test file (first 20 lines):" - docker exec test_container find $WORK_DIR -name "test_*.py" -type f | head -n 1 | xargs -I {} docker exec test_container head -n 20 {} + docker exec test_container head -n 20 $TEST_DIR/test_*.py | head -n 20 # Install pytest docker exec test_container pip install pytest - # Find the directory containing test files and run pytest - TEST_DIR=$(docker exec test_container find $WORK_DIR -name "test_*.py" -type f | head -n 1 | xargs dirname) - if [ -n "$TEST_DIR" ]; then - echo "Running pytest in directory: $TEST_DIR" - docker exec -w $TEST_DIR test_container pytest -v - else - echo "No test files found" - fi + # Run pytest + echo "Running pytest in the tests directory:" + docker exec -w $TEST_DIR test_container pytest -v # Clean up docker stop test_container From fcd14ebdeab95ef574715c129c12148a90bad195 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 16:08:24 +0200 Subject: [PATCH 42/68] remove duplicated run --- .github/workflows/test-docker-image.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/test-docker-image.yml b/.github/workflows/test-docker-image.yml index bbfe6855..eb297282 100644 --- a/.github/workflows/test-docker-image.yml +++ b/.github/workflows/test-docker-image.yml @@ -26,11 +26,6 @@ jobs: - name: Run tests in Docker container run: | - REPO_LOWERCASE=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') - docker run --name test_container -d ghcr.io/$REPO_LOWERCASE:latest tail -f /dev/null - - WORK_DIR="/" - echo "Using working directory: $WORK_DIR" REPO_LOWERCASE=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') docker run --name test_container -d ghcr.io/$REPO_LOWERCASE:latest tail -f /dev/null From 233fb7772dd3d731b8f52a06a039ac45637289f9 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 16:13:26 +0200 Subject: [PATCH 43/68] add project dir --- .github/workflows/test-docker-image.yml | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/.github/workflows/test-docker-image.yml b/.github/workflows/test-docker-image.yml index eb297282..6271bbe7 100644 --- a/.github/workflows/test-docker-image.yml +++ b/.github/workflows/test-docker-image.yml @@ -30,28 +30,33 @@ jobs: REPO_LOWERCASE=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]') docker run --name test_container -d 
ghcr.io/$REPO_LOWERCASE:latest tail -f /dev/null - TEST_DIR="/tests" + PROJECT_DIR="/app" + TEST_DIR="$PROJECT_DIR/tests" + echo "Project directory: $PROJECT_DIR" echo "Tests directory: $TEST_DIR" - # Copy only the tests folder to the container + # Create project directory and copy tests folder + docker exec test_container mkdir -p $PROJECT_DIR docker cp tests test_container:$TEST_DIR # Verify the directory structure + echo "Contents of project directory:" + docker exec test_container ls -la $PROJECT_DIR echo "Contents of tests directory:" docker exec test_container ls -la $TEST_DIR - echo "Contents of tests/data directory:" - docker exec test_container ls -la $TEST_DIR/data - # Print content of a test file - echo "Content of a test file (first 20 lines):" - docker exec test_container head -n 20 $TEST_DIR/test_*.py | head -n 20 + # List test files and print content of the first one + echo "List of test files:" + docker exec test_container find $TEST_DIR -name "test_*.py" + echo "Content of the first test file (first 20 lines):" + docker exec test_container bash -c "head -n 20 \$(find $TEST_DIR -name 'test_*.py' | head -n 1)" # Install pytest docker exec test_container pip install pytest - # Run pytest - echo "Running pytest in the tests directory:" - docker exec -w $TEST_DIR test_container pytest -v + # Run pytest from the project directory + echo "Running pytest from the project directory:" + docker exec -w $PROJECT_DIR test_container python -m pytest tests -v # Clean up docker stop test_container From 195530083d5af1971a3edd48b118669d44531e7d Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Thu, 5 Sep 2024 16:19:58 +0200 Subject: [PATCH 44/68] remove debugging comment and trigger wf only when release is completed --- .github/workflows/test-docker-image.yml | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/.github/workflows/test-docker-image.yml b/.github/workflows/test-docker-image.yml index 6271bbe7..35eb6bbe 100644 --- a/.github/workflows/test-docker-image.yml +++ b/.github/workflows/test-docker-image.yml @@ -1,9 +1,11 @@ -name: test latest docker image (dev) # TODO: change +name: test latest docker image +# Only trigger, when the release workflow succeeded on: - push: - branches: - - 529_add_docker_testing_action_gcroci2 # TODO: change + workflow_run: + workflows: ["Build and upload to PyPI and ghcr.io"] + types: + - completed jobs: test_latest_docker_image: @@ -45,12 +47,6 @@ jobs: echo "Contents of tests directory:" docker exec test_container ls -la $TEST_DIR - # List test files and print content of the first one - echo "List of test files:" - docker exec test_container find $TEST_DIR -name "test_*.py" - echo "Content of the first test file (first 20 lines):" - docker exec test_container bash -c "head -n 20 \$(find $TEST_DIR -name 'test_*.py' | head -n 1)" - # Install pytest docker exec test_container pip install pytest From e0b209a2e5a10beda87848103ae5791d5354299d Mon Sep 17 00:00:00 2001 From: DaniBodor Date: Thu, 5 Sep 2024 16:06:28 +0200 Subject: [PATCH 45/68] tutorials: use zenodo doi for latest files in the workflow it is necessary to link to the exact version of the data. 
In the tutorials themselves, the link is given that always refers to the newest version --- .github/workflows/notebooks.yml | 4 +++- tutorials/data_generation_ppi.ipynb | 2 +- tutorials/data_generation_srv.ipynb | 2 +- tutorials/training.ipynb | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml index b1e59ea9..f4e9d1f1 100644 --- a/.github/workflows/notebooks.yml +++ b/.github/workflows/notebooks.yml @@ -54,9 +54,11 @@ jobs: - name: Download the data for the tutorials shell: bash -l {0} run: | - wget https://zenodo.org/records/8349335/files/data_raw.zip + wget https://zenodo.org/records/13709906/files/data_raw.zip unzip data_raw.zip -d data_raw mv data_raw tutorials + echo listing files in data_raw: + ls tutorials/data_raw - name: Run tutorial notebooks run: pytest --nbmake tutorials diff --git a/tutorials/data_generation_ppi.ipynb b/tutorials/data_generation_ppi.ipynb index 82e2c67f..98cd77fa 100644 --- a/tutorials/data_generation_ppi.ipynb +++ b/tutorials/data_generation_ppi.ipynb @@ -29,7 +29,7 @@ "source": [ "### Input Data\n", "\n", - "The example data used in this tutorial are available on Zenodo at [this record address](https://zenodo.org/record/8349335). To download the raw data used in this tutorial, please visit the link and download `data_raw.zip`. Unzip it, and save the `data_raw/` folder in the same directory as this notebook. The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", + "The example data used in this tutorial are available on Zenodo at [this record address](https://zenodo.org/record/7997585). To download the raw data used in this tutorial, please visit the link and download `data_raw.zip`. Unzip it, and save the `data_raw/` folder in the same directory as this notebook. The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", "\n", "Note that the dataset contains only 100 data points, which is not enough to develop an impactful predictive model, and the scope of its use is indeed only demonstrative and informative for the users.\n" ] diff --git a/tutorials/data_generation_srv.ipynb b/tutorials/data_generation_srv.ipynb index 4562b8d7..7dcde58a 100644 --- a/tutorials/data_generation_srv.ipynb +++ b/tutorials/data_generation_srv.ipynb @@ -29,7 +29,7 @@ "source": [ "### Input Data\n", "\n", - "The example data used in this tutorial are available on Zenodo at [this record address](https://zenodo.org/record/8349335). To download the raw data used in this tutorial, please visit the link and download `data_raw.zip`. Unzip it, and save the `data_raw/` folder in the same directory as this notebook. The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", + "The example data used in this tutorial are available on Zenodo at [this record address](https://zenodo.org/record/7997585). To download the raw data used in this tutorial, please visit the link and download `data_raw.zip`. Unzip it, and save the `data_raw/` folder in the same directory as this notebook. 
The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", "\n", "Note that the dataset contains only 96 data points, which is not enough to develop an impactful predictive model, and the scope of its use is indeed only demonstrative and informative for the users.\n" ] diff --git a/tutorials/training.ipynb b/tutorials/training.ipynb index 073a4062..b3445cc4 100644 --- a/tutorials/training.ipynb +++ b/tutorials/training.ipynb @@ -33,7 +33,7 @@ "\n", "If you have previously run `data_generation_ppi.ipynb` or `data_generation_srv.ipynb` notebook, then their output can be directly used as input for this tutorial.\n", "\n", - "Alternatively, preprocessed HDF5 files can be downloaded directly from Zenodo at [this record address](https://zenodo.org/record/8349335). To download the data used in this tutorial, please visit the link and download `data_processed.zip`. Unzip it, and save the `data_processed/` folder in the same directory as this notebook. The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", + "Alternatively, preprocessed HDF5 files can be downloaded directly from Zenodo at [this record address](https://zenodo.org/record/7997585). To download the data used in this tutorial, please visit the link and download `data_processed.zip`. Unzip it, and save the `data_processed/` folder in the same directory as this notebook. The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", "\n", "Note that the datasets contain only ~100 data points each, which is not enough to develop an impactful predictive model, and the scope of their use is indeed only demonstrative and informative for the users.\n" ] From 6bcb480502ef3a1477be64c8e167c45fec184f0c Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Fri, 6 Sep 2024 11:15:55 +0200 Subject: [PATCH 46/68] fix tutorials add --- Dockerfile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index b26ada57..9cbe9900 100644 --- a/Dockerfile +++ b/Dockerfile @@ -23,6 +23,7 @@ RUN \ echo ". 
${CONDA_DIR}/etc/profile.d/conda.sh && conda activate base" >> ~/.bashrc ADD ./env/deeprank2.yml /home/deeprank2/ +ADD ./tutorials /home/deeprank2/tutorials RUN \ ## Create the environment and install the dependencies @@ -35,7 +36,7 @@ RUN \ # Get the data for running the tutorials if [ -d "/home/deeprank2/tutorials/data_raw" ]; then rm -Rf /home/deeprank2/tutorials/data_raw; fi && \ if [ -d "/home/deeprank2/tutorials/data_processed" ]; then rm -Rf /home/deeprank2/tutorials/data_processed; fi && \ - wget https://zenodo.org/records/8349335/files/data_raw.zip && \ + wget https://zenodo.org/records/13709906/files/data_raw.zip && \ unzip data_raw.zip -d data_raw && \ mv data_raw /home/deeprank2/tutorials && \ apt-get clean && \ @@ -45,8 +46,6 @@ RUN \ find ${CONDA_DIR} -follow -type f -name '*.pyc' -delete && \ conda clean --force-pkgs-dirs --all --yes -ADD ./tutorials /home/deeprank2/tutorials - ENV PATH /opt/conda/envs/deeprank2/bin:$PATH # Define working directory From f931154b780d1b3efc7796b082422129449611a4 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Fri, 6 Sep 2024 11:16:23 +0200 Subject: [PATCH 47/68] update zenodo address --- tutorials/data_generation_ppi.ipynb | 2 +- tutorials/data_generation_srv.ipynb | 2 +- tutorials/training.ipynb | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tutorials/data_generation_ppi.ipynb b/tutorials/data_generation_ppi.ipynb index 2d1d9650..8553d4a7 100644 --- a/tutorials/data_generation_ppi.ipynb +++ b/tutorials/data_generation_ppi.ipynb @@ -29,7 +29,7 @@ "source": [ "### Input Data\n", "\n", - "The example data used in this tutorial are available on Zenodo at [this record address](https://zenodo.org/record/8349335). To download the raw data used in this tutorial, please visit the link and download `data_raw.zip`. Unzip it, and save the `data_raw/` folder in the same directory as this notebook. The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", + "The example data used in this tutorial are available on Zenodo at [this record address](https://zenodo.org/record/13709906). To download the raw data used in this tutorial, please visit the link and download `data_raw.zip`. Unzip it, and save the `data_raw/` folder in the same directory as this notebook. The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", "\n", "Note that the dataset contains only 100 data points, which is not enough to develop an impactful predictive model, and the scope of its use is indeed only demonstrative and informative for the users.\n" ] diff --git a/tutorials/data_generation_srv.ipynb b/tutorials/data_generation_srv.ipynb index 1a68f31a..a4fb3c16 100644 --- a/tutorials/data_generation_srv.ipynb +++ b/tutorials/data_generation_srv.ipynb @@ -29,7 +29,7 @@ "source": [ "### Input Data\n", "\n", - "The example data used in this tutorial are available on Zenodo at [this record address](https://zenodo.org/record/8349335). To download the raw data used in this tutorial, please visit the link and download `data_raw.zip`. Unzip it, and save the `data_raw/` folder in the same directory as this notebook. 
The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", + "The example data used in this tutorial are available on Zenodo at [this record address](https://zenodo.org/record/13709906). To download the raw data used in this tutorial, please visit the link and download `data_raw.zip`. Unzip it, and save the `data_raw/` folder in the same directory as this notebook. The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", "\n", "Note that the dataset contains only 96 data points, which is not enough to develop an impactful predictive model, and the scope of its use is indeed only demonstrative and informative for the users.\n" ] diff --git a/tutorials/training.ipynb b/tutorials/training.ipynb index 499c3f8e..2c662b8d 100644 --- a/tutorials/training.ipynb +++ b/tutorials/training.ipynb @@ -33,7 +33,7 @@ "\n", "If you have previously run `data_generation_ppi.ipynb` or `data_generation_srv.ipynb` notebook, then their output can be directly used as input for this tutorial.\n", "\n", - "Alternatively, preprocessed HDF5 files can be downloaded directly from Zenodo at [this record address](https://zenodo.org/record/8349335). To download the data used in this tutorial, please visit the link and download `data_processed.zip`. Unzip it, and save the `data_processed/` folder in the same directory as this notebook. The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", + "Alternatively, preprocessed HDF5 files can be downloaded directly from Zenodo at [this record address](https://zenodo.org/record/13709906). To download the data used in this tutorial, please visit the link and download `data_processed.zip`. Unzip it, and save the `data_processed/` folder in the same directory as this notebook. 
The name and the location of the folder are optional but recommended, as they are the name and the location we will use to refer to the folder throughout the tutorial.\n", "\n", "Note that the datasets contain only ~100 data points each, which is not enough to develop an impactful predictive model, and the scope of their use is indeed only demonstrative and informative for the users.\n" ] From 3d784caddc9200abe95222b263a11d5e52dac038 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Fri, 6 Sep 2024 11:16:35 +0200 Subject: [PATCH 48/68] update zenodo address --- .github/workflows/notebooks.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/notebooks.yml b/.github/workflows/notebooks.yml index 93433352..9f3f2902 100644 --- a/.github/workflows/notebooks.yml +++ b/.github/workflows/notebooks.yml @@ -54,7 +54,7 @@ jobs: - name: Download the data for the tutorials shell: bash -l {0} run: | - wget https://zenodo.org/records/8349335/files/data_raw.zip + wget https://zenodo.org/records/13709906/files/data_raw.zip unzip data_raw.zip -d data_raw mv data_raw tutorials From b61662b3929aa8dff2fcd22d1823df2b3b8e13f0 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Fri, 6 Sep 2024 11:16:49 +0200 Subject: [PATCH 49/68] update docs --- README.md | 79 +++++++++++++++++++++++++------------ docs/source/docking.md | 4 +- docs/source/features.md | 30 +++++++------- docs/source/getstarted.md | 12 +++--- docs/source/installation.md | 48 ++++++++++++++++------ 5 files changed, 114 insertions(+), 59 deletions(-) diff --git a/README.md b/README.md index c9cd5187..8042303a 100644 --- a/README.md +++ b/README.md @@ -41,27 +41,30 @@ Main features: 📣 [Discussions](https://github.com/DeepRank/deeprank2/discussions) -## Table of contents +## Table of Contents - [DeepRank2](#deeprank2) - [Overview](#overview) - - [Table of contents](#table-of-contents) + - [Table of Contents](#table-of-contents) - [Installation](#installation) - [Containerized Installation](#containerized-installation) - - [Local/remote installation](#localremote-installation) - - [YML file installation (recommended)](#yml-file-installation-recommended) - - [Manual installation (customizable)](#manual-installation-customizable) - - [Testing DeepRank2 installation](#testing-deeprank2-installation) + - [Pull and Run the Pre-build Docker Image (Recommended)](#pull-and-run-the-pre-build-docker-image-recommended) + - [Build the Docker Image Manually](#build-the-docker-image-manually) + - [Removing the Docker Image](#removing-the-docker-image) + - [Local/remote Installation](#localremote-installation) + - [YML File Installation (Recommended)](#yml-file-installation-recommended) + - [Manual Installation (Customizable)](#manual-installation-customizable) + - [Testing DeepRank2 Installation](#testing-deeprank2-installation) - [Contributing](#contributing) - [Using DeepRank2](#using-deeprank2) - - [Data generation](#data-generation) + - [Data Generation](#data-generation) - [Datasets](#datasets) - [GraphDataset](#graphdataset) - [GridDataset](#griddataset) - [Training](#training) - - [Run a pre-trained model on new data](#run-a-pre-trained-model-on-new-data) - - [Computational performances](#computational-performances) - - [Package development](#package-development) + - [Run a Pre-trained Model on New Data](#run-a-pre-trained-model-on-new-data) + - [Computational Performances](#computational-performances) + - [Package Development](#package-development) ## Installation @@ -74,33 +77,59 @@ There are two ways to install DeepRank2: ### 
Containerized Installation -In order to try out the package without worrying about your OS and without the need of installing all the required dependencies, we created a `Dockerfile` that can be used for taking care of everything in a suitable container. +We provide a pre-built Docker image hosted on GitHub Packages, allowing you to use DeepRank2 without worrying about installing dependencies or configuring your system. This is the recommended method for trying out the package quickly. -For this, you first need to install [Docker](https://docs.docker.com/engine/install/) on your system. Then run the following commands. You may need to have sudo permission for some steps, in which case the commands below can be preceded by `sudo`: +#### Pull and Run the Pre-build Docker Image (Recommended) + +- Install [Docker](https://docs.docker.com/engine/install/) on your system, if not already installed. +- Pull the latest Docker image from GitHub Packages by running the following command: + +```bash +docker pull ghcr.io/deeprank/deeprank2:latest +``` + +- Run the container from the pulled image: + +```bash +docker run -p 8888:8888 ghcr.io/deeprank/deeprank2:latest +``` + +- Once the container is running, open your browser and navigate to `http://localhost:8888` to access the DeepRank2 application. + +From here, you can use DeepRank2, including running the tutorial notebooks. More details about the tutorials can be found [here](https://github.com/DeepRank/deeprank2/blob/main/tutorials/TUTORIAL.md). Note that the Docker container downloads only the raw PDB files required for the tutorials. To generate processed HDF5 files, you will need to run the `data_generation_xxx.ipynb` notebooks. Since Docker containers may have limited memory resources, we reduce the number of data points processed in the tutorials. To fully utilize the package, consider [installing it locally](#localremote-installation). + +#### Build the Docker Image Manually + +If you prefer to build the Docker image yourself or run into issues with the pre-built image, you can manually build and run the container as follows: + +- Install [Docker](https://docs.docker.com/engine/install/) on your system, if not already installed. +- Clone the DeepRank2 repository and navigate to its root directory: ```bash -# Clone the DeepRank2 repository and enter its root directory git clone https://github.com/DeepRank/deeprank2 cd deeprank2 +``` -# Build and run the Docker image +- Build and run the Docker image: + +```bash docker build -t deeprank2 . docker run -p 8888:8888 deeprank2 ``` -Next, open a browser and go to `http://localhost:8888` to access the application running inside the Docker container. From there you can use DeepRank2, e.g. to run the tutorial notebooks. +- Once the container is running, open your browser and navigate to `http://localhost:8888` to access the DeepRank2 application. -More details about the tutorials' contents can be found [here](https://github.com/DeepRank/deeprank2/blob/main/tutorials/TUTORIAL.md). Note that in the docker container only the raw PDB files are downloaded, which needed as a starting point for the tutorials. You can obtain the processed HDF5 files by running the `data_generation_xxx.ipynb` notebooks. Because Docker containers are limited in memory resources, we limit the number of data points processed in the tutorials. Please [install the package locally](#localremote-installation) to fully leverage its capabilities. 
+#### Removing the Docker Image -If after running the tutorials you want to remove the (quite large) Docker image from your machine, you must first [stop the container](https://docs.docker.com/engine/reference/commandline/stop/) and can then [remove the image](https://docs.docker.com/engine/reference/commandline/image_rm/). More general information about Docker can be found on the [official website docs](https://docs.docker.com/get-started/). +If you no longer need the Docker image (which can be quite large), you can remove it after stopping the container. Follow the [container stop instructions](https://docs.docker.com/engine/reference/commandline/stop/) and [remove the image](https://docs.docker.com/engine/reference/commandline/image_rm/). For more general information on Docker, refer to the [official Docker documentation](https://docs.docker.com/get-started/). -### Local/remote installation +### Local/remote Installation Local installation is formally only supported on the latest stable release of ubuntu, for which widespread automated testing through continuous integration workflows has been set up. However, it is likely that the package runs smoothly on other operating systems as well. Before installing DeepRank2 please ensure you have [GCC](https://gcc.gnu.org/install/) installed: if running `gcc --version` gives an error, run `sudo apt-get install gcc`. -#### YML file installation (recommended) +#### YML File Installation (Recommended) You can use the provided YML file for creating a [conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) via [mamba](https://mamba.readthedocs.io/en/latest/installation/mamba-installation.html), containing the latest stable release of DeepRank2 and all its dependencies. This will install the CPU-only version of DeepRank2 on Python 3.10. @@ -117,7 +146,7 @@ pip install deeprank2 We also provide a frozen environment YML file located at `env/deeprank2_frozen.yml` with all dependencies set to fixed versions. The `env/deeprank2_frozen.yml` file provides a frozen environment with all dependencies set to fixed versions. This ensures reproducibility of experiments and results by preventing changes in package versions that could occur due to updates or modifications in the default `env/deeprank2.yml`. Use this frozen environment file for a stable and consistent setup, particularly if you encounter issues with the default environment file. -#### Manual installation (customizable) +#### Manual Installation (Customizable) If you want to use the GPUs, choose a specific python version (note that at the moment we support python 3.10 only), are a MacOS user, or if the YML installation was not successful, you can install the package manually. We advise to do this inside a [conda virtual environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). @@ -143,7 +172,7 @@ pip install -e .'[test]' The `test` extra is optional, and can be used to install test-related dependencies, useful during development. -#### Testing DeepRank2 installation +#### Testing DeepRank2 Installation If you have cloned the repository, you can check that all components were installed correctly using `pytest`. We especially recommend doing this in case you installed DeepRank2 and its dependencies manually (the latter option above). 
@@ -160,7 +189,7 @@ If you would like to contribute to the package in any way, please see [our guide The following section serves as a first guide to start using the package, using protein-protein Interface (PPI) queries as example. For an enhanced learning experience, we provide in-depth [tutorial notebooks](https://github.com/DeepRank/deeprank2/tree/main/tutorials) for generating PPI data, generating SVR data, and for the training pipeline. For more details, see the [extended documentation](https://deeprank2.rtfd.io/). -### Data generation +### Data Generation For each protein-protein complex (or protein structure containing a missense variant), a `Query` can be created and added to the `QueryCollection` object, to be processed later on. Two subtypes of `Query` exist: `ProteinProteinInterfaceQuery` and `SingleResidueVariantQuery`. @@ -370,7 +399,7 @@ trainer.test() ``` -#### Run a pre-trained model on new data +#### Run a Pre-trained Model on New Data If you want to analyze new PDB files using a pre-trained model, the first step is to process and save them into HDF5 files [as we have done above](#data-generation). @@ -404,7 +433,7 @@ trainer.test() For more details about how to run a pre-trained model on new data, see the [docs](https://deeprank2.readthedocs.io/en/latest/getstarted.html#run-a-pre-trained-model-on-new-data). -## Computational performances +## Computational Performances We measured the efficiency of data generation in DeepRank2 using the tutorials' [PDB files](https://zenodo.org/record/8187806) (~100 data points per data set), averaging the results run on Apple M1 Pro, using a single CPU. Parameter settings were: atomic resolution, `distance_cutoff` of 5.5 Å, radius (for SRV only) of 10 Å. The [features modules](https://deeprank2.readthedocs.io/en/latest/features.html) used were `components`, `contact`, `exposure`, `irc`, `secondary_structure`, `surfacearea`, for a total of 33 features for PPIs and 26 for SRVs (the latter do not use `irc` features). @@ -414,6 +443,6 @@ Parameter settings were: atomic resolution, `distance_cutoff` of 5.5 Å, radius | PPIs | graph only: **2.99** (std 0.23)
graph+grid: **11.35** (std 1.30) | graph only: **0.54** (std 0.07)
graph+grid: **16.09** (std 0.44) | | SRVs | graph only: **2.20** (std 0.08)
graph+grid: **2.85** (std 0.10) | graph only: **0.05** (std 0.01)
graph+grid: **17.52** (std 0.59) | -## Package development +## Package Development If you're looking for developer documentation, go [here](https://github.com/DeepRank/deeprank2/blob/dev/README.dev.md). diff --git a/docs/source/docking.md b/docs/source/docking.md index eb5f83ae..f7e05091 100644 --- a/docs/source/docking.md +++ b/docs/source/docking.md @@ -1,4 +1,4 @@ -# Docking scores +# Docking Scores The following scores have been developed for evaluating the quality of the protein-protein models produced by computational methods (docking models), and all of them compare the structural similarity between the decoys (computationally generated structures) and the experimentally solved native structures. To calculate these measures, the interface between the two interacting protein molecules is defined as any pair of heavy atoms from the two molecules within 5Å of each other. @@ -11,7 +11,7 @@ The following scores have been developed for evaluating the quality of the prote See https://onlinelibrary.wiley.com/doi/abs/10.1002/prot.10393 for more details about `capri_class`, `lrmsd`, `irmsd`, and `fnat`. See https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0161879 for more details about `dockq`. -## Compute and add docking scores +## Compute and Add Docking Scores The following code snippet shows an example of how to use deeprank2 to compute the docking scores for a given docking model, and how to add one of the scores (e.g., `dockq`) as a target to the already processed data. diff --git a/docs/source/features.md b/docs/source/features.md index 6fa59ea6..3271f6e9 100644 --- a/docs/source/features.md +++ b/docs/source/features.md @@ -2,7 +2,7 @@ Features implemented in the code-base are defined in `deeprank2.feature` subpackage. -## Custom features +## Custom Features Users can add custom features by cloning the repository, creating a new module and placing it in `deeprank2.feature` subpackage. The custom features can then be used by installing the package in editable mode (see [here](https://deeprank2.readthedocs.io/en/latest/installation.html#install-deeprank2) for more details). We strongly recommend submitting a pull request (PR) to merge the new feature into the official repository. @@ -77,15 +77,15 @@ dataset = GraphDataset( The following is a brief description of the features already implemented in the code-base, for each features' module. -## Default node features +## Default Node Features For atomic graphs, when features relate to residues then _all_ atoms of one residue receive the feature value for that residue. -### Core properties of atoms and residues: `deeprank2.features.components` +### Core Properties of Atoms and Residues: `deeprank2.features.components` These features relate to the chemical components (atoms and amino acid residues) of which the graph is composed. Detailed information and descrepancies between sources are described can be found in `deeprank2.domain.aminoacidlist.py`. -#### Atom properties: +#### Atom Properties: These features are only used in atomic graphs. @@ -93,7 +93,7 @@ These features are only used in atomic graphs. - `atom_charge`: Atomic charge in Coulomb (float). Taken from `deeprank2.domain.forcefield.patch.top`. - `pdb_occupancy`: Proportion of structures where the atom was detected at this position (float). In some cases a single atom was detected at different positions, in which case separate structures exist whose occupancies sum to 1. Only the highest occupancy atom is used by deeprank2. 
-#### Residue properties: +#### Residue Properties: - `res_type`: One-hot encoding of the amino acid residue (size 20). - `polarity`: One-hot encoding of the polarity of the amino acid (options: NONPOLAR, POLAR, NEGATIVE, POSITIVE). Note that sources vary on the polarity for few of the amino acids; see detailed information in `deeprank2.domain.aminoacidlist.py`. @@ -104,14 +104,14 @@ These features are only used in atomic graphs. - `hb_donors`, `hb_acceptors`: The number of hydrogen bond donor/acceptor atoms in the residue (int). Hydrogen bonds are noncovalent intermolecular interactions formed between an hydrogen atom (partially positively charged) bound to a small, highly electronegative atom (O, N, F) with an unshared electron pair. -#### Properties related to variant residues: +#### Properties Related to Variant Residues: These features are only used in SingleResidueVariant queries. - `variant_res`: One-hot encoding of variant amino acid (size 20). - `diff_charge`, `diff_polarity`, `diff_size`, `diff_mass`, `diff_pI`, `diff_hb_donors`, `diff_hb_acceptors`: Subtraction of the wildtype value of indicated feature from the variant value. For example, if the variant has 4 hb_donors and the wildtype has 5, then `diff_hb_donors == -1`. -### Conservation features: `deeprank2.features.conservation` +### Conservation Features: `deeprank2.features.conservation` These features relate to the conservation state of individual residues. @@ -120,36 +120,36 @@ These features relate to the conservation state of individual residues. - `conservation` (only used in SingleResidueVariant queries): Conservation of the wild type amino acid (float). _More details required._ - `diff_conservation` (only used in SingleResidueVariant queries): Subtraction of wildtype conservation from the variant conservation (float). -### Protein context features: +### Protein Context Features: -#### Surface exposure: `deeprank2.features.exposure` +#### Surface Exposure: `deeprank2.features.exposure` These features relate to the exposure of residues to the surface, and are computed using [biopython](https://biopython.org/docs/1.81/api/Bio.PDB.html). Note that these features can only be calculated per residue and not per atom. - `res_depth`: [Residue depth](https://en.wikipedia.org/wiki/Residue_depth) is the average distance (in Å) of the residue to the closest molecule of bulk water (float). See also [`Bio.PDB.ResidueDepth`](https://biopython.org/docs/1.75/api/Bio.PDB.ResidueDepth.html). - `hse`: [Half sphere exposure (HSE)](https://en.wikipedia.org/wiki/Half_sphere_exposure) is a protein solvent exposure measure indicating how buried an amino acid residue is in a protein (3 float values, see [Bio.PDB.HSExposure](https://biopython.org/docs/dev/api/Bio.PDB.HSExposure.html#module-Bio.PDB.HSExposure) for details). -#### Surface accessibility: `deeprank2.features.surfacearea` +#### Surface Accessibility: `deeprank2.features.surfacearea` These features relate to the surface area of the residue, and are computed using [freesasa](https://freesasa.github.io). Note that these features can only be calculated per residue and not per atom. - `sasa`: [Solvent-Accessible Surface Area](https://en.wikipedia.org/wiki/Accessible_surface_area) is the surface area (in Å^2) of a biomolecule that is accessible to the solvent (float). - `bsa`: Buried Surface Area is the surface area (in Å^2) that is buried away from the solvent when two or more proteins or subunits associate to form a complex, i.e. it measures the size of the complex interface (float). 
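As an illustration of how these two quantities relate (this is the conventional complex-level relation, not necessarily the exact per-residue computation performed internally): `BSA = SASA(chain A alone) + SASA(chain B alone) - SASA(complex)`, i.e. the solvent-accessible area that the isolated chains lose upon forming the interface.
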
-#### Secondary structure: `deeprank2.features.secondary_structure` +#### Secondary Structure: `deeprank2.features.secondary_structure` - `sec_struct`: One-hot encoding of the [DSSP]() assigned secondary structure of the amino acid, using the three major classes (HELIX, STRAND, COIL). Calculated using [DSSP4](https://github.com/PDB-REDO/dssp). -#### Inter-residue contacts (IRCs): `deeprank2.features.irc` +#### Inter-residue Contacts (IRCs): `deeprank2.features.irc` These features are only calculated for ProteinProteinInterface queries. - `irc_total`: The number of residues on the other chain that are within a cutoff distance of 5.5 Å (int). - `irc_nonpolar_nonpolar`, `irc_nonpolar_polar`, `irc_nonpolar_negative`, `irc_nonpolar_positive`, `irc_polar_polar`, `irc_polar_negative`, `irc_polar_positive`, `irc_negative_negative`, `irc_positive_positive`, `irc_negative_positive`: As above, but for specific residue polarity pairings. -## Default edge features +## Default Edge Features -### Contact features: `deeprank2.features.contact` +### Contact Features: `deeprank2.features.contact` These features relate to relationships between individual nodes. For atomic graphs, when features relate to residues then _all_ atoms of one residue receive the feature value for that residue. @@ -166,7 +166,7 @@ These features relate to the structural relationship between nodes. - `same_res`: Boolean indicating whether atoms belong to the same residue (1) or separate residues (0). Only used in atomic graphs. - `covalent`: Boolean indicating whether nodes are covalently bound (1) or not (0). Note that covalency is not directly assessed, but any edge with a maximum distance of 2.1 Å is considered covalent. -#### Nonbond energies: +#### Nonbond Energies: These features measure nonbond energy potentials between nodes, and are calculated using [OPLS forcefield](https://en.wikipedia.org/wiki/OPLS). For residue graphs, the pairwise sum of potentials for all atoms from each residue is used. Note that no distance cutoff is used and the radius of influence is assumed to be infinite, although the potentials tends to 0 at large distance. Also edges are only assigned within a given cutoff radius when graphs are created. diff --git a/docs/source/getstarted.md b/docs/source/getstarted.md index cca87829..9c264df0 100644 --- a/docs/source/getstarted.md +++ b/docs/source/getstarted.md @@ -1,9 +1,9 @@ -# Get started +# Get Started The following section serves as a first guide to start using the package, using protein-protein interface (PPI) queries as example. For an enhanced learning experience, we provide in-depth [tutorial notebooks](https://github.com/DeepRank/deeprank2/tree/main/tutorials) for generating PPI data, generating SRVs data, and for the training pipeline. -## Data generation +## Data Generation For each protein-protein complex (or protein structure containing a missense variant), a `Query` can be created and added to the `QueryCollection` object, to be processed later on. Two subtypes of `Query` exist: `ProteinProteinInterfaceQuery` and `SingleResidueVariantQuery`. 
@@ -89,7 +89,7 @@ hdf5_paths = queries.process( grid_map_method = MapMethod.GAUSSIAN) ``` -## Data exploration +## Data Exploration As representative example, the following is the HDF5 structure generated by the previous phase for `1ATN_1w.pdb`, so for one single graph, for the graph + grid case: @@ -199,7 +199,7 @@ dataset_test = GraphDataset( ) ``` -#### Transforming features +#### Transforming Features For the `GraphDataset` class it is possible to define a dictionary to indicate which transformations to apply to the features, being the transformations lambda functions and/or standardization. If `True`, standardization is applied after transformation, if the latter is present. Example: @@ -360,7 +360,7 @@ trainer.test() ``` -### Results export and visualization +### Results Export and Visualization The user can specify a DeepRank2 exporter or a custom one in `output_exporters` parameter of the Trainer class, together with the path where to save the results. Exporters are used for storing predictions information collected later on during training and testing. Example: @@ -411,7 +411,7 @@ fig.update_layout( ) ``` -## Run a pre-trained model on new data +## Run a Pre-trained Model on New Data If you want to run a pre-trained model on new PDB files, the first step is to process and save them into HDF5 files. Let's suppose that the model has been trained with `ProteinProteinInterfaceQuery` queries mapped to graphs: diff --git a/docs/source/installation.md b/docs/source/installation.md index b945f260..904bb3ff 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -11,27 +11,53 @@ There are two ways to install DeepRank2: (containerized-installation)= -In order to try out the package without worrying about your OS and without the need of installing all the required dependencies, we created a `Dockerfile` that can be used for taking care of everything in a suitable container. +We provide a pre-built Docker image hosted on GitHub Packages, allowing you to use DeepRank2 without worrying about installing dependencies or configuring your system. This is the recommended method for trying out the package quickly. -For this, you first need to install [Docker](https://docs.docker.com/engine/install/) on your system. Then run the following commands. You may need to have sudo permission for some steps, in which case the commands below can be preceded by `sudo`: +### Pull and Run the Pre-build Docker Image (Recommended) + +- Install [Docker](https://docs.docker.com/engine/install/) on your system, if not already installed. +- Pull the latest Docker image from GitHub Packages by running the following command: + +```bash +docker pull ghcr.io/deeprank/deeprank2:latest +``` + +- Run the container from the pulled image: + +```bash +docker run -p 8888:8888 ghcr.io/deeprank/deeprank2:latest +``` + +- Once the container is running, open your browser and navigate to `http://localhost:8888` to access the DeepRank2 application. + +From here, you can use DeepRank2, including running the tutorial notebooks. More details about the tutorials can be found [here](https://github.com/DeepRank/deeprank2/blob/main/tutorials/TUTORIAL.md). Note that the Docker container downloads only the raw PDB files required for the tutorials. To generate processed HDF5 files, you will need to run the `data_generation_xxx.ipynb` notebooks. Since Docker containers may have limited memory resources, we reduce the number of data points processed in the tutorials. 
To fully utilize the package, consider [installing it locally](#localremote-installation). + +### Build the Docker Image Manually + +If you prefer to build the Docker image yourself or run into issues with the pre-built image, you can manually build and run the container as follows: + +- Install [Docker](https://docs.docker.com/engine/install/) on your system, if not already installed. +- Clone the DeepRank2 repository and navigate to its root directory: ```bash -# Clone the DeepRank2 repository and enter its root directory git clone https://github.com/DeepRank/deeprank2 cd deeprank2 +``` -# Build and run the Docker image +- Build and run the Docker image: + +```bash docker build -t deeprank2 . docker run -p 8888:8888 deeprank2 ``` -Next, open a browser and go to `http://localhost:8888` to access the application running inside the Docker container. From there you can use DeepRank2, e.g. to run the tutorial notebooks. +- Once the container is running, open your browser and navigate to `http://localhost:8888` to access the DeepRank2 application. -More details about the tutorials' contents can be found [here](https://github.com/DeepRank/deeprank2/blob/main/tutorials/TUTORIAL.md). Note that in the docker container only the raw PDB files are downloaded, which needed as a starting point for the tutorials. You can obtain the processed HDF5 files by running the `data_generation_xxx.ipynb` notebooks. Because Docker containers are limited in memory resources, we limit the number of data points processed in the tutorials. Please [install the package locally](#localremote-installation) to fully leverage its capabilities. +### Removing the Docker Image -If after running the tutorials you want to remove the (quite large) Docker image from your machine, you must first [stop the container](https://docs.docker.com/engine/reference/commandline/stop/) and can then [remove the image](https://docs.docker.com/engine/reference/commandline/image_rm/). More general information about Docker can be found on the [official website docs](https://docs.docker.com/get-started/). +If you no longer need the Docker image (which can be quite large), you can remove it after stopping the container. Follow the [container stop instructions](https://docs.docker.com/engine/reference/commandline/stop/) and [remove the image](https://docs.docker.com/engine/reference/commandline/image_rm/). For more general information on Docker, refer to the [official Docker documentation](https://docs.docker.com/get-started/). -## Local/remote installation +## Local/remote Installation (localremote-installation)= @@ -39,7 +65,7 @@ Local installation is formally only supported on the latest stable release of ub Before installing DeepRank2 please ensure you have [GCC](https://gcc.gnu.org/install/) installed: if running `gcc --version` gives an error, run `sudo apt-get install gcc`. -## YML file installation (recommended) +## YML File Installation (Recommended) You can use the provided YML file for creating a [conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) via [mamba](https://mamba.readthedocs.io/en/latest/installation/mamba-installation.html), containing the latest stable release of DeepRank2 and all its dependencies. This will install the CPU-only version of DeepRank2 on Python 3.10. @@ -56,7 +82,7 @@ pip install deeprank2 We also provide a frozen environment YML file located at `env/deeprank2_frozen.yml` with all dependencies set to fixed versions. 
The `env/deeprank2_frozen.yml` file provides a frozen environment with all dependencies set to fixed versions. This ensures reproducibility of experiments and results by preventing changes in package versions that could occur due to updates or modifications in the default `env/deeprank2.yml`. Use this frozen environment file for a stable and consistent setup, particularly if you encounter issues with the default environment file. -## Manual installation (customizable) +## Manual Installation (Customizable) (manual-installation)= @@ -84,7 +110,7 @@ pip install -e .'[test]' The `test` extra is optional, and can be used to install test-related dependencies, useful during development. -## Testing DeepRank2 installation +## Testing DeepRank2 Installation If you have cloned the repository, you can check that all components were installed correctly using `pytest`. We especially recommend doing this in case you installed DeepRank2 and its dependencies manually (the latter option above). From 55bd679e3de15f5b6b2c83eace07900098a6a043 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Fri, 6 Sep 2024 11:23:33 +0200 Subject: [PATCH 50/68] update target values file for svr --- tests/perf/srv_perf.py | 2 +- tutorials/data_generation_srv.ipynb | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/perf/srv_perf.py b/tests/perf/srv_perf.py index 94be374e..645030cb 100644 --- a/tests/perf/srv_perf.py +++ b/tests/perf/srv_perf.py @@ -88,7 +88,7 @@ def get_pdb_files_and_target_data(data_path: str) -> tuple[list[str], list, list, list, list]: - csv_data = pd.read_csv(os.path.join(data_path, "srv_target_values.csv")) + csv_data = pd.read_csv(os.path.join(data_path, "srv_target_values_curated.csv")) # before running this script change .ent to .pdb pdb_files = glob.glob(os.path.join(data_path, "pdb", "*.pdb")) pdb_files.sort() diff --git a/tutorials/data_generation_srv.ipynb b/tutorials/data_generation_srv.ipynb index a4fb3c16..f8606c7e 100644 --- a/tutorials/data_generation_srv.ipynb +++ b/tutorials/data_generation_srv.ipynb @@ -114,7 +114,7 @@ "metadata": {}, "source": [ "- Raw data are PDB files in `data_raw/srv/pdb/`, which contains atomic coordinates of the protein structure containing the variant.\n", - "- Target data, so in our case pathogenic versus benign labels, are in `data_raw/srv/srv_target_values.csv`.\n", + "- Target data, so in our case pathogenic versus benign labels, are in `data_raw/srv/srv_target_values_curated.csv`.\n", "- The final SRV processed data will be saved in `data_processed/srv/` folder, which in turns contains a folder for residue-level data and another one for atomic-level data. 
More details about such different levels will come a few cells below.\n" ] }, @@ -133,7 +133,7 @@ "outputs": [], "source": [ "def get_pdb_files_and_target_data(data_path):\n", - " csv_data = pd.read_csv(os.path.join(data_path, \"srv_target_values.csv\"))\n", + " csv_data = pd.read_csv(os.path.join(data_path, \"srv_target_values_curated.csv\"))\n", " pdb_files = glob.glob(os.path.join(data_path, \"pdb\", \"*.ent\"))\n", " pdb_files.sort()\n", " pdb_file_names = [os.path.basename(pdb_file) for pdb_file in pdb_files]\n", From e2c0e991b1e137cd11464dba9e3b61315d78ce50 Mon Sep 17 00:00:00 2001 From: DaniBodor Date: Fri, 6 Sep 2024 13:59:23 +0200 Subject: [PATCH 51/68] ci: move ruff settings to separate file --- .ruff.toml | 59 +++++++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 60 -------------------------------------------------- 2 files changed, 59 insertions(+), 60 deletions(-) create mode 100644 .ruff.toml diff --git a/.ruff.toml b/.ruff.toml new file mode 100644 index 00000000..80320a9c --- /dev/null +++ b/.ruff.toml @@ -0,0 +1,59 @@ +target-version = "py310" +output-format = "concise" +line-length = 159 + +[lint] +select = ["ALL"] +pydocstyle.convention = "google" # docstring settings +ignore = [ + # Unrealistic for this code base + "PTH", # flake8-use-pathlib + "N", # naming conventions + "PLR0912", # Too many branches, + "PLR0913", # Too many arguments in function definition + "D102", # Missing docstring in public method + # Unwanted + "FBT", # Using boolean arguments + "ANN101", # Missing type annotation for `self` in method + "ANN102", # Missing type annotation for `cls` in classmethod + "ANN204", # Missing return type annotation for special (dunder) method + "B028", # No explicit `stacklevel` keyword argument found in warning + "S105", # Possible hardcoded password + "S311", # insecure random generators + "PT011", # pytest-raises-too-broad + "SIM108", # Use ternary operator + # Unwanted docstrings + "D100", # Missing module docstring + "D104", # Missing public package docstring + "D105", # Missing docstring in magic method + "D107", # Missing docstring in `__init__` +] + +# Autofix settings +fixable = ["ALL"] +unfixable = ["F401"] # unused imports (should not disappear while editing) +extend-safe-fixes = [ + "D415", # First line should end with a period, question mark, or exclamation point + "D300", # Use triple double quotes `"""` + "D200", # One-line docstring should fit on one line + "TCH", # Format type checking only imports + "ISC001", # Implicitly concatenated strings on a single line + "EM", # Exception message variables + "RUF013", # Implicit Optional + "B006", # Mutable default argument +] + +isort.known-first-party = ["deeprank2"] + +[lint.per-file-ignores] +"tests/*" = [ + "S101", # Use of `assert` detected + "PLR2004", # Magic value used in comparison + "D101", # Missing class docstring + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "SLF001", # private member access +] +"docs/*" = ["ALL"] +"tests/perf/*" = ["T201"] # Use of print statements +"*.ipynb" = ["T201", "E402", "D103"] diff --git a/pyproject.toml b/pyproject.toml index 4172d7cf..881c670f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,63 +89,3 @@ include = ["deeprank2*"] [tool.pytest.ini_options] # pytest options: -ra: show summary info for all test outcomes addopts = "-ra" - -[tool.ruff] -output-format = "concise" -line-length = 159 - -[tool.ruff.lint] -select = ["ALL"] -pydocstyle.convention = "google" # docstring settings -ignore = [ - # 
Unrealistic for this code base - "PTH", # flake8-use-pathlib - "N", # naming conventions - "PLR0912", # Too many branches, - "PLR0913", # Too many arguments in function definition - "D102", # Missing docstring in public method - # Unwanted - "FBT", # Using boolean arguments - "ANN101", # Missing type annotation for `self` in method - "ANN102", # Missing type annotation for `cls` in classmethod - "ANN204", # Missing return type annotation for special (dunder) method - "B028", # No explicit `stacklevel` keyword argument found in warning - "S105", # Possible hardcoded password - "S311", # insecure random generators - "PT011", # pytest-raises-too-broad - "SIM108", # Use ternary operator - # Unwanted docstrings - "D100", # Missing module docstring - "D104", # Missing public package docstring - "D105", # Missing docstring in magic method - "D107", # Missing docstring in `__init__` -] - -# Autofix settings -fixable = ["ALL"] -unfixable = ["F401"] # unused imports (should not disappear while editing) -extend-safe-fixes = [ - "D415", # First line should end with a period, question mark, or exclamation point - "D300", # Use triple double quotes `"""` - "D200", # One-line docstring should fit on one line - "TCH", # Format type checking only imports - "ISC001", # Implicitly concatenated strings on a single line - "EM", # Exception message variables - "RUF013", # Implicit Optional - "B006", # Mutable default argument -] - -isort.known-first-party = ["deeprank2"] - -[tool.ruff.lint.per-file-ignores] -"tests/*" = [ - "S101", # Use of `assert` detected - "PLR2004", # Magic value used in comparison - "D101", # Missing class docstring - "D102", # Missing docstring in public method - "D103", # Missing docstring in public function - "SLF001", # private member access -] -"docs/*" = ["ALL"] -"tests/perf/*" = ["T201"] # Use of print statements -"*.ipynb" = ["T201", "E402", "D103"] From 27ca9a8b37704f35ce1a4a6acc918969b130598b Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Tue, 9 Jul 2024 12:00:55 +0200 Subject: [PATCH 52/68] build: use bump-my-version bump2version is no longer maintained --- .bumpversion.cfg | 17 ----------------- pyproject.toml | 5 ++--- 2 files changed, 2 insertions(+), 20 deletions(-) delete mode 100644 .bumpversion.cfg diff --git a/.bumpversion.cfg b/.bumpversion.cfg deleted file mode 100644 index 51e0082c..00000000 --- a/.bumpversion.cfg +++ /dev/null @@ -1,17 +0,0 @@ -[bumpversion] -current_version = 3.0.5 - -[comment] -comment = The contents of this file cannot be merged with that of setup.cfg until https://github.com/c4urself/bump2version/issues/185 is resolved - -[bumpversion:file:deeprank2/__init__.py] -search = __version__ = "{current_version}" -replace = __version__ = "{new_version}" - -[bumpversion:file:pyproject.toml] -search = version = "{current_version}" -replace = version = "{new_version}" - -[bumpversion:file:CITATION.cff] -search = version: "{current_version}" -replace = version: "{new_version}" diff --git a/pyproject.toml b/pyproject.toml index 881c670f..df3c3b43 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,17 +57,16 @@ dependencies = [ ] [project.optional-dependencies] -# development dependency groups test = [ "pytest >= 7.4.0, < 8.0", - "bump2version >= 1.0.1, < 2.0", + "bump-my-version >= 0.24.2, < 1.0", "coverage >= 6.5.0, < 7.0", "pycodestyle >= 2.8.0, < 3.0", "pytest-cov >= 4.1.0, < 5.0", "pytest-runner >= 6.0.0, < 7.0", "coveralls >= 3.3.1, < 4.0", "ruff == 0.6.3", -] +] # development dependency groups publishing = ["build", "twine", "wheel"] notebooks = 
["nbmake"] From 7b37933b1321c0f4ea56139d75fb5edf1b8206dc Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Tue, 9 Jul 2024 12:01:17 +0200 Subject: [PATCH 53/68] docs: update releasing instructions --- README.dev.md | 66 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/README.dev.md b/README.dev.md index d15731b3..59d1ad1d 100644 --- a/README.dev.md +++ b/README.dev.md @@ -79,10 +79,68 @@ During the development cycle, three main supporting branches are used: ## Making a release -1. Branch from `dev` and prepare the branch for the release (e.g., removing the unnecessary dev files, fix minor bugs if necessary). -2. [Bump the version](https://github.com/DeepRank/deeprank2/blob/dev/README.dev.md#versioning). -3. Merge the release branch into `main` (and `dev`), and [run the tests](https://github.com/DeepRank/deeprank2/blob/dev/README.dev.md#running-the-tests). -4. Go to https://github.com/DeepRank/deeprank2/releases and draft a new release; create a new tag for the release, generate release notes automatically and adjust them, and finally publish the release as latest. This will trigger [a GitHub action](https://github.com/DeepRank/deeprank2/actions/workflows/release.yml) that will take care of publishing the package on PyPi. +### Automated release workflow: + +0. **IMP0RTANT:** Create a PR for the release branch (usually `dev`) and make sure that all checks pass! + - if everything goes well, this PR will automatically be closed after the draft release is created. +1. Navigate to [Draft Github Release](https://github.com/DeepRank/deeprank2/actions/workflows/release_github.yml) + on the [Actions](https://github.com/DeepRank/deeprank2/actions) tab. +2. On the right hand side, you can select the level increase ("patch", "minor", or "major") and which branch to release from. + - [Follow semantic versioning conventions](https://semver.org/) to chose the level increase: + - `patch`: when backward compatible bug fixes were made + - `minor`: when functionality was added in a backward compatible manner + - `major`: when API-incompatible changes have been made + - Note that you cannot release from `main` (the default shown) using the automated workflow. To release from `main` + directly, you must [create the release manually](#manually-create-a-release). +3. Visit [Actions](https://github.com/DeepRank/deeprank2/actions) tab to check whether everything went as expected. + - NOTE: there are two separate jobs in the workflow: "draft_release" and "tidy_workspace". The first creates the draft release on github, while the second merges changes into `dev` and closes the PR. + - If "draft_release" fails, then there are likely merge conflicts with `main` that need to be resolved first. No release draft is created and the "tidy_workspace" job does not run. + - If "draft_release" is succesfull but "tidy_workspace" fails, then there are likely merge conflicts with `dev` that are not conflicts with `main`. In the latter case, the draft release is created and it is up to the user to decide whether to proceed with finalizing the release or first resolving the conflicts with `dev` (this should never happen if the release branch was `dev`, as the only change will be the version bump). + - If both jobs succeed, then the draft release is created and the changes are merged into `dev` without any problems and the associated PR is closed. Also, if the release branch is different from `dev`, then that branch will be deleted from the remote repository. +4. 
Navigate to the [Releases](https://github.com/DeepRank/deeprank2/releases) tab and click on the newest draft + release that was just generated. +5. Click on the edit (pencil) icon on the right side of the draft release. +6. Check/adapt the release notes and make sure that everything is as expected. +7. Check that "Set as the latest release is checked". +8. Click green "Publish Release" button to convert the draft to a published release on GitHub. + - This will automatically trigger [another GitHub workflow](https://github.com/DeepRank/deeprank2/actions/workflows/release.yml) that will take care of publishing the package on PyPi. + +#### Updating the token: + +NOTE: the current token (associated to @DaniBodor) allowing to bypass branch protection will expire on 9 July 2025. To update the token do the following: + +1. [Create a personal access token](https://github.com/settings/tokens/new) from a GitHub user account with admin + priviliges for this repo. +2. Check all the "repo" boxes and the "workflow" box, set an expiration date, and give the token a note. +3. Click green "Generate token" button on the bottom +4. Copy the token immediately, as it will not be visible again later. +5. Navigate to the [secrets settings](https://github.com/DeepRank/deeprank2/settings/secrets/actions). +6. Edit the `GH_RELEASE` key giving your access token as the new value. + +### Manually create a release: + +0. Make sure you have all required developers tools installed `pip install -e .'[test]'`. +1. Create a `release` branch from `main` and merge the changes into this branch. + - Ensure that the `release` branch is ready to be merged back into `main` (e.g., removing the unnecessary files, fix minor bugs if necessary). + - Also see our [branching strategy](#branching-strategy) above. +2. Ensure all tests pass `pytest -v` and that linting (`ruff check`) and formatting (`ruff format --check`) conventions + are adhered to. +3. Bump the version using [bump-my-version](https://github.com/callowayproject/bump-my-version): `bump-my-version bump ` + where level must be one of the following ([following semantic versioning conventions](https://semver.org/)): + - `major`: when API-incompatible changes have been made + - `minor`: when functionality was added in a backward compatible manner + - `patch`: when backward compatible bug fixes were made +4. Merge the release branch into `main` and `dev`. +5. On the [Releases page](https://github.com/DeepRank/deeprank2/releases): + 1. Click "Draft a new release" + 2. By convention, use `v` as both the release title and as a tag for the release. + 3. Click "Generate release notes" to automatically load release notes from merged PRs since the last release. + 4. Adjust the notes as required. + 5. Ensure that "Set as latest release" is checked and that both other boxes are unchecked. + 6. Hit "Publish release". + - This will automatically trigger a [GitHub + workflow](https://github.com/DeepRank/deeprank2/actions/workflows/release.yml) that will take care of publishing + the package on PyPi. 
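
As a concrete illustration of the manual flow above, the shell commands could look roughly as follows. This is only a sketch: the `release-3.1.0` branch name and the `patch` level are placeholders, and it assumes the developer tools from `pip install -e .'[test]'` are already installed.

```bash
# Sketch of a manual release; branch name and bump level are placeholders.
git switch -c release-3.1.0 main

# Checks that must pass before bumping (same conventions as CI):
pytest -v
ruff check
ruff format --check

# Bump the version (patch, minor, or major, following semantic versioning):
bump-my-version bump patch

# Merge the release branch back into main and dev, then draft the GitHub
# release (tagged v<new version>) from the Releases page as described above.
git switch main && git merge release-3.1.0
git switch dev && git merge release-3.1.0
```
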
## UML From 3fc8fd40d0039db8ef0878b207a178511a514f92 Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Tue, 9 Jul 2024 12:01:46 +0200 Subject: [PATCH 54/68] ci: create workflow for automated github release --- .github/workflows/release_github.yml | 118 ++++++++++++++++++ .../{release.yml => release_pypi.yml} | 0 2 files changed, 118 insertions(+) create mode 100644 .github/workflows/release_github.yml rename .github/workflows/{release.yml => release_pypi.yml} (100%) diff --git a/.github/workflows/release_github.yml b/.github/workflows/release_github.yml new file mode 100644 index 00000000..334fe998 --- /dev/null +++ b/.github/workflows/release_github.yml @@ -0,0 +1,118 @@ +name: Draft GitHub Release + +on: + workflow_dispatch: + inputs: + version_level: + description: "Semantic version level increase." + required: true + type: choice + options: + - patch + - minor + - major + +permissions: + contents: write + pull-requests: write + +jobs: + draft_release: + runs-on: "ubuntu-latest" + defaults: + run: + shell: bash -l {0} + + steps: + - name: Fail if main branch was selected + if: ${{ github.ref_name }} == 'main' + run: | + echo "Cannot release from main branch, please select valid release branch." + exit 1 + + - name: Checkout repository + uses: actions/checkout@v4 + with: + token: ${{ secrets.GH_RELEASE }} + + - name: Configure git + run: | + git config user.email "actions@github.com" + git config user.name "GitHub Actions" + git pull + + - name: Merge changes into main + run: | + git switch main + git merge ${{ github.ref_name }} --no-ff --no-commit + git merge --continue + + - name: Bump version + id: bump + run: | + echo "-- install bump-my-version" + python3 -m pip install bump-my-version + echo "-- bump the version" + bump-my-version bump ${{ github.event.inputs.version_level }} --commit --tag + echo "-- push bumped version" + echo "RELEASE_TAG=$(git describe --tags --abbrev=0)" >> $GITHUB_OUTPUT + git push --tags -f + git push + + - name: Create GitHub Release + id: create_release + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh release create ${{ steps.bump.outputs.RELEASE_TAG }} \ + --title="Release ${{ steps.bump.outputs.RELEASE_TAG }}" \ + --generate-notes \ + --draft + + tidy_workspace: + # only run if action above succeeds + needs: draft_release + runs-on: "ubuntu-latest" + defaults: + run: + shell: bash -l {0} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + token: ${{ secrets.GH_RELEASE }} + + - name: Configure git + run: | + git config user.email "actions@github.com" + git config user.name "GitHub Actions" + git pull + + - name: Close PR + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + echo "-- searching for associated PR" + pr_number=$(gh pr list --head ${{ github.ref_name }} --json number --jq '.[0].number') + if [ -n "$pr_number" ]; then + echo "-- closing PR #$pr_number" + gh pr close $pr_number + else + echo "-- no open pull request found for branch $branch_name" + fi + + - name: Merge updates into dev + run: | + git switch dev + git merge origin/main + git push + + - name: Delete release branch other than main or dev + run: | + if [[ ${{ github.ref_name }} != "main" && ${{ github.ref_name }} != "dev" ]]; then + echo "-- deleting branch '${{ github.ref_name }}'" + git push origin -d ${{ github.ref_name }} + else + echo "-- branch '${{ github.ref_name }}' will not be deleted from remote" + fi diff --git a/.github/workflows/release.yml b/.github/workflows/release_pypi.yml similarity index 100% rename from 
.github/workflows/release.yml rename to .github/workflows/release_pypi.yml From bc88f31204b1f39fc31102f5a30ddb8d6120fde4 Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Tue, 9 Jul 2024 17:10:59 +0200 Subject: [PATCH 55/68] ci: add comment about GitHub token --- .github/workflows/release_github.yml | 2 ++ README.dev.md | 1 + 2 files changed, 3 insertions(+) diff --git a/.github/workflows/release_github.yml b/.github/workflows/release_github.yml index 334fe998..583e4d86 100644 --- a/.github/workflows/release_github.yml +++ b/.github/workflows/release_github.yml @@ -33,6 +33,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 with: + # token with admin priviliges to override brach protection on main and dev token: ${{ secrets.GH_RELEASE }} - name: Configure git @@ -81,6 +82,7 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 with: + # token with admin priviliges to override brach protection on main and dev token: ${{ secrets.GH_RELEASE }} - name: Configure git diff --git a/README.dev.md b/README.dev.md index 59d1ad1d..ea1a6bc1 100644 --- a/README.dev.md +++ b/README.dev.md @@ -107,6 +107,7 @@ During the development cycle, three main supporting branches are used: #### Updating the token: +In order for the workflow above to be able to bypass the branch protection on `main` and `dev`, a token with admin priviliges for the current repo is required. Below are instructions on how to create such a token. NOTE: the current token (associated to @DaniBodor) allowing to bypass branch protection will expire on 9 July 2025. To update the token do the following: 1. [Create a personal access token](https://github.com/settings/tokens/new) from a GitHub user account with admin From 2dbb4158735682dd81fb059ef7738d551b11ffba Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Tue, 9 Jul 2024 17:20:04 +0200 Subject: [PATCH 56/68] docs: improve release instructions --- README.dev.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.dev.md b/README.dev.md index ea1a6bc1..bcde45f3 100644 --- a/README.dev.md +++ b/README.dev.md @@ -81,7 +81,7 @@ During the development cycle, three main supporting branches are used: ### Automated release workflow: -0. **IMP0RTANT:** Create a PR for the release branch (usually `dev`) and make sure that all checks pass! +0. **IMP0RTANT:** Create a PR pointing to `main` for the release branch (usually `dev`) and make sure that there are no conflicts and that all checks pass! - if everything goes well, this PR will automatically be closed after the draft release is created. 1. Navigate to [Draft Github Release](https://github.com/DeepRank/deeprank2/actions/workflows/release_github.yml) on the [Actions](https://github.com/DeepRank/deeprank2/actions) tab. @@ -94,9 +94,9 @@ During the development cycle, three main supporting branches are used: directly, you must [create the release manually](#manually-create-a-release). 3. Visit [Actions](https://github.com/DeepRank/deeprank2/actions) tab to check whether everything went as expected. - NOTE: there are two separate jobs in the workflow: "draft_release" and "tidy_workspace". The first creates the draft release on github, while the second merges changes into `dev` and closes the PR. - - If "draft_release" fails, then there are likely merge conflicts with `main` that need to be resolved first. No release draft is created and the "tidy_workspace" job does not run. 
- - If "draft_release" is succesfull but "tidy_workspace" fails, then there are likely merge conflicts with `dev` that are not conflicts with `main`. In the latter case, the draft release is created and it is up to the user to decide whether to proceed with finalizing the release or first resolving the conflicts with `dev` (this should never happen if the release branch was `dev`, as the only change will be the version bump). - - If both jobs succeed, then the draft release is created and the changes are merged into `dev` without any problems and the associated PR is closed. Also, if the release branch is different from `dev`, then that branch will be deleted from the remote repository. + - If "draft_release" fails, then there are likely merge conflicts with `main` that need to be resolved first. No release draft is created and the "tidy_workspace" job does not run. If this action is succesfull, then the release branch (including a version bump) have been merged into the remote `main` branch. + - If "draft_release" is succesfull but "tidy_workspace" fails, then there are likely merge conflicts with `dev` that are not conflicts with `main`. In this case, the draft release is created (and changes were merged into the remote `main`). Conflicts with `dev` need to be resolved with `dev` by the user (note that this should never happen if the release branch was `dev`, as the only change will be the version bump). + - If both jobs succeed, then the draft release is created and the changes are merged into both remote `main` and `dev` without any problems and the associated PR is closed. Also, if the release branch is different from `dev`, then that branch will be deleted from the remote repository. 4. Navigate to the [Releases](https://github.com/DeepRank/deeprank2/releases) tab and click on the newest draft release that was just generated. 5. Click on the edit (pencil) icon on the right side of the draft release. @@ -123,7 +123,7 @@ NOTE: the current token (associated to @DaniBodor) allowing to bypass branch pro 0. Make sure you have all required developers tools installed `pip install -e .'[test]'`. 1. Create a `release` branch from `main` and merge the changes into this branch. - Ensure that the `release` branch is ready to be merged back into `main` (e.g., removing the unnecessary files, fix minor bugs if necessary). - - Also see our [branching strategy](#branching-strategy) above. + - Normally speaking, `release` should contain the changes from `dev` (see our [development workflow](#branching-workflow)), although in some cases a hotfix may be implemented from a different branch. 2. Ensure all tests pass `pytest -v` and that linting (`ruff check`) and formatting (`ruff format --check`) conventions are adhered to. 3. 
Bump the version using [bump-my-version](https://github.com/callowayproject/bump-my-version): `bump-my-version bump ` From 61b0ab09ecae101d2757958e60aad889417a5d7c Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Fri, 12 Jul 2024 17:52:23 +0200 Subject: [PATCH 57/68] explicitly install ruff version in linting workflow --- .github/workflows/linting.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 4dd13612..8cdcd3c7 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -57,6 +57,6 @@ jobs: python3 --version - name: Check linting and formatting using ruff run: | - python3 -m pip install ruff + python3 -m pip install pip install ruff==0.5.1 ruff check || (echo "Please ensure you have the latest version of ruff (`ruff -V`) installed locally." && (exit 1)) ruff format --check || (echo "Please ensure you have the latest version of ruff (`ruff -V`) installed locally." && (exit 1)) From c656ddf5a20d11b0a1ce9ebdd4c8d0b636237875 Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Wed, 17 Jul 2024 13:46:19 +0200 Subject: [PATCH 58/68] ci: fix action --- .github/workflows/release_github.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release_github.yml b/.github/workflows/release_github.yml index 583e4d86..5e8ec4c3 100644 --- a/.github/workflows/release_github.yml +++ b/.github/workflows/release_github.yml @@ -25,7 +25,7 @@ jobs: steps: - name: Fail if main branch was selected - if: ${{ github.ref_name }} == 'main' + if: ${{ github.ref_name == 'main' }} run: | echo "Cannot release from main branch, please select valid release branch." exit 1 @@ -35,6 +35,8 @@ jobs: with: # token with admin priviliges to override brach protection on main and dev token: ${{ secrets.GH_RELEASE }} + ref: main + fetch-depth: 0 - name: Configure git run: | @@ -45,8 +47,8 @@ jobs: - name: Merge changes into main run: | git switch main - git merge ${{ github.ref_name }} --no-ff --no-commit - git merge --continue + git merge origin/${{ github.ref_name }} --no-ff --no-commit + git commit --no-edit - name: Bump version id: bump @@ -84,6 +86,7 @@ jobs: with: # token with admin priviliges to override brach protection on main and dev token: ${{ secrets.GH_RELEASE }} + fetch-depth: 0 - name: Configure git run: | From 9a293188b965f1575442447f6aec7cecbb9ec4e8 Mon Sep 17 00:00:00 2001 From: DaniBodor Date: Mon, 5 Aug 2024 15:40:52 +0200 Subject: [PATCH 59/68] ci: fix linting action --- .github/workflows/linting.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 8cdcd3c7..b870e6e4 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -57,6 +57,6 @@ jobs: python3 --version - name: Check linting and formatting using ruff run: | - python3 -m pip install pip install ruff==0.5.1 + python3 -m pip install ruff==0.5.1 ruff check || (echo "Please ensure you have the latest version of ruff (`ruff -V`) installed locally." && (exit 1)) ruff format --check || (echo "Please ensure you have the latest version of ruff (`ruff -V`) installed locally." 
&& (exit 1)) From f1d829c810b42b638ed75e0e504e46c51267c1e4 Mon Sep 17 00:00:00 2001 From: DaniBodor Date: Fri, 6 Sep 2024 10:01:22 +0200 Subject: [PATCH 60/68] ci: move bumpversion settings to separate toml --- .bumpversion.toml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 .bumpversion.toml diff --git a/.bumpversion.toml b/.bumpversion.toml new file mode 100644 index 00000000..b527b8dd --- /dev/null +++ b/.bumpversion.toml @@ -0,0 +1,17 @@ +[tool.bumpversion] +current_version = "3.0.5" + +[[tool.bumpversion.files]] +filename = "pyproject.toml" +search = 'version = "{current_version}"' +replace = 'version = "{new_version}"' + +[[tool.bumpversion.files]] +filename = "CITATION.cff" +search = 'version: "{current_version}"' +replace = 'version: "{new_version}"' + +[[tool.bumpversion.files]] +filename = "deeprank2/__init__.py" +search = '__version__ = "{current_version}"' +replace = '__version__ = "{new_version}"' From ddbd99b1e896fed5ff7088e6fde70cf022ce4a78 Mon Sep 17 00:00:00 2001 From: DaniBodor Date: Fri, 6 Sep 2024 11:13:17 +0200 Subject: [PATCH 61/68] docs: update instructions for manual release --- README.dev.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/README.dev.md b/README.dev.md index bcde45f3..ef0ef550 100644 --- a/README.dev.md +++ b/README.dev.md @@ -121,11 +121,8 @@ NOTE: the current token (associated to @DaniBodor) allowing to bypass branch pro ### Manually create a release: 0. Make sure you have all required developers tools installed `pip install -e .'[test]'`. -1. Create a `release` branch from `main` and merge the changes into this branch. - - Ensure that the `release` branch is ready to be merged back into `main` (e.g., removing the unnecessary files, fix minor bugs if necessary). - - Normally speaking, `release` should contain the changes from `dev` (see our [development workflow](#branching-workflow)), although in some cases a hotfix may be implemented from a different branch. -2. Ensure all tests pass `pytest -v` and that linting (`ruff check`) and formatting (`ruff format --check`) conventions - are adhered to. +1. Create a `release-` branch from `main` (if there has been an hotfix) or `dev` (regular new production release). +2. Prepare the branch for the release (e.g., removing the unnecessary dev files, fix minor bugs if necessary). Do this by ensuring all tests pass `pytest -v` and that linting (`ruff check`) and formatting (`ruff format --check`) conventions are adhered to. 3. 
Bump the version using [bump-my-version](https://github.com/callowayproject/bump-my-version): `bump-my-version bump ` where level must be one of the following ([following semantic versioning conventions](https://semver.org/)): - `major`: when API-incompatible changes have been made From c3e884976aa11dd5317588610241ddd1fe5a95cf Mon Sep 17 00:00:00 2001 From: DaniBodor Date: Fri, 6 Sep 2024 11:22:10 +0200 Subject: [PATCH 62/68] ci: do not allow releases from dev branch --- .github/workflows/release_github.yml | 6 +++--- README.dev.md | 22 +++++++++++----------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/release_github.yml b/.github/workflows/release_github.yml index 5e8ec4c3..f95965b1 100644 --- a/.github/workflows/release_github.yml +++ b/.github/workflows/release_github.yml @@ -24,10 +24,10 @@ jobs: shell: bash -l {0} steps: - - name: Fail if main branch was selected - if: ${{ github.ref_name == 'main' }} + - name: Ensure that permitted release branch was selected + if: ${{ github.ref_name == 'main' }} || ${{ github.ref_name == 'dev' }} run: | - echo "Cannot release from main branch, please select valid release branch." + echo "Releasing from main or dev branch is not permitted, please select a valid release branch." exit 1 - name: Checkout repository diff --git a/README.dev.md b/README.dev.md index ef0ef550..4d703fbf 100644 --- a/README.dev.md +++ b/README.dev.md @@ -81,28 +81,28 @@ During the development cycle, three main supporting branches are used: ### Automated release workflow: -0. **IMP0RTANT:** Create a PR pointing to `main` for the release branch (usually `dev`) and make sure that there are no conflicts and that all checks pass! +1. **IMP0RTANT:** Create a PR pointing to `main` for the release branch and make sure that there are no conflicts and that all checks pass. - if everything goes well, this PR will automatically be closed after the draft release is created. -1. Navigate to [Draft Github Release](https://github.com/DeepRank/deeprank2/actions/workflows/release_github.yml) +2. Navigate to [Draft Github Release](https://github.com/DeepRank/deeprank2/actions/workflows/release_github.yml) on the [Actions](https://github.com/DeepRank/deeprank2/actions) tab. -2. On the right hand side, you can select the level increase ("patch", "minor", or "major") and which branch to release from. +3. On the right hand side, you can select the level increase ("patch", "minor", or "major") and which branch to release from. - [Follow semantic versioning conventions](https://semver.org/) to chose the level increase: - `patch`: when backward compatible bug fixes were made - `minor`: when functionality was added in a backward compatible manner - `major`: when API-incompatible changes have been made - Note that you cannot release from `main` (the default shown) using the automated workflow. To release from `main` directly, you must [create the release manually](#manually-create-a-release). -3. Visit [Actions](https://github.com/DeepRank/deeprank2/actions) tab to check whether everything went as expected. +4. Visit [Actions](https://github.com/DeepRank/deeprank2/actions) tab to check whether everything went as expected. - NOTE: there are two separate jobs in the workflow: "draft_release" and "tidy_workspace". The first creates the draft release on github, while the second merges changes into `dev` and closes the PR. - - If "draft_release" fails, then there are likely merge conflicts with `main` that need to be resolved first. 
No release draft is created and the "tidy_workspace" job does not run. If this action is succesfull, then the release branch (including a version bump) have been merged into the remote `main` branch. + - If "draft_release" fails, then there are likely merge conflicts with `main` that need to be resolved first. No release draft is created and the "tidy_workspace" job does not run. Coversely, if this action is succesfull, then the release branch (including a version bump) have been merged into the remote `main` branch. - If "draft_release" is succesfull but "tidy_workspace" fails, then there are likely merge conflicts with `dev` that are not conflicts with `main`. In this case, the draft release is created (and changes were merged into the remote `main`). Conflicts with `dev` need to be resolved with `dev` by the user (note that this should never happen if the release branch was `dev`, as the only change will be the version bump). - - If both jobs succeed, then the draft release is created and the changes are merged into both remote `main` and `dev` without any problems and the associated PR is closed. Also, if the release branch is different from `dev`, then that branch will be deleted from the remote repository. -4. Navigate to the [Releases](https://github.com/DeepRank/deeprank2/releases) tab and click on the newest draft + - If both jobs succeed, then the draft release is created and the changes are merged into both remote `main` and `dev` without any problems and the associated PR is closed. Also, the release branch is deleted from the remote repository. +5. Navigate to the [Releases](https://github.com/DeepRank/deeprank2/releases) tab and click on the newest draft release that was just generated. -5. Click on the edit (pencil) icon on the right side of the draft release. -6. Check/adapt the release notes and make sure that everything is as expected. -7. Check that "Set as the latest release is checked". -8. Click green "Publish Release" button to convert the draft to a published release on GitHub. +6. Click on the edit (pencil) icon on the right side of the draft release. +7. Check/adapt the release notes and make sure that everything is as expected. +8. Check that "Set as the latest release is checked". +9. Click green "Publish Release" button to convert the draft to a published release on GitHub. - This will automatically trigger [another GitHub workflow](https://github.com/DeepRank/deeprank2/actions/workflows/release.yml) that will take care of publishing the package on PyPi. 
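
As an alternative to navigating the Actions tab in steps 2–3, the workflow can also be dispatched from the command line. The following is a sketch that assumes the GitHub CLI (`gh`) is installed and authenticated; the branch name is a placeholder.

```bash
# Trigger the "Draft GitHub Release" workflow with a chosen version level.
# "my-release-branch" is a placeholder and must not be main or dev.
gh workflow run release_github.yml --repo DeepRank/deeprank2 --ref my-release-branch -f version_level=patch

# Follow the run that was just started:
gh run watch --repo DeepRank/deeprank2
```
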
#### Updating the token: From 604650ba1a07580575da54d396f2e3313caed9f0 Mon Sep 17 00:00:00 2001 From: DaniBodor Date: Fri, 6 Sep 2024 11:41:13 +0200 Subject: [PATCH 63/68] ci: ensure that job fails if a step fails --- .github/workflows/release_github.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/release_github.yml b/.github/workflows/release_github.yml index f95965b1..c8664ee8 100644 --- a/.github/workflows/release_github.yml +++ b/.github/workflows/release_github.yml @@ -22,6 +22,8 @@ jobs: defaults: run: shell: bash -l {0} + strategy: + fail-fast: true steps: - name: Ensure that permitted release branch was selected From f9e536306d3063e4a88a35e2d4fd97e9f80813f9 Mon Sep 17 00:00:00 2001 From: Dani Bodor Date: Fri, 6 Sep 2024 13:43:44 +0200 Subject: [PATCH 64/68] docs: Update README.dev.md Co-authored-by: Giulia Crocioni <55382553+gcroci2@users.noreply.github.com> --- README.dev.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.dev.md b/README.dev.md index 4d703fbf..4d3b19f5 100644 --- a/README.dev.md +++ b/README.dev.md @@ -81,7 +81,7 @@ During the development cycle, three main supporting branches are used: ### Automated release workflow: -1. **IMP0RTANT:** Create a PR pointing to `main` for the release branch and make sure that there are no conflicts and that all checks pass. +1. **IMP0RTANT:** Create a PR for the release branch, targeting the `main` branch. Ensure there are no conflicts and that all checks pass successfully. Release branches are typically: traditional [release branches](https://nvie.com/posts/a-successful-git-branching-model/#release-branches) (these are created from the `dev` branch), or [hotfix branches](https://nvie.com/posts/a-successful-git-branching-model/#hotfix-branches) (these are created directly from the `main` branch). - if everything goes well, this PR will automatically be closed after the draft release is created. 2. Navigate to [Draft Github Release](https://github.com/DeepRank/deeprank2/actions/workflows/release_github.yml) on the [Actions](https://github.com/DeepRank/deeprank2/actions) tab. @@ -95,7 +95,7 @@ During the development cycle, three main supporting branches are used: 4. Visit [Actions](https://github.com/DeepRank/deeprank2/actions) tab to check whether everything went as expected. - NOTE: there are two separate jobs in the workflow: "draft_release" and "tidy_workspace". The first creates the draft release on github, while the second merges changes into `dev` and closes the PR. - If "draft_release" fails, then there are likely merge conflicts with `main` that need to be resolved first. No release draft is created and the "tidy_workspace" job does not run. Coversely, if this action is succesfull, then the release branch (including a version bump) have been merged into the remote `main` branch. - - If "draft_release" is succesfull but "tidy_workspace" fails, then there are likely merge conflicts with `dev` that are not conflicts with `main`. In this case, the draft release is created (and changes were merged into the remote `main`). Conflicts with `dev` need to be resolved with `dev` by the user (note that this should never happen if the release branch was `dev`, as the only change will be the version bump). + - If "draft_release" is succesfull but "tidy_workspace" fails, then there are likely merge conflicts with `dev` that are not conflicts with `main`. In this case, the draft release is created (and changes were merged into the remote `main`). 
Conflicts with `dev` need to be resolved with `dev` by the user. - If both jobs succeed, then the draft release is created and the changes are merged into both remote `main` and `dev` without any problems and the associated PR is closed. Also, the release branch is deleted from the remote repository. 5. Navigate to the [Releases](https://github.com/DeepRank/deeprank2/releases) tab and click on the newest draft release that was just generated. From f8a7fbc7e6f613d720fae08ea7c7c4ca37e80988 Mon Sep 17 00:00:00 2001 From: DaniBodor Date: Fri, 6 Sep 2024 16:01:35 +0200 Subject: [PATCH 65/68] ci: error message if token expired --- .github/workflows/release_github.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/release_github.yml b/.github/workflows/release_github.yml index c8664ee8..d5a8894a 100644 --- a/.github/workflows/release_github.yml +++ b/.github/workflows/release_github.yml @@ -32,6 +32,18 @@ jobs: echo "Releasing from main or dev branch is not permitted, please select a valid release branch." exit 1 + - name: Check GitHub Token Validity + run: | + echo "-- Validating GitHub Token" + status_code=$(curl -o /dev/null -s -w "%{http_code}" -H "Authorization: token ${{ secrets.GH_RELEASE }}" https://api.github.com/user) + if [ "$status_code" -ne 200 ]; then + echo "Error: GitHub token is invalid or expired. Please update your token in secrets." + echo "Instructions can be found at: https://github.com/DeepRank/deeprank2/blob/main/README.dev.md#updating-the-token" + exit 1 + else + echo "GitHub token is valid." + fi + - name: Checkout repository uses: actions/checkout@v4 with: From 3472459838df681802b7c7733c1343f680ad741a Mon Sep 17 00:00:00 2001 From: DaniBodor Date: Fri, 6 Sep 2024 19:00:05 +0200 Subject: [PATCH 66/68] ci: fix erroneous branch checking --- .github/workflows/release_github.yml | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/.github/workflows/release_github.yml b/.github/workflows/release_github.yml index d5a8894a..001dfc4f 100644 --- a/.github/workflows/release_github.yml +++ b/.github/workflows/release_github.yml @@ -26,10 +26,16 @@ jobs: fail-fast: true steps: + - name: Display selection + run: | + echo "Branch selected: '${{ github.ref_name }}'" + echo "Release level selected: '${{ github.event.inputs.version_level }}'" + - name: Ensure that permitted release branch was selected - if: ${{ github.ref_name == 'main' }} || ${{ github.ref_name == 'dev' }} + if: ${{ github.ref_name == 'main' || github.ref_name == 'dev' }} run: | - echo "Releasing from main or dev branch is not permitted, please select a valid release branch." + echo "Branch selected: '${{ github.ref_name }}'" + echo "Releasing from main or dev branch is not permitted, please select a different release branch." 
exit 1 - name: Check GitHub Token Validity @@ -129,9 +135,5 @@ jobs: - name: Delete release branch other than main or dev run: | - if [[ ${{ github.ref_name }} != "main" && ${{ github.ref_name }} != "dev" ]]; then - echo "-- deleting branch '${{ github.ref_name }}'" - git push origin -d ${{ github.ref_name }} - else - echo "-- branch '${{ github.ref_name }}' will not be deleted from remote" - fi + echo "-- deleting branch '${{ github.ref_name }}'" + git push origin -d ${{ github.ref_name }} From d65507ba71f963dc2ff62ad4584d4e43d65441ed Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Fri, 13 Sep 2024 18:54:19 +0200 Subject: [PATCH 67/68] add suggestions --- .github/workflows/release.yml | 8 ++++---- README.md | 2 +- docs/source/installation.md | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index cce554ce..7639baa2 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -64,7 +64,7 @@ jobs: user: __token__ password: ${{ secrets.PYPI_TOKEN_DEEPRANK2 }} - read_version: + read_only_version: needs: upload_pypi name: Read version from TOML runs-on: ubuntu-22.04 @@ -88,13 +88,13 @@ jobs: echo "REPO_LOWERCASE=$REPO_LOWERCASE" >> $GITHUB_OUTPUT upload_docker_image: - needs: read_version + needs: read_only_version name: Upload Docker image to ghcr.io uses: ./.github/workflows/_ghcr.yml with: ghcr_user: ${{github.actor}} - base_image_name: ghcr.io/${{ needs.read_version.outputs.repo_lowercase }} - image_tag: ${{ needs.read_version.outputs.version }} + base_image_name: ghcr.io/${{ needs.read_only_version.outputs.repo_lowercase }} + image_tag: ${{ needs.read_only_version.outputs.version }} dockerfile: ./Dockerfile docker_context: . secrets: diff --git a/README.md b/README.md index 8042303a..49e028af 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ docker run -p 8888:8888 deeprank2 #### Removing the Docker Image -If you no longer need the Docker image (which can be quite large), you can remove it after stopping the container. Follow the [container stop instructions](https://docs.docker.com/engine/reference/commandline/stop/) and [remove the image](https://docs.docker.com/engine/reference/commandline/image_rm/). For more general information on Docker, refer to the [official Docker documentation](https://docs.docker.com/get-started/). +If you no longer need the Docker image (which can be quite large), you can remove it after stopping the container. Follow the [container stop](https://docs.docker.com/engine/reference/commandline/stop/) and [remove the image](https://docs.docker.com/engine/reference/commandline/image_rm/) instructions. For more general information on Docker, refer to the [Docker documentation](https://docs.docker.com/get-started/) directly. ### Local/remote Installation diff --git a/docs/source/installation.md b/docs/source/installation.md index 904bb3ff..9a0f22ea 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -55,7 +55,7 @@ docker run -p 8888:8888 deeprank2 ### Removing the Docker Image -If you no longer need the Docker image (which can be quite large), you can remove it after stopping the container. Follow the [container stop instructions](https://docs.docker.com/engine/reference/commandline/stop/) and [remove the image](https://docs.docker.com/engine/reference/commandline/image_rm/). For more general information on Docker, refer to the [official Docker documentation](https://docs.docker.com/get-started/). 
+If you no longer need the Docker image (which can be quite large), you can remove it after stopping the container. Follow the [container stop](https://docs.docker.com/engine/reference/commandline/stop/) and [remove the image](https://docs.docker.com/engine/reference/commandline/image_rm/) instructions. For more general information on Docker, refer to the [Docker documentation](https://docs.docker.com/get-started/) directly. ## Local/remote Installation From e8e31f396ffa8544f0278177d3c01da8e68d1e28 Mon Sep 17 00:00:00 2001 From: gcroci2 Date: Mon, 16 Sep 2024 17:15:36 +0200 Subject: [PATCH 68/68] bump minor version --- .bumpversion.toml | 2 +- CITATION.cff | 2 +- deeprank2/__init__.py | 2 +- pyproject.toml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.bumpversion.toml b/.bumpversion.toml index b527b8dd..0c38f8c3 100644 --- a/.bumpversion.toml +++ b/.bumpversion.toml @@ -1,5 +1,5 @@ [tool.bumpversion] -current_version = "3.0.5" +current_version = "3.1.0" [[tool.bumpversion.files]] filename = "pyproject.toml" diff --git a/CITATION.cff b/CITATION.cff index e6db61e8..c3d693e7 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -86,4 +86,4 @@ preferred-citation: volume: 9 title: "DeepRank2: Mining 3D Protein Structures with Geometric Deep Learning" -version: "3.0.5" +version: "3.1.0" diff --git a/deeprank2/__init__.py b/deeprank2/__init__.py index e94f36fe..f5f41e56 100644 --- a/deeprank2/__init__.py +++ b/deeprank2/__init__.py @@ -1 +1 @@ -__version__ = "3.0.5" +__version__ = "3.1.0" diff --git a/pyproject.toml b/pyproject.toml index df3c3b43..31c2956c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "deeprank2" -version = "3.0.5" +version = "3.1.0" description = "DeepRank2 is an open-source deep learning framework for data mining of protein-protein interfaces or single-residue missense variants." readme = "README.md" requires-python = ">=3.10"
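
After a bump like the one in this last patch, a quick way to confirm that every file managed by `.bumpversion.toml` agrees on the new version is a plain grep; a minimal sketch, run from the repository root:

```bash
# All four lines should report the same new version (here 3.1.0).
grep -H '^current_version' .bumpversion.toml
grep -H '^version = ' pyproject.toml
grep -H '^version:' CITATION.cff
grep -H '__version__' deeprank2/__init__.py
```
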