From 975f9f177f780bc6223f3538b0e974a7ba1173d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Enol=20Fern=C3=A1ndez?= Date: Wed, 17 Jul 2024 08:13:13 +0100 Subject: [PATCH] Image sync (#350) * Minor fix to dockerfile * Reorganise deployment Use different dirs for each component with the terraform specifics, try to have common parts without duplication * Image syncing * Improve deployment * Make sure we have CAs in the right place * Adjust device * Add the images to the schema * Add image sync to WALTON * Improved role * Ignore hadolint issue * New ADR for image sync * Fine tune deployment * Update README * Debug * Fix deployment call * Do not debug here * Remove site configuration that we are not using * Improve text * Moved repo to EGI-Federation * Improve building image --- .github/workflows/deploy-cloud-info.yml | 18 ++ .github/workflows/deploy-image-sync.yml | 19 ++ .github/workflows/deploy.yml | 41 +++-- README.md | 35 ++-- cloud-info/Dockerfile | 2 +- deploy/CESNET-MCC.tfvars | 8 - deploy/{ => cloud-info}/NCG-INGRID-PT.tfvars | 0 deploy/{ => cloud-info}/backend.tf | 0 deploy/cloud-info/extra-cloud-init.yaml | 0 deploy/{ => cloud-info}/main.tf | 0 deploy/{ => cloud-info}/vars.tf | 0 deploy/{ => cloud-info}/versions.tf | 0 deploy/cloud-init.yaml | 3 +- deploy/deploy.sh | 18 +- deploy/image-sync/NCG-INGRID-PT.tfvars | 8 + deploy/image-sync/backend.tf | 12 ++ deploy/image-sync/extra-cloud-init.yaml | 13 ++ deploy/image-sync/main.tf | 26 +++ deploy/image-sync/vars.tf | 14 ++ deploy/image-sync/versions.tf | 9 + deploy/playbook.yaml | 2 - deploy/roles/catchall/defaults/main.yaml | 21 ++- .../catchall/molecule/default/converge.yml | 2 + .../molecule/default/tests/test_default.py | 1 + deploy/roles/catchall/tasks/cloud-info.yml | 14 ++ deploy/roles/catchall/tasks/main.yml | 63 ++++--- deploy/roles/catchall/tasks/sync.yml | 35 ++++ .../catchall/templates/cloud-info.env.j2 | 2 +- deploy/roles/catchall/templates/sync.conf.j2 | 7 + .../decisions/0003-image-synching.md | 48 +++++ 
image-sync/Dockerfile | 48 +++++ image-sync/image_sync/__init__.py | 0 image-sync/image_sync/sync.py | 174 ++++++++++++++++++ image-sync/pyproject.toml | 24 +++ image-sync/requirements.txt | 4 + schema.json | 36 +++- sites/WALTON-CLOUD.yaml | 5 + 37 files changed, 624 insertions(+), 88 deletions(-) create mode 100644 .github/workflows/deploy-cloud-info.yml create mode 100644 .github/workflows/deploy-image-sync.yml delete mode 100644 deploy/CESNET-MCC.tfvars rename deploy/{ => cloud-info}/NCG-INGRID-PT.tfvars (100%) rename deploy/{ => cloud-info}/backend.tf (100%) create mode 100644 deploy/cloud-info/extra-cloud-init.yaml rename deploy/{ => cloud-info}/main.tf (100%) rename deploy/{ => cloud-info}/vars.tf (100%) rename deploy/{ => cloud-info}/versions.tf (100%) create mode 100644 deploy/image-sync/NCG-INGRID-PT.tfvars create mode 100644 deploy/image-sync/backend.tf create mode 100644 deploy/image-sync/extra-cloud-init.yaml create mode 100644 deploy/image-sync/main.tf create mode 100644 deploy/image-sync/vars.tf create mode 100644 deploy/image-sync/versions.tf create mode 100644 deploy/roles/catchall/tasks/sync.yml create mode 100644 deploy/roles/catchall/templates/sync.conf.j2 create mode 100644 doc/architecture/decisions/0003-image-synching.md create mode 100644 image-sync/Dockerfile create mode 100644 image-sync/image_sync/__init__.py create mode 100644 image-sync/image_sync/sync.py create mode 100644 image-sync/pyproject.toml create mode 100644 image-sync/requirements.txt diff --git a/.github/workflows/deploy-cloud-info.yml b/.github/workflows/deploy-cloud-info.yml new file mode 100644 index 00000000..3811a1ac --- /dev/null +++ b/.github/workflows/deploy-cloud-info.yml @@ -0,0 +1,18 @@ +--- +name: 'Deploy cloud-info' + +on: + push: + branches: + - main + pull_request: + paths: + - "deploy/**" + +jobs: + deploy: + uses: ./.github/workflows/deploy.yml + with: + dir: "deploy/cloud-info" + tags: "cloud-info,docker" + secrets: inherit diff --git 
a/.github/workflows/deploy-image-sync.yml b/.github/workflows/deploy-image-sync.yml new file mode 100644 index 00000000..70d90042 --- /dev/null +++ b/.github/workflows/deploy-image-sync.yml @@ -0,0 +1,19 @@ +--- +name: 'Deploy image sync' + +on: + push: + branches: + - main + pull_request: + paths: + - "deploy/**" + +jobs: + deploy: + uses: ./.github/workflows/deploy.yml + with: + dir: "deploy/image-sync" + tags: "docker,image-sync" + secrets: inherit + diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 58c205e5..7aff92d4 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -1,13 +1,15 @@ --- -name: 'Deploy' +name: Deploy on: - push: - branches: - - main - pull_request: - paths: - - "deploy/**" + workflow_call: + inputs: + dir: + required: true + type: string + tags: + required: true + type: string jobs: terraform: @@ -38,7 +40,8 @@ jobs: -d "grant_type=refresh_token&client_id=token-portal&scope=$SCOPE&refresh_token=$REFRESH_TOKEN" \ | jq -r ".access_token") echo "::add-mask::$OIDC_TOKEN" - cd deploy + cd "${{ inputs.dir }}" + cp ../clouds.yaml . 
BACKEND_SITE="$(yq -r .clouds.backend.site clouds.yaml)" BACKEND_VO="$(yq -r .clouds.backend.vo clouds.yaml)" EGI_SITE="$(yq -r .clouds.deploy.site clouds.yaml)" @@ -67,19 +70,21 @@ jobs: - name: Terraform Format id: fmt run: | - cd deploy + cd "${{ inputs.dir }}" terraform fmt -check - name: Terraform init id: init run: | - cd deploy + cd "${{ inputs.dir }}" terraform init - name: Adjust cloud-init file env: SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} run: | - cd deploy + cd "${{ inputs.dir }}" + cat ../cloud-init.yaml extra-cloud-init.yaml > cloud-init.yaml sed -i -e "s/%TOKEN%/${{ secrets.GITHUB_TOKEN }}/" cloud-init.yaml + sed -i -e "s/%TAGS%/${{ inputs.tags }}/" cloud-init.yaml sed -i -e "s/%REF%/${{ github.sha }}/" cloud-init.yaml sed -i -e "s/%SHORT_REF%/$(git rev-parse --short HEAD)/" cloud-init.yaml sed -i -e "s#%SLACK_WEBHOOK_URL%#$SLACK_WEBHOOK_URL#" cloud-init.yaml @@ -89,7 +94,7 @@ jobs: id: plan if: github.event_name == 'pull_request' run: | - cd deploy + cd "${{ inputs.dir }}" terraform plan -no-color -var-file="$EGI_SITE.tfvars" continue-on-error: true - name: Update Pull Request @@ -125,13 +130,13 @@ jobs: id: terraform-apply if: github.ref == 'refs/heads/main' && github.event_name == 'push' run: | - cd deploy + cd "${{ inputs.dir }}" terraform apply -auto-approve -var-file="$EGI_SITE.tfvars" - name: Get VM ID id: terraform-vm-id if: github.ref == 'refs/heads/main' && github.event_name == 'push' run: | - cd deploy + cd "${{ inputs.dir }}" terraform output -raw instance-id - name: Re-configure providers access env: @@ -145,8 +150,8 @@ jobs: -d "grant_type=refresh_token&refresh_token=$REFRESH_TOKEN&client_id=token-portal&scope=$SCOPE" \ | jq -r ".access_token") echo "::add-mask::$OIDC_TOKEN" - cd deploy - git checkout -- clouds.yaml + cd "${{ inputs.dir }}" + cp ../clouds.yaml . 
BACKEND_SITE="$(yq -r .clouds.backend.site clouds.yaml)" BACKEND_VO="$(yq -r .clouds.backend.vo clouds.yaml)" BACKEND_OS_TOKEN="$(fedcloud openstack token issue --oidc-access-token "$OIDC_TOKEN" \ @@ -164,13 +169,13 @@ jobs: max_attempts: 20 retry_wait_seconds: 40 command: > - pushd deploy && + pushd "${{ inputs.dir }}" && openstack --os-cloud backend --os-token "$BACKEND_OS_TOKEN" object save fedcloud-catchall "${{ steps.terraform-vm-id.outputs.stdout }}" && openstack --os-cloud backend --os-token "$BACKEND_OS_TOKEN" object delete fedcloud-catchall "${{ steps.terraform-vm-id.outputs.stdout }}" - name: Look for errors if: github.ref == 'refs/heads/main' && github.event_name == 'push' run: | - cd deploy + cd "${{ inputs.dir }}" # show the status in the build log cat "${{ steps.terraform-vm-id.outputs.stdout }}" grep -v "error" "${{ steps.terraform-vm-id.outputs.stdout }}" diff --git a/README.md b/README.md index 371603d1..982a7433 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,16 @@ its configuration with a format as follows: ```yaml gocdb: endpoint: +# optional: use central image sync +images: + # true, get sync, false do not + sync: true + # a list of supported formats of the site can be specified + # if not available, no conversion will be done, so whatever format + # is available in AppDB will be used + formats: + - qcow2 + - raw # optionally specify a protocol for the Keystone V3 federation API protocol: openid | oidc (default is openid) # optionally specify a region name if using different regions @@ -26,20 +36,6 @@ vos: publicNetwork: ``` -## Generating configurations - -The mapping configuration of the VOs supported at each site can be easily -generated with the `generate-config.py` utility (requires `pyyaml`). 
It takes as -parameter the YAML file describing the site and will dump the requested -keystone, caso or cloudkeeper-os json config: - -```shell -python generate-config.py --config-type keystone sites/SITE.yaml -``` - -This mapping file should work for most cases. If you have special requirements -open an issue so we can tune the generation to meet your needs! - ## Docker containers Components are run as docker containers, which if not available upstream, are @@ -47,9 +43,10 @@ generated in this repository. ## Deployment -Deployment is managed on a separate private repository that includes several -secrets. Deployment is done with ansible using a -[dedicated role](./deploy/roles/catchall) with: +Deployment is managed with GitHub actions, there is a VM for the +cloud-info-provider and one VM for the image sync. Check the [deploy](./deploy) +directory for details. Configuration is done with ansible using a +[dedicated role](./deploy/roles/catchall): ```sh ansible-playbook -i inventory.yaml --extra-vars "@secrets.yaml" playbook.yaml @@ -60,5 +57,5 @@ where: - `inventory.yaml` contains the ansible inventory with the host to configure - `secrets.yaml` contains the credentials for every configured VO and a valid token for the AMS -- `playbook.yaml` is an ansible playbook that just uses the - `fedcloud-catchall-ops` role to configure the host +- `playbook.yaml` is an ansible playbook that just uses the `catchall` role to + configure the host diff --git a/cloud-info/Dockerfile b/cloud-info/Dockerfile index d66c6c8c..1bea3bdc 100644 --- a/cloud-info/Dockerfile +++ b/cloud-info/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3 as build +FROM python:3 AS build SHELL ["/bin/bash", "-o", "pipefail", "-c"] diff --git a/deploy/CESNET-MCC.tfvars b/deploy/CESNET-MCC.tfvars deleted file mode 100644 index 19eeb820..00000000 --- a/deploy/CESNET-MCC.tfvars +++ /dev/null @@ -1,8 +0,0 @@ -# Network -net_id = "5b8fbb48-461d-4907-ac79-7f7b2ebd17f9" - -# Flavor: standard.medium 2cores/4GB RAM 
-flavor_id = "4c153ce3-a163-4668-baa7-2cbcb57e2dd8" - -# Image: ubuntu bionic -image_id = "e8d75fc1-ac32-4851-90b5-b4c925e9e6f8" diff --git a/deploy/NCG-INGRID-PT.tfvars b/deploy/cloud-info/NCG-INGRID-PT.tfvars similarity index 100% rename from deploy/NCG-INGRID-PT.tfvars rename to deploy/cloud-info/NCG-INGRID-PT.tfvars diff --git a/deploy/backend.tf b/deploy/cloud-info/backend.tf similarity index 100% rename from deploy/backend.tf rename to deploy/cloud-info/backend.tf diff --git a/deploy/cloud-info/extra-cloud-init.yaml b/deploy/cloud-info/extra-cloud-init.yaml new file mode 100644 index 00000000..e69de29b diff --git a/deploy/main.tf b/deploy/cloud-info/main.tf similarity index 100% rename from deploy/main.tf rename to deploy/cloud-info/main.tf diff --git a/deploy/vars.tf b/deploy/cloud-info/vars.tf similarity index 100% rename from deploy/vars.tf rename to deploy/cloud-info/vars.tf diff --git a/deploy/versions.tf b/deploy/cloud-info/versions.tf similarity index 100% rename from deploy/versions.tf rename to deploy/cloud-info/versions.tf diff --git a/deploy/cloud-init.yaml b/deploy/cloud-init.yaml index 8d79eea9..96fe581d 100644 --- a/deploy/cloud-init.yaml +++ b/deploy/cloud-init.yaml @@ -36,6 +36,7 @@ write_files: COMMIT_SHA="%REF%" SHORT_COMMIT_SHA="%SHORT_REF%" FEDCLOUD_LOCKER_TOKEN="%FEDCLOUD_LOCKER_TOKEN%" + TAGS="%TAGS%" # get the repo code and untar at cwd curl -L -H "Accept: application/vnd.github.v3+raw" \ @@ -43,7 +44,7 @@ write_files: tar xz --strip=1 cd deploy ./deploy.sh "$OAUTH_TOKEN" "$COMMIT_SHA" "$FEDCLOUD_LOCKER_TOKEN" \ - "$SHORT_COMMIT_SHA" "$SLACK_WEBHOOK_URL" + "$TAGS" "$SHORT_COMMIT_SHA" "$SLACK_WEBHOOK_URL" path: /var/lib/cloud/scripts/per-boot/deploy.sh permissions: '0755' - content: | diff --git a/deploy/deploy.sh b/deploy/deploy.sh index 159be470..4c9545f0 100755 --- a/deploy/deploy.sh +++ b/deploy/deploy.sh @@ -4,6 +4,7 @@ # - a GitHub OAUTH_TOKEN to update the PR # - the COMMIT_SHA # - a locker for fedcloud secret to obtain the 
secrets +# - tags for the ansible configuration # - the SHORT_SHA used for pulling the docker image to use # - a SLACK_WEBHOOK_URL to report on the status set -e @@ -11,22 +12,29 @@ set -e OAUTH_TOKEN="$1" COMMIT_SHA="$2" FEDCLOUD_SECRET_LOCKER="$3" -SHORT_SHA="$4" -SLACK_WEBHOOK_URL="$5" +TAGS="$4" +SHORT_SHA="$5" +SLACK_WEBHOOK_URL="$6" # create a virtual env for fedcloudclient python3 -m venv "$PWD/.venv" "$PWD/.venv/bin/pip" install fedcloudclient +TMP_SECRETS="$(mktemp)" "$PWD/.venv/bin/fedcloud" secret get --locker-token "$FEDCLOUD_SECRET_LOCKER" \ - deploy data >secrets.yaml + deploy data >"$TMP_SECRETS" && mv "$TMP_SECRETS" secrets.yaml -echo "cloud_info_image: \"ghcr.io/egi-federation/fedcloud-cloud-info:sha-$SHORT_SHA\"" >>extra-vars.yaml +cat >>extra-vars.yaml <ansible.log 2>&1; then status_summary="success" color="#6DBF59" @@ -91,7 +99,7 @@ cat >slack_body.json < " + "text": "fedcloud-catchall deployment was completed for <$comment_url| PR \`#$ISSUE_NUMBER\`> " } } ] diff --git a/deploy/image-sync/NCG-INGRID-PT.tfvars b/deploy/image-sync/NCG-INGRID-PT.tfvars new file mode 100644 index 00000000..97a34dff --- /dev/null +++ b/deploy/image-sync/NCG-INGRID-PT.tfvars @@ -0,0 +1,8 @@ +# Network +net_id = "f15a0e1f-570e-4135-9739-a59b8c2b3e8e" + +# Flavor: svc1.m 2cores/4GB RAM +flavor_id = "737f8483-8063-4567-a8e5-e09a4bcbdb49" + +# Image: ubuntu 22.04 +image_id = "966f2e5a-7b48-4cb2-be92-6e2132413cf2" diff --git a/deploy/image-sync/backend.tf b/deploy/image-sync/backend.tf new file mode 100644 index 00000000..9ced8d34 --- /dev/null +++ b/deploy/image-sync/backend.tf @@ -0,0 +1,12 @@ +# This is where the info about the deployment is to be stored +terraform { + backend "swift" { + container = "terraform-image-sync" + cloud = "backend" + } +} + +# The provider where the deployment is actually performed +provider "openstack" { + cloud = "deploy" +} diff --git a/deploy/image-sync/extra-cloud-init.yaml b/deploy/image-sync/extra-cloud-init.yaml new file mode 100644 
index 00000000..530dad4c --- /dev/null +++ b/deploy/image-sync/extra-cloud-init.yaml @@ -0,0 +1,13 @@ +# Disk layout +disk_setup: + /dev/sdb: + table_type: 'mbr' + layout: true + overwrite: false +fs_setup: + - filesystem: ext4 + device: /dev/sdb + partition: any + overwrite: false +mounts: + - [ /dev/sdb, /var/cache/image-sync ] diff --git a/deploy/image-sync/main.tf b/deploy/image-sync/main.tf new file mode 100644 index 00000000..16efa219 --- /dev/null +++ b/deploy/image-sync/main.tf @@ -0,0 +1,26 @@ +resource "openstack_blockstorage_volume_v3" "image-cache" { + name = "image-cache" + size = 200 +} + +resource "openstack_compute_instance_v2" "image-sync" { + name = "image-sync" + image_id = var.image_id + flavor_id = var.flavor_id + security_groups = ["default"] + user_data = file("cloud-init.yaml") + network { + uuid = var.net_id + } +} + +resource "openstack_compute_volume_attach_v2" "attached" { + instance_id = openstack_compute_instance_v2.image-sync.id + volume_id = openstack_blockstorage_volume_v3.image-cache.id +} + + + +output "instance-id" { + value = openstack_compute_instance_v2.image-sync.id +} diff --git a/deploy/image-sync/vars.tf b/deploy/image-sync/vars.tf new file mode 100644 index 00000000..b214778b --- /dev/null +++ b/deploy/image-sync/vars.tf @@ -0,0 +1,14 @@ +variable "net_id" { + type = string + description = "The id of the network" +} + +variable "image_id" { + type = string + description = "VM image id" +} + +variable "flavor_id" { + type = string + description = "VM flavor id" +} diff --git a/deploy/image-sync/versions.tf b/deploy/image-sync/versions.tf new file mode 100644 index 00000000..1f1e7d48 --- /dev/null +++ b/deploy/image-sync/versions.tf @@ -0,0 +1,9 @@ +terraform { + required_providers { + openstack = { + source = "terraform-provider-openstack/openstack" + version = "~> 1.48" + } + } + required_version = ">= 0.13" +} diff --git a/deploy/playbook.yaml b/deploy/playbook.yaml index 61f35053..9cda45fd 100644 --- 
a/deploy/playbook.yaml +++ b/deploy/playbook.yaml @@ -3,7 +3,5 @@ become: true roles: - role: catchall - tags: ["all", "docker"] vars: site_config_dir: ../sites/ - checkin_token_endpoint: https://aai.egi.eu/auth/realms/egi/protocol/openid-connect/token diff --git a/deploy/roles/catchall/defaults/main.yaml b/deploy/roles/catchall/defaults/main.yaml index 8ad330ae..9b57c285 100644 --- a/deploy/roles/catchall/defaults/main.yaml +++ b/deploy/roles/catchall/defaults/main.yaml @@ -3,14 +3,25 @@ ams_project: egi_cloud_info ams_host: msg.argo.grnet.gr ams_token: secret +# AppDB details +appdb_token: secret + # check-in endpoint -checkin_token_endpoint: "https://aai.egi.eu/auth/realms/egi/protocol/openid-connect/token" +checkin: + token_endpoint: "https://aai.egi.eu/auth/realms/egi/protocol/openid-connect/token" + client_id: id + client_secret: secret # docker image for the cloud info provider cloud_info_image: egifedcloud/ops-cloud-info:latest # site configuration location site_config_dir: sites +site_config_mountpoint: /sites + +# default user for the containers +egi_user: "1999" +egi_group: "1999" # No site information as default sites: [] @@ -20,3 +31,11 @@ cloud_info_cron: hour: "*" weekday: "*" timeout: "600" + +image_sync_image: egifedcloud/ops-image-sync:latest + +image_sync_cron: + minute: "5" + hour: "*/3" + weekday: "*" + timeout: "9000" # 2.5 hours diff --git a/deploy/roles/catchall/molecule/default/converge.yml b/deploy/roles/catchall/molecule/default/converge.yml index 2f4ad264..ba71c0f6 100644 --- a/deploy/roles/catchall/molecule/default/converge.yml +++ b/deploy/roles/catchall/molecule/default/converge.yml @@ -9,6 +9,8 @@ sites: - endpoint: https://example.com:5000/v3/ gocdb: foo.bar + images: + sync: true vos: - auth: project_id: a123456 diff --git a/deploy/roles/catchall/molecule/default/tests/test_default.py b/deploy/roles/catchall/molecule/default/tests/test_default.py index 5fb4704b..a92d8437 100644 --- 
a/deploy/roles/catchall/molecule/default/tests/test_default.py +++ b/deploy/roles/catchall/molecule/default/tests/test_default.py @@ -16,6 +16,7 @@ def test_site_files(host): assert not host.file("/etc/egi/cloud-info/%s.env" % filename).contains("OS_REGION") assert host.file("/etc/egi/cloud-info/%s.env" % filename).exists assert host.file("/etc/cron.d/cloud-info-%s" % filename).exists + assert host.file("/etc/cron.d/egi-image-sync").exists def test_site_files_region(host): diff --git a/deploy/roles/catchall/tasks/cloud-info.yml b/deploy/roles/catchall/tasks/cloud-info.yml index a3019ca4..338d029d 100644 --- a/deploy/roles/catchall/tasks/cloud-info.yml +++ b/deploy/roles/catchall/tasks/cloud-info.yml @@ -1,4 +1,18 @@ --- +- name: Cloud-info dirs + ansible.builtin.file: + path: "{{ dir }}" + state: directory + mode: "755" + owner: "{{ egi_user }}" + group: "{{ egi_group }}" + loop: + - /etc/egi/cloud-info + - /var/lock/cloud-info + - /var/log/cloud-info + loop_control: + loop_var: dir + - name: Cloud-info config directory ansible.builtin.template: src: site-info.yaml.j2 diff --git a/deploy/roles/catchall/tasks/main.yml b/deploy/roles/catchall/tasks/main.yml index 09157e11..f0b10c65 100644 --- a/deploy/roles/catchall/tasks/main.yml +++ b/deploy/roles/catchall/tasks/main.yml @@ -1,16 +1,21 @@ --- - name: Ensure cron is available + tags: ["cloud-info", "image-sync"] ansible.builtin.apt: name: cron state: present update_cache: true - name: Install docker - ansible.builtin.include_tasks: docker.yml + ansible.builtin.include_tasks: + file: docker.yml + apply: + tags: "docker" # this is only executed if explicity requested tags: ['never', 'docker'] - name: Load site configuration + tags: ["cloud-info", "image-sync"] ansible.builtin.include_vars: file: "{{ item }}" name: "{{ 'site_incl_vars_' ~ item | basename | splitext | first }}" @@ -18,30 +23,40 @@ - "{{ site_config_dir }}/*.yaml" - name: Set site configuration variable + tags: ["cloud-info", "image-sync"] 
ansible.builtin.set_fact: sites: "{{ sites | default([]) + [lookup('vars', item)] }}" loop: "{{ query('varnames', '^site_incl_vars_(.*)$') }}" -- name: EGI configuration - block: - - name: Create directories - ansible.builtin.file: - path: "{{ item }}" - state: directory - mode: "755" - owner: 1999 - group: 1999 - loop: - - /etc/egi - - /etc/egi/vos - - /etc/egi/cloud-info - - /var/lock/cloud-info - - /var/log/cloud-info - - name: Site specific config - ansible.builtin.include_tasks: cloud-info.yml - vars: - site: "{{ item }}" - filename: "{{ item.gocdb | replace('.', '-') }}-{{ item.endpoint | hash('md5') }}" - with_items: - - "{{ sites }}" - when: sites is iterable +- name: EGI configuration directories + tags: ["cloud-info", "image-sync"] + ansible.builtin.file: + path: "{{ item }}" + state: directory + mode: "755" + owner: "{{ egi_user }}" + group: "{{ egi_group }}" + loop: + - /etc/egi + - /etc/egi/vos + +- name: Site specific config + tags: ["cloud-info"] + ansible.builtin.include_tasks: + file: cloud-info.yml + apply: + tags: "cloud-info" + vars: + site: "{{ item }}" + filename: "{{ item.gocdb | replace('.', '-') }}-{{ item.endpoint | hash('md5') }}" + with_items: + - "{{ sites }}" + when: sites is iterable + + +- name: Image sync config + tags: ["image-sync"] + ansible.builtin.include_tasks: + file: sync.yml + apply: + tags: "image-sync" diff --git a/deploy/roles/catchall/tasks/sync.yml b/deploy/roles/catchall/tasks/sync.yml new file mode 100644 index 00000000..f9f52215 --- /dev/null +++ b/deploy/roles/catchall/tasks/sync.yml @@ -0,0 +1,35 @@ +--- +- name: Sync dirs + ansible.builtin.file: + path: "{{ item }}" + state: directory + mode: "755" + owner: "{{ egi_user }}" + group: "{{ egi_group }}" + loop: + - /etc/egi/image-sync + - /var/cache/image-sync + +- name: sync configuration + ansible.builtin.template: + src: sync.conf.j2 + dest: /etc/egi/image-sync/sync.conf + mode: "600" + owner: "{{ egi_user }}" + group: "{{ egi_group }}" + +- name: Image sync cron 
+ ansible.builtin.cron: + name: image sync + weekday: "{{ image_sync_cron.weekday }}" + minute: "{{ image_sync_cron.minute }}" + hour: "{{ image_sync_cron.hour }}" + user: root + job: > + flock -n -w {{ image_sync_cron.timeout }} /var/lock/sync + docker run --rm -v /etc/egi:/etc/egi:ro + -v {{ site_config_dir }}:{{ site_config_mountpoint }}:ro + -v /var/cache/image-sync:/atrope-state + {{ image_sync_image }} image-sync + --config-dir /etc/egi/image-sync >> /var/log/sync.log 2>&1 + cron_file: "egi-image-sync" diff --git a/deploy/roles/catchall/templates/cloud-info.env.j2 b/deploy/roles/catchall/templates/cloud-info.env.j2 index 9c7c6e2a..af6843d4 100644 --- a/deploy/roles/catchall/templates/cloud-info.env.j2 +++ b/deploy/roles/catchall/templates/cloud-info.env.j2 @@ -1,7 +1,7 @@ AMS_HOST={{ ams_host }} AMS_PROJECT={{ ams_project }} AMS_TOKEN={{ ams_token }} -CHECKIN_OIDC_TOKEN={{ checkin_token_endpoint }} +CHECKIN_OIDC_TOKEN={{ checkin.token_endpoint }} CHECKIN_SECRETS_PATH=/etc/egi/vos/ CLOUD_INFO_CONFIG=/etc/egi/cloud-info/{{ filename }}.yaml OS_AUTH_TYPE=v3oidcaccesstoken diff --git a/deploy/roles/catchall/templates/sync.conf.j2 b/deploy/roles/catchall/templates/sync.conf.j2 new file mode 100644 index 00000000..73dd10e3 --- /dev/null +++ b/deploy/roles/catchall/templates/sync.conf.j2 @@ -0,0 +1,7 @@ +[sync] +appdb_token = {{ appdb_token }} +site_config_dir = {{ site_config_mountpoint }} + +[checkin] +client_id = {{ checkin.client_id }} +client_secret = {{checkin.client_secret }} diff --git a/doc/architecture/decisions/0003-image-synching.md b/doc/architecture/decisions/0003-image-synching.md new file mode 100644 index 00000000..702f12d1 --- /dev/null +++ b/doc/architecture/decisions/0003-image-synching.md @@ -0,0 +1,48 @@ +# 3. Image synching + +Date: 2024-07-12 + +## Status + +Accepted + +## Context + +EGI provides an image repository (AppDB) for users to share VM images and VOs to +add them to a list of images to be available at the sites supporting the VO. 
+This has been managed with the installation and configuration of specific tools at +site-level, although the uploading of images is for most sites a user-level operation +that does not need any special privileges. + +The main software product for this is +[cloudkeeper](https://github.com/the-cloudkeeper-project/cloudkeeper), which +takes care of analysing the list of images, downloading them locally and then +uploading them to the configured Glance endpoint. cloudkeeper has a pluggable +architecture with a server component (backend) managing the connection with the +cloud site and a frontend component managing the lists. While this allows for +supporting multiple cloud providers, at the moment this complexity brings no +clear added value. [atrope](https://github.com/IFCA-Advanced-Computing/atrope) +is a simpler alternative implementation that focuses on OpenStack. + +Neither cloudkeeper nor atrope has seen any recent development. + +## Decision + +Operate a central image synching service that takes care of making the images available +at the sites for all the EGI VOs. Use atrope as it's easier to develop and +adjust to our needs. + +Make the synchronisation optional for sites so we can roll out the feature gradually +or avoid it completely for those sites where image uploading is not available +for users. 
+ +## Consequences + +With this change in place: + +- sites won't need to run cloudkeeper, the management of VM images becomes + responsibility of the fedcloud control panel +- we can get ready for the new implementation of AppDB faster as there is only + one place to adjust +- we introduce a single point of failure for the image synchronisation which may + be problematic in the future diff --git a/image-sync/Dockerfile b/image-sync/Dockerfile new file mode 100644 index 00000000..2811db4a --- /dev/null +++ b/image-sync/Dockerfile @@ -0,0 +1,48 @@ +FROM python:3 AS build + +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +WORKDIR /image-sync + +RUN python -m venv /image-sync/venv +ENV PATH="/image-sync/venv/bin:$PATH" + +COPY requirements.txt . + +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +RUN pip install --no-cache-dir . + +# The actual image +FROM python:3-slim + +LABEL org.opencontainers.image.source=https://github.com/EGI-Federation/fedcloud-catchall-operations + +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +# hadolint ignore=DL3015, DL3008 +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + gnupg2 qemu-utils curl \ + && curl -s https://dist.eugridpma.info/distribution/igtf/current/GPG-KEY-EUGridPMA-RPM-3 \ + | apt-key add - \ + && echo "deb https://repository.egi.eu/sw/production/cas/1/current egi-igtf core" > /etc/apt/sources.list.d/igtf.list \ + && apt-get update \ + && apt-get install -y ca-policy-egi-core \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /image-sync + +RUN groupadd -g 1999 python \ + && useradd -r -u 1999 -g python python + +COPY --chown=python:python --from=build /image-sync/venv ./venv + +RUN cat /etc/grid-security/certificates/*.pem >> "$(/image-sync/venv/bin/python -m requests.certs)" + +USER 1999 + +ENV PATH="/image-sync/venv/bin:$PATH" +CMD ["image-sync"] diff --git a/image-sync/image_sync/__init__.py b/image-sync/image_sync/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/image-sync/image_sync/sync.py b/image-sync/image_sync/sync.py new file mode 100644 index 00000000..7e4aea71 --- /dev/null +++ b/image-sync/image_sync/sync.py @@ -0,0 +1,174 @@ +import glob +import logging +import os +import os.path +import subprocess +import sys +import tempfile + +import requests +import yaml +from oslo_config import cfg + +# Configuraion +CONF = cfg.CONF +CONF.register_opts( + [ + cfg.StrOpt("site_config_dir", default="."), + cfg.StrOpt("graphql_url", default="https://is.appdb.egi.eu/graphql"), + cfg.ListOpt("formats", default=[]), + cfg.StrOpt("appdb_token"), + ], + group="sync", +) + +# Check-in config +checkin_grp = cfg.OptGroup("checkin") +CONF.register_opts( + [ + cfg.StrOpt("client_id"), + cfg.StrOpt("client_secret"), + cfg.StrOpt("scopes", default="openid profile eduperson_entitlement email"), + cfg.StrOpt( + "discovery_endpoint", + default="https://aai.egi.eu/auth/realms/egi/.well-known/openid-configuration", + ), + ], + group="checkin", +) + + +def fetch_site_info(): + logging.debug("Fetching site info from AppDB") + query = """ + { + siteCloudComputingEndpoints{ + items{ + endpointURL + site { + name + } + shares: shareList { + VO + entityCreationTime + projectID + } + } + } + } + """ + params = {"query": query} + r = requests.get( + CONF.sync.graphql_url, params=params, headers={"accept": "application/json"} + ) + r.raise_for_status() + data = r.json()["data"]["siteCloudComputingEndpoints"]["items"] + return data + + +def dump_atrope_config(site, share, hepix_file): + config_template = """ +[DEFAULT] +state_path = /atrope-state/ + +[glance] +auth_type = v3oidcclientcredentials +auth_url = {auth_url} +protocol = openid +identity_provider = egi.eu +client_id = {client_id} +client_secret = {client_secret} +scope = {scopes} +discovery_endpoint = {discovery_endpoint} +project_id = {project_id} +access_token_type = access_token +formats = {formats} + +[dispatchers] +dispatcher = glance + +[cache] +formats = {formats} + +[sources] 
+hepix_sources = {hepix_file} + """ + formats = site.get("formats", CONF.sync.formats) + return config_template.format( + auth_url=site["endpointURL"], + client_id=CONF.checkin.client_id, + client_secret=CONF.checkin.client_secret, + scopes=CONF.checkin.scopes, + discovery_endpoint=CONF.checkin.discovery_endpoint, + project_id=share["projectID"], + formats=",".join(formats), + hepix_file=hepix_file, + ) + + +def dump_hepix_config(share): + hepix = { + share["VO"]: { + "enabled": True, + "endorser": { + "ca": "/DC=ORG/DC=SEE-GRID/CN=SEE-GRID CA 2013", + "dn": "/DC=EU/DC=EGI/C=NL/O=Hosts/O=EGI.eu/CN=appdb.egi.eu", + }, + "prefix": "EGI ", + "project": share["projectID"], + "token": CONF.sync.appdb_token, + "url": f"https://vmcaster.appdb.egi.eu/store/vo/{share['VO']}/image.list", + } + } + return yaml.dump(hepix) + + +def do_sync(sites_config): + sites_info = fetch_site_info() + for site in sites_info: + site_name = site["site"]["name"] + # filter out those sites that are not part of the centralised ops + if site_name not in sites_config: + logging.debug(f"Discarding site {site_name}, not in config.") + continue + site_image_config = sites_config[site_name].get("images", {}) + if not site_image_config.get("sync", False): + logging.debug(f"Discarding site {site_name}, no sync set.") + continue + site.update(site_image_config) + logging.info(f"Configuring site {site_name}") + for share in site["shares"]: + logging.info(f"Configuring {share['VO']}") + with tempfile.TemporaryDirectory() as tmpdirname: + hepix_file = os.path.join(tmpdirname, "hepix.yaml") + with open(os.path.join(tmpdirname, "atrope.conf"), "w+") as f: + f.write(dump_atrope_config(site, share, hepix_file)) + with open(hepix_file, "w+") as f: + f.write(dump_hepix_config(share)) + cmd = [ + "atrope", + "--config-dir", + tmpdirname, + "sync", + ] + logging.debug(f"Running {' '.join(cmd)}") + subprocess.call(cmd) + + +def load_sites(): + sites = {} + for site_file in glob.iglob("*.yaml", 
root_dir=CONF.sync.site_config_dir): + with open(os.path.join(CONF.sync.site_config_dir, site_file), "r") as f: + site = yaml.safe_load(f.read()) + sites[site["gocdb"]] = site + return sites + + +def main(): + CONF(sys.argv[1:]) + logging.basicConfig(level=logging.DEBUG) + do_sync(load_sites()) + + +if __name__ == "__main__": + main() diff --git a/image-sync/pyproject.toml b/image-sync/pyproject.toml new file mode 100644 index 00000000..541b4834 --- /dev/null +++ b/image-sync/pyproject.toml @@ -0,0 +1,24 @@ +[build-system] +requires = ["setuptools>=61"] +build-backend = "setuptools.build_meta" + +[project] +name = "image_sync" +version = "0.0.1" +description = "Sync images with atrope" +authors = [ + { name = "Enol Fernandez", email = "enol.fernandez@egi.eu" }, +] +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", +] +requires-python = ">=3.10" + +[project.scripts] +image-sync = "image_sync.sync:main" + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + diff --git a/image-sync/requirements.txt b/image-sync/requirements.txt new file mode 100644 index 00000000..d8d15cf0 --- /dev/null +++ b/image-sync/requirements.txt @@ -0,0 +1,4 @@ +git+https://github.com/EGI-Federation/atrope@catchall +requests +oslo.config +PyYAML diff --git a/schema.json b/schema.json index 1439e286..b52ceef9 100644 --- a/schema.json +++ b/schema.json @@ -5,6 +5,25 @@ "title": "site specs", "description": "site configuration schema", "definitions": { + "imagesdata": { + "type": "object", + "properties": { + "sync": { + "type": "boolean", + "description": "Do image synchronisation", + "default": false + }, + "formats": { + "type": "array", + "title": "formats", + "description": "Supported VM image formats at the site.", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false + }, "vodata": { "type": "object", "properties": { @@ -22,17 +41,11 @@ } } }, - 
"required": [ - "auth", - "name" - ], + "required": ["auth", "name"], "additionalProperties": true } }, - "required": [ - "gocdb", - "endpoint" - ], + "required": ["gocdb", "endpoint"], "properties": { "gocdb": { "$id": "#/properties/gocdb", @@ -70,6 +83,13 @@ "items": { "$ref": "#/definitions/vodata" } + }, + "images": { + "$id": "#/properties/images", + "title": "Image Configuration", + "description": "Image configuration", + "type": "object", + "$ref": "#/definitions/imagesdata" } }, "additionalProperties": false diff --git a/sites/WALTON-CLOUD.yaml b/sites/WALTON-CLOUD.yaml index 9c7138e2..e68a0a87 100644 --- a/sites/WALTON-CLOUD.yaml +++ b/sites/WALTON-CLOUD.yaml @@ -1,6 +1,11 @@ --- gocdb: WALTON-CLOUD endpoint: https://horizon.waltoncloud.eu:5000/v3 +images: + sync: true + formats: + - qcow2 + - raw vos: - name: ops auth: