
Commit

Merge branch 'stackhpc/2023.1' into logs_in_grafana
dougszumski authored Jul 10, 2024
2 parents 2d6f175 + 4c7b7c9 commit ba3fc8a
Showing 122 changed files with 51,929 additions and 50,713 deletions.
2 changes: 1 addition & 1 deletion .automation
@@ -32,4 +32,4 @@ max_microversion = 3.70
build_timeout = 600

[dashboard]
dashboard_url = http://192.168.39.2
dashboard_url = https://192.168.39.2
47 changes: 44 additions & 3 deletions .github/workflows/stackhpc-all-in-one.yml
@@ -167,7 +167,7 @@ jobs:
VM_NETWORK: ${{ inputs.vm_network }}
VM_SUBNET: ${{ inputs.vm_subnet }}
VM_INTERFACE: ${{ inputs.vm_interface }}
VM_VOLUME_SIZE: ${{ inputs.upgrade && '45' || '35' }}
VM_VOLUME_SIZE: ${{ inputs.upgrade && '50' || '40' }}
VM_TAGS: '["skc-ci-aio", "PR=${{ github.event.number }}"]'

- name: Terraform Plan
@@ -179,6 +179,7 @@ jobs:
OS_APPLICATION_CREDENTIAL_SECRET: ${{ secrets.OS_APPLICATION_CREDENTIAL_SECRET }}

- name: Terraform Apply
id: tf_apply
run: |
for attempt in $(seq 5); do
if terraform apply -auto-approve; then
@@ -355,6 +356,7 @@ jobs:
if: inputs.upgrade

- name: Tempest tests
id: tempest
run: |
mkdir -p tempest-artifacts
docker run -t --rm \
@@ -366,16 +368,55 @@
env:
KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }}

- name: StackHPC OpenStack tests
id: stackhpc-openstack-tests
continue-on-error: true
run: |
mkdir -p sot-results
docker run -t --rm \
-v $(pwd):/stack/kayobe-automation-env/src/kayobe-config \
-v $(pwd)/sot-results:/stack/sot-results \
-e KAYOBE_ENVIRONMENT -e KAYOBE_VAULT_PASSWORD -e KAYOBE_AUTOMATION_SSH_PRIVATE_KEY \
$KAYOBE_IMAGE \
/stack/kayobe-automation-env/src/kayobe-config/.automation/pipeline/playbook-run.sh '$KAYOBE_CONFIG_PATH/ansible/stackhpc-openstack-tests.yml'
env:
KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }}

- name: Collect diagnostic information
id: diagnostics
run: |
mkdir -p diagnostics
sudo -E docker run -t --rm \
-v $(pwd):/stack/kayobe-automation-env/src/kayobe-config \
-v $(pwd)/diagnostics:/stack/diagnostics \
-e KAYOBE_ENVIRONMENT -e KAYOBE_VAULT_PASSWORD -e KAYOBE_AUTOMATION_SSH_PRIVATE_KEY \
$KAYOBE_IMAGE \
/stack/kayobe-automation-env/src/kayobe-config/.automation/pipeline/playbook-run.sh '$KAYOBE_CONFIG_PATH/ansible/diagnostics.yml'
env:
KAYOBE_AUTOMATION_SSH_PRIVATE_KEY: ${{ steps.ssh_key.outputs.ssh_key }}
if: ${{ !cancelled() && steps.tf_apply.outcome == 'success' }}

- name: Upload test result artifacts
uses: actions/upload-artifact@v4
with:
name: tempest-results-${{ inputs.os_distribution }}-${{ inputs.os_release }}-${{ inputs.neutron_plugin }}${{ inputs.upgrade && '-upgrade' }}
path: tempest-artifacts/*
name: test-results-${{ inputs.os_distribution }}-${{ inputs.os_release }}-${{ inputs.neutron_plugin }}${{ inputs.upgrade && '-upgrade' || '' }}
path: |
diagnostics/
tempest-artifacts/
sot-results/
if: ${{ !cancelled() && (steps.tempest.outcome == 'success' || steps.stackhpc-openstack-tests.outcome == 'success' || steps.diagnostics.outcome == 'success') }}

- name: Fail if any Tempest tests failed
run: |
test $(wc -l < tempest-artifacts/failed-tests) -lt 1
- name: Fail if any StackHPC OpenStack tests failed
run: |
echo "Some StackHPC OpenStack tests failed."
echo "See HTML results artifact (sot-results) for details."
exit 1
if: steps.stackhpc-openstack-tests.outcome == 'failure'

- name: Destroy
run: terraform destroy -auto-approve
working-directory: ${{ github.workspace }}/terraform/aio
35 changes: 26 additions & 9 deletions .github/workflows/stackhpc-container-image-build.yml
@@ -34,11 +34,10 @@ on:
required: false
default: true
push-dirty:
description: Push scanned images that have vulnerabilities?
description: Push scanned images that have critical vulnerabilities?
type: boolean
required: false
# NOTE(Alex-Welsh): This default should be flipped once we resolve existing failures
default: true
default: false

env:
ANSIBLE_FORCE_COLOR: True
@@ -136,6 +135,10 @@ jobs:
run: |
curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sudo sh -s -- -b /usr/local/bin v0.49.0
- name: Install yq
run: |
curl -sL https://github.com/mikefarah/yq/releases/download/v4.42.1/yq_linux_amd64.tar.gz | tar xz && sudo mv yq_linux_amd64 /usr/bin/yq
- name: Install Kayobe
run: |
mkdir -p venvs &&
@@ -149,7 +152,7 @@
# Normally installed during host configure.
- name: Install Docker Python SDK
run: |
sudo pip install docker
sudo pip install docker 'requests<2.32.0'
- name: Get Kolla tag
id: write-kolla-tag
@@ -176,7 +179,7 @@
KAYOBE_VAULT_PASSWORD: ${{ secrets.KAYOBE_VAULT_PASSWORD }}

- name: Create build logs output directory
run: mkdir image-build-logs
run: mkdir image-build-logs

- name: Build kolla overcloud images
id: build_overcloud_images
@@ -235,9 +238,16 @@
run: cp image-build-logs/image-scan-output/clean-images.txt image-build-logs/push-attempt-images.txt
if: inputs.push

# NOTE(seunghun1ee): This always appends dirty images with CVEs severity lower than critical.
# This should be reverted when it's decided to filter high level CVEs as well.
- name: Append dirty images to push list
run: |
cat image-build-logs/image-scan-output/dirty-images.txt >> image-build-logs/push-attempt-images.txt
if: ${{ inputs.push }}

- name: Append images with critical vulnerabilities to push list
run: |
cat image-build-logs/image-scan-output/critical-images.txt >> image-build-logs/push-attempt-images.txt
if: ${{ inputs.push && inputs.push-dirty }}

- name: Push images
@@ -249,11 +259,11 @@
while read -r image; do
# Retries!
for i in {1..5}; do
for i in {1..5}; do
if docker push $image; then
echo "Pushed $image"
break
elif $i == 5; then
elif [ $i -eq 5 ] ; then
echo "Failed to push $image"
echo $image >> image-build-logs/push-failed-images.txt
else
@@ -283,8 +293,15 @@
run: if [ $(wc -l < image-build-logs/push-failed-images.txt) -gt 0 ]; then cat image-build-logs/push-failed-images.txt && exit 1; fi
if: ${{ !cancelled() }}

- name: Fail when images failed scanning
run: if [ $(wc -l < image-build-logs/dirty-images.txt) -gt 0 ]; then cat image-build-logs/dirty-images.txt && exit 1; fi
# NOTE(seunghun1ee): Currently we want to mark the job fail only when critical CVEs are detected.
# This can be used again instead of "Fail when critical vulnerabilities are found" when it's
# decided to fail the job on detecting high CVEs as well.
# - name: Fail when images failed scanning
# run: if [ $(wc -l < image-build-logs/image-scan-output/dirty-images.txt) -gt 0 ]; then cat image-build-logs/image-scan-output/dirty-images.txt && exit 1; fi
# if: ${{ !inputs.push-dirty && !cancelled() }}

- name: Fail when critical vulnerabilities are found
run: if [ $(wc -l < image-build-logs/image-scan-output/critical-images.txt) -gt 0 ]; then cat image-build-logs/image-scan-output/critical-images.txt && exit 1; fi
if: ${{ !inputs.push-dirty && !cancelled() }}

# NOTE(mgoddard): Trigger another CI workflow in the
35 changes: 27 additions & 8 deletions doc/source/configuration/cephadm.rst
@@ -1,9 +1,9 @@
================
Cephadm & Kayobe
================
====
Ceph
====

This section describes how to use the Cephadm integration included in StackHPC
Kayobe configuration since Xena to deploy Ceph.
Kayobe configuration to deploy Ceph.

The Cephadm integration takes the form of custom playbooks that wrap
around the Ansible `stackhpc.cephadm collection
@@ -19,10 +19,10 @@ create or modify Ceph cluster deployments. Supported features are:
Resources
=========

- https://docs.ceph.com/en/pacific/cephadm/index.html
- https://docs.ceph.com/en/pacific/
- https://docs.ceph.com/en/quincy/cephadm/index.html
- https://docs.ceph.com/en/quincy/
- https://docs.ceph.com/en/reef/cephadm/index.html
- https://docs.ceph.com/en/reef/
- https://github.com/stackhpc/ansible-collection-cephadm

Configuration
@@ -107,7 +107,7 @@ OSD specification
~~~~~~~~~~~~~~~~~

The following example is a basic OSD spec that adds OSDs for all
available disks:
available disks with encryption at rest:

.. code:: yaml
@@ -118,9 +118,10 @@ available disks:
host_pattern: "*"
data_devices:
all: true
encrypted: true

More information about OSD service placement is available
`here <https://docs.ceph.com/en/pacific/cephadm/services/osd/#advanced-osd-service-specifications>`__.
`here <https://docs.ceph.com/en/quincy/cephadm/services/osd/#advanced-osd-service-specifications>`__.

Container image
~~~~~~~~~~~~~~~
@@ -264,6 +265,24 @@ post-deployment configuration is applied. Commands in the
``cephadm_commands_post`` list are executed after the rest of the Ceph
post-deployment configuration is applied.

Messenger v2 encryption in transit
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Messenger v2 is the default on-wire protocol since the Nautilus release. It
supports `encryption of data in transit
<https://docs.ceph.com/en/quincy/rados/configuration/msgr2/#connection-mode-configuration-options>`_,
but this is not used by default. It may be enabled as follows:

.. code:: yaml
# A list of commands to pass to cephadm shell -- ceph. See stackhpc.cephadm.commands
# for format.
cephadm_commands_pre:
# Enable messenger v2 encryption in transit.
- "config set global ms_cluster_mode secure"
- "config set global ms_service_mode secure"
- "config set global ms_client_mode secure"
Manila & CephFS
~~~~~~~~~~~~~~~

141 changes: 141 additions & 0 deletions doc/source/configuration/cloudkitty.rst
@@ -0,0 +1,141 @@
==========
CloudKitty
==========

Configuring in kayobe-config
============================

By default, CloudKitty uses Gnocchi and Ceilometer as the collector and fetcher
backends. Unless the system has a specific reason not to, we recommend instead
using Prometheus as the backend for both. The following instructions explain
how to do this. Also, see the `Kolla Ansible docs on CloudKitty
<https://docs.openstack.org/kolla-ansible/latest/reference/rating/cloudkitty-guide.html>`__
for more details.

Enable CloudKitty and disable InfluxDB, as we are using OpenSearch as the
storage backend. Set the following in ``kolla.yml``:

.. code-block:: yaml
kolla_enable_cloudkitty: true
# Explicitly disable influxdb as we are using OpenSearch as the CloudKitty backend
kolla_enable_influxdb: false

Set Prometheus as the backend for both the collector and fetcher, and
Elasticsearch as the storage backend. Note that our fork of CloudKitty is
patched so that the CloudKitty Elasticsearch V2 storage backend will also work
with an OpenSearch cluster. Proper support for the V2 OpenSearch storage
backend is still pending in Kolla-Ansible `here
<https://review.opendev.org/c/openstack/kolla-ansible/+/898555>`__. Set the
following in ``kolla/globals.yml``:

.. code-block:: yaml
cloudkitty_collector_backend: prometheus
cloudkitty_fetcher_backend: prometheus
cloudkitty_storage_backend: elasticsearch

If you have TLS enabled, you will also need to set the cafile for Prometheus
and Elasticsearch. Set the following in ``kolla/globals.yml``.

.. code-block::
{% raw %}
cloudkitty_prometheus_cafile: "{{ openstack_cacert }}"
cloudkitty_elasticsearch_cafile: "{{ openstack_cacert }}"
{% endraw %}

The default collection period is one hour, which is likely too long for most
systems as CloudKitty charges by the **entire** collection period if any usage
is seen within this timeframe. This is regardless of actual usage, meaning that
even one minute will be charged as a full hour's usage. As a result, it is
recommended to adjust the collection interval, ``period`` (in units of
seconds), appropriately (e.g. ten minutes). Furthermore, when using Prometheus
as the collector, you need to change the ``scope_key`` to match the metrics
provided by the Prometheus OpenStack Exporter. Both of these can be achieved by
setting the following in ``kolla/config/cloudkitty.conf``:

.. code-block:: console
[collect]
scope_key = tenant_id
period = 600

You will need to configure which metrics CloudKitty should track. The following
example, set in ``kolla/config/cloudkitty/metrics.yml``, will track VM flavors and
the total utilised volume.

.. code-block:: yaml
metrics:
openstack_nova_server_status:
alt_name: instance
groupby:
- uuid
- user_id
- tenant_id
metadata:
- flavor_id
- name
mutate: MAP
mutate_map:
0.0: 1.0 # ACTIVE
11.0: 1.0 # SHUTOFF
12.0: 1.0 # SUSPENDED
16.0: 1.0 # PAUSED
unit: instance
openstack_cinder_limits_volume_used_gb:
alt_name: storage
unit: GiB
groupby:
- tenant_id

If your system had Monasca deployed in the past, you likely have some
relabelled attributes in the Prometheus OpenStack exporter. To account for
this, you should either remove the custom relabelling (in
``kolla/config/prometheus.yml``) or change your ``metrics.yml`` to use the
correct attributes.
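
For illustration only, the sketch below shows the kind of leftover relabelling
to look for; the job and label names are assumptions rather than values taken
from a real deployment. If you find something similar, either delete it or
point ``scope_key`` and the ``groupby``/``metadata`` entries in ``metrics.yml``
at the renamed label (``project_id`` in this example):

.. code-block:: yaml

   # Hypothetical Monasca-era rename in kolla/config/prometheus.yml. With this in
   # place, the exporter's metrics carry project_id rather than tenant_id, so the
   # CloudKitty configuration above would not match anything.
   scrape_configs:
     - job_name: openstack_exporter  # assumed job name
       metric_relabel_configs:
         - source_labels: [tenant_id]
           target_label: project_id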

Post-configuration with openstack-config
========================================

This is an example `openstack-config
<https://github.com/stackhpc/openstack-config>`__ setup to create mappings for
the metrics configured above. Note that the costs are scaled for the ten minute
collection period, e.g. a flavor with 1 VCPU will cost 1 unit per hour.

.. code-block:: yaml
# Map flavors based on VCPUs
openstack_ratings_hashmap_field_mappings:
- service: instance
name: flavor_id
mappings:
- value: '1' # tiny compute flavor (1 vcpu) with an OpenStack flavor ID of 1
cost: 0.1666666666666666
type: flat
- value: '2' # small compute flavor (2 vcpus) with an OpenStack flavor ID of 2
cost: 0.3333333333333333
type: flat
- value: '3' # medium compute flavor (3 vcpus) with an OpenStack flavor ID of 3
cost: 0.5
type: flat
- value: '4' # large compute flavor (4 vcpus) with an OpenStack flavor ID of 4
cost: 0.6666666666666666
type: flat
- value: '5' # xlarge compute flavor (8 vcpus) with an OpenStack flavor ID of 5
cost: 1.3333333333333333
type: flat
- value: '6' # tiny 2 compute flavor (2 vcpus) with an OpenStack flavor ID of 6
cost: 0.3333333333333333
type: flat
# Map volumes based on GB
openstack_ratings_hashmap_service_mappings:
- service: storage
cost: 0.16666666666666666
type: flat
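
For reference, the numbers above follow directly from the ten minute period:
with six collection periods per hour, a flavor charged at ``N`` units per hour
is mapped to ``N / 6`` units per period, so the 1 VCPU flavor costs
1/6 ≈ 0.1667 per period and the 8 VCPU flavor costs 8/6 ≈ 1.3333 per period.
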
See the `OpenStack CloudKitty Ratings role
<https://github.com/stackhpc/ansible-collection-openstack/tree/main/roles/os_ratings>`__
for more details.
