Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Metric verification job #144

Open
wants to merge 33 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
5258e57
Metric verification job
vyzigold Aug 28, 2024
51c0c64
Update for Emma's comments
vyzigold Sep 3, 2024
c1d6e7c
Fix CI failures
vyzigold Sep 4, 2024
d8ce03d
Change verify metric job after logging job merged
vyzigold Sep 24, 2024
03f2f40
Add [TEST] to test tasks
vyzigold Sep 25, 2024
3c9048a
Add temporary arbitrary IDs to tests
vyzigold Sep 26, 2024
ae12bc9
Add metric_sources_to_test variable
vyzigold Oct 8, 2024
9ff5b91
Rename metric_sources_to_test variable
vyzigold Oct 10, 2024
582803a
Move openstack resource checks to common role
vyzigold Oct 10, 2024
d35988e
Remove verify_ceilometer_metrics.yml
vyzigold Oct 11, 2024
10da834
Remove whitespace
vyzigold Oct 11, 2024
4d5c162
Merge branch 'master' into jwysogla-verify-metrics
vyzigold Oct 22, 2024
3956344
Add retries on metric verification failures
vyzigold Oct 22, 2024
b5f2737
Fix conditions for repeated metric checks
vyzigold Oct 22, 2024
0e625a0
Extend verify metrics for checking scrapeconfigs
vyzigold Oct 23, 2024
0970e75
Merge remote-tracking branch 'origin/master' into jwysogla-verify-met…
vyzigold Oct 24, 2024
90ee44f
Fix lister issues
vyzigold Oct 24, 2024
88281a4
Add container and pod tests to metric verification
vyzigold Oct 24, 2024
432cf9c
Set the nodepool.cloud variable
vyzigold Oct 25, 2024
7aec434
Fix ssh key
vyzigold Oct 25, 2024
9fb1f0c
Rename loop variables
vyzigold Oct 30, 2024
464a8cd
Merge remote-tracking branch 'origin/master' into jwysogla-verify-met…
vyzigold Oct 30, 2024
8abb4ab
Use delegate_to for container checks
vyzigold Oct 31, 2024
c285028
Fix quote escaping
vyzigold Oct 31, 2024
27ee240
Remove debug tasks
vyzigold Nov 1, 2024
de8dfba
Disable node exporter tests
vyzigold Nov 4, 2024
670fd09
Merge remote-tracking branch 'origin/master' into jwysogla-verify-met…
vyzigold Nov 5, 2024
668630a
Change prefix from [TEST] to TEST
vyzigold Nov 7, 2024
33b1b36
Add test id to check openstack services
vyzigold Nov 7, 2024
e9c32cb
Move delegate_to out of common role
vyzigold Nov 7, 2024
ba63e98
Generalize rabbitmq checks
vyzigold Nov 7, 2024
e5313a6
Fix linter issues
vyzigold Nov 7, 2024
fde1c8e
Remove the check_openstack_rervices_rhoso
vyzigold Nov 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions .zuul.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,22 @@
required-projects: *required_projects
irrelevant-files: *irrelevant_files

# Zuul job that runs the metric verification functional tests on osp18.
# It inherits from telemetry-operator-multinode-autoscaling and adds this
# repository's ci/vars-metric-verification-test.yml to the variable files.
- job:
    name: functional-metric-verification-tests-osp18
    parent: telemetry-operator-multinode-autoscaling
    description: |
      Run the metric verification functional test on osp18.
    vars:
      # Gates the "Patch observabilityclient into openstackclient" task in
      # ci/run_verify_metrics_osp18.yml.
      patch_observabilityclient: true
      # Variable files handed over through cifmw_extras; each entry is a path
      # inside one of the checked-out projects, prefixed with "@".
      cifmw_extras:
        - "@{{ ansible_user_dir }}/{{ zuul.projects['github.com/openstack-k8s-operators/ci-framework'].src_dir }}/scenarios/centos-9/multinode-ci.yml"
        - "@{{ ansible_user_dir }}/{{ zuul.projects['github.com/openstack-k8s-operators/telemetry-operator'].src_dir }}/ci/vars-autoscaling.yml"
        - "@{{ ansible_user_dir }}/{{ zuul.projects['github.com/infrawatch/feature-verification-tests'].src_dir }}/ci/vars-metric-verification-test.yml"
    roles:
      - zuul: github.com/openstack-k8s-operators/ci-framework
    # Both values reuse YAML anchors that are defined elsewhere in .zuul.yaml.
    required-projects: *required_projects
    irrelevant-files: *irrelevant_files

- job:
name: feature-verification-tests-noop
parent: noop
Expand Down Expand Up @@ -87,3 +103,4 @@
- functional-tests-on-osp18
- functional-logging-tests-osp18
- functional-graphing-tests-osp18
- functional-metric-verification-tests-osp18
28 changes: 28 additions & 0 deletions ci/run_verify_metrics_osp18.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
---
# Playbook used as a post-deploy hook: loads the job parameters, optionally
# installs the observability client into the openstackclient pod, and then
# runs the telemetry_verify_metrics role.
- name: Run telemetry tests to verify metrics on osp18
  hosts: "{{ cifmw_target_hook_host | default('localhost') }}"
  gather_facts: true
  environment:
    KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
    PATH: "{{ cifmw_path }}"
  vars_files:
    - vars/common.yml
    - vars/osp18_env.yml
  tasks:
    - name: Include vars from the extra_vars files
      ansible.builtin.include_vars:
        dir: "{{ cifmw_basedir }}/artifacts/parameters"

    - name: Patch observabilityclient into openstackclient
      ansible.builtin.shell:
        # "set -e" aborts at the first failing command. Without it only the
        # exit code of the last "oc exec" decides the task result, so a failed
        # ensurepip or aodhclient installation would go unnoticed.
        cmd: |
          set -e
          oc exec openstackclient -- python3 -m ensurepip --upgrade
          oc exec openstackclient -- python3 -m pip install --upgrade aodhclient
          oc exec openstackclient -- python3 -m pip install --upgrade python-observabilityclient
      # The task installs packages, so it is always reported as a change.
      changed_when: true
      # default(false) keeps the playbook usable when the variable is not
      # provided at all; the task is then simply skipped.
      when: patch_observabilityclient | default(false) | bool
      tags:
        - setup

    - name: "Run Telemetry Verify Metrics tests"
      ansible.builtin.import_role:
        name: telemetry_verify_metrics
8 changes: 8 additions & 0 deletions ci/vars-metric-verification-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
---
# Post-deploy hook definitions for the metric verification job.
# The numeric part of each key (00, 99) suggests the intended ordering:
# the tests first, the result collection last.

# Runs the metric verification playbook with this repository's ansible.cfg.
post_deploy_00_fvt_verify_metrics:
  source: "{{ ansible_user_dir }}/{{ zuul.projects['github.com/infrawatch/feature-verification-tests'].src_dir }}/ci/run_verify_metrics_osp18.yml"
  type: playbook
  config_file: "{{ ansible_user_dir }}/{{ zuul.projects['github.com/infrawatch/feature-verification-tests'].src_dir }}/ci/ansible.cfg"

# Runs the result reporting playbook.
post_deploy_99_collect_results:
  source: "{{ ansible_user_dir }}/{{ zuul.projects['github.com/infrawatch/feature-verification-tests'].src_dir }}/ci/report_result.yml"
  type: playbook
2 changes: 1 addition & 1 deletion ci/vars/osp18_env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@ stack_image: "cirros"
stack_flavor: "m1.small"
stack_network: "private"
stack_external_network: "public"
stack_name: "vnf"
stack_name: "vnf"
18 changes: 16 additions & 2 deletions roles/common/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,14 @@ For pod_tests.yml tasks:
common_pod_nspace
- list of projects where pods exist


For crd_tests.yml tasks:

common_crd_test_id
- polarion ID number for each test.
common_crd_list
- list of crd to validate



For endpoint_tests.yml tasks:

common_endpoint_test_id
Expand All @@ -66,6 +65,21 @@ For manifest_tests.yml tasks:
manifest_list
- list of package manifests to validate

For cr\_tests.yml tasks:

common\_cr\_test\_id is defined
- polarion ID number for each test that a CR exists
common\_cr\_list is defined
- list of CRs to check
Each dict should include the following keys: kind, name
A dict can optionally include a "condition\_type" key.
Example:
kind: metricstorage
name: metric-storage
condition\_type: Ready
common\_cr\_ready\_test\_id is optionally defined
- polarion ID number for each test of readiness of the CR



Dependencies
Expand Down
9 changes: 4 additions & 5 deletions roles/common/tasks/container_test.yml
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This whole file feels a little hacky after my changes. The thing is, that the task for getting container status needs to run on the compute nodes. I'd like to run this from the telemetry_verify_metrics role to for example verify the node exporter container is healthy before checking if we're getting metrics from it. I don't think I can set the hosts for one task. Any other ideas other than what I have here?

Copy link
Collaborator

@elfiesmelfie elfiesmelfie Oct 31, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can use delegate_to.

OR add the check to the verify metrics job using a second play in the playbook.

Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@
# Lists all containers with podman (name and status columns only), keeps the
# lines matching the container name and registers the output as
# container_status. The task never reports a change.
# NOTE(review): the two consecutive podman lines here, and the doubled
# assertion/message lines in the next task, look like the removed and added
# sides of a diff hunk - confirm which variant is the current one.
- name: Get container status
ansible.builtin.shell:
cmd: |
podman ps -a --format "{{ '{{.Names}} {{.Status}}' }}" | grep {{ item }}
podman ps -a --format "{{ '{{.Names}} {{.Status}}' }}" | grep {{ container_name }}
changed_when: false
register: container_status


# Asserts on the registered status text; the test id is part of the task name.
# NOTE(review): "'unhealthy' not in container_status.stdout" on its own is
# also true for a container whose status is, for example, "Exited" - consider
# keeping the 'Up' check next to it.
- name: Verify container status {{ common_container_test_id }}
ansible.builtin.assert:
that:
- "'Up' in container_status.stdout"
success_msg: "Container '{{ item }}' is in 'Up' status."
fail_msg: "Container '{{ item }}' is not in 'Up' status. Current status: {{ container_status.stdout }}"
- "'unhealthy' not in container_status.stdout"
success_msg: "Container '{{ container_name }}' is in 'healthy' status."
fail_msg: "Container '{{ container_name }}' is not in 'healthy' status. Current status: {{ container_status.stdout }}"
20 changes: 20 additions & 0 deletions roles/common/tasks/cr_tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
---
# Existence check for the CR described by the current loop item
# (item.kind / item.name). The task fails when "oc get" returns a non-zero
# exit code and never reports a change.
- name: Verify that a CR exists {{ common_cr_test_id }}
  ansible.builtin.command:
    cmd: >-
      oc get {{ item.kind }} {{ item.name }}
  register: result
  changed_when: false
  failed_when: result.rc != 0

# Readiness check: reads the status of the condition whose type equals
# item.condition_type and fails unless that status is exactly "True".
# Only runs when a readiness test id is set and the item names a condition.
- name: Verify that a CR is ready {{ common_cr_ready_test_id }}
  when:
    - common_cr_ready_test_id is defined
    - item.condition_type is defined
  ansible.builtin.command:
    cmd: >-
      oc get {{ item.kind }} {{ item.name }} -o jsonpath='{.status.conditions[?(@.type=="{{ item.condition_type }}")].status}{"\n"}'
  register: result
  changed_when: false
  failed_when: result.stdout != "True"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nicely done

10 changes: 9 additions & 1 deletion roles/common/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,18 @@
ansible.builtin.include_tasks: "crd_tests.yml"
loop: "{{ common_crd_list }}"

# Runs cr_tests.yml once per entry of common_cr_list. The task is skipped
# unless both the test id and the list are defined.
- name: "Run CR tests"
  ansible.builtin.include_tasks: "cr_tests.yml"
  loop: "{{ common_cr_list }}"
  when:
    - common_cr_test_id is defined
    - common_cr_list is defined

# Runs container_test.yml once per entry of common_container_list. Each entry
# is exposed to the included tasks as "container_name" instead of the default
# "item" loop variable.
- name: "Verify container tests"
  ansible.builtin.include_tasks: "container_test.yml"
  loop: "{{ common_container_list }}"
  loop_control:
    loop_var: container_name
  when:
    - common_container_list is defined
    - common_container_test_id is defined
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Very nice, it is a stylistic change, but enhances readability

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wasn't my idea. Ansible complained about "item" being redefined 🤣

51 changes: 51 additions & 0 deletions roles/telemetry_verify_metrics/README.md
vyzigold marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
telemetry_verify_metrics
=========

Test that expected metrics appear in Prometheus

Requirements
------------
OpenStack deployed with the following enabled:
- telemetry
- metricstorage
- ceilometer
- rabbitmq

Tests:
------
- Verify OpenStack is deployed correctly
- Verify telemetry is ready
- Verify metricstorage is ready
- Verify ceilometer is ready
- Verify rabbitmq is ready
- Verify RabbitMQ metrics are being exposed and stored
- Check the rabbitmq metrics endpoint
- Use openstack observabilityclient to verify RabbitMQ metrics are stored in Prometheus
- Verify Ceilometer metrics are being exposed and stored
- Use openstack observabilityclient to verify Ceilometer central metrics are stored in Prometheus
- Use openstack observabilityclient to verify Ceilometer compute metrics are stored in Prometheus
- Verify NodeExporter metrics are being exposed and stored
- Use openstack observabilityclient to verify NodeExporter metrics are stored in Prometheus

Role Variables
--------------
openstack\_cmd - command to access openstack cli. For example: "oc rsh openstackclient openstack"
vyzigold marked this conversation as resolved.
Show resolved Hide resolved
telemetry\_verify\_metrics\_metric\_sources\_to\_test - List of sources to test. Current set of possible sources: ceilometer\_compute\_agent, ceilometer\_central\_agent, node\_exporter, rabbitmq

Example Playbook
----------------
- name: Run telemetry tests to verify metrics on osp18
hosts: "{{ cifmw\_target\_hook\_host | default('localhost') }}"
gather\_facts: true
environment:
KUBECONFIG: "path to kubeconfig"
PATH: "PATH variable contents"
tasks:
- name: "Run Telemetry Verify Metrics tests"
ansible.builtin.import_role:
name: telemetry_verify_metrics

License
-------

Apache 2
7 changes: 7 additions & 0 deletions roles/telemetry_verify_metrics/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
# Metric sources exercised by the role. tasks/main.yml includes the matching
# verify_*_metrics.yml file only for the names listed here.
telemetry_verify_metrics_metric_sources_to_test:
  - ceilometer_compute_agent
  - ceilometer_central_agent
  # node_exporter stays disabled until OSPRH-11059 is fixed.
  # - node_exporter
  - rabbitmq
52 changes: 52 additions & 0 deletions roles/telemetry_verify_metrics/meta/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
---
# Ansible Galaxy metadata for the telemetry_verify_metrics role.
galaxy_info:
  author: Jaromir Wysoglad
  description: Test that metrics from all sources are stored in Prometheus
  company: Red Hat

  # The issue tracker is not overridden; set issue_tracker_url here if it
  # does not live on GitHub.

  # SPDX license identifier, see https://spdx.org
  license: Apache-2.0

  # NOTE(review): the role's tasks use fully qualified ansible.builtin.*
  # module names - confirm that "2.1" is really the intended minimum.
  min_ansible_version: "2.1"

  # No supported platforms are enumerated for this role.

  # Galaxy tags: single alphanumeric words, at most 20 per role.
  # Replace the empty list when tags are added.
  galaxy_tags: []

# Role dependencies, one per line. Replace the empty list when one is added.
dependencies: []
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
- delegate_to: "{{ compute_node }}"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm still not a fan of having this in the role, but I'm not going to block based on this.

I hope you'll later consider whether this is better being included in the ci/run_verify_metrics_osp18.yml playbook, as a separate play for compute and controller node tests.
Something similar (i.e. separating compute and controller tests into different plays) is done in the logging jobs.

# The containers on the compute nodes appear to run as the root user, so the
# checks below need privilege escalation to be able to see them.
become: true
block:
# Runs the common role; which of its checks execute depends on the common_*
# variables that are defined when this file is included.
- name: Check compute node containers are up for {{ compute_node }}
ansible.builtin.include_role:
name: common
48 changes: 48 additions & 0 deletions roles/telemetry_verify_metrics/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
---
# Precheck: hands a list of CRs to the common role. Every entry is checked
# for existence (test id common_cr_test_id); entries that carry a
# condition_type are additionally checked for that condition being "True"
# (test id common_cr_ready_test_id).
- name: Verify OpenStack is deployed correctly
ansible.builtin.include_role:
name: common
vars:
common_cr_test_id: RHOSO-1258
common_cr_ready_test_id: RHOSO-1259
common_cr_list:
- kind: telemetry
name: telemetry
condition_type: Ready
Comment on lines +9 to +11
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like the use of dicts here, which makes the test configs more readable than using lists.

- kind: metricstorage
name: metric-storage
condition_type: Ready
- kind: ceilometer
name: ceilometer
condition_type: Ready
# Both RabbitMQ CRs are checked through the ReconcileSuccess condition
# instead of Ready.
- kind: rabbitmq
name: rabbitmq
condition_type: ReconcileSuccess
- kind: rabbitmq
name: rabbitmq-cell1
condition_type: ReconcileSuccess
# NOTE(review): a tag on a dynamic include_role applies to the include task
# itself, not to the tasks the role loads; when running with
# "--tags precheck" those tasks may be skipped unless the tag is also passed
# through "apply" - confirm.
tags: precheck

# One include per metric source. Each include only runs when its source is
# listed in telemetry_verify_metrics_metric_sources_to_test.
#
# "tags" on a dynamic include_tasks marks only the include statement itself.
# "apply" additionally passes the tag on to the tasks inside the included
# file, so that a run limited with "--tags test" executes the checks instead
# of merely loading the file.
- name: Verify RabbitMQ metrics are being exposed and stored
  ansible.builtin.include_tasks:
    file: verify_rabbitmq_metrics.yml
    apply:
      tags: test
  tags: test
  when: '"rabbitmq" in telemetry_verify_metrics_metric_sources_to_test'

- name: Verify Ceilometer compute metrics are being exposed and stored
  ansible.builtin.include_tasks:
    file: verify_ceilometer_compute_metrics.yml
    apply:
      tags: test
  tags: test
  when: '"ceilometer_compute_agent" in telemetry_verify_metrics_metric_sources_to_test'

- name: Verify Ceilometer central metrics are being exposed and stored
  ansible.builtin.include_tasks:
    file: verify_ceilometer_central_metrics.yml
    apply:
      tags: test
  tags: test
  when: '"ceilometer_central_agent" in telemetry_verify_metrics_metric_sources_to_test'

- name: Verify NodeExporter metrics are being exposed and stored
  ansible.builtin.include_tasks:
    file: verify_node_exporter_metrics.yml
    apply:
      tags: test
  tags: test
  when: '"node_exporter" in telemetry_verify_metrics_metric_sources_to_test'
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Existence-only CR check through the common role: no condition_type and no
# readiness test id are passed, so only "oc get" has to succeed.
- name: Verify ceilometer scrapeconfig exists
  ansible.builtin.include_role:
    name: common
  vars:
    common_cr_test_id: RHOSO-1220
    common_cr_list:
      - kind: scrapeconfigs.monitoring.rhobs
        name: telemetry-ceilometer

# Pod check through the common role: the ceilometer-0 pod in the openstack
# namespace is compared against the status string "Running".
- name: Verify ceilometer central agent is running
  ansible.builtin.include_role:
    name: common
  vars:
    common_pod_test_id: RHOSO-1240
    common_pod_status_str: "Running"
    common_pod_nspace: openstack
    common_pod_list:
      - ceilometer-0

# Uploads a temporary OpenStack image, waits until the ceilometer_image_size
# metric can be queried through the openstack CLI, and always removes the
# image and the downloaded file afterwards.
- name: Verify ceilometer central metrics are stored in prometheus
  block:
    - name: Create an image
      # "set -e" stops at the first failing command and "--fail" makes curl
      # return an error on an HTTP error status. Without them a failed
      # download is masked by the exit code of "image create", and an HTTP
      # error page could be uploaded as the image.
      ansible.builtin.shell: |
        set -e
        curl --fail -L -# http://download.cirros-cloud.net/0.5.2/cirros-0.5.2-x86_64-disk.img > /tmp/fvt_testing_image.img
        {{ openstack_cmd }} image create --container-format bare --disk-format qcow2 fvt_central_testing_image < /tmp/fvt_testing_image.img
      register: result
      changed_when: result.rc == 0

    # Retried up to 10 times, 30 seconds apart, until the command succeeds
    # and its output mentions the metric name.
    - name: |
        TEST Use openstack observabilityclient to verify ceilometer central metrics are stored in prometheus
        RHOSO-1212
      ansible.builtin.shell: |
        {{ openstack_cmd }} metric show ceilometer_image_size
      register: result
      delay: 30
      retries: 10
      until: result.rc == 0 and "ceilometer_image_size" in result.stdout
      changed_when: false

  always:
    # Best-effort cleanup: the image is only deleted when it exists, and a
    # failure here never fails the play.
    - name: Delete the image
      ansible.builtin.shell: |
        {{ openstack_cmd }} image show fvt_central_testing_image && {{ openstack_cmd }} image delete fvt_central_testing_image
      register: result
      changed_when: result.rc == 0
      failed_when: false

    # The downloaded file is only needed for the upload above.
    - name: Remove the downloaded image file
      ansible.builtin.file:
        path: /tmp/fvt_testing_image.img
        state: absent
Loading
Loading