From f4ad39dc7212317b60b8dbcb7c9119ff8b9eed90 Mon Sep 17 00:00:00 2001
From: technowhizz <7688823+technowhizz@users.noreply.github.com>
Date: Thu, 16 May 2024 18:31:44 +0100
Subject: [PATCH 1/3] Update smart metrics to include rated DWPD

Add a get-nvme-drives.yml playbook that collects the NVMe model names
reported by "nvme list -o json" on the overcloud hosts and writes a
stackhpc_dwpd_ratings template between the "# BEGIN DWPD Ratings" and
"# END DWPD Ratings" markers in stackhpc-monitoring.yml.

smartmon-tools.yml deploys the ratings to
/opt/kayobe/etc/monitoring/dwpd_ratings.yml as a JSON array of
{model_name, rated_dwpd} objects, which is the format nvmemon.sh reads
with jq. nvmemon.sh adds a model label to every metric and emits a
rated_dwpd metric, defaulting to 1 for models without a numeric rating.
Trailing whitespace is stripped from model names on both sides of the
lookup, and the missing-file warning goes to stderr so that stdout
carries only metrics.
---
Note: the post-image blob ids on the "index" lines below predate the
review fixes; regenerate the series with git format-patch before
publishing. Context-line indentation was reconstructed and should be
checked against the target files.

 etc/kayobe/ansible/get-nvme-drives.yml | 97 ++++++++++++++++++++++++++
 etc/kayobe/ansible/scripts/nvmemon.sh  | 73 +++++++++++++------
 etc/kayobe/ansible/smartmon-tools.yml  | 19 ++++-
 etc/kayobe/stackhpc-monitoring.yml     |  3 +
 4 files changed, 171 insertions(+), 21 deletions(-)
 create mode 100644 etc/kayobe/ansible/get-nvme-drives.yml

diff --git a/etc/kayobe/ansible/get-nvme-drives.yml b/etc/kayobe/ansible/get-nvme-drives.yml
new file mode 100644
index 000000000..7a1bbef12
--- /dev/null
+++ b/etc/kayobe/ansible/get-nvme-drives.yml
@@ -0,0 +1,97 @@
+---
+- name: Gather unique NVMe disk models and generate a prepopulated variable template
+  hosts: overcloud
+  gather_facts: no
+  tasks:
+    - name: Get NVMe device information
+      command: "nvme list -o json"
+      register: nvme_list
+      changed_when: false
+      become: true
+
+    - name: Parse NVMe device model names
+      set_fact:
+        nvme_models: "{{ nvme_models | default([]) + [item.ModelNumber] }}"
+      loop: "{{ nvme_list.stdout | from_json | json_query('Devices[].{ModelNumber: ModelNumber}') }}"
+      changed_when: false
+
+    # Runs on every host (no run_once) so that each host records its own models.
+    - name: Set gathered NVMe models as host facts
+      set_fact:
+        unique_nvme_models: "{{ nvme_models | default([]) | unique }}"
+
+- name: Update stackhpc-monitoring.yml with DWPD ratings
+  hosts: localhost
+  gather_facts: no
+  tasks:
+    - name: Aggregate unique NVMe models from all hosts
+      set_fact:
+        all_nvme_models: "{{ all_nvme_models | default([]) | union(hostvars[item].unique_nvme_models | default([])) }}"
+      with_items: "{{ groups['overcloud'] }}"
+      run_once: true
+
+    - name: Ensure unique NVMe models
+      set_fact:
+        all_nvme_models: "{{ all_nvme_models | unique }}"
+      run_once: true
+
+    - name: Create a dictionary for quick lookup of DWPD ratings
+      set_fact:
+        dwpd_lookup: "{{ stackhpc_dwpd_ratings | items2dict(key_name='model_name', value_name='rated_dwpd') }}"
+      when: stackhpc_dwpd_ratings is defined and stackhpc_dwpd_ratings | length > 0
+      run_once: true
+
+    - name: Generate new DWPD ratings section
+      set_fact:
+        new_dwpd_section: |
+          stackhpc_dwpd_ratings:
+          {% for model in all_nvme_models %}
+            - model_name: "{{ model }}"
+              rated_dwpd: "{{ (dwpd_lookup | default({})).get(model, '#FILL ME IN') }}"
+          {% endfor %}
+      run_once: true
+
+    - name: Read the current stackhpc-monitoring.yml file
+      slurp:
+        src: "{{ playbook_dir }}/../stackhpc-monitoring.yml"
+      register: monitoring_file_content
+
+    - name: Ensure markers exist in the file
+      set_fact:
+        markers_exist: "{{ ('# BEGIN DWPD Ratings' in old_content) and ('# END DWPD Ratings' in old_content) }}"
+      vars:
+        old_content: "{{ monitoring_file_content.content | b64decode }}"
+      run_once: true
+
+    - name: Fail if markers do not exist
+      fail:
+        msg: "The stackhpc-monitoring.yml file does not contain the required markers: # BEGIN DWPD Ratings and # END DWPD Ratings"
+      when: not markers_exist
+      run_once: true
+
+    - name: Update the content with new DWPD ratings section
+      set_fact:
+        updated_monitoring_content: |
+          {% set old_content = monitoring_file_content.content | b64decode %}
+          {% set before_section = old_content.split('# BEGIN DWPD Ratings')[0] %}
+          {% set after_section = old_content.split('# END DWPD Ratings')[1] %}
+          {{ before_section }}# BEGIN DWPD Ratings
+          {{ new_dwpd_section }}
+          # END DWPD Ratings{{ after_section }}
+      when: markers_exist
+      run_once: true
+
+    - name: Write the updated content back to stackhpc-monitoring.yml
+      copy:
+        content: "{{ updated_monitoring_content }}"
+        dest: "{{ playbook_dir }}/../stackhpc-monitoring.yml"
+        backup: yes
+      when: markers_exist
+      run_once: true
+
+    - name: Print new DWPD ratings section
+      debug:
+        msg:
+          - "{{ new_dwpd_section }}"
+          - "PLEASE UPDATE stackhpc-monitoring.yml IF NEEDED AND REMEMBER TO COMMIT THE FILE TO GIT"
+      run_once: true
diff --git a/etc/kayobe/ansible/scripts/nvmemon.sh b/etc/kayobe/ansible/scripts/nvmemon.sh
index 761e81b7d..9b4696614 100644
--- a/etc/kayobe/ansible/scripts/nvmemon.sh
+++ b/etc/kayobe/ansible/scripts/nvmemon.sh
@@ -21,6 +21,28 @@ if ! command -v nvme >/dev/null 2>&1; then
   exit 1
 fi
 
+# Set path to the DWPD ratings file
+dwpd_file="/opt/kayobe/etc/monitoring/dwpd_ratings.yml"
+
+# Load rated DWPD values from the ratings file (a JSON array read with jq)
+load_dwpd_ratings() {
+  declare -gA rated_dwpd
+  if [[ -f "$dwpd_file" ]]; then
+    while IFS= read -r line; do
+      # Strip trailing spaces
+      key="$(echo "$line" | jq -r '.model_name' | sed 's/[[:space:]]*$//')"
+      value="$(echo "$line" | jq -r '.rated_dwpd' | sed 's/[[:space:]]*$//')"
+      # Skip non-numeric ratings such as the '#FILL ME IN' placeholder so they
+      # cannot corrupt the metrics output; those models default to 1 DWPD.
+      if [[ "$value" =~ ^[0-9]+([.][0-9]+)?$ ]]; then rated_dwpd["$key"]="$value"; fi
+    done < <(jq -c '.[]' "$dwpd_file")
+  else
+    echo "Warning: DWPD ratings file not found at $dwpd_file. Defaulting to 1 DWPD." >&2
+  fi
+}
+
+load_dwpd_ratings
+
 output_format_awk="$(
   cat <<'OUTPUTAWK'
 BEGIN { v = "" }
@@ -44,58 +66,69 @@ format_output() {
 nvme_version="$(nvme version | awk '$1 == "nvme" {print $3}')"
 echo "nvmecli{version=\"${nvme_version}\"} 1" | format_output
 
-# Get devices (DevicePath and PhysicalSize)
-device_info="$(nvme list -o json | jq -c '.Devices[] | {DevicePath: .DevicePath, PhysicalSize: .PhysicalSize}')"
+# Get devices (DevicePath, PhysicalSize and ModelNumber)
+device_info="$(nvme list -o json | jq -c '.Devices[] | {DevicePath, PhysicalSize, ModelNumber}')"
+
+# Convert device_info to an array
+device_info_array=()
+while IFS= read -r line; do
+  device_info_array+=("$line")
+done <<< "$device_info"
 
 # Loop through the NVMe devices
-echo "$device_info" | while read -r device_data; do
-  device=$(echo "$device_data" | jq -r '.DevicePath')
+for device_data in "${device_info_array[@]}"; do
+  device="$(echo "$device_data" | jq -r '.DevicePath')"
   json_check="$(nvme smart-log -o json "${device}")"
   disk="${device##*/}"
+  model_name="$(echo "$device_data" | jq -r '.ModelNumber' | sed 's/[[:space:]]*$//')"
 
-  physical_size=$(echo "$device_data" | jq -r '.PhysicalSize')
-  echo "physical_size_bytes{device=\"${disk}\"} ${physical_size}"
+  physical_size="$(echo "$device_data" | jq -r '.PhysicalSize')"
+  echo "physical_size_bytes{device=\"${disk}\",model=\"${model_name}\"} ${physical_size}"
 
   # The temperature value in JSON is in Kelvin, we want Celsius
   value_temperature="$(echo "$json_check" | jq '.temperature - 273')"
-  echo "temperature_celsius{device=\"${disk}\"} ${value_temperature}"
+  echo "temperature_celsius{device=\"${disk}\",model=\"${model_name}\"} ${value_temperature}"
+
+  # Get the rated DWPD from the dictionary or default to 1 if not found
+  value_rated_dwpd="${rated_dwpd[$model_name]:-1}"
+  echo "rated_dwpd{device=\"${disk}\",model=\"${model_name}\"} ${value_rated_dwpd}"
 
   value_available_spare="$(echo "$json_check" | jq '.avail_spare / 100')"
-  echo "available_spare_ratio{device=\"${disk}\"} ${value_available_spare}"
+  echo "available_spare_ratio{device=\"${disk}\",model=\"${model_name}\"} ${value_available_spare}"
 
   value_available_spare_threshold="$(echo "$json_check" | jq '.spare_thresh / 100')"
-  echo "available_spare_threshold_ratio{device=\"${disk}\"} ${value_available_spare_threshold}"
+  echo "available_spare_threshold_ratio{device=\"${disk}\",model=\"${model_name}\"} ${value_available_spare_threshold}"
 
   value_percentage_used="$(echo "$json_check" | jq '.percent_used / 100')"
-  echo "percentage_used_ratio{device=\"${disk}\"} ${value_percentage_used}"
+  echo "percentage_used_ratio{device=\"${disk}\",model=\"${model_name}\"} ${value_percentage_used}"
 
   value_critical_warning="$(echo "$json_check" | jq '.critical_warning')"
-  echo "critical_warning_total{device=\"${disk}\"} ${value_critical_warning}"
+  echo "critical_warning_total{device=\"${disk}\",model=\"${model_name}\"} ${value_critical_warning}"
 
   value_media_errors="$(echo "$json_check" | jq '.media_errors')"
-  echo "media_errors_total{device=\"${disk}\"} ${value_media_errors}"
+  echo "media_errors_total{device=\"${disk}\",model=\"${model_name}\"} ${value_media_errors}"
 
   value_num_err_log_entries="$(echo "$json_check" | jq '.num_err_log_entries')"
-  echo "num_err_log_entries_total{device=\"${disk}\"} ${value_num_err_log_entries}"
+  echo "num_err_log_entries_total{device=\"${disk}\",model=\"${model_name}\"} ${value_num_err_log_entries}"
 
   value_power_cycles="$(echo "$json_check" | jq '.power_cycles')"
-  echo "power_cycles_total{device=\"${disk}\"} ${value_power_cycles}"
+  echo "power_cycles_total{device=\"${disk}\",model=\"${model_name}\"} ${value_power_cycles}"
 
   value_power_on_hours="$(echo "$json_check" | jq '.power_on_hours')"
-  echo "power_on_hours_total{device=\"${disk}\"} ${value_power_on_hours}"
+  echo "power_on_hours_total{device=\"${disk}\",model=\"${model_name}\"} ${value_power_on_hours}"
 
   value_controller_busy_time="$(echo "$json_check" | jq '.controller_busy_time')"
-  echo "controller_busy_time_seconds{device=\"${disk}\"} ${value_controller_busy_time}"
+  echo "controller_busy_time_seconds{device=\"${disk}\",model=\"${model_name}\"} ${value_controller_busy_time}"
 
   value_data_units_written="$(echo "$json_check" | jq '.data_units_written')"
-  echo "data_units_written_total{device=\"${disk}\"} ${value_data_units_written}"
+  echo "data_units_written_total{device=\"${disk}\",model=\"${model_name}\"} ${value_data_units_written}"
 
   value_data_units_read="$(echo "$json_check" | jq '.data_units_read')"
-  echo "data_units_read_total{device=\"${disk}\"} ${value_data_units_read}"
+  echo "data_units_read_total{device=\"${disk}\",model=\"${model_name}\"} ${value_data_units_read}"
 
   value_host_read_commands="$(echo "$json_check" | jq '.host_read_commands')"
-  echo "host_read_commands_total{device=\"${disk}\"} ${value_host_read_commands}"
+  echo "host_read_commands_total{device=\"${disk}\",model=\"${model_name}\"} ${value_host_read_commands}"
 
   value_host_write_commands="$(echo "$json_check" | jq '.host_write_commands')"
-  echo "host_write_commands_total{device=\"${disk}\"} ${value_host_write_commands}"
+  echo "host_write_commands_total{device=\"${disk}\",model=\"${model_name}\"} ${value_host_write_commands}"
 done | format_output
diff --git a/etc/kayobe/ansible/smartmon-tools.yml b/etc/kayobe/ansible/smartmon-tools.yml
index b4a064b63..1fac2e575 100644
--- a/etc/kayobe/ansible/smartmon-tools.yml
+++ b/etc/kayobe/ansible/smartmon-tools.yml
@@ -1,6 +1,5 @@
 ---
 - hosts: overcloud
-
   tasks:
     - name: Ensure smartmontools, jq, nvme-cli and cron/cronie are installed
       package:
@@ -49,3 +48,21 @@
         - smartmon
         - nvmemon
       become: yes
+
+    - name: Ensure the DWPD Ratings directory exists
+      file:
+        path: /opt/kayobe/etc/monitoring
+        state: directory
+        mode: '0755'
+      when: stackhpc_dwpd_ratings is defined
+      become: true
+
+    # nvmemon.sh reads this file with jq, so it must be a JSON array of
+    # {model_name, rated_dwpd} objects (JSON is also valid YAML).
+    - name: Create a DWPD ratings file
+      copy:
+        content: "{{ stackhpc_dwpd_ratings | to_json }}"
+        dest: /opt/kayobe/etc/monitoring/dwpd_ratings.yml
+        mode: '0644'
+      when: stackhpc_dwpd_ratings is defined
+      become: true
diff --git a/etc/kayobe/stackhpc-monitoring.yml b/etc/kayobe/stackhpc-monitoring.yml
index e8e0bb91f..a4c6fafd7 100644
--- a/etc/kayobe/stackhpc-monitoring.yml
+++ b/etc/kayobe/stackhpc-monitoring.yml
@@ -23,3 +23,6 @@ stackhpc_enable_os_capacity: true
 # Whether TLS certificate verification is enabled for the OpenStack Capacity
 # exporter during Keystone authentication.
 stackhpc_os_capacity_openstack_verify: true
+
+# BEGIN DWPD Ratings
+# END DWPD Ratings

From 30de21586e2c7cedff1d096dd6c63210ae424d42 Mon Sep 17 00:00:00 2001
From: technowhizz <7688823+technowhizz@users.noreply.github.com>
Date: Thu, 16 May 2024 18:35:55 +0100
Subject: [PATCH 2/3] Add release note

---
 releasenotes/notes/rated-dwpd-40526e85e24ef7ea.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 releasenotes/notes/rated-dwpd-40526e85e24ef7ea.yaml

diff --git a/releasenotes/notes/rated-dwpd-40526e85e24ef7ea.yaml b/releasenotes/notes/rated-dwpd-40526e85e24ef7ea.yaml
new file mode 100644
index 000000000..5b8eb5065
--- /dev/null
+++ b/releasenotes/notes/rated-dwpd-40526e85e24ef7ea.yaml
@@ -0,0 +1,8 @@
+---
+features:
+  - |
+    Add support of the operator supplying the rated DWPD value for NVMe drives.
+    There is a playbook ``get-nvme-drives.yml`` that will populate a new
+    section in the ``stackhpc-monitoring.yml`` file with drive model names for
+    NVMes in the cloud. The operator can then fill in the rated DWPD values for
+    each drive.

From 16cb22395b3a82b3d0da063e92f99a8cfc395e71 Mon Sep 17 00:00:00 2001
From: technowhizz <7688823+technowhizz@users.noreply.github.com>
Date: Thu, 16 May 2024 18:39:23 +0100
Subject: [PATCH 3/3] Update alert to use new metric

Compare the 30-day and 7-day average DWPD against the per-drive
nvme_rated_dwpd metric instead of a hard-coded 1 DWPD.
---
 etc/kayobe/kolla/config/prometheus/smart.rules | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/etc/kayobe/kolla/config/prometheus/smart.rules b/etc/kayobe/kolla/config/prometheus/smart.rules
index 853d9268a..cd7dbb3d6 100644
--- a/etc/kayobe/kolla/config/prometheus/smart.rules
+++ b/etc/kayobe/kolla/config/prometheus/smart.rules
@@ -14,19 +14,19 @@ groups:
       description: "{{ $labels.instance }} is reporting unhealthy for the disk at {{ $labels.disk }}. Disk serial number is: {{ $labels.serial_number }}"
 
   - alert: DWPDTooHigh
-    expr: (delta(nvme_data_units_written_total[30d])*512000 / nvme_physical_size_bytes) / 30 > 1
+    expr: (delta(nvme_data_units_written_total[30d])*512000 / nvme_physical_size_bytes) / 30 > nvme_rated_dwpd
     labels:
       severity: alert
     annotations:
       summary: "High 30-Day Average DWPD for {{ $labels.instance }}"
-      description: "The 30-Day average for Disk Writes Per Day for disk {{ $labels.device }} on {{ $labels.instance }} exceeds 1 DWPD"
+      description: "The 30-Day average for Disk Writes Per Day for disk {{ $labels.device }} on {{ $labels.instance }} exceeds the rated DWPD"
 
   - alert: DWPDTooHighWarning
-    expr: (delta(nvme_data_units_written_total[7d])*512000 / nvme_physical_size_bytes) / 7 > 1
+    expr: (delta(nvme_data_units_written_total[7d])*512000 / nvme_physical_size_bytes) / 7 > nvme_rated_dwpd
     labels:
       severity: warning
     annotations:
       summary: "High 7-Day Average DWPD for {{ $labels.instance }}"
-      description: "The 7-day average for Disk Writes Per Day for disk {{ $labels.device }} on {{ $labels.instance }} exceeds 1 DWPD"
+      description: "The 7-day average for Disk Writes Per Day for disk {{ $labels.device }} on {{ $labels.instance }} exceeds the rated DWPD"
 
 {% endraw %}