From e56312ccfe04ad0680712bf458cb347d5990ce71 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Thu, 16 May 2024 18:31:44 +0100 Subject: [PATCH 1/5] Update smart metrics to include rated DWPD --- etc/kayobe/ansible/get-nvme-drives.yml | 96 ++++++++++++++++++++++++++ etc/kayobe/ansible/scripts/nvmemon.sh | 89 ++++++++++++++++++------ etc/kayobe/ansible/smartmon-tools.yml | 56 +++++++++++++-- 3 files changed, 215 insertions(+), 26 deletions(-) create mode 100644 etc/kayobe/ansible/get-nvme-drives.yml diff --git a/etc/kayobe/ansible/get-nvme-drives.yml b/etc/kayobe/ansible/get-nvme-drives.yml new file mode 100644 index 000000000..1d2404d80 --- /dev/null +++ b/etc/kayobe/ansible/get-nvme-drives.yml @@ -0,0 +1,96 @@ +--- +- name: Gather unique NVMe disk models on all hosts + hosts: overcloud + gather_facts: false + tasks: + - name: Retrieve NVMe device information + ansible.builtin.command: "nvme list -o json" + register: nvme_list + changed_when: false + become: true + + - name: Parse NVMe device model names + ansible.builtin.set_fact: + nvme_models: "{{ nvme_models | default([]) + [item.ModelNumber] }}" + loop: "{{ nvme_list.stdout | from_json | json_query('Devices[].{ModelNumber: ModelNumber}') }}" + changed_when: false + + - name: Set unique NVMe models as host facts + ansible.builtin.set_fact: + unique_nvme_models: "{{ (nvme_models | default([])) | unique }}" + + - name: Show unique NVMe models per host + ansible.builtin.debug: + var: unique_nvme_models + +- name: Aggregate all unique NVMe models from all hosts + hosts: localhost + gather_facts: false + tasks: + - name: Aggregate unique NVMe models from all overcloud hosts + ansible.builtin.set_fact: + all_nvme_models: "{{ groups['overcloud'] | map('extract', hostvars, 'unique_nvme_models') | select('defined') | sum(start=[]) | unique }}" + + - name: Show all unique NVMe models + ansible.builtin.debug: + var: all_nvme_models + + - name: Ensure dwpd-ratings.yml exists + 
ansible.builtin.stat: + path: "{{ kayobe_env_config_path }}/dwpd-ratings.yml" + register: dwpd_ratings_stat + run_once: true + + - name: Load existing dwpd-ratings.yml + ansible.builtin.set_fact: + existing_dwpd_yml: "{{ lookup('file', kayobe_env_config_path ~ '/dwpd-ratings.yml') | from_yaml }}" + when: dwpd_ratings_stat.stat.exists + run_once: true + + - name: Convert existing YAML array into a dictionary + ansible.builtin.set_fact: + dwpd_lookup: "{{ dwpd_lookup | default({}) | combine({item.model_name: item.rated_dwpd}) }}" + loop: "{{ existing_dwpd_yml.stackhpc_dwpd_ratings | default([]) }}" + loop_control: + label: "{{ item.model_name }}" + run_once: true + + - name: Get list of existing model names + ansible.builtin.set_fact: + existing_model_names: "{{ existing_dwpd_yml.stackhpc_dwpd_ratings | default([]) | map(attribute='model_name') | list }}" + run_once: true + + - name: Identify new models not already in the configuration + ansible.builtin.set_fact: + new_models: "{{ all_nvme_models | default([]) | reject('in', existing_model_names | default([])) | list }}" + run_once: true + + - name: Create entry dictionary for new models + ansible.builtin.set_fact: + new_entries: "{{ new_entries | default([]) + [{'model_name': item, 'rated_dwpd': 1}] }}" + loop: "{{ new_models }}" + run_once: true + when: new_models | length > 0 + + - name: Build updated list for stackhpc_dwpd_ratings + ansible.builtin.set_fact: + new_dwpd_list: "{{ existing_dwpd_yml.stackhpc_dwpd_ratings | default([]) + (new_entries | default([])) }}" + run_once: true + + - name: Write updated dwpd-ratings.yml + ansible.builtin.copy: + content: "---\nstackhpc_dwpd_ratings:\n{% for item in new_dwpd_list %} - model_name: \"{{ item.model_name }}\"\n rated_dwpd: {{ item.rated_dwpd }}\n{% endfor %}" + dest: "{{ kayobe_env_config_path }}/dwpd-ratings.yml" + run_once: true + notify: Show updated dwpd-ratings.yml contents + when: new_dwpd_list is defined and new_dwpd_list | length > 0 + + handlers: + - 
name: Show updated dwpd-ratings.yml contents + ansible.builtin.debug: + msg: + - "Updated local dwpd-ratings.yml contents" + - "{{ {'stackhpc_dwpd_ratings': new_dwpd_list} | to_nice_yaml }}" + - "PLEASE REVIEW AND COMMIT {{ kayobe_env_config_path }}/dwpd-ratings.yml TO VERSION CONTROL." + run_once: true + changed_when: true diff --git a/etc/kayobe/ansible/scripts/nvmemon.sh b/etc/kayobe/ansible/scripts/nvmemon.sh index 761e81b7d..40c2cb70f 100644 --- a/etc/kayobe/ansible/scripts/nvmemon.sh +++ b/etc/kayobe/ansible/scripts/nvmemon.sh @@ -21,6 +21,43 @@ if ! command -v nvme >/dev/null 2>&1; then exit 1 fi +if ! command -v jq >/dev/null 2>&1; then + echo "${0##*/}: jq is required but not installed. Aborting." >&2 + exit 1 +fi + +# Path to the DWPD ratings JSON file +dwpd_file="/opt/kayobe/etc/monitoring/dwpd_ratings.json" + +declare -A rated_dwpd + +load_dwpd_ratings() { + if [[ -f "$dwpd_file" ]]; then + # Read the JSON; if it fails, default to empty array + dwpd_json="$(cat "$dwpd_file" 2>/dev/null | jq '.' || echo '[]')" + + # We iterate over each array element in dwpd_json + while IFS= read -r line; do + key="$(echo "$line" | jq -r '.model_name')" + value="$(echo "$line" | jq -r '.rated_dwpd')" + + # Strip trailing whitespace only; model names may contain internal spaces + key="${key%"${key##*[![:space:]]}"}" + value="${value%"${value##*[![:space:]]}"}" + + # If we have a valid key, store it in the dictionary + if [[ -n "$key" && "$key" != "null" ]]; then + rated_dwpd["$key"]="$value" + fi + done < <(echo "$dwpd_json" | jq -c '.[]') + else + echo "Warning: DWPD ratings file not found at '$dwpd_file'. Defaulting to rated_dwpd=1." 
>&2 + fi +} + + +load_dwpd_ratings + output_format_awk="$( cat <<'OUTPUTAWK' BEGIN { v = "" } @@ -44,58 +81,70 @@ format_output() { nvme_version="$(nvme version | awk '$1 == "nvme" {print $3}')" echo "nvmecli{version=\"${nvme_version}\"} 1" | format_output -# Get devices (DevicePath and PhysicalSize) -device_info="$(nvme list -o json | jq -c '.Devices[] | {DevicePath: .DevicePath, PhysicalSize: .PhysicalSize}')" +# Get devices (DevicePath, PhysicalSize and ModelNumber) +device_info="$(nvme list -o json | jq -c '.Devices[] | {DevicePath, PhysicalSize, ModelNumber, SerialNumber}')" + +# Convert device_info to an array +device_info_array=() +while IFS= read -r line; do + device_info_array+=("$line") +done <<< "$device_info" # Loop through the NVMe devices -echo "$device_info" | while read -r device_data; do - device=$(echo "$device_data" | jq -r '.DevicePath') +for device_data in "${device_info_array[@]}"; do + device="$(echo "$device_data" | jq -r '.DevicePath')" json_check="$(nvme smart-log -o json "${device}")" disk="${device##*/}" + model_name="$(echo "$device_data" | jq -r '.ModelNumber')" + serial_number="$(echo "$device_data" | jq -r '.SerialNumber')" - physical_size=$(echo "$device_data" | jq -r '.PhysicalSize') - echo "physical_size_bytes{device=\"${disk}\"} ${physical_size}" + physical_size="$(echo "$device_data" | jq -r '.PhysicalSize')" + echo "physical_size_bytes{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${physical_size}" # The temperature value in JSON is in Kelvin, we want Celsius value_temperature="$(echo "$json_check" | jq '.temperature - 273')" - echo "temperature_celsius{device=\"${disk}\"} ${value_temperature}" + echo "temperature_celsius{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_temperature}" + + # Get the rated DWPD from the dictionary or default to 1 if not found + value_rated_dwpd="${rated_dwpd[$model_name]:-1}" + echo 
"rated_dwpd{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_rated_dwpd}" value_available_spare="$(echo "$json_check" | jq '.avail_spare / 100')" - echo "available_spare_ratio{device=\"${disk}\"} ${value_available_spare}" + echo "available_spare_ratio{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_available_spare}" value_available_spare_threshold="$(echo "$json_check" | jq '.spare_thresh / 100')" - echo "available_spare_threshold_ratio{device=\"${disk}\"} ${value_available_spare_threshold}" + echo "available_spare_threshold_ratio{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_available_spare_threshold}" value_percentage_used="$(echo "$json_check" | jq '.percent_used / 100')" - echo "percentage_used_ratio{device=\"${disk}\"} ${value_percentage_used}" + echo "percentage_used_ratio{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_percentage_used}" value_critical_warning="$(echo "$json_check" | jq '.critical_warning')" - echo "critical_warning_total{device=\"${disk}\"} ${value_critical_warning}" + echo "critical_warning_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_critical_warning}" value_media_errors="$(echo "$json_check" | jq '.media_errors')" - echo "media_errors_total{device=\"${disk}\"} ${value_media_errors}" + echo "media_errors_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_media_errors}" value_num_err_log_entries="$(echo "$json_check" | jq '.num_err_log_entries')" - echo "num_err_log_entries_total{device=\"${disk}\"} ${value_num_err_log_entries}" + echo "num_err_log_entries_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_num_err_log_entries}" value_power_cycles="$(echo "$json_check" | jq '.power_cycles')" - echo "power_cycles_total{device=\"${disk}\"} 
${value_power_cycles}" + echo "power_cycles_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_power_cycles}" value_power_on_hours="$(echo "$json_check" | jq '.power_on_hours')" - echo "power_on_hours_total{device=\"${disk}\"} ${value_power_on_hours}" + echo "power_on_hours_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_power_on_hours}" value_controller_busy_time="$(echo "$json_check" | jq '.controller_busy_time')" - echo "controller_busy_time_seconds{device=\"${disk}\"} ${value_controller_busy_time}" + echo "controller_busy_time_seconds{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_controller_busy_time}" value_data_units_written="$(echo "$json_check" | jq '.data_units_written')" - echo "data_units_written_total{device=\"${disk}\"} ${value_data_units_written}" + echo "data_units_written_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_data_units_written}" value_data_units_read="$(echo "$json_check" | jq '.data_units_read')" - echo "data_units_read_total{device=\"${disk}\"} ${value_data_units_read}" + echo "data_units_read_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_data_units_read}" value_host_read_commands="$(echo "$json_check" | jq '.host_read_commands')" - echo "host_read_commands_total{device=\"${disk}\"} ${value_host_read_commands}" + echo "host_read_commands_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_host_read_commands}" value_host_write_commands="$(echo "$json_check" | jq '.host_write_commands')" - echo "host_write_commands_total{device=\"${disk}\"} ${value_host_write_commands}" + echo "host_write_commands_total{device=\"${disk}\",model=\"${model_name}\",serial_number=\"${serial_number}\"} ${value_host_write_commands}" done | format_output diff --git a/etc/kayobe/ansible/smartmon-tools.yml 
b/etc/kayobe/ansible/smartmon-tools.yml index c6fa35acc..4c69c7cbb 100644 --- a/etc/kayobe/ansible/smartmon-tools.yml +++ b/etc/kayobe/ansible/smartmon-tools.yml @@ -1,7 +1,6 @@ --- -- name: Install and set up smartmon-tools +- name: Install and set up SMART monitoring tools hosts: overcloud - tasks: - name: Ensure smartmontools, jq, nvme-cli and cron/cronie are installed ansible.builtin.package: @@ -13,11 +12,23 @@ state: present become: true - - name: Ensure Python 3, venv, and pip are installed - ansible.builtin.package: - name: > - {{ ['python3', 'python3-pip'] + (['python3-venv'] if ansible_facts['distribution'] == 'Ubuntu' else []) }} + - name: Ensure Python 3, venv, and pip are installed on Debian/Ubuntu + ansible.builtin.apt: + name: + - python3 + - python3-venv + - python3-pip + state: present + when: ansible_facts.os_family == 'Debian' + become: true + + - name: Ensure Python 3, and pip are installed on RedHat/CentOS + ansible.builtin.yum: + name: + - python3 + - python3-pip state: present + when: ansible_facts.os_family == 'RedHat' become: true - name: Create smartmon Python virtual environment @@ -31,6 +42,7 @@ name: - prometheus_client - pySMART + state: present virtualenv: /opt/smartmon-venv virtualenv_python: python3 become: true @@ -98,3 +110,35 @@ path: /usr/local/bin/smartmon.sh state: absent become: true + +- name: Gather NVMe drives and generate dwpd ratings + import_playbook: get-nvme-drives.yml + when: create_dwpd_ratings | default(false) | bool + +- name: Copy DWPD ratings to overcloud hosts + hosts: overcloud + gather_facts: false + tasks: + - name: Convert the stackhpc_dwpd_ratings variable to JSON + ansible.builtin.set_fact: + dwpd_ratings_json: "{{ stackhpc_dwpd_ratings | default([]) | to_json }}" + run_once: true + when: stackhpc_dwpd_ratings is defined + + - name: Ensure /opt/kayobe/etc/monitoring directory exists + ansible.builtin.file: + path: /opt/kayobe/etc/monitoring + state: directory + mode: '0755' + become: true + when: 
stackhpc_dwpd_ratings is defined + + - name: Copy JSON file to remote + ansible.builtin.copy: + content: "{{ dwpd_ratings_json }}" + dest: "/opt/kayobe/etc/monitoring/dwpd_ratings.json" + owner: root + group: root + mode: '0644' + become: true + when: stackhpc_dwpd_ratings is defined From 4bc1856f293c2ff64a948116ba8b40131900298a Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Thu, 16 May 2024 18:35:55 +0100 Subject: [PATCH 2/5] Add release note --- releasenotes/notes/rated-dwpd-40526e85e24ef7ea.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 releasenotes/notes/rated-dwpd-40526e85e24ef7ea.yaml diff --git a/releasenotes/notes/rated-dwpd-40526e85e24ef7ea.yaml b/releasenotes/notes/rated-dwpd-40526e85e24ef7ea.yaml new file mode 100644 index 000000000..5b8eb5065 --- /dev/null +++ b/releasenotes/notes/rated-dwpd-40526e85e24ef7ea.yaml @@ -0,0 +1,8 @@ +--- +features: + - | + Add support for the operator to supply the rated DWPD value for NVMe drives. + There is a playbook ``get-nvme-drives.yml`` that will populate the + ``dwpd-ratings.yml`` file with drive model names for the NVMe drives + found in the cloud. The operator can then fill in the rated DWPD values for + each drive. From 31b837219536e5f28459523b84f05ec5f60e3ee6 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Thu, 16 May 2024 18:39:23 +0100 Subject: [PATCH 3/5] Update alert to use new metric --- etc/kayobe/kolla/config/prometheus/smart.rules | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/etc/kayobe/kolla/config/prometheus/smart.rules b/etc/kayobe/kolla/config/prometheus/smart.rules index 853d9268a..cd7dbb3d6 100644 --- a/etc/kayobe/kolla/config/prometheus/smart.rules +++ b/etc/kayobe/kolla/config/prometheus/smart.rules @@ -14,19 +14,19 @@ groups: description: "{{ $labels.instance }} is reporting unhealthy for the disk at {{ $labels.disk }}. 
Disk serial number is: {{ $labels.serial_number }}" - alert: DWPDTooHigh - expr: (delta(nvme_data_units_written_total[30d])*512000 / nvme_physical_size_bytes) / 30 > 1 + expr: (delta(nvme_data_units_written_total[30d])*512000 / nvme_physical_size_bytes) / 30 > nvme_rated_dwpd labels: severity: alert annotations: summary: "High 30-Day Average DWPD for {{ $labels.instance }}" - description: "The 30-Day average for Disk Writes Per Day for disk {{ $labels.device }} on {{ $labels.instance }} exceeds 1 DWPD" + description: "The 30-Day average for Disk Writes Per Day for disk {{ $labels.device }} on {{ $labels.instance }} exceeds the rated DWPD" - alert: DWPDTooHighWarning - expr: (delta(nvme_data_units_written_total[7d])*512000 / nvme_physical_size_bytes) / 7 > 1 + expr: (delta(nvme_data_units_written_total[7d])*512000 / nvme_physical_size_bytes) / 7 > nvme_rated_dwpd labels: severity: warning annotations: summary: "High 7-Day Average DWPD for {{ $labels.instance }}" - description: "The 7-day average for Disk Writes Per Day for disk {{ $labels.device }} on {{ $labels.instance }} exceeds 1 DWPD" + description: "The 7-day average for Disk Writes Per Day for disk {{ $labels.device }} on {{ $labels.instance }} exceeds the rated DWPD" {% endraw %} From c830ab78cda16b58ffe2a8fe580913ed3349137d Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Tue, 13 May 2025 00:37:17 +0100 Subject: [PATCH 4/5] Update hardware overview dashboard with more nvme metrics --- .../openstack/hardware_overview.json | 778 +++++++++++++++++- 1 file changed, 733 insertions(+), 45 deletions(-) diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json index b27496136..b30550222 100644 --- a/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/hardware_overview.json @@ -1,5 
+1,48 @@ {% raw %} { + "__inputs": [ + { + "name": "datasource", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "11.4.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "table", + "name": "Table", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], "annotations": { "list": [ { @@ -25,9 +68,22 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, + "id": null, "links": [], - "liveNow": false, "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 10, + "panels": [], + "title": "All Disks Metrics", + "type": "row" + }, { "datasource": { "type": "prometheus", @@ -56,15 +112,15 @@ "h": 7, "w": 6, "x": 0, - "y": 0 + "y": 1 }, - "hideTimeOverride": false, "id": 4, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" @@ -76,7 +132,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.4.0", "targets": [ { "datasource": { @@ -130,15 +186,15 @@ "h": 7, "w": 6, "x": 6, - "y": 0 + "y": 1 }, - "hideTimeOverride": false, "id": 5, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" @@ -150,7 +206,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.4.0", "targets": [ { "datasource": { @@ -199,15 +255,15 @@ "h": 7, "w": 6, "x": 12, - "y": 0 
+ "y": 1 }, - "hideTimeOverride": false, "id": 6, "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", + "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" @@ -219,7 +275,7 @@ "textMode": "auto", "wideLayout": true }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.4.0", "targets": [ { "datasource": { @@ -489,7 +545,7 @@ "h": 10, "w": 20, "x": 0, - "y": 7 + "y": 8 }, "id": 2, "options": { @@ -506,7 +562,7 @@ "showHeader": true, "sortBy": [] }, - "pluginVersion": "11.0.0", + "pluginVersion": "11.4.0", "targets": [ { "$$hashKey": "object:40", @@ -541,7 +597,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "smartmon_temperature_case_raw_value{instance=~\"$node\"} or smartmon_temperature_celsius_raw_value{instance=~\"$node\"}", + "expr": "smartmon_temperature{instance=~\"$node\"}", "format": "table", "hide": false, "instant": true, @@ -562,7 +618,12 @@ { "id": "organize", "options": { - "excludeByName": {}, + "excludeByName": { + "Time": true, + "__name__": true, + "job": true + }, + "includeByName": {}, "indexByName": { "Time 1": 3, "Time 2": 10, @@ -583,7 +644,12 @@ "type 1": 2, "type 2": 17 }, - "renameByName": {} + "renameByName": { + "disk": "Disk", + "instance": "Instance", + "job": "", + "type": "Type" + } } } ], @@ -607,6 +673,7 @@ "axisLabel": "Temperature (°C)", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -653,9 +720,8 @@ "h": 13, "w": 9, "x": 0, - "y": 17 + "y": 18 }, - "hideTimeOverride": false, "id": 8, "options": { "legend": { @@ -665,11 +731,11 @@ "showLegend": true }, "tooltip": { - "maxHeight": 600, "mode": "single", "sort": "none" } }, + "pluginVersion": "11.4.0", "targets": [ { "datasource": { @@ -678,7 +744,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "avg_over_time(smartmon_temperature_case_raw_value{instance=~\"$node\"}[1h]) or 
avg_over_time(smartmon_temperature_celsius_raw_value{instance=~\"$node\"}[1h])", + "expr": "avg_over_time(smartmon_temperature{instance=~\"$node\"}[1h])", "instant": false, "interval": "", "legendFormat": "{{instance}} - {{disk}} - {{serial_number}}", @@ -686,9 +752,548 @@ "refId": "A" } ], - "title": "Disk Temperatures", + "title": "All Disk Temperatures", "type": "timeseries" }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 11, + "panels": [], + "title": "NVMe Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "align": "center", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "job" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "unique_device" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #Health" + }, + "properties": [ + { + "id": "custom.cellOptions", + "value": { + "mode": "basic", + "type": "color-background" + } + }, + { + "id": "custom.width" + }, + { + "id": "displayName", + "value": "Health" + }, + { + "id": "mappings", + "value": [ + { + "options": { + "0": { + "color": "green", + "index": 0, + "text": "Ok" + } + }, + "type": "value" + }, + { + "options": { + "from": 1, + "result": { + "color": "red", + "index": 1, + "text": "Bad" + }, + "to": 1000000000000000 + }, + "type": "range" + } + ] + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".* 2" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", 
+ "options": "Value #Temp" + }, + "properties": [ + { + "id": "displayName", + "value": "Temperature" + }, + { + "id": "unit", + "value": "celsius" + }, + { + "id": "noValue", + "value": "-" + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".* 1" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "__name__" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byRegexp", + "options": ".* 3" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #TBW" + }, + "properties": [ + { + "id": "unit", + "value": "deckbytes" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #TBR" + }, + "properties": [ + { + "id": "unit", + "value": "deckbytes" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #Capacity" + }, + "properties": [ + { + "id": "unit", + "value": "decbytes" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Device" + }, + "properties": [ + { + "id": "links", + "value": [ + { + "title": "", + "url": "/d/uesjf83hh/nvme-monitoring?var-serial_number=${__data.fields[\"Serial Number\"]}" + } + ] + } + ] + } + ] + }, + "gridPos": { + "h": 10, + "w": 20, + "x": 0, + "y": 32 + }, + "id": 12, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true, + "sortBy": [ + { + "desc": true, + "displayName": "TBW" + } + ] + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "$$hashKey": "object:40", + "aggregation": "Last", + "alias": "Healthy", + "crit": 0, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "decimals": 0, + "displayAliasType": "Warning / Critical", + 
"displayType": "Regular", + "displayValueWithAlias": "Never", + "editorMode": "code", + "exemplar": false, + "expr": "label_join(nvme_critical_warning_total{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\")", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "range": false, + "refId": "Health", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 0 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "label_join(nvme_temperature_celsius{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\")", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "__auto", + "range": false, + "refId": "Temp" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "label_join(nvme_data_units_written_total{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\") * 512", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "", + "range": false, + "refId": "TBW" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "label_join(nvme_data_units_read_total{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\") * 512", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "", + "range": false, + "refId": "TBR" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "label_join(nvme_physical_size_bytes{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\")", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "", + "range": false, + "refId": "Capacity" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + 
"editorMode": "code", + "exemplar": false, + "expr": "label_join(delta(nvme_data_units_written_total{instance=~\"$node\"}[24h])*512000,\"unique_device\", \"-\", \"instance\", \"device\")/label_join(nvme_physical_size_bytes{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\")", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "", + "range": false, + "refId": "DWPD" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "label_join(nvme_rated_dwpd{instance=~\"$node\"},\"unique_device\", \"-\", \"instance\", \"device\")", + "format": "table", + "hide": false, + "instant": true, + "legendFormat": "__auto", + "range": false, + "refId": "Rated DWPD" + } + ], + "title": "SMART Info", + "transformations": [ + { + "id": "seriesToColumns", + "options": { + "byField": "unique_device", + "mode": "outer" + } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time 1": true, + "Time 2": true, + "Time 3": true, + "Time 4": true, + "Time 5": true, + "Time 6": true, + "Time 7": true, + "Value #Health": false, + "__name__ 1": true, + "__name__ 2": true, + "__name__ 3": true, + "__name__ 4": true, + "device 1": false, + "device 2": true, + "device 3": true, + "device 4": true, + "device 5": true, + "device 6": true, + "device 7": true, + "instance 1": false, + "instance 2": true, + "instance 3": true, + "instance 4": true, + "instance 5": true, + "instance 6": true, + "instance 7": true, + "job 1": true, + "job 2": true, + "job 3": true, + "job 4": true, + "job 5": true, + "job 6": true, + "job 7": true, + "model 2": true, + "model 3": true, + "model 4": true, + "model 5": true, + "model 6": true, + "model 7": true, + "original_device 1": true, + "original_device 2": true, + "original_device 3": true, + "original_device 4": true, + "original_device 5": true, + "original_device 6": true, + "original_device 7": true, + "serial_number 2": true, + 
"serial_number 3": true, + "serial_number 4": true, + "serial_number 5": true, + "serial_number 6": true, + "serial_number 7": true, + "unique_device": true + }, + "includeByName": {}, + "indexByName": { + "Time 1": 11, + "Time 2": 15, + "Time 3": 23, + "Time 4": 27, + "Time 5": 32, + "Time 6": 38, + "Time 7": 53, + "Value #Capacity": 6, + "Value #DWPD": 8, + "Value #Health": 2, + "Value #Rated DWPD": 7, + "Value #TBR": 5, + "Value #TBW": 4, + "Value #Temp": 3, + "__name__ 1": 12, + "__name__ 2": 16, + "__name__ 3": 37, + "__name__ 4": 54, + "device 1": 1, + "device 2": 21, + "device 3": 24, + "device 4": 28, + "device 5": 33, + "device 6": 39, + "device 7": 55, + "instance 1": 0, + "instance 2": 17, + "instance 3": 14, + "instance 4": 29, + "instance 5": 34, + "instance 6": 40, + "instance 7": 56, + "job 1": 13, + "job 2": 18, + "job 3": 25, + "job 4": 30, + "job 5": 35, + "job 6": 41, + "job 7": 57, + "model 1": 9, + "model 2": 43, + "model 3": 45, + "model 4": 47, + "model 5": 49, + "model 6": 51, + "model 7": 58, + "original_device 1": 20, + "original_device 2": 22, + "original_device 3": 26, + "original_device 4": 31, + "original_device 5": 36, + "original_device 6": 42, + "original_device 7": 59, + "serial_number 1": 10, + "serial_number 2": 44, + "serial_number 3": 46, + "serial_number 4": 48, + "serial_number 5": 50, + "serial_number 6": 52, + "serial_number 7": 60, + "unique_device": 19 + }, + "renameByName": { + "Time 1": "", + "Value #Capacity": "Disk Size", + "Value #DWPD": "DWPD", + "Value #Rated DWPD": "Rated DWPD", + "Value #TBR": "TBR", + "Value #TBW": "TBW", + "__name__ 1": "", + "device 1": "Device", + "instance 1": "Hostname", + "model 1": "Model Name", + "serial_number 1": "Serial Number" + } + } + } + ], + "transparent": true, + "type": "table" + }, { "datasource": { "type": "prometheus", @@ -707,6 +1312,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, 
"gradientMode": "none", @@ -748,8 +1354,8 @@ "gridPos": { "h": 13, "w": 10, - "x": 9, - "y": 17 + "x": 0, + "y": 42 }, "id": 9, "options": { @@ -760,11 +1366,11 @@ "showLegend": true }, "tooltip": { - "maxHeight": 600, "mode": "single", "sort": "none" } }, + "pluginVersion": "11.4.0", "targets": [ { "datasource": { @@ -780,58 +1386,142 @@ ], "title": "DWPD", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Temperature (°C)", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 9, + "x": 10, + "y": 42 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "avg_over_time(nvme_temperature_celsius{instance=~\"$node\"}[1h]) ", + "instant": false, + "interval": "", + "legendFormat": "{{instance}} - {{device}}", + "range": true, + "refId": "A" + } + ], 
+ "title": "NVMe Temperatures", + "type": "timeseries" } ], "refresh": false, - "schemaVersion": 39, + "schemaVersion": 40, "tags": [], "templating": { "list": [ { + "baseFilters": [], "datasource": { "type": "prometheus", "uid": "PBFA97CFB590B2093" }, "filters": [], - "hide": 0, "name": "Filters", - "skipUrlSync": false, "type": "adhoc" }, { - "current": { - "selected": false, - "text": "Prometheus", - "value": "Prometheus" - }, - "hide": 0, + "current": {}, "includeAll": false, - "multi": false, "name": "datasource", "options": [], "query": "prometheus", - "queryValue": "", "refresh": 1, "regex": "", - "skipUrlSync": false, "type": "datasource" }, { "allValue": ".*", - "current": { - "selected": false, - "text": "All", - "value": "$__all" - }, + "current": {}, "datasource": { "type": "prometheus", "uid": "${datasource}" }, "definition": "label_values(node_cpu_seconds_total{job=\"node\"}, instance)", - "hide": 0, "includeAll": true, "label": "Host:", - "multi": false, "name": "node", "options": [], "query": { @@ -840,7 +1530,6 @@ }, "refresh": 1, "regex": "", - "skipUrlSync": false, "sort": 1, "type": "query" } @@ -850,12 +1539,11 @@ "from": "now-24h", "to": "now" }, - "timeRangeUpdatedDuringEditOrView": false, "timepicker": {}, "timezone": "", "title": "Hardware Overview", "uid": "TCN51Y25P", - "version": 1, + "version": 10, "weekStart": "" } {% endraw %} From 98cc322b73a153e23ba8ba8acc7e5d1123ad9421 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Tue, 13 May 2025 00:38:14 +0100 Subject: [PATCH 5/5] Add nvme drive drill down dashboard --- .../grafana/dashboards/openstack/nvme.json | 1217 +++++++++++++++++ 1 file changed, 1217 insertions(+) create mode 100644 etc/kayobe/kolla/config/grafana/dashboards/openstack/nvme.json diff --git a/etc/kayobe/kolla/config/grafana/dashboards/openstack/nvme.json b/etc/kayobe/kolla/config/grafana/dashboards/openstack/nvme.json new file mode 100644 index 000000000..1669b02a0 --- 
/dev/null +++ b/etc/kayobe/kolla/config/grafana/dashboards/openstack/nvme.json @@ -0,0 +1,1217 @@ +{% raw %} +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": 17197, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": {}, + "textMode": "name", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "nvme_data_units_written_total{serial_number=~\"$serial_number\"}", + "instant": true, + "legendFormat": "{{instance}} - {{device}} - {{serial_number}}", + "refId": "A" + } + ], + "title": "Device & Serial Number", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 3 + }, + "id": 26, + "panels": [], + "title": "Device Information", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, 
+ "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 0, + "y": 4 + }, + "id": 22, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "nvme_physical_size_bytes{serial_number=\"$serial_number\"}", + "legendFormat": "Physical Size", + "refId": "A" + } + ], + "title": "Physical Size", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 65 + }, + { + "color": "red", + "value": 75 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 4, + "y": 4 + }, + "id": 6, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "nvme_temperature_celsius{serial_number=\"$serial_number\"}", + "legendFormat": "Temperature", + "refId": "A" + } + ], + "title": "Temperature", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": 
[] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 8, + "y": 4 + }, + "id": 23, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "nvme_rated_dwpd{serial_number=\"$serial_number\"}", + "legendFormat": "Rated DWPD", + "refId": "A" + } + ], + "title": "Rated DWPD", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 50000 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 12, + "y": 4 + }, + "id": 17, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "nvme_power_on_hours_total{serial_number=\"$serial_number\"}", + "legendFormat": "Power Hours", + "refId": "A" + } + ], + "title": "Power-On Hours", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 
16, + "y": 4 + }, + "id": 16, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "nvme_power_cycles_total{serial_number=\"$serial_number\"}", + "legendFormat": "Power Cycles", + "refId": "A" + } + ], + "title": "Power Cycles", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 10, + "panels": [], + "title": "Health Indicators", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 0, + "y": 12 + }, + "id": 21, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "nvme_critical_warning_total{serial_number=\"$serial_number\"}", + "legendFormat": "Critical Warnings", + "refId": "A" + } + ], + "title": "Critical Warnings", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], 
+ "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 80 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 4, + "y": 12 + }, + "id": 5, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "nvme_percentage_used_ratio{serial_number=\"$serial_number\"} * 100", + "legendFormat": "Percentage Used", + "refId": "A" + } + ], + "title": "Percentage Used", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 8, + "y": 12 + }, + "id": 19, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "nvme_num_err_log_entries_total{serial_number=\"$serial_number\"}", + "legendFormat": "Error Log Entries", + "refId": "A" + } + ], + "title": "Error Log Entries", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" 
+ }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 12, + "y": 12 + }, + "id": 18, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "nvme_media_errors_total{serial_number=\"$serial_number\"}", + "legendFormat": "Media Errors", + "refId": "A" + } + ], + "title": "Media Errors", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 20 + }, + { + "color": "green", + "value": 50 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 16, + "y": 12 + }, + "id": 4, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "nvme_available_spare_ratio{serial_number=\"$serial_number\"} * 100", + "legendFormat": "Available Spare", + "refId": "A" + } + ], + "title": "Available Spare Ratio", + "type": "gauge" + }, + { + "datasource": { 
+ "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 20, + "y": 12 + }, + "id": 15, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "nvme_available_spare_threshold_ratio{serial_number=\"$serial_number\"} * 100", + "legendFormat": "Spare Threshold", + "refId": "A" + } + ], + "title": "Spare Threshold Ratio", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 11, + "panels": [], + "title": "Performance Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } 
+ }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 20 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "rate(nvme_data_units_read_total{serial_number=\"$serial_number\"}[5m])*512000", + "legendFormat": "Data Read", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "rate(nvme_data_units_written_total{serial_number=\"$serial_number\"}[5m])*512000", + "legendFormat": "Data Written", + "range": true, + "refId": "B" + } + ], + "title": "Disk I/O", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 20 + }, + "id": 25, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": 
"nvme_data_units_written_total{serial_number=\"$serial_number\"} * 512000", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "All Time TBW", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 20 + }, + "id": 24, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "nvme_data_units_read_total{serial_number=\"$serial_number\"} * 512000", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "All Time TBR", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + 
"mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 27 + }, + "id": 20, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "irate(nvme_controller_busy_time_seconds{serial_number=\"$serial_number\"}[5m])", + "legendFormat": "Controller Busy Time", + "range": true, + "refId": "A" + } + ], + "title": "Controller Busy Time", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "1m", + "schemaVersion": 40, + "tags": [], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "PBFA97CFB590B2093" + }, + "description": "", + "label": "Datasource", + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": { + "text": "Z2M0A13LTCD8", + "value": "Z2M0A13LTCD8" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(nvme_data_units_read_total,serial_number)", + "includeAll": false, + "label": "Serial Number", + "name": "serial_number", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(nvme_data_units_read_total,serial_number)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "NVMe Monitoring", + "uid": "uesjf83hh", + "version": 1, + "weekStart": "" +} +{% endraw %}