From 37daaf09747da330dac594aee1a7e1e839e7bd29 Mon Sep 17 00:00:00 2001 From: Peter Sabaini Date: Wed, 22 Nov 2023 14:53:54 +0100 Subject: [PATCH] Add support for the cos_agent relation This supports grafana dashboards and metrics scraping from the ceph mgr prometheus module. Have to build with charmcraft 2.6 for dependency handling Also remove zed tests as it's EOL Change-Id: I8b2f132a4997d205119f7afe2a1ab6b2ae4c0134 cherry-picked from 868adde Change-Id: Idd479cef04a24ea64af643bd6e142ac40906e86c func-test-pr: https://github.com/openstack-charmers/zaza-openstack-tests/pull/1208 --- .../ceph-cluster-advanced.json | 3792 +++++++++++++++++ .../grafana_dashboards/ceph-cluster.json | 1269 ++++++ .../grafana_dashboards/cephfs-overview.json | 348 ++ .../grafana_dashboards/host-details.json | 1314 ++++++ .../grafana_dashboards/hosts-overview.json | 880 ++++ .../osd-device-details.json | 857 ++++ .../grafana_dashboards/osds-overview.json | 1028 +++++ .../files/grafana_dashboards/pool-detail.json | 694 +++ .../grafana_dashboards/pool-overview.json | 1711 ++++++++ .../grafana_dashboards/radosgw-detail.json | 522 +++ .../grafana_dashboards/radosgw-overview.json | 695 +++ .../radosgw-sync-overview.json | 490 +++ .../files/grafana_dashboards/rbd-details.json | 444 ++ .../grafana_dashboards/rbd-overview.json | 723 ++++ ...rts.yml.default => prometheus_alerts.yaml} | 0 .../lib/charms/grafana_agent/v0/cos_agent.py | 842 ++++ ceph-mon/metadata.yaml | 3 + ceph-mon/osci.yaml | 3 +- ceph-mon/src/ceph_metrics.py | 81 + ceph-mon/src/charm.py | 2 +- ceph-mon/src/ops_actions/rotate_key.py | 48 +- ceph-mon/test-requirements.txt | 3 + ceph-mon/tests/bundles/jammy-zed.yaml | 259 -- ceph-mon/tests/bundles/kinetic-zed.yaml | 265 -- ceph-mon/unit_tests/test_ceph_metrics.py | 66 +- 25 files changed, 15803 insertions(+), 536 deletions(-) create mode 100644 ceph-mon/files/grafana_dashboards/ceph-cluster-advanced.json create mode 100644 ceph-mon/files/grafana_dashboards/ceph-cluster.json create mode 100644 ceph-mon/files/grafana_dashboards/cephfs-overview.json create mode 100644 ceph-mon/files/grafana_dashboards/host-details.json create mode 100644 ceph-mon/files/grafana_dashboards/hosts-overview.json create mode 100644 ceph-mon/files/grafana_dashboards/osd-device-details.json create mode 100644 ceph-mon/files/grafana_dashboards/osds-overview.json create mode 100644 ceph-mon/files/grafana_dashboards/pool-detail.json create mode 100644 ceph-mon/files/grafana_dashboards/pool-overview.json create mode 100644 ceph-mon/files/grafana_dashboards/radosgw-detail.json create mode 100644 ceph-mon/files/grafana_dashboards/radosgw-overview.json create mode 100644 ceph-mon/files/grafana_dashboards/radosgw-sync-overview.json create mode 100644 ceph-mon/files/grafana_dashboards/rbd-details.json create mode 100644 ceph-mon/files/grafana_dashboards/rbd-overview.json rename ceph-mon/files/prometheus_alert_rules/{prometheus_alerts.yml.default => prometheus_alerts.yaml} (100%) create mode 100644 ceph-mon/lib/charms/grafana_agent/v0/cos_agent.py delete mode 100644 ceph-mon/tests/bundles/jammy-zed.yaml delete mode 100644 ceph-mon/tests/bundles/kinetic-zed.yaml diff --git a/ceph-mon/files/grafana_dashboards/ceph-cluster-advanced.json b/ceph-mon/files/grafana_dashboards/ceph-cluster-advanced.json new file mode 100644 index 00000000..db61f332 --- /dev/null +++ b/ceph-mon/files/grafana_dashboards/ceph-cluster-advanced.json @@ -0,0 +1,3792 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "heatmap", + "name": "Heatmap", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "Ceph cluster overview", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CLUSTER STATE", + "titleSize": "h6", + "type": "row" + }, + { + "colors": null, + "datasource": "${prometheusds}", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "0": { + "text": "HEALTHY" + }, + "1": { + "text": "WARNING" + }, + "2": { + "text": "ERROR" + } + }, + "type": "value" + }, + { + "id": 1, + "options": { + "match": null, + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#9ac48a" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 1 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 2 + } + ] + }, + "unit": "none" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 3, + "interval": "1m", + "links": [ ], + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "ceph_health_status{}", + "format": "time_series", + "instant": true, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 300 + } + ], + "title": "Ceph health status", + "transparent": true, + "type": "stat" + }, + { + "datasource": "${prometheusds}", + "description": "", + "fieldConfig": { + "defaults": { + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "match": null, + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(245, 54, 54, 0.9)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0.10000000000000001 + }, + { + "color": "rgba(50, 172, 45, 0.97)", + "value": 0.29999999999999999 + } + ] + }, + "unit": "percentunit" + } + }, + "gridPos": { + "h": 6, + "w": 3, + "x": 3, + "y": 1 + }, + "id": 4, + "interval": "1m", + "links": [ ], + "maxDataPoints": 100, + "options": { + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "(ceph_cluster_total_bytes{}-ceph_cluster_total_used_bytes{})/ceph_cluster_total_bytes{}", + "format": "time_series", + "instant": true, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 300 + } + ], + "title": "Available Capacity", + "transparent": false, + "type": "gauge" + }, + { + "colors": null, + "datasource": "${prometheusds}", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 2, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "match": null, + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0.025000000000000001 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 1 + } + ] + }, + "unit": "decbytes" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 6, + "y": 1 + }, + "id": 5, + "interval": "1m", + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "ceph_cluster_total_bytes{}", + "format": "time_series", + "instant": true, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 300 + } + ], + "title": "Cluster Capacity", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "${prometheusds}", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 1, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "match": null, + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "Bps" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 9, + "y": 1 + }, + "id": 6, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "sum(irate(ceph_osd_op_w_in_bytes{}[5m]))", + "format": "time_series", + "instant": true, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Write Throughput", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "${prometheusds}", + "description": "", + "fieldConfig": { + "defaults": { + "decimals": 1, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "match": null, + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d44a3a" + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0 + }, + { + "color": "#9ac48a", + "value": 0 + } + ] + }, + "unit": "Bps" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 12, + "y": 1 + }, + "id": 7, + "links": [ ], + "options": { + "colorMode": "none", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "sum(irate(ceph_osd_op_r_out_bytes{}[5m]))", + "format": "time_series", + "instant": true, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Read Throughput", + "transparent": false, + "type": "stat" + }, + { + "colorMode": "Panel", + "colors": { + "crit": "rgb(255, 0, 0)", + "disable": "rgba(128, 128, 128, 0.9)", + "ok": "rgba(50, 128, 45, 0.9)", + "warn": "rgba(237, 129, 40, 0.9)" + }, + "cornerRadius": 0, + "datasource": "${prometheusds}", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + } + }, + "flipCard": false, + "flipTime": 5, + "fontFormat": "Regular", + "gridPos": { + "h": 3, + "w": 6, + "x": 15, + "y": 1 + }, + "id": 8, + "isAutoScrollOnOverflow": false, + "isGrayOnNoData": false, + "isHideAlertsOnDisable": false, + "isIgnoreOKColors": false, + "links": [ ], + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "aggregation": "Last", + "alias": "All", + "datasource": "${prometheusds}", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_osd_metadata)", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "All", + "refId": "A", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "In", + "datasource": "${prometheusds}", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_osd_in)", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "In", + "refId": "B", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "Out", + "datasource": "${prometheusds}", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_osd_in == bool 0)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Out", + "refId": "C", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 1 + }, + { + "aggregation": "Last", + "alias": "Up", + "datasource": "${prometheusds}", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_osd_up)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Up", + "refId": "D", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "Down", + "datasource": "${prometheusds}", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_osd_up == bool 0)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Down", + "refId": "E", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 1 + } + ], + "title": "OSDs", + "transparent": false, + "type": "stat" + }, + { + "colorMode": "Panel", + "colors": { + "crit": "rgba(245, 54, 54, 0.9)", + "disable": "rgba(128, 128, 128, 0.9)", + "ok": "rgba(50, 128, 45, 0.9)", + "warn": "rgba(237, 129, 40, 0.9)" + }, + "cornerRadius": 1, + "datasource": "${prometheusds}", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + } + }, + "flipCard": false, + "flipTime": 5, + "fontFormat": "Regular", + "gridPos": { + "h": 6, + "w": 3, + "x": 21, + "y": 1 + }, + "id": 9, + "isAutoScrollOnOverflow": false, + "isGrayOnNoData": false, + "isHideAlertsOnDisable": false, + "isIgnoreOKColors": false, + "links": [ ], + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "aggregation": "Last", + "alias": "Active", + "datasource": "${prometheusds}", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_mgr_status == 1) or vector(0)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Active", + "refId": "A", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "Standby", + "datasource": "${prometheusds}", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_mgr_status == 0) or vector(0)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Standby", + "refId": "B", + "units": "none", + "valueHandler": "Number Threshold" + } + ], + "title": "MGRs", + "transparent": false, + "type": "stat" + }, + { + "colorMode": "Panel", + "colors": { + "crit": "rgba(245, 54, 54, 0.9)", + "disable": "rgba(128, 128, 128, 0.9)", + "ok": "rgba(50, 128, 45, 0.9)", + "warn": "rgba(237, 129, 40, 0.9)" + }, + "cornerRadius": 1, + "datasource": "${prometheusds}", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Critical" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Warning" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#987d24", + "mode": "fixed" + } + } + ] + } + ] + }, + "flipCard": false, + "flipTime": 5, + "fontFormat": "Regular", + "gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 4 + }, + "id": 10, + "isAutoScrollOnOverflow": false, + "isGrayOnNoData": false, + "isHideAlertsOnDisable": false, + "isIgnoreOKColors": false, + "links": [ ], + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "aggregation": "Last", + "alias": "Active", + "datasource": "${prometheusds}", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ALERTS{alertstate=\"firing\",alertname=~\"^Ceph.+\", severity=\"critical\"}) OR vector(0)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Critical", + "refId": "A", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "Standby", + "datasource": "${prometheusds}", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ALERTS{alertstate=\"firing\",alertname=~\"^Ceph.+\", severity=\"warning\"}) OR vector(0)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Warning", + "refId": "B", + "units": "none", + "valueHandler": "Number Threshold" + } + ], + "title": "Firing Alerts", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "${prometheusds}", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0.025000000000000001 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 0.10000000000000001 + } + ] + }, + "unit": "decbytes" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 6, + "y": 4 + }, + "id": 11, + "links": [ ], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "ceph_cluster_total_used_bytes{}", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Used Capacity", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "${prometheusds}", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 9, + "y": 4 + }, + "id": 12, + "links": [ ], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "sum(irate(ceph_osd_op_w{}[$__rate_interval]))", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Write IOPS", + "transparent": false, + "type": "stat" + }, + { + "colors": null, + "datasource": "${prometheusds}", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ + { + "id": 0, + "options": { + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d44a3a", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0 + }, + { + "color": "#9ac48a", + "value": 0 + } + ] + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 12, + "y": 4 + }, + "id": 13, + "links": [ ], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "sum(irate(ceph_osd_op_r{}[$__rate_interval]))", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Read IOPS", + "transparent": false, + "type": "stat" + }, + { + "colorMode": "Panel", + "colors": { + "crit": "rgba(245, 54, 54, 0.9)", + "disable": "rgba(128, 128, 128, 0.9)", + "ok": "rgba(50, 128, 45, 0.9)", + "warn": "rgba(237, 129, 40, 0.9)" + }, + "cornerRadius": 1, + "datasource": "${prometheusds}", + "description": "", + "displayName": "", + "fieldConfig": { + "defaults": { + "decimals": 0, + "links": [ ], + "mappings": [ ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + } + }, + "flipCard": false, + "flipTime": 5, + "fontFormat": "Regular", + "gridPos": { + "h": 3, + "w": 6, + "x": 15, + "y": 4 + }, + "id": 14, + "isAutoScrollOnOverflow": false, + "isGrayOnNoData": false, + "isHideAlertsOnDisable": false, + "isIgnoreOKColors": false, + "links": [ ], + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "aggregation": "Last", + "alias": "In Quorum", + "datasource": "${prometheusds}", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_mon_quorum_status)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "In Quorum", + "refId": "A", + "units": "none", + "valueHandler": "Text Only" + }, + { + "aggregation": "Last", + "alias": "Total", + "crit": 1, + "datasource": "${prometheusds}", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_mon_quorum_status)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Total", + "refId": "B", + "units": "none", + "valueHandler": "Text Only", + "warn": 2 + }, + { + "aggregation": "Last", + "alias": "MONs out of Quorum", + "crit": 1.6000000000000001, + "datasource": "${prometheusds}", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Annotation", + "displayValueWithAlias": "Never", + "expr": "count(ceph_mon_quorum_status) - sum(ceph_mon_quorum_status)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "MONs out of Quorum", + "range": true, + "refId": "C", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 1.1000000000000001 + } + ], + "title": "Monitors", + "transparent": false, + "type": "stat" + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 15, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CLUSTER STATS", + "titleSize": "h6", + "type": "row" + }, + { + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 8 + }, + "id": 16, + "limit": 10, + "onlyAlertsOnDashboard": true, + "options": { + "alertInstanceLabelFilter": "{alertname=~\"^Ceph.+\"}", + "alertName": "", + "dashboardAlerts": false, + "groupBy": [ ], + "groupMode": "default", + "maxItems": 20, + "sortOrder": 1, + "stateFilter": { + "error": true, + "firing": true, + "noData": false, + "normal": false, + "pending": true + }, + "viewMode": "list" + }, + "show": "current", + "sortOrder": 1, + "stateFilter": [ ], + "title": "Alerts", + "type": "alertlist" + }, + { + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 40, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 0, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#c0921f", + "value": 75 + }, + { + "color": "#E02F44", + "value": 85 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Total Capacity" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Used" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + }, + { + "id": "custom.thresholdsStyle", + "value": { + "mode": "dashed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 8 + }, + "id": 17, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ + "last" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true, + "sortBy": "Last", + "sortDesc": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "ceph_cluster_total_bytes{}", + "format": "time_series", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Total Capacity", + "range": true, + "refId": "A", + "step": 300 + }, + { + "datasource": "${prometheusds}", + "expr": "ceph_cluster_total_used_bytes{}", + "format": "time_series", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Used", + "range": true, + "refId": "B", + "step": 300 + } + ], + "title": "Capacity", + "type": "timeseries" + }, + { + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 85 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 8 + }, + "id": 18, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "sum(irate(ceph_osd_op_w_in_bytes{}[5m]))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Write", + "range": true, + "refId": "A", + "step": 300 + }, + { + "datasource": "${prometheusds}", + "expr": "sum(irate(ceph_osd_op_r_out_bytes{}[5m]))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Read", + "range": true, + "refId": "B", + "step": 300 + } + ], + "title": "Cluster Throughput", + "type": "timeseries" + }, + { + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 16 + }, + "id": 19, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max", + "min" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "sum(irate(ceph_osd_op_w{}[$__rate_interval]))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Write", + "range": true, + "refId": "A", + "step": 300 + }, + { + "datasource": "${prometheusds}", + "expr": "sum(irate(ceph_osd_op_r{}[$__rate_interval]))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Read", + "range": true, + "refId": "B", + "step": 300 + } + ], + "title": "IOPS", + "type": "timeseries" + }, + { + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 16 + }, + "id": 20, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "(ceph_pool_bytes_used{}) *on (pool_id) group_left(name)(ceph_pool_metadata{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "refId": "A", + "step": 300 + } + ], + "title": "Pool Used Bytes", + "type": "timeseries" + }, + { + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "rbd Stored" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "transparent", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 16 + }, + "id": 21, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "(ceph_pool_stored_raw{}) *on (pool_id) group_left(name)(ceph_pool_metadata{})", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "range": true, + "refId": "A", + "step": 300 + } + ], + "title": "Pool Used RAW Bytes", + "type": "timeseries" + }, + { + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 24 + }, + "id": 22, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "(ceph_pool_quota_objects{}) *on (pool_id) group_left(name)(ceph_pool_metadata{})", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "refId": "A", + "step": 300 + } + ], + "title": "Pool Objects Quota", + "type": "timeseries" + }, + { + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 24 + }, + "id": 23, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "(ceph_pool_quota_bytes{}) *on (pool_id) group_left(name)(ceph_pool_metadata{})", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "refId": "A", + "step": 300 + } + ], + "title": "Pool Quota Bytes", + "type": "timeseries" + }, + { + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 24 + }, + "id": 24, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "(ceph_pool_objects{}) *on (pool_id) group_left(name)(ceph_pool_metadata{})", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "title": "Objects Per Pool", + "type": "timeseries" + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 25, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "OBJECTS", + "titleSize": "h6", + "type": "row" + }, + { + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/^Total.*$/" + }, + "properties": [ + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 6, + "x": 0, + "y": 32 + }, + "id": 26, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pool_objects)", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Total", + "range": true, + "refId": "A", + "step": 200 + } + ], + "title": "OSD Type Count", + "type": "timeseries" + }, + { + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/^Total.*$/" + }, + "properties": [ + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + } + ] + }, + "gridPos": { + "h": 12, + "w": 8, + "x": 6, + "y": 32 + }, + "id": 27, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_active{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Active", + "range": true, + "refId": "A" + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_clean{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Clean", + "range": true, + "refId": "B" + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_peering{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Peering", + "range": true, + "refId": "C" + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_degraded{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Degraded", + "range": true, + "refId": "D", + "step": 300 + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_stale{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Stale", + "range": true, + "refId": "E", + "step": 300 + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_unclean_pgs{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Unclean", + "range": true, + "refId": "F", + "step": 300 + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_undersized{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Undersized", + "range": true, + "refId": "G", + "step": 300 + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_incomplete{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Incomplete", + "range": true, + "refId": "H" + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_forced_backfill{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Forced Backfill", + "range": true, + "refId": "I" + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_forced_recovery{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Forced Recovery", + "range": true, + "refId": "J" + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_creating{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Creating", + "range": true, + "refId": "K" + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_wait_backfill{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Wait Backfill", + "range": true, + "refId": "L" + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_deep{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Deep", + "range": true, + "refId": "M" + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_scrubbing{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Scrubbing", + "range": true, + "refId": "N" + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_recovering{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Recovering", + "range": true, + "refId": "O" + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_repair{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Repair", + "range": true, + "refId": "P" + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_down{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Down", + "range": true, + "refId": "Q" + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_peered{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Peered", + "range": true, + "refId": "R" + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_backfill{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Backfill", + "range": true, + "refId": "S" + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_remapped{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Remapped", + "range": true, + "refId": "T" + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_backfill_toofull{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Backfill Toofull", + "range": true, + "refId": "U" + } + ], + "title": "PGs State", + "type": "timeseries" + }, + { + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byRegexp", + "options": "/^Total.*$/" + }, + "properties": [ + { + "id": "custom.stacking", + "value": { + "group": false, + "mode": "normal" + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 10, + "x": 14, + "y": 32 + }, + "id": 28, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ + "mean", + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "asc" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_degraded{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Degraded", + "range": true, + "refId": "A", + "step": 300 + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_stale{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Stale", + "range": true, + "refId": "B", + "step": 300 + }, + { + "datasource": "${prometheusds}", + "expr": "sum(ceph_pg_undersized{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "Undersized", + "range": true, + "refId": "C", + "step": 300 + } + ], + "title": "Stuck PGs", + "type": "timeseries" + }, + { + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 6, + "w": 10, + "x": 14, + "y": 38 + }, + "id": 29, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "sum(irate(ceph_osd_recovery_ops{}[$interval]))", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "OPS", + "refId": "A", + "step": 300 + } + ], + "title": "Recovery Operations", + "type": "timeseries" + }, + { + "collapse": false, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 44 + }, + "id": 30, + "panels": [ + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "opacity" + }, + "dataFormat": "timeseries", + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 42 + }, + "heatmap": { }, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 31, + "legend": { + "show": true + }, + "options": { + "calculate": true, + "calculation": { + "yBuckets": { + "mode": "count", + "scale": { + "log": 2, + "type": "log" + }, + "value": "1" + } + }, + "cellGap": 2, + "cellValues": { }, + "color": { + "exponent": 0.5, + "fill": "#b4ff00", + "mode": "opacity", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1.0000000000000001e-09 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "min": "0", + "reverse": false, + "unit": "ms" + } + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "ceph_osd_apply_latency_ms{}", + "format": "time_series", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "OSD Apply Latency Distribution", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": "", + "yAxis": { + "decimals": null, + "format": "ms", + "logBase": 2, + "max": null, + "min": "0", + "show": true, + "splitFactor": 1 + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": 10 + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#65c5db", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "opacity" + }, + "dataFormat": "timeseries", + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 42 + }, + "heatmap": { }, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 32, + "legend": { + "show": true + }, + "options": { + "calculate": true, + "calculation": { + "yBuckets": { + "mode": "count", + "scale": { + "log": 2, + "type": "log" + } + } + }, + "cellGap": 2, + "cellValues": { }, + "color": { + "exponent": 0.5, + "fill": "#65c5db", + "mode": "opacity", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1.0000000000000001e-09 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "min": "0", + "reverse": false, + "unit": "ms" + } + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "ceph_osd_commit_latency_ms{}", + "format": "time_series", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "OSD Commit Latency Distribution", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": "", + "yAxis": { + "decimals": null, + "format": "ms", + "logBase": 2, + "max": null, + "min": "0", + "show": true, + "splitFactor": 1 + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": 10 + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#806eb7", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "opacity" + }, + "dataFormat": "timeseries", + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 50 + }, + "heatmap": { }, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 33, + "legend": { + "show": true + }, + "options": { + "calculate": true, + "calculation": { + "yBuckets": { + "mode": "count", + "scale": { + "log": 2, + "type": "log" + } + } + }, + "cellGap": 2, + "cellValues": { }, + "color": { + "exponent": 0.5, + "fill": "#806eb7", + "mode": "opacity", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1.0000000000000001e-09 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 2, + "min": "0", + "reverse": false, + "unit": "ms" + } + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "rate(ceph_osd_op_r_latency_sum{}[5m]) / rate(ceph_osd_op_r_latency_count{}[5m]) >= 0", + "format": "time_series", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "OSD Read Op Latency Distribution", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": "", + "yAxis": { + "decimals": null, + "format": "ms", + "logBase": 2, + "max": null, + "min": "0", + "show": true, + "splitFactor": 1 + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#f9934e", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "opacity" + }, + "dataFormat": "timeseries", + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 50 + }, + "heatmap": { }, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 34, + "legend": { + "show": true + }, + "options": { + "calculate": true, + "calculation": { + "yBuckets": { + "mode": "count", + "scale": { + "log": 2, + "type": "log" + } + } + }, + "cellGap": 2, + "cellValues": { }, + "color": { + "exponent": 0.5, + "fill": "#f9934e", + "mode": "opacity", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 128 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1.0000000000000001e-09 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "showValue": "never", + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "decimals": 2, + "min": "0", + "reverse": false, + "unit": "ms" + } + }, + "pluginVersion": "9.4.7", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "rate(ceph_osd_op_w_latency_sum{}[5m]) / rate(ceph_osd_op_w_latency_count{}[5m]) >= 0", + "format": "time_series", + "instant": false, + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "OSD Write Op Latency Distribution", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": "", + "yAxis": { + "decimals": null, + "format": "ms", + "logBase": 2, + "max": null, + "min": "0", + "show": true, + "splitFactor": 1 + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 58 + }, + "id": 35, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "avg(rate(ceph_osd_op_r_latency_sum{}[5m]) / rate(ceph_osd_op_r_latency_count{}[5m]) >= 0)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Read", + "refId": "A" + }, + { + "datasource": "${prometheusds}", + "expr": "avg(rate(ceph_osd_op_w_latency_sum{}[5m]) / rate(ceph_osd_op_w_latency_count{}[5m]) >= 0)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Write", + "refId": "B" + } + ], + "title": "Recovery Operations", + "type": "timeseries" + }, + { + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [ ] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 58 + }, + "id": 36, + "interval": "$interval", + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.3", + "targets": [ + { + "datasource": "${prometheusds}", + "expr": "avg(ceph_osd_apply_latency_ms{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "apply", + "metric": "ceph_osd_perf_apply_latency_seconds", + "refId": "A", + "step": 4 + }, + { + "datasource": "${prometheusds}", + "expr": "avg(ceph_osd_commit_latency_ms{})", + "format": "time_series", + "interval": "$interval", + "intervalFactor": 1, + "legendFormat": "commit", + "metric": "ceph_osd_perf_commit_latency_seconds", + "refId": "B", + "step": 4 + } + ], + "title": "AVG OSD Apply + Commit Latency", + "type": "timeseries" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "LATENCY", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": true, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 45 + }, + "id": 37, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "", + "titleSize": "h6", + "type": "row" + }, + { + "columns": [ ], + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "left", + "cellOptions": { + "type": "auto" + }, + "filterable": false, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "custom.hidden", + "value": true + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 46 + }, + "id": 38, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "9.4.7", + "styles": "", + "targets": [ + { + "datasource": "${prometheusds}", + "exemplar": false, + "expr": "count by (ceph_version)(ceph_osd_metadata{})", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "OSD Services", + "range": false, + "refId": "A" + }, + { + "datasource": "${prometheusds}", + "exemplar": false, + "expr": "count by (ceph_version)(ceph_mon_metadata{})", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Mon Services", + "range": false, + "refId": "B" + }, + { + "datasource": "${prometheusds}", + "exemplar": false, + "expr": "count by (ceph_version)(ceph_mds_metadata{})", + "format": "table", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "MDS Services", + "range": false, + "refId": "C" + }, + { + "datasource": "${prometheusds}", + "exemplar": false, + "expr": "count by (ceph_version)(ceph_rgw_metadata{})", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "RGW Services", + "range": false, + "refId": "D" + }, + { + "datasource": "${prometheusds}", + "exemplar": false, + "expr": "count by (ceph_version)(ceph_mgr_metadata{})", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "MGR Services", + "range": false, + "refId": "E" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Ceph Versions", + "transformations": [ + { + "id": "merge", + "options": { } + }, + { + "id": "organize", + "options": { + "excludeByName": { }, + "indexByName": { }, + "renameByName": { + "Time": "", + "Value #A": "OSD Services", + "Value #B": "Mon Services", + "Value #C": "MDS Services", + "Value #D": "RGW Services", + "Value #E": "MGR Services", + "ceph_version": "Ceph Version" + } + } + } + ], + "type": "table" + } + ], + "refresh": "1m", + "rows": [ ], + "schemaVersion": 38, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": "Data Source", + "name": "prometheusds", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "auto": true, + "auto_count": 10, + "auto_min": "1m", + "current": { + "text": "$__auto_interval_interval", + "value": "$__auto_interval_interval" + }, + "hide": 0, + "label": "Interval", + "name": "interval", + "options": [ + { + "selected": true, + "text": "auto", + "value": "$__auto_interval_interval" + }, + { + "selected": false, + "text": "5s", + "value": "5s" + }, + { + "selected": false, + "text": "10s", + "value": "10s" + }, + { + "selected": false, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "5s,10s,30s,1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "refresh": 2, + "type": "interval", + "valuelabels": { } + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Ceph Cluster - Advanced", + "version": 0 +} diff --git a/ceph-mon/files/grafana_dashboards/ceph-cluster.json b/ceph-mon/files/grafana_dashboards/ceph-cluster.json new file mode 100644 index 00000000..9d1fbd73 --- /dev/null +++ b/ceph-mon/files/grafana_dashboards/ceph-cluster.json @@ -0,0 +1,1269 @@ +{ + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "5.0.0" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "5.0.0" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "5.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "5.0.0" + } + ], + "annotations": { + "list": [] + }, + "description": "Ceph cluster overview", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "iteration": 1525415495309, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": true, + "colorValue": false, + "colors": [ + "rgba(50, 128, 45, 0.9)", + "rgba(237, 129, 40, 0.9)", + "rgb(255, 0, 0)" + ], + "datasource": "${prometheusds}", + "editable": false, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 6, + "x": 0, + "y": 0 + }, + "hideTimeOverride": true, + "id": 21, + "interval": "1m", + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "ceph_health_status", + "format": "time_series", + "instant": true, + "interval": "$interval", + "intervalFactor": 1, + "refId": "A", + "step": 60 + } + ], + "thresholds": "1,2", + "timeFrom": null, + "title": "Health Status", + "transparent": false, + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "OK", + "value": "0" + }, + { + "op": "=", + "text": "WARN", + "value": "1" + }, + { + "op": "=", + "text": "ERR", + "value": "2" + } + ], + "valueName": "current" + }, + { + "colorMode": "Panel", + "colors": { + "crit": "rgb(255, 0, 0)", + "disable": "rgba(128, 128, 128, 0.9)", + "ok": "rgba(50, 128, 45, 0.9)", + "warn": "rgba(237, 129, 40, 0.9)" + }, + "cornerRadius": 0, + "datasource": "${prometheusds}", + "displayName": "", + "flipCard": false, + "flipTime": 5, + "fontFormat": "Regular", + "gridPos": { + "h": 3, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 43, + "isAutoScrollOnOverflow": false, + "isGrayOnNoData": false, + "isHideAlertsOnDisable": false, + "isIgnoreOKColors": false, + "links": [], + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "targets": [ + { + "aggregation": "Last", + "alias": "All", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_osd_metadata)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "All", + "refId": "A", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "In", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_osd_in)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "In", + "refId": "B", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "Out", + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_osd_in == bool 0)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Out", + "refId": "C", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 1 + }, + { + "aggregation": "Last", + "alias": "Up", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_osd_up)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Up", + "refId": "D", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "Down", + "crit": 2, + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_osd_up == bool 0)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Down", + "refId": "E", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 1 + } + ], + "title": "OSDs", + "type": "stat" + }, + { + "clusterName": "", + "colorMode": "Panel", + "colors": { + "crit": "rgba(245, 54, 54, 0.9)", + "disable": "rgba(128, 128, 128, 0.9)", + "ok": "rgba(50, 128, 45, 0.9)", + "warn": "rgba(237, 129, 40, 0.9)" + }, + "cornerRadius": 1, + "datasource": "${prometheusds}", + "displayName": "", + "flipCard": false, + "flipTime": 5, + "fontFormat": "Regular", + "gridPos": { + "h": 3, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 41, + "isAutoScrollOnOverflow": false, + "isGrayOnNoData": false, + "isHideAlertsOnDisable": false, + "isIgnoreOKColors": false, + "links": [], + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "targets": [ + { + "aggregation": "Last", + "alias": "In Quorum", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "sum(ceph_mon_quorum_status)", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "In Quorum", + "refId": "A", + "units": "none", + "valueHandler": "Text Only" + }, + { + "aggregation": "Last", + "alias": "Total", + "crit": 1, + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_mon_quorum_status)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Total", + "refId": "B", + "units": "none", + "valueHandler": "Text Only", + "warn": 2 + }, + { + "aggregation": "Last", + "alias": "MONs out of Quorum", + "crit": 1.6, + "decimals": 2, + "displayAliasType": "Warning / Critical", + "displayType": "Annotation", + "displayValueWithAlias": "Never", + "expr": "count(ceph_mon_quorum_status) - sum(ceph_mon_quorum_status)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "MONs out of Quorum", + "refId": "C", + "units": "none", + "valueHandler": "Number Threshold", + "warn": 1.1 + } + ], + "title": "Monitors", + "type": "stat" + }, + { + "colorMode": "Panel", + "colors": { + "crit": "rgba(245, 54, 54, 0.9)", + "disable": "rgba(128, 128, 128, 0.9)", + "ok": "rgba(50, 128, 45, 0.9)", + "warn": "rgba(237, 129, 40, 0.9)" + }, + "cornerRadius": 1, + "datasource": "${prometheusds}", + "displayName": "", + "flipCard": false, + "flipTime": 5, + "fontFormat": "Regular", + "gridPos": { + "h": 3, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 68, + "isAutoScrollOnOverflow": false, + "isGrayOnNoData": false, + "isHideAlertsOnDisable": false, + "isIgnoreOKColors": false, + "links": [], + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + } + }, + "targets": [ + { + "aggregation": "Last", + "alias": "Active", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_mgr_status == 1) or vector(0)", + "format": "time_series", + "intervalFactor": 1, + "instant": true, + "legendFormat": "Active", + "refId": "A", + "units": "none", + "valueHandler": "Number Threshold" + }, + { + "aggregation": "Last", + "alias": "Standby", + "decimals": 2, + "displayAliasType": "Always", + "displayType": "Regular", + "displayValueWithAlias": "When Alias Displayed", + "expr": "count(ceph_mgr_status == 0) or vector(0)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "Standby", + "refId": "B", + "units": "none", + "valueHandler": "Number Threshold" + } + ], + "title": "MGRs", + "type": "stat" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "decimals": 2, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 6 + }, + "id": 47, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ceph_osd_stat_bytes_used)/sum(ceph_osd_stat_bytes)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Used", + "refId": "A" + } + ], + "thresholds": "0.7,0.8", + "title": "Capacity used", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 0, + "gridPos": { + "h": 6, + "w": 9, + "x": 6, + "y": 6 + }, + "id": 53, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Active", + "color": "#508642", + "fill": 1, + "stack": "A" + }, + { + "alias": "Total", + "color": "#f9e2d2" + }, + { + "alias": "Degraded", + "color": "#eab839" + }, + { + "alias": "Undersized", + "color": "#f9934e" + }, + { + "alias": "Inconsistent", + "color": "#e24d42" + }, + { + "alias": "Down", + "color": "#bf1b00" + }, + { + "alias": "Inactive", + "color": "#bf1b00", + "fill": 4, + "linewidth": 0, + "stack": "A" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(ceph_pg_total)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Total", + "refId": "A" + }, + { + "expr": "sum(ceph_pg_active)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Active", + "refId": "B" + }, + { + "expr": "sum(ceph_pg_total - ceph_pg_active)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Inactive", + "refId": "G" + }, + { + "expr": "sum(ceph_pg_undersized)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Undersized", + "refId": "F" + }, + { + "expr": "sum(ceph_pg_degraded)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Degraded", + "refId": "C" + }, + { + "expr": "sum(ceph_pg_inconsistent)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Inconsistent", + "refId": "D" + }, + { + "expr": "sum(ceph_pg_down)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Down", + "refId": "E" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "PG States", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 0, + "gridPos": { + "h": 6, + "w": 9, + "x": 15, + "y": 6 + }, + "id": 66, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Avg Apply Latency", + "color": "#7eb26d" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "quantile(0.95, ceph_osd_apply_latency_ms)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Apply Latency P_95", + "refId": "A" + }, + { + "expr": "quantile(0.95, ceph_osd_commit_latency_ms)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Commit Latency P_95", + "refId": "B" + }, + { + "expr": "avg(ceph_osd_apply_latency_ms)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Avg Apply Latency", + "refId": "C" + }, + { + "expr": "avg(ceph_osd_commit_latency_ms)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Avg Commit Latency", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "OSD Latencies", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 45, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 0.5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Reads", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(ceph_osd_op_w_in_bytes[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Writes", + "refId": "A" + }, + { + "expr": "sum(irate(ceph_osd_op_r_out_bytes[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Reads", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Cluster I/O", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 62, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(deriv(ceph_pool_stored[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "In-/Egress", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": " Egress (-) / Ingress (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "cards": { + "cardPadding": null, + "cardRound": 1 + }, + "color": { + "cardColor": "rgb(0, 254, 255)", + "colorScale": "sqrt", + "colorScheme": "interpolateBlues", + "exponent": 0.5, + "min": null, + "mode": "spectrum" + }, + "dataFormat": "timeseries", + "datasource": "${prometheusds}", + "gridPos": { + "h": 9, + "w": 6, + "x": 0, + "y": 15 + }, + "heatmap": {}, + "highlightCards": true, + "id": 55, + "legend": { + "show": true + }, + "links": [], + "span": 12, + "targets": [ + { + "expr": "ceph_osd_stat_bytes_used / ceph_osd_stat_bytes", + "format": "time_series", + "interval": "1m", + "intervalFactor": 1, + "legendFormat": "Util (%)", + "refId": "A", + "step": 60 + } + ], + "timeFrom": null, + "title": "OSD Capacity Utilization", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": "", + "yAxis": { + "decimals": 2, + "format": "percentunit", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketNumber": null, + "yBucketSize": null + }, + { + "cards": { + "cardPadding": null, + "cardRound": 1 + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateBlues", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "timeseries", + "datasource": "${prometheusds}", + "gridPos": { + "h": 9, + "w": 6, + "x": 6, + "y": 15 + }, + "heatmap": {}, + "highlightCards": true, + "id": 59, + "legend": { + "show": true + }, + "links": [], + "targets": [ + { + "expr": "ceph_osd_numpg", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "#PGs", + "refId": "A" + } + ], + "title": "PGs per OSD", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": "", + "yAxis": { + "decimals": null, + "format": "none", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 64, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(ceph_osd_recovery_ops[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Op/s", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Recovery Rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": "Recovery Ops/s", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "refresh": "30s", + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph", + "cluster" + ], + "templating": { + "list": [ + { + "auto": true, + "auto_count": 10, + "auto_min": "1m", + "current": { + "text": "auto", + "value": "$__auto_interval_interval" + }, + "datasource": null, + "hide": 0, + "includeAll": false, + "label": "Interval", + "multi": false, + "name": "interval", + "options": [ + { + "selected": true, + "text": "auto", + "value": "$__auto_interval_interval" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "refresh": 2, + "type": "interval" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Ceph - Cluster", + "version": 13 + } diff --git a/ceph-mon/files/grafana_dashboards/cephfs-overview.json b/ceph-mon/files/grafana_dashboards/cephfs-overview.json new file mode 100644 index 00000000..dd3a025f --- /dev/null +++ b/ceph-mon/files/grafana_dashboards/cephfs-overview.json @@ -0,0 +1,348 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "MDS Performance", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*Reads/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(ceph_objecter_op_r{job=~\"$job\", ceph_daemon=~\"($mds_servers).*\"}[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Read Ops", + "refId": "A" + }, + { + "expr": "sum(rate(ceph_objecter_op_w{job=~\"$job\", ceph_daemon=~\"($mds_servers).*\"}[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Write Ops", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "MDS Workload - $mds_servers", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "none", + "label": "Reads(-) / Writes (+)", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "ceph_mds_server_handle_client_request{job=~\"$job\", ceph_daemon=~\"($mds_servers).*\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ceph_daemon}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Client Request Load - $mds_servers", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "none", + "label": "Client Requests", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": "MDS Server", + "multi": false, + "name": "mds_servers", + "options": [ ], + "query": "label_values(ceph_mds_inodes{job=~\"$job\"}, ceph_daemon)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "MDS Performance", + "uid": "tbO9LAiZz", + "version": 0 +} diff --git a/ceph-mon/files/grafana_dashboards/host-details.json b/ceph-mon/files/grafana_dashboards/host-details.json new file mode 100644 index 00000000..5d6a3060 --- /dev/null +++ b/ceph-mon/files/grafana_dashboards/host-details.json @@ -0,0 +1,1314 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "$ceph_hosts System Overview", + "titleSize": "h6", + "type": "row" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 3, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(sum by (ceph_daemon) (ceph_osd_metadata{job=~\"$job\", instance='$ceph_hosts'}))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "OSDs", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": { + "interrupt": "#447EBC", + "steal": "#6D1F62", + "system": "#890F02", + "user": "#3F6833", + "wait": "#C15C17" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Shows the CPU breakdown. When multiple servers are selected, only the first host's cpu data is shown", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 6, + "x": 3, + "y": 1 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (mode) (\n rate(node_cpu{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[$__rate_interval]) or\n rate(node_cpu_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\", mode=~\"(irq|nice|softirq|steal|system|user|iowait)\"}[$__rate_interval])\n) / (\n scalar(\n sum(rate(node_cpu{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_cpu_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]))\n ) * 100\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{mode}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percent", + "label": "% Utilization", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + "Available": "#508642", + "Free": "#508642", + "Total": "#bf1b00", + "Used": "#bf1b00", + "total": "#bf1b00", + "used": "#0a50a1" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 6, + "x": 9, + "y": 1 + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "total", + "color": "#bf1b00", + "fill": 0, + "linewidth": 2, + "stack": false + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemFree{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_MemFree_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Free", + "refId": "A" + }, + { + "expr": "node_memory_MemTotal{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_MemTotal_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "total", + "refId": "B" + }, + { + "expr": "(\n node_memory_Cached{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Cached_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n) + (\n node_memory_Buffers{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Buffers_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n) + (\n node_memory_Slab{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Slab_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "buffers/cache", + "refId": "C" + }, + { + "expr": "(\n node_memory_MemTotal{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_MemTotal_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n) - (\n (\n node_memory_MemFree{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_MemFree_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n ) + (\n node_memory_Cached{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Cached_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n ) + (\n node_memory_Buffers{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Buffers_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n ) +\n (\n node_memory_Slab{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"} or\n node_memory_Slab_bytes{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}\n )\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "used", + "refId": "D" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "RAM Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": "RAM used", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Show the network load (rx,tx) across all interfaces (excluding loopback 'lo')", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 6, + "x": 15, + "y": 1 + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (device) (\n rate(\n node_network_receive_bytes{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_receive_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval]\n )\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}.rx", + "refId": "A" + }, + { + "expr": "sum by (device) (\n rate(node_network_transmit_bytes{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_transmit_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\",device!=\"lo\"}[$__rate_interval])\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}.tx", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Network Load", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "decbytes", + "label": "Send (-) / Receive (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 3, + "x": 21, + "y": 1 + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_drop{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_receive_drop_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}.rx", + "refId": "A" + }, + { + "expr": "rate(node_network_transmit_drop{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_transmit_drop_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}.tx", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Network drop rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "pps", + "label": "Send (-) / Receive (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "description": "Each OSD consists of a Journal/WAL partition and a data partition. The RAW Capacity shown is the sum of the data partitions across all OSDs on the selected OSD hosts.", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 0, + "y": 6 + }, + "id": 8, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(\n ceph_osd_stat_bytes{job=~\"$job\"} and\n on (ceph_daemon) ceph_disk_occupation{job=~\"$job\", instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Raw Capacity", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 3, + "x": 21, + "y": 6 + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*tx/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_errs{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_receive_errs_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}.rx", + "refId": "A" + }, + { + "expr": "rate(node_network_transmit_errs{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_network_transmit_errs_total{instance=~\"$ceph_hosts([\\\\\\\\.:].*)?\"}[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}.tx", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Network error rate", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "pps", + "label": "Send (-) / Receive (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 11 + }, + "id": 10, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "OSD Disk Performance Statistics", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "For any OSD devices on the host, this chart shows the iops per physical device. Each device is shown by it's name and corresponding OSD id value", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 11, + "x": 0, + "y": 12 + }, + "id": 11, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*reads/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n (\n rate(node_disk_writes_completed{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_writes_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}({{ceph_daemon}}) writes", + "refId": "A" + }, + { + "expr": "label_replace(\n (\n rate(node_disk_reads_completed{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_reads_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\"},\"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}({{ceph_daemon}}) reads", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$ceph_hosts Disk IOPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ops", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. Each device is shown by device name, and corresponding OSD id", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 11, + "x": 12, + "y": 12 + }, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*read/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n (\n rate(node_disk_bytes_written{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_written_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device)\n group_left(ceph_daemon) label_replace(\n label_replace(ceph_disk_occupation_human{job=~\"$job\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}({{ceph_daemon}}) write", + "refId": "A" + }, + { + "expr": "label_replace(\n (\n rate(node_disk_bytes_read{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) or\n rate(node_disk_read_bytes_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])\n ),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") * on(instance, device)\n group_left(ceph_daemon) label_replace(\n label_replace(ceph_disk_occupation_human{job=~\"$job\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}({{ceph_daemon}}) read", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$ceph_hosts Throughput by Disk", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "For OSD hosts, this chart shows the latency at the physical drive. Each drive is shown by device name, with it's corresponding OSD id", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 11, + "x": 0, + "y": 21 + }, + "id": 13, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max by(instance, device) (label_replace(\n (rate(node_disk_write_time_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])) /\n clamp_min(rate(node_disk_writes_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]), 0.001) or\n (rate(node_disk_read_time_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval])) /\n clamp_min(rate(node_disk_reads_completed_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]), 0.001),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}({{ceph_daemon}})", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$ceph_hosts Disk Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 11, + "x": 12, + "y": 21 + }, + "id": 14, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n (\n (rate(node_disk_io_time_ms{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) / 10) or\n rate(node_disk_io_time_seconds_total{instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"}[$__rate_interval]) * 100\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(ceph_disk_occupation_human{job=~\"$job\", instance=~\"($ceph_hosts)([\\\\\\\\.:].*)?\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}}({{ceph_daemon}})", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$ceph_hosts Disk utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percent", + "label": "%Util", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "columns": [ ], + "datasource": "${prometheusds}", + "description": "This table shows the 10 hosts with the highest number of slow ops", + "gridPos": { + "h": 8, + "w": 4, + "x": 0, + "y": 40 + }, + "id": 15, + "links": [ ], + "sort": { + "col": 2, + "desc": true + }, + "styles": [ + { + "alias": "Instance", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "instance", + "thresholds": [ ], + "type": "string", + "unit": "short", + "valueMaps": [ ] + }, + { + "alias": "Slow Ops", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value", + "thresholds": [ ], + "type": "number", + "unit": "none", + "valueMaps": [ ] + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "/.*/", + "thresholds": [ ], + "type": "hidden", + "unit": "short", + "valueMaps": [ ] + } + ], + "targets": [ + { + "expr": "topk(10,\n (sum by (instance)(ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\"}))\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Top Slow Ops per Host", + "transform": "table", + "type": "table" + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin", + "overview" + ], + "templating": { + "list": [ + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": false, + "label": "Hostname", + "multi": false, + "name": "ceph_hosts", + "options": [ ], + "query": "label_values(ceph_disk_occupation, instance)", + "refresh": 1, + "regex": "(.*)", + "sort": 3, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Ceph OSD Host Details", + "uid": "rtOg0AiWz", + "version": 0 +} diff --git a/ceph-mon/files/grafana_dashboards/hosts-overview.json b/ceph-mon/files/grafana_dashboards/hosts-overview.json new file mode 100644 index 00000000..d9250e5b --- /dev/null +++ b/ceph-mon/files/grafana_dashboards/hosts-overview.json @@ -0,0 +1,880 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 2, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(sum by (instance) (ceph_osd_metadata{job=~\"$job\"}))", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "OSD Hosts", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "description": "Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster", + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 3, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg(1 - (\n avg by(instance) (\n rate(node_cpu_seconds_total{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval]) or\n rate(node_cpu{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval])\n )\n))\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "AVG CPU Busy", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "description": "Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)", + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 4, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg ((\n (\n node_memory_MemTotal{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_MemTotal_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n ) - ((\n node_memory_MemFree{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_MemFree_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}) +\n (\n node_memory_Cached{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_Cached_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n ) + (\n node_memory_Buffers{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_Buffers_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n ) + (\n node_memory_Slab{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_Slab_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}\n )\n )\n) / (\n node_memory_MemTotal{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or\n node_memory_MemTotal_bytes{instance=~\"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*\"}\n))\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "AVG RAM Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "description": "IOPS Load at the device as reported by the OS on all OSD hosts", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 12, + "y": 0 + }, + "id": 5, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum ((\n rate(node_disk_reads_completed{instance=~\"($osd_hosts).*\"}[$__rate_interval]) or\n rate(node_disk_reads_completed_total{instance=~\"($osd_hosts).*\"}[$__rate_interval])\n) + (\n rate(node_disk_writes_completed{instance=~\"($osd_hosts).*\"}[$__rate_interval]) or\n rate(node_disk_writes_completed_total{instance=~\"($osd_hosts).*\"}[$__rate_interval])\n))\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Physical IOPS", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "description": "Average Disk utilization for all OSD data devices (i.e. excludes journal/WAL)", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 16, + "y": 0 + }, + "id": 6, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg (\n label_replace(\n (rate(node_disk_io_time_ms[$__rate_interval]) / 10 ) or\n (rate(node_disk_io_time_seconds_total[$__rate_interval]) * 100),\n \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n ) * on(instance, device) group_left(ceph_daemon) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", instance=~\"($osd_hosts).*\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n )\n)\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "AVG Disk Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "description": "Total send/receive network load across all hosts in the ceph cluster", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 20, + "y": 0 + }, + "id": 7, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum (\n (\n rate(node_network_receive_bytes{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_receive_bytes_total{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n ) unless on (device, instance)\n label_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n) +\nsum (\n (\n rate(node_network_transmit_bytes{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n ) unless on (device, instance)\n label_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n)\n", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Network Load", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Show the top 10 busiest hosts by cpu", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(10,\n 100 * (\n 1 - (\n avg by(instance) (\n rate(node_cpu_seconds_total{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval]) or\n rate(node_cpu{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[$__rate_interval])\n )\n )\n )\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Busy - Top 10 Hosts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Top 10 hosts by network load", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(10, (sum by(instance) (\n(\n rate(node_network_receive_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_receive_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n) +\n(\n rate(node_network_transmit_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval]) or\n rate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[$__rate_interval])\n) unless on (device, instance)\n label_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\"))\n))\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Network Load - Top 10 Hosts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "osd_hosts", + "options": [ ], + "query": "label_values(ceph_disk_occupation{job=~\"$job\"}, exported_instance)", + "refresh": 1, + "regex": "([^.]*).*", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "mon_hosts", + "options": [ ], + "query": "label_values(ceph_mon_metadata{job=~\"$job\"}, ceph_daemon)", + "refresh": 1, + "regex": "mon.(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "mds_hosts", + "options": [ ], + "query": "label_values(ceph_mds_inodes{job=~\"$job\"}, ceph_daemon)", + "refresh": 1, + "regex": "mds.(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "rgw_hosts", + "options": [ ], + "query": "label_values(ceph_rgw_metadata{job=~\"$job\"}, ceph_daemon)", + "refresh": 1, + "regex": "rgw.(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Ceph OSD Host Overview", + "uid": "y0KGL0iZz", + "version": 0 +} diff --git a/ceph-mon/files/grafana_dashboards/osd-device-details.json b/ceph-mon/files/grafana_dashboards/osd-device-details.json new file mode 100644 index 00000000..2e817476 --- /dev/null +++ b/ceph-mon/files/grafana_dashboards/osd-device-details.json @@ -0,0 +1,857 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "OSD Performance", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "read", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_osd_op_r_latency_sum{job=~\"$job\", ceph_daemon=~\"$osd\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{job=~\"$job\"}[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "read", + "refId": "A" + }, + { + "expr": "rate(ceph_osd_op_w_latency_sum{job=~\"$job\", ceph_daemon=~\"$osd\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{job=~\"$job\"}[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "write", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$osd Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "Reads", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_osd_op_r{job=~\"$job\", ceph_daemon=~\"$osd\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Reads", + "refId": "A" + }, + { + "expr": "rate(ceph_osd_op_w{job=~\"$job\", ceph_daemon=~\"$osd\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Writes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$osd R/W IOPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "Read Bytes", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_osd_op_r_out_bytes{job=~\"$job\", ceph_daemon=~\"$osd\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Read Bytes", + "refId": "A" + }, + { + "expr": "rate(ceph_osd_op_w_in_bytes{job=~\"$job\", ceph_daemon=~\"$osd\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Write Bytes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$osd R/W Bytes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 6, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Physical Device Performance", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 0, + "y": 11 + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*Reads/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(\n label_replace(\n rate(node_disk_read_time_seconds_total{}[$__rate_interval]) /\n rate(node_disk_reads_completed_total{}[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n ) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}/{{device}} Reads", + "refId": "A" + }, + { + "expr": "(\n label_replace(\n rate(node_disk_write_time_seconds_total{}[$__rate_interval]) /\n rate(node_disk_writes_completed_total{}[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\") and on (instance, device)\n label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n )\n )\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}/{{device}} Writes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Physical Device Latency for $osd", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 6, + "y": 11 + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*Reads/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n rate(node_disk_writes_completed_total{}[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} on {{instance}} Writes", + "refId": "A" + }, + { + "expr": "label_replace(\n rate(node_disk_reads_completed_total{}[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} on {{instance}} Reads", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Physical Device R/W IOPS for $osd", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 12, + "y": 11 + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*Reads/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n rate(node_disk_read_bytes_total{}[$__rate_interval]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}} {{device}} Reads", + "refId": "A" + }, + { + "expr": "label_replace(\n rate(node_disk_written_bytes_total{}[$__rate_interval]), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", ceph_daemon=~\"$osd\"},\n \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}} {{device}} Writes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Physical Device R/W Bytes for $osd", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 6, + "x": 18, + "y": 11 + }, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n rate(node_disk_io_time_seconds_total{}[$__rate_interval]),\n \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n) and on (instance, device) label_replace(\n label_replace(\n ceph_disk_occupation_human{job=~\"$job\", ceph_daemon=~\"$osd\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"\n ), \"instance\", \"$1\", \"instance\", \"([^:.]*).*\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device}} on {{instance}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Physical Device Util% for $osd", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": false, + "label": "OSD", + "multi": false, + "name": "osd", + "options": [ ], + "query": "label_values(ceph_osd_metadata{job=~\"$job\"}, ceph_daemon)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "OSD device details", + "uid": "CrAHE0iZz", + "version": 0 +} diff --git a/ceph-mon/files/grafana_dashboards/osds-overview.json b/ceph-mon/files/grafana_dashboards/osds-overview.json new file mode 100644 index 00000000..50d0254f --- /dev/null +++ b/ceph-mon/files/grafana_dashboards/osds-overview.json @@ -0,0 +1,1028 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.0.0" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "table", + "name": "Table", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "aliasColors": { + "@95%ile": "#e0752d" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg (\n rate(ceph_osd_op_r_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{job=~\"$job\"}[$__rate_interval]) * 1000\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "AVG read", + "refId": "A" + }, + { + "expr": "max(\n rate(ceph_osd_op_r_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{job=~\"$job\"}[$__rate_interval]) * 1000\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "MAX read", + "refId": "B" + }, + { + "expr": "quantile(0.95,\n (\n rate(ceph_osd_op_r_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{job=~\"$job\"}[$__rate_interval])\n * 1000\n )\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "@95%ile", + "refId": "C" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "OSD Read Latencies", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + }, + { + "columns": [ ], + "datasource": "${prometheusds}", + "description": "This table shows the osd's that are delivering the 10 highest read latencies within the cluster", + "gridPos": { + "h": 8, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 3, + "links": [ ], + "sort": { + "col": 2, + "desc": true + }, + "styles": [ + { + "alias": "OSD ID", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "ceph_daemon", + "thresholds": [ ], + "type": "string", + "unit": "short", + "valueMaps": [ ] + }, + { + "alias": "Latency (ms)", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value", + "thresholds": [ ], + "type": "number", + "unit": "none", + "valueMaps": [ ] + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "/.*/", + "thresholds": [ ], + "type": "hidden", + "unit": "short", + "valueMaps": [ ] + } + ], + "targets": [ + { + "expr": "topk(10,\n (sort(\n (\n rate(ceph_osd_op_r_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_r_latency_count{job=~\"$job\"}[$__rate_interval]) *\n 1000\n )\n ))\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Highest READ Latencies", + "transform": "table", + "type": "table" + }, + { + "aliasColors": { + "@95%ile write": "#e0752d" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 12, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(\n rate(ceph_osd_op_w_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{job=~\"$job\"}[$__rate_interval])\n * 1000\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "AVG write", + "refId": "A" + }, + { + "expr": "max(\n rate(ceph_osd_op_w_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{job=~\"$job\"}[$__rate_interval]) *\n 1000\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "MAX write", + "refId": "B" + }, + { + "expr": "quantile(0.95, (\n rate(ceph_osd_op_w_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{job=~\"$job\"}[$__rate_interval]) *\n 1000\n))\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "@95%ile write", + "refId": "C" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "OSD Write Latencies", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + }, + { + "columns": [ ], + "datasource": "${prometheusds}", + "description": "This table shows the osd's that are delivering the 10 highest write latencies within the cluster", + "gridPos": { + "h": 8, + "w": 4, + "x": 20, + "y": 0 + }, + "id": 5, + "links": [ ], + "sort": { + "col": 2, + "desc": true + }, + "styles": [ + { + "alias": "OSD ID", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "ceph_daemon", + "thresholds": [ ], + "type": "string", + "unit": "short", + "valueMaps": [ ] + }, + { + "alias": "Latency (ms)", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value", + "thresholds": [ ], + "type": "number", + "unit": "none", + "valueMaps": [ ] + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "/.*/", + "thresholds": [ ], + "type": "hidden", + "unit": "short", + "valueMaps": [ ] + } + ], + "targets": [ + { + "expr": "topk(10,\n (sort(\n (rate(ceph_osd_op_w_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n on (ceph_daemon) rate(ceph_osd_op_w_latency_count{job=~\"$job\"}[$__rate_interval]) *\n 1000)\n ))\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Highest WRITE Latencies", + "transform": "table", + "type": "table" + }, + { + "aliasColors": { }, + "datasource": "${prometheusds}", + "description": "", + "gridPos": { + "h": 8, + "w": 4, + "x": 0, + "y": 8 + }, + "id": 6, + "legend": { + "percentage": true, + "show": true, + "values": true + }, + "legendType": "Under graph", + "pieType": "pie", + "targets": [ + { + "expr": "count by (device_class) (ceph_osd_metadata{job=~\"$job\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{device_class}}", + "refId": "A" + } + ], + "title": "OSD Types Summary", + "type": "piechart", + "valueName": "current" + }, + { + "aliasColors": { + "Non-Encrypted": "#E5AC0E" + }, + "datasource": "${prometheusds}", + "description": "", + "gridPos": { + "h": 8, + "w": 4, + "x": 4, + "y": 8 + }, + "id": 7, + "legend": { + "percentage": true, + "show": true, + "values": true + }, + "legendType": "Under graph", + "pieType": "pie", + "targets": [ + { + "expr": "count(ceph_bluefs_wal_total_bytes{job=~\"$job\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "bluestore", + "refId": "A" + }, + { + "expr": "absent(ceph_bluefs_wal_total_bytes{job=~\"$job\"}) * count(ceph_osd_metadata{job=~\"$job\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "filestore", + "refId": "B" + } + ], + "title": "OSD Objectstore Types", + "type": "piechart", + "valueName": "current" + }, + { + "aliasColors": { }, + "datasource": "${prometheusds}", + "description": "The pie chart shows the various OSD sizes used within the cluster", + "gridPos": { + "h": 8, + "w": 4, + "x": 8, + "y": 8 + }, + "id": 8, + "legend": { + "percentage": true, + "show": true, + "values": true + }, + "legendType": "Under graph", + "pieType": "pie", + "targets": [ + { + "expr": "count(ceph_osd_stat_bytes{job=~\"$job\"} < 1099511627776)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<1TB", + "refId": "A" + }, + { + "expr": "count(ceph_osd_stat_bytes{job=~\"$job\"} >= 1099511627776 < 2199023255552)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<2TB", + "refId": "B" + }, + { + "expr": "count(ceph_osd_stat_bytes{job=~\"$job\"} >= 2199023255552 < 3298534883328)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<3TB", + "refId": "C" + }, + { + "expr": "count(ceph_osd_stat_bytes{job=~\"$job\"} >= 3298534883328 < 4398046511104)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<4TB", + "refId": "D" + }, + { + "expr": "count(ceph_osd_stat_bytes{job=~\"$job\"} >= 4398046511104 < 6597069766656)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<6TB", + "refId": "E" + }, + { + "expr": "count(ceph_osd_stat_bytes{job=~\"$job\"} >= 6597069766656 < 8796093022208)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<8TB", + "refId": "F" + }, + { + "expr": "count(ceph_osd_stat_bytes{job=~\"$job\"} >= 8796093022208 < 10995116277760)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<10TB", + "refId": "G" + }, + { + "expr": "count(ceph_osd_stat_bytes{job=~\"$job\"} >= 10995116277760 < 13194139533312)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<12TB", + "refId": "H" + }, + { + "expr": "count(ceph_osd_stat_bytes{job=~\"$job\"} >= 13194139533312)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "<12TB+", + "refId": "I" + } + ], + "title": "OSD Size Summary", + "type": "piechart", + "valueName": "current" + }, + { + "aliasColors": { }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 8, + "x": 12, + "y": 8 + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "ceph_osd_numpg{job=~\"$job\"}", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "PGs per OSD", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Distribution of PGs per OSD", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": 20, + "mode": "histogram", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "# of OSDs", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "description": "This gauge panel shows onode Hits ratio to help determine if increasing RAM per OSD could help improve the performance of the cluster", + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 20, + "y": 8 + }, + "id": 10, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ceph_bluestore_onode_hits{job=~\"$job\"}) / (\n sum(ceph_bluestore_onode_hits{job=~\"$job\"}) +\n sum(ceph_bluestore_onode_misses{job=~\"$job\"})\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": ".75", + "title": "OSD onode Hits Ratio", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 11, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "R/W Profile", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Show the read/write workload profile overtime", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "round(sum(rate(ceph_pool_rd{job=~\"$job\"}[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Reads", + "refId": "A" + }, + { + "expr": "round(sum(rate(ceph_pool_wr{job=~\"$job\"}[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Writes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Read/Write Profile", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "columns": [ ], + "datasource": "${prometheusds}", + "description": "This table shows the 10 OSDs with the highest number of slow ops", + "gridPos": { + "h": 8, + "w": 4, + "x": 0, + "y": 20 + }, + "id": 13, + "links": [ ], + "sort": { + "col": 2, + "desc": true + }, + "styles": [ + { + "alias": "OSD ID", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "ceph_daemon", + "thresholds": [ ], + "type": "string", + "unit": "short", + "valueMaps": [ ] + }, + { + "alias": "Slow Ops", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value", + "thresholds": [ ], + "type": "number", + "unit": "none", + "valueMaps": [ ] + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "/.*/", + "thresholds": [ ], + "type": "hidden", + "unit": "short", + "valueMaps": [ ] + } + ], + "targets": [ + { + "expr": "topk(10,\n (ceph_daemon_health_metrics{type=\"SLOW_OPS\", ceph_daemon=~\"osd.*\"})\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Top Slow Ops", + "transform": "table", + "type": "table" + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "OSD Overview", + "uid": "lo02I1Aiz", + "version": 0 +} diff --git a/ceph-mon/files/grafana_dashboards/pool-detail.json b/ceph-mon/files/grafana_dashboards/pool-detail.json new file mode 100644 index 00000000..9e4cf744 --- /dev/null +++ b/ceph-mon/files/grafana_dashboards/pool-detail.json @@ -0,0 +1,694 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "singlestat", + "name": "Singlestat", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 7, + "x": 0, + "y": 0 + }, + "id": 2, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "(ceph_pool_stored{job=~\"$job\"} / (ceph_pool_stored{job=~\"$job\"} + ceph_pool_max_avail{job=~\"$job\"})) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{job=~\"$job\", name=~\"$pool_name\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": ".7,.8", + "title": "Capacity used", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": 100, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "description": "Time till pool is full assuming the average fill rate of the last 6 hours", + "format": "s", + "gauge": { + "maxValue": false, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 5, + "x": 7, + "y": 0 + }, + "id": 3, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": "" + }, + "tableColumn": "", + "targets": [ + { + "expr": "(ceph_pool_max_avail{job=~\"$job\"} / deriv(ceph_pool_stored{job=~\"$job\"}[6h])) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{job=~\"$job\", name=~\"$pool_name\"} > 0\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "current", + "title": "Time till full", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": false + }, + { + "aliasColors": { + "read_op_per_sec": "#3F6833", + "write_op_per_sec": "#E5AC0E" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "deriv(ceph_pool_objects{job=~\"$job\"}[$__rate_interval]) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{job=~\"$job\", name=~\"$pool_name\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Objects per second", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$pool_name Object Ingress/Egress", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ops", + "label": "Objects out(-) / in(+) ", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + "read_op_per_sec": "#3F6833", + "write_op_per_sec": "#E5AC0E" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "reads", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_pool_rd{job=~\"$job\"}[$__rate_interval]) *\n on(pool_id) group_left(instance,name) ceph_pool_metadata{job=~\"$job\", name=~\"$pool_name\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "reads", + "refId": "A" + }, + { + "expr": "rate(ceph_pool_wr{job=~\"$job\"}[$__rate_interval]) *\n on(pool_id) group_left(instance, name) ceph_pool_metadata{job=~\"$job\", name=~\"$pool_name\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "writes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$pool_name Client IOPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "iops", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + "read_op_per_sec": "#3F6833", + "write_op_per_sec": "#E5AC0E" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "reads", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_pool_rd_bytes{job=~\"$job\"}[$__rate_interval]) +\n on(pool_id) group_left(instance, name) ceph_pool_metadata{job=~\"$job\", name=~\"$pool_name\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "reads", + "refId": "A" + }, + { + "expr": "rate(ceph_pool_wr_bytes{job=~\"$job\"}[$__rate_interval]) +\n on(pool_id) group_left(instance,name) ceph_pool_metadata{job=~\"$job\", name=~\"$pool_name\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "writes", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$pool_name Client Throughput", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": "Read (-) / Write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + "read_op_per_sec": "#3F6833", + "write_op_per_sec": "#E5AC0E" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "ceph_pool_objects{job=~\"$job\"} *\n on(pool_id) group_left(instance,name) ceph_pool_metadata{job=~\"$job\", name=~\"$pool_name\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Number of Objects", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$pool_name Objects", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "Objects", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 22, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": false, + "label": "Pool Name", + "multi": false, + "name": "pool_name", + "options": [ ], + "query": "label_values(ceph_pool_metadata{job=~\"$job\"}, name)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Ceph Pool Details", + "uid": "-xyV8KCiz", + "version": 0 +} diff --git a/ceph-mon/files/grafana_dashboards/pool-overview.json b/ceph-mon/files/grafana_dashboards/pool-overview.json new file mode 100644 index 00000000..6316d4c5 --- /dev/null +++ b/ceph-mon/files/grafana_dashboards/pool-overview.json @@ -0,0 +1,1711 @@ +{ + "__inputs": [ ], + "__requires": [ ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 0 + }, + "id": 2, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(ceph_pool_metadata{job=~\"$job\"})", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Pools", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "description": "Count of the pools that have compression enabled", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 3, + "y": 0 + }, + "id": 3, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(ceph_pool_metadata{job=~\"$job\", compression_mode!=\"none\"})", + "format": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Pools with Compression", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "description": "Total raw capacity available to the cluster", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 6, + "y": 0 + }, + "id": 4, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ceph_osd_stat_bytes{job=~\"$job\"})", + "format": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Total Raw Capacity", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "description": "Total raw capacity consumed by user data and associated overheads (metadata + redundancy)", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 9, + "y": 0 + }, + "id": 5, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ceph_pool_bytes_used{job=~\"$job\"})", + "format": "", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Raw Capacity Consumed", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "description": "Total of client data stored in the cluster", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 12, + "y": 0 + }, + "id": 6, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(ceph_pool_stored{job=~\"$job\"})", + "format": "", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Logical Stored ", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "description": "A compression saving is determined as the data eligible to be compressed minus the capacity used to store the data after compression", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 15, + "y": 0 + }, + "id": 7, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(\n ceph_pool_compress_under_bytes{job=~\"$job\"} -\n ceph_pool_compress_bytes_used{job=~\"$job\"}\n)\n", + "format": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Compression Savings", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "description": "Indicates how suitable the data is within the pools that are/have been enabled for compression - averaged across all pools holding compressed data", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 18, + "y": 0 + }, + "id": 8, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(\n sum(ceph_pool_compress_under_bytes{job=~\"$job\"} > 0) /\n sum(ceph_pool_stored_raw{job=~\"$job\"} and ceph_pool_compress_under_bytes{job=~\"$job\"} > 0)\n) * 100\n", + "format": "table", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Compression Eligibility", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "${prometheusds}", + "description": "This factor describes the average ratio of data eligible to be compressed divided by the data actually stored. It does not account for data written that was ineligible for compression (too small, or compression yield too low)", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 21, + "y": 0 + }, + "id": 9, + "interval": null, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(\n ceph_pool_compress_under_bytes{job=~\"$job\"} > 0)\n / sum(ceph_pool_compress_bytes_used{job=~\"$job\"} > 0\n)\n", + "format": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Compression Factor", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "columns": [ ], + "datasource": "${prometheusds}", + "fieldConfig": { + "defaults": { + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Time" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "instance" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "job" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "name" + }, + "properties": [ + { + "id": "displayName", + "value": "Pool Name" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "pool_id" + }, + "properties": [ + { + "id": "displayName", + "value": "Pool ID" + }, + { + "id": "unit", + "value": "none" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #A" + }, + "properties": [ + { + "id": "displayName", + "value": "Compression Factor" + }, + { + "id": "unit", + "value": "none" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #D" + }, + "properties": [ + { + "id": "displayName", + "value": "% Used" + }, + { + "id": "unit", + "value": "percentunit" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(245, 54, 54, 0.9)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 70 + }, + { + "color": "rgba(50, 172, 45, 0.97)", + "value": 85 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #B" + }, + "properties": [ + { + "id": "displayName", + "value": "Usable Free" + }, + { + "id": "unit", + "value": "bytes" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #C" + }, + "properties": [ + { + "id": "displayName", + "value": "Compression Eligibility" + }, + { + "id": "unit", + "value": "percent" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #E" + }, + "properties": [ + { + "id": "displayName", + "value": "Compression Savings" + }, + { + "id": "unit", + "value": "bytes" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #F" + }, + "properties": [ + { + "id": "displayName", + "value": "Growth (5d)" + }, + { + "id": "unit", + "value": "bytes" + }, + { + "id": "decimals", + "value": 2 + }, + { + "id": "custom.cellOptions", + "value": { + "type": "color-text" + } + }, + { + "id": "thresholds", + "value": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(245, 54, 54, 0.9)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 70 + }, + { + "color": "rgba(50, 172, 45, 0.97)", + "value": 85 + } + ] + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #G" + }, + "properties": [ + { + "id": "displayName", + "value": "IOPS" + }, + { + "id": "unit", + "value": "none" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #H" + }, + "properties": [ + { + "id": "displayName", + "value": "Bandwidth" + }, + { + "id": "unit", + "value": "Bps" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "__name__" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "type" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "compression_mode" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "description" + }, + "properties": [ + { + "id": "displayName", + "value": "Type" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #J" + }, + "properties": [ + { + "id": "displayName", + "value": "Stored" + }, + { + "id": "unit", + "value": "bytes" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #I" + }, + "properties": [ + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Value #K" + }, + "properties": [ + { + "id": "displayName", + "value": "Compression" + }, + { + "id": "unit", + "value": "short" + }, + { + "id": "decimals", + "value": 2 + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 3 + }, + "id": 10, + "links": [ ], + "options": { + "footer": { + "countRows": false, + "enablePagination": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 1, + "showHeader": true + }, + "pluginVersion": "10.4.0", + "styles": "", + "targets": [ + { + "expr": "(\n ceph_pool_compress_under_bytes{job=~\"$job\"} /\n ceph_pool_compress_bytes_used{job=~\"$job\"} > 0\n) and on(pool_id) (\n (\n (ceph_pool_compress_under_bytes{job=~\"$job\"} > 0) /\n ceph_pool_stored_raw{job=~\"$job\"}\n ) * 100 > 0.5\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "A", + "refId": "A" + }, + { + "expr": "ceph_pool_max_avail{job=~\"$job\"} *\n on(pool_id) group_left(name) ceph_pool_metadata{job=~\"$job\"}\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "B", + "refId": "B" + }, + { + "expr": "(\n (ceph_pool_compress_under_bytes{job=~\"$job\"} > 0) /\n ceph_pool_stored_raw{job=~\"$job\"}\n) * 100\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "C", + "refId": "C" + }, + { + "expr": "ceph_pool_percent_used{job=~\"$job\"} *\n on(pool_id) group_left(name) ceph_pool_metadata{job=~\"$job\"}\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "D", + "refId": "D" + }, + { + "expr": "ceph_pool_compress_under_bytes{job=~\"$job\"} -\n ceph_pool_compress_bytes_used{job=~\"$job\"} > 0\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "E", + "refId": "E" + }, + { + "expr": "delta(ceph_pool_stored{job=~\"$job\"}[5d])", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "F", + "refId": "F" + }, + { + "expr": "rate(ceph_pool_rd{job=~\"$job\"}[$__rate_interval])\n + rate(ceph_pool_wr{job=~\"$job\"}[$__rate_interval])\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "G", + "refId": "G" + }, + { + "expr": "rate(ceph_pool_rd_bytes{job=~\"$job\"}[$__rate_interval]) +\n rate(ceph_pool_wr_bytes{job=~\"$job\"}[$__rate_interval])\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "H", + "refId": "H" + }, + { + "expr": "ceph_pool_metadata{job=~\"$job\"}", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "I", + "refId": "I" + }, + { + "expr": "ceph_pool_stored{job=~\"$job\"} * on(pool_id) group_left ceph_pool_metadata{job=~\"$job\"}", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "J", + "refId": "J" + }, + { + "expr": "ceph_pool_metadata{job=~\"$job\", compression_mode!=\"none\"}", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "K", + "refId": "K" + }, + { + "expr": "", + "format": "", + "intervalFactor": "", + "legendFormat": "L", + "refId": "L" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Pool Overview", + "transformations": [ + { + "id": "merge", + "options": { } + }, + { + "id": "seriesToRows", + "options": { } + }, + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "Value #A": true, + "Value #B": false, + "Value #C": true, + "Value #D": false, + "Value #E": true, + "Value #I": true, + "Value #K": true, + "__name__": true, + "cluster": true, + "compression_mode": true, + "instance": true, + "job": true, + "pool_id": true, + "type": true + }, + "includeByName": { }, + "indexByName": { }, + "renameByName": { } + } + } + ], + "type": "table" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "This chart shows the sum of read and write IOPS from all clients by pool", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "short" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 11, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk($topk,\n round(\n (\n rate(ceph_pool_rd{job=~\"$job\"}[$__rate_interval]) +\n rate(ceph_pool_wr{job=~\"$job\"}[$__rate_interval])\n ), 1\n ) * on(pool_id) group_left(instance,name) ceph_pool_metadata{job=~\"$job\"})\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{name}} ", + "refId": "A" + }, + { + "expr": "topk($topk,\n rate(ceph_pool_wr{job=~\"$job\"}[$__rate_interval]) +\n on(pool_id) group_left(instance,name) ceph_pool_metadata{job=~\"$job\"}\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{name}} - write", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Top $topk Client IOPS by Pool", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "IOPS", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "The chart shows the sum of read and write bytes from all clients, by pool", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "Bps" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk($topk,\n (\n rate(ceph_pool_rd_bytes{job=~\"$job\"}[$__rate_interval]) +\n rate(ceph_pool_wr_bytes{job=~\"$job\"}[$__rate_interval])\n ) * on(pool_id) group_left(instance, name) ceph_pool_metadata{job=~\"$job\"}\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Top $topk Client Bandwidth by Pool", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": "Throughput", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Historical view of capacity usage, to help identify growth and trends in pool consumption", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 8, + "showPoints": "never" + }, + "unit": "bytes" + } + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 13, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "ceph_pool_bytes_used{job=~\"$job\"} * on(pool_id) group_right ceph_pool_metadata{job=~\"$job\"}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Pool Capacity Usage (RAW)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "timeseries", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": "Capacity Used", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 22, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": "Data Source", + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "15", + "value": "15" + }, + "hide": 0, + "includeAll": false, + "label": "TopK", + "multi": false, + "name": "topk", + "options": [ + { + "text": "15", + "value": "15" + } + ], + "query": "15", + "refresh": 0, + "type": "custom" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Ceph Pools Overview", + "uid": "z99hzWtmk", + "version": 0 +} diff --git a/ceph-mon/files/grafana_dashboards/radosgw-detail.json b/ceph-mon/files/grafana_dashboards/radosgw-detail.json new file mode 100644 index 00000000..58d17389 --- /dev/null +++ b/ceph-mon/files/grafana_dashboards/radosgw-detail.json @@ -0,0 +1,522 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.0.0" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "RGW Host Detail : $rgw_servers", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance_id) (\n rate(ceph_rgw_get_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n rate(ceph_rgw_get_initial_lat_count{job=~\"$job\"}[$__rate_interval])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GET {{ceph_daemon}}", + "refId": "A" + }, + { + "expr": "sum by (instance_id) (\n rate(ceph_rgw_put_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n rate(ceph_rgw_put_initial_lat_count{job=~\"$job\"}[$__rate_interval])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUT {{ceph_daemon}}", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "$rgw_servers GET/PUT Latencies", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 7, + "x": 6, + "y": 1 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_rgw_get_b{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GETs {{ceph_daemon}}", + "refId": "A" + }, + { + "expr": "rate(ceph_rgw_put_b{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon)\n ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUTs {{ceph_daemon}}", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Bandwidth by HTTP Operation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + "GETs": "#7eb26d", + "Other": "#447ebc", + "PUTs": "#eab839", + "Requests": "#3f2b5b", + "Requests Failed": "#bf1b00" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 7, + "x": 13, + "y": 1 + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_rgw_failed_req{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\",ceph_daemon=~\"$rgw_servers\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Requests Failed {{ceph_daemon}}", + "refId": "A" + }, + { + "expr": "rate(ceph_rgw_get{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GETs {{ceph_daemon}}", + "refId": "B" + }, + { + "expr": "rate(ceph_rgw_put{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUTs {{ceph_daemon}}", + "refId": "C" + }, + { + "expr": "(\n rate(ceph_rgw_req{job=~\"$job\"}[$__rate_interval]) -\n (\n rate(ceph_rgw_get{job=~\"$job\"}[$__rate_interval]) +\n rate(ceph_rgw_put{job=~\"$job\"}[$__rate_interval])\n )\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Other {{ceph_daemon}}", + "refId": "D" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "HTTP Request Breakdown", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + "Failures": "#bf1b00", + "GETs": "#7eb26d", + "Other (HEAD,POST,DELETE)": "#447ebc", + "PUTs": "#eab839", + "Requests": "#3f2b5b" + }, + "datasource": "${prometheusds}", + "description": "", + "gridPos": { + "h": 8, + "w": 4, + "x": 20, + "y": 1 + }, + "id": 6, + "legend": { + "percentage": true, + "show": true, + "values": true + }, + "legendType": "Under graph", + "pieType": "pie", + "targets": [ + { + "expr": "rate(ceph_rgw_failed_req{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Failures {{ceph_daemon}}", + "refId": "A" + }, + { + "expr": "rate(ceph_rgw_get{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GETs {{ceph_daemon}}", + "refId": "B" + }, + { + "expr": "rate(ceph_rgw_put{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUTs {{ceph_daemon}}", + "refId": "C" + }, + { + "expr": "(\n rate(ceph_rgw_req{job=~\"$job\"}[$__rate_interval]) -\n (\n rate(ceph_rgw_get{job=~\"$job\"}[$__rate_interval]) +\n rate(ceph_rgw_put{job=~\"$job\"}[$__rate_interval])\n )\n) * on (instance_id) group_left (ceph_daemon)\n ceph_rgw_metadata{job=~\"$job\", ceph_daemon=~\"$rgw_servers\"}\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Other (DELETE,LIST) {{ceph_daemon}}", + "refId": "D" + } + ], + "title": "Workload Breakdown", + "type": "piechart", + "valueName": "current" + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin", + "overview" + ], + "templating": { + "list": [ + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": "", + "multi": false, + "name": "rgw_servers", + "options": [ ], + "query": "label_values(ceph_rgw_metadata{job=~\"$job\"}, ceph_daemon)", + "refresh": 1, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "RGW Instance Detail", + "uid": "x5ARzZtmk", + "version": 0 +} diff --git a/ceph-mon/files/grafana_dashboards/radosgw-overview.json b/ceph-mon/files/grafana_dashboards/radosgw-overview.json new file mode 100644 index 00000000..12c56b6e --- /dev/null +++ b/ceph-mon/files/grafana_dashboards/radosgw-overview.json @@ -0,0 +1,695 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.0.0" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "collapse": false, + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 2, + "panels": [ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "RGW Overview - All Gateways", + "titleSize": "h6", + "type": "row" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 1 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n rate(ceph_rgw_get_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n rate(ceph_rgw_get_initial_lat_count{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GET {{rgw_host}}", + "refId": "A" + }, + { + "expr": "label_replace(\n rate(ceph_rgw_put_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n rate(ceph_rgw_put_initial_lat_count{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUT {{rgw_host}}", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Average GET/PUT Latencies by RGW Instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 7, + "x": 8, + "y": 1 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (rgw_host) (\n label_replace(\n rate(ceph_rgw_req{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n )\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{rgw_host}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Total Requests/sec by RGW Instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 6, + "x": 15, + "y": 1 + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n rate(ceph_rgw_get_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n rate(ceph_rgw_get_initial_lat_count{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{rgw_host}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "GET Latencies by RGW Instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Total bytes transferred in/out of all radosgw instances within the cluster", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 8 + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(ceph_rgw_get_b{job=~\"$job\"}[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "GETs", + "refId": "A" + }, + { + "expr": "sum(rate(ceph_rgw_put_b{job=~\"$job\"}[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "PUTs", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Bandwidth Consumed by Type", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Total bytes transferred in/out through get/put operations, by radosgw instance", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 7, + "x": 8, + "y": 8 + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(sum by (instance_id) (\n rate(ceph_rgw_get_b{job=~\"$job\"}[$__rate_interval]) +\n rate(ceph_rgw_put_b{job=~\"$job\"}[$__rate_interval])) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{rgw_host}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Bandwidth by RGW Instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 6, + "x": 15, + "y": 8 + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "label_replace(\n rate(ceph_rgw_put_initial_lat_sum{job=~\"$job\"}[$__rate_interval]) /\n rate(ceph_rgw_put_initial_lat_count{job=~\"$job\"}[$__rate_interval]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{job=~\"$job\"},\n \"rgw_host\", \"$1\", \"ceph_daemon\", \"rgw.(.*)\"\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{rgw_host}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "PUT Latencies by RGW Instance", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin", + "overview" + ], + "templating": { + "list": [ + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": "", + "multi": false, + "name": "rgw_servers", + "options": [ ], + "query": "label_values(ceph_rgw_metadata{job=~\"$job\"}, ceph_daemon)", + "refresh": 1, + "regex": "RGW Server", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "RGW Overview", + "uid": "WAkugZpiz", + "version": 0 +} diff --git a/ceph-mon/files/grafana_dashboards/radosgw-sync-overview.json b/ceph-mon/files/grafana_dashboards/radosgw-sync-overview.json new file mode 100644 index 00000000..49db9e00 --- /dev/null +++ b/ceph-mon/files/grafana_dashboards/radosgw-sync-overview.json @@ -0,0 +1,490 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.0.0" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_bytes_sum{job=~\"$job\"}[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{source_zone}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Replication (throughput) from Source Zone", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_bytes_count{job=~\"$job\"}[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{source_zone}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Replication (objects) from Source Zone", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "Objects/s", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_poll_latency_sum{job=~\"$job\"}[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{source_zone}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Polling Request Latency from Source Zone", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 7 + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (source_zone) (rate(ceph_data_sync_from_zone_fetch_errors{job=~\"$job\"}[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{source_zone}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Unsuccessful Object Replications from Source Zone", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": "Count/s", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin", + "overview" + ], + "templating": { + "list": [ + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": "", + "multi": false, + "name": "rgw_servers", + "options": [ ], + "query": "label_values(ceph_rgw_metadata{job=~\"$job\"}, ceph_daemon)", + "refresh": 1, + "regex": "RGW Server", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "RGW Sync Overview", + "uid": "rgw-sync-overview", + "version": 0 +} diff --git a/ceph-mon/files/grafana_dashboards/rbd-details.json b/ceph-mon/files/grafana_dashboards/rbd-details.json new file mode 100644 index 00000000..8efc8f47 --- /dev/null +++ b/ceph-mon/files/grafana_dashboards/rbd-details.json @@ -0,0 +1,444 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.3.3" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "Detailed Performance of RBD Images (IOPS/Throughput/Latency)", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_rbd_write_ops{job=~\"$job\", pool=\"$pool\", image=\"$image\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pool}} Write", + "refId": "A" + }, + { + "expr": "rate(ceph_rbd_read_ops{job=~\"$job\", pool=\"$pool\", image=\"$image\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pool}} Read", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "IOPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "iops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "iops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_rbd_write_bytes{job=~\"$job\", pool=\"$pool\", image=\"$image\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pool}} Write", + "refId": "A" + }, + { + "expr": "rate(ceph_rbd_read_bytes{job=~\"$job\", pool=\"$pool\", image=\"$image\"}[$__rate_interval])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pool}} Read", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Throughput", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(ceph_rbd_write_latency_sum{job=~\"$job\", pool=\"$pool\", image=\"$image\"}[$__rate_interval]) /\n rate(ceph_rbd_write_latency_count{job=~\"$job\", pool=\"$pool\", image=\"$image\"}[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pool}} Write", + "refId": "A" + }, + { + "expr": "rate(ceph_rbd_read_latency_sum{job=~\"$job\", pool=\"$pool\", image=\"$image\"}[$__rate_interval]) /\n rate(ceph_rbd_read_latency_count{job=~\"$job\", pool=\"$pool\", image=\"$image\"}[$__rate_interval])\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pool}} Read", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Average Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ns", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "ns", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin" + ], + "templating": { + "list": [ + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": false, + "label": "", + "multi": false, + "name": "pool", + "options": [ ], + "query": "label_values(pool)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": false, + "label": "", + "multi": false, + "name": "image", + "options": [ ], + "query": "label_values(image)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "RBD Details", + "uid": "YhCYGcuZz", + "version": 0 +} diff --git a/ceph-mon/files/grafana_dashboards/rbd-overview.json b/ceph-mon/files/grafana_dashboards/rbd-overview.json new file mode 100644 index 00000000..d46297fa --- /dev/null +++ b/ceph-mon/files/grafana_dashboards/rbd-overview.json @@ -0,0 +1,723 @@ +{ + "__inputs": [ ], + "__requires": [ + { + "id": "grafana", + "name": "Grafana", + "type": "grafana", + "version": "5.4.2" + }, + { + "id": "graph", + "name": "Graph", + "type": "panel", + "version": "5.0.0" + }, + { + "id": "prometheus", + "name": "Prometheus", + "type": "datasource", + "version": "5.0.0" + }, + { + "id": "table", + "name": "Table", + "type": "panel", + "version": "5.0.0" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "showIn": 0, + "tags": [ ], + "type": "dashboard" + } + ] + }, + "description": "", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ ], + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "round(sum(rate(ceph_rbd_write_ops{job=~\"$job\"}[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Writes", + "refId": "A" + }, + { + "expr": "round(sum(rate(ceph_rbd_read_ops{job=~\"$job\"}[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Reads", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "IOPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "round(sum(rate(ceph_rbd_write_bytes{job=~\"$job\"}[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Write", + "refId": "A" + }, + { + "expr": "round(sum(rate(ceph_rbd_read_bytes{job=~\"$job\"}[$__rate_interval])))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Read", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Throughput", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${prometheusds}", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 4, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "round(\n sum(rate(ceph_rbd_write_latency_sum{job=~\"$job\"}[$__rate_interval])) /\n sum(rate(ceph_rbd_write_latency_count{job=~\"$job\"}[$__rate_interval]))\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Write", + "refId": "A" + }, + { + "expr": "round(\n sum(rate(ceph_rbd_read_latency_sum{job=~\"$job\"}[$__rate_interval])) /\n sum(rate(ceph_rbd_read_latency_count{job=~\"$job\"}[$__rate_interval]))\n)\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Read", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Average Latency", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "ns", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "columns": [ ], + "datasource": "${prometheusds}", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 7 + }, + "id": 5, + "links": [ ], + "sort": { + "col": 3, + "desc": true + }, + "styles": [ + { + "alias": "Pool", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "pool", + "thresholds": [ ], + "type": "string", + "unit": "short", + "valueMaps": [ ] + }, + { + "alias": "Image", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "image", + "thresholds": [ ], + "type": "string", + "unit": "short", + "valueMaps": [ ] + }, + { + "alias": "IOPS", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value", + "thresholds": [ ], + "type": "number", + "unit": "iops", + "valueMaps": [ ] + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "/.*/", + "thresholds": [ ], + "type": "hidden", + "unit": "short", + "valueMaps": [ ] + } + ], + "targets": [ + { + "expr": "topk(10,\n (\n sort((\n rate(ceph_rbd_write_ops{job=~\"$job\"}[$__rate_interval]) +\n on (image, pool, namespace) rate(ceph_rbd_read_ops{job=~\"$job\"}[$__rate_interval])\n ))\n )\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Highest IOPS", + "transform": "table", + "type": "table" + }, + { + "columns": [ ], + "datasource": "${prometheusds}", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 7 + }, + "id": 6, + "links": [ ], + "sort": { + "col": 3, + "desc": true + }, + "styles": [ + { + "alias": "Pool", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "pool", + "thresholds": [ ], + "type": "string", + "unit": "short", + "valueMaps": [ ] + }, + { + "alias": "Image", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "image", + "thresholds": [ ], + "type": "string", + "unit": "short", + "valueMaps": [ ] + }, + { + "alias": "Throughput", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value", + "thresholds": [ ], + "type": "number", + "unit": "Bps", + "valueMaps": [ ] + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "/.*/", + "thresholds": [ ], + "type": "hidden", + "unit": "short", + "valueMaps": [ ] + } + ], + "targets": [ + { + "expr": "topk(10,\n sort(\n sum(\n rate(ceph_rbd_read_bytes{job=~\"$job\"}[$__rate_interval]) +\n rate(ceph_rbd_write_bytes{job=~\"$job\"}[$__rate_interval])\n ) by (pool, image, namespace)\n )\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Highest Throughput", + "transform": "table", + "type": "table" + }, + { + "columns": [ ], + "datasource": "${prometheusds}", + "description": "RBD per-image IO statistics are disabled by default.\n\nPlease refer to https://docs.ceph.com/en/latest/mgr/prometheus/#rbd-io-statistics for information about how to enable those optionally.", + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 7 + }, + "id": 7, + "links": [ ], + "sort": { + "col": 3, + "desc": true + }, + "styles": [ + { + "alias": "Pool", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "pool", + "thresholds": [ ], + "type": "string", + "unit": "short", + "valueMaps": [ ] + }, + { + "alias": "Image", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "image", + "thresholds": [ ], + "type": "string", + "unit": "short", + "valueMaps": [ ] + }, + { + "alias": "Latency", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value", + "thresholds": [ ], + "type": "number", + "unit": "ns", + "valueMaps": [ ] + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "/.*/", + "thresholds": [ ], + "type": "hidden", + "unit": "short", + "valueMaps": [ ] + } + ], + "targets": [ + { + "expr": "topk(10,\n sum(\n rate(ceph_rbd_write_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n clamp_min(rate(ceph_rbd_write_latency_count{job=~\"$job\"}[$__rate_interval]), 1) +\n rate(ceph_rbd_read_latency_sum{job=~\"$job\"}[$__rate_interval]) /\n clamp_min(rate(ceph_rbd_read_latency_count{job=~\"$job\"}[$__rate_interval]), 1)\n ) by (pool, image, namespace)\n)\n", + "format": "table", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Highest Latency", + "transform": "table", + "type": "table" + } + ], + "refresh": "30s", + "rows": [ ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "ceph-mixin", + "overview" + ], + "templating": { + "list": [ + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 2, + "includeAll": true, + "label": "cluster", + "multi": true, + "name": "cluster", + "options": [ ], + "query": "label_values(ceph_osd_metadata, cluster)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { }, + "datasource": "${prometheusds}", + "hide": 0, + "includeAll": true, + "label": "job", + "multi": true, + "name": "job", + "options": [ ], + "query": "label_values(ceph_osd_metadata{}, job)", + "refresh": 1, + "regex": "(.*)", + "sort": 1, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "RBD Overview", + "uid": "41FrpeUiz", + "version": 0 +} diff --git a/ceph-mon/files/prometheus_alert_rules/prometheus_alerts.yml.default b/ceph-mon/files/prometheus_alert_rules/prometheus_alerts.yaml similarity index 100% rename from ceph-mon/files/prometheus_alert_rules/prometheus_alerts.yml.default rename to ceph-mon/files/prometheus_alert_rules/prometheus_alerts.yaml diff --git a/ceph-mon/lib/charms/grafana_agent/v0/cos_agent.py b/ceph-mon/lib/charms/grafana_agent/v0/cos_agent.py new file mode 100644 index 00000000..d3130b2b --- /dev/null +++ b/ceph-mon/lib/charms/grafana_agent/v0/cos_agent.py @@ -0,0 +1,842 @@ +# Copyright 2023 Canonical Ltd. +# See LICENSE file for licensing details. + +r"""## Overview. + +This library can be used to manage the cos_agent relation interface: + +- `COSAgentProvider`: Use in machine charms that need to have a workload's metrics + or logs scraped, or forward rule files or dashboards to Prometheus, Loki or Grafana through + the Grafana Agent machine charm. + +- `COSAgentConsumer`: Used in the Grafana Agent machine charm to manage the requirer side of + the `cos_agent` interface. + + +## COSAgentProvider Library Usage + +Grafana Agent machine Charmed Operator interacts with its clients using the cos_agent library. +Charms seeking to send telemetry, must do so using the `COSAgentProvider` object from +this charm library. + +Using the `COSAgentProvider` object only requires instantiating it, +typically in the `__init__` method of your charm (the one which sends telemetry). + +The constructor of `COSAgentProvider` has only one required and nine optional parameters: + +```python + def __init__( + self, + charm: CharmType, + relation_name: str = DEFAULT_RELATION_NAME, + metrics_endpoints: Optional[List[_MetricsEndpointDict]] = None, + metrics_rules_dir: str = "./src/prometheus_alert_rules", + logs_rules_dir: str = "./src/loki_alert_rules", + recurse_rules_dirs: bool = False, + log_slots: Optional[List[str]] = None, + dashboard_dirs: Optional[List[str]] = None, + refresh_events: Optional[List] = None, + scrape_configs: Optional[Union[List[Dict], Callable]] = None, + ): +``` + +### Parameters + +- `charm`: The instance of the charm that instantiates `COSAgentProvider`, typically `self`. + +- `relation_name`: If your charmed operator uses a relation name other than `cos-agent` to use + the `cos_agent` interface, this is where you have to specify that. + +- `metrics_endpoints`: In this parameter you can specify the metrics endpoints that Grafana Agent + machine Charmed Operator will scrape. The configs of this list will be merged with the configs + from `scrape_configs`. + +- `metrics_rules_dir`: The directory in which the Charmed Operator stores its metrics alert rules + files. + +- `logs_rules_dir`: The directory in which the Charmed Operator stores its logs alert rules files. + +- `recurse_rules_dirs`: This parameters set whether Grafana Agent machine Charmed Operator has to + search alert rules files recursively in the previous two directories or not. + +- `log_slots`: Snap slots to connect to for scraping logs in the form ["snap-name:slot", ...]. + +- `dashboard_dirs`: List of directories where the dashboards are stored in the Charmed Operator. + +- `refresh_events`: List of events on which to refresh relation data. + +- `scrape_configs`: List of standard scrape_configs dicts or a callable that returns the list in + case the configs need to be generated dynamically. The contents of this list will be merged + with the configs from `metrics_endpoints`. + + +### Example 1 - Minimal instrumentation: + +In order to use this object the following should be in the `charm.py` file. + +```python +from charms.grafana_agent.v0.cos_agent import COSAgentProvider +... +class TelemetryProviderCharm(CharmBase): + def __init__(self, *args): + ... + self._grafana_agent = COSAgentProvider(self) +``` + +### Example 2 - Full instrumentation: + +In order to use this object the following should be in the `charm.py` file. + +```python +from charms.grafana_agent.v0.cos_agent import COSAgentProvider +... +class TelemetryProviderCharm(CharmBase): + def __init__(self, *args): + ... + self._grafana_agent = COSAgentProvider( + self, + relation_name="custom-cos-agent", + metrics_endpoints=[ + # specify "path" and "port" to scrape from localhost + {"path": "/metrics", "port": 9000}, + {"path": "/metrics", "port": 9001}, + {"path": "/metrics", "port": 9002}, + ], + metrics_rules_dir="./src/alert_rules/prometheus", + logs_rules_dir="./src/alert_rules/loki", + recursive_rules_dir=True, + log_slots=["my-app:slot"], + dashboard_dirs=["./src/dashboards_1", "./src/dashboards_2"], + refresh_events=["update-status", "upgrade-charm"], + scrape_configs=[ + { + "job_name": "custom_job", + "metrics_path": "/metrics", + "authorization": {"credentials": "bearer-token"}, + "static_configs": [ + { + "targets": ["localhost:9003"]}, + "labels": {"key": "value"}, + }, + ], + }, + ] + ) +``` + +### Example 3 - Dynamic scrape configs generation: + +Pass a function to the `scrape_configs` to decouple the generation of the configs +from the instantiation of the COSAgentProvider object. + +```python +from charms.grafana_agent.v0.cos_agent import COSAgentProvider +... + +class TelemetryProviderCharm(CharmBase): + def generate_scrape_configs(self): + return [ + { + "job_name": "custom", + "metrics_path": "/metrics", + "static_configs": [{"targets": ["localhost:9000"]}], + }, + ] + + def __init__(self, *args): + ... + self._grafana_agent = COSAgentProvider( + self, + scrape_configs=self.generate_scrape_configs, + ) +``` + +## COSAgentConsumer Library Usage + +This object may be used by any Charmed Operator which gathers telemetry data by +implementing the consumer side of the `cos_agent` interface. +For instance Grafana Agent machine Charmed Operator. + +For this purpose the charm needs to instantiate the `COSAgentConsumer` object with one mandatory +and two optional arguments. + +### Parameters + +- `charm`: A reference to the parent (Grafana Agent machine) charm. + +- `relation_name`: The name of the relation that the charm uses to interact + with its clients that provides telemetry data using the `COSAgentProvider` object. + + If provided, this relation name must match a provided relation in metadata.yaml with the + `cos_agent` interface. + The default value of this argument is "cos-agent". + +- `refresh_events`: List of events on which to refresh relation data. + + +### Example 1 - Minimal instrumentation: + +In order to use this object the following should be in the `charm.py` file. + +```python +from charms.grafana_agent.v0.cos_agent import COSAgentConsumer +... +class GrafanaAgentMachineCharm(GrafanaAgentCharm) + def __init__(self, *args): + ... + self._cos = COSAgentRequirer(self) +``` + + +### Example 2 - Full instrumentation: + +In order to use this object the following should be in the `charm.py` file. + +```python +from charms.grafana_agent.v0.cos_agent import COSAgentConsumer +... +class GrafanaAgentMachineCharm(GrafanaAgentCharm) + def __init__(self, *args): + ... + self._cos = COSAgentRequirer( + self, + relation_name="cos-agent-consumer", + refresh_events=["update-status", "upgrade-charm"], + ) +``` +""" + +import base64 +import json +import logging +import lzma +from collections import namedtuple +from itertools import chain +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, ClassVar, Dict, List, Optional, Set, Union + +import pydantic +from cosl import JujuTopology +from cosl.rules import AlertRules +from ops.charm import RelationChangedEvent +from ops.framework import EventBase, EventSource, Object, ObjectEvents +from ops.model import Relation, Unit +from ops.testing import CharmType + +if TYPE_CHECKING: + try: + from typing import TypedDict + + class _MetricsEndpointDict(TypedDict): + path: str + port: int + + except ModuleNotFoundError: + _MetricsEndpointDict = Dict # pyright: ignore + +LIBID = "dc15fa84cef84ce58155fb84f6c6213a" +LIBAPI = 0 +LIBPATCH = 6 + +PYDEPS = ["cosl", "pydantic < 2"] + +DEFAULT_RELATION_NAME = "cos-agent" +DEFAULT_PEER_RELATION_NAME = "peers" +DEFAULT_SCRAPE_CONFIG = { + "static_configs": [{"targets": ["localhost:80"]}], + "metrics_path": "/metrics", +} + +logger = logging.getLogger(__name__) +SnapEndpoint = namedtuple("SnapEndpoint", "owner, name") + + +class GrafanaDashboard(str): + """Grafana Dashboard encoded json; lzma-compressed.""" + + # TODO Replace this with a custom type when pydantic v2 released (end of 2023 Q1?) + # https://github.com/pydantic/pydantic/issues/4887 + @staticmethod + def _serialize(raw_json: Union[str, bytes]) -> "GrafanaDashboard": + if not isinstance(raw_json, bytes): + raw_json = raw_json.encode("utf-8") + encoded = base64.b64encode(lzma.compress(raw_json)).decode("utf-8") + return GrafanaDashboard(encoded) + + def _deserialize(self) -> Dict: + try: + raw = lzma.decompress(base64.b64decode(self.encode("utf-8"))).decode() + return json.loads(raw) + except json.decoder.JSONDecodeError as e: + logger.error("Invalid Dashboard format: %s", e) + return {} + + def __repr__(self): + """Return string representation of self.""" + return "" + + +class CosAgentProviderUnitData(pydantic.BaseModel): + """Unit databag model for `cos-agent` relation.""" + + # The following entries are the same for all units of the same principal. + # Note that the same grafana agent subordinate may be related to several apps. + # this needs to make its way to the gagent leader + metrics_alert_rules: dict + log_alert_rules: dict + dashboards: List[GrafanaDashboard] + subordinate: Optional[bool] + + # The following entries may vary across units of the same principal app. + # this data does not need to be forwarded to the gagent leader + metrics_scrape_jobs: List[Dict] + log_slots: List[str] + + # when this whole datastructure is dumped into a databag, it will be nested under this key. + # while not strictly necessary (we could have it 'flattened out' into the databag), + # this simplifies working with the model. + KEY: ClassVar[str] = "config" + + +class CosAgentPeersUnitData(pydantic.BaseModel): + """Unit databag model for `peers` cos-agent machine charm peer relation.""" + + # We need the principal unit name and relation metadata to be able to render identifiers + # (e.g. topology) on the leader side, after all the data moves into peer data (the grafana + # agent leader can only see its own principal, because it is a subordinate charm). + principal_unit_name: str + principal_relation_id: str + principal_relation_name: str + + # The only data that is forwarded to the leader is data that needs to go into the app databags + # of the outgoing o11y relations. + metrics_alert_rules: Optional[dict] + log_alert_rules: Optional[dict] + dashboards: Optional[List[GrafanaDashboard]] + + # when this whole datastructure is dumped into a databag, it will be nested under this key. + # while not strictly necessary (we could have it 'flattened out' into the databag), + # this simplifies working with the model. + KEY: ClassVar[str] = "config" + + @property + def app_name(self) -> str: + """Parse out the app name from the unit name. + + TODO: Switch to using `model_post_init` when pydantic v2 is released? + https://github.com/pydantic/pydantic/issues/1729#issuecomment-1300576214 + """ + return self.principal_unit_name.split("/")[0] + + +class COSAgentProvider(Object): + """Integration endpoint wrapper for the provider side of the cos_agent interface.""" + + def __init__( + self, + charm: CharmType, + relation_name: str = DEFAULT_RELATION_NAME, + metrics_endpoints: Optional[List["_MetricsEndpointDict"]] = None, + metrics_rules_dir: str = "./src/prometheus_alert_rules", + logs_rules_dir: str = "./src/loki_alert_rules", + recurse_rules_dirs: bool = False, + log_slots: Optional[List[str]] = None, + dashboard_dirs: Optional[List[str]] = None, + refresh_events: Optional[List] = None, + *, + scrape_configs: Optional[Union[List[dict], Callable]] = None, + ): + """Create a COSAgentProvider instance. + + Args: + charm: The `CharmBase` instance that is instantiating this object. + relation_name: The name of the relation to communicate over. + metrics_endpoints: List of endpoints in the form [{"path": path, "port": port}, ...]. + This argument is a simplified form of the `scrape_configs`. + The contents of this list will be merged with the contents of `scrape_configs`. + metrics_rules_dir: Directory where the metrics rules are stored. + logs_rules_dir: Directory where the logs rules are stored. + recurse_rules_dirs: Whether to recurse into rule paths. + log_slots: Snap slots to connect to for scraping logs + in the form ["snap-name:slot", ...]. + dashboard_dirs: Directory where the dashboards are stored. + refresh_events: List of events on which to refresh relation data. + scrape_configs: List of standard scrape_configs dicts or a callable + that returns the list in case the configs need to be generated dynamically. + The contents of this list will be merged with the contents of `metrics_endpoints`. + """ + super().__init__(charm, relation_name) + dashboard_dirs = dashboard_dirs or ["./src/grafana_dashboards"] + + self._charm = charm + self._relation_name = relation_name + self._metrics_endpoints = metrics_endpoints or [] + self._scrape_configs = scrape_configs or [] + self._metrics_rules = metrics_rules_dir + self._logs_rules = logs_rules_dir + self._recursive = recurse_rules_dirs + self._log_slots = log_slots or [] + self._dashboard_dirs = dashboard_dirs + self._refresh_events = refresh_events or [self._charm.on.config_changed] + + events = self._charm.on[relation_name] + self.framework.observe(events.relation_joined, self._on_refresh) + self.framework.observe(events.relation_changed, self._on_refresh) + for event in self._refresh_events: + self.framework.observe(event, self._on_refresh) + + def _on_refresh(self, event): + """Trigger the class to update relation data.""" + relations = self._charm.model.relations[self._relation_name] + + for relation in relations: + # Before a principal is related to the grafana-agent subordinate, we'd get + # ModelError: ERROR cannot read relation settings: unit "zk/2": settings not found + # Add a guard to make sure it doesn't happen. + if relation.data and self._charm.unit in relation.data: + # Subordinate relations can communicate only over unit data. + try: + data = CosAgentProviderUnitData( + metrics_alert_rules=self._metrics_alert_rules, + log_alert_rules=self._log_alert_rules, + dashboards=self._dashboards, + metrics_scrape_jobs=self._scrape_jobs, + log_slots=self._log_slots, + subordinate=self._charm.meta.subordinate, + ) + relation.data[self._charm.unit][data.KEY] = data.json() + except ( + pydantic.ValidationError, + json.decoder.JSONDecodeError, + ) as e: + logger.error("Invalid relation data provided: %s", e) + + @property + def _scrape_jobs(self) -> List[Dict]: + """Return a prometheus_scrape-like data structure for jobs. + + https://prometheus.io/docs/prometheus/latest/configuration/configuration/#scrape_config + """ + if callable(self._scrape_configs): + scrape_configs = self._scrape_configs() + else: + # Create a copy of the user scrape_configs, since we will mutate this object + scrape_configs = self._scrape_configs.copy() + + # Convert "metrics_endpoints" to standard scrape_configs, and add them in + for endpoint in self._metrics_endpoints: + scrape_configs.append( + { + "metrics_path": endpoint["path"], + "static_configs": [{"targets": [f"localhost:{endpoint['port']}"]}], + } + ) + + scrape_configs = scrape_configs or [DEFAULT_SCRAPE_CONFIG] + + # Augment job name to include the app name and a unique id (index) + for idx, scrape_config in enumerate(scrape_configs): + scrape_config["job_name"] = "_".join( + [self._charm.app.name, str(idx), scrape_config.get("job_name", "default")] + ) + + return scrape_configs + + @property + def _metrics_alert_rules(self) -> Dict: + """Use (for now) the prometheus_scrape AlertRules to initialize this.""" + alert_rules = AlertRules( + query_type="promql", topology=JujuTopology.from_charm(self._charm) + ) + alert_rules.add_path(self._metrics_rules, recursive=self._recursive) + return alert_rules.as_dict() + + @property + def _log_alert_rules(self) -> Dict: + """Use (for now) the loki_push_api AlertRules to initialize this.""" + alert_rules = AlertRules(query_type="logql", topology=JujuTopology.from_charm(self._charm)) + alert_rules.add_path(self._logs_rules, recursive=self._recursive) + return alert_rules.as_dict() + + @property + def _dashboards(self) -> List[GrafanaDashboard]: + dashboards: List[GrafanaDashboard] = [] + for d in self._dashboard_dirs: + for path in Path(d).glob("*"): + dashboard = GrafanaDashboard._serialize(path.read_bytes()) + dashboards.append(dashboard) + return dashboards + + +class COSAgentDataChanged(EventBase): + """Event emitted by `COSAgentRequirer` when relation data changes.""" + + +class COSAgentValidationError(EventBase): + """Event emitted by `COSAgentRequirer` when there is an error in the relation data.""" + + def __init__(self, handle, message: str = ""): + super().__init__(handle) + self.message = message + + def snapshot(self) -> Dict: + """Save COSAgentValidationError source information.""" + return {"message": self.message} + + def restore(self, snapshot): + """Restore COSAgentValidationError source information.""" + self.message = snapshot["message"] + + +class COSAgentRequirerEvents(ObjectEvents): + """`COSAgentRequirer` events.""" + + data_changed = EventSource(COSAgentDataChanged) + validation_error = EventSource(COSAgentValidationError) + + +class MultiplePrincipalsError(Exception): + """Custom exception for when there are multiple principal applications.""" + + pass + + +class COSAgentRequirer(Object): + """Integration endpoint wrapper for the Requirer side of the cos_agent interface.""" + + on = COSAgentRequirerEvents() # pyright: ignore + + def __init__( + self, + charm: CharmType, + *, + relation_name: str = DEFAULT_RELATION_NAME, + peer_relation_name: str = DEFAULT_PEER_RELATION_NAME, + refresh_events: Optional[List[str]] = None, + ): + """Create a COSAgentRequirer instance. + + Args: + charm: The `CharmBase` instance that is instantiating this object. + relation_name: The name of the relation to communicate over. + peer_relation_name: The name of the peer relation to communicate over. + refresh_events: List of events on which to refresh relation data. + """ + super().__init__(charm, relation_name) + self._charm = charm + self._relation_name = relation_name + self._peer_relation_name = peer_relation_name + self._refresh_events = refresh_events or [self._charm.on.config_changed] + + events = self._charm.on[relation_name] + self.framework.observe( + events.relation_joined, self._on_relation_data_changed + ) # TODO: do we need this? + self.framework.observe(events.relation_changed, self._on_relation_data_changed) + for event in self._refresh_events: + self.framework.observe(event, self.trigger_refresh) # pyright: ignore + + # Peer relation events + # A peer relation is needed as it is the only mechanism for exchanging data across + # subordinate units. + # self.framework.observe( + # self.on[self._peer_relation_name].relation_joined, self._on_peer_relation_joined + # ) + peer_events = self._charm.on[peer_relation_name] + self.framework.observe(peer_events.relation_changed, self._on_peer_relation_changed) + + @property + def peer_relation(self) -> Optional["Relation"]: + """Helper function for obtaining the peer relation object. + + Returns: peer relation object + (NOTE: would return None if called too early, e.g. during install). + """ + return self.model.get_relation(self._peer_relation_name) + + def _on_peer_relation_changed(self, _): + # Peer data is used for forwarding data from principal units to the grafana agent + # subordinate leader, for updating the app data of the outgoing o11y relations. + if self._charm.unit.is_leader(): + self.on.data_changed.emit() # pyright: ignore + + def _on_relation_data_changed(self, event: RelationChangedEvent): + # Peer data is the only means of communication between subordinate units. + if not self.peer_relation: + event.defer() + return + + cos_agent_relation = event.relation + if not event.unit or not cos_agent_relation.data.get(event.unit): + return + principal_unit = event.unit + + # Coherence check + units = cos_agent_relation.units + if len(units) > 1: + # should never happen + raise ValueError( + f"unexpected error: subordinate relation {cos_agent_relation} " + f"should have exactly one unit" + ) + + if not (raw := cos_agent_relation.data[principal_unit].get(CosAgentProviderUnitData.KEY)): + return + + if not (provider_data := self._validated_provider_data(raw)): + return + + # Copy data from the principal relation to the peer relation, so the leader could + # follow up. + # Save the originating unit name, so it could be used for topology later on by the leader. + data = CosAgentPeersUnitData( # peer relation databag model + principal_unit_name=event.unit.name, + principal_relation_id=str(event.relation.id), + principal_relation_name=event.relation.name, + metrics_alert_rules=provider_data.metrics_alert_rules, + log_alert_rules=provider_data.log_alert_rules, + dashboards=provider_data.dashboards, + ) + self.peer_relation.data[self._charm.unit][ + f"{CosAgentPeersUnitData.KEY}-{event.unit.name}" + ] = data.json() + + # We can't easily tell if the data that was changed is limited to only the data + # that goes into peer relation (in which case, if this is not a leader unit, we wouldn't + # need to emit `on.data_changed`), so we're emitting `on.data_changed` either way. + self.on.data_changed.emit() # pyright: ignore + + def _validated_provider_data(self, raw) -> Optional[CosAgentProviderUnitData]: + try: + return CosAgentProviderUnitData(**json.loads(raw)) + except (pydantic.ValidationError, json.decoder.JSONDecodeError) as e: + self.on.validation_error.emit(message=str(e)) # pyright: ignore + return None + + def trigger_refresh(self, _): + """Trigger a refresh of relation data.""" + # FIXME: Figure out what we should do here + self.on.data_changed.emit() # pyright: ignore + + @property + def _principal_unit(self) -> Optional[Unit]: + """Return the principal unit for a relation. + + Assumes that the relation is of type subordinate. + Relies on the fact that, for subordinate relations, the only remote unit visible to + *this unit* is the principal unit that this unit is attached to. + """ + if relations := self._principal_relations: + # Technically it's a list, but for subordinates there can only be one relation + principal_relation = next(iter(relations)) + if units := principal_relation.units: + # Technically it's a list, but for subordinates there can only be one + return next(iter(units)) + + return None + + @property + def _principal_relations(self): + relations = [] + for relation in self._charm.model.relations[self._relation_name]: + if not json.loads(relation.data[next(iter(relation.units))]["config"]).get( + ["subordinate"], False + ): + relations.append(relation) + if len(relations) > 1: + logger.error( + "Multiple applications claiming to be principal. Update the cos-agent library in the client application charms." + ) + raise MultiplePrincipalsError("Multiple principal applications.") + return relations + + @property + def _remote_data(self) -> List[CosAgentProviderUnitData]: + """Return a list of remote data from each of the related units. + + Assumes that the relation is of type subordinate. + Relies on the fact that, for subordinate relations, the only remote unit visible to + *this unit* is the principal unit that this unit is attached to. + """ + all_data = [] + + for relation in self._charm.model.relations[self._relation_name]: + if not relation.units: + continue + unit = next(iter(relation.units)) + if not (raw := relation.data[unit].get(CosAgentProviderUnitData.KEY)): + continue + if not (provider_data := self._validated_provider_data(raw)): + continue + all_data.append(provider_data) + + return all_data + + def _gather_peer_data(self) -> List[CosAgentPeersUnitData]: + """Collect data from the peers. + + Returns a trimmed-down list of CosAgentPeersUnitData. + """ + relation = self.peer_relation + + # Ensure that whatever context we're running this in, we take the necessary precautions: + if not relation or not relation.data or not relation.app: + return [] + + # Iterate over all peer unit data and only collect every principal once. + peer_data: List[CosAgentPeersUnitData] = [] + app_names: Set[str] = set() + + for unit in chain((self._charm.unit,), relation.units): + if not relation.data.get(unit): + continue + + for unit_name in relation.data.get(unit): # pyright: ignore + if not unit_name.startswith(CosAgentPeersUnitData.KEY): + continue + raw = relation.data[unit].get(unit_name) + if raw is None: + continue + data = CosAgentPeersUnitData(**json.loads(raw)) + # Have we already seen this principal app? + if (app_name := data.app_name) in app_names: + continue + peer_data.append(data) + app_names.add(app_name) + + return peer_data + + @property + def metrics_alerts(self) -> Dict[str, Any]: + """Fetch metrics alerts.""" + alert_rules = {} + + seen_apps: List[str] = [] + for data in self._gather_peer_data(): + if rules := data.metrics_alert_rules: + app_name = data.app_name + if app_name in seen_apps: + continue # dedup! + seen_apps.append(app_name) + # This is only used for naming the file, so be as specific as we can be + identifier = JujuTopology( + model=self._charm.model.name, + model_uuid=self._charm.model.uuid, + application=app_name, + # For the topology unit, we could use `data.principal_unit_name`, but that unit + # name may not be very stable: `_gather_peer_data` de-duplicates by app name so + # the exact unit name that turns up first in the iterator may vary from time to + # time. So using the grafana-agent unit name instead. + unit=self._charm.unit.name, + ).identifier + + alert_rules[identifier] = rules + + return alert_rules + + @property + def metrics_jobs(self) -> List[Dict]: + """Parse the relation data contents and extract the metrics jobs.""" + scrape_jobs = [] + for data in self._remote_data: + for job in data.metrics_scrape_jobs: + # In #220, relation schema changed from a simplified dict to the standard + # `scrape_configs`. + # This is to ensure backwards compatibility with Providers older than v0.5. + if "path" in job and "port" in job and "job_name" in job: + job = { + "job_name": job["job_name"], + "metrics_path": job["path"], + "static_configs": [{"targets": [f"localhost:{job['port']}"]}], + } + + scrape_jobs.append(job) + + return scrape_jobs + + @property + def snap_log_endpoints(self) -> List[SnapEndpoint]: + """Fetch logging endpoints exposed by related snaps.""" + plugs = [] + for data in self._remote_data: + targets = data.log_slots + if targets: + for target in targets: + if target in plugs: + logger.warning( + f"plug {target} already listed. " + "The same snap is being passed from multiple " + "endpoints; this should not happen." + ) + else: + plugs.append(target) + + endpoints = [] + for plug in plugs: + if ":" not in plug: + logger.error(f"invalid plug definition received: {plug}. Ignoring...") + else: + endpoint = SnapEndpoint(*plug.split(":")) + endpoints.append(endpoint) + return endpoints + + @property + def logs_alerts(self) -> Dict[str, Any]: + """Fetch log alerts.""" + alert_rules = {} + seen_apps: List[str] = [] + + for data in self._gather_peer_data(): + if rules := data.log_alert_rules: + # This is only used for naming the file, so be as specific as we can be + app_name = data.app_name + if app_name in seen_apps: + continue # dedup! + seen_apps.append(app_name) + + identifier = JujuTopology( + model=self._charm.model.name, + model_uuid=self._charm.model.uuid, + application=app_name, + # For the topology unit, we could use `data.principal_unit_name`, but that unit + # name may not be very stable: `_gather_peer_data` de-duplicates by app name so + # the exact unit name that turns up first in the iterator may vary from time to + # time. So using the grafana-agent unit name instead. + unit=self._charm.unit.name, + ).identifier + + alert_rules[identifier] = rules + + return alert_rules + + @property + def dashboards(self) -> List[Dict[str, str]]: + """Fetch dashboards as encoded content. + + Dashboards are assumed not to vary across units of the same primary. + """ + dashboards: List[Dict[str, Any]] = [] + + seen_apps: List[str] = [] + for data in self._gather_peer_data(): + app_name = data.app_name + if app_name in seen_apps: + continue # dedup! + seen_apps.append(app_name) + + for encoded_dashboard in data.dashboards or (): + content = GrafanaDashboard(encoded_dashboard)._deserialize() + + title = content.get("title", "no_title") + + dashboards.append( + { + "relation_id": data.principal_relation_id, + # We have the remote charm name - use it for the identifier + "charm": f"{data.principal_relation_name}-{app_name}", + "content": content, + "title": title, + } + ) + + return dashboards diff --git a/ceph-mon/metadata.yaml b/ceph-mon/metadata.yaml index f94c0360..e13f1500 100644 --- a/ceph-mon/metadata.yaml +++ b/ceph-mon/metadata.yaml @@ -42,6 +42,9 @@ provides: interface: prometheus_scrape dashboard: interface: ceph-dashboard + cos-agent: + interface: cos_agent + requires: bootstrap-source: interface: ceph-bootstrap diff --git a/ceph-mon/osci.yaml b/ceph-mon/osci.yaml index d831eb2e..0355a807 100644 --- a/ceph-mon/osci.yaml +++ b/ceph-mon/osci.yaml @@ -3,12 +3,11 @@ - charm-unit-jobs-py38 - charm-unit-jobs-py310 - charm-yoga-functional-jobs - - charm-zed-functional-jobs vars: needs_charm_build: true charm_build_name: ceph-mon build_type: charmcraft - charmcraft_channel: 2.0/stable + charmcraft_channel: 2.x/stable check: jobs: - new-install-focal-yoga diff --git a/ceph-mon/src/ceph_metrics.py b/ceph-mon/src/ceph_metrics.py index a1745b31..e720c846 100644 --- a/ceph-mon/src/ceph_metrics.py +++ b/ceph-mon/src/ceph_metrics.py @@ -9,6 +9,8 @@ import logging import os.path import pathlib +import socket + from typing import Optional, Union, List, TYPE_CHECKING import ops.model @@ -17,6 +19,7 @@ import charm from charms.prometheus_k8s.v0 import prometheus_scrape +from charms.grafana_agent.v0 import cos_agent from charms_ceph import utils as ceph_utils from ops.framework import BoundEvent from utils import mgr_config_set_rbd_stats_pools @@ -28,6 +31,10 @@ "metrics_path": "/metrics", "static_configs": [{"targets": ["*:9283"]}], } +DEFAULT_CEPH_METRICS_ENDPOINT = { + "path": "/metrics", + "port": 9283, +} DEFAULT_ALERT_RULES_RELATIVE_PATH = "files/prometheus_alert_rules" @@ -144,3 +151,77 @@ def update_alert_rules(self): self._charm._stored.alert_rule_errors = msg return self._set_alert_rules(alert_rules_as_dict) + + +class CephCOSAgentProvider(cos_agent.COSAgentProvider): + + def __init__(self, charm): + super().__init__( + charm, + metrics_rules_dir="./files/prometheus_alert_rules", + dashboard_dirs=["./files/grafana_dashboards"], + scrape_configs=self._custom_scrape_configs, + ) + events = self._charm.on[cos_agent.DEFAULT_RELATION_NAME] + self.framework.observe( + events.relation_departed, self._on_relation_departed + ) + + def _on_refresh(self, event): + """Enable prometheus on relation change""" + if self._charm.unit.is_leader() and ceph_utils.is_bootstrapped(): + logger.debug("refreshing cos_agent relation") + mgr_config_set_rbd_stats_pools() + ceph_utils.mgr_enable_module("prometheus") + super()._on_refresh(event) + + def _on_relation_departed(self, event): + """Disable prometheus on depart of relation""" + if self._charm.unit.is_leader() and ceph_utils.is_bootstrapped(): + logger.debug( + "is_leader and is_bootstrapped, running rel departed: %s", + event, + ) + ceph_utils.mgr_disable_module("prometheus") + logger.debug("module_disabled") + + def _custom_scrape_configs(self): + fqdn = socket.getfqdn() + fqdn_parts = fqdn.split('.') + domain = '.'.join(fqdn_parts[1:]) if len(fqdn_parts) > 1 else fqdn + return [ + { + "metrics_path": "/metrics", + "static_configs": [{"targets": ["localhost:9283"]}], + "honor_labels": True, + "metric_relabel_configs": [ + { + # localhost:9283 is the generic default instance label + # added by grafana-agent which is kinda useless. + # Replace it with a somewhat more meaningful label + "source_labels": ["instance"], + "regex": "^localhost:9283$", + "target_label": "instance", + "action": "replace", + "replacement": "ceph_cluster", + }, + { # if we have a non-empty hostname label, use it as the + # instance label + "source_labels": ["hostname"], + "regex": "(.+)", + "target_label": "instance", + "action": "replace", + "replacement": "${1}", + }, + { # tack on the domain to the instance label to make it + # conform to grafana-agent's node-exporter expectations + "source_labels": ["instance"], + "regex": "(.*)", + "target_label": "instance", + "action": "replace", + "replacement": "${1}." + domain, + }, + ] + }, + + ] diff --git a/ceph-mon/src/charm.py b/ceph-mon/src/charm.py index 6d7bdbc6..dcf58d7c 100755 --- a/ceph-mon/src/charm.py +++ b/ceph-mon/src/charm.py @@ -23,7 +23,6 @@ import ops_actions - logger = logging.getLogger(__name__) @@ -222,6 +221,7 @@ def __init__(self, *args): self.clients = ceph_client.CephClientProvides(self) self.metrics_endpoint = ceph_metrics.CephMetricsEndpointProvider(self) + self.cos_agent = ceph_metrics.CephCOSAgentProvider(self) self.ceph_status = ceph_status.StatusAssessor(self) self._observe_action(self.on.change_osd_weight_action, diff --git a/ceph-mon/src/ops_actions/rotate_key.py b/ceph-mon/src/ops_actions/rotate_key.py index 62e49168..1e4ba829 100644 --- a/ceph-mon/src/ops_actions/rotate_key.py +++ b/ceph-mon/src/ops_actions/rotate_key.py @@ -99,13 +99,45 @@ def _handle_rgw_key_rotation(entity, event, model): event.fail("Entity %s not found" % entity) +def _find_mds_unit(relations, mds_name): + for relation in relations: + for unit in relation.units: + try: + if mds_name == relation.data[unit]['mds-name']: + return relation.data + except KeyError: + logger.exception('mds name not found in relation data bag') + + +def _handle_mds_key_rotation(entity, event, model): + mds_name = entity[4:] + relations = model.relations.get('mds') + if not relations: + event.fail('No mds relations found') + return + + bag = _find_mds_unit(relations, mds_name) + if bag is None: + event.fail('No unit found for entity: %s' % entity) + return + + pending_key = _create_key(entity, event) + bag[model.unit][mds_name + "_mds_key"] = pending_key + event.set_results({'message': 'success'}) + + def _get_osd_tree(): out = subprocess.check_output(["sudo", "ceph", "osd", "dump", "--format=json"]) return json.loads(out.decode("utf8")).get("osds", ()) -def _get_osd_addr(osd_id, tree=None): +def _clean_address(addr): + ix = addr.find(":") + return addr if ix < 0 else addr[0:ix] + + +def _get_osd_addrs(osd_id, tree=None): if tree is None: tree = _get_osd_tree() @@ -113,9 +145,9 @@ def _get_osd_addr(osd_id, tree=None): if osd.get("osd") != osd_id: continue - addr = osd["public_addr"] - ix = addr.find(":") - return addr if ix < 0 else addr[0:ix] + return [_clean_address(osd[x]) + for x in ("public_addr", "cluster_addr") + if x in osd] def _get_unit_addr(unit, rel_id): @@ -125,13 +157,13 @@ def _get_unit_addr(unit, rel_id): def _find_osd_unit(relations, model, osd_id, tree): - addr = _get_osd_addr(osd_id, tree) - if not addr: + addrs = _get_osd_addrs(osd_id, tree) + if not addrs: return None for relation in relations: for unit in relation.units: - if _get_unit_addr(unit.name, relation.id) == addr: + if _get_unit_addr(unit.name, relation.id) in addrs: return relation.data[model.unit] @@ -225,6 +257,8 @@ def rotate_key(event, model=None) -> None: event.set_results({"message": "success"}) elif entity.startswith("client.rgw."): _handle_rgw_key_rotation(entity, event, model) + elif entity.startswith('mds.'): + _handle_mds_key_rotation(entity, event, model) elif entity == "osd": _rotate_all_osds(event, model) elif entity.startswith("osd."): diff --git a/ceph-mon/test-requirements.txt b/ceph-mon/test-requirements.txt index 20854bdd..4e4e7d20 100644 --- a/ceph-mon/test-requirements.txt +++ b/ceph-mon/test-requirements.txt @@ -57,3 +57,6 @@ pyopenssl<=22.0.0 # newer jsonschema needs rustc and cargo jsonschema<4.18.0 + +pydantic < 2 +cosl diff --git a/ceph-mon/tests/bundles/jammy-zed.yaml b/ceph-mon/tests/bundles/jammy-zed.yaml deleted file mode 100644 index 64ade939..00000000 --- a/ceph-mon/tests/bundles/jammy-zed.yaml +++ /dev/null @@ -1,259 +0,0 @@ -variables: - openstack-origin: &openstack-origin cloud:jammy-zed - -series: jammy - -comment: -- 'machines section to decide order of deployment. database sooner = faster' -machines: - '0': - constraints: mem=3072M - '1': - constraints: mem=3072M - '2': - constraints: mem=3072M - '3': - '4': - '5': - '6': - '7': - '8': - '9': - '10': - '11': - '12': - '13': - '14': - '15': - '16': - '17': - - -applications: - - keystone-mysql-router: - charm: ch:mysql-router - channel: 8.0/edge - glance-mysql-router: - charm: ch:mysql-router - channel: 8.0/edge - cinder-mysql-router: - charm: ch:mysql-router - channel: 8.0/edge - nova-cloud-controller-mysql-router: - charm: ch:mysql-router - channel: 8.0/edge - placement-mysql-router: - charm: ch:mysql-router - channel: 8.0/edge - - mysql-innodb-cluster: - charm: ch:mysql-innodb-cluster - num_units: 3 - to: - - '0' - - '1' - - '2' - channel: 8.0/edge - - ceph-osd: - charm: ch:ceph-osd - num_units: 3 - storage: - osd-devices: '10G' - options: - source: *openstack-origin - osd-devices: '/dev/test-non-existent' - to: - - '3' - - '4' - - '5' - channel: quincy/edge - - ceph-mon: - charm: ch:ceph-mon - channel: quincy/edge - num_units: 3 - options: - source: *openstack-origin - monitor-count: '3' - to: - - '6' - - '7' - - '8' - - ceph-fs: - charm: ch:ceph-fs - num_units: 1 - options: - source: *openstack-origin - channel: quincy/edge - to: - - '17' - - rabbitmq-server: - charm: ch:rabbitmq-server - num_units: 1 - to: - - '9' - channel: 3.9/edge - - keystone: - expose: True - charm: ch:keystone - num_units: 1 - options: - openstack-origin: *openstack-origin - to: - - '10' - channel: zed/edge - - nova-compute: - charm: ch:nova-compute - num_units: 1 - options: - openstack-origin: *openstack-origin - libvirt-image-backend: rbd - to: - - '11' - channel: zed/edge - - glance: - expose: True - charm: ch:glance - num_units: 1 - options: - openstack-origin: *openstack-origin - to: - - '12' - channel: zed/edge - - cinder: - expose: True - charm: ch:cinder - num_units: 1 - options: - block-device: 'None' - glance-api-version: '2' - openstack-origin: *openstack-origin - to: - - '13' - channel: zed/edge - - cinder-ceph: - charm: ch:cinder-ceph - channel: zed/edge - - nova-cloud-controller: - expose: True - charm: ch:nova-cloud-controller - num_units: 1 - options: - openstack-origin: *openstack-origin - to: - - '14' - channel: zed/edge - - placement: - charm: ch:placement - num_units: 1 - options: - openstack-origin: *openstack-origin - to: - - '15' - channel: zed/edge - - prometheus2: - charm: ch:prometheus2 - num_units: 1 - to: - - '16' - -relations: - - - 'nova-compute:amqp' - - 'rabbitmq-server:amqp' - - - - 'nova-compute:image-service' - - 'glance:image-service' - - - - 'nova-compute:ceph' - - 'ceph-mon:client' - - - - nova-compute:ceph-access - - cinder-ceph:ceph-access - - - - 'keystone:shared-db' - - 'keystone-mysql-router:shared-db' - - - 'keystone-mysql-router:db-router' - - 'mysql-innodb-cluster:db-router' - - - - 'glance:shared-db' - - 'glance-mysql-router:shared-db' - - - 'glance-mysql-router:db-router' - - 'mysql-innodb-cluster:db-router' - - - - 'glance:identity-service' - - 'keystone:identity-service' - - - - 'glance:amqp' - - 'rabbitmq-server:amqp' - - - - 'glance:ceph' - - 'ceph-mon:client' - - - - 'cinder:shared-db' - - 'cinder-mysql-router:shared-db' - - - 'cinder-mysql-router:db-router' - - 'mysql-innodb-cluster:db-router' - - - - 'cinder:identity-service' - - 'keystone:identity-service' - - - - 'cinder:amqp' - - 'rabbitmq-server:amqp' - - - - 'cinder:image-service' - - 'glance:image-service' - - - - 'cinder-ceph:storage-backend' - - 'cinder:storage-backend' - - - - 'cinder-ceph:ceph' - - 'ceph-mon:client' - - - - 'ceph-osd:mon' - - 'ceph-mon:osd' - - - - 'ceph-mon:mds' - - 'ceph-fs:ceph-mds' - - - - 'nova-cloud-controller:shared-db' - - 'nova-cloud-controller-mysql-router:shared-db' - - - 'nova-cloud-controller-mysql-router:db-router' - - 'mysql-innodb-cluster:db-router' - - - - 'nova-cloud-controller:identity-service' - - 'keystone:identity-service' - - - - 'nova-cloud-controller:amqp' - - 'rabbitmq-server:amqp' - - - - 'nova-cloud-controller:cloud-compute' - - 'nova-compute:cloud-compute' - - - - 'nova-cloud-controller:image-service' - - 'glance:image-service' - - - - 'placement:shared-db' - - 'placement-mysql-router:shared-db' - - - 'placement-mysql-router:db-router' - - 'mysql-innodb-cluster:db-router' - - - - 'placement' - - 'keystone' - - - - 'placement' - - 'nova-cloud-controller' - - - - 'ceph-mon:prometheus' - - 'prometheus2:target' diff --git a/ceph-mon/tests/bundles/kinetic-zed.yaml b/ceph-mon/tests/bundles/kinetic-zed.yaml deleted file mode 100644 index 24818960..00000000 --- a/ceph-mon/tests/bundles/kinetic-zed.yaml +++ /dev/null @@ -1,265 +0,0 @@ -variables: - openstack-origin: &openstack-origin distro - -series: jammy - -comment: -- 'machines section to decide order of deployment. database sooner = faster' -machines: - '0': - constraints: mem=3072M - '1': - constraints: mem=3072M - '2': - constraints: mem=3072M - '3': - '4': - '5': - '6': - '7': - '8': - '9': - '10': - '11': - '12': - '13': - '14': - '15': - '16': - series: focal - '17': - - -applications: - - keystone-mysql-router: - charm: ch:mysql-router - channel: 8.0.19/edge - glance-mysql-router: - charm: ch:mysql-router - channel: 8.0.19/edge - cinder-mysql-router: - charm: ch:mysql-router - channel: 8.0.19/edge - nova-cloud-controller-mysql-router: - charm: ch:mysql-router - channel: 8.0.19/edge - placement-mysql-router: - charm: ch:mysql-router - channel: 8.0.19/edge - - mysql-innodb-cluster: - charm: ch:mysql-innodb-cluster - num_units: 3 - options: - source: *openstack-origin - to: - - '0' - - '1' - - '2' - channel: 8.0.19/edge - - ceph-osd: - charm: ch:ceph-osd - num_units: 3 - storage: - osd-devices: '10G' - options: - source: *openstack-origin - osd-devices: '/dev/test-non-existent' - to: - - '3' - - '4' - - '5' - channel: quincy/edge - - ceph-mon: - charm: ch:ceph-mon - channel: quincy/edge - num_units: 3 - options: - source: *openstack-origin - monitor-count: '3' - to: - - '6' - - '7' - - '8' - - ceph-fs: - charm: ch:ceph-fs - num_units: 1 - options: - source: *openstack-origin - channel: quincy/edge - to: - - '17' - - rabbitmq-server: - charm: ch:rabbitmq-server - num_units: 1 - options: - source: *openstack-origin - to: - - '9' - channel: 3.9/edge - - keystone: - expose: True - charm: ch:keystone - num_units: 1 - options: - openstack-origin: *openstack-origin - to: - - '10' - channel: yoga/edge - - nova-compute: - charm: ch:nova-compute - num_units: 1 - options: - openstack-origin: *openstack-origin - libvirt-image-backend: rbd - to: - - '11' - channel: yoga/edge - - glance: - expose: True - charm: ch:glance - num_units: 1 - options: - openstack-origin: *openstack-origin - to: - - '12' - channel: yoga/edge - - cinder: - expose: True - charm: ch:cinder - num_units: 1 - options: - block-device: 'None' - glance-api-version: '2' - openstack-origin: *openstack-origin - to: - - '13' - channel: yoga/edge - - cinder-ceph: - charm: ch:cinder-ceph - channel: yoga/edge - - nova-cloud-controller: - expose: True - charm: ch:nova-cloud-controller - num_units: 1 - options: - openstack-origin: *openstack-origin - to: - - '14' - channel: yoga/edge - - placement: - charm: ch:placement - num_units: 1 - options: - openstack-origin: *openstack-origin - to: - - '15' - channel: yoga/edge - - prometheus2: - charm: ch:prometheus2 - num_units: 1 - series: focal - to: - - '16' - -relations: - - - 'nova-compute:amqp' - - 'rabbitmq-server:amqp' - - - - 'nova-compute:image-service' - - 'glance:image-service' - - - - 'nova-compute:ceph' - - 'ceph-mon:client' - - - - nova-compute:ceph-access - - cinder-ceph:ceph-access - - - - 'keystone:shared-db' - - 'keystone-mysql-router:shared-db' - - - 'keystone-mysql-router:db-router' - - 'mysql-innodb-cluster:db-router' - - - - 'glance:shared-db' - - 'glance-mysql-router:shared-db' - - - 'glance-mysql-router:db-router' - - 'mysql-innodb-cluster:db-router' - - - - 'glance:identity-service' - - 'keystone:identity-service' - - - - 'glance:amqp' - - 'rabbitmq-server:amqp' - - - - 'glance:ceph' - - 'ceph-mon:client' - - - - 'cinder:shared-db' - - 'cinder-mysql-router:shared-db' - - - 'cinder-mysql-router:db-router' - - 'mysql-innodb-cluster:db-router' - - - - 'cinder:identity-service' - - 'keystone:identity-service' - - - - 'cinder:amqp' - - 'rabbitmq-server:amqp' - - - - 'cinder:image-service' - - 'glance:image-service' - - - - 'cinder-ceph:storage-backend' - - 'cinder:storage-backend' - - - - 'cinder-ceph:ceph' - - 'ceph-mon:client' - - - - 'ceph-osd:mon' - - 'ceph-mon:osd' - - - - 'ceph-mon:mds' - - 'ceph-fs:ceph-mds' - - - - 'nova-cloud-controller:shared-db' - - 'nova-cloud-controller-mysql-router:shared-db' - - - 'nova-cloud-controller-mysql-router:db-router' - - 'mysql-innodb-cluster:db-router' - - - - 'nova-cloud-controller:identity-service' - - 'keystone:identity-service' - - - - 'nova-cloud-controller:amqp' - - 'rabbitmq-server:amqp' - - - - 'nova-cloud-controller:cloud-compute' - - 'nova-compute:cloud-compute' - - - - 'nova-cloud-controller:image-service' - - 'glance:image-service' - - - - 'placement:shared-db' - - 'placement-mysql-router:shared-db' - - - 'placement-mysql-router:db-router' - - 'mysql-innodb-cluster:db-router' - - - - 'placement' - - 'keystone' - - - - 'placement' - - 'nova-cloud-controller' - - - - 'ceph-mon:prometheus' - - 'prometheus2:target' diff --git a/ceph-mon/unit_tests/test_ceph_metrics.py b/ceph-mon/unit_tests/test_ceph_metrics.py index f93141f1..6e11c3b2 100644 --- a/ceph-mon/unit_tests/test_ceph_metrics.py +++ b/ceph-mon/unit_tests/test_ceph_metrics.py @@ -17,8 +17,7 @@ import helpers -@helpers.patch_network_get() -class TestCephMetrics(unittest.TestCase): +class CephMetricsTestBase(unittest.TestCase): @classmethod def setUpClass(cls): """Run once before tests begin.""" @@ -33,11 +32,17 @@ def setUpClass(cls): rules: [] """ ) + rules_file = cls.rules_dir / "alert-rules.yaml" + with rules_file.open("w") as f: + f.write(cls.rules) @classmethod def tearDownClass(cls): cls.tempdir.cleanup() + +@helpers.patch_network_get() +class TestCephMetrics(CephMetricsTestBase): def setUp(self): super().setUp() self.harness = Harness(charm.CephMonCharm) @@ -134,3 +139,60 @@ def test_update_alert_rules(self, _is_bootstrapped): self.harness.charm.metrics_endpoint.update_alert_rules() alert_rules = self.get_alert_rules(rel_id) self.assertTrue(alert_rules.get("groups")) + + +class TestCephCOSAgentProvider(CephMetricsTestBase): + def setUp(self): + super().setUp() + self.harness = Harness(charm.CephMonCharm) + self.addCleanup(self.harness.cleanup) + self.harness.begin() + self.harness.set_leader(True) + self.harness.charm.cos_agent._metrics_rules = self.rules_dir + + def test_init(self): + self.assertEqual( + self.harness.charm.cos_agent._relation_name, + "cos-agent", + ) + + @patch("ceph_metrics.mgr_config_set_rbd_stats_pools", lambda: None) + @patch("ceph_metrics.ceph_utils.is_bootstrapped", return_value=True) + @patch("ceph_metrics.ceph_utils.is_mgr_module_enabled", return_value=False) + @patch("ceph_metrics.ceph_utils.mgr_enable_module") + @patch("ceph_metrics.ceph_utils.mgr_disable_module") + def test_add_remove_rel( + self, + mgr_disable_module, + mgr_enable_module, + _is_mgr_module_enable, + _is_bootstrapped, + ): + rel_id = self.harness.add_relation("cos-agent", "grafana-agent") + self.harness.add_relation_unit(rel_id, "grafana-agent/0") + + unit_rel_data = self.harness.get_relation_data( + rel_id, self.harness.model.unit + ) + data = json.loads(unit_rel_data["config"]) + self.assertTrue("metrics_scrape_jobs" in data) + self.assertEqual( + data["metrics_scrape_jobs"][0]["metrics_path"], "/metrics" + ) + self.assertTrue("metrics_alert_rules" in data) + self.assertTrue("groups" in data["metrics_alert_rules"]) + mgr_enable_module.assert_called_once() + + self.harness.remove_relation(rel_id) + mgr_disable_module.assert_called_once() + + @patch("socket.getfqdn", return_value="node1.ceph.example.com") + def test_custom_scrape_configs(self, _mock_getfqdn): + configs = self.harness.charm.cos_agent._custom_scrape_configs() + self.assertEqual( + configs[0]["static_configs"][0]["targets"], ["localhost:9283"] + ) + self.assertEqual( + configs[0]["metric_relabel_configs"][0]["replacement"], + "ceph_cluster", + )