diff --git a/definitions/grafana/grafana-dashboards/k8s-cluster.json b/definitions/grafana/grafana-dashboards/k8s-cluster.json index 4c9d007..f6b15ca 100644 --- a/definitions/grafana/grafana-dashboards/k8s-cluster.json +++ b/definitions/grafana/grafana-dashboards/k8s-cluster.json @@ -74,7 +74,7 @@ }, "targets": [ { - "expr": "sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}) / sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"}) * 100", + "expr": "sum (container_memory_working_set_bytes{id=\"/\",instance=~\"^$Node$\"}) / sum (machine_memory_bytes{instance=~\"^$Node$\"}) * 100", "interval": "10s", "intervalFactor": 1, "legendFormat": "", @@ -154,7 +154,7 @@ }, "targets": [ { - "expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) / sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"}) * 100", + "expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",instance=~\"^$Node$\"}[1m])) / sum (machine_cpu_cores{instance=~\"^$Node$\"}) * 100", "interval": "10s", "intervalFactor": 1, "legendFormat": "", @@ -316,7 +316,7 @@ }, "targets": [ { - "expr": "sum (container_memory_working_set_bytes{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})", + "expr": "sum (container_memory_working_set_bytes{id=\"/\",instance=~\"^$Node$\"})", "interval": "10s", "intervalFactor": 1, "refId": "A", @@ -396,7 +396,7 @@ }, "targets": [ { - "expr": "sum (machine_memory_bytes{kubernetes_io_hostname=~\"^$Node$\"})", + "expr": "sum (machine_memory_bytes{instance=~\"^$Node$\"})", "interval": "10s", "intervalFactor": 1, "legendFormat": "", @@ -477,7 +477,7 @@ }, "targets": [ { - "expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",kubernetes_io_hostname=~\"^$Node$\"}[1m]))", + "expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",instance=~\"^$Node$\"}[1m]))", "interval": "10s", "intervalFactor": 1, "legendFormat": "", @@ -558,7 +558,7 @@ }, "targets": [ { - "expr": "sum (machine_cpu_cores{kubernetes_io_hostname=~\"^$Node$\"})", + "expr": "sum (machine_cpu_cores{instance=~\"^$Node$\"})", "interval": "10s", "intervalFactor": 1, "legendFormat": "", @@ -639,7 +639,7 @@ }, "targets": [ { - "expr": "sum (container_fs_usage_bytes{device=\"/dev/xvda9\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})", + "expr": "sum (container_fs_usage_bytes{device=\"/dev/xvda9\",id=\"/\",instance=~\"^$Node$\"})", "interval": "10s", "intervalFactor": 1, "legendFormat": "", @@ -720,7 +720,7 @@ }, "targets": [ { - "expr": "sum (container_fs_limit_bytes{device=\"/dev/xvda9\",id=\"/\",kubernetes_io_hostname=~\"^$Node$\"})", + "expr": "sum (container_fs_limit_bytes{device=\"/dev/xvda9\",id=\"/\",instance=~\"^$Node$\"})", "interval": "10s", "intervalFactor": 1, "legendFormat": "", @@ -800,7 +800,7 @@ }, "targets": [ { - "expr": "sum(kubelet_running_pod_count{instance=~\"^$Node$\",job=\"kubernetes-nodes\"})", + "expr": "sum(kubelet_running_pod_count{instance=~\"^$Node$\",job=\"kubelets\"})", "interval": "10s", "intervalFactor": 2, "legendFormat": "", @@ -880,7 +880,7 @@ }, "targets": [ { - "expr": "sum(kubelet_running_container_count{instance=~\"^$Node$\",job=\"kubernetes-nodes\"})", + "expr": "sum(kubelet_running_container_count{instance=~\"^$Node$\",job=\"kubelets\"})", "interval": "10s", "intervalFactor": 2, "legendFormat": "", @@ -1151,7 +1151,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum (rate (container_cpu_usage_seconds_total{image!=\"\",kubernetes_io_hostname=~\"^$Node$\"}[1m])) by (pod_name)", + "expr": "sum (rate (container_cpu_usage_seconds_total{image!=\"\",instance=~\"^$Node$\"}[1m])) by (pod_name)", "interval": "10s", "intervalFactor": 1, "legendFormat": "{{ pod_name }}", @@ -1256,7 +1256,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",kubernetes_io_hostname=~\"^$Node$\"}) by (pod_name)", + "expr": "sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$Node$\"}) by (pod_name)", "interval": "10s", "intervalFactor": 1, "legendFormat": "{{ pod_name }}", @@ -1353,7 +1353,7 @@ "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{kubernetes_io_hostname=~\"^$Node$\",name!=\"\", pod_name=~\".*\"}[1m]) ))", + "expr": "sort_desc(sum by (pod_name) (rate (container_network_receive_bytes_total{instance=~\"^$Node$\",name!=\"\", pod_name=~\".*\"}[1m]) ))", "interval": "10s", "intervalFactor": 1, "legendFormat": "{{ pod_name }}Reveived", @@ -1362,7 +1362,7 @@ "step": 10 }, { - "expr": "- sort_desc(sum by (pod_name) (rate (container_network_transmit_bytes_total{kubernetes_io_hostname=~\"^$Node$\",name!=\"\", pod_name=~\".*\"}[1m]) ))", + "expr": "- sort_desc(sum by (pod_name) (rate (container_network_transmit_bytes_total{instance=~\"^$Node$\",name!=\"\", pod_name=~\".*\"}[1m]) ))", "interval": "10s", "intervalFactor": 1, "legendFormat": "{{ pod_name }}Sent", @@ -1490,7 +1490,7 @@ "selected": false } ], - "query": "label_values(kubelet_running_pod_count{job=\"kubernetes-nodes\"}, instance)", + "query": "label_values(kubelet_running_pod_count{job=\"kubelets\"}, instance)", "refresh": 1, "regex": "", "type": "query" diff --git a/definitions/grafana/grafana-dashboards/k8s-nodes.json b/definitions/grafana/grafana-dashboards/k8s-nodes.json index 6e18853..b147515 100644 --- a/definitions/grafana/grafana-dashboards/k8s-nodes.json +++ b/definitions/grafana/grafana-dashboards/k8s-nodes.json @@ -75,7 +75,7 @@ }, "targets": [ { - "expr": "node_load1{app=\"node-exporter\",instance=~\"$node:.*\",job=\"kubernetes-service-endpoints\",name=\"node-exporter\"}", + "expr": "node_load1{instance=~\"$node:.*\",job=\"node-exporter\"}", "intervalFactor": 2, "legendFormat": "", "refId": "A", @@ -155,7 +155,7 @@ }, "targets": [ { - "expr": "up{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\"}", + "expr": "up{job=\"node-exporter\", instance=~\"$node:.*\"}", "intervalFactor": 2, "legendFormat": "", "refId": "A", @@ -238,7 +238,7 @@ }, "targets": [ { - "expr": "count(count by(cpu)(node_cpu{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\"}))\t", + "expr": "count(count by(cpu)(node_cpu{job=\"node-exporter\", instance=~\"$node:.*\"}))\t", "intervalFactor": 10, "legendFormat": "", "metric": "", @@ -317,7 +317,7 @@ }, "targets": [ { - "expr": "node_memory_MemTotal{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\"}", + "expr": "node_memory_MemTotal{job=\"node-exporter\", instance=~\"$node:.*\"}", "intervalFactor": 10, "legendFormat": "", "metric": "", @@ -396,7 +396,7 @@ }, "targets": [ { - "expr": "sum(node_filesystem_size{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\", device=~\"overlay\", mountpoint!=\"/var/lib/docker/aufs\"})", + "expr": "sum(node_filesystem_size{job=\"node-exporter\", instance=~\"$node:.*\", device=~\"overlay\", mountpoint!=\"/var/lib/docker/aufs\"})", "intervalFactor": 10, "legendFormat": "", "metric": "", @@ -475,7 +475,7 @@ }, "targets": [ { - "expr": "time() - node_boot_time{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\"}", + "expr": "time() - node_boot_time{job=\"node-exporter\", instance=~\"$node:.*\"}", "intervalFactor": 10, "legendFormat": "", "metric": "node_boot_time", @@ -552,7 +552,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by(mode)(irate(node_cpu{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\", mode!=\"idle\"}[5m])) > 0", + "expr": "sum by(mode)(irate(node_cpu{job=\"node-exporter\", instance=~\"$node:.*\", mode!=\"idle\"}[5m])) > 0", "interval": "", "intervalFactor": 2, "legendFormat": "{{mode}}", @@ -666,7 +666,7 @@ "steppedLine": false, "targets": [ { - "expr": "node_memory_Buffers{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\"}", + "expr": "node_memory_Buffers{job=\"node-exporter\", instance=~\"$node:.*\"}", "interval": "", "intervalFactor": 2, "legendFormat": "buffers", @@ -674,7 +674,7 @@ "step": 10 }, { - "expr": "node_memory_Cached{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\"}", + "expr": "node_memory_Cached{job=\"node-exporter\", instance=~\"$node:.*\"}", "interval": "", "intervalFactor": 2, "legendFormat": "cached", @@ -682,7 +682,7 @@ "step": 10 }, { - "expr": "node_memory_MemFree{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\"}", + "expr": "node_memory_MemFree{job=\"node-exporter\", instance=~\"$node:.*\"}", "interval": "", "intervalFactor": 2, "legendFormat": "free", @@ -690,7 +690,7 @@ "step": 10 }, { - "expr": "node_memory_MemTotal{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\"} - node_memory_MemFree{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\"} - node_memory_Cached{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\"} - node_memory_Buffers{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\"}", + "expr": "node_memory_MemTotal{job=\"node-exporter\", instance=~\"$node:.*\"} - node_memory_MemFree{job=\"node-exporter\", instance=~\"$node:.*\"} - node_memory_Cached{job=\"node-exporter\", instance=~\"$node:.*\"} - node_memory_Buffers{job=\"node-exporter\", instance=~\"$node:.*\"}", "interval": "", "intervalFactor": 2, "legendFormat": "used", @@ -789,7 +789,7 @@ "steppedLine": false, "targets": [ { - "expr": "irate(node_network_transmit_bytes{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\", device!~\"lo|bond[0-9]|cbr[0-9]|veth.*\"}[5m]) > 0", + "expr": "irate(node_network_transmit_bytes{job=\"node-exporter\", instance=~\"$node:.*\", device!~\"lo|bond[0-9]|cbr[0-9]|veth.*\"}[5m]) > 0", "interval": "", "intervalFactor": 2, "legendFormat": "{{device}} outbound", @@ -798,7 +798,7 @@ "step": 10 }, { - "expr": "irate(node_network_receive_bytes{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\", device!~\"lo|bond[0-9]|cbr[0-9]|veth.*\"}[5m]) > 0", + "expr": "irate(node_network_receive_bytes{job=\"node-exporter\", instance=~\"$node:.*\", device!~\"lo|bond[0-9]|cbr[0-9]|veth.*\"}[5m]) > 0", "hide": false, "interval": "", "intervalFactor": 2, @@ -902,7 +902,7 @@ "steppedLine": false, "targets": [ { - "expr": "irate(node_disk_io_time_ms{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\"}[5m]) / 1000", + "expr": "irate(node_disk_io_time_ms{job=\"node-exporter\", instance=~\"$node:.*\"}[5m]) / 1000", "interval": "", "intervalFactor": 2, "legendFormat": "{{device}}", @@ -1012,7 +1012,7 @@ "steppedLine": false, "targets": [ { - "expr": "node_filesystem_size{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\", device=\"overlay\", mountpoint=\"/\"} - node_filesystem_free{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\", device=\"overlay\", mountpoint=\"/\"}", + "expr": "node_filesystem_size{job=\"node-exporter\", instance=~\"$node:.*\", device=\"overlay\", mountpoint=\"/\"} - node_filesystem_free{job=\"node-exporter\", instance=~\"$node:.*\", device=\"overlay\", mountpoint=\"/\"}", "interval": "", "intervalFactor": 2, "legendFormat": "used", @@ -1021,7 +1021,7 @@ "step": 10 }, { - "expr": "node_filesystem_free{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\", device=\"overlay\", mountpoint=\"/\"}", + "expr": "node_filesystem_free{job=\"node-exporter\", instance=~\"$node:.*\", device=\"overlay\", mountpoint=\"/\"}", "interval": "", "intervalFactor": 2, "legendFormat": "free", @@ -1139,7 +1139,7 @@ "selected": false } ], - "query": "label_values(node_load1{job=\"kubernetes-service-endpoints\"}, instance)", + "query": "label_values(node_load1{job=\"node-exporter\"}, instance)", "refresh": 1, "regex": "/([^:]+):.*/", "type": "query", @@ -1170,7 +1170,7 @@ "selected": false } ], - "query": "node_filesystem_size{job=\"kubernetes-service-endpoints\", instance=~\"$node:.*\", device=\"overlay\", mountpoint=\"/\"}", + "query": "node_filesystem_size{job=\"node-exporter\", instance=~\"$node:.*\", device=\"overlay\", mountpoint=\"/\"}", "refresh": 1, "regex": "/mountpoint=\"([^\"]+)/", "type": "query" diff --git a/definitions/grafana/grafana-dashboards/prometheus-data-exploration.json b/definitions/grafana/grafana-dashboards/prometheus-data-exploration.json index 885e20e..5b40e5b 100644 --- a/definitions/grafana/grafana-dashboards/prometheus-data-exploration.json +++ b/definitions/grafana/grafana-dashboards/prometheus-data-exploration.json @@ -2,7 +2,9 @@ "id": 3, "title": "Prometheus Data Exploration", "description": "VERY simple dashboard to VERY easily view and explore Prometheus data. Just click away.", - "tags": [], + "tags": [ + "prometheus" + ], "style": "dark", "timezone": "browser", "editable": true, diff --git a/definitions/grafana/grafana-dashboards/prometheus-stats.json b/definitions/grafana/grafana-dashboards/prometheus-stats.json index c57cfcb..b3a6716 100644 --- a/definitions/grafana/grafana-dashboards/prometheus-stats.json +++ b/definitions/grafana/grafana-dashboards/prometheus-stats.json @@ -1,5 +1,5 @@ { - "id": 2, + "id": 1, "title": "Prometheus Stats Extended", "description": "VIew Prometheus internal metrics: an extension of the standard dashboard.", "tags": [ @@ -8,7 +8,7 @@ "style": "dark", "timezone": "browser", "editable": true, - "hideControls": true, + "hideControls": false, "sharedCrosshair": true, "rows": [ { @@ -74,11 +74,11 @@ }, "targets": [ { - "expr": "(time() - process_start_time_seconds{name=\"prometheus\"})", + "expr": "(time() - process_start_time_seconds{job=\"prometheus\"})", "intervalFactor": 2, + "legendFormat": "", "refId": "A", - "step": 4, - "legendFormat": "" + "step": 4 } ], "thresholds": "", @@ -528,7 +528,7 @@ "steppedLine": false, "targets": [ { - "expr": "prometheus_evaluator_duration_milliseconds{quantile!=\"0.01\", quantile!=\"0.05\"}", + "expr": "prometheus_evaluator_duration_seconds{quantile!=\"0.01\", quantile!=\"0.05\"}", "interval": "", "intervalFactor": 2, "legendFormat": "{{quantile}}", @@ -551,7 +551,7 @@ }, "yaxes": [ { - "format": "percentunit", + "format": "s", "label": "", "logBase": 1, "max": null, @@ -1268,7 +1268,7 @@ }, "refresh": "10s", "schemaVersion": 12, - "version": 5, + "version": 0, "links": [ { "icon": "info", diff --git a/definitions/k8s/prometheus/01-prometheus.configmap.yaml b/definitions/k8s/prometheus/01-prometheus.configmap.yaml index 84e11ca..d8ad336 100644 --- a/definitions/k8s/prometheus/01-prometheus.configmap.yaml +++ b/definitions/k8s/prometheus/01-prometheus.configmap.yaml @@ -6,95 +6,67 @@ metadata: data: prometheus.yml: |- global: - scrape_interval: 10s + evaluation_interval: 30s scrape_configs: - - job_name: 'kubernetes-cluster' + + - job_name: kubelets + + scrape_interval: 20s scheme: https tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt insecure_skip_verify: true bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token kubernetes_sd_configs: - - api_servers: - - 'https://kubernetes.default.svc' - in_cluster: true - role: apiserver - - job_name: 'kubernetes-nodes' - scheme: https + - role: node + + - job_name: standard-endpoints + + scrape_interval: 20s tls_config: ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt insecure_skip_verify: true bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token kubernetes_sd_configs: - - api_servers: - - 'https://kubernetes.default.svc' - in_cluster: true - role: node + - role: endpoints relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - job_name: 'kubernetes-service-endpoints' - kubernetes_sd_configs: - - api_servers: - - 'https://kubernetes.default.svc' - in_cluster: true - role: endpoint - relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] - action: replace + - action: keep + source_labels: [__meta_kubernetes_service_name] + regex: kubernetes|node-exporter|kube-state-metrics|etcd-k8s|prometheus + - action: replace + source_labels: [__meta_kubernetes_service_name] + target_label: job + - action: replace + source_labels: [__meta_kubernetes_service_name] + regex: kubernetes target_label: __scheme__ - regex: (https?) - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] - action: replace - target_label: __address__ - regex: (.+)(?::\d+);(\d+) - replacement: $1:$2 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_service_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_service_name] - action: replace - target_label: kubernetes_name - - job_name: 'kubernetes-services' - metrics_path: /probe - params: - module: [http_2xx] + replacement: https + + - job_name: kube-components + + scrape_interval: 20s + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token kubernetes_sd_configs: - - api_servers: - - 'https://kubernetes.default.svc' - in_cluster: true - role: service + - role: endpoints relabel_configs: - - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] - action: keep - regex: true - - source_labels: [__address__] - target_label: __param_target - - target_label: __address__ - replacement: blackbox - - source_labels: [__param_target] - target_label: instance - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: [__meta_kubernetes_service_namespace] - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_service_name] - target_label: kubernetes_name + - action: replace + source_labels: [__meta_kubernetes_service_name] + target_label: job + regex: "kube-(.*)-prometheus-discovery" + replacement: "kube-${1}" + - action: keep + source_labels: [__meta_kubernetes_service_name] + regex: "kube-(.*)-prometheus-discovery" + - action: keep + source_labels: [__meta_kubernetes_endpoint_port_name] + regex: "prometheus" + - job_name: 'kubernetes-pods' + kubernetes_sd_configs: - - api_servers: - - 'https://kubernetes.default.svc' - in_cluster: true - role: pod + - role: pod relabel_configs: - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] action: keep @@ -116,7 +88,9 @@ data: - source_labels: [__meta_kubernetes_pod_name] action: replace target_label: kubernetes_pod_name + - job_name: ec2 + ec2_sd_configs: - region: us-east-1 access_key: aws_access_key diff --git a/definitions/k8s/prometheus/02-prometheus.svc.petset.yaml b/definitions/k8s/prometheus/02-prometheus.svc.petset.yaml index e8bb40c..2940b96 100644 --- a/definitions/k8s/prometheus/02-prometheus.svc.petset.yaml +++ b/definitions/k8s/prometheus/02-prometheus.svc.petset.yaml @@ -35,7 +35,7 @@ spec: terminationGracePeriodSeconds: 0 containers: - name: prometheus-prom-1 - image: prom/prometheus:v1.2.1 + image: prom/prometheus:v1.3.0 args: - '-storage.local.retention=720h' - '-storage.local.memory-chunks=500000' diff --git a/definitions/k8s/prometheus/04-alertmanager.svc.deployment.yaml b/definitions/k8s/prometheus/04-alertmanager.svc.deployment.yaml index 549eabd..5ce639a 100644 --- a/definitions/k8s/prometheus/04-alertmanager.svc.deployment.yaml +++ b/definitions/k8s/prometheus/04-alertmanager.svc.deployment.yaml @@ -28,7 +28,7 @@ spec: spec: containers: - name: alertmanager - image: prom/alertmanager:v0.4.2 + image: prom/alertmanager:v0.5.0 ports: - containerPort: 9093 imagePullPolicy: Always