initial tracing implementation

canonical · Aug 6, 2024 · e2a5e9c · e2a5e9c
1 parent 47ad768
commit e2a5e9c
Show file tree

Hide file tree

Showing 21 changed files with 23,912 additions and 69 deletions.
diff --git a/grafana_dashboards/node-exporter-full.json b/grafana_dashboards/node-exporter-full.json
diff --git a/loki_alert_rules/grafana_agent_high_rate.rule b/loki_alert_rules/grafana_agent_high_rate.rule
@@ -0,0 +1,10 @@
+groups:
+  - name: grafana-agent-high-log-volume
+    rules:
+      - alert: HighLogVolume  # more than 100 log lines counted over a 30s window
+        expr: |
+          count_over_time(({%%juju_topology%%})[30s]) > 100
+        labels:
+          severity: high
+        annotations:
+          summary: 'Log rate is too high!'
diff --git a/loki_alert_rules/high_error_rate.rule b/loki_alert_rules/high_error_rate.rule
@@ -0,0 +1,8 @@
+alert: HostHighLogErrorRate  # more than 100 "error" lines from job "varlogs" in one hour
+expr: 'count_over_time({job="varlogs"} |= "error" [1h]) > 100'
+for: 0m
+labels:
+  severity: warning
+annotations:
+  summary: 'High error rate in logs (instance {{ $labels.instance }})'
+  description: "High error rate in logs\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/metadata.yaml b/metadata.yaml
@@ -64,6 +64,8 @@ requires:
     limit: 1
 
 provides:
+  tracing-provider:
+    interface: tracing
   logging-provider:
     interface: loki_push_api
   grafana-dashboards-provider:

diff --git a/prometheus_alert_rules/.gitkeep b/prometheus_alert_rules/.gitkeep
diff --git a/prometheus_alert_rules/.wokeignore b/prometheus_alert_rules/.wokeignore
@@ -0,0 +1 @@
+network.rules
diff --git a/prometheus_alert_rules/arp_cache.rules b/prometheus_alert_rules/arp_cache.rules
@@ -0,0 +1,14 @@
+groups:
+  - name: HostArpCache
+    rules:
+      - alert: HostArpCache  # ARP entries at or above 80% of gc_thresh3
+        expr: 100 * node_arp_entries / node_sysctl_net_ipv4_neigh_default_gc_thresh3 >= 80
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: 'Host arp cache reached {{ $value | printf "%.0f" }}% limit (instance {{ $labels.instance }})'
+          description: >-
+            Host arp cache reached {{ $value | printf "%.0f" }}% limit.
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
diff --git a/prometheus_alert_rules/conntrack.rules b/prometheus_alert_rules/conntrack.rules
@@ -0,0 +1,14 @@
+groups:
+- name: HostConntrack
+  rules:
+  - alert: HostConntrackLimit  # conntrack entries above 80% of the configured limit
+    expr: 100 * (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 80
+    for: 2m  # condition must hold for two minutes before the alert fires
+    labels:
+      severity: warning
+    annotations:
+      summary: Host conntrack reached {{ $value | printf "%.0f" }}% (instance {{ $labels.instance }})
+      description: >-
+        Host conntrack reached {{ $value | printf "%.0f" }}% usage.
+          VALUE = {{ $value }}
+          LABELS = {{ $labels }}
diff --git a/prometheus_alert_rules/disk.rules b/prometheus_alert_rules/disk.rules
@@ -0,0 +1,75 @@
+groups:
+- name: HostDisk
+  rules:
+  - record: used_disk_space  # percentage of each filesystem that is not free
+    expr: |
+      100 * (1 - (
+        node_filesystem_free_bytes / node_filesystem_size_bytes)
+      )
+  - alert: HostDiskSpaceFillsUp  # root filesystem predicted to exceed 90% six hours ahead
+    expr: predict_linear(used_disk_space{mountpoint=~"/"}[6h], 6*60*60) > 90
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: "[Prediction] Host filesystem '{{ $labels.mountpoint }}' is using {{ $value | printf \"%.0f\" }}% of the total space (instance {{ $labels.instance }})"
+      description: >-
+        Host filesystem '{{ $labels.mountpoint }}' usage can potentially reach {{ $value | printf "%.0f" }}% of the total space.
+          VALUE = {{ $value }}
+          LABELS = {{ $labels }}
+        The 6-hour-ahead prediction is made as a linear regression from the last 6 hours of data.
+  - alert: HostDiskSpace  # root filesystem currently above 90% used
+    expr: used_disk_space{mountpoint=~"/"} > 90
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: Host filesystem '{{ $labels.mountpoint }}' is using {{ $value | printf "%.0f" }}% of the total space (instance {{ $labels.instance }})
+      description: >-
+        Host filesystem '{{ $labels.mountpoint }}' is using {{ $value | printf "%.0f" }}% of the total space.
+          VALUE = {{ $value }}
+          LABELS = {{ $labels }}
+  - alert: HostReadonlyFilesystem  # snap and cgroup mountpoints are excluded by the matcher
+    expr: node_filesystem_readonly{mountpoint!~"/snap/.*|/sys/fs/cgroup/.*"} > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host filesystem '{{ $labels.mountpoint }}' is readonly (instance {{ $labels.instance }})
+      description: >-
+        Host filesystem '{{ $labels.mountpoint }}' is readonly.
+          VALUE = {{ $value }}
+          LABELS = {{ $labels }}
+  - alert: HostXFSError  # device error reported for an XFS filesystem
+    expr: node_filesystem_device_error{fstype="xfs"} > 0
+    for: 0m
+    labels:
+      severity: critical
+    annotations:
+      summary: XFS error found for device '{{ $labels.device }}' (instance {{ $labels.instance }})
+      description: >-
+        XFS error found for device '{{ $labels.device }}'.
+          VALUE = {{ $value }}
+          LABELS = {{ $labels }}
+  - alert: HostHighDiskReadRate  # read throughput above 50 MiB/s, sustained for 5m
+    expr: irate(node_disk_read_bytes_total[2m]) / 1024 / 1024 > 50
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host high disk '{{ $labels.device }}' read rate (instance {{ $labels.instance }})
+      description: >-
+        Host disk '{{ $labels.device }}' is probably reading too much data ({{ $value | printf "%.0f" }} > 50 MB/s) for last 5m.
+          VALUE = {{ $value }}
+          LABELS = {{ $labels }}
+  - alert: HostHighDiskWriteRate  # write throughput above 50 MiB/s, sustained for 5m
+    expr: irate(node_disk_written_bytes_total[2m]) / 1024 / 1024 > 50
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host high disk '{{ $labels.device }}' write rate (instance {{ $labels.instance }})
+      description: >-
+        Host disk '{{ $labels.device }}' is probably writing too much data ({{ $value | printf "%.0f" }} > 50 MB/s) for last 5m.
+          VALUE = {{ $value }}
+          LABELS = {{ $labels }}
diff --git a/prometheus_alert_rules/high_cpu_iowait.rule b/prometheus_alert_rules/high_cpu_iowait.rule
@@ -0,0 +1,8 @@
+alert: HostCpuHighIowait  # per-instance average iowait share above 10% (5m rate)
+expr: 'avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10'
+for: 0m
+labels:
+  severity: warning
+annotations:
+  summary: 'Host CPU high iowait (instance {{ $labels.instance }})'
+  description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/prometheus_alert_rules/hwmon.rules b/prometheus_alert_rules/hwmon.rules
@@ -0,0 +1,16 @@
+---
+groups:
+  - name: hwmon
+    rules:
+      - alert: HwmonTempAlarm  # fires whenever node_hwmon_temp_alarm is non-zero
+        expr: node_hwmon_temp_alarm != 0
+        for: 0m  # no pending period
+        labels:
+          severity: warning
+        annotations:
+          summary: Chip {{ $labels.chip }} is throwing a temperature alarm on {{ $labels.instance }}
+          description: >-
+            Chip {{ $labels.chip }} is throwing a temperature alarm on {{ $labels.instance }}
+              VALUE = {{ $value }}
+              LABELS = {{ $labels }}
+
diff --git a/prometheus_alert_rules/mdadm.rules b/prometheus_alert_rules/mdadm.rules
@@ -0,0 +1,50 @@
+groups:
+- name: mdadm
+  rules:
+    - alert: RaidDisksFailed  # at least one disk in the "failed" state
+      expr: node_md_disks{state="failed"} > 0
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: "{{ $value }} disks failed on device {{ $labels.device }}.(instance {{ $labels.instance }})"
+        description: >-
+          Disks failed on raid device {{ $labels.device }}.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+
+    - alert: RaidDisksSpare  # at least one disk in the "spare" state
+      expr: node_md_disks{state="spare"} > 0
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: "{{ $value }} disks marked as spare on device {{ $labels.device }}.(instance {{ $labels.instance }})"
+        description: >-
+          Disks marked as spare on raid device {{ $labels.device }}.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+
+    - alert: RaidDeviceInactive  # array reported in the "inactive" state
+      expr: node_md_state{state="inactive"} > 0
+      for: 0m
+      labels:
+        severity: critical
+      annotations:
+        summary: RAID device {{ $labels.device }} in inactive state.(instance {{ $labels.instance }})
+        description: >-
+          RAID device {{ $labels.device }} in inactive state.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
+
+    - alert: RaidDeviceRecovering  # array reported in the "recovering" state
+      expr: node_md_state{state="recovering"} > 0
+      for: 0m
+      labels:
+        severity: warning
+      annotations:
+        summary: RAID device {{ $labels.device }} in recovering state.(instance {{ $labels.instance }})
+        description: >-
+          RAID device {{ $labels.device }} in recovering state.
+            VALUE = {{ $value }}
+            LABELS = {{ $labels }}
diff --git a/prometheus_alert_rules/memory.rules b/prometheus_alert_rules/memory.rules
@@ -0,0 +1,72 @@
+groups:
+- name: HostMemory
+  rules:
+  - record: node_memory_MemUsed_percentage  # 100 * (1 - (free + cached + buffers + sreclaimable) / total)
+    expr: |
+      100 * (1 - (
+        (
+          node_memory_MemFree_bytes
+          + node_memory_Cached_bytes
+          + node_memory_Buffers_bytes
+          + node_memory_SReclaimable_bytes
+        ) / node_memory_MemTotal_bytes
+      ))
+  - record: node_memory_SwapUsed_percentage  # 100 * (1 - (swap free + swap cached) / swap total)
+    expr: |
+      100 * (1 - (
+        (
+          node_memory_SwapFree_bytes
+          + node_memory_SwapCached_bytes
+        ) / node_memory_SwapTotal_bytes
+      ))
+  - alert: HostMemoryFillsUp  # predicted >= 90% five minutes ahead while the 2m average is still < 90%
+    expr: |
+      predict_linear(node_memory_MemUsed_percentage[30m], 5*60) >= 90
+      and
+      avg_over_time(node_memory_MemUsed_percentage[2m]) < 90
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: '[Prediction] Host memory usage will increase to {{ $value | printf "%.0f" }}% in the near future (instance {{ $labels.instance }})'
+      description: >-
+        Host can potentially reach {{ $value | printf "%.0f" }}% memory utilization and risk an OOM kill.
+          VALUE = {{ $value }}
+          LABELS = {{ $labels }}
+        The 5-minute-ahead prediction is made as a linear regression from the last 30 minutes of data.
+  - alert: HostMemoryFull  # 1m average memory usage above 95%
+    expr: avg_over_time(node_memory_MemUsed_percentage[1m]) > 95
+    for: 2m
+    labels:
+      severity: critical
+    annotations:
+      summary: Host memory usage reached {{ $value | printf "%.0f" }}% load (instance {{ $labels.instance }})
+      description: >-
+        Host memory usage reached {{ $value | printf "%.0f" }}%
+          VALUE = {{ $value }}
+          LABELS = {{ $labels }}
+  - alert: HostSwapFull  # 1m average memory usage above 90% and swap usage above 50%
+    expr: |
+      avg_over_time(node_memory_MemUsed_percentage[1m]) > 90
+      and
+      avg_over_time(node_memory_SwapUsed_percentage[1m]) > 50
+    for: 2m
+    labels:
+      severity: critical
+    annotations:
+      summary: Host memory and swap usage reached 90% and 50% load (instance {{ $labels.instance }})
+      description: >-
+        Host memory and swap usage reached 90% and 50% load
+          VALUE = {{ $value }}
+          LABELS = {{ $labels }}
+  - alert: HostMemoryUnderMemoryPressure  # major page fault rate (1m) above 1000 per second
+    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host memory under memory pressure (instance {{ $labels.instance }})
+      description: >-
+        The node is under heavy memory pressure. High rate of major page faults.
+          VALUE = {{ $value }}
+          LABELS = {{ $labels }}
diff --git a/prometheus_alert_rules/network.rules b/prometheus_alert_rules/network.rules
@@ -0,0 +1,58 @@
+groups:
+- name: HostNetwork
+  rules:
+  - alert: HostInterfaceMTUSize  # MTU value changed at least once in the last 30m
+    expr: last_over_time(node_network_mtu_bytes[30m]) and changes(node_network_mtu_bytes[30m]) > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Interface '{{ $labels.device }}' MTU size changed (instance {{ $labels.instance }})
+      description: >-
+        Interface '{{ $labels.device }}' MTU size changed to {{ $value }} on host {{ $labels.instance }}.
+          VALUE = {{ $value }}
+          LABELS = {{ $labels }}
+  - alert: HostInterfaceSpeed  # link speed changed at least once in the last 30m
+    expr: last_over_time(node_network_speed_bytes[30m]) and changes(node_network_speed_bytes[30m]) > 0
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Interface '{{ $labels.device }}' speed changed (instance {{ $labels.instance }})
+      description: >-
+        Interface '{{ $labels.device }}' speed changed to {{ $value }}B/s on host {{ $labels.instance }}.
+          VALUE = {{ $value }}
+          LABELS = {{ $labels }}
+  - alert: HostNetworkReceiveErrors  # $value is an error/packet ratio; 0.01 means 1%
+    expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Interface '{{ $labels.device }}' network receive too many errors (instance {{ $labels.instance }})
+      description: >-
+        Interface '{{ $labels.device }}' has encountered {{ $value | humanizePercentage }} receive errors in the last two minutes on host {{ $labels.instance }}.
+          VALUE = {{ $value }}
+          LABELS = {{ $labels }}
+  - alert: HostNetworkTransmitErrors  # $value is an error/packet ratio; 0.01 means 1%
+    expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Interface '{{ $labels.device }}' network transmit too many errors (instance {{ $labels.instance }})
+      description: >-
+        Interface '{{ $labels.device }}' has encountered {{ $value | humanizePercentage }} transmit errors in the last two minutes on host {{ $labels.instance }}.
+          VALUE = {{ $value }}
+          LABELS = {{ $labels }}
+  - alert: HostNetworkBondDegraded  # active member count differs from configured member count
+    expr: (node_bonding_active - node_bonding_slaves) != 0
+    for: 2m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host bond network is degraded (instance {{ $labels.instance }})
+      description: >-
+        Host bond `{{ $labels.master }}` network is degraded.
+          VALUE = {{ $value }}
+          LABELS = {{ $labels }}
diff --git a/prometheus_alert_rules/oomkill.rule b/prometheus_alert_rules/oomkill.rule
@@ -0,0 +1,8 @@
+alert: HostOomKillDetected  # OOM-kill counter increased within the last minute
+expr: 'increase(node_vmstat_oom_kill[1m]) > 0'
+for: 0m
+labels:
+  severity: warning
+annotations:
+  summary: 'Host OOM kill detected (instance {{ $labels.instance }})'
+  description: "OOM kill detected\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
diff --git a/prometheus_alert_rules/pressure.rules b/prometheus_alert_rules/pressure.rules
@@ -0,0 +1,33 @@
+groups:
+- name: HostPressure  # descriptions are double-quoted so that "\n" is a real newline
+  rules:
+
+  # Alert for host cpu pressure - high instantaneous cpu waiting time
+  - alert: HostHighCpuWaitingTime
+    expr: irate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.9
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host processes spent too much time waiting for CPU resources (instance {{ $labels.instance }})
+      description: "The instantaneous time that the processes spent on waiting for CPU resource is too high. This might indicate that the server is under high CPU pressure.\n  VALUE = {{ $value | printf \"%.2f\" }}\n  LABELS = {{ $labels }}"
+
+  # Alert for host io pressure - high instantaneous io waiting time
+  - alert: HostHighIOWaitingTime
+    expr: irate(node_pressure_io_waiting_seconds_total[5m]) > 0.5
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host processes spent too much time waiting due to I/O congestion (instance {{ $labels.instance }})
+      description: "The instantaneous time that the processes spent on waiting for I/O is too high. This might indicate that the server is under high I/O pressure.\n  VALUE = {{ $value | printf \"%.2f\" }}\n  LABELS = {{ $labels }}"
+
+  # Alert for host mem pressure - high instantaneous mem waiting time
+  - alert: HostHighMemoryWaitingTime
+    expr: irate(node_pressure_memory_waiting_seconds_total[5m]) > 0.5
+    for: 5m
+    labels:
+      severity: warning
+    annotations:
+      summary: Host processes spent too much time waiting for memory (instance {{ $labels.instance }})
+      description: "The instantaneous time that the processes spent on waiting for memory is too high. This might indicate that the server is under high memory pressure.\n  VALUE = {{ $value | printf \"%.2f\" }}\n  LABELS = {{ $labels }}"