Commit

initial tracing implementation

PietroPasotti committed Aug 6, 2024
1 parent 47ad768 commit e2a5e9c
Showing 21 changed files with 23,912 additions and 69 deletions.
23,200 changes: 23,200 additions & 0 deletions grafana_dashboards/node-exporter-full.json

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions loki_alert_rules/grafana_agent_high_rate.rule
@@ -0,0 +1,10 @@
groups:
- name: grafana-agent-high-log-volume
rules:
- alert: HighLogVolume
expr: |
count_over_time(({%%juju_topology%%})[30s]) > 100
labels:
severity: high
annotations:
summary: Log rate is too high!
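Note: `%%juju_topology%%` is a template token rather than valid LogQL; the charm substitutes it with the unit's Juju topology label matchers before the rule is shipped to Loki. As a rough sketch (the exact label set below is an assumption, not shown in this commit), the rendered expression would look like:

count_over_time(({juju_model="my-model", juju_application="grafana-agent", juju_unit="grafana-agent/0"})[30s]) > 100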
8 changes: 8 additions & 0 deletions loki_alert_rules/high_error_rate.rule
@@ -0,0 +1,8 @@
alert: HostHighLogErrorRate
expr: count_over_time({job="varlogs"} |= "error" [1h]) > 100
for: 0m
labels:
severity: warning
annotations:
summary: High error rate in logs (instance {{ $labels.instance }})
description: "High error rate in logs\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
2 changes: 2 additions & 0 deletions metadata.yaml
@@ -64,6 +64,8 @@ requires:
limit: 1

provides:
tracing-provider:
interface: tracing
logging-provider:
interface: loki_push_api
grafana-dashboards-provider:
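With the new tracing-provider endpoint, charms that require the tracing interface can relate to this charm to send their traces. A hypothetical integration (the requirer application and endpoint names are assumptions):

juju integrate my-workload:tracing grafana-agent:tracing-provider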
Empty file added prometheus_alert_rules/.gitkeep
Empty file.
1 change: 1 addition & 0 deletions prometheus_alert_rules/.wokeignore
@@ -0,0 +1 @@
network.rules
14 changes: 14 additions & 0 deletions prometheus_alert_rules/arp_cache.rules
@@ -0,0 +1,14 @@
groups:
- name: HostArpCache
rules:
- alert: HostArpCache
expr: 100 * node_arp_entries / node_sysctl_net_ipv4_neigh_default_gc_thresh3 >= 80
for: 2m
labels:
severity: critical
annotations:
summary: Host ARP cache reached {{ $value | printf "%.0f" }}% limit (instance {{ $labels.instance }})
description: >-
Host ARP cache reached {{ $value | printf "%.0f" }}% limit.
VALUE = {{ $value }}
LABELS = {{ $labels }}
14 changes: 14 additions & 0 deletions prometheus_alert_rules/conntrack.rules
@@ -0,0 +1,14 @@
groups:
- name: HostConntrack
rules:
- alert: HostConntrackLimit
expr: 100 * (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 80
for: 2m
labels:
severity: warning
annotations:
summary: Host conntrack reached {{ $value | printf "%.0f" }}% (instance {{ $labels.instance }})
description: >-
Host conntrack reached {{ $value | printf "%.0f" }}% usage.
VALUE = {{ $value }}
LABELS = {{ $labels }}
75 changes: 75 additions & 0 deletions prometheus_alert_rules/disk.rules
@@ -0,0 +1,75 @@
groups:
- name: HostDisk
rules:
- record: used_disk_space
expr: |
100 * (1 - (
node_filesystem_free_bytes / node_filesystem_size_bytes)
)
- alert: HostDiskSpaceFillsUp
expr: predict_linear(used_disk_space{mountpoint=~"/"}[6h], 6*60*60) > 90
for: 2m
labels:
severity: warning
annotations:
summary: "[Prediction] Host filesystem '{{ $labels.mountpoint }}' is using {{ $value | printf \"%.0f\" }}% of the total space (instance {{ $labels.instance }})"
description: >-
Host filesystem '{{ $labels.mountpoint }}' usage can potentially reach {{ $value | printf "%.0f" }}% of the total space.
VALUE = {{ $value }}
LABELS = {{ $labels }}
The 6-hour-ahead prediction is made as a linear regression from the last 6 hours of data.
- alert: HostDiskSpace
expr: used_disk_space{mountpoint=~"/"} > 90
for: 0m
labels:
severity: critical
annotations:
summary: Host filesystem '{{ $labels.mountpoint }}' is using {{ $value | printf "%.0f" }}% of the total space (instance {{ $labels.instance }})
description: >-
Host filesystem '{{ $labels.mountpoint }}' is using {{ $value | printf "%.0f" }}% of the total space.
VALUE = {{ $value }}
LABELS = {{ $labels }}
- alert: HostReadonlyFilesystem
expr: node_filesystem_readonly{mountpoint!~"/snap/.*|/sys/fs/cgroup/.*"} > 0
for: 0m
labels:
severity: warning
annotations:
summary: Host filesystem '{{ $labels.mountpoint }}' is readonly (instance {{ $labels.instance }})
description: >-
Host filesystem '{{ $labels.mountpoint }}' is readonly.
VALUE = {{ $value }}
LABELS = {{ $labels }}
- alert: HostXFSError
expr: node_filesystem_device_error{fstype="xfs"} > 0
for: 0m
labels:
severity: critical
annotations:
summary: XFS error found for device '{{ $labels.device }}' (instance {{ $labels.instance }})
description: >-
XFS error found for device '{{ $labels.device }}'.
VALUE = {{ $value }}
LABELS = {{ $labels }}
- alert: HostHighDiskReadRate
expr: irate(node_disk_read_bytes_total[2m]) / 1024 / 1024 > 50
for: 5m
labels:
severity: warning
annotations:
summary: Host high disk '{{ $labels.device }}' read rate (instance {{ $labels.instance }})
description: >-
Host disk '{{ $labels.device }}' is probably reading too much data ({{ $value | printf "%.0f" }} > 50 MB/s) for the last 5m.
VALUE = {{ $value }}
LABELS = {{ $labels }}
- alert: HostHighDiskWriteRate
expr: irate(node_disk_written_bytes_total[2m]) / 1024 / 1024 > 50
for: 5m
labels:
severity: warning
annotations:
summary: Host high disk '{{ $labels.device }}' write rate (instance {{ $labels.instance }})
description: >-
Host disk '{{ $labels.device }}' is probably writing too much data ({{ $value | printf "%.0f" }} > 50 MB/s) for the last 5m.
VALUE = {{ $value }}
LABELS = {{ $labels }}
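Group-formatted rule files such as this one can be validated offline with promtool, which ships with Prometheus (the bare single-alert *.rule files in this directory would first need wrapping in a group, which — as an assumption about the surrounding tooling — the charm libraries do before shipping):

promtool check rules prometheus_alert_rules/disk.rules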
8 changes: 8 additions & 0 deletions prometheus_alert_rules/high_cpu_iowait.rule
@@ -0,0 +1,8 @@
alert: HostCpuHighIowait
expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU high iowait (instance {{ $labels.instance }})
description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
16 changes: 16 additions & 0 deletions prometheus_alert_rules/hwmon.rules
@@ -0,0 +1,16 @@
---
groups:
- name: hwmon
rules:
- alert: HwmonTempAlarm
expr: node_hwmon_temp_alarm != 0
for: 0m
labels:
severity: warning
annotations:
summary: Chip {{ $labels.chip }} is throwing a temperature alarm on {{ $labels.instance }}
description: >-
Chip {{ $labels.chip }} is throwing a temperature alarm on {{ $labels.instance }}
VALUE = {{ $value }}
LABELS = {{ $labels }}

50 changes: 50 additions & 0 deletions prometheus_alert_rules/mdadm.rules
@@ -0,0 +1,50 @@
groups:
- name: mdadm
rules:
- alert: RaidDisksFailed
expr: node_md_disks{state="failed"} > 0
for: 0m
labels:
severity: critical
annotations:
summary: "{{ $value }} disks failed on device {{ $labels.device }}.(instance {{ $labels.instance }})"
description: >-
Disks failed on raid device {{ $labels.device }}.
VALUE = {{ $value }}
LABELS = {{ $labels }}

- alert: RaidDisksSpare
expr: node_md_disks{state="spare"} > 0
for: 0m
labels:
severity: warning
annotations:
summary: "{{ $value }} disks marked as spare on device {{ $labels.device }}.(instance {{ $labels.instance }})"
description: >-
Disks marked as spare on raid device {{ $labels.device }}.
VALUE = {{ $value }}
LABELS = {{ $labels }}

- alert: RaidDeviceInactive
expr: node_md_state{state="inactive"} > 0
for: 0m
labels:
severity: critical
annotations:
summary: RAID device {{ $labels.device }} in inactive state (instance {{ $labels.instance }})
description: >-
RAID device {{ $labels.device }} in inactive state.
VALUE = {{ $value }}
LABELS = {{ $labels }}

- alert: RaidDeviceRecovering
expr: node_md_state{state="recovering"} > 0
for: 0m
labels:
severity: warning
annotations:
summary: RAID device {{ $labels.device }} in recovering state (instance {{ $labels.instance }})
description: >-
RAID device {{ $labels.device }} in recovering state.
VALUE = {{ $value }}
LABELS = {{ $labels }}
72 changes: 72 additions & 0 deletions prometheus_alert_rules/memory.rules
@@ -0,0 +1,72 @@
groups:
- name: HostMemory
rules:
- record: node_memory_MemUsed_percentage
expr: |
100 * (1 - (
(
node_memory_MemFree_bytes
+ node_memory_Cached_bytes
+ node_memory_Buffers_bytes
+ node_memory_SReclaimable_bytes
) / node_memory_MemTotal_bytes
))
- record: node_memory_SwapUsed_percentage
expr: |
100 * (1 - (
(
node_memory_SwapFree_bytes
+ node_memory_SwapCached_bytes
) / node_memory_SwapTotal_bytes
))
- alert: HostMemoryFillsUp
expr: |
predict_linear(node_memory_MemUsed_percentage[30m], 5*60) >= 90
and
avg_over_time(node_memory_MemUsed_percentage[2m]) < 90
for: 2m
labels:
severity: warning
annotations:
summary: '[Prediction] Host memory usage will increase to {{ $value | printf "%.0f" }}% in the near future (instance {{ $labels.instance }})'
description: >-
Host can potentially reach {{ $value | printf "%.0f" }}% memory utilization and risk an OOM kill.
VALUE = {{ $value }}
LABELS = {{ $labels }}
The 5-minute-ahead prediction is made as a linear regression from the last 30 minutes of data.
- alert: HostMemoryFull
expr: avg_over_time(node_memory_MemUsed_percentage[1m]) > 95
for: 2m
labels:
severity: critical
annotations:
summary: Host memory usage reached {{ $value | printf "%.0f" }}% (instance {{ $labels.instance }})
description: >-
Host memory usage reached {{ $value | printf "%.0f" }}%.
VALUE = {{ $value }}
LABELS = {{ $labels }}
- alert: HostSwapFull
expr: |
avg_over_time(node_memory_MemUsed_percentage[1m]) > 90
and
avg_over_time(node_memory_SwapUsed_percentage[1m]) > 50
for: 2m
labels:
severity: critical
annotations:
summary: Host memory usage is above 90% and swap usage is above 50% (instance {{ $labels.instance }})
description: >-
Host memory usage is above 90% and swap usage is above 50%.
VALUE = {{ $value }}
LABELS = {{ $labels }}
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: >-
The node is under heavy memory pressure. High rate of major page faults.
VALUE = {{ $value }}
LABELS = {{ $labels }}
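Since node_memory_MemUsed_percentage is plain arithmetic over node-exporter gauges, the recording rule can be unit-tested with `promtool test rules`. A minimal sketch (this test file is hypothetical, not part of the commit):

rule_files:
  - memory.rules
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      - series: 'node_memory_MemTotal_bytes{instance="host"}'
        values: '1000x4'
      - series: 'node_memory_MemFree_bytes{instance="host"}'
        values: '100x4'
      - series: 'node_memory_Cached_bytes{instance="host"}'
        values: '50x4'
      - series: 'node_memory_Buffers_bytes{instance="host"}'
        values: '25x4'
      - series: 'node_memory_SReclaimable_bytes{instance="host"}'
        values: '25x4'
    promql_expr_test:
      - expr: node_memory_MemUsed_percentage
        eval_time: 2m
        exp_samples:
          - labels: 'node_memory_MemUsed_percentage{instance="host"}'
            value: 80
# 100 * (1 - (100 + 50 + 25 + 25) / 1000) = 80, matching the rule's formula.

Run it with: promtool test rules memory_test.yaml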
58 changes: 58 additions & 0 deletions prometheus_alert_rules/network.rules
@@ -0,0 +1,58 @@
groups:
- name: HostNetwork
rules:
- alert: HostInterfaceMTUSize
expr: last_over_time(node_network_mtu_bytes[30m]) and changes(node_network_mtu_bytes[30m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: Interface '{{ $labels.device }}' MTU size changed (instance {{ $labels.instance }})
description: >-
Interface '{{ $labels.device }}' MTU size changed to {{ $value }} on host {{ $labels.instance }}.
VALUE = {{ $value }}
LABELS = {{ $labels }}
- alert: HostInterfaceSpeed
expr: last_over_time(node_network_speed_bytes[30m]) and changes(node_network_speed_bytes[30m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: Interface '{{ $labels.device }}' speed changed (instance {{ $labels.instance }})
description: >-
Interface '{{ $labels.device }}' speed changed to {{ $value }}B/s on host {{ $labels.instance }}.
VALUE = {{ $value }}
LABELS = {{ $labels }}
- alert: HostNetworkReceiveErrors
expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 2m
labels:
severity: warning
annotations:
summary: Interface '{{ $labels.device }}' is reporting too many receive errors (instance {{ $labels.instance }})
description: >-
Interface '{{ $labels.device }}' has encountered {{ $value | humanizePercentage }} receive errors in the last two minutes on host {{ $labels.instance }}.
VALUE = {{ $value }}
LABELS = {{ $labels }}
- alert: HostNetworkTransmitErrors
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 2m
labels:
severity: warning
annotations:
summary: Interface '{{ $labels.device }}' is reporting too many transmit errors (instance {{ $labels.instance }})
description: >-
Interface '{{ $labels.device }}' has encountered {{ $value | humanizePercentage }} transmit errors in the last two minutes on host {{ $labels.instance }}.
VALUE = {{ $value }}
LABELS = {{ $labels }}
- alert: HostNetworkBondDegraded
expr: (node_bonding_active - node_bonding_slaves) != 0
for: 2m
labels:
severity: warning
annotations:
summary: Host bond network is degraded (instance {{ $labels.instance }})
description: >-
Host bond `{{ $labels.master }}` network is degraded.
VALUE = {{ $value }}
LABELS = {{ $labels }}
8 changes: 8 additions & 0 deletions prometheus_alert_rules/oomkill.rule
@@ -0,0 +1,8 @@
alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[1m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
33 changes: 33 additions & 0 deletions prometheus_alert_rules/pressure.rules
@@ -0,0 +1,33 @@
groups:
- name: HostPressure
rules:

# Alert for host CPU pressure - high instantaneous CPU waiting time
- alert: HostHighCpuWaitingTime
expr: irate(node_pressure_cpu_waiting_seconds_total[5m]) > 0.9
for: 5m
labels:
severity: warning
annotations:
summary: Host processes spent too much time waiting for CPU resources (instance {{ $labels.instance }})
description: "The instantaneous time that processes spent waiting for CPU resources is too high. This might indicate that the server is under high CPU pressure.\n VALUE = {{ $value | printf \"%.2f\" }}\n LABELS = {{ $labels }}"

# Alert for host I/O pressure - high instantaneous I/O waiting time
- alert: HostHighIOWaitingTime
expr: irate(node_pressure_io_waiting_seconds_total[5m]) > 0.5
for: 5m
labels:
severity: warning
annotations:
summary: Host processes spent too much time waiting due to I/O congestion (instance {{ $labels.instance }})
description: "The instantaneous time that processes spent waiting on I/O is too high. This might indicate that the server is under high I/O pressure.\n VALUE = {{ $value | printf \"%.2f\" }}\n LABELS = {{ $labels }}"

# Alert for host memory pressure - high instantaneous memory waiting time
- alert: HostHighMemoryWaitingTime
expr: irate(node_pressure_memory_waiting_seconds_total[5m]) > 0.5
for: 5m
labels:
severity: warning
annotations:
summary: Host processes spent too much time waiting for memory (instance {{ $labels.instance }})
description: "The instantaneous time that processes spent waiting for memory is too high. This might indicate that the server is under high memory pressure.\n VALUE = {{ $value | printf \"%.2f\" }}\n LABELS = {{ $labels }}"
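These node_pressure_* metrics come from the kernel's Pressure Stall Information (PSI) interface: node-exporter exports the cumulative stall time from /proc/pressure/*, and irate() turns it into the fraction of recent time spent stalled. For reference, the raw kernel output looks like this on a PSI-enabled kernel (values illustrative):

$ cat /proc/pressure/cpu
some avg10=1.53 avg60=0.87 avg300=0.73 total=12345678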