diff --git a/mina-alerts/.helmignore b/mina-alerts/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/mina-alerts/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/mina-alerts/Chart.yaml b/mina-alerts/Chart.yaml new file mode 100644 index 0000000..78c05a0 --- /dev/null +++ b/mina-alerts/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: mina-alerts +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "1.16.0" diff --git a/mina-alerts/README.md b/mina-alerts/README.md new file mode 100644 index 0000000..07410d5 --- /dev/null +++ b/mina-alerts/README.md @@ -0,0 +1,48 @@ +# mina-alerts + +![Version: 0.1.0](https://img.shields.io/badge/Version-0.1.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 1.16.0](https://img.shields.io/badge/AppVersion-1.16.0-informational?style=flat-square) + +A Helm chart for Kubernetes + +## Prerequisites + +Before using this Helm chart, you should have the following prerequisites: + +- Access to Kubernetes cluster (If needed contact your friendly neighbourhood DevOps engineer) +- Helm >= v3.14.3 +- (**Optional**) helmfile >= v0.162.0 to install this chart + +## Installation + +> Note: **examples** can be found in the repository + +To install this Helm chart, the easiest way is to create a helmfile.yaml with needed values and run: + +``` +helmfile template +helmfile apply +``` + +Or use helmfile only to generate resources and apply them with kubectl like so: + +``` +helmfile template | kubectl apply -f - +``` + +Verify that the chart is deployed successfully: + +> Note: `kubectl` is a better suited tool for this + +``` +helmfile status +``` + +## Values + +| Key | Type | Default | Description | +|-----|------|---------|-------------| +| alert_evaluation_duration | string | `"10m"` | The evaluation duration | +| alert_timeframe | string | `"1h"` | The alert time frame | +| severity | string | `"testing"` | The severity of the alert | +| testnet_regex | string | `"testnet=~\"^(devnet\|mainnet).*\""` | The alert testnet regex | + diff --git a/mina-alerts/templates/_helpers.tpl b/mina-alerts/templates/_helpers.tpl new file mode 100644 index 0000000..3f6a4f7 --- /dev/null +++ b/mina-alerts/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. 
+*/}} +{{- define "mina-alerts.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "mina-alerts.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "mina-alerts.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "mina-alerts.labels" -}} +helm.sh/chart: {{ include "mina-alerts.chart" . }} +{{ include "mina-alerts.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "mina-alerts.selectorLabels" -}} +app.kubernetes.io/name: {{ include "mina-alerts.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "mina-alerts.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "mina-alerts.fullname" .) 
.Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/mina-alerts/templates/prometheus-rules.yaml b/mina-alerts/templates/prometheus-rules.yaml new file mode 100644 index 0000000..36cca21 --- /dev/null +++ b/mina-alerts/templates/prometheus-rules.yaml @@ -0,0 +1,393 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: "{{ include "mina-alerts.fullname" . }}" + labels: + app: "{{ include "mina-alerts.fullname" . }}" +spec: + groups: + - name: MinaProtocol + rules: + - alert: WatchdogClusterCrashes + expr: max by (testnet) (max_over_time(Coda_watchdog_cluster_crashes {{`{`}} {{ .Values.testnet_regex }} {{`}`}} [{{ .Values.alert_timeframe }}])) > 0.5 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: none + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} cluster nodes have crashed" + description: "{{`{{`}} $value {{`}}`}} Cluster nodes have crashed on network {{`{{`}} $labels.testnet {{`}}`}}." 
+ runbook: "https://www.notion.so/minafoundation/WatchdogClusterCrashes-13f1fb28abc381cb96f6f20547a32406" + + - alert: MultipleNodeRestarted + expr: count by (testnet) (Coda_Runtime_process_uptime_ms_total {{`{`}} {{ .Values.testnet_regex }} {{`}`}} < 600000) > 2 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "At least 3 nodes on {{`{{`}} $labels.testnet {{`}}`}} restarted" + description: "{{`{{`}} $value {{`}}`}} nodes on {{`{{`}} $labels.testnet {{`}}`}} restarted" + runbook: "https://www.notion.so/minafoundation/MultipleNodeRestarted-13f1fb28abc381ba9fdcf5b11286b6d5" + + - alert: HighDisconnectedBlocksPerHour + expr: max by (testnet) (increase(Coda_Rejected_blocks_no_common_ancestor {{`{`}} {{ .Values.testnet_regex }} {{`}`}} [{{ .Values.alert_timeframe }}])) > 3 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: none + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} has more than 3 blocks that have been produced on a remote side chains in the last hour" + description: "{{`{{`}} $value {{`}}`}} blocks have been produced that share no common ancestor with our transition frontier on network {{`{{`}} $labels.testnet {{`}}`}} in the last hour." 
+ runbook: "https://www.notion.so/minafoundation/HighDisconnectedBlocksPerHour-13f1fb28abc38102a7ecd592da5bc34b" + + - alert: HighOldBlocksPerHour + expr: max by (testnet) (increase(Coda_Rejected_blocks_worse_than_root {{`{`}} {{ .Values.testnet_regex }} {{`}`}} [{{ .Values.alert_timeframe }}])) > 5 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: none + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} has more than 5 blocks that are not selected over the root of our transition frontier in the last hour" + description: "{{`{{`}} $value {{`}}`}} blocks have been produced that are not selected over the root of our transition frontier in the last hour" + runbook: "https://www.notion.so/minafoundation/HighOldBlocksPerHour-13f1fb28abc3810999a7ef6079a76faf" + + - alert: HighInvalidProofPerHour + expr: max by (testnet) (increase(Coda_Rejected_blocks_invalid_proof {{`{`}} {{ .Values.testnet_regex }} {{`}`}} [{{ .Values.alert_timeframe }}])) > 3 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} has more than 3 blocks that contains an invalid blockchain snark proof in last hour" + description: "{{`{{`}} $value {{`}}`}} blocks have been produced that contains an invalid blockchain snark proof in last hour" + runbook: "https://www.notion.so/minafoundation/HighInvalidProofPerHour-13f1fb28abc381cca113d4431a1696ca" + + - alert: WatchdogNoNewLogs + expr: max by (testnet) (Coda_watchdog_pods_with_no_new_logs {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) > 0 + for: 12m + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: none + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} has pods which have not logged in an hour" + description: "There are no new logs in the last hour for {{`{{`}} $value {{`}}`}} pods on network {{`{{`}} $labels.testnet {{`}}`}}." 
+ runbook: "https://www.notion.so/minafoundation/WatchdogNoNewLogs-13f1fb28abc381c2bb60cc1bc76e5826" + + - alert: SeedListDown + expr: min by (testnet) (min_over_time(Coda_watchdog_seeds_reachable {{`{`}} {{ .Values.testnet_regex }} {{`}`}} [{{ .Values.alert_timeframe }}])) == 0 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: none + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} seed list is down (no seeds are reachable)" + description: "Seed list is down on network {{`{{`}} $labels.testnet {{`}}`}}." + runbook: "https://www.notion.so/minafoundation/SeedListDown-13f1fb28abc38192bd46f493e44555d4" + + - alert: BlockStorageBucketNoNewBlocks + expr: min by (testnet) (min_over_time(Coda_watchdog_recent_google_bucket_blocks {{`{`}} {{ .Values.testnet_regex }} {{`}`}} [{{ .Values.alert_timeframe }}])) >= 30*60 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: none + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} has no new blocks posted to the google block storage bucket recently" + description: "{{`{{`}} $value {{`}}`}} new blocks posted to the google storage bucket for {{`{{`}} $labels.testnet {{`}}`}}." + runbook: "https://www.notion.so/minafoundation/BlockStorageBucketNoNewBlock-for-review-13f1fb28abc381ed8aeec066a1efbafe" + + - alert: ProverErrors + expr: max by (testnet) (max_over_time(Coda_watchdog_prover_errors_total {{`{`}} {{ .Values.testnet_regex }} {{`}`}} [{{ .Values.alert_timeframe }}])) > 0 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} has observed a prover error" + description: "{{`{{`}} $value {{`}}`}} Prover errors on network {{`{{`}} $labels.testnet {{`}}`}}." 
+ + - alert: NodesNotSynced + expr: min by (testnet) (Coda_watchdog_nodes_synced {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) <= .5 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} has <= 50% of nodes synced" + description: "Nodes sync rate of {{`{{`}} $value {{`}}`}} is <= 50% on network {{`{{`}} $labels.testnet {{`}}`}}." + runbook: "https://www.notion.so/minafoundation/Nodes-not-synced-13f1fb28abc381ce98cfd6758ab93ca5" + + - alert: NodesOutOfSync + expr: min by (testnet) (avg_over_time(Coda_watchdog_nodes_synced_near_best_tip {{`{`}} {{ .Values.testnet_regex }} {{`}`}} [{{ .Values.alert_timeframe }}])) < .6 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} has < 60% of nodes that are synced on the same best tip" + description: "< 60% of nodes that are synced are on the same best tip for network {{`{{`}} $labels.testnet {{`}}`}} with rate of {{`{{`}} $value {{`}}`}}." + runbook: "https://www.notion.so/minafoundation/NodesOutOfSync-O1NodesOutOfSync-13f1fb28abc38153b365c08b9fcbbbd6" + + - alert: MFNodesOutOfSync + expr: min by (testnet) (increase(Coda_Transition_frontier_max_blocklength_observed {{`{`}} {{ .Values.testnet_regex }} {{`}`}} [{{ .Values.alert_timeframe }}])) < 1 + for: {{ .Values.alert_evaluation_duration }} + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "One or more {{`{{`}} $labels.testnet {{`}}`}} nodes are stuck at an old block height (Observed block height did not increase in the last hour)" + description: "{{`{{`}} $value {{`}}`}} blocks have been validated on network {{`{{`}} $labels.testnet {{`}}`}} in the last hour (according to some node)." 
+ runbook: "https://www.notion.so/minafoundation/NodesOutOfSync-O1NodesOutOfSync-13f1fb28abc38153b365c08b9fcbbbd6" + + - alert: LowPeerCount + expr: min by (testnet) (Coda_Network_peers {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) < 3 + for: {{ .Values.alert_evaluation_duration }} + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} avg. peer count is critically low" + description: "Critically low peer count of {{`{{`}} $value {{`}}`}} on network {{`{{`}} $labels.testnet {{`}}`}}." + runbook: "https://www.notion.so/minafoundation/LowPeerCount-13f1fb28abc381bcae2bd814f90891b8" + + - alert: CriticallyLowMinWindowDensity + expr: quantile by (testnet) (0.5, Coda_Transition_frontier_min_window_density {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) <= 13 + for: {{ .Values.alert_evaluation_duration }} + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} min density is critically low" + description: "Critically low min density of {{`{{`}} $value {{`}}`}} on network {{`{{`}} $labels.testnet {{`}}`}}." + runbook: "https://www.notion.so/minafoundation/LowMinWindowDensity-review-13f1fb28abc38129ba13f713db280787" + + - alert: LowFillRate + expr: quantile by (testnet) (0.5, Coda_Transition_frontier_slot_fill_rate {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) < 0.75 * 0.6 + for: 1h + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} slot fill rate is critically low" + description: "Lower fill rate of {{`{{`}} $value {{`}}`}} than expected on network {{`{{`}} $labels.testnet {{`}}`}}." 
+ runbook: "https://www.notion.so/minafoundation/LowFillRate-13f1fb28abc38105a2a4f319d1712dfe" + + - alert: NoTransactionsInSeveralBlocks + expr: quantile by (testnet) (0.5, Coda_Transition_frontier_empty_blocks_at_best_tip {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) >= 5 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} has >= 5 blocks without transactions at the tip" + description: "{{`{{`}} $value {{`}}`}} blocks without transactions on tip of network {{`{{`}} $labels.testnet {{`}}`}}." + runbook: "https://www.notion.so/minafoundation/No-Transactions-In-Several-Blocks-13f1fb28abc38178b0bdecc96f07f1a2" + + - alert: NoCoinbaseInBlocks + expr: quantile by (testnet) (0.5, Coda_Transition_frontier_best_tip_coinbase {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) < 1 + for: {{ .Values.alert_evaluation_duration }} + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} has blocks without coinbases" + description: "{{`{{`}} $value {{`}}`}} Blocks without coinbases on tip of network {{`{{`}} $labels.testnet {{`}}`}}." + runbook: "https://www.notion.so/minafoundation/NoCoinbaseInBlocks-review-13f1fb28abc3814dbdffd16b26a07f24" + + - alert: LongFork + expr: max by (testnet) (Coda_Transition_frontier_longest_fork {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) >= 16 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} has a fork of length at least 16" + description: "Fork of length {{`{{`}} $value {{`}}`}} on network {{`{{`}} $labels.testnet {{`}}`}}." 
+ runbook: "https://www.notion.so/minafoundation/LongFork-13f1fb28abc381239fa5c67f940e3530" + + - alert: OldBestTip + expr: min by (testnet) ((time() - 1609459200) - Coda_Transition_frontier_best_tip_slot_time_sec {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) >= 15 * 180 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}}: all nodes have best tips older than 15 slots" + description: "All nodes have best tips older than 15 slots (45 minutes) on network {{`{{`}} $labels.testnet {{`}}`}}. Best tip: {{`{{`}} $value {{`}}`}}" + runbook: "https://www.notion.so/minafoundation/OldBestTip-review-13f1fb28abc381908b2cd3e7d465c270" + + - alert: NoNewSnarks + expr: min by (testnet) ((time() - 1609459200) - Coda_Snark_work_useful_snark_work_received_time_sec {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) >= 2 * 180 and max by (testnet) (Coda_Snark_work_pending_snark_work {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) != 0 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}}: no new SNARK work seen for 2 slots." + description: "No node has received SNARK work in the last 2 slots (6 minutes) on network {{`{{`}} $labels.testnet {{`}}`}}." + runbook: "https://www.notion.so/minafoundation/NoNewSnarks-final-copy-13f1fb28abc3813cb13bf483478135f7" + + - alert: NoNewTransactions + expr: min by (testnet) ((time() - 1609459200) - Coda_Transaction_pool_useful_transactions_received_time_sec {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) >= 2 * 180 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}}: no new transactions seen for 2 slots." + description: "No node has received transactions in their transaction pool in the last 2 slots (6 minutes) on network {{`{{`}} $labels.testnet {{`}}`}}." 
+ runbook: "https://www.notion.so/minafoundation/NoNewTransactions-13f1fb28abc381cf9318e167a21a7955" + - alert: HighUnparentedBlockCount + expr: max by (testnet) (Coda_Archive_unparented_blocks {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) > 30 + for: {{ .Values.alert_evaluation_duration }} + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} has a critically high unparented block count" + description: "{{`{{`}} $value {{`}}`}} Unparented block count is critically high on network {{`{{`}} $labels.testnet {{`}}`}}." + runbook: "https://www.notion.so/minafoundation/Archive-Node-Metrics-13f1fb28abc381bea03fda89e9de3fc1" + + - alert: HighMissingBlockCount + expr: max by (testnet) (Coda_Archive_missing_blocks {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) > 30 + for: {{ .Values.alert_evaluation_duration }} + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} has a critically high missing block count" + description: "{{`{{`}} $value {{`}}`}} Missing block count is critically high on network {{`{{`}} $labels.testnet {{`}}`}}." + runbook: "https://www.notion.so/minafoundation/Archive-Node-Metrics-13f1fb28abc381bea03fda89e9de3fc1" + + - alert: FewBlocksPerHour + expr: quantile by (testnet) (0.5, increase(Coda_Transition_frontier_max_blocklength_observed {{`{`}} {{ .Values.testnet_regex }} {{`}`}} [30m])) < 1 + for: {{ .Values.alert_evaluation_duration }} + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "One or more {{`{{`}} $labels.testnet {{`}}`}} nodes are stuck at an old block height (Observed block height did not increase in the last 30m)" + description: "{{`{{`}} $value {{`}}`}} blocks have been validated on network {{`{{`}} $labels.testnet {{`}}`}} in the last 30 minutes (according to some node)." 
+ runbook: "https://www.notion.so/minafoundation/FewBlocksPerHour-13f1fb28abc3816e9cbdceb75149ddae" + + - name: MinaProtocolWarning + rules: + - alert: HighBlockGossipLatency + expr: max by (testnet) (max_over_time(Coda_Block_latency_gossip_time {{`{`}} {{ .Values.testnet_regex }} {{`}`}} [{{ .Values.alert_timeframe }}])) > 200 + for: {{ .Values.alert_evaluation_duration }} + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} block gossip latency is high" + description: "High block gossip latency of {{`{{`}} $value {{`}}`}}(ms) within {{`{{`}} $labels.testnet {{`}}`}} network." + runbook: "https://www.notion.so/minafoundation/HighBlockGossipLatency-13f1fb28abc3816fbe34ec9213dd0139" + + - alert: SomewhatOldBestTip + expr: count by (testnet) (((time() - 1609459200) - Coda_Transition_frontier_best_tip_slot_time_sec {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) >= 8 * 180) > 1 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}}: at least 2 nodes have best tips older than 8 slots" + description: "At least 2 nodes have best tips older than 8 slots (24 minutes) on network {{`{{`}} $labels.testnet {{`}}`}}." + runbook: "https://www.notion.so/minafoundation/SomewhatOldBestTip-13f1fb28abc38190bacae3363652ba5b" + + - alert: MediumFork + expr: max by (testnet) (Coda_Transition_frontier_longest_fork {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) >= 8 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} has a fork of length at least 8" + description: "Fork of length {{`{{`}} $value {{`}}`}} on network {{`{{`}} $labels.testnet {{`}}`}}." 
+ runbook: "https://www.notion.so/minafoundation/MediumFork-13f1fb28abc38131bf48f7cb73d2cb8f" + + - alert: NoTransactionsInAtLeastOneBlock + expr: max by (testnet) (Coda_Transition_frontier_empty_blocks_at_best_tip {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) > 0 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} has at least 1 block without transactions at the tip" + description: "{{`{{`}} $value {{`}}`}} Blocks without transactions on tip of network {{`{{`}} $labels.testnet {{`}}`}}." + runbook: "https://www.notion.so/minafoundation/NoTransactionsInAtLeastOneBlock-13f1fb28abc38101bc4bdb04943689f0" + + - alert: LowMinWindowDensity + expr: quantile by (testnet) (0.5, Coda_Transition_frontier_min_window_density {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) <= 35 + for: {{ .Values.alert_evaluation_duration }} + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} min density is low" + description: "Low min density on network {{`{{`}} $labels.testnet {{`}}`}}." + runbook: "https://www.notion.so/minafoundation/LowMinWindowDensity-Runbook-13f1fb28abc3818ea8f1c199487c7157" + + - alert: SeedListDegraded + expr: min by (testnet) (Coda_watchdog_seeds_reachable {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) <= 0.5 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: none + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} seed list is degraded (less than 50% reachable)" + description: "Seed list is degraded at {{`{{`}} $value {{`}}`}} on network {{`{{`}} $labels.testnet {{`}}`}}." 
+ runbook: "https://www.notion.so/minafoundation/SeedListDown-13f1fb28abc38192bd46f493e44555d4" + + - alert: LowDisconnectedBlocksPerHour + expr: max by (testnet) (increase(Coda_Rejected_blocks_no_common_ancestor {{`{`}} {{ .Values.testnet_regex }} {{`}`}} [{{ .Values.alert_timeframe }}])) > 0 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: none + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} has at least 1 blocks that have been produced on a remote side chains in the last hour" + description: "{{`{{`}} $value {{`}}`}} blocks have been produced that share no common ancestor with our transition frontier on network {{`{{`}} $labels.testnet {{`}}`}} in the last hour." + runbook: "https://www.notion.so/minafoundation/LowDisconnectedBlocksPerHour-13f1fb28abc381ae8caccecf4934d7ea" + + - alert: LowOldBlocksPerHour + expr: max by (testnet) (increase(Coda_Rejected_blocks_worse_than_root {{`{`}} {{ .Values.testnet_regex }} {{`}`}} [{{ .Values.alert_timeframe }}])) > 0 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: none + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} has at least 1 blocks that are not selected over the root of our transition frontier in the last hour" + description: "{{`{{`}} $value {{`}}`}} blocks have been produced that are not selected over the root of our transition frontier in the last hour" + runbook: "https://www.notion.so/minafoundation/LowOldBlocksPerHour-final-copy-13f1fb28abc381bd883fc3ac13e58981" + + - alert: LowInvalidProofPerHour + expr: max by (testnet) (increase(Coda_Rejected_blocks_invalid_proof {{`{`}} {{ .Values.testnet_regex }} {{`}`}} [{{ .Values.alert_timeframe }}])) > 0 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} has at least 1 blocks that contains an invalid blockchain snark proof in last hour" + description: "{{`{{`}} $value {{`}}`}} blocks have been produced 
that contains an invalid blockchain snark proof in last hour" + runbook: "https://www.notion.so/minafoundation/LowInvalidProofPerHour-13f1fb28abc38133a5cdef5bc058bdd6" + + - alert: LowPostgresBlockHeightGrowth + expr: min by (testnet) (increase(Coda_Archive_max_block_height {{`{`}} {{ .Values.testnet_regex }} {{`}`}} [{{ .Values.alert_timeframe }}])) < 1 + for: {{ .Values.alert_evaluation_duration }} + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "{{`{{`}} $labels.testnet {{`}}`}} rate of archival of network blocks in Postgres DB is lower than expected" + description: "The rate of {{`{{`}} $value {{`}}`}} new blocks observed by archive postgres instances is low on network {{`{{`}} $labels.testnet {{`}}`}}." + runbook: "https://www.notion.so/minafoundation/Archive-Node-Metrics-13f1fb28abc381bea03fda89e9de3fc1" + + - alert: NodeRestarted + expr: count by (testnet) (Coda_Runtime_process_uptime_ms_total {{`{`}} {{ .Values.testnet_regex }} {{`}`}} < 360000) > 0 + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: {{ .Values.severity }} + annotations: + summary: "At least one of the nodes on {{`{{`}} $labels.testnet {{`}}`}} restarted" + description: "{{`{{`}} $value {{`}}`}} nodes on {{`{{`}} $labels.testnet {{`}}`}} restarted" + runbook: "https://www.notion.so/minafoundation/NodeRestarted-13f1fb28abc38110ad75f28ff80f5b34" + + - alert: UnparentedBlocksObserved + expr: max by (testnet) (Coda_Archive_unparented_blocks {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) > 1 + for: {{ .Values.alert_evaluation_duration }} + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: none + annotations: + summary: "Unparented blocks observed on {{`{{`}} $labels.testnet {{`}}`}}" + description: "{{`{{`}} $value {{`}}`}} Unparented block(s) observed on network {{`{{`}} $labels.testnet {{`}}`}}." 
+ runbook: "https://www.notion.so/minafoundation/Archive-Node-Metrics-13f1fb28abc381bea03fda89e9de3fc1" + + - alert: MissingBlocksObserved + expr: max by (testnet) (Coda_Archive_missing_blocks {{`{`}} {{ .Values.testnet_regex }} {{`}`}}) > 0 + for: {{ .Values.alert_evaluation_duration }} + labels: + testnet: "{{`{{`}} $labels.testnet {{`}}`}}" + severity: none + annotations: + summary: "Missing blocks observed on {{`{{`}} $labels.testnet {{`}}`}}" + description: "{{`{{`}} $value {{`}}`}} Missing block(s) observed on network {{`{{`}} $labels.testnet {{`}}`}}." + runbook: "https://www.notion.so/minafoundation/Archive-Node-Metrics-13f1fb28abc381bea03fda89e9de3fc1" diff --git a/mina-alerts/values.yaml b/mina-alerts/values.yaml new file mode 100644 index 0000000..371e6e4 --- /dev/null +++ b/mina-alerts/values.yaml @@ -0,0 +1,9 @@ +# Default values for mina-alerts. +# -- The severity of the alert +severity: "testing" +# -- The alert time frame +alert_timeframe: "1h" +# -- The evaluation duration +alert_evaluation_duration: "10m" +# -- The alert testnet regex +testnet_regex: "testnet=~\"^(devnet|mainnet).*\""