From 7429f4f630d3f4ced5290ae6fecb4563b2b987c3 Mon Sep 17 00:00:00 2001 From: Leonhardt Wille Date: Mon, 28 Aug 2023 18:06:01 +0200 Subject: [PATCH] Add runbooks for Argo and Vector --- .gitignore | 1 + content/runbooks/argo/ArgoAppNotSynced.md | 23 +++++++++++++++++++ .../runbooks/vector/VectorDiscardEvents.md | 23 +++++++++++++++++++ 3 files changed, 47 insertions(+) create mode 100644 content/runbooks/argo/ArgoAppNotSynced.md create mode 100644 content/runbooks/vector/VectorDiscardEvents.md diff --git a/.gitignore b/.gitignore index 1c3fa73..fb8c50b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ public/ resources/ .hugo_build.lock +.vscode diff --git a/content/runbooks/argo/ArgoAppNotSynced.md b/content/runbooks/argo/ArgoAppNotSynced.md new file mode 100644 index 0000000..1eebd06 --- /dev/null +++ b/content/runbooks/argo/ArgoAppNotSynced.md @@ -0,0 +1,23 @@ +--- +title: Argo App Not Synced +weight: 20 +--- + +# ArgoAppNotSynced + +## Meaning + +At least one Application in Argo CD is not in sync, meaning there is a difference between the state running in Kubernetes and the latest state in Git. + +## Impact + +A change that was supposed to be deployed may not have been deployed. How this affects the system depends on the change itself. + +## Diagnosis + +Check the application's diff in Argo CD. The diff will show the differences between the state in Git and the state in Kubernetes. + +## Mitigation + +To address this alert, you can either sync the application in Argo CD or revert the change in Git. +Make sure to check the diff before syncing or reverting, and consider talking to the person who made the change. 
diff --git a/content/runbooks/vector/VectorDiscardEvents.md b/content/runbooks/vector/VectorDiscardEvents.md new file mode 100644 index 0000000..620b005 --- /dev/null +++ b/content/runbooks/vector/VectorDiscardEvents.md @@ -0,0 +1,23 @@ +--- +title: Vector Discarded Events +weight: 20 +--- + +# VectorDiscardEvents + +## Meaning + +Vector discarded events because it was unable to keep up with the rate of incoming events. + +## Impact + +Some events were not processed by Vector, so the corresponding logs were not sent to Loki and are not available in Grafana. + +## Diagnosis + +Check the Vector logs for errors. If there are no errors, check the `vector_events_discarded_total` metric. If the metric is increasing, Vector is unable to keep up with the rate of incoming events. + +## Mitigation + +To address this alert, you can either scale up Vector or reduce the rate of incoming events. +Because Vector is deployed as a DaemonSet (one pod per node), it cannot be scaled by adding replicas; scaling up means raising its resource requests/limits, which applies to the Vector pod on every node.