Skip to content

Commit

Permalink
Merge pull request #128 from vshn/nonslaalerts
Browse files Browse the repository at this point in the history
Move non-slo alerts to comp functions
  • Loading branch information
wejdross authored Feb 5, 2024
2 parents 345fd46 + a9e4e7a commit a64a561
Show file tree
Hide file tree
Showing 9 changed files with 259 additions and 3 deletions.
11 changes: 11 additions & 0 deletions apis/vshn/v1/dbaas_vshn_postgresql.go
Original file line number Diff line number Diff line change
Expand Up @@ -335,3 +335,14 @@ func (pg *VSHNPostgreSQL) GetInstanceNamespace() string {
func (pg *XVSHNPostgreSQL) GetInstanceNamespace() string {
	// The instance namespace is the fixed "vshn-postgresql-" prefix followed
	// by the name of the composite.
	compositeName := pg.GetName()
	return fmt.Sprintf("vshn-postgresql-%s", compositeName)
}

// GetBackupRetention returns the retention definition for this backup.
// !!! This is just a placeholder to satisfy InfoGetter interface !!!
// It always returns the zero value of K8upRetentionPolicy.
func (pg *VSHNPostgreSQL) GetBackupRetention() K8upRetentionPolicy {
	return K8upRetentionPolicy{}
}

// GetServiceName returns the name of this service, which is always
// "postgresql".
func (pg *VSHNPostgreSQL) GetServiceName() string {
	return "postgresql"
}
11 changes: 11 additions & 0 deletions apis/vshn/v1/vshn_minio.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,3 +186,14 @@ func (v *VSHNMinio) GetFullMaintenanceSchedule() VSHNDBaaSMaintenanceScheduleSpe
schedule.TimeOfDay = v.GetMaintenanceTimeOfDay()
return schedule
}

// GetBackupRetention returns the retention definition for this backup.
// !!! This is just a placeholder to satisfy InfoGetter interface !!!
// The returned policy is always empty.
func (v *VSHNMinio) GetBackupRetention() K8upRetentionPolicy {
	emptyPolicy := K8upRetentionPolicy{}
	return emptyPolicy
}

// GetServiceName returns the name of this service, which is always "minio".
func (v *VSHNMinio) GetServiceName() string {
	const serviceName = "minio"
	return serviceName
}
167 changes: 167 additions & 0 deletions pkg/comp-functions/functions/common/non_sla_prom_rules.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
package common

import (
"context"
"fmt"
"strings"

fnproto "github.com/crossplane/function-sdk-go/proto/v1beta1"
promV1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
"github.com/vshn/appcat/v4/pkg/comp-functions/runtime"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
controllerruntime "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// Values shared by all generated non-SLA alert rules.
var (
	// synTeam is the value of the "syn_team" label set on every rule.
	synTeam = "schedar"
	// severityCritical is the value of the "severity" label set on every rule.
	severityCritical = "critical"
	// memoryContainers maps the service segment of an instance namespace
	// (the part right after "vshn-") to the container name used by the
	// generated rules.
	memoryContainers = map[string]string{
		"redis":      "redis",
		"postgresql": "patroni",
		"minio":      "minio",
		"mariadb":    "mariadb",
	}
)

// GenerateNonSLAPromRules returns a composition function step that adds the
// non-SLA Prometheus alert rules for a service to the desired state.
//
// obj is handed to svc.GetObservedComposite on every invocation of the
// returned function and must implement InfoGetter. A failure to read the
// composite or a type that doesn't implement InfoGetter yields a fatal result,
// a failure to generate the rules only yields a warning result.
func GenerateNonSLAPromRules(obj client.Object) func(ctx context.Context, svc *runtime.ServiceRuntime) *fnproto.Result {
	return func(ctx context.Context, svc *runtime.ServiceRuntime) *fnproto.Result {
		log := controllerruntime.LoggerFrom(ctx)
		log.Info("Starting non SLA prometheus rules")

		log.V(1).Info("Transforming", "obj", svc)

		if err := svc.GetObservedComposite(obj); err != nil {
			return runtime.NewFatalResult(fmt.Errorf("can't get composite: %w", err))
		}

		elem, ok := obj.(InfoGetter)
		if !ok {
			// There is no error value available at this point, so a dedicated
			// one has to be created to report a meaningful cause.
			return runtime.NewFatalResult(fmt.Errorf("type %T doesn't implement InfoGetter", obj))
		}

		if err := generatePromeRules(elem, svc); err != nil {
			log.Error(err, "Can't create non SLA prometheus rules")
			return runtime.NewWarningResult("can't create prometheus rules: " + err.Error())
		}

		log.Info("Non SLA prometheus rules added successfully", "instanceNamespace", elem.GetInstanceNamespace())

		return nil
	}
}

// generatePromeRules builds the PrometheusRule object with the non-SLA alerts
// for the given instance and adds it to the desired state of svc under the
// resource name "<name>-non-sla-alerts".
//
// The following alerts are generated, all with severity critical:
//   - PersistentVolumeFillingUp: less than 3% of the volume is free for 1m
//   - PersistentVolumeExpectedToFillUp: less than 15% of the volume is free and
//     it is predicted to fill up within four days, for 1h
//   - MemoryCritical: memory usage of the service container is above 85% of
//     its limit for 2h
//
// It returns an error if the instance namespace can't be parsed, if no
// container is known for the service encoded in the instance namespace, or if
// the object can't be added to the desired state.
func generatePromeRules(elem InfoGetter, svc *runtime.ServiceRuntime) error {
	var (
		minuteInterval  promV1.Duration = "1m"
		hourInterval    promV1.Duration = "1h"
		twoHourInterval promV1.Duration = "2h"
	)

	name := elem.GetName()
	namespace := elem.GetInstanceNamespace()

	instanceNamespaceRegex, instanceNamespaceSplitted, err := getInstanceNamespaceRegex(namespace)
	if err != nil {
		return fmt.Errorf("can't parse instance namespace %q: %w", namespace, err)
	}

	// Without a known container the object name would start with a hyphen and
	// the memory alert would select container="", so fail early instead.
	containerName, ok := memoryContainers[instanceNamespaceSplitted[1]]
	if !ok {
		return fmt.Errorf("no container known for service %q of instance namespace %q", instanceNamespaceSplitted[1], namespace)
	}

	prometheusRules := &promV1.PrometheusRule{
		ObjectMeta: metav1.ObjectMeta{
			Name:      containerName + "-non-slo-rules",
			Namespace: namespace,
		},
		Spec: promV1.PrometheusRuleSpec{
			Groups: []promV1.RuleGroup{
				{
					Name: containerName + "-non-slo-rules",
					Rules: []promV1.Rule{
						{
							Alert: containerName + "PersistentVolumeFillingUp",
							Annotations: map[string]string{
								"description": "The volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is only {{ $value | humanizePercentage }} free.",
								"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup",
								"summary":     "PersistentVolume is filling up.",
							},
							Expr: intstr.FromString(fmt.Sprintf("label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} / kubelet_volume_stats_capacity_bytes{job=\"kubelet\",metrics_path=\"/metrics\"}) < 0.03 and kubelet_volume_stats_used_bytes{job=\"kubelet\",metrics_path=\"/metrics\"} > 0 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode=\"ReadOnlyMany\"} == 1 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"}== 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, \"name\", \"$1\", \"namespace\",\"%s\")", instanceNamespaceRegex)),
							For:  minuteInterval,
							Labels: map[string]string{
								"severity": severityCritical,
								"syn_team": synTeam,
							},
						},
						{
							Alert: containerName + "PersistentVolumeExpectedToFillUp",
							Annotations: map[string]string{
								"description": "Based on recent sampling, the volume claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available.",
								"runbook_url": "https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup",
								"summary":     "PersistentVolume is expected to fill up.",
							},
							Expr: intstr.FromString(fmt.Sprintf("label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\"} / kubelet_volume_stats_capacity_bytes{job=\"kubelet\",metrics_path=\"/metrics\"}) < 0.15 and kubelet_volume_stats_used_bytes{job=\"kubelet\",metrics_path=\"/metrics\"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode=\"ReadOnlyMany\"} == 1 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"}== 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, \"name\", \"$1\", \"namespace\",\"%s\")", instanceNamespaceRegex)),
							For:  hourInterval,
							Labels: map[string]string{
								"severity": severityCritical,
								"syn_team": synTeam,
							},
						},
						{
							Alert: containerName + "MemoryCritical",
							Annotations: map[string]string{
								"description": "The memory claimed by the instance {{ $labels.name }} in namespace {{ $labels.label_appcat_vshn_io_claim_namespace }} has been over 85% for 2 hours.\n Please reduce the load of this instance, or increase the memory.",
								"runbook_url": "https://hub.syn.tools/appcat/runbooks/vshn-generic.html#MemoryCritical",
								"summary":     "Memory usage critical.",
							},
							Expr: intstr.FromString(fmt.Sprintf("label_replace( topk(1, (max(container_memory_working_set_bytes{container=\"%s\"})without (name, id) / on(container,pod,namespace) kube_pod_container_resource_limits{resource=\"memory\"}* 100) > 85) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, \"name\", \"$1\", \"namespace\",\"%s\")", containerName, instanceNamespaceRegex)),
							For:  twoHourInterval,
							Labels: map[string]string{
								"severity": severityCritical,
								"syn_team": synTeam,
							},
						},
					},
				},
			},
		},
	}

	return svc.SetDesiredKubeObject(prometheusRules, name+"-non-sla-alerts")
}

// getInstanceNamespaceRegex builds the regex used in the Prometheus rules to
// extract the instance name from an instance namespace.
//
// The instance namespace is split on "-". The first two segments are kept
// literally and everything after them is replaced by "(.+)-.+", f.e.:
//
//	vshn-postgresql-customer-namespace-whatever -> vshn-postgresql-(.+)-.+
//	vshn-redis-foo-bar                          -> vshn-redis-(.+)-.+
//	vshn-minio-foo-bar                          -> vshn-minio-(.+)-.+
//
// It returns the regex, the segments of the split instance namespace and an
// error if the namespace is shorter than 7 characters, has fewer than three
// segments or contains an empty segment. On error the other results are
// empty.
func getInstanceNamespaceRegex(instanceNamespace string) (string, []string, error) {
	// "vshn-" alone takes 5 characters, anything shorter than 7 makes no sense
	if len(instanceNamespace) < 7 {
		return "", nil, fmt.Errorf("instance namespace %q is way too short", instanceNamespace)
	}

	splitted := strings.Split(instanceNamespace, "-")
	// at least [vshn, serviceName, instanceName] should be present
	if len(splitted) < 3 {
		return "", nil, fmt.Errorf("instance namespace %q has fewer than three hyphen separated segments", instanceNamespace)
	}

	// An empty segment is the result of a leading, trailing or doubled hyphen.
	for _, val := range splitted {
		if val == "" {
			return "", nil, fmt.Errorf("instance namespace %q contains an empty segment", instanceNamespace)
		}
	}

	return fmt.Sprintf("%s-%s-(.+)-.+", splitted[0], splitted[1]), splitted, nil
}
43 changes: 43 additions & 0 deletions pkg/comp-functions/functions/common/non_sla_prom_rules_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package common

import (
"regexp"
"testing"
)

// TestGetInstanceNamespaceRegex checks getInstanceNamespaceRegex against valid
// and broken instance namespaces.
//
// The test is deliberately not called TestMain: that name is reserved for the
// func TestMain(m *testing.M) hook of the testing package.
func TestGetInstanceNamespaceRegex(t *testing.T) {
	// Matches the expected shape of the result, f.e. "vshn-redis-(.+)-.+".
	checkValidNamespaceRegex := regexp.MustCompile(`[a-z]*-[a-z]*-\(\.\+\)\-\.\+`)
	goodTestCases := []string{
		"vshn-postgresql-development-app1",
		"vshn-postgresql-prod-app2",
		"vshn-minio-main-cluster-prod",
		"vshn-mariadb-prd",
		"vshn-kafka-with-very-long-but-valid-name-including-many-separators-because-we-can",
	}

	brokenCases := []string{
		"",
		"vshn-postgresql",
		"vshn-postgresql-",
		"a-b-a",
		"vshnpostgresqlnoseparator",
		"vshn-redi1s-sadfasd",
	}

	for _, val := range goodTestCases {
		regex, _, err := getInstanceNamespaceRegex(val)
		// A good case must return no error AND a well-formed regex, so either
		// problem on its own is a failure.
		if err != nil || !checkValidNamespaceRegex.MatchString(regex) {
			t.Errorf("Failed goodTestCases test case for: %q, regex: %q, error: %v", val, regex, err)
		}
	}

	for _, val := range brokenCases {
		regex, splitted, err := getInstanceNamespaceRegex(val)
		// A broken case is fine as long as it either returns an error or
		// doesn't produce a well-formed regex.
		if err == nil && checkValidNamespaceRegex.MatchString(regex) {
			t.Errorf("Failed brokenCases test case for: %q, regex: %q, splitted: %v", val, regex, splitted)
		}
	}
}
10 changes: 9 additions & 1 deletion pkg/comp-functions/functions/vshnmariadb/register.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
package vshnmariadb

import "github.com/vshn/appcat/v4/pkg/comp-functions/runtime"
import (
vshnv1 "github.com/vshn/appcat/v4/apis/vshn/v1"
"github.com/vshn/appcat/v4/pkg/comp-functions/functions/common"
"github.com/vshn/appcat/v4/pkg/comp-functions/runtime"
)

func init() {
runtime.RegisterService("mariadb", runtime.Service{
Expand All @@ -18,6 +22,10 @@ func init() {
Name: "backup",
Execute: AddBackupMariadb,
},
{
Name: "non-sla-prometheus-rules",
Execute: common.GenerateNonSLAPromRules(&vshnv1.VSHNMariaDB{}),
},
},
})
}
10 changes: 9 additions & 1 deletion pkg/comp-functions/functions/vshnminio/register.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
package vshnminio

import "github.com/vshn/appcat/v4/pkg/comp-functions/runtime"
import (
vshnv1 "github.com/vshn/appcat/v4/apis/vshn/v1"
"github.com/vshn/appcat/v4/pkg/comp-functions/functions/common"
"github.com/vshn/appcat/v4/pkg/comp-functions/runtime"
)

func init() {
runtime.RegisterService("minio", runtime.Service{
Expand All @@ -18,6 +22,10 @@ func init() {
Name: "maintenance",
Execute: AddMaintenanceJob,
},
{
Name: "non-sla-prometheus-rules",
Execute: common.GenerateNonSLAPromRules(&vshnv1.VSHNMinio{}),
},
},
})
}
4 changes: 4 additions & 0 deletions pkg/comp-functions/functions/vshnpostgres/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ func init() {
Name: "delay-cluster-deployment",
Execute: DelayClusterDeployment,
},
{
Name: "non-sla-prometheus-rules",
Execute: common.GenerateNonSLAPromRules(&vshnv1.VSHNPostgreSQL{}),
},
{
Name: "pgbouncer-settings",
Execute: addPGBouncerSettings,
Expand Down
2 changes: 1 addition & 1 deletion pkg/comp-functions/functions/vshnredis/redis_deploy.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ func DeployRedis(ctx context.Context, svc *runtime.ServiceRuntime) *xfnproto.Res
err = common.BootstrapInstanceNs(ctx, comp, "redis", "namespace-conditions", svc)
if err != nil {
err = fmt.Errorf("cannot bootstrap instance namespace: %w", err)
return runtime.NewFatalResult(err)
return runtime.NewWarningResult(err.Error())
}

return nil
Expand Down
4 changes: 4 additions & 0 deletions pkg/comp-functions/functions/vshnredis/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ func init() {
Name: "user-alerting",
Execute: common.AddUserAlerting(&vshnv1.VSHNRedis{}),
},
{
Name: "non-sla-prometheus-rules",
Execute: common.GenerateNonSLAPromRules(&vshnv1.VSHNRedis{}),
},
},
})
}

0 comments on commit a64a561

Please sign in to comment.