diff --git a/cmd/base/healthz.go b/cmd/base/healthz.go index fcfe61454..90f645dfa 100644 --- a/cmd/base/healthz.go +++ b/cmd/base/healthz.go @@ -19,7 +19,6 @@ package katalyst_base import ( "context" "encoding/json" - "strings" "time" "go.uber.org/atomic" @@ -27,6 +26,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/metrics" "github.com/kubewharf/katalyst-core/pkg/util/general" + "github.com/kubewharf/katalyst-core/pkg/util/metric" ) const ( @@ -55,7 +55,7 @@ func (h *HealthzChecker) Run(ctx context.Context) { if !result.Ready { _ = h.emitter.StoreInt64(MetricNameUnhealthyRule, 1, metrics.MetricTypeNameRaw, metrics.MetricTag{Key: "rule", Val: string(key)}, - metrics.MetricTag{Key: "error_message", Val: strings.ReplaceAll(result.Message, " ", "_")}) + metrics.MetricTag{Key: "error_message", Val: metric.MetricTagValueFormat(result.Message)}) general.Warningf("%v healthz rule is not ready,error message is %v", string(key), result.Message) } } diff --git a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go index 438d4b0cb..ea8a4d1d7 100644 --- a/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go +++ b/pkg/agent/qrm-plugins/cpu/dynamicpolicy/policy.go @@ -55,6 +55,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/util/cgroup/common" "github.com/kubewharf/katalyst-core/pkg/util/general" "github.com/kubewharf/katalyst-core/pkg/util/machine" + "github.com/kubewharf/katalyst-core/pkg/util/metric" "github.com/kubewharf/katalyst-core/pkg/util/native" "github.com/kubewharf/katalyst-core/pkg/util/process" "github.com/kubewharf/katalyst-core/pkg/util/timemonitor" @@ -694,7 +695,9 @@ func (p *DynamicPolicy) GetTopologyHints(ctx context.Context, defer func() { p.RUnlock() if err != nil { - _ = p.emitter.StoreInt64(util.MetricNameGetTopologyHintsFailed, 1, metrics.MetricTypeNameRaw) + _ = p.emitter.StoreInt64(util.MetricNameGetTopologyHintsFailed, 1, metrics.MetricTypeNameRaw, + metrics.MetricTag{Key: "error_message", Val: metric.MetricTagValueFormat(err)}) + general.ErrorS(err, "GetTopologyHints failed", "podNamespace", req.PodNamespace, "podName", req.PodName, @@ -840,7 +843,8 @@ func (p *DynamicPolicy) Allocate(ctx context.Context, } } else if respErr != nil { _ = p.removeContainer(req.PodUid, req.ContainerName) - _ = p.emitter.StoreInt64(util.MetricNameAllocateFailed, 1, metrics.MetricTypeNameRaw) + _ = p.emitter.StoreInt64(util.MetricNameAllocateFailed, 1, metrics.MetricTypeNameRaw, + metrics.MetricTag{Key: "error_message", Val: metric.MetricTagValueFormat(respErr)}) } p.Unlock() @@ -933,7 +937,8 @@ func (p *DynamicPolicy) RemovePod(ctx context.Context, defer func() { p.Unlock() if err != nil { - _ = p.emitter.StoreInt64(util.MetricNameRemovePodFailed, 1, metrics.MetricTypeNameRaw) + _ = p.emitter.StoreInt64(util.MetricNameRemovePodFailed, 1, metrics.MetricTypeNameRaw, + metrics.MetricTag{Key: "error_message", Val: metric.MetricTagValueFormat(err)}) general.ErrorS(err, "RemovePod failed", "podUID", req.PodUid) } general.InfoS("finished", "duration", time.Since(startTime).String(), "podUID", req.PodUid) diff --git a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go index 8b65a05f8..058ad3402 100644 --- a/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go +++ b/pkg/agent/qrm-plugins/memory/dynamicpolicy/policy.go @@ -55,6 +55,7 @@ import ( "github.com/kubewharf/katalyst-core/pkg/util/asyncworker" "github.com/kubewharf/katalyst-core/pkg/util/general" "github.com/kubewharf/katalyst-core/pkg/util/machine" + "github.com/kubewharf/katalyst-core/pkg/util/metric" "github.com/kubewharf/katalyst-core/pkg/util/native" "github.com/kubewharf/katalyst-core/pkg/util/process" "github.com/kubewharf/katalyst-core/pkg/util/timemonitor" @@ -584,7 +585,8 @@ func (p *DynamicPolicy) GetTopologyHints(ctx context.Context, defer func() { p.RUnlock() if err != nil { - _ = p.emitter.StoreInt64(util.MetricNameGetTopologyHintsFailed, 1, metrics.MetricTypeNameRaw) + _ = p.emitter.StoreInt64(util.MetricNameGetTopologyHintsFailed, 1, metrics.MetricTypeNameRaw, + metrics.MetricTag{Key: "error_message", Val: metric.MetricTagValueFormat(err)}) general.ErrorS(err, "GetTopologyHints failed", "podNamespace", req.PodNamespace, "podName", req.PodName, @@ -626,7 +628,8 @@ func (p *DynamicPolicy) RemovePod(ctx context.Context, defer func() { p.Unlock() if err != nil { - _ = p.emitter.StoreInt64(util.MetricNameRemovePodFailed, 1, metrics.MetricTypeNameRaw) + _ = p.emitter.StoreInt64(util.MetricNameRemovePodFailed, 1, metrics.MetricTypeNameRaw, + metrics.MetricTag{Key: "error_message", Val: metric.MetricTagValueFormat(err)}) general.ErrorS(err, "RemovePod failed", "podUID", req.PodUid) } general.InfoS("finished", "duration", time.Since(startTime), "podUID", req.PodUid) @@ -652,7 +655,8 @@ func (p *DynamicPolicy) RemovePod(ctx context.Context, err = p.removePod(req.PodUid) if err != nil { general.ErrorS(err, "remove pod failed with error", "podUID", req.PodUid) - _ = p.emitter.StoreInt64(util.MetricNameRemovePodFailed, 1, metrics.MetricTypeNameRaw) + _ = p.emitter.StoreInt64(util.MetricNameRemovePodFailed, 1, metrics.MetricTypeNameRaw, + metrics.MetricTag{Key: "error_message", Val: metric.MetricTagValueFormat(err)}) return nil, err } @@ -944,7 +948,8 @@ func (p *DynamicPolicy) Allocate(ctx context.Context, } } else if respErr != nil { _ = p.removeContainer(req.PodUid, req.ContainerName) - _ = p.emitter.StoreInt64(util.MetricNameAllocateFailed, 1, metrics.MetricTypeNameRaw) + _ = p.emitter.StoreInt64(util.MetricNameAllocateFailed, 1, metrics.MetricTypeNameRaw, + metrics.MetricTag{Key: "error_message", Val: metric.MetricTagValueFormat(respErr)}) } p.Unlock() diff --git a/pkg/agent/qrm-plugins/network/staticpolicy/policy.go b/pkg/agent/qrm-plugins/network/staticpolicy/policy.go index 64f409115..70fd32f7c 100644 --- a/pkg/agent/qrm-plugins/network/staticpolicy/policy.go +++ b/pkg/agent/qrm-plugins/network/staticpolicy/policy.go @@ -45,6 +45,7 @@ import ( cgroupcmutils "github.com/kubewharf/katalyst-core/pkg/util/cgroup/manager" "github.com/kubewharf/katalyst-core/pkg/util/general" "github.com/kubewharf/katalyst-core/pkg/util/machine" + "github.com/kubewharf/katalyst-core/pkg/util/metric" "github.com/kubewharf/katalyst-core/pkg/util/native" "github.com/kubewharf/katalyst-core/pkg/util/qos" ) @@ -290,7 +291,8 @@ func (p *StaticPolicy) GetTopologyHints(_ context.Context, defer func() { p.Unlock() if err != nil { - _ = p.emitter.StoreInt64(util.MetricNameGetTopologyHintsFailed, 1, metrics.MetricTypeNameRaw) + _ = p.emitter.StoreInt64(util.MetricNameGetTopologyHintsFailed, 1, metrics.MetricTypeNameRaw, + metrics.MetricTag{Key: "error_message", Val: metric.MetricTagValueFormat(err)}) } }() @@ -531,7 +533,8 @@ func (p *StaticPolicy) Allocate(_ context.Context, defer func() { p.Unlock() if err != nil { - _ = p.emitter.StoreInt64(util.MetricNameAllocateFailed, 1, metrics.MetricTypeNameRaw) + _ = p.emitter.StoreInt64(util.MetricNameAllocateFailed, 1, metrics.MetricTypeNameRaw, + metrics.MetricTag{Key: "error_message", Val: metric.MetricTagValueFormat(err)}) } }() diff --git a/pkg/util/general/string.go b/pkg/util/general/string.go index f03b7c8d1..d4af715af 100644 --- a/pkg/util/general/string.go +++ b/pkg/util/general/string.go @@ -35,3 +35,12 @@ func StructToString(val interface{}) string { func BytesToString(b []byte) string { return string(b) } + +// TruncateString truncates the string to the first n characters +func TruncateString(s string, n int) string { + runeSlice := []rune(s) + if len(runeSlice) > n { + return string(runeSlice[:n]) + } + return s +} diff --git a/pkg/util/metric/store_util.go b/pkg/util/metric/store_util.go index e6ad315ea..12f5183e1 100644 --- a/pkg/util/metric/store_util.go +++ b/pkg/util/metric/store_util.go @@ -17,6 +17,8 @@ limitations under the License. package metric import ( + "fmt" + "strings" "time" v1 "k8s.io/api/core/v1" @@ -32,6 +34,8 @@ type Aggregator string const ( AggregatorSum Aggregator = "sum" AggregatorAvg Aggregator = "avg" + + MaxTagLength = 255 ) // ContainerMetricFilter is used to filter out unnecessary metrics if this function returns false @@ -141,3 +145,8 @@ func (c *MetricStore) AggregateCoreMetric(cpuset machine.CPUSet, metricName stri } return data } + +// MetricTagValueFormat formats the given tag value to a string that is suitable for metric tagging +func MetricTagValueFormat(tagValue interface{}) string { + return general.TruncateString(strings.ReplaceAll(fmt.Sprintf("%v", tagValue), " ", "_"), MaxTagLength) +}