Skip to content

Commit

Permalink
Export internal prom metrics (#417)
Browse files Browse the repository at this point in the history
  • Loading branch information
anjmao authored Dec 3, 2024
1 parent 220112a commit a7a3778
Show file tree
Hide file tree
Showing 25 changed files with 275 additions and 64 deletions.
4 changes: 3 additions & 1 deletion charts/kvisor/values-local.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ agent:
enabled: true
extraArgs:
log-level: debug
prom-metrics-export-interval: 10s
container-stats-scrape-interval: 10s
pyroscope-addr: http://kvisord-pyroscope:4040
file-hash-enricher-enabled: true
Expand All @@ -37,6 +38,7 @@ controller:
replicas: 1
extraArgs:
log-level: debug
prom-metrics-export-interval: 10s
image-scan-enabled: true
image-scan-interval: 5s
image-scan-init-delay: 5s
Expand Down Expand Up @@ -68,7 +70,7 @@ mockServer:
tag: latest

clickhouse:
enabled: true
enabled: false
persistentVolume:
size: 10Gi

Expand Down
20 changes: 17 additions & 3 deletions cmd/agent/daemon/app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"github.com/castai/kvisor/pkg/processtree"
"github.com/go-playground/validator/v10"
"github.com/grafana/pyroscope-go"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/samber/lo"
"golang.org/x/sync/errgroup"
Expand All @@ -51,6 +52,8 @@ type Config struct {
LogRateInterval time.Duration `json:"logRateInterval"`
LogRateBurst int `json:"logRateBurst"`
SendLogsLevel string `json:"sendLogsLevel"`
PromMetricsExportEnabled bool `json:"promMetricsExportEnabled"`
PromMetricsExportInterval time.Duration `json:"promMetricsExportInterval"`
Version string `json:"version"`
BTFPath string `json:"BTFPath"`
PyroscopeAddr string `json:"pyroscopeAddr"`
Expand Down Expand Up @@ -122,6 +125,9 @@ func (a *App) Run(ctx context.Context) error {
Inform: true,
},
}

podName := os.Getenv("POD_NAME")

var log *logging.Logger
var exporters *state.Exporters
// Castai specific setup if config is valid.
Expand All @@ -137,6 +143,14 @@ func (a *App) Run(ctx context.Context) error {
castaiLogsExporter := castai.NewLogsExporter(castaiClient)
go castaiLogsExporter.Run(ctx) //nolint:errcheck

if a.cfg.PromMetricsExportEnabled {
castaiMetricsExporter := castai.NewPromMetricsExporter(log, castaiLogsExporter, prometheus.DefaultGatherer, castai.PromMetricsExporterConfig{
PodName: podName,
ExportInterval: a.cfg.PromMetricsExportInterval,
})
go castaiMetricsExporter.Run(ctx) //nolint:errcheck
}

logCfg.Export = logging.ExportConfig{
ExportFunc: castaiLogsExporter.ExportFunc(),
MinLevel: logging.MustParseLevel(a.cfg.SendLogsLevel),
Expand Down Expand Up @@ -214,7 +228,7 @@ func (a *App) Run(ctx context.Context) error {
defer log.Infof("stopping kvisor agent, version=%s", a.cfg.Version)

if addr := a.cfg.PyroscopeAddr; addr != "" {
withPyroscope(addr)
withPyroscope(podName, addr)
}

cgroupClient, err := cgroup.NewClient(log, a.cfg.HostCgroupsDir)
Expand Down Expand Up @@ -550,12 +564,12 @@ func waitWithTimeout(errg *errgroup.Group, timeout time.Duration) error {
}
}

func withPyroscope(addr string) {
func withPyroscope(podName, addr string) {
if _, err := pyroscope.Start(pyroscope.Config{
ApplicationName: "kvisor-agent",
ServerAddress: addr,
Tags: map[string]string{
"pod": os.Getenv("POD_NAME"),
"pod": podName,
},
ProfileTypes: []pyroscope.ProfileType{
pyroscope.ProfileCPU,
Expand Down
29 changes: 17 additions & 12 deletions cmd/agent/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,12 @@ func NewRunCommand(version string) *cobra.Command {

var (
logLevel = command.Flags().String("log-level", slog.LevelInfo.String(), "log level")
logRateInterval = command.Flags().Duration("log-rate-iterval", 100*time.Millisecond, "Log rate limit interval")
logRateInterval = command.Flags().Duration("log-rate-interval", 100*time.Millisecond, "Log rate limit interval")
logRateBurst = command.Flags().Int("log-rate-burst", 100, "Log rate burst")

promMetricsExportEnabled = command.Flags().Bool("prom-metrics-export-enabled", false, "Enabled sending internal prometheus metrics")
promMetricsExportInterval = command.Flags().Duration("prom-metrics-export-interval", 5*time.Minute, "Internal prometheus metrics export interval")

sendLogLevel = command.Flags().String("send-logs-level", "", "send logs level")
containerdSockPath = command.Flags().String("containerd-sock", "/run/containerd/containerd.sock", "Path to containerd socket file")
metricsHTTPListenPort = command.Flags().Int("metrics-http-listen-port", 6060, "metrics http listen port")
Expand Down Expand Up @@ -130,17 +133,19 @@ func NewRunCommand(version string) *cobra.Command {
}

if err := app.New(&app.Config{
LogLevel: *logLevel,
LogRateInterval: *logRateInterval,
LogRateBurst: *logRateBurst,
SendLogsLevel: *sendLogLevel,
Version: version,
BTFPath: *btfPath,
PyroscopeAddr: *pyroscopeAddr,
ContainerdSockPath: *containerdSockPath,
HostCgroupsDir: *hostCgroupsDir,
MetricsHTTPListenPort: *metricsHTTPListenPort,
ContainerStatsEnabled: *containerStatsEnabled,
LogLevel: *logLevel,
LogRateInterval: *logRateInterval,
LogRateBurst: *logRateBurst,
SendLogsLevel: *sendLogLevel,
PromMetricsExportEnabled: *promMetricsExportEnabled,
PromMetricsExportInterval: *promMetricsExportInterval,
Version: version,
BTFPath: *btfPath,
PyroscopeAddr: *pyroscopeAddr,
ContainerdSockPath: *containerdSockPath,
HostCgroupsDir: *hostCgroupsDir,
MetricsHTTPListenPort: *metricsHTTPListenPort,
ContainerStatsEnabled: *containerStatsEnabled,
State: state.Config{
ContainerStatsScrapeInterval: *containerStatsScrapeInterval,
NetflowExportInterval: *netflowExportInterval,
Expand Down
2 changes: 1 addition & 1 deletion cmd/agent/daemon/enrichment/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ import (
"runtime/debug"

castpb "github.com/castai/kvisor/api/v1/runtime"
"github.com/castai/kvisor/cmd/agent/daemon/metrics"
"github.com/castai/kvisor/pkg/ebpftracer/types"
"github.com/castai/kvisor/pkg/logging"
"github.com/castai/kvisor/pkg/metrics"
)

type EnrichRequest struct {
Expand Down
12 changes: 2 additions & 10 deletions pkg/metrics/metrics.go → cmd/agent/daemon/metrics/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,6 @@ const (
)

var (
ControllerImagesCount = promauto.NewGauge(prometheus.GaugeOpts{
Name: "kvisor_controller_images_count",
})

ControllerPendingImagesCount = promauto.NewGauge(prometheus.GaugeOpts{
Name: "kvisor_controller_pending_images_count",
})

AgentPulledEventsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "kvisor_agent_kernel_pulled_events_total",
Help: "Counter for tracking pulled events from kernel rate",
Expand Down Expand Up @@ -102,11 +94,11 @@ var (

EBPFProgramRunTimeMetrics = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "kvisor_agent_ebpf_program_run_time_ms",
Help: "Run time of eBPF programs in milliseconds as reported by the kernel",
Help: "Run time of eBPF programs in milliseconds as reported by the kernel",
}, []string{EBPFProgramLabel})

EBPFProgramRunCountMetrics = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "kvisor_agent_ebpf_program_run_count",
Help: "Number of times a certain eBPF program run as reported by the kernel",
Help: "Number of times a certain eBPF program run as reported by the kernel",
}, []string{EBPFProgramLabel})
)
2 changes: 1 addition & 1 deletion cmd/agent/daemon/state/castai_container_stats_exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ import (
"time"

castpb "github.com/castai/kvisor/api/v1/runtime"
"github.com/castai/kvisor/cmd/agent/daemon/metrics"
"github.com/castai/kvisor/pkg/castai"
"github.com/castai/kvisor/pkg/logging"
"github.com/castai/kvisor/pkg/metrics"
"google.golang.org/grpc"
)

Expand Down
2 changes: 1 addition & 1 deletion cmd/agent/daemon/state/castai_events_exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ import (
"time"

castpb "github.com/castai/kvisor/api/v1/runtime"
"github.com/castai/kvisor/cmd/agent/daemon/metrics"
"github.com/castai/kvisor/pkg/castai"
"github.com/castai/kvisor/pkg/logging"
"github.com/castai/kvisor/pkg/metrics"
"github.com/prometheus/client_golang/prometheus"
"google.golang.org/grpc"
)
Expand Down
2 changes: 1 addition & 1 deletion cmd/agent/daemon/state/castai_netflow_exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ import (
"time"

castpb "github.com/castai/kvisor/api/v1/runtime"
"github.com/castai/kvisor/cmd/agent/daemon/metrics"
"github.com/castai/kvisor/pkg/castai"
"github.com/castai/kvisor/pkg/logging"
"github.com/castai/kvisor/pkg/metrics"
"google.golang.org/grpc"
)

Expand Down
10 changes: 5 additions & 5 deletions cmd/agent/daemon/state/castai_process_tree_exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@ import (
"time"

castpb "github.com/castai/kvisor/api/v1/runtime"
"github.com/castai/kvisor/cmd/agent/daemon/metrics"
"github.com/castai/kvisor/pkg/castai"
"github.com/castai/kvisor/pkg/logging"
"github.com/castai/kvisor/pkg/metrics"
"github.com/castai/kvisor/pkg/processtree"
"github.com/prometheus/client_golang/prometheus"
"google.golang.org/grpc"
)

const (
castaiProcessTreeLabel = "castai_process_tree"
castaiProcessTreeLabel = "castai_process_tree"
)

type CastaiProcessTreeExporter struct {
Expand Down Expand Up @@ -81,13 +81,13 @@ func toProtoProcessTreeEvent(e processtree.ProcessTreeEvent) *castpb.ProcessTree

for i, pe := range e.Events {
events[i] = &castpb.ProcessEvent{
Timestamp: uint64(pe.Timestamp.UnixNano()), // nolint:gosec
Timestamp: uint64(pe.Timestamp.UnixNano()), // nolint:gosec
ContainerId: pe.ContainerID,
Process: &castpb.Process{
Pid: pe.Process.PID,
StartTime: uint64(pe.Process.StartTime), // nolint:gosec
StartTime: uint64(pe.Process.StartTime), // nolint:gosec
Ppid: pe.Process.PPID,
ParentStartTime: uint64(pe.Process.ParentStartTime), // nolint:gosec
ParentStartTime: uint64(pe.Process.ParentStartTime), // nolint:gosec
Args: pe.Process.Args,
Filepath: pe.Process.FilePath,
ExitTime: pe.Process.ExitTime,
Expand Down
2 changes: 1 addition & 1 deletion cmd/agent/daemon/state/clickhouse_netflow_exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ import (

"github.com/ClickHouse/clickhouse-go/v2"
castaipb "github.com/castai/kvisor/api/v1/runtime"
"github.com/castai/kvisor/cmd/agent/daemon/metrics"
"github.com/castai/kvisor/pkg/logging"
"github.com/castai/kvisor/pkg/metrics"
)

func NewClickhouseNetflowExporter(log *logging.Logger, conn clickhouse.Conn, queueSize int) *ClickHouseNetflowExporter {
Expand Down
8 changes: 4 additions & 4 deletions cmd/agent/daemon/state/clickhouse_process_tree_exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ import (
"time"

"github.com/ClickHouse/clickhouse-go/v2"
"github.com/castai/kvisor/cmd/agent/daemon/metrics"
"github.com/castai/kvisor/pkg/logging"
"github.com/castai/kvisor/pkg/metrics"
"github.com/castai/kvisor/pkg/processtree"
"github.com/samber/lo"
)
Expand Down Expand Up @@ -139,7 +139,7 @@ func (c *ClickhouseProcessTreeExporter) generateExitEvents(ctx context.Context,
StartTime: startTime,
PPID: ppid,
ParentStartTime: parentStartTime,
ExitTime: uint64(now.UnixNano()), // nolint:gosec
ExitTime: uint64(now.UnixNano()), // nolint:gosec
},
Action: processtree.ProcessExit,
})
Expand Down Expand Up @@ -171,9 +171,9 @@ func (c *ClickhouseProcessTreeExporter) asyncWrite(ctx context.Context, wait boo
clickhouse.Named("pid", e.Process.PID),
// NOTE: StartTime will be stored in seconds since boot, since this is the best resolution we can get everywhere
// we need. This should still be good enough to identify a process.
clickhouse.Named("start_time", uint64(e.Process.StartTime/time.Second)), // nolint:gosec
clickhouse.Named("start_time", uint64(e.Process.StartTime/time.Second)), // nolint:gosec
clickhouse.Named("ppid", e.Process.PPID),
clickhouse.Named("parent_start_time", uint64(e.Process.ParentStartTime/time.Second)), // nolint:gosec
clickhouse.Named("parent_start_time", uint64(e.Process.ParentStartTime/time.Second)), // nolint:gosec
clickhouse.Named("args", e.Process.Args),
clickhouse.Named("file_path", e.Process.FilePath),
clickhouse.Named("exit_time", e.Process.ExitTime),
Expand Down
2 changes: 1 addition & 1 deletion cmd/agent/daemon/state/events_pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ import (

castpb "github.com/castai/kvisor/api/v1/runtime"
"github.com/castai/kvisor/cmd/agent/daemon/enrichment"
"github.com/castai/kvisor/cmd/agent/daemon/metrics"
ebpftypes "github.com/castai/kvisor/pkg/ebpftracer/types"
"github.com/castai/kvisor/pkg/metrics"
"github.com/cespare/xxhash/v2"
"github.com/elastic/go-freelru"
)
Expand Down
2 changes: 1 addition & 1 deletion cmd/agent/daemon/state/netflow_pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ import (

kubepb "github.com/castai/kvisor/api/v1/kube"
castpb "github.com/castai/kvisor/api/v1/runtime"
"github.com/castai/kvisor/cmd/agent/daemon/metrics"
"github.com/castai/kvisor/pkg/containers"
"github.com/castai/kvisor/pkg/ebpftracer"
"github.com/castai/kvisor/pkg/ebpftracer/types"
"github.com/castai/kvisor/pkg/metrics"
"golang.org/x/sync/errgroup"
"golang.org/x/sys/unix"
)
Expand Down
13 changes: 13 additions & 0 deletions cmd/controller/app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"github.com/grafana/pyroscope-go"
"github.com/labstack/echo/v4"
"github.com/labstack/echo/v4/middleware"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/samber/lo"
"golang.org/x/sync/errgroup"
Expand All @@ -39,6 +40,9 @@ type Config struct {
LogRateInterval time.Duration `json:"logRateInterval"`
LogRateBurst int `json:"logRateBurst"`

PromMetricsExportEnabled bool `json:"promMetricsExportEnabled"`
PromMetricsExportInterval time.Duration `json:"promMetricsExportInterval"`

// Built binary version.
Version string `json:"version"`
ChartVersion string `json:"chartVersion"`
Expand Down Expand Up @@ -105,6 +109,15 @@ func (a *App) Run(ctx context.Context) error {
defer castaiClient.Close()
castaiLogsExporter := castai.NewLogsExporter(castaiClient)
go castaiLogsExporter.Run(ctx) //nolint:errcheck

if a.cfg.PromMetricsExportEnabled {
castaiMetricsExporter := castai.NewPromMetricsExporter(log, castaiLogsExporter, prometheus.DefaultGatherer, castai.PromMetricsExporterConfig{
PodName: a.cfg.PodName,
ExportInterval: a.cfg.PromMetricsExportInterval,
})
go castaiMetricsExporter.Run(ctx) //nolint:errcheck
}

logCfg.Export = logging.ExportConfig{
ExportFunc: castaiLogsExporter.ExportFunc(),
MinLevel: slog.LevelInfo,
Expand Down
34 changes: 19 additions & 15 deletions cmd/controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,11 @@ var (
serverHTTPListenPort = pflag.Int("http-listen-port", 8080, "server http listen port")
kubeServerListenPort = pflag.Int("kube-server-listen-port", 8090, "kube server grpc http listen port")

logLevel = pflag.String("log-level", slog.LevelDebug.String(), "Log level")
logRateInterval = pflag.Duration("log-rate-iterval", 100*time.Millisecond, "Log rate limit interval")
logRateBurst = pflag.Int("log-rate-burst", 100, "Log rate burst")
logLevel = pflag.String("log-level", slog.LevelDebug.String(), "Log level")
logRateInterval = pflag.Duration("log-rate-interval", 100*time.Millisecond, "Log rate limit interval")
logRateBurst = pflag.Int("log-rate-burst", 100, "Log rate burst")
promMetricsExportEnabled = pflag.Bool("prom-metrics-export-enabled", false, "Enabled sending internal prometheus metrics")
promMetricsExportInterval = pflag.Duration("prom-metrics-export-interval", 5*time.Minute, "Internal prometheus metrics export interval")

chartVersion = pflag.String("chart-version", "", "Helm chart version")

Expand Down Expand Up @@ -162,18 +164,20 @@ func main() {
podName = "localenv"
}
appInstance := app.New(app.Config{
LogLevel: *logLevel,
LogRateInterval: *logRateInterval,
LogRateBurst: *logRateBurst,
PodName: podName,
PodNamespace: podNs,
Version: Version,
ChartVersion: *chartVersion,
PyroscopeAddr: *pyroscopeAddr,
MetricsHTTPListenPort: *metricsHTTPListenPort,
HTTPListenPort: *serverHTTPListenPort,
KubeServerListenPort: *kubeServerListenPort,
CastaiEnv: castaiClientCfg,
LogLevel: *logLevel,
LogRateInterval: *logRateInterval,
LogRateBurst: *logRateBurst,
PromMetricsExportEnabled: *promMetricsExportEnabled,
PromMetricsExportInterval: *promMetricsExportInterval,
PodName: podName,
PodNamespace: podNs,
Version: Version,
ChartVersion: *chartVersion,
PyroscopeAddr: *pyroscopeAddr,
MetricsHTTPListenPort: *metricsHTTPListenPort,
HTTPListenPort: *serverHTTPListenPort,
KubeServerListenPort: *kubeServerListenPort,
CastaiEnv: castaiClientCfg,
CastaiController: state.CastaiConfig{
RemoteConfigSyncDuration: *castaiConfigSyncDuration,
},
Expand Down
Loading

0 comments on commit a7a3778

Please sign in to comment.