Skip to content

Commit

Permalink
Merge branch 'main' into cri-labels-annotations
Browse files Browse the repository at this point in the history
  • Loading branch information
matas-cast committed Dec 5, 2024
2 parents ba98d82 + 0711cd8 commit d2a0922
Show file tree
Hide file tree
Showing 51 changed files with 1,361 additions and 1,006 deletions.
4 changes: 2 additions & 2 deletions charts/kvisor/Chart.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ apiVersion: v2
name: castai-kvisor
description: CAST AI security and observability.
type: application
version: 1.0.55
appVersion: "v1.27.0"
version: 1.0.58
appVersion: "v1.29.1"
2 changes: 2 additions & 0 deletions charts/kvisor/values-local.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ agent:
enabled: true
extraArgs:
log-level: debug
prom-metrics-export-interval: 10s
container-stats-scrape-interval: 10s
pyroscope-addr: http://kvisord-pyroscope:4040
file-hash-enricher-enabled: true
Expand All @@ -39,6 +40,7 @@ controller:
replicas: 1
extraArgs:
log-level: debug
prom-metrics-export-interval: 10s
image-scan-enabled: true
image-scan-interval: 5s
image-scan-init-delay: 5s
Expand Down
38 changes: 26 additions & 12 deletions cmd/agent/daemon/app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/castai/kvisor/pkg/processtree"
"github.com/go-playground/validator/v10"
"github.com/grafana/pyroscope-go"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/samber/lo"
"golang.org/x/sync/errgroup"
Expand All @@ -52,6 +53,8 @@ type Config struct {
LogRateInterval time.Duration `json:"logRateInterval"`
LogRateBurst int `json:"logRateBurst"`
SendLogsLevel string `json:"sendLogsLevel"`
PromMetricsExportEnabled bool `json:"promMetricsExportEnabled"`
PromMetricsExportInterval time.Duration `json:"promMetricsExportInterval"`
Version string `json:"version"`
BTFPath string `json:"BTFPath"`
PyroscopeAddr string `json:"pyroscopeAddr"`
Expand All @@ -61,11 +64,13 @@ type Config struct {
State state.Config `json:"state"`
ContainerStatsEnabled bool `json:"containerStatsEnabled"`
EBPFEventsEnabled bool `json:"EBPFEventsEnabled"`
EBPFEventsPerCPUBuffer int `validate:"required" json:"EBPFEventsPerCPUBuffer"`
EBPFEventsOutputChanSize int `validate:"required" json:"EBPFEventsOutputChanSize"`
EBPFEventsStdioExporterEnabled bool `json:"EBPFEventsStdioExporterEnabled"`
EBPFMetrics EBPFMetricsConfig `json:"EBPFMetrics"`
EBPFEventsPolicyConfig ebpftracer.EventsPolicyConfig `json:"EBPFEventsPolicyConfig"`
EBPFSignalEventsRingBufferSize uint32 `json:"EBPFSignalEventsRingBufferSize"`
EBPFEventsRingBufferSize uint32 `json:"EBPFEventsRingBufferSize"`
EBPFSkbEventsRingBufferSize uint32 `json:"EBPFSkbEventsRingBufferSize"`
MutedNamespaces []string `json:"mutedNamespaces"`
SignatureEngineConfig signature.SignatureEngineConfig `json:"signatureEngineConfig"`
Castai castai.Config `json:"castai"`
Expand All @@ -89,7 +94,6 @@ type EnricherConfig struct {
type NetflowConfig struct {
Enabled bool `json:"enabled"`
SampleSubmitIntervalSeconds uint64 `json:"sampleSubmitIntervalSeconds"`
OutputChanSize int `json:"outputChanSize"`
Grouping ebpftracer.NetflowGrouping `json:"grouping"`
}

Expand Down Expand Up @@ -126,6 +130,9 @@ func (a *App) Run(ctx context.Context) error {
Inform: true,
},
}

podName := os.Getenv("POD_NAME")

var log *logging.Logger
var exporters *state.Exporters
// Castai specific setup if config is valid.
Expand All @@ -141,6 +148,14 @@ func (a *App) Run(ctx context.Context) error {
castaiLogsExporter := castai.NewLogsExporter(castaiClient)
go castaiLogsExporter.Run(ctx) //nolint:errcheck

if a.cfg.PromMetricsExportEnabled {
castaiMetricsExporter := castai.NewPromMetricsExporter(log, castaiLogsExporter, prometheus.DefaultGatherer, castai.PromMetricsExporterConfig{
PodName: podName,
ExportInterval: a.cfg.PromMetricsExportInterval,
})
go castaiMetricsExporter.Run(ctx) //nolint:errcheck
}

logCfg.Export = logging.ExportConfig{
ExportFunc: castaiLogsExporter.ExportFunc(),
MinLevel: logging.MustParseLevel(a.cfg.SendLogsLevel),
Expand Down Expand Up @@ -218,7 +233,7 @@ func (a *App) Run(ctx context.Context) error {
defer log.Infof("stopping kvisor agent, version=%s", a.cfg.Version)

if addr := a.cfg.PyroscopeAddr; addr != "" {
withPyroscope(addr)
withPyroscope(podName, addr)
}

cgroupClient, err := cgroup.NewClient(log, a.cfg.HostCgroupsDir)
Expand Down Expand Up @@ -283,7 +298,9 @@ func (a *App) Run(ctx context.Context) error {

tracer := ebpftracer.New(log, ebpftracer.Config{
BTFPath: a.cfg.BTFPath,
EventsPerCPUBuffer: a.cfg.EBPFEventsPerCPUBuffer,
SignalEventsRingBufferSize: a.cfg.EBPFSignalEventsRingBufferSize,
EventsRingBufferSize: a.cfg.EBPFEventsRingBufferSize,
SkbEventsRingBufferSize: a.cfg.EBPFSkbEventsRingBufferSize,
EventsOutputChanSize: a.cfg.EBPFEventsOutputChanSize,
DefaultCgroupsVersion: cgroupClient.DefaultCgroupVersion().String(),
ContainerClient: containersClient,
Expand All @@ -292,7 +309,6 @@ func (a *App) Run(ctx context.Context) error {
SignatureEngine: signatureEngine,
MountNamespacePIDStore: mountNamespacePIDStore,
HomePIDNS: pidNSID,
NetflowOutputChanSize: a.cfg.Netflow.OutputChanSize,
NetflowSampleSubmitIntervalSeconds: a.cfg.Netflow.SampleSubmitIntervalSeconds,
NetflowGrouping: a.cfg.Netflow.Grouping,
TrackSyscallStats: cfg.ContainerStatsEnabled,
Expand All @@ -301,6 +317,7 @@ func (a *App) Run(ctx context.Context) error {
ProgramMetricsEnabled: cfg.EBPFMetrics.ProgramMetricsEnabled,
TracerMetricsEnabled: cfg.EBPFMetrics.TracerMetricsEnabled,
},
PodName: podName,
})
if err := tracer.Load(); err != nil {
return fmt.Errorf("loading tracer: %w", err)
Expand Down Expand Up @@ -415,11 +432,8 @@ func buildEBPFPolicy(log *logging.Logger, cfg *Config, exporters *state.Exporter
switch enabledEvent {
case events.SockSetState:
policy.Events = append(policy.Events, &ebpftracer.EventPolicy{
ID: events.SockSetState,
FilterGenerator: ebpftracer.RateLimitPrivateIP(ebpftracer.RateLimitPolicy{
Rate: 100,
Burst: 1,
}),
ID: events.SockSetState,
FilterGenerator: ebpftracer.SkipPrivateIP(), // TODO: Move private ip skip to kernel side.
})
case events.NetPacketDNSBase:
policy.Events = append(policy.Events, dnsEventPolicy)
Expand Down Expand Up @@ -562,12 +576,12 @@ func waitWithTimeout(errg *errgroup.Group, timeout time.Duration) error {
}
}

func withPyroscope(addr string) {
func withPyroscope(podName, addr string) {
if _, err := pyroscope.Start(pyroscope.Config{
ApplicationName: "kvisor-agent",
ServerAddress: addr,
Tags: map[string]string{
"pod": os.Getenv("POD_NAME"),
"pod": podName,
},
ProfileTypes: []pyroscope.ProfileType{
pyroscope.ProfileCPU,
Expand Down
48 changes: 28 additions & 20 deletions cmd/agent/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,12 @@ func NewRunCommand(version string) *cobra.Command {

var (
logLevel = command.Flags().String("log-level", slog.LevelInfo.String(), "log level")
logRateInterval = command.Flags().Duration("log-rate-iterval", 100*time.Millisecond, "Log rate limit interval")
logRateInterval = command.Flags().Duration("log-rate-interval", 100*time.Millisecond, "Log rate limit interval")
logRateBurst = command.Flags().Int("log-rate-burst", 100, "Log rate burst")

promMetricsExportEnabled = command.Flags().Bool("prom-metrics-export-enabled", false, "Enabled sending internal prometheus metrics")
promMetricsExportInterval = command.Flags().Duration("prom-metrics-export-interval", 5*time.Minute, "Internal prometheus metrics export interval")

sendLogLevel = command.Flags().String("send-logs-level", "", "send logs level")
containerdSockPath = command.Flags().String("containerd-sock", "/run/containerd/containerd.sock", "Path to containerd socket file")
metricsHTTPListenPort = command.Flags().Int("metrics-http-listen-port", 6060, "metrics http listen port")
Expand All @@ -62,17 +65,20 @@ func NewRunCommand(version string) *cobra.Command {
events.NetPacketDNSBase,
events.MagicWrite,
events.ProcessOomKilled,
events.StdioViaSocket,
// events.StdioViaSocket, // TODO(anjmao): Tracing this event via syscall hooks is very expensive. Rework the whole syscall tracing.
events.TtyWrite,
events.NetPacketSSHBase,
},
}
ebpfEventsStdioExporterEnabled = command.Flags().Bool("ebpf-events-stdio-exporter-enabled", false, "Export ebpf event to stdio")
ebpfEventsPerCPUBuffer = command.Flags().Int("ebpf-events-per-cpu-buffer", os.Getpagesize()*64, "Ebpf per cpu buffer size")
ebpfEventsOutputChanSize = command.Flags().Int("ebpf-events-output-queue-size", 4096, "Ebpf user spaces output channel size")
ebpfTracerMetricsEnabled = command.Flags().Bool("ebpf-tracer-metrics-enabled", false, "Enables the export of tracer related metrics from eBPF")
ebpfTracerMetricsEnabled = command.Flags().Bool("ebpf-tracer-metrics-enabled", true, "Enables the export of tracer related metrics from eBPF")
ebpfProgramMetricsEnabled = command.Flags().Bool("ebpf-program-metrics-enabled", false, "Enables the export of metrics about eBPF programs")

EBPFSignalEventsRingBufferSize = command.Flags().Uint32("ebpf-signal-events-ring-buffer-size", 1<<20, "Ebpf ring buffer size in bytes for priority events. Should be power of 2")
EBPFEventsRingBufferSize = command.Flags().Uint32("ebpf-events-ring-buffer-size", 1<<20, "Ebpf ring buffer size in bytes for events. Should be power of 2")
EBPFSkbEventsRingBufferSize = command.Flags().Uint32("ebpf-skb-events-ring-buffer-size", 1<<20, "Ebpf ring buffer size in bytes for skb network events. Should be power of 2")

mutedNamespaces = command.Flags().StringSlice("ignored-namespaces", []string{"kube-system", "calico", "calico-system"},
"List of namespaces to ignore tracing events for. To ignore multiple namespaces, separate by comma or pass flag multiple times."+
" For example: --ignored-namespaces=kube-system,calico-system")
Expand All @@ -86,7 +92,6 @@ func NewRunCommand(version string) *cobra.Command {

netflowEnabled = command.Flags().Bool("netflow-enabled", false, "Enables netflow tracking")
netflowSampleSubmitIntervalSeconds = command.Flags().Uint64("netflow-sample-submit-interval-seconds", 15, "Netflow sample submit interval")
netflowOutputChanSize = command.Flags().Int("netflow-output-queue-size", 4096, "Netflow output queue size")
netflowExportInterval = command.Flags().Duration("netflow-export-interval", 15*time.Second, "Netflow export interval")
netflowGrouping = ebpftracer.NetflowGroupingDropSrcPort

Expand Down Expand Up @@ -134,31 +139,35 @@ func NewRunCommand(version string) *cobra.Command {
}

if err := app.New(&app.Config{
LogLevel: *logLevel,
LogRateInterval: *logRateInterval,
LogRateBurst: *logRateBurst,
SendLogsLevel: *sendLogLevel,
Version: version,
BTFPath: *btfPath,
PyroscopeAddr: *pyroscopeAddr,
ContainerdSockPath: *containerdSockPath,
HostCgroupsDir: *hostCgroupsDir,
MetricsHTTPListenPort: *metricsHTTPListenPort,
ContainerStatsEnabled: *containerStatsEnabled,
LogLevel: *logLevel,
LogRateInterval: *logRateInterval,
LogRateBurst: *logRateBurst,
SendLogsLevel: *sendLogLevel,
PromMetricsExportEnabled: *promMetricsExportEnabled,
PromMetricsExportInterval: *promMetricsExportInterval,
Version: version,
BTFPath: *btfPath,
PyroscopeAddr: *pyroscopeAddr,
ContainerdSockPath: *containerdSockPath,
HostCgroupsDir: *hostCgroupsDir,
MetricsHTTPListenPort: *metricsHTTPListenPort,
ContainerStatsEnabled: *containerStatsEnabled,
State: state.Config{
ContainerStatsScrapeInterval: *containerStatsScrapeInterval,
NetflowExportInterval: *netflowExportInterval,
},
EBPFEventsEnabled: *ebpfEventsEnabled,
EBPFEventsStdioExporterEnabled: *ebpfEventsStdioExporterEnabled,
EBPFEventsPerCPUBuffer: *ebpfEventsPerCPUBuffer,
EBPFEventsOutputChanSize: *ebpfEventsOutputChanSize,
EBPFMetrics: app.EBPFMetricsConfig{
TracerMetricsEnabled: *ebpfTracerMetricsEnabled,
ProgramMetricsEnabled: *ebpfProgramMetricsEnabled,
},
EBPFEventsPolicyConfig: ebpfEventsPolicy,
MutedNamespaces: *mutedNamespaces,
EBPFEventsPolicyConfig: ebpfEventsPolicy,
EBPFSignalEventsRingBufferSize: *EBPFSignalEventsRingBufferSize,
EBPFEventsRingBufferSize: *EBPFEventsRingBufferSize,
EBPFSkbEventsRingBufferSize: *EBPFSkbEventsRingBufferSize,
MutedNamespaces: *mutedNamespaces,
SignatureEngineConfig: signature.SignatureEngineConfig{
InputChanSize: *signatureEngineInputEventChanSize,
OutputChanSize: *signatureEngineOutputEventChanSize,
Expand All @@ -177,7 +186,6 @@ func NewRunCommand(version string) *cobra.Command {
Netflow: app.NetflowConfig{
Enabled: *netflowEnabled,
SampleSubmitIntervalSeconds: *netflowSampleSubmitIntervalSeconds,
OutputChanSize: *netflowOutputChanSize,
Grouping: netflowGrouping,
},
Clickhouse: app.ClickhouseConfig{
Expand Down
2 changes: 1 addition & 1 deletion cmd/agent/daemon/enrichment/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ import (
"runtime/debug"

castpb "github.com/castai/kvisor/api/v1/runtime"
"github.com/castai/kvisor/cmd/agent/daemon/metrics"
"github.com/castai/kvisor/pkg/ebpftracer/types"
"github.com/castai/kvisor/pkg/logging"
"github.com/castai/kvisor/pkg/metrics"
)

type EnrichRequest struct {
Expand Down
17 changes: 2 additions & 15 deletions pkg/metrics/metrics.go → cmd/agent/daemon/metrics/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,6 @@ const (
)

var (
ControllerImagesCount = promauto.NewGauge(prometheus.GaugeOpts{
Name: "kvisor_controller_images_count",
})

ControllerPendingImagesCount = promauto.NewGauge(prometheus.GaugeOpts{
Name: "kvisor_controller_pending_images_count",
})

AgentPulledEventsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "kvisor_agent_kernel_pulled_events_total",
Help: "Counter for tracking pulled events from kernel rate",
Expand All @@ -31,11 +23,6 @@ var (
Help: "Counter for tracking pulled events bytes from kernel rate",
}, []string{EventTypeLabel})

AgentKernelLostEventsTotal = promauto.NewCounter(prometheus.CounterOpts{
Name: "kvisor_agent_kernel_lost_events_total",
Help: "Counter for tracking lost events from kernel rate",
})

AgentSkippedEventsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "kvisor_agent_skipped_events_total",
Help: "Counter for tracking skipped events rate",
Expand Down Expand Up @@ -102,11 +89,11 @@ var (

EBPFProgramRunTimeMetrics = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "kvisor_agent_ebpf_program_run_time_ms",
Help: "Run time of eBPF programs in milliseconds as reported by the kernel",
Help: "Run time of eBPF programs in milliseconds as reported by the kernel",
}, []string{EBPFProgramLabel})

EBPFProgramRunCountMetrics = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "kvisor_agent_ebpf_program_run_count",
Help: "Number of times a certain eBPF program run as reported by the kernel",
Help: "Number of times a certain eBPF program run as reported by the kernel",
}, []string{EBPFProgramLabel})
)
2 changes: 1 addition & 1 deletion cmd/agent/daemon/state/castai_container_stats_exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ import (
"time"

castpb "github.com/castai/kvisor/api/v1/runtime"
"github.com/castai/kvisor/cmd/agent/daemon/metrics"
"github.com/castai/kvisor/pkg/castai"
"github.com/castai/kvisor/pkg/logging"
"github.com/castai/kvisor/pkg/metrics"
"google.golang.org/grpc"
)

Expand Down
2 changes: 1 addition & 1 deletion cmd/agent/daemon/state/castai_events_exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ import (
"time"

castpb "github.com/castai/kvisor/api/v1/runtime"
"github.com/castai/kvisor/cmd/agent/daemon/metrics"
"github.com/castai/kvisor/pkg/castai"
"github.com/castai/kvisor/pkg/logging"
"github.com/castai/kvisor/pkg/metrics"
"github.com/prometheus/client_golang/prometheus"
"google.golang.org/grpc"
)
Expand Down
2 changes: 1 addition & 1 deletion cmd/agent/daemon/state/castai_netflow_exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ import (
"time"

castpb "github.com/castai/kvisor/api/v1/runtime"
"github.com/castai/kvisor/cmd/agent/daemon/metrics"
"github.com/castai/kvisor/pkg/castai"
"github.com/castai/kvisor/pkg/logging"
"github.com/castai/kvisor/pkg/metrics"
"google.golang.org/grpc"
)

Expand Down
10 changes: 5 additions & 5 deletions cmd/agent/daemon/state/castai_process_tree_exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@ import (
"time"

castpb "github.com/castai/kvisor/api/v1/runtime"
"github.com/castai/kvisor/cmd/agent/daemon/metrics"
"github.com/castai/kvisor/pkg/castai"
"github.com/castai/kvisor/pkg/logging"
"github.com/castai/kvisor/pkg/metrics"
"github.com/castai/kvisor/pkg/processtree"
"github.com/prometheus/client_golang/prometheus"
"google.golang.org/grpc"
)

const (
castaiProcessTreeLabel = "castai_process_tree"
castaiProcessTreeLabel = "castai_process_tree"
)

type CastaiProcessTreeExporter struct {
Expand Down Expand Up @@ -81,13 +81,13 @@ func toProtoProcessTreeEvent(e processtree.ProcessTreeEvent) *castpb.ProcessTree

for i, pe := range e.Events {
events[i] = &castpb.ProcessEvent{
Timestamp: uint64(pe.Timestamp.UnixNano()), // nolint:gosec
Timestamp: uint64(pe.Timestamp.UnixNano()), // nolint:gosec
ContainerId: pe.ContainerID,
Process: &castpb.Process{
Pid: pe.Process.PID,
StartTime: uint64(pe.Process.StartTime), // nolint:gosec
StartTime: uint64(pe.Process.StartTime), // nolint:gosec
Ppid: pe.Process.PPID,
ParentStartTime: uint64(pe.Process.ParentStartTime), // nolint:gosec
ParentStartTime: uint64(pe.Process.ParentStartTime), // nolint:gosec
Args: pe.Process.Args,
Filepath: pe.Process.FilePath,
ExitTime: pe.Process.ExitTime,
Expand Down
2 changes: 1 addition & 1 deletion cmd/agent/daemon/state/clickhouse_netflow_exporter.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ import (

"github.com/ClickHouse/clickhouse-go/v2"
castaipb "github.com/castai/kvisor/api/v1/runtime"
"github.com/castai/kvisor/cmd/agent/daemon/metrics"
"github.com/castai/kvisor/pkg/logging"
"github.com/castai/kvisor/pkg/metrics"
)

func NewClickhouseNetflowExporter(log *logging.Logger, conn clickhouse.Conn, queueSize int) *ClickHouseNetflowExporter {
Expand Down
Loading

0 comments on commit d2a0922

Please sign in to comment.