From bb1d90d105c87dbdfd5135205d46594d98e1edc0 Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Wed, 6 Nov 2024 14:13:50 +0100 Subject: [PATCH] Improve e2e troubleshooting Improve / fix some issues with e2e tests: - Add more logs; print some useful information such as when cluster is still up - Improve readiness (e.g: had agents pods crashing) - Use more up to date templates for loki and kafka (similar to what we have in docs repo) --- Makefile | 2 +- e2e/basic/common.go | 1 + e2e/basic/flow_test.go | 1 + e2e/cluster/base/02-loki.yml | 74 ++++++++++++++++++++---- e2e/cluster/kind.go | 84 +++++++++++++++++++++------- e2e/cluster/kind_test.go | 31 +++++++--- e2e/cluster/tester/loki.go | 14 +++++ e2e/cluster/tester/pods.go | 15 ++++- e2e/ipfix/ipfix_test.go | 2 +- e2e/kafka/kafka_test.go | 16 ++++-- e2e/kafka/manifests/10-kafka-crd.yml | 4 +- 11 files changed, 192 insertions(+), 52 deletions(-) diff --git a/Makefile b/Makefile index 4d583c841..a00ccca53 100644 --- a/Makefile +++ b/Makefile @@ -170,7 +170,7 @@ tests-e2e: prereqs ## Run e2e tests go clean -testcache # making the local agent image available to kind in two ways, so it will work in different # environments: (1) as image tagged in the local repository (2) as image archive. - $(OCI_BIN) build . --build-arg TARGETARCH=$(GOARCH) -t localhost/ebpf-agent:test + $(OCI_BIN) build . --build-arg LDFLAGS="" --build-arg TARGETARCH=$(GOARCH) -t localhost/ebpf-agent:test $(OCI_BIN) save -o ebpf-agent.tar localhost/ebpf-agent:test GOOS=$(GOOS) go test -p 1 -timeout 30m -v -mod vendor -tags e2e ./e2e/... diff --git a/e2e/basic/common.go b/e2e/basic/common.go index 1bea80094..d62fc61d2 100644 --- a/e2e/basic/common.go +++ b/e2e/basic/common.go @@ -282,6 +282,7 @@ func (bt *FlowCaptureTester) lokiQuery(t *testing.T, logQL string) tester.LokiQu query, err = bt.Cluster.Loki().Query(1, logQL) require.NoError(t, err) require.NotNil(t, query) + require.NotNil(t, query.Data) require.NotEmpty(t, query.Data.Result) }, test.Interval(time.Second)) result := query.Data.Result[0] diff --git a/e2e/basic/flow_test.go b/e2e/basic/flow_test.go index 97035e686..60417af6e 100644 --- a/e2e/basic/flow_test.go +++ b/e2e/basic/flow_test.go @@ -152,6 +152,7 @@ func getPingFlows(t *testing.T, newerThan time.Time, expectedBytes int) (sent, r }, test.Interval(time.Second)) test.Eventually(t, time.Minute, func(t require.TestingT) { + // testCluster.Loki().DebugPrint(100, `{app="netobserv-flowcollector",DstK8S_OwnerName="pinger"}`) query, err = testCluster.Loki(). Query(1, fmt.Sprintf(`{SrcK8S_OwnerName="server",DstK8S_OwnerName="pinger"}`+ `|~"\"Proto\":1[,}]"`+ // Proto 1 == ICMP diff --git a/e2e/cluster/base/02-loki.yml b/e2e/cluster/base/02-loki.yml index 463f84086..1d6a1b270 100644 --- a/e2e/cluster/base/02-loki.yml +++ b/e2e/cluster/base/02-loki.yml @@ -20,6 +20,11 @@ data: server: http_listen_port: 3100 grpc_listen_port: 9096 + grpc_server_max_recv_msg_size: 10485760 + http_server_read_timeout: 1m + http_server_write_timeout: 1m + log_level: error + target: all common: path_prefix: /loki-store storage: @@ -31,9 +36,32 @@ data: instance_addr: 127.0.0.1 kvstore: store: inmemory + compactor: + compaction_interval: 5m + retention_enabled: true + retention_delete_delay: 2h + retention_delete_worker_count: 150 + frontend: + compress_responses: true + ingester: + chunk_encoding: snappy + chunk_retain_period: 1m + query_range: + align_queries_with_step: true + cache_results: true + max_retries: 5 + results_cache: + cache: + enable_fifocache: true + fifocache: + max_size_bytes: 500MB + validity: 24h + parallelise_shardable_queries: true + query_scheduler: + max_outstanding_requests_per_tenant: 2048 schema_config: configs: - - from: 2020-10-24 + - from: 2022-01-01 store: boltdb-shipper object_store: filesystem schema: v11 @@ -47,15 +75,39 @@ data: active_index_directory: /loki-store/index shared_store: filesystem cache_location: /loki-store/boltdb-cache - datasource.yaml: | - apiVersion: 1 - datasources: - - name: Loki - type: loki - access: proxy - url: http://localhost:3100 - isDefault: true - version: 1 + cache_ttl: 24h + limits_config: + ingestion_rate_strategy: global + ingestion_rate_mb: 10 + ingestion_burst_size_mb: 10 + max_label_name_length: 1024 + max_label_value_length: 2048 + max_label_names_per_series: 30 + reject_old_samples: true + reject_old_samples_max_age: 15m + creation_grace_period: 10m + enforce_metric_name: false + max_line_size: 256000 + max_line_size_truncate: false + max_entries_limit_per_query: 10000 + max_streams_per_user: 0 + max_global_streams_per_user: 0 + unordered_writes: true + max_chunks_per_query: 2000000 + max_query_length: 721h + max_query_parallelism: 32 + max_query_series: 10000 + cardinality_limit: 100000 + max_streams_matchers_per_query: 1000 + max_concurrent_tail_requests: 10 + retention_period: 24h + max_cache_freshness_per_query: 5m + max_queriers_per_tenant: 0 + per_stream_rate_limit: 3MB + per_stream_rate_limit_burst: 15MB + max_query_lookback: 0 + min_sharding_lookback: 0s + split_queries_by_interval: 1m --- apiVersion: apps/v1 kind: Deployment @@ -83,7 +135,7 @@ spec: name: loki-config containers: - name: loki - image: grafana/loki:2.4.1 + image: grafana/loki:2.9.0 volumeMounts: - mountPath: "/loki-store" name: loki-store diff --git a/e2e/cluster/kind.go b/e2e/cluster/kind.go index 2b728c3b7..39c1bb8f6 100644 --- a/e2e/cluster/kind.go +++ b/e2e/cluster/kind.go @@ -12,8 +12,10 @@ import ( "fmt" "io" "os" + "os/signal" "path" "sort" + "syscall" "testing" "time" @@ -90,18 +92,43 @@ var defaultBaseDeployments = map[DeployID]Deployment{ Loki: { Order: ExternalServices, ManifestFile: path.Join(packageDir(), "base", "02-loki.yml"), - ReadyFunction: func(*envconf.Config) error { - return (&tester.Loki{BaseURL: "http://127.0.0.1:30100"}).Ready() + Ready: &Readiness{ + Function: func(*envconf.Config) error { return (&tester.Loki{BaseURL: "http://localhost:30100"}).Ready() }, + Description: "Check that http://localhost:30100 is reachable (Loki NodePort)", + Timeout: 5 * time.Minute, + Retry: 5 * time.Second, }, }, FlowLogsPipeline: { Order: NetObservServices, ManifestFile: path.Join(packageDir(), "base", "03-flp.yml"), + Ready: &Readiness{ + Function: testPodsReady("flp"), + Description: "Check that flp pods are up and running", + Timeout: 5 * time.Minute, + Retry: 5 * time.Second, + }, }, Agent: { Order: WithAgent, ManifestFile: path.Join(packageDir(), "base", "04-agent.yml"), + Ready: &Readiness{ + Function: testPodsReady("netobserv-ebpf-agent"), + Description: "Check that agent pods are up and running", + Timeout: 5 * time.Minute, + Retry: 5 * time.Second, + }, }, } +func testPodsReady(dsName string) func(*envconf.Config) error { + return func(cfg *envconf.Config) error { + pods, err := tester.NewPods(cfg) + if err != nil { + return err + } + return pods.DSReady(context.Background(), "default", dsName) + } +} + // Deployment of components. Not only K8s deployments but also Pods, Services, DaemonSets, ... type Deployment struct { // Order of the deployment. Deployments with the same order will be executed by alphabetical @@ -109,9 +136,14 @@ type Deployment struct { Order DeployOrder // ManifestFile path to the kubectl-like YAML manifest file ManifestFile string - // ReadyFunction is an optional function that returns error if the deployment is not ready. - // Used when it's needed to wait before starting tests or deploying later components. - ReadyFunction func(*envconf.Config) error + Ready *Readiness +} + +type Readiness struct { + Function func(*envconf.Config) error + Description string + Timeout time.Duration + Retry time.Duration } // Kind cluster deployed by each TestMain function, prepared for a given test scenario. @@ -146,6 +178,7 @@ func Deploy(def Deployment) Option { // Timeout for long-running operations (e.g. deployments, readiness probes...) func Timeout(t time.Duration) Option { + log.Infof("Timeout set to %s", t.String()) return func(k *Kind) { k.timeout = t } @@ -156,6 +189,9 @@ func Timeout(t time.Duration) Option { // backend doesn't provide access to the local images, where the ebpf-agent.tar container image // is located. Usually it will be the project root. func NewKind(kindClusterName, baseDir string, options ...Option) *Kind { + fmt.Println() + fmt.Println() + log.Infof("Starting KIND cluster %s", kindClusterName) k := &Kind{ testEnv: env.New(), baseDir: baseDir, @@ -191,10 +227,19 @@ func (k *Kind) Run(m *testing.M) { currentOrder = c.Order } envFuncs = append(envFuncs, deploy(c)) - readyFuncs = append(readyFuncs, withTimeout(isReady(c), k.timeout)) + readyFuncs = append(readyFuncs, isReady(c)) } envFuncs = append(envFuncs, readyFuncs...) + exit := make(chan os.Signal, 1) + signal.Notify(exit, os.Interrupt, syscall.SIGTERM) + go func() { + <-exit + fmt.Println("SIGTERM received, cluster might still be running") + fmt.Printf("To clean up, run: \033[33mkind delete cluster --name %s\033[0m\n", k.clusterName) + os.Exit(1) + }() + log.Info("starting kind setup") code := k.testEnv.Setup(envFuncs...). Finish( @@ -244,7 +289,7 @@ func (k *Kind) TestEnv() env.Environment { // Loki client pointing to the Loki instance inside the test cluster func (k *Kind) Loki() *tester.Loki { - return &tester.Loki{BaseURL: "http://127.0.0.1:30100"} + return &tester.Loki{BaseURL: "http://localhost:30100"} } func deploy(definition Deployment) env.Func { @@ -285,6 +330,7 @@ func deployManifestFile(definition Deployment, if !errors.Is(err, io.EOF) { return fmt.Errorf("decoding manifest raw object: %w", err) } + log.WithField("file", definition.ManifestFile).Info("done") // eof return nil } @@ -344,7 +390,7 @@ func (k *Kind) loadLocalImage() env.Func { } // withTimeout retries the execution of an env.Func until it succeeds or a timeout is reached -func withTimeout(f env.Func, timeout time.Duration) env.Func { +func withTimeout(f env.Func, timeout, retry time.Duration) env.Func { tlog := log.WithField("function", "withTimeout") return func(ctx context.Context, config *envconf.Config) (context.Context, error) { start := time.Now() @@ -356,26 +402,24 @@ func withTimeout(f env.Func, timeout time.Duration) env.Func { if time.Since(start) > timeout { return ctx, fmt.Errorf("timeout (%s) trying to execute function: %w", timeout, err) } - tlog.WithError(err).Debug("function did not succeed. Retrying after 5s") - time.Sleep(5 * time.Second) + tlog.WithError(err).Debugf("function did not succeed. Retrying after %s", retry.String()) + time.Sleep(retry) } } } // isReady succeeds if the passed deployment does not have ReadyFunction, or it succeeds func isReady(definition Deployment) env.Func { - return withTimeout(func(ctx context.Context, cfg *envconf.Config) (context.Context, error) { - if definition.ReadyFunction != nil { - log.WithFields(logrus.Fields{ - "function": "isReady", - "deployment": definition.ManifestFile, - }).Debug("checking readiness") - if err := definition.ReadyFunction(cfg); err != nil { + if definition.Ready != nil { + log.WithFields(logrus.Fields{"deployment": definition.ManifestFile, "readiness": definition.Ready.Description}).Infof("Readiness check set with timeout: %s", definition.Ready.Timeout.String()) + return withTimeout(func(ctx context.Context, cfg *envconf.Config) (context.Context, error) { + if err := definition.Ready.Function(cfg); err != nil { return ctx, fmt.Errorf("component not ready: %w", err) } - } - return ctx, nil - }, time.Minute*20) + return ctx, nil + }, definition.Ready.Timeout, definition.Ready.Retry) + } + return func(ctx context.Context, _ *envconf.Config) (context.Context, error) { return ctx, nil } } // helper to get the base directory of this package, allowing to load the test deployment diff --git a/e2e/cluster/kind_test.go b/e2e/cluster/kind_test.go index fd7397ef6..0967dec41 100644 --- a/e2e/cluster/kind_test.go +++ b/e2e/cluster/kind_test.go @@ -18,13 +18,28 @@ func TestOrderManifests(t *testing.T) { Deploy(Deployment{Order: ExternalServices, ManifestFile: "sql"}), Override(Loki, Deployment{Order: ExternalServices, ManifestFile: "loki"})) + var orders []DeployOrder + var files []string + for _, m := range tc.orderedManifests() { + orders = append(orders, m.Order) + files = append(files, m.ManifestFile) + } + // verify that deployments are overridden and/or inserted in proper order - require.Equal(t, []Deployment{ - {Order: Preconditions, ManifestFile: path.Join(packageDir(), "base", "01-permissions.yml")}, - {Order: ExternalServices, ManifestFile: "sql"}, - {Order: ExternalServices, ManifestFile: "loki"}, - {Order: NetObservServices, ManifestFile: path.Join(packageDir(), "base", "03-flp.yml")}, - {Order: WithAgent, ManifestFile: path.Join(packageDir(), "base", "04-agent.yml")}, - {ManifestFile: "pods.yml"}, - }, tc.orderedManifests()) + require.Equal(t, []DeployOrder{ + Preconditions, + ExternalServices, + ExternalServices, + NetObservServices, + WithAgent, + 0, + }, orders) + require.Equal(t, []string{ + path.Join(packageDir(), "base", "01-permissions.yml"), + "sql", + "loki", + path.Join(packageDir(), "base", "03-flp.yml"), + path.Join(packageDir(), "base", "04-agent.yml"), + "pods.yml", + }, files) } diff --git a/e2e/cluster/tester/loki.go b/e2e/cluster/tester/loki.go index 4c7f2b2a8..c6dee3b22 100644 --- a/e2e/cluster/tester/loki.go +++ b/e2e/cluster/tester/loki.go @@ -51,6 +51,20 @@ func (l *Loki) Ready() error { return nil } +func (l *Loki) DebugPrint(limit int, query string) { + fmt.Printf("---- DEBUG PRINT %d ----\n", limit) + resp, err := l.Query(limit, query) + if err != nil { + fmt.Printf("Error: %v\n", err) + return + } + if resp == nil { + fmt.Printf("Response is nil\n") + return + } + fmt.Printf("LOKI CONTENT: %v\n", resp.Data.Result) +} + // Query executes an arbitrary logQL query, given a limit in the results func (l *Loki) Query(limit int, logQL string) (*LokiQueryResponse, error) { status, body, err := l.get(fmt.Sprintf("%s?%s=%d&%s&%s=%s", diff --git a/e2e/cluster/tester/pods.go b/e2e/cluster/tester/pods.go index 5ac141f51..e7190afc0 100644 --- a/e2e/cluster/tester/pods.go +++ b/e2e/cluster/tester/pods.go @@ -33,9 +33,7 @@ func NewPods(cfg *envconf.Config) (*Pods, error) { }, nil } -func (p *Pods) MACAddress( - ctx context.Context, namespace, name, iface string, -) (net.HardwareAddr, error) { +func (p *Pods) MACAddress(ctx context.Context, namespace, name, iface string) (net.HardwareAddr, error) { mac, errStr, err := p.Execute(ctx, namespace, name, "cat", "/sys/class/net/"+iface+"/address") if err != nil { return nil, fmt.Errorf("executing command: %w", err) @@ -78,3 +76,14 @@ func (p *Pods) Execute(ctx context.Context, namespace, name string, command ...s } return buf.String(), errBuf.String(), nil } + +func (p *Pods) DSReady(ctx context.Context, namespace, name string) error { + ds, err := p.client.AppsV1().DaemonSets(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("getting DS %s: %w", name, err) + } + if ds.Status.NumberReady != 1 { + return fmt.Errorf("%s not ready", name) + } + return nil +} diff --git a/e2e/ipfix/ipfix_test.go b/e2e/ipfix/ipfix_test.go index 6a30bebf7..34d66c3c7 100644 --- a/e2e/ipfix/ipfix_test.go +++ b/e2e/ipfix/ipfix_test.go @@ -14,7 +14,7 @@ import ( const ( clusterNamePrefix = "ipfix-test-cluster" - testTimeout = 20 * time.Minute + testTimeout = 10 * time.Minute namespace = "default" ) diff --git a/e2e/kafka/kafka_test.go b/e2e/kafka/kafka_test.go index 3f144b6ea..ad17e3809 100644 --- a/e2e/kafka/kafka_test.go +++ b/e2e/kafka/kafka_test.go @@ -43,12 +43,16 @@ func TestMain(m *testing.M) { }), cluster.Deploy(cluster.Deployment{ Order: cluster.ExternalServices, ManifestFile: path.Join("manifests", "11-kafka-cluster.yml"), - ReadyFunction: func(cfg *envconf.Config) error { - // wait for kafka to be ready - if !checkResources(cfg.Client(), "kafka-cluster-zookeeper", "kafka-cluster-kafka", "strimzi-cluster-operator", "kafka-cluster-entity-operator") { - return errors.New("waiting for kafka cluster to be ready") - } - return nil + Ready: &cluster.Readiness{ + Function: func(cfg *envconf.Config) error { + // wait for kafka to be ready + if !checkResources(cfg.Client(), "kafka-cluster-zookeeper", "kafka-cluster-kafka", "strimzi-cluster-operator", "kafka-cluster-entity-operator") { + return errors.New("waiting for kafka cluster to be ready") + } + return nil + }, + Timeout: 10 * time.Minute, + Retry: 20 * time.Second, }, }), cluster.Override(cluster.FlowLogsPipeline, cluster.Deployment{ diff --git a/e2e/kafka/manifests/10-kafka-crd.yml b/e2e/kafka/manifests/10-kafka-crd.yml index 469cd72a2..1082217fa 100644 --- a/e2e/kafka/manifests/10-kafka-crd.yml +++ b/e2e/kafka/manifests/10-kafka-crd.yml @@ -11678,10 +11678,10 @@ spec: resources: limits: cpu: 1000m - memory: 384Mi + memory: 500Mi requests: cpu: 200m - memory: 384Mi + memory: 100Mi strategy: type: Recreate