Skip to content

Commit

Permalink
Improve e2e troubleshooting
Browse files Browse the repository at this point in the history
Improve / fix some issues with e2e tests:
- Add more logs; print some useful information such as when cluster is
  still up
- Improve readiness checks (e.g. agent pods were crashing)
- Use more up to date templates for loki and kafka (similar to what we
  have in docs repo)
  • Loading branch information
jotak committed Nov 6, 2024
1 parent 626d526 commit bb1d90d
Show file tree
Hide file tree
Showing 11 changed files with 192 additions and 52 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ tests-e2e: prereqs ## Run e2e tests
go clean -testcache
# making the local agent image available to kind in two ways, so it will work in different
# environments: (1) as image tagged in the local repository (2) as image archive.
$(OCI_BIN) build . --build-arg TARGETARCH=$(GOARCH) -t localhost/ebpf-agent:test
$(OCI_BIN) build . --build-arg LDFLAGS="" --build-arg TARGETARCH=$(GOARCH) -t localhost/ebpf-agent:test
$(OCI_BIN) save -o ebpf-agent.tar localhost/ebpf-agent:test
GOOS=$(GOOS) go test -p 1 -timeout 30m -v -mod vendor -tags e2e ./e2e/...

Expand Down
1 change: 1 addition & 0 deletions e2e/basic/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,7 @@ func (bt *FlowCaptureTester) lokiQuery(t *testing.T, logQL string) tester.LokiQu
query, err = bt.Cluster.Loki().Query(1, logQL)
require.NoError(t, err)
require.NotNil(t, query)
require.NotNil(t, query.Data)
require.NotEmpty(t, query.Data.Result)
}, test.Interval(time.Second))
result := query.Data.Result[0]
Expand Down
1 change: 1 addition & 0 deletions e2e/basic/flow_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ func getPingFlows(t *testing.T, newerThan time.Time, expectedBytes int) (sent, r
}, test.Interval(time.Second))

test.Eventually(t, time.Minute, func(t require.TestingT) {
// testCluster.Loki().DebugPrint(100, `{app="netobserv-flowcollector",DstK8S_OwnerName="pinger"}`)
query, err = testCluster.Loki().
Query(1, fmt.Sprintf(`{SrcK8S_OwnerName="server",DstK8S_OwnerName="pinger"}`+
`|~"\"Proto\":1[,}]"`+ // Proto 1 == ICMP
Expand Down
74 changes: 63 additions & 11 deletions e2e/cluster/base/02-loki.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ data:
server:
http_listen_port: 3100
grpc_listen_port: 9096
grpc_server_max_recv_msg_size: 10485760
http_server_read_timeout: 1m
http_server_write_timeout: 1m
log_level: error
target: all
common:
path_prefix: /loki-store
storage:
Expand All @@ -31,9 +36,32 @@ data:
instance_addr: 127.0.0.1
kvstore:
store: inmemory
compactor:
compaction_interval: 5m
retention_enabled: true
retention_delete_delay: 2h
retention_delete_worker_count: 150
frontend:
compress_responses: true
ingester:
chunk_encoding: snappy
chunk_retain_period: 1m
query_range:
align_queries_with_step: true
cache_results: true
max_retries: 5
results_cache:
cache:
enable_fifocache: true
fifocache:
max_size_bytes: 500MB
validity: 24h
parallelise_shardable_queries: true
query_scheduler:
max_outstanding_requests_per_tenant: 2048
schema_config:
configs:
- from: 2020-10-24
- from: 2022-01-01
store: boltdb-shipper
object_store: filesystem
schema: v11
Expand All @@ -47,15 +75,39 @@ data:
active_index_directory: /loki-store/index
shared_store: filesystem
cache_location: /loki-store/boltdb-cache
datasource.yaml: |
apiVersion: 1
datasources:
- name: Loki
type: loki
access: proxy
url: http://localhost:3100
isDefault: true
version: 1
cache_ttl: 24h
limits_config:
ingestion_rate_strategy: global
ingestion_rate_mb: 10
ingestion_burst_size_mb: 10
max_label_name_length: 1024
max_label_value_length: 2048
max_label_names_per_series: 30
reject_old_samples: true
reject_old_samples_max_age: 15m
creation_grace_period: 10m
enforce_metric_name: false
max_line_size: 256000
max_line_size_truncate: false
max_entries_limit_per_query: 10000
max_streams_per_user: 0
max_global_streams_per_user: 0
unordered_writes: true
max_chunks_per_query: 2000000
max_query_length: 721h
max_query_parallelism: 32
max_query_series: 10000
cardinality_limit: 100000
max_streams_matchers_per_query: 1000
max_concurrent_tail_requests: 10
retention_period: 24h
max_cache_freshness_per_query: 5m
max_queriers_per_tenant: 0
per_stream_rate_limit: 3MB
per_stream_rate_limit_burst: 15MB
max_query_lookback: 0
min_sharding_lookback: 0s
split_queries_by_interval: 1m
---
apiVersion: apps/v1
kind: Deployment
Expand Down Expand Up @@ -83,7 +135,7 @@ spec:
name: loki-config
containers:
- name: loki
image: grafana/loki:2.4.1
image: grafana/loki:2.9.0
volumeMounts:
- mountPath: "/loki-store"
name: loki-store
Expand Down
84 changes: 64 additions & 20 deletions e2e/cluster/kind.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@ import (
"fmt"
"io"
"os"
"os/signal"
"path"
"sort"
"syscall"
"testing"
"time"

Expand Down Expand Up @@ -90,28 +92,58 @@ var defaultBaseDeployments = map[DeployID]Deployment{
Loki: {
Order: ExternalServices,
ManifestFile: path.Join(packageDir(), "base", "02-loki.yml"),
ReadyFunction: func(*envconf.Config) error {
return (&tester.Loki{BaseURL: "http://127.0.0.1:30100"}).Ready()
Ready: &Readiness{
Function: func(*envconf.Config) error { return (&tester.Loki{BaseURL: "http://localhost:30100"}).Ready() },
Description: "Check that http://localhost:30100 is reachable (Loki NodePort)",
Timeout: 5 * time.Minute,
Retry: 5 * time.Second,
},
},
FlowLogsPipeline: {
Order: NetObservServices, ManifestFile: path.Join(packageDir(), "base", "03-flp.yml"),
Ready: &Readiness{
Function: testPodsReady("flp"),
Description: "Check that flp pods are up and running",
Timeout: 5 * time.Minute,
Retry: 5 * time.Second,
},
},
Agent: {
Order: WithAgent, ManifestFile: path.Join(packageDir(), "base", "04-agent.yml"),
Ready: &Readiness{
Function: testPodsReady("netobserv-ebpf-agent"),
Description: "Check that agent pods are up and running",
Timeout: 5 * time.Minute,
Retry: 5 * time.Second,
},
},
}

// testPodsReady builds a readiness probe for the DaemonSet named dsName in the
// "default" namespace. The returned function creates a tester.Pods client from
// the supplied environment configuration and returns an error if the client
// cannot be created or if its DSReady check fails.
func testPodsReady(dsName string) func(*envconf.Config) error {
	probe := func(cfg *envconf.Config) error {
		client, err := tester.NewPods(cfg)
		if err == nil {
			err = client.DSReady(context.Background(), "default", dsName)
		}
		return err
	}
	return probe
}

// Deployment of components. Not only K8s deployments but also Pods, Services, DaemonSets, ...
type Deployment struct {
// Order of the deployment. Deployments with the same order will be executed by alphabetical
// order of its manifest file
Order DeployOrder
// ManifestFile path to the kubectl-like YAML manifest file
ManifestFile string
// ReadyFunction is an optional function that returns error if the deployment is not ready.
// Used when it's needed to wait before starting tests or deploying later components.
ReadyFunction func(*envconf.Config) error
Ready *Readiness
}

// Readiness describes how to verify that a Deployment is ready before tests
// start or later components are deployed.
type Readiness struct {
// Function is the probe itself; it returns an error while the component is
// not ready. isReady wraps that error as "component not ready".
Function func(*envconf.Config) error
// Description is a human-readable summary of the check, logged when the
// readiness check is registered.
Description string
// Timeout is the maximum total time the probe is retried before giving up.
Timeout time.Duration
// Retry is the pause between two consecutive probe attempts.
Retry time.Duration
}

// Kind cluster deployed by each TestMain function, prepared for a given test scenario.
Expand Down Expand Up @@ -146,6 +178,7 @@ func Deploy(def Deployment) Option {

// Timeout for long-running operations (e.g. deployments, readiness probes...)
func Timeout(t time.Duration) Option {
log.Infof("Timeout set to %s", t.String())
return func(k *Kind) {
k.timeout = t
}
Expand All @@ -156,6 +189,9 @@ func Timeout(t time.Duration) Option {
// backend doesn't provide access to the local images, where the ebpf-agent.tar container image
// is located. Usually it will be the project root.
func NewKind(kindClusterName, baseDir string, options ...Option) *Kind {
fmt.Println()
fmt.Println()
log.Infof("Starting KIND cluster %s", kindClusterName)
k := &Kind{
testEnv: env.New(),
baseDir: baseDir,
Expand Down Expand Up @@ -191,10 +227,19 @@ func (k *Kind) Run(m *testing.M) {
currentOrder = c.Order
}
envFuncs = append(envFuncs, deploy(c))
readyFuncs = append(readyFuncs, withTimeout(isReady(c), k.timeout))
readyFuncs = append(readyFuncs, isReady(c))
}
envFuncs = append(envFuncs, readyFuncs...)

exit := make(chan os.Signal, 1)
signal.Notify(exit, os.Interrupt, syscall.SIGTERM)
go func() {
<-exit
fmt.Println("SIGTERM received, cluster might still be running")
fmt.Printf("To clean up, run: \033[33mkind delete cluster --name %s\033[0m\n", k.clusterName)
os.Exit(1)
}()

log.Info("starting kind setup")
code := k.testEnv.Setup(envFuncs...).
Finish(
Expand Down Expand Up @@ -244,7 +289,7 @@ func (k *Kind) TestEnv() env.Environment {

// Loki client pointing to the Loki instance inside the test cluster
func (k *Kind) Loki() *tester.Loki {
return &tester.Loki{BaseURL: "http://127.0.0.1:30100"}
return &tester.Loki{BaseURL: "http://localhost:30100"}
}

func deploy(definition Deployment) env.Func {
Expand Down Expand Up @@ -285,6 +330,7 @@ func deployManifestFile(definition Deployment,
if !errors.Is(err, io.EOF) {
return fmt.Errorf("decoding manifest raw object: %w", err)
}
log.WithField("file", definition.ManifestFile).Info("done") // eof
return nil
}

Expand Down Expand Up @@ -344,7 +390,7 @@ func (k *Kind) loadLocalImage() env.Func {
}

// withTimeout retries the execution of an env.Func until it succeeds or a timeout is reached
func withTimeout(f env.Func, timeout time.Duration) env.Func {
func withTimeout(f env.Func, timeout, retry time.Duration) env.Func {
tlog := log.WithField("function", "withTimeout")
return func(ctx context.Context, config *envconf.Config) (context.Context, error) {
start := time.Now()
Expand All @@ -356,26 +402,24 @@ func withTimeout(f env.Func, timeout time.Duration) env.Func {
if time.Since(start) > timeout {
return ctx, fmt.Errorf("timeout (%s) trying to execute function: %w", timeout, err)
}
tlog.WithError(err).Debug("function did not succeed. Retrying after 5s")
time.Sleep(5 * time.Second)
tlog.WithError(err).Debugf("function did not succeed. Retrying after %s", retry.String())
time.Sleep(retry)
}
}
}

// isReady succeeds if the passed deployment does not define a Ready check, or if that check succeeds
func isReady(definition Deployment) env.Func {
return withTimeout(func(ctx context.Context, cfg *envconf.Config) (context.Context, error) {
if definition.ReadyFunction != nil {
log.WithFields(logrus.Fields{
"function": "isReady",
"deployment": definition.ManifestFile,
}).Debug("checking readiness")
if err := definition.ReadyFunction(cfg); err != nil {
if definition.Ready != nil {
log.WithFields(logrus.Fields{"deployment": definition.ManifestFile, "readiness": definition.Ready.Description}).Infof("Readiness check set with timeout: %s", definition.Ready.Timeout.String())
return withTimeout(func(ctx context.Context, cfg *envconf.Config) (context.Context, error) {
if err := definition.Ready.Function(cfg); err != nil {
return ctx, fmt.Errorf("component not ready: %w", err)
}
}
return ctx, nil
}, time.Minute*20)
return ctx, nil
}, definition.Ready.Timeout, definition.Ready.Retry)
}
return func(ctx context.Context, _ *envconf.Config) (context.Context, error) { return ctx, nil }
}

// helper to get the base directory of this package, allowing to load the test deployment
Expand Down
31 changes: 23 additions & 8 deletions e2e/cluster/kind_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,28 @@ func TestOrderManifests(t *testing.T) {
Deploy(Deployment{Order: ExternalServices, ManifestFile: "sql"}),
Override(Loki, Deployment{Order: ExternalServices, ManifestFile: "loki"}))

var orders []DeployOrder
var files []string
for _, m := range tc.orderedManifests() {
orders = append(orders, m.Order)
files = append(files, m.ManifestFile)
}

// verify that deployments are overridden and/or inserted in proper order
require.Equal(t, []Deployment{
{Order: Preconditions, ManifestFile: path.Join(packageDir(), "base", "01-permissions.yml")},
{Order: ExternalServices, ManifestFile: "sql"},
{Order: ExternalServices, ManifestFile: "loki"},
{Order: NetObservServices, ManifestFile: path.Join(packageDir(), "base", "03-flp.yml")},
{Order: WithAgent, ManifestFile: path.Join(packageDir(), "base", "04-agent.yml")},
{ManifestFile: "pods.yml"},
}, tc.orderedManifests())
require.Equal(t, []DeployOrder{
Preconditions,
ExternalServices,
ExternalServices,
NetObservServices,
WithAgent,
0,
}, orders)
require.Equal(t, []string{
path.Join(packageDir(), "base", "01-permissions.yml"),
"sql",
"loki",
path.Join(packageDir(), "base", "03-flp.yml"),
path.Join(packageDir(), "base", "04-agent.yml"),
"pods.yml",
}, files)
}
14 changes: 14 additions & 0 deletions e2e/cluster/tester/loki.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,20 @@ func (l *Loki) Ready() error {
return nil
}

// DebugPrint is a troubleshooting helper: it runs the given logQL query with
// the given result limit and prints the outcome to standard output. It prints
// the query error if there is one, a notice if the response is nil, and the
// returned result set otherwise.
func (l *Loki) DebugPrint(limit int, query string) {
	fmt.Printf("---- DEBUG PRINT %d ----\n", limit)
	resp, err := l.Query(limit, query)
	switch {
	case err != nil:
		fmt.Printf("Error: %v\n", err)
	case resp == nil:
		fmt.Printf("Response is nil\n")
	default:
		fmt.Printf("LOKI CONTENT: %v\n", resp.Data.Result)
	}
}

// Query executes an arbitrary logQL query, given a limit in the results
func (l *Loki) Query(limit int, logQL string) (*LokiQueryResponse, error) {
status, body, err := l.get(fmt.Sprintf("%s?%s=%d&%s&%s=%s",
Expand Down
15 changes: 12 additions & 3 deletions e2e/cluster/tester/pods.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,7 @@ func NewPods(cfg *envconf.Config) (*Pods, error) {
}, nil
}

func (p *Pods) MACAddress(
ctx context.Context, namespace, name, iface string,
) (net.HardwareAddr, error) {
func (p *Pods) MACAddress(ctx context.Context, namespace, name, iface string) (net.HardwareAddr, error) {
mac, errStr, err := p.Execute(ctx, namespace, name, "cat", "/sys/class/net/"+iface+"/address")
if err != nil {
return nil, fmt.Errorf("executing command: %w", err)
Expand Down Expand Up @@ -78,3 +76,14 @@ func (p *Pods) Execute(ctx context.Context, namespace, name string, command ...s
}
return buf.String(), errBuf.String(), nil
}

// DSReady reports whether the DaemonSet identified by namespace and name is
// fully ready. It returns nil only when the DaemonSet wants at least one pod
// and every desired pod is ready. Otherwise it returns an error: either the
// wrapped API error from fetching the DaemonSet, or a "not ready" error that
// includes the ready/desired pod counts.
//
// The ready count is compared against Status.DesiredNumberScheduled instead of
// a fixed value of 1, so the check also works on clusters where the DaemonSet
// is scheduled on more than one node.
func (p *Pods) DSReady(ctx context.Context, namespace, name string) error {
	ds, err := p.client.AppsV1().DaemonSets(namespace).Get(ctx, name, metav1.GetOptions{})
	if err != nil {
		return fmt.Errorf("getting DS %s: %w", name, err)
	}
	desired, ready := ds.Status.DesiredNumberScheduled, ds.Status.NumberReady
	// desired < 1 means no pod has been scheduled yet, so the DaemonSet cannot
	// be considered ready even though ready == desired.
	if desired < 1 || ready != desired {
		return fmt.Errorf("%s not ready (%d/%d pods ready)", name, ready, desired)
	}
	return nil
}
2 changes: 1 addition & 1 deletion e2e/ipfix/ipfix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import (

const (
clusterNamePrefix = "ipfix-test-cluster"
testTimeout = 20 * time.Minute
testTimeout = 10 * time.Minute
namespace = "default"
)

Expand Down
Loading

0 comments on commit bb1d90d

Please sign in to comment.