From 0de6a01824bee7453c6997a12e1f6a5830564f9d Mon Sep 17 00:00:00 2001 From: Joel Takvorian Date: Tue, 12 Nov 2024 17:55:45 +0100 Subject: [PATCH] NETOBSERV-1954: fix under-estimation of traffic (#444) * NETOBSERV-1954: fix under-estimation of traffic Remove flawed logic that ignores some flows, especially visible under stress. This logic was introduced here: https://github.com/netobserv/netobserv-ebpf-agent/pull/49/files#diff-f3248f855cc319b6d96125842a513586d6d8cbfb1e241a93e79911d08e41c728L334-L336 The old comment said: "...when a new flow maps to the same entry, it has some zombie entries." - this is surprising and I'm not sure how that would be possible; flow_metrics entries should be zeroed as we're using designated initializers. In any case, relying on a single eviction time reference is wrong because flows are evicted one by one, resulting in many new flows having their timer set before the "last eviction time". If we find evidence of those "zombie entries" then we must find another way to fix them. 
* do not set startTime=0 --- pkg/flow/tracer_map.go | 14 -------------- pkg/model/record.go | 2 +- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/pkg/flow/tracer_map.go b/pkg/flow/tracer_map.go index a24d13d0e..7137ea142 100644 --- a/pkg/flow/tracer_map.go +++ b/pkg/flow/tracer_map.go @@ -26,7 +26,6 @@ type MapTracer struct { staleEntriesEvictTimeout time.Duration // manages the access to the eviction routines, avoiding two evictions happening at the same time evictionCond *sync.Cond - lastEvictionNs uint64 metrics *metrics.Metrics timeSpentinLookupAndDelete prometheus.Histogram } @@ -40,7 +39,6 @@ func NewMapTracer(fetcher mapFetcher, evictionTimeout, staleEntriesEvictTimeout return &MapTracer{ mapFetcher: fetcher, evictionTimeout: evictionTimeout, - lastEvictionNs: uint64(monotime.Now()), evictionCond: sync.NewCond(&sync.Mutex{}), staleEntriesEvictTimeout: staleEntriesEvictTimeout, metrics: m, @@ -101,7 +99,6 @@ func (m *MapTracer) evictFlows(ctx context.Context, forceGC bool, forwardFlows c currentTime := time.Now() var forwardingFlows []*model.Record - laterFlowNs := uint64(0) flows := m.mapFetcher.LookupAndDeleteMap(m.metrics) elapsed := time.Since(currentTime) for flowKey, flowMetrics := range flows { @@ -110,10 +107,6 @@ func (m *MapTracer) evictFlows(ctx context.Context, forceGC bool, forwardFlows c if aggregatedMetrics.EndMonoTimeTs == 0 { continue } - // If it iterated an entry that do not have updated flows - if aggregatedMetrics.EndMonoTimeTs > laterFlowNs { - laterFlowNs = aggregatedMetrics.EndMonoTimeTs - } forwardingFlows = append(forwardingFlows, model.NewRecord( flowKey, aggregatedMetrics, @@ -122,7 +115,6 @@ func (m *MapTracer) evictFlows(ctx context.Context, forceGC bool, forwardFlows c )) } m.mapFetcher.DeleteMapsStaleEntries(m.staleEntriesEvictTimeout) - m.lastEvictionNs = laterFlowNs select { case <-ctx.Done(): mtlog.Debug("skipping flow eviction as agent is being stopped") @@ -146,12 +138,6 @@ func (m *MapTracer) 
aggregate(metrics []ebpf.BpfFlowMetrics) *ebpf.BpfFlowMetric } aggr := &ebpf.BpfFlowMetrics{} for i := range metrics { - // eBPF hashmap values are not zeroed when the entry is removed. That causes that we - // might receive entries from previous collect-eviction timeslots. - // We need to check the flow time and discard old flows. - if metrics[i].StartMonoTimeTs <= m.lastEvictionNs || metrics[i].EndMonoTimeTs <= m.lastEvictionNs { - continue - } model.Accumulate(aggr, &metrics[i]) } return aggr diff --git a/pkg/model/record.go b/pkg/model/record.go index 8d4c6c708..4b0406add 100644 --- a/pkg/model/record.go +++ b/pkg/model/record.go @@ -92,7 +92,7 @@ func NewRecord( func Accumulate(r *ebpf.BpfFlowMetrics, src *ebpf.BpfFlowMetrics) { // time == 0 if the value has not been yet set - if r.StartMonoTimeTs == 0 || r.StartMonoTimeTs > src.StartMonoTimeTs { + if r.StartMonoTimeTs == 0 || (r.StartMonoTimeTs > src.StartMonoTimeTs && src.StartMonoTimeTs != 0) { r.StartMonoTimeTs = src.StartMonoTimeTs } if r.EndMonoTimeTs == 0 || r.EndMonoTimeTs < src.EndMonoTimeTs {