From 25ccec8ec0319d3a0227094f0bf9b966c19a8e4b Mon Sep 17 00:00:00 2001 From: Jason Parraga Date: Thu, 1 Aug 2024 21:47:21 -0700 Subject: [PATCH 1/3] Log stack trace when refresh cache sync recovers from panic Signed-off-by: Jason Parraga --- flytestdlib/cache/auto_refresh.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/flytestdlib/cache/auto_refresh.go b/flytestdlib/cache/auto_refresh.go index bb23ef9369..8218e577a8 100644 --- a/flytestdlib/cache/auto_refresh.go +++ b/flytestdlib/cache/auto_refresh.go @@ -3,6 +3,7 @@ package cache import ( "context" "fmt" + "runtime/debug" "sync" "time" @@ -290,9 +291,9 @@ func (w *autoRefresh) sync(ctx context.Context) (err error) { } if err, isErr = rVal.(error); isErr { - err = fmt.Errorf("worker panic'd and is shutting down. Error: %w", err) + err = fmt.Errorf("worker panic'd and is shutting down. Error: %w with Stack: %v", err, string(debug.Stack())) } else { - err = fmt.Errorf("worker panic'd and is shutting down. Panic value: %v", rVal) + err = fmt.Errorf("worker panic'd and is shutting down. Panic value: %v with Stack: %v", rVal, string(debug.Stack())) } logger.Error(ctx, err) From 878b86ebc3e7136ed9b4d8285bf173eb8856cea7 Mon Sep 17 00:00:00 2001 From: Jason Parraga Date: Thu, 1 Aug 2024 22:12:30 -0700 Subject: [PATCH 2/3] Add unit test for panic scenario Signed-off-by: Jason Parraga --- flytestdlib/cache/auto_refresh_test.go | 37 ++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/flytestdlib/cache/auto_refresh_test.go b/flytestdlib/cache/auto_refresh_test.go index e798300f5d..59ae4b7494 100644 --- a/flytestdlib/cache/auto_refresh_test.go +++ b/flytestdlib/cache/auto_refresh_test.go @@ -64,6 +64,15 @@ func syncTerminalItem(_ context.Context, batch Batch) ([]ItemSyncResponse, error panic("This should never be called") } +type panickingSyncer struct { + callCount atomic.Int32 +} + +func (p *panickingSyncer) sync(_ context.Context, _ Batch) ([]ItemSyncResponse, error) { + p.callCount.Inc() + panic("testing") +} + func TestCacheFour(t *testing.T) { testResyncPeriod := 10 * time.Millisecond rateLimiter := workqueue.DefaultControllerRateLimiter() @@ -172,6 +181,34 @@ func TestCacheFour(t *testing.T) { cancel() }) + + t.Run("Test panic on sync and shutdown", func(t *testing.T) { + syncer := &panickingSyncer{} + cache, err := NewAutoRefreshCache("fake3", syncer.sync, rateLimiter, testResyncPeriod, 10, 2, promutils.NewTestScope()) + assert.NoError(t, err) + + ctx, cancel := context.WithCancel(context.Background()) + assert.NoError(t, cache.Start(ctx)) + + itemID := "dummy_id" + _, err = cache.GetOrCreate(itemID, fakeCacheItem{ + val: 0, + }) + assert.NoError(t, err) + + // wait for all workers to run + assert.Eventually(t, func() bool { + return syncer.callCount.Load() == int32(10) + }, time.Second, time.Millisecond) + + // wait some more time + time.Sleep(500 * time.Millisecond) + + // all workers should have shut down. + assert.Equal(t, int32(10), syncer.callCount.Load()) + + cancel() + }) } func TestQueueBuildUp(t *testing.T) { From 169baa9c2add3b5a0cb77eef895c3586a9c2349d Mon Sep 17 00:00:00 2001 From: Jason Parraga Date: Thu, 1 Aug 2024 22:41:38 -0700 Subject: [PATCH 3/3] Increase test timeout for CI Signed-off-by: Jason Parraga --- flytestdlib/cache/auto_refresh_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flytestdlib/cache/auto_refresh_test.go b/flytestdlib/cache/auto_refresh_test.go index 59ae4b7494..5e1c49777e 100644 --- a/flytestdlib/cache/auto_refresh_test.go +++ b/flytestdlib/cache/auto_refresh_test.go @@ -199,7 +199,7 @@ func TestCacheFour(t *testing.T) { // wait for all workers to run assert.Eventually(t, func() bool { return syncer.callCount.Load() == int32(10) - }, time.Second, time.Millisecond) + }, 5*time.Second, time.Millisecond) // wait some more time time.Sleep(500 * time.Millisecond)