From 960308863fe65b26e1d00a4b6da9015237bdf462 Mon Sep 17 00:00:00 2001 From: Sax Authors Date: Wed, 11 Sep 2024 18:18:39 -0700 Subject: [PATCH] reintroduce poison() so that saxadmin will work when the user recreates a cluster. PiperOrigin-RevId: 673614166 Change-Id: I39c0418445e5b6e7cf26d71c989ac5ec97d6e321 --- saxml/client/go/BUILD | 2 ++ saxml/client/go/admin.go | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/saxml/client/go/BUILD b/saxml/client/go/BUILD index 71b787be..7c2890cc 100644 --- a/saxml/client/go/BUILD +++ b/saxml/client/go/BUILD @@ -97,6 +97,8 @@ go_library( # unused internal admin gRPC dependency, "@com_github_golang_glog//:go_default_library", "@org_golang_google_grpc//:go_default_library", + "@org_golang_google_grpc//codes:go_default_library", + "@org_golang_google_grpc//status:go_default_library", ], ) diff --git a/saxml/client/go/admin.go b/saxml/client/go/admin.go index 96e68660..e6687efe 100644 --- a/saxml/client/go/admin.go +++ b/saxml/client/go/admin.go @@ -20,11 +20,14 @@ import ( "encoding/binary" "fmt" "hash/maphash" + "strings" "sync" "time" log "github.com/golang/glog" + "google.golang.org/grpc/codes" "google.golang.org/grpc" + "google.golang.org/grpc/status" "saxml/common/addr" "saxml/common/errors" "saxml/common/platform/env" @@ -120,12 +123,34 @@ func (a *Admin) getAdminClient(ctx context.Context) (pbgrpc.AdminClient, error) return a.client, nil } +func (a *Admin) poison() { + a.mu.Lock() + conn := a.conn + a.conn = nil + a.client = nil + a.mu.Unlock() + + if conn != nil { + conn.Close() + } +} + func (a *Admin) retry(ctx context.Context, callback func(client pbgrpc.AdminClient) error) error { action := func() error { client, err := a.getAdminClient(ctx) if err == nil { err = callback(client) } + if errors.AdminShouldPoison(err) { + a.poison() + } + // After poison(), RPCs active on the old connection will fail with + // "Canceled desc = grpc: the client connection is closing". 
Translate it to + // Unavailable so that the RPC will retry. + if status.Code(err) == codes.Canceled && + strings.Contains(err.Error(), "client connection is closing") { + err = fmt.Errorf("Admin connection is poisoned: %w", errors.ErrUnavailable) + } return err } return retrier.Do(ctx, action, errors.AdminShouldRetry)