-
Notifications
You must be signed in to change notification settings - Fork 4.6k
clientconn: Wait for all goroutines on close #8666
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -925,25 +925,24 @@ func (cc *ClientConn) incrCallsFailed() { | |
| // connect starts creating a transport. | ||
| // It does nothing if the ac is not IDLE. | ||
| // TODO(bar) Move this to the addrConn section. | ||
| func (ac *addrConn) connect() error { | ||
| func (ac *addrConn) connect(abort <-chan struct{}) { | ||
| ac.mu.Lock() | ||
| if ac.state == connectivity.Shutdown { | ||
| if logger.V(2) { | ||
| logger.Infof("connect called on shutdown addrConn; ignoring.") | ||
| } | ||
| ac.mu.Unlock() | ||
| return errConnClosing | ||
| return | ||
| } | ||
| if ac.state != connectivity.Idle { | ||
| if logger.V(2) { | ||
| logger.Infof("connect called on addrConn in non-idle state (%v); ignoring.", ac.state) | ||
| } | ||
| ac.mu.Unlock() | ||
| return nil | ||
| return | ||
| } | ||
|
|
||
| ac.resetTransportAndUnlock() | ||
| return nil | ||
| ac.resetTransportAndUnlock(abort) | ||
| } | ||
|
|
||
| // equalAddressIgnoringBalAttributes returns true is a and b are considered equal. | ||
|
|
@@ -962,7 +961,7 @@ func equalAddressesIgnoringBalAttributes(a, b []resolver.Address) bool { | |
|
|
||
| // updateAddrs updates ac.addrs with the new addresses list and handles active | ||
| // connections or connection attempts. | ||
| func (ac *addrConn) updateAddrs(addrs []resolver.Address) { | ||
| func (ac *addrConn) updateAddrs(abort <-chan struct{}, addrs []resolver.Address) { | ||
| addrs = copyAddresses(addrs) | ||
| limit := len(addrs) | ||
| if limit > 5 { | ||
|
|
@@ -1018,7 +1017,7 @@ func (ac *addrConn) updateAddrs(addrs []resolver.Address) { | |
|
|
||
| // Since we were connecting/connected, we should start a new connection | ||
| // attempt. | ||
| go ac.resetTransportAndUnlock() | ||
| ac.resetTransportAndUnlock(abort) | ||
| } | ||
|
|
||
| // getServerName determines the serverName to be used in the connection | ||
|
|
@@ -1249,9 +1248,17 @@ func (ac *addrConn) adjustParams(r transport.GoAwayReason) { | |
| // resetTransportAndUnlock unconditionally connects the addrConn. | ||
| // | ||
| // ac.mu must be held by the caller, and this function will guarantee it is released. | ||
| func (ac *addrConn) resetTransportAndUnlock() { | ||
| acCtx := ac.ctx | ||
| if acCtx.Err() != nil { | ||
| func (ac *addrConn) resetTransportAndUnlock(abort <-chan struct{}) { | ||
| ctx, cancel := context.WithCancel(ac.ctx) | ||
| go func() { | ||
| select { | ||
| case <-abort: | ||
| cancel() | ||
| case <-ctx.Done(): | ||
| } | ||
| }() | ||
|
|
||
| if ctx.Err() != nil { | ||
| ac.mu.Unlock() | ||
| return | ||
| } | ||
|
|
@@ -1279,12 +1286,12 @@ func (ac *addrConn) resetTransportAndUnlock() { | |
| ac.updateConnectivityState(connectivity.Connecting, nil) | ||
| ac.mu.Unlock() | ||
|
|
||
| if err := ac.tryAllAddrs(acCtx, addrs, connectDeadline); err != nil { | ||
| if err := ac.tryAllAddrs(ctx, addrs, connectDeadline); err != nil { | ||
| // TODO: #7534 - Move re-resolution requests into the pick_first LB policy | ||
| // to ensure one resolution request per pass instead of per subconn failure. | ||
| ac.cc.resolveNow(resolver.ResolveNowOptions{}) | ||
| ac.mu.Lock() | ||
| if acCtx.Err() != nil { | ||
| if ctx.Err() != nil { | ||
| // addrConn was torn down. | ||
| ac.mu.Unlock() | ||
| return | ||
|
|
@@ -1305,13 +1312,13 @@ func (ac *addrConn) resetTransportAndUnlock() { | |
| ac.mu.Unlock() | ||
| case <-b: | ||
| timer.Stop() | ||
| case <-acCtx.Done(): | ||
| case <-ctx.Done(): | ||
| timer.Stop() | ||
| return | ||
| } | ||
|
|
||
| ac.mu.Lock() | ||
| if acCtx.Err() == nil { | ||
| if ctx.Err() == nil { | ||
| ac.updateConnectivityState(connectivity.Idle, err) | ||
| } | ||
| ac.mu.Unlock() | ||
|
|
@@ -1366,6 +1373,9 @@ func (ac *addrConn) tryAllAddrs(ctx context.Context, addrs []resolver.Address, c | |
| // new transport. | ||
| func (ac *addrConn) createTransport(ctx context.Context, addr resolver.Address, copts transport.ConnectOptions, connectDeadline time.Time) error { | ||
| addr.ServerName = ac.cc.getServerName(addr) | ||
|
|
||
| var healthCheckStarted atomic.Bool | ||
| healthCheckDone := make(chan struct{}) | ||
| hctx, hcancel := context.WithCancel(ctx) | ||
|
|
||
| onClose := func(r transport.GoAwayReason) { | ||
|
|
@@ -1394,6 +1404,9 @@ func (ac *addrConn) createTransport(ctx context.Context, addr resolver.Address, | |
| // Always go idle and wait for the LB policy to initiate a new | ||
| // connection attempt. | ||
| ac.updateConnectivityState(connectivity.Idle, nil) | ||
| if healthCheckStarted.Load() { | ||
| <-healthCheckDone | ||
| } | ||
| } | ||
|
|
||
| connectCtx, cancel := context.WithDeadline(ctx, connectDeadline) | ||
|
|
@@ -1406,29 +1419,35 @@ func (ac *addrConn) createTransport(ctx context.Context, addr resolver.Address, | |
| logger.Infof("Creating new client transport to %q: %v", addr, err) | ||
| } | ||
| // newTr is either nil, or closed. | ||
| hcancel() | ||
| channelz.Warningf(logger, ac.channelz, "grpc: addrConn.createTransport failed to connect to %s. Err: %v", addr, err) | ||
| return err | ||
| } | ||
|
|
||
| ac.mu.Lock() | ||
| defer ac.mu.Unlock() | ||
| acMu := &ac.mu | ||
| acMu.Lock() | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a reason the original code did not work? this seems unnecessarily complicated and does the same thing. Or am I missing something.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's a way to make the defer do the unlock conditionally, as the code might need to unlock it before returning (see lines 1441 and 1442). We can achieve the same e.g. with a boolean variable, if you prefer. I think using a pointer for this is good, because, if you forget the nil check, it will panic with an easy to understand stack trace. Whereas if you forget to check a boolean, you'd do a double-unlock and there's a chance that you get weirder problems. |
||
| defer func() { | ||
| if acMu != nil { | ||
| acMu.Unlock() | ||
| } | ||
| }() | ||
| if ctx.Err() != nil { | ||
| // This can happen if the subConn was removed while in `Connecting` | ||
| // state. tearDown() would have set the state to `Shutdown`, but | ||
| // would not have closed the transport since ac.transport would not | ||
| // have been set at that point. | ||
| // | ||
| // We run this in a goroutine because newTr.Close() calls onClose() | ||
|
|
||
| // We unlock ac.mu because newTr.Close() calls onClose() | ||
| // inline, which requires locking ac.mu. | ||
| // | ||
| acMu.Unlock() | ||
| acMu = nil | ||
|
|
||
| // The error we pass to Close() is immaterial since there are no open | ||
| // streams at this point, so no trailers with error details will be sent | ||
| // out. We just need to pass a non-nil error. | ||
| // | ||
| // This can also happen when updateAddrs is called during a connection | ||
| // attempt. | ||
| go newTr.Close(transport.ErrConnClosing) | ||
| newTr.Close(transport.ErrConnClosing) | ||
| return nil | ||
| } | ||
| if hctx.Err() != nil { | ||
|
|
@@ -1440,7 +1459,7 @@ func (ac *addrConn) createTransport(ctx context.Context, addr resolver.Address, | |
| } | ||
| ac.curAddr = addr | ||
| ac.transport = newTr | ||
| ac.startHealthCheck(hctx) // Will set state to READY if appropriate. | ||
| healthCheckStarted.Store(ac.startHealthCheck(hctx, healthCheckDone)) // Will set state to READY if appropriate. | ||
| return nil | ||
| } | ||
|
|
||
|
|
@@ -1456,7 +1475,7 @@ func (ac *addrConn) createTransport(ctx context.Context, addr resolver.Address, | |
| // It sets addrConn to READY if the health checking stream is not started. | ||
| // | ||
| // Caller must hold ac.mu. | ||
| func (ac *addrConn) startHealthCheck(ctx context.Context) { | ||
| func (ac *addrConn) startHealthCheck(ctx context.Context, done chan<- struct{}) bool { | ||
| var healthcheckManagingState bool | ||
| defer func() { | ||
| if !healthcheckManagingState { | ||
|
|
@@ -1465,22 +1484,22 @@ func (ac *addrConn) startHealthCheck(ctx context.Context) { | |
| }() | ||
|
|
||
| if ac.cc.dopts.disableHealthCheck { | ||
| return | ||
| return false | ||
| } | ||
| healthCheckConfig := ac.cc.healthCheckConfig() | ||
| if healthCheckConfig == nil { | ||
| return | ||
| return false | ||
| } | ||
| if !ac.scopts.HealthCheckEnabled { | ||
| return | ||
| return false | ||
| } | ||
| healthCheckFunc := internal.HealthCheckFunc | ||
| if healthCheckFunc == nil { | ||
| // The health package is not imported to set health check function. | ||
| // | ||
| // TODO: add a link to the health check doc in the error message. | ||
| channelz.Error(logger, ac.channelz, "Health check is requested but health check function is not set.") | ||
| return | ||
| return false | ||
| } | ||
|
|
||
| healthcheckManagingState = true | ||
|
|
@@ -1506,6 +1525,7 @@ func (ac *addrConn) startHealthCheck(ctx context.Context) { | |
| } | ||
| // Start the health checking stream. | ||
| go func() { | ||
| defer close(done) | ||
| err := healthCheckFunc(ctx, newStream, setConnectivityState, healthCheckConfig.ServiceName) | ||
| if err != nil { | ||
| if status.Code(err) == codes.Unimplemented { | ||
|
|
@@ -1515,6 +1535,7 @@ func (ac *addrConn) startHealthCheck(ctx context.Context) { | |
| } | ||
| } | ||
| }() | ||
| return true | ||
| } | ||
|
|
||
| func (ac *addrConn) resetConnectBackoff() { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a reason we changed this to run the
acbw.ac.updateAddrs(addrs)function in a go routine??There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's basically a "bubbled-up" goroutine. Previously, the goroutine was spawned in
updateAddrsitself (line 1021). But, as we now need to track those, I figured it would be most appropriate to do it here. Another option would be to somehow push this down intoupdateAddrsitself, by passing theacBalancerWrapperpointer, or a function pointer toacbw.goFuncor sth. along those lines, and then use that to spawn the goroutine there:Then we could write line 1021 of
updateAddrslike so: