diff --git a/config/config.go b/config/config.go index d146d4efa..70d82f6e6 100644 --- a/config/config.go +++ b/config/config.go @@ -448,6 +448,31 @@ func GetAllPrefixes() []ConfigPrefix { return prefixes } +// Helper function to start a metadata request. +// +// We see periodic timeouts when doing metadata lookup at sites. +// Adding a modest retry will, hopefully, reduce the number of errors +// that propagate out to users +func startMetadataQuery(ctx context.Context, httpClient *http.Client, discoveryUrl *url.URL) (result *http.Response, err error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, discoveryUrl.String(), nil) + if err != nil { + err = errors.Wrapf(err, "Failure when doing federation metadata request creation for %s", discoveryUrl) + return + } + req.Header.Set("User-Agent", "pelican/"+version) + + result, err = httpClient.Do(req) + if err != nil { + var netErr net.Error + if errors.As(err, &netErr) && netErr.Timeout() { + err = MetadataTimeoutErr.Wrap(err) + } else { + err = NewMetadataError(err, "Error occurred when querying for metadata") + } + } + return +} + // This function is for discovering federations as specified by a url during a pelican:// transfer. // this does not populate global fields and is more temporary per url func DiscoverUrlFederation(ctx context.Context, federationDiscoveryUrl string) (metadata FederationDiscovery, err error) { @@ -478,24 +503,23 @@ func DiscoverUrlFederation(ctx context.Context, federationDiscoveryUrl string) ( Transport: GetTransport(), Timeout: time.Second * 5, } - req, err := http.NewRequestWithContext(ctx, http.MethodGet, discoveryUrl.String(), nil) - if err != nil { - err = errors.Wrapf(err, "Failure when doing federation metadata request creation for %s", discoveryUrl) - return - } - req.Header.Set("User-Agent", "pelican/"+version) - result, err := httpClient.Do(req) - if err != nil { - var netErr net.Error - if errors.As(err, &netErr) && netErr.Timeout() { - err = MetadataTimeoutErr.Wrap(err) - return + var result *http.Response + for idx := 1; idx <= 3; idx++ { + result, err = startMetadataQuery(ctx, &httpClient, discoveryUrl) + if err == nil { + break + } else if errors.Is(err, MetadataTimeoutErr) && ctx.Err() == nil { + log.Warningln("Timeout occurred when querying discovery URL", discoveryUrl.String(), "for metadata;", 3-idx, "retries remaining") + time.Sleep(2 * time.Second) } else { - err = NewMetadataError(err, "Error occurred when querying for metadata") return } } + if errors.Is(err, MetadataTimeoutErr) { + log.Errorln("3 timeouts occurred when querying discovery URL", discoveryUrl.String()) + return + } if result.Body != nil { defer result.Body.Close() diff --git a/config/config_test.go b/config/config_test.go index 184d44eef..f3e441117 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -33,6 +33,7 @@ import ( "github.com/pkg/errors" "github.com/sirupsen/logrus" + log "github.com/sirupsen/logrus" "github.com/sirupsen/logrus/hooks/test" "github.com/spf13/viper" "github.com/stretchr/testify/assert" @@ -835,6 +836,49 @@ func TestDiscoverUrlFederation(t *testing.T) { viper.Reset() }) + t.Run("TestMetadataDiscoveryTimeoutRetry", func(t *testing.T) { + viper.Set("tlsskipverify", true) + viper.Set("Logging.Level", "Debug") + viper.Set("Transport.ResponseHeaderTimeout", "300ms") + InitConfig() + err := InitClient() + require.NoError(t, err) + + hook := test.NewGlobal() + logrus.SetLevel(logrus.WarnLevel) + + // Create a server that sleeps for a longer duration than the timeout + ctr := 0 + server := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if ctr < 1 { + time.Sleep(2 * time.Second) + } + + ctr += 1 + w.WriteHeader(200) + _, err := w.Write([]byte("{\"director_endpoint\": \"https://osdf-director.osg-htc.org\", \"namespace_registration_endpoint\": \"https://osdf-registry.osg-htc.org\", \"jwks_uri\": \"https://osg-htc.org/osdf/public_signing_key.jwks\"}")) + assert.NoError(t, err) + })) + defer server.Close() + + // Set a short timeout for the test + timeout := 5 * time.Second + + // Create a context with the timeout + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + // Call the function with the server URL and the context + _, err = DiscoverUrlFederation(ctx, server.URL) + + assert.Equal(t, log.WarnLevel, hook.LastEntry().Level) + assert.Equal(t, "Timeout occurred when querying discovery URL "+server.URL+"/.well-known/pelican-configuration for metadata; 2 retries remaining", hook.LastEntry().Message) + + // Assert that the error is the expected metadata timeout error + assert.NoError(t, err) + viper.Reset() + }) + t.Run("TestCanceledContext", func(t *testing.T) { // Create a server that waits for the context to be canceled server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {