Skip to content

Commit

Permalink
Merge pull request kubernetes#617 from kisieland/add-exp-backoff
Browse files Browse the repository at this point in the history
Add Exponential Backoff on the ListReferrers GCE API calls.
  • Loading branch information
k8s-ci-robot authored Oct 5, 2023
2 parents b55897d + b695996 commit 8fbe8d2
Show file tree
Hide file tree
Showing 4 changed files with 284 additions and 107 deletions.
26 changes: 13 additions & 13 deletions cmd/gcp-controller-manager/loops.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,19 @@ import (
)

type controllerContext struct {
client clientset.Interface
sharedInformers informers.SharedInformerFactory
recorder record.EventRecorder
gcpCfg gcpConfig
clusterSigningGKEKubeconfig string
csrApproverVerifyClusterMembership bool
csrApproverAllowLegacyKubelet bool
csrApproverUseGCEInstanceListReferrers bool
authAuthorizeServiceAccountMappingURL string
authSyncNodeURL string
hmsAuthorizeSAMappingURL string
hmsSyncNodeURL string
clearStalePodsOnNodeRegistration bool
client clientset.Interface
sharedInformers informers.SharedInformerFactory
recorder record.EventRecorder
gcpCfg gcpConfig
clusterSigningGKEKubeconfig string
csrApproverVerifyClusterMembership bool
csrApproverAllowLegacyKubelet bool
csrApproverListReferrersConfig gceInstanceListReferrersConfig
authAuthorizeServiceAccountMappingURL string
authSyncNodeURL string
hmsAuthorizeSAMappingURL string
hmsSyncNodeURL string
clearStalePodsOnNodeRegistration bool
}

// loops returns all the control loops that the GCPControllerManager can start.
Expand Down
129 changes: 71 additions & 58 deletions cmd/gcp-controller-manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,24 +62,26 @@ const (
)

var (
port = pflag.Int("port", 8089, "Port to serve status endpoints on (such as /healthz and /metrics).")
metricsPort = pflag.Int("metrics-port", 8089, "Deprecated. Port to expose Prometheus metrics on. If not set, uses the value of --port.")
kubeconfig = pflag.String("kubeconfig", "", "Path to kubeconfig file with authorization and master location information.")
clusterSigningGKEKubeconfig = pflag.String("cluster-signing-gke-kubeconfig", "", "If set, use the kubeconfig file to call GKE to sign cluster-scoped certificates instead of using a local private key.")
gceConfigPath = pflag.String("gce-config", "/etc/gce.conf", "Path to gce.conf.")
controllers = pflag.StringSlice("controllers", []string{"*"}, "Controllers to enable. Possible controllers are: "+strings.Join(loopNames(), ",")+".")
csrApproverVerifyClusterMembership = pflag.Bool("csr-validate-cluster-membership", true, "Validate that VMs requesting CSRs belong to current GKE cluster.")
csrApproverAllowLegacyKubelet = pflag.Bool("csr-allow-legacy-kubelet", true, "Allow legacy kubelet bootstrap flow.")
csrApproverUseGCEInstanceListReferrers = pflag.Bool("csr-use-gce-instance-list-referrers", false, "If true use https://cloud.google.com/compute/docs/reference/rest/v1/instances/listReferrers to validate instance cluster membership.")
gceAPIEndpointOverride = pflag.String("gce-api-endpoint-override", "", "If set, talks to a different GCE API Endpoint. By default it talks to https://www.googleapis.com/compute/v1/projects/")
directPath = pflag.Bool("direct-path", false, "Enable Direct Path.")
authAuthorizeServiceAccountMappingURL = pflag.String("auth-authorize-service-account-mapping-url", "", "URL for reaching the Auth Service AuthorizeServiceAccountMapping API.")
authSyncNodeURL = pflag.String("auth-sync-node-url", "", "URL for reaching the Auth Service SyncNode API.")
hmsAuthorizeSAMappingURL = pflag.String("hms-authorize-sa-mapping-url", "", "URL for reaching the Hosted Master Service AuthorizeSAMapping API.")
hmsSyncNodeURL = pflag.String("hms-sync-node-url", "", "URL for reaching the Hosted Master Service SyncNode API.")
kubeletReadOnlyCSRApprover = pflag.Bool("kubelet-read-only-csr-approver", false, "Enable kubelet readonly csr approver or not")
autopilotEnabled = pflag.Bool("autopilot", false, "Is this a GKE Autopilot cluster.")
clearStalePodsOnNodeRegistration = pflag.Bool("clearStalePodsOnNodeRegistration", false, "If true, after node registration, delete pods bound to old node.")
port = pflag.Int("port", 8089, "Port to serve status endpoints on (such as /healthz and /metrics).")
metricsPort = pflag.Int("metrics-port", 8089, "Deprecated. Port to expose Prometheus metrics on. If not set, uses the value of --port.")
kubeconfig = pflag.String("kubeconfig", "", "Path to kubeconfig file with authorization and master location information.")
clusterSigningGKEKubeconfig = pflag.String("cluster-signing-gke-kubeconfig", "", "If set, use the kubeconfig file to call GKE to sign cluster-scoped certificates instead of using a local private key.")
gceConfigPath = pflag.String("gce-config", "/etc/gce.conf", "Path to gce.conf.")
controllers = pflag.StringSlice("controllers", []string{"*"}, "Controllers to enable. Possible controllers are: "+strings.Join(loopNames(), ",")+".")
csrApproverVerifyClusterMembership = pflag.Bool("csr-validate-cluster-membership", true, "Validate that VMs requesting CSRs belong to current GKE cluster.")
csrApproverAllowLegacyKubelet = pflag.Bool("csr-allow-legacy-kubelet", true, "Allow legacy kubelet bootstrap flow.")
csrApproverUseGCEInstanceListReferrers = pflag.Bool("csr-use-gce-instance-list-referrers", false, "If true use https://cloud.google.com/compute/docs/reference/rest/v1/instances/listReferrers to validate instance cluster membership.")
csrApproverListReferrersInitialInterval = pflag.Duration("csr-gce-list-referrers-initial-interval", 5*time.Second, "Initial interval of the exponential back-off retries for calls to listReferrers, exponential factor is set to 1.5, defaults to 5s.")
csrApproverListReferrersRetryCount = pflag.Int("csr-gce-list-referrers-retry-count", 10, "Maximal number of retries in exponential back-off for calls to listReferrers, defaults to 10")
gceAPIEndpointOverride = pflag.String("gce-api-endpoint-override", "", "If set, talks to a different GCE API Endpoint. By default it talks to https://www.googleapis.com/compute/v1/projects/")
directPath = pflag.Bool("direct-path", false, "Enable Direct Path.")
authAuthorizeServiceAccountMappingURL = pflag.String("auth-authorize-service-account-mapping-url", "", "URL for reaching the Auth Service AuthorizeServiceAccountMapping API.")
authSyncNodeURL = pflag.String("auth-sync-node-url", "", "URL for reaching the Auth Service SyncNode API.")
hmsAuthorizeSAMappingURL = pflag.String("hms-authorize-sa-mapping-url", "", "URL for reaching the Hosted Master Service AuthorizeSAMapping API.")
hmsSyncNodeURL = pflag.String("hms-sync-node-url", "", "URL for reaching the Hosted Master Service SyncNode API.")
kubeletReadOnlyCSRApprover = pflag.Bool("kubelet-read-only-csr-approver", false, "Enable kubelet readonly csr approver or not")
autopilotEnabled = pflag.Bool("autopilot", false, "Is this a GKE Autopilot cluster.")
clearStalePodsOnNodeRegistration = pflag.Bool("clearStalePodsOnNodeRegistration", false, "If true, after node registration, delete pods bound to old node.")
)

func main() {
Expand All @@ -104,22 +106,26 @@ func main() {
logs.InitLogs()

s := &controllerManager{
clusterSigningGKEKubeconfig: *clusterSigningGKEKubeconfig,
gceConfigPath: *gceConfigPath,
gceAPIEndpointOverride: *gceAPIEndpointOverride,
controllers: *controllers,
csrApproverVerifyClusterMembership: *csrApproverVerifyClusterMembership,
csrApproverAllowLegacyKubelet: *csrApproverAllowLegacyKubelet,
csrApproverUseGCEInstanceListReferrers: *csrApproverUseGCEInstanceListReferrers,
leaderElectionConfig: *leConfig,
authAuthorizeServiceAccountMappingURL: *authAuthorizeServiceAccountMappingURL,
authSyncNodeURL: *authSyncNodeURL,
hmsAuthorizeSAMappingURL: *hmsAuthorizeSAMappingURL,
hmsSyncNodeURL: *hmsSyncNodeURL,
healthz: healthz.NewHandler(),
kubeletReadOnlyCSRApprover: *kubeletReadOnlyCSRApprover,
autopilotEnabled: *autopilotEnabled,
clearStalePodsOnNodeRegistration: *clearStalePodsOnNodeRegistration,
clusterSigningGKEKubeconfig: *clusterSigningGKEKubeconfig,
gceConfigPath: *gceConfigPath,
gceAPIEndpointOverride: *gceAPIEndpointOverride,
controllers: *controllers,
csrApproverVerifyClusterMembership: *csrApproverVerifyClusterMembership,
csrApproverAllowLegacyKubelet: *csrApproverAllowLegacyKubelet,
csrApproverListReferrersConfig: gceInstanceListReferrersConfig{
enabled: *csrApproverUseGCEInstanceListReferrers,
initialInterval: *csrApproverListReferrersInitialInterval,
retryCount: *csrApproverListReferrersRetryCount,
},
leaderElectionConfig: *leConfig,
authAuthorizeServiceAccountMappingURL: *authAuthorizeServiceAccountMappingURL,
authSyncNodeURL: *authSyncNodeURL,
hmsAuthorizeSAMappingURL: *hmsAuthorizeSAMappingURL,
hmsSyncNodeURL: *hmsSyncNodeURL,
healthz: healthz.NewHandler(),
kubeletReadOnlyCSRApprover: *kubeletReadOnlyCSRApprover,
autopilotEnabled: *autopilotEnabled,
clearStalePodsOnNodeRegistration: *clearStalePodsOnNodeRegistration,
}
var err error
s.informerKubeconfig, err = clientcmd.BuildConfigFromFlags("", *kubeconfig)
Expand Down Expand Up @@ -166,20 +172,20 @@ func main() {
// controllerManager is the main context object for the package.
type controllerManager struct {
// Fields initialized from flags.
clusterSigningGKEKubeconfig string
gceConfigPath string
gceAPIEndpointOverride string
controllers []string
csrApproverVerifyClusterMembership bool
csrApproverAllowLegacyKubelet bool
csrApproverUseGCEInstanceListReferrers bool
leaderElectionConfig componentbaseconfig.LeaderElectionConfiguration
authAuthorizeServiceAccountMappingURL string
authSyncNodeURL string
hmsAuthorizeSAMappingURL string
hmsSyncNodeURL string
autopilotEnabled bool
clearStalePodsOnNodeRegistration bool
clusterSigningGKEKubeconfig string
gceConfigPath string
gceAPIEndpointOverride string
controllers []string
csrApproverVerifyClusterMembership bool
csrApproverAllowLegacyKubelet bool
csrApproverListReferrersConfig gceInstanceListReferrersConfig
leaderElectionConfig componentbaseconfig.LeaderElectionConfiguration
authAuthorizeServiceAccountMappingURL string
authSyncNodeURL string
hmsAuthorizeSAMappingURL string
hmsSyncNodeURL string
autopilotEnabled bool
clearStalePodsOnNodeRegistration bool

// Kubelet Readonly CSR Approver
kubeletReadOnlyCSRApprover bool
Expand All @@ -191,6 +197,13 @@ type controllerManager struct {
healthz *healthz.Handler
}

// gceInstanceListReferrersConfig configuration on the ListReferrers retry logic.
type gceInstanceListReferrersConfig struct {
enabled bool
initialInterval time.Duration
retryCount int
}

func (s *controllerManager) isEnabled(name string) bool {
var star bool
for _, controller := range s.controllers {
Expand Down Expand Up @@ -240,16 +253,16 @@ func run(s *controllerManager) error {
recorder: eventBroadcaster.NewRecorder(legacyscheme.Scheme, v1.EventSource{
Component: name,
}),
gcpCfg: s.gcpConfig,
clusterSigningGKEKubeconfig: s.clusterSigningGKEKubeconfig,
csrApproverVerifyClusterMembership: s.csrApproverVerifyClusterMembership,
csrApproverAllowLegacyKubelet: s.csrApproverAllowLegacyKubelet,
csrApproverUseGCEInstanceListReferrers: s.csrApproverUseGCEInstanceListReferrers,
authAuthorizeServiceAccountMappingURL: s.authAuthorizeServiceAccountMappingURL,
authSyncNodeURL: s.authSyncNodeURL,
hmsAuthorizeSAMappingURL: s.hmsAuthorizeSAMappingURL,
hmsSyncNodeURL: s.hmsSyncNodeURL,
clearStalePodsOnNodeRegistration: s.clearStalePodsOnNodeRegistration,
gcpCfg: s.gcpConfig,
clusterSigningGKEKubeconfig: s.clusterSigningGKEKubeconfig,
csrApproverVerifyClusterMembership: s.csrApproverVerifyClusterMembership,
csrApproverAllowLegacyKubelet: s.csrApproverAllowLegacyKubelet,
csrApproverListReferrersConfig: s.csrApproverListReferrersConfig,
authAuthorizeServiceAccountMappingURL: s.authAuthorizeServiceAccountMappingURL,
authSyncNodeURL: s.authSyncNodeURL,
hmsAuthorizeSAMappingURL: s.hmsAuthorizeSAMappingURL,
hmsSyncNodeURL: s.hmsSyncNodeURL,
clearStalePodsOnNodeRegistration: s.clearStalePodsOnNodeRegistration,
}); err != nil {
klog.Fatalf("Failed to start %q: %v", name, err)
}
Expand Down
52 changes: 42 additions & 10 deletions cmd/gcp-controller-manager/node_csr_approver.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (
compute "google.golang.org/api/compute/v1"
container "google.golang.org/api/container/v1"
"google.golang.org/api/googleapi"
"k8s.io/apimachinery/pkg/util/wait"

authorization "k8s.io/api/authorization/v1"
capi "k8s.io/api/certificates/v1"
Expand All @@ -46,6 +47,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/apiserver/pkg/util/webhook"
corev1apply "k8s.io/client-go/applyconfigurations/core/v1"
"k8s.io/cloud-provider-gcp/pkg/csrmetrics"
"k8s.io/cloud-provider-gcp/pkg/nodeidentity"
Expand Down Expand Up @@ -695,6 +697,43 @@ func validateInstanceGroupHint(instanceGroupUrls []string, instanceGroupHint str
return resolved, nil
}

var errNotFoundListReferrers = errors.New("not found the entry in ListReferrers")

func checkInstanceReferrersBackOff(ctx *controllerContext, instance *compute.Instance, clusterInstanceGroupUrls []string) bool {
if !ctx.csrApproverListReferrersConfig.enabled {
return false
}
klog.Infof("Using compute.InstancesService.ListReferrers to verify cluster membership of instance %q", instance.Name)

var found bool
startTime := time.Now()
backoffPolicy := wait.Backoff{
Duration: ctx.csrApproverListReferrersConfig.initialInterval,
Factor: 1.5,
Jitter: 0.2,
Steps: ctx.csrApproverListReferrersConfig.retryCount,
}
webhook.WithExponentialBackoff(context.TODO(), backoffPolicy, func() error {
var retryErr error
found, retryErr = checkInstanceReferrers(ctx, instance, clusterInstanceGroupUrls)
if retryErr != nil || !found {
return errNotFoundListReferrers
}
return nil
},
func(err error) bool {
return err != nil
},
)

if found {
klog.V(2).Infof("Determined cluster membership of instance %q using compute.InstancesService.ListReferrers after %v", instance.Name, time.Since(startTime))
} else {
klog.Warningf("Could not determine cluster membership of instance %q using compute.InstancesService.ListReferrers after %v; falling back to checking all instance groups", instance.Name, time.Since(startTime))
}
return found
}

func checkInstanceReferrers(ctx *controllerContext, instance *compute.Instance, clusterInstanceGroupUrls []string) (bool, error) {
// instanceZone looks like
// "https://www.googleapis.com/compute/v1/projects/my-project/zones/us-central1-c"
Expand Down Expand Up @@ -743,16 +782,9 @@ func clusterHasInstance(ctx *controllerContext, instance *compute.Instance, inst
return false, err
}

if ctx.csrApproverUseGCEInstanceListReferrers {
klog.Infof("using compute.InstancesService.ListReferrers to verify cluster membership of instance %q", instance.Name)
ok, err := checkInstanceReferrers(ctx, instance, clusterInstanceGroupUrls)
if err != nil {
return false, err
}
if ok {
return true, nil
}
klog.Warningf("could not determine cluster membership of instance %q using compute.InstancesService.ListReferrers; falling back to checking all instance groups", instance.Name)
ok := checkInstanceReferrersBackOff(ctx, instance, clusterInstanceGroupUrls)
if ok {
return true, nil
}

validatedInstanceGroupHint, err := validateInstanceGroupHint(clusterInstanceGroupUrls, instanceGroupHint)
Expand Down
Loading

0 comments on commit 8fbe8d2

Please sign in to comment.