Skip to content

Commit

Permalink
TPU Provisioner reliability improvements (GoogleCloudPlatform#614)
Browse files Browse the repository at this point in the history
* use jobset job-key label in node pool name instead of owner UID; track jobset that triggered creation of  node pool instead of pod

* change node pool naming convention
  • Loading branch information
danielvegamyhre authored Apr 26, 2024
1 parent 1ed3e54 commit 20b802f
Show file tree
Hide file tree
Showing 4,202 changed files with 449 additions and 1,499,332 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
3 changes: 0 additions & 3 deletions tpu-provisioner/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@ RUN go mod download
COPY cmd/main.go cmd/main.go
COPY internal/ internal/

# Copy dependencies
COPY vendor/ vendor/

# Build
# the GOARCH has not a default value to allow the binary be built according to the host where the command
# was called. For example, if we call make docker-build in a local env which has the Apple Silicon M1 SO
Expand Down
35 changes: 18 additions & 17 deletions tpu-provisioner/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,7 @@ import (
"sync"
"time"

"k8s.io/apimachinery/pkg/runtime/schema"

"github.com/kelseyhightower/envconfig"
corev1 "k8s.io/api/core/v1"

// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
// to ensure that exec-entrypoint and run can make use of them.
Expand All @@ -41,16 +38,21 @@ import (
"cloud.google.com/go/compute/metadata"
"github.com/GoogleCloudPlatform/ai-on-gke/tpu-provisioner/internal/cloud"
"github.com/GoogleCloudPlatform/ai-on-gke/tpu-provisioner/internal/controller"

containerv1beta1 "google.golang.org/api/container/v1beta1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
clientgoscheme "k8s.io/client-go/kubernetes/scheme"

ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/cache"
"sigs.k8s.io/controller-runtime/pkg/config/v1alpha1"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
"sigs.k8s.io/controller-runtime/pkg/metrics/server"
"sigs.k8s.io/controller-runtime/pkg/webhook"
//+kubebuilder:scaffold:imports
)

Expand Down Expand Up @@ -109,30 +111,27 @@ func main() {
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))

mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
Metrics: server.Options{
BindAddress: metricsAddr,
},
WebhookServer: webhook.NewServer(
webhook.Options{
Port: 9443,
},
),
Scheme: scheme,
MetricsBindAddress: metricsAddr,
Port: 9443,
HealthProbeBindAddress: probeAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: "ecaf1259.google.com",
Controller: v1alpha1.ControllerConfigurationSpec{
GroupKindConcurrency: map[string]int{
// Concurrent node pool creations:
schema.GroupKind{Kind: "Pod"}.String(): cfg.Concurrency,
// Concurrent node pool deletions:
schema.GroupKind{Kind: "Node"}.String(): cfg.Concurrency,
},
},
NewCache: cache.BuilderWithOptions(cache.Options{
SelectorsByObject: cache.SelectorsByObject{
Cache: cache.Options{
ByObject: map[client.Object]cache.ByObject{
&corev1.Node{}: {
// Only listen for Nodes with label selectors indicating that they
// are managed by this controller.
Label: labels.SelectorFromSet(labels.Set{cloud.LabelNodepoolManager: cloud.LabelNodepoolManagerTPUPodinator}),
},
},
},
),
})
if err != nil {
setupLog.Error(err, "unable to start manager")
Expand Down Expand Up @@ -220,6 +219,7 @@ func main() {
PodCriteria: controller.PodCriteria{
ResourceType: cfg.PodResourceType,
},
Concurrency: cfg.Concurrency,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "CreationReconciler")
os.Exit(1)
Expand All @@ -234,6 +234,7 @@ func main() {
MinLifetime: cfg.NodeMinLifespan,
PoolDeletionDelay: cfg.NodepoolDeletionDelay,
},
Concurrency: cfg.Concurrency,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "DeletionReconciler")
os.Exit(1)
Expand Down
114 changes: 62 additions & 52 deletions tpu-provisioner/go.mod
Original file line number Diff line number Diff line change
@@ -1,81 +1,91 @@
module github.com/GoogleCloudPlatform/ai-on-gke/tpu-provisioner

go 1.19
go 1.22.0

require (
cloud.google.com/go/compute/metadata v0.2.3
cloud.google.com/go/compute/metadata v0.3.0
github.com/kelseyhightower/envconfig v1.4.0
github.com/onsi/ginkgo/v2 v2.6.0
github.com/onsi/gomega v1.24.1
golang.org/x/oauth2 v0.15.0
google.golang.org/api v0.153.0
k8s.io/api v0.26.0
k8s.io/apimachinery v0.26.0
k8s.io/client-go v0.26.0
k8s.io/klog/v2 v2.80.1
sigs.k8s.io/controller-runtime v0.14.1
github.com/onsi/ginkgo/v2 v2.17.1
github.com/onsi/gomega v1.32.0
golang.org/x/oauth2 v0.19.0
google.golang.org/api v0.176.1
k8s.io/api v0.30.0
k8s.io/apimachinery v0.30.0
k8s.io/client-go v0.30.0
k8s.io/klog/v2 v2.120.1
sigs.k8s.io/controller-runtime v0.18.0
sigs.k8s.io/jobset v0.5.0
)

require (
cloud.google.com/go/compute v1.23.3 // indirect
cloud.google.com/go/auth v0.3.0 // indirect
cloud.google.com/go/auth/oauth2adapt v0.2.2 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/emicklei/go-restful/v3 v3.9.0 // indirect
github.com/evanphx/json-patch/v5 v5.6.0 // indirect
github.com/fsnotify/fsnotify v1.6.0 // indirect
github.com/go-logr/logr v1.2.3 // indirect
github.com/go-logr/zapr v1.2.3 // indirect
github.com/go-openapi/jsonpointer v0.19.5 // indirect
github.com/go-openapi/jsonreference v0.20.0 // indirect
github.com/go-openapi/swag v0.19.14 // indirect
github.com/emicklei/go-restful/v3 v3.12.0 // indirect
github.com/evanphx/json-patch/v5 v5.9.0 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/fsnotify/fsnotify v1.7.0 // indirect
github.com/go-logr/logr v1.4.1 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-logr/zapr v1.3.0 // indirect
github.com/go-openapi/jsonpointer v0.21.0 // indirect
github.com/go-openapi/jsonreference v0.21.0 // indirect
github.com/go-openapi/swag v0.23.0 // indirect
github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/google/gnostic v0.5.7-v3refs // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/go-cmp v0.6.0 // indirect
github.com/google/gofuzz v1.1.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 // indirect
github.com/google/s2a-go v0.1.7 // indirect
github.com/google/uuid v1.4.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/googleapis/enterprise-certificate-proxy v0.3.2 // indirect
github.com/googleapis/gax-go/v2 v2.12.0 // indirect
github.com/imdario/mergo v0.3.6 // indirect
github.com/googleapis/gax-go/v2 v2.12.3 // indirect
github.com/imdario/mergo v0.3.16 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mailru/easyjson v0.7.6 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.2 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_golang v1.14.0 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
github.com/prometheus/common v0.37.0 // indirect
github.com/prometheus/procfs v0.8.0 // indirect
github.com/prometheus/client_golang v1.19.0 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.53.0 // indirect
github.com/prometheus/procfs v0.14.0 // indirect
github.com/spf13/pflag v1.0.5 // indirect
go.opencensus.io v0.24.0 // indirect
go.uber.org/atomic v1.7.0 // indirect
go.uber.org/multierr v1.6.0 // indirect
go.uber.org/zap v1.24.0 // indirect
golang.org/x/crypto v0.16.0 // indirect
golang.org/x/net v0.19.0 // indirect
golang.org/x/sys v0.15.0 // indirect
golang.org/x/term v0.15.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0 // indirect
go.opentelemetry.io/otel v1.26.0 // indirect
go.opentelemetry.io/otel/metric v1.26.0 // indirect
go.opentelemetry.io/otel/trace v1.26.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
go.uber.org/zap v1.26.0 // indirect
golang.org/x/crypto v0.22.0 // indirect
golang.org/x/exp v0.0.0-20240416160154-fe59bbe5cc7f // indirect
golang.org/x/net v0.24.0 // indirect
golang.org/x/sys v0.19.0 // indirect
golang.org/x/term v0.19.0 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/time v0.5.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.2.0 // indirect
google.golang.org/appengine v1.6.8 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20231127180814-3a041ad873d4 // indirect
google.golang.org/grpc v1.59.0 // indirect
google.golang.org/protobuf v1.31.0 // indirect
golang.org/x/tools v0.20.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240415180920-8c6c420018be // indirect
google.golang.org/grpc v1.63.2 // indirect
google.golang.org/protobuf v1.33.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
k8s.io/apiextensions-apiserver v0.26.0 // indirect
k8s.io/component-base v0.26.0 // indirect
k8s.io/kube-openapi v0.0.0-20221012153701-172d655c2280 // indirect
k8s.io/utils v0.0.0-20221128185143-99ec85e7a448 // indirect
sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
sigs.k8s.io/yaml v1.3.0 // indirect
k8s.io/apiextensions-apiserver v0.30.0 // indirect
k8s.io/component-base v0.30.0 // indirect
k8s.io/kube-openapi v0.0.0-20240423202451-8948a665c108 // indirect
k8s.io/utils v0.0.0-20240423183400-0849a56e8f22 // indirect
sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
sigs.k8s.io/yaml v1.4.0 // indirect
)
Loading

0 comments on commit 20b802f

Please sign in to comment.