diff --git a/Makefile b/Makefile index 5c66e19fa6e..10f8af95603 100755 --- a/Makefile +++ b/Makefile @@ -81,13 +81,18 @@ endif sync-go-mod: go mod tidy -go $(GO_VERSION) +CONTROLLER_GEN = $(shell pwd)/bin/controller-gen +.PHONY: controller-gen +controller-gen: + @GOBIN=$(shell pwd)/bin GO111MODULE=on go install sigs.k8s.io/controller-tools/cmd/controller-gen@v0.12.0 + # Run this if you update any existing controller APIs. # 1. Generate deepcopy, clientset, listers, informers for the APIs (hack/update-codegen.sh) # 2. Generate open-api for the APIs (hack/update-openapigen) # 3. Generate Python SDK for Katib (hack/gen-python-sdk/gen-sdk.sh) # 4. Generate gRPC manager APIs (pkg/apis/manager/v1beta1/build.sh and pkg/apis/manager/health/build.sh) # 5. Generate Go mock codes -generate: +generate: controller-gen ifndef GOPATH $(error GOPATH not defined, please define GOPATH. Run "go help gopath" to learn more about GOPATH) endif diff --git a/cmd/katib-controller/v1beta1/main.go b/cmd/katib-controller/v1beta1/main.go index 5d8ee5f5fd0..fe13f15e48b 100644 --- a/cmd/katib-controller/v1beta1/main.go +++ b/cmd/katib-controller/v1beta1/main.go @@ -24,6 +24,7 @@ import ( "os" "github.com/spf13/viper" + "k8s.io/apimachinery/pkg/runtime" _ "k8s.io/client-go/plugin/pkg/client/auth/gcp" "sigs.k8s.io/controller-runtime/pkg/client/config" "sigs.k8s.io/controller-runtime/pkg/healthz" @@ -32,38 +33,32 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/manager/signals" + configapi "github.com/kubeflow/katib/pkg/apis/config/v1beta1" apis "github.com/kubeflow/katib/pkg/apis/controller" - controller "github.com/kubeflow/katib/pkg/controller.v1beta1" + "github.com/kubeflow/katib/pkg/controller.v1beta1" "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" - trialutil "github.com/kubeflow/katib/pkg/controller.v1beta1/trial/util" + "github.com/kubeflow/katib/pkg/util/v1beta1/katibconfig" webhook "github.com/kubeflow/katib/pkg/webhook/v1beta1" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" ) +var scheme = runtime.NewScheme() + +func init() { + utilruntime.Must(apis.AddToScheme(scheme)) + utilruntime.Must(configapi.AddToScheme(scheme)) + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) +} + func main() { logf.SetLogger(zap.New()) log := logf.Log.WithName("entrypoint") - var experimentSuggestionName string - var metricsAddr string - var healthzAddr string - var webhookPort int - var injectSecurityContext bool - var enableGRPCProbeInSuggestion bool - var trialResources trialutil.GvkListFlag - var enableLeaderElection bool - var leaderElectionID string - - flag.StringVar(&experimentSuggestionName, "experiment-suggestion-name", - "default", "The implementation of suggestion interface in experiment controller (default)") - flag.StringVar(&metricsAddr, "metrics-addr", ":8080", "The address the metric endpoint binds to.") - flag.StringVar(&healthzAddr, "healthz-addr", ":18080", "The address the healthz endpoint binds to.") - flag.BoolVar(&injectSecurityContext, "webhook-inject-securitycontext", false, "Inject the securityContext of container[0] in the sidecar") - flag.BoolVar(&enableGRPCProbeInSuggestion, "enable-grpc-probe-in-suggestion", true, "enable grpc probe in suggestions") - flag.Var(&trialResources, "trial-resources", "The list of resources that can be used as trial template, in the form: Kind.version.group (e.g. TFJob.v1.kubeflow.org)") - flag.IntVar(&webhookPort, "webhook-port", 8443, "The port number to be used for admission webhook server.") - // For leader election - flag.BoolVar(&enableLeaderElection, "enable-leader-election", false, "Enable leader election for katib-controller. Enabling this will ensure there is only one active katib-controller.") - flag.StringVar(&leaderElectionID, "leader-election-id", "3fbc96e9.katib.kubeflow.org", "The ID for leader election.") + var katibConfigFile string + flag.StringVar(&katibConfigFile, "katib-config", "", + "The katib-controller will load its initial configuration from this file. "+ + "Omit this flag to use the default configuration values. ") // TODO (andreyvelich): Currently it is not possible to set different webhook service name. // flag.StringVar(&serviceName, "webhook-service-name", "katib-controller", "The service name which will be used in webhook") @@ -72,21 +67,33 @@ func main() { flag.Parse() + inintConfig, err := katibconfig.GetInitConfigData(scheme, katibConfigFile) + if err != nil { + log.Error(err, "Failed to get KatibConfig") + os.Exit(1) + } + // Set the config in viper. - viper.Set(consts.ConfigExperimentSuggestionName, experimentSuggestionName) - viper.Set(consts.ConfigInjectSecurityContext, injectSecurityContext) - viper.Set(consts.ConfigEnableGRPCProbeInSuggestion, enableGRPCProbeInSuggestion) - viper.Set(consts.ConfigTrialResources, trialResources) + viper.Set(consts.ConfigExperimentSuggestionName, inintConfig.ControllerConfig.ExperimentSuggestionName) + viper.Set(consts.ConfigInjectSecurityContext, inintConfig.ControllerConfig.InjectSecurityContext) + viper.Set(consts.ConfigEnableGRPCProbeInSuggestion, inintConfig.ControllerConfig.EnableGRPCProbeInSuggestion) + + trialGVKs, err := configapi.TrialResourcesToGVKs(inintConfig.ControllerConfig.TrialResources) + if err != nil { + log.Error(err, "Failed to parse trialResources") + os.Exit(1) + } + viper.Set(consts.ConfigTrialResources, trialGVKs) log.Info("Config:", consts.ConfigExperimentSuggestionName, viper.GetString(consts.ConfigExperimentSuggestionName), "webhook-port", - webhookPort, + inintConfig.ControllerConfig.WebhookPort, "metrics-addr", - metricsAddr, + inintConfig.ControllerConfig.MetricsAddr, "healthz-addr", - healthzAddr, + inintConfig.ControllerConfig.HealthzAddr, consts.ConfigInjectSecurityContext, viper.GetBool(consts.ConfigInjectSecurityContext), consts.ConfigEnableGRPCProbeInSuggestion, @@ -104,10 +111,11 @@ func main() { // Create a new katib controller to provide shared dependencies and start components mgr, err := manager.New(cfg, manager.Options{ - MetricsBindAddress: metricsAddr, - HealthProbeBindAddress: healthzAddr, - LeaderElection: enableLeaderElection, - LeaderElectionID: leaderElectionID, + MetricsBindAddress: inintConfig.ControllerConfig.MetricsAddr, + HealthProbeBindAddress: inintConfig.ControllerConfig.HealthzAddr, + LeaderElection: *inintConfig.ControllerConfig.EnableLeaderElection, + LeaderElectionID: inintConfig.ControllerConfig.LeaderElectionID, + Scheme: scheme, }) if err != nil { log.Error(err, "Failed to create the manager") @@ -116,12 +124,6 @@ func main() { log.Info("Registering Components.") - // Setup Scheme for all resources - if err := apis.AddToScheme(mgr.GetScheme()); err != nil { - log.Error(err, "Unable to add APIs to scheme") - os.Exit(1) - } - // Setup all Controllers log.Info("Setting up controller.") if err := controller.AddToManager(mgr); err != nil { @@ -130,7 +132,7 @@ func main() { } log.Info("Setting up webhooks.") - if err := webhook.AddToManager(mgr, webhookPort); err != nil { + if err := webhook.AddToManager(mgr, *inintConfig.ControllerConfig.WebhookPort); err != nil { log.Error(err, "Unable to register webhooks to the manager") os.Exit(1) } diff --git a/go.mod b/go.mod index 053a556f274..71425319925 100644 --- a/go.mod +++ b/go.mod @@ -10,6 +10,7 @@ require ( github.com/go-sql-driver/mysql v1.5.0 github.com/golang/mock v1.6.0 github.com/golang/protobuf v1.5.2 + github.com/google/go-cmp v0.5.9 github.com/google/go-containerregistry v0.9.0 github.com/google/go-containerregistry/pkg/authn/k8schain v0.0.0-20211222182933-7c19fa370dbd github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 @@ -24,12 +25,14 @@ require ( github.com/tidwall/gjson v1.14.1 golang.org/x/net v0.5.0 google.golang.org/grpc v1.53.0 + gopkg.in/yaml.v3 v3.0.1 k8s.io/api v0.25.3 k8s.io/apimachinery v0.25.3 k8s.io/client-go v0.25.3 k8s.io/code-generator v0.25.3 k8s.io/klog v1.0.0 k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 + k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed sigs.k8s.io/controller-runtime v0.13.0 ) @@ -69,7 +72,6 @@ require ( github.com/golang-jwt/jwt/v4 v4.2.0 // indirect github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect github.com/google/gnostic v0.5.7-v3refs // indirect - github.com/google/go-cmp v0.5.9 // indirect github.com/google/gofuzz v1.2.0 // indirect github.com/google/uuid v1.3.0 // indirect github.com/hashicorp/hcl v1.0.0 // indirect @@ -129,7 +131,6 @@ require ( gopkg.in/ini.v1 v1.63.2 // indirect gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect gotest.tools/v3 v3.1.0 // indirect k8s.io/apiextensions-apiserver v0.25.0 // indirect k8s.io/cloud-provider v0.21.0 // indirect @@ -137,7 +138,6 @@ require ( k8s.io/gengo v0.0.0-20211129171323-c02415ce4185 // indirect k8s.io/klog/v2 v2.70.1 // indirect k8s.io/legacy-cloud-providers v0.21.0 // indirect - k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed // indirect sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect sigs.k8s.io/yaml v1.3.0 // indirect diff --git a/hack/update-codegen.sh b/hack/update-codegen.sh index 1756ad3026c..c924aa69df9 100755 --- a/hack/update-codegen.sh +++ b/hack/update-codegen.sh @@ -26,7 +26,7 @@ if [[ -z "${GOPATH:-}" ]]; then fi # Grab code-generator version from go.mod -CODEGEN_VERSION=$(cd ../../.. && grep 'k8s.io/code-generator' go.mod | awk '{print $2}') +CODEGEN_VERSION=$(cd ../.. && grep 'k8s.io/code-generator' go.mod | awk '{print $2}') CODEGEN_PKG="$GOPATH/pkg/mod/k8s.io/code-generator@${CODEGEN_VERSION}" if [[ ! -d "${CODEGEN_PKG}" ]]; then @@ -53,3 +53,8 @@ echo "Generating clients for ${GROUP_VERSIONS} ..." github.com/kubeflow/katib/pkg/apis/controller \ "${GROUP_VERSIONS}" \ --go-header-file "${PROJECT_ROOT}/hack/boilerplate/boilerplate.go.txt" + + +"${PROJECT_ROOT}/bin/controller-gen" \ + object:headerFile="${PROJECT_ROOT}/hack/boilerplate/boilerplate.go.txt" \ + paths="${PROJECT_ROOT}/pkg/apis/config/..." diff --git a/hack/update-openapigen.sh b/hack/update-openapigen.sh index 9f4f55aa40f..1b5639454c9 100755 --- a/hack/update-openapigen.sh +++ b/hack/update-openapigen.sh @@ -26,7 +26,7 @@ if [[ -z "${GOPATH:-}" ]]; then fi # Grab code-generator version from go.mod -CODEGEN_VERSION=$(cd ../../.. && grep 'k8s.io/code-generator' go.mod | awk '{print $2}') +CODEGEN_VERSION=$(cd ../.. && grep 'k8s.io/code-generator' go.mod | awk '{print $2}') CODEGEN_PKG="${GOPATH}/pkg/mod/k8s.io/code-generator@${CODEGEN_VERSION}" if [[ ! -d ${CODEGEN_PKG} ]]; then diff --git a/manifests/v1beta1/components/controller/controller.yaml b/manifests/v1beta1/components/controller/controller.yaml index 51487d1631e..75d832af51e 100644 --- a/manifests/v1beta1/components/controller/controller.yaml +++ b/manifests/v1beta1/components/controller/controller.yaml @@ -26,13 +26,7 @@ spec: image: docker.io/kubeflowkatib/katib-controller command: ["./katib-controller"] args: - - "--webhook-port=8443" - - "--trial-resources=Job.v1.batch" - - "--trial-resources=TFJob.v1.kubeflow.org" - - "--trial-resources=PyTorchJob.v1.kubeflow.org" - - "--trial-resources=MPIJob.v1.kubeflow.org" - - "--trial-resources=XGBoostJob.v1.kubeflow.org" - - "--trial-resources=MXJob.v1.kubeflow.org" + - "--katib-config=katib-config.yaml" ports: - containerPort: 8443 name: webhook @@ -60,8 +54,15 @@ spec: - mountPath: /tmp/cert name: cert readOnly: true + - mountPath: /katib-config.yaml + name: katib-config + subPath: katib-config.yaml + readOnly: true volumes: - name: cert secret: defaultMode: 420 secretName: katib-webhook-cert + - name: katib-config + configMap: + name: katib-config diff --git a/manifests/v1beta1/components/controller/katib-config.yaml b/manifests/v1beta1/components/controller/katib-config.yaml index d9525a96e3e..185fedc20f4 100644 --- a/manifests/v1beta1/components/controller/katib-config.yaml +++ b/manifests/v1beta1/components/controller/katib-config.yaml @@ -1,81 +1,59 @@ --- -apiVersion: v1 -kind: ConfigMap -metadata: - name: katib-config - namespace: kubeflow -data: - metrics-collector-sidecar: |- - { - "StdOut": { - "image": "docker.io/kubeflowkatib/file-metrics-collector:latest" - }, - "File": { - "image": "docker.io/kubeflowkatib/file-metrics-collector:latest" - }, - "TensorFlowEvent": { - "image": "docker.io/kubeflowkatib/tfevent-metrics-collector:latest", - "resources": { - "limits": { - "memory": "1Gi" - } - } - } - } - suggestion: |- - { - "random": { - "image": "docker.io/kubeflowkatib/suggestion-hyperopt:latest" - }, - "tpe": { - "image": "docker.io/kubeflowkatib/suggestion-hyperopt:latest" - }, - "grid": { - "image": "docker.io/kubeflowkatib/suggestion-optuna:latest" - }, - "hyperband": { - "image": "docker.io/kubeflowkatib/suggestion-hyperband:latest" - }, - "bayesianoptimization": { - "image": "docker.io/kubeflowkatib/suggestion-skopt:latest" - }, - "cmaes": { - "image": "docker.io/kubeflowkatib/suggestion-goptuna:latest" - }, - "sobol": { - "image": "docker.io/kubeflowkatib/suggestion-goptuna:latest" - }, - "multivariate-tpe": { - "image": "docker.io/kubeflowkatib/suggestion-optuna:latest" - }, - "enas": { - "image": "docker.io/kubeflowkatib/suggestion-enas:latest", - "resources": { - "limits": { - "memory": "200Mi" - } - } - }, - "darts": { - "image": "docker.io/kubeflowkatib/suggestion-darts:latest" - }, - "pbt": { - "image": "docker.io/kubeflowkatib/suggestion-pbt:latest", - "persistentVolumeClaimSpec": { - "accessModes": [ - "ReadWriteMany" - ], - "resources": { - "requests": { - "storage": "5Gi" - } - } - } - } - } - early-stopping: |- - { - "medianstop": { - "image": "docker.io/kubeflowkatib/earlystopping-medianstop:latest" - } - } +apiVersion: config.kubeflow.org/v1beta1 +kind: KatibConfig +init: + controller: + webhookPort: 8443 + trialResources: + - Job.v1.batch + - TFJob.v1.kubeflow.org + - PyTorchJob.v1.kubeflow.org + - MPIJob.v1.kubeflow.org + - XGBoostJob.v1.kubeflow.org + - MXJob.v1.kubeflow.org +runtime: + metricsCollectorSidecars: + - collectorKind: StdOut + image: docker.io/kubeflowkatib/file-metrics-collector:latest + - collectorKind: File + image: docker.io/kubeflowkatib/file-metrics-collector:latest + - collectorKind: TensorFlowEvent + image: docker.io/kubeflowkatib/tfevent-metrics-collector:latest + resources: + limits: + memory: 1Gi + suggestions: + - algorithmName: random + image: docker.io/kubeflowkatib/suggestion-hyperopt:latest + - algorithmName: tpe + image: docker.io/kubeflowkatib/suggestion-hyperopt:latest + - algorithmName: grid + image: docker.io/kubeflowkatib/suggestion-optuna:latest + - algorithmName: hyperband + image: docker.io/kubeflowkatib/suggestion-hyperband:latest + - algorithmName: bayesianoptimization + image: docker.io/kubeflowkatib/suggestion-skopt:latest + - algorithmName: cmaes + image: docker.io/kubeflowkatib/suggestion-goptuna:latest + - algorithmName: sobol + image: docker.io/kubeflowkatib/suggestion-goptuna:latest + - algorithmName: multivariate-tpe + image: docker.io/kubeflowkatib/suggestion-optuna:latest + - algorithmName: enas + image: docker.io/kubeflowkatib/suggestion-enas:latest + resources: + limits: + memory: 200Mi + - algorithmName: darts + image: docker.io/kubeflowkatib/suggestion-darts:latest + - algorithmName: pbt + image: docker.io/kubeflowkatib/suggestion-pbt:latest + persistentVolumeClaimSpec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 5Gi + earlyStoppings: + - algorithmName: medianstop + image: docker.io/kubeflowkatib/earlystopping-medianstop:latest diff --git a/manifests/v1beta1/components/controller/kustomization.yaml b/manifests/v1beta1/components/controller/kustomization.yaml index 9d410a6fcf1..d306b056b59 100644 --- a/manifests/v1beta1/components/controller/kustomization.yaml +++ b/manifests/v1beta1/components/controller/kustomization.yaml @@ -4,7 +4,10 @@ kind: Kustomization resources: - controller.yaml - - katib-config.yaml - rbac.yaml - service.yaml - trial-templates.yaml +configMapGenerator: + - name: katib-config + files: + - katib-config.yaml diff --git a/pkg/apis/apis.go b/pkg/apis/apis.go new file mode 100644 index 00000000000..185f6bbce50 --- /dev/null +++ b/pkg/apis/apis.go @@ -0,0 +1,23 @@ +/* +Copyright 2023 The Kubeflow Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Generate deepcopy, clientset, listers, informers for apis +//go:generate ../../hack/update-codegen.sh + +// Generate open-api for apis +//go:generate ../../hack/update-openapigen.sh + +package apis diff --git a/pkg/apis/config/v1beta1/defaults.go b/pkg/apis/config/v1beta1/defaults.go new file mode 100644 index 00000000000..cc3cd7433f9 --- /dev/null +++ b/pkg/apis/config/v1beta1/defaults.go @@ -0,0 +1,234 @@ +package v1beta1 + +import ( + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/equality" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/apimachinery/pkg/runtime" + + "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" +) + +const ( + DefaultExperimentSuggestionName = "default" + DefaultMetricsAddr = ":8080" + DefaultHealthzAddr = ":18080" + DefaultLeaderElectionID = "3fbc96e9.katib.kubeflow.org" +) + +var ( + DefaultInjectSecurityContext = false + DefaultEnableGRPCProbeInSuggestion = true + DefaultWebhookPort = 8443 + DefaultEnableLeaderElection = false + DefaultTrialResources = []string{"Job.v1.batch"} +) + +func addDefaultingFuncs(scheme *runtime.Scheme) error { + scheme.AddTypeDefaultingFunc(&KatibConfig{}, func(obj interface{}) { + SetDefaults_KatibConfig(obj.(*KatibConfig)) + }) + return nil +} + +func SetDefaults_KatibConfig(cfg *KatibConfig) { + if cfg == nil { + return + } + setInitConfig(&cfg.InitConfig) + setRuntimeConfig(&cfg.RuntimeConfig) +} + +func setInitConfig(initConfig *InitConfig) { + // Set ExperimentSuggestionName. + if initConfig.ControllerConfig.ExperimentSuggestionName == "" { + initConfig.ControllerConfig.ExperimentSuggestionName = DefaultExperimentSuggestionName + } + // Set MetricsAddr. + if initConfig.ControllerConfig.MetricsAddr == "" { + initConfig.ControllerConfig.MetricsAddr = DefaultMetricsAddr + } + // Set HealthzAddr. + if initConfig.ControllerConfig.HealthzAddr == "" { + initConfig.ControllerConfig.HealthzAddr = DefaultHealthzAddr + } + // Set InjectSecurityContext. + if initConfig.ControllerConfig.InjectSecurityContext == nil { + initConfig.ControllerConfig.InjectSecurityContext = &DefaultInjectSecurityContext + } + // Set EnableGRPCProbeInSuggestion. + if initConfig.ControllerConfig.EnableGRPCProbeInSuggestion == nil { + initConfig.ControllerConfig.EnableGRPCProbeInSuggestion = &DefaultEnableGRPCProbeInSuggestion + } + // Set TrialResources. + if len(initConfig.ControllerConfig.TrialResources) == 0 { + initConfig.ControllerConfig.TrialResources = DefaultTrialResources + } + // Set WebhookPort. + if initConfig.ControllerConfig.WebhookPort == nil { + initConfig.ControllerConfig.WebhookPort = &DefaultWebhookPort + } + // Set EnableLeaderElection. + if initConfig.ControllerConfig.EnableLeaderElection == nil { + initConfig.ControllerConfig.EnableLeaderElection = &DefaultEnableLeaderElection + } + // Set LeaderElectionID. + if initConfig.ControllerConfig.LeaderElectionID == "" { + initConfig.ControllerConfig.LeaderElectionID = DefaultLeaderElectionID + } +} + +func setRuntimeConfig(runtimeConfig *RuntimeConfig) { + setSuggestionConfigs(runtimeConfig.SuggestionConfigs) + setMetricsCollectorConfigs(runtimeConfig.MetricsCollectorConfigs) + setEarlyStoppingConfigs(runtimeConfig.EarlyStoppingConfigs) +} + +func setSuggestionConfigs(suggestionConfigs []SuggestionConfig) { + for i := range suggestionConfigs { + // Set Image Pull Policy + suggestionConfigs[i].ImagePullPolicy = setImagePullPolicy(suggestionConfigs[i].ImagePullPolicy) + + // Set resource requirements for suggestion + suggestionConfigs[i].Resources = setResourceRequirements(suggestionConfigs[i].Resources) + + // Set default suggestion container volume mount path + if suggestionConfigs[i].VolumeMountPath == "" { + suggestionConfigs[i].VolumeMountPath = consts.DefaultContainerSuggestionVolumeMountPath + } + + // Get persistent volume claim spec from config + pvcSpec := suggestionConfigs[i].PersistentVolumeClaimSpec + + // Set default access modes + if len(pvcSpec.AccessModes) == 0 { + pvcSpec.AccessModes = []corev1.PersistentVolumeAccessMode{ + consts.DefaultSuggestionVolumeAccessMode, + } + } + + // Set default resources + if len(pvcSpec.Resources.Requests) == 0 { + defaultVolumeStorage, _ := resource.ParseQuantity(consts.DefaultSuggestionVolumeStorage) + pvcSpec.Resources.Requests = make(map[corev1.ResourceName]resource.Quantity) + pvcSpec.Resources.Requests[corev1.ResourceStorage] = defaultVolumeStorage + } + + // Set PVC back for suggestion config. + suggestionConfigs[i].PersistentVolumeClaimSpec = pvcSpec + + // Get PV from config only if it exists. + if !equality.Semantic.DeepEqual(suggestionConfigs[i].PersistentVolumeSpec, corev1.PersistentVolumeSpec{}) { + + // Set PersistentVolumeReclaimPolicy to "Delete" to automatically delete PV once PVC is deleted. + // Kubernetes doesn't allow to specify ownerReferences for the cluster-scoped + // resources (which PV is) with namespace-scoped owner (which Suggestion is). + suggestionConfigs[i].PersistentVolumeSpec.PersistentVolumeReclaimPolicy = corev1.PersistentVolumeReclaimDelete + } + } +} + +func setMetricsCollectorConfigs(metricsCollectorConfigs []MetricsCollectorConfig) { + for i := range metricsCollectorConfigs { + // Set Image Pull Policy + metricsCollectorConfigs[i].ImagePullPolicy = setImagePullPolicy(metricsCollectorConfigs[i].ImagePullPolicy) + + // Set resource requirements for metrics collector + metricsCollectorConfigs[i].Resource = setResourceRequirements(metricsCollectorConfigs[i].Resource) + } +} + +func setEarlyStoppingConfigs(earlyStoppingConfigs []EarlyStoppingConfig) { + for i := range earlyStoppingConfigs { + // Set Image Pull Policy. + earlyStoppingConfigs[i].ImagePullPolicy = setImagePullPolicy(earlyStoppingConfigs[i].ImagePullPolicy) + + // Set resource requirements + earlyStoppingConfigs[i].Resource = setResourceRequirements(earlyStoppingConfigs[i].Resource) + } +} + +func setImagePullPolicy(imagePullPolicy corev1.PullPolicy) corev1.PullPolicy { + if imagePullPolicy != corev1.PullAlways && imagePullPolicy != corev1.PullIfNotPresent && imagePullPolicy != corev1.PullNever { + return consts.DefaultImagePullPolicy + } + return imagePullPolicy +} + +func setResourceRequirements(configResource corev1.ResourceRequirements) corev1.ResourceRequirements { + // If requests are empty create new map + if len(configResource.Requests) == 0 { + configResource.Requests = make(map[corev1.ResourceName]resource.Quantity) + } + + // Get CPU, Memory and Disk Requests from config + cpuRequest := configResource.Requests[corev1.ResourceCPU] + memRequest := configResource.Requests[corev1.ResourceMemory] + diskRequest := configResource.Requests[corev1.ResourceEphemeralStorage] + + // If resource is empty set default value for CPU, Memory, Disk + if cpuRequest.IsZero() { + defaultCPURequest, _ := resource.ParseQuantity(consts.DefaultCPURequest) + configResource.Requests[corev1.ResourceCPU] = defaultCPURequest + } + if memRequest.IsZero() { + defaultMemRequest, _ := resource.ParseQuantity(consts.DefaultMemRequest) + configResource.Requests[corev1.ResourceMemory] = defaultMemRequest + } + if diskRequest.IsZero() { + defaultDiskRequest, _ := resource.ParseQuantity(consts.DefaultDiskRequest) + configResource.Requests[corev1.ResourceEphemeralStorage] = defaultDiskRequest + } + + // If limits are empty create new map + if len(configResource.Limits) == 0 { + configResource.Limits = make(map[corev1.ResourceName]resource.Quantity) + } + + // Get CPU, Memory and Disk Limits from config + cpuLimit := configResource.Limits[corev1.ResourceCPU] + memLimit := configResource.Limits[corev1.ResourceMemory] + diskLimit := configResource.Limits[corev1.ResourceEphemeralStorage] + + // If limit is empty set default value for CPU, Memory, Disk + if cpuLimit.IsZero() { + defaultCPULimit, _ := resource.ParseQuantity(consts.DefaultCPULimit) + configResource.Limits[corev1.ResourceCPU] = defaultCPULimit + } + if memLimit.IsZero() { + defaultMemLimit, _ := resource.ParseQuantity(consts.DefaultMemLimit) + configResource.Limits[corev1.ResourceMemory] = defaultMemLimit + } + if diskLimit.IsZero() { + defaultDiskLimit, _ := resource.ParseQuantity(consts.DefaultDiskLimit) + configResource.Limits[corev1.ResourceEphemeralStorage] = defaultDiskLimit + } + + // If user explicitly sets CPU value to -1, nuke it. + if cpuLimit.Sign() == -1 { + delete(configResource.Limits, corev1.ResourceCPU) + } + if cpuRequest.Sign() == -1 { + delete(configResource.Requests, corev1.ResourceCPU) + } + + // If user explicitly sets Memory value to -1, nuke it. + if memLimit.Sign() == -1 { + delete(configResource.Limits, corev1.ResourceMemory) + } + if memRequest.Sign() == -1 { + delete(configResource.Requests, corev1.ResourceMemory) + } + + // If user explicitly sets ephemeral-storage value to something negative, nuke it. + // This enables compatibility with the GKE nodepool autoscalers, which cannot scale + // pods which define ephemeral-storage resource constraints. + if diskLimit.Sign() == -1 { + delete(configResource.Limits, corev1.ResourceEphemeralStorage) + } + if diskRequest.Sign() == -1 { + delete(configResource.Requests, corev1.ResourceEphemeralStorage) + } + + return configResource +} diff --git a/pkg/apis/config/v1beta1/defaults_test.go b/pkg/apis/config/v1beta1/defaults_test.go new file mode 100644 index 00000000000..e8bef7502a4 --- /dev/null +++ b/pkg/apis/config/v1beta1/defaults_test.go @@ -0,0 +1,369 @@ +package v1beta1 + +import ( + "fmt" + "testing" + + "github.com/google/go-cmp/cmp" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/utils/pointer" + + commonv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1" + "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" +) + +func TestSetSuggestionConfigs(t *testing.T) { + const testAlgorithmName = "test-suggestion" + + cases := map[string]struct { + config []SuggestionConfig + wantConfig []SuggestionConfig + }{ + "All parameters correctly are specified": { + config: func() []SuggestionConfig { + suggestionConfig := newFakeSuggestionConfig(testAlgorithmName) + suggestionConfig.ImagePullPolicy = corev1.PullAlways + suggestionConfig.Resources = *newFakeCustomResourceRequirements() + return []SuggestionConfig{*suggestionConfig} + }(), + wantConfig: func() []SuggestionConfig { + c := newFakeSuggestionConfig(testAlgorithmName) + c.ImagePullPolicy = corev1.PullAlways + c.Resources = *newFakeCustomResourceRequirements() + return []SuggestionConfig{*c} + }(), + }, + fmt.Sprintf("GetSuggestionConfigData sets %s to imagePullPolicy", consts.DefaultImagePullPolicy): { + config: func() []SuggestionConfig { + suggestion := newFakeSuggestionConfig(testAlgorithmName) + suggestion.ImagePullPolicy = "" + return []SuggestionConfig{*suggestion} + }(), + wantConfig: []SuggestionConfig{*newFakeSuggestionConfig(testAlgorithmName)}, + }, + "GetSuggestionConfigData sets resource.requests and resource.limits for the suggestion service": { + config: func() []SuggestionConfig { + suggestion := newFakeSuggestionConfig(testAlgorithmName) + suggestion.Resources = corev1.ResourceRequirements{} + return []SuggestionConfig{*suggestion} + }(), + wantConfig: []SuggestionConfig{*newFakeSuggestionConfig(testAlgorithmName)}, + }, + fmt.Sprintf("GetSuggestionConfigData sets %s to volumeMountPath", consts.DefaultContainerSuggestionVolumeMountPath): { + config: func() []SuggestionConfig { + suggestion := newFakeSuggestionConfig(testAlgorithmName) + suggestion.VolumeMountPath = "" + return []SuggestionConfig{*suggestion} + }(), + wantConfig: []SuggestionConfig{*newFakeSuggestionConfig(testAlgorithmName)}, + }, + "GetSuggestionConfigData sets accessMode and resource.requests for PVC": { + config: func() []SuggestionConfig { + suggestion := newFakeSuggestionConfig(testAlgorithmName) + suggestion.PersistentVolumeClaimSpec = corev1.PersistentVolumeClaimSpec{} + return []SuggestionConfig{*suggestion} + }(), + wantConfig: []SuggestionConfig{*newFakeSuggestionConfig(testAlgorithmName)}, + }, + fmt.Sprintf("GetSuggestionConfigData does not set %s to persistentVolumeReclaimPolicy", corev1.PersistentVolumeReclaimDelete): { + config: func() []SuggestionConfig { + suggestion := newFakeSuggestionConfig(testAlgorithmName) + suggestion.PersistentVolumeSpec = corev1.PersistentVolumeSpec{} + return []SuggestionConfig{*suggestion} + }(), + wantConfig: func() []SuggestionConfig { + c := newFakeSuggestionConfig(testAlgorithmName) + c.PersistentVolumeSpec = corev1.PersistentVolumeSpec{} + return []SuggestionConfig{*c} + }(), + }, + } + for name, tc := range cases { + t.Run(name, func(t *testing.T) { + kc := &KatibConfig{ + RuntimeConfig: RuntimeConfig{ + SuggestionConfigs: tc.config, + }, + } + SetDefaults_KatibConfig(kc) + if diff := cmp.Diff(tc.wantConfig, kc.RuntimeConfig.SuggestionConfigs); len(diff) != 0 { + t.Errorf("Unexpected SuggestionConfigs (-want,+got):\n%s", diff) + } + }) + } +} + +func TestSetEarlyStoppingConfigs(t *testing.T) { + const testAlgorithmName = "test-early-stopping" + + cases := map[string]struct { + config []EarlyStoppingConfig + wantConfig []EarlyStoppingConfig + }{ + "All parameters correctly are specified": { + config: func() []EarlyStoppingConfig { + config := newFakeEarlyStoppingConfig(testAlgorithmName) + config.ImagePullPolicy = corev1.PullIfNotPresent + return []EarlyStoppingConfig{*config} + }(), + wantConfig: func() []EarlyStoppingConfig { + c := newFakeEarlyStoppingConfig(testAlgorithmName) + c.ImagePullPolicy = corev1.PullIfNotPresent + return []EarlyStoppingConfig{*c} + }(), + }, + fmt.Sprintf("GetEarlyStoppingConfigData sets %s to imagePullPolicy", consts.DefaultImagePullPolicy): { + config: func() []EarlyStoppingConfig { + c := newFakeEarlyStoppingConfig(testAlgorithmName) + c.ImagePullPolicy = "" + return []EarlyStoppingConfig{*c} + }(), + wantConfig: []EarlyStoppingConfig{*newFakeEarlyStoppingConfig(testAlgorithmName)}, + }, + } + for name, tc := range cases { + t.Run(name, func(t *testing.T) { + kc := &KatibConfig{ + RuntimeConfig: RuntimeConfig{ + EarlyStoppingConfigs: tc.config, + }, + } + SetDefaults_KatibConfig(kc) + if diff := cmp.Diff(tc.wantConfig, kc.RuntimeConfig.EarlyStoppingConfigs); len(diff) != 0 { + t.Errorf("Unexpected EarlyStoppingConfigs (-want,+got):\n%s", diff) + } + }) + } +} + +func TestSetMetricsCollectorConfigs(t *testing.T) { + const ( + invalidCollectorKind commonv1beta1.CollectorKind = "invalidCollector" + testCollectorKind commonv1beta1.CollectorKind = "testCollector" + ) + nukeResource, _ := resource.ParseQuantity("-1") + nukeResourceRequirements := map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: nukeResource, + corev1.ResourceMemory: nukeResource, + corev1.ResourceEphemeralStorage: nukeResource, + } + + cases := map[string]struct { + config, wantConfig []MetricsCollectorConfig + }{ + "All parameters correctly are specified": { + config: func() []MetricsCollectorConfig { + c := newFakeMetricsCollectorConfig(testCollectorKind) + c.ImagePullPolicy = corev1.PullNever + return []MetricsCollectorConfig{*c} + }(), + wantConfig: func() []MetricsCollectorConfig { + c := newFakeMetricsCollectorConfig(testCollectorKind) + c.ImagePullPolicy = corev1.PullNever + return []MetricsCollectorConfig{*c} + }(), + }, + fmt.Sprintf("GetMetricsConfigData sets %s to imagePullPolicy", consts.DefaultImagePullPolicy): { + config: func() []MetricsCollectorConfig { + c := newFakeMetricsCollectorConfig(testCollectorKind) + c.ImagePullPolicy = "" + return []MetricsCollectorConfig{*c} + }(), + wantConfig: []MetricsCollectorConfig{*newFakeMetricsCollectorConfig(testCollectorKind)}, + }, + "GetMetricsConfigData nukes resource.requests and resource.limits for the metrics collector": { + config: func() []MetricsCollectorConfig { + c := newFakeMetricsCollectorConfig(testCollectorKind) + c.Resource = corev1.ResourceRequirements{ + Requests: nukeResourceRequirements, + Limits: nukeResourceRequirements, + } + return []MetricsCollectorConfig{*c} + }(), + wantConfig: func() []MetricsCollectorConfig { + c := newFakeMetricsCollectorConfig(testCollectorKind) + c.Resource = corev1.ResourceRequirements{ + Requests: map[corev1.ResourceName]resource.Quantity{}, + Limits: map[corev1.ResourceName]resource.Quantity{}, + } + return []MetricsCollectorConfig{*c} + }(), + }, + } + for name, tc := range cases { + t.Run(name, func(t *testing.T) { + kc := &KatibConfig{ + RuntimeConfig: RuntimeConfig{ + MetricsCollectorConfigs: tc.config, + }, + } + SetDefaults_KatibConfig(kc) + if diff := cmp.Diff(tc.wantConfig, kc.RuntimeConfig.MetricsCollectorConfigs); len(diff) != 0 { + t.Errorf("Unexpected MetricsCollectorConfigs (-want,+got):\n%s", diff) + } + }) + } +} + +func TestSetInitConfig(t *testing.T) { + cases := map[string]struct { + config InitConfig + wantConfig InitConfig + }{ + "All parameters correctly are specified": { + config: InitConfig{ + ControllerConfig: ControllerConfig{ + ExperimentSuggestionName: "test", + MetricsAddr: ":8081", + HealthzAddr: ":18081", + InjectSecurityContext: pointer.Bool(true), + EnableGRPCProbeInSuggestion: pointer.Bool(false), + TrialResources: []string{ + "Job.v1.batch", + "TFJob.v1.kubeflow.org", + }, + WebhookPort: pointer.Int(18443), + EnableLeaderElection: pointer.Bool(true), + LeaderElectionID: "xyz0123", + }, + }, + wantConfig: InitConfig{ + ControllerConfig: ControllerConfig{ + ExperimentSuggestionName: "test", + MetricsAddr: ":8081", + HealthzAddr: ":18081", + InjectSecurityContext: pointer.Bool(true), + EnableGRPCProbeInSuggestion: pointer.Bool(false), + TrialResources: []string{ + "Job.v1.batch", + "TFJob.v1.kubeflow.org", + }, + WebhookPort: pointer.Int(18443), + EnableLeaderElection: pointer.Bool(true), + LeaderElectionID: "xyz0123", + }, + }, + }, + "ControllerConfig is empty": { + config: InitConfig{ + ControllerConfig: ControllerConfig{}, + }, + wantConfig: InitConfig{ + ControllerConfig: ControllerConfig{ + ExperimentSuggestionName: DefaultExperimentSuggestionName, + MetricsAddr: DefaultMetricsAddr, + HealthzAddr: DefaultHealthzAddr, + InjectSecurityContext: &DefaultInjectSecurityContext, + EnableGRPCProbeInSuggestion: &DefaultEnableGRPCProbeInSuggestion, + TrialResources: DefaultTrialResources, + WebhookPort: &DefaultWebhookPort, + EnableLeaderElection: &DefaultEnableLeaderElection, + LeaderElectionID: DefaultLeaderElectionID, + }, + }, + }, + } + for name, tc := range cases { + t.Run(name, func(t *testing.T) { + kc := &KatibConfig{ + InitConfig: tc.config, + } + SetDefaults_KatibConfig(kc) + if diff := cmp.Diff(tc.wantConfig, kc.InitConfig); len(diff) != 0 { + t.Errorf("Unexpected InitConfig (-want,+got):\n%s", diff) + } + }) + } +} + +func newFakeSuggestionConfig(algorithmName string) *SuggestionConfig { + defaultVolumeStorage, _ := resource.ParseQuantity(consts.DefaultSuggestionVolumeStorage) + + return &SuggestionConfig{ + AlgorithmName: algorithmName, + Container: corev1.Container{ + Image: "suggestion-image", + ImagePullPolicy: consts.DefaultImagePullPolicy, + Resources: *setFakeResourceRequirements(), + }, + VolumeMountPath: consts.DefaultContainerSuggestionVolumeMountPath, + PersistentVolumeClaimSpec: corev1.PersistentVolumeClaimSpec{ + AccessModes: []corev1.PersistentVolumeAccessMode{ + consts.DefaultSuggestionVolumeAccessMode, + }, + Resources: corev1.ResourceRequirements{ + Requests: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceStorage: defaultVolumeStorage, + }, + }, + }, + PersistentVolumeSpec: corev1.PersistentVolumeSpec{ + PersistentVolumeReclaimPolicy: corev1.PersistentVolumeReclaimDelete, + }, + } +} + +func newFakeEarlyStoppingConfig(algorithmName string) *EarlyStoppingConfig { + return &EarlyStoppingConfig{ + AlgorithmName: algorithmName, + Image: "early-stopping-image", + ImagePullPolicy: consts.DefaultImagePullPolicy, + Resource: *setFakeResourceRequirements(), + } +} + +func newFakeMetricsCollectorConfig(collectorKind commonv1beta1.CollectorKind) *MetricsCollectorConfig { + return &MetricsCollectorConfig{ + CollectorKind: string(collectorKind), + Image: "metrics-collector-image", + ImagePullPolicy: consts.DefaultImagePullPolicy, + Resource: *setFakeResourceRequirements(), + } +} + +func setFakeResourceRequirements() *corev1.ResourceRequirements { + defaultCPURequest, _ := resource.ParseQuantity(consts.DefaultCPURequest) + defaultMemoryRequest, _ := resource.ParseQuantity(consts.DefaultMemRequest) + defaultEphemeralStorageRequest, _ := resource.ParseQuantity(consts.DefaultDiskRequest) + + defaultCPULimit, _ := resource.ParseQuantity(consts.DefaultCPULimit) + defaultMemoryLimit, _ := resource.ParseQuantity(consts.DefaultMemLimit) + defaultEphemeralStorageLimit, _ := resource.ParseQuantity(consts.DefaultDiskLimit) + + return &corev1.ResourceRequirements{ + Requests: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: defaultCPURequest, + corev1.ResourceMemory: defaultMemoryRequest, + corev1.ResourceEphemeralStorage: defaultEphemeralStorageRequest, + }, + Limits: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: defaultCPULimit, + corev1.ResourceMemory: defaultMemoryLimit, + corev1.ResourceEphemeralStorage: defaultEphemeralStorageLimit, + }, + } +} + +func newFakeCustomResourceRequirements() *corev1.ResourceRequirements { + customCPURequest, _ := resource.ParseQuantity("25m") + customMemoryRequest, _ := resource.ParseQuantity("200Mi") + customEphemeralStorageRequest, _ := resource.ParseQuantity("550Mi") + + customCPULimit, _ := resource.ParseQuantity("250m") + customMemoryLimit, _ := resource.ParseQuantity("2Gi") + customEphemeralStorageLimit, _ := resource.ParseQuantity("15Gi") + + return &corev1.ResourceRequirements{ + Requests: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: customCPURequest, + corev1.ResourceMemory: customMemoryRequest, + corev1.ResourceEphemeralStorage: customEphemeralStorageRequest, + }, + Limits: map[corev1.ResourceName]resource.Quantity{ + corev1.ResourceCPU: customCPULimit, + corev1.ResourceMemory: customMemoryLimit, + corev1.ResourceEphemeralStorage: customEphemeralStorageLimit, + }, + } +} diff --git a/pkg/apis/config/v1beta1/groupversion_info.go b/pkg/apis/config/v1beta1/groupversion_info.go new file mode 100644 index 00000000000..02adaa73645 --- /dev/null +++ b/pkg/apis/config/v1beta1/groupversion_info.go @@ -0,0 +1,41 @@ +/* +Copyright 2023 The Kubeflow Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package v1beta1 contains API Schema definitions for the config v1beta1 API group +// +kubebuilder:object:generate=true +// +groupName=config.kubeflow.org +package v1beta1 + +import ( + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/controller-runtime/pkg/scheme" +) + +var ( + // GroupVersion is group version used to register these objects + GroupVersion = schema.GroupVersion{Group: "config.kubeflow.org", Version: "v1beta1"} + + // SchemeBuilder is used to add go types to the GroupVersionKind scheme + SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} + + // AddToScheme adds the types in this group-version to the given scheme. + AddToScheme = SchemeBuilder.AddToScheme +) + +func init() { + SchemeBuilder.Register(&KatibConfig{}) + SchemeBuilder.SchemeBuilder.Register(addDefaultingFuncs) +} diff --git a/pkg/apis/config/v1beta1/types.go b/pkg/apis/config/v1beta1/types.go new file mode 100644 index 00000000000..489f71cb11f --- /dev/null +++ b/pkg/apis/config/v1beta1/types.go @@ -0,0 +1,108 @@ +/* +Copyright 2023 The Kubeflow Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1beta1 + +import ( + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// +kubebuilder:object:root=true + +// KatibConfig is the YAML katib-config.yaml structure in Katib config. +type KatibConfig struct { + metav1.TypeMeta `json:",inline"` + + RuntimeConfig RuntimeConfig `json:"runtime,omitempty"` + InitConfig InitConfig `json:"init,omitempty"` +} + +// RuntimeConfig is the YAML runtime structure in Katib config. +type RuntimeConfig struct { + SuggestionConfigs []SuggestionConfig `json:"suggestions,omitempty"` + EarlyStoppingConfigs []EarlyStoppingConfig `json:"earlyStoppings,omitempty"` + MetricsCollectorConfigs []MetricsCollectorConfig `json:"metricsCollectorSideCars,omitempty"` +} + +// InitConfig is the YAML init structure in Katib config. +type InitConfig struct { + ControllerConfig ControllerConfig `json:"controller,omitempty"` + + // TODO (tenzen-y): Adding a config for the webhook certs here. + // Ref: https://github.com/kubeflow/katib/issues/2149 +} + +// ControllerConfig is the YAML controller structure in Katib config. +type ControllerConfig struct { + // ExperimentSuggestionName is the implementation of suggestion interface in experiment controller. + // Defaults to 'default'. + ExperimentSuggestionName string `json:"experimentSuggestionName,omitempty"` + // MetricsAddr is the address the metric endpoint binds to. + // Defaults to ':8080'. + MetricsAddr string `json:"metricsAddr,omitempty"` + // HealthzAddr is the address the healthz endpoint binds to. + // Defaults to ':18080'. + HealthzAddr string `json:"healthzAddr,omitempty"` + // InjectSecurityContext indicates whether inject the securityContext of container[0] in the sidecar. + // Defaults to 'false'. + InjectSecurityContext *bool `json:"injectSecurityContext,omitempty"` + // EnableGRPCProbeInSuggestion indicates whether enable grpc probe in suggestions. + // Defaults to 'true'. + EnableGRPCProbeInSuggestion *bool `json:"enableGRPCProbeInSuggestion,omitempty"` + // TrialResources is the list of resources that can be used as trial template, + // in the form: Kind.Version.Group (e.g. TFJob.v1.kubeflow.org) + // It doesn't set as a default. + TrialResources []string `json:"trialResources,omitempty"` + // WebhookPort is the port number to be used for admission webhook server. + // Defaults to '8443'. + WebhookPort *int `json:"webhookPort,omitempty"` + // EnableLeaderElection indicates whether enable leader election for katib-controller. + // Enabling this will ensure there is only one active katib-controller. + // Defaults to 'false'. + EnableLeaderElection *bool `json:"enableLeaderElection,omitempty"` + // LeaderElectionID is the ID for leader election. + // Defaults to '3fbc96e9.katib.kubeflow.org'. + LeaderElectionID string `json:"leaderElectionID,omitempty"` +} + +// SuggestionConfig is the YAML suggestion structure in Katib config. +type SuggestionConfig struct { + AlgorithmName string `json:"algorithmName"` + corev1.Container `json:",inline"` + ServiceAccountName string `json:"serviceAccountName,omitempty"` + VolumeMountPath string `json:"volumeMountPath,omitempty"` + PersistentVolumeClaimSpec corev1.PersistentVolumeClaimSpec `json:"persistentVolumeClaimSpec,omitempty"` + PersistentVolumeSpec corev1.PersistentVolumeSpec `json:"persistentVolumeSpec,omitempty"` + PersistentVolumeLabels map[string]string `json:"persistentVolumeLabels,omitempty"` +} + +// EarlyStoppingConfig is the YAML early stopping structure in Katib config. +type EarlyStoppingConfig struct { + AlgorithmName string `json:"algorithmName"` + Image string `json:"image"` + ImagePullPolicy corev1.PullPolicy `json:"imagePullPolicy,omitempty"` + Resource corev1.ResourceRequirements `json:"resources,omitempty"` +} + +// MetricsCollectorConfig is the YAML metrics collector structure in Katib config. +type MetricsCollectorConfig struct { + CollectorKind string `json:"collectorKind"` + Image string `json:"image"` + ImagePullPolicy corev1.PullPolicy `json:"imagePullPolicy,omitempty"` + Resource corev1.ResourceRequirements `json:"resources,omitempty"` + WaitAllProcesses *bool `json:"waitAllProcesses,omitempty"` +} diff --git a/pkg/apis/config/v1beta1/util.go b/pkg/apis/config/v1beta1/util.go new file mode 100644 index 00000000000..22115b68a41 --- /dev/null +++ b/pkg/apis/config/v1beta1/util.go @@ -0,0 +1,21 @@ +package v1beta1 + +import ( + "errors" + + "k8s.io/apimachinery/pkg/runtime/schema" +) + +var ErrInvalidGVKFormat = errors.New("invalid GroupVersionKinds") + +func TrialResourcesToGVKs(trialResources []string) ([]schema.GroupVersionKind, error) { + gvks := make([]schema.GroupVersionKind, 0, len(trialResources)) + for i := range trialResources { + gvk, _ := schema.ParseKindArg(trialResources[i]) + if gvk == nil || gvk.Empty() { + return nil, ErrInvalidGVKFormat + } + gvks = append(gvks, *gvk) + } + return gvks, nil +} diff --git a/pkg/apis/config/v1beta1/zz_generated.deepcopy.go b/pkg/apis/config/v1beta1/zz_generated.deepcopy.go new file mode 100644 index 00000000000..b28a44cb37e --- /dev/null +++ b/pkg/apis/config/v1beta1/zz_generated.deepcopy.go @@ -0,0 +1,206 @@ +//go:build !ignore_autogenerated +// +build !ignore_autogenerated + +/* +Copyright 2022 The Kubeflow Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by controller-gen. DO NOT EDIT. + +package v1beta1 + +import ( + "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ControllerConfig) DeepCopyInto(out *ControllerConfig) { + *out = *in + if in.InjectSecurityContext != nil { + in, out := &in.InjectSecurityContext, &out.InjectSecurityContext + *out = new(bool) + **out = **in + } + if in.EnableGRPCProbeInSuggestion != nil { + in, out := &in.EnableGRPCProbeInSuggestion, &out.EnableGRPCProbeInSuggestion + *out = new(bool) + **out = **in + } + if in.TrialResources != nil { + in, out := &in.TrialResources, &out.TrialResources + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.WebhookPort != nil { + in, out := &in.WebhookPort, &out.WebhookPort + *out = new(int) + **out = **in + } + if in.EnableLeaderElection != nil { + in, out := &in.EnableLeaderElection, &out.EnableLeaderElection + *out = new(bool) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ControllerConfig. +func (in *ControllerConfig) DeepCopy() *ControllerConfig { + if in == nil { + return nil + } + out := new(ControllerConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *EarlyStoppingConfig) DeepCopyInto(out *EarlyStoppingConfig) { + *out = *in + in.Resource.DeepCopyInto(&out.Resource) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new EarlyStoppingConfig. +func (in *EarlyStoppingConfig) DeepCopy() *EarlyStoppingConfig { + if in == nil { + return nil + } + out := new(EarlyStoppingConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *InitConfig) DeepCopyInto(out *InitConfig) { + *out = *in + in.ControllerConfig.DeepCopyInto(&out.ControllerConfig) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new InitConfig. +func (in *InitConfig) DeepCopy() *InitConfig { + if in == nil { + return nil + } + out := new(InitConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *KatibConfig) DeepCopyInto(out *KatibConfig) { + *out = *in + out.TypeMeta = in.TypeMeta + in.RuntimeConfig.DeepCopyInto(&out.RuntimeConfig) + in.InitConfig.DeepCopyInto(&out.InitConfig) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KatibConfig. +func (in *KatibConfig) DeepCopy() *KatibConfig { + if in == nil { + return nil + } + out := new(KatibConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *KatibConfig) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MetricsCollectorConfig) DeepCopyInto(out *MetricsCollectorConfig) { + *out = *in + in.Resource.DeepCopyInto(&out.Resource) + if in.WaitAllProcesses != nil { + in, out := &in.WaitAllProcesses, &out.WaitAllProcesses + *out = new(bool) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MetricsCollectorConfig. +func (in *MetricsCollectorConfig) DeepCopy() *MetricsCollectorConfig { + if in == nil { + return nil + } + out := new(MetricsCollectorConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RuntimeConfig) DeepCopyInto(out *RuntimeConfig) { + *out = *in + if in.SuggestionConfigs != nil { + in, out := &in.SuggestionConfigs, &out.SuggestionConfigs + *out = make([]SuggestionConfig, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.EarlyStoppingConfigs != nil { + in, out := &in.EarlyStoppingConfigs, &out.EarlyStoppingConfigs + *out = make([]EarlyStoppingConfig, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } + if in.MetricsCollectorConfigs != nil { + in, out := &in.MetricsCollectorConfigs, &out.MetricsCollectorConfigs + *out = make([]MetricsCollectorConfig, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RuntimeConfig. +func (in *RuntimeConfig) DeepCopy() *RuntimeConfig { + if in == nil { + return nil + } + out := new(RuntimeConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *SuggestionConfig) DeepCopyInto(out *SuggestionConfig) { + *out = *in + in.Container.DeepCopyInto(&out.Container) + in.PersistentVolumeClaimSpec.DeepCopyInto(&out.PersistentVolumeClaimSpec) + in.PersistentVolumeSpec.DeepCopyInto(&out.PersistentVolumeSpec) + if in.PersistentVolumeLabels != nil { + in, out := &in.PersistentVolumeLabels, &out.PersistentVolumeLabels + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SuggestionConfig. +func (in *SuggestionConfig) DeepCopy() *SuggestionConfig { + if in == nil { + return nil + } + out := new(SuggestionConfig) + in.DeepCopyInto(out) + return out +} diff --git a/pkg/apis/controller/apis.go b/pkg/apis/controller/apis.go index 435d1dc2cae..e4a40787b7f 100644 --- a/pkg/apis/controller/apis.go +++ b/pkg/apis/controller/apis.go @@ -14,12 +14,6 @@ See the License for the specific language governing permissions and limitations under the License. */ -// Generate deepcopy, clientset, listers, informers for apis -//go:generate ../../../hack/update-codegen.sh - -// Generate open-api for apis -//go:generate ../../../hack/update-openapigen.sh - // Package apis contains Kubernetes API groups. package apis diff --git a/pkg/controller.v1beta1/consts/const.go b/pkg/controller.v1beta1/consts/const.go index 2bfeadd30ad..b745486147a 100644 --- a/pkg/controller.v1beta1/consts/const.go +++ b/pkg/controller.v1beta1/consts/const.go @@ -118,6 +118,8 @@ const ( LabelMetricsCollectorSidecar = "metrics-collector-sidecar" // LabelEarlyStoppingTag is the name of early stopping config in Katib configmap. LabelEarlyStoppingTag = "early-stopping" + // LabelKatibConfigTag is the name of KatibConfig in Katib copnfigmap. + LabelKatibConfigTag = "katib-config.yaml" // DefaultImagePullPolicy is the default value for image pull policy. DefaultImagePullPolicy = corev1.PullIfNotPresent // DefaultCPULimit is the default value for CPU limit. diff --git a/pkg/controller.v1beta1/experiment/manifest/generator.go b/pkg/controller.v1beta1/experiment/manifest/generator.go index 3c55e462769..7973d151abc 100644 --- a/pkg/controller.v1beta1/experiment/manifest/generator.go +++ b/pkg/controller.v1beta1/experiment/manifest/generator.go @@ -24,6 +24,7 @@ import ( "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "sigs.k8s.io/controller-runtime/pkg/client" + configapi "github.com/kubeflow/katib/pkg/apis/config/v1beta1" commonapiv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1" experimentsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1beta1" "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" @@ -37,9 +38,9 @@ type Generator interface { InjectClient(c client.Client) GetTrialTemplate(instance *experimentsv1beta1.Experiment) (string, error) GetRunSpecWithHyperParameters(experiment *experimentsv1beta1.Experiment, trialName, trialNamespace string, assignments []commonapiv1beta1.ParameterAssignment) (*unstructured.Unstructured, error) - GetSuggestionConfigData(algorithmName string) (katibconfig.SuggestionConfig, error) - GetEarlyStoppingConfigData(algorithmName string) (katibconfig.EarlyStoppingConfig, error) - GetMetricsCollectorConfigData(cKind commonapiv1beta1.CollectorKind) (katibconfig.MetricsCollectorConfig, error) + GetSuggestionConfigData(algorithmName string) (configapi.SuggestionConfig, error) + GetEarlyStoppingConfigData(algorithmName string) (configapi.EarlyStoppingConfig, error) + GetMetricsCollectorConfigData(cKind commonapiv1beta1.CollectorKind) (configapi.MetricsCollectorConfig, error) } // DefaultGenerator is the default implementation of Generator. @@ -60,17 +61,17 @@ func (g *DefaultGenerator) InjectClient(c client.Client) { } // GetMetricsCollectorConfigData returns metrics collector configuration for a given collector kind. -func (g *DefaultGenerator) GetMetricsCollectorConfigData(cKind commonapiv1beta1.CollectorKind) (katibconfig.MetricsCollectorConfig, error) { +func (g *DefaultGenerator) GetMetricsCollectorConfigData(cKind commonapiv1beta1.CollectorKind) (configapi.MetricsCollectorConfig, error) { return katibconfig.GetMetricsCollectorConfigData(cKind, g.client.GetClient()) } // GetSuggestionConfigData returns suggestion configuration for a given algorithm name. -func (g *DefaultGenerator) GetSuggestionConfigData(algorithmName string) (katibconfig.SuggestionConfig, error) { +func (g *DefaultGenerator) GetSuggestionConfigData(algorithmName string) (configapi.SuggestionConfig, error) { return katibconfig.GetSuggestionConfigData(algorithmName, g.client.GetClient()) } // GetEarlyStoppingConfigData returns early stopping configuration for a given algorithm. -func (g *DefaultGenerator) GetEarlyStoppingConfigData(algorithmName string) (katibconfig.EarlyStoppingConfig, error) { +func (g *DefaultGenerator) GetEarlyStoppingConfigData(algorithmName string) (configapi.EarlyStoppingConfig, error) { return katibconfig.GetEarlyStoppingConfigData(algorithmName, g.client.GetClient()) } diff --git a/pkg/controller.v1beta1/suggestion/composer/composer.go b/pkg/controller.v1beta1/suggestion/composer/composer.go index 8cc508bad9d..6846d919f7a 100644 --- a/pkg/controller.v1beta1/suggestion/composer/composer.go +++ b/pkg/controller.v1beta1/suggestion/composer/composer.go @@ -30,6 +30,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/manager" + configapi "github.com/kubeflow/katib/pkg/apis/config/v1beta1" experimentsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1beta1" suggestionsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/suggestions/v1beta1" trialsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/trials/v1beta1" @@ -83,7 +84,7 @@ func (g *General) DesiredDeployment(s *suggestionsv1beta1.Suggestion) (*appsv1.D } // If early stopping is used, get the config data. - earlyStoppingConfigData := katibconfig.EarlyStoppingConfig{} + earlyStoppingConfigData := configapi.EarlyStoppingConfig{} if s.Spec.EarlyStopping != nil && s.Spec.EarlyStopping.AlgorithmName != "" { earlyStoppingConfigData, err = katibconfig.GetEarlyStoppingConfigData(s.Spec.EarlyStopping.AlgorithmName, g.Client) if err != nil { @@ -183,8 +184,8 @@ func (g *General) DesiredService(s *suggestionsv1beta1.Suggestion) (*corev1.Serv } func (g *General) desiredContainers(s *suggestionsv1beta1.Suggestion, - suggestionConfigData katibconfig.SuggestionConfig, - earlyStoppingConfigData katibconfig.EarlyStoppingConfig) []corev1.Container { + suggestionConfigData configapi.SuggestionConfig, + earlyStoppingConfigData configapi.EarlyStoppingConfig) []corev1.Container { var ( containers []corev1.Container diff --git a/pkg/controller.v1beta1/suggestion/composer/composer_test.go b/pkg/controller.v1beta1/suggestion/composer/composer_test.go index c705828c7d7..6b51d19f47e 100644 --- a/pkg/controller.v1beta1/suggestion/composer/composer_test.go +++ b/pkg/controller.v1beta1/suggestion/composer/composer_test.go @@ -31,6 +31,7 @@ import ( "github.com/onsi/gomega" "github.com/spf13/viper" + "gopkg.in/yaml.v3" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" @@ -45,13 +46,13 @@ import ( "sigs.k8s.io/controller-runtime/pkg/envtest" "sigs.k8s.io/controller-runtime/pkg/manager" + configapi "github.com/kubeflow/katib/pkg/apis/config/v1beta1" apis "github.com/kubeflow/katib/pkg/apis/controller" commonv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1" experimentsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1beta1" suggestionsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/suggestions/v1beta1" trialsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/trials/v1beta1" "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" - "github.com/kubeflow/katib/pkg/util/v1beta1/katibconfig" ) var ( @@ -132,6 +133,7 @@ func TestDesiredDeployment(t *testing.T) { mgr, err := manager.New(cfg, manager.Options{MetricsBindAddress: "0"}) g.Expect(err).NotTo(gomega.HaveOccurred()) + g.Expect(configapi.AddToScheme(mgr.GetScheme())).NotTo(gomega.HaveOccurred()) // Start test manager. wg := &sync.WaitGroup{} @@ -175,12 +177,13 @@ func TestDesiredDeployment(t *testing.T) { suggestion: newFakeSuggestion(), configMap: func() *corev1.ConfigMap { cm := newFakeKatibConfig(newFakeSuggestionConfig(), newFakeEarlyStoppingConfig()) - cm.Data["suggestion"] = strings.ReplaceAll(cm.Data["suggestion"], string(imagePullPolicy), "invalid") + cm.Data[consts.LabelKatibConfigTag] = strings.ReplaceAll(cm.Data[consts.LabelKatibConfigTag], string(imagePullPolicy), "invalid") return cm }(), expectedDeployment: func() *appsv1.Deployment { deploy := newFakeDeployment() - deploy.Spec.Template.Spec.Containers[0].ImagePullPolicy = corev1.PullIfNotPresent + deploy.Spec.Template.Spec.Containers[0].ImagePullPolicy = consts.DefaultImagePullPolicy + deploy.Spec.Template.Spec.Containers[1].ImagePullPolicy = consts.DefaultImagePullPolicy return deploy }(), err: false, @@ -190,7 +193,7 @@ func TestDesiredDeployment(t *testing.T) { suggestion: newFakeSuggestion(), configMap: func() *corev1.ConfigMap { cm := newFakeKatibConfig(newFakeSuggestionConfig(), newFakeEarlyStoppingConfig()) - cm.Data["suggestion"] = strings.ReplaceAll(cm.Data["suggestion"], cpu, "invalid") + cm.Data[consts.LabelKatibConfigTag] = strings.ReplaceAll(cm.Data[consts.LabelKatibConfigTag], cpu, "invalid") return cm }(), err: true, @@ -374,11 +377,11 @@ func TestDesiredService(t *testing.T) { } func TestDesiredVolume(t *testing.T) { - g := gomega.NewGomegaWithT(t) mgr, err := manager.New(cfg, manager.Options{MetricsBindAddress: "0"}) g.Expect(err).NotTo(gomega.HaveOccurred()) + g.Expect(configapi.AddToScheme(mgr.GetScheme())).NotTo(gomega.HaveOccurred()) // Start test manager. wg := &sync.WaitGroup{} @@ -640,12 +643,13 @@ func metaEqual(expected, actual metav1.ObjectMeta) bool { len(actual.OwnerReferences) == 0) } -func newFakeSuggestionConfig() katibconfig.SuggestionConfig { +func newFakeSuggestionConfig() configapi.SuggestionConfig { cpuQ, _ := resource.ParseQuantity(cpu) memoryQ, _ := resource.ParseQuantity(memory) diskQ, _ := resource.ParseQuantity(disk) - return katibconfig.SuggestionConfig{ + return configapi.SuggestionConfig{ + AlgorithmName: suggestionAlgorithm, Container: corev1.Container{ Image: image, ImagePullPolicy: imagePullPolicy, @@ -666,12 +670,13 @@ func newFakeSuggestionConfig() katibconfig.SuggestionConfig { } } -func newFakeEarlyStoppingConfig() katibconfig.EarlyStoppingConfig { +func newFakeEarlyStoppingConfig() configapi.EarlyStoppingConfig { cpuQ, _ := resource.ParseQuantity(cpu) memoryQ, _ := resource.ParseQuantity(memory) diskQ, _ := resource.ParseQuantity(disk) - return katibconfig.EarlyStoppingConfig{ + return configapi.EarlyStoppingConfig{ + AlgorithmName: earlyStoppingAlgorithm, Image: image, ImagePullPolicy: imagePullPolicy, Resource: corev1.ResourceRequirements{ @@ -689,28 +694,36 @@ func newFakeEarlyStoppingConfig() katibconfig.EarlyStoppingConfig { } } -func newFakeKatibConfig(suggestionConfig katibconfig.SuggestionConfig, earlyStoppingConfig katibconfig.EarlyStoppingConfig) *corev1.ConfigMap { - - jsonConfigSuggestion := map[string]katibconfig.SuggestionConfig{ - suggestionAlgorithm: suggestionConfig, +func newFakeKatibConfig(suggestionConfig configapi.SuggestionConfig, earlyStoppingConfig configapi.EarlyStoppingConfig) *corev1.ConfigMap { + katibConfig := configapi.KatibConfig{ + RuntimeConfig: configapi.RuntimeConfig{ + SuggestionConfigs: []configapi.SuggestionConfig{ + suggestionConfig, + }, + EarlyStoppingConfigs: []configapi.EarlyStoppingConfig{ + earlyStoppingConfig, + }, + }, } - - bSuggestion, _ := json.Marshal(jsonConfigSuggestion) - - jsonConfigEarlyStopping := map[string]katibconfig.EarlyStoppingConfig{ - earlyStoppingAlgorithm: earlyStoppingConfig, + bKatibConfig, err := json.Marshal(katibConfig) + if err != nil { + stdlog.Fatal(err) + } + yamlKatibConfig := make(map[string]interface{}) + if err = yaml.Unmarshal(bKatibConfig, yamlKatibConfig); err != nil { + stdlog.Fatal(err) + } + bKatibConfig, err = yaml.Marshal(yamlKatibConfig) + if err != nil { + stdlog.Fatal(err) } - - bEarlyStopping, _ := json.Marshal(jsonConfigEarlyStopping) - return &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: configMap, Namespace: namespace, }, Data: map[string]string{ - consts.LabelSuggestionTag: string(bSuggestion), - consts.LabelEarlyStoppingTag: string(bEarlyStopping), + consts.LabelKatibConfigTag: string(bKatibConfig), }, } } diff --git a/pkg/controller.v1beta1/suggestion/suggestion_controller_test.go b/pkg/controller.v1beta1/suggestion/suggestion_controller_test.go index b876ebe0e7a..e966459fa0d 100644 --- a/pkg/controller.v1beta1/suggestion/suggestion_controller_test.go +++ b/pkg/controller.v1beta1/suggestion/suggestion_controller_test.go @@ -19,6 +19,7 @@ package suggestion import ( "encoding/json" "fmt" + stdlog "log" "strings" "sync" "testing" @@ -26,6 +27,7 @@ import ( "github.com/golang/mock/gomock" "github.com/onsi/gomega" + "gopkg.in/yaml.v3" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" @@ -36,6 +38,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/manager" + configapi "github.com/kubeflow/katib/pkg/apis/config/v1beta1" commonv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1" experimentsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1beta1" suggestionsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/suggestions/v1beta1" @@ -44,7 +47,6 @@ import ( "github.com/kubeflow/katib/pkg/controller.v1beta1/suggestion/composer" "github.com/kubeflow/katib/pkg/controller.v1beta1/util" suggestionclientmock "github.com/kubeflow/katib/pkg/mock/v1beta1/suggestion/suggestionclient" - "github.com/kubeflow/katib/pkg/util/v1beta1/katibconfig" ) const ( @@ -82,6 +84,7 @@ func TestReconcile(t *testing.T) { // channel when it is finished. mgr, err := manager.New(cfg, manager.Options{MetricsBindAddress: "0"}) g.Expect(err).NotTo(gomega.HaveOccurred()) + g.Expect(configapi.AddToScheme(mgr.GetScheme())).NotTo(gomega.HaveOccurred()) c := mgr.GetClient() r := &ReconcileSuggestion{ @@ -433,32 +436,44 @@ func newFakeInstance() *suggestionsv1beta1.Suggestion { func newKatibConfigMapInstance() *corev1.ConfigMap { // Create suggestion config - suggestionConfig := map[string]katibconfig.SuggestionConfig{ - "random": { - Container: corev1.Container{ - Image: suggestionImage, + katibConfig := configapi.KatibConfig{ + RuntimeConfig: configapi.RuntimeConfig{ + SuggestionConfigs: []configapi.SuggestionConfig{ + { + AlgorithmName: "random", + Container: corev1.Container{ + Image: suggestionImage, + }, + }, + }, + EarlyStoppingConfigs: []configapi.EarlyStoppingConfig{ + { + AlgorithmName: "median-stop", + Image: "test-image", + ImagePullPolicy: corev1.PullAlways, + }, }, }, } - bSuggestionConfig, _ := json.Marshal(suggestionConfig) - - // Create early stopping config - earlyStoppingConfig := map[string]katibconfig.EarlyStoppingConfig{ - "median-stop": { - Image: "test-image", - ImagePullPolicy: corev1.PullAlways, - }, + bKatibConfig, err := json.Marshal(katibConfig) + if err != nil { + stdlog.Fatal(err) + } + yamlKatibConfig := make(map[string]interface{}) + if err = yaml.Unmarshal(bKatibConfig, yamlKatibConfig); err != nil { + stdlog.Fatal(err) + } + bKatibConfig, err = yaml.Marshal(yamlKatibConfig) + if err != nil { + stdlog.Fatal(err) } - bEarlyStoppingConfig, _ := json.Marshal(earlyStoppingConfig) - return &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: katibConfigName, Namespace: namespace, }, Data: map[string]string{ - consts.LabelSuggestionTag: string(bSuggestionConfig), - consts.LabelEarlyStoppingTag: string(bEarlyStoppingConfig), + consts.LabelKatibConfigTag: string(bKatibConfig), }, } } diff --git a/pkg/controller.v1beta1/trial/trial_controller.go b/pkg/controller.v1beta1/trial/trial_controller.go index 2245bb16cfa..d56a59de0a9 100644 --- a/pkg/controller.v1beta1/trial/trial_controller.go +++ b/pkg/controller.v1beta1/trial/trial_controller.go @@ -97,7 +97,7 @@ func add(mgr manager.Manager, r reconcile.Reconciler) error { trialResources := viper.Get(consts.ConfigTrialResources) if trialResources != nil { // Cast interface to gvk slice object - gvkList := trialResources.(trialutil.GvkListFlag) + gvkList := trialResources.([]schema.GroupVersionKind) // Watch for changes in custom resources for _, gvk := range gvkList { diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index 0867128a395..9cb21ca304c 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -17,6 +17,7 @@ limitations under the License. package trial import ( + "k8s.io/apimachinery/pkg/runtime/schema" "sync" "testing" "time" @@ -65,7 +66,7 @@ func TestAdd(t *testing.T) { g.Expect(err).NotTo(gomega.HaveOccurred()) // Set Trial resources. - trialResources := trialutil.GvkListFlag{ + trialResources := []schema.GroupVersionKind{ { Group: "kubeflow.org", Version: "v1", @@ -121,7 +122,7 @@ func TestReconcileBatchJob(t *testing.T) { recFn := SetupTestReconcile(r) // Set Job resource - trialResources := trialutil.GvkListFlag{ + trialResources := []schema.GroupVersionKind{ { Group: "batch", Version: "v1", diff --git a/pkg/controller.v1beta1/trial/util/flag_util.go b/pkg/controller.v1beta1/trial/util/flag_util.go index a100a0ff91e..6ae1840cf37 100644 --- a/pkg/controller.v1beta1/trial/util/flag_util.go +++ b/pkg/controller.v1beta1/trial/util/flag_util.go @@ -17,30 +17,8 @@ limitations under the License. package util import ( - "fmt" - "strings" - "k8s.io/apimachinery/pkg/runtime/schema" ) -// GvkListFlag is the custom flag to parse GroupVersionKind list for trial resources. -type GvkListFlag []schema.GroupVersionKind - -// Set is the method to convert gvk to string value -func (flag *GvkListFlag) String() string { - gvkStrings := []string{} - for _, x := range []schema.GroupVersionKind(*flag) { - gvkStrings = append(gvkStrings, x.String()) - } - return strings.Join(gvkStrings, ",") -} - -// Set is the method to set gvk from string flag value -func (flag *GvkListFlag) Set(value string) error { - gvk, _ := schema.ParseKindArg(value) - if gvk == nil { - return fmt.Errorf("Invalid GroupVersionKind: %v", value) - } - *flag = append(*flag, *gvk) - return nil -} +// GroupVersionKinds is the custom object to parse GroupVersionKinds for trial resources. +type GroupVersionKinds []schema.GroupVersionKind diff --git a/pkg/mock/v1beta1/experiment/manifest/generator.go b/pkg/mock/v1beta1/experiment/manifest/generator.go index 462a15e58c4..cdc286f50aa 100644 --- a/pkg/mock/v1beta1/experiment/manifest/generator.go +++ b/pkg/mock/v1beta1/experiment/manifest/generator.go @@ -8,9 +8,9 @@ import ( reflect "reflect" gomock "github.com/golang/mock/gomock" - v1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1" - v1beta10 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1beta1" - katibconfig "github.com/kubeflow/katib/pkg/util/v1beta1/katibconfig" + v1beta1 "github.com/kubeflow/katib/pkg/apis/config/v1beta1" + v1beta10 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1" + v1beta11 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1beta1" unstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" client "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -39,10 +39,10 @@ func (m *MockGenerator) EXPECT() *MockGeneratorMockRecorder { } // GetEarlyStoppingConfigData mocks base method. -func (m *MockGenerator) GetEarlyStoppingConfigData(arg0 string) (katibconfig.EarlyStoppingConfig, error) { +func (m *MockGenerator) GetEarlyStoppingConfigData(arg0 string) (v1beta1.EarlyStoppingConfig, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "GetEarlyStoppingConfigData", arg0) - ret0, _ := ret[0].(katibconfig.EarlyStoppingConfig) + ret0, _ := ret[0].(v1beta1.EarlyStoppingConfig) ret1, _ := ret[1].(error) return ret0, ret1 } @@ -54,10 +54,10 @@ func (mr *MockGeneratorMockRecorder) GetEarlyStoppingConfigData(arg0 interface{} } // GetMetricsCollectorConfigData mocks base method. -func (m *MockGenerator) GetMetricsCollectorConfigData(arg0 v1beta1.CollectorKind) (katibconfig.MetricsCollectorConfig, error) { +func (m *MockGenerator) GetMetricsCollectorConfigData(arg0 v1beta10.CollectorKind) (v1beta1.MetricsCollectorConfig, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "GetMetricsCollectorConfigData", arg0) - ret0, _ := ret[0].(katibconfig.MetricsCollectorConfig) + ret0, _ := ret[0].(v1beta1.MetricsCollectorConfig) ret1, _ := ret[1].(error) return ret0, ret1 } @@ -69,7 +69,7 @@ func (mr *MockGeneratorMockRecorder) GetMetricsCollectorConfigData(arg0 interfac } // GetRunSpecWithHyperParameters mocks base method. -func (m *MockGenerator) GetRunSpecWithHyperParameters(arg0 *v1beta10.Experiment, arg1, arg2 string, arg3 []v1beta1.ParameterAssignment) (*unstructured.Unstructured, error) { +func (m *MockGenerator) GetRunSpecWithHyperParameters(arg0 *v1beta11.Experiment, arg1, arg2 string, arg3 []v1beta10.ParameterAssignment) (*unstructured.Unstructured, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "GetRunSpecWithHyperParameters", arg0, arg1, arg2, arg3) ret0, _ := ret[0].(*unstructured.Unstructured) @@ -84,10 +84,10 @@ func (mr *MockGeneratorMockRecorder) GetRunSpecWithHyperParameters(arg0, arg1, a } // GetSuggestionConfigData mocks base method. -func (m *MockGenerator) GetSuggestionConfigData(arg0 string) (katibconfig.SuggestionConfig, error) { +func (m *MockGenerator) GetSuggestionConfigData(arg0 string) (v1beta1.SuggestionConfig, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "GetSuggestionConfigData", arg0) - ret0, _ := ret[0].(katibconfig.SuggestionConfig) + ret0, _ := ret[0].(v1beta1.SuggestionConfig) ret1, _ := ret[1].(error) return ret0, ret1 } @@ -99,7 +99,7 @@ func (mr *MockGeneratorMockRecorder) GetSuggestionConfigData(arg0 interface{}) * } // GetTrialTemplate mocks base method. -func (m *MockGenerator) GetTrialTemplate(arg0 *v1beta10.Experiment) (string, error) { +func (m *MockGenerator) GetTrialTemplate(arg0 *v1beta11.Experiment) (string, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "GetTrialTemplate", arg0) ret0, _ := ret[0].(string) diff --git a/pkg/util/v1beta1/katibconfig/config.go b/pkg/util/v1beta1/katibconfig/config.go index 41238770e79..baf2e0ff20d 100644 --- a/pkg/util/v1beta1/katibconfig/config.go +++ b/pkg/util/v1beta1/katibconfig/config.go @@ -18,297 +18,139 @@ package katibconfig import ( "context" - "encoding/json" "fmt" + "os" "strings" corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/equality" - "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/serializer" apitypes "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" + configapi "github.com/kubeflow/katib/pkg/apis/config/v1beta1" common "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1" "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" ) -// SuggestionConfig is the JSON suggestion structure in Katib config. -type SuggestionConfig struct { - corev1.Container `json:",inline"` - ServiceAccountName string `json:"serviceAccountName,omitempty"` - VolumeMountPath string `json:"volumeMountPath,omitempty"` - PersistentVolumeClaimSpec corev1.PersistentVolumeClaimSpec `json:"persistentVolumeClaimSpec,omitempty"` - PersistentVolumeSpec corev1.PersistentVolumeSpec `json:"persistentVolumeSpec,omitempty"` - PersistentVolumeLabels map[string]string `json:"persistentVolumeLabels,omitempty"` -} - -// EarlyStoppingConfig is the JSON early stopping structure in Katib config. -type EarlyStoppingConfig struct { - Image string `json:"image"` - ImagePullPolicy corev1.PullPolicy `json:"imagePullPolicy,omitempty"` - Resource corev1.ResourceRequirements `json:"resources,omitempty"` -} - -// MetricsCollectorConfig is the JSON metrics collector structure in Katib config. -type MetricsCollectorConfig struct { - Image string `json:"image"` - ImagePullPolicy corev1.PullPolicy `json:"imagePullPolicy,omitempty"` - Resource corev1.ResourceRequirements `json:"resources,omitempty"` - WaitAllProcesses *bool `json:"waitAllProcesses,omitempty"` -} +var ( + ErrKatibConfigNil = fmt.Errorf("failed to parse katib-config.yaml in ConfigMap: %s", consts.KatibConfigMapName) +) // GetSuggestionConfigData gets the config data for the given suggestion algorithm name. -func GetSuggestionConfigData(algorithmName string, client client.Client) (SuggestionConfig, error) { - configMap := &corev1.ConfigMap{} - suggestionConfigData := SuggestionConfig{} - err := client.Get( - context.TODO(), - apitypes.NamespacedName{Name: consts.KatibConfigMapName, Namespace: consts.DefaultKatibNamespace}, - configMap) - if err != nil { - return SuggestionConfig{}, err - } - - // Try to find suggestion data in config map - config, ok := configMap.Data[consts.LabelSuggestionTag] - if !ok { - return SuggestionConfig{}, fmt.Errorf("failed to find suggestions config in ConfigMap: %s", consts.KatibConfigMapName) - } - - // Parse suggestion data to map where key = algorithm name, value = SuggestionConfig - suggestionsConfig := map[string]SuggestionConfig{} - if err := json.Unmarshal([]byte(config), &suggestionsConfig); err != nil { - return SuggestionConfig{}, err +func GetSuggestionConfigData(algorithmName string, client client.Client) (configapi.SuggestionConfig, error) { + katibCfg := &configapi.KatibConfig{} + if err := fromConfigMap(katibCfg, client); err != nil { + return configapi.SuggestionConfig{}, err } // Try to find SuggestionConfig for the algorithm - suggestionConfigData, ok = suggestionsConfig[algorithmName] - if !ok { - return SuggestionConfig{}, fmt.Errorf("failed to find suggestion config for algorithm: %s in ConfigMap: %s", algorithmName, consts.KatibConfigMapName) + var suggestionConfigData *configapi.SuggestionConfig + for i := range katibCfg.RuntimeConfig.SuggestionConfigs { + if katibCfg.RuntimeConfig.SuggestionConfigs[i].AlgorithmName == algorithmName { + suggestionConfigData = &katibCfg.RuntimeConfig.SuggestionConfigs[i] + } + } + if suggestionConfigData == nil { + return configapi.SuggestionConfig{}, fmt.Errorf("failed to find suggestion config for algorithm: %s in ConfigMap: %s", algorithmName, consts.KatibConfigMapName) } // Get image from config image := suggestionConfigData.Image if strings.TrimSpace(image) == "" { - return SuggestionConfig{}, fmt.Errorf("required value for image configuration of algorithm name: %s", algorithmName) - } - - // Set Image Pull Policy - suggestionConfigData.ImagePullPolicy = setImagePullPolicy(suggestionConfigData.ImagePullPolicy) - - // Set resource requirements for suggestion - suggestionConfigData.Resources = setResourceRequirements(suggestionConfigData.Resources) - - // Set default suggestion container volume mount path - if suggestionConfigData.VolumeMountPath == "" { - suggestionConfigData.VolumeMountPath = consts.DefaultContainerSuggestionVolumeMountPath + return configapi.SuggestionConfig{}, fmt.Errorf("required value for image configuration of algorithm name: %s", algorithmName) } - - // Get persistent volume claim spec from config - pvcSpec := suggestionConfigData.PersistentVolumeClaimSpec - - // Set default access modes - if len(pvcSpec.AccessModes) == 0 { - pvcSpec.AccessModes = []corev1.PersistentVolumeAccessMode{ - consts.DefaultSuggestionVolumeAccessMode, - } - } - - // Set default resources - if len(pvcSpec.Resources.Requests) == 0 { - defaultVolumeStorage, _ := resource.ParseQuantity(consts.DefaultSuggestionVolumeStorage) - pvcSpec.Resources.Requests = make(map[corev1.ResourceName]resource.Quantity) - pvcSpec.Resources.Requests[corev1.ResourceStorage] = defaultVolumeStorage - } - - // Set PVC back for suggestion config. - suggestionConfigData.PersistentVolumeClaimSpec = pvcSpec - - // Get PV from config only if it exists. - if !equality.Semantic.DeepEqual(suggestionConfigData.PersistentVolumeSpec, corev1.PersistentVolumeSpec{}) { - - // Set PersistentVolumeReclaimPolicy to "Delete" to automatically delete PV once PVC is deleted. - // Kubernetes doesn't allow to specify ownerReferences for the cluster-scoped - // resources (which PV is) with namespace-scoped owner (which Suggestion is). - suggestionConfigData.PersistentVolumeSpec.PersistentVolumeReclaimPolicy = corev1.PersistentVolumeReclaimDelete - - } - - return suggestionConfigData, nil + return *suggestionConfigData, nil } // GetEarlyStoppingConfigData gets the config data for the given early stopping algorithm name. -func GetEarlyStoppingConfigData(algorithmName string, client client.Client) (EarlyStoppingConfig, error) { - configMap := &corev1.ConfigMap{} - earlyStoppingConfigData := EarlyStoppingConfig{} - err := client.Get( - context.TODO(), - apitypes.NamespacedName{Name: consts.KatibConfigMapName, Namespace: consts.DefaultKatibNamespace}, - configMap) - if err != nil { - return EarlyStoppingConfig{}, err +func GetEarlyStoppingConfigData(algorithmName string, client client.Client) (configapi.EarlyStoppingConfig, error) { + katibCfg := &configapi.KatibConfig{} + if err := fromConfigMap(katibCfg, client); err != nil { + return configapi.EarlyStoppingConfig{}, err } - // Try to find early stopping data in config map. - config, ok := configMap.Data[consts.LabelEarlyStoppingTag] - if !ok { - return EarlyStoppingConfig{}, fmt.Errorf("failed to find early stopping config in ConfigMap: %s", consts.KatibConfigMapName) + // Try to find EarlyStoppingConfig for the algorithm + var earlyStoppingConfigData *configapi.EarlyStoppingConfig + for i := range katibCfg.RuntimeConfig.EarlyStoppingConfigs { + if katibCfg.RuntimeConfig.EarlyStoppingConfigs[i].AlgorithmName == algorithmName { + earlyStoppingConfigData = &katibCfg.RuntimeConfig.EarlyStoppingConfigs[i] + } } - - // Parse early stopping data to map where key = algorithm name, value = EarlyStoppingConfig. - earlyStoppingsConfig := map[string]EarlyStoppingConfig{} - if err := json.Unmarshal([]byte(config), &earlyStoppingsConfig); err != nil { - return EarlyStoppingConfig{}, err - } - - // Try to find EarlyStoppingConfig for the algorithm. - earlyStoppingConfigData, ok = earlyStoppingsConfig[algorithmName] - if !ok { - return EarlyStoppingConfig{}, fmt.Errorf("failed to find early stopping config for algorithm: %s in ConfigMap: %s", algorithmName, consts.KatibConfigMapName) + if earlyStoppingConfigData == nil { + return configapi.EarlyStoppingConfig{}, fmt.Errorf("failed to find early stopping config for algorithm: %s in ConfigMap: %s", algorithmName, consts.KatibConfigMapName) } // Get image from config. image := earlyStoppingConfigData.Image if strings.TrimSpace(image) == "" { - return EarlyStoppingConfig{}, fmt.Errorf("required value for image configuration of algorithm name: %s", algorithmName) + return configapi.EarlyStoppingConfig{}, fmt.Errorf("required value for image configuration of algorithm name: %s", algorithmName) } - // Set Image Pull Policy. - earlyStoppingConfigData.ImagePullPolicy = setImagePullPolicy(earlyStoppingConfigData.ImagePullPolicy) - - // Set resource requirements - earlyStoppingConfigData.Resource = setResourceRequirements(earlyStoppingConfigData.Resource) - - return earlyStoppingConfigData, nil + return *earlyStoppingConfigData, nil } // GetMetricsCollectorConfigData gets the config data for the given collector kind. -func GetMetricsCollectorConfigData(cKind common.CollectorKind, client client.Client) (MetricsCollectorConfig, error) { - configMap := &corev1.ConfigMap{} - metricsCollectorConfigData := MetricsCollectorConfig{} - err := client.Get( - context.TODO(), - apitypes.NamespacedName{Name: consts.KatibConfigMapName, Namespace: consts.DefaultKatibNamespace}, - configMap) - if err != nil { - return MetricsCollectorConfig{}, err +func GetMetricsCollectorConfigData(cKind common.CollectorKind, client client.Client) (configapi.MetricsCollectorConfig, error) { + katibCfg := &configapi.KatibConfig{} + if err := fromConfigMap(katibCfg, client); err != nil { + return configapi.MetricsCollectorConfig{}, err } - // Try to find metrics collector data in config map - config, ok := configMap.Data[consts.LabelMetricsCollectorSidecar] - if !ok { - return MetricsCollectorConfig{}, fmt.Errorf("failed to find metrics collector config in ConfigMap: %s", consts.KatibConfigMapName) - } - // Parse metrics collector data to map where key = collector kind, value = MetricsCollectorConfig + // Try to find MetricsCollectorConfig for the collector kind + var metricsCollectorConfigData *configapi.MetricsCollectorConfig kind := string(cKind) - mcsConfig := map[string]MetricsCollectorConfig{} - if err := json.Unmarshal([]byte(config), &mcsConfig); err != nil { - return MetricsCollectorConfig{}, err + for i := range katibCfg.RuntimeConfig.MetricsCollectorConfigs { + if katibCfg.RuntimeConfig.MetricsCollectorConfigs[i].CollectorKind == kind { + metricsCollectorConfigData = &katibCfg.RuntimeConfig.MetricsCollectorConfigs[i] + } } - - // Try to find MetricsCollectorConfig for the collector kind - metricsCollectorConfigData, ok = mcsConfig[kind] - if !ok { - return MetricsCollectorConfig{}, fmt.Errorf("failed to find metrics collector config for kind: %s in ConfigMap: %s", kind, consts.KatibConfigMapName) + if metricsCollectorConfigData == nil { + return configapi.MetricsCollectorConfig{}, fmt.Errorf("failed to find metrics collector config for kind: %s in ConfigMap: %s", kind, consts.KatibConfigMapName) } // Get image from config image := metricsCollectorConfigData.Image if strings.TrimSpace(image) == "" { - return MetricsCollectorConfig{}, fmt.Errorf("required value for image configuration of metrics collector kind: %s", kind) + return configapi.MetricsCollectorConfig{}, fmt.Errorf("required value for image configuration of metrics collector kind: %s", kind) } - // Set Image Pull Policy - metricsCollectorConfigData.ImagePullPolicy = setImagePullPolicy(metricsCollectorConfigData.ImagePullPolicy) - - // Set resource requirements for metrics collector - metricsCollectorConfigData.Resource = setResourceRequirements(metricsCollectorConfigData.Resource) - - return metricsCollectorConfigData, nil + return *metricsCollectorConfigData, nil } -func setImagePullPolicy(imagePullPolicy corev1.PullPolicy) corev1.PullPolicy { - if imagePullPolicy != corev1.PullAlways && imagePullPolicy != corev1.PullIfNotPresent && imagePullPolicy != corev1.PullNever { - return consts.DefaultImagePullPolicy +// GetInitConfigData gets the init config data. +func GetInitConfigData(scheme *runtime.Scheme, katibCfgPath string) (configapi.InitConfig, error) { + var katibCfg configapi.KatibConfig + if err := fromFile(scheme, &katibCfg, katibCfgPath); err != nil { + return configapi.InitConfig{}, fmt.Errorf("%w: %s", ErrKatibConfigNil, err.Error()) } - return imagePullPolicy + return katibCfg.InitConfig, nil } -func setResourceRequirements(configResource corev1.ResourceRequirements) corev1.ResourceRequirements { - - // If requests are empty create new map - if len(configResource.Requests) == 0 { - configResource.Requests = make(map[corev1.ResourceName]resource.Quantity) - } - - // Get CPU, Memory and Disk Requests from config - cpuRequest := configResource.Requests[corev1.ResourceCPU] - memRequest := configResource.Requests[corev1.ResourceMemory] - diskRequest := configResource.Requests[corev1.ResourceEphemeralStorage] - - // If resource is empty set default value for CPU, Memory, Disk - if cpuRequest.IsZero() { - defaultCPURequest, _ := resource.ParseQuantity(consts.DefaultCPURequest) - configResource.Requests[corev1.ResourceCPU] = defaultCPURequest - } - if memRequest.IsZero() { - defaultMemRequest, _ := resource.ParseQuantity(consts.DefaultMemRequest) - configResource.Requests[corev1.ResourceMemory] = defaultMemRequest - } - if diskRequest.IsZero() { - defaultDiskRequest, _ := resource.ParseQuantity(consts.DefaultDiskRequest) - configResource.Requests[corev1.ResourceEphemeralStorage] = defaultDiskRequest - } - - // If limits are empty create new map - if len(configResource.Limits) == 0 { - configResource.Limits = make(map[corev1.ResourceName]resource.Quantity) - } - - // Get CPU, Memory and Disk Limits from config - cpuLimit := configResource.Limits[corev1.ResourceCPU] - memLimit := configResource.Limits[corev1.ResourceMemory] - diskLimit := configResource.Limits[corev1.ResourceEphemeralStorage] - - // If limit is empty set default value for CPU, Memory, Disk - if cpuLimit.IsZero() { - defaultCPULimit, _ := resource.ParseQuantity(consts.DefaultCPULimit) - configResource.Limits[corev1.ResourceCPU] = defaultCPULimit - } - if memLimit.IsZero() { - defaultMemLimit, _ := resource.ParseQuantity(consts.DefaultMemLimit) - configResource.Limits[corev1.ResourceMemory] = defaultMemLimit - } - if diskLimit.IsZero() { - defaultDiskLimit, _ := resource.ParseQuantity(consts.DefaultDiskLimit) - configResource.Limits[corev1.ResourceEphemeralStorage] = defaultDiskLimit - } - - // If user explicitly sets CPU value to -1, nuke it. - if cpuLimit.Sign() == -1 { - delete(configResource.Limits, corev1.ResourceCPU) +func fromFile(scheme *runtime.Scheme, katibConfig *configapi.KatibConfig, katibConfigPath string) error { + if len(katibConfigPath) == 0 { + scheme.Default(katibConfig) + return nil } - if cpuRequest.Sign() == -1 { - delete(configResource.Requests, corev1.ResourceCPU) - } - - // If user explicitly sets Memory value to -1, nuke it. - if memLimit.Sign() == -1 { - delete(configResource.Limits, corev1.ResourceMemory) - } - if memRequest.Sign() == -1 { - delete(configResource.Requests, corev1.ResourceMemory) + config, err := os.ReadFile(katibConfigPath) + if err != nil { + return err } + codecs := serializer.NewCodecFactory(scheme) + return runtime.DecodeInto(codecs.UniversalDecoder(), config, katibConfig) +} - // If user explicitly sets ephemeral-storage value to something negative, nuke it. - // This enables compatibility with the GKE nodepool autoscalers, which cannot scale - // pods which define ephemeral-storage resource constraints. - if diskLimit.Sign() == -1 { - delete(configResource.Limits, corev1.ResourceEphemeralStorage) +func fromConfigMap(katibConfig *configapi.KatibConfig, client client.Client) error { + configMap := &corev1.ConfigMap{} + err := client.Get(context.TODO(), apitypes.NamespacedName{Name: consts.KatibConfigMapName, Namespace: consts.DefaultKatibNamespace}, configMap) + if err != nil { + return err } - if diskRequest.Sign() == -1 { - delete(configResource.Requests, corev1.ResourceEphemeralStorage) + // Try to find katib-config.yaml data in configMap. + config, ok := configMap.Data[consts.LabelKatibConfigTag] + if !ok { + return fmt.Errorf("failed to find katib-config.yaml in ConfigMap: %s", consts.KatibConfigMapName) } - - return configResource + codecs := serializer.NewCodecFactory(client.Scheme()) + return runtime.DecodeInto(codecs.UniversalDecoder(), []byte(config), katibConfig) } diff --git a/pkg/util/v1beta1/katibconfig/config_test.go b/pkg/util/v1beta1/katibconfig/config_test.go index 0ff3c455adb..ca9b48c62fe 100644 --- a/pkg/util/v1beta1/katibconfig/config_test.go +++ b/pkg/util/v1beta1/katibconfig/config_test.go @@ -19,46 +19,65 @@ package katibconfig import ( "encoding/json" "fmt" - "reflect" + "log" + "os" + "path/filepath" "testing" + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "gopkg.in/yaml.v3" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes/scheme" + "k8s.io/apimachinery/pkg/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/utils/pointer" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + configapi "github.com/kubeflow/katib/pkg/apis/config/v1beta1" commonv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1" "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" ) -type katibConfig struct { - suggestion map[string]*SuggestionConfig - earlyStopping map[string]*EarlyStoppingConfig - metricsCollector map[commonv1beta1.CollectorKind]*MetricsCollectorConfig -} - func TestGetSuggestionConfigData(t *testing.T) { const testAlgorithmName = "test-suggestion" + scm := runtime.NewScheme() + if err := configapi.AddToScheme(scm); err != nil { + t.Fatal(err) + } + if err := clientgoscheme.AddToScheme(scm); err != nil { + t.Fatal(err) + } tests := []struct { testDescription string - katibConfig *katibConfig - expected *SuggestionConfig + katibConfig *configapi.KatibConfig + expected *configapi.SuggestionConfig inputAlgorithmName string err bool }{ { testDescription: "All parameters correctly are specified", - katibConfig: func() *katibConfig { - kc := &katibConfig{suggestion: map[string]*SuggestionConfig{testAlgorithmName: newFakeSuggestionConfig()}} - kc.suggestion[testAlgorithmName].ImagePullPolicy = corev1.PullAlways - kc.suggestion[testAlgorithmName].Resources = *newFakeCustomResourceRequirements() + katibConfig: func() *configapi.KatibConfig { + kc := &configapi.KatibConfig{ + TypeMeta: metav1.TypeMeta{ + Kind: "KatibConfig", + APIVersion: "config.kubeflow.org/v1beta1", + }, + RuntimeConfig: configapi.RuntimeConfig{ + SuggestionConfigs: []configapi.SuggestionConfig{ + *newFakeSuggestionConfig(testAlgorithmName), + }, + }, + } + kc.RuntimeConfig.SuggestionConfigs[0].ImagePullPolicy = corev1.PullAlways + kc.RuntimeConfig.SuggestionConfigs[0].Resources = *newFakeCustomResourceRequirements() return kc }(), - expected: func() *SuggestionConfig { - c := newFakeSuggestionConfig() + expected: func() *configapi.SuggestionConfig { + c := newFakeSuggestionConfig(testAlgorithmName) c.ImagePullPolicy = corev1.PullAlways c.Resources = *newFakeCustomResourceRequirements() return c @@ -72,122 +91,88 @@ func TestGetSuggestionConfigData(t *testing.T) { err: true, }, { - testDescription: fmt.Sprintf("There is not %s field in katib-config configMap", consts.LabelSuggestionTag), - katibConfig: &katibConfig{}, + testDescription: "There is not runtime.suggestions field in katib-config configMap", + katibConfig: &configapi.KatibConfig{}, err: true, }, { - testDescription: "There is not the AlgorithmName", - katibConfig: &katibConfig{suggestion: map[string]*SuggestionConfig{testAlgorithmName: newFakeSuggestionConfig()}}, + testDescription: "There is not the AlgorithmName", + katibConfig: &configapi.KatibConfig{ + RuntimeConfig: configapi.RuntimeConfig{ + SuggestionConfigs: []configapi.SuggestionConfig{ + *newFakeSuggestionConfig(testAlgorithmName), + }, + }, + }, inputAlgorithmName: "invalid-algorithm-name", err: true, }, { testDescription: "Image filed is empty in katib-config configMap", - katibConfig: func() *katibConfig { - kc := &katibConfig{suggestion: map[string]*SuggestionConfig{testAlgorithmName: newFakeSuggestionConfig()}} - kc.suggestion[testAlgorithmName].Image = "" + katibConfig: func() *configapi.KatibConfig { + kc := &configapi.KatibConfig{ + RuntimeConfig: configapi.RuntimeConfig{ + SuggestionConfigs: []configapi.SuggestionConfig{ + *newFakeSuggestionConfig(testAlgorithmName), + }, + }, + } + kc.RuntimeConfig.SuggestionConfigs[0].Image = "" return kc }(), inputAlgorithmName: testAlgorithmName, err: true, }, - { - testDescription: fmt.Sprintf("GetSuggestionConfigData sets %s to imagePullPolicy", consts.DefaultImagePullPolicy), - katibConfig: func() *katibConfig { - kc := &katibConfig{suggestion: map[string]*SuggestionConfig{testAlgorithmName: newFakeSuggestionConfig()}} - kc.suggestion[testAlgorithmName].ImagePullPolicy = "" - return kc - }(), - expected: newFakeSuggestionConfig(), - inputAlgorithmName: testAlgorithmName, - err: false, - }, - { - testDescription: "GetSuggestionConfigData sets resource.requests and resource.limits for the suggestion service", - katibConfig: func() *katibConfig { - kc := &katibConfig{suggestion: map[string]*SuggestionConfig{testAlgorithmName: newFakeSuggestionConfig()}} - kc.suggestion[testAlgorithmName].Resources = corev1.ResourceRequirements{} - return kc - }(), - expected: newFakeSuggestionConfig(), - inputAlgorithmName: testAlgorithmName, - err: false, - }, - { - testDescription: fmt.Sprintf("GetSuggestionConfigData sets %s to volumeMountPath", consts.DefaultContainerSuggestionVolumeMountPath), - katibConfig: func() *katibConfig { - kc := &katibConfig{suggestion: map[string]*SuggestionConfig{testAlgorithmName: newFakeSuggestionConfig()}} - kc.suggestion[testAlgorithmName].VolumeMountPath = "" - return kc - }(), - expected: newFakeSuggestionConfig(), - inputAlgorithmName: testAlgorithmName, - err: false, - }, - { - testDescription: "GetSuggestionConfigData sets accessMode and resource.requests for PVC", - katibConfig: func() *katibConfig { - kc := &katibConfig{suggestion: map[string]*SuggestionConfig{testAlgorithmName: newFakeSuggestionConfig()}} - kc.suggestion[testAlgorithmName].PersistentVolumeClaimSpec = corev1.PersistentVolumeClaimSpec{} - return kc - }(), - expected: newFakeSuggestionConfig(), - inputAlgorithmName: testAlgorithmName, - err: false, - }, - { - testDescription: fmt.Sprintf("GetSuggestionConfigData does not set %s to persistentVolumeReclaimPolicy", corev1.PersistentVolumeReclaimDelete), - katibConfig: func() *katibConfig { - kc := &katibConfig{suggestion: map[string]*SuggestionConfig{testAlgorithmName: newFakeSuggestionConfig()}} - kc.suggestion[testAlgorithmName].PersistentVolumeSpec = corev1.PersistentVolumeSpec{} - return kc - }(), - expected: func() *SuggestionConfig { - c := newFakeSuggestionConfig() - c.PersistentVolumeSpec = corev1.PersistentVolumeSpec{} - return c - }(), - inputAlgorithmName: testAlgorithmName, - err: false, - }, } for _, tt := range tests { t.Run(tt.testDescription, func(t *testing.T) { - fakeKubeClient := newFakeKubeClient(newFakeKatibConfigMap(tt.katibConfig)) + fakeKubeClient := newFakeKubeClient(scm, newFakeKatibConfigMap(tt.katibConfig)) actual, err := GetSuggestionConfigData(tt.inputAlgorithmName, fakeKubeClient) if (err != nil) != tt.err { t.Errorf("want error: %v, actual: %v", tt.err, err) } else if tt.expected != nil { - if !reflect.DeepEqual(actual, *tt.expected) { - t.Errorf("Generated SuggestionConfig is invalid.\n\nactual:\n%v\n\nexpected:\n%v\n\n", actual, *tt.expected) + if diff := cmp.Diff(*tt.expected, actual); len(diff) != 0 { + t.Logf("katibConfig: %v", tt.katibConfig) + t.Errorf("Generated SuggestionConfig is invalid. (-want,+got):\n%s", diff) } } }) } - } func TestGetEarlyStoppingConfigData(t *testing.T) { const testAlgorithmName = "test-early-stopping" + scm := runtime.NewScheme() + if err := configapi.AddToScheme(scm); err != nil { + t.Fatal(err) + } + if err := clientgoscheme.AddToScheme(scm); err != nil { + t.Fatal(err) + } tests := []struct { testDescription string - katibConfig *katibConfig - expected *EarlyStoppingConfig + katibConfig *configapi.KatibConfig + expected *configapi.EarlyStoppingConfig inputAlgorithmName string err bool }{ { testDescription: "All parameters correctly are specified", - katibConfig: func() *katibConfig { - kc := &katibConfig{earlyStopping: map[string]*EarlyStoppingConfig{testAlgorithmName: newFakeEarlyStoppingConfig()}} - kc.earlyStopping[testAlgorithmName].ImagePullPolicy = corev1.PullIfNotPresent + katibConfig: func() *configapi.KatibConfig { + kc := &configapi.KatibConfig{ + RuntimeConfig: configapi.RuntimeConfig{ + EarlyStoppingConfigs: []configapi.EarlyStoppingConfig{ + *newFakeEarlyStoppingConfig(testAlgorithmName), + }, + }, + } + kc.RuntimeConfig.EarlyStoppingConfigs[0].ImagePullPolicy = corev1.PullIfNotPresent return kc }(), - expected: func() *EarlyStoppingConfig { - c := newFakeEarlyStoppingConfig() + expected: func() *configapi.EarlyStoppingConfig { + c := newFakeEarlyStoppingConfig(testAlgorithmName) c.ImagePullPolicy = corev1.PullIfNotPresent return c }(), @@ -200,48 +185,48 @@ func TestGetEarlyStoppingConfigData(t *testing.T) { err: true, }, { - testDescription: fmt.Sprintf("There is not %s field in katib-config configMap", consts.LabelEarlyStoppingTag), - katibConfig: &katibConfig{}, + testDescription: "There is not runtime.earlyStoppings field in katib-config configMap", + katibConfig: &configapi.KatibConfig{}, err: true, }, { - testDescription: "There is not the AlgorithmName", - katibConfig: &katibConfig{earlyStopping: map[string]*EarlyStoppingConfig{testAlgorithmName: newFakeEarlyStoppingConfig()}}, + testDescription: "There is not the AlgorithmName", + katibConfig: &configapi.KatibConfig{ + RuntimeConfig: configapi.RuntimeConfig{ + EarlyStoppingConfigs: []configapi.EarlyStoppingConfig{ + *newFakeEarlyStoppingConfig(testAlgorithmName), + }, + }, + }, inputAlgorithmName: "invalid-algorithm-name", err: true, }, { testDescription: "Image filed is empty in katib-config configMap", - katibConfig: func() *katibConfig { - kc := &katibConfig{earlyStopping: map[string]*EarlyStoppingConfig{testAlgorithmName: newFakeEarlyStoppingConfig()}} - kc.earlyStopping[testAlgorithmName].Image = "" + katibConfig: func() *configapi.KatibConfig { + kc := &configapi.KatibConfig{ + RuntimeConfig: configapi.RuntimeConfig{ + EarlyStoppingConfigs: []configapi.EarlyStoppingConfig{ + *newFakeEarlyStoppingConfig(testAlgorithmName), + }, + }, + } + kc.RuntimeConfig.EarlyStoppingConfigs[0].Image = "" return kc }(), inputAlgorithmName: testAlgorithmName, err: true, }, - { - testDescription: fmt.Sprintf("GetEarlyStoppingConfigData sets %s to imagePullPolicy", consts.DefaultImagePullPolicy), - katibConfig: func() *katibConfig { - kc := &katibConfig{earlyStopping: map[string]*EarlyStoppingConfig{testAlgorithmName: newFakeEarlyStoppingConfig()}} - kc.earlyStopping[testAlgorithmName].ImagePullPolicy = "" - return kc - }(), - expected: newFakeEarlyStoppingConfig(), - inputAlgorithmName: testAlgorithmName, - err: false, - }, } - for _, tt := range tests { t.Run(tt.testDescription, func(t *testing.T) { - fakeKubeClient := newFakeKubeClient(newFakeKatibConfigMap(tt.katibConfig)) + fakeKubeClient := newFakeKubeClient(scm, newFakeKatibConfigMap(tt.katibConfig)) actual, err := GetEarlyStoppingConfigData(tt.inputAlgorithmName, fakeKubeClient) if (err != nil) != tt.err { t.Errorf("want error: %v, actual: %v", tt.err, err) } else if tt.expected != nil { - if !reflect.DeepEqual(actual, *tt.expected) { - t.Errorf("Generated EarlyStoppingConfig is invalid.\n\nactual:\n%v\n\nexpected:\n%v\n\n", actual, *tt.expected) + if diff := cmp.Diff(*tt.expected, actual); len(diff) != 0 { + t.Errorf("Generated EarlyStoppingConfig is invalid. (-want,+got):\n%s", diff) } } }) @@ -253,30 +238,36 @@ func TestGetMetricsCollectorConfigData(t *testing.T) { invalidCollectorKind commonv1beta1.CollectorKind = "invalidCollector" testCollectorKind commonv1beta1.CollectorKind = "testCollector" ) - - nukeResource, _ := resource.ParseQuantity("-1") - nukeResourceRequirements := map[corev1.ResourceName]resource.Quantity{ - corev1.ResourceCPU: nukeResource, - corev1.ResourceMemory: nukeResource, - corev1.ResourceEphemeralStorage: nukeResource, + scm := runtime.NewScheme() + if err := configapi.AddToScheme(scm); err != nil { + t.Fatal(err) + } + if err := clientgoscheme.AddToScheme(scm); err != nil { + t.Fatal(err) } tests := []struct { testDescription string - katibConfig *katibConfig - expected *MetricsCollectorConfig + katibConfig *configapi.KatibConfig + expected *configapi.MetricsCollectorConfig inputCollectorKind commonv1beta1.CollectorKind err bool }{ { testDescription: "All parameters correctly are specified", - katibConfig: func() *katibConfig { - kc := &katibConfig{metricsCollector: map[commonv1beta1.CollectorKind]*MetricsCollectorConfig{testCollectorKind: newFakeMetricsCollectorConfig()}} - kc.metricsCollector[testCollectorKind].ImagePullPolicy = corev1.PullNever + katibConfig: func() *configapi.KatibConfig { + kc := &configapi.KatibConfig{ + RuntimeConfig: configapi.RuntimeConfig{ + MetricsCollectorConfigs: []configapi.MetricsCollectorConfig{ + *newFakeMetricsCollectorConfig(testCollectorKind), + }, + }, + } + kc.RuntimeConfig.MetricsCollectorConfigs[0].ImagePullPolicy = corev1.PullNever return kc }(), - expected: func() *MetricsCollectorConfig { - c := newFakeMetricsCollectorConfig() + expected: func() *configapi.MetricsCollectorConfig { + c := newFakeMetricsCollectorConfig(testCollectorKind) c.ImagePullPolicy = corev1.PullNever return c }(), @@ -290,101 +281,185 @@ func TestGetMetricsCollectorConfigData(t *testing.T) { }, { testDescription: fmt.Sprintf("There is not %s field in katib-config configMap", consts.LabelMetricsCollectorSidecar), - katibConfig: &katibConfig{}, + katibConfig: &configapi.KatibConfig{}, err: true, }, { - testDescription: "There is not the cKind", - katibConfig: &katibConfig{metricsCollector: map[commonv1beta1.CollectorKind]*MetricsCollectorConfig{testCollectorKind: newFakeMetricsCollectorConfig()}}, + testDescription: "There is not the cKind", + katibConfig: &configapi.KatibConfig{ + RuntimeConfig: configapi.RuntimeConfig{ + MetricsCollectorConfigs: []configapi.MetricsCollectorConfig{ + *newFakeMetricsCollectorConfig(testCollectorKind), + }, + }, + }, inputCollectorKind: invalidCollectorKind, err: true, }, { testDescription: "Image filed is empty in katib-config configMap", - katibConfig: func() *katibConfig { - kc := &katibConfig{metricsCollector: map[commonv1beta1.CollectorKind]*MetricsCollectorConfig{testCollectorKind: newFakeMetricsCollectorConfig()}} - kc.metricsCollector[testCollectorKind].Image = "" - return kc - }(), - inputCollectorKind: testCollectorKind, - err: true, - }, - { - testDescription: fmt.Sprintf("GetMetricsConfigData sets %s to imagePullPolicy", consts.DefaultImagePullPolicy), - katibConfig: func() *katibConfig { - kc := &katibConfig{metricsCollector: map[commonv1beta1.CollectorKind]*MetricsCollectorConfig{testCollectorKind: newFakeMetricsCollectorConfig()}} - kc.metricsCollector[testCollectorKind].ImagePullPolicy = "" - return kc - }(), - expected: newFakeMetricsCollectorConfig(), - inputCollectorKind: testCollectorKind, - err: false, - }, - { - testDescription: "GetMetricsConfigData nukes resource.requests and resource.limits for the metrics collector", - katibConfig: func() *katibConfig { - kc := &katibConfig{metricsCollector: map[commonv1beta1.CollectorKind]*MetricsCollectorConfig{testCollectorKind: newFakeMetricsCollectorConfig()}} - kc.metricsCollector[testCollectorKind].Resource = corev1.ResourceRequirements{ - Requests: nukeResourceRequirements, - Limits: nukeResourceRequirements, + katibConfig: func() *configapi.KatibConfig { + kc := &configapi.KatibConfig{ + RuntimeConfig: configapi.RuntimeConfig{ + MetricsCollectorConfigs: []configapi.MetricsCollectorConfig{ + *newFakeMetricsCollectorConfig(testCollectorKind), + }, + }, } + kc.RuntimeConfig.MetricsCollectorConfigs[0].Image = "" return kc }(), - expected: func() *MetricsCollectorConfig { - c := newFakeMetricsCollectorConfig() - c.Resource = corev1.ResourceRequirements{ - Requests: map[corev1.ResourceName]resource.Quantity{}, - Limits: map[corev1.ResourceName]resource.Quantity{}, - } - return c - }(), inputCollectorKind: testCollectorKind, - err: false, + err: true, }, } for _, tt := range tests { t.Run(tt.testDescription, func(t *testing.T) { - fakeKubeClient := newFakeKubeClient(newFakeKatibConfigMap(tt.katibConfig)) + fakeKubeClient := newFakeKubeClient(scm, newFakeKatibConfigMap(tt.katibConfig)) actual, err := GetMetricsCollectorConfigData(tt.inputCollectorKind, fakeKubeClient) if (err != nil) != tt.err { t.Errorf("want error: %v, actual: %v", tt.err, err) } else if tt.expected != nil { - if !reflect.DeepEqual(actual, *tt.expected) { - t.Errorf("Generated MetricsCollectorConfig is invalid.\n\nactual:\n%v\n\nexpected:\n%v\n\n", actual, *tt.expected) + if diff := cmp.Diff(*tt.expected, actual); len(diff) != 0 { + t.Errorf("Generated MetricsCollectorConfig is invalid. (-want,+got):\n%s", diff) } } }) } } -func newFakeKubeClient(katibConfigMap *corev1.ConfigMap) client.Client { - fakeClientBuilder := fake.NewClientBuilder().WithScheme(scheme.Scheme) +func TestGetInitConfigData(t *testing.T) { + tmpDir, err := os.MkdirTemp("", "temp") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(tmpDir) + scm := runtime.NewScheme() + if err = configapi.AddToScheme(scm); err != nil { + t.Fatal(err) + } + + fullInitConfig := filepath.Join(tmpDir, "fullInitConfig.yaml") + if err = os.WriteFile(fullInitConfig, []byte(` +apiVersion: config.kubeflow.org/v1beta1 +kind: KatibConfig +init: + controller: + experimentSuggestionName: test + metricsAddr: :8081 + healthzAddr: :18081 + injectSecurityContext: true + enableGRPCProbeInSuggestion: false + trialResources: + - Job.v1.batch + - TFJob.v1.kubeflow.org + - PyTorchJob.v1.kubeflow.org + - MPIJob.v1.kubeflow.org + - XGBoostJob.v1.kubeflow.org + - MXJob.v1.kubeflow.org + webhookPort: 18443 + enableLeaderElection: true + leaderElectionID: xyz0123 +runtime: + suggestions: + - algorithmName: random + image: docker.io/kubeflowkatib/suggestion-hyperopt:latest +`), os.FileMode(0600)); err != nil { + t.Fatal(err) + } + + cases := map[string]struct { + katibConfigFile string + wantInitConfigData configapi.InitConfig + wantError error + }{ + "KatibConfigFile is empty": { + katibConfigFile: "", + wantInitConfigData: configapi.InitConfig{ + ControllerConfig: configapi.ControllerConfig{ + ExperimentSuggestionName: configapi.DefaultExperimentSuggestionName, + MetricsAddr: configapi.DefaultMetricsAddr, + HealthzAddr: configapi.DefaultHealthzAddr, + InjectSecurityContext: &configapi.DefaultInjectSecurityContext, + EnableGRPCProbeInSuggestion: &configapi.DefaultEnableGRPCProbeInSuggestion, + TrialResources: configapi.DefaultTrialResources, + WebhookPort: &configapi.DefaultWebhookPort, + EnableLeaderElection: &configapi.DefaultEnableLeaderElection, + LeaderElectionID: configapi.DefaultLeaderElectionID, + }, + }, + }, + "invalid katibConfigFile": { + katibConfigFile: "invalid", + wantError: ErrKatibConfigNil, + }, + "full init config": { + katibConfigFile: fullInitConfig, + wantInitConfigData: configapi.InitConfig{ + ControllerConfig: configapi.ControllerConfig{ + ExperimentSuggestionName: "test", + MetricsAddr: ":8081", + HealthzAddr: ":18081", + InjectSecurityContext: pointer.Bool(true), + EnableGRPCProbeInSuggestion: pointer.Bool(false), + TrialResources: []string{ + "Job.v1.batch", + "TFJob.v1.kubeflow.org", + "PyTorchJob.v1.kubeflow.org", + "MPIJob.v1.kubeflow.org", + "XGBoostJob.v1.kubeflow.org", + "MXJob.v1.kubeflow.org", + }, + WebhookPort: pointer.Int(18443), + EnableLeaderElection: pointer.Bool(true), + LeaderElectionID: "xyz0123", + }, + }, + }, + } + for name, tc := range cases { + t.Run(name, func(t *testing.T) { + got, err := GetInitConfigData(scm, tc.katibConfigFile) + if diff := cmp.Diff(tc.wantError, err, cmpopts.EquateErrors()); len(diff) != 0 { + t.Errorf("Unexpected error from GetInitConfigData() (-want,+got):\n%s", diff) + } + if diff := cmp.Diff(tc.wantInitConfigData, got); len(diff) != 0 { + t.Errorf("Unexpected InitConfig from GetInitConfigData() (-want,+got):\n%s", diff) + } + }) + } +} + +func newFakeKubeClient(scm *runtime.Scheme, katibConfigMap *corev1.ConfigMap) client.Client { + fakeClientBuilder := fake.NewClientBuilder().WithScheme(scm) if katibConfigMap != nil { fakeClientBuilder.WithObjects(katibConfigMap) } return fakeClientBuilder.Build() } -func newFakeKatibConfigMap(config *katibConfig) *corev1.ConfigMap { +func newFakeKatibConfigMap(config *configapi.KatibConfig) *corev1.ConfigMap { if config == nil { return nil } data := map[string]string{} - if config.suggestion != nil { - bSuggestionConfig, _ := json.Marshal(config.suggestion) - data[consts.LabelSuggestionTag] = string(bSuggestionConfig) + if config != nil { + bKatibConfig, err := json.Marshal(config) + if err != nil { + log.Fatal(err) + } + yamlKatibConfig := make(map[string]interface{}) + if err = yaml.Unmarshal(bKatibConfig, yamlKatibConfig); err != nil { + log.Fatal(err) + } + bKatibConfig, err = yaml.Marshal(yamlKatibConfig) + if err != nil { + log.Fatal(err) + } + data[consts.LabelKatibConfigTag] = string(bKatibConfig) } - if config.earlyStopping != nil { - bEarlyStoppingConfig, _ := json.Marshal(config.earlyStopping) - data[consts.LabelEarlyStoppingTag] = string(bEarlyStoppingConfig) - } - if config.metricsCollector != nil { - bMetricsCollector, _ := json.Marshal(config.metricsCollector) - data[consts.LabelMetricsCollectorSidecar] = string(bMetricsCollector) - } - return &corev1.ConfigMap{ TypeMeta: metav1.TypeMeta{ Kind: "ConfigMap", @@ -398,10 +473,11 @@ func newFakeKatibConfigMap(config *katibConfig) *corev1.ConfigMap { } } -func newFakeSuggestionConfig() *SuggestionConfig { +func newFakeSuggestionConfig(algorithmName string) *configapi.SuggestionConfig { defaultVolumeStorage, _ := resource.ParseQuantity(consts.DefaultSuggestionVolumeStorage) - return &SuggestionConfig{ + return &configapi.SuggestionConfig{ + AlgorithmName: algorithmName, Container: corev1.Container{ Image: "suggestion-image", ImagePullPolicy: consts.DefaultImagePullPolicy, @@ -424,16 +500,18 @@ func newFakeSuggestionConfig() *SuggestionConfig { } } -func newFakeEarlyStoppingConfig() *EarlyStoppingConfig { - return &EarlyStoppingConfig{ +func newFakeEarlyStoppingConfig(algorithmName string) *configapi.EarlyStoppingConfig { + return &configapi.EarlyStoppingConfig{ + AlgorithmName: algorithmName, Image: "early-stopping-image", ImagePullPolicy: consts.DefaultImagePullPolicy, Resource: *setFakeResourceRequirements(), } } -func newFakeMetricsCollectorConfig() *MetricsCollectorConfig { - return &MetricsCollectorConfig{ +func newFakeMetricsCollectorConfig(collectorKind commonv1beta1.CollectorKind) *configapi.MetricsCollectorConfig { + return &configapi.MetricsCollectorConfig{ + CollectorKind: string(collectorKind), Image: "metrics-collector-image", ImagePullPolicy: consts.DefaultImagePullPolicy, Resource: *setFakeResourceRequirements(), diff --git a/pkg/webhook/v1beta1/experiment/validator/validator_test.go b/pkg/webhook/v1beta1/experiment/validator/validator_test.go index 10bd66aa0c5..561379f8c97 100644 --- a/pkg/webhook/v1beta1/experiment/validator/validator_test.go +++ b/pkg/webhook/v1beta1/experiment/validator/validator_test.go @@ -18,6 +18,7 @@ package validator import ( "errors" + configapi "github.com/kubeflow/katib/pkg/apis/config/v1beta1" "testing" "github.com/golang/mock/gomock" @@ -35,7 +36,6 @@ import ( util "github.com/kubeflow/katib/pkg/controller.v1beta1/util" manifestmock "github.com/kubeflow/katib/pkg/mock/v1beta1/experiment/manifest" - "github.com/kubeflow/katib/pkg/util/v1beta1/katibconfig" ) func init() { @@ -49,11 +49,11 @@ func TestValidateExperiment(t *testing.T) { p := manifestmock.NewMockGenerator(mockCtrl) g := New(p) - suggestionConfigData := katibconfig.SuggestionConfig{} + suggestionConfigData := configapi.SuggestionConfig{} suggestionConfigData.Image = "algorithmImage" - metricsCollectorConfigData := katibconfig.MetricsCollectorConfig{} + metricsCollectorConfigData := configapi.MetricsCollectorConfig{} metricsCollectorConfigData.Image = "metricsCollectorImage" - earlyStoppingConfigData := katibconfig.EarlyStoppingConfig{} + earlyStoppingConfigData := configapi.EarlyStoppingConfig{} p.EXPECT().GetSuggestionConfigData(gomock.Any()).Return(suggestionConfigData, nil).AnyTimes() p.EXPECT().GetMetricsCollectorConfigData(gomock.Any()).Return(metricsCollectorConfigData, nil).AnyTimes() @@ -875,7 +875,7 @@ func TestValidateMetricsCollector(t *testing.T) { p := manifestmock.NewMockGenerator(mockCtrl) g := New(p) - metricsCollectorConfigData := katibconfig.MetricsCollectorConfig{} + metricsCollectorConfigData := configapi.MetricsCollectorConfig{} metricsCollectorConfigData.Image = "metricsCollectorImage" p.EXPECT().GetMetricsCollectorConfigData(gomock.Any()).Return(metricsCollectorConfigData, nil).AnyTimes() @@ -1174,26 +1174,26 @@ func TestValidateConfigData(t *testing.T) { p := manifestmock.NewMockGenerator(mockCtrl) g := New(p) - suggestionConfigData := katibconfig.SuggestionConfig{} + suggestionConfigData := configapi.SuggestionConfig{} suggestionConfigData.Image = "algorithmImage" validConfigCall := p.EXPECT().GetSuggestionConfigData(gomock.Any()).Return(suggestionConfigData, nil).Times(2) - invalidConfigCall := p.EXPECT().GetSuggestionConfigData(gomock.Any()).Return(katibconfig.SuggestionConfig{}, errors.New("GetSuggestionConfigData failed")) + invalidConfigCall := p.EXPECT().GetSuggestionConfigData(gomock.Any()).Return(configapi.SuggestionConfig{}, errors.New("GetSuggestionConfigData failed")) gomock.InOrder( validConfigCall, invalidConfigCall, ) - validEarlyStoppingConfigCall := p.EXPECT().GetEarlyStoppingConfigData(gomock.Any()).Return(katibconfig.EarlyStoppingConfig{}, nil) - invalidEarlyStoppingConfigCall := p.EXPECT().GetEarlyStoppingConfigData(gomock.Any()).Return(katibconfig.EarlyStoppingConfig{}, errors.New("GetEarlyStoppingConfigData failed")) + validEarlyStoppingConfigCall := p.EXPECT().GetEarlyStoppingConfigData(gomock.Any()).Return(configapi.EarlyStoppingConfig{}, nil) + invalidEarlyStoppingConfigCall := p.EXPECT().GetEarlyStoppingConfigData(gomock.Any()).Return(configapi.EarlyStoppingConfig{}, errors.New("GetEarlyStoppingConfigData failed")) gomock.InOrder( validEarlyStoppingConfigCall, invalidEarlyStoppingConfigCall, ) - p.EXPECT().GetMetricsCollectorConfigData(gomock.Any()).Return(katibconfig.MetricsCollectorConfig{}, errors.New("GetMetricsCollectorConfigData failed")) + p.EXPECT().GetMetricsCollectorConfigData(gomock.Any()).Return(configapi.MetricsCollectorConfig{}, errors.New("GetMetricsCollectorConfigData failed")) batchJobStr := convertBatchJobToString(newFakeBatchJob()) p.EXPECT().GetTrialTemplate(gomock.Any()).Return(batchJobStr, nil).AnyTimes() diff --git a/pkg/webhook/v1beta1/pod/inject_webhook.go b/pkg/webhook/v1beta1/pod/inject_webhook.go index 4dd091b0fc9..f2155625311 100644 --- a/pkg/webhook/v1beta1/pod/inject_webhook.go +++ b/pkg/webhook/v1beta1/pod/inject_webhook.go @@ -20,6 +20,7 @@ import ( "context" "encoding/json" "errors" + configapi "github.com/kubeflow/katib/pkg/apis/config/v1beta1" "net/http" "path/filepath" "strconv" @@ -298,7 +299,7 @@ func (s *SidecarInjector) getKatibJob(object *unstructured.Unstructured, namespa return jobKind, jobName, nil } -func (s *SidecarInjector) getMetricsCollectorArgs(trial *trialsv1beta1.Trial, metricNames string, mc common.MetricsCollectorSpec, metricsCollectorConfigData katibconfig.MetricsCollectorConfig, esRules []string) ([]string, error) { +func (s *SidecarInjector) getMetricsCollectorArgs(trial *trialsv1beta1.Trial, metricNames string, mc common.MetricsCollectorSpec, metricsCollectorConfigData configapi.MetricsCollectorConfig, esRules []string) ([]string, error) { args := []string{"-t", trial.Name, "-m", metricNames, "-o-type", string(trial.Spec.Objective.Type), "-s-db", katibmanagerv1beta1.GetDBManagerAddr()} if mountPath, _ := getMountPath(mc); mountPath != "" { args = append(args, "-path", mountPath) diff --git a/pkg/webhook/v1beta1/pod/inject_webhook_test.go b/pkg/webhook/v1beta1/pod/inject_webhook_test.go index 0c87be4c2c8..6b8fe838776 100644 --- a/pkg/webhook/v1beta1/pod/inject_webhook_test.go +++ b/pkg/webhook/v1beta1/pod/inject_webhook_test.go @@ -19,6 +19,7 @@ package pod import ( "context" "fmt" + configapi "github.com/kubeflow/katib/pkg/apis/config/v1beta1" "path/filepath" "reflect" "sync" @@ -46,7 +47,6 @@ import ( "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" "github.com/kubeflow/katib/pkg/controller.v1beta1/util" mccommon "github.com/kubeflow/katib/pkg/metricscollector/v1beta1/common" - "github.com/kubeflow/katib/pkg/util/v1beta1/katibconfig" ) var ( @@ -321,7 +321,7 @@ func TestGetMetricsCollectorArgs(t *testing.T) { MetricNames string MCSpec common.MetricsCollectorSpec EarlyStoppingRules []string - KatibConfig katibconfig.MetricsCollectorConfig + KatibConfig configapi.MetricsCollectorConfig ExpectedArgs []string Name string Err bool @@ -334,7 +334,7 @@ func TestGetMetricsCollectorArgs(t *testing.T) { Kind: common.StdOutCollector, }, }, - KatibConfig: katibconfig.MetricsCollectorConfig{ + KatibConfig: configapi.MetricsCollectorConfig{ WaitAllProcesses: &waitAllProcessesValue, }, ExpectedArgs: []string{ @@ -368,7 +368,7 @@ func TestGetMetricsCollectorArgs(t *testing.T) { }, }, }, - KatibConfig: katibconfig.MetricsCollectorConfig{}, + KatibConfig: configapi.MetricsCollectorConfig{}, ExpectedArgs: []string{ "-t", testTrialName, "-m", testMetricName, @@ -394,7 +394,7 @@ func TestGetMetricsCollectorArgs(t *testing.T) { }, }, }, - KatibConfig: katibconfig.MetricsCollectorConfig{}, + KatibConfig: configapi.MetricsCollectorConfig{}, ExpectedArgs: []string{ "-t", testTrialName, "-m", testMetricName, @@ -418,7 +418,7 @@ func TestGetMetricsCollectorArgs(t *testing.T) { }, }, }, - KatibConfig: katibconfig.MetricsCollectorConfig{}, + KatibConfig: configapi.MetricsCollectorConfig{}, ExpectedArgs: []string{ "-t", testTrialName, "-m", testMetricName, @@ -436,7 +436,7 @@ func TestGetMetricsCollectorArgs(t *testing.T) { Kind: common.CustomCollector, }, }, - KatibConfig: katibconfig.MetricsCollectorConfig{}, + KatibConfig: configapi.MetricsCollectorConfig{}, ExpectedArgs: []string{ "-t", testTrialName, "-m", testMetricName, @@ -458,7 +458,7 @@ func TestGetMetricsCollectorArgs(t *testing.T) { }, }, }, - KatibConfig: katibconfig.MetricsCollectorConfig{}, + KatibConfig: configapi.MetricsCollectorConfig{}, ExpectedArgs: []string{ "-t", testTrialName, "-m", testMetricName, @@ -476,7 +476,7 @@ func TestGetMetricsCollectorArgs(t *testing.T) { Kind: common.PrometheusMetricCollector, }, }, - KatibConfig: katibconfig.MetricsCollectorConfig{}, + KatibConfig: configapi.MetricsCollectorConfig{}, ExpectedArgs: []string{ "-t", testTrialName, "-m", testMetricName, @@ -494,7 +494,7 @@ func TestGetMetricsCollectorArgs(t *testing.T) { }, }, EarlyStoppingRules: earlyStoppingRules, - KatibConfig: katibconfig.MetricsCollectorConfig{}, + KatibConfig: configapi.MetricsCollectorConfig{}, ExpectedArgs: []string{ "-t", testTrialName, "-m", testMetricName, @@ -520,7 +520,7 @@ func TestGetMetricsCollectorArgs(t *testing.T) { }, }, EarlyStoppingRules: earlyStoppingRules, - KatibConfig: katibconfig.MetricsCollectorConfig{}, + KatibConfig: configapi.MetricsCollectorConfig{}, Name: "Trial with invalid Experiment label name. Suggestion is not created", Err: true, },