From 8f96adac5ffc4f6f3f41368397fe04d10b9401de Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Wed, 27 Sep 2023 11:30:27 +0300 Subject: [PATCH 1/3] Add initial metrics Add dependency: sigs.k8s.io/controller-runtime v0.14.6 go get sigs.k8s.io/controller-runtime@v0.14.6 go mod tidy --- go.mod | 11 ++--- go.sum | 19 ++++---- pkg/controller/queuejob/metrics.go | 45 +++++++++++++++++++ .../queuejob/queuejob_controller_ex.go | 6 +++ 4 files changed, 66 insertions(+), 15 deletions(-) create mode 100644 pkg/controller/queuejob/metrics.go diff --git a/go.mod b/go.mod index 2dd1bcea..37a31cd0 100644 --- a/go.mod +++ b/go.mod @@ -12,13 +12,16 @@ require ( github.com/prometheus/client_model v0.3.0 github.com/stretchr/testify v1.8.2 k8s.io/api v0.26.2 - k8s.io/apiextensions-apiserver v0.25.1 + k8s.io/apiextensions-apiserver v0.26.1 k8s.io/apimachinery v0.26.2 k8s.io/apiserver v0.26.2 k8s.io/client-go v0.26.2 k8s.io/klog/v2 v2.90.1 k8s.io/metrics v0.26.2 + k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5 + sigs.k8s.io/controller-runtime v0.14.6 sigs.k8s.io/custom-metrics-apiserver v0.0.0 + sigs.k8s.io/structured-merge-diff/v4 v4.2.3 ) replace sigs.k8s.io/custom-metrics-apiserver => sigs.k8s.io/custom-metrics-apiserver v1.25.1-0.20230306170449-63d8c93851f3 @@ -85,7 +88,7 @@ require ( go.opentelemetry.io/proto/otlp v0.19.0 // indirect go.uber.org/atomic v1.7.0 // indirect go.uber.org/multierr v1.6.0 // indirect - go.uber.org/zap v1.19.0 // indirect + go.uber.org/zap v1.24.0 // indirect golang.org/x/crypto v0.1.0 // indirect golang.org/x/net v0.8.0 // indirect golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b // indirect @@ -93,7 +96,7 @@ require ( golang.org/x/sys v0.6.0 // indirect golang.org/x/term v0.6.0 // indirect golang.org/x/text v0.8.0 // indirect - golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect + golang.org/x/time v0.3.0 // indirect google.golang.org/appengine v1.6.7 // indirect google.golang.org/genproto v0.0.0-20220502173005-c8bf987b8c21 // indirect google.golang.org/grpc v1.49.0 // indirect @@ -107,9 +110,7 @@ require ( k8s.io/component-base v0.26.2 // indirect k8s.io/kms v0.26.2 // indirect k8s.io/kube-openapi v0.0.0-20230303024457-afdc3dddf62d // indirect - k8s.io/utils v0.0.0-20230220204549-a5ecb0141aa5 // indirect sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.35 // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect - sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect sigs.k8s.io/yaml v1.3.0 // indirect ) diff --git a/go.sum b/go.sum index 47713209..52d5f3f0 100644 --- a/go.sum +++ b/go.sum @@ -47,7 +47,6 @@ github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kd github.com/antlr/antlr4/runtime/Go/antlr v1.4.10 h1:yL7+Jz0jTC6yykIK/Wh74gnTJnrGr5AyrNMXuA0gves= github.com/antlr/antlr4/runtime/Go/antlr v1.4.10/go.mod h1:F7bn7fEU90QkQ3tnmaTx3LTKLEDqnwWODIYppRQ5hnY= github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8= -github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -388,13 +387,12 @@ go.opentelemetry.io/proto/otlp v0.19.0 h1:IVN6GR+mhC4s5yfcTbmzHYODqvWAp3ZedA2SJP go.opentelemetry.io/proto/otlp v0.19.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= -go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= -go.uber.org/goleak v1.1.12 h1:gZAh5/EyT/HQwlpkCy6wTpqfH9H8Lz8zbm3dZh+OyzA= +go.uber.org/goleak v1.2.0 h1:xqgm/S+aQvhWFTtR0XK3Jvg7z8kGV8P4X14IzwN3Eqk= go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= go.uber.org/zap v1.17.0/go.mod h1:MXVU+bhUf/A7Xi2HNOnopQOrmycQ5Ih87HtOu4q5SSo= -go.uber.org/zap v1.19.0 h1:mZQZefskPPCMIBCSEH0v2/iUqqLrYtaeqwD6FUGUnFE= -go.uber.org/zap v1.19.0/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI= +go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60= +go.uber.org/zap v1.24.0/go.mod h1:2kMP+WWQ8aoFoedH3T2sq6iJ2yDWpHbP0f6MQbS9Gkg= golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= @@ -556,8 +554,8 @@ golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 h1:vVKdlvoWBphwdxWKrFZEuM0kGgGLxUOYcY4U/2Vjg44= -golang.org/x/time v0.0.0-20220210224613-90d013bbcef8/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= +golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= @@ -573,7 +571,6 @@ golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgw golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= @@ -741,8 +738,8 @@ honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9 honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= k8s.io/api v0.26.2 h1:dM3cinp3PGB6asOySalOZxEG4CZ0IAdJsrYZXE/ovGQ= k8s.io/api v0.26.2/go.mod h1:1kjMQsFE+QHPfskEcVNgL3+Hp88B80uj0QtSOlj8itU= -k8s.io/apiextensions-apiserver v0.25.1 h1:HEIKlxj6oHaDwHgotEIX/Ld5K/RGuOFwN/TWMiQ5s5s= -k8s.io/apiextensions-apiserver v0.25.1/go.mod h1:67sgnMs2yIO2iV4DpCdS91vlP+pdnVIsG/mz60qRn44= +k8s.io/apiextensions-apiserver v0.26.1 h1:cB8h1SRk6e/+i3NOrQgSFij1B2S0Y0wDoNl66bn8RMI= +k8s.io/apiextensions-apiserver v0.26.1/go.mod h1:AptjOSXDGuE0JICx/Em15PaoO7buLwTs0dGleIHixSM= k8s.io/apimachinery v0.26.2 h1:da1u3D5wfR5u2RpLhE/ZtZS2P7QvDgLZTi9wrNZl/tQ= k8s.io/apimachinery v0.26.2/go.mod h1:ats7nN1LExKHvJ9TmwootT00Yz05MuYqPXEXaVeOy5I= k8s.io/apiserver v0.26.2 h1:Pk8lmX4G14hYqJd1poHGC08G03nIHVqdJMR0SD3IH3o= @@ -766,6 +763,8 @@ rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.35 h1:+xBL5uTc+BkPBwmMi3vYfUJjq+N3K+H6PXeETwf5cPI= sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.35/go.mod h1:WxjusMwXlKzfAs4p9km6XJRndVt2FROgMVCE4cdohFo= +sigs.k8s.io/controller-runtime v0.14.6 h1:oxstGVvXGNnMvY7TAESYk+lzr6S3V5VFxQ6d92KcwQA= +sigs.k8s.io/controller-runtime v0.14.6/go.mod h1:WqIdsAY6JBsjfc/CqO0CORmNtoCtE4S6qbPc9s68h+0= sigs.k8s.io/custom-metrics-apiserver v1.25.1-0.20230306170449-63d8c93851f3 h1:puQ5YlyBjhxg+OQ1YPMJXwtk7WhC4E6AlWIQ9pC8jws= sigs.k8s.io/custom-metrics-apiserver v1.25.1-0.20230306170449-63d8c93851f3/go.mod h1:9nUXR/EgdYZto1aQ6yhwOksPR7J979jSyOqic1IgaOo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= diff --git a/pkg/controller/queuejob/metrics.go b/pkg/controller/queuejob/metrics.go new file mode 100644 index 00000000..cb10a9c5 --- /dev/null +++ b/pkg/controller/queuejob/metrics.go @@ -0,0 +1,45 @@ +/* +Copyright 2023 The Multi-Cluster App Dispatcher Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package queuejob + +import ( + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +func registerMetrics(controller *XController) { + allocatableCapacityCpu := prometheus.NewGaugeFunc(prometheus.GaugeOpts{ + Subsystem: "mcad", + Name: "allocatable_capacity_cpu", + Help: "Allocatable CPU Capacity (in milicores)", + }, func() float64 { return controller.GetAllocatableCapacity().MilliCPU }) + metrics.Registry.MustRegister(allocatableCapacityCpu) + + allocatableCapacityMemory := prometheus.NewGaugeFunc(prometheus.GaugeOpts{ + Subsystem: "mcad", + Name: "allocatable_capacity_memory", + Help: "Allocatable Memory Capacity", + }, func() float64 { return controller.GetAllocatableCapacity().Memory }) + metrics.Registry.MustRegister(allocatableCapacityMemory) + + allocatableCapacityGpu := prometheus.NewGaugeFunc(prometheus.GaugeOpts{ + Subsystem: "mcad", + Name: "allocatable_capacity_gpu", + Help: "Allocatable GPU Capacity", + }, func() float64 { return float64(controller.GetAllocatableCapacity().GPU) }) + metrics.Registry.MustRegister(allocatableCapacityGpu) +} diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index 8d356403..a8e645d7 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -221,6 +221,10 @@ func (qjm *XController) allocatableCapacity() *clusterstateapi.Resource { return capacity } +func (qjm *XController) GetAllocatableCapacity() *clusterstateapi.Resource { + return qjm.allocatableCapacity() +} + // NewJobController create new AppWrapper Controller func NewJobController(restConfig *rest.Config, mcadConfig *config.MCADConfiguration, extConfig *config.MCADConfigurationExtended) *XController { cc := &XController{ @@ -242,6 +246,8 @@ func NewJobController(restConfig *rest.Config, mcadConfig *config.MCADConfigurat // multi-cluster mode, so for now it is turned-off: https://github.com/project-codeflare/multi-cluster-app-dispatcher/issues/585 // cc.metricsAdapter = adapter.New(serverOption, config, cc.cache) + registerMetrics(cc) + cc.genericresources = genericresource.NewAppWrapperGenericResource(restConfig) appWrapperClient, err := clientset.NewForConfig(restConfig) From 7713655646550e1a3908e0886cec25b155e654e7 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Tue, 17 Oct 2023 10:17:53 +0300 Subject: [PATCH 2/3] Update metrics once a minute --- pkg/controller/queuejob/metrics.go | 55 +++++++++++++++---- .../queuejob/queuejob_controller_ex.go | 3 +- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/pkg/controller/queuejob/metrics.go b/pkg/controller/queuejob/metrics.go index cb10a9c5..3ca39c5a 100644 --- a/pkg/controller/queuejob/metrics.go +++ b/pkg/controller/queuejob/metrics.go @@ -18,28 +18,59 @@ package queuejob import ( "github.com/prometheus/client_golang/prometheus" + "k8s.io/klog/v2" "sigs.k8s.io/controller-runtime/pkg/metrics" + "time" ) -func registerMetrics(controller *XController) { - allocatableCapacityCpu := prometheus.NewGaugeFunc(prometheus.GaugeOpts{ +var ( + allocatableCapacityCpu = prometheus.NewGauge(prometheus.GaugeOpts{ Subsystem: "mcad", Name: "allocatable_capacity_cpu", Help: "Allocatable CPU Capacity (in milicores)", - }, func() float64 { return controller.GetAllocatableCapacity().MilliCPU }) - metrics.Registry.MustRegister(allocatableCapacityCpu) - - allocatableCapacityMemory := prometheus.NewGaugeFunc(prometheus.GaugeOpts{ + }) + allocatableCapacityMemory = prometheus.NewGauge(prometheus.GaugeOpts{ Subsystem: "mcad", Name: "allocatable_capacity_memory", Help: "Allocatable Memory Capacity", - }, func() float64 { return controller.GetAllocatableCapacity().Memory }) - metrics.Registry.MustRegister(allocatableCapacityMemory) - - allocatableCapacityGpu := prometheus.NewGaugeFunc(prometheus.GaugeOpts{ + }) + allocatableCapacityGpu = prometheus.NewGauge(prometheus.GaugeOpts{ Subsystem: "mcad", Name: "allocatable_capacity_gpu", Help: "Allocatable GPU Capacity", - }, func() float64 { return float64(controller.GetAllocatableCapacity().GPU) }) - metrics.Registry.MustRegister(allocatableCapacityGpu) + }) +) + +func registerMetrics() { + klog.V(10).Infof("Registering metrics") + metrics.Registry.MustRegister( + allocatableCapacityCpu, + allocatableCapacityMemory, + allocatableCapacityGpu, + ) +} + +func updateMetricsLoop(controller *XController, stopCh <-chan struct{}) { + ticker := time.NewTicker(time.Minute * 1) + go func() { + updateMetrics(controller) + for { + select { + case <-ticker.C: + klog.V(10).Infof("Update metrics loop tick") + updateMetrics(controller) + case <-stopCh: + klog.V(10).Infof("Exiting update metrics loop") + ticker.Stop() + return + } + } + }() +} + +func updateMetrics(controller *XController) { + res := controller.GetAllocatableCapacity() + allocatableCapacityCpu.Set(res.MilliCPU) + allocatableCapacityMemory.Set(res.Memory) + allocatableCapacityGpu.Set(float64(res.GPU)) } diff --git a/pkg/controller/queuejob/queuejob_controller_ex.go b/pkg/controller/queuejob/queuejob_controller_ex.go index a8e645d7..39d75f86 100644 --- a/pkg/controller/queuejob/queuejob_controller_ex.go +++ b/pkg/controller/queuejob/queuejob_controller_ex.go @@ -246,7 +246,7 @@ func NewJobController(restConfig *rest.Config, mcadConfig *config.MCADConfigurat // multi-cluster mode, so for now it is turned-off: https://github.com/project-codeflare/multi-cluster-app-dispatcher/issues/585 // cc.metricsAdapter = adapter.New(serverOption, config, cc.cache) - registerMetrics(cc) + registerMetrics() cc.genericresources = genericresource.NewAppWrapperGenericResource(restConfig) @@ -1459,6 +1459,7 @@ func (qjm *XController) backoff(ctx context.Context, q *arbv1.AppWrapper, reason // Run starts AppWrapper Controller func (cc *XController) Run(stopCh <-chan struct{}) { + updateMetricsLoop(cc, stopCh) go cc.appwrapperInformer.Informer().Run(stopCh) cache.WaitForCacheSync(stopCh, cc.appWrapperSynced) From 2a4a551068f7945395aa4772484151d38943cbf8 Mon Sep 17 00:00:00 2001 From: Ronen Schaffer Date: Wed, 25 Oct 2023 17:15:27 +0300 Subject: [PATCH 3/3] Fix typo --- pkg/controller/queuejob/metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/controller/queuejob/metrics.go b/pkg/controller/queuejob/metrics.go index 3ca39c5a..a916c56d 100644 --- a/pkg/controller/queuejob/metrics.go +++ b/pkg/controller/queuejob/metrics.go @@ -27,7 +27,7 @@ var ( allocatableCapacityCpu = prometheus.NewGauge(prometheus.GaugeOpts{ Subsystem: "mcad", Name: "allocatable_capacity_cpu", - Help: "Allocatable CPU Capacity (in milicores)", + Help: "Allocatable CPU Capacity (in millicores)", }) allocatableCapacityMemory = prometheus.NewGauge(prometheus.GaugeOpts{ Subsystem: "mcad",