From a6a2f4d43e893a0dacdafae944c61c98b0e5707c Mon Sep 17 00:00:00 2001 From: lukasmetzner Date: Tue, 12 Nov 2024 15:44:17 +0100 Subject: [PATCH 1/2] feat: Added new option enableProvidedByTopology We are reintroducing a feature originally present in v2.10.0 to prevent pods from getting stuck in the `pending` state in clusters with non-cloud nodes. This feature is now optional and can be enabled via the Helm Chart. By default, it remains disabled to avoid compatibility issues with Nomad clusters, which have a different CSI spec implementation. --- chart/templates/controller/deployment.yaml | 4 +++ chart/templates/core/storageclass.yaml | 8 ++++++ chart/templates/node/daemonset.yaml | 4 +++ chart/values.yaml | 4 +++ cmd/aio/main.go | 7 +++++ cmd/controller/main.go | 7 +++++ cmd/node/main.go | 6 ++++ docs/kubernetes/README.md | 12 ++++++-- internal/driver/controller.go | 31 +++++++++++++-------- internal/driver/controller_test.go | 8 ++++++ internal/driver/driver.go | 1 + internal/driver/node.go | 32 ++++++++++++++-------- internal/driver/node_test.go | 1 + internal/driver/sanity_test.go | 2 ++ 14 files changed, 102 insertions(+), 25 deletions(-) diff --git a/chart/templates/controller/deployment.yaml b/chart/templates/controller/deployment.yaml index 5d178257..f375d8c3 100644 --- a/chart/templates/controller/deployment.yaml +++ b/chart/templates/controller/deployment.yaml @@ -167,6 +167,10 @@ spec: key: {{ .Values.controller.hcloudToken.existingSecret.key }} {{- end }} {{- end }} + {{- if .Values.global.enableProvidedByTopology}} + - name: ENABLE_PROVIDED_BY_TOPOLOGY + value: "t" + {{ end }} {{- if .Values.controller.extraEnvVars }} {{- include "common.tplvalues.render" (dict "value" .Values.controller.extraEnvVars "context" $) | nindent 12 }} {{- end }} diff --git a/chart/templates/core/storageclass.yaml b/chart/templates/core/storageclass.yaml index 1bc246d2..f5f282c8 100644 --- a/chart/templates/core/storageclass.yaml +++ b/chart/templates/core/storageclass.yaml @@ -1,3 +1,4 @@ +{{ $global := .Values.global }} {{- if .Values.storageClasses }} {{- range $key, $val := .Values.storageClasses }} kind: StorageClass @@ -10,6 +11,13 @@ provisioner: csi.hetzner.cloud volumeBindingMode: WaitForFirstConsumer allowVolumeExpansion: true reclaimPolicy: {{ $val.reclaimPolicy | quote }} +{{- if $global.enableProvidedByTopology }} +allowedTopologies: +- matchLabelExpressions: + - key: instance.hetzner.cloud/provided-by + values: + - "cloud" +{{- end}} --- {{- end }} {{- end }} \ No newline at end of file diff --git a/chart/templates/node/daemonset.yaml b/chart/templates/node/daemonset.yaml index ec1ee4c5..cde83e3b 100644 --- a/chart/templates/node/daemonset.yaml +++ b/chart/templates/node/daemonset.yaml @@ -121,6 +121,10 @@ spec: {{- end }} - name: ENABLE_METRICS value: {{if .Values.metrics.enabled}}"true"{{ else }}"false"{{end}} + {{- if .Values.global.enableProvidedByTopology}} + - name: ENABLE_PROVIDED_BY_TOPOLOGY + value: "t" + {{ end }} {{- if .Values.node.extraEnvVars }} {{- include "common.tplvalues.render" (dict "value" .Values.node.extraEnvVars "context" $) | nindent 12 }} {{- end }} diff --git a/chart/values.yaml b/chart/values.yaml index 481862f7..36af3a02 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -11,6 +11,9 @@ global: ## - myRegistryKeySecretName ## imagePullSecrets: [] + ## @param node.enableProvidedByTopology Enables workaround for upstream Kubernetes issue where nodes without the CSI plugin are still considered for scheduling. + ## + enableProvidedByTopology: false ## @section Common parameters ## @@ -648,6 +651,7 @@ node: ## kubeletDir: /var/lib/k0s/kubelet ## For microk8s: ## kubeletDir: /var/snap/microk8s/common/var/lib/kubelet + ## kubeletDir: /var/lib/kubelet ## @section Other Parameters diff --git a/cmd/aio/main.go b/cmd/aio/main.go index b610195a..7d41ccae 100644 --- a/cmd/aio/main.go +++ b/cmd/aio/main.go @@ -63,10 +63,16 @@ func main() { volumeResizeService := volumes.NewLinuxResizeService(logger.With("component", "linux-resize-service")) volumeStatsService := volumes.NewLinuxStatsService(logger.With("component", "linux-stats-service")) + var enableProvidedByTopology bool + if featFlag, exists := os.LookupEnv("ENABLE_PROVIDED_BY_TOPOLOGY"); exists { + enableProvidedByTopology, _ = strconv.ParseBool(featFlag) + } + nodeService := driver.NewNodeService( logger.With("component", "driver-node-service"), strconv.FormatInt(serverID, 10), serverLocation, + enableProvidedByTopology, volumeMountService, volumeResizeService, volumeStatsService, @@ -84,6 +90,7 @@ func main() { logger.With("component", "driver-controller-service"), volumeService, serverLocation, + enableProvidedByTopology, ) // common diff --git a/cmd/controller/main.go b/cmd/controller/main.go index 27a5c769..709195b7 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -3,6 +3,7 @@ package main import ( "log/slog" "os" + "strconv" proto "github.com/container-storage-interface/spec/lib/go/csi" @@ -54,6 +55,11 @@ func main() { location = server.Datacenter.Location.Name } + var enableProvidedByTopology bool + if featFlag, exists := os.LookupEnv("ENABLE_PROVIDED_BY_TOPOLOGY"); exists { + enableProvidedByTopology, _ = strconv.ParseBool(featFlag) + } + volumeService := volumes.NewIdempotentService( logger.With("component", "idempotent-volume-service"), api.NewVolumeService( @@ -65,6 +71,7 @@ func main() { logger.With("component", "driver-controller-service"), volumeService, location, + enableProvidedByTopology, ) identityService := driver.NewIdentityService( diff --git a/cmd/node/main.go b/cmd/node/main.go index 772ec47e..7af914c0 100644 --- a/cmd/node/main.go +++ b/cmd/node/main.go @@ -53,10 +53,16 @@ func main() { volumeStatsService := volumes.NewLinuxStatsService(logger.With("component", "linux-stats-service")) identityService := driver.NewIdentityService(logger.With("component", "driver-identity-service")) + var enableProvidedByTopology bool + if featFlag, exists := os.LookupEnv("ENABLE_PROVIDED_BY_TOPOLOGY"); exists { + enableProvidedByTopology, _ = strconv.ParseBool(featFlag) + } + nodeService := driver.NewNodeService( logger.With("component", "driver-node-service"), strconv.FormatInt(serverID, 10), serverLocation, + enableProvidedByTopology, volumeMountService, volumeResizeService, volumeStatsService, diff --git a/docs/kubernetes/README.md b/docs/kubernetes/README.md index fe85dc71..6f75e041 100644 --- a/docs/kubernetes/README.md +++ b/docs/kubernetes/README.md @@ -209,9 +209,17 @@ $ kubectl apply -f https://raw.githubusercontent.com/hetznercloud/csi-driver/v2. ## Integration with Root Servers -Root servers can be part of the cluster, but the CSI plugin doesn't work there. +Root servers can be part of the cluster, but the CSI plugin doesn't work there and the current behaviour of the scheduler can cause Pods to be stuck in `Pending`. -Labels are needed to indicate whether a node is a cloud VM or a dedicated server from Robot. If you are using the `hcloud-cloud-controller-manager` version 1.21.0 or later, these labels are added automatically. Otherwise, you will need to label the nodes manually. +To address this behavior, you can set `enableProvidedByTopology` to `true` in the Helm Chart configuration. This setting prevents pods from being scheduled on nodes — specifically, Robot servers — where Hetzner volumes are unavailable. Enabling this option adds the `instance.hetzner.cloud/provided-by` label to the [allowed topologies](https://kubernetes.io/docs/concepts/storage/storage-classes/#allowed-topologies) section of the storage classes that are created. Additionally, this label is included in the `topologyKeys` section of `csinode` objects, and a node affinity is set up for each persistent volume. + + +```yaml +global: + enableProvidedByTopology: true +``` + +To ensure proper topology evaluation, labels are needed to indicate whether a node is a cloud VM or a dedicated server from Robot. If you are using the `hcloud-cloud-controller-manager` version 1.21.0 or later, these labels are added automatically. Otherwise, you will need to label the nodes manually. ### Adding labels manually diff --git a/internal/driver/controller.go b/internal/driver/controller.go index d7572145..2591a920 100644 --- a/internal/driver/controller.go +++ b/internal/driver/controller.go @@ -18,20 +18,23 @@ import ( type ControllerService struct { proto.UnimplementedControllerServer - logger *slog.Logger - volumeService volumes.Service - location string + logger *slog.Logger + volumeService volumes.Service + location string + enableProvidedByTopology bool } func NewControllerService( logger *slog.Logger, volumeService volumes.Service, location string, + enableProvidedByTopology bool, ) *ControllerService { return &ControllerService{ - logger: logger, - volumeService: volumeService, - location: location, + logger: logger, + volumeService: volumeService, + location: location, + enableProvidedByTopology: enableProvidedByTopology, } } @@ -88,16 +91,22 @@ func (s *ControllerService) CreateVolume(ctx context.Context, req *proto.CreateV "volume-name", volume.Name, ) + topology := &proto.Topology{ + Segments: map[string]string{ + TopologySegmentLocation: volume.Location, + }, + } + + if s.enableProvidedByTopology { + topology.Segments[ProvidedByLabel] = "cloud" + } + resp := &proto.CreateVolumeResponse{ Volume: &proto.Volume{ VolumeId: strconv.FormatInt(volume.ID, 10), CapacityBytes: volume.SizeBytes(), AccessibleTopology: []*proto.Topology{ - { - Segments: map[string]string{ - TopologySegmentLocation: volume.Location, - }, - }, + topology, }, VolumeContext: map[string]string{ "fsFormatOptions": req.Parameters["fsFormatOptions"], diff --git a/internal/driver/controller_test.go b/internal/driver/controller_test.go index 8b781714..d1f7c0b5 100644 --- a/internal/driver/controller_test.go +++ b/internal/driver/controller_test.go @@ -33,6 +33,7 @@ func newControllerServiceTestEnv() *controllerServiceTestEnv { logger, volumeService, "testloc", + false, ), volumeService: volumeService, } @@ -41,6 +42,8 @@ func newControllerServiceTestEnv() *controllerServiceTestEnv { func TestControllerServiceCreateVolume(t *testing.T) { env := newControllerServiceTestEnv() + env.service.enableProvidedByTopology = true + env.volumeService.CreateFunc = func(ctx context.Context, opts volumes.CreateOpts) (*csi.Volume, error) { if opts.Name != "testvol" { t.Errorf("unexpected name passed to volume service: %s", opts.Name) @@ -94,6 +97,11 @@ func TestControllerServiceCreateVolume(t *testing.T) { if loc := top.Segments[TopologySegmentLocation]; loc != "testloc" { t.Errorf("unexpected location segment in topology: %s", loc) } + if env.service.enableProvidedByTopology { + if provider := top.Segments[ProvidedByLabel]; provider != "cloud" { + t.Errorf("unexpected provider segment in topology: %s", provider) + } + } } else { t.Errorf("unexpected number of topologies: %d", len(resp.Volume.AccessibleTopology)) } diff --git a/internal/driver/driver.go b/internal/driver/driver.go index 8fbcd0f6..ce7d8c29 100644 --- a/internal/driver/driver.go +++ b/internal/driver/driver.go @@ -9,4 +9,5 @@ const ( DefaultVolumeSize = MinVolumeSize TopologySegmentLocation = PluginName + "/location" + ProvidedByLabel = "instance.hetzner.cloud/provided-by" ) diff --git a/internal/driver/node.go b/internal/driver/node.go index 5b084ea1..0e03ee44 100644 --- a/internal/driver/node.go +++ b/internal/driver/node.go @@ -15,29 +15,32 @@ import ( type NodeService struct { proto.UnimplementedNodeServer - logger *slog.Logger - serverID string - serverLocation string - volumeMountService volumes.MountService - volumeResizeService volumes.ResizeService - volumeStatsService volumes.StatsService + logger *slog.Logger + serverID string + serverLocation string + enableProvidedByTopology bool + volumeMountService volumes.MountService + volumeResizeService volumes.ResizeService + volumeStatsService volumes.StatsService } func NewNodeService( logger *slog.Logger, serverID string, serverLocation string, + enableProvidedByTopology bool, volumeMountService volumes.MountService, volumeResizeService volumes.ResizeService, volumeStatsService volumes.StatsService, ) *NodeService { return &NodeService{ - logger: logger, - serverID: serverID, - serverLocation: serverLocation, - volumeMountService: volumeMountService, - volumeResizeService: volumeResizeService, - volumeStatsService: volumeStatsService, + logger: logger, + serverID: serverID, + serverLocation: serverLocation, + enableProvidedByTopology: enableProvidedByTopology, + volumeMountService: volumeMountService, + volumeResizeService: volumeResizeService, + volumeStatsService: volumeStatsService, } } @@ -181,6 +184,11 @@ func (s *NodeService) NodeGetInfo(_ context.Context, _ *proto.NodeGetInfoRequest }, }, } + + if s.enableProvidedByTopology { + resp.AccessibleTopology.Segments[ProvidedByLabel] = "cloud" + } + return resp, nil } diff --git a/internal/driver/node_test.go b/internal/driver/node_test.go index 624c2afc..22a19823 100644 --- a/internal/driver/node_test.go +++ b/internal/driver/node_test.go @@ -35,6 +35,7 @@ func newNodeServerTestEnv() nodeServiceTestEnv { testutil.NewNopLogger(), "1", "loc", + false, volumeMountService, volumeResizeService, volumeStatsService, diff --git a/internal/driver/sanity_test.go b/internal/driver/sanity_test.go index 43094309..4b34609c 100644 --- a/internal/driver/sanity_test.go +++ b/internal/driver/sanity_test.go @@ -46,6 +46,7 @@ func TestSanity(t *testing.T) { logger.With("component", "driver-controller-service"), volumeService, "testloc", + false, ) identityService := NewIdentityService( @@ -56,6 +57,7 @@ func TestSanity(t *testing.T) { logger.With("component", "driver-node-service"), "123456", "loc", + false, volumeMountService, volumeResizeService, volumeStatsService, From 4f06570b643e6720ae6cebd924a70d7f09d718b8 Mon Sep 17 00:00:00 2001 From: lukasmetzner Date: Tue, 12 Nov 2024 16:03:37 +0100 Subject: [PATCH 2/2] refactor: created function for parsing enableProvidedByTopology feature --- cmd/aio/main.go | 5 +---- cmd/controller/main.go | 6 +----- cmd/node/main.go | 5 +---- internal/app/app.go | 9 +++++++++ 4 files changed, 12 insertions(+), 13 deletions(-) diff --git a/cmd/aio/main.go b/cmd/aio/main.go index 7d41ccae..84abb09d 100644 --- a/cmd/aio/main.go +++ b/cmd/aio/main.go @@ -63,10 +63,7 @@ func main() { volumeResizeService := volumes.NewLinuxResizeService(logger.With("component", "linux-resize-service")) volumeStatsService := volumes.NewLinuxStatsService(logger.With("component", "linux-stats-service")) - var enableProvidedByTopology bool - if featFlag, exists := os.LookupEnv("ENABLE_PROVIDED_BY_TOPOLOGY"); exists { - enableProvidedByTopology, _ = strconv.ParseBool(featFlag) - } + enableProvidedByTopology := app.GetEnableProvidedByTopology() nodeService := driver.NewNodeService( logger.With("component", "driver-node-service"), diff --git a/cmd/controller/main.go b/cmd/controller/main.go index 709195b7..a29e6e26 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -3,7 +3,6 @@ package main import ( "log/slog" "os" - "strconv" proto "github.com/container-storage-interface/spec/lib/go/csi" @@ -55,10 +54,7 @@ func main() { location = server.Datacenter.Location.Name } - var enableProvidedByTopology bool - if featFlag, exists := os.LookupEnv("ENABLE_PROVIDED_BY_TOPOLOGY"); exists { - enableProvidedByTopology, _ = strconv.ParseBool(featFlag) - } + enableProvidedByTopology := app.GetEnableProvidedByTopology() volumeService := volumes.NewIdempotentService( logger.With("component", "idempotent-volume-service"), diff --git a/cmd/node/main.go b/cmd/node/main.go index 7af914c0..c3daed82 100644 --- a/cmd/node/main.go +++ b/cmd/node/main.go @@ -53,10 +53,7 @@ func main() { volumeStatsService := volumes.NewLinuxStatsService(logger.With("component", "linux-stats-service")) identityService := driver.NewIdentityService(logger.With("component", "driver-identity-service")) - var enableProvidedByTopology bool - if featFlag, exists := os.LookupEnv("ENABLE_PROVIDED_BY_TOPOLOGY"); exists { - enableProvidedByTopology, _ = strconv.ParseBool(featFlag) - } + enableProvidedByTopology := app.GetEnableProvidedByTopology() nodeService := driver.NewNodeService( logger.With("component", "driver-node-service"), diff --git a/internal/app/app.go b/internal/app/app.go index 7118240e..9051907a 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -49,6 +49,15 @@ func CreateLogger() *slog.Logger { return logger } +// GetEnableProvidedByTopology parses the ENABLE_PROVIDED_BY_TOPOLOGY environment variable and returns false by default. +func GetEnableProvidedByTopology() bool { + var enableProvidedByTopology bool + if featFlag, exists := os.LookupEnv("ENABLE_PROVIDED_BY_TOPOLOGY"); exists { + enableProvidedByTopology, _ = strconv.ParseBool(featFlag) + } + return enableProvidedByTopology +} + // CreateListener creates and binds the unix socket in location specified by the CSI_ENDPOINT environment variable. func CreateListener() (net.Listener, error) { endpoint := os.Getenv("CSI_ENDPOINT")