Skip to content

Commit

Permalink
fix: improve emulator cloud provider reliability
Browse files Browse the repository at this point in the history
Copy the labels from machine request to the machine request
status.

Use a single client for rtnetlink watches and management.
Retry service account creation.

Signed-off-by: Artem Chernyshev <[email protected]>
  • Loading branch information
Unix4ever committed Aug 16, 2024
1 parent 9b87697 commit edecbfe
Show file tree
Hide file tree
Showing 23 changed files with 480 additions and 226 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2024-04-23T19:38:57Z by kres 0610b40-dirty.
# Generated on 2024-08-14T14:55:16Z by kres 7be2a05.

_out
hack/compose/docker-compose.override.yml
hack/compose/docker-compose-provider.override.yml
72 changes: 72 additions & 0 deletions .kres.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ kind: common.Build
spec:
ignoredPaths:
- "hack/compose/docker-compose.override.yml"
- "hack/compose/docker-compose-provider.override.yml"
---
kind: service.CodeCov
spec:
Expand All @@ -24,6 +25,10 @@ spec:
toplevel: true
- name: docker-compose-down
toplevel: true
- name: docker-compose-provider-up
toplevel: true
- name: docker-compose-provider-down
toplevel: true
---
kind: custom.Step
name: docker-compose-up
Expand Down Expand Up @@ -91,3 +96,70 @@ spec:
DOCKER_BUILDKIT=1
GO_LDFLAGS="$(GO_LDFLAGS)"
docker compose -p talemu --file ./hack/compose/docker-compose.yml --file ./hack/compose/docker-compose.override.yml down --rmi local --remove-orphans --volumes=$(REMOVE_VOLUMES)
---
# Makefile target that builds and starts the emulator cloud provider with
# Docker Compose (project name `talemu-cloud-provider`).
kind: custom.Step
name: docker-compose-provider-up
spec:
  makefile:
    enabled: true
    phony: true
    script:
      # Folded (`>-`) into one shell line: the assignments become the
      # environment of the trailing `docker compose` command.
      - >-
        ARTIFACTS="$(ARTIFACTS)"
        SHA="$(SHA)"
        TAG="$(TAG)"
        USERNAME="$(USERNAME)"
        REGISTRY="$(REGISTRY)"
        PROTOBUF_TS_VERSION="$(PROTOBUF_TS_VERSION)"
        NODE_BUILD_ARGS="$(NODE_BUILD_ARGS)"
        TOOLCHAIN="$(TOOLCHAIN)"
        CGO_ENABLED="$(CGO_ENABLED)"
        GO_BUILDFLAGS="$(GO_BUILDFLAGS)"
        GOLANGCILINT_VERSION="$(GOLANGCILINT_VERSION)"
        GOFUMPT_VERSION="$(GOFUMPT_VERSION)"
        GOIMPORTS_VERSION="$(GOIMPORTS_VERSION)"
        PROTOBUF_GO_VERSION="$(PROTOBUF_GO_VERSION)"
        GRPC_GO_VERSION="$(GRPC_GO_VERSION)"
        GRPC_GATEWAY_VERSION="$(GRPC_GATEWAY_VERSION)"
        VTPROTOBUF_VERSION="$(VTPROTOBUF_VERSION)"
        DEEPCOPY_VERSION="$(DEEPCOPY_VERSION)"
        TESTPKGS="$(TESTPKGS)"
        COMPOSE_DOCKER_CLI_BUILD=1
        DOCKER_BUILDKIT=1
        GO_LDFLAGS="$(GO_LDFLAGS)"
        docker compose -p talemu-cloud-provider --file ./hack/compose/docker-compose-provider.yml --file ./hack/compose/docker-compose-provider.override.yml up --build
---
# Makefile target that stops the `talemu-cloud-provider` Compose project and
# removes its locally built images and orphaned containers.
kind: custom.Step
name: docker-compose-provider-down
spec:
  makefile:
    enabled: true
    phony: true
    variables:
      # Substituted into `--volumes=$(REMOVE_VOLUMES)` below. Quoted so the
      # default stays the string "false" instead of a YAML boolean.
      - name: REMOVE_VOLUMES
        defaultValue: "false"
    script:
      # Folded (`>-`) into one shell line: the assignments become the
      # environment of the trailing `docker compose` command.
      - >-
        ARTIFACTS="$(ARTIFACTS)"
        SHA="$(SHA)"
        TAG="$(TAG)"
        USERNAME="$(USERNAME)"
        REGISTRY="$(REGISTRY)"
        PROTOBUF_TS_VERSION="$(PROTOBUF_TS_VERSION)"
        NODE_BUILD_ARGS="$(NODE_BUILD_ARGS)"
        TOOLCHAIN="$(TOOLCHAIN)"
        CGO_ENABLED="$(CGO_ENABLED)"
        GO_BUILDFLAGS="$(GO_BUILDFLAGS)"
        GOLANGCILINT_VERSION="$(GOLANGCILINT_VERSION)"
        GOFUMPT_VERSION="$(GOFUMPT_VERSION)"
        GOIMPORTS_VERSION="$(GOIMPORTS_VERSION)"
        PROTOBUF_GO_VERSION="$(PROTOBUF_GO_VERSION)"
        GRPC_GO_VERSION="$(GRPC_GO_VERSION)"
        GRPC_GATEWAY_VERSION="$(GRPC_GATEWAY_VERSION)"
        VTPROTOBUF_VERSION="$(VTPROTOBUF_VERSION)"
        DEEPCOPY_VERSION="$(DEEPCOPY_VERSION)"
        TESTPKGS="$(TESTPKGS)"
        COMPOSE_DOCKER_CLI_BUILD=1
        DOCKER_BUILDKIT=1
        GO_LDFLAGS="$(GO_LDFLAGS)"
        docker compose -p talemu-cloud-provider --file ./hack/compose/docker-compose-provider.yml --file ./hack/compose/docker-compose-provider.override.yml down --rmi local --remove-orphans --volumes=$(REMOVE_VOLUMES)
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2024-07-31T12:19:21Z by kres faf91e3.
# Generated on 2024-08-14T14:55:16Z by kres 7be2a05.

ARG TOOLCHAIN

Expand All @@ -11,7 +11,7 @@ FROM ghcr.io/siderolabs/ca-certificates:v1.7.0 AS image-ca-certificates
FROM ghcr.io/siderolabs/fhs:v1.7.0 AS image-fhs

# runs markdownlint
FROM docker.io/oven/bun:1.1.20-alpine AS lint-markdown
FROM docker.io/oven/bun:1.1.22-alpine AS lint-markdown
WORKDIR /src
RUN bun i [email protected] [email protected]
COPY .markdownlint.json .
Expand Down
24 changes: 16 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# THIS FILE WAS AUTOMATICALLY GENERATED, PLEASE DO NOT EDIT.
#
# Generated on 2024-07-31T12:19:21Z by kres faf91e3.
# Generated on 2024-08-14T14:58:43Z by kres 7be2a05.

# common variables

Expand All @@ -18,14 +18,14 @@ REGISTRY ?= ghcr.io
USERNAME ?= siderolabs
REGISTRY_AND_USERNAME ?= $(REGISTRY)/$(USERNAME)
PROTOBUF_GO_VERSION ?= 1.34.2
GRPC_GO_VERSION ?= 1.4.0
GRPC_GATEWAY_VERSION ?= 2.20.0
GRPC_GO_VERSION ?= 1.5.1
GRPC_GATEWAY_VERSION ?= 2.21.0
VTPROTOBUF_VERSION ?= 0.6.0
GOIMPORTS_VERSION ?= 0.23.0
GOIMPORTS_VERSION ?= 0.24.0
DEEPCOPY_VERSION ?= v0.5.6
GOLANGCILINT_VERSION ?= v1.59.1
GOLANGCILINT_VERSION ?= v1.60.1
GOFUMPT_VERSION ?= v0.6.0
GO_VERSION ?= 1.22.5
GO_VERSION ?= 1.23.0
GO_BUILDFLAGS ?=
GO_LDFLAGS ?=
CGO_ENABLED ?= 0
Expand Down Expand Up @@ -67,7 +67,7 @@ COMMON_ARGS += --build-arg=DEEPCOPY_VERSION="$(DEEPCOPY_VERSION)"
COMMON_ARGS += --build-arg=GOLANGCILINT_VERSION="$(GOLANGCILINT_VERSION)"
COMMON_ARGS += --build-arg=GOFUMPT_VERSION="$(GOFUMPT_VERSION)"
COMMON_ARGS += --build-arg=TESTPKGS="$(TESTPKGS)"
TOOLCHAIN ?= docker.io/golang:1.22-alpine
TOOLCHAIN ?= docker.io/golang:1.23-alpine

# extra variables

Expand Down Expand Up @@ -135,7 +135,7 @@ else
GO_LDFLAGS += -s
endif

all: unit-tests talemu image-talemu talemu-cloud-provider image-talemu-cloud-provider docker-compose-up docker-compose-down lint
all: unit-tests talemu image-talemu talemu-cloud-provider image-talemu-cloud-provider docker-compose-up docker-compose-down docker-compose-provider-up docker-compose-provider-down lint

$(ARTIFACTS): ## Creates artifacts directory.
@mkdir -p $(ARTIFACTS)
Expand Down Expand Up @@ -225,6 +225,14 @@ docker-compose-up:
docker-compose-down:
ARTIFACTS="$(ARTIFACTS)" SHA="$(SHA)" TAG="$(TAG)" USERNAME="$(USERNAME)" REGISTRY="$(REGISTRY)" PROTOBUF_TS_VERSION="$(PROTOBUF_TS_VERSION)" NODE_BUILD_ARGS="$(NODE_BUILD_ARGS)" TOOLCHAIN="$(TOOLCHAIN)" CGO_ENABLED="$(CGO_ENABLED)" GO_BUILDFLAGS="$(GO_BUILDFLAGS)" GOLANGCILINT_VERSION="$(GOLANGCILINT_VERSION)" GOFUMPT_VERSION="$(GOFUMPT_VERSION)" GOIMPORTS_VERSION="$(GOIMPORTS_VERSION)" PROTOBUF_GO_VERSION="$(PROTOBUF_GO_VERSION)" GRPC_GO_VERSION="$(GRPC_GO_VERSION)" GRPC_GATEWAY_VERSION="$(GRPC_GATEWAY_VERSION)" VTPROTOBUF_VERSION="$(VTPROTOBUF_VERSION)" DEEPCOPY_VERSION="$(DEEPCOPY_VERSION)" TESTPKGS="$(TESTPKGS)" COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 GO_LDFLAGS="$(GO_LDFLAGS)" docker compose -p talemu --file ./hack/compose/docker-compose.yml --file ./hack/compose/docker-compose.override.yml down --rmi local --remove-orphans --volumes=$(REMOVE_VOLUMES)

.PHONY: docker-compose-provider-up
docker-compose-provider-up:
ARTIFACTS="$(ARTIFACTS)" SHA="$(SHA)" TAG="$(TAG)" USERNAME="$(USERNAME)" REGISTRY="$(REGISTRY)" PROTOBUF_TS_VERSION="$(PROTOBUF_TS_VERSION)" NODE_BUILD_ARGS="$(NODE_BUILD_ARGS)" TOOLCHAIN="$(TOOLCHAIN)" CGO_ENABLED="$(CGO_ENABLED)" GO_BUILDFLAGS="$(GO_BUILDFLAGS)" GOLANGCILINT_VERSION="$(GOLANGCILINT_VERSION)" GOFUMPT_VERSION="$(GOFUMPT_VERSION)" GOIMPORTS_VERSION="$(GOIMPORTS_VERSION)" PROTOBUF_GO_VERSION="$(PROTOBUF_GO_VERSION)" GRPC_GO_VERSION="$(GRPC_GO_VERSION)" GRPC_GATEWAY_VERSION="$(GRPC_GATEWAY_VERSION)" VTPROTOBUF_VERSION="$(VTPROTOBUF_VERSION)" DEEPCOPY_VERSION="$(DEEPCOPY_VERSION)" TESTPKGS="$(TESTPKGS)" COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 GO_LDFLAGS="$(GO_LDFLAGS)" docker compose -p talemu-cloud-provider --file ./hack/compose/docker-compose-provider.yml --file ./hack/compose/docker-compose-provider.override.yml up --build

.PHONY: docker-compose-provider-down
docker-compose-provider-down:
ARTIFACTS="$(ARTIFACTS)" SHA="$(SHA)" TAG="$(TAG)" USERNAME="$(USERNAME)" REGISTRY="$(REGISTRY)" PROTOBUF_TS_VERSION="$(PROTOBUF_TS_VERSION)" NODE_BUILD_ARGS="$(NODE_BUILD_ARGS)" TOOLCHAIN="$(TOOLCHAIN)" CGO_ENABLED="$(CGO_ENABLED)" GO_BUILDFLAGS="$(GO_BUILDFLAGS)" GOLANGCILINT_VERSION="$(GOLANGCILINT_VERSION)" GOFUMPT_VERSION="$(GOFUMPT_VERSION)" GOIMPORTS_VERSION="$(GOIMPORTS_VERSION)" PROTOBUF_GO_VERSION="$(PROTOBUF_GO_VERSION)" GRPC_GO_VERSION="$(GRPC_GO_VERSION)" GRPC_GATEWAY_VERSION="$(GRPC_GATEWAY_VERSION)" VTPROTOBUF_VERSION="$(VTPROTOBUF_VERSION)" DEEPCOPY_VERSION="$(DEEPCOPY_VERSION)" TESTPKGS="$(TESTPKGS)" COMPOSE_DOCKER_CLI_BUILD=1 DOCKER_BUILDKIT=1 GO_LDFLAGS="$(GO_LDFLAGS)" docker compose -p talemu-cloud-provider --file ./hack/compose/docker-compose-provider.yml --file ./hack/compose/docker-compose-provider.override.yml down --rmi local --remove-orphans --volumes=$(REMOVE_VOLUMES)

.PHONY: rekres
rekres:
@docker pull $(KRES_IMAGE)
Expand Down
29 changes: 25 additions & 4 deletions cmd/talemu-cloud-provider/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (

emuruntime "github.com/siderolabs/talemu/internal/pkg/emu"
"github.com/siderolabs/talemu/internal/pkg/kubefactory"
"github.com/siderolabs/talemu/internal/pkg/machine/network"
"github.com/siderolabs/talemu/internal/pkg/machine/runtime"
"github.com/siderolabs/talemu/internal/pkg/machine/runtime/resources/emu"
"github.com/siderolabs/talemu/internal/pkg/provider"
Expand All @@ -50,9 +51,19 @@ var rootCmd = &cobra.Command{
}

if cfg.createServiceAccount {
err = createServiceAccount(cmd.Context())
if err != nil {
return err
for {
err = createServiceAccount(cmd.Context())
if err == nil {
break
}

logger.Error("failed to create service account", zap.Error(err))

select {
case <-cmd.Context().Done():
return err
case <-time.After(time.Second * 5):
}
}
}

Expand Down Expand Up @@ -97,7 +108,15 @@ var rootCmd = &cobra.Command{
return err
}

if err = provider.RegisterControllers(runtime, kubernetes); err != nil {
nc := network.NewClient()

if err = nc.Run(cmd.Context()); err != nil {
return err
}

defer nc.Close() //nolint:errcheck

if err = provider.RegisterControllers(runtime, kubernetes, nc); err != nil {
return err
}

Expand All @@ -113,6 +132,8 @@ func createServiceAccount(ctx context.Context) error {
return err
}

defer rootClient.Close() //nolint:errcheck

name := access.CloudProviderServiceAccountPrefix + meta.ProviderID

sa := access.ParseServiceAccountFromName(name)
Expand Down
15 changes: 12 additions & 3 deletions cmd/talemu/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
emuruntime "github.com/siderolabs/talemu/internal/pkg/emu"
"github.com/siderolabs/talemu/internal/pkg/kubefactory"
"github.com/siderolabs/talemu/internal/pkg/machine"
"github.com/siderolabs/talemu/internal/pkg/machine/network"
"github.com/siderolabs/talemu/internal/pkg/machine/runtime"
"github.com/siderolabs/talemu/internal/pkg/machine/runtime/resources/emu"
)
Expand Down Expand Up @@ -81,17 +82,25 @@ var rootCmd = &cobra.Command{
return runtime.Run(ctx)
})

nc := network.NewClient()

if err = nc.Run(cmd.Context()); err != nil {
return err
}

defer nc.Close() //nolint:errcheck

for i := range cfg.machinesCount {
machine, err := machine.NewMachine(fmt.Sprintf("%04d1802-c798-4da7-a410-f09abb48c8d8", i+1000), logger, emulatorState)
m, err := machine.NewMachine(fmt.Sprintf("%04d1802-c798-4da7-a410-f09abb48c8d8", i+1000), logger, emulatorState)
if err != nil {
return err
}

eg.Go(func() error {
return machine.Run(ctx, params, i+1000, kubernetes)
return m.Run(ctx, params, i+1000, kubernetes, machine.WithNetworkClient(nc))
})

machines = append(machines, machine)
machines = append(machines, m)
}

var errors error
Expand Down
36 changes: 36 additions & 0 deletions hack/compose/docker-compose-provider.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Docker Compose stack that builds and runs the emulator cloud provider.
# The top-level `version` key is intentionally omitted: Compose v2
# (`docker compose`) treats it as obsolete and only warns about it.
services:
  talemu-cloud-provider:
    volumes:
      # Named volume (declared at the bottom) mounted at /_out/provider.
      - state:/_out/provider
    container_name: talemu-cloud-provider
    restart: on-failure
    cap_add:
      # NOTE(review): presumably needed for the provider's network (rtnetlink)
      # management inside the container; confirm before removing.
      - NET_ADMIN
    build:
      target: image-talemu-cloud-provider
      context: ../../
      dockerfile: Dockerfile
      args:
        # `${VAR:?error}` aborts when VAR is unset or empty; plain `${VAR}`
        # entries are optional and may be empty.
        - ARTIFACTS=${ARTIFACTS:?error}
        - SHA=${SHA:?error}
        - TAG=${TAG:?error}
        - USERNAME=${USERNAME:?error}
        - REGISTRY=${REGISTRY:?error}
        - NODE_BUILD_ARGS=${NODE_BUILD_ARGS}
        - TOOLCHAIN=${TOOLCHAIN:?error}
        - CGO_ENABLED=${CGO_ENABLED:?error}
        - GO_BUILDFLAGS=${GO_BUILDFLAGS}
        - GOLANGCILINT_VERSION=${GOLANGCILINT_VERSION:?error}
        - GOFUMPT_VERSION=${GOFUMPT_VERSION:?error}
        - GOIMPORTS_VERSION=${GOIMPORTS_VERSION:?error}
        - PROTOBUF_GO_VERSION=${PROTOBUF_GO_VERSION:?error}
        - GRPC_GO_VERSION=${GRPC_GO_VERSION:?error}
        - GRPC_GATEWAY_VERSION=${GRPC_GATEWAY_VERSION:?error}
        - VTPROTOBUF_VERSION=${VTPROTOBUF_VERSION:?error}
        - DEEPCOPY_VERSION=${DEEPCOPY_VERSION:?error}
        - TESTPKGS=${TESTPKGS:?error}
        - GO_LDFLAGS=${GO_LDFLAGS}

volumes:
  state:
17 changes: 16 additions & 1 deletion internal/pkg/emu/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package emu

import (
"context"
"time"

"github.com/cosi-project/runtime/pkg/controller"
"github.com/cosi-project/runtime/pkg/controller/runtime"
Expand Down Expand Up @@ -70,5 +71,19 @@ func (rt *Runtime) RegisterController(ctrl controller.Controller) error {
// Run starts the global runtime and restarts it whenever it stops with an
// error.
//
// It returns nil when the underlying runtime stops without an error. If the
// runtime fails, the error is logged and the runtime is started again after a
// 10 second pause; if ctx is done during that pause, the last runtime error is
// returned instead.
func (rt *Runtime) Run(ctx context.Context) error {
	rt.logger.Info("starting global runtime")

	for {
		err := rt.runtime.Run(ctx)
		if err == nil {
			return nil
		}

		rt.logger.Error("global runtime crashed", zap.Error(err))

		// Back off before restarting, but stop retrying once the caller
		// cancels the context.
		select {
		case <-ctx.Done():
			return err
		case <-time.After(time.Second * 10):
		}
	}
}
Loading

0 comments on commit edecbfe

Please sign in to comment.