PRODENG-2850 Reworked drain node logic and upgrade the aws-simple tf chart #547

Merged
merged 4 commits on Jan 28, 2025
4 changes: 2 additions & 2 deletions Makefile
@@ -85,11 +85,11 @@ integration-test:

.PHONY: smoke-small
smoke-small:
go test -v ./test/smoke/... -run TestSmallCluster -timeout 20m
go test -count=1 -v ./test/smoke/... -run TestSmallCluster -timeout 20m

.PHONY: smoke-full
smoke-full:
go test -v ./test/smoke/... -run TestSupportedMatrixCluster -timeout 40m
go test -count=1 -v ./test/smoke/... -run TestSupportedMatrixCluster -timeout 50m

.PHONY: clean-launchpad-chart
clean-launchpad-chart:
3 changes: 0 additions & 3 deletions examples/terraform/aws-simple/.terraform.lock.hcl

Some generated files are not rendered by default.

23 changes: 23 additions & 0 deletions examples/terraform/aws-simple/iam.tf
@@ -0,0 +1,23 @@
resource "aws_iam_role" "common_role" {
name = "common-iam-role-${var.name}"
path = "/"
assume_role_policy = data.aws_iam_policy_document.assume_role.json
}

data "aws_iam_policy_document" "assume_role" {
statement {
effect = "Allow"

principals {
type = "Service"
identifiers = ["ec2.amazonaws.com"]
}

actions = ["sts:AssumeRole"]
}
}

resource "aws_iam_instance_profile" "common_profile" {
name = "common-instance-profile-${var.name}"
role = aws_iam_role.common_role.name
}
2 changes: 2 additions & 0 deletions examples/terraform/aws-simple/provision.tf
@@ -26,6 +26,8 @@ module "provision" {
role : ngd.role
public : ngd.public
user_data : ngd.user_data
instance_profile_name : aws_iam_instance_profile.common_profile.name
tags : local.tags
} }

// ingress/lb (should likely merge with an input to allow more flexibility
71 changes: 35 additions & 36 deletions examples/terraform/aws-simple/terraform.tfvars.template
@@ -1,8 +1,8 @@
// used to name infrastructure (CHANGE THIS)
name = "mcc-smoke-test"
//name = "test"

aws = {
region = "us-east-1"
region = "us-east-2"
}

launchpad = {
@@ -14,55 +14,54 @@ launchpad = {

mke_connect = {
username = "admin"
password = "" // an MKE passwords must be provided
password = "" // an MKE passwords must be provided
insecure = false
}

skip_create = false
skip_destroy = true // don't bother running launchpad reset
}

// configure the network stack
network = {
cidr = "172.31.0.0/16"
"cidr": "172.31.0.0/16",
"enable_nat_gateway": false,
"enable_vpn_gateway": false,
"tags": {}
}

// configure the subnets in the vpc
subnets = {
"Main" = {
cidr = "172.31.0.0/17"
nodegroups = ["ACon", "AWrk_Ubu22", "AWrk_Roc9", "AWrk_Win2022"]
private = false
"main" = {
"cidr" = "172.31.0.0/17",
"private" = false,
"nodegroups" = ["MngrUbuntu22", "WrkUbuntu22"]
}
}


// machine node groups by role & platform
// one definition for each group of machines to include in the stack
nodegroups = {
"ACon" = { // managers for A group
role = "manager"
platform = "ubuntu_22.04"
count = 1
type = "m6a.2xlarge"
"MngrUbuntu22" = {
"platform" = "ubuntu_22.04",
"count" = 1,
"type" = "m6a.2xlarge",
"volume_size" = "100",
"public" = true,
"role" = "manager",
"user_data" = "sudo ufw allow 7946/tcp ; sudo ufw allow 10250/tcp "
},
"AWrk_Ubu22" = { // workers for A group
role = "worker"
platform = "ubuntu_22.04"
count = 1
type = "c6a.xlarge"
volume_size = 100
},
"AWrk_Roc9" = { // workers for A group
role = "worker"
platform = "rocky_9"
count = 1
type = "c6a.xlarge"
volume_size = 100
},
// "AWrk_Win2022" = {
// role = "worker"
// platform = "windows_core_2022"
// count = 1
// type = "c6a.xlarge"
// },
"WrkUbuntu22" = {
"platform" = "ubuntu_22.04",
"count" = 1,
"type" = "c6a.xlarge",
"volume_size" = "100",
"public" = true,
"role" = "worker",
"user_data" = "sudo ufw allow 7946/tcp ; sudo ufw allow 10250/tcp "
}
}

// set a windows password, if you have windows nodes
// windows passwords must match a pattern, or connections will fail.
// use something like: `testp@ss!`
# windows_password = ""
// windows_password = ""
6 changes: 5 additions & 1 deletion pkg/product/mke/api/cluster_spec.go
@@ -8,6 +8,7 @@ import (
"strconv"
"strings"
"sync"
"time"

"github.com/Mirantis/mcc/pkg/constant"
common "github.com/Mirantis/mcc/pkg/product/common/api"
@@ -267,7 +268,10 @@ func pingHost(h *Host, address string, waitgroup *sync.WaitGroup, errCh chan<- e
}
return nil
},
retry.Attempts(12), // last attempt should wait ~7min
retry.MaxJitter(time.Second*3),
retry.Delay(time.Second*30),
retry.DelayType(retry.FixedDelay),
retry.Attempts(10), // should try for ~5min
)
if err != nil {
errCh <- fmt.Errorf("MKE health check failed: %w", err)
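
For context on the retry change above: the new options give a fixed 30-second delay with up to 3 seconds of jitter over 10 attempts, roughly 5 minutes in total, instead of the previous 12-attempt configuration. Below is a minimal, self-contained sketch of the same retry profile using the avast/retry-go options shown in the diff. The /_ping endpoint, the plain http.Get call, the package name, and the v4 import path are assumptions for illustration only; the repository's actual pingHost handles TLS and error reporting differently.

package healthcheck

import (
	"fmt"
	"net/http"
	"time"

	retry "github.com/avast/retry-go/v4" // import path is an assumption; the repo may pin a different major version
)

// waitForMKE polls a health endpoint with the retry profile from the diff:
// a fixed 30s delay between attempts, up to 3s of jitter, and 10 attempts,
// which works out to roughly 5 minutes of waiting before giving up.
func waitForMKE(address string) error {
	return retry.Do(
		func() error {
			resp, err := http.Get(fmt.Sprintf("https://%s/_ping", address)) // endpoint is illustrative
			if err != nil {
				return fmt.Errorf("ping %s: %w", address, err)
			}
			defer resp.Body.Close()
			if resp.StatusCode != http.StatusOK {
				return fmt.Errorf("ping %s: unexpected status %d", address, resp.StatusCode)
			}
			return nil
		},
		retry.Delay(time.Second*30),
		retry.MaxJitter(time.Second*3),
		retry.DelayType(retry.FixedDelay),
		retry.Attempts(10),
	)
}
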
32 changes: 26 additions & 6 deletions pkg/product/mke/phase/uninstall_mcr.go
@@ -22,6 +22,32 @@ func (p *UninstallMCR) Title() string {

// Run installs the engine on each host.
func (p *UninstallMCR) Run() error {
workers := p.Config.Spec.Workers()
managers := p.Config.Spec.Managers()
swarmLeader := p.Config.Spec.SwarmLeader()

// Drain all workers
for _, h := range workers {
if err := mcr.DrainNode(swarmLeader, h); err != nil {
return fmt.Errorf("%s: drain worker node: %w", h, err)
}
}

// Drain all managers
for _, h := range managers {
if swarmLeader.Address() == h.Address() {
continue
}
if err := mcr.DrainNode(swarmLeader, h); err != nil {
return fmt.Errorf("%s: draining manager node: %w", h, err)
}
}

// Drain the leader
if err := mcr.DrainNode(swarmLeader, swarmLeader); err != nil {
return fmt.Errorf("%s: drain leader node: %w", swarmLeader, err)
}

Collaborator:
Why drain separately from uninstallMCR in this staged approach? Why not just uninstall with the drain included, rather than staging it? Is it because the leader might get deleted before all of the managers are drained?

Contributor (author):
Yes, there is a big issue if the lead manager gets MCR uninstalled before we drain all the other nodes. When that happened previously, we got errors draining the remaining nodes.

Collaborator:
You already keep the leader drain for the end; that keeps the leader safe even if you don't separate the drain from the removal.

if err := phase.RunParallelOnHosts(p.Config.Spec.Hosts, p.Config, p.uninstallMCR); err != nil {
return fmt.Errorf("uninstall container runtime: %w", err)
}
@@ -31,12 +57,6 @@ func (p *UninstallMCR) Run() error {
func (p *UninstallMCR) uninstallMCR(h *api.Host, config *api.ClusterConfig) error {
log.Infof("%s: uninstalling container runtime", h)

leader := config.Spec.SwarmLeader()

if err := mcr.DrainNode(leader, h); err != nil {
return fmt.Errorf("%s: drain node: %w", h, err)
}

uVolumeCmd := h.Configurer.DockerCommandf("volume prune -f")
log.Infof("%s: unmounted dangling volumes", h)

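
To summarize the discussion in this file: the reworked phase now drains every node through the swarm leader before any MCR uninstall begins, and the leader is drained last so it can keep serving drain requests for the rest of the cluster. A minimal sketch of that ordering follows, assuming a local docker CLI stands in for the leader's connection; drainViaLeader and drainAll are hypothetical names rather than the repository's mcr.DrainNode API, and the real helper does more (waiting for tasks to reschedule, logging, and so on).

package drainsketch

import (
	"fmt"
	"os/exec"
)

// drainViaLeader is a hypothetical stand-in for mcr.DrainNode: it marks a node
// as drained so swarm reschedules its tasks before MCR is removed from that
// node. Here the command runs against the local docker CLI; the real code runs
// it over the leader host's connection.
func drainViaLeader(nodeName string) error {
	cmd := exec.Command("docker", "node", "update", "--availability", "drain", nodeName)
	if out, err := cmd.CombinedOutput(); err != nil {
		return fmt.Errorf("drain %s: %w: %s", nodeName, err, out)
	}
	return nil
}

// drainAll mirrors the ordering in the reworked Run(): all workers first, then
// the non-leader managers, and finally the leader itself. Only after this
// completes does the phase uninstall MCR on every host in parallel.
func drainAll(workers, managers []string, leader string) error {
	for _, n := range workers {
		if err := drainViaLeader(n); err != nil {
			return err
		}
	}
	for _, n := range managers {
		if n == leader {
			continue // drain the leader last
		}
		if err := drainViaLeader(n); err != nil {
			return err
		}
	}
	return drainViaLeader(leader)
}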