PRODENG-2850 Reworked drain node logic and upgrade the aws-simple tf chart #547

Merged
merged 4 commits on Jan 28, 2025
4 changes: 2 additions & 2 deletions Makefile
@@ -85,11 +85,11 @@ integration-test:

.PHONY: smoke-small
smoke-small:
go test -v ./test/smoke/... -run TestSmallCluster -timeout 20m
go test -count=1 -v ./test/smoke/... -run TestSmallCluster -timeout 20m

.PHONY: smoke-full
smoke-full:
go test -v ./test/smoke/... -run TestSupportedMatrixCluster -timeout 40m
go test -count=1 -v ./test/smoke/... -run TestSupportedMatrixCluster -timeout 50m

.PHONY: clean-launchpad-chart
clean-launchpad-chart:
3 changes: 0 additions & 3 deletions examples/terraform/aws-simple/.terraform.lock.hcl

Some generated files are not rendered by default.

23 changes: 23 additions & 0 deletions examples/terraform/aws-simple/iam.tf
@@ -0,0 +1,23 @@
resource "aws_iam_role" "common_role" {
name = "common-iam-role-${var.name}"
path = "/"
assume_role_policy = data.aws_iam_policy_document.assume_role.json
}

data "aws_iam_policy_document" "assume_role" {
statement {
effect = "Allow"

principals {
type = "Service"
identifiers = ["ec2.amazonaws.com"]
}

actions = ["sts:AssumeRole"]
}
}

resource "aws_iam_instance_profile" "common_profile" {
name = "common-instance-profile-${var.name}"
role = aws_iam_role.common_role.name
}
2 changes: 2 additions & 0 deletions examples/terraform/aws-simple/provision.tf
@@ -26,6 +26,8 @@ module "provision" {
role : ngd.role
public : ngd.public
user_data : ngd.user_data
instance_profile_name : aws_iam_instance_profile.common_profile.name
tags : local.tags
} }

// ingress/lb (should likely merge with an input to allow more flexibility
71 changes: 35 additions & 36 deletions examples/terraform/aws-simple/terraform.tfvars.template
@@ -1,8 +1,8 @@
// used to name infrastructure (CHANGE THIS)
name = "mcc-smoke-test"
//name = "test"

aws = {
region = "us-east-1"
region = "us-east-2"
}

launchpad = {
@@ -14,55 +14,54 @@ launchpad = {

mke_connect = {
username = "admin"
password = "" // an MKE passwords must be provided
password = "" // an MKE passwords must be provided
insecure = false
}

skip_create = false
skip_destroy = true // don't bother running launchpad reset
}

// configure the network stack
network = {
cidr = "172.31.0.0/16"
"cidr": "172.31.0.0/16",
"enable_nat_gateway": false,
"enable_vpn_gateway": false,
"tags": {}
}

// configure the subnets in the vpc
subnets = {
"Main" = {
cidr = "172.31.0.0/17"
nodegroups = ["ACon", "AWrk_Ubu22", "AWrk_Roc9", "AWrk_Win2022"]
private = false
"main" = {
"cidr" = "172.31.0.0/17",
"private" = false,
"nodegroups" = ["MngrUbuntu22", "WrkUbuntu22"]
}
}


// machine node groups by role & platform
// one definition for each group of machines to include in the stack
nodegroups = {
"ACon" = { // managers for A group
role = "manager"
platform = "ubuntu_22.04"
count = 1
type = "m6a.2xlarge"
"MngrUbuntu22" = {
"platform" = "ubuntu_22.04",
"count" = 1,
"type" = "m6a.2xlarge",
"volume_size" = "100",
"public" = true,
"role" = "manager",
"user_data" = "sudo ufw allow 7946/tcp ; sudo ufw allow 10250/tcp "
},
"AWrk_Ubu22" = { // workers for A group
role = "worker"
platform = "ubuntu_22.04"
count = 1
type = "c6a.xlarge"
volume_size = 100
},
"AWrk_Roc9" = { // workers for A group
role = "worker"
platform = "rocky_9"
count = 1
type = "c6a.xlarge"
volume_size = 100
},
// "AWrk_Win2022" = {
// role = "worker"
// platform = "windows_core_2022"
// count = 1
// type = "c6a.xlarge"
// },
"WrkUbuntu22" = {
"platform" = "ubuntu_22.04",
"count" = 1,
"type" = "c6a.xlarge",
"volume_size" = "100",
"public" = true,
"role" = "worker",
"user_data" = "sudo ufw allow 7946/tcp ; sudo ufw allow 10250/tcp "
}
}

// set a windows password, if you have windows nodes
// windows passwords must match a pattern, or connections will fail.
// use something like: `testp@ss!`
# windows_password = ""
// windows_password = ""
6 changes: 5 additions & 1 deletion pkg/product/mke/api/cluster_spec.go
@@ -8,6 +8,7 @@ import (
"strconv"
"strings"
"sync"
"time"

"github.com/Mirantis/mcc/pkg/constant"
common "github.com/Mirantis/mcc/pkg/product/common/api"
@@ -267,7 +268,10 @@ func pingHost(h *Host, address string, waitgroup *sync.WaitGroup, errCh chan<- e
}
return nil
},
retry.Attempts(12), // last attempt should wait ~7min
retry.MaxJitter(time.Second*3),
retry.Delay(time.Second*30),
retry.DelayType(retry.FixedDelay),
retry.Attempts(10), // should try for ~5min
)
if err != nil {
errCh <- fmt.Errorf("MKE health check failed: %w", err)
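
For context on the retry change above: the new options give a fixed 30-second delay with up to 3 seconds of jitter over 10 attempts, roughly 5 minutes in total, instead of the previous 12-attempt configuration. Below is a minimal, self-contained sketch of the same retry profile using the avast/retry-go options shown in the diff. The /_ping endpoint, the plain http.Get call, the package name, and the v4 import path are assumptions for illustration only; the repository's actual pingHost handles TLS and error reporting differently.

package healthcheck

import (
	"fmt"
	"net/http"
	"time"

	retry "github.com/avast/retry-go/v4" // import path is an assumption; the repo may pin a different major version
)

// waitForMKE polls a health endpoint with the retry profile from the diff:
// a fixed 30s delay between attempts, up to 3s of jitter, and 10 attempts,
// which works out to roughly 5 minutes of waiting before giving up.
func waitForMKE(address string) error {
	return retry.Do(
		func() error {
			resp, err := http.Get(fmt.Sprintf("https://%s/_ping", address)) // endpoint is illustrative
			if err != nil {
				return fmt.Errorf("ping %s: %w", address, err)
			}
			defer resp.Body.Close()
			if resp.StatusCode != http.StatusOK {
				return fmt.Errorf("ping %s: unexpected status %d", address, resp.StatusCode)
			}
			return nil
		},
		retry.Delay(time.Second*30),
		retry.MaxJitter(time.Second*3),
		retry.DelayType(retry.FixedDelay),
		retry.Attempts(10),
	)
}
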
32 changes: 26 additions & 6 deletions pkg/product/mke/phase/uninstall_mcr.go
@@ -22,6 +22,32 @@ func (p *UninstallMCR) Title() string {

// Run installs the engine on each host.
func (p *UninstallMCR) Run() error {
workers := p.Config.Spec.Workers()
managers := p.Config.Spec.Managers()
swarmLeader := p.Config.Spec.SwarmLeader()

// Drain all workers
for _, h := range workers {
if err := mcr.DrainNode(swarmLeader, h); err != nil {
return fmt.Errorf("%s: drain worker node: %w", h, err)
}
}

// Drain all managers
for _, h := range managers {
if swarmLeader.Address() == h.Address() {
continue
}
if err := mcr.DrainNode(swarmLeader, h); err != nil {
return fmt.Errorf("%s: draining manager node: %w", h, err)
}
}

// Drain the leader
if err := mcr.DrainNode(swarmLeader, swarmLeader); err != nil {
return fmt.Errorf("%s: drain leader node: %w", swarmLeader, err)
}

Collaborator:
Why drain separately from uninstallMCR in this staged approach? Why not just uninstall with the drain included, rather than staging it? Is it because the leader might get deleted before all of the managers are drained?

Contributor (author):
Yes, there is a big issue if the lead manager gets MCR uninstalled before we drain all the other nodes. When that happened previously, we got errors draining the remaining nodes.

Collaborator:
You already keep the leader drain for the end; that keeps the leader safe even if you don't separate the drain from the removal.

if err := phase.RunParallelOnHosts(p.Config.Spec.Hosts, p.Config, p.uninstallMCR); err != nil {
return fmt.Errorf("uninstall container runtime: %w", err)
}
@@ -31,12 +57,6 @@ func (p *UninstallMCR) Run() error {
func (p *UninstallMCR) uninstallMCR(h *api.Host, config *api.ClusterConfig) error {
log.Infof("%s: uninstalling container runtime", h)

leader := config.Spec.SwarmLeader()

if err := mcr.DrainNode(leader, h); err != nil {
return fmt.Errorf("%s: drain node: %w", h, err)
}

uVolumeCmd := h.Configurer.DockerCommandf("volume prune -f")
log.Infof("%s: unmounted dangling volumes", h)

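
To summarize the discussion in this file: the reworked phase now drains every node through the swarm leader before any MCR uninstall begins, and the leader is drained last so it can keep serving drain requests for the rest of the cluster. A minimal sketch of that ordering follows, assuming a local docker CLI stands in for the leader's connection; drainViaLeader and drainAll are hypothetical names rather than the repository's mcr.DrainNode API, and the real helper does more (waiting for tasks to reschedule, logging, and so on).

package drainsketch

import (
	"fmt"
	"os/exec"
)

// drainViaLeader is a hypothetical stand-in for mcr.DrainNode: it marks a node
// as drained so swarm reschedules its tasks before MCR is removed from that
// node. Here the command runs against the local docker CLI; the real code runs
// it over the leader host's connection.
func drainViaLeader(nodeName string) error {
	cmd := exec.Command("docker", "node", "update", "--availability", "drain", nodeName)
	if out, err := cmd.CombinedOutput(); err != nil {
		return fmt.Errorf("drain %s: %w: %s", nodeName, err, out)
	}
	return nil
}

// drainAll mirrors the ordering in the reworked Run(): all workers first, then
// the non-leader managers, and finally the leader itself. Only after this
// completes does the phase uninstall MCR on every host in parallel.
func drainAll(workers, managers []string, leader string) error {
	for _, n := range workers {
		if err := drainViaLeader(n); err != nil {
			return err
		}
	}
	for _, n := range managers {
		if n == leader {
			continue // drain the leader last
		}
		if err := drainViaLeader(n); err != nil {
			return err
		}
	}
	return drainViaLeader(leader)
}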