From 73ad3d90193b83e5656cda9cec07e3dc866c99c6 Mon Sep 17 00:00:00 2001
From: Tyler Gu <jiaweig3@illinois.edu>
Date: Thu, 5 Sep 2024 16:58:34 -0500
Subject: [PATCH] Support helm and some fixes

Signed-off-by: Tyler Gu <jiaweig3@illinois.edu>
---
 acto/deploy.py                        | 108 +++++++++++++-------------
 acto/kubectl_client/helm.py           |  46 +++++++++++
 acto/kubectl_client/kubectl.py        |  97 +++++++++++++++++------
 acto/kubernetes_engine/kind.py        |   9 ++-
 acto/lib/operator_config.py           |   3 +-
 acto/runner/fault_injection_runner.py |  58 ++++++++++++++
 acto/system_state/replica_set.py      |   8 +-
 7 files changed, 247 insertions(+), 82 deletions(-)
 create mode 100644 acto/kubectl_client/helm.py
 create mode 100644 acto/runner/fault_injection_runner.py

diff --git a/acto/deploy.py b/acto/deploy.py
index a2debb4dfe..c03f11bf42 100644
--- a/acto/deploy.py
+++ b/acto/deploy.py
@@ -1,47 +1,36 @@
+import logging
 import time
 
-import kubernetes
 import yaml
 
 import acto.utils as utils
-from acto.common import *
+from acto.common import kubernetes_client, print_event
 from acto.kubectl_client.kubectl import KubectlClient
 from acto.lib.operator_config import DELEGATED_NAMESPACE, DeployConfig
 from acto.utils import get_thread_logger
 from acto.utils.preprocess import add_acto_label
 
 
-def wait_for_pod_ready(apiclient: kubernetes.client.ApiClient):
-    logger = get_thread_logger(with_prefix=True)
-    logger.debug("Waiting for all pods to be ready")
-    time.sleep(5)
-    pod_ready = False
-    for tick in range(600):
-        # check if all pods are ready
-        pods = kubernetes.client.CoreV1Api(
-            apiclient).list_pod_for_all_namespaces().items
-
-        all_pods_ready = True
-        for pod in pods:
-            if pod.status.phase == "Succeeded":
-                continue
-            if not utils.is_pod_ready(pod):
-                all_pods_ready = False
-
-        if all_pods_ready:
-            logger.info("Operator ready")
-            pod_ready = True
-            break
-        time.sleep(5)
-    logger.info("All pods took %d seconds to get ready" % (tick * 5))
-    if not pod_ready:
-        logger.error("Some pods failed to be ready within timeout")
+def wait_for_pod_ready(kubectl_client: KubectlClient) -> bool:
+    """Wait for all pods to be ready"""
+    now = time.time()
+    p = kubectl_client.wait_for_all_pods(timeout=600)
+    if p.returncode != 0:
+        logging.error(
+            "Failed to wait for all pods to be ready due to error from kubectl"
+            + f" (returncode={p.returncode})"
+            + f" (stdout={p.stdout})"
+            + f" (stderr={p.stderr})"
+        )
         return False
-    else:
-        return True
+    logging.info(
+        "Waited for all pods to be ready for %d seconds", time.time() - now
+    )
+    return True
 
 
-class Deploy():
+class Deploy:
+    """Deploy the operator using the deploy config"""
 
     def __init__(self, deploy_config: DeployConfig) -> None:
         self._deploy_config = deploy_config
@@ -52,24 +41,30 @@ def __init__(self, deploy_config: DeployConfig) -> None:
                 self._operator_yaml = step.apply.file
                 break
         else:
-            raise Exception("No operator yaml found in deploy config")
-        
+            raise RuntimeError("No operator yaml found in deploy config")
+
         # Extract the operator_container_name from config
         self._operator_container_name = None
         for step in self._deploy_config.steps:
             if step.apply and step.apply.operator:
-                self._operator_container_name = step.apply.operator_container_name
+                self._operator_container_name = (
+                    step.apply.operator_container_name
+                )
                 break
 
     @property
     def operator_yaml(self) -> str:
+        """Get the operator yaml file path"""
         return self._operator_yaml
 
-    def deploy(self,
-               kubeconfig: str,
-               context_name: str,
-               kubectl_client: KubectlClient,
-               namespace: str):
+    def deploy(
+        self,
+        kubeconfig: str,
+        context_name: str,
+        kubectl_client: KubectlClient,
+        namespace: str,
+    ):
+        """Deploy the operator using the deploy config"""
         logger = get_thread_logger(with_prefix=True)
         print_event("Deploying operator...")
         api_client = kubernetes_client(kubeconfig, context_name)
@@ -97,14 +92,16 @@ def deploy(self,
                 p = kubectl_client.kubectl(args, capture_output=True)
                 if p.returncode != 0:
                     logger.error(
-                        "Failed to deploy operator due to error from kubectl" +
-                        f" (returncode={p.returncode})" +
-                        f" (stdout={p.stdout})" +
-                        f" (stderr={p.stderr})")
+                        "Failed to deploy operator due to error from kubectl"
+                        + f" (returncode={p.returncode})"
+                        + f" (stdout={p.stdout})"
+                        + f" (stderr={p.stderr})"
+                    )
                     return False
-                elif not wait_for_pod_ready(api_client):
+                elif not wait_for_pod_ready(kubectl_client):
                     logger.error(
-                        "Failed to deploy operator due to timeout waiting for pod to be ready")
+                        "Failed to deploy operator due to timeout waiting for pod to be ready"
+                    )
                     return False
             elif step.wait:
                 # Simply wait for the specified duration
@@ -112,7 +109,7 @@ def deploy(self,
 
         # Add acto label to the operator pod
         add_acto_label(api_client, namespace)
-        if not wait_for_pod_ready(api_client):
+        if not wait_for_pod_ready(kubectl_client):
             logger.error("Failed to deploy operator")
             return False
 
@@ -121,14 +118,17 @@ def deploy(self,
         print_event("Operator deployed")
         return True
 
-    def deploy_with_retry(self,
-                          kubeconfig: str,
-                          context_name: str,
-                          kubectl_client: KubectlClient,
-                          namespace: str,
-                          retry_count: int = 3):
+    def deploy_with_retry(
+        self,
+        kubeconfig: str,
+        context_name: str,
+        kubectl_client: KubectlClient,
+        namespace: str,
+        retry_count: int = 3,
+    ):
+        """Deploy the operator with retry"""
         logger = get_thread_logger(with_prefix=True)
-        for i in range(retry_count):
+        for _ in range(retry_count):
             if self.deploy(kubeconfig, context_name, kubectl_client, namespace):
                 return True
             else:
@@ -136,7 +136,8 @@ def deploy_with_retry(self,
         return False
 
     def operator_name(self) -> str:
-        with open(self._operator_yaml) as f:
+        """Get the name of the operator deployment"""
+        with open(self._operator_yaml, "r", encoding="utf-8") as f:
             operator_yamls = yaml.load_all(f, Loader=yaml.FullLoader)
             for yaml_ in operator_yamls:
                 if yaml_["kind"] == "Deployment":
@@ -145,4 +146,5 @@ def operator_name(self) -> str:
 
     @property
     def operator_container_name(self) -> str:
+        """Get the name of the operator container"""
         return self._operator_container_name
diff --git a/acto/kubectl_client/helm.py b/acto/kubectl_client/helm.py
new file mode 100644
index 0000000000..21d8f15b5f
--- /dev/null
+++ b/acto/kubectl_client/helm.py
@@ -0,0 +1,46 @@
+import subprocess
+from typing import Optional
+
+
+class Helm:
+    """Helm client class"""
+
+    def __init__(self, kubeconfig: str, context_name: str) -> None:
+        self.kubeconfig = kubeconfig
+        self.context_name = context_name
+
+    def helm(self, args: list) -> subprocess.CompletedProcess:
+        """Executes a helm command"""
+        cmd = ["helm"]
+        cmd.extend(args)
+        cmd.extend(["--kubeconfig", self.kubeconfig])
+        cmd.extend(["--kube-context", self.context_name])
+        return subprocess.run(cmd, capture_output=True, text=True, check=False)
+
+    def repo_add(self, name: str, url: str) -> subprocess.CompletedProcess:
+        """Adds a helm repository"""
+        cmd = ["repo", "add", name, url]
+        return self.helm(cmd)
+
+    def install(
+        self,
+        release_name: str,
+        chart: str,
+        namespace: str,
+        repo: Optional[str] = None,
+        args: Optional[list] = None,
+    ) -> subprocess.CompletedProcess:
+        """Installs a helm chart"""
+        cmd = [
+            "install",
+            release_name,
+            chart,
+            "--namespace",
+            namespace,
+            "--create-namespace",
+        ]
+        if repo:
+            cmd.extend(["--repo", repo])
+        if args:
+            cmd.extend(args)
+        return self.helm(cmd)
diff --git a/acto/kubectl_client/kubectl.py b/acto/kubectl_client/kubectl.py
index 8f9ffa805f..da4a8181c8 100644
--- a/acto/kubectl_client/kubectl.py
+++ b/acto/kubectl_client/kubectl.py
@@ -1,44 +1,95 @@
+import logging
 import subprocess
+from typing import Optional
 
 
 class KubectlClient:
+    """Kubectl client class"""
 
     def __init__(self, kubeconfig: str, context_name: str):
 
         if not kubeconfig:
-            raise ValueError('kubeconfig is required')
+            raise ValueError("kubeconfig is required")
         if not context_name:
-            raise ValueError('context_name is required')
+            raise ValueError("context_name is required")
 
         self.kubeconfig = kubeconfig
         self.context_name = context_name
 
-    def exec(self,
-             pod: str,
-             namespace: str,
-             commands: list,
-             capture_output=False,
-             text=False) -> subprocess.CompletedProcess:
-        '''Executes a command in a pod'''
-        cmd = ['exec']
+    def exec(
+        self,
+        pod: str,
+        namespace: str,
+        commands: list,
+        capture_output=False,
+        text=False,
+    ) -> subprocess.CompletedProcess:
+        """Executes a command in a pod"""
+        cmd = ["exec"]
         cmd.extend([pod])
-        cmd.extend(['--namespace', namespace])
-        cmd.extend(['--'])
+        cmd.extend(["--namespace", namespace])
+        cmd.extend(["--"])
         cmd.extend(commands)
 
         return self.kubectl(cmd, capture_output, text)
 
-    def kubectl(self,
-                args: list,
-                capture_output=False,
-                text=False,
-                timeout: int = 600) -> subprocess.CompletedProcess:
-        '''Executes a kubectl command'''
-        cmd = ['kubectl']
-        cmd.extend(['--kubeconfig', self.kubeconfig])
-        cmd.extend(['--context', self.context_name])
+    def kubectl(
+        self, args: list, capture_output=False, text=False, timeout: int = 600
+    ) -> subprocess.CompletedProcess:
+        """Executes a kubectl command"""
+        cmd = ["kubectl"]
+        cmd.extend(["--kubeconfig", self.kubeconfig])
+        cmd.extend(["--context", self.context_name])
 
         cmd.extend(args)
 
-        p = subprocess.run(cmd, capture_output=capture_output, text=text, timeout=timeout)
-        return p
\ No newline at end of file
+        logging.info("Running kubectl command: %s", " ".join(cmd))
+        p = subprocess.run(
+            cmd,
+            capture_output=capture_output,
+            text=text,
+            timeout=timeout,
+            check=False,
+        )
+        return p
+
+    def wait(
+        self,
+        file: str,
+        for_condition: str,
+        timeout: int = 600,
+        namespace: Optional[str] = None,
+    ) -> subprocess.CompletedProcess:
+        """Waits for a condition to be true"""
+        cmd = [
+            "wait",
+            "-f",
+            file,
+            "--for",
+            for_condition,
+            "--timeout",
+            f"{timeout}s",
+        ]
+        if namespace:
+            cmd.extend(["-n", namespace])
+        else:
+            cmd.extend(["--all-namespaces"])
+        return self.kubectl(cmd, capture_output=True, text=True)
+
+    def wait_for_all_pods(
+        self, timeout: int = 600, namespace: Optional[str] = None
+    ) -> subprocess.CompletedProcess:
+        """Waits for all pods to be ready"""
+        cmd = [
+            "wait",
+            "--for=condition=Ready",
+            "--timeout",
+            f"{timeout}s",
+            "pods",
+            "--all",
+        ]
+        if namespace:
+            cmd.extend(["-n", namespace])
+        else:
+            cmd.extend(["--all-namespaces"])
+        return self.kubectl(cmd, capture_output=True, text=True)
diff --git a/acto/kubernetes_engine/kind.py b/acto/kubernetes_engine/kind.py
index 09dac36752..ee5c1d00cf 100644
--- a/acto/kubernetes_engine/kind.py
+++ b/acto/kubernetes_engine/kind.py
@@ -2,7 +2,7 @@
 import os
 import subprocess
 import time
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional
 
 import kubernetes
 import yaml
@@ -22,7 +22,7 @@ def __init__(
         posthooks: List[base.KubernetesEnginePostHookType] = None,
         feature_gates: Dict[str, bool] = None,
         num_nodes=1,
-        version="",
+        version: Optional[str] = None,
     ):
         self._config_path = os.path.join(
             CONST.CLUSTER_CONFIG_FOLDER, f"KIND-{acto_namespace}.yaml"
@@ -39,7 +39,7 @@ def __init__(
         extra_mounts.append(
             {"hostPath": "profile/data", "containerPath": "/tmp/profile"}
         )
-        for _ in range(num_nodes - 1):
+        for _ in range(num_nodes):
             config_dict["nodes"].append(
                 {
                     "role": "worker",
@@ -108,7 +108,8 @@ def create_cluster(self, name: str, kubeconfig: str):
 
         cmd.extend(["--config", self._config_path])
 
-        cmd.extend(["--image", f"kindest/node:{self._k8s_version}"])
+        if self._k8s_version:
+            cmd.extend(["--image", f"kindest/node:{self._k8s_version}"])
 
         p = subprocess.run(cmd, check=False)
         i = 0
diff --git a/acto/lib/operator_config.py b/acto/lib/operator_config.py
index 9d50e37006..0543c3d061 100644
--- a/acto/lib/operator_config.py
+++ b/acto/lib/operator_config.py
@@ -8,7 +8,8 @@
 class ApplyStep(pydantic.BaseModel, extra="forbid"):
     """Configuration for each step of kubectl apply"""
 
-    file: str = pydantic.Field(description="Path to the file for kubectl apply")
+    file: str = pydantic.Field(
+        description="Path to the file for kubectl apply")
     operator: bool = pydantic.Field(
         description="If the file contains the operator deployment",
         default=False,
diff --git a/acto/runner/fault_injection_runner.py b/acto/runner/fault_injection_runner.py
new file mode 100644
index 0000000000..65259c4f1f
--- /dev/null
+++ b/acto/runner/fault_injection_runner.py
@@ -0,0 +1,58 @@
+"""Runner module for Acto"""
+
+import logging
+from typing import Callable
+
+import kubernetes
+
+from acto.common import kubernetes_client
+from acto.kubectl_client import KubectlClient
+from acto.snapshot import Snapshot
+
+RunnerHookType = Callable[[kubernetes.client.ApiClient], None]
+CustomSystemStateHookType = Callable[
+    [kubernetes.client.ApiClient, str, int], dict
+]
+
+
+class FaultInjectionRunner:
+    """Runner class for Acto.
+    This class is used to run the cmd and collect system state,
+    delta, operator log, events and input files.
+    """
+
+    def __init__(
+        self,
+        trial_dir: str,
+        kubeconfig: str,
+        context_name: str,
+    ):
+        self.trial_dir = trial_dir
+        self.kubeconfig = kubeconfig
+        self.context_name = context_name
+
+        self.kubectl_client = KubectlClient(kubeconfig, context_name)
+
+        apiclient = kubernetes_client(kubeconfig, context_name)
+
+    def run(
+        self,
+        file_path: str,
+        namespace: str,
+    ) -> tuple[Snapshot, bool]:
+        """Apply the input CR"""
+
+        cmd = ["apply", "-f", file_path, "-n", namespace]
+
+        # submit the CR
+        cli_result = self.kubectl_client.kubectl(
+            cmd, capture_output=True, text=True
+        )
+
+        if cli_result.returncode != 0:
+            logging.error(
+                "kubectl apply failed with return code %d",
+                cli_result.returncode,
+            )
+            logging.error("STDOUT: %s", cli_result.stdout)
+            logging.error("STDERR: %s", cli_result.stderr)
diff --git a/acto/system_state/replica_set.py b/acto/system_state/replica_set.py
index 7cf40816ef..2a2ec33d9b 100644
--- a/acto/system_state/replica_set.py
+++ b/acto/system_state/replica_set.py
@@ -1,4 +1,5 @@
 """ReplicaSet state model."""
+
 import kubernetes
 import kubernetes.client.models as kubernetes_models
 import pydantic
@@ -38,7 +39,12 @@ def check_health(self) -> tuple[bool, str]:
             ):
                 return False, f"ReplicaSet[{name}] generation mismatch"
 
-            if replica_set.spec.replicas != replica_set.status.ready_replicas:
+            if (
+                replica_set.status.ready_replicas is None
+                and replica_set.spec.replicas == 0
+            ):
+                pass
+            elif replica_set.spec.replicas != replica_set.status.ready_replicas:
                 return False, f"ReplicaSet[{name}] replicas mismatch"
 
             if replica_set.status.conditions is not None: