Krkn telemetry integration (#435)

* adapted config.yaml to the new feature * temporarily pointing requirements.txt to the lib feature branch * run_kraken.py + arcaflow scenarios refactoring typo * plugin scenario * node scenarios return failed scenarios * container scenarios fix * time scenarios * cluster shutdown scenarios * namespace scenarios * zone outage scenarios * app outage scenarios * pvc scenarios * network chaos scenarios * run_kraken.py adaptation to telemetry * prometheus telemetry upload + config.yaml some fixes typos and logs max retries in config telemetry id with run_uuid safe_logger * catch send_telemetry exception * scenario collection bug fixes * telemetry enabled check * telemetry run tag * requirements pointing to main + archive_size * requirements.txt and config.yaml update * added telemetry config to common config * fixed scenario array elements for telemetry
krkn-chaos · Aug 10, 2023 · 39c0152 · 39c0152
1 parent 491dc17
commit 39c0152
Show file tree

Hide file tree

Showing 19 changed files with 960 additions and 579 deletions.
diff --git a/CI/config/common_test_config.yaml b/CI/config/common_test_config.yaml
@@ -29,3 +29,15 @@ tunings:
     wait_duration: 6                                       # Duration to wait between each chaos scenario.
     iterations: 1                                          # Number of times to execute the scenarios.
     daemon_mode: False                                     # Iterations are set to infinity which means that the kraken will cause chaos forever.
+telemetry:
+    enabled: False                                           # enables/disables the telemetry collection feature
+    api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint
+    username: username                                      # telemetry service username
+    password: password                                      # telemetry service password
+    prometheus_backup: True                                 # enables/disables prometheus data collection
+    full_prometheus_backup: False                           # if set to False, only the /prometheus/wal folder will be downloaded.
+    backup_threads: 5                                       # number of telemetry download/upload threads
+    archive_path: /tmp                                      # local path where the archive files will be temporarily stored
+    max_retries: 0                                          # maximum number of upload retries (if 0 will retry forever)
+    run_tag: ''                                             # if set, this will be appended to the run folder in the bucket (useful to group the runs)
+    archive_size: 10000                                     # the size of the prometheus data archive in KB. The lower the archive size, the more archive files are produced and uploaded.
diff --git a/config/config.yaml b/config/config.yaml
@@ -7,41 +7,8 @@ kraken:
     signal_address: 0.0.0.0                                # Signal listening address
     port: 8081                                             # Signal port
     chaos_scenarios:                                       # List of policies/chaos scenarios to load
-        -   arcaflow_scenarios:
-                - scenarios/arcaflow/cpu-hog/input.yaml
-                - scenarios/arcaflow/memory-hog/input.yaml
-        -   container_scenarios:                                 # List of chaos pod scenarios to load
-            - -    scenarios/openshift/container_etcd.yml
-        -   plugin_scenarios:
-            - scenarios/openshift/etcd.yml
-            - scenarios/openshift/regex_openshift_pod_kill.yml
-            - scenarios/openshift/vmware_node_scenarios.yml
-            - scenarios/openshift/ibmcloud_node_scenarios.yml
-            - scenarios/openshift/network_chaos_ingress.yml
-            - scenarios/openshift/pod_network_outage.yml
-            - scenarios/openshift/pod_network_shaping.yml
-        -   node_scenarios:                                # List of chaos node scenarios to load
-            -   scenarios/openshift/node_scenarios_example.yml
-        -   plugin_scenarios:
-            - scenarios/openshift/openshift-apiserver.yml
-            - scenarios/openshift/openshift-kube-apiserver.yml
-        -   time_scenarios:                                # List of chaos time scenarios to load
-            - scenarios/openshift/time_scenarios_example.yml
-        -   cluster_shut_down_scenarios:
-            - - scenarios/openshift/cluster_shut_down_scenario.yml
-              - scenarios/openshift/post_action_shut_down.py
-        -   namespace_scenarios:
-             - - scenarios/openshift/regex_namespace.yaml
-             - - scenarios/openshift/ingress_namespace.yaml
-               - scenarios/openshift/post_action_namespace.py
-        -   zone_outages:
-            - scenarios/openshift/zone_outage.yaml
-        -   application_outages:
-            - scenarios/openshift/app_outage.yaml
-        -   pvc_scenarios:
-            - scenarios/openshift/pvc_scenario.yaml
-        -   network_chaos:
-            - scenarios/openshift/network_chaos.yaml
+        - application_outages:
+              - scenarios/openshift/app_outage.yaml
 
 cerberus:
     cerberus_enabled: False                                # Enable it when cerberus is previously installed
@@ -65,3 +32,20 @@ tunings:
     wait_duration: 60                                      # Duration to wait between each chaos scenario
     iterations: 1                                          # Number of times to execute the scenarios
     daemon_mode: False                                     # Iterations are set to infinity which means that the kraken will cause chaos forever
+telemetry:
+    enabled: False                                           # enables/disables the telemetry collection feature
+    api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint
+    username: username                                      # telemetry service username
+    password: password                                      # telemetry service password
+    prometheus_backup: True                                 # enables/disables prometheus data collection
+    full_prometheus_backup: False                           # if set to False, only the /prometheus/wal folder will be downloaded.
+    backup_threads: 5                                       # number of telemetry download/upload threads
+    archive_path: /tmp                                      # local path where the archive files will be temporarily stored
+    max_retries: 0                                          # maximum number of upload retries (if 0 will retry forever)
+    run_tag: ''                                             # if set, this will be appended to the run folder in the bucket (useful to group the runs)
+    archive_size: 10000                                     # the size of the prometheus data archive in KB. The lower the archive size,
+                                                            # the higher the number of archive files that will be produced and uploaded (and processed by backup_threads
+                                                            # simultaneously).
+                                                            # For unstable/slow connections it is better to keep this value low and
+                                                            # increase the number of backup_threads; this way, on upload failure, the retry will happen only on the
+                                                            # failed chunk without affecting the whole upload.
diff --git a/kraken/application_outage/actions.py b/kraken/application_outage/actions.py
@@ -4,25 +4,32 @@
 import kraken.cerberus.setup as cerberus
 from jinja2 import Template
 import kraken.invoke.command as runcommand
-
+from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
 
 # Reads the scenario config, applies and deletes a network policy to
 # block the traffic for the specified duration
-def run(scenarios_list, config, wait_duration):
+def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
     failed_post_scenarios = ""
+    scenario_telemetries: list[ScenarioTelemetry] = []
+    failed_scenarios = []
     for app_outage_config in scenarios_list:
+        scenario_telemetry = ScenarioTelemetry()
+        scenario_telemetry.scenario = app_outage_config
+        scenario_telemetry.startTimeStamp = time.time()
+        telemetry.set_parameters_base64(scenario_telemetry, app_outage_config)
         if len(app_outage_config) > 1:
-            with open(app_outage_config, "r") as f:
-                app_outage_config_yaml = yaml.full_load(f)
-                scenario_config = app_outage_config_yaml["application_outage"]
-                pod_selector = scenario_config.get("pod_selector", "{}")
-                traffic_type = scenario_config.get("block", "[Ingress, Egress]")
-                namespace = scenario_config.get("namespace", "")
-                duration = scenario_config.get("duration", 60)
+            try:
+                with open(app_outage_config, "r") as f:
+                    app_outage_config_yaml = yaml.full_load(f)
+                    scenario_config = app_outage_config_yaml["application_outage"]
+                    pod_selector = scenario_config.get("pod_selector", "{}")
+                    traffic_type = scenario_config.get("block", "[Ingress, Egress]")
+                    namespace = scenario_config.get("namespace", "")
+                    duration = scenario_config.get("duration", 60)
 
-                start_time = int(time.time())
+                    start_time = int(time.time())
 
-                network_policy_template = """---
+                    network_policy_template = """---
 apiVersion: networking.k8s.io/v1
 kind: NetworkPolicy
 metadata:
@@ -31,28 +38,38 @@ def run(scenarios_list, config, wait_duration):
   podSelector:
     matchLabels: {{ pod_selector }}
   policyTypes: {{ traffic_type }}
-                """
-                t = Template(network_policy_template)
-                rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type)
-                # Write the rendered template to a file
-                with open("kraken_network_policy.yaml", "w") as f:
-                    f.write(rendered_spec)
-                # Block the traffic by creating network policy
-                logging.info("Creating the network policy")
-                runcommand.invoke(
-                    "kubectl create -f %s -n %s --validate=false" % ("kraken_network_policy.yaml", namespace)
-                )
-
-                # wait for the specified duration
-                logging.info("Waiting for the specified duration in the config: %s" % (duration))
-                time.sleep(duration)
-
-                # unblock the traffic by deleting the network policy
-                logging.info("Deleting the network policy")
-                runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", namespace))
-
-                logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
-                time.sleep(wait_duration)
-
-                end_time = int(time.time())
-                cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
+                    """
+                    t = Template(network_policy_template)
+                    rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type)
+                    # Write the rendered template to a file
+                    with open("kraken_network_policy.yaml", "w") as f:
+                        f.write(rendered_spec)
+                    # Block the traffic by creating network policy
+                    logging.info("Creating the network policy")
+                    runcommand.invoke(
+                        "kubectl create -f %s -n %s --validate=false" % ("kraken_network_policy.yaml", namespace)
+                    )
+
+                    # wait for the specified duration
+                    logging.info("Waiting for the specified duration in the config: %s" % (duration))
+                    time.sleep(duration)
+
+                    # unblock the traffic by deleting the network policy
+                    logging.info("Deleting the network policy")
+                    runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", namespace))
+
+                    logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
+                    time.sleep(wait_duration)
+
+                    end_time = int(time.time())
+                    cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
+            except Exception as e :
+                scenario_telemetry.exitStatus = 1
+                failed_scenarios.append(app_outage_config)
+                telemetry.log_exception(app_outage_config)
+            else:
+                scenario_telemetry.exitStatus = 0
+            scenario_telemetry.endTimeStamp = time.time()
+            scenario_telemetries.append(scenario_telemetry)
+    return failed_scenarios, scenario_telemetries
+
diff --git a/kraken/arcaflow_plugin/arcaflow_plugin.py b/kraken/arcaflow_plugin/arcaflow_plugin.py
@@ -1,3 +1,5 @@
+import time
+
 import arcaflow
 import os
 import yaml
@@ -6,22 +8,31 @@
 from pathlib import Path
 from typing import List
 from .context_auth import ContextAuth
+from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
 
 
-def run(scenarios_list: List[str], kubeconfig_path: str):
+def run(scenarios_list: List[str], kubeconfig_path: str, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
+    scenario_telemetries: list[ScenarioTelemetry] = []
+    failed_post_scenarios = []
     for scenario in scenarios_list:
+        scenario_telemetry = ScenarioTelemetry()
+        scenario_telemetry.scenario = scenario
+        scenario_telemetry.startTimeStamp = time.time()
+        telemetry.set_parameters_base64(scenario_telemetry,scenario)
         engine_args = build_args(scenario)
-        run_workflow(engine_args, kubeconfig_path)
+        status_code = run_workflow(engine_args, kubeconfig_path)
+        scenario_telemetry.endTimeStamp = time.time()
+        scenario_telemetry.exitStatus = status_code
+        scenario_telemetries.append(scenario_telemetry)
+        if status_code != 0:
+            failed_post_scenarios.append(scenario)
+    return failed_post_scenarios, scenario_telemetries
 
 
-def run_workflow(engine_args: arcaflow.EngineArgs, kubeconfig_path: str):
+def run_workflow(engine_args: arcaflow.EngineArgs, kubeconfig_path: str) -> int:
     set_arca_kubeconfig(engine_args, kubeconfig_path)
     exit_status = arcaflow.run(engine_args)
-    if exit_status != 0:
-        logging.error(
-            f"failed to run arcaflow scenario {engine_args.input}"
-        )
-        sys.exit(exit_status)
+    return exit_status
 
 
 def build_args(input_file: str) -> arcaflow.EngineArgs: