Add ebs loss scenario
- Added scenario config
- Added new folder containing scenario scripts
- Added scenario in run_kraken.py and kraken config

Resolves krkn-chaos#678

Signed-off-by: jtydlack <[email protected]>
jtydlack committed Sep 27, 2024
1 parent 32142cc commit 4ce16e1
Showing 5 changed files with 169 additions and 1 deletion.
2 changes: 2 additions & 0 deletions config/config.yaml
@@ -46,6 +46,8 @@ kraken:
            - scenarios/kube/service_hijacking.yaml
        - syn_flood:
            - scenarios/kube/syn_flood.yaml
        - aws:
            - scenarios/openshift/detach_ebs_volumes.yaml

cerberus:
    cerberus_enabled: False    # Enable it when cerberus is previously installed
Empty file added kraken/aws/__init__.py
152 changes: 152 additions & 0 deletions kraken/aws/detach_ebs_volumes.py
@@ -0,0 +1,152 @@
import boto3
import logging
import re
import time
import yaml

from krkn_lib.k8s import KrknKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes


def run(
    scenarios_list: list[str],
    wait_duration: int,
    krkn_lib: KrknKubernetes,
    telemetry: KrknTelemetryKubernetes,
) -> (list[str], list[ScenarioTelemetry]):
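    """Detach the EBS volumes listed in each scenario config, re-attach
    them after the chaos duration, and collect per-scenario telemetry."""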

logging.info("Detach ebs volumes scenario running...")
scenario_telemetries = list[ScenarioTelemetry]()
failed_post_scenarios = []
for scenario in scenarios_list:
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = scenario
scenario_telemetry.start_timestamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, scenario)

# Loading parameters from scenario config
with open(scenario) as stream:
scenario_config = yaml.safe_load(stream)

volume_ids = scenario_config["ebs_volume_id"]
volume_ids = re.split(r",+\s+|,+|\s+", volume_ids)
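        # e.g. "vol-0abc123, vol-0def456 vol-0aaa999"
        #   -> ["vol-0abc123", "vol-0def456", "vol-0aaa999"]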
        regions = scenario_config["region"]
        # TODO support for multiple regions
        # regions = re.split(r",+\s+|,+|\s+", regions)
        aws_access_key_id = scenario_config["aws_access_key_id"]
        aws_secret_access_key = scenario_config["aws_secret_access_key"]
        chaos_duration = scenario_config["chaos_duration"]

        # TODO implement detaching volumes based on tag and instance
        volume_tag = scenario_config["ebs_volume_tag"]

        # Get the EBS attachment details
        ec2, ec2_client = get_ec2_session(
            regions, aws_access_key_id, aws_secret_access_key
        )
        volume_details = get_ebs_volume_attachment_details(
            volume_ids, ec2_client
        )
        logging.info("Obtaining attachment details...")
        for volume in volume_details:
            logging.info(
                f"Volume {volume['VolumeId']} status: {volume['State']}"
            )

            # Try detach volume
            detach_ebs_volume(volume, ec2_client, ec2, chaos_duration)

        logging.info(
            f"End of scenario {scenario}. "
            f"Waiting for the specified duration {wait_duration}..."
        )
        time.sleep(wait_duration)

        scenario_telemetry.exit_status = 0
        scenario_telemetry.end_timestamp = time.time()
        scenario_telemetries.append(scenario_telemetry)
        logging.info(f"Scenario {scenario} successfully finished")

    return failed_post_scenarios, scenario_telemetries


def fail(
    scenario_telemetry: ScenarioTelemetry,
    scenario_telemetries: list[ScenarioTelemetry],
):
    scenario_telemetry.exit_status = 1
    scenario_telemetry.end_timestamp = time.time()
    scenario_telemetries.append(scenario_telemetry)


def get_ebs_volume_attachment_details(volume_ids: list, ec2_client):
    response = ec2_client.describe_volumes(VolumeIds=volume_ids)
    volumes_details = response["Volumes"]
    return volumes_details


def get_ebs_volume_state(volume_id: str, ec2_resource):
    volume = ec2_resource.Volume(volume_id)
    state = volume.state
    return state


def detach_ebs_volume(volume: dict, ec2_client, ec2_resource, duration: int):
    if volume["State"] == "in-use":
        logging.info(f"Detaching volume {volume['VolumeId']}...")
        ec2_client.detach_volume(VolumeId=volume["VolumeId"])
        if check_attachment_state(volume, ec2_resource, "available") == 1:
            return
        logging.info(f"Volume {volume['VolumeId']} successfully detached")
        logging.info("Waiting for chaos duration...")
        time.sleep(duration)

        # Attach volume back
        attach_ebs_volume(volume, ec2_client)
        if check_attachment_state(volume, ec2_resource, "in-use") == 1:
            return
        logging.info(f"Volume {volume['VolumeId']} successfully attached")


def attach_ebs_volume(volume: dict, ec2_client):
    # The volume dict was captured before detaching, so its Attachments
    # entry still holds the original instance ID and device name.
    for attachment in volume["Attachments"]:
        ec2_client.attach_volume(
            InstanceId=attachment["InstanceId"],
            Device=attachment["Device"],
            VolumeId=volume["VolumeId"],
        )


def get_ec2_session(
    region: str, aws_access_key_id: str, aws_secret_access_key: str
):
    ec2 = boto3.resource(
        "ec2",
        region_name=region,
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    ec2_client = boto3.client(
        "ec2",
        region_name=region,
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    return ec2, ec2_client


def check_attachment_state(volume, ec2_resource, desired_state: str):
    # Poll the volume state, re-reading it on every attempt so that a
    # state change is actually observed.
    for _ in range(5):
        time.sleep(5)
        state = get_ebs_volume_state(volume["VolumeId"], ec2_resource)
        if state == desired_state:
            return 0
        logging.debug(f"Volume in undesired state {state}...")
    logging.error(
        f"Something went wrong, last volume {volume['VolumeId']} "
        f"state was {state}, desired state {desired_state}"
    )
    return 1
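For reference, the detach/re-attach round trip this module performs can be reproduced with boto3 alone. The sketch below is illustrative and not part of the commit: the region and volume ID are placeholders, and it substitutes boto3's built-in volume waiters for the polling in check_attachment_state.

import time

import boto3

REGION = "us-west-2"                 # placeholder
VOLUME_ID = "vol-0123456789abcdef0"  # placeholder, must be an in-use volume

ec2_client = boto3.client("ec2", region_name=REGION)
volume = ec2_client.describe_volumes(VolumeIds=[VOLUME_ID])["Volumes"][0]

if volume["State"] == "in-use":
    # Remember where the volume was attached so it can be re-attached later.
    attachment = volume["Attachments"][0]
    ec2_client.detach_volume(VolumeId=VOLUME_ID)
    ec2_client.get_waiter("volume_available").wait(VolumeIds=[VOLUME_ID])

    time.sleep(30)  # chaos_duration

    ec2_client.attach_volume(
        InstanceId=attachment["InstanceId"],
        Device=attachment["Device"],
        VolumeId=VOLUME_ID,
    )
    ec2_client.get_waiter("volume_in_use").wait(VolumeIds=[VOLUME_ID])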
7 changes: 6 additions & 1 deletion run_kraken.py
@@ -30,6 +30,7 @@
import kraken.arcaflow_plugin as arcaflow_plugin
import kraken.prometheus as prometheus_plugin
import kraken.service_hijacking.service_hijacking as service_hijacking_plugin
import kraken.aws.detach_ebs_volumes as detach_ebs_volumes
import server as server
from kraken import plugins, syn_flood
from krkn_lib.k8s import KrknKubernetes
@@ -44,6 +45,7 @@

report_file = ""


# Main function
def main(cfg) -> int:
    # Start kraken
@@ -464,6 +466,10 @@ def main(cfg) -> int:
                    telemetry_request_id
                )
                chaos_telemetry.scenarios.extend(scenario_telemetries)
            elif scenario_type == "aws":
                logging.info("Running AWS Chaos")
                failed_post_scenarios, scenario_telemetries = detach_ebs_volumes.run(
                    scenarios_list, wait_duration, kubecli, telemetry_k8s
                )
                # chaos_telemetry.scenarios.extend(scenario_telemetries)

        # Check for critical alerts when enabled
        post_critical_alerts = 0
@@ -481,7 +487,6 @@ def main(cfg) -> int:
            logging.error("Post chaos critical alerts firing please check, exiting")
            break


        iteration += 1
        logging.info("")

9 changes: 9 additions & 0 deletions scenarios/openshift/detach_ebs_volumes.yaml
@@ -0,0 +1,9 @@
aws_access_key_id:
aws_secret_access_key:
chaos_duration: 30

region:
ebs_volume_id:
ebs_volume_tag:
node_name:
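
For reference, a filled-in scenario file might look like the following. The credentials shown are AWS's documented example values and the volume IDs are placeholders; ebs_volume_id accepts a comma- or space-separated list, while ebs_volume_tag and node_name are reserved for the tag- and instance-based detachment still marked TODO in the script.

aws_access_key_id: AKIAIOSFODNN7EXAMPLE
aws_secret_access_key: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
chaos_duration: 30

region: us-west-2
ebs_volume_id: vol-0123456789abcdef0, vol-0fedcba9876543210
ebs_volume_tag:
node_name: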
