Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ebs loss scenario #692

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions krkn/scenario_plugins/node_actions/abstract_node_scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,16 @@ def helper_node_stop_start_scenario(self, instance_kill_count, node, timeout):
self.helper_node_start_scenario(instance_kill_count, node, timeout)
logging.info("helper_node_stop_start_scenario has been successfully injected!")

# Node scenario to detach and attach the disk
def disk_detach_attach_scenario(self, instance_kill_count, node, timeout, duration):
    """Detach the node's disks, wait, then attach them again.

    Args:
        instance_kill_count: Number of runs, forwarded to the helper methods.
        node: Name of the node whose disks are detached.
        timeout: Timeout value forwarded to the detach and attach helpers.
        duration: Seconds to sleep between detaching and re-attaching.
    """
    logging.info("Starting disk_detach_attach_scenario injection")
    # Record the attachments before detaching; they are the only input the
    # attach step receives to restore the disks afterwards.
    disk_attachment_details = self.disk_attachment_info(instance_kill_count, node)
    self.disk_detach_scenario(instance_kill_count, node, timeout)
    logging.info("Waiting for %s seconds before attaching the disk" % (duration))
    time.sleep(duration)
    self.disk_attach_scenario(instance_kill_count, disk_attachment_details, timeout)
    logging.info("disk_detach_attach_scenario has been successfully injected!")

# Node scenario to terminate the node
def node_termination_scenario(self, instance_kill_count, node, timeout):
    """Do nothing and return None.

    NOTE(review): presumably overridden by provider-specific scenario
    classes - confirm against the subclasses.
    """
    return None
Expand Down
99 changes: 99 additions & 0 deletions krkn/scenario_plugins/node_actions/aws_node_scenarios.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class AWS:
def __init__(self):
    """Create the boto3 EC2 client and resource handles used by this class."""
    self.boto_client = boto3.client("ec2")
    # Build the EC2 resource once and reuse it, rather than constructing a
    # second identical resource only to obtain the placeholder Instance.
    self.boto_resource = boto3.resource("ec2")
    self.boto_instance = self.boto_resource.Instance("id")

# Get the instance ID of the node
def get_instance_id(self, node):
Expand Down Expand Up @@ -179,6 +180,60 @@ def delete_network_acl(self, acl_id):

raise RuntimeError()

# Force-detach each of the given volumes
def detach_volumes(self, volumes_ids: list):
    """Force-detach every volume listed in ``volumes_ids``.

    A failure for one volume is logged and the remaining volumes are still
    processed; no exception is propagated to the caller.
    """
    for volume_id in volumes_ids:
        try:
            self.boto_client.detach_volume(VolumeId=volume_id, Force=True)
        except Exception as error:
            logging.error(
                "Detaching volume %s failed with exception: %s",
                volume_id,
                error,
            )

# Attach a volume using its recorded attachment details
def attach_volume(self, attachment: dict):
    """Attach a volume to its instance unless the volume is already in use.

    Args:
        attachment: Mapping providing the "VolumeId", "InstanceId" and
            "Device" keys that describe where the volume is attached.
    """
    try:
        already_in_use = self.get_volume_state(attachment["VolumeId"]) == "in-use"
        if not already_in_use:
            logging.info(
                "Attaching the %s volumes to instance %s.",
                attachment["VolumeId"],
                attachment["InstanceId"],
            )
            self.boto_client.attach_volume(
                InstanceId=attachment["InstanceId"],
                Device=attachment["Device"],
                VolumeId=attachment["VolumeId"],
            )
    except Exception as error:
        # Failures are only logged here; nothing is re-raised to the caller.
        logging.error(
            "Failed attaching disk %s to the %s instance. "
            "Encountered following exception: %s",
            attachment["VolumeId"],
            attachment["InstanceId"],
            error,
        )

# Get IDs of node volumes
def get_volumes_ids(self, instance_id: list):
    """Return the EBS volume IDs from an instance's block device mappings.

    Args:
        instance_id: List of instance IDs passed to ``describe_instances``;
            only the first instance of the first reservation is inspected.
    """
    reservations = self.boto_client.describe_instances(InstanceIds=instance_id)[
        "Reservations"
    ]
    block_devices = reservations[0]["Instances"][0]["BlockDeviceMappings"]
    return [mapping["Ebs"]["VolumeId"] for mapping in block_devices]

# Get volumes attachment details
def get_volume_attachment_details(self, volume_ids: list):
    """Return the "Volumes" entries that ``describe_volumes`` reports for ``volume_ids``."""
    return self.boto_client.describe_volumes(VolumeIds=volume_ids)["Volumes"]

# Get volume state
def get_volume_state(self, volume_id: str):
    """Return the ``state`` attribute of the EC2 volume resource for ``volume_id``."""
    return self.boto_resource.Volume(volume_id).state

# krkn_lib
class aws_node_scenarios(abstract_node_scenarios):
Expand Down Expand Up @@ -290,3 +345,47 @@ def node_reboot_scenario(self, instance_kill_count, node, timeout):
logging.error("node_reboot_scenario injection failed!")

raise RuntimeError()

# Get volume attachment info
def disk_attachment_info(self, instance_kill_count, node):
    """Collect the attachment details of the volumes attached to ``node``.

    Args:
        instance_kill_count: Number of requested runs; when it is less than
            one nothing is looked up and ``None`` is returned.
        node: Name of the node whose volumes are inspected.

    Returns:
        The value returned by ``self.aws.get_volume_attachment_details`` for
        the node's volume IDs, or ``None`` when no run is requested.

    Raises:
        RuntimeError: If any lookup fails; the original exception is chained
            as the cause.
    """
    # The details are the same for every run, so a single lookup suffices.
    if instance_kill_count < 1:
        return None
    try:
        logging.info("Obtaining disk attachment information")
        # split() wraps the single instance ID string into a list, which is
        # the form get_volumes_ids receives.
        instance_id = (self.aws.get_instance_id(node)).split()
        volumes_ids = self.aws.get_volumes_ids(instance_id)
        return self.aws.get_volume_attachment_details(volumes_ids)
    except Exception as e:
        logging.error(
            "Failed to obtain disk attachment information of %s node. "
            "Encountered following exception: %s." % (node, e)
        )
        raise RuntimeError() from e

# Node scenario to detach the volume
def disk_detach_scenario(self, instance_kill_count, node, timeout):
    """Detach all volumes of ``node``, once per requested run.

    Args:
        instance_kill_count: Number of times the detach is performed.
        node: Name of the node whose volumes are detached.
        timeout: Accepted for signature parity with the other scenarios;
            not used by this method.

    Raises:
        RuntimeError: If looking up or detaching the volumes raises; the
            original exception is chained as the cause.
    """
    for _ in range(instance_kill_count):
        try:
            logging.info("Starting disk_detach_scenario injection")
            # split() wraps the single instance ID string into a list, which
            # is the form get_volumes_ids receives.
            instance_id = (self.aws.get_instance_id(node)).split()
            volumes_ids = self.aws.get_volumes_ids(instance_id)
            logging.info(
                "Detaching the %s volumes from instance %s "
                % (volumes_ids, node)
            )
            self.aws.detach_volumes(volumes_ids)
        except Exception as e:
            # The trailing space keeps the two adjacent literals from fusing
            # into "followingexception".
            logging.error(
                "Failed to detach disk from %s node. Encountered following "
                "exception: %s." % (node, e)
            )
            raise RuntimeError() from e

# Node scenario to attach the volume
def disk_attach_scenario(self, instance_kill_count, attachment_details, timeout):
    """Attach the volumes described by ``attachment_details``.

    Args:
        instance_kill_count: Number of times the attach pass is performed.
        attachment_details: Iterable of volume mappings; the first entry of
            each mapping's "Attachments" list is handed to
            ``self.aws.attach_volume``.
        timeout: Accepted for signature parity with the other scenarios;
            not used by this method.
    """
    for _ in range(instance_kill_count):
        for volume in attachment_details:
            attachments = volume.get("Attachments")
            if not attachments:
                # Skip this entry instead of raising, so one volume without
                # attachment data cannot block attaching the remaining ones.
                logging.warning(
                    "Skipping volume %s: no attachment details recorded",
                    volume.get("VolumeId"),
                )
                continue
            self.aws.attach_volume(attachments[0])
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def run_node(self, single_node, node_scenario_object, action, node_scenario):
logging.info("action" + str(action))
# Get the scenario specifics for running action nodes
run_kill_count = get_yaml_item_value(node_scenario, "runs", 1)
if action == "node_stop_start_scenario":
if action in ("node_stop_start_scenario", "disk_detach_attach_scenario"):
duration = get_yaml_item_value(node_scenario, "duration", 120)

timeout = get_yaml_item_value(node_scenario, "timeout", 120)
Expand Down Expand Up @@ -200,6 +200,10 @@ def run_node(self, single_node, node_scenario_object, action, node_scenario):
node_scenario_object.node_reboot_scenario(
run_kill_count, single_node, timeout
)
elif action == "disk_detach_attach_scenario":
node_scenario_object.disk_detach_attach_scenario(
run_kill_count, single_node, timeout
)
elif action == "stop_start_kubelet_scenario":
node_scenario_object.stop_start_kubelet_scenario(
run_kill_count, single_node, timeout
Expand Down
2 changes: 1 addition & 1 deletion run_kraken.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,7 +467,7 @@ def main(cfg) -> int:
end_time,
alert_profile,
elastic_colllect_alerts,
elastic_alerts_index,
elastic_alerts_index
)

else:
Expand Down
Loading