Skip to content

Commit

Permalink
Update ebs loss scenario
Browse files Browse the repository at this point in the history
- Added excetions
- Supporting multiple regions
- Volume selection by volume id or node/instance id
- Aws access key id and aws secret access key not needed if system
wide credentials configured

Signed-off-by: jtydlack <[email protected]>
  • Loading branch information
jtydlack committed Sep 26, 2024
1 parent 49382b6 commit f34099e
Show file tree
Hide file tree
Showing 2 changed files with 188 additions and 65 deletions.
239 changes: 181 additions & 58 deletions kraken/aws/detach_ebs_volumes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,13 @@
import time
import yaml

from botocore.exceptions import ClientError, EndpointConnectionError
from urllib3.exceptions import ConnectionError, NewConnectionError

from krkn_lib.k8s import KrknKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.utils.functions import get_yaml_item_value


def run(
Expand All @@ -29,33 +33,36 @@ def run(
with open(scenario) as stream:
scenario_config = yaml.safe_load(stream)

volume_ids = scenario_config["ebs_volume_id"]
volume_ids = re.split(r",+\s+|,+|\s+", volume_ids)
regions = scenario_config["region"]
# TODO support for multiple regions
# regions = re.split(r",+\s+|,+|\s+", regions)
# Load and validate credentials
aws_access_key_id = scenario_config["aws_access_key_id"]
aws_secret_access_key = scenario_config["aws_secret_access_key"]
chaos_duration = scenario_config["chaos_duration"]
regions = scenario_config["regions"]

# TODO implement detaching volumes based on tag and instance
volume_tag = scenario_config["ebs_volume_tag"]

# Get the EBS attachment details
ec2, ec2_client = get_ec2_session(
regions, aws_access_key_id, aws_secret_access_key
)
volume_details = get_ebs_volume_attachment_details(
volume_ids, ec2_client
)
logging.info("Obtaining attachment details...")
for volume in volume_details:
logging.info(
f"Volume {volume['VolumeId']} status: {volume['State']}"
# Check that aws credentials are valid
if not validate_credentials(aws_access_key_id, aws_secret_access_key):
logging.error(
f"Scenario {scenario} failed with "
f"exception: AWS was not able to validate provided access "
f"credentials. Please make sure the aws global credentials or "
f"credentials provided in scenario config are correct."
)
fail(scenario_telemetry, scenario_telemetries)
failed_post_scenarios.append(scenario)
return failed_post_scenarios, scenario_telemetries

# Try detach volume
detach_ebs_volume(volume, ec2_client, ec2, chaos_duration)
for region in regions:
if not region["region"]:
logging.error("Region name must be specified! Skipping region")
continue
logging.info(f"Detaching specified volumes for "
f"region {region['region']}...")
detach_volumes_for_region(
region,
chaos_duration,
aws_secret_access_key,
aws_access_key_id
)

logging.info(
f"End of scenario {scenario}. "
Expand All @@ -80,35 +87,146 @@ def fail(
scenario_telemetries.append(scenario_telemetry)


def get_ebs_volume_attachment_details(volume_ids: list, ec2_client):
response = ec2_client.describe_volumes(VolumeIds=volume_ids)
volumes_details = response["Volumes"]
return volumes_details
def detach_volumes_for_region(
region: dict,
chaos_duration: int,
aws_access_key_id: str,
aws_secret_access_key: str
):
region_name = region["region"]
instance_ids = region["node_ids"]
if instance_ids:
instance_ids = re.split(r",+\s+|,+|\s+", instance_ids)
volume_ids = get_yaml_item_value(region, "ebs_volume_ids", [])
if volume_ids:
volume_ids = re.split(r",+\s+|,+|\s+", volume_ids)

# Get the EBS attachment details
ec2, ec2_client = get_ec2_session(
region_name, aws_access_key_id, aws_secret_access_key
)

def get_ebs_volume_state(volume_id: str, ec2_resource):
volume = ec2_resource.Volume(volume_id)
state = volume.state
return state
if instance_ids:
try:
volume_ids = get_instance_attachment_details(
instance_ids, volume_ids, ec2_client
)
except ClientError as e:
logging.error(
f"Detaching volumes in region {region_name} failed "
f"with exception: {e}. Skipping region"
)
return
except (ConnectionError, NewConnectionError, EndpointConnectionError) as e:
logging.error(
f"Detaching volumes in region {region_name} failed "
f"with exception: {e}. Please make sure region name is "
f"correct. Skipping region"
)
return

# Check that there are any volume ids
if not volume_ids:
logging.warning(
f"No volumes for region {region_name}! Skipping region"
)
return

logging.info("Obtaining attachment details...")

try:
volume_details = get_ebs_volume_attachment_details(
volume_ids, ec2_client
)
except ClientError as e:
logging.error(
f"Detaching volumes in region {region_name} failed "
f"with exception: {e}. Skipping region"
)
return

for volume in volume_details:
logging.info(
f"Volume {volume['VolumeId']} status: {volume['State']}"
)

# Detaching volume and attaching it back
detach_ebs_volume(volume, ec2_client, ec2, chaos_duration)

return


def detach_ebs_volume(volume: dict, ec2_client, ec2_resource, duration: int):
if volume["State"] == "in-use":
logging.info(f"Detaching volume {volume['VolumeId']}...")
ec2_client.detach_volume(VolumeId=volume['VolumeId'])
if check_attachment_state(volume, ec2_resource, "available") == 1:
try:
ec2_client.detach_volume(VolumeId=volume['VolumeId'], Force=True)
except ClientError as e:
logging.error(
f"Detaching volume failed with exception: {e}. Skipping"
)
return
if not check_attachment_state(volume, ec2_resource, "available"):
return
logging.info(f"Volume {volume['VolumeId']} successfully detached")
logging.info("Waiting for chaos duration...")
time.sleep(duration)

# Attach volume back
attach_ebs_volume(volume, ec2_client)
if check_attachment_state(volume, ec2_resource, "in-use") == 1:
if not check_attachment_state(volume, ec2_resource, "in-use"):
return
logging.info(f"Volume {volume['VolumeId']} successfully attached")


def get_ec2_session(
region: str, aws_access_key_id: str, aws_secret_access_key: str
):
if aws_access_key_id and aws_secret_access_key:
ec2_client = boto3.client(
"ec2",
region_name=region,
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
)

ec2_resource = boto3.resource(
"ec2",
region_name=region,
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key
)
else:
ec2_resource = boto3.resource("ec2", region_name=region)
ec2_client = boto3.client("ec2", region_name=region)

return ec2_resource, ec2_client


def get_ebs_volume_attachment_details(volume_ids: list, ec2_client):
response = ec2_client.describe_volumes(VolumeIds=volume_ids)
volumes_details = response["Volumes"]
return volumes_details


def get_instance_attachment_details(
instance_ids: list, volume_ids: list, ec2_client
):
response = ec2_client.describe_instances(InstanceIds=instance_ids)
instances_attachment_details = response['Reservations'][0]['Instances'][0]['BlockDeviceMappings']
for device in instances_attachment_details:
volume_id = device['Ebs']['VolumeId']
volume_ids.append(volume_id)
volume_ids = list(dict.fromkeys(volume_ids))
return volume_ids


def get_ebs_volume_state(volume_id: str, ec2_resource):
volume = ec2_resource.Volume(volume_id)
state = volume.state
return state


def attach_ebs_volume(volume: dict, ec2_client):
for attachment in volume["Attachments"]:
ec2_client.attach_volume(
Expand All @@ -118,35 +236,40 @@ def attach_ebs_volume(volume: dict, ec2_client):
)


def get_ec2_session(
region: str, aws_access_key_id: str, aws_secret_access_key: str
):
ec2 = boto3.resource(
"ec2",
region_name=region,
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
)
ec2_client = boto3.client(
"ec2",
region_name=region,
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
)
return ec2, ec2_client


def check_attachment_state(volume, ec2, desired_state: str):
time.sleep(5)
state = get_ebs_volume_state(volume["VolumeId"], ec2)
for i in range(5):
state = get_ebs_volume_state(volume["VolumeId"], ec2)
if state == desired_state:
return 0
logging.debug(f"Volume in undesired state {state}...")
return True
logging.debug(f"Volume {volume} in undesired state {state}...")
time.sleep(3)
else:
logging.error(
f"Something went wrong, last volume {volume['VolumeId']} "
f"state was {state}, desired state {desired_state}"

logging.warning(
f"Something went wrong, volume {volume['VolumeId']} "
f"state was {state}, desired state is {desired_state}. "
f"Please check status of your volume!"
)
return False


def validate_credentials(aws_access_key_id: str, aws_secret_access_key: str):

logging.info("Validating credentials...")
if aws_access_key_id and aws_secret_access_key:
sts = boto3.client(
"sts",
aws_access_key_id=aws_access_key_id,
aws_secret_access_key=aws_secret_access_key,
region_name="us-east-2"
)
return 1
else:
sts = boto3.client("sts", region_name="us-east-2")

try:
sts.get_caller_identity()
except ClientError:
return False

logging.info("Credential are validated")
return True
14 changes: 7 additions & 7 deletions scenarios/openshift/detach_ebs_volumes.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
aws_access_key_id:
aws_secret_access_key:
chaos_duration: 30
aws_access_key_id: # only if system-wide credentials are not
aws_secret_access_key: # configured or using different credentials
chaos_duration:

region:
ebs_volume_id:
ebs_volume_tag:
node_name:
regions:
- region:
ebs_volume_ids:
node_ids:

0 comments on commit f34099e

Please sign in to comment.