Skip to content

Commit

Permalink
Feature: Multi-Attach for io2 block devices
Browse files Browse the repository at this point in the history
Signed-off-by: Eddie Torres <[email protected]>
  • Loading branch information
torredil committed Oct 31, 2023
1 parent fdb9866 commit 76231e7
Show file tree
Hide file tree
Showing 17 changed files with 902 additions and 204 deletions.
81 changes: 81 additions & 0 deletions docs/multi-attach.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Multi-Attach

The multi-attach capability allows you to attach a single EBS volume to multiple EC2 instances located within the same Availability Zone (AZ). This shared volume can be utilized by several pods running on distinct nodes.

Multi-attach is enabled by specifying `ReadWriteMany` in the `PersistentVolumeClaim.spec.accessModes` list.

## Important

- Application-level coordination (e.g., via I/O fencing) is required to use multi-attach safely. Without such coordination, concurrent writes from multiple nodes can result in data loss or silent data corruption. Refer to the AWS documentation on Multi-Attach for more information.
- Currently, the EBS CSI driver only supports multi-attach for `io2` volumes in `Block` volume mode.

Refer to the official AWS documentation on [Multi-Attach](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-volumes-multi.html) for more information, best practices, and limitations of this capability.

## Example

1. Create a `StorageClass` that uses the `io2` volume type:
```
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: ebs-sc
provisioner: ebs.csi.aws.com
volumeBindingMode: WaitForFirstConsumer
parameters:
  type: io2
  iops: "1000"
```

2. Create a `PersistentVolumeClaim` that specifies the `ReadWriteMany` access mode and the `Block` volume mode:
```
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: block-claim
spec:
  accessModes:
    - ReadWriteMany
  volumeMode: Block
  storageClassName: ebs-sc
  resources:
    requests:
      storage: 4Gi
```

3. Create a `DaemonSet` referencing the `PersistentVolumeClaim` created in the previous step:
```
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: app-daemon
spec:
  selector:
    matchLabels:
      name: app
  template:
    metadata:
      labels:
        name: app
    spec:
      containers:
        - name: app
          image: busybox
          command: ["/bin/sh", "-c"]
          args: ["tail -f /dev/null"]
          volumeDevices:
            - name: data
              devicePath: /dev/xvda
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: block-claim
```

4. Verify the `DaemonSet` is running:
```
$ kubectl get pods -A
NAMESPACE   NAME               READY   STATUS    RESTARTS   AGE
default     app-daemon-9hdgw   1/1     Running   0          18s
default     app-daemon-xm8zr   1/1     Running   0          18s
```
78 changes: 23 additions & 55 deletions pkg/cloud/cloud.go
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ type DiskOptions struct {
OutpostArn string
Encrypted bool
BlockExpress bool
MultiAttachEnabled bool
// KmsKeyID represents a fully qualified resource name to the key to use for encryption.
// example: arn:aws:kms:us-east-1:012345678910:key/abcd1234-a123-456a-a12b-a123b4cd56ef
KmsKeyID string
Expand Down Expand Up @@ -345,6 +346,10 @@ func (c *cloud) CreateDisk(ctx context.Context, volumeName string, diskOptions *
return nil, fmt.Errorf("invalid AWS VolumeType %q", diskOptions.VolumeType)
}

if diskOptions.MultiAttachEnabled && createType != VolumeTypeIO2 {
return nil, fmt.Errorf("CreateDisk: multi-attach is only supported for io2 volumes")
}

if maxIops > 0 {
if diskOptions.IOPS > 0 {
requestedIops = int64(diskOptions.IOPS)
Expand Down Expand Up @@ -381,11 +386,12 @@ func (c *cloud) CreateDisk(ctx context.Context, volumeName string, diskOptions *
clientToken := sha256.Sum256([]byte(volumeName))

requestInput := &ec2.CreateVolumeInput{
AvailabilityZone: aws.String(zone),
ClientToken: aws.String(hex.EncodeToString(clientToken[:])),
Size: aws.Int64(capacityGiB),
VolumeType: aws.String(createType),
Encrypted: aws.Bool(diskOptions.Encrypted),
AvailabilityZone: aws.String(zone),
ClientToken: aws.String(hex.EncodeToString(clientToken[:])),
Size: aws.Int64(capacityGiB),
VolumeType: aws.String(createType),
Encrypted: aws.Bool(diskOptions.Encrypted),
MultiAttachEnabled: aws.Bool(diskOptions.MultiAttachEnabled),
}

if !util.IsSBE(zone) {
Expand Down Expand Up @@ -549,40 +555,19 @@ func (c *cloud) AttachDisk(ctx context.Context, volumeID, nodeID string) (string

resp, attachErr := c.ec2.AttachVolumeWithContext(ctx, request)
if attachErr != nil {
var awsErr awserr.Error
if errors.As(attachErr, &awsErr) {
if awsErr.Code() == "VolumeInUse" {
return "", ErrVolumeInUse
}
}
return "", fmt.Errorf("could not attach volume %q to node %q: %w", volumeID, nodeID, attachErr)
}
klog.V(5).InfoS("[Debug] AttachVolume", "volumeID", volumeID, "nodeID", nodeID, "resp", resp)
}

attachment, err := c.WaitForAttachmentState(ctx, volumeID, volumeAttachedState, *instance.InstanceId, device.Path, device.IsAlreadyAssigned)
_, err = c.WaitForAttachmentState(ctx, volumeID, volumeAttachedState, *instance.InstanceId, device.Path, device.IsAlreadyAssigned)

// This is the only situation where we taint the device
if err != nil {
device.Taint()
return "", err
}

// Double check the attachment to be 100% sure we attached the correct volume at the correct mountpoint
// It could happen otherwise that we see the volume attached from a previous/separate AttachVolume call,
// which could theoretically be against a different device (or even instance).
if attachment == nil {
// Impossible?
return "", fmt.Errorf("unexpected state: attachment nil after attached %q to %q", volumeID, nodeID)
}
if device.Path != aws.StringValue(attachment.Device) {
// Already checked in waitForAttachmentState(), but just to be sure...
return "", fmt.Errorf("disk attachment of %q to %q failed: requested device %q but found %q", volumeID, nodeID, device.Path, aws.StringValue(attachment.Device))
}
if *instance.InstanceId != aws.StringValue(attachment.InstanceId) {
return "", fmt.Errorf("disk attachment of %q to %q failed: requested instance %q but found %q", volumeID, nodeID, *instance.InstanceId, aws.StringValue(attachment.InstanceId))
}

// TODO: Check volume capability matches for ALREADY_EXISTS
// This could happen when request volume already attached to request node,
// but is incompatible with the specified volume_capability or readonly flag
Expand Down Expand Up @@ -674,42 +659,25 @@ func (c *cloud) WaitForAttachmentState(ctx context.Context, volumeID, expectedSt
return false, nil
}

if len(volume.Attachments) > 1 {
// Shouldn't happen; log so we know if it is
if volume.MultiAttachEnabled != nil && !*volume.MultiAttachEnabled && len(volume.Attachments) > 1 {
klog.InfoS("Found multiple attachments for volume", "volumeID", volumeID, "volume", volume)
return false, fmt.Errorf("volume %q has multiple attachments", volumeID)
}

attachmentState := ""

for _, a := range volume.Attachments {
if attachmentState != "" {
// Shouldn't happen; log so we know if it is
klog.InfoS("Found multiple attachments for volume", "volumeID", volumeID, "volume", volume)
}
if a.State != nil {
if a.State != nil && aws.StringValue(a.InstanceId) == expectedInstance {
attachmentState = aws.StringValue(a.State)
attachment = a
attachmentState = *a.State
} else {
// Shouldn't happen; log so we know if it is
klog.InfoS("Ignoring nil attachment state for volume", "volumeID", volumeID, "attachment", a)
}
}
if attachmentState == "" {
attachmentState = volumeDetachedState

if attachment == nil {
return false, fmt.Errorf("WaitForAttachmentState: attachment nil but expected %q to be attached to %q", volumeID, expectedInstance)
}
if attachment != nil {
// AWS eventual consistency can go back in time.
// For example, we're waiting for a volume to be attached as /dev/xvdba, but AWS can tell us it's
// attached as /dev/xvdbb, where it was attached before and it was already detached.
// Retry couple of times, hoping AWS starts reporting the right status.
device := aws.StringValue(attachment.Device)
if expectedDevice != "" && device != "" && device != expectedDevice {
klog.InfoS("Expected device for volume not found", "expectedDevice", expectedDevice, "expectedState", expectedState, "volumeID", volumeID, "device", device, "attachmentState", attachmentState)
return false, nil
}
instanceID := aws.StringValue(attachment.InstanceId)
if expectedInstance != "" && instanceID != "" && instanceID != expectedInstance {
klog.InfoS("Expected instance for volume not found", "expectedInstance", expectedInstance, "expectedState", expectedState, "volumeID", volumeID, "instanceID", instanceID, "attachmentState", attachmentState)
return false, nil
}
if attachment.Device != nil && aws.StringValue(attachment.Device) != expectedDevice {
return false, fmt.Errorf("WaitForAttachmentState: device %q but expected %q", aws.StringValue(attachment.Device), expectedDevice)
}

// if we expected volume to be attached and it was reported as already attached via DescribeInstance call
Expand Down
Loading

0 comments on commit 76231e7

Please sign in to comment.