Skip to content

Commit

Permalink
Add proper implementation of UnprepareResources in kubelet plugin
Browse files Browse the repository at this point in the history
Previously, we always returned SUCCESS from every call to UnprepareResources,
regardless of whether we successfully unprepared the claim or not. The idea
being that we would always garbage collect it later, once the claim got
deallocated by the controller.

This commit changes the semantics to carry out the unprepare at the point it is
actually requested, returning an error if this is not possible for some reason.
In the past, such errors would have been missed, and the resources could have
been deallocated prematurely.

Signed-off-by: Kevin Klues <[email protected]>
  • Loading branch information
klueska committed May 7, 2024
1 parent 6c1ff1d commit 0f8ce59
Showing 1 changed file with 51 additions and 8 deletions.
59 changes: 51 additions & 8 deletions cmd/nvidia-dra-plugin/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,11 @@ func (d *driver) Shutdown(ctx context.Context) error {
}

func (d *driver) NodePrepareResources(ctx context.Context, req *drapbv1.NodePrepareResourcesRequest) (*drapbv1.NodePrepareResourcesResponse, error) {

klog.Infof("NodePrepareResource is called: number of claims: %d", len(req.Claims))
preparedResources := &drapbv1.NodePrepareResourcesResponse{Claims: map[string]*drapbv1.NodePrepareResourceResponse{}}

// In production version some common operations of d.nodeUnprepareResources
// should be done outside of the loop, for instance updating the CR could
// In production version some common operations of d.nodePrepareResources
// could be done outside of this loop, for instance updating the CR could
// be done once after all HW was prepared.
for _, claim := range req.Claims {
preparedResources.Claims[claim.Uid] = d.nodePrepareResource(ctx, claim)
Expand All @@ -114,6 +113,20 @@ func (d *driver) NodePrepareResources(ctx context.Context, req *drapbv1.NodePrep
return preparedResources, nil
}

func (d *driver) NodeUnprepareResources(ctx context.Context, req *drapbv1.NodeUnprepareResourcesRequest) (*drapbv1.NodeUnprepareResourcesResponse, error) {
klog.Infof("NodeUnprepareResource is called: number of claims: %d", len(req.Claims))
unpreparedResources := &drapbv1.NodeUnprepareResourcesResponse{Claims: map[string]*drapbv1.NodeUnprepareResourceResponse{}}

// In production version some common operations of d.nodeUnprepareResources
// could be done outside of this loop, for instance updating the CR could
// be done once after all HW was prepared.
for _, claim := range req.Claims {
unpreparedResources.Claims[claim.Uid] = d.nodeUnprepareResource(ctx, claim)
}

return unpreparedResources, nil
}

func (d *driver) nodePrepareResource(ctx context.Context, claim *drapbv1.Claim) *drapbv1.NodePrepareResourceResponse {
d.Lock()
defer d.Unlock()
Expand Down Expand Up @@ -141,11 +154,30 @@ func (d *driver) nodePrepareResource(ctx context.Context, claim *drapbv1.Claim)
return &drapbv1.NodePrepareResourceResponse{CDIDevices: prepared}
}

func (d *driver) NodeUnprepareResources(ctx context.Context, req *drapbv1.NodeUnprepareResourcesRequest) (*drapbv1.NodeUnprepareResourcesResponse, error) {
// We don't upprepare as part of NodeUnprepareResource, we do it
// asynchronously when the claims themselves are deleted and the
// AllocatedClaim has been removed.
return &drapbv1.NodeUnprepareResourcesResponse{}, nil
func (d *driver) nodeUnprepareResource(ctx context.Context, claim *drapbv1.Claim) *drapbv1.NodeUnprepareResourceResponse {
d.Lock()
defer d.Unlock()

isUnprepared, err := d.isUnprepared(ctx, claim.Uid)
if err != nil {
return &drapbv1.NodeUnprepareResourceResponse{
Error: fmt.Sprintf("error checking if claim is already unprepared: %v", err),
}
}

if isUnprepared {
klog.Infof("Already unprepared, nothing to do for claim '%v'", claim.Uid)
return &drapbv1.NodeUnprepareResourceResponse{}
}

err = d.unprepare(ctx, claim.Uid)
if err != nil {
return &drapbv1.NodeUnprepareResourceResponse{
Error: fmt.Sprintf("error unpreparing devices for claim %v: %v", claim.Uid, err),
}
}

return &drapbv1.NodeUnprepareResourceResponse{}
}

func (d *driver) isPrepared(ctx context.Context, claimUID string) (bool, []string, error) {
Expand All @@ -159,6 +191,17 @@ func (d *driver) isPrepared(ctx context.Context, claimUID string) (bool, []strin
return false, nil, nil
}

func (d *driver) isUnprepared(ctx context.Context, claimUID string) (bool, error) {
err := d.nasclient.Get(ctx)
if err != nil {
return false, err
}
if _, exists := d.nascr.Spec.PreparedClaims[claimUID]; !exists {
return true, nil
}
return false, nil
}

func (d *driver) prepare(ctx context.Context, claimUID string) ([]string, error) {
var err error
var prepared []string
Expand Down

0 comments on commit 0f8ce59

Please sign in to comment.