Skip to content

Commit

Permalink
Merge pull request #5189 from blackpiglet/1.9-CSI-snapshot-timeout-co…
Browse files Browse the repository at this point in the history
…nfigurable

[v1.9 cherry-pick] Make CSI snapshot creation timeout configurable for backup and schedule.
  • Loading branch information
qiuming-best authored Aug 9, 2022
2 parents 8487585 + 5f86cfa commit 7a749b8
Show file tree
Hide file tree
Showing 17 changed files with 71 additions and 9 deletions.
1 change: 1 addition & 0 deletions changelogs/unreleased/5189-jxun
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Make CSI snapshot creation timeout configurable.
5 changes: 5 additions & 0 deletions config/crd/v1/bases/velero.io_backups.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ spec:
spec:
description: BackupSpec defines the specification for a Velero backup.
properties:
csiSnapshotTimeout:
description: CSISnapshotTimeout specifies the time used to wait for
CSI VolumeSnapshot status turns to ReadyToUse during creation, before
returning error as timeout. The default value is 10 minute.
type: string
defaultVolumesToRestic:
description: DefaultVolumesToRestic specifies whether restic should
be used to take a backup of all pod volumes by default.
Expand Down
5 changes: 5 additions & 0 deletions config/crd/v1/bases/velero.io_schedules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,11 @@ spec:
description: Template is the definition of the Backup to be run on
the provided schedule
properties:
csiSnapshotTimeout:
description: CSISnapshotTimeout specifies the time used to wait
for CSI VolumeSnapshot status turns to ReadyToUse during creation,
before returning error as timeout. The default value is 10 minute.
type: string
defaultVolumesToRestic:
description: DefaultVolumesToRestic specifies whether restic should
be used to take a backup of all pod volumes by default.
Expand Down
4 changes: 2 additions & 2 deletions config/crd/v1/crds/crds.go

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions pkg/apis/velero/v1/backup.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,12 @@ type BackupSpec struct {
// +optional
// +nullable
OrderedResources map[string]string `json:"orderedResources,omitempty"`

// CSISnapshotTimeout specifies the time used to wait for CSI VolumeSnapshot status turns to
// ReadyToUse during creation, before returning error as timeout.
// The default value is 10 minute.
// +optional
CSISnapshotTimeout metav1.Duration `json:"csiSnapshotTimeout,omitempty"`
}

// BackupHooks contains custom behaviors that should be executed at different phases of the backup.
Expand Down
1 change: 1 addition & 0 deletions pkg/apis/velero/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions pkg/builder/backup_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,3 +233,9 @@ func (b *BackupBuilder) OrderedResources(orders map[string]string) *BackupBuilde
b.object.Spec.OrderedResources = orders
return b
}

// CSISnapshotTimeout sets the Backup's CSISnapshotTimeout
func (b *BackupBuilder) CSISnapshotTimeout(timeout time.Duration) *BackupBuilder {
b.object.Spec.CSISnapshotTimeout.Duration = timeout
return b
}
5 changes: 4 additions & 1 deletion pkg/cmd/cli/backup/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ type CreateOptions struct {
SnapshotLocations []string
FromSchedule string
OrderedResources string
CSISnapshotTimeout time.Duration

client veleroclient.Interface
}
Expand All @@ -122,6 +123,7 @@ func (o *CreateOptions) BindFlags(flags *pflag.FlagSet) {
flags.StringSliceVar(&o.SnapshotLocations, "volume-snapshot-locations", o.SnapshotLocations, "List of locations (at most one per provider) where volume snapshots should be stored.")
flags.VarP(&o.Selector, "selector", "l", "Only back up resources matching this label selector.")
flags.StringVar(&o.OrderedResources, "ordered-resources", "", "Mapping Kinds to an ordered list of specific resources of that Kind. Resource names are separated by commas and their names are in format 'namespace/resourcename'. For cluster scope resource, simply use resource name. Key-value pairs in the mapping are separated by semi-colon. Example: 'pods=ns1/pod1,ns1/pod2;persistentvolumeclaims=ns1/pvc4,ns1/pvc8'. Optional.")
flags.DurationVar(&o.CSISnapshotTimeout, "csi-snapshot-timeout", o.CSISnapshotTimeout, "How long to wait for CSI snapshot creation before timeout.")
f := flags.VarPF(&o.SnapshotVolumes, "snapshot-volumes", "", "Take snapshots of PersistentVolumes as part of the backup.")
// this allows the user to just specify "--snapshot-volumes" as shorthand for "--snapshot-volumes=true"
// like a normal bool flag
Expand Down Expand Up @@ -332,7 +334,8 @@ func (o *CreateOptions) BuildBackup(namespace string) (*velerov1api.Backup, erro
LabelSelector(o.Selector.LabelSelector).
TTL(o.TTL).
StorageLocation(o.StorageLocation).
VolumeSnapshotLocations(o.SnapshotLocations...)
VolumeSnapshotLocations(o.SnapshotLocations...).
CSISnapshotTimeout(o.CSISnapshotTimeout)
if len(o.OrderedResources) > 0 {
orders, err := ParseOrderedResources(o.OrderedResources)
if err != nil {
Expand Down
3 changes: 3 additions & 0 deletions pkg/cmd/cli/backup/create_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package backup
import (
"context"
"testing"
"time"

"github.com/stretchr/testify/assert"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand All @@ -35,6 +36,7 @@ func TestCreateOptions_BuildBackup(t *testing.T) {
o.Labels.Set("velero.io/test=true")
o.OrderedResources = "pods=p1,p2;persistentvolumeclaims=pvc1,pvc2"
orders, err := ParseOrderedResources(o.OrderedResources)
o.CSISnapshotTimeout = 20 * time.Minute
assert.NoError(t, err)

backup, err := o.BuildBackup(testNamespace)
Expand All @@ -46,6 +48,7 @@ func TestCreateOptions_BuildBackup(t *testing.T) {
SnapshotVolumes: o.SnapshotVolumes.Value,
IncludeClusterResources: o.IncludeClusterResources.Value,
OrderedResources: orders,
CSISnapshotTimeout: metav1.Duration{Duration: o.CSISnapshotTimeout},
}, backup.Spec)

assert.Equal(t, map[string]string{
Expand Down
1 change: 1 addition & 0 deletions pkg/cmd/cli/schedule/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ func (o *CreateOptions) Run(c *cobra.Command, f client.Factory) error {
VolumeSnapshotLocations: o.BackupOptions.SnapshotLocations,
DefaultVolumesToRestic: o.BackupOptions.DefaultVolumesToRestic.Value,
OrderedResources: orders,
CSISnapshotTimeout: metav1.Duration{Duration: o.BackupOptions.CSISnapshotTimeout},
},
Schedule: o.Schedule,
UseOwnerReferencesInBackup: &o.UseOwnerReferencesInBackup,
Expand Down
6 changes: 5 additions & 1 deletion pkg/cmd/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ const (
// the default TTL for a backup
defaultBackupTTL = 30 * 24 * time.Hour

defaultCSISnapshotTimeout = 10 * time.Minute

// defaultCredentialsDirectory is the path on disk where credential
// files will be written to
defaultCredentialsDirectory = "/tmp/credentials"
Expand All @@ -111,7 +113,7 @@ type serverConfig struct {
// TODO(2.0) Deprecate defaultBackupLocation
pluginDir, metricsAddress, defaultBackupLocation string
backupSyncPeriod, podVolumeOperationTimeout, resourceTerminatingTimeout time.Duration
defaultBackupTTL, storeValidationFrequency time.Duration
defaultBackupTTL, storeValidationFrequency, defaultCSISnapshotTimeout time.Duration
restoreResourcePriorities []string
defaultVolumeSnapshotLocations map[string]string
restoreOnly bool
Expand Down Expand Up @@ -142,6 +144,7 @@ func NewCommand(f client.Factory) *cobra.Command {
defaultVolumeSnapshotLocations: make(map[string]string),
backupSyncPeriod: defaultBackupSyncPeriod,
defaultBackupTTL: defaultBackupTTL,
defaultCSISnapshotTimeout: defaultCSISnapshotTimeout,
storeValidationFrequency: defaultStoreValidationFrequency,
podVolumeOperationTimeout: defaultPodVolumeOperationTimeout,
restoreResourcePriorities: defaultRestorePriorities,
Expand Down Expand Up @@ -650,6 +653,7 @@ func (s *server) runControllers(defaultVolumeSnapshotLocations map[string]string
s.config.defaultBackupLocation,
s.config.defaultVolumesToRestic,
s.config.defaultBackupTTL,
s.config.defaultCSISnapshotTimeout,
s.sharedInformerFactory.Velero().V1().VolumeSnapshotLocations().Lister(),
defaultVolumeSnapshotLocations,
s.metrics,
Expand Down
15 changes: 12 additions & 3 deletions pkg/controller/backup_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ type backupController struct {
defaultBackupLocation string
defaultVolumesToRestic bool
defaultBackupTTL time.Duration
defaultCSISnapshotTimeout time.Duration
snapshotLocationLister velerov1listers.VolumeSnapshotLocationLister
defaultSnapshotLocations map[string]string
metrics *metrics.ServerMetrics
Expand All @@ -111,6 +112,7 @@ func NewBackupController(
defaultBackupLocation string,
defaultVolumesToRestic bool,
defaultBackupTTL time.Duration,
defaultCSISnapshotTimeout time.Duration,
volumeSnapshotLocationLister velerov1listers.VolumeSnapshotLocationLister,
defaultSnapshotLocations map[string]string,
metrics *metrics.ServerMetrics,
Expand All @@ -135,6 +137,7 @@ func NewBackupController(
defaultBackupLocation: defaultBackupLocation,
defaultVolumesToRestic: defaultVolumesToRestic,
defaultBackupTTL: defaultBackupTTL,
defaultCSISnapshotTimeout: defaultCSISnapshotTimeout,
snapshotLocationLister: volumeSnapshotLocationLister,
defaultSnapshotLocations: defaultSnapshotLocations,
metrics: metrics,
Expand Down Expand Up @@ -359,6 +362,11 @@ func (c *backupController) prepareBackupRequest(backup *velerov1api.Backup) *pkg
request.Spec.TTL.Duration = c.defaultBackupTTL
}

if request.Spec.CSISnapshotTimeout.Duration == 0 {
// set default CSI VolumeSnapshot timeout
request.Spec.CSISnapshotTimeout.Duration = c.defaultCSISnapshotTimeout
}

// calculate expiration
request.Status.Expiration = &metav1.Time{Time: c.clock.Now().Add(request.Spec.TTL.Duration)}

Expand Down Expand Up @@ -638,7 +646,7 @@ func (c *backupController) runBackup(backup *pkgbackup.Request) error {
backupLog.Error(err)
}

err = c.checkVolumeSnapshotReadyToUse(context.Background(), volumeSnapshots)
err = c.checkVolumeSnapshotReadyToUse(context.Background(), volumeSnapshots, backup.Spec.CSISnapshotTimeout.Duration)
if err != nil {
backupLog.Errorf("fail to wait VolumeSnapshot change to Ready: %s", err.Error())
}
Expand Down Expand Up @@ -879,9 +887,10 @@ func encodeToJSONGzip(data interface{}, desc string) (*bytes.Buffer, []error) {
// using goroutine here instead of waiting in CSI plugin, because it's not easy to make BackupItemAction
// parallel by now. After BackupItemAction parallel is implemented, this logic should be moved to CSI plugin
// as https://github.com/vmware-tanzu/velero-plugin-for-csi/pull/100
func (c *backupController) checkVolumeSnapshotReadyToUse(ctx context.Context, volumesnapshots []*snapshotv1api.VolumeSnapshot) error {
func (c *backupController) checkVolumeSnapshotReadyToUse(ctx context.Context, volumesnapshots []*snapshotv1api.VolumeSnapshot,
csiSnapshotTimeout time.Duration) error {
eg, _ := errgroup.WithContext(ctx)
timeout := 10 * time.Minute
timeout := csiSnapshotTimeout
interval := 5 * time.Second

for _, vs := range volumesnapshots {
Expand Down
4 changes: 4 additions & 0 deletions site/content/docs/main/api-types/backup.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ metadata:
namespace: velero
# Parameters about the backup. Required.
spec:
# CSISnapshotTimeout specifies the time used to wait for
# CSI VolumeSnapshot status turns to ReadyToUse during creation, before
# returning error as timeout. The default value is 10 minute.
csiSnapshotTimeout: 10m
# Array of namespaces to include in the backup. If unspecified, all namespaces are included.
# Optional.
includedNamespaces:
Expand Down
4 changes: 4 additions & 0 deletions site/content/docs/main/api-types/schedule.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ spec:
schedule: 0 7 * * *
# Template is the spec that should be used for each backup triggered by this schedule.
template:
# CSISnapshotTimeout specifies the time used to wait for
# CSI VolumeSnapshot status turns to ReadyToUse during creation, before
# returning error as timeout. The default value is 10 minute.
csiSnapshotTimeout: 10m
# Array of namespaces to include in the scheduled backup. If unspecified, all namespaces are included.
# Optional.
includedNamespaces:
Expand Down
4 changes: 2 additions & 2 deletions site/content/docs/main/examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,10 @@ cd velero
kubectl apply -f examples/nginx-app/with-pv.yaml
```

1. Create a backup with PV snapshotting:
1. Create a backup with PV snapshotting. `--csi-snapshot-timeout` is used to setup time to wait before CSI snapshot creation timeout. The default value is 10 minutes:

```bash
velero backup create nginx-backup --include-namespaces nginx-example
velero backup create nginx-backup --include-namespaces nginx-example --csi-snapshot-timeout=20m
```

1. Simulate a disaster:
Expand Down
5 changes: 5 additions & 0 deletions site/content/docs/v1.9/api-types/backup.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ metadata:
namespace: velero
# Parameters about the backup. Required.
spec:
# Available since v1.9.1.
# CSISnapshotTimeout specifies the time used to wait for
# CSI VolumeSnapshot status turns to ReadyToUse during creation, before
# returning error as timeout. The default value is 10 minute.
csiSnapshotTimeout: 10m
# Array of namespaces to include in the backup. If unspecified, all namespaces are included.
# Optional.
includedNamespaces:
Expand Down
5 changes: 5 additions & 0 deletions site/content/docs/v1.9/api-types/schedule.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ spec:
schedule: 0 7 * * *
# Template is the spec that should be used for each backup triggered by this schedule.
template:
# Available since v1.9.1.
# CSISnapshotTimeout specifies the time used to wait for
# CSI VolumeSnapshot status turns to ReadyToUse during creation, before
# returning error as timeout. The default value is 10 minute.
csiSnapshotTimeout: 10m
# Array of namespaces to include in the scheduled backup. If unspecified, all namespaces are included.
# Optional.
includedNamespaces:
Expand Down

0 comments on commit 7a749b8

Please sign in to comment.