From 4f2c780e9e4216d20a08af95d82b6e7a3ced600a Mon Sep 17 00:00:00 2001
From: Pallav
Date: Mon, 6 May 2024 12:00:36 +0000
Subject: [PATCH] Fix Cloud Native Restore Path

---
 go.mod | 2 +-
 go.sum | 4 +-
 pkg/executor/nfs/nfsrestoreresources.go | 2 +-
 .../stork/drivers/volume/volume.go | 28 +++
 .../controllers/applicationbackup.go | 41 +++-
 .../controllers/applicationrestore.go | 199 ++++++++++--------
 .../pkg/resourcecollector/persistentvolume.go | 2 +-
 .../client-go/kubecli/kubevirt_test_utils.go | 2 -
 vendor/modules.txt | 2 +-
 9 files changed, 188 insertions(+), 94 deletions(-)

diff --git a/go.mod b/go.mod
index d827333c1..df2bedf8e 100644
--- a/go.mod
+++ b/go.mod
@@ -8,7 +8,7 @@ require (
 github.com/hashicorp/go-version v1.6.0
 github.com/kubernetes-csi/external-snapshotter/client/v4 v4.2.0
 github.com/kubernetes-incubator/external-storage v0.20.4-openstorage-rc7
- github.com/libopenstorage/stork v1.4.1-0.20240502172400-885982c36058
+ github.com/libopenstorage/stork v1.4.1-0.20240506103309-0605fa31d8ff
 github.com/portworx/pxc v0.33.0
 github.com/portworx/sched-ops v1.20.4-rc1.0.20240424153814-f3083bdb4578
 github.com/sirupsen/logrus v1.9.3
diff --git a/go.sum b/go.sum
index 781d94120..afc32fc36 100644
--- a/go.sum
+++ b/go.sum
@@ -3362,8 +3362,8 @@ github.com/libopenstorage/stork v1.4.1-0.20230502135851-9cacb19e1df5/go.mod h1:R
 github.com/libopenstorage/stork v1.4.1-0.20230519043154-cbc10dffaf19/go.mod h1:Xm4DHoViynFXMQKBXGj3IkA77LY2RBFkNtv6vbo3wNw=
 github.com/libopenstorage/stork v1.4.1-0.20230601053837-5dd68f026569/go.mod h1:+mKPMCPNhS/XOF2RPcNFijkr67CCCWp0o8OXVG6xxAk=
 github.com/libopenstorage/stork v1.4.1-0.20230610103146-72cf75320066/go.mod h1:Yst+fnOYjWk6SA5pXZBKm19wtiinjxQ/vgYTXI3k80Q=
-github.com/libopenstorage/stork v1.4.1-0.20240502172400-885982c36058 h1:bjV6pxioNGTmK900pwF9UQ2Te5ov1lsoH+1jha9dFWg=
-github.com/libopenstorage/stork v1.4.1-0.20240502172400-885982c36058/go.mod h1:kp5qtpq+BgjL5WqiOpDvbPH1WGReO5AaqXDbb+XpvzM=
+github.com/libopenstorage/stork v1.4.1-0.20240506103309-0605fa31d8ff h1:MEz/7q5zYBJaX6aLSxOcDWoOLZCb5a7jJlEkYtZ/t6A=
+github.com/libopenstorage/stork v1.4.1-0.20240506103309-0605fa31d8ff/go.mod h1:kp5qtpq+BgjL5WqiOpDvbPH1WGReO5AaqXDbb+XpvzM=
 github.com/libopenstorage/systemutils v0.0.0-20160208220149-44ac83be3ce1/go.mod h1:xwNGC7xiz/BQ/wbMkvHujL8Gjgseg+x41xMek7sKRRQ=
 github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de h1:9TO3cAIGXtEhnIaL+V+BEER86oLrvS+kWobKpbJuye0=
 github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de/go.mod h1:zAbeS9B/r2mtpb6U+EI2rYA5OAXxsYw6wTamcNW+zcE=
diff --git a/pkg/executor/nfs/nfsrestoreresources.go b/pkg/executor/nfs/nfsrestoreresources.go
index 2a3424c10..c92143f71 100644
--- a/pkg/executor/nfs/nfsrestoreresources.go
+++ b/pkg/executor/nfs/nfsrestoreresources.go
@@ -384,7 +384,7 @@ func getPVCToPVMapping(allObjects []runtime.Unstructured) (map[string]*v1.Persis
 }

 func isGenericCSIPersistentVolume(pv *v1.PersistentVolume) (bool, error) {
- driverName, err := volume.GetPVDriver(pv)
+ driverName, err := volume.GetPVDriverForRestore(pv)
 if err != nil {
 return false, err
 }
diff --git a/vendor/github.com/libopenstorage/stork/drivers/volume/volume.go b/vendor/github.com/libopenstorage/stork/drivers/volume/volume.go
index 5bd4b310d..97346f9b7 100644
--- a/vendor/github.com/libopenstorage/stork/drivers/volume/volume.go
+++ b/vendor/github.com/libopenstorage/stork/drivers/volume/volume.go
@@ -67,6 +67,16 @@ var (
 CSIDriverName,
 KDMPDriverName,
 }
+
+ orderedListOfDriversForRestore = []string{
+ AWSDriverName,
+ AzureDriverName,
+ GCEDriverName,
+ LinstorDriverName,
+ PortworxDriverName,
+ CSIDriverName,
+ KDMPDriverName,
+ }
 )

 // Driver defines an external volume driver interface.
@@ -399,6 +409,24 @@ func GetPVDriver(pv *v1.PersistentVolume) (string, error) {
 }
 }

+// GetPVDriverForRestore gets the driver associated with a PV. Returns ErrNotSupported if the PV is
+// not owned by any available driver
+func GetPVDriverForRestore(pv *v1.PersistentVolume) (string, error) {
+ for _, driverName := range orderedListOfDriversForRestore {
+ d, ok := volDrivers[driverName]
+ if !ok {
+ continue
+ }
+ if d.OwnsPV(pv) {
+ return driverName, nil
+ }
+ }
+ return "", &errors.ErrNotSupported{
+ Feature: "VolumeDriver",
+ Reason: fmt.Sprintf("PV %v provisioned using unsupported driver", pv.Name),
+ }
+}
+
 // ClusterPairNotSupported to be used by drivers that don't support pairing
 type ClusterPairNotSupported struct{}
diff --git a/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationbackup.go b/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationbackup.go
index d119c9d03..3da6b4452 100644
--- a/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationbackup.go
+++ b/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationbackup.go
@@ -439,6 +439,32 @@ func (a *ApplicationBackupController) handle(ctx context.Context, backup *stork_
 return nil
 }

+ // If stork got restarted, the IncludeResource memory map would have been cleaned up.
+ // Hence re-create the memory map here.
+ if IsBackupObjectTypeVirtualMachine(backup) &&
+ backup.Status.Stage != stork_api.ApplicationBackupStageInitial &&
+ backup.Status.Stage != stork_api.ApplicationBackupStageImportResource &&
+ (len(a.vmIncludeResource[string(backup.UID)]) == 0 || len(a.vmIncludeResourceMap[string(backup.UID)]) == 0) {
+ logrus.Infof("Stork seems to have restarted, repopulating the VM resource map for backup %v", backup.Name)
+ // First, get the VMs from the various filters provided.
+ vmList, objectMap, err := resourcecollector.GetVMIncludeListFromBackup(backup)
+ if err != nil {
+ logrus.Debugf("failed to import VM resources after stork reboot, returning for retry")
+ return err
+ }
+ nsMap := make(map[string]bool)
+ // Second, fetch VM resources from the list of filtered VMs and the freeze/thaw rule for each of them.
+ // Also set SkipVmAutoExecRules to true as we don't need to recreate them at this stage.
+ skipVmAutoRuleCommands := true
+ vmIncludeResources, objectMap, _, _ := resourcecollector.GetVMIncludeResourceInfoList(vmList,
+ objectMap, nsMap, skipVmAutoRuleCommands)
+
+ // Update the in-memory data structures for later use.
+ a.vmIncludeResourceMap[string(backup.UID)] = objectMap
+ a.vmIncludeResource[string(backup.UID)] = vmIncludeResources
+ a.vmNsListMap[string(backup.UID)] = nsMap
+ }
+
 switch backup.Status.Stage {
 case stork_api.ApplicationBackupStageInitial:
 // Validate parameters
@@ -750,6 +776,12 @@ func (a *ApplicationBackupController) backupVolumes(backup *stork_api.Applicatio
 var objectMap map[stork_api.ObjectInfo]bool
 if IsBackupObjectTypeVirtualMachine(backup) {
 objectMap = a.vmIncludeResourceMap[string(backup.UID)]
+ if len(objectMap) == 0 {
+ // For debugging purposes only.
+ // It's possible that we will have empty resources to back up during scheduled backups due
+ // to the VM or namespace being deleted.
+ logrus.Warnf("found empty includeResources for VM backup during volumeBakup stage") + } } else { objectMap = stork_api.CreateObjectsMap(backup.Spec.IncludeResources) } @@ -1559,7 +1591,6 @@ func (a *ApplicationBackupController) uploadObject( if err != nil { return err } - _, err = writer.Write(data) if err != nil { closeErr := writer.Close() @@ -1776,10 +1807,15 @@ func (a *ApplicationBackupController) backupResources( var objectMap map[stork_api.ObjectInfo]bool if IsBackupObjectTypeVirtualMachine(backup) { objectMap = a.vmIncludeResourceMap[string(backup.UID)] + if len(objectMap) == 0 { + // for debugging purpose + // its possible we will not have any resources during schedule backups due + // vm or namespace deletions + logrus.Warnf("found empty resources for VM backup during resourceBackup stage...") + } } else { objectMap = stork_api.CreateObjectsMap(backup.Spec.IncludeResources) } - namespacelist := backup.Spec.Namespaces // GetResources takes more time, if we have more number of namespaces // So, submitting it in batches and in between each batch, @@ -1927,7 +1963,6 @@ func (a *ApplicationBackupController) backupResources( } return nil } - // Do any additional preparation for the resources if required if err = a.prepareResources(backup, allObjects); err != nil { message := fmt.Sprintf("Error preparing resources for backup: %v", err) diff --git a/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationrestore.go b/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationrestore.go index c7f302a89..66860dddf 100644 --- a/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationrestore.go +++ b/vendor/github.com/libopenstorage/stork/pkg/applicationmanager/controllers/applicationrestore.go @@ -662,8 +662,11 @@ func (a *ApplicationRestoreController) restoreVolumes(restore *storkapi.Applicat if len(restore.Status.Volumes) != pvcCount { // Here backupVolumeInfoMappings is framed based on driver name mapping, hence startRestore() // gets called once per driver - if !nfs { - for driverName, vInfos := range backupVolumeInfoMappings { + for driverName, vInfos := range backupVolumeInfoMappings { + // Portworx driver restore is not supported via job as it can be a secured px volume + // to access the token, we need to run the restore in the same pod as the stork controller + // this check is equivalent to (if !nfs || (nfs && driverName == volume.PortworxDriverName)) + if !nfs || driverName == volume.PortworxDriverName { backupVolInfos := vInfos driver, err := volume.Get(driverName) // BL NFS + kdmp = nfs code path @@ -674,82 +677,95 @@ func (a *ApplicationRestoreController) restoreVolumes(restore *storkapi.Applicat if err != nil { return err } - // For each driver, check if it needs any additional resources to be - // restored before starting the volume restore - objects, err := a.downloadResources(backup, restore.Spec.BackupLocation, restore.Namespace) - if err != nil { - log.ApplicationRestoreLog(restore).Errorf("Error downloading resources: %v", err) - return err - } - // Skip pv/pvc if replacepolicy is set to retain to avoid creating - if restore.Spec.ReplacePolicy == storkapi.ApplicationRestoreReplacePolicyRetain { - backupVolInfos, existingRestoreVolInfos, err = a.skipVolumesFromRestoreList(restore, objects, driver, vInfos) - if err != nil { - log.ApplicationRestoreLog(restore).Errorf("Error while checking pvcs: %v", err) - return err + var preRestoreObjects, objects []runtime.Unstructured + // this check 
+ if nfs {
+ if restore.Spec.ReplacePolicy == storkapi.ApplicationRestoreReplacePolicyRetain {
+ // Skip pv/pvc if replacepolicy is set to retain to avoid creating with empty object
+ backupVolInfos, existingRestoreVolInfos, err = a.skipVolumesFromRestoreList(restore, objects, driver, vInfos)
+ if err != nil {
+ log.ApplicationRestoreLog(restore).Errorf("Error while checking pvcs: %v", err)
+ return err
+ }
 }
- }
- var storageClassesBytes []byte
- if driverName == "csi" {
- storageClassesBytes, err = a.downloadObject(backup, backup.Spec.BackupLocation, backup.Namespace, "storageclasses.json", false)
+ } else {
+ // For each driver, check if it needs any additional resources to be
+ // restored before starting the volume restore
+ objects, err := a.downloadResources(backup, restore.Spec.BackupLocation, restore.Namespace)
 if err != nil {
- log.ApplicationRestoreLog(restore).Errorf("Error in a.downloadObject %v", err)
+ log.ApplicationRestoreLog(restore).Errorf("Error downloading resources: %v", err)
 return err
 }
- }
- preRestoreObjects, err := driver.GetPreRestoreResources(backup, restore, objects, storageClassesBytes)
- if err != nil {
- log.ApplicationRestoreLog(restore).Errorf("Error getting PreRestore Resources: %v", err)
- return err
- }
-
- // Pre-delete resources for CSI driver
- if (driverName == "csi" || driverName == "kdmp") && restore.Spec.ReplacePolicy == storkapi.ApplicationRestoreReplacePolicyDelete {
- objectMap := storkapi.CreateObjectsMap(restore.Spec.IncludeResources)
- objectBasedOnIncludeResources := make([]runtime.Unstructured, 0)
- var opts resourcecollector.Options
- for _, o := range objects {
- skip, err := a.resourceCollector.PrepareResourceForApply(
- o,
- objects,
- objectMap,
- restore.Spec.NamespaceMapping,
- nil, // no need to set storage class mappings at this stage
- nil,
- restore.Spec.IncludeOptionalResourceTypes,
- nil,
- &opts,
- restore.Spec.BackupLocation,
- restore.Namespace,
- )
+ // Skip pv/pvc if replacepolicy is set to retain to avoid creating
+ if restore.Spec.ReplacePolicy == storkapi.ApplicationRestoreReplacePolicyRetain {
+ backupVolInfos, existingRestoreVolInfos, err = a.skipVolumesFromRestoreList(restore, objects, driver, vInfos)
 if err != nil {
+ log.ApplicationRestoreLog(restore).Errorf("Error while checking pvcs: %v", err)
 return err
 }
- if !skip {
- objectBasedOnIncludeResources = append(
- objectBasedOnIncludeResources,
- o,
- )
+ }
+ var storageClassesBytes []byte
+ if driverName == "csi" {
+ storageClassesBytes, err = a.downloadObject(backup, backup.Spec.BackupLocation, backup.Namespace, "storageclasses.json", false)
+ if err != nil {
+ log.ApplicationRestoreLog(restore).Errorf("Error in a.downloadObject %v", err)
+ return err
 }
 }
- tempObjects, err := a.getNamespacedObjectsToDelete(
- restore,
- objectBasedOnIncludeResources,
- )
+ preRestoreObjects, err = driver.GetPreRestoreResources(backup, restore, objects, storageClassesBytes)
 if err != nil {
+ log.ApplicationRestoreLog(restore).Errorf("Error getting PreRestore Resources: %v", err)
 return err
 }
- err = a.resourceCollector.DeleteResources(
- a.dynamicInterface,
- tempObjects, updateCr)
- if err != nil {
- return err
+
+ // Pre-delete resources for CSI driver
+ if (driverName == "csi" || driverName == "kdmp") && restore.Spec.ReplacePolicy == storkapi.ApplicationRestoreReplacePolicyDelete {
+ objectMap := storkapi.CreateObjectsMap(restore.Spec.IncludeResources)
+ objectBasedOnIncludeResources := make([]runtime.Unstructured, 0)
+ var opts resourcecollector.Options
+ for _, o := range objects {
+ skip, err := a.resourceCollector.PrepareResourceForApply(
+ o,
+ objects,
+ objectMap,
+ restore.Spec.NamespaceMapping,
+ nil, // no need to set storage class mappings at this stage
+ nil,
+ restore.Spec.IncludeOptionalResourceTypes,
+ nil,
+ &opts,
+ restore.Spec.BackupLocation,
+ restore.Namespace,
+ )
+ if err != nil {
+ return err
+ }
+ if !skip {
+ objectBasedOnIncludeResources = append(
+ objectBasedOnIncludeResources,
+ o,
+ )
+ }
+ }
+ tempObjects, err := a.getNamespacedObjectsToDelete(
+ restore,
+ objectBasedOnIncludeResources,
+ )
+ if err != nil {
+ return err
+ }
+ err = a.resourceCollector.DeleteResources(
+ a.dynamicInterface,
+ tempObjects, updateCr)
+ if err != nil {
+ return err
+ }
 }
- }
- // pvc creation is not part of kdmp
- if driverName != volume.KDMPDriverName {
- if err := a.applyResources(restore, preRestoreObjects, updateCr); err != nil {
- return err
+ // pvc creation is not part of kdmp
+ if driverName != volume.KDMPDriverName {
+ if err := a.applyResources(restore, preRestoreObjects, updateCr); err != nil {
+ return err
+ }
 }
 }
 restore, err = a.updateRestoreCRInVolumeStage(
@@ -841,10 +857,9 @@ func (a *ApplicationRestoreController) restoreVolumes(restore *storkapi.Applicat
 return err
 }
 }
- }
 }
- // Check whether ResourceExport is present or not
+ // If NFS, we create a ResourceExport CR but ensure that PX volumes are ignored in the restore
 if nfs {
 err = a.client.Update(context.TODO(), restore)
 if err != nil {
@@ -1319,7 +1334,6 @@ func getNamespacedPVCLocation(pvc *v1.PersistentVolumeClaim) string {

 // getPVCToPVMapping constructs a mapping of PVC name/namespace to PV objects
 func getPVCToPVMapping(allObjects []runtime.Unstructured) (map[string]*v1.PersistentVolume, error) {
-
 // Get mapping of PVC name to PV name
 pvNameToPVCName := make(map[string]string)
 for _, o := range allObjects {
@@ -1365,7 +1379,7 @@ func getPVCToPVMapping(allObjects []runtime.Unstructured) (map[string]*v1.Persis
 }

 func isGenericCSIPersistentVolume(pv *v1.PersistentVolume) (bool, error) {
- driverName, err := volume.GetPVDriver(pv)
+ driverName, err := volume.GetPVDriverForRestore(pv)
 if err != nil {
 return false, err
 }
@@ -1407,21 +1421,26 @@ func (a *ApplicationRestoreController) skipVolumesFromRestoreList(
 logrus.Infof("skipping namespace %s for restore", bkupVolInfo.Namespace)
 continue
 }
+ ns := val
+ var pvcName string
+ if objects != nil {
+ // get corresponding pvc object from objects list
+ pvcObject, err := volume.GetPVCFromObjects(objects, bkupVolInfo)
+ if err != nil {
+ return newVolInfos, existingInfos, err
+ }
+ pvcName = pvcObject.Name
- // get corresponding pvc object from objects list
- pvcObject, err := volume.GetPVCFromObjects(objects, bkupVolInfo)
- if err != nil {
- return newVolInfos, existingInfos, err
+ } else {
+ pvcName = bkupVolInfo.PersistentVolumeClaim
 }
-
- ns := val
- pvc, err := core.Instance().GetPersistentVolumeClaim(pvcObject.Name, ns)
+ pvc, err := core.Instance().GetPersistentVolumeClaim(pvcName, ns)
 if err != nil {
 if k8s_errors.IsNotFound(err) {
 newVolInfos = append(newVolInfos, bkupVolInfo)
 continue
 }
- return newVolInfos, existingInfos, fmt.Errorf("erorr getting pvc %s/%s: %v", ns, pvcObject.Name, err)
+ return newVolInfos, existingInfos, fmt.Errorf("error getting pvc %s/%s: %v", ns, pvcName, err)
 }
 pvName := pvc.Spec.VolumeName
 var zones []string
@@ -1599,10 +1618,24 @@ func (a *ApplicationRestoreController) applyResources(
 namespacedName := types.NamespacedName{}
 namespacedName.Namespace = restore.Namespace
 namespacedName.Name = restore.Name
-
- pvNameMappings, err := a.getPVNameMappings(restore, objects)
- if err != nil {
- return err
+ // applyResources is called in both the volume stage and the resource stage.
+ // In the volume stage, it is called to apply the preRestore objects.
+ // During the volume stage, restoreVolume is not yet updated in the volumeInfo structure.
+ // In between two drivers' PVC restore processing, there is a chance that the applicationrestore CR's status.VolumeInfo
+ // is updated with the basic information of the volume, without the restoreVolume name.
+ // The list of preRestore resources for each driver is as follows:
+ // aws, azure, gce - these drivers do not have any preRestore objects.
+ // kdmp - restores the PVC spec, but we do not apply it in the volume stage, as applyResources is not called for the kdmp case.
+ // PXD - for px volumes, we apply the secrets of encrypted volumes.
+ // That means we do not need to call getPVNameMappings during the volume stage.
+ // So, avoid the call to getPVNameMappings if it is called from the volume stage.
+ var pvNameMappings map[string]string
+ var err error
+ if restore.Status.Stage != storkapi.ApplicationRestoreStageVolumes {
+ pvNameMappings, err = a.getPVNameMappings(restore, objects)
+ if err != nil {
+ return err
+ }
 }
 objectMap := storkapi.CreateObjectsMap(restore.Spec.IncludeResources)
 tempObjects := make([]runtime.Unstructured, 0)
@@ -1903,7 +1936,7 @@ func (a *ApplicationRestoreController) restoreResources(
 logrus.Debugf("resource export: %s, status: %s", resourceExport.Name, resourceExport.Status.Status)
 switch resourceExport.Status.Status {
 case kdmpapi.ResourceExportStatusFailed:
- message = fmt.Sprintf("Error applying resources: %v", err)
+ message = fmt.Sprintf("Error applying resources: %v", resourceExport.Status.Reason)
 restore.Status.Status = storkapi.ApplicationRestoreStatusFailed
 restore.Status.Stage = storkapi.ApplicationRestoreStageFinal
 restore.Status.Reason = message
@@ -2305,7 +2338,7 @@ func (a *ApplicationRestoreController) processVMResourcesForVMRestoreFromNFS(res
 logrus.Debugf("resource export: %s, status: %s", resourceExport.Name, resourceExport.Status.Status)
 switch resourceExport.Status.Status {
 case kdmpapi.ResourceExportStatusFailed:
- message = fmt.Sprintf("Error applying resources: %v", err)
+ message = fmt.Sprintf("Error applying resources: %v", resourceExport.Status.Reason)
 restore.Status.Status = storkapi.ApplicationRestoreStatusFailed
 restore.Status.Stage = storkapi.ApplicationRestoreStageFinal
 restore.Status.Reason = message
diff --git a/vendor/github.com/libopenstorage/stork/pkg/resourcecollector/persistentvolume.go b/vendor/github.com/libopenstorage/stork/pkg/resourcecollector/persistentvolume.go
index 982288fa4..89b889e76 100644
--- a/vendor/github.com/libopenstorage/stork/pkg/resourcecollector/persistentvolume.go
+++ b/vendor/github.com/libopenstorage/stork/pkg/resourcecollector/persistentvolume.go
@@ -207,7 +207,7 @@ func (r *ResourceCollector) preparePVResourceForApply(
 // checks proper driver by looking at pv name
 if driverName == "" {
 var err error
- driverName, err = volume.GetPVDriver(&pv)
+ driverName, err = volume.GetPVDriverForRestore(&pv)
 if err != nil {
 return false, err
 }
diff --git a/vendor/kubevirt.io/client-go/kubecli/kubevirt_test_utils.go b/vendor/kubevirt.io/client-go/kubecli/kubevirt_test_utils.go
index 3d01b19f9..0566827c2 100644
--- a/vendor/kubevirt.io/client-go/kubecli/kubevirt_test_utils.go
+++ b/vendor/kubevirt.io/client-go/kubecli/kubevirt_test_utils.go
@@ -1,5 +1,3 @@
-// +build skipcompile
-
 package kubecli

 import (
diff --git a/vendor/modules.txt b/vendor/modules.txt
index aa5c51fc2..393d021e4 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -441,7 +441,7 @@ github.com/libopenstorage/openstorage-sdk-clients/sdk/golang
 github.com/libopenstorage/secrets
 github.com/libopenstorage/secrets/aws/credentials
 github.com/libopenstorage/secrets/k8s
-# github.com/libopenstorage/stork v1.4.1-0.20240502172400-885982c36058
+# github.com/libopenstorage/stork v1.4.1-0.20240506103309-0605fa31d8ff
 ## explicit; go 1.21
 github.com/libopenstorage/stork/drivers
 github.com/libopenstorage/stork/drivers/volume