From 6ef74cc72508f30e34c9de3d7fc5131375ace56d Mon Sep 17 00:00:00 2001 From: Lalatendu Das Date: Mon, 29 Jul 2024 08:56:05 +0000 Subject: [PATCH] pb-7504: make NFS job pod run as root for resource backup - When a GCP based file store is used as the NFS backup location, the job pod using it doesn't have write permission for the group user; this causes a permission denied error for the non-root user during backup and restore. - This is GKE specific behaviour, hence a check is added to force all job pods on such clusters to run as the root user, eradicating the permission denied error. Signed-off-by: Lalatendu Das --- pkg/drivers/nfsbackup/nfsbackup.go | 25 ++++++++++++++++++++++--- pkg/drivers/nfsrestore/nfsrestore.go | 25 ++++++++++++++++++++++--- pkg/drivers/utils/utils.go | 18 ++++++++++++++++++ 3 files changed, 62 insertions(+), 6 deletions(-) diff --git a/pkg/drivers/nfsbackup/nfsbackup.go b/pkg/drivers/nfsbackup/nfsbackup.go index 9704b131..ffbc9854 100644 --- a/pkg/drivers/nfsbackup/nfsbackup.go +++ b/pkg/drivers/nfsbackup/nfsbackup.go @@ -278,14 +278,33 @@ func jobForBackupResource( }, } + uid := utils.KdmpJobUid + // For GCP based clusters the NFS PVC is mounted with anomalous GID permissions (i.e. sans GID write permission), + // hence we avoid passing any specific UID or GID so that the Job pod will always run as the ROOT user. + // Note: this makes the job pod fail in a GCP based cluster with a PSA enabled environment.
+ + // check whether the cluster is GCP based or not + isGcpBasedCluster, err := utils.IsGcpHostedCluster() + if err != nil { + logrus.Errorf("failed to check the cluster is GCP based or not: %v", err) + return nil, fmt.Errorf("failed to check the cluster is GCP based or not for job [%s/%s]", jobOption.Namespace, jobOption.RestoreExportName) + } + if isGcpBasedCluster { + logrus.Debugf("Found a GCP based cluster hence not adding any specific UID/GID to the job, it will run with root user") + uid = "" + } + // The Job is intended to backup resources to NFS backuplocation // and it doesn't need a specific JOB uid/gid since it will be sqaushed at NFS server // hence used a global hardcoded UID/GID. // Not passing the groupId as we do not want to set the RunAsGroup field in the securityContext // This helps us in setting the primaryGroup ID to root for the user ID. - job, err = utils.AddSecurityContextToJob(job, utils.KdmpJobUid, "") - if err != nil { - return nil, err + logrus.Infof("DAS ............. Adding security context to the job") + if uid != "" { + job, err = utils.AddSecurityContextToJob(job, uid, "") + if err != nil { + return nil, err + } } // Add the image secret in job spec only if it is present in the stork deployment. diff --git a/pkg/drivers/nfsrestore/nfsrestore.go b/pkg/drivers/nfsrestore/nfsrestore.go index 881ef272..c9ae7f24 100644 --- a/pkg/drivers/nfsrestore/nfsrestore.go +++ b/pkg/drivers/nfsrestore/nfsrestore.go @@ -321,11 +321,30 @@ func jobForRestoreResource( }, }, } + + uid := utils.KdmpJobUid + // For GCP based clusters the NFS PVC is mounted with anomalous GID permissions (i.e. sans GID write permission), + // hence we avoid passing any specific UID or GID so that the Job pod will always run as the ROOT user. + // Note: this makes the job pod fail in a GCP based cluster with a PSA enabled environment.
+
+	// check the cluster is GCP based or not
+	isGcpBasedCluster, err := utils.IsGcpHostedCluster()
+	if err != nil {
+		logrus.Errorf("failed to check the cluster is GCP based or not: %v", err)
+		return nil, fmt.Errorf("failed to check the cluster is GCP based or not for job [%s/%s]", jobOption.Namespace, jobOption.RestoreExportName)
+	}
+	if isGcpBasedCluster {
+		logrus.Debugf("Found a GCP based cluster hence not adding any specific UID/GID to the job, it will run with root user")
+		uid = ""
+	}
 	// Not passing the groupId as we do not want to set the RunAsGroup field in the securityContext
 	// This helps us in setting the primaryGroup ID to root for the user ID.
-	job, err = utils.AddSecurityContextToJob(job, utils.KdmpJobUid, "")
-	if err != nil {
-		return nil, err
+	logrus.Infof("DAS ............. Adding security context to the job restore -path")
+	if uid != "" {
+		job, err = utils.AddSecurityContextToJob(job, uid, "")
+		if err != nil {
+			return nil, err
+		}
 	}
 	// Add the image secret in job spec only if it is present in the stork deployment.
 	if len(imageRegistrySecret) != 0 {
diff --git a/pkg/drivers/utils/utils.go b/pkg/drivers/utils/utils.go
index 5c328b2e..f52be1ea 100644
--- a/pkg/drivers/utils/utils.go
+++ b/pkg/drivers/utils/utils.go
@@ -1088,3 +1088,21 @@ func GetOcpNsUidGid(nsName string, psaJobUid string, psaJobGid string) (string,
 	}
 	return psaJobUid, psaJobGid, isOcp, nil
 }
+
+// IsGcpHostedCluster reports whether any node of the cluster carries a GCE
+// provider ID; it returns an error only when the node list cannot be fetched.
+func IsGcpHostedCluster() (bool, error) {
+	// Any GCP hosted cluster, be it vanilla, OCP or GKE, is expected to have
+	// a ProviderID in its node spec with a prefix of "gce".
+	nodes, err := core.Instance().GetNodes()
+	if err != nil {
+		return false, fmt.Errorf("failed to get nodes: %w", err)
+	}
+
+	for i := range nodes.Items { // index to avoid copying each Node struct
+		if strings.HasPrefix(nodes.Items[i].Spec.ProviderID, "gce://") {
+			return true, nil
+		}
+	}
+	return false, nil
+}