diff --git a/pkg/controllers/dataexport/reconcile.go b/pkg/controllers/dataexport/reconcile.go index 84d7b91f5..681bfdcb5 100644 --- a/pkg/controllers/dataexport/reconcile.go +++ b/pkg/controllers/dataexport/reconcile.go @@ -526,7 +526,7 @@ func (c *Controller) sync(ctx context.Context, in *kdmpapi.DataExport) (bool, er if err != nil { logrus.Infof("job-pod name and namespace extraction failed: %v", err) } - appendPodLogToStork(name, namespace) + utils.DisplayJobpodLogandEvents(name, namespace) } } cleanupTask := func() (interface{}, bool, error) { @@ -607,41 +607,6 @@ func parseExcludeFileListKey(pvcStorageClass string, excludeFileListValue string return excludeFileList, nil } -func appendPodLogToStork(jobName string, namespace string) { - // Get job and check whether it has live pod attaced to it - job, err := batch.Instance().GetJob(jobName, namespace) - if err != nil && !k8sErrors.IsNotFound(err) { - logrus.Infof("failed in getting job %v/%v with err: %v", namespace, jobName, err) - } - pods, err := core.Instance().GetPods( - job.Namespace, - map[string]string{ - "job-name": job.Name, - }, - ) - if err != nil { - logrus.Infof("failed in fetching job pods %s/%s: %v", namespace, jobName, err) - } - for _, pod := range pods.Items { - numLogLines := int64(50) - podDescribe, err := core.Instance().GetPodByName(pod.Name, pod.Namespace) - if err != nil { - logrus.Infof("Error fetching description of job-pod[%s] :%v", pod.Name, err) - } - logrus.Infof("start of job-pod [%s]'s description", pod.Name) - logrus.Infof("Describe %v", podDescribe) - logrus.Infof("end of job-pod [%s]'s description", pod.Name) - podLog, err := core.Instance().GetPodLog(pod.Name, pod.Namespace, &corev1.PodLogOptions{TailLines: &numLogLines}) - if err != nil { - logrus.Infof("error fetching log of job-pod %s: %v", pod.Name, err) - } else { - logrus.Infof("start of job-pod [%s]'s log...", pod.Name) - logrus.Infof(podLog) - logrus.Infof("end of job-pod [%s]'s log...", pod.Name) - } - } -} - func 
(c *Controller) createJobCredCertSecrets( dataExport *kdmpapi.DataExport, vb *kdmpapi.VolumeBackup, diff --git a/pkg/controllers/resourceexport/reconcile.go b/pkg/controllers/resourceexport/reconcile.go index 9b04a6bf8..efcb3dc84 100644 --- a/pkg/controllers/resourceexport/reconcile.go +++ b/pkg/controllers/resourceexport/reconcile.go @@ -203,6 +203,16 @@ func (c *Controller) process(ctx context.Context, in *kdmpapi.ResourceExport) (b } switch progress.State { case drivers.JobStateFailed:
+		// Dump the job pod's description, events and log only when the export is
+		// already marked failed and a transfer job ID has been recorded.
+		if resourceExport.Status.Status == kdmpapi.ResourceExportStatusFailed && resourceExport.Status.TransferID != "" {
+			namespace, name, err := utils.ParseJobID(resourceExport.Status.TransferID)
+			if err != nil {
+				logrus.Infof("job-pod name and namespace extraction failed: %v", err)
+			} else {
+				utils.DisplayJobpodLogandEvents(name, namespace)
+			}
+		}
 errMsg := fmt.Sprintf("%s transfer job failed: %s", resourceExport.Status.TransferID, progress.Reason) // If a job has failed it means it has tried all possible retires and given up.
// In such a scenario we need to fail RE CR and move to clean up stage diff --git a/pkg/drivers/kopiabackup/kopiabackup.go b/pkg/drivers/kopiabackup/kopiabackup.go index febf532da..c47284d00 100644 --- a/pkg/drivers/kopiabackup/kopiabackup.go +++ b/pkg/drivers/kopiabackup/kopiabackup.go @@ -211,6 +211,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { // Check whether mount point failure mountFailed := utils.IsJobPodMountFailed(job, namespace) if mountFailed { + utils.DisplayJobpodLogandEvents(job.Name, job.Namespace) errMsg := fmt.Sprintf("job [%v/%v] failed to mount pvc, please check job pod's description for more detail", namespace, name) return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil } @@ -224,6 +225,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { jobErr, nodeErr := utils.IsJobOrNodeFailed(job) var errMsg string if jobErr { + utils.DisplayJobpodLogandEvents(job.Name, job.Namespace) errMsg = fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed) return utils.ToJobStatus(0, errMsg, jobStatus), nil } @@ -242,6 +244,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { } errMsg := fmt.Sprintf("failed to fetch volumebackup %s/%s status: %v", namespace, name, err) logrus.Errorf("%s: %v", fn, errMsg) + utils.DisplayJobpodLogandEvents(job.Name, job.Namespace) return nil, fmt.Errorf(errMsg) } diff --git a/pkg/drivers/kopiadelete/kopiadelete.go b/pkg/drivers/kopiadelete/kopiadelete.go index bc4452305..b67b3e92b 100644 --- a/pkg/drivers/kopiadelete/kopiadelete.go +++ b/pkg/drivers/kopiadelete/kopiadelete.go @@ -143,6 +143,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { // Check whether mount point failure mountFailed := utils.IsJobPodMountFailed(job, namespace) if mountFailed { + utils.DisplayJobpodLogandEvents(job.Name, job.Namespace) errMsg := fmt.Sprintf("job [%v/%v] failed to mount pvc, please check job pod's description for more detail", 
namespace, name) return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil } @@ -158,13 +159,9 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { jobStatus = job.Status.Conditions[0].Type } - if err != nil { - errMsg := fmt.Sprintf("failed to get restart count for job %s/%s job: %v", namespace, name, err) - logrus.Errorf("%s: %v", fn, errMsg) - return nil, fmt.Errorf(errMsg) - } if utils.IsJobFailed(job) { + utils.DisplayJobpodLogandEvents(job.Name, job.Namespace) errMsg := fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed) return utils.ToJobStatus(0, errMsg, jobStatus), nil } diff --git a/pkg/drivers/kopiamaintenance/kopiamaintenance.go b/pkg/drivers/kopiamaintenance/kopiamaintenance.go index 665cf58f2..d1718fbb0 100644 --- a/pkg/drivers/kopiamaintenance/kopiamaintenance.go +++ b/pkg/drivers/kopiamaintenance/kopiamaintenance.go @@ -147,13 +147,9 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { jobStatus = job.Status.Conditions[0].Type } - if err != nil { - errMsg := fmt.Sprintf("failed to get restart count for job %s/%s job: %v", namespace, name, err) - logrus.Errorf("%s: %v", fn, errMsg) - return nil, fmt.Errorf(errMsg) - } if utils.IsJobFailed(job) { + utils.DisplayJobpodLogandEvents(job.Name, job.Namespace) errMsg := fmt.Sprintf("check maintenance [%s/%s] job for details: %s", namespace, name, drivers.ErrJobFailed) return utils.ToJobStatus(0, errMsg, jobStatus), nil } diff --git a/pkg/drivers/kopiarestore/kopiarestore.go b/pkg/drivers/kopiarestore/kopiarestore.go index 72e74463c..771f3961a 100644 --- a/pkg/drivers/kopiarestore/kopiarestore.go +++ b/pkg/drivers/kopiarestore/kopiarestore.go @@ -119,6 +119,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { // Check whether mount point failure mountFailed := utils.IsJobPodMountFailed(job, namespace) if mountFailed { + utils.DisplayJobpodLogandEvents(job.Name, job.Namespace) errMsg := fmt.Sprintf("job [%v/%v] failed to 
mount pvc, please check job pod's description for more detail", namespace, name) return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil } @@ -133,13 +134,9 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { jobStatus = job.Status.Conditions[0].Type } - if err != nil { - errMsg := fmt.Sprintf("failed to get restart count for job %s/%s job: %v", namespace, name, err) - logrus.Errorf("%s: %v", fn, errMsg) - return nil, fmt.Errorf(errMsg) - } if utils.IsJobFailed(job) { + utils.DisplayJobpodLogandEvents(job.Name, job.Namespace) errMsg := fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed) return utils.ToJobStatus(0, errMsg, jobStatus), nil } diff --git a/pkg/drivers/nfsbackup/nfsbackup.go b/pkg/drivers/nfsbackup/nfsbackup.go index 2f382e52f..8a35494fb 100644 --- a/pkg/drivers/nfsbackup/nfsbackup.go +++ b/pkg/drivers/nfsbackup/nfsbackup.go @@ -86,6 +86,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { // Check whether mount point failure mountFailed := utils.IsJobPodMountFailed(job, namespace) if mountFailed { + utils.DisplayJobpodLogandEvents(job.Name, job.Namespace) errMsg := fmt.Sprintf("job [%v/%v] failed while mounting NFS mount endpoint", namespace, name) return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil } @@ -104,6 +105,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { jobErr, nodeErr := utils.IsJobOrNodeFailed(job) var errMsg string if jobErr { + utils.DisplayJobpodLogandEvents(job.Name, job.Namespace) errMsg = fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed) return utils.ToJobStatus(0, errMsg, jobStatus), nil } diff --git a/pkg/drivers/nfscsirestore/nfscsirestore.go b/pkg/drivers/nfscsirestore/nfscsirestore.go index a3b7cc2e6..520c84973 100644 --- a/pkg/drivers/nfscsirestore/nfscsirestore.go +++ b/pkg/drivers/nfscsirestore/nfscsirestore.go @@ -83,6 +83,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, 
error) { // Check for mount point failure mountFailed := utils.IsJobPodMountFailed(job, namespace) if mountFailed { + utils.DisplayJobpodLogandEvents(job.Name, job.Namespace) errMsg := fmt.Sprintf("job [%v/%v] failed while mounting NFS mount endpoint", namespace, name) return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil } @@ -101,6 +102,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { jobErr, nodeErr := utils.IsJobOrNodeFailed(job) if jobErr { + utils.DisplayJobpodLogandEvents(job.Name, job.Namespace) errMsg = fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed) return utils.ToJobStatus(0, errMsg, jobStatus), nil } @@ -109,10 +111,6 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { return utils.ToJobStatus(0, errMsg, jobStatus), nil } - if utils.IsJobFailed(job) { - errMsg := fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed) - return utils.ToJobStatus(0, errMsg, jobStatus), nil - } if utils.IsJobPending(job) { logrus.Warnf("restore job %s is in pending state", job.Name) return utils.ToJobStatus(0, "", jobStatus), nil @@ -174,7 +172,7 @@ func addJobLabels(jobOpts drivers.JobOpts) map[string]string { labels = make(map[string]string) } - labels[drivers.DriverNameLabel] = drivers.NFSRestore + labels[drivers.DriverNameLabel] = drivers.NFSCSIRestore labels = utils.SetDisableIstioLabel(labels, jobOpts) return labels } diff --git a/pkg/drivers/nfsdelete/nfsdelete.go b/pkg/drivers/nfsdelete/nfsdelete.go index f8adf265f..0e7038897 100644 --- a/pkg/drivers/nfsdelete/nfsdelete.go +++ b/pkg/drivers/nfsdelete/nfsdelete.go @@ -109,6 +109,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { // Check for mount point failure mountFailed := utils.IsJobPodMountFailed(job, namespace) if mountFailed { + utils.DisplayJobpodLogandEvents(job.Name, job.Namespace) errMsg := fmt.Sprintf("job [%v/%v] failed while mounting NFS mount endpoint", namespace, 
name) return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil } @@ -125,6 +126,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { } if utils.IsJobFailed(job) { + utils.DisplayJobpodLogandEvents(job.Name, job.Namespace) errMsg := fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed) return utils.ToJobStatus(0, errMsg, jobStatus), nil } diff --git a/pkg/drivers/nfsrestore/nfsrestore.go b/pkg/drivers/nfsrestore/nfsrestore.go index 1f628b715..22d49edb4 100644 --- a/pkg/drivers/nfsrestore/nfsrestore.go +++ b/pkg/drivers/nfsrestore/nfsrestore.go @@ -86,6 +86,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { // Check for mount point failure mountFailed := utils.IsJobPodMountFailed(job, namespace) if mountFailed { + utils.DisplayJobpodLogandEvents(job.Name, job.Namespace) errMsg := fmt.Sprintf("job [%v/%v] failed while mounting NFS mount endpoint", namespace, name) return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil } @@ -104,6 +105,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) { var errMsg string if jobErr { + utils.DisplayJobpodLogandEvents(job.Name, job.Namespace) errMsg = fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed) return utils.ToJobStatus(0, errMsg, jobStatus), nil } diff --git a/pkg/drivers/utils/utils.go b/pkg/drivers/utils/utils.go index 5b48b586b..27acf1046 100644 --- a/pkg/drivers/utils/utils.go +++ b/pkg/drivers/utils/utils.go @@ -14,6 +14,7 @@ import ( "github.com/portworx/kdmp/pkg/drivers" "github.com/portworx/kdmp/pkg/version" "github.com/portworx/sched-ops/k8s/apps" + "github.com/portworx/sched-ops/k8s/batch" "github.com/portworx/sched-ops/k8s/core" "github.com/sirupsen/logrus" batchv1 "k8s.io/api/batch/v1" @@ -878,6 +879,79 @@ func IsJobPodMountFailed(job *batchv1.Job, namespace string) bool { return false }
+// DisplayJobpodLogandEvents logs the description, events and log tail of
+// every pod selected by the "job-name" label of the given job. It returns
+// nothing: lookup failures are logged and the affected section is skipped.
+func DisplayJobpodLogandEvents(jobName string, namespace string) {
+	// Get job from the namespace
+	job, err := batch.Instance().GetJob(jobName, namespace)
+	if err != nil {
+		if !apierrors.IsNotFound(err) {
+			logrus.Infof("failed to get job [%v] in namespace [%v].err: %v", jobName, namespace, err)
+		}
+		// Stop here: the pod selector below is built from the job's name and
+		// namespace, which are not usable when the lookup failed.
+		return
+	}
+	if job == nil {
+		return
+	}
+	// Get the pods of the job from the namespace
+	pods, err := core.Instance().GetPods(
+		job.Namespace,
+		map[string]string{
+			"job-name": job.Name,
+		},
+	)
+	if err != nil {
+		logrus.Infof("failed to fetch pod of job [%v] in namespace [%v].err: %v", jobName, namespace, err)
+		return
+	}
+	if pods == nil {
+		return
+	}
+	// Only the last numLogLines lines of each pod log are requested.
+	numLogLines := int64(64)
+	for i := range pods.Items {
+		pod := &pods.Items[i]
+		podDescribe, err := core.Instance().GetPodByName(pod.Name, pod.Namespace)
+		if err != nil {
+			logrus.Infof("error fetching description of job-pod [%s] in namespace [%s].err: %v", pod.Name, pod.Namespace, err)
+		} else {
+			logrus.Info("###---POD DESCRIBE---###")
+			logrus.Infof("start of job-pod [%s] description", pod.Name)
+			logrus.Infof("%v", podDescribe)
+			logrus.Infof("end of job-pod [%s] description", pod.Name)
+			logrus.Info("###---POD DESCRIBE---###")
+		}
+
+		opts := metav1.ListOptions{
+			FieldSelector: "involvedObject.name=" + pod.Name,
+		}
+		events, err := core.Instance().ListEvents(pod.Namespace, opts)
+		if err != nil {
+			logrus.Infof("error fetching events for pod [%s] of namespace [%s]: %v", pod.Name, pod.Namespace, err)
+		} else {
+			logrus.Info("###---POD EVENTS---###")
+			logrus.Infof("start of events of pod [%s] of job [%s] of namespace [%s]", pod.Name, jobName, pod.Namespace)
+			logrus.Infof("%v", events)
+			logrus.Infof("end of events of pod [%s]", pod.Name)
+			logrus.Info("###---POD EVENTS---###")
+		}
+
+		podLog, err := core.Instance().GetPodLog(pod.Name, pod.Namespace, &corev1.PodLogOptions{TailLines: &numLogLines})
+		if err != nil {
+			logrus.Infof("error fetching log of job-pod %s: %v", pod.Name, err)
+		} else {
+			logrus.Info("###---POD LOGS---###")
+			logrus.Infof("start of job-pod [%s] log", pod.Name)
+			logrus.Infof("%v", podLog)
+			logrus.Infof("end of job-pod [%s] log", pod.Name)
+			logrus.Info("###---POD LOGS---###")
+		}
+	}
+}
+
 func GetDisableIstioConfig(jobOpts drivers.JobOpts) bool { kdmpData, err := core.Instance().GetConfigMap(jobOpts.JobConfigMap, jobOpts.JobConfigMapNs) if err != nil {