Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PB-4572: Added logging of pod describe, events, and job-pod log in job failure scenarios #335

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 1 addition & 36 deletions pkg/controllers/dataexport/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -526,7 +526,7 @@ func (c *Controller) sync(ctx context.Context, in *kdmpapi.DataExport) (bool, er
if err != nil {
logrus.Infof("job-pod name and namespace extraction failed: %v", err)
}
appendPodLogToStork(name, namespace)
utils.DisplayJobpodLogandEvents(name, namespace)
}
}
cleanupTask := func() (interface{}, bool, error) {
Expand Down Expand Up @@ -607,41 +607,6 @@ func parseExcludeFileListKey(pvcStorageClass string, excludeFileListValue string
return excludeFileList, nil
}

// appendPodLogToStork writes the description and the last 50 log lines of
// every pod belonging to the given job into this process's log.
//
// It is a best-effort diagnostic helper: any failure is logged and the
// function returns (or moves on to the next pod) without reporting an error
// to the caller.
func appendPodLogToStork(jobName string, namespace string) {
	// Get the job; without it there is nothing to look up pods for.
	job, err := batch.Instance().GetJob(jobName, namespace)
	if err != nil {
		// A missing job is not worth a log line, but in either case the job
		// object is unusable, so stop here instead of dereferencing it.
		if !k8sErrors.IsNotFound(err) {
			logrus.Infof("failed in getting job %v/%v with err: %v", namespace, jobName, err)
		}
		return
	}

	pods, err := core.Instance().GetPods(
		job.Namespace,
		map[string]string{
			"job-name": job.Name,
		},
	)
	if err != nil {
		logrus.Infof("failed in fetching job pods %s/%s: %v", namespace, jobName, err)
		// The pod list is not usable after a failed lookup.
		return
	}

	numLogLines := int64(50)
	for i := range pods.Items {
		pod := &pods.Items[i]

		podDescribe, err := core.Instance().GetPodByName(pod.Name, pod.Namespace)
		if err != nil {
			logrus.Infof("Error fetching description of job-pod[%s] :%v", pod.Name, err)
		} else {
			logrus.Infof("start of job-pod [%s]'s description", pod.Name)
			logrus.Infof("Describe %v", podDescribe)
			logrus.Infof("end of job-pod [%s]'s description", pod.Name)
		}

		podLog, err := core.Instance().GetPodLog(pod.Name, pod.Namespace, &corev1.PodLogOptions{TailLines: &numLogLines})
		if err != nil {
			logrus.Infof("error fetching log of job-pod %s: %v", pod.Name, err)
			continue
		}
		logrus.Infof("start of job-pod [%s]'s log...", pod.Name)
		// Pass the log as an argument, never as the format string: pod output
		// may contain '%' characters.
		logrus.Infof("%s", podLog)
		logrus.Infof("end of job-pod [%s]'s log...", pod.Name)
	}
}

func (c *Controller) createJobCredCertSecrets(
dataExport *kdmpapi.DataExport,
vb *kdmpapi.VolumeBackup,
Expand Down
9 changes: 9 additions & 0 deletions pkg/controllers/resourceexport/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,15 @@ func (c *Controller) process(ctx context.Context, in *kdmpapi.ResourceExport) (b
}
switch progress.State {
case drivers.JobStateFailed:
if resourceExport.Status.Status == kdmpapi.ResourceExportStatusFailed {
if resourceExport.Status.TransferID != "" {
namespace, name, err := utils.ParseJobID(resourceExport.Status.TransferID)
if err != nil {
logrus.Infof("job-pod name and namespace extraction failed: %v", err)
}
utils.DisplayJobpodLogandEvents(name, namespace)
}
}
errMsg := fmt.Sprintf("%s transfer job failed: %s", resourceExport.Status.TransferID, progress.Reason)
// If a job has failed it means it has tried all possible retries and given up.
// In such a scenario we need to fail RE CR and move to clean up stage
Expand Down
3 changes: 3 additions & 0 deletions pkg/drivers/kopiabackup/kopiabackup.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
// Check whether mount point failure
mountFailed := utils.IsJobPodMountFailed(job, namespace)
if mountFailed {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("job [%v/%v] failed to mount pvc, please check job pod's description for more detail", namespace, name)
return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil
}
Expand All @@ -224,6 +225,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
jobErr, nodeErr := utils.IsJobOrNodeFailed(job)
var errMsg string
if jobErr {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg = fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed)
return utils.ToJobStatus(0, errMsg, jobStatus), nil
}
Expand All @@ -242,6 +244,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
}
errMsg := fmt.Sprintf("failed to fetch volumebackup %s/%s status: %v", namespace, name, err)
logrus.Errorf("%s: %v", fn, errMsg)
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
return nil, fmt.Errorf(errMsg)
}

Expand Down
7 changes: 2 additions & 5 deletions pkg/drivers/kopiadelete/kopiadelete.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
// Check whether mount point failure
mountFailed := utils.IsJobPodMountFailed(job, namespace)
if mountFailed {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("job [%v/%v] failed to mount pvc, please check job pod's description for more detail", namespace, name)
return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil
}
Expand All @@ -158,13 +159,9 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
jobStatus = job.Status.Conditions[0].Type

}
if err != nil {
errMsg := fmt.Sprintf("failed to get restart count for job %s/%s job: %v", namespace, name, err)
logrus.Errorf("%s: %v", fn, errMsg)
return nil, fmt.Errorf(errMsg)
}

if utils.IsJobFailed(job) {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed)
return utils.ToJobStatus(0, errMsg, jobStatus), nil
}
Expand Down
6 changes: 1 addition & 5 deletions pkg/drivers/kopiamaintenance/kopiamaintenance.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,13 +147,9 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
jobStatus = job.Status.Conditions[0].Type

}
if err != nil {
errMsg := fmt.Sprintf("failed to get restart count for job %s/%s job: %v", namespace, name, err)
logrus.Errorf("%s: %v", fn, errMsg)
return nil, fmt.Errorf(errMsg)
}

if utils.IsJobFailed(job) {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("check maintenance [%s/%s] job for details: %s", namespace, name, drivers.ErrJobFailed)
return utils.ToJobStatus(0, errMsg, jobStatus), nil
}
Expand Down
7 changes: 2 additions & 5 deletions pkg/drivers/kopiarestore/kopiarestore.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
// Check whether mount point failure
mountFailed := utils.IsJobPodMountFailed(job, namespace)
if mountFailed {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("job [%v/%v] failed to mount pvc, please check job pod's description for more detail", namespace, name)
return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil
}
Expand All @@ -133,13 +134,9 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
jobStatus = job.Status.Conditions[0].Type

}
if err != nil {
errMsg := fmt.Sprintf("failed to get restart count for job %s/%s job: %v", namespace, name, err)
logrus.Errorf("%s: %v", fn, errMsg)
return nil, fmt.Errorf(errMsg)
}

if utils.IsJobFailed(job) {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed)
return utils.ToJobStatus(0, errMsg, jobStatus), nil
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/drivers/nfsbackup/nfsbackup.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
// Check whether mount point failure
mountFailed := utils.IsJobPodMountFailed(job, namespace)
if mountFailed {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("job [%v/%v] failed while mounting NFS mount endpoint", namespace, name)
return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil
}
Expand All @@ -104,6 +105,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
jobErr, nodeErr := utils.IsJobOrNodeFailed(job)
var errMsg string
if jobErr {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg = fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed)
return utils.ToJobStatus(0, errMsg, jobStatus), nil
}
Expand Down
8 changes: 3 additions & 5 deletions pkg/drivers/nfscsirestore/nfscsirestore.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
// Check for mount point failure
mountFailed := utils.IsJobPodMountFailed(job, namespace)
if mountFailed {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("job [%v/%v] failed while mounting NFS mount endpoint", namespace, name)
return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil
}
Expand All @@ -101,6 +102,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
jobErr, nodeErr := utils.IsJobOrNodeFailed(job)

if jobErr {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg = fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed)
return utils.ToJobStatus(0, errMsg, jobStatus), nil
}
Expand All @@ -109,10 +111,6 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
return utils.ToJobStatus(0, errMsg, jobStatus), nil
}

if utils.IsJobFailed(job) {
errMsg := fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed)
return utils.ToJobStatus(0, errMsg, jobStatus), nil
}
if utils.IsJobPending(job) {
logrus.Warnf("restore job %s is in pending state", job.Name)
return utils.ToJobStatus(0, "", jobStatus), nil
Expand Down Expand Up @@ -174,7 +172,7 @@ func addJobLabels(jobOpts drivers.JobOpts) map[string]string {
labels = make(map[string]string)
}

labels[drivers.DriverNameLabel] = drivers.NFSRestore
labels[drivers.DriverNameLabel] = drivers.NFSCSIRestore
labels = utils.SetDisableIstioLabel(labels, jobOpts)
return labels
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/drivers/nfsdelete/nfsdelete.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
// Check for mount point failure
mountFailed := utils.IsJobPodMountFailed(job, namespace)
if mountFailed {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("job [%v/%v] failed while mounting NFS mount endpoint", namespace, name)
return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil
}
Expand All @@ -125,6 +126,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
}

if utils.IsJobFailed(job) {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed)
return utils.ToJobStatus(0, errMsg, jobStatus), nil
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/drivers/nfsrestore/nfsrestore.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
// Check for mount point failure
mountFailed := utils.IsJobPodMountFailed(job, namespace)
if mountFailed {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("job [%v/%v] failed while mounting NFS mount endpoint", namespace, name)
return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil
}
Expand All @@ -104,6 +105,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {

var errMsg string
if jobErr {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg = fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed)
return utils.ToJobStatus(0, errMsg, jobStatus), nil
}
Expand Down
57 changes: 57 additions & 0 deletions pkg/drivers/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/portworx/kdmp/pkg/drivers"
"github.com/portworx/kdmp/pkg/version"
"github.com/portworx/sched-ops/k8s/apps"
"github.com/portworx/sched-ops/k8s/batch"
"github.com/portworx/sched-ops/k8s/core"
"github.com/sirupsen/logrus"
batchv1 "k8s.io/api/batch/v1"
Expand Down Expand Up @@ -878,6 +879,62 @@ func IsJobPodMountFailed(job *batchv1.Job, namespace string) bool {
return false
}

// DisplayJobpodLogandEvents logs, for every pod of the given job, the pod
// object, the events whose involved object name matches the pod, and the last
// 64 lines of the pod's log.
//
// It is a best-effort diagnostic helper intended for job-failure paths: any
// lookup failure is logged and the function returns (or skips that section)
// without reporting an error to the caller.
func DisplayJobpodLogandEvents(jobName string, namespace string) {
	// Get job from the namespace.
	job, err := batch.Instance().GetJob(jobName, namespace)
	if err != nil {
		// A missing job is not logged, but in either case the job object is
		// unusable, so stop here instead of dereferencing it.
		if !apierrors.IsNotFound(err) {
			logrus.Infof("failed to get job [%v] in namespace [%v].err: %v", jobName, namespace, err)
		}
		return
	}

	// Get the pods of the job from the namespace.
	pods, err := core.Instance().GetPods(
		job.Namespace,
		map[string]string{
			"job-name": job.Name,
		},
	)
	if err != nil {
		logrus.Infof("failed to fetch pod of job [%v] in namespace [%v].err: %v", jobName, namespace, err)
		// The pod list is not usable after a failed lookup.
		return
	}

	numLogLines := int64(64)
	for i := range pods.Items {
		pod := &pods.Items[i]

		podDescribe, err := core.Instance().GetPodByName(pod.Name, pod.Namespace)
		if err != nil {
			logrus.Infof("error fetching description of job-pod [%s] in namespace [%s].err: %v", pod.Name, pod.Namespace, err)
		} else {
			logrus.Info("###---POD DESCRIBE---###")
			logrus.Infof("start of job-pod [%s] description", pod.Name)
			logrus.Infof("%v", podDescribe)
			logrus.Infof("end of job-pod [%s] description", pod.Name)
			logrus.Info("###---POD DESCRIBE---###")
		}

		opts := metav1.ListOptions{
			FieldSelector: "involvedObject.name=" + pod.Name,
		}
		// Look the events up in the pod's own namespace, consistent with the
		// other per-pod calls in this loop.
		events, err := core.Instance().ListEvents(pod.Namespace, opts)
		if err != nil {
			logrus.Infof("error fetching events for pod [%s] of namespace [%s]: %v", pod.Name, pod.Namespace, err)
		} else {
			logrus.Info("###---POD EVENTS---###")
			logrus.Infof("start of events of pod [%s] of job [%s] of namespace [%s]", pod.Name, jobName, pod.Namespace)
			logrus.Infof("%v", events)
			logrus.Infof("end of events of pod [%s]", pod.Name)
			logrus.Info("###---POD EVENTS---###")
		}

		podLog, err := core.Instance().GetPodLog(pod.Name, pod.Namespace, &corev1.PodLogOptions{TailLines: &numLogLines})
		if err != nil {
			logrus.Infof("error fetching log of job-pod %s: %v", pod.Name, err)
			continue
		}
		logrus.Info("###---POD LOGS---###")
		logrus.Infof("start of job-pod [%s] log", pod.Name)
		logrus.Infof("%v", podLog)
		logrus.Infof("end of job-pod [%s] log", pod.Name)
		logrus.Info("###---POD LOGS---###")
	}
}

func GetDisableIstioConfig(jobOpts drivers.JobOpts) bool {
kdmpData, err := core.Instance().GetConfigMap(jobOpts.JobConfigMap, jobOpts.JobConfigMapNs)
if err != nil {
Expand Down
Loading