Skip to content

Commit

Permalink
PB-4572:Added logging of pod describe,events and Jobpod log in job fa…
Browse files Browse the repository at this point in the history
…ilure scenarios
  • Loading branch information
aks-px committed Feb 5, 2024
1 parent ab9ae1d commit b57d4ce
Show file tree
Hide file tree
Showing 11 changed files with 83 additions and 1 deletion.
2 changes: 1 addition & 1 deletion pkg/controllers/dataexport/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -526,7 +526,7 @@ func (c *Controller) sync(ctx context.Context, in *kdmpapi.DataExport) (bool, er
if err != nil {
logrus.Infof("job-pod name and namespace extraction failed: %v", err)
}
appendPodLogToStork(name, namespace)
utils.DisplayJobpodLogandEvents(name, namespace)
}
}
cleanupTask := func() (interface{}, bool, error) {
Expand Down
9 changes: 9 additions & 0 deletions pkg/controllers/resourceexport/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,15 @@ func (c *Controller) process(ctx context.Context, in *kdmpapi.ResourceExport) (b
}
switch progress.State {
case drivers.JobStateFailed:
if resourceExport.Status.Status == kdmpapi.ResourceExportStatusFailed {
if resourceExport.Status.TransferID != "" {
namespace, name, err := utils.ParseJobID(resourceExport.Status.TransferID)
if err != nil {
logrus.Infof("job-pod name and namespace extraction failed: %v", err)
}
utils.DisplayJobpodLogandEvents(name, namespace)
}
}
errMsg := fmt.Sprintf("%s transfer job failed: %s", resourceExport.Status.TransferID, progress.Reason)
// If a job has failed it means it has tried all possible retries and given up.
// In such a scenario we need to fail RE CR and move to clean up stage
Expand Down
3 changes: 3 additions & 0 deletions pkg/drivers/kopiabackup/kopiabackup.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
// Check whether mount point failure
mountFailed := utils.IsJobPodMountFailed(job, namespace)
if mountFailed {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("job [%v/%v] failed to mount pvc, please check job pod's description for more detail", namespace, name)
return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil
}
Expand All @@ -224,6 +225,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
jobErr, nodeErr := utils.IsJobOrNodeFailed(job)
var errMsg string
if jobErr {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg = fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed)
return utils.ToJobStatus(0, errMsg, jobStatus), nil
}
Expand All @@ -242,6 +244,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
}
errMsg := fmt.Sprintf("failed to fetch volumebackup %s/%s status: %v", namespace, name, err)
logrus.Errorf("%s: %v", fn, errMsg)
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
return nil, fmt.Errorf(errMsg)
}

Expand Down
2 changes: 2 additions & 0 deletions pkg/drivers/kopiadelete/kopiadelete.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
// Check whether mount point failure
mountFailed := utils.IsJobPodMountFailed(job, namespace)
if mountFailed {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("job [%v/%v] failed to mount pvc, please check job pod's description for more detail", namespace, name)
return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil
}
Expand All @@ -165,6 +166,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
}

if utils.IsJobFailed(job) {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed)
return utils.ToJobStatus(0, errMsg, jobStatus), nil
}
Expand Down
1 change: 1 addition & 0 deletions pkg/drivers/kopiamaintenance/kopiamaintenance.go
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
}

if utils.IsJobFailed(job) {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("check maintenance [%s/%s] job for details: %s", namespace, name, drivers.ErrJobFailed)
return utils.ToJobStatus(0, errMsg, jobStatus), nil
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/drivers/kopiarestore/kopiarestore.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
// Check whether mount point failure
mountFailed := utils.IsJobPodMountFailed(job, namespace)
if mountFailed {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("job [%v/%v] failed to mount pvc, please check job pod's description for more detail", namespace, name)
return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil
}
Expand All @@ -140,6 +141,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
}

if utils.IsJobFailed(job) {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed)
return utils.ToJobStatus(0, errMsg, jobStatus), nil
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/drivers/nfsbackup/nfsbackup.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
// Check whether mount point failure
mountFailed := utils.IsJobPodMountFailed(job, namespace)
if mountFailed {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("job [%v/%v] failed while mounting NFS mount endpoint", namespace, name)
return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil
}
Expand All @@ -104,6 +105,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
jobErr, nodeErr := utils.IsJobOrNodeFailed(job)
var errMsg string
if jobErr {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg = fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed)
return utils.ToJobStatus(0, errMsg, jobStatus), nil
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/drivers/nfscsirestore/nfscsirestore.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
// Check for mount point failure
mountFailed := utils.IsJobPodMountFailed(job, namespace)
if mountFailed {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("job [%v/%v] failed while mounting NFS mount endpoint", namespace, name)
return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil
}
Expand All @@ -101,6 +102,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
jobErr, nodeErr := utils.IsJobOrNodeFailed(job)

if jobErr {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg = fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed)
return utils.ToJobStatus(0, errMsg, jobStatus), nil
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/drivers/nfsdelete/nfsdelete.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
// Check for mount point failure
mountFailed := utils.IsJobPodMountFailed(job, namespace)
if mountFailed {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("job [%v/%v] failed while mounting NFS mount endpoint", namespace, name)
return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil
}
Expand All @@ -125,6 +126,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
}

if utils.IsJobFailed(job) {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed)
return utils.ToJobStatus(0, errMsg, jobStatus), nil
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/drivers/nfsrestore/nfsrestore.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {
// Check for mount point failure
mountFailed := utils.IsJobPodMountFailed(job, namespace)
if mountFailed {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg := fmt.Sprintf("job [%v/%v] failed while mounting NFS mount endpoint", namespace, name)
return utils.ToJobStatus(0, errMsg, batchv1.JobFailed), nil
}
Expand All @@ -104,6 +105,7 @@ func (d Driver) JobStatus(id string) (*drivers.JobStatus, error) {

var errMsg string
if jobErr {
utils.DisplayJobpodLogandEvents(job.Name, job.Namespace)
errMsg = fmt.Sprintf("check %s/%s job for details: %s", namespace, name, drivers.ErrJobFailed)
return utils.ToJobStatus(0, errMsg, jobStatus), nil
}
Expand Down
57 changes: 57 additions & 0 deletions pkg/drivers/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"github.com/portworx/kdmp/pkg/drivers"
"github.com/portworx/kdmp/pkg/version"
"github.com/portworx/sched-ops/k8s/apps"
"github.com/portworx/sched-ops/k8s/batch"
"github.com/portworx/sched-ops/k8s/core"
"github.com/sirupsen/logrus"
batchv1 "k8s.io/api/batch/v1"
Expand Down Expand Up @@ -878,6 +879,62 @@ func IsJobPodMountFailed(job *batchv1.Job, namespace string) bool {
return false
}

// DisplayJobpodLogandEvents logs, on a best-effort basis, the description,
// the events and the tail of the container log of every pod that belongs to
// the given job. It is a diagnostics helper for job-failure paths: nothing is
// returned and every lookup failure is only logged, so callers are never
// affected by a problem encountered while collecting the diagnostics.
//
// jobName and namespace identify the job whose pods are inspected. Pods are
// selected through the "job-name" label.
func DisplayJobpodLogandEvents(jobName string, namespace string) {
	// Look up the job first. Without a valid job object there is nothing to
	// select pods by, so stop here instead of dereferencing an unusable result.
	job, err := batch.Instance().GetJob(jobName, namespace)
	if err != nil {
		// A missing job is not worth reporting (it may already be cleaned up);
		// any other failure is logged.
		if !apierrors.IsNotFound(err) {
			logrus.Infof("failed to get job [%v] in namespace [%v].err: %v", jobName, namespace, err)
		}
		return
	}
	if job == nil {
		return
	}

	// Get the pods of the job from the namespace.
	pods, err := core.Instance().GetPods(
		job.Namespace,
		map[string]string{
			"job-name": job.Name,
		},
	)
	if err != nil {
		logrus.Infof("failed to fetch pod of job [%v] in namespace [%v].err: %v", jobName, namespace, err)
		return
	}
	if pods == nil {
		return
	}

	// Only the last few lines of each pod log are printed to keep the output bounded.
	numLogLines := int64(64)

	for _, pod := range pods.Items {
		// Pod description: printed only when the pod could actually be fetched.
		podDescribe, err := core.Instance().GetPodByName(pod.Name, pod.Namespace)
		if err != nil {
			logrus.Infof("error fetching description of job-pod [%s] in namespace [%s].err: %v", pod.Name, pod.Namespace, err)
		} else {
			logrus.Info("###---POD DESCRIBE---###")
			logrus.Infof("start of job-pod [%s] description", pod.Name)
			logrus.Infof("%v", podDescribe)
			logrus.Infof("end of job-pod [%s] description", pod.Name)
			logrus.Info("###---POD DESCRIBE---###")
		}

		// Pod events: restricted to events whose involved object is this pod,
		// listed in the pod's own namespace.
		opts := metav1.ListOptions{
			FieldSelector: "involvedObject.name=" + pod.Name,
		}
		events, err := core.Instance().ListEvents(pod.Namespace, opts)
		if err != nil {
			logrus.Infof("error fetching events for pod [%s] of namespace [%s]: %v", pod.Name, pod.Namespace, err)
		} else {
			logrus.Info("###---POD EVENTS---###")
			logrus.Infof("start of events of pod [%s] of job [%s] of namespace [%s]", pod.Name, jobName, pod.Namespace)
			logrus.Infof("%v", events)
			logrus.Infof("end of events of pod [%s]", pod.Name)
			logrus.Info("###---POD EVENTS---###")
		}

		// Pod log tail.
		podLog, err := core.Instance().GetPodLog(pod.Name, pod.Namespace, &corev1.PodLogOptions{TailLines: &numLogLines})
		if err != nil {
			logrus.Infof("error fetching log of job-pod %s: %v", pod.Name, err)
		} else {
			logrus.Info("###---POD LOGS---###")
			logrus.Infof("start of job-pod [%s] log", pod.Name)
			logrus.Infof("%v", podLog)
			logrus.Infof("end of job-pod [%s] log", pod.Name)
			logrus.Info("###---POD LOGS---###")
		}
	}
}

func GetDisableIstioConfig(jobOpts drivers.JobOpts) bool {
kdmpData, err := core.Instance().GetConfigMap(jobOpts.JobConfigMap, jobOpts.JobConfigMapNs)
if err != nil {
Expand Down

0 comments on commit b57d4ce

Please sign in to comment.