From 34e417bdac2717a660f4e51aa4a9ed9a6d13d879 Mon Sep 17 00:00:00 2001 From: Lyndon-Li Date: Thu, 5 Dec 2024 15:03:33 +0800 Subject: [PATCH] add diagnostic for data mover exposer Signed-off-by: Lyndon-Li --- changelogs/unreleased/8482-Lyndon-Li | 2 +- pkg/controller/data_download_controller.go | 6 ++- pkg/controller/data_upload_controller.go | 6 ++- pkg/exposer/csi_snapshot.go | 9 +++-- pkg/exposer/csi_snapshot_test.go | 47 +++++++++------------- pkg/exposer/generic_restore.go | 8 ++-- pkg/exposer/generic_restore_test.go | 37 +++++++---------- 7 files changed, 56 insertions(+), 59 deletions(-) diff --git a/changelogs/unreleased/8482-Lyndon-Li b/changelogs/unreleased/8482-Lyndon-Li index c1cec7e241..33be17e0af 100644 --- a/changelogs/unreleased/8482-Lyndon-Li +++ b/changelogs/unreleased/8482-Lyndon-Li @@ -1 +1 @@ -Fix issue #8125, add diagnostic info for data mover exposers when expose timeout \ No newline at end of file +Fix issue #8125, log diagnostic info for data mover exposers when expose timeout \ No newline at end of file diff --git a/pkg/controller/data_download_controller.go b/pkg/controller/data_download_controller.go index aad034b8ca..33be8a0af1 100644 --- a/pkg/controller/data_download_controller.go +++ b/pkg/controller/data_download_controller.go @@ -19,6 +19,7 @@ package controller import ( "context" "fmt" + "strings" "time" "github.com/pkg/errors" @@ -677,7 +678,10 @@ func (r *DataDownloadReconciler) onPrepareTimeout(ctx context.Context, dd *veler return } - log.Warn(r.restoreExposer.DiagnoseExpose(ctx, getDataDownloadOwnerObject(dd))) + diags := strings.Split(r.restoreExposer.DiagnoseExpose(ctx, getDataDownloadOwnerObject(dd)), "\n") + for _, diag := range diags { + log.Warnf("[Diagnose DD expose]%s", diag) + } r.restoreExposer.CleanUp(ctx, getDataDownloadOwnerObject(dd)) diff --git a/pkg/controller/data_upload_controller.go b/pkg/controller/data_upload_controller.go index 0c44bad660..6a2aadef32 100644 --- a/pkg/controller/data_upload_controller.go +++ b/pkg/controller/data_upload_controller.go @@ -19,6 +19,7 @@ package controller import ( "context" "fmt" + "strings" "time" snapshotter "github.com/kubernetes-csi/external-snapshotter/client/v7/clientset/versioned/typed/volumesnapshot/v1" @@ -755,7 +756,10 @@ func (r *DataUploadReconciler) onPrepareTimeout(ctx context.Context, du *velerov volumeSnapshotName = du.Spec.CSISnapshot.VolumeSnapshot } - log.Warn(ep.DiagnoseExpose(ctx, getOwnerObject(du))) + diags := strings.Split(ep.DiagnoseExpose(ctx, getOwnerObject(du)), "\n") + for _, diag := range diags { + log.Warnf("[Diagnose DU expose]%s", diag) + } ep.CleanUp(ctx, getOwnerObject(du), volumeSnapshotName, du.Spec.SourceNamespace) diff --git a/pkg/exposer/csi_snapshot.go b/pkg/exposer/csi_snapshot.go index 71c9a2a62d..9b9ebe5479 100644 --- a/pkg/exposer/csi_snapshot.go +++ b/pkg/exposer/csi_snapshot.go @@ -313,20 +313,23 @@ func (e *csiSnapshotExposer) DiagnoseExpose(ctx context.Context, ownerObject cor backupPVCName := ownerObject.Name backupVSName := ownerObject.Name - diag := fmt.Sprintf("***************************begin diagnose CSI exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name) + diag := "begin diagnose CSI exposer\n" pod, err := e.kubeClient.CoreV1().Pods(ownerObject.Namespace).Get(ctx, backupPodName, metav1.GetOptions{}) if err != nil { + pod = nil diag += fmt.Sprintf("error getting backup pod %s, err: %v\n", backupPodName, err) } pvc, err := e.kubeClient.CoreV1().PersistentVolumeClaims(ownerObject.Namespace).Get(ctx, backupPVCName, metav1.GetOptions{}) if err != nil { + pvc = nil diag += fmt.Sprintf("error getting backup pvc %s, err: %v\n", backupPVCName, err) } vs, err := e.csiSnapshotClient.VolumeSnapshots(ownerObject.Namespace).Get(ctx, backupVSName, metav1.GetOptions{}) if err != nil { + vs = nil diag += fmt.Sprintf("error getting backup vs %s, err: %v\n", backupVSName, err) } @@ -335,7 +338,7 @@ func (e *csiSnapshotExposer) DiagnoseExpose(ctx context.Context, ownerObject cor if pod.Spec.NodeName != "" { if err := nodeagent.KbClientIsRunningInNode(ctx, ownerObject.Namespace, pod.Spec.NodeName, e.kubeClient); err != nil { - diag += fmt.Sprintf("node-agent is not running in node %s\n", pod.Spec.NodeName) + diag += fmt.Sprintf("node-agent is not running in node %s, err: %v\n", pod.Spec.NodeName, err) } } } @@ -364,7 +367,7 @@ func (e *csiSnapshotExposer) DiagnoseExpose(ctx context.Context, ownerObject cor } } - diag += fmt.Sprintf("***************************end diagnose CSI exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name) + diag += "end diagnose CSI exposer" return diag } diff --git a/pkg/exposer/csi_snapshot_test.go b/pkg/exposer/csi_snapshot_test.go index d7e4a768ed..77d7926356 100644 --- a/pkg/exposer/csi_snapshot_test.go +++ b/pkg/exposer/csi_snapshot_test.go @@ -1166,12 +1166,11 @@ func Test_csiSnapshotExposer_DiagnoseExpose(t *testing.T) { { name: "no pod, pvc, vs", ownerBackup: backup, - expected: `***************************begin diagnose CSI exposer[velero/fake-backup]*************************** + expected: `begin diagnose CSI exposer error getting backup pod fake-backup, err: pods "fake-backup" not found error getting backup pvc fake-backup, err: persistentvolumeclaims "fake-backup" not found error getting backup vs fake-backup, err: volumesnapshots.snapshot.storage.k8s.io "fake-backup" not found -***************************end diagnose CSI exposer[velero/fake-backup]*************************** -`, +end diagnose CSI exposer`, }, { name: "pod without node name, pvc without volume name, vs without status", @@ -1183,13 +1182,12 @@ error getting backup vs fake-backup, err: volumesnapshots.snapshot.storage.k8s.i snapshotClientObj: []runtime.Object{ &backupVSWithoutStatus, }, - expected: `***************************begin diagnose CSI exposer[velero/fake-backup]*************************** + expected: `begin diagnose CSI exposer Pod velero/fake-backup, phase Pending, node name Pod condition Initialized, status True, reason , message fake-pod-message PVC velero/fake-backup, phase Pending, binding to VS velero/fake-backup, bind to , readyToUse false, errMessage -***************************end diagnose CSI exposer[velero/fake-backup]*************************** -`, +end diagnose CSI exposer`, }, { name: "pod without node name, pvc without volume name, vs without VSC", @@ -1201,13 +1199,12 @@ VS velero/fake-backup, bind to , readyToUse false, errMessage snapshotClientObj: []runtime.Object{ &backupVSWithoutVSC, }, - expected: `***************************begin diagnose CSI exposer[velero/fake-backup]*************************** + expected: `begin diagnose CSI exposer Pod velero/fake-backup, phase Pending, node name Pod condition Initialized, status True, reason , message fake-pod-message PVC velero/fake-backup, phase Pending, binding to VS velero/fake-backup, bind to , readyToUse false, errMessage -***************************end diagnose CSI exposer[velero/fake-backup]*************************** -`, +end diagnose CSI exposer`, }, { name: "pod with node name, no node agent", @@ -1219,14 +1216,13 @@ VS velero/fake-backup, bind to , readyToUse false, errMessage snapshotClientObj: []runtime.Object{ &backupVSWithoutVSC, }, - expected: `***************************begin diagnose CSI exposer[velero/fake-backup]*************************** + expected: `begin diagnose CSI exposer Pod velero/fake-backup, phase Pending, node name fake-node Pod condition Initialized, status True, reason , message fake-pod-message -node-agent is not running in node fake-node +node-agent is not running in node fake-node, err: daemonset pod not found in running state in node fake-node PVC velero/fake-backup, phase Pending, binding to VS velero/fake-backup, bind to , readyToUse false, errMessage -***************************end diagnose CSI exposer[velero/fake-backup]*************************** -`, +end diagnose CSI exposer`, }, { name: "pod with node name, node agent is running", @@ -1239,13 +1235,12 @@ VS velero/fake-backup, bind to , readyToUse false, errMessage snapshotClientObj: []runtime.Object{ &backupVSWithoutVSC, }, - expected: `***************************begin diagnose CSI exposer[velero/fake-backup]*************************** + expected: `begin diagnose CSI exposer Pod velero/fake-backup, phase Pending, node name fake-node Pod condition Initialized, status True, reason , message fake-pod-message PVC velero/fake-backup, phase Pending, binding to VS velero/fake-backup, bind to , readyToUse false, errMessage -***************************end diagnose CSI exposer[velero/fake-backup]*************************** -`, +end diagnose CSI exposer`, }, { name: "pvc with volume name, no pv", @@ -1258,14 +1253,13 @@ VS velero/fake-backup, bind to , readyToUse false, errMessage snapshotClientObj: []runtime.Object{ &backupVSWithoutVSC, }, - expected: `***************************begin diagnose CSI exposer[velero/fake-backup]*************************** + expected: `begin diagnose CSI exposer Pod velero/fake-backup, phase Pending, node name fake-node Pod condition Initialized, status True, reason , message fake-pod-message PVC velero/fake-backup, phase Pending, binding to fake-pv error getting backup pv fake-pv, err: persistentvolumes "fake-pv" not found VS velero/fake-backup, bind to , readyToUse false, errMessage -***************************end diagnose CSI exposer[velero/fake-backup]*************************** -`, +end diagnose CSI exposer`, }, { name: "pvc with volume name, pv exists", @@ -1279,14 +1273,13 @@ VS velero/fake-backup, bind to , readyToUse false, errMessage snapshotClientObj: []runtime.Object{ &backupVSWithoutVSC, }, - expected: `***************************begin diagnose CSI exposer[velero/fake-backup]*************************** + expected: `begin diagnose CSI exposer Pod velero/fake-backup, phase Pending, node name fake-node Pod condition Initialized, status True, reason , message fake-pod-message PVC velero/fake-backup, phase Pending, binding to fake-pv PV fake-pv, phase Pending, reason , message fake-pv-message VS velero/fake-backup, bind to , readyToUse false, errMessage -***************************end diagnose CSI exposer[velero/fake-backup]*************************** -`, +end diagnose CSI exposer`, }, { name: "vs with vsc, vsc doesn't exist", @@ -1300,15 +1293,14 @@ VS velero/fake-backup, bind to , readyToUse false, errMessage snapshotClientObj: []runtime.Object{ &backupVSWithVSC, }, - expected: `***************************begin diagnose CSI exposer[velero/fake-backup]*************************** + expected: `begin diagnose CSI exposer Pod velero/fake-backup, phase Pending, node name fake-node Pod condition Initialized, status True, reason , message fake-pod-message PVC velero/fake-backup, phase Pending, binding to fake-pv PV fake-pv, phase Pending, reason , message fake-pv-message VS velero/fake-backup, bind to fake-vsc, readyToUse false, errMessage fake-vs-message error getting backup vsc fake-vsc, err: volumesnapshotcontents.snapshot.storage.k8s.io "fake-vsc" not found -***************************end diagnose CSI exposer[velero/fake-backup]*************************** -`, +end diagnose CSI exposer`, }, { name: "vs with vsc, vsc exists", @@ -1323,15 +1315,14 @@ error getting backup vsc fake-vsc, err: volumesnapshotcontents.snapshot.storage. &backupVSWithVSC, &backupVSC, }, - expected: `***************************begin diagnose CSI exposer[velero/fake-backup]*************************** + expected: `begin diagnose CSI exposer Pod velero/fake-backup, phase Pending, node name fake-node Pod condition Initialized, status True, reason , message fake-pod-message PVC velero/fake-backup, phase Pending, binding to fake-pv PV fake-pv, phase Pending, reason , message fake-pv-message VS velero/fake-backup, bind to fake-vsc, readyToUse false, errMessage fake-vs-message VSC fake-vsc, readyToUse false, errMessage fake-vsc-message, handle -***************************end diagnose CSI exposer[velero/fake-backup]*************************** -`, +end diagnose CSI exposer`, }, } for _, tt := range tests { diff --git a/pkg/exposer/generic_restore.go b/pkg/exposer/generic_restore.go index f523bce0a2..7a7df90385 100644 --- a/pkg/exposer/generic_restore.go +++ b/pkg/exposer/generic_restore.go @@ -204,15 +204,17 @@ func (e *genericRestoreExposer) DiagnoseExpose(ctx context.Context, ownerObject restorePodName := ownerObject.Name restorePVCName := ownerObject.Name - diag := fmt.Sprintf("***************************begin diagnose restore exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name) + diag := "begin diagnose restore exposer\n" pod, err := e.kubeClient.CoreV1().Pods(ownerObject.Namespace).Get(ctx, restorePodName, metav1.GetOptions{}) if err != nil { + pod = nil diag += fmt.Sprintf("error getting restore pod %s, err: %v\n", restorePodName, err) } pvc, err := e.kubeClient.CoreV1().PersistentVolumeClaims(ownerObject.Namespace).Get(ctx, restorePVCName, metav1.GetOptions{}) if err != nil { + pvc = nil diag += fmt.Sprintf("error getting restore pvc %s, err: %v\n", restorePVCName, err) } @@ -221,7 +223,7 @@ func (e *genericRestoreExposer) DiagnoseExpose(ctx context.Context, ownerObject if pod.Spec.NodeName != "" { if err := nodeagent.KbClientIsRunningInNode(ctx, ownerObject.Namespace, pod.Spec.NodeName, e.kubeClient); err != nil { - diag += fmt.Sprintf("node-agent is not running in node %s\n", pod.Spec.NodeName) + diag += fmt.Sprintf("node-agent is not running in node %s, err: %v\n", pod.Spec.NodeName, err) } } } @@ -238,7 +240,7 @@ func (e *genericRestoreExposer) DiagnoseExpose(ctx context.Context, ownerObject } } - diag += fmt.Sprintf("***************************end diagnose restore exposer[%s/%s]***************************\n", ownerObject.Namespace, ownerObject.Name) + diag += "end diagnose restore exposer" return diag } diff --git a/pkg/exposer/generic_restore_test.go b/pkg/exposer/generic_restore_test.go index bcc78b9810..2eec0ce182 100644 --- a/pkg/exposer/generic_restore_test.go +++ b/pkg/exposer/generic_restore_test.go @@ -646,11 +646,10 @@ func Test_ReastoreDiagnoseExpose(t *testing.T) { { name: "no pod, pvc", ownerRestore: restore, - expected: `***************************begin diagnose restore exposer[velero/fake-restore]*************************** + expected: `begin diagnose restore exposer error getting restore pod fake-restore, err: pods "fake-restore" not found error getting restore pvc fake-restore, err: persistentvolumeclaims "fake-restore" not found -***************************end diagnose restore exposer[velero/fake-restore]*************************** -`, +end diagnose restore exposer`, }, { name: "pod without node name, pvc without volume name, vs without status", @@ -659,12 +658,11 @@ error getting restore pvc fake-restore, err: persistentvolumeclaims "fake-restor &restorePodWithoutNodeName, &restorePVCWithoutVolumeName, }, - expected: `***************************begin diagnose restore exposer[velero/fake-restore]*************************** + expected: `begin diagnose restore exposer Pod velero/fake-restore, phase Pending, node name Pod condition Initialized, status True, reason , message fake-pod-message PVC velero/fake-restore, phase Pending, binding to -***************************end diagnose restore exposer[velero/fake-restore]*************************** -`, +end diagnose restore exposer`, }, { name: "pod without node name, pvc without volume name", @@ -673,12 +671,11 @@ PVC velero/fake-restore, phase Pending, binding to &restorePodWithoutNodeName, &restorePVCWithoutVolumeName, }, - expected: `***************************begin diagnose restore exposer[velero/fake-restore]*************************** + expected: `begin diagnose restore exposer Pod velero/fake-restore, phase Pending, node name Pod condition Initialized, status True, reason , message fake-pod-message PVC velero/fake-restore, phase Pending, binding to -***************************end diagnose restore exposer[velero/fake-restore]*************************** -`, +end diagnose restore exposer`, }, { name: "pod with node name, no node agent", @@ -687,13 +684,12 @@ PVC velero/fake-restore, phase Pending, binding to &restorePodWithNodeName, &restorePVCWithoutVolumeName, }, - expected: `***************************begin diagnose restore exposer[velero/fake-restore]*************************** + expected: `begin diagnose restore exposer Pod velero/fake-restore, phase Pending, node name fake-node Pod condition Initialized, status True, reason , message fake-pod-message -node-agent is not running in node fake-node +node-agent is not running in node fake-node, err: daemonset pod not found in running state in node fake-node PVC velero/fake-restore, phase Pending, binding to -***************************end diagnose restore exposer[velero/fake-restore]*************************** -`, +end diagnose restore exposer`, }, { name: "pod with node name, node agent is running", @@ -703,12 +699,11 @@ PVC velero/fake-restore, phase Pending, binding to &restorePVCWithoutVolumeName, &nodeAgentPod, }, - expected: `***************************begin diagnose restore exposer[velero/fake-restore]*************************** + expected: `begin diagnose restore exposer Pod velero/fake-restore, phase Pending, node name fake-node Pod condition Initialized, status True, reason , message fake-pod-message PVC velero/fake-restore, phase Pending, binding to -***************************end diagnose restore exposer[velero/fake-restore]*************************** -`, +end diagnose restore exposer`, }, { name: "pvc with volume name, no pv", @@ -718,13 +713,12 @@ PVC velero/fake-restore, phase Pending, binding to &restorePVCWithVolumeName, &nodeAgentPod, }, - expected: `***************************begin diagnose restore exposer[velero/fake-restore]*************************** + expected: `begin diagnose restore exposer Pod velero/fake-restore, phase Pending, node name fake-node Pod condition Initialized, status True, reason , message fake-pod-message PVC velero/fake-restore, phase Pending, binding to fake-pv error getting restore pv fake-pv, err: persistentvolumes "fake-pv" not found -***************************end diagnose restore exposer[velero/fake-restore]*************************** -`, +end diagnose restore exposer`, }, { name: "pvc with volume name, pv exists", @@ -735,13 +729,12 @@ error getting restore pv fake-pv, err: persistentvolumes "fake-pv" not found &restorePV, &nodeAgentPod, }, - expected: `***************************begin diagnose restore exposer[velero/fake-restore]*************************** + expected: `begin diagnose restore exposer Pod velero/fake-restore, phase Pending, node name fake-node Pod condition Initialized, status True, reason , message fake-pod-message PVC velero/fake-restore, phase Pending, binding to fake-pv PV fake-pv, phase Pending, reason , message fake-pv-message -***************************end diagnose restore exposer[velero/fake-restore]*************************** -`, +end diagnose restore exposer`, }, } for _, test := range tests {