diff --git a/.config/api-rules/violation_exceptions.list b/.config/api-rules/violation_exceptions.list index 3c744bdd..e76c50d7 100644 --- a/.config/api-rules/violation_exceptions.list +++ b/.config/api-rules/violation_exceptions.list @@ -80,3 +80,4 @@ API rule violation: names_match,k8s.io/apimachinery/pkg/util/intstr,IntOrString, API rule violation: names_match,kmodules.xyz/client-go/api/v1,CertificateSpec,URIs API rule violation: names_match,kmodules.xyz/client-go/api/v1,TimeOfDay,Time API rule violation: names_match,kubeops.dev/sidekick/apis/apps/v1alpha1,SidekickSpec,DeprecatedServiceAccount +API rule violation: names_match,kubeops.dev/sidekick/apis/apps/v1alpha1,SidekickStatus,ContainerRestartCountsPerPod diff --git a/apis/apps/v1alpha1/openapi_generated.go b/apis/apps/v1alpha1/openapi_generated.go index 244fa83e..efa14694 100644 --- a/apis/apps/v1alpha1/openapi_generated.go +++ b/apis/apps/v1alpha1/openapi_generated.go @@ -19592,13 +19592,11 @@ func schema_sidekick_apis_apps_v1alpha1_LeaderStatus(ref common.ReferenceCallbac Properties: map[string]spec.Schema{ "name": { SchemaProps: spec.SchemaProps{ - Default: "", - Type: []string{"string"}, - Format: "", + Type: []string{"string"}, + Format: "", }, }, }, - Required: []string{"name"}, }, }, } @@ -19775,12 +19773,19 @@ func schema_sidekick_apis_apps_v1alpha1_SidekickSpec(ref common.ReferenceCallbac }, "restartPolicy": { SchemaProps: spec.SchemaProps{ - Description: "Restart policy for all containers within the pod. One of Always, OnFailure, Never. Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy\n\nPossible enum values:\n - `\"Always\"`\n - `\"Never\"`\n - `\"OnFailure\"`", + Description: "Restart policy for all containers within the pod. One of Always, OnFailure, Never. Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy If your sidekick has restartPolicy = \"OnFailure\", keep in mind that your Pod running the Job will be terminated once the job backoff limit has been reached. This can make debugging the Job's executable more difficult. We suggest setting restartPolicy = \"Never\" when debugging the Job or using a logging system to ensure output from failed Jobs is not lost inadvertently.\n\nPossible enum values:\n - `\"Always\"`\n - `\"Never\"`\n - `\"OnFailure\"`", Type: []string{"string"}, Format: "", Enum: []interface{}{"Always", "Never", "OnFailure"}, }, }, + "backoffLimit": { + SchemaProps: spec.SchemaProps{ + Description: "Specifies the number of retries before marking this job failed.", + Type: []string{"integer"}, + Format: "int32", + }, + }, "terminationGracePeriodSeconds": { SchemaProps: spec.SchemaProps{ Description: "Optional duration in seconds the pod needs to terminate gracefully. May be decreased in delete request. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). If this value is nil, the default grace period will be used instead. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. Defaults to 30 seconds.", @@ -20127,7 +20132,6 @@ func schema_sidekick_apis_apps_v1alpha1_SidekickStatus(ref common.ReferenceCallb "pod": { SchemaProps: spec.SchemaProps{ Description: "Possible enum values:\n - `\"Failed\"` means that all containers in the pod have terminated, and at least one container has terminated in a failure (exited with a non-zero exit code or was stopped by the system).\n - `\"Pending\"` means the pod has been accepted by the system, but one or more of the containers has not been started. This includes time before being bound to a node, as well as time spent pulling images onto the host.\n - `\"Running\"` means the pod has been bound to a node and all of the containers have been started. At least one container is still running or is in the process of being restarted.\n - `\"Succeeded\"` means that all containers in the pod have voluntarily terminated with a container exit code of 0, and the system is not going to restart any of these containers.\n - `\"Unknown\"` means that for some reason the state of the pod could not be obtained, typically due to an error in communicating with the host of the pod. Deprecated: It isn't being set since 2015 (74da3b14b0c0f658b3bb8d2def5094686d0e9095)", - Default: "", Type: []string{"string"}, Format: "", Enum: []interface{}{"Failed", "Pending", "Running", "Succeeded", "Unknown"}, @@ -20161,8 +20165,39 @@ func schema_sidekick_apis_apps_v1alpha1_SidekickStatus(ref common.ReferenceCallb }, }, }, + "containerRestartCountPerPod": { + SchemaProps: spec.SchemaProps{ + Description: "ContainerRestartCountsPerPod stores the sum of all container restart counts of a pod", + Type: []string{"object"}, + AdditionalProperties: &spec.SchemaOrBool{ + Allows: true, + Schema: &spec.Schema{ + SchemaProps: spec.SchemaProps{ + Default: 0, + Type: []string{"integer"}, + Format: "int32", + }, + }, + }, + }, + }, + "failureCount": { + SchemaProps: spec.SchemaProps{ + Description: "FailuerCount tracks the total number of failed pods", + Type: []string{"object"}, + AdditionalProperties: &spec.SchemaOrBool{ + Allows: true, + Schema: &spec.Schema{ + SchemaProps: spec.SchemaProps{ + Default: false, + Type: []string{"boolean"}, + Format: "", + }, + }, + }, + }, + }, }, - Required: []string{"leader", "pod"}, }, }, Dependencies: []string{ diff --git a/apis/apps/v1alpha1/sidekick_types.go b/apis/apps/v1alpha1/sidekick_types.go index 6889632b..934841aa 100644 --- a/apis/apps/v1alpha1/sidekick_types.go +++ b/apis/apps/v1alpha1/sidekick_types.go @@ -39,6 +39,16 @@ const ( PodSelectionPolicyLast LeaderSelectionPolicy = "Last" ) +// +kubebuilder:validation:Enum=Pending;Current;Failed;Succeeded +type SideKickPhase string + +const ( + SideKickPhaseCurrent SideKickPhase = "Current" + SideKickPhaseFailed SideKickPhase = "Failed" + SidekickPhaseSucceeded SideKickPhase = "Succeeded" + SideKickPhasePending SideKickPhase = "Pending" +) + type LeaderSpec struct { Name string `json:"name,omitempty"` @@ -100,8 +110,17 @@ type SidekickSpec struct { // One of Always, OnFailure, Never. // Default to Always. // More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy + // If your sidekick has restartPolicy = "OnFailure", keep in mind that your Pod running the Job will be + // terminated once the job backoff limit has been reached. This can make debugging the Job's executable + // more difficult. We suggest setting restartPolicy = "Never" when debugging the Job or using a logging + // system to ensure output from failed Jobs is not lost inadvertently. + // +kubebuilder:validation:Enum=Never;Always;OnFailure // +optional RestartPolicy core.RestartPolicy `json:"restartPolicy,omitempty"` + // Specifies the number of retries before marking this job failed. + // +optional + BackoffLimit *int32 `json:"backoffLimit,omitempty"` + // Optional duration in seconds the pod needs to terminate gracefully. May be decreased in delete request. // Value must be non-negative integer. The value zero indicates stop immediately via // the kill signal (no opportunity to shut down). @@ -519,17 +538,17 @@ type VolumeMount struct { } type LeaderStatus struct { - Name string `json:"name"` + Name string `json:"name,omitempty"` } // SidekickStatus defines the observed state of Sidekick type SidekickStatus struct { - Leader LeaderStatus `json:"leader"` - Pod core.PodPhase `json:"pod"` + Leader LeaderStatus `json:"leader,omitempty"` + Pod core.PodPhase `json:"pod,omitempty"` // Specifies the current phase of the sidekick CR // +optional - Phase string `json:"phase,omitempty"` + Phase SideKickPhase `json:"phase,omitempty"` // observedGeneration is the most recent generation observed for this resource. It corresponds to the // resource's generation, which is updated on mutation by the API Server. // +optional @@ -537,6 +556,10 @@ type SidekickStatus struct { // Conditions applied to the database, such as approval or denial. // +optional Conditions []kmapi.Condition `json:"conditions,omitempty"` + // ContainerRestartCountsPerPod stores the sum of all container restart counts of a pod + ContainerRestartCountsPerPod map[string]int32 `json:"containerRestartCountPerPod,omitempty"` + // FailuerCount tracks the total number of failed pods + FailureCount map[string]bool `json:"failureCount,omitempty"` } // +genclient diff --git a/apis/apps/v1alpha1/zz_generated.deepcopy.go b/apis/apps/v1alpha1/zz_generated.deepcopy.go index 2cb902a6..afb9b4e8 100644 --- a/apis/apps/v1alpha1/zz_generated.deepcopy.go +++ b/apis/apps/v1alpha1/zz_generated.deepcopy.go @@ -234,6 +234,11 @@ func (in *SidekickSpec) DeepCopyInto(out *SidekickSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.BackoffLimit != nil { + in, out := &in.BackoffLimit, &out.BackoffLimit + *out = new(int32) + **out = **in + } if in.TerminationGracePeriodSeconds != nil { in, out := &in.TerminationGracePeriodSeconds, &out.TerminationGracePeriodSeconds *out = new(int64) @@ -380,6 +385,20 @@ func (in *SidekickStatus) DeepCopyInto(out *SidekickStatus) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.ContainerRestartCountsPerPod != nil { + in, out := &in.ContainerRestartCountsPerPod, &out.ContainerRestartCountsPerPod + *out = make(map[string]int32, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.FailureCount != nil { + in, out := &in.FailureCount, &out.FailureCount + *out = make(map[string]bool, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } return } diff --git a/crds/apps.k8s.appscode.com_sidekicks.yaml b/crds/apps.k8s.appscode.com_sidekicks.yaml index 2e9cf7e5..0ce1de32 100644 --- a/crds/apps.k8s.appscode.com_sidekicks.yaml +++ b/crds/apps.k8s.appscode.com_sidekicks.yaml @@ -1068,6 +1068,11 @@ spec: description: AutomountServiceAccountToken indicates whether a service account token should be automatically mounted. type: boolean + backoffLimit: + description: Specifies the number of retries before marking this job + failed. + format: int32 + type: integer containers: description: List of containers belonging to the pod. Containers cannot currently be added or removed. There must be at least one container @@ -5399,7 +5404,17 @@ spec: type: array restartPolicy: description: 'Restart policy for all containers within the pod. One - of Always, OnFailure, Never. Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy' + of Always, OnFailure, Never. Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy + If your sidekick has restartPolicy = "OnFailure", keep in mind that + your Pod running the Job will be terminated once the job backoff + limit has been reached. This can make debugging the Job''s executable + more difficult. We suggest setting restartPolicy = "Never" when + debugging the Job or using a logging system to ensure output from + failed Jobs is not lost inadvertently.' + enum: + - Never + - Always + - OnFailure type: string runtimeClassName: description: 'RuntimeClassName refers to a RuntimeClass object in @@ -7659,12 +7674,22 @@ spec: - type type: object type: array + containerRestartCountPerPod: + additionalProperties: + format: int32 + type: integer + description: ContainerRestartCountsPerPod stores the sum of all container + restart counts of a pod + type: object + failureCount: + additionalProperties: + type: boolean + description: FailuerCount tracks the total number of failed pods + type: object leader: properties: name: type: string - required: - - name type: object observedGeneration: description: observedGeneration is the most recent generation observed @@ -7674,14 +7699,16 @@ spec: type: integer phase: description: Specifies the current phase of the sidekick CR + enum: + - Pending + - Current + - Failed + - Succeeded type: string pod: description: PodPhase is a label for the condition of a pod at the current time. type: string - required: - - leader - - pod type: object type: object served: true diff --git a/pkg/controllers/apps/pod.go b/pkg/controllers/apps/pod.go new file mode 100644 index 00000000..caa58511 --- /dev/null +++ b/pkg/controllers/apps/pod.go @@ -0,0 +1,130 @@ +/* +Copyright AppsCode Inc. and Contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package apps + +import ( + "context" + + appsv1alpha1 "kubeops.dev/sidekick/apis/apps/v1alpha1" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/klog/v2" + cu "kmodules.xyz/client-go/client" + core_util "kmodules.xyz/client-go/core/v1" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +func (r *SidekickReconciler) removePodFinalizerIfMarkedForDeletion(ctx context.Context, req ctrl.Request) (bool, error) { + var pod corev1.Pod + err := r.Get(ctx, req.NamespacedName, &pod) + if err != nil && !errors.IsNotFound(err) { + return false, err + } + + if err == nil && pod.DeletionTimestamp != nil { + // Increase the failureCount if the pod was terminated externally + // if the pod was terminated externally, then it will not have + // deletionInitiatorKey set in its annotations + + _, exists := pod.ObjectMeta.Annotations[deletionInitiatorKey] + hash, exists1 := pod.ObjectMeta.Annotations[podHash] + if !exists { + var sk appsv1alpha1.Sidekick + err = r.Get(ctx, req.NamespacedName, &sk) + if err != nil && !errors.IsNotFound(err) { + return false, err + } + // if sidekick is not found or it is in deletion state, + // ignore updating failureCount in this case + + if err == nil && sk.DeletionTimestamp == nil && exists1 { + if sk.Status.FailureCount == nil { + sk.Status.FailureCount = make(map[string]bool) + } + sk.Status.FailureCount[hash] = true + err = r.updateSidekickStatus(ctx, &sk) + if err != nil && !errors.IsNotFound(err) { + return false, err + } + } + } + + // removing finalizer, the reason behind adding this finalizer is stated below + // where we created the pod + if core_util.HasFinalizer(pod.ObjectMeta, getFinalizerName()) { + err = r.removePodFinalizer(ctx, &pod) + if err != nil { + return false, err + } + return true, nil + } + } + return false, nil +} + +func (r *SidekickReconciler) deletePod(ctx context.Context, pod *corev1.Pod) error { + err := r.setDeletionInitiatorAnnotation(ctx, pod) + if err != nil { + return err + } + return r.Delete(ctx, pod) +} + +func getContainerRestartCounts(pod *corev1.Pod) int32 { + restartCounter := int32(0) + for _, cs := range pod.Status.ContainerStatuses { + restartCounter += cs.RestartCount + } + for _, ics := range pod.Status.InitContainerStatuses { + restartCounter += ics.RestartCount + } + return restartCounter +} + +func getTotalContainerRestartCounts(sidekick *appsv1alpha1.Sidekick) int32 { + totalContainerRestartCount := int32(0) + for _, value := range sidekick.Status.ContainerRestartCountsPerPod { + totalContainerRestartCount += value + } + return totalContainerRestartCount +} + +func (r *SidekickReconciler) setDeletionInitiatorAnnotation(ctx context.Context, pod *corev1.Pod) error { + _, err := cu.CreateOrPatch(ctx, r.Client, pod, + func(in client.Object, createOp bool) client.Object { + po := in.(*corev1.Pod) + po.ObjectMeta.Annotations[deletionInitiatorKey] = deletionInitiatesBySidekickOperator + return po + }, + ) + klog.Infoln("pod annotation set: ", pod.Annotations) + + return err +} + +func (r *SidekickReconciler) removePodFinalizer(ctx context.Context, pod *corev1.Pod) error { + _, err := cu.CreateOrPatch(ctx, r.Client, pod, + func(in client.Object, createOp bool) client.Object { + po := in.(*corev1.Pod) + po.ObjectMeta = core_util.RemoveFinalizer(po.ObjectMeta, getFinalizerName()) + return po + }, + ) + return client.IgnoreNotFound(err) +} diff --git a/pkg/controllers/apps/sidekick.go b/pkg/controllers/apps/sidekick.go new file mode 100644 index 00000000..1b5fb993 --- /dev/null +++ b/pkg/controllers/apps/sidekick.go @@ -0,0 +1,184 @@ +/* +Copyright AppsCode Inc. and Contributors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package apps + +import ( + "context" + + appsv1alpha1 "kubeops.dev/sidekick/apis/apps/v1alpha1" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/klog/v2" + "k8s.io/utils/ptr" + cu "kmodules.xyz/client-go/client" + core_util "kmodules.xyz/client-go/core/v1" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +func (r *SidekickReconciler) handleSidekickFinalizer(ctx context.Context, sidekick *appsv1alpha1.Sidekick) error { + if sidekick.DeletionTimestamp != nil { + if core_util.HasFinalizer(sidekick.ObjectMeta, getFinalizerName()) { + return r.terminate(ctx, sidekick) + } + } + + _, err := cu.CreateOrPatch(ctx, r.Client, sidekick, + func(in client.Object, createOp bool) client.Object { + sk := in.(*appsv1alpha1.Sidekick) + sk.ObjectMeta = core_util.AddFinalizer(sk.ObjectMeta, getFinalizerName()) + return sk + }, + ) + return err +} + +func (r *SidekickReconciler) updateSidekickPhase(ctx context.Context, req ctrl.Request, sidekick *appsv1alpha1.Sidekick) (bool, error) { + // TODO: currently not setting pending phase + + phase, err := r.calculateSidekickPhase(ctx, sidekick) + if err != nil { + return false, err + } + + if phase == appsv1alpha1.SideKickPhaseFailed { + var pod corev1.Pod + if err = r.Get(ctx, req.NamespacedName, &pod); err != nil { + return true, client.IgnoreNotFound(err) + } + if pod.Status.Phase == corev1.PodRunning { + pod.Status.Phase = corev1.PodFailed + // Note: (taken from https://kubernetes.io/docs/concepts/workloads/controllers/job/#pod-backoff-failure-policy ) + // If your sidekick has restartPolicy = "OnFailure", keep in mind that your Pod running the Job will be + // terminated once the job backoff limit has been reached. This can make debugging the Job's executable + // more difficult. We suggest setting restartPolicy = "Never" when debugging the Job or using a logging + // system to ensure output from failed Jobs is not lost inadvertently. + err = r.Status().Update(ctx, &pod) + if err != nil { + return false, err + } + } + // increasing backOffLimit or changing restartPolicy will change + // the sidekick phase from failed to current + return true, nil + } + if sidekick.Status.Phase == appsv1alpha1.SidekickPhaseSucceeded { + return true, nil + } + return false, nil +} + +func (r *SidekickReconciler) updateSidekickStatus(ctx context.Context, sidekick *appsv1alpha1.Sidekick) error { + _, err := cu.PatchStatus(ctx, r.Client, sidekick, func(obj client.Object) client.Object { + sk := obj.(*appsv1alpha1.Sidekick) + sk.Status = sidekick.Status + return sk + }) + return err +} + +func getFailureCountFromSidekickStatus(sidekick *appsv1alpha1.Sidekick) int32 { + failureCount := int32(0) + for _, val := range sidekick.Status.FailureCount { + if val { + failureCount++ + } + } + return failureCount +} + +func (r *SidekickReconciler) getSidekickPhase(sidekick *appsv1alpha1.Sidekick, pod *corev1.Pod) appsv1alpha1.SideKickPhase { + // if restartPolicy is always, we will always try to keep a pod running + // if pod.status.phase == failed, then we will start a new pod + // TODO: which of these two should come first? + if sidekick.Spec.RestartPolicy == corev1.RestartPolicyAlways { + return appsv1alpha1.SideKickPhaseCurrent + } + if sidekick.Status.Phase == appsv1alpha1.SidekickPhaseSucceeded { + return appsv1alpha1.SidekickPhaseSucceeded + } + + // now restartPolicy onFailure & Never remaining + // In both cases we return phase as succeeded if our + // pod return with exit code 0 + if pod != nil && pod.Status.Phase == corev1.PodSucceeded && pod.ObjectMeta.DeletionTimestamp == nil { + return appsv1alpha1.SidekickPhaseSucceeded + } + + // Now we will figure if we should update the sidekick phase + // as failed or not by checking the backOffLimit + + backOffCounts := getTotalBackOffCounts(sidekick) + // TODO: is it > or >= ? + if backOffCounts > *sidekick.Spec.BackoffLimit { + return appsv1alpha1.SideKickPhaseFailed + } + return appsv1alpha1.SideKickPhaseCurrent +} + +func (r *SidekickReconciler) calculateSidekickPhase(ctx context.Context, sidekick *appsv1alpha1.Sidekick) (appsv1alpha1.SideKickPhase, error) { + if sidekick.Status.ContainerRestartCountsPerPod == nil { + sidekick.Status.ContainerRestartCountsPerPod = make(map[string]int32) + } + if sidekick.Status.FailureCount == nil { + sidekick.Status.FailureCount = make(map[string]bool) + } + + var pod corev1.Pod + err := r.Get(ctx, client.ObjectKeyFromObject(sidekick), &pod) + if err != nil && !errors.IsNotFound(err) { + return sidekick.Status.Phase, err + } + + if err == nil { + restartCounter := getContainerRestartCounts(&pod) + podSpecificHash := pod.Annotations[podHash] + sidekick.Status.ContainerRestartCountsPerPod[podSpecificHash] = restartCounter + if pod.Status.Phase == corev1.PodFailed && pod.ObjectMeta.DeletionTimestamp == nil { + sidekick.Status.FailureCount[podSpecificHash] = true + // case: restartPolicy OnFailure and backOffLimit crosses when last time pod restarts, + // in that situation we need to fail the pod, but this manual failure shouldn't take into account + if sidekick.Spec.RestartPolicy != corev1.RestartPolicyNever && getTotalBackOffCounts(sidekick) > *sidekick.Spec.BackoffLimit { + sidekick.Status.FailureCount[podSpecificHash] = false + } + } + + } + + phase := r.getSidekickPhase(sidekick, &pod) + sidekick.Status.Phase = phase + err = r.updateSidekickStatus(ctx, sidekick) + if err != nil { + return "", err + } + return phase, nil +} + +func getTotalBackOffCounts(sidekick *appsv1alpha1.Sidekick) int32 { + // failureCount keeps track of the total number of pods that had pod.status.phase == failed + failureCount := getFailureCountFromSidekickStatus(sidekick) + // totalContainerRestartCount counts the overall + // restart counts of all the containers over all + // pods created by sidekick obj + totalContainerRestartCount := getTotalContainerRestartCounts(sidekick) + if sidekick.Spec.BackoffLimit == nil { + sidekick.Spec.BackoffLimit = ptr.To(int32(0)) + } + klog.Infoln(*sidekick.Spec.BackoffLimit, totalContainerRestartCount, failureCount) + return failureCount + totalContainerRestartCount +} diff --git a/pkg/controllers/apps/sidekick_controller.go b/pkg/controllers/apps/sidekick_controller.go index 724bb871..4ec845aa 100644 --- a/pkg/controllers/apps/sidekick_controller.go +++ b/pkg/controllers/apps/sidekick_controller.go @@ -18,6 +18,8 @@ package apps import ( "context" + "crypto/sha256" + "encoding/hex" "fmt" "regexp" "sort" @@ -48,11 +50,18 @@ import ( ) const ( - keyHash = "sidekick.appscode.com/hash" - keyLeader = "sidekick.appscode.com/leader" - SidekickPhaseCurrent = "Current" + keyHash = "sidekick.appscode.com/hash" + keyLeader = "sidekick.appscode.com/leader" + podHash = "sidekick.appscode.com/pod-specific-hash" + finalizerSuffix = "finalizer" + deletionInitiatorKey = "sidekick.appscode.com/deletion-initiator" + deletionInitiatesBySidekickOperator = "sidekick-operator" ) +func getFinalizerName() string { + return appsv1alpha1.SchemeGroupVersion.Group + "/" + finalizerSuffix +} + // SidekickReconciler reconciles a Sidekick object type SidekickReconciler struct { client.Client @@ -77,6 +86,14 @@ func (r *SidekickReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c logger := log.FromContext(ctx, "sidekick", req.Name, "ns", req.Namespace) ctx = log.IntoContext(ctx, logger) + isPodFinalizerRemoved, err := r.removePodFinalizerIfMarkedForDeletion(ctx, req) + if err != nil { + return ctrl.Result{}, err + } + if isPodFinalizerRemoved { + return ctrl.Result{}, nil + } + var sidekick appsv1alpha1.Sidekick if err := r.Get(ctx, req.NamespacedName, &sidekick); err != nil { logger.Error(err, "unable to fetch Sidekick") @@ -86,39 +103,33 @@ func (r *SidekickReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c return ctrl.Result{}, client.IgnoreNotFound(err) } - if sidekick.DeletionTimestamp != nil { - if core_util.HasFinalizer(sidekick.ObjectMeta, appsv1alpha1.SchemeGroupVersion.Group) { - return ctrl.Result{}, r.terminate(ctx, &sidekick) - } + err = r.handleSidekickFinalizer(ctx, &sidekick) + if err != nil { + return ctrl.Result{}, err } - _, err := cu.CreateOrPatch(context.TODO(), r.Client, &sidekick, - func(in client.Object, createOp bool) client.Object { - sk := in.(*appsv1alpha1.Sidekick) - sk.ObjectMeta = core_util.AddFinalizer(sk.ObjectMeta, appsv1alpha1.SchemeGroupVersion.Group) - - return sk - }, - ) + dropKey, err := r.updateSidekickPhase(ctx, req, &sidekick) if err != nil { return ctrl.Result{}, err } + if dropKey { + return ctrl.Result{}, nil + } leader, err := r.getLeader(ctx, sidekick) + if errors.IsNotFound(err) || (err == nil && leader.Name != sidekick.Status.Leader.Name) { var pod corev1.Pod e2 := r.Get(ctx, req.NamespacedName, &pod) if e2 == nil { - err := r.Delete(ctx, &pod) + err := r.deletePod(ctx, &pod) if err != nil { return ctrl.Result{}, err } - sidekick.Status.Leader.Name = "" sidekick.Status.Pod = "" - sidekick.Status.Phase = SidekickPhaseCurrent sidekick.Status.ObservedGeneration = sidekick.GetGeneration() - err = r.Status().Update(ctx, &sidekick) + err = r.updateSidekickStatus(ctx, &sidekick) if err != nil { return ctrl.Result{}, err } @@ -142,17 +153,16 @@ func (r *SidekickReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c actualHash := pod.Annotations[keyHash] if expectedHash != actualHash || leader.Name != pod.Annotations[keyLeader] || - leader.Spec.NodeName != pod.Spec.NodeName { - err := r.Delete(ctx, &pod) + leader.Spec.NodeName != pod.Spec.NodeName || (pod.Status.Phase == corev1.PodFailed && sidekick.Spec.RestartPolicy == corev1.RestartPolicyNever) { + err := r.deletePod(ctx, &pod) if err != nil { return ctrl.Result{}, err } sidekick.Status.Leader.Name = "" sidekick.Status.Pod = "" - sidekick.Status.Phase = SidekickPhaseCurrent sidekick.Status.ObservedGeneration = sidekick.GetGeneration() - err = r.Status().Update(ctx, &sidekick) + err = r.updateSidekickStatus(ctx, &sidekick) if err != nil { return ctrl.Result{}, err } @@ -161,9 +171,8 @@ func (r *SidekickReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c // sidekick.Status.Leader.Name = "" sidekick.Status.Pod = pod.Status.Phase - sidekick.Status.Phase = SidekickPhaseCurrent sidekick.Status.ObservedGeneration = sidekick.GetGeneration() - err := r.Status().Update(ctx, &sidekick) + err := r.updateSidekickStatus(ctx, &sidekick) if err != nil { return ctrl.Result{}, err } @@ -178,7 +187,6 @@ func (r *SidekickReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c o2 := metav1.NewControllerRef(leader, corev1.SchemeGroupVersion.WithKind("Pod")) o2.Controller = ptr.To(false) o2.BlockOwnerDeletion = ptr.To(false) - pod = corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: sidekick.Name, @@ -231,9 +239,12 @@ func (r *SidekickReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c if pod.Annotations == nil { pod.Annotations = make(map[string]string) } + curTime := time.Now().String() + curTimeHash := sha256.Sum256([]byte(curTime)) + // Do not alter the assign order pod.Annotations[keyHash] = meta.GenerationHash(&sidekick) + pod.Annotations[podHash] = hex.EncodeToString(curTimeHash[:])[:16] pod.Annotations[keyLeader] = leader.Name - for _, c := range sidekick.Spec.Containers { c2, err := convContainer(leader, c) if err != nil { @@ -248,17 +259,26 @@ func (r *SidekickReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c } pod.Spec.InitContainers = append(pod.Spec.InitContainers, *c2) } + // Adding finalizer to pod because when user will delete this pod using + // kubectl delete, then pod will be gracefully terminated which will led + // to pod.status.phase: succeeded. We need to control this behaviour. + // By adding finalizer, we will know who is deleting the object + _, e3 := cu.CreateOrPatch(context.TODO(), r.Client, &pod, + func(in client.Object, createOp bool) client.Object { + po := in.(*corev1.Pod) + po.ObjectMeta = core_util.AddFinalizer(po.ObjectMeta, getFinalizerName()) + return po + }, + ) - e3 := r.Create(ctx, &pod) if e3 != nil { return ctrl.Result{}, e3 } sidekick.Status.Leader.Name = leader.Name sidekick.Status.Pod = pod.Status.Phase - sidekick.Status.Phase = SidekickPhaseCurrent sidekick.Status.ObservedGeneration = sidekick.GetGeneration() - err = r.Status().Update(ctx, &sidekick) + err = r.updateSidekickStatus(ctx, &sidekick) if err != nil { return ctrl.Result{}, err } @@ -454,7 +474,7 @@ func (r *SidekickReconciler) terminate(ctx context.Context, sidekick *appsv1alph _, err = cu.CreateOrPatch(context.TODO(), r.Client, sidekick, func(in client.Object, createOp bool) client.Object { sk := in.(*appsv1alpha1.Sidekick) - sk.ObjectMeta = core_util.RemoveFinalizer(sk.ObjectMeta, appsv1alpha1.SchemeGroupVersion.Group) + sk.ObjectMeta = core_util.RemoveFinalizer(sk.ObjectMeta, getFinalizerName()) return sk },