Skip to content

Commit

Permalink
Update: use incluster client to get last line of node container logs
Browse files Browse the repository at this point in the history
  • Loading branch information
vimystic committed Apr 2, 2024
1 parent 8aaae51 commit ba39b77
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 16 deletions.
5 changes: 4 additions & 1 deletion api/v1/self_healing_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ type SelfHealSpec struct {
//
// +optional
HeightDriftMitigation *HeightDriftMitigationSpec `json:"heightDriftMitigation"`
StuckPodMitigation *StuckPodMitigationSpec `json:"stuckPodMitigation"`
// Take action when a pod is stuck.
//
// +optional
StuckPodMitigation *StuckPodMitigationSpec `json:"stuckPodMitigation"`
}

type PVCAutoScaleSpec struct {
Expand Down
32 changes: 19 additions & 13 deletions controllers/selfhealing_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"github.com/strangelove-ventures/cosmos-operator/internal/fullnode"
"github.com/strangelove-ventures/cosmos-operator/internal/healthcheck"
"github.com/strangelove-ventures/cosmos-operator/internal/kube"
v1 "k8s.io/api/core/v1"
"k8s.io/client-go/tools/record"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
Expand Down Expand Up @@ -122,17 +123,7 @@ func (r *SelfHealingReconciler) mitigateHeightDrift(ctx context.Context, reporte
}

pods := r.driftDetector.LaggingPods(ctx, crd)
var deleted int
for _, pod := range pods {
// CosmosFullNodeController will detect missing pod and re-create it.
if err := r.Delete(ctx, pod); kube.IgnoreNotFound(err) != nil {
reporter.Error(err, "Failed to delete pod", "pod", pod.Name)
reporter.RecordError("HeightDriftMitigationDeletePod", err)
continue
}
reporter.Info("Deleted pod for meeting height drift threshold", "pod", pod.Name)
deleted++
}
deleted := r.DeletePods(pods, "HeightDriftMitigationDeletePod", reporter, ctx)
if deleted > 0 {
msg := fmt.Sprintf("Height lagged behind by %d or more blocks; deleted pod(s)", crd.Spec.SelfHeal.HeightDriftMitigation.Threshold)
reporter.RecordInfo("HeightDriftMitigation", msg)
Expand All @@ -145,10 +136,25 @@ func (r *SelfHealingReconciler) mitigateStuckPods(ctx context.Context, reporter
}

pods := r.stuckDetector.StuckPods(ctx, crd)
deleted := r.DeletePods(pods, "StuckPodMitigationDeletePod", reporter, ctx)
if deleted > 0 {
msg := fmt.Sprintf("Stuck for %d seconds; deleted pod(s)", crd.Spec.SelfHeal.StuckPodMitigation.Threshold)
reporter.RecordInfo("StuckPodMitigation", msg)
}
}

if pods != nil {
fmt.Println(pods)
// DeletePods deletes every pod in pods and returns how many deletions went through.
//
// reason is used as the event reason when a deletion fails and is attached to the
// log line of each successful deletion. Errors filtered out by kube.IgnoreNotFound
// (presumably "pod already gone" — confirm in the kube package) count as a
// successful deletion. Any other error is logged, recorded on the reporter, and
// the loop continues with the next pod.
//
// NOTE(review): ctx is the last parameter rather than the conventional first one;
// the order is kept so existing callers keep compiling.
func (r *SelfHealingReconciler) DeletePods(pods []*v1.Pod, reason string, reporter kube.Reporter, ctx context.Context) int {
	var deleted int
	for _, pod := range pods {
		if err := r.Delete(ctx, pod); kube.IgnoreNotFound(err) != nil {
			reporter.Error(err, "Failed to delete pod", "pod", pod.Name)
			reporter.RecordError(reason, err)
			continue
		}
		// Arguments after the message are key/value pairs (same form as the Error
		// call above), so the reason and pod name each need an explicit key.
		reporter.Info("Deleted pod", "reason", reason, "pod", pod.Name)
		deleted++
	}
	return deleted
}

// SetupWithManager sets up the controller with the Manager.
Expand Down
62 changes: 60 additions & 2 deletions internal/fullnode/stuck_detection.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,18 @@ package fullnode

import (
"context"
"fmt"
"io/ioutil"

Check failure on line 6 in internal/fullnode/stuck_detection.go

View workflow job for this annotation

GitHub Actions / lint

SA1019: "io/ioutil" has been deprecated since Go 1.19: As of Go 1.16, the same functionality is now provided by package [io] or package [os], and those implementations should be preferred in new code. See the specific function documentation for details. (staticcheck)
"strings"
"time"

cosmosv1 "github.com/strangelove-ventures/cosmos-operator/api/v1"
"github.com/strangelove-ventures/cosmos-operator/internal/kube"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"sigs.k8s.io/controller-runtime/pkg/client"
)

type StuckPodDetection struct {
Expand All @@ -15,8 +22,59 @@ type StuckPodDetection struct {
computeRollout func(maxUnavail *intstr.IntOrString, desired, ready int) int

Check failure on line 22 in internal/fullnode/stuck_detection.go

View workflow job for this annotation

GitHub Actions / lint

field `computeRollout` is unused (unused)
}

// NewStuckDetection builds a detector that uses the given status collector
// together with kube.AvailablePods and kube.ComputeRollout.
//
// NOTE(review): despite the name, the returned type is DriftDetection rather than
// StuckPodDetection. This looks like a copy-paste leftover — confirm against callers.
func NewStuckDetection(collector StatusCollector) DriftDetection {
	var detector DriftDetection
	detector.collector = collector
	detector.available = kube.AvailablePods
	detector.computeRollout = kube.ComputeRollout
	return detector
}

// StuckPods returns pods that are stuck on a block height due to a cometbft issue that manifests on sentries using horcrux.
func (d StuckPodDetection) StuckPods(ctx context.Context, crd *cosmosv1.CosmosFullNode) []*corev1.Pod {

Check failure on line 34 in internal/fullnode/stuck_detection.go

View workflow job for this annotation

GitHub Actions / lint

unnecessary leading newline (whitespace)
//TODO
return nil

pods := d.collector.Collect(ctx, client.ObjectKeyFromObject(crd)).Synced().Pods()

fmt.Println(pods[0])

config, err := rest.InClusterConfig()
if err != nil {
panic(err.Error())
}

clientset, err := kubernetes.NewForConfig(config)
if err != nil {
panic(err.Error())
}

getPodLogsLastLine(clientset, pods[0])

//MORE TODO HERE

return []*corev1.Pod{}
}

// getPodLogsLastLine prints the last line of the pod's log to stdout.
//
// Only the final log line is requested from the API server (TailLines = 1).
// Errors while opening or reading the log stream are printed to stdout and the
// function returns without further output. When the log is empty, a
// "No logs found" message is printed instead of an empty last line.
//
// NOTE(review): no container name is set in the log options — presumably the pod
// has a single container or a default one; confirm for multi-container pods.
func getPodLogsLastLine(clientset *kubernetes.Clientset, pod *corev1.Pod) {
	// Ask for the tail only, rather than downloading the entire container log.
	tailLines := int64(1)
	podLogOpts := corev1.PodLogOptions{TailLines: &tailLines}
	logRequest := clientset.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &podLogOpts)

	logStream, err := logRequest.Stream(context.Background())
	if err != nil {
		fmt.Printf("Error getting logs for pod %s: %v\n", pod.Name, err)
		return
	}
	defer logStream.Close()

	// NOTE(review): ioutil.ReadAll is deprecated in favor of io.ReadAll; switching
	// also requires replacing the "io/ioutil" import at the top of the file.
	logBytes, err := ioutil.ReadAll(logStream)
	if err != nil {
		fmt.Printf("Error reading logs for pod %s: %v\n", pod.Name, err)
		return
	}

	// strings.Split never returns an empty slice, so emptiness has to be checked
	// on the trimmed text itself.
	logs := strings.TrimRight(string(logBytes), "\n")
	if logs == "" {
		fmt.Println("No logs found for pod", pod.Name)
		return
	}

	// LastIndex yields -1 when there is a single line, which makes the slice start at 0.
	lastLine := logs[strings.LastIndex(logs, "\n")+1:]
	fmt.Println("Last line of logs for pod", pod.Name+":", lastLine)
}

0 comments on commit ba39b77

Please sign in to comment.