mirror of https://github.com/kairos-io/osbuilder.git
synced 2025-09-05 01:04:16 +00:00
improve error handling
@@ -4,8 +4,13 @@ import (
	"context"
	"fmt"
	"reflect"
	"strings"

	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"

	"k8s.io/client-go/util/retry"
	"k8s.io/kubectl/pkg/util/podutils"
	ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
)
@@ -26,3 +31,185 @@ func TryToUpdateStatus(ctx context.Context, client ctrlruntimeclient.Client, obj
	})

}
const (
	// HealthStatusUnknown indicates that the health assessment failed and the actual health status is unknown.
	HealthStatusUnknown HealthStatusCode = "Unknown"
	// HealthStatusProgressing means that the resource is not healthy yet but still has a chance to reach a healthy state.
	HealthStatusProgressing HealthStatusCode = "Progressing"
	// HealthStatusHealthy means the resource is 100% healthy.
	HealthStatusHealthy HealthStatusCode = "Healthy"
	// HealthStatusSuspended is assigned to resources that are suspended or paused. The typical example is a
	// [suspended](https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#suspend) CronJob.
	HealthStatusSuspended HealthStatusCode = "Suspended"
	HealthStatusPaused    HealthStatusCode = "Paused"
	// HealthStatusDegraded is used if the resource status indicates failure, or if the resource could not reach a
	// healthy state within some timeout.
	HealthStatusDegraded HealthStatusCode = "Degraded"
	// HealthStatusMissing indicates that the resource is missing from the cluster.
	HealthStatusMissing HealthStatusCode = "Missing"
)

// HealthStatusCode represents the health status of a resource.
type HealthStatusCode string

type HealthStatus struct {
	Status  HealthStatusCode `json:"status,omitempty"`
	Message string           `json:"message,omitempty"`
}
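For context, a minimal sketch of how a caller could route an object to the matching health check. The dispatch helper below is an assumption for illustration only; it is not part of this commit, though it reuses the types and functions introduced here and the file's existing imports.

// getObjectHealth is a hypothetical caller-side helper (not in this commit):
// it routes a client object to the health check that understands its kind.
func getObjectHealth(obj ctrlruntimeclient.Object) *HealthStatus {
	switch o := obj.(type) {
	case *corev1.Pod:
		return getCorev1PodHealth(o)
	case *batchv1.Job:
		return getBatchv1JobHealth(o)
	default:
		return &HealthStatus{Status: HealthStatusUnknown}
	}
}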
func getCorev1PodHealth(pod *corev1.Pod) *HealthStatus {
	// This logic cannot be applied when pod.Spec.RestartPolicy is corev1.RestartPolicyOnFailure or
	// corev1.RestartPolicyNever, otherwise it breaks the resource hook logic.
	// The issue is: if we mark a pod with ImagePullBackOff as Degraded and the pod is used as a resource hook,
	// then we prematurely fail the PreSync/PostSync hook. Meanwhile, when that error condition is resolved
	// (e.g. the image becomes available), the resource hook pod will unexpectedly be executed even though the
	// sync has completed.
	if pod.Spec.RestartPolicy == corev1.RestartPolicyAlways {
		var status HealthStatusCode
		var messages []string

		for _, containerStatus := range pod.Status.ContainerStatuses {
			waiting := containerStatus.State.Waiting
			// Article listing common container errors: https://medium.com/kokster/debugging-crashloopbackoffs-with-init-containers-26f79e9fb5bf
			if waiting != nil && (strings.HasPrefix(waiting.Reason, "Err") || strings.HasSuffix(waiting.Reason, "Error") || strings.HasSuffix(waiting.Reason, "BackOff")) {
				status = HealthStatusDegraded
				messages = append(messages, waiting.Message)
			}
		}

		if status != "" {
			return &HealthStatus{
				Status:  status,
				Message: strings.Join(messages, ", "),
			}
		}
	}

	getFailMessage := func(ctr *corev1.ContainerStatus) string {
		if ctr.State.Terminated != nil {
			if ctr.State.Terminated.Message != "" {
				return ctr.State.Terminated.Message
			}
			if ctr.State.Terminated.Reason == "OOMKilled" {
				return ctr.State.Terminated.Reason
			}
			if ctr.State.Terminated.ExitCode != 0 {
				return fmt.Sprintf("container %q failed with exit code %d", ctr.Name, ctr.State.Terminated.ExitCode)
			}
		}
		return ""
	}

	switch pod.Status.Phase {
	case corev1.PodPending:
		return &HealthStatus{
			Status:  HealthStatusProgressing,
			Message: pod.Status.Message,
		}
	case corev1.PodSucceeded:
		return &HealthStatus{
			Status:  HealthStatusHealthy,
			Message: pod.Status.Message,
		}
	case corev1.PodFailed:
		if pod.Status.Message != "" {
			// The pod has a meaningful error message. Use that.
			return &HealthStatus{Status: HealthStatusDegraded, Message: pod.Status.Message}
		}
		for _, ctr := range append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...) {
			if msg := getFailMessage(&ctr); msg != "" {
				return &HealthStatus{Status: HealthStatusDegraded, Message: msg}
			}
		}

		return &HealthStatus{Status: HealthStatusDegraded, Message: ""}
	case corev1.PodRunning:
		switch pod.Spec.RestartPolicy {
		case corev1.RestartPolicyAlways:
			// If the pod is ready, it is automatically healthy.
			if podutils.IsPodReady(pod) {
				return &HealthStatus{
					Status:  HealthStatusHealthy,
					Message: pod.Status.Message,
				}
			}
			// If it is not ready, check whether any container terminated; if so, the pod is degraded.
			for _, ctrStatus := range pod.Status.ContainerStatuses {
				if ctrStatus.LastTerminationState.Terminated != nil {
					return &HealthStatus{
						Status:  HealthStatusDegraded,
						Message: pod.Status.Message,
					}
				}
			}
			// Otherwise we are progressing towards a ready state.
			return &HealthStatus{
				Status:  HealthStatusProgressing,
				Message: pod.Status.Message,
			}
		case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever:
			// Pods with a restart policy of OnFailure or Never have a finite life.
			// These pods are typically resource hooks, so we consider them Progressing
			// instead of Healthy.
			return &HealthStatus{
				Status:  HealthStatusProgressing,
				Message: pod.Status.Message,
			}
		}
	}

	return &HealthStatus{
		Status:  HealthStatusUnknown,
		Message: pod.Status.Message,
	}
}
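A short usage sketch (assumed, not from this commit): with RestartPolicyAlways, a container waiting in ImagePullBackOff makes the waiting-reason check above report the pod as Degraded. The image reference is made up.

// Hypothetical example: a pod stuck pulling its image is classified as Degraded.
pod := &corev1.Pod{
	Spec: corev1.PodSpec{RestartPolicy: corev1.RestartPolicyAlways},
	Status: corev1.PodStatus{
		ContainerStatuses: []corev1.ContainerStatus{{
			State: corev1.ContainerState{
				Waiting: &corev1.ContainerStateWaiting{
					Reason:  "ImagePullBackOff",
					Message: "Back-off pulling image \"registry.example/builder:missing\"",
				},
			},
		}},
	},
}
if hs := getCorev1PodHealth(pod); hs.Status == HealthStatusDegraded {
	fmt.Println(hs.Message) // prints the back-off message collected above
}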
func getBatchv1JobHealth(job *batchv1.Job) *HealthStatus {
	failed := false
	var failMsg string
	complete := false
	var message string
	isSuspended := false
	for _, condition := range job.Status.Conditions {
		switch condition.Type {
		case batchv1.JobFailed:
			failed = true
			complete = true
			failMsg = condition.Message
		case batchv1.JobComplete:
			complete = true
			message = condition.Message
		case batchv1.JobSuspended:
			complete = true
			message = condition.Message
			if condition.Status == corev1.ConditionTrue {
				isSuspended = true
			}
		}
	}

	if !complete {
		return &HealthStatus{
			Status:  HealthStatusProgressing,
			Message: message,
		}
	}
	if failed {
		return &HealthStatus{
			Status:  HealthStatusDegraded,
			Message: failMsg,
		}
	}
	if isSuspended {
		return &HealthStatus{
			Status:  HealthStatusSuspended,
			Message: message,
		}
	}

	return &HealthStatus{
		Status:  HealthStatusHealthy,
		Message: message,
	}
}
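A sketch of how a reconciler might translate the job health into requeue behaviour. The helper below and its return contract are illustrative assumptions, not code from this change; osbuilder's actual controller logic is not shown in this diff.

// reconcileJobHealth is a hypothetical caller: it maps job health to reconcile behaviour.
func reconcileJobHealth(job *batchv1.Job) (requeue bool, err error) {
	switch hs := getBatchv1JobHealth(job); hs.Status {
	case HealthStatusHealthy:
		return false, nil // the job finished successfully; nothing left to do
	case HealthStatusDegraded:
		return false, fmt.Errorf("job %s failed: %s", job.Name, hs.Message)
	default:
		// Progressing, Suspended, or Unknown: check again on the next reconcile.
		return true, nil
	}
}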