improve error handling

Lukasz Zajaczkowski 2025-01-27 14:52:55 +01:00
parent 8b67f08832
commit cd30c5ba11
4 changed files with 219 additions and 22 deletions


@@ -4,8 +4,13 @@ import (
"context"
"fmt"
"reflect"
"strings"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/client-go/util/retry"
"k8s.io/kubectl/pkg/util/podutils"
ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
)
@@ -26,3 +31,185 @@ func TryToUpdateStatus(ctx context.Context, client ctrlruntimeclient.Client, obj
})
}
const (
// Indicates that health assessment failed and actual health status is unknown
HealthStatusUnknown HealthStatusCode = "Unknown"
// Progressing health status means that the resource is not yet healthy but still has a chance to reach a healthy state
HealthStatusProgressing HealthStatusCode = "Progressing"
// Resource is 100% healthy
HealthStatusHealthy HealthStatusCode = "Healthy"
// Assigned to resources that are suspended or paused. The typical example is a
// [suspended](https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#suspend) CronJob.
HealthStatusSuspended HealthStatusCode = "Suspended"
HealthStatusPaused HealthStatusCode = "Paused"
// Degraded status is used if the resource status indicates a failure or the resource could not reach a healthy state
// within some timeout.
HealthStatusDegraded HealthStatusCode = "Degraded"
// Indicates that resource is missing in the cluster.
HealthStatusMissing HealthStatusCode = "Missing"
)
// Represents resource health status
type HealthStatusCode string
type HealthStatus struct {
Status HealthStatusCode `json:"status,omitempty"`
Message string `json:"message,omitempty"`
}
func getCorev1PodHealth(pod *corev1.Pod) *HealthStatus {
// This logic cannot be applied when pod.Spec.RestartPolicy is corev1.RestartPolicyOnFailure or
// corev1.RestartPolicyNever, because it would break the resource hook logic.
// The issue is, if we mark a pod with ImagePullBackOff as Degraded, and the pod is used as a resource hook,
// then we will prematurely fail the PreSync/PostSync hook. Meanwhile, when that error condition is resolved
// (e.g. the image is available), the resource hook pod will unexpectedly be executed even though the sync has
// completed.
if pod.Spec.RestartPolicy == corev1.RestartPolicyAlways {
var status HealthStatusCode
var messages []string
for _, containerStatus := range pod.Status.ContainerStatuses {
waiting := containerStatus.State.Waiting
// Article listing common container errors: https://medium.com/kokster/debugging-crashloopbackoffs-with-init-containers-26f79e9fb5bf
if waiting != nil && (strings.HasPrefix(waiting.Reason, "Err") || strings.HasSuffix(waiting.Reason, "Error") || strings.HasSuffix(waiting.Reason, "BackOff")) {
status = HealthStatusDegraded
messages = append(messages, waiting.Message)
}
}
if status != "" {
return &HealthStatus{
Status: status,
Message: strings.Join(messages, ", "),
}
}
}
getFailMessage := func(ctr *corev1.ContainerStatus) string {
if ctr.State.Terminated != nil {
if ctr.State.Terminated.Message != "" {
return ctr.State.Terminated.Message
}
if ctr.State.Terminated.Reason == "OOMKilled" {
return ctr.State.Terminated.Reason
}
if ctr.State.Terminated.ExitCode != 0 {
return fmt.Sprintf("container %q failed with exit code %d", ctr.Name, ctr.State.Terminated.ExitCode)
}
}
return ""
}
switch pod.Status.Phase {
case corev1.PodPending:
return &HealthStatus{
Status: HealthStatusProgressing,
Message: pod.Status.Message,
}
case corev1.PodSucceeded:
return &HealthStatus{
Status: HealthStatusHealthy,
Message: pod.Status.Message,
}
case corev1.PodFailed:
if pod.Status.Message != "" {
// Pod has a nice error message. Use that.
return &HealthStatus{Status: HealthStatusDegraded, Message: pod.Status.Message}
}
for _, ctr := range append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...) {
if msg := getFailMessage(&ctr); msg != "" {
return &HealthStatus{Status: HealthStatusDegraded, Message: msg}
}
}
return &HealthStatus{Status: HealthStatusDegraded, Message: ""}
case corev1.PodRunning:
switch pod.Spec.RestartPolicy {
case corev1.RestartPolicyAlways:
// if pod is ready, it is automatically healthy
if podutils.IsPodReady(pod) {
return &HealthStatus{
Status: HealthStatusHealthy,
Message: pod.Status.Message,
}
}
// if it's not ready, check whether any container terminated; if so, it's degraded
for _, ctrStatus := range pod.Status.ContainerStatuses {
if ctrStatus.LastTerminationState.Terminated != nil {
return &HealthStatus{
Status: HealthStatusDegraded,
Message: pod.Status.Message,
}
}
}
// otherwise we are progressing towards a ready state
return &HealthStatus{
Status: HealthStatusProgressing,
Message: pod.Status.Message,
}
case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever:
// Pods with a restart policy of OnFailure or Never have a finite life.
// These pods are typically resource hooks, so we consider them Progressing
// instead of Healthy.
return &HealthStatus{
Status: HealthStatusProgressing,
Message: pod.Status.Message,
}
}
}
return &HealthStatus{
Status: HealthStatusUnknown,
Message: pod.Status.Message,
}
}
func getBatchv1JobHealth(job *batchv1.Job) *HealthStatus {
failed := false
var failMsg string
complete := false
var message string
isSuspended := false
for _, condition := range job.Status.Conditions {
switch condition.Type {
case batchv1.JobFailed:
failed = true
complete = true
failMsg = condition.Message
case batchv1.JobComplete:
complete = true
message = condition.Message
case batchv1.JobSuspended:
complete = true
message = condition.Message
if condition.Status == corev1.ConditionTrue {
isSuspended = true
}
}
}
if !complete {
return &HealthStatus{
Status: HealthStatusProgressing,
Message: message,
}
}
if failed {
return &HealthStatus{
Status: HealthStatusDegraded,
Message: failMsg,
}
}
if isSuspended {
return &HealthStatus{
Status: HealthStatusSuspended,
Message: failMsg,
}
}
return &HealthStatus{
Status: HealthStatusHealthy,
Message: message,
}
}
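For context, a minimal sketch (not part of this commit) of how the new helpers classify a pod stuck in ImagePullBackOff and a Job that has a Failed condition. It assumes the snippet lives in the same package as the helpers; the function name and field values are illustrative only.

func exampleHealthStatuses() {
	// A Running pod whose container is waiting with a BackOff reason is reported as Degraded.
	pod := &corev1.Pod{
		Spec: corev1.PodSpec{RestartPolicy: corev1.RestartPolicyAlways},
		Status: corev1.PodStatus{
			Phase: corev1.PodRunning,
			ContainerStatuses: []corev1.ContainerStatus{{
				State: corev1.ContainerState{
					Waiting: &corev1.ContainerStateWaiting{Reason: "ImagePullBackOff", Message: "back-off pulling image"},
				},
			}},
		},
	}
	fmt.Println(getCorev1PodHealth(pod).Status) // Degraded

	// A Job with a Failed condition is reported as Degraded, carrying the condition message.
	job := &batchv1.Job{
		Status: batchv1.JobStatus{
			Conditions: []batchv1.JobCondition{
				{Type: batchv1.JobFailed, Status: corev1.ConditionTrue, Message: "BackoffLimitExceeded"},
			},
		},
	}
	fmt.Println(getBatchv1JobHealth(job).Status) // Degraded
}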


@@ -119,12 +119,7 @@ func (r *OSArtifactReconciler) Reconcile(ctx context.Context, req ctrl.Request)
})
return ctrl.Result{}, TryToUpdateStatus(ctx, r.Client, artifact)
case osbuilder.Error:
meta.SetStatusCondition(&artifact.Status.Conditions, metav1.Condition{
Type: "Ready",
Status: metav1.ConditionFalse,
Reason: "Error",
})
return ctrl.Result{}, TryToUpdateStatus(ctx, r.Client, artifact)
return ctrl.Result{}, nil
default:
return r.checkBuild(ctx, artifact)
}
@@ -210,6 +205,12 @@ func (r *OSArtifactReconciler) checkBuild(ctx context.Context, artifact *osbuild
return ctrl.Result{}, TryToUpdateStatus(ctx, r.Client, artifact)
case corev1.PodFailed:
artifact.Status.Phase = osbuilder.Error
meta.SetStatusCondition(&artifact.Status.Conditions, metav1.Condition{
Type: "Ready",
Status: metav1.ConditionFalse,
Reason: "Error",
Message: getCorev1PodHealth(&pod).Message,
})
return ctrl.Result{}, TryToUpdateStatus(ctx, r.Client, artifact)
case corev1.PodPending, corev1.PodRunning:
return ctrl.Result{}, nil
@@ -275,6 +276,7 @@ func (r *OSArtifactReconciler) checkExport(ctx context.Context, artifact *osbuil
},
},
Spec: batchv1.JobSpec{
BackoffLimit: ptr(int32(1)),
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
RestartPolicy: corev1.RestartPolicyOnFailure,
@@ -360,24 +362,35 @@ func (r *OSArtifactReconciler) checkExport(ctx context.Context, artifact *osbuil
return ctrl.Result{Requeue: true}, nil
}
} else if job.Spec.Completions == nil || *job.Spec.Completions == 1 {
} else if job.Spec.Completions != nil {
if job.Status.Succeeded > 0 {
artifact.Status.Phase = osbuilder.Ready
if err := TryToUpdateStatus(ctx, r.Client, artifact); err != nil {
log.FromContext(ctx).Error(err, "failed to update artifact status")
return ctrl.Result{}, err
}
}
} else if *job.Spec.BackoffLimit <= job.Status.Failed {
artifact.Status.Phase = osbuilder.Error
if err := TryToUpdateStatus(ctx, r.Client, artifact); err != nil {
log.FromContext(ctx).Error(err, "failed to update artifact status")
return ctrl.Result{}, err
return ctrl.Result{}, nil
} else if job.Status.Failed > 0 {
artifact.Status.Phase = osbuilder.Error
h := getBatchv1JobHealth(job)
if h.Status == HealthStatusDegraded {
meta.SetStatusCondition(&artifact.Status.Conditions, metav1.Condition{
Type: "Ready",
Status: metav1.ConditionFalse,
Reason: "Error",
Message: h.Message,
})
if err := TryToUpdateStatus(ctx, r.Client, artifact); err != nil {
log.FromContext(ctx).Error(err, "failed to update artifact status")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
}
}
}
return ctrl.Result{}, nil
return requeue, nil
}
// SetupWithManager sets up the controller with the Manager.

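The export Job above sets BackoffLimit via a ptr helper whose definition is not shown in this diff. A minimal generic version (an assumption; the repository may define it differently or use k8s.io/utils/ptr.To) would be:

// ptr returns a pointer to any value; handy for optional fields such as
// JobSpec.BackoffLimit. Assumed helper, not shown in this diff.
func ptr[T any](v T) *T {
	return &v
}

With BackoffLimit set to 1, a failing export Job is not retried indefinitely, and its failure surfaces through getBatchv1JobHealth as the message on the artifact's Ready condition.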
go.mod

@@ -3,12 +3,10 @@ module github.com/kairos-io/osbuilder
go 1.23.4
require (
github.com/onsi/ginkgo/v2 v2.22.2
github.com/onsi/gomega v1.36.2
github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5
k8s.io/api v0.32.1
k8s.io/apimachinery v0.32.1
k8s.io/client-go v0.32.1
k8s.io/kubectl v0.32.1
sigs.k8s.io/controller-runtime v0.20.0
)
@@ -26,14 +24,12 @@ require (
github.com/go-openapi/jsonpointer v0.21.0 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
github.com/go-openapi/swag v0.23.0 // indirect
github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/btree v1.1.3 // indirect
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/go-cmp v0.6.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
@@ -41,6 +37,8 @@ require (
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/onsi/ginkgo/v2 v2.22.2 // indirect
github.com/onsi/gomega v1.36.2 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_golang v1.19.1 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
@@ -57,7 +55,6 @@ require (
golang.org/x/term v0.27.0 // indirect
golang.org/x/text v0.21.0 // indirect
golang.org/x/time v0.7.0 // indirect
golang.org/x/tools v0.28.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
google.golang.org/protobuf v1.36.1 // indirect
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect

go.sum

@@ -75,8 +75,6 @@ github.com/onsi/ginkgo/v2 v2.22.2 h1:/3X8Panh8/WwhU/3Ssa6rCKqPLuAkVY2I0RoyDLySlU
github.com/onsi/ginkgo/v2 v2.22.2/go.mod h1:oeMosUL+8LtarXBHu/c0bx2D/K9zyQ6uX3cTyztHwsk=
github.com/onsi/gomega v1.36.2 h1:koNYke6TVk6ZmnyHrCXba/T/MoLBXFjeC1PtvYgw0A8=
github.com/onsi/gomega v1.36.2/go.mod h1:DdwyADRjrc825LhMEkD76cHR5+pUnjhUN8GlHlRPHzY=
github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5 h1:Ii+DKncOVM8Cu1Hc+ETb5K+23HdAMvESYE3ZJ5b5cMI=
github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5/go.mod h1:iIss55rKnNBTvrwdmkUpLnDpZoAHvWaiq5+iMmen4AE=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
@@ -180,6 +178,8 @@ k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f h1:GA7//TjRY9yWGy1poLzYYJJ4JRdzg3+O6e8I+e+8T5Y=
k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f/go.mod h1:R/HEjbvWI0qdfb8viZUeVZm0X6IZnxAydC7YU42CMw4=
k8s.io/kubectl v0.32.1 h1:/btLtXLQUU1rWx8AEvX9jrb9LaI6yeezt3sFALhB8M8=
k8s.io/kubectl v0.32.1/go.mod h1:sezNuyWi1STk4ZNPVRIFfgjqMI6XMf+oCVLjZen/pFQ=
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 h1:M3sRQVHv7vB20Xc2ybTt7ODCeFj6JSWYFzOFnYeS6Ro=
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
sigs.k8s.io/controller-runtime v0.20.0 h1:jjkMo29xEXH+02Md9qaVXfEIaMESSpy3TBWPrsfQkQs=