improve error handling

Lukasz Zajaczkowski 2025-01-27 14:52:55 +01:00
parent 8b67f08832
commit cd30c5ba11
4 changed files with 219 additions and 22 deletions


@@ -4,8 +4,13 @@ import (
"context"
"fmt"
"reflect"
"strings"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/client-go/util/retry"
"k8s.io/kubectl/pkg/util/podutils"
ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
)
@@ -26,3 +31,185 @@ func TryToUpdateStatus(ctx context.Context, client ctrlruntimeclient.Client, obj
})
}
const (
// Indicates that health assessment failed and actual health status is unknown
HealthStatusUnknown HealthStatusCode = "Unknown"
// Progressing health status means that the resource is not yet healthy but still has a chance to reach a healthy state
HealthStatusProgressing HealthStatusCode = "Progressing"
// Resource is 100% healthy
HealthStatusHealthy HealthStatusCode = "Healthy"
// Assigned to resources that are suspended or paused. The typical example is a
// [suspended](https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#suspend) CronJob.
HealthStatusSuspended HealthStatusCode = "Suspended"
HealthStatusPaused HealthStatusCode = "Paused"
// Degraded status is used if the resource status indicates a failure or the resource could not reach a healthy state
// within some timeout.
HealthStatusDegraded HealthStatusCode = "Degraded"
// Indicates that resource is missing in the cluster.
HealthStatusMissing HealthStatusCode = "Missing"
)
// Represents resource health status
type HealthStatusCode string
type HealthStatus struct {
Status HealthStatusCode `json:"status,omitempty"`
Message string `json:"message,omitempty"`
}
func getCorev1PodHealth(pod *corev1.Pod) *HealthStatus {
// This logic cannot be applied when pod.Spec.RestartPolicy is corev1.RestartPolicyOnFailure or
// corev1.RestartPolicyNever, because it would break the resource hook logic.
// The issue is, if we mark a pod with ImagePullBackOff as Degraded, and the pod is used as a resource hook,
// then we will prematurely fail the PreSync/PostSync hook. Meanwhile, when that error condition is resolved
// (e.g. the image is available), the resource hook pod will unexpectedly be executed even though the sync has
// completed.
if pod.Spec.RestartPolicy == corev1.RestartPolicyAlways {
var status HealthStatusCode
var messages []string
for _, containerStatus := range pod.Status.ContainerStatuses {
waiting := containerStatus.State.Waiting
// Article listing common container errors: https://medium.com/kokster/debugging-crashloopbackoffs-with-init-containers-26f79e9fb5bf
if waiting != nil && (strings.HasPrefix(waiting.Reason, "Err") || strings.HasSuffix(waiting.Reason, "Error") || strings.HasSuffix(waiting.Reason, "BackOff")) {
status = HealthStatusDegraded
messages = append(messages, waiting.Message)
}
}
if status != "" {
return &HealthStatus{
Status: status,
Message: strings.Join(messages, ", "),
}
}
}
getFailMessage := func(ctr *corev1.ContainerStatus) string {
if ctr.State.Terminated != nil {
if ctr.State.Terminated.Message != "" {
return ctr.State.Terminated.Message
}
if ctr.State.Terminated.Reason == "OOMKilled" {
return ctr.State.Terminated.Reason
}
if ctr.State.Terminated.ExitCode != 0 {
return fmt.Sprintf("container %q failed with exit code %d", ctr.Name, ctr.State.Terminated.ExitCode)
}
}
return ""
}
switch pod.Status.Phase {
case corev1.PodPending:
return &HealthStatus{
Status: HealthStatusProgressing,
Message: pod.Status.Message,
}
case corev1.PodSucceeded:
return &HealthStatus{
Status: HealthStatusHealthy,
Message: pod.Status.Message,
}
case corev1.PodFailed:
if pod.Status.Message != "" {
// Pod has a nice error message. Use that.
return &HealthStatus{Status: HealthStatusDegraded, Message: pod.Status.Message}
}
for _, ctr := range append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...) {
if msg := getFailMessage(&ctr); msg != "" {
return &HealthStatus{Status: HealthStatusDegraded, Message: msg}
}
}
return &HealthStatus{Status: HealthStatusDegraded, Message: ""}
case corev1.PodRunning:
switch pod.Spec.RestartPolicy {
case corev1.RestartPolicyAlways:
// if pod is ready, it is automatically healthy
if podutils.IsPodReady(pod) {
return &HealthStatus{
Status: HealthStatusHealthy,
Message: pod.Status.Message,
}
}
// if it's not ready, check whether any container terminated; if so, it's degraded
for _, ctrStatus := range pod.Status.ContainerStatuses {
if ctrStatus.LastTerminationState.Terminated != nil {
return &HealthStatus{
Status: HealthStatusDegraded,
Message: pod.Status.Message,
}
}
}
// otherwise we are progressing towards a ready state
return &HealthStatus{
Status: HealthStatusProgressing,
Message: pod.Status.Message,
}
case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever:
// Pods with a restart policy of OnFailure or Never have a finite life.
// These pods are typically resource hooks, so we consider them Progressing
// instead of Healthy.
return &HealthStatus{
Status: HealthStatusProgressing,
Message: pod.Status.Message,
}
}
}
return &HealthStatus{
Status: HealthStatusUnknown,
Message: pod.Status.Message,
}
}
func getBatchv1JobHealth(job *batchv1.Job) *HealthStatus {
failed := false
var failMsg string
complete := false
var message string
isSuspended := false
for _, condition := range job.Status.Conditions {
switch condition.Type {
case batchv1.JobFailed:
failed = true
complete = true
failMsg = condition.Message
case batchv1.JobComplete:
complete = true
message = condition.Message
case batchv1.JobSuspended:
complete = true
message = condition.Message
if condition.Status == corev1.ConditionTrue {
isSuspended = true
}
}
}
if !complete {
return &HealthStatus{
Status: HealthStatusProgressing,
Message: message,
}
}
if failed {
return &HealthStatus{
Status: HealthStatusDegraded,
Message: failMsg,
}
}
if isSuspended {
return &HealthStatus{
Status: HealthStatusSuspended,
Message: failMsg,
}
}
return &HealthStatus{
Status: HealthStatusHealthy,
Message: message,
}
}
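For context, a minimal sketch (not part of this commit) of how the new helpers classify a pod stuck in ImagePullBackOff and a Job that has a Failed condition. It assumes the snippet lives in the same package as the helpers; the function name and field values are illustrative only.

func exampleHealthStatuses() {
	// A Running pod whose container is waiting with a BackOff reason is reported as Degraded.
	pod := &corev1.Pod{
		Spec: corev1.PodSpec{RestartPolicy: corev1.RestartPolicyAlways},
		Status: corev1.PodStatus{
			Phase: corev1.PodRunning,
			ContainerStatuses: []corev1.ContainerStatus{{
				State: corev1.ContainerState{
					Waiting: &corev1.ContainerStateWaiting{Reason: "ImagePullBackOff", Message: "back-off pulling image"},
				},
			}},
		},
	}
	fmt.Println(getCorev1PodHealth(pod).Status) // Degraded

	// A Job with a Failed condition is reported as Degraded, carrying the condition message.
	job := &batchv1.Job{
		Status: batchv1.JobStatus{
			Conditions: []batchv1.JobCondition{
				{Type: batchv1.JobFailed, Status: corev1.ConditionTrue, Message: "BackoffLimitExceeded"},
			},
		},
	}
	fmt.Println(getBatchv1JobHealth(job).Status) // Degraded
}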


@@ -119,12 +119,7 @@ func (r *OSArtifactReconciler) Reconcile(ctx context.Context, req ctrl.Request)
})
return ctrl.Result{}, TryToUpdateStatus(ctx, r.Client, artifact)
case osbuilder.Error:
meta.SetStatusCondition(&artifact.Status.Conditions, metav1.Condition{
Type: "Ready",
Status: metav1.ConditionFalse,
Reason: "Error",
})
return ctrl.Result{}, TryToUpdateStatus(ctx, r.Client, artifact)
return ctrl.Result{}, nil
default:
return r.checkBuild(ctx, artifact)
}
@@ -210,6 +205,12 @@ func (r *OSArtifactReconciler) checkBuild(ctx context.Context, artifact *osbuild
return ctrl.Result{}, TryToUpdateStatus(ctx, r.Client, artifact)
case corev1.PodFailed:
artifact.Status.Phase = osbuilder.Error
meta.SetStatusCondition(&artifact.Status.Conditions, metav1.Condition{
Type: "Ready",
Status: metav1.ConditionFalse,
Reason: "Error",
Message: getCorev1PodHealth(&pod).Message,
})
return ctrl.Result{}, TryToUpdateStatus(ctx, r.Client, artifact)
case corev1.PodPending, corev1.PodRunning:
return ctrl.Result{}, nil
@@ -275,6 +276,7 @@ func (r *OSArtifactReconciler) checkExport(ctx context.Context, artifact *osbuil
},
},
Spec: batchv1.JobSpec{
BackoffLimit: ptr(int32(1)),
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
RestartPolicy: corev1.RestartPolicyOnFailure,
@@ -360,24 +362,35 @@ func (r *OSArtifactReconciler) checkExport(ctx context.Context, artifact *osbuil
return ctrl.Result{Requeue: true}, nil
}
} else if job.Spec.Completions == nil || *job.Spec.Completions == 1 {
} else if job.Spec.Completions != nil {
if job.Status.Succeeded > 0 {
artifact.Status.Phase = osbuilder.Ready
if err := TryToUpdateStatus(ctx, r.Client, artifact); err != nil {
log.FromContext(ctx).Error(err, "failed to update artifact status")
return ctrl.Result{}, err
}
}
} else if *job.Spec.BackoffLimit <= job.Status.Failed {
artifact.Status.Phase = osbuilder.Error
if err := TryToUpdateStatus(ctx, r.Client, artifact); err != nil {
log.FromContext(ctx).Error(err, "failed to update artifact status")
return ctrl.Result{}, err
return ctrl.Result{}, nil
} else if job.Status.Failed > 0 {
artifact.Status.Phase = osbuilder.Error
h := getBatchv1JobHealth(job)
if h.Status == HealthStatusDegraded {
meta.SetStatusCondition(&artifact.Status.Conditions, metav1.Condition{
Type: "Ready",
Status: metav1.ConditionFalse,
Reason: "Error",
Message: h.Message,
})
if err := TryToUpdateStatus(ctx, r.Client, artifact); err != nil {
log.FromContext(ctx).Error(err, "failed to update artifact status")
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}
}
}
}
return ctrl.Result{}, nil
return requeue, nil
}
// SetupWithManager sets up the controller with the Manager.

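The export Job above sets BackoffLimit via a ptr helper whose definition is not shown in this diff. A minimal generic version (an assumption; the repository may define it differently or use k8s.io/utils/ptr.To) would be:

// ptr returns a pointer to any value; handy for optional fields such as
// JobSpec.BackoffLimit. Assumed helper, not shown in this diff.
func ptr[T any](v T) *T {
	return &v
}

With BackoffLimit set to 1, a failing export Job is not retried indefinitely, and its failure surfaces through getBatchv1JobHealth as the message on the artifact's Ready condition.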
go.mod

@@ -3,12 +3,10 @@ module github.com/kairos-io/osbuilder
go 1.23.4
require (
github.com/onsi/ginkgo/v2 v2.22.2
github.com/onsi/gomega v1.36.2
github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5
k8s.io/api v0.32.1
k8s.io/apimachinery v0.32.1
k8s.io/client-go v0.32.1
k8s.io/kubectl v0.32.1
sigs.k8s.io/controller-runtime v0.20.0
)
@@ -26,14 +24,12 @@ require (
github.com/go-openapi/jsonpointer v0.21.0 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
github.com/go-openapi/swag v0.23.0 // indirect
github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
github.com/gogo/protobuf v1.3.2 // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/google/btree v1.1.3 // indirect
github.com/google/gnostic-models v0.6.8 // indirect
github.com/google/go-cmp v0.6.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
@@ -41,6 +37,8 @@ require (
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/onsi/ginkgo/v2 v2.22.2 // indirect
github.com/onsi/gomega v1.36.2 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_golang v1.19.1 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
@@ -57,7 +55,6 @@ require (
golang.org/x/term v0.27.0 // indirect
golang.org/x/text v0.21.0 // indirect
golang.org/x/time v0.7.0 // indirect
golang.org/x/tools v0.28.0 // indirect
gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
google.golang.org/protobuf v1.36.1 // indirect
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect

go.sum

@@ -75,8 +75,6 @@ github.com/onsi/ginkgo/v2 v2.22.2 h1:/3X8Panh8/WwhU/3Ssa6rCKqPLuAkVY2I0RoyDLySlU
github.com/onsi/ginkgo/v2 v2.22.2/go.mod h1:oeMosUL+8LtarXBHu/c0bx2D/K9zyQ6uX3cTyztHwsk=
github.com/onsi/gomega v1.36.2 h1:koNYke6TVk6ZmnyHrCXba/T/MoLBXFjeC1PtvYgw0A8=
github.com/onsi/gomega v1.36.2/go.mod h1:DdwyADRjrc825LhMEkD76cHR5+pUnjhUN8GlHlRPHzY=
github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5 h1:Ii+DKncOVM8Cu1Hc+ETb5K+23HdAMvESYE3ZJ5b5cMI=
github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5/go.mod h1:iIss55rKnNBTvrwdmkUpLnDpZoAHvWaiq5+iMmen4AE=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
@@ -180,6 +178,8 @@ k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f h1:GA7//TjRY9yWGy1poLzYYJJ4JRdzg3+O6e8I+e+8T5Y=
k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f/go.mod h1:R/HEjbvWI0qdfb8viZUeVZm0X6IZnxAydC7YU42CMw4=
k8s.io/kubectl v0.32.1 h1:/btLtXLQUU1rWx8AEvX9jrb9LaI6yeezt3sFALhB8M8=
k8s.io/kubectl v0.32.1/go.mod h1:sezNuyWi1STk4ZNPVRIFfgjqMI6XMf+oCVLjZen/pFQ=
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 h1:M3sRQVHv7vB20Xc2ybTt7ODCeFj6JSWYFzOFnYeS6Ro=
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
sigs.k8s.io/controller-runtime v0.20.0 h1:jjkMo29xEXH+02Md9qaVXfEIaMESSpy3TBWPrsfQkQs=