Mirror of https://github.com/kairos-io/osbuilder.git (synced 2025-06-26 15:11:37 +00:00)

improve error handling

This commit is contained in:
parent 8b67f08832
commit cd30c5ba11
@@ -4,8 +4,13 @@ import (
    "context"
    "fmt"
    "reflect"
    "strings"

    batchv1 "k8s.io/api/batch/v1"
    corev1 "k8s.io/api/core/v1"

    "k8s.io/client-go/util/retry"
    "k8s.io/kubectl/pkg/util/podutils"
    ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
)

@@ -26,3 +31,185 @@ func TryToUpdateStatus(ctx context.Context, client ctrlruntimeclient.Client, obj
    })
}
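Only the closing braces of TryToUpdateStatus appear as context in this hunk. For readers without the full file, a minimal sketch of the shape such a helper usually takes, given the "k8s.io/client-go/util/retry" import added above, could look like the following; the completed signature and body here are assumptions, not the actual code in the repository:

// Sketch only: the real body is not part of this diff. It assumes the helper
// wraps the status update in retry.RetryOnConflict, so a resourceVersion
// conflict triggers a retry instead of a hard failure.
func TryToUpdateStatus(ctx context.Context, client ctrlruntimeclient.Client, obj ctrlruntimeclient.Object) error {
    return retry.RetryOnConflict(retry.DefaultRetry, func() error {
        // Each attempt writes the status subresource; RetryOnConflict re-runs
        // this closure only when the API server reports a conflict error.
        return client.Status().Update(ctx, obj)
    })
}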
const (
    // Indicates that health assessment failed and actual health status is unknown
    HealthStatusUnknown HealthStatusCode = "Unknown"
    // Progressing health status means that resource is not healthy but still have a chance to reach healthy state
    HealthStatusProgressing HealthStatusCode = "Progressing"
    // Resource is 100% healthy
    HealthStatusHealthy HealthStatusCode = "Healthy"
    // Assigned to resources that are suspended or paused. The typical example is a
    // [suspended](https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#suspend) CronJob.
    HealthStatusSuspended HealthStatusCode = "Suspended"
    HealthStatusPaused    HealthStatusCode = "Paused"
    // Degrade status is used if resource status indicates failure or resource could not reach healthy state
    // within some timeout.
    HealthStatusDegraded HealthStatusCode = "Degraded"
    // Indicates that resource is missing in the cluster.
    HealthStatusMissing HealthStatusCode = "Missing"
)

// Represents resource health status
type HealthStatusCode string

type HealthStatus struct {
    Status  HealthStatusCode `json:"status,omitempty"`
    Message string           `json:"message,omitempty"`
}

func getCorev1PodHealth(pod *corev1.Pod) *HealthStatus {
    // This logic cannot be applied when the pod.Spec.RestartPolicy is: corev1.RestartPolicyOnFailure,
    // corev1.RestartPolicyNever, otherwise it breaks the resource hook logic.
    // The issue is, if we mark a pod with ImagePullBackOff as Degraded, and the pod is used as a resource hook,
    // then we will prematurely fail the PreSync/PostSync hook. Meanwhile, when that error condition is resolved
    // (e.g. the image is available), the resource hook pod will unexpectedly be executed even though the sync has
    // completed.
    if pod.Spec.RestartPolicy == corev1.RestartPolicyAlways {
        var status HealthStatusCode
        var messages []string

        for _, containerStatus := range pod.Status.ContainerStatuses {
            waiting := containerStatus.State.Waiting
            // Article listing common container errors: https://medium.com/kokster/debugging-crashloopbackoffs-with-init-containers-26f79e9fb5bf
            if waiting != nil && (strings.HasPrefix(waiting.Reason, "Err") || strings.HasSuffix(waiting.Reason, "Error") || strings.HasSuffix(waiting.Reason, "BackOff")) {
                status = HealthStatusDegraded
                messages = append(messages, waiting.Message)
            }
        }

        if status != "" {
            return &HealthStatus{
                Status:  status,
                Message: strings.Join(messages, ", "),
            }
        }
    }

    getFailMessage := func(ctr *corev1.ContainerStatus) string {
        if ctr.State.Terminated != nil {
            if ctr.State.Terminated.Message != "" {
                return ctr.State.Terminated.Message
            }
            if ctr.State.Terminated.Reason == "OOMKilled" {
                return ctr.State.Terminated.Reason
            }
            if ctr.State.Terminated.ExitCode != 0 {
                return fmt.Sprintf("container %q failed with exit code %d", ctr.Name, ctr.State.Terminated.ExitCode)
            }
        }
        return ""
    }

    switch pod.Status.Phase {
    case corev1.PodPending:
        return &HealthStatus{
            Status:  HealthStatusProgressing,
            Message: pod.Status.Message,
        }
    case corev1.PodSucceeded:
        return &HealthStatus{
            Status:  HealthStatusHealthy,
            Message: pod.Status.Message,
        }
    case corev1.PodFailed:
        if pod.Status.Message != "" {
            // Pod has a nice error message. Use that.
            return &HealthStatus{Status: HealthStatusDegraded, Message: pod.Status.Message}
        }
        for _, ctr := range append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...) {
            if msg := getFailMessage(&ctr); msg != "" {
                return &HealthStatus{Status: HealthStatusDegraded, Message: msg}
            }
        }

        return &HealthStatus{Status: HealthStatusDegraded, Message: ""}
    case corev1.PodRunning:
        switch pod.Spec.RestartPolicy {
        case corev1.RestartPolicyAlways:
            // if pod is ready, it is automatically healthy
            if podutils.IsPodReady(pod) {
                return &HealthStatus{
                    Status:  HealthStatusHealthy,
                    Message: pod.Status.Message,
                }
            }
            // if it's not ready, check to see if any container terminated, if so, it's degraded
            for _, ctrStatus := range pod.Status.ContainerStatuses {
                if ctrStatus.LastTerminationState.Terminated != nil {
                    return &HealthStatus{
                        Status:  HealthStatusDegraded,
                        Message: pod.Status.Message,
                    }
                }
            }
            // otherwise we are progressing towards a ready state
            return &HealthStatus{
                Status:  HealthStatusProgressing,
                Message: pod.Status.Message,
            }
        case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever:
            // pods set with a restart policy of OnFailure or Never, have a finite life.
            // These pods are typically resource hooks. Thus, we consider these as Progressing
            // instead of healthy.
            return &HealthStatus{
                Status:  HealthStatusProgressing,
                Message: pod.Status.Message,
            }
        }
    }
    return &HealthStatus{
        Status:  HealthStatusUnknown,
        Message: pod.Status.Message,
    }
}

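As a side note, the waiting-reason check near the top of getCorev1PodHealth is a plain string match. The snippet below is a hypothetical, standalone restatement of that rule (not part of the commit), included only to make explicit which common kubelet reasons it catches:

// Sketch only: same matching rule as in getCorev1PodHealth, shown in isolation.
// Reasons such as "ErrImagePull", "ImagePullBackOff", "CrashLoopBackOff" and
// "CreateContainerError" all match and are reported as Degraded.
func isDegradedWaitingReason(reason string) bool {
    return strings.HasPrefix(reason, "Err") ||
        strings.HasSuffix(reason, "Error") ||
        strings.HasSuffix(reason, "BackOff")
}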
func getBatchv1JobHealth(job *batchv1.Job) *HealthStatus {
    failed := false
    var failMsg string
    complete := false
    var message string
    isSuspended := false
    for _, condition := range job.Status.Conditions {
        switch condition.Type {
        case batchv1.JobFailed:
            failed = true
            complete = true
            failMsg = condition.Message
        case batchv1.JobComplete:
            complete = true
            message = condition.Message
        case batchv1.JobSuspended:
            complete = true
            message = condition.Message
            if condition.Status == corev1.ConditionTrue {
                isSuspended = true
            }
        }
    }

    if !complete {
        return &HealthStatus{
            Status:  HealthStatusProgressing,
            Message: message,
        }
    }
    if failed {
        return &HealthStatus{
            Status:  HealthStatusDegraded,
            Message: failMsg,
        }
    }
    if isSuspended {
        return &HealthStatus{
            Status:  HealthStatusSuspended,
            Message: failMsg,
        }
    }

    return &HealthStatus{
        Status:  HealthStatusHealthy,
        Message: message,
    }
}

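To make the intended behaviour of the new Job helper concrete, here is a small hypothetical example (not part of the commit) of how getBatchv1JobHealth classifies a Job that has hit its backoff limit, using the types shown in the hunk above:

// Sketch only: illustrates the helper on a hand-built Job object.
job := &batchv1.Job{
    Status: batchv1.JobStatus{
        Conditions: []batchv1.JobCondition{{
            Type:    batchv1.JobFailed,
            Status:  corev1.ConditionTrue,
            Message: "Job has reached the specified backoff limit",
        }},
    },
}
h := getBatchv1JobHealth(job)
// h.Status == HealthStatusDegraded
// h.Message == "Job has reached the specified backoff limit"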
@@ -119,12 +119,7 @@ func (r *OSArtifactReconciler) Reconcile(ctx context.Context, req ctrl.Request)
        })
        return ctrl.Result{}, TryToUpdateStatus(ctx, r.Client, artifact)
    case osbuilder.Error:
        meta.SetStatusCondition(&artifact.Status.Conditions, metav1.Condition{
            Type:   "Ready",
            Status: metav1.ConditionFalse,
            Reason: "Error",
        })
        return ctrl.Result{}, TryToUpdateStatus(ctx, r.Client, artifact)
        return ctrl.Result{}, nil
    default:
        return r.checkBuild(ctx, artifact)
    }

@@ -210,6 +205,12 @@ func (r *OSArtifactReconciler) checkBuild(ctx context.Context, artifact *osbuild
        return ctrl.Result{}, TryToUpdateStatus(ctx, r.Client, artifact)
    case corev1.PodFailed:
        artifact.Status.Phase = osbuilder.Error
        meta.SetStatusCondition(&artifact.Status.Conditions, metav1.Condition{
            Type:    "Ready",
            Status:  metav1.ConditionFalse,
            Reason:  "Error",
            Message: getCorev1PodHealth(&pod).Message,
        })
        return ctrl.Result{}, TryToUpdateStatus(ctx, r.Client, artifact)
    case corev1.PodPending, corev1.PodRunning:
        return ctrl.Result{}, nil

@@ -275,6 +276,7 @@ func (r *OSArtifactReconciler) checkExport(ctx context.Context, artifact *osbuil
            },
        },
        Spec: batchv1.JobSpec{
            BackoffLimit: ptr(int32(1)),
            Template: corev1.PodTemplateSpec{
                Spec: corev1.PodSpec{
                    RestartPolicy: corev1.RestartPolicyOnFailure,

@@ -360,24 +362,35 @@ func (r *OSArtifactReconciler) checkExport(ctx context.Context, artifact *osbuil
                return ctrl.Result{Requeue: true}, nil
            }

        } else if job.Spec.Completions == nil || *job.Spec.Completions == 1 {
        } else if job.Spec.Completions != nil {
            if job.Status.Succeeded > 0 {
                artifact.Status.Phase = osbuilder.Ready
                if err := TryToUpdateStatus(ctx, r.Client, artifact); err != nil {
                    log.FromContext(ctx).Error(err, "failed to update artifact status")
                    return ctrl.Result{}, err
                }
            }
        } else if *job.Spec.BackoffLimit <= job.Status.Failed {
            artifact.Status.Phase = osbuilder.Error
            if err := TryToUpdateStatus(ctx, r.Client, artifact); err != nil {
                log.FromContext(ctx).Error(err, "failed to update artifact status")
                return ctrl.Result{}, err
            return ctrl.Result{}, nil
        } else if job.Status.Failed > 0 {
            artifact.Status.Phase = osbuilder.Error
            h := getBatchv1JobHealth(job)
            if h.Status == HealthStatusDegraded {
                meta.SetStatusCondition(&artifact.Status.Conditions, metav1.Condition{
                    Type:    "Ready",
                    Status:  metav1.ConditionFalse,
                    Reason:  "Error",
                    Message: h.Message,
                })
                if err := TryToUpdateStatus(ctx, r.Client, artifact); err != nil {
                    log.FromContext(ctx).Error(err, "failed to update artifact status")
                    return ctrl.Result{}, err
                }
                return ctrl.Result{}, nil
            }
        }
    }
}

    return ctrl.Result{}, nil
    return requeue, nil
}

// SetupWithManager sets up the controller with the Manager.

go.mod (9 changed lines)
@@ -3,12 +3,10 @@ module github.com/kairos-io/osbuilder

go 1.23.4

require (
    github.com/onsi/ginkgo/v2 v2.22.2
    github.com/onsi/gomega v1.36.2
    github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5
    k8s.io/api v0.32.1
    k8s.io/apimachinery v0.32.1
    k8s.io/client-go v0.32.1
    k8s.io/kubectl v0.32.1
    sigs.k8s.io/controller-runtime v0.20.0
)

@@ -26,14 +24,12 @@ require (
    github.com/go-openapi/jsonpointer v0.21.0 // indirect
    github.com/go-openapi/jsonreference v0.20.2 // indirect
    github.com/go-openapi/swag v0.23.0 // indirect
    github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
    github.com/gogo/protobuf v1.3.2 // indirect
    github.com/golang/protobuf v1.5.4 // indirect
    github.com/google/btree v1.1.3 // indirect
    github.com/google/gnostic-models v0.6.8 // indirect
    github.com/google/go-cmp v0.6.0 // indirect
    github.com/google/gofuzz v1.2.0 // indirect
    github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect
    github.com/google/uuid v1.6.0 // indirect
    github.com/josharian/intern v1.0.0 // indirect
    github.com/json-iterator/go v1.1.12 // indirect

@@ -41,6 +37,8 @@ require (
    github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
    github.com/modern-go/reflect2 v1.0.2 // indirect
    github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
    github.com/onsi/ginkgo/v2 v2.22.2 // indirect
    github.com/onsi/gomega v1.36.2 // indirect
    github.com/pkg/errors v0.9.1 // indirect
    github.com/prometheus/client_golang v1.19.1 // indirect
    github.com/prometheus/client_model v0.6.1 // indirect

@@ -57,7 +55,6 @@ require (
    golang.org/x/term v0.27.0 // indirect
    golang.org/x/text v0.21.0 // indirect
    golang.org/x/time v0.7.0 // indirect
    golang.org/x/tools v0.28.0 // indirect
    gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
    google.golang.org/protobuf v1.36.1 // indirect
    gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect

go.sum (4 changed lines)
@@ -75,8 +75,6 @@ github.com/onsi/ginkgo/v2 v2.22.2 h1:/3X8Panh8/WwhU/3Ssa6rCKqPLuAkVY2I0RoyDLySlU
github.com/onsi/ginkgo/v2 v2.22.2/go.mod h1:oeMosUL+8LtarXBHu/c0bx2D/K9zyQ6uX3cTyztHwsk=
github.com/onsi/gomega v1.36.2 h1:koNYke6TVk6ZmnyHrCXba/T/MoLBXFjeC1PtvYgw0A8=
github.com/onsi/gomega v1.36.2/go.mod h1:DdwyADRjrc825LhMEkD76cHR5+pUnjhUN8GlHlRPHzY=
github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5 h1:Ii+DKncOVM8Cu1Hc+ETb5K+23HdAMvESYE3ZJ5b5cMI=
github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5/go.mod h1:iIss55rKnNBTvrwdmkUpLnDpZoAHvWaiq5+iMmen4AE=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=

@@ -180,6 +178,8 @@ k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f h1:GA7//TjRY9yWGy1poLzYYJJ4JRdzg3+O6e8I+e+8T5Y=
k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f/go.mod h1:R/HEjbvWI0qdfb8viZUeVZm0X6IZnxAydC7YU42CMw4=
k8s.io/kubectl v0.32.1 h1:/btLtXLQUU1rWx8AEvX9jrb9LaI6yeezt3sFALhB8M8=
k8s.io/kubectl v0.32.1/go.mod h1:sezNuyWi1STk4ZNPVRIFfgjqMI6XMf+oCVLjZen/pFQ=
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 h1:M3sRQVHv7vB20Xc2ybTt7ODCeFj6JSWYFzOFnYeS6Ro=
k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
sigs.k8s.io/controller-runtime v0.20.0 h1:jjkMo29xEXH+02Md9qaVXfEIaMESSpy3TBWPrsfQkQs=