diff --git a/controllers/kubernetes.go b/controllers/kubernetes.go
index 91d8ab8..281cb36 100644
--- a/controllers/kubernetes.go
+++ b/controllers/kubernetes.go
@@ -4,8 +4,13 @@ import (
 	"context"
 	"fmt"
 	"reflect"
+	"strings"
+
+	batchv1 "k8s.io/api/batch/v1"
+	corev1 "k8s.io/api/core/v1"
 
 	"k8s.io/client-go/util/retry"
+	"k8s.io/kubectl/pkg/util/podutils"
 	ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
 )
 
@@ -26,3 +31,185 @@ func TryToUpdateStatus(ctx context.Context, client ctrlruntimeclient.Client, obj
 	})
 }
+
+const (
+	// Indicates that the health assessment failed and the actual health status is unknown
+	HealthStatusUnknown HealthStatusCode = "Unknown"
+	// Progressing means that the resource is not healthy yet but still has a chance to reach a healthy state
+	HealthStatusProgressing HealthStatusCode = "Progressing"
+	// The resource is 100% healthy
+	HealthStatusHealthy HealthStatusCode = "Healthy"
+	// Assigned to resources that are suspended or paused. The typical example is a
+	// [suspended](https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#suspend) CronJob.
+	HealthStatusSuspended HealthStatusCode = "Suspended"
+	HealthStatusPaused    HealthStatusCode = "Paused"
+	// Degraded is used if the resource status indicates a failure, or if the resource could not
+	// reach a healthy state within some timeout.
+	HealthStatusDegraded HealthStatusCode = "Degraded"
+	// Indicates that the resource is missing from the cluster.
+	HealthStatusMissing HealthStatusCode = "Missing"
+)
+
+// HealthStatusCode represents the health status of a resource.
+type HealthStatusCode string
+
+type HealthStatus struct {
+	Status  HealthStatusCode `json:"status,omitempty"`
+	Message string           `json:"message,omitempty"`
+}
+
+func getCorev1PodHealth(pod *corev1.Pod) *HealthStatus {
+	// This logic cannot be applied when pod.Spec.RestartPolicy is corev1.RestartPolicyOnFailure or
+	// corev1.RestartPolicyNever, because it would break the resource hook logic.
+	// The issue is that if we mark a pod with ImagePullBackOff as Degraded, and the pod is used as a
+	// resource hook, we will prematurely fail the PreSync/PostSync hook. Meanwhile, when that error
+	// condition is resolved (e.g. the image becomes available), the resource hook pod will unexpectedly
+	// be executed even though the sync has completed.
+	if pod.Spec.RestartPolicy == corev1.RestartPolicyAlways {
+		var status HealthStatusCode
+		var messages []string
+
+		for _, containerStatus := range pod.Status.ContainerStatuses {
+			waiting := containerStatus.State.Waiting
+			// Article listing common container errors: https://medium.com/kokster/debugging-crashloopbackoffs-with-init-containers-26f79e9fb5bf
+			if waiting != nil && (strings.HasPrefix(waiting.Reason, "Err") || strings.HasSuffix(waiting.Reason, "Error") || strings.HasSuffix(waiting.Reason, "BackOff")) {
+				status = HealthStatusDegraded
+				messages = append(messages, waiting.Message)
+			}
+		}
+
+		if status != "" {
+			return &HealthStatus{
+				Status:  status,
+				Message: strings.Join(messages, ", "),
+			}
+		}
+	}
+
+	getFailMessage := func(ctr *corev1.ContainerStatus) string {
+		if ctr.State.Terminated != nil {
+			if ctr.State.Terminated.Message != "" {
+				return ctr.State.Terminated.Message
+			}
+			if ctr.State.Terminated.Reason == "OOMKilled" {
+				return ctr.State.Terminated.Reason
+			}
+			if ctr.State.Terminated.ExitCode != 0 {
+				return fmt.Sprintf("container %q failed with exit code %d", ctr.Name, ctr.State.Terminated.ExitCode)
+			}
+		}
+		return ""
+	}
+
+	switch pod.Status.Phase {
+	case corev1.PodPending:
+		return &HealthStatus{
+			Status:  HealthStatusProgressing,
+			Message: pod.Status.Message,
+		}
+	case corev1.PodSucceeded:
+		return &HealthStatus{
+			Status:  HealthStatusHealthy,
+			Message: pod.Status.Message,
+		}
+	case corev1.PodFailed:
+		if pod.Status.Message != "" {
+			// The pod has a descriptive error message. Use that.
+			return &HealthStatus{Status: HealthStatusDegraded, Message: pod.Status.Message}
+		}
+		for _, ctr := range append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...) {
+			if msg := getFailMessage(&ctr); msg != "" {
+				return &HealthStatus{Status: HealthStatusDegraded, Message: msg}
+			}
+		}
+
+		return &HealthStatus{Status: HealthStatusDegraded, Message: ""}
+	case corev1.PodRunning:
+		switch pod.Spec.RestartPolicy {
+		case corev1.RestartPolicyAlways:
+			// If the pod is ready, it is automatically healthy.
+			if podutils.IsPodReady(pod) {
+				return &HealthStatus{
+					Status:  HealthStatusHealthy,
+					Message: pod.Status.Message,
+				}
+			}
+			// If it is not ready, check whether any container has terminated; if so, the pod is degraded.
+			for _, ctrStatus := range pod.Status.ContainerStatuses {
+				if ctrStatus.LastTerminationState.Terminated != nil {
+					return &HealthStatus{
+						Status:  HealthStatusDegraded,
+						Message: pod.Status.Message,
+					}
+				}
+			}
+			// Otherwise we are progressing towards a ready state.
+			return &HealthStatus{
+				Status:  HealthStatusProgressing,
+				Message: pod.Status.Message,
+			}
+		case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever:
+			// Pods with a restart policy of OnFailure or Never have a finite life.
+			// These pods are typically resource hooks, so we consider them
+			// Progressing instead of Healthy.
+			return &HealthStatus{
+				Status:  HealthStatusProgressing,
+				Message: pod.Status.Message,
+			}
+		}
+	}
+	return &HealthStatus{
+		Status:  HealthStatusUnknown,
+		Message: pod.Status.Message,
+	}
+}
+
+func getBatchv1JobHealth(job *batchv1.Job) *HealthStatus {
+	failed := false
+	var failMsg string
+	complete := false
+	var message string
+	isSuspended := false
+	for _, condition := range job.Status.Conditions {
+		switch condition.Type {
+		case batchv1.JobFailed:
+			failed = true
+			complete = true
+			failMsg = condition.Message
+		case batchv1.JobComplete:
+			complete = true
+			message = condition.Message
+		case batchv1.JobSuspended:
+			complete = true
+			message = condition.Message
+			if condition.Status == corev1.ConditionTrue {
+				isSuspended = true
+			}
+		}
+	}
+
+	if !complete {
+		return &HealthStatus{
+			Status:  HealthStatusProgressing,
+			Message: message,
+		}
+	}
+	if failed {
+		return &HealthStatus{
+			Status:  HealthStatusDegraded,
+			Message: failMsg,
+		}
+	}
+	if isSuspended {
+		return &HealthStatus{
+			Status:  HealthStatusSuspended,
+			Message: message,
+		}
+	}
+
+	return &HealthStatus{
+		Status:  HealthStatusHealthy,
+		Message: message,
+	}
+}
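For reference, a minimal sketch (not part of the patch) of how the two helpers above behave. The helper functions come from the hunk itself; the pod and job fixtures, container name, and messages are invented for illustration, with the expected results shown in comments.

package controllers

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
)

// examplePodHealth classifies a long-running pod stuck in CrashLoopBackOff.
func examplePodHealth() {
	pod := &corev1.Pod{
		Spec: corev1.PodSpec{RestartPolicy: corev1.RestartPolicyAlways},
		Status: corev1.PodStatus{
			Phase: corev1.PodRunning,
			ContainerStatuses: []corev1.ContainerStatus{{
				Name: "app",
				State: corev1.ContainerState{
					Waiting: &corev1.ContainerStateWaiting{
						Reason:  "CrashLoopBackOff",
						Message: "back-off 5m0s restarting failed container",
					},
				},
			}},
		},
	}
	// The "BackOff" suffix on the waiting reason triggers the Degraded branch.
	fmt.Println(getCorev1PodHealth(pod).Status) // Degraded
}

// exampleJobHealth surfaces the failure condition message of a failed Job.
func exampleJobHealth() {
	job := &batchv1.Job{
		Status: batchv1.JobStatus{
			Conditions: []batchv1.JobCondition{{
				Type:    batchv1.JobFailed,
				Status:  corev1.ConditionTrue,
				Message: "Job has reached the specified backoff limit",
			}},
		},
	}
	fmt.Println(getBatchv1JobHealth(job).Message) // Job has reached the specified backoff limit
}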
diff --git a/controllers/osartifact_controller.go b/controllers/osartifact_controller.go
index 51ef613..a7dfc0c 100644
--- a/controllers/osartifact_controller.go
+++ b/controllers/osartifact_controller.go
@@ -119,12 +119,7 @@ func (r *OSArtifactReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 		})
 		return ctrl.Result{}, TryToUpdateStatus(ctx, r.Client, artifact)
 	case osbuilder.Error:
-		meta.SetStatusCondition(&artifact.Status.Conditions, metav1.Condition{
-			Type:   "Ready",
-			Status: metav1.ConditionFalse,
-			Reason: "Error",
-		})
-		return ctrl.Result{}, TryToUpdateStatus(ctx, r.Client, artifact)
+		return ctrl.Result{}, nil
 	default:
 		return r.checkBuild(ctx, artifact)
 	}
@@ -210,6 +205,12 @@ func (r *OSArtifactReconciler) checkBuild(ctx context.Context, artifact *osbuild
 		return ctrl.Result{}, TryToUpdateStatus(ctx, r.Client, artifact)
 	case corev1.PodFailed:
 		artifact.Status.Phase = osbuilder.Error
+		meta.SetStatusCondition(&artifact.Status.Conditions, metav1.Condition{
+			Type:    "Ready",
+			Status:  metav1.ConditionFalse,
+			Reason:  "Error",
+			Message: getCorev1PodHealth(&pod).Message,
+		})
 		return ctrl.Result{}, TryToUpdateStatus(ctx, r.Client, artifact)
 	case corev1.PodPending, corev1.PodRunning:
 		return ctrl.Result{}, nil
@@ -275,6 +276,7 @@ func (r *OSArtifactReconciler) checkExport(ctx context.Context, artifact *osbuil
 			},
 		},
 		Spec: batchv1.JobSpec{
+			BackoffLimit: ptr(int32(1)),
 			Template: corev1.PodTemplateSpec{
 				Spec: corev1.PodSpec{
 					RestartPolicy: corev1.RestartPolicyOnFailure,
@@ -360,24 +362,35 @@ func (r *OSArtifactReconciler) checkExport(ctx context.Context, artifact *osbuil
 				return ctrl.Result{Requeue: true}, nil
 			}
-		} else if job.Spec.Completions == nil || *job.Spec.Completions == 1 {
+		} else if job.Spec.Completions != nil {
 			if job.Status.Succeeded > 0 {
 				artifact.Status.Phase = osbuilder.Ready
 				if err := TryToUpdateStatus(ctx, r.Client, artifact); err != nil {
 					log.FromContext(ctx).Error(err, "failed to update artifact status")
 					return ctrl.Result{}, err
 				}
-			}
-		} else if *job.Spec.BackoffLimit <= job.Status.Failed {
-			artifact.Status.Phase = osbuilder.Error
-			if err := TryToUpdateStatus(ctx, r.Client, artifact); err != nil {
-				log.FromContext(ctx).Error(err, "failed to update artifact status")
-				return ctrl.Result{}, err
+				return ctrl.Result{}, nil
+			} else if job.Status.Failed > 0 {
+				artifact.Status.Phase = osbuilder.Error
+				h := getBatchv1JobHealth(job)
+				if h.Status == HealthStatusDegraded {
+					meta.SetStatusCondition(&artifact.Status.Conditions, metav1.Condition{
+						Type:    "Ready",
+						Status:  metav1.ConditionFalse,
+						Reason:  "Error",
+						Message: h.Message,
+					})
+					if err := TryToUpdateStatus(ctx, r.Client, artifact); err != nil {
+						log.FromContext(ctx).Error(err, "failed to update artifact status")
+						return ctrl.Result{}, err
+					}
+					return ctrl.Result{}, nil
+				}
 			}
 		}
 	}
 
-	return ctrl.Result{}, nil
+	return requeue, nil
 }
 
 // SetupWithManager sets up the controller with the Manager.
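The controller hunks above call a ptr helper and return a package-level requeue result, neither of which appears in this patch; presumably both are defined elsewhere in the package. A minimal sketch consistent with that usage (the one-minute requeue interval is a guess, not taken from the repository):

package controllers

import (
	"time"

	ctrl "sigs.k8s.io/controller-runtime"
)

// ptr returns a pointer to v, e.g. BackoffLimit: ptr(int32(1)).
func ptr[T any](v T) *T {
	return &v
}

// requeue asks controller-runtime to reconcile again after a delay, so a
// still-running export Job is re-checked instead of being forgotten.
var requeue = ctrl.Result{RequeueAfter: time.Minute}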
diff --git a/go.mod b/go.mod
index 82f258d..3447323 100644
--- a/go.mod
+++ b/go.mod
@@ -3,12 +3,10 @@ module github.com/kairos-io/osbuilder
 
 go 1.23.4
 
 require (
-	github.com/onsi/ginkgo/v2 v2.22.2
-	github.com/onsi/gomega v1.36.2
-	github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5
 	k8s.io/api v0.32.1
 	k8s.io/apimachinery v0.32.1
 	k8s.io/client-go v0.32.1
+	k8s.io/kubectl v0.32.1
 	sigs.k8s.io/controller-runtime v0.20.0
 )
@@ -26,14 +24,12 @@ require (
 	github.com/go-openapi/jsonpointer v0.21.0 // indirect
 	github.com/go-openapi/jsonreference v0.20.2 // indirect
 	github.com/go-openapi/swag v0.23.0 // indirect
-	github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
 	github.com/gogo/protobuf v1.3.2 // indirect
 	github.com/golang/protobuf v1.5.4 // indirect
 	github.com/google/btree v1.1.3 // indirect
 	github.com/google/gnostic-models v0.6.8 // indirect
 	github.com/google/go-cmp v0.6.0 // indirect
 	github.com/google/gofuzz v1.2.0 // indirect
-	github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect
 	github.com/google/uuid v1.6.0 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
@@ -41,6 +37,8 @@
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.2 // indirect
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
+	github.com/onsi/ginkgo/v2 v2.22.2 // indirect
+	github.com/onsi/gomega v1.36.2 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/prometheus/client_golang v1.19.1 // indirect
 	github.com/prometheus/client_model v0.6.1 // indirect
@@ -57,7 +55,6 @@
 	golang.org/x/term v0.27.0 // indirect
 	golang.org/x/text v0.21.0 // indirect
 	golang.org/x/time v0.7.0 // indirect
-	golang.org/x/tools v0.28.0 // indirect
 	gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
 	google.golang.org/protobuf v1.36.1 // indirect
 	gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
diff --git a/go.sum b/go.sum
index dee6e44..1786f83 100644
--- a/go.sum
+++ b/go.sum
@@ -75,8 +75,6 @@ github.com/onsi/ginkgo/v2 v2.22.2 h1:/3X8Panh8/WwhU/3Ssa6rCKqPLuAkVY2I0RoyDLySlU
 github.com/onsi/ginkgo/v2 v2.22.2/go.mod h1:oeMosUL+8LtarXBHu/c0bx2D/K9zyQ6uX3cTyztHwsk=
 github.com/onsi/gomega v1.36.2 h1:koNYke6TVk6ZmnyHrCXba/T/MoLBXFjeC1PtvYgw0A8=
 github.com/onsi/gomega v1.36.2/go.mod h1:DdwyADRjrc825LhMEkD76cHR5+pUnjhUN8GlHlRPHzY=
-github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5 h1:Ii+DKncOVM8Cu1Hc+ETb5K+23HdAMvESYE3ZJ5b5cMI=
-github.com/phayes/freeport v0.0.0-20220201140144-74d24b5ae9f5/go.mod h1:iIss55rKnNBTvrwdmkUpLnDpZoAHvWaiq5+iMmen4AE=
 github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
@@ -180,6 +178,8 @@ k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
 k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
 k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f h1:GA7//TjRY9yWGy1poLzYYJJ4JRdzg3+O6e8I+e+8T5Y=
 k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f/go.mod h1:R/HEjbvWI0qdfb8viZUeVZm0X6IZnxAydC7YU42CMw4=
+k8s.io/kubectl v0.32.1 h1:/btLtXLQUU1rWx8AEvX9jrb9LaI6yeezt3sFALhB8M8=
+k8s.io/kubectl v0.32.1/go.mod h1:sezNuyWi1STk4ZNPVRIFfgjqMI6XMf+oCVLjZen/pFQ=
 k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 h1:M3sRQVHv7vB20Xc2ybTt7ODCeFj6JSWYFzOFnYeS6Ro=
 k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0=
 sigs.k8s.io/controller-runtime v0.20.0 h1:jjkMo29xEXH+02Md9qaVXfEIaMESSpy3TBWPrsfQkQs=