mirror of
https://github.com/kairos-io/osbuilder.git
synced 2025-06-27 07:26:54 +00:00
216 lines
6.6 KiB
Go
216 lines
6.6 KiB
Go
package controllers
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"reflect"
|
|
"strings"
|
|
|
|
batchv1 "k8s.io/api/batch/v1"
|
|
corev1 "k8s.io/api/core/v1"
|
|
|
|
"k8s.io/client-go/util/retry"
|
|
"k8s.io/kubectl/pkg/util/podutils"
|
|
ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
|
|
)
|
|
|
|
func TryToUpdateStatus(ctx context.Context, client ctrlruntimeclient.Client, object ctrlruntimeclient.Object) error {
|
|
key := ctrlruntimeclient.ObjectKeyFromObject(object)
|
|
|
|
return retry.RetryOnConflict(retry.DefaultRetry, func() error {
|
|
original := object.DeepCopyObject().(ctrlruntimeclient.Object)
|
|
if err := client.Get(ctx, key, object); err != nil {
|
|
return fmt.Errorf("could not fetch current %s/%s state, got error: %+v", object.GetName(), object.GetNamespace(), err)
|
|
}
|
|
|
|
if reflect.DeepEqual(object, original) {
|
|
return nil
|
|
}
|
|
|
|
return client.Status().Patch(ctx, original, ctrlruntimeclient.MergeFrom(object))
|
|
})
|
|
|
|
}
|
|
|
|
// HealthStatusCode represents the health status of a resource.
type HealthStatusCode string

// Health status codes a resource can be reported with.
const (
	// HealthStatusUnknown indicates that the health assessment failed and the
	// actual health status is unknown.
	HealthStatusUnknown HealthStatusCode = "Unknown"
	// HealthStatusProgressing means that the resource is not healthy yet but
	// still has a chance to reach a healthy state.
	HealthStatusProgressing HealthStatusCode = "Progressing"
	// HealthStatusHealthy means the resource is 100% healthy.
	HealthStatusHealthy HealthStatusCode = "Healthy"
	// HealthStatusSuspended is assigned to resources that are suspended. The
	// typical example is a
	// [suspended](https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#suspend) CronJob.
	HealthStatusSuspended HealthStatusCode = "Suspended"
	// HealthStatusPaused is assigned to resources that are paused.
	HealthStatusPaused HealthStatusCode = "Paused"
	// HealthStatusDegraded is used if the resource status indicates a failure
	// or the resource could not reach a healthy state within some timeout.
	HealthStatusDegraded HealthStatusCode = "Degraded"
	// HealthStatusMissing indicates that the resource is missing in the cluster.
	HealthStatusMissing HealthStatusCode = "Missing"
)

// HealthStatus pairs a health status code with a free-form message. Both
// fields are omitted from JSON output when empty.
type HealthStatus struct {
	Status  HealthStatusCode `json:"status,omitempty"`
	Message string           `json:"message,omitempty"`
}
|
|
|
|
func getCorev1PodHealth(pod *corev1.Pod) *HealthStatus {
|
|
// This logic cannot be applied when the pod.Spec.RestartPolicy is: corev1.RestartPolicyOnFailure,
|
|
// corev1.RestartPolicyNever, otherwise it breaks the resource hook logic.
|
|
// The issue is, if we mark a pod with ImagePullBackOff as Degraded, and the pod is used as a resource hook,
|
|
// then we will prematurely fail the PreSync/PostSync hook. Meanwhile, when that error condition is resolved
|
|
// (e.g. the image is available), the resource hook pod will unexpectedly be executed even though the sync has
|
|
// completed.
|
|
if pod.Spec.RestartPolicy == corev1.RestartPolicyAlways {
|
|
var status HealthStatusCode
|
|
var messages []string
|
|
|
|
for _, containerStatus := range pod.Status.ContainerStatuses {
|
|
waiting := containerStatus.State.Waiting
|
|
// Article listing common container errors: https://medium.com/kokster/debugging-crashloopbackoffs-with-init-containers-26f79e9fb5bf
|
|
if waiting != nil && (strings.HasPrefix(waiting.Reason, "Err") || strings.HasSuffix(waiting.Reason, "Error") || strings.HasSuffix(waiting.Reason, "BackOff")) {
|
|
status = HealthStatusDegraded
|
|
messages = append(messages, waiting.Message)
|
|
}
|
|
}
|
|
|
|
if status != "" {
|
|
return &HealthStatus{
|
|
Status: status,
|
|
Message: strings.Join(messages, ", "),
|
|
}
|
|
}
|
|
}
|
|
|
|
getFailMessage := func(ctr *corev1.ContainerStatus) string {
|
|
if ctr.State.Terminated != nil {
|
|
if ctr.State.Terminated.Message != "" {
|
|
return ctr.State.Terminated.Message
|
|
}
|
|
if ctr.State.Terminated.Reason == "OOMKilled" {
|
|
return ctr.State.Terminated.Reason
|
|
}
|
|
if ctr.State.Terminated.ExitCode != 0 {
|
|
return fmt.Sprintf("container %q failed with exit code %d", ctr.Name, ctr.State.Terminated.ExitCode)
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
switch pod.Status.Phase {
|
|
case corev1.PodPending:
|
|
return &HealthStatus{
|
|
Status: HealthStatusProgressing,
|
|
Message: pod.Status.Message,
|
|
}
|
|
case corev1.PodSucceeded:
|
|
return &HealthStatus{
|
|
Status: HealthStatusHealthy,
|
|
Message: pod.Status.Message,
|
|
}
|
|
case corev1.PodFailed:
|
|
if pod.Status.Message != "" {
|
|
// Pod has a nice error message. Use that.
|
|
return &HealthStatus{Status: HealthStatusDegraded, Message: pod.Status.Message}
|
|
}
|
|
for _, ctr := range append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...) {
|
|
if msg := getFailMessage(&ctr); msg != "" {
|
|
return &HealthStatus{Status: HealthStatusDegraded, Message: msg}
|
|
}
|
|
}
|
|
|
|
return &HealthStatus{Status: HealthStatusDegraded, Message: ""}
|
|
case corev1.PodRunning:
|
|
switch pod.Spec.RestartPolicy {
|
|
case corev1.RestartPolicyAlways:
|
|
// if pod is ready, it is automatically healthy
|
|
if podutils.IsPodReady(pod) {
|
|
return &HealthStatus{
|
|
Status: HealthStatusHealthy,
|
|
Message: pod.Status.Message,
|
|
}
|
|
}
|
|
// if it's not ready, check to see if any container terminated, if so, it's degraded
|
|
for _, ctrStatus := range pod.Status.ContainerStatuses {
|
|
if ctrStatus.LastTerminationState.Terminated != nil {
|
|
return &HealthStatus{
|
|
Status: HealthStatusDegraded,
|
|
Message: pod.Status.Message,
|
|
}
|
|
}
|
|
}
|
|
// otherwise we are progressing towards a ready state
|
|
return &HealthStatus{
|
|
Status: HealthStatusProgressing,
|
|
Message: pod.Status.Message,
|
|
}
|
|
case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever:
|
|
// pods set with a restart policy of OnFailure or Never, have a finite life.
|
|
// These pods are typically resource hooks. Thus, we consider these as Progressing
|
|
// instead of healthy.
|
|
return &HealthStatus{
|
|
Status: HealthStatusProgressing,
|
|
Message: pod.Status.Message,
|
|
}
|
|
}
|
|
}
|
|
return &HealthStatus{
|
|
Status: HealthStatusUnknown,
|
|
Message: pod.Status.Message,
|
|
}
|
|
}
|
|
|
|
func getBatchv1JobHealth(job *batchv1.Job) *HealthStatus {
|
|
failed := false
|
|
var failMsg string
|
|
complete := false
|
|
var message string
|
|
isSuspended := false
|
|
for _, condition := range job.Status.Conditions {
|
|
switch condition.Type {
|
|
case batchv1.JobFailed:
|
|
failed = true
|
|
complete = true
|
|
failMsg = condition.Message
|
|
case batchv1.JobComplete:
|
|
complete = true
|
|
message = condition.Message
|
|
case batchv1.JobSuspended:
|
|
complete = true
|
|
message = condition.Message
|
|
if condition.Status == corev1.ConditionTrue {
|
|
isSuspended = true
|
|
}
|
|
}
|
|
}
|
|
|
|
if !complete {
|
|
return &HealthStatus{
|
|
Status: HealthStatusProgressing,
|
|
Message: message,
|
|
}
|
|
}
|
|
if failed {
|
|
return &HealthStatus{
|
|
Status: HealthStatusDegraded,
|
|
Message: failMsg,
|
|
}
|
|
}
|
|
if isSuspended {
|
|
return &HealthStatus{
|
|
Status: HealthStatusSuspended,
|
|
Message: failMsg,
|
|
}
|
|
}
|
|
|
|
return &HealthStatus{
|
|
Status: HealthStatusHealthy,
|
|
Message: message,
|
|
}
|
|
|
|
}
|