osbuilder/controllers/kubernetes.go
2025-01-27 14:52:55 +01:00

216 lines
6.6 KiB
Go

package controllers
import (
"context"
"fmt"
"reflect"
"strings"
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/client-go/util/retry"
"k8s.io/kubectl/pkg/util/podutils"
ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
)
// TryToUpdateStatus patches the status subresource of object so that it
// matches the state object has when this function is called, retrying via
// retry.RetryOnConflict with retry.DefaultRetry.
//
// On every attempt the current state is fetched into object, which overwrites
// the caller's copy. If the fetched state is deeply equal to the desired state
// nothing is sent and nil is returned; otherwise a status patch built with
// MergeFrom(current) is issued for the desired state.
//
// It returns an error if the object cannot be copied, if fetching the current
// state fails (the underlying error is wrapped), or if the patch fails.
func TryToUpdateStatus(ctx context.Context, client ctrlruntimeclient.Client, object ctrlruntimeclient.Object) error {
	key := ctrlruntimeclient.ObjectKeyFromObject(object)

	// Snapshot the desired state exactly once, before the first fetch.
	// client.Get below overwrites object, so taking the snapshot inside the
	// retry closure would make every retry start from the previously fetched
	// server state and silently drop the caller's changes.
	desired, ok := object.DeepCopyObject().(ctrlruntimeclient.Object)
	if !ok {
		return fmt.Errorf("could not copy %s/%s: deep copy of %T is not a client object", key.Namespace, key.Name, object)
	}

	return retry.RetryOnConflict(retry.DefaultRetry, func() error {
		// Refresh object with the current state from the API server.
		if err := client.Get(ctx, key, object); err != nil {
			return fmt.Errorf("could not fetch current %s/%s state, got error: %w", key.Namespace, key.Name, err)
		}

		// Nothing to do when the current state already matches the desired one.
		if reflect.DeepEqual(object, desired) {
			return nil
		}

		return client.Status().Patch(ctx, desired, ctrlruntimeclient.MergeFrom(object))
	})
}
// HealthStatusCode is the string code that represents a resource's health.
type HealthStatusCode string

// Health status codes that can be reported for a resource.
const (
	// HealthStatusUnknown indicates that the health assessment failed and the
	// actual health status is unknown.
	HealthStatusUnknown = HealthStatusCode("Unknown")

	// HealthStatusProgressing means that the resource is not healthy yet but
	// still has a chance to reach a healthy state.
	HealthStatusProgressing = HealthStatusCode("Progressing")

	// HealthStatusHealthy means the resource is 100% healthy.
	HealthStatusHealthy = HealthStatusCode("Healthy")

	// HealthStatusSuspended is assigned to resources that are suspended. The
	// typical example is a
	// [suspended](https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#suspend) CronJob.
	HealthStatusSuspended = HealthStatusCode("Suspended")

	// HealthStatusPaused is assigned to resources that are paused.
	HealthStatusPaused = HealthStatusCode("Paused")

	// HealthStatusDegraded is used if the resource status indicates a failure
	// or the resource could not reach a healthy state within some timeout.
	HealthStatusDegraded = HealthStatusCode("Degraded")

	// HealthStatusMissing indicates that the resource is missing in the cluster.
	HealthStatusMissing = HealthStatusCode("Missing")
)

// HealthStatus pairs a health status code with a human-readable message.
// Both fields are omitted from the JSON encoding when empty.
type HealthStatus struct {
	Status  HealthStatusCode `json:"status,omitempty"`
	Message string           `json:"message,omitempty"`
}
// getCorev1PodHealth derives a HealthStatus for pod from its restart policy,
// phase and container statuses. The result is never nil:
//
//   - RestartPolicy Always with at least one container waiting on an
//     error-like reason (prefix "Err", suffix "Error" or "BackOff"): Degraded,
//     with the waiting messages joined by ", ".
//   - Phase Pending: Progressing.
//   - Phase Succeeded: Healthy.
//   - Phase Failed: Degraded, using the pod message or, if that is empty, the
//     first failure message found among init containers and then containers.
//   - Phase Running: with RestartPolicy Always it is Healthy when the pod is
//     ready, Degraded when any container has a previous termination, and
//     Progressing otherwise; with OnFailure or Never it is Progressing.
//   - Anything else: Unknown.
func getCorev1PodHealth(pod *corev1.Pod) *HealthStatus {
	// This logic cannot be applied when the pod.Spec.RestartPolicy is: corev1.RestartPolicyOnFailure,
	// corev1.RestartPolicyNever, otherwise it breaks the resource hook logic.
	// The issue is, if we mark a pod with ImagePullBackOff as Degraded, and the pod is used as a resource hook,
	// then we will prematurely fail the PreSync/PostSync hook. Meanwhile, when that error condition is resolved
	// (e.g. the image is available), the resource hook pod will unexpectedly be executed even though the sync has
	// completed.
	if pod.Spec.RestartPolicy == corev1.RestartPolicyAlways {
		var status HealthStatusCode
		var messages []string

		for _, containerStatus := range pod.Status.ContainerStatuses {
			waiting := containerStatus.State.Waiting
			// Article listing common container errors: https://medium.com/kokster/debugging-crashloopbackoffs-with-init-containers-26f79e9fb5bf
			if waiting != nil && (strings.HasPrefix(waiting.Reason, "Err") || strings.HasSuffix(waiting.Reason, "Error") || strings.HasSuffix(waiting.Reason, "BackOff")) {
				status = HealthStatusDegraded
				messages = append(messages, waiting.Message)
			}
		}

		if status != "" {
			return &HealthStatus{
				Status:  status,
				Message: strings.Join(messages, ", "),
			}
		}
	}

	// getFailMessage returns a description of why a terminated container
	// failed, or "" if the container has not terminated or shows no failure.
	getFailMessage := func(ctr *corev1.ContainerStatus) string {
		if ctr.State.Terminated != nil {
			if ctr.State.Terminated.Message != "" {
				return ctr.State.Terminated.Message
			}
			if ctr.State.Terminated.Reason == "OOMKilled" {
				return ctr.State.Terminated.Reason
			}
			if ctr.State.Terminated.ExitCode != 0 {
				return fmt.Sprintf("container %q failed with exit code %d", ctr.Name, ctr.State.Terminated.ExitCode)
			}
		}
		return ""
	}

	switch pod.Status.Phase {
	case corev1.PodPending:
		return &HealthStatus{
			Status:  HealthStatusProgressing,
			Message: pod.Status.Message,
		}
	case corev1.PodSucceeded:
		return &HealthStatus{
			Status:  HealthStatusHealthy,
			Message: pod.Status.Message,
		}
	case corev1.PodFailed:
		if pod.Status.Message != "" {
			// Pod has a nice error message. Use that.
			return &HealthStatus{Status: HealthStatusDegraded, Message: pod.Status.Message}
		}

		// Check init containers first, then regular containers. The slices are
		// walked one after the other instead of being combined with append:
		// appending to InitContainerStatuses could write into spare capacity of
		// the pod's own backing array.
		for _, statuses := range [][]corev1.ContainerStatus{
			pod.Status.InitContainerStatuses,
			pod.Status.ContainerStatuses,
		} {
			for i := range statuses {
				if msg := getFailMessage(&statuses[i]); msg != "" {
					return &HealthStatus{Status: HealthStatusDegraded, Message: msg}
				}
			}
		}

		return &HealthStatus{Status: HealthStatusDegraded, Message: ""}
	case corev1.PodRunning:
		switch pod.Spec.RestartPolicy {
		case corev1.RestartPolicyAlways:
			// if pod is ready, it is automatically healthy
			if podutils.IsPodReady(pod) {
				return &HealthStatus{
					Status:  HealthStatusHealthy,
					Message: pod.Status.Message,
				}
			}
			// if it's not ready, check to see if any container terminated, if so, it's degraded
			for _, ctrStatus := range pod.Status.ContainerStatuses {
				if ctrStatus.LastTerminationState.Terminated != nil {
					return &HealthStatus{
						Status:  HealthStatusDegraded,
						Message: pod.Status.Message,
					}
				}
			}
			// otherwise we are progressing towards a ready state
			return &HealthStatus{
				Status:  HealthStatusProgressing,
				Message: pod.Status.Message,
			}
		case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever:
			// pods set with a restart policy of OnFailure or Never, have a finite life.
			// These pods are typically resource hooks. Thus, we consider these as Progressing
			// instead of healthy.
			return &HealthStatus{
				Status:  HealthStatusProgressing,
				Message: pod.Status.Message,
			}
		}
	}

	// Unrecognised phase, or a running pod with an unrecognised restart policy.
	return &HealthStatus{
		Status:  HealthStatusUnknown,
		Message: pod.Status.Message,
	}
}
// getBatchv1JobHealth derives a HealthStatus for job from its status
// conditions. The result is never nil:
//
//   - no Failed, Complete or Suspended condition present: Progressing.
//   - a Failed condition present: Degraded, with that condition's message.
//   - a Suspended condition with status True: Suspended, with that
//     condition's message.
//   - otherwise: Healthy, with the message of the last Complete or Suspended
//     condition seen.
//
// NOTE(review): a Suspended condition whose status is not True still marks the
// job as complete, so such a job is reported Healthy rather than Progressing.
// Likewise the status of Failed and Complete conditions is not inspected.
// Confirm this is intended.
func getBatchv1JobHealth(job *batchv1.Job) *HealthStatus {
	var (
		failed      bool
		complete    bool
		isSuspended bool
		failMsg     string
		suspendMsg  string
		message     string
	)

	for _, condition := range job.Status.Conditions {
		switch condition.Type {
		case batchv1.JobFailed:
			failed = true
			complete = true
			failMsg = condition.Message
		case batchv1.JobComplete:
			complete = true
			message = condition.Message
		case batchv1.JobSuspended:
			complete = true
			message = condition.Message
			if condition.Status == corev1.ConditionTrue {
				isSuspended = true
				// Keep the suspension message separately so it is the one
				// reported for the Suspended status below.
				suspendMsg = condition.Message
			}
		}
	}

	if !complete {
		return &HealthStatus{
			Status:  HealthStatusProgressing,
			Message: message,
		}
	}

	if failed {
		return &HealthStatus{
			Status:  HealthStatusDegraded,
			Message: failMsg,
		}
	}

	if isSuspended {
		// Report the suspension condition's message. The failure message is
		// always empty here because failed jobs have already returned above.
		return &HealthStatus{
			Status:  HealthStatusSuspended,
			Message: suspendMsg,
		}
	}

	return &HealthStatus{
		Status:  HealthStatusHealthy,
		Message: message,
	}
}