mirror of https://github.com/kairos-io/osbuilder.git
synced 2025-09-05 01:04:16 +00:00
improve error handling
@@ -4,8 +4,13 @@ import (
	"context"
	"fmt"
	"reflect"
	"strings"

	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"

	"k8s.io/client-go/util/retry"
	"k8s.io/kubectl/pkg/util/podutils"
	ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
)
@@ -26,3 +31,185 @@ func TryToUpdateStatus(ctx context.Context, client ctrlruntimeclient.Client, obj
	})

}
const (
	// HealthStatusUnknown indicates that the health assessment failed and the actual health status is unknown.
	HealthStatusUnknown HealthStatusCode = "Unknown"
	// HealthStatusProgressing means that the resource is not healthy yet but still has a chance to reach a healthy state.
	HealthStatusProgressing HealthStatusCode = "Progressing"
	// HealthStatusHealthy means the resource is 100% healthy.
	HealthStatusHealthy HealthStatusCode = "Healthy"
	// HealthStatusSuspended is assigned to resources that are suspended or paused. The typical example is a
	// [suspended](https://kubernetes.io/docs/tasks/job/automated-tasks-with-cron-jobs/#suspend) CronJob.
	HealthStatusSuspended HealthStatusCode = "Suspended"
	HealthStatusPaused    HealthStatusCode = "Paused"
	// HealthStatusDegraded is used if the resource status indicates failure, or if the resource could not reach a
	// healthy state within some timeout.
	HealthStatusDegraded HealthStatusCode = "Degraded"
	// HealthStatusMissing indicates that the resource is missing from the cluster.
	HealthStatusMissing HealthStatusCode = "Missing"
)

// HealthStatusCode represents the health status of a resource.
type HealthStatusCode string

type HealthStatus struct {
	Status  HealthStatusCode `json:"status,omitempty"`
	Message string           `json:"message,omitempty"`
}
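For context, a minimal sketch of how a caller could route an object to the matching health check. The dispatch helper below is an assumption for illustration only; it is not part of this commit, though it reuses the types and functions introduced here and the file's existing imports.

// getObjectHealth is a hypothetical caller-side helper (not in this commit):
// it routes a client object to the health check that understands its kind.
func getObjectHealth(obj ctrlruntimeclient.Object) *HealthStatus {
	switch o := obj.(type) {
	case *corev1.Pod:
		return getCorev1PodHealth(o)
	case *batchv1.Job:
		return getBatchv1JobHealth(o)
	default:
		return &HealthStatus{Status: HealthStatusUnknown}
	}
}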
func getCorev1PodHealth(pod *corev1.Pod) *HealthStatus {
	// This logic cannot be applied when pod.Spec.RestartPolicy is corev1.RestartPolicyOnFailure or
	// corev1.RestartPolicyNever, otherwise it breaks the resource hook logic.
	// The issue is: if we mark a pod with ImagePullBackOff as Degraded and the pod is used as a resource hook,
	// then we prematurely fail the PreSync/PostSync hook. Meanwhile, when that error condition is resolved
	// (e.g. the image becomes available), the resource hook pod will unexpectedly be executed even though the
	// sync has completed.
	if pod.Spec.RestartPolicy == corev1.RestartPolicyAlways {
		var status HealthStatusCode
		var messages []string

		for _, containerStatus := range pod.Status.ContainerStatuses {
			waiting := containerStatus.State.Waiting
			// Article listing common container errors: https://medium.com/kokster/debugging-crashloopbackoffs-with-init-containers-26f79e9fb5bf
			if waiting != nil && (strings.HasPrefix(waiting.Reason, "Err") || strings.HasSuffix(waiting.Reason, "Error") || strings.HasSuffix(waiting.Reason, "BackOff")) {
				status = HealthStatusDegraded
				messages = append(messages, waiting.Message)
			}
		}

		if status != "" {
			return &HealthStatus{
				Status:  status,
				Message: strings.Join(messages, ", "),
			}
		}
	}

	getFailMessage := func(ctr *corev1.ContainerStatus) string {
		if ctr.State.Terminated != nil {
			if ctr.State.Terminated.Message != "" {
				return ctr.State.Terminated.Message
			}
			if ctr.State.Terminated.Reason == "OOMKilled" {
				return ctr.State.Terminated.Reason
			}
			if ctr.State.Terminated.ExitCode != 0 {
				return fmt.Sprintf("container %q failed with exit code %d", ctr.Name, ctr.State.Terminated.ExitCode)
			}
		}
		return ""
	}

	switch pod.Status.Phase {
	case corev1.PodPending:
		return &HealthStatus{
			Status:  HealthStatusProgressing,
			Message: pod.Status.Message,
		}
	case corev1.PodSucceeded:
		return &HealthStatus{
			Status:  HealthStatusHealthy,
			Message: pod.Status.Message,
		}
	case corev1.PodFailed:
		if pod.Status.Message != "" {
			// The pod has a meaningful error message. Use that.
			return &HealthStatus{Status: HealthStatusDegraded, Message: pod.Status.Message}
		}
		for _, ctr := range append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...) {
			if msg := getFailMessage(&ctr); msg != "" {
				return &HealthStatus{Status: HealthStatusDegraded, Message: msg}
			}
		}

		return &HealthStatus{Status: HealthStatusDegraded, Message: ""}
	case corev1.PodRunning:
		switch pod.Spec.RestartPolicy {
		case corev1.RestartPolicyAlways:
			// If the pod is ready, it is automatically healthy.
			if podutils.IsPodReady(pod) {
				return &HealthStatus{
					Status:  HealthStatusHealthy,
					Message: pod.Status.Message,
				}
			}
			// If it is not ready, check whether any container terminated; if so, the pod is degraded.
			for _, ctrStatus := range pod.Status.ContainerStatuses {
				if ctrStatus.LastTerminationState.Terminated != nil {
					return &HealthStatus{
						Status:  HealthStatusDegraded,
						Message: pod.Status.Message,
					}
				}
			}
			// Otherwise we are progressing towards a ready state.
			return &HealthStatus{
				Status:  HealthStatusProgressing,
				Message: pod.Status.Message,
			}
		case corev1.RestartPolicyOnFailure, corev1.RestartPolicyNever:
			// Pods with a restart policy of OnFailure or Never have a finite life.
			// These pods are typically resource hooks, so we consider them Progressing
			// instead of Healthy.
			return &HealthStatus{
				Status:  HealthStatusProgressing,
				Message: pod.Status.Message,
			}
		}
	}

	return &HealthStatus{
		Status:  HealthStatusUnknown,
		Message: pod.Status.Message,
	}
}
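A short usage sketch (assumed, not from this commit): with RestartPolicyAlways, a container waiting in ImagePullBackOff makes the waiting-reason check above report the pod as Degraded. The image reference is made up.

// Hypothetical example: a pod stuck pulling its image is classified as Degraded.
pod := &corev1.Pod{
	Spec: corev1.PodSpec{RestartPolicy: corev1.RestartPolicyAlways},
	Status: corev1.PodStatus{
		ContainerStatuses: []corev1.ContainerStatus{{
			State: corev1.ContainerState{
				Waiting: &corev1.ContainerStateWaiting{
					Reason:  "ImagePullBackOff",
					Message: "Back-off pulling image \"registry.example/builder:missing\"",
				},
			},
		}},
	},
}
if hs := getCorev1PodHealth(pod); hs.Status == HealthStatusDegraded {
	fmt.Println(hs.Message) // prints the back-off message collected above
}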
func getBatchv1JobHealth(job *batchv1.Job) *HealthStatus {
	failed := false
	var failMsg string
	complete := false
	var message string
	isSuspended := false
	for _, condition := range job.Status.Conditions {
		switch condition.Type {
		case batchv1.JobFailed:
			failed = true
			complete = true
			failMsg = condition.Message
		case batchv1.JobComplete:
			complete = true
			message = condition.Message
		case batchv1.JobSuspended:
			complete = true
			message = condition.Message
			if condition.Status == corev1.ConditionTrue {
				isSuspended = true
			}
		}
	}

	if !complete {
		return &HealthStatus{
			Status:  HealthStatusProgressing,
			Message: message,
		}
	}
	if failed {
		return &HealthStatus{
			Status:  HealthStatusDegraded,
			Message: failMsg,
		}
	}
	if isSuspended {
		return &HealthStatus{
			Status:  HealthStatusSuspended,
			Message: message,
		}
	}

	return &HealthStatus{
		Status:  HealthStatusHealthy,
		Message: message,
	}
}
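A sketch of how a reconciler might translate the job health into requeue behaviour. The helper below and its return contract are illustrative assumptions, not code from this change; osbuilder's actual controller logic is not shown in this diff.

// reconcileJobHealth is a hypothetical caller: it maps job health to reconcile behaviour.
func reconcileJobHealth(job *batchv1.Job) (requeue bool, err error) {
	switch hs := getBatchv1JobHealth(job); hs.Status {
	case HealthStatusHealthy:
		return false, nil // the job finished successfully; nothing left to do
	case HealthStatusDegraded:
		return false, fmt.Errorf("job %s failed: %s", job.Name, hs.Message)
	default:
		// Progressing, Suspended, or Unknown: check again on the next reconcile.
		return true, nil
	}
}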