Mirror of https://github.com/k3s-io/kubernetes.git
Merge pull request #36017 from foxish/kubectl-new-2
Automatic merge from submit-queue

Set reason and message on Pod during nodecontroller eviction

**What this PR does / why we need it**: Pods that are evicted by the nodecontroller due to network partition or an unresponsive kubelet should be differentiated from terminations initiated by other sources. The reason/message set here are consumed by kubectl to provide a better summary in `get`/`describe` output.

**Which issue this PR fixes** *(optional, in `fixes #<issue number>(, #<issue_number>, ...)` format, will close that issue when PR gets merged)*: fixes #35725

**Release note**:
```release-note
Pods that are terminating due to eviction by the nodecontroller (typically due to an unresponsive kubelet or network partition) now surface in `kubectl get` output as being in state "Unknown", along with a longer description in `kubectl describe` output.
```
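For illustration only (the pod name, ready count, and age below are hypothetical), a pod stranded on a partitioned node would now surface in `kubectl get pods` roughly as:

```
NAME      READY     STATUS    RESTARTS   AGE
web-0     1/1       Unknown   0          12m
```

Before this change, such a pod showed the generic "Terminating" state (see the printPodBase hunk further down) until it was removed from the apiserver.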
commit f2b5600567
@@ -41,6 +41,7 @@ go_library(
        "//pkg/runtime:go_default_library",
        "//pkg/types:go_default_library",
        "//pkg/util/clock:go_default_library",
        "//pkg/util/errors:go_default_library",
        "//pkg/util/flowcontrol:go_default_library",
        "//pkg/util/metrics:go_default_library",
        "//pkg/util/node:go_default_library",
@@ -73,6 +74,7 @@ go_test(
        "//pkg/client/cache:go_default_library",
        "//pkg/client/clientset_generated/internalclientset:go_default_library",
        "//pkg/client/clientset_generated/internalclientset/fake:go_default_library",
        "//pkg/client/testing/core:go_default_library",
        "//pkg/cloudprovider:go_default_library",
        "//pkg/cloudprovider/providers/fake:go_default_library",
        "//pkg/controller:go_default_library",
@@ -80,6 +82,7 @@ go_test(
        "//pkg/types:go_default_library",
        "//pkg/util/diff:go_default_library",
        "//pkg/util/flowcontrol:go_default_library",
        "//pkg/util/node:go_default_library",
        "//pkg/util/sets:go_default_library",
        "//pkg/util/wait:go_default_library",
        "//vendor:github.com/golang/glog",

@@ -21,6 +21,7 @@ import (
	"strings"

	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/pkg/api/errors"
	"k8s.io/kubernetes/pkg/client/cache"
	clientset "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset"
	"k8s.io/kubernetes/pkg/client/record"
@@ -28,6 +29,8 @@ import (
	"k8s.io/kubernetes/pkg/fields"
	"k8s.io/kubernetes/pkg/kubelet/util/format"
	"k8s.io/kubernetes/pkg/types"
	utilerrors "k8s.io/kubernetes/pkg/util/errors"
	"k8s.io/kubernetes/pkg/util/node"
	utilruntime "k8s.io/kubernetes/pkg/util/runtime"
	"k8s.io/kubernetes/pkg/version"

@@ -46,6 +49,8 @@ func deletePods(kubeClient clientset.Interface, recorder record.EventRecorder, n
	selector := fields.OneTermEqualSelector(api.PodHostField, nodeName)
	options := api.ListOptions{FieldSelector: selector}
	pods, err := kubeClient.Core().Pods(api.NamespaceAll).List(options)
	var updateErrList []error

	if err != nil {
		return remaining, err
	}
@@ -59,6 +64,15 @@ func deletePods(kubeClient clientset.Interface, recorder record.EventRecorder, n
		if pod.Spec.NodeName != nodeName {
			continue
		}

		// Set reason and message in the pod object.
		if _, err = setPodTerminationReason(kubeClient, &pod, nodeName); err != nil {
			if errors.IsConflict(err) {
				updateErrList = append(updateErrList,
					fmt.Errorf("update status failed for pod %q: %v", format.Pod(&pod), err))
				continue
			}
		}
		// If the pod has already been marked for deletion, we still report that pods remain.
		if pod.DeletionGracePeriodSeconds != nil {
			remaining = true
@@ -77,9 +91,31 @@ func deletePods(kubeClient clientset.Interface, recorder record.EventRecorder, n
		}
		remaining = true
	}

	if len(updateErrList) > 0 {
		return false, utilerrors.NewAggregate(updateErrList)
	}
	return remaining, nil
}

// setPodTerminationReason attempts to set a reason and message in the pod status, updates it in the apiserver,
// and returns an error if it encounters one.
func setPodTerminationReason(kubeClient clientset.Interface, pod *api.Pod, nodeName string) (*api.Pod, error) {
	if pod.Status.Reason == node.NodeUnreachablePodReason {
		return pod, nil
	}

	pod.Status.Reason = node.NodeUnreachablePodReason
	pod.Status.Message = fmt.Sprintf(node.NodeUnreachablePodMessage, nodeName, pod.Name)

	var updatedPod *api.Pod
	var err error
	if updatedPod, err = kubeClient.Core().Pods(pod.Namespace).UpdateStatus(pod); err != nil {
		return nil, err
	}
	return updatedPod, nil
}

func forcefullyDeletePod(c clientset.Interface, pod *api.Pod) error {
	var zero int64
	err := c.Core().Pods(pod.Namespace).Delete(pod.Name, &api.DeleteOptions{GracePeriodSeconds: &zero})

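Note how setPodTerminationReason treats a conflict as non-fatal for the batch: the error is recorded in updateErrList and the loop moves on. A caller that preferred to retry instead could wrap the same UpdateStatus call as sketched below; updateStatusWithRetry is a hypothetical helper, not part of this PR, and it assumes the same internal clientset and errors packages imported in the hunk above:

```go
// updateStatusWithRetry (illustrative name) applies a status mutation and
// retries a bounded number of times when the apiserver reports a conflict,
// i.e. when another writer changed the pod between our read and our write.
func updateStatusWithRetry(kubeClient clientset.Interface, pod *api.Pod, mutate func(*api.Pod)) (*api.Pod, error) {
	const maxRetries = 3 // illustrative bound, not from this PR
	var lastErr error
	for i := 0; i < maxRetries; i++ {
		mutate(pod)
		updated, err := kubeClient.Core().Pods(pod.Namespace).UpdateStatus(pod)
		if err == nil {
			return updated, nil
		}
		if !errors.IsConflict(err) {
			return nil, err // only conflicts are worth retrying here
		}
		lastErr = err
		// Re-read the pod so the next attempt starts from the latest resourceVersion.
		if pod, err = kubeClient.Core().Pods(pod.Namespace).Get(pod.Name); err != nil {
			return nil, err
		}
	}
	return nil, lastErr
}
```

Retrying only on IsConflict mirrors the PR's own distinction: any other update failure is returned to the caller unchanged.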
@@ -29,12 +29,14 @@ import (
	"k8s.io/kubernetes/pkg/client/cache"
	clientset "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset"
	"k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset/fake"
	testcore "k8s.io/kubernetes/pkg/client/testing/core"
	"k8s.io/kubernetes/pkg/cloudprovider"
	fakecloud "k8s.io/kubernetes/pkg/cloudprovider/providers/fake"
	"k8s.io/kubernetes/pkg/controller"
	"k8s.io/kubernetes/pkg/controller/informers"
	"k8s.io/kubernetes/pkg/types"
	"k8s.io/kubernetes/pkg/util/diff"
	"k8s.io/kubernetes/pkg/util/node"
	"k8s.io/kubernetes/pkg/util/wait"
)

@@ -538,6 +540,145 @@ func TestMonitorNodeStatusEvictPods(t *testing.T) {
	}
}

func TestPodStatusChange(t *testing.T) {
	fakeNow := unversioned.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
	evictionTimeout := 10 * time.Minute

	// Because of the logic that prevents NC from evicting anything when all Nodes are NotReady,
	// we need a second healthy node in tests. Because of how the tests are written, we need to
	// update the status of this Node.
	healthyNodeNewStatus := api.NodeStatus{
		Conditions: []api.NodeCondition{
			{
				Type:   api.NodeReady,
				Status: api.ConditionTrue,
				// Node status has just been updated, and is NotReady for 10min.
				LastHeartbeatTime:  unversioned.Date(2015, 1, 1, 12, 9, 0, 0, time.UTC),
				LastTransitionTime: unversioned.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
			},
		},
	}

	// Node created long time ago, node controller posted Unknown for a long period of time.
	table := []struct {
		fakeNodeHandler     *FakeNodeHandler
		daemonSets          []extensions.DaemonSet
		timeToPass          time.Duration
		newNodeStatus       api.NodeStatus
		secondNodeNewStatus api.NodeStatus
		expectedPodUpdate   bool
		expectedReason      string
		description         string
	}{
		{
			fakeNodeHandler: &FakeNodeHandler{
				Existing: []*api.Node{
					{
						ObjectMeta: api.ObjectMeta{
							Name:              "node0",
							CreationTimestamp: unversioned.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
							Labels: map[string]string{
								unversioned.LabelZoneRegion:        "region1",
								unversioned.LabelZoneFailureDomain: "zone1",
							},
						},
						Status: api.NodeStatus{
							Conditions: []api.NodeCondition{
								{
									Type:               api.NodeReady,
									Status:             api.ConditionUnknown,
									LastHeartbeatTime:  unversioned.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
									LastTransitionTime: unversioned.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
								},
							},
						},
					},
					{
						ObjectMeta: api.ObjectMeta{
							Name:              "node1",
							CreationTimestamp: unversioned.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC),
							Labels: map[string]string{
								unversioned.LabelZoneRegion:        "region1",
								unversioned.LabelZoneFailureDomain: "zone1",
							},
						},
						Status: api.NodeStatus{
							Conditions: []api.NodeCondition{
								{
									Type:               api.NodeReady,
									Status:             api.ConditionTrue,
									LastHeartbeatTime:  unversioned.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
									LastTransitionTime: unversioned.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
								},
							},
						},
					},
				},
				Clientset: fake.NewSimpleClientset(&api.PodList{Items: []api.Pod{*newPod("pod0", "node0")}}),
			},
			timeToPass: 60 * time.Minute,
			newNodeStatus: api.NodeStatus{
				Conditions: []api.NodeCondition{
					{
						Type:   api.NodeReady,
						Status: api.ConditionUnknown,
						// Node status was updated by nodecontroller 1hr ago
						LastHeartbeatTime:  unversioned.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
						LastTransitionTime: unversioned.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC),
					},
				},
			},
			secondNodeNewStatus: healthyNodeNewStatus,
			expectedPodUpdate:   true,
			expectedReason:      node.NodeUnreachablePodReason,
			description: "Node created long time ago, node controller posted Unknown for a " +
				"long period of time, the pod status must include reason for termination.",
		},
	}

	for _, item := range table {
		nodeController, _ := NewNodeControllerFromClient(nil, item.fakeNodeHandler,
			evictionTimeout, testRateLimiterQPS, testRateLimiterQPS, testLargeClusterThreshold, testUnhealtyThreshold, testNodeMonitorGracePeriod,
			testNodeStartupGracePeriod, testNodeMonitorPeriod, nil, nil, 0, false)
		nodeController.now = func() unversioned.Time { return fakeNow }
		if err := nodeController.monitorNodeStatus(); err != nil {
			t.Errorf("unexpected error: %v", err)
		}
		if item.timeToPass > 0 {
			nodeController.now = func() unversioned.Time { return unversioned.Time{Time: fakeNow.Add(item.timeToPass)} }
			item.fakeNodeHandler.Existing[0].Status = item.newNodeStatus
			item.fakeNodeHandler.Existing[1].Status = item.secondNodeNewStatus
		}
		if err := nodeController.monitorNodeStatus(); err != nil {
			t.Errorf("unexpected error: %v", err)
		}
		zones := getZones(item.fakeNodeHandler)
		for _, zone := range zones {
			nodeController.zonePodEvictor[zone].Try(func(value TimedValue) (bool, time.Duration) {
				nodeUid, _ := value.UID.(string)
				deletePods(item.fakeNodeHandler, nodeController.recorder, value.Value, nodeUid, nodeController.daemonSetStore)
				return true, 0
			})
		}

		podReasonUpdate := false
		for _, action := range item.fakeNodeHandler.Actions() {
			if action.GetVerb() == "update" && action.GetResource().Resource == "pods" {
				updateReason := action.(testcore.UpdateActionImpl).GetObject().(*api.Pod).Status.Reason
				podReasonUpdate = true
				if updateReason != item.expectedReason {
					t.Errorf("expected pod status reason: %+v, got %+v for %+v", item.expectedReason, updateReason, item.description)
				}
			}
		}

		if podReasonUpdate != item.expectedPodUpdate {
			t.Errorf("expected pod update: %+v, got %+v for %+v", item.expectedPodUpdate, podReasonUpdate, item.description)
		}
	}

}

func TestMonitorNodeStatusEvictPodsWithDisruption(t *testing.T) {
	fakeNow := unversioned.Date(2015, 1, 1, 12, 0, 0, 0, time.UTC)
	evictionTimeout := 10 * time.Minute

@@ -95,6 +95,7 @@ go_library(
        "//pkg/util/integer:go_default_library",
        "//pkg/util/intstr:go_default_library",
        "//pkg/util/jsonpath:go_default_library",
        "//pkg/util/node:go_default_library",
        "//pkg/util/sets:go_default_library",
        "//pkg/util/slice:go_default_library",
        "//pkg/util/uuid:go_default_library",

@@ -30,8 +30,6 @@ import (
 	"text/template"
 	"time"
 
-	"github.com/ghodss/yaml"
-	"github.com/golang/glog"
 	"k8s.io/kubernetes/federation/apis/federation"
 	"k8s.io/kubernetes/pkg/api"
 	"k8s.io/kubernetes/pkg/api/events"
@@ -49,7 +47,11 @@ import (
 	"k8s.io/kubernetes/pkg/runtime"
 	utilerrors "k8s.io/kubernetes/pkg/util/errors"
 	"k8s.io/kubernetes/pkg/util/jsonpath"
+	"k8s.io/kubernetes/pkg/util/node"
 	"k8s.io/kubernetes/pkg/util/sets"
+
+	"github.com/ghodss/yaml"
+	"github.com/golang/glog"
 )
 
 const (
@@ -731,7 +733,10 @@ func printPodBase(pod *api.Pod, w io.Writer, options PrintOptions) error {
 			}
 		}
 	}
-	if pod.DeletionTimestamp != nil {
+
+	if pod.DeletionTimestamp != nil && pod.Status.Reason == node.NodeUnreachablePodReason {
+		reason = "Unknown"
+	} else if pod.DeletionTimestamp != nil {
 		reason = "Terminating"
 	}

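A condensed, standalone sketch of the display-reason precedence the hunk above introduces; the stub pod struct and displayReason are illustrative stand-ins, not kubectl's real types:

```go
package main

import "fmt"

// Stand-ins for the fields printPodBase consults; the real types live in
// k8s.io/kubernetes/pkg/api, and the real function derives reason from
// container statuses before this final override.
type pod struct {
	deleting bool   // models pod.DeletionTimestamp != nil
	reason   string // models pod.Status.Reason
}

func displayReason(p pod) string {
	const nodeUnreachablePodReason = "NodeLost" // node.NodeUnreachablePodReason
	switch {
	case p.deleting && p.reason == nodeUnreachablePodReason:
		return "Unknown" // nodecontroller eviction wins over plain termination
	case p.deleting:
		return "Terminating"
	default:
		return p.reason
	}
}

func main() {
	fmt.Println(displayReason(pod{deleting: true, reason: "NodeLost"})) // Unknown
	fmt.Println(displayReason(pod{deleting: true}))                     // Terminating
}
```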
@@ -31,6 +31,13 @@ import (
	"k8s.io/kubernetes/pkg/types"
)

const (
	// The reason and message set on a pod when its state cannot be confirmed,
	// because the kubelet is unresponsive on the node it is (or was) running on.
	NodeUnreachablePodReason  = "NodeLost"
	NodeUnreachablePodMessage = "Node %v which was running pod %v is unresponsive"
)

func GetHostname(hostnameOverride string) string {
	var hostname string = hostnameOverride
	if hostname == "" {
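As a quick, runnable illustration of how these two constants compose (the node and pod names are hypothetical; the constants are copied locally so the snippet builds without the kubernetes tree):

```go
package main

import "fmt"

// Copied from the pkg/util/node hunk above so this sketch is self-contained.
const (
	NodeUnreachablePodReason  = "NodeLost"
	NodeUnreachablePodMessage = "Node %v which was running pod %v is unresponsive"
)

func main() {
	// The nodecontroller stores the reason verbatim and formats the message
	// with the node and pod names:
	msg := fmt.Sprintf(NodeUnreachablePodMessage, "node0", "pod0")
	fmt.Println(NodeUnreachablePodReason) // NodeLost
	fmt.Println(msg)                      // Node node0 which was running pod pod0 is unresponsive
}
```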