diff --git a/pkg/api/resource.go b/pkg/api/resource.go
index 1ea96d16e3a..27b7fd66500 100644
--- a/pkg/api/resource.go
+++ b/pkg/api/resource.go
@@ -53,3 +53,10 @@ func (self *ResourceList) NvidiaGPU() *resource.Quantity {
 	}
 	return &resource.Quantity{}
 }
+
+func (self *ResourceList) StorageOverlay() *resource.Quantity {
+	if val, ok := (*self)[ResourceStorageOverlay]; ok {
+		return &val
+	}
+	return &resource.Quantity{}
+}
diff --git a/pkg/api/v1/resource.go b/pkg/api/v1/resource.go
index 2dca986679f..0d1c1dccd29 100644
--- a/pkg/api/v1/resource.go
+++ b/pkg/api/v1/resource.go
@@ -54,3 +54,10 @@ func (self *ResourceList) NvidiaGPU() *resource.Quantity {
 	}
 	return &resource.Quantity{}
 }
+
+func (self *ResourceList) StorageOverlay() *resource.Quantity {
+	if val, ok := (*self)[ResourceStorageOverlay]; ok {
+		return &val
+	}
+	return &resource.Quantity{}
+}
diff --git a/pkg/api/validation/validation.go b/pkg/api/validation/validation.go
index d50a20f0713..df5ee287501 100644
--- a/pkg/api/validation/validation.go
+++ b/pkg/api/validation/validation.go
@@ -3299,6 +3299,7 @@ func validateResourceName(value string, fldPath *field.Path) field.ErrorList {
 // Refer to docs/design/resources.md for more details.
 func validateContainerResourceName(value string, fldPath *field.Path) field.ErrorList {
 	allErrs := validateResourceName(value, fldPath)
+
 	if len(strings.Split(value, "/")) == 1 {
 		if !helper.IsStandardContainerResourceName(value) {
 			return append(allErrs, field.Invalid(fldPath, value, "must be a standard resource for containers"))
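
For reviewers unfamiliar with the accessor pattern used above: the sketch below is illustrative only (not part of the patch) and shows how a caller is expected to use the new StorageOverlay() helper, assuming the in-tree import paths of this branch. An unset limit comes back as a zero Quantity, so callers can treat a zero value as "no limit".

// Illustrative sketch, not part of the patch.
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/kubernetes/pkg/api/v1"
)

func main() {
	limits := v1.ResourceList{
		v1.ResourceStorageOverlay: resource.MustParse("64Mi"),
	}
	// StorageOverlay() returns the configured quantity, or a zero Quantity
	// when no overlay limit is set.
	overlay := limits.StorageOverlay()
	fmt.Printf("overlay limit: %s (unset would be zero: %v)\n", overlay.String(), overlay.IsZero())
}
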
diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go
index cdfad9e4e8b..458cb6e7b1b 100644
--- a/pkg/kubelet/eviction/eviction_manager.go
+++ b/pkg/kubelet/eviction/eviction_manager.go
@@ -33,6 +33,7 @@ import (
 	"k8s.io/kubernetes/pkg/api/v1"
 	v1qos "k8s.io/kubernetes/pkg/api/v1/helper/qos"
 	"k8s.io/kubernetes/pkg/features"
+	statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
 	"k8s.io/kubernetes/pkg/kubelet/cm"
 	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
@@ -151,9 +152,9 @@ func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePod
 	// start the eviction manager monitoring
 	go func() {
 		for {
-			if evictedPod := m.synchronize(diskInfoProvider, podFunc, nodeProvider); evictedPod != nil {
-				glog.Infof("eviction manager: pod %s evicted, waiting for pod to be cleaned up", format.Pod(evictedPod))
-				m.waitForPodCleanup(podCleanedUpFunc, evictedPod)
+			if evictedPods := m.synchronize(diskInfoProvider, podFunc, nodeProvider); evictedPods != nil {
+				glog.Infof("eviction manager: pods %s evicted, waiting for pods to be cleaned up", format.Pods(evictedPods))
+				m.waitForPodsCleanup(podCleanedUpFunc, evictedPods)
 			} else {
 				time.Sleep(monitoringInterval)
 			}
@@ -210,7 +211,7 @@ func startMemoryThresholdNotifier(thresholds []evictionapi.Threshold, observatio
 
 // synchronize is the main control loop that enforces eviction thresholds.
 // Returns the pod that was killed, or nil if no pod was killed.
-func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) *v1.Pod {
+func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) []*v1.Pod {
 	// if we have nothing to do, just return
 	thresholds := m.config.Thresholds
 	if len(thresholds) == 0 {
@@ -309,6 +310,14 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	m.lastObservations = observations
 	m.Unlock()
 
+	// evict pods if there is a resource usage violation from local volume temporary storage
+	// If eviction happens in the localStorageEviction function, skip the rest of the eviction actions
+	if utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
+		if evictedPods := m.localStorageEviction(activePods); len(evictedPods) > 0 {
+			return evictedPods
+		}
+	}
+
 	// determine the set of resources under starvation
 	starvedResources := getStarvedResources(thresholds)
 	if len(starvedResources) == 0 {
@@ -387,24 +396,29 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 		if err != nil {
 			glog.Warningf("eviction manager: error while evicting pod %s: %v", format.Pod(pod), err)
 		}
-		return pod
+		return []*v1.Pod{pod}
 	}
 	glog.Infof("eviction manager: unable to evict any pods from the node")
 	return nil
 }
 
-func (m *managerImpl) waitForPodCleanup(podCleanedUpFunc PodCleanedUpFunc, pod *v1.Pod) {
+func (m *managerImpl) waitForPodsCleanup(podCleanedUpFunc PodCleanedUpFunc, pods []*v1.Pod) {
 	timeout := m.clock.NewTimer(podCleanupTimeout)
 	tick := m.clock.Tick(podCleanupPollFreq)
 	for {
 		select {
 		case <-timeout.C():
-			glog.Warningf("eviction manager: timed out waiting for pod %s to be cleaned up", format.Pod(pod))
+			glog.Warningf("eviction manager: timed out waiting for pods %s to be cleaned up", format.Pods(pods))
 			return
 		case <-tick:
-			if podCleanedUpFunc(pod) {
-				glog.Infof("eviction manager: pod %s successfully cleaned up", format.Pod(pod))
-				return
+			for i, pod := range pods {
+				if !podCleanedUpFunc(pod) {
+					break
+				}
+				if i == len(pods)-1 {
+					glog.Infof("eviction manager: pods %s successfully cleaned up", format.Pods(pods))
+					return
+				}
 			}
 		}
 	}
@@ -436,3 +450,96 @@ func (m *managerImpl) reclaimNodeLevelResources(resourceToReclaim v1.ResourceNam
 	}
 	return false
 }
+
+// localStorageEviction checks the EmptyDir volume usage for each pod and determines whether it exceeds the specified limit and needs
+// to be evicted. It also checks every container in the pod; if a container's overlay usage exceeds the limit, the pod will be evicted too.
+func (m *managerImpl) localStorageEviction(pods []*v1.Pod) []*v1.Pod {
+	summary, err := m.summaryProvider.Get()
+	if err != nil {
+		glog.Errorf("Could not get summary provider: %v", err)
+		return nil
+	}
+
+	statsFunc := cachedStatsFunc(summary.Pods)
+	evicted := []*v1.Pod{}
+	for _, pod := range pods {
+		podStats, ok := statsFunc(pod)
+		if !ok {
+			continue
+		}
+
+		if m.emptyDirLimitEviction(podStats, pod) {
+			evicted = append(evicted, pod)
+			continue
+		}
+
+		if m.containerOverlayLimitEviction(podStats, pod) {
+			evicted = append(evicted, pod)
+		}
+	}
+
+	return evicted
+}
+
+func (m *managerImpl) emptyDirLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
+	podVolumeUsed := make(map[string]*resource.Quantity)
+	for _, volume := range podStats.VolumeStats {
+		podVolumeUsed[volume.Name] = resource.NewQuantity(int64(*volume.UsedBytes), resource.BinarySI)
+	}
+	for i := range pod.Spec.Volumes {
+		source := &pod.Spec.Volumes[i].VolumeSource
+		if source.EmptyDir != nil {
+			size := source.EmptyDir.SizeLimit
+			used := podVolumeUsed[pod.Spec.Volumes[i].Name]
+			if used != nil && size.Sign() == 1 && used.Cmp(size) > 0 {
+				// the emptyDir usage exceeds the size limit, evict the pod
+				return m.evictPod(pod, v1.ResourceName("EmptyDir"), fmt.Sprintf("emptyDir usage exceeds the limit %q", size.String()))
+			}
+		}
+	}
+	return false
+}
+
+func (m *managerImpl) containerOverlayLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
+	thresholdsMap := make(map[string]*resource.Quantity)
+	for _, container := range pod.Spec.Containers {
+		overlayLimit := container.Resources.Limits.StorageOverlay()
+		if overlayLimit != nil && overlayLimit.Value() != 0 {
+			thresholdsMap[container.Name] = overlayLimit
+		}
+	}
+
+	for _, containerStat := range podStats.Containers {
+		rootfs := diskUsage(containerStat.Rootfs)
+		if overlayThreshold, ok := thresholdsMap[containerStat.Name]; ok {
+			if overlayThreshold.Cmp(*rootfs) < 0 {
+				return m.evictPod(pod, v1.ResourceName("containerOverlay"), fmt.Sprintf("container's overlay usage exceeds the limit %q", overlayThreshold.String()))
+
+			}
+		}
+	}
+	return false
+}
+
+func (m *managerImpl) evictPod(pod *v1.Pod, resourceName v1.ResourceName, evictMsg string) bool {
+	if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
+		kubelettypes.IsCriticalPod(pod) && kubepod.IsStaticPod(pod) {
+		glog.Errorf("eviction manager: cannot evict a critical pod %s", format.Pod(pod))
+		return false
+	}
+	status := v1.PodStatus{
+		Phase:   v1.PodFailed,
+		Message: fmt.Sprintf(message, resourceName),
+		Reason:  reason,
+	}
+	// record that we are evicting the pod
+	m.recorder.Eventf(pod, v1.EventTypeWarning, reason, evictMsg)
+	gracePeriod := int64(0)
+	err := m.killPodFunc(pod, status, &gracePeriod)
+	if err != nil {
+		glog.Errorf("eviction manager: pod %s failed to evict: %v", format.Pod(pod), err)
+	} else {
+		glog.Infof("eviction manager: pod %s is evicted successfully", format.Pod(pod))
+	}
+	return true
+}
diff --git a/staging/src/k8s.io/client-go/pkg/api/resource.go b/staging/src/k8s.io/client-go/pkg/api/resource.go
index 1ea96d16e3a..27b7fd66500 100644
--- a/staging/src/k8s.io/client-go/pkg/api/resource.go
+++ b/staging/src/k8s.io/client-go/pkg/api/resource.go
@@ -53,3 +53,10 @@ func (self *ResourceList) NvidiaGPU() *resource.Quantity {
 	}
 	return &resource.Quantity{}
 }
+
+func (self *ResourceList) StorageOverlay() *resource.Quantity {
+	if val, ok := (*self)[ResourceStorageOverlay]; ok {
+		return &val
+	}
+	return &resource.Quantity{}
+}
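
The eviction checks above hinge on resource.Quantity comparisons. Below is a minimal, self-contained sketch of that logic (illustrative, not taken from the patch): a sizeLimit whose Sign() is not positive means "no limit", and eviction triggers only when usage strictly exceeds the limit.

// Illustrative sketch, not part of the patch.
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	limit := resource.MustParse("1Ki")                    // emptyDir sizeLimit from the pod spec
	used := resource.NewQuantity(4096, resource.BinarySI) // usage reported by the stats provider

	// Mirrors the condition in emptyDirLimitEviction: only a positive limit is
	// enforced, and only strictly greater usage triggers eviction.
	if limit.Sign() == 1 && used.Cmp(limit) > 0 {
		fmt.Printf("emptyDir usage %s exceeds the limit %s: evict\n", used.String(), limit.String())
	}
}
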
diff --git a/staging/src/k8s.io/client-go/pkg/api/v1/resource.go b/staging/src/k8s.io/client-go/pkg/api/v1/resource.go
index 2dca986679f..0d1c1dccd29 100644
--- a/staging/src/k8s.io/client-go/pkg/api/v1/resource.go
+++ b/staging/src/k8s.io/client-go/pkg/api/v1/resource.go
@@ -54,3 +54,10 @@ func (self *ResourceList) NvidiaGPU() *resource.Quantity {
 	}
 	return &resource.Quantity{}
 }
+
+func (self *ResourceList) StorageOverlay() *resource.Quantity {
+	if val, ok := (*self)[ResourceStorageOverlay]; ok {
+		return &val
+	}
+	return &resource.Quantity{}
+}
diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD
index b8956ab1381..bbb9301383b 100644
--- a/test/e2e_node/BUILD
+++ b/test/e2e_node/BUILD
@@ -77,6 +77,7 @@ go_test(
         "kubelet_test.go",
         "lifecycle_hook_test.go",
         "local_storage_allocatable_eviction_test.go",
+        "local_storage_isolation_eviction_test.go",
         "log_path_test.go",
         "memory_eviction_test.go",
         "mirror_pod_test.go",
@@ -137,6 +138,7 @@ go_test(
         "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
         "//vendor/k8s.io/apimachinery/pkg/watch:go_default_library",
+        "//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library",
         "//vendor/k8s.io/client-go/tools/cache:go_default_library",
     ],
 )
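
Both the eviction-manager change and the new node e2e test below are guarded by the LocalStorageCapacityIsolation feature gate. The sketch below is an assumed illustration of that wiring (not copied from the patch): the test flips the gate programmatically, while cluster operators would use the kubelet's --feature-gates flag.

// Illustrative sketch, not part of the patch.
package main

import (
	"fmt"

	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/kubernetes/pkg/features"
)

func main() {
	// The e2e test sets the gate explicitly; a kubelet would be started with
	// --feature-gates=LocalStorageCapacityIsolation=true instead.
	if err := utilfeature.DefaultFeatureGate.Set("LocalStorageCapacityIsolation=true"); err != nil {
		fmt.Printf("failed to enable feature gate: %v\n", err)
		return
	}
	// The eviction manager only runs the local storage checks when the gate is on.
	fmt.Println("enabled:", utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation))
}
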
"github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilfeature "k8s.io/apiserver/pkg/util/feature" + "k8s.io/kubernetes/pkg/api/v1" + "k8s.io/kubernetes/pkg/apis/componentconfig" + "k8s.io/kubernetes/test/e2e/framework" +) + +type podEvictSpec struct { + evicted bool + pod v1.Pod +} + +const ( + totalEvict = 3 +) + +// Eviction Policy is described here: +// https://github.com/kubernetes/kubernetes/blob/master/docs/proposals/kubelet-eviction.md + +var _ = framework.KubeDescribe("LocalStorageCapacityIsolationEviction [Slow] [Serial] [Disruptive] [Feature:LocalStorageCapacityIsolation]", func() { + + f := framework.NewDefaultFramework("localstorage-eviction-test") + + emptyDirVolumeName := "volume-emptydir-pod" + podTestSpecs := []podEvictSpec{ + {evicted: true, // This pod should be evicted because emptyDir (defualt storage type) usage violation + pod: v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "emptydir-hog-pod"}, + Spec: v1.PodSpec{ + RestartPolicy: v1.RestartPolicyNever, + Containers: []v1.Container{ + { + Image: "gcr.io/google_containers/busybox:1.24", + Name: "container-emptydir-hog-pod", + Command: []string{ + "sh", + "-c", + "dd if=/dev/urandom of=target-file of=/cache/target-file bs=50000 count=1; while true; do sleep 5; done", + }, + VolumeMounts: []v1.VolumeMount{ + { + Name: emptyDirVolumeName, + MountPath: "/cache", + }, + }, + }, + }, + Volumes: []v1.Volume{ + { + Name: emptyDirVolumeName, + VolumeSource: v1.VolumeSource{ + EmptyDir: &v1.EmptyDirVolumeSource{ + SizeLimit: *resource.NewQuantity(int64(1000), resource.BinarySI), + }, + }, + }, + }, + }, + }, + }, + + {evicted: true, // This pod should be evicted because emptyDir (memory type) usage violation + pod: v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "emptydir-memory-pod"}, + Spec: v1.PodSpec{ + RestartPolicy: v1.RestartPolicyNever, + Containers: []v1.Container{ + { + Image: "gcr.io/google_containers/busybox:1.24", + Name: "container-emptydir-memory-pod", + Command: []string{ + "sh", + "-c", + "dd if=/dev/urandom of=target-file of=/cache/target-file bs=50000 count=1; while true; do sleep 5; done", + }, + VolumeMounts: []v1.VolumeMount{ + { + Name: emptyDirVolumeName, + MountPath: "/cache", + }, + }, + }, + }, + Volumes: []v1.Volume{ + { + Name: emptyDirVolumeName, + VolumeSource: v1.VolumeSource{ + EmptyDir: &v1.EmptyDirVolumeSource{ + Medium: "Memory", + SizeLimit: *resource.NewQuantity(int64(10000), resource.BinarySI), + }, + }, + }, + }, + }, + }, + }, + + {evicted: false, + pod: v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "container-emptydir-pod-critical"}, + Spec: v1.PodSpec{ + RestartPolicy: v1.RestartPolicyNever, + Containers: []v1.Container{ + { + Image: "gcr.io/google_containers/busybox:1.24", + Name: "container-emptydir-hog-pod", + Command: []string{ + "sh", + "-c", + "dd if=/dev/urandom of=target-file of=/cache/target-file bs=50000 count=1; while true; do sleep 5; done", + }, + VolumeMounts: []v1.VolumeMount{ + { + Name: emptyDirVolumeName, + MountPath: "/cache", + }, + }, + }, + }, + Volumes: []v1.Volume{ + { + Name: emptyDirVolumeName, + VolumeSource: v1.VolumeSource{ + EmptyDir: &v1.EmptyDirVolumeSource{ + SizeLimit: *resource.NewQuantity(int64(100000), resource.BinarySI), + }, + }, + }, + }, + }, + }, + }, + + {evicted: true, // This pod should be evicted because container overlay usage violation + pod: v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "container-hog-pod"}, + Spec: v1.PodSpec{ + RestartPolicy: v1.RestartPolicyNever, + 
+					Containers: []v1.Container{
+						{
+							Image: "gcr.io/google_containers/busybox:1.24",
+							Name:  "container-hog-pod",
+							Command: []string{
+								"sh",
+								"-c",
+								"dd if=/dev/urandom of=target-file bs=50000 count=1; while true; do sleep 5; done",
+							},
+							Resources: v1.ResourceRequirements{
+								Limits: v1.ResourceList{
+									v1.ResourceStorageOverlay: *resource.NewMilliQuantity(
+										int64(40000),
+										resource.BinarySI),
+								},
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	evictionTestTimeout := 4 * time.Minute
+	testCondition := "EmptyDir/ContainerOverlay usage limit violation"
+	Context(fmt.Sprintf("EmptyDirEviction when we run containers that should cause %s", testCondition), func() {
+		tempSetCurrentKubeletConfig(f, func(initialConfig *componentconfig.KubeletConfiguration) {
+			initialConfig.FeatureGates += ", LocalStorageCapacityIsolation=true"
+		})
+		err := utilfeature.DefaultFeatureGate.Set("LocalStorageCapacityIsolation=true")
+		if err != nil {
+			framework.Failf("Failed to enable feature gate for LocalStorageCapacityIsolation: %v", err)
+			return
+		}
+
+		runLocalStorageIsolationEvictionTest(f, testCondition, podTestSpecs, evictionTestTimeout, hasInodePressure)
+	})
+})
+
+// runLocalStorageIsolationEvictionTest sets up a testing environment given the provided pods, and checks a few things:
+// pods that exceed their local storage limit are evicted
+// pods that didn't exceed their local storage limit are not evicted
+// runLocalStorageIsolationEvictionTest then cleans up the testing environment by deleting the provided pods.
+func runLocalStorageIsolationEvictionTest(f *framework.Framework, testCondition string, podTestSpecs []podEvictSpec, evictionTestTimeout time.Duration,
+	hasPressureCondition func(*framework.Framework, string) (bool, error)) {
+
+	Context(fmt.Sprintf("EmptyDirEviction when we run containers that should cause %s", testCondition), func() {
+
+		BeforeEach(func() {
+			By("setting up pods to be used by tests")
+
+			for _, spec := range podTestSpecs {
+				By(fmt.Sprintf("creating pod with container: %s", spec.pod.Name))
+				f.PodClient().CreateSync(&spec.pod)
+			}
+		})
+
+		It(fmt.Sprintf("Test should eventually see %s, and then evict the correct pods", testCondition), func() {
+			evictNum := 0
+			evictMap := make(map[string]string)
+			Eventually(func() error {
+				// Gather current information
+				updatedPodList, err := f.ClientSet.Core().Pods(f.Namespace.Name).List(metav1.ListOptions{})
+				if err != nil {
+					return fmt.Errorf("failed to get the list of pods: %v", err)
+				}
+				updatedPods := updatedPodList.Items
+
+				for _, p := range updatedPods {
+					framework.Logf("fetching pod %s; phase=%v", p.Name, p.Status.Phase)
+					for _, testPod := range podTestSpecs {
+						if p.Name == testPod.pod.Name {
+							if !testPod.evicted {
+								Expect(p.Status.Phase).NotTo(Equal(v1.PodFailed),
+									fmt.Sprintf("%s pod failed (and shouldn't have failed)", p.Name))
+							} else {
+								if _, ok := evictMap[p.Name]; !ok && p.Status.Phase == v1.PodFailed {
+									evictNum++
+									evictMap[p.Name] = p.Name
+								}
+							}
+						}
+					}
+
+				}
+				if evictNum == totalEvict {
+					return nil
+				}
+				return fmt.Errorf("pods that caused %s have not been evicted", testCondition)
+			}, evictionTestTimeout, evictionPollInterval).Should(BeNil())
+
+			By("making sure we can start a new pod after the test")
+			podName := "test-admit-pod"
+			f.PodClient().CreateSync(&v1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name: podName,
+				},
+				Spec: v1.PodSpec{
+					RestartPolicy: v1.RestartPolicyNever,
+					Containers: []v1.Container{
+						{
+							Image: framework.GetPauseImageNameForHostArch(),
+							Name:  podName,
+						},
+					},
+				},
+			})
+		})
+
+		AfterEach(func() {
+			By("deleting pods")
+			for _, spec := range podTestSpecs {
+				By(fmt.Sprintf("deleting pod: %s", spec.pod.Name))
+				f.PodClient().DeleteSync(spec.pod.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
+			}
+
+			if CurrentGinkgoTestDescription().Failed {
+				if framework.TestContext.DumpLogsOnFailure {
+					logPodEvents(f)
+					logNodeEvents(f)
+				}
+				By("sleeping to allow for cleanup of test")
+				time.Sleep(postTestConditionMonitoringPeriod)
+			}
+		})
+	})
+}
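
To summarize what the new test exercises, here is a condensed, hypothetical pod spec (illustrative only, not part of the patch; names and values are invented) that would be subject to both new eviction checks: a per-container overlay limit via ResourceStorageOverlay and an emptyDir sizeLimit. Field types follow this branch of the tree, where EmptyDirVolumeSource.SizeLimit is a resource.Quantity value.

// Illustrative sketch, not part of the patch.
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/kubernetes/pkg/api/v1"
)

func main() {
	pod := v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "local-storage-limited-pod"},
		Spec: v1.PodSpec{
			Containers: []v1.Container{{
				Name:  "app",
				Image: "gcr.io/google_containers/busybox:1.24",
				Resources: v1.ResourceRequirements{
					// containerOverlayLimitEviction triggers when the container's
					// rootfs (overlay) usage exceeds this limit.
					Limits: v1.ResourceList{v1.ResourceStorageOverlay: resource.MustParse("64Mi")},
				},
				VolumeMounts: []v1.VolumeMount{{Name: "scratch", MountPath: "/scratch"}},
			}},
			Volumes: []v1.Volume{{
				Name: "scratch",
				VolumeSource: v1.VolumeSource{
					// emptyDirLimitEviction triggers when emptyDir usage exceeds this sizeLimit.
					EmptyDir: &v1.EmptyDirVolumeSource{SizeLimit: resource.MustParse("100Mi")},
				},
			}},
		},
	}
	fmt.Printf("pod %q: overlay limit %s, emptyDir limit %s\n",
		pod.Name,
		pod.Spec.Containers[0].Resources.Limits.StorageOverlay().String(),
		pod.Spec.Volumes[0].EmptyDir.SizeLimit.String())
}
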