From 0b13aee0c0f9bd06eb323ea249db29547b66bc46 Mon Sep 17 00:00:00 2001 From: Jing Xu Date: Tue, 30 May 2017 13:54:15 -0700 Subject: [PATCH] Add EmptyDir volume and container overlay local storage isolation This PR adds two features: 1. Add support for isolating emptyDir volume usage. If a user sets a size limit for an emptyDir volume, the kubelet's eviction manager monitors its usage and evicts the pod if the usage exceeds the limit. 2. Add support for isolating the local storage used by the container overlay. If a container's overlay usage exceeds the limit defined in the container spec, the eviction manager will evict the pod. --- pkg/api/resource.go | 7 + pkg/api/v1/resource.go | 7 + pkg/api/validation/validation.go | 1 + pkg/kubelet/eviction/eviction_manager.go | 127 +++++++- .../src/k8s.io/client-go/pkg/api/resource.go | 7 + .../k8s.io/client-go/pkg/api/v1/resource.go | 7 + test/e2e_node/BUILD | 2 + .../local_storage_isolation_eviction_test.go | 291 ++++++++++++++++++ 8 files changed, 439 insertions(+), 10 deletions(-) create mode 100644 test/e2e_node/local_storage_isolation_eviction_test.go diff --git a/pkg/api/resource.go b/pkg/api/resource.go index 1ea96d16e3a..27b7fd66500 100644 --- a/pkg/api/resource.go +++ b/pkg/api/resource.go @@ -53,3 +53,10 @@ func (self *ResourceList) NvidiaGPU() *resource.Quantity { } return &resource.Quantity{} } + +func (self *ResourceList) StorageOverlay() *resource.Quantity { + if val, ok := (*self)[ResourceStorageOverlay]; ok { + return &val + } + return &resource.Quantity{} +} diff --git a/pkg/api/v1/resource.go b/pkg/api/v1/resource.go index 2dca986679f..0d1c1dccd29 100644 --- a/pkg/api/v1/resource.go +++ b/pkg/api/v1/resource.go @@ -54,3 +54,10 @@ func (self *ResourceList) NvidiaGPU() *resource.Quantity { } return &resource.Quantity{} } + +func (self *ResourceList) StorageOverlay() *resource.Quantity { + if val, ok := (*self)[ResourceStorageOverlay]; ok { + return &val + } + return &resource.Quantity{} +} diff --git a/pkg/api/validation/validation.go b/pkg/api/validation/validation.go index d50a20f0713..df5ee287501 100644 --- a/pkg/api/validation/validation.go +++ b/pkg/api/validation/validation.go @@ -3299,6 +3299,7 @@ func validateResourceName(value string, fldPath *field.Path) field.ErrorList { // Refer to docs/design/resources.md for more details. 
func validateContainerResourceName(value string, fldPath *field.Path) field.ErrorList { allErrs := validateResourceName(value, fldPath) + if len(strings.Split(value, "/")) == 1 { if !helper.IsStandardContainerResourceName(value) { return append(allErrs, field.Invalid(fldPath, value, "must be a standard resource for containers")) diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index cdfad9e4e8b..458cb6e7b1b 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -33,6 +33,7 @@ import ( "k8s.io/kubernetes/pkg/api/v1" v1qos "k8s.io/kubernetes/pkg/api/v1/helper/qos" "k8s.io/kubernetes/pkg/features" + statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1" "k8s.io/kubernetes/pkg/kubelet/cm" evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api" "k8s.io/kubernetes/pkg/kubelet/lifecycle" @@ -151,9 +152,9 @@ func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePod // start the eviction manager monitoring go func() { for { - if evictedPod := m.synchronize(diskInfoProvider, podFunc, nodeProvider); evictedPod != nil { - glog.Infof("eviction manager: pod %s evicted, waiting for pod to be cleaned up", format.Pod(evictedPod)) - m.waitForPodCleanup(podCleanedUpFunc, evictedPod) + if evictedPods := m.synchronize(diskInfoProvider, podFunc, nodeProvider); evictedPods != nil { + glog.Infof("eviction manager: pods %s evicted, waiting for pods to be cleaned up", format.Pods(evictedPods)) + m.waitForPodsCleanup(podCleanedUpFunc, evictedPods) } else { time.Sleep(monitoringInterval) } @@ -210,7 +211,7 @@ func startMemoryThresholdNotifier(thresholds []evictionapi.Threshold, observatio // synchronize is the main control loop that enforces eviction thresholds. // Returns the pod that was killed, or nil if no pod was killed. 
-func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) *v1.Pod { +func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) []*v1.Pod { // if we have nothing to do, just return thresholds := m.config.Thresholds if len(thresholds) == 0 { @@ -309,6 +310,14 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act m.lastObservations = observations m.Unlock() + // evict pods if there is a resource usage violation from local volume temporary storage + // If eviction happens in the localStorageEviction function, skip the rest of the eviction actions + if utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) { + if evictedPods := m.localStorageEviction(activePods); len(evictedPods) > 0 { + return evictedPods + } + } + // determine the set of resources under starvation starvedResources := getStarvedResources(thresholds) if len(starvedResources) == 0 { @@ -387,24 +396,29 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act if err != nil { glog.Warningf("eviction manager: error while evicting pod %s: %v", format.Pod(pod), err) } - return pod + return []*v1.Pod{pod} } glog.Infof("eviction manager: unable to evict any pods from the node") return nil } -func (m *managerImpl) waitForPodCleanup(podCleanedUpFunc PodCleanedUpFunc, pod *v1.Pod) { +func (m *managerImpl) waitForPodsCleanup(podCleanedUpFunc PodCleanedUpFunc, pods []*v1.Pod) { timeout := m.clock.NewTimer(podCleanupTimeout) tick := m.clock.Tick(podCleanupPollFreq) for { select { case <-timeout.C(): - glog.Warningf("eviction manager: timed out waiting for pod %s to be cleaned up", format.Pod(pod)) + glog.Warningf("eviction manager: timed out waiting for pods %s to be cleaned up", format.Pods(pods)) return case <-tick: - if podCleanedUpFunc(pod) { - glog.Infof("eviction manager: pod %s successfully cleaned up", format.Pod(pod)) - return + for i, pod := range pods { + if !podCleanedUpFunc(pod) { + break + } + if i == len(pods)-1 { + glog.Infof("eviction manager: pods %s successfully cleaned up", format.Pods(pods)) + return + } } } } @@ -436,3 +450,96 @@ func (m *managerImpl) reclaimNodeLevelResources(resourceToReclaim v1.ResourceNam } return false } + +// localStorageEviction checks the EmptyDir volume usage for each pod and determines whether it exceeds the specified limit and needs +// to be evicted. It also checks every container in the pod; if a container's overlay usage exceeds the limit, the pod will be evicted too. 
+func (m *managerImpl) localStorageEviction(pods []*v1.Pod) []*v1.Pod { + summary, err := m.summaryProvider.Get() + if err != nil { + glog.Errorf("Could not get summary from the summary provider: %v", err) + return nil + } + + statsFunc := cachedStatsFunc(summary.Pods) + evicted := []*v1.Pod{} + for _, pod := range pods { + podStats, ok := statsFunc(pod) + if !ok { + continue + } + + if m.emptyDirLimitEviction(podStats, pod) { + evicted = append(evicted, pod) + continue + } + + if m.containerOverlayLimitEviction(podStats, pod) { + evicted = append(evicted, pod) + } + } + + return evicted +} + +func (m *managerImpl) emptyDirLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool { + podVolumeUsed := make(map[string]*resource.Quantity) + for _, volume := range podStats.VolumeStats { + podVolumeUsed[volume.Name] = resource.NewQuantity(int64(*volume.UsedBytes), resource.BinarySI) + } + for i := range pod.Spec.Volumes { + source := &pod.Spec.Volumes[i].VolumeSource + if source.EmptyDir != nil { + size := source.EmptyDir.SizeLimit + used := podVolumeUsed[pod.Spec.Volumes[i].Name] + if used != nil && size.Sign() == 1 && used.Cmp(size) > 0 { + // the emptyDir usage exceeds the size limit; evict the pod + return m.evictPod(pod, v1.ResourceName("EmptyDir"), fmt.Sprintf("emptyDir usage exceeds the limit %q", size.String())) + } + } + } + return false +} + +func (m *managerImpl) containerOverlayLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool { + thresholdsMap := make(map[string]*resource.Quantity) + for _, container := range pod.Spec.Containers { + overlayLimit := container.Resources.Limits.StorageOverlay() + if overlayLimit != nil && overlayLimit.Value() != 0 { + thresholdsMap[container.Name] = overlayLimit + } + } + + for _, containerStat := range podStats.Containers { + rootfs := diskUsage(containerStat.Rootfs) + if overlayThreshold, ok := thresholdsMap[containerStat.Name]; ok { + if overlayThreshold.Cmp(*rootfs) < 0 { + return m.evictPod(pod, v1.ResourceName("containerOverlay"), fmt.Sprintf("container's overlay usage exceeds the limit %q", overlayThreshold.String())) + + } + } + } + return false +} + +func (m *managerImpl) evictPod(pod *v1.Pod, resourceName v1.ResourceName, evictMsg string) bool { + if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) && + kubelettypes.IsCriticalPod(pod) && kubepod.IsStaticPod(pod) { + glog.Errorf("eviction manager: cannot evict a critical pod %s", format.Pod(pod)) + return false + } + status := v1.PodStatus{ + Phase: v1.PodFailed, + Message: fmt.Sprintf(message, resourceName), + Reason: reason, + } + // record that we are evicting the pod + m.recorder.Eventf(pod, v1.EventTypeWarning, reason, evictMsg) + gracePeriod := int64(0) + err := m.killPodFunc(pod, status, &gracePeriod) + if err != nil { + glog.Errorf("eviction manager: failed to evict pod %s: %v", format.Pod(pod), err) + } else { + glog.Infof("eviction manager: pod %s evicted successfully", format.Pod(pod)) + } + return true +} diff --git a/staging/src/k8s.io/client-go/pkg/api/resource.go b/staging/src/k8s.io/client-go/pkg/api/resource.go index 1ea96d16e3a..27b7fd66500 100644 --- a/staging/src/k8s.io/client-go/pkg/api/resource.go +++ b/staging/src/k8s.io/client-go/pkg/api/resource.go @@ -53,3 +53,10 @@ func (self *ResourceList) NvidiaGPU() *resource.Quantity { } return &resource.Quantity{} } + +func (self *ResourceList) StorageOverlay() *resource.Quantity { + if val, ok := (*self)[ResourceStorageOverlay]; ok { + return &val + } + return &resource.Quantity{} +} diff --git 
a/staging/src/k8s.io/client-go/pkg/api/v1/resource.go b/staging/src/k8s.io/client-go/pkg/api/v1/resource.go index 2dca986679f..0d1c1dccd29 100644 --- a/staging/src/k8s.io/client-go/pkg/api/v1/resource.go +++ b/staging/src/k8s.io/client-go/pkg/api/v1/resource.go @@ -54,3 +54,10 @@ func (self *ResourceList) NvidiaGPU() *resource.Quantity { } return &resource.Quantity{} } + +func (self *ResourceList) StorageOverlay() *resource.Quantity { + if val, ok := (*self)[ResourceStorageOverlay]; ok { + return &val + } + return &resource.Quantity{} +} diff --git a/test/e2e_node/BUILD b/test/e2e_node/BUILD index b8956ab1381..bbb9301383b 100644 --- a/test/e2e_node/BUILD +++ b/test/e2e_node/BUILD @@ -77,6 +77,7 @@ go_test( "kubelet_test.go", "lifecycle_hook_test.go", "local_storage_allocatable_eviction_test.go", + "local_storage_isolation_eviction_test.go", "log_path_test.go", "memory_eviction_test.go", "mirror_pod_test.go", @@ -137,6 +138,7 @@ go_test( "//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library", "//vendor/k8s.io/apimachinery/pkg/util/uuid:go_default_library", "//vendor/k8s.io/apimachinery/pkg/watch:go_default_library", + "//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library", "//vendor/k8s.io/client-go/tools/cache:go_default_library", ], ) diff --git a/test/e2e_node/local_storage_isolation_eviction_test.go b/test/e2e_node/local_storage_isolation_eviction_test.go new file mode 100644 index 00000000000..24d452903fc --- /dev/null +++ b/test/e2e_node/local_storage_isolation_eviction_test.go @@ -0,0 +1,291 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e_node + +import ( + "fmt" + "time" + + . "github.com/onsi/ginkgo" + . 
"github.com/onsi/gomega" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilfeature "k8s.io/apiserver/pkg/util/feature" + "k8s.io/kubernetes/pkg/api/v1" + "k8s.io/kubernetes/pkg/apis/componentconfig" + "k8s.io/kubernetes/test/e2e/framework" +) + +type podEvictSpec struct { + evicted bool + pod v1.Pod +} + +const ( + totalEvict = 3 +) + +// Eviction Policy is described here: +// https://github.com/kubernetes/kubernetes/blob/master/docs/proposals/kubelet-eviction.md + +var _ = framework.KubeDescribe("LocalStorageCapacityIsolationEviction [Slow] [Serial] [Disruptive] [Feature:LocalStorageCapacityIsolation]", func() { + + f := framework.NewDefaultFramework("localstorage-eviction-test") + + emptyDirVolumeName := "volume-emptydir-pod" + podTestSpecs := []podEvictSpec{ + {evicted: true, // This pod should be evicted because of an emptyDir (default storage type) usage violation + pod: v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "emptydir-hog-pod"}, + Spec: v1.PodSpec{ + RestartPolicy: v1.RestartPolicyNever, + Containers: []v1.Container{ + { + Image: "gcr.io/google_containers/busybox:1.24", + Name: "container-emptydir-hog-pod", + Command: []string{ + "sh", + "-c", + "dd if=/dev/urandom of=/cache/target-file bs=50000 count=1; while true; do sleep 5; done", + }, + VolumeMounts: []v1.VolumeMount{ + { + Name: emptyDirVolumeName, + MountPath: "/cache", + }, + }, + }, + }, + Volumes: []v1.Volume{ + { + Name: emptyDirVolumeName, + VolumeSource: v1.VolumeSource{ + EmptyDir: &v1.EmptyDirVolumeSource{ + SizeLimit: *resource.NewQuantity(int64(1000), resource.BinarySI), + }, + }, + }, + }, + }, + }, + }, + + {evicted: true, // This pod should be evicted because of an emptyDir (memory type) usage violation + pod: v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "emptydir-memory-pod"}, + Spec: v1.PodSpec{ + RestartPolicy: v1.RestartPolicyNever, + Containers: []v1.Container{ + { + Image: "gcr.io/google_containers/busybox:1.24", + Name: "container-emptydir-memory-pod", + Command: []string{ + "sh", + "-c", + "dd if=/dev/urandom of=/cache/target-file bs=50000 count=1; while true; do sleep 5; done", + }, + VolumeMounts: []v1.VolumeMount{ + { + Name: emptyDirVolumeName, + MountPath: "/cache", + }, + }, + }, + }, + Volumes: []v1.Volume{ + { + Name: emptyDirVolumeName, + VolumeSource: v1.VolumeSource{ + EmptyDir: &v1.EmptyDirVolumeSource{ + Medium: "Memory", + SizeLimit: *resource.NewQuantity(int64(10000), resource.BinarySI), + }, + }, + }, + }, + }, + }, + }, + + {evicted: false, + pod: v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "container-emptydir-pod-critical"}, + Spec: v1.PodSpec{ + RestartPolicy: v1.RestartPolicyNever, + Containers: []v1.Container{ + { + Image: "gcr.io/google_containers/busybox:1.24", + Name: "container-emptydir-hog-pod", + Command: []string{ + "sh", + "-c", + "dd if=/dev/urandom of=/cache/target-file bs=50000 count=1; while true; do sleep 5; done", + }, + VolumeMounts: []v1.VolumeMount{ + { + Name: emptyDirVolumeName, + MountPath: "/cache", + }, + }, + }, + }, + Volumes: []v1.Volume{ + { + Name: emptyDirVolumeName, + VolumeSource: v1.VolumeSource{ + EmptyDir: &v1.EmptyDirVolumeSource{ + SizeLimit: *resource.NewQuantity(int64(100000), resource.BinarySI), + }, + }, + }, + }, + }, + }, + }, + + {evicted: true, // This pod should be evicted because of a container overlay usage violation + pod: v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "container-hog-pod"}, + Spec: v1.PodSpec{ + RestartPolicy: v1.RestartPolicyNever, + 
Containers: []v1.Container{ + { + Image: "gcr.io/google_containers/busybox:1.24", + Name: "container-hog-pod", + Command: []string{ + "sh", + "-c", + "dd if=/dev/urandom of=target-file bs=50000 count=1; while true; do sleep 5; done", + }, + Resources: v1.ResourceRequirements{ + Limits: v1.ResourceList{ + v1.ResourceStorageOverlay: *resource.NewMilliQuantity( + int64(40000), + resource.BinarySI), + }, + }, + }, + }, + }, + }, + }, + } + + evictionTestTimeout := 4 * time.Minute + testCondition := "EmptyDir/ContainerOverlay usage limit violation" + Context(fmt.Sprintf("EmptyDirEviction when we run containers that should cause %s", testCondition), func() { + tempSetCurrentKubeletConfig(f, func(initialConfig *componentconfig.KubeletConfiguration) { + initialConfig.FeatureGates += ", LocalStorageCapacityIsolation=true" + }) + err := utilfeature.DefaultFeatureGate.Set("LocalStorageCapacityIsolation=true") + if err != nil { + framework.Failf("Failed to enable feature gate for LocalStorageCapacityIsolation: %v", err) + return + } + + runLocalStorageIsolationEvictionTest(f, testCondition, podTestSpecs, evictionTestTimeout, hasInodePressure) + }) +}) + +// runLocalStorageIsolationEvictionTest sets up a testing environment given the provided pod specs, and checks a few things: +// pods that exceed their local storage limit are evicted +// pods that didn't exceed their local storage limit are not evicted +// runLocalStorageIsolationEvictionTest then cleans up the testing environment by deleting the provided pods. +func runLocalStorageIsolationEvictionTest(f *framework.Framework, testCondition string, podTestSpecs []podEvictSpec, evictionTestTimeout time.Duration, + hasPressureCondition func(*framework.Framework, string) (bool, error)) { + + Context(fmt.Sprintf("EmptyDirEviction when we run containers that should cause %s", testCondition), func() { + + BeforeEach(func() { + By("setting up pods to be used by tests") + + for _, spec := range podTestSpecs { + By(fmt.Sprintf("creating pod with container: %s", spec.pod.Name)) + f.PodClient().CreateSync(&spec.pod) + } + }) + + It(fmt.Sprintf("Test should eventually see %s, and then evict the correct pods", testCondition), func() { + evictNum := 0 + evictMap := make(map[string]string) + Eventually(func() error { + // Gather current information + updatedPodList, err := f.ClientSet.Core().Pods(f.Namespace.Name).List(metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("failed to get the list of pods: %v", err) + } + updatedPods := updatedPodList.Items + + for _, p := range updatedPods { + framework.Logf("fetching pod %s; phase=%v", p.Name, p.Status.Phase) + for _, testPod := range podTestSpecs { + if p.Name == testPod.pod.Name { + if !testPod.evicted { + Expect(p.Status.Phase).NotTo(Equal(v1.PodFailed), + fmt.Sprintf("%s pod failed (and shouldn't have failed)", p.Name)) + } else { + if _, ok := evictMap[p.Name]; !ok && p.Status.Phase == v1.PodFailed { + evictNum++ + evictMap[p.Name] = p.Name + } + } + } + } + + } + if evictNum == totalEvict { + return nil + } + return fmt.Errorf("pods that caused %s have not been evicted", testCondition) + }, evictionTestTimeout, evictionPollInterval).Should(BeNil()) + + By("making sure we can start a new pod after the test") + podName := "test-admit-pod" + f.PodClient().CreateSync(&v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + }, + Spec: v1.PodSpec{ + RestartPolicy: v1.RestartPolicyNever, + Containers: []v1.Container{ + { + Image: framework.GetPauseImageNameForHostArch(), + Name: podName, + }, + }, + }, + }) + }) + + 
AfterEach(func() { + By("deleting pods") + for _, spec := range podTestSpecs { + By(fmt.Sprintf("deleting pod: %s", spec.pod.Name)) + f.PodClient().DeleteSync(spec.pod.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout) + } + + if CurrentGinkgoTestDescription().Failed { + if framework.TestContext.DumpLogsOnFailure { + logPodEvents(f) + logNodeEvents(f) + } + By("sleeping to allow for cleanup of test") + time.Sleep(postTestConditionMonitoringPeriod) + } + }) + }) +}
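
Illustrative usage (not part of the patch): the sketch below shows how a pod would opt into both limits enforced by this change, an emptyDir SizeLimit and a per-container overlay limit, using the same v1 types and imports as the test file above. The pod name, image, and sizes are hypothetical, and the LocalStorageCapacityIsolation feature gate is assumed to be enabled on the node.

// Hypothetical pod spec exercising both limits added by this patch.
limitedPod := &v1.Pod{
	ObjectMeta: metav1.ObjectMeta{Name: "storage-limited-pod"}, // hypothetical name
	Spec: v1.PodSpec{
		Containers: []v1.Container{
			{
				Name:  "app",
				Image: "gcr.io/google_containers/busybox:1.24",
				Resources: v1.ResourceRequirements{
					Limits: v1.ResourceList{
						// Evict the pod if this container's overlay (writable layer) usage exceeds 100Mi.
						v1.ResourceStorageOverlay: *resource.NewQuantity(100*1024*1024, resource.BinarySI),
					},
				},
				VolumeMounts: []v1.VolumeMount{{Name: "scratch", MountPath: "/scratch"}},
			},
		},
		Volumes: []v1.Volume{
			{
				Name: "scratch",
				VolumeSource: v1.VolumeSource{
					EmptyDir: &v1.EmptyDirVolumeSource{
						// Evict the pod if usage of this emptyDir volume exceeds 1Gi.
						SizeLimit: *resource.NewQuantity(1024*1024*1024, resource.BinarySI),
					},
				},
			},
		},
	},
}

Once either limit is exceeded, a kubelet built with this patch would evict the pod through the localStorageEviction path in eviction_manager.go above.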