Merge pull request #45686 from jingxu97/May/emptyDir

Automatic merge from submit-queue

Add EmptyDir volume capacity isolation

This PR adds the support for isolating the emptyDir volume use. If user
sets a size limit for emptyDir volume, kubelet's eviction manager monitors its usage
and evict the pod if the usage exceeds the limit.

This feature is part of local storage capacity isolation and described in the proposal kubernetes/community#306

**Release note**:

```release-note
Alpha feature: allows users to set storage limit to isolate EmptyDir volumes. It enforces the limit by evicting pods that exceed their storage limits  
```
This commit is contained in:
Kubernetes Submit Queue 2017-06-05 23:08:58 -07:00 committed by GitHub
commit cb681321c7
8 changed files with 439 additions and 10 deletions

View File

@ -53,3 +53,10 @@ func (self *ResourceList) NvidiaGPU() *resource.Quantity {
}
return &resource.Quantity{}
}
func (self *ResourceList) StorageOverlay() *resource.Quantity {
if val, ok := (*self)[ResourceStorageOverlay]; ok {
return &val
}
return &resource.Quantity{}
}

View File

@ -54,3 +54,10 @@ func (self *ResourceList) NvidiaGPU() *resource.Quantity {
}
return &resource.Quantity{}
}
func (self *ResourceList) StorageOverlay() *resource.Quantity {
if val, ok := (*self)[ResourceStorageOverlay]; ok {
return &val
}
return &resource.Quantity{}
}

View File

@ -3299,6 +3299,7 @@ func validateResourceName(value string, fldPath *field.Path) field.ErrorList {
// Refer to docs/design/resources.md for more details.
func validateContainerResourceName(value string, fldPath *field.Path) field.ErrorList {
allErrs := validateResourceName(value, fldPath)
if len(strings.Split(value, "/")) == 1 {
if !helper.IsStandardContainerResourceName(value) {
return append(allErrs, field.Invalid(fldPath, value, "must be a standard resource for containers"))

View File

@ -33,6 +33,7 @@ import (
"k8s.io/kubernetes/pkg/api/v1"
v1qos "k8s.io/kubernetes/pkg/api/v1/helper/qos"
"k8s.io/kubernetes/pkg/features"
statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
"k8s.io/kubernetes/pkg/kubelet/cm"
evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
@ -151,9 +152,9 @@ func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePod
// start the eviction manager monitoring
go func() {
for {
if evictedPod := m.synchronize(diskInfoProvider, podFunc, nodeProvider); evictedPod != nil {
glog.Infof("eviction manager: pod %s evicted, waiting for pod to be cleaned up", format.Pod(evictedPod))
m.waitForPodCleanup(podCleanedUpFunc, evictedPod)
if evictedPods := m.synchronize(diskInfoProvider, podFunc, nodeProvider); evictedPods != nil {
glog.Infof("eviction manager: pods %s evicted, waiting for pod to be cleaned up", format.Pods(evictedPods))
m.waitForPodsCleanup(podCleanedUpFunc, evictedPods)
} else {
time.Sleep(monitoringInterval)
}
@ -210,7 +211,7 @@ func startMemoryThresholdNotifier(thresholds []evictionapi.Threshold, observatio
// synchronize is the main control loop that enforces eviction thresholds.
// Returns the pod that was killed, or nil if no pod was killed.
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) *v1.Pod {
func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, nodeProvider NodeProvider) []*v1.Pod {
// if we have nothing to do, just return
thresholds := m.config.Thresholds
if len(thresholds) == 0 {
@ -309,6 +310,14 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
m.lastObservations = observations
m.Unlock()
// evict pods if there is a resource uage violation from local volume temporary storage
// If eviction happenes in localVolumeEviction function, skip the rest of eviction action
if utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
if evictedPods := m.localStorageEviction(activePods); len(evictedPods) > 0 {
return evictedPods
}
}
// determine the set of resources under starvation
starvedResources := getStarvedResources(thresholds)
if len(starvedResources) == 0 {
@ -387,24 +396,29 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
if err != nil {
glog.Warningf("eviction manager: error while evicting pod %s: %v", format.Pod(pod), err)
}
return pod
return []*v1.Pod{pod}
}
glog.Infof("eviction manager: unable to evict any pods from the node")
return nil
}
func (m *managerImpl) waitForPodCleanup(podCleanedUpFunc PodCleanedUpFunc, pod *v1.Pod) {
func (m *managerImpl) waitForPodsCleanup(podCleanedUpFunc PodCleanedUpFunc, pods []*v1.Pod) {
timeout := m.clock.NewTimer(podCleanupTimeout)
tick := m.clock.Tick(podCleanupPollFreq)
for {
select {
case <-timeout.C():
glog.Warningf("eviction manager: timed out waiting for pod %s to be cleaned up", format.Pod(pod))
glog.Warningf("eviction manager: timed out waiting for pods %s to be cleaned up", format.Pods(pods))
return
case <-tick:
if podCleanedUpFunc(pod) {
glog.Infof("eviction manager: pod %s successfully cleaned up", format.Pod(pod))
return
for i, pod := range pods {
if !podCleanedUpFunc(pod) {
break
}
if i == len(pods)-1 {
glog.Infof("eviction manager: pods %s successfully cleaned up", format.Pods(pods))
return
}
}
}
}
@ -436,3 +450,96 @@ func (m *managerImpl) reclaimNodeLevelResources(resourceToReclaim v1.ResourceNam
}
return false
}
// localStorageEviction checks the EmptyDir volume usage for each pod and determine whether it exceeds the specified limit and needs
// to be evicted. It also checks every container in the pod, if the container overlay usage exceeds the limit, the pod will be evicted too.
func (m *managerImpl) localStorageEviction(pods []*v1.Pod) []*v1.Pod {
summary, err := m.summaryProvider.Get()
if err != nil {
glog.Errorf("Could not get summary provider")
return nil
}
statsFunc := cachedStatsFunc(summary.Pods)
evicted := []*v1.Pod{}
for _, pod := range pods {
podStats, ok := statsFunc(pod)
if !ok {
continue
}
if m.emptyDirLimitEviction(podStats, pod) {
evicted = append(evicted, pod)
continue
}
if m.containerOverlayLimitEviction(podStats, pod) {
evicted = append(evicted, pod)
}
}
return evicted
}
func (m *managerImpl) emptyDirLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
podVolumeUsed := make(map[string]*resource.Quantity)
for _, volume := range podStats.VolumeStats {
podVolumeUsed[volume.Name] = resource.NewQuantity(int64(*volume.UsedBytes), resource.BinarySI)
}
for i := range pod.Spec.Volumes {
source := &pod.Spec.Volumes[i].VolumeSource
if source.EmptyDir != nil {
size := source.EmptyDir.SizeLimit
used := podVolumeUsed[pod.Spec.Volumes[i].Name]
if used != nil && size.Sign() == 1 && used.Cmp(size) > 0 {
// the emptyDir usage exceeds the size limit, evict the pod
return m.evictPod(pod, v1.ResourceName("EmptyDir"), fmt.Sprintf("emptyDir usage exceeds the limit %q", size.String()))
}
}
}
return false
}
func (m *managerImpl) containerOverlayLimitEviction(podStats statsapi.PodStats, pod *v1.Pod) bool {
thresholdsMap := make(map[string]*resource.Quantity)
for _, container := range pod.Spec.Containers {
overlayLimit := container.Resources.Limits.StorageOverlay()
if overlayLimit != nil && overlayLimit.Value() != 0 {
thresholdsMap[container.Name] = overlayLimit
}
}
for _, containerStat := range podStats.Containers {
rootfs := diskUsage(containerStat.Rootfs)
if overlayThreshold, ok := thresholdsMap[containerStat.Name]; ok {
if overlayThreshold.Cmp(*rootfs) < 0 {
return m.evictPod(pod, v1.ResourceName("containerOverlay"), fmt.Sprintf("container's overlay usage exceeds the limit %q", overlayThreshold.String()))
}
}
}
return false
}
func (m *managerImpl) evictPod(pod *v1.Pod, resourceName v1.ResourceName, evictMsg string) bool {
if utilfeature.DefaultFeatureGate.Enabled(features.ExperimentalCriticalPodAnnotation) &&
kubelettypes.IsCriticalPod(pod) && kubepod.IsStaticPod(pod) {
glog.Errorf("eviction manager: cannot evict a critical pod %s", format.Pod(pod))
return false
}
status := v1.PodStatus{
Phase: v1.PodFailed,
Message: fmt.Sprintf(message, resourceName),
Reason: reason,
}
// record that we are evicting the pod
m.recorder.Eventf(pod, v1.EventTypeWarning, reason, evictMsg)
gracePeriod := int64(0)
err := m.killPodFunc(pod, status, &gracePeriod)
if err != nil {
glog.Errorf("eviction manager: pod %s failed to evict %v", format.Pod(pod), err)
} else {
glog.Infof("eviction manager: pod %s is evicted successfully", format.Pod(pod))
}
return true
}

View File

@ -53,3 +53,10 @@ func (self *ResourceList) NvidiaGPU() *resource.Quantity {
}
return &resource.Quantity{}
}
func (self *ResourceList) StorageOverlay() *resource.Quantity {
if val, ok := (*self)[ResourceStorageOverlay]; ok {
return &val
}
return &resource.Quantity{}
}

View File

@ -54,3 +54,10 @@ func (self *ResourceList) NvidiaGPU() *resource.Quantity {
}
return &resource.Quantity{}
}
func (self *ResourceList) StorageOverlay() *resource.Quantity {
if val, ok := (*self)[ResourceStorageOverlay]; ok {
return &val
}
return &resource.Quantity{}
}

View File

@ -77,6 +77,7 @@ go_test(
"kubelet_test.go",
"lifecycle_hook_test.go",
"local_storage_allocatable_eviction_test.go",
"local_storage_isolation_eviction_test.go",
"log_path_test.go",
"memory_eviction_test.go",
"mirror_pod_test.go",
@ -137,6 +138,7 @@ go_test(
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/util/uuid:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/watch:go_default_library",
"//vendor/k8s.io/apiserver/pkg/util/feature:go_default_library",
"//vendor/k8s.io/client-go/tools/cache:go_default_library",
],
)

View File

@ -0,0 +1,291 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e_node
import (
"fmt"
"time"
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilfeature "k8s.io/apiserver/pkg/util/feature"
"k8s.io/kubernetes/pkg/api/v1"
"k8s.io/kubernetes/pkg/apis/componentconfig"
"k8s.io/kubernetes/test/e2e/framework"
)
type podEvictSpec struct {
evicted bool
pod v1.Pod
}
const (
totalEvict = 3
)
// Eviction Policy is described here:
// https://github.com/kubernetes/kubernetes/blob/master/docs/proposals/kubelet-eviction.md
var _ = framework.KubeDescribe("LocalStorageCapacityIsolationEviction [Slow] [Serial] [Disruptive] [Feature:LocalStorageCapacityIsolation]", func() {
f := framework.NewDefaultFramework("localstorage-eviction-test")
emptyDirVolumeName := "volume-emptydir-pod"
podTestSpecs := []podEvictSpec{
{evicted: true, // This pod should be evicted because emptyDir (defualt storage type) usage violation
pod: v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: "emptydir-hog-pod"},
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyNever,
Containers: []v1.Container{
{
Image: "gcr.io/google_containers/busybox:1.24",
Name: "container-emptydir-hog-pod",
Command: []string{
"sh",
"-c",
"dd if=/dev/urandom of=target-file of=/cache/target-file bs=50000 count=1; while true; do sleep 5; done",
},
VolumeMounts: []v1.VolumeMount{
{
Name: emptyDirVolumeName,
MountPath: "/cache",
},
},
},
},
Volumes: []v1.Volume{
{
Name: emptyDirVolumeName,
VolumeSource: v1.VolumeSource{
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: *resource.NewQuantity(int64(1000), resource.BinarySI),
},
},
},
},
},
},
},
{evicted: true, // This pod should be evicted because emptyDir (memory type) usage violation
pod: v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: "emptydir-memory-pod"},
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyNever,
Containers: []v1.Container{
{
Image: "gcr.io/google_containers/busybox:1.24",
Name: "container-emptydir-memory-pod",
Command: []string{
"sh",
"-c",
"dd if=/dev/urandom of=target-file of=/cache/target-file bs=50000 count=1; while true; do sleep 5; done",
},
VolumeMounts: []v1.VolumeMount{
{
Name: emptyDirVolumeName,
MountPath: "/cache",
},
},
},
},
Volumes: []v1.Volume{
{
Name: emptyDirVolumeName,
VolumeSource: v1.VolumeSource{
EmptyDir: &v1.EmptyDirVolumeSource{
Medium: "Memory",
SizeLimit: *resource.NewQuantity(int64(10000), resource.BinarySI),
},
},
},
},
},
},
},
{evicted: false,
pod: v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: "container-emptydir-pod-critical"},
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyNever,
Containers: []v1.Container{
{
Image: "gcr.io/google_containers/busybox:1.24",
Name: "container-emptydir-hog-pod",
Command: []string{
"sh",
"-c",
"dd if=/dev/urandom of=target-file of=/cache/target-file bs=50000 count=1; while true; do sleep 5; done",
},
VolumeMounts: []v1.VolumeMount{
{
Name: emptyDirVolumeName,
MountPath: "/cache",
},
},
},
},
Volumes: []v1.Volume{
{
Name: emptyDirVolumeName,
VolumeSource: v1.VolumeSource{
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: *resource.NewQuantity(int64(100000), resource.BinarySI),
},
},
},
},
},
},
},
{evicted: true, // This pod should be evicted because container overlay usage violation
pod: v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: "container-hog-pod"},
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyNever,
Containers: []v1.Container{
{
Image: "gcr.io/google_containers/busybox:1.24",
Name: "container-hog-pod",
Command: []string{
"sh",
"-c",
"dd if=/dev/urandom of=target-file bs=50000 count=1; while true; do sleep 5; done",
},
Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{
v1.ResourceStorageOverlay: *resource.NewMilliQuantity(
int64(40000),
resource.BinarySI),
},
},
},
},
},
},
},
}
evictionTestTimeout := 4 * time.Minute
testCondition := "EmptyDir/ContainerOverlay usage limit violation"
Context(fmt.Sprintf("EmptyDirEviction when we run containers that should cause %s", testCondition), func() {
tempSetCurrentKubeletConfig(f, func(initialConfig *componentconfig.KubeletConfiguration) {
initialConfig.FeatureGates += ", LocalStorageCapacityIsolation=true"
})
err := utilfeature.DefaultFeatureGate.Set("LocalStorageCapacityIsolation=true")
if err != nil {
framework.Failf("Failed to enable feature gate for LocalStorageCapacityIsolation: %v", err)
return
}
runLocalStorageIsolationEvictionTest(f, testCondition, podTestSpecs, evictionTestTimeout, hasInodePressure)
})
})
// runLocalStorageEvictionTest sets up a testing environment given the provided nodes, and checks a few things:
// pods that exceed their local storage limit are evicted
// pods that didn't exceed their local storage limit are not evicted
// runLocalStorageEvictionTest then cleans up the testing environment by deleting provided nodes,
func runLocalStorageIsolationEvictionTest(f *framework.Framework, testCondition string, podTestSpecs []podEvictSpec, evictionTestTimeout time.Duration,
hasPressureCondition func(*framework.Framework, string) (bool, error)) {
Context(fmt.Sprintf("EmptyDirEviction when we run containers that should cause %s", testCondition), func() {
BeforeEach(func() {
By("seting up pods to be used by tests")
for _, spec := range podTestSpecs {
By(fmt.Sprintf("creating pod with container: %s", spec.pod.Name))
f.PodClient().CreateSync(&spec.pod)
}
})
It(fmt.Sprintf("Test should eventually see %s, and then evict the correct pods", testCondition), func() {
evictNum := 0
evictMap := make(map[string]string)
Eventually(func() error {
// Gather current information
updatedPodList, err := f.ClientSet.Core().Pods(f.Namespace.Name).List(metav1.ListOptions{})
if err != nil {
return fmt.Errorf("failed to get the list of pod: %v", err)
}
updatedPods := updatedPodList.Items
for _, p := range updatedPods {
framework.Logf("fetching pod %s; phase= %v", p.Name, p.Status.Phase)
for _, testPod := range podTestSpecs {
if p.Name == testPod.pod.Name {
if !testPod.evicted {
Expect(p.Status.Phase).NotTo(Equal(v1.PodFailed),
fmt.Sprintf("%s pod failed (and shouldn't have failed)", p.Name))
} else {
if _, ok := evictMap[p.Name]; !ok && p.Status.Phase == v1.PodFailed {
evictNum++
evictMap[p.Name] = p.Name
}
}
}
}
}
if evictNum == totalEvict {
return nil
}
return fmt.Errorf("pods that caused %s have not been evicted", testCondition)
}, evictionTestTimeout, evictionPollInterval).Should(BeNil())
By("making sure we can start a new pod after the test")
podName := "test-admit-pod"
f.PodClient().CreateSync(&v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,
},
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyNever,
Containers: []v1.Container{
{
Image: framework.GetPauseImageNameForHostArch(),
Name: podName,
},
},
},
})
})
AfterEach(func() {
By("deleting pods")
for _, spec := range podTestSpecs {
By(fmt.Sprintf("deleting pod: %s", spec.pod.Name))
f.PodClient().DeleteSync(spec.pod.Name, &metav1.DeleteOptions{}, framework.DefaultPodDeletionTimeout)
}
if CurrentGinkgoTestDescription().Failed {
if framework.TestContext.DumpLogsOnFailure {
logPodEvents(f)
logNodeEvents(f)
}
By("sleeping to allow for cleanup of test")
time.Sleep(postTestConditionMonitoringPeriod)
}
})
})
}