Merge pull request #42435 from dashpole/timestamps_for_fsstats

Automatic merge from submit-queue (batch tested with PRs 42369, 42375, 42397, 42435, 42455)

[Bug Fix]: Avoid evicting more pods than necessary by adding Timestamps for fsstats and ignoring stale stats

Continuation of #33121.  Credit for most of this goes to @sjenning.  I added volume fs timestamps.

**why is this a bug** 
This PR attempts to fix part of https://github.com/kubernetes/kubernetes/issues/31362 which results in multiple pods getting evicted unnecessarily whenever the node runs into resource pressure. This PR reduces the chances of such disruptions by avoiding reacting to old/stale metrics.
Without this PR, kubernetes nodes under resource pressure will cause unnecessary disruptions to user workloads. 
This PR will also help deflake a node e2e test suite.

The eviction manager currently avoids evicting pods if metrics are old.  However, timestamp data is not available for filesystem data, and this causes lots of extra evictions.
See the [inode eviction test flakes](https://k8s-testgrid.appspot.com/google-node#kubelet-flaky-gce-e2e) for examples.
This should probably be treated as a bugfix, as it should help mitigate extra evictions.

cc: @kubernetes/sig-storage-pr-reviews  @kubernetes/sig-node-pr-reviews @vishh @derekwaynecarr @sjenning
This commit is contained in:
Kubernetes Submit Queue 2017-03-03 23:21:48 -08:00 committed by GitHub
commit f9ccee7714
12 changed files with 52 additions and 20 deletions

View File

@ -191,6 +191,8 @@ type VolumeStats struct {
// FsStats contains data about filesystem usage.
type FsStats struct {
// The time at which these stats were updated.
Time metav1.Time `json:"time"`
// AvailableBytes represents the storage space available (bytes) for the filesystem.
// +optional
AvailableBytes *uint64 `json:"availableBytes,omitempty"`

View File

@ -664,14 +664,14 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider, nodeProvider
result[evictionapi.SignalNodeFsAvailable] = signalObservation{
available: resource.NewQuantity(int64(*nodeFs.AvailableBytes), resource.BinarySI),
capacity: resource.NewQuantity(int64(*nodeFs.CapacityBytes), resource.BinarySI),
// TODO: add timestamp to stat (see memory stat)
time: nodeFs.Time,
}
}
if nodeFs.InodesFree != nil && nodeFs.Inodes != nil {
result[evictionapi.SignalNodeFsInodesFree] = signalObservation{
available: resource.NewQuantity(int64(*nodeFs.InodesFree), resource.BinarySI),
capacity: resource.NewQuantity(int64(*nodeFs.Inodes), resource.BinarySI),
// TODO: add timestamp to stat (see memory stat)
time: nodeFs.Time,
}
}
}
@ -681,13 +681,13 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider, nodeProvider
result[evictionapi.SignalImageFsAvailable] = signalObservation{
available: resource.NewQuantity(int64(*imageFs.AvailableBytes), resource.BinarySI),
capacity: resource.NewQuantity(int64(*imageFs.CapacityBytes), resource.BinarySI),
// TODO: add timestamp to stat (see memory stat)
time: imageFs.Time,
}
if imageFs.InodesFree != nil && imageFs.Inodes != nil {
result[evictionapi.SignalImageFsInodesFree] = signalObservation{
available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.BinarySI),
capacity: resource.NewQuantity(int64(*imageFs.Inodes), resource.BinarySI),
// TODO: add timestamp to stat (see memory stat)
time: imageFs.Time,
}
}
}

View File

@ -128,12 +128,14 @@ func (sb *summaryBuilder) build() (*stats.Summary, error) {
}
rootStats := sb.containerInfoV2ToStats("", &rootInfo)
cStats, _ := sb.latestContainerStats(&rootInfo)
nodeStats := stats.NodeStats{
NodeName: sb.node.Name,
CPU: rootStats.CPU,
Memory: rootStats.Memory,
Network: sb.containerInfoV2ToNetworkStats("node:"+sb.node.Name, &rootInfo),
Fs: &stats.FsStats{
Time: metav1.NewTime(cStats.Timestamp),
AvailableBytes: &sb.rootFsInfo.Available,
CapacityBytes: &sb.rootFsInfo.Capacity,
UsedBytes: &sb.rootFsInfo.Usage,
@ -144,6 +146,7 @@ func (sb *summaryBuilder) build() (*stats.Summary, error) {
StartTime: rootStats.StartTime,
Runtime: &stats.RuntimeStats{
ImageFs: &stats.FsStats{
Time: metav1.NewTime(cStats.Timestamp),
AvailableBytes: &sb.imageFsInfo.Available,
CapacityBytes: &sb.imageFsInfo.Capacity,
UsedBytes: &sb.imageStats.TotalStorageBytes,
@ -181,8 +184,14 @@ func (sb *summaryBuilder) containerInfoV2FsStats(
info *cadvisorapiv2.ContainerInfo,
cs *stats.ContainerStats) {
lcs, found := sb.latestContainerStats(info)
if !found {
return
}
// The container logs live on the node rootfs device
cs.Logs = &stats.FsStats{
Time: metav1.NewTime(lcs.Timestamp),
AvailableBytes: &sb.rootFsInfo.Available,
CapacityBytes: &sb.rootFsInfo.Capacity,
InodesFree: sb.rootFsInfo.InodesFree,
@ -196,15 +205,12 @@ func (sb *summaryBuilder) containerInfoV2FsStats(
// The container rootFs lives on the imageFs devices (which may not be the node root fs)
cs.Rootfs = &stats.FsStats{
Time: metav1.NewTime(lcs.Timestamp),
AvailableBytes: &sb.imageFsInfo.Available,
CapacityBytes: &sb.imageFsInfo.Capacity,
InodesFree: sb.imageFsInfo.InodesFree,
Inodes: sb.imageFsInfo.Inodes,
}
lcs, found := sb.latestContainerStats(info)
if !found {
return
}
cfs := lcs.Filesystem
if cfs != nil {

View File

@ -120,7 +120,7 @@ func (s *volumeStatCalculator) parsePodVolumeStats(podName string, metric *volum
inodesUsed := uint64(metric.InodesUsed.Value())
return stats.VolumeStats{
Name: podName,
FsStats: stats.FsStats{AvailableBytes: &available, CapacityBytes: &capacity, UsedBytes: &used,
Inodes: &inodes, InodesFree: &inodesFree, InodesUsed: &inodesUsed},
FsStats: stats.FsStats{Time: metric.Time, AvailableBytes: &available, CapacityBytes: &capacity,
UsedBytes: &used, Inodes: &inodes, InodesFree: &inodesFree, InodesUsed: &inodesUsed},
}
}

View File

@ -50,7 +50,6 @@ go_test(
name = "go_default_test",
srcs = [
"metrics_nil_test.go",
"metrics_statfs_test.go",
"plugins_test.go",
"util_test.go",
],
@ -66,13 +65,15 @@ go_test(
"//vendor:k8s.io/apimachinery/pkg/types",
"//vendor:k8s.io/apimachinery/pkg/util/sets",
"//vendor:k8s.io/apimachinery/pkg/watch",
"//vendor:k8s.io/client-go/util/testing",
],
)
go_test(
name = "go_default_xtest",
srcs = ["metrics_du_test.go"],
srcs = [
"metrics_du_test.go",
"metrics_statfs_test.go",
],
tags = ["automanaged"],
deps = [
"//pkg/volume:go_default_library",

View File

@ -18,6 +18,7 @@ package volume
import (
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/kubernetes/pkg/volume/util"
)
@ -40,7 +41,7 @@ func NewMetricsDu(path string) MetricsProvider {
// and gathering filesystem info for the Volume path.
// See MetricsProvider.GetMetrics
func (md *metricsDu) GetMetrics() (*Metrics, error) {
metrics := &Metrics{}
metrics := &Metrics{Time: metav1.Now()}
if md.path == "" {
return metrics, NewNoPathDefinedError()
}

View File

@ -80,7 +80,7 @@ func TestMetricsDuRequirePath(t *testing.T) {
metrics := NewMetricsDu("")
actual, err := metrics.GetMetrics()
expected := &Metrics{}
if *actual != *expected {
if !volumetest.MetricsEqualIgnoreTimestamp(actual, expected) {
t.Errorf("Expected empty Metrics from uninitialized MetricsDu, actual %v", *actual)
}
if err == nil {
@ -94,7 +94,7 @@ func TestMetricsDuRequireRealDirectory(t *testing.T) {
metrics := NewMetricsDu("/not/a/real/directory")
actual, err := metrics.GetMetrics()
expected := &Metrics{}
if *actual != *expected {
if !volumetest.MetricsEqualIgnoreTimestamp(actual, expected) {
t.Errorf("Expected empty Metrics from incorrectly initialized MetricsDu, actual %v", *actual)
}
if err == nil {

View File

@ -18,6 +18,7 @@ package volume
import (
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/kubernetes/pkg/volume/util"
)
@ -39,7 +40,7 @@ func NewMetricsStatFS(path string) MetricsProvider {
// GetMetrics calculates the volume usage and device free space by executing "du"
// and gathering filesystem info for the Volume path.
func (md *metricsStatFS) GetMetrics() (*Metrics, error) {
metrics := &Metrics{}
metrics := &Metrics{Time: metav1.Now()}
if md.path == "" {
return metrics, NewNoPathDefinedError()
}

View File

@ -14,20 +14,22 @@ See the License for the specific language governing permissions and
limitations under the License.
*/
package volume
package volume_test
import (
"os"
"testing"
utiltesting "k8s.io/client-go/util/testing"
. "k8s.io/kubernetes/pkg/volume"
volumetest "k8s.io/kubernetes/pkg/volume/testing"
)
func TestGetMetricsStatFS(t *testing.T) {
metrics := NewMetricsStatFS("")
actual, err := metrics.GetMetrics()
expected := &Metrics{}
if *actual != *expected {
if !volumetest.MetricsEqualIgnoreTimestamp(actual, expected) {
t.Errorf("Expected empty Metrics from uninitialized MetricsStatFS, actual %v", *actual)
}
if err == nil {
@ -36,7 +38,7 @@ func TestGetMetricsStatFS(t *testing.T) {
metrics = NewMetricsStatFS("/not/a/real/directory")
actual, err = metrics.GetMetrics()
if *actual != *expected {
if !volumetest.MetricsEqualIgnoreTimestamp(actual, expected) {
t.Errorf("Expected empty Metrics from incorrectly initialized MetricsStatFS, actual %v", *actual)
}
if err == nil {

View File

@ -754,3 +754,13 @@ func CreateTestPVC(capacity string, accessModes []v1.PersistentVolumeAccessMode)
}
return &claim
}
func MetricsEqualIgnoreTimestamp(a *Metrics, b *Metrics) bool {
available := a.Available == b.Available
capacity := a.Capacity == b.Capacity
used := a.Used == b.Used
inodes := a.Inodes == b.Inodes
inodesFree := a.InodesFree == b.InodesFree
inodesUsed := a.InodesUsed == b.InodesUsed
return available && capacity && used && inodes && inodesFree && inodesUsed
}

View File

@ -26,6 +26,7 @@ import (
"github.com/golang/glog"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/kubernetes/pkg/api/v1"
)
@ -52,6 +53,9 @@ type MetricsProvider interface {
// Metrics represents the used and available bytes of the Volume.
type Metrics struct {
// The time at which these stats were updated.
Time metav1.Time
// Used represents the total bytes used by the Volume.
// Note: For block devices this maybe more than the total size of the files.
Used *resource.Quantity

View File

@ -116,6 +116,7 @@ var _ = framework.KubeDescribe("Summary API", func() {
"MajorPageFaults": bounded(0, 10),
}),
"Rootfs": ptrMatchAllFields(gstruct.Fields{
"Time": recent(maxStatsAge),
"AvailableBytes": fsCapacityBounds,
"CapacityBytes": fsCapacityBounds,
"UsedBytes": bounded(kb, 10*mb),
@ -124,6 +125,7 @@ var _ = framework.KubeDescribe("Summary API", func() {
"InodesUsed": bounded(0, 1E8),
}),
"Logs": ptrMatchAllFields(gstruct.Fields{
"Time": recent(maxStatsAge),
"AvailableBytes": fsCapacityBounds,
"CapacityBytes": fsCapacityBounds,
"UsedBytes": bounded(kb, 10*mb),
@ -145,6 +147,7 @@ var _ = framework.KubeDescribe("Summary API", func() {
"test-empty-dir": gstruct.MatchAllFields(gstruct.Fields{
"Name": Equal("test-empty-dir"),
"FsStats": gstruct.MatchAllFields(gstruct.Fields{
"Time": recent(maxStatsAge),
"AvailableBytes": fsCapacityBounds,
"CapacityBytes": fsCapacityBounds,
"UsedBytes": bounded(kb, 1*mb),
@ -183,6 +186,7 @@ var _ = framework.KubeDescribe("Summary API", func() {
"TxErrors": bounded(0, 100000),
})),
"Fs": ptrMatchAllFields(gstruct.Fields{
"Time": recent(maxStatsAge),
"AvailableBytes": fsCapacityBounds,
"CapacityBytes": fsCapacityBounds,
"UsedBytes": bounded(kb, 10*gb),
@ -192,6 +196,7 @@ var _ = framework.KubeDescribe("Summary API", func() {
}),
"Runtime": ptrMatchAllFields(gstruct.Fields{
"ImageFs": ptrMatchAllFields(gstruct.Fields{
"Time": recent(maxStatsAge),
"AvailableBytes": fsCapacityBounds,
"CapacityBytes": fsCapacityBounds,
"UsedBytes": bounded(kb, 10*gb),