mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-22 11:21:47 +00:00
Merge pull request #42435 from dashpole/timestamps_for_fsstats
Automatic merge from submit-queue (batch tested with PRs 42369, 42375, 42397, 42435, 42455) [Bug Fix]: Avoid evicting more pods than necessary by adding Timestamps for fsstats and ignoring stale stats Continuation of #33121. Credit for most of this goes to @sjenning. I added volume fs timestamps. **why is this a bug** This PR attempts to fix part of https://github.com/kubernetes/kubernetes/issues/31362 which results in multiple pods getting evicted unnecessarily whenever the node runs into resource pressure. This PR reduces the chances of such disruptions by avoiding reacting to old/stale metrics. Without this PR, kubernetes nodes under resource pressure will cause unnecessary disruptions to user workloads. This PR will also help deflake a node e2e test suite. The eviction manager currently avoids evicting pods if metrics are old. However, timestamp data is not available for filesystem data, and this causes lots of extra evictions. See the [inode eviction test flakes](https://k8s-testgrid.appspot.com/google-node#kubelet-flaky-gce-e2e) for examples. This should probably be treated as a bugfix, as it should help mitigate extra evictions. cc: @kubernetes/sig-storage-pr-reviews @kubernetes/sig-node-pr-reviews @vishh @derekwaynecarr @sjenning
This commit is contained in:
commit
f9ccee7714
@ -191,6 +191,8 @@ type VolumeStats struct {
|
||||
|
||||
// FsStats contains data about filesystem usage.
|
||||
type FsStats struct {
|
||||
// The time at which these stats were updated.
|
||||
Time metav1.Time `json:"time"`
|
||||
// AvailableBytes represents the storage space available (bytes) for the filesystem.
|
||||
// +optional
|
||||
AvailableBytes *uint64 `json:"availableBytes,omitempty"`
|
||||
|
@ -664,14 +664,14 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider, nodeProvider
|
||||
result[evictionapi.SignalNodeFsAvailable] = signalObservation{
|
||||
available: resource.NewQuantity(int64(*nodeFs.AvailableBytes), resource.BinarySI),
|
||||
capacity: resource.NewQuantity(int64(*nodeFs.CapacityBytes), resource.BinarySI),
|
||||
// TODO: add timestamp to stat (see memory stat)
|
||||
time: nodeFs.Time,
|
||||
}
|
||||
}
|
||||
if nodeFs.InodesFree != nil && nodeFs.Inodes != nil {
|
||||
result[evictionapi.SignalNodeFsInodesFree] = signalObservation{
|
||||
available: resource.NewQuantity(int64(*nodeFs.InodesFree), resource.BinarySI),
|
||||
capacity: resource.NewQuantity(int64(*nodeFs.Inodes), resource.BinarySI),
|
||||
// TODO: add timestamp to stat (see memory stat)
|
||||
time: nodeFs.Time,
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -681,13 +681,13 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider, nodeProvider
|
||||
result[evictionapi.SignalImageFsAvailable] = signalObservation{
|
||||
available: resource.NewQuantity(int64(*imageFs.AvailableBytes), resource.BinarySI),
|
||||
capacity: resource.NewQuantity(int64(*imageFs.CapacityBytes), resource.BinarySI),
|
||||
// TODO: add timestamp to stat (see memory stat)
|
||||
time: imageFs.Time,
|
||||
}
|
||||
if imageFs.InodesFree != nil && imageFs.Inodes != nil {
|
||||
result[evictionapi.SignalImageFsInodesFree] = signalObservation{
|
||||
available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.BinarySI),
|
||||
capacity: resource.NewQuantity(int64(*imageFs.Inodes), resource.BinarySI),
|
||||
// TODO: add timestamp to stat (see memory stat)
|
||||
time: imageFs.Time,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -128,12 +128,14 @@ func (sb *summaryBuilder) build() (*stats.Summary, error) {
|
||||
}
|
||||
|
||||
rootStats := sb.containerInfoV2ToStats("", &rootInfo)
|
||||
cStats, _ := sb.latestContainerStats(&rootInfo)
|
||||
nodeStats := stats.NodeStats{
|
||||
NodeName: sb.node.Name,
|
||||
CPU: rootStats.CPU,
|
||||
Memory: rootStats.Memory,
|
||||
Network: sb.containerInfoV2ToNetworkStats("node:"+sb.node.Name, &rootInfo),
|
||||
Fs: &stats.FsStats{
|
||||
Time: metav1.NewTime(cStats.Timestamp),
|
||||
AvailableBytes: &sb.rootFsInfo.Available,
|
||||
CapacityBytes: &sb.rootFsInfo.Capacity,
|
||||
UsedBytes: &sb.rootFsInfo.Usage,
|
||||
@ -144,6 +146,7 @@ func (sb *summaryBuilder) build() (*stats.Summary, error) {
|
||||
StartTime: rootStats.StartTime,
|
||||
Runtime: &stats.RuntimeStats{
|
||||
ImageFs: &stats.FsStats{
|
||||
Time: metav1.NewTime(cStats.Timestamp),
|
||||
AvailableBytes: &sb.imageFsInfo.Available,
|
||||
CapacityBytes: &sb.imageFsInfo.Capacity,
|
||||
UsedBytes: &sb.imageStats.TotalStorageBytes,
|
||||
@ -181,8 +184,14 @@ func (sb *summaryBuilder) containerInfoV2FsStats(
|
||||
info *cadvisorapiv2.ContainerInfo,
|
||||
cs *stats.ContainerStats) {
|
||||
|
||||
lcs, found := sb.latestContainerStats(info)
|
||||
if !found {
|
||||
return
|
||||
}
|
||||
|
||||
// The container logs live on the node rootfs device
|
||||
cs.Logs = &stats.FsStats{
|
||||
Time: metav1.NewTime(lcs.Timestamp),
|
||||
AvailableBytes: &sb.rootFsInfo.Available,
|
||||
CapacityBytes: &sb.rootFsInfo.Capacity,
|
||||
InodesFree: sb.rootFsInfo.InodesFree,
|
||||
@ -196,15 +205,12 @@ func (sb *summaryBuilder) containerInfoV2FsStats(
|
||||
|
||||
// The container rootFs lives on the imageFs devices (which may not be the node root fs)
|
||||
cs.Rootfs = &stats.FsStats{
|
||||
Time: metav1.NewTime(lcs.Timestamp),
|
||||
AvailableBytes: &sb.imageFsInfo.Available,
|
||||
CapacityBytes: &sb.imageFsInfo.Capacity,
|
||||
InodesFree: sb.imageFsInfo.InodesFree,
|
||||
Inodes: sb.imageFsInfo.Inodes,
|
||||
}
|
||||
lcs, found := sb.latestContainerStats(info)
|
||||
if !found {
|
||||
return
|
||||
}
|
||||
cfs := lcs.Filesystem
|
||||
|
||||
if cfs != nil {
|
||||
|
@ -120,7 +120,7 @@ func (s *volumeStatCalculator) parsePodVolumeStats(podName string, metric *volum
|
||||
inodesUsed := uint64(metric.InodesUsed.Value())
|
||||
return stats.VolumeStats{
|
||||
Name: podName,
|
||||
FsStats: stats.FsStats{AvailableBytes: &available, CapacityBytes: &capacity, UsedBytes: &used,
|
||||
Inodes: &inodes, InodesFree: &inodesFree, InodesUsed: &inodesUsed},
|
||||
FsStats: stats.FsStats{Time: metric.Time, AvailableBytes: &available, CapacityBytes: &capacity,
|
||||
UsedBytes: &used, Inodes: &inodes, InodesFree: &inodesFree, InodesUsed: &inodesUsed},
|
||||
}
|
||||
}
|
||||
|
@ -50,7 +50,6 @@ go_test(
|
||||
name = "go_default_test",
|
||||
srcs = [
|
||||
"metrics_nil_test.go",
|
||||
"metrics_statfs_test.go",
|
||||
"plugins_test.go",
|
||||
"util_test.go",
|
||||
],
|
||||
@ -66,13 +65,15 @@ go_test(
|
||||
"//vendor:k8s.io/apimachinery/pkg/types",
|
||||
"//vendor:k8s.io/apimachinery/pkg/util/sets",
|
||||
"//vendor:k8s.io/apimachinery/pkg/watch",
|
||||
"//vendor:k8s.io/client-go/util/testing",
|
||||
],
|
||||
)
|
||||
|
||||
go_test(
|
||||
name = "go_default_xtest",
|
||||
srcs = ["metrics_du_test.go"],
|
||||
srcs = [
|
||||
"metrics_du_test.go",
|
||||
"metrics_statfs_test.go",
|
||||
],
|
||||
tags = ["automanaged"],
|
||||
deps = [
|
||||
"//pkg/volume:go_default_library",
|
||||
|
@ -18,6 +18,7 @@ package volume
|
||||
|
||||
import (
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/kubernetes/pkg/volume/util"
|
||||
)
|
||||
|
||||
@ -40,7 +41,7 @@ func NewMetricsDu(path string) MetricsProvider {
|
||||
// and gathering filesystem info for the Volume path.
|
||||
// See MetricsProvider.GetMetrics
|
||||
func (md *metricsDu) GetMetrics() (*Metrics, error) {
|
||||
metrics := &Metrics{}
|
||||
metrics := &Metrics{Time: metav1.Now()}
|
||||
if md.path == "" {
|
||||
return metrics, NewNoPathDefinedError()
|
||||
}
|
||||
|
@ -80,7 +80,7 @@ func TestMetricsDuRequirePath(t *testing.T) {
|
||||
metrics := NewMetricsDu("")
|
||||
actual, err := metrics.GetMetrics()
|
||||
expected := &Metrics{}
|
||||
if *actual != *expected {
|
||||
if !volumetest.MetricsEqualIgnoreTimestamp(actual, expected) {
|
||||
t.Errorf("Expected empty Metrics from uninitialized MetricsDu, actual %v", *actual)
|
||||
}
|
||||
if err == nil {
|
||||
@ -94,7 +94,7 @@ func TestMetricsDuRequireRealDirectory(t *testing.T) {
|
||||
metrics := NewMetricsDu("/not/a/real/directory")
|
||||
actual, err := metrics.GetMetrics()
|
||||
expected := &Metrics{}
|
||||
if *actual != *expected {
|
||||
if !volumetest.MetricsEqualIgnoreTimestamp(actual, expected) {
|
||||
t.Errorf("Expected empty Metrics from incorrectly initialized MetricsDu, actual %v", *actual)
|
||||
}
|
||||
if err == nil {
|
||||
|
@ -18,6 +18,7 @@ package volume
|
||||
|
||||
import (
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/kubernetes/pkg/volume/util"
|
||||
)
|
||||
|
||||
@ -39,7 +40,7 @@ func NewMetricsStatFS(path string) MetricsProvider {
|
||||
// GetMetrics calculates the volume usage and device free space by executing "du"
|
||||
// and gathering filesystem info for the Volume path.
|
||||
func (md *metricsStatFS) GetMetrics() (*Metrics, error) {
|
||||
metrics := &Metrics{}
|
||||
metrics := &Metrics{Time: metav1.Now()}
|
||||
if md.path == "" {
|
||||
return metrics, NewNoPathDefinedError()
|
||||
}
|
||||
|
@ -14,20 +14,22 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package volume
|
||||
package volume_test
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
utiltesting "k8s.io/client-go/util/testing"
|
||||
. "k8s.io/kubernetes/pkg/volume"
|
||||
volumetest "k8s.io/kubernetes/pkg/volume/testing"
|
||||
)
|
||||
|
||||
func TestGetMetricsStatFS(t *testing.T) {
|
||||
metrics := NewMetricsStatFS("")
|
||||
actual, err := metrics.GetMetrics()
|
||||
expected := &Metrics{}
|
||||
if *actual != *expected {
|
||||
if !volumetest.MetricsEqualIgnoreTimestamp(actual, expected) {
|
||||
t.Errorf("Expected empty Metrics from uninitialized MetricsStatFS, actual %v", *actual)
|
||||
}
|
||||
if err == nil {
|
||||
@ -36,7 +38,7 @@ func TestGetMetricsStatFS(t *testing.T) {
|
||||
|
||||
metrics = NewMetricsStatFS("/not/a/real/directory")
|
||||
actual, err = metrics.GetMetrics()
|
||||
if *actual != *expected {
|
||||
if !volumetest.MetricsEqualIgnoreTimestamp(actual, expected) {
|
||||
t.Errorf("Expected empty Metrics from incorrectly initialized MetricsStatFS, actual %v", *actual)
|
||||
}
|
||||
if err == nil {
|
||||
|
@ -754,3 +754,13 @@ func CreateTestPVC(capacity string, accessModes []v1.PersistentVolumeAccessMode)
|
||||
}
|
||||
return &claim
|
||||
}
|
||||
|
||||
// MetricsEqualIgnoreTimestamp reports whether a and b agree on every usage
// field (Available, Capacity, Used, Inodes, InodesFree, InodesUsed) while
// deliberately leaving the Time field out of the comparison, so callers can
// compare Metrics values whose timestamps differ.
//
// Both a and b are dereferenced without a nil check.
//
// NOTE(review): each field is compared with ==. If these fields are pointers
// (Used is declared as *resource.Quantity in the Metrics struct), this checks
// pointer identity rather than quantity value, so two Metrics with equal but
// separately allocated quantities would compare unequal - confirm that is
// acceptable for the intended callers.
func MetricsEqualIgnoreTimestamp(a *Metrics, b *Metrics) bool {
|
||||
available := a.Available == b.Available
|
||||
capacity := a.Capacity == b.Capacity
|
||||
used := a.Used == b.Used
|
||||
inodes := a.Inodes == b.Inodes
|
||||
inodesFree := a.InodesFree == b.InodesFree
|
||||
inodesUsed := a.InodesUsed == b.InodesUsed
|
||||
// Equal only when all six usage fields match; Time is intentionally not consulted.
return available && capacity && used && inodes && inodesFree && inodesUsed
|
||||
}
|
||||
|
@ -26,6 +26,7 @@ import (
|
||||
|
||||
"github.com/golang/glog"
|
||||
"k8s.io/apimachinery/pkg/api/resource"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/types"
|
||||
"k8s.io/kubernetes/pkg/api/v1"
|
||||
)
|
||||
@ -52,6 +53,9 @@ type MetricsProvider interface {
|
||||
|
||||
// Metrics represents the used and available bytes of the Volume.
|
||||
type Metrics struct {
|
||||
// The time at which these stats were updated.
|
||||
Time metav1.Time
|
||||
|
||||
// Used represents the total bytes used by the Volume.
|
||||
// Note: For block devices this may be more than the total size of the files.
|
||||
Used *resource.Quantity
|
||||
|
@ -116,6 +116,7 @@ var _ = framework.KubeDescribe("Summary API", func() {
|
||||
"MajorPageFaults": bounded(0, 10),
|
||||
}),
|
||||
"Rootfs": ptrMatchAllFields(gstruct.Fields{
|
||||
"Time": recent(maxStatsAge),
|
||||
"AvailableBytes": fsCapacityBounds,
|
||||
"CapacityBytes": fsCapacityBounds,
|
||||
"UsedBytes": bounded(kb, 10*mb),
|
||||
@ -124,6 +125,7 @@ var _ = framework.KubeDescribe("Summary API", func() {
|
||||
"InodesUsed": bounded(0, 1E8),
|
||||
}),
|
||||
"Logs": ptrMatchAllFields(gstruct.Fields{
|
||||
"Time": recent(maxStatsAge),
|
||||
"AvailableBytes": fsCapacityBounds,
|
||||
"CapacityBytes": fsCapacityBounds,
|
||||
"UsedBytes": bounded(kb, 10*mb),
|
||||
@ -145,6 +147,7 @@ var _ = framework.KubeDescribe("Summary API", func() {
|
||||
"test-empty-dir": gstruct.MatchAllFields(gstruct.Fields{
|
||||
"Name": Equal("test-empty-dir"),
|
||||
"FsStats": gstruct.MatchAllFields(gstruct.Fields{
|
||||
"Time": recent(maxStatsAge),
|
||||
"AvailableBytes": fsCapacityBounds,
|
||||
"CapacityBytes": fsCapacityBounds,
|
||||
"UsedBytes": bounded(kb, 1*mb),
|
||||
@ -183,6 +186,7 @@ var _ = framework.KubeDescribe("Summary API", func() {
|
||||
"TxErrors": bounded(0, 100000),
|
||||
})),
|
||||
"Fs": ptrMatchAllFields(gstruct.Fields{
|
||||
"Time": recent(maxStatsAge),
|
||||
"AvailableBytes": fsCapacityBounds,
|
||||
"CapacityBytes": fsCapacityBounds,
|
||||
"UsedBytes": bounded(kb, 10*gb),
|
||||
@ -192,6 +196,7 @@ var _ = framework.KubeDescribe("Summary API", func() {
|
||||
}),
|
||||
"Runtime": ptrMatchAllFields(gstruct.Fields{
|
||||
"ImageFs": ptrMatchAllFields(gstruct.Fields{
|
||||
"Time": recent(maxStatsAge),
|
||||
"AvailableBytes": fsCapacityBounds,
|
||||
"CapacityBytes": fsCapacityBounds,
|
||||
"UsedBytes": bounded(kb, 10*gb),
|
||||
|
Loading…
Reference in New Issue
Block a user