Merge pull request #59769 from dashpole/capacity_ephemeral_storage
Automatic merge from submit-queue. If you want to cherry-pick this change to another branch, please follow the instructions [here](https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md).

**Collect ephemeral storage capacity on initialization**

**What this PR does / why we need it**:
We have had some node e2e flakes where a pod can be rejected if it requests ephemeral storage. This is because we do not set capacity and allocatable for ephemeral storage on initialization. This PR has the kubelet trigger one round of cAdvisor stats collection during initialization, so that the disk capacity is available when the kubelet first sets the node status. It also sets the node to NotReady if capacities have not been initialized yet.

**Special notes for your reviewer**:

**Release note**:
```release-note
NONE
```

/assign @jingxu97 @Random-Liu
/sig node
/kind bug
/priority important-soon
This commit is contained in: 244549f02a
```diff
@@ -533,6 +533,14 @@ func (cm *containerManagerImpl) Start(node *v1.Node,
 	// allocatable of the node
 	cm.nodeInfo = node
 
+	rootfs, err := cm.cadvisorInterface.RootFsInfo()
+	if err != nil {
+		return fmt.Errorf("failed to get rootfs info: %v", err)
+	}
+	for rName, rCap := range cadvisor.EphemeralStorageCapacityFromFsInfo(rootfs) {
+		cm.capacity[rName] = rCap
+	}
+
 	// Ensure that node allocatable configuration is valid.
 	if err := cm.validateNodeAllocatable(); err != nil {
 		return err
```
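For context, `cadvisor.EphemeralStorageCapacityFromFsInfo` (consumed above as a map of resource name to quantity) converts cAdvisor's root filesystem info into capacity entries. A minimal sketch of that helper's plausible shape, inferred from the usage here rather than copied from this tree:

```go
package cadvisor

import (
	cadvisorapi2 "github.com/google/cadvisor/info/v2"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// EphemeralStorageCapacityFromFsInfo maps cAdvisor filesystem info to a
// one-entry ResourceList keyed by ephemeral-storage. Sketch only: the real
// helper lives in the kubelet's cadvisor package.
func EphemeralStorageCapacityFromFsInfo(info cadvisorapi2.FsInfo) v1.ResourceList {
	return v1.ResourceList{
		// FsInfo.Capacity is the filesystem size in bytes.
		v1.ResourceEphemeralStorage: *resource.NewQuantity(int64(info.Capacity), resource.BinarySI),
	}
}
```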
```diff
@@ -575,36 +583,11 @@
 		}, 5*time.Minute, wait.NeverStop)
 	}
 
-	// Local storage filesystem information from `RootFsInfo` and `ImagesFsInfo` is available at a later time
-	// depending on the time when cadvisor manager updates container stats. Therefore use a go routine to keep
-	// retrieving the information until it is available.
-	stopChan := make(chan struct{})
-	go wait.Until(func() {
-		if err := cm.setFsCapacity(); err != nil {
-			glog.Errorf("[ContainerManager]: %v", err)
-			return
-		}
-		close(stopChan)
-	}, time.Second, stopChan)
-
 	// Starts device manager.
 	if err := cm.deviceManager.Start(devicemanager.ActivePodsFunc(activePods), sourcesReady); err != nil {
 		return err
 	}
 	return nil
 }
 
-func (cm *containerManagerImpl) setFsCapacity() error {
-	rootfs, err := cm.cadvisorInterface.RootFsInfo()
-	if err != nil {
-		return fmt.Errorf("Fail to get rootfs information %v", err)
-	}
-
-	cm.Lock()
-	for rName, rCap := range cadvisor.EphemeralStorageCapacityFromFsInfo(rootfs) {
-		cm.capacity[rName] = rCap
-	}
-	cm.Unlock()
-	return nil
-}
-
```
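The deleted goroutine used a common self-stopping `wait.Until` idiom: poll a setup step on each tick, and end the loop by closing its own stop channel once the step succeeds. A minimal runnable sketch of that idiom, where `trySetup` is a hypothetical stand-in for `cm.setFsCapacity`:

```go
package main

import (
	"errors"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	attempts := 0
	// trySetup fails until the information it depends on becomes available.
	trySetup := func() error {
		attempts++
		if attempts < 3 {
			return errors.New("stats not collected yet")
		}
		return nil
	}

	stopCh := make(chan struct{})
	go wait.Until(func() {
		if err := trySetup(); err != nil {
			fmt.Printf("retrying: %v\n", err)
			return // try again on the next tick
		}
		close(stopCh) // success: closing stopCh terminates wait.Until
	}, 100*time.Millisecond, stopCh)

	<-stopCh
	fmt.Println("setup complete after", attempts, "attempts")
}
```

With capacity now collected synchronously in `Start`, this retry loop is unnecessary, and `cm.capacity` is no longer written concurrently, which is why the next hunk also drops the read lock from `GetCapacity`.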
```diff
@@ -884,8 +867,6 @@ func getDockerAPIVersion(cadvisor cadvisor.Interface) *utilversion.Version {
 }
 
 func (cm *containerManagerImpl) GetCapacity() v1.ResourceList {
-	cm.RLock()
-	defer cm.RUnlock()
 	return cm.capacity
 }
 
```
```diff
@@ -1294,16 +1294,6 @@ func (kl *Kubelet) initializeModules() error {
 		kl.serverCertificateManager.Start()
 	}
 
-	// Start container manager.
-	node, err := kl.getNodeAnyWay()
-	if err != nil {
-		return fmt.Errorf("Kubelet failed to get node info: %v", err)
-	}
-
-	if err := kl.containerManager.Start(node, kl.GetActivePods, kl.sourcesReady, kl.statusManager, kl.runtimeService); err != nil {
-		return fmt.Errorf("Failed to start ContainerManager %v", err)
-	}
-
 	// Start out of memory watcher.
 	if err := kl.oomWatcher.Start(kl.nodeRef); err != nil {
 		return fmt.Errorf("Failed to start OOM watcher %v", err)
```
```diff
@@ -1329,6 +1319,21 @@ func (kl *Kubelet) initializeRuntimeDependentModules() {
 	}
 	// eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs
 	kl.evictionManager.Start(kl.StatsProvider, kl.GetActivePods, kl.podResourcesAreReclaimed, kl.containerManager, evictionMonitoringPeriod)
+
+	// trigger on-demand stats collection once so that we have capacity information for ephemeral storage.
+	// ignore any errors, since if stats collection is not successful, the container manager will fail to start below.
+	kl.StatsProvider.GetCgroupStats("/", true)
+	// Start container manager.
+	node, err := kl.getNodeAnyWay()
+	if err != nil {
+		// Fail kubelet and rely on the babysitter to retry starting kubelet.
+		glog.Fatalf("Kubelet failed to get node info: %v", err)
+	}
+	// containerManager must start after cAdvisor because it needs filesystem capacity information
+	if err := kl.containerManager.Start(node, kl.GetActivePods, kl.sourcesReady, kl.statusManager, kl.runtimeService); err != nil {
+		// Fail kubelet and rely on the babysitter to retry starting kubelet.
+		glog.Fatalf("Failed to start ContainerManager %v", err)
+	}
 }
 
 // Run starts the kubelet reacting to config updates
```
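The key to the new ordering is the `updateStats` flag on the stats provider: passing `true` forces an on-demand cAdvisor collection instead of returning cached values, which is what seeds ephemeral-storage capacity before the container manager starts. A trimmed sketch of the provider's shape as read from the call above (the return types are an assumption, not verified against this tree):

```go
package stats

import (
	statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
)

// Provider is an assumed, trimmed sketch of the kubelet stats provider used
// by initializeRuntimeDependentModules. cgroupName "/" targets the root
// cgroup; updateStats=true triggers a fresh round of collection.
type Provider interface {
	GetCgroupStats(cgroupName string, updateStats bool) (*statsapi.ContainerStats, *statsapi.NetworkStats, error)
}
```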
```diff
@@ -735,17 +735,28 @@ func (kl *Kubelet) setNodeReadyCondition(node *v1.Node) {
 	// This is due to an issue with version skewed kubelet and master components.
 	// ref: https://github.com/kubernetes/kubernetes/issues/16961
 	currentTime := metav1.NewTime(kl.clock.Now())
-	var newNodeReadyCondition v1.NodeCondition
-	rs := append(kl.runtimeState.runtimeErrors(), kl.runtimeState.networkErrors()...)
-	if len(rs) == 0 {
-		newNodeReadyCondition = v1.NodeCondition{
-			Type:              v1.NodeReady,
-			Status:            v1.ConditionTrue,
-			Reason:            "KubeletReady",
-			Message:           "kubelet is posting ready status",
-			LastHeartbeatTime: currentTime,
-		}
-	} else {
+	newNodeReadyCondition := v1.NodeCondition{
+		Type:              v1.NodeReady,
+		Status:            v1.ConditionTrue,
+		Reason:            "KubeletReady",
+		Message:           "kubelet is posting ready status",
+		LastHeartbeatTime: currentTime,
+	}
+	rs := append(kl.runtimeState.runtimeErrors(), kl.runtimeState.networkErrors()...)
+	requiredCapacities := []v1.ResourceName{v1.ResourceCPU, v1.ResourceMemory, v1.ResourcePods}
+	if utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
+		requiredCapacities = append(requiredCapacities, v1.ResourceEphemeralStorage)
+	}
+	missingCapacities := []string{}
+	for _, resource := range requiredCapacities {
+		if _, found := node.Status.Capacity[resource]; !found {
+			missingCapacities = append(missingCapacities, string(resource))
+		}
+	}
+	if len(missingCapacities) > 0 {
+		rs = append(rs, fmt.Sprintf("Missing node capacity for resources: %s", strings.Join(missingCapacities, ", ")))
+	}
+	if len(rs) > 0 {
 		newNodeReadyCondition = v1.NodeCondition{
 			Type:              v1.NodeReady,
 			Status:            v1.ConditionFalse,
```
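The readiness gate added above can be read as a small pure function: given the node's reported capacities and the feature gate, compute which required resources are still missing. A self-contained sketch mirroring that logic (`missingCapacities` as a standalone function and its boolean parameter are illustrative, not the actual kubelet helper):

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// missingCapacities mirrors the check in setNodeReadyCondition: a node stays
// NotReady until every required resource has a reported capacity.
func missingCapacities(capacity v1.ResourceList, localStorageIsolation bool) []string {
	required := []v1.ResourceName{v1.ResourceCPU, v1.ResourceMemory, v1.ResourcePods}
	if localStorageIsolation {
		required = append(required, v1.ResourceEphemeralStorage)
	}
	missing := []string{}
	for _, r := range required {
		if _, found := capacity[r]; !found {
			missing = append(missing, string(r))
		}
	}
	return missing
}

func main() {
	// Capacity before the first cAdvisor stats round: no ephemeral-storage yet.
	capacity := v1.ResourceList{
		v1.ResourceCPU:    resource.MustParse("2"),
		v1.ResourceMemory: resource.MustParse("4Gi"),
		v1.ResourcePods:   resource.MustParse("110"),
	}
	fmt.Println(missingCapacities(capacity, true)) // [ephemeral-storage]
}
```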
```diff
@@ -249,6 +249,9 @@ func TestUpdateNewNodeStatus(t *testing.T) {
 		ContainerOsVersion:      "Debian GNU/Linux 7 (wheezy)",
 	}
 	mockCadvisor.On("VersionInfo").Return(versionInfo, nil)
+	maxAge := 0 * time.Second
+	options := cadvisorapiv2.RequestOptions{IdType: cadvisorapiv2.TypeName, Count: 2, Recursive: false, MaxAge: &maxAge}
+	mockCadvisor.On("ContainerInfoV2", "/", options).Return(map[string]cadvisorapiv2.ContainerInfo{}, nil)
 
 	expectedNode := &v1.Node{
 		ObjectMeta: metav1.ObjectMeta{Name: testKubeletHostname},
@@ -441,6 +444,9 @@ func TestUpdateExistingNodeStatus(t *testing.T) {
 		ContainerOsVersion:      "Debian GNU/Linux 7 (wheezy)",
 	}
 	mockCadvisor.On("VersionInfo").Return(versionInfo, nil)
+	maxAge := 0 * time.Second
+	options := cadvisorapiv2.RequestOptions{IdType: cadvisorapiv2.TypeName, Count: 2, Recursive: false, MaxAge: &maxAge}
+	mockCadvisor.On("ContainerInfoV2", "/", options).Return(map[string]cadvisorapiv2.ContainerInfo{}, nil)
 
 	expectedNode := &v1.Node{
 		ObjectMeta: metav1.ObjectMeta{Name: testKubeletHostname},
@@ -645,6 +651,9 @@ func TestUpdateNodeStatusWithRuntimeStateError(t *testing.T) {
 		ContainerOsVersion:      "Debian GNU/Linux 7 (wheezy)",
 	}
 	mockCadvisor.On("VersionInfo").Return(versionInfo, nil)
+	maxAge := 0 * time.Second
+	options := cadvisorapiv2.RequestOptions{IdType: cadvisorapiv2.TypeName, Count: 2, Recursive: false, MaxAge: &maxAge}
+	mockCadvisor.On("ContainerInfoV2", "/", options).Return(map[string]cadvisorapiv2.ContainerInfo{}, nil)
 
 	expectedNode := &v1.Node{
 		ObjectMeta: metav1.ObjectMeta{Name: testKubeletHostname},
@@ -1122,6 +1131,9 @@ func TestUpdateNewNodeStatusTooLargeReservation(t *testing.T) {
 		ContainerOsVersion:      "Debian GNU/Linux 7 (wheezy)",
 	}
 	mockCadvisor.On("VersionInfo").Return(versionInfo, nil)
+	maxAge := 0 * time.Second
+	options := cadvisorapiv2.RequestOptions{IdType: cadvisorapiv2.TypeName, Count: 2, Recursive: false, MaxAge: &maxAge}
+	mockCadvisor.On("ContainerInfoV2", "/", options).Return(map[string]cadvisorapiv2.ContainerInfo{}, nil)
 
 	expectedNode := &v1.Node{
 		ObjectMeta: metav1.ObjectMeta{Name: testKubeletHostname},
```
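Each node-status test gains the same stub because node-status setup now issues an on-demand stats request against the mocked cAdvisor. A small sketch of how those RequestOptions read (the field comments are my interpretation of the cAdvisor v2 API, not documentation from this tree):

```go
package main

import (
	"fmt"
	"time"

	cadvisorapiv2 "github.com/google/cadvisor/info/v2"
)

func buildOptions() cadvisorapiv2.RequestOptions {
	maxAge := 0 * time.Second // MaxAge of zero: always collect fresh stats rather than serve cached ones
	return cadvisorapiv2.RequestOptions{
		IdType:    cadvisorapiv2.TypeName, // look containers up by cgroup name, here "/"
		Count:     2,                      // number of recent stats samples to return
		Recursive: false,                  // the root cgroup only, not its children
		MaxAge:    &maxAge,
	}
}

func main() {
	fmt.Printf("%+v\n", buildOptions())
}
```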