From 552e4d3a9ddb102bce50d4ddbad164880cf2f68c Mon Sep 17 00:00:00 2001 From: Szymon Scharmach Date: Wed, 18 Oct 2017 14:43:55 +0200 Subject: [PATCH] Cpu manager reconcile loop can restore state --- pkg/kubelet/cm/cpumanager/cpu_manager.go | 23 ++++++++++++++++++++-- pkg/kubelet/cm/cpumanager/policy.go | 2 ++ pkg/kubelet/cm/cpumanager/policy_static.go | 8 +++++++- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/pkg/kubelet/cm/cpumanager/cpu_manager.go b/pkg/kubelet/cm/cpumanager/cpu_manager.go index 9dc6b98b4e7..6e1fd9cacb1 100644 --- a/pkg/kubelet/cm/cpumanager/cpu_manager.go +++ b/pkg/kubelet/cm/cpumanager/cpu_manager.go @@ -160,8 +160,8 @@ func NewManager( } func (m *manager) Start(activePods ActivePodsFunc, podStatusProvider status.PodStatusProvider, containerRuntime runtimeService) { - glog.Infof("[cpumanger] starting with %s policy", m.policy.Name()) - glog.Infof("[cpumanger] reconciling every %v", m.reconcilePeriod) + glog.Infof("[cpumanager] starting with %s policy", m.policy.Name()) + glog.Infof("[cpumanager] reconciling every %v", m.reconcilePeriod) m.activePods = activePods m.podStatusProvider = podStatusProvider @@ -242,6 +242,25 @@ func (m *manager) reconcileState() (success []reconciledContainer, failure []rec continue } + // Check whether container is present in state, there may be 3 reasons why it's not present: + // - policy does not want to track the container + // - kubelet has just been restarted - and there is no previous state file + // - container has been removed from state by RemoveContainer call (DeletionTimestamp is set) + if _, ok := m.state.GetCPUSet(containerID); !ok { + if status.Phase == v1.PodRunning && pod.DeletionTimestamp == nil { + glog.V(4).Infof("[cpumanager] reconcileState: container is not present in state - trying to add (pod: %s, container: %s, container id: %s)", pod.Name, container.Name, containerID) + err := m.AddContainer(pod, &container, containerID) + if err != nil { + glog.Errorf("[cpumanager] 
reconcileState: failed to add container (pod: %s, container: %s, container id: %s, error: %v)", pod.Name, container.Name, containerID, err) + failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID}) + } + } else { + // if DeletionTimestamp is set, pod has already been removed from state + // skip the pod/container since it's not running and will be deleted soon + continue + } + } + cset := m.state.GetCPUSetOrDefault(containerID) if cset.IsEmpty() { // NOTE: This should not happen outside of tests. diff --git a/pkg/kubelet/cm/cpumanager/policy.go b/pkg/kubelet/cm/cpumanager/policy.go index 39eb76316b1..c79091659e3 100644 --- a/pkg/kubelet/cm/cpumanager/policy.go +++ b/pkg/kubelet/cm/cpumanager/policy.go @@ -25,6 +25,8 @@ import ( type Policy interface { Name() string Start(s state.State) + // AddContainer call is idempotent AddContainer(s state.State, pod *v1.Pod, container *v1.Container, containerID string) error + // RemoveContainer call is idempotent RemoveContainer(s state.State, containerID string) error } diff --git a/pkg/kubelet/cm/cpumanager/policy_static.go b/pkg/kubelet/cm/cpumanager/policy_static.go index dfbb0a297d0..9a461bacb63 100644 --- a/pkg/kubelet/cm/cpumanager/policy_static.go +++ b/pkg/kubelet/cm/cpumanager/policy_static.go @@ -156,9 +156,15 @@ func (p *staticPolicy) assignableCPUs(s state.State) cpuset.CPUSet { } func (p *staticPolicy) AddContainer(s state.State, pod *v1.Pod, container *v1.Container, containerID string) error { - glog.Infof("[cpumanager] static policy: AddContainer (pod: %s, container: %s, container id: %s)", pod.Name, container.Name, containerID) if numCPUs := guaranteedCPUs(pod, container); numCPUs != 0 { + glog.Infof("[cpumanager] static policy: AddContainer (pod: %s, container: %s, container id: %s)", pod.Name, container.Name, containerID) // container belongs in an exclusively allocated pool + + if _, ok := s.GetCPUSet(containerID); ok { + glog.Infof("[cpumanager] static policy: container 
already present in state, skipping (container: %s, container id: %s)", container.Name, containerID) + return nil + } + cpuset, err := p.allocateCPUs(s, numCPUs) if err != nil { glog.Errorf("[cpumanager] unable to allocate %d CPUs (container id: %s, error: %v)", numCPUs, containerID, err)