migrate cache.go to structured logging

This commit is contained in:
Shivanshu Raj Shrivastava 2021-11-07 19:19:14 +05:30
parent d92a443ca7
commit bb31040f06
No known key found for this signature in database
GPG Key ID: F4EDF2ED69728499

View File

@ -18,6 +18,7 @@ package cache
import ( import (
"fmt" "fmt"
"os"
"sync" "sync"
"time" "time"
@ -123,7 +124,7 @@ func newNodeInfoListItem(ni *framework.NodeInfo) *nodeInfoListItem {
func (cache *schedulerCache) moveNodeInfoToHead(name string) { func (cache *schedulerCache) moveNodeInfoToHead(name string) {
ni, ok := cache.nodes[name] ni, ok := cache.nodes[name]
if !ok { if !ok {
klog.Errorf("No NodeInfo with name %v found in the cache", name) klog.ErrorS(nil, "No node info with given name found in the cache", "node", klog.KRef("", name))
return return
} }
// if the node info list item is already at the head, we are done. // if the node info list item is already at the head, we are done.
@ -151,7 +152,7 @@ func (cache *schedulerCache) moveNodeInfoToHead(name string) {
func (cache *schedulerCache) removeNodeInfoFromList(name string) { func (cache *schedulerCache) removeNodeInfoFromList(name string) {
ni, ok := cache.nodes[name] ni, ok := cache.nodes[name]
if !ok { if !ok {
klog.Errorf("No NodeInfo with name %v found in the cache", name) klog.ErrorS(nil, "No node info with given name found in the cache", "node", klog.KRef("", name))
return return
} }
@ -264,7 +265,7 @@ func (cache *schedulerCache) UpdateSnapshot(nodeSnapshot *Snapshot) error {
", trying to recover", ", trying to recover",
len(nodeSnapshot.nodeInfoList), cache.nodeTree.numNodes, len(nodeSnapshot.nodeInfoList), cache.nodeTree.numNodes,
len(nodeSnapshot.nodeInfoMap), len(cache.nodes)) len(nodeSnapshot.nodeInfoMap), len(cache.nodes))
klog.Error(errMsg) klog.ErrorS(nil, errMsg)
// We will try to recover by re-creating the lists for the next scheduling cycle, but still return an // We will try to recover by re-creating the lists for the next scheduling cycle, but still return an
// error to surface the problem, the error will likely cause a failure to the current scheduling cycle. // error to surface the problem, the error will likely cause a failure to the current scheduling cycle.
cache.updateNodeInfoSnapshotList(nodeSnapshot, true) cache.updateNodeInfoSnapshotList(nodeSnapshot, true)
@ -282,7 +283,7 @@ func (cache *schedulerCache) updateNodeInfoSnapshotList(snapshot *Snapshot, upda
snapshot.nodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes) snapshot.nodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
nodesList, err := cache.nodeTree.list() nodesList, err := cache.nodeTree.list()
if err != nil { if err != nil {
klog.Error(err) klog.ErrorS(err, "Error occurred while retrieving the list of names of the nodes from node tree")
} }
for _, nodeName := range nodesList { for _, nodeName := range nodesList {
if nodeInfo := snapshot.nodeInfoMap[nodeName]; nodeInfo != nil { if nodeInfo := snapshot.nodeInfoMap[nodeName]; nodeInfo != nil {
@ -294,7 +295,7 @@ func (cache *schedulerCache) updateNodeInfoSnapshotList(snapshot *Snapshot, upda
snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = append(snapshot.havePodsWithRequiredAntiAffinityNodeInfoList, nodeInfo) snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = append(snapshot.havePodsWithRequiredAntiAffinityNodeInfoList, nodeInfo)
} }
} else { } else {
klog.Errorf("node %q exist in nodeTree but not in NodeInfoMap, this should not happen.", nodeName) klog.ErrorS(nil, "Node exists in nodeTree but not in NodeInfoMap, this should not happen", "node", klog.KRef("", nodeName))
} }
} }
} else { } else {
@ -381,7 +382,7 @@ func (cache *schedulerCache) finishBinding(pod *v1.Pod, now time.Time) error {
cache.mu.RLock() cache.mu.RLock()
defer cache.mu.RUnlock() defer cache.mu.RUnlock()
klog.V(5).Infof("Finished binding for pod %v. Can be expired.", key) klog.V(5).InfoS("Finished binding for pod, can be expired", "pod", klog.KObj(pod))
currState, ok := cache.podStates[key] currState, ok := cache.podStates[key]
if ok && cache.assumedPods.Has(key) { if ok && cache.assumedPods.Has(key) {
dl := now.Add(cache.ttl) dl := now.Add(cache.ttl)
@ -447,7 +448,7 @@ func (cache *schedulerCache) updatePod(oldPod, newPod *v1.Pod) error {
func (cache *schedulerCache) removePod(pod *v1.Pod) error { func (cache *schedulerCache) removePod(pod *v1.Pod) error {
n, ok := cache.nodes[pod.Spec.NodeName] n, ok := cache.nodes[pod.Spec.NodeName]
if !ok { if !ok {
klog.Errorf("node %v not found when trying to remove pod %v", pod.Spec.NodeName, pod.Name) klog.ErrorS(nil, "Node not found when trying to remove pod", "node", klog.KRef("", pod.Spec.NodeName), "pod", klog.KObj(pod))
return nil return nil
} }
if err := n.info.RemovePod(pod); err != nil { if err := n.info.RemovePod(pod); err != nil {
@ -475,10 +476,10 @@ func (cache *schedulerCache) AddPod(pod *v1.Pod) error {
case ok && cache.assumedPods.Has(key): case ok && cache.assumedPods.Has(key):
if currState.pod.Spec.NodeName != pod.Spec.NodeName { if currState.pod.Spec.NodeName != pod.Spec.NodeName {
// The pod was added to a different node than it was assumed to. // The pod was added to a different node than it was assumed to.
klog.Warningf("Pod %v was assumed to be on %v but got added to %v", key, pod.Spec.NodeName, currState.pod.Spec.NodeName) klog.InfoS("Pod was added to a different node than it was assumed", "pod", klog.KObj(pod), "assumedNode", klog.KRef("", pod.Spec.NodeName), "currentNode", klog.KRef("", currState.pod.Spec.NodeName))
// Clean this up. // Clean this up.
if err = cache.removePod(currState.pod); err != nil { if err = cache.removePod(currState.pod); err != nil {
klog.Errorf("removing pod error: %v", err) klog.ErrorS(err, "Error occurred while removing pod")
} }
cache.addPod(pod) cache.addPod(pod)
} }
@ -513,8 +514,9 @@ func (cache *schedulerCache) UpdatePod(oldPod, newPod *v1.Pod) error {
// before Update event, in which case the state would change from Assumed to Added. // before Update event, in which case the state would change from Assumed to Added.
case ok && !cache.assumedPods.Has(key): case ok && !cache.assumedPods.Has(key):
if currState.pod.Spec.NodeName != newPod.Spec.NodeName { if currState.pod.Spec.NodeName != newPod.Spec.NodeName {
klog.Errorf("Pod %v updated on a different node than previously added to.", key) klog.ErrorS(nil, "Pod updated on a different node than previously added to", "pod", klog.KObj(oldPod))
klog.Fatalf("Schedulercache is corrupted and can badly affect scheduling decisions") klog.ErrorS(nil, "SchedulerCache is corrupted and can badly affect scheduling decisions")
os.Exit(1)
} }
if err := cache.updatePod(oldPod, newPod); err != nil { if err := cache.updatePod(oldPod, newPod); err != nil {
return err return err
@ -539,11 +541,12 @@ func (cache *schedulerCache) RemovePod(pod *v1.Pod) error {
switch { switch {
case ok: case ok:
if currState.pod.Spec.NodeName != pod.Spec.NodeName { if currState.pod.Spec.NodeName != pod.Spec.NodeName {
klog.Errorf("Pod %v was assumed to be on %v but got added to %v", key, pod.Spec.NodeName, currState.pod.Spec.NodeName) klog.ErrorS(nil, "Pod was added to a different node than it was assumed", "pod", klog.KObj(pod), "assumedNode", klog.KRef("", pod.Spec.NodeName), "currentNode", klog.KRef("", currState.pod.Spec.NodeName))
if pod.Spec.NodeName != "" { if pod.Spec.NodeName != "" {
// An empty NodeName is possible when the scheduler misses a Delete // An empty NodeName is possible when the scheduler misses a Delete
// event and it gets the last known state from the informer cache. // event and it gets the last known state from the informer cache.
klog.Fatalf("Schedulercache is corrupted and can badly affect scheduling decisions") klog.ErrorS(nil, "SchedulerCache is corrupted and can badly affect scheduling decisions")
os.Exit(1)
} }
} }
return cache.expirePod(key, currState) return cache.expirePod(key, currState)
@ -723,17 +726,18 @@ func (cache *schedulerCache) cleanupAssumedPods(now time.Time) {
for key := range cache.assumedPods { for key := range cache.assumedPods {
ps, ok := cache.podStates[key] ps, ok := cache.podStates[key]
if !ok { if !ok {
klog.Fatal("Key found in assumed set but not in podStates. Potentially a logical error.") klog.ErrorS(nil, "Key found in assumed set but not in podStates, potentially a logical error")
os.Exit(1)
} }
if !ps.bindingFinished { if !ps.bindingFinished {
klog.V(5).Infof("Couldn't expire cache for pod %v/%v. Binding is still in progress.", klog.V(5).InfoS("Could not expire cache for pod as binding is still in progress",
ps.pod.Namespace, ps.pod.Name) "pod", klog.KObj(ps.pod))
continue continue
} }
if now.After(*ps.deadline) { if now.After(*ps.deadline) {
klog.Warningf("Pod %s/%s expired", ps.pod.Namespace, ps.pod.Name) klog.InfoS("Pod expired", "pod", klog.KObj(ps.pod))
if err := cache.expirePod(key, ps); err != nil { if err := cache.expirePod(key, ps); err != nil {
klog.Errorf("ExpirePod failed for %s: %v", key, err) klog.ErrorS(err, "ExpirePod failed", "pod", klog.KObj(ps.pod))
} }
} }
} }