migrate cache.go to structured logging

This commit is contained in:
Shivanshu Raj Shrivastava 2021-11-07 19:19:14 +05:30
parent d92a443ca7
commit bb31040f06
No known key found for this signature in database
GPG Key ID: F4EDF2ED69728499

View File

@ -18,6 +18,7 @@ package cache
import (
"fmt"
"os"
"sync"
"time"
@ -123,7 +124,7 @@ func newNodeInfoListItem(ni *framework.NodeInfo) *nodeInfoListItem {
func (cache *schedulerCache) moveNodeInfoToHead(name string) {
ni, ok := cache.nodes[name]
if !ok {
klog.Errorf("No NodeInfo with name %v found in the cache", name)
klog.ErrorS(nil, "No node info with given name found in the cache", "node", klog.KRef("", name))
return
}
// if the node info list item is already at the head, we are done.
@ -151,7 +152,7 @@ func (cache *schedulerCache) moveNodeInfoToHead(name string) {
func (cache *schedulerCache) removeNodeInfoFromList(name string) {
ni, ok := cache.nodes[name]
if !ok {
klog.Errorf("No NodeInfo with name %v found in the cache", name)
klog.ErrorS(nil, "No node info with given name found in the cache", "node", klog.KRef("", name))
return
}
@ -264,7 +265,7 @@ func (cache *schedulerCache) UpdateSnapshot(nodeSnapshot *Snapshot) error {
", trying to recover",
len(nodeSnapshot.nodeInfoList), cache.nodeTree.numNodes,
len(nodeSnapshot.nodeInfoMap), len(cache.nodes))
klog.Error(errMsg)
klog.ErrorS(nil, errMsg)
// We will try to recover by re-creating the lists for the next scheduling cycle, but still return an
// error to surface the problem, the error will likely cause a failure to the current scheduling cycle.
cache.updateNodeInfoSnapshotList(nodeSnapshot, true)
@ -282,7 +283,7 @@ func (cache *schedulerCache) updateNodeInfoSnapshotList(snapshot *Snapshot, upda
snapshot.nodeInfoList = make([]*framework.NodeInfo, 0, cache.nodeTree.numNodes)
nodesList, err := cache.nodeTree.list()
if err != nil {
klog.Error(err)
klog.ErrorS(err, "Error occurred while retrieving the list of names of the nodes from node tree")
}
for _, nodeName := range nodesList {
if nodeInfo := snapshot.nodeInfoMap[nodeName]; nodeInfo != nil {
@ -294,7 +295,7 @@ func (cache *schedulerCache) updateNodeInfoSnapshotList(snapshot *Snapshot, upda
snapshot.havePodsWithRequiredAntiAffinityNodeInfoList = append(snapshot.havePodsWithRequiredAntiAffinityNodeInfoList, nodeInfo)
}
} else {
klog.Errorf("node %q exist in nodeTree but not in NodeInfoMap, this should not happen.", nodeName)
klog.ErrorS(nil, "Node exists in nodeTree but not in NodeInfoMap, this should not happen", "node", klog.KRef("", nodeName))
}
}
} else {
@ -381,7 +382,7 @@ func (cache *schedulerCache) finishBinding(pod *v1.Pod, now time.Time) error {
cache.mu.RLock()
defer cache.mu.RUnlock()
klog.V(5).Infof("Finished binding for pod %v. Can be expired.", key)
klog.V(5).InfoS("Finished binding for pod, can be expired", "pod", klog.KObj(pod))
currState, ok := cache.podStates[key]
if ok && cache.assumedPods.Has(key) {
dl := now.Add(cache.ttl)
@ -447,7 +448,7 @@ func (cache *schedulerCache) updatePod(oldPod, newPod *v1.Pod) error {
func (cache *schedulerCache) removePod(pod *v1.Pod) error {
n, ok := cache.nodes[pod.Spec.NodeName]
if !ok {
klog.Errorf("node %v not found when trying to remove pod %v", pod.Spec.NodeName, pod.Name)
klog.ErrorS(nil, "Node not found when trying to remove pod", "node", klog.KRef("", pod.Spec.NodeName), "pod", klog.KObj(pod))
return nil
}
if err := n.info.RemovePod(pod); err != nil {
@ -475,10 +476,10 @@ func (cache *schedulerCache) AddPod(pod *v1.Pod) error {
case ok && cache.assumedPods.Has(key):
if currState.pod.Spec.NodeName != pod.Spec.NodeName {
// The pod was added to a different node than it was assumed to.
klog.Warningf("Pod %v was assumed to be on %v but got added to %v", key, pod.Spec.NodeName, currState.pod.Spec.NodeName)
klog.InfoS("Pod was added to a different node than it was assumed", "pod", klog.KObj(pod), "assumedNode", klog.KRef("", pod.Spec.NodeName), "currentNode", klog.KRef("", currState.pod.Spec.NodeName))
// Clean this up.
if err = cache.removePod(currState.pod); err != nil {
klog.Errorf("removing pod error: %v", err)
klog.ErrorS(err, "Error occurred while removing pod")
}
cache.addPod(pod)
}
@ -513,8 +514,9 @@ func (cache *schedulerCache) UpdatePod(oldPod, newPod *v1.Pod) error {
// before Update event, in which case the state would change from Assumed to Added.
case ok && !cache.assumedPods.Has(key):
if currState.pod.Spec.NodeName != newPod.Spec.NodeName {
klog.Errorf("Pod %v updated on a different node than previously added to.", key)
klog.Fatalf("Schedulercache is corrupted and can badly affect scheduling decisions")
klog.ErrorS(nil, "Pod updated on a different node than previously added to", "pod", klog.KObj(oldPod))
klog.ErrorS(nil, "SchedulerCache is corrupted and can badly affect scheduling decisions")
os.Exit(1)
}
if err := cache.updatePod(oldPod, newPod); err != nil {
return err
@ -539,11 +541,12 @@ func (cache *schedulerCache) RemovePod(pod *v1.Pod) error {
switch {
case ok:
if currState.pod.Spec.NodeName != pod.Spec.NodeName {
klog.Errorf("Pod %v was assumed to be on %v but got added to %v", key, pod.Spec.NodeName, currState.pod.Spec.NodeName)
klog.ErrorS(nil, "Pod was added to a different node than it was assumed", "pod", klog.KObj(pod), "assumedNode", klog.KRef("", pod.Spec.NodeName), "currentNode", klog.KRef("", currState.pod.Spec.NodeName))
if pod.Spec.NodeName != "" {
// An empty NodeName is possible when the scheduler misses a Delete
// event and it gets the last known state from the informer cache.
klog.Fatalf("Schedulercache is corrupted and can badly affect scheduling decisions")
klog.ErrorS(nil, "SchedulerCache is corrupted and can badly affect scheduling decisions")
os.Exit(1)
}
}
return cache.expirePod(key, currState)
@ -723,17 +726,18 @@ func (cache *schedulerCache) cleanupAssumedPods(now time.Time) {
for key := range cache.assumedPods {
ps, ok := cache.podStates[key]
if !ok {
klog.Fatal("Key found in assumed set but not in podStates. Potentially a logical error.")
klog.ErrorS(nil, "Key found in assumed set but not in podStates, potentially a logical error")
os.Exit(1)
}
if !ps.bindingFinished {
klog.V(5).Infof("Couldn't expire cache for pod %v/%v. Binding is still in progress.",
ps.pod.Namespace, ps.pod.Name)
klog.V(5).InfoS("Could not expire cache for pod as binding is still in progress",
"pod", klog.KObj(ps.pod))
continue
}
if now.After(*ps.deadline) {
klog.Warningf("Pod %s/%s expired", ps.pod.Namespace, ps.pod.Name)
klog.InfoS("Pod expired", "pod", klog.KObj(ps.pod))
if err := cache.expirePod(key, ps); err != nil {
klog.Errorf("ExpirePod failed for %s: %v", key, err)
klog.ErrorS(err, "ExpirePod failed", "pod", klog.KObj(ps.pod))
}
}
}