diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 26f991e79cf..3dd0eeb1135 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -675,22 +675,19 @@ func (kl *Kubelet) Run(updates <-chan PodUpdate) { glog.Errorf("Failed to start ContainerManager, system may not be properly isolated: %v", err) } + err = kl.oomWatcher.Start(kl.nodeRef) + if err != nil { + kl.recorder.Eventf(kl.nodeRef, "kubeletSetupFailed", "Failed to start OOM watcher %v", err) + glog.Errorf("Failed to start OOM watching: %v", err) + } + go util.Until(kl.updateRuntimeUp, 5*time.Second, util.NeverStop) go kl.syncNodeStatus() // Run the system oom watcher forever. - go util.Until(kl.runOOMWatcher, time.Second, util.NeverStop) kl.statusManager.Start() kl.syncLoop(updates, kl) } -// Watches for system OOMs. -func (kl *Kubelet) runOOMWatcher() { - glog.V(5).Infof("Starting to record system OOMs") - if err := kl.oomWatcher.RecordSysOOMs(kl.nodeRef); err != nil { - glog.Errorf("failed to record system OOMs - %v", err) - } -} - // syncNodeStatus periodically synchronizes node status to master. func (kl *Kubelet) syncNodeStatus() { if kl.kubeClient == nil { diff --git a/pkg/kubelet/oom_watcher.go b/pkg/kubelet/oom_watcher.go index b4f00f7b037..58a95783cdd 100644 --- a/pkg/kubelet/oom_watcher.go +++ b/pkg/kubelet/oom_watcher.go @@ -17,8 +17,6 @@ limitations under the License. package kubelet import ( - "fmt" - "github.com/GoogleCloudPlatform/kubernetes/pkg/api" "github.com/GoogleCloudPlatform/kubernetes/pkg/client/record" "github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/cadvisor" @@ -29,7 +27,7 @@ import ( ) type OOMWatcher interface { - RecordSysOOMs(ref *api.ObjectReference) error + Start(ref *api.ObjectReference) error } type realOOMWatcher struct { @@ -47,7 +45,7 @@ func NewOOMWatcher(cadvisor cadvisor.Interface, recorder record.EventRecorder) O const systemOOMEvent = "SystemOOM" // Watches cadvisor for system oom's and records an event for every system oom encountered. -func (ow *realOOMWatcher) RecordSysOOMs(ref *api.ObjectReference) error { +func (ow *realOOMWatcher) Start(ref *api.ObjectReference) error { request := events.Request{ EventType: map[cadvisorApi.EventType]bool{ cadvisorApi.EventOom: true, @@ -59,9 +57,15 @@ func (ow *realOOMWatcher) RecordSysOOMs(ref *api.ObjectReference) error { if err != nil { return err } - for event := range eventChannel.GetChannel() { - glog.V(2).Infof("got sys oom event from cadvisor: %v", event) - ow.recorder.PastEventf(ref, util.Time{event.Timestamp}, systemOOMEvent, "System OOM encountered") - } - return fmt.Errorf("failed to watch cadvisor for sys oom events") + + go func() { + defer util.HandleCrash() + + for event := range eventChannel.GetChannel() { + glog.V(2).Infof("Got sys oom event from cadvisor: %v", event) + ow.recorder.PastEventf(ref, util.Time{event.Timestamp}, systemOOMEvent, "System OOM encountered") + } + glog.Errorf("Unexpectedly stopped receiving OOM notifications from cAdvisor") + }() + return nil } diff --git a/pkg/kubelet/oom_watcher_test.go b/pkg/kubelet/oom_watcher_test.go index 26ec50bad0a..57e9aaea25d 100644 --- a/pkg/kubelet/oom_watcher_test.go +++ b/pkg/kubelet/oom_watcher_test.go @@ -54,9 +54,11 @@ func TestBasic(t *testing.T) { mockCadvisor := &cadvisor.Fake{} node := &api.ObjectReference{} oomWatcher := NewOOMWatcher(mockCadvisor, fakeRecorder) - go func() { - oomWatcher.RecordSysOOMs(node) - }() + err := oomWatcher.Start(node) + if err != nil { + t.Errorf("Should not have failed: %v", err) + } + // TODO: Improve this test once cadvisor exports events.EventChannel as an interface // and thereby allow using a mock version of cadvisor. }