diff --git a/pkg/kubelet/dockertools/manager.go b/pkg/kubelet/dockertools/manager.go index 0b3f5c03346..b1c522ef665 100644 --- a/pkg/kubelet/dockertools/manager.go +++ b/pkg/kubelet/dockertools/manager.go @@ -36,6 +36,7 @@ import ( kubecontainer "github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/container" "github.com/GoogleCloudPlatform/kubernetes/pkg/types" "github.com/GoogleCloudPlatform/kubernetes/pkg/util" + docker "github.com/fsouza/go-dockerclient" "github.com/golang/glog" "github.com/golang/groupcache/lru" ) diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index a22d4c32f59..e0216412774 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -53,6 +53,7 @@ import ( "github.com/GoogleCloudPlatform/kubernetes/pkg/version" "github.com/GoogleCloudPlatform/kubernetes/pkg/volume" "github.com/GoogleCloudPlatform/kubernetes/pkg/watch" + docker "github.com/fsouza/go-dockerclient" "github.com/golang/glog" cadvisorApi "github.com/google/cadvisor/info/v1" ) @@ -214,6 +215,8 @@ func NewMainKubelet( volumeManager := newVolumeManager() + oomWatcher := NewOOMWatcher(cadvisorInterface, recorder) + klet := &Kubelet{ hostname: hostname, dockerClient: dockerClient, @@ -243,6 +246,7 @@ func NewMainKubelet( nodeStatusUpdateFrequency: nodeStatusUpdateFrequency, resourceContainer: resourceContainer, os: osInterface, + oomWatcher: oomWatcher, } klet.podManager = newBasicPodManager(klet.kubeClient) @@ -388,12 +392,12 @@ type Kubelet struct { // status. Kubelet may fail to update node status reliablly if the value is too small, // as it takes time to gather all necessary node information. nodeStatusUpdateFrequency time.Duration - // The name of the resource-only container to run the Kubelet in (empty for no container). // Name must be absolute. resourceContainer string - os kubecontainer.OSInterface + os kubecontainer.OSInterface + oomWatcher OOMWatcher } // getRootDir returns the full path to the directory under which kubelet can @@ -583,10 +587,20 @@ func (kl *Kubelet) Run(updates <-chan PodUpdate) { } go kl.syncNodeStatus() + // Run the system oom watcher forever. + go util.Until(kl.runOOMWatcher, time.Second, util.NeverStop) kl.statusManager.Start() kl.syncLoop(updates, kl) } +// Watches for system OOMs. +func (kl *Kubelet) runOOMWatcher() { + glog.V(5).Infof("Starting to record system OOMs") + if err := kl.oomWatcher.RecordSysOOMs(kl.nodeRef); err != nil { + glog.Errorf("failed to record system OOMs - %v", err) + } +} + // syncNodeStatus periodically synchronizes node status to master. func (kl *Kubelet) syncNodeStatus() { if kl.kubeClient == nil { @@ -1809,6 +1823,8 @@ func (kl *Kubelet) tryUpdateNodeStatus() error { } oldNodeUnschedulable = node.Spec.Unschedulable } + + // Update the current status on the API server _, err = kl.kubeClient.Nodes().UpdateStatus(node) return err } diff --git a/pkg/kubelet/oom_watcher.go b/pkg/kubelet/oom_watcher.go new file mode 100644 index 00000000000..81f0bec53ca --- /dev/null +++ b/pkg/kubelet/oom_watcher.go @@ -0,0 +1,67 @@ +/* +Copyright 2015 Google Inc. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubelet + +import ( + "fmt" + + "github.com/GoogleCloudPlatform/kubernetes/pkg/api" + "github.com/GoogleCloudPlatform/kubernetes/pkg/client/record" + "github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/cadvisor" + "github.com/GoogleCloudPlatform/kubernetes/pkg/util" + "github.com/golang/glog" + "github.com/google/cadvisor/events" + cadvisorApi "github.com/google/cadvisor/info/v1" +) + +type OOMWatcher interface { + RecordSysOOMs(ref *api.ObjectReference) error +} + +type realOOMWatcher struct { + cadvisor cadvisor.Interface + recorder record.EventRecorder +} + +func NewOOMWatcher(cadvisor cadvisor.Interface, recorder record.EventRecorder) OOMWatcher { + return &realOOMWatcher{ + cadvisor: cadvisor, + recorder: recorder, + } +} + +const systemOOMEvent = "SystemOOM" + +// Watches cadvisor for system oom's and records an event for every system oom encountered. +func (ow *realOOMWatcher) RecordSysOOMs(ref *api.ObjectReference) error { + request := events.Request{ + EventType: map[cadvisorApi.EventType]bool{ + cadvisorApi.EventOom: true, + }, + ContainerName: "/", + IncludeSubcontainers: false, + } + eventChannel, err := ow.cadvisor.WatchEvents(&request) + if err != nil { + return err + } + for event := range eventChannel.GetChannel() { + glog.V(2).Infof("got sys oom event from cadvisor: %v", event) + ow.recorder.PastEventf(ref, util.Time{event.Timestamp}, systemOOMEvent, "System OOM encountered") + } + return fmt.Errorf("failed to watch cadvisor for sys oom events") +} diff --git a/pkg/kubelet/oom_watcher_test.go b/pkg/kubelet/oom_watcher_test.go new file mode 100644 index 00000000000..866b67d8fdf --- /dev/null +++ b/pkg/kubelet/oom_watcher_test.go @@ -0,0 +1,62 @@ +/* +Copyright 2015 Google Inc. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package kubelet + +import ( + "fmt" + "testing" + + "github.com/GoogleCloudPlatform/kubernetes/pkg/api" + "github.com/GoogleCloudPlatform/kubernetes/pkg/kubelet/cadvisor" + "github.com/GoogleCloudPlatform/kubernetes/pkg/runtime" + "github.com/GoogleCloudPlatform/kubernetes/pkg/util" +) + +type fakeEvent struct { + object runtime.Object + timestamp util.Time + reason string + message string +} + +type fakeRecorder struct { + events []fakeEvent +} + +func (f fakeRecorder) Event(object runtime.Object, reason, message string) { + f.events = append(f.events, fakeEvent{object, util.Now(), reason, message}) +} + +func (f fakeRecorder) Eventf(object runtime.Object, reason, messageFmt string, args ...interface{}) { + f.events = append(f.events, fakeEvent{object, util.Now(), reason, fmt.Sprintf(messageFmt, args...)}) +} + +func (f fakeRecorder) PastEventf(object runtime.Object, timestamp util.Time, reason, messageFmt string, args ...interface{}) { + f.events = append(f.events, fakeEvent{object, timestamp, reason, fmt.Sprintf(messageFmt, args...)}) +} + +func TestBasic(t *testing.T) { + fakeRecorder := fakeRecorder{} + mockCadvisor := &cadvisor.Fake{} + node := &api.ObjectReference{} + oomWatcher := NewOOMWatcher(mockCadvisor, fakeRecorder) + go func() { + oomWatcher.RecordSysOOMs(node) + }() + // TODO: Improve this test once cadvisor exports events.EventChannel as an interface + // and thereby allow using a mock version of cadvisor. +}