From a29ade2f33e1ec6d18375be97f8494492da7acfa Mon Sep 17 00:00:00 2001 From: Brendan Burns Date: Fri, 19 Jun 2015 22:46:50 -0700 Subject: [PATCH 1/5] Disable creation of cbr0, the kubelet does it now. Conditionalize the docker bridge. --- cluster/saltbase/salt/docker/docker-defaults | 7 ++++++- cluster/saltbase/salt/docker/init.sls | 6 ------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/cluster/saltbase/salt/docker/docker-defaults b/cluster/saltbase/salt/docker/docker-defaults index a8ec4256d5a..43e8fb9eafe 100644 --- a/cluster/saltbase/salt/docker/docker-defaults +++ b/cluster/saltbase/salt/docker/docker-defaults @@ -2,5 +2,10 @@ DOCKER_OPTS="" {% if grains.docker_opts is defined and grains.docker_opts %} DOCKER_OPTS="${DOCKER_OPTS} {{grains.docker_opts}}" {% endif %} -DOCKER_OPTS="${DOCKER_OPTS} --bridge cbr0 --iptables=false --ip-masq=false" + +{% set docker_bridge = "" %} +{% if grains['roles'][0] == 'kubernetes-pool' %} + {% set docker_bridge = "--bridge cbr0" %} +{% endif %} +DOCKER_OPTS="${DOCKER_OPTS} {{docker_bridge}} --iptables=false --ip-masq=false" DOCKER_NOFILE=1000000 diff --git a/cluster/saltbase/salt/docker/init.sls b/cluster/saltbase/salt/docker/init.sls index 9728ed57293..06d37e26f27 100644 --- a/cluster/saltbase/salt/docker/init.sls +++ b/cluster/saltbase/salt/docker/init.sls @@ -48,11 +48,6 @@ net.ipv4.ip_forward: sysctl.present: - value: 1 -cbr0: - container_bridge.ensure: - - cidr: {{ grains['cbr-cidr'] }} - - mtu: 1460 - {{ environment_file }}: file.managed: - source: salt://docker/docker-defaults @@ -124,7 +119,6 @@ docker: - enable: True - watch: - file: {{ environment_file }} - - container_bridge: cbr0 {% if override_docker_ver != '' %} - require: - pkg: lxc-docker-{{ override_docker_ver }} From 192ffdfb25ccd3e5c07e9e37bd55d6d0fb28c000 Mon Sep 17 00:00:00 2001 From: Brendan Burns Date: Fri, 19 Jun 2015 22:49:18 -0700 Subject: [PATCH 2/5] Fix the container bridge so that it can create cbr0 Fix the kubelet so that it tries to sync status, even if Docker is down --- cluster/saltbase/salt/docker/docker-defaults | 6 +- cluster/saltbase/salt/kubelet/default | 7 ++- cmd/kubelet/app/server.go | 6 +- pkg/kubelet/container_bridge.go | 66 +++++++++++++------- pkg/kubelet/kubelet.go | 47 ++++++++++---- pkg/kubelet/status_manager.go | 7 +++ 6 files changed, 98 insertions(+), 41 deletions(-) diff --git a/cluster/saltbase/salt/docker/docker-defaults b/cluster/saltbase/salt/docker/docker-defaults index 43e8fb9eafe..7e5725064d1 100644 --- a/cluster/saltbase/salt/docker/docker-defaults +++ b/cluster/saltbase/salt/docker/docker-defaults @@ -3,9 +3,5 @@ DOCKER_OPTS="" DOCKER_OPTS="${DOCKER_OPTS} {{grains.docker_opts}}" {% endif %} -{% set docker_bridge = "" %} -{% if grains['roles'][0] == 'kubernetes-pool' %} - {% set docker_bridge = "--bridge cbr0" %} -{% endif %} -DOCKER_OPTS="${DOCKER_OPTS} {{docker_bridge}} --iptables=false --ip-masq=false" +DOCKER_OPTS="${DOCKER_OPTS} --bridge=cbr0 --iptables=false --ip-masq=false" DOCKER_NOFILE=1000000 diff --git a/cluster/saltbase/salt/kubelet/default b/cluster/saltbase/salt/kubelet/default index 76eb4497b4d..a1c1dccc1f0 100644 --- a/cluster/saltbase/salt/kubelet/default +++ b/cluster/saltbase/salt/kubelet/default @@ -76,4 +76,9 @@ {% set cgroup_root = "--cgroup_root=/" -%} {% endif -%} -DAEMON_ARGS="{{daemon_args}} {{api_servers_with_port}} {{debugging_handlers}} {{hostname_override}} {{cloud_provider}} {{config}} --allow_privileged={{pillar['allow_privileged']}} {{pillar['log_level']}} {{cluster_dns}} {{cluster_domain}} 
{{docker_root}} {{kubelet_root}} {{configure_cbr0}} {{cgroup_root}} {{system_container}}" +{% set pod_cidr = "" %} +{% if grains['roles'][0] == 'kubernetes-master' %} + {% set pod_cidr = "--pod-cidr=" + grains['cbr-cidr'] %} +{% endif %} + +DAEMON_ARGS="{{daemon_args}} {{api_servers_with_port}} {{debugging_handlers}} {{hostname_override}} {{cloud_provider}} {{config}} --allow_privileged={{pillar['allow_privileged']}} {{pillar['log_level']}} {{cluster_dns}} {{cluster_domain}} {{docker_root}} {{kubelet_root}} {{configure_cbr0}} {{cgroup_root}} {{system_container}} {{pod_cidr}}" diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go index b50179b4807..1cd1362da1f 100644 --- a/cmd/kubelet/app/server.go +++ b/cmd/kubelet/app/server.go @@ -115,6 +115,7 @@ type KubeletServer struct { DockerDaemonContainer string SystemContainer string ConfigureCBR0 bool + PodCIDR string MaxPods int DockerExecHandlerName string @@ -241,7 +242,7 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) { fs.BoolVar(&s.ConfigureCBR0, "configure-cbr0", s.ConfigureCBR0, "If true, kubelet will configure cbr0 based on Node.Spec.PodCIDR.") fs.IntVar(&s.MaxPods, "max-pods", 100, "Number of Pods that can run on this Kubelet.") fs.StringVar(&s.DockerExecHandlerName, "docker-exec-handler", s.DockerExecHandlerName, "Handler to use when executing a command in a container. Valid values are 'native' and 'nsenter'. Defaults to 'native'.") - + fs.StringVar(&s.PodCIDR, "pod-cidr", "", "The CIDR to use for pod IP addresses, only used in standalone mode. In cluster mode, this is obtained from the master.") // Flags intended for testing, not recommended used in production environments. fs.BoolVar(&s.ReallyCrashForTesting, "really-crash-for-testing", s.ReallyCrashForTesting, "If true, when panics occur crash. Intended for testing.") fs.Float64Var(&s.ChaosChance, "chaos-chance", s.ChaosChance, "If > 0.0, introduce random client errors and latency. Intended for testing. 
[default=0.0]") @@ -361,6 +362,7 @@ func (s *KubeletServer) Run(_ []string) error { DockerDaemonContainer: s.DockerDaemonContainer, SystemContainer: s.SystemContainer, ConfigureCBR0: s.ConfigureCBR0, + PodCIDR: s.PodCIDR, MaxPods: s.MaxPods, DockerExecHandler: dockerExecHandler, } @@ -714,6 +716,7 @@ type KubeletConfig struct { DockerDaemonContainer string SystemContainer string ConfigureCBR0 bool + PodCIDR string MaxPods int DockerExecHandler dockertools.ExecHandler } @@ -771,6 +774,7 @@ func createAndInitKubelet(kc *KubeletConfig) (k KubeletBootstrap, pc *config.Pod kc.DockerDaemonContainer, kc.SystemContainer, kc.ConfigureCBR0, + kc.PodCIDR, kc.MaxPods, kc.DockerExecHandler) diff --git a/pkg/kubelet/container_bridge.go b/pkg/kubelet/container_bridge.go index a4a30a53015..0aaa885195c 100644 --- a/pkg/kubelet/container_bridge.go +++ b/pkg/kubelet/container_bridge.go @@ -19,6 +19,7 @@ package kubelet import ( "bytes" "net" + "os" "os/exec" "regexp" @@ -27,8 +28,39 @@ import ( var cidrRegexp = regexp.MustCompile(`inet ([0-9a-fA-F.:]*/[0-9]*)`) +func createCBR0(wantCIDR *net.IPNet) error { + // recreate cbr0 with wantCIDR + if err := exec.Command("brctl", "addbr", "cbr0").Run(); err != nil { + glog.Error(err) + return err + } + if err := exec.Command("ip", "addr", "add", wantCIDR.String(), "dev", "cbr0").Run(); err != nil { + glog.Error(err) + return err + } + if err := exec.Command("ip", "link", "set", "dev", "cbr0", "up").Run(); err != nil { + glog.Error(err) + return err + } + // restart docker + if err := exec.Command("service", "docker", "restart").Run(); err != nil { + glog.Error(err) + // For now just log the error. The containerRuntime check will catch docker failures. + // TODO (dawnchen) figure out what we should do for rkt here. + } + glog.V(2).Info("Recreated cbr0 and restarted docker") + return nil +} + func ensureCbr0(wantCIDR *net.IPNet) error { - if !cbr0CidrCorrect(wantCIDR) { + exists, err := cbr0Exists() + if err != nil { + return err + } + if !exists { + glog.V(2).Infof("CBR0 doesn't exist, attempting to create it with range: %s", wantCIDR) + return createCBR0(wantCIDR) + } else if !cbr0CidrCorrect(wantCIDR) { glog.V(2).Infof("Attempting to recreate cbr0 with address range: %s", wantCIDR) // delete cbr0 @@ -40,30 +72,22 @@ func ensureCbr0(wantCIDR *net.IPNet) error { glog.Error(err) return err } - // recreate cbr0 with wantCIDR - if err := exec.Command("brctl", "addbr", "cbr0").Run(); err != nil { - glog.Error(err) - return err - } - if err := exec.Command("ip", "addr", "add", wantCIDR.String(), "dev", "cbr0").Run(); err != nil { - glog.Error(err) - return err - } - if err := exec.Command("ip", "link", "set", "dev", "cbr0", "up").Run(); err != nil { - glog.Error(err) - return err - } - // restart docker - if err := exec.Command("service", "docker", "restart").Run(); err != nil { - glog.Error(err) - // For now just log the error. The containerRuntime check will catch docker failures. - // TODO (dawnchen) figure out what we should do for rkt here. 
- } - glog.V(2).Info("Recreated cbr0 and restarted docker") + return createCBR0(wantCIDR) } return nil } +func cbr0Exists() (bool, error) { + _, err := os.Stat("/sys/class/net/cbr0") + if err != nil { + if os.IsNotExist(err) { + return false, nil + } + return false, err + } + return true, nil +} + func cbr0CidrCorrect(wantCIDR *net.IPNet) bool { output, err := exec.Command("ip", "addr", "show", "cbr0").Output() if err != nil { diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 2553c6bbaeb..537c95bed48 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -147,6 +147,7 @@ func NewMainKubelet( dockerDaemonContainer string, systemContainer string, configureCBR0 bool, + podCIDR string, pods int, dockerExecHandler dockertools.ExecHandler) (*Kubelet, error) { if rootDirectory == "" { @@ -261,6 +262,7 @@ func NewMainKubelet( cgroupRoot: cgroupRoot, mounter: mounter, configureCBR0: configureCBR0, + podCIDR: podCIDR, pods: pods, syncLoopMonitor: util.AtomicValue{}, } @@ -318,6 +320,10 @@ func NewMainKubelet( } klet.containerManager = containerManager + // Start syncing node status immediately, this may set up things the runtime needs to run. + go klet.syncNodeStatus() + go klet.syncNetworkStatus() + // Wait for the runtime to be up with a timeout. if err := waitUntilRuntimeIsUp(klet.containerRuntime, maxWaitForContainerRuntime); err != nil { return nil, fmt.Errorf("timed out waiting for %q to come up: %v", containerRuntime, err) @@ -412,6 +418,9 @@ type Kubelet struct { runtimeUpThreshold time.Duration lastTimestampRuntimeUp time.Time + // Network Status information + networkConfigured bool + // Volume plugins. volumePluginMgr volume.VolumePluginMgr @@ -489,6 +498,7 @@ type Kubelet struct { // Whether or not kubelet should take responsibility for keeping cbr0 in // the correct state. configureCBR0 bool + podCIDR string // Number of Pods which can be run by this Kubelet pods int @@ -707,7 +717,6 @@ func (kl *Kubelet) Run(updates <-chan PodUpdate) { } go util.Until(kl.updateRuntimeUp, 5*time.Second, util.NeverStop) - go kl.syncNodeStatus() // Run the system oom watcher forever. kl.statusManager.Start() kl.syncLoop(updates, kl) @@ -1705,6 +1714,10 @@ func (kl *Kubelet) syncLoopIteration(updates <-chan PodUpdate, handler SyncHandl glog.Infof("Skipping pod synchronization, container runtime is not up.") return } + if !kl.networkConfigured { + time.Sleep(5 * time.Second) + glog.Infof("Skipping pod synchronization, network is not configured") + } unsyncedPod := false podSyncTypes := make(map[types.UID]SyncPodType) select { @@ -1892,6 +1905,22 @@ func (kl *Kubelet) recordNodeStatusEvent(event string) { // Maintains Node.Spec.Unschedulable value from previous run of tryUpdateNodeStatus() var oldNodeUnschedulable bool +func (kl *Kubelet) syncNetworkStatus() { + for { + networkConfigured := true + if kl.configureCBR0 { + if len(kl.podCIDR) == 0 { + networkConfigured = false + } else if err := kl.reconcileCBR0(kl.podCIDR); err != nil { + networkConfigured = false + glog.Errorf("Error configuring cbr0: %v", err) + } + } + kl.networkConfigured = networkConfigured + time.Sleep(30 * time.Second) + } +} + // setNodeStatus fills in the Status fields of the given Node, overwriting // any fields that are currently set. 
func (kl *Kubelet) setNodeStatus(node *api.Node) error { @@ -1925,16 +1954,6 @@ func (kl *Kubelet) setNodeStatus(node *api.Node) error { } } - networkConfigured := true - if kl.configureCBR0 { - if len(node.Spec.PodCIDR) == 0 { - networkConfigured = false - } else if err := kl.reconcileCBR0(node.Spec.PodCIDR); err != nil { - networkConfigured = false - glog.Errorf("Error configuring cbr0: %v", err) - } - } - // TODO: Post NotReady if we cannot get MachineInfo from cAdvisor. This needs to start // cAdvisor locally, e.g. for test-cmd.sh, and in integration test. info, err := kl.GetCachedMachineInfo() @@ -1982,7 +2001,7 @@ func (kl *Kubelet) setNodeStatus(node *api.Node) error { currentTime := util.Now() var newNodeReadyCondition api.NodeCondition var oldNodeReadyConditionStatus api.ConditionStatus - if containerRuntimeUp && networkConfigured { + if containerRuntimeUp && kl.networkConfigured { newNodeReadyCondition = api.NodeCondition{ Type: api.NodeReady, Status: api.ConditionTrue, @@ -1994,7 +2013,7 @@ func (kl *Kubelet) setNodeStatus(node *api.Node) error { if !containerRuntimeUp { reasons = append(reasons, "container runtime is down") } - if !networkConfigured { + if !kl.networkConfigured { reasons = append(reasons, "network not configured correctly") } newNodeReadyCondition = api.NodeCondition{ @@ -2056,6 +2075,8 @@ func (kl *Kubelet) tryUpdateNodeStatus() error { if node == nil { return fmt.Errorf("no node instance returned for %q", kl.nodeName) } + kl.podCIDR = node.Spec.PodCIDR + if err := kl.setNodeStatus(node); err != nil { return err } diff --git a/pkg/kubelet/status_manager.go b/pkg/kubelet/status_manager.go index 4df39d7c086..d8bf9bbb4e9 100644 --- a/pkg/kubelet/status_manager.go +++ b/pkg/kubelet/status_manager.go @@ -17,9 +17,11 @@ limitations under the License. package kubelet import ( + "errors" "fmt" "reflect" "sync" + "time" "github.com/GoogleCloudPlatform/kubernetes/pkg/api" "github.com/GoogleCloudPlatform/kubernetes/pkg/client" @@ -58,6 +60,8 @@ func (s *statusManager) Start() { err := s.syncBatch() if err != nil { glog.Warningf("Failed to updated pod status: %v", err) + // Errors and tight-looping are bad, m-kay + time.Sleep(30 * time.Second) } }, 0) } @@ -124,6 +128,9 @@ func (s *statusManager) RemoveOrphanedStatuses(podFullNames map[string]bool) { // syncBatch syncs pods statuses with the apiserver. func (s *statusManager) syncBatch() error { + if s.kubeClient == nil { + return errors.New("Kubernetes client is nil, skipping pod status updates") + } syncRequest := <-s.podStatusChannel pod := syncRequest.pod podFullName := kubecontainer.GetPodFullName(pod) From 23200d303f6d2463c2ed212cdad624d928b14613 Mon Sep 17 00:00:00 2001 From: Dawn Chen Date: Mon, 22 Jun 2015 23:07:40 -0700 Subject: [PATCH 3/5] Fix several issues on running syncPods until network is configured. Also fixed unittests and compiling. 
--- cluster/saltbase/salt/docker/docker-defaults | 1 - contrib/mesos/pkg/executor/service/service.go | 1 + pkg/kubelet/container_bridge.go | 22 +++++++--- pkg/kubelet/kubelet.go | 44 ++++++++++++------- pkg/kubelet/kubelet_test.go | 1 + pkg/kubelet/status_manager.go | 3 -- pkg/util/util.go | 14 ++++++ 7 files changed, 59 insertions(+), 27 deletions(-) diff --git a/cluster/saltbase/salt/docker/docker-defaults b/cluster/saltbase/salt/docker/docker-defaults index 7e5725064d1..f325b4945d5 100644 --- a/cluster/saltbase/salt/docker/docker-defaults +++ b/cluster/saltbase/salt/docker/docker-defaults @@ -2,6 +2,5 @@ DOCKER_OPTS="" {% if grains.docker_opts is defined and grains.docker_opts %} DOCKER_OPTS="${DOCKER_OPTS} {{grains.docker_opts}}" {% endif %} - DOCKER_OPTS="${DOCKER_OPTS} --bridge=cbr0 --iptables=false --ip-masq=false" DOCKER_NOFILE=1000000 diff --git a/contrib/mesos/pkg/executor/service/service.go b/contrib/mesos/pkg/executor/service/service.go index dd3b6605b9e..06ffbd6255d 100644 --- a/contrib/mesos/pkg/executor/service/service.go +++ b/contrib/mesos/pkg/executor/service/service.go @@ -354,6 +354,7 @@ func (ks *KubeletExecutorServer) createAndInitKubelet( kc.DockerDaemonContainer, kc.SystemContainer, kc.ConfigureCBR0, + kc.PodCIDR, kc.MaxPods, kc.DockerExecHandler, ) diff --git a/pkg/kubelet/container_bridge.go b/pkg/kubelet/container_bridge.go index 0aaa885195c..5fb6319ec5a 100644 --- a/pkg/kubelet/container_bridge.go +++ b/pkg/kubelet/container_bridge.go @@ -23,6 +23,7 @@ import ( "os/exec" "regexp" + "github.com/GoogleCloudPlatform/kubernetes/pkg/util" "github.com/golang/glog" ) @@ -43,10 +44,16 @@ func createCBR0(wantCIDR *net.IPNet) error { return err } // restart docker - if err := exec.Command("service", "docker", "restart").Run(); err != nil { - glog.Error(err) - // For now just log the error. The containerRuntime check will catch docker failures. - // TODO (dawnchen) figure out what we should do for rkt here. + // For now just log the error. The containerRuntime check will catch docker failures. + // TODO (dawnchen) figure out what we should do for rkt here. 
+ if util.UsingSystemdInitSystem() { + if err := exec.Command("systemctl", "restart", "docker").Run(); err != nil { + glog.Error(err) + } + } else { + if err := exec.Command("service", "docker", "restart").Run(); err != nil { + glog.Error(err) + } } glog.V(2).Info("Recreated cbr0 and restarted docker") return nil @@ -60,7 +67,8 @@ func ensureCbr0(wantCIDR *net.IPNet) error { if !exists { glog.V(2).Infof("CBR0 doesn't exist, attempting to create it with range: %s", wantCIDR) return createCBR0(wantCIDR) - } else if !cbr0CidrCorrect(wantCIDR) { + } + if !cbr0CidrCorrect(wantCIDR) { glog.V(2).Infof("Attempting to recreate cbr0 with address range: %s", wantCIDR) // delete cbr0 @@ -78,8 +86,7 @@ func ensureCbr0(wantCIDR *net.IPNet) error { } func cbr0Exists() (bool, error) { - _, err := os.Stat("/sys/class/net/cbr0") - if err != nil { + if _, err := os.Stat("/sys/class/net/cbr0"); err != nil { if os.IsNotExist(err) { return false, nil } @@ -103,6 +110,7 @@ func cbr0CidrCorrect(wantCIDR *net.IPNet) bool { return false } cbr0CIDR.IP = cbr0IP + glog.V(5).Infof("Want cbr0 CIDR: %s, have cbr0 CIDR: %s", wantCIDR, cbr0CIDR) return wantCIDR.IP.Equal(cbr0IP) && bytes.Equal(wantCIDR.Mask, cbr0CIDR.Mask) } diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 537c95bed48..9c0e4260748 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -321,8 +321,8 @@ func NewMainKubelet( klet.containerManager = containerManager // Start syncing node status immediately, this may set up things the runtime needs to run. + go util.Until(klet.syncNetworkStatus, 30*time.Second, util.NeverStop) go klet.syncNodeStatus() - go klet.syncNetworkStatus() // Wait for the runtime to be up with a timeout. if err := waitUntilRuntimeIsUp(klet.containerRuntime, maxWaitForContainerRuntime); err != nil { @@ -419,7 +419,8 @@ type Kubelet struct { lastTimestampRuntimeUp time.Time // Network Status information - networkConfigured bool + networkConfigMutex sync.Mutex + networkConfigured bool // Volume plugins. volumePluginMgr volume.VolumePluginMgr @@ -717,6 +718,7 @@ func (kl *Kubelet) Run(updates <-chan PodUpdate) { } go util.Until(kl.updateRuntimeUp, 5*time.Second, util.NeverStop) + // Run the system oom watcher forever. kl.statusManager.Start() kl.syncLoop(updates, kl) @@ -1714,9 +1716,10 @@ func (kl *Kubelet) syncLoopIteration(updates <-chan PodUpdate, handler SyncHandl glog.Infof("Skipping pod synchronization, container runtime is not up.") return } - if !kl.networkConfigured { + if !kl.doneNetworkConfigure() { time.Sleep(5 * time.Second) glog.Infof("Skipping pod synchronization, network is not configured") + return } unsyncedPod := false podSyncTypes := make(map[types.UID]SyncPodType) @@ -1871,6 +1874,7 @@ func (kl *Kubelet) reconcileCBR0(podCIDR string) error { glog.V(5).Info("PodCIDR not set. 
Will not configure cbr0.") return nil } + glog.V(5).Infof("PodCIDR is set to %q", podCIDR) _, cidr, err := net.ParseCIDR(podCIDR) if err != nil { return err @@ -1906,19 +1910,19 @@ func (kl *Kubelet) recordNodeStatusEvent(event string) { var oldNodeUnschedulable bool func (kl *Kubelet) syncNetworkStatus() { - for { - networkConfigured := true - if kl.configureCBR0 { - if len(kl.podCIDR) == 0 { - networkConfigured = false - } else if err := kl.reconcileCBR0(kl.podCIDR); err != nil { - networkConfigured = false - glog.Errorf("Error configuring cbr0: %v", err) - } + kl.networkConfigMutex.Lock() + defer kl.networkConfigMutex.Unlock() + + networkConfigured := true + if kl.configureCBR0 { + if len(kl.podCIDR) == 0 { + networkConfigured = false + } else if err := kl.reconcileCBR0(kl.podCIDR); err != nil { + networkConfigured = false + glog.Errorf("Error configuring cbr0: %v", err) } - kl.networkConfigured = networkConfigured - time.Sleep(30 * time.Second) } + kl.networkConfigured = networkConfigured } // setNodeStatus fills in the Status fields of the given Node, overwriting @@ -1997,11 +2001,13 @@ func (kl *Kubelet) setNodeStatus(node *api.Node) error { // Check whether container runtime can be reported as up. containerRuntimeUp := kl.containerRuntimeUp() + // Check whether network is configured properly + networkConfigured := kl.doneNetworkConfigure() currentTime := util.Now() var newNodeReadyCondition api.NodeCondition var oldNodeReadyConditionStatus api.ConditionStatus - if containerRuntimeUp && kl.networkConfigured { + if containerRuntimeUp && networkConfigured { newNodeReadyCondition = api.NodeCondition{ Type: api.NodeReady, Status: api.ConditionTrue, @@ -2013,7 +2019,7 @@ func (kl *Kubelet) setNodeStatus(node *api.Node) error { if !containerRuntimeUp { reasons = append(reasons, "container runtime is down") } - if !kl.networkConfigured { + if !networkConfigured { reasons = append(reasons, "network not configured correctly") } newNodeReadyCondition = api.NodeCondition{ @@ -2065,6 +2071,12 @@ func (kl *Kubelet) containerRuntimeUp() bool { return kl.lastTimestampRuntimeUp.Add(kl.runtimeUpThreshold).After(time.Now()) } +func (kl *Kubelet) doneNetworkConfigure() bool { + kl.networkConfigMutex.Lock() + defer kl.networkConfigMutex.Unlock() + return kl.networkConfigured +} + // tryUpdateNodeStatus tries to update node status to master. If ReconcileCBR0 // is set, this function will also confirm that cbr0 is configured correctly. 
 func (kl *Kubelet) tryUpdateNodeStatus() error {
diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go
index 96183eda521..4957f57906d 100644
--- a/pkg/kubelet/kubelet_test.go
+++ b/pkg/kubelet/kubelet_test.go
@@ -127,6 +127,7 @@ func newTestKubelet(t *testing.T) *TestKubelet {
 	}
 	kubelet.volumeManager = newVolumeManager()
 	kubelet.containerManager, _ = newContainerManager(mockCadvisor, "", "", "")
+	kubelet.networkConfigured = true
 	return &TestKubelet{kubelet, fakeRuntime, mockCadvisor, fakeKubeClient, fakeMirrorClient}
 }
 
diff --git a/pkg/kubelet/status_manager.go b/pkg/kubelet/status_manager.go
index d8bf9bbb4e9..15495f0a41a 100644
--- a/pkg/kubelet/status_manager.go
+++ b/pkg/kubelet/status_manager.go
@@ -21,7 +21,6 @@ import (
 	"fmt"
 	"reflect"
 	"sync"
-	"time"
 
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
 	"github.com/GoogleCloudPlatform/kubernetes/pkg/client"
@@ -60,8 +59,6 @@ func (s *statusManager) Start() {
 		err := s.syncBatch()
 		if err != nil {
 			glog.Warningf("Failed to updated pod status: %v", err)
-			// Errors and tight-looping are bad, m-kay
-			time.Sleep(30 * time.Second)
 		}
 	}, 0)
 }
diff --git a/pkg/util/util.go b/pkg/util/util.go
index 17f56187ea4..fc335b95f8a 100644
--- a/pkg/util/util.go
+++ b/pkg/util/util.go
@@ -198,6 +198,20 @@ func CompileRegexps(regexpStrings []string) ([]*regexp.Regexp, error) {
 	return regexps, nil
 }
 
+// UsingSystemdInitSystem detects whether systemd is the init system.
+// Please note that simply reading /proc/1/cmdline can be misleading because
+// some installations of various init programs automatically make /sbin/init
+// a symlink to, or even a renamed version of, their main program.
+// TODO(dchen1107): reliably detect which init system is in use on the node:
+// systemd, upstart, initd, etc.
+func UsingSystemdInitSystem() bool {
+	if _, err := os.Stat("/run/systemd/system"); err == nil {
+		return true
+	}
+
+	return false
+}
+
 // Writes 'value' to /proc/<pid>/oom_score_adj. PID = 0 means self
 func ApplyOomScoreAdj(pid int, value int) error {
 	if value < -1000 || value > 1000 {
From 8d76d4ee5763751b35f6a76dffb1a125e471c89d Mon Sep 17 00:00:00 2001
From: Dawn Chen
Date: Mon, 22 Jun 2015 23:12:34 -0700
Subject: [PATCH 4/5] Make master-addon service depend on both docker and kubelet service in salt.

---
 cluster/saltbase/salt/kube-master-addons/init.sls | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cluster/saltbase/salt/kube-master-addons/init.sls b/cluster/saltbase/salt/kube-master-addons/init.sls
index 91186052205..94286345dee 100644
--- a/cluster/saltbase/salt/kube-master-addons/init.sls
+++ b/cluster/saltbase/salt/kube-master-addons/init.sls
@@ -37,10 +37,19 @@ master-docker-image-tags:
   file.touch:
     - name: /srv/pillar/docker-images.sls
 
+# Current containervm image by default has both docker and kubelet
+# running. But during cluster creation stage, docker and kubelet
+# could be overwritten completely, or restarted due flag changes.
+# The ordering of salt states for service docker, kubelet and
+# master-addon below is very important to avoid the race between
+# salt restart docker or kubelet and kubelet start master components.
 kube-master-addons:
   service.running:
     - enable: True
     - restart: True
+    - require:
+      - service: docker
+      - service: kubelet
     - watch:
       - file: master-docker-image-tags
       - file: /etc/kubernetes/kube-master-addons.sh
From 9dbe6fe4e45aeeb5cb5abad8a7d216a3da1d6f57 Mon Sep 17 00:00:00 2001
From: Dawn Chen
Date: Tue, 23 Jun 2015 16:28:41 -0700
Subject: [PATCH 5/5] Added more comments.
---
 cluster/saltbase/salt/kube-master-addons/init.sls | 9 ++++++++-
 pkg/kubelet/container_bridge.go | 3 +++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/cluster/saltbase/salt/kube-master-addons/init.sls b/cluster/saltbase/salt/kube-master-addons/init.sls
index 94286345dee..075a7a05113 100644
--- a/cluster/saltbase/salt/kube-master-addons/init.sls
+++ b/cluster/saltbase/salt/kube-master-addons/init.sls
@@ -39,10 +39,17 @@ master-docker-image-tags:
 
 # Current containervm image by default has both docker and kubelet
 # running. But during cluster creation stage, docker and kubelet
-# could be overwritten completely, or restarted due flag changes.
+# could be overwritten completely, or restarted due to flag changes.
 # The ordering of salt states for service docker, kubelet and
 # master-addon below is very important to avoid the race between
 # salt restart docker or kubelet and kubelet start master components.
+# Without this ordering of salt states, when a GCE instance boots up,
+# configure-vm.sh runs and downloads the release. At the end of boot,
+# run-salt starts the kube-master-addons service, which installs the
+# master component manifest files into the kubelet config directory
+# before the proper version of the kubelet has been installed. See
+# https://github.com/GoogleCloudPlatform/kubernetes/issues/10122#issuecomment-114566063
+# for a detailed explanation of this issue.
 kube-master-addons:
   service.running:
     - enable: True
diff --git a/pkg/kubelet/container_bridge.go b/pkg/kubelet/container_bridge.go
index 5fb6319ec5a..4ef58f4aaee 100644
--- a/pkg/kubelet/container_bridge.go
+++ b/pkg/kubelet/container_bridge.go
@@ -85,6 +85,9 @@ func ensureCbr0(wantCIDR *net.IPNet) error {
 	return nil
 }
 
+// cbr0Exists checks whether the cbr0 network interface is present on the
+// node. A missing interface is reported as (false, nil); any other error
+// from the check is propagated to the kubelet to handle.
 func cbr0Exists() (bool, error) {
 	if _, err := os.Stat("/sys/class/net/cbr0"); err != nil {
 		if os.IsNotExist(err) {
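
The following is a minimal, standalone sketch of the bridge-handling pattern the patches above introduce in pkg/kubelet/container_bridge.go and pkg/util/util.go: probe /sys/class/net for the bridge, create it with brctl and ip when it is missing, and restart docker via systemctl or service depending on the detected init system. The helper names (bridgeExists, usingSystemd, ensureBridge) and the example CIDR are illustrative assumptions, not the kubelet's actual API, and error handling is simplified relative to the kubelet, which only logs a failed docker restart.

// bridgecheck: illustrative sketch of the cbr0 handling added by these patches.
package main

import (
	"fmt"
	"net"
	"os"
	"os/exec"
)

// bridgeExists reports whether a network interface with the given name is
// present, using the same /sys/class/net probe as cbr0Exists in the patch.
func bridgeExists(name string) (bool, error) {
	if _, err := os.Stat("/sys/class/net/" + name); err != nil {
		if os.IsNotExist(err) {
			return false, nil
		}
		return false, err
	}
	return true, nil
}

// usingSystemd mirrors the /run/systemd/system check used to decide between
// "systemctl restart docker" and "service docker restart".
func usingSystemd() bool {
	_, err := os.Stat("/run/systemd/system")
	return err == nil
}

// ensureBridge creates the bridge with brctl/ip if it is missing, then
// restarts docker so the daemon picks up the --bridge=cbr0 setting.
func ensureBridge(name string, cidr *net.IPNet) error {
	exists, err := bridgeExists(name)
	if err != nil || exists {
		return err
	}
	cmds := [][]string{
		{"brctl", "addbr", name},
		{"ip", "addr", "add", cidr.String(), "dev", name},
		{"ip", "link", "set", "dev", name, "up"},
	}
	for _, c := range cmds {
		if out, err := exec.Command(c[0], c[1:]...).CombinedOutput(); err != nil {
			return fmt.Errorf("%v failed: %v (%s)", c, err, out)
		}
	}
	restart := []string{"service", "docker", "restart"}
	if usingSystemd() {
		restart = []string{"systemctl", "restart", "docker"}
	}
	// The kubelet only logs a docker restart failure; here it is returned.
	return exec.Command(restart[0], restart[1:]...).Run()
}

func main() {
	// Example CIDR only; the kubelet derives this from Node.Spec.PodCIDR
	// or the --pod-cidr flag.
	_, cidr, err := net.ParseCIDR("10.244.1.0/24")
	if err != nil {
		panic(err)
	}
	if err := ensureBridge("cbr0", cidr); err != nil {
		fmt.Fprintln(os.Stderr, "ensureBridge:", err)
	}
}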