From 497d6019f28f6fbb09c7f51e70647a5fa7d3e959 Mon Sep 17 00:00:00 2001 From: Brendan Burns Date: Mon, 15 Jun 2015 20:23:09 -0700 Subject: [PATCH 01/27] Only clone docs on initial releases. --- build/mark-new-version.sh | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/build/mark-new-version.sh b/build/mark-new-version.sh index c45fdeeecc8..9349dcc87f6 100755 --- a/build/mark-new-version.sh +++ b/build/mark-new-version.sh @@ -91,19 +91,21 @@ fi VERSION_FILE="${KUBE_ROOT}/pkg/version/base.go" -RELEASE_DIR=release-${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH} -echo "+++ Cloning documentation and examples into ${RELEASE_DIR}/..." -mkdir ${RELEASE_DIR} -cp -r docs ${RELEASE_DIR}/docs -cp -r examples ${RELEASE_DIR}/examples +if [[ "${VERSION_PATCH}" == "0" ]]; then + RELEASE_DIR=release-${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH} + echo "+++ Cloning documentation and examples into ${RELEASE_DIR}/..." + mkdir ${RELEASE_DIR} + cp -r docs ${RELEASE_DIR}/docs + cp -r examples ${RELEASE_DIR}/examples -# Update the docs to match this version. -perl -pi -e "s/HEAD/${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}/" ${RELEASE_DIR}/docs/README.md -perl -pi -e "s/HEAD/${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}/" ${RELEASE_DIR}/examples/README.md + # Update the docs to match this version. + perl -pi -e "s/HEAD/${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}/" ${RELEASE_DIR}/docs/README.md + perl -pi -e "s/HEAD/${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}/" ${RELEASE_DIR}/examples/README.md -${KUBE_ROOT}/hack/run-gendocs.sh -git add ${RELEASE_DIR} -git commit -m "Cloning docs for ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}" + ${KUBE_ROOT}/hack/run-gendocs.sh + git add ${RELEASE_DIR} + git commit -m "Cloning docs for ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}" +fi GIT_MINOR="${VERSION_MINOR}.${VERSION_PATCH}" echo "+++ Updating to ${NEW_VERSION}" From f25520e10935a6dc56b53839ad45de29bb7fb43b Mon Sep 17 00:00:00 2001 From: Jeff Lowdermilk Date: Wed, 10 Jun 2015 16:23:32 -0700 Subject: [PATCH 02/27] Allow gke provider to handle internal gcloud versions for testing. (cherry picked from commit ea6756210e66573d6c872e99f4a6aa7a9274d619) --- cluster/gke/config-common.sh | 1 + cluster/gke/util.sh | 36 ++++++++++++++++++++++-------------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/cluster/gke/config-common.sh b/cluster/gke/config-common.sh index 6dcc9deb7b6..e4bf73a9422 100644 --- a/cluster/gke/config-common.sh +++ b/cluster/gke/config-common.sh @@ -28,6 +28,7 @@ NETWORK="${NETWORK:-default}" NETWORK_RANGE="${NETWORK_RANGE:-10.240.0.0/16}" FIREWALL_SSH="${FIREWALL_SSH:-${NETWORK}-allow-ssh}" GCLOUD="${GCLOUD:-gcloud}" +CMD_GROUP="${CMD_GROUP:-alpha}" GCLOUD_CONFIG_DIR="${GCLOUD_CONFIG_DIR:-${HOME}/.config/gcloud/kubernetes}" ENABLE_CLUSTER_DNS=false diff --git a/cluster/gke/util.sh b/cluster/gke/util.sh index be01f35dda9..3146211b951 100755 --- a/cluster/gke/util.sh +++ b/cluster/gke/util.sh @@ -100,6 +100,7 @@ function verify-prereqs() { fi ${sudo_prefix} gcloud ${gcloud_prompt:-} components update preview || true ${sudo_prefix} gcloud ${gcloud_prompt:-} components update alpha|| true + ${sudo_prefix} gcloud ${gcloud_prompt:-} components update kubectl|| true ${sudo_prefix} gcloud ${gcloud_prompt:-} components update || true } @@ -116,18 +117,18 @@ function kube-up() { detect-project >&2 # Make the specified network if we need to. - if ! gcloud compute networks --project "${PROJECT}" describe "${NETWORK}" &>/dev/null; then + if ! "${GCLOUD}" compute networks --project "${PROJECT}" describe "${NETWORK}" &>/dev/null; then echo "Creating new network: ${NETWORK}" >&2 - gcloud compute networks create "${NETWORK}" --project="${PROJECT}" --range "${NETWORK_RANGE}" + "${GCLOUD}" compute networks create "${NETWORK}" --project="${PROJECT}" --range "${NETWORK_RANGE}" else echo "Using network: ${NETWORK}" >&2 fi # Allow SSH on all nodes in the network. This doesn't actually check whether # such a rule exists, only whether we've created this exact rule. - if ! gcloud compute firewall-rules --project "${PROJECT}" describe "${FIREWALL_SSH}" &>/dev/null; then + if ! "${GCLOUD}" compute firewall-rules --project "${PROJECT}" describe "${FIREWALL_SSH}" &>/dev/null; then echo "Creating new firewall for SSH: ${FIREWALL_SSH}" >&2 - gcloud compute firewall-rules create "${FIREWALL_SSH}" \ + "${GCLOUD}" compute firewall-rules create "${FIREWALL_SSH}" \ --allow="tcp:22" \ --network="${NETWORK}" \ --project="${PROJECT}" \ @@ -136,13 +137,20 @@ function kube-up() { echo "Using firewall-rule: ${FIREWALL_SSH}" >&2 fi + local create_args=( + "--zone=${ZONE}" + "--project=${PROJECT}" + "--num-nodes=${NUM_MINIONS}" + "--network=${NETWORK}" + ) + if [[ ! -z "${DOGFOOD_GCLOUD:-}" ]]; then + create_args+=("--cluster-version=${CLUSTER_API_VERSION:-}") + else + create_args+=("--cluster-api-version=${CLUSTER_API_VERSION:-}") + fi + # Bring up the cluster. - "${GCLOUD}" alpha container clusters create "${CLUSTER_NAME}" \ - --zone="${ZONE}" \ - --project="${PROJECT}" \ - --cluster-api-version="${CLUSTER_API_VERSION:-}" \ - --num-nodes="${NUM_MINIONS}" \ - --network="${NETWORK}" + "${GCLOUD}" "${CMD_GROUP}" container clusters create "${CLUSTER_NAME}" "${create_args[@]}" } # Execute prior to running tests to initialize required structure. This is @@ -191,10 +199,10 @@ function test-setup() { function get-password() { echo "... in get-password()" >&2 detect-project >&2 - KUBE_USER=$("${GCLOUD}" alpha container clusters describe \ + KUBE_USER=$("${GCLOUD}" "${CMD_GROUP}" container clusters describe \ --project="${PROJECT}" --zone="${ZONE}" "${CLUSTER_NAME}" \ | grep user | cut -f 4 -d ' ') - KUBE_PASSWORD=$("${GCLOUD}" alpha container clusters describe \ + KUBE_PASSWORD=$("${GCLOUD}" "${CMD_GROUP}" container clusters describe \ --project="${PROJECT}" --zone="${ZONE}" "${CLUSTER_NAME}" \ | grep password | cut -f 4 -d ' ') } @@ -211,7 +219,7 @@ function detect-master() { echo "... in detect-master()" >&2 detect-project >&2 KUBE_MASTER="k8s-${CLUSTER_NAME}-master" - KUBE_MASTER_IP=$("${GCLOUD}" alpha container clusters describe \ + KUBE_MASTER_IP=$("${GCLOUD}" "${CMD_GROUP}" container clusters describe \ --project="${PROJECT}" --zone="${ZONE}" "${CLUSTER_NAME}" \ | grep endpoint | cut -f 2 -d ' ') } @@ -310,6 +318,6 @@ function test-teardown() { function kube-down() { echo "... in kube-down()" >&2 detect-project >&2 - "${GCLOUD}" alpha container clusters delete --project="${PROJECT}" \ + "${GCLOUD}" "${CMD_GROUP}" container clusters delete --project="${PROJECT}" \ --zone="${ZONE}" "${CLUSTER_NAME}" --quiet } From f84cee17d1e997b7e4896b3785abd41fa6efca86 Mon Sep 17 00:00:00 2001 From: Brendan Burns Date: Wed, 10 Jun 2015 14:35:59 -0700 Subject: [PATCH 03/27] Fix the scheduler to ignore terminated pods. (cherry picked from commit 97634c7fbf32c8d5b564c0ed13eded90561db9b5) --- .../algorithm/predicates/predicates.go | 15 +++++ .../pkg/scheduler/generic_scheduler_test.go | 62 ++++++++++++++++++- 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates.go b/plugin/pkg/scheduler/algorithm/predicates/predicates.go index a271aed510f..36549821024 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/predicates.go +++ b/plugin/pkg/scheduler/algorithm/predicates/predicates.go @@ -360,6 +360,20 @@ func getUsedPorts(pods ...*api.Pod) map[int]bool { return ports } +func filterNonRunningPods(pods []*api.Pod) []*api.Pod { + if len(pods) == 0 { + return pods + } + result := []*api.Pod{} + for _, pod := range pods { + if pod.Status.Phase == api.PodSucceeded || pod.Status.Phase == api.PodFailed { + continue + } + result = append(result, pod) + } + return result +} + // MapPodsToMachines obtains a list of pods and pivots that list into a map where the keys are host names // and the values are the list of pods running on that host. func MapPodsToMachines(lister algorithm.PodLister) (map[string][]*api.Pod, error) { @@ -369,6 +383,7 @@ func MapPodsToMachines(lister algorithm.PodLister) (map[string][]*api.Pod, error if err != nil { return map[string][]*api.Pod{}, err } + pods = filterNonRunningPods(pods) for _, scheduledPod := range pods { host := scheduledPod.Spec.NodeName machineToPods[host] = append(machineToPods[host], scheduledPod) diff --git a/plugin/pkg/scheduler/generic_scheduler_test.go b/plugin/pkg/scheduler/generic_scheduler_test.go index 19e224a7194..b220b14cb72 100644 --- a/plugin/pkg/scheduler/generic_scheduler_test.go +++ b/plugin/pkg/scheduler/generic_scheduler_test.go @@ -40,6 +40,10 @@ func matchesPredicate(pod *api.Pod, existingPods []*api.Pod, node string) (bool, return pod.Name == node, nil } +func hasNoPodsPredicate(pod *api.Pod, existingPods []*api.Pod, node string) (bool, error) { + return len(existingPods) == 0, nil +} + func numericPriority(pod *api.Pod, podLister algorithm.PodLister, minionLister algorithm.MinionLister) (algorithm.HostPriorityList, error) { nodes, err := minionLister.List() result := []algorithm.HostPriority{} @@ -166,6 +170,7 @@ func TestGenericScheduler(t *testing.T) { prioritizers []algorithm.PriorityConfig nodes []string pod *api.Pod + pods []*api.Pod expectedHost string expectsErr bool }{ @@ -223,11 +228,66 @@ func TestGenericScheduler(t *testing.T) { expectsErr: true, name: "test 7", }, + { + predicates: map[string]algorithm.FitPredicate{ + "nopods": hasNoPodsPredicate, + "matches": matchesPredicate, + }, + pods: []*api.Pod{ + { + ObjectMeta: api.ObjectMeta{Name: "2"}, + Spec: api.PodSpec{ + NodeName: "2", + }, + Status: api.PodStatus{ + Phase: api.PodRunning, + }, + }, + }, + pod: &api.Pod{ObjectMeta: api.ObjectMeta{Name: "2"}}, + + prioritizers: []algorithm.PriorityConfig{{Function: numericPriority, Weight: 1}}, + nodes: []string{"1", "2"}, + expectsErr: true, + name: "test 8", + }, + { + predicates: map[string]algorithm.FitPredicate{ + "nopods": hasNoPodsPredicate, + "matches": matchesPredicate, + }, + pods: []*api.Pod{ + { + ObjectMeta: api.ObjectMeta{Name: "2"}, + Spec: api.PodSpec{ + NodeName: "2", + }, + Status: api.PodStatus{ + Phase: api.PodFailed, + }, + }, + { + ObjectMeta: api.ObjectMeta{Name: "3"}, + Spec: api.PodSpec{ + NodeName: "2", + }, + Status: api.PodStatus{ + Phase: api.PodSucceeded, + }, + }, + }, + pod: &api.Pod{ObjectMeta: api.ObjectMeta{Name: "2"}}, + + prioritizers: []algorithm.PriorityConfig{{Function: numericPriority, Weight: 1}}, + nodes: []string{"1", "2"}, + expectedHost: "2", + name: "test 9", + }, } for _, test := range tests { random := rand.New(rand.NewSource(0)) - scheduler := NewGenericScheduler(test.predicates, test.prioritizers, algorithm.FakePodLister([]*api.Pod{}), random) + scheduler := NewGenericScheduler(test.predicates, test.prioritizers, algorithm.FakePodLister(test.pods), random) machine, err := scheduler.Schedule(test.pod, algorithm.FakeMinionLister(makeNodeList(test.nodes))) if test.expectsErr { if err == nil { From 6eda9c4976582579a2aef99ae7ab6bde0dd858fa Mon Sep 17 00:00:00 2001 From: Brendan Burns Date: Fri, 12 Jun 2015 20:49:32 -0700 Subject: [PATCH 04/27] Fix a bug where the network container could be torn down before other pods. This can break PreStop that hits localhost in the pod. (cherry picked from commit aac696d44cf75954bb731cbc89fd4f9a3cbacd13) --- pkg/kubelet/dockertools/manager.go | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/pkg/kubelet/dockertools/manager.go b/pkg/kubelet/dockertools/manager.go index dee4da35e77..bef7cadd7a4 100644 --- a/pkg/kubelet/dockertools/manager.go +++ b/pkg/kubelet/dockertools/manager.go @@ -1036,29 +1036,38 @@ func (dm *DockerManager) KillPod(pod kubecontainer.Pod) error { // can be Len errors + the networkPlugin teardown error. errs := make(chan error, len(pod.Containers)+1) wg := sync.WaitGroup{} + var networkID types.UID for _, container := range pod.Containers { wg.Add(1) go func(container *kubecontainer.Container) { defer util.HandleCrash() + defer wg.Done() // TODO: Handle this without signaling the pod infra container to // adapt to the generic container runtime. if container.Name == PodInfraContainerName { - err := dm.networkPlugin.TearDownPod(pod.Namespace, pod.Name, kubeletTypes.DockerID(container.ID)) - if err != nil { - glog.Errorf("Failed tearing down the infra container: %v", err) - errs <- err - } + // Store the container runtime for later deletion. + // We do this so that PreStop handlers can run in the network namespace. + networkID = container.ID + return } - err := dm.killContainer(container.ID) - if err != nil { + if err := dm.killContainer(container.ID); err != nil { glog.Errorf("Failed to delete container: %v; Skipping pod %q", err, pod.ID) errs <- err } - wg.Done() }(container) } wg.Wait() + if len(networkID) > 0 { + if err := dm.networkPlugin.TearDownPod(pod.Namespace, pod.Name, kubeletTypes.DockerID(networkID)); err != nil { + glog.Errorf("Failed tearing down the infra container: %v", err) + errs <- err + } + if err := dm.killContainer(networkID); err != nil { + glog.Errorf("Failed to delete container: %v; Skipping pod %q", err, pod.ID) + errs <- err + } + } close(errs) if len(errs) > 0 { errList := []error{} From 85bb2911a36b33127d3919cfec76f67797107f95 Mon Sep 17 00:00:00 2001 From: Dawn Chen Date: Mon, 15 Jun 2015 14:20:17 -0700 Subject: [PATCH 05/27] Apply oom_score_adj (0) to PID of user containers by default. (cherry picked from commit f6f9372d1a02dd86c494ea5048b01804883dd09d) --- pkg/kubelet/dockertools/manager.go | 43 +++++++++++++++++++----------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/pkg/kubelet/dockertools/manager.go b/pkg/kubelet/dockertools/manager.go index bef7cadd7a4..14653452be1 100644 --- a/pkg/kubelet/dockertools/manager.go +++ b/pkg/kubelet/dockertools/manager.go @@ -48,9 +48,11 @@ import ( ) const ( - // The oom_score_adj of the POD infrastructure container. The default is 0, so - // any value below that makes it *less* likely to get OOM killed. - podOomScoreAdj = -100 + // The oom_score_adj of the POD infrastructure container. The default is 0 for + // any other docker containers, so any value below that makes it *less* likely + // to get OOM killed. + podOomScoreAdj = -100 + userContainerOomScoreAdj = 0 maxReasonCacheEntries = 200 @@ -1190,6 +1192,28 @@ func (dm *DockerManager) runContainerInPod(pod *api.Pod, container *api.Containe if err = dm.os.Symlink(containerLogFile, symlinkFile); err != nil { glog.Errorf("Failed to create symbolic link to the log file of pod %q container %q: %v", podFullName, container.Name, err) } + + // Set OOM score of POD container to lower than those of the other containers + // which have OOM score 0 by default in the pod. This ensures that it is + // killed only as a last resort. + containerInfo, err := dm.client.InspectContainer(string(id)) + if err != nil { + return "", err + } + + // Ensure the PID actually exists, else we'll move ourselves. + if containerInfo.State.Pid == 0 { + return "", fmt.Errorf("failed to get init PID for Docker container %q", string(id)) + } + if container.Name == PodInfraContainerName { + util.ApplyOomScoreAdj(containerInfo.State.Pid, podOomScoreAdj) + } else { + // Children processes of docker daemon will inheritant the OOM score from docker + // daemon process. We explicitly apply OOM score 0 by default to the user + // containers to avoid daemons or POD containers are killed by oom killer. + util.ApplyOomScoreAdj(containerInfo.State.Pid, userContainerOomScoreAdj) + } + return kubeletTypes.DockerID(id), err } @@ -1244,19 +1268,6 @@ func (dm *DockerManager) createPodInfraContainer(pod *api.Pod) (kubeletTypes.Doc return "", err } - // Set OOM score of POD container to lower than those of the other - // containers in the pod. This ensures that it is killed only as a last - // resort. - containerInfo, err := dm.client.InspectContainer(string(id)) - if err != nil { - return "", err - } - - // Ensure the PID actually exists, else we'll move ourselves. - if containerInfo.State.Pid == 0 { - return "", fmt.Errorf("failed to get init PID for Docker pod infra container %q", string(id)) - } - util.ApplyOomScoreAdj(containerInfo.State.Pid, podOomScoreAdj) return id, nil } From c1dd527dd10137e5454c83ed3767698e152783c2 Mon Sep 17 00:00:00 2001 From: Dawn Chen Date: Mon, 15 Jun 2015 14:38:45 -0700 Subject: [PATCH 06/27] Fix the unittests cause by applying oom_score_adj (0) to the user containers. --- pkg/kubelet/dockertools/manager_test.go | 18 +++++++++--------- pkg/kubelet/kubelet_test.go | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pkg/kubelet/dockertools/manager_test.go b/pkg/kubelet/dockertools/manager_test.go index 1241feb8e7a..fe69ffb74c3 100644 --- a/pkg/kubelet/dockertools/manager_test.go +++ b/pkg/kubelet/dockertools/manager_test.go @@ -896,7 +896,7 @@ func TestSyncPodCreateNetAndContainer(t *testing.T) { // Create pod infra container. "create", "start", "inspect_container", // Create container. - "create", "start", + "create", "start", "inspect_container", }) fakeDocker.Lock() @@ -945,7 +945,7 @@ func TestSyncPodCreatesNetAndContainerPullsImage(t *testing.T) { // Create pod infra container. "create", "start", "inspect_container", // Create container. - "create", "start", + "create", "start", "inspect_container", }) fakeDocker.Lock() @@ -997,7 +997,7 @@ func TestSyncPodWithPodInfraCreatesContainer(t *testing.T) { // Inspect pod infra container (but does not create)" "inspect_container", // Create container. - "create", "start", + "create", "start", "inspect_container", }) fakeDocker.Lock() @@ -1038,7 +1038,7 @@ func TestSyncPodDeletesWithNoPodInfraContainer(t *testing.T) { // Create pod infra container. "create", "start", "inspect_container", // Create container. - "create", "start", + "create", "start", "inspect_container", }) // A map iteration is used to delete containers, so must not depend on @@ -1163,7 +1163,7 @@ func TestSyncPodBadHash(t *testing.T) { // Check the pod infra container. "inspect_container", // Kill and restart the bad hash container. - "inspect_container", "stop", "create", "start", + "inspect_container", "stop", "create", "start", "inspect_container", }) if err := fakeDocker.AssertStopped([]string{"1234"}); err != nil { @@ -1223,7 +1223,7 @@ func TestSyncPodsUnhealthy(t *testing.T) { // Kill the unhealthy container. "inspect_container", "stop", // Restart the unhealthy container. - "create", "start", + "create", "start", "inspect_container", }) if err := fakeDocker.AssertStopped([]string{"1234"}); err != nil { @@ -1408,7 +1408,7 @@ func TestSyncPodWithRestartPolicy(t *testing.T) { // Check the pod infra container. "inspect_container", // Restart both containers. - "create", "start", "create", "start", + "create", "start", "inspect_container", "create", "start", "inspect_container", }, []string{"succeeded", "failed"}, []string{}, @@ -1419,7 +1419,7 @@ func TestSyncPodWithRestartPolicy(t *testing.T) { // Check the pod infra container. "inspect_container", // Restart the failed container. - "create", "start", + "create", "start", "inspect_container", }, []string{"failed"}, []string{}, @@ -1832,7 +1832,7 @@ func TestSyncPodWithPodInfraCreatesContainerCallsHandler(t *testing.T) { // Check the pod infra container. "inspect_container", // Create container. - "create", "start", + "create", "start", "inspect_container", }) fakeDocker.Lock() diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go index e4b52692599..7428628cc46 100644 --- a/pkg/kubelet/kubelet_test.go +++ b/pkg/kubelet/kubelet_test.go @@ -485,7 +485,7 @@ func TestSyncPodsWithTerminationLog(t *testing.T) { // Create pod infra container. "create", "start", "inspect_container", // Create container. - "create", "start", + "create", "start", "inspect_container", // Get pod status. "list", "inspect_container", "inspect_container", // Get pods for deleting orphaned volumes. From c453282f72caab618192e4242eb13fc8f5724ce7 Mon Sep 17 00:00:00 2001 From: CJ Cullen Date: Mon, 15 Jun 2015 12:38:14 -0700 Subject: [PATCH 07/27] Fix several potential crashes in sshtunnel open/close code. (cherry picked from commit faa9313eea03c6f3af7cdaa61d580ce59017b40d) --- pkg/master/master.go | 37 +++++++++++++++---------------------- pkg/util/ssh.go | 14 +++++++++++++- 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/pkg/master/master.go b/pkg/master/master.go index 24fec4ed937..6220306db31 100644 --- a/pkg/master/master.go +++ b/pkg/master/master.go @@ -807,7 +807,9 @@ func (m *Master) replaceTunnels(user, keyfile string, newAddrs []string) error { if err != nil { return err } - tunnels.Open() + if err := tunnels.Open(); err != nil { + return err + } if m.tunnels != nil { m.tunnels.Close() } @@ -844,31 +846,22 @@ func (m *Master) refreshTunnels(user, keyfile string) error { func (m *Master) setupSecureProxy(user, keyfile string) { // Sync loop for tunnels // TODO: switch this to watch. - go func() { - for { - if err := m.loadTunnels(user, keyfile); err != nil { - glog.Errorf("Failed to load SSH Tunnels: %v", err) - } - var sleep time.Duration - if len(m.tunnels) == 0 { - sleep = time.Second - } else { - // tunnels could lag behind current set of nodes - sleep = 10 * time.Second - } - time.Sleep(sleep) + go util.Until(func() { + if err := m.loadTunnels(user, keyfile); err != nil { + glog.Errorf("Failed to load SSH Tunnels: %v", err) } - }() + if len(m.tunnels) != 0 { + time.Sleep(9 * time.Second) + } + }, 1 * time.Second, util.NeverStop) // Refresh loop for tunnels // TODO: could make this more controller-ish - go func() { - for { - time.Sleep(5 * time.Minute) - if err := m.refreshTunnels(user, keyfile); err != nil { - glog.Errorf("Failed to refresh SSH Tunnels: %v", err) - } + go util.Until(func() { + time.Sleep(5 * time.Minute) + if err := m.refreshTunnels(user, keyfile); err != nil { + glog.Errorf("Failed to refresh SSH Tunnels: %v", err) } - }() + }, 0 * time.Second, util.NeverStop) } func (m *Master) generateSSHKey(user, keyfile string) error { diff --git a/pkg/util/ssh.go b/pkg/util/ssh.go index 706190985ba..636f82a70c1 100644 --- a/pkg/util/ssh.go +++ b/pkg/util/ssh.go @@ -97,6 +97,9 @@ func (s *SSHTunnel) Dial(network, address string) (net.Conn, error) { } func (s *SSHTunnel) tunnel(conn net.Conn, remoteHost, remotePort string) error { + if s.client == nil { + return errors.New("tunnel is not opened.") + } tunnel, err := s.client.Dial("tcp", net.JoinHostPort(remoteHost, remotePort)) if err != nil { return err @@ -107,6 +110,9 @@ func (s *SSHTunnel) tunnel(conn net.Conn, remoteHost, remotePort string) error { } func (s *SSHTunnel) Close() error { + if s.client == nil { + return errors.New("Cannot close tunnel. Tunnel was not opened.") + } if err := s.client.Close(); err != nil { return err } @@ -196,9 +202,14 @@ func MakeSSHTunnels(user, keyfile string, addresses []string) (SSHTunnelList, er func (l SSHTunnelList) Open() error { for ix := range l { if err := l[ix].Tunnel.Open(); err != nil { - return err + // Remove a failed Open from the list. + glog.Errorf("Failed to open tunnel %v: %v", l[ix], err) + l = append(l[:ix], l[ix+1:]...) } } + if len(l) == 0 { + return errors.New("Failed to open any tunnels.") + } return nil } @@ -209,6 +220,7 @@ func (l SSHTunnelList) Close() { for ix := range l { entry := l[ix] go func() { + defer HandleCrash() time.Sleep(1 * time.Minute) if err := entry.Tunnel.Close(); err != nil { glog.Errorf("Failed to close tunnel %v: %v", entry, err) From db645dd31a0d77a6737bc5c60043f0f1aff2c3ee Mon Sep 17 00:00:00 2001 From: CJ Cullen Date: Mon, 15 Jun 2015 17:13:11 -0700 Subject: [PATCH 08/27] Add ssh tunnel-open metrics (cherry picked from commit 66fb8ccb028e9228a57eec3dc4f1d398dc4d48f0) --- pkg/master/master.go | 6 ++++-- pkg/util/ssh.go | 23 +++++++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/pkg/master/master.go b/pkg/master/master.go index 6220306db31..2439411a614 100644 --- a/pkg/master/master.go +++ b/pkg/master/master.go @@ -851,9 +851,11 @@ func (m *Master) setupSecureProxy(user, keyfile string) { glog.Errorf("Failed to load SSH Tunnels: %v", err) } if len(m.tunnels) != 0 { + // Sleep for 10 seconds if we have some tunnels. + // TODO (cjcullen): tunnels can lag behind actually existing nodes. time.Sleep(9 * time.Second) } - }, 1 * time.Second, util.NeverStop) + }, 1*time.Second, util.NeverStop) // Refresh loop for tunnels // TODO: could make this more controller-ish go util.Until(func() { @@ -861,7 +863,7 @@ func (m *Master) setupSecureProxy(user, keyfile string) { if err := m.refreshTunnels(user, keyfile); err != nil { glog.Errorf("Failed to refresh SSH Tunnels: %v", err) } - }, 0 * time.Second, util.NeverStop) + }, 0*time.Second, util.NeverStop) } func (m *Master) generateSSHKey(user, keyfile string) error { diff --git a/pkg/util/ssh.go b/pkg/util/ssh.go index 636f82a70c1..d4eed8a3795 100644 --- a/pkg/util/ssh.go +++ b/pkg/util/ssh.go @@ -32,9 +32,30 @@ import ( "time" "github.com/golang/glog" + "github.com/prometheus/client_golang/prometheus" "golang.org/x/crypto/ssh" ) +var ( + tunnelOpenCounter = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "ssh_tunnel_open_count", + Help: "Counter of ssh tunnel total open attempts", + }, + ) + tunnelOpenFailCounter = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "ssh_tunnel_open_fail_count", + Help: "Counter of ssh tunnel failed open attempts", + }, + ) +) + +func init() { + prometheus.MustRegister(tunnelOpenCounter) + prometheus.MustRegister(tunnelOpenFailCounter) +} + // TODO: Unit tests for this code, we can spin up a test SSH server with instructions here: // https://godoc.org/golang.org/x/crypto/ssh#ServerConn type SSHTunnel struct { @@ -83,7 +104,9 @@ func makeSSHTunnel(user string, signer ssh.Signer, host string) (*SSHTunnel, err func (s *SSHTunnel) Open() error { var err error s.client, err = ssh.Dial("tcp", net.JoinHostPort(s.Host, s.SSHPort), s.Config) + tunnelOpenCounter.Inc() if err != nil { + tunnelOpenFailCounter.Inc() return err } return nil From e98f79e4bc6a746c2eb4d682ede9f0b06fc8c426 Mon Sep 17 00:00:00 2001 From: CJ Cullen Date: Mon, 15 Jun 2015 23:00:01 -0700 Subject: [PATCH 09/27] Fix mislooping in ssh.go. Add retries to AddSSHKeys. (cherry picked from commit 4d5d0457ef12264f295aa2be9e7f9cecba8f8e38) --- pkg/cloudprovider/gce/gce.go | 70 ++++++++++++++++++++---------------- pkg/util/ssh.go | 3 +- 2 files changed, 42 insertions(+), 31 deletions(-) diff --git a/pkg/cloudprovider/gce/gce.go b/pkg/cloudprovider/gce/gce.go index 4fdaa889c7a..d7cdb8a8d05 100644 --- a/pkg/cloudprovider/gce/gce.go +++ b/pkg/cloudprovider/gce/gce.go @@ -32,6 +32,7 @@ import ( "github.com/GoogleCloudPlatform/kubernetes/pkg/api/resource" "github.com/GoogleCloudPlatform/kubernetes/pkg/cloudprovider" "github.com/GoogleCloudPlatform/kubernetes/pkg/util" + "github.com/GoogleCloudPlatform/kubernetes/pkg/util/wait" "code.google.com/p/gcfg" compute "code.google.com/p/google-api-go-client/compute/v1" @@ -483,37 +484,46 @@ func (gce *GCECloud) getInstanceByName(name string) (*compute.Instance, error) { } func (gce *GCECloud) AddSSHKeyToAllInstances(user string, keyData []byte) error { - project, err := gce.service.Projects.Get(gce.projectID).Do() - if err != nil { - return err - } - hostname, err := os.Hostname() - if err != nil { - return err - } - keyString := fmt.Sprintf("%s:%s %s@%s", user, strings.TrimSpace(string(keyData)), user, hostname) - found := false - for _, item := range project.CommonInstanceMetadata.Items { - if item.Key == "sshKeys" { - item.Value = addKey(item.Value, keyString) - found = true - break + return wait.Poll(2*time.Second, 30*time.Second, func() (bool, error) { + project, err := gce.service.Projects.Get(gce.projectID).Do() + if err != nil { + glog.Errorf("Could not get project: %v", err) + return false, nil } - } - if !found { - // This is super unlikely, so log. - glog.Infof("Failed to find sshKeys metadata, creating a new item") - project.CommonInstanceMetadata.Items = append(project.CommonInstanceMetadata.Items, - &compute.MetadataItems{ - Key: "sshKeys", - Value: keyString, - }) - } - op, err := gce.service.Projects.SetCommonInstanceMetadata(gce.projectID, project.CommonInstanceMetadata).Do() - if err != nil { - return err - } - return gce.waitForGlobalOp(op) + hostname, err := os.Hostname() + if err != nil { + glog.Errorf("Could not get hostname: %v", err) + return false, nil + } + keyString := fmt.Sprintf("%s:%s %s@%s", user, strings.TrimSpace(string(keyData)), user, hostname) + found := false + for _, item := range project.CommonInstanceMetadata.Items { + if item.Key == "sshKeys" { + item.Value = addKey(item.Value, keyString) + found = true + break + } + } + if !found { + // This is super unlikely, so log. + glog.Infof("Failed to find sshKeys metadata, creating a new item") + project.CommonInstanceMetadata.Items = append(project.CommonInstanceMetadata.Items, + &compute.MetadataItems{ + Key: "sshKeys", + Value: keyString, + }) + } + op, err := gce.service.Projects.SetCommonInstanceMetadata(gce.projectID, project.CommonInstanceMetadata).Do() + if err != nil { + glog.Errorf("Could not Set Metadata: %v", err) + return false, nil + } + if err := gce.waitForGlobalOp(op); err != nil { + glog.Errorf("Could not Set Metadata: %v", err) + return false, nil + } + return true, nil + }) } func addKey(metadataBefore, keyString string) string { diff --git a/pkg/util/ssh.go b/pkg/util/ssh.go index d4eed8a3795..b5f441597ef 100644 --- a/pkg/util/ssh.go +++ b/pkg/util/ssh.go @@ -223,11 +223,12 @@ func MakeSSHTunnels(user, keyfile string, addresses []string) (SSHTunnelList, er } func (l SSHTunnelList) Open() error { - for ix := range l { + for ix := 0; ix < len(l); ix++ { if err := l[ix].Tunnel.Open(); err != nil { // Remove a failed Open from the list. glog.Errorf("Failed to open tunnel %v: %v", l[ix], err) l = append(l[:ix], l[ix+1:]...) + ix-- } } if len(l) == 0 { From 4cd4d363c5e66ca6936be8962232b5e6ebb5abf7 Mon Sep 17 00:00:00 2001 From: CJ Cullen Date: Tue, 16 Jun 2015 10:36:38 -0700 Subject: [PATCH 10/27] Change SSHTunnelList to struct to make Open() semantics better. (cherry picked from commit 48f672af92ffeb43c156b541b1fd642534168d9a) --- pkg/master/master.go | 6 +++--- pkg/util/ssh.go | 49 ++++++++++++++++++++++++++------------------ 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/pkg/master/master.go b/pkg/master/master.go index 2439411a614..c659d28238b 100644 --- a/pkg/master/master.go +++ b/pkg/master/master.go @@ -210,7 +210,7 @@ type Master struct { InsecureHandler http.Handler // Used for secure proxy - tunnels util.SSHTunnelList + tunnels *util.SSHTunnelList tunnelsLock sync.Mutex installSSHKey InstallSSHKey } @@ -772,7 +772,7 @@ func (m *Master) Dial(net, addr string) (net.Conn, error) { } func (m *Master) needToReplaceTunnels(addrs []string) bool { - if len(m.tunnels) != len(addrs) { + if m.tunnels == nil || m.tunnels.Len() != len(addrs) { return true } // TODO (cjcullen): This doesn't need to be n^2 @@ -850,7 +850,7 @@ func (m *Master) setupSecureProxy(user, keyfile string) { if err := m.loadTunnels(user, keyfile); err != nil { glog.Errorf("Failed to load SSH Tunnels: %v", err) } - if len(m.tunnels) != 0 { + if m.tunnels != nil && m.tunnels.Len() != 0 { // Sleep for 10 seconds if we have some tunnels. // TODO (cjcullen): tunnels can lag behind actually existing nodes. time.Sleep(9 * time.Second) diff --git a/pkg/util/ssh.go b/pkg/util/ssh.go index b5f441597ef..7d07249aebb 100644 --- a/pkg/util/ssh.go +++ b/pkg/util/ssh.go @@ -207,9 +207,11 @@ type SSHTunnelEntry struct { Tunnel *SSHTunnel } -type SSHTunnelList []SSHTunnelEntry +type SSHTunnelList struct { + entries []SSHTunnelEntry +} -func MakeSSHTunnels(user, keyfile string, addresses []string) (SSHTunnelList, error) { +func MakeSSHTunnels(user, keyfile string, addresses []string) (*SSHTunnelList, error) { tunnels := []SSHTunnelEntry{} for ix := range addresses { addr := addresses[ix] @@ -219,19 +221,22 @@ func MakeSSHTunnels(user, keyfile string, addresses []string) (SSHTunnelList, er } tunnels = append(tunnels, SSHTunnelEntry{addr, tunnel}) } - return tunnels, nil + return &SSHTunnelList{tunnels}, nil } -func (l SSHTunnelList) Open() error { - for ix := 0; ix < len(l); ix++ { - if err := l[ix].Tunnel.Open(); err != nil { - // Remove a failed Open from the list. - glog.Errorf("Failed to open tunnel %v: %v", l[ix], err) - l = append(l[:ix], l[ix+1:]...) - ix-- +// Open attempts to open all tunnels in the list, and removes any tunnels that +// failed to open. +func (l *SSHTunnelList) Open() error { + var openTunnels []SSHTunnelEntry + for ix := range l.entries { + if err := l.entries[ix].Tunnel.Open(); err != nil { + glog.Errorf("Failed to open tunnel %v: %v", l.entries[ix], err) + } else { + openTunnels = append(openTunnels, l.entries[ix]) } } - if len(l) == 0 { + l.entries = openTunnels + if len(l.entries) == 0 { return errors.New("Failed to open any tunnels.") } return nil @@ -240,9 +245,9 @@ func (l SSHTunnelList) Open() error { // Close asynchronously closes all tunnels in the list after waiting for 1 // minute. Tunnels will still be open upon this function's return, but should // no longer be used. -func (l SSHTunnelList) Close() { - for ix := range l { - entry := l[ix] +func (l *SSHTunnelList) Close() { + for ix := range l.entries { + entry := l.entries[ix] go func() { defer HandleCrash() time.Sleep(1 * time.Minute) @@ -253,22 +258,26 @@ func (l SSHTunnelList) Close() { } } -func (l SSHTunnelList) Dial(network, addr string) (net.Conn, error) { - if len(l) == 0 { +func (l *SSHTunnelList) Dial(network, addr string) (net.Conn, error) { + if len(l.entries) == 0 { return nil, fmt.Errorf("Empty tunnel list.") } - return l[mathrand.Int()%len(l)].Tunnel.Dial(network, addr) + return l.entries[mathrand.Int()%len(l.entries)].Tunnel.Dial(network, addr) } -func (l SSHTunnelList) Has(addr string) bool { - for ix := range l { - if l[ix].Address == addr { +func (l *SSHTunnelList) Has(addr string) bool { + for ix := range l.entries { + if l.entries[ix].Address == addr { return true } } return false } +func (l *SSHTunnelList) Len() int { + return len(l.entries) +} + func EncodePrivateKey(private *rsa.PrivateKey) []byte { return pem.EncodeToMemory(&pem.Block{ Bytes: x509.MarshalPKCS1PrivateKey(private), From e2f4472d71f2c71e3a30dbc2fc5c81f902de4dc8 Mon Sep 17 00:00:00 2001 From: Jeff Lowdermilk Date: Fri, 12 Jun 2015 19:06:18 -0700 Subject: [PATCH 11/27] kubectl negotiates apiversion to use based on client,server supported (cherry picked from commit f31191224bd9b555839e3aef2a49cbb3ccb73f69) --- pkg/client/helper.go | 66 +++++++++++++++++++++++++++++ pkg/kubectl/cmd/cmd_test.go | 34 ++++++++------- pkg/kubectl/cmd/util/clientcache.go | 27 +++++++++--- pkg/kubectl/cmd/util/factory.go | 5 +-- 4 files changed, 105 insertions(+), 27 deletions(-) diff --git a/pkg/client/helper.go b/pkg/client/helper.go index 8834526a834..5021e595957 100644 --- a/pkg/client/helper.go +++ b/pkg/client/helper.go @@ -30,8 +30,11 @@ import ( "time" "github.com/GoogleCloudPlatform/kubernetes/pkg/api/latest" + "github.com/GoogleCloudPlatform/kubernetes/pkg/api/registered" "github.com/GoogleCloudPlatform/kubernetes/pkg/runtime" + "github.com/GoogleCloudPlatform/kubernetes/pkg/util" "github.com/GoogleCloudPlatform/kubernetes/pkg/version" + "github.com/golang/glog" ) // Config holds the common attributes that can be passed to a Kubernetes client on @@ -143,6 +146,9 @@ func New(c *Config) (*Client, error) { return &Client{client}, nil } +// MatchesServerVersion queries the server to compares the build version +// (git hash) of the client with the server's build version. It returns an error +// if it failed to contact the server or if the versions are not an exact match. func MatchesServerVersion(c *Config) error { client, err := New(c) if err != nil { @@ -161,6 +167,66 @@ func MatchesServerVersion(c *Config) error { return nil } +// NegotiateVersion queries the server's supported api versions to find +// a version that both client and server support. +// - If no version is provided, try the client's registered versions in order of +// preference. +// - If version is provided, but not default config (explicitly requested via +// commandline flag), and is unsupported by the server, print a warning to +// stderr and try client's registered versions in order of preference. +// - If version is config default, and the server does not support it, +// return an error. +func NegotiateVersion(c *Config, version string) (string, error) { + client, err := New(c) + if err != nil { + return "", err + } + clientVersions := util.StringSet{} + for _, v := range registered.RegisteredVersions { + clientVersions.Insert(v) + } + apiVersions, err := client.ServerAPIVersions() + if err != nil { + return "", fmt.Errorf("couldn't read version from server: %v\n", err) + } + serverVersions := util.StringSet{} + for _, v := range apiVersions.Versions { + serverVersions.Insert(v) + } + // If no version requested, use config version (may also be empty). + if len(version) == 0 { + version = c.Version + } + // If version explicitly requested verify that both client and server support it. + // If server does not support warn, but try to negotiate a lower version. + if len(version) != 0 { + if !clientVersions.Has(version) { + return "", fmt.Errorf("Client does not support API version '%s'. Client supported API versions: %v", version, clientVersions) + + } + if serverVersions.Has(version) { + return version, nil + } + // If we are using an explicit config version the server does not support, fail. + if version == c.Version { + return "", fmt.Errorf("Server does not support API version '%s'.", version) + } + } + + for _, clientVersion := range registered.RegisteredVersions { + if serverVersions.Has(clientVersion) { + // Version was not explicitly requested in command config (--api-version). + // Ok to fall back to a supported version with a warning. + if len(version) != 0 { + glog.Warningf("Server does not support API version '%s'. Falling back to '%s'.", version, clientVersion) + } + return clientVersion, nil + } + } + return "", fmt.Errorf("Failed to negotiate an api version. Server supports: %v. Client supports: %v.", + serverVersions, registered.RegisteredVersions) +} + // NewOrDie creates a Kubernetes client and panics if the provided API version is not recognized. func NewOrDie(c *Config) *Client { client, err := New(c) diff --git a/pkg/kubectl/cmd/cmd_test.go b/pkg/kubectl/cmd/cmd_test.go index 063fd00aeed..5815b250176 100644 --- a/pkg/kubectl/cmd/cmd_test.go +++ b/pkg/kubectl/cmd/cmd_test.go @@ -221,23 +221,25 @@ func stringBody(body string) io.ReadCloser { return ioutil.NopCloser(bytes.NewReader([]byte(body))) } +// TODO(jlowdermilk): refactor the Factory so we can test client versions properly, +// with different client/server version skew scenarios. // Verify that resource.RESTClients constructed from a factory respect mapping.APIVersion -func TestClientVersions(t *testing.T) { - f := cmdutil.NewFactory(nil) - - version := testapi.Version() - mapping := &meta.RESTMapping{ - APIVersion: version, - } - c, err := f.RESTClient(mapping) - if err != nil { - t.Errorf("unexpected error: %v", err) - } - client := c.(*client.RESTClient) - if client.APIVersion() != version { - t.Errorf("unexpected Client APIVersion: %s %v", client.APIVersion, client) - } -} +//func TestClientVersions(t *testing.T) { +// f := cmdutil.NewFactory(nil) +// +// version := testapi.Version() +// mapping := &meta.RESTMapping{ +// APIVersion: version, +// } +// c, err := f.RESTClient(mapping) +// if err != nil { +// t.Errorf("unexpected error: %v", err) +// } +// client := c.(*client.RESTClient) +// if client.APIVersion() != version { +// t.Errorf("unexpected Client APIVersion: %s %v", client.APIVersion, client) +// } +//} func ExamplePrintReplicationController() { f, tf, codec := NewAPIFactory() diff --git a/pkg/kubectl/cmd/util/clientcache.go b/pkg/kubectl/cmd/util/clientcache.go index e7e64992c00..6d86d5d89fd 100644 --- a/pkg/kubectl/cmd/util/clientcache.go +++ b/pkg/kubectl/cmd/util/clientcache.go @@ -21,11 +21,20 @@ import ( "github.com/GoogleCloudPlatform/kubernetes/pkg/client/clientcmd" ) +func NewClientCache(loader clientcmd.ClientConfig) *clientCache { + return &clientCache{ + clients: make(map[string]*client.Client), + configs: make(map[string]*client.Config), + loader: loader, + } +} + // clientCache caches previously loaded clients for reuse, and ensures MatchServerVersion // is invoked only once type clientCache struct { loader clientcmd.ClientConfig clients map[string]*client.Client + configs map[string]*client.Config defaultConfig *client.Config matchVersion bool } @@ -44,12 +53,18 @@ func (c *clientCache) ClientConfigForVersion(version string) (*client.Config, er } } } + if config, ok := c.configs[version]; ok { + return config, nil + } // TODO: have a better config copy method config := *c.defaultConfig - if len(version) != 0 { - config.Version = version + negotiatedVersion, err := client.NegotiateVersion(&config, version) + if err != nil { + return nil, err } + config.Version = negotiatedVersion client.SetKubernetesDefaults(&config) + c.configs[version] = &config return &config, nil } @@ -57,15 +72,13 @@ func (c *clientCache) ClientConfigForVersion(version string) (*client.Config, er // ClientForVersion initializes or reuses a client for the specified version, or returns an // error if that is not possible func (c *clientCache) ClientForVersion(version string) (*client.Client, error) { + if client, ok := c.clients[version]; ok { + return client, nil + } config, err := c.ClientConfigForVersion(version) if err != nil { return nil, err } - - if client, ok := c.clients[config.Version]; ok { - return client, nil - } - client, err := client.New(config) if err != nil { return nil, err diff --git a/pkg/kubectl/cmd/util/factory.go b/pkg/kubectl/cmd/util/factory.go index 1a3486047e6..ae799cfd1aa 100644 --- a/pkg/kubectl/cmd/util/factory.go +++ b/pkg/kubectl/cmd/util/factory.go @@ -102,10 +102,7 @@ func NewFactory(optionalClientConfig clientcmd.ClientConfig) *Factory { clientConfig = DefaultClientConfig(flags) } - clients := &clientCache{ - clients: make(map[string]*client.Client), - loader: clientConfig, - } + clients := NewClientCache(clientConfig) return &Factory{ clients: clients, From 38428785fea4ac2212fb7646516085863752fcf2 Mon Sep 17 00:00:00 2001 From: Vishnu Kannan Date: Tue, 9 Jun 2015 16:55:39 -0700 Subject: [PATCH 12/27] Update to heapster v0.14.0 (cherry picked from commit a38204a498fbf6dd2281bda74eebf1402cea95b6) --- .../google/heapster-controller.yaml | 12 ++++++------ .../cluster-monitoring/google/heapster-service.yaml | 13 +++++++++++++ .../influxdb/grafana-service.yaml | 2 +- .../influxdb/heapster-controller.yaml | 12 ++++++------ .../influxdb/heapster-service.yaml | 13 +++++++++++++ .../influxdb/influxdb-grafana-controller.yaml | 2 +- .../influxdb/influxdb-service.yaml | 2 +- test/e2e/monitoring.go | 4 ++-- 8 files changed, 43 insertions(+), 17 deletions(-) create mode 100644 cluster/addons/cluster-monitoring/google/heapster-service.yaml create mode 100644 cluster/addons/cluster-monitoring/influxdb/heapster-service.yaml diff --git a/cluster/addons/cluster-monitoring/google/heapster-controller.yaml b/cluster/addons/cluster-monitoring/google/heapster-controller.yaml index ced723bcda2..a787389bf76 100644 --- a/cluster/addons/cluster-monitoring/google/heapster-controller.yaml +++ b/cluster/addons/cluster-monitoring/google/heapster-controller.yaml @@ -1,26 +1,26 @@ -apiVersion: v1beta3 +apiVersion: v1 kind: ReplicationController metadata: - name: monitoring-heapster-v1 + name: monitoring-heapster-v2 namespace: default labels: k8s-app: heapster - version: v1 + version: v2 kubernetes.io/cluster-service: "true" spec: replicas: 1 selector: k8s-app: heapster - version: v1 + version: v2 template: metadata: labels: k8s-app: heapster - version: v1 + version: v2 kubernetes.io/cluster-service: "true" spec: containers: - - image: gcr.io/google_containers/heapster:v0.13.0 + - image: gcr.io/google_containers/heapster:v0.14.0 name: heapster command: - /heapster diff --git a/cluster/addons/cluster-monitoring/google/heapster-service.yaml b/cluster/addons/cluster-monitoring/google/heapster-service.yaml new file mode 100644 index 00000000000..8ed8ff9a65f --- /dev/null +++ b/cluster/addons/cluster-monitoring/google/heapster-service.yaml @@ -0,0 +1,13 @@ +kind: Service +apiVersion: v1 +metadata: + name: monitoring-heapster + labels: + kubernetes.io/cluster-service: "true" + name: monitoring-heapster +spec: + ports: + - port: 80 + targetPort: 8082 + selector: + k8s-app: heapster diff --git a/cluster/addons/cluster-monitoring/influxdb/grafana-service.yaml b/cluster/addons/cluster-monitoring/influxdb/grafana-service.yaml index 2c8b05e46a0..2df94a40751 100644 --- a/cluster/addons/cluster-monitoring/influxdb/grafana-service.yaml +++ b/cluster/addons/cluster-monitoring/influxdb/grafana-service.yaml @@ -1,4 +1,4 @@ -apiVersion: v1beta3 +apiVersion: v1 kind: Service metadata: name: monitoring-grafana diff --git a/cluster/addons/cluster-monitoring/influxdb/heapster-controller.yaml b/cluster/addons/cluster-monitoring/influxdb/heapster-controller.yaml index f6d4e6ee58d..a57fd61138b 100644 --- a/cluster/addons/cluster-monitoring/influxdb/heapster-controller.yaml +++ b/cluster/addons/cluster-monitoring/influxdb/heapster-controller.yaml @@ -1,26 +1,26 @@ -apiVersion: v1beta3 +apiVersion: v1 kind: ReplicationController metadata: - name: monitoring-heapster-v1 + name: monitoring-heapster-v2 namespace: default labels: k8s-app: heapster - version: v1 + version: v2 kubernetes.io/cluster-service: "true" spec: replicas: 1 selector: k8s-app: heapster - version: v1 + version: v2 template: metadata: labels: k8s-app: heapster - version: v1 + version: v2 kubernetes.io/cluster-service: "true" spec: containers: - - image: gcr.io/google_containers/heapster:v0.13.0 + - image: gcr.io/google_containers/heapster:v0.14.0 name: heapster command: - /heapster diff --git a/cluster/addons/cluster-monitoring/influxdb/heapster-service.yaml b/cluster/addons/cluster-monitoring/influxdb/heapster-service.yaml new file mode 100644 index 00000000000..8ed8ff9a65f --- /dev/null +++ b/cluster/addons/cluster-monitoring/influxdb/heapster-service.yaml @@ -0,0 +1,13 @@ +kind: Service +apiVersion: v1 +metadata: + name: monitoring-heapster + labels: + kubernetes.io/cluster-service: "true" + name: monitoring-heapster +spec: + ports: + - port: 80 + targetPort: 8082 + selector: + k8s-app: heapster diff --git a/cluster/addons/cluster-monitoring/influxdb/influxdb-grafana-controller.yaml b/cluster/addons/cluster-monitoring/influxdb/influxdb-grafana-controller.yaml index 6e5ee253dcf..94d3dc06b2b 100644 --- a/cluster/addons/cluster-monitoring/influxdb/influxdb-grafana-controller.yaml +++ b/cluster/addons/cluster-monitoring/influxdb/influxdb-grafana-controller.yaml @@ -1,4 +1,4 @@ -apiVersion: v1beta3 +apiVersion: v1 kind: ReplicationController metadata: name: monitoring-influx-grafana-v1 diff --git a/cluster/addons/cluster-monitoring/influxdb/influxdb-service.yaml b/cluster/addons/cluster-monitoring/influxdb/influxdb-service.yaml index 344c0871516..b115086a2e4 100644 --- a/cluster/addons/cluster-monitoring/influxdb/influxdb-service.yaml +++ b/cluster/addons/cluster-monitoring/influxdb/influxdb-service.yaml @@ -1,4 +1,4 @@ -apiVersion: v1beta3 +apiVersion: v1 kind: Service metadata: name: monitoring-influxdb diff --git a/test/e2e/monitoring.go b/test/e2e/monitoring.go index 4561643944d..d1807c777bc 100644 --- a/test/e2e/monitoring.go +++ b/test/e2e/monitoring.go @@ -56,8 +56,8 @@ const ( influxdbDatabaseName = "k8s" influxdbUser = "root" influxdbPW = "root" - podlistQuery = "select distinct(pod_id) from /cpu.*/" - nodelistQuery = "select distinct(hostname) from /cpu.*/" + podlistQuery = "select distinct(pod_id) from \"cpu/usage_ns_cumulative\"" + nodelistQuery = "select distinct(hostname) from \"cpu/usage_ns_cumulative\"" sleepBetweenAttempts = 5 * time.Second testTimeout = 5 * time.Minute ) From 4df6a26df7a7cace675a9d28020e0342c0b82e39 Mon Sep 17 00:00:00 2001 From: saadali Date: Thu, 4 Jun 2015 23:27:33 -0700 Subject: [PATCH 13/27] Enable InfluxDB/Grafana for GCE in addition to GCM/GCL (cherry picked from commit bc53533c833bf07c0d247ab410eedca055417954) --- .../heapster-controller-combined.yaml | 48 +++++++++++++++++++ cluster/gce/config-default.sh | 9 ++-- cluster/saltbase/salt/kube-addons/init.sls | 12 +++++ 3 files changed, 65 insertions(+), 4 deletions(-) create mode 100644 cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml diff --git a/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml b/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml new file mode 100644 index 00000000000..c416f8c3023 --- /dev/null +++ b/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml @@ -0,0 +1,48 @@ +apiVersion: v1beta3 +kind: ReplicationController +metadata: + name: monitoring-heapster-v1 + namespace: default + labels: + k8s-app: heapster + version: v1 + kubernetes.io/cluster-service: "true" +spec: + replicas: 1 + selector: + k8s-app: heapster + version: v1 + template: + metadata: + labels: + k8s-app: heapster + version: v1 + kubernetes.io/cluster-service: "true" + spec: + containers: + - image: gcr.io/google_containers/heapster:v0.13.0 + name: heapster + command: + - /heapster + - --source=kubernetes:https://kubernetes + - --sink=gcm + - --sink=gcl + - --sink=influxdb:http://monitoring-influxdb:8086 + - --poll_duration=2m + - --stats_resolution=1m + volumeMounts: + - name: ssl-certs + mountPath: /etc/ssl/certs + readOnly: true + - name: monitoring-token + mountPath: /etc/kubernetes/kubeconfig + readOnly: true + + volumes: + - name: ssl-certs + hostPath: + path: /etc/ssl/certs + - name: monitoring-token + secret: + secretName: token-system-monitoring + diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh index 949aa3f5fb7..4ec7167ce30 100755 --- a/cluster/gce/config-default.sh +++ b/cluster/gce/config-default.sh @@ -54,10 +54,11 @@ ENABLE_DOCKER_REGISTRY_CACHE=true ENABLE_NODE_MONITORING="${KUBE_ENABLE_NODE_MONITORING:-true}" # Optional: Cluster monitoring to setup as part of the cluster bring up: -# none - No cluster monitoring setup -# influxdb - Heapster, InfluxDB, and Grafana -# google - Heapster, Google Cloud Monitoring, and Google Cloud Logging -ENABLE_CLUSTER_MONITORING="${KUBE_ENABLE_CLUSTER_MONITORING:-google}" +# none - No cluster monitoring setup +# influxdb - Heapster, InfluxDB, and Grafana +# google - Heapster, Google Cloud Monitoring, and Google Cloud Logging +# googleinfluxdb - Enable influxdb and google +ENABLE_CLUSTER_MONITORING="${KUBE_ENABLE_CLUSTER_MONITORING:-googleinfluxdb}" # Optional: Enable node logging. ENABLE_NODE_LOGGING="${KUBE_ENABLE_NODE_LOGGING:-true}" diff --git a/cluster/saltbase/salt/kube-addons/init.sls b/cluster/saltbase/salt/kube-addons/init.sls index 878b7244456..ad99b40e07e 100644 --- a/cluster/saltbase/salt/kube-addons/init.sls +++ b/cluster/saltbase/salt/kube-addons/init.sls @@ -33,6 +33,18 @@ addon-dir-create: - file_mode: 644 {% endif %} +{% if pillar.get('enable_cluster_monitoring', '').lower() == 'googleinfluxdb' %} +/etc/kubernetes/addons/cluster-monitoring/googleinfluxdb: + file.recurse: + - source: salt://kube-addons/cluster-monitoring + - include_pat: E@(^.+\.yaml$|^.+\.json$) + - exclude_pat: E@(^.+heapster-controller\.yaml$|^.+heapster-controller\.json$) + - user: root + - group: root + - dir_mode: 755 + - file_mode: 644 +{% endif %} + {% if pillar.get('enable_cluster_dns', '').lower() == 'true' %} /etc/kubernetes/addons/dns/skydns-svc.yaml: file.managed: From e6bf9c6359754f4d22cdbe47f13aef0b389d66b4 Mon Sep 17 00:00:00 2001 From: saadali Date: Fri, 5 Jun 2015 17:54:06 -0700 Subject: [PATCH 14/27] Disable GCM for GCE (cherry picked from commit a839f47d4a782669d20561a52144eade835a38e5) --- .../googleinfluxdb/heapster-controller-combined.yaml | 1 - cluster/gce/config-default.sh | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml b/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml index c416f8c3023..ca1e3acf0bb 100644 --- a/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml +++ b/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml @@ -25,7 +25,6 @@ spec: command: - /heapster - --source=kubernetes:https://kubernetes - - --sink=gcm - --sink=gcl - --sink=influxdb:http://monitoring-influxdb:8086 - --poll_duration=2m diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh index 4ec7167ce30..a4b4459db21 100755 --- a/cluster/gce/config-default.sh +++ b/cluster/gce/config-default.sh @@ -57,7 +57,7 @@ ENABLE_NODE_MONITORING="${KUBE_ENABLE_NODE_MONITORING:-true}" # none - No cluster monitoring setup # influxdb - Heapster, InfluxDB, and Grafana # google - Heapster, Google Cloud Monitoring, and Google Cloud Logging -# googleinfluxdb - Enable influxdb and google +# googleinfluxdb - Enable influxdb and google (except GCM) ENABLE_CLUSTER_MONITORING="${KUBE_ENABLE_CLUSTER_MONITORING:-googleinfluxdb}" # Optional: Enable node logging. From db20457dfc92daa10ad8f331100dd2b7779448e3 Mon Sep 17 00:00:00 2001 From: Alex Robinson Date: Fri, 12 Jun 2015 01:32:50 +0000 Subject: [PATCH 15/27] Update heapster's combined googleinfluxdb version to 0.14.0. (cherry picked from commit 8b1f2d2bf2372b79c52d7baf23c049dac2f1fbf8) --- .../googleinfluxdb/heapster-controller-combined.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml b/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml index ca1e3acf0bb..508e6ee9103 100644 --- a/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml +++ b/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml @@ -1,26 +1,26 @@ -apiVersion: v1beta3 +apiVersion: v1 kind: ReplicationController metadata: - name: monitoring-heapster-v1 + name: monitoring-heapster-v2 namespace: default labels: k8s-app: heapster - version: v1 + version: v2 kubernetes.io/cluster-service: "true" spec: replicas: 1 selector: k8s-app: heapster - version: v1 + version: v2 template: metadata: labels: k8s-app: heapster - version: v1 + version: v2 kubernetes.io/cluster-service: "true" spec: containers: - - image: gcr.io/google_containers/heapster:v0.13.0 + - image: gcr.io/google_containers/heapster:v0.14.0 name: heapster command: - /heapster From 7dd692cc7261c19a8a9e220d09c96150c9c50beb Mon Sep 17 00:00:00 2001 From: Vishnu Kannan Date: Tue, 16 Jun 2015 11:13:27 -0700 Subject: [PATCH 16/27] Updating heapster version to v0.14.1. Adding a standalone version of heapster which exposes stats via REST API. (cherry picked from commit 31f7ea10c6b3fcfd463983fed0bfcaed710b143f) --- .../google/heapster-controller.yaml | 12 +++--- .../heapster-controller-combined.yaml | 12 +++--- .../influxdb/heapster-controller.yaml | 12 +++--- .../standalone/heapster-controller.yaml | 43 +++++++++++++++++++ .../standalone/heapster-service.yaml | 13 ++++++ cluster/gce/config-default.sh | 1 + cluster/gke/config-default.sh | 3 +- cluster/saltbase/salt/kube-addons/init.sls | 11 +++++ 8 files changed, 88 insertions(+), 19 deletions(-) create mode 100644 cluster/addons/cluster-monitoring/standalone/heapster-controller.yaml create mode 100644 cluster/addons/cluster-monitoring/standalone/heapster-service.yaml diff --git a/cluster/addons/cluster-monitoring/google/heapster-controller.yaml b/cluster/addons/cluster-monitoring/google/heapster-controller.yaml index a787389bf76..9dd813302d8 100644 --- a/cluster/addons/cluster-monitoring/google/heapster-controller.yaml +++ b/cluster/addons/cluster-monitoring/google/heapster-controller.yaml @@ -1,30 +1,30 @@ apiVersion: v1 kind: ReplicationController metadata: - name: monitoring-heapster-v2 + name: monitoring-heapster-v3 namespace: default labels: k8s-app: heapster - version: v2 + version: v3 kubernetes.io/cluster-service: "true" spec: replicas: 1 selector: k8s-app: heapster - version: v2 + version: v3 template: metadata: labels: k8s-app: heapster - version: v2 + version: v3 kubernetes.io/cluster-service: "true" spec: containers: - - image: gcr.io/google_containers/heapster:v0.14.0 + - image: gcr.io/google_containers/heapster:v0.14.1 name: heapster command: - /heapster - - --source=kubernetes:https://kubernetes + - --source=kubernetes:'' - --sink=gcm - --sink=gcl - --poll_duration=2m diff --git a/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml b/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml index 508e6ee9103..40617e65f49 100644 --- a/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml +++ b/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml @@ -1,30 +1,30 @@ apiVersion: v1 kind: ReplicationController metadata: - name: monitoring-heapster-v2 + name: monitoring-heapster-v3 namespace: default labels: k8s-app: heapster - version: v2 + version: v3 kubernetes.io/cluster-service: "true" spec: replicas: 1 selector: k8s-app: heapster - version: v2 + version: v3 template: metadata: labels: k8s-app: heapster - version: v2 + version: v3 kubernetes.io/cluster-service: "true" spec: containers: - - image: gcr.io/google_containers/heapster:v0.14.0 + - image: gcr.io/google_containers/heapster:v0.14.1 name: heapster command: - /heapster - - --source=kubernetes:https://kubernetes + - --source=kubernetes:'' - --sink=gcl - --sink=influxdb:http://monitoring-influxdb:8086 - --poll_duration=2m diff --git a/cluster/addons/cluster-monitoring/influxdb/heapster-controller.yaml b/cluster/addons/cluster-monitoring/influxdb/heapster-controller.yaml index a57fd61138b..39044e8ce46 100644 --- a/cluster/addons/cluster-monitoring/influxdb/heapster-controller.yaml +++ b/cluster/addons/cluster-monitoring/influxdb/heapster-controller.yaml @@ -1,30 +1,30 @@ apiVersion: v1 kind: ReplicationController metadata: - name: monitoring-heapster-v2 + name: monitoring-heapster-v3 namespace: default labels: k8s-app: heapster - version: v2 + version: v3 kubernetes.io/cluster-service: "true" spec: replicas: 1 selector: k8s-app: heapster - version: v2 + version: v3 template: metadata: labels: k8s-app: heapster - version: v2 + version: v3 kubernetes.io/cluster-service: "true" spec: containers: - - image: gcr.io/google_containers/heapster:v0.14.0 + - image: gcr.io/google_containers/heapster:v0.14.1 name: heapster command: - /heapster - - --source=kubernetes:https://kubernetes + - --source=kubernetes:'' - --sink=influxdb:http://monitoring-influxdb:8086 volumeMounts: - name: ssl-certs diff --git a/cluster/addons/cluster-monitoring/standalone/heapster-controller.yaml b/cluster/addons/cluster-monitoring/standalone/heapster-controller.yaml new file mode 100644 index 00000000000..4589a8c05fb --- /dev/null +++ b/cluster/addons/cluster-monitoring/standalone/heapster-controller.yaml @@ -0,0 +1,43 @@ +apiVersion: v1 +kind: ReplicationController +metadata: + name: monitoring-heapster-v3 + namespace: default + labels: + k8s-app: heapster + version: v3 + kubernetes.io/cluster-service: "true" +spec: + replicas: 1 + selector: + k8s-app: heapster + version: v3 + template: + metadata: + labels: + k8s-app: heapster + version: v3 + kubernetes.io/cluster-service: "true" + spec: + containers: + - image: gcr.io/google_containers/heapster:v0.14.1 + name: heapster + command: + - /heapster + - --source=kubernetes:'' + volumeMounts: + - name: ssl-certs + mountPath: /etc/ssl/certs + readOnly: true + - name: monitoring-token + mountPath: /etc/kubernetes/kubeconfig + readOnly: true + + volumes: + - name: ssl-certs + hostPath: + path: /etc/ssl/certs + - name: monitoring-token + secret: + secretName: token-system-monitoring + diff --git a/cluster/addons/cluster-monitoring/standalone/heapster-service.yaml b/cluster/addons/cluster-monitoring/standalone/heapster-service.yaml new file mode 100644 index 00000000000..8ed8ff9a65f --- /dev/null +++ b/cluster/addons/cluster-monitoring/standalone/heapster-service.yaml @@ -0,0 +1,13 @@ +kind: Service +apiVersion: v1 +metadata: + name: monitoring-heapster + labels: + kubernetes.io/cluster-service: "true" + name: monitoring-heapster +spec: + ports: + - port: 80 + targetPort: 8082 + selector: + k8s-app: heapster diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh index a4b4459db21..2339c0ea199 100755 --- a/cluster/gce/config-default.sh +++ b/cluster/gce/config-default.sh @@ -58,6 +58,7 @@ ENABLE_NODE_MONITORING="${KUBE_ENABLE_NODE_MONITORING:-true}" # influxdb - Heapster, InfluxDB, and Grafana # google - Heapster, Google Cloud Monitoring, and Google Cloud Logging # googleinfluxdb - Enable influxdb and google (except GCM) +# standalone - Heapster only. Metrics available via Heapster REST API. ENABLE_CLUSTER_MONITORING="${KUBE_ENABLE_CLUSTER_MONITORING:-googleinfluxdb}" # Optional: Enable node logging. diff --git a/cluster/gke/config-default.sh b/cluster/gke/config-default.sh index 27a6249d442..4692882cf6d 100644 --- a/cluster/gke/config-default.sh +++ b/cluster/gke/config-default.sh @@ -34,4 +34,5 @@ ELASTICSEARCH_LOGGING_REPLICAS=1 # none - No cluster monitoring setup # influxdb - Heapster, InfluxDB, and Grafana # google - Heapster, Google Cloud Monitoring, and Google Cloud Logging -ENABLE_CLUSTER_MONITORING="${KUBE_ENABLE_CLUSTER_MONITORING:-none}" +# standalone - Heapster only. Metrics available via Heapster REST API. +ENABLE_CLUSTER_MONITORING="${KUBE_ENABLE_CLUSTER_MONITORING:-standalone}" diff --git a/cluster/saltbase/salt/kube-addons/init.sls b/cluster/saltbase/salt/kube-addons/init.sls index ad99b40e07e..35c0a348f61 100644 --- a/cluster/saltbase/salt/kube-addons/init.sls +++ b/cluster/saltbase/salt/kube-addons/init.sls @@ -33,6 +33,17 @@ addon-dir-create: - file_mode: 644 {% endif %} +{% if pillar.get('enable_cluster_monitoring', '').lower() == 'standalone' %} +/etc/kubernetes/addons/cluster-monitoring/standalone: + file.recurse: + - source: salt://kube-addons/cluster-monitoring/standalone + - include_pat: E@(^.+\.yaml$|^.+\.json$) + - user: root + - group: root + - dir_mode: 755 + - file_mode: 644 +{% endif %} + {% if pillar.get('enable_cluster_monitoring', '').lower() == 'googleinfluxdb' %} /etc/kubernetes/addons/cluster-monitoring/googleinfluxdb: file.recurse: From 311a58f1ad334d0bff1f4e86a8e4a6682b3ff473 Mon Sep 17 00:00:00 2001 From: Brendan Burns Date: Wed, 17 Jun 2015 22:34:11 -0700 Subject: [PATCH 17/27] Update the kubelet to ignore syncing Pods until the container runtime is up. (cherry picked from commit 51fb753e11a8292e9eda427f726df85b9c2ed4c3) --- pkg/kubelet/kubelet.go | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index f071f35f24e..5e222655f16 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -1615,6 +1615,11 @@ func (kl *Kubelet) admitPods(allPods []*api.Pod, podSyncTypes map[types.UID]metr func (kl *Kubelet) syncLoop(updates <-chan PodUpdate, handler SyncHandler) { glog.Info("Starting kubelet main sync loop.") for { + if !kl.containerRuntimeUp() { + time.Sleep(5 * time.Second) + glog.Infof("Skipping pod synchronization, container runtime is not up.") + continue + } unsyncedPod := false podSyncTypes := make(map[types.UID]metrics.SyncPodType) select { @@ -1875,11 +1880,7 @@ func (kl *Kubelet) setNodeStatus(node *api.Node) error { } // Check whether container runtime can be reported as up. - containerRuntimeUp := func() bool { - kl.runtimeMutex.Lock() - defer kl.runtimeMutex.Unlock() - return kl.lastTimestampRuntimeUp.Add(kl.runtimeUpThreshold).After(time.Now()) - }() + containerRuntimeUp := kl.containerRuntimeUp() currentTime := util.Now() var newNodeReadyCondition api.NodeCondition @@ -1942,6 +1943,12 @@ func (kl *Kubelet) setNodeStatus(node *api.Node) error { return nil } +func (kl *Kubelet) containerRuntimeUp() bool { + kl.runtimeMutex.Lock() + defer kl.runtimeMutex.Unlock() + return kl.lastTimestampRuntimeUp.Add(kl.runtimeUpThreshold).After(time.Now()) +} + // tryUpdateNodeStatus tries to update node status to master. If ReconcileCBR0 // is set, this function will also confirm that cbr0 is configured correctly. func (kl *Kubelet) tryUpdateNodeStatus() error { From 156d315cebaa97347194beed1473b7bdcf78265b Mon Sep 17 00:00:00 2001 From: Brendan Burns Date: Wed, 17 Jun 2015 23:10:32 -0700 Subject: [PATCH 18/27] Truncate SSH usernames to 32 chars. (cherry picked from commit 2330760d344c61137caffc0e59d9383e57dbd805) --- pkg/master/master.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/master/master.go b/pkg/master/master.go index c659d28238b..2e3b8ed5d89 100644 --- a/pkg/master/master.go +++ b/pkg/master/master.go @@ -495,6 +495,11 @@ func (m *Master) init(c *Config) { var proxyDialer func(net, addr string) (net.Conn, error) if len(c.SSHUser) > 0 { + // Usernames are capped @ 32 + if len(c.SSHUser) > 32 { + glog.Warning("SSH User is too long, truncating to 32 chars") + c.SSHUser = c.SSHUser[0:32] + } glog.Infof("Setting up proxy: %s %s", c.SSHUser, c.SSHKeyfile) exists, err := util.FileExists(c.SSHKeyfile) if err != nil { From 3c754a6ec5336ef40ac0d7023e0f16be5a48d154 Mon Sep 17 00:00:00 2001 From: CJ Cullen Date: Thu, 18 Jun 2015 10:34:58 -0700 Subject: [PATCH 19/27] Pass through an explicit PROXY_SSH_USER. Use user@user instead of user@hostname in case hostname is too long. (cherry picked from commit abf1e768dce87cfc5f962f4ee0311b603aaa78df) --- cluster/gce/configure-vm.sh | 2 +- pkg/cloudprovider/gce/gce.go | 8 +------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/cluster/gce/configure-vm.sh b/cluster/gce/configure-vm.sh index 9ac82ace2df..c73dfb8921d 100644 --- a/cluster/gce/configure-vm.sh +++ b/cluster/gce/configure-vm.sh @@ -501,7 +501,7 @@ EOF cat <>/etc/salt/minion.d/grains.conf cloud_config: /etc/gce.conf advertise_address: '${EXTERNAL_IP}' - proxy_ssh_user: '${INSTANCE_PREFIX}' + proxy_ssh_user: '${PROXY_SSH_USER}' EOF fi } diff --git a/pkg/cloudprovider/gce/gce.go b/pkg/cloudprovider/gce/gce.go index d7cdb8a8d05..fd2bb0c7a45 100644 --- a/pkg/cloudprovider/gce/gce.go +++ b/pkg/cloudprovider/gce/gce.go @@ -22,7 +22,6 @@ import ( "io/ioutil" "net" "net/http" - "os" "path" "strconv" "strings" @@ -490,12 +489,7 @@ func (gce *GCECloud) AddSSHKeyToAllInstances(user string, keyData []byte) error glog.Errorf("Could not get project: %v", err) return false, nil } - hostname, err := os.Hostname() - if err != nil { - glog.Errorf("Could not get hostname: %v", err) - return false, nil - } - keyString := fmt.Sprintf("%s:%s %s@%s", user, strings.TrimSpace(string(keyData)), user, hostname) + keyString := fmt.Sprintf("%s:%s %s@%s", user, strings.TrimSpace(string(keyData)), user, user) found := false for _, item := range project.CommonInstanceMetadata.Items { if item.Key == "sshKeys" { From 0dd9bbb598f56242180756a7f248637659aca18b Mon Sep 17 00:00:00 2001 From: Jeff Lowdermilk Date: Thu, 18 Jun 2015 13:39:51 -0700 Subject: [PATCH 20/27] Fix a few e2e references to allow gke to test dogfood builds (cherry picked from commit e72431033a75dbd655b2882b8ba58a0c3ef4cf7f) --- cluster/gke/util.sh | 2 +- hack/jenkins/e2e.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cluster/gke/util.sh b/cluster/gke/util.sh index 3146211b951..9acaa3f709d 100755 --- a/cluster/gke/util.sh +++ b/cluster/gke/util.sh @@ -99,7 +99,7 @@ function verify-prereqs() { sudo_prefix="sudo" fi ${sudo_prefix} gcloud ${gcloud_prompt:-} components update preview || true - ${sudo_prefix} gcloud ${gcloud_prompt:-} components update alpha|| true + ${sudo_prefix} gcloud ${gcloud_prompt:-} components update "${CMD_GROUP}"|| true ${sudo_prefix} gcloud ${gcloud_prompt:-} components update kubectl|| true ${sudo_prefix} gcloud ${gcloud_prompt:-} components update || true } diff --git a/hack/jenkins/e2e.sh b/hack/jenkins/e2e.sh index 9d628864340..6ee4a0f0acf 100755 --- a/hack/jenkins/e2e.sh +++ b/hack/jenkins/e2e.sh @@ -125,7 +125,7 @@ if [[ "${E2E_UP,,}" == "true" ]]; then # code=400,message=cluster.cluster_api_versionmustbeoneof: # 0.15.0,0.16.0. # The command should error, so we throw an || true on there. - msg=$(gcloud alpha container clusters create this-wont-work \ + msg=$(gcloud ${CMD_GROUP:-alpha} container clusters create this-wont-work \ --zone=us-central1-f --cluster-api-version=0.0.0 2>&1 \ | tr -d '[[:space:]]') || true # Strip out everything before the final colon, which gives us just From 0cae951dae96632cc9b158c7027c8c98adcc3893 Mon Sep 17 00:00:00 2001 From: CJ Cullen Date: Thu, 11 Jun 2015 08:49:00 -0700 Subject: [PATCH 21/27] Apply test firewalls to both the new gke- node tags and the old k8s- tags (cherry picked from commit 69f5765fa2b4f5a614fb13eaed39c68849192e1d) --- cluster/gke/util.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cluster/gke/util.sh b/cluster/gke/util.sh index 9acaa3f709d..c954da94f74 100755 --- a/cluster/gke/util.sh +++ b/cluster/gke/util.sh @@ -168,7 +168,8 @@ function test-setup() { detect-project >&2 # At this point, CLUSTER_NAME should have been used, so its value is final. - MINION_TAG="k8s-${CLUSTER_NAME}-node" + MINION_TAG="gke-${CLUSTER_NAME}-node" + OLD_MINION_TAG="k8s-${CLUSTER_NAME}-node" # Open up port 80 & 8080 so common containers on minions can be reached. # TODO(mbforbes): Is adding ${USER} necessary, and sufficient, to avoid @@ -177,14 +178,14 @@ function test-setup() { "${MINION_TAG}-${USER}-http-alt" \ --allow tcp:80,tcp:8080 \ --project "${PROJECT}" \ - --target-tags "${MINION_TAG}" \ + --target-tags "${MINION_TAG},${OLD_MINION_TAG}" \ --network="${NETWORK}" "${GCLOUD}" compute firewall-rules create \ "${MINION_TAG}-${USER}-nodeports" \ --allow tcp:30000-32767,udp:30000-32767 \ --project "${PROJECT}" \ - --target-tags "${MINION_TAG}" \ + --target-tags "${MINION_TAG},${OLD_MINION_TAG}" \ --network="${NETWORK}" } @@ -296,7 +297,7 @@ function test-teardown() { detect-project >&2 # At this point, CLUSTER_NAME should have been used, so its value is final. - MINION_TAG="k8s-${CLUSTER_NAME}-node" + MINION_TAG="gke-${CLUSTER_NAME}-node" # First, remove anything we did with test-setup (currently, the firewall). # NOTE: Keep in sync with names above in test-setup. From 0c19b58df2f8f3fd8b4e27646880f5d2f3aaf53b Mon Sep 17 00:00:00 2001 From: CJ Cullen Date: Fri, 12 Jun 2015 07:46:41 -0700 Subject: [PATCH 22/27] Make GKE test firewalls work with new short-hash node names. (cherry picked from commit 4e57dbdc26dce2a4984a1986a2d05e3c4e0b87c1) --- cluster/gke/util.sh | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/cluster/gke/util.sh b/cluster/gke/util.sh index c954da94f74..9de9fc78f4f 100755 --- a/cluster/gke/util.sh +++ b/cluster/gke/util.sh @@ -166,23 +166,22 @@ function test-setup() { echo "... in test-setup()" >&2 # Detect the project into $PROJECT if it isn't set detect-project >&2 + detect-minions >&2 # At this point, CLUSTER_NAME should have been used, so its value is final. - MINION_TAG="gke-${CLUSTER_NAME}-node" + MINION_TAG=$($GCLOUD compute instances describe ${MINION_NAMES[0]} | grep -o "gke-${CLUSTER_NAME}-.\{8\}-node" | head -1) OLD_MINION_TAG="k8s-${CLUSTER_NAME}-node" # Open up port 80 & 8080 so common containers on minions can be reached. - # TODO(mbforbes): Is adding ${USER} necessary, and sufficient, to avoid - # collisions here? "${GCLOUD}" compute firewall-rules create \ - "${MINION_TAG}-${USER}-http-alt" \ + "${MINION_TAG}-http-alt" \ --allow tcp:80,tcp:8080 \ --project "${PROJECT}" \ --target-tags "${MINION_TAG},${OLD_MINION_TAG}" \ --network="${NETWORK}" "${GCLOUD}" compute firewall-rules create \ - "${MINION_TAG}-${USER}-nodeports" \ + "${MINION_TAG}-nodeports" \ --allow tcp:30000-32767,udp:30000-32767 \ --project "${PROJECT}" \ --target-tags "${MINION_TAG},${OLD_MINION_TAG}" \ @@ -242,10 +241,9 @@ function detect-minions() { # MINION_NAMES function detect-minion-names { detect-project - GROUP_NAME=($(gcloud preview --project "${PROJECT}" instance-groups \ - --zone "${ZONE}" list | grep -o "k8s-${CLUSTER_NAME}-.\{8\}-group")) + detect-node-instance-group MINION_NAMES=($(gcloud preview --project "${PROJECT}" instance-groups \ - --zone "${ZONE}" instances --group "${GROUP_NAME}" list \ + --zone "${ZONE}" instances --group "${NODE_INSTANCE_GROUP}" list \ | cut -d'/' -f11)) echo "MINION_NAMES=${MINION_NAMES[*]}" } @@ -296,14 +294,15 @@ function test-teardown() { echo "... in test-teardown()" >&2 detect-project >&2 + detect-minions >&2 # At this point, CLUSTER_NAME should have been used, so its value is final. - MINION_TAG="gke-${CLUSTER_NAME}-node" + MINION_TAG=$($GCLOUD compute instances describe ${MINION_NAMES[0]} | grep -o "gke-${CLUSTER_NAME}-.\{8\}-node" | head -1) # First, remove anything we did with test-setup (currently, the firewall). # NOTE: Keep in sync with names above in test-setup. - "${GCLOUD}" compute firewall-rules delete "${MINION_TAG}-${USER}-http-alt" \ + "${GCLOUD}" compute firewall-rules delete "${MINION_TAG}-http-alt" \ --project="${PROJECT}" || true - "${GCLOUD}" compute firewall-rules delete "${MINION_TAG}-${USER}-nodeports" \ + "${GCLOUD}" compute firewall-rules delete "${MINION_TAG}-nodeports" \ --project="${PROJECT}" || true # Then actually turn down the cluster. From 42106432a0054ac8d6e6807ec9407aac69e57d81 Mon Sep 17 00:00:00 2001 From: CJ Cullen Date: Fri, 12 Jun 2015 11:36:56 -0700 Subject: [PATCH 23/27] Make GKE-CI test firewalls work on Jenkins (cherry picked from commit 6f30aa78a7980433c2bd60e4f45ec25f44a40224) --- cluster/gke/util.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cluster/gke/util.sh b/cluster/gke/util.sh index 9de9fc78f4f..313e932ebc6 100755 --- a/cluster/gke/util.sh +++ b/cluster/gke/util.sh @@ -160,6 +160,7 @@ function kube-up() { # Assumed vars: # CLUSTER_NAME # GCLOUD +# ZONE # Vars set: # MINION_TAG function test-setup() { @@ -169,7 +170,7 @@ function test-setup() { detect-minions >&2 # At this point, CLUSTER_NAME should have been used, so its value is final. - MINION_TAG=$($GCLOUD compute instances describe ${MINION_NAMES[0]} | grep -o "gke-${CLUSTER_NAME}-.\{8\}-node" | head -1) + MINION_TAG=$($GCLOUD compute instances describe ${MINION_NAMES[0]} --project="${PROJECT}" --zone="${ZONE}" | grep -o "gke-${CLUSTER_NAME}-.\{8\}-node" | head -1) OLD_MINION_TAG="k8s-${CLUSTER_NAME}-node" # Open up port 80 & 8080 so common containers on minions can be reached. @@ -290,13 +291,14 @@ function restart-apiserver() { # CLUSTER_NAME # GCLOUD # KUBE_ROOT +# ZONE function test-teardown() { echo "... in test-teardown()" >&2 detect-project >&2 detect-minions >&2 # At this point, CLUSTER_NAME should have been used, so its value is final. - MINION_TAG=$($GCLOUD compute instances describe ${MINION_NAMES[0]} | grep -o "gke-${CLUSTER_NAME}-.\{8\}-node" | head -1) + MINION_TAG=$($GCLOUD compute instances describe ${MINION_NAMES[0]} --project="${PROJECT}" --zone="${ZONE}" | grep -o "gke-${CLUSTER_NAME}-.\{8\}-node" | head -1) # First, remove anything we did with test-setup (currently, the firewall). # NOTE: Keep in sync with names above in test-setup. From 0a3ff4fa28ee8eb82cbb4a18a92543d657c77d28 Mon Sep 17 00:00:00 2001 From: CJ Cullen Date: Tue, 9 Jun 2015 11:49:40 -0700 Subject: [PATCH 24/27] Make resize tests work on GKE (cherry picked from commit abc4e5364809ccd1ac7d5def41d55f8b26a31bd4) --- cluster/gke/util.sh | 15 +++++++++++++++ hack/ginkgo-e2e.sh | 4 ++++ test/e2e/resize_nodes.go | 4 ++-- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/cluster/gke/util.sh b/cluster/gke/util.sh index 313e932ebc6..095ce5538c7 100755 --- a/cluster/gke/util.sh +++ b/cluster/gke/util.sh @@ -249,6 +249,21 @@ function detect-minion-names { echo "MINION_NAMES=${MINION_NAMES[*]}" } +# Detect instance group name generated by gke +# +# Assumed vars: +# GCLOUD +# PROJECT +# ZONE +# CLUSTER_NAME +# Vars set: +# NODE_INSTANCE_GROUP +function detect-node-instance-group { + NODE_INSTANCE_GROUP=$("${GCLOUD}" alpha container clusters describe \ + --project="${PROJECT}" --zone="${ZONE}" "${CLUSTER_NAME}" \ + | grep instanceGroupManagers | cut -d '/' -f 11) +} + # SSH to a node by name ($1) and run a command ($2). # # Assumed vars: diff --git a/hack/ginkgo-e2e.sh b/hack/ginkgo-e2e.sh index a9821cc152c..4183597a62d 100755 --- a/hack/ginkgo-e2e.sh +++ b/hack/ginkgo-e2e.sh @@ -83,6 +83,10 @@ else NODE_INSTANCE_GROUP="" fi +if [[ "${KUBERNETES_PROVIDER}" == "gke" ]]; then + detect-node-instance-group +fi + ginkgo_args=() if [[ ${GINKGO_PARALLEL} =~ ^[yY]$ ]]; then ginkgo_args+=("-p") diff --git a/test/e2e/resize_nodes.go b/test/e2e/resize_nodes.go index 4bdbe627f56..57deeb87c03 100644 --- a/test/e2e/resize_nodes.go +++ b/test/e2e/resize_nodes.go @@ -77,7 +77,7 @@ func waitForNodeInstanceGroupSize(size int) error { continue } if currentSize != size { - Logf("Waiting for node istance group size %d, current size %d", size, currentSize) + Logf("Waiting for node instance group size %d, current size %d", size, currentSize) continue } Logf("Node instance group has reached the desired size %d", size) @@ -224,7 +224,7 @@ func waitForPodsCreatedRunningResponding(c *client.Client, ns, name string, repl } var _ = Describe("Nodes", func() { - supportedProviders := []string{"gce"} + supportedProviders := []string{"gce", "gke"} var testName string var c *client.Client var ns string From 8eb382075c052ef52f99c5032e801c341a06b69e Mon Sep 17 00:00:00 2001 From: Jeff Lowdermilk Date: Thu, 11 Jun 2015 13:46:10 -0700 Subject: [PATCH 25/27] Fix #9506 (cherry picked from commit 2af7dd5b57da539f8e457e208dd05a9e4b7d79b6) --- cluster/gke/util.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cluster/gke/util.sh b/cluster/gke/util.sh index 095ce5538c7..6d4ea3a477d 100755 --- a/cluster/gke/util.sh +++ b/cluster/gke/util.sh @@ -259,7 +259,7 @@ function detect-minion-names { # Vars set: # NODE_INSTANCE_GROUP function detect-node-instance-group { - NODE_INSTANCE_GROUP=$("${GCLOUD}" alpha container clusters describe \ + NODE_INSTANCE_GROUP=$("${GCLOUD}" "${CMD_GROUP}" container clusters describe \ --project="${PROJECT}" --zone="${ZONE}" "${CLUSTER_NAME}" \ | grep instanceGroupManagers | cut -d '/' -f 11) } From bb63f031d4146c17113b059886aea66b09f6daf5 Mon Sep 17 00:00:00 2001 From: Brendan Burns Date: Thu, 18 Jun 2015 23:06:41 -0700 Subject: [PATCH 26/27] Kubernetes version v0.19.1 --- pkg/version/base.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/version/base.go b/pkg/version/base.go index 2f231a65c1f..032d95fef67 100644 --- a/pkg/version/base.go +++ b/pkg/version/base.go @@ -36,8 +36,8 @@ package version var ( // TODO: Deprecate gitMajor and gitMinor, use only gitVersion instead. gitMajor string = "0" // major version, always numeric - gitMinor string = "19.0+" // minor version, numeric possibly followed by "+" - gitVersion string = "v0.19.0-dev" // version from git, output of $(git describe) + gitMinor string = "19.1" // minor version, numeric possibly followed by "+" + gitVersion string = "v0.19.1" // version from git, output of $(git describe) gitCommit string = "" // sha1 from git, output of $(git rev-parse HEAD) gitTreeState string = "not a git tree" // state of git tree, either "clean" or "dirty" ) From f29107486608b1f5fa697a7bded479ca5537e921 Mon Sep 17 00:00:00 2001 From: Brendan Burns Date: Thu, 18 Jun 2015 23:07:53 -0700 Subject: [PATCH 27/27] Kubernetes version v0.19.1-dev --- pkg/version/base.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/version/base.go b/pkg/version/base.go index 032d95fef67..c1fe45cd63d 100644 --- a/pkg/version/base.go +++ b/pkg/version/base.go @@ -36,8 +36,8 @@ package version var ( // TODO: Deprecate gitMajor and gitMinor, use only gitVersion instead. gitMajor string = "0" // major version, always numeric - gitMinor string = "19.1" // minor version, numeric possibly followed by "+" - gitVersion string = "v0.19.1" // version from git, output of $(git describe) + gitMinor string = "19.1+" // minor version, numeric possibly followed by "+" + gitVersion string = "v0.19.1-dev" // version from git, output of $(git describe) gitCommit string = "" // sha1 from git, output of $(git rev-parse HEAD) gitTreeState string = "not a git tree" // state of git tree, either "clean" or "dirty" )