diff --git a/build/mark-new-version.sh b/build/mark-new-version.sh index c45fdeeecc8..9349dcc87f6 100755 --- a/build/mark-new-version.sh +++ b/build/mark-new-version.sh @@ -91,19 +91,21 @@ fi VERSION_FILE="${KUBE_ROOT}/pkg/version/base.go" -RELEASE_DIR=release-${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH} -echo "+++ Cloning documentation and examples into ${RELEASE_DIR}/..." -mkdir ${RELEASE_DIR} -cp -r docs ${RELEASE_DIR}/docs -cp -r examples ${RELEASE_DIR}/examples +if [[ "${VERSION_PATCH}" == "0" ]]; then + RELEASE_DIR=release-${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH} + echo "+++ Cloning documentation and examples into ${RELEASE_DIR}/..." + mkdir ${RELEASE_DIR} + cp -r docs ${RELEASE_DIR}/docs + cp -r examples ${RELEASE_DIR}/examples -# Update the docs to match this version. -perl -pi -e "s/HEAD/${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}/" ${RELEASE_DIR}/docs/README.md -perl -pi -e "s/HEAD/${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}/" ${RELEASE_DIR}/examples/README.md + # Update the docs to match this version. 
+ perl -pi -e "s/HEAD/${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}/" ${RELEASE_DIR}/docs/README.md + perl -pi -e "s/HEAD/${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}/" ${RELEASE_DIR}/examples/README.md -${KUBE_ROOT}/hack/run-gendocs.sh -git add ${RELEASE_DIR} -git commit -m "Cloning docs for ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}" + ${KUBE_ROOT}/hack/run-gendocs.sh + git add ${RELEASE_DIR} + git commit -m "Cloning docs for ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}" +fi GIT_MINOR="${VERSION_MINOR}.${VERSION_PATCH}" echo "+++ Updating to ${NEW_VERSION}" diff --git a/cluster/addons/cluster-monitoring/google/heapster-controller.yaml b/cluster/addons/cluster-monitoring/google/heapster-controller.yaml index ced723bcda2..9dd813302d8 100644 --- a/cluster/addons/cluster-monitoring/google/heapster-controller.yaml +++ b/cluster/addons/cluster-monitoring/google/heapster-controller.yaml @@ -1,30 +1,30 @@ -apiVersion: v1beta3 +apiVersion: v1 kind: ReplicationController metadata: - name: monitoring-heapster-v1 + name: monitoring-heapster-v3 namespace: default labels: k8s-app: heapster - version: v1 + version: v3 kubernetes.io/cluster-service: "true" spec: replicas: 1 selector: k8s-app: heapster - version: v1 + version: v3 template: metadata: labels: k8s-app: heapster - version: v1 + version: v3 kubernetes.io/cluster-service: "true" spec: containers: - - image: gcr.io/google_containers/heapster:v0.13.0 + - image: gcr.io/google_containers/heapster:v0.14.1 name: heapster command: - /heapster - - --source=kubernetes:https://kubernetes + - --source=kubernetes:'' - --sink=gcm - --sink=gcl - --poll_duration=2m diff --git a/cluster/addons/cluster-monitoring/google/heapster-service.yaml b/cluster/addons/cluster-monitoring/google/heapster-service.yaml new file mode 100644 index 00000000000..8ed8ff9a65f --- /dev/null +++ b/cluster/addons/cluster-monitoring/google/heapster-service.yaml @@ -0,0 +1,13 @@ +kind: Service +apiVersion: v1 +metadata: + name: 
monitoring-heapster + labels: + kubernetes.io/cluster-service: "true" + name: monitoring-heapster +spec: + ports: + - port: 80 + targetPort: 8082 + selector: + k8s-app: heapster diff --git a/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml b/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml new file mode 100644 index 00000000000..40617e65f49 --- /dev/null +++ b/cluster/addons/cluster-monitoring/googleinfluxdb/heapster-controller-combined.yaml @@ -0,0 +1,47 @@ +apiVersion: v1 +kind: ReplicationController +metadata: + name: monitoring-heapster-v3 + namespace: default + labels: + k8s-app: heapster + version: v3 + kubernetes.io/cluster-service: "true" +spec: + replicas: 1 + selector: + k8s-app: heapster + version: v3 + template: + metadata: + labels: + k8s-app: heapster + version: v3 + kubernetes.io/cluster-service: "true" + spec: + containers: + - image: gcr.io/google_containers/heapster:v0.14.1 + name: heapster + command: + - /heapster + - --source=kubernetes:'' + - --sink=gcl + - --sink=influxdb:http://monitoring-influxdb:8086 + - --poll_duration=2m + - --stats_resolution=1m + volumeMounts: + - name: ssl-certs + mountPath: /etc/ssl/certs + readOnly: true + - name: monitoring-token + mountPath: /etc/kubernetes/kubeconfig + readOnly: true + + volumes: + - name: ssl-certs + hostPath: + path: /etc/ssl/certs + - name: monitoring-token + secret: + secretName: token-system-monitoring + diff --git a/cluster/addons/cluster-monitoring/influxdb/grafana-service.yaml b/cluster/addons/cluster-monitoring/influxdb/grafana-service.yaml index 2c8b05e46a0..2df94a40751 100644 --- a/cluster/addons/cluster-monitoring/influxdb/grafana-service.yaml +++ b/cluster/addons/cluster-monitoring/influxdb/grafana-service.yaml @@ -1,4 +1,4 @@ -apiVersion: v1beta3 +apiVersion: v1 kind: Service metadata: name: monitoring-grafana diff --git a/cluster/addons/cluster-monitoring/influxdb/heapster-controller.yaml 
b/cluster/addons/cluster-monitoring/influxdb/heapster-controller.yaml index f6d4e6ee58d..39044e8ce46 100644 --- a/cluster/addons/cluster-monitoring/influxdb/heapster-controller.yaml +++ b/cluster/addons/cluster-monitoring/influxdb/heapster-controller.yaml @@ -1,30 +1,30 @@ -apiVersion: v1beta3 +apiVersion: v1 kind: ReplicationController metadata: - name: monitoring-heapster-v1 + name: monitoring-heapster-v3 namespace: default labels: k8s-app: heapster - version: v1 + version: v3 kubernetes.io/cluster-service: "true" spec: replicas: 1 selector: k8s-app: heapster - version: v1 + version: v3 template: metadata: labels: k8s-app: heapster - version: v1 + version: v3 kubernetes.io/cluster-service: "true" spec: containers: - - image: gcr.io/google_containers/heapster:v0.13.0 + - image: gcr.io/google_containers/heapster:v0.14.1 name: heapster command: - /heapster - - --source=kubernetes:https://kubernetes + - --source=kubernetes:'' - --sink=influxdb:http://monitoring-influxdb:8086 volumeMounts: - name: ssl-certs diff --git a/cluster/addons/cluster-monitoring/influxdb/heapster-service.yaml b/cluster/addons/cluster-monitoring/influxdb/heapster-service.yaml new file mode 100644 index 00000000000..8ed8ff9a65f --- /dev/null +++ b/cluster/addons/cluster-monitoring/influxdb/heapster-service.yaml @@ -0,0 +1,13 @@ +kind: Service +apiVersion: v1 +metadata: + name: monitoring-heapster + labels: + kubernetes.io/cluster-service: "true" + name: monitoring-heapster +spec: + ports: + - port: 80 + targetPort: 8082 + selector: + k8s-app: heapster diff --git a/cluster/addons/cluster-monitoring/influxdb/influxdb-grafana-controller.yaml b/cluster/addons/cluster-monitoring/influxdb/influxdb-grafana-controller.yaml index 6e5ee253dcf..94d3dc06b2b 100644 --- a/cluster/addons/cluster-monitoring/influxdb/influxdb-grafana-controller.yaml +++ b/cluster/addons/cluster-monitoring/influxdb/influxdb-grafana-controller.yaml @@ -1,4 +1,4 @@ -apiVersion: v1beta3 +apiVersion: v1 kind: ReplicationController 
metadata: name: monitoring-influx-grafana-v1 diff --git a/cluster/addons/cluster-monitoring/influxdb/influxdb-service.yaml b/cluster/addons/cluster-monitoring/influxdb/influxdb-service.yaml index 344c0871516..b115086a2e4 100644 --- a/cluster/addons/cluster-monitoring/influxdb/influxdb-service.yaml +++ b/cluster/addons/cluster-monitoring/influxdb/influxdb-service.yaml @@ -1,4 +1,4 @@ -apiVersion: v1beta3 +apiVersion: v1 kind: Service metadata: name: monitoring-influxdb diff --git a/cluster/addons/cluster-monitoring/standalone/heapster-controller.yaml b/cluster/addons/cluster-monitoring/standalone/heapster-controller.yaml new file mode 100644 index 00000000000..4589a8c05fb --- /dev/null +++ b/cluster/addons/cluster-monitoring/standalone/heapster-controller.yaml @@ -0,0 +1,43 @@ +apiVersion: v1 +kind: ReplicationController +metadata: + name: monitoring-heapster-v3 + namespace: default + labels: + k8s-app: heapster + version: v3 + kubernetes.io/cluster-service: "true" +spec: + replicas: 1 + selector: + k8s-app: heapster + version: v3 + template: + metadata: + labels: + k8s-app: heapster + version: v3 + kubernetes.io/cluster-service: "true" + spec: + containers: + - image: gcr.io/google_containers/heapster:v0.14.1 + name: heapster + command: + - /heapster + - --source=kubernetes:'' + volumeMounts: + - name: ssl-certs + mountPath: /etc/ssl/certs + readOnly: true + - name: monitoring-token + mountPath: /etc/kubernetes/kubeconfig + readOnly: true + + volumes: + - name: ssl-certs + hostPath: + path: /etc/ssl/certs + - name: monitoring-token + secret: + secretName: token-system-monitoring + diff --git a/cluster/addons/cluster-monitoring/standalone/heapster-service.yaml b/cluster/addons/cluster-monitoring/standalone/heapster-service.yaml new file mode 100644 index 00000000000..8ed8ff9a65f --- /dev/null +++ b/cluster/addons/cluster-monitoring/standalone/heapster-service.yaml @@ -0,0 +1,13 @@ +kind: Service +apiVersion: v1 +metadata: + name: monitoring-heapster + labels: + 
kubernetes.io/cluster-service: "true" + name: monitoring-heapster +spec: + ports: + - port: 80 + targetPort: 8082 + selector: + k8s-app: heapster diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh index 949aa3f5fb7..2339c0ea199 100755 --- a/cluster/gce/config-default.sh +++ b/cluster/gce/config-default.sh @@ -54,10 +54,12 @@ ENABLE_DOCKER_REGISTRY_CACHE=true ENABLE_NODE_MONITORING="${KUBE_ENABLE_NODE_MONITORING:-true}" # Optional: Cluster monitoring to setup as part of the cluster bring up: -# none - No cluster monitoring setup -# influxdb - Heapster, InfluxDB, and Grafana -# google - Heapster, Google Cloud Monitoring, and Google Cloud Logging -ENABLE_CLUSTER_MONITORING="${KUBE_ENABLE_CLUSTER_MONITORING:-google}" +# none - No cluster monitoring setup +# influxdb - Heapster, InfluxDB, and Grafana +# google - Heapster, Google Cloud Monitoring, and Google Cloud Logging +# googleinfluxdb - Enable influxdb and google (except GCM) +# standalone - Heapster only. Metrics available via Heapster REST API. +ENABLE_CLUSTER_MONITORING="${KUBE_ENABLE_CLUSTER_MONITORING:-googleinfluxdb}" # Optional: Enable node logging. 
ENABLE_NODE_LOGGING="${KUBE_ENABLE_NODE_LOGGING:-true}" diff --git a/cluster/gce/configure-vm.sh b/cluster/gce/configure-vm.sh index 9ac82ace2df..c73dfb8921d 100644 --- a/cluster/gce/configure-vm.sh +++ b/cluster/gce/configure-vm.sh @@ -501,7 +501,7 @@ EOF cat <>/etc/salt/minion.d/grains.conf cloud_config: /etc/gce.conf advertise_address: '${EXTERNAL_IP}' - proxy_ssh_user: '${INSTANCE_PREFIX}' + proxy_ssh_user: '${PROXY_SSH_USER}' EOF fi } diff --git a/cluster/gke/config-common.sh b/cluster/gke/config-common.sh index 6dcc9deb7b6..e4bf73a9422 100644 --- a/cluster/gke/config-common.sh +++ b/cluster/gke/config-common.sh @@ -28,6 +28,7 @@ NETWORK="${NETWORK:-default}" NETWORK_RANGE="${NETWORK_RANGE:-10.240.0.0/16}" FIREWALL_SSH="${FIREWALL_SSH:-${NETWORK}-allow-ssh}" GCLOUD="${GCLOUD:-gcloud}" +CMD_GROUP="${CMD_GROUP:-alpha}" GCLOUD_CONFIG_DIR="${GCLOUD_CONFIG_DIR:-${HOME}/.config/gcloud/kubernetes}" ENABLE_CLUSTER_DNS=false diff --git a/cluster/gke/config-default.sh b/cluster/gke/config-default.sh index 27a6249d442..4692882cf6d 100644 --- a/cluster/gke/config-default.sh +++ b/cluster/gke/config-default.sh @@ -34,4 +34,5 @@ ELASTICSEARCH_LOGGING_REPLICAS=1 # none - No cluster monitoring setup # influxdb - Heapster, InfluxDB, and Grafana # google - Heapster, Google Cloud Monitoring, and Google Cloud Logging -ENABLE_CLUSTER_MONITORING="${KUBE_ENABLE_CLUSTER_MONITORING:-none}" +# standalone - Heapster only. Metrics available via Heapster REST API. 
+ENABLE_CLUSTER_MONITORING="${KUBE_ENABLE_CLUSTER_MONITORING:-standalone}" diff --git a/cluster/gke/util.sh b/cluster/gke/util.sh index be01f35dda9..6d4ea3a477d 100755 --- a/cluster/gke/util.sh +++ b/cluster/gke/util.sh @@ -99,7 +99,8 @@ function verify-prereqs() { sudo_prefix="sudo" fi ${sudo_prefix} gcloud ${gcloud_prompt:-} components update preview || true - ${sudo_prefix} gcloud ${gcloud_prompt:-} components update alpha|| true + ${sudo_prefix} gcloud ${gcloud_prompt:-} components update "${CMD_GROUP}"|| true + ${sudo_prefix} gcloud ${gcloud_prompt:-} components update kubectl|| true ${sudo_prefix} gcloud ${gcloud_prompt:-} components update || true } @@ -116,18 +117,18 @@ function kube-up() { detect-project >&2 # Make the specified network if we need to. - if ! gcloud compute networks --project "${PROJECT}" describe "${NETWORK}" &>/dev/null; then + if ! "${GCLOUD}" compute networks --project "${PROJECT}" describe "${NETWORK}" &>/dev/null; then echo "Creating new network: ${NETWORK}" >&2 - gcloud compute networks create "${NETWORK}" --project="${PROJECT}" --range "${NETWORK_RANGE}" + "${GCLOUD}" compute networks create "${NETWORK}" --project="${PROJECT}" --range "${NETWORK_RANGE}" else echo "Using network: ${NETWORK}" >&2 fi # Allow SSH on all nodes in the network. This doesn't actually check whether # such a rule exists, only whether we've created this exact rule. - if ! gcloud compute firewall-rules --project "${PROJECT}" describe "${FIREWALL_SSH}" &>/dev/null; then + if ! 
"${GCLOUD}" compute firewall-rules --project "${PROJECT}" describe "${FIREWALL_SSH}" &>/dev/null; then echo "Creating new firewall for SSH: ${FIREWALL_SSH}" >&2 - gcloud compute firewall-rules create "${FIREWALL_SSH}" \ + "${GCLOUD}" compute firewall-rules create "${FIREWALL_SSH}" \ --allow="tcp:22" \ --network="${NETWORK}" \ --project="${PROJECT}" \ @@ -136,13 +137,20 @@ function kube-up() { echo "Using firewall-rule: ${FIREWALL_SSH}" >&2 fi + local create_args=( + "--zone=${ZONE}" + "--project=${PROJECT}" + "--num-nodes=${NUM_MINIONS}" + "--network=${NETWORK}" + ) + if [[ ! -z "${DOGFOOD_GCLOUD:-}" ]]; then + create_args+=("--cluster-version=${CLUSTER_API_VERSION:-}") + else + create_args+=("--cluster-api-version=${CLUSTER_API_VERSION:-}") + fi + # Bring up the cluster. - "${GCLOUD}" alpha container clusters create "${CLUSTER_NAME}" \ - --zone="${ZONE}" \ - --project="${PROJECT}" \ - --cluster-api-version="${CLUSTER_API_VERSION:-}" \ - --num-nodes="${NUM_MINIONS}" \ - --network="${NETWORK}" + "${GCLOUD}" "${CMD_GROUP}" container clusters create "${CLUSTER_NAME}" "${create_args[@]}" } # Execute prior to running tests to initialize required structure. This is @@ -152,31 +160,32 @@ function kube-up() { # Assumed vars: # CLUSTER_NAME # GCLOUD +# ZONE # Vars set: # MINION_TAG function test-setup() { echo "... in test-setup()" >&2 # Detect the project into $PROJECT if it isn't set detect-project >&2 + detect-minions >&2 # At this point, CLUSTER_NAME should have been used, so its value is final. - MINION_TAG="k8s-${CLUSTER_NAME}-node" + MINION_TAG=$($GCLOUD compute instances describe ${MINION_NAMES[0]} --project="${PROJECT}" --zone="${ZONE}" | grep -o "gke-${CLUSTER_NAME}-.\{8\}-node" | head -1) + OLD_MINION_TAG="k8s-${CLUSTER_NAME}-node" # Open up port 80 & 8080 so common containers on minions can be reached. - # TODO(mbforbes): Is adding ${USER} necessary, and sufficient, to avoid - # collisions here? 
"${GCLOUD}" compute firewall-rules create \ - "${MINION_TAG}-${USER}-http-alt" \ + "${MINION_TAG}-http-alt" \ --allow tcp:80,tcp:8080 \ --project "${PROJECT}" \ - --target-tags "${MINION_TAG}" \ + --target-tags "${MINION_TAG},${OLD_MINION_TAG}" \ --network="${NETWORK}" "${GCLOUD}" compute firewall-rules create \ - "${MINION_TAG}-${USER}-nodeports" \ + "${MINION_TAG}-nodeports" \ --allow tcp:30000-32767,udp:30000-32767 \ --project "${PROJECT}" \ - --target-tags "${MINION_TAG}" \ + --target-tags "${MINION_TAG},${OLD_MINION_TAG}" \ --network="${NETWORK}" } @@ -191,10 +200,10 @@ function test-setup() { function get-password() { echo "... in get-password()" >&2 detect-project >&2 - KUBE_USER=$("${GCLOUD}" alpha container clusters describe \ + KUBE_USER=$("${GCLOUD}" "${CMD_GROUP}" container clusters describe \ --project="${PROJECT}" --zone="${ZONE}" "${CLUSTER_NAME}" \ | grep user | cut -f 4 -d ' ') - KUBE_PASSWORD=$("${GCLOUD}" alpha container clusters describe \ + KUBE_PASSWORD=$("${GCLOUD}" "${CMD_GROUP}" container clusters describe \ --project="${PROJECT}" --zone="${ZONE}" "${CLUSTER_NAME}" \ | grep password | cut -f 4 -d ' ') } @@ -211,7 +220,7 @@ function detect-master() { echo "... 
in detect-master()" >&2 detect-project >&2 KUBE_MASTER="k8s-${CLUSTER_NAME}-master" - KUBE_MASTER_IP=$("${GCLOUD}" alpha container clusters describe \ + KUBE_MASTER_IP=$("${GCLOUD}" "${CMD_GROUP}" container clusters describe \ --project="${PROJECT}" --zone="${ZONE}" "${CLUSTER_NAME}" \ | grep endpoint | cut -f 2 -d ' ') } @@ -233,14 +242,28 @@ function detect-minions() { # MINION_NAMES function detect-minion-names { detect-project - GROUP_NAME=($(gcloud preview --project "${PROJECT}" instance-groups \ - --zone "${ZONE}" list | grep -o "k8s-${CLUSTER_NAME}-.\{8\}-group")) + detect-node-instance-group MINION_NAMES=($(gcloud preview --project "${PROJECT}" instance-groups \ - --zone "${ZONE}" instances --group "${GROUP_NAME}" list \ + --zone "${ZONE}" instances --group "${NODE_INSTANCE_GROUP}" list \ | cut -d'/' -f11)) echo "MINION_NAMES=${MINION_NAMES[*]}" } +# Detect instance group name generated by gke +# +# Assumed vars: +# GCLOUD +# PROJECT +# ZONE +# CLUSTER_NAME +# Vars set: +# NODE_INSTANCE_GROUP +function detect-node-instance-group { + NODE_INSTANCE_GROUP=$("${GCLOUD}" "${CMD_GROUP}" container clusters describe \ + --project="${PROJECT}" --zone="${ZONE}" "${CLUSTER_NAME}" \ + | grep instanceGroupManagers | cut -d '/' -f 11) +} + # SSH to a node by name ($1) and run a command ($2). # # Assumed vars: @@ -283,18 +306,20 @@ function restart-apiserver() { # CLUSTER_NAME # GCLOUD # KUBE_ROOT +# ZONE function test-teardown() { echo "... in test-teardown()" >&2 detect-project >&2 + detect-minions >&2 # At this point, CLUSTER_NAME should have been used, so its value is final. - MINION_TAG="k8s-${CLUSTER_NAME}-node" + MINION_TAG=$($GCLOUD compute instances describe ${MINION_NAMES[0]} --project="${PROJECT}" --zone="${ZONE}" | grep -o "gke-${CLUSTER_NAME}-.\{8\}-node" | head -1) # First, remove anything we did with test-setup (currently, the firewall). # NOTE: Keep in sync with names above in test-setup. 
- "${GCLOUD}" compute firewall-rules delete "${MINION_TAG}-${USER}-http-alt" \ + "${GCLOUD}" compute firewall-rules delete "${MINION_TAG}-http-alt" \ --project="${PROJECT}" || true - "${GCLOUD}" compute firewall-rules delete "${MINION_TAG}-${USER}-nodeports" \ + "${GCLOUD}" compute firewall-rules delete "${MINION_TAG}-nodeports" \ --project="${PROJECT}" || true # Then actually turn down the cluster. @@ -310,6 +335,6 @@ function test-teardown() { function kube-down() { echo "... in kube-down()" >&2 detect-project >&2 - "${GCLOUD}" alpha container clusters delete --project="${PROJECT}" \ + "${GCLOUD}" "${CMD_GROUP}" container clusters delete --project="${PROJECT}" \ --zone="${ZONE}" "${CLUSTER_NAME}" --quiet } diff --git a/cluster/saltbase/salt/kube-addons/init.sls b/cluster/saltbase/salt/kube-addons/init.sls index 878b7244456..35c0a348f61 100644 --- a/cluster/saltbase/salt/kube-addons/init.sls +++ b/cluster/saltbase/salt/kube-addons/init.sls @@ -33,6 +33,29 @@ addon-dir-create: - file_mode: 644 {% endif %} +{% if pillar.get('enable_cluster_monitoring', '').lower() == 'standalone' %} +/etc/kubernetes/addons/cluster-monitoring/standalone: + file.recurse: + - source: salt://kube-addons/cluster-monitoring/standalone + - include_pat: E@(^.+\.yaml$|^.+\.json$) + - user: root + - group: root + - dir_mode: 755 + - file_mode: 644 +{% endif %} + +{% if pillar.get('enable_cluster_monitoring', '').lower() == 'googleinfluxdb' %} +/etc/kubernetes/addons/cluster-monitoring/googleinfluxdb: + file.recurse: + - source: salt://kube-addons/cluster-monitoring + - include_pat: E@(^.+\.yaml$|^.+\.json$) + - exclude_pat: E@(^.+heapster-controller\.yaml$|^.+heapster-controller\.json$) + - user: root + - group: root + - dir_mode: 755 + - file_mode: 644 +{% endif %} + {% if pillar.get('enable_cluster_dns', '').lower() == 'true' %} /etc/kubernetes/addons/dns/skydns-svc.yaml: file.managed: diff --git a/hack/ginkgo-e2e.sh b/hack/ginkgo-e2e.sh index a9821cc152c..4183597a62d 100755 --- 
a/hack/ginkgo-e2e.sh +++ b/hack/ginkgo-e2e.sh @@ -83,6 +83,10 @@ else NODE_INSTANCE_GROUP="" fi +if [[ "${KUBERNETES_PROVIDER}" == "gke" ]]; then + detect-node-instance-group +fi + ginkgo_args=() if [[ ${GINKGO_PARALLEL} =~ ^[yY]$ ]]; then ginkgo_args+=("-p") diff --git a/hack/jenkins/e2e.sh b/hack/jenkins/e2e.sh index 9d628864340..6ee4a0f0acf 100755 --- a/hack/jenkins/e2e.sh +++ b/hack/jenkins/e2e.sh @@ -125,7 +125,7 @@ if [[ "${E2E_UP,,}" == "true" ]]; then # code=400,message=cluster.cluster_api_versionmustbeoneof: # 0.15.0,0.16.0. # The command should error, so we throw an || true on there. - msg=$(gcloud alpha container clusters create this-wont-work \ + msg=$(gcloud ${CMD_GROUP:-alpha} container clusters create this-wont-work \ --zone=us-central1-f --cluster-api-version=0.0.0 2>&1 \ | tr -d '[[:space:]]') || true # Strip out everything before the final colon, which gives us just diff --git a/pkg/client/helper.go b/pkg/client/helper.go index 8834526a834..5021e595957 100644 --- a/pkg/client/helper.go +++ b/pkg/client/helper.go @@ -30,8 +30,11 @@ import ( "time" "github.com/GoogleCloudPlatform/kubernetes/pkg/api/latest" + "github.com/GoogleCloudPlatform/kubernetes/pkg/api/registered" "github.com/GoogleCloudPlatform/kubernetes/pkg/runtime" + "github.com/GoogleCloudPlatform/kubernetes/pkg/util" "github.com/GoogleCloudPlatform/kubernetes/pkg/version" + "github.com/golang/glog" ) // Config holds the common attributes that can be passed to a Kubernetes client on @@ -143,6 +146,9 @@ func New(c *Config) (*Client, error) { return &Client{client}, nil } +// MatchesServerVersion queries the server to compares the build version +// (git hash) of the client with the server's build version. It returns an error +// if it failed to contact the server or if the versions are not an exact match. 
func MatchesServerVersion(c *Config) error { client, err := New(c) if err != nil { @@ -161,6 +167,66 @@ func MatchesServerVersion(c *Config) error { return nil } +// NegotiateVersion queries the server's supported api versions to find +// a version that both client and server support. +// - If no version is provided, try the client's registered versions in order of +// preference. +// - If version is provided, but not default config (explicitly requested via +// commandline flag), and is unsupported by the server, print a warning to +// stderr and try client's registered versions in order of preference. +// - If version is config default, and the server does not support it, +// return an error. +func NegotiateVersion(c *Config, version string) (string, error) { + client, err := New(c) + if err != nil { + return "", err + } + clientVersions := util.StringSet{} + for _, v := range registered.RegisteredVersions { + clientVersions.Insert(v) + } + apiVersions, err := client.ServerAPIVersions() + if err != nil { + return "", fmt.Errorf("couldn't read version from server: %v\n", err) + } + serverVersions := util.StringSet{} + for _, v := range apiVersions.Versions { + serverVersions.Insert(v) + } + // If no version requested, use config version (may also be empty). + if len(version) == 0 { + version = c.Version + } + // If version explicitly requested verify that both client and server support it. + // If server does not support warn, but try to negotiate a lower version. + if len(version) != 0 { + if !clientVersions.Has(version) { + return "", fmt.Errorf("Client does not support API version '%s'. Client supported API versions: %v", version, clientVersions) + + } + if serverVersions.Has(version) { + return version, nil + } + // If we are using an explicit config version the server does not support, fail. 
+ if version == c.Version { + return "", fmt.Errorf("Server does not support API version '%s'.", version) + } + } + + for _, clientVersion := range registered.RegisteredVersions { + if serverVersions.Has(clientVersion) { + // Version was not explicitly requested in command config (--api-version). + // Ok to fall back to a supported version with a warning. + if len(version) != 0 { + glog.Warningf("Server does not support API version '%s'. Falling back to '%s'.", version, clientVersion) + } + return clientVersion, nil + } + } + return "", fmt.Errorf("Failed to negotiate an api version. Server supports: %v. Client supports: %v.", + serverVersions, registered.RegisteredVersions) +} + // NewOrDie creates a Kubernetes client and panics if the provided API version is not recognized. func NewOrDie(c *Config) *Client { client, err := New(c) diff --git a/pkg/cloudprovider/gce/gce.go b/pkg/cloudprovider/gce/gce.go index 4fdaa889c7a..fd2bb0c7a45 100644 --- a/pkg/cloudprovider/gce/gce.go +++ b/pkg/cloudprovider/gce/gce.go @@ -22,7 +22,6 @@ import ( "io/ioutil" "net" "net/http" - "os" "path" "strconv" "strings" @@ -32,6 +31,7 @@ import ( "github.com/GoogleCloudPlatform/kubernetes/pkg/api/resource" "github.com/GoogleCloudPlatform/kubernetes/pkg/cloudprovider" "github.com/GoogleCloudPlatform/kubernetes/pkg/util" + "github.com/GoogleCloudPlatform/kubernetes/pkg/util/wait" "code.google.com/p/gcfg" compute "code.google.com/p/google-api-go-client/compute/v1" @@ -483,37 +483,41 @@ func (gce *GCECloud) getInstanceByName(name string) (*compute.Instance, error) { } func (gce *GCECloud) AddSSHKeyToAllInstances(user string, keyData []byte) error { - project, err := gce.service.Projects.Get(gce.projectID).Do() - if err != nil { - return err - } - hostname, err := os.Hostname() - if err != nil { - return err - } - keyString := fmt.Sprintf("%s:%s %s@%s", user, strings.TrimSpace(string(keyData)), user, hostname) - found := false - for _, item := range project.CommonInstanceMetadata.Items { - if 
item.Key == "sshKeys" { - item.Value = addKey(item.Value, keyString) - found = true - break + return wait.Poll(2*time.Second, 30*time.Second, func() (bool, error) { + project, err := gce.service.Projects.Get(gce.projectID).Do() + if err != nil { + glog.Errorf("Could not get project: %v", err) + return false, nil } - } - if !found { - // This is super unlikely, so log. - glog.Infof("Failed to find sshKeys metadata, creating a new item") - project.CommonInstanceMetadata.Items = append(project.CommonInstanceMetadata.Items, - &compute.MetadataItems{ - Key: "sshKeys", - Value: keyString, - }) - } - op, err := gce.service.Projects.SetCommonInstanceMetadata(gce.projectID, project.CommonInstanceMetadata).Do() - if err != nil { - return err - } - return gce.waitForGlobalOp(op) + keyString := fmt.Sprintf("%s:%s %s@%s", user, strings.TrimSpace(string(keyData)), user, user) + found := false + for _, item := range project.CommonInstanceMetadata.Items { + if item.Key == "sshKeys" { + item.Value = addKey(item.Value, keyString) + found = true + break + } + } + if !found { + // This is super unlikely, so log. 
+ glog.Infof("Failed to find sshKeys metadata, creating a new item") + project.CommonInstanceMetadata.Items = append(project.CommonInstanceMetadata.Items, + &compute.MetadataItems{ + Key: "sshKeys", + Value: keyString, + }) + } + op, err := gce.service.Projects.SetCommonInstanceMetadata(gce.projectID, project.CommonInstanceMetadata).Do() + if err != nil { + glog.Errorf("Could not Set Metadata: %v", err) + return false, nil + } + if err := gce.waitForGlobalOp(op); err != nil { + glog.Errorf("Could not Set Metadata: %v", err) + return false, nil + } + return true, nil + }) } func addKey(metadataBefore, keyString string) string { diff --git a/pkg/kubectl/cmd/cmd_test.go b/pkg/kubectl/cmd/cmd_test.go index 063fd00aeed..5815b250176 100644 --- a/pkg/kubectl/cmd/cmd_test.go +++ b/pkg/kubectl/cmd/cmd_test.go @@ -221,23 +221,25 @@ func stringBody(body string) io.ReadCloser { return ioutil.NopCloser(bytes.NewReader([]byte(body))) } +// TODO(jlowdermilk): refactor the Factory so we can test client versions properly, +// with different client/server version skew scenarios. 
// Verify that resource.RESTClients constructed from a factory respect mapping.APIVersion -func TestClientVersions(t *testing.T) { - f := cmdutil.NewFactory(nil) - - version := testapi.Version() - mapping := &meta.RESTMapping{ - APIVersion: version, - } - c, err := f.RESTClient(mapping) - if err != nil { - t.Errorf("unexpected error: %v", err) - } - client := c.(*client.RESTClient) - if client.APIVersion() != version { - t.Errorf("unexpected Client APIVersion: %s %v", client.APIVersion, client) - } -} +//func TestClientVersions(t *testing.T) { +// f := cmdutil.NewFactory(nil) +// +// version := testapi.Version() +// mapping := &meta.RESTMapping{ +// APIVersion: version, +// } +// c, err := f.RESTClient(mapping) +// if err != nil { +// t.Errorf("unexpected error: %v", err) +// } +// client := c.(*client.RESTClient) +// if client.APIVersion() != version { +// t.Errorf("unexpected Client APIVersion: %s %v", client.APIVersion, client) +// } +//} func ExamplePrintReplicationController() { f, tf, codec := NewAPIFactory() diff --git a/pkg/kubectl/cmd/util/clientcache.go b/pkg/kubectl/cmd/util/clientcache.go index e7e64992c00..6d86d5d89fd 100644 --- a/pkg/kubectl/cmd/util/clientcache.go +++ b/pkg/kubectl/cmd/util/clientcache.go @@ -21,11 +21,20 @@ import ( "github.com/GoogleCloudPlatform/kubernetes/pkg/client/clientcmd" ) +func NewClientCache(loader clientcmd.ClientConfig) *clientCache { + return &clientCache{ + clients: make(map[string]*client.Client), + configs: make(map[string]*client.Config), + loader: loader, + } +} + // clientCache caches previously loaded clients for reuse, and ensures MatchServerVersion // is invoked only once type clientCache struct { loader clientcmd.ClientConfig clients map[string]*client.Client + configs map[string]*client.Config defaultConfig *client.Config matchVersion bool } @@ -44,12 +53,18 @@ func (c *clientCache) ClientConfigForVersion(version string) (*client.Config, er } } } + if config, ok := c.configs[version]; ok { + return config, 
nil + } // TODO: have a better config copy method config := *c.defaultConfig - if len(version) != 0 { - config.Version = version + negotiatedVersion, err := client.NegotiateVersion(&config, version) + if err != nil { + return nil, err } + config.Version = negotiatedVersion client.SetKubernetesDefaults(&config) + c.configs[version] = &config return &config, nil } @@ -57,15 +72,13 @@ func (c *clientCache) ClientConfigForVersion(version string) (*client.Config, er // ClientForVersion initializes or reuses a client for the specified version, or returns an // error if that is not possible func (c *clientCache) ClientForVersion(version string) (*client.Client, error) { + if client, ok := c.clients[version]; ok { + return client, nil + } config, err := c.ClientConfigForVersion(version) if err != nil { return nil, err } - - if client, ok := c.clients[config.Version]; ok { - return client, nil - } - client, err := client.New(config) if err != nil { return nil, err diff --git a/pkg/kubectl/cmd/util/factory.go b/pkg/kubectl/cmd/util/factory.go index 1a3486047e6..ae799cfd1aa 100644 --- a/pkg/kubectl/cmd/util/factory.go +++ b/pkg/kubectl/cmd/util/factory.go @@ -102,10 +102,7 @@ func NewFactory(optionalClientConfig clientcmd.ClientConfig) *Factory { clientConfig = DefaultClientConfig(flags) } - clients := &clientCache{ - clients: make(map[string]*client.Client), - loader: clientConfig, - } + clients := NewClientCache(clientConfig) return &Factory{ clients: clients, diff --git a/pkg/kubelet/dockertools/manager.go b/pkg/kubelet/dockertools/manager.go index dee4da35e77..14653452be1 100644 --- a/pkg/kubelet/dockertools/manager.go +++ b/pkg/kubelet/dockertools/manager.go @@ -48,9 +48,11 @@ import ( ) const ( - // The oom_score_adj of the POD infrastructure container. The default is 0, so - // any value below that makes it *less* likely to get OOM killed. - podOomScoreAdj = -100 + // The oom_score_adj of the POD infrastructure container. 
The default is 0 for + // any other docker containers, so any value below that makes it *less* likely + // to get OOM killed. + podOomScoreAdj = -100 + userContainerOomScoreAdj = 0 maxReasonCacheEntries = 200 @@ -1036,29 +1038,38 @@ func (dm *DockerManager) KillPod(pod kubecontainer.Pod) error { // can be Len errors + the networkPlugin teardown error. errs := make(chan error, len(pod.Containers)+1) wg := sync.WaitGroup{} + var networkID types.UID for _, container := range pod.Containers { wg.Add(1) go func(container *kubecontainer.Container) { defer util.HandleCrash() + defer wg.Done() // TODO: Handle this without signaling the pod infra container to // adapt to the generic container runtime. if container.Name == PodInfraContainerName { - err := dm.networkPlugin.TearDownPod(pod.Namespace, pod.Name, kubeletTypes.DockerID(container.ID)) - if err != nil { - glog.Errorf("Failed tearing down the infra container: %v", err) - errs <- err - } + // Store the infra container's ID for later teardown and deletion. + // We do this so that PreStop handlers can run in the network namespace. 
+ networkID = container.ID + return } - err := dm.killContainer(container.ID) - if err != nil { + if err := dm.killContainer(container.ID); err != nil { glog.Errorf("Failed to delete container: %v; Skipping pod %q", err, pod.ID) errs <- err } - wg.Done() }(container) } wg.Wait() + if len(networkID) > 0 { + if err := dm.networkPlugin.TearDownPod(pod.Namespace, pod.Name, kubeletTypes.DockerID(networkID)); err != nil { + glog.Errorf("Failed tearing down the infra container: %v", err) + errs <- err + } + if err := dm.killContainer(networkID); err != nil { + glog.Errorf("Failed to delete container: %v; Skipping pod %q", err, pod.ID) + errs <- err + } + } close(errs) if len(errs) > 0 { errList := []error{} @@ -1181,6 +1192,28 @@ func (dm *DockerManager) runContainerInPod(pod *api.Pod, container *api.Containe if err = dm.os.Symlink(containerLogFile, symlinkFile); err != nil { glog.Errorf("Failed to create symbolic link to the log file of pod %q container %q: %v", podFullName, container.Name, err) } + + // Set OOM score of POD container to lower than those of the other containers + // which have OOM score 0 by default in the pod. This ensures that it is + // killed only as a last resort. + containerInfo, err := dm.client.InspectContainer(string(id)) + if err != nil { + return "", err + } + + // Ensure the PID actually exists, else we'll move ourselves. + if containerInfo.State.Pid == 0 { + return "", fmt.Errorf("failed to get init PID for Docker container %q", string(id)) + } + if container.Name == PodInfraContainerName { + util.ApplyOomScoreAdj(containerInfo.State.Pid, podOomScoreAdj) + } else { + // Child processes of the docker daemon inherit the OOM score from the docker + // daemon process. We explicitly apply OOM score 0 by default to the user + // containers to avoid daemons or POD containers being killed by the oom killer. 
+ util.ApplyOomScoreAdj(containerInfo.State.Pid, userContainerOomScoreAdj) + } + return kubeletTypes.DockerID(id), err } @@ -1235,19 +1268,6 @@ func (dm *DockerManager) createPodInfraContainer(pod *api.Pod) (kubeletTypes.Doc return "", err } - // Set OOM score of POD container to lower than those of the other - // containers in the pod. This ensures that it is killed only as a last - // resort. - containerInfo, err := dm.client.InspectContainer(string(id)) - if err != nil { - return "", err - } - - // Ensure the PID actually exists, else we'll move ourselves. - if containerInfo.State.Pid == 0 { - return "", fmt.Errorf("failed to get init PID for Docker pod infra container %q", string(id)) - } - util.ApplyOomScoreAdj(containerInfo.State.Pid, podOomScoreAdj) return id, nil } diff --git a/pkg/kubelet/dockertools/manager_test.go b/pkg/kubelet/dockertools/manager_test.go index 1241feb8e7a..fe69ffb74c3 100644 --- a/pkg/kubelet/dockertools/manager_test.go +++ b/pkg/kubelet/dockertools/manager_test.go @@ -896,7 +896,7 @@ func TestSyncPodCreateNetAndContainer(t *testing.T) { // Create pod infra container. "create", "start", "inspect_container", // Create container. - "create", "start", + "create", "start", "inspect_container", }) fakeDocker.Lock() @@ -945,7 +945,7 @@ func TestSyncPodCreatesNetAndContainerPullsImage(t *testing.T) { // Create pod infra container. "create", "start", "inspect_container", // Create container. - "create", "start", + "create", "start", "inspect_container", }) fakeDocker.Lock() @@ -997,7 +997,7 @@ func TestSyncPodWithPodInfraCreatesContainer(t *testing.T) { // Inspect pod infra container (but does not create)" "inspect_container", // Create container. - "create", "start", + "create", "start", "inspect_container", }) fakeDocker.Lock() @@ -1038,7 +1038,7 @@ func TestSyncPodDeletesWithNoPodInfraContainer(t *testing.T) { // Create pod infra container. "create", "start", "inspect_container", // Create container. 
- "create", "start", + "create", "start", "inspect_container", }) // A map iteration is used to delete containers, so must not depend on @@ -1163,7 +1163,7 @@ func TestSyncPodBadHash(t *testing.T) { // Check the pod infra container. "inspect_container", // Kill and restart the bad hash container. - "inspect_container", "stop", "create", "start", + "inspect_container", "stop", "create", "start", "inspect_container", }) if err := fakeDocker.AssertStopped([]string{"1234"}); err != nil { @@ -1223,7 +1223,7 @@ func TestSyncPodsUnhealthy(t *testing.T) { // Kill the unhealthy container. "inspect_container", "stop", // Restart the unhealthy container. - "create", "start", + "create", "start", "inspect_container", }) if err := fakeDocker.AssertStopped([]string{"1234"}); err != nil { @@ -1408,7 +1408,7 @@ func TestSyncPodWithRestartPolicy(t *testing.T) { // Check the pod infra container. "inspect_container", // Restart both containers. - "create", "start", "create", "start", + "create", "start", "inspect_container", "create", "start", "inspect_container", }, []string{"succeeded", "failed"}, []string{}, @@ -1419,7 +1419,7 @@ func TestSyncPodWithRestartPolicy(t *testing.T) { // Check the pod infra container. "inspect_container", // Restart the failed container. - "create", "start", + "create", "start", "inspect_container", }, []string{"failed"}, []string{}, @@ -1832,7 +1832,7 @@ func TestSyncPodWithPodInfraCreatesContainerCallsHandler(t *testing.T) { // Check the pod infra container. "inspect_container", // Create container. 
- "create", "start", + "create", "start", "inspect_container", }) fakeDocker.Lock() diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index f071f35f24e..5e222655f16 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -1615,6 +1615,11 @@ func (kl *Kubelet) admitPods(allPods []*api.Pod, podSyncTypes map[types.UID]metr func (kl *Kubelet) syncLoop(updates <-chan PodUpdate, handler SyncHandler) { glog.Info("Starting kubelet main sync loop.") for { + if !kl.containerRuntimeUp() { + time.Sleep(5 * time.Second) + glog.Infof("Skipping pod synchronization, container runtime is not up.") + continue + } unsyncedPod := false podSyncTypes := make(map[types.UID]metrics.SyncPodType) select { @@ -1875,11 +1880,7 @@ func (kl *Kubelet) setNodeStatus(node *api.Node) error { } // Check whether container runtime can be reported as up. - containerRuntimeUp := func() bool { - kl.runtimeMutex.Lock() - defer kl.runtimeMutex.Unlock() - return kl.lastTimestampRuntimeUp.Add(kl.runtimeUpThreshold).After(time.Now()) - }() + containerRuntimeUp := kl.containerRuntimeUp() currentTime := util.Now() var newNodeReadyCondition api.NodeCondition @@ -1942,6 +1943,12 @@ func (kl *Kubelet) setNodeStatus(node *api.Node) error { return nil } +func (kl *Kubelet) containerRuntimeUp() bool { + kl.runtimeMutex.Lock() + defer kl.runtimeMutex.Unlock() + return kl.lastTimestampRuntimeUp.Add(kl.runtimeUpThreshold).After(time.Now()) +} + // tryUpdateNodeStatus tries to update node status to master. If ReconcileCBR0 // is set, this function will also confirm that cbr0 is configured correctly. func (kl *Kubelet) tryUpdateNodeStatus() error { diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go index e4b52692599..7428628cc46 100644 --- a/pkg/kubelet/kubelet_test.go +++ b/pkg/kubelet/kubelet_test.go @@ -485,7 +485,7 @@ func TestSyncPodsWithTerminationLog(t *testing.T) { // Create pod infra container. "create", "start", "inspect_container", // Create container. 
- "create", "start", + "create", "start", "inspect_container", // Get pod status. "list", "inspect_container", "inspect_container", // Get pods for deleting orphaned volumes. diff --git a/pkg/master/master.go b/pkg/master/master.go index 24fec4ed937..2e3b8ed5d89 100644 --- a/pkg/master/master.go +++ b/pkg/master/master.go @@ -210,7 +210,7 @@ type Master struct { InsecureHandler http.Handler // Used for secure proxy - tunnels util.SSHTunnelList + tunnels *util.SSHTunnelList tunnelsLock sync.Mutex installSSHKey InstallSSHKey } @@ -495,6 +495,11 @@ func (m *Master) init(c *Config) { var proxyDialer func(net, addr string) (net.Conn, error) if len(c.SSHUser) > 0 { + // Usernames are capped @ 32 + if len(c.SSHUser) > 32 { + glog.Warning("SSH User is too long, truncating to 32 chars") + c.SSHUser = c.SSHUser[0:32] + } glog.Infof("Setting up proxy: %s %s", c.SSHUser, c.SSHKeyfile) exists, err := util.FileExists(c.SSHKeyfile) if err != nil { @@ -772,7 +777,7 @@ func (m *Master) Dial(net, addr string) (net.Conn, error) { } func (m *Master) needToReplaceTunnels(addrs []string) bool { - if len(m.tunnels) != len(addrs) { + if m.tunnels == nil || m.tunnels.Len() != len(addrs) { return true } // TODO (cjcullen): This doesn't need to be n^2 @@ -807,7 +812,9 @@ func (m *Master) replaceTunnels(user, keyfile string, newAddrs []string) error { if err != nil { return err } - tunnels.Open() + if err := tunnels.Open(); err != nil { + return err + } if m.tunnels != nil { m.tunnels.Close() } @@ -844,31 +851,24 @@ func (m *Master) refreshTunnels(user, keyfile string) error { func (m *Master) setupSecureProxy(user, keyfile string) { // Sync loop for tunnels // TODO: switch this to watch. 
- go func() { - for { - if err := m.loadTunnels(user, keyfile); err != nil { - glog.Errorf("Failed to load SSH Tunnels: %v", err) - } - var sleep time.Duration - if len(m.tunnels) == 0 { - sleep = time.Second - } else { - // tunnels could lag behind current set of nodes - sleep = 10 * time.Second - } - time.Sleep(sleep) + go util.Until(func() { + if err := m.loadTunnels(user, keyfile); err != nil { + glog.Errorf("Failed to load SSH Tunnels: %v", err) } - }() + if m.tunnels != nil && m.tunnels.Len() != 0 { + // Sleep for 10 seconds if we have some tunnels. + // TODO (cjcullen): tunnels can lag behind actually existing nodes. + time.Sleep(9 * time.Second) + } + }, 1*time.Second, util.NeverStop) // Refresh loop for tunnels // TODO: could make this more controller-ish - go func() { - for { - time.Sleep(5 * time.Minute) - if err := m.refreshTunnels(user, keyfile); err != nil { - glog.Errorf("Failed to refresh SSH Tunnels: %v", err) - } + go util.Until(func() { + time.Sleep(5 * time.Minute) + if err := m.refreshTunnels(user, keyfile); err != nil { + glog.Errorf("Failed to refresh SSH Tunnels: %v", err) } - }() + }, 0*time.Second, util.NeverStop) } func (m *Master) generateSSHKey(user, keyfile string) error { diff --git a/pkg/util/ssh.go b/pkg/util/ssh.go index 706190985ba..7d07249aebb 100644 --- a/pkg/util/ssh.go +++ b/pkg/util/ssh.go @@ -32,9 +32,30 @@ import ( "time" "github.com/golang/glog" + "github.com/prometheus/client_golang/prometheus" "golang.org/x/crypto/ssh" ) +var ( + tunnelOpenCounter = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "ssh_tunnel_open_count", + Help: "Counter of ssh tunnel total open attempts", + }, + ) + tunnelOpenFailCounter = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: "ssh_tunnel_open_fail_count", + Help: "Counter of ssh tunnel failed open attempts", + }, + ) +) + +func init() { + prometheus.MustRegister(tunnelOpenCounter) + prometheus.MustRegister(tunnelOpenFailCounter) +} + // TODO: Unit tests for this code, we can 
spin up a test SSH server with instructions here: // https://godoc.org/golang.org/x/crypto/ssh#ServerConn type SSHTunnel struct { @@ -83,7 +104,9 @@ func makeSSHTunnel(user string, signer ssh.Signer, host string) (*SSHTunnel, err func (s *SSHTunnel) Open() error { var err error s.client, err = ssh.Dial("tcp", net.JoinHostPort(s.Host, s.SSHPort), s.Config) + tunnelOpenCounter.Inc() if err != nil { + tunnelOpenFailCounter.Inc() return err } return nil @@ -97,6 +120,9 @@ func (s *SSHTunnel) Dial(network, address string) (net.Conn, error) { } func (s *SSHTunnel) tunnel(conn net.Conn, remoteHost, remotePort string) error { + if s.client == nil { + return errors.New("tunnel is not opened.") + } tunnel, err := s.client.Dial("tcp", net.JoinHostPort(remoteHost, remotePort)) if err != nil { return err @@ -107,6 +133,9 @@ func (s *SSHTunnel) tunnel(conn net.Conn, remoteHost, remotePort string) error { } func (s *SSHTunnel) Close() error { + if s.client == nil { + return errors.New("Cannot close tunnel. Tunnel was not opened.") + } if err := s.client.Close(); err != nil { return err } @@ -178,9 +207,11 @@ type SSHTunnelEntry struct { Tunnel *SSHTunnel } -type SSHTunnelList []SSHTunnelEntry +type SSHTunnelList struct { + entries []SSHTunnelEntry +} -func MakeSSHTunnels(user, keyfile string, addresses []string) (SSHTunnelList, error) { +func MakeSSHTunnels(user, keyfile string, addresses []string) (*SSHTunnelList, error) { tunnels := []SSHTunnelEntry{} for ix := range addresses { addr := addresses[ix] @@ -190,25 +221,35 @@ func MakeSSHTunnels(user, keyfile string, addresses []string) (SSHTunnelList, er } tunnels = append(tunnels, SSHTunnelEntry{addr, tunnel}) } - return tunnels, nil + return &SSHTunnelList{tunnels}, nil } -func (l SSHTunnelList) Open() error { - for ix := range l { - if err := l[ix].Tunnel.Open(); err != nil { - return err +// Open attempts to open all tunnels in the list, and removes any tunnels that +// failed to open. 
+func (l *SSHTunnelList) Open() error { + var openTunnels []SSHTunnelEntry + for ix := range l.entries { + if err := l.entries[ix].Tunnel.Open(); err != nil { + glog.Errorf("Failed to open tunnel %v: %v", l.entries[ix], err) + } else { + openTunnels = append(openTunnels, l.entries[ix]) } } + l.entries = openTunnels + if len(l.entries) == 0 { + return errors.New("Failed to open any tunnels.") + } return nil } // Close asynchronously closes all tunnels in the list after waiting for 1 // minute. Tunnels will still be open upon this function's return, but should // no longer be used. -func (l SSHTunnelList) Close() { - for ix := range l { - entry := l[ix] +func (l *SSHTunnelList) Close() { + for ix := range l.entries { + entry := l.entries[ix] go func() { + defer HandleCrash() time.Sleep(1 * time.Minute) if err := entry.Tunnel.Close(); err != nil { glog.Errorf("Failed to close tunnel %v: %v", entry, err) @@ -217,22 +258,26 @@ func (l SSHTunnelList) Close() { } } -func (l SSHTunnelList) Dial(network, addr string) (net.Conn, error) { - if len(l) == 0 { +func (l *SSHTunnelList) Dial(network, addr string) (net.Conn, error) { + if len(l.entries) == 0 { return nil, fmt.Errorf("Empty tunnel list.") } - return l[mathrand.Int()%len(l)].Tunnel.Dial(network, addr) + return l.entries[mathrand.Int()%len(l.entries)].Tunnel.Dial(network, addr) } -func (l SSHTunnelList) Has(addr string) bool { - for ix := range l { - if l[ix].Address == addr { +func (l *SSHTunnelList) Has(addr string) bool { + for ix := range l.entries { + if l.entries[ix].Address == addr { return true } } return false } +func (l *SSHTunnelList) Len() int { + return len(l.entries) +} + func EncodePrivateKey(private *rsa.PrivateKey) []byte { return pem.EncodeToMemory(&pem.Block{ Bytes: x509.MarshalPKCS1PrivateKey(private), diff --git a/pkg/version/base.go b/pkg/version/base.go index 2f231a65c1f..c1fe45cd63d 100644 --- a/pkg/version/base.go +++ b/pkg/version/base.go @@ -36,8 +36,8 @@ package version var ( // TODO: 
Deprecate gitMajor and gitMinor, use only gitVersion instead. gitMajor string = "0" // major version, always numeric - gitMinor string = "19.0+" // minor version, numeric possibly followed by "+" - gitVersion string = "v0.19.0-dev" // version from git, output of $(git describe) + gitMinor string = "19.1+" // minor version, numeric possibly followed by "+" + gitVersion string = "v0.19.1-dev" // version from git, output of $(git describe) gitCommit string = "" // sha1 from git, output of $(git rev-parse HEAD) gitTreeState string = "not a git tree" // state of git tree, either "clean" or "dirty" ) diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates.go b/plugin/pkg/scheduler/algorithm/predicates/predicates.go index a271aed510f..36549821024 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/predicates.go +++ b/plugin/pkg/scheduler/algorithm/predicates/predicates.go @@ -360,6 +360,20 @@ func getUsedPorts(pods ...*api.Pod) map[int]bool { return ports } +func filterNonRunningPods(pods []*api.Pod) []*api.Pod { + if len(pods) == 0 { + return pods + } + result := []*api.Pod{} + for _, pod := range pods { + if pod.Status.Phase == api.PodSucceeded || pod.Status.Phase == api.PodFailed { + continue + } + result = append(result, pod) + } + return result +} + // MapPodsToMachines obtains a list of pods and pivots that list into a map where the keys are host names // and the values are the list of pods running on that host. 
func MapPodsToMachines(lister algorithm.PodLister) (map[string][]*api.Pod, error) { @@ -369,6 +383,7 @@ func MapPodsToMachines(lister algorithm.PodLister) (map[string][]*api.Pod, error if err != nil { return map[string][]*api.Pod{}, err } + pods = filterNonRunningPods(pods) for _, scheduledPod := range pods { host := scheduledPod.Spec.NodeName machineToPods[host] = append(machineToPods[host], scheduledPod) diff --git a/plugin/pkg/scheduler/generic_scheduler_test.go b/plugin/pkg/scheduler/generic_scheduler_test.go index 19e224a7194..b220b14cb72 100644 --- a/plugin/pkg/scheduler/generic_scheduler_test.go +++ b/plugin/pkg/scheduler/generic_scheduler_test.go @@ -40,6 +40,10 @@ func matchesPredicate(pod *api.Pod, existingPods []*api.Pod, node string) (bool, return pod.Name == node, nil } +func hasNoPodsPredicate(pod *api.Pod, existingPods []*api.Pod, node string) (bool, error) { + return len(existingPods) == 0, nil +} + func numericPriority(pod *api.Pod, podLister algorithm.PodLister, minionLister algorithm.MinionLister) (algorithm.HostPriorityList, error) { nodes, err := minionLister.List() result := []algorithm.HostPriority{} @@ -166,6 +170,7 @@ func TestGenericScheduler(t *testing.T) { prioritizers []algorithm.PriorityConfig nodes []string pod *api.Pod + pods []*api.Pod expectedHost string expectsErr bool }{ @@ -223,11 +228,66 @@ func TestGenericScheduler(t *testing.T) { expectsErr: true, name: "test 7", }, + { + predicates: map[string]algorithm.FitPredicate{ + "nopods": hasNoPodsPredicate, + "matches": matchesPredicate, + }, + pods: []*api.Pod{ + { + ObjectMeta: api.ObjectMeta{Name: "2"}, + Spec: api.PodSpec{ + NodeName: "2", + }, + Status: api.PodStatus{ + Phase: api.PodRunning, + }, + }, + }, + pod: &api.Pod{ObjectMeta: api.ObjectMeta{Name: "2"}}, + + prioritizers: []algorithm.PriorityConfig{{Function: numericPriority, Weight: 1}}, + nodes: []string{"1", "2"}, + expectsErr: true, + name: "test 8", + }, + { + predicates: map[string]algorithm.FitPredicate{ + 
"nopods": hasNoPodsPredicate, + "matches": matchesPredicate, + }, + pods: []*api.Pod{ + { + ObjectMeta: api.ObjectMeta{Name: "2"}, + Spec: api.PodSpec{ + NodeName: "2", + }, + Status: api.PodStatus{ + Phase: api.PodFailed, + }, + }, + { + ObjectMeta: api.ObjectMeta{Name: "3"}, + Spec: api.PodSpec{ + NodeName: "2", + }, + Status: api.PodStatus{ + Phase: api.PodSucceeded, + }, + }, + }, + pod: &api.Pod{ObjectMeta: api.ObjectMeta{Name: "2"}}, + + prioritizers: []algorithm.PriorityConfig{{Function: numericPriority, Weight: 1}}, + nodes: []string{"1", "2"}, + expectedHost: "2", + name: "test 9", + }, } for _, test := range tests { random := rand.New(rand.NewSource(0)) - scheduler := NewGenericScheduler(test.predicates, test.prioritizers, algorithm.FakePodLister([]*api.Pod{}), random) + scheduler := NewGenericScheduler(test.predicates, test.prioritizers, algorithm.FakePodLister(test.pods), random) machine, err := scheduler.Schedule(test.pod, algorithm.FakeMinionLister(makeNodeList(test.nodes))) if test.expectsErr { if err == nil { diff --git a/test/e2e/monitoring.go b/test/e2e/monitoring.go index 4561643944d..d1807c777bc 100644 --- a/test/e2e/monitoring.go +++ b/test/e2e/monitoring.go @@ -56,8 +56,8 @@ const ( influxdbDatabaseName = "k8s" influxdbUser = "root" influxdbPW = "root" - podlistQuery = "select distinct(pod_id) from /cpu.*/" - nodelistQuery = "select distinct(hostname) from /cpu.*/" + podlistQuery = "select distinct(pod_id) from \"cpu/usage_ns_cumulative\"" + nodelistQuery = "select distinct(hostname) from \"cpu/usage_ns_cumulative\"" sleepBetweenAttempts = 5 * time.Second testTimeout = 5 * time.Minute ) diff --git a/test/e2e/resize_nodes.go b/test/e2e/resize_nodes.go index 4bdbe627f56..57deeb87c03 100644 --- a/test/e2e/resize_nodes.go +++ b/test/e2e/resize_nodes.go @@ -77,7 +77,7 @@ func waitForNodeInstanceGroupSize(size int) error { continue } if currentSize != size { - Logf("Waiting for node istance group size %d, current size %d", size, currentSize) + 
Logf("Waiting for node instance group size %d, current size %d", size, currentSize) continue } Logf("Node instance group has reached the desired size %d", size) @@ -224,7 +224,7 @@ func waitForPodsCreatedRunningResponding(c *client.Client, ns, name string, repl } var _ = Describe("Nodes", func() { - supportedProviders := []string{"gce"} + supportedProviders := []string{"gce", "gke"} var testName string var c *client.Client var ns string