From d09af4a1d6d27b766c37c747ba95bf6a43fdbd6a Mon Sep 17 00:00:00 2001
From: Matt Liggett
Date: Wed, 1 Jun 2016 14:50:13 -0700
Subject: [PATCH] Stop 'drain' deleting pods with local storage.

Unless forced with --delete-local-data.

Also a refactoring of the kubectl drain logic that selects/rejects pods
and produces error/warning messages.
---
 docs/man/man1/kubectl-drain.1            |   4 +
 docs/user-guide/kubectl/kubectl_drain.md |   3 +-
 docs/yaml/kubectl/kubectl_drain.yaml     |   4 +
 hack/verify-flags/known-flags.txt        |   1 +
 pkg/kubectl/cmd/drain.go                 | 278 ++++++++++++-----------
 pkg/kubectl/cmd/drain_test.go            |  36 +++
 6 files changed, 196 insertions(+), 130 deletions(-)
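The drain.go changes below replace one large pod-classification function with small per-concern filters that are chained over every pod on the node. The sketch that follows is illustrative only -- simplified stand-in types rather than the patch's code -- but it shows the shape of that podFilter-style pipeline and how each filter reports inclusion plus an optional warning or fatal condition:

```go
// Illustrative sketch (not part of the patch) of the filter pipeline
// introduced in pkg/kubectl/cmd/drain.go below.
package main

import "fmt"

// pod is a simplified stand-in for api.Pod, reduced to what the example needs.
type pod struct {
	name        string
	hasEmptyDir bool
	mirror      bool
}

// filter mirrors the podFilter idea: should drain delete this pod, and is
// there an optional warning or fatal condition to report?
type filter func(p pod) (include bool, warn string, fatal string)

func main() {
	deleteLocalData := false // stand-in for the --delete-local-data flag

	// Mirror pods are silently skipped: excluded, but neither a warning nor an error.
	mirrorFilter := func(p pod) (bool, string, string) {
		return !p.mirror, "", ""
	}
	// Pods with emptyDir volumes are fatal unless --delete-local-data was given.
	localStorageFilter := func(p pod) (bool, string, string) {
		switch {
		case !p.hasEmptyDir:
			return true, "", ""
		case deleteLocalData:
			return true, "Deleting pods with local storage", ""
		default:
			return false, "", "pods with local storage (use --delete-local-data to override)"
		}
	}

	for _, p := range []pod{{"web-1", false, false}, {"scratch-1", true, false}} {
		include := true
		for _, f := range []filter{mirrorFilter, localStorageFilter} {
			ok, warn, fatal := f(p)
			include = include && ok
			if warn != "" {
				fmt.Printf("WARNING: %s: %s\n", warn, p.name)
			}
			if fatal != "" {
				fmt.Printf("error: %s: %s\n", fatal, p.name)
			}
		}
		if include {
			fmt.Println("would delete:", p.name)
		}
	}
}
```

Keeping each policy (--force, --ignore-daemonsets, --delete-local-data) in its own filter is what lets the real code report every blocking condition at once instead of failing on the first one it happens to hit.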
diff --git a/docs/man/man1/kubectl-drain.1 b/docs/man/man1/kubectl-drain.1
index 97a532417de..833998de53d 100644
--- a/docs/man/man1/kubectl-drain.1
+++ b/docs/man/man1/kubectl-drain.1
@@ -32,6 +32,10 @@ will make the node schedulable again.
 
 .SH OPTIONS
 
+.PP
+\fB\-\-delete\-local\-data\fP=false
+    Continue even if there are pods using emptyDir (local data that will be deleted when the node is drained).
+
 .PP
 \fB\-\-force\fP=false
     Continue even if there are pods not managed by a ReplicationController, ReplicaSet, Job, or DaemonSet.
diff --git a/docs/user-guide/kubectl/kubectl_drain.md b/docs/user-guide/kubectl/kubectl_drain.md
index 40afa26a6b0..678cfe48a74 100644
--- a/docs/user-guide/kubectl/kubectl_drain.md
+++ b/docs/user-guide/kubectl/kubectl_drain.md
@@ -73,6 +73,7 @@ $ kubectl drain foo --grace-period=900
 ### Options
 
 ```
+      --delete-local-data[=false]: Continue even if there are pods using emptyDir (local data that will be deleted when the node is drained).
       --force[=false]: Continue even if there are pods not managed by a ReplicationController, ReplicaSet, Job, or DaemonSet.
       --grace-period=-1: Period of time in seconds given to each pod to terminate gracefully. If negative, the default value specified in the pod will be used.
       --ignore-daemonsets[=false]: Ignore DaemonSet-managed pods.
@@ -110,7 +111,7 @@ $ kubectl drain foo --grace-period=900
 
 * [kubectl](kubectl.md)	 - kubectl controls the Kubernetes cluster manager
 
-###### Auto generated by spf13/cobra on 15-Apr-2016
+###### Auto generated by spf13/cobra on 6-Jun-2016
 
 
 [![Analytics](https://kubernetes-site.appspot.com/UA-36037335-10/GitHub/docs/user-guide/kubectl/kubectl_drain.md?pixel)]()
diff --git a/docs/yaml/kubectl/kubectl_drain.yaml b/docs/yaml/kubectl/kubectl_drain.yaml
index 2c50f897323..33c13f45188 100644
--- a/docs/yaml/kubectl/kubectl_drain.yaml
+++ b/docs/yaml/kubectl/kubectl_drain.yaml
@@ -16,6 +16,10 @@ description: |
   When you are ready to put the node back into service, use kubectl uncordon, which
   will make the node schedulable again.
 options:
+- name: delete-local-data
+  default_value: "false"
+  usage: |
+    Continue even if there are pods using emptyDir (local data that will be deleted when the node is drained).
 - name: force
   default_value: "false"
   usage: |
     Continue even if there are pods not managed by a ReplicationController, ReplicaSet, Job, or DaemonSet.
diff --git a/hack/verify-flags/known-flags.txt b/hack/verify-flags/known-flags.txt
index 9513c80f081..6b601fba7da 100644
--- a/hack/verify-flags/known-flags.txt
+++ b/hack/verify-flags/known-flags.txt
@@ -89,6 +89,7 @@ default-container-mem-limit
 delay-shutdown
 delete-collection-workers
 delete-instances
+delete-local-data
 delete-namespace
 deleting-pods-burst
 deleting-pods-qps
diff --git a/pkg/kubectl/cmd/drain.go b/pkg/kubectl/cmd/drain.go
index 04ba88732d6..944707d43e0 100644
--- a/pkg/kubectl/cmd/drain.go
+++ b/pkg/kubectl/cmd/drain.go
@@ -27,9 +27,8 @@ import (
 
 	"k8s.io/kubernetes/pkg/api"
 	"k8s.io/kubernetes/pkg/api/meta"
-	"k8s.io/kubernetes/pkg/controller"
-	// "k8s.io/kubernetes/pkg/api/unversioned"
 	client "k8s.io/kubernetes/pkg/client/unversioned"
+	"k8s.io/kubernetes/pkg/controller"
 	"k8s.io/kubernetes/pkg/fields"
 	cmdutil "k8s.io/kubernetes/pkg/kubectl/cmd/util"
 	"k8s.io/kubernetes/pkg/kubectl/resource"
@@ -43,14 +42,31 @@ type DrainOptions struct {
 	Force              bool
 	GracePeriodSeconds int
 	IgnoreDaemonsets   bool
+	DeleteLocalData    bool
 	mapper             meta.RESTMapper
 	nodeInfo           *resource.Info
 	out                io.Writer
 	typer              runtime.ObjectTyper
 }
 
+// Takes a pod and returns a bool indicating whether or not to operate on the
+// pod, an optional warning message, and an optional fatal error.
+type podFilter func(api.Pod) (include bool, w *warning, f *fatal)
+type warning struct {
+	string
+}
+type fatal struct {
+	string
+}
+
 const (
-	cordon_long = `Mark node as unschedulable.
+	kDaemonsetFatal      = "DaemonSet-managed pods (use --ignore-daemonsets to ignore)"
+	kDaemonsetWarning    = "Ignoring DaemonSet-managed pods"
+	kLocalStorageFatal   = "pods with local storage (use --delete-local-data to override)"
+	kLocalStorageWarning = "Deleting pods with local storage"
+	kUnmanagedFatal      = "pods not managed by ReplicationController, ReplicaSet, Job, or DaemonSet (use --force to override)"
+	kUnmanagedWarning    = "Deleting pods not managed by ReplicationController, ReplicaSet, Job, or DaemonSet"
+	cordon_long          = `Mark node as unschedulable.
 `
 	cordon_example = `# Mark node "foo" as unschedulable.
 kubectl cordon foo
@@ -136,6 +152,7 @@ func NewCmdDrain(f *cmdutil.Factory, out io.Writer) *cobra.Command {
 	}
 	cmd.Flags().BoolVar(&options.Force, "force", false, "Continue even if there are pods not managed by a ReplicationController, ReplicaSet, Job, or DaemonSet.")
 	cmd.Flags().BoolVar(&options.IgnoreDaemonsets, "ignore-daemonsets", false, "Ignore DaemonSet-managed pods.")
+	cmd.Flags().BoolVar(&options.DeleteLocalData, "delete-local-data", false, "Continue even if there are pods using emptyDir (local data that will be deleted when the node is drained).")
 	cmd.Flags().IntVar(&options.GracePeriodSeconds, "grace-period", -1, "Period of time in seconds given to each pod to terminate gracefully. If negative, the default value specified in the pod will be used.")
 	return cmd
 }
@@ -195,148 +212,151 @@ func (o *DrainOptions) RunDrain() error {
 	return nil
 }
 
-// getPodsForDeletion returns all the pods we're going to delete. If there are
-// any unmanaged pods and the user didn't pass --force, we return that list in
-// an error.
-func (o *DrainOptions) getPodsForDeletion() ([]api.Pod, error) {
-	pods, unreplicatedPodNames, daemonSetPodNames, err := GetPodsForDeletionOnNodeDrain(
-		o.client,
-		o.nodeInfo.Name,
-		o.factory.Decoder(true),
-		o.Force,
-		o.IgnoreDaemonsets,
-	)
-	if err != nil {
-		return []api.Pod{}, err
+func (o *DrainOptions) getController(sr *api.SerializedReference) (interface{}, error) {
+	switch sr.Reference.Kind {
+	case "ReplicationController":
+		return o.client.ReplicationControllers(sr.Reference.Namespace).Get(sr.Reference.Name)
+	case "DaemonSet":
+		return o.client.DaemonSets(sr.Reference.Namespace).Get(sr.Reference.Name)
+	case "Job":
+		return o.client.ExtensionsClient.Jobs(sr.Reference.Namespace).Get(sr.Reference.Name)
+	case "ReplicaSet":
+		return o.client.ExtensionsClient.ReplicaSets(sr.Reference.Namespace).Get(sr.Reference.Name)
 	}
-
-	daemonSetErrors := !o.IgnoreDaemonsets && len(daemonSetPodNames) > 0
-	unreplicatedErrors := !o.Force && len(unreplicatedPodNames) > 0
-
-	switch {
-	case daemonSetErrors && unreplicatedErrors:
-		return []api.Pod{}, errors.New(unmanagedMsg(unreplicatedPodNames, daemonSetPodNames, true))
-	case daemonSetErrors && !unreplicatedErrors:
-		return []api.Pod{}, errors.New(unmanagedMsg([]string{}, daemonSetPodNames, true))
-	case unreplicatedErrors && !daemonSetErrors:
-		return []api.Pod{}, errors.New(unmanagedMsg(unreplicatedPodNames, []string{}, true))
-	}
-
-	if len(unreplicatedPodNames) > 0 {
-		fmt.Fprintf(o.out, "WARNING: About to delete these %s\n", unmanagedMsg(unreplicatedPodNames, []string{}, false))
-	}
-	if len(daemonSetPodNames) > 0 {
-		fmt.Fprintf(o.out, "WARNING: Skipping %s\n", unmanagedMsg([]string{}, daemonSetPodNames, false))
-	}
-
-	return pods, nil
+	return nil, fmt.Errorf("Unknown controller kind %q", sr.Reference.Kind)
 }
 
-// GetPodsForDeletionOnNodeDrain returns pods that should be deleted on node drain as well as some extra information
-// about possibly problematic pods (unreplicated and deamon sets).
-func GetPodsForDeletionOnNodeDrain(client *client.Client, nodename string, decoder runtime.Decoder, force bool,
-	ignoreDeamonSet bool) (pods []api.Pod, unreplicatedPodNames []string, daemonSetPodNames []string, finalError error) {
-
-	pods = []api.Pod{}
-	unreplicatedPodNames = []string{}
-	daemonSetPodNames = []string{}
-	podList, err := client.Pods(api.NamespaceAll).List(api.ListOptions{FieldSelector: fields.SelectorFromSet(fields.Set{"spec.nodeName": nodename})})
-	if err != nil {
-		return []api.Pod{}, []string{}, []string{}, err
+func (o *DrainOptions) getPodCreator(pod api.Pod) (*api.SerializedReference, error) {
+	creatorRef, found := pod.ObjectMeta.Annotations[controller.CreatedByAnnotation]
+	if !found {
+		return nil, nil
 	}
+	// Now verify that the specified creator actually exists.
+	sr := &api.SerializedReference{}
+	if err := runtime.DecodeInto(o.factory.Decoder(true), []byte(creatorRef), sr); err != nil {
+		return nil, err
+	}
+	// We assume the only reason for an error is because the controller is
+	// gone/missing, not for any other cause. TODO(mml): something more
+	// sophisticated than this
+	_, err := o.getController(sr)
+	if err != nil {
+		return nil, err
+	}
+	return sr, nil
+}
+
+func (o *DrainOptions) unreplicatedFilter(pod api.Pod) (bool, *warning, *fatal) {
+	sr, err := o.getPodCreator(pod)
+	if err != nil {
+		return false, nil, &fatal{err.Error()}
+	}
+	if sr != nil {
+		return true, nil, nil
+	}
+	if !o.Force {
+		return false, nil, &fatal{kUnmanagedFatal}
+	}
+	return true, &warning{kUnmanagedWarning}, nil
+}
+
+func (o *DrainOptions) daemonsetFilter(pod api.Pod) (bool, *warning, *fatal) {
+	// Note that we return false in all cases where the pod is DaemonSet managed,
+	// regardless of flags. We never delete them, the only question is whether
+	// their presence constitutes an error.
+	sr, err := o.getPodCreator(pod)
+	if err != nil {
+		return false, nil, &fatal{err.Error()}
+	}
+	if sr == nil || sr.Reference.Kind != "DaemonSet" {
+		return true, nil, nil
+	}
+	if _, err := o.client.DaemonSets(sr.Reference.Namespace).Get(sr.Reference.Name); err != nil {
+		return false, nil, &fatal{err.Error()}
+	}
+	if !o.IgnoreDaemonsets {
+		return false, nil, &fatal{kDaemonsetFatal}
+	}
+	return false, &warning{kDaemonsetWarning}, nil
+}
+
+func mirrorPodFilter(pod api.Pod) (bool, *warning, *fatal) {
+	if _, found := pod.ObjectMeta.Annotations[types.ConfigMirrorAnnotationKey]; found {
+		return false, nil, nil
+	}
+	return true, nil, nil
+}
+
+func hasLocalStorage(pod api.Pod) bool {
+	for _, volume := range pod.Spec.Volumes {
+		if volume.EmptyDir != nil {
+			return true
+		}
+	}
+
+	return false
+}
+
+func (o *DrainOptions) localStorageFilter(pod api.Pod) (bool, *warning, *fatal) {
+	if !hasLocalStorage(pod) {
+		return true, nil, nil
+	}
+	if !o.DeleteLocalData {
+		return false, nil, &fatal{kLocalStorageFatal}
+	}
+	return true, &warning{kLocalStorageWarning}, nil
+}
+
+// Map of status message to a list of pod names having that status.
+type podStatuses map[string][]string
+
+func (ps podStatuses) Message() string {
+	msgs := []string{}
+
+	for key, pods := range ps {
+		msgs = append(msgs, fmt.Sprintf("%s: %s", key, strings.Join(pods, ", ")))
+	}
+	return strings.Join(msgs, "; ")
+}
+
+// getPodsForDeletion returns all the pods we're going to delete. If there are
+// any pods preventing us from deleting, we return that list in an error.
+func (o *DrainOptions) getPodsForDeletion() (pods []api.Pod, err error) {
+	podList, err := o.client.Pods(api.NamespaceAll).List(api.ListOptions{
+		FieldSelector: fields.SelectorFromSet(fields.Set{"spec.nodeName": o.nodeInfo.Name})})
+	if err != nil {
+		return pods, err
+	}
+
+	ws := podStatuses{}
+	fs := podStatuses{}
+
 	for _, pod := range podList.Items {
-		_, found := pod.ObjectMeta.Annotations[types.ConfigMirrorAnnotationKey]
-		if found {
-			// Skip mirror pod
-			continue
-		}
-		replicated := false
-		daemonset_pod := false
+		podOk := true
+		for _, filt := range []podFilter{mirrorPodFilter, o.localStorageFilter, o.unreplicatedFilter, o.daemonsetFilter} {
+			filterOk, w, f := filt(pod)
 
-		creatorRef, found := pod.ObjectMeta.Annotations[controller.CreatedByAnnotation]
-		if found {
-			// Now verify that the specified creator actually exists.
-			var sr api.SerializedReference
-			if err := runtime.DecodeInto(decoder, []byte(creatorRef), &sr); err != nil {
-				return []api.Pod{}, []string{}, []string{}, err
+			podOk = podOk && filterOk
+			if w != nil {
+				ws[w.string] = append(ws[w.string], pod.Name)
 			}
-			if sr.Reference.Kind == "ReplicationController" {
-				rc, err := client.ReplicationControllers(sr.Reference.Namespace).Get(sr.Reference.Name)
-				// Assume the only reason for an error is because the RC is
-				// gone/missing, not for any other cause. TODO(mml): something more
-				// sophisticated than this
-				if err == nil && rc != nil {
-					replicated = true
-				}
-			} else if sr.Reference.Kind == "DaemonSet" {
-				ds, err := client.DaemonSets(sr.Reference.Namespace).Get(sr.Reference.Name)
-
-				// Assume the only reason for an error is because the DaemonSet is
-				// gone/missing, not for any other cause. TODO(mml): something more
-				// sophisticated than this
-				if err == nil && ds != nil {
-					// Otherwise, treat daemonset-managed pods as unmanaged since
-					// DaemonSet Controller currently ignores the unschedulable bit.
-					// FIXME(mml): Add link to the issue concerning a proper way to drain
-					// daemonset pods, probably using taints.
-					daemonset_pod = true
-				}
-			} else if sr.Reference.Kind == "Job" {
-				job, err := client.ExtensionsClient.Jobs(sr.Reference.Namespace).Get(sr.Reference.Name)
-
-				// Assume the only reason for an error is because the Job is
-				// gone/missing, not for any other cause. TODO(mml): something more
-				// sophisticated than this
-				if err == nil && job != nil {
-					replicated = true
-				}
-			} else if sr.Reference.Kind == "ReplicaSet" {
-				rs, err := client.ExtensionsClient.ReplicaSets(sr.Reference.Namespace).Get(sr.Reference.Name)
-
-				// Assume the only reason for an error is because the RS is
-				// gone/missing, not for any other cause. TODO(mml): something more
-				// sophisticated than this
-				if err == nil && rs != nil {
-					replicated = true
-				}
+			if f != nil {
+				fs[f.string] = append(fs[f.string], pod.Name)
 			}
 		}
-
-		switch {
-		case daemonset_pod:
-			daemonSetPodNames = append(daemonSetPodNames, pod.Name)
-		case !replicated:
-			unreplicatedPodNames = append(unreplicatedPodNames, pod.Name)
-			if force {
-				pods = append(pods, pod)
-			}
-		default:
+		if podOk {
 			pods = append(pods, pod)
 		}
 	}
-	return pods, unreplicatedPodNames, daemonSetPodNames, nil
-}
 
-// Helper for generating errors or warnings about unmanaged pods.
-func unmanagedMsg(unreplicatedNames []string, daemonSetNames []string, include_guidance bool) string {
-	msgs := []string{}
-	if len(unreplicatedNames) > 0 {
-		msg := fmt.Sprintf("pods not managed by ReplicationController, ReplicaSet, Job, or DaemonSet: %s", strings.Join(unreplicatedNames, ","))
-		if include_guidance {
-			msg += " (use --force to override)"
-		}
-		msgs = append(msgs, msg)
+	if len(fs) > 0 {
+		return []api.Pod{}, errors.New(fs.Message())
 	}
-	if len(daemonSetNames) > 0 {
-		msg := fmt.Sprintf("DaemonSet-managed pods: %s", strings.Join(daemonSetNames, ","))
-		if include_guidance {
-			msg += " (use --ignore-daemonsets to ignore)"
-		}
-		msgs = append(msgs, msg)
+	if len(ws) > 0 {
+		fmt.Fprintf(o.out, "WARNING: %s\n", ws.Message())
 	}
-
-	return strings.Join(msgs, " and ")
+	return pods, nil
 }
 
 // deletePods deletes the pods on the api server
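The podStatuses map and its Message() method above are what turn the per-filter results into a single aggregated error or warning line. As a standalone illustration of the resulting output format (a sketch only -- the real type is unexported inside the drain command, so it is re-declared here, and the pod names are made up):

```go
// Illustrative sketch of the message format produced by the podStatuses
// aggregation in drain.go above.
package main

import (
	"fmt"
	"strings"
)

// podStatuses maps a status message to the pods that triggered it.
type podStatuses map[string][]string

// Message joins each status message with its pod names, the shape used for
// drain's aggregated error and warning output.
func (ps podStatuses) Message() string {
	msgs := []string{}
	for key, pods := range ps {
		msgs = append(msgs, fmt.Sprintf("%s: %s", key, strings.Join(pods, ", ")))
	}
	return strings.Join(msgs, "; ")
}

func main() {
	fs := podStatuses{}
	fs["pods with local storage (use --delete-local-data to override)"] = []string{"cache-0", "cache-1"}
	fs["DaemonSet-managed pods (use --ignore-daemonsets to ignore)"] = []string{"fluentd-abc"}

	// Prints something like (clause order varies with map iteration):
	//   pods with local storage (use --delete-local-data to override): cache-0, cache-1; DaemonSet-managed pods (use --ignore-daemonsets to ignore): fluentd-abc
	fmt.Println(fs.Message())
}
```

Because every blocking condition ends up in one map keyed by message, a single drain failure can name all of the offending pods and the flag that would override each condition.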
diff --git a/pkg/kubectl/cmd/drain_test.go b/pkg/kubectl/cmd/drain_test.go
index eb79b0b262e..4fb3bad4b88 100644
--- a/pkg/kubectl/cmd/drain_test.go
+++ b/pkg/kubectl/cmd/drain_test.go
@@ -325,6 +325,24 @@ func TestDrain(t *testing.T) {
 		},
 	}
 
+	emptydir_pod := api.Pod{
+		ObjectMeta: api.ObjectMeta{
+			Name:              "bar",
+			Namespace:         "default",
+			CreationTimestamp: unversioned.Time{Time: time.Now()},
+			Labels:            labels,
+		},
+		Spec: api.PodSpec{
+			NodeName: "node",
+			Volumes: []api.Volume{
+				{
+					Name:         "scratch",
+					VolumeSource: api.VolumeSource{EmptyDir: &api.EmptyDirVolumeSource{Medium: ""}},
+				},
+			},
+		},
+	}
+
 	tests := []struct {
 		description  string
 		node         *api.Node
@@ -406,6 +424,24 @@ func TestDrain(t *testing.T) {
 			expectFatal:  false,
 			expectDelete: true,
 		},
+		{
+			description:  "pod with EmptyDir",
+			node:         node,
+			expected:     cordoned_node,
+			pods:         []api.Pod{emptydir_pod},
+			args:         []string{"node", "--force"},
+			expectFatal:  true,
+			expectDelete: false,
+		},
+		{
+			description:  "pod with EmptyDir and --delete-local-data",
+			node:         node,
+			expected:     cordoned_node,
+			pods:         []api.Pod{emptydir_pod},
+			args:         []string{"node", "--force", "--delete-local-data=true"},
+			expectFatal:  false,
+			expectDelete: true,
+		},
 		{
 			description:  "empty node",
 			node:         node,