From f2d5375bc0497d10252d9ef240dc80416597eea7 Mon Sep 17 00:00:00 2001
From: Matt Liggett
Date: Wed, 27 Jan 2016 10:27:14 -0800
Subject: [PATCH] Stop deleting DaemonSet pods during drain.

We do this because they will be recreated immediately by the DaemonSet
Controller. In addition, we require a specific flag (--ignore-daemonsets)
when there are DaemonSet pods on the node.
---
 contrib/completions/bash/kubectl         |  1 +
 docs/man/man1/kubectl-drain.1            | 13 +++-
 docs/user-guide/kubectl/kubectl_drain.md | 12 ++--
 hack/verify-flags/known-flags.txt        |  1 +
 pkg/kubectl/cmd/drain.go                 | 79 +++++++++++++++++++-----
 pkg/kubectl/cmd/drain_test.go            | 12 +++-
 6 files changed, 96 insertions(+), 22 deletions(-)

diff --git a/contrib/completions/bash/kubectl b/contrib/completions/bash/kubectl
index e5235d5ccf3..4fa8a53294b 100644
--- a/contrib/completions/bash/kubectl
+++ b/contrib/completions/bash/kubectl
@@ -1194,6 +1194,7 @@ _kubectl_drain()
     flags+=("--force")
     flags+=("--grace-period=")
+    flags+=("--ignore-daemonsets")
     flags+=("--alsologtostderr")
     flags+=("--api-version=")
     flags+=("--certificate-authority=")
diff --git a/docs/man/man1/kubectl-drain.1 b/docs/man/man1/kubectl-drain.1
index 512969f2e2f..d8514b45278 100644
--- a/docs/man/man1/kubectl-drain.1
+++ b/docs/man/man1/kubectl-drain.1
@@ -18,9 +18,12 @@ Drain node in preparation for maintenance.
 .PP
 The given node will be marked unschedulable to prevent new pods from arriving.
 Then drain deletes all pods except mirror pods (which cannot be deleted through
-the API server). If there are any pods that are neither mirror pods nor
-managed by a ReplicationController, Job, or DaemonSet, then drain will not
-delete any pods unless you use \-\-force.
+the API server). If there are DaemonSet\-managed pods, drain will not proceed
+without \-\-ignore\-daemonsets, and regardless, it will not delete any
+DaemonSet\-managed pods, because those pods would be immediately replaced by the
+DaemonSet controller, which ignores unschedulable markings. If there are any
+pods that are neither mirror pods nor managed by a ReplicationController,
+DaemonSet, or Job, then drain will not delete any pods unless you use \-\-force.
 .PP
 When you are ready to put the node back into service, use kubectl uncordon, which
 will make the node schedulable again.
@@ -36,6 +39,10 @@ will make the node schedulable again.
 \fB\-\-grace\-period\fP=\-1
     Period of time in seconds given to each pod to terminate gracefully. If negative, the default value specified in the pod will be used.
 
+.PP
+\fB\-\-ignore\-daemonsets\fP=false
+    Ignore DaemonSet\-managed pods.
+
 .SH OPTIONS INHERITED FROM PARENT COMMANDS
 .PP
diff --git a/docs/user-guide/kubectl/kubectl_drain.md b/docs/user-guide/kubectl/kubectl_drain.md
index 6a586aa1f22..1b4199c9106 100644
--- a/docs/user-guide/kubectl/kubectl_drain.md
+++ b/docs/user-guide/kubectl/kubectl_drain.md
@@ -38,9 +38,12 @@ Drain node in preparation for maintenance.
 
 The given node will be marked unschedulable to prevent new pods from arriving.
 Then drain deletes all pods except mirror pods (which cannot be deleted through
-the API server). If there are any pods that are neither mirror pods nor
-managed by a ReplicationController, Job, or DaemonSet, then drain will not
-delete any pods unless you use --force.
+the API server). If there are DaemonSet-managed pods, drain will not proceed
+without --ignore-daemonsets, and regardless, it will not delete any
+DaemonSet-managed pods, because those pods would be immediately replaced by the
+DaemonSet controller, which ignores unschedulable markings. If there are any
+pods that are neither mirror pods nor managed by a ReplicationController,
+DaemonSet, or Job, then drain will not delete any pods unless you use --force.
 
 When you are ready to put the node back into service, use kubectl uncordon, which
 will make the node schedulable again.
@@ -66,6 +69,7 @@ $ kubectl drain foo --grace-period=900
 ```
       --force[=false]: Continue even if there are pods not managed by a ReplicationController, Job, or DaemonSet.
       --grace-period=-1: Period of time in seconds given to each pod to terminate gracefully. If negative, the default value specified in the pod will be used.
+      --ignore-daemonsets[=false]: Ignore DaemonSet-managed pods.
 ```
 
 ### Options inherited from parent commands
@@ -100,7 +104,7 @@ $ kubectl drain foo --grace-period=900
 * [kubectl](kubectl.md) - kubectl controls the Kubernetes cluster manager
 
-###### Auto generated by spf13/cobra on 28-Jan-2016
+###### Auto generated by spf13/cobra on 2-Feb-2016
 
 [![Analytics](https://kubernetes-site.appspot.com/UA-36037335-10/GitHub/docs/user-guide/kubectl/kubectl_drain.md?pixel)]()
diff --git a/hack/verify-flags/known-flags.txt b/hack/verify-flags/known-flags.txt
index d30e43f3b9f..8772eb57781 100644
--- a/hack/verify-flags/known-flags.txt
+++ b/hack/verify-flags/known-flags.txt
@@ -138,6 +138,7 @@ host-pid-sources
 hostname-override
 http-check-frequency
 http-port
+ignore-daemonsets
 ignore-not-found
 image-gc-high-threshold
 image-gc-low-threshold
diff --git a/pkg/kubectl/cmd/drain.go b/pkg/kubectl/cmd/drain.go
index e05985d74a6..89cba686db3 100644
--- a/pkg/kubectl/cmd/drain.go
+++ b/pkg/kubectl/cmd/drain.go
@@ -17,6 +17,7 @@ limitations under the License.
 package cmd
 
 import (
+	"errors"
 	"fmt"
 	"io"
 	"reflect"
@@ -41,6 +42,7 @@ type DrainOptions struct {
 	factory            *cmdutil.Factory
 	Force              bool
 	GracePeriodSeconds int
+	IgnoreDaemonsets   bool
 	mapper             meta.RESTMapper
 	nodeInfo           *resource.Info
 	out                io.Writer
@@ -98,9 +100,12 @@ const (
 
 The given node will be marked unschedulable to prevent new pods from arriving.
 Then drain deletes all pods except mirror pods (which cannot be deleted through
-the API server). If there are any pods that are neither mirror pods nor
-managed by a ReplicationController, Job, or DaemonSet, then drain will not
-delete any pods unless you use --force.
+the API server). If there are DaemonSet-managed pods, drain will not proceed
+without --ignore-daemonsets, and regardless, it will not delete any
+DaemonSet-managed pods, because those pods would be immediately replaced by the
+DaemonSet controller, which ignores unschedulable markings. If there are any
+pods that are neither mirror pods nor managed by a ReplicationController,
+DaemonSet, or Job, then drain will not delete any pods unless you use --force.
 
 When you are ready to put the node back into service, use kubectl uncordon, which
 will make the node schedulable again.
@@ -127,6 +132,7 @@ func NewCmdDrain(f *cmdutil.Factory, out io.Writer) *cobra.Command {
 		},
 	}
 	cmd.Flags().BoolVar(&options.Force, "force", false, "Continue even if there are pods not managed by a ReplicationController, Job, or DaemonSet.")
+	cmd.Flags().BoolVar(&options.IgnoreDaemonsets, "ignore-daemonsets", false, "Ignore DaemonSet-managed pods.")
 	cmd.Flags().IntVar(&options.GracePeriodSeconds, "grace-period", -1, "Period of time in seconds given to each pod to terminate gracefully. If negative, the default value specified in the pod will be used.")
If negative, the default value specified in the pod will be used.") return cmd } @@ -196,6 +202,7 @@ func (o *DrainOptions) getPodsForDeletion() ([]api.Pod, error) { return pods, err } unreplicatedPodNames := []string{} + daemonSetPodNames := []string{} for _, pod := range podList.Items { _, found := pod.ObjectMeta.Annotations[types.ConfigMirrorAnnotationKey] @@ -204,6 +211,7 @@ func (o *DrainOptions) getPodsForDeletion() ([]api.Pod, error) { continue } replicated := false + daemonset_pod := false creatorRef, found := pod.ObjectMeta.Annotations[controller.CreatedByAnnotation] if found { @@ -227,7 +235,11 @@ func (o *DrainOptions) getPodsForDeletion() ([]api.Pod, error) { // gone/missing, not for any other cause. TODO(mml): something more // sophisticated than this if err == nil && ds != nil { - replicated = true + // Otherwise, treat daemonset-managed pods as unmanaged since + // DaemonSet Controller currently ignores the unschedulable bit. + // FIXME(mml): Add link to the issue concerning a proper way to drain + // daemonset pods, probably using taints. + daemonset_pod = true } } else if sr.Reference.Kind == "Job" { job, err := o.client.Jobs(sr.Reference.Namespace).Get(sr.Reference.Name) @@ -240,24 +252,63 @@ func (o *DrainOptions) getPodsForDeletion() ([]api.Pod, error) { } } } - if replicated || o.Force { - pods = append(pods, pod) - } - if !replicated { + + switch { + case daemonset_pod: + daemonSetPodNames = append(daemonSetPodNames, pod.Name) + case !replicated: unreplicatedPodNames = append(unreplicatedPodNames, pod.Name) + if o.Force { + pods = append(pods, pod) + } + default: + pods = append(pods, pod) } } - if len(unreplicatedPodNames) > 0 { - joined := strings.Join(unreplicatedPodNames, ", ") - if !o.Force { - return pods, fmt.Errorf("refusing to continue due to pods managed by neither a ReplicationController, nor a Job, nor a DaemonSet: %s (use --force to override)", joined) - } - fmt.Fprintf(o.out, "WARNING: About to delete these pods managed by neither a ReplicationController, nor a Job, nor a DaemonSet: %s\n", joined) + daemonSetErrors := !o.IgnoreDaemonsets && len(daemonSetPodNames) > 0 + unreplicatedErrors := !o.Force && len(unreplicatedPodNames) > 0 + + switch { + case daemonSetErrors && unreplicatedErrors: + return []api.Pod{}, errors.New(unmanagedMsg(unreplicatedPodNames, daemonSetPodNames, true)) + case daemonSetErrors && !unreplicatedErrors: + return []api.Pod{}, errors.New(unmanagedMsg([]string{}, daemonSetPodNames, true)) + case unreplicatedErrors && !daemonSetErrors: + return []api.Pod{}, errors.New(unmanagedMsg(unreplicatedPodNames, []string{}, true)) } + + if len(unreplicatedPodNames) > 0 { + fmt.Fprintf(o.out, "WARNING: About to delete these %s\n", unmanagedMsg(unreplicatedPodNames, []string{}, false)) + } + if len(daemonSetPodNames) > 0 { + fmt.Fprintf(o.out, "WARNING: Skipping %s\n", unmanagedMsg([]string{}, daemonSetPodNames, false)) + } + return pods, nil } +// Helper for generating errors or warnings about unmanaged pods. 
+func unmanagedMsg(unreplicatedNames []string, daemonSetNames []string, includeGuidance bool) string {
+	msgs := []string{}
+	if len(unreplicatedNames) > 0 {
+		msg := fmt.Sprintf("pods not managed by ReplicationController, Job, or DaemonSet: %s", strings.Join(unreplicatedNames, ","))
+		if includeGuidance {
+			msg += " (use --force to override)"
+		}
+		msgs = append(msgs, msg)
+	}
+	if len(daemonSetNames) > 0 {
+		msg := fmt.Sprintf("DaemonSet-managed pods: %s", strings.Join(daemonSetNames, ","))
+		if includeGuidance {
+			msg += " (use --ignore-daemonsets to ignore)"
+		}
+		msgs = append(msgs, msg)
+	}
+
+	return strings.Join(msgs, " and ")
+}
+
 // deletePods deletes the pods on the api server
 func (o *DrainOptions) deletePods(pods []api.Pod) error {
 	deleteOptions := api.DeleteOptions{}
diff --git a/pkg/kubectl/cmd/drain_test.go b/pkg/kubectl/cmd/drain_test.go
index a9691a7fa0d..ddfd62ca727 100644
--- a/pkg/kubectl/cmd/drain_test.go
+++ b/pkg/kubectl/cmd/drain_test.go
@@ -323,8 +323,18 @@ func TestDrain(t *testing.T) {
 			pods:         []api.Pod{ds_pod},
 			rcs:          []api.ReplicationController{rc},
 			args:         []string{"node"},
+			expectFatal:  true,
+			expectDelete: false,
+		},
+		{
+			description:  "DS-managed pod with --ignore-daemonsets",
+			node:         node,
+			expected:     cordoned_node,
+			pods:         []api.Pod{ds_pod},
+			rcs:          []api.ReplicationController{rc},
+			args:         []string{"node", "--ignore-daemonsets"},
 			expectFatal:  false,
-			expectDelete: true,
+			expectDelete: false,
 		},
 		{
 			description:  "Job-managed pod",