diff --git a/cmd/kubeadm/app/util/config/cluster.go b/cmd/kubeadm/app/util/config/cluster.go
index 67b30ada28e..18624f69056 100644
--- a/cmd/kubeadm/app/util/config/cluster.go
+++ b/cmd/kubeadm/app/util/config/cluster.go
@@ -41,6 +41,25 @@ import (
 	"k8s.io/kubernetes/cmd/kubeadm/app/util/apiclient"
 )
 
+// unretriableError is an error used temporarily while we are migrating from the
+// ClusterStatus struct to Pod-annotation-based information. When upgrading all
+// control plane nodes with `kubeadm upgrade apply` and `kubeadm upgrade node`,
+// we don't want to retry as if we were hitting connectivity issues when the
+// annotation is missing on the API server Pods. This error is used in such a
+// scenario to fail fast, falling back to the ClusterStatus retrieval in those
+// cases.
+type unretriableError struct {
+	err error
+}
+
+func newUnretriableError(err error) *unretriableError {
+	return &unretriableError{err: err}
+}
+
+func (ue *unretriableError) Error() string {
+	return fmt.Sprintf("unretriable error: %s", ue.err.Error())
+}
+
 // FetchInitConfigurationFromCluster fetches configuration from a ConfigMap in the cluster
 func FetchInitConfigurationFromCluster(client clientset.Interface, w io.Writer, logPrefix string, newControlPlane bool) (*kubeadmapi.InitConfiguration, error) {
 	fmt.Fprintf(w, "[%s] Reading configuration from the cluster...\n", logPrefix)
@@ -216,6 +235,13 @@ func getAPIEndpointFromPodAnnotation(client clientset.Interface, nodeName string
 	// static pods were not yet mirrored into the API server we want to wait for this propagation.
 	err := wait.ExponentialBackoff(backoff, func() (bool, error) {
 		rawAPIEndpoint, lastErr = getRawAPIEndpointFromPodAnnotationWithoutRetry(client, nodeName)
+		// TODO (ereslibre): this logic will need tweaking once we get rid of the ClusterStatus; since we won't have
+		// the ClusterStatus safety net anymore, we will want to remove the unretriableError type and stop making the
+		// distinction here.
+		if _, ok := lastErr.(*unretriableError); ok {
+			// Fail fast scenario, to be removed once we get rid of the ClusterStatus
+			return true, errors.Wrapf(lastErr, "API server Pods exist, but no API endpoint annotations were found")
+		}
 		return lastErr == nil, nil
 	})
 	if err != nil {
@@ -246,7 +272,7 @@ func getRawAPIEndpointFromPodAnnotationWithoutRetry(client clientset.Interface,
 	if apiServerEndpoint, ok := podList.Items[0].Annotations[constants.KubeAPIServerAdvertiseAddressEndpointAnnotationKey]; ok {
 		return apiServerEndpoint, nil
 	}
-	return "", errors.Errorf("API server pod for node name %q hasn't got a %q annotation, cannot retrieve API endpoint", nodeName, constants.KubeAPIServerAdvertiseAddressEndpointAnnotationKey)
+	return "", newUnretriableError(errors.Errorf("API server pod for node name %q hasn't got a %q annotation, cannot retrieve API endpoint", nodeName, constants.KubeAPIServerAdvertiseAddressEndpointAnnotationKey))
 }
 
 // TODO: remove after 1.20, when the ClusterStatus struct is removed from the kubeadm-config ConfigMap.
diff --git a/cmd/kubeadm/app/util/etcd/etcd.go b/cmd/kubeadm/app/util/etcd/etcd.go
index 769ef3e3d06..0c99cd8c9d6 100644
--- a/cmd/kubeadm/app/util/etcd/etcd.go
+++ b/cmd/kubeadm/app/util/etcd/etcd.go
@@ -147,12 +147,20 @@ func getRawEtcdEndpointsFromPodAnnotation(client clientset.Interface, backoff wa
 	// Let's tolerate some unexpected transient failures from the API server or load balancers. Also, if
 	// static pods were not yet mirrored into the API server we want to wait for this propagation.
 	err := wait.ExponentialBackoff(backoff, func() (bool, error) {
-		if etcdEndpoints, lastErr = getRawEtcdEndpointsFromPodAnnotationWithoutRetry(client); lastErr != nil {
+		var overallEtcdPodCount int
+		if etcdEndpoints, overallEtcdPodCount, lastErr = getRawEtcdEndpointsFromPodAnnotationWithoutRetry(client); lastErr != nil {
 			return false, nil
 		}
-		// If the list of etcd endpoints is empty we want to retry: this can happen if joining a secondary
-		// control plane while the primary control plane didn't mirror its static pods yet.
-		return len(etcdEndpoints) > 0, nil
+		// TODO (ereslibre): this logic will need tweaking once we get rid of the ClusterStatus; without the
+		// ClusterStatus safety net we will have to retry in both cases.
+		if len(etcdEndpoints) == 0 {
+			if overallEtcdPodCount == 0 {
+				return false, nil
+			}
+			// Fail fast scenario, to be removed once we get rid of the ClusterStatus
+			return true, errors.New("etcd Pods exist, but no etcd endpoint annotations were found")
+		}
+		return true, nil
 	})
 	if err != nil {
 		if lastErr != nil {
@@ -163,7 +171,10 @@ func getRawEtcdEndpointsFromPodAnnotation(client clientset.Interface, backoff wa
 	return etcdEndpoints, nil
 }
 
-func getRawEtcdEndpointsFromPodAnnotationWithoutRetry(client clientset.Interface) ([]string, error) {
+// getRawEtcdEndpointsFromPodAnnotationWithoutRetry returns the list of etcd endpoints as reported by etcd Pod annotations,
+// along with the overall number of etcd Pods. This allows callers to tell the difference between "no endpoints found",
+// and "no endpoints found but Pods were listed", so they can skip retrying.
+func getRawEtcdEndpointsFromPodAnnotationWithoutRetry(client clientset.Interface) ([]string, int, error) {
 	klog.V(3).Infof("retrieving etcd endpoints from %q annotation in etcd Pods", constants.EtcdAdvertiseClientUrlsAnnotationKey)
 	podList, err := client.CoreV1().Pods(metav1.NamespaceSystem).List(
 		context.TODO(),
@@ -172,17 +183,18 @@ func getRawEtcdEndpointsFromPodAnnotationWithoutRetry(client clientset.Interface
 		},
 	)
 	if err != nil {
-		return []string{}, err
+		return []string{}, 0, err
 	}
 	etcdEndpoints := []string{}
 	for _, pod := range podList.Items {
 		etcdEndpoint, ok := pod.ObjectMeta.Annotations[constants.EtcdAdvertiseClientUrlsAnnotationKey]
 		if !ok {
-			return []string{}, errors.Errorf("etcd Pod %q is missing the %q annotation; cannot infer etcd advertise client URL", pod.ObjectMeta.Name, constants.EtcdAdvertiseClientUrlsAnnotationKey)
+			klog.V(3).Infof("etcd Pod %q is missing the %q annotation; cannot infer etcd advertise client URL using the Pod annotation", pod.ObjectMeta.Name, constants.EtcdAdvertiseClientUrlsAnnotationKey)
+			continue
 		}
 		etcdEndpoints = append(etcdEndpoints, etcdEndpoint)
 	}
-	return etcdEndpoints, nil
+	return etcdEndpoints, len(podList.Items), nil
 }
 
 // TODO: remove after 1.20, when the ClusterStatus struct is removed from the kubeadm-config ConfigMap.
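For reference, here is a minimal, self-contained sketch of the fail-fast pattern both files rely on: a typed "unretriable" error returned from a single lookup aborts the `wait.ExponentialBackoff` loop immediately instead of waiting out the remaining retries. `fetchEndpointOnce` and its hard-coded error are made up for illustration; only the error type and the backoff wiring mirror the diff.

```go
package main

import (
	"errors"
	"fmt"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

// unretriableError marks failures that retrying cannot fix, e.g. an API server Pod
// that is visible but carries no advertise-address annotation, as opposed to
// transient API server or load balancer errors.
type unretriableError struct{ err error }

func (ue *unretriableError) Error() string {
	return fmt.Sprintf("unretriable error: %s", ue.err.Error())
}

// fetchEndpointOnce is an illustrative stand-in for a single, non-retried lookup.
func fetchEndpointOnce() (string, error) {
	// Pretend the Pod exists but was never annotated.
	return "", &unretriableError{err: errors.New("annotation missing")}
}

func main() {
	backoff := wait.Backoff{Duration: 100 * time.Millisecond, Factor: 2, Steps: 5}

	var endpoint string
	var lastErr error
	err := wait.ExponentialBackoff(backoff, func() (bool, error) {
		endpoint, lastErr = fetchEndpointOnce()
		// Returning a non-nil error from the condition stops ExponentialBackoff right
		// away: a typed unretriableError means waiting longer cannot help, so the
		// caller can fall back to the legacy ClusterStatus path immediately.
		if _, ok := lastErr.(*unretriableError); ok {
			return true, fmt.Errorf("API server Pods exist, but no API endpoint annotations were found: %w", lastErr)
		}
		return lastErr == nil, nil
	})
	fmt.Println(endpoint, err)
}
```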
diff --git a/cmd/kubeadm/app/util/etcd/etcd_test.go b/cmd/kubeadm/app/util/etcd/etcd_test.go
index 7908ac74a06..0e1f892f9d3 100644
--- a/cmd/kubeadm/app/util/etcd/etcd_test.go
+++ b/cmd/kubeadm/app/util/etcd/etcd_test.go
@@ -312,7 +312,7 @@ func TestGetRawEtcdEndpointsFromPodAnnotationWithoutRetry(t *testing.T) {
 			if rt.clientSetup != nil {
 				rt.clientSetup(client)
 			}
-			endpoints, err := getRawEtcdEndpointsFromPodAnnotationWithoutRetry(client)
+			endpoints, _, err := getRawEtcdEndpointsFromPodAnnotationWithoutRetry(client)
 			if err != nil && !rt.expectedErr {
 				t.Errorf("got error %v, but wasn't expecting any error", err)
 				return
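A possible additional test case for the new Pod-count return value, sketched under two assumptions: it lives in the same package as etcd_test.go, and the production List call selects etcd static Pods by their usual `component=etcd,tier=control-plane` labels. The test name and Pod names are illustrative only.

```go
package etcd

import (
	"testing"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes/fake"

	"k8s.io/kubernetes/cmd/kubeadm/app/constants"
)

// TestEtcdPodCountReportedWithoutAnnotations checks that a Pod missing the
// advertise-client-urls annotation is skipped from the endpoint list but still
// counted, which is what lets callers fail fast instead of retrying forever.
func TestEtcdPodCountReportedWithoutAnnotations(t *testing.T) {
	client := fake.NewSimpleClientset(
		&v1.Pod{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "etcd-cp-0",
				Namespace: metav1.NamespaceSystem,
				Labels:    map[string]string{"component": "etcd", "tier": "control-plane"},
				Annotations: map[string]string{
					constants.EtcdAdvertiseClientUrlsAnnotationKey: "https://10.0.0.1:2379",
				},
			},
		},
		&v1.Pod{
			ObjectMeta: metav1.ObjectMeta{
				Name:      "etcd-cp-1",
				Namespace: metav1.NamespaceSystem,
				Labels:    map[string]string{"component": "etcd", "tier": "control-plane"},
			},
		},
	)

	endpoints, podCount, err := getRawEtcdEndpointsFromPodAnnotationWithoutRetry(client)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(endpoints) != 1 || endpoints[0] != "https://10.0.0.1:2379" {
		t.Errorf("unexpected endpoints: %v", endpoints)
	}
	if podCount != 2 {
		t.Errorf("expected a Pod count of 2, got %d", podCount)
	}
}
```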