diff --git a/README.md b/README.md index e0608a64..59ad065d 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ nodes: ## Network Plugins -RKE supports the following network plugins: +RKE supports the following network plugins that are deployed as addons: - Flannel - Calico @@ -120,7 +120,12 @@ The following images are no longer required, and can be replaced by `rancher/rke ## Addons -RKE supports pluggable addons on cluster bootstrap, user can specify the addon yaml in the cluster.yml file, and when running +RKE supports pluggable addons. Addons are used to deploy several cluster components including: +- Network plugin +- KubeDNS +- Ingress controller + +In addition, a user can specify the addon yaml in the cluster.yml file, and when running ```yaml rke up --config cluster.yml @@ -128,7 +133,7 @@ rke up --config cluster.yml RKE will deploy the addons yaml after the cluster starts, RKE first uploads this yaml file as a configmap in kubernetes cluster and then run a kubernetes job that mounts this config map and deploy the addons. -> Note that RKE doesn't support yet removal of the addons, so once they are deployed the first time you can't change them using rke +> Note that RKE doesn't yet support removal or update of the addons, so once they are deployed the first time you can't change them using rke To start using addons use `addons:` option in the `cluster.yml` file for example: @@ -159,6 +164,17 @@ addons_include: - ./nginx.yaml ``` +#### Addon deployment jobs + +RKE uses kubernetes Jobs to deploy addons. In some cases, addons deployment takes longer than expected. Starting with version `0.1.7-rc1`, RKE provides an option to control the job check timeout in seconds: +```yaml +addon_job_timeout: 30 +``` + +#### Critical and uncritical addons +As of version `0.1.7-rc1`, addons are split into two categories: critical and uncritical. + +Critical addons will cause RKE to error out if they fail to deploy for any reason. 
Uncritical addons will just log a warning and continue with the deployment. Currently only the network plugin is considered critical. ## High Availability RKE is HA ready, you can specify more than one controlplane host in the `cluster.yml` file, and rke will deploy master components on all of them, the kubelets are configured to connect to `127.0.0.1:6443` by default which is the address of `nginx-proxy` service that proxy requests to all master nodes. diff --git a/cluster.yml b/cluster.yml index 6bd441ae..3681dab9 100644 --- a/cluster.yml +++ b/cluster.yml @@ -156,6 +156,8 @@ ignore_docker_version: false kubernetes_version: v1.10.1 +# addons are deployed using kubernetes jobs. RKE will give up on trying to get the job status after this timeout in seconds. +addon_job_timeout: 30 # If set, this is the cluster name that will be used in the kube config file # Default value is "local" cluster_name: mycluster diff --git a/cluster/addons.go b/cluster/addons.go index 66d84d1f..6d5a8a65 100644 --- a/cluster/addons.go +++ b/cluster/addons.go @@ -36,17 +36,36 @@ type ingressOptions struct { IngressBackend string } +type addonError struct { + err string + isCritical bool +} + +func (e *addonError) Error() string { + return e.err +} + func (c *Cluster) deployK8sAddOns(ctx context.Context) error { if err := c.deployKubeDNS(ctx); err != nil { - return err + if err, ok := err.(*addonError); ok && err.isCritical { + return err + } + log.Warnf(ctx, "Failed to deploy addon execute job [%s]: %v", KubeDNSAddonResourceName, err) } - return c.deployIngress(ctx) + if err := c.deployIngress(ctx); err != nil { + if err, ok := err.(*addonError); ok && err.isCritical { + return err + } + log.Warnf(ctx, "Failed to deploy addon execute job [%s]: %v", IngressAddonResourceName, err) + + } + return nil } func (c *Cluster) deployUserAddOns(ctx context.Context) error { log.Infof(ctx, "[addons] Setting up user addons") if c.Addons != "" { - if err := c.doAddonDeploy(ctx, c.Addons, 
UserAddonResourceName); err != nil { + if err := c.doAddonDeploy(ctx, c.Addons, UserAddonResourceName, false); err != nil { return err } } @@ -108,7 +127,7 @@ func (c *Cluster) deployAddonsInclude(ctx context.Context) error { log.Infof(ctx, "[addons] Deploying %s", UserAddonsIncludeResourceName) logrus.Debugf("[addons] Compiled addons yaml: %s", string(manifests)) - return c.doAddonDeploy(ctx, string(manifests), UserAddonsIncludeResourceName) + return c.doAddonDeploy(ctx, string(manifests), UserAddonsIncludeResourceName, false) } func validateUserAddonYAML(addon []byte) error { @@ -158,7 +177,7 @@ func (c *Cluster) deployKubeDNS(ctx context.Context) error { if err != nil { return err } - if err := c.doAddonDeploy(ctx, kubeDNSYaml, KubeDNSAddonResourceName); err != nil { + if err := c.doAddonDeploy(ctx, kubeDNSYaml, KubeDNSAddonResourceName, false); err != nil { return err } log.Infof(ctx, "[addons] KubeDNS deployed successfully..") @@ -174,33 +193,35 @@ func (c *Cluster) deployWithKubectl(ctx context.Context, addonYaml string) error return cmd.Run() } -func (c *Cluster) doAddonDeploy(ctx context.Context, addonYaml, resourceName string) error { +func (c *Cluster) doAddonDeploy(ctx context.Context, addonYaml, resourceName string, isCritical bool) error { if c.UseKubectlDeploy { - return c.deployWithKubectl(ctx, addonYaml) + if err := c.deployWithKubectl(ctx, addonYaml); err != nil { + return &addonError{fmt.Sprintf("%v", err), isCritical} + } } err := c.StoreAddonConfigMap(ctx, addonYaml, resourceName) if err != nil { - return fmt.Errorf("Failed to save addon ConfigMap: %v", err) + return &addonError{fmt.Sprintf("Failed to save addon ConfigMap: %v", err), isCritical} } log.Infof(ctx, "[addons] Executing deploy job..") k8sClient, err := k8s.NewClient(c.LocalKubeConfigPath, c.K8sWrapTransport) if err != nil { - return err + return &addonError{fmt.Sprintf("%v", err), isCritical} } node, err := k8s.GetNode(k8sClient, c.ControlPlaneHosts[0].HostnameOverride) if err != nil 
{ - return fmt.Errorf("Failed to get Node [%s]: %v", c.ControlPlaneHosts[0].HostnameOverride, err) + return &addonError{fmt.Sprintf("Failed to get Node [%s]: %v", c.ControlPlaneHosts[0].HostnameOverride, err), isCritical} } addonJob, err := addons.GetAddonsExcuteJob(resourceName, node.Name, c.Services.KubeAPI.Image) if err != nil { - return fmt.Errorf("Failed to deploy addon execute job: %v", err) + return &addonError{fmt.Sprintf("Failed to generate addon execute job: %v", err), isCritical} } err = c.ApplySystemAddonExcuteJob(addonJob) if err != nil { - return fmt.Errorf("Failed to deploy addon execute job: %v", err) + return &addonError{fmt.Sprintf("%v", err), isCritical} } return nil } @@ -234,7 +255,7 @@ func (c *Cluster) StoreAddonConfigMap(ctx context.Context, addonYaml string, add } func (c *Cluster) ApplySystemAddonExcuteJob(addonJob string) error { - if err := k8s.ApplyK8sSystemJob(addonJob, c.LocalKubeConfigPath, c.K8sWrapTransport); err != nil { + if err := k8s.ApplyK8sSystemJob(addonJob, c.LocalKubeConfigPath, c.K8sWrapTransport, c.AddonJobTimeout); err != nil { logrus.Error(err) return err } @@ -261,7 +282,7 @@ func (c *Cluster) deployIngress(ctx context.Context) error { if err != nil { return err } - if err := c.doAddonDeploy(ctx, ingressYaml, IngressAddonResourceName); err != nil { + if err := c.doAddonDeploy(ctx, ingressYaml, IngressAddonResourceName, false); err != nil { return err } log.Infof(ctx, "[ingress] ingress controller %s is successfully deployed", c.Ingress.Provider) diff --git a/cluster/cluster.go b/cluster/cluster.go index 9a8eaeee..52b5ad16 100644 --- a/cluster/cluster.go +++ b/cluster/cluster.go @@ -289,7 +289,14 @@ func (c *Cluster) deployAddons(ctx context.Context) error { if err := c.deployK8sAddOns(ctx); err != nil { return err } - return c.deployUserAddOns(ctx) + if err := c.deployUserAddOns(ctx); err != nil { + if err, ok := err.(*addonError); ok && err.isCritical { + return err + } + log.Warnf(ctx, "Failed to deploy addon 
execute job [%s]: %v", UserAddonsIncludeResourceName, err) + + } + return nil } func (c *Cluster) SyncLabelsAndTaints(ctx context.Context) error { @@ -349,9 +356,14 @@ func ConfigureCluster( if len(kubeCluster.ControlPlaneHosts) > 0 { kubeCluster.Certificates = crtBundle if err := kubeCluster.deployNetworkPlugin(ctx); err != nil { + if err, ok := err.(*addonError); ok && err.isCritical { + return err + } + log.Warnf(ctx, "Failed to deploy addon execute job [%s]: %v", NetworkPluginResourceName, err) + } + if err := kubeCluster.deployAddons(ctx); err != nil { return err } - return kubeCluster.deployAddons(ctx) } return nil } diff --git a/cluster/defaults.go b/cluster/defaults.go index fb9da5e1..3e61111c 100644 --- a/cluster/defaults.go +++ b/cluster/defaults.go @@ -3,6 +3,7 @@ package cluster import ( "context" + "github.com/rancher/rke/k8s" "github.com/rancher/rke/log" "github.com/rancher/rke/services" "github.com/rancher/types/apis/management.cattle.io/v3" @@ -87,6 +88,9 @@ func (c *Cluster) setClusterDefaults(ctx context.Context) { if len(c.Version) == 0 { c.Version = DefaultK8sVersion } + if c.AddonJobTimeout == 0 { + c.AddonJobTimeout = k8s.DefaultTimeout + } c.setClusterImageDefaults() c.setClusterServicesDefaults() c.setClusterNetworkDefaults() diff --git a/cluster/network.go b/cluster/network.go index e3ceeef8..abee1b47 100644 --- a/cluster/network.go +++ b/cluster/network.go @@ -41,26 +41,15 @@ const ( ProtocolUDP = "UDP" FlannelNetworkPlugin = "flannel" - FlannelImage = "flannel_image" - FlannelCNIImage = "flannel_cni_image" FlannelIface = "flannel_iface" - CalicoNetworkPlugin = "calico" - CalicoNodeImage = "calico_node_image" - CalicoCNIImage = "calico_cni_image" - CalicoControllersImage = "calico_controllers_image" - CalicoctlImage = "calicoctl_image" - CalicoCloudProvider = "calico_cloud_provider" + CalicoNetworkPlugin = "calico" + CalicoCloudProvider = "calico_cloud_provider" CanalNetworkPlugin = "canal" - CanalNodeImage = "canal_node_image" - 
CanalCNIImage = "canal_cni_image" - CanalFlannelImage = "canal_flannel_image" CanalIface = "canal_iface" WeaveNetworkPlugin = "weave" - WeaveImage = "weave_node_image" - WeaveCNIImage = "weave_cni_image" // List of map keys to be used with network templates @@ -140,7 +129,7 @@ func (c *Cluster) doFlannelDeploy(ctx context.Context) error { if err != nil { return err } - return c.doAddonDeploy(ctx, pluginYaml, NetworkPluginResourceName) + return c.doAddonDeploy(ctx, pluginYaml, NetworkPluginResourceName, true) } func (c *Cluster) doCalicoDeploy(ctx context.Context) error { @@ -158,7 +147,7 @@ func (c *Cluster) doCalicoDeploy(ctx context.Context) error { if err != nil { return err } - return c.doAddonDeploy(ctx, pluginYaml, NetworkPluginResourceName) + return c.doAddonDeploy(ctx, pluginYaml, NetworkPluginResourceName, true) } func (c *Cluster) doCanalDeploy(ctx context.Context) error { @@ -180,7 +169,7 @@ func (c *Cluster) doCanalDeploy(ctx context.Context) error { if err != nil { return err } - return c.doAddonDeploy(ctx, pluginYaml, NetworkPluginResourceName) + return c.doAddonDeploy(ctx, pluginYaml, NetworkPluginResourceName, true) } func (c *Cluster) doWeaveDeploy(ctx context.Context) error { @@ -194,7 +183,7 @@ func (c *Cluster) doWeaveDeploy(ctx context.Context) error { if err != nil { return err } - return c.doAddonDeploy(ctx, pluginYaml, NetworkPluginResourceName) + return c.doAddonDeploy(ctx, pluginYaml, NetworkPluginResourceName, true) } func (c *Cluster) getNetworkPluginManifest(pluginConfig map[string]string) (string, error) { diff --git a/k8s/job.go b/k8s/job.go index 093fc0f6..d3df7ad7 100644 --- a/k8s/job.go +++ b/k8s/job.go @@ -12,7 +12,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -func ApplyK8sSystemJob(jobYaml, kubeConfigPath string, k8sWrapTransport WrapTransport) error { +func ApplyK8sSystemJob(jobYaml, kubeConfigPath string, k8sWrapTransport WrapTransport, timeout int) error { job := v1.Job{} if err := decodeYamlResource(&job, 
jobYaml); err != nil { return err @@ -32,7 +32,7 @@ func ApplyK8sSystemJob(jobYaml, kubeConfigPath string, k8sWrapTransport WrapTran return err } logrus.Debugf("[k8s] waiting for job %s to complete..", job.Name) - return retryTo(ensureJobCompleted, k8sClient, job, DefaultRetries, DefaultSleepSeconds) + return retryToWithTimeout(ensureJobCompleted, k8sClient, job, timeout) } func ensureJobCompleted(k8sClient *kubernetes.Clientset, j interface{}) error { diff --git a/k8s/k8s.go b/k8s/k8s.go index 29e4efcd..139c3291 100644 --- a/k8s/k8s.go +++ b/k8s/k8s.go @@ -13,6 +13,7 @@ import ( const ( DefaultRetries = 5 DefaultSleepSeconds = 5 + DefaultTimeout = 30 K8sWrapTransportTimeout = 30 ) @@ -42,6 +43,20 @@ func decodeYamlResource(resource interface{}, yamlManifest string) error { return decoder.Decode(&resource) } +func retryToWithTimeout(runFunc k8sCall, k8sClient *kubernetes.Clientset, resource interface{}, timeout int) error { + var err error + timePassed := 0 + for timePassed < timeout { + if err = runFunc(k8sClient, resource); err != nil { + time.Sleep(time.Second * time.Duration(DefaultSleepSeconds)) + timePassed += DefaultSleepSeconds + continue + } + return nil + } + return err +} + func retryTo(runFunc k8sCall, k8sClient *kubernetes.Clientset, resource interface{}, retries, sleepSeconds int) error { var err error if retries == 0 {