From e7b9c50ba1657f3b650447379ea7cc52123c862e Mon Sep 17 00:00:00 2001 From: Yifan Gu Date: Mon, 31 Aug 2015 19:25:26 -0700 Subject: [PATCH] kubelet/rkt: add support for different stage1 image. Also enable grace stop for service files. --- cmd/kubelet/app/server.go | 6 + contrib/mesos/pkg/executor/service/service.go | 1 + hack/verify-flags/known-flags.txt | 549 +++++++++--------- pkg/kubelet/kubelet.go | 2 + pkg/kubelet/rkt/config.go | 2 + pkg/kubelet/rkt/rkt.go | 33 +- 6 files changed, 309 insertions(+), 284 deletions(-) diff --git a/cmd/kubelet/app/server.go b/cmd/kubelet/app/server.go index 225d756f26a..1bdeeb99584 100644 --- a/cmd/kubelet/app/server.go +++ b/cmd/kubelet/app/server.go @@ -119,6 +119,7 @@ type KubeletServer struct { ResolverConfig string ResourceContainer string RktPath string + RktStage1Image string RootDirectory string RunOnce bool StandaloneMode bool @@ -189,6 +190,7 @@ func NewKubeletServer() *KubeletServer { RegistryBurst: 10, ResourceContainer: "/kubelet", RktPath: "", + RktStage1Image: "", RootDirectory: defaultRootDir, SyncFrequency: 10 * time.Second, SystemContainer: "", @@ -254,6 +256,7 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) { fs.StringVar(&s.CgroupRoot, "cgroup-root", s.CgroupRoot, "Optional root cgroup to use for pods. This is handled by the container runtime on a best effort basis. Default: '', which means use the container runtime default.") fs.StringVar(&s.ContainerRuntime, "container-runtime", s.ContainerRuntime, "The container runtime to use. Possible values: 'docker', 'rkt'. Default: 'docker'.") fs.StringVar(&s.RktPath, "rkt-path", s.RktPath, "Path of rkt binary. Leave empty to use the first rkt in $PATH. Only used if --container-runtime='rkt'") + fs.StringVar(&s.RktStage1Image, "rkt-stage1-image", s.RktStage1Image, "image to use as stage1. Local paths and http/https URLs are supported. 
If empty, the 'stage1.aci' in the same directory as '--rkt-path' will be used") fs.StringVar(&s.SystemContainer, "system-container", s.SystemContainer, "Optional resource-only container in which to place all non-kernel processes that are not already in a container. Empty for no container. Rolling back the flag requires a reboot. (Default: \"\").") fs.BoolVar(&s.ConfigureCBR0, "configure-cbr0", s.ConfigureCBR0, "If true, kubelet will configure cbr0 based on Node.Spec.PodCIDR.") fs.IntVar(&s.MaxPods, "max-pods", 40, "Number of Pods that can run on this Kubelet.") @@ -364,6 +367,7 @@ func (s *KubeletServer) KubeletConfig() (*KubeletConfig, error) { ResolverConfig: s.ResolverConfig, ResourceContainer: s.ResourceContainer, RktPath: s.RktPath, + RktStage1Image: s.RktStage1Image, RootDirectory: s.RootDirectory, Runonce: s.RunOnce, StandaloneMode: (len(s.APIServerList) == 0), @@ -789,6 +793,7 @@ type KubeletConfig struct { ResolverConfig string ResourceContainer string RktPath string + RktStage1Image string RootDirectory string Runonce bool StandaloneMode bool @@ -851,6 +856,7 @@ func createAndInitKubelet(kc *KubeletConfig) (k KubeletBootstrap, pc *config.Pod kc.CgroupRoot, kc.ContainerRuntime, kc.RktPath, + kc.RktStage1Image, kc.Mounter, kc.DockerDaemonContainer, kc.SystemContainer, diff --git a/contrib/mesos/pkg/executor/service/service.go b/contrib/mesos/pkg/executor/service/service.go index 5508d78ee6e..ba0664efb09 100644 --- a/contrib/mesos/pkg/executor/service/service.go +++ b/contrib/mesos/pkg/executor/service/service.go @@ -325,6 +325,7 @@ func (ks *KubeletExecutorServer) createAndInitKubelet( kc.CgroupRoot, kc.ContainerRuntime, kc.RktPath, + kc.RktStage1Image, kc.Mounter, kc.DockerDaemonContainer, kc.SystemContainer, diff --git a/hack/verify-flags/known-flags.txt b/hack/verify-flags/known-flags.txt index 9fe4e91dc48..c082aa7a767 100644 --- a/hack/verify-flags/known-flags.txt +++ b/hack/verify-flags/known-flags.txt @@ -1,274 +1,275 @@ -accept-hosts -accept-paths 
-account-for-pod-resources -admission-control -admission-control-config-file -advertise-address -advertised-address -algorithm-provider -all-namespaces -allocate-node-cidrs -allow-privileged -api-burst -api-prefix -api-rate -api-servers -api-token -api-version -authorization-mode -authorization-policy-file -auth-path -basic-auth-file -bench-pods -bench-quiet -bench-tasks -bench-workers -bind-address -bind-pods-burst -bind-pods-qps -cadvisor-port -cert-dir -certificate-authority -cgroup-root -chaos-chance -cleanup-iptables -client-ca-file -client-certificate -client-key -cloud-config -cloud-provider -cluster-cidr -cluster-dns -cluster-domain -cluster-name -cluster-tag -concurrent-endpoint-syncs -configure-cbr0 -contain-pod-resources -container-port -container-runtime -cors-allowed-origins -create-external-load-balancer -current-release-pr -current-replicas -default-container-cpu-limit -default-container-mem-limit -delay-shutdown -deleting-pods-burst -deleting-pods-qps -deployment-label-key -dest-file -disable-filter -docker-endpoint -docker-exec-handler -dockercfg-path -driver-port -dry-run -duration-sec -e2e-output-dir -enable-debugging-handlers -enable-horizontal-pod-autoscaler -enable-server -etcd-config -etcd-prefix -etcd-server -etcd-servers -event-burst -event-qps -event-ttl -executor-bindall -executor-logv -executor-path -executor-suicide-timeout -experimental-keystone-url -experimental-prefix -external-hostname -external-ip -failover-timeout -file-check-frequency -file-suffix -forward-services -framework-name -framework-weburi -func-dest -fuzz-iters -gce-project -gce-zone -gke-cluster -google-json-key -grace-period -ha-domain -healthz-bind-address -healthz-port -horizontal-pod-autoscaler-sync-period -hostname-override -host-network-sources -http-check-frequency -http-port -ignore-not-found -image-gc-high-threshold -image-gc-low-threshold -insecure-bind-address -insecure-port -insecure-skip-tls-verify -iptables-sync-period -ir-data-source -ir-dbname 
-ir-influxdb-host -ir-password -ir-user -jenkins-host -jenkins-jobs -km-path -kubectl-path -kubelet-cadvisor-port -kubelet-certificate-authority -kubelet-client-certificate -kubelet-client-key -kubelet-docker-endpoint -kubelet-host-network-sources -kubelet-https -kubelet-network-plugin -kubelet-pod-infra-container-image -kubelet-port -kubelet-root-dir -kubelet-sync-frequency -kubelet-timeout -kube-master -label-columns -last-release-pr -legacy-userspace-proxy -log-flush-frequency -long-running-request-regexp -low-diskspace-threshold-mb -manifest-url -manifest-url-header -masquerade-all -master-service-namespace -max-concurrency -max-connection-bytes-per-sec -maximum-dead-containers -maximum-dead-containers-per-container -max-log-age -max-log-backups -max-log-size -max-outgoing-burst -max-outgoing-qps -max-pods -max-requests-inflight -mesos-authentication-principal -mesos-authentication-provider -mesos-authentication-secret-file -mesos-cgroup-prefix -mesos-executor-cpus -mesos-executor-mem -mesos-master -mesos-role -mesos-user -minimum-container-ttl-duration -minion-max-log-age -minion-max-log-backups -minion-max-log-size -minion-path-override -min-pr-number -min-request-timeout -namespace-sync-period -network-plugin -network-plugin-dir -node-instance-group -node-monitor-grace-period -node-monitor-period -node-startup-grace-period -node-status-update-frequency -node-sync-period -no-headers -num-nodes -oidc-ca-file -oidc-client-id -oidc-issuer-url -oidc-username-claim -oom-score-adj -output-version -out-version -path-override -pod-cidr -pod-eviction-timeout -pod-infra-container-image -pod-running -policy-config-file -poll-interval -portal-net -private-mountns -prom-push-gateway -proxy-bindall -proxy-logv -proxy-port-range -public-address-override -pvclaimbinder-sync-period -read-only-port -really-crash-for-testing -reconcile-cooldown -reconcile-interval -register-node -register-retry-count -registry-burst -registry-qps -reject-methods -reject-paths -repo-root 
-report-dir -required-contexts -resolv-conf -resource-container -resource-quota-sync-period -resource-version -rkt-path -root-ca-file -root-dir -run-proxy -runtime-config -scheduler-config -secure-port -service-account-key-file -service-account-lookup -service-account-private-key-file -service-address -service-cluster-ip-range -service-node-port-range -service-node-ports -service-sync-period -session-affinity -show-all -shutdown-fd -shutdown-fifo -skip-munges -sort-by -source-file -ssh-keyfile -ssh-user -static-pods-config -stats-port -storage-version -streaming-connection-idle-timeout -suicide-timeout -sync-frequency -system-container -target-port -tcp-services -tls-cert-file -tls-private-key-file -token-auth-file -ttl-secs -type-src -unix-socket -update-period -upgrade-target -use-kubernetes-cluster-service -user-whitelist -watch-cache -watch-only -whitelist-override-label -www-prefix -retry_time -file_content_in_loop -cpu-cfs-quota +accept-hosts +accept-paths +account-for-pod-resources +admission-control +admission-control-config-file +advertise-address +advertised-address +algorithm-provider +all-namespaces +allocate-node-cidrs +allow-privileged +api-burst +api-prefix +api-rate +api-servers +api-token +api-version +authorization-mode +authorization-policy-file +auth-path +basic-auth-file +bench-pods +bench-quiet +bench-tasks +bench-workers +bind-address +bind-pods-burst +bind-pods-qps +cadvisor-port +cert-dir +certificate-authority +cgroup-root +chaos-chance +cleanup-iptables +client-ca-file +client-certificate +client-key +cloud-config +cloud-provider +cluster-cidr +cluster-dns +cluster-domain +cluster-name +cluster-tag +concurrent-endpoint-syncs +configure-cbr0 +contain-pod-resources +container-port +container-runtime +cors-allowed-origins +create-external-load-balancer +current-release-pr +current-replicas +default-container-cpu-limit +default-container-mem-limit +delay-shutdown +deleting-pods-burst +deleting-pods-qps +deployment-label-key +dest-file 
+disable-filter +docker-endpoint +docker-exec-handler +dockercfg-path +driver-port +dry-run +duration-sec +e2e-output-dir +enable-debugging-handlers +enable-horizontal-pod-autoscaler +enable-server +etcd-config +etcd-prefix +etcd-server +etcd-servers +event-burst +event-qps +event-ttl +executor-bindall +executor-logv +executor-path +executor-suicide-timeout +experimental-keystone-url +experimental-prefix +external-hostname +external-ip +failover-timeout +file-check-frequency +file-suffix +forward-services +framework-name +framework-weburi +func-dest +fuzz-iters +gce-project +gce-zone +gke-cluster +google-json-key +grace-period +ha-domain +healthz-bind-address +healthz-port +horizontal-pod-autoscaler-sync-period +hostname-override +host-network-sources +http-check-frequency +http-port +ignore-not-found +image-gc-high-threshold +image-gc-low-threshold +insecure-bind-address +insecure-port +insecure-skip-tls-verify +iptables-sync-period +ir-data-source +ir-dbname +ir-influxdb-host +ir-password +ir-user +jenkins-host +jenkins-jobs +km-path +kubectl-path +kubelet-cadvisor-port +kubelet-certificate-authority +kubelet-client-certificate +kubelet-client-key +kubelet-docker-endpoint +kubelet-host-network-sources +kubelet-https +kubelet-network-plugin +kubelet-pod-infra-container-image +kubelet-port +kubelet-root-dir +kubelet-sync-frequency +kubelet-timeout +kube-master +label-columns +last-release-pr +legacy-userspace-proxy +log-flush-frequency +long-running-request-regexp +low-diskspace-threshold-mb +manifest-url +manifest-url-header +masquerade-all +master-service-namespace +max-concurrency +max-connection-bytes-per-sec +maximum-dead-containers +maximum-dead-containers-per-container +max-log-age +max-log-backups +max-log-size +max-outgoing-burst +max-outgoing-qps +max-pods +max-requests-inflight +mesos-authentication-principal +mesos-authentication-provider +mesos-authentication-secret-file +mesos-cgroup-prefix +mesos-executor-cpus +mesos-executor-mem +mesos-master 
+mesos-role +mesos-user +minimum-container-ttl-duration +minion-max-log-age +minion-max-log-backups +minion-max-log-size +minion-path-override +min-pr-number +min-request-timeout +namespace-sync-period +network-plugin +network-plugin-dir +node-instance-group +node-monitor-grace-period +node-monitor-period +node-startup-grace-period +node-status-update-frequency +node-sync-period +no-headers +num-nodes +oidc-ca-file +oidc-client-id +oidc-issuer-url +oidc-username-claim +oom-score-adj +output-version +out-version +path-override +pod-cidr +pod-eviction-timeout +pod-infra-container-image +pod-running +policy-config-file +poll-interval +portal-net +private-mountns +prom-push-gateway +proxy-bindall +proxy-logv +proxy-port-range +public-address-override +pvclaimbinder-sync-period +read-only-port +really-crash-for-testing +reconcile-cooldown +reconcile-interval +register-node +register-retry-count +registry-burst +registry-qps +reject-methods +reject-paths +repo-root +report-dir +required-contexts +resolv-conf +resource-container +resource-quota-sync-period +resource-version +rkt-path +rkt-stage1-image +root-ca-file +root-dir +run-proxy +runtime-config +scheduler-config +secure-port +service-account-key-file +service-account-lookup +service-account-private-key-file +service-address +service-cluster-ip-range +service-node-port-range +service-node-ports +service-sync-period +session-affinity +show-all +shutdown-fd +shutdown-fifo +skip-munges +sort-by +source-file +ssh-keyfile +ssh-user +static-pods-config +stats-port +storage-version +streaming-connection-idle-timeout +suicide-timeout +sync-frequency +system-container +target-port +tcp-services +tls-cert-file +tls-private-key-file +token-auth-file +ttl-secs +type-src +unix-socket +update-period +upgrade-target +use-kubernetes-cluster-service +user-whitelist +watch-cache +watch-only +whitelist-override-label +www-prefix +retry_time +file_content_in_loop +cpu-cfs-quota diff --git a/pkg/kubelet/kubelet.go 
b/pkg/kubelet/kubelet.go index f93edf0e80e..cde96b77e42 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -164,6 +164,7 @@ func NewMainKubelet( cgroupRoot string, containerRuntime string, rktPath string, + rktStage1Image string, mounter mount.Interface, dockerDaemonContainer string, systemContainer string, @@ -331,6 +332,7 @@ func NewMainKubelet( case "rkt": conf := &rkt.Config{ Path: rktPath, + Stage1Image: rktStage1Image, InsecureSkipVerify: true, } rktRuntime, err := rkt.New( diff --git a/pkg/kubelet/rkt/config.go b/pkg/kubelet/rkt/config.go index c59aa51ebee..91dd3d558f2 100644 --- a/pkg/kubelet/rkt/config.go +++ b/pkg/kubelet/rkt/config.go @@ -23,6 +23,8 @@ import "fmt" type Config struct { // The absolute path to the binary, or leave empty to find it in $PATH. Path string + // The image to use as stage1. + Stage1Image string // The debug flag for rkt. Debug bool // The rkt data directory. diff --git a/pkg/kubelet/rkt/rkt.go b/pkg/kubelet/rkt/rkt.go index 7c8a9249e55..5491a140630 100644 --- a/pkg/kubelet/rkt/rkt.go +++ b/pkg/kubelet/rkt/rkt.go @@ -27,7 +27,6 @@ import ( "path" "strconv" "strings" - "syscall" "time" appcschema "github.com/appc/spec/schema" @@ -42,6 +41,7 @@ import ( "k8s.io/kubernetes/pkg/credentialprovider" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" "k8s.io/kubernetes/pkg/kubelet/prober" + kubeletUtil "k8s.io/kubernetes/pkg/kubelet/util" "k8s.io/kubernetes/pkg/probe" "k8s.io/kubernetes/pkg/securitycontext" "k8s.io/kubernetes/pkg/types" @@ -467,7 +467,7 @@ func (r *runtime) makePodManifest(pod *api.Pod, pullSecrets []api.Secret) (*appc volumeMap, ok := r.volumeGetter.GetVolumes(pod.UID) if !ok { - return nil, fmt.Errorf("cannot get the volumes for pod %q", kubecontainer.GetPodFullName(pod)) + return nil, fmt.Errorf("cannot get the volumes for pod %q", kubeletUtil.FormatPodName(pod)) } // Set global volumes. @@ -533,7 +533,7 @@ func serviceFilePath(serviceName string) string { // preparePod will: // // 1. 
Invoke 'rkt prepare' to prepare the pod, and get the rkt pod uuid. -// 2. Creates the unit file and save it under systemdUnitDir. +// 2. Create the unit file and save it under systemdUnitDir. // // On success, it will return a string that represents name of the unit file // and the runtime pod. @@ -566,6 +566,9 @@ func (r *runtime) preparePod(pod *api.Pod, pullSecrets []api.Secret) (string, *k // Run 'rkt prepare' to get the rkt UUID. cmds := []string{"prepare", "--quiet", "--pod-manifest", manifestFile.Name()} + if r.config.Stage1Image != "" { + cmds = append(cmds, "--stage1-image", r.config.Stage1Image) + } output, err := r.runCommand(cmds...) if err != nil { return "", nil, err @@ -596,6 +599,8 @@ func (r *runtime) preparePod(pod *api.Pod, pullSecrets []api.Secret) (string, *k // This makes the service show up for 'systemctl list-units' even if it exits successfully. newUnitOption("Service", "RemainAfterExit", "true"), newUnitOption("Service", "ExecStart", runPrepared), + // This enables graceful stop. + newUnitOption("Service", "KillMode", "mixed"), } // Check if there's old rkt pod corresponding to the same pod, if so, update the restart count. @@ -615,7 +620,7 @@ func (r *runtime) preparePod(pod *api.Pod, pullSecrets []api.Secret) (string, *k } units = append(units, newUnitOption(unitKubernetesSection, unitRestartCount, strconv.Itoa(restartCount))) - glog.V(4).Infof("rkt: Creating service file %q for pod %q", serviceName, pod.Name) + glog.V(4).Infof("rkt: Creating service file %q for pod %q", serviceName, kubeletUtil.FormatPodName(pod)) serviceFile, err := os.Create(serviceFilePath(serviceName)) if err != nil { return "", nil, err @@ -674,7 +679,7 @@ func (r *runtime) generateEvents(runtimePod *kubecontainer.Pod, reason string, f // RunPod first creates the unit file for a pod, and then // starts the unit over d-bus. 
func (r *runtime) RunPod(pod *api.Pod, pullSecrets []api.Secret) error { - glog.V(4).Infof("Rkt starts to run pod: name %q.", pod.Name) + glog.V(4).Infof("Rkt starts to run pod: name %q.", kubeletUtil.FormatPodName(pod)) name, runtimePod, prepareErr := r.preparePod(pod, pullSecrets) @@ -684,7 +689,7 @@ func (r *runtime) RunPod(pod *api.Pod, pullSecrets []api.Secret) error { for i, c := range pod.Spec.Containers { ref, err := kubecontainer.GenerateContainerRef(pod, &c) if err != nil { - glog.Errorf("Couldn't make a ref to pod %v, container %v: '%v'", pod.Name, c.Name, err) + glog.Errorf("Couldn't make a ref to pod %q, container %v: '%v'", kubeletUtil.FormatPodName(pod), c.Name, err) continue } if prepareErr != nil { @@ -800,8 +805,11 @@ func (r *runtime) KillPod(pod *api.Pod, runningPod kubecontainer.Pod) error { r.containerRefManager.ClearRef(id) } - // TODO(yifan): More graceful stop. Replace with StopUnit and wait for a timeout. - r.systemd.KillUnit(serviceName, int32(syscall.SIGKILL)) + // Since all service files have 'KillMode=mixed', the processes in + // the unit's cgroup will receive a SIGKILL if the normal stop times out. + if _, err := r.systemd.StopUnit(serviceName, "replace"); err != nil { + return err + } // Remove the systemd service file as well. return os.Remove(serviceFilePath(serviceName)) } @@ -961,7 +969,7 @@ func (r *runtime) IsImagePresent(image kubecontainer.ImageSpec) (bool, error) { // SyncPod syncs the running pod to match the specified desired pod. 
func (r *runtime) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, podStatus api.PodStatus, pullSecrets []api.Secret, backOff *util.Backoff) error { - podFullName := kubecontainer.GetPodFullName(pod) + podFullName := kubeletUtil.FormatPodName(pod) if len(runningPod.Containers) == 0 { glog.V(4).Infof("Pod %q is not running, will start it", podFullName) return r.RunPod(pod, pullSecrets) @@ -1036,6 +1044,8 @@ func (r *runtime) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, podStatus // // In rkt runtime's implementation, per container log is get via 'journalctl -M [rkt-$UUID] -u [APP_NAME]'. // See https://github.com/coreos/rkt/blob/master/Documentation/commands.md#logging for more details. +// +// TODO(yifan): If the rkt is using lkvm as the stage1 image, then this function will fail. func (r *runtime) GetContainerLogs(pod *api.Pod, containerID string, tail string, follow bool, stdout, stderr io.Writer) error { id, err := parseContainerID(containerID) if err != nil { @@ -1072,6 +1082,7 @@ func (r *runtime) GarbageCollect() error { // Note: In rkt, the container ID is in the form of "UUID:appName", where // appName is the container name. +// TODO(yifan): If the rkt is using lkvm as the stage1 image, then this function will fail. func (r *runtime) RunInContainer(containerID string, cmd []string) ([]byte, error) { glog.V(4).Infof("Rkt running in container.") @@ -1092,6 +1103,7 @@ func (r *runtime) AttachContainer(containerID string, stdin io.Reader, stdout, s // Note: In rkt, the container ID is in the form of "UUID:appName", where UUID is // the rkt UUID, and appName is the container name. +// TODO(yifan): If the rkt is using lkvm as the stage1 image, then this function will fail. 
func (r *runtime) ExecInContainer(containerID string, cmd []string, stdin io.Reader, stdout, stderr io.WriteCloser, tty bool) error { glog.V(4).Infof("Rkt execing in container.") @@ -1150,7 +1162,7 @@ func (r *runtime) findRktID(pod *kubecontainer.Pod) (string, error) { f, err := os.Open(serviceFilePath(serviceName)) if err != nil { if os.IsNotExist(err) { - return "", fmt.Errorf("no service file %v for pod %q, UID %q", serviceName, pod.Name, pod.ID) + return "", fmt.Errorf("no service file %v for runtime pod %q, ID %q", serviceName, pod.Name, pod.ID) } return "", err } @@ -1179,6 +1191,7 @@ func (r *runtime) findRktID(pod *kubecontainer.Pod) (string, error) { // - should we support nsenter + socat in a container, running with elevated privs and --pid=host? // // TODO(yifan): Merge with the same function in dockertools. +// TODO(yifan): If the rkt is using lkvm as the stage1 image, then this function will fail. func (r *runtime) PortForward(pod *kubecontainer.Pod, port uint16, stream io.ReadWriteCloser) error { glog.V(4).Infof("Rkt port forwarding in container.")