kubelet/rkt: add support for different stage1 image.

Also enable graceful stop for service files.
This commit is contained in:
Yifan Gu 2015-08-31 19:25:26 -07:00
parent c0d0ef6dd1
commit e7b9c50ba1
6 changed files with 309 additions and 284 deletions

View File

@ -119,6 +119,7 @@ type KubeletServer struct {
ResolverConfig string
ResourceContainer string
RktPath string
RktStage1Image string
RootDirectory string
RunOnce bool
StandaloneMode bool
@ -189,6 +190,7 @@ func NewKubeletServer() *KubeletServer {
RegistryBurst: 10,
ResourceContainer: "/kubelet",
RktPath: "",
RktStage1Image: "",
RootDirectory: defaultRootDir,
SyncFrequency: 10 * time.Second,
SystemContainer: "",
@ -254,6 +256,7 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) {
fs.StringVar(&s.CgroupRoot, "cgroup-root", s.CgroupRoot, "Optional root cgroup to use for pods. This is handled by the container runtime on a best effort basis. Default: '', which means use the container runtime default.")
fs.StringVar(&s.ContainerRuntime, "container-runtime", s.ContainerRuntime, "The container runtime to use. Possible values: 'docker', 'rkt'. Default: 'docker'.")
fs.StringVar(&s.RktPath, "rkt-path", s.RktPath, "Path of rkt binary. Leave empty to use the first rkt in $PATH. Only used if --container-runtime='rkt'")
fs.StringVar(&s.RktStage1Image, "rkt-stage1-image", s.RktStage1Image, "image to use as stage1. Local paths and http/https URLs are supported. If empty, the 'stage1.aci' in the same directory as '--rkt-path' will be used")
fs.StringVar(&s.SystemContainer, "system-container", s.SystemContainer, "Optional resource-only container in which to place all non-kernel processes that are not already in a container. Empty for no container. Rolling back the flag requires a reboot. (Default: \"\").")
fs.BoolVar(&s.ConfigureCBR0, "configure-cbr0", s.ConfigureCBR0, "If true, kubelet will configure cbr0 based on Node.Spec.PodCIDR.")
fs.IntVar(&s.MaxPods, "max-pods", 40, "Number of Pods that can run on this Kubelet.")
@ -364,6 +367,7 @@ func (s *KubeletServer) KubeletConfig() (*KubeletConfig, error) {
ResolverConfig: s.ResolverConfig,
ResourceContainer: s.ResourceContainer,
RktPath: s.RktPath,
RktStage1Image: s.RktStage1Image,
RootDirectory: s.RootDirectory,
Runonce: s.RunOnce,
StandaloneMode: (len(s.APIServerList) == 0),
@ -789,6 +793,7 @@ type KubeletConfig struct {
ResolverConfig string
ResourceContainer string
RktPath string
RktStage1Image string
RootDirectory string
Runonce bool
StandaloneMode bool
@ -851,6 +856,7 @@ func createAndInitKubelet(kc *KubeletConfig) (k KubeletBootstrap, pc *config.Pod
kc.CgroupRoot,
kc.ContainerRuntime,
kc.RktPath,
kc.RktStage1Image,
kc.Mounter,
kc.DockerDaemonContainer,
kc.SystemContainer,

View File

@ -325,6 +325,7 @@ func (ks *KubeletExecutorServer) createAndInitKubelet(
kc.CgroupRoot,
kc.ContainerRuntime,
kc.RktPath,
kc.RktStage1Image,
kc.Mounter,
kc.DockerDaemonContainer,
kc.SystemContainer,

View File

@ -1,274 +1,275 @@
accept-hosts
accept-paths
account-for-pod-resources
admission-control
admission-control-config-file
advertise-address
advertised-address
algorithm-provider
all-namespaces
allocate-node-cidrs
allow-privileged
api-burst
api-prefix
api-rate
api-servers
api-token
api-version
authorization-mode
authorization-policy-file
auth-path
basic-auth-file
bench-pods
bench-quiet
bench-tasks
bench-workers
bind-address
bind-pods-burst
bind-pods-qps
cadvisor-port
cert-dir
certificate-authority
cgroup-root
chaos-chance
cleanup-iptables
client-ca-file
client-certificate
client-key
cloud-config
cloud-provider
cluster-cidr
cluster-dns
cluster-domain
cluster-name
cluster-tag
concurrent-endpoint-syncs
configure-cbr0
contain-pod-resources
container-port
container-runtime
cors-allowed-origins
create-external-load-balancer
current-release-pr
current-replicas
default-container-cpu-limit
default-container-mem-limit
delay-shutdown
deleting-pods-burst
deleting-pods-qps
deployment-label-key
dest-file
disable-filter
docker-endpoint
docker-exec-handler
dockercfg-path
driver-port
dry-run
duration-sec
e2e-output-dir
enable-debugging-handlers
enable-horizontal-pod-autoscaler
enable-server
etcd-config
etcd-prefix
etcd-server
etcd-servers
event-burst
event-qps
event-ttl
executor-bindall
executor-logv
executor-path
executor-suicide-timeout
experimental-keystone-url
experimental-prefix
external-hostname
external-ip
failover-timeout
file-check-frequency
file-suffix
forward-services
framework-name
framework-weburi
func-dest
fuzz-iters
gce-project
gce-zone
gke-cluster
google-json-key
grace-period
ha-domain
healthz-bind-address
healthz-port
horizontal-pod-autoscaler-sync-period
hostname-override
host-network-sources
http-check-frequency
http-port
ignore-not-found
image-gc-high-threshold
image-gc-low-threshold
insecure-bind-address
insecure-port
insecure-skip-tls-verify
iptables-sync-period
ir-data-source
ir-dbname
ir-influxdb-host
ir-password
ir-user
jenkins-host
jenkins-jobs
km-path
kubectl-path
kubelet-cadvisor-port
kubelet-certificate-authority
kubelet-client-certificate
kubelet-client-key
kubelet-docker-endpoint
kubelet-host-network-sources
kubelet-https
kubelet-network-plugin
kubelet-pod-infra-container-image
kubelet-port
kubelet-root-dir
kubelet-sync-frequency
kubelet-timeout
kube-master
label-columns
last-release-pr
legacy-userspace-proxy
log-flush-frequency
long-running-request-regexp
low-diskspace-threshold-mb
manifest-url
manifest-url-header
masquerade-all
master-service-namespace
max-concurrency
max-connection-bytes-per-sec
maximum-dead-containers
maximum-dead-containers-per-container
max-log-age
max-log-backups
max-log-size
max-outgoing-burst
max-outgoing-qps
max-pods
max-requests-inflight
mesos-authentication-principal
mesos-authentication-provider
mesos-authentication-secret-file
mesos-cgroup-prefix
mesos-executor-cpus
mesos-executor-mem
mesos-master
mesos-role
mesos-user
minimum-container-ttl-duration
minion-max-log-age
minion-max-log-backups
minion-max-log-size
minion-path-override
min-pr-number
min-request-timeout
namespace-sync-period
network-plugin
network-plugin-dir
node-instance-group
node-monitor-grace-period
node-monitor-period
node-startup-grace-period
node-status-update-frequency
node-sync-period
no-headers
num-nodes
oidc-ca-file
oidc-client-id
oidc-issuer-url
oidc-username-claim
oom-score-adj
output-version
out-version
path-override
pod-cidr
pod-eviction-timeout
pod-infra-container-image
pod-running
policy-config-file
poll-interval
portal-net
private-mountns
prom-push-gateway
proxy-bindall
proxy-logv
proxy-port-range
public-address-override
pvclaimbinder-sync-period
read-only-port
really-crash-for-testing
reconcile-cooldown
reconcile-interval
register-node
register-retry-count
registry-burst
registry-qps
reject-methods
reject-paths
repo-root
report-dir
required-contexts
resolv-conf
resource-container
resource-quota-sync-period
resource-version
rkt-path
root-ca-file
root-dir
run-proxy
runtime-config
scheduler-config
secure-port
service-account-key-file
service-account-lookup
service-account-private-key-file
service-address
service-cluster-ip-range
service-node-port-range
service-node-ports
service-sync-period
session-affinity
show-all
shutdown-fd
shutdown-fifo
skip-munges
sort-by
source-file
ssh-keyfile
ssh-user
static-pods-config
stats-port
storage-version
streaming-connection-idle-timeout
suicide-timeout
sync-frequency
system-container
target-port
tcp-services
tls-cert-file
tls-private-key-file
token-auth-file
ttl-secs
type-src
unix-socket
update-period
upgrade-target
use-kubernetes-cluster-service
user-whitelist
watch-cache
watch-only
whitelist-override-label
www-prefix
retry_time
file_content_in_loop
cpu-cfs-quota
accept-hosts
accept-paths
account-for-pod-resources
admission-control
admission-control-config-file
advertise-address
advertised-address
algorithm-provider
all-namespaces
allocate-node-cidrs
allow-privileged
api-burst
api-prefix
api-rate
api-servers
api-token
api-version
authorization-mode
authorization-policy-file
auth-path
basic-auth-file
bench-pods
bench-quiet
bench-tasks
bench-workers
bind-address
bind-pods-burst
bind-pods-qps
cadvisor-port
cert-dir
certificate-authority
cgroup-root
chaos-chance
cleanup-iptables
client-ca-file
client-certificate
client-key
cloud-config
cloud-provider
cluster-cidr
cluster-dns
cluster-domain
cluster-name
cluster-tag
concurrent-endpoint-syncs
configure-cbr0
contain-pod-resources
container-port
container-runtime
cors-allowed-origins
create-external-load-balancer
current-release-pr
current-replicas
default-container-cpu-limit
default-container-mem-limit
delay-shutdown
deleting-pods-burst
deleting-pods-qps
deployment-label-key
dest-file
disable-filter
docker-endpoint
docker-exec-handler
dockercfg-path
driver-port
dry-run
duration-sec
e2e-output-dir
enable-debugging-handlers
enable-horizontal-pod-autoscaler
enable-server
etcd-config
etcd-prefix
etcd-server
etcd-servers
event-burst
event-qps
event-ttl
executor-bindall
executor-logv
executor-path
executor-suicide-timeout
experimental-keystone-url
experimental-prefix
external-hostname
external-ip
failover-timeout
file-check-frequency
file-suffix
forward-services
framework-name
framework-weburi
func-dest
fuzz-iters
gce-project
gce-zone
gke-cluster
google-json-key
grace-period
ha-domain
healthz-bind-address
healthz-port
horizontal-pod-autoscaler-sync-period
hostname-override
host-network-sources
http-check-frequency
http-port
ignore-not-found
image-gc-high-threshold
image-gc-low-threshold
insecure-bind-address
insecure-port
insecure-skip-tls-verify
iptables-sync-period
ir-data-source
ir-dbname
ir-influxdb-host
ir-password
ir-user
jenkins-host
jenkins-jobs
km-path
kubectl-path
kubelet-cadvisor-port
kubelet-certificate-authority
kubelet-client-certificate
kubelet-client-key
kubelet-docker-endpoint
kubelet-host-network-sources
kubelet-https
kubelet-network-plugin
kubelet-pod-infra-container-image
kubelet-port
kubelet-root-dir
kubelet-sync-frequency
kubelet-timeout
kube-master
label-columns
last-release-pr
legacy-userspace-proxy
log-flush-frequency
long-running-request-regexp
low-diskspace-threshold-mb
manifest-url
manifest-url-header
masquerade-all
master-service-namespace
max-concurrency
max-connection-bytes-per-sec
maximum-dead-containers
maximum-dead-containers-per-container
max-log-age
max-log-backups
max-log-size
max-outgoing-burst
max-outgoing-qps
max-pods
max-requests-inflight
mesos-authentication-principal
mesos-authentication-provider
mesos-authentication-secret-file
mesos-cgroup-prefix
mesos-executor-cpus
mesos-executor-mem
mesos-master
mesos-role
mesos-user
minimum-container-ttl-duration
minion-max-log-age
minion-max-log-backups
minion-max-log-size
minion-path-override
min-pr-number
min-request-timeout
namespace-sync-period
network-plugin
network-plugin-dir
node-instance-group
node-monitor-grace-period
node-monitor-period
node-startup-grace-period
node-status-update-frequency
node-sync-period
no-headers
num-nodes
oidc-ca-file
oidc-client-id
oidc-issuer-url
oidc-username-claim
oom-score-adj
output-version
out-version
path-override
pod-cidr
pod-eviction-timeout
pod-infra-container-image
pod-running
policy-config-file
poll-interval
portal-net
private-mountns
prom-push-gateway
proxy-bindall
proxy-logv
proxy-port-range
public-address-override
pvclaimbinder-sync-period
read-only-port
really-crash-for-testing
reconcile-cooldown
reconcile-interval
register-node
register-retry-count
registry-burst
registry-qps
reject-methods
reject-paths
repo-root
report-dir
required-contexts
resolv-conf
resource-container
resource-quota-sync-period
resource-version
rkt-path
rkt-stage1-image
root-ca-file
root-dir
run-proxy
runtime-config
scheduler-config
secure-port
service-account-key-file
service-account-lookup
service-account-private-key-file
service-address
service-cluster-ip-range
service-node-port-range
service-node-ports
service-sync-period
session-affinity
show-all
shutdown-fd
shutdown-fifo
skip-munges
sort-by
source-file
ssh-keyfile
ssh-user
static-pods-config
stats-port
storage-version
streaming-connection-idle-timeout
suicide-timeout
sync-frequency
system-container
target-port
tcp-services
tls-cert-file
tls-private-key-file
token-auth-file
ttl-secs
type-src
unix-socket
update-period
upgrade-target
use-kubernetes-cluster-service
user-whitelist
watch-cache
watch-only
whitelist-override-label
www-prefix
retry_time
file_content_in_loop
cpu-cfs-quota

View File

@ -164,6 +164,7 @@ func NewMainKubelet(
cgroupRoot string,
containerRuntime string,
rktPath string,
rktStage1Image string,
mounter mount.Interface,
dockerDaemonContainer string,
systemContainer string,
@ -331,6 +332,7 @@ func NewMainKubelet(
case "rkt":
conf := &rkt.Config{
Path: rktPath,
Stage1Image: rktStage1Image,
InsecureSkipVerify: true,
}
rktRuntime, err := rkt.New(

View File

@ -23,6 +23,8 @@ import "fmt"
type Config struct {
// The absolute path to the binary, or leave empty to find it in $PATH.
Path string
// The image to use as stage1.
Stage1Image string
// The debug flag for rkt.
Debug bool
// The rkt data directory.

View File

@ -27,7 +27,6 @@ import (
"path"
"strconv"
"strings"
"syscall"
"time"
appcschema "github.com/appc/spec/schema"
@ -42,6 +41,7 @@ import (
"k8s.io/kubernetes/pkg/credentialprovider"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/prober"
kubeletUtil "k8s.io/kubernetes/pkg/kubelet/util"
"k8s.io/kubernetes/pkg/probe"
"k8s.io/kubernetes/pkg/securitycontext"
"k8s.io/kubernetes/pkg/types"
@ -467,7 +467,7 @@ func (r *runtime) makePodManifest(pod *api.Pod, pullSecrets []api.Secret) (*appc
volumeMap, ok := r.volumeGetter.GetVolumes(pod.UID)
if !ok {
return nil, fmt.Errorf("cannot get the volumes for pod %q", kubecontainer.GetPodFullName(pod))
return nil, fmt.Errorf("cannot get the volumes for pod %q", kubeletUtil.FormatPodName(pod))
}
// Set global volumes.
@ -533,7 +533,7 @@ func serviceFilePath(serviceName string) string {
// preparePod will:
//
// 1. Invoke 'rkt prepare' to prepare the pod, and get the rkt pod uuid.
// 2. Creates the unit file and save it under systemdUnitDir.
// 2. Create the unit file and save it under systemdUnitDir.
//
// On success, it returns the name of the unit file (as a string)
// and the runtime pod.
@ -566,6 +566,9 @@ func (r *runtime) preparePod(pod *api.Pod, pullSecrets []api.Secret) (string, *k
// Run 'rkt prepare' to get the rkt UUID.
cmds := []string{"prepare", "--quiet", "--pod-manifest", manifestFile.Name()}
if r.config.Stage1Image != "" {
cmds = append(cmds, "--stage1-image", r.config.Stage1Image)
}
output, err := r.runCommand(cmds...)
if err != nil {
return "", nil, err
@ -596,6 +599,8 @@ func (r *runtime) preparePod(pod *api.Pod, pullSecrets []api.Secret) (string, *k
// This makes the service show up for 'systemctl list-units' even if it exits successfully.
newUnitOption("Service", "RemainAfterExit", "true"),
newUnitOption("Service", "ExecStart", runPrepared),
// This enables graceful stop.
newUnitOption("Service", "KillMode", "mixed"),
}
// Check if there's old rkt pod corresponding to the same pod, if so, update the restart count.
@ -615,7 +620,7 @@ func (r *runtime) preparePod(pod *api.Pod, pullSecrets []api.Secret) (string, *k
}
units = append(units, newUnitOption(unitKubernetesSection, unitRestartCount, strconv.Itoa(restartCount)))
glog.V(4).Infof("rkt: Creating service file %q for pod %q", serviceName, pod.Name)
glog.V(4).Infof("rkt: Creating service file %q for pod %q", serviceName, kubeletUtil.FormatPodName(pod))
serviceFile, err := os.Create(serviceFilePath(serviceName))
if err != nil {
return "", nil, err
@ -674,7 +679,7 @@ func (r *runtime) generateEvents(runtimePod *kubecontainer.Pod, reason string, f
// RunPod first creates the unit file for a pod, and then
// starts the unit over d-bus.
func (r *runtime) RunPod(pod *api.Pod, pullSecrets []api.Secret) error {
glog.V(4).Infof("Rkt starts to run pod: name %q.", pod.Name)
glog.V(4).Infof("Rkt starts to run pod: name %q.", kubeletUtil.FormatPodName(pod))
name, runtimePod, prepareErr := r.preparePod(pod, pullSecrets)
@ -684,7 +689,7 @@ func (r *runtime) RunPod(pod *api.Pod, pullSecrets []api.Secret) error {
for i, c := range pod.Spec.Containers {
ref, err := kubecontainer.GenerateContainerRef(pod, &c)
if err != nil {
glog.Errorf("Couldn't make a ref to pod %v, container %v: '%v'", pod.Name, c.Name, err)
glog.Errorf("Couldn't make a ref to pod %q, container %v: '%v'", kubeletUtil.FormatPodName(pod), c.Name, err)
continue
}
if prepareErr != nil {
@ -800,8 +805,11 @@ func (r *runtime) KillPod(pod *api.Pod, runningPod kubecontainer.Pod) error {
r.containerRefManager.ClearRef(id)
}
// TODO(yifan): More graceful stop. Replace with StopUnit and wait for a timeout.
r.systemd.KillUnit(serviceName, int32(syscall.SIGKILL))
// Since all service files have 'KillMode=mixed', the processes in
// the unit's cgroup will receive a SIGKILL if the normal stop times out.
if _, err := r.systemd.StopUnit(serviceName, "replace"); err != nil {
return err
}
// Remove the systemd service file as well.
return os.Remove(serviceFilePath(serviceName))
}
@ -961,7 +969,7 @@ func (r *runtime) IsImagePresent(image kubecontainer.ImageSpec) (bool, error) {
// SyncPod syncs the running pod to match the specified desired pod.
func (r *runtime) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, podStatus api.PodStatus, pullSecrets []api.Secret, backOff *util.Backoff) error {
podFullName := kubecontainer.GetPodFullName(pod)
podFullName := kubeletUtil.FormatPodName(pod)
if len(runningPod.Containers) == 0 {
glog.V(4).Infof("Pod %q is not running, will start it", podFullName)
return r.RunPod(pod, pullSecrets)
@ -1036,6 +1044,8 @@ func (r *runtime) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, podStatus
//
// In the rkt runtime's implementation, per-container logs are retrieved via 'journalctl -M [rkt-$UUID] -u [APP_NAME]'.
// See https://github.com/coreos/rkt/blob/master/Documentation/commands.md#logging for more details.
//
// TODO(yifan): If the rkt is using lkvm as the stage1 image, then this function will fail.
func (r *runtime) GetContainerLogs(pod *api.Pod, containerID string, tail string, follow bool, stdout, stderr io.Writer) error {
id, err := parseContainerID(containerID)
if err != nil {
@ -1072,6 +1082,7 @@ func (r *runtime) GarbageCollect() error {
// Note: In rkt, the container ID is in the form of "UUID:appName", where
// appName is the container name.
// TODO(yifan): If the rkt is using lkvm as the stage1 image, then this function will fail.
func (r *runtime) RunInContainer(containerID string, cmd []string) ([]byte, error) {
glog.V(4).Infof("Rkt running in container.")
@ -1092,6 +1103,7 @@ func (r *runtime) AttachContainer(containerID string, stdin io.Reader, stdout, s
// Note: In rkt, the container ID is in the form of "UUID:appName", where UUID is
// the rkt UUID, and appName is the container name.
// TODO(yifan): If the rkt is using lkvm as the stage1 image, then this function will fail.
func (r *runtime) ExecInContainer(containerID string, cmd []string, stdin io.Reader, stdout, stderr io.WriteCloser, tty bool) error {
glog.V(4).Infof("Rkt execing in container.")
@ -1150,7 +1162,7 @@ func (r *runtime) findRktID(pod *kubecontainer.Pod) (string, error) {
f, err := os.Open(serviceFilePath(serviceName))
if err != nil {
if os.IsNotExist(err) {
return "", fmt.Errorf("no service file %v for pod %q, UID %q", serviceName, pod.Name, pod.ID)
return "", fmt.Errorf("no service file %v for runtime pod %q, ID %q", serviceName, pod.Name, pod.ID)
}
return "", err
}
@ -1179,6 +1191,7 @@ func (r *runtime) findRktID(pod *kubecontainer.Pod) (string, error) {
// - should we support nsenter + socat in a container, running with elevated privs and --pid=host?
//
// TODO(yifan): Merge with the same function in dockertools.
// TODO(yifan): If the rkt is using lkvm as the stage1 image, then this function will fail.
func (r *runtime) PortForward(pod *kubecontainer.Pod, port uint16, stream io.ReadWriteCloser) error {
glog.V(4).Infof("Rkt port forwarding in container.")