Make container runtime's cgroup configurable.

Use the real cgroups for metrics generation.

Signed-off-by: Vishnu kannan <vishnuk@google.com>
This commit is contained in:
Vishnu kannan 2016-02-04 17:49:17 -08:00
parent 9d3633a16d
commit 38efc837b9
14 changed files with 172 additions and 5645 deletions

View File

@ -47,8 +47,7 @@ type KubeletServer struct {
KubeConfig util.StringFlag
APIServerList []string
DockerDaemonContainer string
RunOnce bool
RunOnce bool
// Insert a probability of random errors during calls to the master.
ChaosChance float64
@ -61,9 +60,8 @@ type KubeletServer struct {
// NewKubeletServer will create a new KubeletServer with default values.
func NewKubeletServer() *KubeletServer {
return &KubeletServer{
AuthPath: util.NewStringFlag("/var/lib/kubelet/kubernetes_auth"), // deprecated
KubeConfig: util.NewStringFlag("/var/lib/kubelet/kubeconfig"),
DockerDaemonContainer: "/docker-daemon",
AuthPath: util.NewStringFlag("/var/lib/kubelet/kubernetes_auth"), // deprecated
KubeConfig: util.NewStringFlag("/var/lib/kubelet/kubeconfig"),
SystemReserved: make(util.ConfigurationMap),
KubeReserved: make(util.ConfigurationMap),
@ -117,6 +115,7 @@ func NewKubeletServer() *KubeletServer {
RktPath: "",
RktStage1Image: "",
RootDirectory: defaultRootDir,
RuntimeContainer: "/docker-daemon",
SerializeImagePulls: true,
StreamingConnectionIdleTimeout: unversioned.Duration{4 * time.Hour},
SyncFrequency: unversioned.Duration{1 * time.Minute},
@ -223,4 +222,5 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) {
fs.DurationVar(&s.OutOfDiskTransitionFrequency.Duration, "outofdisk-transition-frequency", s.OutOfDiskTransitionFrequency.Duration, "Duration for which the kubelet has to wait before transitioning out of out-of-disk node condition status. Default: 5m0s")
fs.StringVar(&s.NodeIP, "node-ip", s.NodeIP, "IP address of the node. If set, kubelet will use this IP address for the node")
fs.BoolVar(&s.EnableCustomMetrics, "enable-custom-metrics", s.EnableCustomMetrics, "Support for gathering custom metrics.")
fs.StringVar(&s.RuntimeContainer, "runtime-container", s.RuntimeContainer, "Absolute name of the cgroups to create (if required) and run the runtime in (Default: /docker-daemon).")
}

View File

@ -194,7 +194,7 @@ func UnsecuredKubeletConfig(s *options.KubeletServer) (*KubeletConfig, error) {
CPUCFSQuota: s.CPUCFSQuota,
DiskSpacePolicy: diskSpacePolicy,
DockerClient: dockertools.ConnectToDockerOrDie(s.DockerEndpoint),
DockerDaemonContainer: s.DockerDaemonContainer,
RuntimeContainer: s.RuntimeContainer,
DockerExecHandler: dockerExecHandler,
EnableCustomMetrics: s.EnableCustomMetrics,
EnableDebuggingHandlers: s.EnableDebuggingHandlers,
@ -306,7 +306,16 @@ func Run(s *options.KubeletServer, kcfg *KubeletConfig) error {
}
if kcfg.ContainerManager == nil {
kcfg.ContainerManager, err = cm.NewContainerManager(kcfg.Mounter, kcfg.CAdvisorInterface)
if kcfg.SystemContainer != "" && kcfg.CgroupRoot == "" {
return fmt.Errorf("invalid configuration: system container was specified and cgroup root was not specified")
}
kcfg.ContainerManager, err = cm.NewContainerManager(kcfg.Mounter, kcfg.CAdvisorInterface, cm.NodeConfig{
RuntimeContainerName: kcfg.RuntimeContainer,
SystemContainerName: kcfg.SystemContainer,
KubeletContainerName: kcfg.ResourceContainer,
ContainerRuntime: kcfg.ContainerRuntime,
})
if err != nil {
return err
}
@ -501,7 +510,7 @@ func SimpleKubelet(client *clientset.Clientset,
CPUCFSQuota: true,
DiskSpacePolicy: diskSpacePolicy,
DockerClient: dockerClient,
DockerDaemonContainer: "/docker-daemon",
RuntimeContainer: "/docker-daemon",
DockerExecHandler: &dockertools.NativeExecHandler{},
EnableCustomMetrics: false,
EnableDebuggingHandlers: true,
@ -677,7 +686,7 @@ type KubeletConfig struct {
CPUCFSQuota bool
DiskSpacePolicy kubelet.DiskSpacePolicy
DockerClient dockertools.DockerInterface
DockerDaemonContainer string
RuntimeContainer string
DockerExecHandler dockertools.ExecHandler
EnableCustomMetrics bool
EnableDebuggingHandlers bool
@ -802,7 +811,6 @@ func CreateAndInitKubelet(kc *KubeletConfig) (k KubeletBootstrap, pc *config.Pod
kc.Cloud,
kc.NodeLabels,
kc.NodeStatusUpdateFrequency,
kc.ResourceContainer,
kc.OSInterface,
kc.CgroupRoot,
kc.ContainerRuntime,
@ -810,8 +818,6 @@ func CreateAndInitKubelet(kc *KubeletConfig) (k KubeletBootstrap, pc *config.Pod
kc.RktStage1Image,
kc.Mounter,
kc.Writer,
kc.DockerDaemonContainer,
kc.SystemContainer,
kc.ConfigureCBR0,
kc.NonMasqueradeCIDR,
kc.PodCIDR,

View File

@ -180,7 +180,7 @@ func (s *KubeletExecutorServer) runKubelet(
return decorated, pc, nil
}
kcfg.DockerDaemonContainer = "" // don't move the docker daemon into a cgroup
kcfg.RuntimeContainer = "" // don't move the docker daemon into a cgroup
kcfg.Hostname = kcfg.HostnameOverride
kcfg.KubeClient = apiclient
@ -216,7 +216,12 @@ func (s *KubeletExecutorServer) runKubelet(
}
kcfg.CAdvisorInterface = cAdvisorInterface
kcfg.ContainerManager, err = cm.NewContainerManager(kcfg.Mounter, cAdvisorInterface)
kcfg.ContainerManager, err = cm.NewContainerManager(kcfg.Mounter, cAdvisorInterface, cm.NodeConfig{
RuntimeContainerName: kcfg.RuntimeContainer,
SystemContainerName: kcfg.SystemContainer,
KubeletContainerName: kcfg.ResourceContainer,
ContainerRuntime: kcfg.ContainerRuntime,
})
if err != nil {
return err
}

View File

@ -139,6 +139,7 @@ kubelet
--rkt-stage1-image="": image to use as stage1. Local paths and http/https URLs are supported. If empty, the 'stage1.aci' in the same directory as '--rkt-path' will be used
--root-dir="/var/lib/kubelet": Directory path for managing kubelet files (volume mounts,etc).
--runonce[=false]: If true, exit after spawning pods from local manifests or remote urls. Exclusive with --api-servers, and --enable-server
--runtime-container="/docker-daemon": Absolute name of the cgroups to create (if required) and run the runtime in (Default: /docker-daemon).
--serialize-image-pulls[=true]: Pull images one at a time. We recommend *not* changing the default value on nodes that run docker daemon with version < 1.9 or an Aufs storage backend. Issue #10959 has more details. [default=true]
--streaming-connection-idle-timeout=4h0m0s: Maximum time a streaming connection can be idle before the connection is automatically closed. 0 indicates no timeout. Example: '5m'
--sync-frequency=1m0s: Max period between synchronizing running containers and config

View File

@ -311,6 +311,7 @@ root-ca-file
root-dir
run-proxy
runtime-config
runtime-container
save-config
scheduler-config
scheduler-name

File diff suppressed because it is too large Load Diff

View File

@ -304,6 +304,8 @@ type KubeletConfiguration struct {
NonMasqueradeCIDR string `json:"nonMasqueradeCIDR"`
// enable gathering custom metrics.
EnableCustomMetrics bool `json:"enableCustomMetrics"`
// The cgroup that container runtime is expected to be isolated in.
RuntimeContainer string `json:"runtimeContainer,omitempty"`
}
type KubeSchedulerConfiguration struct {

View File

@ -25,15 +25,19 @@ type ContainerManager interface {
// Runs the container manager's housekeeping.
// - Ensures that the Docker daemon is in a container.
// - Creates the system container where all non-containerized processes run.
Start(NodeConfig) error
Start() error
// Returns resources allocated to system containers in the machine.
// These containers include the system and Kubernetes services.
SystemContainersLimit() api.ResourceList
// Returns a NodeConfig that is being used by the container manager.
GetNodeConfig() NodeConfig
}
type NodeConfig struct {
DockerDaemonContainerName string
SystemContainerName string
KubeletContainerName string
RuntimeContainerName string
SystemContainerName string
KubeletContainerName string
ContainerRuntime string
}

View File

@ -114,11 +114,11 @@ func validateSystemRequirements(mountUtil mount.Interface) error {
// TODO(vmarmol): Add limits to the system containers.
// Takes the absolute name of the specified containers.
// Empty container name disables use of the specified container.
func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.Interface) (ContainerManager, error) {
func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.Interface, nodeConfig NodeConfig) (ContainerManager, error) {
return &containerManagerImpl{
cadvisorInterface: cadvisorInterface,
mountUtil: mountUtil,
NodeConfig: NodeConfig{},
NodeConfig: nodeConfig,
}, nil
}
@ -192,70 +192,113 @@ func (cm *containerManagerImpl) setupNode() error {
}
systemContainers := []*systemContainer{}
if cm.DockerDaemonContainerName != "" {
cont := newSystemContainer(cm.DockerDaemonContainerName)
if cm.ContainerRuntime == "docker" {
if cm.RuntimeContainerName != "" {
cont := newSystemContainer(cm.RuntimeContainerName)
info, err := cm.cadvisorInterface.MachineInfo()
var capacity = api.ResourceList{}
if err != nil {
} else {
capacity = cadvisor.CapacityFromMachineInfo(info)
}
memoryLimit := (int64(capacity.Memory().Value() * DockerMemoryLimitThresholdPercent / 100))
if memoryLimit < MinDockerMemoryLimit {
glog.Warningf("Memory limit %d for container %s is too small, reset it to %d", memoryLimit, cm.RuntimeContainerName, MinDockerMemoryLimit)
memoryLimit = MinDockerMemoryLimit
}
info, err := cm.cadvisorInterface.MachineInfo()
var capacity = api.ResourceList{}
if err != nil {
} else {
capacity = cadvisor.CapacityFromMachineInfo(info)
}
memoryLimit := (int64(capacity.Memory().Value() * DockerMemoryLimitThresholdPercent / 100))
if memoryLimit < MinDockerMemoryLimit {
glog.Warningf("Memory limit %d for container %s is too small, reset it to %d", memoryLimit, cm.DockerDaemonContainerName, MinDockerMemoryLimit)
memoryLimit = MinDockerMemoryLimit
}
glog.V(2).Infof("Configure resource-only container %s with memory limit: %d", cm.RuntimeContainerName, memoryLimit)
glog.V(2).Infof("Configure resource-only container %s with memory limit: %d", cm.DockerDaemonContainerName, memoryLimit)
dockerContainer := &fs.Manager{
Cgroups: &configs.Cgroup{
Parent: "/",
Name: cm.DockerDaemonContainerName,
Resources: &configs.Resources{
Memory: memoryLimit,
MemorySwap: -1,
AllowAllDevices: true,
dockerContainer := &fs.Manager{
Cgroups: &configs.Cgroup{
Parent: "/",
Name: cm.RuntimeContainerName,
Resources: &configs.Resources{
Memory: memoryLimit,
MemorySwap: -1,
AllowAllDevices: true,
},
},
},
}
cont.ensureStateFunc = func(manager *fs.Manager) error {
return ensureDockerInContainer(cm.cadvisorInterface, -900, dockerContainer)
}
systemContainers = append(systemContainers, cont)
} else {
cont, err := getContainerNameForProcess("docker")
if err != nil {
glog.Error(err)
} else {
cm.RuntimeContainerName = cont
}
}
cont.ensureStateFunc = func(manager *fs.Manager) error {
return ensureDockerInContainer(cm.cadvisorInterface, -900, dockerContainer)
}
systemContainers = append(systemContainers, cont)
}
if cm.SystemContainerName != "" {
if cm.SystemContainerName == "/" {
return fmt.Errorf("system container cannot be root (\"/\")")
}
cont := newSystemContainer(cm.SystemContainerName)
rootContainer := &fs.Manager{
Cgroups: &configs.Cgroup{
Parent: "/",
Name: "/",
},
}
manager := createManager(cm.SystemContainerName)
err := ensureSystemContainer(rootContainer, manager)
if err != nil {
return err
cont.ensureStateFunc = func(manager *fs.Manager) error {
return ensureSystemContainer(rootContainer, manager)
}
systemContainers = append(systemContainers, newSystemContainer(cm.SystemContainerName))
systemContainers = append(systemContainers, cont)
}
if cm.KubeletContainerName != "" {
systemContainers = append(systemContainers, newSystemContainer(cm.KubeletContainerName))
cont := newSystemContainer(cm.KubeletContainerName)
manager := fs.Manager{
Cgroups: &configs.Cgroup{
Parent: "/",
Name: cm.KubeletContainerName,
Resources: &configs.Resources{
AllowAllDevices: true,
},
},
}
cont.ensureStateFunc = func(_ *fs.Manager) error {
return manager.Apply(os.Getpid())
}
systemContainers = append(systemContainers, cont)
} else {
cont, err := getContainer(os.Getpid())
if err != nil {
// Use the formatting variant so that %v is expanded with the error value.
glog.Errorf("failed to find cgroups of kubelet - %v", err)
} else {
cm.KubeletContainerName = cont
}
}
cm.systemContainers = systemContainers
return nil
}
func (cm *containerManagerImpl) Start(nodeConfig NodeConfig) error {
cm.NodeConfig = nodeConfig
// getContainerNameForProcess resolves the container (cgroup) name of a process
// identified by its executable name.
//
// The pids are looked up through getPidsForProcess and only the first one is
// inspected. When no pid is reported, an empty name and a nil error are
// returned, so callers must treat "" as "not found". Lookup failures from
// getPidsForProcess are wrapped with the process name; failures from
// getContainer are returned unchanged.
func getContainerNameForProcess(name string) (string, error) {
	pids, err := getPidsForProcess(name)
	switch {
	case err != nil:
		return "", fmt.Errorf("failed to detect process id for %q - %v", name, err)
	case len(pids) == 0:
		// Nothing to inspect; report "no container" without an error.
		return "", nil
	}

	// Only the first reported pid is used to determine the container.
	containerName, lookupErr := getContainer(pids[0])
	if lookupErr != nil {
		return "", lookupErr
	}
	return containerName, nil
}
// GetNodeConfig returns a copy of the NodeConfig the container manager is
// currently using. setupNode may overwrite RuntimeContainerName and
// KubeletContainerName with detected container names when they were not
// configured, so the result can differ from the NodeConfig passed to
// NewContainerManager.
func (cm *containerManagerImpl) GetNodeConfig() NodeConfig {
return cm.NodeConfig
}
func (cm *containerManagerImpl) Start() error {
// Setup the node
if err := cm.setupNode(); err != nil {
return err
@ -313,16 +356,13 @@ func isProcessRunningInHost(pid int) (bool, error) {
return initMntNs == processMntNs, nil
}
// Ensures that the Docker daemon is in the desired container.
func ensureDockerInContainer(cadvisor cadvisor.Interface, oomScoreAdj int, manager *fs.Manager) error {
// What container is Docker in?
out, err := exec.Command("pidof", "docker").Output()
func getPidsForProcess(name string) ([]int, error) {
// Ask pidof for the requested process; pass the name parameter, not a literal.
out, err := exec.Command("pidof", name).Output()
if err != nil {
return fmt.Errorf("failed to find pid of Docker container: %v", err)
return []int{}, fmt.Errorf("failed to find pid of %q: %v", name, err)
}
// The output of pidof is a list of pids.
// Docker may be forking and thus there would be more than one result.
pids := []int{}
for _, pidStr := range strings.Split(strings.TrimSpace(string(out)), " ") {
pid, err := strconv.Atoi(pidStr)
@ -331,7 +371,15 @@ func ensureDockerInContainer(cadvisor cadvisor.Interface, oomScoreAdj int, manag
}
pids = append(pids, pid)
}
return pids, nil
}
// Ensures that the Docker daemon is in the desired container.
func ensureDockerInContainer(cadvisor cadvisor.Interface, oomScoreAdj int, manager *fs.Manager) error {
pids, err := getPidsForProcess("docker")
if err != nil {
return err
}
// Move if the pid is not already in the desired container.
errs := []error{}
for _, pid := range pids {

View File

@ -25,7 +25,7 @@ type containerManagerStub struct{}
var _ ContainerManager = &containerManagerStub{}
func (cm *containerManagerStub) Start(_ NodeConfig) error {
func (cm *containerManagerStub) Start() error {
glog.V(2).Infof("Starting stub container manager")
return nil
}
@ -34,6 +34,10 @@ func (cm *containerManagerStub) SystemContainersLimit() api.ResourceList {
return api.ResourceList{}
}
func (cm *containerManagerStub) GetNodeConfig() NodeConfig {
return NodeConfig{}
}
func NewStubContainerManager() ContainerManager {
return &containerManagerStub{}
}

View File

@ -31,7 +31,7 @@ type unsupportedContainerManager struct {
var _ ContainerManager = &unsupportedContainerManager{}
func (unsupportedContainerManager) Start(_ NodeConfig) error {
func (unsupportedContainerManager) Start() error {
return fmt.Errorf("Container Manager is unsupported in this build")
}
@ -39,6 +39,10 @@ func (unsupportedContainerManager) SystemContainersLimit() api.ResourceList {
return api.ResourceList{}
}
func NewContainerManager(mounter mount.Interface, cadvisorInterface cadvisor.Interface) (ContainerManager, error) {
func (unsupportedContainerManager) GetNodeConfig() NodeConfig {
return NodeConfig{}
}
func NewContainerManager(_ mount.Interface, _ cadvisor.Interface, _ NodeConfig) (ContainerManager, error) {
return &unsupportedContainerManager{}, nil
}

View File

@ -178,7 +178,6 @@ func NewMainKubelet(
cloud cloudprovider.Interface,
nodeLabels map[string]string,
nodeStatusUpdateFrequency time.Duration,
resourceContainer string,
osInterface kubecontainer.OSInterface,
cgroupRoot string,
containerRuntime string,
@ -186,8 +185,6 @@ func NewMainKubelet(
rktStage1Image string,
mounter mount.Interface,
writer kubeio.Writer,
dockerDaemonContainer string,
systemContainer string,
configureCBR0 bool,
nonMasqueradeCIDR string,
podCIDR string,
@ -215,9 +212,6 @@ func NewMainKubelet(
if resyncInterval <= 0 {
return nil, fmt.Errorf("invalid sync frequency %d", resyncInterval)
}
if systemContainer != "" && cgroupRoot == "" {
return nil, fmt.Errorf("invalid configuration: system container was specified and cgroup root was not specified")
}
dockerClient = dockertools.NewInstrumentedDockerInterface(dockerClient)
serviceStore := cache.NewStore(cache.MetaNamespaceKeyFunc)
@ -311,25 +305,24 @@ func NewMainKubelet(
nodeRef: nodeRef,
nodeLabels: nodeLabels,
nodeStatusUpdateFrequency: nodeStatusUpdateFrequency,
resourceContainer: resourceContainer,
os: osInterface,
oomWatcher: oomWatcher,
cgroupRoot: cgroupRoot,
mounter: mounter,
writer: writer,
configureCBR0: configureCBR0,
nonMasqueradeCIDR: nonMasqueradeCIDR,
reconcileCIDR: reconcileCIDR,
maxPods: maxPods,
syncLoopMonitor: atomic.Value{},
resolverConfig: resolverConfig,
cpuCFSQuota: cpuCFSQuota,
daemonEndpoints: daemonEndpoints,
containerManager: containerManager,
flannelExperimentalOverlay: flannelExperimentalOverlay,
flannelHelper: NewFlannelHelper(),
nodeIP: nodeIP,
clock: util.RealClock{},
os: osInterface,
oomWatcher: oomWatcher,
cgroupRoot: cgroupRoot,
mounter: mounter,
writer: writer,
configureCBR0: configureCBR0,
nonMasqueradeCIDR: nonMasqueradeCIDR,
reconcileCIDR: reconcileCIDR,
maxPods: maxPods,
syncLoopMonitor: atomic.Value{},
resolverConfig: resolverConfig,
cpuCFSQuota: cpuCFSQuota,
daemonEndpoints: daemonEndpoints,
containerManager: containerManager,
flannelExperimentalOverlay: flannelExperimentalOverlay,
flannelHelper: NewFlannelHelper(),
nodeIP: nodeIP,
clock: util.RealClock{},
outOfDiskTransitionFrequency: outOfDiskTransitionFrequency,
reservation: reservation,
enableCustomMetrics: enableCustomMetrics,
@ -414,8 +407,6 @@ func NewMainKubelet(
return nil, err
}
klet.containerRuntime = rktRuntime
// No Docker daemon to put in a container.
dockerDaemonContainer = ""
default:
return nil, fmt.Errorf("unsupported container runtime %q specified", containerRuntime)
}
@ -438,13 +429,6 @@ func NewMainKubelet(
}
klet.imageManager = imageManager
// Setup container manager, can fail if the devices hierarchy is not mounted
// (it is required by Docker however).
klet.nodeConfig = cm.NodeConfig{
DockerDaemonContainerName: dockerDaemonContainer,
SystemContainerName: systemContainer,
KubeletContainerName: resourceContainer,
}
klet.runtimeState.setRuntimeSync(klet.clock.Now())
klet.runner = klet.containerRuntime
@ -613,10 +597,6 @@ type Kubelet struct {
// Store kubecontainer.PodStatus for all pods.
podCache kubecontainer.Cache
// The name of the resource-only container to run the Kubelet in (empty for no container).
// Name must be absolute.
resourceContainer string
os kubecontainer.OSInterface
// Watcher of out of memory events.
@ -913,42 +893,32 @@ func (kl *Kubelet) StartGarbageCollection() {
// initializeModules will initialize internal modules that do not require the container runtime to be up.
// Note that the modules here must not depend on modules that are not initialized here.
func (kl *Kubelet) initializeModules() error {
// Step 1: Move Kubelet to a container, if required.
if kl.resourceContainer != "" {
// Fixme: I need to reside inside ContainerManager interface.
err := util.RunInResourceContainer(kl.resourceContainer)
if err != nil {
glog.Warningf("Failed to move Kubelet to container %q: %v", kl.resourceContainer, err)
}
glog.Infof("Running in container %q", kl.resourceContainer)
}
// Step 2: Promethues metrics.
// Step 1: Prometheus metrics.
metrics.Register(kl.runtimeCache)
// Step 3: Setup filesystem directories.
// Step 2: Setup filesystem directories.
if err := kl.setupDataDirs(); err != nil {
return err
}
// Step 4: If the container logs directory does not exist, create it.
// Step 3: If the container logs directory does not exist, create it.
if _, err := os.Stat(containerLogsDir); err != nil {
if err := kl.os.Mkdir(containerLogsDir, 0755); err != nil {
glog.Errorf("Failed to create directory %q: %v", containerLogsDir, err)
}
}
// Step 5: Start the image manager.
// Step 4: Start the image manager.
if err := kl.imageManager.Start(); err != nil {
return fmt.Errorf("Failed to start ImageManager, images may not be garbage collected: %v", err)
}
// Step 6: Start container manager.
if err := kl.containerManager.Start(kl.nodeConfig); err != nil {
// Step 5: Start container manager.
if err := kl.containerManager.Start(); err != nil {
return fmt.Errorf("Failed to start ContainerManager %v", err)
}
// Step 7: Start out of memory watcher.
// Step 6: Start out of memory watcher.
if err := kl.oomWatcher.Start(kl.nodeRef); err != nil {
return fmt.Errorf("Failed to start OOM watcher %v", err)
}
@ -3513,7 +3483,7 @@ func (kl *Kubelet) updatePodCIDR(cidr string) {
}
}
func (kl *Kubelet) GetNodeConfig() cm.NodeConfig {
return kl.nodeConfig
return kl.containerManager.GetNodeConfig()
}
var minRsrc = resource.MustParse("1k")

View File

@ -120,7 +120,7 @@ func (sb *summaryBuilder) build() (*Summary, error) {
systemContainers := map[string]string{
SystemContainerKubelet: sb.nodeConfig.KubeletContainerName,
SystemContainerRuntime: sb.nodeConfig.DockerDaemonContainerName, // TODO: add support for other runtimes
SystemContainerRuntime: sb.nodeConfig.RuntimeContainerName,
SystemContainerMisc: sb.nodeConfig.SystemContainerName,
}
for sys, name := range systemContainers {

View File

@ -48,9 +48,9 @@ func TestBuildSummary(t *testing.T) {
node := api.Node{}
node.Name = "FooNode"
nodeConfig := cm.NodeConfig{
DockerDaemonContainerName: "/docker-daemon",
SystemContainerName: "/system",
KubeletContainerName: "/kubelet",
RuntimeContainerName: "/docker-daemon",
SystemContainerName: "/system",
KubeletContainerName: "/kubelet",
}
const (
namespace0 = "test0"