mirror of
				https://github.com/k3s-io/kubernetes.git
				synced 2025-10-26 11:07:45 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			2644 lines
		
	
	
		
			85 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			2644 lines
		
	
	
		
			85 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| /*
 | |
| Copyright 2015 The Kubernetes Authors.
 | |
| 
 | |
| Licensed under the Apache License, Version 2.0 (the "License");
 | |
| you may not use this file except in compliance with the License.
 | |
| You may obtain a copy of the License at
 | |
| 
 | |
|     http://www.apache.org/licenses/LICENSE-2.0
 | |
| 
 | |
| Unless required by applicable law or agreed to in writing, software
 | |
| distributed under the License is distributed on an "AS IS" BASIS,
 | |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| See the License for the specific language governing permissions and
 | |
| limitations under the License.
 | |
| */
 | |
| 
 | |
| package rkt
 | |
| 
 | |
| import (
 | |
| 	"bufio"
 | |
| 	"bytes"
 | |
| 	"encoding/json"
 | |
| 	"fmt"
 | |
| 	"io"
 | |
| 	"io/ioutil"
 | |
| 	"os"
 | |
| 	"os/exec"
 | |
| 	"path"
 | |
| 	"path/filepath"
 | |
| 	"sort"
 | |
| 	"strconv"
 | |
| 	"strings"
 | |
| 	"sync"
 | |
| 	"syscall"
 | |
| 	"time"
 | |
| 
 | |
| 	appcschema "github.com/appc/spec/schema"
 | |
| 	appctypes "github.com/appc/spec/schema/types"
 | |
| 	"github.com/coreos/go-systemd/unit"
 | |
| 	rktapi "github.com/coreos/rkt/api/v1alpha"
 | |
| 	"github.com/golang/glog"
 | |
| 	"golang.org/x/net/context"
 | |
| 	"google.golang.org/grpc"
 | |
| 	"k8s.io/api/core/v1"
 | |
| 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 | |
| 	kubetypes "k8s.io/apimachinery/pkg/types"
 | |
| 	"k8s.io/apimachinery/pkg/util/errors"
 | |
| 	"k8s.io/apimachinery/pkg/util/uuid"
 | |
| 	utilwait "k8s.io/apimachinery/pkg/util/wait"
 | |
| 	"k8s.io/client-go/tools/record"
 | |
| 	"k8s.io/client-go/tools/remotecommand"
 | |
| 	"k8s.io/client-go/util/flowcontrol"
 | |
| 	"k8s.io/kubernetes/pkg/credentialprovider"
 | |
| 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
 | |
| 	"k8s.io/kubernetes/pkg/kubelet/events"
 | |
| 	"k8s.io/kubernetes/pkg/kubelet/images"
 | |
| 	"k8s.io/kubernetes/pkg/kubelet/leaky"
 | |
| 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
 | |
| 	"k8s.io/kubernetes/pkg/kubelet/network"
 | |
| 	"k8s.io/kubernetes/pkg/kubelet/network/hairpin"
 | |
| 	proberesults "k8s.io/kubernetes/pkg/kubelet/prober/results"
 | |
| 	"k8s.io/kubernetes/pkg/kubelet/types"
 | |
| 	"k8s.io/kubernetes/pkg/kubelet/util/format"
 | |
| 	"k8s.io/kubernetes/pkg/securitycontext"
 | |
| 	"k8s.io/kubernetes/pkg/util/selinux"
 | |
| 	utilstrings "k8s.io/kubernetes/pkg/util/strings"
 | |
| 	"k8s.io/kubernetes/pkg/util/term"
 | |
| 	utilexec "k8s.io/utils/exec"
 | |
| )
 | |
| 
 | |
const (
	// Name of this runtime and the default address of the rkt API service.
	RktType                      = "rkt"
	DefaultRktAPIServiceEndpoint = "localhost:15441"

	// Minimum version strings for the rkt binary, the rkt API service and systemd.
	minimumRktBinVersion = "1.13.0"

	minimumRktApiVersion  = "1.0.0-alpha"
	minimumSystemdVersion = "219"

	// Well-known host directories for systemd units and rkt data/configuration.
	systemdServiceDir = "/run/systemd/system"
	rktDataDir        = "/var/lib/rkt"
	rktLocalConfigDir = "/etc/rkt"

	// kubernetesUnitPrefix is prepended to the rkt pod UUID to form the unit
	// file name (see makePodServiceFileName). The remaining names are the
	// section and keys of the X-Kubernetes directive in a pod's unit file.
	kubernetesUnitPrefix    = "k8s_"
	unitKubernetesSection   = "X-Kubernetes"
	unitPodUID              = "PodUID"
	unitPodName             = "PodName"
	unitPodNamespace        = "PodNamespace"
	unitPodHostNetwork      = "PodHostNetwork"
	unitPodNetworkNamespace = "PodNetworkNamespace"

	// Annotation keys (and one value) written into rkt pod/app manifests.
	k8sRktKubeletAnno                = "rkt.kubernetes.io/managed-by-kubelet"
	k8sRktKubeletAnnoValue           = "true"
	k8sRktContainerHashAnno          = "rkt.kubernetes.io/container-hash"
	k8sRktRestartCountAnno           = "rkt.kubernetes.io/restart-count"
	k8sRktTerminationMessagePathAnno = "rkt.kubernetes.io/termination-message-path"

	// Annotation key for the LimitNOFILE systemd unit option.
	k8sRktLimitNoFileAnno = "systemd-unit-option.rkt.kubernetes.io/LimitNOFILE"

	// TODO(euank): This has significant security concerns as a stage1 image is
	// effectively root.
	// Furthermore, this (using an annotation) is a hack to pass an extra
	// non-portable argument in. It should not be relied on to be stable.
	// In the future, this might be subsumed by a first-class api object, or by a
	// kitchen-sink params object (#17064).
	// See discussion in #23944
	// Also, do we want more granularity than path-at-the-kubelet-level and
	// image/name-at-the-pod-level?
	k8sRktStage1NameAnno = "rkt.alpha.kubernetes.io/stage1-name-override"
	dockerPrefix         = "docker://"

	// authDir is a directory name for auth configuration. dockerAuthTemplate is
	// a format string for a rkt dockerAuth JSON document; its three %q verbs
	// are, in order, the registry, the user and the password.
	authDir            = "auth.d"
	dockerAuthTemplate = `{"rktKind":"dockerAuth","rktVersion":"v1","registries":[%q],"credentials":{"user":%q,"password":%q}}`

	// NOTE(review): same value as DefaultRktAPIServiceEndpoint above.
	defaultRktAPIServiceAddr = "localhost:15441"

	// ndots specifies the minimum number of dots that a domain name must contain
	// for the resolver to consider it an FQDN (fully-qualified domain name).
	// We want SRV lookup names like _dns._udp.kube-dns.default.svc to be
	// considered relative, hence ndots is set to 5.
	// TODO(yifan): Move this and dockertools.ndotsDNSOption to a common package.
	defaultDNSOption = "ndots:5"

	// Annotations for the ENTRYPOINT and CMD for an ACI that's converted from Docker image.
	// Taken from https://github.com/appc/docker2aci/blob/v0.12.3/lib/common/common.go#L33
	appcDockerEntrypoint  = "appc.io/docker/entrypoint"
	appcDockerCmd         = "appc.io/docker/cmd"
	appcDockerRegistryURL = "appc.io/docker/registryurl"
	appcDockerRepository  = "appc.io/docker/repository"

	// TODO(yifan): Reuse this const with Docker runtime.
	minimumGracePeriodInSeconds = 2

	// The network name of the network when no-op plugin is being used.
	// TODO(yifan): This is not ideal since today we cannot make the rkt's 'net.d' dir point to the
	// CNI directory specified by kubelet. Once that is fixed, we can just use the network config
	// under the CNI directory directly.
	// See https://github.com/coreos/rkt/pull/2312#issuecomment-200068370.
	defaultNetworkName = "rkt.kubernetes.io"

	// defaultRequestTimeout is the default timeout of rkt requests.
	// Value is slightly offset from 2 minutes to make timeouts due to this
	// constant recognizable.
	defaultRequestTimeout = 2*time.Minute - 1*time.Second

	// Host files that makeHostNetworkMount copies into the pod directory and
	// mounts at the same paths inside the container.
	etcHostsPath      = "/etc/hosts"
	etcResolvConfPath = "/etc/resolv.conf"
)
 | |
| 
 | |
// Runtime implements the container runtime interface for rkt. The
// implementation uses systemd, so in order to run this runtime, systemd must
// be installed on the machine.
type Runtime struct {
	// cli runs rkt command lines; New points it at the Runtime itself.
	cli cliInterface
	// systemd is the handle returned by newSystemd.
	systemd systemdInterface
	// The grpc client for rkt api-service.
	apisvcConn *grpc.ClientConn
	apisvc     rktapi.PublicAPIClient
	// config is the rkt configuration; New replaces it with the result of getConfig.
	config *Config
	// TODO(yifan): Refactor this to be generic keyring.
	dockerKeyring credentialprovider.DockerKeyring

	// Collaborators supplied by the caller of New (or built inside it).
	containerRefManager *kubecontainer.RefManager
	podDeletionProvider podDeletionProvider
	runtimeHelper       kubecontainer.RuntimeHelper
	recorder            record.EventRecorder
	livenessManager     proberesults.Manager
	imagePuller         images.ImageManager
	runner              kubecontainer.HandlerRunner
	execer              utilexec.Interface
	os                  kubecontainer.OSInterface

	// Network plugin manager.
	network *network.PluginManager

	// If true, the "hairpin mode" flag is set on container interfaces.
	// A false value means the kubelet just backs off from setting it,
	// it might already be true.
	configureHairpinMode bool

	// used for a systemd Exec, which requires the full path.
	touchPath   string
	nsenterPath string

	// versions is filled in by getVersions during New.
	versions versions

	// requestTimeout is the timeout of rkt requests.
	requestTimeout time.Duration

	// unitGetter reads pod information from systemd unit files; New points it
	// at the Runtime itself.
	unitGetter unitServiceGetter
}
 | |
| 
 | |
// podServiceDirective holds the fields of the X-Kubernetes directive of a
// pod's systemd service file.
type podServiceDirective struct {
	id               string
	name             string
	namespace        string
	hostNetwork      bool
	networkNamespace kubecontainer.ContainerID
}
 | |
| 
 | |
// Compile-time checks that *Runtime satisfies the kubelet runtime interfaces.
var _ kubecontainer.Runtime = &Runtime{}
var _ kubecontainer.DirectStreamingRuntime = &Runtime{}
 | |
| 
 | |
// podDeletionProvider can determine whether a pod has been deleted.
type podDeletionProvider interface {
	// IsPodDeleted reports whether the pod with the given UID is deleted.
	IsPodDeleted(kubetypes.UID) bool
}
 | |
| 
 | |
// cliInterface wraps the rkt command line calls so they can be replaced in tests.
type cliInterface interface {
	// RunCommand creates an rkt command and runs it with the given config.
	// If the config is nil, it will use the one inferred from rkt API service.
	RunCommand(config *Config, args ...string) (result []string, err error)
}
 | |
| 
 | |
// unitServiceGetter wraps access to systemd unit files so it can be replaced
// in tests.
type unitServiceGetter interface {
	getKubernetesDirective(string) (podServiceDirective, error)
	getNetworkNamespace(kubetypes.UID, *rktapi.Pod) (kubecontainer.ContainerID, error)
}
 | |
| 
 | |
| // New creates the rkt container runtime which implements the container runtime interface.
 | |
| // It will test if the rkt binary is in the $PATH, and whether we can get the
 | |
| // version of it. If so, creates the rkt container runtime, otherwise returns an error.
 | |
| func New(
 | |
| 	apiEndpoint string,
 | |
| 	config *Config,
 | |
| 	runtimeHelper kubecontainer.RuntimeHelper,
 | |
| 	recorder record.EventRecorder,
 | |
| 	containerRefManager *kubecontainer.RefManager,
 | |
| 	podDeletionProvider podDeletionProvider,
 | |
| 	livenessManager proberesults.Manager,
 | |
| 	httpClient types.HttpGetter,
 | |
| 	networkPlugin network.NetworkPlugin,
 | |
| 	hairpinMode bool,
 | |
| 	execer utilexec.Interface,
 | |
| 	os kubecontainer.OSInterface,
 | |
| 	imageBackOff *flowcontrol.Backoff,
 | |
| 	serializeImagePulls bool,
 | |
| 	imagePullQPS float32,
 | |
| 	imagePullBurst int,
 | |
| 	requestTimeout time.Duration,
 | |
| ) (*Runtime, error) {
 | |
| 	// Create dbus connection.
 | |
| 	systemd, err := newSystemd()
 | |
| 	if err != nil {
 | |
| 		return nil, fmt.Errorf("rkt: cannot create systemd interface: %v", err)
 | |
| 	}
 | |
| 
 | |
| 	// TODO(yifan): Use secure connection.
 | |
| 	apisvcConn, err := grpc.Dial(apiEndpoint, grpc.WithInsecure())
 | |
| 	if err != nil {
 | |
| 		return nil, fmt.Errorf("rkt: cannot connect to rkt api service: %v", err)
 | |
| 	}
 | |
| 
 | |
| 	// TODO(yifan): Get the rkt path from API service.
 | |
| 	if config.Path == "" {
 | |
| 		// No default rkt path was set, so try to find one in $PATH.
 | |
| 		var err error
 | |
| 		config.Path, err = execer.LookPath("rkt")
 | |
| 		if err != nil {
 | |
| 			return nil, fmt.Errorf("cannot find rkt binary: %v", err)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	touchPath, err := execer.LookPath("touch")
 | |
| 	if err != nil {
 | |
| 		return nil, fmt.Errorf("cannot find touch binary: %v", err)
 | |
| 	}
 | |
| 
 | |
| 	nsenterPath, err := execer.LookPath("nsenter")
 | |
| 	if err != nil {
 | |
| 		return nil, fmt.Errorf("cannot find nsenter binary: %v", err)
 | |
| 	}
 | |
| 
 | |
| 	if requestTimeout == 0 {
 | |
| 		requestTimeout = defaultRequestTimeout
 | |
| 	}
 | |
| 
 | |
| 	rkt := &Runtime{
 | |
| 		os:                  kubecontainer.RealOS{},
 | |
| 		systemd:             systemd,
 | |
| 		apisvcConn:          apisvcConn,
 | |
| 		apisvc:              rktapi.NewPublicAPIClient(apisvcConn),
 | |
| 		config:              config,
 | |
| 		dockerKeyring:       credentialprovider.NewDockerKeyring(),
 | |
| 		containerRefManager: containerRefManager,
 | |
| 		podDeletionProvider: podDeletionProvider,
 | |
| 		runtimeHelper:       runtimeHelper,
 | |
| 		recorder:            recorder,
 | |
| 		livenessManager:     livenessManager,
 | |
| 		network:             network.NewPluginManager(networkPlugin),
 | |
| 		execer:              execer,
 | |
| 		touchPath:           touchPath,
 | |
| 		nsenterPath:         nsenterPath,
 | |
| 		requestTimeout:      requestTimeout,
 | |
| 	}
 | |
| 
 | |
| 	rkt.config, err = rkt.getConfig(rkt.config)
 | |
| 	if err != nil {
 | |
| 		return nil, fmt.Errorf("rkt: cannot get config from rkt api service: %v", err)
 | |
| 	}
 | |
| 
 | |
| 	cmdRunner := kubecontainer.DirectStreamingRunner(rkt)
 | |
| 	rkt.runner = lifecycle.NewHandlerRunner(httpClient, cmdRunner, rkt)
 | |
| 
 | |
| 	rkt.imagePuller = images.NewImageManager(recorder, rkt, imageBackOff, serializeImagePulls, imagePullQPS, imagePullBurst)
 | |
| 
 | |
| 	if err := rkt.getVersions(); err != nil {
 | |
| 		return nil, fmt.Errorf("rkt: error getting version info: %v", err)
 | |
| 	}
 | |
| 
 | |
| 	rkt.cli = rkt
 | |
| 	rkt.unitGetter = rkt
 | |
| 
 | |
| 	return rkt, nil
 | |
| }
 | |
| 
 | |
| func buildCommand(config *Config, args ...string) *exec.Cmd {
 | |
| 	cmd := exec.Command(config.Path)
 | |
| 	cmd.Args = append(cmd.Args, config.buildGlobalOptions()...)
 | |
| 	cmd.Args = append(cmd.Args, args...)
 | |
| 	return cmd
 | |
| }
 | |
| 
 | |
| // convertToACName converts a string into ACName.
 | |
| func convertToACName(name string) appctypes.ACName {
 | |
| 	// Note that as the 'name' already matches 'DNS_LABEL'
 | |
| 	// defined in pkg/api/types.go, there shouldn't be error or panic.
 | |
| 	acname, _ := appctypes.SanitizeACName(name)
 | |
| 	return *appctypes.MustACName(acname)
 | |
| }
 | |
| 
 | |
| // RunCommand invokes rkt binary with arguments and returns the result
 | |
| // from stdout in a list of strings. Each string in the list is a line.
 | |
| // If config is non-nil, it will use the given config instead of the config
 | |
| // inferred from rkt API service.
 | |
| func (r *Runtime) RunCommand(config *Config, args ...string) ([]string, error) {
 | |
| 	if config == nil {
 | |
| 		config = r.config
 | |
| 	}
 | |
| 	glog.V(4).Infof("rkt: Run command: %q with config: %#v", args, config)
 | |
| 
 | |
| 	var stdout, stderr bytes.Buffer
 | |
| 
 | |
| 	cmd := buildCommand(config, args...)
 | |
| 	cmd.Stdout, cmd.Stderr = &stdout, &stderr
 | |
| 	if err := cmd.Run(); err != nil {
 | |
| 		return nil, fmt.Errorf("failed to run %v: %v\nstdout: %v\nstderr: %v", args, err, stdout.String(), stderr.String())
 | |
| 	}
 | |
| 	return strings.Split(strings.TrimSpace(stdout.String()), "\n"), nil
 | |
| }
 | |
| 
 | |
| // makePodServiceFileName constructs the unit file name for a pod using its rkt pod uuid.
 | |
| func makePodServiceFileName(uuid string) string {
 | |
| 	// TODO(yifan): Add name for readability? We need to consider the
 | |
| 	// limit of the length.
 | |
| 	return fmt.Sprintf("%s%s.service", kubernetesUnitPrefix, uuid)
 | |
| }
 | |
| 
 | |
| func getRktUUIDFromServiceFileName(filename string) string {
 | |
| 	return strings.TrimPrefix(strings.TrimSuffix(filename, path.Ext(filename)), kubernetesUnitPrefix)
 | |
| }
 | |
| 
 | |
| // setIsolators sets the apps' isolators according to the security context and resource spec.
 | |
| func setIsolators(app *appctypes.App, c *v1.Container, ctx *v1.SecurityContext) error {
 | |
| 	var isolators []appctypes.Isolator
 | |
| 
 | |
| 	// Capabilities isolators.
 | |
| 	if ctx != nil {
 | |
| 		var addCaps, dropCaps []string
 | |
| 
 | |
| 		if ctx.Capabilities != nil {
 | |
| 			addCaps, dropCaps = kubecontainer.MakeCapabilities(ctx.Capabilities.Add, ctx.Capabilities.Drop)
 | |
| 		}
 | |
| 		if ctx.Privileged != nil && *ctx.Privileged {
 | |
| 			addCaps, dropCaps = allCapabilities(), []string{}
 | |
| 		}
 | |
| 		if len(addCaps) > 0 {
 | |
| 			set, err := appctypes.NewLinuxCapabilitiesRetainSet(addCaps...)
 | |
| 			if err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 			isolator, err := set.AsIsolator()
 | |
| 			if err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 			isolators = append(isolators, *isolator)
 | |
| 		}
 | |
| 		if len(dropCaps) > 0 {
 | |
| 			set, err := appctypes.NewLinuxCapabilitiesRevokeSet(dropCaps...)
 | |
| 			if err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 			isolator, err := set.AsIsolator()
 | |
| 			if err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 			isolators = append(isolators, *isolator)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// Resources isolators.
 | |
| 	type resource struct {
 | |
| 		limit   string
 | |
| 		request string
 | |
| 	}
 | |
| 
 | |
| 	// If limit is empty, populate it with request and vice versa.
 | |
| 	resources := make(map[v1.ResourceName]*resource)
 | |
| 	for name, quantity := range c.Resources.Limits {
 | |
| 		resources[name] = &resource{limit: quantity.String(), request: quantity.String()}
 | |
| 	}
 | |
| 	for name, quantity := range c.Resources.Requests {
 | |
| 		r, ok := resources[name]
 | |
| 		if ok {
 | |
| 			r.request = quantity.String()
 | |
| 			continue
 | |
| 		}
 | |
| 		resources[name] = &resource{limit: quantity.String(), request: quantity.String()}
 | |
| 	}
 | |
| 
 | |
| 	for name, res := range resources {
 | |
| 		switch name {
 | |
| 		case v1.ResourceCPU:
 | |
| 			cpu, err := appctypes.NewResourceCPUIsolator(res.request, res.limit)
 | |
| 			if err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 			isolators = append(isolators, cpu.AsIsolator())
 | |
| 		case v1.ResourceMemory:
 | |
| 			memory, err := appctypes.NewResourceMemoryIsolator(res.request, res.limit)
 | |
| 			if err != nil {
 | |
| 				return err
 | |
| 			}
 | |
| 			isolators = append(isolators, memory.AsIsolator())
 | |
| 		default:
 | |
| 			return fmt.Errorf("resource type not supported: %v", name)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	if ok := securitycontext.AddNoNewPrivileges(ctx); ok {
 | |
| 		isolator, err := newNoNewPrivilegesIsolator(true)
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		isolators = append(isolators, *isolator)
 | |
| 	}
 | |
| 
 | |
| 	mergeIsolators(app, isolators)
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // mergeIsolators replaces the app.Isolators with isolators.
 | |
| func mergeIsolators(app *appctypes.App, isolators []appctypes.Isolator) {
 | |
| 	for _, is := range isolators {
 | |
| 		found := false
 | |
| 		for j, js := range app.Isolators {
 | |
| 			if is.Name.Equals(js.Name) {
 | |
| 				switch is.Name {
 | |
| 				case appctypes.LinuxCapabilitiesRetainSetName:
 | |
| 					// TODO(yifan): More fine grain merge for capability set instead of override.
 | |
| 					fallthrough
 | |
| 				case appctypes.LinuxCapabilitiesRevokeSetName:
 | |
| 					fallthrough
 | |
| 				case appctypes.ResourceCPUName:
 | |
| 					fallthrough
 | |
| 				case appctypes.ResourceMemoryName:
 | |
| 					app.Isolators[j] = is
 | |
| 				default:
 | |
| 					panic(fmt.Sprintf("unexpected isolator name: %v", is.Name))
 | |
| 				}
 | |
| 				found = true
 | |
| 				break
 | |
| 			}
 | |
| 		}
 | |
| 		if !found {
 | |
| 			app.Isolators = append(app.Isolators, is)
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // mergeEnv merges the optEnv with the image's environments.
 | |
| // The environments defined in the image will be overridden by
 | |
| // the ones with the same name in optEnv.
 | |
| func mergeEnv(app *appctypes.App, optEnv []kubecontainer.EnvVar) {
 | |
| 	envMap := make(map[string]string)
 | |
| 	for _, e := range app.Environment {
 | |
| 		envMap[e.Name] = e.Value
 | |
| 	}
 | |
| 	for _, e := range optEnv {
 | |
| 		envMap[e.Name] = e.Value
 | |
| 	}
 | |
| 	app.Environment = nil
 | |
| 	for name, value := range envMap {
 | |
| 		app.Environment = append(app.Environment, appctypes.EnvironmentVariable{
 | |
| 			Name:  name,
 | |
| 			Value: value,
 | |
| 		})
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // mergeMounts merges the mountPoints with the image's mount points.
 | |
| // The mount points defined in the image will be overridden by the ones
 | |
| // with the same container path.
 | |
| func mergeMounts(app *appctypes.App, mountPoints []appctypes.MountPoint) {
 | |
| 	mountMap := make(map[string]appctypes.MountPoint)
 | |
| 	for _, m := range app.MountPoints {
 | |
| 		mountMap[m.Path] = m
 | |
| 	}
 | |
| 	for _, m := range mountPoints {
 | |
| 		mountMap[m.Path] = m
 | |
| 	}
 | |
| 	app.MountPoints = nil
 | |
| 	for _, mount := range mountMap {
 | |
| 		app.MountPoints = append(app.MountPoints, mount)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // mergePortMappings merges the containerPorts with the image's container ports.
 | |
| // The port mappings defined in the image will be overridden by the ones
 | |
| // with the same name in optPortMappings.
 | |
| func mergePortMappings(app *appctypes.App, containerPorts []appctypes.Port) {
 | |
| 	portMap := make(map[appctypes.ACName]appctypes.Port)
 | |
| 	for _, p := range app.Ports {
 | |
| 		portMap[p.Name] = p
 | |
| 	}
 | |
| 	for _, p := range containerPorts {
 | |
| 		portMap[p.Name] = p
 | |
| 	}
 | |
| 	app.Ports = nil
 | |
| 	for _, port := range portMap {
 | |
| 		app.Ports = append(app.Ports, port)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func verifyNonRoot(app *appctypes.App, ctx *v1.SecurityContext) error {
 | |
| 	if ctx != nil && ctx.RunAsNonRoot != nil && *ctx.RunAsNonRoot {
 | |
| 		if ctx.RunAsUser != nil && *ctx.RunAsUser == 0 {
 | |
| 			return fmt.Errorf("container's runAsUser breaks non-root policy")
 | |
| 		}
 | |
| 		if ctx.RunAsUser == nil && app.User == "0" {
 | |
| 			return fmt.Errorf("container has no runAsUser and image will run as root")
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func setSupplementalGIDs(app *appctypes.App, podCtx *v1.PodSecurityContext, supplementalGids []int64) {
 | |
| 	if podCtx != nil || len(supplementalGids) != 0 {
 | |
| 		app.SupplementaryGIDs = app.SupplementaryGIDs[:0]
 | |
| 	}
 | |
| 	if podCtx != nil {
 | |
| 		for _, v := range podCtx.SupplementalGroups {
 | |
| 			app.SupplementaryGIDs = append(app.SupplementaryGIDs, int(v))
 | |
| 		}
 | |
| 		if podCtx.FSGroup != nil {
 | |
| 			app.SupplementaryGIDs = append(app.SupplementaryGIDs, int(*podCtx.FSGroup))
 | |
| 		}
 | |
| 	}
 | |
| 	for _, v := range supplementalGids {
 | |
| 		app.SupplementaryGIDs = append(app.SupplementaryGIDs, int(v))
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // setApp merges the container spec with the image's manifest.
 | |
| func setApp(imgManifest *appcschema.ImageManifest, c *v1.Container,
 | |
| 	mountPoints []appctypes.MountPoint, containerPorts []appctypes.Port, envs []kubecontainer.EnvVar,
 | |
| 	ctx *v1.SecurityContext, podCtx *v1.PodSecurityContext, supplementalGids []int64) error {
 | |
| 
 | |
| 	app := imgManifest.App
 | |
| 
 | |
| 	// Set up Exec.
 | |
| 	var command, args []string
 | |
| 	cmd, ok := imgManifest.Annotations.Get(appcDockerEntrypoint)
 | |
| 	if ok {
 | |
| 		err := json.Unmarshal([]byte(cmd), &command)
 | |
| 		if err != nil {
 | |
| 			return fmt.Errorf("cannot unmarshal ENTRYPOINT %q: %v", cmd, err)
 | |
| 		}
 | |
| 	}
 | |
| 	ag, ok := imgManifest.Annotations.Get(appcDockerCmd)
 | |
| 	if ok {
 | |
| 		err := json.Unmarshal([]byte(ag), &args)
 | |
| 		if err != nil {
 | |
| 			return fmt.Errorf("cannot unmarshal CMD %q: %v", ag, err)
 | |
| 		}
 | |
| 	}
 | |
| 	userCommand, userArgs := kubecontainer.ExpandContainerCommandAndArgs(c, envs)
 | |
| 
 | |
| 	if len(userCommand) > 0 {
 | |
| 		command = userCommand
 | |
| 		args = nil // If 'command' is specified, then drop the default args.
 | |
| 	}
 | |
| 	if len(userArgs) > 0 {
 | |
| 		args = userArgs
 | |
| 	}
 | |
| 
 | |
| 	exec := append(command, args...)
 | |
| 	if len(exec) > 0 {
 | |
| 		app.Exec = exec
 | |
| 	}
 | |
| 
 | |
| 	// Set UID and GIDs.
 | |
| 	if err := verifyNonRoot(app, ctx); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	if ctx != nil && ctx.RunAsUser != nil {
 | |
| 		app.User = strconv.Itoa(int(*ctx.RunAsUser))
 | |
| 	}
 | |
| 	setSupplementalGIDs(app, podCtx, supplementalGids)
 | |
| 
 | |
| 	// If 'User' or 'Group' are still empty at this point,
 | |
| 	// then apply the root UID and GID.
 | |
| 	// TODO(yifan): If only the GID is empty, rkt should be able to determine the GID
 | |
| 	// using the /etc/passwd file in the image.
 | |
| 	// See https://github.com/appc/docker2aci/issues/175.
 | |
| 	// Maybe we can remove this check in the future.
 | |
| 	if app.User == "" {
 | |
| 		app.User = "0"
 | |
| 		app.Group = "0"
 | |
| 	}
 | |
| 	if app.Group == "" {
 | |
| 		return fmt.Errorf("cannot determine the GID of the app %q", imgManifest.Name)
 | |
| 	}
 | |
| 
 | |
| 	// Set working directory.
 | |
| 	if len(c.WorkingDir) > 0 {
 | |
| 		app.WorkingDirectory = c.WorkingDir
 | |
| 	}
 | |
| 
 | |
| 	// Notes that we don't create Mounts section in the pod manifest here,
 | |
| 	// as Mounts will be automatically generated by rkt.
 | |
| 	mergeMounts(app, mountPoints)
 | |
| 	mergeEnv(app, envs)
 | |
| 	mergePortMappings(app, containerPorts)
 | |
| 
 | |
| 	return setIsolators(app, c, ctx)
 | |
| }
 | |
| 
 | |
| // makePodManifest transforms a kubelet pod spec to the rkt pod manifest.
 | |
| func (r *Runtime) makePodManifest(pod *v1.Pod, podIP string, pullSecrets []v1.Secret) (*appcschema.PodManifest, error) {
 | |
| 	manifest := appcschema.BlankPodManifest()
 | |
| 
 | |
| 	ctx, cancel := context.WithTimeout(context.Background(), r.requestTimeout)
 | |
| 	defer cancel()
 | |
| 	listResp, err := r.apisvc.ListPods(ctx, &rktapi.ListPodsRequest{
 | |
| 		Detail:  true,
 | |
| 		Filters: kubernetesPodFilters(pod.UID),
 | |
| 	})
 | |
| 	if err != nil {
 | |
| 		return nil, fmt.Errorf("couldn't list pods: %v", err)
 | |
| 	}
 | |
| 
 | |
| 	restartCount := 0
 | |
| 	for _, pod := range listResp.Pods {
 | |
| 		manifest := &appcschema.PodManifest{}
 | |
| 		err = json.Unmarshal(pod.Manifest, manifest)
 | |
| 		if err != nil {
 | |
| 			glog.Warningf("rkt: error unmatshaling pod manifest: %v", err)
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		if countString, ok := manifest.Annotations.Get(k8sRktRestartCountAnno); ok {
 | |
| 			num, err := strconv.Atoi(countString)
 | |
| 			if err != nil {
 | |
| 				glog.Warningf("rkt: error reading restart count on pod: %v", err)
 | |
| 				continue
 | |
| 			}
 | |
| 			if num+1 > restartCount {
 | |
| 				restartCount = num + 1
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	requiresPrivileged := false
 | |
| 	manifest.Annotations.Set(*appctypes.MustACIdentifier(k8sRktKubeletAnno), k8sRktKubeletAnnoValue)
 | |
| 	manifest.Annotations.Set(*appctypes.MustACIdentifier(types.KubernetesPodUIDLabel), string(pod.UID))
 | |
| 	manifest.Annotations.Set(*appctypes.MustACIdentifier(types.KubernetesPodNameLabel), pod.Name)
 | |
| 	manifest.Annotations.Set(*appctypes.MustACIdentifier(types.KubernetesPodNamespaceLabel), pod.Namespace)
 | |
| 	manifest.Annotations.Set(*appctypes.MustACIdentifier(types.KubernetesContainerNameLabel), leaky.PodInfraContainerName)
 | |
| 	manifest.Annotations.Set(*appctypes.MustACIdentifier(k8sRktRestartCountAnno), strconv.Itoa(restartCount))
 | |
| 	if stage1Name, ok := pod.Annotations[k8sRktStage1NameAnno]; ok {
 | |
| 		requiresPrivileged = true
 | |
| 		manifest.Annotations.Set(*appctypes.MustACIdentifier(k8sRktStage1NameAnno), stage1Name)
 | |
| 	}
 | |
| 
 | |
| 	for _, c := range pod.Spec.Containers {
 | |
| 		err := r.newAppcRuntimeApp(pod, podIP, c, requiresPrivileged, pullSecrets, manifest)
 | |
| 		if err != nil {
 | |
| 			return nil, err
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// TODO(yifan): Set pod-level isolators once it's supported in kubernetes.
 | |
| 	return manifest, nil
 | |
| }
 | |
| 
 | |
// copyfile reads the whole of src into memory and writes it to dst, creating
// dst with mode 0644 (before umask) or truncating it if it already exists.
// It returns the first read or write error encountered.
func copyfile(src, dst string) error {
	const dstPerm = 0644

	contents, readErr := ioutil.ReadFile(src)
	if readErr != nil {
		return readErr
	}

	writeErr := ioutil.WriteFile(dst, contents, dstPerm)
	return writeErr
}
 | |
| 
 | |
| // TODO(yifan): Can make rkt handle this when '--net=host'. See https://github.com/coreos/rkt/issues/2430.
 | |
| func makeHostNetworkMount(opts *kubecontainer.RunContainerOptions) (*kubecontainer.Mount, *kubecontainer.Mount, error) {
 | |
| 	mountHosts, mountResolvConf := true, true
 | |
| 	for _, mnt := range opts.Mounts {
 | |
| 		switch mnt.ContainerPath {
 | |
| 		case etcHostsPath:
 | |
| 			mountHosts = false
 | |
| 		case etcResolvConfPath:
 | |
| 			mountResolvConf = false
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	var hostsMount, resolvMount kubecontainer.Mount
 | |
| 	if mountHosts {
 | |
| 		hostsPath := filepath.Join(opts.PodContainerDir, "etc-hosts")
 | |
| 		if err := copyfile(etcHostsPath, hostsPath); err != nil {
 | |
| 			return nil, nil, err
 | |
| 		}
 | |
| 		hostsMount = kubecontainer.Mount{
 | |
| 			Name:          "kubernetes-hostnetwork-hosts-conf",
 | |
| 			ContainerPath: etcHostsPath,
 | |
| 			HostPath:      hostsPath,
 | |
| 		}
 | |
| 		opts.Mounts = append(opts.Mounts, hostsMount)
 | |
| 	}
 | |
| 
 | |
| 	if mountResolvConf {
 | |
| 		resolvPath := filepath.Join(opts.PodContainerDir, "etc-resolv-conf")
 | |
| 		if err := copyfile(etcResolvConfPath, resolvPath); err != nil {
 | |
| 			return nil, nil, err
 | |
| 		}
 | |
| 		resolvMount = kubecontainer.Mount{
 | |
| 			Name:          "kubernetes-hostnetwork-resolv-conf",
 | |
| 			ContainerPath: etcResolvConfPath,
 | |
| 			HostPath:      resolvPath,
 | |
| 		}
 | |
| 		opts.Mounts = append(opts.Mounts, resolvMount)
 | |
| 	}
 | |
| 	return &hostsMount, &resolvMount, nil
 | |
| }
 | |
| 
 | |
// podFinishedMarkerPath returns the path of the marker file that records a
// pod's exit: "finished-<rktUID>" inside podDir.
// If no file exists at that path, the pod should be treated as not exited.
// If it exists, its modification time (which is what podFinishedAt reads)
// indicates when the pod exited.
func podFinishedMarkerPath(podDir string, rktUID string) string {
	const markerPrefix = "finished-"
	markerName := markerPrefix + rktUID
	return filepath.Join(podDir, markerName)
}
 | |
| 
 | |
| func podFinishedMarkCommand(touchPath, podDir, rktUID string) string {
 | |
| 	// TODO, if the path has a `'` character in it, this breaks.
 | |
| 	return touchPath + " " + podFinishedMarkerPath(podDir, rktUID)
 | |
| }
 | |
| 
 | |
| // podFinishedAt returns the time that a pod exited, or a zero time if it has
 | |
| // not.
 | |
| func (r *Runtime) podFinishedAt(podUID kubetypes.UID, rktUID string) time.Time {
 | |
| 	markerFile := podFinishedMarkerPath(r.runtimeHelper.GetPodDir(podUID), rktUID)
 | |
| 	stat, err := r.os.Stat(markerFile)
 | |
| 	if err != nil {
 | |
| 		if !os.IsNotExist(err) {
 | |
| 			glog.Warningf("rkt: unexpected fs error checking pod finished marker: %v", err)
 | |
| 		}
 | |
| 		return time.Time{}
 | |
| 	}
 | |
| 	return stat.ModTime()
 | |
| }
 | |
| 
 | |
| func (r *Runtime) makeContainerLogMount(opts *kubecontainer.RunContainerOptions, container *v1.Container) (*kubecontainer.Mount, error) {
 | |
| 	if opts.PodContainerDir == "" || container.TerminationMessagePath == "" {
 | |
| 		return nil, nil
 | |
| 	}
 | |
| 
 | |
| 	// In docker runtime, the container log path contains the container ID.
 | |
| 	// However, for rkt runtime, we cannot get the container ID before the
 | |
| 	// the container is launched, so here we generate a random uuid to enable
 | |
| 	// us to map a container's termination message path to a unique log file
 | |
| 	// on the disk.
 | |
| 	randomUID := uuid.NewUUID()
 | |
| 	containerLogPath := path.Join(opts.PodContainerDir, string(randomUID))
 | |
| 	fs, err := r.os.Create(containerLogPath)
 | |
| 	if err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 
 | |
| 	if err := fs.Close(); err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 
 | |
| 	mnt := kubecontainer.Mount{
 | |
| 		// Use a random name for the termination message mount, so that
 | |
| 		// when a container restarts, it will not overwrite the old termination
 | |
| 		// message.
 | |
| 		Name:          fmt.Sprintf("termination-message-%s", randomUID),
 | |
| 		ContainerPath: container.TerminationMessagePath,
 | |
| 		HostPath:      containerLogPath,
 | |
| 		ReadOnly:      false,
 | |
| 	}
 | |
| 	opts.Mounts = append(opts.Mounts, mnt)
 | |
| 
 | |
| 	return &mnt, nil
 | |
| }
 | |
| 
 | |
| func (r *Runtime) newAppcRuntimeApp(pod *v1.Pod, podIP string, c v1.Container, requiresPrivileged bool, pullSecrets []v1.Secret, manifest *appcschema.PodManifest) error {
 | |
| 	var annotations appctypes.Annotations = []appctypes.Annotation{
 | |
| 		{
 | |
| 			Name:  *appctypes.MustACIdentifier(k8sRktContainerHashAnno),
 | |
| 			Value: strconv.FormatUint(kubecontainer.HashContainerLegacy(&c), 10),
 | |
| 		},
 | |
| 		{
 | |
| 			Name:  *appctypes.MustACIdentifier(types.KubernetesContainerNameLabel),
 | |
| 			Value: c.Name,
 | |
| 		},
 | |
| 	}
 | |
| 
 | |
| 	if requiresPrivileged && !securitycontext.HasPrivilegedRequest(&c) {
 | |
| 		return fmt.Errorf("cannot make %q: running a custom stage1 requires a privileged security context", format.Pod(pod))
 | |
| 	}
 | |
| 	imageRef, _, err := r.imagePuller.EnsureImageExists(pod, &c, pullSecrets)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	imgManifest, err := r.getImageManifest(c.Image)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	if imgManifest.App == nil {
 | |
| 		imgManifest.App = new(appctypes.App)
 | |
| 	}
 | |
| 
 | |
| 	hash, err := appctypes.NewHash(imageRef)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	// TODO: determine how this should be handled for rkt
 | |
| 	opts, _, err := r.runtimeHelper.GenerateRunContainerOptions(pod, &c, podIP)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	// Create additional mount for termination message path.
 | |
| 	mount, err := r.makeContainerLogMount(opts, &c)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	mounts := append(opts.Mounts, *mount)
 | |
| 	annotations = append(annotations, appctypes.Annotation{
 | |
| 		Name:  *appctypes.MustACIdentifier(k8sRktTerminationMessagePathAnno),
 | |
| 		Value: mount.HostPath,
 | |
| 	})
 | |
| 
 | |
| 	// If run in 'hostnetwork' mode, then copy the host's /etc/resolv.conf and /etc/hosts,
 | |
| 	// and add mounts.
 | |
| 	if kubecontainer.IsHostNetworkPod(pod) {
 | |
| 		hostsMount, resolvMount, err := makeHostNetworkMount(opts)
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		mounts = append(mounts, *hostsMount, *resolvMount)
 | |
| 	}
 | |
| 
 | |
| 	supplementalGids := r.runtimeHelper.GetExtraSupplementalGroupsForPod(pod)
 | |
| 	ctx := securitycontext.DetermineEffectiveSecurityContext(pod, &c)
 | |
| 
 | |
| 	volumes, mountPoints := convertKubeMounts(mounts)
 | |
| 	containerPorts, hostPorts := convertKubePortMappings(opts.PortMappings)
 | |
| 
 | |
| 	if err := setApp(imgManifest, &c, mountPoints, containerPorts, opts.Envs, ctx, pod.Spec.SecurityContext, supplementalGids); err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	ra := appcschema.RuntimeApp{
 | |
| 		Name:        convertToACName(c.Name),
 | |
| 		Image:       appcschema.RuntimeImage{ID: *hash},
 | |
| 		App:         imgManifest.App,
 | |
| 		Annotations: annotations,
 | |
| 	}
 | |
| 
 | |
| 	if c.SecurityContext != nil && c.SecurityContext.ReadOnlyRootFilesystem != nil {
 | |
| 		ra.ReadOnlyRootFS = *c.SecurityContext.ReadOnlyRootFilesystem
 | |
| 	}
 | |
| 
 | |
| 	manifest.Apps = append(manifest.Apps, ra)
 | |
| 	manifest.Volumes = append(manifest.Volumes, volumes...)
 | |
| 	manifest.Ports = append(manifest.Ports, hostPorts...)
 | |
| 
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func runningKubernetesPodFilters(uid kubetypes.UID) []*rktapi.PodFilter {
 | |
| 	return []*rktapi.PodFilter{
 | |
| 		{
 | |
| 			States: []rktapi.PodState{
 | |
| 				rktapi.PodState_POD_STATE_RUNNING,
 | |
| 			},
 | |
| 			Annotations: []*rktapi.KeyValue{
 | |
| 				{
 | |
| 					Key:   k8sRktKubeletAnno,
 | |
| 					Value: k8sRktKubeletAnnoValue,
 | |
| 				},
 | |
| 				{
 | |
| 					Key:   types.KubernetesPodUIDLabel,
 | |
| 					Value: string(uid),
 | |
| 				},
 | |
| 			},
 | |
| 		},
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func kubernetesPodFilters(uid kubetypes.UID) []*rktapi.PodFilter {
 | |
| 	return []*rktapi.PodFilter{
 | |
| 		{
 | |
| 			Annotations: []*rktapi.KeyValue{
 | |
| 				{
 | |
| 					Key:   k8sRktKubeletAnno,
 | |
| 					Value: k8sRktKubeletAnnoValue,
 | |
| 				},
 | |
| 				{
 | |
| 					Key:   types.KubernetesPodUIDLabel,
 | |
| 					Value: string(uid),
 | |
| 				},
 | |
| 			},
 | |
| 		},
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func kubernetesPodsFilters() []*rktapi.PodFilter {
 | |
| 	return []*rktapi.PodFilter{
 | |
| 		{
 | |
| 			Annotations: []*rktapi.KeyValue{
 | |
| 				{
 | |
| 					Key:   k8sRktKubeletAnno,
 | |
| 					Value: k8sRktKubeletAnnoValue,
 | |
| 				},
 | |
| 			},
 | |
| 		},
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func newUnitOption(section, name, value string) *unit.UnitOption {
 | |
| 	return &unit.UnitOption{Section: section, Name: name, Value: value}
 | |
| }
 | |
| 
 | |
| // apiPodToruntimePod converts an v1.Pod to kubelet/container.Pod.
 | |
| func apiPodToruntimePod(uuid string, pod *v1.Pod) *kubecontainer.Pod {
 | |
| 	p := &kubecontainer.Pod{
 | |
| 		ID:        pod.UID,
 | |
| 		Name:      pod.Name,
 | |
| 		Namespace: pod.Namespace,
 | |
| 	}
 | |
| 	for i := range pod.Spec.Containers {
 | |
| 		c := &pod.Spec.Containers[i]
 | |
| 		p.Containers = append(p.Containers, &kubecontainer.Container{
 | |
| 			ID:    buildContainerID(&containerID{uuid, c.Name}),
 | |
| 			Name:  c.Name,
 | |
| 			Image: c.Image,
 | |
| 			Hash:  kubecontainer.HashContainerLegacy(c),
 | |
| 		})
 | |
| 	}
 | |
| 	return p
 | |
| }
 | |
| 
 | |
| // serviceFilePath returns the absolute path of the service file.
 | |
| func serviceFilePath(serviceName string) string {
 | |
| 	return path.Join(systemdServiceDir, serviceName)
 | |
| }
 | |
| 
 | |
| // shouldCreateNetns returns true if:
 | |
| // The pod does not run in host network. And
 | |
| // The pod runs inside a netns created outside of rkt.
 | |
| func (r *Runtime) shouldCreateNetns(pod *v1.Pod) bool {
 | |
| 	return !kubecontainer.IsHostNetworkPod(pod) && r.network.PluginName() != network.DefaultPluginName
 | |
| }
 | |
| 
 | |
| // usesRktHostNetwork returns true if:
 | |
| // The pod runs in the host network. Or
 | |
| // The pod runs inside a netns created outside of rkt.
 | |
| func (r *Runtime) usesRktHostNetwork(pod *v1.Pod) bool {
 | |
| 	return kubecontainer.IsHostNetworkPod(pod) || r.shouldCreateNetns(pod)
 | |
| }
 | |
| 
 | |
| // generateRunCommand crafts a 'rkt run-prepared' command with necessary parameters.
 | |
| func (r *Runtime) generateRunCommand(pod *v1.Pod, uuid, networkNamespaceID string) (string, error) {
 | |
| 	config := *r.config
 | |
| 	privileged := true
 | |
| 
 | |
| 	for _, c := range pod.Spec.Containers {
 | |
| 		ctx := securitycontext.DetermineEffectiveSecurityContext(pod, &c)
 | |
| 		if ctx == nil || ctx.Privileged == nil || *ctx.Privileged == false {
 | |
| 			privileged = false
 | |
| 			break
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// Use "all-run" insecure option (https://github.com/coreos/rkt/pull/2983) to take care
 | |
| 	// of privileged pod.
 | |
| 	// TODO(yifan): Have more granular app-level control of the insecure options.
 | |
| 	// See: https://github.com/coreos/rkt/issues/2996.
 | |
| 	if privileged {
 | |
| 		config.InsecureOptions = fmt.Sprintf("%s,%s", config.InsecureOptions, "all-run")
 | |
| 	}
 | |
| 
 | |
| 	runPrepared := buildCommand(&config, "run-prepared").Args
 | |
| 
 | |
| 	var hostname string
 | |
| 	var err error
 | |
| 
 | |
| 	osInfos, err := getOSReleaseInfo()
 | |
| 	if err != nil {
 | |
| 		glog.Warningf("rkt: Failed to read the os release info: %v", err)
 | |
| 	} else {
 | |
| 		// Overlay fs is not supported for SELinux yet on many distros.
 | |
| 		// See https://github.com/coreos/rkt/issues/1727#issuecomment-173203129.
 | |
| 		// For now, coreos carries a patch to support it: https://github.com/coreos/coreos-overlay/pull/1703
 | |
| 		if osInfos["ID"] != "coreos" && pod.Spec.SecurityContext != nil && pod.Spec.SecurityContext.SELinuxOptions != nil {
 | |
| 			runPrepared = append(runPrepared, "--no-overlay=true")
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// Apply '--net=host' to pod that is running on host network or inside a network namespace.
 | |
| 	if r.usesRktHostNetwork(pod) {
 | |
| 		runPrepared = append(runPrepared, "--net=host")
 | |
| 	} else {
 | |
| 		runPrepared = append(runPrepared, fmt.Sprintf("--net=%s", defaultNetworkName))
 | |
| 	}
 | |
| 
 | |
| 	if kubecontainer.IsHostNetworkPod(pod) {
 | |
| 		// TODO(yifan): Let runtimeHelper.GeneratePodHostNameAndDomain() to handle this.
 | |
| 		hostname, err = r.os.Hostname()
 | |
| 		if err != nil {
 | |
| 			return "", err
 | |
| 		}
 | |
| 	} else {
 | |
| 		// Setup DNS.
 | |
| 		dnsServers, dnsSearches, _, err := r.runtimeHelper.GetClusterDNS(pod)
 | |
| 		if err != nil {
 | |
| 			return "", err
 | |
| 		}
 | |
| 		for _, server := range dnsServers {
 | |
| 			runPrepared = append(runPrepared, fmt.Sprintf("--dns=%s", server))
 | |
| 		}
 | |
| 		for _, search := range dnsSearches {
 | |
| 			runPrepared = append(runPrepared, fmt.Sprintf("--dns-search=%s", search))
 | |
| 		}
 | |
| 		if len(dnsServers) > 0 || len(dnsSearches) > 0 {
 | |
| 			runPrepared = append(runPrepared, fmt.Sprintf("--dns-opt=%s", defaultDNSOption))
 | |
| 		}
 | |
| 
 | |
| 		// TODO(yifan): host domain is not being used.
 | |
| 		hostname, _, err = r.runtimeHelper.GeneratePodHostNameAndDomain(pod)
 | |
| 		if err != nil {
 | |
| 			return "", err
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	runPrepared = append(runPrepared, fmt.Sprintf("--hostname=%s", hostname))
 | |
| 	runPrepared = append(runPrepared, uuid)
 | |
| 
 | |
| 	if r.shouldCreateNetns(pod) {
 | |
| 		// Drop the `rkt run-prepared` into the network namespace we
 | |
| 		// created.
 | |
| 		// TODO: switch to 'ip netns exec' once we can depend on a new
 | |
| 		// enough version that doesn't have bugs like
 | |
| 		// https://bugzilla.redhat.com/show_bug.cgi?id=882047
 | |
| 		nsenterExec := []string{r.nsenterPath, "--net=" + netnsPathFromName(networkNamespaceID), "--"}
 | |
| 		runPrepared = append(nsenterExec, runPrepared...)
 | |
| 	}
 | |
| 
 | |
| 	return strings.Join(runPrepared, " "), nil
 | |
| }
 | |
| 
 | |
| func (r *Runtime) cleanupPodNetwork(pod *v1.Pod, networkNamespace kubecontainer.ContainerID) error {
 | |
| 	// No-op if the pod is not running in a created netns.
 | |
| 	if !r.shouldCreateNetns(pod) {
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	glog.V(3).Infof("Calling network plugin %s to tear down pod for %s", r.network.PluginName(), format.Pod(pod))
 | |
| 	teardownErr := r.network.TearDownPod(pod.Namespace, pod.Name, networkNamespace)
 | |
| 	if teardownErr != nil {
 | |
| 		glog.Error(teardownErr)
 | |
| 	}
 | |
| 
 | |
| 	if _, err := r.execer.Command("ip", "netns", "del", networkNamespace.ID).Output(); err != nil {
 | |
| 		return fmt.Errorf("rkt: Failed to remove network namespace for pod %s: %v", format.Pod(pod), err)
 | |
| 	}
 | |
| 
 | |
| 	return teardownErr
 | |
| }
 | |
| 
 | |
| func (r *Runtime) preparePodArgs(manifest *appcschema.PodManifest, manifestFileName string) []string {
 | |
| 	// Order of precedence for the stage1:
 | |
| 	// 1) pod annotation (stage1 name)
 | |
| 	// 2) kubelet configured stage1 (stage1 path)
 | |
| 	// 3) empty; whatever rkt's compiled to default to
 | |
| 	stage1ImageCmd := ""
 | |
| 	if r.config.Stage1Image != "" {
 | |
| 		stage1ImageCmd = "--stage1-name=" + r.config.Stage1Image
 | |
| 	}
 | |
| 	if stage1Name, ok := manifest.Annotations.Get(k8sRktStage1NameAnno); ok {
 | |
| 		stage1ImageCmd = "--stage1-name=" + stage1Name
 | |
| 	}
 | |
| 
 | |
| 	// Run 'rkt prepare' to get the rkt UUID.
 | |
| 	cmds := []string{"prepare", "--quiet", "--pod-manifest", manifestFileName}
 | |
| 	if stage1ImageCmd != "" {
 | |
| 		cmds = append(cmds, stage1ImageCmd)
 | |
| 	}
 | |
| 	return cmds
 | |
| }
 | |
| 
 | |
| func (r *Runtime) getSelinuxContext(opt *v1.SELinuxOptions) (string, error) {
 | |
| 	selinuxRunner := selinux.NewSELinuxRunner()
 | |
| 	str, err := selinuxRunner.Getfilecon(r.config.Dir)
 | |
| 	if err != nil {
 | |
| 		return "", err
 | |
| 	}
 | |
| 
 | |
| 	ctx := strings.SplitN(str, ":", 4)
 | |
| 	if len(ctx) != 4 {
 | |
| 		return "", fmt.Errorf("malformated selinux context")
 | |
| 	}
 | |
| 
 | |
| 	if opt.User != "" {
 | |
| 		ctx[0] = opt.User
 | |
| 	}
 | |
| 	if opt.Role != "" {
 | |
| 		ctx[1] = opt.Role
 | |
| 	}
 | |
| 	if opt.Type != "" {
 | |
| 		ctx[2] = opt.Type
 | |
| 	}
 | |
| 	if opt.Level != "" {
 | |
| 		ctx[3] = opt.Level
 | |
| 	}
 | |
| 
 | |
| 	return strings.Join(ctx, ":"), nil
 | |
| }
 | |
| 
 | |
// constructSyslogIdentifier derives the journal identifier for a pod, so its
// logs can be read with `journalctl -t <identifier>`.
//
// An empty generateName yields podName. Otherwise generateName is used, with
// a single trailing '-' removed when generateName is longer than one
// character.
func constructSyslogIdentifier(generateName string, podName string) string {
	n := len(generateName)
	switch {
	case n == 0:
		return podName
	case n > 1 && generateName[n-1] == '-':
		return generateName[:n-1]
	default:
		return generateName
	}
}
 | |
| 
 | |
| // Setup additional systemd field specified in the Pod Annotation
 | |
| func setupSystemdCustomFields(annotations map[string]string, unitOptionArray []*unit.UnitOption) ([]*unit.UnitOption, error) {
 | |
| 	// LimitNOFILE
 | |
| 	if strSize := annotations[k8sRktLimitNoFileAnno]; strSize != "" {
 | |
| 		size, err := strconv.Atoi(strSize)
 | |
| 		if err != nil {
 | |
| 			return unitOptionArray, err
 | |
| 		}
 | |
| 		if size < 1 {
 | |
| 			return unitOptionArray, fmt.Errorf("invalid value for %s: %s", k8sRktLimitNoFileAnno, strSize)
 | |
| 		}
 | |
| 		unitOptionArray = append(unitOptionArray, newUnitOption("Service", "LimitNOFILE", strSize))
 | |
| 	}
 | |
| 
 | |
| 	return unitOptionArray, nil
 | |
| }
 | |
| 
 | |
| // preparePod will:
 | |
| //
 | |
| // 1. Invoke 'rkt prepare' to prepare the pod, and get the rkt pod uuid.
 | |
| // 2. Create the unit file and save it under systemdUnitDir.
 | |
| //
 | |
| // On success, it will return a string that represents name of the unit file
 | |
| // and the runtime pod.
 | |
| func (r *Runtime) preparePod(pod *v1.Pod, podIP string, pullSecrets []v1.Secret, networkNamespaceID string) (string, *kubecontainer.Pod, error) {
 | |
| 	// Generate the appc pod manifest from the k8s pod spec.
 | |
| 	manifest, err := r.makePodManifest(pod, podIP, pullSecrets)
 | |
| 	if err != nil {
 | |
| 		return "", nil, err
 | |
| 	}
 | |
| 	manifestFile, err := ioutil.TempFile("", fmt.Sprintf("manifest-%s-", pod.Name))
 | |
| 	if err != nil {
 | |
| 		return "", nil, err
 | |
| 	}
 | |
| 	defer func() {
 | |
| 		manifestFile.Close()
 | |
| 		if err := r.os.Remove(manifestFile.Name()); err != nil {
 | |
| 			glog.Warningf("rkt: Cannot remove temp manifest file %q: %v", manifestFile.Name(), err)
 | |
| 		}
 | |
| 	}()
 | |
| 
 | |
| 	data, err := json.Marshal(manifest)
 | |
| 	if err != nil {
 | |
| 		return "", nil, err
 | |
| 	}
 | |
| 
 | |
| 	glog.V(4).Infof("Generating pod manifest for pod %q: %v", format.Pod(pod), string(data))
 | |
| 	// Since File.Write returns error if the written length is less than len(data),
 | |
| 	// so check error is enough for us.
 | |
| 	if _, err := manifestFile.Write(data); err != nil {
 | |
| 		return "", nil, err
 | |
| 	}
 | |
| 
 | |
| 	prepareCmd := r.preparePodArgs(manifest, manifestFile.Name())
 | |
| 	output, err := r.cli.RunCommand(nil, prepareCmd...)
 | |
| 	if err != nil {
 | |
| 		return "", nil, err
 | |
| 	}
 | |
| 	if len(output) != 1 {
 | |
| 		return "", nil, fmt.Errorf("invalid output from 'rkt prepare': %v", output)
 | |
| 	}
 | |
| 	uuid := output[0]
 | |
| 	glog.V(4).Infof("'rkt prepare' returns %q", uuid)
 | |
| 
 | |
| 	// Create systemd service file for the rkt pod.
 | |
| 	runPrepared, err := r.generateRunCommand(pod, uuid, networkNamespaceID)
 | |
| 	if err != nil {
 | |
| 		return "", nil, fmt.Errorf("failed to generate 'rkt run-prepared' command: %v", err)
 | |
| 	}
 | |
| 
 | |
| 	// TODO handle pod.Spec.HostPID
 | |
| 	// TODO handle pod.Spec.HostIPC
 | |
| 
 | |
| 	// TODO per container finishedAt, not just per pod
 | |
| 	markPodFinished := podFinishedMarkCommand(r.touchPath, r.runtimeHelper.GetPodDir(pod.UID), uuid)
 | |
| 
 | |
| 	hostNetwork := kubecontainer.IsHostNetworkPod(pod)
 | |
| 	units := []*unit.UnitOption{
 | |
| 		newUnitOption("Service", "ExecStart", runPrepared),
 | |
| 		newUnitOption("Service", "ExecStopPost", markPodFinished),
 | |
| 		// This enables graceful stop.
 | |
| 		newUnitOption("Service", "KillMode", "mixed"),
 | |
| 		newUnitOption("Service", "TimeoutStopSec", fmt.Sprintf("%ds", getPodTerminationGracePeriodInSecond(pod))),
 | |
| 		// Ops helpers
 | |
| 		newUnitOption("Unit", "Description", pod.Name),
 | |
| 		newUnitOption("Service", "SyslogIdentifier", constructSyslogIdentifier(pod.GenerateName, pod.Name)),
 | |
| 		// Track pod info for garbage collection
 | |
| 		newUnitOption(unitKubernetesSection, unitPodUID, string(pod.UID)),
 | |
| 		newUnitOption(unitKubernetesSection, unitPodName, pod.Name),
 | |
| 		newUnitOption(unitKubernetesSection, unitPodNamespace, pod.Namespace),
 | |
| 		newUnitOption(unitKubernetesSection, unitPodHostNetwork, fmt.Sprintf("%v", hostNetwork)),
 | |
| 		newUnitOption(unitKubernetesSection, unitPodNetworkNamespace, networkNamespaceID),
 | |
| 	}
 | |
| 
 | |
| 	if pod.Spec.SecurityContext != nil && pod.Spec.SecurityContext.SELinuxOptions != nil {
 | |
| 		opt := pod.Spec.SecurityContext.SELinuxOptions
 | |
| 		selinuxContext, err := r.getSelinuxContext(opt)
 | |
| 		if err != nil {
 | |
| 			glog.Errorf("rkt: Failed to construct selinux context with selinux option %q: %v", opt, err)
 | |
| 			return "", nil, err
 | |
| 		}
 | |
| 		units = append(units, newUnitOption("Service", "SELinuxContext", selinuxContext))
 | |
| 	}
 | |
| 
 | |
| 	units, err = setupSystemdCustomFields(pod.Annotations, units)
 | |
| 	if err != nil {
 | |
| 		glog.Warningf("fail to add custom systemd fields provided by pod Annotations: %q", err)
 | |
| 	}
 | |
| 
 | |
| 	serviceName := makePodServiceFileName(uuid)
 | |
| 	glog.V(4).Infof("rkt: Creating service file %q for pod %q", serviceName, format.Pod(pod))
 | |
| 	serviceFile, err := r.os.Create(serviceFilePath(serviceName))
 | |
| 	if err != nil {
 | |
| 		return "", nil, err
 | |
| 	}
 | |
| 	if _, err := io.Copy(serviceFile, unit.Serialize(units)); err != nil {
 | |
| 		return "", nil, err
 | |
| 	}
 | |
| 	serviceFile.Close()
 | |
| 
 | |
| 	return serviceName, apiPodToruntimePod(uuid, pod), nil
 | |
| }
 | |
| 
 | |
| // generateEvents is a helper function that generates some container
 | |
| // life cycle events for containers in a pod.
 | |
| func (r *Runtime) generateEvents(runtimePod *kubecontainer.Pod, reason string, failure error) {
 | |
| 	// Set up container references.
 | |
| 	for _, c := range runtimePod.Containers {
 | |
| 		containerID := c.ID
 | |
| 		id, err := parseContainerID(containerID)
 | |
| 		if err != nil {
 | |
| 			glog.Warningf("Invalid container ID %q", containerID)
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		ref, ok := r.containerRefManager.GetRef(containerID)
 | |
| 		if !ok {
 | |
| 			glog.Warningf("No ref for container %q", containerID)
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		// Note that 'rkt id' is the pod id.
 | |
| 		uuid := utilstrings.ShortenString(id.uuid, 8)
 | |
| 		switch reason {
 | |
| 		case "Created":
 | |
| 			r.recorder.Eventf(ref, v1.EventTypeNormal, events.CreatedContainer, "Created with rkt id %v", uuid)
 | |
| 		case "Started":
 | |
| 			r.recorder.Eventf(ref, v1.EventTypeNormal, events.StartedContainer, "Started with rkt id %v", uuid)
 | |
| 		case "Failed":
 | |
| 			r.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedToStartContainer, "Failed to start with rkt id %v with error %v", uuid, failure)
 | |
| 		case "Killing":
 | |
| 			r.recorder.Eventf(ref, v1.EventTypeNormal, events.KillingContainer, "Killing with rkt id %v", uuid)
 | |
| 		default:
 | |
| 			glog.Errorf("rkt: Unexpected event %q", reason)
 | |
| 		}
 | |
| 	}
 | |
| 	return
 | |
| }
 | |
| 
 | |
| // Generate a Network Namespace based on a New UUID
 | |
| // to run the Pod and all of its containers inside a dedicated unique namespace
 | |
| func generateNetworkNamespaceUUID() kubecontainer.ContainerID {
 | |
| 	return kubecontainer.ContainerID{ID: fmt.Sprintf("%s%s", kubernetesUnitPrefix, uuid.NewUUID())}
 | |
| }
 | |
| 
 | |
// netnsPathFromName returns the path of the named network namespace under
// /var/run/netns.
func netnsPathFromName(netnsName string) string {
	const netnsDir = "/var/run/netns"
	return fmt.Sprintf("%s/%s", netnsDir, netnsName)
}
 | |
| 
 | |
| // setupPodNetwork creates a network namespace for the given pod and calls
 | |
| // configured NetworkPlugin's setup function on it.
 | |
| // It returns the namespace name, configured IP (if available), and an error if
 | |
| // one occurred.
 | |
| //
 | |
| // If the pod is running in host network or is running using the no-op plugin, then nothing will be done.
 | |
| func (r *Runtime) setupPodNetwork(pod *v1.Pod) (kubecontainer.ContainerID, string, error) {
 | |
| 	glog.V(3).Infof("Calling network plugin %s to set up pod for %s", r.network.PluginName(), format.Pod(pod))
 | |
| 
 | |
| 	var networkNamespace kubecontainer.ContainerID
 | |
| 
 | |
| 	// No-op if the pod is not running in a created netns.
 | |
| 	if !r.shouldCreateNetns(pod) {
 | |
| 		return networkNamespace, "", nil
 | |
| 	}
 | |
| 
 | |
| 	networkNamespace = generateNetworkNamespaceUUID()
 | |
| 	glog.V(5).Infof("New network namespace %q generated for pod %s", networkNamespace.ID, format.Pod(pod))
 | |
| 
 | |
| 	// Create the network namespace for the pod
 | |
| 	_, err := r.execer.Command("ip", "netns", "add", networkNamespace.ID).Output()
 | |
| 	if err != nil {
 | |
| 		return networkNamespace, "", fmt.Errorf("failed to create pod network namespace: %v", err)
 | |
| 	}
 | |
| 
 | |
| 	// Set up networking with the network plugin
 | |
| 	err = r.network.SetUpPod(pod.Namespace, pod.Name, networkNamespace, pod.Annotations)
 | |
| 	if err != nil {
 | |
| 		return networkNamespace, "", err
 | |
| 	}
 | |
| 	status, err := r.network.GetPodNetworkStatus(pod.Namespace, pod.Name, networkNamespace)
 | |
| 	if err != nil {
 | |
| 		return networkNamespace, "", err
 | |
| 	}
 | |
| 
 | |
| 	if r.configureHairpinMode {
 | |
| 		if err = hairpin.SetUpContainerPath(netnsPathFromName(networkNamespace.ID), network.DefaultInterfaceName); err != nil {
 | |
| 			glog.Warningf("Hairpin setup failed for pod %q: %v", format.Pod(pod), err)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return networkNamespace, status.IP.String(), nil
 | |
| }
 | |
| 
 | |
| // For a hostPath volume: rkt doesn't create any missing volume on the node/host so we need to create it
 | |
| func createHostPathVolumes(pod *v1.Pod) (err error) {
 | |
| 	for _, v := range pod.Spec.Volumes {
 | |
| 		if v.VolumeSource.HostPath != nil {
 | |
| 			_, err = os.Stat(v.HostPath.Path)
 | |
| 			if os.IsNotExist(err) {
 | |
| 				if err = os.MkdirAll(v.HostPath.Path, os.ModePerm); err != nil {
 | |
| 					glog.Errorf("Create volume HostPath %q for Pod %q failed: %q", v.HostPath.Path, format.Pod(pod), err.Error())
 | |
| 					return err
 | |
| 				}
 | |
| 				glog.V(4).Infof("Created volume HostPath %q for Pod %q", v.HostPath.Path, format.Pod(pod))
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| // RunPod first creates the unit file for a pod, and then
 | |
| // starts the unit over d-bus.
 | |
| func (r *Runtime) RunPod(pod *v1.Pod, pullSecrets []v1.Secret) error {
 | |
| 	glog.V(4).Infof("Rkt starts to run pod: name %q.", format.Pod(pod))
 | |
| 
 | |
| 	var err error
 | |
| 	var networkNamespace kubecontainer.ContainerID
 | |
| 	var podIP string
 | |
| 
 | |
| 	err = createHostPathVolumes(pod)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	networkNamespace, podIP, err = r.setupPodNetwork(pod)
 | |
| 	if err != nil {
 | |
| 		r.cleanupPodNetwork(pod, networkNamespace)
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	name, runtimePod, prepareErr := r.preparePod(pod, podIP, pullSecrets, networkNamespace.ID)
 | |
| 
 | |
| 	// Set container references and generate events.
 | |
| 	// If preparedPod fails, then send out 'failed' events for each container.
 | |
| 	// Otherwise, store the container references so we can use them later to send events.
 | |
| 	for i, c := range pod.Spec.Containers {
 | |
| 		ref, err := kubecontainer.GenerateContainerRef(pod, &c)
 | |
| 		if err != nil {
 | |
| 			glog.Errorf("Couldn't make a ref to pod %q, container %v: '%v'", format.Pod(pod), c.Name, err)
 | |
| 			continue
 | |
| 		}
 | |
| 		if prepareErr != nil {
 | |
| 			r.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedToCreateContainer, "Failed to create rkt container with error: %v", prepareErr)
 | |
| 			continue
 | |
| 		}
 | |
| 		containerID := runtimePod.Containers[i].ID
 | |
| 		r.containerRefManager.SetRef(containerID, ref)
 | |
| 	}
 | |
| 
 | |
| 	if prepareErr != nil {
 | |
| 		r.cleanupPodNetwork(pod, networkNamespace)
 | |
| 		return prepareErr
 | |
| 	}
 | |
| 
 | |
| 	r.generateEvents(runtimePod, "Created", nil)
 | |
| 
 | |
| 	// RestartUnit has the same effect as StartUnit if the unit is not running, besides it can restart
 | |
| 	// a unit if the unit file is changed and reloaded.
 | |
| 	reschan := make(chan string)
 | |
| 	_, err = r.systemd.RestartUnit(name, "replace", reschan)
 | |
| 	if err != nil {
 | |
| 		r.generateEvents(runtimePod, "Failed", err)
 | |
| 		r.cleanupPodNetwork(pod, networkNamespace)
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	res := <-reschan
 | |
| 	if res != "done" {
 | |
| 		err := fmt.Errorf("Failed to restart unit %q: %s", name, res)
 | |
| 		r.generateEvents(runtimePod, "Failed", err)
 | |
| 		r.cleanupPodNetwork(pod, networkNamespace)
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	r.generateEvents(runtimePod, "Started", nil)
 | |
| 
 | |
| 	// This is a temporary solution until we have a clean design on how
 | |
| 	// kubelet handles events. See https://github.com/kubernetes/kubernetes/issues/23084.
 | |
| 	if err := r.runLifecycleHooks(pod, runtimePod, lifecyclePostStartHook); err != nil {
 | |
| 		if errKill := r.KillPod(pod, *runtimePod, nil); errKill != nil {
 | |
| 			return errors.NewAggregate([]error{err, errKill})
 | |
| 		}
 | |
| 		r.cleanupPodNetwork(pod, networkNamespace)
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
| func (r *Runtime) runPreStopHook(containerID kubecontainer.ContainerID, pod *v1.Pod, container *v1.Container) error {
 | |
| 	glog.V(4).Infof("rkt: Running pre-stop hook for container %q of pod %q", container.Name, format.Pod(pod))
 | |
| 	msg, err := r.runner.Run(containerID, pod, container, container.Lifecycle.PreStop)
 | |
| 	if err != nil {
 | |
| 		ref, ok := r.containerRefManager.GetRef(containerID)
 | |
| 		if !ok {
 | |
| 			glog.Warningf("No ref for container %q", containerID)
 | |
| 		} else {
 | |
| 			r.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedPreStopHook, msg)
 | |
| 		}
 | |
| 	}
 | |
| 	return err
 | |
| }
 | |
| 
 | |
| func (r *Runtime) runPostStartHook(containerID kubecontainer.ContainerID, pod *v1.Pod, container *v1.Container) error {
 | |
| 	glog.V(4).Infof("rkt: Running post-start hook for container %q of pod %q", container.Name, format.Pod(pod))
 | |
| 	cid, err := parseContainerID(containerID)
 | |
| 	if err != nil {
 | |
| 		return fmt.Errorf("cannot parse container ID %v", containerID)
 | |
| 	}
 | |
| 
 | |
| 	isContainerRunning := func() (done bool, err error) {
 | |
| 		ctx, cancel := context.WithTimeout(context.Background(), r.requestTimeout)
 | |
| 		defer cancel()
 | |
| 		resp, err := r.apisvc.InspectPod(ctx, &rktapi.InspectPodRequest{Id: cid.uuid})
 | |
| 		if err != nil {
 | |
| 			return false, fmt.Errorf("failed to inspect rkt pod %q for pod %q", cid.uuid, format.Pod(pod))
 | |
| 		}
 | |
| 
 | |
| 		for _, app := range resp.Pod.Apps {
 | |
| 			if app.Name == cid.appName {
 | |
| 				return app.State == rktapi.AppState_APP_STATE_RUNNING, nil
 | |
| 			}
 | |
| 		}
 | |
| 		return false, fmt.Errorf("failed to find container %q in rkt pod %q", cid.appName, cid.uuid)
 | |
| 	}
 | |
| 
 | |
| 	// TODO(yifan): Polling the pod's state for now.
 | |
| 	timeout := time.Second * 5
 | |
| 	pollInterval := time.Millisecond * 500
 | |
| 	if err := utilwait.Poll(pollInterval, timeout, isContainerRunning); err != nil {
 | |
| 		return fmt.Errorf("rkt: Pod %q doesn't become running in %v: %v", format.Pod(pod), timeout, err)
 | |
| 	}
 | |
| 
 | |
| 	msg, err := r.runner.Run(containerID, pod, container, container.Lifecycle.PostStart)
 | |
| 	if err != nil {
 | |
| 		ref, ok := r.containerRefManager.GetRef(containerID)
 | |
| 		if !ok {
 | |
| 			glog.Warningf("No ref for container %q", containerID)
 | |
| 		} else {
 | |
| 			r.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedPostStartHook, msg)
 | |
| 		}
 | |
| 	}
 | |
| 	return err
 | |
| }
 | |
| 
 | |
// lifecycleHookType names the kind of container lifecycle hook to run.
type lifecycleHookType string

// The lifecycle hook kinds understood by runLifecycleHooks.
const (
	// lifecyclePostStartHook selects the containers' post-start handlers.
	lifecyclePostStartHook = lifecycleHookType("post-start")
	// lifecyclePreStopHook selects the containers' pre-stop handlers.
	lifecyclePreStopHook = lifecycleHookType("pre-stop")
)
 | |
| 
 | |
| func (r *Runtime) runLifecycleHooks(pod *v1.Pod, runtimePod *kubecontainer.Pod, typ lifecycleHookType) error {
 | |
| 	var wg sync.WaitGroup
 | |
| 	var errlist []error
 | |
| 	errCh := make(chan error, len(pod.Spec.Containers))
 | |
| 
 | |
| 	wg.Add(len(pod.Spec.Containers))
 | |
| 
 | |
| 	for i, c := range pod.Spec.Containers {
 | |
| 		var hookFunc func(kubecontainer.ContainerID, *v1.Pod, *v1.Container) error
 | |
| 
 | |
| 		switch typ {
 | |
| 		case lifecyclePostStartHook:
 | |
| 			if c.Lifecycle != nil && c.Lifecycle.PostStart != nil {
 | |
| 				hookFunc = r.runPostStartHook
 | |
| 			}
 | |
| 		case lifecyclePreStopHook:
 | |
| 			if c.Lifecycle != nil && c.Lifecycle.PreStop != nil {
 | |
| 				hookFunc = r.runPreStopHook
 | |
| 			}
 | |
| 		default:
 | |
| 			errCh <- fmt.Errorf("Unrecognized lifecycle hook type %q for container %q in pod %q", typ, c.Name, format.Pod(pod))
 | |
| 		}
 | |
| 
 | |
| 		if hookFunc == nil {
 | |
| 			wg.Done()
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		container := &pod.Spec.Containers[i]
 | |
| 		runtimeContainer := runtimePod.FindContainerByName(container.Name)
 | |
| 		if runtimeContainer == nil {
 | |
| 			// Container already gone.
 | |
| 			wg.Done()
 | |
| 			continue
 | |
| 		}
 | |
| 		containerID := runtimeContainer.ID
 | |
| 
 | |
| 		go func() {
 | |
| 			defer wg.Done()
 | |
| 			if err := hookFunc(containerID, pod, container); err != nil {
 | |
| 				glog.Errorf("rkt: Failed to run %s hook for container %q of pod %q: %v", typ, container.Name, format.Pod(pod), err)
 | |
| 				errCh <- err
 | |
| 			} else {
 | |
| 				glog.V(4).Infof("rkt: %s hook completed successfully for container %q of pod %q", typ, container.Name, format.Pod(pod))
 | |
| 			}
 | |
| 		}()
 | |
| 	}
 | |
| 
 | |
| 	wg.Wait()
 | |
| 	close(errCh)
 | |
| 
 | |
| 	for err := range errCh {
 | |
| 		errlist = append(errlist, err)
 | |
| 	}
 | |
| 	return errors.NewAggregate(errlist)
 | |
| }
 | |
| 
 | |
| // convertRktPod will convert a rktapi.Pod to a kubecontainer.Pod
 | |
| func (r *Runtime) convertRktPod(rktpod *rktapi.Pod) (*kubecontainer.Pod, error) {
 | |
| 	manifest := &appcschema.PodManifest{}
 | |
| 	err := json.Unmarshal(rktpod.Manifest, manifest)
 | |
| 	if err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 
 | |
| 	podUID, ok := manifest.Annotations.Get(types.KubernetesPodUIDLabel)
 | |
| 	if !ok {
 | |
| 		return nil, fmt.Errorf("pod is missing annotation %s", types.KubernetesPodUIDLabel)
 | |
| 	}
 | |
| 	podName, ok := manifest.Annotations.Get(types.KubernetesPodNameLabel)
 | |
| 	if !ok {
 | |
| 		return nil, fmt.Errorf("pod is missing annotation %s", types.KubernetesPodNameLabel)
 | |
| 	}
 | |
| 	podNamespace, ok := manifest.Annotations.Get(types.KubernetesPodNamespaceLabel)
 | |
| 	if !ok {
 | |
| 		return nil, fmt.Errorf("pod is missing annotation %s", types.KubernetesPodNamespaceLabel)
 | |
| 	}
 | |
| 
 | |
| 	kubepod := &kubecontainer.Pod{
 | |
| 		ID:        kubetypes.UID(podUID),
 | |
| 		Name:      podName,
 | |
| 		Namespace: podNamespace,
 | |
| 	}
 | |
| 
 | |
| 	for i, app := range rktpod.Apps {
 | |
| 		// The order of the apps is determined by the rkt pod manifest.
 | |
| 		// TODO(yifan): Let the server to unmarshal the annotations? https://github.com/coreos/rkt/issues/1872
 | |
| 		hashStr, ok := manifest.Apps[i].Annotations.Get(k8sRktContainerHashAnno)
 | |
| 		if !ok {
 | |
| 			return nil, fmt.Errorf("app %q is missing annotation %s", app.Name, k8sRktContainerHashAnno)
 | |
| 		}
 | |
| 		containerHash, err := strconv.ParseUint(hashStr, 10, 64)
 | |
| 		if err != nil {
 | |
| 			return nil, fmt.Errorf("couldn't parse container's hash %q: %v", hashStr, err)
 | |
| 		}
 | |
| 
 | |
| 		kubepod.Containers = append(kubepod.Containers, &kubecontainer.Container{
 | |
| 			ID:   buildContainerID(&containerID{rktpod.Id, app.Name}),
 | |
| 			Name: app.Name,
 | |
| 			// By default, the version returned by rkt API service will be "latest" if not specified.
 | |
| 			Image:   fmt.Sprintf("%s:%s", app.Image.Name, app.Image.Version),
 | |
| 			ImageID: app.Image.Id,
 | |
| 			Hash:    containerHash,
 | |
| 			State:   appStateToContainerState(app.State),
 | |
| 		})
 | |
| 	}
 | |
| 
 | |
| 	return kubepod, nil
 | |
| }
 | |
| 
 | |
| // GetPods runs 'rkt list' to get the list of rkt pods.
 | |
| // Then it will use the result to construct a list of container runtime pods.
 | |
| // If all is false, then only running pods will be returned, otherwise all pods will be
 | |
| // returned.
 | |
| func (r *Runtime) GetPods(all bool) ([]*kubecontainer.Pod, error) {
 | |
| 	glog.V(4).Infof("Rkt getting pods")
 | |
| 
 | |
| 	listReq := &rktapi.ListPodsRequest{
 | |
| 		Detail: true,
 | |
| 		Filters: []*rktapi.PodFilter{
 | |
| 			{
 | |
| 				Annotations: []*rktapi.KeyValue{
 | |
| 					{
 | |
| 						Key:   k8sRktKubeletAnno,
 | |
| 						Value: k8sRktKubeletAnnoValue,
 | |
| 					},
 | |
| 				},
 | |
| 			},
 | |
| 		},
 | |
| 	}
 | |
| 	if !all {
 | |
| 		listReq.Filters[0].States = []rktapi.PodState{rktapi.PodState_POD_STATE_RUNNING}
 | |
| 	}
 | |
| 	ctx, cancel := context.WithTimeout(context.Background(), r.requestTimeout)
 | |
| 	defer cancel()
 | |
| 	listResp, err := r.apisvc.ListPods(ctx, listReq)
 | |
| 	if err != nil {
 | |
| 		return nil, fmt.Errorf("couldn't list pods: %v", err)
 | |
| 	}
 | |
| 
 | |
| 	pods := make(map[kubetypes.UID]*kubecontainer.Pod)
 | |
| 	var podIDs []kubetypes.UID
 | |
| 	for _, pod := range listResp.Pods {
 | |
| 		pod, err := r.convertRktPod(pod)
 | |
| 		if err != nil {
 | |
| 			glog.Warningf("rkt: Cannot construct pod from unit file: %v.", err)
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		// Group pods together.
 | |
| 		oldPod, found := pods[pod.ID]
 | |
| 		if !found {
 | |
| 			pods[pod.ID] = pod
 | |
| 			podIDs = append(podIDs, pod.ID)
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		oldPod.Containers = append(oldPod.Containers, pod.Containers...)
 | |
| 	}
 | |
| 
 | |
| 	// Convert map to list, using the consistent order from the podIDs array.
 | |
| 	var result []*kubecontainer.Pod
 | |
| 	for _, id := range podIDs {
 | |
| 		result = append(result, pods[id])
 | |
| 	}
 | |
| 
 | |
| 	return result, nil
 | |
| }
 | |
| 
 | |
| func getPodTerminationGracePeriodInSecond(pod *v1.Pod) int64 {
 | |
| 	var gracePeriod int64
 | |
| 	switch {
 | |
| 	case pod.DeletionGracePeriodSeconds != nil:
 | |
| 		gracePeriod = *pod.DeletionGracePeriodSeconds
 | |
| 	case pod.Spec.TerminationGracePeriodSeconds != nil:
 | |
| 		gracePeriod = *pod.Spec.TerminationGracePeriodSeconds
 | |
| 	}
 | |
| 	if gracePeriod < minimumGracePeriodInSeconds {
 | |
| 		gracePeriod = minimumGracePeriodInSeconds
 | |
| 	}
 | |
| 	return gracePeriod
 | |
| }
 | |
| 
 | |
| func (r *Runtime) waitPreStopHooks(pod *v1.Pod, runningPod *kubecontainer.Pod) {
 | |
| 	gracePeriod := getPodTerminationGracePeriodInSecond(pod)
 | |
| 
 | |
| 	done := make(chan struct{})
 | |
| 	go func() {
 | |
| 		if err := r.runLifecycleHooks(pod, runningPod, lifecyclePreStopHook); err != nil {
 | |
| 			glog.Errorf("rkt: Some pre-stop hooks failed for pod %q: %v", format.Pod(pod), err)
 | |
| 		}
 | |
| 		close(done)
 | |
| 	}()
 | |
| 
 | |
| 	select {
 | |
| 	case <-time.After(time.Duration(gracePeriod) * time.Second):
 | |
| 		glog.V(2).Infof("rkt: Some pre-stop hooks did not complete in %d seconds for pod %q", gracePeriod, format.Pod(pod))
 | |
| 	case <-done:
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // KillPod invokes 'systemctl kill' to kill the unit that runs the pod.
 | |
| // TODO: add support for gracePeriodOverride which is used in eviction scenarios
 | |
| func (r *Runtime) KillPod(pod *v1.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) error {
 | |
| 	glog.V(4).Infof("Rkt is killing pod: name %q.", runningPod.Name)
 | |
| 
 | |
| 	if len(runningPod.Containers) == 0 {
 | |
| 		glog.V(4).Infof("rkt: Pod %q is already being killed, no action will be taken", runningPod.Name)
 | |
| 		return nil
 | |
| 	}
 | |
| 
 | |
| 	if pod != nil {
 | |
| 		r.waitPreStopHooks(pod, &runningPod)
 | |
| 	}
 | |
| 
 | |
| 	containerID, err := parseContainerID(runningPod.Containers[0].ID)
 | |
| 	if err != nil {
 | |
| 		glog.Errorf("rkt: Failed to get rkt uuid of the pod %q: %v", runningPod.Name, err)
 | |
| 		return err
 | |
| 	}
 | |
| 	serviceName := makePodServiceFileName(containerID.uuid)
 | |
| 	serviceFile := serviceFilePath(serviceName)
 | |
| 
 | |
| 	r.generateEvents(&runningPod, "Killing", nil)
 | |
| 	for _, c := range runningPod.Containers {
 | |
| 		r.containerRefManager.ClearRef(c.ID)
 | |
| 	}
 | |
| 
 | |
| 	// Since all service file have 'KillMode=mixed', the processes in
 | |
| 	// the unit's cgroup will receive a SIGKILL if the normal stop timeouts.
 | |
| 	reschan := make(chan string)
 | |
| 	if _, err = r.systemd.StopUnit(serviceName, "replace", reschan); err != nil {
 | |
| 		glog.Errorf("rkt: Failed to stop unit %q: %v", serviceName, err)
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	res := <-reschan
 | |
| 	if res != "done" {
 | |
| 		err := fmt.Errorf("invalid result: %s", res)
 | |
| 		glog.Errorf("rkt: Failed to stop unit %q: %v", serviceName, err)
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	// Clean up networking. Use the service file to get pod details since 'pod' can be nil.
 | |
| 	if err := r.cleanupPodNetworkFromServiceFile(serviceFile); err != nil {
 | |
| 		glog.Errorf("rkt: failed to tear down network for unit %q: %v", serviceName, err)
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	return nil
 | |
| }
 | |
| 
 | |
// Type returns the type name of this container runtime (RktType).
func (r *Runtime) Type() string {
	return RktType
}
 | |
| 
 | |
| func (r *Runtime) Version() (kubecontainer.Version, error) {
 | |
| 	r.versions.RLock()
 | |
| 	defer r.versions.RUnlock()
 | |
| 	return r.versions.binVersion, nil
 | |
| }
 | |
| 
 | |
| func (r *Runtime) APIVersion() (kubecontainer.Version, error) {
 | |
| 	r.versions.RLock()
 | |
| 	defer r.versions.RUnlock()
 | |
| 	return r.versions.apiVersion, nil
 | |
| }
 | |
| 
 | |
| // Status returns error if rkt is unhealthy, nil otherwise.
 | |
| func (r *Runtime) Status() (*kubecontainer.RuntimeStatus, error) {
 | |
| 	return nil, r.checkVersion(minimumRktBinVersion, minimumRktApiVersion, minimumSystemdVersion)
 | |
| }
 | |
| 
 | |
| // SyncPod syncs the running pod to match the specified desired pod.
 | |
| func (r *Runtime) SyncPod(pod *v1.Pod, _ v1.PodStatus, podStatus *kubecontainer.PodStatus, pullSecrets []v1.Secret, backOff *flowcontrol.Backoff) (result kubecontainer.PodSyncResult) {
 | |
| 	var err error
 | |
| 	defer func() {
 | |
| 		if err != nil {
 | |
| 			result.Fail(err)
 | |
| 		}
 | |
| 	}()
 | |
| 	// TODO: (random-liu) Stop using running pod in SyncPod()
 | |
| 	runningPod := kubecontainer.ConvertPodStatusToRunningPod(r.Type(), podStatus)
 | |
| 	// Add references to all containers.
 | |
| 	unidentifiedContainers := make(map[kubecontainer.ContainerID]*kubecontainer.Container)
 | |
| 	for _, c := range runningPod.Containers {
 | |
| 		unidentifiedContainers[c.ID] = c
 | |
| 	}
 | |
| 
 | |
| 	restartPod := false
 | |
| 	for _, container := range pod.Spec.Containers {
 | |
| 		expectedHash := kubecontainer.HashContainerLegacy(&container)
 | |
| 
 | |
| 		c := runningPod.FindContainerByName(container.Name)
 | |
| 		if c == nil {
 | |
| 			if kubecontainer.ShouldContainerBeRestarted(&container, pod, podStatus) {
 | |
| 				glog.V(3).Infof("Container %+v is dead, but RestartPolicy says that we should restart it.", container)
 | |
| 				// TODO(yifan): Containers in one pod are fate-sharing at this moment, see:
 | |
| 				// https://github.com/appc/spec/issues/276.
 | |
| 				restartPod = true
 | |
| 				break
 | |
| 			}
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		// TODO: check for non-root image directives.  See ../docker/manager.go#SyncPod
 | |
| 
 | |
| 		// TODO(yifan): Take care of host network change.
 | |
| 		containerChanged := c.Hash != 0 && c.Hash != expectedHash
 | |
| 		if containerChanged {
 | |
| 			glog.Infof("Pod %q container %q hash changed (%d vs %d), it will be killed and re-created.", format.Pod(pod), container.Name, c.Hash, expectedHash)
 | |
| 			restartPod = true
 | |
| 			break
 | |
| 		}
 | |
| 
 | |
| 		liveness, found := r.livenessManager.Get(c.ID)
 | |
| 		if found && liveness != proberesults.Success && pod.Spec.RestartPolicy != v1.RestartPolicyNever {
 | |
| 			glog.Infof("Pod %q container %q is unhealthy, it will be killed and re-created.", format.Pod(pod), container.Name)
 | |
| 			restartPod = true
 | |
| 			break
 | |
| 		}
 | |
| 
 | |
| 		delete(unidentifiedContainers, c.ID)
 | |
| 	}
 | |
| 
 | |
| 	// If there is any unidentified containers, restart the pod.
 | |
| 	if len(unidentifiedContainers) > 0 {
 | |
| 		restartPod = true
 | |
| 	}
 | |
| 
 | |
| 	if restartPod {
 | |
| 		// Kill the pod only if the pod is actually running.
 | |
| 		if len(runningPod.Containers) > 0 {
 | |
| 			if err = r.KillPod(pod, runningPod, nil); err != nil {
 | |
| 				return
 | |
| 			}
 | |
| 		}
 | |
| 		if err = r.RunPod(pod, pullSecrets); err != nil {
 | |
| 			return
 | |
| 		}
 | |
| 	}
 | |
| 	return
 | |
| }
 | |
| 
 | |
| // Sort rkt pods by creation time.
 | |
| type podsByCreatedAt []*rktapi.Pod
 | |
| 
 | |
| func (s podsByCreatedAt) Len() int           { return len(s) }
 | |
| func (s podsByCreatedAt) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
 | |
| func (s podsByCreatedAt) Less(i, j int) bool { return s[i].CreatedAt < s[j].CreatedAt }
 | |
| 
 | |
| // getPodUID returns the pod's API UID, it returns
 | |
| // empty UID if the UID cannot be determined.
 | |
| func getPodUID(pod *rktapi.Pod) kubetypes.UID {
 | |
| 	for _, anno := range pod.Annotations {
 | |
| 		if anno.Key == types.KubernetesPodUIDLabel {
 | |
| 			return kubetypes.UID(anno.Value)
 | |
| 		}
 | |
| 	}
 | |
| 	return kubetypes.UID("")
 | |
| }
 | |
| 
 | |
| // podIsActive returns true if the pod is embryo, preparing or running.
 | |
| // If a pod is prepared, it is not guaranteed to be active (e.g. the systemd
 | |
| // service might fail).
 | |
| func podIsActive(pod *rktapi.Pod) bool {
 | |
| 	return pod.State == rktapi.PodState_POD_STATE_EMBRYO ||
 | |
| 		pod.State == rktapi.PodState_POD_STATE_PREPARING ||
 | |
| 		pod.State == rktapi.PodState_POD_STATE_RUNNING
 | |
| }
 | |
| 
 | |
| // GetNetNS returns the network namespace path for the given container
 | |
| func (r *Runtime) GetNetNS(containerID kubecontainer.ContainerID) (string, error) {
 | |
| 	// Currently the containerID is a UUID for a network namespace
 | |
| 	// This hack is a way to create an unique network namespace for each new starting/restarting Pod
 | |
| 	// We can do this because we played the same trick in
 | |
| 	// `networkPlugin.SetUpPod` and `networkPlugin.TearDownPod`.
 | |
| 	// See https://github.com/kubernetes/kubernetes/issues/45149
 | |
| 	return netnsPathFromName(containerID.ID), nil
 | |
| }
 | |
| 
 | |
| func (r *Runtime) GetPodContainerID(pod *kubecontainer.Pod) (kubecontainer.ContainerID, error) {
 | |
| 	return kubecontainer.ContainerID{ID: string(pod.ID)}, nil
 | |
| }
 | |
| 
 | |
| func (r *Runtime) getKubernetesDirective(serviceFilePath string) (podService podServiceDirective, err error) {
 | |
| 	f, err := os.Open(serviceFilePath)
 | |
| 	if err != nil {
 | |
| 		return podService, err
 | |
| 	}
 | |
| 	defer f.Close()
 | |
| 
 | |
| 	opts, err := unit.Deserialize(f)
 | |
| 	if err != nil {
 | |
| 		return podService, err
 | |
| 	}
 | |
| 
 | |
| 	var hostnetwork, networkNamespace string
 | |
| 	for _, o := range opts {
 | |
| 		if o.Section != unitKubernetesSection {
 | |
| 			continue
 | |
| 		}
 | |
| 		switch o.Name {
 | |
| 		case unitPodUID:
 | |
| 			podService.id = o.Value
 | |
| 		case unitPodName:
 | |
| 			podService.name = o.Value
 | |
| 		case unitPodNamespace:
 | |
| 			podService.namespace = o.Value
 | |
| 		case unitPodHostNetwork:
 | |
| 			hostnetwork = o.Value
 | |
| 		case unitPodNetworkNamespace:
 | |
| 			networkNamespace = o.Value
 | |
| 		}
 | |
| 
 | |
| 		if podService.id != "" && podService.name != "" && podService.namespace != "" && hostnetwork != "" && networkNamespace != "" {
 | |
| 			podService.hostNetwork, err = strconv.ParseBool(hostnetwork)
 | |
| 			podService.networkNamespace = kubecontainer.ContainerID{ID: networkNamespace}
 | |
| 			if err != nil {
 | |
| 				return podService, err
 | |
| 			}
 | |
| 			return podService, nil
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return podService, fmt.Errorf("failed to parse pod from file %s", serviceFilePath)
 | |
| }
 | |
| 
 | |
// DeleteContainer is not implemented for the rkt runtime; it always returns
// an "unimplemented" error and ignores containerID.
func (r *Runtime) DeleteContainer(containerID kubecontainer.ContainerID) error {
	return fmt.Errorf("unimplemented")
}
 | |
| 
 | |
| // Collects all the systemd units for k8s Pods
 | |
| func (r *Runtime) getPodSystemdServiceFiles() ([]os.FileInfo, error) {
 | |
| 	// Get all the current units
 | |
| 	files, err := r.os.ReadDir(systemdServiceDir)
 | |
| 	if err != nil {
 | |
| 		glog.Errorf("rkt: Failed to read the systemd service directory: %v", err)
 | |
| 		return files, err
 | |
| 	}
 | |
| 
 | |
| 	// Keep only k8s unit files
 | |
| 	k8sSystemdServiceFiles := files[:0]
 | |
| 	for _, f := range files {
 | |
| 		if strings.HasPrefix(f.Name(), kubernetesUnitPrefix) {
 | |
| 			k8sSystemdServiceFiles = append(k8sSystemdServiceFiles, f)
 | |
| 		}
 | |
| 	}
 | |
| 	return k8sSystemdServiceFiles, err
 | |
| }
 | |
| 
 | |
| // GarbageCollect collects the pods/containers.
 | |
| // After one GC iteration:
 | |
| // - The deleted pods will be removed.
 | |
| // - If the number of containers exceeds gcPolicy.MaxContainers,
 | |
| //   then containers whose ages are older than gcPolicy.minAge will
 | |
| //   be removed.
 | |
| func (r *Runtime) GarbageCollect(gcPolicy kubecontainer.ContainerGCPolicy, allSourcesReady bool, _ bool) error {
 | |
| 	var errlist []error
 | |
| 	var totalInactiveContainers int
 | |
| 	var inactivePods []*rktapi.Pod
 | |
| 	var removeCandidates []*rktapi.Pod
 | |
| 	var allPods = map[string]*rktapi.Pod{}
 | |
| 
 | |
| 	glog.V(4).Infof("rkt: Garbage collecting triggered with policy %v", gcPolicy)
 | |
| 
 | |
| 	// GC all inactive systemd service files and pods.
 | |
| 	files, err := r.getPodSystemdServiceFiles()
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	ctx, cancel := context.WithTimeout(context.Background(), r.requestTimeout)
 | |
| 	defer cancel()
 | |
| 	resp, err := r.apisvc.ListPods(ctx, &rktapi.ListPodsRequest{Filters: kubernetesPodsFilters()})
 | |
| 	if err != nil {
 | |
| 		glog.Errorf("rkt: Failed to list pods: %v", err)
 | |
| 		return err
 | |
| 	}
 | |
| 
 | |
| 	// Mark inactive pods.
 | |
| 	for _, pod := range resp.Pods {
 | |
| 		allPods[pod.Id] = pod
 | |
| 		if !podIsActive(pod) {
 | |
| 			uid := getPodUID(pod)
 | |
| 			if uid == kubetypes.UID("") {
 | |
| 				glog.Errorf("rkt: Cannot get the UID of pod %q, pod is broken, will remove it", pod.Id)
 | |
| 				removeCandidates = append(removeCandidates, pod)
 | |
| 				continue
 | |
| 			}
 | |
| 			if r.podDeletionProvider.IsPodDeleted(uid) && allSourcesReady {
 | |
| 				removeCandidates = append(removeCandidates, pod)
 | |
| 				continue
 | |
| 			}
 | |
| 
 | |
| 			inactivePods = append(inactivePods, pod)
 | |
| 			totalInactiveContainers = totalInactiveContainers + len(pod.Apps)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// Remove any orphan service files.
 | |
| 	for _, f := range files {
 | |
| 		serviceName := f.Name()
 | |
| 		rktUUID := getRktUUIDFromServiceFileName(serviceName)
 | |
| 		if _, ok := allPods[rktUUID]; !ok {
 | |
| 			glog.V(4).Infof("rkt: No rkt pod found for service file %q, will remove it", serviceName)
 | |
| 
 | |
| 			if err := r.cleanupByPodId(rktUUID); err != nil {
 | |
| 				errlist = append(errlist, fmt.Errorf("rkt: Failed to clean up rkt pod %q: %v", rktUUID, err))
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	sort.Sort(podsByCreatedAt(inactivePods))
 | |
| 
 | |
| 	// Enforce GCPolicy.MaxContainers.
 | |
| 	for _, pod := range inactivePods {
 | |
| 		if totalInactiveContainers <= gcPolicy.MaxContainers {
 | |
| 			break
 | |
| 		}
 | |
| 		creationTime := time.Unix(0, pod.CreatedAt)
 | |
| 		if creationTime.Add(gcPolicy.MinAge).Before(time.Now()) {
 | |
| 			// The pod is old and we are exceeding the MaxContainers limit.
 | |
| 			// Delete the pod.
 | |
| 			removeCandidates = append(removeCandidates, pod)
 | |
| 			totalInactiveContainers = totalInactiveContainers - len(pod.Apps)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// Remove pods and their service files.
 | |
| 	for _, pod := range removeCandidates {
 | |
| 		if err := r.removePod(pod); err != nil {
 | |
| 			errlist = append(errlist, fmt.Errorf("rkt: Failed to clean up rkt pod %q: %v", pod.Id, err))
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return errors.NewAggregate(errlist)
 | |
| }
 | |
| 
 | |
| // Read kubernetes pod UUID, namespace, netns and name from systemd service file and
 | |
| // use that to clean up any pod network that may still exist.
 | |
| func (r *Runtime) cleanupPodNetworkFromServiceFile(serviceFilePath string) error {
 | |
| 	podService, err := r.unitGetter.getKubernetesDirective(serviceFilePath)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	return r.cleanupPodNetwork(&v1.Pod{
 | |
| 		ObjectMeta: metav1.ObjectMeta{
 | |
| 			UID:       kubetypes.UID(podService.id),
 | |
| 			Name:      podService.name,
 | |
| 			Namespace: podService.namespace,
 | |
| 		},
 | |
| 		Spec: v1.PodSpec{
 | |
| 			HostNetwork: podService.hostNetwork,
 | |
| 		},
 | |
| 	}, podService.networkNamespace)
 | |
| }
 | |
| 
 | |
| // Remove the touched file created by ExecStartPost in the systemd service file
 | |
| func (r *Runtime) removeFinishedMarkerFile(serviceName string) error {
 | |
| 	serviceFile := serviceFilePath(serviceName)
 | |
| 	podDetail, err := r.unitGetter.getKubernetesDirective(serviceFile)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	podDir := r.runtimeHelper.GetPodDir(kubetypes.UID(podDetail.id))
 | |
| 	finishedFile := podFinishedMarkerPath(podDir, getRktUUIDFromServiceFileName(serviceName))
 | |
| 	return r.os.Remove(finishedFile)
 | |
| }
 | |
| 
 | |
| // Iter over each container in the pod to delete its termination log file
 | |
| func (r *Runtime) removeTerminationFiles(pod *rktapi.Pod) (errlist []error) {
 | |
| 	// container == app
 | |
| 	for _, app := range pod.Apps {
 | |
| 		for _, annotation := range app.Annotations {
 | |
| 			if annotation.GetKey() == k8sRktTerminationMessagePathAnno {
 | |
| 				if err := r.os.Remove(annotation.GetValue()); err != nil {
 | |
| 					errlist = append(errlist, fmt.Errorf("rkt: Failed to remove for pod %q container file %v", pod.Id, err))
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	return errlist
 | |
| }
 | |
| 
 | |
| func (r *Runtime) cleanupByPodId(podID string) (errlist []error) {
 | |
| 	serviceName := makePodServiceFileName(podID)
 | |
| 	serviceFile := serviceFilePath(serviceName)
 | |
| 
 | |
| 	if err := r.cleanupPodNetworkFromServiceFile(serviceFile); err != nil {
 | |
| 		errlist = append(errlist, fmt.Errorf("rkt: Failed to clean up pod network from service %q: %v, the network may not be around already", serviceName, err))
 | |
| 	}
 | |
| 
 | |
| 	// GC finished marker, termination-log file, systemd service files as well.
 | |
| 	if err := r.systemd.ResetFailedUnit(serviceName); err != nil {
 | |
| 		errlist = append(errlist, fmt.Errorf("rkt: Failed to reset the failed systemd service %q: %v", serviceName, err))
 | |
| 	}
 | |
| 	if err := r.removeFinishedMarkerFile(serviceName); err != nil {
 | |
| 		errlist = append(errlist, fmt.Errorf("rkt: Failed to remove finished file %q for unit %q: %v", serviceName, podID, err))
 | |
| 	}
 | |
| 	if err := r.os.Remove(serviceFile); err != nil {
 | |
| 		errlist = append(errlist, fmt.Errorf("rkt: Failed to remove service file %q for pod %q: %v", serviceFile, podID, err))
 | |
| 	}
 | |
| 	return errlist
 | |
| }
 | |
| 
 | |
| // removePod calls 'rkt rm $UUID' to delete a rkt pod,
 | |
| // it also remove the systemd service file,
 | |
| // the finished-* marker and the termination-log files
 | |
| // related to the pod.
 | |
| func (r *Runtime) removePod(pod *rktapi.Pod) error {
 | |
| 	var errlist []error
 | |
| 	glog.V(4).Infof("rkt: GC is removing pod %q", pod)
 | |
| 
 | |
| 	if err := r.cleanupByPodId(pod.Id); err != nil {
 | |
| 		errlist = append(errlist, fmt.Errorf("rkt: Failed to remove pod %q: %v", pod.Id, err))
 | |
| 	}
 | |
| 	if err := r.removeTerminationFiles(pod); err != nil {
 | |
| 		errlist = append(errlist, fmt.Errorf("rkt: Failed to clean up pod TerminationMessageFile %q: %v", pod.Id, err))
 | |
| 	}
 | |
| 
 | |
| 	if _, err := r.cli.RunCommand(nil, "rm", pod.Id); err != nil {
 | |
| 		errlist = append(errlist, fmt.Errorf("rkt: Failed to remove pod %q: %v", pod.Id, err))
 | |
| 	}
 | |
| 
 | |
| 	return errors.NewAggregate(errlist)
 | |
| }
 | |
| 
 | |
| // rktExitError implements /pkg/util/exec.ExitError interface.
 | |
| type rktExitError struct{ *exec.ExitError }
 | |
| 
 | |
| var _ utilexec.ExitError = &rktExitError{}
 | |
| 
 | |
| func (r *rktExitError) ExitStatus() int {
 | |
| 	if status, ok := r.Sys().(syscall.WaitStatus); ok {
 | |
| 		return status.ExitStatus()
 | |
| 	}
 | |
| 	return 0
 | |
| }
 | |
| 
 | |
| func newRktExitError(e error) error {
 | |
| 	if exitErr, ok := e.(*exec.ExitError); ok {
 | |
| 		return &rktExitError{exitErr}
 | |
| 	}
 | |
| 	return e
 | |
| }
 | |
| 
 | |
// AttachContainer is not implemented for the rkt runtime; it always returns
// an "unimplemented" error and ignores all of its arguments.
func (r *Runtime) AttachContainer(containerID kubecontainer.ContainerID, stdin io.Reader, stdout, stderr io.WriteCloser, tty bool, resize <-chan remotecommand.TerminalSize) error {
	return fmt.Errorf("unimplemented")
}
 | |
| 
 | |
| // Note: In rkt, the container ID is in the form of "UUID:appName", where UUID is
 | |
| // the rkt UUID, and appName is the container name.
 | |
| // TODO(yifan): If the rkt is using lkvm as the stage1 image, then this function will fail.
 | |
| func (r *Runtime) ExecInContainer(containerID kubecontainer.ContainerID, cmd []string, stdin io.Reader, stdout, stderr io.WriteCloser, tty bool, resize <-chan remotecommand.TerminalSize, timeout time.Duration) error {
 | |
| 	glog.V(4).Infof("Rkt execing in container.")
 | |
| 
 | |
| 	id, err := parseContainerID(containerID)
 | |
| 	if err != nil {
 | |
| 		return err
 | |
| 	}
 | |
| 	args := []string{"enter", fmt.Sprintf("--app=%s", id.appName), id.uuid}
 | |
| 	args = append(args, cmd...)
 | |
| 	command := buildCommand(r.config, args...)
 | |
| 
 | |
| 	if tty {
 | |
| 		p, err := kubecontainer.StartPty(command)
 | |
| 		if err != nil {
 | |
| 			return err
 | |
| 		}
 | |
| 		defer p.Close()
 | |
| 
 | |
| 		// make sure to close the stdout stream
 | |
| 		defer stdout.Close()
 | |
| 
 | |
| 		kubecontainer.HandleResizing(resize, func(size remotecommand.TerminalSize) {
 | |
| 			term.SetSize(p.Fd(), size)
 | |
| 		})
 | |
| 
 | |
| 		if stdin != nil {
 | |
| 			go io.Copy(p, stdin)
 | |
| 		}
 | |
| 		if stdout != nil {
 | |
| 			go io.Copy(stdout, p)
 | |
| 		}
 | |
| 		return newRktExitError(command.Wait())
 | |
| 	}
 | |
| 	if stdin != nil {
 | |
| 		// Use an os.Pipe here as it returns true *os.File objects.
 | |
| 		// This way, if you run 'kubectl exec <pod> -i bash' (no tty) and type 'exit',
 | |
| 		// the call below to command.Run() can unblock because its Stdin is the read half
 | |
| 		// of the pipe.
 | |
| 		r, w, err := r.os.Pipe()
 | |
| 		if err != nil {
 | |
| 			return newRktExitError(err)
 | |
| 		}
 | |
| 		go io.Copy(w, stdin)
 | |
| 
 | |
| 		command.Stdin = r
 | |
| 	}
 | |
| 	if stdout != nil {
 | |
| 		command.Stdout = stdout
 | |
| 	}
 | |
| 	if stderr != nil {
 | |
| 		command.Stderr = stderr
 | |
| 	}
 | |
| 	return newRktExitError(command.Run())
 | |
| }
 | |
| 
 | |
| // PortForward executes socat in the pod's network namespace and copies
 | |
| // data between stream (representing the user's local connection on their
 | |
| // computer) and the specified port in the container.
 | |
| //
 | |
| // TODO:
 | |
| //  - match cgroups of container
 | |
| //  - should we support nsenter + socat on the host? (current impl)
 | |
| //  - should we support nsenter + socat in a container, running with elevated privs and --pid=host?
 | |
| //
 | |
| // TODO(yifan): Merge with the same function in dockertools.
 | |
| func (r *Runtime) PortForward(pod *kubecontainer.Pod, port int32, stream io.ReadWriteCloser) error {
 | |
| 	glog.V(4).Infof("Rkt port forwarding in container.")
 | |
| 
 | |
| 	ctx, cancel := context.WithTimeout(context.Background(), r.requestTimeout)
 | |
| 	defer cancel()
 | |
| 	listResp, err := r.apisvc.ListPods(ctx, &rktapi.ListPodsRequest{
 | |
| 		Detail:  true,
 | |
| 		Filters: runningKubernetesPodFilters(pod.ID),
 | |
| 	})
 | |
| 	if err != nil {
 | |
| 		return fmt.Errorf("couldn't list pods: %v", err)
 | |
| 	}
 | |
| 
 | |
| 	if len(listResp.Pods) != 1 {
 | |
| 		var podlist []string
 | |
| 		for _, p := range listResp.Pods {
 | |
| 			podlist = append(podlist, p.Id)
 | |
| 		}
 | |
| 		return fmt.Errorf("more than one running rkt pod for the kubernetes pod [%s]", strings.Join(podlist, ", "))
 | |
| 	}
 | |
| 	listPod := listResp.Pods[0]
 | |
| 
 | |
| 	socatPath, lookupErr := exec.LookPath("socat")
 | |
| 	if lookupErr != nil {
 | |
| 		return fmt.Errorf("unable to do port forwarding: socat not found.")
 | |
| 	}
 | |
| 
 | |
| 	// Check in config and in annotations if we're running kvm flavor
 | |
| 	isKvm := strings.Contains(r.config.Stage1Image, "kvm")
 | |
| 	for _, anno := range listPod.Annotations {
 | |
| 		if anno.Key == k8sRktStage1NameAnno {
 | |
| 			isKvm = strings.Contains(anno.Value, "kvm")
 | |
| 			break
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	var args []string
 | |
| 	var fwCaller string
 | |
| 	if isKvm {
 | |
| 		podNetworks := listPod.GetNetworks()
 | |
| 		if podNetworks == nil {
 | |
| 			return fmt.Errorf("unable to get networks")
 | |
| 		}
 | |
| 		args = []string{"-", fmt.Sprintf("TCP4:%s:%d", podNetworks[0].Ipv4, port)}
 | |
| 		fwCaller = socatPath
 | |
| 	} else {
 | |
| 		args = []string{"-t", fmt.Sprintf("%d", listPod.Pid), "-n", socatPath, "-", fmt.Sprintf("TCP4:localhost:%d", port)}
 | |
| 		nsenterPath, lookupErr := exec.LookPath("nsenter")
 | |
| 		if lookupErr != nil {
 | |
| 			return fmt.Errorf("unable to do port forwarding: nsenter not found")
 | |
| 		}
 | |
| 		fwCaller = nsenterPath
 | |
| 	}
 | |
| 
 | |
| 	command := exec.Command(fwCaller, args...)
 | |
| 	command.Stdout = stream
 | |
| 
 | |
| 	// If we use Stdin, command.Run() won't return until the goroutine that's copying
 | |
| 	// from stream finishes. Unfortunately, if you have a client like telnet connected
 | |
| 	// via port forwarding, as long as the user's telnet client is connected to the user's
 | |
| 	// local listener that port forwarding sets up, the telnet session never exits. This
 | |
| 	// means that even if socat has finished running, command.Run() won't ever return
 | |
| 	// (because the client still has the connection and stream open).
 | |
| 	//
 | |
| 	// The work around is to use StdinPipe(), as Wait() (called by Run()) closes the pipe
 | |
| 	// when the command (socat) exits.
 | |
| 	inPipe, err := command.StdinPipe()
 | |
| 	if err != nil {
 | |
| 		return fmt.Errorf("unable to do port forwarding: error creating stdin pipe: %v", err)
 | |
| 	}
 | |
| 	go func() {
 | |
| 		io.Copy(inPipe, stream)
 | |
| 		inPipe.Close()
 | |
| 	}()
 | |
| 
 | |
| 	return command.Run()
 | |
| }
 | |
| 
 | |
// UpdatePodCIDR updates the runtimeconfig with the podCIDR.
// Currently a no-op that ignores podCIDR and always returns nil; it is only
// implemented to satisfy the cri.
func (r *Runtime) UpdatePodCIDR(podCIDR string) error {
	return nil
}
 | |
| 
 | |
| // appStateToContainerState converts rktapi.AppState to kubecontainer.ContainerState.
 | |
| func appStateToContainerState(state rktapi.AppState) kubecontainer.ContainerState {
 | |
| 	switch state {
 | |
| 	case rktapi.AppState_APP_STATE_RUNNING:
 | |
| 		return kubecontainer.ContainerStateRunning
 | |
| 	case rktapi.AppState_APP_STATE_EXITED:
 | |
| 		return kubecontainer.ContainerStateExited
 | |
| 	}
 | |
| 	return kubecontainer.ContainerStateUnknown
 | |
| }
 | |
| 
 | |
| // getPodInfo returns the pod manifest, creation time and restart count of the pod.
 | |
| func getPodInfo(pod *rktapi.Pod) (podManifest *appcschema.PodManifest, restartCount int, err error) {
 | |
| 	// TODO(yifan): The manifest is only used for getting the annotations.
 | |
| 	// Consider to let the server to unmarshal the annotations.
 | |
| 	var manifest appcschema.PodManifest
 | |
| 	if err = json.Unmarshal(pod.Manifest, &manifest); err != nil {
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	if countString, ok := manifest.Annotations.Get(k8sRktRestartCountAnno); ok {
 | |
| 		restartCount, err = strconv.Atoi(countString)
 | |
| 		if err != nil {
 | |
| 			return
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return &manifest, restartCount, nil
 | |
| }
 | |
| 
 | |
| // populateContainerStatus fills the container status according to the app's information.
 | |
| func populateContainerStatus(pod rktapi.Pod, app rktapi.App, runtimeApp appcschema.RuntimeApp, restartCount int, finishedTime time.Time) (*kubecontainer.ContainerStatus, error) {
 | |
| 	hashStr, ok := runtimeApp.Annotations.Get(k8sRktContainerHashAnno)
 | |
| 	if !ok {
 | |
| 		return nil, fmt.Errorf("No container hash in pod manifest")
 | |
| 	}
 | |
| 
 | |
| 	hashNum, err := strconv.ParseUint(hashStr, 10, 64)
 | |
| 	if err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 
 | |
| 	var reason, message string
 | |
| 	if app.State == rktapi.AppState_APP_STATE_EXITED {
 | |
| 		if app.ExitCode == 0 {
 | |
| 			reason = "Completed"
 | |
| 		} else {
 | |
| 			reason = "Error"
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	terminationMessagePath, ok := runtimeApp.Annotations.Get(k8sRktTerminationMessagePathAnno)
 | |
| 	if ok {
 | |
| 		if data, err := ioutil.ReadFile(terminationMessagePath); err != nil {
 | |
| 			message = fmt.Sprintf("Error on reading termination-log %s: %v", terminationMessagePath, err)
 | |
| 		} else {
 | |
| 			message = string(data)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	createdTime := time.Unix(0, pod.CreatedAt)
 | |
| 	startedTime := time.Unix(0, pod.StartedAt)
 | |
| 
 | |
| 	return &kubecontainer.ContainerStatus{
 | |
| 		ID:         buildContainerID(&containerID{uuid: pod.Id, appName: app.Name}),
 | |
| 		Name:       app.Name,
 | |
| 		State:      appStateToContainerState(app.State),
 | |
| 		CreatedAt:  createdTime,
 | |
| 		StartedAt:  startedTime,
 | |
| 		FinishedAt: finishedTime,
 | |
| 		ExitCode:   int(app.ExitCode),
 | |
| 		// By default, the version returned by rkt API service will be "latest" if not specified.
 | |
| 		Image:   fmt.Sprintf("%s:%s", app.Image.Name, app.Image.Version),
 | |
| 		ImageID: "rkt://" + app.Image.Id, // TODO(yifan): Add the prefix only in v1.PodStatus.
 | |
| 		Hash:    hashNum,
 | |
| 		// TODO(yifan): Note that now all apps share the same restart count, this might
 | |
| 		// change once apps don't share the same lifecycle.
 | |
| 		// See https://github.com/appc/spec/pull/547.
 | |
| 		RestartCount: restartCount,
 | |
| 		Reason:       reason,
 | |
| 		Message:      message,
 | |
| 	}, nil
 | |
| }
 | |
| 
 | |
| // from a running systemd unit, return the network namespace of a Pod
 | |
| // this field is inside the X-Kubernetes directive
 | |
| func (r *Runtime) getNetworkNamespace(uid kubetypes.UID, latestPod *rktapi.Pod) (networkNamespace kubecontainer.ContainerID, err error) {
 | |
| 	serviceFiles, err := r.getPodSystemdServiceFiles()
 | |
| 	if err != nil {
 | |
| 		return networkNamespace, err
 | |
| 	}
 | |
| 
 | |
| 	for _, f := range serviceFiles {
 | |
| 		fileName := f.Name()
 | |
| 		if latestPod.Id == getRktUUIDFromServiceFileName(fileName) {
 | |
| 			podService, err := r.unitGetter.getKubernetesDirective(serviceFilePath(fileName))
 | |
| 			if err != nil {
 | |
| 				return networkNamespace, err
 | |
| 			}
 | |
| 			return podService.networkNamespace, nil
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return networkNamespace, fmt.Errorf("Pod %q containing rktPod %q haven't find a corresponding NetworkNamespace in %d systemd units", uid, latestPod.Id, len(serviceFiles))
 | |
| }
 | |
| 
 | |
| // GetPodStatus returns the status for a pod specified by a given UID, name,
 | |
| // and namespace.  It will attempt to find pod's information via a request to
 | |
| // the rkt api server.
 | |
| // An error will be returned if the api server returns an error. If the api
 | |
| // server doesn't error, but doesn't provide meaningful information about the
 | |
| // pod, a status with no information (other than the passed in arguments) is
 | |
| // returned anyways.
 | |
| func (r *Runtime) GetPodStatus(uid kubetypes.UID, name, namespace string) (*kubecontainer.PodStatus, error) {
 | |
| 	podStatus := &kubecontainer.PodStatus{
 | |
| 		ID:        uid,
 | |
| 		Name:      name,
 | |
| 		Namespace: namespace,
 | |
| 	}
 | |
| 
 | |
| 	ctx, cancel := context.WithTimeout(context.Background(), r.requestTimeout)
 | |
| 	defer cancel()
 | |
| 	listResp, err := r.apisvc.ListPods(ctx, &rktapi.ListPodsRequest{
 | |
| 		Detail:  true,
 | |
| 		Filters: kubernetesPodFilters(uid),
 | |
| 	})
 | |
| 	if err != nil {
 | |
| 		return nil, fmt.Errorf("couldn't list pods: %v", err)
 | |
| 	}
 | |
| 
 | |
| 	var latestPod *rktapi.Pod
 | |
| 	var latestRestartCount int = -1
 | |
| 
 | |
| 	// In this loop, we group all containers from all pods together,
 | |
| 	// also we try to find the latest pod, so we can fill other info of the pod below.
 | |
| 	for _, pod := range listResp.Pods {
 | |
| 		manifest, restartCount, err := getPodInfo(pod)
 | |
| 		if err != nil {
 | |
| 			glog.Warningf("rkt: Couldn't get necessary info from the rkt pod, (uuid %q): %v", pod.Id, err)
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		if restartCount > latestRestartCount {
 | |
| 			latestPod = pod
 | |
| 			latestRestartCount = restartCount
 | |
| 		}
 | |
| 
 | |
| 		finishedTime := r.podFinishedAt(uid, pod.Id)
 | |
| 		for i, app := range pod.Apps {
 | |
| 			// The order of the apps is determined by the rkt pod manifest.
 | |
| 			cs, err := populateContainerStatus(*pod, *app, manifest.Apps[i], restartCount, finishedTime)
 | |
| 			if err != nil {
 | |
| 				glog.Warningf("rkt: Failed to populate container status(uuid %q, app %q): %v", pod.Id, app.Name, err)
 | |
| 				continue
 | |
| 			}
 | |
| 			podStatus.ContainerStatuses = append(podStatus.ContainerStatuses, cs)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	if latestPod == nil {
 | |
| 		glog.Warningf("No latestPod: rkt api-svc returns [%d]rktPods, cannot fill podStatus.IP", len(listResp.Pods))
 | |
| 		return podStatus, nil
 | |
| 	}
 | |
| 
 | |
| 	// If we are running no-op network plugin, then get the pod IP from the rkt pod status.
 | |
| 	if r.network.PluginName() == network.DefaultPluginName {
 | |
| 		for _, n := range latestPod.Networks {
 | |
| 			if n.Name == defaultNetworkName {
 | |
| 				podStatus.IP = n.Ipv4
 | |
| 				break
 | |
| 			}
 | |
| 		}
 | |
| 		return podStatus, nil
 | |
| 	}
 | |
| 
 | |
| 	networkNamespace, err := r.unitGetter.getNetworkNamespace(uid, latestPod)
 | |
| 	if err != nil {
 | |
| 		glog.Warningf("networkNamespace: %v", err)
 | |
| 	}
 | |
| 	status, err := r.network.GetPodNetworkStatus(namespace, name, networkNamespace)
 | |
| 	if err != nil {
 | |
| 		glog.Warningf("rkt: %v", err)
 | |
| 	} else if status != nil {
 | |
| 		// status can be nil when the pod is running on the host network,
 | |
| 		// in which case the pod IP will be populated by the upper layer.
 | |
| 		podStatus.IP = status.IP.String()
 | |
| 	}
 | |
| 
 | |
| 	return podStatus, nil
 | |
| }
 | |
| 
 | |
| // getOSReleaseInfo reads /etc/os-release and returns a map
 | |
| // that contains the key value pairs in that file.
 | |
| func getOSReleaseInfo() (map[string]string, error) {
 | |
| 	result := make(map[string]string)
 | |
| 
 | |
| 	path := "/etc/os-release"
 | |
| 	f, err := os.Open(path)
 | |
| 	if err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 	defer f.Close()
 | |
| 
 | |
| 	scanner := bufio.NewScanner(f)
 | |
| 	for scanner.Scan() {
 | |
| 		line := scanner.Text()
 | |
| 		if len(strings.TrimSpace(line)) == 0 {
 | |
| 			// Skips empty lines
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		info := strings.SplitN(line, "=", 2)
 | |
| 		if len(info) != 2 {
 | |
| 			glog.Warningf("Unexpected entry in os-release %q", line)
 | |
| 			continue
 | |
| 		}
 | |
| 		result[info[0]] = info[1]
 | |
| 	}
 | |
| 	if err := scanner.Err(); err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 	return result, nil
 | |
| }
 | |
| 
 | |
| // convertKubeMounts creates appc volumes and mount points according to the given mounts.
 | |
| // Only one volume will be created for every unique host path.
 | |
| // Only one mount point will be created for every unique container path.
 | |
| func convertKubeMounts(mounts []kubecontainer.Mount) ([]appctypes.Volume, []appctypes.MountPoint) {
 | |
| 	volumeMap := make(map[string]*appctypes.Volume)
 | |
| 	mountPointMap := make(map[string]*appctypes.MountPoint)
 | |
| 
 | |
| 	for _, mnt := range mounts {
 | |
| 		readOnly := mnt.ReadOnly
 | |
| 
 | |
| 		if _, existed := volumeMap[mnt.HostPath]; !existed {
 | |
| 			volumeMap[mnt.HostPath] = &appctypes.Volume{
 | |
| 				Name:     *appctypes.MustACName(string(uuid.NewUUID())),
 | |
| 				Kind:     "host",
 | |
| 				Source:   mnt.HostPath,
 | |
| 				ReadOnly: &readOnly,
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		if _, existed := mountPointMap[mnt.ContainerPath]; existed {
 | |
| 			glog.Warningf("Multiple mount points with the same container path %v, ignore it", mnt)
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		mountPointMap[mnt.ContainerPath] = &appctypes.MountPoint{
 | |
| 			Name:     volumeMap[mnt.HostPath].Name,
 | |
| 			Path:     mnt.ContainerPath,
 | |
| 			ReadOnly: readOnly,
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	volumes := make([]appctypes.Volume, 0, len(volumeMap))
 | |
| 	mountPoints := make([]appctypes.MountPoint, 0, len(mountPointMap))
 | |
| 
 | |
| 	for _, vol := range volumeMap {
 | |
| 		volumes = append(volumes, *vol)
 | |
| 	}
 | |
| 	for _, mnt := range mountPointMap {
 | |
| 		mountPoints = append(mountPoints, *mnt)
 | |
| 	}
 | |
| 
 | |
| 	return volumes, mountPoints
 | |
| }
 | |
| 
 | |
| // convertKubePortMappings creates appc container ports and host ports according to the given port mappings.
 | |
| // The container ports and host ports are mapped by PortMapping.Name.
 | |
| func convertKubePortMappings(portMappings []kubecontainer.PortMapping) ([]appctypes.Port, []appctypes.ExposedPort) {
 | |
| 	containerPorts := make([]appctypes.Port, 0, len(portMappings))
 | |
| 	hostPorts := make([]appctypes.ExposedPort, 0, len(portMappings))
 | |
| 
 | |
| 	for _, p := range portMappings {
 | |
| 		// This matches the docker code's behaviour.
 | |
| 		if p.HostPort == 0 {
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		portName := convertToACName(p.Name)
 | |
| 		containerPorts = append(containerPorts, appctypes.Port{
 | |
| 			Name:     portName,
 | |
| 			Protocol: string(p.Protocol),
 | |
| 			Port:     uint(p.ContainerPort),
 | |
| 		})
 | |
| 
 | |
| 		hostPorts = append(hostPorts, appctypes.ExposedPort{
 | |
| 			Name:     portName,
 | |
| 			HostPort: uint(p.HostPort),
 | |
| 		})
 | |
| 	}
 | |
| 
 | |
| 	return containerPorts, hostPorts
 | |
| }
 | |
| 
 | |
| func newNoNewPrivilegesIsolator(v bool) (*appctypes.Isolator, error) {
 | |
| 	b := fmt.Sprintf(`{"name": "%s", "value": %t}`, appctypes.LinuxNoNewPrivilegesName, v)
 | |
| 
 | |
| 	i := &appctypes.Isolator{
 | |
| 		Name: appctypes.LinuxNoNewPrivilegesName,
 | |
| 	}
 | |
| 	if err := i.UnmarshalJSON([]byte(b)); err != nil {
 | |
| 		return nil, err
 | |
| 	}
 | |
| 
 | |
| 	return i, nil
 | |
| }
 |