Add sysctl whitelist on the node

This commit is contained in:
Dr. Stefan Schimanski 2016-08-19 10:53:25 +02:00
parent ed36baed20
commit e356e52247
13 changed files with 489 additions and 4 deletions

View File

@ -183,6 +183,7 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) {
fs.BoolVar(&s.MakeIPTablesUtilChains, "make-iptables-util-chains", s.MakeIPTablesUtilChains, "If true, kubelet will ensure iptables utility rules are present on host.")
fs.Int32Var(&s.IPTablesMasqueradeBit, "iptables-masquerade-bit", s.IPTablesMasqueradeBit, "The bit of the fwmark space to mark packets for SNAT. Must be within the range [0, 31]. Please match this parameter with corresponding parameter in kube-proxy.")
fs.Int32Var(&s.IPTablesDropBit, "iptables-drop-bit", s.IPTablesDropBit, "The bit of the fwmark space to mark packets for dropping. Must be within the range [0, 31].")
fs.StringSliceVar(&s.AllowedUnsafeSysctls, "experimental-allowed-unsafe-sysctls", s.AllowedUnsafeSysctls, "Comma-separated whitelist of unsafe sysctls or unsafe sysctl patterns (ending in *). Use these at your own risk.")
// Flags intended for testing, not recommended used in production environments.
fs.StringVar(&s.RemoteRuntimeEndpoint, "container-runtime-endpoint", s.RemoteRuntimeEndpoint, "The unix socket endpoint of remote runtime service. If not empty, this option will override --container-runtime. This is an experimental feature. Intended for testing only.")

View File

@ -290,6 +290,7 @@ func UnsecuredKubeletConfig(s *options.KubeletServer) (*KubeletConfig, error) {
StandaloneMode: (len(s.APIServerList) == 0),
StreamingConnectionIdleTimeout: s.StreamingConnectionIdleTimeout.Duration,
SyncFrequency: s.SyncFrequency.Duration,
AllowedUnsafeSysctls: s.AllowedUnsafeSysctls,
SystemCgroups: s.SystemCgroups,
TLSOptions: tlsOptions,
Writer: writer,
@ -1098,6 +1099,7 @@ type KubeletConfig struct {
StandaloneMode bool
StreamingConnectionIdleTimeout time.Duration
SyncFrequency time.Duration
AllowedUnsafeSysctls []string
SystemCgroups string
TLSOptions *server.TLSOptions
Writer io.Writer
@ -1218,6 +1220,7 @@ func CreateAndInitKubelet(kc *KubeletConfig) (k KubeletBootstrap, pc *config.Pod
kc.MakeIPTablesUtilChains,
kc.iptablesMasqueradeBit,
kc.iptablesDropBit,
kc.AllowedUnsafeSysctls,
)
if err != nil {

View File

@ -100,6 +100,7 @@ pkg/kubelet/api
pkg/kubelet/container
pkg/kubelet/envvars
pkg/kubelet/eviction
pkg/kubelet/sysctls
pkg/kubelet/util/format
pkg/kubelet/util/ioutils
pkg/kubelet/volume
@ -139,6 +140,7 @@ pkg/runtime/serializer/yaml
pkg/security
pkg/security/podsecuritypolicy/apparmor
pkg/security/podsecuritypolicy/capabilities
pkg/security/podsecuritypolicy/sysctl
pkg/serviceaccount
pkg/storage
pkg/storage/etcd3

View File

@ -164,6 +164,7 @@ executor-logv
executor-path
executor-suicide-timeout
exit-on-lock-contention
experimental-allowed-unsafe-sysctls
experimental-bootstrap-kubeconfig
experimental-flannel-overlay
experimental-keystone-url

View File

@ -419,6 +419,8 @@ type KubeletConfiguration struct {
// iptablesDropBit is the bit of the iptables fwmark space to use for dropping packets. Kubelet will ensure iptables mark and drop rules.
// Values must be within the range [0, 31]. Must be different from IPTablesMasqueradeBit
IPTablesDropBit int32 `json:"iptablesDropBit"`
// Whitelist of unsafe sysctls or sysctl patterns (ending in *).
AllowedUnsafeSysctls []string `json:"experimentalAllowedUnsafeSysctls,omitempty"`
}
type KubeSchedulerConfiguration struct {

View File

@ -474,4 +474,7 @@ type KubeletConfiguration struct {
// iptablesDropBit is the bit of the iptables fwmark space to mark for dropping packets.
// Values must be within the range [0, 31]. Must be different from other mark bits.
IPTablesDropBit *int32 `json:"iptablesDropBit"`
// Whitelist of unsafe sysctls or sysctl patterns (ending in *). Use these at your own risk.
// Resource isolation might be lacking and pod might influence each other on the same node.
AllowedUnsafeSysctls []string `json:"allowedUnsafeSysctls,omitempty"`
}

View File

@ -76,9 +76,10 @@ const (
// docker version should be at least 1.9.x
minimumDockerAPIVersion = "1.21"
// Remote API version for docker daemon version v1.10
// Remote API version for docker daemon versions
// https://docs.docker.com/engine/reference/api/docker_remote_api/
dockerV110APIVersion = "1.22"
DockerV112APIVersion = "1.24"
// ndots specifies the minimum number of dots that a domain name must contain for the resolver to consider it as FQDN (fully-qualified)
// we want to able to consider SRV lookup names like _dns._udp.kube-dns.default.svc to be considered relative.
@ -661,16 +662,19 @@ func (dm *DockerManager) runContainer(
}
// Set sysctls if requested
sysctls, err := api.SysctlsFromPodAnnotation(pod.Annotations[api.SysctlsPodAnnotationKey])
sysctls, unsafeSysctls, err := api.SysctlsFromPodAnnotations(pod.Annotations)
if err != nil {
dm.recorder.Eventf(ref, api.EventTypeWarning, events.FailedToCreateContainer, "Failed to create docker container %q of pod %q with error: %v", container.Name, format.Pod(pod), err)
return kubecontainer.ContainerID{}, err
}
if len(sysctls) > 0 {
hc.Sysctls = make(map[string]string, len(sysctls))
if len(sysctls)+len(unsafeSysctls) > 0 {
hc.Sysctls = make(map[string]string, len(sysctls)+len(unsafeSysctls))
for _, c := range sysctls {
hc.Sysctls[c.Name] = c.Value
}
for _, c := range unsafeSysctls {
hc.Sysctls[c.Name] = c.Value
}
}
// If current api version is newer than docker 1.10 requested, set OomScoreAdj to HostConfig

View File

@ -67,6 +67,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/server"
"k8s.io/kubernetes/pkg/kubelet/server/stats"
"k8s.io/kubernetes/pkg/kubelet/status"
"k8s.io/kubernetes/pkg/kubelet/sysctl"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/kubelet/util/format"
"k8s.io/kubernetes/pkg/kubelet/util/ioutils"
@ -249,6 +250,7 @@ func NewMainKubelet(
makeIPTablesUtilChains bool,
iptablesMasqueradeBit int,
iptablesDropBit int,
allowedUnsafeSysctls []string,
) (*Kubelet, error) {
if rootDirectory == "" {
return nil, fmt.Errorf("invalid root directory %q", rootDirectory)
@ -591,6 +593,26 @@ func NewMainKubelet(
klet.evictionManager = evictionManager
klet.AddPodAdmitHandler(evictionAdmitHandler)
// add sysctl admission
runtimeSupport, err := sysctl.NewRuntimeAdmitHandler(klet.containerRuntime)
if err != nil {
return nil, err
}
safeWhitelist, err := sysctl.NewWhitelist(sysctl.SafeSysctlWhitelist(), api.SysctlsPodAnnotationKey)
if err != nil {
return nil, err
}
// Safe, whitelisted sysctls can always be used as unsafe sysctls in the spec
// Hence, we concatenate those two lists.
safeAndUnsafeSysctls := append(sysctl.SafeSysctlWhitelist(), allowedUnsafeSysctls...)
unsafeWhitelist, err := sysctl.NewWhitelist(safeAndUnsafeSysctls, api.UnsafeSysctlsPodAnnotationKey)
if err != nil {
return nil, err
}
klet.AddPodAdmitHandler(runtimeSupport)
klet.AddPodAdmitHandler(safeWhitelist)
klet.AddPodAdmitHandler(unsafeWhitelist)
// enable active deadline handler
activeDeadlineHandler, err := newActiveDeadlineHandler(klet.statusManager, klet.recorder, klet.clock)
if err != nil {

View File

@ -0,0 +1,60 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sysctl
import (
"strings"
)
// Namespace represents a kernel namespace name.
type Namespace string
const (
// the Linux IPC namespace
IpcNamespace = Namespace("ipc")
// the network namespace
NetNamespace = Namespace("net")
// the zero value if no namespace is known
UnknownNamespace = Namespace("")
)
var namespaces = map[string]Namespace{
"kernel.sem": IpcNamespace,
}
var prefixNamespaces = map[string]Namespace{
"kernel.shm": IpcNamespace,
"kernel.msg": IpcNamespace,
"fs.mqueue.": IpcNamespace,
"net.": NetNamespace,
}
// NamespacedBy returns the namespace of the Linux kernel for a sysctl, or
// UnknownNamespace if the sysctl is not known to be namespaced.
func NamespacedBy(val string) Namespace {
if ns, found := namespaces[val]; found {
return ns
}
for p, ns := range prefixNamespaces {
if strings.HasPrefix(val, p) {
return ns
}
}
return UnknownNamespace
}

View File

@ -0,0 +1,36 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sysctl
import (
"testing"
)
func TestNamespacedBy(t *testing.T) {
tests := map[string]Namespace{
"kernel.shm_rmid_forced": IpcNamespace,
"net.a.b.c": NetNamespace,
"fs.mqueue.a.b.c": IpcNamespace,
"foo": UnknownNamespace,
}
for sysctl, ns := range tests {
if got := NamespacedBy(sysctl); got != ns {
t.Errorf("wrong namespace for %q: got=%s want=%s", sysctl, got, ns)
}
}
}

View File

@ -0,0 +1,96 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sysctl
import (
"fmt"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/dockertools"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
)
const (
UnsupportedReason = "SysctlUnsupported"
)
type runtimeAdmitHandler struct {
result lifecycle.PodAdmitResult
}
var _ lifecycle.PodAdmitHandler = &runtimeAdmitHandler{}
// NewRuntimeAdmitHandler returns a sysctlRuntimeAdmitHandler which checks whether
// the given runtime support sysctls.
func NewRuntimeAdmitHandler(runtime container.Runtime) (*runtimeAdmitHandler, error) {
if runtime.Type() == dockertools.DockerType {
v, err := runtime.APIVersion()
if err != nil {
return nil, fmt.Errorf("failed to get runtime version: %v", err)
}
// only Docker >= 1.12 supports sysctls
c, err := v.Compare(dockertools.DockerV112APIVersion)
if err != nil {
return nil, fmt.Errorf("failed to compare Docker version for sysctl support: %v", err)
}
if c >= 0 {
return &runtimeAdmitHandler{
result: lifecycle.PodAdmitResult{
Admit: true,
},
}, nil
}
return &runtimeAdmitHandler{
result: lifecycle.PodAdmitResult{
Admit: false,
Reason: UnsupportedReason,
Message: "Docker before 1.12 does not support sysctls",
},
}, nil
}
// for other runtimes like rkt sysctls are not supported
return &runtimeAdmitHandler{
result: lifecycle.PodAdmitResult{
Admit: false,
Reason: UnsupportedReason,
Message: fmt.Sprintf("runtime %v does not support sysctls", runtime.Type()),
},
}, nil
}
// Admit checks whether the runtime supports sysctls.
func (w *runtimeAdmitHandler) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
sysctls, unsafeSysctls, err := api.SysctlsFromPodAnnotations(attrs.Pod.Annotations)
if err != nil {
return lifecycle.PodAdmitResult{
Admit: false,
Reason: AnnotationInvalidReason,
Message: fmt.Sprintf("invalid sysctl annotation: %v", err),
}
}
if len(sysctls)+len(unsafeSysctls) > 0 {
return w.result
}
return lifecycle.PodAdmitResult{
Admit: true,
}
}

View File

@ -0,0 +1,171 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sysctl
import (
"fmt"
"strings"
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/validation"
extvalidation "k8s.io/kubernetes/pkg/apis/extensions/validation"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
)
const (
AnnotationInvalidReason = "InvalidSysctlAnnotation"
ForbiddenReason = "SysctlForbidden"
)
// SafeSysctlWhitelist returns the whitelist of safe sysctls and safe sysctl patterns (ending in *).
//
// A sysctl is called safe iff
// - it is namespaced in the container or the pod
// - it is isolated, i.e. has no influence on any other pod on the same node.
func SafeSysctlWhitelist() []string {
return []string{
"kernel.shm_rmid_forced",
"net.ipv4.ip_local_port_range",
"net.ipv4.tcp_max_syn_backlog",
"net.ipv4.tcp_syncookies",
}
}
// Whitelist provides a list of allowed sysctls and sysctl patterns (ending in *)
// and a function to check whether a given sysctl matches this list.
type Whitelist interface {
// Validate checks that all sysctls given in a api.SysctlsPodAnnotationKey annotation
// are valid according to the whitelist.
Validate(pod *api.Pod) error
}
// patternWhitelist takes a list of sysctls or sysctl patterns (ending in *) and
// checks validity via a sysctl and prefix map, rejecting those which are not known
// to be namespaced.
type patternWhitelist struct {
sysctls map[string]Namespace
prefixes map[string]Namespace
annotationKey string
}
var _ lifecycle.PodAdmitHandler = &patternWhitelist{}
// NewWhitelist creates a new Whitelist from a list of sysctls and sysctl pattern (ending in *).
func NewWhitelist(patterns []string, annotationKey string) (*patternWhitelist, error) {
w := &patternWhitelist{
sysctls: map[string]Namespace{},
prefixes: map[string]Namespace{},
annotationKey: annotationKey,
}
for _, s := range patterns {
if !extvalidation.IsValidSysctlPattern(s) {
return nil, fmt.Errorf("sysctl %q must have at most %d characters and match regex %s",
s,
validation.SysctlMaxLength,
extvalidation.SysctlPatternFmt,
)
}
if strings.HasSuffix(s, "*") {
prefix := s[:len(s)-1]
ns := NamespacedBy(prefix)
if ns == UnknownNamespace {
return nil, fmt.Errorf("the sysctls %q are not known to be namespaced", s)
}
w.prefixes[prefix] = ns
} else {
ns := NamespacedBy(s)
if ns == UnknownNamespace {
return nil, fmt.Errorf("the sysctl %q are not known to be namespaced", s)
}
w.sysctls[s] = ns
}
}
return w, nil
}
// validateSysctl checks that a sysctl is whitelisted because it is known
// to be namespaced by the Linux kernel. Note that being whitelisted is required, but not
// sufficient: the container runtime might have a stricter check and refuse to launch a pod.
//
// The parameters hostNet and hostIPC are used to forbid sysctls for pod sharing the
// respective namespaces with the host. This check is only possible for sysctls on
// the static default whitelist, not those on the custom whitelist provided by the admin.
func (w *patternWhitelist) validateSysctl(sysctl string, hostNet, hostIPC bool) error {
nsErrorFmt := "%q not allowed with host %s enabled"
if ns, found := w.sysctls[sysctl]; found {
if ns == IpcNamespace && hostIPC {
return fmt.Errorf(nsErrorFmt, sysctl, ns)
}
if ns == NetNamespace && hostNet {
return fmt.Errorf(nsErrorFmt, sysctl, ns)
}
return nil
}
for p, ns := range w.prefixes {
if strings.HasPrefix(sysctl, p) {
if ns == IpcNamespace && hostIPC {
return fmt.Errorf(nsErrorFmt, sysctl, ns)
}
if ns == NetNamespace && hostNet {
return fmt.Errorf(nsErrorFmt, sysctl, ns)
}
return nil
}
}
return fmt.Errorf("%q not whitelisted", sysctl)
}
// Admit checks that all sysctls given in a api.SysctlsPodAnnotationKey annotation
// are valid according to the whitelist.
func (w *patternWhitelist) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAdmitResult {
pod := attrs.Pod
a := pod.Annotations[w.annotationKey]
if a == "" {
return lifecycle.PodAdmitResult{
Admit: true,
}
}
sysctls, err := api.SysctlsFromPodAnnotation(a)
if err != nil {
return lifecycle.PodAdmitResult{
Admit: false,
Reason: AnnotationInvalidReason,
Message: fmt.Sprintf("invalid %s annotation: %v", w.annotationKey, err),
}
}
var hostNet, hostIPC bool
if pod.Spec.SecurityContext != nil {
hostNet = pod.Spec.SecurityContext.HostNetwork
hostIPC = pod.Spec.SecurityContext.HostIPC
}
for _, s := range sysctls {
if err := w.validateSysctl(s.Name, hostNet, hostIPC); err != nil {
return lifecycle.PodAdmitResult{
Admit: false,
Reason: ForbiddenReason,
Message: fmt.Sprintf("forbidden sysctl: %v", err),
}
}
}
return lifecycle.PodAdmitResult{
Admit: true,
}
}

View File

@ -0,0 +1,84 @@
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package sysctl
import (
"testing"
"k8s.io/kubernetes/pkg/api"
)
func TestNewWhitelist(t *testing.T) {
type Test struct {
sysctls []string
err bool
}
for _, test := range []Test{
{sysctls: []string{"kernel.msg*", "kernel.sem"}},
{sysctls: []string{" kernel.msg*"}, err: true},
{sysctls: []string{"kernel.msg* "}, err: true},
{sysctls: []string{"net.-"}, err: true},
{sysctls: []string{"net.*.foo"}, err: true},
{sysctls: []string{"foo"}, err: true},
} {
_, err := NewWhitelist(append(SafeSysctlWhitelist(), test.sysctls...), api.SysctlsPodAnnotationKey)
if test.err && err == nil {
t.Errorf("expected an error creating a whitelist for %v", test.sysctls)
} else if !test.err && err != nil {
t.Errorf("got unexpected error creating a whitelist for %v: %v", test.sysctls, err)
}
}
}
func TestWhitelist(t *testing.T) {
type Test struct {
sysctl string
hostNet, hostIPC bool
}
valid := []Test{
{sysctl: "kernel.shm_rmid_forced"},
{sysctl: "net.ipv4.ip_local_port_range"},
{sysctl: "kernel.msgmax"},
{sysctl: "kernel.sem"},
}
invalid := []Test{
{sysctl: "kernel.shm_rmid_forced", hostIPC: true},
{sysctl: "net.ipv4.ip_local_port_range", hostNet: true},
{sysctl: "foo"},
{sysctl: "net.a.b.c", hostNet: false},
{sysctl: "net.ipv4.ip_local_port_range.a.b.c", hostNet: false},
{sysctl: "kernel.msgmax", hostIPC: true},
{sysctl: "kernel.sem", hostIPC: true},
}
w, err := NewWhitelist(append(SafeSysctlWhitelist(), "kernel.msg*", "kernel.sem"), api.SysctlsPodAnnotationKey)
if err != nil {
t.Fatalf("failed to create whitelist: %v", err)
}
for _, test := range valid {
if err := w.validateSysctl(test.sysctl, test.hostNet, test.hostIPC); err != nil {
t.Errorf("expected to be whitelisted: %+v, got: %v", test, err)
}
}
for _, test := range invalid {
if err := w.validateSysctl(test.sysctl, test.hostNet, test.hostIPC); err == nil {
t.Errorf("expected to be rejected: %+v", test)
}
}
}