From fcd0ab1f6e347b61d07eaea8db333e099800863a Mon Sep 17 00:00:00 2001
From: Paul Michali
Date: Wed, 24 Jan 2018 20:04:36 +0000
Subject: [PATCH] IPv6: Ensure calculated node CIDR size for pod subnets is valid

With IPv4, the node CIDR prefix is set to /24, which gives 256 pods per
node and 256 nodes, when assuming a /16 is used for the pod subnet.

For IPv6, the node CIDR prefix is hard-coded to /64. This does not work,
because currently the pod subnet prefix must be /66 or higher and must be
a larger subnet (lower prefix value) than the node CIDR prefix.

In addition, the bit mask used to track the subnets (implying the number
of nodes) can only handle 64K entries, so the difference between the pod
subnet prefix and the node CIDR prefix cannot be more than 16 bits. The
node CIDR value needs to respect this restriction.

To address this, the following algorithm is proposed...

For pod subnet prefixes of /111 or smaller, the remaining bits will be
used for the node CIDR, keeping the node CIDR a multiple of 8, and 9-16
bits will be reserved for the nodes, so that there are 512-64K nodes and
256, 64K, 16M, ... pods/node.

For example, with a pod network of /111, there will be 17 bits available.
This would give 8 bits for pods per node and 9 bits for nodes. The node
CIDR would be /120. For a pod network of /104, there will be 24 bits
available. There will be 16 bits for nodes and 8 bits for pods/node,
using a /120 node CIDR.

If the pod subnet prefix is /112, then the node CIDR will be set to /120,
and 256 nodes and 256 pods/node will be available.

If the subnet prefix is /113 to /128, we don't have enough bits and will
set the node CIDR prefix to be the same as the pod subnet prefix. This
will cause a failure later, when it is checked that the pod subnet is a
larger subnet than the node CIDR.
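
For illustration, here is a small standalone Go sketch (not part of the
change itself) that mirrors the selection logic added below in
calcNodeCidrSize, applied to a few sample pod subnet prefixes:

    package main

    import "fmt"

    func main() {
        // Mirrors the proposed IPv6 split (128 total bits): keep the node
        // CIDR a multiple of 8 and give as many bits as possible (up to 16)
        // to the node count.
        for _, podNetSize := range []int{64, 104, 111, 112, 116} {
            var nodeCidrSize int
            switch {
            case podNetSize == 112:
                nodeCidrSize = 120 // special case: 256 nodes, 256 pods/node
            case podNetSize < 112:
                nodeCidrSize = 128 - ((128-podNetSize-1)/8-1)*8
            default:
                nodeCidrSize = podNetSize // too few bits; rejected by later validation
            }
            fmt.Printf("pod /%d -> node CIDR /%d (%d node bits, %d pod bits)\n",
                podNetSize, nodeCidrSize, nodeCidrSize-podNetSize, 128-nodeCidrSize)
        }
    }

For instance, this prints "pod /104 -> node CIDR /120 (16 node bits,
8 pod bits)" and "pod /64 -> node CIDR /80 (16 node bits, 48 pod bits)".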
---
 .../app/phases/controlplane/manifests.go          | 55 +++++++++++--
 .../app/phases/controlplane/manifests_test.go     | 77 ++++++++++++++++++-
 2 files changed, 123 insertions(+), 9 deletions(-)

diff --git a/cmd/kubeadm/app/phases/controlplane/manifests.go b/cmd/kubeadm/app/phases/controlplane/manifests.go
index 14aae7e33ea..851fe1ba5d8 100644
--- a/cmd/kubeadm/app/phases/controlplane/manifests.go
+++ b/cmd/kubeadm/app/phases/controlplane/manifests.go
@@ -21,6 +21,7 @@ import (
     "net"
     "os"
     "path/filepath"
+    "strconv"
     "strings"
 
     "k8s.io/api/core/v1"
@@ -223,6 +224,53 @@ func getAPIServerCommand(cfg *kubeadmapi.MasterConfiguration, k8sVersion *versio
     return command
 }
 
+// calcNodeCidrSize determines the size of the subnets used on each node, based
+// on the pod subnet provided. For IPv4, we assume that the pod subnet will
+// be /16 and use /24. If the pod subnet cannot be parsed, the IPv4 value will
+// be used (/24).
+//
+// For IPv6, the algorithm will do three things. First, the node CIDR will be set
+// to a multiple of 8, using the available bits, for easier readability by the user.
+// Second, the number of nodes will be 512 to 64K, to attempt to maximize the
+// number of nodes (see NOTE below). Third, pod networks of /113 and larger will
+// be rejected, as the number of bits available is too small.
+//
+// A special case is when the pod network size is /112, where /120 will be used,
+// only allowing 256 nodes and 256 pods per node.
+//
+// If the pod network size is /113 or larger, the node CIDR will be set to the same
+// size and this will be rejected later in validation.
+//
+// NOTE: Currently, the pod network prefix must be /66 or larger. It is not reflected
+// here, but a smaller prefix value will fail later validation.
+//
+// NOTE: Currently, the design allows a maximum of 64K nodes. This algorithm splits
+// the available bits to maximize the number used for nodes, but still keeps the
+// node CIDR a multiple of eight.
+//
+func calcNodeCidrSize(podSubnet string) string {
+    maskSize := "24"
+    if ip, podCidr, err := net.ParseCIDR(podSubnet); err == nil {
+        if ip.To4() == nil {
+            var nodeCidrSize int
+            podNetSize, totalBits := podCidr.Mask.Size()
+            switch {
+            case podNetSize == 112:
+                // Special case, allows 256 nodes, 256 pods/node
+                nodeCidrSize = 120
+            case podNetSize < 112:
+                // Use a multiple of 8 for the node CIDR, with 512 to 64K nodes
+                nodeCidrSize = totalBits - ((totalBits-podNetSize-1)/8-1)*8
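+                // Worked examples for the formula above (illustration only):
+                //   /104 pod subnet: (128-104-1)/8-1 = 1 -> /120 node CIDR (16 node bits, 8 pod bits)
+                //   /66 pod subnet:  (128-66-1)/8-1 = 6  -> /80 node CIDR (14 node bits, 48 pod bits)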
+            default:
+                // Not enough bits; will fail later, during validation
+                nodeCidrSize = podNetSize
+            }
+            maskSize = strconv.Itoa(nodeCidrSize)
+        }
+    }
+    return maskSize
+}
+
 // getControllerManagerCommand builds the right controller manager command from the given config object and version
 func getControllerManagerCommand(cfg *kubeadmapi.MasterConfiguration, k8sVersion *version.Version) []string {
     defaultArguments := map[string]string{
@@ -259,12 +307,7 @@ func getControllerManagerCommand(cfg *kubeadmapi.MasterConfiguration, k8sVersion
     // Let the controller-manager allocate Node CIDRs for the Pod network.
     // Each node will get a subspace of the address CIDR provided with --pod-network-cidr.
     if cfg.Networking.PodSubnet != "" {
-        maskSize := "24"
-        if ip, _, err := net.ParseCIDR(cfg.Networking.PodSubnet); err == nil {
-            if ip.To4() == nil {
-                maskSize = "64"
-            }
-        }
+        maskSize := calcNodeCidrSize(cfg.Networking.PodSubnet)
         command = append(command, "--allocate-node-cidrs=true", "--cluster-cidr="+cfg.Networking.PodSubnet,
             "--node-cidr-mask-size="+maskSize)
     }
diff --git a/cmd/kubeadm/app/phases/controlplane/manifests_test.go b/cmd/kubeadm/app/phases/controlplane/manifests_test.go
index b59f423cd30..c4825dfab17 100644
--- a/cmd/kubeadm/app/phases/controlplane/manifests_test.go
+++ b/cmd/kubeadm/app/phases/controlplane/manifests_test.go
@@ -629,7 +629,7 @@ func TestGetControllerManagerCommand(t *testing.T) {
         },
         {
             cfg: &kubeadmapi.MasterConfiguration{
-                Networking:        kubeadmapi.Networking{PodSubnet: "2001:101:115::/48"},
+                Networking:        kubeadmapi.Networking{PodSubnet: "2001:db8::/64"},
                 CertificatesDir:   testCertsDir,
                 KubernetesVersion: "v1.7.0",
             },
@@ -645,8 +645,8 @@
                 "--use-service-account-credentials=true",
                 "--controllers=*,bootstrapsigner,tokencleaner",
                 "--allocate-node-cidrs=true",
-                "--cluster-cidr=2001:101:115::/48",
-                "--node-cidr-mask-size=64",
+                "--cluster-cidr=2001:db8::/64",
+                "--node-cidr-mask-size=80",
             },
         },
     }
@@ -661,6 +661,77 @@ func TestGetControllerManagerCommand(t *testing.T) {
     }
 }
 
+func TestCalcNodeCidrSize(t *testing.T) {
+    tests := []struct {
+        name           string
+        podSubnet      string
+        expectedPrefix string
+    }{
+        {
+            name:           "Malformed pod subnet",
+            podSubnet:      "10.10.10/160",
+            expectedPrefix: "24",
+        },
+        {
+            name:           "V4: Always uses 24",
+            podSubnet:      "10.10.10.10/16",
+            expectedPrefix: "24",
+        },
+        {
+            name:           "V6: Use pod subnet size, when not enough space",
+            podSubnet:      "2001:db8::/128",
+            expectedPrefix: "128",
+        },
+        {
+            name:           "V6: Use pod subnet size, when not enough space",
+            podSubnet:      "2001:db8::/113",
+            expectedPrefix: "113",
+        },
+        {
+            name:           "V6: Special case with 256 nodes",
+            podSubnet:      "2001:db8::/112",
+            expectedPrefix: "120",
+        },
+        {
+            name:           "V6: Using /120 for node CIDR",
+            podSubnet:      "2001:db8::/104",
+            expectedPrefix: "120",
+        },
+        {
+            name:           "V6: Using /112 for node CIDR",
+            podSubnet:      "2001:db8::/103",
+            expectedPrefix: "112",
+        },
+        {
+            name:           "V6: Using /112 for node CIDR",
+            podSubnet:      "2001:db8::/96",
+            expectedPrefix: "112",
+        },
+        {
+            name:           "V6: Using /104 for node CIDR",
+            podSubnet:      "2001:db8::/95",
+            expectedPrefix: "104",
+        },
+        {
+            name:           "V6: Largest subnet currently supported",
+            podSubnet:      "2001:db8::/66",
+            expectedPrefix: "80",
+        },
+        {
+            name:           "V6: For /64 pod net, use /80",
+            podSubnet:      "2001:db8::/64",
+            expectedPrefix: "80",
+        },
+    }
+    for _, test := range tests {
+        actualPrefix := calcNodeCidrSize(test.podSubnet)
+        if actualPrefix != test.expectedPrefix {
+            t.Errorf("Case [%s]\nCalc of node CIDR size for pod subnet %q failed: Expected %q, saw %q",
+                test.name, test.podSubnet, test.expectedPrefix, actualPrefix)
+        }
+    }
+}
+
 func TestGetControllerManagerCommandExternalCA(t *testing.T) {
     tests := []struct {