mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-25 12:43:23 +00:00
Merge pull request #44798 from zetaab/master
Automatic merge from submit-queue Statefulsets for cinder: allow multi-AZ deployments, spread pods across zones **What this PR does / why we need it**: Currently if we do not specify availability zone in cinder storageclass, the cinder is provisioned to zone called nova. However, like mentioned in issue, we have situation that we want spread statefulset across 3 different zones. Currently this is not possible with statefulsets and cinder storageclass. In this new solution, if we leave it empty the algorithm will choose the zone for the cinder drive similar style like in aws and gce storageclass solutions. **Which issue this PR fixes** fixes #44735 **Special notes for your reviewer**: example: ``` kind: StorageClass apiVersion: storage.k8s.io/v1beta1 metadata: name: all provisioner: kubernetes.io/cinder --- apiVersion: v1 kind: Service metadata: annotations: service.alpha.kubernetes.io/tolerate-unready-endpoints: "true" name: galera labels: app: mysql spec: ports: - port: 3306 name: mysql clusterIP: None selector: app: mysql --- apiVersion: apps/v1beta1 kind: StatefulSet metadata: name: mysql spec: serviceName: "galera" replicas: 3 template: metadata: labels: app: mysql annotations: pod.alpha.kubernetes.io/initialized: "true" spec: containers: - name: mysql image: adfinissygroup/k8s-mariadb-galera-centos:v002 imagePullPolicy: Always ports: - containerPort: 3306 name: mysql - containerPort: 4444 name: sst - containerPort: 4567 name: replication - containerPort: 4568 name: ist volumeMounts: - name: storage mountPath: /data readinessProbe: exec: command: - /usr/share/container-scripts/mysql/readiness-probe.sh initialDelaySeconds: 15 timeoutSeconds: 5 env: - name: POD_NAMESPACE valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.namespace volumeClaimTemplates: - metadata: name: storage annotations: volume.beta.kubernetes.io/storage-class: all spec: accessModes: [ "ReadWriteOnce" ] resources: requests: storage: 12Gi ``` If this example is deployed it will automatically create one replica per AZ. This helps us a lot making HA databases. Current storageclass for cinder is not perfect in case of statefulsets. Lets assume that cinder storageclass is defined to be in zone called nova, but because labels are not added to pv - pods can be started in any zone. The problem is that at least in our openstack it is not possible to use cinder drive located in zone x from zone y. However, should we have possibility to choose between cross-zone cinder mounts or not? Imo it is not good way of doing things that they mount volume from another zone where the pod is located(means more network traffic between zones)? What you think? Current new solution does not allow that anymore (should we have possibility to allow it? it means removing the labels from pv). There might be some things that needs to be fixed still in this release and I need help for that. Some parts of the code is not perfect. Issues what i am thinking about (I need some help for these): 1) Can everybody see in openstack what AZ their servers are? Can there be like access policy that do not show that? If AZ is not found from server specs, I have no idea how the code behaves. 2) In GetAllZones() function, is it really needed to make new serviceclient using openstack.NewComputeV2 or could I somehow use existing one 3) This fetches all servers from some openstack tenant(project). However, in some cases kubernetes is maybe deployed only to specific zone. If kube servers are located for instance in zone 1, and then there are another servers in same tenant in zone 2. There might be usecase that cinder drive is provisioned to zone-2 but it cannot start pod, because kubernetes does not have any nodes in zone-2. Could we have better way to fetch kubernetes nodes zones? Currently that information is not added to kubernetes node labels automatically in openstack (which should I think). I have added those labels manually to nodes. If that zone information is not added to nodes, the new solution does not start stateful pods at all, because it cannot target pods. cc @rootfs @anguslees @jsafrane ```release-note Default behaviour in cinder storageclass is changed. If availability is not specified, the zone is chosen by algorithm. It makes possible to spread stateful pods across many zones. ```
This commit is contained in:
commit
49626c975b
@ -370,7 +370,7 @@ func TestVolumes(t *testing.T) {
|
||||
tags := map[string]string{
|
||||
"test": "value",
|
||||
}
|
||||
vol, err := os.CreateVolume("kubernetes-test-volume-"+rand.String(10), 1, "", "", &tags)
|
||||
vol, _, err := os.CreateVolume("kubernetes-test-volume-"+rand.String(10), 1, "", "", &tags)
|
||||
if err != nil {
|
||||
t.Fatalf("Cannot create a new Cinder volume: %v", err)
|
||||
}
|
||||
|
@ -36,7 +36,7 @@ import (
|
||||
)
|
||||
|
||||
type volumeService interface {
|
||||
createVolume(opts VolumeCreateOpts) (string, error)
|
||||
createVolume(opts VolumeCreateOpts) (string, string, error)
|
||||
getVolume(diskName string) (Volume, error)
|
||||
deleteVolume(volumeName string) error
|
||||
}
|
||||
@ -74,7 +74,7 @@ type VolumeCreateOpts struct {
|
||||
Metadata map[string]string
|
||||
}
|
||||
|
||||
func (volumes *VolumesV1) createVolume(opts VolumeCreateOpts) (string, error) {
|
||||
func (volumes *VolumesV1) createVolume(opts VolumeCreateOpts) (string, string, error) {
|
||||
|
||||
create_opts := volumes_v1.CreateOpts{
|
||||
Name: opts.Name,
|
||||
@ -86,12 +86,12 @@ func (volumes *VolumesV1) createVolume(opts VolumeCreateOpts) (string, error) {
|
||||
|
||||
vol, err := volumes_v1.Create(volumes.blockstorage, create_opts).Extract()
|
||||
if err != nil {
|
||||
return "", err
|
||||
return "", "", err
|
||||
}
|
||||
return vol.ID, nil
|
||||
return vol.ID, vol.AvailabilityZone, nil
|
||||
}
|
||||
|
||||
func (volumes *VolumesV2) createVolume(opts VolumeCreateOpts) (string, error) {
|
||||
func (volumes *VolumesV2) createVolume(opts VolumeCreateOpts) (string, string, error) {
|
||||
|
||||
create_opts := volumes_v2.CreateOpts{
|
||||
Name: opts.Name,
|
||||
@ -103,9 +103,9 @@ func (volumes *VolumesV2) createVolume(opts VolumeCreateOpts) (string, error) {
|
||||
|
||||
vol, err := volumes_v2.Create(volumes.blockstorage, create_opts).Extract()
|
||||
if err != nil {
|
||||
return "", err
|
||||
return "", "", err
|
||||
}
|
||||
return vol.ID, nil
|
||||
return vol.ID, vol.AvailabilityZone, nil
|
||||
}
|
||||
|
||||
func (volumes *VolumesV1) getVolume(diskName string) (Volume, error) {
|
||||
@ -283,12 +283,12 @@ func (os *OpenStack) getVolume(diskName string) (Volume, error) {
|
||||
}
|
||||
|
||||
// Create a volume of given size (in GiB)
|
||||
func (os *OpenStack) CreateVolume(name string, size int, vtype, availability string, tags *map[string]string) (volumeName string, err error) {
|
||||
func (os *OpenStack) CreateVolume(name string, size int, vtype, availability string, tags *map[string]string) (string, string, error) {
|
||||
|
||||
volumes, err := os.volumeService("")
|
||||
if err != nil || volumes == nil {
|
||||
glog.Errorf("Unable to initialize cinder client for region: %s", os.region)
|
||||
return "", err
|
||||
return "", "", err
|
||||
}
|
||||
opts := VolumeCreateOpts{
|
||||
Name: name,
|
||||
@ -299,15 +299,15 @@ func (os *OpenStack) CreateVolume(name string, size int, vtype, availability str
|
||||
if tags != nil {
|
||||
opts.Metadata = *tags
|
||||
}
|
||||
volume_id, err := volumes.createVolume(opts)
|
||||
volumeId, volumeAZ, err := volumes.createVolume(opts)
|
||||
|
||||
if err != nil {
|
||||
glog.Errorf("Failed to create a %d GB volume: %v", size, err)
|
||||
return "", err
|
||||
return "", "", err
|
||||
}
|
||||
|
||||
glog.Infof("Created volume %v", volume_id)
|
||||
return volume_id, nil
|
||||
glog.Infof("Created volume %v in Availability Zone: %v", volumeId, volumeAZ)
|
||||
return volumeId, volumeAZ, nil
|
||||
}
|
||||
|
||||
// GetDevicePath returns the path of an attached block storage volume, specified by its id.
|
||||
|
@ -479,8 +479,8 @@ func (os *Rackspace) GetZone() (cloudprovider.Zone, error) {
|
||||
}
|
||||
|
||||
// Create a volume of given size (in GiB)
|
||||
func (rs *Rackspace) CreateVolume(name string, size int, vtype, availability string, tags *map[string]string) (volumeName string, err error) {
|
||||
return "", errors.New("unimplemented")
|
||||
func (rs *Rackspace) CreateVolume(name string, size int, vtype, availability string, tags *map[string]string) (volumeName string, volumeAZ string, err error) {
|
||||
return "", "", errors.New("unimplemented")
|
||||
}
|
||||
|
||||
func (rs *Rackspace) DeleteVolume(volumeName string) error {
|
||||
|
@ -19,6 +19,7 @@ go_library(
|
||||
tags = ["automanaged"],
|
||||
deps = [
|
||||
"//pkg/api/v1:go_default_library",
|
||||
"//pkg/client/clientset_generated/clientset:go_default_library",
|
||||
"//pkg/cloudprovider:go_default_library",
|
||||
"//pkg/cloudprovider/providers/openstack:go_default_library",
|
||||
"//pkg/cloudprovider/providers/rackspace:go_default_library",
|
||||
@ -32,6 +33,7 @@ go_library(
|
||||
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
|
||||
"//vendor/k8s.io/apimachinery/pkg/apis/meta/v1:go_default_library",
|
||||
"//vendor/k8s.io/apimachinery/pkg/types:go_default_library",
|
||||
"//vendor/k8s.io/apimachinery/pkg/util/sets:go_default_library",
|
||||
],
|
||||
)
|
||||
|
||||
|
@ -506,8 +506,8 @@ func (testcase *testcase) ShouldTrustDevicePath() bool {
|
||||
return true
|
||||
}
|
||||
|
||||
func (testcase *testcase) CreateVolume(name string, size int, vtype, availability string, tags *map[string]string) (volumeName string, err error) {
|
||||
return "", errors.New("Not implemented")
|
||||
func (testcase *testcase) CreateVolume(name string, size int, vtype, availability string, tags *map[string]string) (volumeId string, volumeAZ string, err error) {
|
||||
return "", "", errors.New("Not implemented")
|
||||
}
|
||||
|
||||
func (testcase *testcase) GetDevicePath(diskId string) string {
|
||||
|
@ -47,7 +47,7 @@ type CinderProvider interface {
|
||||
AttachDisk(instanceID string, diskName string) (string, error)
|
||||
DetachDisk(instanceID string, partialDiskId string) error
|
||||
DeleteVolume(volumeName string) error
|
||||
CreateVolume(name string, size int, vtype, availability string, tags *map[string]string) (volumeName string, err error)
|
||||
CreateVolume(name string, size int, vtype, availability string, tags *map[string]string) (string, string, error)
|
||||
GetDevicePath(diskId string) string
|
||||
InstanceID() (string, error)
|
||||
GetAttachmentDiskPath(instanceID string, diskName string) (string, error)
|
||||
@ -239,7 +239,7 @@ type cdManager interface {
|
||||
// Detaches the disk from the kubelet's host machine.
|
||||
DetachDisk(unmounter *cinderVolumeUnmounter) error
|
||||
// Creates a volume
|
||||
CreateVolume(provisioner *cinderVolumeProvisioner) (volumeID string, volumeSizeGB int, err error)
|
||||
CreateVolume(provisioner *cinderVolumeProvisioner) (volumeID string, volumeSizeGB int, labels map[string]string, err error)
|
||||
// Deletes a volume
|
||||
DeleteVolume(deleter *cinderVolumeDeleter) error
|
||||
}
|
||||
@ -482,7 +482,7 @@ type cinderVolumeProvisioner struct {
|
||||
var _ volume.Provisioner = &cinderVolumeProvisioner{}
|
||||
|
||||
func (c *cinderVolumeProvisioner) Provision() (*v1.PersistentVolume, error) {
|
||||
volumeID, sizeGB, err := c.manager.CreateVolume(c)
|
||||
volumeID, sizeGB, labels, err := c.manager.CreateVolume(c)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -490,7 +490,7 @@ func (c *cinderVolumeProvisioner) Provision() (*v1.PersistentVolume, error) {
|
||||
pv := &v1.PersistentVolume{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: c.options.PVName,
|
||||
Labels: map[string]string{},
|
||||
Labels: labels,
|
||||
Annotations: map[string]string{
|
||||
"kubernetes.io/createdby": "cinder-dynamic-provisioner",
|
||||
},
|
||||
|
@ -116,8 +116,8 @@ func (fake *fakePDManager) DetachDisk(c *cinderVolumeUnmounter) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (fake *fakePDManager) CreateVolume(c *cinderVolumeProvisioner) (volumeID string, volumeSizeGB int, err error) {
|
||||
return "test-volume-name", 1, nil
|
||||
func (fake *fakePDManager) CreateVolume(c *cinderVolumeProvisioner) (volumeID string, volumeSizeGB int, labels map[string]string, err error) {
|
||||
return "test-volume-name", 1, nil, nil
|
||||
}
|
||||
|
||||
func (fake *fakePDManager) DeleteVolume(cd *cinderVolumeDeleter) error {
|
||||
|
@ -25,7 +25,11 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/golang/glog"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/kubernetes/pkg/api/v1"
|
||||
"k8s.io/kubernetes/pkg/client/clientset_generated/clientset"
|
||||
"k8s.io/kubernetes/pkg/util/exec"
|
||||
"k8s.io/kubernetes/pkg/volume"
|
||||
)
|
||||
@ -135,10 +139,28 @@ func (util *CinderDiskUtil) DeleteVolume(cd *cinderVolumeDeleter) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (util *CinderDiskUtil) CreateVolume(c *cinderVolumeProvisioner) (volumeID string, volumeSizeGB int, err error) {
|
||||
func getZonesFromNodes(kubeClient clientset.Interface) (sets.String, error) {
|
||||
// TODO: caching, currently it is overkill because it calls this function
|
||||
// only when it creates dynamic PV
|
||||
zones := make(sets.String)
|
||||
nodes, err := kubeClient.Core().Nodes().List(metav1.ListOptions{})
|
||||
if err != nil {
|
||||
glog.V(2).Infof("Error listing nodes")
|
||||
return zones, err
|
||||
}
|
||||
for _, node := range nodes.Items {
|
||||
if zone, ok := node.Labels[metav1.LabelZoneFailureDomain]; ok {
|
||||
zones.Insert(zone)
|
||||
}
|
||||
}
|
||||
glog.V(4).Infof("zones found: %v", zones)
|
||||
return zones, nil
|
||||
}
|
||||
|
||||
func (util *CinderDiskUtil) CreateVolume(c *cinderVolumeProvisioner) (volumeID string, volumeSizeGB int, volumeLabels map[string]string, err error) {
|
||||
cloud, err := c.plugin.getCloudProvider()
|
||||
if err != nil {
|
||||
return "", 0, err
|
||||
return "", 0, nil, err
|
||||
}
|
||||
|
||||
capacity := c.options.PVC.Spec.Resources.Requests[v1.ResourceName(v1.ResourceStorage)]
|
||||
@ -157,21 +179,40 @@ func (util *CinderDiskUtil) CreateVolume(c *cinderVolumeProvisioner) (volumeID s
|
||||
case "availability":
|
||||
availability = v
|
||||
default:
|
||||
return "", 0, fmt.Errorf("invalid option %q for volume plugin %s", k, c.plugin.GetPluginName())
|
||||
return "", 0, nil, fmt.Errorf("invalid option %q for volume plugin %s", k, c.plugin.GetPluginName())
|
||||
}
|
||||
}
|
||||
// TODO: implement PVC.Selector parsing
|
||||
if c.options.PVC.Spec.Selector != nil {
|
||||
return "", 0, fmt.Errorf("claim.Spec.Selector is not supported for dynamic provisioning on Cinder")
|
||||
return "", 0, nil, fmt.Errorf("claim.Spec.Selector is not supported for dynamic provisioning on Cinder")
|
||||
}
|
||||
|
||||
name, err = cloud.CreateVolume(name, volSizeGB, vtype, availability, c.options.CloudTags)
|
||||
if err != nil {
|
||||
glog.V(2).Infof("Error creating cinder volume: %v", err)
|
||||
return "", 0, err
|
||||
if availability == "" {
|
||||
// No zone specified, choose one randomly in the same region
|
||||
zones, err := getZonesFromNodes(c.plugin.host.GetKubeClient())
|
||||
if err != nil {
|
||||
glog.V(2).Infof("error getting zone information: %v", err)
|
||||
return "", 0, nil, err
|
||||
}
|
||||
// if we did not get any zones, lets leave it blank and gophercloud will
|
||||
// use zone "nova" as default
|
||||
if len(zones) > 0 {
|
||||
availability = volume.ChooseZoneForVolume(zones, c.options.PVC.Name)
|
||||
}
|
||||
}
|
||||
glog.V(2).Infof("Successfully created cinder volume %s", name)
|
||||
return name, volSizeGB, nil
|
||||
|
||||
volumeId, volumeAZ, errr := cloud.CreateVolume(name, volSizeGB, vtype, availability, c.options.CloudTags)
|
||||
if errr != nil {
|
||||
glog.V(2).Infof("Error creating cinder volume: %v", errr)
|
||||
return "", 0, nil, errr
|
||||
}
|
||||
glog.V(2).Infof("Successfully created cinder volume %s", volumeId)
|
||||
|
||||
// these are needed that pod is spawning to same AZ
|
||||
volumeLabels = make(map[string]string)
|
||||
volumeLabels[metav1.LabelZoneFailureDomain] = volumeAZ
|
||||
|
||||
return volumeId, volSizeGB, volumeLabels, nil
|
||||
}
|
||||
|
||||
func probeAttachedVolume() error {
|
||||
|
Loading…
Reference in New Issue
Block a user